Saturday, April 24, 2021

Google colab - keras-learn and Logistic Regression example

(1) Following the previous post, this demonstrates the Keras sample from how-to-install-tensorflow-with-gpu.html
Please note that Google allows only one active session on the free service. If you need a faster GPU, more RAM, or more sessions, please consider subscribing to Colab Pro.

keraslearn.ipynb   Select all
# Step 1 mount google drive if data is from google drive import os from google.colab import drive drive.mount('/content/drive') # Step 2 if using tensorflow GPU #%tensorflow_version 2.x #import tensorflow as tf #print('TensorFlow: {}'.format(tf.__version__)) #tf.test.gpu_device_name() # Step 3 from keras.models import Sequential from keras.layers import Dense import numpy import time # fix random seed for reproducibility numpy.random.seed(7) # Step 4 # download pima indians dataset to google drive !curl -L https://tinyurl.com/tensorflowwin | grep -A768 pima-indians-diabetes.data.nbsp | sed '1d' > 'drive/MyDrive/Colab Notebooks/pima-indians-diabetes.data' # or download to local data directory !mkdir -p ./data !curl -L https://tinyurl.com/tensorflowwin | grep -A768 pima-indians-diabetes.data.nbsp | sed '1d' > './data/pima-indians-diabetes.data' # Step 5 load dataset from google drive dataset = numpy.loadtxt("drive/MyDrive/Colab Notebooks/pima-indians-diabetes.data", delimiter=",") # or load data from local data directory dataset = numpy.loadtxt("./data/pima-indians-diabetes.data", delimiter=",") # Step 6 # split into input (X) and output (Y) variables X = dataset[:,0:8] Y = dataset[:,8] # Step 7 # create model model = Sequential() model.add(Dense(12, input_dim=8, activation='relu')) model.add(Dense(1, activation='sigmoid')) # Step 8 # Compile model model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) # Step 9 start_time=time.time() # Fit the model model.fit(X, Y, batch_size=10, epochs=1500) # parameters for keras 1.2.2 # evaluate the model scores = model.evaluate(X, Y) print("\n%s: %.2f%%" % (model.metrics_names[1], scores[1]*100)) print("\nTraining took %.2f seconds\n" %(time.time()-start_time))


(2) For a large training data set, consider zipping it and uploading the archive to Google Drive. Mount Google Drive, then unzip it in the local session, e.g.
!mkdir -p ./data
!unzip -o './drive/MyDrive/Colab Notebooks/mydata.zip' -d ./data/


(3) To stop the running cell in Google Colab use Ctrl-M I

(4) How to quickly run an ipynb example from github ?
4.1) Go to https://colab.research.google.com/, log in with your Gmail account, choose the GitHub tab, and search for e.g. "clareyan/From-Linear-to-Logistic-Regression-Explained-Step-by-Step"
4.2) In the Step 2 cell, change the import of the dataset to
df = pd.read_csv('https://raw.githubusercontent.com/clareyan/From-Linear-to-Logistic-Regression-Explained-Step-by-Step/master/Social_Network_Ads.csv')
4.3) Then choose menu -> Runtime -> Run All. After that, use menu -> File -> Save a copy in Drive.

Friday, April 23, 2021

How to setup google colab and start linear regression with tensorflow.

(1) You only need a Chrome browser, a Google (Gmail) account and Google Drive to start cloud TensorFlow computing. It is free to use and learn.
(2) Go to https://colab.research.google.com/
(3) Create a new notebook, rename it, and then copy it to Drive
(4) Type the following into notebook and run it step by step (Press Alt-Enter to run after each step)
LinearRegression.ipynb   Select all
#Step 1 # mount Google Drive, will ask for authorization code import numpy as np import os from google.colab import drive drive.mount('/content/drive') #Step 2 # choose the notebook settings to use GPU, via Menu -> Edit -> Notebook Settings. %tensorflow_version 2.x import tensorflow as tf # will show GPU if successful tf.test.gpu_device_name() #Step 3 # load data import pandas as pd # either download the linear_data.csv and upload to google drive, or direct download it via the shell command as below !curl -L https://tinyurl.com/lineardatacsv | grep -A200 START_OF_LINEAR_DATA.CSV | sed '1d' | sed -n "/END_OF_LINEAR_DATA.CSV/q;p" | sed 's/&gt;/\>/g;s/&lt;/\</g' > 'drive/MyDrive/Colab Notebooks/linear_data.csv' df = pd.read_csv('drive/MyDrive/Colab Notebooks/linear_data.csv') df.head() #Step 4 # split into independent and dependent X = df[['X']].values y = df[['Y']].values X.shape, y.shape #Step 5 # visualize data import matplotlib.pyplot as plt %matplotlib inline plt.scatter(X,y) plt.xlabel('independent') plt.ylabel('dependent') plt.show() # Use Text box to enter # # Linear Regression $ \hat y = a + b * X $ #Step 6 # Linear Regression # define regression model class regression(): def __init__(self): self.a = tf.Variable(initial_value=0,dtype=tf.float32) self.b = tf.Variable(initial_value=0,dtype=tf.float32) def __call__(self, X): x = tf.convert_to_tensor(X,dtype=tf.float32) y_est = tf.add(self.a, tf.multiply(self.b,x)) return y_est model = regression() # Use Text box to enter # # loss = sum of square error (sse) = $ \sum (y_t - y_p) ^ 2 $ # step 7 # define loss function def loss_func(y_true, y_pred): # both values are in tensors sse = tf.reduce_sum(tf.square(tf.subtract(y_true,y_pred))) return sse # Use Text box to enter # # Gradient Descent $ a = a_i - \nabla(sse) | a * LR $ $ b = b_i - \nabla(sse) | b * LR $ # step 8 # define train function def train(model, inputs, outputs, learning_rate): # convert outputs into tensor y_true = 
tf.convert_to_tensor(outputs,dtype=tf.float32) # GradientTape cal gradient distance with tf.GradientTape() as g: y_pred = model(inputs) current_loss = loss_func(y_true,y_pred) da,db = g.gradient(current_loss,[model.a,model.b]) # update the values model.a.assign_sub(da*learning_rate) model.b.assign_sub(db*learning_rate) # Step 9 def plot_scatter(x,y): plt.scatter(x,y) # scatter plt.plot(x,model(x),'r--') #line plot_scatter(X,y) # step 10 # model fitting model = regression() a_values = [] b_values = [] cost_values = [] # epochs, no of steps epochs = 100 # learning_rate learning_rate = 0.0001 for epoch in range(epochs): a_values.append(model.a) b_values.append(model.b) # prediction values and error y_pred = model(X) cost_value = loss_func(y,y_pred) cost_values.append(cost_value) # training train(model,X,y,learning_rate) # visual the scatter plot_scatter(X,y) plt.show #print the value print('Epoch: %d, Loss: %0.2f, a: %0.2f, b: %0.2f' %(epoch,cost_value,a_values[-1],b_values[-1])) # step 11 plt.plot(cost_values)


(5) linear_data.csv: download it and upload it to Google Drive
linear_data.csv   Select all
X,Y 4,2 4,10 7,4 7,22 8,16 9,10 10,18 10,26 10,34 11,17 11,28 12,14 12,20 12,24 12,28 13,25 13,34 13,24 13,46 14,26 14,36 14,60 14,80 15,20 15,26 15,54 16,32 16,40 17,32 17,40 17,50 18,42 18,56 18,76 18,84 19,36 19,45 19,68 20,32 20,48 20,52


Thursday, April 15, 2021

HelloWorld Assembler Code for x86_64, arm64 and for linux or macOS

(1) Following the previous post, this post demonstrates the assembler code of a command-line HelloWorld program for x86_64 and arm64 on Linux and macOS.
HelloWorld.S   Select all
//
// Assembler program to print "Hello World!"
// to stdout. For arm64, x86_64, linux and macOS
//
// The file is run through the C preprocessor (capital .S extension), which
// selects the syscall numbers and the addressing style per OS/architecture.
//
#define STDIN 0                 // standard input device
#define STDOUT 1                // standard output device

#ifdef __APPLE__
#define SYS_read 0x2000003      // system call to read input macOS
#define SYS_write 0x2000004     // system call to write message macOS
#define SYS_exit 0x2000001      // system call to terminate program macOS
#define SVC_write 4             // SVC write arm64 macOS
#define SVC_exit 1              // SVC exit arm64 macOS
#endif

#ifdef __linux__
#define SYS_read 0              // system call to read input
#define SYS_write 1             // system call to write message
#define SYS_exit 60             // system call to terminate program
#define SVC_write 64            // SVC write arm64 linux
#define SVC_exit 93             // SVC exit arm64 linux
#endif

#define EXIT_OK 0               // OK exit status

.globl _start                   // Provide program starting address to linker
#ifdef __APPLE__
.align 4
#endif
.text
_start:

#if defined __arm64__ || defined __ARM_ARCH_ISA_A64
    mov X0, #STDOUT             // 1 = StdOut
#ifdef __linux__
    ldr X1, =helloworld         // string to print
    mov X8, #SVC_write          // linux write system call number goes in X8
#endif
#ifdef __APPLE__
    // adr X1, helloworld       // string to print
    // (adr calculates an address from the PC plus an offset, but only for local data)
    adrp X1, helloworld@PAGE    // adrp can be used to access relative address of 4GB range
    add X1, X1, helloworld@PAGEOFF // string to print
    mov X16, #SVC_write         // macOS write system call number goes in X16
#endif
    ldr X2, =len                // length of our string
    svc #0                      // Call the kernel to output the string

    // Setup the parameters to exit the program
    // and then call the kernel to do it.
    mov X0, #EXIT_OK            // Use 0 return code
#ifdef __linux__
    mov X8, #SVC_exit           // Service command code 93 terminates this program
#endif
#ifdef __APPLE__
    mov X16, #SVC_exit          // Service command code 1 terminates this program
#endif
    svc #0                      // Call the kernel to terminate the program
#endif

#if defined __x86_64__
    movq $STDOUT, %rdi
#ifdef __linux__
    movq $helloworld, %rsi      // char *
#endif
#ifdef __APPLE__
    leaq helloworld(%rip), %rsi // char *, RIP-relative
#endif
    movq $len, %rdx             // length of our string
    movq $SYS_write, %rax       // write system call
    syscall

    movq $EXIT_OK, %rdi         // Use 0 return code
    movq $SYS_exit, %rax        // exit system call
    syscall
#endif

.data
helloworld: .ascii "Hello World!\n"
len = . - helloworld            // len = end - start


(2) To compile and debug for different systems
shell scripts   Select all
# To download the above code using command line. curl -L https://tinyurl.com/helloworld-gas | grep -A200 START_OF_HELLOWORLD.S | sed '1d' | sed -n "/END_OF_HELLOWORLD.S/q;p" | sed 's/&gt;/\>/g;s/&lt;/\</g' > HelloWorld.S # To compile with debug symbols under linux, e.g. Win10 WSL2 or Linux or Android Termux App clang -g -c HelloWorld.S -o HelloWorld.o ; ld HelloWorld.o -o HelloWorld # To compile under macOS (e.g. with M1 cpu) clang -g HelloWorld.S -o HelloWorld_x86_64 -e _start -arch x86_64 clang -g HelloWorld.S -o HelloWorld_arm64 -e _start -arch arm64


(3) To debug using lldb
shell scripts   Select all
# To start program debug lldb HelloWorld_x86_64 # or lldb HelloWorld_arm64 # lldb debug session for arm64 - useful commands (lldb) breakpoint set --name _start (lldb) breakpoint list (lldb) run (lldb) step (lldb) reg read x0 x1 x2 x8 lr pc (lldb) reg read -f t cpsr # lldb debug session for x86_64 - useful commands (lldb) reg read -f d rax rdi rsi rdx rflags (lldb) reg read -f t rflags # print the address value in the stackpointer for x86_64 (lldb) p *(int **)$sp # hint: to search lldb command history use ctrl-r


(4) Summary of differences
4.1) In order for the clang compiler to preprocess the assembler file, the filename extension should be a capital letter S on Linux. On macOS, global asm labels that are shared with C must be prefixed with an underscore.
4.2) A64 (arm64) parameter/result registers are X0-X7. If the function has a return value, it will be stored in X0.
4.3) x86_64 parameter registers for integers or pointers are %rdi, %rsi, %rdx, %rcx, %r8, %r9. If the function has a return value, it will be stored in %rax.
4.4) Linux and macOS have different syscall numbers (x86_64) and service call numbers (arm64). They are defined in this source code.
4.5) Absolute addressing is not allowed for arm64. On macOS, the adr instruction can be used for accessing read-only local data. But for a non-local data section (which is a buffer in RAM), the adrp instruction and the @PAGE and @PAGEOFF operators should be used, as demonstrated in the code.


Tuesday, April 13, 2021

Mixing C and Assembler for x86_64 and arm64, major differences.

(1) These examples demonstrate mixing C and assembly language for x86_64 and arm64 and show the differences between the Linux and macOS environments.
callsum.c   Select all
/*
 * callsum.c
 *
 * Driver for the assembly-language sum() routine: sums several prefixes of
 * a fixed array and then reports the target macros the binary was built with.
 */
#include <stdio.h>

/* Implemented in sum.S. */
double sum(double[], unsigned);

/* Print the architecture/OS predefined macros that are active. */
static void report_platform(void) {
    printf("I am ");
#ifdef __ARM_ARCH_ISA_A64
    printf(" __ARM_ARCH_ISA_A64 ");
#endif
#ifdef __arm64__
    printf(" __arm64__ ");
#endif
#ifdef __x86_64__
    printf(" __x86_64__ ");
#endif
#ifdef __linux__
    printf(" __linux__ ");
#endif
#ifdef __APPLE__
    printf(" __APPLE__ ");
#endif
    printf("\n");
}

int main() {
    double test[] = { 40.5, 26.7, 21.9, 1.5, -40.5, -23.4 };
    /* Prefix lengths to sum, in the order they are printed. */
    const unsigned counts[] = { 6, 2, 0, 3 };
    unsigned k;

    for (k = 0; k < sizeof counts / sizeof counts[0]; k++) {
        printf("%20.7f\n", sum(test, counts[k]));
    }

    report_platform();
    return 0;
}


sum.S   Select all
// -----------------------------------------------------------------------
// A 64-bit function that returns the sum of the elements in a
// floating-point array. The function has prototype:
//
//   double sum(double[] array, unsigned length)
//
// x86_64: array in rdi, length in esi, result in xmm0.
// arm64:  array in x0,  length in w1,  result in d0.
//
// length is a 32-bit unsigned value, so only the low 32 bits of its
// register (esi / w1) are tested and counted down; the upper half of the
// 64-bit register is not guaranteed to be zero.
// -----------------------------------------------------------------------
#ifdef __linux__
        .global sum
#endif
#ifdef __APPLE__
        .global _sum
#endif

        .text
#ifdef __ARM_ARCH_ISA_A64
        .align 4
#endif

#ifdef __linux__
sum:
#endif
#ifdef __APPLE__
_sum:
#endif

#ifdef __x86_64__
        xorpd   %xmm0, %xmm0            // initialize the sum to 0
        test    %esi, %esi              // special case for length = 0
        je      done
#endif
#if defined __arm64__ || defined __ARM_ARCH_ISA_A64
        movi    d0, #0                  // initialize the sum to 0
        cmp     w1, #0                  // special case for length = 0
        b.eq    done
#endif

next:
#ifdef __x86_64__
        addsd   (%rdi), %xmm0           // add in the current array element
        add     $8, %rdi                // move to next array element
        dec     %esi                    // count down
        jnz     next                    // if not done counting, continue
#endif
#if defined __arm64__ || defined __ARM_ARCH_ISA_A64
        ldr     d16, [x0]               // load the current double into scratch register d16
        fadd    d0, d0, d16             // add in the current array element
        add     x0, x0, #8              // move to next array element
        subs    w1, w1, #1              // count down
        b.ne    next                    // if not done counting, continue
#endif

done:
        ret


callfactorial.c   Select all
/*
 * An application that illustrates calling the factorial function defined elsewhere.
 *
 * Build with -D__USE_C_FUNCTION to use the C implementation below instead of
 * the assembly one from factorial.S.
 */
#include <stdio.h>
#include <inttypes.h>

#ifdef __USE_C_FUNCTION
/* Reference implementation: n! computed recursively in 64-bit arithmetic. */
uint64_t factorial(unsigned n) {
    return (n <= 1) ? 1 : n * factorial(n-1);
}
#else
uint64_t factorial(unsigned n);
#endif

int main() {
    for (unsigned i = 0; i < 20; i++) {
        /*
         * PRIu64 is the portable format for uint64_t. It replaces the
         * per-OS "%lu" / "%llu" split, which printed nothing on any
         * platform that is neither Linux nor macOS.
         */
        printf("factorial(%2u) = %" PRIu64 "\n", i, factorial(i));
    }

    /* Report the architecture/OS predefined macros that are active. */
    printf("I am ");
#ifdef __ARM_ARCH_ISA_A64
    printf(" __ARM_ARCH_ISA_A64 ");
#endif
#ifdef __arm64__
    printf(" __arm64__ ");
#endif
#ifdef __x86_64__
    printf(" __x86_64__ ");
#endif
#ifdef __linux__
    printf(" __linux__ ");
#endif
#ifdef __APPLE__
    printf(" __APPLE__ ");
#endif
    printf("\n");
}


factorial.S   Select all
// ----------------------------------------------------------------------------
// A 64-bit recursive implementation of the function
//
//   uint64_t factorial(unsigned n)
//
// x86_64: n arrives in edi, result is returned in rax.
// arm64:  n arrives in w0,  result is returned in x0.
//
// n is a 32-bit unsigned value: it is compared as unsigned 32-bit and
// zero-extended before it is saved or multiplied, because the upper half of
// the 64-bit argument register is not guaranteed to be zero.
// ----------------------------------------------------------------------------
#ifdef __linux__
        .globl  factorial
#endif
#ifdef __APPLE__
        .globl  _factorial
#endif

        .text
#ifdef __ARM_ARCH_ISA_A64
        .align 4
#endif

#ifdef __linux__
factorial:
#endif
#ifdef __APPLE__
_factorial:
#endif

#if defined __arm64__ || defined __ARM_ARCH_ISA_A64
        cmp     w0, #1                  // n > 1 ? (unsigned)
        b.hi    L1                      // if yes, go do a recursive call
        mov     x0, #1                  // otherwise return 1
        ret
#endif
#ifdef __x86_64__
        cmp     $1, %edi                // n <= 1 ?
        jnbe    L1                      // if not, go do a recursive call
        mov     $1, %rax                // otherwise return 1
        ret
#endif

L1:
#if defined __arm64__ || defined __ARM_ARCH_ISA_A64
        mov     w0, w0                  // zero-extend n into x0
        stp     x0, lr, [sp, #-16]!     // push n and LR(x30); keeps SP 16-byte aligned
        sub     w0, w0, #1              // n-1
#ifdef __linux__
        bl      factorial               // factorial(n-1), result goes in x0
#endif
#ifdef __APPLE__
        bl      _factorial              // factorial(n-1), result goes in x0
#endif
        ldp     x1, lr, [sp], #16       // pop n into x1 and restore LR(x30)
        mul     x0, x0, x1              // n * factorial(n-1), stored in x0
        ret
#endif
#ifdef __x86_64__
        mov     %edi, %edi              // zero-extend n into rdi
        push    %rdi                    // save n on stack (also aligns %rsp!)
        dec     %edi                    // n-1
#ifdef __linux__
        call    factorial               // factorial(n-1), result goes in %rax
#endif
#ifdef __APPLE__
        call    _factorial              // factorial(n-1), result goes in %rax
#endif
        pop     %rdi                    // restore n
        imul    %rdi, %rax              // n * factorial(n-1), stored in %rax
        ret
#endif


callmaxofthree.c   Select all
/*
 * callmaxofthree.c
 *
 * A small program that illustrates how to call the maxofthree function we wrote in
 * assembly language.
 */
#include <stdio.h>
#include <inttypes.h>

/* Implemented in maxofthree.S. */
int64_t maxofthree(int64_t, int64_t, int64_t);

int main() {
    /*
     * PRId64 is the portable format for int64_t. It replaces the per-OS
     * "%ld" / "%lld" split, which printed nothing on any platform that is
     * neither Linux nor macOS.
     */
    printf("%" PRId64 "\n", maxofthree(1, -4, -7));
    printf("%" PRId64 "\n", maxofthree(2, -6, 1));
    printf("%" PRId64 "\n", maxofthree(2, 3, 1));
    printf("%" PRId64 "\n", maxofthree(-2, 4, 3));
    printf("%" PRId64 "\n", maxofthree(2, -6, 5));
    printf("%" PRId64 "\n", maxofthree(2, 4, 6));

    /* Report the architecture/OS predefined macros that are active. */
    printf("I am ");
#ifdef __ARM_ARCH_ISA_A64
    printf(" __ARM_ARCH_ISA_A64 ");
#endif
#ifdef __arm64__
    printf(" __arm64__ ");
#endif
#ifdef __x86_64__
    printf(" __x86_64__ ");
#endif
#ifdef __linux__
    printf(" __linux__ ");
#endif
#ifdef __APPLE__
    printf(" __APPLE__ ");
#endif
    printf("\n");
    return 0;
}


maxofthree.S   Select all
# -----------------------------------------------------------------------------
# A 64-bit function that returns the maximum value of its three 64-bit integer
# arguments. The function has signature:
#
#   int64_t maxofthree(int64_t x, int64_t y, int64_t z)
#
# Note that the parameters for x86_64 have already been passed in rdi, rsi, and rdx.
# Note that the parameters for arm64 have already been passed in x0, x1, x2.
# We just have to return the value in rax(x86_64), x0(arm64).
# -----------------------------------------------------------------------------
#ifdef __linux__
        .globl  maxofthree
#endif
#ifdef __APPLE__
        .globl  _maxofthree
#endif

        .text
#ifdef __ARM_ARCH_ISA_A64
        .align 4
#endif

#ifdef __linux__
maxofthree:
#endif
#ifdef __APPLE__
_maxofthree:
#endif

#if defined __arm64__ || defined __ARM_ARCH_ISA_A64
        cmp     x0, x1                  //# is x0 > x1 (signed)
        csel    x0, x0, x1, GT          // if GT, x0 = x0 else x0 = x1
        cmp     x0, x2                  //# is max(x,y) > x2 (signed)
        csel    x0, x0, x2, GT          // if GT, x0 = x0 else x0 = x2
        ret                             //# the max will be in x0
#endif
#ifdef __x86_64__
        mov     %rdi, %rax              # result (rax) initially holds x
        cmp     %rsi, %rax              # is x less than y?
        cmovl   %rsi, %rax              # if so, set result to y
        cmp     %rdx, %rax              # is max(x,y) less than z?
        cmovl   %rdx, %rax              # if so, set result to z
        ret                             # the max will be in rax
#endif


chaskey.h   Select all
/* Public interface for the Chaskey block-cipher routines. */
#ifndef CHASKEY_H
#define CHASKEY_H

/* Mode selectors; presumably the int argument of chas_encrypt() - TODO confirm. */
#define CHASKEY_ENCRYPT 1
#define CHASKEY_DECRYPT 0

#ifdef __cplusplus
extern "C" {
#endif

/* NOTE(review): no definition of chas_encrypt/chas_encryptx accompanies this header. */
void chas_encrypt(int, void*, void*);
/* chaskey(key, data): implemented in ckey.S; the data block is overwritten in place. */
void chaskey(void*, void*);
void chas_encryptx(void*, void*);

#ifdef __cplusplus
}
#endif

#endif


testckey.c   Select all
// test unit for chaskey #include <stdio.h> #include <string.h> #include <inttypes.h> #include "chaskey.h" uint8_t plain[16]= { 0xb8, 0x23, 0x28, 0x26, 0xfd, 0x5e, 0x40, 0x5e, 0x69, 0xa3, 0x01, 0xa9, 0x78, 0xea, 0x7a, 0xd8 }; uint8_t key[16] = { 0x56, 0x09, 0xe9, 0x68, 0x5f, 0x58, 0xe3, 0x29, 0x40, 0xec, 0xec, 0x98, 0xc5, 0x22, 0x98, 0x2f }; uint8_t cipher[16] = { 0xd5, 0x60, 0x8d, 0x4d, 0xa2, 0xbf, 0x34, 0x7b, 0xab, 0xf8, 0x77, 0x2f, 0xdf, 0xed, 0xde, 0x07 }; int main(void) { uint8_t t[16]; int e; memcpy(t, plain, 16); chaskey(key, t); e = memcmp(t, cipher, 16)==0; printf("\nCHASKEY Encryption: %s\n", e ? "OK" : "FAILED"); printf("I am "); #ifdef __ARM_ARCH_ISA_A64 printf(" __ARM_ARCH_ISA_A64 "); #endif #ifdef __arm64__ printf(" __arm64__ "); #endif #ifdef __x86_64__ printf(" __x86_64__ "); #endif #ifdef __linux__ printf(" __linux__ "); #endif #ifdef __APPLE__ printf(" __APPLE__ "); #endif printf("\n"); return 0; }


ckey.S   Select all
// CHASKEY in ARM64 assembly
// Chaskey-LTS Block Cipher in AMD64 assembly (Encryption only)
//
// Encrypts one 16-byte block in place: pre-whiten with the key, run 16
// rounds, post-whiten with the key, then store the block back.
// arm64:  key pointer in x0,  data pointer in x1.
// x86_64: key pointer in rdi, data pointer in rsi (Intel syntax).

        .text
#ifdef __x86_64__
        .intel_syntax noprefix
#endif
        // both the plain (Linux) and underscore-prefixed (macOS) names are exported
        .globl chaskey
        .globl _chaskey
#ifdef __ARM_ARCH_ISA_A64
        .align 4
#endif

// chaskey(void*mk, void*data);
chaskey:
_chaskey:
#if defined __arm64__ || defined __ARM_ARCH_ISA_A64
        // load 128-bit key
        ldp     w2, w3, [x0]
        ldp     w4, w5, [x0, 8]

        // load 128-bit plain text
        ldp     w6, w7, [x1]
        ldp     w8, w9, [x1, 8]

        // xor plaintext with key
        eor     w6, w6, w2              // x[0] ^= k[0];
        eor     w7, w7, w3              // x[1] ^= k[1];
        eor     w8, w8, w4              // x[2] ^= k[2];
        eor     w9, w9, w5              // x[3] ^= k[3];
        mov     w10, 16                 // i = 16
#endif
#ifdef __x86_64__
        // .intel_syntax noprefix
        push    rbx
        push    rbp
        push    rsi                     // data pointer; popped into rdi before the final stores

        # load plaintext
        // each lodsd reads the next dword from [rsi] into eax; the xchg chain
        // leaves x[0] in eax, x[1] in ebx, x[2] in edx and x[3] in ebp
        lodsd
        xchg    eax, ebp
        lodsd
        xchg    eax, ebx
        lodsd
        xchg    eax, edx
        lodsd
        xchg    eax, ebp

        # pre-whiten
        xor     eax, [rdi    ]
        xor     ebx, [rdi+ 4]
        xor     edx, [rdi+ 8]
        xor     ebp, [rdi+12]
        // rcx = 16, the round counter consumed by the loop instruction
        push    16
        pop     rcx
#endif

L0:
#if defined __arm64__ || defined __ARM_ARCH_ISA_A64
        add     w6, w6, w7              // x[0] += x[1];
        eor     w7, w6, w7, ror 27      // x[1]=R(x[1],27) ^ x[0];
        add     w8, w8, w9              // x[2] += x[3];
        eor     w9, w8, w9, ror 24      // x[3]=R(x[3],24) ^ x[2];
        add     w8, w8, w7              // x[2] += x[1];
        ror     w6, w6, 16
        add     w6, w9, w6              // x[0]=R(x[0],16) + x[3];
        eor     w9, w6, w9, ror 19      // x[3]=R(x[3],19) ^ x[0];
        eor     w7, w8, w7, ror 25      // x[1]=R(x[1],25) ^ x[2];
        ror     w8, w8, 16              // x[2]=R(x[2],16);
        subs    w10, w10, 1             // i--
        bne     L0                      // i > 0

        // xor cipher text with key
        eor     w6, w6, w2              // x[0] ^= k[0];
        eor     w7, w7, w3              // x[1] ^= k[1];
        eor     w8, w8, w4              // x[2] ^= k[2];
        eor     w9, w9, w5              // x[3] ^= k[3];

        // save 128-bit cipher text
        stp     w6, w7, [x1]
        stp     w8, w9, [x1, 8]
        ret
#endif
#ifdef __x86_64__
        // .intel_syntax noprefix
        # x[0] += x[1]#
        add     eax, ebx
        # x[1]=ROTR32(x[1],27) ^ x[0]
        ror     ebx, 27
        xor     ebx, eax
        # x[2] += x[3]#
        add     edx, ebp
        # x[3]=ROTR32(x[3],24) ^ x[2]
        ror     ebp, 24
        xor     ebp, edx
        # x[2] += x[1]#
        add     edx, ebx
        # x[0]=ROTR32(x[0],16) + x[3]
        ror     eax, 16
        add     eax, ebp
        # x[3]=ROTR32(x[3],19) ^ x[0]
        ror     ebp, 19
        xor     ebp, eax
        # x[1]=ROTR32(x[1],25) ^ x[2]
        ror     ebx, 25
        xor     ebx, edx
        # x[2]=ROTR32(x[2],16)
        ror     edx, 16
        // decrement rcx and repeat until it reaches zero
        loop    L0

        # post-whiten
        xor     eax, [rdi    ]
        xor     ebx, [rdi+ 4]
        xor     edx, [rdi+ 8]
        xor     ebp, [rdi+12]
        // rdi = saved data pointer, the destination for the stosd stores
        pop     rdi

        # save ciphertext
        stosd
        xchg    eax, ebx
        stosd
        xchg    eax, edx
        stosd
        xchg    eax, ebp
        stosd
        pop     rbp
        pop     rbx
        ret
#endif


speck.h   Select all
/* Public interface for the SPECK block-cipher routines (spk64.S, spk128.S). */
#ifndef SPECK_H
#define SPECK_H

#ifdef __cplusplus
extern "C" {
#endif

/* speck64(key, data): SPECK64/128 encryption; the data block is overwritten in place. */
void speck64(void*, void*);
/* speck128(key, data): SPECK128/256 encryption; the data block is overwritten in place. */
void speck128(void*, void*);

#ifdef __cplusplus
}
#endif

#endif


testspk.c   Select all
// test unit for speck
//
// Runs one known-answer test each for SPECK64/128 and SPECK128/256 and
// prints the result, expected ciphertext, key and plaintext as hex bytes.
#include <stdio.h>
#include <string.h>
#include <inttypes.h>
#include "speck.h"

// Print label s followed by the len bytes at p in hex.
void print_bytes(char *s, void *p, int len) {
  int i;
  printf("%s : ", s);
  for (i=0; i<len; i++) {
    printf ("%02x ", ((uint8_t*)p)[i]);
  }
  putchar('\n');
}

// SPECK64/128 test vectors
//
// p = 0x3b7265747475432d
uint8_t plain64[]= {
    0x74, 0x65, 0x72, 0x3b, 0x2d, 0x43, 0x75, 0x74 };

// c = 0x8c6fa548454e028b
uint8_t cipher64[]= {
    0x48, 0xa5, 0x6f, 0x8c, 0x8b, 0x02, 0x4e, 0x45 };

// key = 0x03020100, 0x0b0a0908, 0x13121110, 0x1b1a1918
uint8_t key64[]= {
    0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
    0x10, 0x11, 0x12, 0x13, 0x18, 0x19, 0x1a, 0x1b };

// SPECK128/256 test vectors
//
uint8_t key128[]= {
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
    0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f };

uint8_t plain128[]= {
    0x70, 0x6f, 0x6f, 0x6e, 0x65, 0x72, 0x2e, 0x20,
    0x49, 0x6e, 0x20, 0x74, 0x68, 0x6f, 0x73, 0x65};

// expected ciphertext as two 64-bit words; it is compared bytewise with
// memcmp, so the comparison depends on the host byte order
uint64_t cipher128[2] = {0x4eeeb48d9c188f43, 0x4109010405c0f53e};

// R: rotate a 64-bit value right by n bits
#define R(v,n)(((v)>>(n))|((v)<<(64-(n))))
// F: loop header running i from 0 to n-1 (i must be declared by the user)
#define F(n)for(i=0;i<n;i++)
typedef unsigned long long W;

// C reference implementation of SPECK128/256 encryption, used instead of the
// assembly routine when __USE_C_FUNCTION is defined. Copies the 16-byte block
// and 32-byte key into locals, runs 34 rounds with the key schedule computed
// alongside, then writes the block back to `in`.
void speck128x(void*mk,void*in){
  W i,t,k[4],r[2];
  memcpy(r,in,16);
  memcpy(k,mk,32);
  // the whole round is one comma expression forming the loop body
  F(34)
    r[1]=(R(r[1],8)+*r)^*k,
    *r=R(*r,61)^r[1],
    t=k[3],
    k[3]=(R(k[1],8)+*k)^i,
    *k=R(*k,61)^k[3],
    k[1]=k[2],k[2]=t;
  memcpy(in,r,16);
}

int main (void) {
  uint64_t buf[4];
  int equ;

  // copy plain text to local buffer
  memcpy (buf, plain64, sizeof(plain64));
  speck64(key64, buf);
  equ = memcmp(cipher64, buf, sizeof(cipher64))==0;
  printf ("\nSPECK64/128 encryption %s\n", equ ? "OK" : "FAILED");
  print_bytes("CT result  ", buf, sizeof(plain64));
  print_bytes("CT expected", cipher64, sizeof(cipher64));
  print_bytes("K ", key64, sizeof(key64));
  print_bytes("PT", plain64, sizeof(plain64));

  // copy plain text to local buffer
  memcpy (buf, plain128, sizeof(plain128));
#ifdef __USE_C_FUNCTION
  speck128x(key128, buf);
#else
  speck128(key128, buf);
#endif
  equ = memcmp(cipher128, buf, sizeof(cipher128))==0;
  printf ("\nSPECK128/256 encryption %s\n", equ ? "OK" : "FAILED");
  print_bytes("CT result  ", buf, sizeof(plain128));
  print_bytes("CT expected", cipher128, sizeof(cipher128));
  print_bytes("K ", key128, sizeof(key128));
  print_bytes("PT", plain128, sizeof(plain128));

  // report the architecture/OS predefined macros that are active
  printf("I am ");
#ifdef __ARM_ARCH_ISA_A64
  printf(" __ARM_ARCH_ISA_A64 ");
#endif
#ifdef __arm64__
  printf(" __arm64__ ");
#endif
#ifdef __x86_64__
  printf(" __x86_64__ ");
#endif
#ifdef __linux__
  printf(" __linux__ ");
#endif
#ifdef __APPLE__
  printf(" __APPLE__ ");
#endif
  printf("\n");
  return 0;
}


spk64.S   Select all
// SPECK64/128 in ARM64 assembly
// SPECK-64/128 Block Cipher in x86 assembly (Encryption only)
//
// Encrypts one 8-byte block in place with a 16-byte key, 27 rounds; the
// round keys are derived inside the same loop.
// arm64:  key pointer in x0,  data pointer in x1.
// x86_64: key pointer in rdi, data pointer in rsi (Intel syntax).

#ifdef __x86_64__
        .intel_syntax noprefix
#endif
        .text
        // both the plain (Linux) and underscore-prefixed (macOS) names are exported
        .globl speck64
        .globl _speck64
#ifdef __ARM_ARCH_ISA_A64
        .align 4
#endif

// speck64(void*mk, void*data);
speck64:
_speck64:
#if defined __arm64__ || defined __ARM_ARCH_ISA_A64
        // load 128-bit key
        // k0 = k[0]; k1 = k[1]; k2 = k[2]; k3 = k[3];
        ldp     w5, w6, [x0]
        ldp     w7, w8, [x0, 8]
        // load 64-bit plain text
        ldp     w2, w4, [x1]            // x0 = x[0]; x1 = x[1];
        mov     w3, wzr                 // i=0
#endif
#ifdef __x86_64__
        push    rbx
        push    rbp
        push    rsi                     # save

        lodsd
        xchg    eax, ebx                # ebx = in[0]
        lodsd
        xchg    eax, edx                # edx = in[1]

        // rsi = key pointer, so the following lodsd instructions read the key
        push    rdi
        pop     rsi

        lodsd
        xchg    eax, edi                # edi = key[0]
        lodsd
        xchg    eax, ebp                # ebp = key[1]
        lodsd
        xchg    eax, ecx                # ecx = key[2]
        lodsd
        xchg    eax, esi                # esi = key[3]
        xor     eax, eax                # i = 0
#endif

L0:
#if defined __arm64__ || defined __ARM_ARCH_ISA_A64
        ror     w2, w2, 8
        add     w2, w2, w4              // x0 = (R(x0, 8) + x1) ^ k0;
        eor     w2, w2, w5              //
        eor     w4, w2, w4, ror 29      // x1 = R(x1, 3) ^ x0;  (ror 29 is a left rotate by 3)
        mov     w9, w8                  // backup k3
        ror     w6, w6, 8
        add     w8, w5, w6              // k3 = (R(k1, 8) + k0) ^ i;
        eor     w8, w8, w3              //
        eor     w5, w8, w5, ror 29      // k0 = R(k0, 3) ^ k3;
        mov     w6, w7                  // k1 = k2;
        mov     w7, w9                  // k2 = t;
        add     w3, w3, 1               // i++;
        cmp     w3, 27                  // i < 27;
        bne     L0

        // save result
        stp     w2, w4, [x1]            // x[0] = x0; x[1] = x1;
        ret
#endif
#ifdef __x86_64__
        # ebx = (ROTR32(ebx, 8) + edx) ^ edi;
        ror     ebx, 8
        add     ebx, edx
        xor     ebx, edi
        # edx = ROTR32(edx, 29) ^ ebx;
        ror     edx, 29
        xor     edx, ebx
        # ebp = (ROTR32(ebp, 8) + edi) ^ i;
        ror     ebp, 8
        add     ebp, edi
        xor     ebp, eax
        # edi = ROTR32(edi, 29) ^ ebp;
        ror     edi, 29
        xor     edi, ebp
        // rotate the three key words held in ebp, ecx and esi
        xchg    esi, ecx
        xchg    esi, ebp
        # i++
        inc     al
        cmp     al, 27
        jnz     L0

        // rdi = saved data pointer, the destination for the stosd stores
        pop     rdi
        xchg    eax, ebx
        stosd
        xchg    eax, edx
        stosd
        pop     rbp
        pop     rbx
        ret
#endif


spk128.S   Select all
// SPECK128/256 in ARM64 assembly
// SPECK-128/256 Block Cipher in AMD64 assembly (Encryption only)
//
// Encrypts one 16-byte block in place with a 32-byte key, 34 rounds; the
// round keys are derived inside the same loop.
// arm64:  key pointer in x0,  data pointer in x1.
// x86_64: key pointer in rdi, data pointer in rsi (Intel syntax).

#ifdef __x86_64__
        .intel_syntax noprefix
#endif
        .text
        // both the plain (Linux) and underscore-prefixed (macOS) names are exported
        .global speck128
        .global _speck128
#ifdef __ARM_ARCH_ISA_A64
        .align 4
#endif

// speck128(void*mk, void*data);
speck128:
_speck128:
#if defined __arm64__ || defined __ARM_ARCH_ISA_A64
        // load 256-bit key
        // k0 = k[0]; k1 = k[1]; k2 = k[2]; k3 = k[3];
        ldp     x5, x6, [x0]
        ldp     x7, x8, [x0, 16]
        // load 128-bit plain text
        ldp     x2, x4, [x1]            // x0 = x[0]; x1 = x[1];
        mov     x3, xzr                 // i=0
#endif
#ifdef __x86_64__
        push    rbp
        push    rbx
        push    rdi
        push    rsi                     // data pointer, read back from the stack before the final store

        # load 128-bit plaintext
        mov     rbp, [rsi  ]
        mov     rsi, [rsi+8]

        # load 256-bit key
        mov     rbx, [rdi   ]           # k0
        mov     rcx, [rdi+ 8]           # k1
        mov     rdx, [rdi+16]           # k2
        mov     rdi, [rdi+24]           # k3

        # i = 0
        xor     eax, eax
#endif

L0:
#if defined __arm64__ || defined __ARM_ARCH_ISA_A64
        ror     x4, x4, 8
        add     x4, x4, x2              // x1 = (R(x1, 8) + x0) ^ k0;
        eor     x4, x4, x5              //
        eor     x2, x4, x2, ror 61      // x0 = R(x0, 61) ^ x1;
        mov     x9, x8                  // backup k3
        ror     x6, x6, 8
        add     x8, x5, x6              // k3 = (R(k1, 8) + k0) ^ i;
        eor     x8, x8, x3              //
        eor     x5, x8, x5, ror 61      // k0 = R(k0, 61) ^ k3;
        mov     x6, x7                  // k1 = k2;
        mov     x7, x9                  // k2 = t;
        add     x3, x3, 1               // i++;
        cmp     x3, 34                  // i < 34;
        bne     L0

        // save result
        stp     x2, x4, [x1]            // x[0] = x0; x[1] = x1;
        ret
#endif
#ifdef __x86_64__
        # x[1] = (R(x[1], 8) + x[0]) ^ k[0];
        ror     rsi, 8
        add     rsi, rbp
        xor     rsi, rbx
        # x[0] = R(x[0], 61) ^ x[1];
        ror     rbp, 61
        xor     rbp, rsi
        # k[1] = (R(k[1], 8) + k[0]) ^ i;
        ror     rcx, 8
        add     rcx, rbx
        // the round counter stays below 34, so xoring only the low byte is enough
        xor     cl, al
        # k[0] = R(k[0], 61) ^ k[3];
        ror     rbx, 61
        xor     rbx, rcx
        # X(k3, k2), X(k3, k1);
        xchg    rdi, rdx
        xchg    rdi, rcx
        # i++
        inc     al
        cmp     al, 34
        jnz     L0

        // peek the saved data pointer into rax without unbalancing the stack
        pop     rax
        push    rax
        # save 128-bit result
        mov     [rax  ], rbp
        mov     [rax+8], rsi
        pop     rsi
        pop     rdi
        pop     rbx
        pop     rbp
        ret
#endif


(2) Compile and Linking
shell script   Select all
# use clang to compile and link # To compile and link the above in Linux (e.g. tested in Android arm64 Termux App or Windows 10 WSL2 and clang package should be installed) clang callsum.c sum.S -o callsum; clang callfactorial.c factorial.S -o callfactorial; clang maxofthree.S callmaxofthree.c -o callmaxofthree; clang testckey.c ckey.S -o testckey; clang testspk.c spk64.S spk128.S -o testspk; #To compile and link the above in macOS (new M1 machine is capable to run x86_64 and arm64 binaries with Rosetta 2 installed). To compile on macOS, XCode, Command Line Utility and Rosetta 2 should be installed. clang callsum.c sum.S -o callsum_x86_64 -arch x86_64; clang callfactorial.c factorial.S -o callfactorial_x86_64 -arch x86_64; clang maxofthree.S callmaxofthree.c -o callmaxofthree_x86_64 -arch x86_64; clang testckey.c ckey.S -o testckey_x86_64 -arch x86_64; clang testspk.c spk64.S spk128.S -o testspk_x86_64 -arch x86_64; clang callsum.c sum.S -o callsum_arm64 -arch arm64; clang callfactorial.c factorial.S -o callfactorial_arm64 -arch arm64; clang maxofthree.S callmaxofthree.c -o callmaxofthree_arm64 -arch arm64; clang testckey.c ckey.S -o testckey_arm64 -arch arm64; clang testspk.c spk64.S spk128.S -o testspk_arm64 -arch arm64; In order to debug say using lldb, add -g option when compile and, in addition, macOS has to codesign with enttlements


(3) Summary of differences
3.1) In order for the clang compiler to preprocess the assembler file, the filename extension should be a capital letter S on Linux. On macOS, global asm labels that are shared with C must be prefixed with an underscore.
3.2) The A64 (arm64) instruction set does not include an explicit stack push instruction. Functions can use stp (store pair of registers) and ldp (load pair of registers) to carry out the push and pop operations, as demonstrated in the factorial.S source code above.
3.3) Most Armv8-64 platforms (e.g. macOS) require quadword (16-byte) alignment of the SP register.
3.4) A64 (arm64) parameter/result registers are X0-X7. X8 is designated as the Indirect Result Location Parameter and X30 (LR) is the Link Register. If the function has a return value, it will be stored in X0. The A64 (arm64) floating point result registers are S0 or D0, as demonstrated in sum.S.
3.5) x86_64 parameter registers for integers or pointers are %rdi, %rsi, %rdx, %rcx, %r8, %r9. If the function has a return value, it will be stored in %rax. The x86_64 floating point result register is %xmm0, as demonstrated in sum.S.
3.6) By using the directive .intel_syntax noprefix, x86_64 Intel-syntax assembly code can be used, in which the first operand is usually the destination operand; this order is similar to that of arm64 code. In addition, the % register prefix can be omitted when using noprefix.

(4) To download the above source code using command line
# Download the page once, then cut every source file out of it. Each file
# sits between START_OF_<NAME> and END_OF_<NAME> markers, where <NAME> is the
# file name in upper case; &gt; and &lt; are decoded back to > and <.
page=$(mktemp)
curl -L https://tinyurl.com/mixcasm -o "$page"

for f in callsum.c sum.S \
         callfactorial.c factorial.S \
         callmaxofthree.c maxofthree.S \
         chaskey.h testckey.c ckey.S \
         speck.h testspk.c spk64.S spk128.S
do
    tag=$(printf '%s' "$f" | tr '[:lower:]' '[:upper:]')
    grep -A200 "START_OF_$tag" "$page" | sed '1d' | sed -n "/END_OF_$tag/q;p" | sed 's/&gt;/\>/g;s/&lt;/\</g' > "$f"
done

rm -f "$page"