Tuesday, April 13, 2021

Mixing C and Assembler for x86_64 and arm64, major differences.

(1) These examples demonstrate how to mix C and assembly language for x86_64 and arm64, and show the differences between the Linux and macOS environments.
callsum.c   Select all
/*
 * callsum.c
 *
 * Driver for the assembly-language sum() routine: prints the sum of the
 * first N elements of a small table for several values of N, then prints
 * which architecture/OS macros were defined when this file was compiled.
 */
#include <stdio.h>

/* Implemented in sum.S. */
double sum(double[], unsigned);

int main(void)
{
    double test[] = { 40.5, 26.7, 21.9, 1.5, -40.5, -23.4 };
    /* Element counts to sum, in the order the results are printed. */
    const unsigned counts[] = { 6, 2, 0, 3 };
    unsigned idx;

    for (idx = 0; idx < sizeof counts / sizeof counts[0]; idx++) {
        printf("%20.7f\n", sum(test, counts[idx]));
    }

    /* Report the predefined macros seen by this translation unit. */
    printf("I am ");
#ifdef __ARM_ARCH_ISA_A64
    printf(" __ARM_ARCH_ISA_A64 ");
#endif
#ifdef __arm64__
    printf(" __arm64__ ");
#endif
#ifdef __x86_64__
    printf(" __x86_64__ ");
#endif
#ifdef __linux__
    printf(" __linux__ ");
#endif
#ifdef __APPLE__
    printf(" __APPLE__ ");
#endif
    printf("\n");

    return 0;
}


sum.S   Select all
/* ---------------------------------------------------------------------------
 * sum.S
 *
 * Returns the sum of the elements of a floating-point array.
 * C prototype:
 *
 *     double sum(double array[], unsigned length)
 *
 * x86_64 (AT&T syntax): array in %rdi, length in %esi, result in %xmm0.
 * arm64:                array in x0,   length in w1,   result in d0.
 *
 * `length` is a 32-bit unsigned value, so only the low 32 bits of its
 * register (%esi / w1) are tested and decremented; the upper half of the
 * 64-bit register is not guaranteed to be zero by the calling convention.
 *
 * The exported symbol is `sum` on Linux and `_sum` on macOS.
 * ------------------------------------------------------------------------- */

#ifdef __linux__
        .global sum
#endif
#ifdef __APPLE__
        .global _sum
#endif

        .text
#ifdef __ARM_ARCH_ISA_A64
        .align 4
#endif

#ifdef __linux__
sum:
#endif
#ifdef __APPLE__
_sum:
#endif

#ifdef __x86_64__
        xorpd   %xmm0, %xmm0            // initialize the sum to 0.0
        test    %esi, %esi              // special case: length == 0
        je      done
#endif
#if defined __arm64__ || defined __ARM_ARCH_ISA_A64
        movi    d0, #0                  // initialize the sum to 0.0
        cbz     w1, done                // special case: length == 0
#endif

next:
#ifdef __x86_64__
        addsd   (%rdi), %xmm0           // add in the current array element
        add     $8, %rdi                // advance to the next element
        dec     %esi                    // count down (32-bit counter)
        jnz     next                    // loop until the counter reaches 0
#endif
#if defined __arm64__ || defined __ARM_ARCH_ISA_A64
        ldr     d16, [x0]               // load the current element into d16
        fadd    d0, d0, d16             // add it to the running sum
        add     x0, x0, #8              // advance to the next element
        subs    w1, w1, #1              // count down (32-bit counter)
        b.ne    next                    // loop until the counter reaches 0
#endif

done:
        ret


callfactorial.c   Select all
/*
 * callfactorial.c
 *
 * Prints factorial(0) .. factorial(19) using a factorial() that is either
 * the C fallback below (when __USE_C_FUNCTION is defined) or an external
 * implementation such as factorial.S, then prints which architecture/OS
 * macros were defined when this file was compiled.
 */
#include <stdio.h>
#include <inttypes.h>

#ifdef __USE_C_FUNCTION
/* Recursive reference implementation; returns 1 for n <= 1. */
uint64_t factorial(unsigned n)
{
    return (n <= 1) ? 1 : n * factorial(n - 1);
}
#else
/* Provided by another object file (e.g. factorial.S). */
uint64_t factorial(unsigned n);
#endif

int main(void)
{
    for (unsigned i = 0; i < 20; i++) {
        /*
         * PRIu64 expands to the correct conversion for uint64_t on every
         * platform, so no per-OS "%lu" / "%llu" selection is needed and the
         * table is also printed on platforms that are neither Linux nor macOS.
         */
        printf("factorial(%2u) = %" PRIu64 "\n", i, factorial(i));
    }

    /* Report the predefined macros seen by this translation unit. */
    printf("I am ");
#ifdef __ARM_ARCH_ISA_A64
    printf(" __ARM_ARCH_ISA_A64 ");
#endif
#ifdef __arm64__
    printf(" __arm64__ ");
#endif
#ifdef __x86_64__
    printf(" __x86_64__ ");
#endif
#ifdef __linux__
    printf(" __linux__ ");
#endif
#ifdef __APPLE__
    printf(" __APPLE__ ");
#endif
    printf("\n");

    return 0;
}


factorial.S   Select all
/* ---------------------------------------------------------------------------
 * factorial.S
 *
 * Recursive implementation of
 *
 *     uint64_t factorial(unsigned n)
 *
 * Returns 1 for n <= 1, otherwise n * factorial(n - 1).
 *
 * Argument / result registers:
 *   arm64:  n arrives in w0 (the first integer argument register); the
 *           result is returned in x0.
 *   x86_64: n arrives in %edi; the result is returned in %rax.
 *
 * n is a 32-bit unsigned value, so it is compared as a 32-bit unsigned
 * quantity and zero-extended explicitly before being used in the 64-bit
 * multiply.
 *
 * The exported symbol is `factorial` on Linux and `_factorial` on macOS.
 * ------------------------------------------------------------------------- */

#ifdef __linux__
        .globl  factorial
#endif
#ifdef __APPLE__
        .globl  _factorial
#endif

        .text
#ifdef __ARM_ARCH_ISA_A64
        .align 4
#endif

#ifdef __linux__
factorial:
#endif
#ifdef __APPLE__
_factorial:
#endif

#if defined __arm64__ || defined __ARM_ARCH_ISA_A64
        cmp     w0, #1                  // n > 1 ? (unsigned compare)
        b.hi    L1                      // if yes, go do a recursive call
        mov     x0, #1                  // otherwise return 1
        ret
L1:
        mov     w0, w0                  // zero-extend n into the full x0
        stp     x0, lr, [sp, #-16]!     // push n and LR (x30); keeps SP 16-byte aligned
        sub     w0, w0, #1              // argument for the recursive call: n - 1
#ifdef __linux__
        bl      factorial               // factorial(n-1), result comes back in x0
#endif
#ifdef __APPLE__
        bl      _factorial              // factorial(n-1), result comes back in x0
#endif
        ldp     x9, lr, [sp], #16       // pop n into x9 and restore LR
        mul     x0, x0, x9              // n * factorial(n-1), returned in x0
        ret
#endif

#ifdef __x86_64__
        cmp     $1, %edi                // n <= 1 ? (unsigned compare)
        jnbe    L1                      // if not, go do a recursive call
        mov     $1, %rax                // otherwise return 1
        ret
L1:
        mov     %edi, %edi              // zero-extend n into the full %rdi
        push    %rdi                    // save n on the stack (also aligns %rsp)
        dec     %edi                    // argument for the recursive call: n - 1
#ifdef __linux__
        call    factorial               // factorial(n-1), result comes back in %rax
#endif
#ifdef __APPLE__
        call    _factorial              // factorial(n-1), result comes back in %rax
#endif
        pop     %rdi                    // restore n
        imul    %rdi, %rax              // n * factorial(n-1), returned in %rax
        ret
#endif


callmaxofthree.c   Select all
/*
 * callmaxofthree.c
 *
 * Calls the maxofthree() function written in assembly language on a few
 * argument triples and prints each result, then prints which
 * architecture/OS macros were defined when this file was compiled.
 */
#include <stdio.h>
#include <inttypes.h>

/* Implemented in maxofthree.S. */
int64_t maxofthree(int64_t, int64_t, int64_t);

int main(void)
{
    /* Argument triples, in the order their results are printed. */
    static const int64_t cases[][3] = {
        {  1, -4, -7 },
        {  2, -6,  1 },
        {  2,  3,  1 },
        { -2,  4,  3 },
        {  2, -6,  5 },
        {  2,  4,  6 },
    };

    for (size_t i = 0; i < sizeof cases / sizeof cases[0]; i++) {
        /*
         * PRId64 is the portable conversion for int64_t; it replaces the
         * per-OS "%ld" / "%lld" selection and also prints on platforms
         * that are neither Linux nor macOS.
         */
        printf("%" PRId64 "\n",
               maxofthree(cases[i][0], cases[i][1], cases[i][2]));
    }

    /* Report the predefined macros seen by this translation unit. */
    printf("I am ");
#ifdef __ARM_ARCH_ISA_A64
    printf(" __ARM_ARCH_ISA_A64 ");
#endif
#ifdef __arm64__
    printf(" __arm64__ ");
#endif
#ifdef __x86_64__
    printf(" __x86_64__ ");
#endif
#ifdef __linux__
    printf(" __linux__ ");
#endif
#ifdef __APPLE__
    printf(" __APPLE__ ");
#endif
    printf("\n");

    return 0;
}


maxofthree.S   Select all
/* ---------------------------------------------------------------------------
 * maxofthree.S
 *
 * Returns the largest of three signed 64-bit integers.
 * C signature:
 *
 *     int64_t maxofthree(int64_t x, int64_t y, int64_t z)
 *
 * x86_64: the arguments arrive in %rdi, %rsi and %rdx; the result is
 *         returned in %rax.
 * arm64:  the arguments arrive in x0, x1 and x2; the result is returned
 *         in x0.
 *
 * The exported symbol is `maxofthree` on Linux and `_maxofthree` on macOS.
 * ------------------------------------------------------------------------- */

#ifdef __linux__
        .globl  maxofthree
#endif
#ifdef __APPLE__
        .globl  _maxofthree
#endif

        .text
#ifdef __ARM_ARCH_ISA_A64
        .align 4
#endif

#ifdef __linux__
maxofthree:
#endif
#ifdef __APPLE__
_maxofthree:
#endif

#if defined __arm64__ || defined __ARM_ARCH_ISA_A64
        cmp     x0, x1                  // compare x with y (signed)
        csel    x0, x0, x1, GT          // x0 = (x > y) ? x : y
        cmp     x0, x2                  // compare max(x, y) with z
        csel    x0, x0, x2, GT          // x0 = (max(x, y) > z) ? max(x, y) : z
        ret                             // the maximum is in x0
#endif

#ifdef __x86_64__
        mov     %rdi, %rax              // result (%rax) starts out as x
        cmp     %rsi, %rax              // is the result less than y?
        cmovl   %rsi, %rax              // if so, the result becomes y
        cmp     %rdx, %rax              // is max(x, y) less than z?
        cmovl   %rdx, %rax              // if so, the result becomes z
        ret                             // the maximum is in %rax
#endif


chaskey.h   Select all
/*
 * chaskey.h
 *
 * Declarations for the Chaskey block-cipher routines. All functions use
 * C linkage so the header can also be included from C++.
 */
#ifndef CHASKEY_H
#define CHASKEY_H

/* Mode selectors. */
#define CHASKEY_ENCRYPT 1
#define CHASKEY_DECRYPT 0

#ifdef __cplusplus
extern "C" {
#endif

void chas_encrypt(int, void *, void *);

/* ckey.S implements this as chaskey(void *mk, void *data). */
void chaskey(void *, void *);

void chas_encryptx(void *, void *);

#ifdef __cplusplus
}
#endif

#endif /* CHASKEY_H */


testckey.c   Select all
// test unit for chaskey #include <stdio.h> #include <string.h> #include <inttypes.h> #include "chaskey.h" uint8_t plain[16]= { 0xb8, 0x23, 0x28, 0x26, 0xfd, 0x5e, 0x40, 0x5e, 0x69, 0xa3, 0x01, 0xa9, 0x78, 0xea, 0x7a, 0xd8 }; uint8_t key[16] = { 0x56, 0x09, 0xe9, 0x68, 0x5f, 0x58, 0xe3, 0x29, 0x40, 0xec, 0xec, 0x98, 0xc5, 0x22, 0x98, 0x2f }; uint8_t cipher[16] = { 0xd5, 0x60, 0x8d, 0x4d, 0xa2, 0xbf, 0x34, 0x7b, 0xab, 0xf8, 0x77, 0x2f, 0xdf, 0xed, 0xde, 0x07 }; int main(void) { uint8_t t[16]; int e; memcpy(t, plain, 16); chaskey(key, t); e = memcmp(t, cipher, 16)==0; printf("\nCHASKEY Encryption: %s\n", e ? "OK" : "FAILED"); printf("I am "); #ifdef __ARM_ARCH_ISA_A64 printf(" __ARM_ARCH_ISA_A64 "); #endif #ifdef __arm64__ printf(" __arm64__ "); #endif #ifdef __x86_64__ printf(" __x86_64__ "); #endif #ifdef __linux__ printf(" __linux__ "); #endif #ifdef __APPLE__ printf(" __APPLE__ "); #endif printf("\n"); return 0; }


ckey.S   Select all
/* ---------------------------------------------------------------------------
 * ckey.S
 *
 * Chaskey block cipher, encryption only, for arm64 and x86_64 (Intel syntax).
 *
 *     void chaskey(void *mk, void *data);
 *
 * mk points to the 128-bit key, data to a 128-bit block that is encrypted in
 * place: the block is XORed with the key, run through 16 rounds of the
 * permutation, and XORed with the key again.
 *
 * Both `chaskey` and `_chaskey` are exported so that the same file links on
 * Linux and on macOS.
 * ------------------------------------------------------------------------- */

        .text
#ifdef __x86_64__
        .intel_syntax noprefix
#endif

        .globl  chaskey
        .globl  _chaskey
#ifdef __ARM_ARCH_ISA_A64
        .align 4
#endif

chaskey:
_chaskey:

#if defined __arm64__ || defined __ARM_ARCH_ISA_A64
        // k[0..3] -> w2..w5
        ldp     w2, w3, [x0]
        ldp     w4, w5, [x0, 8]
        // x[0..3] -> w6..w9
        ldp     w6, w7, [x1]
        ldp     w8, w9, [x1, 8]
        // pre-whitening: x ^= k
        eor     w6, w6, w2              // x[0] ^= k[0];
        eor     w7, w7, w3              // x[1] ^= k[1];
        eor     w8, w8, w4              // x[2] ^= k[2];
        eor     w9, w9, w5              // x[3] ^= k[3];
        mov     w10, 16                 // i = 16 rounds
#endif

#ifdef __x86_64__
        push    rbx
        push    rbp
        push    rsi                     // keep the data pointer for the final store
        // Load the block with lodsd; after the four exchanges below
        // eax = x[0], ebx = x[1], edx = x[2], ebp = x[3].
        lodsd
        xchg    eax, ebp
        lodsd
        xchg    eax, ebx
        lodsd
        xchg    eax, edx
        lodsd
        xchg    eax, ebp
        // pre-whitening: x ^= k
        xor     eax, [rdi   ]
        xor     ebx, [rdi+ 4]
        xor     edx, [rdi+ 8]
        xor     ebp, [rdi+12]
        // rcx = 16 rounds (counter consumed by the loop instruction)
        push    16
        pop     rcx
#endif

L0:
#if defined __arm64__ || defined __ARM_ARCH_ISA_A64
        add     w6, w6, w7              // x[0] += x[1];
        eor     w7, w6, w7, ror 27      // x[1] = R(x[1],27) ^ x[0];
        add     w8, w8, w9              // x[2] += x[3];
        eor     w9, w8, w9, ror 24      // x[3] = R(x[3],24) ^ x[2];
        add     w8, w8, w7              // x[2] += x[1];
        ror     w6, w6, 16
        add     w6, w9, w6              // x[0] = R(x[0],16) + x[3];
        eor     w9, w6, w9, ror 19      // x[3] = R(x[3],19) ^ x[0];
        eor     w7, w8, w7, ror 25      // x[1] = R(x[1],25) ^ x[2];
        ror     w8, w8, 16              // x[2] = R(x[2],16);
        subs    w10, w10, 1             // i--
        bne     L0                      // repeat while i != 0
        // post-whitening: x ^= k
        eor     w6, w6, w2              // x[0] ^= k[0];
        eor     w7, w7, w3              // x[1] ^= k[1];
        eor     w8, w8, w4              // x[2] ^= k[2];
        eor     w9, w9, w5              // x[3] ^= k[3];
        // store the 128-bit ciphertext over the input block
        stp     w6, w7, [x1]
        stp     w8, w9, [x1, 8]
        ret
#endif

#ifdef __x86_64__
        // x[0] += x[1]
        add     eax, ebx
        // x[1] = ROTR32(x[1],27) ^ x[0]
        ror     ebx, 27
        xor     ebx, eax
        // x[2] += x[3]
        add     edx, ebp
        // x[3] = ROTR32(x[3],24) ^ x[2]
        ror     ebp, 24
        xor     ebp, edx
        // x[2] += x[1]
        add     edx, ebx
        // x[0] = ROTR32(x[0],16) + x[3]
        ror     eax, 16
        add     eax, ebp
        // x[3] = ROTR32(x[3],19) ^ x[0]
        ror     ebp, 19
        xor     ebp, eax
        // x[1] = ROTR32(x[1],25) ^ x[2]
        ror     ebx, 25
        xor     ebx, edx
        // x[2] = ROTR32(x[2],16)
        ror     edx, 16
        loop    L0
        // post-whitening: x ^= k
        xor     eax, [rdi   ]
        xor     ebx, [rdi+ 4]
        xor     edx, [rdi+ 8]
        xor     ebp, [rdi+12]
        // rdi = saved data pointer; write x[0..3] back with stosd
        pop     rdi
        stosd
        xchg    eax, ebx
        stosd
        xchg    eax, edx
        stosd
        xchg    eax, ebp
        stosd
        pop     rbp
        pop     rbx
        ret
#endif


speck.h   Select all
/*
 * speck.h
 *
 * Declarations for the SPECK encryption routines implemented in spk64.S
 * and spk128.S. Both use C linkage so the header can also be included
 * from C++.
 */
#ifndef SPECK_H
#define SPECK_H

#ifdef __cplusplus
extern "C" {
#endif

/* spk64.S implements this as speck64(void *mk, void *data). */
void speck64(void *, void *);

/* spk128.S implements this as speck128(void *mk, void *data). */
void speck128(void *, void *);

#ifdef __cplusplus
}
#endif

#endif /* SPECK_H */


testspk.c   Select all
// test unit for speck #include <stdio.h> #include <string.h> #include <inttypes.h> #include "speck.h" void print_bytes(char *s, void *p, int len) { int i; printf("%s : ", s); for (i=0; i<len; i++) { printf ("%02x ", ((uint8_t*)p)[i]); } putchar('\n'); } // SPECK64/128 test vectors // // p = 0x3b7265747475432d uint8_t plain64[]= { 0x74, 0x65, 0x72, 0x3b, 0x2d, 0x43, 0x75, 0x74 }; // c = 0x8c6fa548454e028b uint8_t cipher64[]= { 0x48, 0xa5, 0x6f, 0x8c, 0x8b, 0x02, 0x4e, 0x45 }; // key = 0x03020100, 0x0b0a0908, 0x13121110, 0x1b1a1918 uint8_t key64[]= { 0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b, 0x10, 0x11, 0x12, 0x13, 0x18, 0x19, 0x1a, 0x1b }; // SPECK128/256 test vectors // uint8_t key128[]= { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f }; uint8_t plain128[]= { 0x70, 0x6f, 0x6f, 0x6e, 0x65, 0x72, 0x2e, 0x20, 0x49, 0x6e, 0x20, 0x74, 0x68, 0x6f, 0x73, 0x65}; uint64_t cipher128[2] = {0x4eeeb48d9c188f43, 0x4109010405c0f53e}; #define R(v,n)(((v)>>(n))|((v)<<(64-(n)))) #define F(n)for(i=0;i<n;i++) typedef unsigned long long W; void speck128x(void*mk,void*in){ W i,t,k[4],r[2]; memcpy(r,in,16); memcpy(k,mk,32); F(34) r[1]=(R(r[1],8)+*r)^*k, *r=R(*r,61)^r[1], t=k[3], k[3]=(R(k[1],8)+*k)^i, *k=R(*k,61)^k[3], k[1]=k[2],k[2]=t; memcpy(in,r,16); } int main (void) { uint64_t buf[4]; int equ; // copy plain text to local buffer memcpy (buf, plain64, sizeof(plain64)); speck64(key64, buf); equ = memcmp(cipher64, buf, sizeof(cipher64))==0; printf ("\nSPECK64/128 encryption %s\n", equ ? 
"OK" : "FAILED"); print_bytes("CT result ", buf, sizeof(plain64)); print_bytes("CT expected", cipher64, sizeof(cipher64)); print_bytes("K ", key64, sizeof(key64)); print_bytes("PT", plain64, sizeof(plain64)); // copy plain text to local buffer memcpy (buf, plain128, sizeof(plain128)); #ifdef __USE_C_FUNCTION speck128x(key128, buf); #else speck128(key128, buf); #endif equ = memcmp(cipher128, buf, sizeof(cipher128))==0; printf ("\nSPECK128/256 encryption %s\n", equ ? "OK" : "FAILED"); print_bytes("CT result ", buf, sizeof(plain128)); print_bytes("CT expected", cipher128, sizeof(cipher128)); print_bytes("K ", key128, sizeof(key128)); print_bytes("PT", plain128, sizeof(plain128)); printf("I am "); #ifdef __ARM_ARCH_ISA_A64 printf(" __ARM_ARCH_ISA_A64 "); #endif #ifdef __arm64__ printf(" __arm64__ "); #endif #ifdef __x86_64__ printf(" __x86_64__ "); #endif #ifdef __linux__ printf(" __linux__ "); #endif #ifdef __APPLE__ printf(" __APPLE__ "); #endif printf("\n"); return 0; }


spk64.S   Select all
/* ---------------------------------------------------------------------------
 * spk64.S
 *
 * SPECK-64/128 block cipher, encryption only, for arm64 and x86_64
 * (Intel syntax).
 *
 *     void speck64(void *mk, void *data);
 *
 * mk points to the 128-bit key (four 32-bit words), data to a 64-bit block
 * (two 32-bit words) that is encrypted in place over 27 rounds.
 *
 * Both `speck64` and `_speck64` are exported so that the same file links on
 * Linux and on macOS.
 * ------------------------------------------------------------------------- */

#ifdef __x86_64__
        .intel_syntax noprefix
#endif

        .text
        .globl  speck64
        .globl  _speck64
#ifdef __ARM_ARCH_ISA_A64
        .align 4
#endif

speck64:
_speck64:

#if defined __arm64__ || defined __ARM_ARCH_ISA_A64
        // load the 128-bit key: k0..k3 -> w5..w8
        ldp     w5, w6, [x0]
        ldp     w7, w8, [x0, 8]
        // load the 64-bit plaintext: x0 = x[0] -> w2, x1 = x[1] -> w4
        ldp     w2, w4, [x1]
        mov     w3, wzr                 // i = 0
#endif

#ifdef __x86_64__
        push    rbx
        push    rbp
        push    rsi                     // keep the data pointer for the final store
        lodsd
        xchg    eax, ebx                // ebx = in[0]
        lodsd
        xchg    eax, edx                // edx = in[1]
        // continue reading with lodsd, now from the key
        push    rdi
        pop     rsi
        lodsd
        xchg    eax, edi                // edi = key[0]
        lodsd
        xchg    eax, ebp                // ebp = key[1]
        lodsd
        xchg    eax, ecx                // ecx = key[2]
        lodsd
        xchg    eax, esi                // esi = key[3]
        xor     eax, eax                // i = 0
#endif

L0:
#if defined __arm64__ || defined __ARM_ARCH_ISA_A64
        ror     w2, w2, 8
        add     w2, w2, w4              // x0 = (ROTR(x0, 8) + x1) ^ k0;
        eor     w2, w2, w5              //
        eor     w4, w2, w4, ror 29      // x1 = ROTR(x1, 29) ^ x0;
        mov     w9, w8                  // t = k3
        ror     w6, w6, 8
        add     w8, w5, w6              // k3 = (ROTR(k1, 8) + k0) ^ i;
        eor     w8, w8, w3              //
        eor     w5, w8, w5, ror 29      // k0 = ROTR(k0, 29) ^ k3;
        mov     w6, w7                  // k1 = k2;
        mov     w7, w9                  // k2 = t;
        add     w3, w3, 1               // i++;
        cmp     w3, 27                  // loop until i == 27
        bne     L0
        // store the result over the input block
        stp     w2, w4, [x1]            // x[0] = x0; x[1] = x1;
        ret
#endif

#ifdef __x86_64__
        // ebx = (ROTR32(ebx, 8) + edx) ^ edi;
        ror     ebx, 8
        add     ebx, edx
        xor     ebx, edi
        // edx = ROTR32(edx, 29) ^ ebx;
        ror     edx, 29
        xor     edx, ebx
        // ebp = (ROTR32(ebp, 8) + edi) ^ i;
        ror     ebp, 8
        add     ebp, edi
        xor     ebp, eax
        // edi = ROTR32(edi, 29) ^ ebp;
        ror     edi, 29
        xor     edi, ebp
        // rotate the key words held in ebp, ecx and esi
        xchg    esi, ecx
        xchg    esi, ebp
        // i++
        inc     al
        cmp     al, 27
        jnz     L0
        // rdi = saved data pointer; write the two result words with stosd
        pop     rdi
        xchg    eax, ebx
        stosd
        xchg    eax, edx
        stosd
        pop     rbp
        pop     rbx
        ret
#endif


spk128.S   Select all
/* ---------------------------------------------------------------------------
 * spk128.S
 *
 * SPECK-128/256 block cipher, encryption only, for arm64 and x86_64
 * (Intel syntax).
 *
 *     void speck128(void *mk, void *data);
 *
 * mk points to the 256-bit key (four 64-bit words), data to a 128-bit block
 * (two 64-bit words) that is encrypted in place over 34 rounds.
 *
 * Both `speck128` and `_speck128` are exported so that the same file links
 * on Linux and on macOS.
 * ------------------------------------------------------------------------- */

#ifdef __x86_64__
        .intel_syntax noprefix
#endif

        .text
        .global speck128
        .global _speck128
#ifdef __ARM_ARCH_ISA_A64
        .align 4
#endif

speck128:
_speck128:

#if defined __arm64__ || defined __ARM_ARCH_ISA_A64
        // load the 256-bit key: k0..k3 -> x5..x8
        ldp     x5, x6, [x0]
        ldp     x7, x8, [x0, 16]
        // load the 128-bit plaintext: x[0] -> x2, x[1] -> x4
        ldp     x2, x4, [x1]
        mov     x3, xzr                 // i = 0
#endif

#ifdef __x86_64__
        push    rbp
        push    rbx
        push    rdi
        push    rsi                     // data pointer, re-read before the final store
        // load the 128-bit plaintext: x[0] -> rbp, x[1] -> rsi
        mov     rbp, [rsi   ]
        mov     rsi, [rsi+8 ]
        // load the 256-bit key
        mov     rbx, [rdi   ]           // k0
        mov     rcx, [rdi+ 8]           // k1
        mov     rdx, [rdi+16]           // k2
        mov     rdi, [rdi+24]           // k3
        // i = 0
        xor     eax, eax
#endif

L0:
#if defined __arm64__ || defined __ARM_ARCH_ISA_A64
        ror     x4, x4, 8
        add     x4, x4, x2              // x[1] = (R(x[1], 8) + x[0]) ^ k0;
        eor     x4, x4, x5              //
        eor     x2, x4, x2, ror 61      // x[0] = R(x[0], 61) ^ x[1];
        mov     x9, x8                  // t = k3
        ror     x6, x6, 8
        add     x8, x5, x6              // k3 = (R(k1, 8) + k0) ^ i;
        eor     x8, x8, x3              //
        eor     x5, x8, x5, ror 61      // k0 = R(k0, 61) ^ k3;
        mov     x6, x7                  // k1 = k2;
        mov     x7, x9                  // k2 = t;
        add     x3, x3, 1               // i++;
        cmp     x3, 34                  // loop until i == 34
        bne     L0
        // store the result over the input block
        stp     x2, x4, [x1]
        ret
#endif

#ifdef __x86_64__
        // x[1] = (R(x[1], 8) + x[0]) ^ k[0];
        ror     rsi, 8
        add     rsi, rbp
        xor     rsi, rbx
        // x[0] = R(x[0], 61) ^ x[1];
        ror     rbp, 61
        xor     rbp, rsi
        // rcx = (R(k[1], 8) + k[0]) ^ i;   (i < 34, so XORing the low byte suffices)
        ror     rcx, 8
        add     rcx, rbx
        xor     cl, al
        // k[0] = R(k[0], 61) ^ rcx;
        ror     rbx, 61
        xor     rbx, rcx
        // rotate the key words held in rcx, rdx and rdi
        xchg    rdi, rdx
        xchg    rdi, rcx
        // i++
        inc     al
        cmp     al, 34
        jnz     L0
        // rax = saved data pointer (popped, then pushed back so the
        // register restores below still line up)
        pop     rax
        push    rax
        // save the 128-bit result
        mov     [rax  ], rbp
        mov     [rax+8], rsi
        pop     rsi
        pop     rdi
        pop     rbx
        pop     rbp
        ret
#endif


(2) Compile and Linking
shell script   Select all
# Use clang to compile and link.

# Linux (tested in the Android arm64 Termux app and in Windows 10 WSL2;
# the clang package must be installed):
clang callsum.c sum.S -o callsum
clang callfactorial.c factorial.S -o callfactorial
clang maxofthree.S callmaxofthree.c -o callmaxofthree
clang testckey.c ckey.S -o testckey
clang testspk.c spk64.S spk128.S -o testspk

# macOS (a new M1 machine can run both x86_64 and arm64 binaries once
# Rosetta 2 is installed). Xcode, the Command Line Tools and Rosetta 2
# must be installed to compile.

# macOS, x86_64 binaries:
clang callsum.c sum.S -o callsum_x86_64 -arch x86_64
clang callfactorial.c factorial.S -o callfactorial_x86_64 -arch x86_64
clang maxofthree.S callmaxofthree.c -o callmaxofthree_x86_64 -arch x86_64
clang testckey.c ckey.S -o testckey_x86_64 -arch x86_64
clang testspk.c spk64.S spk128.S -o testspk_x86_64 -arch x86_64

# macOS, arm64 binaries:
clang callsum.c sum.S -o callsum_arm64 -arch arm64
clang callfactorial.c factorial.S -o callfactorial_arm64 -arch arm64
clang maxofthree.S callmaxofthree.c -o callmaxofthree_arm64 -arch arm64
clang testckey.c ckey.S -o testckey_arm64 -arch arm64
clang testspk.c spk64.S spk128.S -o testspk_arm64 -arch arm64

# To debug (e.g. with lldb), add the -g option when compiling; on macOS the
# binary additionally has to be code-signed with entitlements.


(3) Summary of differences
3.1) For clang to run the C preprocessor over an assembler file, the filename extension must be a capital letter .S on Linux. On macOS, a global asm label that is called from C must be prefixed with an underscore (for example, the C function sum is the asm label _sum).
3.2) The A64 (arm64) instruction set does not include explicit stack push and pop instructions. Functions can use stp (store pair of registers) and ldp (load pair of registers) to carry out the push and pop operations, as demonstrated in the factorial.S source code above.
3.3) Most Armv8-64 platforms (e.g. macOS) require quadword (16-byte) alignment of the SP register.
3.4) A64 (arm64) parameter/ results registers are X0-7.   X8 is designated as the Indirect Result Location Parameter and X30 (LR) is the Link Register. If the function has a return value, it will be stored in X0.  A64 (arm64) floating point result registers are S0 or D0  as demo in sum.S
3.5) The x86_64 parameter registers for integers or pointers are %rdi, %rsi, %rdx, %rcx, %r8 and %r9. If the function has a return value, it will be stored in %rax. The x86_64 floating-point result register is %xmm0, as demonstrated in sum.S.
3.6) With the directive .intel_syntax noprefix, x86_64 assembly code can be written in Intel syntax, where the first operand is usually the destination operand, so the operand order is similar to that of arm64 code. In addition, the % register prefix can be omitted when noprefix is used.

(4) To download the above source code using command line
curl -L https://tinyurl.com/mixcasm | grep -A200 START_OF_CALLSUM.C | sed '1d' | sed -n "/END_OF_CALLSUM.C/q;p" | sed 's/&gt;/\>/g;s/&lt;/\</g' > callsum.c
curl -L https://tinyurl.com/mixcasm | grep -A200 START_OF_SUM.S | sed '1d' | sed -n "/END_OF_SUM.S/q;p" | sed 's/&gt;/\>/g;s/&lt;/\</g' > sum.S

curl -L https://tinyurl.com/mixcasm | grep -A200 START_OF_CALLFACTORIAL.C | sed '1d' | sed -n "/END_OF_CALLFACTORIAL.C/q;p" | sed 's/&gt;/\>/g;s/&lt;/\</g' > callfactorial.c
curl -L https://tinyurl.com/mixcasm | grep -A200 START_OF_FACTORIAL.S | sed '1d' | sed -n "/END_OF_FACTORIAL.S/q;p" | sed 's/&gt;/\>/g;s/&lt;/\</g' > factorial.S

curl -L https://tinyurl.com/mixcasm | grep -A200 START_OF_CALLMAXOFTHREE.C | sed '1d' | sed -n "/END_OF_CALLMAXOFTHREE.C/q;p" | sed 's/&gt;/\>/g;s/&lt;/\</g' > callmaxofthree.c
curl -L https://tinyurl.com/mixcasm | grep -A200 START_OF_MAXOFTHREE.S | sed '1d' | sed -n "/END_OF_MAXOFTHREE.S/q;p" | sed 's/&gt;/\>/g;s/&lt;/\</g' > maxofthree.S

curl -L https://tinyurl.com/mixcasm | grep -A200 START_OF_CHASKEY.H | sed '1d' | sed -n "/END_OF_CHASKEY.H/q;p" | sed 's/&gt;/\>/g;s/&lt;/\</g' > chaskey.h
curl -L https://tinyurl.com/mixcasm | grep -A200 START_OF_TESTCKEY.C | sed '1d' | sed -n "/END_OF_TESTCKEY.C/q;p" | sed 's/&gt;/\>/g;s/&lt;/\</g' > testckey.c
curl -L https://tinyurl.com/mixcasm | grep -A200 START_OF_CKEY.S | sed '1d' | sed -n "/END_OF_CKEY.S/q;p" | sed 's/&gt;/\>/g;s/&lt;/\</g' > ckey.S

curl -L https://tinyurl.com/mixcasm | grep -A200 START_OF_SPECK.H | sed '1d' | sed -n "/END_OF_SPECK.H/q;p" | sed 's/&gt;/\>/g;s/&lt;/\</g' > speck.h
curl -L https://tinyurl.com/mixcasm | grep -A200 START_OF_TESTSPK.C | sed '1d' | sed -n "/END_OF_TESTSPK.C/q;p" | sed 's/&gt;/\>/g;s/&lt;/\</g' > testspk.c
curl -L https://tinyurl.com/mixcasm | grep -A200 START_OF_SPK64.S | sed '1d' | sed -n "/END_OF_SPK64.S/q;p" | sed 's/&gt;/\>/g;s/&lt;/\</g' > spk64.S
curl -L https://tinyurl.com/mixcasm | grep -A200 START_OF_SPK128.S | sed '1d' | sed -n "/END_OF_SPK128.S/q;p" | sed 's/&gt;/\>/g;s/&lt;/\</g' > spk128.S



No comments: