(1) These examples demonstrate mixing C and assembly language on x86_64 and arm64, and show the differences between the Linux and macOS environments.
- callsum.c Select all
/*
* callsum.c
*
* Illustrates how to call the sum function in assembly language.
*/
#include <stdio.h>
double sum(double[], unsigned);
/*
 * Sums several prefixes of a fixed sample array with the external sum()
 * routine, prints each result, and then prints which of the target macros
 * were defined when this file was compiled.
 */
int main() {
    double samples[] = { 40.5, 26.7, 21.9, 1.5, -40.5, -23.4 };
    /* Prefix lengths handed to sum(), in the order the results are printed. */
    const unsigned prefix_lengths[] = { 6, 2, 0, 3 };
    unsigned k;

    for (k = 0; k < sizeof prefix_lengths / sizeof prefix_lengths[0]; k++) {
        printf("%20.7f\n", sum(samples, prefix_lengths[k]));
    }

    fputs("I am ", stdout);
#ifdef __ARM_ARCH_ISA_A64
    fputs(" __ARM_ARCH_ISA_A64 ", stdout);
#endif
#ifdef __arm64__
    fputs(" __arm64__ ", stdout);
#endif
#ifdef __x86_64__
    fputs(" __x86_64__ ", stdout);
#endif
#ifdef __linux__
    fputs(" __linux__ ", stdout);
#endif
#ifdef __APPLE__
    fputs(" __APPLE__ ", stdout);
#endif
    putchar('\n');
    return 0;
}
- sum.S Select all
# ---------------------------------------------------------------
# A 64-bit function that returns the sum of the elements in a
# floating-point array. The function has prototype:
#
# double sum(double[] array, unsigned length)
# -----------------------------------------------------------------------
// double sum(double array[], unsigned length)
//
// Returns the sum of the first length doubles stored at array, or 0.0 when
// length is 0.  The array pointer arrives in %rdi (x86_64) or x0 (arm64) and
// the result is returned in %xmm0 or d0.
//
// length is a 32-bit unsigned value, so only the low 32 bits of its argument
// register (%esi on x86_64, w1 on arm64) are tested and counted down; the
// upper half of the 64-bit register is not guaranteed to be zero.
#ifdef __linux__
.global sum
#endif
#ifdef __APPLE__
.global _sum
#endif
.text
#ifdef __ARM_ARCH_ISA_A64
.align 4
#endif
#ifdef __linux__
sum:
#endif
#ifdef __APPLE__
_sum:
#endif
#ifdef __x86_64__
xorpd %xmm0, %xmm0 // initialize the sum to 0
test %esi, %esi // special case for length = 0 (32-bit count)
je done
#endif
#if defined __arm64__ || defined __ARM_ARCH_ISA_A64
movi d0, #0 // initialize the sum to 0
cbz w1, done // special case for length = 0 (32-bit count)
#endif
next:
#ifdef __x86_64__
addsd (%rdi), %xmm0 // add in the current array element
add $8, %rdi // move to next array element
dec %esi // count down the 32-bit length
jnz next // if not done counting, continue
#endif
#if defined __arm64__ || defined __ARM_ARCH_ISA_A64
ldr d16, [x0], #8 // load the current double into scratch d16, then advance x0 by 8
fadd d0, d0, d16 // add in the current array element
subs w1, w1, #1 // count down the 32-bit length
b.ne next // if not done counting, continue
#endif
done:
ret
- callfactorial.c Select all
/*
* An application that illustrates calling the factorial function defined elsewhere.
*/
#include <stdio.h>
#include <inttypes.h>
#ifdef __USE_C_FUNCTION
/*
 * Computes n! as a 64-bit unsigned value; both 0! and 1! are 1.
 * Products that do not fit in 64 bits wrap modulo 2^64.
 */
uint64_t factorial(unsigned n) {
    uint64_t product = 1;

    /* Multiply n, n-1, ..., 2 into the running product. */
    for (unsigned factor = n; factor > 1; factor--) {
        product *= factor;
    }
    return product;
}
#else
uint64_t factorial(unsigned n);
#endif
/*
 * Prints factorial(i) for i = 0..19, then prints which of the target macros
 * were defined when this file was compiled.
 */
int main() {
    for (unsigned i = 0; i < 20; i++) {
        /*
         * factorial() returns uint64_t, so PRIu64 is used: it expands to the
         * matching conversion on every platform, which removes the per-OS
         * %lu / %llu split and also prints on targets that define neither
         * __linux__ nor __APPLE__.
         */
        printf("factorial(%2u) = %" PRIu64 "\n", i, factorial(i));
    }
    printf("I am ");
#ifdef __ARM_ARCH_ISA_A64
    printf(" __ARM_ARCH_ISA_A64 ");
#endif
#ifdef __arm64__
    printf(" __arm64__ ");
#endif
#ifdef __x86_64__
    printf(" __x86_64__ ");
#endif
#ifdef __linux__
    printf(" __linux__ ");
#endif
#ifdef __APPLE__
    printf(" __APPLE__ ");
#endif
    printf("\n");
    return 0;
}
- factorial.S Select all
# ----------------------------------------------------------------------------
# A 64-bit recursive implementation of the function
#
# uint64_t factorial(unsigned n)
#
# implemented recursively
# ----------------------------------------------------------------------------
// uint64_t factorial(unsigned n)
//
// Recursive factorial.  n arrives in the first integer argument register
// (w0 on arm64, %edi on x86_64) and the 64-bit result is returned in x0 or
// %rax.  Only the low 32 bits of the argument register are defined for a
// 32-bit parameter, so the comparison is done on the 32-bit register and n
// is zero-extended before it takes part in the 64-bit multiply.
#ifdef __linux__
.globl factorial
#endif
#ifdef __APPLE__
.globl _factorial
#endif
.text
#ifdef __ARM_ARCH_ISA_A64
.align 4
#endif
#ifdef __linux__
factorial:
#endif
#ifdef __APPLE__
_factorial:
#endif
#if defined __arm64__ || defined __ARM_ARCH_ISA_A64
cmp w0, #1 //# n > 1? (unsigned compare on the 32-bit argument in w0)
b.hi L1 //# if yes, go do a recursive call
mov x0, #1 //# otherwise return 1
ret
#endif
#ifdef __x86_64__
cmp $1, %edi # n <= 1? (32-bit argument)
jnbe L1 # if not, go do a recursive call
mov $1, %rax # otherwise return 1
ret
#endif
L1:
#if defined __arm64__ || defined __ARM_ARCH_ISA_A64
mov w9, w0 //# copy n to x9; writing w9 zero-extends into x9
STP X9, LR, [SP, #-16]! //# push n and LR(x30); one 16-byte slot keeps SP aligned
// LR is used to return from subroutine
sub w0, w0, #1 //# n-1 becomes the argument of the recursive call
#ifdef __linux__
bl factorial //# factorial(n-1), result goes in x0
#endif
#ifdef __APPLE__
bl _factorial //# factorial(n-1), result goes in x0
#endif
LDP X9, LR, [SP], #16 //# pop n and LR(x30)
mul x0, x0, x9 //# n * factorial(n-1), stored in x0
ret
#endif
#ifdef __x86_64__
mov %edi, %edi # zero-extend n to 64 bits
push %rdi # save n on stack (also aligns %rsp!)
dec %edi # n-1
#ifdef __linux__
call factorial # factorial(n-1), result goes in %rax
#endif
#ifdef __APPLE__
call _factorial # factorial(n-1), result goes in %rax
#endif
pop %rdi # restore n
imul %rdi, %rax # n * factorial(n-1), stored in %rax
ret
#endif
- callmaxofthree.c Select all
/*
* callmaxofthree.c
*
* A small program that illustrates how to call the maxofthree function we wrote in
* assembly language.
*/
#include <stdio.h>
#include <inttypes.h>
int64_t maxofthree(int64_t, int64_t, int64_t);
/*
 * Calls maxofthree() on six sample triples and prints each result, then
 * prints which of the target macros were defined at compile time.
 */
int main() {
    /*
     * maxofthree() returns int64_t; PRId64 expands to the matching printf
     * conversion on every platform, so the calls no longer need to be
     * duplicated under __linux__ (%ld) and __APPLE__ (%lld), and results
     * are also printed on targets that define neither macro.
     */
    printf("%" PRId64 "\n", maxofthree(1, -4, -7));
    printf("%" PRId64 "\n", maxofthree(2, -6, 1));
    printf("%" PRId64 "\n", maxofthree(2, 3, 1));
    printf("%" PRId64 "\n", maxofthree(-2, 4, 3));
    printf("%" PRId64 "\n", maxofthree(2, -6, 5));
    printf("%" PRId64 "\n", maxofthree(2, 4, 6));
    printf("I am ");
#ifdef __ARM_ARCH_ISA_A64
    printf(" __ARM_ARCH_ISA_A64 ");
#endif
#ifdef __arm64__
    printf(" __arm64__ ");
#endif
#ifdef __x86_64__
    printf(" __x86_64__ ");
#endif
#ifdef __linux__
    printf(" __linux__ ");
#endif
#ifdef __APPLE__
    printf(" __APPLE__ ");
#endif
    printf("\n");
    return 0;
}
- maxofthree.S Select all
# -----------------------------------------------------------------------------
# A 64-bit function that returns the maximum value of its three 64-bit integer
# arguments. The function has signature:
#
# int64_t maxofthree(int64_t x, int64_t y, int64_t z)
#
# Note that the parameters for x86_64 have already been passed in rdi, rsi, and rdx,
# and the parameters for arm64 have already been passed in x0, x1, and x2. We
# just have to return the value in rax (x86_64) or x0 (arm64).
# -----------------------------------------------------------------------------
// int64_t maxofthree(int64_t x, int64_t y, int64_t z)
// The entry point is exported as maxofthree under __linux__ and as
// _maxofthree under __APPLE__.  Both implementations use signed compares.
#ifdef __linux__
.globl maxofthree
#endif
#ifdef __APPLE__
.globl _maxofthree
#endif
.text
#ifdef __ARM_ARCH_ISA_A64
.align 4
#endif
#ifdef __linux__
maxofthree:
#endif
#ifdef __APPLE__
_maxofthree:
#endif
#if defined __arm64__ || defined __ARM_ARCH_ISA_A64
cmp x0, x1 //# is x0 > x1 (signed)
csel x0, x0, x1, GT // if GT, x0 = x0 else x0 = x1, so x0 = max(x, y)
cmp x0, x2 //# is max(x, y) > x2 (signed)
csel x0, x0, x2, GT // if GT, x0 = x0 else x0 = x2
ret //# the max will be in x0
#endif
#ifdef __x86_64__
mov %rdi, %rax # result (rax) initially holds x
cmp %rsi, %rax # is x less than y?
cmovl %rsi, %rax # if so, set result to y
cmp %rdx, %rax # is max(x,y) less than z?
cmovl %rdx, %rax # if so, set result to z
ret # the max will be in rax
#endif
- chaskey.h Select all
/* Public declarations for the Chaskey block cipher routines. */
#ifndef CHASKEY_H
#define CHASKEY_H
/* Mode selectors; presumably the int argument of chas_encrypt() -- TODO confirm. */
#define CHASKEY_ENCRYPT 1
#define CHASKEY_DECRYPT 0
#ifdef __cplusplus
extern "C" {
#endif
/* NOTE(review): no definition of chas_encrypt() accompanies this header here. */
void chas_encrypt(int, void*, void*);
/*
 * chaskey(mk, data): encrypts the 128-bit block at data in place using the
 * 128-bit key at mk (implemented in ckey.S).
 */
void chaskey(void*, void*);
/* NOTE(review): no definition of chas_encryptx() accompanies this header here. */
void chas_encryptx(void*, void*);
#ifdef __cplusplus
}
#endif
#endif
- testckey.c Select all
// test unit for chaskey
#include <stdio.h>
#include <string.h>
#include <inttypes.h>
#include "chaskey.h"
/* 16-byte plaintext block that main() copies and passes to chaskey(). */
uint8_t plain[16]=
{ 0xb8, 0x23, 0x28, 0x26,
0xfd, 0x5e, 0x40, 0x5e,
0x69, 0xa3, 0x01, 0xa9,
0x78, 0xea, 0x7a, 0xd8 };
/* 16-byte key passed to chaskey() as its first argument. */
uint8_t key[16] =
{ 0x56, 0x09, 0xe9, 0x68,
0x5f, 0x58, 0xe3, 0x29,
0x40, 0xec, 0xec, 0x98,
0xc5, 0x22, 0x98, 0x2f };
/* Expected 16-byte result; main() compares the encrypted block against it. */
uint8_t cipher[16] =
{ 0xd5, 0x60, 0x8d, 0x4d,
0xa2, 0xbf, 0x34, 0x7b,
0xab, 0xf8, 0x77, 0x2f,
0xdf, 0xed, 0xde, 0x07 };
/*
 * Encrypts a copy of the sample plaintext with chaskey(), reports whether
 * the result equals the expected ciphertext, and then prints which of the
 * target macros were defined at compile time.
 */
int main(void)
{
    uint8_t block[16];
    const char *verdict;

    /* chaskey() works in place, so operate on a private copy. */
    memcpy(block, plain, sizeof block);
    chaskey(key, block);

    if (memcmp(block, cipher, sizeof block) == 0) {
        verdict = "OK";
    } else {
        verdict = "FAILED";
    }
    printf("\nCHASKEY Encryption: %s\n", verdict);

    fputs("I am ", stdout);
#ifdef __ARM_ARCH_ISA_A64
    fputs(" __ARM_ARCH_ISA_A64 ", stdout);
#endif
#ifdef __arm64__
    fputs(" __arm64__ ", stdout);
#endif
#ifdef __x86_64__
    fputs(" __x86_64__ ", stdout);
#endif
#ifdef __linux__
    fputs(" __linux__ ", stdout);
#endif
#ifdef __APPLE__
    fputs(" __APPLE__ ", stdout);
#endif
    putchar('\n');
    return 0;
}
- ckey.S Select all
// CHASKEY in ARM64 assembly
// Chaskey-LTS Block Cipher in AMD64 assembly (Encryption only)
.text
#ifdef __x86_64__
.intel_syntax noprefix
#endif
// Both the plain and the underscore-prefixed symbol are exported and
// defined, so the routine is reachable under either name.
.globl chaskey
.globl _chaskey
#ifdef __ARM_ARCH_ISA_A64
.align 4
#endif
// chaskey(void*mk, void*data);
// Encrypts the 128-bit block at data in place with the 128-bit key at mk:
// XOR the block with the key, run 16 rounds, XOR with the key again.
chaskey:
_chaskey:
#if defined __arm64__ || defined __ARM_ARCH_ISA_A64
// load 128-bit key
ldp w2, w3, [x0]
ldp w4, w5, [x0, 8]
// load 128-bit plain text
ldp w6, w7, [x1]
ldp w8, w9, [x1, 8]
// xor plaintext with key
eor w6, w6, w2 // x[0] ^= k[0];
eor w7, w7, w3 // x[1] ^= k[1];
eor w8, w8, w4 // x[2] ^= k[2];
eor w9, w9, w5 // x[3] ^= k[3];
mov w10, 16 // i = 16
#endif
#ifdef __x86_64__
// .intel_syntax noprefix
push rbx
push rbp
// rsi (data pointer) is pushed last so it can be popped into rdi for the
// stosd stores at the end
push rsi
# load plaintext
lodsd
xchg eax, ebp
lodsd
xchg eax, ebx
lodsd
xchg eax, edx
lodsd
xchg eax, ebp
// now eax = x[0], ebx = x[1], edx = x[2], ebp = x[3]
# pre-whiten
xor eax, [rdi ]
xor ebx, [rdi+ 4]
xor edx, [rdi+ 8]
xor ebp, [rdi+12]
// rcx = 16, the round counter decremented by the loop instruction
push 16
pop rcx
#endif
// L0: top of the round loop (shared label; only one ISA section is assembled)
L0:
#if defined __arm64__ || defined __ARM_ARCH_ISA_A64
add w6, w6, w7 // x[0] += x[1];
eor w7, w6, w7, ror 27 // x[1]=R(x[1],27) ^ x[0];
add w8, w8, w9 // x[2] += x[3];
eor w9, w8, w9, ror 24 // x[3]=R(x[3],24) ^ x[2];
add w8, w8, w7 // x[2] += x[1];
ror w6, w6, 16
add w6, w9, w6 // x[0]=R(x[0],16) + x[3];
eor w9, w6, w9, ror 19 // x[3]=R(x[3],19) ^ x[0];
eor w7, w8, w7, ror 25 // x[1]=R(x[1],25) ^ x[2];
ror w8, w8, 16 // x[2]=R(x[2],16);
subs w10, w10, 1 // i--
bne L0 // loop while i != 0
// xor cipher text with key
eor w6, w6, w2 // x[0] ^= k[0];
eor w7, w7, w3 // x[1] ^= k[1];
eor w8, w8, w4 // x[2] ^= k[2];
eor w9, w9, w5 // x[3] ^= k[3];
// save 128-bit cipher text
stp w6, w7, [x1]
stp w8, w9, [x1, 8]
ret
#endif
#ifdef __x86_64__
// .intel_syntax noprefix
# x[0] += x[1]#
add eax, ebx
# x[1]=ROTR32(x[1],27) ^ x[0]
ror ebx, 27
xor ebx, eax
# x[2] += x[3]#
add edx, ebp
# x[3]=ROTR32(x[3],24) ^ x[2]
ror ebp, 24
xor ebp, edx
# x[2] += x[1]#
add edx, ebx
# x[0]=ROTR32(x[0],16) + x[3]
ror eax, 16
add eax, ebp
# x[3]=ROTR32(x[3],19) ^ x[0]
ror ebp, 19
xor ebp, eax
# x[1]=ROTR32(x[1],25) ^ x[2]
ror ebx, 25
xor ebx, edx
# x[2]=ROTR32(x[2],16)
ror edx, 16
loop L0
# post-whiten
xor eax, [rdi ]
xor ebx, [rdi+ 4]
xor edx, [rdi+ 8]
xor ebp, [rdi+12]
// rdi = data pointer saved by the earlier push rsi
pop rdi
# save ciphertext
stosd
xchg eax, ebx
stosd
xchg eax, edx
stosd
xchg eax, ebp
stosd
pop rbp
pop rbx
ret
#endif
- speck.h Select all
/* Public declarations for the SPECK block cipher routines (encryption only). */
#ifndef SPECK_H
#define SPECK_H
#ifdef __cplusplus
extern "C" {
#endif
/*
 * speck64(mk, data): SPECK64/128 -- encrypts the 64-bit block at data in
 * place using the 128-bit key at mk (implemented in spk64.S).
 */
void speck64(void*, void*);
/*
 * speck128(mk, data): SPECK128/256 -- encrypts the 128-bit block at data in
 * place using the 256-bit key at mk (implemented in spk128.S).
 */
void speck128(void*, void*);
#ifdef __cplusplus
}
#endif
#endif
- testspk.c Select all
// test unit for speck
#include <stdio.h>
#include <string.h>
#include <inttypes.h>
#include "speck.h"
/*
 * Writes the label s followed by " : ", then the len bytes starting at p
 * as two-digit lowercase hex values each followed by a space, and finally
 * a newline.  Nothing is printed for the bytes when len is not positive.
 */
void print_bytes(char *s, void *p, int len) {
    const uint8_t *cursor = (const uint8_t *)p;
    int remaining;

    printf("%s : ", s);
    for (remaining = len; remaining > 0; remaining--) {
        printf("%02x ", *cursor);
        cursor++;
    }
    putchar('\n');
}
// SPECK64/128 test vectors
//
// p = 0x3b7265747475432d
/* 8-byte SPECK64/128 plaintext block (the value p above, stored as two 32-bit words). */
uint8_t plain64[]=
{ 0x74, 0x65, 0x72, 0x3b,
0x2d, 0x43, 0x75, 0x74 };
// c = 0x8c6fa548454e028b
/* Expected 8-byte SPECK64/128 ciphertext that main() compares against. */
uint8_t cipher64[]=
{ 0x48, 0xa5, 0x6f, 0x8c,
0x8b, 0x02, 0x4e, 0x45 };
// key = 0x03020100, 0x0b0a0908, 0x13121110, 0x1b1a1918
/* 16-byte SPECK64/128 key: the four 32-bit words above in little-endian byte order. */
uint8_t key64[]=
{ 0x00, 0x01, 0x02, 0x03,
0x08, 0x09, 0x0a, 0x0b,
0x10, 0x11, 0x12, 0x13, 0x18, 0x19, 0x1a, 0x1b };
// SPECK128/256 test vectors
//
/* 32-byte SPECK128/256 key (bytes 0x00 .. 0x1f). */
uint8_t key128[]=
{ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f };
/* 16-byte SPECK128/256 plaintext block. */
uint8_t plain128[]= { 0x70, 0x6f, 0x6f, 0x6e, 0x65, 0x72, 0x2e, 0x20,
0x49, 0x6e, 0x20, 0x74, 0x68, 0x6f, 0x73, 0x65};
/*
 * Expected SPECK128/256 ciphertext as two 64-bit words.
 * NOTE(review): main() compares this bytewise with memcmp, so the match
 * depends on the host byte order of these words.
 */
uint64_t cipher128[2] = {0x4eeeb48d9c188f43, 0x4109010405c0f53e};
/* Rotates the 64-bit value v right by n bits (0 < n < 64). */
#define R(v,n)(((v)>>(n))|((v)<<(64-(n))))
/* Loop header running the next statement for i = 0 .. n-1; the caller declares i. */
#define F(n)for(i=0;i<n;i++)
typedef unsigned long long W;

/*
 * C reference for SPECK128/256 encryption: encrypts the 16-byte block at
 * in in place using the 32-byte key at mk.  The key is copied to a local
 * buffer and expanded on the fly, so the caller's key is not modified.
 */
void speck128x(void*mk,void*in){
    W block[2];
    W key[4];
    W round;

    memcpy(block, in, sizeof block);
    memcpy(key, mk, sizeof key);

    for (round = 0; round < 34; round++) {
        W fresh;

        /* Data round, keyed with the current round key key[0]. */
        block[1] = (R(block[1], 8) + block[0]) ^ key[0];
        block[0] = R(block[0], 61) ^ block[1];

        /* Key schedule: derive the next round key and shift the other words down. */
        fresh = (R(key[1], 8) + key[0]) ^ round;
        key[0] = R(key[0], 61) ^ fresh;
        key[1] = key[2];
        key[2] = key[3];
        key[3] = fresh;
    }

    memcpy(in, block, sizeof block);
}
/*
 * Runs the SPECK64/128 and SPECK128/256 test vectors: encrypts a copy of
 * each plaintext, reports whether it matches the expected ciphertext, dumps
 * the buffers involved, and finally prints which of the target macros were
 * defined at compile time.
 */
int main (void)
{
    uint64_t work[4];
    const char *status;

    /* SPECK64/128: encrypt a private copy of the plaintext in place. */
    memcpy(work, plain64, sizeof(plain64));
    speck64(key64, work);
    status = (memcmp(cipher64, work, sizeof(cipher64)) == 0) ? "OK" : "FAILED";
    printf("\nSPECK64/128 encryption %s\n", status);
    print_bytes("CT result ", work, sizeof(plain64));
    print_bytes("CT expected", cipher64, sizeof(cipher64));
    print_bytes("K ", key64, sizeof(key64));
    print_bytes("PT", plain64, sizeof(plain64));

    /* SPECK128/256: same procedure, using the C or the assembly routine. */
    memcpy(work, plain128, sizeof(plain128));
#ifdef __USE_C_FUNCTION
    speck128x(key128, work);
#else
    speck128(key128, work);
#endif
    status = (memcmp(cipher128, work, sizeof(cipher128)) == 0) ? "OK" : "FAILED";
    printf("\nSPECK128/256 encryption %s\n", status);
    print_bytes("CT result ", work, sizeof(plain128));
    print_bytes("CT expected", cipher128, sizeof(cipher128));
    print_bytes("K ", key128, sizeof(key128));
    print_bytes("PT", plain128, sizeof(plain128));

    fputs("I am ", stdout);
#ifdef __ARM_ARCH_ISA_A64
    fputs(" __ARM_ARCH_ISA_A64 ", stdout);
#endif
#ifdef __arm64__
    fputs(" __arm64__ ", stdout);
#endif
#ifdef __x86_64__
    fputs(" __x86_64__ ", stdout);
#endif
#ifdef __linux__
    fputs(" __linux__ ", stdout);
#endif
#ifdef __APPLE__
    fputs(" __APPLE__ ", stdout);
#endif
    putchar('\n');
    return 0;
}
- spk64.S Select all
// SPECK64/128 in ARM64 assembly
// SPECK-64/128 Block Cipher in x86 assembly (Encryption only)
#ifdef __x86_64__
.intel_syntax noprefix
#endif
.text
// Both the plain and the underscore-prefixed symbol are exported and defined.
.globl speck64
.globl _speck64
#ifdef __ARM_ARCH_ISA_A64
.align 4
#endif
// speck64(void*mk, void*data);
// Encrypts the 64-bit block at data in place with the 128-bit key at mk,
// running 27 rounds and expanding the key inside the loop.
speck64:
_speck64:
#if defined __arm64__ || defined __ARM_ARCH_ISA_A64
// load 128-bit key
// k0 = k[0]; k1 = k[1]; k2 = k[2]; k3 = k[3];
ldp w5, w6, [x0]
ldp w7, w8, [x0, 8]
// load 64-bit plain text
ldp w2, w4, [x1] // x0 = x[0]; x1 = x[1];
mov w3, wzr // i=0
#endif
#ifdef __x86_64__
push rbx
push rbp
push rsi # save
lodsd
xchg eax, ebx # ebx = in[0]
lodsd
xchg eax, edx # edx = in[1]
// continue reading with lodsd from the key: rsi = rdi (mk)
push rdi
pop rsi
lodsd
xchg eax, edi # edi = key[0]
lodsd
xchg eax, ebp # ebp = key[1]
lodsd
xchg eax, ecx # ecx = key[2]
lodsd
xchg eax, esi # esi = key[3]
xor eax, eax # i = 0
#endif
// L0: top of the round loop (shared label; only one ISA section is assembled)
L0:
#if defined __arm64__ || defined __ARM_ARCH_ISA_A64
ror w2, w2, 8
add w2, w2, w4 // x0 = (R(x0, 8) + x1) ^ k0;
eor w2, w2, w5 //
eor w4, w2, w4, ror 29 // x1 = R(x1, 29) ^ x0; (rotate right by 29)
mov w9, w8 // backup k3
ror w6, w6, 8
add w8, w5, w6 // k3 = (R(k1, 8) + k0) ^ i;
eor w8, w8, w3 //
eor w5, w8, w5, ror 29 // k0 = R(k0, 29) ^ k3; (rotate right by 29)
mov w6, w7 // k1 = k2;
mov w7, w9 // k2 = t;
add w3, w3, 1 // i++;
cmp w3, 27 // i < 27;
bne L0
// save result
stp w2, w4, [x1] // x[0] = x0; x[1] = x1;
ret
#endif
#ifdef __x86_64__
# ebx = (ROTR32(ebx, 8) + edx) ^ edi;
ror ebx, 8
add ebx, edx
xor ebx, edi
# edx = ROTR32(edx, 29) ^ ebx;
ror edx, 29
xor edx, ebx
# ebp = (ROTR32(ebp, 8) + edi) ^ i;
ror ebp, 8
add ebp, edi
xor ebp, eax
# edi = ROTR32(edi, 29) ^ ebp;
ror edi, 29
xor edi, ebp
// rotate the remaining key words: ebp takes old ecx, ecx takes old esi,
// esi takes the word just computed in ebp
xchg esi, ecx
xchg esi, ebp
# i++
inc al
cmp al, 27
jnz L0
// rdi = data pointer saved by the earlier push rsi
pop rdi
xchg eax, ebx
stosd
xchg eax, edx
stosd
pop rbp
pop rbx
ret
#endif
- spk128.S Select all
// SPECK128/256 in ARM64 assembly
// SPECK-128/256 Block Cipher in AMD64 assembly (Encryption only)
#ifdef __x86_64__
.intel_syntax noprefix
#endif
.text
// Both the plain and the underscore-prefixed symbol are exported and defined.
.global speck128
.global _speck128
#ifdef __ARM_ARCH_ISA_A64
.align 4
#endif
// speck128(void*mk, void*data);
// Encrypts the 128-bit block at data in place with the 256-bit key at mk,
// running 34 rounds and expanding the key inside the loop.
speck128:
_speck128:
#if defined __arm64__ || defined __ARM_ARCH_ISA_A64
// load 256-bit key
// k0 = k[0]; k1 = k[1]; k2 = k[2]; k3 = k[3];
ldp x5, x6, [x0]
ldp x7, x8, [x0, 16]
// load 128-bit plain text
ldp x2, x4, [x1] // x0 = x[0]; x1 = x[1];
mov x3, xzr // i=0
#endif
#ifdef __x86_64__
push rbp
push rbx
push rdi
push rsi
# load 128-bit plaintext
mov rbp, [rsi ]
mov rsi, [rsi+8]
# load 256-bit key
mov rbx, [rdi ] # k0
mov rcx, [rdi+ 8] # k1
mov rdx, [rdi+16] # k2
mov rdi, [rdi+24] # k3
# i = 0
xor eax, eax
#endif
// L0: top of the round loop (shared label; only one ISA section is assembled)
L0:
#if defined __arm64__ || defined __ARM_ARCH_ISA_A64
ror x4, x4, 8
add x4, x4, x2 // x1 = (R(x1, 8) + x0) ^ k0;
eor x4, x4, x5 //
eor x2, x4, x2, ror 61 // x0 = R(x0, 61) ^ x1;
mov x9, x8 // backup k3
ror x6, x6, 8
add x8, x5, x6 // k3 = (R(k1, 8) + k0) ^ i;
eor x8, x8, x3 //
eor x5, x8, x5, ror 61 // k0 = R(k0, 61) ^ k3;
mov x6, x7 // k1 = k2;
mov x7, x9 // k2 = t;
add x3, x3, 1 // i++;
cmp x3, 34 // i < 34;
bne L0
// save result
stp x2, x4, [x1] // x[0] = x0; x[1] = x1;
ret
#endif
#ifdef __x86_64__
# x[1] = (R(x[1], 8) + x[0]) ^ k[0];
ror rsi, 8
add rsi, rbp
xor rsi, rbx
# x[0] = R(x[0], 61) ^ x[1];
ror rbp, 61
xor rbp, rsi
# k[1] = (R(k[1], 8) + k[0]) ^ i;
ror rcx, 8
add rcx, rbx
// i stays below 34, so XOR-ing only the low byte is enough
xor cl, al
# k[0] = R(k[0], 61) ^ k[3];
ror rbx, 61
xor rbx, rcx
# X(k3, k2), X(k3, k1);
xchg rdi, rdx
xchg rdi, rcx
# i++
inc al
cmp al, 34
jnz L0
// fetch the saved data pointer (pushed from rsi) and put it back on the
// stack so the pops below still restore every register
pop rax
push rax
# save 128-bit result
mov [rax ], rbp
mov [rax+8], rsi
pop rsi
pop rdi
pop rbx
pop rbp
ret
#endif
(2) Compile and Linking
- shell script Select all
# Use clang to compile and link each C driver together with its assembly source.
# Linux builds (tested in the Termux app on Android arm64 and in WSL2 on Windows 10; the clang package must be installed):
clang callsum.c sum.S -o callsum;
clang callfactorial.c factorial.S -o callfactorial;
clang maxofthree.S callmaxofthree.c -o callmaxofthree;
clang testckey.c ckey.S -o testckey;
clang testspk.c spk64.S spk128.S -o testspk;
# macOS builds: -arch selects the target, producing one x86_64 and one arm64 binary per program (an M1 machine can run both once Rosetta 2 is installed). Xcode, the Command Line Tools and Rosetta 2 must be installed.
clang callsum.c sum.S -o callsum_x86_64 -arch x86_64;
clang callfactorial.c factorial.S -o callfactorial_x86_64 -arch x86_64;
clang maxofthree.S callmaxofthree.c -o callmaxofthree_x86_64 -arch x86_64;
clang testckey.c ckey.S -o testckey_x86_64 -arch x86_64;
clang testspk.c spk64.S spk128.S -o testspk_x86_64 -arch x86_64;
clang callsum.c sum.S -o callsum_arm64 -arch arm64;
clang callfactorial.c factorial.S -o callfactorial_arm64 -arch arm64;
clang maxofthree.S callmaxofthree.c -o callmaxofthree_arm64 -arch arm64;
clang testckey.c ckey.S -o testckey_arm64 -arch arm64;
clang testspk.c spk64.S spk128.S -o testspk_arm64 -arch arm64;
To debug (for example with lldb), add the -g option when compiling; in addition, on macOS the binary has to be codesigned with entitlements.
(3) Summary of differences
3.1) For clang to run the C preprocessor on an assembler file, the filename extension must be a capital letter S (.S) on Linux. On macOS, the global asm labels of subroutines called from C must be prefixed with an underscore.
3.2) The A64 (arm64) instruction set does not include an explicit stack push instruction. Functions can use stp (store pair of registers) and ldp (load pair of registers) to carry out the push and pop operations, as demonstrated in the factorial.S source code above.
3.3) Most Armv8-64 platforms (e.g. macOS) require quadword (16-byte) alignment of the SP register.
3.4) The A64 (arm64) parameter/result registers are X0-X7. X8 is designated as the Indirect Result Location Parameter and X30 (LR) is the Link Register. If the function has a return value, it is stored in X0. The A64 (arm64) floating-point result registers are S0 (float) or D0 (double), as demonstrated in sum.S.
3.5) The x86_64 parameter registers for integers or pointers are %rdi, %rsi, %rdx, %rcx, %r8 and %r9. If the function has a return value, it is stored in %rax. The x86_64 floating-point result register is %xmm0, as demonstrated in sum.S.
3.6) With the directive .intel_syntax noprefix, x86_64 assembly code can be written in Intel syntax, where the first operand is usually the destination, the same operand order as in arm64 code. In addition, the % register prefix can be omitted when noprefix is used.
(4) To download the above source code using command line
# Each command downloads the page, keeps the lines between the START_OF_<FILE> and END_OF_<FILE> markers,
# and decodes the HTML entities &gt; and &lt; back to > and < (needed e.g. for #include <stdio.h>).
curl -L https://tinyurl.com/mixcasm | grep -A200 START_OF_CALLSUM.C | sed '1d' | sed -n "/END_OF_CALLSUM.C/q;p" | sed 's/&gt;/>/g;s/&lt;/</g' > callsum.c
curl -L https://tinyurl.com/mixcasm | grep -A200 START_OF_SUM.S | sed '1d' | sed -n "/END_OF_SUM.S/q;p" | sed 's/&gt;/>/g;s/&lt;/</g' > sum.S
curl -L https://tinyurl.com/mixcasm | grep -A200 START_OF_CALLFACTORIAL.C | sed '1d' | sed -n "/END_OF_CALLFACTORIAL.C/q;p" | sed 's/&gt;/>/g;s/&lt;/</g' > callfactorial.c
curl -L https://tinyurl.com/mixcasm | grep -A200 START_OF_FACTORIAL.S | sed '1d' | sed -n "/END_OF_FACTORIAL.S/q;p" | sed 's/&gt;/>/g;s/&lt;/</g' > factorial.S
curl -L https://tinyurl.com/mixcasm | grep -A200 START_OF_CALLMAXOFTHREE.C | sed '1d' | sed -n "/END_OF_CALLMAXOFTHREE.C/q;p" | sed 's/&gt;/>/g;s/&lt;/</g' > callmaxofthree.c
curl -L https://tinyurl.com/mixcasm | grep -A200 START_OF_MAXOFTHREE.S | sed '1d' | sed -n "/END_OF_MAXOFTHREE.S/q;p" | sed 's/&gt;/>/g;s/&lt;/</g' > maxofthree.S
curl -L https://tinyurl.com/mixcasm | grep -A200 START_OF_CHASKEY.H | sed '1d' | sed -n "/END_OF_CHASKEY.H/q;p" | sed 's/&gt;/>/g;s/&lt;/</g' > chaskey.h
curl -L https://tinyurl.com/mixcasm | grep -A200 START_OF_TESTCKEY.C | sed '1d' | sed -n "/END_OF_TESTCKEY.C/q;p" | sed 's/&gt;/>/g;s/&lt;/</g' > testckey.c
curl -L https://tinyurl.com/mixcasm | grep -A200 START_OF_CKEY.S | sed '1d' | sed -n "/END_OF_CKEY.S/q;p" | sed 's/&gt;/>/g;s/&lt;/</g' > ckey.S
curl -L https://tinyurl.com/mixcasm | grep -A200 START_OF_SPECK.H | sed '1d' | sed -n "/END_OF_SPECK.H/q;p" | sed 's/&gt;/>/g;s/&lt;/</g' > speck.h
curl -L https://tinyurl.com/mixcasm | grep -A200 START_OF_TESTSPK.C | sed '1d' | sed -n "/END_OF_TESTSPK.C/q;p" | sed 's/&gt;/>/g;s/&lt;/</g' > testspk.c
curl -L https://tinyurl.com/mixcasm | grep -A200 START_OF_SPK64.S | sed '1d' | sed -n "/END_OF_SPK64.S/q;p" | sed 's/&gt;/>/g;s/&lt;/</g' > spk64.S
curl -L https://tinyurl.com/mixcasm | grep -A200 START_OF_SPK128.S | sed '1d' | sed -n "/END_OF_SPK128.S/q;p" | sed 's/&gt;/>/g;s/&lt;/</g' > spk128.S