AArch64上绝大多数指令都有3个操作数:一个目标寄存器和2个源寄存器。
例如:
add w5, w3, w4 // w5 ← w3 + w4 复制代码
或者:
add x5, x3, x4 // x5 ← x3 + x4 复制代码
也可以使用第32个通用寄存器(零寄存器wzr/xzr),或者使用立即数:
add w0, w1, wzr // w0 ← w1 + 0 复制代码
add w0, w1, #2 // w0 ← w1 + 2 复制代码
add w0, w1, #-2 // w0 ← w1 + (-2) 复制代码
这里要注意的是如果是立即数,只有第二个源操作数才被允许是立即数。
减法同理。
需要注意的是:减法中还有rsb(Reverse Subtract,反向减法)指令,它的被减数和减数与sub相反。
乘法
在32位中有mul,smull,umull。
Mul不会更新cpsr。
Smull通过补码相乘。
Umull是无符号值。
语法:
{s,u}mull RdestLower, RdestHigher, Rsource1, Rsource2
乘法示例
一个64位的数变成两部分,高32位和低32位。
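$n = 2^{32} \times n_{higher} + n_{lower}$
例子:$(2^{32} \times 2 + 3755744309) \times 12345678$
设两个64位数 $x$ 和 $y$ 的乘积为 $z$:
$z = x \times y = (2^{32} \times x_1 + x_0) \times (2^{32} \times y_1 + y_0)$
那么 $z = 2^{64} \times x_1 \times y_1 + 2^{32} \times (x_0 \times y_1 + x_1 \times y_0) + x_0 \times y_0$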
/*
 * mult64.s -- 64-bit multiplication on 32-bit ARM (A32, GNU as syntax).
 *
 * Loads two 64-bit numbers stored as low/high 32-bit words, multiplies them
 * with mult64 and prints both operands and the product through printf.
 *
 * Build: as -g -o mult64.o mult64.s
 *        gcc -o mult64 mult64.o
 */
        .data

        .align 4
message:
        .asciz "Multiplication of %lld by %lld is %lld\n"

        .align 8
number_a_low:   .word 3755744309
number_a_high:  .word 2

        .align 8
number_b_low:   .word 12345678
number_b_high:  .word 0

        .text

/*
 * mult64 -- low 64 bits of a 64-bit by 64-bit product.
 *
 * Not an efficient way to multiply; it exists to demonstrate umull.
 *
 * In:   {r0,r1} = x   (r0 = low word x0, r1 = high word x1)
 *       {r2,r3} = y   (r2 = low word y0, r3 = high word y1)
 * Out:  {r0,r1} = x * y modulo 2^64
 * Saves and restores r4-r8 and lr on the stack.
 *
 * z = 2^64*x1*y1 + 2^32*(x0*y1 + x1*y0) + x0*y0; the x1*y1 term lies
 * entirely above bit 63, so it is never computed.
 */
mult64:
        push    {r4, r5, r6, r7, r8, lr}

        mov     r4, r0                  /* r4 = x0 */
        mov     r5, r1                  /* r5 = x1 */

        umull   r0, r6, r2, r4          /* {r0,r6} = y0 * x0 (r0 low, r6 high) */
        umull   r7, r8, r3, r4          /* r7 = low32(y1 * x0); r8 is above bit 63, unused */
        umull   r4, r5, r2, r5          /* r4 = low32(y0 * x1); r5 is above bit 63, unused */

        /* The cross terms only contribute to the high result word. A carry
           out of these additions would be bit 64 of the product, so plain
           add is used and the carry is dropped. */
        add     r2, r7, r4              /* r2 = low32(x0*y1) + low32(x1*y0) */
        add     r1, r2, r6              /* r1 = cross terms + high32(x0*y0) */

        pop     {r4, r5, r6, r7, r8, lr}
        bx      lr

/*
 * main -- multiply number_a by number_b and print the result.
 * Returns 0 in r0.
 */
        .global main
main:
        push    {r4, r5, r6, r7, r8, lr}

        /* {r4,r5} = a */
        ldr     r4, addr_number_a_low   /* r4 = &number_a_low */
        ldr     r4, [r4]                /* r4 = a low word */
        ldr     r5, addr_number_a_high  /* r5 = &number_a_high */
        ldr     r5, [r5]                /* r5 = a high word */

        /* {r6,r7} = b */
        ldr     r6, addr_number_b_low   /* r6 = &number_b_low */
        ldr     r6, [r6]                /* r6 = b low word */
        ldr     r7, addr_number_b_high  /* r7 = &number_b_high */
        ldr     r7, [r7]                /* r7 = b high word */

        /* First argument in {r0,r1}, second in {r2,r3}. */
        mov     r0, r4
        mov     r1, r5
        mov     r2, r6
        mov     r3, r7
        bl      mult64                  /* {r0,r1} = a * b */

        /* printf(&message, a, b, product): r0 = &message, a in {r2,r3}
           (r1 is skipped so the 64-bit value starts in an even register),
           b and the product go on the stack. */
        push    {r1}                    /* product, high word */
        push    {r0}                    /* product, low word */
        push    {r7}                    /* b, high word */
        push    {r6}                    /* b, low word */
        mov     r3, r5                  /* a, high word */
        mov     r2, r4                  /* a, low word */
        ldr     r0, addr_of_message     /* r0 = &message */
        bl      printf

        add     sp, sp, #16             /* drop the four words pushed for printf */

        mov     r0, #0                  /* return 0 */
        pop     {r4, r5, r6, r7, r8, lr}
        bx      lr

/* Addresses of the data-section objects, loaded PC-relative from .text. */
addr_of_message:        .word message
addr_number_a_low:      .word number_a_low
addr_number_a_high:     .word number_a_high
addr_number_b_low:      .word number_b_low
addr_number_b_high:     .word number_b_high
as -g -o mult64.o mult64.s
gcc -o mult64 mult64.o
调用函数mult64。
$./mult64
Multiplication of 12345678901 by 12345678 is 152415776403139878
其中打印结果有点复杂。64位必须作为成对的连续寄存器传递,其中下半部分位于偶数寄存器中。
同时函数的前4个参数通过r0,r1,r2,r3顺序传递。超过4个参数后就会使用堆栈。这个其实就是个约定。
64位乘法示例
上述的32位乘法改到64位机器上就非常简单了,因为寄存器变成64位了。
例子:$(2^{32} \times 2 + 3755744309) \times 12345678$
按照32位的逻辑如下:
/*
 * mult64.s -- 64-bit product built from 32-bit halves (AArch64, GNU as syntax).
 *
 * Mirrors the 32-bit ARM approach: each operand is stored as a low and a
 * high 32-bit word and the product is assembled from partial products.
 *
 * Build: as -g -o mult64.o mult64.s
 *        ld -o mult64 mult64.o -lc -I /lib64/ld-linux-aarch64.so.1
 */
        .data

        .balign 8
message:
        .asciz "Multiplication of %lld,%lld by %lld,%lld is %lld\n"

        /* Low word first, high word directly after it, 8-byte aligned:
           the pair can also be read as one little-endian 64-bit value. */
        .balign 8
number_a_low:   .word 3755744309
number_a_high:  .word 2

        .balign 8
number_b_low:   .word 12345678
number_b_high:  .word 0

        .text

/*
 * mult64 -- low 64 bits of a 64-bit by 64-bit product given as 32-bit halves.
 *
 * Not an efficient way to multiply; it exists to demonstrate umull.
 *
 * In:   w0 = a low word,  w1 = a high word
 *       w2 = b low word,  w3 = b high word
 * Out:  x0 = a * b modulo 2^64
 * Clobbers: x4, x6, x7
 *
 * z = 2^64*a1*b1 + 2^32*(a0*b1 + a1*b0) + a0*b0; the a1*b1 term lies
 * entirely above bit 63, so it is never computed.
 */
        .type   mult64, %function
mult64:
        umull   x6, w0, w2              // x6 = a0 * b0 (full 64-bit product)
        umull   x7, w0, w3              // x7 = a0 * b1
        umull   x4, w1, w2              // x4 = a1 * b0
        add     x7, x7, x4              // x7 = sum of the cross terms
        add     x0, x6, x7, lsl #32     // bits shifted past bit 63 are dropped
        ret
        .size   mult64, . - mult64

/*
 * _start -- multiply number_a by number_b, print operands and product, exit 0.
 */
        .global _start
_start:
        /* mult64 arguments: the four 32-bit halves. */
        ldr     x0, addr_number_a_low   // x0 = &number_a_low
        ldr     w0, [x0]                // w0 = a low word
        ldr     x1, addr_number_a_high  // x1 = &number_a_high
        ldr     w1, [x1]                // w1 = a high word
        ldr     x2, addr_number_b_low   // x2 = &number_b_low
        ldr     w2, [x2]                // w2 = b low word
        ldr     x3, addr_number_b_high  // x3 = &number_b_high
        ldr     w3, [x3]                // w3 = b high word
        bl      mult64                  // x0 = a * b

        /* printf(&message, a_high, a, b, b_high, product) */
        mov     x5, x0                  // product
        ldr     x1, addr_number_a_high
        ldr     w1, [x1]                // 32-bit load: only the high word, zero-extended
        ldr     x2, addr_number_a_low
        ldr     x2, [x2]                // 64-bit load across low+high words: whole a
        ldr     x3, addr_number_b_low
        ldr     x3, [x3]                // 64-bit load across low+high words: whole b
        ldr     x4, addr_number_b_high
        ldr     w4, [x4]                // 32-bit load: only the high word, zero-extended
        ldr     x0, addr_of_message     // x0 = &message
        bl      printf

        /* Leave through libc exit() so buffered stdout is flushed even when
           the output is not a terminal. exit does not return. */
        mov     w0, #0
        bl      exit

/* Addresses of the data-section objects, loaded PC-relative from .text. */
        .balign 8
addr_of_message:        .dword message
addr_number_a_low:      .dword number_a_low
addr_number_a_high:     .dword number_a_high
addr_number_b_low:      .dword number_b_low
addr_number_b_high:     .dword number_b_high
as -g -o mult64.o mult64.s
ld -o mult64 mult64.o -lc -I /lib64/ld-linux-aarch64.so.1
ARM64中参数1~参数8 分别保存到 X0~X7 寄存器中 ,剩下的参数从右往左依次入栈,被调用者实现栈平衡,返回值存放在 X0 中。
$./mult64
Multiplication of 2,12345678901 by 12345678,0 is 152415776403139878
64位简化逻辑
按照64位的简化逻辑如下:
/*
 * mult64s.s -- 64-bit multiplication with a single mul (AArch64, GNU as syntax).
 *
 * Build: as -g -o mult64s.o mult64s.s
 *        ld -o mult64s mult64s.o -lc -I /lib64/ld-linux-aarch64.so.1
 */
        .data

        .balign 8
message:
        .asciz "Multiplication of %lld by %lld is %lld\n"

        .balign 8
number_a:       .dword 12345678901

        .balign 8
number_b:       .dword 12345678

        .text

/*
 * mult64 -- 64-bit multiplication.
 * In:   x0, x1 = operands
 * Out:  x0 = low 64 bits of x0 * x1
 */
        .type   mult64, %function
mult64:
        mul     x0, x0, x1
        ret
        .size   mult64, . - mult64

/*
 * _start -- multiply number_a by number_b, print operands and product, exit 0.
 */
        .global _start
_start:
        /* a and b are needed again after the call, so they are kept in
           callee-saved registers (x19, x20). */
        ldr     x19, addr_number_a      // x19 = &number_a
        ldr     x19, [x19]              // x19 = a
        ldr     x20, addr_number_b      // x20 = &number_b
        ldr     x20, [x20]              // x20 = b

        mov     x0, x19                 // first argument: a
        mov     x1, x20                 // second argument: b
        bl      mult64                  // x0 = a * b

        /* printf(&message, a, b, product) */
        mov     x3, x0                  // product
        mov     x1, x19                 // a
        mov     x2, x20                 // b
        ldr     x0, addr_of_message     // x0 = &message
        bl      printf

        /* Leave through libc exit() so buffered stdout is flushed even when
           the output is not a terminal. exit does not return. */
        mov     w0, #0
        bl      exit

/* Addresses of the data-section objects, loaded PC-relative from .text. */
        .balign 8
addr_of_message:        .dword message
addr_number_a:          .dword number_a
addr_number_b:          .dword number_b
as -g -o mult64s.o mult64s.s
ld -o mult64s mult64s.o -lc -I /lib64/ld-linux-aarch64.so.1
结果如下:
$./mult64s
Multiplication of 12345678901 by 12345678 is 152415776403139878
除法
大多数计算机执行整数除法,其余数部分与分子的符号相同。
无符号整数除法是包含两个无符号整数N和D的整数除法。商Q和余数R始终为正。
ARMv6没有整数除法指令(但有浮点除法指令),而ARMv8提供了整数除法指令。
看下除法指令:
整数除法
SDIV Wd, Wn, Wm
Signed Divide: Wd = Wn ÷ Wm, treating source operands as signed.
SDIV Xd, Xn, Xm
Signed Divide (extended): Xd = Xn ÷ Xm, treating source operands as signed.
UDIV Wd, Wn, Wm
Unsigned Divide: Wd = Wn ÷ Wm, treating source operands as unsigned.
UDIV Xd, Xn, Xm
Unsigned Divide (extended): Xd = Xn ÷ Xm, treating source operands as unsigned.
64位整数除法示例
/*
 * div.s -- unsigned integer division with remainder (AArch64, GNU as syntax).
 *
 * Reads two integers N and D with scanf, echoes them, then prints the
 * quotient and remainder of N / D.
 *
 * Build: as -g -o div.o div.s
 *        ld -o div div.o -lc -I /lib64/ld-linux-aarch64.so.1
 */
        .data

        .balign 4
message1:
        .asciz "Hey, type a number: "

        .balign 4
message2:
        .asciz "I read the number %d/%d\n"

        .balign 4
message3:
        .asciz "Result, Q: %d,R: %d\n"

        /* Format pattern for scanf */
        .balign 4
scan_pattern:
        .asciz "%d %d"

        /* Where scanf stores the numbers read. "%d" writes only the low
           32 bits of each slot. */
        .balign 8
number_N:       .dword 0
number_D:       .dword 0

        .balign 4
return:         .word 0

        .arch armv8-a
        .global _start

        .text

/*
 * _start -- prompt, read N and D, print them, print N / D and N % D, exit 0.
 */
_start:
        ldr     x0, address_of_message1         // x0 = &message1
        bl      printf

        ldr     x0, address_of_scan_pattern     // x0 = &scan_pattern
        ldr     x1, address_of_number_N         // x1 = &number_N
        ldr     x2, address_of_number_D         // x2 = &number_D
        bl      scanf

        /* printf(&message2, N, D) */
        ldr     x0, address_of_message2         // x0 = &message2
        ldr     x1, address_of_number_N
        ldr     w1, [x1]                        // w1 = N (32 bits, as written by "%d")
        ldr     x2, address_of_number_D
        ldr     w2, [x2]                        // w2 = D
        bl      printf

        /* printf(&message3, Q, R) */
        ldr     x0, address_of_message3         // x0 = &message3
        ldr     x1, address_of_number_N
        ldr     w1, [x1]                        // x1 = N, zero-extended to 64 bits
        ldr     x2, address_of_number_D
        ldr     w2, [x2]                        // x2 = D, zero-extended to 64 bits
        udiv    x3, x1, x2                      // x3 = Q = N / D (unsigned; D == 0 gives 0, no trap)
        msub    x2, x3, x2, x1                  // x2 = R = N - Q * D (udiv leaves no remainder)
        mov     x1, x3                          // x1 = Q
        bl      printf

        /* Leave through libc exit() so buffered stdout is flushed even when
           the output is not a terminal. exit does not return. */
        mov     w0, #0
        bl      exit

/* Addresses of the data-section objects, loaded PC-relative from .text. */
        .balign 8
address_of_message1:            .dword message1
address_of_message2:            .dword message2
address_of_message3:            .dword message3
address_of_scan_pattern:        .dword scan_pattern
address_of_number_N:            .dword number_N
address_of_number_D:            .dword number_D
address_of_return:              .dword return

/* External */
        .global printf
        .global scanf
        .global exit
as -g -o div.o div.s
ld -o div div.o -lc -I /lib64/ld-linux-aarch64.so.1
除法指令并不会把余数直接保存到某个寄存器中,所以余数需要通过msub指令计算得到:
MSUB Wd, Wn, Wm, Wa
Multiply-Subtract: Wd = Wa – (Wn × Wm).
测试运行如下:
$./div
Hey, type a number: 242 3
I read the number 242/3
Result, Q: 80,R: 2
浮点除法
FDIV Sd, Sn, Sm
Single-precision floating-point scalar division: Sd = Sn / Sm.
FDIV Dd, Dn, Dm
Double-precision floating-point scalar division: Dd = Dn / Dm.
FDIV Vd.<T>, Vn.<T>, Vm.<T>
Floating-point divide (vector). Where <T> is 2S, 4S or 2D.
64位浮点除法示例
/*
 * fdiv.s -- floating-point division (AArch64, GNU as syntax).
 *
 * Reads two single-precision numbers N and D with scanf, echoes them, then
 * prints the quotient N / D.
 *
 * Build: as -g -o fdiv.o fdiv.s
 *        ld -o fdiv fdiv.o -lc -I /lib64/ld-linux-aarch64.so.1
 */
        .data

        .balign 4
message1:
        .asciz "Hey, type a number: "

        .balign 4
message2:
        .asciz "I read the number %5.2f/%5.2f\n"

        .balign 4
message3:
        .asciz "Result, Q: %5.2f\n"

        /* Format pattern for scanf */
        .balign 4
scan_pattern:
        .asciz "%f %f"

        /* Where scanf stores the numbers read. "%f" writes a 32-bit float
           into the low 4 bytes of each slot. */
        .balign 8
number_N:       .dword 0
number_D:       .dword 0

        .balign 4
return:         .word 0

        .arch armv8-a
        .global _start

        .text

/*
 * _start -- prompt, read N and D, print them, print N / D, exit 0.
 */
_start:
        ldr     x0, address_of_message1         // x0 = &message1
        bl      printf

        ldr     x0, address_of_scan_pattern     // x0 = &scan_pattern
        ldr     x1, address_of_number_N         // x1 = &number_N
        ldr     x2, address_of_number_D         // x2 = &number_D
        bl      scanf

        /* printf(&message2, (double)N, (double)D): "%f" in printf expects a
           double, so each float is widened with fcvt. */
        ldr     x0, address_of_message2         // x0 = &message2
        ldr     x1, address_of_number_N
        ldr     s0, [x1]                        // s0 = N
        fcvt    d0, s0                          // d0 = (double)N
        ldr     x2, address_of_number_D
        ldr     s1, [x2]                        // s1 = D
        fcvt    d1, s1                          // d1 = (double)D
        bl      printf

        /* printf(&message3, (double)N / (double)D). The values are reloaded
           because d0/d1 are not preserved across the printf call. */
        ldr     x0, address_of_message3         // x0 = &message3
        ldr     x1, address_of_number_N
        ldr     s0, [x1]
        fcvt    d0, s0                          // d0 = (double)N
        ldr     x2, address_of_number_D
        ldr     s1, [x2]
        fcvt    d1, s1                          // d1 = (double)D
        fdiv    d0, d0, d1                      // d0 = quotient, first FP argument
        bl      printf

        /* Leave through libc exit() so buffered stdout is flushed even when
           the output is not a terminal. exit does not return. */
        mov     w0, #0
        bl      exit

/* Addresses of the data-section objects, loaded PC-relative from .text. */
        .balign 8
address_of_message1:            .dword message1
address_of_message2:            .dword message2
address_of_message3:            .dword message3
address_of_scan_pattern:        .dword scan_pattern
address_of_number_N:            .dword number_N
address_of_number_D:            .dword number_D
address_of_return:              .dword return

/* External */
        .global printf
        .global scanf
        .global exit
as -g -o fdiv.o fdiv.s
ld -o fdiv fdiv.o -lc -I /lib64/ld-linux-aarch64.so.1
执行如下:
$./fdiv
Hey, type a number: 3.5 1.2
I read the number 3.50/ 1.20
Result, Q: 2.92
扩展指令
格式是:kxtw,其中k是我们要扩展的整数类型,w是窄值的宽度。 对于前者,整数的类型可以是U(无符号)或S(有符号,即二进制补码)。 对于后者,宽度可以是B,H或W,分别表示字节(寄存器的最低8位),半字(寄存器的最低16位)或字(寄存器的最低32位)。
扩展指令uxtb, sxtb, uxth, sxth, uxtw, sxtw.
Add指令
add x0, x1, w2, sxtw // x0 ← x1 + ExtendSigned32To64 ( w2 )
使用这些扩展运算符时,必须考虑某种上下文。 例如,以下两条指令的含义略有不同:
add x0, x1, w2, sxtb // x0 ← x1 + ExtendSigned8To64 ( w2 )
add w0, w1, w2, sxtb // w0 ← w1 + ExtendSigned8To32 ( w2 )
// The results below assume x0 = 0 and w1 = 0x1234 (the inputs the listed results imply).
// sxtw sign-extends a 32-bit source, so the source register is written as w1;
// the optional shift (0-4) is applied after the extension.
add x2, x0, w1, sxtw #1 // x2 ← x0 + ( ExtendSigned32To64 ( w1 ) << 1 )
// this sets x2 to 0x2468
add x2, x0, w1, sxtw #2 // x2 ← x0 + ( ExtendSigned32To64 ( w1 ) << 2 )
// this sets x2 to 0x48d0
add x2, x0, w1, sxtw #3 // x2 ← x0 + ( ExtendSigned32To64 ( w1 ) << 3 )
// this sets x2 to 0x91a0
add x2, x0, w1, sxtw #4 // x2 ← x0 + ( ExtendSigned32To64 ( w1 ) << 4 )
// this sets x2 to 0x12340
系统调用
系统调用指令是SWI和SVC.
SWI和SVC是同一件事,只是名称的更改。 以前,SVC指令称为SWI,即软件中断。