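// ChaCha20 stream cipher for AArch64 (ArmAsm): scalar and NEON code paths.
// (File-viewer banner "1939 lines / 38 KiB" was not part of the source.)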
.text

.align	5
// ChaCha20 constant: the ASCII string "expand 32-byte k" stored as two
// 64-bit words ("expand 3", "2-byte k" when read as little-endian bytes).
// Each 64-bit word carries two 32-bit state words (low half first).
Lsigma:
.quad	0x3320646e61707865,0x6b20657479622d32		// endian-neutral
// Vector {1,0,0,0}: added to a counter row to advance lane 0 by one block.
// Must stay directly after Lsigma: the NEON code reaches it by
// post-incrementing the Lsigma pointer by 16 bytes.
Lone:
.long	1,0,0,0

// Exported entry points (leading underscore = Mach-O style C symbol names).
.globl	_chacha20_arm
.globl	_chacha20_neon
|
|
|
|
|
|
.align	5
//-----------------------------------------------------------------------
// _chacha20_arm -- ChaCha20 keystream XOR using general-purpose registers.
//
// In:    x0 = destination buffer
//        x1 = source buffer
//        x2 = length in bytes (0 => immediate return)
//        x3 = 32-byte key
//        x4 = 16-byte counter block (state words 12..15)
// Out:   nothing is returned in a register; x0/x1 are advanced past the data.
// Clobb: x0-x2, x4-x17, NZCV.  x19-x28, x29 and x30 are saved and restored.
//        x18 is never touched.
// Stack: 96-byte register save frame (addressed through x29) plus 64 bytes
//        of scratch at sp that holds one keystream block for a partial tail.
//
// Register roles inside the loops:
//   x22,x23        sigma constant (two 32-bit words per register)
//   x24-x27        key
//   x28,x30        counter block; x28 is advanced by one per 64-byte block
//   w5-w8          working state row a
//   w9-w12         working state row b
//   w13-w16        working state row c
//   w17,w19-w21    working state row d
//   x4             double-round counter (the counter pointer is dead by then)
//
// Lshort is also entered from _chacha20_neon for short inputs, and
// Less_than_64 is entered from the NEON tail code with the keystream block
// still unpacked in w5-w21 and x2 = remaining byte count.
//-----------------------------------------------------------------------
_chacha20_arm:
	cbz	x2,Labort			// nothing to do for len == 0
Lshort:
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adr	x5,Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#64			// scratch for a partial-block keystream

	ldp	x22,x23,[x5]			// load sigma
	ldp	x24,x25,[x3]			// load key
	ldp	x26,x27,[x3,#16]
	ldp	x28,x30,[x4]			// load counter
#ifdef	__ARMEB__
	// Big-endian: swap the 32-bit halves so that the low half of each
	// register is again the first word in memory order.
	ror	x24,x24,#32
	ror	x25,x25,#32
	ror	x26,x26,#32
	ror	x27,x27,#32
	ror	x28,x28,#32
	ror	x30,x30,#32
#endif

	// One iteration per 64-byte block.
Loop_outer:
	mov	w5,w22				// unpack key block
	lsr	x6,x22,#32
	mov	w7,w23
	lsr	x8,x23,#32
	mov	w9,w24
	lsr	x10,x24,#32
	mov	w11,w25
	lsr	x12,x25,#32
	mov	w13,w26
	lsr	x14,x26,#32
	mov	w15,w27
	lsr	x16,x27,#32
	mov	w17,w28
	lsr	x19,x28,#32
	mov	w20,w30
	lsr	x21,x30,#32

	mov	x4,#10				// 10 double rounds
	// The flags set here are consumed by "b.lo Ltail" and "b.hi Loop_outer"
	// below; no instruction in between writes NZCV.
	subs	x2,x2,#64
Loop:
	sub	x4,x4,#1
	// Column round.  "ror #16/#20/#24/#25" on 32-bit registers are the
	// left rotations by 16/12/8/7.
	add	w5,w5,w9
	add	w6,w6,w10
	add	w7,w7,w11
	add	w8,w8,w12
	eor	w17,w17,w5
	eor	w19,w19,w6
	eor	w20,w20,w7
	eor	w21,w21,w8
	ror	w17,w17,#16
	ror	w19,w19,#16
	ror	w20,w20,#16
	ror	w21,w21,#16
	add	w13,w13,w17
	add	w14,w14,w19
	add	w15,w15,w20
	add	w16,w16,w21
	eor	w9,w9,w13
	eor	w10,w10,w14
	eor	w11,w11,w15
	eor	w12,w12,w16
	ror	w9,w9,#20
	ror	w10,w10,#20
	ror	w11,w11,#20
	ror	w12,w12,#20
	add	w5,w5,w9
	add	w6,w6,w10
	add	w7,w7,w11
	add	w8,w8,w12
	eor	w17,w17,w5
	eor	w19,w19,w6
	eor	w20,w20,w7
	eor	w21,w21,w8
	ror	w17,w17,#24
	ror	w19,w19,#24
	ror	w20,w20,#24
	ror	w21,w21,#24
	add	w13,w13,w17
	add	w14,w14,w19
	add	w15,w15,w20
	add	w16,w16,w21
	eor	w9,w9,w13
	eor	w10,w10,w14
	eor	w11,w11,w15
	eor	w12,w12,w16
	ror	w9,w9,#25
	ror	w10,w10,#25
	ror	w11,w11,#25
	ror	w12,w12,#25
	// Diagonal round: same operations with rows b, c, d taken one, two
	// and three positions further along.
	add	w5,w5,w10
	add	w6,w6,w11
	add	w7,w7,w12
	add	w8,w8,w9
	eor	w21,w21,w5
	eor	w17,w17,w6
	eor	w19,w19,w7
	eor	w20,w20,w8
	ror	w21,w21,#16
	ror	w17,w17,#16
	ror	w19,w19,#16
	ror	w20,w20,#16
	add	w15,w15,w21
	add	w16,w16,w17
	add	w13,w13,w19
	add	w14,w14,w20
	eor	w10,w10,w15
	eor	w11,w11,w16
	eor	w12,w12,w13
	eor	w9,w9,w14
	ror	w10,w10,#20
	ror	w11,w11,#20
	ror	w12,w12,#20
	ror	w9,w9,#20
	add	w5,w5,w10
	add	w6,w6,w11
	add	w7,w7,w12
	add	w8,w8,w9
	eor	w21,w21,w5
	eor	w17,w17,w6
	eor	w19,w19,w7
	eor	w20,w20,w8
	ror	w21,w21,#24
	ror	w17,w17,#24
	ror	w19,w19,#24
	ror	w20,w20,#24
	add	w15,w15,w21
	add	w16,w16,w17
	add	w13,w13,w19
	add	w14,w14,w20
	eor	w10,w10,w15
	eor	w11,w11,w16
	eor	w12,w12,w13
	eor	w9,w9,w14
	ror	w10,w10,#25
	ror	w11,w11,#25
	ror	w12,w12,#25
	ror	w9,w9,#25
	cbnz	x4,Loop

	// Add the input state back in.  The odd words use 64-bit adds; any
	// carry above bit 31 is shifted out by the "lsl #32" in the pack step.
	add	w5,w5,w22			// accumulate key block
	add	x6,x6,x22,lsr#32
	add	w7,w7,w23
	add	x8,x8,x23,lsr#32
	add	w9,w9,w24
	add	x10,x10,x24,lsr#32
	add	w11,w11,w25
	add	x12,x12,x25,lsr#32
	add	w13,w13,w26
	add	x14,x14,x26,lsr#32
	add	w15,w15,w27
	add	x16,x16,x27,lsr#32
	add	w17,w17,w28
	add	x19,x19,x28,lsr#32
	add	w20,w20,w30
	add	x21,x21,x30,lsr#32

	b.lo	Ltail				// fewer than 64 bytes were left

	// Full block: pack word pairs into 64-bit registers while loading
	// the 64 input bytes into the registers that have just been freed.
	add	x5,x5,x6,lsl#32			// pack
	add	x7,x7,x8,lsl#32
	ldp	x6,x8,[x1,#0]			// load input
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	ldp	x10,x12,[x1,#16]
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	ldp	x14,x16,[x1,#32]
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
	ldp	x19,x21,[x1,#48]
	add	x1,x1,#64
#ifdef	__ARMEB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	eor	x5,x5,x6
	eor	x7,x7,x8
	eor	x9,x9,x10
	eor	x11,x11,x12
	eor	x13,x13,x14
	eor	x15,x15,x16
	eor	x17,x17,x19
	eor	x20,x20,x21

	stp	x5,x7,[x0,#0]			// store output
	add	x28,x28,#1			// increment counter
	stp	x9,x11,[x0,#16]
	stp	x13,x15,[x0,#32]
	stp	x17,x20,[x0,#48]
	add	x0,x0,#64

	b.hi	Loop_outer			// more than 64 bytes were left

	// Exactly consumed: restore callee-saved registers and return.
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
Labort:
	ret

.align	4
	// Partial final block.  x2 is (remaining - 64) here; undo the subs.
Ltail:
	add	x2,x2,#64
	// In: x2 = remaining bytes (< 64), keystream unpacked in w5-w21.
	// Bias the pointers so that a negative index counting up to zero
	// walks the tail: x1 = in+len, x4 = sp+len, x0 = out+len-1 (the store
	// below indexes with x2 after it has been incremented).
Less_than_64:
	sub	x0,x0,#1
	add	x1,x1,x2
	add	x0,x0,x2
	add	x4,sp,x2
	neg	x2,x2

	add	x5,x5,x6,lsl#32			// pack
	add	x7,x7,x8,lsl#32
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
#ifdef	__ARMEB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	// Spill the keystream block so it can be read back byte by byte.
	stp	x5,x7,[sp,#0]
	stp	x9,x11,[sp,#16]
	stp	x13,x15,[sp,#32]
	stp	x17,x20,[sp,#48]

Loop_tail:
	ldrb	w10,[x1,x2]			// input byte
	ldrb	w11,[x4,x2]			// keystream byte
	add	x2,x2,#1
	eor	w10,w10,w11
	strb	w10,[x0,x2]
	cbnz	x2,Loop_tail

	// Wipe the keystream copy from the stack before releasing it.
	stp	xzr,xzr,[sp,#0]
	stp	xzr,xzr,[sp,#16]
	stp	xzr,xzr,[sp,#32]
	stp	xzr,xzr,[sp,#48]

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	ret
|
|
|
|
|
|
|
|
.align	5
//-----------------------------------------------------------------------
// _chacha20_neon -- ChaCha20 keystream XOR, NEON and scalar interleaved.
//
// In:    x0 = destination buffer
//        x1 = source buffer
//        x2 = length in bytes
//        x3 = 32-byte key
//        x4 = 16-byte counter block
// Dispatch on length:
//        len == 0    return immediately
//        len <  192  scalar code at Lshort
//        len >= 512  L512_or_more_neon
//        otherwise   the loop below, 256 bytes per outer iteration:
//                    one block in general registers (written first),
//                    then three blocks in v0-v3, v4-v7 and v16-v19.
// Clobb (this path): x0-x2, x4-x17, v0-v7, v16-v29, v31, NZCV.
//        x19-x28, x29, x30 are saved and restored; v8-v15 are not used here.
// Stack: 96-byte register save frame plus 64 bytes of scratch at sp.
//
// Vector register roles:
//   v24            sigma row          v25,v26   key rows
//   v27,v28,v29    counter rows of the three vector blocks (counter+1,+2,+3)
//   v31            {1,0,0,0}, later shifted to {4,0,0,0} as the per-iteration
//                  counter step
//   v20-v22        rotation temporaries inside the round loop
//   v20-v23        input staging while XOR-ing
// General registers have the same roles as in the scalar routine
// (x22-x28,x30 = input state, w5-w17,w19-w21 = working state, x4 = rounds).
//-----------------------------------------------------------------------
_chacha20_neon:
	cbz	x2,Labort_neon
	cmp	x2,#192
	b.lo	Lshort				// too short to benefit from NEON

	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adr	x5,Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	cmp	x2,#512
	b.hs	L512_or_more_neon

	sub	sp,sp,#64			// scratch for a partial-block keystream

	// Every input row is loaded twice: once into general registers for
	// the scalar block and once into a vector register.
	ldp	x22,x23,[x5]			// load sigma
	ld1	{v24.4s},[x5],#16		// x5 now points at Lone
	ldp	x24,x25,[x3]			// load key
	ldp	x26,x27,[x3,#16]
	ld1	{v25.4s,v26.4s},[x3]
	ldp	x28,x30,[x4]			// load counter
	ld1	{v27.4s},[x4]
	ld1	{v31.4s},[x5]			// {1,0,0,0}
#ifdef	__ARMEB__
	rev64	v24.4s,v24.4s
	ror	x24,x24,#32
	ror	x25,x25,#32
	ror	x26,x26,#32
	ror	x27,x27,#32
	ror	x28,x28,#32
	ror	x30,x30,#32
#endif
	// The scalar block uses the caller's counter; the vector blocks use
	// the next three values.
	add	v27.4s,v27.4s,v31.4s		// += 1
	add	v28.4s,v27.4s,v31.4s
	add	v29.4s,v28.4s,v31.4s
	shl	v31.4s,v31.4s,#2		// 1 -> 4

Loop_outer_neon:
	mov	w5,w22				// unpack key block
	lsr	x6,x22,#32
	mov	v0.16b,v24.16b
	mov	w7,w23
	lsr	x8,x23,#32
	mov	v4.16b,v24.16b
	mov	w9,w24
	lsr	x10,x24,#32
	mov	v16.16b,v24.16b
	mov	w11,w25
	mov	v1.16b,v25.16b
	lsr	x12,x25,#32
	mov	v5.16b,v25.16b
	mov	w13,w26
	mov	v17.16b,v25.16b
	lsr	x14,x26,#32
	mov	v3.16b,v27.16b
	mov	w15,w27
	mov	v7.16b,v28.16b
	lsr	x16,x27,#32
	mov	v19.16b,v29.16b
	mov	w17,w28
	mov	v2.16b,v26.16b
	lsr	x19,x28,#32
	mov	v6.16b,v26.16b
	mov	w20,w30
	mov	v18.16b,v26.16b
	lsr	x21,x30,#32

	mov	x4,#10				// 10 double rounds
	// Flags from this subs are consumed by "b.lo Ltail_neon" and
	// "b.hi Loop_outer_neon"; nothing in between writes NZCV.
	subs	x2,x2,#256
Loop_neon:
	sub	x4,x4,#1
	// Column round, scalar and vector instructions interleaved.
	// Vector rotations: rev32 on .8h lanes = rotate by 16;
	// ushr #(32-n) followed by sli #n = rotate left by n (12, 8, 7).
	add	v0.4s,v0.4s,v1.4s
	add	w5,w5,w9
	add	v4.4s,v4.4s,v5.4s
	add	w6,w6,w10
	add	v16.4s,v16.4s,v17.4s
	add	w7,w7,w11
	eor	v3.16b,v3.16b,v0.16b
	add	w8,w8,w12
	eor	v7.16b,v7.16b,v4.16b
	eor	w17,w17,w5
	eor	v19.16b,v19.16b,v16.16b
	eor	w19,w19,w6
	rev32	v3.8h,v3.8h
	eor	w20,w20,w7
	rev32	v7.8h,v7.8h
	eor	w21,w21,w8
	rev32	v19.8h,v19.8h
	ror	w17,w17,#16
	add	v2.4s,v2.4s,v3.4s
	ror	w19,w19,#16
	add	v6.4s,v6.4s,v7.4s
	ror	w20,w20,#16
	add	v18.4s,v18.4s,v19.4s
	ror	w21,w21,#16
	eor	v20.16b,v1.16b,v2.16b
	add	w13,w13,w17
	eor	v21.16b,v5.16b,v6.16b
	add	w14,w14,w19
	eor	v22.16b,v17.16b,v18.16b
	add	w15,w15,w20
	ushr	v1.4s,v20.4s,#20
	add	w16,w16,w21
	ushr	v5.4s,v21.4s,#20
	eor	w9,w9,w13
	ushr	v17.4s,v22.4s,#20
	eor	w10,w10,w14
	sli	v1.4s,v20.4s,#12
	eor	w11,w11,w15
	sli	v5.4s,v21.4s,#12
	eor	w12,w12,w16
	sli	v17.4s,v22.4s,#12
	ror	w9,w9,#20
	add	v0.4s,v0.4s,v1.4s
	ror	w10,w10,#20
	add	v4.4s,v4.4s,v5.4s
	ror	w11,w11,#20
	add	v16.4s,v16.4s,v17.4s
	ror	w12,w12,#20
	eor	v20.16b,v3.16b,v0.16b
	add	w5,w5,w9
	eor	v21.16b,v7.16b,v4.16b
	add	w6,w6,w10
	eor	v22.16b,v19.16b,v16.16b
	add	w7,w7,w11
	ushr	v3.4s,v20.4s,#24
	add	w8,w8,w12
	ushr	v7.4s,v21.4s,#24
	eor	w17,w17,w5
	ushr	v19.4s,v22.4s,#24
	eor	w19,w19,w6
	sli	v3.4s,v20.4s,#8
	eor	w20,w20,w7
	sli	v7.4s,v21.4s,#8
	eor	w21,w21,w8
	sli	v19.4s,v22.4s,#8
	ror	w17,w17,#24
	add	v2.4s,v2.4s,v3.4s
	ror	w19,w19,#24
	add	v6.4s,v6.4s,v7.4s
	ror	w20,w20,#24
	add	v18.4s,v18.4s,v19.4s
	ror	w21,w21,#24
	eor	v20.16b,v1.16b,v2.16b
	add	w13,w13,w17
	eor	v21.16b,v5.16b,v6.16b
	add	w14,w14,w19
	eor	v22.16b,v17.16b,v18.16b
	add	w15,w15,w20
	ushr	v1.4s,v20.4s,#25
	add	w16,w16,w21
	ushr	v5.4s,v21.4s,#25
	eor	w9,w9,w13
	ushr	v17.4s,v22.4s,#25
	eor	w10,w10,w14
	sli	v1.4s,v20.4s,#7
	eor	w11,w11,w15
	sli	v5.4s,v21.4s,#7
	eor	w12,w12,w16
	sli	v17.4s,v22.4s,#7
	ror	w9,w9,#25
	// Rotate the lanes of rows c, d, b so the diagonals line up as
	// columns for the vector blocks.
	ext	v2.16b,v2.16b,v2.16b,#8
	ror	w10,w10,#25
	ext	v6.16b,v6.16b,v6.16b,#8
	ror	w11,w11,#25
	ext	v18.16b,v18.16b,v18.16b,#8
	ror	w12,w12,#25
	ext	v3.16b,v3.16b,v3.16b,#12
	ext	v7.16b,v7.16b,v7.16b,#12
	ext	v19.16b,v19.16b,v19.16b,#12
	ext	v1.16b,v1.16b,v1.16b,#4
	ext	v5.16b,v5.16b,v5.16b,#4
	ext	v17.16b,v17.16b,v17.16b,#4
	// Diagonal round.
	add	v0.4s,v0.4s,v1.4s
	add	w5,w5,w10
	add	v4.4s,v4.4s,v5.4s
	add	w6,w6,w11
	add	v16.4s,v16.4s,v17.4s
	add	w7,w7,w12
	eor	v3.16b,v3.16b,v0.16b
	add	w8,w8,w9
	eor	v7.16b,v7.16b,v4.16b
	eor	w21,w21,w5
	eor	v19.16b,v19.16b,v16.16b
	eor	w17,w17,w6
	rev32	v3.8h,v3.8h
	eor	w19,w19,w7
	rev32	v7.8h,v7.8h
	eor	w20,w20,w8
	rev32	v19.8h,v19.8h
	ror	w21,w21,#16
	add	v2.4s,v2.4s,v3.4s
	ror	w17,w17,#16
	add	v6.4s,v6.4s,v7.4s
	ror	w19,w19,#16
	add	v18.4s,v18.4s,v19.4s
	ror	w20,w20,#16
	eor	v20.16b,v1.16b,v2.16b
	add	w15,w15,w21
	eor	v21.16b,v5.16b,v6.16b
	add	w16,w16,w17
	eor	v22.16b,v17.16b,v18.16b
	add	w13,w13,w19
	ushr	v1.4s,v20.4s,#20
	add	w14,w14,w20
	ushr	v5.4s,v21.4s,#20
	eor	w10,w10,w15
	ushr	v17.4s,v22.4s,#20
	eor	w11,w11,w16
	sli	v1.4s,v20.4s,#12
	eor	w12,w12,w13
	sli	v5.4s,v21.4s,#12
	eor	w9,w9,w14
	sli	v17.4s,v22.4s,#12
	ror	w10,w10,#20
	add	v0.4s,v0.4s,v1.4s
	ror	w11,w11,#20
	add	v4.4s,v4.4s,v5.4s
	ror	w12,w12,#20
	add	v16.4s,v16.4s,v17.4s
	ror	w9,w9,#20
	eor	v20.16b,v3.16b,v0.16b
	add	w5,w5,w10
	eor	v21.16b,v7.16b,v4.16b
	add	w6,w6,w11
	eor	v22.16b,v19.16b,v16.16b
	add	w7,w7,w12
	ushr	v3.4s,v20.4s,#24
	add	w8,w8,w9
	ushr	v7.4s,v21.4s,#24
	eor	w21,w21,w5
	ushr	v19.4s,v22.4s,#24
	eor	w17,w17,w6
	sli	v3.4s,v20.4s,#8
	eor	w19,w19,w7
	sli	v7.4s,v21.4s,#8
	eor	w20,w20,w8
	sli	v19.4s,v22.4s,#8
	ror	w21,w21,#24
	add	v2.4s,v2.4s,v3.4s
	ror	w17,w17,#24
	add	v6.4s,v6.4s,v7.4s
	ror	w19,w19,#24
	add	v18.4s,v18.4s,v19.4s
	ror	w20,w20,#24
	eor	v20.16b,v1.16b,v2.16b
	add	w15,w15,w21
	eor	v21.16b,v5.16b,v6.16b
	add	w16,w16,w17
	eor	v22.16b,v17.16b,v18.16b
	add	w13,w13,w19
	ushr	v1.4s,v20.4s,#25
	add	w14,w14,w20
	ushr	v5.4s,v21.4s,#25
	eor	w10,w10,w15
	ushr	v17.4s,v22.4s,#25
	eor	w11,w11,w16
	sli	v1.4s,v20.4s,#7
	eor	w12,w12,w13
	sli	v5.4s,v21.4s,#7
	eor	w9,w9,w14
	sli	v17.4s,v22.4s,#7
	ror	w10,w10,#25
	// Rotate the lanes back to column order (inverse of the ext group
	// after the column round).
	ext	v2.16b,v2.16b,v2.16b,#8
	ror	w11,w11,#25
	ext	v6.16b,v6.16b,v6.16b,#8
	ror	w12,w12,#25
	ext	v18.16b,v18.16b,v18.16b,#8
	ror	w9,w9,#25
	ext	v3.16b,v3.16b,v3.16b,#4
	ext	v7.16b,v7.16b,v7.16b,#4
	ext	v19.16b,v19.16b,v19.16b,#4
	ext	v1.16b,v1.16b,v1.16b,#12
	ext	v5.16b,v5.16b,v5.16b,#12
	ext	v17.16b,v17.16b,v17.16b,#12
	cbnz	x4,Loop_neon

	// Add the input state back into all four blocks.
	add	w5,w5,w22			// accumulate key block
	add	v0.4s,v0.4s,v24.4s
	add	x6,x6,x22,lsr#32
	add	v4.4s,v4.4s,v24.4s
	add	w7,w7,w23
	add	v16.4s,v16.4s,v24.4s
	add	x8,x8,x23,lsr#32
	add	v2.4s,v2.4s,v26.4s
	add	w9,w9,w24
	add	v6.4s,v6.4s,v26.4s
	add	x10,x10,x24,lsr#32
	add	v18.4s,v18.4s,v26.4s
	add	w11,w11,w25
	add	v3.4s,v3.4s,v27.4s
	add	x12,x12,x25,lsr#32
	add	w13,w13,w26
	add	v7.4s,v7.4s,v28.4s
	add	x14,x14,x26,lsr#32
	add	w15,w15,w27
	add	v19.4s,v19.4s,v29.4s
	add	x16,x16,x27,lsr#32
	add	w17,w17,w28
	add	v1.4s,v1.4s,v25.4s
	add	x19,x19,x28,lsr#32
	add	w20,w20,w30
	add	v5.4s,v5.4s,v25.4s
	add	x21,x21,x30,lsr#32
	add	v17.4s,v17.4s,v25.4s

	b.lo	Ltail_neon			// fewer than 256 bytes were left

	// Full 256 bytes: scalar block first, then the three vector blocks.
	add	x5,x5,x6,lsl#32			// pack
	add	x7,x7,x8,lsl#32
	ldp	x6,x8,[x1,#0]			// load input
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	ldp	x10,x12,[x1,#16]
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	ldp	x14,x16,[x1,#32]
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
	ldp	x19,x21,[x1,#48]
	add	x1,x1,#64
#ifdef	__ARMEB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
	eor	x5,x5,x6
	eor	x7,x7,x8
	eor	x9,x9,x10
	eor	x11,x11,x12
	eor	x13,x13,x14
	eor	v0.16b,v0.16b,v20.16b
	eor	x15,x15,x16
	eor	v1.16b,v1.16b,v21.16b
	eor	x17,x17,x19
	eor	v2.16b,v2.16b,v22.16b
	eor	x20,x20,x21
	eor	v3.16b,v3.16b,v23.16b
	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64

	stp	x5,x7,[x0,#0]			// store output
	add	x28,x28,#4			// increment counter
	stp	x9,x11,[x0,#16]
	add	v27.4s,v27.4s,v31.4s		// += 4
	stp	x13,x15,[x0,#32]
	add	v28.4s,v28.4s,v31.4s
	stp	x17,x20,[x0,#48]
	add	v29.4s,v29.4s,v31.4s
	add	x0,x0,#64

	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64	// v0-v3 reused as input staging

	eor	v4.16b,v4.16b,v20.16b
	eor	v5.16b,v5.16b,v21.16b
	eor	v6.16b,v6.16b,v22.16b
	eor	v7.16b,v7.16b,v23.16b
	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64

	eor	v16.16b,v16.16b,v0.16b
	eor	v17.16b,v17.16b,v1.16b
	eor	v18.16b,v18.16b,v2.16b
	eor	v19.16b,v19.16b,v3.16b
	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64

	b.hi	Loop_outer_neon			// more than 256 bytes were left

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	ret

	// Fewer than 256 bytes remain.  Keystream for four blocks is ready:
	// scalar block in w5-w21, then v0-v3, v4-v7, v16-v19.  Consume whole
	// 64-byte blocks in that order and hand the first partial one to
	// Last_neon through the stack scratch area.
Ltail_neon:
	add	x2,x2,#256			// undo the subs: x2 = bytes remaining
	cmp	x2,#64
	b.lo	Less_than_64			// scalar tail code finishes and returns

	add	x5,x5,x6,lsl#32			// pack
	add	x7,x7,x8,lsl#32
	ldp	x6,x8,[x1,#0]			// load input
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	ldp	x10,x12,[x1,#16]
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	ldp	x14,x16,[x1,#32]
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
	ldp	x19,x21,[x1,#48]
	add	x1,x1,#64
#ifdef	__ARMEB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	eor	x5,x5,x6
	eor	x7,x7,x8
	eor	x9,x9,x10
	eor	x11,x11,x12
	eor	x13,x13,x14
	eor	x15,x15,x16
	eor	x17,x17,x19
	eor	x20,x20,x21

	stp	x5,x7,[x0,#0]			// store output
	add	x28,x28,#4			// increment counter
	stp	x9,x11,[x0,#16]
	stp	x13,x15,[x0,#32]
	stp	x17,x20,[x0,#48]
	add	x0,x0,#64
	b.eq	Ldone_neon			// flags from "cmp x2,#64": exactly 64 left
	sub	x2,x2,#64
	cmp	x2,#64
	b.lo	Less_than_128

	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
	eor	v0.16b,v0.16b,v20.16b
	eor	v1.16b,v1.16b,v21.16b
	eor	v2.16b,v2.16b,v22.16b
	eor	v3.16b,v3.16b,v23.16b
	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
	b.eq	Ldone_neon
	sub	x2,x2,#64
	cmp	x2,#64
	b.lo	Less_than_192

	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
	eor	v4.16b,v4.16b,v20.16b
	eor	v5.16b,v5.16b,v21.16b
	eor	v6.16b,v6.16b,v22.16b
	eor	v7.16b,v7.16b,v23.16b
	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
	b.eq	Ldone_neon
	sub	x2,x2,#64

	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[sp]	// partial block uses 4th keystream block
	b	Last_neon

Less_than_128:
	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[sp]	// partial block uses 2nd keystream block
	b	Last_neon
Less_than_192:
	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[sp]	// partial block uses 3rd keystream block
	b	Last_neon

.align	4
	// In: x2 = remaining bytes (< 64), keystream block at [sp].
	// Pointers are biased so a negative index counting up to zero walks
	// the tail; x0 is one short because the store indexes with x2 after
	// it has been incremented.
Last_neon:
	sub	x0,x0,#1
	add	x1,x1,x2
	add	x0,x0,x2
	add	x4,sp,x2
	neg	x2,x2

Loop_tail_neon:
	ldrb	w10,[x1,x2]			// input byte
	ldrb	w11,[x4,x2]			// keystream byte
	add	x2,x2,#1
	eor	w10,w10,w11
	strb	w10,[x0,x2]
	cbnz	x2,Loop_tail_neon

	// Wipe the keystream copy from the stack before releasing it.
	stp	xzr,xzr,[sp,#0]
	stp	xzr,xzr,[sp,#16]
	stp	xzr,xzr,[sp,#32]
	stp	xzr,xzr,[sp,#48]

Ldone_neon:
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
Labort_neon:
	ret
|
|
|
|
|
|
L512_or_more_neon:
|
|
sub sp,sp,#128+64
|
|
|
|
ldp x22,x23,[x5] // load sigma
|
|
ld1 {v24.4s},[x5],#16
|
|
ldp x24,x25,[x3] // load key
|
|
ldp x26,x27,[x3,#16]
|
|
ld1 {v25.4s,v26.4s},[x3]
|
|
ldp x28,x30,[x4] // load counter
|
|
ld1 {v27.4s},[x4]
|
|
ld1 {v31.4s},[x5]
|
|
#ifdef __ARMEB__
|
|
rev64 v24.4s,v24.4s
|
|
ror x24,x24,#32
|
|
ror x25,x25,#32
|
|
ror x26,x26,#32
|
|
ror x27,x27,#32
|
|
ror x28,x28,#32
|
|
ror x30,x30,#32
|
|
#endif
|
|
add v27.4s,v27.4s,v31.4s // += 1
|
|
stp q24,q25,[sp,#0] // off-load key block, invariant part
|
|
add v27.4s,v27.4s,v31.4s // not typo
|
|
str q26,[sp,#32]
|
|
add v28.4s,v27.4s,v31.4s
|
|
add v29.4s,v28.4s,v31.4s
|
|
add v30.4s,v29.4s,v31.4s
|
|
shl v31.4s,v31.4s,#2 // 1 -> 4
|
|
|
|
stp d8,d9,[sp,#128+0] // meet ABI requirements
|
|
stp d10,d11,[sp,#128+16]
|
|
stp d12,d13,[sp,#128+32]
|
|
stp d14,d15,[sp,#128+48]
|
|
|
|
sub x2,x2,#512 // not typo
|
|
|
|
Loop_outer_512_neon:
|
|
mov v0.16b,v24.16b
|
|
mov v4.16b,v24.16b
|
|
mov v8.16b,v24.16b
|
|
mov v12.16b,v24.16b
|
|
mov v16.16b,v24.16b
|
|
mov v20.16b,v24.16b
|
|
mov v1.16b,v25.16b
|
|
mov w5,w22 // unpack key block
|
|
mov v5.16b,v25.16b
|
|
lsr x6,x22,#32
|
|
mov v9.16b,v25.16b
|
|
mov w7,w23
|
|
mov v13.16b,v25.16b
|
|
lsr x8,x23,#32
|
|
mov v17.16b,v25.16b
|
|
mov w9,w24
|
|
mov v21.16b,v25.16b
|
|
lsr x10,x24,#32
|
|
mov v3.16b,v27.16b
|
|
mov w11,w25
|
|
mov v7.16b,v28.16b
|
|
lsr x12,x25,#32
|
|
mov v11.16b,v29.16b
|
|
mov w13,w26
|
|
mov v15.16b,v30.16b
|
|
lsr x14,x26,#32
|
|
mov v2.16b,v26.16b
|
|
mov w15,w27
|
|
mov v6.16b,v26.16b
|
|
lsr x16,x27,#32
|
|
add v19.4s,v3.4s,v31.4s // +4
|
|
mov w17,w28
|
|
add v23.4s,v7.4s,v31.4s // +4
|
|
lsr x19,x28,#32
|
|
mov v10.16b,v26.16b
|
|
mov w20,w30
|
|
mov v14.16b,v26.16b
|
|
lsr x21,x30,#32
|
|
mov v18.16b,v26.16b
|
|
stp q27,q28,[sp,#48] // off-load key block, variable part
|
|
mov v22.16b,v26.16b
|
|
str q29,[sp,#80]
|
|
|
|
mov x4,#5
|
|
subs x2,x2,#512
|
|
Loop_upper_neon:
|
|
sub x4,x4,#1
|
|
add v0.4s,v0.4s,v1.4s
|
|
add w5,w5,w9
|
|
add v4.4s,v4.4s,v5.4s
|
|
add w6,w6,w10
|
|
add v8.4s,v8.4s,v9.4s
|
|
add w7,w7,w11
|
|
add v12.4s,v12.4s,v13.4s
|
|
add w8,w8,w12
|
|
add v16.4s,v16.4s,v17.4s
|
|
eor w17,w17,w5
|
|
add v20.4s,v20.4s,v21.4s
|
|
eor w19,w19,w6
|
|
eor v3.16b,v3.16b,v0.16b
|
|
eor w20,w20,w7
|
|
eor v7.16b,v7.16b,v4.16b
|
|
eor w21,w21,w8
|
|
eor v11.16b,v11.16b,v8.16b
|
|
ror w17,w17,#16
|
|
eor v15.16b,v15.16b,v12.16b
|
|
ror w19,w19,#16
|
|
eor v19.16b,v19.16b,v16.16b
|
|
ror w20,w20,#16
|
|
eor v23.16b,v23.16b,v20.16b
|
|
ror w21,w21,#16
|
|
rev32 v3.8h,v3.8h
|
|
add w13,w13,w17
|
|
rev32 v7.8h,v7.8h
|
|
add w14,w14,w19
|
|
rev32 v11.8h,v11.8h
|
|
add w15,w15,w20
|
|
rev32 v15.8h,v15.8h
|
|
add w16,w16,w21
|
|
rev32 v19.8h,v19.8h
|
|
eor w9,w9,w13
|
|
rev32 v23.8h,v23.8h
|
|
eor w10,w10,w14
|
|
add v2.4s,v2.4s,v3.4s
|
|
eor w11,w11,w15
|
|
add v6.4s,v6.4s,v7.4s
|
|
eor w12,w12,w16
|
|
add v10.4s,v10.4s,v11.4s
|
|
ror w9,w9,#20
|
|
add v14.4s,v14.4s,v15.4s
|
|
ror w10,w10,#20
|
|
add v18.4s,v18.4s,v19.4s
|
|
ror w11,w11,#20
|
|
add v22.4s,v22.4s,v23.4s
|
|
ror w12,w12,#20
|
|
eor v24.16b,v1.16b,v2.16b
|
|
add w5,w5,w9
|
|
eor v25.16b,v5.16b,v6.16b
|
|
add w6,w6,w10
|
|
eor v26.16b,v9.16b,v10.16b
|
|
add w7,w7,w11
|
|
eor v27.16b,v13.16b,v14.16b
|
|
add w8,w8,w12
|
|
eor v28.16b,v17.16b,v18.16b
|
|
eor w17,w17,w5
|
|
eor v29.16b,v21.16b,v22.16b
|
|
eor w19,w19,w6
|
|
ushr v1.4s,v24.4s,#20
|
|
eor w20,w20,w7
|
|
ushr v5.4s,v25.4s,#20
|
|
eor w21,w21,w8
|
|
ushr v9.4s,v26.4s,#20
|
|
ror w17,w17,#24
|
|
ushr v13.4s,v27.4s,#20
|
|
ror w19,w19,#24
|
|
ushr v17.4s,v28.4s,#20
|
|
ror w20,w20,#24
|
|
ushr v21.4s,v29.4s,#20
|
|
ror w21,w21,#24
|
|
sli v1.4s,v24.4s,#12
|
|
add w13,w13,w17
|
|
sli v5.4s,v25.4s,#12
|
|
add w14,w14,w19
|
|
sli v9.4s,v26.4s,#12
|
|
add w15,w15,w20
|
|
sli v13.4s,v27.4s,#12
|
|
add w16,w16,w21
|
|
sli v17.4s,v28.4s,#12
|
|
eor w9,w9,w13
|
|
sli v21.4s,v29.4s,#12
|
|
eor w10,w10,w14
|
|
add v0.4s,v0.4s,v1.4s
|
|
eor w11,w11,w15
|
|
add v4.4s,v4.4s,v5.4s
|
|
eor w12,w12,w16
|
|
add v8.4s,v8.4s,v9.4s
|
|
ror w9,w9,#25
|
|
add v12.4s,v12.4s,v13.4s
|
|
ror w10,w10,#25
|
|
add v16.4s,v16.4s,v17.4s
|
|
ror w11,w11,#25
|
|
add v20.4s,v20.4s,v21.4s
|
|
ror w12,w12,#25
|
|
eor v24.16b,v3.16b,v0.16b
|
|
add w5,w5,w10
|
|
eor v25.16b,v7.16b,v4.16b
|
|
add w6,w6,w11
|
|
eor v26.16b,v11.16b,v8.16b
|
|
add w7,w7,w12
|
|
eor v27.16b,v15.16b,v12.16b
|
|
add w8,w8,w9
|
|
eor v28.16b,v19.16b,v16.16b
|
|
eor w21,w21,w5
|
|
eor v29.16b,v23.16b,v20.16b
|
|
eor w17,w17,w6
|
|
ushr v3.4s,v24.4s,#24
|
|
eor w19,w19,w7
|
|
ushr v7.4s,v25.4s,#24
|
|
eor w20,w20,w8
|
|
ushr v11.4s,v26.4s,#24
|
|
ror w21,w21,#16
|
|
ushr v15.4s,v27.4s,#24
|
|
ror w17,w17,#16
|
|
ushr v19.4s,v28.4s,#24
|
|
ror w19,w19,#16
|
|
ushr v23.4s,v29.4s,#24
|
|
ror w20,w20,#16
|
|
sli v3.4s,v24.4s,#8
|
|
add w15,w15,w21
|
|
sli v7.4s,v25.4s,#8
|
|
add w16,w16,w17
|
|
sli v11.4s,v26.4s,#8
|
|
add w13,w13,w19
|
|
sli v15.4s,v27.4s,#8
|
|
add w14,w14,w20
|
|
sli v19.4s,v28.4s,#8
|
|
eor w10,w10,w15
|
|
sli v23.4s,v29.4s,#8
|
|
eor w11,w11,w16
|
|
add v2.4s,v2.4s,v3.4s
|
|
eor w12,w12,w13
|
|
add v6.4s,v6.4s,v7.4s
|
|
eor w9,w9,w14
|
|
add v10.4s,v10.4s,v11.4s
|
|
ror w10,w10,#20
|
|
add v14.4s,v14.4s,v15.4s
|
|
ror w11,w11,#20
|
|
add v18.4s,v18.4s,v19.4s
|
|
ror w12,w12,#20
|
|
add v22.4s,v22.4s,v23.4s
|
|
ror w9,w9,#20
|
|
eor v24.16b,v1.16b,v2.16b
|
|
add w5,w5,w10
|
|
eor v25.16b,v5.16b,v6.16b
|
|
add w6,w6,w11
|
|
eor v26.16b,v9.16b,v10.16b
|
|
add w7,w7,w12
|
|
eor v27.16b,v13.16b,v14.16b
|
|
add w8,w8,w9
|
|
eor v28.16b,v17.16b,v18.16b
|
|
eor w21,w21,w5
|
|
eor v29.16b,v21.16b,v22.16b
|
|
eor w17,w17,w6
|
|
ushr v1.4s,v24.4s,#25
|
|
eor w19,w19,w7
|
|
ushr v5.4s,v25.4s,#25
|
|
eor w20,w20,w8
|
|
ushr v9.4s,v26.4s,#25
|
|
ror w21,w21,#24
|
|
ushr v13.4s,v27.4s,#25
|
|
ror w17,w17,#24
|
|
ushr v17.4s,v28.4s,#25
|
|
ror w19,w19,#24
|
|
ushr v21.4s,v29.4s,#25
|
|
ror w20,w20,#24
|
|
sli v1.4s,v24.4s,#7
|
|
add w15,w15,w21
|
|
sli v5.4s,v25.4s,#7
|
|
add w16,w16,w17
|
|
sli v9.4s,v26.4s,#7
|
|
add w13,w13,w19
|
|
sli v13.4s,v27.4s,#7
|
|
add w14,w14,w20
|
|
sli v17.4s,v28.4s,#7
|
|
eor w10,w10,w15
|
|
sli v21.4s,v29.4s,#7
|
|
eor w11,w11,w16
|
|
ext v2.16b,v2.16b,v2.16b,#8
|
|
eor w12,w12,w13
|
|
ext v6.16b,v6.16b,v6.16b,#8
|
|
eor w9,w9,w14
|
|
ext v10.16b,v10.16b,v10.16b,#8
|
|
ror w10,w10,#25
|
|
ext v14.16b,v14.16b,v14.16b,#8
|
|
ror w11,w11,#25
|
|
ext v18.16b,v18.16b,v18.16b,#8
|
|
ror w12,w12,#25
|
|
ext v22.16b,v22.16b,v22.16b,#8
|
|
ror w9,w9,#25
|
|
ext v3.16b,v3.16b,v3.16b,#12
|
|
ext v7.16b,v7.16b,v7.16b,#12
|
|
ext v11.16b,v11.16b,v11.16b,#12
|
|
ext v15.16b,v15.16b,v15.16b,#12
|
|
ext v19.16b,v19.16b,v19.16b,#12
|
|
ext v23.16b,v23.16b,v23.16b,#12
|
|
ext v1.16b,v1.16b,v1.16b,#4
|
|
ext v5.16b,v5.16b,v5.16b,#4
|
|
ext v9.16b,v9.16b,v9.16b,#4
|
|
ext v13.16b,v13.16b,v13.16b,#4
|
|
ext v17.16b,v17.16b,v17.16b,#4
|
|
ext v21.16b,v21.16b,v21.16b,#4
|
|
add v0.4s,v0.4s,v1.4s
|
|
add w5,w5,w9
|
|
add v4.4s,v4.4s,v5.4s
|
|
add w6,w6,w10
|
|
add v8.4s,v8.4s,v9.4s
|
|
add w7,w7,w11
|
|
add v12.4s,v12.4s,v13.4s
|
|
add w8,w8,w12
|
|
add v16.4s,v16.4s,v17.4s
|
|
eor w17,w17,w5
|
|
add v20.4s,v20.4s,v21.4s
|
|
eor w19,w19,w6
|
|
eor v3.16b,v3.16b,v0.16b
|
|
eor w20,w20,w7
|
|
eor v7.16b,v7.16b,v4.16b
|
|
eor w21,w21,w8
|
|
eor v11.16b,v11.16b,v8.16b
|
|
ror w17,w17,#16
|
|
eor v15.16b,v15.16b,v12.16b
|
|
ror w19,w19,#16
|
|
eor v19.16b,v19.16b,v16.16b
|
|
ror w20,w20,#16
|
|
eor v23.16b,v23.16b,v20.16b
|
|
ror w21,w21,#16
|
|
rev32 v3.8h,v3.8h
|
|
add w13,w13,w17
|
|
rev32 v7.8h,v7.8h
|
|
add w14,w14,w19
|
|
rev32 v11.8h,v11.8h
|
|
add w15,w15,w20
|
|
rev32 v15.8h,v15.8h
|
|
add w16,w16,w21
|
|
rev32 v19.8h,v19.8h
|
|
eor w9,w9,w13
|
|
rev32 v23.8h,v23.8h
|
|
eor w10,w10,w14
|
|
add v2.4s,v2.4s,v3.4s
|
|
eor w11,w11,w15
|
|
add v6.4s,v6.4s,v7.4s
|
|
eor w12,w12,w16
|
|
add v10.4s,v10.4s,v11.4s
|
|
ror w9,w9,#20
|
|
add v14.4s,v14.4s,v15.4s
|
|
ror w10,w10,#20
|
|
add v18.4s,v18.4s,v19.4s
|
|
ror w11,w11,#20
|
|
add v22.4s,v22.4s,v23.4s
|
|
ror w12,w12,#20
|
|
eor v24.16b,v1.16b,v2.16b
|
|
add w5,w5,w9
|
|
eor v25.16b,v5.16b,v6.16b
|
|
add w6,w6,w10
|
|
eor v26.16b,v9.16b,v10.16b
|
|
add w7,w7,w11
|
|
eor v27.16b,v13.16b,v14.16b
|
|
add w8,w8,w12
|
|
eor v28.16b,v17.16b,v18.16b
|
|
eor w17,w17,w5
|
|
eor v29.16b,v21.16b,v22.16b
|
|
eor w19,w19,w6
|
|
ushr v1.4s,v24.4s,#20
|
|
eor w20,w20,w7
|
|
ushr v5.4s,v25.4s,#20
|
|
eor w21,w21,w8
|
|
ushr v9.4s,v26.4s,#20
|
|
ror w17,w17,#24
|
|
ushr v13.4s,v27.4s,#20
|
|
ror w19,w19,#24
|
|
ushr v17.4s,v28.4s,#20
|
|
ror w20,w20,#24
|
|
ushr v21.4s,v29.4s,#20
|
|
ror w21,w21,#24
|
|
sli v1.4s,v24.4s,#12
|
|
add w13,w13,w17
|
|
sli v5.4s,v25.4s,#12
|
|
add w14,w14,w19
|
|
sli v9.4s,v26.4s,#12
|
|
add w15,w15,w20
|
|
sli v13.4s,v27.4s,#12
|
|
add w16,w16,w21
|
|
sli v17.4s,v28.4s,#12
|
|
eor w9,w9,w13
|
|
sli v21.4s,v29.4s,#12
|
|
eor w10,w10,w14
|
|
add v0.4s,v0.4s,v1.4s
|
|
eor w11,w11,w15
|
|
add v4.4s,v4.4s,v5.4s
|
|
eor w12,w12,w16
|
|
add v8.4s,v8.4s,v9.4s
|
|
ror w9,w9,#25
|
|
add v12.4s,v12.4s,v13.4s
|
|
ror w10,w10,#25
|
|
add v16.4s,v16.4s,v17.4s
|
|
ror w11,w11,#25
|
|
add v20.4s,v20.4s,v21.4s
|
|
ror w12,w12,#25
|
|
eor v24.16b,v3.16b,v0.16b
|
|
add w5,w5,w10
|
|
eor v25.16b,v7.16b,v4.16b
|
|
add w6,w6,w11
|
|
eor v26.16b,v11.16b,v8.16b
|
|
add w7,w7,w12
|
|
eor v27.16b,v15.16b,v12.16b
|
|
add w8,w8,w9
|
|
eor v28.16b,v19.16b,v16.16b
|
|
eor w21,w21,w5
|
|
eor v29.16b,v23.16b,v20.16b
|
|
eor w17,w17,w6
|
|
ushr v3.4s,v24.4s,#24
|
|
eor w19,w19,w7
|
|
ushr v7.4s,v25.4s,#24
|
|
eor w20,w20,w8
|
|
ushr v11.4s,v26.4s,#24
|
|
ror w21,w21,#16
|
|
ushr v15.4s,v27.4s,#24
|
|
ror w17,w17,#16
|
|
ushr v19.4s,v28.4s,#24
|
|
ror w19,w19,#16
|
|
ushr v23.4s,v29.4s,#24
|
|
ror w20,w20,#16
|
|
sli v3.4s,v24.4s,#8
|
|
add w15,w15,w21
|
|
sli v7.4s,v25.4s,#8
|
|
add w16,w16,w17
|
|
sli v11.4s,v26.4s,#8
|
|
add w13,w13,w19
|
|
sli v15.4s,v27.4s,#8
|
|
add w14,w14,w20
|
|
sli v19.4s,v28.4s,#8
|
|
eor w10,w10,w15
|
|
sli v23.4s,v29.4s,#8
|
|
eor w11,w11,w16
|
|
add v2.4s,v2.4s,v3.4s
|
|
eor w12,w12,w13
|
|
add v6.4s,v6.4s,v7.4s
|
|
eor w9,w9,w14
|
|
add v10.4s,v10.4s,v11.4s
|
|
ror w10,w10,#20
|
|
add v14.4s,v14.4s,v15.4s
|
|
ror w11,w11,#20
|
|
add v18.4s,v18.4s,v19.4s
|
|
ror w12,w12,#20
|
|
add v22.4s,v22.4s,v23.4s
|
|
ror w9,w9,#20
|
|
eor v24.16b,v1.16b,v2.16b
|
|
add w5,w5,w10
|
|
eor v25.16b,v5.16b,v6.16b
|
|
add w6,w6,w11
|
|
eor v26.16b,v9.16b,v10.16b
|
|
add w7,w7,w12
|
|
eor v27.16b,v13.16b,v14.16b
|
|
add w8,w8,w9
|
|
eor v28.16b,v17.16b,v18.16b
|
|
eor w21,w21,w5
|
|
eor v29.16b,v21.16b,v22.16b
|
|
eor w17,w17,w6
|
|
ushr v1.4s,v24.4s,#25
|
|
eor w19,w19,w7
|
|
ushr v5.4s,v25.4s,#25
|
|
eor w20,w20,w8
|
|
ushr v9.4s,v26.4s,#25
|
|
ror w21,w21,#24
|
|
ushr v13.4s,v27.4s,#25
|
|
ror w17,w17,#24
|
|
ushr v17.4s,v28.4s,#25
|
|
ror w19,w19,#24
|
|
ushr v21.4s,v29.4s,#25
|
|
ror w20,w20,#24
|
|
sli v1.4s,v24.4s,#7
|
|
add w15,w15,w21
|
|
sli v5.4s,v25.4s,#7
|
|
add w16,w16,w17
|
|
sli v9.4s,v26.4s,#7
|
|
add w13,w13,w19
|
|
sli v13.4s,v27.4s,#7
|
|
add w14,w14,w20
|
|
sli v17.4s,v28.4s,#7
|
|
eor w10,w10,w15
|
|
sli v21.4s,v29.4s,#7
|
|
eor w11,w11,w16
|
|
ext v2.16b,v2.16b,v2.16b,#8
|
|
eor w12,w12,w13
|
|
ext v6.16b,v6.16b,v6.16b,#8
|
|
eor w9,w9,w14
|
|
ext v10.16b,v10.16b,v10.16b,#8
|
|
ror w10,w10,#25
|
|
ext v14.16b,v14.16b,v14.16b,#8
|
|
ror w11,w11,#25
|
|
ext v18.16b,v18.16b,v18.16b,#8
|
|
ror w12,w12,#25
|
|
ext v22.16b,v22.16b,v22.16b,#8
|
|
ror w9,w9,#25
|
|
ext v3.16b,v3.16b,v3.16b,#4
|
|
ext v7.16b,v7.16b,v7.16b,#4
|
|
ext v11.16b,v11.16b,v11.16b,#4
|
|
ext v15.16b,v15.16b,v15.16b,#4
|
|
ext v19.16b,v19.16b,v19.16b,#4
|
|
ext v23.16b,v23.16b,v23.16b,#4
|
|
ext v1.16b,v1.16b,v1.16b,#12
|
|
ext v5.16b,v5.16b,v5.16b,#12
|
|
ext v9.16b,v9.16b,v9.16b,#12
|
|
ext v13.16b,v13.16b,v13.16b,#12
|
|
ext v17.16b,v17.16b,v17.16b,#12
|
|
ext v21.16b,v21.16b,v21.16b,#12
|
|
cbnz x4,Loop_upper_neon
|
|
|
|
// Flush of the scalar-register block that was iterated inside Loop_upper_neon.
// The 16 working words live in w5-w17,w19-w21 (x18 is not used).  The input
// state is kept packed two-words-per-register in x22-x28,x30: the low half is
// added with a 32-bit add, the high half with a 64-bit add of (xN >> 32).
// A carry out of bit 31 in the odd-numbered registers is harmless, because
// the "pack" step below shifts those registers left by 32 and drops it.
add w5,w5,w22 // accumulate key block
|
|
add x6,x6,x22,lsr#32
|
|
add w7,w7,w23
|
|
add x8,x8,x23,lsr#32
|
|
add w9,w9,w24
|
|
add x10,x10,x24,lsr#32
|
|
add w11,w11,w25
|
|
add x12,x12,x25,lsr#32
|
|
add w13,w13,w26
|
|
add x14,x14,x26,lsr#32
|
|
add w15,w15,w27
|
|
add x16,x16,x27,lsr#32
|
|
add w17,w17,w28
|
|
add x19,x19,x28,lsr#32
|
|
add w20,w20,w30
|
|
add x21,x21,x30,lsr#32
|
|
|
|
// Merge each even/odd pair into one 64-bit register (even word in bits 0-31,
// odd word in bits 32-63).  The 32-bit adds above cleared the upper halves of
// the even registers, so the add acts as a concatenation.  As soon as an odd
// register has been merged it is reused to hold 16 bytes of input from [x1].
add x5,x5,x6,lsl#32 // pack
|
|
add x7,x7,x8,lsl#32
|
|
ldp x6,x8,[x1,#0] // load input
|
|
add x9,x9,x10,lsl#32
|
|
add x11,x11,x12,lsl#32
|
|
ldp x10,x12,[x1,#16]
|
|
add x13,x13,x14,lsl#32
|
|
add x15,x15,x16,lsl#32
|
|
ldp x14,x16,[x1,#32]
|
|
add x17,x17,x19,lsl#32
|
|
add x20,x20,x21,lsl#32
|
|
ldp x19,x21,[x1,#48]
|
|
add x1,x1,#64 // input pointer advances past the 64 bytes just loaded
|
|
// Big-endian builds only: byte-reverse every packed 64-bit word before it is
// XORed with the input words fetched by ldp.
#ifdef __ARMEB__
|
|
rev x5,x5
|
|
rev x7,x7
|
|
rev x9,x9
|
|
rev x11,x11
|
|
rev x13,x13
|
|
rev x15,x15
|
|
rev x17,x17
|
|
rev x20,x20
|
|
#endif
|
|
// output = packed block XOR input
eor x5,x5,x6
|
|
eor x7,x7,x8
|
|
eor x9,x9,x10
|
|
eor x11,x11,x12
|
|
eor x13,x13,x14
|
|
eor x15,x15,x16
|
|
eor x17,x17,x19
|
|
eor x20,x20,x21
|
|
|
|
// Store the 64 output bytes to [x0] while re-seeding the working registers
// from the packed state for the next scalar block.  x28 is incremented before
// "mov w17,w28" below, so the new block starts from the bumped counter.
stp x5,x7,[x0,#0] // store output
|
|
add x28,x28,#1 // increment counter
|
|
mov w5,w22 // unpack key block
|
|
lsr x6,x22,#32
|
|
stp x9,x11,[x0,#16]
|
|
mov w7,w23
|
|
lsr x8,x23,#32
|
|
stp x13,x15,[x0,#32]
|
|
mov w9,w24
|
|
lsr x10,x24,#32
|
|
stp x17,x20,[x0,#48]
|
|
add x0,x0,#64 // output pointer advances past the 64 bytes just stored
|
|
mov w11,w25
|
|
lsr x12,x25,#32
|
|
mov w13,w26
|
|
lsr x14,x26,#32
|
|
mov w15,w27
|
|
lsr x16,x27,#32
|
|
mov w17,w28
|
|
lsr x19,x28,#32
|
|
mov w20,w30
|
|
lsr x21,x30,#32
|
|
|
|
// x4 is the trip count for Loop_lower_neon (decremented at its top, tested by
// the cbnz at its bottom), so that loop body runs five times.
mov x4,#5
|
|
Loop_lower_neon:
|
|
sub x4,x4,#1
|
|
add v0.4s,v0.4s,v1.4s
|
|
add w5,w5,w9
|
|
add v4.4s,v4.4s,v5.4s
|
|
add w6,w6,w10
|
|
add v8.4s,v8.4s,v9.4s
|
|
add w7,w7,w11
|
|
add v12.4s,v12.4s,v13.4s
|
|
add w8,w8,w12
|
|
add v16.4s,v16.4s,v17.4s
|
|
eor w17,w17,w5
|
|
add v20.4s,v20.4s,v21.4s
|
|
eor w19,w19,w6
|
|
eor v3.16b,v3.16b,v0.16b
|
|
eor w20,w20,w7
|
|
eor v7.16b,v7.16b,v4.16b
|
|
eor w21,w21,w8
|
|
eor v11.16b,v11.16b,v8.16b
|
|
ror w17,w17,#16
|
|
eor v15.16b,v15.16b,v12.16b
|
|
ror w19,w19,#16
|
|
eor v19.16b,v19.16b,v16.16b
|
|
ror w20,w20,#16
|
|
eor v23.16b,v23.16b,v20.16b
|
|
ror w21,w21,#16
|
|
rev32 v3.8h,v3.8h
|
|
add w13,w13,w17
|
|
rev32 v7.8h,v7.8h
|
|
add w14,w14,w19
|
|
rev32 v11.8h,v11.8h
|
|
add w15,w15,w20
|
|
rev32 v15.8h,v15.8h
|
|
add w16,w16,w21
|
|
rev32 v19.8h,v19.8h
|
|
eor w9,w9,w13
|
|
rev32 v23.8h,v23.8h
|
|
eor w10,w10,w14
|
|
add v2.4s,v2.4s,v3.4s
|
|
eor w11,w11,w15
|
|
add v6.4s,v6.4s,v7.4s
|
|
eor w12,w12,w16
|
|
add v10.4s,v10.4s,v11.4s
|
|
ror w9,w9,#20
|
|
add v14.4s,v14.4s,v15.4s
|
|
ror w10,w10,#20
|
|
add v18.4s,v18.4s,v19.4s
|
|
ror w11,w11,#20
|
|
add v22.4s,v22.4s,v23.4s
|
|
ror w12,w12,#20
|
|
eor v24.16b,v1.16b,v2.16b
|
|
add w5,w5,w9
|
|
eor v25.16b,v5.16b,v6.16b
|
|
add w6,w6,w10
|
|
eor v26.16b,v9.16b,v10.16b
|
|
add w7,w7,w11
|
|
eor v27.16b,v13.16b,v14.16b
|
|
add w8,w8,w12
|
|
eor v28.16b,v17.16b,v18.16b
|
|
eor w17,w17,w5
|
|
eor v29.16b,v21.16b,v22.16b
|
|
eor w19,w19,w6
|
|
ushr v1.4s,v24.4s,#20
|
|
eor w20,w20,w7
|
|
ushr v5.4s,v25.4s,#20
|
|
eor w21,w21,w8
|
|
ushr v9.4s,v26.4s,#20
|
|
ror w17,w17,#24
|
|
ushr v13.4s,v27.4s,#20
|
|
ror w19,w19,#24
|
|
ushr v17.4s,v28.4s,#20
|
|
ror w20,w20,#24
|
|
ushr v21.4s,v29.4s,#20
|
|
ror w21,w21,#24
|
|
sli v1.4s,v24.4s,#12
|
|
add w13,w13,w17
|
|
sli v5.4s,v25.4s,#12
|
|
add w14,w14,w19
|
|
sli v9.4s,v26.4s,#12
|
|
add w15,w15,w20
|
|
sli v13.4s,v27.4s,#12
|
|
add w16,w16,w21
|
|
sli v17.4s,v28.4s,#12
|
|
eor w9,w9,w13
|
|
sli v21.4s,v29.4s,#12
|
|
eor w10,w10,w14
|
|
add v0.4s,v0.4s,v1.4s
|
|
eor w11,w11,w15
|
|
add v4.4s,v4.4s,v5.4s
|
|
eor w12,w12,w16
|
|
add v8.4s,v8.4s,v9.4s
|
|
ror w9,w9,#25
|
|
add v12.4s,v12.4s,v13.4s
|
|
ror w10,w10,#25
|
|
add v16.4s,v16.4s,v17.4s
|
|
ror w11,w11,#25
|
|
add v20.4s,v20.4s,v21.4s
|
|
ror w12,w12,#25
|
|
eor v24.16b,v3.16b,v0.16b
|
|
add w5,w5,w10
|
|
eor v25.16b,v7.16b,v4.16b
|
|
add w6,w6,w11
|
|
eor v26.16b,v11.16b,v8.16b
|
|
add w7,w7,w12
|
|
eor v27.16b,v15.16b,v12.16b
|
|
add w8,w8,w9
|
|
eor v28.16b,v19.16b,v16.16b
|
|
eor w21,w21,w5
|
|
eor v29.16b,v23.16b,v20.16b
|
|
eor w17,w17,w6
|
|
ushr v3.4s,v24.4s,#24
|
|
eor w19,w19,w7
|
|
ushr v7.4s,v25.4s,#24
|
|
eor w20,w20,w8
|
|
ushr v11.4s,v26.4s,#24
|
|
ror w21,w21,#16
|
|
ushr v15.4s,v27.4s,#24
|
|
ror w17,w17,#16
|
|
ushr v19.4s,v28.4s,#24
|
|
ror w19,w19,#16
|
|
ushr v23.4s,v29.4s,#24
|
|
ror w20,w20,#16
|
|
sli v3.4s,v24.4s,#8
|
|
add w15,w15,w21
|
|
sli v7.4s,v25.4s,#8
|
|
add w16,w16,w17
|
|
sli v11.4s,v26.4s,#8
|
|
add w13,w13,w19
|
|
sli v15.4s,v27.4s,#8
|
|
add w14,w14,w20
|
|
sli v19.4s,v28.4s,#8
|
|
eor w10,w10,w15
|
|
sli v23.4s,v29.4s,#8
|
|
eor w11,w11,w16
|
|
add v2.4s,v2.4s,v3.4s
|
|
eor w12,w12,w13
|
|
add v6.4s,v6.4s,v7.4s
|
|
eor w9,w9,w14
|
|
add v10.4s,v10.4s,v11.4s
|
|
ror w10,w10,#20
|
|
add v14.4s,v14.4s,v15.4s
|
|
ror w11,w11,#20
|
|
add v18.4s,v18.4s,v19.4s
|
|
ror w12,w12,#20
|
|
add v22.4s,v22.4s,v23.4s
|
|
ror w9,w9,#20
|
|
eor v24.16b,v1.16b,v2.16b
|
|
add w5,w5,w10
|
|
eor v25.16b,v5.16b,v6.16b
|
|
add w6,w6,w11
|
|
eor v26.16b,v9.16b,v10.16b
|
|
add w7,w7,w12
|
|
eor v27.16b,v13.16b,v14.16b
|
|
add w8,w8,w9
|
|
eor v28.16b,v17.16b,v18.16b
|
|
eor w21,w21,w5
|
|
eor v29.16b,v21.16b,v22.16b
|
|
eor w17,w17,w6
|
|
ushr v1.4s,v24.4s,#25
|
|
eor w19,w19,w7
|
|
ushr v5.4s,v25.4s,#25
|
|
eor w20,w20,w8
|
|
ushr v9.4s,v26.4s,#25
|
|
ror w21,w21,#24
|
|
ushr v13.4s,v27.4s,#25
|
|
ror w17,w17,#24
|
|
ushr v17.4s,v28.4s,#25
|
|
ror w19,w19,#24
|
|
ushr v21.4s,v29.4s,#25
|
|
ror w20,w20,#24
|
|
sli v1.4s,v24.4s,#7
|
|
add w15,w15,w21
|
|
sli v5.4s,v25.4s,#7
|
|
add w16,w16,w17
|
|
sli v9.4s,v26.4s,#7
|
|
add w13,w13,w19
|
|
sli v13.4s,v27.4s,#7
|
|
add w14,w14,w20
|
|
sli v17.4s,v28.4s,#7
|
|
eor w10,w10,w15
|
|
sli v21.4s,v29.4s,#7
|
|
eor w11,w11,w16
|
|
ext v2.16b,v2.16b,v2.16b,#8
|
|
eor w12,w12,w13
|
|
ext v6.16b,v6.16b,v6.16b,#8
|
|
eor w9,w9,w14
|
|
ext v10.16b,v10.16b,v10.16b,#8
|
|
ror w10,w10,#25
|
|
ext v14.16b,v14.16b,v14.16b,#8
|
|
ror w11,w11,#25
|
|
ext v18.16b,v18.16b,v18.16b,#8
|
|
ror w12,w12,#25
|
|
ext v22.16b,v22.16b,v22.16b,#8
|
|
ror w9,w9,#25
|
|
ext v3.16b,v3.16b,v3.16b,#12
|
|
ext v7.16b,v7.16b,v7.16b,#12
|
|
ext v11.16b,v11.16b,v11.16b,#12
|
|
ext v15.16b,v15.16b,v15.16b,#12
|
|
ext v19.16b,v19.16b,v19.16b,#12
|
|
ext v23.16b,v23.16b,v23.16b,#12
|
|
ext v1.16b,v1.16b,v1.16b,#4
|
|
ext v5.16b,v5.16b,v5.16b,#4
|
|
ext v9.16b,v9.16b,v9.16b,#4
|
|
ext v13.16b,v13.16b,v13.16b,#4
|
|
ext v17.16b,v17.16b,v17.16b,#4
|
|
ext v21.16b,v21.16b,v21.16b,#4
|
|
add v0.4s,v0.4s,v1.4s
|
|
add w5,w5,w9
|
|
add v4.4s,v4.4s,v5.4s
|
|
add w6,w6,w10
|
|
add v8.4s,v8.4s,v9.4s
|
|
add w7,w7,w11
|
|
add v12.4s,v12.4s,v13.4s
|
|
add w8,w8,w12
|
|
add v16.4s,v16.4s,v17.4s
|
|
eor w17,w17,w5
|
|
add v20.4s,v20.4s,v21.4s
|
|
eor w19,w19,w6
|
|
eor v3.16b,v3.16b,v0.16b
|
|
eor w20,w20,w7
|
|
eor v7.16b,v7.16b,v4.16b
|
|
eor w21,w21,w8
|
|
eor v11.16b,v11.16b,v8.16b
|
|
ror w17,w17,#16
|
|
eor v15.16b,v15.16b,v12.16b
|
|
ror w19,w19,#16
|
|
eor v19.16b,v19.16b,v16.16b
|
|
ror w20,w20,#16
|
|
eor v23.16b,v23.16b,v20.16b
|
|
ror w21,w21,#16
|
|
rev32 v3.8h,v3.8h
|
|
add w13,w13,w17
|
|
rev32 v7.8h,v7.8h
|
|
add w14,w14,w19
|
|
rev32 v11.8h,v11.8h
|
|
add w15,w15,w20
|
|
rev32 v15.8h,v15.8h
|
|
add w16,w16,w21
|
|
rev32 v19.8h,v19.8h
|
|
eor w9,w9,w13
|
|
rev32 v23.8h,v23.8h
|
|
eor w10,w10,w14
|
|
add v2.4s,v2.4s,v3.4s
|
|
eor w11,w11,w15
|
|
add v6.4s,v6.4s,v7.4s
|
|
eor w12,w12,w16
|
|
add v10.4s,v10.4s,v11.4s
|
|
ror w9,w9,#20
|
|
add v14.4s,v14.4s,v15.4s
|
|
ror w10,w10,#20
|
|
add v18.4s,v18.4s,v19.4s
|
|
ror w11,w11,#20
|
|
add v22.4s,v22.4s,v23.4s
|
|
ror w12,w12,#20
|
|
eor v24.16b,v1.16b,v2.16b
|
|
add w5,w5,w9
|
|
eor v25.16b,v5.16b,v6.16b
|
|
add w6,w6,w10
|
|
eor v26.16b,v9.16b,v10.16b
|
|
add w7,w7,w11
|
|
eor v27.16b,v13.16b,v14.16b
|
|
add w8,w8,w12
|
|
eor v28.16b,v17.16b,v18.16b
|
|
eor w17,w17,w5
|
|
eor v29.16b,v21.16b,v22.16b
|
|
eor w19,w19,w6
|
|
ushr v1.4s,v24.4s,#20
|
|
eor w20,w20,w7
|
|
ushr v5.4s,v25.4s,#20
|
|
eor w21,w21,w8
|
|
ushr v9.4s,v26.4s,#20
|
|
ror w17,w17,#24
|
|
ushr v13.4s,v27.4s,#20
|
|
ror w19,w19,#24
|
|
ushr v17.4s,v28.4s,#20
|
|
ror w20,w20,#24
|
|
ushr v21.4s,v29.4s,#20
|
|
ror w21,w21,#24
|
|
sli v1.4s,v24.4s,#12
|
|
add w13,w13,w17
|
|
sli v5.4s,v25.4s,#12
|
|
add w14,w14,w19
|
|
sli v9.4s,v26.4s,#12
|
|
add w15,w15,w20
|
|
sli v13.4s,v27.4s,#12
|
|
add w16,w16,w21
|
|
sli v17.4s,v28.4s,#12
|
|
eor w9,w9,w13
|
|
sli v21.4s,v29.4s,#12
|
|
eor w10,w10,w14
|
|
add v0.4s,v0.4s,v1.4s
|
|
eor w11,w11,w15
|
|
add v4.4s,v4.4s,v5.4s
|
|
eor w12,w12,w16
|
|
add v8.4s,v8.4s,v9.4s
|
|
ror w9,w9,#25
|
|
add v12.4s,v12.4s,v13.4s
|
|
ror w10,w10,#25
|
|
add v16.4s,v16.4s,v17.4s
|
|
ror w11,w11,#25
|
|
add v20.4s,v20.4s,v21.4s
|
|
ror w12,w12,#25
|
|
eor v24.16b,v3.16b,v0.16b
|
|
add w5,w5,w10
|
|
eor v25.16b,v7.16b,v4.16b
|
|
add w6,w6,w11
|
|
eor v26.16b,v11.16b,v8.16b
|
|
add w7,w7,w12
|
|
eor v27.16b,v15.16b,v12.16b
|
|
add w8,w8,w9
|
|
eor v28.16b,v19.16b,v16.16b
|
|
eor w21,w21,w5
|
|
eor v29.16b,v23.16b,v20.16b
|
|
eor w17,w17,w6
|
|
ushr v3.4s,v24.4s,#24
|
|
eor w19,w19,w7
|
|
ushr v7.4s,v25.4s,#24
|
|
eor w20,w20,w8
|
|
ushr v11.4s,v26.4s,#24
|
|
ror w21,w21,#16
|
|
ushr v15.4s,v27.4s,#24
|
|
ror w17,w17,#16
|
|
ushr v19.4s,v28.4s,#24
|
|
ror w19,w19,#16
|
|
ushr v23.4s,v29.4s,#24
|
|
ror w20,w20,#16
|
|
sli v3.4s,v24.4s,#8
|
|
add w15,w15,w21
|
|
sli v7.4s,v25.4s,#8
|
|
add w16,w16,w17
|
|
sli v11.4s,v26.4s,#8
|
|
add w13,w13,w19
|
|
sli v15.4s,v27.4s,#8
|
|
add w14,w14,w20
|
|
sli v19.4s,v28.4s,#8
|
|
eor w10,w10,w15
|
|
sli v23.4s,v29.4s,#8
|
|
eor w11,w11,w16
|
|
add v2.4s,v2.4s,v3.4s
|
|
eor w12,w12,w13
|
|
add v6.4s,v6.4s,v7.4s
|
|
eor w9,w9,w14
|
|
add v10.4s,v10.4s,v11.4s
|
|
ror w10,w10,#20
|
|
add v14.4s,v14.4s,v15.4s
|
|
ror w11,w11,#20
|
|
add v18.4s,v18.4s,v19.4s
|
|
ror w12,w12,#20
|
|
add v22.4s,v22.4s,v23.4s
|
|
ror w9,w9,#20
|
|
eor v24.16b,v1.16b,v2.16b
|
|
add w5,w5,w10
|
|
eor v25.16b,v5.16b,v6.16b
|
|
add w6,w6,w11
|
|
eor v26.16b,v9.16b,v10.16b
|
|
add w7,w7,w12
|
|
eor v27.16b,v13.16b,v14.16b
|
|
add w8,w8,w9
|
|
eor v28.16b,v17.16b,v18.16b
|
|
eor w21,w21,w5
|
|
eor v29.16b,v21.16b,v22.16b
|
|
eor w17,w17,w6
|
|
ushr v1.4s,v24.4s,#25
|
|
eor w19,w19,w7
|
|
ushr v5.4s,v25.4s,#25
|
|
eor w20,w20,w8
|
|
ushr v9.4s,v26.4s,#25
|
|
ror w21,w21,#24
|
|
ushr v13.4s,v27.4s,#25
|
|
ror w17,w17,#24
|
|
ushr v17.4s,v28.4s,#25
|
|
ror w19,w19,#24
|
|
ushr v21.4s,v29.4s,#25
|
|
ror w20,w20,#24
|
|
sli v1.4s,v24.4s,#7
|
|
add w15,w15,w21
|
|
sli v5.4s,v25.4s,#7
|
|
add w16,w16,w17
|
|
sli v9.4s,v26.4s,#7
|
|
add w13,w13,w19
|
|
sli v13.4s,v27.4s,#7
|
|
add w14,w14,w20
|
|
sli v17.4s,v28.4s,#7
|
|
eor w10,w10,w15
|
|
sli v21.4s,v29.4s,#7
|
|
eor w11,w11,w16
|
|
ext v2.16b,v2.16b,v2.16b,#8
|
|
eor w12,w12,w13
|
|
ext v6.16b,v6.16b,v6.16b,#8
|
|
eor w9,w9,w14
|
|
ext v10.16b,v10.16b,v10.16b,#8
|
|
ror w10,w10,#25
|
|
ext v14.16b,v14.16b,v14.16b,#8
|
|
ror w11,w11,#25
|
|
ext v18.16b,v18.16b,v18.16b,#8
|
|
ror w12,w12,#25
|
|
ext v22.16b,v22.16b,v22.16b,#8
|
|
ror w9,w9,#25
|
|
ext v3.16b,v3.16b,v3.16b,#4
|
|
ext v7.16b,v7.16b,v7.16b,#4
|
|
ext v11.16b,v11.16b,v11.16b,#4
|
|
ext v15.16b,v15.16b,v15.16b,#4
|
|
ext v19.16b,v19.16b,v19.16b,#4
|
|
ext v23.16b,v23.16b,v23.16b,#4
|
|
ext v1.16b,v1.16b,v1.16b,#12
|
|
ext v5.16b,v5.16b,v5.16b,#12
|
|
ext v9.16b,v9.16b,v9.16b,#12
|
|
ext v13.16b,v13.16b,v13.16b,#12
|
|
ext v17.16b,v17.16b,v17.16b,#12
|
|
ext v21.16b,v21.16b,v21.16b,#12
|
|
cbnz x4,Loop_lower_neon
|
|
|
|
// Flush after the last Loop_lower_neon iteration.  Two instruction streams are
// interleaved here:
//   * scalar: the block held in w5-w17,w19-w21 gets the packed input state
//     (x22-x28,x30) added, is packed into 64-bit registers, XORed with 64 bytes
//     from [x1] and stored to [x0];
//   * NEON: six blocks held four vectors each (v0-v3, v4-v7, v8-v11, v12-v15,
//     v16-v19, v20-v23) get their saved input rows added, then are XORed with
//     six further 64-byte chunks from [x1] and stored to [x0].
// Together that is 64 + 6*64 = 448 bytes handled in this section.
// v24-v29 are reloaded from the 96-byte area at [sp]; v30 and v31 are used as
// they stand in registers.
// NOTE(review): what v24-v31 contain is established before this part of the
// file; the comments below describe only how they are consumed - confirm
// against the code that fills the [sp] area.
add w5,w5,w22 // accumulate key block
|
|
ldp q24,q25,[sp,#0]
|
|
add x6,x6,x22,lsr#32
|
|
ldp q26,q27,[sp,#32]
|
|
add w7,w7,w23
|
|
ldp q28,q29,[sp,#64]
|
|
add x8,x8,x23,lsr#32
|
|
// v24 is added to the first vector of every NEON block (v0,v4,...,v20).
add v0.4s,v0.4s,v24.4s
|
|
add w9,w9,w24
|
|
add v4.4s,v4.4s,v24.4s
|
|
add x10,x10,x24,lsr#32
|
|
add v8.4s,v8.4s,v24.4s
|
|
add w11,w11,w25
|
|
add v12.4s,v12.4s,v24.4s
|
|
add x12,x12,x25,lsr#32
|
|
add v16.4s,v16.4s,v24.4s
|
|
add w13,w13,w26
|
|
add v20.4s,v20.4s,v24.4s
|
|
add x14,x14,x26,lsr#32
|
|
// v26 is added to the third vector of every NEON block (v2,v6,...,v22).
add v2.4s,v2.4s,v26.4s
|
|
add w15,w15,w27
|
|
add v6.4s,v6.4s,v26.4s
|
|
add x16,x16,x27,lsr#32
|
|
add v10.4s,v10.4s,v26.4s
|
|
add w17,w17,w28
|
|
add v14.4s,v14.4s,v26.4s
|
|
add x19,x19,x28,lsr#32
|
|
add v18.4s,v18.4s,v26.4s
|
|
add w20,w20,w30
|
|
add v22.4s,v22.4s,v26.4s
|
|
add x21,x21,x30,lsr#32
|
|
// The fourth vector differs per block: v3,v7,v11,v15 receive v27,v28,v29,v30,
// while v19 and v23 receive v31 (the "+4" step) plus v27 and v28 respectively,
// i.e. blocks five and six reuse the first two blocks' rows offset by v31.
add v19.4s,v19.4s,v31.4s // +4
|
|
add x5,x5,x6,lsl#32 // pack
|
|
add v23.4s,v23.4s,v31.4s // +4
|
|
add x7,x7,x8,lsl#32
|
|
add v3.4s,v3.4s,v27.4s
|
|
ldp x6,x8,[x1,#0] // load input
|
|
add v7.4s,v7.4s,v28.4s
|
|
add x9,x9,x10,lsl#32
|
|
add v11.4s,v11.4s,v29.4s
|
|
add x11,x11,x12,lsl#32
|
|
add v15.4s,v15.4s,v30.4s
|
|
ldp x10,x12,[x1,#16]
|
|
add v19.4s,v19.4s,v27.4s
|
|
add x13,x13,x14,lsl#32
|
|
add v23.4s,v23.4s,v28.4s
|
|
add x15,x15,x16,lsl#32
|
|
// v25 is added to the second vector of every NEON block (v1,v5,...,v21).
add v1.4s,v1.4s,v25.4s
|
|
ldp x14,x16,[x1,#32]
|
|
add v5.4s,v5.4s,v25.4s
|
|
add x17,x17,x19,lsl#32
|
|
add v9.4s,v9.4s,v25.4s
|
|
add x20,x20,x21,lsl#32
|
|
add v13.4s,v13.4s,v25.4s
|
|
ldp x19,x21,[x1,#48]
|
|
add v17.4s,v17.4s,v25.4s
|
|
add x1,x1,#64
|
|
add v21.4s,v21.4s,v25.4s
|
|
|
|
// Big-endian builds only: byte-reverse the packed scalar words before the XOR.
#ifdef __ARMEB__
|
|
rev x5,x5
|
|
rev x7,x7
|
|
rev x9,x9
|
|
rev x11,x11
|
|
rev x13,x13
|
|
rev x15,x15
|
|
rev x17,x17
|
|
rev x20,x20
|
|
#endif
|
|
// From here v24-v27 double as input buffers (all their additions are done).
// Each ld1 fetches 64 input bytes and post-increments x1 by 64.
ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
|
|
eor x5,x5,x6
|
|
eor x7,x7,x8
|
|
eor x9,x9,x10
|
|
eor x11,x11,x12
|
|
eor x13,x13,x14
|
|
eor v0.16b,v0.16b,v24.16b
|
|
eor x15,x15,x16
|
|
eor v1.16b,v1.16b,v25.16b
|
|
eor x17,x17,x19
|
|
eor v2.16b,v2.16b,v26.16b
|
|
eor x20,x20,x21
|
|
eor v3.16b,v3.16b,v27.16b
|
|
ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
|
|
|
|
// Scalar output goes out first (64 bytes), then the first NEON block.
// x28 += 7 here; together with the += 1 in the flush that follows
// Loop_upper_neon the packed counter advances by 8 per pass.
stp x5,x7,[x0,#0] // store output
|
|
add x28,x28,#7 // increment counter
|
|
stp x9,x11,[x0,#16]
|
|
stp x13,x15,[x0,#32]
|
|
stp x17,x20,[x0,#48]
|
|
add x0,x0,#64
|
|
st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
|
|
|
|
// Rolling buffer scheme: once a block has been stored, its four registers are
// reused to hold the input chunk for a later block (v0-v3 for the v8-v11
// block, v4-v7 for v12-v15, v8-v11 for v16-v19, v12-v15 for v20-v23).
ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
|
|
eor v4.16b,v4.16b,v24.16b
|
|
eor v5.16b,v5.16b,v25.16b
|
|
eor v6.16b,v6.16b,v26.16b
|
|
eor v7.16b,v7.16b,v27.16b
|
|
st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
|
|
|
|
ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
|
|
eor v8.16b,v8.16b,v0.16b
|
|
// v24-v27 were overwritten with input above; reload them from [sp] so they
// are valid again after this section (v27 is adjusted at the end of it).
ldp q24,q25,[sp,#0]
|
|
eor v9.16b,v9.16b,v1.16b
|
|
ldp q26,q27,[sp,#32]
|
|
eor v10.16b,v10.16b,v2.16b
|
|
eor v11.16b,v11.16b,v3.16b
|
|
st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64
|
|
|
|
ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64
|
|
eor v12.16b,v12.16b,v4.16b
|
|
eor v13.16b,v13.16b,v5.16b
|
|
eor v14.16b,v14.16b,v6.16b
|
|
eor v15.16b,v15.16b,v7.16b
|
|
st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64
|
|
|
|
ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64
|
|
eor v16.16b,v16.16b,v8.16b
|
|
eor v17.16b,v17.16b,v9.16b
|
|
eor v18.16b,v18.16b,v10.16b
|
|
eor v19.16b,v19.16b,v11.16b
|
|
st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
|
|
|
|
// v0 is free again (its input chunk was consumed above): build the per-pass
// step as v31 << 1, which the existing comments describe as turning 4 into 8.
shl v0.4s,v31.4s,#1 // 4 -> 8
|
|
eor v20.16b,v20.16b,v12.16b
|
|
eor v21.16b,v21.16b,v13.16b
|
|
eor v22.16b,v22.16b,v14.16b
|
|
eor v23.16b,v23.16b,v15.16b
|
|
st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
|
|
|
|
// Advance the four per-block fourth-row vectors by the step for the next pass.
add v27.4s,v27.4s,v0.4s // += 8
|
|
add v28.4s,v28.4s,v0.4s
|
|
add v29.4s,v29.4s,v0.4s
|
|
add v30.4s,v30.4s,v0.4s
|
|
|
|
// Loop-back / exit dispatch for the 512-byte NEON path.
// NOTE(review): the flags tested by this b.hs are not produced by any
// instruction from Loop_lower_neon down to here (none of them writes NZCV);
// presumably they come from a flag-setting subtract on the remaining length
// x2 near the top of Loop_outer_512_neon - confirm.
b.hs Loop_outer_512_neon
|
|
|
|
// Not looping again: add 512 back to x2 and set the flags on the result
// (Z set means x2 is now zero; tested by the b.eq below, since the
// intervening ldp/stp/ushr instructions leave the flags alone).
// NOTE(review): presumably this undoes an earlier 512 bias on x2 - confirm.
adds x2,x2,#512
|
|
ushr v0.4s,v31.4s,#2 // 4 -> 1
|
|
|
|
// AAPCS64 makes the low 64 bits of v8-v15 callee-saved; reload them from the
// 64 bytes at sp+128..sp+191 before leaving this path.
ldp d8,d9,[sp,#128+0] // meet ABI requirements
|
|
ldp d10,d11,[sp,#128+16]
|
|
ldp d12,d13,[sp,#128+32]
|
|
ldp d14,d15,[sp,#128+48]
|
|
|
|
// Overwrite the 96 bytes at sp+0..sp+95 (the area the ldp q24..q29 loads in
// this path read from) with copies of v24/v31.
stp q24,q31,[sp,#0] // wipe off-load area
|
|
stp q24,q31,[sp,#32]
|
|
stp q24,q31,[sp,#64]
|
|
|
|
// Nothing left to process: go to the epilogue (sp is still 128 bytes below
// the level used by the other paths; Ldone_512_neon accounts for that).
b.eq Ldone_512_neon
|
|
|
|
// Input remains.  Pick the continuation by size: at least 192 bytes goes to
// Loop_outer_neon, anything smaller falls through to the scalar Loop_outer.
// The vector sub and the add to sp below do not alter the flags set by cmp.
cmp x2,#192
|
|
sub v27.4s,v27.4s,v0.4s // -= 1
|
|
sub v28.4s,v28.4s,v0.4s
|
|
sub v29.4s,v29.4s,v0.4s
|
|
// Release the 128-byte scratch area that only this path uses.
add sp,sp,#128
|
|
b.hs Loop_outer_neon
|
|
|
|
// Scalar continuation: zero v25-v30 before branching to Loop_outer.
// NOTE(review): presumably so no key/counter material is left in NEON
// registers on a path that no longer uses them - confirm intent.
eor v25.16b,v25.16b,v25.16b
|
|
eor v26.16b,v26.16b,v26.16b
|
|
eor v27.16b,v27.16b,v27.16b
|
|
eor v28.16b,v28.16b,v28.16b
|
|
eor v29.16b,v29.16b,v29.16b
|
|
eor v30.16b,v30.16b,v30.16b
|
|
b Loop_outer
|
|
|
|
// Epilogue of the 512-byte NEON path: reached when no input remains.
// Reloads callee-saved x19-x28 from the frame record addressed by x29
// (offsets 16..80), releases 128+64 bytes of local stack, then pops the
// 96-byte frame while restoring x29/x30 and returns.
// NOTE(review): the final post-indexed ldp only restores the right values if
// sp equals x29 after the add below; the matching prologue is not part of
// this section - confirm the 128+64 figure against it.
Ldone_512_neon:
|
|
ldp x19,x20,[x29,#16]
|
|
add sp,sp,#128+64 // drop the 128-byte scratch area plus 64 more bytes of locals
|
|
ldp x21,x22,[x29,#32]
|
|
ldp x23,x24,[x29,#48]
|
|
ldp x25,x26,[x29,#64]
|
|
ldp x27,x28,[x29,#80]
|
|
ldp x29,x30,[sp],#96 // restore FP/LR and pop the 96-byte frame
|
|
ret
|
|
|