cf92ac7a0c
1. Subfolders in the Config/ directory now show up as submenus. 2. Added a way to run TunSafe as a Windows Service. Foreground Mode: The service will disconnect when TunSafe closes. Background Mode: The service will stay connected in the background. It is no longer required to run the TunSafe client as Admin as long as the service is running. 3. New config setting [Interface].ExcludedIPs to configure IPs that should not be routed through TunSafe. 4. Can now automatically start TunSafe when Windows starts. 5. New UI with tabs and graphs. 6. Cache DNS queries to ensure DNS will succeed if the connection fails. 7. Recreate tray icon when explorer.exe restarts. 8. Renamed window title to TunSafe instead of TunSafe VPN Client. 9. Main window is now resizable. 10. Disallow roaming endpoint when using AllowedIPs=0.0.0.0/0. Only the original endpoint is added to the routing table, so roaming would result in an endless loop of packets. 11. Display approximate WireGuard framing overhead in stats. 12. Preparations for protocol handling with multiple threads. 13. Delete the routes we made when disconnecting. 14. Fix error message about being unable to delete a route when connecting.
1940 lines
38 KiB
ArmAsm
1940 lines
38 KiB
ArmAsm
/* SPDX-License-Identifier: OpenSSL OR (BSD-3-Clause OR GPL-2.0)
|
|
*
|
|
* Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
|
|
* Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
|
|
*/
|
|
|
|
#include <linux/linkage.h>
|
|
|
|
.text

.align	5
// ChaCha constant words 0..3, the ASCII string "expand 32-byte k".
// Stored as two 64-bit values so that a native-endian `ldp` yields the
// same register contents on little- and big-endian builds: the low half
// of each register is the lower-numbered state word.
.Lsigma:
.quad	0x3320646e61707865,0x6b20657479622d32		// endian-neutral
// Vector {1,0,0,0}: added to the counter row by the NEON code to step
// the block counter (state word 12).
// Must stay immediately after .Lsigma: chacha20_neon reaches it by
// post-incrementing the .Lsigma pointer by 16 bytes.
.Lone:
.long	1,0,0,0
|
|
|
|
.align 5
|
|
//----------------------------------------------------------------------
// chacha20_arm -- scalar (integer-register) ChaCha20 for AArch64.
//
// XORs a ChaCha20 keystream into the input and writes the result to the
// output, one 64-byte block per outer iteration, with a byte-wise tail
// for a final partial block.
//
// In (as used by the code below; presumably matches a C prototype of
// the form (out, in, len, key, counter) -- confirm against the caller):
//	x0 = output pointer
//	x1 = input pointer
//	x2 = length in bytes (0 returns immediately)
//	x3 = pointer to 32 bytes of key, read as eight native 32-bit words
//	x4 = pointer to 16 bytes of counter/nonce, read as four native
//	     32-bit words; it is only read, never written back
//
// Register roles:
//	x22,x23		sigma constants   (state words 0..3, two per reg)
//	x24..x27	key               (state words 4..11)
//	x28,x30		counter and nonce (state words 12..15); x28 is
//			bumped with a 64-bit add, so words 12 and 13 form
//			one 64-bit block counter
//	w5..w17,w19..w21  working copy of the 16 state words, in order
//			(x18 is skipped, presumably because it is the
//			platform register)
//	x4		round counter inside .Loop; keystream pointer in
//			the tail
//
// Stack: a 96-byte frame holding x29, x30 and x19..x28, plus 64 bytes
// of scratch below it for the keystream of a partial last block. The
// scratch is zeroed again before returning.
//
// Clobbers: x0, x1, x2, x4..x17, flags. x19..x30 are restored.
//
// Labels .Lshort and .Less_than_64 are also branch targets of
// chacha20_neon, so the frame layout here must match the one it builds.
//----------------------------------------------------------------------
ENTRY(chacha20_arm)
	cbz	x2,.Labort		// nothing to do for len == 0
.Lshort:
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adr	x5,.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#64		// scratch for a partial block

	ldp	x22,x23,[x5]		// load sigma
	ldp	x24,x25,[x3]		// load key
	ldp	x26,x27,[x3,#16]
	ldp	x28,x30,[x4]		// load counter
#ifdef __ARMEB__
	// Big-endian: swap halves so the lower-addressed 32-bit word sits
	// in the low half of each register, as on little-endian.
	ror	x24,x24,#32
	ror	x25,x25,#32
	ror	x26,x26,#32
	ror	x27,x27,#32
	ror	x28,x28,#32
	ror	x30,x30,#32
#endif

.Loop_outer:
	mov	w5,w22			// unpack key block
	lsr	x6,x22,#32
	mov	w7,w23
	lsr	x8,x23,#32
	mov	w9,w24
	lsr	x10,x24,#32
	mov	w11,w25
	lsr	x12,x25,#32
	mov	w13,w26
	lsr	x14,x26,#32
	mov	w15,w27
	lsr	x16,x27,#32
	mov	w17,w28
	lsr	x19,x28,#32
	mov	w20,w30
	lsr	x21,x30,#32

	mov	x4,#10			// 10 double rounds = 20 rounds
	// The flags set here stay live until the b.lo/b.hi further down:
	// nothing in between uses a flag-setting instruction.
	subs	x2,x2,#64
.Loop:
	sub	x4,x4,#1
	// Column round: four quarter-rounds on (0,4,8,12) (1,5,9,13)
	// (2,6,10,14) (3,7,11,15), interleaved. ror by 16/20/24/25 is a
	// left-rotate by 16/12/8/7.
	add	w5,w5,w9
	add	w6,w6,w10
	add	w7,w7,w11
	add	w8,w8,w12
	eor	w17,w17,w5
	eor	w19,w19,w6
	eor	w20,w20,w7
	eor	w21,w21,w8
	ror	w17,w17,#16
	ror	w19,w19,#16
	ror	w20,w20,#16
	ror	w21,w21,#16
	add	w13,w13,w17
	add	w14,w14,w19
	add	w15,w15,w20
	add	w16,w16,w21
	eor	w9,w9,w13
	eor	w10,w10,w14
	eor	w11,w11,w15
	eor	w12,w12,w16
	ror	w9,w9,#20
	ror	w10,w10,#20
	ror	w11,w11,#20
	ror	w12,w12,#20
	add	w5,w5,w9
	add	w6,w6,w10
	add	w7,w7,w11
	add	w8,w8,w12
	eor	w17,w17,w5
	eor	w19,w19,w6
	eor	w20,w20,w7
	eor	w21,w21,w8
	ror	w17,w17,#24
	ror	w19,w19,#24
	ror	w20,w20,#24
	ror	w21,w21,#24
	add	w13,w13,w17
	add	w14,w14,w19
	add	w15,w15,w20
	add	w16,w16,w21
	eor	w9,w9,w13
	eor	w10,w10,w14
	eor	w11,w11,w15
	eor	w12,w12,w16
	ror	w9,w9,#25
	ror	w10,w10,#25
	ror	w11,w11,#25
	ror	w12,w12,#25
	// Diagonal round: quarter-rounds on (0,5,10,15) (1,6,11,12)
	// (2,7,8,13) (3,4,9,14), done by re-pairing registers instead of
	// moving data.
	add	w5,w5,w10
	add	w6,w6,w11
	add	w7,w7,w12
	add	w8,w8,w9
	eor	w21,w21,w5
	eor	w17,w17,w6
	eor	w19,w19,w7
	eor	w20,w20,w8
	ror	w21,w21,#16
	ror	w17,w17,#16
	ror	w19,w19,#16
	ror	w20,w20,#16
	add	w15,w15,w21
	add	w16,w16,w17
	add	w13,w13,w19
	add	w14,w14,w20
	eor	w10,w10,w15
	eor	w11,w11,w16
	eor	w12,w12,w13
	eor	w9,w9,w14
	ror	w10,w10,#20
	ror	w11,w11,#20
	ror	w12,w12,#20
	ror	w9,w9,#20
	add	w5,w5,w10
	add	w6,w6,w11
	add	w7,w7,w12
	add	w8,w8,w9
	eor	w21,w21,w5
	eor	w17,w17,w6
	eor	w19,w19,w7
	eor	w20,w20,w8
	ror	w21,w21,#24
	ror	w17,w17,#24
	ror	w19,w19,#24
	ror	w20,w20,#24
	add	w15,w15,w21
	add	w16,w16,w17
	add	w13,w13,w19
	add	w14,w14,w20
	eor	w10,w10,w15
	eor	w11,w11,w16
	eor	w12,w12,w13
	eor	w9,w9,w14
	ror	w10,w10,#25
	ror	w11,w11,#25
	ror	w12,w12,#25
	ror	w9,w9,#25
	cbnz	x4,.Loop

	// Add the input state back in. Odd-numbered words use a 64-bit add
	// of the packed register's high half; any carry into bit 32 is
	// discarded by the lsl #32 when the words are packed below.
	add	w5,w5,w22		// accumulate key block
	add	x6,x6,x22,lsr#32
	add	w7,w7,w23
	add	x8,x8,x23,lsr#32
	add	w9,w9,w24
	add	x10,x10,x24,lsr#32
	add	w11,w11,w25
	add	x12,x12,x25,lsr#32
	add	w13,w13,w26
	add	x14,x14,x26,lsr#32
	add	w15,w15,w27
	add	x16,x16,x27,lsr#32
	add	w17,w17,w28
	add	x19,x19,x28,lsr#32
	add	w20,w20,w30
	add	x21,x21,x30,lsr#32

	b.lo	.Ltail			// fewer than 64 bytes were left

	// Full block: pack word pairs into 64-bit registers while loading
	// the 64 input bytes into the registers that just became free.
	add	x5,x5,x6,lsl#32	// pack
	add	x7,x7,x8,lsl#32
	ldp	x6,x8,[x1,#0]		// load input
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	ldp	x10,x12,[x1,#16]
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	ldp	x14,x16,[x1,#32]
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
	ldp	x19,x21,[x1,#48]
	add	x1,x1,#64
#ifdef __ARMEB__
	// Big-endian: byte-reverse the keystream so that, XORed with
	// natively loaded input, it lines up in little-endian byte order.
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	eor	x5,x5,x6
	eor	x7,x7,x8
	eor	x9,x9,x10
	eor	x11,x11,x12
	eor	x13,x13,x14
	eor	x15,x15,x16
	eor	x17,x17,x19
	eor	x20,x20,x21

	stp	x5,x7,[x0,#0]		// store output
	add	x28,x28,#1		// increment counter
	stp	x9,x11,[x0,#16]
	stp	x13,x15,[x0,#32]
	stp	x17,x20,[x0,#48]
	add	x0,x0,#64

	b.hi	.Loop_outer		// bytes remain after this block

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
.Labort:
	ret

.align	4
.Ltail:
	add	x2,x2,#64		// undo the subs: x2 = bytes left (1..63)
.Less_than_64:
	// Set up negative indexing: the pointers are advanced to the end
	// of their ranges and x2 counts up from -len to 0. x0 is biased by
	// -1 because the store happens after x2 has been incremented.
	sub	x0,x0,#1
	add	x1,x1,x2
	add	x0,x0,x2
	add	x4,sp,x2
	neg	x2,x2

	add	x5,x5,x6,lsl#32	// pack
	add	x7,x7,x8,lsl#32
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
#ifdef __ARMEB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	// Spill the whole keystream block so it can be read byte by byte.
	stp	x5,x7,[sp,#0]
	stp	x9,x11,[sp,#16]
	stp	x13,x15,[sp,#32]
	stp	x17,x20,[sp,#48]

.Loop_tail:
	ldrb	w10,[x1,x2]		// input byte
	ldrb	w11,[x4,x2]		// keystream byte
	add	x2,x2,#1
	eor	w10,w10,w11
	strb	w10,[x0,x2]
	cbnz	x2,.Loop_tail

	// Wipe the keystream from the stack before returning.
	stp	xzr,xzr,[sp,#0]
	stp	xzr,xzr,[sp,#16]
	stp	xzr,xzr,[sp,#32]
	stp	xzr,xzr,[sp,#48]

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	ret
ENDPROC(chacha20_arm)
|
|
|
|
.align 5
|
|
ENTRY(chacha20_neon)
|
|
cbz x2,.Labort_neon
|
|
cmp x2,#192
|
|
b.lo .Lshort
|
|
|
|
stp x29,x30,[sp,#-96]!
|
|
add x29,sp,#0
|
|
|
|
adr x5,.Lsigma
|
|
stp x19,x20,[sp,#16]
|
|
stp x21,x22,[sp,#32]
|
|
stp x23,x24,[sp,#48]
|
|
stp x25,x26,[sp,#64]
|
|
stp x27,x28,[sp,#80]
|
|
cmp x2,#512
|
|
b.hs .L512_or_more_neon
|
|
|
|
sub sp,sp,#64
|
|
|
|
ldp x22,x23,[x5] // load sigma
|
|
ld1 {v24.4s},[x5],#16
|
|
ldp x24,x25,[x3] // load key
|
|
ldp x26,x27,[x3,#16]
|
|
ld1 {v25.4s,v26.4s},[x3]
|
|
ldp x28,x30,[x4] // load counter
|
|
ld1 {v27.4s},[x4]
|
|
ld1 {v31.4s},[x5]
|
|
#ifdef __ARMEB__
|
|
rev64 v24.4s,v24.4s
|
|
ror x24,x24,#32
|
|
ror x25,x25,#32
|
|
ror x26,x26,#32
|
|
ror x27,x27,#32
|
|
ror x28,x28,#32
|
|
ror x30,x30,#32
|
|
#endif
|
|
add v27.4s,v27.4s,v31.4s // += 1
|
|
add v28.4s,v27.4s,v31.4s
|
|
add v29.4s,v28.4s,v31.4s
|
|
shl v31.4s,v31.4s,#2 // 1 -> 4
|
|
|
|
.Loop_outer_neon:
|
|
mov w5,w22 // unpack key block
|
|
lsr x6,x22,#32
|
|
mov v0.16b,v24.16b
|
|
mov w7,w23
|
|
lsr x8,x23,#32
|
|
mov v4.16b,v24.16b
|
|
mov w9,w24
|
|
lsr x10,x24,#32
|
|
mov v16.16b,v24.16b
|
|
mov w11,w25
|
|
mov v1.16b,v25.16b
|
|
lsr x12,x25,#32
|
|
mov v5.16b,v25.16b
|
|
mov w13,w26
|
|
mov v17.16b,v25.16b
|
|
lsr x14,x26,#32
|
|
mov v3.16b,v27.16b
|
|
mov w15,w27
|
|
mov v7.16b,v28.16b
|
|
lsr x16,x27,#32
|
|
mov v19.16b,v29.16b
|
|
mov w17,w28
|
|
mov v2.16b,v26.16b
|
|
lsr x19,x28,#32
|
|
mov v6.16b,v26.16b
|
|
mov w20,w30
|
|
mov v18.16b,v26.16b
|
|
lsr x21,x30,#32
|
|
|
|
mov x4,#10
|
|
subs x2,x2,#256
|
|
.Loop_neon:
|
|
sub x4,x4,#1
|
|
add v0.4s,v0.4s,v1.4s
|
|
add w5,w5,w9
|
|
add v4.4s,v4.4s,v5.4s
|
|
add w6,w6,w10
|
|
add v16.4s,v16.4s,v17.4s
|
|
add w7,w7,w11
|
|
eor v3.16b,v3.16b,v0.16b
|
|
add w8,w8,w12
|
|
eor v7.16b,v7.16b,v4.16b
|
|
eor w17,w17,w5
|
|
eor v19.16b,v19.16b,v16.16b
|
|
eor w19,w19,w6
|
|
rev32 v3.8h,v3.8h
|
|
eor w20,w20,w7
|
|
rev32 v7.8h,v7.8h
|
|
eor w21,w21,w8
|
|
rev32 v19.8h,v19.8h
|
|
ror w17,w17,#16
|
|
add v2.4s,v2.4s,v3.4s
|
|
ror w19,w19,#16
|
|
add v6.4s,v6.4s,v7.4s
|
|
ror w20,w20,#16
|
|
add v18.4s,v18.4s,v19.4s
|
|
ror w21,w21,#16
|
|
eor v20.16b,v1.16b,v2.16b
|
|
add w13,w13,w17
|
|
eor v21.16b,v5.16b,v6.16b
|
|
add w14,w14,w19
|
|
eor v22.16b,v17.16b,v18.16b
|
|
add w15,w15,w20
|
|
ushr v1.4s,v20.4s,#20
|
|
add w16,w16,w21
|
|
ushr v5.4s,v21.4s,#20
|
|
eor w9,w9,w13
|
|
ushr v17.4s,v22.4s,#20
|
|
eor w10,w10,w14
|
|
sli v1.4s,v20.4s,#12
|
|
eor w11,w11,w15
|
|
sli v5.4s,v21.4s,#12
|
|
eor w12,w12,w16
|
|
sli v17.4s,v22.4s,#12
|
|
ror w9,w9,#20
|
|
add v0.4s,v0.4s,v1.4s
|
|
ror w10,w10,#20
|
|
add v4.4s,v4.4s,v5.4s
|
|
ror w11,w11,#20
|
|
add v16.4s,v16.4s,v17.4s
|
|
ror w12,w12,#20
|
|
eor v20.16b,v3.16b,v0.16b
|
|
add w5,w5,w9
|
|
eor v21.16b,v7.16b,v4.16b
|
|
add w6,w6,w10
|
|
eor v22.16b,v19.16b,v16.16b
|
|
add w7,w7,w11
|
|
ushr v3.4s,v20.4s,#24
|
|
add w8,w8,w12
|
|
ushr v7.4s,v21.4s,#24
|
|
eor w17,w17,w5
|
|
ushr v19.4s,v22.4s,#24
|
|
eor w19,w19,w6
|
|
sli v3.4s,v20.4s,#8
|
|
eor w20,w20,w7
|
|
sli v7.4s,v21.4s,#8
|
|
eor w21,w21,w8
|
|
sli v19.4s,v22.4s,#8
|
|
ror w17,w17,#24
|
|
add v2.4s,v2.4s,v3.4s
|
|
ror w19,w19,#24
|
|
add v6.4s,v6.4s,v7.4s
|
|
ror w20,w20,#24
|
|
add v18.4s,v18.4s,v19.4s
|
|
ror w21,w21,#24
|
|
eor v20.16b,v1.16b,v2.16b
|
|
add w13,w13,w17
|
|
eor v21.16b,v5.16b,v6.16b
|
|
add w14,w14,w19
|
|
eor v22.16b,v17.16b,v18.16b
|
|
add w15,w15,w20
|
|
ushr v1.4s,v20.4s,#25
|
|
add w16,w16,w21
|
|
ushr v5.4s,v21.4s,#25
|
|
eor w9,w9,w13
|
|
ushr v17.4s,v22.4s,#25
|
|
eor w10,w10,w14
|
|
sli v1.4s,v20.4s,#7
|
|
eor w11,w11,w15
|
|
sli v5.4s,v21.4s,#7
|
|
eor w12,w12,w16
|
|
sli v17.4s,v22.4s,#7
|
|
ror w9,w9,#25
|
|
ext v2.16b,v2.16b,v2.16b,#8
|
|
ror w10,w10,#25
|
|
ext v6.16b,v6.16b,v6.16b,#8
|
|
ror w11,w11,#25
|
|
ext v18.16b,v18.16b,v18.16b,#8
|
|
ror w12,w12,#25
|
|
ext v3.16b,v3.16b,v3.16b,#12
|
|
ext v7.16b,v7.16b,v7.16b,#12
|
|
ext v19.16b,v19.16b,v19.16b,#12
|
|
ext v1.16b,v1.16b,v1.16b,#4
|
|
ext v5.16b,v5.16b,v5.16b,#4
|
|
ext v17.16b,v17.16b,v17.16b,#4
|
|
add v0.4s,v0.4s,v1.4s
|
|
add w5,w5,w10
|
|
add v4.4s,v4.4s,v5.4s
|
|
add w6,w6,w11
|
|
add v16.4s,v16.4s,v17.4s
|
|
add w7,w7,w12
|
|
eor v3.16b,v3.16b,v0.16b
|
|
add w8,w8,w9
|
|
eor v7.16b,v7.16b,v4.16b
|
|
eor w21,w21,w5
|
|
eor v19.16b,v19.16b,v16.16b
|
|
eor w17,w17,w6
|
|
rev32 v3.8h,v3.8h
|
|
eor w19,w19,w7
|
|
rev32 v7.8h,v7.8h
|
|
eor w20,w20,w8
|
|
rev32 v19.8h,v19.8h
|
|
ror w21,w21,#16
|
|
add v2.4s,v2.4s,v3.4s
|
|
ror w17,w17,#16
|
|
add v6.4s,v6.4s,v7.4s
|
|
ror w19,w19,#16
|
|
add v18.4s,v18.4s,v19.4s
|
|
ror w20,w20,#16
|
|
eor v20.16b,v1.16b,v2.16b
|
|
add w15,w15,w21
|
|
eor v21.16b,v5.16b,v6.16b
|
|
add w16,w16,w17
|
|
eor v22.16b,v17.16b,v18.16b
|
|
add w13,w13,w19
|
|
ushr v1.4s,v20.4s,#20
|
|
add w14,w14,w20
|
|
ushr v5.4s,v21.4s,#20
|
|
eor w10,w10,w15
|
|
ushr v17.4s,v22.4s,#20
|
|
eor w11,w11,w16
|
|
sli v1.4s,v20.4s,#12
|
|
eor w12,w12,w13
|
|
sli v5.4s,v21.4s,#12
|
|
eor w9,w9,w14
|
|
sli v17.4s,v22.4s,#12
|
|
ror w10,w10,#20
|
|
add v0.4s,v0.4s,v1.4s
|
|
ror w11,w11,#20
|
|
add v4.4s,v4.4s,v5.4s
|
|
ror w12,w12,#20
|
|
add v16.4s,v16.4s,v17.4s
|
|
ror w9,w9,#20
|
|
eor v20.16b,v3.16b,v0.16b
|
|
add w5,w5,w10
|
|
eor v21.16b,v7.16b,v4.16b
|
|
add w6,w6,w11
|
|
eor v22.16b,v19.16b,v16.16b
|
|
add w7,w7,w12
|
|
ushr v3.4s,v20.4s,#24
|
|
add w8,w8,w9
|
|
ushr v7.4s,v21.4s,#24
|
|
eor w21,w21,w5
|
|
ushr v19.4s,v22.4s,#24
|
|
eor w17,w17,w6
|
|
sli v3.4s,v20.4s,#8
|
|
eor w19,w19,w7
|
|
sli v7.4s,v21.4s,#8
|
|
eor w20,w20,w8
|
|
sli v19.4s,v22.4s,#8
|
|
ror w21,w21,#24
|
|
add v2.4s,v2.4s,v3.4s
|
|
ror w17,w17,#24
|
|
add v6.4s,v6.4s,v7.4s
|
|
ror w19,w19,#24
|
|
add v18.4s,v18.4s,v19.4s
|
|
ror w20,w20,#24
|
|
eor v20.16b,v1.16b,v2.16b
|
|
add w15,w15,w21
|
|
eor v21.16b,v5.16b,v6.16b
|
|
add w16,w16,w17
|
|
eor v22.16b,v17.16b,v18.16b
|
|
add w13,w13,w19
|
|
ushr v1.4s,v20.4s,#25
|
|
add w14,w14,w20
|
|
ushr v5.4s,v21.4s,#25
|
|
eor w10,w10,w15
|
|
ushr v17.4s,v22.4s,#25
|
|
eor w11,w11,w16
|
|
sli v1.4s,v20.4s,#7
|
|
eor w12,w12,w13
|
|
sli v5.4s,v21.4s,#7
|
|
eor w9,w9,w14
|
|
sli v17.4s,v22.4s,#7
|
|
ror w10,w10,#25
|
|
ext v2.16b,v2.16b,v2.16b,#8
|
|
ror w11,w11,#25
|
|
ext v6.16b,v6.16b,v6.16b,#8
|
|
ror w12,w12,#25
|
|
ext v18.16b,v18.16b,v18.16b,#8
|
|
ror w9,w9,#25
|
|
ext v3.16b,v3.16b,v3.16b,#4
|
|
ext v7.16b,v7.16b,v7.16b,#4
|
|
ext v19.16b,v19.16b,v19.16b,#4
|
|
ext v1.16b,v1.16b,v1.16b,#12
|
|
ext v5.16b,v5.16b,v5.16b,#12
|
|
ext v17.16b,v17.16b,v17.16b,#12
|
|
cbnz x4,.Loop_neon
|
|
|
|
add w5,w5,w22 // accumulate key block
|
|
add v0.4s,v0.4s,v24.4s
|
|
add x6,x6,x22,lsr#32
|
|
add v4.4s,v4.4s,v24.4s
|
|
add w7,w7,w23
|
|
add v16.4s,v16.4s,v24.4s
|
|
add x8,x8,x23,lsr#32
|
|
add v2.4s,v2.4s,v26.4s
|
|
add w9,w9,w24
|
|
add v6.4s,v6.4s,v26.4s
|
|
add x10,x10,x24,lsr#32
|
|
add v18.4s,v18.4s,v26.4s
|
|
add w11,w11,w25
|
|
add v3.4s,v3.4s,v27.4s
|
|
add x12,x12,x25,lsr#32
|
|
add w13,w13,w26
|
|
add v7.4s,v7.4s,v28.4s
|
|
add x14,x14,x26,lsr#32
|
|
add w15,w15,w27
|
|
add v19.4s,v19.4s,v29.4s
|
|
add x16,x16,x27,lsr#32
|
|
add w17,w17,w28
|
|
add v1.4s,v1.4s,v25.4s
|
|
add x19,x19,x28,lsr#32
|
|
add w20,w20,w30
|
|
add v5.4s,v5.4s,v25.4s
|
|
add x21,x21,x30,lsr#32
|
|
add v17.4s,v17.4s,v25.4s
|
|
|
|
b.lo .Ltail_neon
|
|
|
|
add x5,x5,x6,lsl#32 // pack
|
|
add x7,x7,x8,lsl#32
|
|
ldp x6,x8,[x1,#0] // load input
|
|
add x9,x9,x10,lsl#32
|
|
add x11,x11,x12,lsl#32
|
|
ldp x10,x12,[x1,#16]
|
|
add x13,x13,x14,lsl#32
|
|
add x15,x15,x16,lsl#32
|
|
ldp x14,x16,[x1,#32]
|
|
add x17,x17,x19,lsl#32
|
|
add x20,x20,x21,lsl#32
|
|
ldp x19,x21,[x1,#48]
|
|
add x1,x1,#64
|
|
#ifdef __ARMEB__
|
|
rev x5,x5
|
|
rev x7,x7
|
|
rev x9,x9
|
|
rev x11,x11
|
|
rev x13,x13
|
|
rev x15,x15
|
|
rev x17,x17
|
|
rev x20,x20
|
|
#endif
|
|
ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
|
|
eor x5,x5,x6
|
|
eor x7,x7,x8
|
|
eor x9,x9,x10
|
|
eor x11,x11,x12
|
|
eor x13,x13,x14
|
|
eor v0.16b,v0.16b,v20.16b
|
|
eor x15,x15,x16
|
|
eor v1.16b,v1.16b,v21.16b
|
|
eor x17,x17,x19
|
|
eor v2.16b,v2.16b,v22.16b
|
|
eor x20,x20,x21
|
|
eor v3.16b,v3.16b,v23.16b
|
|
ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
|
|
|
|
stp x5,x7,[x0,#0] // store output
|
|
add x28,x28,#4 // increment counter
|
|
stp x9,x11,[x0,#16]
|
|
add v27.4s,v27.4s,v31.4s // += 4
|
|
stp x13,x15,[x0,#32]
|
|
add v28.4s,v28.4s,v31.4s
|
|
stp x17,x20,[x0,#48]
|
|
add v29.4s,v29.4s,v31.4s
|
|
add x0,x0,#64
|
|
|
|
st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
|
|
ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
|
|
|
|
eor v4.16b,v4.16b,v20.16b
|
|
eor v5.16b,v5.16b,v21.16b
|
|
eor v6.16b,v6.16b,v22.16b
|
|
eor v7.16b,v7.16b,v23.16b
|
|
st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
|
|
|
|
eor v16.16b,v16.16b,v0.16b
|
|
eor v17.16b,v17.16b,v1.16b
|
|
eor v18.16b,v18.16b,v2.16b
|
|
eor v19.16b,v19.16b,v3.16b
|
|
st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
|
|
|
|
b.hi .Loop_outer_neon
|
|
|
|
ldp x19,x20,[x29,#16]
|
|
add sp,sp,#64
|
|
ldp x21,x22,[x29,#32]
|
|
ldp x23,x24,[x29,#48]
|
|
ldp x25,x26,[x29,#64]
|
|
ldp x27,x28,[x29,#80]
|
|
ldp x29,x30,[sp],#96
|
|
ret
|
|
|
|
.Ltail_neon:
|
|
add x2,x2,#256
|
|
cmp x2,#64
|
|
b.lo .Less_than_64
|
|
|
|
add x5,x5,x6,lsl#32 // pack
|
|
add x7,x7,x8,lsl#32
|
|
ldp x6,x8,[x1,#0] // load input
|
|
add x9,x9,x10,lsl#32
|
|
add x11,x11,x12,lsl#32
|
|
ldp x10,x12,[x1,#16]
|
|
add x13,x13,x14,lsl#32
|
|
add x15,x15,x16,lsl#32
|
|
ldp x14,x16,[x1,#32]
|
|
add x17,x17,x19,lsl#32
|
|
add x20,x20,x21,lsl#32
|
|
ldp x19,x21,[x1,#48]
|
|
add x1,x1,#64
|
|
#ifdef __ARMEB__
|
|
rev x5,x5
|
|
rev x7,x7
|
|
rev x9,x9
|
|
rev x11,x11
|
|
rev x13,x13
|
|
rev x15,x15
|
|
rev x17,x17
|
|
rev x20,x20
|
|
#endif
|
|
eor x5,x5,x6
|
|
eor x7,x7,x8
|
|
eor x9,x9,x10
|
|
eor x11,x11,x12
|
|
eor x13,x13,x14
|
|
eor x15,x15,x16
|
|
eor x17,x17,x19
|
|
eor x20,x20,x21
|
|
|
|
stp x5,x7,[x0,#0] // store output
|
|
add x28,x28,#4 // increment counter
|
|
stp x9,x11,[x0,#16]
|
|
stp x13,x15,[x0,#32]
|
|
stp x17,x20,[x0,#48]
|
|
add x0,x0,#64
|
|
b.eq .Ldone_neon
|
|
sub x2,x2,#64
|
|
cmp x2,#64
|
|
b.lo .Less_than_128
|
|
|
|
ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
|
|
eor v0.16b,v0.16b,v20.16b
|
|
eor v1.16b,v1.16b,v21.16b
|
|
eor v2.16b,v2.16b,v22.16b
|
|
eor v3.16b,v3.16b,v23.16b
|
|
st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
|
|
b.eq .Ldone_neon
|
|
sub x2,x2,#64
|
|
cmp x2,#64
|
|
b.lo .Less_than_192
|
|
|
|
ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
|
|
eor v4.16b,v4.16b,v20.16b
|
|
eor v5.16b,v5.16b,v21.16b
|
|
eor v6.16b,v6.16b,v22.16b
|
|
eor v7.16b,v7.16b,v23.16b
|
|
st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
|
|
b.eq .Ldone_neon
|
|
sub x2,x2,#64
|
|
|
|
st1 {v16.16b,v17.16b,v18.16b,v19.16b},[sp]
|
|
b .Last_neon
|
|
|
|
.Less_than_128:
|
|
st1 {v0.16b,v1.16b,v2.16b,v3.16b},[sp]
|
|
b .Last_neon
|
|
.Less_than_192:
|
|
st1 {v4.16b,v5.16b,v6.16b,v7.16b},[sp]
|
|
b .Last_neon
|
|
|
|
.align 4
|
|
.Last_neon:
|
|
sub x0,x0,#1
|
|
add x1,x1,x2
|
|
add x0,x0,x2
|
|
add x4,sp,x2
|
|
neg x2,x2
|
|
|
|
.Loop_tail_neon:
|
|
ldrb w10,[x1,x2]
|
|
ldrb w11,[x4,x2]
|
|
add x2,x2,#1
|
|
eor w10,w10,w11
|
|
strb w10,[x0,x2]
|
|
cbnz x2,.Loop_tail_neon
|
|
|
|
stp xzr,xzr,[sp,#0]
|
|
stp xzr,xzr,[sp,#16]
|
|
stp xzr,xzr,[sp,#32]
|
|
stp xzr,xzr,[sp,#48]
|
|
|
|
.Ldone_neon:
|
|
ldp x19,x20,[x29,#16]
|
|
add sp,sp,#64
|
|
ldp x21,x22,[x29,#32]
|
|
ldp x23,x24,[x29,#48]
|
|
ldp x25,x26,[x29,#64]
|
|
ldp x27,x28,[x29,#80]
|
|
ldp x29,x30,[sp],#96
|
|
ret
|
|
|
|
.L512_or_more_neon:
|
|
sub sp,sp,#128+64
|
|
|
|
ldp x22,x23,[x5] // load sigma
|
|
ld1 {v24.4s},[x5],#16
|
|
ldp x24,x25,[x3] // load key
|
|
ldp x26,x27,[x3,#16]
|
|
ld1 {v25.4s,v26.4s},[x3]
|
|
ldp x28,x30,[x4] // load counter
|
|
ld1 {v27.4s},[x4]
|
|
ld1 {v31.4s},[x5]
|
|
#ifdef __ARMEB__
|
|
rev64 v24.4s,v24.4s
|
|
ror x24,x24,#32
|
|
ror x25,x25,#32
|
|
ror x26,x26,#32
|
|
ror x27,x27,#32
|
|
ror x28,x28,#32
|
|
ror x30,x30,#32
|
|
#endif
|
|
add v27.4s,v27.4s,v31.4s // += 1
|
|
stp q24,q25,[sp,#0] // off-load key block, invariant part
|
|
add v27.4s,v27.4s,v31.4s // not typo
|
|
str q26,[sp,#32]
|
|
add v28.4s,v27.4s,v31.4s
|
|
add v29.4s,v28.4s,v31.4s
|
|
add v30.4s,v29.4s,v31.4s
|
|
shl v31.4s,v31.4s,#2 // 1 -> 4
|
|
|
|
stp d8,d9,[sp,#128+0] // meet ABI requirements
|
|
stp d10,d11,[sp,#128+16]
|
|
stp d12,d13,[sp,#128+32]
|
|
stp d14,d15,[sp,#128+48]
|
|
|
|
sub x2,x2,#512 // not typo
|
|
|
|
.Loop_outer_512_neon:
|
|
mov v0.16b,v24.16b
|
|
mov v4.16b,v24.16b
|
|
mov v8.16b,v24.16b
|
|
mov v12.16b,v24.16b
|
|
mov v16.16b,v24.16b
|
|
mov v20.16b,v24.16b
|
|
mov v1.16b,v25.16b
|
|
mov w5,w22 // unpack key block
|
|
mov v5.16b,v25.16b
|
|
lsr x6,x22,#32
|
|
mov v9.16b,v25.16b
|
|
mov w7,w23
|
|
mov v13.16b,v25.16b
|
|
lsr x8,x23,#32
|
|
mov v17.16b,v25.16b
|
|
mov w9,w24
|
|
mov v21.16b,v25.16b
|
|
lsr x10,x24,#32
|
|
mov v3.16b,v27.16b
|
|
mov w11,w25
|
|
mov v7.16b,v28.16b
|
|
lsr x12,x25,#32
|
|
mov v11.16b,v29.16b
|
|
mov w13,w26
|
|
mov v15.16b,v30.16b
|
|
lsr x14,x26,#32
|
|
mov v2.16b,v26.16b
|
|
mov w15,w27
|
|
mov v6.16b,v26.16b
|
|
lsr x16,x27,#32
|
|
add v19.4s,v3.4s,v31.4s // +4
|
|
mov w17,w28
|
|
add v23.4s,v7.4s,v31.4s // +4
|
|
lsr x19,x28,#32
|
|
mov v10.16b,v26.16b
|
|
mov w20,w30
|
|
mov v14.16b,v26.16b
|
|
lsr x21,x30,#32
|
|
mov v18.16b,v26.16b
|
|
stp q27,q28,[sp,#48] // off-load key block, variable part
|
|
mov v22.16b,v26.16b
|
|
str q29,[sp,#80]
|
|
|
|
mov x4,#5
|
|
subs x2,x2,#512
|
|
.Loop_upper_neon:
|
|
sub x4,x4,#1
|
|
add v0.4s,v0.4s,v1.4s
|
|
add w5,w5,w9
|
|
add v4.4s,v4.4s,v5.4s
|
|
add w6,w6,w10
|
|
add v8.4s,v8.4s,v9.4s
|
|
add w7,w7,w11
|
|
add v12.4s,v12.4s,v13.4s
|
|
add w8,w8,w12
|
|
add v16.4s,v16.4s,v17.4s
|
|
eor w17,w17,w5
|
|
add v20.4s,v20.4s,v21.4s
|
|
eor w19,w19,w6
|
|
eor v3.16b,v3.16b,v0.16b
|
|
eor w20,w20,w7
|
|
eor v7.16b,v7.16b,v4.16b
|
|
eor w21,w21,w8
|
|
eor v11.16b,v11.16b,v8.16b
|
|
ror w17,w17,#16
|
|
eor v15.16b,v15.16b,v12.16b
|
|
ror w19,w19,#16
|
|
eor v19.16b,v19.16b,v16.16b
|
|
ror w20,w20,#16
|
|
eor v23.16b,v23.16b,v20.16b
|
|
ror w21,w21,#16
|
|
rev32 v3.8h,v3.8h
|
|
add w13,w13,w17
|
|
rev32 v7.8h,v7.8h
|
|
add w14,w14,w19
|
|
rev32 v11.8h,v11.8h
|
|
add w15,w15,w20
|
|
rev32 v15.8h,v15.8h
|
|
add w16,w16,w21
|
|
rev32 v19.8h,v19.8h
|
|
eor w9,w9,w13
|
|
rev32 v23.8h,v23.8h
|
|
eor w10,w10,w14
|
|
add v2.4s,v2.4s,v3.4s
|
|
eor w11,w11,w15
|
|
add v6.4s,v6.4s,v7.4s
|
|
eor w12,w12,w16
|
|
add v10.4s,v10.4s,v11.4s
|
|
ror w9,w9,#20
|
|
add v14.4s,v14.4s,v15.4s
|
|
ror w10,w10,#20
|
|
add v18.4s,v18.4s,v19.4s
|
|
ror w11,w11,#20
|
|
add v22.4s,v22.4s,v23.4s
|
|
ror w12,w12,#20
|
|
eor v24.16b,v1.16b,v2.16b
|
|
add w5,w5,w9
|
|
eor v25.16b,v5.16b,v6.16b
|
|
add w6,w6,w10
|
|
eor v26.16b,v9.16b,v10.16b
|
|
add w7,w7,w11
|
|
eor v27.16b,v13.16b,v14.16b
|
|
add w8,w8,w12
|
|
eor v28.16b,v17.16b,v18.16b
|
|
eor w17,w17,w5
|
|
eor v29.16b,v21.16b,v22.16b
|
|
eor w19,w19,w6
|
|
ushr v1.4s,v24.4s,#20
|
|
eor w20,w20,w7
|
|
ushr v5.4s,v25.4s,#20
|
|
eor w21,w21,w8
|
|
ushr v9.4s,v26.4s,#20
|
|
ror w17,w17,#24
|
|
ushr v13.4s,v27.4s,#20
|
|
ror w19,w19,#24
|
|
ushr v17.4s,v28.4s,#20
|
|
ror w20,w20,#24
|
|
ushr v21.4s,v29.4s,#20
|
|
ror w21,w21,#24
|
|
sli v1.4s,v24.4s,#12
|
|
add w13,w13,w17
|
|
sli v5.4s,v25.4s,#12
|
|
add w14,w14,w19
|
|
sli v9.4s,v26.4s,#12
|
|
add w15,w15,w20
|
|
sli v13.4s,v27.4s,#12
|
|
add w16,w16,w21
|
|
sli v17.4s,v28.4s,#12
|
|
eor w9,w9,w13
|
|
sli v21.4s,v29.4s,#12
|
|
eor w10,w10,w14
|
|
add v0.4s,v0.4s,v1.4s
|
|
eor w11,w11,w15
|
|
add v4.4s,v4.4s,v5.4s
|
|
eor w12,w12,w16
|
|
add v8.4s,v8.4s,v9.4s
|
|
ror w9,w9,#25
|
|
add v12.4s,v12.4s,v13.4s
|
|
ror w10,w10,#25
|
|
add v16.4s,v16.4s,v17.4s
|
|
ror w11,w11,#25
|
|
add v20.4s,v20.4s,v21.4s
|
|
ror w12,w12,#25
|
|
eor v24.16b,v3.16b,v0.16b
|
|
add w5,w5,w10
|
|
eor v25.16b,v7.16b,v4.16b
|
|
add w6,w6,w11
|
|
eor v26.16b,v11.16b,v8.16b
|
|
add w7,w7,w12
|
|
eor v27.16b,v15.16b,v12.16b
|
|
add w8,w8,w9
|
|
eor v28.16b,v19.16b,v16.16b
|
|
eor w21,w21,w5
|
|
eor v29.16b,v23.16b,v20.16b
|
|
eor w17,w17,w6
|
|
ushr v3.4s,v24.4s,#24
|
|
eor w19,w19,w7
|
|
ushr v7.4s,v25.4s,#24
|
|
eor w20,w20,w8
|
|
ushr v11.4s,v26.4s,#24
|
|
ror w21,w21,#16
|
|
ushr v15.4s,v27.4s,#24
|
|
ror w17,w17,#16
|
|
ushr v19.4s,v28.4s,#24
|
|
ror w19,w19,#16
|
|
ushr v23.4s,v29.4s,#24
|
|
ror w20,w20,#16
|
|
sli v3.4s,v24.4s,#8
|
|
add w15,w15,w21
|
|
sli v7.4s,v25.4s,#8
|
|
add w16,w16,w17
|
|
sli v11.4s,v26.4s,#8
|
|
add w13,w13,w19
|
|
sli v15.4s,v27.4s,#8
|
|
add w14,w14,w20
|
|
sli v19.4s,v28.4s,#8
|
|
eor w10,w10,w15
|
|
sli v23.4s,v29.4s,#8
|
|
eor w11,w11,w16
|
|
add v2.4s,v2.4s,v3.4s
|
|
eor w12,w12,w13
|
|
add v6.4s,v6.4s,v7.4s
|
|
eor w9,w9,w14
|
|
add v10.4s,v10.4s,v11.4s
|
|
ror w10,w10,#20
|
|
add v14.4s,v14.4s,v15.4s
|
|
ror w11,w11,#20
|
|
add v18.4s,v18.4s,v19.4s
|
|
ror w12,w12,#20
|
|
add v22.4s,v22.4s,v23.4s
|
|
ror w9,w9,#20
|
|
eor v24.16b,v1.16b,v2.16b
|
|
add w5,w5,w10
|
|
eor v25.16b,v5.16b,v6.16b
|
|
add w6,w6,w11
|
|
eor v26.16b,v9.16b,v10.16b
|
|
add w7,w7,w12
|
|
eor v27.16b,v13.16b,v14.16b
|
|
add w8,w8,w9
|
|
eor v28.16b,v17.16b,v18.16b
|
|
eor w21,w21,w5
|
|
eor v29.16b,v21.16b,v22.16b
|
|
eor w17,w17,w6
|
|
ushr v1.4s,v24.4s,#25
|
|
eor w19,w19,w7
|
|
ushr v5.4s,v25.4s,#25
|
|
eor w20,w20,w8
|
|
ushr v9.4s,v26.4s,#25
|
|
ror w21,w21,#24
|
|
ushr v13.4s,v27.4s,#25
|
|
ror w17,w17,#24
|
|
ushr v17.4s,v28.4s,#25
|
|
ror w19,w19,#24
|
|
ushr v21.4s,v29.4s,#25
|
|
ror w20,w20,#24
|
|
sli v1.4s,v24.4s,#7
|
|
add w15,w15,w21
|
|
sli v5.4s,v25.4s,#7
|
|
add w16,w16,w17
|
|
sli v9.4s,v26.4s,#7
|
|
add w13,w13,w19
|
|
sli v13.4s,v27.4s,#7
|
|
add w14,w14,w20
|
|
sli v17.4s,v28.4s,#7
|
|
eor w10,w10,w15
|
|
sli v21.4s,v29.4s,#7
|
|
eor w11,w11,w16
|
|
ext v2.16b,v2.16b,v2.16b,#8
|
|
eor w12,w12,w13
|
|
ext v6.16b,v6.16b,v6.16b,#8
|
|
eor w9,w9,w14
|
|
ext v10.16b,v10.16b,v10.16b,#8
|
|
ror w10,w10,#25
|
|
ext v14.16b,v14.16b,v14.16b,#8
|
|
ror w11,w11,#25
|
|
ext v18.16b,v18.16b,v18.16b,#8
|
|
ror w12,w12,#25
|
|
ext v22.16b,v22.16b,v22.16b,#8
|
|
ror w9,w9,#25
|
|
ext v3.16b,v3.16b,v3.16b,#12
|
|
ext v7.16b,v7.16b,v7.16b,#12
|
|
ext v11.16b,v11.16b,v11.16b,#12
|
|
ext v15.16b,v15.16b,v15.16b,#12
|
|
ext v19.16b,v19.16b,v19.16b,#12
|
|
ext v23.16b,v23.16b,v23.16b,#12
|
|
ext v1.16b,v1.16b,v1.16b,#4
|
|
ext v5.16b,v5.16b,v5.16b,#4
|
|
ext v9.16b,v9.16b,v9.16b,#4
|
|
ext v13.16b,v13.16b,v13.16b,#4
|
|
ext v17.16b,v17.16b,v17.16b,#4
|
|
ext v21.16b,v21.16b,v21.16b,#4
|
|
add v0.4s,v0.4s,v1.4s
|
|
add w5,w5,w9
|
|
add v4.4s,v4.4s,v5.4s
|
|
add w6,w6,w10
|
|
add v8.4s,v8.4s,v9.4s
|
|
add w7,w7,w11
|
|
add v12.4s,v12.4s,v13.4s
|
|
add w8,w8,w12
|
|
add v16.4s,v16.4s,v17.4s
|
|
eor w17,w17,w5
|
|
add v20.4s,v20.4s,v21.4s
|
|
eor w19,w19,w6
|
|
eor v3.16b,v3.16b,v0.16b
|
|
eor w20,w20,w7
|
|
eor v7.16b,v7.16b,v4.16b
|
|
eor w21,w21,w8
|
|
eor v11.16b,v11.16b,v8.16b
|
|
ror w17,w17,#16
|
|
eor v15.16b,v15.16b,v12.16b
|
|
ror w19,w19,#16
|
|
eor v19.16b,v19.16b,v16.16b
|
|
ror w20,w20,#16
|
|
eor v23.16b,v23.16b,v20.16b
|
|
ror w21,w21,#16
|
|
rev32 v3.8h,v3.8h
|
|
add w13,w13,w17
|
|
rev32 v7.8h,v7.8h
|
|
add w14,w14,w19
|
|
rev32 v11.8h,v11.8h
|
|
add w15,w15,w20
|
|
rev32 v15.8h,v15.8h
|
|
add w16,w16,w21
|
|
rev32 v19.8h,v19.8h
|
|
eor w9,w9,w13
|
|
rev32 v23.8h,v23.8h
|
|
eor w10,w10,w14
|
|
add v2.4s,v2.4s,v3.4s
|
|
eor w11,w11,w15
|
|
add v6.4s,v6.4s,v7.4s
|
|
eor w12,w12,w16
|
|
add v10.4s,v10.4s,v11.4s
|
|
ror w9,w9,#20
|
|
add v14.4s,v14.4s,v15.4s
|
|
ror w10,w10,#20
|
|
add v18.4s,v18.4s,v19.4s
|
|
ror w11,w11,#20
|
|
add v22.4s,v22.4s,v23.4s
|
|
ror w12,w12,#20
|
|
eor v24.16b,v1.16b,v2.16b
|
|
add w5,w5,w9
|
|
eor v25.16b,v5.16b,v6.16b
|
|
add w6,w6,w10
|
|
eor v26.16b,v9.16b,v10.16b
|
|
add w7,w7,w11
|
|
eor v27.16b,v13.16b,v14.16b
|
|
add w8,w8,w12
|
|
eor v28.16b,v17.16b,v18.16b
|
|
eor w17,w17,w5
|
|
eor v29.16b,v21.16b,v22.16b
|
|
eor w19,w19,w6
|
|
ushr v1.4s,v24.4s,#20
|
|
eor w20,w20,w7
|
|
ushr v5.4s,v25.4s,#20
|
|
eor w21,w21,w8
|
|
ushr v9.4s,v26.4s,#20
|
|
ror w17,w17,#24
|
|
ushr v13.4s,v27.4s,#20
|
|
ror w19,w19,#24
|
|
ushr v17.4s,v28.4s,#20
|
|
ror w20,w20,#24
|
|
ushr v21.4s,v29.4s,#20
|
|
ror w21,w21,#24
|
|
sli v1.4s,v24.4s,#12
|
|
add w13,w13,w17
|
|
sli v5.4s,v25.4s,#12
|
|
add w14,w14,w19
|
|
sli v9.4s,v26.4s,#12
|
|
add w15,w15,w20
|
|
sli v13.4s,v27.4s,#12
|
|
add w16,w16,w21
|
|
sli v17.4s,v28.4s,#12
|
|
eor w9,w9,w13
|
|
sli v21.4s,v29.4s,#12
|
|
eor w10,w10,w14
|
|
add v0.4s,v0.4s,v1.4s
|
|
eor w11,w11,w15
|
|
add v4.4s,v4.4s,v5.4s
|
|
eor w12,w12,w16
|
|
add v8.4s,v8.4s,v9.4s
|
|
ror w9,w9,#25
|
|
add v12.4s,v12.4s,v13.4s
|
|
ror w10,w10,#25
|
|
add v16.4s,v16.4s,v17.4s
|
|
ror w11,w11,#25
|
|
add v20.4s,v20.4s,v21.4s
|
|
ror w12,w12,#25
|
|
eor v24.16b,v3.16b,v0.16b
|
|
add w5,w5,w10
|
|
eor v25.16b,v7.16b,v4.16b
|
|
add w6,w6,w11
|
|
eor v26.16b,v11.16b,v8.16b
|
|
add w7,w7,w12
|
|
eor v27.16b,v15.16b,v12.16b
|
|
add w8,w8,w9
|
|
eor v28.16b,v19.16b,v16.16b
|
|
eor w21,w21,w5
|
|
eor v29.16b,v23.16b,v20.16b
|
|
eor w17,w17,w6
|
|
ushr v3.4s,v24.4s,#24
|
|
eor w19,w19,w7
|
|
ushr v7.4s,v25.4s,#24
|
|
eor w20,w20,w8
|
|
ushr v11.4s,v26.4s,#24
|
|
ror w21,w21,#16
|
|
ushr v15.4s,v27.4s,#24
|
|
ror w17,w17,#16
|
|
ushr v19.4s,v28.4s,#24
|
|
ror w19,w19,#16
|
|
ushr v23.4s,v29.4s,#24
|
|
ror w20,w20,#16
|
|
sli v3.4s,v24.4s,#8
|
|
add w15,w15,w21
|
|
sli v7.4s,v25.4s,#8
|
|
add w16,w16,w17
|
|
sli v11.4s,v26.4s,#8
|
|
add w13,w13,w19
|
|
sli v15.4s,v27.4s,#8
|
|
add w14,w14,w20
|
|
sli v19.4s,v28.4s,#8
|
|
eor w10,w10,w15
|
|
sli v23.4s,v29.4s,#8
|
|
eor w11,w11,w16
|
|
add v2.4s,v2.4s,v3.4s
|
|
eor w12,w12,w13
|
|
add v6.4s,v6.4s,v7.4s
|
|
eor w9,w9,w14
|
|
add v10.4s,v10.4s,v11.4s
|
|
ror w10,w10,#20
|
|
add v14.4s,v14.4s,v15.4s
|
|
ror w11,w11,#20
|
|
add v18.4s,v18.4s,v19.4s
|
|
ror w12,w12,#20
|
|
add v22.4s,v22.4s,v23.4s
|
|
ror w9,w9,#20
|
|
eor v24.16b,v1.16b,v2.16b
|
|
add w5,w5,w10
|
|
eor v25.16b,v5.16b,v6.16b
|
|
add w6,w6,w11
|
|
eor v26.16b,v9.16b,v10.16b
|
|
add w7,w7,w12
|
|
eor v27.16b,v13.16b,v14.16b
|
|
add w8,w8,w9
|
|
eor v28.16b,v17.16b,v18.16b
|
|
eor w21,w21,w5
|
|
eor v29.16b,v21.16b,v22.16b
|
|
eor w17,w17,w6
|
|
ushr v1.4s,v24.4s,#25
|
|
eor w19,w19,w7
|
|
ushr v5.4s,v25.4s,#25
|
|
eor w20,w20,w8
|
|
ushr v9.4s,v26.4s,#25
|
|
ror w21,w21,#24
|
|
ushr v13.4s,v27.4s,#25
|
|
ror w17,w17,#24
|
|
ushr v17.4s,v28.4s,#25
|
|
ror w19,w19,#24
|
|
ushr v21.4s,v29.4s,#25
|
|
ror w20,w20,#24
|
|
sli v1.4s,v24.4s,#7
|
|
add w15,w15,w21
|
|
sli v5.4s,v25.4s,#7
|
|
add w16,w16,w17
|
|
sli v9.4s,v26.4s,#7
|
|
add w13,w13,w19
|
|
sli v13.4s,v27.4s,#7
|
|
add w14,w14,w20
|
|
sli v17.4s,v28.4s,#7
|
|
eor w10,w10,w15
|
|
sli v21.4s,v29.4s,#7
|
|
eor w11,w11,w16
|
|
ext v2.16b,v2.16b,v2.16b,#8
|
|
eor w12,w12,w13
|
|
ext v6.16b,v6.16b,v6.16b,#8
|
|
eor w9,w9,w14
|
|
ext v10.16b,v10.16b,v10.16b,#8
|
|
ror w10,w10,#25
|
|
ext v14.16b,v14.16b,v14.16b,#8
|
|
ror w11,w11,#25
|
|
ext v18.16b,v18.16b,v18.16b,#8
|
|
ror w12,w12,#25
|
|
ext v22.16b,v22.16b,v22.16b,#8
|
|
ror w9,w9,#25
|
|
ext v3.16b,v3.16b,v3.16b,#4
|
|
ext v7.16b,v7.16b,v7.16b,#4
|
|
ext v11.16b,v11.16b,v11.16b,#4
|
|
ext v15.16b,v15.16b,v15.16b,#4
|
|
ext v19.16b,v19.16b,v19.16b,#4
|
|
ext v23.16b,v23.16b,v23.16b,#4
|
|
ext v1.16b,v1.16b,v1.16b,#12
|
|
ext v5.16b,v5.16b,v5.16b,#12
|
|
ext v9.16b,v9.16b,v9.16b,#12
|
|
ext v13.16b,v13.16b,v13.16b,#12
|
|
ext v17.16b,v17.16b,v17.16b,#12
|
|
ext v21.16b,v21.16b,v21.16b,#12
|
|
cbnz x4,.Loop_upper_neon
|
|
|
|
add w5,w5,w22 // accumulate key block
|
|
add x6,x6,x22,lsr#32
|
|
add w7,w7,w23
|
|
add x8,x8,x23,lsr#32
|
|
add w9,w9,w24
|
|
add x10,x10,x24,lsr#32
|
|
add w11,w11,w25
|
|
add x12,x12,x25,lsr#32
|
|
add w13,w13,w26
|
|
add x14,x14,x26,lsr#32
|
|
add w15,w15,w27
|
|
add x16,x16,x27,lsr#32
|
|
add w17,w17,w28
|
|
add x19,x19,x28,lsr#32
|
|
add w20,w20,w30
|
|
add x21,x21,x30,lsr#32
|
|
|
|
add x5,x5,x6,lsl#32 // pack
|
|
add x7,x7,x8,lsl#32
|
|
ldp x6,x8,[x1,#0] // load input
|
|
add x9,x9,x10,lsl#32
|
|
add x11,x11,x12,lsl#32
|
|
ldp x10,x12,[x1,#16]
|
|
add x13,x13,x14,lsl#32
|
|
add x15,x15,x16,lsl#32
|
|
ldp x14,x16,[x1,#32]
|
|
add x17,x17,x19,lsl#32
|
|
add x20,x20,x21,lsl#32
|
|
ldp x19,x21,[x1,#48]
|
|
add x1,x1,#64
|
|
#ifdef __ARMEB__
|
|
rev x5,x5
|
|
rev x7,x7
|
|
rev x9,x9
|
|
rev x11,x11
|
|
rev x13,x13
|
|
rev x15,x15
|
|
rev x17,x17
|
|
rev x20,x20
|
|
#endif
|
|
eor x5,x5,x6
|
|
eor x7,x7,x8
|
|
eor x9,x9,x10
|
|
eor x11,x11,x12
|
|
eor x13,x13,x14
|
|
eor x15,x15,x16
|
|
eor x17,x17,x19
|
|
eor x20,x20,x21
|
|
|
|
stp x5,x7,[x0,#0] // store output
|
|
add x28,x28,#1 // increment counter
|
|
mov w5,w22 // unpack key block
|
|
lsr x6,x22,#32
|
|
stp x9,x11,[x0,#16]
|
|
mov w7,w23
|
|
lsr x8,x23,#32
|
|
stp x13,x15,[x0,#32]
|
|
mov w9,w24
|
|
lsr x10,x24,#32
|
|
stp x17,x20,[x0,#48]
|
|
add x0,x0,#64
|
|
mov w11,w25
|
|
lsr x12,x25,#32
|
|
mov w13,w26
|
|
lsr x14,x26,#32
|
|
mov w15,w27
|
|
lsr x16,x27,#32
|
|
mov w17,w28
|
|
lsr x19,x28,#32
|
|
mov w20,w30
|
|
lsr x21,x30,#32
|
|
|
|
mov x4,#5
|
|
.Loop_lower_neon:
|
|
sub x4,x4,#1
|
|
add v0.4s,v0.4s,v1.4s
|
|
add w5,w5,w9
|
|
add v4.4s,v4.4s,v5.4s
|
|
add w6,w6,w10
|
|
add v8.4s,v8.4s,v9.4s
|
|
add w7,w7,w11
|
|
add v12.4s,v12.4s,v13.4s
|
|
add w8,w8,w12
|
|
add v16.4s,v16.4s,v17.4s
|
|
eor w17,w17,w5
|
|
add v20.4s,v20.4s,v21.4s
|
|
eor w19,w19,w6
|
|
eor v3.16b,v3.16b,v0.16b
|
|
eor w20,w20,w7
|
|
eor v7.16b,v7.16b,v4.16b
|
|
eor w21,w21,w8
|
|
eor v11.16b,v11.16b,v8.16b
|
|
ror w17,w17,#16
|
|
eor v15.16b,v15.16b,v12.16b
|
|
ror w19,w19,#16
|
|
eor v19.16b,v19.16b,v16.16b
|
|
ror w20,w20,#16
|
|
eor v23.16b,v23.16b,v20.16b
|
|
ror w21,w21,#16
|
|
rev32 v3.8h,v3.8h
|
|
add w13,w13,w17
|
|
rev32 v7.8h,v7.8h
|
|
add w14,w14,w19
|
|
rev32 v11.8h,v11.8h
|
|
add w15,w15,w20
|
|
rev32 v15.8h,v15.8h
|
|
add w16,w16,w21
|
|
rev32 v19.8h,v19.8h
|
|
eor w9,w9,w13
|
|
rev32 v23.8h,v23.8h
|
|
eor w10,w10,w14
|
|
add v2.4s,v2.4s,v3.4s
|
|
eor w11,w11,w15
|
|
add v6.4s,v6.4s,v7.4s
|
|
eor w12,w12,w16
|
|
add v10.4s,v10.4s,v11.4s
|
|
ror w9,w9,#20
|
|
add v14.4s,v14.4s,v15.4s
|
|
ror w10,w10,#20
|
|
add v18.4s,v18.4s,v19.4s
|
|
ror w11,w11,#20
|
|
add v22.4s,v22.4s,v23.4s
|
|
ror w12,w12,#20
|
|
eor v24.16b,v1.16b,v2.16b
|
|
add w5,w5,w9
|
|
eor v25.16b,v5.16b,v6.16b
|
|
add w6,w6,w10
|
|
eor v26.16b,v9.16b,v10.16b
|
|
add w7,w7,w11
|
|
eor v27.16b,v13.16b,v14.16b
|
|
add w8,w8,w12
|
|
eor v28.16b,v17.16b,v18.16b
|
|
eor w17,w17,w5
|
|
eor v29.16b,v21.16b,v22.16b
|
|
eor w19,w19,w6
|
|
ushr v1.4s,v24.4s,#20
|
|
eor w20,w20,w7
|
|
ushr v5.4s,v25.4s,#20
|
|
eor w21,w21,w8
|
|
ushr v9.4s,v26.4s,#20
|
|
ror w17,w17,#24
|
|
ushr v13.4s,v27.4s,#20
|
|
ror w19,w19,#24
|
|
ushr v17.4s,v28.4s,#20
|
|
ror w20,w20,#24
|
|
ushr v21.4s,v29.4s,#20
|
|
ror w21,w21,#24
|
|
sli v1.4s,v24.4s,#12
|
|
add w13,w13,w17
|
|
sli v5.4s,v25.4s,#12
|
|
add w14,w14,w19
|
|
sli v9.4s,v26.4s,#12
|
|
add w15,w15,w20
|
|
sli v13.4s,v27.4s,#12
|
|
add w16,w16,w21
|
|
sli v17.4s,v28.4s,#12
|
|
eor w9,w9,w13
|
|
sli v21.4s,v29.4s,#12
|
|
eor w10,w10,w14
|
|
add v0.4s,v0.4s,v1.4s
|
|
eor w11,w11,w15
|
|
add v4.4s,v4.4s,v5.4s
|
|
eor w12,w12,w16
|
|
add v8.4s,v8.4s,v9.4s
|
|
ror w9,w9,#25
|
|
add v12.4s,v12.4s,v13.4s
|
|
ror w10,w10,#25
|
|
add v16.4s,v16.4s,v17.4s
|
|
ror w11,w11,#25
|
|
add v20.4s,v20.4s,v21.4s
|
|
ror w12,w12,#25
|
|
eor v24.16b,v3.16b,v0.16b
|
|
add w5,w5,w10
|
|
eor v25.16b,v7.16b,v4.16b
|
|
add w6,w6,w11
|
|
eor v26.16b,v11.16b,v8.16b
|
|
add w7,w7,w12
|
|
eor v27.16b,v15.16b,v12.16b
|
|
add w8,w8,w9
|
|
eor v28.16b,v19.16b,v16.16b
|
|
eor w21,w21,w5
|
|
eor v29.16b,v23.16b,v20.16b
|
|
eor w17,w17,w6
|
|
ushr v3.4s,v24.4s,#24
|
|
eor w19,w19,w7
|
|
ushr v7.4s,v25.4s,#24
|
|
eor w20,w20,w8
|
|
ushr v11.4s,v26.4s,#24
|
|
ror w21,w21,#16
|
|
ushr v15.4s,v27.4s,#24
|
|
ror w17,w17,#16
|
|
ushr v19.4s,v28.4s,#24
|
|
ror w19,w19,#16
|
|
ushr v23.4s,v29.4s,#24
|
|
ror w20,w20,#16
|
|
sli v3.4s,v24.4s,#8
|
|
add w15,w15,w21
|
|
sli v7.4s,v25.4s,#8
|
|
add w16,w16,w17
|
|
sli v11.4s,v26.4s,#8
|
|
add w13,w13,w19
|
|
sli v15.4s,v27.4s,#8
|
|
add w14,w14,w20
|
|
sli v19.4s,v28.4s,#8
|
|
eor w10,w10,w15
|
|
sli v23.4s,v29.4s,#8
|
|
eor w11,w11,w16
|
|
add v2.4s,v2.4s,v3.4s
|
|
eor w12,w12,w13
|
|
add v6.4s,v6.4s,v7.4s
|
|
eor w9,w9,w14
|
|
add v10.4s,v10.4s,v11.4s
|
|
ror w10,w10,#20
|
|
add v14.4s,v14.4s,v15.4s
|
|
ror w11,w11,#20
|
|
add v18.4s,v18.4s,v19.4s
|
|
ror w12,w12,#20
|
|
add v22.4s,v22.4s,v23.4s
|
|
ror w9,w9,#20
|
|
eor v24.16b,v1.16b,v2.16b
|
|
add w5,w5,w10
|
|
eor v25.16b,v5.16b,v6.16b
|
|
add w6,w6,w11
|
|
eor v26.16b,v9.16b,v10.16b
|
|
add w7,w7,w12
|
|
eor v27.16b,v13.16b,v14.16b
|
|
add w8,w8,w9
|
|
eor v28.16b,v17.16b,v18.16b
|
|
eor w21,w21,w5
|
|
eor v29.16b,v21.16b,v22.16b
|
|
eor w17,w17,w6
|
|
ushr v1.4s,v24.4s,#25
|
|
eor w19,w19,w7
|
|
ushr v5.4s,v25.4s,#25
|
|
eor w20,w20,w8
|
|
ushr v9.4s,v26.4s,#25
|
|
ror w21,w21,#24
|
|
ushr v13.4s,v27.4s,#25
|
|
ror w17,w17,#24
|
|
ushr v17.4s,v28.4s,#25
|
|
ror w19,w19,#24
|
|
ushr v21.4s,v29.4s,#25
|
|
ror w20,w20,#24
|
|
sli v1.4s,v24.4s,#7
|
|
add w15,w15,w21
|
|
sli v5.4s,v25.4s,#7
|
|
add w16,w16,w17
|
|
sli v9.4s,v26.4s,#7
|
|
add w13,w13,w19
|
|
sli v13.4s,v27.4s,#7
|
|
add w14,w14,w20
|
|
sli v17.4s,v28.4s,#7
|
|
eor w10,w10,w15
|
|
sli v21.4s,v29.4s,#7
|
|
eor w11,w11,w16
|
|
ext v2.16b,v2.16b,v2.16b,#8
|
|
eor w12,w12,w13
|
|
ext v6.16b,v6.16b,v6.16b,#8
|
|
eor w9,w9,w14
|
|
ext v10.16b,v10.16b,v10.16b,#8
|
|
ror w10,w10,#25
|
|
ext v14.16b,v14.16b,v14.16b,#8
|
|
ror w11,w11,#25
|
|
ext v18.16b,v18.16b,v18.16b,#8
|
|
ror w12,w12,#25
|
|
ext v22.16b,v22.16b,v22.16b,#8
|
|
ror w9,w9,#25
|
|
ext v3.16b,v3.16b,v3.16b,#12
|
|
ext v7.16b,v7.16b,v7.16b,#12
|
|
ext v11.16b,v11.16b,v11.16b,#12
|
|
ext v15.16b,v15.16b,v15.16b,#12
|
|
ext v19.16b,v19.16b,v19.16b,#12
|
|
ext v23.16b,v23.16b,v23.16b,#12
|
|
ext v1.16b,v1.16b,v1.16b,#4
|
|
ext v5.16b,v5.16b,v5.16b,#4
|
|
ext v9.16b,v9.16b,v9.16b,#4
|
|
ext v13.16b,v13.16b,v13.16b,#4
|
|
ext v17.16b,v17.16b,v17.16b,#4
|
|
ext v21.16b,v21.16b,v21.16b,#4
|
|
add v0.4s,v0.4s,v1.4s
|
|
add w5,w5,w9
|
|
add v4.4s,v4.4s,v5.4s
|
|
add w6,w6,w10
|
|
add v8.4s,v8.4s,v9.4s
|
|
add w7,w7,w11
|
|
add v12.4s,v12.4s,v13.4s
|
|
add w8,w8,w12
|
|
add v16.4s,v16.4s,v17.4s
|
|
eor w17,w17,w5
|
|
add v20.4s,v20.4s,v21.4s
|
|
eor w19,w19,w6
|
|
eor v3.16b,v3.16b,v0.16b
|
|
eor w20,w20,w7
|
|
eor v7.16b,v7.16b,v4.16b
|
|
eor w21,w21,w8
|
|
eor v11.16b,v11.16b,v8.16b
|
|
ror w17,w17,#16
|
|
eor v15.16b,v15.16b,v12.16b
|
|
ror w19,w19,#16
|
|
eor v19.16b,v19.16b,v16.16b
|
|
ror w20,w20,#16
|
|
eor v23.16b,v23.16b,v20.16b
|
|
ror w21,w21,#16
|
|
rev32 v3.8h,v3.8h
|
|
add w13,w13,w17
|
|
rev32 v7.8h,v7.8h
|
|
add w14,w14,w19
|
|
rev32 v11.8h,v11.8h
|
|
add w15,w15,w20
|
|
rev32 v15.8h,v15.8h
|
|
add w16,w16,w21
|
|
rev32 v19.8h,v19.8h
|
|
eor w9,w9,w13
|
|
rev32 v23.8h,v23.8h
|
|
eor w10,w10,w14
|
|
add v2.4s,v2.4s,v3.4s
|
|
eor w11,w11,w15
|
|
add v6.4s,v6.4s,v7.4s
|
|
eor w12,w12,w16
|
|
add v10.4s,v10.4s,v11.4s
|
|
ror w9,w9,#20
|
|
add v14.4s,v14.4s,v15.4s
|
|
ror w10,w10,#20
|
|
add v18.4s,v18.4s,v19.4s
|
|
ror w11,w11,#20
|
|
add v22.4s,v22.4s,v23.4s
|
|
ror w12,w12,#20
|
|
eor v24.16b,v1.16b,v2.16b
|
|
add w5,w5,w9
|
|
eor v25.16b,v5.16b,v6.16b
|
|
add w6,w6,w10
|
|
eor v26.16b,v9.16b,v10.16b
|
|
add w7,w7,w11
|
|
eor v27.16b,v13.16b,v14.16b
|
|
add w8,w8,w12
|
|
eor v28.16b,v17.16b,v18.16b
|
|
eor w17,w17,w5
|
|
eor v29.16b,v21.16b,v22.16b
|
|
eor w19,w19,w6
|
|
ushr v1.4s,v24.4s,#20
|
|
eor w20,w20,w7
|
|
ushr v5.4s,v25.4s,#20
|
|
eor w21,w21,w8
|
|
ushr v9.4s,v26.4s,#20
|
|
ror w17,w17,#24
|
|
ushr v13.4s,v27.4s,#20
|
|
ror w19,w19,#24
|
|
ushr v17.4s,v28.4s,#20
|
|
ror w20,w20,#24
|
|
ushr v21.4s,v29.4s,#20
|
|
ror w21,w21,#24
|
|
sli v1.4s,v24.4s,#12
|
|
add w13,w13,w17
|
|
sli v5.4s,v25.4s,#12
|
|
add w14,w14,w19
|
|
sli v9.4s,v26.4s,#12
|
|
add w15,w15,w20
|
|
sli v13.4s,v27.4s,#12
|
|
add w16,w16,w21
|
|
sli v17.4s,v28.4s,#12
|
|
eor w9,w9,w13
|
|
sli v21.4s,v29.4s,#12
|
|
eor w10,w10,w14
|
|
add v0.4s,v0.4s,v1.4s
|
|
eor w11,w11,w15
|
|
add v4.4s,v4.4s,v5.4s
|
|
eor w12,w12,w16
|
|
add v8.4s,v8.4s,v9.4s
|
|
ror w9,w9,#25
|
|
add v12.4s,v12.4s,v13.4s
|
|
ror w10,w10,#25
|
|
add v16.4s,v16.4s,v17.4s
|
|
ror w11,w11,#25
|
|
add v20.4s,v20.4s,v21.4s
|
|
ror w12,w12,#25
|
|
eor v24.16b,v3.16b,v0.16b
|
|
add w5,w5,w10
|
|
eor v25.16b,v7.16b,v4.16b
|
|
add w6,w6,w11
|
|
eor v26.16b,v11.16b,v8.16b
|
|
add w7,w7,w12
|
|
eor v27.16b,v15.16b,v12.16b
|
|
add w8,w8,w9
|
|
eor v28.16b,v19.16b,v16.16b
|
|
eor w21,w21,w5
|
|
eor v29.16b,v23.16b,v20.16b
|
|
eor w17,w17,w6
|
|
ushr v3.4s,v24.4s,#24
|
|
eor w19,w19,w7
|
|
ushr v7.4s,v25.4s,#24
|
|
eor w20,w20,w8
|
|
ushr v11.4s,v26.4s,#24
|
|
ror w21,w21,#16
|
|
ushr v15.4s,v27.4s,#24
|
|
ror w17,w17,#16
|
|
ushr v19.4s,v28.4s,#24
|
|
ror w19,w19,#16
|
|
ushr v23.4s,v29.4s,#24
|
|
ror w20,w20,#16
|
|
sli v3.4s,v24.4s,#8
|
|
add w15,w15,w21
|
|
sli v7.4s,v25.4s,#8
|
|
add w16,w16,w17
|
|
sli v11.4s,v26.4s,#8
|
|
add w13,w13,w19
|
|
sli v15.4s,v27.4s,#8
|
|
add w14,w14,w20
|
|
sli v19.4s,v28.4s,#8
|
|
eor w10,w10,w15
|
|
sli v23.4s,v29.4s,#8
|
|
eor w11,w11,w16
|
|
add v2.4s,v2.4s,v3.4s
|
|
eor w12,w12,w13
|
|
add v6.4s,v6.4s,v7.4s
|
|
eor w9,w9,w14
|
|
add v10.4s,v10.4s,v11.4s
|
|
ror w10,w10,#20
|
|
add v14.4s,v14.4s,v15.4s
|
|
ror w11,w11,#20
|
|
add v18.4s,v18.4s,v19.4s
|
|
ror w12,w12,#20
|
|
add v22.4s,v22.4s,v23.4s
|
|
ror w9,w9,#20
|
|
eor v24.16b,v1.16b,v2.16b
|
|
add w5,w5,w10
|
|
eor v25.16b,v5.16b,v6.16b
|
|
add w6,w6,w11
|
|
eor v26.16b,v9.16b,v10.16b
|
|
add w7,w7,w12
|
|
eor v27.16b,v13.16b,v14.16b
|
|
add w8,w8,w9
|
|
eor v28.16b,v17.16b,v18.16b
|
|
eor w21,w21,w5
|
|
eor v29.16b,v21.16b,v22.16b
|
|
eor w17,w17,w6
|
|
ushr v1.4s,v24.4s,#25
|
|
eor w19,w19,w7
|
|
ushr v5.4s,v25.4s,#25
|
|
eor w20,w20,w8
|
|
ushr v9.4s,v26.4s,#25
|
|
ror w21,w21,#24
|
|
ushr v13.4s,v27.4s,#25
|
|
ror w17,w17,#24
|
|
ushr v17.4s,v28.4s,#25
|
|
ror w19,w19,#24
|
|
ushr v21.4s,v29.4s,#25
|
|
ror w20,w20,#24
|
|
sli v1.4s,v24.4s,#7
|
|
add w15,w15,w21
|
|
sli v5.4s,v25.4s,#7
|
|
add w16,w16,w17
|
|
sli v9.4s,v26.4s,#7
|
|
add w13,w13,w19
|
|
sli v13.4s,v27.4s,#7
|
|
add w14,w14,w20
|
|
sli v17.4s,v28.4s,#7
|
|
eor w10,w10,w15
|
|
sli v21.4s,v29.4s,#7
|
|
eor w11,w11,w16
|
|
ext v2.16b,v2.16b,v2.16b,#8
|
|
eor w12,w12,w13
|
|
ext v6.16b,v6.16b,v6.16b,#8
|
|
eor w9,w9,w14
|
|
ext v10.16b,v10.16b,v10.16b,#8
|
|
ror w10,w10,#25
|
|
ext v14.16b,v14.16b,v14.16b,#8
|
|
ror w11,w11,#25
|
|
ext v18.16b,v18.16b,v18.16b,#8
|
|
ror w12,w12,#25
|
|
ext v22.16b,v22.16b,v22.16b,#8
|
|
ror w9,w9,#25
|
|
ext v3.16b,v3.16b,v3.16b,#4
|
|
ext v7.16b,v7.16b,v7.16b,#4
|
|
ext v11.16b,v11.16b,v11.16b,#4
|
|
ext v15.16b,v15.16b,v15.16b,#4
|
|
ext v19.16b,v19.16b,v19.16b,#4
|
|
ext v23.16b,v23.16b,v23.16b,#4
|
|
ext v1.16b,v1.16b,v1.16b,#12
|
|
ext v5.16b,v5.16b,v5.16b,#12
|
|
ext v9.16b,v9.16b,v9.16b,#12
|
|
ext v13.16b,v13.16b,v13.16b,#12
|
|
ext v17.16b,v17.16b,v17.16b,#12
|
|
ext v21.16b,v21.16b,v21.16b,#12
|
|
cbnz x4,.Loop_lower_neon
|
|
|
|
add w5,w5,w22 // accumulate key block
|
|
ldp q24,q25,[sp,#0]
|
|
add x6,x6,x22,lsr#32
|
|
ldp q26,q27,[sp,#32]
|
|
add w7,w7,w23
|
|
ldp q28,q29,[sp,#64]
|
|
add x8,x8,x23,lsr#32
|
|
add v0.4s,v0.4s,v24.4s
|
|
add w9,w9,w24
|
|
add v4.4s,v4.4s,v24.4s
|
|
add x10,x10,x24,lsr#32
|
|
add v8.4s,v8.4s,v24.4s
|
|
add w11,w11,w25
|
|
add v12.4s,v12.4s,v24.4s
|
|
add x12,x12,x25,lsr#32
|
|
add v16.4s,v16.4s,v24.4s
|
|
add w13,w13,w26
|
|
add v20.4s,v20.4s,v24.4s
|
|
add x14,x14,x26,lsr#32
|
|
add v2.4s,v2.4s,v26.4s
|
|
add w15,w15,w27
|
|
add v6.4s,v6.4s,v26.4s
|
|
add x16,x16,x27,lsr#32
|
|
add v10.4s,v10.4s,v26.4s
|
|
add w17,w17,w28
|
|
add v14.4s,v14.4s,v26.4s
|
|
add x19,x19,x28,lsr#32
|
|
add v18.4s,v18.4s,v26.4s
|
|
add w20,w20,w30
|
|
add v22.4s,v22.4s,v26.4s
|
|
add x21,x21,x30,lsr#32
|
|
add v19.4s,v19.4s,v31.4s // +4
|
|
add x5,x5,x6,lsl#32 // pack
|
|
add v23.4s,v23.4s,v31.4s // +4
|
|
add x7,x7,x8,lsl#32
|
|
add v3.4s,v3.4s,v27.4s
|
|
ldp x6,x8,[x1,#0] // load input
|
|
add v7.4s,v7.4s,v28.4s
|
|
add x9,x9,x10,lsl#32
|
|
add v11.4s,v11.4s,v29.4s
|
|
add x11,x11,x12,lsl#32
|
|
add v15.4s,v15.4s,v30.4s
|
|
ldp x10,x12,[x1,#16]
|
|
add v19.4s,v19.4s,v27.4s
|
|
add x13,x13,x14,lsl#32
|
|
add v23.4s,v23.4s,v28.4s
|
|
add x15,x15,x16,lsl#32
|
|
add v1.4s,v1.4s,v25.4s
|
|
ldp x14,x16,[x1,#32]
|
|
add v5.4s,v5.4s,v25.4s
|
|
add x17,x17,x19,lsl#32
|
|
add v9.4s,v9.4s,v25.4s
|
|
add x20,x20,x21,lsl#32
|
|
add v13.4s,v13.4s,v25.4s
|
|
ldp x19,x21,[x1,#48]
|
|
add v17.4s,v17.4s,v25.4s
|
|
add x1,x1,#64
|
|
add v21.4s,v21.4s,v25.4s
|
|
|
|
#ifdef __ARMEB__
|
|
rev x5,x5
|
|
rev x7,x7
|
|
rev x9,x9
|
|
rev x11,x11
|
|
rev x13,x13
|
|
rev x15,x15
|
|
rev x17,x17
|
|
rev x20,x20
|
|
#endif
|
|
ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
|
|
eor x5,x5,x6
|
|
eor x7,x7,x8
|
|
eor x9,x9,x10
|
|
eor x11,x11,x12
|
|
eor x13,x13,x14
|
|
eor v0.16b,v0.16b,v24.16b
|
|
eor x15,x15,x16
|
|
eor v1.16b,v1.16b,v25.16b
|
|
eor x17,x17,x19
|
|
eor v2.16b,v2.16b,v26.16b
|
|
eor x20,x20,x21
|
|
eor v3.16b,v3.16b,v27.16b
|
|
ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
|
|
|
|
stp x5,x7,[x0,#0] // store output
|
|
add x28,x28,#7 // increment counter
|
|
stp x9,x11,[x0,#16]
|
|
stp x13,x15,[x0,#32]
|
|
stp x17,x20,[x0,#48]
|
|
add x0,x0,#64
|
|
st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
|
|
|
|
ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
|
|
eor v4.16b,v4.16b,v24.16b
|
|
eor v5.16b,v5.16b,v25.16b
|
|
eor v6.16b,v6.16b,v26.16b
|
|
eor v7.16b,v7.16b,v27.16b
|
|
st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
|
|
|
|
ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
|
|
eor v8.16b,v8.16b,v0.16b
|
|
ldp q24,q25,[sp,#0]
|
|
eor v9.16b,v9.16b,v1.16b
|
|
ldp q26,q27,[sp,#32]
|
|
eor v10.16b,v10.16b,v2.16b
|
|
eor v11.16b,v11.16b,v3.16b
|
|
st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64
|
|
|
|
ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64
|
|
eor v12.16b,v12.16b,v4.16b
|
|
eor v13.16b,v13.16b,v5.16b
|
|
eor v14.16b,v14.16b,v6.16b
|
|
eor v15.16b,v15.16b,v7.16b
|
|
st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64
|
|
|
|
ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64
|
|
eor v16.16b,v16.16b,v8.16b
|
|
eor v17.16b,v17.16b,v9.16b
|
|
eor v18.16b,v18.16b,v10.16b
|
|
eor v19.16b,v19.16b,v11.16b
|
|
st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
|
|
|
|
shl v0.4s,v31.4s,#1 // 4 -> 8
|
|
eor v20.16b,v20.16b,v12.16b
|
|
eor v21.16b,v21.16b,v13.16b
|
|
eor v22.16b,v22.16b,v14.16b
|
|
eor v23.16b,v23.16b,v15.16b
|
|
st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
|
|
|
|
add v27.4s,v27.4s,v0.4s // += 8
|
|
add v28.4s,v28.4s,v0.4s
|
|
add v29.4s,v29.4s,v0.4s
|
|
add v30.4s,v30.4s,v0.4s
|
|
|
|
b.hs .Loop_outer_512_neon
|
|
|
|
adds x2,x2,#512
|
|
ushr v0.4s,v31.4s,#2 // 4 -> 1
|
|
|
|
ldp d8,d9,[sp,#128+0] // meet ABI requirements
|
|
ldp d10,d11,[sp,#128+16]
|
|
ldp d12,d13,[sp,#128+32]
|
|
ldp d14,d15,[sp,#128+48]
|
|
|
|
stp q24,q31,[sp,#0] // wipe off-load area
|
|
stp q24,q31,[sp,#32]
|
|
stp q24,q31,[sp,#64]
|
|
|
|
b.eq .Ldone_512_neon
|
|
|
|
cmp x2,#192
|
|
sub v27.4s,v27.4s,v0.4s // -= 1
|
|
sub v28.4s,v28.4s,v0.4s
|
|
sub v29.4s,v29.4s,v0.4s
|
|
add sp,sp,#128
|
|
b.hs .Loop_outer_neon
|
|
|
|
eor v25.16b,v25.16b,v25.16b
|
|
eor v26.16b,v26.16b,v26.16b
|
|
eor v27.16b,v27.16b,v27.16b
|
|
eor v28.16b,v28.16b,v28.16b
|
|
eor v29.16b,v29.16b,v29.16b
|
|
eor v30.16b,v30.16b,v30.16b
|
|
b .Loop_outer
|
|
|
|
.Ldone_512_neon:
|
|
ldp x19,x20,[x29,#16]
|
|
add sp,sp,#128+64
|
|
ldp x21,x22,[x29,#32]
|
|
ldp x23,x24,[x29,#48]
|
|
ldp x25,x26,[x29,#64]
|
|
ldp x27,x28,[x29,#80]
|
|
ldp x29,x30,[sp],#96
|
|
.Labort_neon:
|
|
ret
|
|
ENDPROC(chacha20_neon)
|