; NASM source listing (3012 lines, 65 KiB).
; Assembler setup.  Static data is addressed RIP-relative by default, and
; the operand-size keywords used in memory operands throughout this file
; are defined away to nothing so NASM accepts them.
default	rel
%define XMMWORD
%define YMMWORD
%define ZMMWORD
section	.text code align=64


; Constant tables.  They live in the code section next to the routines
; that read them and are only ever loaded, never stored to.
ALIGN	64
$L$zero:
	DD	0,0,0,0
; Added to the saved fourth state row once per 64-byte block by the
; one-block SSSE3 loop (bumps lane 0 by one).
$L$one:
	DD	1,0,0,0
; Per-lane offsets 0..3 added to the broadcast counter word when the
; 4-way path sets up four consecutive blocks.
$L$inc:
	DD	0,1,2,3
; Counter step applied on every outer iteration of the 4-way path.
$L$four:
	DD	4,4,4,4
; Per-lane counter offsets used by the 8-way (ymm) setup.
$L$incy:
	DD	0,2,4,6,1,3,5,7
; Counter step applied on every outer iteration of the 8-way path.
$L$eight:
	DD	8,8,8,8,8,8,8,8
; pshufb mask: rotates every 32-bit lane by 16 bits.
$L$rot16:
	DB	0x2,0x3,0x0,0x1,0x6,0x7,0x4,0x5,0xa,0xb,0x8,0x9,0xe,0xf,0xc,0xd
; pshufb mask: rotates every 32-bit lane left by 8 bits (= right by 24).
$L$rot24:
	DB	0x3,0x0,0x1,0x2,0x7,0x4,0x5,0x6,0xb,0x8,0x9,0xa,0xf,0xc,0xd,0xe
; ASCII "expand 32-byte k" followed by a NUL; loaded as the first state row.
$L$sigma:
	DB	101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107
	DB	0
; The tables below are not referenced by any routine visible in this part
; of the file.
; NOTE(review): presumably used by wider-vector code paths further down - confirm.
ALIGN	64
$L$zeroz:
	DD	0,0,0,0,1,0,0,0,2,0,0,0,3,0,0,0
$L$fourz:
	DD	4,0,0,0,4,0,0,0,4,0,0,0,4,0,0,0
$L$incz:
	DD	0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
$L$sixteen:
	DD	16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
ALIGN	64
$L$twoy:
	DD	2,0,0,0,2,0,0,0

;-----------------------------------------------------------------------
; hchacha20_ssse3
;
; ABI:   Microsoft x64 on entry.  The prologue parks rdi/rsi in the
;        caller-provided home space and remaps the arguments, so the
;        body below sees them in rdi, rsi, rdx.
; In:    rcx (-> rdi) = output pointer, 32 bytes are written
;        rdx (-> rsi) = pointer to 16 input bytes, loaded as state row 3
;        r8  (-> rdx) = pointer to 32 key bytes, loaded as state rows 1-2
; Out:   state rows 0 and 3 after 10 double rounds are stored to the
;        output; the initial state is NOT added back in.
; Clobb: rax, rcx, rdx, r8, xmm0-xmm5, flags.
;
; Only volatile vector registers (xmm0-xmm5) are used: xmm6-xmm15 are
; callee-saved under this ABI and the routine has no frame to save them
; in.  The rot16 mask therefore lives in xmm5 and the rot24 mask is
; applied straight from memory ($L$rot24 is 16-byte aligned, as the
; legacy-SSE pshufb memory form requires).
;-----------------------------------------------------------------------
global	hchacha20_ssse3

ALIGN	32
hchacha20_ssse3:
	mov	QWORD[8+rsp],rdi	;WIN64 prologue
	mov	QWORD[16+rsp],rsi
	mov	rax,rsp
$L$SEH_begin_hchacha20_ssse3:
	mov	rdi,rcx
	mov	rsi,rdx
	mov	rdx,r8
	mov	rcx,r9
	mov	r8,QWORD[40+rsp]



$L$hchacha20_ssse3:
	; Build the 4x4 state, one row per register.
	movdqa	xmm0,XMMWORD[$L$sigma]
	movdqu	xmm1,XMMWORD[rdx]
	movdqu	xmm2,XMMWORD[16+rdx]
	movdqu	xmm3,XMMWORD[rsi]
	movdqa	xmm5,XMMWORD[$L$rot16]
	mov	r8,10			; 10 double rounds
ALIGN	32
$L$oop_hssse3:
	; Column round: four quarter-rounds in parallel, one per lane.
	paddd	xmm0,xmm1
	pxor	xmm3,xmm0
	pshufb	xmm3,xmm5		; rotate by 16
	paddd	xmm2,xmm3
	pxor	xmm1,xmm2
	movdqa	xmm4,xmm1
	psrld	xmm1,20
	pslld	xmm4,12
	por	xmm1,xmm4		; rotate left by 12
	paddd	xmm0,xmm1
	pxor	xmm3,xmm0
	pshufb	xmm3,XMMWORD[$L$rot24]	; rotate left by 8
	paddd	xmm2,xmm3
	pxor	xmm1,xmm2
	movdqa	xmm4,xmm1
	psrld	xmm1,25
	pslld	xmm4,7
	por	xmm1,xmm4		; rotate left by 7
	; Rotate rows 1-3 so the diagonals line up as columns.
	pshufd	xmm2,xmm2,78
	pshufd	xmm1,xmm1,57
	pshufd	xmm3,xmm3,147
	nop
	; Diagonal round: same quarter-round on the realigned rows.
	paddd	xmm0,xmm1
	pxor	xmm3,xmm0
	pshufb	xmm3,xmm5		; rotate by 16
	paddd	xmm2,xmm3
	pxor	xmm1,xmm2
	movdqa	xmm4,xmm1
	psrld	xmm1,20
	pslld	xmm4,12
	por	xmm1,xmm4		; rotate left by 12
	paddd	xmm0,xmm1
	pxor	xmm3,xmm0
	pshufb	xmm3,XMMWORD[$L$rot24]	; rotate left by 8
	paddd	xmm2,xmm3
	pxor	xmm1,xmm2
	movdqa	xmm4,xmm1
	psrld	xmm1,25
	pslld	xmm4,7
	por	xmm1,xmm4		; rotate left by 7
	; Undo the row rotation.
	pshufd	xmm2,xmm2,78
	pshufd	xmm1,xmm1,147
	pshufd	xmm3,xmm3,57
	dec	r8
	jnz	NEAR $L$oop_hssse3
	; Output is rows 0 and 3 only.
	movdqu	XMMWORD[rdi],xmm0
	movdqu	XMMWORD[16+rdi],xmm3
	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
	mov	rsi,QWORD[16+rsp]
	DB	0F3h,0C3h		;repret

$L$SEH_end_hchacha20_ssse3:
;-----------------------------------------------------------------------
; chacha20_ssse3
;
; ABI:   Microsoft x64 on entry.  The prologue parks rdi/rsi in the
;        caller-provided home space and remaps the five arguments, so
;        the body sees them in rdi, rsi, rdx, rcx, r8.
; In:    rcx      (-> rdi) = output pointer
;        rdx      (-> rsi) = input pointer
;        r8       (-> rdx) = length in bytes
;        r9       (-> rcx) = pointer to 32 key bytes (state rows 1-2)
;        [rsp+40] (-> r8)  = pointer to 16 bytes loaded as state row 3
; Out:   input XOR keystream written to the output; lane 0 of row 3 is
;        incremented by one for each 64-byte block.
; Clobb: rax, rcx, rdx, r8, r9, xmm0-xmm5, flags (xmm6/xmm7 are saved
;        and restored).
;
; Lengths above 128 bytes are handed to $L$chacha20_4x.
; NOTE: length 0 is not handled here - the byte-wise tail loop decrements
; before testing, so the caller must pass a non-zero length.
;-----------------------------------------------------------------------
global	chacha20_ssse3

ALIGN	32
chacha20_ssse3:
	mov	QWORD[8+rsp],rdi	;WIN64 prologue
	mov	QWORD[16+rsp],rsi
	mov	rax,rsp
$L$SEH_begin_chacha20_ssse3:
	mov	rdi,rcx
	mov	rsi,rdx
	mov	rdx,r8
	mov	rcx,r9
	mov	r8,QWORD[40+rsp]



$L$chacha20_ssse3:
	mov	r9,rsp			; r9 = frame pointer (entry rsp)

	cmp	rdx,128
	ja	NEAR $L$chacha20_4x	; more than two blocks: 4-way path

$L$do_sse3_after_all:
	; 64 bytes of state scratch at [rsp], xmm6/xmm7 saved just below r9.
	sub	rsp,64+40
	movaps	XMMWORD[(-40)+r9],xmm6
	movaps	XMMWORD[(-24)+r9],xmm7
$L$ssse3_body:
	; Build the 4x4 state, one row per register; xmm6/xmm7 hold the
	; rotate-by-16 and rotate-by-24 pshufb masks.
	movdqa	xmm0,XMMWORD[$L$sigma]
	movdqu	xmm1,XMMWORD[rcx]
	movdqu	xmm2,XMMWORD[16+rcx]
	movdqu	xmm3,XMMWORD[r8]
	movdqa	xmm6,XMMWORD[$L$rot16]
	movdqa	xmm7,XMMWORD[$L$rot24]

	; Keep a copy of the initial state for the feed-forward addition.
	movdqa	XMMWORD[rsp],xmm0
	movdqa	XMMWORD[16+rsp],xmm1
	movdqa	XMMWORD[32+rsp],xmm2
	movdqa	XMMWORD[48+rsp],xmm3
	mov	r8,10			; 10 double rounds (r8 reused as counter)
	jmp	NEAR $L$oop_ssse3

ALIGN	32
$L$oop_outer_ssse3:
	; Next block: reload the saved state and add 1 to lane 0 of row 3.
	movdqa	xmm3,XMMWORD[$L$one]
	movdqa	xmm0,XMMWORD[rsp]
	movdqa	xmm1,XMMWORD[16+rsp]
	movdqa	xmm2,XMMWORD[32+rsp]
	paddd	xmm3,XMMWORD[48+rsp]
	mov	r8,10
	movdqa	XMMWORD[48+rsp],xmm3
	jmp	NEAR $L$oop_ssse3

ALIGN	32
$L$oop_ssse3:
	; Column round.
	paddd	xmm0,xmm1
	pxor	xmm3,xmm0
	pshufb	xmm3,xmm6
	paddd	xmm2,xmm3
	pxor	xmm1,xmm2
	movdqa	xmm4,xmm1
	psrld	xmm1,20
	pslld	xmm4,12
	por	xmm1,xmm4
	paddd	xmm0,xmm1
	pxor	xmm3,xmm0
	pshufb	xmm3,xmm7
	paddd	xmm2,xmm3
	pxor	xmm1,xmm2
	movdqa	xmm4,xmm1
	psrld	xmm1,25
	pslld	xmm4,7
	por	xmm1,xmm4
	; Rotate rows 1-3 so the diagonals line up as columns.
	pshufd	xmm2,xmm2,78
	pshufd	xmm1,xmm1,57
	pshufd	xmm3,xmm3,147
	nop
	; Diagonal round.
	paddd	xmm0,xmm1
	pxor	xmm3,xmm0
	pshufb	xmm3,xmm6
	paddd	xmm2,xmm3
	pxor	xmm1,xmm2
	movdqa	xmm4,xmm1
	psrld	xmm1,20
	pslld	xmm4,12
	por	xmm1,xmm4
	paddd	xmm0,xmm1
	pxor	xmm3,xmm0
	pshufb	xmm3,xmm7
	paddd	xmm2,xmm3
	pxor	xmm1,xmm2
	movdqa	xmm4,xmm1
	psrld	xmm1,25
	pslld	xmm4,7
	por	xmm1,xmm4
	; Undo the row rotation.
	pshufd	xmm2,xmm2,78
	pshufd	xmm1,xmm1,147
	pshufd	xmm3,xmm3,57
	dec	r8
	jnz	NEAR $L$oop_ssse3
	; Feed-forward: add the initial state to get the keystream block.
	paddd	xmm0,XMMWORD[rsp]
	paddd	xmm1,XMMWORD[16+rsp]
	paddd	xmm2,XMMWORD[32+rsp]
	paddd	xmm3,XMMWORD[48+rsp]

	cmp	rdx,64
	jb	NEAR $L$tail_ssse3	; fewer than 64 bytes left

	; Full block: XOR 64 input bytes with the keystream.
	movdqu	xmm4,XMMWORD[rsi]
	movdqu	xmm5,XMMWORD[16+rsi]
	pxor	xmm0,xmm4
	movdqu	xmm4,XMMWORD[32+rsi]
	pxor	xmm1,xmm5
	movdqu	xmm5,XMMWORD[48+rsi]
	lea	rsi,[64+rsi]
	pxor	xmm2,xmm4
	pxor	xmm3,xmm5

	movdqu	XMMWORD[rdi],xmm0
	movdqu	XMMWORD[16+rdi],xmm1
	movdqu	XMMWORD[32+rdi],xmm2
	movdqu	XMMWORD[48+rdi],xmm3
	lea	rdi,[64+rdi]

	sub	rdx,64
	jnz	NEAR $L$oop_outer_ssse3

	jmp	NEAR $L$done_ssse3

ALIGN	16
$L$tail_ssse3:
	; Partial block: spill the keystream (overwriting the saved state,
	; which is no longer needed) and XOR byte by byte.
	movdqa	XMMWORD[rsp],xmm0
	movdqa	XMMWORD[16+rsp],xmm1
	movdqa	XMMWORD[32+rsp],xmm2
	movdqa	XMMWORD[48+rsp],xmm3
	xor	r8,r8

$L$oop_tail_ssse3:
	movzx	eax,BYTE[r8*1+rsi]
	movzx	ecx,BYTE[r8*1+rsp]
	lea	r8,[1+r8]
	xor	eax,ecx
	mov	BYTE[((-1))+r8*1+rdi],al
	dec	rdx
	jnz	NEAR $L$oop_tail_ssse3

$L$done_ssse3:
	movaps	xmm6,XMMWORD[((-40))+r9]
	movaps	xmm7,XMMWORD[((-24))+r9]
	lea	rsp,[r9]

$L$ssse3_epilogue:
	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
	mov	rsi,QWORD[16+rsp]
	DB	0F3h,0C3h		;repret

$L$SEH_end_chacha20_ssse3:
;-----------------------------------------------------------------------
; chacha20_4x
;
; ABI:   Microsoft x64 on entry; same five arguments and the same
;        remapping prologue as chacha20_ssse3.  chacha20_ssse3 also
;        jumps to $L$chacha20_4x with the arguments already remapped.
; In:    rdi = output, rsi = input, rdx = length in bytes,
;        rcx = pointer to 32 key bytes, r8 = pointer to 16 bytes that
;        form state row 3 (all after remapping).
; Out:   input XOR keystream written to the output, 256 bytes (four
;        blocks) per outer iteration plus a tail of 1..255 bytes.
; Clobb: rax, rcx, rdx, r8, r9, r10, r11, xmm0-xmm5, flags
;        (xmm6-xmm15 are saved and restored).
;
; Layout: every xmm register holds ONE state word for FOUR blocks.
;   xmm8-xmm11  = words 0-3   (constants)
;   xmm12-xmm15 = words 4-7   (key, first half)
;   xmm4-xmm7   = words 8-11  (key, second half); inside the round loop
;                 only two of them are in registers at a time, the other
;                 two are parked at [rsp]/[16+rsp] or [32+rsp]/[48+rsp],
;                 and xmm6/xmm7 double as temporaries / shuffle masks
;   xmm0-xmm3   = words 12-15 (word 12 gets the per-block offset)
; Stack: [rsp+0..63] spill area, [rsp+64..319] broadcast copy of the
;        initial state (addressed via rsp and via rcx = rsp+256),
;        xmm6-xmm15 saved at [r9-168 .. r9-8].
; NOTE: length 0 is not handled - the tail loop decrements before testing.
;-----------------------------------------------------------------------
global	chacha20_4x

ALIGN	32
chacha20_4x:
	mov	QWORD[8+rsp],rdi	;WIN64 prologue
	mov	QWORD[16+rsp],rsi
	mov	rax,rsp
$L$SEH_begin_chacha20_4x:
	mov	rdi,rcx
	mov	rsi,rdx
	mov	rdx,r8
	mov	rcx,r9
	mov	r8,QWORD[40+rsp]



$L$chacha20_4x:
	mov	r9,rsp			; r9 = frame pointer (entry rsp)
$L$proceed4x:
	sub	rsp,0x140+168
	movaps	XMMWORD[(-168)+r9],xmm6
	movaps	XMMWORD[(-152)+r9],xmm7
	movaps	XMMWORD[(-136)+r9],xmm8
	movaps	XMMWORD[(-120)+r9],xmm9
	movaps	XMMWORD[(-104)+r9],xmm10
	movaps	XMMWORD[(-88)+r9],xmm11
	movaps	XMMWORD[(-72)+r9],xmm12
	movaps	XMMWORD[(-56)+r9],xmm13
	movaps	XMMWORD[(-40)+r9],xmm14
	movaps	XMMWORD[(-24)+r9],xmm15
$L$4x_body:
	movdqa	xmm11,XMMWORD[$L$sigma]
	movdqu	xmm15,XMMWORD[rcx]
	movdqu	xmm7,XMMWORD[16+rcx]
	movdqu	xmm3,XMMWORD[r8]
	lea	rcx,[256+rsp]		; rcx now bases the saved-state area
	lea	r10,[$L$rot16]
	lea	r11,[$L$rot24]

	; Broadcast each state word across a register and save the result.
	pshufd	xmm8,xmm11,0x00
	pshufd	xmm9,xmm11,0x55
	movdqa	XMMWORD[64+rsp],xmm8
	pshufd	xmm10,xmm11,0xaa
	movdqa	XMMWORD[80+rsp],xmm9
	pshufd	xmm11,xmm11,0xff
	movdqa	XMMWORD[96+rsp],xmm10
	movdqa	XMMWORD[112+rsp],xmm11

	pshufd	xmm12,xmm15,0x00
	pshufd	xmm13,xmm15,0x55
	movdqa	XMMWORD[(128-256)+rcx],xmm12
	pshufd	xmm14,xmm15,0xaa
	movdqa	XMMWORD[(144-256)+rcx],xmm13
	pshufd	xmm15,xmm15,0xff
	movdqa	XMMWORD[(160-256)+rcx],xmm14
	movdqa	XMMWORD[(176-256)+rcx],xmm15

	pshufd	xmm4,xmm7,0x00
	pshufd	xmm5,xmm7,0x55
	movdqa	XMMWORD[(192-256)+rcx],xmm4
	pshufd	xmm6,xmm7,0xaa
	movdqa	XMMWORD[(208-256)+rcx],xmm5
	pshufd	xmm7,xmm7,0xff
	movdqa	XMMWORD[(224-256)+rcx],xmm6
	movdqa	XMMWORD[(240-256)+rcx],xmm7

	; Word 12 gets lane offsets 0..3; it is stored at $L$oop_enter4x.
	pshufd	xmm0,xmm3,0x00
	pshufd	xmm1,xmm3,0x55
	paddd	xmm0,XMMWORD[$L$inc]
	pshufd	xmm2,xmm3,0xaa
	movdqa	XMMWORD[(272-256)+rcx],xmm1
	pshufd	xmm3,xmm3,0xff
	movdqa	XMMWORD[(288-256)+rcx],xmm2
	movdqa	XMMWORD[(304-256)+rcx],xmm3

	jmp	NEAR $L$oop_enter4x

ALIGN	32
$L$oop_outer4x:
	; Next four blocks: reload the saved state, advance word 12 by 4.
	movdqa	xmm8,XMMWORD[64+rsp]
	movdqa	xmm9,XMMWORD[80+rsp]
	movdqa	xmm10,XMMWORD[96+rsp]
	movdqa	xmm11,XMMWORD[112+rsp]
	movdqa	xmm12,XMMWORD[((128-256))+rcx]
	movdqa	xmm13,XMMWORD[((144-256))+rcx]
	movdqa	xmm14,XMMWORD[((160-256))+rcx]
	movdqa	xmm15,XMMWORD[((176-256))+rcx]
	movdqa	xmm4,XMMWORD[((192-256))+rcx]
	movdqa	xmm5,XMMWORD[((208-256))+rcx]
	movdqa	xmm6,XMMWORD[((224-256))+rcx]
	movdqa	xmm7,XMMWORD[((240-256))+rcx]
	movdqa	xmm0,XMMWORD[((256-256))+rcx]
	movdqa	xmm1,XMMWORD[((272-256))+rcx]
	movdqa	xmm2,XMMWORD[((288-256))+rcx]
	movdqa	xmm3,XMMWORD[((304-256))+rcx]
	paddd	xmm0,XMMWORD[$L$four]

$L$oop_enter4x:
	; Park words 10/11 so xmm6/xmm7 can serve as temporaries.
	movdqa	XMMWORD[32+rsp],xmm6
	movdqa	XMMWORD[48+rsp],xmm7
	movdqa	xmm7,XMMWORD[r10]	; rot16 mask
	mov	eax,10			; 10 double rounds
	movdqa	XMMWORD[(256-256)+rcx],xmm0	; save current word 12
	jmp	NEAR $L$oop4x

ALIGN	32
$L$oop4x:
	; Column quarter-rounds on words (0,4,8,12) and (1,5,9,13).
	paddd	xmm8,xmm12
	paddd	xmm9,xmm13
	pxor	xmm0,xmm8
	pxor	xmm1,xmm9
	pshufb	xmm0,xmm7
	pshufb	xmm1,xmm7
	paddd	xmm4,xmm0
	paddd	xmm5,xmm1
	pxor	xmm12,xmm4
	pxor	xmm13,xmm5
	movdqa	xmm6,xmm12
	pslld	xmm12,12
	psrld	xmm6,20
	movdqa	xmm7,xmm13
	pslld	xmm13,12
	por	xmm12,xmm6
	psrld	xmm7,20
	movdqa	xmm6,XMMWORD[r11]	; rot24 mask
	por	xmm13,xmm7
	paddd	xmm8,xmm12
	paddd	xmm9,xmm13
	pxor	xmm0,xmm8
	pxor	xmm1,xmm9
	pshufb	xmm0,xmm6
	pshufb	xmm1,xmm6
	paddd	xmm4,xmm0
	paddd	xmm5,xmm1
	pxor	xmm12,xmm4
	pxor	xmm13,xmm5
	movdqa	xmm7,xmm12
	pslld	xmm12,7
	psrld	xmm7,25
	movdqa	xmm6,xmm13
	pslld	xmm13,7
	por	xmm12,xmm7
	psrld	xmm6,25
	movdqa	xmm7,XMMWORD[r10]	; rot16 mask
	por	xmm13,xmm6
	; Swap words 8/9 out to the spill area and words 10/11 in.
	movdqa	XMMWORD[rsp],xmm4
	movdqa	XMMWORD[16+rsp],xmm5
	movdqa	xmm4,XMMWORD[32+rsp]
	movdqa	xmm5,XMMWORD[48+rsp]
	; Column quarter-rounds on words (2,6,10,14) and (3,7,11,15).
	paddd	xmm10,xmm14
	paddd	xmm11,xmm15
	pxor	xmm2,xmm10
	pxor	xmm3,xmm11
	pshufb	xmm2,xmm7
	pshufb	xmm3,xmm7
	paddd	xmm4,xmm2
	paddd	xmm5,xmm3
	pxor	xmm14,xmm4
	pxor	xmm15,xmm5
	movdqa	xmm6,xmm14
	pslld	xmm14,12
	psrld	xmm6,20
	movdqa	xmm7,xmm15
	pslld	xmm15,12
	por	xmm14,xmm6
	psrld	xmm7,20
	movdqa	xmm6,XMMWORD[r11]	; rot24 mask
	por	xmm15,xmm7
	paddd	xmm10,xmm14
	paddd	xmm11,xmm15
	pxor	xmm2,xmm10
	pxor	xmm3,xmm11
	pshufb	xmm2,xmm6
	pshufb	xmm3,xmm6
	paddd	xmm4,xmm2
	paddd	xmm5,xmm3
	pxor	xmm14,xmm4
	pxor	xmm15,xmm5
	movdqa	xmm7,xmm14
	pslld	xmm14,7
	psrld	xmm7,25
	movdqa	xmm6,xmm15
	pslld	xmm15,7
	por	xmm14,xmm7
	psrld	xmm6,25
	movdqa	xmm7,XMMWORD[r10]	; rot16 mask
	por	xmm15,xmm6
	; Diagonal quarter-rounds on words (0,5,10,15) and (1,6,11,12).
	paddd	xmm8,xmm13
	paddd	xmm9,xmm14
	pxor	xmm3,xmm8
	pxor	xmm0,xmm9
	pshufb	xmm3,xmm7
	pshufb	xmm0,xmm7
	paddd	xmm4,xmm3
	paddd	xmm5,xmm0
	pxor	xmm13,xmm4
	pxor	xmm14,xmm5
	movdqa	xmm6,xmm13
	pslld	xmm13,12
	psrld	xmm6,20
	movdqa	xmm7,xmm14
	pslld	xmm14,12
	por	xmm13,xmm6
	psrld	xmm7,20
	movdqa	xmm6,XMMWORD[r11]	; rot24 mask
	por	xmm14,xmm7
	paddd	xmm8,xmm13
	paddd	xmm9,xmm14
	pxor	xmm3,xmm8
	pxor	xmm0,xmm9
	pshufb	xmm3,xmm6
	pshufb	xmm0,xmm6
	paddd	xmm4,xmm3
	paddd	xmm5,xmm0
	pxor	xmm13,xmm4
	pxor	xmm14,xmm5
	movdqa	xmm7,xmm13
	pslld	xmm13,7
	psrld	xmm7,25
	movdqa	xmm6,xmm14
	pslld	xmm14,7
	por	xmm13,xmm7
	psrld	xmm6,25
	movdqa	xmm7,XMMWORD[r10]	; rot16 mask
	por	xmm14,xmm6
	; Swap words 10/11 out to the spill area and words 8/9 back in.
	movdqa	XMMWORD[32+rsp],xmm4
	movdqa	XMMWORD[48+rsp],xmm5
	movdqa	xmm4,XMMWORD[rsp]
	movdqa	xmm5,XMMWORD[16+rsp]
	; Diagonal quarter-rounds on words (2,7,8,13) and (3,4,9,14).
	paddd	xmm10,xmm15
	paddd	xmm11,xmm12
	pxor	xmm1,xmm10
	pxor	xmm2,xmm11
	pshufb	xmm1,xmm7
	pshufb	xmm2,xmm7
	paddd	xmm4,xmm1
	paddd	xmm5,xmm2
	pxor	xmm15,xmm4
	pxor	xmm12,xmm5
	movdqa	xmm6,xmm15
	pslld	xmm15,12
	psrld	xmm6,20
	movdqa	xmm7,xmm12
	pslld	xmm12,12
	por	xmm15,xmm6
	psrld	xmm7,20
	movdqa	xmm6,XMMWORD[r11]	; rot24 mask
	por	xmm12,xmm7
	paddd	xmm10,xmm15
	paddd	xmm11,xmm12
	pxor	xmm1,xmm10
	pxor	xmm2,xmm11
	pshufb	xmm1,xmm6
	pshufb	xmm2,xmm6
	paddd	xmm4,xmm1
	paddd	xmm5,xmm2
	pxor	xmm15,xmm4
	pxor	xmm12,xmm5
	movdqa	xmm7,xmm15
	pslld	xmm15,7
	psrld	xmm7,25
	movdqa	xmm6,xmm12
	pslld	xmm12,7
	por	xmm15,xmm7
	psrld	xmm6,25
	movdqa	xmm7,XMMWORD[r10]	; rot16 mask
	por	xmm12,xmm6
	dec	eax
	jnz	NEAR $L$oop4x

	; Feed-forward for words 0-3, then transpose the 4x4 dword matrix
	; so each register holds 16 consecutive keystream bytes of one block.
	paddd	xmm8,XMMWORD[64+rsp]
	paddd	xmm9,XMMWORD[80+rsp]
	paddd	xmm10,XMMWORD[96+rsp]
	paddd	xmm11,XMMWORD[112+rsp]

	movdqa	xmm6,xmm8
	punpckldq	xmm8,xmm9
	movdqa	xmm7,xmm10
	punpckldq	xmm10,xmm11
	punpckhdq	xmm6,xmm9
	punpckhdq	xmm7,xmm11
	movdqa	xmm9,xmm8
	punpcklqdq	xmm8,xmm10
	movdqa	xmm11,xmm6
	punpcklqdq	xmm6,xmm7
	punpckhqdq	xmm9,xmm10
	punpckhqdq	xmm11,xmm7
	; Words 4-7.
	paddd	xmm12,XMMWORD[((128-256))+rcx]
	paddd	xmm13,XMMWORD[((144-256))+rcx]
	paddd	xmm14,XMMWORD[((160-256))+rcx]
	paddd	xmm15,XMMWORD[((176-256))+rcx]

	; Park the first 16 bytes of blocks 0 and 1 and fetch words 10/11
	; from the spill area.
	movdqa	XMMWORD[rsp],xmm8
	movdqa	XMMWORD[16+rsp],xmm9
	movdqa	xmm8,XMMWORD[32+rsp]
	movdqa	xmm9,XMMWORD[48+rsp]

	movdqa	xmm10,xmm12
	punpckldq	xmm12,xmm13
	movdqa	xmm7,xmm14
	punpckldq	xmm14,xmm15
	punpckhdq	xmm10,xmm13
	punpckhdq	xmm7,xmm15
	movdqa	xmm13,xmm12
	punpcklqdq	xmm12,xmm14
	movdqa	xmm15,xmm10
	punpcklqdq	xmm10,xmm7
	punpckhqdq	xmm13,xmm14
	punpckhqdq	xmm15,xmm7
	; Words 8-11.
	paddd	xmm4,XMMWORD[((192-256))+rcx]
	paddd	xmm5,XMMWORD[((208-256))+rcx]
	paddd	xmm8,XMMWORD[((224-256))+rcx]
	paddd	xmm9,XMMWORD[((240-256))+rcx]

	; Park the first 16 bytes of blocks 2 and 3.
	movdqa	XMMWORD[32+rsp],xmm6
	movdqa	XMMWORD[48+rsp],xmm11

	movdqa	xmm14,xmm4
	punpckldq	xmm4,xmm5
	movdqa	xmm7,xmm8
	punpckldq	xmm8,xmm9
	punpckhdq	xmm14,xmm5
	punpckhdq	xmm7,xmm9
	movdqa	xmm5,xmm4
	punpcklqdq	xmm4,xmm8
	movdqa	xmm9,xmm14
	punpcklqdq	xmm14,xmm7
	punpckhqdq	xmm5,xmm8
	punpckhqdq	xmm9,xmm7
	; Words 12-15.
	paddd	xmm0,XMMWORD[((256-256))+rcx]
	paddd	xmm1,XMMWORD[((272-256))+rcx]
	paddd	xmm2,XMMWORD[((288-256))+rcx]
	paddd	xmm3,XMMWORD[((304-256))+rcx]

	movdqa	xmm8,xmm0
	punpckldq	xmm0,xmm1
	movdqa	xmm7,xmm2
	punpckldq	xmm2,xmm3
	punpckhdq	xmm8,xmm1
	punpckhdq	xmm7,xmm3
	movdqa	xmm1,xmm0
	punpcklqdq	xmm0,xmm2
	movdqa	xmm3,xmm8
	punpcklqdq	xmm8,xmm7
	punpckhqdq	xmm1,xmm2
	punpckhqdq	xmm3,xmm7
	; Keystream now: block 0 = [rsp],xmm12,xmm4,xmm0
	;                block 1 = [16+rsp],xmm13,xmm5,xmm1
	;                block 2 = [32+rsp],xmm10,xmm14,xmm8
	;                block 3 = [48+rsp],xmm15,xmm9,xmm3
	cmp	rdx,64*4
	jb	NEAR $L$tail4x

	; Four full blocks: XOR 256 input bytes with the keystream.
	movdqu	xmm6,XMMWORD[rsi]
	movdqu	xmm11,XMMWORD[16+rsi]
	movdqu	xmm2,XMMWORD[32+rsi]
	movdqu	xmm7,XMMWORD[48+rsi]
	pxor	xmm6,XMMWORD[rsp]
	pxor	xmm11,xmm12
	pxor	xmm2,xmm4
	pxor	xmm7,xmm0

	movdqu	XMMWORD[rdi],xmm6
	movdqu	xmm6,XMMWORD[64+rsi]
	movdqu	XMMWORD[16+rdi],xmm11
	movdqu	xmm11,XMMWORD[80+rsi]
	movdqu	XMMWORD[32+rdi],xmm2
	movdqu	xmm2,XMMWORD[96+rsi]
	movdqu	XMMWORD[48+rdi],xmm7
	movdqu	xmm7,XMMWORD[112+rsi]
	lea	rsi,[128+rsi]
	pxor	xmm6,XMMWORD[16+rsp]
	pxor	xmm11,xmm13
	pxor	xmm2,xmm5
	pxor	xmm7,xmm1

	movdqu	XMMWORD[64+rdi],xmm6
	movdqu	xmm6,XMMWORD[rsi]
	movdqu	XMMWORD[80+rdi],xmm11
	movdqu	xmm11,XMMWORD[16+rsi]
	movdqu	XMMWORD[96+rdi],xmm2
	movdqu	xmm2,XMMWORD[32+rsi]
	movdqu	XMMWORD[112+rdi],xmm7
	lea	rdi,[128+rdi]
	movdqu	xmm7,XMMWORD[48+rsi]
	pxor	xmm6,XMMWORD[32+rsp]
	pxor	xmm11,xmm10
	pxor	xmm2,xmm14
	pxor	xmm7,xmm8

	movdqu	XMMWORD[rdi],xmm6
	movdqu	xmm6,XMMWORD[64+rsi]
	movdqu	XMMWORD[16+rdi],xmm11
	movdqu	xmm11,XMMWORD[80+rsi]
	movdqu	XMMWORD[32+rdi],xmm2
	movdqu	xmm2,XMMWORD[96+rsi]
	movdqu	XMMWORD[48+rdi],xmm7
	movdqu	xmm7,XMMWORD[112+rsi]
	lea	rsi,[128+rsi]
	pxor	xmm6,XMMWORD[48+rsp]
	pxor	xmm11,xmm15
	pxor	xmm2,xmm9
	pxor	xmm7,xmm3
	movdqu	XMMWORD[64+rdi],xmm6
	movdqu	XMMWORD[80+rdi],xmm11
	movdqu	XMMWORD[96+rdi],xmm2
	movdqu	XMMWORD[112+rdi],xmm7
	lea	rdi,[128+rdi]

	sub	rdx,64*4
	jnz	NEAR $L$oop_outer4x

	jmp	NEAR $L$done4x

$L$tail4x:
	; 1..255 bytes left.  Process whole 64-byte blocks with SIMD, then
	; put the next keystream block at [rsp..rsp+63] for the byte loop.
	cmp	rdx,192
	jae	NEAR $L$192_or_more4x
	cmp	rdx,128
	jae	NEAR $L$128_or_more4x
	cmp	rdx,64
	jae	NEAR $L$64_or_more4x


	xor	r10,r10			; r10 = byte index for the tail loop

	; Block 0: its first 16 bytes are already at [rsp].
	movdqa	XMMWORD[16+rsp],xmm12
	movdqa	XMMWORD[32+rsp],xmm4
	movdqa	XMMWORD[48+rsp],xmm0
	jmp	NEAR $L$oop_tail4x

ALIGN	32
$L$64_or_more4x:
	movdqu	xmm6,XMMWORD[rsi]
	movdqu	xmm11,XMMWORD[16+rsi]
	movdqu	xmm2,XMMWORD[32+rsi]
	movdqu	xmm7,XMMWORD[48+rsi]
	pxor	xmm6,XMMWORD[rsp]
	pxor	xmm11,xmm12
	pxor	xmm2,xmm4
	pxor	xmm7,xmm0
	movdqu	XMMWORD[rdi],xmm6
	movdqu	XMMWORD[16+rdi],xmm11
	movdqu	XMMWORD[32+rdi],xmm2
	movdqu	XMMWORD[48+rdi],xmm7
	je	NEAR $L$done4x		; flags still from "cmp rdx,64"

	; Stage block 1 for the byte loop.
	movdqa	xmm6,XMMWORD[16+rsp]
	lea	rsi,[64+rsi]
	xor	r10,r10
	movdqa	XMMWORD[rsp],xmm6
	movdqa	XMMWORD[16+rsp],xmm13
	lea	rdi,[64+rdi]
	movdqa	XMMWORD[32+rsp],xmm5
	sub	rdx,64
	movdqa	XMMWORD[48+rsp],xmm1
	jmp	NEAR $L$oop_tail4x

ALIGN	32
$L$128_or_more4x:
	movdqu	xmm6,XMMWORD[rsi]
	movdqu	xmm11,XMMWORD[16+rsi]
	movdqu	xmm2,XMMWORD[32+rsi]
	movdqu	xmm7,XMMWORD[48+rsi]
	pxor	xmm6,XMMWORD[rsp]
	pxor	xmm11,xmm12
	pxor	xmm2,xmm4
	pxor	xmm7,xmm0

	movdqu	XMMWORD[rdi],xmm6
	movdqu	xmm6,XMMWORD[64+rsi]
	movdqu	XMMWORD[16+rdi],xmm11
	movdqu	xmm11,XMMWORD[80+rsi]
	movdqu	XMMWORD[32+rdi],xmm2
	movdqu	xmm2,XMMWORD[96+rsi]
	movdqu	XMMWORD[48+rdi],xmm7
	movdqu	xmm7,XMMWORD[112+rsi]
	pxor	xmm6,XMMWORD[16+rsp]
	pxor	xmm11,xmm13
	pxor	xmm2,xmm5
	pxor	xmm7,xmm1
	movdqu	XMMWORD[64+rdi],xmm6
	movdqu	XMMWORD[80+rdi],xmm11
	movdqu	XMMWORD[96+rdi],xmm2
	movdqu	XMMWORD[112+rdi],xmm7
	je	NEAR $L$done4x		; flags still from "cmp rdx,128"

	; Stage block 2 for the byte loop.
	movdqa	xmm6,XMMWORD[32+rsp]
	lea	rsi,[128+rsi]
	xor	r10,r10
	movdqa	XMMWORD[rsp],xmm6
	movdqa	XMMWORD[16+rsp],xmm10
	lea	rdi,[128+rdi]
	movdqa	XMMWORD[32+rsp],xmm14
	sub	rdx,128
	movdqa	XMMWORD[48+rsp],xmm8
	jmp	NEAR $L$oop_tail4x

ALIGN	32
$L$192_or_more4x:
	movdqu	xmm6,XMMWORD[rsi]
	movdqu	xmm11,XMMWORD[16+rsi]
	movdqu	xmm2,XMMWORD[32+rsi]
	movdqu	xmm7,XMMWORD[48+rsi]
	pxor	xmm6,XMMWORD[rsp]
	pxor	xmm11,xmm12
	pxor	xmm2,xmm4
	pxor	xmm7,xmm0

	movdqu	XMMWORD[rdi],xmm6
	movdqu	xmm6,XMMWORD[64+rsi]
	movdqu	XMMWORD[16+rdi],xmm11
	movdqu	xmm11,XMMWORD[80+rsi]
	movdqu	XMMWORD[32+rdi],xmm2
	movdqu	xmm2,XMMWORD[96+rsi]
	movdqu	XMMWORD[48+rdi],xmm7
	movdqu	xmm7,XMMWORD[112+rsi]
	lea	rsi,[128+rsi]
	pxor	xmm6,XMMWORD[16+rsp]
	pxor	xmm11,xmm13
	pxor	xmm2,xmm5
	pxor	xmm7,xmm1

	movdqu	XMMWORD[64+rdi],xmm6
	movdqu	xmm6,XMMWORD[rsi]
	movdqu	XMMWORD[80+rdi],xmm11
	movdqu	xmm11,XMMWORD[16+rsi]
	movdqu	XMMWORD[96+rdi],xmm2
	movdqu	xmm2,XMMWORD[32+rsi]
	movdqu	XMMWORD[112+rdi],xmm7
	lea	rdi,[128+rdi]
	movdqu	xmm7,XMMWORD[48+rsi]
	pxor	xmm6,XMMWORD[32+rsp]
	pxor	xmm11,xmm10
	pxor	xmm2,xmm14
	pxor	xmm7,xmm8
	movdqu	XMMWORD[rdi],xmm6
	movdqu	XMMWORD[16+rdi],xmm11
	movdqu	XMMWORD[32+rdi],xmm2
	movdqu	XMMWORD[48+rdi],xmm7
	je	NEAR $L$done4x		; flags still from "cmp rdx,192"

	; Stage block 3 for the byte loop.
	movdqa	xmm6,XMMWORD[48+rsp]
	lea	rsi,[64+rsi]
	xor	r10,r10
	movdqa	XMMWORD[rsp],xmm6
	movdqa	XMMWORD[16+rsp],xmm15
	lea	rdi,[64+rdi]
	movdqa	XMMWORD[32+rsp],xmm9
	sub	rdx,192
	movdqa	XMMWORD[48+rsp],xmm3

$L$oop_tail4x:
	; Byte-wise XOR of the remaining input with the staged keystream.
	movzx	eax,BYTE[r10*1+rsi]
	movzx	ecx,BYTE[r10*1+rsp]
	lea	r10,[1+r10]
	xor	eax,ecx
	mov	BYTE[((-1))+r10*1+rdi],al
	dec	rdx
	jnz	NEAR $L$oop_tail4x

$L$done4x:
	movaps	xmm6,XMMWORD[((-168))+r9]
	movaps	xmm7,XMMWORD[((-152))+r9]
	movaps	xmm8,XMMWORD[((-136))+r9]
	movaps	xmm9,XMMWORD[((-120))+r9]
	movaps	xmm10,XMMWORD[((-104))+r9]
	movaps	xmm11,XMMWORD[((-88))+r9]
	movaps	xmm12,XMMWORD[((-72))+r9]
	movaps	xmm13,XMMWORD[((-56))+r9]
	movaps	xmm14,XMMWORD[((-40))+r9]
	movaps	xmm15,XMMWORD[((-24))+r9]
	lea	rsp,[r9]

$L$4x_epilogue:
	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
	mov	rsi,QWORD[16+rsp]
	DB	0F3h,0C3h		;repret

$L$SEH_end_chacha20_4x:
global chacha20_avx2
|
||
|
|
||
|
ALIGN 32
|
||
|
chacha20_avx2:
|
||
|
mov QWORD[8+rsp],rdi ;WIN64 prologue
|
||
|
mov QWORD[16+rsp],rsi
|
||
|
mov rax,rsp
|
||
|
$L$SEH_begin_chacha20_avx2:
|
||
|
mov rdi,rcx
|
||
|
mov rsi,rdx
|
||
|
mov rdx,r8
|
||
|
mov rcx,r9
|
||
|
mov r8,QWORD[40+rsp]
|
||
|
|
||
|
|
||
|
|
||
|
$L$chacha20_avx2:
|
||
|
mov r9,rsp
|
||
|
|
||
|
sub rsp,0x280+168
|
||
|
and rsp,-32
|
||
|
movaps XMMWORD[(-168)+r9],xmm6
|
||
|
movaps XMMWORD[(-152)+r9],xmm7
|
||
|
movaps XMMWORD[(-136)+r9],xmm8
|
||
|
movaps XMMWORD[(-120)+r9],xmm9
|
||
|
movaps XMMWORD[(-104)+r9],xmm10
|
||
|
movaps XMMWORD[(-88)+r9],xmm11
|
||
|
movaps XMMWORD[(-72)+r9],xmm12
|
||
|
movaps XMMWORD[(-56)+r9],xmm13
|
||
|
movaps XMMWORD[(-40)+r9],xmm14
|
||
|
movaps XMMWORD[(-24)+r9],xmm15
|
||
|
$L$8x_body:
|
||
|
vzeroupper
|
||
|
|
||
|
vbroadcasti128 ymm11,XMMWORD[$L$sigma]
|
||
|
vbroadcasti128 ymm3,XMMWORD[rcx]
|
||
|
vbroadcasti128 ymm15,XMMWORD[16+rcx]
|
||
|
vbroadcasti128 ymm7,XMMWORD[r8]
|
||
|
lea rcx,[256+rsp]
|
||
|
lea rax,[512+rsp]
|
||
|
lea r10,[$L$rot16]
|
||
|
lea r11,[$L$rot24]
|
||
|
|
||
|
vpshufd ymm8,ymm11,0x00
|
||
|
vpshufd ymm9,ymm11,0x55
|
||
|
vmovdqa YMMWORD[(128-256)+rcx],ymm8
|
||
|
vpshufd ymm10,ymm11,0xaa
|
||
|
vmovdqa YMMWORD[(160-256)+rcx],ymm9
|
||
|
vpshufd ymm11,ymm11,0xff
|
||
|
vmovdqa YMMWORD[(192-256)+rcx],ymm10
|
||
|
vmovdqa YMMWORD[(224-256)+rcx],ymm11
|
||
|
|
||
|
vpshufd ymm0,ymm3,0x00
|
||
|
vpshufd ymm1,ymm3,0x55
|
||
|
vmovdqa YMMWORD[(256-256)+rcx],ymm0
|
||
|
vpshufd ymm2,ymm3,0xaa
|
||
|
vmovdqa YMMWORD[(288-256)+rcx],ymm1
|
||
|
vpshufd ymm3,ymm3,0xff
|
||
|
vmovdqa YMMWORD[(320-256)+rcx],ymm2
|
||
|
vmovdqa YMMWORD[(352-256)+rcx],ymm3
|
||
|
|
||
|
vpshufd ymm12,ymm15,0x00
|
||
|
vpshufd ymm13,ymm15,0x55
|
||
|
vmovdqa YMMWORD[(384-512)+rax],ymm12
|
||
|
vpshufd ymm14,ymm15,0xaa
|
||
|
vmovdqa YMMWORD[(416-512)+rax],ymm13
|
||
|
vpshufd ymm15,ymm15,0xff
|
||
|
vmovdqa YMMWORD[(448-512)+rax],ymm14
|
||
|
vmovdqa YMMWORD[(480-512)+rax],ymm15
|
||
|
|
||
|
vpshufd ymm4,ymm7,0x00
|
||
|
vpshufd ymm5,ymm7,0x55
|
||
|
vpaddd ymm4,ymm4,YMMWORD[$L$incy]
|
||
|
vpshufd ymm6,ymm7,0xaa
|
||
|
vmovdqa YMMWORD[(544-512)+rax],ymm5
|
||
|
vpshufd ymm7,ymm7,0xff
|
||
|
vmovdqa YMMWORD[(576-512)+rax],ymm6
|
||
|
vmovdqa YMMWORD[(608-512)+rax],ymm7
|
||
|
|
||
|
jmp NEAR $L$oop_enter8x
|
||
|
|
||
|
ALIGN 32
|
||
|
$L$oop_outer8x:
|
||
|
vmovdqa ymm8,YMMWORD[((128-256))+rcx]
|
||
|
vmovdqa ymm9,YMMWORD[((160-256))+rcx]
|
||
|
vmovdqa ymm10,YMMWORD[((192-256))+rcx]
|
||
|
vmovdqa ymm11,YMMWORD[((224-256))+rcx]
|
||
|
vmovdqa ymm0,YMMWORD[((256-256))+rcx]
|
||
|
vmovdqa ymm1,YMMWORD[((288-256))+rcx]
|
||
|
vmovdqa ymm2,YMMWORD[((320-256))+rcx]
|
||
|
vmovdqa ymm3,YMMWORD[((352-256))+rcx]
|
||
|
vmovdqa ymm12,YMMWORD[((384-512))+rax]
|
||
|
vmovdqa ymm13,YMMWORD[((416-512))+rax]
|
||
|
vmovdqa ymm14,YMMWORD[((448-512))+rax]
|
||
|
vmovdqa ymm15,YMMWORD[((480-512))+rax]
|
||
|
vmovdqa ymm4,YMMWORD[((512-512))+rax]
|
||
|
vmovdqa ymm5,YMMWORD[((544-512))+rax]
|
||
|
vmovdqa ymm6,YMMWORD[((576-512))+rax]
|
||
|
vmovdqa ymm7,YMMWORD[((608-512))+rax]
|
||
|
vpaddd ymm4,ymm4,YMMWORD[$L$eight]
|
||
|
|
||
|
$L$oop_enter8x:
|
||
|
vmovdqa YMMWORD[64+rsp],ymm14
|
||
|
vmovdqa YMMWORD[96+rsp],ymm15
|
||
|
vbroadcasti128 ymm15,XMMWORD[r10]
|
||
|
vmovdqa YMMWORD[(512-512)+rax],ymm4
|
||
|
mov eax,10
|
||
|
jmp NEAR $L$oop8x
|
||
|
|
||
|
ALIGN 32
|
||
|
$L$oop8x:
|
||
|
vpaddd ymm8,ymm8,ymm0
|
||
|
vpxor ymm4,ymm8,ymm4
|
||
|
vpshufb ymm4,ymm4,ymm15
|
||
|
vpaddd ymm9,ymm9,ymm1
|
||
|
vpxor ymm5,ymm9,ymm5
|
||
|
vpshufb ymm5,ymm5,ymm15
|
||
|
vpaddd ymm12,ymm12,ymm4
|
||
|
vpxor ymm0,ymm12,ymm0
|
||
|
vpslld ymm14,ymm0,12
|
||
|
vpsrld ymm0,ymm0,20
|
||
|
vpor ymm0,ymm14,ymm0
|
||
|
vbroadcasti128 ymm14,XMMWORD[r11]
|
||
|
vpaddd ymm13,ymm13,ymm5
|
||
|
vpxor ymm1,ymm13,ymm1
|
||
|
vpslld ymm15,ymm1,12
|
||
|
vpsrld ymm1,ymm1,20
|
||
|
vpor ymm1,ymm15,ymm1
|
||
|
vpaddd ymm8,ymm8,ymm0
|
||
|
vpxor ymm4,ymm8,ymm4
|
||
|
vpshufb ymm4,ymm4,ymm14
|
||
|
vpaddd ymm9,ymm9,ymm1
|
||
|
vpxor ymm5,ymm9,ymm5
|
||
|
vpshufb ymm5,ymm5,ymm14
|
||
|
vpaddd ymm12,ymm12,ymm4
|
||
|
vpxor ymm0,ymm12,ymm0
|
||
|
vpslld ymm15,ymm0,7
|
||
|
vpsrld ymm0,ymm0,25
|
||
|
vpor ymm0,ymm15,ymm0
|
||
|
vbroadcasti128 ymm15,XMMWORD[r10]
|
||
|
vpaddd ymm13,ymm13,ymm5
|
||
|
vpxor ymm1,ymm13,ymm1
|
||
|
vpslld ymm14,ymm1,7
|
||
|
vpsrld ymm1,ymm1,25
|
||
|
vpor ymm1,ymm14,ymm1
|
||
|
vmovdqa YMMWORD[rsp],ymm12
|
||
|
vmovdqa YMMWORD[32+rsp],ymm13
|
||
|
vmovdqa ymm12,YMMWORD[64+rsp]
|
||
|
vmovdqa ymm13,YMMWORD[96+rsp]
|
||
|
vpaddd ymm10,ymm10,ymm2
|
||
|
vpxor ymm6,ymm10,ymm6
|
||
|
vpshufb ymm6,ymm6,ymm15
|
||
|
vpaddd ymm11,ymm11,ymm3
|
||
|
vpxor ymm7,ymm11,ymm7
|
||
|
vpshufb ymm7,ymm7,ymm15
|
||
|
vpaddd ymm12,ymm12,ymm6
|
||
|
vpxor ymm2,ymm12,ymm2
|
||
|
vpslld ymm14,ymm2,12
|
||
|
vpsrld ymm2,ymm2,20
|
||
|
vpor ymm2,ymm14,ymm2
|
||
|
vbroadcasti128 ymm14,XMMWORD[r11]
|
||
|
vpaddd ymm13,ymm13,ymm7
|
||
|
vpxor ymm3,ymm13,ymm3
|
||
|
vpslld ymm15,ymm3,12
|
||
|
vpsrld ymm3,ymm3,20
|
||
|
vpor ymm3,ymm15,ymm3
|
||
|
vpaddd ymm10,ymm10,ymm2
|
||
|
vpxor ymm6,ymm10,ymm6
|
||
|
vpshufb ymm6,ymm6,ymm14
|
||
|
vpaddd ymm11,ymm11,ymm3
|
||
|
vpxor ymm7,ymm11,ymm7
|
||
|
vpshufb ymm7,ymm7,ymm14
|
||
|
vpaddd ymm12,ymm12,ymm6
|
||
|
vpxor ymm2,ymm12,ymm2
|
||
|
vpslld ymm15,ymm2,7
|
||
|
vpsrld ymm2,ymm2,25
|
||
|
vpor ymm2,ymm15,ymm2
|
||
|
vbroadcasti128 ymm15,XMMWORD[r10]
|
||
|
vpaddd ymm13,ymm13,ymm7
|
||
|
vpxor ymm3,ymm13,ymm3
|
||
|
vpslld ymm14,ymm3,7
|
||
|
vpsrld ymm3,ymm3,25
|
||
|
vpor ymm3,ymm14,ymm3
|
||
|
vpaddd ymm8,ymm8,ymm1
|
||
|
vpxor ymm7,ymm8,ymm7
|
||
|
vpshufb ymm7,ymm7,ymm15
|
||
|
vpaddd ymm9,ymm9,ymm2
|
||
|
vpxor ymm4,ymm9,ymm4
|
||
|
vpshufb ymm4,ymm4,ymm15
|
||
|
vpaddd ymm12,ymm12,ymm7
|
||
|
vpxor ymm1,ymm12,ymm1
|
||
|
vpslld ymm14,ymm1,12
|
||
|
vpsrld ymm1,ymm1,20
|
||
|
vpor ymm1,ymm14,ymm1
|
||
|
vbroadcasti128 ymm14,XMMWORD[r11]
|
||
|
vpaddd ymm13,ymm13,ymm4
|
||
|
vpxor ymm2,ymm13,ymm2
|
||
|
vpslld ymm15,ymm2,12
|
||
|
vpsrld ymm2,ymm2,20
|
||
|
vpor ymm2,ymm15,ymm2
|
||
|
vpaddd ymm8,ymm8,ymm1
|
||
|
vpxor ymm7,ymm8,ymm7
|
||
|
vpshufb ymm7,ymm7,ymm14
|
||
|
vpaddd ymm9,ymm9,ymm2
|
||
|
vpxor ymm4,ymm9,ymm4
|
||
|
vpshufb ymm4,ymm4,ymm14
|
||
|
vpaddd ymm12,ymm12,ymm7
|
||
|
vpxor ymm1,ymm12,ymm1
|
||
|
vpslld ymm15,ymm1,7
|
||
|
vpsrld ymm1,ymm1,25
|
||
|
vpor ymm1,ymm15,ymm1
|
||
|
vbroadcasti128 ymm15,XMMWORD[r10]
|
||
|
vpaddd ymm13,ymm13,ymm4
|
||
|
vpxor ymm2,ymm13,ymm2
|
||
|
vpslld ymm14,ymm2,7
|
||
|
vpsrld ymm2,ymm2,25
|
||
|
vpor ymm2,ymm14,ymm2
|
||
|
vmovdqa YMMWORD[64+rsp],ymm12
|
||
|
vmovdqa YMMWORD[96+rsp],ymm13
|
||
|
vmovdqa ymm12,YMMWORD[rsp]
|
||
|
vmovdqa ymm13,YMMWORD[32+rsp]
|
||
|
vpaddd ymm10,ymm10,ymm3
|
||
|
vpxor ymm5,ymm10,ymm5
|
||
|
vpshufb ymm5,ymm5,ymm15
|
||
|
vpaddd ymm11,ymm11,ymm0
|
||
|
vpxor ymm6,ymm11,ymm6
|
||
|
vpshufb ymm6,ymm6,ymm15
|
||
|
vpaddd ymm12,ymm12,ymm5
|
||
|
vpxor ymm3,ymm12,ymm3
|
||
|
vpslld ymm14,ymm3,12
|
||
|
vpsrld ymm3,ymm3,20
|
||
|
vpor ymm3,ymm14,ymm3
|
||
|
vbroadcasti128 ymm14,XMMWORD[r11]
|
||
|
vpaddd ymm13,ymm13,ymm6
|
||
|
vpxor ymm0,ymm13,ymm0
|
||
|
vpslld ymm15,ymm0,12
|
||
|
vpsrld ymm0,ymm0,20
|
||
|
vpor ymm0,ymm15,ymm0
|
||
|
vpaddd ymm10,ymm10,ymm3
|
||
|
vpxor ymm5,ymm10,ymm5
|
||
|
vpshufb ymm5,ymm5,ymm14
|
||
|
vpaddd ymm11,ymm11,ymm0
|
||
|
vpxor ymm6,ymm11,ymm6
|
||
|
vpshufb ymm6,ymm6,ymm14
|
||
|
vpaddd ymm12,ymm12,ymm5
|
||
|
vpxor ymm3,ymm12,ymm3
|
||
|
vpslld ymm15,ymm3,7
|
||
|
vpsrld ymm3,ymm3,25
|
||
|
vpor ymm3,ymm15,ymm3
|
||
|
vbroadcasti128 ymm15,XMMWORD[r10]
|
||
|
vpaddd ymm13,ymm13,ymm6
|
||
|
vpxor ymm0,ymm13,ymm0
|
||
|
vpslld ymm14,ymm0,7
|
||
|
vpsrld ymm0,ymm0,25
|
||
|
vpor ymm0,ymm14,ymm0
|
||
|
dec eax
|
||
|
jnz NEAR $L$oop8x
|
||
|
|
||
|
lea rax,[512+rsp]
|
||
|
vpaddd ymm8,ymm8,YMMWORD[((128-256))+rcx]
|
||
|
vpaddd ymm9,ymm9,YMMWORD[((160-256))+rcx]
|
||
|
vpaddd ymm10,ymm10,YMMWORD[((192-256))+rcx]
|
||
|
vpaddd ymm11,ymm11,YMMWORD[((224-256))+rcx]
|
||
|
|
||
|
vpunpckldq ymm14,ymm8,ymm9
|
||
|
vpunpckldq ymm15,ymm10,ymm11
|
||
|
vpunpckhdq ymm8,ymm8,ymm9
|
||
|
vpunpckhdq ymm10,ymm10,ymm11
|
||
|
vpunpcklqdq ymm9,ymm14,ymm15
|
||
|
vpunpckhqdq ymm14,ymm14,ymm15
|
||
|
vpunpcklqdq ymm11,ymm8,ymm10
|
||
|
vpunpckhqdq ymm8,ymm8,ymm10
|
||
|
vpaddd ymm0,ymm0,YMMWORD[((256-256))+rcx]
|
||
|
vpaddd ymm1,ymm1,YMMWORD[((288-256))+rcx]
|
||
|
vpaddd ymm2,ymm2,YMMWORD[((320-256))+rcx]
|
||
|
vpaddd ymm3,ymm3,YMMWORD[((352-256))+rcx]
|
||
|
|
||
|
vpunpckldq ymm10,ymm0,ymm1
|
||
|
vpunpckldq ymm15,ymm2,ymm3
|
||
|
vpunpckhdq ymm0,ymm0,ymm1
|
||
|
vpunpckhdq ymm2,ymm2,ymm3
|
||
|
vpunpcklqdq ymm1,ymm10,ymm15
|
||
|
vpunpckhqdq ymm10,ymm10,ymm15
|
||
|
vpunpcklqdq ymm3,ymm0,ymm2
|
||
|
vpunpckhqdq ymm0,ymm0,ymm2
|
||
|
vperm2i128 ymm15,ymm9,ymm1,0x20
|
||
|
vperm2i128 ymm1,ymm9,ymm1,0x31
|
||
|
vperm2i128 ymm9,ymm14,ymm10,0x20
|
||
|
vperm2i128 ymm10,ymm14,ymm10,0x31
|
||
|
vperm2i128 ymm14,ymm11,ymm3,0x20
|
||
|
vperm2i128 ymm3,ymm11,ymm3,0x31
|
||
|
vperm2i128 ymm11,ymm8,ymm0,0x20
|
||
|
vperm2i128 ymm0,ymm8,ymm0,0x31
|
||
|
vmovdqa YMMWORD[rsp],ymm15
|
||
|
vmovdqa YMMWORD[32+rsp],ymm9
|
||
|
vmovdqa ymm15,YMMWORD[64+rsp]
|
||
|
vmovdqa ymm9,YMMWORD[96+rsp]
|
||
|
|
||
|
vpaddd ymm12,ymm12,YMMWORD[((384-512))+rax]
|
||
|
vpaddd ymm13,ymm13,YMMWORD[((416-512))+rax]
|
||
|
vpaddd ymm15,ymm15,YMMWORD[((448-512))+rax]
|
||
|
vpaddd ymm9,ymm9,YMMWORD[((480-512))+rax]
|
||
|
|
||
|
vpunpckldq ymm2,ymm12,ymm13
|
||
|
vpunpckldq ymm8,ymm15,ymm9
|
||
|
vpunpckhdq ymm12,ymm12,ymm13
|
||
|
vpunpckhdq ymm15,ymm15,ymm9
|
||
|
vpunpcklqdq ymm13,ymm2,ymm8
|
||
|
vpunpckhqdq ymm2,ymm2,ymm8
|
||
|
vpunpcklqdq ymm9,ymm12,ymm15
|
||
|
vpunpckhqdq ymm12,ymm12,ymm15
|
||
|
vpaddd ymm4,ymm4,YMMWORD[((512-512))+rax]
|
||
|
vpaddd ymm5,ymm5,YMMWORD[((544-512))+rax]
|
||
|
vpaddd ymm6,ymm6,YMMWORD[((576-512))+rax]
|
||
|
vpaddd ymm7,ymm7,YMMWORD[((608-512))+rax]
|
||
|
|
||
|
vpunpckldq ymm15,ymm4,ymm5
|
||
|
vpunpckldq ymm8,ymm6,ymm7
|
||
|
vpunpckhdq ymm4,ymm4,ymm5
|
||
|
vpunpckhdq ymm6,ymm6,ymm7
|
||
|
vpunpcklqdq ymm5,ymm15,ymm8
|
||
|
vpunpckhqdq ymm15,ymm15,ymm8
|
||
|
vpunpcklqdq ymm7,ymm4,ymm6
|
||
|
vpunpckhqdq ymm4,ymm4,ymm6
|
||
|
vperm2i128 ymm8,ymm13,ymm5,0x20
|
||
|
vperm2i128 ymm5,ymm13,ymm5,0x31
|
||
|
vperm2i128 ymm13,ymm2,ymm15,0x20
|
||
|
vperm2i128 ymm15,ymm2,ymm15,0x31
|
||
|
vperm2i128 ymm2,ymm9,ymm7,0x20
|
||
|
vperm2i128 ymm7,ymm9,ymm7,0x31
|
||
|
vperm2i128 ymm9,ymm12,ymm4,0x20
|
||
|
vperm2i128 ymm4,ymm12,ymm4,0x31
|
||
|
vmovdqa ymm6,YMMWORD[rsp]
|
||
|
vmovdqa ymm12,YMMWORD[32+rsp]
|
||
|
|
||
|
cmp rdx,64*8
|
||
|
jb NEAR $L$tail8x
|
||
|
|
||
|
vpxor ymm6,ymm6,YMMWORD[rsi]
|
||
|
vpxor ymm8,ymm8,YMMWORD[32+rsi]
|
||
|
vpxor ymm1,ymm1,YMMWORD[64+rsi]
|
||
|
vpxor ymm5,ymm5,YMMWORD[96+rsi]
|
||
|
lea rsi,[128+rsi]
|
||
|
vmovdqu YMMWORD[rdi],ymm6
|
||
|
vmovdqu YMMWORD[32+rdi],ymm8
|
||
|
vmovdqu YMMWORD[64+rdi],ymm1
|
||
|
vmovdqu YMMWORD[96+rdi],ymm5
|
||
|
lea rdi,[128+rdi]
|
||
|
|
||
|
vpxor ymm12,ymm12,YMMWORD[rsi]
|
||
|
vpxor ymm13,ymm13,YMMWORD[32+rsi]
|
||
|
vpxor ymm10,ymm10,YMMWORD[64+rsi]
|
||
|
vpxor ymm15,ymm15,YMMWORD[96+rsi]
|
||
|
lea rsi,[128+rsi]
|
||
|
vmovdqu YMMWORD[rdi],ymm12
|
||
|
vmovdqu YMMWORD[32+rdi],ymm13
|
||
|
vmovdqu YMMWORD[64+rdi],ymm10
|
||
|
vmovdqu YMMWORD[96+rdi],ymm15
|
||
|
lea rdi,[128+rdi]
|
||
|
|
||
|
vpxor ymm14,ymm14,YMMWORD[rsi]
|
||
|
vpxor ymm2,ymm2,YMMWORD[32+rsi]
|
||
|
vpxor ymm3,ymm3,YMMWORD[64+rsi]
|
||
|
vpxor ymm7,ymm7,YMMWORD[96+rsi]
|
||
|
lea rsi,[128+rsi]
|
||
|
vmovdqu YMMWORD[rdi],ymm14
|
||
|
vmovdqu YMMWORD[32+rdi],ymm2
|
||
|
vmovdqu YMMWORD[64+rdi],ymm3
|
||
|
vmovdqu YMMWORD[96+rdi],ymm7
|
||
|
lea rdi,[128+rdi]
|
||
|
|
||
|
vpxor ymm11,ymm11,YMMWORD[rsi]
|
||
|
vpxor ymm9,ymm9,YMMWORD[32+rsi]
|
||
|
vpxor ymm0,ymm0,YMMWORD[64+rsi]
|
||
|
vpxor ymm4,ymm4,YMMWORD[96+rsi]
|
||
|
lea rsi,[128+rsi]
|
||
|
vmovdqu YMMWORD[rdi],ymm11
|
||
|
vmovdqu YMMWORD[32+rdi],ymm9
|
||
|
vmovdqu YMMWORD[64+rdi],ymm0
|
||
|
vmovdqu YMMWORD[96+rdi],ymm4
|
||
|
lea rdi,[128+rdi]
|
||
|
|
||
|
sub rdx,64*8
|
||
|
jnz NEAR $L$oop_outer8x
|
||
|
|
||
|
jmp NEAR $L$done8x
|
||
|
|
||
|
$L$tail8x:
|
||
|
cmp rdx,448
|
||
|
jae NEAR $L$448_or_more8x
|
||
|
cmp rdx,384
|
||
|
jae NEAR $L$384_or_more8x
|
||
|
cmp rdx,320
|
||
|
jae NEAR $L$320_or_more8x
|
||
|
cmp rdx,256
|
||
|
jae NEAR $L$256_or_more8x
|
||
|
cmp rdx,192
|
||
|
jae NEAR $L$192_or_more8x
|
||
|
cmp rdx,128
|
||
|
jae NEAR $L$128_or_more8x
|
||
|
cmp rdx,64
|
||
|
jae NEAR $L$64_or_more8x
|
||
|
|
||
|
xor r10,r10
|
||
|
vmovdqa YMMWORD[rsp],ymm6
|
||
|
vmovdqa YMMWORD[32+rsp],ymm8
|
||
|
jmp NEAR $L$oop_tail8x
|
||
|
|
||
|
ALIGN 32
|
||
|
$L$64_or_more8x:
|
||
|
vpxor ymm6,ymm6,YMMWORD[rsi]
|
||
|
vpxor ymm8,ymm8,YMMWORD[32+rsi]
|
||
|
vmovdqu YMMWORD[rdi],ymm6
|
||
|
vmovdqu YMMWORD[32+rdi],ymm8
|
||
|
je NEAR $L$done8x
|
||
|
|
||
|
lea rsi,[64+rsi]
|
||
|
xor r10,r10
|
||
|
vmovdqa YMMWORD[rsp],ymm1
|
||
|
lea rdi,[64+rdi]
|
||
|
sub rdx,64
|
||
|
vmovdqa YMMWORD[32+rsp],ymm5
|
||
|
jmp NEAR $L$oop_tail8x
|
||
|
|
||
|
ALIGN 32
|
||
|
$L$128_or_more8x:
|
||
|
vpxor ymm6,ymm6,YMMWORD[rsi]
|
||
|
vpxor ymm8,ymm8,YMMWORD[32+rsi]
|
||
|
vpxor ymm1,ymm1,YMMWORD[64+rsi]
|
||
|
vpxor ymm5,ymm5,YMMWORD[96+rsi]
|
||
|
vmovdqu YMMWORD[rdi],ymm6
|
||
|
vmovdqu YMMWORD[32+rdi],ymm8
|
||
|
vmovdqu YMMWORD[64+rdi],ymm1
|
||
|
vmovdqu YMMWORD[96+rdi],ymm5
|
||
|
je NEAR $L$done8x
|
||
|
|
||
|
lea rsi,[128+rsi]
|
||
|
xor r10,r10
|
||
|
vmovdqa YMMWORD[rsp],ymm12
|
||
|
lea rdi,[128+rdi]
|
||
|
sub rdx,128
|
||
|
vmovdqa YMMWORD[32+rsp],ymm13
|
||
|
jmp NEAR $L$oop_tail8x
|
||
|
|
||
|
ALIGN 32
|
||
|
$L$192_or_more8x:
|
||
|
vpxor ymm6,ymm6,YMMWORD[rsi]
|
||
|
vpxor ymm8,ymm8,YMMWORD[32+rsi]
|
||
|
vpxor ymm1,ymm1,YMMWORD[64+rsi]
|
||
|
vpxor ymm5,ymm5,YMMWORD[96+rsi]
|
||
|
vpxor ymm12,ymm12,YMMWORD[128+rsi]
|
||
|
vpxor ymm13,ymm13,YMMWORD[160+rsi]
|
||
|
vmovdqu YMMWORD[rdi],ymm6
|
||
|
vmovdqu YMMWORD[32+rdi],ymm8
|
||
|
vmovdqu YMMWORD[64+rdi],ymm1
|
||
|
vmovdqu YMMWORD[96+rdi],ymm5
|
||
|
vmovdqu YMMWORD[128+rdi],ymm12
|
||
|
vmovdqu YMMWORD[160+rdi],ymm13
|
||
|
je NEAR $L$done8x
|
||
|
|
||
|
lea rsi,[192+rsi]
|
||
|
xor r10,r10
|
||
|
vmovdqa YMMWORD[rsp],ymm10
|
||
|
lea rdi,[192+rdi]
|
||
|
sub rdx,192
|
||
|
vmovdqa YMMWORD[32+rsp],ymm15
|
||
|
jmp NEAR $L$oop_tail8x
|
||
|
|
||
|
ALIGN 32
|
||
|
$L$256_or_more8x:
|
||
|
vpxor ymm6,ymm6,YMMWORD[rsi]
|
||
|
vpxor ymm8,ymm8,YMMWORD[32+rsi]
|
||
|
vpxor ymm1,ymm1,YMMWORD[64+rsi]
|
||
|
vpxor ymm5,ymm5,YMMWORD[96+rsi]
|
||
|
vpxor ymm12,ymm12,YMMWORD[128+rsi]
|
||
|
vpxor ymm13,ymm13,YMMWORD[160+rsi]
|
||
|
vpxor ymm10,ymm10,YMMWORD[192+rsi]
|
||
|
vpxor ymm15,ymm15,YMMWORD[224+rsi]
|
||
|
vmovdqu YMMWORD[rdi],ymm6
|
||
|
vmovdqu YMMWORD[32+rdi],ymm8
|
||
|
vmovdqu YMMWORD[64+rdi],ymm1
|
||
|
vmovdqu YMMWORD[96+rdi],ymm5
|
||
|
vmovdqu YMMWORD[128+rdi],ymm12
|
||
|
vmovdqu YMMWORD[160+rdi],ymm13
|
||
|
vmovdqu YMMWORD[192+rdi],ymm10
|
||
|
vmovdqu YMMWORD[224+rdi],ymm15
|
||
|
je NEAR $L$done8x
|
||
|
|
||
|
lea rsi,[256+rsi]
|
||
|
xor r10,r10
|
||
|
vmovdqa YMMWORD[rsp],ymm14
|
||
|
lea rdi,[256+rdi]
|
||
|
sub rdx,256
|
||
|
vmovdqa YMMWORD[32+rsp],ymm2
|
||
|
jmp NEAR $L$oop_tail8x
|
||
|
|
||
|
ALIGN 32
|
||
|
$L$320_or_more8x:
|
||
|
vpxor ymm6,ymm6,YMMWORD[rsi]
|
||
|
vpxor ymm8,ymm8,YMMWORD[32+rsi]
|
||
|
vpxor ymm1,ymm1,YMMWORD[64+rsi]
|
||
|
vpxor ymm5,ymm5,YMMWORD[96+rsi]
|
||
|
vpxor ymm12,ymm12,YMMWORD[128+rsi]
|
||
|
vpxor ymm13,ymm13,YMMWORD[160+rsi]
|
||
|
vpxor ymm10,ymm10,YMMWORD[192+rsi]
|
||
|
vpxor ymm15,ymm15,YMMWORD[224+rsi]
|
||
|
vpxor ymm14,ymm14,YMMWORD[256+rsi]
|
||
|
vpxor ymm2,ymm2,YMMWORD[288+rsi]
|
||
|
vmovdqu YMMWORD[rdi],ymm6
|
||
|
vmovdqu YMMWORD[32+rdi],ymm8
|
||
|
vmovdqu YMMWORD[64+rdi],ymm1
|
||
|
vmovdqu YMMWORD[96+rdi],ymm5
|
||
|
vmovdqu YMMWORD[128+rdi],ymm12
|
||
|
vmovdqu YMMWORD[160+rdi],ymm13
|
||
|
vmovdqu YMMWORD[192+rdi],ymm10
|
||
|
vmovdqu YMMWORD[224+rdi],ymm15
|
||
|
vmovdqu YMMWORD[256+rdi],ymm14
|
||
|
vmovdqu YMMWORD[288+rdi],ymm2
|
||
|
je NEAR $L$done8x
|
||
|
|
||
|
lea rsi,[320+rsi]
|
||
|
xor r10,r10
|
||
|
vmovdqa YMMWORD[rsp],ymm3
|
||
|
lea rdi,[320+rdi]
|
||
|
sub rdx,320
|
||
|
vmovdqa YMMWORD[32+rsp],ymm7
|
||
|
jmp NEAR $L$oop_tail8x
|
||
|
|
||
|
ALIGN 32
|
||
|
$L$384_or_more8x:
|
||
|
vpxor ymm6,ymm6,YMMWORD[rsi]
|
||
|
vpxor ymm8,ymm8,YMMWORD[32+rsi]
|
||
|
vpxor ymm1,ymm1,YMMWORD[64+rsi]
|
||
|
vpxor ymm5,ymm5,YMMWORD[96+rsi]
|
||
|
vpxor ymm12,ymm12,YMMWORD[128+rsi]
|
||
|
vpxor ymm13,ymm13,YMMWORD[160+rsi]
|
||
|
vpxor ymm10,ymm10,YMMWORD[192+rsi]
|
||
|
vpxor ymm15,ymm15,YMMWORD[224+rsi]
|
||
|
vpxor ymm14,ymm14,YMMWORD[256+rsi]
|
||
|
vpxor ymm2,ymm2,YMMWORD[288+rsi]
|
||
|
vpxor ymm3,ymm3,YMMWORD[320+rsi]
|
||
|
vpxor ymm7,ymm7,YMMWORD[352+rsi]
|
||
|
vmovdqu YMMWORD[rdi],ymm6
|
||
|
vmovdqu YMMWORD[32+rdi],ymm8
|
||
|
vmovdqu YMMWORD[64+rdi],ymm1
|
||
|
vmovdqu YMMWORD[96+rdi],ymm5
|
||
|
vmovdqu YMMWORD[128+rdi],ymm12
|
||
|
vmovdqu YMMWORD[160+rdi],ymm13
|
||
|
vmovdqu YMMWORD[192+rdi],ymm10
|
||
|
vmovdqu YMMWORD[224+rdi],ymm15
|
||
|
vmovdqu YMMWORD[256+rdi],ymm14
|
||
|
vmovdqu YMMWORD[288+rdi],ymm2
|
||
|
vmovdqu YMMWORD[320+rdi],ymm3
|
||
|
vmovdqu YMMWORD[352+rdi],ymm7
|
||
|
je NEAR $L$done8x
|
||
|
|
||
|
lea rsi,[384+rsi]
|
||
|
xor r10,r10
|
||
|
vmovdqa YMMWORD[rsp],ymm11
|
||
|
lea rdi,[384+rdi]
|
||
|
sub rdx,384
|
||
|
vmovdqa YMMWORD[32+rsp],ymm9
|
||
|
jmp NEAR $L$oop_tail8x
|
||
|
|
||
|
ALIGN 32
|
||
|
$L$448_or_more8x:
|
||
|
vpxor ymm6,ymm6,YMMWORD[rsi]
|
||
|
vpxor ymm8,ymm8,YMMWORD[32+rsi]
|
||
|
vpxor ymm1,ymm1,YMMWORD[64+rsi]
|
||
|
vpxor ymm5,ymm5,YMMWORD[96+rsi]
|
||
|
vpxor ymm12,ymm12,YMMWORD[128+rsi]
|
||
|
vpxor ymm13,ymm13,YMMWORD[160+rsi]
|
||
|
vpxor ymm10,ymm10,YMMWORD[192+rsi]
|
||
|
vpxor ymm15,ymm15,YMMWORD[224+rsi]
|
||
|
vpxor ymm14,ymm14,YMMWORD[256+rsi]
|
||
|
vpxor ymm2,ymm2,YMMWORD[288+rsi]
|
||
|
vpxor ymm3,ymm3,YMMWORD[320+rsi]
|
||
|
vpxor ymm7,ymm7,YMMWORD[352+rsi]
|
||
|
vpxor ymm11,ymm11,YMMWORD[384+rsi]
|
||
|
vpxor ymm9,ymm9,YMMWORD[416+rsi]
|
||
|
vmovdqu YMMWORD[rdi],ymm6
|
||
|
vmovdqu YMMWORD[32+rdi],ymm8
|
||
|
vmovdqu YMMWORD[64+rdi],ymm1
|
||
|
vmovdqu YMMWORD[96+rdi],ymm5
|
||
|
vmovdqu YMMWORD[128+rdi],ymm12
|
||
|
vmovdqu YMMWORD[160+rdi],ymm13
|
||
|
vmovdqu YMMWORD[192+rdi],ymm10
|
||
|
vmovdqu YMMWORD[224+rdi],ymm15
|
||
|
vmovdqu YMMWORD[256+rdi],ymm14
|
||
|
vmovdqu YMMWORD[288+rdi],ymm2
|
||
|
vmovdqu YMMWORD[320+rdi],ymm3
|
||
|
vmovdqu YMMWORD[352+rdi],ymm7
|
||
|
vmovdqu YMMWORD[384+rdi],ymm11
|
||
|
vmovdqu YMMWORD[416+rdi],ymm9
|
||
|
je NEAR $L$done8x
|
||
|
|
||
|
lea rsi,[448+rsi]
|
||
|
xor r10,r10
|
||
|
vmovdqa YMMWORD[rsp],ymm0
|
||
|
lea rdi,[448+rdi]
|
||
|
sub rdx,448
|
||
|
vmovdqa YMMWORD[32+rsp],ymm4
|
||
|
|
||
|
$L$oop_tail8x:
|
||
|
movzx eax,BYTE[r10*1+rsi]
|
||
|
movzx ecx,BYTE[r10*1+rsp]
|
||
|
lea r10,[1+r10]
|
||
|
xor eax,ecx
|
||
|
mov BYTE[((-1))+r10*1+rdi],al
|
||
|
dec rdx
|
||
|
jnz NEAR $L$oop_tail8x
|
||
|
|
||
|
$L$done8x:
|
||
|
vzeroall
|
||
|
movaps xmm6,XMMWORD[((-168))+r9]
|
||
|
movaps xmm7,XMMWORD[((-152))+r9]
|
||
|
movaps xmm8,XMMWORD[((-136))+r9]
|
||
|
movaps xmm9,XMMWORD[((-120))+r9]
|
||
|
movaps xmm10,XMMWORD[((-104))+r9]
|
||
|
movaps xmm11,XMMWORD[((-88))+r9]
|
||
|
movaps xmm12,XMMWORD[((-72))+r9]
|
||
|
movaps xmm13,XMMWORD[((-56))+r9]
|
||
|
movaps xmm14,XMMWORD[((-40))+r9]
|
||
|
movaps xmm15,XMMWORD[((-24))+r9]
|
||
|
lea rsp,[r9]
|
||
|
|
||
|
$L$8x_epilogue:
|
||
|
mov rdi,QWORD[8+rsp] ;WIN64 epilogue
|
||
|
mov rsi,QWORD[16+rsp]
|
||
|
DB 0F3h,0C3h ;repret
|
||
|
|
||
|
$L$SEH_end_chacha20_avx2:
|
||
|
;-----------------------------------------------------------------------
; chacha20_avx512 -- ChaCha20 keystream XOR, four 64-byte blocks per pass
; (one block per 128-bit lane of zmm0..zmm3).
;
; ABI:  Win64 entry. The prologue moves the arguments into the registers
;       the body works with:
;         rdi <- rcx       output pointer (results are stored to [rdi])
;         rsi <- rdx       input pointer  (data is loaded from [rsi])
;         rdx <- r8        remaining length in bytes
;         rcx <- r9        32 bytes loaded as state rows 1 and 2
;                          (presumably the key -- confirm against caller)
;         r8  <- [rsp+40]  16 bytes loaded as state row 3; dword 0 receives
;                          the per-lane block offsets (presumably block
;                          counter followed by nonce -- confirm)
; Path: lengths above 512 bytes branch to $L$chacha20_16x.
; Saves/restores: rdi, rsi (caller home space), xmm6, xmm7 (stack).
; Stack: 64 bytes at [rsp] hold keystream for a partial final block.
;
; NOTE(review): rdx == 0 is not checked in this block. With rdx == 0 the
;   first `sub rdx,64` borrows, the tail path restores rdx to 0 and the
;   byte loop then decrements past zero. Callers presumably guarantee a
;   non-zero length -- confirm.
; NOTE(review): the bare `|` / `||` lines interleaved with the code are
;   not NASM syntax; they look like table residue from an HTML export and
;   would have to be stripped before this file can be assembled.
;-----------------------------------------------------------------------
global chacha20_avx512
|
||
|
|
||
|
ALIGN 32
|
||
|
chacha20_avx512:
|
||
|
mov QWORD[8+rsp],rdi ;WIN64 prologue
|
||
|
mov QWORD[16+rsp],rsi
|
||
|
mov rax,rsp
|
||
|
$L$SEH_begin_chacha20_avx512:
|
||
|
mov rdi,rcx
|
||
|
mov rsi,rdx
|
||
|
mov rdx,r8
|
||
|
mov rcx,r9
|
||
|
; fifth argument is read from the stack
mov r8,QWORD[40+rsp]
|
||
|
|
||
|
|
||
|
|
||
|
$L$chacha20_avx512:
|
||
|
; r9 keeps the entry rsp; the epilogue restores rsp from it
mov r9,rsp
|
||
|
|
||
|
; more than 512 bytes: hand over to the 16x path
cmp rdx,512
|
||
|
ja NEAR $L$chacha20_16x
|
||
|
|
||
|
; 64 bytes of tail scratch at [rsp] + 40 bytes holding xmm6/xmm7
sub rsp,64+40
|
||
|
movaps XMMWORD[(-40)+r9],xmm6
|
||
|
movaps XMMWORD[(-24)+r9],xmm7
|
||
|
$L$avx512_body:
|
||
|
; replicate each 16-byte state row into all four 128-bit lanes
vbroadcasti32x4 zmm0,ZMMWORD[$L$sigma]
|
||
|
vbroadcasti32x4 zmm1,ZMMWORD[rcx]
|
||
|
vbroadcasti32x4 zmm2,ZMMWORD[16+rcx]
|
||
|
vbroadcasti32x4 zmm3,ZMMWORD[r8]
|
||
|
|
||
|
; zmm16..zmm19 keep the input state for the post-round addition
vmovdqa32 zmm16,zmm0
|
||
|
vmovdqa32 zmm17,zmm1
|
||
|
vmovdqa32 zmm18,zmm2
|
||
|
; $L$zeroz adds 0,1,2,3 to dword 0 of lanes 0..3
vpaddd zmm3,zmm3,ZMMWORD[$L$zeroz]
|
||
|
; zmm20 = per-pass increment (4 in dword 0 of every lane)
vmovdqa32 zmm20,ZMMWORD[$L$fourz]
|
||
|
; r8 is reused as the iteration counter: 10 passes of the loop body
mov r8,10
|
||
|
vmovdqa32 zmm19,zmm3
|
||
|
jmp NEAR $L$oop_avx512
|
||
|
|
||
|
ALIGN 16
|
||
|
; next group of four blocks: reload the saved rows, advance dword 0 by 4
$L$oop_outer_avx512:
|
||
|
vmovdqa32 zmm0,zmm16
|
||
|
vmovdqa32 zmm1,zmm17
|
||
|
vmovdqa32 zmm2,zmm18
|
||
|
vpaddd zmm3,zmm19,zmm20
|
||
|
mov r8,10
|
||
|
vmovdqa32 zmm19,zmm3
|
||
|
jmp NEAR $L$oop_avx512
|
||
|
|
||
|
ALIGN 32
|
||
|
; one iteration = two add/xor/rotate sequences (rotations 16,12,8,7),
; separated by dword shuffles of rows 1..3
$L$oop_avx512:
|
||
|
vpaddd zmm0,zmm0,zmm1
|
||
|
vpxord zmm3,zmm3,zmm0
|
||
|
vprold zmm3,zmm3,16
|
||
|
vpaddd zmm2,zmm2,zmm3
|
||
|
vpxord zmm1,zmm1,zmm2
|
||
|
vprold zmm1,zmm1,12
|
||
|
vpaddd zmm0,zmm0,zmm1
|
||
|
vpxord zmm3,zmm3,zmm0
|
||
|
vprold zmm3,zmm3,8
|
||
|
vpaddd zmm2,zmm2,zmm3
|
||
|
vpxord zmm1,zmm1,zmm2
|
||
|
vprold zmm1,zmm1,7
|
||
|
; rotate the dwords of rows 2, 1, 3 within each lane (imm 78, 57, 147)
vpshufd zmm2,zmm2,78
|
||
|
vpshufd zmm1,zmm1,57
|
||
|
vpshufd zmm3,zmm3,147
|
||
|
vpaddd zmm0,zmm0,zmm1
|
||
|
vpxord zmm3,zmm3,zmm0
|
||
|
vprold zmm3,zmm3,16
|
||
|
vpaddd zmm2,zmm2,zmm3
|
||
|
vpxord zmm1,zmm1,zmm2
|
||
|
vprold zmm1,zmm1,12
|
||
|
vpaddd zmm0,zmm0,zmm1
|
||
|
vpxord zmm3,zmm3,zmm0
|
||
|
vprold zmm3,zmm3,8
|
||
|
vpaddd zmm2,zmm2,zmm3
|
||
|
vpxord zmm1,zmm1,zmm2
|
||
|
vprold zmm1,zmm1,7
|
||
|
; inverse rotation (imm 78, 147, 57) puts the rows back in place
vpshufd zmm2,zmm2,78
|
||
|
vpshufd zmm1,zmm1,147
|
||
|
vpshufd zmm3,zmm3,57
|
||
|
dec r8
|
||
|
jnz NEAR $L$oop_avx512
|
||
|
; r8 == 0 from here on; add the saved input state to the round output
vpaddd zmm0,zmm0,zmm16
|
||
|
vpaddd zmm1,zmm1,zmm17
|
||
|
vpaddd zmm2,zmm2,zmm18
|
||
|
vpaddd zmm3,zmm3,zmm19
|
||
|
|
||
|
; lane 0: fewer than 64 bytes left -> byte-wise tail
sub rdx,64
|
||
|
jb NEAR $L$tail64_avx512
|
||
|
|
||
|
vpxor xmm4,xmm0,XMMWORD[rsi]
|
||
|
vpxor xmm5,xmm1,XMMWORD[16+rsi]
|
||
|
vpxor xmm6,xmm2,XMMWORD[32+rsi]
|
||
|
vpxor xmm7,xmm3,XMMWORD[48+rsi]
|
||
|
lea rsi,[64+rsi]
|
||
|
|
||
|
vmovdqu XMMWORD[rdi],xmm4
|
||
|
vmovdqu XMMWORD[16+rdi],xmm5
|
||
|
vmovdqu XMMWORD[32+rdi],xmm6
|
||
|
vmovdqu XMMWORD[48+rdi],xmm7
|
||
|
lea rdi,[64+rdi]
|
||
|
|
||
|
; ZF still comes from `sub rdx,64` above (the vector ops and lea in
; between do not write flags): zero means the input is fully consumed
jz NEAR $L$done_avx512
|
||
|
|
||
|
; lane 1
vextracti32x4 xmm4,zmm0,1
|
||
|
vextracti32x4 xmm5,zmm1,1
|
||
|
vextracti32x4 xmm6,zmm2,1
|
||
|
vextracti32x4 xmm7,zmm3,1
|
||
|
|
||
|
sub rdx,64
|
||
|
jb NEAR $L$tail_avx512
|
||
|
|
||
|
vpxor xmm4,xmm4,XMMWORD[rsi]
|
||
|
vpxor xmm5,xmm5,XMMWORD[16+rsi]
|
||
|
vpxor xmm6,xmm6,XMMWORD[32+rsi]
|
||
|
vpxor xmm7,xmm7,XMMWORD[48+rsi]
|
||
|
lea rsi,[64+rsi]
|
||
|
|
||
|
vmovdqu XMMWORD[rdi],xmm4
|
||
|
vmovdqu XMMWORD[16+rdi],xmm5
|
||
|
vmovdqu XMMWORD[32+rdi],xmm6
|
||
|
vmovdqu XMMWORD[48+rdi],xmm7
|
||
|
lea rdi,[64+rdi]
|
||
|
|
||
|
jz NEAR $L$done_avx512
|
||
|
|
||
|
; lane 2
vextracti32x4 xmm4,zmm0,2
|
||
|
vextracti32x4 xmm5,zmm1,2
|
||
|
vextracti32x4 xmm6,zmm2,2
|
||
|
vextracti32x4 xmm7,zmm3,2
|
||
|
|
||
|
sub rdx,64
|
||
|
jb NEAR $L$tail_avx512
|
||
|
|
||
|
vpxor xmm4,xmm4,XMMWORD[rsi]
|
||
|
vpxor xmm5,xmm5,XMMWORD[16+rsi]
|
||
|
vpxor xmm6,xmm6,XMMWORD[32+rsi]
|
||
|
vpxor xmm7,xmm7,XMMWORD[48+rsi]
|
||
|
lea rsi,[64+rsi]
|
||
|
|
||
|
vmovdqu XMMWORD[rdi],xmm4
|
||
|
vmovdqu XMMWORD[16+rdi],xmm5
|
||
|
vmovdqu XMMWORD[32+rdi],xmm6
|
||
|
vmovdqu XMMWORD[48+rdi],xmm7
|
||
|
lea rdi,[64+rdi]
|
||
|
|
||
|
jz NEAR $L$done_avx512
|
||
|
|
||
|
; lane 3
vextracti32x4 xmm4,zmm0,3
|
||
|
vextracti32x4 xmm5,zmm1,3
|
||
|
vextracti32x4 xmm6,zmm2,3
|
||
|
vextracti32x4 xmm7,zmm3,3
|
||
|
|
||
|
sub rdx,64
|
||
|
jb NEAR $L$tail_avx512
|
||
|
|
||
|
vpxor xmm4,xmm4,XMMWORD[rsi]
|
||
|
vpxor xmm5,xmm5,XMMWORD[16+rsi]
|
||
|
vpxor xmm6,xmm6,XMMWORD[32+rsi]
|
||
|
vpxor xmm7,xmm7,XMMWORD[48+rsi]
|
||
|
lea rsi,[64+rsi]
|
||
|
|
||
|
vmovdqu XMMWORD[rdi],xmm4
|
||
|
vmovdqu XMMWORD[16+rdi],xmm5
|
||
|
vmovdqu XMMWORD[32+rdi],xmm6
|
||
|
vmovdqu XMMWORD[48+rdi],xmm7
|
||
|
lea rdi,[64+rdi]
|
||
|
|
||
|
; bytes remain after all four lanes: generate the next four blocks
jnz NEAR $L$oop_outer_avx512
|
||
|
|
||
|
jmp NEAR $L$done_avx512
|
||
|
|
||
|
ALIGN 16
|
||
|
; partial block in lane 0: spill its keystream (still in xmm0..xmm3)
$L$tail64_avx512:
|
||
|
vmovdqa XMMWORD[rsp],xmm0
|
||
|
vmovdqa XMMWORD[16+rsp],xmm1
|
||
|
vmovdqa XMMWORD[32+rsp],xmm2
|
||
|
vmovdqa XMMWORD[48+rsp],xmm3
|
||
|
; undo the failed `sub rdx,64`: rdx = bytes still to process
add rdx,64
|
||
|
jmp NEAR $L$oop_tail_avx512
|
||
|
|
||
|
ALIGN 16
|
||
|
; partial block in lanes 1..3: keystream was extracted to xmm4..xmm7
$L$tail_avx512:
|
||
|
vmovdqa XMMWORD[rsp],xmm4
|
||
|
vmovdqa XMMWORD[16+rsp],xmm5
|
||
|
vmovdqa XMMWORD[32+rsp],xmm6
|
||
|
vmovdqa XMMWORD[48+rsp],xmm7
|
||
|
add rdx,64
|
||
|
|
||
|
|
||
|
; byte loop: out[i] = in[i] ^ stack[i]; r8 is the index and starts at 0
; because the round loop above only exits once `dec r8` reaches zero
$L$oop_tail_avx512:
|
||
|
movzx eax,BYTE[r8*1+rsi]
|
||
|
movzx ecx,BYTE[r8*1+rsp]
|
||
|
lea r8,[1+r8]
|
||
|
xor eax,ecx
|
||
|
mov BYTE[((-1))+r8*1+rdi],al
|
||
|
dec rdx
|
||
|
jnz NEAR $L$oop_tail_avx512
|
||
|
|
||
|
; overwrite the 64-byte stack copy of the keystream with zmm16
; (the broadcast $L$sigma row)
vmovdqu32 ZMMWORD[rsp],zmm16
|
||
|
|
||
|
$L$done_avx512:
|
||
|
vzeroall
|
||
|
; restore the saved xmm6/xmm7 and the entry stack pointer
movaps xmm6,XMMWORD[((-40))+r9]
|
||
|
movaps xmm7,XMMWORD[((-24))+r9]
|
||
|
lea rsp,[r9]
|
||
|
|
||
|
$L$avx512_epilogue:
|
||
|
mov rdi,QWORD[8+rsp] ;WIN64 epilogue
|
||
|
mov rsi,QWORD[16+rsp]
|
||
|
DB 0F3h,0C3h ;repret
|
||
|
|
||
|
$L$SEH_end_chacha20_avx512:
|
||
|
;-----------------------------------------------------------------------
; chacha20_avx512vl -- ChaCha20 keystream XOR, two 64-byte blocks per pass
; (one block per 128-bit lane of ymm0..ymm3).
;
; ABI:  Win64 entry. The prologue moves the arguments into the registers
;       the body works with:
;         rdi <- rcx       output pointer (results are stored to [rdi])
;         rsi <- rdx       input pointer  (data is loaded from [rsi])
;         rdx <- r8        remaining length in bytes
;         rcx <- r9        32 bytes loaded as state rows 1 and 2
;                          (presumably the key -- confirm against caller)
;         r8  <- [rsp+40]  16 bytes loaded as state row 3; dword 0 receives
;                          the per-lane block offsets (presumably block
;                          counter followed by nonce -- confirm)
; Path: lengths above 128 bytes branch to $L$chacha20_8xvl.
; Saves/restores: rdi, rsi (caller home space), xmm6, xmm7 (stack).
; Stack: 64 bytes at [rsp] hold keystream for a partial final block.
;
; NOTE(review): rdx == 0 is not checked in this block. With rdx == 0 the
;   first `sub rdx,64` borrows, the tail path restores rdx to 0 and the
;   byte loop then decrements past zero. Callers presumably guarantee a
;   non-zero length -- confirm.
; NOTE(review): the bare `|` / `||` lines interleaved with the code are
;   not NASM syntax; they look like table residue from an HTML export and
;   would have to be stripped before this file can be assembled.
;-----------------------------------------------------------------------
global chacha20_avx512vl
|
||
|
|
||
|
ALIGN 32
|
||
|
chacha20_avx512vl:
|
||
|
mov QWORD[8+rsp],rdi ;WIN64 prologue
|
||
|
mov QWORD[16+rsp],rsi
|
||
|
mov rax,rsp
|
||
|
$L$SEH_begin_chacha20_avx512vl:
|
||
|
mov rdi,rcx
|
||
|
mov rsi,rdx
|
||
|
mov rdx,r8
|
||
|
mov rcx,r9
|
||
|
; fifth argument is read from the stack
mov r8,QWORD[40+rsp]
|
||
|
|
||
|
|
||
|
|
||
|
$L$chacha20_avx512vl:
|
||
|
; r9 keeps the entry rsp; the epilogue restores rsp from it
mov r9,rsp
|
||
|
|
||
|
; more than 128 bytes: hand over to the 8xvl path
cmp rdx,128
|
||
|
ja NEAR $L$chacha20_8xvl
|
||
|
|
||
|
; 64 bytes of tail scratch at [rsp] + 40 bytes holding xmm6/xmm7
sub rsp,64+40
|
||
|
movaps XMMWORD[(-40)+r9],xmm6
|
||
|
movaps XMMWORD[(-24)+r9],xmm7
|
||
|
$L$avx512vl_body:
|
||
|
; replicate each 16-byte state row into both 128-bit lanes
vbroadcasti128 ymm0,XMMWORD[$L$sigma]
|
||
|
vbroadcasti128 ymm1,XMMWORD[rcx]
|
||
|
vbroadcasti128 ymm2,XMMWORD[16+rcx]
|
||
|
vbroadcasti128 ymm3,XMMWORD[r8]
|
||
|
|
||
|
; ymm16..ymm19 keep the input state for the post-round addition
vmovdqa32 ymm16,ymm0
|
||
|
vmovdqa32 ymm17,ymm1
|
||
|
vmovdqa32 ymm18,ymm2
|
||
|
; the first 32 bytes of $L$zeroz add 0 and 1 to dword 0 of lanes 0 and 1
vpaddd ymm3,ymm3,YMMWORD[$L$zeroz]
|
||
|
; ymm20 = per-pass increment (2 in dword 0 of both lanes)
vmovdqa32 ymm20,YMMWORD[$L$twoy]
|
||
|
; r8 is reused as the iteration counter: 10 passes of the loop body
mov r8,10
|
||
|
vmovdqa32 ymm19,ymm3
|
||
|
jmp NEAR $L$oop_avx512vl
|
||
|
|
||
|
ALIGN 16
|
||
|
; next pair of blocks: only rows 2 and 3 are reloaded here; rows 0 and 1
; are reloaded from ymm16/ymm17 right before the jump back to this label
$L$oop_outer_avx512vl:
|
||
|
vmovdqa32 ymm2,ymm18
|
||
|
vpaddd ymm3,ymm19,ymm20
|
||
|
mov r8,10
|
||
|
vmovdqa32 ymm19,ymm3
|
||
|
jmp NEAR $L$oop_avx512vl
|
||
|
|
||
|
ALIGN 32
|
||
|
; one iteration = two add/xor/rotate sequences (rotations 16,12,8,7),
; separated by dword shuffles of rows 1..3
$L$oop_avx512vl:
|
||
|
vpaddd ymm0,ymm0,ymm1
|
||
|
vpxor ymm3,ymm3,ymm0
|
||
|
vprold ymm3,ymm3,16
|
||
|
vpaddd ymm2,ymm2,ymm3
|
||
|
vpxor ymm1,ymm1,ymm2
|
||
|
vprold ymm1,ymm1,12
|
||
|
vpaddd ymm0,ymm0,ymm1
|
||
|
vpxor ymm3,ymm3,ymm0
|
||
|
vprold ymm3,ymm3,8
|
||
|
vpaddd ymm2,ymm2,ymm3
|
||
|
vpxor ymm1,ymm1,ymm2
|
||
|
vprold ymm1,ymm1,7
|
||
|
; rotate the dwords of rows 2, 1, 3 within each lane (imm 78, 57, 147)
vpshufd ymm2,ymm2,78
|
||
|
vpshufd ymm1,ymm1,57
|
||
|
vpshufd ymm3,ymm3,147
|
||
|
vpaddd ymm0,ymm0,ymm1
|
||
|
vpxor ymm3,ymm3,ymm0
|
||
|
vprold ymm3,ymm3,16
|
||
|
vpaddd ymm2,ymm2,ymm3
|
||
|
vpxor ymm1,ymm1,ymm2
|
||
|
vprold ymm1,ymm1,12
|
||
|
vpaddd ymm0,ymm0,ymm1
|
||
|
vpxor ymm3,ymm3,ymm0
|
||
|
vprold ymm3,ymm3,8
|
||
|
vpaddd ymm2,ymm2,ymm3
|
||
|
vpxor ymm1,ymm1,ymm2
|
||
|
vprold ymm1,ymm1,7
|
||
|
; inverse rotation (imm 78, 147, 57) puts the rows back in place
vpshufd ymm2,ymm2,78
|
||
|
vpshufd ymm1,ymm1,147
|
||
|
vpshufd ymm3,ymm3,57
|
||
|
dec r8
|
||
|
jnz NEAR $L$oop_avx512vl
|
||
|
; r8 == 0 from here on; add the saved input state to the round output
vpaddd ymm0,ymm0,ymm16
|
||
|
vpaddd ymm1,ymm1,ymm17
|
||
|
vpaddd ymm2,ymm2,ymm18
|
||
|
vpaddd ymm3,ymm3,ymm19
|
||
|
|
||
|
; lane 0: fewer than 64 bytes left -> byte-wise tail
sub rdx,64
|
||
|
jb NEAR $L$tail64_avx512vl
|
||
|
|
||
|
vpxor xmm4,xmm0,XMMWORD[rsi]
|
||
|
vpxor xmm5,xmm1,XMMWORD[16+rsi]
|
||
|
vpxor xmm6,xmm2,XMMWORD[32+rsi]
|
||
|
vpxor xmm7,xmm3,XMMWORD[48+rsi]
|
||
|
lea rsi,[64+rsi]
|
||
|
|
||
|
vmovdqu XMMWORD[rdi],xmm4
|
||
|
vmovdqu XMMWORD[16+rdi],xmm5
|
||
|
vmovdqu XMMWORD[32+rdi],xmm6
|
||
|
vmovdqu XMMWORD[48+rdi],xmm7
|
||
|
lea rdi,[64+rdi]
|
||
|
|
||
|
; ZF still comes from `sub rdx,64` above (the vector ops and lea in
; between do not write flags): zero means the input is fully consumed
jz NEAR $L$done_avx512vl
|
||
|
|
||
|
; lane 1
vextracti128 xmm4,ymm0,1
|
||
|
vextracti128 xmm5,ymm1,1
|
||
|
vextracti128 xmm6,ymm2,1
|
||
|
vextracti128 xmm7,ymm3,1
|
||
|
|
||
|
sub rdx,64
|
||
|
jb NEAR $L$tail_avx512vl
|
||
|
|
||
|
vpxor xmm4,xmm4,XMMWORD[rsi]
|
||
|
vpxor xmm5,xmm5,XMMWORD[16+rsi]
|
||
|
vpxor xmm6,xmm6,XMMWORD[32+rsi]
|
||
|
vpxor xmm7,xmm7,XMMWORD[48+rsi]
|
||
|
lea rsi,[64+rsi]
|
||
|
|
||
|
vmovdqu XMMWORD[rdi],xmm4
|
||
|
vmovdqu XMMWORD[16+rdi],xmm5
|
||
|
vmovdqu XMMWORD[32+rdi],xmm6
|
||
|
vmovdqu XMMWORD[48+rdi],xmm7
|
||
|
lea rdi,[64+rdi]
|
||
|
|
||
|
; reload rows 0 and 1 for the next pass; these moves leave the flags of
; the preceding `sub rdx,64` intact for the jnz below
vmovdqa32 ymm0,ymm16
|
||
|
vmovdqa32 ymm1,ymm17
|
||
|
; bytes remain after both lanes: generate the next two blocks
jnz NEAR $L$oop_outer_avx512vl
|
||
|
|
||
|
jmp NEAR $L$done_avx512vl
|
||
|
|
||
|
ALIGN 16
|
||
|
; partial block in lane 0: spill its keystream (still in xmm0..xmm3)
$L$tail64_avx512vl:
|
||
|
vmovdqa XMMWORD[rsp],xmm0
|
||
|
vmovdqa XMMWORD[16+rsp],xmm1
|
||
|
vmovdqa XMMWORD[32+rsp],xmm2
|
||
|
vmovdqa XMMWORD[48+rsp],xmm3
|
||
|
; undo the failed `sub rdx,64`: rdx = bytes still to process
add rdx,64
|
||
|
jmp NEAR $L$oop_tail_avx512vl
|
||
|
|
||
|
ALIGN 16
|
||
|
; partial block in lane 1: keystream was extracted to xmm4..xmm7
$L$tail_avx512vl:
|
||
|
vmovdqa XMMWORD[rsp],xmm4
|
||
|
vmovdqa XMMWORD[16+rsp],xmm5
|
||
|
vmovdqa XMMWORD[32+rsp],xmm6
|
||
|
vmovdqa XMMWORD[48+rsp],xmm7
|
||
|
add rdx,64
|
||
|
|
||
|
|
||
|
; byte loop: out[i] = in[i] ^ stack[i]; r8 is the index and starts at 0
; because the round loop above only exits once `dec r8` reaches zero
$L$oop_tail_avx512vl:
|
||
|
movzx eax,BYTE[r8*1+rsi]
|
||
|
movzx ecx,BYTE[r8*1+rsp]
|
||
|
lea r8,[1+r8]
|
||
|
xor eax,ecx
|
||
|
mov BYTE[((-1))+r8*1+rdi],al
|
||
|
dec rdx
|
||
|
jnz NEAR $L$oop_tail_avx512vl
|
||
|
|
||
|
; overwrite the 64-byte stack copy of the keystream with ymm16
; (the broadcast $L$sigma row), 32 bytes at a time
vmovdqu32 YMMWORD[rsp],ymm16
|
||
|
vmovdqu32 YMMWORD[32+rsp],ymm16
|
||
|
|
||
|
$L$done_avx512vl:
|
||
|
vzeroall
|
||
|
; restore the saved xmm6/xmm7 and the entry stack pointer
movaps xmm6,XMMWORD[((-40))+r9]
|
||
|
movaps xmm7,XMMWORD[((-24))+r9]
|
||
|
lea rsp,[r9]
|
||
|
|
||
|
$L$avx512vl_epilogue:
|
||
|
mov rdi,QWORD[8+rsp] ;WIN64 epilogue
|
||
|
mov rsi,QWORD[16+rsp]
|
||
|
DB 0F3h,0C3h ;repret
|
||
|
|
||
|
$L$SEH_end_chacha20_avx512vl:
|
||
|
global chacha20_16x
|
||
|
|
||
|
ALIGN 32
|
||
|
chacha20_16x:
|
||
|
mov QWORD[8+rsp],rdi ;WIN64 prologue
|
||
|
mov QWORD[16+rsp],rsi
|
||
|
mov rax,rsp
|
||
|
$L$SEH_begin_chacha20_16x:
|
||
|
mov rdi,rcx
|
||
|
mov rsi,rdx
|
||
|
mov rdx,r8
|
||
|
mov rcx,r9
|
||
|
mov r8,QWORD[40+rsp]
|
||
|
|
||
|
|
||
|
|
||
|
$L$chacha20_16x:
|
||
|
mov r9,rsp
|
||
|
|
||
|
sub rsp,64+168
|
||
|
and rsp,-64
|
||
|
movaps XMMWORD[(-168)+r9],xmm6
|
||
|
movaps XMMWORD[(-152)+r9],xmm7
|
||
|
movaps XMMWORD[(-136)+r9],xmm8
|
||
|
movaps XMMWORD[(-120)+r9],xmm9
|
||
|
movaps XMMWORD[(-104)+r9],xmm10
|
||
|
movaps XMMWORD[(-88)+r9],xmm11
|
||
|
movaps XMMWORD[(-72)+r9],xmm12
|
||
|
movaps XMMWORD[(-56)+r9],xmm13
|
||
|
movaps XMMWORD[(-40)+r9],xmm14
|
||
|
movaps XMMWORD[(-24)+r9],xmm15
|
||
|
$L$16x_body:
|
||
|
vzeroupper
|
||
|
|
||
|
lea r10,[$L$sigma]
|
||
|
vbroadcasti32x4 zmm3,ZMMWORD[r10]
|
||
|
vbroadcasti32x4 zmm7,ZMMWORD[rcx]
|
||
|
vbroadcasti32x4 zmm11,ZMMWORD[16+rcx]
|
||
|
vbroadcasti32x4 zmm15,ZMMWORD[r8]
|
||
|
|
||
|
vpshufd zmm0,zmm3,0x00
|
||
|
vpshufd zmm1,zmm3,0x55
|
||
|
vpshufd zmm2,zmm3,0xaa
|
||
|
vpshufd zmm3,zmm3,0xff
|
||
|
vmovdqa64 zmm16,zmm0
|
||
|
vmovdqa64 zmm17,zmm1
|
||
|
vmovdqa64 zmm18,zmm2
|
||
|
vmovdqa64 zmm19,zmm3
|
||
|
|
||
|
vpshufd zmm4,zmm7,0x00
|
||
|
vpshufd zmm5,zmm7,0x55
|
||
|
vpshufd zmm6,zmm7,0xaa
|
||
|
vpshufd zmm7,zmm7,0xff
|
||
|
vmovdqa64 zmm20,zmm4
|
||
|
vmovdqa64 zmm21,zmm5
|
||
|
vmovdqa64 zmm22,zmm6
|
||
|
vmovdqa64 zmm23,zmm7
|
||
|
|
||
|
vpshufd zmm8,zmm11,0x00
|
||
|
vpshufd zmm9,zmm11,0x55
|
||
|
vpshufd zmm10,zmm11,0xaa
|
||
|
vpshufd zmm11,zmm11,0xff
|
||
|
vmovdqa64 zmm24,zmm8
|
||
|
vmovdqa64 zmm25,zmm9
|
||
|
vmovdqa64 zmm26,zmm10
|
||
|
vmovdqa64 zmm27,zmm11
|
||
|
|
||
|
vpshufd zmm12,zmm15,0x00
|
||
|
vpshufd zmm13,zmm15,0x55
|
||
|
vpshufd zmm14,zmm15,0xaa
|
||
|
vpshufd zmm15,zmm15,0xff
|
||
|
vpaddd zmm12,zmm12,ZMMWORD[$L$incz]
|
||
|
vmovdqa64 zmm28,zmm12
|
||
|
vmovdqa64 zmm29,zmm13
|
||
|
vmovdqa64 zmm30,zmm14
|
||
|
vmovdqa64 zmm31,zmm15
|
||
|
|
||
|
mov eax,10
|
||
|
jmp NEAR $L$oop16x
|
||
|
|
||
|
ALIGN 32
|
||
|
$L$oop_outer16x:
|
||
|
vpbroadcastd zmm0,DWORD[r10]
|
||
|
vpbroadcastd zmm1,DWORD[4+r10]
|
||
|
vpbroadcastd zmm2,DWORD[8+r10]
|
||
|
vpbroadcastd zmm3,DWORD[12+r10]
|
||
|
vpaddd zmm28,zmm28,ZMMWORD[$L$sixteen]
|
||
|
vmovdqa64 zmm4,zmm20
|
||
|
vmovdqa64 zmm5,zmm21
|
||
|
vmovdqa64 zmm6,zmm22
|
||
|
vmovdqa64 zmm7,zmm23
|
||
|
vmovdqa64 zmm8,zmm24
|
||
|
vmovdqa64 zmm9,zmm25
|
||
|
vmovdqa64 zmm10,zmm26
|
||
|
vmovdqa64 zmm11,zmm27
|
||
|
vmovdqa64 zmm12,zmm28
|
||
|
vmovdqa64 zmm13,zmm29
|
||
|
vmovdqa64 zmm14,zmm30
|
||
|
vmovdqa64 zmm15,zmm31
|
||
|
|
||
|
vmovdqa64 zmm16,zmm0
|
||
|
vmovdqa64 zmm17,zmm1
|
||
|
vmovdqa64 zmm18,zmm2
|
||
|
vmovdqa64 zmm19,zmm3
|
||
|
|
||
|
mov eax,10
|
||
|
jmp NEAR $L$oop16x
|
||
|
|
||
|
ALIGN 32
|
||
|
$L$oop16x:
|
||
|
vpaddd zmm0,zmm0,zmm4
|
||
|
vpaddd zmm1,zmm1,zmm5
|
||
|
vpaddd zmm2,zmm2,zmm6
|
||
|
vpaddd zmm3,zmm3,zmm7
|
||
|
vpxord zmm12,zmm12,zmm0
|
||
|
vpxord zmm13,zmm13,zmm1
|
||
|
vpxord zmm14,zmm14,zmm2
|
||
|
vpxord zmm15,zmm15,zmm3
|
||
|
vprold zmm12,zmm12,16
|
||
|
vprold zmm13,zmm13,16
|
||
|
vprold zmm14,zmm14,16
|
||
|
vprold zmm15,zmm15,16
|
||
|
vpaddd zmm8,zmm8,zmm12
|
||
|
vpaddd zmm9,zmm9,zmm13
|
||
|
vpaddd zmm10,zmm10,zmm14
|
||
|
vpaddd zmm11,zmm11,zmm15
|
||
|
vpxord zmm4,zmm4,zmm8
|
||
|
vpxord zmm5,zmm5,zmm9
|
||
|
vpxord zmm6,zmm6,zmm10
|
||
|
vpxord zmm7,zmm7,zmm11
|
||
|
vprold zmm4,zmm4,12
|
||
|
vprold zmm5,zmm5,12
|
||
|
vprold zmm6,zmm6,12
|
||
|
vprold zmm7,zmm7,12
|
||
|
vpaddd zmm0,zmm0,zmm4
|
||
|
vpaddd zmm1,zmm1,zmm5
|
||
|
vpaddd zmm2,zmm2,zmm6
|
||
|
vpaddd zmm3,zmm3,zmm7
|
||
|
vpxord zmm12,zmm12,zmm0
|
||
|
vpxord zmm13,zmm13,zmm1
|
||
|
vpxord zmm14,zmm14,zmm2
|
||
|
vpxord zmm15,zmm15,zmm3
|
||
|
vprold zmm12,zmm12,8
|
||
|
vprold zmm13,zmm13,8
|
||
|
vprold zmm14,zmm14,8
|
||
|
vprold zmm15,zmm15,8
|
||
|
vpaddd zmm8,zmm8,zmm12
|
||
|
vpaddd zmm9,zmm9,zmm13
|
||
|
vpaddd zmm10,zmm10,zmm14
|
||
|
vpaddd zmm11,zmm11,zmm15
|
||
|
vpxord zmm4,zmm4,zmm8
|
||
|
vpxord zmm5,zmm5,zmm9
|
||
|
vpxord zmm6,zmm6,zmm10
|
||
|
vpxord zmm7,zmm7,zmm11
|
||
|
vprold zmm4,zmm4,7
|
||
|
vprold zmm5,zmm5,7
|
||
|
vprold zmm6,zmm6,7
|
||
|
vprold zmm7,zmm7,7
|
||
|
vpaddd zmm0,zmm0,zmm5
|
||
|
vpaddd zmm1,zmm1,zmm6
|
||
|
vpaddd zmm2,zmm2,zmm7
|
||
|
vpaddd zmm3,zmm3,zmm4
|
||
|
vpxord zmm15,zmm15,zmm0
|
||
|
vpxord zmm12,zmm12,zmm1
|
||
|
vpxord zmm13,zmm13,zmm2
|
||
|
vpxord zmm14,zmm14,zmm3
|
||
|
vprold zmm15,zmm15,16
|
||
|
vprold zmm12,zmm12,16
|
||
|
vprold zmm13,zmm13,16
|
||
|
vprold zmm14,zmm14,16
|
||
|
vpaddd zmm10,zmm10,zmm15
|
||
|
vpaddd zmm11,zmm11,zmm12
|
||
|
vpaddd zmm8,zmm8,zmm13
|
||
|
vpaddd zmm9,zmm9,zmm14
|
||
|
vpxord zmm5,zmm5,zmm10
|
||
|
vpxord zmm6,zmm6,zmm11
|
||
|
vpxord zmm7,zmm7,zmm8
|
||
|
vpxord zmm4,zmm4,zmm9
|
||
|
vprold zmm5,zmm5,12
|
||
|
vprold zmm6,zmm6,12
|
||
|
vprold zmm7,zmm7,12
|
||
|
vprold zmm4,zmm4,12
|
||
|
vpaddd zmm0,zmm0,zmm5
|
||
|
vpaddd zmm1,zmm1,zmm6
|
||
|
vpaddd zmm2,zmm2,zmm7
|
||
|
vpaddd zmm3,zmm3,zmm4
|
||
|
vpxord zmm15,zmm15,zmm0
|
||
|
vpxord zmm12,zmm12,zmm1
|
||
|
vpxord zmm13,zmm13,zmm2
|
||
|
vpxord zmm14,zmm14,zmm3
|
||
|
vprold zmm15,zmm15,8
|
||
|
vprold zmm12,zmm12,8
|
||
|
vprold zmm13,zmm13,8
|
||
|
vprold zmm14,zmm14,8
|
||
|
vpaddd zmm10,zmm10,zmm15
|
||
|
vpaddd zmm11,zmm11,zmm12
|
||
|
vpaddd zmm8,zmm8,zmm13
|
||
|
vpaddd zmm9,zmm9,zmm14
|
||
|
vpxord zmm5,zmm5,zmm10
|
||
|
vpxord zmm6,zmm6,zmm11
|
||
|
vpxord zmm7,zmm7,zmm8
|
||
|
vpxord zmm4,zmm4,zmm9
|
||
|
vprold zmm5,zmm5,7
|
||
|
vprold zmm6,zmm6,7
|
||
|
vprold zmm7,zmm7,7
|
||
|
vprold zmm4,zmm4,7
|
||
|
dec eax
|
||
|
jnz NEAR $L$oop16x
|
||
|
|
||
|
vpaddd zmm0,zmm0,zmm16
|
||
|
vpaddd zmm1,zmm1,zmm17
|
||
|
vpaddd zmm2,zmm2,zmm18
|
||
|
vpaddd zmm3,zmm3,zmm19
|
||
|
|
||
|
vpunpckldq zmm18,zmm0,zmm1
|
||
|
vpunpckldq zmm19,zmm2,zmm3
|
||
|
vpunpckhdq zmm0,zmm0,zmm1
|
||
|
vpunpckhdq zmm2,zmm2,zmm3
|
||
|
vpunpcklqdq zmm1,zmm18,zmm19
|
||
|
vpunpckhqdq zmm18,zmm18,zmm19
|
||
|
vpunpcklqdq zmm3,zmm0,zmm2
|
||
|
vpunpckhqdq zmm0,zmm0,zmm2
|
||
|
vpaddd zmm4,zmm4,zmm20
|
||
|
vpaddd zmm5,zmm5,zmm21
|
||
|
vpaddd zmm6,zmm6,zmm22
|
||
|
vpaddd zmm7,zmm7,zmm23
|
||
|
|
||
|
vpunpckldq zmm2,zmm4,zmm5
|
||
|
vpunpckldq zmm19,zmm6,zmm7
|
||
|
vpunpckhdq zmm4,zmm4,zmm5
|
||
|
vpunpckhdq zmm6,zmm6,zmm7
|
||
|
vpunpcklqdq zmm5,zmm2,zmm19
|
||
|
vpunpckhqdq zmm2,zmm2,zmm19
|
||
|
vpunpcklqdq zmm7,zmm4,zmm6
|
||
|
vpunpckhqdq zmm4,zmm4,zmm6
|
||
|
vshufi32x4 zmm19,zmm1,zmm5,0x44
|
||
|
vshufi32x4 zmm5,zmm1,zmm5,0xee
|
||
|
vshufi32x4 zmm1,zmm18,zmm2,0x44
|
||
|
vshufi32x4 zmm2,zmm18,zmm2,0xee
|
||
|
vshufi32x4 zmm18,zmm3,zmm7,0x44
|
||
|
vshufi32x4 zmm7,zmm3,zmm7,0xee
|
||
|
vshufi32x4 zmm3,zmm0,zmm4,0x44
|
||
|
vshufi32x4 zmm4,zmm0,zmm4,0xee
|
||
|
vpaddd zmm8,zmm8,zmm24
|
||
|
vpaddd zmm9,zmm9,zmm25
|
||
|
vpaddd zmm10,zmm10,zmm26
|
||
|
vpaddd zmm11,zmm11,zmm27
|
||
|
|
||
|
vpunpckldq zmm6,zmm8,zmm9
|
||
|
vpunpckldq zmm0,zmm10,zmm11
|
||
|
vpunpckhdq zmm8,zmm8,zmm9
|
||
|
vpunpckhdq zmm10,zmm10,zmm11
|
||
|
vpunpcklqdq zmm9,zmm6,zmm0
|
||
|
vpunpckhqdq zmm6,zmm6,zmm0
|
||
|
vpunpcklqdq zmm11,zmm8,zmm10
|
||
|
vpunpckhqdq zmm8,zmm8,zmm10
|
||
|
vpaddd zmm12,zmm12,zmm28
|
||
|
vpaddd zmm13,zmm13,zmm29
|
||
|
vpaddd zmm14,zmm14,zmm30
|
||
|
vpaddd zmm15,zmm15,zmm31
|
||
|
|
||
|
vpunpckldq zmm10,zmm12,zmm13
|
||
|
vpunpckldq zmm0,zmm14,zmm15
|
||
|
vpunpckhdq zmm12,zmm12,zmm13
|
||
|
vpunpckhdq zmm14,zmm14,zmm15
|
||
|
vpunpcklqdq zmm13,zmm10,zmm0
|
||
|
vpunpckhqdq zmm10,zmm10,zmm0
|
||
|
vpunpcklqdq zmm15,zmm12,zmm14
|
||
|
vpunpckhqdq zmm12,zmm12,zmm14
|
||
|
vshufi32x4 zmm0,zmm9,zmm13,0x44
|
||
|
vshufi32x4 zmm13,zmm9,zmm13,0xee
|
||
|
vshufi32x4 zmm9,zmm6,zmm10,0x44
|
||
|
vshufi32x4 zmm10,zmm6,zmm10,0xee
|
||
|
vshufi32x4 zmm6,zmm11,zmm15,0x44
|
||
|
vshufi32x4 zmm15,zmm11,zmm15,0xee
|
||
|
vshufi32x4 zmm11,zmm8,zmm12,0x44
|
||
|
vshufi32x4 zmm12,zmm8,zmm12,0xee
|
||
|
vshufi32x4 zmm16,zmm19,zmm0,0x88
|
||
|
vshufi32x4 zmm19,zmm19,zmm0,0xdd
|
||
|
vshufi32x4 zmm0,zmm5,zmm13,0x88
|
||
|
vshufi32x4 zmm13,zmm5,zmm13,0xdd
|
||
|
vshufi32x4 zmm17,zmm1,zmm9,0x88
|
||
|
vshufi32x4 zmm1,zmm1,zmm9,0xdd
|
||
|
vshufi32x4 zmm9,zmm2,zmm10,0x88
|
||
|
vshufi32x4 zmm10,zmm2,zmm10,0xdd
|
||
|
vshufi32x4 zmm14,zmm18,zmm6,0x88
|
||
|
vshufi32x4 zmm18,zmm18,zmm6,0xdd
|
||
|
vshufi32x4 zmm6,zmm7,zmm15,0x88
|
||
|
vshufi32x4 zmm15,zmm7,zmm15,0xdd
|
||
|
vshufi32x4 zmm8,zmm3,zmm11,0x88
|
||
|
vshufi32x4 zmm3,zmm3,zmm11,0xdd
|
||
|
vshufi32x4 zmm11,zmm4,zmm12,0x88
|
||
|
vshufi32x4 zmm12,zmm4,zmm12,0xdd
|
||
|
cmp rdx,64*16
|
||
|
jb NEAR $L$tail16x
|
||
|
|
||
|
vpxord zmm16,zmm16,ZMMWORD[rsi]
|
||
|
vpxord zmm17,zmm17,ZMMWORD[64+rsi]
|
||
|
vpxord zmm14,zmm14,ZMMWORD[128+rsi]
|
||
|
vpxord zmm8,zmm8,ZMMWORD[192+rsi]
|
||
|
vmovdqu32 ZMMWORD[rdi],zmm16
|
||
|
vmovdqu32 ZMMWORD[64+rdi],zmm17
|
||
|
vmovdqu32 ZMMWORD[128+rdi],zmm14
|
||
|
vmovdqu32 ZMMWORD[192+rdi],zmm8
|
||
|
|
||
|
vpxord zmm19,zmm19,ZMMWORD[256+rsi]
|
||
|
vpxord zmm1,zmm1,ZMMWORD[320+rsi]
|
||
|
vpxord zmm18,zmm18,ZMMWORD[384+rsi]
|
||
|
vpxord zmm3,zmm3,ZMMWORD[448+rsi]
|
||
|
vmovdqu32 ZMMWORD[256+rdi],zmm19
|
||
|
vmovdqu32 ZMMWORD[320+rdi],zmm1
|
||
|
vmovdqu32 ZMMWORD[384+rdi],zmm18
|
||
|
vmovdqu32 ZMMWORD[448+rdi],zmm3
|
||
|
|
||
|
vpxord zmm0,zmm0,ZMMWORD[512+rsi]
|
||
|
vpxord zmm9,zmm9,ZMMWORD[576+rsi]
|
||
|
vpxord zmm6,zmm6,ZMMWORD[640+rsi]
|
||
|
vpxord zmm11,zmm11,ZMMWORD[704+rsi]
|
||
|
vmovdqu32 ZMMWORD[512+rdi],zmm0
|
||
|
vmovdqu32 ZMMWORD[576+rdi],zmm9
|
||
|
vmovdqu32 ZMMWORD[640+rdi],zmm6
|
||
|
vmovdqu32 ZMMWORD[704+rdi],zmm11
|
||
|
|
||
|
vpxord zmm13,zmm13,ZMMWORD[768+rsi]
|
||
|
vpxord zmm10,zmm10,ZMMWORD[832+rsi]
|
||
|
vpxord zmm15,zmm15,ZMMWORD[896+rsi]
|
||
|
vpxord zmm12,zmm12,ZMMWORD[960+rsi]
|
||
|
lea rsi,[1024+rsi]
|
||
|
vmovdqu32 ZMMWORD[768+rdi],zmm13
|
||
|
vmovdqu32 ZMMWORD[832+rdi],zmm10
|
||
|
vmovdqu32 ZMMWORD[896+rdi],zmm15
|
||
|
vmovdqu32 ZMMWORD[960+rdi],zmm12
|
||
|
lea rdi,[1024+rdi]
|
||
|
|
||
|
sub rdx,64*16
|
||
|
jnz NEAR $L$oop_outer16x
|
||
|
|
||
|
jmp NEAR $L$done16x
|
||
|
|
||
|
ALIGN 32
|
||
|
$L$tail16x:
|
||
|
xor r10,r10
|
||
|
sub rdi,rsi
|
||
|
cmp rdx,64*1
|
||
|
jb NEAR $L$ess_than_64_16x
|
||
|
vpxord zmm16,zmm16,ZMMWORD[rsi]
|
||
|
vmovdqu32 ZMMWORD[rsi*1+rdi],zmm16
|
||
|
je NEAR $L$done16x
|
||
|
vmovdqa32 zmm16,zmm17
|
||
|
lea rsi,[64+rsi]
|
||
|
|
||
|
cmp rdx,64*2
|
||
|
jb NEAR $L$ess_than_64_16x
|
||
|
vpxord zmm17,zmm17,ZMMWORD[rsi]
|
||
|
vmovdqu32 ZMMWORD[rsi*1+rdi],zmm17
|
||
|
je NEAR $L$done16x
|
||
|
vmovdqa32 zmm16,zmm14
|
||
|
lea rsi,[64+rsi]
|
||
|
|
||
|
cmp rdx,64*3
|
||
|
jb NEAR $L$ess_than_64_16x
|
||
|
vpxord zmm14,zmm14,ZMMWORD[rsi]
|
||
|
vmovdqu32 ZMMWORD[rsi*1+rdi],zmm14
|
||
|
je NEAR $L$done16x
|
||
|
vmovdqa32 zmm16,zmm8
|
||
|
lea rsi,[64+rsi]
|
||
|
|
||
|
cmp rdx,64*4
|
||
|
jb NEAR $L$ess_than_64_16x
|
||
|
vpxord zmm8,zmm8,ZMMWORD[rsi]
|
||
|
vmovdqu32 ZMMWORD[rsi*1+rdi],zmm8
|
||
|
je NEAR $L$done16x
|
||
|
vmovdqa32 zmm16,zmm19
|
||
|
lea rsi,[64+rsi]
|
||
|
|
||
|
cmp rdx,64*5
|
||
|
jb NEAR $L$ess_than_64_16x
|
||
|
vpxord zmm19,zmm19,ZMMWORD[rsi]
|
||
|
vmovdqu32 ZMMWORD[rsi*1+rdi],zmm19
|
||
|
je NEAR $L$done16x
|
||
|
vmovdqa32 zmm16,zmm1
|
||
|
lea rsi,[64+rsi]
|
||
|
|
||
|
cmp rdx,64*6
|
||
|
jb NEAR $L$ess_than_64_16x
|
||
|
vpxord zmm1,zmm1,ZMMWORD[rsi]
|
||
|
vmovdqu32 ZMMWORD[rsi*1+rdi],zmm1
|
||
|
je NEAR $L$done16x
|
||
|
vmovdqa32 zmm16,zmm18
|
||
|
lea rsi,[64+rsi]
|
||
|
|
||
|
cmp rdx,64*7
|
||
|
jb NEAR $L$ess_than_64_16x
|
||
|
vpxord zmm18,zmm18,ZMMWORD[rsi]
|
||
|
vmovdqu32 ZMMWORD[rsi*1+rdi],zmm18
|
||
|
je NEAR $L$done16x
|
||
|
vmovdqa32 zmm16,zmm3
|
||
|
lea rsi,[64+rsi]
|
||
|
|
||
|
cmp rdx,64*8
|
||
|
jb NEAR $L$ess_than_64_16x
|
||
|
vpxord zmm3,zmm3,ZMMWORD[rsi]
|
||
|
vmovdqu32 ZMMWORD[rsi*1+rdi],zmm3
|
||
|
je NEAR $L$done16x
|
||
|
vmovdqa32 zmm16,zmm0
|
||
|
lea rsi,[64+rsi]
|
||
|
|
||
|
cmp rdx,64*9
|
||
|
jb NEAR $L$ess_than_64_16x
|
||
|
vpxord zmm0,zmm0,ZMMWORD[rsi]
|
||
|
vmovdqu32 ZMMWORD[rsi*1+rdi],zmm0
|
||
|
je NEAR $L$done16x
|
||
|
vmovdqa32 zmm16,zmm9
|
||
|
lea rsi,[64+rsi]
|
||
|
|
||
|
cmp rdx,64*10
|
||
|
jb NEAR $L$ess_than_64_16x
|
||
|
vpxord zmm9,zmm9,ZMMWORD[rsi]
|
||
|
vmovdqu32 ZMMWORD[rsi*1+rdi],zmm9
|
||
|
je NEAR $L$done16x
|
||
|
vmovdqa32 zmm16,zmm6
|
||
|
lea rsi,[64+rsi]
|
||
|
|
||
|
cmp rdx,64*11
|
||
|
jb NEAR $L$ess_than_64_16x
|
||
|
vpxord zmm6,zmm6,ZMMWORD[rsi]
|
||
|
vmovdqu32 ZMMWORD[rsi*1+rdi],zmm6
|
||
|
je NEAR $L$done16x
|
||
|
vmovdqa32 zmm16,zmm11
|
||
|
lea rsi,[64+rsi]
|
||
|
|
||
|
cmp rdx,64*12
|
||
|
jb NEAR $L$ess_than_64_16x
|
||
|
vpxord zmm11,zmm11,ZMMWORD[rsi]
|
||
|
vmovdqu32 ZMMWORD[rsi*1+rdi],zmm11
|
||
|
je NEAR $L$done16x
|
||
|
vmovdqa32 zmm16,zmm13
|
||
|
lea rsi,[64+rsi]
|
||
|
|
||
|
cmp rdx,64*13
|
||
|
jb NEAR $L$ess_than_64_16x
|
||
|
vpxord zmm13,zmm13,ZMMWORD[rsi]
|
||
|
vmovdqu32 ZMMWORD[rsi*1+rdi],zmm13
|
||
|
je NEAR $L$done16x
|
||
|
vmovdqa32 zmm16,zmm10
|
||
|
lea rsi,[64+rsi]
|
||
|
|
||
|
cmp rdx,64*14
|
||
|
jb NEAR $L$ess_than_64_16x
|
||
|
vpxord zmm10,zmm10,ZMMWORD[rsi]
|
||
|
vmovdqu32 ZMMWORD[rsi*1+rdi],zmm10
|
||
|
je NEAR $L$done16x
|
||
|
vmovdqa32 zmm16,zmm15
|
||
|
lea rsi,[64+rsi]
|
||
|
|
||
|
cmp rdx,64*15
|
||
|
jb NEAR $L$ess_than_64_16x
|
||
|
vpxord zmm15,zmm15,ZMMWORD[rsi]
|
||
|
vmovdqu32 ZMMWORD[rsi*1+rdi],zmm15
|
||
|
je NEAR $L$done16x
|
||
|
vmovdqa32 zmm16,zmm12
|
||
|
lea rsi,[64+rsi]
|
||
|
|
||
|
$L$ess_than_64_16x:
|
||
|
vmovdqa32 ZMMWORD[rsp],zmm16
|
||
|
lea rdi,[rsi*1+rdi]
|
||
|
and rdx,63
|
||
|
|
||
|
$L$oop_tail16x:
|
||
|
movzx eax,BYTE[r10*1+rsi]
|
||
|
movzx ecx,BYTE[r10*1+rsp]
|
||
|
lea r10,[1+r10]
|
||
|
xor eax,ecx
|
||
|
mov BYTE[((-1))+r10*1+rdi],al
|
||
|
dec rdx
|
||
|
jnz NEAR $L$oop_tail16x
|
||
|
|
||
|
vpxord zmm16,zmm16,zmm16
|
||
|
vmovdqa32 ZMMWORD[rsp],zmm16
|
||
|
|
||
|
$L$done16x:
|
||
|
vzeroall
|
||
|
movaps xmm6,XMMWORD[((-168))+r9]
|
||
|
movaps xmm7,XMMWORD[((-152))+r9]
|
||
|
movaps xmm8,XMMWORD[((-136))+r9]
|
||
|
movaps xmm9,XMMWORD[((-120))+r9]
|
||
|
movaps xmm10,XMMWORD[((-104))+r9]
|
||
|
movaps xmm11,XMMWORD[((-88))+r9]
|
||
|
movaps xmm12,XMMWORD[((-72))+r9]
|
||
|
movaps xmm13,XMMWORD[((-56))+r9]
|
||
|
movaps xmm14,XMMWORD[((-40))+r9]
|
||
|
movaps xmm15,XMMWORD[((-24))+r9]
|
||
|
lea rsp,[r9]
|
||
|
|
||
|
$L$16x_epilogue:
|
||
|
mov rdi,QWORD[8+rsp] ;WIN64 epilogue
|
||
|
mov rsi,QWORD[16+rsp]
|
||
|
DB 0F3h,0C3h ;repret
|
||
|
|
||
|
$L$SEH_end_chacha20_16x:
|
||
|
;-----------------------------------------------------------------------
; chacha20_8xvl -- XOR a buffer with ChaCha20 keystream, eight 64-byte
;                  blocks per pass, using 256-bit EVEX-encoded
;                  instructions (ymm16-ymm31, vprold, vshufi32x4).
;
; ABI:   Win64. Arguments arrive in rcx, rdx, r8, r9, [rsp+40] and are
;        remapped below to the register roles used by the body:
;          rdi = output pointer
;          rsi = input pointer
;          rdx = length in bytes
;          rcx = pointer to 32 bytes of key material (state words 4..11)
;          r8  = pointer to 16 bytes for state words 12..15 (read only)
; Out:   nothing in registers; rdx bytes are written at the output.
; Saves: rdi/rsi (in the caller's shadow space), xmm6-xmm15 (below r9).
; Stack: r9 keeps the entry rsp; rsp is lowered by at least 64+168 and
;        aligned to 64.  [rsp, rsp+64) is keystream scratch for the
;        sub-64-byte tail and is wiped before return.
; Regs:  ymm0-ymm15  = working state, one state word per register, one
;                      block per 32-bit lane
;        ymm16-ymm31 = pristine copy of that state for the final add
;        r10         = &$L$sigma in the main loop, byte index in the tail
;        eax         = double-round counter (10), later the 0x80 stride
;
; Only state word 12 receives per-lane offsets ($L$incy) and the +8 step
; ($L$eight), i.e. the block counter is treated as 32 bits wide.
;-----------------------------------------------------------------------
global chacha20_8xvl

ALIGN 32
chacha20_8xvl:
        mov QWORD[8+rsp],rdi ;WIN64 prologue
        mov QWORD[16+rsp],rsi
        mov rax,rsp                     ; entry rsp; the SEH handler reads it back from context->Rax
$L$SEH_begin_chacha20_8xvl:
        mov rdi,rcx
        mov rsi,rdx
        mov rdx,r8
        mov rcx,r9
        mov r8,QWORD[40+rsp]            ; fifth argument lives above the shadow space

; Internal entry point: registers are already in rdi/rsi/rdx/rcx/r8 form.
$L$chacha20_8xvl:
        mov r9,rsp                      ; frame register, used by the epilogue and the SEH handler

        ; Zero-length guard.  Without it len == 0 reaches $L$oop_tail8xvl
        ; with rdx == 0, "dec rdx" wraps and the byte loop runs off both
        ; buffers.  rsp still equals r9 here, so the epilogue can be
        ; entered directly.
        test rdx,rdx
        jz NEAR $L$8xvl_epilogue

        sub rsp,64+168                  ; 64 bytes tail scratch + 168 bytes xmm save area
        and rsp,-64                     ; scratch must be aligned for the vmovdqa stores below
        movaps XMMWORD[(-168)+r9],xmm6
        movaps XMMWORD[(-152)+r9],xmm7
        movaps XMMWORD[(-136)+r9],xmm8
        movaps XMMWORD[(-120)+r9],xmm9
        movaps XMMWORD[(-104)+r9],xmm10
        movaps XMMWORD[(-88)+r9],xmm11
        movaps XMMWORD[(-72)+r9],xmm12
        movaps XMMWORD[(-56)+r9],xmm13
        movaps XMMWORD[(-40)+r9],xmm14
        movaps XMMWORD[(-24)+r9],xmm15
$L$8xvl_body:
        vzeroupper

        ; Load the four 16-byte state rows, each duplicated in both
        ; 128-bit halves.
        lea r10,[$L$sigma]
        vbroadcasti128 ymm3,XMMWORD[r10]
        vbroadcasti128 ymm7,XMMWORD[rcx]
        vbroadcasti128 ymm11,XMMWORD[16+rcx]
        vbroadcasti128 ymm15,XMMWORD[r8]

        ; Row 0 (constants): splat each word across all lanes, keep a copy.
        vpshufd ymm0,ymm3,0x00
        vpshufd ymm1,ymm3,0x55
        vpshufd ymm2,ymm3,0xaa
        vpshufd ymm3,ymm3,0xff
        vmovdqa64 ymm16,ymm0
        vmovdqa64 ymm17,ymm1
        vmovdqa64 ymm18,ymm2
        vmovdqa64 ymm19,ymm3

        ; Row 1 (first 16 bytes at rcx).
        vpshufd ymm4,ymm7,0x00
        vpshufd ymm5,ymm7,0x55
        vpshufd ymm6,ymm7,0xaa
        vpshufd ymm7,ymm7,0xff
        vmovdqa64 ymm20,ymm4
        vmovdqa64 ymm21,ymm5
        vmovdqa64 ymm22,ymm6
        vmovdqa64 ymm23,ymm7

        ; Row 2 (second 16 bytes at rcx).
        vpshufd ymm8,ymm11,0x00
        vpshufd ymm9,ymm11,0x55
        vpshufd ymm10,ymm11,0xaa
        vpshufd ymm11,ymm11,0xff
        vmovdqa64 ymm24,ymm8
        vmovdqa64 ymm25,ymm9
        vmovdqa64 ymm26,ymm10
        vmovdqa64 ymm27,ymm11

        ; Row 3 (16 bytes at r8); word 12 gets the per-lane offsets.
        vpshufd ymm12,ymm15,0x00
        vpshufd ymm13,ymm15,0x55
        vpshufd ymm14,ymm15,0xaa
        vpshufd ymm15,ymm15,0xff
        vpaddd ymm12,ymm12,YMMWORD[$L$incy]
        vmovdqa64 ymm28,ymm12
        vmovdqa64 ymm29,ymm13
        vmovdqa64 ymm30,ymm14
        vmovdqa64 ymm31,ymm15

        mov eax,10                      ; 10 iterations of (column round + diagonal round)
        jmp NEAR $L$oop8xvl

; Re-arm the working state for the next 512 bytes.  ymm0/ymm1 were
; already reloaded just before the branch that comes here; ymm18/ymm19
; were consumed as transpose temporaries, so the row-0 copies are rebuilt.
ALIGN 32
$L$oop_outer8xvl:
        vpbroadcastd ymm2,DWORD[8+r10]
        vpbroadcastd ymm3,DWORD[12+r10]
        vpaddd ymm28,ymm28,YMMWORD[$L$eight] ; advance every lane's word 12 by 8 blocks
        vmovdqa64 ymm4,ymm20
        vmovdqa64 ymm5,ymm21
        vmovdqa64 ymm6,ymm22
        vmovdqa64 ymm7,ymm23
        vmovdqa64 ymm8,ymm24
        vmovdqa64 ymm9,ymm25
        vmovdqa64 ymm10,ymm26
        vmovdqa64 ymm11,ymm27
        vmovdqa64 ymm12,ymm28
        vmovdqa64 ymm13,ymm29
        vmovdqa64 ymm14,ymm30
        vmovdqa64 ymm15,ymm31

        vmovdqa64 ymm16,ymm0
        vmovdqa64 ymm17,ymm1
        vmovdqa64 ymm18,ymm2
        vmovdqa64 ymm19,ymm3

        mov eax,10
        jmp NEAR $L$oop8xvl

; One iteration = one double round.  Naming words a=ymm0-3, b=ymm4-7,
; c=ymm8-11, d=ymm12-15, each quarter-round step is
;   a += b; d ^= a; d <<<= 16;  c += d; b ^= c; b <<<= 12;
;   a += b; d ^= a; d <<<= 8;   c += d; b ^= c; b <<<= 7;
ALIGN 32
$L$oop8xvl:
        ; ---- column round: (a0,b0,c0,d0) (a1,b1,c1,d1) ... ----
        vpaddd ymm0,ymm0,ymm4
        vpaddd ymm1,ymm1,ymm5
        vpaddd ymm2,ymm2,ymm6
        vpaddd ymm3,ymm3,ymm7
        vpxor ymm12,ymm12,ymm0
        vpxor ymm13,ymm13,ymm1
        vpxor ymm14,ymm14,ymm2
        vpxor ymm15,ymm15,ymm3
        vprold ymm12,ymm12,16
        vprold ymm13,ymm13,16
        vprold ymm14,ymm14,16
        vprold ymm15,ymm15,16
        vpaddd ymm8,ymm8,ymm12
        vpaddd ymm9,ymm9,ymm13
        vpaddd ymm10,ymm10,ymm14
        vpaddd ymm11,ymm11,ymm15
        vpxor ymm4,ymm4,ymm8
        vpxor ymm5,ymm5,ymm9
        vpxor ymm6,ymm6,ymm10
        vpxor ymm7,ymm7,ymm11
        vprold ymm4,ymm4,12
        vprold ymm5,ymm5,12
        vprold ymm6,ymm6,12
        vprold ymm7,ymm7,12
        vpaddd ymm0,ymm0,ymm4
        vpaddd ymm1,ymm1,ymm5
        vpaddd ymm2,ymm2,ymm6
        vpaddd ymm3,ymm3,ymm7
        vpxor ymm12,ymm12,ymm0
        vpxor ymm13,ymm13,ymm1
        vpxor ymm14,ymm14,ymm2
        vpxor ymm15,ymm15,ymm3
        vprold ymm12,ymm12,8
        vprold ymm13,ymm13,8
        vprold ymm14,ymm14,8
        vprold ymm15,ymm15,8
        vpaddd ymm8,ymm8,ymm12
        vpaddd ymm9,ymm9,ymm13
        vpaddd ymm10,ymm10,ymm14
        vpaddd ymm11,ymm11,ymm15
        vpxor ymm4,ymm4,ymm8
        vpxor ymm5,ymm5,ymm9
        vpxor ymm6,ymm6,ymm10
        vpxor ymm7,ymm7,ymm11
        vprold ymm4,ymm4,7
        vprold ymm5,ymm5,7
        vprold ymm6,ymm6,7
        vprold ymm7,ymm7,7
        ; ---- diagonal round: (a0,b1,c2,d3) (a1,b2,c3,d0) ... ----
        vpaddd ymm0,ymm0,ymm5
        vpaddd ymm1,ymm1,ymm6
        vpaddd ymm2,ymm2,ymm7
        vpaddd ymm3,ymm3,ymm4
        vpxor ymm15,ymm15,ymm0
        vpxor ymm12,ymm12,ymm1
        vpxor ymm13,ymm13,ymm2
        vpxor ymm14,ymm14,ymm3
        vprold ymm15,ymm15,16
        vprold ymm12,ymm12,16
        vprold ymm13,ymm13,16
        vprold ymm14,ymm14,16
        vpaddd ymm10,ymm10,ymm15
        vpaddd ymm11,ymm11,ymm12
        vpaddd ymm8,ymm8,ymm13
        vpaddd ymm9,ymm9,ymm14
        vpxor ymm5,ymm5,ymm10
        vpxor ymm6,ymm6,ymm11
        vpxor ymm7,ymm7,ymm8
        vpxor ymm4,ymm4,ymm9
        vprold ymm5,ymm5,12
        vprold ymm6,ymm6,12
        vprold ymm7,ymm7,12
        vprold ymm4,ymm4,12
        vpaddd ymm0,ymm0,ymm5
        vpaddd ymm1,ymm1,ymm6
        vpaddd ymm2,ymm2,ymm7
        vpaddd ymm3,ymm3,ymm4
        vpxor ymm15,ymm15,ymm0
        vpxor ymm12,ymm12,ymm1
        vpxor ymm13,ymm13,ymm2
        vpxor ymm14,ymm14,ymm3
        vprold ymm15,ymm15,8
        vprold ymm12,ymm12,8
        vprold ymm13,ymm13,8
        vprold ymm14,ymm14,8
        vpaddd ymm10,ymm10,ymm15
        vpaddd ymm11,ymm11,ymm12
        vpaddd ymm8,ymm8,ymm13
        vpaddd ymm9,ymm9,ymm14
        vpxor ymm5,ymm5,ymm10
        vpxor ymm6,ymm6,ymm11
        vpxor ymm7,ymm7,ymm8
        vpxor ymm4,ymm4,ymm9
        vprold ymm5,ymm5,7
        vprold ymm6,ymm6,7
        vprold ymm7,ymm7,7
        vprold ymm4,ymm4,7
        dec eax
        jnz NEAR $L$oop8xvl

        ; Feed-forward: add the saved input state, then transpose from
        ; "one word per register" to contiguous keystream.  ymm18/ymm19
        ; (and later ymm0) double as temporaries from here on.
        vpaddd ymm0,ymm0,ymm16
        vpaddd ymm1,ymm1,ymm17
        vpaddd ymm2,ymm2,ymm18
        vpaddd ymm3,ymm3,ymm19

        vpunpckldq ymm18,ymm0,ymm1
        vpunpckldq ymm19,ymm2,ymm3
        vpunpckhdq ymm0,ymm0,ymm1
        vpunpckhdq ymm2,ymm2,ymm3
        vpunpcklqdq ymm1,ymm18,ymm19
        vpunpckhqdq ymm18,ymm18,ymm19
        vpunpcklqdq ymm3,ymm0,ymm2
        vpunpckhqdq ymm0,ymm0,ymm2
        vpaddd ymm4,ymm4,ymm20
        vpaddd ymm5,ymm5,ymm21
        vpaddd ymm6,ymm6,ymm22
        vpaddd ymm7,ymm7,ymm23

        vpunpckldq ymm2,ymm4,ymm5
        vpunpckldq ymm19,ymm6,ymm7
        vpunpckhdq ymm4,ymm4,ymm5
        vpunpckhdq ymm6,ymm6,ymm7
        vpunpcklqdq ymm5,ymm2,ymm19
        vpunpckhqdq ymm2,ymm2,ymm19
        vpunpcklqdq ymm7,ymm4,ymm6
        vpunpckhqdq ymm4,ymm4,ymm6
        ; Pair up 128-bit halves of rows 0 and 1.
        vshufi32x4 ymm19,ymm1,ymm5,0
        vshufi32x4 ymm5,ymm1,ymm5,3
        vshufi32x4 ymm1,ymm18,ymm2,0
        vshufi32x4 ymm2,ymm18,ymm2,3
        vshufi32x4 ymm18,ymm3,ymm7,0
        vshufi32x4 ymm7,ymm3,ymm7,3
        vshufi32x4 ymm3,ymm0,ymm4,0
        vshufi32x4 ymm4,ymm0,ymm4,3
        vpaddd ymm8,ymm8,ymm24
        vpaddd ymm9,ymm9,ymm25
        vpaddd ymm10,ymm10,ymm26
        vpaddd ymm11,ymm11,ymm27

        vpunpckldq ymm6,ymm8,ymm9
        vpunpckldq ymm0,ymm10,ymm11
        vpunpckhdq ymm8,ymm8,ymm9
        vpunpckhdq ymm10,ymm10,ymm11
        vpunpcklqdq ymm9,ymm6,ymm0
        vpunpckhqdq ymm6,ymm6,ymm0
        vpunpcklqdq ymm11,ymm8,ymm10
        vpunpckhqdq ymm8,ymm8,ymm10
        vpaddd ymm12,ymm12,ymm28
        vpaddd ymm13,ymm13,ymm29
        vpaddd ymm14,ymm14,ymm30
        vpaddd ymm15,ymm15,ymm31

        vpunpckldq ymm10,ymm12,ymm13
        vpunpckldq ymm0,ymm14,ymm15
        vpunpckhdq ymm12,ymm12,ymm13
        vpunpckhdq ymm14,ymm14,ymm15
        vpunpcklqdq ymm13,ymm10,ymm0
        vpunpckhqdq ymm10,ymm10,ymm0
        vpunpcklqdq ymm15,ymm12,ymm14
        vpunpckhqdq ymm12,ymm12,ymm14
        ; Pair up 128-bit halves of rows 2 and 3.
        vperm2i128 ymm0,ymm9,ymm13,0x20
        vperm2i128 ymm13,ymm9,ymm13,0x31
        vperm2i128 ymm9,ymm6,ymm10,0x20
        vperm2i128 ymm10,ymm6,ymm10,0x31
        vperm2i128 ymm6,ymm11,ymm15,0x20
        vperm2i128 ymm15,ymm11,ymm15,0x31
        vperm2i128 ymm11,ymm8,ymm12,0x20
        vperm2i128 ymm12,ymm8,ymm12,0x31
        ; Keystream order, 32 bytes each:
        ;   ymm19,ymm0,ymm5,ymm13, ymm1,ymm9,ymm2,ymm10,
        ;   ymm18,ymm6,ymm7,ymm15, ymm3,ymm11,ymm4,ymm12
        cmp rdx,64*8
        jb NEAR $L$tail8xvl

        ; ---- full 512-byte pass, four groups of 128 bytes ----
        mov eax,0x80                    ; 128-byte pointer stride
        vpxord ymm19,ymm19,YMMWORD[rsi]
        vpxor ymm0,ymm0,YMMWORD[32+rsi]
        vpxor ymm5,ymm5,YMMWORD[64+rsi]
        vpxor ymm13,ymm13,YMMWORD[96+rsi]
        lea rsi,[rax*1+rsi]
        vmovdqu32 YMMWORD[rdi],ymm19
        vmovdqu YMMWORD[32+rdi],ymm0
        vmovdqu YMMWORD[64+rdi],ymm5
        vmovdqu YMMWORD[96+rdi],ymm13
        lea rdi,[rax*1+rdi]

        vpxor ymm1,ymm1,YMMWORD[rsi]
        vpxor ymm9,ymm9,YMMWORD[32+rsi]
        vpxor ymm2,ymm2,YMMWORD[64+rsi]
        vpxor ymm10,ymm10,YMMWORD[96+rsi]
        lea rsi,[rax*1+rsi]
        vmovdqu YMMWORD[rdi],ymm1
        vmovdqu YMMWORD[32+rdi],ymm9
        vmovdqu YMMWORD[64+rdi],ymm2
        vmovdqu YMMWORD[96+rdi],ymm10
        lea rdi,[rax*1+rdi]

        vpxord ymm18,ymm18,YMMWORD[rsi]
        vpxor ymm6,ymm6,YMMWORD[32+rsi]
        vpxor ymm7,ymm7,YMMWORD[64+rsi]
        vpxor ymm15,ymm15,YMMWORD[96+rsi]
        lea rsi,[rax*1+rsi]
        vmovdqu32 YMMWORD[rdi],ymm18
        vmovdqu YMMWORD[32+rdi],ymm6
        vmovdqu YMMWORD[64+rdi],ymm7
        vmovdqu YMMWORD[96+rdi],ymm15
        lea rdi,[rax*1+rdi]

        vpxor ymm3,ymm3,YMMWORD[rsi]
        vpxor ymm11,ymm11,YMMWORD[32+rsi]
        vpxor ymm4,ymm4,YMMWORD[64+rsi]
        vpxor ymm12,ymm12,YMMWORD[96+rsi]
        lea rsi,[rax*1+rsi]
        vmovdqu YMMWORD[rdi],ymm3
        vmovdqu YMMWORD[32+rdi],ymm11
        vmovdqu YMMWORD[64+rdi],ymm4
        vmovdqu YMMWORD[96+rdi],ymm12
        lea rdi,[rax*1+rdi]

        ; Reload constant words 0 and 1; $L$oop_outer8xvl reloads 2 and 3.
        vpbroadcastd ymm0,DWORD[r10]
        vpbroadcastd ymm1,DWORD[4+r10]

        sub rdx,64*8
        jnz NEAR $L$oop_outer8xvl

        jmp NEAR $L$done8xvl

; ---- fewer than 512 bytes remain ----
; rdi becomes (out - in), so [rsi*1+rdi] addresses the output while only
; rsi advances.  Each step compares once and reuses the flags for both
; the jb ahead of the stores and the je after them (the vector
; instructions in between do not modify flags).  ymm8/ymm0 always hold
; the next unused 64 bytes of keystream.
ALIGN 32
$L$tail8xvl:
        vmovdqa64 ymm8,ymm19            ; move into a register reachable by the shorter VEX forms
        xor r10,r10                     ; byte index for $L$oop_tail8xvl
        sub rdi,rsi
        cmp rdx,64*1
        jb NEAR $L$ess_than_64_8xvl
        vpxor ymm8,ymm8,YMMWORD[rsi]
        vpxor ymm0,ymm0,YMMWORD[32+rsi]
        vmovdqu YMMWORD[rsi*1+rdi],ymm8
        vmovdqu YMMWORD[32+rsi*1+rdi],ymm0
        je NEAR $L$done8xvl
        vmovdqa ymm8,ymm5
        vmovdqa ymm0,ymm13
        lea rsi,[64+rsi]

        cmp rdx,64*2
        jb NEAR $L$ess_than_64_8xvl
        vpxor ymm5,ymm5,YMMWORD[rsi]
        vpxor ymm13,ymm13,YMMWORD[32+rsi]
        vmovdqu YMMWORD[rsi*1+rdi],ymm5
        vmovdqu YMMWORD[32+rsi*1+rdi],ymm13
        je NEAR $L$done8xvl
        vmovdqa ymm8,ymm1
        vmovdqa ymm0,ymm9
        lea rsi,[64+rsi]

        cmp rdx,64*3
        jb NEAR $L$ess_than_64_8xvl
        vpxor ymm1,ymm1,YMMWORD[rsi]
        vpxor ymm9,ymm9,YMMWORD[32+rsi]
        vmovdqu YMMWORD[rsi*1+rdi],ymm1
        vmovdqu YMMWORD[32+rsi*1+rdi],ymm9
        je NEAR $L$done8xvl
        vmovdqa ymm8,ymm2
        vmovdqa ymm0,ymm10
        lea rsi,[64+rsi]

        cmp rdx,64*4
        jb NEAR $L$ess_than_64_8xvl
        vpxor ymm2,ymm2,YMMWORD[rsi]
        vpxor ymm10,ymm10,YMMWORD[32+rsi]
        vmovdqu YMMWORD[rsi*1+rdi],ymm2
        vmovdqu YMMWORD[32+rsi*1+rdi],ymm10
        je NEAR $L$done8xvl
        vmovdqa32 ymm8,ymm18
        vmovdqa ymm0,ymm6
        lea rsi,[64+rsi]

        cmp rdx,64*5
        jb NEAR $L$ess_than_64_8xvl
        vpxord ymm18,ymm18,YMMWORD[rsi]
        vpxor ymm6,ymm6,YMMWORD[32+rsi]
        vmovdqu32 YMMWORD[rsi*1+rdi],ymm18
        vmovdqu YMMWORD[32+rsi*1+rdi],ymm6
        je NEAR $L$done8xvl
        vmovdqa ymm8,ymm7
        vmovdqa ymm0,ymm15
        lea rsi,[64+rsi]

        cmp rdx,64*6
        jb NEAR $L$ess_than_64_8xvl
        vpxor ymm7,ymm7,YMMWORD[rsi]
        vpxor ymm15,ymm15,YMMWORD[32+rsi]
        vmovdqu YMMWORD[rsi*1+rdi],ymm7
        vmovdqu YMMWORD[32+rsi*1+rdi],ymm15
        je NEAR $L$done8xvl
        vmovdqa ymm8,ymm3
        vmovdqa ymm0,ymm11
        lea rsi,[64+rsi]

        cmp rdx,64*7
        jb NEAR $L$ess_than_64_8xvl
        vpxor ymm3,ymm3,YMMWORD[rsi]
        vpxor ymm11,ymm11,YMMWORD[32+rsi]
        vmovdqu YMMWORD[rsi*1+rdi],ymm3
        vmovdqu YMMWORD[32+rsi*1+rdi],ymm11
        je NEAR $L$done8xvl
        vmovdqa ymm8,ymm4
        vmovdqa ymm0,ymm12
        lea rsi,[64+rsi]

; Partial final block: spill 64 bytes of keystream to the aligned stack
; scratch and XOR byte by byte.  rdx & 63 is non-zero on every path that
; reaches here (exact multiples of 64 left through the je branches and
; zero length is rejected at entry).
$L$ess_than_64_8xvl:
        vmovdqa YMMWORD[rsp],ymm8
        vmovdqa YMMWORD[32+rsp],ymm0
        lea rdi,[rsi*1+rdi]             ; back to an absolute output pointer
        and rdx,63

$L$oop_tail8xvl:
        movzx eax,BYTE[r10*1+rsi]
        movzx ecx,BYTE[r10*1+rsp]
        lea r10,[1+r10]
        xor eax,ecx
        mov BYTE[((-1))+r10*1+rdi],al
        dec rdx
        jnz NEAR $L$oop_tail8xvl

        ; Wipe the keystream left in the stack scratch.
        vpxor ymm8,ymm8,ymm8
        vmovdqa YMMWORD[rsp],ymm8
        vmovdqa YMMWORD[32+rsp],ymm8

$L$done8xvl:
        vzeroall                        ; clears ymm0-ymm15 before the callee-saved xmm reload
        movaps xmm6,XMMWORD[((-168))+r9]
        movaps xmm7,XMMWORD[((-152))+r9]
        movaps xmm8,XMMWORD[((-136))+r9]
        movaps xmm9,XMMWORD[((-120))+r9]
        movaps xmm10,XMMWORD[((-104))+r9]
        movaps xmm11,XMMWORD[((-88))+r9]
        movaps xmm12,XMMWORD[((-72))+r9]
        movaps xmm13,XMMWORD[((-56))+r9]
        movaps xmm14,XMMWORD[((-40))+r9]
        movaps xmm15,XMMWORD[((-24))+r9]
        lea rsp,[r9]

$L$8xvl_epilogue:
        mov rdi,QWORD[8+rsp] ;WIN64 epilogue
        mov rsi,QWORD[16+rsp]
        DB 0F3h,0C3h ;repret

$L$SEH_end_chacha20_8xvl:
|
||
|
EXTERN __imp_RtlVirtualUnwind
|
||
|
|
||
|
;-----------------------------------------------------------------------
; ssse3_handler -- Win64 language-specific exception handler for the
;                  functions whose .xdata entry names it.
;
; In (handler convention): rcx = ExceptionRecord, rdx = EstablisherFrame,
;                          r8 = ContextRecord, r9 = DispatcherContext.
; Out: eax = 1.
;
; HandlerData (reached through [56+r9]) holds two image-relative
; offsets: the "body" label and the "epilogue" label of the function.
;   Rip <  body      : frame = context value at +120
;   Rip >= epilogue  : frame = context value at +192
;   otherwise        : frame = context value at +192, and 4 qwords
;                      (32 bytes) at [frame-40] are copied to context+512
; The shared tail then reloads rdi/rsi from [frame+8]/[frame+16], stores
; frame/rsi/rdi into the context at +152/+168/+176, copies the context
; into the dispatcher's context record and calls RtlVirtualUnwind.
;
; NOTE(review): the field names in the comments below (Rax, Rip, R9,
; Rsp, Rsi, Rdi, Xmm6, ImageBase, ...) are the Windows x64 CONTEXT and
; DISPATCHER_CONTEXT members expected at these offsets -- confirm
; against winnt.h.
;
; Stack: 8 pushes + pushfq + 64 bytes keep rsp 16-byte aligned at the
;        call; the 64 bytes are 32 of shadow space plus four stack
;        arguments.
; $L$common_seh_tail is also the jump target of full_handler.
;-----------------------------------------------------------------------
ALIGN 16
ssse3_handler:
        push rsi
        push rdi
        push rbx
        push rbp
        push r12
        push r13
        push r14
        push r15
        pushfq
        sub rsp,64

        mov rax,QWORD[120+r8]           ; context->Rax
        mov rbx,QWORD[248+r8]           ; context->Rip

        mov rsi,QWORD[8+r9]             ; disp->ImageBase
        mov r11,QWORD[56+r9]            ; disp->HandlerData

        mov r10d,DWORD[r11]             ; HandlerData[0]: body label offset
        lea r10,[r10*1+rsi]
        cmp rbx,r10
        jb NEAR $L$common_seh_tail      ; fault before the body label

        mov rax,QWORD[192+r8]           ; context->R9

        mov r10d,DWORD[4+r11]           ; HandlerData[1]: epilogue label offset
        lea r10,[r10*1+rsi]
        cmp rbx,r10
        jae NEAR $L$common_seh_tail     ; fault at or after the epilogue label

        lea rsi,[((-40))+rax]
        lea rdi,[512+r8]                ; &context->Xmm6
        mov ecx,4
        DD 0xa548f3fc                   ; bytes FC F3 48 A5 = cld; rep movsq

$L$common_seh_tail:
        mov rdi,QWORD[8+rax]
        mov rsi,QWORD[16+rax]
        mov QWORD[152+r8],rax           ; context->Rsp
        mov QWORD[168+r8],rsi           ; context->Rsi
        mov QWORD[176+r8],rdi           ; context->Rdi

        mov rdi,QWORD[40+r9]            ; disp->ContextRecord
        mov rsi,r8
        mov ecx,154                     ; 154 qwords = 1232 bytes
        DD 0xa548f3fc                   ; cld; rep movsq

        mov rsi,r9
        xor rcx,rcx                     ; arg1 = 0
        mov rdx,QWORD[8+rsi]            ; arg2 = disp->ImageBase
        mov r8,QWORD[rsi]               ; arg3 = disp->ControlPc
        mov r9,QWORD[16+rsi]            ; arg4 = disp->FunctionEntry
        mov r10,QWORD[40+rsi]           ; disp->ContextRecord
        lea r11,[56+rsi]                ; &disp->HandlerData
        lea r12,[24+rsi]                ; &disp->EstablisherFrame
        mov QWORD[32+rsp],r10           ; arg5
        mov QWORD[40+rsp],r11           ; arg6
        mov QWORD[48+rsp],r12           ; arg7
        mov QWORD[56+rsp],rcx           ; arg8 = NULL
        call QWORD[__imp_RtlVirtualUnwind]

        mov eax,1                       ; ExceptionContinueSearch
        add rsp,64
        popfq
        pop r15
        pop r14
        pop r13
        pop r12
        pop rbp
        pop rbx
        pop rdi
        pop rsi
        DB 0F3h,0C3h ;repret
|
||
|
|
||
|
|
||
|
|
||
|
;-----------------------------------------------------------------------
; full_handler -- Win64 language-specific exception handler for the
;                 functions that spill ten xmm registers below their
;                 frame register.
;
; Same arguments and frame-selection logic as ssse3_handler.  The only
; difference is the mid-body case: 20 qwords (160 bytes) at [frame-168]
; are copied to context+512, which matches the xmm6..xmm15 save area at
; [r9-168 .. r9-8] written by chacha20_8xvl.  Every path ends in
; ssse3_handler's $L$common_seh_tail, which also pops what is pushed
; here and returns.
;
; NOTE(review): the context/dispatcher field names in the comments are
; the Windows x64 CONTEXT / DISPATCHER_CONTEXT members expected at these
; offsets -- confirm against winnt.h.
;-----------------------------------------------------------------------
ALIGN 16
full_handler:
        push rsi
        push rdi
        push rbx
        push rbp
        push r12
        push r13
        push r14
        push r15
        pushfq
        sub rsp,64

        mov rax,QWORD[120+r8]           ; context->Rax
        mov rbx,QWORD[248+r8]           ; context->Rip

        mov rsi,QWORD[8+r9]             ; disp->ImageBase
        mov r11,QWORD[56+r9]            ; disp->HandlerData

        mov r10d,DWORD[r11]             ; HandlerData[0]: body label offset
        lea r10,[r10*1+rsi]
        cmp rbx,r10
        jb NEAR $L$common_seh_tail      ; fault before the body label

        mov rax,QWORD[192+r8]           ; context->R9

        mov r10d,DWORD[4+r11]           ; HandlerData[1]: epilogue label offset
        lea r10,[r10*1+rsi]
        cmp rbx,r10
        jae NEAR $L$common_seh_tail     ; fault at or after the epilogue label

        lea rsi,[((-168))+rax]
        lea rdi,[512+r8]                ; &context->Xmm6
        mov ecx,20
        DD 0xa548f3fc                   ; bytes FC F3 48 A5 = cld; rep movsq

        jmp NEAR $L$common_seh_tail
|
||
|
|
||
|
|
||
|
; Function table: one three-dword record per function, all fields
; image-relative -- start label, end label, unwind-info label in .xdata.
section .pdata rdata align=4
ALIGN 4
        DD $L$SEH_begin_chacha20_ssse3 wrt ..imagebase
        DD $L$SEH_end_chacha20_ssse3 wrt ..imagebase
        DD $L$SEH_info_chacha20_ssse3 wrt ..imagebase

        DD $L$SEH_begin_chacha20_4x wrt ..imagebase
        DD $L$SEH_end_chacha20_4x wrt ..imagebase
        DD $L$SEH_info_chacha20_4x wrt ..imagebase

        DD $L$SEH_begin_chacha20_avx2 wrt ..imagebase
        DD $L$SEH_end_chacha20_avx2 wrt ..imagebase
        DD $L$SEH_info_chacha20_avx2 wrt ..imagebase

        DD $L$SEH_begin_chacha20_avx512 wrt ..imagebase
        DD $L$SEH_end_chacha20_avx512 wrt ..imagebase
        DD $L$SEH_info_chacha20_avx512 wrt ..imagebase

        DD $L$SEH_begin_chacha20_avx512vl wrt ..imagebase
        DD $L$SEH_end_chacha20_avx512vl wrt ..imagebase
        DD $L$SEH_info_chacha20_avx512vl wrt ..imagebase

        DD $L$SEH_begin_chacha20_16x wrt ..imagebase
        DD $L$SEH_end_chacha20_16x wrt ..imagebase
        DD $L$SEH_info_chacha20_16x wrt ..imagebase

        DD $L$SEH_begin_chacha20_8xvl wrt ..imagebase
        DD $L$SEH_end_chacha20_8xvl wrt ..imagebase
        DD $L$SEH_info_chacha20_8xvl wrt ..imagebase
|
||
|
; Unwind information, one record per .pdata entry:
;   DB 9,0,0,0        4-byte header; first byte 9 = version 1 with the
;                     exception-handler flag set, remaining bytes zero
;                     (no prolog size, no unwind codes, no frame
;                     register)
;   DD <handler>      image-relative address of the handler routine
;   DD <body>,<epi>   handler data consumed by ssse3_handler /
;                     full_handler: image-relative offsets of the
;                     function's body and epilogue labels
; Records naming ssse3_handler get the 4-qword xmm restore; records
; naming full_handler get the 20-qword restore.
section .xdata rdata align=8
ALIGN 8
$L$SEH_info_chacha20_ssse3:
        DB 9,0,0,0
        DD ssse3_handler wrt ..imagebase
        DD $L$ssse3_body wrt ..imagebase,$L$ssse3_epilogue wrt ..imagebase

$L$SEH_info_chacha20_4x:
        DB 9,0,0,0
        DD full_handler wrt ..imagebase
        DD $L$4x_body wrt ..imagebase,$L$4x_epilogue wrt ..imagebase
$L$SEH_info_chacha20_avx2:
        DB 9,0,0,0
        DD full_handler wrt ..imagebase
        DD $L$8x_body wrt ..imagebase,$L$8x_epilogue wrt ..imagebase
$L$SEH_info_chacha20_avx512:
        DB 9,0,0,0
        DD ssse3_handler wrt ..imagebase
        DD $L$avx512_body wrt ..imagebase,$L$avx512_epilogue wrt ..imagebase

$L$SEH_info_chacha20_avx512vl:
        DB 9,0,0,0
        DD ssse3_handler wrt ..imagebase
        DD $L$avx512vl_body wrt ..imagebase,$L$avx512vl_epilogue wrt ..imagebase

$L$SEH_info_chacha20_16x:
        DB 9,0,0,0
        DD full_handler wrt ..imagebase
        DD $L$16x_body wrt ..imagebase,$L$16x_epilogue wrt ..imagebase

$L$SEH_info_chacha20_8xvl:
        DB 9,0,0,0
        DD full_handler wrt ..imagebase
        DD $L$8xvl_body wrt ..imagebase,$L$8xvl_epilogue wrt ..imagebase
|