; tunsafe-clang15/crypto/poly1305/poly1305-x64-win.asm
;
; NASM source, 3488 lines, 61 KiB.

default rel
; The generated code tags memory operands with XMMWORD/YMMWORD/ZMMWORD.
; The tags are defined to expand to nothing, leaving a plain `[...]` operand.
%define XMMWORD
%define YMMWORD
%define ZMMWORD

; Open the code section BEFORE emitting the constant pool.  The pool is read
; with aligned loads (e.g. `vmovdqa ymm7,YMMWORD[96+rcx]` after
; `lea rcx,[$L$const]`), so its ALIGN 64 must be backed by the section's own
; alignment attribute instead of relying on the assembler raising the
; alignment of the implicit default section.  The pool stays in .text, ahead
; of the functions, exactly as before.
section .text code align=64

; ---------------------------------------------------------------------------
; Constant pool.  Code addresses it as [N+rcx] after `lea rcx,[$L$const]`.
; ---------------------------------------------------------------------------
ALIGN 64
$L$const:
; +0: 24-bit mask in the low dword of every qword lane.
$L$mask24:
DD 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
; +32: 16777216 = 1<<24 in every qword lane; OR'ed into the top 26-bit limb
; of each input block by the vector code (`vpor ...,[32+rcx]`).
$L$129:
DD 16777216,0,16777216,0,16777216,0,16777216,0
; +64: 26-bit limb mask in every qword lane.
$L$mask26:
DD 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
; +96: dword indices for the vpermd that spreads one 16-byte row of the
; power table across a ymm register in the AVX2 code.
$L$permd_avx2:
DD 2,2,2,3,2,0,2,1
; The tables below are not referenced by the code visible in this review
; window; presumably they serve the AVX-512 paths -- confirm before editing.
$L$permd_avx512:
DD 0,0,0,1,0,2,0,3,0,4,0,5,0,6,0,7
$L$2_44_inp_permd:
DD 0,1,1,2,2,3,7,7
$L$2_44_inp_shift:
DQ 0,12,24,64
$L$2_44_mask:
DQ 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
$L$2_44_shift_rgt:
DQ 44,44,42,64
$L$2_44_shift_lft:
DQ 8,8,10,64
ALIGN 64
$L$x_mask44:
DQ 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
DQ 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
$L$x_mask42:
DQ 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
DQ 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff

; Exported entry points.
global poly1305_init_x86_64
global poly1305_blocks_x86_64
global poly1305_emit_x86_64
global poly1305_emit_avx
global poly1305_blocks_avx
global poly1305_blocks_avx2
global poly1305_blocks_avx512
;-----------------------------------------------------------------------
; poly1305_init_x86_64
; ABI:   Microsoft x64
; In:    rcx = state block, rdx = key pointer (may be NULL),
;        r8  = third argument (copied to rdx, not otherwise used here)
; Out:   rax = 1 if a key was installed, 0 if the key pointer was NULL
; Does:  zeroes the 24-byte accumulator at state+0; when a key is given,
;        stores the first two key qwords, masked, at state+24 / state+32.
; Saves: rdi/rsi in the caller's home space ([rsp+8], [rsp+16]) and
;        restores them on exit (they are callee-saved on Win64).
;-----------------------------------------------------------------------
ALIGN 32
poly1305_init_x86_64:
        mov     QWORD[8+rsp],rdi        ;WIN64 prologue
        mov     QWORD[16+rsp],rsi
        mov     rax,rsp
$L$SEH_begin_poly1305_init_x86_64:
        mov     rdi,rcx                 ; rdi = state
        mov     rsi,rdx                 ; rsi = key
        mov     rdx,r8

        xor     eax,eax                 ; rax = 0 (also the "no key" result)
        mov     QWORD[16+rdi],rax       ; clear the three accumulator qwords
        mov     QWORD[8+rdi],rax
        mov     QWORD[rdi],rax

        test    rsi,rsi                 ; NULL key: leave state+24.. untouched
        jz      NEAR $L$no_key

        mov     rcx,0x0ffffffc0ffffffc  ; mask for key qword 1
        mov     rax,0x0ffffffc0fffffff  ; mask for key qword 0
        and     rcx,QWORD[8+rsi]
        and     rax,QWORD[rsi]
        mov     QWORD[32+rdi],rcx       ; state+32 = masked key qword 1
        mov     QWORD[24+rdi],rax       ; state+24 = masked key qword 0
        mov     eax,1
$L$no_key:
        mov     rdi,QWORD[8+rsp]        ;WIN64 epilogue
        mov     rsi,QWORD[16+rsp]
        DB      0F3h,0C3h               ;repret
$L$SEH_end_poly1305_init_x86_64:
;-----------------------------------------------------------------------
; poly1305_blocks_x86_64 -- scalar block loop on 64-bit limbs.
; ABI:   Microsoft x64; arguments are remapped rcx,rdx,r8,r9 ->
;        rdi,rsi,rdx,rcx before the body runs.
; In:    rdi = state, rsi = input, rdx = byte count, rcx = value added
;        (with carry) to the top accumulator limb for every block.
; State: [rdi+0],[rdi+8],[rdi+16] = accumulator h0,h1,h2 (read and
;        written back); [rdi+24],[rdi+32] = r0,r1 (read only).
; Work:  processes rdx>>4 blocks of 16 bytes; returns at once if that
;        count is zero.
; Note:  $L$blocks is also a jump target for the AVX/AVX2/AVX-512 entry
;        points, which arrive with the same register remap already done.
;-----------------------------------------------------------------------
ALIGN 32
poly1305_blocks_x86_64:
mov QWORD[8+rsp],rdi ;WIN64 prologue
mov QWORD[16+rsp],rsi
mov rax,rsp
$L$SEH_begin_poly1305_blocks_x86_64:
mov rdi,rcx
mov rsi,rdx
mov rdx,r8
mov rcx,r9
$L$blocks:
; rdx = number of whole 16-byte blocks; nothing to do when it is zero.
shr rdx,4
jz NEAR $L$no_data
push rbx
push rbp
push r12
push r13
push r14
push r15
$L$blocks_body:
mov r15,rdx ; r15 = block counter
mov r11,QWORD[24+rdi] ; r11 = r0
mov r13,QWORD[32+rdi] ; r13 = r1 (turned into s1 below)
mov r14,QWORD[rdi] ; r14 = h0
mov rbx,QWORD[8+rdi] ; rbx = h1
mov rbp,QWORD[16+rdi] ; rbp = h2
mov r12,r13 ; r12 = r1
shr r13,2
mov rax,r12 ; rax = r1, first multiplier of the iteration
add r13,r12 ; r13 = s1 = r1 + (r1 >> 2)
jmp NEAR $L$oop
ALIGN 32
$L$oop:
; h += next 16 input bytes, plus rcx into the top limb.
add r14,QWORD[rsi]
adc rbx,QWORD[8+rsi]
lea rsi,[16+rsi]
adc rbp,rcx
; h *= r.  The mov instructions interleaved with mul/add/adc do not touch
; the flags, so every adc still consumes the carry of the add before it.
mul r14 ; rdx:rax = h0*r1
mov r9,rax
mov rax,r11
mov r10,rdx
mul r14 ; rdx:rax = h0*r0
mov r14,rax
mov rax,r11
mov r8,rdx
mul rbx ; rdx:rax = h1*r0
add r9,rax
mov rax,r13
adc r10,rdx
mul rbx ; rdx:rax = h1*s1
mov rbx,rbp
add r14,rax
adc r8,rdx
imul rbx,r13 ; h2*s1
add r9,rbx
mov rbx,r8
adc r10,0
imul rbp,r11 ; h2*r0
add rbx,r9
mov rax,-4
adc r10,rbp
; Fold the top limb: keep its low two bits in rbp and add
; (r10 & ~3) + (r10 >> 2) = 5*(r10 >> 2) back into the low limb.
and rax,r10
mov rbp,r10
shr r10,2
and rbp,3
add rax,r10
add r14,rax
adc rbx,0
adc rbp,0
mov rax,r12 ; reload r1 for the next iteration
dec r15
jnz NEAR $L$oop
; Write the accumulator back.
mov QWORD[rdi],r14
mov QWORD[8+rdi],rbx
mov QWORD[16+rdi],rbp
; Restore the six pushed registers (reverse push order) and drop the frame.
mov r15,QWORD[rsp]
mov r14,QWORD[8+rsp]
mov r13,QWORD[16+rsp]
mov r12,QWORD[24+rsp]
mov rbp,QWORD[32+rsp]
mov rbx,QWORD[40+rsp]
lea rsp,[48+rsp]
$L$no_data:
$L$blocks_epilogue:
mov rdi,QWORD[8+rsp] ;WIN64 epilogue
mov rsi,QWORD[16+rsp]
DB 0F3h,0C3h ;repret
$L$SEH_end_poly1305_blocks_x86_64:
;-----------------------------------------------------------------------
; poly1305_emit_x86_64
; ABI:   Microsoft x64
; In:    rcx = state, rdx = 16-byte output buffer, r8 = 16-byte value
;        added to the result (read as two qwords)
; Does:  reads the accumulator h0,h1,h2 from state+0/8/16, computes
;        h+5, and keeps the low 128 bits of h+5 when that sum reaches
;        bit 130 (otherwise keeps h).  Then adds the two qwords at r8
;        with carry and stores 16 bytes at the output.
; Note:  $L$emit is also entered from poly1305_emit_avx with
;        rdi/rsi/rdx already set up and the same prologue executed.
;-----------------------------------------------------------------------
ALIGN 32
poly1305_emit_x86_64:
        mov     QWORD[8+rsp],rdi        ;WIN64 prologue
        mov     QWORD[16+rsp],rsi
        mov     rax,rsp
$L$SEH_begin_poly1305_emit_x86_64:
        mov     rdi,rcx                 ; rdi = state
        mov     rsi,rdx                 ; rsi = output
        mov     rdx,r8                  ; rdx = value to add
$L$emit:
        mov     r10,QWORD[16+rdi]       ; r10 = h2
        mov     r8,QWORD[rdi]           ; r8  = h0
        mov     r9,QWORD[8+rdi]         ; r9  = h1

        mov     rax,r8                  ; rcx:rax = candidate result h
        mov     rcx,r9

        add     r8,5                    ; r10:r9:r8 = h + 5
        adc     r9,0
        adc     r10,0
        shr     r10,2                   ; nonzero <=> h + 5 reached bit 130
        cmovnz  rax,r8                  ; then take the low 128 bits of h+5
        cmovnz  rcx,r9

        add     rax,QWORD[rdx]          ; add the 128-bit value at rdx
        adc     rcx,QWORD[8+rdx]
        mov     QWORD[rsi],rax
        mov     QWORD[8+rsi],rcx

        mov     rdi,QWORD[8+rsp]        ;WIN64 epilogue
        mov     rsi,QWORD[16+rsp]
        DB      0F3h,0C3h               ;repret
$L$SEH_end_poly1305_emit_x86_64:
;-----------------------------------------------------------------------
; __poly1305_block -- internal helper: one multiply-and-fold step.
; Not exported and not Win64-ABI conformant; it works purely on the
; register file set up by its callers in this file.
; In:    r14,rbx,rbp = h0,h1,h2
;        r11 = r0, r13 = s1 (callers compute r1 + (r1>>2)), rax = r1
; Out:   r14,rbx,rbp = updated h0,h1,h2
; Clobb: rax, rdx, r8, r9, r10, flags
; The instruction sequence is the same as the body of $L$oop in
; poly1305_blocks_x86_64 between the input add and the r1 reload.
;-----------------------------------------------------------------------
ALIGN 32
__poly1305_block:
mul r14 ; rdx:rax = h0*r1
mov r9,rax
mov rax,r11
mov r10,rdx
mul r14 ; rdx:rax = h0*r0
mov r14,rax
mov rax,r11
mov r8,rdx
mul rbx ; rdx:rax = h1*r0
add r9,rax
mov rax,r13
adc r10,rdx
mul rbx ; rdx:rax = h1*s1
mov rbx,rbp
add r14,rax
adc r8,rdx
imul rbx,r13 ; h2*s1
add r9,rbx
mov rbx,r8
adc r10,0
imul rbp,r11 ; h2*r0
add rbx,r9
mov rax,-4
adc r10,rbp
; Fold the top limb: rbp keeps its low two bits, and
; (r10 & ~3) + (r10 >> 2) = 5*(r10 >> 2) is added back into h0.
and rax,r10
mov rbp,r10
shr r10,2
and rbp,3
add rax,r10
add r14,rax
adc rbx,0
adc rbp,0
DB 0F3h,0C3h ;repret
;-----------------------------------------------------------------------
; __poly1305_init_avx -- internal helper: build the table of powers of r
; used by the vector code.
; In:    rdi = state, r11 = r0, r12 = r1, r13 = s1 (as required by
;        __poly1305_block)
; Out:   table written at state+48 .. state+191; rdi restored on exit.
; Clobb: rax, rdx, r8, r9, r10, r14, rbx, rbp, flags
;
; The accumulator registers are seeded with r and multiplied by r three
; times via __poly1305_block, giving r^2, r^3 and r^4.  Each value is
; split into five 26-bit limbs (limb 4 is not masked).  The table has
; nine 16-byte rows, addressed relative to rdi biased by +112:
;   row k at [16*k-64+rdi]; rows = limb0, limb1, 5*limb1, limb2, 5*limb2,
;   limb3, 5*limb3, limb4, 5*limb4
; Dword slots within each row: +0 = r^2, +4 = r, +8 = r^4, +12 = r^3.
;-----------------------------------------------------------------------
ALIGN 32
__poly1305_init_avx:
; h = r
mov r14,r11
mov rbx,r12
xor rbp,rbp
lea rdi,[((48+64))+rdi] ; bias rdi so table offsets fit in a signed byte
mov rax,r12
call __poly1305_block ; h = r^2
; Store r^2 (from r14/rbx/rbp, via rax/r8) and r (from r11/r12, via
; rdx/r9) side by side: dword slots +0 and +4 of every row.
mov eax,0x3ffffff
mov edx,0x3ffffff
mov r8,r14
and eax,r14d
mov r9,r11
and edx,r11d
mov DWORD[((-64))+rdi],eax ; row 0: limb 0
shr r8,26
mov DWORD[((-60))+rdi],edx
shr r9,26
mov eax,0x3ffffff
mov edx,0x3ffffff
and eax,r8d
and edx,r9d
mov DWORD[((-48))+rdi],eax ; row 1: limb 1
lea eax,[rax*4+rax] ; *5
mov DWORD[((-44))+rdi],edx
lea edx,[rdx*4+rdx] ; *5
mov DWORD[((-32))+rdi],eax ; row 2: 5*limb 1
shr r8,26
mov DWORD[((-28))+rdi],edx
shr r9,26
; limb 2 straddles the two 64-bit words: (word1 << 12) | (word0 >> 52)
mov rax,rbx
mov rdx,r12
shl rax,12
shl rdx,12
or rax,r8
or rdx,r9
and eax,0x3ffffff
and edx,0x3ffffff
mov DWORD[((-16))+rdi],eax ; row 3: limb 2
lea eax,[rax*4+rax]
mov DWORD[((-12))+rdi],edx
lea edx,[rdx*4+rdx]
mov DWORD[rdi],eax ; row 4: 5*limb 2
mov r8,rbx
mov DWORD[4+rdi],edx
mov r9,r12
mov eax,0x3ffffff
mov edx,0x3ffffff
shr r8,14
shr r9,14
and eax,r8d
and edx,r9d
mov DWORD[16+rdi],eax ; row 5: limb 3 = (word1 >> 14) & mask
lea eax,[rax*4+rax]
mov DWORD[20+rdi],edx
lea edx,[rdx*4+rdx]
mov DWORD[32+rdi],eax ; row 6: 5*limb 3
shr r8,26
mov DWORD[36+rdi],edx
shr r9,26
; limb 4 = (word1 >> 40) | (h2 << 24); r itself has no third word.
mov rax,rbp
shl rax,24
or r8,rax
mov DWORD[48+rdi],r8d ; row 7: limb 4
lea r8,[r8*4+r8]
mov DWORD[52+rdi],r9d
lea r9,[r9*4+r9]
mov DWORD[64+rdi],r8d ; row 8: 5*limb 4
mov DWORD[68+rdi],r9d
mov rax,r12
call __poly1305_block ; h = r^3
; Store r^3 in dword slot +12 of every row (same limb split as above).
mov eax,0x3ffffff
mov r8,r14
and eax,r14d
shr r8,26
mov DWORD[((-52))+rdi],eax
mov edx,0x3ffffff
and edx,r8d
mov DWORD[((-36))+rdi],edx
lea edx,[rdx*4+rdx]
shr r8,26
mov DWORD[((-20))+rdi],edx
mov rax,rbx
shl rax,12
or rax,r8
and eax,0x3ffffff
mov DWORD[((-4))+rdi],eax
lea eax,[rax*4+rax]
mov r8,rbx
mov DWORD[12+rdi],eax
mov edx,0x3ffffff
shr r8,14
and edx,r8d
mov DWORD[28+rdi],edx
lea edx,[rdx*4+rdx]
shr r8,26
mov DWORD[44+rdi],edx
mov rax,rbp
shl rax,24
or r8,rax
mov DWORD[60+rdi],r8d
lea r8,[r8*4+r8]
mov DWORD[76+rdi],r8d
mov rax,r12
call __poly1305_block ; h = r^4
; Store r^4 in dword slot +8 of every row.
mov eax,0x3ffffff
mov r8,r14
and eax,r14d
shr r8,26
mov DWORD[((-56))+rdi],eax
mov edx,0x3ffffff
and edx,r8d
mov DWORD[((-40))+rdi],edx
lea edx,[rdx*4+rdx]
shr r8,26
mov DWORD[((-24))+rdi],edx
mov rax,rbx
shl rax,12
or rax,r8
and eax,0x3ffffff
mov DWORD[((-8))+rdi],eax
lea eax,[rax*4+rax]
mov r8,rbx
mov DWORD[8+rdi],eax
mov edx,0x3ffffff
shr r8,14
and edx,r8d
mov DWORD[24+rdi],edx
lea edx,[rdx*4+rdx]
shr r8,26
mov DWORD[40+rdi],edx
mov rax,rbp
shl rax,24
or r8,rax
mov DWORD[56+rdi],r8d
lea r8,[r8*4+r8]
mov DWORD[72+rdi],r8d
lea rdi,[((-48-64))+rdi] ; undo the bias: rdi = state again
DB 0F3h,0C3h ;repret
;-----------------------------------------------------------------------
; poly1305_blocks_avx -- AVX block function (entry and scalar pre-block).
; ABI:   Microsoft x64; arguments remapped rcx,rdx,r8,r9 -> rdi,rsi,rdx,rcx
;        (state, input, byte count, value added to the top limb).
; State: the dword at state+20 selects the accumulator format: zero means
;        three 64-bit words (as used by the scalar code), nonzero means
;        five 26-bit limbs stored as dwords at state+0..+16.
; Dispatch:
;   * count < 128 and format flag zero -> scalar $L$blocks
;   * flag zero otherwise               -> $L$base2_64_avx
;   * flag set, count a multiple of 32  -> $L$even_avx
;   * flag set, odd number of blocks    -> fall through: convert to 64-bit
;     words, process ONE block with __poly1305_block, convert back, then
;     continue in the vector code (or store and return if nothing is left).
;-----------------------------------------------------------------------
ALIGN 32
poly1305_blocks_avx:
mov QWORD[8+rsp],rdi ;WIN64 prologue
mov QWORD[16+rsp],rsi
mov rax,rsp
$L$SEH_begin_poly1305_blocks_avx:
mov rdi,rcx
mov rsi,rdx
mov rdx,r8
mov rcx,r9
mov r8d,DWORD[20+rdi] ; r8d = accumulator format flag
cmp rdx,128
jae NEAR $L$blocks_avx
test r8d,r8d
jz NEAR $L$blocks
$L$blocks_avx:
and rdx,-16 ; whole blocks only
jz NEAR $L$no_data_avx
vzeroupper
test r8d,r8d
jz NEAR $L$base2_64_avx
test rdx,31
jz NEAR $L$even_avx
push rbx
push rbp
push r12
push r13
push r14
push r15
$L$blocks_avx_body:
mov r15,rdx ; r15 = remaining byte count
; Load the five 26-bit limbs: r8 = limb0 | limb1<<32, r9 = limb2 | limb3<<32,
; ebp = limb4; plus r0/r1.
mov r8,QWORD[rdi]
mov r9,QWORD[8+rdi]
mov ebp,DWORD[16+rdi]
mov r11,QWORD[24+rdi]
mov r13,QWORD[32+rdi]
; Recombine the limbs into three 64-bit words r14:rbx:rbp.
mov r14d,r8d
and r8,-2147483648
mov r12,r9
mov ebx,r9d
and r9,-2147483648
shr r8,6 ; limb1 moved from bit 32 to bit 26
shl r12,52 ; low bits of limb2 at bit 52
add r14,r8
shr rbx,12 ; remaining bits of limb2
shr r9,18 ; limb3 moved from bit 32 to bit 14
add r14,r12
adc rbx,r9
mov r8,rbp
shl r8,40 ; limb4 at bit 40 of the second word
shr rbp,24
add rbx,r8
adc rbp,0
; Fold the top word as in __poly1305_block: keep two bits, add
; (rbp & ~3) + (rbp >> 2) = 5*(rbp >> 2) to the low word.
mov r9,-4
mov r8,rbp
and r9,rbp
shr r8,2
and rbp,3
add r8,r9
add r14,r8
adc rbx,0
adc rbp,0
mov r12,r13 ; r12 = r1
mov rax,r13 ; rax = r1 for __poly1305_block
shr r13,2
add r13,r12 ; r13 = s1 = r1 + (r1 >> 2)
; Absorb one 16-byte block.
add r14,QWORD[rsi]
adc rbx,QWORD[8+rsi]
lea rsi,[16+rsi]
adc rbp,rcx
call __poly1305_block
; When the fourth argument is zero, leave the result in 64-bit words.
test rcx,rcx
jz NEAR $L$store_base2_64_avx
; Split r14:rbx:rbp back into five limbs: rax, rdx, r14, rbx, rbp.
mov rax,r14
mov rdx,r14
shr r14,52
mov r11,rbx
mov r12,rbx
shr rdx,26
and rax,0x3ffffff
shl r11,12
and rdx,0x3ffffff
shr rbx,14
or r14,r11
shl rbp,24
and r14,0x3ffffff
shr r12,40
and rbx,0x3ffffff
or rbp,r12
sub r15,16
jz NEAR $L$store_base2_26_avx
; More data follows: hand the limbs to the vector code in xmm0..xmm4.
vmovd xmm0,eax
vmovd xmm1,edx
vmovd xmm2,r14d
vmovd xmm3,ebx
vmovd xmm4,ebp
jmp NEAR $L$proceed_avx
ALIGN 32
$L$store_base2_64_avx:
mov QWORD[rdi],r14
mov QWORD[8+rdi],rbx
mov QWORD[16+rdi],rbp ; 8-byte store: also overwrites the format flag at +20 with the upper half of rbp
jmp NEAR $L$done_avx
ALIGN 16
$L$store_base2_26_avx:
mov DWORD[rdi],eax
mov DWORD[4+rdi],edx
mov DWORD[8+rdi],r14d
mov DWORD[12+rdi],ebx
mov DWORD[16+rdi],ebp
ALIGN 16
$L$done_avx:
; Restore the six pushed registers and drop the frame.
mov r15,QWORD[rsp]
mov r14,QWORD[8+rsp]
mov r13,QWORD[16+rsp]
mov r12,QWORD[24+rsp]
mov rbp,QWORD[32+rsp]
mov rbx,QWORD[40+rsp]
lea rsp,[48+rsp]
$L$no_data_avx:
$L$blocks_avx_epilogue:
mov rdi,QWORD[8+rsp] ;WIN64 epilogue
mov rsi,QWORD[16+rsp]
DB 0F3h,0C3h ;repret
; $L$base2_64_avx -- part of poly1305_blocks_avx, taken when the format
; flag at state+20 is zero (accumulator still held as 64-bit words).
; If the byte count is not a multiple of 32, one block is processed with
; __poly1305_block first.  The accumulator is then split into five 26-bit
; limbs in xmm0..xmm4, the format flag is set to 1, the power table is
; built with __poly1305_init_avx, and control continues at $L$do_avx.
ALIGN 32
$L$base2_64_avx:
push rbx
push rbp
push r12
push r13
push r14
push r15
$L$base2_64_avx_body:
mov r15,rdx ; r15 = remaining byte count
mov r11,QWORD[24+rdi] ; r0
mov r13,QWORD[32+rdi] ; r1
mov r14,QWORD[rdi] ; h0
mov rbx,QWORD[8+rdi] ; h1
mov ebp,DWORD[16+rdi] ; h2 (low dword only)
mov r12,r13
mov rax,r13
shr r13,2
add r13,r12 ; s1 = r1 + (r1 >> 2)
test rdx,31
jz NEAR $L$init_avx
; Odd number of blocks: absorb one block in scalar code.
add r14,QWORD[rsi]
adc rbx,QWORD[8+rsi]
lea rsi,[16+rsi]
adc rbp,rcx
sub r15,16
call __poly1305_block
$L$init_avx:
; Split r14:rbx:rbp into five limbs: rax, rdx, r14, rbx, rbp.
mov rax,r14
mov rdx,r14
shr r14,52
mov r8,rbx
mov r9,rbx
shr rdx,26
and rax,0x3ffffff
shl r8,12
and rdx,0x3ffffff
shr rbx,14
or r14,r8
shl rbp,24
and r14,0x3ffffff
shr r9,40
and rbx,0x3ffffff
or rbp,r9
vmovd xmm0,eax
vmovd xmm1,edx
vmovd xmm2,r14d
vmovd xmm3,ebx
vmovd xmm4,ebp
mov DWORD[20+rdi],1 ; mark the state as holding 26-bit limbs
call __poly1305_init_avx
$L$proceed_avx:
; Common hand-over to the vector code (also reached from the odd-block
; path of poly1305_blocks_avx): rdx = remaining bytes, callee-saved
; registers restored, frame dropped.
mov rdx,r15
mov r15,QWORD[rsp]
mov r14,QWORD[8+rsp]
mov r13,QWORD[16+rsp]
mov r12,QWORD[24+rsp]
mov rbp,QWORD[32+rsp]
mov rbx,QWORD[40+rsp]
lea rax,[48+rsp]
lea rsp,[48+rsp]
$L$base2_64_avx_epilogue:
jmp NEAR $L$do_avx
; $L$even_avx / $L$do_avx -- vector-path setup of poly1305_blocks_avx.
; On arrival at $L$do_avx: xmm0..xmm4 = the five accumulator limbs (low
; dword), rdi = state, rsi = input, rdx = byte count.
; Register roles from here on:
;   xmm0..xmm4   accumulator limbs h0..h4
;   xmm5..xmm9   limbs 0..4 of freshly loaded input (two blocks per register)
;   xmm10..xmm14 product accumulators d0..d4
;   xmm15        26-bit mask ($L$mask26)
;   rcx          $L$const, r11 = entry rsp - 248 (frame anchor)
ALIGN 32
$L$even_avx:
; Accumulator is already stored as five dword limbs: load them directly.
vmovd xmm0,DWORD[rdi]
vmovd xmm1,DWORD[4+rdi]
vmovd xmm2,DWORD[8+rdi]
vmovd xmm3,DWORD[12+rdi]
vmovd xmm4,DWORD[16+rdi]
$L$do_avx:
; Allocate the frame and save xmm6..xmm15 (callee-saved on Win64).
lea r11,[((-248))+rsp]
sub rsp,0x218
vmovdqa XMMWORD[80+r11],xmm6
vmovdqa XMMWORD[96+r11],xmm7
vmovdqa XMMWORD[112+r11],xmm8
vmovdqa XMMWORD[128+r11],xmm9
vmovdqa XMMWORD[144+r11],xmm10
vmovdqa XMMWORD[160+r11],xmm11
vmovdqa XMMWORD[176+r11],xmm12
vmovdqa XMMWORD[192+r11],xmm13
vmovdqa XMMWORD[208+r11],xmm14
vmovdqa XMMWORD[224+r11],xmm15
$L$do_avx_body:
; The flags from this sub are consumed by cmovc just below AND by the
; jbe at the end of this block; nothing in between modifies them.
sub rdx,64
lea rax,[((-32))+rsi]
cmovc rsi,rax ; fewer than 64 bytes: step rsi back so [32+rsi] is the first block
vmovdqu xmm14,XMMWORD[48+rdi] ; table row 0 (limb 0 of the powers of r)
lea rdi,[112+rdi] ; bias rdi: table row k is now at [16*k-64+rdi]
lea rcx,[$L$const]
; Load two 16-byte blocks and split them into 26-bit limbs, one block per
; qword lane: xmm5..xmm9 = limbs 0..4.
vmovdqu xmm5,XMMWORD[32+rsi]
vmovdqu xmm6,XMMWORD[48+rsi]
vmovdqa xmm15,XMMWORD[64+rcx] ; $L$mask26
vpsrldq xmm7,xmm5,6
vpsrldq xmm8,xmm6,6
vpunpckhqdq xmm9,xmm5,xmm6
vpunpcklqdq xmm5,xmm5,xmm6
vpunpcklqdq xmm8,xmm7,xmm8
vpsrlq xmm9,xmm9,40
vpsrlq xmm6,xmm5,26
vpand xmm5,xmm5,xmm15
vpsrlq xmm7,xmm8,4
vpand xmm6,xmm6,xmm15
vpsrlq xmm8,xmm8,30
vpand xmm7,xmm7,xmm15
vpand xmm8,xmm8,xmm15
vpor xmm9,xmm9,XMMWORD[32+rcx] ; OR 1<<24 ($L$129) into limb 4
jbe NEAR $L$skip_loop_avx ; byte count was <= 64: no main-loop iteration
; Expand the power table onto the stack.  Each 16-byte row holds the dwords
; (r^2, r, r^4, r^3) of one table entry.  Shuffle 0x44 duplicates dwords
; 0,1 so both qword lanes see r^2 in their low dword; 0xEE duplicates
; dwords 2,3 so both lanes see r^4.
;   [16*k+rsp]      = row k for r^2
;   [16*k-144+r11]  = row k for r^4
vmovdqu xmm11,XMMWORD[((-48))+rdi]
vmovdqu xmm12,XMMWORD[((-32))+rdi]
vpshufd xmm13,xmm14,0xEE
vpshufd xmm10,xmm14,0x44
vmovdqa XMMWORD[(-144)+r11],xmm13
vmovdqa XMMWORD[rsp],xmm10
vpshufd xmm14,xmm11,0xEE
vmovdqu xmm10,XMMWORD[((-16))+rdi]
vpshufd xmm11,xmm11,0x44
vmovdqa XMMWORD[(-128)+r11],xmm14
vmovdqa XMMWORD[16+rsp],xmm11
vpshufd xmm13,xmm12,0xEE
vmovdqu xmm11,XMMWORD[rdi]
vpshufd xmm12,xmm12,0x44
vmovdqa XMMWORD[(-112)+r11],xmm13
vmovdqa XMMWORD[32+rsp],xmm12
vpshufd xmm14,xmm10,0xEE
vmovdqu xmm12,XMMWORD[16+rdi]
vpshufd xmm10,xmm10,0x44
vmovdqa XMMWORD[(-96)+r11],xmm14
vmovdqa XMMWORD[48+rsp],xmm10
vpshufd xmm13,xmm11,0xEE
vmovdqu xmm10,XMMWORD[32+rdi]
vpshufd xmm11,xmm11,0x44
vmovdqa XMMWORD[(-80)+r11],xmm13
vmovdqa XMMWORD[64+rsp],xmm11
vpshufd xmm14,xmm12,0xEE
vmovdqu xmm11,XMMWORD[48+rdi]
vpshufd xmm12,xmm12,0x44
vmovdqa XMMWORD[(-64)+r11],xmm14
vmovdqa XMMWORD[80+rsp],xmm12
vpshufd xmm13,xmm10,0xEE
vmovdqu xmm12,XMMWORD[64+rdi]
vpshufd xmm10,xmm10,0x44
vmovdqa XMMWORD[(-48)+r11],xmm13
vmovdqa XMMWORD[96+rsp],xmm10
vpshufd xmm14,xmm11,0xEE
vpshufd xmm11,xmm11,0x44
vmovdqa XMMWORD[(-32)+r11],xmm14
vmovdqa XMMWORD[112+rsp],xmm11
vpshufd xmm13,xmm12,0xEE
vmovdqa xmm14,XMMWORD[rsp] ; preload row 0 of r^2 for the loop
vpshufd xmm12,xmm12,0x44
vmovdqa XMMWORD[(-16)+r11],xmm13
vmovdqa XMMWORD[128+rsp],xmm12
jmp NEAR $L$oop_avx
; $L$oop_avx -- main loop of poly1305_blocks_avx, 64 input bytes per pass.
;
; Products are accumulated limb-wise as
;   d0 = h0*r0 + h4*s1 + h3*s2 + h2*s3 + h1*s4
;   d1 = h1*r0 + h0*r1 + h4*s2 + h3*s3 + h2*s4
;   d2 = h2*r0 + h1*r1 + h0*r2 + h4*s3 + h3*s4
;   d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*s4
;   d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4        (sK = 5*rK)
; Each pass does this twice into the same d0..d4:
;   1. the two blocks already split in xmm5..xmm9 times the r^2 table
;      ([16*k+rsp]); meanwhile the accumulator xmm0..xmm4 is parked at
;      [r11+0..64];
;   2. (parked accumulator + two new blocks from [rsi],[16+rsi]) times the
;      r^4 table ([16*k-144+r11]).
; The next pair of blocks is loaded and split into xmm5..xmm9 on the way,
; and the sums are carried back into 26-bit limbs in xmm0..xmm4.
; Stack table rows: k = 0..8 -> r0, r1, s1, r2, s2, r3, s3, r4, s4.
ALIGN 32
$L$oop_avx:
; ---- part 1: xmm5..xmm9 (input) * r^2 --------------------------------
vpmuludq xmm10,xmm14,xmm5
vpmuludq xmm11,xmm14,xmm6
vmovdqa XMMWORD[32+r11],xmm2 ; park h2
vpmuludq xmm12,xmm14,xmm7
vmovdqa xmm2,XMMWORD[16+rsp] ; r1 (of r^2)
vpmuludq xmm13,xmm14,xmm8
vpmuludq xmm14,xmm14,xmm9
vmovdqa XMMWORD[r11],xmm0 ; park h0
vpmuludq xmm0,xmm9,XMMWORD[32+rsp] ; * s1
vmovdqa XMMWORD[16+r11],xmm1 ; park h1
vpmuludq xmm1,xmm2,xmm8
vpaddq xmm10,xmm10,xmm0
vpaddq xmm14,xmm14,xmm1
vmovdqa XMMWORD[48+r11],xmm3 ; park h3
vpmuludq xmm0,xmm2,xmm7
vpmuludq xmm1,xmm2,xmm6
vpaddq xmm13,xmm13,xmm0
vmovdqa xmm3,XMMWORD[48+rsp] ; r2
vpaddq xmm12,xmm12,xmm1
vmovdqa XMMWORD[64+r11],xmm4 ; park h4
vpmuludq xmm2,xmm2,xmm5
vpmuludq xmm0,xmm3,xmm7
vpaddq xmm11,xmm11,xmm2
vmovdqa xmm4,XMMWORD[64+rsp] ; s2
vpaddq xmm14,xmm14,xmm0
vpmuludq xmm1,xmm3,xmm6
vpmuludq xmm3,xmm3,xmm5
vpaddq xmm13,xmm13,xmm1
vmovdqa xmm2,XMMWORD[80+rsp] ; r3
vpaddq xmm12,xmm12,xmm3
vpmuludq xmm0,xmm4,xmm9
vpmuludq xmm4,xmm4,xmm8
vpaddq xmm11,xmm11,xmm0
vmovdqa xmm3,XMMWORD[96+rsp] ; s3
vpaddq xmm10,xmm10,xmm4
vmovdqa xmm4,XMMWORD[128+rsp] ; s4
vpmuludq xmm1,xmm2,xmm6
vpmuludq xmm2,xmm2,xmm5
vpaddq xmm14,xmm14,xmm1
vpaddq xmm13,xmm13,xmm2
vpmuludq xmm0,xmm3,xmm9
vpmuludq xmm1,xmm3,xmm8
vpaddq xmm12,xmm12,xmm0
vmovdqu xmm0,XMMWORD[rsi] ; next input block (first of pair)
vpaddq xmm11,xmm11,xmm1
vpmuludq xmm3,xmm3,xmm7
vpmuludq xmm7,xmm4,xmm7
vpaddq xmm10,xmm10,xmm3
vmovdqu xmm1,XMMWORD[16+rsi] ; next input block (second of pair)
vpaddq xmm11,xmm11,xmm7
vpmuludq xmm8,xmm4,xmm8
vpmuludq xmm9,xmm4,xmm9
vpsrldq xmm2,xmm0,6
vpaddq xmm12,xmm12,xmm8
vpaddq xmm13,xmm13,xmm9
vpsrldq xmm3,xmm1,6
vpmuludq xmm9,xmm5,XMMWORD[112+rsp] ; * r4
vpmuludq xmm5,xmm4,xmm6
vpunpckhqdq xmm4,xmm0,xmm1
vpaddq xmm14,xmm14,xmm9
vmovdqa xmm9,XMMWORD[((-144))+r11] ; r0 (of r^4)
vpaddq xmm10,xmm10,xmm5
; Split the two new blocks into limbs in xmm0..xmm4.  Limb 4 is obtained
; with a byte shift (5 bytes = 40 bits) plus the 24-bit mask at [rcx].
vpunpcklqdq xmm0,xmm0,xmm1
vpunpcklqdq xmm3,xmm2,xmm3
vpsrldq xmm4,xmm4,5
vpsrlq xmm1,xmm0,26
vpand xmm0,xmm0,xmm15
vpsrlq xmm2,xmm3,4
vpand xmm1,xmm1,xmm15
vpand xmm4,xmm4,XMMWORD[rcx]
vpsrlq xmm3,xmm3,30
vpand xmm2,xmm2,xmm15
vpand xmm3,xmm3,xmm15
vpor xmm4,xmm4,XMMWORD[32+rcx] ; OR 1<<24 into limb 4
; Add the parked accumulator to the new input.
vpaddq xmm0,xmm0,XMMWORD[r11]
vpaddq xmm1,xmm1,XMMWORD[16+r11]
vpaddq xmm2,xmm2,XMMWORD[32+r11]
vpaddq xmm3,xmm3,XMMWORD[48+r11]
vpaddq xmm4,xmm4,XMMWORD[64+r11]
; Advance the input pointer by 64, or by only 32 when the byte counter
; goes negative.  The flags from this sub also feed the `ja` at the
; bottom of the loop; no instruction in between modifies them.
lea rax,[32+rsi]
lea rsi,[64+rsi]
sub rdx,64
cmovc rsi,rax
; ---- part 2: xmm0..xmm4 (accumulator + input) * r^4 ------------------
vpmuludq xmm5,xmm9,xmm0
vpmuludq xmm6,xmm9,xmm1
vpaddq xmm10,xmm10,xmm5
vpaddq xmm11,xmm11,xmm6
vmovdqa xmm7,XMMWORD[((-128))+r11] ; r1
vpmuludq xmm5,xmm9,xmm2
vpmuludq xmm6,xmm9,xmm3
vpaddq xmm12,xmm12,xmm5
vpaddq xmm13,xmm13,xmm6
vpmuludq xmm9,xmm9,xmm4
vpmuludq xmm5,xmm4,XMMWORD[((-112))+r11] ; * s1
vpaddq xmm14,xmm14,xmm9
vpaddq xmm10,xmm10,xmm5
vpmuludq xmm6,xmm7,xmm2
vpmuludq xmm5,xmm7,xmm3
vpaddq xmm13,xmm13,xmm6
vmovdqa xmm8,XMMWORD[((-96))+r11] ; r2
vpaddq xmm14,xmm14,xmm5
vpmuludq xmm6,xmm7,xmm1
vpmuludq xmm7,xmm7,xmm0
vpaddq xmm12,xmm12,xmm6
vpaddq xmm11,xmm11,xmm7
vmovdqa xmm9,XMMWORD[((-80))+r11] ; s2
vpmuludq xmm5,xmm8,xmm2
vpmuludq xmm6,xmm8,xmm1
vpaddq xmm14,xmm14,xmm5
vpaddq xmm13,xmm13,xmm6
vmovdqa xmm7,XMMWORD[((-64))+r11] ; r3
vpmuludq xmm8,xmm8,xmm0
vpmuludq xmm5,xmm9,xmm4
vpaddq xmm12,xmm12,xmm8
vpaddq xmm11,xmm11,xmm5
vmovdqa xmm8,XMMWORD[((-48))+r11] ; s3
vpmuludq xmm9,xmm9,xmm3
vpmuludq xmm6,xmm7,xmm1
vpaddq xmm10,xmm10,xmm9
vmovdqa xmm9,XMMWORD[((-16))+r11] ; s4
vpaddq xmm14,xmm14,xmm6
vpmuludq xmm7,xmm7,xmm0
vpmuludq xmm5,xmm8,xmm4
vpaddq xmm13,xmm13,xmm7
vpaddq xmm12,xmm12,xmm5
vmovdqu xmm5,XMMWORD[32+rsi] ; input for the next pass (first of pair)
vpmuludq xmm7,xmm8,xmm3
vpmuludq xmm8,xmm8,xmm2
vpaddq xmm11,xmm11,xmm7
vmovdqu xmm6,XMMWORD[48+rsi] ; input for the next pass (second of pair)
vpaddq xmm10,xmm10,xmm8
vpmuludq xmm2,xmm9,xmm2
vpmuludq xmm3,xmm9,xmm3
vpsrldq xmm7,xmm5,6
vpaddq xmm11,xmm11,xmm2
vpmuludq xmm4,xmm9,xmm4
vpsrldq xmm8,xmm6,6
; From here the sums move back into xmm0..xmm4 (xmm1 is formed during the
; carry pass below from xmm11).
vpaddq xmm2,xmm12,xmm3
vpaddq xmm3,xmm13,xmm4
vpmuludq xmm4,xmm0,XMMWORD[((-32))+r11] ; * r4
vpmuludq xmm0,xmm9,xmm1
vpunpckhqdq xmm9,xmm5,xmm6
vpaddq xmm4,xmm14,xmm4
vpaddq xmm0,xmm10,xmm0
; Split the next pass's input into limbs in xmm5..xmm9.
vpunpcklqdq xmm5,xmm5,xmm6
vpunpcklqdq xmm8,xmm7,xmm8
vpsrldq xmm9,xmm9,5
vpsrlq xmm6,xmm5,26
vmovdqa xmm14,XMMWORD[rsp] ; preload r0 (of r^2) for the next pass
vpand xmm5,xmm5,xmm15
vpsrlq xmm7,xmm8,4
vpand xmm6,xmm6,xmm15
vpand xmm9,xmm9,XMMWORD[rcx]
vpsrlq xmm8,xmm8,30
vpand xmm7,xmm7,xmm15
vpand xmm8,xmm8,xmm15
vpor xmm9,xmm9,XMMWORD[32+rcx]
; ---- carry propagation back to 26-bit limbs --------------------------
; Order: h3->h4, h0->h1, h4->h0 (times 5: added once, then shifted left
; by 2 and added again), h1->h2, h2->h3, h0->h1, h3->h4.
vpsrlq xmm13,xmm3,26
vpand xmm3,xmm3,xmm15
vpaddq xmm4,xmm4,xmm13
vpsrlq xmm10,xmm0,26
vpand xmm0,xmm0,xmm15
vpaddq xmm1,xmm11,xmm10
vpsrlq xmm10,xmm4,26
vpand xmm4,xmm4,xmm15
vpsrlq xmm11,xmm1,26
vpand xmm1,xmm1,xmm15
vpaddq xmm2,xmm2,xmm11
vpaddq xmm0,xmm0,xmm10
vpsllq xmm10,xmm10,2
vpaddq xmm0,xmm0,xmm10
vpsrlq xmm12,xmm2,26
vpand xmm2,xmm2,xmm15
vpaddq xmm3,xmm3,xmm12
vpsrlq xmm10,xmm0,26
vpand xmm0,xmm0,xmm15
vpaddq xmm1,xmm1,xmm10
vpsrlq xmm13,xmm3,26
vpand xmm3,xmm3,xmm15
vpaddq xmm4,xmm4,xmm13
ja NEAR $L$oop_avx ; flags still from `sub rdx,64` above
; $L$skip_loop_avx -- tail of poly1305_blocks_avx.
; On entry xmm0..xmm4 hold the accumulator limbs, xmm5..xmm9 the limbs of
; the last loaded pair of blocks, xmm14 row 0 of the power table and rdx
; the (negative or zero) remaining byte counter.
; Power-table rows are read straight from the state here ([16*k-64+rdi]).
; Shuffle 0x10 puts dwords 0 and 1 of a row (r^2, r) into the low dwords of
; the two qword lanes; shuffle 0x32 does the same with dwords 2 and 3
; (r^4, r^3).
$L$skip_loop_avx:
vpshufd xmm14,xmm14,0x10
; ZF from this add selects the path here AND at the `jz` before the second
; multiply below; the vector instructions in between leave flags alone.
add rdx,32
jnz NEAR $L$ong_tail_avx
; Only one pair of blocks is left: fold the accumulator into it now.
vpaddq xmm7,xmm7,xmm2
vpaddq xmm5,xmm5,xmm0
vpaddq xmm6,xmm6,xmm1
vpaddq xmm8,xmm8,xmm3
vpaddq xmm9,xmm9,xmm4
$L$ong_tail_avx:
; Park the accumulator (used only on the long-tail path below).
vmovdqa XMMWORD[32+r11],xmm2
vmovdqa XMMWORD[r11],xmm0
vmovdqa XMMWORD[16+r11],xmm1
vmovdqa XMMWORD[48+r11],xmm3
vmovdqa XMMWORD[64+r11],xmm4
; d0..d4 = xmm5..xmm9 times the (r^2, r) lanes of the table.
vpmuludq xmm12,xmm14,xmm7
vpmuludq xmm10,xmm14,xmm5
vpshufd xmm2,XMMWORD[((-48))+rdi],0x10 ; r1
vpmuludq xmm11,xmm14,xmm6
vpmuludq xmm13,xmm14,xmm8
vpmuludq xmm14,xmm14,xmm9
vpmuludq xmm0,xmm2,xmm8
vpaddq xmm14,xmm14,xmm0
vpshufd xmm3,XMMWORD[((-32))+rdi],0x10 ; s1
vpmuludq xmm1,xmm2,xmm7
vpaddq xmm13,xmm13,xmm1
vpshufd xmm4,XMMWORD[((-16))+rdi],0x10 ; r2
vpmuludq xmm0,xmm2,xmm6
vpaddq xmm12,xmm12,xmm0
vpmuludq xmm2,xmm2,xmm5
vpaddq xmm11,xmm11,xmm2
vpmuludq xmm3,xmm3,xmm9
vpaddq xmm10,xmm10,xmm3
vpshufd xmm2,XMMWORD[rdi],0x10 ; s2
vpmuludq xmm1,xmm4,xmm7
vpaddq xmm14,xmm14,xmm1
vpmuludq xmm0,xmm4,xmm6
vpaddq xmm13,xmm13,xmm0
vpshufd xmm3,XMMWORD[16+rdi],0x10 ; r3
vpmuludq xmm4,xmm4,xmm5
vpaddq xmm12,xmm12,xmm4
vpmuludq xmm1,xmm2,xmm9
vpaddq xmm11,xmm11,xmm1
vpshufd xmm4,XMMWORD[32+rdi],0x10 ; s3
vpmuludq xmm2,xmm2,xmm8
vpaddq xmm10,xmm10,xmm2
vpmuludq xmm0,xmm3,xmm6
vpaddq xmm14,xmm14,xmm0
vpmuludq xmm3,xmm3,xmm5
vpaddq xmm13,xmm13,xmm3
vpshufd xmm2,XMMWORD[48+rdi],0x10 ; r4
vpmuludq xmm1,xmm4,xmm9
vpaddq xmm12,xmm12,xmm1
vpshufd xmm3,XMMWORD[64+rdi],0x10 ; s4
vpmuludq xmm0,xmm4,xmm8
vpaddq xmm11,xmm11,xmm0
vpmuludq xmm4,xmm4,xmm7
vpaddq xmm10,xmm10,xmm4
vpmuludq xmm2,xmm2,xmm5
vpaddq xmm14,xmm14,xmm2
vpmuludq xmm1,xmm3,xmm9
vpaddq xmm13,xmm13,xmm1
vpmuludq xmm0,xmm3,xmm8
vpaddq xmm12,xmm12,xmm0
vpmuludq xmm1,xmm3,xmm7
vpaddq xmm11,xmm11,xmm1
vpmuludq xmm3,xmm3,xmm6
vpaddq xmm10,xmm10,xmm3
jz NEAR $L$short_tail_avx ; ZF still from `add rdx,32` above
; Long tail: one more pair of blocks at [rsi],[16+rsi].  Split it into
; limbs in xmm0..xmm4, add the parked accumulator, and accumulate its
; product with the (r^4, r^3) lanes of the table into d0..d4.
vmovdqu xmm0,XMMWORD[rsi]
vmovdqu xmm1,XMMWORD[16+rsi]
vpsrldq xmm2,xmm0,6
vpsrldq xmm3,xmm1,6
vpunpckhqdq xmm4,xmm0,xmm1
vpunpcklqdq xmm0,xmm0,xmm1
vpunpcklqdq xmm3,xmm2,xmm3
vpsrlq xmm4,xmm4,40
vpsrlq xmm1,xmm0,26
vpand xmm0,xmm0,xmm15
vpsrlq xmm2,xmm3,4
vpand xmm1,xmm1,xmm15
vpsrlq xmm3,xmm3,30
vpand xmm2,xmm2,xmm15
vpand xmm3,xmm3,xmm15
vpor xmm4,xmm4,XMMWORD[32+rcx] ; OR 1<<24 into limb 4
vpshufd xmm9,XMMWORD[((-64))+rdi],0x32 ; r0
vpaddq xmm0,xmm0,XMMWORD[r11]
vpaddq xmm1,xmm1,XMMWORD[16+r11]
vpaddq xmm2,xmm2,XMMWORD[32+r11]
vpaddq xmm3,xmm3,XMMWORD[48+r11]
vpaddq xmm4,xmm4,XMMWORD[64+r11]
vpmuludq xmm5,xmm9,xmm0
vpaddq xmm10,xmm10,xmm5
vpmuludq xmm6,xmm9,xmm1
vpaddq xmm11,xmm11,xmm6
vpmuludq xmm5,xmm9,xmm2
vpaddq xmm12,xmm12,xmm5
vpshufd xmm7,XMMWORD[((-48))+rdi],0x32 ; r1
vpmuludq xmm6,xmm9,xmm3
vpaddq xmm13,xmm13,xmm6
vpmuludq xmm9,xmm9,xmm4
vpaddq xmm14,xmm14,xmm9
vpmuludq xmm5,xmm7,xmm3
vpaddq xmm14,xmm14,xmm5
vpshufd xmm8,XMMWORD[((-32))+rdi],0x32 ; s1
vpmuludq xmm6,xmm7,xmm2
vpaddq xmm13,xmm13,xmm6
vpshufd xmm9,XMMWORD[((-16))+rdi],0x32 ; r2
vpmuludq xmm5,xmm7,xmm1
vpaddq xmm12,xmm12,xmm5
vpmuludq xmm7,xmm7,xmm0
vpaddq xmm11,xmm11,xmm7
vpmuludq xmm8,xmm8,xmm4
vpaddq xmm10,xmm10,xmm8
vpshufd xmm7,XMMWORD[rdi],0x32 ; s2
vpmuludq xmm6,xmm9,xmm2
vpaddq xmm14,xmm14,xmm6
vpmuludq xmm5,xmm9,xmm1
vpaddq xmm13,xmm13,xmm5
vpshufd xmm8,XMMWORD[16+rdi],0x32 ; r3
vpmuludq xmm9,xmm9,xmm0
vpaddq xmm12,xmm12,xmm9
vpmuludq xmm6,xmm7,xmm4
vpaddq xmm11,xmm11,xmm6
vpshufd xmm9,XMMWORD[32+rdi],0x32 ; s3
vpmuludq xmm7,xmm7,xmm3
vpaddq xmm10,xmm10,xmm7
vpmuludq xmm5,xmm8,xmm1
vpaddq xmm14,xmm14,xmm5
vpmuludq xmm8,xmm8,xmm0
vpaddq xmm13,xmm13,xmm8
vpshufd xmm7,XMMWORD[48+rdi],0x32 ; r4
vpmuludq xmm6,xmm9,xmm4
vpaddq xmm12,xmm12,xmm6
vpshufd xmm8,XMMWORD[64+rdi],0x32 ; s4
vpmuludq xmm5,xmm9,xmm3
vpaddq xmm11,xmm11,xmm5
vpmuludq xmm9,xmm9,xmm2
vpaddq xmm10,xmm10,xmm9
vpmuludq xmm7,xmm7,xmm0
vpaddq xmm14,xmm14,xmm7
vpmuludq xmm6,xmm8,xmm4
vpaddq xmm13,xmm13,xmm6
vpmuludq xmm5,xmm8,xmm3
vpaddq xmm12,xmm12,xmm5
vpmuludq xmm6,xmm8,xmm2
vpaddq xmm11,xmm11,xmm6
vpmuludq xmm8,xmm8,xmm1
vpaddq xmm10,xmm10,xmm8
; $L$short_tail_avx -- final reduction and exit of poly1305_blocks_avx.
; xmm10..xmm14 hold d0..d4 with one partial sum per qword lane.
$L$short_tail_avx:
; Horizontal add: fold the high qword lane onto the low one.
vpsrldq xmm9,xmm14,8
vpsrldq xmm8,xmm13,8
vpsrldq xmm6,xmm11,8
vpsrldq xmm5,xmm10,8
vpsrldq xmm7,xmm12,8
vpaddq xmm13,xmm13,xmm8
vpaddq xmm14,xmm14,xmm9
vpaddq xmm10,xmm10,xmm5
vpaddq xmm11,xmm11,xmm6
vpaddq xmm12,xmm12,xmm7
; Carry propagation to 26-bit limbs: d3->d4, d0->d1, d4->d0 (times 5:
; added once, then shifted left by 2 and added again), d1->d2, d2->d3,
; d0->d1, d3->d4.
vpsrlq xmm3,xmm13,26
vpand xmm13,xmm13,xmm15
vpaddq xmm14,xmm14,xmm3
vpsrlq xmm0,xmm10,26
vpand xmm10,xmm10,xmm15
vpaddq xmm11,xmm11,xmm0
vpsrlq xmm4,xmm14,26
vpand xmm14,xmm14,xmm15
vpsrlq xmm1,xmm11,26
vpand xmm11,xmm11,xmm15
vpaddq xmm12,xmm12,xmm1
vpaddq xmm10,xmm10,xmm4
vpsllq xmm4,xmm4,2
vpaddq xmm10,xmm10,xmm4
vpsrlq xmm2,xmm12,26
vpand xmm12,xmm12,xmm15
vpaddq xmm13,xmm13,xmm2
vpsrlq xmm0,xmm10,26
vpand xmm10,xmm10,xmm15
vpaddq xmm11,xmm11,xmm0
vpsrlq xmm3,xmm13,26
vpand xmm13,xmm13,xmm15
vpaddq xmm14,xmm14,xmm3
; Store the five limbs at state+0..+16 (rdi is still biased by +112).
vmovd DWORD[(-112)+rdi],xmm10
vmovd DWORD[(-108)+rdi],xmm11
vmovd DWORD[(-104)+rdi],xmm12
vmovd DWORD[(-100)+rdi],xmm13
vmovd DWORD[(-96)+rdi],xmm14
; Restore xmm6..xmm15 and release the frame (r11 + 248 = rsp at $L$do_avx).
vmovdqa xmm6,XMMWORD[80+r11]
vmovdqa xmm7,XMMWORD[96+r11]
vmovdqa xmm8,XMMWORD[112+r11]
vmovdqa xmm9,XMMWORD[128+r11]
vmovdqa xmm10,XMMWORD[144+r11]
vmovdqa xmm11,XMMWORD[160+r11]
vmovdqa xmm12,XMMWORD[176+r11]
vmovdqa xmm13,XMMWORD[192+r11]
vmovdqa xmm14,XMMWORD[208+r11]
vmovdqa xmm15,XMMWORD[224+r11]
lea rsp,[248+r11]
$L$do_avx_epilogue:
vzeroupper
mov rdi,QWORD[8+rsp] ;WIN64 epilogue
mov rsi,QWORD[16+rsp]
DB 0F3h,0C3h ;repret
$L$SEH_end_poly1305_blocks_avx:
;-----------------------------------------------------------------------
; poly1305_emit_avx -- produce the 16-byte result from a state that may
; hold the accumulator as five 26-bit limbs.
; ABI:   Microsoft x64; rcx = state, rdx = 16-byte output, r8 = 16-byte
;        value added to the result.
; If the format flag at state+20 is zero, the work is delegated to $L$emit
; in poly1305_emit_x86_64.  Otherwise the limbs at state+0..+16 are
; recombined into three 64-bit words, the top word is folded, and the same
; final steps as in poly1305_emit_x86_64 are applied.
;-----------------------------------------------------------------------
ALIGN 32
poly1305_emit_avx:
mov QWORD[8+rsp],rdi ;WIN64 prologue
mov QWORD[16+rsp],rsi
mov rax,rsp
$L$SEH_begin_poly1305_emit_avx:
mov rdi,rcx
mov rsi,rdx
mov rdx,r8
cmp DWORD[20+rdi],0
je NEAR $L$emit
; Load the five limbs (32-bit loads zero-extend into the 64-bit registers).
mov eax,DWORD[rdi]
mov ecx,DWORD[4+rdi]
mov r8d,DWORD[8+rdi]
mov r11d,DWORD[12+rdi]
mov r10d,DWORD[16+rdi]
; Recombine: r8 = word 0, r9 = word 1, r10 = word 2.
shl rcx,26 ; limb1 << 26
mov r9,r8
shl r8,52 ; low bits of limb2
add rax,rcx
shr r9,12 ; remaining bits of limb2
add r8,rax
adc r9,0
shl r11,14 ; limb3 << 14
mov rax,r10
shr r10,24 ; upper bits of limb4
add r9,r11
shl rax,40 ; limb4 << 40
add r9,rax
adc r10,0
; Fold the top word: keep two bits, add (r10 & ~3) + (r10 >> 2)
; = 5*(r10 >> 2) into the low word.
mov rax,r10
mov rcx,r10
and r10,3
shr rax,2
and rcx,-4
add rax,rcx
add r8,rax
adc r9,0
adc r10,0
; Final step: compute h + 5 and keep its low 128 bits if it reached
; bit 130, otherwise keep h.
mov rax,r8
add r8,5
mov rcx,r9
adc r9,0
adc r10,0
shr r10,2
cmovnz rax,r8
cmovnz rcx,r9
; Add the 128-bit value at rdx and store the 16-byte result.
add rax,QWORD[rdx]
adc rcx,QWORD[8+rdx]
mov QWORD[rsi],rax
mov QWORD[8+rsi],rcx
mov rdi,QWORD[8+rsp] ;WIN64 epilogue
mov rsi,QWORD[16+rsp]
DB 0F3h,0C3h ;repret
$L$SEH_end_poly1305_emit_avx:
;-----------------------------------------------------------------------
; poly1305_blocks_avx2 -- AVX2 block function (entry and scalar pre-blocks).
; ABI:   Microsoft x64; arguments remapped rcx,rdx,r8,r9 -> rdi,rsi,rdx,rcx
;        (state, input, byte count, value added to the top limb).
; Dispatch mirrors poly1305_blocks_avx, except that the vector code works
; on 64-byte groups:
;   * count < 128 and format flag (state+20) zero -> scalar $L$blocks
;   * flag zero otherwise                          -> $L$base2_64_avx2
;   * flag set, count a multiple of 64             -> $L$even_avx2
;   * flag set otherwise -> fall through: convert the limbs to 64-bit
;     words and run __poly1305_block on single blocks until the remaining
;     count is a multiple of 64, then convert back.
;-----------------------------------------------------------------------
ALIGN 32
poly1305_blocks_avx2:
mov QWORD[8+rsp],rdi ;WIN64 prologue
mov QWORD[16+rsp],rsi
mov rax,rsp
$L$SEH_begin_poly1305_blocks_avx2:
mov rdi,rcx
mov rsi,rdx
mov rdx,r8
mov rcx,r9
mov r8d,DWORD[20+rdi] ; r8d = accumulator format flag
cmp rdx,128
jae NEAR $L$blocks_avx2
test r8d,r8d
jz NEAR $L$blocks
$L$blocks_avx2:
and rdx,-16 ; whole blocks only
jz NEAR $L$no_data_avx2
vzeroupper
test r8d,r8d
jz NEAR $L$base2_64_avx2
test rdx,63
jz NEAR $L$even_avx2
push rbx
push rbp
push r12
push r13
push r14
push r15
$L$blocks_avx2_body:
mov r15,rdx ; r15 = remaining byte count
; Load the five 26-bit limbs (two per qword, plus one dword) and r0/r1.
mov r8,QWORD[rdi]
mov r9,QWORD[8+rdi]
mov ebp,DWORD[16+rdi]
mov r11,QWORD[24+rdi]
mov r13,QWORD[32+rdi]
; Recombine the limbs into three 64-bit words r14:rbx:rbp.
mov r14d,r8d
and r8,-2147483648
mov r12,r9
mov ebx,r9d
and r9,-2147483648
shr r8,6
shl r12,52
add r14,r8
shr rbx,12
shr r9,18
add r14,r12
adc rbx,r9
mov r8,rbp
shl r8,40
shr rbp,24
add rbx,r8
adc rbp,0
; Fold the top word: keep two bits, add 5*(rbp >> 2) to the low word.
mov r9,-4
mov r8,rbp
and r9,rbp
shr r8,2
and rbp,3
add r8,r9
add r14,r8
adc rbx,0
adc rbp,0
mov r12,r13 ; r12 = r1
mov rax,r13 ; rax = r1 for __poly1305_block
shr r13,2
add r13,r12 ; r13 = s1 = r1 + (r1 >> 2)
$L$base2_26_pre_avx2:
; Absorb single blocks until r15 is a multiple of 64.
add r14,QWORD[rsi]
adc rbx,QWORD[8+rsi]
lea rsi,[16+rsi]
adc rbp,rcx
sub r15,16
call __poly1305_block
mov rax,r12 ; reload r1 (rax is clobbered by the call)
test r15,63
jnz NEAR $L$base2_26_pre_avx2
; When the fourth argument is zero, leave the result in 64-bit words.
test rcx,rcx
jz NEAR $L$store_base2_64_avx2
; Split r14:rbx:rbp back into five limbs: rax, rdx, r14, rbx, rbp.
mov rax,r14
mov rdx,r14
shr r14,52
mov r11,rbx
mov r12,rbx
shr rdx,26
and rax,0x3ffffff
shl r11,12
and rdx,0x3ffffff
shr rbx,14
or r14,r11
shl rbp,24
and r14,0x3ffffff
shr r12,40
and rbx,0x3ffffff
or rbp,r12
test r15,r15
jz NEAR $L$store_base2_26_avx2
; More data follows: hand the limbs to the vector code in xmm0..xmm4.
vmovd xmm0,eax
vmovd xmm1,edx
vmovd xmm2,r14d
vmovd xmm3,ebx
vmovd xmm4,ebp
jmp NEAR $L$proceed_avx2
ALIGN 32
$L$store_base2_64_avx2:
mov QWORD[rdi],r14
mov QWORD[8+rdi],rbx
mov QWORD[16+rdi],rbp ; 8-byte store: also overwrites the format flag at +20 with the upper half of rbp
jmp NEAR $L$done_avx2
ALIGN 16
$L$store_base2_26_avx2:
mov DWORD[rdi],eax
mov DWORD[4+rdi],edx
mov DWORD[8+rdi],r14d
mov DWORD[12+rdi],ebx
mov DWORD[16+rdi],ebp
ALIGN 16
$L$done_avx2:
; Restore the six pushed registers and drop the frame.
mov r15,QWORD[rsp]
mov r14,QWORD[8+rsp]
mov r13,QWORD[16+rsp]
mov r12,QWORD[24+rsp]
mov rbp,QWORD[32+rsp]
mov rbx,QWORD[40+rsp]
lea rsp,[48+rsp]
$L$no_data_avx2:
$L$blocks_avx2_epilogue:
mov rdi,QWORD[8+rsp] ;WIN64 epilogue
mov rsi,QWORD[16+rsp]
DB 0F3h,0C3h ;repret
; $L$base2_64_avx2 -- part of poly1305_blocks_avx2, taken when the format
; flag at state+20 is zero (accumulator still held as 64-bit words).
; Single blocks are absorbed with __poly1305_block until the remaining
; count is a multiple of 64.  The accumulator is then split into five
; 26-bit limbs in xmm0..xmm4, the format flag is set to 1, the power table
; is built with __poly1305_init_avx, and control continues at $L$do_avx2.
ALIGN 32
$L$base2_64_avx2:
push rbx
push rbp
push r12
push r13
push r14
push r15
$L$base2_64_avx2_body:
mov r15,rdx ; r15 = remaining byte count
mov r11,QWORD[24+rdi] ; r0
mov r13,QWORD[32+rdi] ; r1
mov r14,QWORD[rdi] ; h0
mov rbx,QWORD[8+rdi] ; h1
mov ebp,DWORD[16+rdi] ; h2 (low dword only)
mov r12,r13
mov rax,r13
shr r13,2
add r13,r12 ; s1 = r1 + (r1 >> 2)
test rdx,63
jz NEAR $L$init_avx2
$L$base2_64_pre_avx2:
add r14,QWORD[rsi]
adc rbx,QWORD[8+rsi]
lea rsi,[16+rsi]
adc rbp,rcx
sub r15,16
call __poly1305_block
mov rax,r12 ; reload r1 (rax is clobbered by the call)
test r15,63
jnz NEAR $L$base2_64_pre_avx2
$L$init_avx2:
; Split r14:rbx:rbp into five limbs: rax, rdx, r14, rbx, rbp.
mov rax,r14
mov rdx,r14
shr r14,52
mov r8,rbx
mov r9,rbx
shr rdx,26
and rax,0x3ffffff
shl r8,12
and rdx,0x3ffffff
shr rbx,14
or r14,r8
shl rbp,24
and r14,0x3ffffff
shr r9,40
and rbx,0x3ffffff
or rbp,r9
vmovd xmm0,eax
vmovd xmm1,edx
vmovd xmm2,r14d
vmovd xmm3,ebx
vmovd xmm4,ebp
mov DWORD[20+rdi],1 ; mark the state as holding 26-bit limbs
call __poly1305_init_avx
$L$proceed_avx2:
; Common hand-over to the vector code (also reached from the pre-block
; path of poly1305_blocks_avx2): rdx = remaining bytes, callee-saved
; registers restored, frame dropped.
mov rdx,r15
mov r15,QWORD[rsp]
mov r14,QWORD[8+rsp]
mov r13,QWORD[16+rsp]
mov r12,QWORD[24+rsp]
mov rbp,QWORD[32+rsp]
mov rbx,QWORD[40+rsp]
lea rax,[48+rsp]
lea rsp,[48+rsp]
$L$base2_64_avx2_epilogue:
jmp NEAR $L$do_avx2
; $L$even_avx2 / $L$do_avx2 -- vector-path setup of poly1305_blocks_avx2.
; On arrival at $L$do_avx2: xmm0..xmm4 = the five accumulator limbs,
; rdi = state, rsi = input, rdx = byte count (a multiple of 64).
; Register roles from here on:
;   ymm0..ymm4    accumulator limbs h0..h4 (one partial sum per qword lane)
;   ymm5          26-bit mask ($L$mask26); reused for table row 8 in the loop
;   ymm7,8,9,10,6 limbs 0,1,2,3,4 of freshly loaded input (four blocks)
;   ymm11..ymm15  product accumulators d0..d4
;   rcx = $L$const, r11 = entry rsp - 248 (frame anchor), rax = rsp + 144
ALIGN 32
$L$even_avx2:
; Accumulator is already stored as five dword limbs: load them directly.
vmovd xmm0,DWORD[rdi]
vmovd xmm1,DWORD[4+rdi]
vmovd xmm2,DWORD[8+rdi]
vmovd xmm3,DWORD[12+rdi]
vmovd xmm4,DWORD[16+rdi]
$L$do_avx2:
; Allocate the frame and save xmm6..xmm15 (callee-saved on Win64).
lea r11,[((-248))+rsp]
sub rsp,0x1c8
vmovdqa XMMWORD[80+r11],xmm6
vmovdqa XMMWORD[96+r11],xmm7
vmovdqa XMMWORD[112+r11],xmm8
vmovdqa XMMWORD[128+r11],xmm9
vmovdqa XMMWORD[144+r11],xmm10
vmovdqa XMMWORD[160+r11],xmm11
vmovdqa XMMWORD[176+r11],xmm12
vmovdqa XMMWORD[192+r11],xmm13
vmovdqa XMMWORD[208+r11],xmm14
vmovdqa XMMWORD[224+r11],xmm15
$L$do_avx2_body:
lea rcx,[$L$const]
lea rdi,[((48+64))+rdi] ; bias rdi: table row k is at [16*k-64+rdi]
vmovdqa ymm7,YMMWORD[96+rcx] ; $L$permd_avx2 index vector
; Expand the power table onto the (512-byte aligned) stack.  Each 16-byte
; row holds the dwords (r^2, r, r^4, r^3).  With indices 2,2,2,3,2,0,2,1
; the vpermd result has r^4 in the low dword of every qword lane and
; r^4, r^3, r^2, r in the high dwords of lanes 0..3.
; Row k lands at [32*k+rsp]; rax = rsp+144 is a second base for rows 1..8.
vmovdqu xmm9,XMMWORD[((-64))+rdi]
and rsp,-512
vmovdqu xmm10,XMMWORD[((-48))+rdi]
vmovdqu xmm6,XMMWORD[((-32))+rdi]
vmovdqu xmm11,XMMWORD[((-16))+rdi]
vmovdqu xmm12,XMMWORD[rdi]
vmovdqu xmm13,XMMWORD[16+rdi]
lea rax,[144+rsp]
vmovdqu xmm14,XMMWORD[32+rdi]
vpermd ymm9,ymm7,ymm9
vmovdqu xmm15,XMMWORD[48+rdi]
vpermd ymm10,ymm7,ymm10
vmovdqu xmm5,XMMWORD[64+rdi]
vpermd ymm6,ymm7,ymm6
vmovdqa YMMWORD[rsp],ymm9
vpermd ymm11,ymm7,ymm11
vmovdqa YMMWORD[(32-144)+rax],ymm10
vpermd ymm12,ymm7,ymm12
vmovdqa YMMWORD[(64-144)+rax],ymm6
vpermd ymm13,ymm7,ymm13
vmovdqa YMMWORD[(96-144)+rax],ymm11
vpermd ymm14,ymm7,ymm14
vmovdqa YMMWORD[(128-144)+rax],ymm12
vpermd ymm15,ymm7,ymm15
vmovdqa YMMWORD[(160-144)+rax],ymm13
vpermd ymm5,ymm7,ymm5
vmovdqa YMMWORD[(192-144)+rax],ymm14
vmovdqa YMMWORD[(224-144)+rax],ymm15
vmovdqa YMMWORD[(256-144)+rax],ymm5
vmovdqa ymm5,YMMWORD[64+rcx] ; $L$mask26
; Load four 16-byte blocks (blocks 0,1 in the low 128-bit halves, blocks
; 2,3 in the high halves) and split them into 26-bit limbs.
vmovdqu xmm7,XMMWORD[rsi]
vmovdqu xmm8,XMMWORD[16+rsi]
vinserti128 ymm7,ymm7,XMMWORD[32+rsi],1
vinserti128 ymm8,ymm8,XMMWORD[48+rsi],1
lea rsi,[64+rsi]
vpsrldq ymm9,ymm7,6
vpsrldq ymm10,ymm8,6
vpunpckhqdq ymm6,ymm7,ymm8
vpunpcklqdq ymm9,ymm9,ymm10
vpunpcklqdq ymm7,ymm7,ymm8
vpsrlq ymm10,ymm9,30
vpsrlq ymm9,ymm9,4
vpsrlq ymm8,ymm7,26
vpsrlq ymm6,ymm6,40
vpand ymm9,ymm9,ymm5
vpand ymm7,ymm7,ymm5
vpand ymm8,ymm8,ymm5
vpand ymm10,ymm10,ymm5
vpor ymm6,ymm6,YMMWORD[32+rcx] ; OR 1<<24 ($L$129) into limb 4
vpaddq ymm2,ymm9,ymm2 ; h2 += input limb 2 (the other limbs are added at the top of the loop/tail)
sub rdx,64
jz NEAR $L$tail_avx2
jmp NEAR $L$oop_avx2
; $L$oop_avx2 -- main loop of poly1305_blocks_avx2, 64 input bytes per pass.
; Adds the pending input limbs to the accumulator, multiplies all four
; lanes by the table entries read with aligned loads (low dword of every
; lane = the r^4 entry), loads and splits the next four blocks, and carries
; the sums back to 26-bit limbs.
; Limb products follow the scheme
;   d0 = h0*r0 + h4*s1 + h3*s2 + h2*s3 + h1*s4
;   d1 = h1*r0 + h0*r1 + h4*s2 + h3*s3 + h2*s4
;   d2 = h2*r0 + h1*r1 + h0*r2 + h4*s3 + h3*s4
;   d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*s4
;   d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4        (sK = 5*rK)
; Table rows (rax = rsp+144): [rsp]=r0, [32+rsp]=r1, [64+rsp]=s1,
; [96+rsp]=r2, [-16+rax]=s2, [16+rax]=r3, [48+rax]=s3, [80+rax]=r4,
; [112+rax]=s4.
ALIGN 32
$L$oop_avx2:
; Accumulate the pending input (limb 2 was already added) and fetch rows.
vpaddq ymm0,ymm7,ymm0
vmovdqa ymm7,YMMWORD[rsp] ; r0
vpaddq ymm1,ymm8,ymm1
vmovdqa ymm8,YMMWORD[32+rsp] ; r1
vpaddq ymm3,ymm10,ymm3
vmovdqa ymm9,YMMWORD[96+rsp] ; r2
vpaddq ymm4,ymm6,ymm4
vmovdqa ymm10,YMMWORD[48+rax] ; s3
vmovdqa ymm5,YMMWORD[112+rax] ; s4 (ymm5 stops being the mask here)
; All products involving h2 first, so ymm2 can be reused as a temporary.
vpmuludq ymm13,ymm7,ymm2
vpmuludq ymm14,ymm8,ymm2
vpmuludq ymm15,ymm9,ymm2
vpmuludq ymm11,ymm10,ymm2
vpmuludq ymm12,ymm5,ymm2
vpmuludq ymm6,ymm8,ymm0
vpmuludq ymm2,ymm8,ymm1
vpaddq ymm12,ymm12,ymm6
vpaddq ymm13,ymm13,ymm2
vpmuludq ymm6,ymm8,ymm3
vpmuludq ymm2,ymm4,YMMWORD[64+rsp] ; * s1
vpaddq ymm15,ymm15,ymm6
vpaddq ymm11,ymm11,ymm2
vmovdqa ymm8,YMMWORD[((-16))+rax] ; s2
vpmuludq ymm6,ymm7,ymm0
vpmuludq ymm2,ymm7,ymm1
vpaddq ymm11,ymm11,ymm6
vpaddq ymm12,ymm12,ymm2
vpmuludq ymm6,ymm7,ymm3
vpmuludq ymm2,ymm7,ymm4
vmovdqu xmm7,XMMWORD[rsi] ; next input: block 0
vpaddq ymm14,ymm14,ymm6
vpaddq ymm15,ymm15,ymm2
vinserti128 ymm7,ymm7,XMMWORD[32+rsi],1 ; next input: block 2
vpmuludq ymm6,ymm8,ymm3
vpmuludq ymm2,ymm8,ymm4
vmovdqu xmm8,XMMWORD[16+rsi] ; next input: block 1
vpaddq ymm11,ymm11,ymm6
vpaddq ymm12,ymm12,ymm2
vmovdqa ymm2,YMMWORD[16+rax] ; r3
vpmuludq ymm6,ymm9,ymm1
vpmuludq ymm9,ymm9,ymm0
vpaddq ymm14,ymm14,ymm6
vpaddq ymm13,ymm13,ymm9
vinserti128 ymm8,ymm8,XMMWORD[48+rsi],1 ; next input: block 3
lea rsi,[64+rsi]
vpmuludq ymm6,ymm2,ymm1
vpmuludq ymm2,ymm2,ymm0
vpsrldq ymm9,ymm7,6
vpaddq ymm15,ymm15,ymm6
vpaddq ymm14,ymm14,ymm2
vpmuludq ymm6,ymm10,ymm3
vpmuludq ymm2,ymm10,ymm4
vpsrldq ymm10,ymm8,6
vpaddq ymm12,ymm12,ymm6
vpaddq ymm13,ymm13,ymm2
vpunpckhqdq ymm6,ymm7,ymm8
vpmuludq ymm3,ymm5,ymm3
vpmuludq ymm4,ymm5,ymm4
vpunpcklqdq ymm7,ymm7,ymm8
; From here the sums move back into ymm0..ymm4 (ymm1 is formed during the
; carry pass below from ymm12).
vpaddq ymm2,ymm13,ymm3
vpaddq ymm3,ymm14,ymm4
vpunpcklqdq ymm10,ymm9,ymm10
vpmuludq ymm4,ymm0,YMMWORD[80+rax] ; * r4
vpmuludq ymm0,ymm5,ymm1
vmovdqa ymm5,YMMWORD[64+rcx] ; ymm5 = 26-bit mask again
vpaddq ymm4,ymm15,ymm4
vpaddq ymm0,ymm11,ymm0
; Carry propagation (h3->h4, h0->h1, h4->h0 times 5, h1->h2, h2->h3,
; h0->h1, h3->h4), interleaved with splitting the next input into limbs
; in ymm7,8,9,10,6.
vpsrlq ymm14,ymm3,26
vpand ymm3,ymm3,ymm5
vpaddq ymm4,ymm4,ymm14
vpsrlq ymm11,ymm0,26
vpand ymm0,ymm0,ymm5
vpaddq ymm1,ymm12,ymm11
vpsrlq ymm15,ymm4,26
vpand ymm4,ymm4,ymm5
vpsrlq ymm9,ymm10,4
vpsrlq ymm12,ymm1,26
vpand ymm1,ymm1,ymm5
vpaddq ymm2,ymm2,ymm12
vpaddq ymm0,ymm0,ymm15
vpsllq ymm15,ymm15,2
vpaddq ymm0,ymm0,ymm15
vpand ymm9,ymm9,ymm5
vpsrlq ymm8,ymm7,26
vpsrlq ymm13,ymm2,26
vpand ymm2,ymm2,ymm5
vpaddq ymm3,ymm3,ymm13
vpaddq ymm2,ymm2,ymm9 ; h2 += next input limb 2 (added ahead of the next pass)
vpsrlq ymm10,ymm10,30
vpsrlq ymm11,ymm0,26
vpand ymm0,ymm0,ymm5
vpaddq ymm1,ymm1,ymm11
vpsrlq ymm6,ymm6,40
vpsrlq ymm14,ymm3,26
vpand ymm3,ymm3,ymm5
vpaddq ymm4,ymm4,ymm14
vpand ymm7,ymm7,ymm5
vpand ymm8,ymm8,ymm5
vpand ymm10,ymm10,ymm5
vpor ymm6,ymm6,YMMWORD[32+rcx] ; OR 1<<24 into limb 4
sub rdx,64
jnz NEAR $L$oop_avx2
DB 0x66,0x90
; $L$tail_avx2 -- last multiplication, final reduction and exit of
; poly1305_blocks_avx2.
; Same multiply schedule as the main loop, but every table row is read
; with an UNALIGNED load displaced by 4 bytes.  That moves the former high
; dword of each qword lane into the multiplier position, so lanes 0..3 are
; multiplied by the r^4, r^3, r^2 and r entries respectively instead of by
; r^4 in every lane.
$L$tail_avx2:
vpaddq ymm0,ymm7,ymm0
vmovdqu ymm7,YMMWORD[4+rsp] ; r0
vpaddq ymm1,ymm8,ymm1
vmovdqu ymm8,YMMWORD[36+rsp] ; r1
vpaddq ymm3,ymm10,ymm3
vmovdqu ymm9,YMMWORD[100+rsp] ; r2
vpaddq ymm4,ymm6,ymm4
vmovdqu ymm10,YMMWORD[52+rax] ; s3
vmovdqu ymm5,YMMWORD[116+rax] ; s4
vpmuludq ymm13,ymm7,ymm2
vpmuludq ymm14,ymm8,ymm2
vpmuludq ymm15,ymm9,ymm2
vpmuludq ymm11,ymm10,ymm2
vpmuludq ymm12,ymm5,ymm2
vpmuludq ymm6,ymm8,ymm0
vpmuludq ymm2,ymm8,ymm1
vpaddq ymm12,ymm12,ymm6
vpaddq ymm13,ymm13,ymm2
vpmuludq ymm6,ymm8,ymm3
vpmuludq ymm2,ymm4,YMMWORD[68+rsp] ; * s1
vpaddq ymm15,ymm15,ymm6
vpaddq ymm11,ymm11,ymm2
vpmuludq ymm6,ymm7,ymm0
vpmuludq ymm2,ymm7,ymm1
vpaddq ymm11,ymm11,ymm6
vmovdqu ymm8,YMMWORD[((-12))+rax] ; s2
vpaddq ymm12,ymm12,ymm2
vpmuludq ymm6,ymm7,ymm3
vpmuludq ymm2,ymm7,ymm4
vpaddq ymm14,ymm14,ymm6
vpaddq ymm15,ymm15,ymm2
vpmuludq ymm6,ymm8,ymm3
vpmuludq ymm2,ymm8,ymm4
vpaddq ymm11,ymm11,ymm6
vpaddq ymm12,ymm12,ymm2
vmovdqu ymm2,YMMWORD[20+rax] ; r3
vpmuludq ymm6,ymm9,ymm1
vpmuludq ymm9,ymm9,ymm0
vpaddq ymm14,ymm14,ymm6
vpaddq ymm13,ymm13,ymm9
vpmuludq ymm6,ymm2,ymm1
vpmuludq ymm2,ymm2,ymm0
vpaddq ymm15,ymm15,ymm6
vpaddq ymm14,ymm14,ymm2
vpmuludq ymm6,ymm10,ymm3
vpmuludq ymm2,ymm10,ymm4
vpaddq ymm12,ymm12,ymm6
vpaddq ymm13,ymm13,ymm2
vpmuludq ymm3,ymm5,ymm3
vpmuludq ymm4,ymm5,ymm4
vpaddq ymm2,ymm13,ymm3
vpaddq ymm3,ymm14,ymm4
vpmuludq ymm4,ymm0,YMMWORD[84+rax] ; * r4
vpmuludq ymm0,ymm5,ymm1
vmovdqa ymm5,YMMWORD[64+rcx] ; ymm5 = 26-bit mask again
vpaddq ymm4,ymm15,ymm4
vpaddq ymm0,ymm11,ymm0
; Horizontal add, step 1: fold the odd qword lane onto the even one within
; each 128-bit half.  The sums are in ymm0, ymm12 (limb 1), ymm2, ymm3, ymm4.
vpsrldq ymm8,ymm12,8
vpsrldq ymm9,ymm2,8
vpsrldq ymm10,ymm3,8
vpsrldq ymm6,ymm4,8
vpsrldq ymm7,ymm0,8
vpaddq ymm12,ymm12,ymm8
vpaddq ymm2,ymm2,ymm9
vpaddq ymm3,ymm3,ymm10
vpaddq ymm4,ymm4,ymm6
vpaddq ymm0,ymm0,ymm7
; Step 2: bring qword 2 (the high half's sum) down to qword 0 and add.
vpermq ymm10,ymm3,0x2
vpermq ymm6,ymm4,0x2
vpermq ymm7,ymm0,0x2
vpermq ymm8,ymm12,0x2
vpermq ymm9,ymm2,0x2
vpaddq ymm3,ymm3,ymm10
vpaddq ymm4,ymm4,ymm6
vpaddq ymm0,ymm0,ymm7
vpaddq ymm12,ymm12,ymm8
vpaddq ymm2,ymm2,ymm9
; Carry propagation to 26-bit limbs: h3->h4, h0->h1, h4->h0 (times 5:
; added once, then shifted left by 2 and added again), h1->h2, h2->h3,
; h0->h1, h3->h4.
vpsrlq ymm14,ymm3,26
vpand ymm3,ymm3,ymm5
vpaddq ymm4,ymm4,ymm14
vpsrlq ymm11,ymm0,26
vpand ymm0,ymm0,ymm5
vpaddq ymm1,ymm12,ymm11
vpsrlq ymm15,ymm4,26
vpand ymm4,ymm4,ymm5
vpsrlq ymm12,ymm1,26
vpand ymm1,ymm1,ymm5
vpaddq ymm2,ymm2,ymm12
vpaddq ymm0,ymm0,ymm15
vpsllq ymm15,ymm15,2
vpaddq ymm0,ymm0,ymm15
vpsrlq ymm13,ymm2,26
vpand ymm2,ymm2,ymm5
vpaddq ymm3,ymm3,ymm13
vpsrlq ymm11,ymm0,26
vpand ymm0,ymm0,ymm5
vpaddq ymm1,ymm1,ymm11
vpsrlq ymm14,ymm3,26
vpand ymm3,ymm3,ymm5
vpaddq ymm4,ymm4,ymm14
; Store the five limbs at state+0..+16 (rdi is still biased by +112).
vmovd DWORD[(-112)+rdi],xmm0
vmovd DWORD[(-108)+rdi],xmm1
vmovd DWORD[(-104)+rdi],xmm2
vmovd DWORD[(-100)+rdi],xmm3
vmovd DWORD[(-96)+rdi],xmm4
; Restore xmm6..xmm15 and release the frame via r11 (rsp itself was
; re-aligned with `and rsp,-512`, so it cannot be unwound by addition).
vmovdqa xmm6,XMMWORD[80+r11]
vmovdqa xmm7,XMMWORD[96+r11]
vmovdqa xmm8,XMMWORD[112+r11]
vmovdqa xmm9,XMMWORD[128+r11]
vmovdqa xmm10,XMMWORD[144+r11]
vmovdqa xmm11,XMMWORD[160+r11]
vmovdqa xmm12,XMMWORD[176+r11]
vmovdqa xmm13,XMMWORD[192+r11]
vmovdqa xmm14,XMMWORD[208+r11]
vmovdqa xmm15,XMMWORD[224+r11]
lea rsp,[248+r11]
$L$do_avx2_epilogue:
vzeroupper
mov rdi,QWORD[8+rsp] ;WIN64 epilogue
mov rsi,QWORD[16+rsp]
DB 0F3h,0C3h ;repret
$L$SEH_end_poly1305_blocks_avx2:
ALIGN 32
; ---------------------------------------------------------------------
; poly1305_blocks_avx512 (Win64 entry point)
; The four Win64 argument registers rcx, rdx, r8, r9 are moved into
; rdi, rsi, rdx, rcx right after the prologue:
;   rdi = context: hash limbs at +0..+16, dword flag at +20, key words
;         at +24 and +32 (as read by the code below)
;   rsi = input pointer
;   rdx = byte length (rounded down to a multiple of 16 below)
;   rcx = value added into the top limb of every 16-byte block
;         (the "pad bit" in upstream OpenSSL naming - TODO confirm)
; rdi and rsi are non-volatile on Win64, so they are parked in the
; caller-provided home slots [rsp+8] / [rsp+16] and reloaded on return.
; rax = rsp at entry is what the SEH handlers read while still in the
; prologue.
; ---------------------------------------------------------------------
poly1305_blocks_avx512:
mov QWORD[8+rsp],rdi ;WIN64 prologue
mov QWORD[16+rsp],rsi
mov rax,rsp
$L$SEH_begin_poly1305_blocks_avx512:
mov rdi,rcx
mov rsi,rdx
mov rdx,r8
mov rcx,r9
; r8d = dword flag at ctx+20. Zero selects the base 2^64 code paths,
; non-zero the base 2^26 ones (see the branches below).
mov r8d,DWORD[20+rdi]
; Fewer than 128 bytes with the flag still zero: hand over to the
; scalar loop at $L$blocks (register assignment is identical there).
cmp rdx,128
jae NEAR $L$blocks_avx2_512
test r8d,r8d
jz NEAR $L$blocks
$L$blocks_avx2_512:
; Ignore a trailing partial block; return if no full block remains.
and rdx,-16
jz NEAR $L$no_data_avx2_512
vzeroupper
test r8d,r8d
jz NEAR $L$base2_64_avx2_512
; Flag set and length already a multiple of 64: no scalar pre-processing.
test rdx,63
jz NEAR $L$even_avx2_512
; Otherwise process single blocks with scalar code until the remaining
; length is a multiple of 64. Six non-volatile GPRs are saved (48 bytes).
push rbx
push rbp
push r12
push r13
push r14
push r15
$L$blocks_avx2_body_512:
; r15 = remaining length (rdx is reused as scratch further down).
mov r15,rdx
; Load the five 32-bit limb slots (two per qword, plus the fifth dword).
mov r8,QWORD[rdi]
mov r9,QWORD[8+rdi]
mov ebp,DWORD[16+rdi]
; r11 / r13 = the two key words stored at ctx+24 and ctx+32.
mov r11,QWORD[24+rdi]
mov r13,QWORD[32+rdi]
; Convert the hash from base 2^26 limbs to base 2^64 in r14:rbx:rbp.
mov r14d,r8d
and r8,-2147483648
mov r12,r9
mov ebx,r9d
and r9,-2147483648
shr r8,6
shl r12,52
add r14,r8
shr rbx,12
shr r9,18
add r14,r12
adc rbx,r9
mov r8,rbp
shl r8,40
shr rbp,24
add rbx,r8
adc rbp,0
; Fold everything above bit 2 of the top limb back into the low limb:
; r9 = rbp with its two low bits cleared (4 * (rbp shr 2)) and
; r8 = rbp shr 2, so r8 + r9 = 5 * (rbp shr 2).
mov r9,-4
mov r8,rbp
and r9,rbp
shr r8,2
and rbp,3
add r8,r9
add r14,r8
adc rbx,0
adc rbp,0
; r12 = rax = key word from ctx+32; r13 = that word + (that word shr 2).
mov r12,r13
mov rax,r13
shr r13,2
add r13,r12
$L$base2_26_pre_avx2_512:
; Add one 16-byte block plus rcx into r14:rbx:rbp, advance the input and
; call the scalar block routine. __poly1305_block is defined outside this
; chunk - presumably it multiplies the accumulator by the key; confirm.
add r14,QWORD[rsi]
adc rbx,QWORD[8+rsi]
lea rsi,[16+rsi]
adc rbp,rcx
sub r15,16
call __poly1305_block
; Reload rax from r12 for the next iteration (the helper presumably
; clobbers it - confirm against __poly1305_block).
mov rax,r12
; Repeat until the remaining length is a multiple of 64.
test r15,63
jnz NEAR $L$base2_26_pre_avx2_512
; Fourth argument zero: write the hash back in base 2^64 form instead.
test rcx,rcx
jz NEAR $L$store_base2_64_avx2_512
; Convert r14:rbx:rbp back to five 26-bit limbs in
; eax, edx, r14d, ebx, ebp.
mov rax,r14
mov rdx,r14
shr r14,52
mov r11,rbx
mov r12,rbx
shr rdx,26
and rax,0x3ffffff
shl r11,12
and rdx,0x3ffffff
shr rbx,14
or r14,r11
shl rbp,24
and r14,0x3ffffff
shr r12,40
and rbx,0x3ffffff
or rbp,r12
; Nothing left for the vector code: store the limbs and return.
test r15,r15
jz NEAR $L$store_base2_26_avx2_512
; Otherwise hand the limbs to the vector code in xmm0..xmm4.
vmovd xmm0,eax
vmovd xmm1,edx
vmovd xmm2,r14d
vmovd xmm3,ebx
vmovd xmm4,ebp
jmp NEAR $L$proceed_avx2_512
ALIGN 32
$L$store_base2_64_avx2_512:
; Store the hash as three 64-bit words. The 8-byte store at ctx+16 also
; overwrites the dword flag at ctx+20 with the upper half of rbp.
mov QWORD[rdi],r14
mov QWORD[8+rdi],rbx
mov QWORD[16+rdi],rbp
jmp NEAR $L$done_avx2_512
ALIGN 16
$L$store_base2_26_avx2_512:
; Store the hash as five 32-bit limbs at ctx+0..ctx+16.
mov DWORD[rdi],eax
mov DWORD[4+rdi],edx
mov DWORD[8+rdi],r14d
mov DWORD[12+rdi],ebx
mov DWORD[16+rdi],ebp
ALIGN 16
$L$done_avx2_512:
; Reload the six pushed registers and release their 48 bytes.
mov r15,QWORD[rsp]
mov r14,QWORD[8+rsp]
mov r13,QWORD[16+rsp]
mov r12,QWORD[24+rsp]
mov rbp,QWORD[32+rsp]
mov rbx,QWORD[40+rsp]
lea rsp,[48+rsp]
$L$no_data_avx2_512:
$L$blocks_avx2_epilogue_512:
mov rdi,QWORD[8+rsp] ;WIN64 epilogue
mov rsi,QWORD[16+rsp]
DB 0F3h,0C3h ;repret
ALIGN 32
; ---------------------------------------------------------------------
; $L$base2_64_avx2_512
; Reached when the dword flag at ctx+20 is zero, i.e. the hash is still
; held as three 64-bit words at ctx+0, +8, +16. Processes leading blocks
; with scalar code until the length is a multiple of 64, converts the
; hash to five 26-bit limbs, sets the flag and continues with the vector
; code.
; ---------------------------------------------------------------------
$L$base2_64_avx2_512:
push rbx
push rbp
push r12
push r13
push r14
push r15
$L$base2_64_avx2_body_512:
; r15 = remaining length; r11 / r13 = key words from ctx+24 / ctx+32;
; r14:rbx:rbp = hash in base 2^64.
mov r15,rdx
mov r11,QWORD[24+rdi]
mov r13,QWORD[32+rdi]
mov r14,QWORD[rdi]
mov rbx,QWORD[8+rdi]
mov ebp,DWORD[16+rdi]
; r12 = rax = key word from ctx+32; r13 = that word + (that word shr 2).
mov r12,r13
mov rax,r13
shr r13,2
add r13,r12
; Length already a multiple of 64: skip the scalar pre-loop.
test rdx,63
jz NEAR $L$init_avx2_512
$L$base2_64_pre_avx2_512:
; Add one 16-byte block plus rcx into the accumulator and call the
; scalar block routine (__poly1305_block is defined outside this chunk).
add r14,QWORD[rsi]
adc rbx,QWORD[8+rsi]
lea rsi,[16+rsi]
adc rbp,rcx
sub r15,16
call __poly1305_block
; Reload rax from r12 for the next iteration.
mov rax,r12
test r15,63
jnz NEAR $L$base2_64_pre_avx2_512
$L$init_avx2_512:
; Split r14:rbx:rbp into five 26-bit limbs: eax, edx, r14d, ebx, ebp.
mov rax,r14
mov rdx,r14
shr r14,52
mov r8,rbx
mov r9,rbx
shr rdx,26
and rax,0x3ffffff
shl r8,12
and rdx,0x3ffffff
shr rbx,14
or r14,r8
shl rbp,24
and r14,0x3ffffff
shr r9,40
and rbx,0x3ffffff
or rbp,r9
vmovd xmm0,eax
vmovd xmm1,edx
vmovd xmm2,r14d
vmovd xmm3,ebx
vmovd xmm4,ebp
; Mark the context as holding base 2^26 data, then call the table
; initialiser. __poly1305_init_avx is defined outside this chunk -
; presumably it fills the table at ctx+48 that the vector code loads;
; confirm.
mov DWORD[20+rdi],1
call __poly1305_init_avx
$L$proceed_avx2_512:
; rdx = remaining length for the vector code; reload the six pushed
; registers and release their 48 bytes. rax is set to the same address
; as the restored rsp.
mov rdx,r15
mov r15,QWORD[rsp]
mov r14,QWORD[8+rsp]
mov r13,QWORD[16+rsp]
mov r12,QWORD[24+rsp]
mov rbp,QWORD[32+rsp]
mov rbx,QWORD[40+rsp]
lea rax,[48+rsp]
lea rsp,[48+rsp]
$L$base2_64_avx2_epilogue_512:
jmp NEAR $L$do_avx2_512
ALIGN 32
; ---------------------------------------------------------------------
; $L$even_avx2_512 / $L$do_avx2_512
; Vector-path setup. On arrival at $L$do_avx2_512 the five 26-bit hash
; limbs are in the low dwords of xmm0..xmm4 and rdx is a multiple of 64.
; ---------------------------------------------------------------------
$L$even_avx2_512:
; Load the five hash limbs directly from ctx+0..ctx+16.
vmovd xmm0,DWORD[rdi]
vmovd xmm1,DWORD[4+rdi]
vmovd xmm2,DWORD[8+rdi]
vmovd xmm3,DWORD[12+rdi]
vmovd xmm4,DWORD[16+rdi]
$L$do_avx2_512:
; 512 bytes or more go to the AVX-512 code; shorter inputs use AVX2.
cmp rdx,512
jae NEAR $L$blocks_avx512
$L$skip_avx512:
; r11 anchors the xmm save area: xmm6..xmm15 (non-volatile on Win64) are
; stored at [r11+80]..[r11+224] and the epilogue restores rsp as r11+248.
lea r11,[((-248))+rsp]
sub rsp,0x1c8
vmovdqa XMMWORD[80+r11],xmm6
vmovdqa XMMWORD[96+r11],xmm7
vmovdqa XMMWORD[112+r11],xmm8
vmovdqa XMMWORD[128+r11],xmm9
vmovdqa XMMWORD[144+r11],xmm10
vmovdqa XMMWORD[160+r11],xmm11
vmovdqa XMMWORD[176+r11],xmm12
vmovdqa XMMWORD[192+r11],xmm13
vmovdqa XMMWORD[208+r11],xmm14
vmovdqa XMMWORD[224+r11],xmm15
$L$do_avx2_body_512:
; rcx = constant pool; rdi is biased by 112 so the nine 16-byte table
; rows at ctx+48.. are addressed as [rdi-64]..[rdi+64].
lea rcx,[$L$const]
lea rdi,[((48+64))+rdi]
; ymm7 = $L$permd_avx2 dword permutation (offset 96 in the pool).
vmovdqa ymm7,YMMWORD[96+rcx]
vmovdqu xmm9,XMMWORD[((-64))+rdi]
; Align the scratch area; r11 still holds the pre-alignment anchor.
and rsp,-512
vmovdqu xmm10,XMMWORD[((-48))+rdi]
vmovdqu xmm6,XMMWORD[((-32))+rdi]
vmovdqu xmm11,XMMWORD[((-16))+rdi]
vmovdqu xmm12,XMMWORD[rdi]
vmovdqu xmm13,XMMWORD[16+rdi]
; rax = rsp+144 so the upper table rows get short displacements.
lea rax,[144+rsp]
vmovdqu xmm14,XMMWORD[32+rdi]
; Permute each 16-byte row into a 32-byte row and spill the nine rows to
; [rsp+0], [rsp+32], ... [rsp+256].
vpermd ymm9,ymm7,ymm9
vmovdqu xmm15,XMMWORD[48+rdi]
vpermd ymm10,ymm7,ymm10
vmovdqu xmm5,XMMWORD[64+rdi]
vpermd ymm6,ymm7,ymm6
vmovdqa YMMWORD[rsp],ymm9
vpermd ymm11,ymm7,ymm11
vmovdqa YMMWORD[(32-144)+rax],ymm10
vpermd ymm12,ymm7,ymm12
vmovdqa YMMWORD[(64-144)+rax],ymm6
vpermd ymm13,ymm7,ymm13
vmovdqa YMMWORD[(96-144)+rax],ymm11
vpermd ymm14,ymm7,ymm14
vmovdqa YMMWORD[(128-144)+rax],ymm12
vpermd ymm15,ymm7,ymm15
vmovdqa YMMWORD[(160-144)+rax],ymm13
vpermd ymm5,ymm7,ymm5
vmovdqa YMMWORD[(192-144)+rax],ymm14
vmovdqa YMMWORD[(224-144)+rax],ymm15
vmovdqa YMMWORD[(256-144)+rax],ymm5
; ymm5 = $L$mask26 (26-bit limb mask, offset 64 in the pool).
vmovdqa ymm5,YMMWORD[64+rcx]
; Load the first 64 input bytes as four 16-byte blocks spread over
; ymm7 / ymm8.
vmovdqu xmm7,XMMWORD[rsi]
vmovdqu xmm8,XMMWORD[16+rsi]
vinserti128 ymm7,ymm7,XMMWORD[32+rsi],1
vinserti128 ymm8,ymm8,XMMWORD[48+rsi],1
lea rsi,[64+rsi]
; Split the blocks into 26-bit limbs: ymm7, ymm8, ymm9, ymm10, ymm6 hold
; limbs 0..4. The 6-byte shift lets limbs 2 and 3 be cut out of a single
; qword (bit offsets 48+4 and 48+30).
vpsrldq ymm9,ymm7,6
vpsrldq ymm10,ymm8,6
vpunpckhqdq ymm6,ymm7,ymm8
vpunpcklqdq ymm9,ymm9,ymm10
vpunpcklqdq ymm7,ymm7,ymm8
vpsrlq ymm10,ymm9,30
vpsrlq ymm9,ymm9,4
vpsrlq ymm8,ymm7,26
vpsrlq ymm6,ymm6,40
vpand ymm9,ymm9,ymm5
vpand ymm7,ymm7,ymm5
vpand ymm8,ymm8,ymm5
vpand ymm10,ymm10,ymm5
; OR in 2^24 from $L$129 (offset 32 in the pool) on the top limb.
vpor ymm6,ymm6,YMMWORD[32+rcx]
; Limb 2 of the input is accumulated ahead of the other limbs.
vpaddq ymm2,ymm9,ymm2
; Only 64 bytes in total: go straight to the tail.
sub rdx,64
jz NEAR $L$tail_avx2_512
jmp NEAR $L$oop_avx2_512
ALIGN 32
; ---------------------------------------------------------------------
; $L$oop_avx2_512
; AVX2 main loop, 64 input bytes per iteration.
;   ymm0..ymm4          accumulator limbs 0..4
;   ymm7,8,9,10,6       limbs 0,1,2,3,4 of the input loaded previously
;                       (limb 2 has already been added into ymm2)
;   ymm11..ymm15        64-bit partial products
;   ymm5                table row on entry, $L$mask26 from mid-loop on
; Table rows are read from the stack copy at [rsp] / [rax], rax = rsp+144.
; Loading and splitting of the next 64 bytes is interleaved with the
; multiplication and carry propagation, so instruction order matters.
; ---------------------------------------------------------------------
$L$oop_avx2_512:
; Add the pending input limbs and fetch table rows.
vpaddq ymm0,ymm7,ymm0
vmovdqa ymm7,YMMWORD[rsp]
vpaddq ymm1,ymm8,ymm1
vmovdqa ymm8,YMMWORD[32+rsp]
vpaddq ymm3,ymm10,ymm3
vmovdqa ymm9,YMMWORD[96+rsp]
vpaddq ymm4,ymm6,ymm4
vmovdqa ymm10,YMMWORD[48+rax]
vmovdqa ymm5,YMMWORD[112+rax]
; Schoolbook multiply: products accumulate in ymm11..ymm15.
vpmuludq ymm13,ymm7,ymm2
vpmuludq ymm14,ymm8,ymm2
vpmuludq ymm15,ymm9,ymm2
vpmuludq ymm11,ymm10,ymm2
vpmuludq ymm12,ymm5,ymm2
vpmuludq ymm6,ymm8,ymm0
vpmuludq ymm2,ymm8,ymm1
vpaddq ymm12,ymm12,ymm6
vpaddq ymm13,ymm13,ymm2
vpmuludq ymm6,ymm8,ymm3
vpmuludq ymm2,ymm4,YMMWORD[64+rsp]
vpaddq ymm15,ymm15,ymm6
vpaddq ymm11,ymm11,ymm2
vmovdqa ymm8,YMMWORD[((-16))+rax]
vpmuludq ymm6,ymm7,ymm0
vpmuludq ymm2,ymm7,ymm1
vpaddq ymm11,ymm11,ymm6
vpaddq ymm12,ymm12,ymm2
vpmuludq ymm6,ymm7,ymm3
vpmuludq ymm2,ymm7,ymm4
; Start loading the next 64 input bytes (ymm7 / ymm8 are free again).
vmovdqu xmm7,XMMWORD[rsi]
vpaddq ymm14,ymm14,ymm6
vpaddq ymm15,ymm15,ymm2
vinserti128 ymm7,ymm7,XMMWORD[32+rsi],1
vpmuludq ymm6,ymm8,ymm3
vpmuludq ymm2,ymm8,ymm4
vmovdqu xmm8,XMMWORD[16+rsi]
vpaddq ymm11,ymm11,ymm6
vpaddq ymm12,ymm12,ymm2
vmovdqa ymm2,YMMWORD[16+rax]
vpmuludq ymm6,ymm9,ymm1
vpmuludq ymm9,ymm9,ymm0
vpaddq ymm14,ymm14,ymm6
vpaddq ymm13,ymm13,ymm9
vinserti128 ymm8,ymm8,XMMWORD[48+rsi],1
lea rsi,[64+rsi]
vpmuludq ymm6,ymm2,ymm1
vpmuludq ymm2,ymm2,ymm0
vpsrldq ymm9,ymm7,6
vpaddq ymm15,ymm15,ymm6
vpaddq ymm14,ymm14,ymm2
vpmuludq ymm6,ymm10,ymm3
vpmuludq ymm2,ymm10,ymm4
vpsrldq ymm10,ymm8,6
vpaddq ymm12,ymm12,ymm6
vpaddq ymm13,ymm13,ymm2
vpunpckhqdq ymm6,ymm7,ymm8
vpmuludq ymm3,ymm5,ymm3
vpmuludq ymm4,ymm5,ymm4
vpunpcklqdq ymm7,ymm7,ymm8
; New accumulator limbs: ymm2 (limb 2), ymm3 (limb 3), ymm4 (limb 4),
; ymm0 (limb 0); limb 1 is formed in the carry chain below.
vpaddq ymm2,ymm13,ymm3
vpaddq ymm3,ymm14,ymm4
vpunpcklqdq ymm10,ymm9,ymm10
vpmuludq ymm4,ymm0,YMMWORD[80+rax]
vpmuludq ymm0,ymm5,ymm1
; ymm5 = $L$mask26 for the carry chain and the input split.
vmovdqa ymm5,YMMWORD[64+rcx]
vpaddq ymm4,ymm15,ymm4
vpaddq ymm0,ymm11,ymm0
; Carry chain: 3 to 4, 0 to 1, 4 to 0 (times 5, as x + 4x), 1 to 2,
; 2 to 3, 0 to 1, 3 to 4, interleaved with splitting the next input.
vpsrlq ymm14,ymm3,26
vpand ymm3,ymm3,ymm5
vpaddq ymm4,ymm4,ymm14
vpsrlq ymm11,ymm0,26
vpand ymm0,ymm0,ymm5
vpaddq ymm1,ymm12,ymm11
vpsrlq ymm15,ymm4,26
vpand ymm4,ymm4,ymm5
vpsrlq ymm9,ymm10,4
vpsrlq ymm12,ymm1,26
vpand ymm1,ymm1,ymm5
vpaddq ymm2,ymm2,ymm12
vpaddq ymm0,ymm0,ymm15
vpsllq ymm15,ymm15,2
vpaddq ymm0,ymm0,ymm15
vpand ymm9,ymm9,ymm5
vpsrlq ymm8,ymm7,26
vpsrlq ymm13,ymm2,26
vpand ymm2,ymm2,ymm5
vpaddq ymm3,ymm3,ymm13
; Pre-add limb 2 of the next input, as done before entering the loop.
vpaddq ymm2,ymm2,ymm9
vpsrlq ymm10,ymm10,30
vpsrlq ymm11,ymm0,26
vpand ymm0,ymm0,ymm5
vpaddq ymm1,ymm1,ymm11
vpsrlq ymm6,ymm6,40
vpsrlq ymm14,ymm3,26
vpand ymm3,ymm3,ymm5
vpaddq ymm4,ymm4,ymm14
vpand ymm7,ymm7,ymm5
vpand ymm8,ymm8,ymm5
vpand ymm10,ymm10,ymm5
; OR in 2^24 from $L$129 on the top input limb.
vpor ymm6,ymm6,YMMWORD[32+rcx]
sub rdx,64
jnz NEAR $L$oop_avx2_512
; Two-byte NOP (66 90) as padding in front of the tail.
DB 0x66,0x90
; ---------------------------------------------------------------------
; $L$tail_avx2_512
; Final multiplication for the last 64 bytes, followed by the horizontal
; sum of the four lanes, a last carry pass, the store of the hash and the
; function epilogue. Also entered from the AVX-512 tail, which sets rax
; to rsp+144 first.
; The table rows are read with unaligned loads at a 4-byte offset from
; the rows used in the loop; vpmuludq multiplies only the low dword of
; each qword lane, so this selects the neighbouring dword of every entry.
; ---------------------------------------------------------------------
$L$tail_avx2_512:
vpaddq ymm0,ymm7,ymm0
vmovdqu ymm7,YMMWORD[4+rsp]
vpaddq ymm1,ymm8,ymm1
vmovdqu ymm8,YMMWORD[36+rsp]
vpaddq ymm3,ymm10,ymm3
vmovdqu ymm9,YMMWORD[100+rsp]
vpaddq ymm4,ymm6,ymm4
vmovdqu ymm10,YMMWORD[52+rax]
vmovdqu ymm5,YMMWORD[116+rax]
; Same schoolbook multiply as in the loop, without input prefetch.
vpmuludq ymm13,ymm7,ymm2
vpmuludq ymm14,ymm8,ymm2
vpmuludq ymm15,ymm9,ymm2
vpmuludq ymm11,ymm10,ymm2
vpmuludq ymm12,ymm5,ymm2
vpmuludq ymm6,ymm8,ymm0
vpmuludq ymm2,ymm8,ymm1
vpaddq ymm12,ymm12,ymm6
vpaddq ymm13,ymm13,ymm2
vpmuludq ymm6,ymm8,ymm3
vpmuludq ymm2,ymm4,YMMWORD[68+rsp]
vpaddq ymm15,ymm15,ymm6
vpaddq ymm11,ymm11,ymm2
vpmuludq ymm6,ymm7,ymm0
vpmuludq ymm2,ymm7,ymm1
vpaddq ymm11,ymm11,ymm6
vmovdqu ymm8,YMMWORD[((-12))+rax]
vpaddq ymm12,ymm12,ymm2
vpmuludq ymm6,ymm7,ymm3
vpmuludq ymm2,ymm7,ymm4
vpaddq ymm14,ymm14,ymm6
vpaddq ymm15,ymm15,ymm2
vpmuludq ymm6,ymm8,ymm3
vpmuludq ymm2,ymm8,ymm4
vpaddq ymm11,ymm11,ymm6
vpaddq ymm12,ymm12,ymm2
vmovdqu ymm2,YMMWORD[20+rax]
vpmuludq ymm6,ymm9,ymm1
vpmuludq ymm9,ymm9,ymm0
vpaddq ymm14,ymm14,ymm6
vpaddq ymm13,ymm13,ymm9
vpmuludq ymm6,ymm2,ymm1
vpmuludq ymm2,ymm2,ymm0
vpaddq ymm15,ymm15,ymm6
vpaddq ymm14,ymm14,ymm2
vpmuludq ymm6,ymm10,ymm3
vpmuludq ymm2,ymm10,ymm4
vpaddq ymm12,ymm12,ymm6
vpaddq ymm13,ymm13,ymm2
vpmuludq ymm3,ymm5,ymm3
vpmuludq ymm4,ymm5,ymm4
vpaddq ymm2,ymm13,ymm3
vpaddq ymm3,ymm14,ymm4
vpmuludq ymm4,ymm0,YMMWORD[84+rax]
vpmuludq ymm0,ymm5,ymm1
; ymm5 = $L$mask26 for the final carry pass.
vmovdqa ymm5,YMMWORD[64+rcx]
vpaddq ymm4,ymm15,ymm4
vpaddq ymm0,ymm11,ymm0
; Horizontal sum, step 1: add the upper qword of each 128-bit half onto
; the lower one.
vpsrldq ymm8,ymm12,8
vpsrldq ymm9,ymm2,8
vpsrldq ymm10,ymm3,8
vpsrldq ymm6,ymm4,8
vpsrldq ymm7,ymm0,8
vpaddq ymm12,ymm12,ymm8
vpaddq ymm2,ymm2,ymm9
vpaddq ymm3,ymm3,ymm10
vpaddq ymm4,ymm4,ymm6
vpaddq ymm0,ymm0,ymm7
; Step 2: bring qword 2 down to position 0 and add, so qword 0 of each
; register holds the sum over all four lanes.
vpermq ymm10,ymm3,0x2
vpermq ymm6,ymm4,0x2
vpermq ymm7,ymm0,0x2
vpermq ymm8,ymm12,0x2
vpermq ymm9,ymm2,0x2
vpaddq ymm3,ymm3,ymm10
vpaddq ymm4,ymm4,ymm6
vpaddq ymm0,ymm0,ymm7
vpaddq ymm12,ymm12,ymm8
vpaddq ymm2,ymm2,ymm9
; Carry chain: 3 to 4, 0 to 1, 4 to 0 (times 5), 1 to 2, 2 to 3, 0 to 1,
; 3 to 4.
vpsrlq ymm14,ymm3,26
vpand ymm3,ymm3,ymm5
vpaddq ymm4,ymm4,ymm14
vpsrlq ymm11,ymm0,26
vpand ymm0,ymm0,ymm5
vpaddq ymm1,ymm12,ymm11
vpsrlq ymm15,ymm4,26
vpand ymm4,ymm4,ymm5
vpsrlq ymm12,ymm1,26
vpand ymm1,ymm1,ymm5
vpaddq ymm2,ymm2,ymm12
vpaddq ymm0,ymm0,ymm15
vpsllq ymm15,ymm15,2
vpaddq ymm0,ymm0,ymm15
vpsrlq ymm13,ymm2,26
vpand ymm2,ymm2,ymm5
vpaddq ymm3,ymm3,ymm13
vpsrlq ymm11,ymm0,26
vpand ymm0,ymm0,ymm5
vpaddq ymm1,ymm1,ymm11
vpsrlq ymm14,ymm3,26
vpand ymm3,ymm3,ymm5
vpaddq ymm4,ymm4,ymm14
; Store the five limbs to ctx+0..ctx+16 (rdi is still biased by +112).
vmovd DWORD[(-112)+rdi],xmm0
vmovd DWORD[(-108)+rdi],xmm1
vmovd DWORD[(-104)+rdi],xmm2
vmovd DWORD[(-100)+rdi],xmm3
vmovd DWORD[(-96)+rdi],xmm4
; Restore the non-volatile xmm6..xmm15 and the entry value of rsp.
vmovdqa xmm6,XMMWORD[80+r11]
vmovdqa xmm7,XMMWORD[96+r11]
vmovdqa xmm8,XMMWORD[112+r11]
vmovdqa xmm9,XMMWORD[128+r11]
vmovdqa xmm10,XMMWORD[144+r11]
vmovdqa xmm11,XMMWORD[160+r11]
vmovdqa xmm12,XMMWORD[176+r11]
vmovdqa xmm13,XMMWORD[192+r11]
vmovdqa xmm14,XMMWORD[208+r11]
vmovdqa xmm15,XMMWORD[224+r11]
lea rsp,[248+r11]
$L$do_avx2_epilogue_512:
vzeroupper
mov rdi,QWORD[8+rsp] ;WIN64 epilogue
mov rsi,QWORD[16+rsp]
DB 0F3h,0C3h ;repret
; ---------------------------------------------------------------------
; $L$blocks_avx512
; AVX-512 path of poly1305_blocks_avx512, entered from $L$do_avx2_512
; when at least 512 bytes remain. On entry the five 26-bit hash limbs are
; in the low dwords of xmm0..xmm4, rdi = context, rsi = input, rdx =
; remaining length (a multiple of 64) and rsp has its function-entry
; value.
;
; Two fixes relative to the generated code:
;  * The direct exit at the bottom now reloads rdi and rsi from the home
;    slots before returning. Both are non-volatile on Win64 and both are
;    modified by this function; every other exit already reloads them.
;  * $L$SEH_end_poly1305_blocks_avx512 now follows the final return, so
;    the .pdata range of the function contains $L$do_avx512_body and
;    $L$do_avx512_epilogue, the labels its .xdata HandlerData refers to.
; ---------------------------------------------------------------------
$L$blocks_avx512:
; k2 = 0x0f: write mask selecting the low four qwords (32 bytes) of a zmm.
mov eax,15
kmovw k2,eax
; Save the non-volatile xmm6..xmm15 at [r11+80]..[r11+224]; the epilogue
; restores rsp as r11+248.
lea r11,[((-248))+rsp]
sub rsp,0x1c8
vmovdqa XMMWORD[80+r11],xmm6
vmovdqa XMMWORD[96+r11],xmm7
vmovdqa XMMWORD[112+r11],xmm8
vmovdqa XMMWORD[128+r11],xmm9
vmovdqa XMMWORD[144+r11],xmm10
vmovdqa XMMWORD[160+r11],xmm11
vmovdqa XMMWORD[176+r11],xmm12
vmovdqa XMMWORD[192+r11],xmm13
vmovdqa XMMWORD[208+r11],xmm14
vmovdqa XMMWORD[224+r11],xmm15
$L$do_avx512_body:
; rcx = constant pool; rdi is biased by 112 so the nine 16-byte table
; rows at ctx+48.. are addressed as [rdi-64]..[rdi+64].
lea rcx,[$L$const]
lea rdi,[((48+64))+rdi]
; ymm9 = $L$permd_avx2 dword permutation (offset 96 in the pool).
vmovdqa ymm9,YMMWORD[96+rcx]
vmovdqu xmm11,XMMWORD[((-64))+rdi]
and rsp,-512
vmovdqu xmm12,XMMWORD[((-48))+rdi]
; rax = 32 is used as index register for the odd 32-byte stack rows.
mov rax,0x20
vmovdqu xmm7,XMMWORD[((-32))+rdi]
vmovdqu xmm13,XMMWORD[((-16))+rdi]
vmovdqu xmm8,XMMWORD[rdi]
vmovdqu xmm14,XMMWORD[16+rdi]
vmovdqu xmm10,XMMWORD[32+rdi]
vmovdqu xmm15,XMMWORD[48+rdi]
vmovdqu xmm6,XMMWORD[64+rdi]
; Permute the table rows into zmm16..zmm24 and spill their low 32 bytes
; (mask k2) to [rsp+0], [rsp+32], ... [rsp+256], the layout the AVX2 tail
; reads. zmm5 = 26-bit limb mask broadcast from $L$mask26.
vpermd zmm16,zmm9,zmm11
vpbroadcastq zmm5,QWORD[64+rcx]
vpermd zmm17,zmm9,zmm12
vpermd zmm21,zmm9,zmm7
vpermd zmm18,zmm9,zmm13
vmovdqa64 ZMMWORD[rsp]{k2},zmm16
vpsrlq zmm7,zmm16,32
vpermd zmm22,zmm9,zmm8
vmovdqu64 ZMMWORD[rax*1+rsp]{k2},zmm17
vpsrlq zmm8,zmm17,32
vpermd zmm19,zmm9,zmm14
vmovdqa64 ZMMWORD[64+rsp]{k2},zmm21
vpermd zmm23,zmm9,zmm10
vpermd zmm20,zmm9,zmm15
vmovdqu64 ZMMWORD[64+rax*1+rsp]{k2},zmm18
vpermd zmm24,zmm9,zmm6
vmovdqa64 ZMMWORD[128+rsp]{k2},zmm22
vmovdqu64 ZMMWORD[128+rax*1+rsp]{k2},zmm19
vmovdqa64 ZMMWORD[192+rsp]{k2},zmm23
vmovdqu64 ZMMWORD[192+rax*1+rsp]{k2},zmm20
vmovdqa64 ZMMWORD[256+rsp]{k2},zmm24
; Multiply the table rows by their own upper dwords (zmm7, zmm8, zmm9,
; zmm10, zmm6 = rows shifted right by 32). Products accumulate in
; zmm11..zmm15. Presumably this extends the table to the higher key
; powers needed for eight lanes - confirm against __poly1305_init_avx.
vpmuludq zmm11,zmm16,zmm7
vpmuludq zmm12,zmm17,zmm7
vpmuludq zmm13,zmm18,zmm7
vpmuludq zmm14,zmm19,zmm7
vpmuludq zmm15,zmm20,zmm7
vpsrlq zmm9,zmm18,32
vpmuludq zmm25,zmm24,zmm8
vpmuludq zmm26,zmm16,zmm8
vpmuludq zmm27,zmm17,zmm8
vpmuludq zmm28,zmm18,zmm8
vpmuludq zmm29,zmm19,zmm8
vpsrlq zmm10,zmm19,32
vpaddq zmm11,zmm11,zmm25
vpaddq zmm12,zmm12,zmm26
vpaddq zmm13,zmm13,zmm27
vpaddq zmm14,zmm14,zmm28
vpaddq zmm15,zmm15,zmm29
vpmuludq zmm25,zmm23,zmm9
vpmuludq zmm26,zmm24,zmm9
vpmuludq zmm28,zmm17,zmm9
vpmuludq zmm29,zmm18,zmm9
vpmuludq zmm27,zmm16,zmm9
vpsrlq zmm6,zmm20,32
vpaddq zmm11,zmm11,zmm25
vpaddq zmm12,zmm12,zmm26
vpaddq zmm14,zmm14,zmm28
vpaddq zmm15,zmm15,zmm29
vpaddq zmm13,zmm13,zmm27
vpmuludq zmm25,zmm22,zmm10
vpmuludq zmm28,zmm16,zmm10
vpmuludq zmm29,zmm17,zmm10
vpmuludq zmm26,zmm23,zmm10
vpmuludq zmm27,zmm24,zmm10
vpaddq zmm11,zmm11,zmm25
vpaddq zmm14,zmm14,zmm28
vpaddq zmm15,zmm15,zmm29
vpaddq zmm12,zmm12,zmm26
vpaddq zmm13,zmm13,zmm27
vpmuludq zmm28,zmm24,zmm6
vpmuludq zmm29,zmm16,zmm6
vpmuludq zmm25,zmm21,zmm6
vpmuludq zmm26,zmm22,zmm6
vpmuludq zmm27,zmm23,zmm6
vpaddq zmm14,zmm14,zmm28
vpaddq zmm15,zmm15,zmm29
vpaddq zmm11,zmm11,zmm25
vpaddq zmm12,zmm12,zmm26
vpaddq zmm13,zmm13,zmm27
; Load the first 128 input bytes while the products are reduced.
vmovdqu64 zmm10,ZMMWORD[rsi]
vmovdqu64 zmm6,ZMMWORD[64+rsi]
lea rsi,[128+rsi]
; Carry chain over zmm11..zmm15 (limbs 0..4): 3 to 4, 0 to 1, 4 to 0
; (times 5, as x + 4x), 1 to 2, 2 to 3, 0 to 1, 3 to 4.
vpsrlq zmm28,zmm14,26
vpandq zmm14,zmm14,zmm5
vpaddq zmm15,zmm15,zmm28
vpsrlq zmm25,zmm11,26
vpandq zmm11,zmm11,zmm5
vpaddq zmm12,zmm12,zmm25
vpsrlq zmm29,zmm15,26
vpandq zmm15,zmm15,zmm5
vpsrlq zmm26,zmm12,26
vpandq zmm12,zmm12,zmm5
vpaddq zmm13,zmm13,zmm26
vpaddq zmm11,zmm11,zmm29
vpsllq zmm29,zmm29,2
vpaddq zmm11,zmm11,zmm29
vpsrlq zmm27,zmm13,26
vpandq zmm13,zmm13,zmm5
vpaddq zmm14,zmm14,zmm27
vpsrlq zmm25,zmm11,26
vpandq zmm11,zmm11,zmm5
vpaddq zmm12,zmm12,zmm25
vpsrlq zmm28,zmm14,26
vpandq zmm14,zmm14,zmm5
vpaddq zmm15,zmm15,zmm28
; Transpose the input into low halves (zmm7) and high halves (zmm6).
vpunpcklqdq zmm7,zmm10,zmm6
vpunpckhqdq zmm6,zmm10,zmm6
; zmm25 = $L$permd_avx512 (offset 128 in the pool); k1 = 0x7777 is the
; merge mask used to combine the old rows with the new products.
vmovdqa32 zmm25,ZMMWORD[128+rcx]
mov eax,0x7777
kmovw k1,eax
vpermd zmm16,zmm25,zmm16
vpermd zmm17,zmm25,zmm17
vpermd zmm18,zmm25,zmm18
vpermd zmm19,zmm25,zmm19
vpermd zmm20,zmm25,zmm20
vpermd zmm16{k1},zmm25,zmm11
vpermd zmm17{k1},zmm25,zmm12
vpermd zmm18{k1},zmm25,zmm13
vpermd zmm19{k1},zmm25,zmm14
vpermd zmm20{k1},zmm25,zmm15
; zmm21..zmm24 = 5 * zmm17..zmm20 (x shifted left by 2, plus x).
vpslld zmm21,zmm17,2
vpslld zmm22,zmm18,2
vpslld zmm23,zmm19,2
vpslld zmm24,zmm20,2
vpaddd zmm21,zmm21,zmm17
vpaddd zmm22,zmm22,zmm18
vpaddd zmm23,zmm23,zmm19
vpaddd zmm24,zmm24,zmm20
; zmm30 = 2^24 broadcast from $L$129 (offset 32 in the pool), ORed onto
; the top input limb.
vpbroadcastq zmm30,QWORD[32+rcx]
; Split the input into 26-bit limbs: zmm7, zmm8, zmm9, zmm10, zmm6 are
; limbs 0..4 (masking of zmm8 / zmm10 and the OR on zmm6 happen at the
; top of the loop / tail).
vpsrlq zmm9,zmm7,52
vpsllq zmm10,zmm6,12
vporq zmm9,zmm9,zmm10
vpsrlq zmm8,zmm7,26
vpsrlq zmm10,zmm6,14
vpsrlq zmm6,zmm6,40
vpandq zmm9,zmm9,zmm5
vpandq zmm7,zmm7,zmm5
; Limb 2 of the input is accumulated ahead of the other limbs.
vpaddq zmm2,zmm9,zmm2
; 128 bytes are loaded; the tail keeps back up to 64 more for the AVX2
; tail, hence the bias of 192.
sub rdx,192
jbe NEAR $L$tail_avx512
jmp NEAR $L$oop_avx512
ALIGN 32
; AVX-512 main loop, 128 input bytes per iteration.
;   zmm0..zmm4       accumulator limbs 0..4
;   zmm7,8,9,10,6    limbs 0..4 of the input loaded previously
;   zmm16..zmm20     table rows, zmm21..zmm24 their multiples by 5
;   zmm11..zmm15     64-bit partial products
$L$oop_avx512:
vpmuludq zmm14,zmm17,zmm2
vpaddq zmm0,zmm7,zmm0
vpmuludq zmm15,zmm18,zmm2
vpandq zmm8,zmm8,zmm5
vpmuludq zmm11,zmm23,zmm2
vpandq zmm10,zmm10,zmm5
vpmuludq zmm12,zmm24,zmm2
vporq zmm6,zmm6,zmm30
vpmuludq zmm13,zmm16,zmm2
vpaddq zmm1,zmm8,zmm1
vpaddq zmm3,zmm10,zmm3
vpaddq zmm4,zmm6,zmm4
; Load the next 128 input bytes.
vmovdqu64 zmm10,ZMMWORD[rsi]
vmovdqu64 zmm6,ZMMWORD[64+rsi]
lea rsi,[128+rsi]
vpmuludq zmm28,zmm19,zmm0
vpmuludq zmm29,zmm20,zmm0
vpmuludq zmm25,zmm16,zmm0
vpmuludq zmm26,zmm17,zmm0
vpaddq zmm14,zmm14,zmm28
vpaddq zmm15,zmm15,zmm29
vpaddq zmm11,zmm11,zmm25
vpaddq zmm12,zmm12,zmm26
vpmuludq zmm28,zmm18,zmm1
vpmuludq zmm29,zmm19,zmm1
vpmuludq zmm25,zmm24,zmm1
vpmuludq zmm27,zmm18,zmm0
vpaddq zmm14,zmm14,zmm28
vpaddq zmm15,zmm15,zmm29
vpaddq zmm11,zmm11,zmm25
vpaddq zmm13,zmm13,zmm27
vpunpcklqdq zmm7,zmm10,zmm6
vpunpckhqdq zmm6,zmm10,zmm6
vpmuludq zmm28,zmm16,zmm3
vpmuludq zmm29,zmm17,zmm3
vpmuludq zmm26,zmm16,zmm1
vpmuludq zmm27,zmm17,zmm1
vpaddq zmm14,zmm14,zmm28
vpaddq zmm15,zmm15,zmm29
vpaddq zmm12,zmm12,zmm26
vpaddq zmm13,zmm13,zmm27
vpmuludq zmm28,zmm24,zmm4
vpmuludq zmm29,zmm16,zmm4
vpmuludq zmm25,zmm22,zmm3
vpmuludq zmm26,zmm23,zmm3
vpaddq zmm14,zmm14,zmm28
vpmuludq zmm27,zmm24,zmm3
vpaddq zmm15,zmm15,zmm29
vpaddq zmm11,zmm11,zmm25
vpaddq zmm12,zmm12,zmm26
vpaddq zmm13,zmm13,zmm27
vpmuludq zmm25,zmm21,zmm4
vpmuludq zmm26,zmm22,zmm4
vpmuludq zmm27,zmm23,zmm4
vpaddq zmm0,zmm11,zmm25
vpaddq zmm1,zmm12,zmm26
vpaddq zmm2,zmm13,zmm27
; Carry chain interleaved with splitting the freshly loaded input.
vpsrlq zmm9,zmm7,52
vpsllq zmm10,zmm6,12
vpsrlq zmm3,zmm14,26
vpandq zmm14,zmm14,zmm5
vpaddq zmm4,zmm15,zmm3
vporq zmm9,zmm9,zmm10
vpsrlq zmm11,zmm0,26
vpandq zmm0,zmm0,zmm5
vpaddq zmm1,zmm1,zmm11
vpandq zmm9,zmm9,zmm5
vpsrlq zmm15,zmm4,26
vpandq zmm4,zmm4,zmm5
vpsrlq zmm12,zmm1,26
vpandq zmm1,zmm1,zmm5
vpaddq zmm2,zmm2,zmm12
vpaddq zmm0,zmm0,zmm15
vpsllq zmm15,zmm15,2
vpaddq zmm0,zmm0,zmm15
; Pre-add limb 2 of the next input.
vpaddq zmm2,zmm2,zmm9
vpsrlq zmm8,zmm7,26
vpsrlq zmm13,zmm2,26
vpandq zmm2,zmm2,zmm5
vpaddq zmm3,zmm14,zmm13
vpsrlq zmm10,zmm6,14
vpsrlq zmm11,zmm0,26
vpandq zmm0,zmm0,zmm5
vpaddq zmm1,zmm1,zmm11
vpsrlq zmm6,zmm6,40
vpsrlq zmm14,zmm3,26
vpandq zmm3,zmm3,zmm5
vpaddq zmm4,zmm4,zmm14
vpandq zmm7,zmm7,zmm5
sub rdx,128
ja NEAR $L$oop_avx512
; AVX-512 tail: final multiplication using the other dword of every table
; entry, then reduction of the eight lanes to one.
$L$tail_avx512:
; Shift the table rows right by 32 so vpmuludq picks up their upper
; dwords.
vpsrlq zmm16,zmm16,32
vpsrlq zmm17,zmm17,32
vpsrlq zmm18,zmm18,32
vpsrlq zmm23,zmm23,32
vpsrlq zmm24,zmm24,32
vpsrlq zmm19,zmm19,32
vpsrlq zmm20,zmm20,32
vpsrlq zmm21,zmm21,32
vpsrlq zmm22,zmm22,32
; rdx is 0 or -64 here; step rsi back accordingly so the 64-byte load
; below stays on the last 64 bytes of the input.
lea rsi,[rdx*1+rsi]
vpaddq zmm0,zmm7,zmm0
vpmuludq zmm14,zmm17,zmm2
vpmuludq zmm15,zmm18,zmm2
vpmuludq zmm11,zmm23,zmm2
vpandq zmm8,zmm8,zmm5
vpmuludq zmm12,zmm24,zmm2
vpandq zmm10,zmm10,zmm5
vpmuludq zmm13,zmm16,zmm2
vporq zmm6,zmm6,zmm30
vpaddq zmm1,zmm8,zmm1
vpaddq zmm3,zmm10,zmm3
vpaddq zmm4,zmm6,zmm4
; Start loading 64 bytes for a possible AVX2 tail pass.
vmovdqu xmm7,XMMWORD[rsi]
vpmuludq zmm28,zmm19,zmm0
vpmuludq zmm29,zmm20,zmm0
vpmuludq zmm25,zmm16,zmm0
vpmuludq zmm26,zmm17,zmm0
vpaddq zmm14,zmm14,zmm28
vpaddq zmm15,zmm15,zmm29
vpaddq zmm11,zmm11,zmm25
vpaddq zmm12,zmm12,zmm26
vmovdqu xmm8,XMMWORD[16+rsi]
vpmuludq zmm28,zmm18,zmm1
vpmuludq zmm29,zmm19,zmm1
vpmuludq zmm25,zmm24,zmm1
vpmuludq zmm27,zmm18,zmm0
vpaddq zmm14,zmm14,zmm28
vpaddq zmm15,zmm15,zmm29
vpaddq zmm11,zmm11,zmm25
vpaddq zmm13,zmm13,zmm27
vinserti128 ymm7,ymm7,XMMWORD[32+rsi],1
vpmuludq zmm28,zmm16,zmm3
vpmuludq zmm29,zmm17,zmm3
vpmuludq zmm26,zmm16,zmm1
vpmuludq zmm27,zmm17,zmm1
vpaddq zmm14,zmm14,zmm28
vpaddq zmm15,zmm15,zmm29
vpaddq zmm12,zmm12,zmm26
vpaddq zmm13,zmm13,zmm27
vinserti128 ymm8,ymm8,XMMWORD[48+rsi],1
vpmuludq zmm28,zmm24,zmm4
vpmuludq zmm29,zmm16,zmm4
vpmuludq zmm25,zmm22,zmm3
vpmuludq zmm26,zmm23,zmm3
vpmuludq zmm27,zmm24,zmm3
vpaddq zmm3,zmm14,zmm28
vpaddq zmm15,zmm15,zmm29
vpaddq zmm11,zmm11,zmm25
vpaddq zmm12,zmm12,zmm26
vpaddq zmm13,zmm13,zmm27
vpmuludq zmm25,zmm21,zmm4
vpmuludq zmm26,zmm22,zmm4
vpmuludq zmm27,zmm23,zmm4
vpaddq zmm0,zmm11,zmm25
vpaddq zmm1,zmm12,zmm26
vpaddq zmm2,zmm13,zmm27
; Horizontal sum over the eight qword lanes. eax = 1 becomes k3, the
; mask that keeps only qword 0 in the last step.
mov eax,1
; Step 1: add each qword to its neighbour (0xb1 swaps adjacent qwords).
vpermq zmm14,zmm3,0xb1
vpermq zmm4,zmm15,0xb1
vpermq zmm11,zmm0,0xb1
vpermq zmm12,zmm1,0xb1
vpermq zmm13,zmm2,0xb1
vpaddq zmm3,zmm3,zmm14
vpaddq zmm4,zmm4,zmm15
vpaddq zmm0,zmm0,zmm11
vpaddq zmm1,zmm1,zmm12
vpaddq zmm2,zmm2,zmm13
kmovw k3,eax
; Step 2: add qword 2 of each 256-bit half onto qword 0.
vpermq zmm14,zmm3,0x2
vpermq zmm15,zmm4,0x2
vpermq zmm11,zmm0,0x2
vpermq zmm12,zmm1,0x2
vpermq zmm13,zmm2,0x2
vpaddq zmm3,zmm3,zmm14
vpaddq zmm4,zmm4,zmm15
vpaddq zmm0,zmm0,zmm11
vpaddq zmm1,zmm1,zmm12
vpaddq zmm2,zmm2,zmm13
; Step 3: add the upper 256 bits onto the lower; {k3}{z} zeroes every
; lane except qword 0.
vextracti64x4 ymm14,zmm3,0x1
vextracti64x4 ymm15,zmm4,0x1
vextracti64x4 ymm11,zmm0,0x1
vextracti64x4 ymm12,zmm1,0x1
vextracti64x4 ymm13,zmm2,0x1
vpaddq zmm3{k3}{z},zmm3,zmm14
vpaddq zmm4{k3}{z},zmm4,zmm15
vpaddq zmm0{k3}{z},zmm0,zmm11
vpaddq zmm1{k3}{z},zmm1,zmm12
vpaddq zmm2{k3}{z},zmm2,zmm13
; Final carry chain, interleaved with splitting the 64 bytes loaded above
; into limbs ymm7, ymm8, ymm9, ymm10, ymm6 exactly as the AVX2 code does.
vpsrlq ymm14,ymm3,26
vpand ymm3,ymm3,ymm5
vpsrldq ymm9,ymm7,6
vpsrldq ymm10,ymm8,6
vpunpckhqdq ymm6,ymm7,ymm8
vpaddq ymm4,ymm4,ymm14
vpsrlq ymm11,ymm0,26
vpand ymm0,ymm0,ymm5
vpunpcklqdq ymm9,ymm9,ymm10
vpunpcklqdq ymm7,ymm7,ymm8
vpaddq ymm1,ymm1,ymm11
vpsrlq ymm15,ymm4,26
vpand ymm4,ymm4,ymm5
vpsrlq ymm12,ymm1,26
vpand ymm1,ymm1,ymm5
vpsrlq ymm10,ymm9,30
vpsrlq ymm9,ymm9,4
vpaddq ymm2,ymm2,ymm12
vpaddq ymm0,ymm0,ymm15
vpsllq ymm15,ymm15,2
vpsrlq ymm8,ymm7,26
vpsrlq ymm6,ymm6,40
vpaddq ymm0,ymm0,ymm15
vpsrlq ymm13,ymm2,26
vpand ymm2,ymm2,ymm5
vpand ymm9,ymm9,ymm5
vpand ymm7,ymm7,ymm5
vpaddq ymm3,ymm3,ymm13
vpsrlq ymm11,ymm0,26
vpand ymm0,ymm0,ymm5
; Pre-add limb 2 of the pending input, as the AVX2 tail expects.
vpaddq ymm2,ymm9,ymm2
vpand ymm8,ymm8,ymm5
vpaddq ymm1,ymm1,ymm11
vpsrlq ymm14,ymm3,26
vpand ymm3,ymm3,ymm5
vpand ymm10,ymm10,ymm5
vpor ymm6,ymm6,YMMWORD[32+rcx]
vpaddq ymm4,ymm4,ymm14
; rax = rsp+144: base register the AVX2 tail uses for the stack table.
lea rax,[144+rsp]
; 64 bytes still pending: let the AVX2 tail consume them and return.
add rdx,64
jnz NEAR $L$tail_avx2_512
; Nothing pending: undo the pre-added limb 2 and store the five limbs to
; ctx+0..ctx+16 (rdi is still biased by +112).
vpsubq ymm2,ymm2,ymm9
vmovd DWORD[(-112)+rdi],xmm0
vmovd DWORD[(-108)+rdi],xmm1
vmovd DWORD[(-104)+rdi],xmm2
vmovd DWORD[(-100)+rdi],xmm3
vmovd DWORD[(-96)+rdi],xmm4
; Clear all vector registers, then restore the non-volatile xmm6..xmm15
; and the entry value of rsp.
vzeroall
movdqa xmm6,XMMWORD[80+r11]
movdqa xmm7,XMMWORD[96+r11]
movdqa xmm8,XMMWORD[112+r11]
movdqa xmm9,XMMWORD[128+r11]
movdqa xmm10,XMMWORD[144+r11]
movdqa xmm11,XMMWORD[160+r11]
movdqa xmm12,XMMWORD[176+r11]
movdqa xmm13,XMMWORD[192+r11]
movdqa xmm14,XMMWORD[208+r11]
movdqa xmm15,XMMWORD[224+r11]
lea rsp,[248+r11]
$L$do_avx512_epilogue:
; rsp is back at its entry value, so the home slots written by the
; prologue are at [rsp+8] / [rsp+16] again.
mov rdi,QWORD[8+rsp] ;WIN64 epilogue
mov rsi,QWORD[16+rsp]
DB 0F3h,0C3h ;repret
$L$SEH_end_poly1305_blocks_avx512:
EXTERN __imp_RtlVirtualUnwind
ALIGN 16
; ---------------------------------------------------------------------
; se_handler
; Win64 language-specific exception handler for functions that only push
; the six GPRs rbx, rbp, r12..r15. Called with rcx = ExceptionRecord,
; rdx = EstablisherFrame, r8 = CONTEXT, r9 = DISPATCHER_CONTEXT.
; HandlerData holds two image-relative addresses: the start of the body
; (after the pushes) and the start of the epilogue (after the pops).
; CONTEXT / DISPATCHER_CONTEXT offsets below follow the winnt.h layout.
; Shares its tail, $L$common_seh_tail, with avx_handler.
; ---------------------------------------------------------------------
se_handler:
push rsi
push rdi
push rbx
push rbp
push r12
push r13
push r14
push r15
pushfq
; 64 bytes: 32 of shadow space plus four stack arguments for the
; RtlVirtualUnwind call in the common tail.
sub rsp,64
; rax = context->Rax, rbx = context->Rip.
mov rax,QWORD[120+r8]
mov rbx,QWORD[248+r8]
; rsi = disp->ImageBase, r11 = disp->HandlerData.
mov rsi,QWORD[8+r9]
mov r11,QWORD[56+r9]
; Rip before HandlerData[0]: still in the prologue, where rax holds the
; entry rsp and nothing has been pushed yet.
mov r10d,DWORD[r11]
lea r10,[r10*1+rsi]
cmp rbx,r10
jb NEAR $L$common_seh_tail
; rax = context->Rsp. Rip at or past HandlerData[1]: in the epilogue,
; where the pushed registers are already restored.
mov rax,QWORD[152+r8]
mov r10d,DWORD[4+r11]
lea r10,[r10*1+rsi]
cmp rbx,r10
jae NEAR $L$common_seh_tail
; In the body: the six saved GPRs sit at [rsp]..[rsp+40]. Step rax past
; them and copy the saved values into the context.
lea rax,[48+rax]
mov rbx,QWORD[((-8))+rax]
mov rbp,QWORD[((-16))+rax]
mov r12,QWORD[((-24))+rax]
mov r13,QWORD[((-32))+rax]
mov r14,QWORD[((-40))+rax]
mov r15,QWORD[((-48))+rax]
; context->Rbx, Rbp, R12, R13, R14, R15.
mov QWORD[144+r8],rbx
mov QWORD[160+r8],rbp
mov QWORD[216+r8],r12
mov QWORD[224+r8],r13
mov QWORD[232+r8],r14
mov QWORD[240+r8],r15
jmp NEAR $L$common_seh_tail
ALIGN 16
; ---------------------------------------------------------------------
; avx_handler
; Win64 language-specific exception handler for the vector code paths,
; which save xmm6..xmm15 relative to r11 and restore rsp as r11+248.
; Called with rcx = ExceptionRecord, rdx = EstablisherFrame,
; r8 = CONTEXT, r9 = DISPATCHER_CONTEXT. HandlerData holds the
; image-relative addresses of the body start and the epilogue start.
; CONTEXT / DISPATCHER_CONTEXT offsets below follow the winnt.h layout.
; ---------------------------------------------------------------------
avx_handler:
push rsi
push rdi
push rbx
push rbp
push r12
push r13
push r14
push r15
pushfq
; 64 bytes: 32 of shadow space plus four stack arguments for the
; RtlVirtualUnwind call below.
sub rsp,64
; rax = context->Rax, rbx = context->Rip.
mov rax,QWORD[120+r8]
mov rbx,QWORD[248+r8]
; rsi = disp->ImageBase, r11 = disp->HandlerData.
mov rsi,QWORD[8+r9]
mov r11,QWORD[56+r9]
; Rip before HandlerData[0]: treat rax as the entry rsp.
mov r10d,DWORD[r11]
lea r10,[r10*1+rsi]
cmp rbx,r10
jb NEAR $L$common_seh_tail
; rax = context->Rsp. Rip at or past HandlerData[1]: rsp is already back
; at its entry value.
mov rax,QWORD[152+r8]
mov r10d,DWORD[4+r11]
lea r10,[r10*1+rsi]
cmp rbx,r10
jae NEAR $L$common_seh_tail
; In the body: rax = context->R11, the anchor of the xmm save area.
; Copy 20 qwords (xmm6..xmm15) from [r11+80] to context->Xmm6 and set rax
; to the entry rsp, r11+248.
mov rax,QWORD[208+r8]
lea rsi,[80+rax]
lea rax,[248+rax]
lea rdi,[512+r8]
mov ecx,20
; Encoded bytes fc f3 48 a5: cld; rep movsq.
DD 0xa548f3fc
$L$common_seh_tail:
; rax = rsp of the interrupted function at its entry. Its prologue stored
; rdi / rsi in the home slots at [rax+8] / [rax+16].
mov rdi,QWORD[8+rax]
mov rsi,QWORD[16+rax]
; context->Rsp, context->Rsi, context->Rdi.
mov QWORD[152+r8],rax
mov QWORD[168+r8],rsi
mov QWORD[176+r8],rdi
; Copy the patched CONTEXT (154 qwords) to disp->ContextRecord.
mov rdi,QWORD[40+r9]
mov rsi,r8
mov ecx,154
; Encoded bytes fc f3 48 a5: cld; rep movsq.
DD 0xa548f3fc
; Call RtlVirtualUnwind with:
;   rcx = 0 (handler type), rdx = disp->ImageBase, r8 = disp->ControlPc,
;   r9 = disp->FunctionEntry, [rsp+32] = disp->ContextRecord,
;   [rsp+40] = address of disp->HandlerData,
;   [rsp+48] = address of disp->EstablisherFrame, [rsp+56] = 0.
mov rsi,r9
xor rcx,rcx
mov rdx,QWORD[8+rsi]
mov r8,QWORD[rsi]
mov r9,QWORD[16+rsi]
mov r10,QWORD[40+rsi]
lea r11,[56+rsi]
lea r12,[24+rsi]
mov QWORD[32+rsp],r10
mov QWORD[40+rsp],r11
mov QWORD[48+rsp],r12
mov QWORD[56+rsp],rcx
call QWORD[__imp_RtlVirtualUnwind]
; Return 1 (ExceptionContinueSearch) after restoring the handler's own
; saved registers.
mov eax,1
add rsp,64
popfq
pop r15
pop r14
pop r13
pop r12
pop rbp
pop rbx
pop rdi
pop rsi
DB 0F3h,0C3h ;repret
; ---------------------------------------------------------------------
; .pdata: Win64 function table. Every entry is three image-relative
; addresses: start of the covered code range, end of the range, and the
; unwind information record in .xdata.
; ---------------------------------------------------------------------
section .pdata rdata align=4
ALIGN 4
; poly1305_init_x86_64
DD $L$SEH_begin_poly1305_init_x86_64 wrt ..imagebase
DD $L$SEH_end_poly1305_init_x86_64 wrt ..imagebase
DD $L$SEH_info_poly1305_init wrt ..imagebase
; poly1305_blocks_x86_64
DD $L$SEH_begin_poly1305_blocks_x86_64 wrt ..imagebase
DD $L$SEH_end_poly1305_blocks_x86_64 wrt ..imagebase
DD $L$SEH_info_poly1305_blocks wrt ..imagebase
; poly1305_emit_x86_64
DD $L$SEH_begin_poly1305_emit_x86_64 wrt ..imagebase
DD $L$SEH_end_poly1305_emit_x86_64 wrt ..imagebase
DD $L$SEH_info_poly1305_emit wrt ..imagebase
; poly1305_blocks_avx, covered by three consecutive ranges, each with its
; own unwind record.
DD $L$SEH_begin_poly1305_blocks_avx wrt ..imagebase
DD $L$base2_64_avx wrt ..imagebase
DD $L$SEH_info_poly1305_blocks_avx_1 wrt ..imagebase
DD $L$base2_64_avx wrt ..imagebase
DD $L$even_avx wrt ..imagebase
DD $L$SEH_info_poly1305_blocks_avx_2 wrt ..imagebase
DD $L$even_avx wrt ..imagebase
DD $L$SEH_end_poly1305_blocks_avx wrt ..imagebase
DD $L$SEH_info_poly1305_blocks_avx_3 wrt ..imagebase
; poly1305_emit_avx
DD $L$SEH_begin_poly1305_emit_avx wrt ..imagebase
DD $L$SEH_end_poly1305_emit_avx wrt ..imagebase
DD $L$SEH_info_poly1305_emit_avx wrt ..imagebase
; poly1305_blocks_avx2, covered by three consecutive ranges.
DD $L$SEH_begin_poly1305_blocks_avx2 wrt ..imagebase
DD $L$base2_64_avx2 wrt ..imagebase
DD $L$SEH_info_poly1305_blocks_avx2_1 wrt ..imagebase
DD $L$base2_64_avx2 wrt ..imagebase
DD $L$even_avx2 wrt ..imagebase
DD $L$SEH_info_poly1305_blocks_avx2_2 wrt ..imagebase
DD $L$even_avx2 wrt ..imagebase
DD $L$SEH_end_poly1305_blocks_avx2 wrt ..imagebase
DD $L$SEH_info_poly1305_blocks_avx2_3 wrt ..imagebase
; poly1305_blocks_avx512, covered by a single range.
DD $L$SEH_begin_poly1305_blocks_avx512 wrt ..imagebase
DD $L$SEH_end_poly1305_blocks_avx512 wrt ..imagebase
DD $L$SEH_info_poly1305_blocks_avx512 wrt ..imagebase
; ---------------------------------------------------------------------
; .xdata: unwind information records referenced from .pdata. Each record
; is:
;   DB 9,0,0,0   header: first byte 9 = version 1 with the exception
;                handler flag set; prologue size, unwind code count and
;                frame register are all 0
;   DD handler   image-relative address of se_handler or avx_handler
;   DD a, b      HandlerData: the two image-relative addresses the
;                handler compares the faulting Rip against (start of
;                body, start of epilogue)
; Records whose two HandlerData entries are the same label leave no
; address between them, so the handler never takes its in-body path.
; ---------------------------------------------------------------------
section .xdata rdata align=8
ALIGN 8
$L$SEH_info_poly1305_init:
DB 9,0,0,0
DD se_handler wrt ..imagebase
DD $L$SEH_begin_poly1305_init_x86_64 wrt ..imagebase,$L$SEH_begin_poly1305_init_x86_64 wrt ..imagebase
$L$SEH_info_poly1305_blocks:
DB 9,0,0,0
DD se_handler wrt ..imagebase
DD $L$blocks_body wrt ..imagebase,$L$blocks_epilogue wrt ..imagebase
$L$SEH_info_poly1305_emit:
DB 9,0,0,0
DD se_handler wrt ..imagebase
DD $L$SEH_begin_poly1305_emit_x86_64 wrt ..imagebase,$L$SEH_begin_poly1305_emit_x86_64 wrt ..imagebase
$L$SEH_info_poly1305_blocks_avx_1:
DB 9,0,0,0
DD se_handler wrt ..imagebase
DD $L$blocks_avx_body wrt ..imagebase,$L$blocks_avx_epilogue wrt ..imagebase
$L$SEH_info_poly1305_blocks_avx_2:
DB 9,0,0,0
DD se_handler wrt ..imagebase
DD $L$base2_64_avx_body wrt ..imagebase,$L$base2_64_avx_epilogue wrt ..imagebase
$L$SEH_info_poly1305_blocks_avx_3:
DB 9,0,0,0
DD avx_handler wrt ..imagebase
DD $L$do_avx_body wrt ..imagebase,$L$do_avx_epilogue wrt ..imagebase
$L$SEH_info_poly1305_emit_avx:
DB 9,0,0,0
DD se_handler wrt ..imagebase
DD $L$SEH_begin_poly1305_emit_avx wrt ..imagebase,$L$SEH_begin_poly1305_emit_avx wrt ..imagebase
$L$SEH_info_poly1305_blocks_avx2_1:
DB 9,0,0,0
DD se_handler wrt ..imagebase
DD $L$blocks_avx2_body wrt ..imagebase,$L$blocks_avx2_epilogue wrt ..imagebase
$L$SEH_info_poly1305_blocks_avx2_2:
DB 9,0,0,0
DD se_handler wrt ..imagebase
DD $L$base2_64_avx2_body wrt ..imagebase,$L$base2_64_avx2_epilogue wrt ..imagebase
$L$SEH_info_poly1305_blocks_avx2_3:
DB 9,0,0,0
DD avx_handler wrt ..imagebase
DD $L$do_avx2_body wrt ..imagebase,$L$do_avx2_epilogue wrt ..imagebase
$L$SEH_info_poly1305_blocks_avx512:
DB 9,0,0,0
DD avx_handler wrt ..imagebase
DD $L$do_avx512_body wrt ..imagebase,$L$do_avx512_epilogue wrt ..imagebase