; File-viewer metadata (not part of the program): NASM source, 3487 lines, 61 KiB.
; Poly1305 for x86-64 -- NASM syntax, Win64 calling convention.
; `default rel` makes bare [label] memory references RIP-relative.
default rel

; Memory operands in this file are written XMMWORD[...], YMMWORD[...] and
; ZMMWORD[...].  The tags are defined as empty so they expand to nothing and
; the operand size is taken from the register operand.
%define XMMWORD
%define YMMWORD
%define ZMMWORD

; Constant pool for the vector code.  The AVX/AVX2 routines load the address
; of $L$const into rcx and reach the first four tables by fixed byte offset:
;     [rcx]    = $L$mask24        [32+rcx] = $L$129
;     [64+rcx] = $L$mask26        [96+rcx] = $L$permd_avx2
; so these four tables must keep their order and their 32-byte size.
; NOTE(review): no `section` directive is visible ahead of this pool in this
; view, so it is emitted into whichever section is current here -- confirm.
ALIGN 64
$L$const:
$L$mask24:                              ; 2^24-1 in every 64-bit lane
        DD 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
$L$129:                                 ; 1<<24 per lane: OR-ed into limb 4 of each
                                        ; input block, i.e. bit 128 of the block
        DD 16777216,0,16777216,0,16777216,0,16777216,0
$L$mask26:                              ; 2^26-1 in every 64-bit lane (limb mask)
        DD 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
$L$permd_avx2:                          ; vpermd index vector used by the AVX2 code
        DD 2,2,2,3,2,0,2,1
$L$permd_avx512:                        ; not referenced in the visible part of the
                                        ; file; presumably used by the AVX-512 code
        DD 0,0,0,1,0,2,0,3,0,4,0,5,0,6,0,7

; The $L$2_44_* and $L$x_mask* tables are not referenced in the visible part
; of the file; presumably they serve a base-2^44 code path further down.
$L$2_44_inp_permd:
        DD 0,1,1,2,2,3,7,7
$L$2_44_inp_shift:
        DQ 0,12,24,64
$L$2_44_mask:
        DQ 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
$L$2_44_shift_rgt:
        DQ 44,44,42,64
$L$2_44_shift_lft:
        DQ 8,8,10,64

ALIGN 64
$L$x_mask44:                            ; 2^44-1, eight lanes
        DQ 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
        DQ 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
$L$x_mask42:                            ; 2^42-1, eight lanes
        DQ 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
        DQ 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff

section .text code align=64

; Exported entry points.
global poly1305_init_x86_64
global poly1305_blocks_x86_64
global poly1305_emit_x86_64
global poly1305_emit_avx
global poly1305_blocks_avx
global poly1305_blocks_avx2
global poly1305_blocks_avx512

;-----------------------------------------------------------------------
; poly1305_init_x86_64
; ABI:   Win64 on entry; arguments are shuffled into rdi/rsi/rdx and the
;        caller's rdi/rsi are parked in the caller-provided home space.
; In:    rcx = context pointer
;        rdx = pointer to 16 key bytes, or NULL
;        r8  = copied to rdx but not read by this routine
; Out:   eax = 1 if a key was installed, 0 if the key pointer was NULL
; Does:  zeroes the 24-byte accumulator at ctx+0..+23; if a key is given,
;        stores the masked ("clamped") key halves at ctx+24 and ctx+32.
; Clobb: rax, rcx, rdx, flags
;-----------------------------------------------------------------------
ALIGN 32
poly1305_init_x86_64:
        mov QWORD[8+rsp],rdi            ;WIN64 prologue
        mov QWORD[16+rsp],rsi
        mov rax,rsp
$L$SEH_begin_poly1305_init_x86_64:
        mov rdi,rcx
        mov rsi,rdx
        mov rdx,r8

        xor eax,eax                     ; rax = 0; doubles as the "no key" return value
        mov QWORD[rdi],rax              ; accumulator h0
        mov QWORD[8+rdi],rax            ; accumulator h1
        mov QWORD[16+rdi],rax           ; accumulator h2

        test rsi,rsi                    ; NULL key -> only reset the accumulator
        je NEAR $L$no_key

        ; Masks clear the top 4 bits of every 32-bit word and, for all words
        ; but the first, the low 2 bits as well.
        mov rax,0x0ffffffc0fffffff
        mov rcx,0x0ffffffc0ffffffc
        and rax,QWORD[rsi]
        and rcx,QWORD[8+rsi]
        mov QWORD[24+rdi],rax           ; r0
        mov QWORD[32+rdi],rcx           ; r1
        mov eax,1
$L$no_key:
        mov rdi,QWORD[8+rsp]            ;WIN64 epilogue
        mov rsi,QWORD[16+rsp]
        DB 0F3h,0C3h                    ;repret
$L$SEH_end_poly1305_init_x86_64:

;-----------------------------------------------------------------------
; poly1305_blocks_x86_64
; ABI:   Win64 on entry (rcx, rdx, r8, r9), shuffled into rdi/rsi/rdx/rcx.
; In:    rcx = context, rdx = input pointer, r8 = length in bytes,
;        r9  = value added (with carry) into the top accumulator limb for
;              every block (the "pad bit")
; Does:  for each whole 16-byte block: h = (h + block + padbit*2^128) * r,
;        partially reduced; the accumulator lives at ctx+0/+8/+16 and r at
;        ctx+24/+32.  A trailing partial block is ignored (len >> 4).
; Note:  $L$blocks is also a jump target for poly1305_blocks_avx and
;        poly1305_blocks_avx2, which arrive with rdi/rsi/rdx/rcx already
;        set up and the Win64 prologue already executed.
; Regs:  r14:rbx:rbp = h0:h1:h2   r11 = r0   r12 = r1
;        r13 = s1 = r1 + (r1 >> 2)   r15 = blocks remaining
;        r8, r9, r10, rax, rdx = multiplication scratch
;-----------------------------------------------------------------------
ALIGN 32
poly1305_blocks_x86_64:
        mov QWORD[8+rsp],rdi            ;WIN64 prologue
        mov QWORD[16+rsp],rsi
        mov rax,rsp
$L$SEH_begin_poly1305_blocks_x86_64:
        mov rdi,rcx
        mov rsi,rdx
        mov rdx,r8
        mov rcx,r9

$L$blocks:
        shr rdx,4                       ; bytes -> number of 16-byte blocks
        jz NEAR $L$no_data

        push rbx
        push rbp
        push r12
        push r13
        push r14
        push r15
$L$blocks_body:

        mov r15,rdx                     ; rdx is clobbered by mul below

        mov r11,QWORD[24+rdi]           ; r0
        mov r13,QWORD[32+rdi]           ; r1

        mov r14,QWORD[rdi]              ; h0
        mov rbx,QWORD[8+rdi]            ; h1
        mov rbp,QWORD[16+rdi]           ; h2

        mov r12,r13
        shr r13,2
        mov rax,r12                     ; rax = r1 for the first mul in the loop
        add r13,r12                     ; s1 = r1 + (r1 >> 2)
        jmp NEAR $L$oop

ALIGN 32
$L$oop:
        add r14,QWORD[rsi]              ; h += next 16 input bytes
        adc rbx,QWORD[8+rsi]
        lea rsi,[16+rsi]                ; lea: advance without touching CF
        adc rbp,rcx                     ; + padbit in the top limb
        mul r14                         ; h0*r1
        mov r9,rax
        mov rax,r11
        mov r10,rdx

        mul r14                         ; h0*r0
        mov r14,rax                     ; becomes the new h0
        mov rax,r11
        mov r8,rdx

        mul rbx                         ; h1*r0
        add r9,rax
        mov rax,r13
        adc r10,rdx

        mul rbx                         ; h1*s1
        mov rbx,rbp                     ; rbx temporarily holds h2
        add r14,rax
        adc r8,rdx

        imul rbx,r13                    ; h2*s1
        add r9,rbx
        mov rbx,r8
        adc r10,0

        imul rbp,r11                    ; h2*r0
        add rbx,r9                      ; new h1
        mov rax,-4
        adc r10,rbp                     ; r10 = top limb before reduction

        ; Fold everything above bit 1 of the top limb back into h0:
        ; (r10 & ~3) + (r10 >> 2) = 5 * (r10 >> 2).
        and rax,r10
        mov rbp,r10
        shr r10,2
        and rbp,3
        add rax,r10
        add r14,rax
        adc rbx,0
        adc rbp,0
        mov rax,r12                     ; reload r1 for the next iteration
        dec r15
        jnz NEAR $L$oop

        mov QWORD[rdi],r14              ; store the accumulator back
        mov QWORD[8+rdi],rbx
        mov QWORD[16+rdi],rbp

        mov r15,QWORD[rsp]              ; restore callee-saved registers
        mov r14,QWORD[8+rsp]
        mov r13,QWORD[16+rsp]
        mov r12,QWORD[24+rsp]
        mov rbp,QWORD[32+rsp]
        mov rbx,QWORD[40+rsp]
        lea rsp,[48+rsp]

$L$no_data:
$L$blocks_epilogue:
        mov rdi,QWORD[8+rsp]            ;WIN64 epilogue
        mov rsi,QWORD[16+rsp]
        DB 0F3h,0C3h                    ;repret

$L$SEH_end_poly1305_blocks_x86_64:

;-----------------------------------------------------------------------
; poly1305_emit_x86_64
; ABI:   Win64 on entry (rcx, rdx, r8), shuffled into rdi/rsi/rdx.
; In:    rcx = context (accumulator at +0/+8/+16)
;        rdx = output pointer, receives 16 bytes
;        r8  = pointer to a 16-byte value added to the result
; Does:  computes h+5; if that carries past bit 129 the low 128 bits of
;        h+5 are selected, otherwise the low 128 bits of h.  The 16-byte
;        value at r8 is then added modulo 2^128 and the sum stored.
; Note:  $L$emit is also the jump target used by poly1305_emit_avx when
;        the DWORD at ctx+20 is zero.
; Clobb: rax, rcx, r8, r9, r10, flags
;-----------------------------------------------------------------------
ALIGN 32
poly1305_emit_x86_64:
        mov QWORD[8+rsp],rdi            ;WIN64 prologue
        mov QWORD[16+rsp],rsi
        mov rax,rsp
$L$SEH_begin_poly1305_emit_x86_64:
        mov rdi,rcx
        mov rsi,rdx
        mov rdx,r8

$L$emit:
        mov r8,QWORD[rdi]               ; h0
        mov r9,QWORD[8+rdi]             ; h1
        mov r10,QWORD[16+rdi]           ; h2

        mov rax,r8                      ; keep h0:h1 in rax:rcx
        add r8,5                        ; r8:r9:r10 = h + 5
        mov rcx,r9
        adc r9,0
        adc r10,0
        shr r10,2                       ; non-zero iff h + 5 reached 2^130
        cmovnz rax,r8                   ; then take the low 128 bits of h + 5
        cmovnz rcx,r9

        add rax,QWORD[rdx]              ; add the 16-byte value, modulo 2^128
        adc rcx,QWORD[8+rdx]
        mov QWORD[rsi],rax              ; write the result
        mov QWORD[8+rsi],rcx

        mov rdi,QWORD[8+rsp]            ;WIN64 epilogue
        mov rsi,QWORD[16+rsp]
        DB 0F3h,0C3h                    ;repret
$L$SEH_end_poly1305_emit_x86_64:

;-----------------------------------------------------------------------
; __poly1305_block  (internal helper, private register convention)
; Multiplies the accumulator by r and partially reduces it.  The sequence
; is the same as the loop body of poly1305_blocks_x86_64 without the input
; addition.
; In:    r14:rbx:rbp = h0:h1:h2
;        rax = r1 (every caller loads it just before the call)
;        r11 = r0, r13 = s1 = r1 + (r1 >> 2)
; Out:   r14:rbx:rbp = h * r, with the top limb folded so that only its
;        low bits remain
; Clobb: rax, rdx, r8, r9, r10, flags
; Keeps: r11, r12, r13, r15, rcx, rsi, rdi
;-----------------------------------------------------------------------
ALIGN 32
__poly1305_block:
        mul r14                         ; h0*r1
        mov r9,rax
        mov rax,r11
        mov r10,rdx

        mul r14                         ; h0*r0
        mov r14,rax                     ; becomes the new h0
        mov rax,r11
        mov r8,rdx

        mul rbx                         ; h1*r0
        add r9,rax
        mov rax,r13
        adc r10,rdx

        mul rbx                         ; h1*s1
        mov rbx,rbp                     ; rbx temporarily holds h2
        add r14,rax
        adc r8,rdx

        imul rbx,r13                    ; h2*s1
        add r9,rbx
        mov rbx,r8
        adc r10,0

        imul rbp,r11                    ; h2*r0
        add rbx,r9                      ; new h1
        mov rax,-4
        adc r10,rbp                     ; r10 = top limb before reduction

        ; (r10 & ~3) + (r10 >> 2) = 5 * (r10 >> 2), folded into h0.
        and rax,r10
        mov rbp,r10
        shr r10,2
        and rbp,3
        add rax,r10
        add r14,rax
        adc rbx,0
        adc rbp,0
        DB 0F3h,0C3h                    ;repret

;-----------------------------------------------------------------------
; __poly1305_init_avx  (internal helper, private register convention)
; Builds the table of key powers used by the vector code.
; In:    rdi = context, r11 = r0, r12 = r1, r13 = s1 = r1 + (r1 >> 2)
; Out:   table at ctx+48 .. ctx+191, nine 16-byte rows of 26-bit limbs:
;          row (offset from ctx+112):  -64 limb0   -48 limb1   -32 5*limb1
;                                      -16 limb2     0 5*limb2  16 limb3
;                                       32 5*limb3  48 limb4    64 5*limb4
;        and within each row the four dwords hold, in order,
;          +0: r^2   +4: r^1   +8: r^4   +12: r^3
;        rdi is restored to the context pointer on return.
; Clobb: rax, rdx, r8, r9, r10, r14, rbx, rbp, flags (callers restore the
;        callee-saved ones from their own stack frame afterwards)
;-----------------------------------------------------------------------
ALIGN 32
__poly1305_init_avx:
        mov r14,r11                     ; h = r
        mov rbx,r12
        xor rbp,rbp

        lea rdi,[((48+64))+rdi]         ; bias rdi so every table offset fits in a disp8

        mov rax,r12
        call __poly1305_block           ; h = r^2

        ; Split r^2 (r14:rbx:rbp) and r (r11:r12) into 26-bit limbs side by side.
        mov eax,0x3ffffff
        mov edx,0x3ffffff
        mov r8,r14
        and eax,r14d
        mov r9,r11
        and edx,r11d
        mov DWORD[((-64))+rdi],eax      ; limb0 of r^2
        shr r8,26
        mov DWORD[((-60))+rdi],edx      ; limb0 of r
        shr r9,26

        mov eax,0x3ffffff
        mov edx,0x3ffffff
        and eax,r8d
        and edx,r9d
        mov DWORD[((-48))+rdi],eax      ; limb1
        lea eax,[rax*4+rax]             ; *5
        mov DWORD[((-44))+rdi],edx
        lea edx,[rdx*4+rdx]             ; *5
        mov DWORD[((-32))+rdi],eax      ; 5*limb1
        shr r8,26
        mov DWORD[((-28))+rdi],edx
        shr r9,26

        mov rax,rbx
        mov rdx,r12
        shl rax,12
        shl rdx,12
        or rax,r8                       ; limb2 straddles the two 64-bit words
        or rdx,r9
        and eax,0x3ffffff
        and edx,0x3ffffff
        mov DWORD[((-16))+rdi],eax      ; limb2
        lea eax,[rax*4+rax]
        mov DWORD[((-12))+rdi],edx
        lea edx,[rdx*4+rdx]
        mov DWORD[rdi],eax              ; 5*limb2
        mov r8,rbx
        mov DWORD[4+rdi],edx
        mov r9,r12

        mov eax,0x3ffffff
        mov edx,0x3ffffff
        shr r8,14
        shr r9,14
        and eax,r8d
        and edx,r9d
        mov DWORD[16+rdi],eax           ; limb3
        lea eax,[rax*4+rax]
        mov DWORD[20+rdi],edx
        lea edx,[rdx*4+rdx]
        mov DWORD[32+rdi],eax           ; 5*limb3
        shr r8,26
        mov DWORD[36+rdi],edx
        shr r9,26

        mov rax,rbp
        shl rax,24
        or r8,rax                       ; limb4 of r^2 includes the top word
        mov DWORD[48+rdi],r8d           ; limb4
        lea r8,[r8*4+r8]
        mov DWORD[52+rdi],r9d
        lea r9,[r9*4+r9]
        mov DWORD[64+rdi],r8d           ; 5*limb4
        mov DWORD[68+rdi],r9d

        mov rax,r12
        call __poly1305_block           ; h = r^3

        mov eax,0x3ffffff               ; r^3 goes to dword +12 of each row
        mov r8,r14
        and eax,r14d
        shr r8,26
        mov DWORD[((-52))+rdi],eax

        mov edx,0x3ffffff
        and edx,r8d
        mov DWORD[((-36))+rdi],edx
        lea edx,[rdx*4+rdx]
        shr r8,26
        mov DWORD[((-20))+rdi],edx

        mov rax,rbx
        shl rax,12
        or rax,r8
        and eax,0x3ffffff
        mov DWORD[((-4))+rdi],eax
        lea eax,[rax*4+rax]
        mov r8,rbx
        mov DWORD[12+rdi],eax

        mov edx,0x3ffffff
        shr r8,14
        and edx,r8d
        mov DWORD[28+rdi],edx
        lea edx,[rdx*4+rdx]
        shr r8,26
        mov DWORD[44+rdi],edx

        mov rax,rbp
        shl rax,24
        or r8,rax
        mov DWORD[60+rdi],r8d
        lea r8,[r8*4+r8]
        mov DWORD[76+rdi],r8d

        mov rax,r12
        call __poly1305_block           ; h = r^4

        mov eax,0x3ffffff               ; r^4 goes to dword +8 of each row
        mov r8,r14
        and eax,r14d
        shr r8,26
        mov DWORD[((-56))+rdi],eax

        mov edx,0x3ffffff
        and edx,r8d
        mov DWORD[((-40))+rdi],edx
        lea edx,[rdx*4+rdx]
        shr r8,26
        mov DWORD[((-24))+rdi],edx

        mov rax,rbx
        shl rax,12
        or rax,r8
        and eax,0x3ffffff
        mov DWORD[((-8))+rdi],eax
        lea eax,[rax*4+rax]
        mov r8,rbx
        mov DWORD[8+rdi],eax

        mov edx,0x3ffffff
        shr r8,14
        and edx,r8d
        mov DWORD[24+rdi],edx
        lea edx,[rdx*4+rdx]
        shr r8,26
        mov DWORD[40+rdi],edx

        mov rax,rbp
        shl rax,24
        or r8,rax
        mov DWORD[56+rdi],r8d
        lea r8,[r8*4+r8]
        mov DWORD[72+rdi],r8d

        lea rdi,[((-48-64))+rdi]        ; undo the bias: rdi = context again
        DB 0F3h,0C3h                    ;repret

;-----------------------------------------------------------------------
; poly1305_blocks_avx
; ABI:   Win64 on entry (rcx, rdx, r8, r9), shuffled into rdi/rsi/rdx/rcx.
; In:    rcx = context, rdx = input pointer, r8 = length in bytes,
;        r9  = pad bit (only honoured by the scalar steps; the vector
;              code always ORs 1<<24 into limb 4 of every block)
; State: the DWORD at ctx+20 is non-zero when the accumulator at ctx+0..+19
;        is held as five 26-bit limbs, zero when it is three 64-bit words.
; Paths: * len < 128 and accumulator in base 2^64 -> scalar $L$blocks.
;        * accumulator in base 2^64 ($L$base2_64_avx): at most one scalar
;          block to make the count even, convert to 26-bit limbs, set the
;          flag, build the power table, then vector code.
;        * accumulator in base 2^26, even block count ($L$even_avx): load
;          the limbs and go straight to the vector code.
;        * accumulator in base 2^26, odd block count: convert to base 2^64,
;          one scalar block, convert back (or store in base 2^64 when the
;          pad bit is zero), then vector code if data remains.
; Vector register roles from $L$do_avx on:
;        xmm0-xmm4   accumulator limbs h0..h4
;        xmm5-xmm9   input limbs / scratch
;        xmm10-xmm14 products d0..d4
;        xmm15       $L$mask26
; Stack: [rsp+0 .. +143]       r^2 table (r0,r1,s1,r2,s2,r3,s3,r4,s4)
;        [r11-144 .. r11-1]    r^4 table, same order
;        [r11+0 .. r11+79]     accumulator spill
;        [r11+80 .. r11+239]   saved xmm6-xmm15 (callee-saved on Win64)
;-----------------------------------------------------------------------
ALIGN 32
poly1305_blocks_avx:
        mov QWORD[8+rsp],rdi            ;WIN64 prologue
        mov QWORD[16+rsp],rsi
        mov rax,rsp
$L$SEH_begin_poly1305_blocks_avx:
        mov rdi,rcx
        mov rsi,rdx
        mov rdx,r8
        mov rcx,r9

        mov r8d,DWORD[20+rdi]           ; base-2^26 flag
        cmp rdx,128
        jae NEAR $L$blocks_avx
        test r8d,r8d
        jz NEAR $L$blocks               ; short input, base 2^64: scalar code

$L$blocks_avx:
        and rdx,-16                     ; whole blocks only
        jz NEAR $L$no_data_avx

        vzeroupper

        test r8d,r8d
        jz NEAR $L$base2_64_avx

        test rdx,31
        jz NEAR $L$even_avx             ; even number of blocks

        push rbx
        push rbp
        push r12
        push r13
        push r14
        push r15
$L$blocks_avx_body:

        mov r15,rdx

        mov r8,QWORD[rdi]               ; limbs 0,1 packed as two dwords
        mov r9,QWORD[8+rdi]             ; limbs 2,3
        mov ebp,DWORD[16+rdi]           ; limb 4

        mov r11,QWORD[24+rdi]           ; r0
        mov r13,QWORD[32+rdi]           ; r1

        ; ---- base 2^26 -> base 2^64 ----
        mov r14d,r8d
        and r8,-2147483648
        mov r12,r9
        mov ebx,r9d
        and r9,-2147483648

        shr r8,6
        shl r12,52
        add r14,r8
        shr rbx,12
        shr r9,18
        add r14,r12
        adc rbx,r9

        mov r8,rbp
        shl r8,40
        shr rbp,24
        add rbx,r8
        adc rbp,0

        ; The limbs may be only partially reduced, so fold the top word.
        mov r9,-4
        mov r8,rbp
        and r9,rbp
        shr r8,2
        and rbp,3
        add r8,r9
        add r14,r8
        adc rbx,0
        adc rbp,0

        mov r12,r13
        mov rax,r13                     ; rax = r1 for __poly1305_block
        shr r13,2
        add r13,r12                     ; s1 = r1 + (r1 >> 2)

        add r14,QWORD[rsi]              ; absorb one block
        adc rbx,QWORD[8+rsi]
        lea rsi,[16+rsi]
        adc rbp,rcx

        call __poly1305_block

        test rcx,rcx                    ; pad bit zero -> keep base 2^64
        jz NEAR $L$store_base2_64_avx

        ; ---- base 2^64 -> base 2^26 ----
        mov rax,r14
        mov rdx,r14
        shr r14,52
        mov r11,rbx
        mov r12,rbx
        shr rdx,26
        and rax,0x3ffffff               ; limb 0
        shl r11,12
        and rdx,0x3ffffff               ; limb 1
        shr rbx,14
        or r14,r11
        shl rbp,24
        and r14,0x3ffffff               ; limb 2
        shr r12,40
        and rbx,0x3ffffff               ; limb 3
        or rbp,r12                      ; limb 4

        sub r15,16
        jz NEAR $L$store_base2_26_avx   ; that was the only block

        vmovd xmm0,eax
        vmovd xmm1,edx
        vmovd xmm2,r14d
        vmovd xmm3,ebx
        vmovd xmm4,ebp
        jmp NEAR $L$proceed_avx

ALIGN 32
$L$store_base2_64_avx:
        mov QWORD[rdi],r14
        mov QWORD[8+rdi],rbx
        mov QWORD[16+rdi],rbp           ; 8-byte store also overwrites the flag at +20
        jmp NEAR $L$done_avx

ALIGN 16
$L$store_base2_26_avx:
        mov DWORD[rdi],eax
        mov DWORD[4+rdi],edx
        mov DWORD[8+rdi],r14d
        mov DWORD[12+rdi],ebx
        mov DWORD[16+rdi],ebp
ALIGN 16
$L$done_avx:
        mov r15,QWORD[rsp]
        mov r14,QWORD[8+rsp]
        mov r13,QWORD[16+rsp]
        mov r12,QWORD[24+rsp]
        mov rbp,QWORD[32+rsp]
        mov rbx,QWORD[40+rsp]
        lea rsp,[48+rsp]

$L$no_data_avx:
$L$blocks_avx_epilogue:
        mov rdi,QWORD[8+rsp]            ;WIN64 epilogue
        mov rsi,QWORD[16+rsp]
        DB 0F3h,0C3h                    ;repret

ALIGN 32
$L$base2_64_avx:
        push rbx
        push rbp
        push r12
        push r13
        push r14
        push r15
$L$base2_64_avx_body:

        mov r15,rdx

        mov r11,QWORD[24+rdi]           ; r0
        mov r13,QWORD[32+rdi]           ; r1

        mov r14,QWORD[rdi]              ; h0
        mov rbx,QWORD[8+rdi]            ; h1
        mov ebp,DWORD[16+rdi]           ; h2

        mov r12,r13
        mov rax,r13
        shr r13,2
        add r13,r12                     ; s1 = r1 + (r1 >> 2)

        test rdx,31
        jz NEAR $L$init_avx             ; already an even number of blocks

        add r14,QWORD[rsi]              ; absorb one block to make it even
        adc rbx,QWORD[8+rsi]
        lea rsi,[16+rsi]
        adc rbp,rcx
        sub r15,16

        call __poly1305_block

$L$init_avx:
        ; ---- base 2^64 -> base 2^26 ----
        mov rax,r14
        mov rdx,r14
        shr r14,52
        mov r8,rbx
        mov r9,rbx
        shr rdx,26
        and rax,0x3ffffff               ; limb 0
        shl r8,12
        and rdx,0x3ffffff               ; limb 1
        shr rbx,14
        or r14,r8
        shl rbp,24
        and r14,0x3ffffff               ; limb 2
        shr r9,40
        and rbx,0x3ffffff               ; limb 3
        or rbp,r9                       ; limb 4

        vmovd xmm0,eax
        vmovd xmm1,edx
        vmovd xmm2,r14d
        vmovd xmm3,ebx
        vmovd xmm4,ebp
        mov DWORD[20+rdi],1             ; accumulator is base 2^26 from now on

        call __poly1305_init_avx        ; build the r^1..r^4 table

$L$proceed_avx:
        mov rdx,r15                     ; remaining length

        mov r15,QWORD[rsp]
        mov r14,QWORD[8+rsp]
        mov r13,QWORD[16+rsp]
        mov r12,QWORD[24+rsp]
        mov rbp,QWORD[32+rsp]
        mov rbx,QWORD[40+rsp]
        lea rax,[48+rsp]
        lea rsp,[48+rsp]

$L$base2_64_avx_epilogue:
        jmp NEAR $L$do_avx

ALIGN 32
$L$even_avx:
        vmovd xmm0,DWORD[rdi]           ; load the five accumulator limbs
        vmovd xmm1,DWORD[4+rdi]
        vmovd xmm2,DWORD[8+rdi]
        vmovd xmm3,DWORD[12+rdi]
        vmovd xmm4,DWORD[16+rdi]

$L$do_avx:
        lea r11,[((-248))+rsp]
        sub rsp,0x218
        vmovdqa XMMWORD[80+r11],xmm6    ; xmm6-xmm15 are callee-saved on Win64
        vmovdqa XMMWORD[96+r11],xmm7
        vmovdqa XMMWORD[112+r11],xmm8
        vmovdqa XMMWORD[128+r11],xmm9
        vmovdqa XMMWORD[144+r11],xmm10
        vmovdqa XMMWORD[160+r11],xmm11
        vmovdqa XMMWORD[176+r11],xmm12
        vmovdqa XMMWORD[192+r11],xmm13
        vmovdqa XMMWORD[208+r11],xmm14
        vmovdqa XMMWORD[224+r11],xmm15
$L$do_avx_body:
        sub rdx,64                      ; flags from here feed the jbe below;
                                        ; nothing in between modifies them
        lea rax,[((-32))+rsi]
        cmovc rsi,rax                   ; fewer than 64 bytes: back rsi up by 32 so
                                        ; the +32/+48 loads read the first pair

        vmovdqu xmm14,XMMWORD[48+rdi]   ; preload table row 0 (limb 0 of the powers)
        lea rdi,[112+rdi]               ; bias rdi to the middle of the power table
        lea rcx,[$L$const]              ; rcx (pad bit) is repurposed from here on

        ; ---- load two blocks and split them into 26-bit limbs ----
        vmovdqu xmm5,XMMWORD[32+rsi]
        vmovdqu xmm6,XMMWORD[48+rsi]
        vmovdqa xmm15,XMMWORD[64+rcx]   ; $L$mask26

        vpsrldq xmm7,xmm5,6
        vpsrldq xmm8,xmm6,6
        vpunpckhqdq xmm9,xmm5,xmm6      ; limb 4 source
        vpunpcklqdq xmm5,xmm5,xmm6      ; limbs 0,1 source
        vpunpcklqdq xmm8,xmm7,xmm8      ; limbs 2,3 source

        vpsrlq xmm9,xmm9,40             ; limb 4
        vpsrlq xmm6,xmm5,26
        vpand xmm5,xmm5,xmm15           ; limb 0
        vpsrlq xmm7,xmm8,4
        vpand xmm6,xmm6,xmm15           ; limb 1
        vpsrlq xmm8,xmm8,30
        vpand xmm7,xmm7,xmm15           ; limb 2
        vpand xmm8,xmm8,xmm15           ; limb 3
        vpor xmm9,xmm9,XMMWORD[32+rcx]  ; $L$129: set bit 128 of each block

        jbe NEAR $L$skip_loop_avx       ; 64 bytes or fewer: tail only

        ; ---- expand the power table onto the stack ----
        ; 0x44 duplicates the low qword of a row (r^2 in the multiplied
        ; dword lanes), 0xEE the high qword (r^4).
        vmovdqu xmm11,XMMWORD[((-48))+rdi]
        vmovdqu xmm12,XMMWORD[((-32))+rdi]
        vpshufd xmm13,xmm14,0xEE
        vpshufd xmm10,xmm14,0x44
        vmovdqa XMMWORD[(-144)+r11],xmm13
        vmovdqa XMMWORD[rsp],xmm10
        vpshufd xmm14,xmm11,0xEE
        vmovdqu xmm10,XMMWORD[((-16))+rdi]
        vpshufd xmm11,xmm11,0x44
        vmovdqa XMMWORD[(-128)+r11],xmm14
        vmovdqa XMMWORD[16+rsp],xmm11
        vpshufd xmm13,xmm12,0xEE
        vmovdqu xmm11,XMMWORD[rdi]
        vpshufd xmm12,xmm12,0x44
        vmovdqa XMMWORD[(-112)+r11],xmm13
        vmovdqa XMMWORD[32+rsp],xmm12
        vpshufd xmm14,xmm10,0xEE
        vmovdqu xmm12,XMMWORD[16+rdi]
        vpshufd xmm10,xmm10,0x44
        vmovdqa XMMWORD[(-96)+r11],xmm14
        vmovdqa XMMWORD[48+rsp],xmm10
        vpshufd xmm13,xmm11,0xEE
        vmovdqu xmm10,XMMWORD[32+rdi]
        vpshufd xmm11,xmm11,0x44
        vmovdqa XMMWORD[(-80)+r11],xmm13
        vmovdqa XMMWORD[64+rsp],xmm11
        vpshufd xmm14,xmm12,0xEE
        vmovdqu xmm11,XMMWORD[48+rdi]
        vpshufd xmm12,xmm12,0x44
        vmovdqa XMMWORD[(-64)+r11],xmm14
        vmovdqa XMMWORD[80+rsp],xmm12
        vpshufd xmm13,xmm10,0xEE
        vmovdqu xmm12,XMMWORD[64+rdi]
        vpshufd xmm10,xmm10,0x44
        vmovdqa XMMWORD[(-48)+r11],xmm13
        vmovdqa XMMWORD[96+rsp],xmm10
        vpshufd xmm14,xmm11,0xEE
        vpshufd xmm11,xmm11,0x44
        vmovdqa XMMWORD[(-32)+r11],xmm14
        vmovdqa XMMWORD[112+rsp],xmm11
        vpshufd xmm13,xmm12,0xEE
        vmovdqa xmm14,XMMWORD[rsp]      ; preload r0 of the r^2 table
        vpshufd xmm12,xmm12,0x44
        vmovdqa XMMWORD[(-16)+r11],xmm13
        vmovdqa XMMWORD[128+rsp],xmm12

        jmp NEAR $L$oop_avx

ALIGN 32
$L$oop_avx:
        ; Each iteration consumes 64 bytes.
        ; First half: d0..d4 = (input pair in xmm5-xmm9) * r^2 table, while
        ; the accumulator is spilled to [r11+0..+79] and the other input
        ; pair is loaded into xmm0-xmm4 and added to it.
        vpmuludq xmm10,xmm14,xmm5
        vpmuludq xmm11,xmm14,xmm6
        vmovdqa XMMWORD[32+r11],xmm2    ; spill h2
        vpmuludq xmm12,xmm14,xmm7
        vmovdqa xmm2,XMMWORD[16+rsp]    ; r1
        vpmuludq xmm13,xmm14,xmm8
        vpmuludq xmm14,xmm14,xmm9

        vmovdqa XMMWORD[r11],xmm0       ; spill h0
        vpmuludq xmm0,xmm9,XMMWORD[32+rsp]      ; * s1
        vmovdqa XMMWORD[16+r11],xmm1    ; spill h1
        vpmuludq xmm1,xmm2,xmm8
        vpaddq xmm10,xmm10,xmm0
        vpaddq xmm14,xmm14,xmm1
        vmovdqa XMMWORD[48+r11],xmm3    ; spill h3
        vpmuludq xmm0,xmm2,xmm7
        vpmuludq xmm1,xmm2,xmm6
        vpaddq xmm13,xmm13,xmm0
        vmovdqa xmm3,XMMWORD[48+rsp]    ; r2
        vpaddq xmm12,xmm12,xmm1
        vmovdqa XMMWORD[64+r11],xmm4    ; spill h4
        vpmuludq xmm2,xmm2,xmm5
        vpmuludq xmm0,xmm3,xmm7
        vpaddq xmm11,xmm11,xmm2

        vmovdqa xmm4,XMMWORD[64+rsp]    ; s2
        vpaddq xmm14,xmm14,xmm0
        vpmuludq xmm1,xmm3,xmm6
        vpmuludq xmm3,xmm3,xmm5
        vpaddq xmm13,xmm13,xmm1
        vmovdqa xmm2,XMMWORD[80+rsp]    ; r3
        vpaddq xmm12,xmm12,xmm3
        vpmuludq xmm0,xmm4,xmm9
        vpmuludq xmm4,xmm4,xmm8
        vpaddq xmm11,xmm11,xmm0
        vmovdqa xmm3,XMMWORD[96+rsp]    ; s3
        vpaddq xmm10,xmm10,xmm4

        vmovdqa xmm4,XMMWORD[128+rsp]   ; s4
        vpmuludq xmm1,xmm2,xmm6
        vpmuludq xmm2,xmm2,xmm5
        vpaddq xmm14,xmm14,xmm1
        vpaddq xmm13,xmm13,xmm2
        vpmuludq xmm0,xmm3,xmm9
        vpmuludq xmm1,xmm3,xmm8
        vpaddq xmm12,xmm12,xmm0
        vmovdqu xmm0,XMMWORD[rsi]       ; next input, first block
        vpaddq xmm11,xmm11,xmm1
        vpmuludq xmm3,xmm3,xmm7
        vpmuludq xmm7,xmm4,xmm7
        vpaddq xmm10,xmm10,xmm3

        vmovdqu xmm1,XMMWORD[16+rsi]    ; next input, second block
        vpaddq xmm11,xmm11,xmm7
        vpmuludq xmm8,xmm4,xmm8
        vpmuludq xmm9,xmm4,xmm9
        vpsrldq xmm2,xmm0,6
        vpaddq xmm12,xmm12,xmm8
        vpaddq xmm13,xmm13,xmm9
        vpsrldq xmm3,xmm1,6
        vpmuludq xmm9,xmm5,XMMWORD[112+rsp]     ; * r4
        vpmuludq xmm5,xmm4,xmm6
        vpunpckhqdq xmm4,xmm0,xmm1
        vpaddq xmm14,xmm14,xmm9
        vmovdqa xmm9,XMMWORD[((-144))+r11]      ; r0 of the r^4 table
        vpaddq xmm10,xmm10,xmm5

        vpunpcklqdq xmm0,xmm0,xmm1
        vpunpcklqdq xmm3,xmm2,xmm3

        ; A 5-byte whole-register shift plus $L$mask24 stands in for the
        ; per-lane 40-bit shift used outside the loop.
        vpsrldq xmm4,xmm4,5
        vpsrlq xmm1,xmm0,26
        vpand xmm0,xmm0,xmm15
        vpsrlq xmm2,xmm3,4
        vpand xmm1,xmm1,xmm15
        vpand xmm4,xmm4,XMMWORD[rcx]    ; $L$mask24
        vpsrlq xmm3,xmm3,30
        vpand xmm2,xmm2,xmm15
        vpand xmm3,xmm3,xmm15
        vpor xmm4,xmm4,XMMWORD[32+rcx]  ; $L$129

        vpaddq xmm0,xmm0,XMMWORD[r11]   ; add the spilled accumulator
        vpaddq xmm1,xmm1,XMMWORD[16+r11]
        vpaddq xmm2,xmm2,XMMWORD[32+r11]
        vpaddq xmm3,xmm3,XMMWORD[48+r11]
        vpaddq xmm4,xmm4,XMMWORD[64+r11]

        lea rax,[32+rsi]
        lea rsi,[64+rsi]
        sub rdx,64                      ; flags feed the `ja` at the loop bottom
        cmovc rsi,rax                   ; last odd pair: advance by 32 only

        ; Second half: d0..d4 += (accumulator + input in xmm0-xmm4) * r^4
        ; table, interleaved with loading the next input pair into
        ; xmm5-xmm9.
        vpmuludq xmm5,xmm9,xmm0
        vpmuludq xmm6,xmm9,xmm1
        vpaddq xmm10,xmm10,xmm5
        vpaddq xmm11,xmm11,xmm6
        vmovdqa xmm7,XMMWORD[((-128))+r11]      ; r1
        vpmuludq xmm5,xmm9,xmm2
        vpmuludq xmm6,xmm9,xmm3
        vpaddq xmm12,xmm12,xmm5
        vpaddq xmm13,xmm13,xmm6
        vpmuludq xmm9,xmm9,xmm4
        vpmuludq xmm5,xmm4,XMMWORD[((-112))+r11]        ; * s1
        vpaddq xmm14,xmm14,xmm9

        vpaddq xmm10,xmm10,xmm5
        vpmuludq xmm6,xmm7,xmm2
        vpmuludq xmm5,xmm7,xmm3
        vpaddq xmm13,xmm13,xmm6
        vmovdqa xmm8,XMMWORD[((-96))+r11]       ; r2
        vpaddq xmm14,xmm14,xmm5
        vpmuludq xmm6,xmm7,xmm1
        vpmuludq xmm7,xmm7,xmm0
        vpaddq xmm12,xmm12,xmm6
        vpaddq xmm11,xmm11,xmm7

        vmovdqa xmm9,XMMWORD[((-80))+r11]       ; s2
        vpmuludq xmm5,xmm8,xmm2
        vpmuludq xmm6,xmm8,xmm1
        vpaddq xmm14,xmm14,xmm5
        vpaddq xmm13,xmm13,xmm6
        vmovdqa xmm7,XMMWORD[((-64))+r11]       ; r3
        vpmuludq xmm8,xmm8,xmm0
        vpmuludq xmm5,xmm9,xmm4
        vpaddq xmm12,xmm12,xmm8
        vpaddq xmm11,xmm11,xmm5
        vmovdqa xmm8,XMMWORD[((-48))+r11]       ; s3
        vpmuludq xmm9,xmm9,xmm3
        vpmuludq xmm6,xmm7,xmm1
        vpaddq xmm10,xmm10,xmm9

        vmovdqa xmm9,XMMWORD[((-16))+r11]       ; s4
        vpaddq xmm14,xmm14,xmm6
        vpmuludq xmm7,xmm7,xmm0
        vpmuludq xmm5,xmm8,xmm4
        vpaddq xmm13,xmm13,xmm7
        vpaddq xmm12,xmm12,xmm5
        vmovdqu xmm5,XMMWORD[32+rsi]    ; next input pair, first block
        vpmuludq xmm7,xmm8,xmm3
        vpmuludq xmm8,xmm8,xmm2
        vpaddq xmm11,xmm11,xmm7
        vmovdqu xmm6,XMMWORD[48+rsi]    ; next input pair, second block
        vpaddq xmm10,xmm10,xmm8

        vpmuludq xmm2,xmm9,xmm2
        vpmuludq xmm3,xmm9,xmm3
        vpsrldq xmm7,xmm5,6
        vpaddq xmm11,xmm11,xmm2
        vpmuludq xmm4,xmm9,xmm4
        vpsrldq xmm8,xmm6,6
        vpaddq xmm2,xmm12,xmm3          ; h2 = d2 + ...
        vpaddq xmm3,xmm13,xmm4          ; h3 = d3 + ...
        vpmuludq xmm4,xmm0,XMMWORD[((-32))+r11] ; * r4
        vpmuludq xmm0,xmm9,xmm1
        vpunpckhqdq xmm9,xmm5,xmm6
        vpaddq xmm4,xmm14,xmm4          ; h4 = d4 + ...
        vpaddq xmm0,xmm10,xmm0          ; h0 = d0 + ...

        vpunpcklqdq xmm5,xmm5,xmm6
        vpunpcklqdq xmm8,xmm7,xmm8

        vpsrldq xmm9,xmm9,5
        vpsrlq xmm6,xmm5,26
        vmovdqa xmm14,XMMWORD[rsp]      ; preload r0 of the r^2 table
        vpand xmm5,xmm5,xmm15
        vpsrlq xmm7,xmm8,4
        vpand xmm6,xmm6,xmm15
        vpand xmm9,xmm9,XMMWORD[rcx]    ; $L$mask24
        vpsrlq xmm8,xmm8,30
        vpand xmm7,xmm7,xmm15
        vpand xmm8,xmm8,xmm15
        vpor xmm9,xmm9,XMMWORD[32+rcx]  ; $L$129

        ; ---- carry propagation between limbs (partial reduction) ----
        vpsrlq xmm13,xmm3,26
        vpand xmm3,xmm3,xmm15
        vpaddq xmm4,xmm4,xmm13          ; h3 -> h4

        vpsrlq xmm10,xmm0,26
        vpand xmm0,xmm0,xmm15
        vpaddq xmm1,xmm11,xmm10         ; h0 -> h1 (h1 = d1 + carry)

        vpsrlq xmm10,xmm4,26
        vpand xmm4,xmm4,xmm15

        vpsrlq xmm11,xmm1,26
        vpand xmm1,xmm1,xmm15
        vpaddq xmm2,xmm2,xmm11          ; h1 -> h2

        vpaddq xmm0,xmm0,xmm10
        vpsllq xmm10,xmm10,2
        vpaddq xmm0,xmm0,xmm10          ; h4 -> h0, carry * 5 as c + 4c

        vpsrlq xmm12,xmm2,26
        vpand xmm2,xmm2,xmm15
        vpaddq xmm3,xmm3,xmm12          ; h2 -> h3

        vpsrlq xmm10,xmm0,26
        vpand xmm0,xmm0,xmm15
        vpaddq xmm1,xmm1,xmm10          ; h0 -> h1

        vpsrlq xmm13,xmm3,26
        vpand xmm3,xmm3,xmm15
        vpaddq xmm4,xmm4,xmm13          ; h3 -> h4

        ja NEAR $L$oop_avx              ; flags still from `sub rdx,64` above

$L$skip_loop_avx:
        ; 0x10 puts dwords 0 and 1 of the row (r^2 and r^1) into the two
        ; multiplied lanes.
        vpshufd xmm14,xmm14,0x10
        add rdx,32                      ; flags also feed the `jz` before the
                                        ; optional second tail stage
        jnz NEAR $L$ong_tail_avx

        vpaddq xmm7,xmm7,xmm2           ; only one pair left: fold the accumulator
        vpaddq xmm5,xmm5,xmm0           ; into it
        vpaddq xmm6,xmm6,xmm1
        vpaddq xmm8,xmm8,xmm3
        vpaddq xmm9,xmm9,xmm4

$L$ong_tail_avx:
        vmovdqa XMMWORD[32+r11],xmm2    ; spill the accumulator
        vmovdqa XMMWORD[r11],xmm0
        vmovdqa XMMWORD[16+r11],xmm1
        vmovdqa XMMWORD[48+r11],xmm3
        vmovdqa XMMWORD[64+r11],xmm4

        ; d0..d4 = (xmm5-xmm9) * (r^2, r^1), rows read straight from the
        ; table in the context.
        vpmuludq xmm12,xmm14,xmm7
        vpmuludq xmm10,xmm14,xmm5
        vpshufd xmm2,XMMWORD[((-48))+rdi],0x10  ; r1
        vpmuludq xmm11,xmm14,xmm6
        vpmuludq xmm13,xmm14,xmm8
        vpmuludq xmm14,xmm14,xmm9

        vpmuludq xmm0,xmm2,xmm8
        vpaddq xmm14,xmm14,xmm0
        vpshufd xmm3,XMMWORD[((-32))+rdi],0x10  ; s1
        vpmuludq xmm1,xmm2,xmm7
        vpaddq xmm13,xmm13,xmm1
        vpshufd xmm4,XMMWORD[((-16))+rdi],0x10  ; r2
        vpmuludq xmm0,xmm2,xmm6
        vpaddq xmm12,xmm12,xmm0
        vpmuludq xmm2,xmm2,xmm5
        vpaddq xmm11,xmm11,xmm2
        vpmuludq xmm3,xmm3,xmm9
        vpaddq xmm10,xmm10,xmm3

        vpshufd xmm2,XMMWORD[rdi],0x10  ; s2
        vpmuludq xmm1,xmm4,xmm7
        vpaddq xmm14,xmm14,xmm1
        vpmuludq xmm0,xmm4,xmm6
        vpaddq xmm13,xmm13,xmm0
        vpshufd xmm3,XMMWORD[16+rdi],0x10       ; r3
        vpmuludq xmm4,xmm4,xmm5
        vpaddq xmm12,xmm12,xmm4
        vpmuludq xmm1,xmm2,xmm9
        vpaddq xmm11,xmm11,xmm1
        vpshufd xmm4,XMMWORD[32+rdi],0x10       ; s3
        vpmuludq xmm2,xmm2,xmm8
        vpaddq xmm10,xmm10,xmm2

        vpmuludq xmm0,xmm3,xmm6
        vpaddq xmm14,xmm14,xmm0
        vpmuludq xmm3,xmm3,xmm5
        vpaddq xmm13,xmm13,xmm3
        vpshufd xmm2,XMMWORD[48+rdi],0x10       ; r4
        vpmuludq xmm1,xmm4,xmm9
        vpaddq xmm12,xmm12,xmm1
        vpshufd xmm3,XMMWORD[64+rdi],0x10       ; s4
        vpmuludq xmm0,xmm4,xmm8
        vpaddq xmm11,xmm11,xmm0
        vpmuludq xmm4,xmm4,xmm7
        vpaddq xmm10,xmm10,xmm4

        vpmuludq xmm2,xmm2,xmm5
        vpaddq xmm14,xmm14,xmm2
        vpmuludq xmm1,xmm3,xmm9
        vpaddq xmm13,xmm13,xmm1
        vpmuludq xmm0,xmm3,xmm8
        vpaddq xmm12,xmm12,xmm0
        vpmuludq xmm1,xmm3,xmm7
        vpaddq xmm11,xmm11,xmm1
        vpmuludq xmm3,xmm3,xmm6
        vpaddq xmm10,xmm10,xmm3

        jz NEAR $L$short_tail_avx       ; ZF still from `add rdx,32`

        ; Two pairs were left: the earlier pair plus the accumulator is
        ; multiplied by (r^4, r^3) and added on top.
        vmovdqu xmm0,XMMWORD[rsi]
        vmovdqu xmm1,XMMWORD[16+rsi]

        vpsrldq xmm2,xmm0,6
        vpsrldq xmm3,xmm1,6
        vpunpckhqdq xmm4,xmm0,xmm1
        vpunpcklqdq xmm0,xmm0,xmm1
        vpunpcklqdq xmm3,xmm2,xmm3

        vpsrlq xmm4,xmm4,40
        vpsrlq xmm1,xmm0,26
        vpand xmm0,xmm0,xmm15
        vpsrlq xmm2,xmm3,4
        vpand xmm1,xmm1,xmm15
        vpsrlq xmm3,xmm3,30
        vpand xmm2,xmm2,xmm15
        vpand xmm3,xmm3,xmm15
        vpor xmm4,xmm4,XMMWORD[32+rcx]  ; $L$129

        ; 0x32 puts dwords 2 and 3 of the row (r^4 and r^3) into the two
        ; multiplied lanes.
        vpshufd xmm9,XMMWORD[((-64))+rdi],0x32  ; r0
        vpaddq xmm0,xmm0,XMMWORD[r11]   ; add the spilled accumulator
        vpaddq xmm1,xmm1,XMMWORD[16+r11]
        vpaddq xmm2,xmm2,XMMWORD[32+r11]
        vpaddq xmm3,xmm3,XMMWORD[48+r11]
        vpaddq xmm4,xmm4,XMMWORD[64+r11]

        vpmuludq xmm5,xmm9,xmm0
        vpaddq xmm10,xmm10,xmm5
        vpmuludq xmm6,xmm9,xmm1
        vpaddq xmm11,xmm11,xmm6
        vpmuludq xmm5,xmm9,xmm2
        vpaddq xmm12,xmm12,xmm5
        vpshufd xmm7,XMMWORD[((-48))+rdi],0x32  ; r1
        vpmuludq xmm6,xmm9,xmm3
        vpaddq xmm13,xmm13,xmm6
        vpmuludq xmm9,xmm9,xmm4
        vpaddq xmm14,xmm14,xmm9

        vpmuludq xmm5,xmm7,xmm3
        vpaddq xmm14,xmm14,xmm5
        vpshufd xmm8,XMMWORD[((-32))+rdi],0x32  ; s1
        vpmuludq xmm6,xmm7,xmm2
        vpaddq xmm13,xmm13,xmm6
        vpshufd xmm9,XMMWORD[((-16))+rdi],0x32  ; r2
        vpmuludq xmm5,xmm7,xmm1
        vpaddq xmm12,xmm12,xmm5
        vpmuludq xmm7,xmm7,xmm0
        vpaddq xmm11,xmm11,xmm7
        vpmuludq xmm8,xmm8,xmm4
        vpaddq xmm10,xmm10,xmm8

        vpshufd xmm7,XMMWORD[rdi],0x32  ; s2
        vpmuludq xmm6,xmm9,xmm2
        vpaddq xmm14,xmm14,xmm6
        vpmuludq xmm5,xmm9,xmm1
        vpaddq xmm13,xmm13,xmm5
        vpshufd xmm8,XMMWORD[16+rdi],0x32       ; r3
        vpmuludq xmm9,xmm9,xmm0
        vpaddq xmm12,xmm12,xmm9
        vpmuludq xmm6,xmm7,xmm4
        vpaddq xmm11,xmm11,xmm6
        vpshufd xmm9,XMMWORD[32+rdi],0x32       ; s3
        vpmuludq xmm7,xmm7,xmm3
        vpaddq xmm10,xmm10,xmm7

        vpmuludq xmm5,xmm8,xmm1
        vpaddq xmm14,xmm14,xmm5
        vpmuludq xmm8,xmm8,xmm0
        vpaddq xmm13,xmm13,xmm8
        vpshufd xmm7,XMMWORD[48+rdi],0x32       ; r4
        vpmuludq xmm6,xmm9,xmm4
        vpaddq xmm12,xmm12,xmm6
        vpshufd xmm8,XMMWORD[64+rdi],0x32       ; s4
        vpmuludq xmm5,xmm9,xmm3
        vpaddq xmm11,xmm11,xmm5
        vpmuludq xmm9,xmm9,xmm2
        vpaddq xmm10,xmm10,xmm9

        vpmuludq xmm7,xmm7,xmm0
        vpaddq xmm14,xmm14,xmm7
        vpmuludq xmm6,xmm8,xmm4
        vpaddq xmm13,xmm13,xmm6
        vpmuludq xmm5,xmm8,xmm3
        vpaddq xmm12,xmm12,xmm5
        vpmuludq xmm6,xmm8,xmm2
        vpaddq xmm11,xmm11,xmm6
        vpmuludq xmm8,xmm8,xmm1
        vpaddq xmm10,xmm10,xmm8

$L$short_tail_avx:
        ; ---- horizontal add: fold the high lane onto the low lane ----
        vpsrldq xmm9,xmm14,8
        vpsrldq xmm8,xmm13,8
        vpsrldq xmm6,xmm11,8
        vpsrldq xmm5,xmm10,8
        vpsrldq xmm7,xmm12,8
        vpaddq xmm13,xmm13,xmm8
        vpaddq xmm14,xmm14,xmm9
        vpaddq xmm10,xmm10,xmm5
        vpaddq xmm11,xmm11,xmm6
        vpaddq xmm12,xmm12,xmm7

        ; ---- carry propagation, same chain as at the loop bottom ----
        vpsrlq xmm3,xmm13,26
        vpand xmm13,xmm13,xmm15
        vpaddq xmm14,xmm14,xmm3         ; h3 -> h4

        vpsrlq xmm0,xmm10,26
        vpand xmm10,xmm10,xmm15
        vpaddq xmm11,xmm11,xmm0         ; h0 -> h1

        vpsrlq xmm4,xmm14,26
        vpand xmm14,xmm14,xmm15

        vpsrlq xmm1,xmm11,26
        vpand xmm11,xmm11,xmm15
        vpaddq xmm12,xmm12,xmm1         ; h1 -> h2

        vpaddq xmm10,xmm10,xmm4
        vpsllq xmm4,xmm4,2
        vpaddq xmm10,xmm10,xmm4         ; h4 -> h0, carry * 5

        vpsrlq xmm2,xmm12,26
        vpand xmm12,xmm12,xmm15
        vpaddq xmm13,xmm13,xmm2         ; h2 -> h3

        vpsrlq xmm0,xmm10,26
        vpand xmm10,xmm10,xmm15
        vpaddq xmm11,xmm11,xmm0         ; h0 -> h1

        vpsrlq xmm3,xmm13,26
        vpand xmm13,xmm13,xmm15
        vpaddq xmm14,xmm14,xmm3         ; h3 -> h4

        ; rdi is still biased by +112, so these are ctx+0 .. ctx+16.
        vmovd DWORD[(-112)+rdi],xmm10
        vmovd DWORD[(-108)+rdi],xmm11
        vmovd DWORD[(-104)+rdi],xmm12
        vmovd DWORD[(-100)+rdi],xmm13
        vmovd DWORD[(-96)+rdi],xmm14
        vmovdqa xmm6,XMMWORD[80+r11]    ; restore the Win64 callee-saved xmm registers
        vmovdqa xmm7,XMMWORD[96+r11]
        vmovdqa xmm8,XMMWORD[112+r11]
        vmovdqa xmm9,XMMWORD[128+r11]
        vmovdqa xmm10,XMMWORD[144+r11]
        vmovdqa xmm11,XMMWORD[160+r11]
        vmovdqa xmm12,XMMWORD[176+r11]
        vmovdqa xmm13,XMMWORD[192+r11]
        vmovdqa xmm14,XMMWORD[208+r11]
        vmovdqa xmm15,XMMWORD[224+r11]
        lea rsp,[248+r11]               ; r11 was rsp-248 at $L$do_avx
$L$do_avx_epilogue:
        vzeroupper
        mov rdi,QWORD[8+rsp]            ;WIN64 epilogue
        mov rsi,QWORD[16+rsp]
        DB 0F3h,0C3h                    ;repret

$L$SEH_end_poly1305_blocks_avx:

;-----------------------------------------------------------------------
; poly1305_emit_avx
; ABI:   Win64 on entry (rcx, rdx, r8), shuffled into rdi/rsi/rdx.
; In:    rcx = context, rdx = output pointer (16 bytes), r8 = pointer to a
;        16-byte value added to the result
; Does:  if the DWORD at ctx+20 is zero the accumulator is in base 2^64
;        and the work is handed to $L$emit in poly1305_emit_x86_64.
;        Otherwise the five 26-bit limbs at ctx+0..+19 are recombined into
;        three 64-bit words, the top word is folded, and the same final
;        "h+5 / select / add / store" sequence as the scalar emit is run.
; Clobb: rax, rcx, r8, r9, r10, r11, flags
;-----------------------------------------------------------------------
ALIGN 32
poly1305_emit_avx:
        mov QWORD[8+rsp],rdi            ;WIN64 prologue
        mov QWORD[16+rsp],rsi
        mov rax,rsp
$L$SEH_begin_poly1305_emit_avx:
        mov rdi,rcx
        mov rsi,rdx
        mov rdx,r8

        cmp DWORD[20+rdi],0             ; accumulator still in base 2^64?
        je NEAR $L$emit

        mov eax,DWORD[rdi]              ; limb 0 (32-bit loads zero-extend)
        mov ecx,DWORD[4+rdi]            ; limb 1
        mov r8d,DWORD[8+rdi]            ; limb 2
        mov r11d,DWORD[12+rdi]          ; limb 3
        mov r10d,DWORD[16+rdi]          ; limb 4

        ; ---- base 2^26 -> base 2^64 ----
        shl rcx,26
        mov r9,r8
        shl r8,52
        add rax,rcx
        shr r9,12
        add r8,rax                      ; h0
        adc r9,0

        shl r11,14
        mov rax,r10
        shr r10,24
        add r9,r11
        shl rax,40
        add r9,rax                      ; h1
        adc r10,0                       ; h2

        ; The limbs may be only partially reduced: fold everything above
        ; bit 1 of h2 back into h0 as 5 * (h2 >> 2).
        mov rax,r10
        mov rcx,r10
        and r10,3
        shr rax,2
        and rcx,-4
        add rax,rcx
        add r8,rax
        adc r9,0
        adc r10,0

        mov rax,r8                      ; keep h0:h1 in rax:rcx
        add r8,5                        ; r8:r9:r10 = h + 5
        mov rcx,r9
        adc r9,0
        adc r10,0
        shr r10,2                       ; non-zero iff h + 5 reached 2^130
        cmovnz rax,r8
        cmovnz rcx,r9

        add rax,QWORD[rdx]              ; add the 16-byte value, modulo 2^128
        adc rcx,QWORD[8+rdx]
        mov QWORD[rsi],rax              ; write the result
        mov QWORD[8+rsi],rcx

        mov rdi,QWORD[8+rsp]            ;WIN64 epilogue
        mov rsi,QWORD[16+rsp]
        DB 0F3h,0C3h                    ;repret
$L$SEH_end_poly1305_emit_avx:

ALIGN 32
|
|
poly1305_blocks_avx2:
|
|
mov QWORD[8+rsp],rdi ;WIN64 prologue
|
|
mov QWORD[16+rsp],rsi
|
|
mov rax,rsp
|
|
$L$SEH_begin_poly1305_blocks_avx2:
|
|
mov rdi,rcx
|
|
mov rsi,rdx
|
|
mov rdx,r8
|
|
mov rcx,r9
|
|
|
|
|
|
|
|
mov r8d,DWORD[20+rdi]
|
|
cmp rdx,128
|
|
jae NEAR $L$blocks_avx2
|
|
test r8d,r8d
|
|
jz NEAR $L$blocks
|
|
|
|
$L$blocks_avx2:
|
|
and rdx,-16
|
|
jz NEAR $L$no_data_avx2
|
|
|
|
vzeroupper
|
|
|
|
test r8d,r8d
|
|
jz NEAR $L$base2_64_avx2
|
|
|
|
test rdx,63
|
|
jz NEAR $L$even_avx2
|
|
|
|
push rbx
|
|
|
|
push rbp
|
|
|
|
push r12
|
|
|
|
push r13
|
|
|
|
push r14
|
|
|
|
push r15
|
|
|
|
$L$blocks_avx2_body:
|
|
|
|
mov r15,rdx
|
|
|
|
mov r8,QWORD[rdi]
|
|
mov r9,QWORD[8+rdi]
|
|
mov ebp,DWORD[16+rdi]
|
|
|
|
mov r11,QWORD[24+rdi]
|
|
mov r13,QWORD[32+rdi]
|
|
|
|
|
|
mov r14d,r8d
|
|
and r8,-2147483648
|
|
mov r12,r9
|
|
mov ebx,r9d
|
|
and r9,-2147483648
|
|
|
|
shr r8,6
|
|
shl r12,52
|
|
add r14,r8
|
|
shr rbx,12
|
|
shr r9,18
|
|
add r14,r12
|
|
adc rbx,r9
|
|
|
|
mov r8,rbp
|
|
shl r8,40
|
|
shr rbp,24
|
|
add rbx,r8
|
|
adc rbp,0
|
|
|
|
mov r9,-4
|
|
mov r8,rbp
|
|
and r9,rbp
|
|
shr r8,2
|
|
and rbp,3
|
|
add r8,r9
|
|
add r14,r8
|
|
adc rbx,0
|
|
adc rbp,0
|
|
|
|
mov r12,r13
|
|
mov rax,r13
|
|
shr r13,2
|
|
add r13,r12
|
|
|
|
$L$base2_26_pre_avx2:
|
|
add r14,QWORD[rsi]
|
|
adc rbx,QWORD[8+rsi]
|
|
lea rsi,[16+rsi]
|
|
adc rbp,rcx
|
|
sub r15,16
|
|
|
|
call __poly1305_block
|
|
mov rax,r12
|
|
|
|
test r15,63
|
|
jnz NEAR $L$base2_26_pre_avx2
|
|
|
|
test rcx,rcx
|
|
jz NEAR $L$store_base2_64_avx2
|
|
|
|
|
|
mov rax,r14
|
|
mov rdx,r14
|
|
shr r14,52
|
|
mov r11,rbx
|
|
mov r12,rbx
|
|
shr rdx,26
|
|
and rax,0x3ffffff
|
|
shl r11,12
|
|
and rdx,0x3ffffff
|
|
shr rbx,14
|
|
or r14,r11
|
|
shl rbp,24
|
|
and r14,0x3ffffff
|
|
shr r12,40
|
|
and rbx,0x3ffffff
|
|
or rbp,r12
|
|
|
|
test r15,r15
|
|
jz NEAR $L$store_base2_26_avx2
|
|
|
|
vmovd xmm0,eax
|
|
vmovd xmm1,edx
|
|
vmovd xmm2,r14d
|
|
vmovd xmm3,ebx
|
|
vmovd xmm4,ebp
|
|
jmp NEAR $L$proceed_avx2
|
|
|
|
ALIGN 32
|
|
$L$store_base2_64_avx2:
|
|
mov QWORD[rdi],r14
|
|
mov QWORD[8+rdi],rbx
|
|
mov QWORD[16+rdi],rbp
|
|
jmp NEAR $L$done_avx2
|
|
|
|
ALIGN 16
|
|
$L$store_base2_26_avx2:
|
|
mov DWORD[rdi],eax
|
|
mov DWORD[4+rdi],edx
|
|
mov DWORD[8+rdi],r14d
|
|
mov DWORD[12+rdi],ebx
|
|
mov DWORD[16+rdi],ebp
|
|
ALIGN 16
|
|
$L$done_avx2:
|
|
mov r15,QWORD[rsp]
|
|
|
|
mov r14,QWORD[8+rsp]
|
|
|
|
mov r13,QWORD[16+rsp]
|
|
|
|
mov r12,QWORD[24+rsp]
|
|
|
|
mov rbp,QWORD[32+rsp]
|
|
|
|
mov rbx,QWORD[40+rsp]
|
|
|
|
lea rsp,[48+rsp]
|
|
|
|
$L$no_data_avx2:
|
|
$L$blocks_avx2_epilogue:
|
|
mov rdi,QWORD[8+rsp] ;WIN64 epilogue
|
|
mov rsi,QWORD[16+rsp]
|
|
DB 0F3h,0C3h ;repret
|
|
|
|
|
|
ALIGN 32
|
|
$L$base2_64_avx2:
|
|
|
|
push rbx
|
|
|
|
push rbp
|
|
|
|
push r12
|
|
|
|
push r13
|
|
|
|
push r14
|
|
|
|
push r15
|
|
|
|
$L$base2_64_avx2_body:
|
|
|
|
mov r15,rdx
|
|
|
|
mov r11,QWORD[24+rdi]
|
|
mov r13,QWORD[32+rdi]
|
|
|
|
mov r14,QWORD[rdi]
|
|
mov rbx,QWORD[8+rdi]
|
|
mov ebp,DWORD[16+rdi]
|
|
|
|
mov r12,r13
|
|
mov rax,r13
|
|
shr r13,2
|
|
add r13,r12
|
|
|
|
test rdx,63
|
|
jz NEAR $L$init_avx2
|
|
|
|
$L$base2_64_pre_avx2:
|
|
add r14,QWORD[rsi]
|
|
adc rbx,QWORD[8+rsi]
|
|
lea rsi,[16+rsi]
|
|
adc rbp,rcx
|
|
sub r15,16
|
|
|
|
call __poly1305_block
|
|
mov rax,r12
|
|
|
|
test r15,63
|
|
jnz NEAR $L$base2_64_pre_avx2
|
|
|
|
$L$init_avx2:
|
|
|
|
mov rax,r14
|
|
mov rdx,r14
|
|
shr r14,52
|
|
mov r8,rbx
|
|
mov r9,rbx
|
|
shr rdx,26
|
|
and rax,0x3ffffff
|
|
shl r8,12
|
|
and rdx,0x3ffffff
|
|
shr rbx,14
|
|
or r14,r8
|
|
shl rbp,24
|
|
and r14,0x3ffffff
|
|
shr r9,40
|
|
and rbx,0x3ffffff
|
|
or rbp,r9
|
|
|
|
vmovd xmm0,eax
|
|
vmovd xmm1,edx
|
|
vmovd xmm2,r14d
|
|
vmovd xmm3,ebx
|
|
vmovd xmm4,ebp
|
|
mov DWORD[20+rdi],1
|
|
|
|
call __poly1305_init_avx
|
|
|
|
$L$proceed_avx2:
|
|
mov rdx,r15
|
|
|
|
|
|
|
|
mov r15,QWORD[rsp]
|
|
|
|
mov r14,QWORD[8+rsp]
|
|
|
|
mov r13,QWORD[16+rsp]
|
|
|
|
mov r12,QWORD[24+rsp]
|
|
|
|
mov rbp,QWORD[32+rsp]
|
|
|
|
mov rbx,QWORD[40+rsp]
|
|
|
|
lea rax,[48+rsp]
|
|
lea rsp,[48+rsp]
|
|
|
|
$L$base2_64_avx2_epilogue:
|
|
jmp NEAR $L$do_avx2
|
|
|
|
|
|
ALIGN 32
|
|
$L$even_avx2:
|
|
|
|
|
|
vmovd xmm0,DWORD[rdi]
|
|
vmovd xmm1,DWORD[4+rdi]
|
|
vmovd xmm2,DWORD[8+rdi]
|
|
vmovd xmm3,DWORD[12+rdi]
|
|
vmovd xmm4,DWORD[16+rdi]
|
|
|
|
$L$do_avx2:
|
|
lea r11,[((-248))+rsp]
|
|
sub rsp,0x1c8
|
|
vmovdqa XMMWORD[80+r11],xmm6
|
|
vmovdqa XMMWORD[96+r11],xmm7
|
|
vmovdqa XMMWORD[112+r11],xmm8
|
|
vmovdqa XMMWORD[128+r11],xmm9
|
|
vmovdqa XMMWORD[144+r11],xmm10
|
|
vmovdqa XMMWORD[160+r11],xmm11
|
|
vmovdqa XMMWORD[176+r11],xmm12
|
|
vmovdqa XMMWORD[192+r11],xmm13
|
|
vmovdqa XMMWORD[208+r11],xmm14
|
|
vmovdqa XMMWORD[224+r11],xmm15
|
|
$L$do_avx2_body:
|
|
lea rcx,[$L$const]
|
|
lea rdi,[((48+64))+rdi]
|
|
vmovdqa ymm7,YMMWORD[96+rcx]
|
|
|
|
|
|
vmovdqu xmm9,XMMWORD[((-64))+rdi]
|
|
and rsp,-512
|
|
vmovdqu xmm10,XMMWORD[((-48))+rdi]
|
|
vmovdqu xmm6,XMMWORD[((-32))+rdi]
|
|
vmovdqu xmm11,XMMWORD[((-16))+rdi]
|
|
vmovdqu xmm12,XMMWORD[rdi]
|
|
vmovdqu xmm13,XMMWORD[16+rdi]
|
|
lea rax,[144+rsp]
|
|
vmovdqu xmm14,XMMWORD[32+rdi]
|
|
vpermd ymm9,ymm7,ymm9
|
|
vmovdqu xmm15,XMMWORD[48+rdi]
|
|
vpermd ymm10,ymm7,ymm10
|
|
vmovdqu xmm5,XMMWORD[64+rdi]
|
|
vpermd ymm6,ymm7,ymm6
|
|
vmovdqa YMMWORD[rsp],ymm9
|
|
vpermd ymm11,ymm7,ymm11
|
|
vmovdqa YMMWORD[(32-144)+rax],ymm10
|
|
vpermd ymm12,ymm7,ymm12
|
|
vmovdqa YMMWORD[(64-144)+rax],ymm6
|
|
vpermd ymm13,ymm7,ymm13
|
|
vmovdqa YMMWORD[(96-144)+rax],ymm11
|
|
vpermd ymm14,ymm7,ymm14
|
|
vmovdqa YMMWORD[(128-144)+rax],ymm12
|
|
vpermd ymm15,ymm7,ymm15
|
|
vmovdqa YMMWORD[(160-144)+rax],ymm13
|
|
vpermd ymm5,ymm7,ymm5
|
|
vmovdqa YMMWORD[(192-144)+rax],ymm14
|
|
vmovdqa YMMWORD[(224-144)+rax],ymm15
|
|
vmovdqa YMMWORD[(256-144)+rax],ymm5
|
|
vmovdqa ymm5,YMMWORD[64+rcx]
|
|
|
|
|
|
|
|
vmovdqu xmm7,XMMWORD[rsi]
|
|
vmovdqu xmm8,XMMWORD[16+rsi]
|
|
vinserti128 ymm7,ymm7,XMMWORD[32+rsi],1
|
|
vinserti128 ymm8,ymm8,XMMWORD[48+rsi],1
|
|
lea rsi,[64+rsi]
|
|
|
|
vpsrldq ymm9,ymm7,6
|
|
vpsrldq ymm10,ymm8,6
|
|
vpunpckhqdq ymm6,ymm7,ymm8
|
|
vpunpcklqdq ymm9,ymm9,ymm10
|
|
vpunpcklqdq ymm7,ymm7,ymm8
|
|
|
|
vpsrlq ymm10,ymm9,30
|
|
vpsrlq ymm9,ymm9,4
|
|
vpsrlq ymm8,ymm7,26
|
|
vpsrlq ymm6,ymm6,40
|
|
vpand ymm9,ymm9,ymm5
|
|
vpand ymm7,ymm7,ymm5
|
|
vpand ymm8,ymm8,ymm5
|
|
vpand ymm10,ymm10,ymm5
|
|
vpor ymm6,ymm6,YMMWORD[32+rcx]
|
|
|
|
vpaddq ymm2,ymm9,ymm2
|
|
sub rdx,64
|
|
jz NEAR $L$tail_avx2
|
|
jmp NEAR $L$oop_avx2
|
|
|
|
ALIGN 32
|
|
$L$oop_avx2:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vpaddq ymm0,ymm7,ymm0
|
|
vmovdqa ymm7,YMMWORD[rsp]
|
|
vpaddq ymm1,ymm8,ymm1
|
|
vmovdqa ymm8,YMMWORD[32+rsp]
|
|
vpaddq ymm3,ymm10,ymm3
|
|
vmovdqa ymm9,YMMWORD[96+rsp]
|
|
vpaddq ymm4,ymm6,ymm4
|
|
vmovdqa ymm10,YMMWORD[48+rax]
|
|
vmovdqa ymm5,YMMWORD[112+rax]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vpmuludq ymm13,ymm7,ymm2
|
|
vpmuludq ymm14,ymm8,ymm2
|
|
vpmuludq ymm15,ymm9,ymm2
|
|
vpmuludq ymm11,ymm10,ymm2
|
|
vpmuludq ymm12,ymm5,ymm2
|
|
|
|
vpmuludq ymm6,ymm8,ymm0
|
|
vpmuludq ymm2,ymm8,ymm1
|
|
vpaddq ymm12,ymm12,ymm6
|
|
vpaddq ymm13,ymm13,ymm2
|
|
vpmuludq ymm6,ymm8,ymm3
|
|
vpmuludq ymm2,ymm4,YMMWORD[64+rsp]
|
|
vpaddq ymm15,ymm15,ymm6
|
|
vpaddq ymm11,ymm11,ymm2
|
|
vmovdqa ymm8,YMMWORD[((-16))+rax]
|
|
|
|
vpmuludq ymm6,ymm7,ymm0
|
|
vpmuludq ymm2,ymm7,ymm1
|
|
vpaddq ymm11,ymm11,ymm6
|
|
vpaddq ymm12,ymm12,ymm2
|
|
vpmuludq ymm6,ymm7,ymm3
|
|
vpmuludq ymm2,ymm7,ymm4
|
|
vmovdqu xmm7,XMMWORD[rsi]
|
|
vpaddq ymm14,ymm14,ymm6
|
|
vpaddq ymm15,ymm15,ymm2
|
|
vinserti128 ymm7,ymm7,XMMWORD[32+rsi],1
|
|
|
|
vpmuludq ymm6,ymm8,ymm3
|
|
vpmuludq ymm2,ymm8,ymm4
|
|
vmovdqu xmm8,XMMWORD[16+rsi]
|
|
vpaddq ymm11,ymm11,ymm6
|
|
vpaddq ymm12,ymm12,ymm2
|
|
vmovdqa ymm2,YMMWORD[16+rax]
|
|
vpmuludq ymm6,ymm9,ymm1
|
|
vpmuludq ymm9,ymm9,ymm0
|
|
vpaddq ymm14,ymm14,ymm6
|
|
vpaddq ymm13,ymm13,ymm9
|
|
vinserti128 ymm8,ymm8,XMMWORD[48+rsi],1
|
|
lea rsi,[64+rsi]
|
|
|
|
vpmuludq ymm6,ymm2,ymm1
|
|
vpmuludq ymm2,ymm2,ymm0
|
|
vpsrldq ymm9,ymm7,6
|
|
vpaddq ymm15,ymm15,ymm6
|
|
vpaddq ymm14,ymm14,ymm2
|
|
vpmuludq ymm6,ymm10,ymm3
|
|
vpmuludq ymm2,ymm10,ymm4
|
|
vpsrldq ymm10,ymm8,6
|
|
vpaddq ymm12,ymm12,ymm6
|
|
vpaddq ymm13,ymm13,ymm2
|
|
vpunpckhqdq ymm6,ymm7,ymm8
|
|
|
|
vpmuludq ymm3,ymm5,ymm3
|
|
vpmuludq ymm4,ymm5,ymm4
|
|
vpunpcklqdq ymm7,ymm7,ymm8
|
|
vpaddq ymm2,ymm13,ymm3
|
|
vpaddq ymm3,ymm14,ymm4
|
|
vpunpcklqdq ymm10,ymm9,ymm10
|
|
vpmuludq ymm4,ymm0,YMMWORD[80+rax]
|
|
vpmuludq ymm0,ymm5,ymm1
|
|
vmovdqa ymm5,YMMWORD[64+rcx]
|
|
vpaddq ymm4,ymm15,ymm4
|
|
vpaddq ymm0,ymm11,ymm0
|
|
|
|
|
|
|
|
|
|
vpsrlq ymm14,ymm3,26
|
|
vpand ymm3,ymm3,ymm5
|
|
vpaddq ymm4,ymm4,ymm14
|
|
|
|
vpsrlq ymm11,ymm0,26
|
|
vpand ymm0,ymm0,ymm5
|
|
vpaddq ymm1,ymm12,ymm11
|
|
|
|
vpsrlq ymm15,ymm4,26
|
|
vpand ymm4,ymm4,ymm5
|
|
|
|
vpsrlq ymm9,ymm10,4
|
|
|
|
vpsrlq ymm12,ymm1,26
|
|
vpand ymm1,ymm1,ymm5
|
|
vpaddq ymm2,ymm2,ymm12
|
|
|
|
vpaddq ymm0,ymm0,ymm15
|
|
vpsllq ymm15,ymm15,2
|
|
vpaddq ymm0,ymm0,ymm15
|
|
|
|
vpand ymm9,ymm9,ymm5
|
|
vpsrlq ymm8,ymm7,26
|
|
|
|
vpsrlq ymm13,ymm2,26
|
|
vpand ymm2,ymm2,ymm5
|
|
vpaddq ymm3,ymm3,ymm13
|
|
|
|
vpaddq ymm2,ymm2,ymm9
|
|
vpsrlq ymm10,ymm10,30
|
|
|
|
vpsrlq ymm11,ymm0,26
|
|
vpand ymm0,ymm0,ymm5
|
|
vpaddq ymm1,ymm1,ymm11
|
|
|
|
vpsrlq ymm6,ymm6,40
|
|
|
|
vpsrlq ymm14,ymm3,26
|
|
vpand ymm3,ymm3,ymm5
|
|
vpaddq ymm4,ymm4,ymm14
|
|
|
|
vpand ymm7,ymm7,ymm5
|
|
vpand ymm8,ymm8,ymm5
|
|
vpand ymm10,ymm10,ymm5
|
|
vpor ymm6,ymm6,YMMWORD[32+rcx]
|
|
|
|
sub rdx,64
|
|
jnz NEAR $L$oop_avx2
|
|
|
|
DB 0x66,0x90
|
|
$L$tail_avx2:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vpaddq ymm0,ymm7,ymm0
|
|
vmovdqu ymm7,YMMWORD[4+rsp]
|
|
vpaddq ymm1,ymm8,ymm1
|
|
vmovdqu ymm8,YMMWORD[36+rsp]
|
|
vpaddq ymm3,ymm10,ymm3
|
|
vmovdqu ymm9,YMMWORD[100+rsp]
|
|
vpaddq ymm4,ymm6,ymm4
|
|
vmovdqu ymm10,YMMWORD[52+rax]
|
|
vmovdqu ymm5,YMMWORD[116+rax]
|
|
|
|
vpmuludq ymm13,ymm7,ymm2
|
|
vpmuludq ymm14,ymm8,ymm2
|
|
vpmuludq ymm15,ymm9,ymm2
|
|
vpmuludq ymm11,ymm10,ymm2
|
|
vpmuludq ymm12,ymm5,ymm2
|
|
|
|
vpmuludq ymm6,ymm8,ymm0
|
|
vpmuludq ymm2,ymm8,ymm1
|
|
vpaddq ymm12,ymm12,ymm6
|
|
vpaddq ymm13,ymm13,ymm2
|
|
vpmuludq ymm6,ymm8,ymm3
|
|
vpmuludq ymm2,ymm4,YMMWORD[68+rsp]
|
|
vpaddq ymm15,ymm15,ymm6
|
|
vpaddq ymm11,ymm11,ymm2
|
|
|
|
vpmuludq ymm6,ymm7,ymm0
|
|
vpmuludq ymm2,ymm7,ymm1
|
|
vpaddq ymm11,ymm11,ymm6
|
|
vmovdqu ymm8,YMMWORD[((-12))+rax]
|
|
vpaddq ymm12,ymm12,ymm2
|
|
vpmuludq ymm6,ymm7,ymm3
|
|
vpmuludq ymm2,ymm7,ymm4
|
|
vpaddq ymm14,ymm14,ymm6
|
|
vpaddq ymm15,ymm15,ymm2
|
|
|
|
vpmuludq ymm6,ymm8,ymm3
|
|
vpmuludq ymm2,ymm8,ymm4
|
|
vpaddq ymm11,ymm11,ymm6
|
|
vpaddq ymm12,ymm12,ymm2
|
|
vmovdqu ymm2,YMMWORD[20+rax]
|
|
vpmuludq ymm6,ymm9,ymm1
|
|
vpmuludq ymm9,ymm9,ymm0
|
|
vpaddq ymm14,ymm14,ymm6
|
|
vpaddq ymm13,ymm13,ymm9
|
|
|
|
vpmuludq ymm6,ymm2,ymm1
|
|
vpmuludq ymm2,ymm2,ymm0
|
|
vpaddq ymm15,ymm15,ymm6
|
|
vpaddq ymm14,ymm14,ymm2
|
|
vpmuludq ymm6,ymm10,ymm3
|
|
vpmuludq ymm2,ymm10,ymm4
|
|
vpaddq ymm12,ymm12,ymm6
|
|
vpaddq ymm13,ymm13,ymm2
|
|
|
|
vpmuludq ymm3,ymm5,ymm3
|
|
vpmuludq ymm4,ymm5,ymm4
|
|
vpaddq ymm2,ymm13,ymm3
|
|
vpaddq ymm3,ymm14,ymm4
|
|
vpmuludq ymm4,ymm0,YMMWORD[84+rax]
|
|
vpmuludq ymm0,ymm5,ymm1
|
|
vmovdqa ymm5,YMMWORD[64+rcx]
|
|
vpaddq ymm4,ymm15,ymm4
|
|
vpaddq ymm0,ymm11,ymm0
|
|
|
|
|
|
|
|
|
|
vpsrldq ymm8,ymm12,8
|
|
vpsrldq ymm9,ymm2,8
|
|
vpsrldq ymm10,ymm3,8
|
|
vpsrldq ymm6,ymm4,8
|
|
vpsrldq ymm7,ymm0,8
|
|
vpaddq ymm12,ymm12,ymm8
|
|
vpaddq ymm2,ymm2,ymm9
|
|
vpaddq ymm3,ymm3,ymm10
|
|
vpaddq ymm4,ymm4,ymm6
|
|
vpaddq ymm0,ymm0,ymm7
|
|
|
|
vpermq ymm10,ymm3,0x2
|
|
vpermq ymm6,ymm4,0x2
|
|
vpermq ymm7,ymm0,0x2
|
|
vpermq ymm8,ymm12,0x2
|
|
vpermq ymm9,ymm2,0x2
|
|
vpaddq ymm3,ymm3,ymm10
|
|
vpaddq ymm4,ymm4,ymm6
|
|
vpaddq ymm0,ymm0,ymm7
|
|
vpaddq ymm12,ymm12,ymm8
|
|
vpaddq ymm2,ymm2,ymm9
|
|
|
|
|
|
|
|
|
|
vpsrlq ymm14,ymm3,26
|
|
vpand ymm3,ymm3,ymm5
|
|
vpaddq ymm4,ymm4,ymm14
|
|
|
|
vpsrlq ymm11,ymm0,26
|
|
vpand ymm0,ymm0,ymm5
|
|
vpaddq ymm1,ymm12,ymm11
|
|
|
|
vpsrlq ymm15,ymm4,26
|
|
vpand ymm4,ymm4,ymm5
|
|
|
|
vpsrlq ymm12,ymm1,26
|
|
vpand ymm1,ymm1,ymm5
|
|
vpaddq ymm2,ymm2,ymm12
|
|
|
|
vpaddq ymm0,ymm0,ymm15
|
|
vpsllq ymm15,ymm15,2
|
|
vpaddq ymm0,ymm0,ymm15
|
|
|
|
vpsrlq ymm13,ymm2,26
|
|
vpand ymm2,ymm2,ymm5
|
|
vpaddq ymm3,ymm3,ymm13
|
|
|
|
vpsrlq ymm11,ymm0,26
|
|
vpand ymm0,ymm0,ymm5
|
|
vpaddq ymm1,ymm1,ymm11
|
|
|
|
vpsrlq ymm14,ymm3,26
|
|
vpand ymm3,ymm3,ymm5
|
|
vpaddq ymm4,ymm4,ymm14
|
|
|
|
vmovd DWORD[(-112)+rdi],xmm0
|
|
vmovd DWORD[(-108)+rdi],xmm1
|
|
vmovd DWORD[(-104)+rdi],xmm2
|
|
vmovd DWORD[(-100)+rdi],xmm3
|
|
vmovd DWORD[(-96)+rdi],xmm4
|
|
vmovdqa xmm6,XMMWORD[80+r11]
|
|
vmovdqa xmm7,XMMWORD[96+r11]
|
|
vmovdqa xmm8,XMMWORD[112+r11]
|
|
vmovdqa xmm9,XMMWORD[128+r11]
|
|
vmovdqa xmm10,XMMWORD[144+r11]
|
|
vmovdqa xmm11,XMMWORD[160+r11]
|
|
vmovdqa xmm12,XMMWORD[176+r11]
|
|
vmovdqa xmm13,XMMWORD[192+r11]
|
|
vmovdqa xmm14,XMMWORD[208+r11]
|
|
vmovdqa xmm15,XMMWORD[224+r11]
|
|
lea rsp,[248+r11]
|
|
$L$do_avx2_epilogue:
|
|
vzeroupper
|
|
mov rdi,QWORD[8+rsp] ;WIN64 epilogue
|
|
mov rsi,QWORD[16+rsp]
|
|
DB 0F3h,0C3h ;repret
|
|
|
|
$L$SEH_end_poly1305_blocks_avx2:
|
|
|
|
ALIGN 32
|
|
; poly1305_blocks_avx512 - entry point and dispatch (Win64 ABI).
; Incoming Win64 arguments rcx, rdx, r8, r9 are moved to rdi, rsi, rdx, rcx:
;   rdi = state block, rsi = input pointer, rdx = length in bytes,
;   rcx = value added to the top accumulator word for every block
;         (NOTE(review): presumably the Poly1305 pad bit - confirm).
; Dispatch, using the flag word at [20+rdi]:
;   length < 128 and flag == 0      -> $L$blocks (defined elsewhere)
;   length rounds down to 0         -> return
;   flag == 0                       -> $L$base2_64_avx2_512
;   flag != 0, length % 64 == 0     -> $L$even_avx2_512
;   otherwise                       -> falls through to the scalar fix-up
poly1305_blocks_avx512:
|
|
mov QWORD[8+rsp],rdi ;WIN64 prologue
|
|
mov QWORD[16+rsp],rsi
|
|
mov rax,rsp  ; rax = stack pointer at entry
|
|
$L$SEH_begin_poly1305_blocks_avx512:
|
|
mov rdi,rcx  ; remap Win64 argument registers
|
|
mov rsi,rdx
|
|
mov rdx,r8
|
|
mov rcx,r9
|
|
|
|
|
|
|
|
|
|
mov r8d,DWORD[20+rdi]  ; r8d = flag word (set to 1 once the state is in 26-bit limbs)
|
|
cmp rdx,128
|
|
jae NEAR $L$blocks_avx2_512  ; unsigned: 128 bytes or more always take this path
|
|
test r8d,r8d
|
|
jz NEAR $L$blocks  ; short input and flag clear: use $L$blocks instead
|
|
|
|
$L$blocks_avx2_512:
|
|
and rdx,-16  ; round the length down to whole 16-byte blocks
|
|
jz NEAR $L$no_data_avx2_512  ; nothing to process
|
|
|
|
vzeroupper
|
|
|
|
test r8d,r8d
|
|
jz NEAR $L$base2_64_avx2_512  ; flag clear: state is still in 64-bit words
|
|
|
|
test rdx,63
|
|
jz NEAR $L$even_avx2_512  ; 64-byte granular: no scalar fix-up needed
|
|
|
|
push rbx
|
|
|
|
push rbp
|
|
|
|
push r12
|
|
|
|
push r13
|
|
|
|
push r14
|
|
|
|
push r15
|
|
|
|
; Reached when the state is stored as five 32-bit limbs but the length is
; not a multiple of 64. Rebuilds the accumulator as r14 (low) : rbx : rbp
; (high) from the limbs at [rdi] .. [16+rdi], where limb k carries weight
; 2^(26*k), then folds everything above bit 130 back into the low word
; (2^130 is congruent to 5 modulo 2^130 - 5).
$L$blocks_avx2_body_512:
|
|
|
|
|
|
mov r15,rdx  ; r15 = remaining length
|
|
|
|
mov r8,QWORD[rdi]  ; r8  = limb0 | limb1 << 32
|
|
mov r9,QWORD[8+rdi]  ; r9  = limb2 | limb3 << 32
|
|
mov ebp,DWORD[16+rdi]  ; rbp = limb4
|
|
|
|
mov r11,QWORD[24+rdi]  ; key words stored by poly1305_init_x86_64
|
|
mov r13,QWORD[32+rdi]
|
|
|
|
|
|
|
|
mov r14d,r8d  ; r14 = limb0, zero-extended
|
|
and r8,-2147483648  ; keep bits 31..63 (mask 0xFFFFFFFF80000000)
|
|
mov r12,r9
|
|
mov ebx,r9d  ; rbx = limb2
|
|
and r9,-2147483648  ; keep bits 31..63 of the limb2/limb3 pair
|
|
|
|
shr r8,6  ; limb1: bit 32 -> bit 26
|
|
shl r12,52  ; low 12 bits of limb2 -> bits 52..63 of the low word
|
|
add r14,r8
|
|
shr rbx,12  ; remaining bits of limb2 start the middle word
|
|
shr r9,18  ; limb3: bit 32 -> bit 14 of the middle word
|
|
add r14,r12  ; sets CF for the adc that follows immediately
|
|
adc rbx,r9
|
|
|
|
mov r8,rbp
|
|
shl r8,40  ; low 24 bits of limb4 -> bits 40..63 of the middle word
|
|
shr rbp,24  ; rest of limb4 forms the high word
|
|
add rbx,r8
|
|
adc rbp,0
|
|
|
|
mov r9,-4
|
|
mov r8,rbp
|
|
and r9,rbp  ; r9 = high word with its low 2 bits cleared = 4 * (high >> 2)
|
|
shr r8,2  ; r8 = high >> 2
|
|
and rbp,3  ; keep only bits 128..129 in the high word
|
|
add r8,r9  ; r8 = 5 * (high >> 2)
|
|
add r14,r8  ; fold it into the low word and propagate the carry
|
|
adc rbx,0
|
|
adc rbp,0
|
|
|
|
mov r12,r13  ; r12 = unmodified copy of the +32 key word
|
|
mov rax,r13
|
|
shr r13,2
|
|
add r13,r12  ; r13 = r1 + (r1 >> 2)
|
|
|
|
$L$base2_26_pre_avx2_512:
|
|
add r14,QWORD[rsi]
|
|
adc rbx,QWORD[8+rsi]
|
|
lea rsi,[16+rsi]
|
|
adc rbp,rcx
|
|
sub r15,16
|
|
|
|
call __poly1305_block
|
|
mov rax,r12
|
|
|
|
test r15,63
|
|
jnz NEAR $L$base2_26_pre_avx2_512
|
|
|
|
test rcx,rcx
|
|
jz NEAR $L$store_base2_64_avx2_512
|
|
|
|
|
|
mov rax,r14
|
|
mov rdx,r14
|
|
shr r14,52
|
|
mov r11,rbx
|
|
mov r12,rbx
|
|
shr rdx,26
|
|
and rax,0x3ffffff
|
|
shl r11,12
|
|
and rdx,0x3ffffff
|
|
shr rbx,14
|
|
or r14,r11
|
|
shl rbp,24
|
|
and r14,0x3ffffff
|
|
shr r12,40
|
|
and rbx,0x3ffffff
|
|
or rbp,r12
|
|
|
|
test r15,r15
|
|
jz NEAR $L$store_base2_26_avx2_512
|
|
|
|
vmovd xmm0,eax
|
|
vmovd xmm1,edx
|
|
vmovd xmm2,r14d
|
|
vmovd xmm3,ebx
|
|
vmovd xmm4,ebp
|
|
jmp NEAR $L$proceed_avx2_512
|
|
|
|
ALIGN 32
|
|
; Taken when rcx is zero after the scalar loop (see the `test rcx,rcx`
; before this label): writes the accumulator back as three 64-bit words
; at [rdi], [8+rdi], [16+rdi], then goes to the common register restore.
; NOTE(review): the flag word at [20+rdi] is not updated on this path -
; confirm that the caller accounts for the representation change.
$L$store_base2_64_avx2_512:
|
|
mov QWORD[rdi],r14  ; low word
|
|
mov QWORD[8+rdi],rbx  ; middle word
|
|
mov QWORD[16+rdi],rbp  ; high word
|
|
jmp NEAR $L$done_avx2_512
|
|
|
|
ALIGN 16
|
|
; Taken when no input remains after the scalar loop (r15 == 0): writes the
; five limbs just computed back to the 32-bit slots at [rdi] .. [16+rdi],
; then falls through to $L$done_avx2_512.
$L$store_base2_26_avx2_512:
|
|
mov DWORD[rdi],eax  ; limb 0
|
|
mov DWORD[4+rdi],edx  ; limb 1
|
|
mov DWORD[8+rdi],r14d  ; limb 2
|
|
mov DWORD[12+rdi],ebx  ; limb 3
|
|
mov DWORD[16+rdi],ebp  ; limb 4
|
|
ALIGN 16
|
|
$L$done_avx2_512:
|
|
mov r15,QWORD[rsp]
|
|
|
|
mov r14,QWORD[8+rsp]
|
|
|
|
mov r13,QWORD[16+rsp]
|
|
|
|
mov r12,QWORD[24+rsp]
|
|
|
|
mov rbp,QWORD[32+rsp]
|
|
|
|
mov rbx,QWORD[40+rsp]
|
|
|
|
lea rsp,[48+rsp]
|
|
|
|
$L$no_data_avx2_512:
|
|
$L$blocks_avx2_epilogue_512:
|
|
mov rdi,QWORD[8+rsp] ;WIN64 epilogue
|
|
mov rsi,QWORD[16+rsp]
|
|
DB 0F3h,0C3h ;repret
|
|
|
|
|
|
ALIGN 32
|
|
$L$base2_64_avx2_512:
|
|
|
|
push rbx
|
|
|
|
push rbp
|
|
|
|
push r12
|
|
|
|
push r13
|
|
|
|
push r14
|
|
|
|
push r15
|
|
|
|
$L$base2_64_avx2_body_512:
|
|
|
|
mov r15,rdx
|
|
|
|
mov r11,QWORD[24+rdi]
|
|
mov r13,QWORD[32+rdi]
|
|
|
|
mov r14,QWORD[rdi]
|
|
mov rbx,QWORD[8+rdi]
|
|
mov ebp,DWORD[16+rdi]
|
|
|
|
mov r12,r13
|
|
mov rax,r13
|
|
shr r13,2
|
|
add r13,r12
|
|
|
|
test rdx,63
|
|
jz NEAR $L$init_avx2_512
|
|
|
|
$L$base2_64_pre_avx2_512:
|
|
add r14,QWORD[rsi]
|
|
adc rbx,QWORD[8+rsi]
|
|
lea rsi,[16+rsi]
|
|
adc rbp,rcx
|
|
sub r15,16
|
|
|
|
call __poly1305_block
|
|
mov rax,r12
|
|
|
|
test r15,63
|
|
jnz NEAR $L$base2_64_pre_avx2_512
|
|
|
|
$L$init_avx2_512:
|
|
|
|
mov rax,r14
|
|
mov rdx,r14
|
|
shr r14,52
|
|
mov r8,rbx
|
|
mov r9,rbx
|
|
shr rdx,26
|
|
and rax,0x3ffffff
|
|
shl r8,12
|
|
and rdx,0x3ffffff
|
|
shr rbx,14
|
|
or r14,r8
|
|
shl rbp,24
|
|
and r14,0x3ffffff
|
|
shr r9,40
|
|
and rbx,0x3ffffff
|
|
or rbp,r9
|
|
|
|
vmovd xmm0,eax
|
|
vmovd xmm1,edx
|
|
vmovd xmm2,r14d
|
|
vmovd xmm3,ebx
|
|
vmovd xmm4,ebp
|
|
mov DWORD[20+rdi],1
|
|
|
|
call __poly1305_init_avx
|
|
|
|
$L$proceed_avx2_512:
|
|
mov rdx,r15
|
|
|
|
|
|
|
|
mov r15,QWORD[rsp]
|
|
|
|
mov r14,QWORD[8+rsp]
|
|
|
|
mov r13,QWORD[16+rsp]
|
|
|
|
mov r12,QWORD[24+rsp]
|
|
|
|
mov rbp,QWORD[32+rsp]
|
|
|
|
mov rbx,QWORD[40+rsp]
|
|
|
|
lea rax,[48+rsp]
|
|
lea rsp,[48+rsp]
|
|
|
|
$L$base2_64_avx2_epilogue_512:
|
|
jmp NEAR $L$do_avx2_512
|
|
|
|
|
|
ALIGN 32
|
|
$L$even_avx2_512:
|
|
|
|
|
|
vmovd xmm0,DWORD[rdi]
|
|
vmovd xmm1,DWORD[4+rdi]
|
|
vmovd xmm2,DWORD[8+rdi]
|
|
vmovd xmm3,DWORD[12+rdi]
|
|
vmovd xmm4,DWORD[16+rdi]
|
|
|
|
$L$do_avx2_512:
|
|
cmp rdx,512
|
|
jae NEAR $L$blocks_avx512
|
|
$L$skip_avx512:
|
|
lea r11,[((-248))+rsp]
|
|
sub rsp,0x1c8
|
|
vmovdqa XMMWORD[80+r11],xmm6
|
|
vmovdqa XMMWORD[96+r11],xmm7
|
|
vmovdqa XMMWORD[112+r11],xmm8
|
|
vmovdqa XMMWORD[128+r11],xmm9
|
|
vmovdqa XMMWORD[144+r11],xmm10
|
|
vmovdqa XMMWORD[160+r11],xmm11
|
|
vmovdqa XMMWORD[176+r11],xmm12
|
|
vmovdqa XMMWORD[192+r11],xmm13
|
|
vmovdqa XMMWORD[208+r11],xmm14
|
|
vmovdqa XMMWORD[224+r11],xmm15
|
|
$L$do_avx2_body_512:
|
|
lea rcx,[$L$const]
|
|
lea rdi,[((48+64))+rdi]
|
|
vmovdqa ymm7,YMMWORD[96+rcx]
|
|
|
|
|
|
vmovdqu xmm9,XMMWORD[((-64))+rdi]
|
|
and rsp,-512
|
|
vmovdqu xmm10,XMMWORD[((-48))+rdi]
|
|
vmovdqu xmm6,XMMWORD[((-32))+rdi]
|
|
vmovdqu xmm11,XMMWORD[((-16))+rdi]
|
|
vmovdqu xmm12,XMMWORD[rdi]
|
|
vmovdqu xmm13,XMMWORD[16+rdi]
|
|
lea rax,[144+rsp]
|
|
vmovdqu xmm14,XMMWORD[32+rdi]
|
|
vpermd ymm9,ymm7,ymm9
|
|
vmovdqu xmm15,XMMWORD[48+rdi]
|
|
vpermd ymm10,ymm7,ymm10
|
|
vmovdqu xmm5,XMMWORD[64+rdi]
|
|
vpermd ymm6,ymm7,ymm6
|
|
vmovdqa YMMWORD[rsp],ymm9
|
|
vpermd ymm11,ymm7,ymm11
|
|
vmovdqa YMMWORD[(32-144)+rax],ymm10
|
|
vpermd ymm12,ymm7,ymm12
|
|
vmovdqa YMMWORD[(64-144)+rax],ymm6
|
|
vpermd ymm13,ymm7,ymm13
|
|
vmovdqa YMMWORD[(96-144)+rax],ymm11
|
|
vpermd ymm14,ymm7,ymm14
|
|
vmovdqa YMMWORD[(128-144)+rax],ymm12
|
|
vpermd ymm15,ymm7,ymm15
|
|
vmovdqa YMMWORD[(160-144)+rax],ymm13
|
|
vpermd ymm5,ymm7,ymm5
|
|
vmovdqa YMMWORD[(192-144)+rax],ymm14
|
|
vmovdqa YMMWORD[(224-144)+rax],ymm15
|
|
vmovdqa YMMWORD[(256-144)+rax],ymm5
|
|
vmovdqa ymm5,YMMWORD[64+rcx]
|
|
|
|
|
|
|
|
vmovdqu xmm7,XMMWORD[rsi]
|
|
vmovdqu xmm8,XMMWORD[16+rsi]
|
|
vinserti128 ymm7,ymm7,XMMWORD[32+rsi],1
|
|
vinserti128 ymm8,ymm8,XMMWORD[48+rsi],1
|
|
lea rsi,[64+rsi]
|
|
|
|
vpsrldq ymm9,ymm7,6
|
|
vpsrldq ymm10,ymm8,6
|
|
vpunpckhqdq ymm6,ymm7,ymm8
|
|
vpunpcklqdq ymm9,ymm9,ymm10
|
|
vpunpcklqdq ymm7,ymm7,ymm8
|
|
|
|
vpsrlq ymm10,ymm9,30
|
|
vpsrlq ymm9,ymm9,4
|
|
vpsrlq ymm8,ymm7,26
|
|
vpsrlq ymm6,ymm6,40
|
|
vpand ymm9,ymm9,ymm5
|
|
vpand ymm7,ymm7,ymm5
|
|
vpand ymm8,ymm8,ymm5
|
|
vpand ymm10,ymm10,ymm5
|
|
vpor ymm6,ymm6,YMMWORD[32+rcx]
|
|
|
|
vpaddq ymm2,ymm9,ymm2
|
|
sub rdx,64
|
|
jz NEAR $L$tail_avx2_512
|
|
jmp NEAR $L$oop_avx2_512
|
|
|
|
ALIGN 32
|
|
$L$oop_avx2_512:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vpaddq ymm0,ymm7,ymm0
|
|
vmovdqa ymm7,YMMWORD[rsp]
|
|
vpaddq ymm1,ymm8,ymm1
|
|
vmovdqa ymm8,YMMWORD[32+rsp]
|
|
vpaddq ymm3,ymm10,ymm3
|
|
vmovdqa ymm9,YMMWORD[96+rsp]
|
|
vpaddq ymm4,ymm6,ymm4
|
|
vmovdqa ymm10,YMMWORD[48+rax]
|
|
vmovdqa ymm5,YMMWORD[112+rax]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vpmuludq ymm13,ymm7,ymm2
|
|
vpmuludq ymm14,ymm8,ymm2
|
|
vpmuludq ymm15,ymm9,ymm2
|
|
vpmuludq ymm11,ymm10,ymm2
|
|
vpmuludq ymm12,ymm5,ymm2
|
|
|
|
vpmuludq ymm6,ymm8,ymm0
|
|
vpmuludq ymm2,ymm8,ymm1
|
|
vpaddq ymm12,ymm12,ymm6
|
|
vpaddq ymm13,ymm13,ymm2
|
|
vpmuludq ymm6,ymm8,ymm3
|
|
vpmuludq ymm2,ymm4,YMMWORD[64+rsp]
|
|
vpaddq ymm15,ymm15,ymm6
|
|
vpaddq ymm11,ymm11,ymm2
|
|
vmovdqa ymm8,YMMWORD[((-16))+rax]
|
|
|
|
vpmuludq ymm6,ymm7,ymm0
|
|
vpmuludq ymm2,ymm7,ymm1
|
|
vpaddq ymm11,ymm11,ymm6
|
|
vpaddq ymm12,ymm12,ymm2
|
|
vpmuludq ymm6,ymm7,ymm3
|
|
vpmuludq ymm2,ymm7,ymm4
|
|
vmovdqu xmm7,XMMWORD[rsi]
|
|
vpaddq ymm14,ymm14,ymm6
|
|
vpaddq ymm15,ymm15,ymm2
|
|
vinserti128 ymm7,ymm7,XMMWORD[32+rsi],1
|
|
|
|
vpmuludq ymm6,ymm8,ymm3
|
|
vpmuludq ymm2,ymm8,ymm4
|
|
vmovdqu xmm8,XMMWORD[16+rsi]
|
|
vpaddq ymm11,ymm11,ymm6
|
|
vpaddq ymm12,ymm12,ymm2
|
|
vmovdqa ymm2,YMMWORD[16+rax]
|
|
vpmuludq ymm6,ymm9,ymm1
|
|
vpmuludq ymm9,ymm9,ymm0
|
|
vpaddq ymm14,ymm14,ymm6
|
|
vpaddq ymm13,ymm13,ymm9
|
|
vinserti128 ymm8,ymm8,XMMWORD[48+rsi],1
|
|
lea rsi,[64+rsi]
|
|
|
|
vpmuludq ymm6,ymm2,ymm1
|
|
vpmuludq ymm2,ymm2,ymm0
|
|
vpsrldq ymm9,ymm7,6
|
|
vpaddq ymm15,ymm15,ymm6
|
|
vpaddq ymm14,ymm14,ymm2
|
|
vpmuludq ymm6,ymm10,ymm3
|
|
vpmuludq ymm2,ymm10,ymm4
|
|
vpsrldq ymm10,ymm8,6
|
|
vpaddq ymm12,ymm12,ymm6
|
|
vpaddq ymm13,ymm13,ymm2
|
|
vpunpckhqdq ymm6,ymm7,ymm8
|
|
|
|
vpmuludq ymm3,ymm5,ymm3
|
|
vpmuludq ymm4,ymm5,ymm4
|
|
vpunpcklqdq ymm7,ymm7,ymm8
|
|
vpaddq ymm2,ymm13,ymm3
|
|
vpaddq ymm3,ymm14,ymm4
|
|
vpunpcklqdq ymm10,ymm9,ymm10
|
|
vpmuludq ymm4,ymm0,YMMWORD[80+rax]
|
|
vpmuludq ymm0,ymm5,ymm1
|
|
vmovdqa ymm5,YMMWORD[64+rcx]
|
|
vpaddq ymm4,ymm15,ymm4
|
|
vpaddq ymm0,ymm11,ymm0
|
|
|
|
|
|
|
|
|
|
vpsrlq ymm14,ymm3,26
|
|
vpand ymm3,ymm3,ymm5
|
|
vpaddq ymm4,ymm4,ymm14
|
|
|
|
vpsrlq ymm11,ymm0,26
|
|
vpand ymm0,ymm0,ymm5
|
|
vpaddq ymm1,ymm12,ymm11
|
|
|
|
vpsrlq ymm15,ymm4,26
|
|
vpand ymm4,ymm4,ymm5
|
|
|
|
vpsrlq ymm9,ymm10,4
|
|
|
|
vpsrlq ymm12,ymm1,26
|
|
vpand ymm1,ymm1,ymm5
|
|
vpaddq ymm2,ymm2,ymm12
|
|
|
|
vpaddq ymm0,ymm0,ymm15
|
|
vpsllq ymm15,ymm15,2
|
|
vpaddq ymm0,ymm0,ymm15
|
|
|
|
vpand ymm9,ymm9,ymm5
|
|
vpsrlq ymm8,ymm7,26
|
|
|
|
vpsrlq ymm13,ymm2,26
|
|
vpand ymm2,ymm2,ymm5
|
|
vpaddq ymm3,ymm3,ymm13
|
|
|
|
vpaddq ymm2,ymm2,ymm9
|
|
vpsrlq ymm10,ymm10,30
|
|
|
|
vpsrlq ymm11,ymm0,26
|
|
vpand ymm0,ymm0,ymm5
|
|
vpaddq ymm1,ymm1,ymm11
|
|
|
|
vpsrlq ymm6,ymm6,40
|
|
|
|
vpsrlq ymm14,ymm3,26
|
|
vpand ymm3,ymm3,ymm5
|
|
vpaddq ymm4,ymm4,ymm14
|
|
|
|
vpand ymm7,ymm7,ymm5
|
|
vpand ymm8,ymm8,ymm5
|
|
vpand ymm10,ymm10,ymm5
|
|
vpor ymm6,ymm6,YMMWORD[32+rcx]
|
|
|
|
sub rdx,64
|
|
jnz NEAR $L$oop_avx2_512
|
|
|
|
DB 0x66,0x90
|
|
$L$tail_avx2_512:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vpaddq ymm0,ymm7,ymm0
|
|
vmovdqu ymm7,YMMWORD[4+rsp]
|
|
vpaddq ymm1,ymm8,ymm1
|
|
vmovdqu ymm8,YMMWORD[36+rsp]
|
|
vpaddq ymm3,ymm10,ymm3
|
|
vmovdqu ymm9,YMMWORD[100+rsp]
|
|
vpaddq ymm4,ymm6,ymm4
|
|
vmovdqu ymm10,YMMWORD[52+rax]
|
|
vmovdqu ymm5,YMMWORD[116+rax]
|
|
|
|
vpmuludq ymm13,ymm7,ymm2
|
|
vpmuludq ymm14,ymm8,ymm2
|
|
vpmuludq ymm15,ymm9,ymm2
|
|
vpmuludq ymm11,ymm10,ymm2
|
|
vpmuludq ymm12,ymm5,ymm2
|
|
|
|
vpmuludq ymm6,ymm8,ymm0
|
|
vpmuludq ymm2,ymm8,ymm1
|
|
vpaddq ymm12,ymm12,ymm6
|
|
vpaddq ymm13,ymm13,ymm2
|
|
vpmuludq ymm6,ymm8,ymm3
|
|
vpmuludq ymm2,ymm4,YMMWORD[68+rsp]
|
|
vpaddq ymm15,ymm15,ymm6
|
|
vpaddq ymm11,ymm11,ymm2
|
|
|
|
vpmuludq ymm6,ymm7,ymm0
|
|
vpmuludq ymm2,ymm7,ymm1
|
|
vpaddq ymm11,ymm11,ymm6
|
|
vmovdqu ymm8,YMMWORD[((-12))+rax]
|
|
vpaddq ymm12,ymm12,ymm2
|
|
vpmuludq ymm6,ymm7,ymm3
|
|
vpmuludq ymm2,ymm7,ymm4
|
|
vpaddq ymm14,ymm14,ymm6
|
|
vpaddq ymm15,ymm15,ymm2
|
|
|
|
vpmuludq ymm6,ymm8,ymm3
|
|
vpmuludq ymm2,ymm8,ymm4
|
|
vpaddq ymm11,ymm11,ymm6
|
|
vpaddq ymm12,ymm12,ymm2
|
|
vmovdqu ymm2,YMMWORD[20+rax]
|
|
vpmuludq ymm6,ymm9,ymm1
|
|
vpmuludq ymm9,ymm9,ymm0
|
|
vpaddq ymm14,ymm14,ymm6
|
|
vpaddq ymm13,ymm13,ymm9
|
|
|
|
vpmuludq ymm6,ymm2,ymm1
|
|
vpmuludq ymm2,ymm2,ymm0
|
|
vpaddq ymm15,ymm15,ymm6
|
|
vpaddq ymm14,ymm14,ymm2
|
|
vpmuludq ymm6,ymm10,ymm3
|
|
vpmuludq ymm2,ymm10,ymm4
|
|
vpaddq ymm12,ymm12,ymm6
|
|
vpaddq ymm13,ymm13,ymm2
|
|
|
|
vpmuludq ymm3,ymm5,ymm3
|
|
vpmuludq ymm4,ymm5,ymm4
|
|
vpaddq ymm2,ymm13,ymm3
|
|
vpaddq ymm3,ymm14,ymm4
|
|
vpmuludq ymm4,ymm0,YMMWORD[84+rax]
|
|
vpmuludq ymm0,ymm5,ymm1
|
|
vmovdqa ymm5,YMMWORD[64+rcx]
|
|
vpaddq ymm4,ymm15,ymm4
|
|
vpaddq ymm0,ymm11,ymm0
|
|
|
|
|
|
|
|
|
|
vpsrldq ymm8,ymm12,8
|
|
vpsrldq ymm9,ymm2,8
|
|
vpsrldq ymm10,ymm3,8
|
|
vpsrldq ymm6,ymm4,8
|
|
vpsrldq ymm7,ymm0,8
|
|
vpaddq ymm12,ymm12,ymm8
|
|
vpaddq ymm2,ymm2,ymm9
|
|
vpaddq ymm3,ymm3,ymm10
|
|
vpaddq ymm4,ymm4,ymm6
|
|
vpaddq ymm0,ymm0,ymm7
|
|
|
|
vpermq ymm10,ymm3,0x2
|
|
vpermq ymm6,ymm4,0x2
|
|
vpermq ymm7,ymm0,0x2
|
|
vpermq ymm8,ymm12,0x2
|
|
vpermq ymm9,ymm2,0x2
|
|
vpaddq ymm3,ymm3,ymm10
|
|
vpaddq ymm4,ymm4,ymm6
|
|
vpaddq ymm0,ymm0,ymm7
|
|
vpaddq ymm12,ymm12,ymm8
|
|
vpaddq ymm2,ymm2,ymm9
|
|
|
|
|
|
|
|
|
|
vpsrlq ymm14,ymm3,26
|
|
vpand ymm3,ymm3,ymm5
|
|
vpaddq ymm4,ymm4,ymm14
|
|
|
|
vpsrlq ymm11,ymm0,26
|
|
vpand ymm0,ymm0,ymm5
|
|
vpaddq ymm1,ymm12,ymm11
|
|
|
|
vpsrlq ymm15,ymm4,26
|
|
vpand ymm4,ymm4,ymm5
|
|
|
|
vpsrlq ymm12,ymm1,26
|
|
vpand ymm1,ymm1,ymm5
|
|
vpaddq ymm2,ymm2,ymm12
|
|
|
|
vpaddq ymm0,ymm0,ymm15
|
|
vpsllq ymm15,ymm15,2
|
|
vpaddq ymm0,ymm0,ymm15
|
|
|
|
vpsrlq ymm13,ymm2,26
|
|
vpand ymm2,ymm2,ymm5
|
|
vpaddq ymm3,ymm3,ymm13
|
|
|
|
vpsrlq ymm11,ymm0,26
|
|
vpand ymm0,ymm0,ymm5
|
|
vpaddq ymm1,ymm1,ymm11
|
|
|
|
vpsrlq ymm14,ymm3,26
|
|
vpand ymm3,ymm3,ymm5
|
|
vpaddq ymm4,ymm4,ymm14
|
|
|
|
vmovd DWORD[(-112)+rdi],xmm0
|
|
vmovd DWORD[(-108)+rdi],xmm1
|
|
vmovd DWORD[(-104)+rdi],xmm2
|
|
vmovd DWORD[(-100)+rdi],xmm3
|
|
vmovd DWORD[(-96)+rdi],xmm4
|
|
vmovdqa xmm6,XMMWORD[80+r11]
|
|
vmovdqa xmm7,XMMWORD[96+r11]
|
|
vmovdqa xmm8,XMMWORD[112+r11]
|
|
vmovdqa xmm9,XMMWORD[128+r11]
|
|
vmovdqa xmm10,XMMWORD[144+r11]
|
|
vmovdqa xmm11,XMMWORD[160+r11]
|
|
vmovdqa xmm12,XMMWORD[176+r11]
|
|
vmovdqa xmm13,XMMWORD[192+r11]
|
|
vmovdqa xmm14,XMMWORD[208+r11]
|
|
vmovdqa xmm15,XMMWORD[224+r11]
|
|
lea rsp,[248+r11]
|
|
$L$do_avx2_epilogue_512:
|
|
vzeroupper
|
|
mov rdi,QWORD[8+rsp] ;WIN64 epilogue
|
|
mov rsi,QWORD[16+rsp]
|
|
DB 0F3h,0C3h ;repret
|
|
|
|
$L$SEH_end_poly1305_blocks_avx512:
|
|
$L$blocks_avx512:
|
|
mov eax,15
|
|
kmovw k2,eax
|
|
lea r11,[((-248))+rsp]
|
|
sub rsp,0x1c8
|
|
vmovdqa XMMWORD[80+r11],xmm6
|
|
vmovdqa XMMWORD[96+r11],xmm7
|
|
vmovdqa XMMWORD[112+r11],xmm8
|
|
vmovdqa XMMWORD[128+r11],xmm9
|
|
vmovdqa XMMWORD[144+r11],xmm10
|
|
vmovdqa XMMWORD[160+r11],xmm11
|
|
vmovdqa XMMWORD[176+r11],xmm12
|
|
vmovdqa XMMWORD[192+r11],xmm13
|
|
vmovdqa XMMWORD[208+r11],xmm14
|
|
vmovdqa XMMWORD[224+r11],xmm15
|
|
$L$do_avx512_body:
|
|
lea rcx,[$L$const]
|
|
lea rdi,[((48+64))+rdi]
|
|
vmovdqa ymm9,YMMWORD[96+rcx]
|
|
|
|
|
|
vmovdqu xmm11,XMMWORD[((-64))+rdi]
|
|
and rsp,-512
|
|
vmovdqu xmm12,XMMWORD[((-48))+rdi]
|
|
mov rax,0x20
|
|
vmovdqu xmm7,XMMWORD[((-32))+rdi]
|
|
vmovdqu xmm13,XMMWORD[((-16))+rdi]
|
|
vmovdqu xmm8,XMMWORD[rdi]
|
|
vmovdqu xmm14,XMMWORD[16+rdi]
|
|
vmovdqu xmm10,XMMWORD[32+rdi]
|
|
vmovdqu xmm15,XMMWORD[48+rdi]
|
|
vmovdqu xmm6,XMMWORD[64+rdi]
|
|
vpermd zmm16,zmm9,zmm11
|
|
vpbroadcastq zmm5,QWORD[64+rcx]
|
|
vpermd zmm17,zmm9,zmm12
|
|
vpermd zmm21,zmm9,zmm7
|
|
vpermd zmm18,zmm9,zmm13
|
|
vmovdqa64 ZMMWORD[rsp]{k2},zmm16
|
|
vpsrlq zmm7,zmm16,32
|
|
vpermd zmm22,zmm9,zmm8
|
|
vmovdqu64 ZMMWORD[rax*1+rsp]{k2},zmm17
|
|
vpsrlq zmm8,zmm17,32
|
|
vpermd zmm19,zmm9,zmm14
|
|
vmovdqa64 ZMMWORD[64+rsp]{k2},zmm21
|
|
vpermd zmm23,zmm9,zmm10
|
|
vpermd zmm20,zmm9,zmm15
|
|
vmovdqu64 ZMMWORD[64+rax*1+rsp]{k2},zmm18
|
|
vpermd zmm24,zmm9,zmm6
|
|
vmovdqa64 ZMMWORD[128+rsp]{k2},zmm22
|
|
vmovdqu64 ZMMWORD[128+rax*1+rsp]{k2},zmm19
|
|
vmovdqa64 ZMMWORD[192+rsp]{k2},zmm23
|
|
vmovdqu64 ZMMWORD[192+rax*1+rsp]{k2},zmm20
|
|
vmovdqa64 ZMMWORD[256+rsp]{k2},zmm24
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vpmuludq zmm11,zmm16,zmm7
|
|
vpmuludq zmm12,zmm17,zmm7
|
|
vpmuludq zmm13,zmm18,zmm7
|
|
vpmuludq zmm14,zmm19,zmm7
|
|
vpmuludq zmm15,zmm20,zmm7
|
|
vpsrlq zmm9,zmm18,32
|
|
|
|
vpmuludq zmm25,zmm24,zmm8
|
|
vpmuludq zmm26,zmm16,zmm8
|
|
vpmuludq zmm27,zmm17,zmm8
|
|
vpmuludq zmm28,zmm18,zmm8
|
|
vpmuludq zmm29,zmm19,zmm8
|
|
vpsrlq zmm10,zmm19,32
|
|
vpaddq zmm11,zmm11,zmm25
|
|
vpaddq zmm12,zmm12,zmm26
|
|
vpaddq zmm13,zmm13,zmm27
|
|
vpaddq zmm14,zmm14,zmm28
|
|
vpaddq zmm15,zmm15,zmm29
|
|
|
|
vpmuludq zmm25,zmm23,zmm9
|
|
vpmuludq zmm26,zmm24,zmm9
|
|
vpmuludq zmm28,zmm17,zmm9
|
|
vpmuludq zmm29,zmm18,zmm9
|
|
vpmuludq zmm27,zmm16,zmm9
|
|
vpsrlq zmm6,zmm20,32
|
|
vpaddq zmm11,zmm11,zmm25
|
|
vpaddq zmm12,zmm12,zmm26
|
|
vpaddq zmm14,zmm14,zmm28
|
|
vpaddq zmm15,zmm15,zmm29
|
|
vpaddq zmm13,zmm13,zmm27
|
|
|
|
vpmuludq zmm25,zmm22,zmm10
|
|
vpmuludq zmm28,zmm16,zmm10
|
|
vpmuludq zmm29,zmm17,zmm10
|
|
vpmuludq zmm26,zmm23,zmm10
|
|
vpmuludq zmm27,zmm24,zmm10
|
|
vpaddq zmm11,zmm11,zmm25
|
|
vpaddq zmm14,zmm14,zmm28
|
|
vpaddq zmm15,zmm15,zmm29
|
|
vpaddq zmm12,zmm12,zmm26
|
|
vpaddq zmm13,zmm13,zmm27
|
|
|
|
vpmuludq zmm28,zmm24,zmm6
|
|
vpmuludq zmm29,zmm16,zmm6
|
|
vpmuludq zmm25,zmm21,zmm6
|
|
vpmuludq zmm26,zmm22,zmm6
|
|
vpmuludq zmm27,zmm23,zmm6
|
|
vpaddq zmm14,zmm14,zmm28
|
|
vpaddq zmm15,zmm15,zmm29
|
|
vpaddq zmm11,zmm11,zmm25
|
|
vpaddq zmm12,zmm12,zmm26
|
|
vpaddq zmm13,zmm13,zmm27
|
|
|
|
|
|
|
|
vmovdqu64 zmm10,ZMMWORD[rsi]
|
|
vmovdqu64 zmm6,ZMMWORD[64+rsi]
|
|
lea rsi,[128+rsi]
|
|
|
|
|
|
|
|
|
|
vpsrlq zmm28,zmm14,26
|
|
vpandq zmm14,zmm14,zmm5
|
|
vpaddq zmm15,zmm15,zmm28
|
|
|
|
vpsrlq zmm25,zmm11,26
|
|
vpandq zmm11,zmm11,zmm5
|
|
vpaddq zmm12,zmm12,zmm25
|
|
|
|
vpsrlq zmm29,zmm15,26
|
|
vpandq zmm15,zmm15,zmm5
|
|
|
|
vpsrlq zmm26,zmm12,26
|
|
vpandq zmm12,zmm12,zmm5
|
|
vpaddq zmm13,zmm13,zmm26
|
|
|
|
vpaddq zmm11,zmm11,zmm29
|
|
vpsllq zmm29,zmm29,2
|
|
vpaddq zmm11,zmm11,zmm29
|
|
|
|
vpsrlq zmm27,zmm13,26
|
|
vpandq zmm13,zmm13,zmm5
|
|
vpaddq zmm14,zmm14,zmm27
|
|
|
|
vpsrlq zmm25,zmm11,26
|
|
vpandq zmm11,zmm11,zmm5
|
|
vpaddq zmm12,zmm12,zmm25
|
|
|
|
vpsrlq zmm28,zmm14,26
|
|
vpandq zmm14,zmm14,zmm5
|
|
vpaddq zmm15,zmm15,zmm28
|
|
|
|
|
|
|
|
|
|
|
|
vpunpcklqdq zmm7,zmm10,zmm6
|
|
vpunpckhqdq zmm6,zmm10,zmm6
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vmovdqa32 zmm25,ZMMWORD[128+rcx]
|
|
mov eax,0x7777
|
|
kmovw k1,eax
|
|
|
|
vpermd zmm16,zmm25,zmm16
|
|
vpermd zmm17,zmm25,zmm17
|
|
vpermd zmm18,zmm25,zmm18
|
|
vpermd zmm19,zmm25,zmm19
|
|
vpermd zmm20,zmm25,zmm20
|
|
|
|
vpermd zmm16{k1},zmm25,zmm11
|
|
vpermd zmm17{k1},zmm25,zmm12
|
|
vpermd zmm18{k1},zmm25,zmm13
|
|
vpermd zmm19{k1},zmm25,zmm14
|
|
vpermd zmm20{k1},zmm25,zmm15
|
|
|
|
vpslld zmm21,zmm17,2
|
|
vpslld zmm22,zmm18,2
|
|
vpslld zmm23,zmm19,2
|
|
vpslld zmm24,zmm20,2
|
|
vpaddd zmm21,zmm21,zmm17
|
|
vpaddd zmm22,zmm22,zmm18
|
|
vpaddd zmm23,zmm23,zmm19
|
|
vpaddd zmm24,zmm24,zmm20
|
|
|
|
vpbroadcastq zmm30,QWORD[32+rcx]
|
|
|
|
vpsrlq zmm9,zmm7,52
|
|
vpsllq zmm10,zmm6,12
|
|
vporq zmm9,zmm9,zmm10
|
|
vpsrlq zmm8,zmm7,26
|
|
vpsrlq zmm10,zmm6,14
|
|
vpsrlq zmm6,zmm6,40
|
|
vpandq zmm9,zmm9,zmm5
|
|
vpandq zmm7,zmm7,zmm5
|
|
|
|
|
|
|
|
|
|
vpaddq zmm2,zmm9,zmm2
|
|
sub rdx,192
|
|
jbe NEAR $L$tail_avx512
|
|
jmp NEAR $L$oop_avx512
|
|
|
|
ALIGN 32
|
|
$L$oop_avx512:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vpmuludq zmm14,zmm17,zmm2
|
|
vpaddq zmm0,zmm7,zmm0
|
|
vpmuludq zmm15,zmm18,zmm2
|
|
vpandq zmm8,zmm8,zmm5
|
|
vpmuludq zmm11,zmm23,zmm2
|
|
vpandq zmm10,zmm10,zmm5
|
|
vpmuludq zmm12,zmm24,zmm2
|
|
vporq zmm6,zmm6,zmm30
|
|
vpmuludq zmm13,zmm16,zmm2
|
|
vpaddq zmm1,zmm8,zmm1
|
|
vpaddq zmm3,zmm10,zmm3
|
|
vpaddq zmm4,zmm6,zmm4
|
|
|
|
vmovdqu64 zmm10,ZMMWORD[rsi]
|
|
vmovdqu64 zmm6,ZMMWORD[64+rsi]
|
|
lea rsi,[128+rsi]
|
|
vpmuludq zmm28,zmm19,zmm0
|
|
vpmuludq zmm29,zmm20,zmm0
|
|
vpmuludq zmm25,zmm16,zmm0
|
|
vpmuludq zmm26,zmm17,zmm0
|
|
vpaddq zmm14,zmm14,zmm28
|
|
vpaddq zmm15,zmm15,zmm29
|
|
vpaddq zmm11,zmm11,zmm25
|
|
vpaddq zmm12,zmm12,zmm26
|
|
|
|
vpmuludq zmm28,zmm18,zmm1
|
|
vpmuludq zmm29,zmm19,zmm1
|
|
vpmuludq zmm25,zmm24,zmm1
|
|
vpmuludq zmm27,zmm18,zmm0
|
|
vpaddq zmm14,zmm14,zmm28
|
|
vpaddq zmm15,zmm15,zmm29
|
|
vpaddq zmm11,zmm11,zmm25
|
|
vpaddq zmm13,zmm13,zmm27
|
|
|
|
vpunpcklqdq zmm7,zmm10,zmm6
|
|
vpunpckhqdq zmm6,zmm10,zmm6
|
|
|
|
vpmuludq zmm28,zmm16,zmm3
|
|
vpmuludq zmm29,zmm17,zmm3
|
|
vpmuludq zmm26,zmm16,zmm1
|
|
vpmuludq zmm27,zmm17,zmm1
|
|
vpaddq zmm14,zmm14,zmm28
|
|
vpaddq zmm15,zmm15,zmm29
|
|
vpaddq zmm12,zmm12,zmm26
|
|
vpaddq zmm13,zmm13,zmm27
|
|
|
|
vpmuludq zmm28,zmm24,zmm4
|
|
vpmuludq zmm29,zmm16,zmm4
|
|
vpmuludq zmm25,zmm22,zmm3
|
|
vpmuludq zmm26,zmm23,zmm3
|
|
vpaddq zmm14,zmm14,zmm28
|
|
vpmuludq zmm27,zmm24,zmm3
|
|
vpaddq zmm15,zmm15,zmm29
|
|
vpaddq zmm11,zmm11,zmm25
|
|
vpaddq zmm12,zmm12,zmm26
|
|
vpaddq zmm13,zmm13,zmm27
|
|
|
|
vpmuludq zmm25,zmm21,zmm4
|
|
vpmuludq zmm26,zmm22,zmm4
|
|
vpmuludq zmm27,zmm23,zmm4
|
|
vpaddq zmm0,zmm11,zmm25
|
|
vpaddq zmm1,zmm12,zmm26
|
|
vpaddq zmm2,zmm13,zmm27
|
|
|
|
|
|
|
|
|
|
vpsrlq zmm9,zmm7,52
|
|
vpsllq zmm10,zmm6,12
|
|
|
|
vpsrlq zmm3,zmm14,26
|
|
vpandq zmm14,zmm14,zmm5
|
|
vpaddq zmm4,zmm15,zmm3
|
|
|
|
vporq zmm9,zmm9,zmm10
|
|
|
|
vpsrlq zmm11,zmm0,26
|
|
vpandq zmm0,zmm0,zmm5
|
|
vpaddq zmm1,zmm1,zmm11
|
|
|
|
vpandq zmm9,zmm9,zmm5
|
|
|
|
vpsrlq zmm15,zmm4,26
|
|
vpandq zmm4,zmm4,zmm5
|
|
|
|
vpsrlq zmm12,zmm1,26
|
|
vpandq zmm1,zmm1,zmm5
|
|
vpaddq zmm2,zmm2,zmm12
|
|
|
|
vpaddq zmm0,zmm0,zmm15
|
|
vpsllq zmm15,zmm15,2
|
|
vpaddq zmm0,zmm0,zmm15
|
|
|
|
vpaddq zmm2,zmm2,zmm9
|
|
vpsrlq zmm8,zmm7,26
|
|
|
|
vpsrlq zmm13,zmm2,26
|
|
vpandq zmm2,zmm2,zmm5
|
|
vpaddq zmm3,zmm14,zmm13
|
|
|
|
vpsrlq zmm10,zmm6,14
|
|
|
|
vpsrlq zmm11,zmm0,26
|
|
vpandq zmm0,zmm0,zmm5
|
|
vpaddq zmm1,zmm1,zmm11
|
|
|
|
vpsrlq zmm6,zmm6,40
|
|
|
|
vpsrlq zmm14,zmm3,26
|
|
vpandq zmm3,zmm3,zmm5
|
|
vpaddq zmm4,zmm4,zmm14
|
|
|
|
vpandq zmm7,zmm7,zmm5
|
|
|
|
|
|
|
|
|
|
sub rdx,128
|
|
ja NEAR $L$oop_avx512
|
|
|
|
$L$tail_avx512:
|
|
|
|
|
|
|
|
|
|
|
|
vpsrlq zmm16,zmm16,32
|
|
vpsrlq zmm17,zmm17,32
|
|
vpsrlq zmm18,zmm18,32
|
|
vpsrlq zmm23,zmm23,32
|
|
vpsrlq zmm24,zmm24,32
|
|
vpsrlq zmm19,zmm19,32
|
|
vpsrlq zmm20,zmm20,32
|
|
vpsrlq zmm21,zmm21,32
|
|
vpsrlq zmm22,zmm22,32
|
|
|
|
|
|
|
|
lea rsi,[rdx*1+rsi]
|
|
|
|
|
|
vpaddq zmm0,zmm7,zmm0
|
|
|
|
vpmuludq zmm14,zmm17,zmm2
|
|
vpmuludq zmm15,zmm18,zmm2
|
|
vpmuludq zmm11,zmm23,zmm2
|
|
vpandq zmm8,zmm8,zmm5
|
|
vpmuludq zmm12,zmm24,zmm2
|
|
vpandq zmm10,zmm10,zmm5
|
|
vpmuludq zmm13,zmm16,zmm2
|
|
vporq zmm6,zmm6,zmm30
|
|
vpaddq zmm1,zmm8,zmm1
|
|
vpaddq zmm3,zmm10,zmm3
|
|
vpaddq zmm4,zmm6,zmm4
|
|
|
|
vmovdqu xmm7,XMMWORD[rsi]
|
|
vpmuludq zmm28,zmm19,zmm0
|
|
vpmuludq zmm29,zmm20,zmm0
|
|
vpmuludq zmm25,zmm16,zmm0
|
|
vpmuludq zmm26,zmm17,zmm0
|
|
vpaddq zmm14,zmm14,zmm28
|
|
vpaddq zmm15,zmm15,zmm29
|
|
vpaddq zmm11,zmm11,zmm25
|
|
vpaddq zmm12,zmm12,zmm26
|
|
|
|
vmovdqu xmm8,XMMWORD[16+rsi]
|
|
vpmuludq zmm28,zmm18,zmm1
|
|
vpmuludq zmm29,zmm19,zmm1
|
|
vpmuludq zmm25,zmm24,zmm1
|
|
vpmuludq zmm27,zmm18,zmm0
|
|
vpaddq zmm14,zmm14,zmm28
|
|
vpaddq zmm15,zmm15,zmm29
|
|
vpaddq zmm11,zmm11,zmm25
|
|
vpaddq zmm13,zmm13,zmm27
|
|
|
|
vinserti128 ymm7,ymm7,XMMWORD[32+rsi],1
|
|
vpmuludq zmm28,zmm16,zmm3
|
|
vpmuludq zmm29,zmm17,zmm3
|
|
vpmuludq zmm26,zmm16,zmm1
|
|
vpmuludq zmm27,zmm17,zmm1
|
|
vpaddq zmm14,zmm14,zmm28
|
|
vpaddq zmm15,zmm15,zmm29
|
|
vpaddq zmm12,zmm12,zmm26
|
|
vpaddq zmm13,zmm13,zmm27
|
|
|
|
vinserti128 ymm8,ymm8,XMMWORD[48+rsi],1
|
|
vpmuludq zmm28,zmm24,zmm4
|
|
vpmuludq zmm29,zmm16,zmm4
|
|
vpmuludq zmm25,zmm22,zmm3
|
|
vpmuludq zmm26,zmm23,zmm3
|
|
vpmuludq zmm27,zmm24,zmm3
|
|
vpaddq zmm3,zmm14,zmm28
|
|
vpaddq zmm15,zmm15,zmm29
|
|
vpaddq zmm11,zmm11,zmm25
|
|
vpaddq zmm12,zmm12,zmm26
|
|
vpaddq zmm13,zmm13,zmm27
|
|
|
|
vpmuludq zmm25,zmm21,zmm4
|
|
vpmuludq zmm26,zmm22,zmm4
|
|
vpmuludq zmm27,zmm23,zmm4
|
|
vpaddq zmm0,zmm11,zmm25
|
|
vpaddq zmm1,zmm12,zmm26
|
|
vpaddq zmm2,zmm13,zmm27
|
|
|
|
|
|
|
|
|
|
mov eax,1
|
|
vpermq zmm14,zmm3,0xb1
|
|
vpermq zmm4,zmm15,0xb1
|
|
vpermq zmm11,zmm0,0xb1
|
|
vpermq zmm12,zmm1,0xb1
|
|
vpermq zmm13,zmm2,0xb1
|
|
vpaddq zmm3,zmm3,zmm14
|
|
vpaddq zmm4,zmm4,zmm15
|
|
vpaddq zmm0,zmm0,zmm11
|
|
vpaddq zmm1,zmm1,zmm12
|
|
vpaddq zmm2,zmm2,zmm13
|
|
|
|
kmovw k3,eax
|
|
vpermq zmm14,zmm3,0x2
|
|
vpermq zmm15,zmm4,0x2
|
|
vpermq zmm11,zmm0,0x2
|
|
vpermq zmm12,zmm1,0x2
|
|
vpermq zmm13,zmm2,0x2
|
|
vpaddq zmm3,zmm3,zmm14
|
|
vpaddq zmm4,zmm4,zmm15
|
|
vpaddq zmm0,zmm0,zmm11
|
|
vpaddq zmm1,zmm1,zmm12
|
|
vpaddq zmm2,zmm2,zmm13
|
|
|
|
vextracti64x4 ymm14,zmm3,0x1
|
|
vextracti64x4 ymm15,zmm4,0x1
|
|
vextracti64x4 ymm11,zmm0,0x1
|
|
vextracti64x4 ymm12,zmm1,0x1
|
|
vextracti64x4 ymm13,zmm2,0x1
|
|
vpaddq zmm3{k3}{z},zmm3,zmm14
|
|
vpaddq zmm4{k3}{z},zmm4,zmm15
|
|
vpaddq zmm0{k3}{z},zmm0,zmm11
|
|
vpaddq zmm1{k3}{z},zmm1,zmm12
|
|
vpaddq zmm2{k3}{z},zmm2,zmm13
|
|
|
|
|
|
|
|
vpsrlq ymm14,ymm3,26
|
|
vpand ymm3,ymm3,ymm5
|
|
vpsrldq ymm9,ymm7,6
|
|
vpsrldq ymm10,ymm8,6
|
|
vpunpckhqdq ymm6,ymm7,ymm8
|
|
vpaddq ymm4,ymm4,ymm14
|
|
|
|
vpsrlq ymm11,ymm0,26
|
|
vpand ymm0,ymm0,ymm5
|
|
vpunpcklqdq ymm9,ymm9,ymm10
|
|
vpunpcklqdq ymm7,ymm7,ymm8
|
|
vpaddq ymm1,ymm1,ymm11
|
|
|
|
vpsrlq ymm15,ymm4,26
|
|
vpand ymm4,ymm4,ymm5
|
|
|
|
vpsrlq ymm12,ymm1,26
|
|
vpand ymm1,ymm1,ymm5
|
|
vpsrlq ymm10,ymm9,30
|
|
vpsrlq ymm9,ymm9,4
|
|
vpaddq ymm2,ymm2,ymm12
|
|
|
|
vpaddq ymm0,ymm0,ymm15
|
|
vpsllq ymm15,ymm15,2
|
|
vpsrlq ymm8,ymm7,26
|
|
vpsrlq ymm6,ymm6,40
|
|
vpaddq ymm0,ymm0,ymm15
|
|
|
|
vpsrlq ymm13,ymm2,26
|
|
vpand ymm2,ymm2,ymm5
|
|
vpand ymm9,ymm9,ymm5
|
|
vpand ymm7,ymm7,ymm5
|
|
vpaddq ymm3,ymm3,ymm13
|
|
|
|
vpsrlq ymm11,ymm0,26
|
|
vpand ymm0,ymm0,ymm5
|
|
vpaddq ymm2,ymm9,ymm2
|
|
vpand ymm8,ymm8,ymm5
|
|
vpaddq ymm1,ymm1,ymm11
|
|
|
|
vpsrlq ymm14,ymm3,26
|
|
vpand ymm3,ymm3,ymm5
|
|
vpand ymm10,ymm10,ymm5
|
|
vpor ymm6,ymm6,YMMWORD[32+rcx]
|
|
vpaddq ymm4,ymm4,ymm14
|
|
|
|
lea rax,[144+rsp]
|
|
add rdx,64
|
|
jnz NEAR $L$tail_avx2_512
|
|
|
|
vpsubq ymm2,ymm2,ymm9
|
|
vmovd DWORD[(-112)+rdi],xmm0
|
|
vmovd DWORD[(-108)+rdi],xmm1
|
|
vmovd DWORD[(-104)+rdi],xmm2
|
|
vmovd DWORD[(-100)+rdi],xmm3
|
|
vmovd DWORD[(-96)+rdi],xmm4
|
|
vzeroall
|
|
movdqa xmm6,XMMWORD[80+r11]
|
|
movdqa xmm7,XMMWORD[96+r11]
|
|
movdqa xmm8,XMMWORD[112+r11]
|
|
movdqa xmm9,XMMWORD[128+r11]
|
|
movdqa xmm10,XMMWORD[144+r11]
|
|
movdqa xmm11,XMMWORD[160+r11]
|
|
movdqa xmm12,XMMWORD[176+r11]
|
|
movdqa xmm13,XMMWORD[192+r11]
|
|
movdqa xmm14,XMMWORD[208+r11]
|
|
movdqa xmm15,XMMWORD[224+r11]
|
|
lea rsp,[248+r11]
|
|
$L$do_avx512_epilogue:
|
|
DB 0F3h,0C3h ;repret
|
|
|
|
|
|
EXTERN __imp_RtlVirtualUnwind
|
|
|
|
ALIGN 16
|
|
;-----------------------------------------------------------------------
; se_handler -- Win64 language-specific exception handler named by the
; .xdata records of the routines that only save integer registers.
;
; In:   r8 = CONTEXT*, r9 = DISPATCHER_CONTEXT* (field offsets used below
;       follow the x64 winnt.h layouts; rcx and rdx are not read here).
;       HandlerData (from .xdata) = two image-relative DWORDs:
;       [0] start of the covered body, [1] epilogue label.
; Flow: picks the value that rsp had on function entry into rax, patches
;       the callee-saved integer registers in CONTEXT when the fault hit
;       inside the body, then jumps to $L$common_seh_tail (defined in
;       avx_handler), which finishes the unwind and pops everything
;       pushed here.
;-----------------------------------------------------------------------
se_handler:
|
|
push rsi
|
|
push rdi
|
|
push rbx
|
|
push rbp
|
|
push r12
|
|
push r13
|
|
push r14
|
|
push r15
|
|
pushfq
|
|
sub rsp,64 ; room for RtlVirtualUnwind's home space + 4 stack arguments
|
|
|
|
mov rax,QWORD[120+r8] ; rax = context->Rax
|
|
mov rbx,QWORD[248+r8] ; rbx = context->Rip (faulting address)
|
|
|
|
mov rsi,QWORD[8+r9] ; rsi = disp->ImageBase
|
|
mov r11,QWORD[56+r9] ; r11 = disp->HandlerData
|
|
|
|
mov r10d,DWORD[r11] ; HandlerData[0] (RVA)
|
|
lea r10,[r10*1+rsi] ; RVA -> absolute address of body start
|
|
cmp rbx,r10
|
|
jb NEAR $L$common_seh_tail ; Rip before body: keep rax = context->Rax
|
|
|
|
mov rax,QWORD[152+r8] ; rax = context->Rsp
|
|
|
|
mov r10d,DWORD[4+r11] ; HandlerData[1] (RVA)
|
|
lea r10,[r10*1+rsi] ; RVA -> absolute address of epilogue label
|
|
cmp rbx,r10
|
|
jae NEAR $L$common_seh_tail ; Rip at/after epilogue label: use Rsp as is
|
|
|
|
lea rax,[48+rax] ; skip six 8-byte slots (presumably the prologue's pushes -- confirm against covered routines)
|
|
|
|
mov rbx,QWORD[((-8))+rax] ; reload the six saved values from those slots
|
|
mov rbp,QWORD[((-16))+rax]
|
|
mov r12,QWORD[((-24))+rax]
|
|
mov r13,QWORD[((-32))+rax]
|
|
mov r14,QWORD[((-40))+rax]
|
|
mov r15,QWORD[((-48))+rax]
|
|
mov QWORD[144+r8],rbx ; context->Rbx
|
|
mov QWORD[160+r8],rbp ; context->Rbp
|
|
mov QWORD[216+r8],r12 ; context->R12
|
|
mov QWORD[224+r8],r13 ; context->R13
|
|
mov QWORD[232+r8],r14 ; context->R14
|
|
mov QWORD[240+r8],r15 ; context->R15
|
|
|
|
jmp NEAR $L$common_seh_tail ; shared tail lives in avx_handler
|
|
|
|
|
|
|
|
ALIGN 16
|
|
;-----------------------------------------------------------------------
; avx_handler -- Win64 language-specific exception handler named by the
; .xdata records of the AVX/AVX2/AVX512 main bodies, which keep
; xmm6-xmm15 in a stack frame addressed through r11.
;
; In:   r8 = CONTEXT*, r9 = DISPATCHER_CONTEXT* (field offsets used below
;       follow the x64 winnt.h layouts).
;       HandlerData (from .xdata) = two image-relative DWORDs:
;       [0] start of the covered body, [1] epilogue label.
; Out:  eax = 1 (ExceptionContinueSearch per the Win64 SEH convention).
;
; $L$common_seh_tail is also the jump target of se_handler; on arrival
; rax must hold the value rsp had on entry to the interrupted function,
; and the stack must contain exactly the pushes made at the top of
; either handler.
;-----------------------------------------------------------------------
avx_handler:
|
|
push rsi
|
|
push rdi
|
|
push rbx
|
|
push rbp
|
|
push r12
|
|
push r13
|
|
push r14
|
|
push r15
|
|
pushfq
|
|
sub rsp,64 ; room for RtlVirtualUnwind's home space + 4 stack arguments
|
|
|
|
mov rax,QWORD[120+r8] ; rax = context->Rax
|
|
mov rbx,QWORD[248+r8] ; rbx = context->Rip (faulting address)
|
|
|
|
mov rsi,QWORD[8+r9] ; rsi = disp->ImageBase
|
|
mov r11,QWORD[56+r9] ; r11 = disp->HandlerData
|
|
|
|
mov r10d,DWORD[r11] ; HandlerData[0] (RVA)
|
|
lea r10,[r10*1+rsi] ; RVA -> absolute address of body start
|
|
cmp rbx,r10
|
|
jb NEAR $L$common_seh_tail ; Rip before body: keep rax = context->Rax
|
|
|
|
mov rax,QWORD[152+r8] ; rax = context->Rsp
|
|
|
|
mov r10d,DWORD[4+r11] ; HandlerData[1] (RVA)
|
|
lea r10,[r10*1+rsi] ; RVA -> absolute address of epilogue label
|
|
cmp rbx,r10
|
|
jae NEAR $L$common_seh_tail ; Rip at/after epilogue label: use Rsp as is
|
|
|
|
mov rax,QWORD[208+r8] ; rax = context->R11 (frame base used by the covered body)
|
|
|
|
lea rsi,[80+rax] ; source: xmm6 save slot at 80+r11 (same offsets the epilogue reloads from)
|
|
|
|
lea rax,[248+rax] ; rax = 248+r11, the value the epilogue puts back into rsp
|
|
|
|
lea rdi,[512+r8] ; destination: &context->Xmm6
|
|
|
|
mov ecx,20 ; 20 qwords = 10 XMM registers (xmm6..xmm15)
|
|
|
|
DD 0xa548f3fc ; bytes FC F3 48 A5 = cld; rep movsq
|
|
|
|
|
|
; Shared tail: on entry rax = rsp of the interrupted function at its entry.
$L$common_seh_tail:
|
|
mov rdi,QWORD[8+rax] ; rdi/rsi as stored at 8/16 above entry rsp by the ";WIN64 prologue"
|
|
mov rsi,QWORD[16+rax]
|
|
mov QWORD[152+r8],rax ; context->Rsp
|
|
mov QWORD[168+r8],rsi ; context->Rsi
|
|
mov QWORD[176+r8],rdi ; context->Rdi
|
|
|
|
|
|
mov rdi,QWORD[40+r9] ; destination: disp->ContextRecord
|
|
mov rsi,r8 ; source: the CONTEXT patched above
|
|
mov ecx,154 ; 154 qwords = 1232 bytes (size of the x64 CONTEXT)
|
|
DD 0xa548f3fc ; bytes FC F3 48 A5 = cld; rep movsq
|
|
|
|
|
|
; Build the RtlVirtualUnwind call: 4 register arguments + 4 stack arguments.
mov rsi,r9 ; rsi = disp (r9 is about to be reused as an argument)
|
|
xor rcx,rcx ; arg1 HandlerType = 0 (UNW_FLAG_NHANDLER)
|
|
mov rdx,QWORD[8+rsi] ; arg2 = disp->ImageBase
|
|
mov r8,QWORD[rsi] ; arg3 = disp->ControlPc
|
|
mov r9,QWORD[16+rsi] ; arg4 = disp->FunctionEntry
|
|
mov r10,QWORD[40+rsi] ; disp->ContextRecord
|
|
lea r11,[56+rsi] ; &disp->HandlerData
|
|
lea r12,[24+rsi] ; &disp->EstablisherFrame
|
|
mov QWORD[32+rsp],r10 ; arg5 = ContextRecord
|
|
mov QWORD[40+rsp],r11 ; arg6 = &HandlerData
|
|
mov QWORD[48+rsp],r12 ; arg7 = &EstablisherFrame
|
|
mov QWORD[56+rsp],rcx ; arg8 = NULL
|
|
call QWORD[__imp_RtlVirtualUnwind]
|
|
|
|
|
|
mov eax,1 ; return value 1 = ExceptionContinueSearch
|
|
add rsp,64 ; undo the frame reserved at handler entry
|
|
popfq
|
|
pop r15
|
|
pop r14
|
|
pop r13
|
|
pop r12
|
|
pop rbp
|
|
pop rbx
|
|
pop rdi
|
|
pop rsi
|
|
DB 0F3h,0C3h ;repret
|
|
|
|
|
|
;-----------------------------------------------------------------------
; .pdata -- Win64 function table. Every entry is three image-relative
; DWORDs: start of the covered range, end of the covered range, and the
; matching unwind-info record in .xdata (RUNTIME_FUNCTION layout).
; poly1305_blocks_avx and poly1305_blocks_avx2 are each split into three
; consecutive ranges, and each range has its own unwind-info record.
;-----------------------------------------------------------------------
section .pdata rdata align=4
|
|
ALIGN 4
|
|
; poly1305_init_x86_64
DD $L$SEH_begin_poly1305_init_x86_64 wrt ..imagebase
|
|
DD $L$SEH_end_poly1305_init_x86_64 wrt ..imagebase
|
|
DD $L$SEH_info_poly1305_init wrt ..imagebase
|
|
|
|
; poly1305_blocks_x86_64
DD $L$SEH_begin_poly1305_blocks_x86_64 wrt ..imagebase
|
|
DD $L$SEH_end_poly1305_blocks_x86_64 wrt ..imagebase
|
|
DD $L$SEH_info_poly1305_blocks wrt ..imagebase
|
|
|
|
; poly1305_emit_x86_64
DD $L$SEH_begin_poly1305_emit_x86_64 wrt ..imagebase
|
|
DD $L$SEH_end_poly1305_emit_x86_64 wrt ..imagebase
|
|
DD $L$SEH_info_poly1305_emit wrt ..imagebase
|
|
; poly1305_blocks_avx, range 1: function start up to $L$base2_64_avx
DD $L$SEH_begin_poly1305_blocks_avx wrt ..imagebase
|
|
DD $L$base2_64_avx wrt ..imagebase
|
|
DD $L$SEH_info_poly1305_blocks_avx_1 wrt ..imagebase
|
|
|
|
; poly1305_blocks_avx, range 2: $L$base2_64_avx up to $L$even_avx
DD $L$base2_64_avx wrt ..imagebase
|
|
DD $L$even_avx wrt ..imagebase
|
|
DD $L$SEH_info_poly1305_blocks_avx_2 wrt ..imagebase
|
|
|
|
; poly1305_blocks_avx, range 3: $L$even_avx up to function end
DD $L$even_avx wrt ..imagebase
|
|
DD $L$SEH_end_poly1305_blocks_avx wrt ..imagebase
|
|
DD $L$SEH_info_poly1305_blocks_avx_3 wrt ..imagebase
|
|
|
|
; poly1305_emit_avx
DD $L$SEH_begin_poly1305_emit_avx wrt ..imagebase
|
|
DD $L$SEH_end_poly1305_emit_avx wrt ..imagebase
|
|
DD $L$SEH_info_poly1305_emit_avx wrt ..imagebase
|
|
; poly1305_blocks_avx2, range 1: function start up to $L$base2_64_avx2
DD $L$SEH_begin_poly1305_blocks_avx2 wrt ..imagebase
|
|
DD $L$base2_64_avx2 wrt ..imagebase
|
|
DD $L$SEH_info_poly1305_blocks_avx2_1 wrt ..imagebase
|
|
|
|
; poly1305_blocks_avx2, range 2: $L$base2_64_avx2 up to $L$even_avx2
DD $L$base2_64_avx2 wrt ..imagebase
|
|
DD $L$even_avx2 wrt ..imagebase
|
|
DD $L$SEH_info_poly1305_blocks_avx2_2 wrt ..imagebase
|
|
|
|
; poly1305_blocks_avx2, range 3: $L$even_avx2 up to function end
DD $L$even_avx2 wrt ..imagebase
|
|
DD $L$SEH_end_poly1305_blocks_avx2 wrt ..imagebase
|
|
DD $L$SEH_info_poly1305_blocks_avx2_3 wrt ..imagebase
|
|
; poly1305_blocks_avx512 (single range)
DD $L$SEH_begin_poly1305_blocks_avx512 wrt ..imagebase
|
|
DD $L$SEH_end_poly1305_blocks_avx512 wrt ..imagebase
|
|
DD $L$SEH_info_poly1305_blocks_avx512 wrt ..imagebase
|
|
;-----------------------------------------------------------------------
; .xdata -- unwind-info records referenced from .pdata. Every record has
; the same shape:
;   DB 9,0,0,0   header bytes; per the x64 UNWIND_INFO layout 9 encodes
;                version 1 with the "has exception handler" flag, and
;                the remaining bytes give prolog size 0, 0 unwind codes
;                and no frame register
;   DD handler   image-relative address of se_handler or avx_handler
;   DD a,b       HandlerData: two image-relative addresses the handler
;                reads as DWORD[r11] and DWORD[4+r11] and compares
;                against the faulting Rip (body start, epilogue label)
; Records for init/emit pass the function's begin label for both a and b.
;-----------------------------------------------------------------------
section .xdata rdata align=8
|
|
ALIGN 8
|
|
$L$SEH_info_poly1305_init:
|
|
DB 9,0,0,0
|
|
DD se_handler wrt ..imagebase
|
|
DD $L$SEH_begin_poly1305_init_x86_64 wrt ..imagebase,$L$SEH_begin_poly1305_init_x86_64 wrt ..imagebase ; body start == epilogue label
|
|
|
|
$L$SEH_info_poly1305_blocks:
|
|
DB 9,0,0,0
|
|
DD se_handler wrt ..imagebase
|
|
DD $L$blocks_body wrt ..imagebase,$L$blocks_epilogue wrt ..imagebase
|
|
|
|
$L$SEH_info_poly1305_emit:
|
|
DB 9,0,0,0
|
|
DD se_handler wrt ..imagebase
|
|
DD $L$SEH_begin_poly1305_emit_x86_64 wrt ..imagebase,$L$SEH_begin_poly1305_emit_x86_64 wrt ..imagebase ; body start == epilogue label
|
|
; poly1305_blocks_avx: three records, one per .pdata range
$L$SEH_info_poly1305_blocks_avx_1:
|
|
DB 9,0,0,0
|
|
DD se_handler wrt ..imagebase
|
|
DD $L$blocks_avx_body wrt ..imagebase,$L$blocks_avx_epilogue wrt ..imagebase
|
|
|
|
$L$SEH_info_poly1305_blocks_avx_2:
|
|
DB 9,0,0,0
|
|
DD se_handler wrt ..imagebase
|
|
DD $L$base2_64_avx_body wrt ..imagebase,$L$base2_64_avx_epilogue wrt ..imagebase
|
|
|
|
$L$SEH_info_poly1305_blocks_avx_3:
|
|
DB 9,0,0,0
|
|
DD avx_handler wrt ..imagebase ; this range uses the handler that also restores xmm6-xmm15
|
|
DD $L$do_avx_body wrt ..imagebase,$L$do_avx_epilogue wrt ..imagebase
|
|
|
|
$L$SEH_info_poly1305_emit_avx:
|
|
DB 9,0,0,0
|
|
DD se_handler wrt ..imagebase
|
|
DD $L$SEH_begin_poly1305_emit_avx wrt ..imagebase,$L$SEH_begin_poly1305_emit_avx wrt ..imagebase ; body start == epilogue label
|
|
; poly1305_blocks_avx2: three records, one per .pdata range
$L$SEH_info_poly1305_blocks_avx2_1:
|
|
DB 9,0,0,0
|
|
DD se_handler wrt ..imagebase
|
|
DD $L$blocks_avx2_body wrt ..imagebase,$L$blocks_avx2_epilogue wrt ..imagebase
|
|
|
|
$L$SEH_info_poly1305_blocks_avx2_2:
|
|
DB 9,0,0,0
|
|
DD se_handler wrt ..imagebase
|
|
DD $L$base2_64_avx2_body wrt ..imagebase,$L$base2_64_avx2_epilogue wrt ..imagebase
|
|
|
|
$L$SEH_info_poly1305_blocks_avx2_3:
|
|
DB 9,0,0,0
|
|
DD avx_handler wrt ..imagebase ; this range uses the handler that also restores xmm6-xmm15
|
|
DD $L$do_avx2_body wrt ..imagebase,$L$do_avx2_epilogue wrt ..imagebase
|
|
; poly1305_blocks_avx512: single record covering the whole function
$L$SEH_info_poly1305_blocks_avx512:
|
|
DB 9,0,0,0
|
|
DD avx_handler wrt ..imagebase
|
|
DD $L$do_avx512_body wrt ..imagebase,$L$do_avx512_epilogue wrt ..imagebase
|