Add ARM64 asm code, and rearranged asm code filenames

This commit is contained in:
Ludvig Strigeus 2018-10-07 19:41:58 +02:00
parent a05e3644b6
commit f2c1643635
65 changed files with 666 additions and 782 deletions

View file

@ -54,7 +54,7 @@
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="crypto\nasm.props" />
<Import Project="crypto\tools\nasm.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
@ -178,6 +178,12 @@
</ItemDefinitionGroup>
<ItemGroup>
<ClInclude Include="bit_ops.h" />
<ClInclude Include="crypto\blake2s\blake2s-load-sse2.h" />
<ClInclude Include="crypto\blake2s\blake2s-load-sse41.h" />
<ClInclude Include="crypto\blake2s\blake2s-load-xop.h" />
<ClInclude Include="crypto\blake2s\blake2s-round.h" />
<ClInclude Include="crypto\blake2s\blake2s-sse-impl.h" />
<ClInclude Include="crypto\curve25519\curve25519-donna.h" />
<ClInclude Include="ip_to_peer_map.h" />
<ClInclude Include="service_pipe_win32.h" />
<ClInclude Include="service_win32.h" />
@ -186,9 +192,9 @@
<ClInclude Include="tunsafe_config.h" />
<ClInclude Include="tunsafe_cpu.h" />
<ClInclude Include="crypto\aesgcm\aes.h" />
<ClInclude Include="crypto\blake2s.h" />
<ClInclude Include="crypto\blake2s\blake2s.h" />
<ClInclude Include="crypto\chacha20poly1305.h" />
<ClInclude Include="crypto\siphash.h" />
<ClInclude Include="crypto\siphash\siphash.h" />
<ClInclude Include="tunsafe_endian.h" />
<ClInclude Include="ipzip2\ipzip2.h" />
<ClInclude Include="netapi.h" />
@ -212,8 +218,7 @@
<ClCompile Include="service_win32.cpp" />
<ClCompile Include="tunsafe_cpu.cpp" />
<ClCompile Include="crypto\aesgcm\aesgcm.cpp" />
<ClCompile Include="crypto\blake2s_sse.cpp" />
<ClCompile Include="crypto\siphash.cpp" />
<ClCompile Include="crypto\siphash\siphash.cpp" />
<ClCompile Include="ipzip2\ipzip2.cpp" />
<ClCompile Include="network_win32_dnsblock.cpp" />
<ClCompile Include="tunsafe_threading.cpp" />
@ -221,7 +226,7 @@
<ClCompile Include="network_win32.cpp" />
<ClCompile Include="util_win32.cpp" />
<ClCompile Include="wireguard.cpp" />
<ClCompile Include="crypto\blake2s.cpp">
<ClCompile Include="crypto\blake2s\blake2s.cpp">
<PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">NotUsing</PrecompiledHeader>
<PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">NotUsing</PrecompiledHeader>
</ClCompile>
@ -229,7 +234,7 @@
<PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">NotUsing</PrecompiledHeader>
<PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">NotUsing</PrecompiledHeader>
</ClCompile>
<ClCompile Include="crypto\curve25519-donna.cpp">
<ClCompile Include="crypto\curve25519\curve25519-donna.cpp">
<PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">NotUsing</PrecompiledHeader>
<PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">NotUsing</PrecompiledHeader>
<PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
@ -255,34 +260,34 @@
<Image Include="icons\neutral-icon.ico" />
</ItemGroup>
<ItemGroup>
<NASM Include="crypto\aesgcm\aesni_gcm_x64_nasm.asm">
<NASM Include="crypto\aesgcm\aesni_gcm-x64-win.asm">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
</NASM>
<NASM Include="crypto\aesgcm\aesni_x64_nasm.asm">
<NASM Include="crypto\aesgcm\aesni-x64-win.asm">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
</NASM>
<NASM Include="crypto\aesgcm\ghash_x64_nasm.asm">
<NASM Include="crypto\aesgcm\ghash-x64-win.asm">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
</NASM>
<NASM Include="crypto\chacha20_x64.asm">
<NASM Include="crypto\chacha20\chacha20-x64-win.asm">
<FileType>Document</FileType>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
</NASM>
<NASM Include="crypto\curve25519_x64_nasm.asm">
<NASM Include="crypto\curve25519\curve25519-x64-win.asm">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
</NASM>
<NASM Include="crypto\poly1305_x64_nasm.asm">
<NASM Include="crypto\poly1305\poly1305-x64-win.asm">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
</NASM>
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="crypto\nasm.targets" />
<Import Project="crypto\tools\nasm.targets" />
</ImportGroup>
</Project>

View file

@ -14,6 +14,15 @@
<Filter Include="crypto\aesgcm">
<UniqueIdentifier>{d31b1b9f-4a2e-42d4-a26c-7c3daa4ccbe3}</UniqueIdentifier>
</Filter>
<Filter Include="crypto\blake2s">
<UniqueIdentifier>{45ab50f7-cde8-4d0b-9756-5bfa3b9a28db}</UniqueIdentifier>
</Filter>
<Filter Include="crypto\curve25519">
<UniqueIdentifier>{6adfd763-0197-437b-b7f6-2ffdd1e8e508}</UniqueIdentifier>
</Filter>
<Filter Include="crypto\chacha20poly1305">
<UniqueIdentifier>{1ca37c7b-e91e-4648-9584-7d0c73d8e416}</UniqueIdentifier>
</Filter>
</ItemGroup>
<ItemGroup>
<ClInclude Include="stdafx.h">
@ -41,12 +50,6 @@
<ClInclude Include="network_win32_api.h">
<Filter>Source Files\Win32</Filter>
</ClInclude>
<ClInclude Include="crypto\chacha20poly1305.h">
<Filter>crypto</Filter>
</ClInclude>
<ClInclude Include="crypto\blake2s.h">
<Filter>crypto</Filter>
</ClInclude>
<ClInclude Include="wireguard_config.h">
<Filter>Source Files</Filter>
</ClInclude>
@ -56,9 +59,6 @@
<ClInclude Include="ipzip2\ipzip2.h">
<Filter>Source Files</Filter>
</ClInclude>
<ClInclude Include="crypto\siphash.h">
<Filter>crypto</Filter>
</ClInclude>
<ClInclude Include="tunsafe_types.h">
<Filter>Source Files</Filter>
</ClInclude>
@ -95,6 +95,31 @@
<ClInclude Include="service_win32_constants.h">
<Filter>Source Files\Win32</Filter>
</ClInclude>
<ClInclude Include="crypto\siphash\siphash.h" />
<ClInclude Include="crypto\blake2s\blake2s.h">
<Filter>crypto\blake2s</Filter>
</ClInclude>
<ClInclude Include="crypto\blake2s\blake2s-load-sse2.h">
<Filter>crypto\blake2s</Filter>
</ClInclude>
<ClInclude Include="crypto\blake2s\blake2s-load-sse41.h">
<Filter>crypto\blake2s</Filter>
</ClInclude>
<ClInclude Include="crypto\blake2s\blake2s-load-xop.h">
<Filter>crypto\blake2s</Filter>
</ClInclude>
<ClInclude Include="crypto\blake2s\blake2s-round.h">
<Filter>crypto\blake2s</Filter>
</ClInclude>
<ClInclude Include="crypto\blake2s\blake2s-sse-impl.h">
<Filter>crypto\blake2s</Filter>
</ClInclude>
<ClInclude Include="crypto\curve25519\curve25519-donna.h">
<Filter>crypto\curve25519</Filter>
</ClInclude>
<ClInclude Include="crypto\chacha20poly1305.h">
<Filter>crypto\chacha20poly1305</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<ClCompile Include="stdafx.cpp">
@ -118,27 +143,12 @@
<ClCompile Include="network_win32.cpp">
<Filter>Source Files\Win32</Filter>
</ClCompile>
<ClCompile Include="crypto\blake2s.cpp">
<Filter>crypto</Filter>
</ClCompile>
<ClCompile Include="crypto\blake2s_sse.cpp">
<Filter>crypto</Filter>
</ClCompile>
<ClCompile Include="crypto\chacha20poly1305.cpp">
<Filter>crypto</Filter>
</ClCompile>
<ClCompile Include="crypto\curve25519-donna.cpp">
<Filter>crypto</Filter>
</ClCompile>
<ClCompile Include="wireguard_config.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="ipzip2\ipzip2.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="crypto\siphash.cpp">
<Filter>crypto</Filter>
</ClCompile>
<ClCompile Include="crypto\aesgcm\aesgcm.cpp">
<Filter>crypto\aesgcm</Filter>
</ClCompile>
@ -163,6 +173,18 @@
<ClCompile Include="service_pipe_win32.cpp">
<Filter>Source Files\Win32</Filter>
</ClCompile>
<ClCompile Include="crypto\siphash\siphash.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="crypto\blake2s\blake2s.cpp">
<Filter>crypto\blake2s</Filter>
</ClCompile>
<ClCompile Include="crypto\curve25519\curve25519-donna.cpp">
<Filter>crypto\curve25519</Filter>
</ClCompile>
<ClCompile Include="crypto\chacha20poly1305.cpp">
<Filter>crypto\chacha20poly1305</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ResourceCompile Include="TunSafe.rc" />
@ -174,23 +196,23 @@
<Image Include="downarrow.bmp" />
</ItemGroup>
<ItemGroup>
<NASM Include="crypto\chacha20_x64.asm">
<Filter>crypto</Filter>
</NASM>
<NASM Include="crypto\curve25519_x64_nasm.asm">
<Filter>crypto</Filter>
</NASM>
<NASM Include="crypto\poly1305_x64_nasm.asm">
<Filter>crypto</Filter>
</NASM>
<NASM Include="crypto\aesgcm\aesni_gcm_x64_nasm.asm">
<NASM Include="crypto\aesgcm\aesni_gcm-x64-win.asm">
<Filter>crypto\aesgcm</Filter>
</NASM>
<NASM Include="crypto\aesgcm\aesni_x64_nasm.asm">
<NASM Include="crypto\aesgcm\aesni-x64-win.asm">
<Filter>crypto\aesgcm</Filter>
</NASM>
<NASM Include="crypto\aesgcm\ghash_x64_nasm.asm">
<NASM Include="crypto\aesgcm\ghash-x64-win.asm">
<Filter>crypto\aesgcm</Filter>
</NASM>
<NASM Include="crypto\curve25519\curve25519-x64-win.asm">
<Filter>crypto\curve25519</Filter>
</NASM>
<NASM Include="crypto\chacha20\chacha20-x64-win.asm">
<Filter>crypto\chacha20poly1305</Filter>
</NASM>
<NASM Include="crypto\poly1305\poly1305-x64-win.asm">
<Filter>crypto\chacha20poly1305</Filter>
</NASM>
</ItemGroup>
</Project>

View file

@ -1,4 +1,5 @@
g++7 -I . -O2 -DNDEBUG -static -mssse3 -o tunsafe benchmark.cpp tunsafe_cpu.cpp wireguard_config.cpp ip_to_peer_map.cpp tunsafe_threading.cpp \
wireguard.cpp wireguard_proto.cpp ts.cpp util.cpp network_bsd.cpp network_bsd_common.cpp \
crypto/blake2s.cpp crypto/blake2s_sse.cpp crypto/chacha20poly1305.cpp crypto/curve25519-donna.cpp \
crypto/siphash.cpp crypto/chacha20_x64_gas.s crypto/poly1305_x64_gas.s ipzip2/ipzip2.cpp -lrt -pthread
g++7 -I . -O2 -DNDEBUG -DWITH_NETWORK_BSD=1 -static -mssse3 -o tunsafe \
tunsafe_amalgam.cpp \
crypto/chacha20/chacha20-x64-linux.s \
crypto/poly1305/poly1305-x64-linux.s \
-lrt -pthread

View file

@ -1,9 +1,10 @@
#!/bin/sh
clang++-6.0 -c -march=skylake-avx512 crypto/poly1305_x64_gas.s crypto/chacha20_x64_gas.s
clang++-6.0 -I . -O3 -DNDEBUG -mssse3 -pthread -lrt -o tunsafe util.cpp wireguard_config.cpp wireguard.cpp ts.cpp ip_to_peer_map.cpp tunsafe_threading.cpp \
wireguard_proto.cpp network_bsd.cpp network_bsd_common.cpp tunsafe_cpu.cpp benchmark.cpp crypto/blake2s.cpp crypto/blake2s_sse.cpp crypto/chacha20poly1305.cpp \
crypto/curve25519-donna.cpp crypto/siphash.cpp chacha20_x64_gas.o crypto/aesgcm/aesni_gcm_x64_gas.s \
crypto/aesgcm/aesni_x64_gas.s crypto/aesgcm/aesgcm.cpp poly1305_x64_gas.o ipzip2/ipzip2.cpp \
crypto/aesgcm/ghash_x64_gas.s
set -e
clang++-6.0 -c -march=skylake-avx512 crypto/poly1305/poly1305-x64-linux.s crypto/chacha20/chacha20-x64-linux.s
clang++-6.0 -I . -O3 -DNDEBUG -DWITH_NETWORK_BSD=1 -mssse3 -pthread -lrt -o tunsafe \
tunsafe_amalgam.cpp \
crypto/aesgcm/aesni_gcm-x64-linux.s \
crypto/aesgcm/aesni-x64-linux.s \
crypto/aesgcm/ghash-x64-linux.s \
chacha20-x64-linux.o \
poly1305-x64-linux.o \

View file

@ -1,11 +1,10 @@
#!/bin/sh
set -e
#cpp -D__ARM_ARCH__=7 crypto/chacha20/chacha20-arm.s > crypto/chacha20/chacha20-arm.preprocessed.s
#cpp -D__ARM_ARCH__=7 crypto/poly1305/poly1305-arm.s > crypto/poly1305/poly1305-arm.preprocessed.s
cpp -D__ARM_ARCH__=7 crypto/chacha20/chacha20-arm.s > crypto/chacha20/chacha20-arm.preprocessed.s
cpp -D__ARM_ARCH__=7 crypto/poly1305/poly1305-arm.s > crypto/poly1305/poly1305-arm.preprocessed.s
g++-6 -mfpu=neon -I . -g -O2 -DNDEBUG -fno-omit-frame-pointer -march=armv7-a -mthumb -std=c++11 -pthread -lrt -o tunsafe util.cpp wireguard_config.cpp wireguard.cpp ip_to_peer_map.cpp tunsafe_threading.cpp \
wireguard_proto.cpp network_bsd.cpp network_bsd_common.cpp tunsafe_cpu.cpp benchmark.cpp crypto/blake2s.cpp crypto/chacha20poly1305.cpp \
crypto/curve25519-donna.cpp crypto/siphash.cpp crypto/aesgcm/aesgcm.cpp ipzip2/ipzip2.cpp \
crypto/chacha20/chacha20-arm.preprocessed.s crypto/poly1305/poly1305-arm.preprocessed.s
g++-6 -mfpu=neon -I . -g -O2 -DNDEBUG -DWITH_NETWORK_BSD=1 -fno-omit-frame-pointer -march=armv7-a -mthumb -std=c++11 -pthread -lrt -o tunsafe \
tunsafe_amalgam.cpp \
crypto/chacha20/chacha20-arm-linux.S \
crypto/poly1305/poly1305-arm-linux.S \

View file

@ -1,14 +1,14 @@
set -e
clang++ -c -mavx512f -mavx512vl crypto/poly1305/poly1305-x64-osx.s crypto/chacha20/chacha20-x64-osx.s
clang++ -c -mavx512f -mavx512vl crypto/poly1305_x64_gas_macosx.s crypto/chacha20_x64_gas_macosx.s
clang++ -g -O3 -I . -std=c++11 -DNDEBUG=1 -Wno-deprecated-declarations -fno-exceptions -fno-rtti -ffunction-sections -o tunsafe \
wireguard_config.cpp ip_to_peer_map.cpp tunsafe_threading.cpp wireguard.cpp wireguard_proto.cpp ts.cpp util.cpp network_bsd.cpp network_bsd_common.cpp benchmark.cpp tunsafe_cpu.cpp \
crypto/blake2s.cpp crypto/blake2s_sse.cpp crypto/chacha20poly1305.cpp crypto/curve25519-donna.cpp \
crypto/siphash.cpp crypto/aesgcm/aesgcm.cpp ipzip2/ipzip2.cpp \
crypto/aesgcm/aesni_gcm_x64_gas_macosx.s crypto/aesgcm/aesni_x64_gas_macosx.s crypto/aesgcm/ghash_x64_gas_macosx.s \
chacha20_x64_gas_macosx.o poly1305_x64_gas_macosx.o
clang++ -g -O3 -I . -std=c++11 -DWITH_NETWORK_BSD=1 -DNDEBUG=1 -Wno-deprecated-declarations -fno-exceptions -fno-rtti -ffunction-sections -o tunsafe \
tunsafe_amalgam.cpp \
crypto/aesgcm/aesni_gcm-x64-osx.s \
crypto/aesgcm/aesni-x64-osx.s \
crypto/aesgcm/ghash-x64-osx.s \
chacha20-x64-osx.o \
poly1305-x64-osx.o
cp tunsafe tunsafe.unstripped
strip tunsafe

View file

@ -203,7 +203,7 @@ $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../x86_64-xlate.pl" and -f $xlate) or
( $xlate="${dir}../tools/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";

View file

@ -50,7 +50,7 @@ $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../x86_64-xlate.pl" and -f $xlate) or
( $xlate="${dir}../tools/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
# |$avx| in ghash-x86_64.pl must be set to at least 1; otherwise tags will

View file

@ -99,7 +99,7 @@ $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../x86_64-xlate.pl" and -f $xlate) or
( $xlate="${dir}../tools/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
# See the notes about |$avx| in aesni-gcm-x86_64.pl; otherwise tags will be

10
crypto/aesgcm/make.sh Normal file
View file

@ -0,0 +1,10 @@
#!/bin/sh
perl aesni_gcm-x64.pl macosx > aesni_gcm-x64-osx.s
perl aesni-x64.pl macosx > aesni-x64-osx.s
perl ghash-x64.pl macosx > ghash-x64-osx.s
perl aesni_gcm-x64.pl gas > aesni_gcm-x64-linux.s
perl aesni-x64.pl gas > aesni-x64-linux.s
perl ghash-x64.pl gas > ghash-x64-linux.s

View file

@ -39,7 +39,7 @@
#endif
#define G1(row1,row2,row3,row4,buf) \
#define BLAKE2S_G1(row1,row2,row3,row4,buf) \
row1 = _mm_add_epi32( _mm_add_epi32( row1, buf), row2 ); \
row4 = _mm_xor_si128( row4, row1 ); \
row4 = _mm_roti_epi32(row4, -16); \
@ -47,7 +47,7 @@
row2 = _mm_xor_si128( row2, row3 ); \
row2 = _mm_roti_epi32(row2, -12);
#define G2(row1,row2,row3,row4,buf) \
#define BLAKE2S_G2(row1,row2,row3,row4,buf) \
row1 = _mm_add_epi32( _mm_add_epi32( row1, buf), row2 ); \
row4 = _mm_xor_si128( row4, row1 ); \
row4 = _mm_roti_epi32(row4, -8); \
@ -55,12 +55,12 @@
row2 = _mm_xor_si128( row2, row3 ); \
row2 = _mm_roti_epi32(row2, -7);
#define DIAGONALIZE(row1,row2,row3,row4) \
#define BLAKE2S_DIAGONALIZE(row1,row2,row3,row4) \
row4 = _mm_shuffle_epi32( row4, _MM_SHUFFLE(2,1,0,3) ); \
row3 = _mm_shuffle_epi32( row3, _MM_SHUFFLE(1,0,3,2) ); \
row2 = _mm_shuffle_epi32( row2, _MM_SHUFFLE(0,3,2,1) );
#define UNDIAGONALIZE(row1,row2,row3,row4) \
#define BLAKE2S_UNDIAGONALIZE(row1,row2,row3,row4) \
row4 = _mm_shuffle_epi32( row4, _MM_SHUFFLE(0,3,2,1) ); \
row3 = _mm_shuffle_epi32( row3, _MM_SHUFFLE(1,0,3,2) ); \
row2 = _mm_shuffle_epi32( row2, _MM_SHUFFLE(2,1,0,3) );
@ -73,16 +73,16 @@
#include "blake2s-load-sse2.h"
#endif
#define ROUND(r) \
#define BLAKE2S_ROUND_SSE(r) \
LOAD_MSG_ ##r ##_1(buf1); \
G1(row1,row2,row3,row4,buf1); \
BLAKE2S_G1(row1,row2,row3,row4,buf1); \
LOAD_MSG_ ##r ##_2(buf2); \
G2(row1,row2,row3,row4,buf2); \
DIAGONALIZE(row1,row2,row3,row4); \
BLAKE2S_G2(row1,row2,row3,row4,buf2); \
BLAKE2S_DIAGONALIZE(row1,row2,row3,row4); \
LOAD_MSG_ ##r ##_3(buf3); \
G1(row1,row2,row3,row4,buf3); \
BLAKE2S_G1(row1,row2,row3,row4,buf3); \
LOAD_MSG_ ##r ##_4(buf4); \
G2(row1,row2,row3,row4,buf4); \
UNDIAGONALIZE(row1,row2,row3,row4); \
BLAKE2S_G2(row1,row2,row3,row4,buf4); \
BLAKE2S_UNDIAGONALIZE(row1,row2,row3,row4); \
#endif

View file

@ -0,0 +1,85 @@
/*
BLAKE2 reference source code package - optimized C implementations
Copyright 2012, Samuel Neves <sneves@dei.uc.pt>. You may use this under the
terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at
your option. The terms of these licenses can be found at:
- CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
- OpenSSL license : https://www.openssl.org/source/license.html
- Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0
More information about the BLAKE2 hash function can be found at
https://blake2.net.
*/
#include <emmintrin.h>
#if defined(HAVE_SSSE3)
#include <tmmintrin.h>
#endif
#if defined(HAVE_SSE41)
#include <smmintrin.h>
#endif
#if defined(HAVE_AVX)
#include <immintrin.h>
#endif
#if defined(HAVE_XOP)
#include <x86intrin.h>
#endif
#include "blake2s-round.h"
void blake2s_compress_sse( blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES] ) {
__m128i row1, row2, row3, row4;
__m128i buf1, buf2, buf3, buf4;
#if defined(HAVE_SSE41)
__m128i t0, t1;
#if !defined(HAVE_XOP)
__m128i t2;
#endif
#endif
__m128i ff0, ff1;
#if defined(HAVE_SSSE3) && !defined(HAVE_XOP)
const __m128i r8 = _mm_set_epi8( 12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1 );
const __m128i r16 = _mm_set_epi8( 13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2 );
#endif
#if defined(HAVE_SSE41)
const __m128i m0 = LOADU( block + 00 );
const __m128i m1 = LOADU( block + 16 );
const __m128i m2 = LOADU( block + 32 );
const __m128i m3 = LOADU( block + 48 );
#else
const uint32_t m0 = load32(block + 0 * sizeof(uint32_t));
const uint32_t m1 = load32(block + 1 * sizeof(uint32_t));
const uint32_t m2 = load32(block + 2 * sizeof(uint32_t));
const uint32_t m3 = load32(block + 3 * sizeof(uint32_t));
const uint32_t m4 = load32(block + 4 * sizeof(uint32_t));
const uint32_t m5 = load32(block + 5 * sizeof(uint32_t));
const uint32_t m6 = load32(block + 6 * sizeof(uint32_t));
const uint32_t m7 = load32(block + 7 * sizeof(uint32_t));
const uint32_t m8 = load32(block + 8 * sizeof(uint32_t));
const uint32_t m9 = load32(block + 9 * sizeof(uint32_t));
const uint32_t m10 = load32(block + 10 * sizeof(uint32_t));
const uint32_t m11 = load32(block + 11 * sizeof(uint32_t));
const uint32_t m12 = load32(block + 12 * sizeof(uint32_t));
const uint32_t m13 = load32(block + 13 * sizeof(uint32_t));
const uint32_t m14 = load32(block + 14 * sizeof(uint32_t));
const uint32_t m15 = load32(block + 15 * sizeof(uint32_t));
#endif
row1 = ff0 = LOADU( &S->h[0] );
row2 = ff1 = LOADU( &S->h[4] );
row3 = _mm_loadu_si128( (__m128i const *)&blake2s_IV[0] );
row4 = _mm_xor_si128( _mm_loadu_si128( (__m128i const *)&blake2s_IV[4] ), LOADU( &S->t[0] ) );
BLAKE2S_ROUND_SSE( 0 );
BLAKE2S_ROUND_SSE( 1 );
BLAKE2S_ROUND_SSE( 2 );
BLAKE2S_ROUND_SSE( 3 );
BLAKE2S_ROUND_SSE( 4 );
BLAKE2S_ROUND_SSE( 5 );
BLAKE2S_ROUND_SSE( 6 );
BLAKE2S_ROUND_SSE( 7 );
BLAKE2S_ROUND_SSE( 8 );
BLAKE2S_ROUND_SSE( 9 );
STOREU( &S->h[0], _mm_xor_si128( ff0, _mm_xor_si128( row1, row3 ) ) );
STOREU( &S->h[4], _mm_xor_si128( ff1, _mm_xor_si128( row2, row4 ) ) );
}

8
crypto/blake2s.cpp → crypto/blake2s/blake2s.cpp Executable file → Normal file
View file

@ -26,8 +26,6 @@ https://blake2.net.
#define BLAKE2S_WITH_ASM 1
#endif // BLAKE2S_WITH_ASM
void blake2s_compress_sse(blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES]);
#if !defined(__cplusplus) && (!defined(__STDC_VERSION__) || __STDC_VERSION__ < 199901L)
#if defined(_MSC_VER)
#define BLAKE2_INLINE __inline
@ -246,6 +244,12 @@ static void blake2s_compress(blake2s_state *S, const uint8_t in[BLAKE2S_BLOCKBYT
#undef G
#undef ROUND
#if defined(ARCH_CPU_X86_FAMILY)
#include "blake2s-sse-impl.h"
#endif
static inline void blake2s_compress_impl(blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES]) {
#if defined(ARCH_CPU_X86_64) && BLAKE2S_WITH_ASM
blake2s_compress_sse(S, block);

View file

@ -1,399 +0,0 @@
/*
BLAKE2 reference source code package - optimized C implementations
Copyright 2012, Samuel Neves <sneves@dei.uc.pt>. You may use this under the
terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at
your option. The terms of these licenses can be found at:
- CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
- OpenSSL license : https://www.openssl.org/source/license.html
- Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0
More information about the BLAKE2 hash function can be found at
https://blake2.net.
*/
#include "stdafx.h"
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "blake2s.h"
#include "crypto_ops.h"
#include <emmintrin.h>
#if defined(HAVE_SSSE3)
#include <tmmintrin.h>
#endif
#if defined(HAVE_SSE41)
#include <smmintrin.h>
#endif
#if defined(HAVE_AVX)
#include <immintrin.h>
#endif
#if defined(HAVE_XOP)
#include <x86intrin.h>
#endif
#include "blake2s-round.h"
#if !defined(__cplusplus) && (!defined(__STDC_VERSION__) || __STDC_VERSION__ < 199901L)
#if defined(_MSC_VER)
#define BLAKE2_INLINE __inline
#elif defined(__GNUC__)
#define BLAKE2_INLINE __inline__
#else
#define BLAKE2_INLINE
#endif
#else
#define BLAKE2_INLINE inline
#endif
static BLAKE2_INLINE uint32_t load32(const void *src) {
#if defined(ARCH_CPU_LITTLE_ENDIAN)
uint32_t w;
memcpy(&w, src, sizeof w);
return w;
#else
const uint8_t *p = (const uint8_t *)src;
return ((uint32_t)(p[0]) << 0) |
((uint32_t)(p[1]) << 8) |
((uint32_t)(p[2]) << 16) |
((uint32_t)(p[3]) << 24);
#endif
}
static BLAKE2_INLINE void store32(void *dst, uint32_t w) {
#if defined(ARCH_CPU_LITTLE_ENDIAN)
memcpy(dst, &w, sizeof w);
#else
uint8_t *p = (uint8_t *)dst;
p[0] = (uint8_t)(w >> 0);
p[1] = (uint8_t)(w >> 8);
p[2] = (uint8_t)(w >> 16);
p[3] = (uint8_t)(w >> 24);
#endif
}
static const uint32_t blake2s_IV[8] =
{
0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL,
0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL
};
/* Some helper functions */
static void blake2s_set_lastnode( blake2s_state *S )
{
S->f[1] = (uint32_t)-1;
}
static int blake2s_is_lastblock( const blake2s_state *S )
{
return S->f[0] != 0;
}
static void blake2s_set_lastblock( blake2s_state *S )
{
if( S->last_node ) blake2s_set_lastnode( S );
S->f[0] = (uint32_t)-1;
}
static void blake2s_increment_counter( blake2s_state *S, const uint32_t inc )
{
uint64_t t = ( ( uint64_t )S->t[1] << 32 ) | S->t[0];
t += inc;
S->t[0] = ( uint32_t )( t >> 0 );
S->t[1] = ( uint32_t )( t >> 32 );
}
/* init2 xors IV with input parameter block */
#if 0
void blake2s_init_param( blake2s_state *S, const blake2s_param *P )
{
size_t i;
/*blake2s_init0( S ); */
const uint8_t * v = ( const uint8_t * )( blake2s_IV );
const uint8_t * p = ( const uint8_t * )( P );
uint8_t * h = ( uint8_t * )( S->h );
/* IV XOR ParamBlock */
memset( S, 0, sizeof( blake2s_state ) );
for( i = 0; i < BLAKE2S_OUTBYTES; ++i ) h[i] = v[i] ^ p[i];
S->outlen = P->digest_length;
}
/* Some sort of default parameter block initialization, for sequential blake2s */
void blake2s_init( blake2s_state *S, size_t outlen )
{
blake2s_param P[1];
assert(outlen && outlen <= BLAKE2S_OUTBYTES);
P->digest_length = (uint8_t)outlen;
P->key_length = 0;
P->fanout = 1;
P->depth = 1;
store32( &P->leaf_length, 0 );
store32( &P->node_offset, 0 );
store16( &P->xof_length, 0 );
P->node_depth = 0;
P->inner_length = 0;
/* memset(P->reserved, 0, sizeof(P->reserved) ); */
memset( P->salt, 0, sizeof( P->salt ) );
memset( P->personal, 0, sizeof( P->personal ) );
blake2s_init_param( S, P );
}
int blake2s_init_key( blake2s_state *S, size_t outlen, const void *key, size_t keylen )
{
blake2s_param P[1];
/* Move interval verification here? */
if ( ( !outlen ) || ( outlen > BLAKE2S_OUTBYTES ) ) return -1;
if ( ( !key ) || ( !keylen ) || keylen > BLAKE2S_KEYBYTES ) return -1;
P->digest_length = (uint8_t)outlen;
P->key_length = (uint8_t)keylen;
P->fanout = 1;
P->depth = 1;
store32( &P->leaf_length, 0 );
store32( &P->node_offset, 0 );
store16( &P->xof_length, 0 );
P->node_depth = 0;
P->inner_length = 0;
/* memset(P->reserved, 0, sizeof(P->reserved) ); */
memset( P->salt, 0, sizeof( P->salt ) );
memset( P->personal, 0, sizeof( P->personal ) );
if( blake2s_init_param( S, P ) < 0 )
return -1;
{
uint8_t block[BLAKE2S_BLOCKBYTES];
memset( block, 0, BLAKE2S_BLOCKBYTES );
memcpy( block, key, keylen );
blake2s_update( S, block, BLAKE2S_BLOCKBYTES );
memzero_crypto( block, BLAKE2S_BLOCKBYTES ); /* Burn the key from stack */
}
return 0;
}
#endif
void blake2s_compress_sse( blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES] )
{
__m128i row1, row2, row3, row4;
__m128i buf1, buf2, buf3, buf4;
#if defined(HAVE_SSE41)
__m128i t0, t1;
#if !defined(HAVE_XOP)
__m128i t2;
#endif
#endif
__m128i ff0, ff1;
#if defined(HAVE_SSSE3) && !defined(HAVE_XOP)
const __m128i r8 = _mm_set_epi8( 12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1 );
const __m128i r16 = _mm_set_epi8( 13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2 );
#endif
#if defined(HAVE_SSE41)
const __m128i m0 = LOADU( block + 00 );
const __m128i m1 = LOADU( block + 16 );
const __m128i m2 = LOADU( block + 32 );
const __m128i m3 = LOADU( block + 48 );
#else
const uint32_t m0 = load32(block + 0 * sizeof(uint32_t));
const uint32_t m1 = load32(block + 1 * sizeof(uint32_t));
const uint32_t m2 = load32(block + 2 * sizeof(uint32_t));
const uint32_t m3 = load32(block + 3 * sizeof(uint32_t));
const uint32_t m4 = load32(block + 4 * sizeof(uint32_t));
const uint32_t m5 = load32(block + 5 * sizeof(uint32_t));
const uint32_t m6 = load32(block + 6 * sizeof(uint32_t));
const uint32_t m7 = load32(block + 7 * sizeof(uint32_t));
const uint32_t m8 = load32(block + 8 * sizeof(uint32_t));
const uint32_t m9 = load32(block + 9 * sizeof(uint32_t));
const uint32_t m10 = load32(block + 10 * sizeof(uint32_t));
const uint32_t m11 = load32(block + 11 * sizeof(uint32_t));
const uint32_t m12 = load32(block + 12 * sizeof(uint32_t));
const uint32_t m13 = load32(block + 13 * sizeof(uint32_t));
const uint32_t m14 = load32(block + 14 * sizeof(uint32_t));
const uint32_t m15 = load32(block + 15 * sizeof(uint32_t));
#endif
row1 = ff0 = LOADU( &S->h[0] );
row2 = ff1 = LOADU( &S->h[4] );
row3 = _mm_loadu_si128( (__m128i const *)&blake2s_IV[0] );
row4 = _mm_xor_si128( _mm_loadu_si128( (__m128i const *)&blake2s_IV[4] ), LOADU( &S->t[0] ) );
ROUND( 0 );
ROUND( 1 );
ROUND( 2 );
ROUND( 3 );
ROUND( 4 );
ROUND( 5 );
ROUND( 6 );
ROUND( 7 );
ROUND( 8 );
ROUND( 9 );
STOREU( &S->h[0], _mm_xor_si128( ff0, _mm_xor_si128( row1, row3 ) ) );
STOREU( &S->h[4], _mm_xor_si128( ff1, _mm_xor_si128( row2, row4 ) ) );
}
#if 0
int blake2s_update( blake2s_state *S, const void *pin, size_t inlen )
{
const unsigned char * in = (const unsigned char *)pin;
if( inlen > 0 )
{
size_t left = S->buflen;
size_t fill = BLAKE2S_BLOCKBYTES - left;
if( inlen > fill )
{
S->buflen = 0;
memcpy( S->buf + left, in, fill ); /* Fill buffer */
blake2s_increment_counter( S, BLAKE2S_BLOCKBYTES );
blake2s_compress( S, S->buf ); /* Compress */
in += fill; inlen -= fill;
while(inlen > BLAKE2S_BLOCKBYTES) {
blake2s_increment_counter(S, BLAKE2S_BLOCKBYTES);
blake2s_compress( S, in );
in += BLAKE2S_BLOCKBYTES;
inlen -= BLAKE2S_BLOCKBYTES;
}
}
memcpy( S->buf + S->buflen, in, inlen );
S->buflen += inlen;
}
return 0;
}
int blake2s_final( blake2s_state *S, void *out, size_t outlen )
{
uint8_t buffer[BLAKE2S_OUTBYTES] = {0};
size_t i;
if( out == NULL || outlen < S->outlen )
return -1;
if( blake2s_is_lastblock( S ) )
return -1;
blake2s_increment_counter( S, (uint32_t)S->buflen );
blake2s_set_lastblock( S );
memset( S->buf + S->buflen, 0, BLAKE2S_BLOCKBYTES - S->buflen ); /* Padding */
blake2s_compress( S, S->buf );
for( i = 0; i < 8; ++i ) /* Output full hash to temp buffer */
store32( buffer + sizeof( S->h[i] ) * i, S->h[i] );
memcpy( out, buffer, S->outlen );
memzero_crypto( buffer, sizeof(buffer) );
return 0;
}
/* inlen, at least, should be uint64_t. Others can be size_t. */
int blake2s( void *out, size_t outlen, const void *in, size_t inlen, const void *key, size_t keylen )
{
blake2s_state S[1];
/* Verify parameters */
if ( NULL == in && inlen > 0 ) return -1;
if ( NULL == out ) return -1;
if ( NULL == key && keylen > 0) return -1;
if( !outlen || outlen > BLAKE2S_OUTBYTES ) return -1;
if( keylen > BLAKE2S_KEYBYTES ) return -1;
if( keylen > 0 )
{
if( blake2s_init_key( S, outlen, key, keylen ) < 0 ) return -1;
}
else
{
if( blake2s_init( S, outlen ) < 0 ) return -1;
}
blake2s_update( S, ( const uint8_t * )in, inlen );
blake2s_final( S, out, outlen );
return 0;
}
#endif
#if defined(SUPERCOP)
int crypto_hash( unsigned char *out, unsigned char *in, unsigned long long inlen )
{
return blake2s( out, BLAKE2S_OUTBYTES, in, inlen, NULL, 0 );
}
#endif
#if defined(BLAKE2S_SELFTEST)
#include <string.h>
#include "blake2-kat.h"
int main( void )
{
uint8_t key[BLAKE2S_KEYBYTES];
uint8_t buf[BLAKE2_KAT_LENGTH];
size_t i, step;
for( i = 0; i < BLAKE2S_KEYBYTES; ++i )
key[i] = ( uint8_t )i;
for( i = 0; i < BLAKE2_KAT_LENGTH; ++i )
buf[i] = ( uint8_t )i;
/* Test simple API */
for( i = 0; i < BLAKE2_KAT_LENGTH; ++i )
{
uint8_t hash[BLAKE2S_OUTBYTES];
blake2s( hash, BLAKE2S_OUTBYTES, buf, i, key, BLAKE2S_KEYBYTES );
if( 0 != memcmp( hash, blake2s_keyed_kat[i], BLAKE2S_OUTBYTES ) )
{
goto fail;
}
}
/* Test streaming API */
for(step = 1; step < BLAKE2S_BLOCKBYTES; ++step) {
for (i = 0; i < BLAKE2_KAT_LENGTH; ++i) {
uint8_t hash[BLAKE2S_OUTBYTES];
blake2s_state S;
uint8_t * p = buf;
size_t mlen = i;
int err = 0;
if( (err = blake2s_init_key(&S, BLAKE2S_OUTBYTES, key, BLAKE2S_KEYBYTES)) < 0 ) {
goto fail;
}
while (mlen >= step) {
if ( (err = blake2s_update(&S, p, step)) < 0 ) {
goto fail;
}
mlen -= step;
p += step;
}
if ( (err = blake2s_update(&S, p, mlen)) < 0) {
goto fail;
}
if ( (err = blake2s_final(&S, hash, BLAKE2S_OUTBYTES)) < 0) {
goto fail;
}
if (0 != memcmp(hash, blake2s_keyed_kat[i], BLAKE2S_OUTBYTES)) {
goto fail;
}
}
}
puts( "ok" );
return 0;
fail:
puts("error");
return -1;
}
#endif

View file

@ -6,6 +6,8 @@
/*#include <linux/linkage.h>*/
#define __ARM_ARCH__ 7
.text
#if defined(__thumb2__) || defined(__clang__)
.syntax unified

View file

@ -41,7 +41,7 @@ else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
if ($flavour && $flavour ne "void") {
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}../arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
( $xlate="${dir}../tools/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open STDOUT,"| \"$^X\" $xlate $flavour $output";

View file

@ -1,26 +1,23 @@
/* SPDX-License-Identifier: OpenSSL OR (BSD-3-Clause OR GPL-2.0)
*
* Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
* Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
*/
#include <linux/linkage.h>
.text
.align 5
.Lsigma:
Lsigma:
.quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral
.Lone:
Lone:
.long 1,0,0,0
.globl _chacha20_arm
.globl _chacha20_neon
.align 5
ENTRY(chacha20_arm)
cbz x2,.Labort
.Lshort:
_chacha20_arm:
cbz x2,Labort
Lshort:
stp x29,x30,[sp,#-96]!
add x29,sp,#0
adr x5,.Lsigma
adr x5,Lsigma
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
stp x23,x24,[sp,#48]
@ -41,7 +38,7 @@ ENTRY(chacha20_arm)
ror x30,x30,#32
#endif
.Loop_outer:
Loop_outer:
mov w5,w22 // unpack key block
lsr x6,x22,#32
mov w7,w23
@ -61,7 +58,7 @@ ENTRY(chacha20_arm)
mov x4,#10
subs x2,x2,#64
.Loop:
Loop:
sub x4,x4,#1
add w5,w5,w9
add w6,w6,w10
@ -159,7 +156,7 @@ ENTRY(chacha20_arm)
ror w11,w11,#25
ror w12,w12,#25
ror w9,w9,#25
cbnz x4,.Loop
cbnz x4,Loop
add w5,w5,w22 // accumulate key block
add x6,x6,x22,lsr#32
@ -178,7 +175,7 @@ ENTRY(chacha20_arm)
add w20,w20,w30
add x21,x21,x30,lsr#32
b.lo .Ltail
b.lo Ltail
add x5,x5,x6,lsl#32 // pack
add x7,x7,x8,lsl#32
@ -219,7 +216,7 @@ ENTRY(chacha20_arm)
stp x17,x20,[x0,#48]
add x0,x0,#64
b.hi .Loop_outer
b.hi Loop_outer
ldp x19,x20,[x29,#16]
add sp,sp,#64
@ -228,13 +225,13 @@ ENTRY(chacha20_arm)
ldp x25,x26,[x29,#64]
ldp x27,x28,[x29,#80]
ldp x29,x30,[sp],#96
.Labort:
Labort:
ret
.align 4
.Ltail:
Ltail:
add x2,x2,#64
.Less_than_64:
Less_than_64:
sub x0,x0,#1
add x1,x1,x2
add x0,x0,x2
@ -264,13 +261,13 @@ ENTRY(chacha20_arm)
stp x13,x15,[sp,#32]
stp x17,x20,[sp,#48]
.Loop_tail:
Loop_tail:
ldrb w10,[x1,x2]
ldrb w11,[x4,x2]
add x2,x2,#1
eor w10,w10,w11
strb w10,[x0,x2]
cbnz x2,.Loop_tail
cbnz x2,Loop_tail
stp xzr,xzr,[sp,#0]
stp xzr,xzr,[sp,#16]
@ -285,25 +282,26 @@ ENTRY(chacha20_arm)
ldp x27,x28,[x29,#80]
ldp x29,x30,[sp],#96
ret
ENDPROC(chacha20_arm)
.align 5
ENTRY(chacha20_neon)
cbz x2,.Labort_neon
_chacha20_neon:
cbz x2,Labort_neon
cmp x2,#192
b.lo .Lshort
b.lo Lshort
stp x29,x30,[sp,#-96]!
add x29,sp,#0
adr x5,.Lsigma
adr x5,Lsigma
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
stp x23,x24,[sp,#48]
stp x25,x26,[sp,#64]
stp x27,x28,[sp,#80]
cmp x2,#512
b.hs .L512_or_more_neon
b.hs L512_or_more_neon
sub sp,sp,#64
@ -329,7 +327,7 @@ ENTRY(chacha20_neon)
add v29.4s,v28.4s,v31.4s
shl v31.4s,v31.4s,#2 // 1 -> 4
.Loop_outer_neon:
Loop_outer_neon:
mov w5,w22 // unpack key block
lsr x6,x22,#32
mov v0.16b,v24.16b
@ -361,7 +359,7 @@ ENTRY(chacha20_neon)
mov x4,#10
subs x2,x2,#256
.Loop_neon:
Loop_neon:
sub x4,x4,#1
add v0.4s,v0.4s,v1.4s
add w5,w5,w9
@ -567,7 +565,7 @@ ENTRY(chacha20_neon)
ext v1.16b,v1.16b,v1.16b,#12
ext v5.16b,v5.16b,v5.16b,#12
ext v17.16b,v17.16b,v17.16b,#12
cbnz x4,.Loop_neon
cbnz x4,Loop_neon
add w5,w5,w22 // accumulate key block
add v0.4s,v0.4s,v24.4s
@ -598,7 +596,7 @@ ENTRY(chacha20_neon)
add x21,x21,x30,lsr#32
add v17.4s,v17.4s,v25.4s
b.lo .Ltail_neon
b.lo Ltail_neon
add x5,x5,x6,lsl#32 // pack
add x7,x7,x8,lsl#32
@ -663,7 +661,7 @@ ENTRY(chacha20_neon)
eor v19.16b,v19.16b,v3.16b
st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
b.hi .Loop_outer_neon
b.hi Loop_outer_neon
ldp x19,x20,[x29,#16]
add sp,sp,#64
@ -674,10 +672,10 @@ ENTRY(chacha20_neon)
ldp x29,x30,[sp],#96
ret
.Ltail_neon:
Ltail_neon:
add x2,x2,#256
cmp x2,#64
b.lo .Less_than_64
b.lo Less_than_64
add x5,x5,x6,lsl#32 // pack
add x7,x7,x8,lsl#32
@ -717,10 +715,10 @@ ENTRY(chacha20_neon)
stp x13,x15,[x0,#32]
stp x17,x20,[x0,#48]
add x0,x0,#64
b.eq .Ldone_neon
b.eq Ldone_neon
sub x2,x2,#64
cmp x2,#64
b.lo .Less_than_128
b.lo Less_than_128
ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
eor v0.16b,v0.16b,v20.16b
@ -728,10 +726,10 @@ ENTRY(chacha20_neon)
eor v2.16b,v2.16b,v22.16b
eor v3.16b,v3.16b,v23.16b
st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
b.eq .Ldone_neon
b.eq Ldone_neon
sub x2,x2,#64
cmp x2,#64
b.lo .Less_than_192
b.lo Less_than_192
ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
eor v4.16b,v4.16b,v20.16b
@ -739,41 +737,41 @@ ENTRY(chacha20_neon)
eor v6.16b,v6.16b,v22.16b
eor v7.16b,v7.16b,v23.16b
st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
b.eq .Ldone_neon
b.eq Ldone_neon
sub x2,x2,#64
st1 {v16.16b,v17.16b,v18.16b,v19.16b},[sp]
b .Last_neon
b Last_neon
.Less_than_128:
Less_than_128:
st1 {v0.16b,v1.16b,v2.16b,v3.16b},[sp]
b .Last_neon
.Less_than_192:
b Last_neon
Less_than_192:
st1 {v4.16b,v5.16b,v6.16b,v7.16b},[sp]
b .Last_neon
b Last_neon
.align 4
.Last_neon:
Last_neon:
sub x0,x0,#1
add x1,x1,x2
add x0,x0,x2
add x4,sp,x2
neg x2,x2
.Loop_tail_neon:
Loop_tail_neon:
ldrb w10,[x1,x2]
ldrb w11,[x4,x2]
add x2,x2,#1
eor w10,w10,w11
strb w10,[x0,x2]
cbnz x2,.Loop_tail_neon
cbnz x2,Loop_tail_neon
stp xzr,xzr,[sp,#0]
stp xzr,xzr,[sp,#16]
stp xzr,xzr,[sp,#32]
stp xzr,xzr,[sp,#48]
.Ldone_neon:
Ldone_neon:
ldp x19,x20,[x29,#16]
add sp,sp,#64
ldp x21,x22,[x29,#32]
@ -781,9 +779,11 @@ ENTRY(chacha20_neon)
ldp x25,x26,[x29,#64]
ldp x27,x28,[x29,#80]
ldp x29,x30,[sp],#96
Labort_neon:
ret
.L512_or_more_neon:
L512_or_more_neon:
sub sp,sp,#128+64
ldp x22,x23,[x5] // load sigma
@ -819,7 +819,7 @@ ENTRY(chacha20_neon)
sub x2,x2,#512 // not typo
.Loop_outer_512_neon:
Loop_outer_512_neon:
mov v0.16b,v24.16b
mov v4.16b,v24.16b
mov v8.16b,v24.16b
@ -865,7 +865,7 @@ ENTRY(chacha20_neon)
mov x4,#5
subs x2,x2,#512
.Loop_upper_neon:
Loop_upper_neon:
sub x4,x4,#1
add v0.4s,v0.4s,v1.4s
add w5,w5,w9
@ -1275,7 +1275,7 @@ ENTRY(chacha20_neon)
ext v13.16b,v13.16b,v13.16b,#12
ext v17.16b,v17.16b,v17.16b,#12
ext v21.16b,v21.16b,v21.16b,#12
cbnz x4,.Loop_upper_neon
cbnz x4,Loop_upper_neon
add w5,w5,w22 // accumulate key block
add x6,x6,x22,lsr#32
@ -1350,7 +1350,7 @@ ENTRY(chacha20_neon)
lsr x21,x30,#32
mov x4,#5
.Loop_lower_neon:
Loop_lower_neon:
sub x4,x4,#1
add v0.4s,v0.4s,v1.4s
add w5,w5,w9
@ -1760,7 +1760,7 @@ ENTRY(chacha20_neon)
ext v13.16b,v13.16b,v13.16b,#12
ext v17.16b,v17.16b,v17.16b,#12
ext v21.16b,v21.16b,v21.16b,#12
cbnz x4,.Loop_lower_neon
cbnz x4,Loop_lower_neon
add w5,w5,w22 // accumulate key block
ldp q24,q25,[sp,#0]
@ -1896,7 +1896,7 @@ ENTRY(chacha20_neon)
add v29.4s,v29.4s,v0.4s
add v30.4s,v30.4s,v0.4s
b.hs .Loop_outer_512_neon
b.hs Loop_outer_512_neon
adds x2,x2,#512
ushr v0.4s,v31.4s,#2 // 4 -> 1
@ -1910,14 +1910,14 @@ ENTRY(chacha20_neon)
stp q24,q31,[sp,#32]
stp q24,q31,[sp,#64]
b.eq .Ldone_512_neon
b.eq Ldone_512_neon
cmp x2,#192
sub v27.4s,v27.4s,v0.4s // -= 1
sub v28.4s,v28.4s,v0.4s
sub v29.4s,v29.4s,v0.4s
add sp,sp,#128
b.hs .Loop_outer_neon
b.hs Loop_outer_neon
eor v25.16b,v25.16b,v25.16b
eor v26.16b,v26.16b,v26.16b
@ -1925,9 +1925,9 @@ ENTRY(chacha20_neon)
eor v28.16b,v28.16b,v28.16b
eor v29.16b,v29.16b,v29.16b
eor v30.16b,v30.16b,v30.16b
b .Loop_outer
b Loop_outer
.Ldone_512_neon:
Ldone_512_neon:
ldp x19,x20,[x29,#16]
add sp,sp,#128+64
ldp x21,x22,[x29,#32]
@ -1935,6 +1935,5 @@ ENTRY(chacha20_neon)
ldp x25,x26,[x29,#64]
ldp x27,x28,[x29,#80]
ldp x29,x30,[sp],#96
.Labort_neon:
ret
ENDPROC(chacha20_neon)

View file

@ -16,7 +16,7 @@
#
# June 2015
#
# ChaCha20 for ARMv8.
# chacha20 for ARMv8.
#
# Performance in cycles per byte out of large buffer.
#
@ -40,7 +40,7 @@ $output=shift;
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
( $xlate="${dir}../tools/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open OUT,"| \"$^X\" $xlate $flavour $output";
@ -120,42 +120,21 @@ my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
}
$code.=<<___;
#include "arm_arch.h"
.text
.extern OPENSSL_armcap_P
.align 5
.Lsigma:
.quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral
.Lone:
.long 1,0,0,0
.LOPENSSL_armcap_P:
#ifdef __ILP32__
.long OPENSSL_armcap_P-.
#else
.quad OPENSSL_armcap_P-.
#endif
.asciz "ChaCha20 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.globl ChaCha20_ctr32
.type ChaCha20_ctr32,%function
.globl chacha20_arm
.globl chacha20_neon
.type chacha20_arm,%function
.align 5
ChaCha20_ctr32:
chacha20_arm:
cbz $len,.Labort
adr @x[0],.LOPENSSL_armcap_P
cmp $len,#192
b.lo .Lshort
#ifdef __ILP32__
ldrsw @x[1],[@x[0]]
#else
ldr @x[1],[@x[0]]
#endif
ldr w17,[@x[1],@x[0]]
tst w17,#ARMV7_NEON
b.ne ChaCha20_neon
.Lshort:
stp x29,x30,[sp,#-96]!
add x29,sp,#0
@ -333,7 +312,7 @@ $code.=<<___;
ldp x27,x28,[x29,#80]
ldp x29,x30,[sp],#96
ret
.size ChaCha20_ctr32,.-ChaCha20_ctr32
.size chacha20_arm,.-chacha20_arm
___
{{{
@ -374,9 +353,13 @@ my ($a,$b,$c,$d,$t)=@_;
$code.=<<___;
.type ChaCha20_neon,%function
.type chacha20_neon,%function
.align 5
ChaCha20_neon:
chacha20_neon:
cbz x2,.Labort_neon
cmp x2,#192
b.lo .Lshort
stp x29,x30,[sp,#-96]!
add x29,sp,#0
@ -684,8 +667,9 @@ $code.=<<___;
ldp x25,x26,[x29,#64]
ldp x27,x28,[x29,#80]
ldp x29,x30,[sp],#96
.Labort_neon:
ret
.size ChaCha20_neon,.-ChaCha20_neon
.size chacha20_neon,.-chacha20_neon
___
{
my ($T0,$T1,$T2,$T3,$T4,$T5)=@K;
@ -693,18 +677,6 @@ my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2,
$A3,$B3,$C3,$D3,$A4,$B4,$C4,$D4,$A5,$B5,$C5,$D5) = map("v$_.4s",(0..23));
$code.=<<___;
.type ChaCha20_512_neon,%function
.align 5
ChaCha20_512_neon:
stp x29,x30,[sp,#-96]!
add x29,sp,#0
adr @x[0],.Lsigma
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
stp x23,x24,[sp,#48]
stp x25,x26,[sp,#64]
stp x27,x28,[sp,#80]
.L512_or_more_neon:
sub sp,sp,#128+64
@ -1115,7 +1087,7 @@ $code.=<<___;
ldp x27,x28,[x29,#80]
ldp x29,x30,[sp],#96
ret
.size ChaCha20_512_neon,.-ChaCha20_512_neon
.size chacha20_512_neon,.-chacha20_512_neon
___
}
}}}

View file

@ -67,7 +67,7 @@ $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
( $xlate="${dir}../tools/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
$avx = 3;

4
crypto/chacha20/make.sh Normal file
View file

@ -0,0 +1,4 @@
#!/bin/sh
perl chacha20-x64.pl gas > chacha20-x64-linux.s
perl chacha20-x64.pl macosx > chacha20-x64-osx.s
perl chacha20-arm64.pl ios > chacha20-arm64-ios.S

View file

@ -48,6 +48,7 @@
#include <string.h>
#include <stdint.h>
#include "curve25519-donna.h"
#ifdef _MSC_VER
#define inline __inline
@ -57,6 +58,8 @@ typedef uint8_t u8;
typedef int32_t s32;
typedef int64_t limb;
const uint8 kCurve25519Basepoint[32] = {9};
/* Field element representation:
*
* Field elements are written as an array of signed, 64-bit limbs, least

View file

@ -1,17 +1,19 @@
#ifndef TUNSAFE_CRYPTO_CURVE25519_DONNA_H_
#define TUNSAFE_CRYPTO_CURVE25519_DONNA_H_
#include "tunsafe_types.h"
void curve25519_donna_ref(uint8 *mypublic, const uint8 *secret, const uint8 *basepoint);
extern "C" void curve25519_donna_x64(uint8 *mypublic, const uint8 *secret, const uint8 *basepoint);
#if defined(ARCH_CPU_X86_64) && defined(COMPILER_MSVC)
#define curve25519_donna curve25519_donna_x64
#else
#define curve25519_donna curve25519_donna_ref
#endif
void curve25519_normalize(uint8 *e);
#ifndef TUNSAFE_CRYPTO_CURVE25519_DONNA_H_
#define TUNSAFE_CRYPTO_CURVE25519_DONNA_H_
#include "tunsafe_types.h"
void curve25519_donna_ref(uint8 *mypublic, const uint8 *secret, const uint8 *basepoint);
extern "C" void curve25519_donna_x64(uint8 *mypublic, const uint8 *secret, const uint8 *basepoint);
#if defined(ARCH_CPU_X86_64) && defined(COMPILER_MSVC)
#define curve25519_donna curve25519_donna_x64
#else
#define curve25519_donna curve25519_donna_ref
#endif
void curve25519_normalize(uint8 *e);
extern const uint8 kCurve25519Basepoint[32];
#endif // TUNSAFE_CRYPTO_CURVE25519_DONNA_H_

6
crypto/poly1305/make.sh Normal file
View file

@ -0,0 +1,6 @@
#!/bin/sh
perl poly1305-x64.pl gas > poly1305-x64-linux.s
perl poly1305-x64.pl macosx > poly1305-x64-osx.s
perl poly1305-arm64.pl ios > poly1305-arm64-ios.S

View file

@ -4,7 +4,7 @@
* Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
*/
//#include <linux/linkage.h>
#define __ARM_ARCH__ 7
.text
#if defined(__thumb2__)

View file

@ -35,7 +35,7 @@ else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
if ($flavour && $flavour ne "void") {
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
( $xlate="${dir}../tools/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open STDOUT,"| \"$^X\" $xlate $flavour $output";

View file

@ -1,20 +1,21 @@
/* SPDX-License-Identifier: OpenSSL OR (BSD-3-Clause OR GPL-2.0)
*
* Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
* Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
*/
#include <linux/linkage.h>
.text
// forward "declarations" are required for Apple
.globl _poly1305_blocks_arm
.globl _poly1305_emit_arm
.globl _poly1305_blocks_neon
.globl _poly1305_emit_neon
.globl _poly1305_init_arm
.align 5
ENTRY(poly1305_init_arm)
_poly1305_init_arm:
cmp x1,xzr
stp xzr,xzr,[x0] // zero hash value
stp xzr,xzr,[x0,#16] // [along with is_base2_26]
csel x0,xzr,x0,eq
b.eq .Lno_key
b.eq Lno_key
ldp x7,x8,[x1] // load key
mov x9,#0xfffffffc0fffffff
@ -28,23 +29,24 @@ ENTRY(poly1305_init_arm)
and x8,x8,x9 // &=0ffffffc0ffffffc
stp x7,x8,[x0,#32] // save key value
.Lno_key:
Lno_key:
ret
ENDPROC(poly1305_init_arm)
.align 5
ENTRY(poly1305_blocks_arm)
_poly1305_blocks_arm:
ands x2,x2,#-16
b.eq .Lno_data
b.eq Lno_data
ldp x4,x5,[x0] // load hash value
ldp x7,x8,[x0,#32] // load key value
ldr x6,[x0,#16]
add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
b .Loop
b Loop
.align 5
.Loop:
Loop:
ldp x10,x11,[x1],#16 // load input
sub x2,x2,#16
#ifdef __ARMEB__
@ -86,17 +88,18 @@ ENTRY(poly1305_blocks_arm)
adcs x5,x13,xzr
adc x6,x6,xzr
cbnz x2,.Loop
cbnz x2,Loop
stp x4,x5,[x0] // store hash value
str x6,[x0,#16]
.Lno_data:
Lno_data:
ret
ENDPROC(poly1305_blocks_arm)
.align 5
ENTRY(poly1305_emit_arm)
_poly1305_emit_arm:
ldp x4,x5,[x0] // load hash base 2^64
ldr x6,[x0,#16]
ldp x10,x11,[x2] // load nonce
@ -123,10 +126,10 @@ ENTRY(poly1305_emit_arm)
stp x4,x5,[x1] // write result
ret
ENDPROC(poly1305_emit_arm)
.align 5
__poly1305_mult:
poly1305_mult:
mul x12,x4,x7 // h0*r0
umulh x13,x4,x7
@ -160,7 +163,10 @@ __poly1305_mult:
ret
__poly1305_splat:
.align 5
poly1305_splat:
and x12,x4,#0x03ffffff // base 2^64 -> base 2^26
ubfx x13,x4,#26,#26
extr x14,x5,x4,#52
@ -184,28 +190,30 @@ __poly1305_splat:
ret
.align 5
ENTRY(poly1305_blocks_neon)
_poly1305_blocks_neon:
ldr x17,[x0,#24]
cmp x2,#128
b.hs .Lblocks_neon
cbz x17,poly1305_blocks_arm
b.hs Lblocks_neon
cbz x17,_poly1305_blocks_arm
.Lblocks_neon:
Lblocks_neon:
stp x29,x30,[sp,#-80]!
add x29,sp,#0
ands x2,x2,#-16
b.eq .Lno_data_neon
b.eq Lno_data_neon
cbz x17,.Lbase2_64_neon
cbz x17,Lbase2_64_neon
ldp w10,w11,[x0] // load hash value base 2^26
ldp w12,w13,[x0,#8]
ldr w14,[x0,#16]
tst x2,#31
b.eq .Leven_neon
b.eq Leven_neon
ldp x7,x8,[x0,#32] // load key value
@ -237,10 +245,10 @@ ENTRY(poly1305_blocks_neon)
adcs x5,x5,x13
adc x6,x6,x3
bl __poly1305_mult
bl poly1305_mult
ldr x30,[sp,#8]
cbz x3,.Lstore_base2_64_neon
cbz x3,Lstore_base2_64_neon
and x10,x4,#0x03ffffff // base 2^64 -> base 2^26
ubfx x11,x4,#26,#26
@ -249,28 +257,28 @@ ENTRY(poly1305_blocks_neon)
ubfx x13,x5,#14,#26
extr x14,x6,x5,#40
cbnz x2,.Leven_neon
cbnz x2,Leven_neon
stp w10,w11,[x0] // store hash value base 2^26
stp w12,w13,[x0,#8]
str w14,[x0,#16]
b .Lno_data_neon
b Lno_data_neon
.align 4
.Lstore_base2_64_neon:
Lstore_base2_64_neon:
stp x4,x5,[x0] // store hash value base 2^64
stp x6,xzr,[x0,#16] // note that is_base2_26 is zeroed
b .Lno_data_neon
b Lno_data_neon
.align 4
.Lbase2_64_neon:
Lbase2_64_neon:
ldp x7,x8,[x0,#32] // load key value
ldp x4,x5,[x0] // load hash value base 2^64
ldr x6,[x0,#16]
tst x2,#31
b.eq .Linit_neon
b.eq Linit_neon
ldp x12,x13,[x1],#16 // load input
sub x2,x2,#16
@ -283,9 +291,9 @@ ENTRY(poly1305_blocks_neon)
adcs x5,x5,x13
adc x6,x6,x3
bl __poly1305_mult
bl poly1305_mult
.Linit_neon:
Linit_neon:
and x10,x4,#0x03ffffff // base 2^64 -> base 2^26
ubfx x11,x4,#26,#26
extr x12,x5,x4,#52
@ -310,35 +318,35 @@ ENTRY(poly1305_blocks_neon)
mov x5,x8
mov x6,xzr
add x0,x0,#48+12
bl __poly1305_splat
bl poly1305_splat
bl __poly1305_mult // r^2
bl poly1305_mult // r^2
sub x0,x0,#4
bl __poly1305_splat
bl poly1305_splat
bl __poly1305_mult // r^3
bl poly1305_mult // r^3
sub x0,x0,#4
bl __poly1305_splat
bl poly1305_splat
bl __poly1305_mult // r^4
bl poly1305_mult // r^4
sub x0,x0,#4
bl __poly1305_splat
bl poly1305_splat
ldr x30,[sp,#8]
add x16,x1,#32
adr x17,.Lzeros
adr x17,Lzeros
subs x2,x2,#64
csel x16,x17,x16,lo
mov x4,#1
str x4,[x0,#-24] // set is_base2_26
sub x0,x0,#48 // restore original x0
b .Ldo_neon
b Ldo_neon
.align 4
.Leven_neon:
Leven_neon:
add x16,x1,#32
adr x17,.Lzeros
adr x17,Lzeros
subs x2,x2,#64
csel x16,x17,x16,lo
@ -353,7 +361,7 @@ ENTRY(poly1305_blocks_neon)
fmov d27,x13
fmov d28,x14
.Ldo_neon:
Ldo_neon:
ldp x8,x12,[x16],#16 // inp[2:3] (or zero)
ldp x9,x13,[x16],#48
@ -427,10 +435,10 @@ ENTRY(poly1305_blocks_neon)
fmov d13,x12
ushr v31.2d,v31.2d,#38
b.ls .Lskip_loop
b.ls Lskip_loop
.align 4
.Loop_neon:
Loop_neon:
////////////////////////////////////////////////////////////////
// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
@ -616,9 +624,9 @@ ENTRY(poly1305_blocks_neon)
add v25.2s,v25.2s,v29.2s // h0 -> h1
add v28.2s,v28.2s,v30.2s // h3 -> h4
b.hi .Loop_neon
b.hi Loop_neon
.Lskip_loop:
Lskip_loop:
dup v16.2d,v16.d[0]
add v11.2s,v11.2s,v26.2s
@ -626,7 +634,7 @@ ENTRY(poly1305_blocks_neon)
// multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
adds x2,x2,#32
b.ne .Long_tail
b.ne Long_tail
dup v16.2d,v11.d[0]
add v14.2s,v9.2s,v24.2s
@ -634,7 +642,7 @@ ENTRY(poly1305_blocks_neon)
add v15.2s,v10.2s,v25.2s
add v18.2s,v13.2s,v28.2s
.Long_tail:
Long_tail:
dup v14.2d,v14.d[0]
umull2 v19.2d,v16.4s,v6.4s
umull2 v22.2d,v16.4s,v1.4s
@ -669,7 +677,7 @@ ENTRY(poly1305_blocks_neon)
umlal2 v20.2d,v18.4s,v4.4s
umlal2 v21.2d,v18.4s,v6.4s
b.eq .Lshort_tail
b.eq Lshort_tail
////////////////////////////////////////////////////////////////
// (hash+inp[0:1])*r^4:r^3 and accumulate
@ -708,7 +716,7 @@ ENTRY(poly1305_blocks_neon)
umlal v20.2d,v13.2s,v4.2s
umlal v21.2d,v13.2s,v6.2s
.Lshort_tail:
Lshort_tail:
////////////////////////////////////////////////////////////////
// horizontal add
@ -759,15 +767,16 @@ ENTRY(poly1305_blocks_neon)
st4 {v19.s,v20.s,v21.s,v22.s}[0],[x0],#16
st1 {v23.s}[0],[x0]
.Lno_data_neon:
Lno_data_neon:
ldr x29,[sp],#80
ret
ENDPROC(poly1305_blocks_neon)
.align 5
ENTRY(poly1305_emit_neon)
_poly1305_emit_neon:
ldr x17,[x0,#24]
cbz x17,poly1305_emit_arm
cbz x17,_poly1305_emit_arm
ldp w10,w11,[x0] // load hash value base 2^26
ldp w12,w13,[x0,#8]
@ -813,8 +822,8 @@ ENTRY(poly1305_emit_neon)
stp x4,x5,[x1] // write result
ret
ENDPROC(poly1305_emit_neon)
.align 5
.Lzeros:
Lzeros:
.long 0,0,0,0,0,0,0,0

View file

@ -18,7 +18,7 @@
#
# June 2015
#
# Numbers are cycles per processed byte with poly1305_blocks alone.
# Numbers are cycles per processed byte with poly1305_blocks_arm alone.
#
# IALU/gcc-4.9 NEON
#
@ -39,7 +39,7 @@ $output=shift;
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
( $xlate="${dir}../tools/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open OUT,"| \"$^X\" $xlate $flavour $output";
@ -51,19 +51,18 @@ my ($mac,$nonce)=($inp,$len);
my ($h0,$h1,$h2,$r0,$r1,$s1,$t0,$t1,$d0,$d1,$d2) = map("x$_",(4..14));
$code.=<<___;
#include "arm_arch.h"
.text
// forward "declarations" are required for Apple
.extern OPENSSL_armcap_P
.globl poly1305_blocks
.globl poly1305_emit
.globl poly1305_blocks_arm
.globl poly1305_emit_arm
.globl poly1305_blocks_neon
.globl poly1305_emit_neon
.globl poly1305_init
.type poly1305_init,%function
.globl poly1305_init_arm
.type poly1305_init_arm,%function
.align 5
poly1305_init:
poly1305_init_arm:
cmp $inp,xzr
stp xzr,xzr,[$ctx] // zero hash value
stp xzr,xzr,[$ctx,#16] // [along with is_base2_26]
@ -71,17 +70,9 @@ poly1305_init:
csel x0,xzr,x0,eq
b.eq .Lno_key
#ifdef __ILP32__
ldrsw $t1,.LOPENSSL_armcap_P
#else
ldr $t1,.LOPENSSL_armcap_P
#endif
adr $t0,.LOPENSSL_armcap_P
ldp $r0,$r1,[$inp] // load key
mov $s1,#0xfffffffc0fffffff
movk $s1,#0x0fff,lsl#48
ldr w17,[$t0,$t1]
#ifdef __ARMEB__
rev $r0,$r0 // flip bytes
rev $r1,$r1
@ -91,30 +82,13 @@ poly1305_init:
and $r1,$r1,$s1 // &=0ffffffc0ffffffc
stp $r0,$r1,[$ctx,#32] // save key value
tst w17,#ARMV7_NEON
adr $d0,poly1305_blocks
adr $r0,poly1305_blocks_neon
adr $d1,poly1305_emit
adr $r1,poly1305_emit_neon
csel $d0,$d0,$r0,eq
csel $d1,$d1,$r1,eq
#ifdef __ILP32__
stp w12,w13,[$len]
#else
stp $d0,$d1,[$len]
#endif
mov x0,#1
.Lno_key:
ret
.size poly1305_init,.-poly1305_init
.size poly1305_init_arm,.-poly1305_init_arm
.type poly1305_blocks,%function
.type poly1305_blocks_arm,%function
.align 5
poly1305_blocks:
poly1305_blocks_arm:
ands $len,$len,#-16
b.eq .Lno_data
@ -174,11 +148,11 @@ poly1305_blocks:
.Lno_data:
ret
.size poly1305_blocks,.-poly1305_blocks
.size poly1305_blocks_arm,.-poly1305_blocks_arm
.type poly1305_emit,%function
.type poly1305_emit_arm,%function
.align 5
poly1305_emit:
poly1305_emit_arm:
ldp $h0,$h1,[$ctx] // load hash base 2^64
ldr $h2,[$ctx,#16]
ldp $t0,$t1,[$nonce] // load nonce
@ -205,7 +179,7 @@ poly1305_emit:
stp $h0,$h1,[$mac] // write result
ret
.size poly1305_emit,.-poly1305_emit
.size poly1305_emit_arm,.-poly1305_emit_arm
___
my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("v$_.4s",(0..8));
my ($IN01_0,$IN01_1,$IN01_2,$IN01_3,$IN01_4) = map("v$_.2s",(9..13));
@ -288,7 +262,7 @@ poly1305_blocks_neon:
ldr $is_base2_26,[$ctx,#24]
cmp $len,#128
b.hs .Lblocks_neon
cbz $is_base2_26,poly1305_blocks
cbz $is_base2_26,poly1305_blocks_arm
.Lblocks_neon:
stp x29,x30,[sp,#-80]!
@ -867,7 +841,7 @@ poly1305_blocks_neon:
.align 5
poly1305_emit_neon:
ldr $is_base2_26,[$ctx,#24]
cbz $is_base2_26,poly1305_emit
cbz $is_base2_26,poly1305_emit_arm
ldp w10,w11,[$ctx] // load hash value base 2^26
ldp w12,w13,[$ctx,#8]
@ -918,14 +892,6 @@ poly1305_emit_neon:
.align 5
.Lzeros:
.long 0,0,0,0,0,0,0,0
.LOPENSSL_armcap_P:
#ifdef __ILP32__
.long OPENSSL_armcap_P-.
#else
.quad OPENSSL_armcap_P-.
#endif
.asciz "Poly1305 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
___
foreach (split("\n",$code)) {

View file

View file

@ -71,7 +71,7 @@ $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
( $xlate="${dir}../tools/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
$avx = 3;

View file

@ -11,7 +11,7 @@
*/
#include "stdafx.h"
#include "crypto/siphash.h"
#include "crypto/siphash/siphash.h"
#include "tunsafe_endian.h"
#define SIPROUND \

177
crypto/tools/arm-xlate.pl Normal file
View file

@ -0,0 +1,177 @@
#! /usr/bin/env perl
# Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
use strict;
my $flavour = shift;
my $output = shift;
open STDOUT,">$output" || die "can't open $output: $!";
$flavour = "linux32" if (!$flavour or $flavour eq "void");
my %GLOBALS;
my $dotinlocallabels=($flavour=~/linux/)?1:0;
################################################################
# directives which need special treatment on different platforms
################################################################
my $arch = sub {
if ($flavour =~ /linux/) { ".arch\t".join(',',@_); }
else { ""; }
};
my $fpu = sub {
if ($flavour =~ /linux/) { ".fpu\t".join(',',@_); }
else { ""; }
};
my $hidden = sub {
if ($flavour =~ /ios/) { ".private_extern\t".join(',',@_); }
else { ".hidden\t".join(',',@_); }
};
my $comm = sub {
my @args = split(/,\s*/,shift);
my $name = @args[0];
my $global = \$GLOBALS{$name};
my $ret;
if ($flavour =~ /ios32/) {
$ret = ".comm\t_$name,@args[1]\n";
$ret .= ".non_lazy_symbol_pointer\n";
$ret .= "$name:\n";
$ret .= ".indirect_symbol\t_$name\n";
$ret .= ".long\t0";
$name = "_$name";
} else { $ret = ".comm\t".join(',',@args); }
$$global = $name;
$ret;
};
my $globl = sub {
my $name = shift;
my $global = \$GLOBALS{$name};
my $ret;
SWITCH: for ($flavour) {
/ios/ && do { $name = "_$name";
last;
};
}
$ret = ".globl $name" if (!$ret);
$$global = $name;
$ret;
};
my $global = $globl;
my $extern = sub {
&$globl(@_);
return; # return nothing
};
my $type = sub {
if ($flavour =~ /linux/) { ".type\t".join(',',@_); }
elsif ($flavour =~ /ios32/) { if (join(',',@_) =~ /(\w+),%function/) {
"#ifdef __thumb2__\n".
".thumb_func $1\n".
"#endif";
}
}
else { ""; }
};
my $size = sub {
if ($flavour =~ /linux/) { ".size\t".join(',',@_); }
else { ""; }
};
my $inst = sub {
if ($flavour =~ /linux/) { ".inst\t".join(',',@_); }
else { ".long\t".join(',',@_); }
};
my $asciz = sub {
my $line = join(",",@_);
if ($line =~ /^"(.*)"$/)
{ ".byte " . join(",",unpack("C*",$1),0) . "\n.align 2"; }
else
{ ""; }
};
sub range {
my ($r,$sfx,$start,$end) = @_;
join(",",map("$r$_$sfx",($start..$end)));
}
sub expand_line {
my $line = shift;
my @ret = ();
pos($line)=0;
while ($line =~ m/\G[^@\/\{\"]*/g) {
if ($line =~ m/\G(@|\/\/|$)/gc) {
last;
}
elsif ($line =~ m/\G\{/gc) {
my $saved_pos = pos($line);
$line =~ s/\G([rdqv])([0-9]+)([^\-]*)\-\1([0-9]+)\3/range($1,$3,$2,$4)/e;
pos($line) = $saved_pos;
$line =~ m/\G[^\}]*\}/g;
}
elsif ($line =~ m/\G\"/gc) {
$line =~ m/\G[^\"]*\"/g;
}
}
$line =~ s/\b(\w+)/$GLOBALS{$1} or $1/ge;
return $line;
}
while(my $line=<>) {
if ($line =~ m/^\s*(#|@|\/\/)/) { print $line; next; }
$line =~ s|/\*.*\*/||; # get rid of C-style comments...
$line =~ s|^\s+||; # ... and skip white spaces in beginning...
$line =~ s|\s+$||; # ... and at the end
{
$line =~ s|[\b\.]L(\w{2,})|L$1|g; # common denominator for Locallabel
$line =~ s|\bL(\w{2,})|\.L$1|g if ($dotinlocallabels);
}
{
$line =~ s|(^[\.\w]+)\:\s*||;
my $label = $1;
if ($label) {
printf "%s:",($GLOBALS{$label} or $label);
}
}
if ($line !~ m/^[#@]/) {
$line =~ s|^\s*(\.?)(\S+)\s*||;
my $c = $1; $c = "\t" if ($c eq "");
my $mnemonic = $2;
my $opcode;
if ($mnemonic =~ m/([^\.]+)\.([^\.]+)/) {
$opcode = eval("\$$1_$2");
} else {
$opcode = eval("\$$mnemonic");
}
my $arg=expand_line($line);
if (ref($opcode) eq 'CODE') {
$line = &$opcode($arg);
} elsif ($mnemonic) {
$line = $c.$mnemonic;
$line.= "\t$arg" if ($arg ne "");
}
}
print $line if ($line);
print "\n";
}
close STDOUT;

View file

@ -248,11 +248,6 @@ done:
}
#endif // defined(OS_LINUX)
void OsInterruptibleSleep(int millis) {
usleep((useconds_t)millis * 1000);
}
#if defined(OS_MACOSX)
int open_tun(char *devname, size_t devname_size) {
struct sockaddr_ctl sc;
@ -789,14 +784,6 @@ public:
bool is_connected_;
};
struct CommandLineOutput {
const char *filename_to_load;
const char *interface_name;
bool daemon;
};
int HandleCommandLine(int argc, char **argv, CommandLineOutput *output);
int main(int argc, char **argv) {
CommandLineOutput cmd = {0};

10
ts.cpp
View file

@ -1,7 +1,7 @@
#include "stdafx.h"
#include "tunsafe_types.h"
#include "netapi.h"
#include "crypto/curve25519-donna.h"
#include "crypto/curve25519/curve25519-donna.h"
#include "util.h"
#include "wireguard_proto.h"
#include <string.h>
@ -35,8 +35,6 @@
#define ANSI_FG_CYAN "\x1b[36m"
#define ANSI_FG_WHITE "\x1b[37m"
static const uint8 kCurve25519Basepoint[32] = {9};
#if defined(OS_WIN)
#define EXENAME "ts"
@ -758,12 +756,6 @@ static int HandleStopCommand(int argc, char **argv) {
#endif // defined(OS_WIN)
struct CommandLineOutput {
const char *filename_to_load;
const char *interface_name;
bool daemon;
};
// Returns -1 on invalid subcommand
int HandleCommandLine(int argc, char **argv, CommandLineOutput *output) {
uint8 key[32];

View file

@ -54,7 +54,7 @@
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="crypto\nasm.props" />
<Import Project="crypto\tools\nasm.props" />
</ImportGroup>
<ImportGroup Label="Shared">
</ImportGroup>
@ -100,6 +100,7 @@
<PreprocessorDefinitions>_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<ConformanceMode>true</ConformanceMode>
<ExceptionHandling>false</ExceptionHandling>
<AdditionalIncludeDirectories>.</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
@ -115,6 +116,7 @@
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<ConformanceMode>true</ConformanceMode>
<ExceptionHandling>false</ExceptionHandling>
<AdditionalIncludeDirectories>.</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
@ -134,6 +136,7 @@
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
<OmitFramePointers>true</OmitFramePointers>
<ExceptionHandling>false</ExceptionHandling>
<AdditionalIncludeDirectories>.</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
@ -155,6 +158,7 @@
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
<OmitFramePointers>true</OmitFramePointers>
<ExceptionHandling>false</ExceptionHandling>
<AdditionalIncludeDirectories>.</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
@ -164,13 +168,13 @@
</Link>
</ItemDefinitionGroup>
<ItemGroup>
<ClInclude Include="crypto\curve25519-donna.h" />
<ClInclude Include="crypto\curve25519\curve25519-donna.h" />
<ClInclude Include="stdafx.h" />
<ClInclude Include="util.h" />
<ClInclude Include="util_win32.h" />
</ItemGroup>
<ItemGroup>
<ClCompile Include="crypto\curve25519-donna.cpp">
<ClCompile Include="crypto\curve25519\curve25519-donna.cpp">
<PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
<PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">NotUsing</PrecompiledHeader>
<PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">NotUsing</PrecompiledHeader>
@ -187,13 +191,13 @@
<ClCompile Include="util_win32.cpp" />
</ItemGroup>
<ItemGroup>
<NASM Include="crypto\curve25519_x64_nasm.asm">
<NASM Include="crypto\curve25519\curve25519-x64-win.asm">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
</NASM>
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="crypto\nasm.targets" />
<Import Project="crypto\tools\nasm.targets" />
</ImportGroup>
</Project>

View file

@ -21,12 +21,12 @@
<ClInclude Include="util.h">
<Filter>Source Files</Filter>
</ClInclude>
<ClInclude Include="crypto\curve25519-donna.h">
<Filter>Source Files</Filter>
</ClInclude>
<ClInclude Include="util_win32.h">
<Filter>Source Files</Filter>
</ClInclude>
<ClInclude Include="crypto\curve25519\curve25519-donna.h">
<Filter>Header Files</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<ClCompile Include="stdafx.cpp">
@ -38,15 +38,15 @@
<ClCompile Include="util.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="crypto\curve25519-donna.cpp">
<ClCompile Include="util_win32.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="util_win32.cpp">
<ClCompile Include="crypto\curve25519\curve25519-donna.cpp">
<Filter>Source Files</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<NASM Include="crypto\curve25519_x64_nasm.asm">
<NASM Include="crypto\curve25519\curve25519-x64-win.asm">
<Filter>Source Files</Filter>
</NASM>
</ItemGroup>

29
tunsafe_amalgam.cpp Normal file
View file

@ -0,0 +1,29 @@
#include "build_config.h"
// Skip asm for IOS simulator
#if defined(OS_IOS) && defined(ARCH_CPU_X86_FAMILY)
#define CHACHA20_WITH_ASM 0
#define BLAKE2S_WITH_ASM 0
#endif
#include "wireguard.cpp"
#include "wireguard_proto.cpp"
#include "wireguard_config.cpp"
#include "util.cpp"
#include "tunsafe_threading.cpp"
#include "tunsafe_cpu.cpp"
#include "ip_to_peer_map.cpp"
#include "crypto/curve25519/curve25519-donna.cpp"
#include "crypto/chacha20poly1305.cpp"
#include "crypto/blake2s/blake2s.cpp"
#include "crypto/siphash/siphash.cpp"
#include "crypto/aesgcm/aesgcm.cpp"
#include "ipzip2/ipzip2.cpp"
#if defined(WITH_NETWORK_BSD)
#include "network_bsd.cpp"
#include "network_bsd_common.cpp"
#include "ts.cpp"
#include "benchmark.cpp"
#endif

View file

@ -22,7 +22,7 @@
#include "util.h"
#include <atlbase.h>
#include <algorithm>
#include "crypto/curve25519-donna.h"
#include "crypto/curve25519/curve25519-donna.h"
#include "service_win32.h"
#include "util_win32.h"
@ -652,8 +652,6 @@ void BrowseFile(HWND wnd) {
ImportFile(szFile);
}
static const uint8 kCurve25519Basepoint[32] = {9};
static void SetKeyBox(HWND wnd, int ctr, uint8 buf[32]) {
char base64[WG_PUBLIC_KEY_LEN_BASE64 + 1];
SetDlgItemText(wnd, ctr, base64_encode(buf, 32, base64, sizeof(base64), NULL));

7
util.h
View file

@ -48,3 +48,10 @@ uint64 OsGetMilliseconds();
void InitOsxGetMilliseconds();
void OsInterruptibleSleep(int millis);
void OsGetTimestampTAI64N(uint8 dst[12]);
struct CommandLineOutput {
const char *filename_to_load;
const char *interface_name;
bool daemon;
};
int HandleCommandLine(int argc, char **argv, CommandLineOutput *output);

View file

@ -5,8 +5,8 @@
#include "netapi.h"
#include "wireguard_proto.h"
#include "crypto/chacha20poly1305.h"
#include "crypto/blake2s.h"
#include "crypto/siphash.h"
#include "crypto/blake2s/blake2s.h"
#include "crypto/siphash/siphash.h"
#include "tunsafe_endian.h"
#include <algorithm>
#include <assert.h>

View file

@ -3,10 +3,10 @@
#include "stdafx.h"
#include "wireguard_proto.h"
#include "crypto/chacha20poly1305.h"
#include "crypto/blake2s.h"
#include "crypto/curve25519-donna.h"
#include "crypto/blake2s/blake2s.h"
#include "crypto/curve25519/curve25519-donna.h"
#include "crypto/aesgcm/aes.h"
#include "crypto/siphash.h"
#include "crypto/siphash/siphash.h"
#include "tunsafe_endian.h"
#include "util.h"
#include "crypto_ops.h"
@ -21,7 +21,6 @@ static const uint8 kLabelCookie[] = {'c', 'o', 'o', 'k', 'i', 'e', '-', '-'};
static const uint8 kLabelMac1[] = {'m', 'a', 'c', '1', '-', '-', '-', '-'};
static const uint8 kWgInitHash[WG_HASH_LEN] = {0x22,0x11,0xb3,0x61,0x08,0x1a,0xc5,0x66,0x69,0x12,0x43,0xdb,0x45,0x8a,0xd5,0x32,0x2d,0x9c,0x6c,0x66,0x22,0x93,0xe8,0xb7,0x0e,0xe1,0x9c,0x65,0xba,0x07,0x9e,0xf3};
static const uint8 kWgInitChainingKey[WG_HASH_LEN] = {0x60,0xe2,0x6d,0xae,0xf3,0x27,0xef,0xc0,0x2e,0xc3,0x35,0xe2,0xa0,0x25,0xd2,0xd0,0x16,0xeb,0x42,0x06,0xf8,0x72,0x77,0xf5,0x2d,0x38,0xd1,0x98,0x8b,0x78,0xcd,0x36};
static const uint8 kCurve25519Basepoint[32] = {9};
ReplayDetector::ReplayDetector() {
expected_seq_nr_ = 0;