diff --git a/TunSafe.vcxproj b/TunSafe.vcxproj index a85d424..48c8261 100644 --- a/TunSafe.vcxproj +++ b/TunSafe.vcxproj @@ -54,7 +54,7 @@ - + @@ -178,6 +178,12 @@ + + + + + + @@ -186,9 +192,9 @@ - + - + @@ -212,8 +218,7 @@ - - + @@ -221,7 +226,7 @@ - + NotUsing NotUsing @@ -229,7 +234,7 @@ NotUsing NotUsing - + NotUsing NotUsing NotUsing @@ -255,34 +260,34 @@ - + true true - + true true - + true true - + Document true true - + true true - + true true - + \ No newline at end of file diff --git a/TunSafe.vcxproj.filters b/TunSafe.vcxproj.filters index 01e1ff8..71677f1 100644 --- a/TunSafe.vcxproj.filters +++ b/TunSafe.vcxproj.filters @@ -14,6 +14,15 @@ {d31b1b9f-4a2e-42d4-a26c-7c3daa4ccbe3} + + {45ab50f7-cde8-4d0b-9756-5bfa3b9a28db} + + + {6adfd763-0197-437b-b7f6-2ffdd1e8e508} + + + {1ca37c7b-e91e-4648-9584-7d0c73d8e416} + @@ -41,12 +50,6 @@ Source Files\Win32 - - crypto - - - crypto - Source Files @@ -56,9 +59,6 @@ Source Files - - crypto - Source Files @@ -95,6 +95,31 @@ Source Files\Win32 + + + crypto\blake2s + + + crypto\blake2s + + + crypto\blake2s + + + crypto\blake2s + + + crypto\blake2s + + + crypto\blake2s + + + crypto\curve25519 + + + crypto\chacha20poly1305 + @@ -118,27 +143,12 @@ Source Files\Win32 - - crypto - - - crypto - - - crypto - - - crypto - Source Files Source Files - - crypto - crypto\aesgcm @@ -163,6 +173,18 @@ Source Files\Win32 + + Source Files + + + crypto\blake2s + + + crypto\curve25519 + + + crypto\chacha20poly1305 + @@ -174,23 +196,23 @@ - - crypto - - - crypto - - - crypto - - + crypto\aesgcm - + crypto\aesgcm - + crypto\aesgcm + + crypto\curve25519 + + + crypto\chacha20poly1305 + + + crypto\chacha20poly1305 + \ No newline at end of file diff --git a/build_freebsd.sh b/build_freebsd.sh index 2f86855..51c1671 100755 --- a/build_freebsd.sh +++ b/build_freebsd.sh @@ -1,4 +1,5 @@ -g++7 -I . -O2 -DNDEBUG -static -mssse3 -o tunsafe benchmark.cpp tunsafe_cpu.cpp wireguard_config.cpp ip_to_peer_map.cpp tunsafe_threading.cpp \ -wireguard.cpp wireguard_proto.cpp ts.cpp util.cpp network_bsd.cpp network_bsd_common.cpp \ -crypto/blake2s.cpp crypto/blake2s_sse.cpp crypto/chacha20poly1305.cpp crypto/curve25519-donna.cpp \ -crypto/siphash.cpp crypto/chacha20_x64_gas.s crypto/poly1305_x64_gas.s ipzip2/ipzip2.cpp -lrt -pthread +g++7 -I . -O2 -DNDEBUG -DWITH_NETWORK_BSD=1 -static -mssse3 -o tunsafe \ +tunsafe_amalgam.cpp \ +crypto/chacha20/chacha20-x64-linux.s \ +crypto/poly1305/poly1305-x64-linux.s \ +-lrt -pthread diff --git a/build_linux.sh b/build_linux.sh index 850dcdc..4e87ecc 100755 --- a/build_linux.sh +++ b/build_linux.sh @@ -1,9 +1,10 @@ #!/bin/sh -clang++-6.0 -c -march=skylake-avx512 crypto/poly1305_x64_gas.s crypto/chacha20_x64_gas.s -clang++-6.0 -I . -O3 -DNDEBUG -mssse3 -pthread -lrt -o tunsafe util.cpp wireguard_config.cpp wireguard.cpp ts.cpp ip_to_peer_map.cpp tunsafe_threading.cpp \ -wireguard_proto.cpp network_bsd.cpp network_bsd_common.cpp tunsafe_cpu.cpp benchmark.cpp crypto/blake2s.cpp crypto/blake2s_sse.cpp crypto/chacha20poly1305.cpp \ -crypto/curve25519-donna.cpp crypto/siphash.cpp chacha20_x64_gas.o crypto/aesgcm/aesni_gcm_x64_gas.s \ -crypto/aesgcm/aesni_x64_gas.s crypto/aesgcm/aesgcm.cpp poly1305_x64_gas.o ipzip2/ipzip2.cpp \ -crypto/aesgcm/ghash_x64_gas.s - - +set -e +clang++-6.0 -c -march=skylake-avx512 crypto/poly1305/poly1305-x64-linux.s crypto/chacha20/chacha20-x64-linux.s +clang++-6.0 -I . -O3 -DNDEBUG -DWITH_NETWORK_BSD=1 -mssse3 -pthread -lrt -o tunsafe \ +tunsafe_amalgam.cpp \ +crypto/aesgcm/aesni_gcm-x64-linux.s \ +crypto/aesgcm/aesni-x64-linux.s \ +crypto/aesgcm/ghash-x64-linux.s \ +chacha20-x64-linux.o \ +poly1305-x64-linux.o \ diff --git a/build_linux_rpi.sh b/build_linux_rpi.sh index 96208a8..aae88c2 100755 --- a/build_linux_rpi.sh +++ b/build_linux_rpi.sh @@ -1,11 +1,10 @@ #!/bin/sh set -e +#cpp -D__ARM_ARCH__=7 crypto/chacha20/chacha20-arm.s > crypto/chacha20/chacha20-arm.preprocessed.s +#cpp -D__ARM_ARCH__=7 crypto/poly1305/poly1305-arm.s > crypto/poly1305/poly1305-arm.preprocessed.s -cpp -D__ARM_ARCH__=7 crypto/chacha20/chacha20-arm.s > crypto/chacha20/chacha20-arm.preprocessed.s -cpp -D__ARM_ARCH__=7 crypto/poly1305/poly1305-arm.s > crypto/poly1305/poly1305-arm.preprocessed.s - -g++-6 -mfpu=neon -I . -g -O2 -DNDEBUG -fno-omit-frame-pointer -march=armv7-a -mthumb -std=c++11 -pthread -lrt -o tunsafe util.cpp wireguard_config.cpp wireguard.cpp ip_to_peer_map.cpp tunsafe_threading.cpp \ -wireguard_proto.cpp network_bsd.cpp network_bsd_common.cpp tunsafe_cpu.cpp benchmark.cpp crypto/blake2s.cpp crypto/chacha20poly1305.cpp \ -crypto/curve25519-donna.cpp crypto/siphash.cpp crypto/aesgcm/aesgcm.cpp ipzip2/ipzip2.cpp \ -crypto/chacha20/chacha20-arm.preprocessed.s crypto/poly1305/poly1305-arm.preprocessed.s +g++-6 -mfpu=neon -I . -g -O2 -DNDEBUG -DWITH_NETWORK_BSD=1 -fno-omit-frame-pointer -march=armv7-a -mthumb -std=c++11 -pthread -lrt -o tunsafe \ +tunsafe_amalgam.cpp \ +crypto/chacha20/chacha20-arm-linux.S \ +crypto/poly1305/poly1305-arm-linux.S \ diff --git a/build_osx.sh b/build_osx.sh index 3e905f9..202d83c 100755 --- a/build_osx.sh +++ b/build_osx.sh @@ -1,14 +1,14 @@ set -e +clang++ -c -mavx512f -mavx512vl crypto/poly1305/poly1305-x64-osx.s crypto/chacha20/chacha20-x64-osx.s -clang++ -c -mavx512f -mavx512vl crypto/poly1305_x64_gas_macosx.s crypto/chacha20_x64_gas_macosx.s - -clang++ -g -O3 -I . -std=c++11 -DNDEBUG=1 -Wno-deprecated-declarations -fno-exceptions -fno-rtti -ffunction-sections -o tunsafe \ -wireguard_config.cpp ip_to_peer_map.cpp tunsafe_threading.cpp wireguard.cpp wireguard_proto.cpp ts.cpp util.cpp network_bsd.cpp network_bsd_common.cpp benchmark.cpp tunsafe_cpu.cpp \ -crypto/blake2s.cpp crypto/blake2s_sse.cpp crypto/chacha20poly1305.cpp crypto/curve25519-donna.cpp \ -crypto/siphash.cpp crypto/aesgcm/aesgcm.cpp ipzip2/ipzip2.cpp \ -crypto/aesgcm/aesni_gcm_x64_gas_macosx.s crypto/aesgcm/aesni_x64_gas_macosx.s crypto/aesgcm/ghash_x64_gas_macosx.s \ -chacha20_x64_gas_macosx.o poly1305_x64_gas_macosx.o +clang++ -g -O3 -I . -std=c++11 -DWITH_NETWORK_BSD=1 -DNDEBUG=1 -Wno-deprecated-declarations -fno-exceptions -fno-rtti -ffunction-sections -o tunsafe \ +tunsafe_amalgam.cpp \ +crypto/aesgcm/aesni_gcm-x64-osx.s \ +crypto/aesgcm/aesni-x64-osx.s \ +crypto/aesgcm/ghash-x64-osx.s \ +chacha20-x64-osx.o \ +poly1305-x64-osx.o cp tunsafe tunsafe.unstripped strip tunsafe diff --git a/crypto/aesgcm/aesni_x64_gas.s b/crypto/aesgcm/aesni-x64-linux.s similarity index 100% rename from crypto/aesgcm/aesni_x64_gas.s rename to crypto/aesgcm/aesni-x64-linux.s diff --git a/crypto/aesgcm/aesni_x64_gas_macosx.s b/crypto/aesgcm/aesni-x64-osx.s similarity index 100% rename from crypto/aesgcm/aesni_x64_gas_macosx.s rename to crypto/aesgcm/aesni-x64-osx.s diff --git a/crypto/aesgcm/aesni_x64_nasm.asm b/crypto/aesgcm/aesni-x64-win.asm similarity index 100% rename from crypto/aesgcm/aesni_x64_nasm.asm rename to crypto/aesgcm/aesni-x64-win.asm diff --git a/crypto/aesgcm/aesni-x86_64.pl b/crypto/aesgcm/aesni-x64.pl similarity index 99% rename from crypto/aesgcm/aesni-x86_64.pl rename to crypto/aesgcm/aesni-x64.pl index 252c485..20aa364 100644 --- a/crypto/aesgcm/aesni-x86_64.pl +++ b/crypto/aesgcm/aesni-x64.pl @@ -203,7 +203,7 @@ $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or -( $xlate="${dir}../x86_64-xlate.pl" and -f $xlate) or +( $xlate="${dir}../tools/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; diff --git a/crypto/aesgcm/aesni_gcm_x64_gas.s b/crypto/aesgcm/aesni_gcm-x64-linux.s similarity index 100% rename from crypto/aesgcm/aesni_gcm_x64_gas.s rename to crypto/aesgcm/aesni_gcm-x64-linux.s diff --git a/crypto/aesgcm/aesni_gcm_x64_gas_macosx.s b/crypto/aesgcm/aesni_gcm-x64-osx.s similarity index 100% rename from crypto/aesgcm/aesni_gcm_x64_gas_macosx.s rename to crypto/aesgcm/aesni_gcm-x64-osx.s diff --git a/crypto/aesgcm/aesni_gcm_x64_nasm.asm b/crypto/aesgcm/aesni_gcm-x64-win.asm similarity index 100% rename from crypto/aesgcm/aesni_gcm_x64_nasm.asm rename to crypto/aesgcm/aesni_gcm-x64-win.asm diff --git a/crypto/aesgcm/aesni-gcm-x86_64.pl b/crypto/aesgcm/aesni_gcm-x64.pl similarity index 99% rename from crypto/aesgcm/aesni-gcm-x86_64.pl rename to crypto/aesgcm/aesni_gcm-x64.pl index f1607c7..52fe5c4 100644 --- a/crypto/aesgcm/aesni-gcm-x86_64.pl +++ b/crypto/aesgcm/aesni_gcm-x64.pl @@ -50,7 +50,7 @@ $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or -( $xlate="${dir}../x86_64-xlate.pl" and -f $xlate) or +( $xlate="${dir}../tools/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; # |$avx| in ghash-x86_64.pl must be set to at least 1; otherwise tags will diff --git a/crypto/aesgcm/ghash_x64_gas.s b/crypto/aesgcm/ghash-x64-linux.s similarity index 100% rename from crypto/aesgcm/ghash_x64_gas.s rename to crypto/aesgcm/ghash-x64-linux.s diff --git a/crypto/aesgcm/ghash_x64_gas_macosx.s b/crypto/aesgcm/ghash-x64-osx.s similarity index 100% rename from crypto/aesgcm/ghash_x64_gas_macosx.s rename to crypto/aesgcm/ghash-x64-osx.s diff --git a/crypto/aesgcm/ghash_x64_nasm.asm b/crypto/aesgcm/ghash-x64-win.asm similarity index 100% rename from crypto/aesgcm/ghash_x64_nasm.asm rename to crypto/aesgcm/ghash-x64-win.asm diff --git a/crypto/aesgcm/ghash-x86_64.pl b/crypto/aesgcm/ghash-x64.pl similarity index 99% rename from crypto/aesgcm/ghash-x86_64.pl rename to crypto/aesgcm/ghash-x64.pl index ad94168..0eb80be 100644 --- a/crypto/aesgcm/ghash-x86_64.pl +++ b/crypto/aesgcm/ghash-x64.pl @@ -99,7 +99,7 @@ $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or -( $xlate="${dir}../x86_64-xlate.pl" and -f $xlate) or +( $xlate="${dir}../tools/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; # See the notes about |$avx| in aesni-gcm-x86_64.pl; otherwise tags will be diff --git a/crypto/aesgcm/make.sh b/crypto/aesgcm/make.sh new file mode 100644 index 0000000..0de76ca --- /dev/null +++ b/crypto/aesgcm/make.sh @@ -0,0 +1,10 @@ +#!/bin/sh + +perl aesni_gcm-x64.pl macosx > aesni_gcm-x64-osx.s +perl aesni-x64.pl macosx > aesni-x64-osx.s +perl ghash-x64.pl macosx > ghash-x64-osx.s + +perl aesni_gcm-x64.pl gas > aesni_gcm-x64-linux.s +perl aesni-x64.pl gas > aesni-x64-linux.s +perl ghash-x64.pl gas > ghash-x64-linux.s + diff --git a/crypto/blake2s-load-sse2.h b/crypto/blake2s/blake2s-load-sse2.h similarity index 100% rename from crypto/blake2s-load-sse2.h rename to crypto/blake2s/blake2s-load-sse2.h diff --git a/crypto/blake2s-load-sse41.h b/crypto/blake2s/blake2s-load-sse41.h similarity index 100% rename from crypto/blake2s-load-sse41.h rename to crypto/blake2s/blake2s-load-sse41.h diff --git a/crypto/blake2s-load-xop.h b/crypto/blake2s/blake2s-load-xop.h similarity index 100% rename from crypto/blake2s-load-xop.h rename to crypto/blake2s/blake2s-load-xop.h diff --git a/crypto/blake2s-round.h b/crypto/blake2s/blake2s-round.h similarity index 83% rename from crypto/blake2s-round.h rename to crypto/blake2s/blake2s-round.h index 44a5574..b9c52c1 100644 --- a/crypto/blake2s-round.h +++ b/crypto/blake2s/blake2s-round.h @@ -39,7 +39,7 @@ #endif -#define G1(row1,row2,row3,row4,buf) \ +#define BLAKE2S_G1(row1,row2,row3,row4,buf) \ row1 = _mm_add_epi32( _mm_add_epi32( row1, buf), row2 ); \ row4 = _mm_xor_si128( row4, row1 ); \ row4 = _mm_roti_epi32(row4, -16); \ @@ -47,7 +47,7 @@ row2 = _mm_xor_si128( row2, row3 ); \ row2 = _mm_roti_epi32(row2, -12); -#define G2(row1,row2,row3,row4,buf) \ +#define BLAKE2S_G2(row1,row2,row3,row4,buf) \ row1 = _mm_add_epi32( _mm_add_epi32( row1, buf), row2 ); \ row4 = _mm_xor_si128( row4, row1 ); \ row4 = _mm_roti_epi32(row4, -8); \ @@ -55,12 +55,12 @@ row2 = _mm_xor_si128( row2, row3 ); \ row2 = _mm_roti_epi32(row2, -7); -#define DIAGONALIZE(row1,row2,row3,row4) \ +#define BLAKE2S_DIAGONALIZE(row1,row2,row3,row4) \ row4 = _mm_shuffle_epi32( row4, _MM_SHUFFLE(2,1,0,3) ); \ row3 = _mm_shuffle_epi32( row3, _MM_SHUFFLE(1,0,3,2) ); \ row2 = _mm_shuffle_epi32( row2, _MM_SHUFFLE(0,3,2,1) ); -#define UNDIAGONALIZE(row1,row2,row3,row4) \ +#define BLAKE2S_UNDIAGONALIZE(row1,row2,row3,row4) \ row4 = _mm_shuffle_epi32( row4, _MM_SHUFFLE(0,3,2,1) ); \ row3 = _mm_shuffle_epi32( row3, _MM_SHUFFLE(1,0,3,2) ); \ row2 = _mm_shuffle_epi32( row2, _MM_SHUFFLE(2,1,0,3) ); @@ -73,16 +73,16 @@ #include "blake2s-load-sse2.h" #endif -#define ROUND(r) \ +#define BLAKE2S_ROUND_SSE(r) \ LOAD_MSG_ ##r ##_1(buf1); \ - G1(row1,row2,row3,row4,buf1); \ + BLAKE2S_G1(row1,row2,row3,row4,buf1); \ LOAD_MSG_ ##r ##_2(buf2); \ - G2(row1,row2,row3,row4,buf2); \ - DIAGONALIZE(row1,row2,row3,row4); \ + BLAKE2S_G2(row1,row2,row3,row4,buf2); \ + BLAKE2S_DIAGONALIZE(row1,row2,row3,row4); \ LOAD_MSG_ ##r ##_3(buf3); \ - G1(row1,row2,row3,row4,buf3); \ + BLAKE2S_G1(row1,row2,row3,row4,buf3); \ LOAD_MSG_ ##r ##_4(buf4); \ - G2(row1,row2,row3,row4,buf4); \ - UNDIAGONALIZE(row1,row2,row3,row4); \ + BLAKE2S_G2(row1,row2,row3,row4,buf4); \ + BLAKE2S_UNDIAGONALIZE(row1,row2,row3,row4); \ #endif diff --git a/crypto/blake2s/blake2s-sse-impl.h b/crypto/blake2s/blake2s-sse-impl.h new file mode 100644 index 0000000..a1cb512 --- /dev/null +++ b/crypto/blake2s/blake2s-sse-impl.h @@ -0,0 +1,85 @@ +/* + BLAKE2 reference source code package - optimized C implementations + + Copyright 2012, Samuel Neves . You may use this under the + terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at + your option. The terms of these licenses can be found at: + + - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0 + - OpenSSL license : https://www.openssl.org/source/license.html + - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0 + + More information about the BLAKE2 hash function can be found at + https://blake2.net. +*/ + +#include +#if defined(HAVE_SSSE3) +#include +#endif +#if defined(HAVE_SSE41) +#include +#endif +#if defined(HAVE_AVX) +#include +#endif +#if defined(HAVE_XOP) +#include +#endif + +#include "blake2s-round.h" + +void blake2s_compress_sse( blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES] ) { + __m128i row1, row2, row3, row4; + __m128i buf1, buf2, buf3, buf4; +#if defined(HAVE_SSE41) + __m128i t0, t1; +#if !defined(HAVE_XOP) + __m128i t2; +#endif +#endif + __m128i ff0, ff1; +#if defined(HAVE_SSSE3) && !defined(HAVE_XOP) + const __m128i r8 = _mm_set_epi8( 12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1 ); + const __m128i r16 = _mm_set_epi8( 13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2 ); +#endif +#if defined(HAVE_SSE41) + const __m128i m0 = LOADU( block + 00 ); + const __m128i m1 = LOADU( block + 16 ); + const __m128i m2 = LOADU( block + 32 ); + const __m128i m3 = LOADU( block + 48 ); +#else + const uint32_t m0 = load32(block + 0 * sizeof(uint32_t)); + const uint32_t m1 = load32(block + 1 * sizeof(uint32_t)); + const uint32_t m2 = load32(block + 2 * sizeof(uint32_t)); + const uint32_t m3 = load32(block + 3 * sizeof(uint32_t)); + const uint32_t m4 = load32(block + 4 * sizeof(uint32_t)); + const uint32_t m5 = load32(block + 5 * sizeof(uint32_t)); + const uint32_t m6 = load32(block + 6 * sizeof(uint32_t)); + const uint32_t m7 = load32(block + 7 * sizeof(uint32_t)); + const uint32_t m8 = load32(block + 8 * sizeof(uint32_t)); + const uint32_t m9 = load32(block + 9 * sizeof(uint32_t)); + const uint32_t m10 = load32(block + 10 * sizeof(uint32_t)); + const uint32_t m11 = load32(block + 11 * sizeof(uint32_t)); + const uint32_t m12 = load32(block + 12 * sizeof(uint32_t)); + const uint32_t m13 = load32(block + 13 * sizeof(uint32_t)); + const uint32_t m14 = load32(block + 14 * sizeof(uint32_t)); + const uint32_t m15 = load32(block + 15 * sizeof(uint32_t)); +#endif + row1 = ff0 = LOADU( &S->h[0] ); + row2 = ff1 = LOADU( &S->h[4] ); + row3 = _mm_loadu_si128( (__m128i const *)&blake2s_IV[0] ); + row4 = _mm_xor_si128( _mm_loadu_si128( (__m128i const *)&blake2s_IV[4] ), LOADU( &S->t[0] ) ); + BLAKE2S_ROUND_SSE( 0 ); + BLAKE2S_ROUND_SSE( 1 ); + BLAKE2S_ROUND_SSE( 2 ); + BLAKE2S_ROUND_SSE( 3 ); + BLAKE2S_ROUND_SSE( 4 ); + BLAKE2S_ROUND_SSE( 5 ); + BLAKE2S_ROUND_SSE( 6 ); + BLAKE2S_ROUND_SSE( 7 ); + BLAKE2S_ROUND_SSE( 8 ); + BLAKE2S_ROUND_SSE( 9 ); + STOREU( &S->h[0], _mm_xor_si128( ff0, _mm_xor_si128( row1, row3 ) ) ); + STOREU( &S->h[4], _mm_xor_si128( ff1, _mm_xor_si128( row2, row4 ) ) ); +} diff --git a/crypto/blake2s.cpp b/crypto/blake2s/blake2s.cpp old mode 100755 new mode 100644 similarity index 99% rename from crypto/blake2s.cpp rename to crypto/blake2s/blake2s.cpp index 540407d..840720e --- a/crypto/blake2s.cpp +++ b/crypto/blake2s/blake2s.cpp @@ -26,8 +26,6 @@ https://blake2.net. #define BLAKE2S_WITH_ASM 1 #endif // BLAKE2S_WITH_ASM -void blake2s_compress_sse(blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES]); - #if !defined(__cplusplus) && (!defined(__STDC_VERSION__) || __STDC_VERSION__ < 199901L) #if defined(_MSC_VER) #define BLAKE2_INLINE __inline @@ -246,6 +244,12 @@ static void blake2s_compress(blake2s_state *S, const uint8_t in[BLAKE2S_BLOCKBYT #undef G #undef ROUND + +#if defined(ARCH_CPU_X86_FAMILY) +#include "blake2s-sse-impl.h" +#endif + + static inline void blake2s_compress_impl(blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES]) { #if defined(ARCH_CPU_X86_64) && BLAKE2S_WITH_ASM blake2s_compress_sse(S, block); diff --git a/crypto/blake2s.h b/crypto/blake2s/blake2s.h similarity index 100% rename from crypto/blake2s.h rename to crypto/blake2s/blake2s.h diff --git a/crypto/blake2s_sse.cpp b/crypto/blake2s_sse.cpp deleted file mode 100755 index 2527f24..0000000 --- a/crypto/blake2s_sse.cpp +++ /dev/null @@ -1,399 +0,0 @@ -/* - BLAKE2 reference source code package - optimized C implementations - - Copyright 2012, Samuel Neves . You may use this under the - terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at - your option. The terms of these licenses can be found at: - - - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0 - - OpenSSL license : https://www.openssl.org/source/license.html - - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0 - - More information about the BLAKE2 hash function can be found at - https://blake2.net. -*/ -#include "stdafx.h" -#include -#include -#include - -#include "blake2s.h" -#include "crypto_ops.h" - -#include -#if defined(HAVE_SSSE3) -#include -#endif -#if defined(HAVE_SSE41) -#include -#endif -#if defined(HAVE_AVX) -#include -#endif -#if defined(HAVE_XOP) -#include -#endif - -#include "blake2s-round.h" - -#if !defined(__cplusplus) && (!defined(__STDC_VERSION__) || __STDC_VERSION__ < 199901L) -#if defined(_MSC_VER) -#define BLAKE2_INLINE __inline -#elif defined(__GNUC__) -#define BLAKE2_INLINE __inline__ -#else -#define BLAKE2_INLINE -#endif -#else -#define BLAKE2_INLINE inline -#endif - -static BLAKE2_INLINE uint32_t load32(const void *src) { -#if defined(ARCH_CPU_LITTLE_ENDIAN) - uint32_t w; - memcpy(&w, src, sizeof w); - return w; -#else - const uint8_t *p = (const uint8_t *)src; - return ((uint32_t)(p[0]) << 0) | - ((uint32_t)(p[1]) << 8) | - ((uint32_t)(p[2]) << 16) | - ((uint32_t)(p[3]) << 24); -#endif -} - -static BLAKE2_INLINE void store32(void *dst, uint32_t w) { -#if defined(ARCH_CPU_LITTLE_ENDIAN) - memcpy(dst, &w, sizeof w); -#else - uint8_t *p = (uint8_t *)dst; - p[0] = (uint8_t)(w >> 0); - p[1] = (uint8_t)(w >> 8); - p[2] = (uint8_t)(w >> 16); - p[3] = (uint8_t)(w >> 24); -#endif -} - - - -static const uint32_t blake2s_IV[8] = -{ - 0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL, - 0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL -}; - -/* Some helper functions */ -static void blake2s_set_lastnode( blake2s_state *S ) -{ - S->f[1] = (uint32_t)-1; -} - -static int blake2s_is_lastblock( const blake2s_state *S ) -{ - return S->f[0] != 0; -} - -static void blake2s_set_lastblock( blake2s_state *S ) -{ - if( S->last_node ) blake2s_set_lastnode( S ); - - S->f[0] = (uint32_t)-1; -} - -static void blake2s_increment_counter( blake2s_state *S, const uint32_t inc ) -{ - uint64_t t = ( ( uint64_t )S->t[1] << 32 ) | S->t[0]; - t += inc; - S->t[0] = ( uint32_t )( t >> 0 ); - S->t[1] = ( uint32_t )( t >> 32 ); -} - -/* init2 xors IV with input parameter block */ -#if 0 -void blake2s_init_param( blake2s_state *S, const blake2s_param *P ) -{ - size_t i; - /*blake2s_init0( S ); */ - const uint8_t * v = ( const uint8_t * )( blake2s_IV ); - const uint8_t * p = ( const uint8_t * )( P ); - uint8_t * h = ( uint8_t * )( S->h ); - /* IV XOR ParamBlock */ - memset( S, 0, sizeof( blake2s_state ) ); - - for( i = 0; i < BLAKE2S_OUTBYTES; ++i ) h[i] = v[i] ^ p[i]; - - S->outlen = P->digest_length; -} - -/* Some sort of default parameter block initialization, for sequential blake2s */ -void blake2s_init( blake2s_state *S, size_t outlen ) -{ - blake2s_param P[1]; - assert(outlen && outlen <= BLAKE2S_OUTBYTES); - - P->digest_length = (uint8_t)outlen; - P->key_length = 0; - P->fanout = 1; - P->depth = 1; - store32( &P->leaf_length, 0 ); - store32( &P->node_offset, 0 ); - store16( &P->xof_length, 0 ); - P->node_depth = 0; - P->inner_length = 0; - /* memset(P->reserved, 0, sizeof(P->reserved) ); */ - memset( P->salt, 0, sizeof( P->salt ) ); - memset( P->personal, 0, sizeof( P->personal ) ); - - blake2s_init_param( S, P ); -} - -int blake2s_init_key( blake2s_state *S, size_t outlen, const void *key, size_t keylen ) -{ - blake2s_param P[1]; - - /* Move interval verification here? */ - if ( ( !outlen ) || ( outlen > BLAKE2S_OUTBYTES ) ) return -1; - - if ( ( !key ) || ( !keylen ) || keylen > BLAKE2S_KEYBYTES ) return -1; - - P->digest_length = (uint8_t)outlen; - P->key_length = (uint8_t)keylen; - P->fanout = 1; - P->depth = 1; - store32( &P->leaf_length, 0 ); - store32( &P->node_offset, 0 ); - store16( &P->xof_length, 0 ); - P->node_depth = 0; - P->inner_length = 0; - /* memset(P->reserved, 0, sizeof(P->reserved) ); */ - memset( P->salt, 0, sizeof( P->salt ) ); - memset( P->personal, 0, sizeof( P->personal ) ); - - if( blake2s_init_param( S, P ) < 0 ) - return -1; - - { - uint8_t block[BLAKE2S_BLOCKBYTES]; - memset( block, 0, BLAKE2S_BLOCKBYTES ); - memcpy( block, key, keylen ); - blake2s_update( S, block, BLAKE2S_BLOCKBYTES ); - memzero_crypto( block, BLAKE2S_BLOCKBYTES ); /* Burn the key from stack */ - } - return 0; -} -#endif - - -void blake2s_compress_sse( blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES] ) -{ - __m128i row1, row2, row3, row4; - __m128i buf1, buf2, buf3, buf4; -#if defined(HAVE_SSE41) - __m128i t0, t1; -#if !defined(HAVE_XOP) - __m128i t2; -#endif -#endif - __m128i ff0, ff1; -#if defined(HAVE_SSSE3) && !defined(HAVE_XOP) - const __m128i r8 = _mm_set_epi8( 12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1 ); - const __m128i r16 = _mm_set_epi8( 13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2 ); -#endif -#if defined(HAVE_SSE41) - const __m128i m0 = LOADU( block + 00 ); - const __m128i m1 = LOADU( block + 16 ); - const __m128i m2 = LOADU( block + 32 ); - const __m128i m3 = LOADU( block + 48 ); -#else - const uint32_t m0 = load32(block + 0 * sizeof(uint32_t)); - const uint32_t m1 = load32(block + 1 * sizeof(uint32_t)); - const uint32_t m2 = load32(block + 2 * sizeof(uint32_t)); - const uint32_t m3 = load32(block + 3 * sizeof(uint32_t)); - const uint32_t m4 = load32(block + 4 * sizeof(uint32_t)); - const uint32_t m5 = load32(block + 5 * sizeof(uint32_t)); - const uint32_t m6 = load32(block + 6 * sizeof(uint32_t)); - const uint32_t m7 = load32(block + 7 * sizeof(uint32_t)); - const uint32_t m8 = load32(block + 8 * sizeof(uint32_t)); - const uint32_t m9 = load32(block + 9 * sizeof(uint32_t)); - const uint32_t m10 = load32(block + 10 * sizeof(uint32_t)); - const uint32_t m11 = load32(block + 11 * sizeof(uint32_t)); - const uint32_t m12 = load32(block + 12 * sizeof(uint32_t)); - const uint32_t m13 = load32(block + 13 * sizeof(uint32_t)); - const uint32_t m14 = load32(block + 14 * sizeof(uint32_t)); - const uint32_t m15 = load32(block + 15 * sizeof(uint32_t)); -#endif - row1 = ff0 = LOADU( &S->h[0] ); - row2 = ff1 = LOADU( &S->h[4] ); - row3 = _mm_loadu_si128( (__m128i const *)&blake2s_IV[0] ); - row4 = _mm_xor_si128( _mm_loadu_si128( (__m128i const *)&blake2s_IV[4] ), LOADU( &S->t[0] ) ); - ROUND( 0 ); - ROUND( 1 ); - ROUND( 2 ); - ROUND( 3 ); - ROUND( 4 ); - ROUND( 5 ); - ROUND( 6 ); - ROUND( 7 ); - ROUND( 8 ); - ROUND( 9 ); - STOREU( &S->h[0], _mm_xor_si128( ff0, _mm_xor_si128( row1, row3 ) ) ); - STOREU( &S->h[4], _mm_xor_si128( ff1, _mm_xor_si128( row2, row4 ) ) ); -} - -#if 0 -int blake2s_update( blake2s_state *S, const void *pin, size_t inlen ) -{ - const unsigned char * in = (const unsigned char *)pin; - if( inlen > 0 ) - { - size_t left = S->buflen; - size_t fill = BLAKE2S_BLOCKBYTES - left; - if( inlen > fill ) - { - S->buflen = 0; - memcpy( S->buf + left, in, fill ); /* Fill buffer */ - blake2s_increment_counter( S, BLAKE2S_BLOCKBYTES ); - blake2s_compress( S, S->buf ); /* Compress */ - in += fill; inlen -= fill; - while(inlen > BLAKE2S_BLOCKBYTES) { - blake2s_increment_counter(S, BLAKE2S_BLOCKBYTES); - blake2s_compress( S, in ); - in += BLAKE2S_BLOCKBYTES; - inlen -= BLAKE2S_BLOCKBYTES; - } - } - memcpy( S->buf + S->buflen, in, inlen ); - S->buflen += inlen; - } - return 0; -} - -int blake2s_final( blake2s_state *S, void *out, size_t outlen ) -{ - uint8_t buffer[BLAKE2S_OUTBYTES] = {0}; - size_t i; - - if( out == NULL || outlen < S->outlen ) - return -1; - - if( blake2s_is_lastblock( S ) ) - return -1; - - blake2s_increment_counter( S, (uint32_t)S->buflen ); - blake2s_set_lastblock( S ); - memset( S->buf + S->buflen, 0, BLAKE2S_BLOCKBYTES - S->buflen ); /* Padding */ - blake2s_compress( S, S->buf ); - - for( i = 0; i < 8; ++i ) /* Output full hash to temp buffer */ - store32( buffer + sizeof( S->h[i] ) * i, S->h[i] ); - - memcpy( out, buffer, S->outlen ); - memzero_crypto( buffer, sizeof(buffer) ); - return 0; -} - -/* inlen, at least, should be uint64_t. Others can be size_t. */ -int blake2s( void *out, size_t outlen, const void *in, size_t inlen, const void *key, size_t keylen ) -{ - blake2s_state S[1]; - - /* Verify parameters */ - if ( NULL == in && inlen > 0 ) return -1; - - if ( NULL == out ) return -1; - - if ( NULL == key && keylen > 0) return -1; - - if( !outlen || outlen > BLAKE2S_OUTBYTES ) return -1; - - if( keylen > BLAKE2S_KEYBYTES ) return -1; - - if( keylen > 0 ) - { - if( blake2s_init_key( S, outlen, key, keylen ) < 0 ) return -1; - } - else - { - if( blake2s_init( S, outlen ) < 0 ) return -1; - } - - blake2s_update( S, ( const uint8_t * )in, inlen ); - blake2s_final( S, out, outlen ); - return 0; -} -#endif - -#if defined(SUPERCOP) -int crypto_hash( unsigned char *out, unsigned char *in, unsigned long long inlen ) -{ - return blake2s( out, BLAKE2S_OUTBYTES, in, inlen, NULL, 0 ); -} -#endif - -#if defined(BLAKE2S_SELFTEST) -#include -#include "blake2-kat.h" -int main( void ) -{ - uint8_t key[BLAKE2S_KEYBYTES]; - uint8_t buf[BLAKE2_KAT_LENGTH]; - size_t i, step; - - for( i = 0; i < BLAKE2S_KEYBYTES; ++i ) - key[i] = ( uint8_t )i; - - for( i = 0; i < BLAKE2_KAT_LENGTH; ++i ) - buf[i] = ( uint8_t )i; - - /* Test simple API */ - for( i = 0; i < BLAKE2_KAT_LENGTH; ++i ) - { - uint8_t hash[BLAKE2S_OUTBYTES]; - blake2s( hash, BLAKE2S_OUTBYTES, buf, i, key, BLAKE2S_KEYBYTES ); - - if( 0 != memcmp( hash, blake2s_keyed_kat[i], BLAKE2S_OUTBYTES ) ) - { - goto fail; - } - } - - /* Test streaming API */ - for(step = 1; step < BLAKE2S_BLOCKBYTES; ++step) { - for (i = 0; i < BLAKE2_KAT_LENGTH; ++i) { - uint8_t hash[BLAKE2S_OUTBYTES]; - blake2s_state S; - uint8_t * p = buf; - size_t mlen = i; - int err = 0; - - if( (err = blake2s_init_key(&S, BLAKE2S_OUTBYTES, key, BLAKE2S_KEYBYTES)) < 0 ) { - goto fail; - } - - while (mlen >= step) { - if ( (err = blake2s_update(&S, p, step)) < 0 ) { - goto fail; - } - mlen -= step; - p += step; - } - if ( (err = blake2s_update(&S, p, mlen)) < 0) { - goto fail; - } - if ( (err = blake2s_final(&S, hash, BLAKE2S_OUTBYTES)) < 0) { - goto fail; - } - - if (0 != memcmp(hash, blake2s_keyed_kat[i], BLAKE2S_OUTBYTES)) { - goto fail; - } - } - } - - puts( "ok" ); - return 0; -fail: - puts("error"); - return -1; -} -#endif diff --git a/crypto/chacha20/chacha20-arm.s b/crypto/chacha20/chacha20-arm-linux.S similarity index 99% rename from crypto/chacha20/chacha20-arm.s rename to crypto/chacha20/chacha20-arm-linux.S index 2e22fd1..7721cea 100644 --- a/crypto/chacha20/chacha20-arm.s +++ b/crypto/chacha20/chacha20-arm-linux.S @@ -6,6 +6,8 @@ /*#include */ +#define __ARM_ARCH__ 7 + .text #if defined(__thumb2__) || defined(__clang__) .syntax unified diff --git a/crypto/chacha20/chacha20-arm.pl b/crypto/chacha20/chacha20-arm.pl index cec1b89..d75e4bb 100644 --- a/crypto/chacha20/chacha20-arm.pl +++ b/crypto/chacha20/chacha20-arm.pl @@ -41,7 +41,7 @@ else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } if ($flavour && $flavour ne "void") { $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}../arm-xlate.pl" and -f $xlate ) or - ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + ( $xlate="${dir}../tools/arm-xlate.pl" and -f $xlate) or die "can't locate arm-xlate.pl"; open STDOUT,"| \"$^X\" $xlate $flavour $output"; diff --git a/crypto/chacha20/chacha20-arm64.s b/crypto/chacha20/chacha20-arm64-ios.S similarity index 96% rename from crypto/chacha20/chacha20-arm64.s rename to crypto/chacha20/chacha20-arm64-ios.S index c3d1243..2e92a93 100644 --- a/crypto/chacha20/chacha20-arm64.s +++ b/crypto/chacha20/chacha20-arm64-ios.S @@ -1,26 +1,23 @@ -/* SPDX-License-Identifier: OpenSSL OR (BSD-3-Clause OR GPL-2.0) - * - * Copyright (C) 2015-2018 Jason A. Donenfeld . All Rights Reserved. - * Copyright 2016 The OpenSSL Project Authors. All Rights Reserved. - */ - -#include - .text + .align 5 -.Lsigma: +Lsigma: .quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral -.Lone: +Lone: .long 1,0,0,0 +.globl _chacha20_arm +.globl _chacha20_neon + + .align 5 -ENTRY(chacha20_arm) - cbz x2,.Labort -.Lshort: +_chacha20_arm: + cbz x2,Labort +Lshort: stp x29,x30,[sp,#-96]! add x29,sp,#0 - adr x5,.Lsigma + adr x5,Lsigma stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] stp x23,x24,[sp,#48] @@ -41,7 +38,7 @@ ENTRY(chacha20_arm) ror x30,x30,#32 #endif -.Loop_outer: +Loop_outer: mov w5,w22 // unpack key block lsr x6,x22,#32 mov w7,w23 @@ -61,7 +58,7 @@ ENTRY(chacha20_arm) mov x4,#10 subs x2,x2,#64 -.Loop: +Loop: sub x4,x4,#1 add w5,w5,w9 add w6,w6,w10 @@ -159,7 +156,7 @@ ENTRY(chacha20_arm) ror w11,w11,#25 ror w12,w12,#25 ror w9,w9,#25 - cbnz x4,.Loop + cbnz x4,Loop add w5,w5,w22 // accumulate key block add x6,x6,x22,lsr#32 @@ -178,7 +175,7 @@ ENTRY(chacha20_arm) add w20,w20,w30 add x21,x21,x30,lsr#32 - b.lo .Ltail + b.lo Ltail add x5,x5,x6,lsl#32 // pack add x7,x7,x8,lsl#32 @@ -219,7 +216,7 @@ ENTRY(chacha20_arm) stp x17,x20,[x0,#48] add x0,x0,#64 - b.hi .Loop_outer + b.hi Loop_outer ldp x19,x20,[x29,#16] add sp,sp,#64 @@ -228,13 +225,13 @@ ENTRY(chacha20_arm) ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldp x29,x30,[sp],#96 -.Labort: +Labort: ret .align 4 -.Ltail: +Ltail: add x2,x2,#64 -.Less_than_64: +Less_than_64: sub x0,x0,#1 add x1,x1,x2 add x0,x0,x2 @@ -264,13 +261,13 @@ ENTRY(chacha20_arm) stp x13,x15,[sp,#32] stp x17,x20,[sp,#48] -.Loop_tail: +Loop_tail: ldrb w10,[x1,x2] ldrb w11,[x4,x2] add x2,x2,#1 eor w10,w10,w11 strb w10,[x0,x2] - cbnz x2,.Loop_tail + cbnz x2,Loop_tail stp xzr,xzr,[sp,#0] stp xzr,xzr,[sp,#16] @@ -285,25 +282,26 @@ ENTRY(chacha20_arm) ldp x27,x28,[x29,#80] ldp x29,x30,[sp],#96 ret -ENDPROC(chacha20_arm) + + .align 5 -ENTRY(chacha20_neon) - cbz x2,.Labort_neon +_chacha20_neon: + cbz x2,Labort_neon cmp x2,#192 - b.lo .Lshort + b.lo Lshort stp x29,x30,[sp,#-96]! add x29,sp,#0 - adr x5,.Lsigma + adr x5,Lsigma stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] stp x23,x24,[sp,#48] stp x25,x26,[sp,#64] stp x27,x28,[sp,#80] cmp x2,#512 - b.hs .L512_or_more_neon + b.hs L512_or_more_neon sub sp,sp,#64 @@ -329,7 +327,7 @@ ENTRY(chacha20_neon) add v29.4s,v28.4s,v31.4s shl v31.4s,v31.4s,#2 // 1 -> 4 -.Loop_outer_neon: +Loop_outer_neon: mov w5,w22 // unpack key block lsr x6,x22,#32 mov v0.16b,v24.16b @@ -361,7 +359,7 @@ ENTRY(chacha20_neon) mov x4,#10 subs x2,x2,#256 -.Loop_neon: +Loop_neon: sub x4,x4,#1 add v0.4s,v0.4s,v1.4s add w5,w5,w9 @@ -567,7 +565,7 @@ ENTRY(chacha20_neon) ext v1.16b,v1.16b,v1.16b,#12 ext v5.16b,v5.16b,v5.16b,#12 ext v17.16b,v17.16b,v17.16b,#12 - cbnz x4,.Loop_neon + cbnz x4,Loop_neon add w5,w5,w22 // accumulate key block add v0.4s,v0.4s,v24.4s @@ -598,7 +596,7 @@ ENTRY(chacha20_neon) add x21,x21,x30,lsr#32 add v17.4s,v17.4s,v25.4s - b.lo .Ltail_neon + b.lo Ltail_neon add x5,x5,x6,lsl#32 // pack add x7,x7,x8,lsl#32 @@ -663,7 +661,7 @@ ENTRY(chacha20_neon) eor v19.16b,v19.16b,v3.16b st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 - b.hi .Loop_outer_neon + b.hi Loop_outer_neon ldp x19,x20,[x29,#16] add sp,sp,#64 @@ -674,10 +672,10 @@ ENTRY(chacha20_neon) ldp x29,x30,[sp],#96 ret -.Ltail_neon: +Ltail_neon: add x2,x2,#256 cmp x2,#64 - b.lo .Less_than_64 + b.lo Less_than_64 add x5,x5,x6,lsl#32 // pack add x7,x7,x8,lsl#32 @@ -717,10 +715,10 @@ ENTRY(chacha20_neon) stp x13,x15,[x0,#32] stp x17,x20,[x0,#48] add x0,x0,#64 - b.eq .Ldone_neon + b.eq Ldone_neon sub x2,x2,#64 cmp x2,#64 - b.lo .Less_than_128 + b.lo Less_than_128 ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 eor v0.16b,v0.16b,v20.16b @@ -728,10 +726,10 @@ ENTRY(chacha20_neon) eor v2.16b,v2.16b,v22.16b eor v3.16b,v3.16b,v23.16b st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 - b.eq .Ldone_neon + b.eq Ldone_neon sub x2,x2,#64 cmp x2,#64 - b.lo .Less_than_192 + b.lo Less_than_192 ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 eor v4.16b,v4.16b,v20.16b @@ -739,41 +737,41 @@ ENTRY(chacha20_neon) eor v6.16b,v6.16b,v22.16b eor v7.16b,v7.16b,v23.16b st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 - b.eq .Ldone_neon + b.eq Ldone_neon sub x2,x2,#64 st1 {v16.16b,v17.16b,v18.16b,v19.16b},[sp] - b .Last_neon + b Last_neon -.Less_than_128: +Less_than_128: st1 {v0.16b,v1.16b,v2.16b,v3.16b},[sp] - b .Last_neon -.Less_than_192: + b Last_neon +Less_than_192: st1 {v4.16b,v5.16b,v6.16b,v7.16b},[sp] - b .Last_neon + b Last_neon .align 4 -.Last_neon: +Last_neon: sub x0,x0,#1 add x1,x1,x2 add x0,x0,x2 add x4,sp,x2 neg x2,x2 -.Loop_tail_neon: +Loop_tail_neon: ldrb w10,[x1,x2] ldrb w11,[x4,x2] add x2,x2,#1 eor w10,w10,w11 strb w10,[x0,x2] - cbnz x2,.Loop_tail_neon + cbnz x2,Loop_tail_neon stp xzr,xzr,[sp,#0] stp xzr,xzr,[sp,#16] stp xzr,xzr,[sp,#32] stp xzr,xzr,[sp,#48] -.Ldone_neon: +Ldone_neon: ldp x19,x20,[x29,#16] add sp,sp,#64 ldp x21,x22,[x29,#32] @@ -781,9 +779,11 @@ ENTRY(chacha20_neon) ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldp x29,x30,[sp],#96 +Labort_neon: ret -.L512_or_more_neon: + +L512_or_more_neon: sub sp,sp,#128+64 ldp x22,x23,[x5] // load sigma @@ -819,7 +819,7 @@ ENTRY(chacha20_neon) sub x2,x2,#512 // not typo -.Loop_outer_512_neon: +Loop_outer_512_neon: mov v0.16b,v24.16b mov v4.16b,v24.16b mov v8.16b,v24.16b @@ -865,7 +865,7 @@ ENTRY(chacha20_neon) mov x4,#5 subs x2,x2,#512 -.Loop_upper_neon: +Loop_upper_neon: sub x4,x4,#1 add v0.4s,v0.4s,v1.4s add w5,w5,w9 @@ -1275,7 +1275,7 @@ ENTRY(chacha20_neon) ext v13.16b,v13.16b,v13.16b,#12 ext v17.16b,v17.16b,v17.16b,#12 ext v21.16b,v21.16b,v21.16b,#12 - cbnz x4,.Loop_upper_neon + cbnz x4,Loop_upper_neon add w5,w5,w22 // accumulate key block add x6,x6,x22,lsr#32 @@ -1350,7 +1350,7 @@ ENTRY(chacha20_neon) lsr x21,x30,#32 mov x4,#5 -.Loop_lower_neon: +Loop_lower_neon: sub x4,x4,#1 add v0.4s,v0.4s,v1.4s add w5,w5,w9 @@ -1760,7 +1760,7 @@ ENTRY(chacha20_neon) ext v13.16b,v13.16b,v13.16b,#12 ext v17.16b,v17.16b,v17.16b,#12 ext v21.16b,v21.16b,v21.16b,#12 - cbnz x4,.Loop_lower_neon + cbnz x4,Loop_lower_neon add w5,w5,w22 // accumulate key block ldp q24,q25,[sp,#0] @@ -1896,7 +1896,7 @@ ENTRY(chacha20_neon) add v29.4s,v29.4s,v0.4s add v30.4s,v30.4s,v0.4s - b.hs .Loop_outer_512_neon + b.hs Loop_outer_512_neon adds x2,x2,#512 ushr v0.4s,v31.4s,#2 // 4 -> 1 @@ -1910,14 +1910,14 @@ ENTRY(chacha20_neon) stp q24,q31,[sp,#32] stp q24,q31,[sp,#64] - b.eq .Ldone_512_neon + b.eq Ldone_512_neon cmp x2,#192 sub v27.4s,v27.4s,v0.4s // -= 1 sub v28.4s,v28.4s,v0.4s sub v29.4s,v29.4s,v0.4s add sp,sp,#128 - b.hs .Loop_outer_neon + b.hs Loop_outer_neon eor v25.16b,v25.16b,v25.16b eor v26.16b,v26.16b,v26.16b @@ -1925,9 +1925,9 @@ ENTRY(chacha20_neon) eor v28.16b,v28.16b,v28.16b eor v29.16b,v29.16b,v29.16b eor v30.16b,v30.16b,v30.16b - b .Loop_outer + b Loop_outer -.Ldone_512_neon: +Ldone_512_neon: ldp x19,x20,[x29,#16] add sp,sp,#128+64 ldp x21,x22,[x29,#32] @@ -1935,6 +1935,5 @@ ENTRY(chacha20_neon) ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldp x29,x30,[sp],#96 -.Labort_neon: ret -ENDPROC(chacha20_neon) + diff --git a/crypto/chacha20/chacha20-arm64.pl b/crypto/chacha20/chacha20-arm64.pl index 4a838bc..96dc3b7 100644 --- a/crypto/chacha20/chacha20-arm64.pl +++ b/crypto/chacha20/chacha20-arm64.pl @@ -16,7 +16,7 @@ # # June 2015 # -# ChaCha20 for ARMv8. +# chacha20 for ARMv8. # # Performance in cycles per byte out of large buffer. # @@ -40,7 +40,7 @@ $output=shift; $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or -( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or +( $xlate="${dir}../tools/arm-xlate.pl" and -f $xlate) or die "can't locate arm-xlate.pl"; open OUT,"| \"$^X\" $xlate $flavour $output"; @@ -120,42 +120,21 @@ my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); } $code.=<<___; -#include "arm_arch.h" - .text -.extern OPENSSL_armcap_P - .align 5 .Lsigma: .quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral .Lone: .long 1,0,0,0 -.LOPENSSL_armcap_P: -#ifdef __ILP32__ -.long OPENSSL_armcap_P-. -#else -.quad OPENSSL_armcap_P-. -#endif -.asciz "ChaCha20 for ARMv8, CRYPTOGAMS by " -.globl ChaCha20_ctr32 -.type ChaCha20_ctr32,%function +.globl chacha20_arm +.globl chacha20_neon + +.type chacha20_arm,%function .align 5 -ChaCha20_ctr32: +chacha20_arm: cbz $len,.Labort - adr @x[0],.LOPENSSL_armcap_P - cmp $len,#192 - b.lo .Lshort -#ifdef __ILP32__ - ldrsw @x[1],[@x[0]] -#else - ldr @x[1],[@x[0]] -#endif - ldr w17,[@x[1],@x[0]] - tst w17,#ARMV7_NEON - b.ne ChaCha20_neon - .Lshort: stp x29,x30,[sp,#-96]! add x29,sp,#0 @@ -333,7 +312,7 @@ $code.=<<___; ldp x27,x28,[x29,#80] ldp x29,x30,[sp],#96 ret -.size ChaCha20_ctr32,.-ChaCha20_ctr32 +.size chacha20_arm,.-chacha20_arm ___ {{{ @@ -374,9 +353,13 @@ my ($a,$b,$c,$d,$t)=@_; $code.=<<___; -.type ChaCha20_neon,%function +.type chacha20_neon,%function .align 5 -ChaCha20_neon: +chacha20_neon: + cbz x2,.Labort_neon + cmp x2,#192 + b.lo .Lshort + stp x29,x30,[sp,#-96]! add x29,sp,#0 @@ -684,8 +667,9 @@ $code.=<<___; ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldp x29,x30,[sp],#96 +.Labort_neon: ret -.size ChaCha20_neon,.-ChaCha20_neon +.size chacha20_neon,.-chacha20_neon ___ { my ($T0,$T1,$T2,$T3,$T4,$T5)=@K; @@ -693,18 +677,6 @@ my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2, $A3,$B3,$C3,$D3,$A4,$B4,$C4,$D4,$A5,$B5,$C5,$D5) = map("v$_.4s",(0..23)); $code.=<<___; -.type ChaCha20_512_neon,%function -.align 5 -ChaCha20_512_neon: - stp x29,x30,[sp,#-96]! - add x29,sp,#0 - - adr @x[0],.Lsigma - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] .L512_or_more_neon: sub sp,sp,#128+64 @@ -1115,7 +1087,7 @@ $code.=<<___; ldp x27,x28,[x29,#80] ldp x29,x30,[sp],#96 ret -.size ChaCha20_512_neon,.-ChaCha20_512_neon +.size chacha20_512_neon,.-chacha20_512_neon ___ } }}} diff --git a/crypto/chacha20_x64_gas.s b/crypto/chacha20/chacha20-x64-linux.s similarity index 100% rename from crypto/chacha20_x64_gas.s rename to crypto/chacha20/chacha20-x64-linux.s diff --git a/crypto/chacha20_x64_gas_macosx.s b/crypto/chacha20/chacha20-x64-osx.s similarity index 100% rename from crypto/chacha20_x64_gas_macosx.s rename to crypto/chacha20/chacha20-x64-osx.s diff --git a/crypto/chacha20_x64.asm b/crypto/chacha20/chacha20-x64-win.asm similarity index 100% rename from crypto/chacha20_x64.asm rename to crypto/chacha20/chacha20-x64-win.asm diff --git a/crypto/make_chacha20_x64.pl b/crypto/chacha20/chacha20-x64.pl similarity index 99% rename from crypto/make_chacha20_x64.pl rename to crypto/chacha20/chacha20-x64.pl index f9379ca..95443f5 100644 --- a/crypto/make_chacha20_x64.pl +++ b/crypto/chacha20/chacha20-x64.pl @@ -67,7 +67,7 @@ $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or -( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +( $xlate="${dir}../tools/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; $avx = 3; diff --git a/crypto/chacha20/make.sh b/crypto/chacha20/make.sh new file mode 100644 index 0000000..6cfee45 --- /dev/null +++ b/crypto/chacha20/make.sh @@ -0,0 +1,4 @@ +#!/bin/sh +perl chacha20-x64.pl gas > chacha20-x64-linux.s +perl chacha20-x64.pl macosx > chacha20-x64-osx.s +perl chacha20-arm64.pl ios > chacha20-arm64-ios.S \ No newline at end of file diff --git a/crypto/curve25519-donna.cpp b/crypto/curve25519/curve25519-donna.cpp similarity index 99% rename from crypto/curve25519-donna.cpp rename to crypto/curve25519/curve25519-donna.cpp index a8f5cbe..45003cf 100644 --- a/crypto/curve25519-donna.cpp +++ b/crypto/curve25519/curve25519-donna.cpp @@ -48,6 +48,7 @@ #include #include +#include "curve25519-donna.h" #ifdef _MSC_VER #define inline __inline @@ -57,6 +58,8 @@ typedef uint8_t u8; typedef int32_t s32; typedef int64_t limb; +const uint8 kCurve25519Basepoint[32] = {9}; + /* Field element representation: * * Field elements are written as an array of signed, 64-bit limbs, least diff --git a/crypto/curve25519-donna.h b/crypto/curve25519/curve25519-donna.h similarity index 92% rename from crypto/curve25519-donna.h rename to crypto/curve25519/curve25519-donna.h index 93380b3..f563e5d 100644 --- a/crypto/curve25519-donna.h +++ b/crypto/curve25519/curve25519-donna.h @@ -1,17 +1,19 @@ -#ifndef TUNSAFE_CRYPTO_CURVE25519_DONNA_H_ -#define TUNSAFE_CRYPTO_CURVE25519_DONNA_H_ - -#include "tunsafe_types.h" - -void curve25519_donna_ref(uint8 *mypublic, const uint8 *secret, const uint8 *basepoint); -extern "C" void curve25519_donna_x64(uint8 *mypublic, const uint8 *secret, const uint8 *basepoint); - -#if defined(ARCH_CPU_X86_64) && defined(COMPILER_MSVC) -#define curve25519_donna curve25519_donna_x64 -#else -#define curve25519_donna curve25519_donna_ref -#endif - -void curve25519_normalize(uint8 *e); - +#ifndef TUNSAFE_CRYPTO_CURVE25519_DONNA_H_ +#define TUNSAFE_CRYPTO_CURVE25519_DONNA_H_ + +#include "tunsafe_types.h" + +void curve25519_donna_ref(uint8 *mypublic, const uint8 *secret, const uint8 *basepoint); +extern "C" void curve25519_donna_x64(uint8 *mypublic, const uint8 *secret, const uint8 *basepoint); + +#if defined(ARCH_CPU_X86_64) && defined(COMPILER_MSVC) +#define curve25519_donna curve25519_donna_x64 +#else +#define curve25519_donna curve25519_donna_ref +#endif + +void curve25519_normalize(uint8 *e); + +extern const uint8 kCurve25519Basepoint[32]; + #endif // TUNSAFE_CRYPTO_CURVE25519_DONNA_H_ \ No newline at end of file diff --git a/crypto/curve25519_x64_nasm.asm b/crypto/curve25519/curve25519-x64-win.asm similarity index 100% rename from crypto/curve25519_x64_nasm.asm rename to crypto/curve25519/curve25519-x64-win.asm diff --git a/crypto/poly1305/make.sh b/crypto/poly1305/make.sh new file mode 100644 index 0000000..07c27ac --- /dev/null +++ b/crypto/poly1305/make.sh @@ -0,0 +1,6 @@ +#!/bin/sh +perl poly1305-x64.pl gas > poly1305-x64-linux.s +perl poly1305-x64.pl macosx > poly1305-x64-osx.s +perl poly1305-arm64.pl ios > poly1305-arm64-ios.S + + diff --git a/crypto/poly1305/poly1305-arm.s b/crypto/poly1305/poly1305-arm-linux.S similarity index 99% rename from crypto/poly1305/poly1305-arm.s rename to crypto/poly1305/poly1305-arm-linux.S index 1893360..2be4893 100644 --- a/crypto/poly1305/poly1305-arm.s +++ b/crypto/poly1305/poly1305-arm-linux.S @@ -4,7 +4,7 @@ * Copyright 2016 The OpenSSL Project Authors. All Rights Reserved. */ -//#include +#define __ARM_ARCH__ 7 .text #if defined(__thumb2__) diff --git a/crypto/poly1305/poly1305-arm.pl b/crypto/poly1305/poly1305-arm.pl index 5cdb6be..ab3a819 100644 --- a/crypto/poly1305/poly1305-arm.pl +++ b/crypto/poly1305/poly1305-arm.pl @@ -35,7 +35,7 @@ else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } if ($flavour && $flavour ne "void") { $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or - ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + ( $xlate="${dir}../tools/arm-xlate.pl" and -f $xlate) or die "can't locate arm-xlate.pl"; open STDOUT,"| \"$^X\" $xlate $flavour $output"; diff --git a/crypto/poly1305/poly1305-arm64.s b/crypto/poly1305/poly1305-arm64-ios.S similarity index 92% rename from crypto/poly1305/poly1305-arm64.s rename to crypto/poly1305/poly1305-arm64-ios.S index 911b57e..610613c 100644 --- a/crypto/poly1305/poly1305-arm64.s +++ b/crypto/poly1305/poly1305-arm64-ios.S @@ -1,20 +1,21 @@ -/* SPDX-License-Identifier: OpenSSL OR (BSD-3-Clause OR GPL-2.0) - * - * Copyright (C) 2015-2018 Jason A. Donenfeld . All Rights Reserved. - * Copyright 2016 The OpenSSL Project Authors. All Rights Reserved. - */ - -#include .text +// forward "declarations" are required for Apple +.globl _poly1305_blocks_arm +.globl _poly1305_emit_arm +.globl _poly1305_blocks_neon +.globl _poly1305_emit_neon + +.globl _poly1305_init_arm + .align 5 -ENTRY(poly1305_init_arm) +_poly1305_init_arm: cmp x1,xzr stp xzr,xzr,[x0] // zero hash value stp xzr,xzr,[x0,#16] // [along with is_base2_26] csel x0,xzr,x0,eq - b.eq .Lno_key + b.eq Lno_key ldp x7,x8,[x1] // load key mov x9,#0xfffffffc0fffffff @@ -28,23 +29,24 @@ ENTRY(poly1305_init_arm) and x8,x8,x9 // &=0ffffffc0ffffffc stp x7,x8,[x0,#32] // save key value -.Lno_key: +Lno_key: ret -ENDPROC(poly1305_init_arm) + + .align 5 -ENTRY(poly1305_blocks_arm) +_poly1305_blocks_arm: ands x2,x2,#-16 - b.eq .Lno_data + b.eq Lno_data ldp x4,x5,[x0] // load hash value ldp x7,x8,[x0,#32] // load key value ldr x6,[x0,#16] add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2) - b .Loop + b Loop .align 5 -.Loop: +Loop: ldp x10,x11,[x1],#16 // load input sub x2,x2,#16 #ifdef __ARMEB__ @@ -86,17 +88,18 @@ ENTRY(poly1305_blocks_arm) adcs x5,x13,xzr adc x6,x6,xzr - cbnz x2,.Loop + cbnz x2,Loop stp x4,x5,[x0] // store hash value str x6,[x0,#16] -.Lno_data: +Lno_data: ret -ENDPROC(poly1305_blocks_arm) + + .align 5 -ENTRY(poly1305_emit_arm) +_poly1305_emit_arm: ldp x4,x5,[x0] // load hash base 2^64 ldr x6,[x0,#16] ldp x10,x11,[x2] // load nonce @@ -123,10 +126,10 @@ ENTRY(poly1305_emit_arm) stp x4,x5,[x1] // write result ret -ENDPROC(poly1305_emit_arm) + .align 5 -__poly1305_mult: +poly1305_mult: mul x12,x4,x7 // h0*r0 umulh x13,x4,x7 @@ -160,7 +163,10 @@ __poly1305_mult: ret -__poly1305_splat: + + +.align 5 +poly1305_splat: and x12,x4,#0x03ffffff // base 2^64 -> base 2^26 ubfx x13,x4,#26,#26 extr x14,x5,x4,#52 @@ -184,28 +190,30 @@ __poly1305_splat: ret + + .align 5 -ENTRY(poly1305_blocks_neon) +_poly1305_blocks_neon: ldr x17,[x0,#24] cmp x2,#128 - b.hs .Lblocks_neon - cbz x17,poly1305_blocks_arm + b.hs Lblocks_neon + cbz x17,_poly1305_blocks_arm -.Lblocks_neon: +Lblocks_neon: stp x29,x30,[sp,#-80]! add x29,sp,#0 ands x2,x2,#-16 - b.eq .Lno_data_neon + b.eq Lno_data_neon - cbz x17,.Lbase2_64_neon + cbz x17,Lbase2_64_neon ldp w10,w11,[x0] // load hash value base 2^26 ldp w12,w13,[x0,#8] ldr w14,[x0,#16] tst x2,#31 - b.eq .Leven_neon + b.eq Leven_neon ldp x7,x8,[x0,#32] // load key value @@ -237,10 +245,10 @@ ENTRY(poly1305_blocks_neon) adcs x5,x5,x13 adc x6,x6,x3 - bl __poly1305_mult + bl poly1305_mult ldr x30,[sp,#8] - cbz x3,.Lstore_base2_64_neon + cbz x3,Lstore_base2_64_neon and x10,x4,#0x03ffffff // base 2^64 -> base 2^26 ubfx x11,x4,#26,#26 @@ -249,28 +257,28 @@ ENTRY(poly1305_blocks_neon) ubfx x13,x5,#14,#26 extr x14,x6,x5,#40 - cbnz x2,.Leven_neon + cbnz x2,Leven_neon stp w10,w11,[x0] // store hash value base 2^26 stp w12,w13,[x0,#8] str w14,[x0,#16] - b .Lno_data_neon + b Lno_data_neon .align 4 -.Lstore_base2_64_neon: +Lstore_base2_64_neon: stp x4,x5,[x0] // store hash value base 2^64 stp x6,xzr,[x0,#16] // note that is_base2_26 is zeroed - b .Lno_data_neon + b Lno_data_neon .align 4 -.Lbase2_64_neon: +Lbase2_64_neon: ldp x7,x8,[x0,#32] // load key value ldp x4,x5,[x0] // load hash value base 2^64 ldr x6,[x0,#16] tst x2,#31 - b.eq .Linit_neon + b.eq Linit_neon ldp x12,x13,[x1],#16 // load input sub x2,x2,#16 @@ -283,9 +291,9 @@ ENTRY(poly1305_blocks_neon) adcs x5,x5,x13 adc x6,x6,x3 - bl __poly1305_mult + bl poly1305_mult -.Linit_neon: +Linit_neon: and x10,x4,#0x03ffffff // base 2^64 -> base 2^26 ubfx x11,x4,#26,#26 extr x12,x5,x4,#52 @@ -310,35 +318,35 @@ ENTRY(poly1305_blocks_neon) mov x5,x8 mov x6,xzr add x0,x0,#48+12 - bl __poly1305_splat + bl poly1305_splat - bl __poly1305_mult // r^2 + bl poly1305_mult // r^2 sub x0,x0,#4 - bl __poly1305_splat + bl poly1305_splat - bl __poly1305_mult // r^3 + bl poly1305_mult // r^3 sub x0,x0,#4 - bl __poly1305_splat + bl poly1305_splat - bl __poly1305_mult // r^4 + bl poly1305_mult // r^4 sub x0,x0,#4 - bl __poly1305_splat + bl poly1305_splat ldr x30,[sp,#8] add x16,x1,#32 - adr x17,.Lzeros + adr x17,Lzeros subs x2,x2,#64 csel x16,x17,x16,lo mov x4,#1 str x4,[x0,#-24] // set is_base2_26 sub x0,x0,#48 // restore original x0 - b .Ldo_neon + b Ldo_neon .align 4 -.Leven_neon: +Leven_neon: add x16,x1,#32 - adr x17,.Lzeros + adr x17,Lzeros subs x2,x2,#64 csel x16,x17,x16,lo @@ -353,7 +361,7 @@ ENTRY(poly1305_blocks_neon) fmov d27,x13 fmov d28,x14 -.Ldo_neon: +Ldo_neon: ldp x8,x12,[x16],#16 // inp[2:3] (or zero) ldp x9,x13,[x16],#48 @@ -427,10 +435,10 @@ ENTRY(poly1305_blocks_neon) fmov d13,x12 ushr v31.2d,v31.2d,#38 - b.ls .Lskip_loop + b.ls Lskip_loop .align 4 -.Loop_neon: +Loop_neon: //////////////////////////////////////////////////////////////// // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r @@ -616,9 +624,9 @@ ENTRY(poly1305_blocks_neon) add v25.2s,v25.2s,v29.2s // h0 -> h1 add v28.2s,v28.2s,v30.2s // h3 -> h4 - b.hi .Loop_neon + b.hi Loop_neon -.Lskip_loop: +Lskip_loop: dup v16.2d,v16.d[0] add v11.2s,v11.2s,v26.2s @@ -626,7 +634,7 @@ ENTRY(poly1305_blocks_neon) // multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1 adds x2,x2,#32 - b.ne .Long_tail + b.ne Long_tail dup v16.2d,v11.d[0] add v14.2s,v9.2s,v24.2s @@ -634,7 +642,7 @@ ENTRY(poly1305_blocks_neon) add v15.2s,v10.2s,v25.2s add v18.2s,v13.2s,v28.2s -.Long_tail: +Long_tail: dup v14.2d,v14.d[0] umull2 v19.2d,v16.4s,v6.4s umull2 v22.2d,v16.4s,v1.4s @@ -669,7 +677,7 @@ ENTRY(poly1305_blocks_neon) umlal2 v20.2d,v18.4s,v4.4s umlal2 v21.2d,v18.4s,v6.4s - b.eq .Lshort_tail + b.eq Lshort_tail //////////////////////////////////////////////////////////////// // (hash+inp[0:1])*r^4:r^3 and accumulate @@ -708,7 +716,7 @@ ENTRY(poly1305_blocks_neon) umlal v20.2d,v13.2s,v4.2s umlal v21.2d,v13.2s,v6.2s -.Lshort_tail: +Lshort_tail: //////////////////////////////////////////////////////////////// // horizontal add @@ -759,15 +767,16 @@ ENTRY(poly1305_blocks_neon) st4 {v19.s,v20.s,v21.s,v22.s}[0],[x0],#16 st1 {v23.s}[0],[x0] -.Lno_data_neon: +Lno_data_neon: ldr x29,[sp],#80 ret -ENDPROC(poly1305_blocks_neon) + + .align 5 -ENTRY(poly1305_emit_neon) +_poly1305_emit_neon: ldr x17,[x0,#24] - cbz x17,poly1305_emit_arm + cbz x17,_poly1305_emit_arm ldp w10,w11,[x0] // load hash value base 2^26 ldp w12,w13,[x0,#8] @@ -813,8 +822,8 @@ ENTRY(poly1305_emit_neon) stp x4,x5,[x1] // write result ret -ENDPROC(poly1305_emit_neon) + .align 5 -.Lzeros: +Lzeros: .long 0,0,0,0,0,0,0,0 diff --git a/crypto/poly1305/poly1305-arm64.pl b/crypto/poly1305/poly1305-arm64.pl index ac06457..0ef3453 100644 --- a/crypto/poly1305/poly1305-arm64.pl +++ b/crypto/poly1305/poly1305-arm64.pl @@ -18,7 +18,7 @@ # # June 2015 # -# Numbers are cycles per processed byte with poly1305_blocks alone. +# Numbers are cycles per processed byte with poly1305_blocks_arm alone. # # IALU/gcc-4.9 NEON # @@ -39,7 +39,7 @@ $output=shift; $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or -( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or +( $xlate="${dir}../tools/arm-xlate.pl" and -f $xlate) or die "can't locate arm-xlate.pl"; open OUT,"| \"$^X\" $xlate $flavour $output"; @@ -51,19 +51,18 @@ my ($mac,$nonce)=($inp,$len); my ($h0,$h1,$h2,$r0,$r1,$s1,$t0,$t1,$d0,$d1,$d2) = map("x$_",(4..14)); $code.=<<___; -#include "arm_arch.h" - .text // forward "declarations" are required for Apple -.extern OPENSSL_armcap_P -.globl poly1305_blocks -.globl poly1305_emit +.globl poly1305_blocks_arm +.globl poly1305_emit_arm +.globl poly1305_blocks_neon +.globl poly1305_emit_neon -.globl poly1305_init -.type poly1305_init,%function +.globl poly1305_init_arm +.type poly1305_init_arm,%function .align 5 -poly1305_init: +poly1305_init_arm: cmp $inp,xzr stp xzr,xzr,[$ctx] // zero hash value stp xzr,xzr,[$ctx,#16] // [along with is_base2_26] @@ -71,17 +70,9 @@ poly1305_init: csel x0,xzr,x0,eq b.eq .Lno_key -#ifdef __ILP32__ - ldrsw $t1,.LOPENSSL_armcap_P -#else - ldr $t1,.LOPENSSL_armcap_P -#endif - adr $t0,.LOPENSSL_armcap_P - ldp $r0,$r1,[$inp] // load key mov $s1,#0xfffffffc0fffffff movk $s1,#0x0fff,lsl#48 - ldr w17,[$t0,$t1] #ifdef __ARMEB__ rev $r0,$r0 // flip bytes rev $r1,$r1 @@ -91,30 +82,13 @@ poly1305_init: and $r1,$r1,$s1 // &=0ffffffc0ffffffc stp $r0,$r1,[$ctx,#32] // save key value - tst w17,#ARMV7_NEON - - adr $d0,poly1305_blocks - adr $r0,poly1305_blocks_neon - adr $d1,poly1305_emit - adr $r1,poly1305_emit_neon - - csel $d0,$d0,$r0,eq - csel $d1,$d1,$r1,eq - -#ifdef __ILP32__ - stp w12,w13,[$len] -#else - stp $d0,$d1,[$len] -#endif - - mov x0,#1 .Lno_key: ret -.size poly1305_init,.-poly1305_init +.size poly1305_init_arm,.-poly1305_init_arm -.type poly1305_blocks,%function +.type poly1305_blocks_arm,%function .align 5 -poly1305_blocks: +poly1305_blocks_arm: ands $len,$len,#-16 b.eq .Lno_data @@ -174,11 +148,11 @@ poly1305_blocks: .Lno_data: ret -.size poly1305_blocks,.-poly1305_blocks +.size poly1305_blocks_arm,.-poly1305_blocks_arm -.type poly1305_emit,%function +.type poly1305_emit_arm,%function .align 5 -poly1305_emit: +poly1305_emit_arm: ldp $h0,$h1,[$ctx] // load hash base 2^64 ldr $h2,[$ctx,#16] ldp $t0,$t1,[$nonce] // load nonce @@ -205,7 +179,7 @@ poly1305_emit: stp $h0,$h1,[$mac] // write result ret -.size poly1305_emit,.-poly1305_emit +.size poly1305_emit_arm,.-poly1305_emit_arm ___ my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("v$_.4s",(0..8)); my ($IN01_0,$IN01_1,$IN01_2,$IN01_3,$IN01_4) = map("v$_.2s",(9..13)); @@ -288,7 +262,7 @@ poly1305_blocks_neon: ldr $is_base2_26,[$ctx,#24] cmp $len,#128 b.hs .Lblocks_neon - cbz $is_base2_26,poly1305_blocks + cbz $is_base2_26,poly1305_blocks_arm .Lblocks_neon: stp x29,x30,[sp,#-80]! @@ -867,7 +841,7 @@ poly1305_blocks_neon: .align 5 poly1305_emit_neon: ldr $is_base2_26,[$ctx,#24] - cbz $is_base2_26,poly1305_emit + cbz $is_base2_26,poly1305_emit_arm ldp w10,w11,[$ctx] // load hash value base 2^26 ldp w12,w13,[$ctx,#8] @@ -918,14 +892,6 @@ poly1305_emit_neon: .align 5 .Lzeros: .long 0,0,0,0,0,0,0,0 -.LOPENSSL_armcap_P: -#ifdef __ILP32__ -.long OPENSSL_armcap_P-. -#else -.quad OPENSSL_armcap_P-. -#endif -.asciz "Poly1305 for ARMv8, CRYPTOGAMS by " -.align 2 ___ foreach (split("\n",$code)) { diff --git a/crypto/poly1305_x64_gas.s b/crypto/poly1305/poly1305-x64-linux.s old mode 100755 new mode 100644 similarity index 100% rename from crypto/poly1305_x64_gas.s rename to crypto/poly1305/poly1305-x64-linux.s diff --git a/crypto/poly1305_x64_gas_macosx.s b/crypto/poly1305/poly1305-x64-osx.s similarity index 100% rename from crypto/poly1305_x64_gas_macosx.s rename to crypto/poly1305/poly1305-x64-osx.s diff --git a/crypto/poly1305_x64_nasm.asm b/crypto/poly1305/poly1305-x64-win.asm similarity index 100% rename from crypto/poly1305_x64_nasm.asm rename to crypto/poly1305/poly1305-x64-win.asm diff --git a/crypto/make_poly1305_x64.pl b/crypto/poly1305/poly1305-x64.pl old mode 100755 new mode 100644 similarity index 99% rename from crypto/make_poly1305_x64.pl rename to crypto/poly1305/poly1305-x64.pl index f7a2ab7..122008f --- a/crypto/make_poly1305_x64.pl +++ b/crypto/poly1305/poly1305-x64.pl @@ -71,7 +71,7 @@ $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or -( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +( $xlate="${dir}../tools/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; $avx = 3; diff --git a/crypto/make_poly1305_x86.pl b/crypto/poly1305/poly1305-x86.pl similarity index 100% rename from crypto/make_poly1305_x86.pl rename to crypto/poly1305/poly1305-x86.pl diff --git a/crypto/siphash.cpp b/crypto/siphash/siphash.cpp similarity index 99% rename from crypto/siphash.cpp rename to crypto/siphash/siphash.cpp index 4bd11ad..fddf5fd 100644 --- a/crypto/siphash.cpp +++ b/crypto/siphash/siphash.cpp @@ -11,7 +11,7 @@ */ #include "stdafx.h" -#include "crypto/siphash.h" +#include "crypto/siphash/siphash.h" #include "tunsafe_endian.h" #define SIPROUND \ diff --git a/crypto/siphash.h b/crypto/siphash/siphash.h similarity index 100% rename from crypto/siphash.h rename to crypto/siphash/siphash.h diff --git a/crypto/tools/arm-xlate.pl b/crypto/tools/arm-xlate.pl new file mode 100644 index 0000000..ca2f8b9 --- /dev/null +++ b/crypto/tools/arm-xlate.pl @@ -0,0 +1,177 @@ +#! /usr/bin/env perl +# Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +use strict; + +my $flavour = shift; +my $output = shift; +open STDOUT,">$output" || die "can't open $output: $!"; + +$flavour = "linux32" if (!$flavour or $flavour eq "void"); + +my %GLOBALS; +my $dotinlocallabels=($flavour=~/linux/)?1:0; + +################################################################ +# directives which need special treatment on different platforms +################################################################ +my $arch = sub { + if ($flavour =~ /linux/) { ".arch\t".join(',',@_); } + else { ""; } +}; +my $fpu = sub { + if ($flavour =~ /linux/) { ".fpu\t".join(',',@_); } + else { ""; } +}; +my $hidden = sub { + if ($flavour =~ /ios/) { ".private_extern\t".join(',',@_); } + else { ".hidden\t".join(',',@_); } +}; +my $comm = sub { + my @args = split(/,\s*/,shift); + my $name = @args[0]; + my $global = \$GLOBALS{$name}; + my $ret; + + if ($flavour =~ /ios32/) { + $ret = ".comm\t_$name,@args[1]\n"; + $ret .= ".non_lazy_symbol_pointer\n"; + $ret .= "$name:\n"; + $ret .= ".indirect_symbol\t_$name\n"; + $ret .= ".long\t0"; + $name = "_$name"; + } else { $ret = ".comm\t".join(',',@args); } + + $$global = $name; + $ret; +}; +my $globl = sub { + my $name = shift; + my $global = \$GLOBALS{$name}; + my $ret; + + SWITCH: for ($flavour) { + /ios/ && do { $name = "_$name"; + last; + }; + } + + $ret = ".globl $name" if (!$ret); + $$global = $name; + $ret; +}; +my $global = $globl; +my $extern = sub { + &$globl(@_); + return; # return nothing +}; +my $type = sub { + if ($flavour =~ /linux/) { ".type\t".join(',',@_); } + elsif ($flavour =~ /ios32/) { if (join(',',@_) =~ /(\w+),%function/) { + "#ifdef __thumb2__\n". + ".thumb_func $1\n". + "#endif"; + } + } + else { ""; } +}; +my $size = sub { + if ($flavour =~ /linux/) { ".size\t".join(',',@_); } + else { ""; } +}; +my $inst = sub { + if ($flavour =~ /linux/) { ".inst\t".join(',',@_); } + else { ".long\t".join(',',@_); } +}; +my $asciz = sub { + my $line = join(",",@_); + if ($line =~ /^"(.*)"$/) + { ".byte " . join(",",unpack("C*",$1),0) . "\n.align 2"; } + else + { ""; } +}; + +sub range { + my ($r,$sfx,$start,$end) = @_; + + join(",",map("$r$_$sfx",($start..$end))); +} + +sub expand_line { + my $line = shift; + my @ret = (); + + pos($line)=0; + + while ($line =~ m/\G[^@\/\{\"]*/g) { + if ($line =~ m/\G(@|\/\/|$)/gc) { + last; + } + elsif ($line =~ m/\G\{/gc) { + my $saved_pos = pos($line); + $line =~ s/\G([rdqv])([0-9]+)([^\-]*)\-\1([0-9]+)\3/range($1,$3,$2,$4)/e; + pos($line) = $saved_pos; + $line =~ m/\G[^\}]*\}/g; + } + elsif ($line =~ m/\G\"/gc) { + $line =~ m/\G[^\"]*\"/g; + } + } + + $line =~ s/\b(\w+)/$GLOBALS{$1} or $1/ge; + + return $line; +} + +while(my $line=<>) { + + if ($line =~ m/^\s*(#|@|\/\/)/) { print $line; next; } + + $line =~ s|/\*.*\*/||; # get rid of C-style comments... + $line =~ s|^\s+||; # ... and skip white spaces in beginning... + $line =~ s|\s+$||; # ... and at the end + + { + $line =~ s|[\b\.]L(\w{2,})|L$1|g; # common denominator for Locallabel + $line =~ s|\bL(\w{2,})|\.L$1|g if ($dotinlocallabels); + } + + { + $line =~ s|(^[\.\w]+)\:\s*||; + my $label = $1; + if ($label) { + printf "%s:",($GLOBALS{$label} or $label); + } + } + + if ($line !~ m/^[#@]/) { + $line =~ s|^\s*(\.?)(\S+)\s*||; + my $c = $1; $c = "\t" if ($c eq ""); + my $mnemonic = $2; + my $opcode; + if ($mnemonic =~ m/([^\.]+)\.([^\.]+)/) { + $opcode = eval("\$$1_$2"); + } else { + $opcode = eval("\$$mnemonic"); + } + + my $arg=expand_line($line); + + if (ref($opcode) eq 'CODE') { + $line = &$opcode($arg); + } elsif ($mnemonic) { + $line = $c.$mnemonic; + $line.= "\t$arg" if ($arg ne ""); + } + } + + print $line if ($line); + print "\n"; +} + +close STDOUT; diff --git a/crypto/nasm.props b/crypto/tools/nasm.props similarity index 100% rename from crypto/nasm.props rename to crypto/tools/nasm.props diff --git a/crypto/nasm.targets b/crypto/tools/nasm.targets similarity index 100% rename from crypto/nasm.targets rename to crypto/tools/nasm.targets diff --git a/crypto/nasm.xml b/crypto/tools/nasm.xml similarity index 100% rename from crypto/nasm.xml rename to crypto/tools/nasm.xml diff --git a/crypto/x86_64-xlate.pl b/crypto/tools/x86_64-xlate.pl similarity index 100% rename from crypto/x86_64-xlate.pl rename to crypto/tools/x86_64-xlate.pl diff --git a/network_bsd_common.cpp b/network_bsd_common.cpp index 0d4346f..dae7b66 100644 --- a/network_bsd_common.cpp +++ b/network_bsd_common.cpp @@ -248,11 +248,6 @@ done: } #endif // defined(OS_LINUX) - -void OsInterruptibleSleep(int millis) { - usleep((useconds_t)millis * 1000); -} - #if defined(OS_MACOSX) int open_tun(char *devname, size_t devname_size) { struct sockaddr_ctl sc; @@ -789,14 +784,6 @@ public: bool is_connected_; }; -struct CommandLineOutput { - const char *filename_to_load; - const char *interface_name; - bool daemon; -}; - -int HandleCommandLine(int argc, char **argv, CommandLineOutput *output); - int main(int argc, char **argv) { CommandLineOutput cmd = {0}; diff --git a/ts.cpp b/ts.cpp index 2cd5d8c..fd363f2 100644 --- a/ts.cpp +++ b/ts.cpp @@ -1,7 +1,7 @@ #include "stdafx.h" #include "tunsafe_types.h" #include "netapi.h" -#include "crypto/curve25519-donna.h" +#include "crypto/curve25519/curve25519-donna.h" #include "util.h" #include "wireguard_proto.h" #include @@ -35,8 +35,6 @@ #define ANSI_FG_CYAN "\x1b[36m" #define ANSI_FG_WHITE "\x1b[37m" -static const uint8 kCurve25519Basepoint[32] = {9}; - #if defined(OS_WIN) #define EXENAME "ts" @@ -758,12 +756,6 @@ static int HandleStopCommand(int argc, char **argv) { #endif // defined(OS_WIN) -struct CommandLineOutput { - const char *filename_to_load; - const char *interface_name; - bool daemon; -}; - // Returns -1 on invalid subcommand int HandleCommandLine(int argc, char **argv, CommandLineOutput *output) { uint8 key[32]; diff --git a/ts.vcxproj b/ts.vcxproj index 0774b3d..0a9c943 100644 --- a/ts.vcxproj +++ b/ts.vcxproj @@ -54,7 +54,7 @@ - + @@ -100,6 +100,7 @@ _DEBUG;_CONSOLE;%(PreprocessorDefinitions) true false + . Console @@ -115,6 +116,7 @@ WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) true false + . Console @@ -134,6 +136,7 @@ MultiThreaded true false + . Console @@ -155,6 +158,7 @@ MultiThreaded true false + . Console @@ -164,13 +168,13 @@ - + - + NotUsing NotUsing NotUsing @@ -187,13 +191,13 @@ - + true true - + \ No newline at end of file diff --git a/ts.vcxproj.filters b/ts.vcxproj.filters index 4d6247d..7f47ece 100644 --- a/ts.vcxproj.filters +++ b/ts.vcxproj.filters @@ -21,12 +21,12 @@ Source Files - - Source Files - Source Files + + Header Files + @@ -38,15 +38,15 @@ Source Files - + Source Files - + Source Files - + Source Files diff --git a/tunsafe_amalgam.cpp b/tunsafe_amalgam.cpp new file mode 100644 index 0000000..19108eb --- /dev/null +++ b/tunsafe_amalgam.cpp @@ -0,0 +1,29 @@ +#include "build_config.h" + +// Skip asm for IOS simulator +#if defined(OS_IOS) && defined(ARCH_CPU_X86_FAMILY) +#define CHACHA20_WITH_ASM 0 +#define BLAKE2S_WITH_ASM 0 +#endif + +#include "wireguard.cpp" +#include "wireguard_proto.cpp" +#include "wireguard_config.cpp" +#include "util.cpp" +#include "tunsafe_threading.cpp" +#include "tunsafe_cpu.cpp" +#include "ip_to_peer_map.cpp" +#include "crypto/curve25519/curve25519-donna.cpp" +#include "crypto/chacha20poly1305.cpp" +#include "crypto/blake2s/blake2s.cpp" +#include "crypto/siphash/siphash.cpp" +#include "crypto/aesgcm/aesgcm.cpp" +#include "ipzip2/ipzip2.cpp" + +#if defined(WITH_NETWORK_BSD) +#include "network_bsd.cpp" +#include "network_bsd_common.cpp" +#include "ts.cpp" +#include "benchmark.cpp" +#endif + diff --git a/tunsafe_win32.cpp b/tunsafe_win32.cpp index f786043..31e807f 100644 --- a/tunsafe_win32.cpp +++ b/tunsafe_win32.cpp @@ -22,7 +22,7 @@ #include "util.h" #include #include -#include "crypto/curve25519-donna.h" +#include "crypto/curve25519/curve25519-donna.h" #include "service_win32.h" #include "util_win32.h" @@ -652,8 +652,6 @@ void BrowseFile(HWND wnd) { ImportFile(szFile); } -static const uint8 kCurve25519Basepoint[32] = {9}; - static void SetKeyBox(HWND wnd, int ctr, uint8 buf[32]) { char base64[WG_PUBLIC_KEY_LEN_BASE64 + 1]; SetDlgItemText(wnd, ctr, base64_encode(buf, 32, base64, sizeof(base64), NULL)); diff --git a/util.h b/util.h index 7eae298..72d8521 100644 --- a/util.h +++ b/util.h @@ -48,3 +48,10 @@ uint64 OsGetMilliseconds(); void InitOsxGetMilliseconds(); void OsInterruptibleSleep(int millis); void OsGetTimestampTAI64N(uint8 dst[12]); + +struct CommandLineOutput { + const char *filename_to_load; + const char *interface_name; + bool daemon; +}; +int HandleCommandLine(int argc, char **argv, CommandLineOutput *output); diff --git a/wireguard.cpp b/wireguard.cpp index be21969..e4f0b33 100644 --- a/wireguard.cpp +++ b/wireguard.cpp @@ -5,8 +5,8 @@ #include "netapi.h" #include "wireguard_proto.h" #include "crypto/chacha20poly1305.h" -#include "crypto/blake2s.h" -#include "crypto/siphash.h" +#include "crypto/blake2s/blake2s.h" +#include "crypto/siphash/siphash.h" #include "tunsafe_endian.h" #include #include diff --git a/wireguard_proto.cpp b/wireguard_proto.cpp index 033064b..60acdae 100644 --- a/wireguard_proto.cpp +++ b/wireguard_proto.cpp @@ -3,10 +3,10 @@ #include "stdafx.h" #include "wireguard_proto.h" #include "crypto/chacha20poly1305.h" -#include "crypto/blake2s.h" -#include "crypto/curve25519-donna.h" +#include "crypto/blake2s/blake2s.h" +#include "crypto/curve25519/curve25519-donna.h" #include "crypto/aesgcm/aes.h" -#include "crypto/siphash.h" +#include "crypto/siphash/siphash.h" #include "tunsafe_endian.h" #include "util.h" #include "crypto_ops.h" @@ -21,7 +21,6 @@ static const uint8 kLabelCookie[] = {'c', 'o', 'o', 'k', 'i', 'e', '-', '-'}; static const uint8 kLabelMac1[] = {'m', 'a', 'c', '1', '-', '-', '-', '-'}; static const uint8 kWgInitHash[WG_HASH_LEN] = {0x22,0x11,0xb3,0x61,0x08,0x1a,0xc5,0x66,0x69,0x12,0x43,0xdb,0x45,0x8a,0xd5,0x32,0x2d,0x9c,0x6c,0x66,0x22,0x93,0xe8,0xb7,0x0e,0xe1,0x9c,0x65,0xba,0x07,0x9e,0xf3}; static const uint8 kWgInitChainingKey[WG_HASH_LEN] = {0x60,0xe2,0x6d,0xae,0xf3,0x27,0xef,0xc0,0x2e,0xc3,0x35,0xe2,0xa0,0x25,0xd2,0xd0,0x16,0xeb,0x42,0x06,0xf8,0x72,0x77,0xf5,0x2d,0x38,0xd1,0x98,0x8b,0x78,0xcd,0x36}; -static const uint8 kCurve25519Basepoint[32] = {9}; ReplayDetector::ReplayDetector() { expected_seq_nr_ = 0;