From cf92ac7a0c444ba4b31bdf9b1744705e6542d028 Mon Sep 17 00:00:00 2001 From: Ludvig Strigeus Date: Sun, 12 Aug 2018 03:27:14 +0200 Subject: [PATCH] Updates for TunSafe 1.4-rc1 1.Subfolders in the Config/ directory now show up as submenus. 2.Added a way to run TunSafe as a Windows Service. Foreground Mode: The service will disconnect when TunSafe closes. Background Mode: The service will stay connected in the background. No longer required to run the TunSafe client as Admin as long as the service is running. 3.New config setting [Interface].ExcludedIPs to configure IPs that should not be routed through TunSafe. 4.Can now automatically start TunSafe when Windows starts 5.New UI with tabs and graphs 6.Cache DNS queries to ensure DNS will succeed if connection fails 7.Recreate tray icon when explorer.exe restarts 8.Renamed window title to TunSafe instead of TunSafe VPN Client 9.Main window is now resizable 10.Disallow roaming endpoint when using AllowedIPs=0.0.0.0/0 Only the original endpoint is added in the routing table so this would result in an endless loop of packets. 
11.Display approximate Wireguard framing overhead in stats 12.Preparations for protocol handling with multiple threads 13.Delete the routes we made when disconnecting 14.Fix error message about unable to delete a route when connecting --- .gitignore | 4 +- TunSafe.conf | 2 +- TunSafe.rc | Bin 8988 -> 6310 bytes TunSafe.sln | 18 - TunSafe.vcxproj | 17 +- TunSafe.vcxproj.filters | 35 +- benchmark.cpp | 6 +- build.py | 5 +- build_freebsd.sh | 3 +- build_linux.sh | 2 +- build_linux_rpi.sh | 11 + build_osx.sh | 2 +- crypto/blake2s.cpp | 0 crypto/blake2s_sse.cpp | 0 crypto/chacha20/chacha20-arm.pl | 1160 ++++++++++++++++ crypto/chacha20/chacha20-arm.s | 1475 +++++++++++++++++++++ crypto/chacha20/chacha20-arm64.pl | 1136 ++++++++++++++++ crypto/chacha20/chacha20-arm64.s | 1940 +++++++++++++++++++++++++++ crypto/chacha20poly1305.cpp | 39 +- crypto/curve25519-donna.h | 32 +- crypto/make_all_asm_files.sh | 0 crypto/make_poly1305_x64.pl | 0 crypto/poly1305/poly1305-arm.pl | 1253 ++++++++++++++++++ crypto/poly1305/poly1305-arm.s | 1127 ++++++++++++++++ crypto/poly1305/poly1305-arm64.pl | 944 +++++++++++++ crypto/poly1305/poly1305-arm64.s | 820 ++++++++++++ crypto/poly1305_x64_gas.s | 0 crypto_ops.h | 1 + downarrow.bmp | Bin 0 -> 142 bytes installer/ChangeLog.txt | 23 + installer/servicelib.nsh | 419 ++++++ installer/tunsafe.nsi | 6 + ip_to_peer_map.cpp | 97 ++ ip_to_peer_map.h | 41 + ipzip2/ipzip2.cpp | 3 +- ipzip2/ipzip2.h | 1 + netapi.h | 3 + network_bsd_common.cpp | 91 +- network_bsd_common.h | 6 +- network_bsd_mt.cpp | 103 +- network_win32.cpp | 728 +++++++--- network_win32.h | 142 +- network_win32_api.h | 127 +- network_win32_dnsblock.cpp | 43 +- network_win32_dnsblock.h | 21 +- resource.h | Bin 4828 -> 1979 bytes service_win32.cpp | 1179 +++++++++++++++++ service_win32.h | 171 +++ service_win32_api.h | 24 + stdafx.h | 4 +- tunsafe_config.h | 4 +- tunsafe_cpu.cpp | 37 +- tunsafe_cpu.h | 16 +- tunsafe_endian.h | 1 + tunsafe_threading.cpp | 57 + tunsafe_threading.h | 175 
+++ tunsafe_types.h | 2 +- tunsafe_win32.cpp | 2055 +++++++++++++++++++---------- util.cpp | 48 +- util.h | 11 + util_win32.cpp | 378 ++++++ util_win32.h | 56 + wireguard.cpp | 532 +++++--- wireguard.h | 114 +- wireguard_config.cpp | 140 +- wireguard_config.h | 31 +- wireguard_proto.cpp | 363 +++-- wireguard_proto.h | 199 +-- 68 files changed, 15851 insertions(+), 1632 deletions(-) mode change 100644 => 100755 build_freebsd.sh mode change 100644 => 100755 build_linux.sh create mode 100755 build_linux_rpi.sh mode change 100644 => 100755 build_osx.sh mode change 100644 => 100755 crypto/blake2s.cpp mode change 100644 => 100755 crypto/blake2s_sse.cpp create mode 100644 crypto/chacha20/chacha20-arm.pl create mode 100644 crypto/chacha20/chacha20-arm.s create mode 100644 crypto/chacha20/chacha20-arm64.pl create mode 100644 crypto/chacha20/chacha20-arm64.s mode change 100644 => 100755 crypto/make_all_asm_files.sh mode change 100644 => 100755 crypto/make_poly1305_x64.pl create mode 100644 crypto/poly1305/poly1305-arm.pl create mode 100644 crypto/poly1305/poly1305-arm.s create mode 100644 crypto/poly1305/poly1305-arm64.pl create mode 100644 crypto/poly1305/poly1305-arm64.s mode change 100644 => 100755 crypto/poly1305_x64_gas.s create mode 100644 downarrow.bmp create mode 100644 installer/servicelib.nsh create mode 100644 ip_to_peer_map.cpp create mode 100644 ip_to_peer_map.h create mode 100644 ipzip2/ipzip2.h create mode 100644 service_win32.cpp create mode 100644 service_win32.h create mode 100644 service_win32_api.h create mode 100644 tunsafe_threading.cpp create mode 100644 tunsafe_threading.h create mode 100644 util_win32.cpp create mode 100644 util_win32.h diff --git a/.gitignore b/.gitignore index fc80387..71ce91a 100644 --- a/.gitignore +++ b/.gitignore @@ -4,7 +4,6 @@ /Build /Win32/ /TunSafe.aps -/ipch /*.sdf /*vcxproj.user /*.opensdf @@ -15,4 +14,5 @@ /*.psess /*.vspx /installer/*.zip -/config/ \ No newline at end of file +/config/ +/tunsafe.com/ diff --git 
a/TunSafe.conf b/TunSafe.conf index 073c9e5..f1c4a46 100644 --- a/TunSafe.conf +++ b/TunSafe.conf @@ -4,7 +4,6 @@ ListenPort = 51820 Address = 192.168.2.2/24 MTU = 1420 - [Peer] PublicKey = 2m1BdGW9AwwF5dqaGm0NgMggdDZDUPFAL4JxCySdgBw= #AllowedIPs = 0.0.0.0/0, fc00::2/64 @@ -14,3 +13,4 @@ Endpoint = 192.168.1.4:8040 PersistentKeepalive = 25 + diff --git a/TunSafe.rc b/TunSafe.rc index 7139e02b7e2b90252ed5afd195d2c572e6abba42..55f2de5b3d2395f6b9a411419e26259809ded20d 100644 GIT binary patch literal 6310 zcmc&2Yj2|3_PdGyVP7Wk%w(8Wv7JuOHy&HvDFOlP~e-L~W?7{FW#oHF0 z`OxN)#s`7TXG_l=O>Rsw*v=Dz6wH4WXo}98Z$kc%9?5Dcb;=MRZQJM_;-AAc1};p zxZ-#fM;nqpa^likml>))(tD#PU!-~H z%J$fEuIZO?uhMnPgvY-U`WS5*4|SrKK^18|KT44Rvy z+fnG48Ql(nr{T-@eeh;#0At;bbOfS~1EJwY!hL)n0)h-_t4B32G-(Zw&^iS3iHA$D z96TU#6?c_pQG|DPUeqMe(gJ@SJ(P-Nn&xRt*3I#N5KR`><>gD^@rV#ea}Wr?lHqi(7)b3mt#O!Co0hKSUx-9pXIl^_pnw z2SFrf{alXzGo*rs!&ozCqRd5Ci&PMqhUvq0Md|E99E#_Cad4J9#9JUO zNEUl=rj>yWE|0^}Wsnk}xC($tAP8aSs-`i@X`VNMVcWG$J!cbs1q4t%o2e6ok_Mt+ z2a89Opk((A~QK!@ZKys|y8rluzAnZJWk_5%%wTBbF#a%+{rkVRAx zGK~-^iz~U}F9aly-zQksQNlAvMC$3R}?#Y=Gr%9q3+IbbVgsAo(q38SIf z2J`GbI6Q67gG5!U61Yy?gr2IK0^XOS5ez?qb7Euh{ z**1dm5I&G?P7)!W0!VFhPM{aquKgG2^5%RUJ)P_+DF)a@Umqqp~G=>;!hNM4cs6lq0 zq3``xh8p+x8TwI(mhqGQ{NGv(28FS-FLWHZ1n*5B3AigyQqU9m>92R3fFDCmAgdsT z-83RROm;CR{&ov-=BEL`1qCi>dN+`~69VfbNKziA1de@(iR1;An*=@)n8rK?S-g+d zXgs*rSXQqBVo&bxL%vI6fBg-YC-4J_vPL$juIDemL-r7J9)6oY>|^_=h{BFP_kB;7 z@64H7D!z8c0cp9^h!JqK!ij4b>5pMPO0X#pceO0GhyCDy;O~Anmz6`IHwxC+45En| zqTv!jm`kTb=&{WvMISspboC~@?ZaLnHHdAp8c z9yhQQ7!=#QacG7EKAhhmai}HRvTNEcXF{*h9%g(t&|`wB+Qn92hp(ZJB@T~&^7yrc z06aBq>gBy}3;a>Ws%D~Bb`wqfh2&7(q1S2?>?m&9dk&!cE;aS)KMH-zz^`#Xb5wlE z+aRH5ZZS<)`R#)LCl{pU)Bg) z@r{^GZ)9CRAVSV!Pl+I(PkH(nEeqz5FuG`qLZDJjYIKzi@qTGX^w|IgR?}!%#xWir zH;opBf2Cvw0qJK>4+v^g6`VNKzGg5DIcyY29uUY6aRk$yj6V6SVv~!d2EPlj3_U&a zkSt&qEmC}YegRsqh>Re>#$XVfK_L6(({LxnU;MjA%Srv0k`xrfKcu9{>eW$7zjd`9 z%+Gckj^CntQo=}N2}zCsUvY}IeI+B*QiG~)*0vv%avFpIHknw}-53PS#QGiTa&Yhp 
z_nyO4s%#OkIuqnI^ctjR#r)KYpt(?^%?29Z{W!90YMIWV%p%Vr0Ne9QO=ldTguNR+ zhcB#$hWBNbFEn>U70-f6Tdl1cfx|h0BnIB)=3hHOLG%-PC*6$Z_@)z~Or1#r&L zkD(5q%r{%G4~(yyv$SK=(s!PyA#I$hulA?H)e!HU$rS^F2|GyoN zeRn<^47{*ZosK>4?#;~3!+&P?`1haZ;dw~HPB;rkp&u^7K=(qH-Q^;dM~EpoFB-bVd) zM0udMk7PBsL07Hhs{DWrwk(x5uby6sE1KddGFm%jc2e6jZX9 zvz1}@B7~asS+}%%%Hp+1h*aJt@}09-TU@B7@LpdBIc6i`Nkj47RD5nlFDCx|H`y@v_ z){azas-VuI&s+_oVZ5T%sDqMZVh!C`FMN(w+=l+1YCbwzme$hRdl9}3b9gZxw1u}| z@4DKaE?bSqU?QrI)knp5rb^{p83#UC@<5X8N^fL{^xI3}rDnX+y%HrUOC~(PY2Joo zE%KsTIuhqGN9<0H=|v8B#r^o;tyHzFteD%bEVCpH;(7PwR*y(P6>jUJ`Zc?mJ}ndW zG>R2A#jhd_zP=~lx-9rjoYHzE${JCNm9Tol%1oM`v`*daRvfpbk$tEiJEB{YKlxfJ zF?LJpXD{-*`a#31xU#kQW?lY=FYSps@zm7a(mabc8(oXuxvMzoin>XHEW~4W#`e@1 z`TayXnu+g@d^(QS9VYWBn@y71p$XYez9mxtP4){2+JdtPpPF%Qhobu}U(=s!PK*I`en(34-W=frbY zB28Zre)SO4+2glT)J<}zgXi;VY+Kf$!eS?M82*q4&bHCAc;JJq#C&jGj(KfuE8eM@ z$)am9CS7yR$5DJ#&OGl5G=3 zvYiUdQ-x70Y)C&%AxTFq^^qa4)o;aBnjvaCA}Oel#_W7mHLlHWQ-S4AV_d(9xEw0h z`-*g8cM<9@szT22KBdqHpQCzhMt?3xf42B6uDWiYVaVY7vJl9a&N28DIBM}jrnx!8 z%!ht$Y5gqxDgAQ>=@yw4&q%@kAnWap-nUiDT{w!~`_lZO?#rAtz}D~TXAeIh$5|jb2=uaApB?QqUd8*?q8vuNX0sUbxuuKg_$X%yo5H*uS(GzO{h-T2OgMC_=tl|Is16%1WLZSWm` zN@8~AD<7zAc+9s!z8A7X>JR5$36~S~b6&*vLuMapK5wv(BE#Ex68Xe9k7Z&07tJ9@ zu*b9Ufqo+iRFQ`>8>m{I7$`5;=ohv2WekZMs$tkIbE5J>^JiT?Z`PdjSZO5tKdorSq1Pbo*|V z-n|?sv#jZ~m#coNGabwB`lWEOY@U_|y{2!g zK4yLSo&2tnrngwn?$zGj_rC_J@;0lx$N$HX{jUMnWindows true kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies);ws2_32.lib;Iphlpapi.lib - RequireAdministrator @@ -122,7 +121,6 @@ kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies);ws2_32.lib;Iphlpapi.lib;Comctl32.lib - RequireAdministrator @@ -142,7 +140,6 @@ true true 
kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies);ws2_32.lib;Iphlpapi.lib - RequireAdministrator @@ -167,11 +164,13 @@ true true kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies);ws2_32.lib;Iphlpapi.lib - RequireAdministrator + + + @@ -179,12 +178,15 @@ + + + @@ -193,13 +195,18 @@ + + + + + NotUsing @@ -229,10 +236,10 @@ + - diff --git a/TunSafe.vcxproj.filters b/TunSafe.vcxproj.filters index 220b7f6..2a444bb 100644 --- a/TunSafe.vcxproj.filters +++ b/TunSafe.vcxproj.filters @@ -53,6 +53,9 @@ Source Files + + Source Files + crypto @@ -71,6 +74,21 @@ Source Files + + Source Files\Win32 + + + Source Files\Win32 + + + Source Files + + + Source Files + + + Source Files\Win32 + @@ -109,6 +127,9 @@ Source Files + + Source Files + crypto @@ -121,6 +142,18 @@ Source Files + + Source Files\Win32 + + + Source Files\Win32 + + + Source Files + + + Source Files + @@ -128,8 +161,8 @@ - + diff --git a/benchmark.cpp b/benchmark.cpp index 4d30d80..43b0ee4 100644 --- a/benchmark.cpp +++ b/benchmark.cpp @@ -66,11 +66,15 @@ void Benchmark() { fake_glb = dst; +size_t max_bytes = 1000000000; +#if defined(ARCH_CPU_ARM_FAMILY) + max_bytes = 100000000; +#endif auto RunOneBenchmark = [&](const char *name, const std::function &ff) { uint64 bytes = 0; QueryPerformanceCounter((LARGE_INTEGER*)&b); size_t i; - for (i = 0; bytes < 1000000000; i++) + for (i = 0; bytes < max_bytes; i++) bytes += ff(i); QueryPerformanceCounter((LARGE_INTEGER*)&a); RINFO("%s: %f MB/s", name, (double)bytes * 0.000001 / (a - b) * f); diff --git a/build.py b/build.py index 934e577..8bcfe72 100644 --- a/build.py +++ b/build.py @@ -11,9 +11,10 @@ import re MSBUILD_PATH = r"C:\Dev\VS2017\MSBuild\15.0\Bin\MSBuild.exe" NSIS_PATH = r'C:\Dev\NSIS\makeNSIS.EXE' + SIGNTOOL_PATH = r'c:\Program Files 
(x86)\Windows Kits\10\bin\10.0.15063.0\x86\signtool.exe' -SIGNTOOL_KEY_PATH = '' # put key here -SIGNTOOL_PASS = '' # put key pass here +SIGNTOOL_KEY_PATH = "" # path to key file +SIGNTOOL_PASS = "" # password def RmTree(path): try: diff --git a/build_freebsd.sh b/build_freebsd.sh old mode 100644 new mode 100755 index 8f516b5..b546216 --- a/build_freebsd.sh +++ b/build_freebsd.sh @@ -1,5 +1,4 @@ -g++7 -I . -O2 -static -mssse3 -o tunsafe benchmark.cpp tunsafe_cpu.cpp wireguard_config.cpp \ +g++7 -I . -O2 -DNDEBUG -static -mssse3 -o tunsafe benchmark.cpp tunsafe_cpu.cpp wireguard_config.cpp ip_to_peer_map.cpp tunsafe_threading.cpp \ wireguard.cpp wireguard_proto.cpp util.cpp network_bsd.cpp network_bsd_common.cpp \ crypto/blake2s.cpp crypto/blake2s_sse.cpp crypto/chacha20poly1305.cpp crypto/curve25519-donna.cpp \ crypto/siphash.cpp crypto/chacha20_x64_gas.s crypto/poly1305_x64_gas.s ipzip2/ipzip2.cpp -lrt -pthread - diff --git a/build_linux.sh b/build_linux.sh old mode 100644 new mode 100755 index 85a8a0b..026955e --- a/build_linux.sh +++ b/build_linux.sh @@ -1,6 +1,6 @@ #!/bin/sh clang++-6.0 -c -march=skylake-avx512 crypto/poly1305_x64_gas.s crypto/chacha20_x64_gas.s -clang++-6.0 -I . -O3 -mssse3 -pthread -lrt -o tunsafe util.cpp wireguard_config.cpp wireguard.cpp \ +clang++-6.0 -I . 
-O3 -DNDEBUG -mssse3 -pthread -lrt -o tunsafe util.cpp wireguard_config.cpp wireguard.cpp ip_to_peer_map.cpp tunsafe_threading.cpp \ wireguard_proto.cpp network_bsd.cpp network_bsd_common.cpp tunsafe_cpu.cpp benchmark.cpp crypto/blake2s.cpp crypto/blake2s_sse.cpp crypto/chacha20poly1305.cpp \ crypto/curve25519-donna.cpp crypto/siphash.cpp chacha20_x64_gas.o crypto/aesgcm/aesni_gcm_x64_gas.s \ crypto/aesgcm/aesni_x64_gas.s crypto/aesgcm/aesgcm.cpp poly1305_x64_gas.o ipzip2/ipzip2.cpp \ diff --git a/build_linux_rpi.sh b/build_linux_rpi.sh new file mode 100755 index 0000000..96208a8 --- /dev/null +++ b/build_linux_rpi.sh @@ -0,0 +1,11 @@ +#!/bin/sh + +set -e + +cpp -D__ARM_ARCH__=7 crypto/chacha20/chacha20-arm.s > crypto/chacha20/chacha20-arm.preprocessed.s +cpp -D__ARM_ARCH__=7 crypto/poly1305/poly1305-arm.s > crypto/poly1305/poly1305-arm.preprocessed.s + +g++-6 -mfpu=neon -I . -g -O2 -DNDEBUG -fno-omit-frame-pointer -march=armv7-a -mthumb -std=c++11 -pthread -lrt -o tunsafe util.cpp wireguard_config.cpp wireguard.cpp ip_to_peer_map.cpp tunsafe_threading.cpp \ +wireguard_proto.cpp network_bsd.cpp network_bsd_common.cpp tunsafe_cpu.cpp benchmark.cpp crypto/blake2s.cpp crypto/chacha20poly1305.cpp \ +crypto/curve25519-donna.cpp crypto/siphash.cpp crypto/aesgcm/aesgcm.cpp ipzip2/ipzip2.cpp \ +crypto/chacha20/chacha20-arm.preprocessed.s crypto/poly1305/poly1305-arm.preprocessed.s diff --git a/build_osx.sh b/build_osx.sh old mode 100644 new mode 100755 index c77dc01..29a02c0 --- a/build_osx.sh +++ b/build_osx.sh @@ -4,7 +4,7 @@ set -e clang++ -c -mavx512f -mavx512vl crypto/poly1305_x64_gas_macosx.s crypto/chacha20_x64_gas_macosx.s clang++ -g -O3 -I . 
-std=c++11 -DNDEBUG=1 -fno-exceptions -fno-rtti -ffunction-sections -o tunsafe \ -wireguard_config.cpp wireguard.cpp wireguard_proto.cpp util.cpp network_bsd.cpp network_bsd_common.cpp benchmark.cpp tunsafe_cpu.cpp \ +wireguard_config.cpp ip_to_peer_map.cpp tunsafe_threading.cpp wireguard.cpp wireguard_proto.cpp util.cpp network_bsd.cpp network_bsd_common.cpp benchmark.cpp tunsafe_cpu.cpp \ crypto/blake2s.cpp crypto/blake2s_sse.cpp crypto/chacha20poly1305.cpp crypto/curve25519-donna.cpp \ crypto/siphash.cpp crypto/aesgcm/aesgcm.cpp ipzip2/ipzip2.cpp \ crypto/aesgcm/aesni_gcm_x64_gas_macosx.s crypto/aesgcm/aesni_x64_gas_macosx.s crypto/aesgcm/ghash_x64_gas_macosx.s \ diff --git a/crypto/blake2s.cpp b/crypto/blake2s.cpp old mode 100644 new mode 100755 diff --git a/crypto/blake2s_sse.cpp b/crypto/blake2s_sse.cpp old mode 100644 new mode 100755 diff --git a/crypto/chacha20/chacha20-arm.pl b/crypto/chacha20/chacha20-arm.pl new file mode 100644 index 0000000..cec1b89 --- /dev/null +++ b/crypto/chacha20/chacha20-arm.pl @@ -0,0 +1,1160 @@ +#! /usr/bin/env perl +# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# December 2014 +# +# ChaCha20 for ARMv4. +# +# Performance in cycles per byte out of large buffer. 
+# +# IALU/gcc-4.4 1xNEON 3xNEON+1xIALU +# +# Cortex-A5 19.3(*)/+95% 21.8 14.1 +# Cortex-A8 10.5(*)/+160% 13.9 6.35 +# Cortex-A9 12.9(**)/+110% 14.3 6.50 +# Cortex-A15 11.0/+40% 16.0 5.00 +# Snapdragon S4 11.5/+125% 13.6 4.90 +# +# (*) most "favourable" result for aligned data on little-endian +# processor, result for misaligned data is 10-15% lower; +# (**) this result is a trade-off: it can be improved by 20%, +# but then Snapdragon S4 and Cortex-A8 results get +# 20-25% worse; + +$flavour = shift; +if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } +else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}../arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} + +sub AUTOLOAD() # thunk [simplified] x86-style perlasm +{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./; + my $arg = pop; + $arg = "#$arg" if ($arg*1 eq $arg); + $code .= "\t$opcode\t".join(',',@_,$arg)."\n"; +} + +my @x=map("r$_",(0..7,"x","x","x","x",12,"x",14,"x")); +my @t=map("r$_",(8..11)); + +sub ROUND { +my ($a0,$b0,$c0,$d0)=@_; +my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); +my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); +my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); +my $odd = $d0&1; +my ($xc,$xc_) = (@t[0..1]); +my ($xd,$xd_) = $odd ? (@t[2],@x[$d1]) : (@x[$d0],@t[2]); +my @ret; + + # Consider order in which variables are addressed by their + # index: + # + # a b c d + # + # 0 4 8 12 < even round + # 1 5 9 13 + # 2 6 10 14 + # 3 7 11 15 + # 0 5 10 15 < odd round + # 1 6 11 12 + # 2 7 8 13 + # 3 4 9 14 + # + # 'a', 'b' are permanently allocated in registers, @x[0..7], + # while 'c's and pair of 'd's are maintained in memory. 
If + # you observe 'c' column, you'll notice that pair of 'c's is + # invariant between rounds. This means that we have to reload + # them once per round, in the middle. This is why you'll see + # bunch of 'c' stores and loads in the middle, but none in + # the beginning or end. If you observe 'd' column, you'll + # notice that 15 and 13 are reused in next pair of rounds. + # This is why these two are chosen for offloading to memory, + # to make loads count more. + push @ret,( + "&add (@x[$a0],@x[$a0],@x[$b0])", + "&mov ($xd,$xd,'ror#16')", + "&add (@x[$a1],@x[$a1],@x[$b1])", + "&mov ($xd_,$xd_,'ror#16')", + "&eor ($xd,$xd,@x[$a0],'ror#16')", + "&eor ($xd_,$xd_,@x[$a1],'ror#16')", + + "&add ($xc,$xc,$xd)", + "&mov (@x[$b0],@x[$b0],'ror#20')", + "&add ($xc_,$xc_,$xd_)", + "&mov (@x[$b1],@x[$b1],'ror#20')", + "&eor (@x[$b0],@x[$b0],$xc,'ror#20')", + "&eor (@x[$b1],@x[$b1],$xc_,'ror#20')", + + "&add (@x[$a0],@x[$a0],@x[$b0])", + "&mov ($xd,$xd,'ror#24')", + "&add (@x[$a1],@x[$a1],@x[$b1])", + "&mov ($xd_,$xd_,'ror#24')", + "&eor ($xd,$xd,@x[$a0],'ror#24')", + "&eor ($xd_,$xd_,@x[$a1],'ror#24')", + + "&add ($xc,$xc,$xd)", + "&mov (@x[$b0],@x[$b0],'ror#25')" ); + push @ret,( + "&str ($xd,'[sp,#4*(16+$d0)]')", + "&ldr ($xd,'[sp,#4*(16+$d2)]')" ) if ($odd); + push @ret,( + "&add ($xc_,$xc_,$xd_)", + "&mov (@x[$b1],@x[$b1],'ror#25')" ); + push @ret,( + "&str ($xd_,'[sp,#4*(16+$d1)]')", + "&ldr ($xd_,'[sp,#4*(16+$d3)]')" ) if (!$odd); + push @ret,( + "&eor (@x[$b0],@x[$b0],$xc,'ror#25')", + "&eor (@x[$b1],@x[$b1],$xc_,'ror#25')" ); + + $xd=@x[$d2] if (!$odd); + $xd_=@x[$d3] if ($odd); + push @ret,( + "&str ($xc,'[sp,#4*(16+$c0)]')", + "&ldr ($xc,'[sp,#4*(16+$c2)]')", + "&add (@x[$a2],@x[$a2],@x[$b2])", + "&mov ($xd,$xd,'ror#16')", + "&str ($xc_,'[sp,#4*(16+$c1)]')", + "&ldr ($xc_,'[sp,#4*(16+$c3)]')", + "&add (@x[$a3],@x[$a3],@x[$b3])", + "&mov ($xd_,$xd_,'ror#16')", + "&eor ($xd,$xd,@x[$a2],'ror#16')", + "&eor ($xd_,$xd_,@x[$a3],'ror#16')", + + "&add ($xc,$xc,$xd)", + 
"&mov (@x[$b2],@x[$b2],'ror#20')", + "&add ($xc_,$xc_,$xd_)", + "&mov (@x[$b3],@x[$b3],'ror#20')", + "&eor (@x[$b2],@x[$b2],$xc,'ror#20')", + "&eor (@x[$b3],@x[$b3],$xc_,'ror#20')", + + "&add (@x[$a2],@x[$a2],@x[$b2])", + "&mov ($xd,$xd,'ror#24')", + "&add (@x[$a3],@x[$a3],@x[$b3])", + "&mov ($xd_,$xd_,'ror#24')", + "&eor ($xd,$xd,@x[$a2],'ror#24')", + "&eor ($xd_,$xd_,@x[$a3],'ror#24')", + + "&add ($xc,$xc,$xd)", + "&mov (@x[$b2],@x[$b2],'ror#25')", + "&add ($xc_,$xc_,$xd_)", + "&mov (@x[$b3],@x[$b3],'ror#25')", + "&eor (@x[$b2],@x[$b2],$xc,'ror#25')", + "&eor (@x[$b3],@x[$b3],$xc_,'ror#25')" ); + + @ret; +} + +$code.=<<___; +#include "arm_arch.h" + +.text +#if defined(__thumb2__) || defined(__clang__) +.syntax unified +#endif +#if defined(__thumb2__) +.thumb +#else +.code 32 +#endif + +#if defined(__thumb2__) || defined(__clang__) +#define ldrhsb ldrbhs +#endif + +.align 5 +.Lsigma: +.long 0x61707865,0x3320646e,0x79622d32,0x6b206574 @ endian-neutral +.Lone: +.long 1,0,0,0 +#if __ARM_MAX_ARCH__>=7 +.LOPENSSL_armcap: +.word OPENSSL_armcap_P-.LChaCha20_ctr32 +#else +.word -1 +#endif + +.globl ChaCha20_ctr32 +.type ChaCha20_ctr32,%function +.align 5 +ChaCha20_ctr32: +.LChaCha20_ctr32: + ldr r12,[sp,#0] @ pull pointer to counter and nonce + stmdb sp!,{r0-r2,r4-r11,lr} +#if __ARM_ARCH__<7 && !defined(__thumb2__) + sub r14,pc,#16 @ ChaCha20_ctr32 +#else + adr r14,.LChaCha20_ctr32 +#endif + cmp r2,#0 @ len==0? 
+#ifdef __thumb2__ + itt eq +#endif + addeq sp,sp,#4*3 + beq .Lno_data +#if __ARM_MAX_ARCH__>=7 + cmp r2,#192 @ test len + bls .Lshort + ldr r4,[r14,#-32] + ldr r4,[r14,r4] +# ifdef __APPLE__ + ldr r4,[r4] +# endif + tst r4,#ARMV7_NEON + bne .LChaCha20_neon +.Lshort: +#endif + ldmia r12,{r4-r7} @ load counter and nonce + sub sp,sp,#4*(16) @ off-load area + sub r14,r14,#64 @ .Lsigma + stmdb sp!,{r4-r7} @ copy counter and nonce + ldmia r3,{r4-r11} @ load key + ldmia r14,{r0-r3} @ load sigma + stmdb sp!,{r4-r11} @ copy key + stmdb sp!,{r0-r3} @ copy sigma + str r10,[sp,#4*(16+10)] @ off-load "@x[10]" + str r11,[sp,#4*(16+11)] @ off-load "@x[11]" + b .Loop_outer_enter + +.align 4 +.Loop_outer: + ldmia sp,{r0-r9} @ load key material + str @t[3],[sp,#4*(32+2)] @ save len + str r12, [sp,#4*(32+1)] @ save inp + str r14, [sp,#4*(32+0)] @ save out +.Loop_outer_enter: + ldr @t[3], [sp,#4*(15)] + ldr @x[12],[sp,#4*(12)] @ modulo-scheduled load + ldr @t[2], [sp,#4*(13)] + ldr @x[14],[sp,#4*(14)] + str @t[3], [sp,#4*(16+15)] + mov @t[3],#10 + b .Loop + +.align 4 +.Loop: + subs @t[3],@t[3],#1 +___ + foreach (&ROUND(0, 4, 8,12)) { eval; } + foreach (&ROUND(0, 5,10,15)) { eval; } +$code.=<<___; + bne .Loop + + ldr @t[3],[sp,#4*(32+2)] @ load len + + str @t[0], [sp,#4*(16+8)] @ modulo-scheduled store + str @t[1], [sp,#4*(16+9)] + str @x[12],[sp,#4*(16+12)] + str @t[2], [sp,#4*(16+13)] + str @x[14],[sp,#4*(16+14)] + + @ at this point we have first half of 512-bit result in + @ @x[0-7] and second half at sp+4*(16+8) + + cmp @t[3],#64 @ done yet? +#ifdef __thumb2__ + itete lo +#endif + addlo r12,sp,#4*(0) @ shortcut or ... + ldrhs r12,[sp,#4*(32+1)] @ ... load inp + addlo r14,sp,#4*(0) @ shortcut or ... + ldrhs r14,[sp,#4*(32+0)] @ ... load out + + ldr @t[0],[sp,#4*(0)] @ load key material + ldr @t[1],[sp,#4*(1)] + +#if __ARM_ARCH__>=6 || !defined(__ARMEB__) +# if __ARM_ARCH__<7 + orr @t[2],r12,r14 + tst @t[2],#3 @ are input and output aligned? 
+ ldr @t[2],[sp,#4*(2)] + bne .Lunaligned + cmp @t[3],#64 @ restore flags +# else + ldr @t[2],[sp,#4*(2)] +# endif + ldr @t[3],[sp,#4*(3)] + + add @x[0],@x[0],@t[0] @ accumulate key material + add @x[1],@x[1],@t[1] +# ifdef __thumb2__ + itt hs +# endif + ldrhs @t[0],[r12],#16 @ load input + ldrhs @t[1],[r12,#-12] + + add @x[2],@x[2],@t[2] + add @x[3],@x[3],@t[3] +# ifdef __thumb2__ + itt hs +# endif + ldrhs @t[2],[r12,#-8] + ldrhs @t[3],[r12,#-4] +# if __ARM_ARCH__>=6 && defined(__ARMEB__) + rev @x[0],@x[0] + rev @x[1],@x[1] + rev @x[2],@x[2] + rev @x[3],@x[3] +# endif +# ifdef __thumb2__ + itt hs +# endif + eorhs @x[0],@x[0],@t[0] @ xor with input + eorhs @x[1],@x[1],@t[1] + add @t[0],sp,#4*(4) + str @x[0],[r14],#16 @ store output +# ifdef __thumb2__ + itt hs +# endif + eorhs @x[2],@x[2],@t[2] + eorhs @x[3],@x[3],@t[3] + ldmia @t[0],{@t[0]-@t[3]} @ load key material + str @x[1],[r14,#-12] + str @x[2],[r14,#-8] + str @x[3],[r14,#-4] + + add @x[4],@x[4],@t[0] @ accumulate key material + add @x[5],@x[5],@t[1] +# ifdef __thumb2__ + itt hs +# endif + ldrhs @t[0],[r12],#16 @ load input + ldrhs @t[1],[r12,#-12] + add @x[6],@x[6],@t[2] + add @x[7],@x[7],@t[3] +# ifdef __thumb2__ + itt hs +# endif + ldrhs @t[2],[r12,#-8] + ldrhs @t[3],[r12,#-4] +# if __ARM_ARCH__>=6 && defined(__ARMEB__) + rev @x[4],@x[4] + rev @x[5],@x[5] + rev @x[6],@x[6] + rev @x[7],@x[7] +# endif +# ifdef __thumb2__ + itt hs +# endif + eorhs @x[4],@x[4],@t[0] + eorhs @x[5],@x[5],@t[1] + add @t[0],sp,#4*(8) + str @x[4],[r14],#16 @ store output +# ifdef __thumb2__ + itt hs +# endif + eorhs @x[6],@x[6],@t[2] + eorhs @x[7],@x[7],@t[3] + str @x[5],[r14,#-12] + ldmia @t[0],{@t[0]-@t[3]} @ load key material + str @x[6],[r14,#-8] + add @x[0],sp,#4*(16+8) + str @x[7],[r14,#-4] + + ldmia @x[0],{@x[0]-@x[7]} @ load second half + + add @x[0],@x[0],@t[0] @ accumulate key material + add @x[1],@x[1],@t[1] +# ifdef __thumb2__ + itt hs +# endif + ldrhs @t[0],[r12],#16 @ load input + ldrhs @t[1],[r12,#-12] +# ifdef 
__thumb2__ + itt hi +# endif + strhi @t[2],[sp,#4*(16+10)] @ copy "@x[10]" while at it + strhi @t[3],[sp,#4*(16+11)] @ copy "@x[11]" while at it + add @x[2],@x[2],@t[2] + add @x[3],@x[3],@t[3] +# ifdef __thumb2__ + itt hs +# endif + ldrhs @t[2],[r12,#-8] + ldrhs @t[3],[r12,#-4] +# if __ARM_ARCH__>=6 && defined(__ARMEB__) + rev @x[0],@x[0] + rev @x[1],@x[1] + rev @x[2],@x[2] + rev @x[3],@x[3] +# endif +# ifdef __thumb2__ + itt hs +# endif + eorhs @x[0],@x[0],@t[0] + eorhs @x[1],@x[1],@t[1] + add @t[0],sp,#4*(12) + str @x[0],[r14],#16 @ store output +# ifdef __thumb2__ + itt hs +# endif + eorhs @x[2],@x[2],@t[2] + eorhs @x[3],@x[3],@t[3] + str @x[1],[r14,#-12] + ldmia @t[0],{@t[0]-@t[3]} @ load key material + str @x[2],[r14,#-8] + str @x[3],[r14,#-4] + + add @x[4],@x[4],@t[0] @ accumulate key material + add @x[5],@x[5],@t[1] +# ifdef __thumb2__ + itt hi +# endif + addhi @t[0],@t[0],#1 @ next counter value + strhi @t[0],[sp,#4*(12)] @ save next counter value +# ifdef __thumb2__ + itt hs +# endif + ldrhs @t[0],[r12],#16 @ load input + ldrhs @t[1],[r12,#-12] + add @x[6],@x[6],@t[2] + add @x[7],@x[7],@t[3] +# ifdef __thumb2__ + itt hs +# endif + ldrhs @t[2],[r12,#-8] + ldrhs @t[3],[r12,#-4] +# if __ARM_ARCH__>=6 && defined(__ARMEB__) + rev @x[4],@x[4] + rev @x[5],@x[5] + rev @x[6],@x[6] + rev @x[7],@x[7] +# endif +# ifdef __thumb2__ + itt hs +# endif + eorhs @x[4],@x[4],@t[0] + eorhs @x[5],@x[5],@t[1] +# ifdef __thumb2__ + it ne +# endif + ldrne @t[0],[sp,#4*(32+2)] @ re-load len +# ifdef __thumb2__ + itt hs +# endif + eorhs @x[6],@x[6],@t[2] + eorhs @x[7],@x[7],@t[3] + str @x[4],[r14],#16 @ store output + str @x[5],[r14,#-12] +# ifdef __thumb2__ + it hs +# endif + subhs @t[3],@t[0],#64 @ len-=64 + str @x[6],[r14,#-8] + str @x[7],[r14,#-4] + bhi .Loop_outer + + beq .Ldone +# if __ARM_ARCH__<7 + b .Ltail + +.align 4 +.Lunaligned: @ unaligned endian-neutral path + cmp @t[3],#64 @ restore flags +# endif +#endif +#if __ARM_ARCH__<7 + ldr @t[3],[sp,#4*(3)] +___ +for 
($i=0;$i<16;$i+=4) { +my $j=$i&0x7; + +$code.=<<___ if ($i==4); + add @x[0],sp,#4*(16+8) +___ +$code.=<<___ if ($i==8); + ldmia @x[0],{@x[0]-@x[7]} @ load second half +# ifdef __thumb2__ + itt hi +# endif + strhi @t[2],[sp,#4*(16+10)] @ copy "@x[10]" + strhi @t[3],[sp,#4*(16+11)] @ copy "@x[11]" +___ +$code.=<<___; + add @x[$j+0],@x[$j+0],@t[0] @ accumulate key material +___ +$code.=<<___ if ($i==12); +# ifdef __thumb2__ + itt hi +# endif + addhi @t[0],@t[0],#1 @ next counter value + strhi @t[0],[sp,#4*(12)] @ save next counter value +___ +$code.=<<___; + add @x[$j+1],@x[$j+1],@t[1] + add @x[$j+2],@x[$j+2],@t[2] +# ifdef __thumb2__ + itete lo +# endif + eorlo @t[0],@t[0],@t[0] @ zero or ... + ldrhsb @t[0],[r12],#16 @ ... load input + eorlo @t[1],@t[1],@t[1] + ldrhsb @t[1],[r12,#-12] + + add @x[$j+3],@x[$j+3],@t[3] +# ifdef __thumb2__ + itete lo +# endif + eorlo @t[2],@t[2],@t[2] + ldrhsb @t[2],[r12,#-8] + eorlo @t[3],@t[3],@t[3] + ldrhsb @t[3],[r12,#-4] + + eor @x[$j+0],@t[0],@x[$j+0] @ xor with input (or zero) + eor @x[$j+1],@t[1],@x[$j+1] +# ifdef __thumb2__ + itt hs +# endif + ldrhsb @t[0],[r12,#-15] @ load more input + ldrhsb @t[1],[r12,#-11] + eor @x[$j+2],@t[2],@x[$j+2] + strb @x[$j+0],[r14],#16 @ store output + eor @x[$j+3],@t[3],@x[$j+3] +# ifdef __thumb2__ + itt hs +# endif + ldrhsb @t[2],[r12,#-7] + ldrhsb @t[3],[r12,#-3] + strb @x[$j+1],[r14,#-12] + eor @x[$j+0],@t[0],@x[$j+0],lsr#8 + strb @x[$j+2],[r14,#-8] + eor @x[$j+1],@t[1],@x[$j+1],lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb @t[0],[r12,#-14] @ load more input + ldrhsb @t[1],[r12,#-10] + strb @x[$j+3],[r14,#-4] + eor @x[$j+2],@t[2],@x[$j+2],lsr#8 + strb @x[$j+0],[r14,#-15] + eor @x[$j+3],@t[3],@x[$j+3],lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb @t[2],[r12,#-6] + ldrhsb @t[3],[r12,#-2] + strb @x[$j+1],[r14,#-11] + eor @x[$j+0],@t[0],@x[$j+0],lsr#8 + strb @x[$j+2],[r14,#-7] + eor @x[$j+1],@t[1],@x[$j+1],lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb @t[0],[r12,#-13] @ load 
more input + ldrhsb @t[1],[r12,#-9] + strb @x[$j+3],[r14,#-3] + eor @x[$j+2],@t[2],@x[$j+2],lsr#8 + strb @x[$j+0],[r14,#-14] + eor @x[$j+3],@t[3],@x[$j+3],lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb @t[2],[r12,#-5] + ldrhsb @t[3],[r12,#-1] + strb @x[$j+1],[r14,#-10] + strb @x[$j+2],[r14,#-6] + eor @x[$j+0],@t[0],@x[$j+0],lsr#8 + strb @x[$j+3],[r14,#-2] + eor @x[$j+1],@t[1],@x[$j+1],lsr#8 + strb @x[$j+0],[r14,#-13] + eor @x[$j+2],@t[2],@x[$j+2],lsr#8 + strb @x[$j+1],[r14,#-9] + eor @x[$j+3],@t[3],@x[$j+3],lsr#8 + strb @x[$j+2],[r14,#-5] + strb @x[$j+3],[r14,#-1] +___ +$code.=<<___ if ($i<12); + add @t[0],sp,#4*(4+$i) + ldmia @t[0],{@t[0]-@t[3]} @ load key material +___ +} +$code.=<<___; +# ifdef __thumb2__ + it ne +# endif + ldrne @t[0],[sp,#4*(32+2)] @ re-load len +# ifdef __thumb2__ + it hs +# endif + subhs @t[3],@t[0],#64 @ len-=64 + bhi .Loop_outer + + beq .Ldone +#endif + +.Ltail: + ldr r12,[sp,#4*(32+1)] @ load inp + add @t[1],sp,#4*(0) + ldr r14,[sp,#4*(32+0)] @ load out + +.Loop_tail: + ldrb @t[2],[@t[1]],#1 @ read buffer on stack + ldrb @t[3],[r12],#1 @ read input + subs @t[0],@t[0],#1 + eor @t[3],@t[3],@t[2] + strb @t[3],[r14],#1 @ store output + bne .Loop_tail + +.Ldone: + add sp,sp,#4*(32+3) +.Lno_data: + ldmia sp!,{r4-r11,pc} +.size ChaCha20_ctr32,.-ChaCha20_ctr32 +___ + +{{{ +my ($a0,$b0,$c0,$d0,$a1,$b1,$c1,$d1,$a2,$b2,$c2,$d2,$t0,$t1,$t2,$t3) = + map("q$_",(0..15)); + +sub NEONROUND { +my $odd = pop; +my ($a,$b,$c,$d,$t)=@_; + + ( + "&vadd_i32 ($a,$a,$b)", + "&veor ($d,$d,$a)", + "&vrev32_16 ($d,$d)", # vrot ($d,16) + + "&vadd_i32 ($c,$c,$d)", + "&veor ($t,$b,$c)", + "&vshr_u32 ($b,$t,20)", + "&vsli_32 ($b,$t,12)", + + "&vadd_i32 ($a,$a,$b)", + "&veor ($t,$d,$a)", + "&vshr_u32 ($d,$t,24)", + "&vsli_32 ($d,$t,8)", + + "&vadd_i32 ($c,$c,$d)", + "&veor ($t,$b,$c)", + "&vshr_u32 ($b,$t,25)", + "&vsli_32 ($b,$t,7)", + + "&vext_8 ($c,$c,$c,8)", + "&vext_8 ($b,$b,$b,$odd?12:4)", + "&vext_8 ($d,$d,$d,$odd?4:12)" + ); +} + +$code.=<<___; +#if 
__ARM_MAX_ARCH__>=7 +.arch armv7-a +.fpu neon + +.type ChaCha20_neon,%function +.align 5 +ChaCha20_neon: + ldr r12,[sp,#0] @ pull pointer to counter and nonce + stmdb sp!,{r0-r2,r4-r11,lr} +.LChaCha20_neon: + adr r14,.Lsigma + vstmdb sp!,{d8-d15} @ ABI spec says so + stmdb sp!,{r0-r3} + + vld1.32 {$b0-$c0},[r3] @ load key + ldmia r3,{r4-r11} @ load key + + sub sp,sp,#4*(16+16) + vld1.32 {$d0},[r12] @ load counter and nonce + add r12,sp,#4*8 + ldmia r14,{r0-r3} @ load sigma + vld1.32 {$a0},[r14]! @ load sigma + vld1.32 {$t0},[r14] @ one + vst1.32 {$c0-$d0},[r12] @ copy 1/2key|counter|nonce + vst1.32 {$a0-$b0},[sp] @ copy sigma|1/2key + + str r10,[sp,#4*(16+10)] @ off-load "@x[10]" + str r11,[sp,#4*(16+11)] @ off-load "@x[11]" + vshl.i32 $t1#lo,$t0#lo,#1 @ two + vstr $t0#lo,[sp,#4*(16+0)] + vshl.i32 $t2#lo,$t0#lo,#2 @ four + vstr $t1#lo,[sp,#4*(16+2)] + vmov $a1,$a0 + vstr $t2#lo,[sp,#4*(16+4)] + vmov $a2,$a0 + vmov $b1,$b0 + vmov $b2,$b0 + b .Loop_neon_enter + +.align 4 +.Loop_neon_outer: + ldmia sp,{r0-r9} @ load key material + cmp @t[3],#64*2 @ if len<=64*2 + bls .Lbreak_neon @ switch to integer-only + vmov $a1,$a0 + str @t[3],[sp,#4*(32+2)] @ save len + vmov $a2,$a0 + str r12, [sp,#4*(32+1)] @ save inp + vmov $b1,$b0 + str r14, [sp,#4*(32+0)] @ save out + vmov $b2,$b0 +.Loop_neon_enter: + ldr @t[3], [sp,#4*(15)] + vadd.i32 $d1,$d0,$t0 @ counter+1 + ldr @x[12],[sp,#4*(12)] @ modulo-scheduled load + vmov $c1,$c0 + ldr @t[2], [sp,#4*(13)] + vmov $c2,$c0 + ldr @x[14],[sp,#4*(14)] + vadd.i32 $d2,$d1,$t0 @ counter+2 + str @t[3], [sp,#4*(16+15)] + mov @t[3],#10 + add @x[12],@x[12],#3 @ counter+3 + b .Loop_neon + +.align 4 +.Loop_neon: + subs @t[3],@t[3],#1 +___ + my @thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,0); + my @thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,0); + my @thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,0); + my @thread3=&ROUND(0,4,8,12); + + foreach (@thread0) { + eval; eval(shift(@thread3)); + eval(shift(@thread1)); eval(shift(@thread3)); + eval(shift(@thread2)); 
eval(shift(@thread3)); + } + + @thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,1); + @thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,1); + @thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,1); + @thread3=&ROUND(0,5,10,15); + + foreach (@thread0) { + eval; eval(shift(@thread3)); + eval(shift(@thread1)); eval(shift(@thread3)); + eval(shift(@thread2)); eval(shift(@thread3)); + } +$code.=<<___; + bne .Loop_neon + + add @t[3],sp,#32 + vld1.32 {$t0-$t1},[sp] @ load key material + vld1.32 {$t2-$t3},[@t[3]] + + ldr @t[3],[sp,#4*(32+2)] @ load len + + str @t[0], [sp,#4*(16+8)] @ modulo-scheduled store + str @t[1], [sp,#4*(16+9)] + str @x[12],[sp,#4*(16+12)] + str @t[2], [sp,#4*(16+13)] + str @x[14],[sp,#4*(16+14)] + + @ at this point we have first half of 512-bit result in + @ @x[0-7] and second half at sp+4*(16+8) + + ldr r12,[sp,#4*(32+1)] @ load inp + ldr r14,[sp,#4*(32+0)] @ load out + + vadd.i32 $a0,$a0,$t0 @ accumulate key material + vadd.i32 $a1,$a1,$t0 + vadd.i32 $a2,$a2,$t0 + vldr $t0#lo,[sp,#4*(16+0)] @ one + + vadd.i32 $b0,$b0,$t1 + vadd.i32 $b1,$b1,$t1 + vadd.i32 $b2,$b2,$t1 + vldr $t1#lo,[sp,#4*(16+2)] @ two + + vadd.i32 $c0,$c0,$t2 + vadd.i32 $c1,$c1,$t2 + vadd.i32 $c2,$c2,$t2 + vadd.i32 $d1#lo,$d1#lo,$t0#lo @ counter+1 + vadd.i32 $d2#lo,$d2#lo,$t1#lo @ counter+2 + + vadd.i32 $d0,$d0,$t3 + vadd.i32 $d1,$d1,$t3 + vadd.i32 $d2,$d2,$t3 + + cmp @t[3],#64*4 + blo .Ltail_neon + + vld1.8 {$t0-$t1},[r12]! @ load input + mov @t[3],sp + vld1.8 {$t2-$t3},[r12]! + veor $a0,$a0,$t0 @ xor with input + veor $b0,$b0,$t1 + vld1.8 {$t0-$t1},[r12]! + veor $c0,$c0,$t2 + veor $d0,$d0,$t3 + vld1.8 {$t2-$t3},[r12]! + + veor $a1,$a1,$t0 + vst1.8 {$a0-$b0},[r14]! @ store output + veor $b1,$b1,$t1 + vld1.8 {$t0-$t1},[r12]! + veor $c1,$c1,$t2 + vst1.8 {$c0-$d0},[r14]! + veor $d1,$d1,$t3 + vld1.8 {$t2-$t3},[r12]! + + veor $a2,$a2,$t0 + vld1.32 {$a0-$b0},[@t[3]]! 
@ load for next iteration + veor $t0#hi,$t0#hi,$t0#hi + vldr $t0#lo,[sp,#4*(16+4)] @ four + veor $b2,$b2,$t1 + vld1.32 {$c0-$d0},[@t[3]] + veor $c2,$c2,$t2 + vst1.8 {$a1-$b1},[r14]! + veor $d2,$d2,$t3 + vst1.8 {$c1-$d1},[r14]! + + vadd.i32 $d0#lo,$d0#lo,$t0#lo @ next counter value + vldr $t0#lo,[sp,#4*(16+0)] @ one + + ldmia sp,{@t[0]-@t[3]} @ load key material + add @x[0],@x[0],@t[0] @ accumulate key material + ldr @t[0],[r12],#16 @ load input + vst1.8 {$a2-$b2},[r14]! + add @x[1],@x[1],@t[1] + ldr @t[1],[r12,#-12] + vst1.8 {$c2-$d2},[r14]! + add @x[2],@x[2],@t[2] + ldr @t[2],[r12,#-8] + add @x[3],@x[3],@t[3] + ldr @t[3],[r12,#-4] +# ifdef __ARMEB__ + rev @x[0],@x[0] + rev @x[1],@x[1] + rev @x[2],@x[2] + rev @x[3],@x[3] +# endif + eor @x[0],@x[0],@t[0] @ xor with input + add @t[0],sp,#4*(4) + eor @x[1],@x[1],@t[1] + str @x[0],[r14],#16 @ store output + eor @x[2],@x[2],@t[2] + str @x[1],[r14,#-12] + eor @x[3],@x[3],@t[3] + ldmia @t[0],{@t[0]-@t[3]} @ load key material + str @x[2],[r14,#-8] + str @x[3],[r14,#-4] + + add @x[4],@x[4],@t[0] @ accumulate key material + ldr @t[0],[r12],#16 @ load input + add @x[5],@x[5],@t[1] + ldr @t[1],[r12,#-12] + add @x[6],@x[6],@t[2] + ldr @t[2],[r12,#-8] + add @x[7],@x[7],@t[3] + ldr @t[3],[r12,#-4] +# ifdef __ARMEB__ + rev @x[4],@x[4] + rev @x[5],@x[5] + rev @x[6],@x[6] + rev @x[7],@x[7] +# endif + eor @x[4],@x[4],@t[0] + add @t[0],sp,#4*(8) + eor @x[5],@x[5],@t[1] + str @x[4],[r14],#16 @ store output + eor @x[6],@x[6],@t[2] + str @x[5],[r14,#-12] + eor @x[7],@x[7],@t[3] + ldmia @t[0],{@t[0]-@t[3]} @ load key material + str @x[6],[r14,#-8] + add @x[0],sp,#4*(16+8) + str @x[7],[r14,#-4] + + ldmia @x[0],{@x[0]-@x[7]} @ load second half + + add @x[0],@x[0],@t[0] @ accumulate key material + ldr @t[0],[r12],#16 @ load input + add @x[1],@x[1],@t[1] + ldr @t[1],[r12,#-12] +# ifdef __thumb2__ + it hi +# endif + strhi @t[2],[sp,#4*(16+10)] @ copy "@x[10]" while at it + add @x[2],@x[2],@t[2] + ldr @t[2],[r12,#-8] +# ifdef __thumb2__ + it hi 
+# endif + strhi @t[3],[sp,#4*(16+11)] @ copy "@x[11]" while at it + add @x[3],@x[3],@t[3] + ldr @t[3],[r12,#-4] +# ifdef __ARMEB__ + rev @x[0],@x[0] + rev @x[1],@x[1] + rev @x[2],@x[2] + rev @x[3],@x[3] +# endif + eor @x[0],@x[0],@t[0] + add @t[0],sp,#4*(12) + eor @x[1],@x[1],@t[1] + str @x[0],[r14],#16 @ store output + eor @x[2],@x[2],@t[2] + str @x[1],[r14,#-12] + eor @x[3],@x[3],@t[3] + ldmia @t[0],{@t[0]-@t[3]} @ load key material + str @x[2],[r14,#-8] + str @x[3],[r14,#-4] + + add @x[4],@x[4],@t[0] @ accumulate key material + add @t[0],@t[0],#4 @ next counter value + add @x[5],@x[5],@t[1] + str @t[0],[sp,#4*(12)] @ save next counter value + ldr @t[0],[r12],#16 @ load input + add @x[6],@x[6],@t[2] + add @x[4],@x[4],#3 @ counter+3 + ldr @t[1],[r12,#-12] + add @x[7],@x[7],@t[3] + ldr @t[2],[r12,#-8] + ldr @t[3],[r12,#-4] +# ifdef __ARMEB__ + rev @x[4],@x[4] + rev @x[5],@x[5] + rev @x[6],@x[6] + rev @x[7],@x[7] +# endif + eor @x[4],@x[4],@t[0] +# ifdef __thumb2__ + it hi +# endif + ldrhi @t[0],[sp,#4*(32+2)] @ re-load len + eor @x[5],@x[5],@t[1] + eor @x[6],@x[6],@t[2] + str @x[4],[r14],#16 @ store output + eor @x[7],@x[7],@t[3] + str @x[5],[r14,#-12] + sub @t[3],@t[0],#64*4 @ len-=64*4 + str @x[6],[r14,#-8] + str @x[7],[r14,#-4] + bhi .Loop_neon_outer + + b .Ldone_neon + +.align 4 +.Lbreak_neon: + @ harmonize NEON and integer-only stack frames: load data + @ from NEON frame, but save to integer-only one; distance + @ between the two is 4*(32+4+16-32)=4*(20). 
+ + str @t[3], [sp,#4*(20+32+2)] @ save len + add @t[3],sp,#4*(32+4) + str r12, [sp,#4*(20+32+1)] @ save inp + str r14, [sp,#4*(20+32+0)] @ save out + + ldr @x[12],[sp,#4*(16+10)] + ldr @x[14],[sp,#4*(16+11)] + vldmia @t[3],{d8-d15} @ fulfill ABI requirement + str @x[12],[sp,#4*(20+16+10)] @ copy "@x[10]" + str @x[14],[sp,#4*(20+16+11)] @ copy "@x[11]" + + ldr @t[3], [sp,#4*(15)] + ldr @x[12],[sp,#4*(12)] @ modulo-scheduled load + ldr @t[2], [sp,#4*(13)] + ldr @x[14],[sp,#4*(14)] + str @t[3], [sp,#4*(20+16+15)] + add @t[3],sp,#4*(20) + vst1.32 {$a0-$b0},[@t[3]]! @ copy key + add sp,sp,#4*(20) @ switch frame + vst1.32 {$c0-$d0},[@t[3]] + mov @t[3],#10 + b .Loop @ go integer-only + +.align 4 +.Ltail_neon: + cmp @t[3],#64*3 + bhs .L192_or_more_neon + cmp @t[3],#64*2 + bhs .L128_or_more_neon + cmp @t[3],#64*1 + bhs .L64_or_more_neon + + add @t[0],sp,#4*(8) + vst1.8 {$a0-$b0},[sp] + add @t[2],sp,#4*(0) + vst1.8 {$c0-$d0},[@t[0]] + b .Loop_tail_neon + +.align 4 +.L64_or_more_neon: + vld1.8 {$t0-$t1},[r12]! + vld1.8 {$t2-$t3},[r12]! + veor $a0,$a0,$t0 + veor $b0,$b0,$t1 + veor $c0,$c0,$t2 + veor $d0,$d0,$t3 + vst1.8 {$a0-$b0},[r14]! + vst1.8 {$c0-$d0},[r14]! + + beq .Ldone_neon + + add @t[0],sp,#4*(8) + vst1.8 {$a1-$b1},[sp] + add @t[2],sp,#4*(0) + vst1.8 {$c1-$d1},[@t[0]] + sub @t[3],@t[3],#64*1 @ len-=64*1 + b .Loop_tail_neon + +.align 4 +.L128_or_more_neon: + vld1.8 {$t0-$t1},[r12]! + vld1.8 {$t2-$t3},[r12]! + veor $a0,$a0,$t0 + veor $b0,$b0,$t1 + vld1.8 {$t0-$t1},[r12]! + veor $c0,$c0,$t2 + veor $d0,$d0,$t3 + vld1.8 {$t2-$t3},[r12]! + + veor $a1,$a1,$t0 + veor $b1,$b1,$t1 + vst1.8 {$a0-$b0},[r14]! + veor $c1,$c1,$t2 + vst1.8 {$c0-$d0},[r14]! + veor $d1,$d1,$t3 + vst1.8 {$a1-$b1},[r14]! + vst1.8 {$c1-$d1},[r14]! + + beq .Ldone_neon + + add @t[0],sp,#4*(8) + vst1.8 {$a2-$b2},[sp] + add @t[2],sp,#4*(0) + vst1.8 {$c2-$d2},[@t[0]] + sub @t[3],@t[3],#64*2 @ len-=64*2 + b .Loop_tail_neon + +.align 4 +.L192_or_more_neon: + vld1.8 {$t0-$t1},[r12]! + vld1.8 {$t2-$t3},[r12]! 
+ veor $a0,$a0,$t0 + veor $b0,$b0,$t1 + vld1.8 {$t0-$t1},[r12]! + veor $c0,$c0,$t2 + veor $d0,$d0,$t3 + vld1.8 {$t2-$t3},[r12]! + + veor $a1,$a1,$t0 + veor $b1,$b1,$t1 + vld1.8 {$t0-$t1},[r12]! + veor $c1,$c1,$t2 + vst1.8 {$a0-$b0},[r14]! + veor $d1,$d1,$t3 + vld1.8 {$t2-$t3},[r12]! + + veor $a2,$a2,$t0 + vst1.8 {$c0-$d0},[r14]! + veor $b2,$b2,$t1 + vst1.8 {$a1-$b1},[r14]! + veor $c2,$c2,$t2 + vst1.8 {$c1-$d1},[r14]! + veor $d2,$d2,$t3 + vst1.8 {$a2-$b2},[r14]! + vst1.8 {$c2-$d2},[r14]! + + beq .Ldone_neon + + ldmia sp,{@t[0]-@t[3]} @ load key material + add @x[0],@x[0],@t[0] @ accumulate key material + add @t[0],sp,#4*(4) + add @x[1],@x[1],@t[1] + add @x[2],@x[2],@t[2] + add @x[3],@x[3],@t[3] + ldmia @t[0],{@t[0]-@t[3]} @ load key material + + add @x[4],@x[4],@t[0] @ accumulate key material + add @t[0],sp,#4*(8) + add @x[5],@x[5],@t[1] + add @x[6],@x[6],@t[2] + add @x[7],@x[7],@t[3] + ldmia @t[0],{@t[0]-@t[3]} @ load key material +# ifdef __ARMEB__ + rev @x[0],@x[0] + rev @x[1],@x[1] + rev @x[2],@x[2] + rev @x[3],@x[3] + rev @x[4],@x[4] + rev @x[5],@x[5] + rev @x[6],@x[6] + rev @x[7],@x[7] +# endif + stmia sp,{@x[0]-@x[7]} + add @x[0],sp,#4*(16+8) + + ldmia @x[0],{@x[0]-@x[7]} @ load second half + + add @x[0],@x[0],@t[0] @ accumulate key material + add @t[0],sp,#4*(12) + add @x[1],@x[1],@t[1] + add @x[2],@x[2],@t[2] + add @x[3],@x[3],@t[3] + ldmia @t[0],{@t[0]-@t[3]} @ load key material + + add @x[4],@x[4],@t[0] @ accumulate key material + add @t[0],sp,#4*(8) + add @x[5],@x[5],@t[1] + add @x[4],@x[4],#3 @ counter+3 + add @x[6],@x[6],@t[2] + add @x[7],@x[7],@t[3] + ldr @t[3],[sp,#4*(32+2)] @ re-load len +# ifdef __ARMEB__ + rev @x[0],@x[0] + rev @x[1],@x[1] + rev @x[2],@x[2] + rev @x[3],@x[3] + rev @x[4],@x[4] + rev @x[5],@x[5] + rev @x[6],@x[6] + rev @x[7],@x[7] +# endif + stmia @t[0],{@x[0]-@x[7]} + add @t[2],sp,#4*(0) + sub @t[3],@t[3],#64*3 @ len-=64*3 + +.Loop_tail_neon: + ldrb @t[0],[@t[2]],#1 @ read buffer on stack + ldrb @t[1],[r12],#1 @ read input + subs 
@t[3],@t[3],#1 + eor @t[0],@t[0],@t[1] + strb @t[0],[r14],#1 @ store output + bne .Loop_tail_neon + +.Ldone_neon: + add sp,sp,#4*(32+4) + vldmia sp,{d8-d15} + add sp,sp,#4*(16+3) + ldmia sp!,{r4-r11,pc} +.size ChaCha20_neon,.-ChaCha20_neon +.comm OPENSSL_armcap_P,4,4 +#endif +___ +}}} + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/geo; + + s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo; + + print $_,"\n"; +} +close STDOUT; diff --git a/crypto/chacha20/chacha20-arm.s b/crypto/chacha20/chacha20-arm.s new file mode 100644 index 0000000..2e22fd1 --- /dev/null +++ b/crypto/chacha20/chacha20-arm.s @@ -0,0 +1,1475 @@ +/* SPDX-License-Identifier: OpenSSL OR (BSD-3-Clause OR GPL-2.0) + * + * Copyright (C) 2015-2018 Jason A. Donenfeld . All Rights Reserved. + * Copyright 2016 The OpenSSL Project Authors. All Rights Reserved. + */ + +/*#include */ + +.text +#if defined(__thumb2__) || defined(__clang__) +.syntax unified +#endif +#if defined(__thumb2__) +.thumb +#else +.code 32 +#endif + +#if defined(__thumb2__) || defined(__clang__) +#define ldrbhs ldrbhs /* NOTE(review): expands to itself, so it has no effect - confirm whether an alias was intended */ +#endif + +.align 5 +.Lsigma: /* the four ChaCha20 constant words; read as little-endian bytes they spell: expand 32-byte k */ +.long 0x61707865,0x3320646e,0x79622d32,0x6b206574 @ endian-neutral +.Lone: /* vector {1,0,0,0}: loaded into q12 below and added to the first 32-bit word of the counter|nonce vector */ +.long 1,0,0,0 +.word -1 + +#if __ARM_ARCH__ >= 7 +.arch armv7-a +.fpu neon + +.align 5 +.globl chacha20_neon +.type chacha20_neon,%function /* chacha20_neon: r0=out, r1=inp, r2=len, r3=key, [sp]=pointer to counter and nonce. Returns at once when len==0; len<=192 branches to .Lshort. */ +chacha20_neon: + ldr r12,[sp,#0] @ pull pointer to counter and nonce + stmdb sp!,{r0-r2,r4-r11,lr} /* 12 words pushed: out, inp, len, then r4-r11 and lr */ + cmp r2,#0 @ len==0? +#ifdef __thumb2__ + itt eq +#endif + addeq sp,sp,#4*3 /* len==0: drop the saved r0-r2 so that only r4-r11,lr remain for the pop at .Lno_data_neon */ + beq .Lno_data_neon + cmp r2,#192 @ test len + bls .Lshort +.Lchacha20_neon_begin: + adr r14,.Lsigma + vstmdb sp!,{d8-d15} @ ABI spec says so + stmdb sp!,{r0-r3} /* second copy of out, inp, len, key ptr; out/inp/len are read back later at sp+4*(32+0..2) */ + + vld1.32 {q1-q2},[r3] @ load key + ldmia r3,{r4-r11} @ load key + + sub sp,sp,#4*(16+16) /* 32-word work area: words 0-15 hold sigma|key|counter|nonce, words 16-31 are scratch */ + vld1.32 {q3},[r12] @ load counter and nonce + add r12,sp,#4*8 + ldmia r14,{r0-r3} @ load sigma + vld1.32 {q0},[r14]!
@ load sigma + vld1.32 {q12},[r14] @ one + vst1.32 {q2-q3},[r12] @ copy 1/2key|counter|nonce + vst1.32 {q0-q1},[sp] @ copy sigma|1/2key + + str r10,[sp,#4*(16+10)] @ off-load "rx" + str r11,[sp,#4*(16+11)] @ off-load "rx" + vshl.i32 d26,d24,#1 @ two + vstr d24,[sp,#4*(16+0)] + vshl.i32 d28,d24,#2 @ four + vstr d26,[sp,#4*(16+2)] + vmov q4,q0 + vstr d28,[sp,#4*(16+4)] + vmov q8,q0 + vmov q5,q1 + vmov q9,q1 + b .Loop_neon_enter + +.align 4 +.Loop_neon_outer: + ldmia sp,{r0-r9} @ load key material + cmp r11,#64*2 @ if len<=64*2 + bls .Lbreak_neon @ switch to integer-only + vmov q4,q0 + str r11,[sp,#4*(32+2)] @ save len + vmov q8,q0 + str r12, [sp,#4*(32+1)] @ save inp + vmov q5,q1 + str r14, [sp,#4*(32+0)] @ save out + vmov q9,q1 +.Loop_neon_enter: + ldr r11, [sp,#4*(15)] + vadd.i32 q7,q3,q12 @ counter+1 + ldr r12,[sp,#4*(12)] @ modulo-scheduled load + vmov q6,q2 + ldr r10, [sp,#4*(13)] + vmov q10,q2 + ldr r14,[sp,#4*(14)] + vadd.i32 q11,q7,q12 @ counter+2 + str r11, [sp,#4*(16+15)] + mov r11,#10 + add r12,r12,#3 @ counter+3 + b .Loop_neon + +.align 4 +.Loop_neon: + subs r11,r11,#1 /* r11 counts down from 10; each pass interleaves NEON work on three blocks (q0-q3, q4-q7, q8-q11) with a fourth block kept in core registers */ + vadd.i32 q0,q0,q1 + add r0,r0,r4 + vadd.i32 q4,q4,q5 + mov r12,r12,ror#16 + vadd.i32 q8,q8,q9 + add r1,r1,r5 + veor q3,q3,q0 + mov r10,r10,ror#16 + veor q7,q7,q4 + eor r12,r12,r0,ror#16 + veor q11,q11,q8 + eor r10,r10,r1,ror#16 + vrev32.16 q3,q3 + add r8,r8,r12 + vrev32.16 q7,q7 + mov r4,r4,ror#20 + vrev32.16 q11,q11 + add r9,r9,r10 + vadd.i32 q2,q2,q3 + mov r5,r5,ror#20 + vadd.i32 q6,q6,q7 + eor r4,r4,r8,ror#20 + vadd.i32 q10,q10,q11 + eor r5,r5,r9,ror#20 + veor q12,q1,q2 + add r0,r0,r4 + veor q13,q5,q6 + mov r12,r12,ror#24 + veor q14,q9,q10 + add r1,r1,r5 + vshr.u32 q1,q12,#20 + mov r10,r10,ror#24 + vshr.u32 q5,q13,#20 + eor r12,r12,r0,ror#24 + vshr.u32 q9,q14,#20 + eor r10,r10,r1,ror#24 + vsli.32 q1,q12,#12 + add r8,r8,r12 + vsli.32 q5,q13,#12 + mov r4,r4,ror#25 + vsli.32 q9,q14,#12 + add r9,r9,r10 + vadd.i32 q0,q0,q1 + mov r5,r5,ror#25 + vadd.i32 q4,q4,q5 + str r10,[sp,#4*(16+13)] +
vadd.i32 q8,q8,q9 + ldr r10,[sp,#4*(16+15)] + veor q12,q3,q0 + eor r4,r4,r8,ror#25 + veor q13,q7,q4 + eor r5,r5,r9,ror#25 + veor q14,q11,q8 + str r8,[sp,#4*(16+8)] + vshr.u32 q3,q12,#24 + ldr r8,[sp,#4*(16+10)] + vshr.u32 q7,q13,#24 + add r2,r2,r6 + vshr.u32 q11,q14,#24 + mov r14,r14,ror#16 + vsli.32 q3,q12,#8 + str r9,[sp,#4*(16+9)] + vsli.32 q7,q13,#8 + ldr r9,[sp,#4*(16+11)] + vsli.32 q11,q14,#8 + add r3,r3,r7 + vadd.i32 q2,q2,q3 + mov r10,r10,ror#16 + vadd.i32 q6,q6,q7 + eor r14,r14,r2,ror#16 + vadd.i32 q10,q10,q11 + eor r10,r10,r3,ror#16 + veor q12,q1,q2 + add r8,r8,r14 + veor q13,q5,q6 + mov r6,r6,ror#20 + veor q14,q9,q10 + add r9,r9,r10 + vshr.u32 q1,q12,#25 + mov r7,r7,ror#20 + vshr.u32 q5,q13,#25 + eor r6,r6,r8,ror#20 + vshr.u32 q9,q14,#25 + eor r7,r7,r9,ror#20 + vsli.32 q1,q12,#7 + add r2,r2,r6 + vsli.32 q5,q13,#7 + mov r14,r14,ror#24 + vsli.32 q9,q14,#7 + add r3,r3,r7 + vext.8 q2,q2,q2,#8 + mov r10,r10,ror#24 + vext.8 q6,q6,q6,#8 + eor r14,r14,r2,ror#24 + vext.8 q10,q10,q10,#8 + eor r10,r10,r3,ror#24 + vext.8 q1,q1,q1,#4 + add r8,r8,r14 + vext.8 q5,q5,q5,#4 + mov r6,r6,ror#25 + vext.8 q9,q9,q9,#4 + add r9,r9,r10 + vext.8 q3,q3,q3,#12 + mov r7,r7,ror#25 + vext.8 q7,q7,q7,#12 + eor r6,r6,r8,ror#25 + vext.8 q11,q11,q11,#12 + eor r7,r7,r9,ror#25 + vadd.i32 q0,q0,q1 + add r0,r0,r5 + vadd.i32 q4,q4,q5 + mov r10,r10,ror#16 + vadd.i32 q8,q8,q9 + add r1,r1,r6 + veor q3,q3,q0 + mov r12,r12,ror#16 + veor q7,q7,q4 + eor r10,r10,r0,ror#16 + veor q11,q11,q8 + eor r12,r12,r1,ror#16 + vrev32.16 q3,q3 + add r8,r8,r10 + vrev32.16 q7,q7 + mov r5,r5,ror#20 + vrev32.16 q11,q11 + add r9,r9,r12 + vadd.i32 q2,q2,q3 + mov r6,r6,ror#20 + vadd.i32 q6,q6,q7 + eor r5,r5,r8,ror#20 + vadd.i32 q10,q10,q11 + eor r6,r6,r9,ror#20 + veor q12,q1,q2 + add r0,r0,r5 + veor q13,q5,q6 + mov r10,r10,ror#24 + veor q14,q9,q10 + add r1,r1,r6 + vshr.u32 q1,q12,#20 + mov r12,r12,ror#24 + vshr.u32 q5,q13,#20 + eor r10,r10,r0,ror#24 + vshr.u32 q9,q14,#20 + eor r12,r12,r1,ror#24 + vsli.32 q1,q12,#12 +
add r8,r8,r10 + vsli.32 q5,q13,#12 + mov r5,r5,ror#25 + vsli.32 q9,q14,#12 + str r10,[sp,#4*(16+15)] + vadd.i32 q0,q0,q1 + ldr r10,[sp,#4*(16+13)] + vadd.i32 q4,q4,q5 + add r9,r9,r12 + vadd.i32 q8,q8,q9 + mov r6,r6,ror#25 + veor q12,q3,q0 + eor r5,r5,r8,ror#25 + veor q13,q7,q4 + eor r6,r6,r9,ror#25 + veor q14,q11,q8 + str r8,[sp,#4*(16+10)] + vshr.u32 q3,q12,#24 + ldr r8,[sp,#4*(16+8)] + vshr.u32 q7,q13,#24 + add r2,r2,r7 + vshr.u32 q11,q14,#24 + mov r10,r10,ror#16 + vsli.32 q3,q12,#8 + str r9,[sp,#4*(16+11)] + vsli.32 q7,q13,#8 + ldr r9,[sp,#4*(16+9)] + vsli.32 q11,q14,#8 + add r3,r3,r4 + vadd.i32 q2,q2,q3 + mov r14,r14,ror#16 + vadd.i32 q6,q6,q7 + eor r10,r10,r2,ror#16 + vadd.i32 q10,q10,q11 + eor r14,r14,r3,ror#16 + veor q12,q1,q2 + add r8,r8,r10 + veor q13,q5,q6 + mov r7,r7,ror#20 + veor q14,q9,q10 + add r9,r9,r14 + vshr.u32 q1,q12,#25 + mov r4,r4,ror#20 + vshr.u32 q5,q13,#25 + eor r7,r7,r8,ror#20 + vshr.u32 q9,q14,#25 + eor r4,r4,r9,ror#20 + vsli.32 q1,q12,#7 + add r2,r2,r7 + vsli.32 q5,q13,#7 + mov r10,r10,ror#24 + vsli.32 q9,q14,#7 + add r3,r3,r4 + vext.8 q2,q2,q2,#8 + mov r14,r14,ror#24 + vext.8 q6,q6,q6,#8 + eor r10,r10,r2,ror#24 + vext.8 q10,q10,q10,#8 + eor r14,r14,r3,ror#24 + vext.8 q1,q1,q1,#12 + add r8,r8,r10 + vext.8 q5,q5,q5,#12 + mov r7,r7,ror#25 + vext.8 q9,q9,q9,#12 + add r9,r9,r14 + vext.8 q3,q3,q3,#4 + mov r4,r4,ror#25 + vext.8 q7,q7,q7,#4 + eor r7,r7,r8,ror#25 + vext.8 q11,q11,q11,#4 + eor r4,r4,r9,ror#25 + bne .Loop_neon + + add r11,sp,#32 + vld1.32 {q12-q13},[sp] @ load key material + vld1.32 {q14-q15},[r11] + + ldr r11,[sp,#4*(32+2)] @ load len + + str r8, [sp,#4*(16+8)] @ modulo-scheduled store + str r9, [sp,#4*(16+9)] + str r12,[sp,#4*(16+12)] + str r10, [sp,#4*(16+13)] + str r14,[sp,#4*(16+14)] + + @ at this point we have first half of 512-bit result in + @ rx and second half at sp+4*(16+8) + + ldr r12,[sp,#4*(32+1)] @ load inp + ldr r14,[sp,#4*(32+0)] @ load out + + vadd.i32 q0,q0,q12 @ accumulate key material + vadd.i32 q4,q4,q12 +
vadd.i32 q8,q8,q12 + vldr d24,[sp,#4*(16+0)] @ one + + vadd.i32 q1,q1,q13 + vadd.i32 q5,q5,q13 + vadd.i32 q9,q9,q13 + vldr d26,[sp,#4*(16+2)] @ two + + vadd.i32 q2,q2,q14 + vadd.i32 q6,q6,q14 + vadd.i32 q10,q10,q14 + vadd.i32 d14,d14,d24 @ counter+1 + vadd.i32 d22,d22,d26 @ counter+2 + + vadd.i32 q3,q3,q15 + vadd.i32 q7,q7,q15 + vadd.i32 q11,q11,q15 + + cmp r11,#64*4 + blo .Ltail_neon + + vld1.8 {q12-q13},[r12]! @ load input + mov r11,sp + vld1.8 {q14-q15},[r12]! + veor q0,q0,q12 @ xor with input + veor q1,q1,q13 + vld1.8 {q12-q13},[r12]! + veor q2,q2,q14 + veor q3,q3,q15 + vld1.8 {q14-q15},[r12]! + + veor q4,q4,q12 + vst1.8 {q0-q1},[r14]! @ store output + veor q5,q5,q13 + vld1.8 {q12-q13},[r12]! + veor q6,q6,q14 + vst1.8 {q2-q3},[r14]! + veor q7,q7,q15 + vld1.8 {q14-q15},[r12]! + + veor q8,q8,q12 + vld1.32 {q0-q1},[r11]! @ load for next iteration + veor d25,d25,d25 + vldr d24,[sp,#4*(16+4)] @ four + veor q9,q9,q13 + vld1.32 {q2-q3},[r11] + veor q10,q10,q14 + vst1.8 {q4-q5},[r14]! + veor q11,q11,q15 + vst1.8 {q6-q7},[r14]! + + vadd.i32 d6,d6,d24 @ next counter value + vldr d24,[sp,#4*(16+0)] @ one + + ldmia sp,{r8-r11} @ load key material + add r0,r0,r8 @ accumulate key material + ldr r8,[r12],#16 @ load input + vst1.8 {q8-q9},[r14]! + add r1,r1,r9 + ldr r9,[r12,#-12] + vst1.8 {q10-q11},[r14]!
+ add r2,r2,r10 + ldr r10,[r12,#-8] + add r3,r3,r11 + ldr r11,[r12,#-4] +#ifdef __ARMEB__ + rev r0,r0 + rev r1,r1 + rev r2,r2 + rev r3,r3 +#endif + eor r0,r0,r8 @ xor with input + add r8,sp,#4*(4) + eor r1,r1,r9 + str r0,[r14],#16 @ store output + eor r2,r2,r10 + str r1,[r14,#-12] + eor r3,r3,r11 + ldmia r8,{r8-r11} @ load key material + str r2,[r14,#-8] + str r3,[r14,#-4] + + add r4,r4,r8 @ accumulate key material + ldr r8,[r12],#16 @ load input + add r5,r5,r9 + ldr r9,[r12,#-12] + add r6,r6,r10 + ldr r10,[r12,#-8] + add r7,r7,r11 + ldr r11,[r12,#-4] +#ifdef __ARMEB__ + rev r4,r4 + rev r5,r5 + rev r6,r6 + rev r7,r7 +#endif + eor r4,r4,r8 + add r8,sp,#4*(8) + eor r5,r5,r9 + str r4,[r14],#16 @ store output + eor r6,r6,r10 + str r5,[r14,#-12] + eor r7,r7,r11 + ldmia r8,{r8-r11} @ load key material + str r6,[r14,#-8] + add r0,sp,#4*(16+8) + str r7,[r14,#-4] + + ldmia r0,{r0-r7} @ load second half + + add r0,r0,r8 @ accumulate key material + ldr r8,[r12],#16 @ load input + add r1,r1,r9 + ldr r9,[r12,#-12] +#ifdef __thumb2__ + it hi +#endif + strhi r10,[sp,#4*(16+10)] @ copy "rx" while at it + add r2,r2,r10 + ldr r10,[r12,#-8] +#ifdef __thumb2__ + it hi +#endif + strhi r11,[sp,#4*(16+11)] @ copy "rx" while at it + add r3,r3,r11 + ldr r11,[r12,#-4] +#ifdef __ARMEB__ + rev r0,r0 + rev r1,r1 + rev r2,r2 + rev r3,r3 +#endif + eor r0,r0,r8 + add r8,sp,#4*(12) + eor r1,r1,r9 + str r0,[r14],#16 @ store output + eor r2,r2,r10 + str r1,[r14,#-12] + eor r3,r3,r11 + ldmia r8,{r8-r11} @ load key material + str r2,[r14,#-8] + str r3,[r14,#-4] + + add r4,r4,r8 @ accumulate key material + add r8,r8,#4 @ next counter value + add r5,r5,r9 + str r8,[sp,#4*(12)] @ save next counter value + ldr r8,[r12],#16 @ load input + add r6,r6,r10 + add r4,r4,#3 @ counter+3 + ldr r9,[r12,#-12] + add r7,r7,r11 + ldr r10,[r12,#-8] + ldr r11,[r12,#-4] +#ifdef __ARMEB__ + rev r4,r4 + rev r5,r5 + rev r6,r6 + rev r7,r7 +#endif + eor r4,r4,r8 +#ifdef __thumb2__ + it hi +#endif + ldrhi r8,[sp,#4*(32+2)] @
re-load len + eor r5,r5,r9 + eor r6,r6,r10 + str r4,[r14],#16 @ store output + eor r7,r7,r11 + str r5,[r14,#-12] + sub r11,r8,#64*4 @ len-=64*4 + str r6,[r14,#-8] + str r7,[r14,#-4] + bhi .Loop_neon_outer + + b .Ldone_neon + +.align 4 +.Lbreak_neon: + @ harmonize NEON and integer-only stack frames: load data + @ from NEON frame, but save to integer-only one; distance + @ between the two is 4*(32+4+16-32)=4*(20). + + str r11, [sp,#4*(20+32+2)] @ save len + add r11,sp,#4*(32+4) + str r12, [sp,#4*(20+32+1)] @ save inp + str r14, [sp,#4*(20+32+0)] @ save out + + ldr r12,[sp,#4*(16+10)] + ldr r14,[sp,#4*(16+11)] + vldmia r11,{d8-d15} @ fulfill ABI requirement + str r12,[sp,#4*(20+16+10)] @ copy "rx" + str r14,[sp,#4*(20+16+11)] @ copy "rx" + + ldr r11, [sp,#4*(15)] + ldr r12,[sp,#4*(12)] @ modulo-scheduled load + ldr r10, [sp,#4*(13)] + ldr r14,[sp,#4*(14)] + str r11, [sp,#4*(20+16+15)] + add r11,sp,#4*(20) + vst1.32 {q0-q1},[r11]! @ copy key + add sp,sp,#4*(20) @ switch frame + vst1.32 {q2-q3},[r11] + mov r11,#10 + b .Loop @ go integer-only + +.align 4 +.Ltail_neon: /* reached when fewer than 64*4 bytes remain: XOR as many whole 64-byte NEON blocks as fit, stage the next keystream block on the stack, then finish byte-wise in .Loop_tail_neon */ + cmp r11,#64*3 + bhs .L192_or_more_neon + cmp r11,#64*2 + bhs .L128_or_more_neon + cmp r11,#64*1 + bhs .L64_or_more_neon + + add r8,sp,#4*(8) + vst1.8 {q0-q1},[sp] + add r10,sp,#4*(0) + vst1.8 {q2-q3},[r8] + b .Loop_tail_neon + +.align 4 +.L64_or_more_neon: + vld1.8 {q12-q13},[r12]! + vld1.8 {q14-q15},[r12]! + veor q0,q0,q12 + veor q1,q1,q13 + veor q2,q2,q14 + veor q3,q3,q15 + vst1.8 {q0-q1},[r14]! + vst1.8 {q2-q3},[r14]! + + beq .Ldone_neon + + add r8,sp,#4*(8) + vst1.8 {q4-q5},[sp] + add r10,sp,#4*(0) + vst1.8 {q6-q7},[r8] + sub r11,r11,#64*1 @ len-=64*1 + b .Loop_tail_neon + +.align 4 +.L128_or_more_neon: + vld1.8 {q12-q13},[r12]! + vld1.8 {q14-q15},[r12]! + veor q0,q0,q12 + veor q1,q1,q13 + vld1.8 {q12-q13},[r12]! + veor q2,q2,q14 + veor q3,q3,q15 + vld1.8 {q14-q15},[r12]! + + veor q4,q4,q12 + veor q5,q5,q13 + vst1.8 {q0-q1},[r14]! + veor q6,q6,q14 + vst1.8 {q2-q3},[r14]!
+ veor q7,q7,q15 + vst1.8 {q4-q5},[r14]! + vst1.8 {q6-q7},[r14]! + + beq .Ldone_neon + + add r8,sp,#4*(8) + vst1.8 {q8-q9},[sp] + add r10,sp,#4*(0) + vst1.8 {q10-q11},[r8] + sub r11,r11,#64*2 @ len-=64*2 + b .Loop_tail_neon + +.align 4 +.L192_or_more_neon: + vld1.8 {q12-q13},[r12]! + vld1.8 {q14-q15},[r12]! + veor q0,q0,q12 + veor q1,q1,q13 + vld1.8 {q12-q13},[r12]! + veor q2,q2,q14 + veor q3,q3,q15 + vld1.8 {q14-q15},[r12]! + + veor q4,q4,q12 + veor q5,q5,q13 + vld1.8 {q12-q13},[r12]! + veor q6,q6,q14 + vst1.8 {q0-q1},[r14]! + veor q7,q7,q15 + vld1.8 {q14-q15},[r12]! + + veor q8,q8,q12 + vst1.8 {q2-q3},[r14]! + veor q9,q9,q13 + vst1.8 {q4-q5},[r14]! + veor q10,q10,q14 + vst1.8 {q6-q7},[r14]! + veor q11,q11,q15 + vst1.8 {q8-q9},[r14]! + vst1.8 {q10-q11},[r14]! + + beq .Ldone_neon + + ldmia sp,{r8-r11} @ load key material + add r0,r0,r8 @ accumulate key material + add r8,sp,#4*(4) + add r1,r1,r9 + add r2,r2,r10 + add r3,r3,r11 + ldmia r8,{r8-r11} @ load key material + + add r4,r4,r8 @ accumulate key material + add r8,sp,#4*(8) + add r5,r5,r9 + add r6,r6,r10 + add r7,r7,r11 + ldmia r8,{r8-r11} @ load key material +#ifdef __ARMEB__ + rev r0,r0 + rev r1,r1 + rev r2,r2 + rev r3,r3 + rev r4,r4 + rev r5,r5 + rev r6,r6 + rev r7,r7 +#endif + stmia sp,{r0-r7} + add r0,sp,#4*(16+8) + + ldmia r0,{r0-r7} @ load second half + + add r0,r0,r8 @ accumulate key material + add r8,sp,#4*(12) + add r1,r1,r9 + add r2,r2,r10 + add r3,r3,r11 + ldmia r8,{r8-r11} @ load key material + + add r4,r4,r8 @ accumulate key material + add r8,sp,#4*(8) + add r5,r5,r9 + add r4,r4,#3 @ counter+3 + add r6,r6,r10 + add r7,r7,r11 + ldr r11,[sp,#4*(32+2)] @ re-load len +#ifdef __ARMEB__ + rev r0,r0 + rev r1,r1 + rev r2,r2 + rev r3,r3 + rev r4,r4 + rev r5,r5 + rev r6,r6 + rev r7,r7 +#endif + stmia r8,{r0-r7} + add r10,sp,#4*(0) + sub r11,r11,#64*3 @ len-=64*3 + +.Loop_tail_neon: + ldrb r8,[r10],#1 @ read buffer on stack + ldrb r9,[r12],#1 @ read input + subs r11,r11,#1 + eor r8,r8,r9 + strb r8,[r14],#1 @
store output + bne .Loop_tail_neon + +.Ldone_neon: + add sp,sp,#4*(32+4) /* step over the 32-word work area and the 4-word copy of r0-r3 */ + vldmia sp,{d8-d15} /* reload d8-d15 saved by the vstmdb in the prologue (no write-back to sp) */ + add sp,sp,#4*(16+3) /* step over the 16 words of d8-d15 plus the saved r0-r2 */ +.Lno_data_neon: + ldmia sp!,{r4-r11,pc} +.size chacha20_neon,.-chacha20_neon +#endif + +.align 5 +.Lsigma2: +.long 0x61707865,0x3320646e,0x79622d32,0x6b206574 @ endian-neutral +.Lone2: +.long 1,0,0,0 +.word -1 + +.align 5 +.globl chacha20_arm +.type chacha20_arm,%function +chacha20_arm: + ldr r12,[sp,#0] @ pull pointer to counter and nonce + stmdb sp!,{r0-r2,r4-r11,lr} + cmp r2,#0 @ len==0? +#ifdef __thumb2__ + itt eq +#endif + addeq sp,sp,#4*3 + beq .Lno_data_arm +.Lshort: + ldmia r12,{r4-r7} @ load counter and nonce + sub sp,sp,#4*(16) @ off-load area +#if __ARM_ARCH__ < 7 && !defined(__thumb2__) + sub r14,pc,#100 @ .Lsigma2 +#else + adr r14,.Lsigma2 @ .Lsigma2 +#endif + stmdb sp!,{r4-r7} @ copy counter and nonce + ldmia r3,{r4-r11} @ load key + ldmia r14,{r0-r3} @ load sigma + stmdb sp!,{r4-r11} @ copy key + stmdb sp!,{r0-r3} @ copy sigma + str r10,[sp,#4*(16+10)] @ off-load "rx" + str r11,[sp,#4*(16+11)] @ off-load "rx" + b .Loop_outer_enter + +.align 4 +.Loop_outer: + ldmia sp,{r0-r9} @ load key material + str r11,[sp,#4*(32+2)] @ save len + str r12, [sp,#4*(32+1)] @ save inp + str r14, [sp,#4*(32+0)] @ save out +.Loop_outer_enter: + ldr r11, [sp,#4*(15)] + ldr r12,[sp,#4*(12)] @ modulo-scheduled load + ldr r10, [sp,#4*(13)] + ldr r14,[sp,#4*(14)] + str r11, [sp,#4*(16+15)] + mov r11,#10 + b .Loop + +.align 4 +.Loop: + subs r11,r11,#1 + add r0,r0,r4 + mov r12,r12,ror#16 + add r1,r1,r5 + mov r10,r10,ror#16 + eor r12,r12,r0,ror#16 + eor r10,r10,r1,ror#16 + add r8,r8,r12 + mov r4,r4,ror#20 + add r9,r9,r10 + mov r5,r5,ror#20 + eor r4,r4,r8,ror#20 + eor r5,r5,r9,ror#20 + add r0,r0,r4 + mov r12,r12,ror#24 + add r1,r1,r5 + mov r10,r10,ror#24 + eor r12,r12,r0,ror#24 + eor r10,r10,r1,ror#24 + add r8,r8,r12 + mov r4,r4,ror#25 + add r9,r9,r10 + mov r5,r5,ror#25 + str r10,[sp,#4*(16+13)] + ldr r10,[sp,#4*(16+15)] + eor r4,r4,r8,ror#25 + eor r5,r5,r9,ror#25
+ str r8,[sp,#4*(16+8)] + ldr r8,[sp,#4*(16+10)] + add r2,r2,r6 + mov r14,r14,ror#16 + str r9,[sp,#4*(16+9)] + ldr r9,[sp,#4*(16+11)] + add r3,r3,r7 + mov r10,r10,ror#16 + eor r14,r14,r2,ror#16 + eor r10,r10,r3,ror#16 + add r8,r8,r14 + mov r6,r6,ror#20 + add r9,r9,r10 + mov r7,r7,ror#20 + eor r6,r6,r8,ror#20 + eor r7,r7,r9,ror#20 + add r2,r2,r6 + mov r14,r14,ror#24 + add r3,r3,r7 + mov r10,r10,ror#24 + eor r14,r14,r2,ror#24 + eor r10,r10,r3,ror#24 + add r8,r8,r14 + mov r6,r6,ror#25 + add r9,r9,r10 + mov r7,r7,ror#25 + eor r6,r6,r8,ror#25 + eor r7,r7,r9,ror#25 + add r0,r0,r5 + mov r10,r10,ror#16 + add r1,r1,r6 + mov r12,r12,ror#16 + eor r10,r10,r0,ror#16 + eor r12,r12,r1,ror#16 + add r8,r8,r10 + mov r5,r5,ror#20 + add r9,r9,r12 + mov r6,r6,ror#20 + eor r5,r5,r8,ror#20 + eor r6,r6,r9,ror#20 + add r0,r0,r5 + mov r10,r10,ror#24 + add r1,r1,r6 + mov r12,r12,ror#24 + eor r10,r10,r0,ror#24 + eor r12,r12,r1,ror#24 + add r8,r8,r10 + mov r5,r5,ror#25 + str r10,[sp,#4*(16+15)] + ldr r10,[sp,#4*(16+13)] + add r9,r9,r12 + mov r6,r6,ror#25 + eor r5,r5,r8,ror#25 + eor r6,r6,r9,ror#25 + str r8,[sp,#4*(16+10)] + ldr r8,[sp,#4*(16+8)] + add r2,r2,r7 + mov r10,r10,ror#16 + str r9,[sp,#4*(16+11)] + ldr r9,[sp,#4*(16+9)] + add r3,r3,r4 + mov r14,r14,ror#16 + eor r10,r10,r2,ror#16 + eor r14,r14,r3,ror#16 + add r8,r8,r10 + mov r7,r7,ror#20 + add r9,r9,r14 + mov r4,r4,ror#20 + eor r7,r7,r8,ror#20 + eor r4,r4,r9,ror#20 + add r2,r2,r7 + mov r10,r10,ror#24 + add r3,r3,r4 + mov r14,r14,ror#24 + eor r10,r10,r2,ror#24 + eor r14,r14,r3,ror#24 + add r8,r8,r10 + mov r7,r7,ror#25 + add r9,r9,r14 + mov r4,r4,ror#25 + eor r7,r7,r8,ror#25 + eor r4,r4,r9,ror#25 + bne .Loop + + ldr r11,[sp,#4*(32+2)] @ load len + + str r8, [sp,#4*(16+8)] @ modulo-scheduled store + str r9, [sp,#4*(16+9)] + str r12,[sp,#4*(16+12)] + str r10, [sp,#4*(16+13)] + str r14,[sp,#4*(16+14)] + + @ at this point we have first half of 512-bit result in + @ rx and second half at sp+4*(16+8) + + cmp r11,#64 @ done yet? 
+#ifdef __thumb2__ + itete lo +#endif + addlo r12,sp,#4*(0) @ shortcut or ... + ldrhs r12,[sp,#4*(32+1)] @ ... load inp + addlo r14,sp,#4*(0) @ shortcut or ... + ldrhs r14,[sp,#4*(32+0)] @ ... load out + + ldr r8,[sp,#4*(0)] @ load key material + ldr r9,[sp,#4*(1)] + +#if __ARM_ARCH__ >= 6 || !defined(__ARMEB__) +#if __ARM_ARCH__ < 7 + orr r10,r12,r14 + tst r10,#3 @ are input and output aligned? + ldr r10,[sp,#4*(2)] + bne .Lunaligned + cmp r11,#64 @ restore flags +#else + ldr r10,[sp,#4*(2)] +#endif + ldr r11,[sp,#4*(3)] + + add r0,r0,r8 @ accumulate key material + add r1,r1,r9 +#ifdef __thumb2__ + itt hs +#endif + ldrhs r8,[r12],#16 @ load input + ldrhs r9,[r12,#-12] + + add r2,r2,r10 + add r3,r3,r11 +#ifdef __thumb2__ + itt hs +#endif + ldrhs r10,[r12,#-8] + ldrhs r11,[r12,#-4] +#if __ARM_ARCH__ >= 6 && defined(__ARMEB__) + rev r0,r0 + rev r1,r1 + rev r2,r2 + rev r3,r3 +#endif +#ifdef __thumb2__ + itt hs +#endif + eorhs r0,r0,r8 @ xor with input + eorhs r1,r1,r9 + add r8,sp,#4*(4) + str r0,[r14],#16 @ store output +#ifdef __thumb2__ + itt hs +#endif + eorhs r2,r2,r10 + eorhs r3,r3,r11 + ldmia r8,{r8-r11} @ load key material + str r1,[r14,#-12] + str r2,[r14,#-8] + str r3,[r14,#-4] + + add r4,r4,r8 @ accumulate key material + add r5,r5,r9 +#ifdef __thumb2__ + itt hs +#endif + ldrhs r8,[r12],#16 @ load input + ldrhs r9,[r12,#-12] + add r6,r6,r10 + add r7,r7,r11 +#ifdef __thumb2__ + itt hs +#endif + ldrhs r10,[r12,#-8] + ldrhs r11,[r12,#-4] +#if __ARM_ARCH__ >= 6 && defined(__ARMEB__) + rev r4,r4 + rev r5,r5 + rev r6,r6 + rev r7,r7 +#endif +#ifdef __thumb2__ + itt hs +#endif + eorhs r4,r4,r8 + eorhs r5,r5,r9 + add r8,sp,#4*(8) + str r4,[r14],#16 @ store output +#ifdef __thumb2__ + itt hs +#endif + eorhs r6,r6,r10 + eorhs r7,r7,r11 + str r5,[r14,#-12] + ldmia r8,{r8-r11} @ load key material + str r6,[r14,#-8] + add r0,sp,#4*(16+8) + str r7,[r14,#-4] + + ldmia r0,{r0-r7} @ load second half + + add r0,r0,r8 @ accumulate key material + add r1,r1,r9 +#ifdef __thumb2__ + 
itt hs +#endif + ldrhs r8,[r12],#16 @ load input + ldrhs r9,[r12,#-12] +#ifdef __thumb2__ + itt hi +#endif + strhi r10,[sp,#4*(16+10)] @ copy "rx" while at it + strhi r11,[sp,#4*(16+11)] @ copy "rx" while at it + add r2,r2,r10 + add r3,r3,r11 +#ifdef __thumb2__ + itt hs +#endif + ldrhs r10,[r12,#-8] + ldrhs r11,[r12,#-4] +#if __ARM_ARCH__ >= 6 && defined(__ARMEB__) + rev r0,r0 + rev r1,r1 + rev r2,r2 + rev r3,r3 +#endif +#ifdef __thumb2__ + itt hs +#endif + eorhs r0,r0,r8 + eorhs r1,r1,r9 + add r8,sp,#4*(12) + str r0,[r14],#16 @ store output +#ifdef __thumb2__ + itt hs +#endif + eorhs r2,r2,r10 + eorhs r3,r3,r11 + str r1,[r14,#-12] + ldmia r8,{r8-r11} @ load key material + str r2,[r14,#-8] + str r3,[r14,#-4] + + add r4,r4,r8 @ accumulate key material + add r5,r5,r9 +#ifdef __thumb2__ + itt hi +#endif + addhi r8,r8,#1 @ next counter value + strhi r8,[sp,#4*(12)] @ save next counter value +#ifdef __thumb2__ + itt hs +#endif + ldrhs r8,[r12],#16 @ load input + ldrhs r9,[r12,#-12] + add r6,r6,r10 + add r7,r7,r11 +#ifdef __thumb2__ + itt hs +#endif + ldrhs r10,[r12,#-8] + ldrhs r11,[r12,#-4] +#if __ARM_ARCH__ >= 6 && defined(__ARMEB__) + rev r4,r4 + rev r5,r5 + rev r6,r6 + rev r7,r7 +#endif +#ifdef __thumb2__ + itt hs +#endif + eorhs r4,r4,r8 + eorhs r5,r5,r9 +#ifdef __thumb2__ + it ne +#endif + ldrne r8,[sp,#4*(32+2)] @ re-load len +#ifdef __thumb2__ + itt hs +#endif + eorhs r6,r6,r10 + eorhs r7,r7,r11 + str r4,[r14],#16 @ store output + str r5,[r14,#-12] +#ifdef __thumb2__ + it hs +#endif + subhs r11,r8,#64 @ len-=64 + str r6,[r14,#-8] + str r7,[r14,#-4] + bhi .Loop_outer + + beq .Ldone +#if __ARM_ARCH__ < 7 + b .Ltail + +.align 4 +.Lunaligned: @ unaligned endian-neutral path + cmp r11,#64 @ restore flags +#endif +#endif +#if __ARM_ARCH__ < 7 + ldr r11,[sp,#4*(3)] + add r0,r0,r8 @ accumulate key material + add r1,r1,r9 + add r2,r2,r10 +#ifdef __thumb2__ + itete lo +#endif + eorlo r8,r8,r8 @ zero or ... + ldrbhs r8,[r12],#16 @ ... 
load input + eorlo r9,r9,r9 + ldrbhs r9,[r12,#-12] + + add r3,r3,r11 +#ifdef __thumb2__ + itete lo +#endif + eorlo r10,r10,r10 + ldrbhs r10,[r12,#-8] + eorlo r11,r11,r11 + ldrbhs r11,[r12,#-4] + + eor r0,r8,r0 @ xor with input (or zero) + eor r1,r9,r1 +#ifdef __thumb2__ + itt hs +#endif + ldrbhs r8,[r12,#-15] @ load more input + ldrbhs r9,[r12,#-11] + eor r2,r10,r2 + strb r0,[r14],#16 @ store output + eor r3,r11,r3 +#ifdef __thumb2__ + itt hs +#endif + ldrbhs r10,[r12,#-7] + ldrbhs r11,[r12,#-3] + strb r1,[r14,#-12] + eor r0,r8,r0,lsr#8 + strb r2,[r14,#-8] + eor r1,r9,r1,lsr#8 +#ifdef __thumb2__ + itt hs +#endif + ldrbhs r8,[r12,#-14] @ load more input + ldrbhs r9,[r12,#-10] + strb r3,[r14,#-4] + eor r2,r10,r2,lsr#8 + strb r0,[r14,#-15] + eor r3,r11,r3,lsr#8 +#ifdef __thumb2__ + itt hs +#endif + ldrbhs r10,[r12,#-6] + ldrbhs r11,[r12,#-2] + strb r1,[r14,#-11] + eor r0,r8,r0,lsr#8 + strb r2,[r14,#-7] + eor r1,r9,r1,lsr#8 +#ifdef __thumb2__ + itt hs +#endif + ldrbhs r8,[r12,#-13] @ load more input + ldrbhs r9,[r12,#-9] + strb r3,[r14,#-3] + eor r2,r10,r2,lsr#8 + strb r0,[r14,#-14] + eor r3,r11,r3,lsr#8 +#ifdef __thumb2__ + itt hs +#endif + ldrbhs r10,[r12,#-5] + ldrbhs r11,[r12,#-1] + strb r1,[r14,#-10] + strb r2,[r14,#-6] + eor r0,r8,r0,lsr#8 + strb r3,[r14,#-2] + eor r1,r9,r1,lsr#8 + strb r0,[r14,#-13] + eor r2,r10,r2,lsr#8 + strb r1,[r14,#-9] + eor r3,r11,r3,lsr#8 + strb r2,[r14,#-5] + strb r3,[r14,#-1] + add r8,sp,#4*(4+0) + ldmia r8,{r8-r11} @ load key material + add r0,sp,#4*(16+8) + add r4,r4,r8 @ accumulate key material + add r5,r5,r9 + add r6,r6,r10 +#ifdef __thumb2__ + itete lo +#endif + eorlo r8,r8,r8 @ zero or ... + ldrbhs r8,[r12],#16 @ ... 
load input + eorlo r9,r9,r9 + ldrbhs r9,[r12,#-12] + + add r7,r7,r11 +#ifdef __thumb2__ + itete lo +#endif + eorlo r10,r10,r10 + ldrbhs r10,[r12,#-8] + eorlo r11,r11,r11 + ldrbhs r11,[r12,#-4] + + eor r4,r8,r4 @ xor with input (or zero) + eor r5,r9,r5 +#ifdef __thumb2__ + itt hs +#endif + ldrbhs r8,[r12,#-15] @ load more input + ldrbhs r9,[r12,#-11] + eor r6,r10,r6 + strb r4,[r14],#16 @ store output + eor r7,r11,r7 +#ifdef __thumb2__ + itt hs +#endif + ldrbhs r10,[r12,#-7] + ldrbhs r11,[r12,#-3] + strb r5,[r14,#-12] + eor r4,r8,r4,lsr#8 + strb r6,[r14,#-8] + eor r5,r9,r5,lsr#8 +#ifdef __thumb2__ + itt hs +#endif + ldrbhs r8,[r12,#-14] @ load more input + ldrbhs r9,[r12,#-10] + strb r7,[r14,#-4] + eor r6,r10,r6,lsr#8 + strb r4,[r14,#-15] + eor r7,r11,r7,lsr#8 +#ifdef __thumb2__ + itt hs +#endif + ldrbhs r10,[r12,#-6] + ldrbhs r11,[r12,#-2] + strb r5,[r14,#-11] + eor r4,r8,r4,lsr#8 + strb r6,[r14,#-7] + eor r5,r9,r5,lsr#8 +#ifdef __thumb2__ + itt hs +#endif + ldrbhs r8,[r12,#-13] @ load more input + ldrbhs r9,[r12,#-9] + strb r7,[r14,#-3] + eor r6,r10,r6,lsr#8 + strb r4,[r14,#-14] + eor r7,r11,r7,lsr#8 +#ifdef __thumb2__ + itt hs +#endif + ldrbhs r10,[r12,#-5] + ldrbhs r11,[r12,#-1] + strb r5,[r14,#-10] + strb r6,[r14,#-6] + eor r4,r8,r4,lsr#8 + strb r7,[r14,#-2] + eor r5,r9,r5,lsr#8 + strb r4,[r14,#-13] + eor r6,r10,r6,lsr#8 + strb r5,[r14,#-9] + eor r7,r11,r7,lsr#8 + strb r6,[r14,#-5] + strb r7,[r14,#-1] + add r8,sp,#4*(4+4) + ldmia r8,{r8-r11} @ load key material + ldmia r0,{r0-r7} @ load second half +#ifdef __thumb2__ + itt hi +#endif + strhi r10,[sp,#4*(16+10)] @ copy "rx" + strhi r11,[sp,#4*(16+11)] @ copy "rx" + add r0,r0,r8 @ accumulate key material + add r1,r1,r9 + add r2,r2,r10 +#ifdef __thumb2__ + itete lo +#endif + eorlo r8,r8,r8 @ zero or ... + ldrbhs r8,[r12],#16 @ ... 
load input + eorlo r9,r9,r9 + ldrbhs r9,[r12,#-12] + + add r3,r3,r11 +#ifdef __thumb2__ + itete lo +#endif + eorlo r10,r10,r10 + ldrbhs r10,[r12,#-8] + eorlo r11,r11,r11 + ldrbhs r11,[r12,#-4] + + eor r0,r8,r0 @ xor with input (or zero) + eor r1,r9,r1 +#ifdef __thumb2__ + itt hs +#endif + ldrbhs r8,[r12,#-15] @ load more input + ldrbhs r9,[r12,#-11] + eor r2,r10,r2 + strb r0,[r14],#16 @ store output + eor r3,r11,r3 +#ifdef __thumb2__ + itt hs +#endif + ldrbhs r10,[r12,#-7] + ldrbhs r11,[r12,#-3] + strb r1,[r14,#-12] + eor r0,r8,r0,lsr#8 + strb r2,[r14,#-8] + eor r1,r9,r1,lsr#8 +#ifdef __thumb2__ + itt hs +#endif + ldrbhs r8,[r12,#-14] @ load more input + ldrbhs r9,[r12,#-10] + strb r3,[r14,#-4] + eor r2,r10,r2,lsr#8 + strb r0,[r14,#-15] + eor r3,r11,r3,lsr#8 +#ifdef __thumb2__ + itt hs +#endif + ldrbhs r10,[r12,#-6] + ldrbhs r11,[r12,#-2] + strb r1,[r14,#-11] + eor r0,r8,r0,lsr#8 + strb r2,[r14,#-7] + eor r1,r9,r1,lsr#8 +#ifdef __thumb2__ + itt hs +#endif + ldrbhs r8,[r12,#-13] @ load more input + ldrbhs r9,[r12,#-9] + strb r3,[r14,#-3] + eor r2,r10,r2,lsr#8 + strb r0,[r14,#-14] + eor r3,r11,r3,lsr#8 +#ifdef __thumb2__ + itt hs +#endif + ldrbhs r10,[r12,#-5] + ldrbhs r11,[r12,#-1] + strb r1,[r14,#-10] + strb r2,[r14,#-6] + eor r0,r8,r0,lsr#8 + strb r3,[r14,#-2] + eor r1,r9,r1,lsr#8 + strb r0,[r14,#-13] + eor r2,r10,r2,lsr#8 + strb r1,[r14,#-9] + eor r3,r11,r3,lsr#8 + strb r2,[r14,#-5] + strb r3,[r14,#-1] + add r8,sp,#4*(4+8) + ldmia r8,{r8-r11} @ load key material + add r4,r4,r8 @ accumulate key material +#ifdef __thumb2__ + itt hi +#endif + addhi r8,r8,#1 @ next counter value + strhi r8,[sp,#4*(12)] @ save next counter value + add r5,r5,r9 + add r6,r6,r10 +#ifdef __thumb2__ + itete lo +#endif + eorlo r8,r8,r8 @ zero or ... + ldrbhs r8,[r12],#16 @ ... 
load input + eorlo r9,r9,r9 + ldrbhs r9,[r12,#-12] + + add r7,r7,r11 +#ifdef __thumb2__ + itete lo +#endif + eorlo r10,r10,r10 + ldrbhs r10,[r12,#-8] + eorlo r11,r11,r11 + ldrbhs r11,[r12,#-4] + + eor r4,r8,r4 @ xor with input (or zero) + eor r5,r9,r5 +#ifdef __thumb2__ + itt hs +#endif + ldrbhs r8,[r12,#-15] @ load more input + ldrbhs r9,[r12,#-11] + eor r6,r10,r6 + strb r4,[r14],#16 @ store output + eor r7,r11,r7 +#ifdef __thumb2__ + itt hs +#endif + ldrbhs r10,[r12,#-7] + ldrbhs r11,[r12,#-3] + strb r5,[r14,#-12] + eor r4,r8,r4,lsr#8 + strb r6,[r14,#-8] + eor r5,r9,r5,lsr#8 +#ifdef __thumb2__ + itt hs +#endif + ldrbhs r8,[r12,#-14] @ load more input + ldrbhs r9,[r12,#-10] + strb r7,[r14,#-4] + eor r6,r10,r6,lsr#8 + strb r4,[r14,#-15] + eor r7,r11,r7,lsr#8 +#ifdef __thumb2__ + itt hs +#endif + ldrbhs r10,[r12,#-6] + ldrbhs r11,[r12,#-2] + strb r5,[r14,#-11] + eor r4,r8,r4,lsr#8 + strb r6,[r14,#-7] + eor r5,r9,r5,lsr#8 +#ifdef __thumb2__ + itt hs +#endif + ldrbhs r8,[r12,#-13] @ load more input + ldrbhs r9,[r12,#-9] + strb r7,[r14,#-3] + eor r6,r10,r6,lsr#8 + strb r4,[r14,#-14] + eor r7,r11,r7,lsr#8 +#ifdef __thumb2__ + itt hs +#endif + ldrbhs r10,[r12,#-5] + ldrbhs r11,[r12,#-1] + strb r5,[r14,#-10] + strb r6,[r14,#-6] + eor r4,r8,r4,lsr#8 + strb r7,[r14,#-2] + eor r5,r9,r5,lsr#8 + strb r4,[r14,#-13] + eor r6,r10,r6,lsr#8 + strb r5,[r14,#-9] + eor r7,r11,r7,lsr#8 + strb r6,[r14,#-5] + strb r7,[r14,#-1] +#ifdef __thumb2__ + it ne +#endif + ldrne r8,[sp,#4*(32+2)] @ re-load len +#ifdef __thumb2__ + it hs +#endif + subhs r11,r8,#64 @ len-=64 + bhi .Loop_outer + + beq .Ldone +#endif + +.Ltail: + ldr r12,[sp,#4*(32+1)] @ load inp + add r9,sp,#4*(0) + ldr r14,[sp,#4*(32+0)] @ load out + +.Loop_tail: + ldrb r10,[r9],#1 @ read buffer on stack + ldrb r11,[r12],#1 @ read input + subs r8,r8,#1 + eor r11,r11,r10 + strb r11,[r14],#1 @ store output + bne .Loop_tail + +.Ldone: + add sp,sp,#4*(32+3) +.Lno_data_arm: + ldmia sp!,{r4-r11,pc} +.size chacha20_arm,.-chacha20_arm diff 
--git a/crypto/chacha20/chacha20-arm64.pl b/crypto/chacha20/chacha20-arm64.pl new file mode 100644 index 0000000..4a838bc --- /dev/null +++ b/crypto/chacha20/chacha20-arm64.pl @@ -0,0 +1,1136 @@ +#! /usr/bin/env perl +# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# June 2015 +# +# ChaCha20 for ARMv8. +# +# Performance in cycles per byte out of large buffer. 
+# +# IALU/gcc-4.9 3xNEON+1xIALU 6xNEON+2xIALU +# +# Apple A7 5.50/+49% 3.33 1.70 +# Cortex-A53 8.40/+80% 4.72 4.72(*) +# Cortex-A57 8.06/+43% 4.90 4.43(**) +# Denver 4.50/+82% 2.63 2.67(*) +# X-Gene 9.50/+46% 8.82 8.89(*) +# Mongoose 8.00/+44% 3.64 3.25 +# Kryo 8.17/+50% 4.83 4.65 +# +# (*) it's expected that doubling interleave factor doesn't help +# all processors, only those with higher NEON latency and +# higher instruction issue rate; +# (**) expected improvement was actually higher; + +$flavour=shift; +$output=shift; + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or +die "can't locate arm-xlate.pl"; + +open OUT,"| \"$^X\" $xlate $flavour $output"; +*STDOUT=*OUT; + +sub AUTOLOAD() # thunk [simplified] x86-style perlasm +{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./; + my $arg = pop; + $arg = "#$arg" if ($arg*1 eq $arg); + $code .= "\t$opcode\t".join(',',@_,$arg)."\n"; +} + +my ($out,$inp,$len,$key,$ctr) = map("x$_",(0..4)); + +my @x=map("x$_",(5..17,19..21)); +my @d=map("x$_",(22..28,30)); + +sub ROUND { +my ($a0,$b0,$c0,$d0)=@_; +my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); +my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); +my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); + + ( + "&add_32 (@x[$a0],@x[$a0],@x[$b0])", + "&add_32 (@x[$a1],@x[$a1],@x[$b1])", + "&add_32 (@x[$a2],@x[$a2],@x[$b2])", + "&add_32 (@x[$a3],@x[$a3],@x[$b3])", + "&eor_32 (@x[$d0],@x[$d0],@x[$a0])", + "&eor_32 (@x[$d1],@x[$d1],@x[$a1])", + "&eor_32 (@x[$d2],@x[$d2],@x[$a2])", + "&eor_32 (@x[$d3],@x[$d3],@x[$a3])", + "&ror_32 (@x[$d0],@x[$d0],16)", + "&ror_32 (@x[$d1],@x[$d1],16)", + "&ror_32 (@x[$d2],@x[$d2],16)", + "&ror_32 (@x[$d3],@x[$d3],16)", + + "&add_32 (@x[$c0],@x[$c0],@x[$d0])", + "&add_32 (@x[$c1],@x[$c1],@x[$d1])", + "&add_32 (@x[$c2],@x[$c2],@x[$d2])", + "&add_32 (@x[$c3],@x[$c3],@x[$d3])", + "&eor_32 
(@x[$b0],@x[$b0],@x[$c0])", + "&eor_32 (@x[$b1],@x[$b1],@x[$c1])", + "&eor_32 (@x[$b2],@x[$b2],@x[$c2])", + "&eor_32 (@x[$b3],@x[$b3],@x[$c3])", + "&ror_32 (@x[$b0],@x[$b0],20)", + "&ror_32 (@x[$b1],@x[$b1],20)", + "&ror_32 (@x[$b2],@x[$b2],20)", + "&ror_32 (@x[$b3],@x[$b3],20)", + + "&add_32 (@x[$a0],@x[$a0],@x[$b0])", + "&add_32 (@x[$a1],@x[$a1],@x[$b1])", + "&add_32 (@x[$a2],@x[$a2],@x[$b2])", + "&add_32 (@x[$a3],@x[$a3],@x[$b3])", + "&eor_32 (@x[$d0],@x[$d0],@x[$a0])", + "&eor_32 (@x[$d1],@x[$d1],@x[$a1])", + "&eor_32 (@x[$d2],@x[$d2],@x[$a2])", + "&eor_32 (@x[$d3],@x[$d3],@x[$a3])", + "&ror_32 (@x[$d0],@x[$d0],24)", + "&ror_32 (@x[$d1],@x[$d1],24)", + "&ror_32 (@x[$d2],@x[$d2],24)", + "&ror_32 (@x[$d3],@x[$d3],24)", + + "&add_32 (@x[$c0],@x[$c0],@x[$d0])", + "&add_32 (@x[$c1],@x[$c1],@x[$d1])", + "&add_32 (@x[$c2],@x[$c2],@x[$d2])", + "&add_32 (@x[$c3],@x[$c3],@x[$d3])", + "&eor_32 (@x[$b0],@x[$b0],@x[$c0])", + "&eor_32 (@x[$b1],@x[$b1],@x[$c1])", + "&eor_32 (@x[$b2],@x[$b2],@x[$c2])", + "&eor_32 (@x[$b3],@x[$b3],@x[$c3])", + "&ror_32 (@x[$b0],@x[$b0],25)", + "&ror_32 (@x[$b1],@x[$b1],25)", + "&ror_32 (@x[$b2],@x[$b2],25)", + "&ror_32 (@x[$b3],@x[$b3],25)" + ); +} + +$code.=<<___; +#include "arm_arch.h" + +.text + +.extern OPENSSL_armcap_P + +.align 5 +.Lsigma: +.quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral +.Lone: +.long 1,0,0,0 +.LOPENSSL_armcap_P: +#ifdef __ILP32__ +.long OPENSSL_armcap_P-. +#else +.quad OPENSSL_armcap_P-. +#endif +.asciz "ChaCha20 for ARMv8, CRYPTOGAMS by " + +.globl ChaCha20_ctr32 +.type ChaCha20_ctr32,%function +.align 5 +ChaCha20_ctr32: + cbz $len,.Labort + adr @x[0],.LOPENSSL_armcap_P + cmp $len,#192 + b.lo .Lshort +#ifdef __ILP32__ + ldrsw @x[1],[@x[0]] +#else + ldr @x[1],[@x[0]] +#endif + ldr w17,[@x[1],@x[0]] + tst w17,#ARMV7_NEON + b.ne ChaCha20_neon + +.Lshort: + stp x29,x30,[sp,#-96]! 
+ add x29,sp,#0 + + adr @x[0],.Lsigma + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#64 + + ldp @d[0],@d[1],[@x[0]] // load sigma + ldp @d[2],@d[3],[$key] // load key + ldp @d[4],@d[5],[$key,#16] + ldp @d[6],@d[7],[$ctr] // load counter +#ifdef __ARMEB__ + ror @d[2],@d[2],#32 + ror @d[3],@d[3],#32 + ror @d[4],@d[4],#32 + ror @d[5],@d[5],#32 + ror @d[6],@d[6],#32 + ror @d[7],@d[7],#32 +#endif + +.Loop_outer: + mov.32 @x[0],@d[0] // unpack key block + lsr @x[1],@d[0],#32 + mov.32 @x[2],@d[1] + lsr @x[3],@d[1],#32 + mov.32 @x[4],@d[2] + lsr @x[5],@d[2],#32 + mov.32 @x[6],@d[3] + lsr @x[7],@d[3],#32 + mov.32 @x[8],@d[4] + lsr @x[9],@d[4],#32 + mov.32 @x[10],@d[5] + lsr @x[11],@d[5],#32 + mov.32 @x[12],@d[6] + lsr @x[13],@d[6],#32 + mov.32 @x[14],@d[7] + lsr @x[15],@d[7],#32 + + mov $ctr,#10 + subs $len,$len,#64 +.Loop: + sub $ctr,$ctr,#1 +___ + foreach (&ROUND(0, 4, 8,12)) { eval; } + foreach (&ROUND(0, 5,10,15)) { eval; } +$code.=<<___; + cbnz $ctr,.Loop + + add.32 @x[0],@x[0],@d[0] // accumulate key block + add @x[1],@x[1],@d[0],lsr#32 + add.32 @x[2],@x[2],@d[1] + add @x[3],@x[3],@d[1],lsr#32 + add.32 @x[4],@x[4],@d[2] + add @x[5],@x[5],@d[2],lsr#32 + add.32 @x[6],@x[6],@d[3] + add @x[7],@x[7],@d[3],lsr#32 + add.32 @x[8],@x[8],@d[4] + add @x[9],@x[9],@d[4],lsr#32 + add.32 @x[10],@x[10],@d[5] + add @x[11],@x[11],@d[5],lsr#32 + add.32 @x[12],@x[12],@d[6] + add @x[13],@x[13],@d[6],lsr#32 + add.32 @x[14],@x[14],@d[7] + add @x[15],@x[15],@d[7],lsr#32 + + b.lo .Ltail + + add @x[0],@x[0],@x[1],lsl#32 // pack + add @x[2],@x[2],@x[3],lsl#32 + ldp @x[1],@x[3],[$inp,#0] // load input + add @x[4],@x[4],@x[5],lsl#32 + add @x[6],@x[6],@x[7],lsl#32 + ldp @x[5],@x[7],[$inp,#16] + add @x[8],@x[8],@x[9],lsl#32 + add @x[10],@x[10],@x[11],lsl#32 + ldp @x[9],@x[11],[$inp,#32] + add @x[12],@x[12],@x[13],lsl#32 + add @x[14],@x[14],@x[15],lsl#32 + ldp @x[13],@x[15],[$inp,#48] + add $inp,$inp,#64 +#ifdef 
__ARMEB__ + rev @x[0],@x[0] + rev @x[2],@x[2] + rev @x[4],@x[4] + rev @x[6],@x[6] + rev @x[8],@x[8] + rev @x[10],@x[10] + rev @x[12],@x[12] + rev @x[14],@x[14] +#endif + eor @x[0],@x[0],@x[1] + eor @x[2],@x[2],@x[3] + eor @x[4],@x[4],@x[5] + eor @x[6],@x[6],@x[7] + eor @x[8],@x[8],@x[9] + eor @x[10],@x[10],@x[11] + eor @x[12],@x[12],@x[13] + eor @x[14],@x[14],@x[15] + + stp @x[0],@x[2],[$out,#0] // store output + add @d[6],@d[6],#1 // increment counter + stp @x[4],@x[6],[$out,#16] + stp @x[8],@x[10],[$out,#32] + stp @x[12],@x[14],[$out,#48] + add $out,$out,#64 + + b.hi .Loop_outer + + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 +.Labort: + ret + +.align 4 +.Ltail: + add $len,$len,#64 +.Less_than_64: + sub $out,$out,#1 + add $inp,$inp,$len + add $out,$out,$len + add $ctr,sp,$len + neg $len,$len + + add @x[0],@x[0],@x[1],lsl#32 // pack + add @x[2],@x[2],@x[3],lsl#32 + add @x[4],@x[4],@x[5],lsl#32 + add @x[6],@x[6],@x[7],lsl#32 + add @x[8],@x[8],@x[9],lsl#32 + add @x[10],@x[10],@x[11],lsl#32 + add @x[12],@x[12],@x[13],lsl#32 + add @x[14],@x[14],@x[15],lsl#32 +#ifdef __ARMEB__ + rev @x[0],@x[0] + rev @x[2],@x[2] + rev @x[4],@x[4] + rev @x[6],@x[6] + rev @x[8],@x[8] + rev @x[10],@x[10] + rev @x[12],@x[12] + rev @x[14],@x[14] +#endif + stp @x[0],@x[2],[sp,#0] + stp @x[4],@x[6],[sp,#16] + stp @x[8],@x[10],[sp,#32] + stp @x[12],@x[14],[sp,#48] + +.Loop_tail: + ldrb w10,[$inp,$len] + ldrb w11,[$ctr,$len] + add $len,$len,#1 + eor w10,w10,w11 + strb w10,[$out,$len] + cbnz $len,.Loop_tail + + stp xzr,xzr,[sp,#0] + stp xzr,xzr,[sp,#16] + stp xzr,xzr,[sp,#32] + stp xzr,xzr,[sp,#48] + + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + ret +.size ChaCha20_ctr32,.-ChaCha20_ctr32 +___ + +{{{ +my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2,$T0,$T1,$T2,$T3) = + 
map("v$_.4s",(0..7,16..23)); +my (@K)=map("v$_.4s",(24..30)); +my $ONE="v31.4s"; + +sub NEONROUND { +my $odd = pop; +my ($a,$b,$c,$d,$t)=@_; + + ( + "&add ('$a','$a','$b')", + "&eor ('$d','$d','$a')", + "&rev32_16 ('$d','$d')", # vrot ($d,16) + + "&add ('$c','$c','$d')", + "&eor ('$t','$b','$c')", + "&ushr ('$b','$t',20)", + "&sli ('$b','$t',12)", + + "&add ('$a','$a','$b')", + "&eor ('$t','$d','$a')", + "&ushr ('$d','$t',24)", + "&sli ('$d','$t',8)", + + "&add ('$c','$c','$d')", + "&eor ('$t','$b','$c')", + "&ushr ('$b','$t',25)", + "&sli ('$b','$t',7)", + + "&ext ('$c','$c','$c',8)", + "&ext ('$d','$d','$d',$odd?4:12)", + "&ext ('$b','$b','$b',$odd?12:4)" + ); +} + +$code.=<<___; + +.type ChaCha20_neon,%function +.align 5 +ChaCha20_neon: + stp x29,x30,[sp,#-96]! + add x29,sp,#0 + + adr @x[0],.Lsigma + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + cmp $len,#512 + b.hs .L512_or_more_neon + + sub sp,sp,#64 + + ldp @d[0],@d[1],[@x[0]] // load sigma + ld1 {@K[0]},[@x[0]],#16 + ldp @d[2],@d[3],[$key] // load key + ldp @d[4],@d[5],[$key,#16] + ld1 {@K[1],@K[2]},[$key] + ldp @d[6],@d[7],[$ctr] // load counter + ld1 {@K[3]},[$ctr] + ld1 {$ONE},[@x[0]] +#ifdef __ARMEB__ + rev64 @K[0],@K[0] + ror @d[2],@d[2],#32 + ror @d[3],@d[3],#32 + ror @d[4],@d[4],#32 + ror @d[5],@d[5],#32 + ror @d[6],@d[6],#32 + ror @d[7],@d[7],#32 +#endif + add @K[3],@K[3],$ONE // += 1 + add @K[4],@K[3],$ONE + add @K[5],@K[4],$ONE + shl $ONE,$ONE,#2 // 1 -> 4 + +.Loop_outer_neon: + mov.32 @x[0],@d[0] // unpack key block + lsr @x[1],@d[0],#32 + mov $A0,@K[0] + mov.32 @x[2],@d[1] + lsr @x[3],@d[1],#32 + mov $A1,@K[0] + mov.32 @x[4],@d[2] + lsr @x[5],@d[2],#32 + mov $A2,@K[0] + mov.32 @x[6],@d[3] + mov $B0,@K[1] + lsr @x[7],@d[3],#32 + mov $B1,@K[1] + mov.32 @x[8],@d[4] + mov $B2,@K[1] + lsr @x[9],@d[4],#32 + mov $D0,@K[3] + mov.32 @x[10],@d[5] + mov $D1,@K[4] + lsr @x[11],@d[5],#32 + mov $D2,@K[5] + mov.32 @x[12],@d[6] + mov $C0,@K[2] 
+ lsr @x[13],@d[6],#32 + mov $C1,@K[2] + mov.32 @x[14],@d[7] + mov $C2,@K[2] + lsr @x[15],@d[7],#32 + + mov $ctr,#10 + subs $len,$len,#256 +.Loop_neon: + sub $ctr,$ctr,#1 +___ + my @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0); + my @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0); + my @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0); + my @thread3=&ROUND(0,4,8,12); + + foreach (@thread0) { + eval; eval(shift(@thread3)); + eval(shift(@thread1)); eval(shift(@thread3)); + eval(shift(@thread2)); eval(shift(@thread3)); + } + + @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1); + @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1); + @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1); + @thread3=&ROUND(0,5,10,15); + + foreach (@thread0) { + eval; eval(shift(@thread3)); + eval(shift(@thread1)); eval(shift(@thread3)); + eval(shift(@thread2)); eval(shift(@thread3)); + } +$code.=<<___; + cbnz $ctr,.Loop_neon + + add.32 @x[0],@x[0],@d[0] // accumulate key block + add $A0,$A0,@K[0] + add @x[1],@x[1],@d[0],lsr#32 + add $A1,$A1,@K[0] + add.32 @x[2],@x[2],@d[1] + add $A2,$A2,@K[0] + add @x[3],@x[3],@d[1],lsr#32 + add $C0,$C0,@K[2] + add.32 @x[4],@x[4],@d[2] + add $C1,$C1,@K[2] + add @x[5],@x[5],@d[2],lsr#32 + add $C2,$C2,@K[2] + add.32 @x[6],@x[6],@d[3] + add $D0,$D0,@K[3] + add @x[7],@x[7],@d[3],lsr#32 + add.32 @x[8],@x[8],@d[4] + add $D1,$D1,@K[4] + add @x[9],@x[9],@d[4],lsr#32 + add.32 @x[10],@x[10],@d[5] + add $D2,$D2,@K[5] + add @x[11],@x[11],@d[5],lsr#32 + add.32 @x[12],@x[12],@d[6] + add $B0,$B0,@K[1] + add @x[13],@x[13],@d[6],lsr#32 + add.32 @x[14],@x[14],@d[7] + add $B1,$B1,@K[1] + add @x[15],@x[15],@d[7],lsr#32 + add $B2,$B2,@K[1] + + b.lo .Ltail_neon + + add @x[0],@x[0],@x[1],lsl#32 // pack + add @x[2],@x[2],@x[3],lsl#32 + ldp @x[1],@x[3],[$inp,#0] // load input + add @x[4],@x[4],@x[5],lsl#32 + add @x[6],@x[6],@x[7],lsl#32 + ldp @x[5],@x[7],[$inp,#16] + add @x[8],@x[8],@x[9],lsl#32 + add @x[10],@x[10],@x[11],lsl#32 + ldp @x[9],@x[11],[$inp,#32] + add @x[12],@x[12],@x[13],lsl#32 + add 
@x[14],@x[14],@x[15],lsl#32 + ldp @x[13],@x[15],[$inp,#48] + add $inp,$inp,#64 +#ifdef __ARMEB__ + rev @x[0],@x[0] + rev @x[2],@x[2] + rev @x[4],@x[4] + rev @x[6],@x[6] + rev @x[8],@x[8] + rev @x[10],@x[10] + rev @x[12],@x[12] + rev @x[14],@x[14] +#endif + ld1.8 {$T0-$T3},[$inp],#64 + eor @x[0],@x[0],@x[1] + eor @x[2],@x[2],@x[3] + eor @x[4],@x[4],@x[5] + eor @x[6],@x[6],@x[7] + eor @x[8],@x[8],@x[9] + eor $A0,$A0,$T0 + eor @x[10],@x[10],@x[11] + eor $B0,$B0,$T1 + eor @x[12],@x[12],@x[13] + eor $C0,$C0,$T2 + eor @x[14],@x[14],@x[15] + eor $D0,$D0,$T3 + ld1.8 {$T0-$T3},[$inp],#64 + + stp @x[0],@x[2],[$out,#0] // store output + add @d[6],@d[6],#4 // increment counter + stp @x[4],@x[6],[$out,#16] + add @K[3],@K[3],$ONE // += 4 + stp @x[8],@x[10],[$out,#32] + add @K[4],@K[4],$ONE + stp @x[12],@x[14],[$out,#48] + add @K[5],@K[5],$ONE + add $out,$out,#64 + + st1.8 {$A0-$D0},[$out],#64 + ld1.8 {$A0-$D0},[$inp],#64 + + eor $A1,$A1,$T0 + eor $B1,$B1,$T1 + eor $C1,$C1,$T2 + eor $D1,$D1,$T3 + st1.8 {$A1-$D1},[$out],#64 + + eor $A2,$A2,$A0 + eor $B2,$B2,$B0 + eor $C2,$C2,$C0 + eor $D2,$D2,$D0 + st1.8 {$A2-$D2},[$out],#64 + + b.hi .Loop_outer_neon + + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + ret + +.Ltail_neon: + add $len,$len,#256 + cmp $len,#64 + b.lo .Less_than_64 + + add @x[0],@x[0],@x[1],lsl#32 // pack + add @x[2],@x[2],@x[3],lsl#32 + ldp @x[1],@x[3],[$inp,#0] // load input + add @x[4],@x[4],@x[5],lsl#32 + add @x[6],@x[6],@x[7],lsl#32 + ldp @x[5],@x[7],[$inp,#16] + add @x[8],@x[8],@x[9],lsl#32 + add @x[10],@x[10],@x[11],lsl#32 + ldp @x[9],@x[11],[$inp,#32] + add @x[12],@x[12],@x[13],lsl#32 + add @x[14],@x[14],@x[15],lsl#32 + ldp @x[13],@x[15],[$inp,#48] + add $inp,$inp,#64 +#ifdef __ARMEB__ + rev @x[0],@x[0] + rev @x[2],@x[2] + rev @x[4],@x[4] + rev @x[6],@x[6] + rev @x[8],@x[8] + rev @x[10],@x[10] + rev @x[12],@x[12] + rev @x[14],@x[14] +#endif + eor 
@x[0],@x[0],@x[1] + eor @x[2],@x[2],@x[3] + eor @x[4],@x[4],@x[5] + eor @x[6],@x[6],@x[7] + eor @x[8],@x[8],@x[9] + eor @x[10],@x[10],@x[11] + eor @x[12],@x[12],@x[13] + eor @x[14],@x[14],@x[15] + + stp @x[0],@x[2],[$out,#0] // store output + add @d[6],@d[6],#4 // increment counter + stp @x[4],@x[6],[$out,#16] + stp @x[8],@x[10],[$out,#32] + stp @x[12],@x[14],[$out,#48] + add $out,$out,#64 + b.eq .Ldone_neon + sub $len,$len,#64 + cmp $len,#64 + b.lo .Less_than_128 + + ld1.8 {$T0-$T3},[$inp],#64 + eor $A0,$A0,$T0 + eor $B0,$B0,$T1 + eor $C0,$C0,$T2 + eor $D0,$D0,$T3 + st1.8 {$A0-$D0},[$out],#64 + b.eq .Ldone_neon + sub $len,$len,#64 + cmp $len,#64 + b.lo .Less_than_192 + + ld1.8 {$T0-$T3},[$inp],#64 + eor $A1,$A1,$T0 + eor $B1,$B1,$T1 + eor $C1,$C1,$T2 + eor $D1,$D1,$T3 + st1.8 {$A1-$D1},[$out],#64 + b.eq .Ldone_neon + sub $len,$len,#64 + + st1.8 {$A2-$D2},[sp] + b .Last_neon + +.Less_than_128: + st1.8 {$A0-$D0},[sp] + b .Last_neon +.Less_than_192: + st1.8 {$A1-$D1},[sp] + b .Last_neon + +.align 4 +.Last_neon: + sub $out,$out,#1 + add $inp,$inp,$len + add $out,$out,$len + add $ctr,sp,$len + neg $len,$len + +.Loop_tail_neon: + ldrb w10,[$inp,$len] + ldrb w11,[$ctr,$len] + add $len,$len,#1 + eor w10,w10,w11 + strb w10,[$out,$len] + cbnz $len,.Loop_tail_neon + + stp xzr,xzr,[sp,#0] + stp xzr,xzr,[sp,#16] + stp xzr,xzr,[sp,#32] + stp xzr,xzr,[sp,#48] + +.Ldone_neon: + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + ret +.size ChaCha20_neon,.-ChaCha20_neon +___ +{ +my ($T0,$T1,$T2,$T3,$T4,$T5)=@K; +my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2, + $A3,$B3,$C3,$D3,$A4,$B4,$C4,$D4,$A5,$B5,$C5,$D5) = map("v$_.4s",(0..23)); + +$code.=<<___; +.type ChaCha20_512_neon,%function +.align 5 +ChaCha20_512_neon: + stp x29,x30,[sp,#-96]! 
+ add x29,sp,#0 + + adr @x[0],.Lsigma + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + +.L512_or_more_neon: + sub sp,sp,#128+64 + + ldp @d[0],@d[1],[@x[0]] // load sigma + ld1 {@K[0]},[@x[0]],#16 + ldp @d[2],@d[3],[$key] // load key + ldp @d[4],@d[5],[$key,#16] + ld1 {@K[1],@K[2]},[$key] + ldp @d[6],@d[7],[$ctr] // load counter + ld1 {@K[3]},[$ctr] + ld1 {$ONE},[@x[0]] +#ifdef __ARMEB__ + rev64 @K[0],@K[0] + ror @d[2],@d[2],#32 + ror @d[3],@d[3],#32 + ror @d[4],@d[4],#32 + ror @d[5],@d[5],#32 + ror @d[6],@d[6],#32 + ror @d[7],@d[7],#32 +#endif + add @K[3],@K[3],$ONE // += 1 + stp @K[0],@K[1],[sp,#0] // off-load key block, invariant part + add @K[3],@K[3],$ONE // not typo + str @K[2],[sp,#32] + add @K[4],@K[3],$ONE + add @K[5],@K[4],$ONE + add @K[6],@K[5],$ONE + shl $ONE,$ONE,#2 // 1 -> 4 + + stp d8,d9,[sp,#128+0] // meet ABI requirements + stp d10,d11,[sp,#128+16] + stp d12,d13,[sp,#128+32] + stp d14,d15,[sp,#128+48] + + sub $len,$len,#512 // not typo + +.Loop_outer_512_neon: + mov $A0,@K[0] + mov $A1,@K[0] + mov $A2,@K[0] + mov $A3,@K[0] + mov $A4,@K[0] + mov $A5,@K[0] + mov $B0,@K[1] + mov.32 @x[0],@d[0] // unpack key block + mov $B1,@K[1] + lsr @x[1],@d[0],#32 + mov $B2,@K[1] + mov.32 @x[2],@d[1] + mov $B3,@K[1] + lsr @x[3],@d[1],#32 + mov $B4,@K[1] + mov.32 @x[4],@d[2] + mov $B5,@K[1] + lsr @x[5],@d[2],#32 + mov $D0,@K[3] + mov.32 @x[6],@d[3] + mov $D1,@K[4] + lsr @x[7],@d[3],#32 + mov $D2,@K[5] + mov.32 @x[8],@d[4] + mov $D3,@K[6] + lsr @x[9],@d[4],#32 + mov $C0,@K[2] + mov.32 @x[10],@d[5] + mov $C1,@K[2] + lsr @x[11],@d[5],#32 + add $D4,$D0,$ONE // +4 + mov.32 @x[12],@d[6] + add $D5,$D1,$ONE // +4 + lsr @x[13],@d[6],#32 + mov $C2,@K[2] + mov.32 @x[14],@d[7] + mov $C3,@K[2] + lsr @x[15],@d[7],#32 + mov $C4,@K[2] + stp @K[3],@K[4],[sp,#48] // off-load key block, variable part + mov $C5,@K[2] + str @K[5],[sp,#80] + + mov $ctr,#5 + subs $len,$len,#512 +.Loop_upper_neon: + sub $ctr,$ctr,#1 +___ + 
my @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0); + my @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0); + my @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0); + my @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,0); + my @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,0); + my @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,0); + my @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15)); + my $diff = ($#thread0+1)*6 - $#thread67 - 1; + my $i = 0; + + foreach (@thread0) { + eval; eval(shift(@thread67)); + eval(shift(@thread1)); eval(shift(@thread67)); + eval(shift(@thread2)); eval(shift(@thread67)); + eval(shift(@thread3)); eval(shift(@thread67)); + eval(shift(@thread4)); eval(shift(@thread67)); + eval(shift(@thread5)); eval(shift(@thread67)); + } + + @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1); + @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1); + @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1); + @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,1); + @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,1); + @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,1); + @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15)); + + foreach (@thread0) { + eval; eval(shift(@thread67)); + eval(shift(@thread1)); eval(shift(@thread67)); + eval(shift(@thread2)); eval(shift(@thread67)); + eval(shift(@thread3)); eval(shift(@thread67)); + eval(shift(@thread4)); eval(shift(@thread67)); + eval(shift(@thread5)); eval(shift(@thread67)); + } +$code.=<<___; + cbnz $ctr,.Loop_upper_neon + + add.32 @x[0],@x[0],@d[0] // accumulate key block + add @x[1],@x[1],@d[0],lsr#32 + add.32 @x[2],@x[2],@d[1] + add @x[3],@x[3],@d[1],lsr#32 + add.32 @x[4],@x[4],@d[2] + add @x[5],@x[5],@d[2],lsr#32 + add.32 @x[6],@x[6],@d[3] + add @x[7],@x[7],@d[3],lsr#32 + add.32 @x[8],@x[8],@d[4] + add @x[9],@x[9],@d[4],lsr#32 + add.32 @x[10],@x[10],@d[5] + add @x[11],@x[11],@d[5],lsr#32 + add.32 @x[12],@x[12],@d[6] + add @x[13],@x[13],@d[6],lsr#32 + add.32 @x[14],@x[14],@d[7] + add @x[15],@x[15],@d[7],lsr#32 + + add @x[0],@x[0],@x[1],lsl#32 // pack + add @x[2],@x[2],@x[3],lsl#32 + ldp @x[1],@x[3],[$inp,#0] // load 
input + add @x[4],@x[4],@x[5],lsl#32 + add @x[6],@x[6],@x[7],lsl#32 + ldp @x[5],@x[7],[$inp,#16] + add @x[8],@x[8],@x[9],lsl#32 + add @x[10],@x[10],@x[11],lsl#32 + ldp @x[9],@x[11],[$inp,#32] + add @x[12],@x[12],@x[13],lsl#32 + add @x[14],@x[14],@x[15],lsl#32 + ldp @x[13],@x[15],[$inp,#48] + add $inp,$inp,#64 +#ifdef __ARMEB__ + rev @x[0],@x[0] + rev @x[2],@x[2] + rev @x[4],@x[4] + rev @x[6],@x[6] + rev @x[8],@x[8] + rev @x[10],@x[10] + rev @x[12],@x[12] + rev @x[14],@x[14] +#endif + eor @x[0],@x[0],@x[1] + eor @x[2],@x[2],@x[3] + eor @x[4],@x[4],@x[5] + eor @x[6],@x[6],@x[7] + eor @x[8],@x[8],@x[9] + eor @x[10],@x[10],@x[11] + eor @x[12],@x[12],@x[13] + eor @x[14],@x[14],@x[15] + + stp @x[0],@x[2],[$out,#0] // store output + add @d[6],@d[6],#1 // increment counter + mov.32 @x[0],@d[0] // unpack key block + lsr @x[1],@d[0],#32 + stp @x[4],@x[6],[$out,#16] + mov.32 @x[2],@d[1] + lsr @x[3],@d[1],#32 + stp @x[8],@x[10],[$out,#32] + mov.32 @x[4],@d[2] + lsr @x[5],@d[2],#32 + stp @x[12],@x[14],[$out,#48] + add $out,$out,#64 + mov.32 @x[6],@d[3] + lsr @x[7],@d[3],#32 + mov.32 @x[8],@d[4] + lsr @x[9],@d[4],#32 + mov.32 @x[10],@d[5] + lsr @x[11],@d[5],#32 + mov.32 @x[12],@d[6] + lsr @x[13],@d[6],#32 + mov.32 @x[14],@d[7] + lsr @x[15],@d[7],#32 + + mov $ctr,#5 +.Loop_lower_neon: + sub $ctr,$ctr,#1 +___ + @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0); + @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0); + @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0); + @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,0); + @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,0); + @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,0); + @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15)); + + foreach (@thread0) { + eval; eval(shift(@thread67)); + eval(shift(@thread1)); eval(shift(@thread67)); + eval(shift(@thread2)); eval(shift(@thread67)); + eval(shift(@thread3)); eval(shift(@thread67)); + eval(shift(@thread4)); eval(shift(@thread67)); + eval(shift(@thread5)); eval(shift(@thread67)); + } + + @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1); + 
@thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1); + @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1); + @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,1); + @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,1); + @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,1); + @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15)); + + foreach (@thread0) { + eval; eval(shift(@thread67)); + eval(shift(@thread1)); eval(shift(@thread67)); + eval(shift(@thread2)); eval(shift(@thread67)); + eval(shift(@thread3)); eval(shift(@thread67)); + eval(shift(@thread4)); eval(shift(@thread67)); + eval(shift(@thread5)); eval(shift(@thread67)); + } +$code.=<<___; + cbnz $ctr,.Loop_lower_neon + + add.32 @x[0],@x[0],@d[0] // accumulate key block + ldp @K[0],@K[1],[sp,#0] + add @x[1],@x[1],@d[0],lsr#32 + ldp @K[2],@K[3],[sp,#32] + add.32 @x[2],@x[2],@d[1] + ldp @K[4],@K[5],[sp,#64] + add @x[3],@x[3],@d[1],lsr#32 + add $A0,$A0,@K[0] + add.32 @x[4],@x[4],@d[2] + add $A1,$A1,@K[0] + add @x[5],@x[5],@d[2],lsr#32 + add $A2,$A2,@K[0] + add.32 @x[6],@x[6],@d[3] + add $A3,$A3,@K[0] + add @x[7],@x[7],@d[3],lsr#32 + add $A4,$A4,@K[0] + add.32 @x[8],@x[8],@d[4] + add $A5,$A5,@K[0] + add @x[9],@x[9],@d[4],lsr#32 + add $C0,$C0,@K[2] + add.32 @x[10],@x[10],@d[5] + add $C1,$C1,@K[2] + add @x[11],@x[11],@d[5],lsr#32 + add $C2,$C2,@K[2] + add.32 @x[12],@x[12],@d[6] + add $C3,$C3,@K[2] + add @x[13],@x[13],@d[6],lsr#32 + add $C4,$C4,@K[2] + add.32 @x[14],@x[14],@d[7] + add $C5,$C5,@K[2] + add @x[15],@x[15],@d[7],lsr#32 + add $D4,$D4,$ONE // +4 + add @x[0],@x[0],@x[1],lsl#32 // pack + add $D5,$D5,$ONE // +4 + add @x[2],@x[2],@x[3],lsl#32 + add $D0,$D0,@K[3] + ldp @x[1],@x[3],[$inp,#0] // load input + add $D1,$D1,@K[4] + add @x[4],@x[4],@x[5],lsl#32 + add $D2,$D2,@K[5] + add @x[6],@x[6],@x[7],lsl#32 + add $D3,$D3,@K[6] + ldp @x[5],@x[7],[$inp,#16] + add $D4,$D4,@K[3] + add @x[8],@x[8],@x[9],lsl#32 + add $D5,$D5,@K[4] + add @x[10],@x[10],@x[11],lsl#32 + add $B0,$B0,@K[1] + ldp @x[9],@x[11],[$inp,#32] + add $B1,$B1,@K[1] + add @x[12],@x[12],@x[13],lsl#32 + add 
$B2,$B2,@K[1] + add @x[14],@x[14],@x[15],lsl#32 + add $B3,$B3,@K[1] + ldp @x[13],@x[15],[$inp,#48] + add $B4,$B4,@K[1] + add $inp,$inp,#64 + add $B5,$B5,@K[1] + +#ifdef __ARMEB__ + rev @x[0],@x[0] + rev @x[2],@x[2] + rev @x[4],@x[4] + rev @x[6],@x[6] + rev @x[8],@x[8] + rev @x[10],@x[10] + rev @x[12],@x[12] + rev @x[14],@x[14] +#endif + ld1.8 {$T0-$T3},[$inp],#64 + eor @x[0],@x[0],@x[1] + eor @x[2],@x[2],@x[3] + eor @x[4],@x[4],@x[5] + eor @x[6],@x[6],@x[7] + eor @x[8],@x[8],@x[9] + eor $A0,$A0,$T0 + eor @x[10],@x[10],@x[11] + eor $B0,$B0,$T1 + eor @x[12],@x[12],@x[13] + eor $C0,$C0,$T2 + eor @x[14],@x[14],@x[15] + eor $D0,$D0,$T3 + ld1.8 {$T0-$T3},[$inp],#64 + + stp @x[0],@x[2],[$out,#0] // store output + add @d[6],@d[6],#7 // increment counter + stp @x[4],@x[6],[$out,#16] + stp @x[8],@x[10],[$out,#32] + stp @x[12],@x[14],[$out,#48] + add $out,$out,#64 + st1.8 {$A0-$D0},[$out],#64 + + ld1.8 {$A0-$D0},[$inp],#64 + eor $A1,$A1,$T0 + eor $B1,$B1,$T1 + eor $C1,$C1,$T2 + eor $D1,$D1,$T3 + st1.8 {$A1-$D1},[$out],#64 + + ld1.8 {$A1-$D1},[$inp],#64 + eor $A2,$A2,$A0 + ldp @K[0],@K[1],[sp,#0] + eor $B2,$B2,$B0 + ldp @K[2],@K[3],[sp,#32] + eor $C2,$C2,$C0 + eor $D2,$D2,$D0 + st1.8 {$A2-$D2},[$out],#64 + + ld1.8 {$A2-$D2},[$inp],#64 + eor $A3,$A3,$A1 + eor $B3,$B3,$B1 + eor $C3,$C3,$C1 + eor $D3,$D3,$D1 + st1.8 {$A3-$D3},[$out],#64 + + ld1.8 {$A3-$D3},[$inp],#64 + eor $A4,$A4,$A2 + eor $B4,$B4,$B2 + eor $C4,$C4,$C2 + eor $D4,$D4,$D2 + st1.8 {$A4-$D4},[$out],#64 + + shl $A0,$ONE,#1 // 4 -> 8 + eor $A5,$A5,$A3 + eor $B5,$B5,$B3 + eor $C5,$C5,$C3 + eor $D5,$D5,$D3 + st1.8 {$A5-$D5},[$out],#64 + + add @K[3],@K[3],$A0 // += 8 + add @K[4],@K[4],$A0 + add @K[5],@K[5],$A0 + add @K[6],@K[6],$A0 + + b.hs .Loop_outer_512_neon + + adds $len,$len,#512 + ushr $A0,$ONE,#2 // 4 -> 1 + + ldp d8,d9,[sp,#128+0] // meet ABI requirements + ldp d10,d11,[sp,#128+16] + ldp d12,d13,[sp,#128+32] + ldp d14,d15,[sp,#128+48] + + stp @K[0],$ONE,[sp,#0] // wipe off-load area + stp @K[0],$ONE,[sp,#32] + 
stp @K[0],$ONE,[sp,#64] + + b.eq .Ldone_512_neon + + cmp $len,#192 + sub @K[3],@K[3],$A0 // -= 1 + sub @K[4],@K[4],$A0 + sub @K[5],@K[5],$A0 + add sp,sp,#128 + b.hs .Loop_outer_neon + + eor @K[1],@K[1],@K[1] + eor @K[2],@K[2],@K[2] + eor @K[3],@K[3],@K[3] + eor @K[4],@K[4],@K[4] + eor @K[5],@K[5],@K[5] + eor @K[6],@K[6],@K[6] + b .Loop_outer + +.Ldone_512_neon: + ldp x19,x20,[x29,#16] + add sp,sp,#128+64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + ret +.size ChaCha20_512_neon,.-ChaCha20_512_neon +___ +} +}}} + +# Post-process the generated code one line at a time and print the final assembly. +# Text between backticks is evaluated as Perl first. Then at most one rewrite is applied per line +# (the "or 1" keeps the chain from falling through once a rule's first test has matched): +#   op.32           -> op, with every xN on the line turned into wN +#   eor/ext/mov     -> .4s arrangement becomes .16b +#   ld1.8/st1.8     -> ld1/st1, .4s becomes .16b +#   ldr/ldp/str/stp -> vN.4s becomes qN +#   rev32.16        -> rev32, .4s becomes .8h +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/geo; + + (s/\b([a-z]+)\.32\b/$1/ and (s/x([0-9]+)/w$1/g or 1)) or + (m/\b(eor|ext|mov)\b/ and (s/\.4s/\.16b/g or 1)) or + (s/\b((?:ld|st)1)\.8\b/$1/ and (s/\.4s/\.16b/g or 1)) or + (m/\b(ld|st)[rp]\b/ and (s/v([0-9]+)\.4s/q$1/g or 1)) or + (s/\brev32\.16\b/rev32/ and (s/\.4s/\.8h/g or 1)); + + #s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo; + + print $_,"\n"; +} +close STDOUT; # flush diff --git a/crypto/chacha20/chacha20-arm64.s b/crypto/chacha20/chacha20-arm64.s new file mode 100644 index 0000000..c3d1243 --- /dev/null +++ b/crypto/chacha20/chacha20-arm64.s @@ -0,0 +1,1940 @@ +/* SPDX-License-Identifier: OpenSSL OR (BSD-3-Clause OR GPL-2.0) + * + * Copyright (C) 2015-2018 Jason A. Donenfeld . All Rights Reserved. + * Copyright 2016 The OpenSSL Project Authors. All Rights Reserved. + */ + +#include +// NOTE(review): the header name after #include is missing in this copy; it presumably defines the ENTRY()/ENDPROC() macros used below - confirm against the original file. + +.text +.align 5 +.Lsigma: +.quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral +.Lone: +.long 1,0,0,0 + +.align 5 +// chacha20_arm: ChaCha20 using only general-purpose registers, one 64-byte block per .Loop_outer pass. +// As used below: x0 = output pointer, x1 = input pointer, x2 = length in bytes, x3 = 32-byte key, x4 = 16-byte counter block. +// Returns immediately when x2 is zero. x19-x28 and x29/x30 are saved in a 96-byte frame; a further 64 bytes of stack +// hold the keystream for a partial final block. chacha20_neon enters at .Lshort for inputs shorter than 192 bytes. +ENTRY(chacha20_arm) + cbz x2,.Labort +.Lshort: + stp x29,x30,[sp,#-96]!
+ add x29,sp,#0 + + adr x5,.Lsigma + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#64 + + ldp x22,x23,[x5] // load sigma + ldp x24,x25,[x3] // load key + ldp x26,x27,[x3,#16] + ldp x28,x30,[x4] // load counter +#ifdef __ARMEB__ + ror x24,x24,#32 + ror x25,x25,#32 + ror x26,x26,#32 + ror x27,x27,#32 + ror x28,x28,#32 + ror x30,x30,#32 +#endif + +.Loop_outer: + mov w5,w22 // unpack key block + lsr x6,x22,#32 + mov w7,w23 + lsr x8,x23,#32 + mov w9,w24 + lsr x10,x24,#32 + mov w11,w25 + lsr x12,x25,#32 + mov w13,w26 + lsr x14,x26,#32 + mov w15,w27 + lsr x16,x27,#32 + mov w17,w28 + lsr x19,x28,#32 + mov w20,w30 + lsr x21,x30,#32 + + mov x4,#10 // x4 is reused as the round-loop counter: 10 passes of .Loop + subs x2,x2,#64 // flags set here are tested by b.lo/b.hi after the rounds; nothing in between changes them +.Loop: + sub x4,x4,#1 + add w5,w5,w9 + add w6,w6,w10 + add w7,w7,w11 + add w8,w8,w12 + eor w17,w17,w5 + eor w19,w19,w6 + eor w20,w20,w7 + eor w21,w21,w8 + ror w17,w17,#16 + ror w19,w19,#16 + ror w20,w20,#16 + ror w21,w21,#16 + add w13,w13,w17 + add w14,w14,w19 + add w15,w15,w20 + add w16,w16,w21 + eor w9,w9,w13 + eor w10,w10,w14 + eor w11,w11,w15 + eor w12,w12,w16 + ror w9,w9,#20 + ror w10,w10,#20 + ror w11,w11,#20 + ror w12,w12,#20 + add w5,w5,w9 + add w6,w6,w10 + add w7,w7,w11 + add w8,w8,w12 + eor w17,w17,w5 + eor w19,w19,w6 + eor w20,w20,w7 + eor w21,w21,w8 + ror w17,w17,#24 + ror w19,w19,#24 + ror w20,w20,#24 + ror w21,w21,#24 + add w13,w13,w17 + add w14,w14,w19 + add w15,w15,w20 + add w16,w16,w21 + eor w9,w9,w13 + eor w10,w10,w14 + eor w11,w11,w15 + eor w12,w12,w16 + ror w9,w9,#25 + ror w10,w10,#25 + ror w11,w11,#25 + ror w12,w12,#25 + add w5,w5,w10 + add w6,w6,w11 + add w7,w7,w12 + add w8,w8,w9 + eor w21,w21,w5 + eor w17,w17,w6 + eor w19,w19,w7 + eor w20,w20,w8 + ror w21,w21,#16 + ror w17,w17,#16 + ror w19,w19,#16 + ror w20,w20,#16 + add w15,w15,w21 + add w16,w16,w17 + add w13,w13,w19 + add w14,w14,w20 + eor w10,w10,w15 + eor w11,w11,w16 + eor w12,w12,w13 + eor w9,w9,w14 + ror w10,w10,#20 + ror w11,w11,#20 + ror w12,w12,#20 +
ror w9,w9,#20 + add w5,w5,w10 + add w6,w6,w11 + add w7,w7,w12 + add w8,w8,w9 + eor w21,w21,w5 + eor w17,w17,w6 + eor w19,w19,w7 + eor w20,w20,w8 + ror w21,w21,#24 + ror w17,w17,#24 + ror w19,w19,#24 + ror w20,w20,#24 + add w15,w15,w21 + add w16,w16,w17 + add w13,w13,w19 + add w14,w14,w20 + eor w10,w10,w15 + eor w11,w11,w16 + eor w12,w12,w13 + eor w9,w9,w14 + ror w10,w10,#25 + ror w11,w11,#25 + ror w12,w12,#25 + ror w9,w9,#25 + cbnz x4,.Loop + + add w5,w5,w22 // accumulate key block + add x6,x6,x22,lsr#32 + add w7,w7,w23 + add x8,x8,x23,lsr#32 + add w9,w9,w24 + add x10,x10,x24,lsr#32 + add w11,w11,w25 + add x12,x12,x25,lsr#32 + add w13,w13,w26 + add x14,x14,x26,lsr#32 + add w15,w15,w27 + add x16,x16,x27,lsr#32 + add w17,w17,w28 + add x19,x19,x28,lsr#32 + add w20,w20,w30 + add x21,x21,x30,lsr#32 + + b.lo .Ltail // fewer than 64 bytes were left: finish byte by byte + + add x5,x5,x6,lsl#32 // pack + add x7,x7,x8,lsl#32 + ldp x6,x8,[x1,#0] // load input + add x9,x9,x10,lsl#32 + add x11,x11,x12,lsl#32 + ldp x10,x12,[x1,#16] + add x13,x13,x14,lsl#32 + add x15,x15,x16,lsl#32 + ldp x14,x16,[x1,#32] + add x17,x17,x19,lsl#32 + add x20,x20,x21,lsl#32 + ldp x19,x21,[x1,#48] + add x1,x1,#64 +#ifdef __ARMEB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + eor x5,x5,x6 + eor x7,x7,x8 + eor x9,x9,x10 + eor x11,x11,x12 + eor x13,x13,x14 + eor x15,x15,x16 + eor x17,x17,x19 + eor x20,x20,x21 + + stp x5,x7,[x0,#0] // store output + add x28,x28,#1 // increment counter + stp x9,x11,[x0,#16] + stp x13,x15,[x0,#32] + stp x17,x20,[x0,#48] + add x0,x0,#64 + + b.hi .Loop_outer // more than 64 bytes were left before this block: go again + + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 +.Labort: + ret + +.align 4 +.Ltail: + add x2,x2,#64 // undo the subs: x2 = bytes left (below 64) +// Partial block: point x0/x1/x4 past the end of output/input/stack buffer and let x2 run from -len up to 0. +// x0 gets an extra -1 because .Loop_tail increments x2 before the strb. The keystream block is packed and +// stored at sp, XORed into the output one byte at a time, and the stack copy is zeroed afterwards. +.Less_than_64: + sub x0,x0,#1 + add x1,x1,x2 + add x0,x0,x2 + add x4,sp,x2 + neg x2,x2 + + add x5,x5,x6,lsl#32 // pack + add x7,x7,x8,lsl#32 + add x9,x9,x10,lsl#32 + add x11,x11,x12,lsl#32 +
add x13,x13,x14,lsl#32 + add x15,x15,x16,lsl#32 + add x17,x17,x19,lsl#32 + add x20,x20,x21,lsl#32 +#ifdef __ARMEB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + stp x5,x7,[sp,#0] + stp x9,x11,[sp,#16] + stp x13,x15,[sp,#32] + stp x17,x20,[sp,#48] + +.Loop_tail: + ldrb w10,[x1,x2] + ldrb w11,[x4,x2] + add x2,x2,#1 + eor w10,w10,w11 + strb w10,[x0,x2] + cbnz x2,.Loop_tail + + stp xzr,xzr,[sp,#0] + stp xzr,xzr,[sp,#16] + stp xzr,xzr,[sp,#32] + stp xzr,xzr,[sp,#48] + + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + ret +ENDPROC(chacha20_arm) + +.align 5 +ENTRY(chacha20_neon) + cbz x2,.Labort_neon + cmp x2,#192 + b.lo .Lshort + + stp x29,x30,[sp,#-96]! + add x29,sp,#0 + + adr x5,.Lsigma + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + cmp x2,#512 + b.hs .L512_or_more_neon + + sub sp,sp,#64 + + ldp x22,x23,[x5] // load sigma + ld1 {v24.4s},[x5],#16 + ldp x24,x25,[x3] // load key + ldp x26,x27,[x3,#16] + ld1 {v25.4s,v26.4s},[x3] + ldp x28,x30,[x4] // load counter + ld1 {v27.4s},[x4] + ld1 {v31.4s},[x5] +#ifdef __ARMEB__ + rev64 v24.4s,v24.4s + ror x24,x24,#32 + ror x25,x25,#32 + ror x26,x26,#32 + ror x27,x27,#32 + ror x28,x28,#32 + ror x30,x30,#32 +#endif + add v27.4s,v27.4s,v31.4s // += 1 + add v28.4s,v27.4s,v31.4s + add v29.4s,v28.4s,v31.4s + shl v31.4s,v31.4s,#2 // 1 -> 4 + +.Loop_outer_neon: + mov w5,w22 // unpack key block + lsr x6,x22,#32 + mov v0.16b,v24.16b + mov w7,w23 + lsr x8,x23,#32 + mov v4.16b,v24.16b + mov w9,w24 + lsr x10,x24,#32 + mov v16.16b,v24.16b + mov w11,w25 + mov v1.16b,v25.16b + lsr x12,x25,#32 + mov v5.16b,v25.16b + mov w13,w26 + mov v17.16b,v25.16b + lsr x14,x26,#32 + mov v3.16b,v27.16b + mov w15,w27 + mov v7.16b,v28.16b + lsr x16,x27,#32 + mov v19.16b,v29.16b + mov w17,w28 + mov v2.16b,v26.16b +
lsr x19,x28,#32 + mov v6.16b,v26.16b + mov w20,w30 + mov v18.16b,v26.16b + lsr x21,x30,#32 + + mov x4,#10 + subs x2,x2,#256 +.Loop_neon: + sub x4,x4,#1 + add v0.4s,v0.4s,v1.4s + add w5,w5,w9 + add v4.4s,v4.4s,v5.4s + add w6,w6,w10 + add v16.4s,v16.4s,v17.4s + add w7,w7,w11 + eor v3.16b,v3.16b,v0.16b + add w8,w8,w12 + eor v7.16b,v7.16b,v4.16b + eor w17,w17,w5 + eor v19.16b,v19.16b,v16.16b + eor w19,w19,w6 + rev32 v3.8h,v3.8h + eor w20,w20,w7 + rev32 v7.8h,v7.8h + eor w21,w21,w8 + rev32 v19.8h,v19.8h + ror w17,w17,#16 + add v2.4s,v2.4s,v3.4s + ror w19,w19,#16 + add v6.4s,v6.4s,v7.4s + ror w20,w20,#16 + add v18.4s,v18.4s,v19.4s + ror w21,w21,#16 + eor v20.16b,v1.16b,v2.16b + add w13,w13,w17 + eor v21.16b,v5.16b,v6.16b + add w14,w14,w19 + eor v22.16b,v17.16b,v18.16b + add w15,w15,w20 + ushr v1.4s,v20.4s,#20 + add w16,w16,w21 + ushr v5.4s,v21.4s,#20 + eor w9,w9,w13 + ushr v17.4s,v22.4s,#20 + eor w10,w10,w14 + sli v1.4s,v20.4s,#12 + eor w11,w11,w15 + sli v5.4s,v21.4s,#12 + eor w12,w12,w16 + sli v17.4s,v22.4s,#12 + ror w9,w9,#20 + add v0.4s,v0.4s,v1.4s + ror w10,w10,#20 + add v4.4s,v4.4s,v5.4s + ror w11,w11,#20 + add v16.4s,v16.4s,v17.4s + ror w12,w12,#20 + eor v20.16b,v3.16b,v0.16b + add w5,w5,w9 + eor v21.16b,v7.16b,v4.16b + add w6,w6,w10 + eor v22.16b,v19.16b,v16.16b + add w7,w7,w11 + ushr v3.4s,v20.4s,#24 + add w8,w8,w12 + ushr v7.4s,v21.4s,#24 + eor w17,w17,w5 + ushr v19.4s,v22.4s,#24 + eor w19,w19,w6 + sli v3.4s,v20.4s,#8 + eor w20,w20,w7 + sli v7.4s,v21.4s,#8 + eor w21,w21,w8 + sli v19.4s,v22.4s,#8 + ror w17,w17,#24 + add v2.4s,v2.4s,v3.4s + ror w19,w19,#24 + add v6.4s,v6.4s,v7.4s + ror w20,w20,#24 + add v18.4s,v18.4s,v19.4s + ror w21,w21,#24 + eor v20.16b,v1.16b,v2.16b + add w13,w13,w17 + eor v21.16b,v5.16b,v6.16b + add w14,w14,w19 + eor v22.16b,v17.16b,v18.16b + add w15,w15,w20 + ushr v1.4s,v20.4s,#25 + add w16,w16,w21 + ushr v5.4s,v21.4s,#25 + eor w9,w9,w13 + ushr v17.4s,v22.4s,#25 + eor w10,w10,w14 + sli v1.4s,v20.4s,#7 + eor w11,w11,w15 + sli v5.4s,v21.4s,#7 + 
eor w12,w12,w16 + sli v17.4s,v22.4s,#7 + ror w9,w9,#25 + ext v2.16b,v2.16b,v2.16b,#8 + ror w10,w10,#25 + ext v6.16b,v6.16b,v6.16b,#8 + ror w11,w11,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w12,w12,#25 + ext v3.16b,v3.16b,v3.16b,#12 + ext v7.16b,v7.16b,v7.16b,#12 + ext v19.16b,v19.16b,v19.16b,#12 + ext v1.16b,v1.16b,v1.16b,#4 + ext v5.16b,v5.16b,v5.16b,#4 + ext v17.16b,v17.16b,v17.16b,#4 + add v0.4s,v0.4s,v1.4s + add w5,w5,w10 + add v4.4s,v4.4s,v5.4s + add w6,w6,w11 + add v16.4s,v16.4s,v17.4s + add w7,w7,w12 + eor v3.16b,v3.16b,v0.16b + add w8,w8,w9 + eor v7.16b,v7.16b,v4.16b + eor w21,w21,w5 + eor v19.16b,v19.16b,v16.16b + eor w17,w17,w6 + rev32 v3.8h,v3.8h + eor w19,w19,w7 + rev32 v7.8h,v7.8h + eor w20,w20,w8 + rev32 v19.8h,v19.8h + ror w21,w21,#16 + add v2.4s,v2.4s,v3.4s + ror w17,w17,#16 + add v6.4s,v6.4s,v7.4s + ror w19,w19,#16 + add v18.4s,v18.4s,v19.4s + ror w20,w20,#16 + eor v20.16b,v1.16b,v2.16b + add w15,w15,w21 + eor v21.16b,v5.16b,v6.16b + add w16,w16,w17 + eor v22.16b,v17.16b,v18.16b + add w13,w13,w19 + ushr v1.4s,v20.4s,#20 + add w14,w14,w20 + ushr v5.4s,v21.4s,#20 + eor w10,w10,w15 + ushr v17.4s,v22.4s,#20 + eor w11,w11,w16 + sli v1.4s,v20.4s,#12 + eor w12,w12,w13 + sli v5.4s,v21.4s,#12 + eor w9,w9,w14 + sli v17.4s,v22.4s,#12 + ror w10,w10,#20 + add v0.4s,v0.4s,v1.4s + ror w11,w11,#20 + add v4.4s,v4.4s,v5.4s + ror w12,w12,#20 + add v16.4s,v16.4s,v17.4s + ror w9,w9,#20 + eor v20.16b,v3.16b,v0.16b + add w5,w5,w10 + eor v21.16b,v7.16b,v4.16b + add w6,w6,w11 + eor v22.16b,v19.16b,v16.16b + add w7,w7,w12 + ushr v3.4s,v20.4s,#24 + add w8,w8,w9 + ushr v7.4s,v21.4s,#24 + eor w21,w21,w5 + ushr v19.4s,v22.4s,#24 + eor w17,w17,w6 + sli v3.4s,v20.4s,#8 + eor w19,w19,w7 + sli v7.4s,v21.4s,#8 + eor w20,w20,w8 + sli v19.4s,v22.4s,#8 + ror w21,w21,#24 + add v2.4s,v2.4s,v3.4s + ror w17,w17,#24 + add v6.4s,v6.4s,v7.4s + ror w19,w19,#24 + add v18.4s,v18.4s,v19.4s + ror w20,w20,#24 + eor v20.16b,v1.16b,v2.16b + add w15,w15,w21 + eor v21.16b,v5.16b,v6.16b + add 
w16,w16,w17 + eor v22.16b,v17.16b,v18.16b + add w13,w13,w19 + ushr v1.4s,v20.4s,#25 + add w14,w14,w20 + ushr v5.4s,v21.4s,#25 + eor w10,w10,w15 + ushr v17.4s,v22.4s,#25 + eor w11,w11,w16 + sli v1.4s,v20.4s,#7 + eor w12,w12,w13 + sli v5.4s,v21.4s,#7 + eor w9,w9,w14 + sli v17.4s,v22.4s,#7 + ror w10,w10,#25 + ext v2.16b,v2.16b,v2.16b,#8 + ror w11,w11,#25 + ext v6.16b,v6.16b,v6.16b,#8 + ror w12,w12,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w9,w9,#25 + ext v3.16b,v3.16b,v3.16b,#4 + ext v7.16b,v7.16b,v7.16b,#4 + ext v19.16b,v19.16b,v19.16b,#4 + ext v1.16b,v1.16b,v1.16b,#12 + ext v5.16b,v5.16b,v5.16b,#12 + ext v17.16b,v17.16b,v17.16b,#12 + cbnz x4,.Loop_neon + + add w5,w5,w22 // accumulate key block + add v0.4s,v0.4s,v24.4s + add x6,x6,x22,lsr#32 + add v4.4s,v4.4s,v24.4s + add w7,w7,w23 + add v16.4s,v16.4s,v24.4s + add x8,x8,x23,lsr#32 + add v2.4s,v2.4s,v26.4s + add w9,w9,w24 + add v6.4s,v6.4s,v26.4s + add x10,x10,x24,lsr#32 + add v18.4s,v18.4s,v26.4s + add w11,w11,w25 + add v3.4s,v3.4s,v27.4s + add x12,x12,x25,lsr#32 + add w13,w13,w26 + add v7.4s,v7.4s,v28.4s + add x14,x14,x26,lsr#32 + add w15,w15,w27 + add v19.4s,v19.4s,v29.4s + add x16,x16,x27,lsr#32 + add w17,w17,w28 + add v1.4s,v1.4s,v25.4s + add x19,x19,x28,lsr#32 + add w20,w20,w30 + add v5.4s,v5.4s,v25.4s + add x21,x21,x30,lsr#32 + add v17.4s,v17.4s,v25.4s + + b.lo .Ltail_neon + + add x5,x5,x6,lsl#32 // pack + add x7,x7,x8,lsl#32 + ldp x6,x8,[x1,#0] // load input + add x9,x9,x10,lsl#32 + add x11,x11,x12,lsl#32 + ldp x10,x12,[x1,#16] + add x13,x13,x14,lsl#32 + add x15,x15,x16,lsl#32 + ldp x14,x16,[x1,#32] + add x17,x17,x19,lsl#32 + add x20,x20,x21,lsl#32 + ldp x19,x21,[x1,#48] + add x1,x1,#64 +#ifdef __ARMEB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + eor x5,x5,x6 + eor x7,x7,x8 + eor x9,x9,x10 + eor x11,x11,x12 + eor x13,x13,x14 + eor v0.16b,v0.16b,v20.16b + eor x15,x15,x16 + eor 
v1.16b,v1.16b,v21.16b + eor x17,x17,x19 + eor v2.16b,v2.16b,v22.16b + eor x20,x20,x21 + eor v3.16b,v3.16b,v23.16b + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + + stp x5,x7,[x0,#0] // store output + add x28,x28,#4 // increment counter + stp x9,x11,[x0,#16] + add v27.4s,v27.4s,v31.4s // += 4 + stp x13,x15,[x0,#32] + add v28.4s,v28.4s,v31.4s + stp x17,x20,[x0,#48] + add v29.4s,v29.4s,v31.4s + add x0,x0,#64 + + st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 + ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 + + eor v4.16b,v4.16b,v20.16b + eor v5.16b,v5.16b,v21.16b + eor v6.16b,v6.16b,v22.16b + eor v7.16b,v7.16b,v23.16b + st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 + + eor v16.16b,v16.16b,v0.16b + eor v17.16b,v17.16b,v1.16b + eor v18.16b,v18.16b,v2.16b + eor v19.16b,v19.16b,v3.16b + st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 + + b.hi .Loop_outer_neon + + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + ret + +.Ltail_neon: + add x2,x2,#256 + cmp x2,#64 + b.lo .Less_than_64 + + add x5,x5,x6,lsl#32 // pack + add x7,x7,x8,lsl#32 + ldp x6,x8,[x1,#0] // load input + add x9,x9,x10,lsl#32 + add x11,x11,x12,lsl#32 + ldp x10,x12,[x1,#16] + add x13,x13,x14,lsl#32 + add x15,x15,x16,lsl#32 + ldp x14,x16,[x1,#32] + add x17,x17,x19,lsl#32 + add x20,x20,x21,lsl#32 + ldp x19,x21,[x1,#48] + add x1,x1,#64 +#ifdef __ARMEB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + eor x5,x5,x6 + eor x7,x7,x8 + eor x9,x9,x10 + eor x11,x11,x12 + eor x13,x13,x14 + eor x15,x15,x16 + eor x17,x17,x19 + eor x20,x20,x21 + + stp x5,x7,[x0,#0] // store output + add x28,x28,#4 // increment counter + stp x9,x11,[x0,#16] + stp x13,x15,[x0,#32] + stp x17,x20,[x0,#48] + add x0,x0,#64 + b.eq .Ldone_neon + sub x2,x2,#64 + cmp x2,#64 + b.lo .Less_than_128 + + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + eor v0.16b,v0.16b,v20.16b + eor 
v1.16b,v1.16b,v21.16b + eor v2.16b,v2.16b,v22.16b + eor v3.16b,v3.16b,v23.16b + st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 + b.eq .Ldone_neon + sub x2,x2,#64 + cmp x2,#64 + b.lo .Less_than_192 + + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + eor v4.16b,v4.16b,v20.16b + eor v5.16b,v5.16b,v21.16b + eor v6.16b,v6.16b,v22.16b + eor v7.16b,v7.16b,v23.16b + st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 + b.eq .Ldone_neon + sub x2,x2,#64 + + st1 {v16.16b,v17.16b,v18.16b,v19.16b},[sp] + b .Last_neon + +.Less_than_128: + st1 {v0.16b,v1.16b,v2.16b,v3.16b},[sp] + b .Last_neon +.Less_than_192: + st1 {v4.16b,v5.16b,v6.16b,v7.16b},[sp] + b .Last_neon + +.align 4 +.Last_neon: + sub x0,x0,#1 + add x1,x1,x2 + add x0,x0,x2 + add x4,sp,x2 + neg x2,x2 + +.Loop_tail_neon: + ldrb w10,[x1,x2] + ldrb w11,[x4,x2] + add x2,x2,#1 + eor w10,w10,w11 + strb w10,[x0,x2] + cbnz x2,.Loop_tail_neon + + stp xzr,xzr,[sp,#0] + stp xzr,xzr,[sp,#16] + stp xzr,xzr,[sp,#32] + stp xzr,xzr,[sp,#48] + +.Ldone_neon: + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + ret + +.L512_or_more_neon: + sub sp,sp,#128+64 + + ldp x22,x23,[x5] // load sigma + ld1 {v24.4s},[x5],#16 + ldp x24,x25,[x3] // load key + ldp x26,x27,[x3,#16] + ld1 {v25.4s,v26.4s},[x3] + ldp x28,x30,[x4] // load counter + ld1 {v27.4s},[x4] + ld1 {v31.4s},[x5] +#ifdef __ARMEB__ + rev64 v24.4s,v24.4s + ror x24,x24,#32 + ror x25,x25,#32 + ror x26,x26,#32 + ror x27,x27,#32 + ror x28,x28,#32 + ror x30,x30,#32 +#endif + add v27.4s,v27.4s,v31.4s // += 1 + stp q24,q25,[sp,#0] // off-load key block, invariant part + add v27.4s,v27.4s,v31.4s // not typo + str q26,[sp,#32] + add v28.4s,v27.4s,v31.4s + add v29.4s,v28.4s,v31.4s + add v30.4s,v29.4s,v31.4s + shl v31.4s,v31.4s,#2 // 1 -> 4 + + stp d8,d9,[sp,#128+0] // meet ABI requirements + stp d10,d11,[sp,#128+16] + stp d12,d13,[sp,#128+32] + stp d14,d15,[sp,#128+48] + + sub x2,x2,#512 // not typo + 
+.Loop_outer_512_neon: + mov v0.16b,v24.16b + mov v4.16b,v24.16b + mov v8.16b,v24.16b + mov v12.16b,v24.16b + mov v16.16b,v24.16b + mov v20.16b,v24.16b + mov v1.16b,v25.16b + mov w5,w22 // unpack key block + mov v5.16b,v25.16b + lsr x6,x22,#32 + mov v9.16b,v25.16b + mov w7,w23 + mov v13.16b,v25.16b + lsr x8,x23,#32 + mov v17.16b,v25.16b + mov w9,w24 + mov v21.16b,v25.16b + lsr x10,x24,#32 + mov v3.16b,v27.16b + mov w11,w25 + mov v7.16b,v28.16b + lsr x12,x25,#32 + mov v11.16b,v29.16b + mov w13,w26 + mov v15.16b,v30.16b + lsr x14,x26,#32 + mov v2.16b,v26.16b + mov w15,w27 + mov v6.16b,v26.16b + lsr x16,x27,#32 + add v19.4s,v3.4s,v31.4s // +4 + mov w17,w28 + add v23.4s,v7.4s,v31.4s // +4 + lsr x19,x28,#32 + mov v10.16b,v26.16b + mov w20,w30 + mov v14.16b,v26.16b + lsr x21,x30,#32 + mov v18.16b,v26.16b + stp q27,q28,[sp,#48] // off-load key block, variable part + mov v22.16b,v26.16b + str q29,[sp,#80] + + mov x4,#5 + subs x2,x2,#512 +.Loop_upper_neon: + sub x4,x4,#1 + add v0.4s,v0.4s,v1.4s + add w5,w5,w9 + add v4.4s,v4.4s,v5.4s + add w6,w6,w10 + add v8.4s,v8.4s,v9.4s + add w7,w7,w11 + add v12.4s,v12.4s,v13.4s + add w8,w8,w12 + add v16.4s,v16.4s,v17.4s + eor w17,w17,w5 + add v20.4s,v20.4s,v21.4s + eor w19,w19,w6 + eor v3.16b,v3.16b,v0.16b + eor w20,w20,w7 + eor v7.16b,v7.16b,v4.16b + eor w21,w21,w8 + eor v11.16b,v11.16b,v8.16b + ror w17,w17,#16 + eor v15.16b,v15.16b,v12.16b + ror w19,w19,#16 + eor v19.16b,v19.16b,v16.16b + ror w20,w20,#16 + eor v23.16b,v23.16b,v20.16b + ror w21,w21,#16 + rev32 v3.8h,v3.8h + add w13,w13,w17 + rev32 v7.8h,v7.8h + add w14,w14,w19 + rev32 v11.8h,v11.8h + add w15,w15,w20 + rev32 v15.8h,v15.8h + add w16,w16,w21 + rev32 v19.8h,v19.8h + eor w9,w9,w13 + rev32 v23.8h,v23.8h + eor w10,w10,w14 + add v2.4s,v2.4s,v3.4s + eor w11,w11,w15 + add v6.4s,v6.4s,v7.4s + eor w12,w12,w16 + add v10.4s,v10.4s,v11.4s + ror w9,w9,#20 + add v14.4s,v14.4s,v15.4s + ror w10,w10,#20 + add v18.4s,v18.4s,v19.4s + ror w11,w11,#20 + add v22.4s,v22.4s,v23.4s + ror 
w12,w12,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w9 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w10 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w11 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w12 + eor v28.16b,v17.16b,v18.16b + eor w17,w17,w5 + eor v29.16b,v21.16b,v22.16b + eor w19,w19,w6 + ushr v1.4s,v24.4s,#20 + eor w20,w20,w7 + ushr v5.4s,v25.4s,#20 + eor w21,w21,w8 + ushr v9.4s,v26.4s,#20 + ror w17,w17,#24 + ushr v13.4s,v27.4s,#20 + ror w19,w19,#24 + ushr v17.4s,v28.4s,#20 + ror w20,w20,#24 + ushr v21.4s,v29.4s,#20 + ror w21,w21,#24 + sli v1.4s,v24.4s,#12 + add w13,w13,w17 + sli v5.4s,v25.4s,#12 + add w14,w14,w19 + sli v9.4s,v26.4s,#12 + add w15,w15,w20 + sli v13.4s,v27.4s,#12 + add w16,w16,w21 + sli v17.4s,v28.4s,#12 + eor w9,w9,w13 + sli v21.4s,v29.4s,#12 + eor w10,w10,w14 + add v0.4s,v0.4s,v1.4s + eor w11,w11,w15 + add v4.4s,v4.4s,v5.4s + eor w12,w12,w16 + add v8.4s,v8.4s,v9.4s + ror w9,w9,#25 + add v12.4s,v12.4s,v13.4s + ror w10,w10,#25 + add v16.4s,v16.4s,v17.4s + ror w11,w11,#25 + add v20.4s,v20.4s,v21.4s + ror w12,w12,#25 + eor v24.16b,v3.16b,v0.16b + add w5,w5,w10 + eor v25.16b,v7.16b,v4.16b + add w6,w6,w11 + eor v26.16b,v11.16b,v8.16b + add w7,w7,w12 + eor v27.16b,v15.16b,v12.16b + add w8,w8,w9 + eor v28.16b,v19.16b,v16.16b + eor w21,w21,w5 + eor v29.16b,v23.16b,v20.16b + eor w17,w17,w6 + ushr v3.4s,v24.4s,#24 + eor w19,w19,w7 + ushr v7.4s,v25.4s,#24 + eor w20,w20,w8 + ushr v11.4s,v26.4s,#24 + ror w21,w21,#16 + ushr v15.4s,v27.4s,#24 + ror w17,w17,#16 + ushr v19.4s,v28.4s,#24 + ror w19,w19,#16 + ushr v23.4s,v29.4s,#24 + ror w20,w20,#16 + sli v3.4s,v24.4s,#8 + add w15,w15,w21 + sli v7.4s,v25.4s,#8 + add w16,w16,w17 + sli v11.4s,v26.4s,#8 + add w13,w13,w19 + sli v15.4s,v27.4s,#8 + add w14,w14,w20 + sli v19.4s,v28.4s,#8 + eor w10,w10,w15 + sli v23.4s,v29.4s,#8 + eor w11,w11,w16 + add v2.4s,v2.4s,v3.4s + eor w12,w12,w13 + add v6.4s,v6.4s,v7.4s + eor w9,w9,w14 + add v10.4s,v10.4s,v11.4s + ror w10,w10,#20 + add v14.4s,v14.4s,v15.4s + ror w11,w11,#20 + add 
v18.4s,v18.4s,v19.4s + ror w12,w12,#20 + add v22.4s,v22.4s,v23.4s + ror w9,w9,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w10 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w11 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w12 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w9 + eor v28.16b,v17.16b,v18.16b + eor w21,w21,w5 + eor v29.16b,v21.16b,v22.16b + eor w17,w17,w6 + ushr v1.4s,v24.4s,#25 + eor w19,w19,w7 + ushr v5.4s,v25.4s,#25 + eor w20,w20,w8 + ushr v9.4s,v26.4s,#25 + ror w21,w21,#24 + ushr v13.4s,v27.4s,#25 + ror w17,w17,#24 + ushr v17.4s,v28.4s,#25 + ror w19,w19,#24 + ushr v21.4s,v29.4s,#25 + ror w20,w20,#24 + sli v1.4s,v24.4s,#7 + add w15,w15,w21 + sli v5.4s,v25.4s,#7 + add w16,w16,w17 + sli v9.4s,v26.4s,#7 + add w13,w13,w19 + sli v13.4s,v27.4s,#7 + add w14,w14,w20 + sli v17.4s,v28.4s,#7 + eor w10,w10,w15 + sli v21.4s,v29.4s,#7 + eor w11,w11,w16 + ext v2.16b,v2.16b,v2.16b,#8 + eor w12,w12,w13 + ext v6.16b,v6.16b,v6.16b,#8 + eor w9,w9,w14 + ext v10.16b,v10.16b,v10.16b,#8 + ror w10,w10,#25 + ext v14.16b,v14.16b,v14.16b,#8 + ror w11,w11,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w12,w12,#25 + ext v22.16b,v22.16b,v22.16b,#8 + ror w9,w9,#25 + ext v3.16b,v3.16b,v3.16b,#12 + ext v7.16b,v7.16b,v7.16b,#12 + ext v11.16b,v11.16b,v11.16b,#12 + ext v15.16b,v15.16b,v15.16b,#12 + ext v19.16b,v19.16b,v19.16b,#12 + ext v23.16b,v23.16b,v23.16b,#12 + ext v1.16b,v1.16b,v1.16b,#4 + ext v5.16b,v5.16b,v5.16b,#4 + ext v9.16b,v9.16b,v9.16b,#4 + ext v13.16b,v13.16b,v13.16b,#4 + ext v17.16b,v17.16b,v17.16b,#4 + ext v21.16b,v21.16b,v21.16b,#4 + add v0.4s,v0.4s,v1.4s + add w5,w5,w9 + add v4.4s,v4.4s,v5.4s + add w6,w6,w10 + add v8.4s,v8.4s,v9.4s + add w7,w7,w11 + add v12.4s,v12.4s,v13.4s + add w8,w8,w12 + add v16.4s,v16.4s,v17.4s + eor w17,w17,w5 + add v20.4s,v20.4s,v21.4s + eor w19,w19,w6 + eor v3.16b,v3.16b,v0.16b + eor w20,w20,w7 + eor v7.16b,v7.16b,v4.16b + eor w21,w21,w8 + eor v11.16b,v11.16b,v8.16b + ror w17,w17,#16 + eor v15.16b,v15.16b,v12.16b + ror w19,w19,#16 + eor v19.16b,v19.16b,v16.16b + 
ror w20,w20,#16 + eor v23.16b,v23.16b,v20.16b + ror w21,w21,#16 + rev32 v3.8h,v3.8h + add w13,w13,w17 + rev32 v7.8h,v7.8h + add w14,w14,w19 + rev32 v11.8h,v11.8h + add w15,w15,w20 + rev32 v15.8h,v15.8h + add w16,w16,w21 + rev32 v19.8h,v19.8h + eor w9,w9,w13 + rev32 v23.8h,v23.8h + eor w10,w10,w14 + add v2.4s,v2.4s,v3.4s + eor w11,w11,w15 + add v6.4s,v6.4s,v7.4s + eor w12,w12,w16 + add v10.4s,v10.4s,v11.4s + ror w9,w9,#20 + add v14.4s,v14.4s,v15.4s + ror w10,w10,#20 + add v18.4s,v18.4s,v19.4s + ror w11,w11,#20 + add v22.4s,v22.4s,v23.4s + ror w12,w12,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w9 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w10 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w11 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w12 + eor v28.16b,v17.16b,v18.16b + eor w17,w17,w5 + eor v29.16b,v21.16b,v22.16b + eor w19,w19,w6 + ushr v1.4s,v24.4s,#20 + eor w20,w20,w7 + ushr v5.4s,v25.4s,#20 + eor w21,w21,w8 + ushr v9.4s,v26.4s,#20 + ror w17,w17,#24 + ushr v13.4s,v27.4s,#20 + ror w19,w19,#24 + ushr v17.4s,v28.4s,#20 + ror w20,w20,#24 + ushr v21.4s,v29.4s,#20 + ror w21,w21,#24 + sli v1.4s,v24.4s,#12 + add w13,w13,w17 + sli v5.4s,v25.4s,#12 + add w14,w14,w19 + sli v9.4s,v26.4s,#12 + add w15,w15,w20 + sli v13.4s,v27.4s,#12 + add w16,w16,w21 + sli v17.4s,v28.4s,#12 + eor w9,w9,w13 + sli v21.4s,v29.4s,#12 + eor w10,w10,w14 + add v0.4s,v0.4s,v1.4s + eor w11,w11,w15 + add v4.4s,v4.4s,v5.4s + eor w12,w12,w16 + add v8.4s,v8.4s,v9.4s + ror w9,w9,#25 + add v12.4s,v12.4s,v13.4s + ror w10,w10,#25 + add v16.4s,v16.4s,v17.4s + ror w11,w11,#25 + add v20.4s,v20.4s,v21.4s + ror w12,w12,#25 + eor v24.16b,v3.16b,v0.16b + add w5,w5,w10 + eor v25.16b,v7.16b,v4.16b + add w6,w6,w11 + eor v26.16b,v11.16b,v8.16b + add w7,w7,w12 + eor v27.16b,v15.16b,v12.16b + add w8,w8,w9 + eor v28.16b,v19.16b,v16.16b + eor w21,w21,w5 + eor v29.16b,v23.16b,v20.16b + eor w17,w17,w6 + ushr v3.4s,v24.4s,#24 + eor w19,w19,w7 + ushr v7.4s,v25.4s,#24 + eor w20,w20,w8 + ushr v11.4s,v26.4s,#24 + ror w21,w21,#16 + ushr 
v15.4s,v27.4s,#24 + ror w17,w17,#16 + ushr v19.4s,v28.4s,#24 + ror w19,w19,#16 + ushr v23.4s,v29.4s,#24 + ror w20,w20,#16 + sli v3.4s,v24.4s,#8 + add w15,w15,w21 + sli v7.4s,v25.4s,#8 + add w16,w16,w17 + sli v11.4s,v26.4s,#8 + add w13,w13,w19 + sli v15.4s,v27.4s,#8 + add w14,w14,w20 + sli v19.4s,v28.4s,#8 + eor w10,w10,w15 + sli v23.4s,v29.4s,#8 + eor w11,w11,w16 + add v2.4s,v2.4s,v3.4s + eor w12,w12,w13 + add v6.4s,v6.4s,v7.4s + eor w9,w9,w14 + add v10.4s,v10.4s,v11.4s + ror w10,w10,#20 + add v14.4s,v14.4s,v15.4s + ror w11,w11,#20 + add v18.4s,v18.4s,v19.4s + ror w12,w12,#20 + add v22.4s,v22.4s,v23.4s + ror w9,w9,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w10 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w11 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w12 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w9 + eor v28.16b,v17.16b,v18.16b + eor w21,w21,w5 + eor v29.16b,v21.16b,v22.16b + eor w17,w17,w6 + ushr v1.4s,v24.4s,#25 + eor w19,w19,w7 + ushr v5.4s,v25.4s,#25 + eor w20,w20,w8 + ushr v9.4s,v26.4s,#25 + ror w21,w21,#24 + ushr v13.4s,v27.4s,#25 + ror w17,w17,#24 + ushr v17.4s,v28.4s,#25 + ror w19,w19,#24 + ushr v21.4s,v29.4s,#25 + ror w20,w20,#24 + sli v1.4s,v24.4s,#7 + add w15,w15,w21 + sli v5.4s,v25.4s,#7 + add w16,w16,w17 + sli v9.4s,v26.4s,#7 + add w13,w13,w19 + sli v13.4s,v27.4s,#7 + add w14,w14,w20 + sli v17.4s,v28.4s,#7 + eor w10,w10,w15 + sli v21.4s,v29.4s,#7 + eor w11,w11,w16 + ext v2.16b,v2.16b,v2.16b,#8 + eor w12,w12,w13 + ext v6.16b,v6.16b,v6.16b,#8 + eor w9,w9,w14 + ext v10.16b,v10.16b,v10.16b,#8 + ror w10,w10,#25 + ext v14.16b,v14.16b,v14.16b,#8 + ror w11,w11,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w12,w12,#25 + ext v22.16b,v22.16b,v22.16b,#8 + ror w9,w9,#25 + ext v3.16b,v3.16b,v3.16b,#4 + ext v7.16b,v7.16b,v7.16b,#4 + ext v11.16b,v11.16b,v11.16b,#4 + ext v15.16b,v15.16b,v15.16b,#4 + ext v19.16b,v19.16b,v19.16b,#4 + ext v23.16b,v23.16b,v23.16b,#4 + ext v1.16b,v1.16b,v1.16b,#12 + ext v5.16b,v5.16b,v5.16b,#12 + ext v9.16b,v9.16b,v9.16b,#12 + ext 
v13.16b,v13.16b,v13.16b,#12 + ext v17.16b,v17.16b,v17.16b,#12 + ext v21.16b,v21.16b,v21.16b,#12 + cbnz x4,.Loop_upper_neon + + add w5,w5,w22 // accumulate key block + add x6,x6,x22,lsr#32 + add w7,w7,w23 + add x8,x8,x23,lsr#32 + add w9,w9,w24 + add x10,x10,x24,lsr#32 + add w11,w11,w25 + add x12,x12,x25,lsr#32 + add w13,w13,w26 + add x14,x14,x26,lsr#32 + add w15,w15,w27 + add x16,x16,x27,lsr#32 + add w17,w17,w28 + add x19,x19,x28,lsr#32 + add w20,w20,w30 + add x21,x21,x30,lsr#32 + + add x5,x5,x6,lsl#32 // pack + add x7,x7,x8,lsl#32 + ldp x6,x8,[x1,#0] // load input + add x9,x9,x10,lsl#32 + add x11,x11,x12,lsl#32 + ldp x10,x12,[x1,#16] + add x13,x13,x14,lsl#32 + add x15,x15,x16,lsl#32 + ldp x14,x16,[x1,#32] + add x17,x17,x19,lsl#32 + add x20,x20,x21,lsl#32 + ldp x19,x21,[x1,#48] + add x1,x1,#64 +#ifdef __ARMEB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + eor x5,x5,x6 + eor x7,x7,x8 + eor x9,x9,x10 + eor x11,x11,x12 + eor x13,x13,x14 + eor x15,x15,x16 + eor x17,x17,x19 + eor x20,x20,x21 + + stp x5,x7,[x0,#0] // store output + add x28,x28,#1 // increment counter + mov w5,w22 // unpack key block + lsr x6,x22,#32 + stp x9,x11,[x0,#16] + mov w7,w23 + lsr x8,x23,#32 + stp x13,x15,[x0,#32] + mov w9,w24 + lsr x10,x24,#32 + stp x17,x20,[x0,#48] + add x0,x0,#64 + mov w11,w25 + lsr x12,x25,#32 + mov w13,w26 + lsr x14,x26,#32 + mov w15,w27 + lsr x16,x27,#32 + mov w17,w28 + lsr x19,x28,#32 + mov w20,w30 + lsr x21,x30,#32 + + mov x4,#5 +.Loop_lower_neon: + sub x4,x4,#1 + add v0.4s,v0.4s,v1.4s + add w5,w5,w9 + add v4.4s,v4.4s,v5.4s + add w6,w6,w10 + add v8.4s,v8.4s,v9.4s + add w7,w7,w11 + add v12.4s,v12.4s,v13.4s + add w8,w8,w12 + add v16.4s,v16.4s,v17.4s + eor w17,w17,w5 + add v20.4s,v20.4s,v21.4s + eor w19,w19,w6 + eor v3.16b,v3.16b,v0.16b + eor w20,w20,w7 + eor v7.16b,v7.16b,v4.16b + eor w21,w21,w8 + eor v11.16b,v11.16b,v8.16b + ror w17,w17,#16 + eor v15.16b,v15.16b,v12.16b + ror w19,w19,#16 + eor 
v19.16b,v19.16b,v16.16b + ror w20,w20,#16 + eor v23.16b,v23.16b,v20.16b + ror w21,w21,#16 + rev32 v3.8h,v3.8h + add w13,w13,w17 + rev32 v7.8h,v7.8h + add w14,w14,w19 + rev32 v11.8h,v11.8h + add w15,w15,w20 + rev32 v15.8h,v15.8h + add w16,w16,w21 + rev32 v19.8h,v19.8h + eor w9,w9,w13 + rev32 v23.8h,v23.8h + eor w10,w10,w14 + add v2.4s,v2.4s,v3.4s + eor w11,w11,w15 + add v6.4s,v6.4s,v7.4s + eor w12,w12,w16 + add v10.4s,v10.4s,v11.4s + ror w9,w9,#20 + add v14.4s,v14.4s,v15.4s + ror w10,w10,#20 + add v18.4s,v18.4s,v19.4s + ror w11,w11,#20 + add v22.4s,v22.4s,v23.4s + ror w12,w12,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w9 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w10 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w11 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w12 + eor v28.16b,v17.16b,v18.16b + eor w17,w17,w5 + eor v29.16b,v21.16b,v22.16b + eor w19,w19,w6 + ushr v1.4s,v24.4s,#20 + eor w20,w20,w7 + ushr v5.4s,v25.4s,#20 + eor w21,w21,w8 + ushr v9.4s,v26.4s,#20 + ror w17,w17,#24 + ushr v13.4s,v27.4s,#20 + ror w19,w19,#24 + ushr v17.4s,v28.4s,#20 + ror w20,w20,#24 + ushr v21.4s,v29.4s,#20 + ror w21,w21,#24 + sli v1.4s,v24.4s,#12 + add w13,w13,w17 + sli v5.4s,v25.4s,#12 + add w14,w14,w19 + sli v9.4s,v26.4s,#12 + add w15,w15,w20 + sli v13.4s,v27.4s,#12 + add w16,w16,w21 + sli v17.4s,v28.4s,#12 + eor w9,w9,w13 + sli v21.4s,v29.4s,#12 + eor w10,w10,w14 + add v0.4s,v0.4s,v1.4s + eor w11,w11,w15 + add v4.4s,v4.4s,v5.4s + eor w12,w12,w16 + add v8.4s,v8.4s,v9.4s + ror w9,w9,#25 + add v12.4s,v12.4s,v13.4s + ror w10,w10,#25 + add v16.4s,v16.4s,v17.4s + ror w11,w11,#25 + add v20.4s,v20.4s,v21.4s + ror w12,w12,#25 + eor v24.16b,v3.16b,v0.16b + add w5,w5,w10 + eor v25.16b,v7.16b,v4.16b + add w6,w6,w11 + eor v26.16b,v11.16b,v8.16b + add w7,w7,w12 + eor v27.16b,v15.16b,v12.16b + add w8,w8,w9 + eor v28.16b,v19.16b,v16.16b + eor w21,w21,w5 + eor v29.16b,v23.16b,v20.16b + eor w17,w17,w6 + ushr v3.4s,v24.4s,#24 + eor w19,w19,w7 + ushr v7.4s,v25.4s,#24 + eor w20,w20,w8 + ushr v11.4s,v26.4s,#24 + 
ror w21,w21,#16 + ushr v15.4s,v27.4s,#24 + ror w17,w17,#16 + ushr v19.4s,v28.4s,#24 + ror w19,w19,#16 + ushr v23.4s,v29.4s,#24 + ror w20,w20,#16 + sli v3.4s,v24.4s,#8 + add w15,w15,w21 + sli v7.4s,v25.4s,#8 + add w16,w16,w17 + sli v11.4s,v26.4s,#8 + add w13,w13,w19 + sli v15.4s,v27.4s,#8 + add w14,w14,w20 + sli v19.4s,v28.4s,#8 + eor w10,w10,w15 + sli v23.4s,v29.4s,#8 + eor w11,w11,w16 + add v2.4s,v2.4s,v3.4s + eor w12,w12,w13 + add v6.4s,v6.4s,v7.4s + eor w9,w9,w14 + add v10.4s,v10.4s,v11.4s + ror w10,w10,#20 + add v14.4s,v14.4s,v15.4s + ror w11,w11,#20 + add v18.4s,v18.4s,v19.4s + ror w12,w12,#20 + add v22.4s,v22.4s,v23.4s + ror w9,w9,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w10 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w11 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w12 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w9 + eor v28.16b,v17.16b,v18.16b + eor w21,w21,w5 + eor v29.16b,v21.16b,v22.16b + eor w17,w17,w6 + ushr v1.4s,v24.4s,#25 + eor w19,w19,w7 + ushr v5.4s,v25.4s,#25 + eor w20,w20,w8 + ushr v9.4s,v26.4s,#25 + ror w21,w21,#24 + ushr v13.4s,v27.4s,#25 + ror w17,w17,#24 + ushr v17.4s,v28.4s,#25 + ror w19,w19,#24 + ushr v21.4s,v29.4s,#25 + ror w20,w20,#24 + sli v1.4s,v24.4s,#7 + add w15,w15,w21 + sli v5.4s,v25.4s,#7 + add w16,w16,w17 + sli v9.4s,v26.4s,#7 + add w13,w13,w19 + sli v13.4s,v27.4s,#7 + add w14,w14,w20 + sli v17.4s,v28.4s,#7 + eor w10,w10,w15 + sli v21.4s,v29.4s,#7 + eor w11,w11,w16 + ext v2.16b,v2.16b,v2.16b,#8 + eor w12,w12,w13 + ext v6.16b,v6.16b,v6.16b,#8 + eor w9,w9,w14 + ext v10.16b,v10.16b,v10.16b,#8 + ror w10,w10,#25 + ext v14.16b,v14.16b,v14.16b,#8 + ror w11,w11,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w12,w12,#25 + ext v22.16b,v22.16b,v22.16b,#8 + ror w9,w9,#25 + ext v3.16b,v3.16b,v3.16b,#12 + ext v7.16b,v7.16b,v7.16b,#12 + ext v11.16b,v11.16b,v11.16b,#12 + ext v15.16b,v15.16b,v15.16b,#12 + ext v19.16b,v19.16b,v19.16b,#12 + ext v23.16b,v23.16b,v23.16b,#12 + ext v1.16b,v1.16b,v1.16b,#4 + ext v5.16b,v5.16b,v5.16b,#4 + ext 
v9.16b,v9.16b,v9.16b,#4 + ext v13.16b,v13.16b,v13.16b,#4 + ext v17.16b,v17.16b,v17.16b,#4 + ext v21.16b,v21.16b,v21.16b,#4 + add v0.4s,v0.4s,v1.4s + add w5,w5,w9 + add v4.4s,v4.4s,v5.4s + add w6,w6,w10 + add v8.4s,v8.4s,v9.4s + add w7,w7,w11 + add v12.4s,v12.4s,v13.4s + add w8,w8,w12 + add v16.4s,v16.4s,v17.4s + eor w17,w17,w5 + add v20.4s,v20.4s,v21.4s + eor w19,w19,w6 + eor v3.16b,v3.16b,v0.16b + eor w20,w20,w7 + eor v7.16b,v7.16b,v4.16b + eor w21,w21,w8 + eor v11.16b,v11.16b,v8.16b + ror w17,w17,#16 + eor v15.16b,v15.16b,v12.16b + ror w19,w19,#16 + eor v19.16b,v19.16b,v16.16b + ror w20,w20,#16 + eor v23.16b,v23.16b,v20.16b + ror w21,w21,#16 + rev32 v3.8h,v3.8h + add w13,w13,w17 + rev32 v7.8h,v7.8h + add w14,w14,w19 + rev32 v11.8h,v11.8h + add w15,w15,w20 + rev32 v15.8h,v15.8h + add w16,w16,w21 + rev32 v19.8h,v19.8h + eor w9,w9,w13 + rev32 v23.8h,v23.8h + eor w10,w10,w14 + add v2.4s,v2.4s,v3.4s + eor w11,w11,w15 + add v6.4s,v6.4s,v7.4s + eor w12,w12,w16 + add v10.4s,v10.4s,v11.4s + ror w9,w9,#20 + add v14.4s,v14.4s,v15.4s + ror w10,w10,#20 + add v18.4s,v18.4s,v19.4s + ror w11,w11,#20 + add v22.4s,v22.4s,v23.4s + ror w12,w12,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w9 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w10 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w11 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w12 + eor v28.16b,v17.16b,v18.16b + eor w17,w17,w5 + eor v29.16b,v21.16b,v22.16b + eor w19,w19,w6 + ushr v1.4s,v24.4s,#20 + eor w20,w20,w7 + ushr v5.4s,v25.4s,#20 + eor w21,w21,w8 + ushr v9.4s,v26.4s,#20 + ror w17,w17,#24 + ushr v13.4s,v27.4s,#20 + ror w19,w19,#24 + ushr v17.4s,v28.4s,#20 + ror w20,w20,#24 + ushr v21.4s,v29.4s,#20 + ror w21,w21,#24 + sli v1.4s,v24.4s,#12 + add w13,w13,w17 + sli v5.4s,v25.4s,#12 + add w14,w14,w19 + sli v9.4s,v26.4s,#12 + add w15,w15,w20 + sli v13.4s,v27.4s,#12 + add w16,w16,w21 + sli v17.4s,v28.4s,#12 + eor w9,w9,w13 + sli v21.4s,v29.4s,#12 + eor w10,w10,w14 + add v0.4s,v0.4s,v1.4s + eor w11,w11,w15 + add v4.4s,v4.4s,v5.4s + eor 
w12,w12,w16 + add v8.4s,v8.4s,v9.4s + ror w9,w9,#25 + add v12.4s,v12.4s,v13.4s + ror w10,w10,#25 + add v16.4s,v16.4s,v17.4s + ror w11,w11,#25 + add v20.4s,v20.4s,v21.4s + ror w12,w12,#25 + eor v24.16b,v3.16b,v0.16b + add w5,w5,w10 + eor v25.16b,v7.16b,v4.16b + add w6,w6,w11 + eor v26.16b,v11.16b,v8.16b + add w7,w7,w12 + eor v27.16b,v15.16b,v12.16b + add w8,w8,w9 + eor v28.16b,v19.16b,v16.16b + eor w21,w21,w5 + eor v29.16b,v23.16b,v20.16b + eor w17,w17,w6 + ushr v3.4s,v24.4s,#24 + eor w19,w19,w7 + ushr v7.4s,v25.4s,#24 + eor w20,w20,w8 + ushr v11.4s,v26.4s,#24 + ror w21,w21,#16 + ushr v15.4s,v27.4s,#24 + ror w17,w17,#16 + ushr v19.4s,v28.4s,#24 + ror w19,w19,#16 + ushr v23.4s,v29.4s,#24 + ror w20,w20,#16 + sli v3.4s,v24.4s,#8 + add w15,w15,w21 + sli v7.4s,v25.4s,#8 + add w16,w16,w17 + sli v11.4s,v26.4s,#8 + add w13,w13,w19 + sli v15.4s,v27.4s,#8 + add w14,w14,w20 + sli v19.4s,v28.4s,#8 + eor w10,w10,w15 + sli v23.4s,v29.4s,#8 + eor w11,w11,w16 + add v2.4s,v2.4s,v3.4s + eor w12,w12,w13 + add v6.4s,v6.4s,v7.4s + eor w9,w9,w14 + add v10.4s,v10.4s,v11.4s + ror w10,w10,#20 + add v14.4s,v14.4s,v15.4s + ror w11,w11,#20 + add v18.4s,v18.4s,v19.4s + ror w12,w12,#20 + add v22.4s,v22.4s,v23.4s + ror w9,w9,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w10 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w11 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w12 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w9 + eor v28.16b,v17.16b,v18.16b + eor w21,w21,w5 + eor v29.16b,v21.16b,v22.16b + eor w17,w17,w6 + ushr v1.4s,v24.4s,#25 + eor w19,w19,w7 + ushr v5.4s,v25.4s,#25 + eor w20,w20,w8 + ushr v9.4s,v26.4s,#25 + ror w21,w21,#24 + ushr v13.4s,v27.4s,#25 + ror w17,w17,#24 + ushr v17.4s,v28.4s,#25 + ror w19,w19,#24 + ushr v21.4s,v29.4s,#25 + ror w20,w20,#24 + sli v1.4s,v24.4s,#7 + add w15,w15,w21 + sli v5.4s,v25.4s,#7 + add w16,w16,w17 + sli v9.4s,v26.4s,#7 + add w13,w13,w19 + sli v13.4s,v27.4s,#7 + add w14,w14,w20 + sli v17.4s,v28.4s,#7 + eor w10,w10,w15 + sli v21.4s,v29.4s,#7 + eor w11,w11,w16 + ext 
v2.16b,v2.16b,v2.16b,#8 + eor w12,w12,w13 + ext v6.16b,v6.16b,v6.16b,#8 + eor w9,w9,w14 + ext v10.16b,v10.16b,v10.16b,#8 + ror w10,w10,#25 + ext v14.16b,v14.16b,v14.16b,#8 + ror w11,w11,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w12,w12,#25 + ext v22.16b,v22.16b,v22.16b,#8 + ror w9,w9,#25 + ext v3.16b,v3.16b,v3.16b,#4 + ext v7.16b,v7.16b,v7.16b,#4 + ext v11.16b,v11.16b,v11.16b,#4 + ext v15.16b,v15.16b,v15.16b,#4 + ext v19.16b,v19.16b,v19.16b,#4 + ext v23.16b,v23.16b,v23.16b,#4 + ext v1.16b,v1.16b,v1.16b,#12 + ext v5.16b,v5.16b,v5.16b,#12 + ext v9.16b,v9.16b,v9.16b,#12 + ext v13.16b,v13.16b,v13.16b,#12 + ext v17.16b,v17.16b,v17.16b,#12 + ext v21.16b,v21.16b,v21.16b,#12 + cbnz x4,.Loop_lower_neon + + add w5,w5,w22 // accumulate key block + ldp q24,q25,[sp,#0] + add x6,x6,x22,lsr#32 + ldp q26,q27,[sp,#32] + add w7,w7,w23 + ldp q28,q29,[sp,#64] + add x8,x8,x23,lsr#32 + add v0.4s,v0.4s,v24.4s + add w9,w9,w24 + add v4.4s,v4.4s,v24.4s + add x10,x10,x24,lsr#32 + add v8.4s,v8.4s,v24.4s + add w11,w11,w25 + add v12.4s,v12.4s,v24.4s + add x12,x12,x25,lsr#32 + add v16.4s,v16.4s,v24.4s + add w13,w13,w26 + add v20.4s,v20.4s,v24.4s + add x14,x14,x26,lsr#32 + add v2.4s,v2.4s,v26.4s + add w15,w15,w27 + add v6.4s,v6.4s,v26.4s + add x16,x16,x27,lsr#32 + add v10.4s,v10.4s,v26.4s + add w17,w17,w28 + add v14.4s,v14.4s,v26.4s + add x19,x19,x28,lsr#32 + add v18.4s,v18.4s,v26.4s + add w20,w20,w30 + add v22.4s,v22.4s,v26.4s + add x21,x21,x30,lsr#32 + add v19.4s,v19.4s,v31.4s // +4 + add x5,x5,x6,lsl#32 // pack + add v23.4s,v23.4s,v31.4s // +4 + add x7,x7,x8,lsl#32 + add v3.4s,v3.4s,v27.4s + ldp x6,x8,[x1,#0] // load input + add v7.4s,v7.4s,v28.4s + add x9,x9,x10,lsl#32 + add v11.4s,v11.4s,v29.4s + add x11,x11,x12,lsl#32 + add v15.4s,v15.4s,v30.4s + ldp x10,x12,[x1,#16] + add v19.4s,v19.4s,v27.4s + add x13,x13,x14,lsl#32 + add v23.4s,v23.4s,v28.4s + add x15,x15,x16,lsl#32 + add v1.4s,v1.4s,v25.4s + ldp x14,x16,[x1,#32] + add v5.4s,v5.4s,v25.4s + add x17,x17,x19,lsl#32 + add v9.4s,v9.4s,v25.4s 
+ add x20,x20,x21,lsl#32 + add v13.4s,v13.4s,v25.4s + ldp x19,x21,[x1,#48] + add v17.4s,v17.4s,v25.4s + add x1,x1,#64 + add v21.4s,v21.4s,v25.4s + +#ifdef __ARMEB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 + eor x5,x5,x6 + eor x7,x7,x8 + eor x9,x9,x10 + eor x11,x11,x12 + eor x13,x13,x14 + eor v0.16b,v0.16b,v24.16b + eor x15,x15,x16 + eor v1.16b,v1.16b,v25.16b + eor x17,x17,x19 + eor v2.16b,v2.16b,v26.16b + eor x20,x20,x21 + eor v3.16b,v3.16b,v27.16b + ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 + + stp x5,x7,[x0,#0] // store output + add x28,x28,#7 // increment counter + stp x9,x11,[x0,#16] + stp x13,x15,[x0,#32] + stp x17,x20,[x0,#48] + add x0,x0,#64 + st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 + + ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 + eor v4.16b,v4.16b,v24.16b + eor v5.16b,v5.16b,v25.16b + eor v6.16b,v6.16b,v26.16b + eor v7.16b,v7.16b,v27.16b + st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 + + ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 + eor v8.16b,v8.16b,v0.16b + ldp q24,q25,[sp,#0] + eor v9.16b,v9.16b,v1.16b + ldp q26,q27,[sp,#32] + eor v10.16b,v10.16b,v2.16b + eor v11.16b,v11.16b,v3.16b + st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64 + + ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64 + eor v12.16b,v12.16b,v4.16b + eor v13.16b,v13.16b,v5.16b + eor v14.16b,v14.16b,v6.16b + eor v15.16b,v15.16b,v7.16b + st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64 + + ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64 + eor v16.16b,v16.16b,v8.16b + eor v17.16b,v17.16b,v9.16b + eor v18.16b,v18.16b,v10.16b + eor v19.16b,v19.16b,v11.16b + st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 + + shl v0.4s,v31.4s,#1 // 4 -> 8 + eor v20.16b,v20.16b,v12.16b + eor v21.16b,v21.16b,v13.16b + eor v22.16b,v22.16b,v14.16b + eor v23.16b,v23.16b,v15.16b + st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64 + + add v27.4s,v27.4s,v0.4s // += 8 + add v28.4s,v28.4s,v0.4s + add 
v29.4s,v29.4s,v0.4s + add v30.4s,v30.4s,v0.4s + + b.hs .Loop_outer_512_neon + + adds x2,x2,#512 + ushr v0.4s,v31.4s,#2 // 4 -> 1 + + ldp d8,d9,[sp,#128+0] // meet ABI requirements + ldp d10,d11,[sp,#128+16] + ldp d12,d13,[sp,#128+32] + ldp d14,d15,[sp,#128+48] + + stp q24,q31,[sp,#0] // wipe off-load area + stp q24,q31,[sp,#32] + stp q24,q31,[sp,#64] + + b.eq .Ldone_512_neon + + cmp x2,#192 + sub v27.4s,v27.4s,v0.4s // -= 1 + sub v28.4s,v28.4s,v0.4s + sub v29.4s,v29.4s,v0.4s + add sp,sp,#128 + b.hs .Loop_outer_neon + + eor v25.16b,v25.16b,v25.16b + eor v26.16b,v26.16b,v26.16b + eor v27.16b,v27.16b,v27.16b + eor v28.16b,v28.16b,v28.16b + eor v29.16b,v29.16b,v29.16b + eor v30.16b,v30.16b,v30.16b + b .Loop_outer + +.Ldone_512_neon: + ldp x19,x20,[x29,#16] + add sp,sp,#128+64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 +.Labort_neon: + ret +ENDPROC(chacha20_neon) diff --git a/crypto/chacha20poly1305.cpp b/crypto/chacha20poly1305.cpp index a5c222d..6ec5a75 100644 --- a/crypto/chacha20poly1305.cpp +++ b/crypto/chacha20poly1305.cpp @@ -43,8 +43,22 @@ void _cdecl poly1305_emit_avx(void *ctx, uint8 mac[16], const uint32 nonce[4]); void _cdecl poly1305_blocks_avx(void *ctx, const uint8 *inp, size_t len, uint32 padbit); void _cdecl poly1305_blocks_avx2(void *ctx, const uint8 *inp, size_t len, uint32 padbit); void _cdecl poly1305_blocks_avx512(void *ctx, const uint8 *inp, size_t len, uint32 padbit); + +#if defined(ARCH_CPU_ARM_FAMILY) +void chacha20_arm(uint8 *out, const uint8 *in, size_t len, const uint32 key[8], const uint32 counter[4]); +void chacha20_neon(uint8 *out, const uint8 *in, size_t len, const uint32 key[8], const uint32 counter[4]); +#endif +void poly1305_init_arm(void *ctx, const uint8 key[16]); +void poly1305_blocks_arm(void *ctx, const uint8 *inp, size_t len, uint32 padbit); +void poly1305_emit_arm(void *ctx, uint8 mac[16], const uint32 nonce[4]); +void poly1305_blocks_neon(void *ctx, 
const uint8 *inp, size_t len, uint32 padbit); +void poly1305_emit_neon(void *ctx, uint8 mac[16], const uint32 nonce[4]); + } + + + struct chacha20_ctx { uint32 state[CHACHA20_BLOCK_SIZE / sizeof(uint32)]; }; @@ -193,6 +207,17 @@ SAFEBUFFERS static void chacha20_crypt(struct chacha20_ctx *ctx, uint8 *dst, con } #endif // defined(ARCH_CPU_X86_64) +#if defined(ARCH_CPU_ARM_FAMILY) + if (ARM_PCAP_NEON) { + chacha20_neon(dst, src, bytes, &ctx->state[4], &ctx->state[12]); + } else { + chacha20_arm(dst, src, bytes, &ctx->state[4], &ctx->state[12]); + } + ctx->state[12] += (bytes + 63) / 64; + return; +#endif // defined(ARCH_CPU_ARM_FAMILY) + + if (dst != src) memcpy(dst, src, bytes); @@ -385,7 +410,7 @@ SAFEBUFFERS static void poly1305_init(struct poly1305_ctx *ctx, const uint8 key[ #if defined(ARCH_CPU_X86_64) poly1305_init_x86_64(ctx->opaque, key); -#elif defined(CONFIG_ARM) || defined(CONFIG_ARM64) +#elif defined(ARCH_CPU_ARM_FAMILY) poly1305_init_arm(ctx->opaque, key); #elif defined(CONFIG_MIPS) && defined(CONFIG_64BIT) poly1305_init_mips(ctx->opaque, key); @@ -409,7 +434,12 @@ static inline void poly1305_blocks(void *ctx, const uint8 *inp, size_t len, uint poly1305_blocks_avx(ctx, inp, len, padbit); else poly1305_blocks_x86_64(ctx, inp, len, padbit); -#else // defined(ARCH_CPU_X86_64) +#elif defined(ARCH_CPU_ARM_FAMILY) + if (ARM_PCAP_NEON) + poly1305_blocks_neon(ctx, inp, len, padbit); + else + poly1305_blocks_arm(ctx, inp, len, padbit); +#else poly1305_blocks_generic(ctx, inp, len, padbit); #endif // defined(ARCH_CPU_X86_64) } @@ -421,6 +451,11 @@ static inline void poly1305_emit(void *ctx, uint8 mac[16], const uint32 nonce[4] poly1305_emit_avx(ctx, mac, nonce); else poly1305_emit_x86_64(ctx, mac, nonce); +#elif defined(ARCH_CPU_ARM_FAMILY) + if (ARM_PCAP_NEON) + poly1305_emit_neon(ctx, mac, nonce); + else + poly1305_emit_arm(ctx, mac, nonce); #else // defined(ARCH_CPU_X86_64) poly1305_emit_generic(ctx, mac, nonce); #endif // defined(ARCH_CPU_X86_64) diff --git 
a/crypto/curve25519-donna.h b/crypto/curve25519-donna.h index 6985273..93380b3 100644 --- a/crypto/curve25519-donna.h +++ b/crypto/curve25519-donna.h @@ -1,17 +1,17 @@ -#ifndef TUNSAFE_CRYPTO_CURVE25519_DONNA_H_ -#define TUNSAFE_CRYPTO_CURVE25519_DONNA_H_ - -#include "tunsafe_types.h" - -void curve25519_donna_ref(uint8 *mypublic, const uint8 *secret, const uint8 *basepoint); -extern "C" void curve25519_donna_x64(uint8 *mypublic, const uint8 *secret, const uint8 *basepoint); - -#if defined(ARCH_CPU_X86_64) && defined(COMPILER_MSVC) -#define curve25519_donna curve25519_donna_x64 -#else -#define curve25519_donna curve25519_donna_ref -#endif - -void curve25519_normalize(uint8 *e); - +#ifndef TUNSAFE_CRYPTO_CURVE25519_DONNA_H_ +#define TUNSAFE_CRYPTO_CURVE25519_DONNA_H_ + +#include "tunsafe_types.h" + +void curve25519_donna_ref(uint8 *mypublic, const uint8 *secret, const uint8 *basepoint); +extern "C" void curve25519_donna_x64(uint8 *mypublic, const uint8 *secret, const uint8 *basepoint); + +#if defined(ARCH_CPU_X86_64) && defined(COMPILER_MSVC) +#define curve25519_donna curve25519_donna_x64 +#else +#define curve25519_donna curve25519_donna_ref +#endif + +void curve25519_normalize(uint8 *e); + #endif // TUNSAFE_CRYPTO_CURVE25519_DONNA_H_ \ No newline at end of file diff --git a/crypto/make_all_asm_files.sh b/crypto/make_all_asm_files.sh old mode 100644 new mode 100755 diff --git a/crypto/make_poly1305_x64.pl b/crypto/make_poly1305_x64.pl old mode 100644 new mode 100755 diff --git a/crypto/poly1305/poly1305-arm.pl b/crypto/poly1305/poly1305-arm.pl new file mode 100644 index 0000000..5cdb6be --- /dev/null +++ b/crypto/poly1305/poly1305-arm.pl @@ -0,0 +1,1253 @@ +#! /usr/bin/env perl +# Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. 
You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# IALU(*)/gcc-4.4 NEON +# +# ARM11xx(ARMv6) 7.78/+100% - +# Cortex-A5 6.35/+130% 3.00 +# Cortex-A8 6.25/+115% 2.36 +# Cortex-A9 5.10/+95% 2.55 +# Cortex-A15 3.85/+85% 1.25(**) +# Snapdragon S4 5.70/+100% 1.48(**) +# +# (*) this is for -march=armv6, i.e. with bunch of ldrb loading data; +# (**) these are trade-off results, they can be improved by ~8% but at +# the cost of 15/12% regression on Cortex-A5/A7, it's even possible +# to improve Cortex-A9 result, but then A5/A7 loose more than 20%; + +$flavour = shift; +if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } +else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} + +($ctx,$inp,$len,$padbit)=map("r$_",(0..3)); + +$code.=<<___; +#include "arm_arch.h" + +.text +#if defined(__thumb2__) +.syntax unified +.thumb +#else +.code 32 +#endif + +.globl poly1305_emit +.globl poly1305_blocks +.globl poly1305_init +.type poly1305_init,%function +.align 5 +poly1305_init: +.Lpoly1305_init: + stmdb sp!,{r4-r11} + + eor r3,r3,r3 + cmp $inp,#0 + str r3,[$ctx,#0] @ zero hash value + str r3,[$ctx,#4] + str r3,[$ctx,#8] + str r3,[$ctx,#12] + str r3,[$ctx,#16] + str 
r3,[$ctx,#36] @ is_base2_26 + add $ctx,$ctx,#20 + +#ifdef __thumb2__ + it eq +#endif + moveq r0,#0 + beq .Lno_key + +#if __ARM_MAX_ARCH__>=7 + adr r11,.Lpoly1305_init + ldr r12,.LOPENSSL_armcap +#endif + ldrb r4,[$inp,#0] + mov r10,#0x0fffffff + ldrb r5,[$inp,#1] + and r3,r10,#-4 @ 0x0ffffffc + ldrb r6,[$inp,#2] + ldrb r7,[$inp,#3] + orr r4,r4,r5,lsl#8 + ldrb r5,[$inp,#4] + orr r4,r4,r6,lsl#16 + ldrb r6,[$inp,#5] + orr r4,r4,r7,lsl#24 + ldrb r7,[$inp,#6] + and r4,r4,r10 + +#if __ARM_MAX_ARCH__>=7 + ldr r12,[r11,r12] @ OPENSSL_armcap_P +# ifdef __APPLE__ + ldr r12,[r12] +# endif +#endif + ldrb r8,[$inp,#7] + orr r5,r5,r6,lsl#8 + ldrb r6,[$inp,#8] + orr r5,r5,r7,lsl#16 + ldrb r7,[$inp,#9] + orr r5,r5,r8,lsl#24 + ldrb r8,[$inp,#10] + and r5,r5,r3 + +#if __ARM_MAX_ARCH__>=7 + tst r12,#ARMV7_NEON @ check for NEON +# ifdef __APPLE__ + adr r9,poly1305_blocks_neon + adr r11,poly1305_blocks +# ifdef __thumb2__ + it ne +# endif + movne r11,r9 + adr r12,poly1305_emit + adr r10,poly1305_emit_neon +# ifdef __thumb2__ + it ne +# endif + movne r12,r10 +# else +# ifdef __thumb2__ + itete eq +# endif + addeq r12,r11,#(poly1305_emit-.Lpoly1305_init) + addne r12,r11,#(poly1305_emit_neon-.Lpoly1305_init) + addeq r11,r11,#(poly1305_blocks-.Lpoly1305_init) + addne r11,r11,#(poly1305_blocks_neon-.Lpoly1305_init) +# endif +# ifdef __thumb2__ + orr r12,r12,#1 @ thumb-ify address + orr r11,r11,#1 +# endif +#endif + ldrb r9,[$inp,#11] + orr r6,r6,r7,lsl#8 + ldrb r7,[$inp,#12] + orr r6,r6,r8,lsl#16 + ldrb r8,[$inp,#13] + orr r6,r6,r9,lsl#24 + ldrb r9,[$inp,#14] + and r6,r6,r3 + + ldrb r10,[$inp,#15] + orr r7,r7,r8,lsl#8 + str r4,[$ctx,#0] + orr r7,r7,r9,lsl#16 + str r5,[$ctx,#4] + orr r7,r7,r10,lsl#24 + str r6,[$ctx,#8] + and r7,r7,r3 + str r7,[$ctx,#12] +#if __ARM_MAX_ARCH__>=7 + stmia r2,{r11,r12} @ fill functions table + mov r0,#1 +#else + mov r0,#0 +#endif +.Lno_key: + ldmia sp!,{r4-r11} +#if __ARM_ARCH__>=5 + ret @ bx lr +#else + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, 
yet + bx lr @ interoperable with Thumb ISA:-) +#endif +.size poly1305_init,.-poly1305_init +___ +{ +my ($h0,$h1,$h2,$h3,$h4,$r0,$r1,$r2,$r3)=map("r$_",(4..12)); +my ($s1,$s2,$s3)=($r1,$r2,$r3); + +$code.=<<___; +.type poly1305_blocks,%function +.align 5 +poly1305_blocks: +.Lpoly1305_blocks: + stmdb sp!,{r3-r11,lr} + + ands $len,$len,#-16 + beq .Lno_data + + cmp $padbit,#0 + add $len,$len,$inp @ end pointer + sub sp,sp,#32 + + ldmia $ctx,{$h0-$r3} @ load context + + str $ctx,[sp,#12] @ offload stuff + mov lr,$inp + str $len,[sp,#16] + str $r1,[sp,#20] + str $r2,[sp,#24] + str $r3,[sp,#28] + b .Loop + +.Loop: +#if __ARM_ARCH__<7 + ldrb r0,[lr],#16 @ load input +# ifdef __thumb2__ + it hi +# endif + addhi $h4,$h4,#1 @ 1<<128 + ldrb r1,[lr,#-15] + ldrb r2,[lr,#-14] + ldrb r3,[lr,#-13] + orr r1,r0,r1,lsl#8 + ldrb r0,[lr,#-12] + orr r2,r1,r2,lsl#16 + ldrb r1,[lr,#-11] + orr r3,r2,r3,lsl#24 + ldrb r2,[lr,#-10] + adds $h0,$h0,r3 @ accumulate input + + ldrb r3,[lr,#-9] + orr r1,r0,r1,lsl#8 + ldrb r0,[lr,#-8] + orr r2,r1,r2,lsl#16 + ldrb r1,[lr,#-7] + orr r3,r2,r3,lsl#24 + ldrb r2,[lr,#-6] + adcs $h1,$h1,r3 + + ldrb r3,[lr,#-5] + orr r1,r0,r1,lsl#8 + ldrb r0,[lr,#-4] + orr r2,r1,r2,lsl#16 + ldrb r1,[lr,#-3] + orr r3,r2,r3,lsl#24 + ldrb r2,[lr,#-2] + adcs $h2,$h2,r3 + + ldrb r3,[lr,#-1] + orr r1,r0,r1,lsl#8 + str lr,[sp,#8] @ offload input pointer + orr r2,r1,r2,lsl#16 + add $s1,$r1,$r1,lsr#2 + orr r3,r2,r3,lsl#24 +#else + ldr r0,[lr],#16 @ load input +# ifdef __thumb2__ + it hi +# endif + addhi $h4,$h4,#1 @ padbit + ldr r1,[lr,#-12] + ldr r2,[lr,#-8] + ldr r3,[lr,#-4] +# ifdef __ARMEB__ + rev r0,r0 + rev r1,r1 + rev r2,r2 + rev r3,r3 +# endif + adds $h0,$h0,r0 @ accumulate input + str lr,[sp,#8] @ offload input pointer + adcs $h1,$h1,r1 + add $s1,$r1,$r1,lsr#2 + adcs $h2,$h2,r2 +#endif + add $s2,$r2,$r2,lsr#2 + adcs $h3,$h3,r3 + add $s3,$r3,$r3,lsr#2 + + umull r2,r3,$h1,$r0 + adc $h4,$h4,#0 + umull r0,r1,$h0,$r0 + umlal r2,r3,$h4,$s1 + umlal r0,r1,$h3,$s1 + ldr $r1,[sp,#20] 
@ reload $r1 + umlal r2,r3,$h2,$s3 + umlal r0,r1,$h1,$s3 + umlal r2,r3,$h3,$s2 + umlal r0,r1,$h2,$s2 + umlal r2,r3,$h0,$r1 + str r0,[sp,#0] @ future $h0 + mul r0,$s2,$h4 + ldr $r2,[sp,#24] @ reload $r2 + adds r2,r2,r1 @ d1+=d0>>32 + eor r1,r1,r1 + adc lr,r3,#0 @ future $h2 + str r2,[sp,#4] @ future $h1 + + mul r2,$s3,$h4 + eor r3,r3,r3 + umlal r0,r1,$h3,$s3 + ldr $r3,[sp,#28] @ reload $r3 + umlal r2,r3,$h3,$r0 + umlal r0,r1,$h2,$r0 + umlal r2,r3,$h2,$r1 + umlal r0,r1,$h1,$r1 + umlal r2,r3,$h1,$r2 + umlal r0,r1,$h0,$r2 + umlal r2,r3,$h0,$r3 + ldr $h0,[sp,#0] + mul $h4,$r0,$h4 + ldr $h1,[sp,#4] + + adds $h2,lr,r0 @ d2+=d1>>32 + ldr lr,[sp,#8] @ reload input pointer + adc r1,r1,#0 + adds $h3,r2,r1 @ d3+=d2>>32 + ldr r0,[sp,#16] @ reload end pointer + adc r3,r3,#0 + add $h4,$h4,r3 @ h4+=d3>>32 + + and r1,$h4,#-4 + and $h4,$h4,#3 + add r1,r1,r1,lsr#2 @ *=5 + adds $h0,$h0,r1 + adcs $h1,$h1,#0 + adcs $h2,$h2,#0 + adcs $h3,$h3,#0 + adc $h4,$h4,#0 + + cmp r0,lr @ done yet? + bhi .Loop + + ldr $ctx,[sp,#12] + add sp,sp,#32 + stmia $ctx,{$h0-$h4} @ store the result + +.Lno_data: +#if __ARM_ARCH__>=5 + ldmia sp!,{r3-r11,pc} +#else + ldmia sp!,{r3-r11,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + bx lr @ interoperable with Thumb ISA:-) +#endif +.size poly1305_blocks,.-poly1305_blocks +___ +} +{ +my ($ctx,$mac,$nonce)=map("r$_",(0..2)); +my ($h0,$h1,$h2,$h3,$h4,$g0,$g1,$g2,$g3)=map("r$_",(3..11)); +my $g4=$h4; + +$code.=<<___; +.type poly1305_emit,%function +.align 5 +poly1305_emit: + stmdb sp!,{r4-r11} +.Lpoly1305_emit_enter: + + ldmia $ctx,{$h0-$h4} + adds $g0,$h0,#5 @ compare to modulus + adcs $g1,$h1,#0 + adcs $g2,$h2,#0 + adcs $g3,$h3,#0 + adc $g4,$h4,#0 + tst $g4,#4 @ did it carry/borrow? 
+ +#ifdef __thumb2__ + it ne +#endif + movne $h0,$g0 + ldr $g0,[$nonce,#0] +#ifdef __thumb2__ + it ne +#endif + movne $h1,$g1 + ldr $g1,[$nonce,#4] +#ifdef __thumb2__ + it ne +#endif + movne $h2,$g2 + ldr $g2,[$nonce,#8] +#ifdef __thumb2__ + it ne +#endif + movne $h3,$g3 + ldr $g3,[$nonce,#12] + + adds $h0,$h0,$g0 + adcs $h1,$h1,$g1 + adcs $h2,$h2,$g2 + adc $h3,$h3,$g3 + +#if __ARM_ARCH__>=7 +# ifdef __ARMEB__ + rev $h0,$h0 + rev $h1,$h1 + rev $h2,$h2 + rev $h3,$h3 +# endif + str $h0,[$mac,#0] + str $h1,[$mac,#4] + str $h2,[$mac,#8] + str $h3,[$mac,#12] +#else + strb $h0,[$mac,#0] + mov $h0,$h0,lsr#8 + strb $h1,[$mac,#4] + mov $h1,$h1,lsr#8 + strb $h2,[$mac,#8] + mov $h2,$h2,lsr#8 + strb $h3,[$mac,#12] + mov $h3,$h3,lsr#8 + + strb $h0,[$mac,#1] + mov $h0,$h0,lsr#8 + strb $h1,[$mac,#5] + mov $h1,$h1,lsr#8 + strb $h2,[$mac,#9] + mov $h2,$h2,lsr#8 + strb $h3,[$mac,#13] + mov $h3,$h3,lsr#8 + + strb $h0,[$mac,#2] + mov $h0,$h0,lsr#8 + strb $h1,[$mac,#6] + mov $h1,$h1,lsr#8 + strb $h2,[$mac,#10] + mov $h2,$h2,lsr#8 + strb $h3,[$mac,#14] + mov $h3,$h3,lsr#8 + + strb $h0,[$mac,#3] + strb $h1,[$mac,#7] + strb $h2,[$mac,#11] + strb $h3,[$mac,#15] +#endif + ldmia sp!,{r4-r11} +#if __ARM_ARCH__>=5 + ret @ bx lr +#else + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + bx lr @ interoperable with Thumb ISA:-) +#endif +.size poly1305_emit,.-poly1305_emit +___ +{ +my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("d$_",(0..9)); +my ($D0,$D1,$D2,$D3,$D4, $H0,$H1,$H2,$H3,$H4) = map("q$_",(5..14)); +my ($T0,$T1,$MASK) = map("q$_",(15,4,0)); + +my ($in2,$zeros,$tbl0,$tbl1) = map("r$_",(4..7)); + +$code.=<<___; +#if __ARM_MAX_ARCH__>=7 +.fpu neon + +.type poly1305_init_neon,%function +.align 5 +poly1305_init_neon: + ldr r4,[$ctx,#20] @ load key base 2^32 + ldr r5,[$ctx,#24] + ldr r6,[$ctx,#28] + ldr r7,[$ctx,#32] + + and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26 + mov r3,r4,lsr#26 + mov r4,r5,lsr#20 + orr r3,r3,r5,lsl#6 + mov r5,r6,lsr#14 + orr r4,r4,r6,lsl#12 + mov 
r6,r7,lsr#8 + orr r5,r5,r7,lsl#18 + and r3,r3,#0x03ffffff + and r4,r4,#0x03ffffff + and r5,r5,#0x03ffffff + + vdup.32 $R0,r2 @ r^1 in both lanes + add r2,r3,r3,lsl#2 @ *5 + vdup.32 $R1,r3 + add r3,r4,r4,lsl#2 + vdup.32 $S1,r2 + vdup.32 $R2,r4 + add r4,r5,r5,lsl#2 + vdup.32 $S2,r3 + vdup.32 $R3,r5 + add r5,r6,r6,lsl#2 + vdup.32 $S3,r4 + vdup.32 $R4,r6 + vdup.32 $S4,r5 + + mov $zeros,#2 @ counter + +.Lsquare_neon: + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 + @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 + @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 + @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 + @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 + + vmull.u32 $D0,$R0,${R0}[1] + vmull.u32 $D1,$R1,${R0}[1] + vmull.u32 $D2,$R2,${R0}[1] + vmull.u32 $D3,$R3,${R0}[1] + vmull.u32 $D4,$R4,${R0}[1] + + vmlal.u32 $D0,$R4,${S1}[1] + vmlal.u32 $D1,$R0,${R1}[1] + vmlal.u32 $D2,$R1,${R1}[1] + vmlal.u32 $D3,$R2,${R1}[1] + vmlal.u32 $D4,$R3,${R1}[1] + + vmlal.u32 $D0,$R3,${S2}[1] + vmlal.u32 $D1,$R4,${S2}[1] + vmlal.u32 $D3,$R1,${R2}[1] + vmlal.u32 $D2,$R0,${R2}[1] + vmlal.u32 $D4,$R2,${R2}[1] + + vmlal.u32 $D0,$R2,${S3}[1] + vmlal.u32 $D3,$R0,${R3}[1] + vmlal.u32 $D1,$R3,${S3}[1] + vmlal.u32 $D2,$R4,${S3}[1] + vmlal.u32 $D4,$R1,${R3}[1] + + vmlal.u32 $D3,$R4,${S4}[1] + vmlal.u32 $D0,$R1,${S4}[1] + vmlal.u32 $D1,$R2,${S4}[1] + vmlal.u32 $D2,$R3,${S4}[1] + vmlal.u32 $D4,$R0,${R4}[1] + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein + @ and P. Schwabe + @ + @ H0>>+H1>>+H2>>+H3>>+H4 + @ H3>>+H4>>*5+H0>>+H1 + @ + @ Trivia. + @ + @ Result of multiplication of n-bit number by m-bit number is + @ n+m bits wide. However! Even though 2^n is a n+1-bit number, + @ m-bit number multiplied by 2^n is still n+m bits wide. + @ + @ Sum of two n-bit numbers is n+1 bits wide, sum of three - n+2, + @ and so is sum of four. 
Sum of 2^m n-m-bit numbers and n-bit + @ one is n+1 bits wide. + @ + @ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that + @ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4 + @ can be 27. However! In cases when their width exceeds 26 bits + @ they are limited by 2^26+2^6. This in turn means that *sum* + @ of the products with these values can still be viewed as sum + @ of 52-bit numbers as long as the amount of addends is not a + @ power of 2. For example, + @ + @ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4, + @ + @ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or + @ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than + @ 8 * (2^52) or 2^55. However, the value is then multiplied + @ by 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12), + @ which is less than 32 * (2^52) or 2^57. And when processing + @ data we are looking at triple as many addends... + @ + @ In key setup procedure pre-reduced H0 is limited by 5*4+1 and + @ 5*H4 - by 5*5 52-bit addends, or 57 bits. But when hashing the + @ input H0 is limited by (5*4+1)*3 addends, or 58 bits, while + @ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? vmlal.u32 + @ instruction accepts 2x32-bit input and writes 2x64-bit result. + @ This means that result of reduction has to be compressed upon + @ loop wrap-around. This can be done in the process of reduction + @ to minimize amount of instructions [as well as amount of + @ 128-bit instructions, which benefits low-end processors], but + @ one has to watch for H2 (which is narrower than H0) and 5*H4 + @ not being wider than 58 bits, so that result of right shift + @ by 26 bits fits in 32 bits. This is also useful on x86, + @ because it allows using paddd in place of paddq, which + @ benefits Atom, where paddq is ridiculously slow.
+ + vshr.u64 $T0,$D3,#26 + vmovn.i64 $D3#lo,$D3 + vshr.u64 $T1,$D0,#26 + vmovn.i64 $D0#lo,$D0 + vadd.i64 $D4,$D4,$T0 @ h3 -> h4 + vbic.i32 $D3#lo,#0xfc000000 @ &=0x03ffffff + vadd.i64 $D1,$D1,$T1 @ h0 -> h1 + vbic.i32 $D0#lo,#0xfc000000 + + vshrn.u64 $T0#lo,$D4,#26 + vmovn.i64 $D4#lo,$D4 + vshr.u64 $T1,$D1,#26 + vmovn.i64 $D1#lo,$D1 + vadd.i64 $D2,$D2,$T1 @ h1 -> h2 + vbic.i32 $D4#lo,#0xfc000000 + vbic.i32 $D1#lo,#0xfc000000 + + vadd.i32 $D0#lo,$D0#lo,$T0#lo + vshl.u32 $T0#lo,$T0#lo,#2 + vshrn.u64 $T1#lo,$D2,#26 + vmovn.i64 $D2#lo,$D2 + vadd.i32 $D0#lo,$D0#lo,$T0#lo @ h4 -> h0 + vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3 + vbic.i32 $D2#lo,#0xfc000000 + + vshr.u32 $T0#lo,$D0#lo,#26 + vbic.i32 $D0#lo,#0xfc000000 + vshr.u32 $T1#lo,$D3#lo,#26 + vbic.i32 $D3#lo,#0xfc000000 + vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1 + vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4 + + subs $zeros,$zeros,#1 + beq .Lsquare_break_neon + + add $tbl0,$ctx,#(48+0*9*4) + add $tbl1,$ctx,#(48+1*9*4) + + vtrn.32 $R0,$D0#lo @ r^2:r^1 + vtrn.32 $R2,$D2#lo + vtrn.32 $R3,$D3#lo + vtrn.32 $R1,$D1#lo + vtrn.32 $R4,$D4#lo + + vshl.u32 $S2,$R2,#2 @ *5 + vshl.u32 $S3,$R3,#2 + vshl.u32 $S1,$R1,#2 + vshl.u32 $S4,$R4,#2 + vadd.i32 $S2,$S2,$R2 + vadd.i32 $S1,$S1,$R1 + vadd.i32 $S3,$S3,$R3 + vadd.i32 $S4,$S4,$R4 + + vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! + vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! + vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! + vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! 
+ vst1.32 {${S4}[0]},[$tbl0,:32] + vst1.32 {${S4}[1]},[$tbl1,:32] + + b .Lsquare_neon + +.align 4 +.Lsquare_break_neon: + add $tbl0,$ctx,#(48+2*4*9) + add $tbl1,$ctx,#(48+3*4*9) + + vmov $R0,$D0#lo @ r^4:r^3 + vshl.u32 $S1,$D1#lo,#2 @ *5 + vmov $R1,$D1#lo + vshl.u32 $S2,$D2#lo,#2 + vmov $R2,$D2#lo + vshl.u32 $S3,$D3#lo,#2 + vmov $R3,$D3#lo + vshl.u32 $S4,$D4#lo,#2 + vmov $R4,$D4#lo + vadd.i32 $S1,$S1,$D1#lo + vadd.i32 $S2,$S2,$D2#lo + vadd.i32 $S3,$S3,$D3#lo + vadd.i32 $S4,$S4,$D4#lo + + vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! + vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! + vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! + vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! + vst1.32 {${S4}[0]},[$tbl0] + vst1.32 {${S4}[1]},[$tbl1] + + ret @ bx lr +.size poly1305_init_neon,.-poly1305_init_neon + +.type poly1305_blocks_neon,%function +.align 5 +poly1305_blocks_neon: + ldr ip,[$ctx,#36] @ is_base2_26 + ands $len,$len,#-16 + beq .Lno_data_neon + + cmp $len,#64 + bhs .Lenter_neon + tst ip,ip @ is_base2_26? + beq .Lpoly1305_blocks + +.Lenter_neon: + stmdb sp!,{r4-r7} + vstmdb sp!,{d8-d15} @ ABI specification says so + + tst ip,ip @ is_base2_26? 
+ bne .Lbase2_26_neon + + stmdb sp!,{r1-r3,lr} + bl poly1305_init_neon + + ldr r4,[$ctx,#0] @ load hash value base 2^32 + ldr r5,[$ctx,#4] + ldr r6,[$ctx,#8] + ldr r7,[$ctx,#12] + ldr ip,[$ctx,#16] + + and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26 + mov r3,r4,lsr#26 + veor $D0#lo,$D0#lo,$D0#lo + mov r4,r5,lsr#20 + orr r3,r3,r5,lsl#6 + veor $D1#lo,$D1#lo,$D1#lo + mov r5,r6,lsr#14 + orr r4,r4,r6,lsl#12 + veor $D2#lo,$D2#lo,$D2#lo + mov r6,r7,lsr#8 + orr r5,r5,r7,lsl#18 + veor $D3#lo,$D3#lo,$D3#lo + and r3,r3,#0x03ffffff + orr r6,r6,ip,lsl#24 + veor $D4#lo,$D4#lo,$D4#lo + and r4,r4,#0x03ffffff + mov r1,#1 + and r5,r5,#0x03ffffff + str r1,[$ctx,#36] @ is_base2_26 + + vmov.32 $D0#lo[0],r2 + vmov.32 $D1#lo[0],r3 + vmov.32 $D2#lo[0],r4 + vmov.32 $D3#lo[0],r5 + vmov.32 $D4#lo[0],r6 + adr $zeros,.Lzeros + + ldmia sp!,{r1-r3,lr} + b .Lbase2_32_neon + +.align 4 +.Lbase2_26_neon: + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ load hash value + + veor $D0#lo,$D0#lo,$D0#lo + veor $D1#lo,$D1#lo,$D1#lo + veor $D2#lo,$D2#lo,$D2#lo + veor $D3#lo,$D3#lo,$D3#lo + veor $D4#lo,$D4#lo,$D4#lo + vld4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]! + adr $zeros,.Lzeros + vld1.32 {$D4#lo[0]},[$ctx] + sub $ctx,$ctx,#16 @ rewind + +.Lbase2_32_neon: + add $in2,$inp,#32 + mov $padbit,$padbit,lsl#24 + tst $len,#31 + beq .Leven + + vld4.32 {$H0#lo[0],$H1#lo[0],$H2#lo[0],$H3#lo[0]},[$inp]! 
+ vmov.32 $H4#lo[0],$padbit + sub $len,$len,#16 + add $in2,$inp,#32 + +# ifdef __ARMEB__ + vrev32.8 $H0,$H0 + vrev32.8 $H3,$H3 + vrev32.8 $H1,$H1 + vrev32.8 $H2,$H2 +# endif + vsri.u32 $H4#lo,$H3#lo,#8 @ base 2^32 -> base 2^26 + vshl.u32 $H3#lo,$H3#lo,#18 + + vsri.u32 $H3#lo,$H2#lo,#14 + vshl.u32 $H2#lo,$H2#lo,#12 + vadd.i32 $H4#hi,$H4#lo,$D4#lo @ add hash value and move to #hi + + vbic.i32 $H3#lo,#0xfc000000 + vsri.u32 $H2#lo,$H1#lo,#20 + vshl.u32 $H1#lo,$H1#lo,#6 + + vbic.i32 $H2#lo,#0xfc000000 + vsri.u32 $H1#lo,$H0#lo,#26 + vadd.i32 $H3#hi,$H3#lo,$D3#lo + + vbic.i32 $H0#lo,#0xfc000000 + vbic.i32 $H1#lo,#0xfc000000 + vadd.i32 $H2#hi,$H2#lo,$D2#lo + + vadd.i32 $H0#hi,$H0#lo,$D0#lo + vadd.i32 $H1#hi,$H1#lo,$D1#lo + + mov $tbl1,$zeros + add $tbl0,$ctx,#48 + + cmp $len,$len + b .Long_tail + +.align 4 +.Leven: + subs $len,$len,#64 + it lo + movlo $in2,$zeros + + vmov.i32 $H4,#1<<24 @ padbit, yes, always + vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1] + add $inp,$inp,#64 + vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0) + add $in2,$in2,#64 + itt hi + addhi $tbl1,$ctx,#(48+1*9*4) + addhi $tbl0,$ctx,#(48+3*9*4) + +# ifdef __ARMEB__ + vrev32.8 $H0,$H0 + vrev32.8 $H3,$H3 + vrev32.8 $H1,$H1 + vrev32.8 $H2,$H2 +# endif + vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26 + vshl.u32 $H3,$H3,#18 + + vsri.u32 $H3,$H2,#14 + vshl.u32 $H2,$H2,#12 + + vbic.i32 $H3,#0xfc000000 + vsri.u32 $H2,$H1,#20 + vshl.u32 $H1,$H1,#6 + + vbic.i32 $H2,#0xfc000000 + vsri.u32 $H1,$H0,#26 + + vbic.i32 $H0,#0xfc000000 + vbic.i32 $H1,#0xfc000000 + + bls .Lskip_loop + + vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^2 + vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4 + vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! + vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! 
+ b .Loop_neon + +.align 5 +.Loop_neon: + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 + @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r + @ \___________________/ + @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2 + @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r + @ \___________________/ \____________________/ + @ + @ Note that we start with inp[2:3]*r^2. This is because it + @ doesn't depend on reduction in previous iteration. + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 + @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 + @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 + @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 + @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ inp[2:3]*r^2 + + vadd.i32 $H2#lo,$H2#lo,$D2#lo @ accumulate inp[0:1] + vmull.u32 $D2,$H2#hi,${R0}[1] + vadd.i32 $H0#lo,$H0#lo,$D0#lo + vmull.u32 $D0,$H0#hi,${R0}[1] + vadd.i32 $H3#lo,$H3#lo,$D3#lo + vmull.u32 $D3,$H3#hi,${R0}[1] + vmlal.u32 $D2,$H1#hi,${R1}[1] + vadd.i32 $H1#lo,$H1#lo,$D1#lo + vmull.u32 $D1,$H1#hi,${R0}[1] + + vadd.i32 $H4#lo,$H4#lo,$D4#lo + vmull.u32 $D4,$H4#hi,${R0}[1] + subs $len,$len,#64 + vmlal.u32 $D0,$H4#hi,${S1}[1] + it lo + movlo $in2,$zeros + vmlal.u32 $D3,$H2#hi,${R1}[1] + vld1.32 ${S4}[1],[$tbl1,:32] + vmlal.u32 $D1,$H0#hi,${R1}[1] + vmlal.u32 $D4,$H3#hi,${R1}[1] + + vmlal.u32 $D0,$H3#hi,${S2}[1] + vmlal.u32 $D3,$H1#hi,${R2}[1] + vmlal.u32 $D4,$H2#hi,${R2}[1] + vmlal.u32 $D1,$H4#hi,${S2}[1] + vmlal.u32 $D2,$H0#hi,${R2}[1] + + vmlal.u32 $D3,$H0#hi,${R3}[1] + vmlal.u32 $D0,$H2#hi,${S3}[1] + vmlal.u32 $D4,$H1#hi,${R3}[1] + vmlal.u32 $D1,$H3#hi,${S3}[1] + vmlal.u32 $D2,$H4#hi,${S3}[1] + + vmlal.u32 $D3,$H4#hi,${S4}[1] + vmlal.u32 $D0,$H1#hi,${S4}[1] + vmlal.u32 $D4,$H0#hi,${R4}[1] + vmlal.u32 $D1,$H2#hi,${S4}[1] + vmlal.u32 
$D2,$H3#hi,${S4}[1] + + vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0) + add $in2,$in2,#64 + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ (hash+inp[0:1])*r^4 and accumulate + + vmlal.u32 $D3,$H3#lo,${R0}[0] + vmlal.u32 $D0,$H0#lo,${R0}[0] + vmlal.u32 $D4,$H4#lo,${R0}[0] + vmlal.u32 $D1,$H1#lo,${R0}[0] + vmlal.u32 $D2,$H2#lo,${R0}[0] + vld1.32 ${S4}[0],[$tbl0,:32] + + vmlal.u32 $D3,$H2#lo,${R1}[0] + vmlal.u32 $D0,$H4#lo,${S1}[0] + vmlal.u32 $D4,$H3#lo,${R1}[0] + vmlal.u32 $D1,$H0#lo,${R1}[0] + vmlal.u32 $D2,$H1#lo,${R1}[0] + + vmlal.u32 $D3,$H1#lo,${R2}[0] + vmlal.u32 $D0,$H3#lo,${S2}[0] + vmlal.u32 $D4,$H2#lo,${R2}[0] + vmlal.u32 $D1,$H4#lo,${S2}[0] + vmlal.u32 $D2,$H0#lo,${R2}[0] + + vmlal.u32 $D3,$H0#lo,${R3}[0] + vmlal.u32 $D0,$H2#lo,${S3}[0] + vmlal.u32 $D4,$H1#lo,${R3}[0] + vmlal.u32 $D1,$H3#lo,${S3}[0] + vmlal.u32 $D3,$H4#lo,${S4}[0] + + vmlal.u32 $D2,$H4#lo,${S3}[0] + vmlal.u32 $D0,$H1#lo,${S4}[0] + vmlal.u32 $D4,$H0#lo,${R4}[0] + vmov.i32 $H4,#1<<24 @ padbit, yes, always + vmlal.u32 $D1,$H2#lo,${S4}[0] + vmlal.u32 $D2,$H3#lo,${S4}[0] + + vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1] + add $inp,$inp,#64 +# ifdef __ARMEB__ + vrev32.8 $H0,$H0 + vrev32.8 $H1,$H1 + vrev32.8 $H2,$H2 + vrev32.8 $H3,$H3 +# endif + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ lazy reduction interleaved with base 2^32 -> base 2^26 of + @ inp[0:3] previously loaded to $H0-$H3 and smashed to $H0-$H4. 
+ + vshr.u64 $T0,$D3,#26 + vmovn.i64 $D3#lo,$D3 + vshr.u64 $T1,$D0,#26 + vmovn.i64 $D0#lo,$D0 + vadd.i64 $D4,$D4,$T0 @ h3 -> h4 + vbic.i32 $D3#lo,#0xfc000000 + vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26 + vadd.i64 $D1,$D1,$T1 @ h0 -> h1 + vshl.u32 $H3,$H3,#18 + vbic.i32 $D0#lo,#0xfc000000 + + vshrn.u64 $T0#lo,$D4,#26 + vmovn.i64 $D4#lo,$D4 + vshr.u64 $T1,$D1,#26 + vmovn.i64 $D1#lo,$D1 + vadd.i64 $D2,$D2,$T1 @ h1 -> h2 + vsri.u32 $H3,$H2,#14 + vbic.i32 $D4#lo,#0xfc000000 + vshl.u32 $H2,$H2,#12 + vbic.i32 $D1#lo,#0xfc000000 + + vadd.i32 $D0#lo,$D0#lo,$T0#lo + vshl.u32 $T0#lo,$T0#lo,#2 + vbic.i32 $H3,#0xfc000000 + vshrn.u64 $T1#lo,$D2,#26 + vmovn.i64 $D2#lo,$D2 + vaddl.u32 $D0,$D0#lo,$T0#lo @ h4 -> h0 [widen for a sec] + vsri.u32 $H2,$H1,#20 + vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3 + vshl.u32 $H1,$H1,#6 + vbic.i32 $D2#lo,#0xfc000000 + vbic.i32 $H2,#0xfc000000 + + vshrn.u64 $T0#lo,$D0,#26 @ re-narrow + vmovn.i64 $D0#lo,$D0 + vsri.u32 $H1,$H0,#26 + vbic.i32 $H0,#0xfc000000 + vshr.u32 $T1#lo,$D3#lo,#26 + vbic.i32 $D3#lo,#0xfc000000 + vbic.i32 $D0#lo,#0xfc000000 + vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1 + vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4 + vbic.i32 $H1,#0xfc000000 + + bhi .Loop_neon + +.Lskip_loop: + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1 + + add $tbl1,$ctx,#(48+0*9*4) + add $tbl0,$ctx,#(48+1*9*4) + adds $len,$len,#32 + it ne + movne $len,#0 + bne .Long_tail + + vadd.i32 $H2#hi,$H2#lo,$D2#lo @ add hash value and move to #hi + vadd.i32 $H0#hi,$H0#lo,$D0#lo + vadd.i32 $H3#hi,$H3#lo,$D3#lo + vadd.i32 $H1#hi,$H1#lo,$D1#lo + vadd.i32 $H4#hi,$H4#lo,$D4#lo + +.Long_tail: + vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^1 + vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! 
@ load r^2 + + vadd.i32 $H2#lo,$H2#lo,$D2#lo @ can be redundant + vmull.u32 $D2,$H2#hi,$R0 + vadd.i32 $H0#lo,$H0#lo,$D0#lo + vmull.u32 $D0,$H0#hi,$R0 + vadd.i32 $H3#lo,$H3#lo,$D3#lo + vmull.u32 $D3,$H3#hi,$R0 + vadd.i32 $H1#lo,$H1#lo,$D1#lo + vmull.u32 $D1,$H1#hi,$R0 + vadd.i32 $H4#lo,$H4#lo,$D4#lo + vmull.u32 $D4,$H4#hi,$R0 + + vmlal.u32 $D0,$H4#hi,$S1 + vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! + vmlal.u32 $D3,$H2#hi,$R1 + vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! + vmlal.u32 $D1,$H0#hi,$R1 + vmlal.u32 $D4,$H3#hi,$R1 + vmlal.u32 $D2,$H1#hi,$R1 + + vmlal.u32 $D3,$H1#hi,$R2 + vld1.32 ${S4}[1],[$tbl1,:32] + vmlal.u32 $D0,$H3#hi,$S2 + vld1.32 ${S4}[0],[$tbl0,:32] + vmlal.u32 $D4,$H2#hi,$R2 + vmlal.u32 $D1,$H4#hi,$S2 + vmlal.u32 $D2,$H0#hi,$R2 + + vmlal.u32 $D3,$H0#hi,$R3 + it ne + addne $tbl1,$ctx,#(48+2*9*4) + vmlal.u32 $D0,$H2#hi,$S3 + it ne + addne $tbl0,$ctx,#(48+3*9*4) + vmlal.u32 $D4,$H1#hi,$R3 + vmlal.u32 $D1,$H3#hi,$S3 + vmlal.u32 $D2,$H4#hi,$S3 + + vmlal.u32 $D3,$H4#hi,$S4 + vorn $MASK,$MASK,$MASK @ all-ones, can be redundant + vmlal.u32 $D0,$H1#hi,$S4 + vshr.u64 $MASK,$MASK,#38 + vmlal.u32 $D4,$H0#hi,$R4 + vmlal.u32 $D1,$H2#hi,$S4 + vmlal.u32 $D2,$H3#hi,$S4 + + beq .Lshort_tail + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ (hash+inp[0:1])*r^4:r^3 and accumulate + + vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^3 + vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4 + + vmlal.u32 $D2,$H2#lo,$R0 + vmlal.u32 $D0,$H0#lo,$R0 + vmlal.u32 $D3,$H3#lo,$R0 + vmlal.u32 $D1,$H1#lo,$R0 + vmlal.u32 $D4,$H4#lo,$R0 + + vmlal.u32 $D0,$H4#lo,$S1 + vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! + vmlal.u32 $D3,$H2#lo,$R1 + vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! 
+ vmlal.u32 $D1,$H0#lo,$R1 + vmlal.u32 $D4,$H3#lo,$R1 + vmlal.u32 $D2,$H1#lo,$R1 + + vmlal.u32 $D3,$H1#lo,$R2 + vld1.32 ${S4}[1],[$tbl1,:32] + vmlal.u32 $D0,$H3#lo,$S2 + vld1.32 ${S4}[0],[$tbl0,:32] + vmlal.u32 $D4,$H2#lo,$R2 + vmlal.u32 $D1,$H4#lo,$S2 + vmlal.u32 $D2,$H0#lo,$R2 + + vmlal.u32 $D3,$H0#lo,$R3 + vmlal.u32 $D0,$H2#lo,$S3 + vmlal.u32 $D4,$H1#lo,$R3 + vmlal.u32 $D1,$H3#lo,$S3 + vmlal.u32 $D2,$H4#lo,$S3 + + vmlal.u32 $D3,$H4#lo,$S4 + vorn $MASK,$MASK,$MASK @ all-ones + vmlal.u32 $D0,$H1#lo,$S4 + vshr.u64 $MASK,$MASK,#38 + vmlal.u32 $D4,$H0#lo,$R4 + vmlal.u32 $D1,$H2#lo,$S4 + vmlal.u32 $D2,$H3#lo,$S4 + +.Lshort_tail: + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ horizontal addition + + vadd.i64 $D3#lo,$D3#lo,$D3#hi + vadd.i64 $D0#lo,$D0#lo,$D0#hi + vadd.i64 $D4#lo,$D4#lo,$D4#hi + vadd.i64 $D1#lo,$D1#lo,$D1#hi + vadd.i64 $D2#lo,$D2#lo,$D2#hi + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ lazy reduction, but without narrowing + + vshr.u64 $T0,$D3,#26 + vand.i64 $D3,$D3,$MASK + vshr.u64 $T1,$D0,#26 + vand.i64 $D0,$D0,$MASK + vadd.i64 $D4,$D4,$T0 @ h3 -> h4 + vadd.i64 $D1,$D1,$T1 @ h0 -> h1 + + vshr.u64 $T0,$D4,#26 + vand.i64 $D4,$D4,$MASK + vshr.u64 $T1,$D1,#26 + vand.i64 $D1,$D1,$MASK + vadd.i64 $D2,$D2,$T1 @ h1 -> h2 + + vadd.i64 $D0,$D0,$T0 + vshl.u64 $T0,$T0,#2 + vshr.u64 $T1,$D2,#26 + vand.i64 $D2,$D2,$MASK + vadd.i64 $D0,$D0,$T0 @ h4 -> h0 + vadd.i64 $D3,$D3,$T1 @ h2 -> h3 + + vshr.u64 $T0,$D0,#26 + vand.i64 $D0,$D0,$MASK + vshr.u64 $T1,$D3,#26 + vand.i64 $D3,$D3,$MASK + vadd.i64 $D1,$D1,$T0 @ h0 -> h1 + vadd.i64 $D4,$D4,$T1 @ h3 -> h4 + + cmp $len,#0 + bne .Leven + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ store hash value + + vst4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]! 
+ vst1.32 {$D4#lo[0]},[$ctx] + + vldmia sp!,{d8-d15} @ epilogue + ldmia sp!,{r4-r7} +.Lno_data_neon: + ret @ bx lr +.size poly1305_blocks_neon,.-poly1305_blocks_neon + +.type poly1305_emit_neon,%function +.align 5 +poly1305_emit_neon: + ldr ip,[$ctx,#36] @ is_base2_26 + + stmdb sp!,{r4-r11} + + tst ip,ip + beq .Lpoly1305_emit_enter + + ldmia $ctx,{$h0-$h4} + eor $g0,$g0,$g0 + + adds $h0,$h0,$h1,lsl#26 @ base 2^26 -> base 2^32 + mov $h1,$h1,lsr#6 + adcs $h1,$h1,$h2,lsl#20 + mov $h2,$h2,lsr#12 + adcs $h2,$h2,$h3,lsl#14 + mov $h3,$h3,lsr#18 + adcs $h3,$h3,$h4,lsl#8 + adc $h4,$g0,$h4,lsr#24 @ can be partially reduced ... + + and $g0,$h4,#-4 @ ... so reduce + and $h4,$h3,#3 + add $g0,$g0,$g0,lsr#2 @ *= 5 + adds $h0,$h0,$g0 + adcs $h1,$h1,#0 + adcs $h2,$h2,#0 + adcs $h3,$h3,#0 + adc $h4,$h4,#0 + + adds $g0,$h0,#5 @ compare to modulus + adcs $g1,$h1,#0 + adcs $g2,$h2,#0 + adcs $g3,$h3,#0 + adc $g4,$h4,#0 + tst $g4,#4 @ did it carry/borrow? + + it ne + movne $h0,$g0 + ldr $g0,[$nonce,#0] + it ne + movne $h1,$g1 + ldr $g1,[$nonce,#4] + it ne + movne $h2,$g2 + ldr $g2,[$nonce,#8] + it ne + movne $h3,$g3 + ldr $g3,[$nonce,#12] + + adds $h0,$h0,$g0 @ accumulate nonce + adcs $h1,$h1,$g1 + adcs $h2,$h2,$g2 + adc $h3,$h3,$g3 + +# ifdef __ARMEB__ + rev $h0,$h0 + rev $h1,$h1 + rev $h2,$h2 + rev $h3,$h3 +# endif + str $h0,[$mac,#0] @ store the result + str $h1,[$mac,#4] + str $h2,[$mac,#8] + str $h3,[$mac,#12] + + ldmia sp!,{r4-r11} + ret @ bx lr +.size poly1305_emit_neon,.-poly1305_emit_neon + +.align 5 +.Lzeros: +.long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +.LOPENSSL_armcap: +.word OPENSSL_armcap_P-.Lpoly1305_init +#endif +___ +} } +$code.=<<___; +.asciz "Poly1305 for ARMv4/NEON, CRYPTOGAMS by " +.align 2 +#if __ARM_MAX_ARCH__>=7 +.comm OPENSSL_armcap_P,4,4 +#endif +___ + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/geo; + + s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or + s/\bret\b/bx lr/go or + s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile 
with -march=armv4 + + print $_,"\n"; +} +close STDOUT; # enforce flush diff --git a/crypto/poly1305/poly1305-arm.s b/crypto/poly1305/poly1305-arm.s new file mode 100644 index 0000000..1893360 --- /dev/null +++ b/crypto/poly1305/poly1305-arm.s @@ -0,0 +1,1127 @@ +/* SPDX-License-Identifier: OpenSSL OR (BSD-3-Clause OR GPL-2.0) + * + * Copyright (C) 2015-2018 Jason A. Donenfeld . All Rights Reserved. + * Copyright 2016 The OpenSSL Project Authors. All Rights Reserved. + */ + +//#include + +.text +#if defined(__thumb2__) +.syntax unified +.thumb +#else +.code 32 +#endif + +.align 5 +.globl poly1305_init_arm +.type poly1305_init_arm,%function +poly1305_init_arm: + stmdb sp!,{r4-r11} + + eor r3,r3,r3 + cmp r1,#0 + str r3,[r0,#0] @ zero hash value + str r3,[r0,#4] + str r3,[r0,#8] + str r3,[r0,#12] + str r3,[r0,#16] + str r3,[r0,#36] @ is_base2_26 + add r0,r0,#20 + +#ifdef __thumb2__ + it eq +#endif + moveq r0,#0 + beq .Lno_key + + ldrb r4,[r1,#0] + mov r10,#0x0fffffff + ldrb r5,[r1,#1] + and r3,r10,#-4 @ 0x0ffffffc + ldrb r6,[r1,#2] + ldrb r7,[r1,#3] + orr r4,r4,r5,lsl#8 + ldrb r5,[r1,#4] + orr r4,r4,r6,lsl#16 + ldrb r6,[r1,#5] + orr r4,r4,r7,lsl#24 + ldrb r7,[r1,#6] + and r4,r4,r10 + + ldrb r8,[r1,#7] + orr r5,r5,r6,lsl#8 + ldrb r6,[r1,#8] + orr r5,r5,r7,lsl#16 + ldrb r7,[r1,#9] + orr r5,r5,r8,lsl#24 + ldrb r8,[r1,#10] + and r5,r5,r3 + + ldrb r9,[r1,#11] + orr r6,r6,r7,lsl#8 + ldrb r7,[r1,#12] + orr r6,r6,r8,lsl#16 + ldrb r8,[r1,#13] + orr r6,r6,r9,lsl#24 + ldrb r9,[r1,#14] + and r6,r6,r3 + + ldrb r10,[r1,#15] + orr r7,r7,r8,lsl#8 + str r4,[r0,#0] + orr r7,r7,r9,lsl#16 + str r5,[r0,#4] + orr r7,r7,r10,lsl#24 + str r6,[r0,#8] + and r7,r7,r3 + str r7,[r0,#12] +.Lno_key: + ldmia sp!,{r4-r11} +#if __ARM_ARCH__ >= 5 + bx lr @ bx lr +#else + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + .word 0xe12fff1e @ interoperable with Thumb ISA:-) +#endif +.size poly1305_init_arm,.-poly1305_init_arm + +.align 5 +.globl poly1305_blocks_arm +.type 
poly1305_blocks_arm,%function +poly1305_blocks_arm: +.Lpoly1305_blocks_arm: + stmdb sp!,{r3-r11,lr} + + ands r2,r2,#-16 + beq .Lno_data + + cmp r3,#0 + add r2,r2,r1 @ end pointer + sub sp,sp,#32 + + ldmia r0,{r4-r12} @ load context + + str r0,[sp,#12] @ offload stuff + mov lr,r1 + str r2,[sp,#16] + str r10,[sp,#20] + str r11,[sp,#24] + str r12,[sp,#28] + b .Loop + +.Loop: +#if __ARM_ARCH__ < 7 + ldrb r0,[lr],#16 @ load input +#ifdef __thumb2__ + it hi +#endif + addhi r8,r8,#1 @ 1<<128 + ldrb r1,[lr,#-15] + ldrb r2,[lr,#-14] + ldrb r3,[lr,#-13] + orr r1,r0,r1,lsl#8 + ldrb r0,[lr,#-12] + orr r2,r1,r2,lsl#16 + ldrb r1,[lr,#-11] + orr r3,r2,r3,lsl#24 + ldrb r2,[lr,#-10] + adds r4,r4,r3 @ accumulate input + + ldrb r3,[lr,#-9] + orr r1,r0,r1,lsl#8 + ldrb r0,[lr,#-8] + orr r2,r1,r2,lsl#16 + ldrb r1,[lr,#-7] + orr r3,r2,r3,lsl#24 + ldrb r2,[lr,#-6] + adcs r5,r5,r3 + + ldrb r3,[lr,#-5] + orr r1,r0,r1,lsl#8 + ldrb r0,[lr,#-4] + orr r2,r1,r2,lsl#16 + ldrb r1,[lr,#-3] + orr r3,r2,r3,lsl#24 + ldrb r2,[lr,#-2] + adcs r6,r6,r3 + + ldrb r3,[lr,#-1] + orr r1,r0,r1,lsl#8 + str lr,[sp,#8] @ offload input pointer + orr r2,r1,r2,lsl#16 + add r10,r10,r10,lsr#2 + orr r3,r2,r3,lsl#24 +#else + ldr r0,[lr],#16 @ load input +#ifdef __thumb2__ + it hi +#endif + addhi r8,r8,#1 @ padbit + ldr r1,[lr,#-12] + ldr r2,[lr,#-8] + ldr r3,[lr,#-4] +#ifdef __ARMEB__ + rev r0,r0 + rev r1,r1 + rev r2,r2 + rev r3,r3 +#endif + adds r4,r4,r0 @ accumulate input + str lr,[sp,#8] @ offload input pointer + adcs r5,r5,r1 + add r10,r10,r10,lsr#2 + adcs r6,r6,r2 +#endif + add r11,r11,r11,lsr#2 + adcs r7,r7,r3 + add r12,r12,r12,lsr#2 + + umull r2,r3,r5,r9 + adc r8,r8,#0 + umull r0,r1,r4,r9 + umlal r2,r3,r8,r10 + umlal r0,r1,r7,r10 + ldr r10,[sp,#20] @ reload r10 + umlal r2,r3,r6,r12 + umlal r0,r1,r5,r12 + umlal r2,r3,r7,r11 + umlal r0,r1,r6,r11 + umlal r2,r3,r4,r10 + str r0,[sp,#0] @ future r4 + mul r0,r11,r8 + ldr r11,[sp,#24] @ reload r11 + adds r2,r2,r1 @ d1+=d0>>32 + eor r1,r1,r1 + adc lr,r3,#0 @ future r6 + 
str r2,[sp,#4] @ future r5 + + mul r2,r12,r8 + eor r3,r3,r3 + umlal r0,r1,r7,r12 + ldr r12,[sp,#28] @ reload r12 + umlal r2,r3,r7,r9 + umlal r0,r1,r6,r9 + umlal r2,r3,r6,r10 + umlal r0,r1,r5,r10 + umlal r2,r3,r5,r11 + umlal r0,r1,r4,r11 + umlal r2,r3,r4,r12 + ldr r4,[sp,#0] + mul r8,r9,r8 + ldr r5,[sp,#4] + + adds r6,lr,r0 @ d2+=d1>>32 + ldr lr,[sp,#8] @ reload input pointer + adc r1,r1,#0 + adds r7,r2,r1 @ d3+=d2>>32 + ldr r0,[sp,#16] @ reload end pointer + adc r3,r3,#0 + add r8,r8,r3 @ h4+=d3>>32 + + and r1,r8,#-4 + and r8,r8,#3 + add r1,r1,r1,lsr#2 @ *=5 + adds r4,r4,r1 + adcs r5,r5,#0 + adcs r6,r6,#0 + adcs r7,r7,#0 + adc r8,r8,#0 + + cmp r0,lr @ done yet? + bhi .Loop + + ldr r0,[sp,#12] + add sp,sp,#32 + stmia r0,{r4-r8} @ store the result + +.Lno_data: +#if __ARM_ARCH__ >= 5 + ldmia sp!,{r3-r11,pc} +#else + ldmia sp!,{r3-r11,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + .word 0xe12fff1e @ interoperable with Thumb ISA:-) +#endif +.size poly1305_blocks_arm,.-poly1305_blocks_arm + + +.align 5 +.globl poly1305_emit_arm +.type poly1305_emit_arm,%function +poly1305_emit_arm: + stmdb sp!,{r4-r11} +.Lpoly1305_emit_enter: + ldmia r0,{r3-r7} + adds r8,r3,#5 @ compare to modulus + adcs r9,r4,#0 + adcs r10,r5,#0 + adcs r11,r6,#0 + adc r7,r7,#0 + tst r7,#4 @ did it carry/borrow? 
+ +#ifdef __thumb2__ + it ne +#endif + movne r3,r8 + ldr r8,[r2,#0] +#ifdef __thumb2__ + it ne +#endif + movne r4,r9 + ldr r9,[r2,#4] +#ifdef __thumb2__ + it ne +#endif + movne r5,r10 + ldr r10,[r2,#8] +#ifdef __thumb2__ + it ne +#endif + movne r6,r11 + ldr r11,[r2,#12] + + adds r3,r3,r8 + adcs r4,r4,r9 + adcs r5,r5,r10 + adc r6,r6,r11 + +#if __ARM_ARCH__ >= 7 +#ifdef __ARMEB__ + rev r3,r3 + rev r4,r4 + rev r5,r5 + rev r6,r6 +#endif + str r3,[r1,#0] + str r4,[r1,#4] + str r5,[r1,#8] + str r6,[r1,#12] +#else + strb r3,[r1,#0] + mov r3,r3,lsr#8 + strb r4,[r1,#4] + mov r4,r4,lsr#8 + strb r5,[r1,#8] + mov r5,r5,lsr#8 + strb r6,[r1,#12] + mov r6,r6,lsr#8 + + strb r3,[r1,#1] + mov r3,r3,lsr#8 + strb r4,[r1,#5] + mov r4,r4,lsr#8 + strb r5,[r1,#9] + mov r5,r5,lsr#8 + strb r6,[r1,#13] + mov r6,r6,lsr#8 + + strb r3,[r1,#2] + mov r3,r3,lsr#8 + strb r4,[r1,#6] + mov r4,r4,lsr#8 + strb r5,[r1,#10] + mov r5,r5,lsr#8 + strb r6,[r1,#14] + mov r6,r6,lsr#8 + + strb r3,[r1,#3] + strb r4,[r1,#7] + strb r5,[r1,#11] + strb r6,[r1,#15] +#endif + ldmia sp!,{r4-r11} +#if __ARM_ARCH__ >= 5 + bx lr @ bx lr +#else + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + .word 0xe12fff1e @ interoperable with Thumb ISA:-) +#endif +.size poly1305_emit_arm,.-poly1305_emit_arm + + +#if __ARM_ARCH__ >= 7 +.fpu neon + +.align 5 +.type poly1305_init_neon,%function +poly1305_init_neon: +.Lpoly1305_init_neon: + ldr r4,[r0,#20] @ load key base 2^32 + ldr r5,[r0,#24] + ldr r6,[r0,#28] + ldr r7,[r0,#32] + + and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26 + mov r3,r4,lsr#26 + mov r4,r5,lsr#20 + orr r3,r3,r5,lsl#6 + mov r5,r6,lsr#14 + orr r4,r4,r6,lsl#12 + mov r6,r7,lsr#8 + orr r5,r5,r7,lsl#18 + and r3,r3,#0x03ffffff + and r4,r4,#0x03ffffff + and r5,r5,#0x03ffffff + + vdup.32 d0,r2 @ r^1 in both lanes + add r2,r3,r3,lsl#2 @ *5 + vdup.32 d1,r3 + add r3,r4,r4,lsl#2 + vdup.32 d2,r2 + vdup.32 d3,r4 + add r4,r5,r5,lsl#2 + vdup.32 d4,r3 + vdup.32 d5,r5 + add r5,r6,r6,lsl#2 + vdup.32 d6,r4 + vdup.32 d7,r6 + 
vdup.32 d8,r5 + + mov r5,#2 @ counter + +.Lsquare_neon: + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 + @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 + @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 + @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 + @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 + + vmull.u32 q5,d0,d0[1] + vmull.u32 q6,d1,d0[1] + vmull.u32 q7,d3,d0[1] + vmull.u32 q8,d5,d0[1] + vmull.u32 q9,d7,d0[1] + + vmlal.u32 q5,d7,d2[1] + vmlal.u32 q6,d0,d1[1] + vmlal.u32 q7,d1,d1[1] + vmlal.u32 q8,d3,d1[1] + vmlal.u32 q9,d5,d1[1] + + vmlal.u32 q5,d5,d4[1] + vmlal.u32 q6,d7,d4[1] + vmlal.u32 q8,d1,d3[1] + vmlal.u32 q7,d0,d3[1] + vmlal.u32 q9,d3,d3[1] + + vmlal.u32 q5,d3,d6[1] + vmlal.u32 q8,d0,d5[1] + vmlal.u32 q6,d5,d6[1] + vmlal.u32 q7,d7,d6[1] + vmlal.u32 q9,d1,d5[1] + + vmlal.u32 q8,d7,d8[1] + vmlal.u32 q5,d1,d8[1] + vmlal.u32 q6,d3,d8[1] + vmlal.u32 q7,d5,d8[1] + vmlal.u32 q9,d0,d7[1] + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein + @ and P. Schwabe + @ + @ H0>>+H1>>+H2>>+H3>>+H4 + @ H3>>+H4>>*5+H0>>+H1 + @ + @ Trivia. + @ + @ Result of multiplication of n-bit number by m-bit number is + @ n+m bits wide. However! Even though 2^n is a n+1-bit number, + @ m-bit number multiplied by 2^n is still n+m bits wide. + @ + @ Sum of two n-bit numbers is n+1 bits wide, sum of three - n+2, + @ and so is sum of four. Sum of 2^m n-m-bit numbers and n-bit + @ one is n+1 bits wide. + @ + @ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that + @ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4 + @ can be 27. However! In cases when their width exceeds 26 bits + @ they are limited by 2^26+2^6. This in turn means that *sum* + @ of the products with these values can still be viewed as sum + @ of 52-bit numbers as long as the amount of addends is not a + @ power of 2. 
For example, + @ + @ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4, + @ + @ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or + @ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than + @ 8 * (2^52) or 2^55. However, the value is then multiplied + @ by 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12), + @ which is less than 32 * (2^52) or 2^57. And when processing + @ data we are looking at triple as many addends... + @ + @ In key setup procedure pre-reduced H0 is limited by 5*4+1 and + @ 5*H4 - by 5*5 52-bit addends, or 57 bits. But when hashing the + @ input H0 is limited by (5*4+1)*3 addends, or 58 bits, while + @ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? vmlal.u32 + @ instruction accepts 2x32-bit input and writes 2x64-bit result. + @ This means that result of reduction has to be compressed upon + @ loop wrap-around. This can be done in the process of reduction + @ to minimize amount of instructions [as well as amount of + @ 128-bit instructions, which benefits low-end processors], but + @ one has to watch for H2 (which is narrower than H0) and 5*H4 + @ not being wider than 58 bits, so that result of right shift + @ by 26 bits fits in 32 bits. This is also useful on x86, + @ because it allows using paddd in place of paddq, which + @ benefits Atom, where paddq is ridiculously slow. 
+ + vshr.u64 q15,q8,#26 + vmovn.i64 d16,q8 + vshr.u64 q4,q5,#26 + vmovn.i64 d10,q5 + vadd.i64 q9,q9,q15 @ h3 -> h4 + vbic.i32 d16,#0xfc000000 @ &=0x03ffffff + vadd.i64 q6,q6,q4 @ h0 -> h1 + vbic.i32 d10,#0xfc000000 + + vshrn.u64 d30,q9,#26 + vmovn.i64 d18,q9 + vshr.u64 q4,q6,#26 + vmovn.i64 d12,q6 + vadd.i64 q7,q7,q4 @ h1 -> h2 + vbic.i32 d18,#0xfc000000 + vbic.i32 d12,#0xfc000000 + + vadd.i32 d10,d10,d30 + vshl.u32 d30,d30,#2 + vshrn.u64 d8,q7,#26 + vmovn.i64 d14,q7 + vadd.i32 d10,d10,d30 @ h4 -> h0 + vadd.i32 d16,d16,d8 @ h2 -> h3 + vbic.i32 d14,#0xfc000000 + + vshr.u32 d30,d10,#26 + vbic.i32 d10,#0xfc000000 + vshr.u32 d8,d16,#26 + vbic.i32 d16,#0xfc000000 + vadd.i32 d12,d12,d30 @ h0 -> h1 + vadd.i32 d18,d18,d8 @ h3 -> h4 + + subs r5,r5,#1 + beq .Lsquare_break_neon + + add r6,r0,#(48+0*9*4) + add r7,r0,#(48+1*9*4) + + vtrn.32 d0,d10 @ r^2:r^1 + vtrn.32 d3,d14 + vtrn.32 d5,d16 + vtrn.32 d1,d12 + vtrn.32 d7,d18 + + vshl.u32 d4,d3,#2 @ *5 + vshl.u32 d6,d5,#2 + vshl.u32 d2,d1,#2 + vshl.u32 d8,d7,#2 + vadd.i32 d4,d4,d3 + vadd.i32 d2,d2,d1 + vadd.i32 d6,d6,d5 + vadd.i32 d8,d8,d7 + + vst4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]! + vst4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]! + vst4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]! + vst4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]! + vst1.32 {d8[0]},[r6,:32] + vst1.32 {d8[1]},[r7,:32] + + b .Lsquare_neon + +.align 4 +.Lsquare_break_neon: + add r6,r0,#(48+2*4*9) + add r7,r0,#(48+3*4*9) + + vmov d0,d10 @ r^4:r^3 + vshl.u32 d2,d12,#2 @ *5 + vmov d1,d12 + vshl.u32 d4,d14,#2 + vmov d3,d14 + vshl.u32 d6,d16,#2 + vmov d5,d16 + vshl.u32 d8,d18,#2 + vmov d7,d18 + vadd.i32 d2,d2,d12 + vadd.i32 d4,d4,d14 + vadd.i32 d6,d6,d16 + vadd.i32 d8,d8,d18 + + vst4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]! + vst4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]! + vst4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]! + vst4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]! 
+ vst1.32 {d8[0]},[r6] + vst1.32 {d8[1]},[r7] + + bx lr @ bx lr +.size poly1305_init_neon,.-poly1305_init_neon + +.align 5 +.globl poly1305_blocks_neon +.type poly1305_blocks_neon,%function +poly1305_blocks_neon: + ldr ip,[r0,#36] @ is_base2_26 + ands r2,r2,#-16 + beq .Lno_data_neon + + cmp r2,#64 + bhs .Lenter_neon + tst ip,ip @ is_base2_26? + beq .Lpoly1305_blocks_arm + +.Lenter_neon: + stmdb sp!,{r4-r7} + vstmdb sp!,{d8-d15} @ ABI specification says so + + tst ip,ip @ is_base2_26? + bne .Lbase2_26_neon + + stmdb sp!,{r1-r3,lr} + bl .Lpoly1305_init_neon + + ldr r4,[r0,#0] @ load hash value base 2^32 + ldr r5,[r0,#4] + ldr r6,[r0,#8] + ldr r7,[r0,#12] + ldr ip,[r0,#16] + + and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26 + mov r3,r4,lsr#26 + veor d10,d10,d10 + mov r4,r5,lsr#20 + orr r3,r3,r5,lsl#6 + veor d12,d12,d12 + mov r5,r6,lsr#14 + orr r4,r4,r6,lsl#12 + veor d14,d14,d14 + mov r6,r7,lsr#8 + orr r5,r5,r7,lsl#18 + veor d16,d16,d16 + and r3,r3,#0x03ffffff + orr r6,r6,ip,lsl#24 + veor d18,d18,d18 + and r4,r4,#0x03ffffff + mov r1,#1 + and r5,r5,#0x03ffffff + str r1,[r0,#36] @ is_base2_26 + + vmov.32 d10[0],r2 + vmov.32 d12[0],r3 + vmov.32 d14[0],r4 + vmov.32 d16[0],r5 + vmov.32 d18[0],r6 + adr r5,.Lzeros + + ldmia sp!,{r1-r3,lr} + b .Lbase2_32_neon + +.align 4 +.Lbase2_26_neon: + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ load hash value + + veor d10,d10,d10 + veor d12,d12,d12 + veor d14,d14,d14 + veor d16,d16,d16 + veor d18,d18,d18 + vld4.32 {d10[0],d12[0],d14[0],d16[0]},[r0]! + adr r5,.Lzeros + vld1.32 {d18[0]},[r0] + sub r0,r0,#16 @ rewind + +.Lbase2_32_neon: + add r4,r1,#32 + mov r3,r3,lsl#24 + tst r2,#31 + beq .Leven + + vld4.32 {d20[0],d22[0],d24[0],d26[0]},[r1]! 
+ vmov.32 d28[0],r3 + sub r2,r2,#16 + add r4,r1,#32 + +#ifdef __ARMEB__ + vrev32.8 q10,q10 + vrev32.8 q13,q13 + vrev32.8 q11,q11 + vrev32.8 q12,q12 +#endif + vsri.u32 d28,d26,#8 @ base 2^32 -> base 2^26 + vshl.u32 d26,d26,#18 + + vsri.u32 d26,d24,#14 + vshl.u32 d24,d24,#12 + vadd.i32 d29,d28,d18 @ add hash value and move to #hi + + vbic.i32 d26,#0xfc000000 + vsri.u32 d24,d22,#20 + vshl.u32 d22,d22,#6 + + vbic.i32 d24,#0xfc000000 + vsri.u32 d22,d20,#26 + vadd.i32 d27,d26,d16 + + vbic.i32 d20,#0xfc000000 + vbic.i32 d22,#0xfc000000 + vadd.i32 d25,d24,d14 + + vadd.i32 d21,d20,d10 + vadd.i32 d23,d22,d12 + + mov r7,r5 + add r6,r0,#48 + + cmp r2,r2 + b .Long_tail + +.align 4 +.Leven: + subs r2,r2,#64 + it lo + movlo r4,r5 + + vmov.i32 q14,#1<<24 @ padbit, yes, always + vld4.32 {d20,d22,d24,d26},[r1] @ inp[0:1] + add r1,r1,#64 + vld4.32 {d21,d23,d25,d27},[r4] @ inp[2:3] (or 0) + add r4,r4,#64 + itt hi + addhi r7,r0,#(48+1*9*4) + addhi r6,r0,#(48+3*9*4) + +#ifdef __ARMEB__ + vrev32.8 q10,q10 + vrev32.8 q13,q13 + vrev32.8 q11,q11 + vrev32.8 q12,q12 +#endif + vsri.u32 q14,q13,#8 @ base 2^32 -> base 2^26 + vshl.u32 q13,q13,#18 + + vsri.u32 q13,q12,#14 + vshl.u32 q12,q12,#12 + + vbic.i32 q13,#0xfc000000 + vsri.u32 q12,q11,#20 + vshl.u32 q11,q11,#6 + + vbic.i32 q12,#0xfc000000 + vsri.u32 q11,q10,#26 + + vbic.i32 q10,#0xfc000000 + vbic.i32 q11,#0xfc000000 + + bls .Lskip_loop + + vld4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]! @ load r^2 + vld4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]! @ load r^4 + vld4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]! + vld4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]! 
+ b .Loop_neon + +.align 5 +.Loop_neon: + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 + @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r + @ ___________________/ + @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2 + @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r + @ ___________________/ ____________________/ + @ + @ Note that we start with inp[2:3]*r^2. This is because it + @ doesn't depend on reduction in previous iteration. + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 + @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 + @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 + @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 + @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ inp[2:3]*r^2 + + vadd.i32 d24,d24,d14 @ accumulate inp[0:1] + vmull.u32 q7,d25,d0[1] + vadd.i32 d20,d20,d10 + vmull.u32 q5,d21,d0[1] + vadd.i32 d26,d26,d16 + vmull.u32 q8,d27,d0[1] + vmlal.u32 q7,d23,d1[1] + vadd.i32 d22,d22,d12 + vmull.u32 q6,d23,d0[1] + + vadd.i32 d28,d28,d18 + vmull.u32 q9,d29,d0[1] + subs r2,r2,#64 + vmlal.u32 q5,d29,d2[1] + it lo + movlo r4,r5 + vmlal.u32 q8,d25,d1[1] + vld1.32 d8[1],[r7,:32] + vmlal.u32 q6,d21,d1[1] + vmlal.u32 q9,d27,d1[1] + + vmlal.u32 q5,d27,d4[1] + vmlal.u32 q8,d23,d3[1] + vmlal.u32 q9,d25,d3[1] + vmlal.u32 q6,d29,d4[1] + vmlal.u32 q7,d21,d3[1] + + vmlal.u32 q8,d21,d5[1] + vmlal.u32 q5,d25,d6[1] + vmlal.u32 q9,d23,d5[1] + vmlal.u32 q6,d27,d6[1] + vmlal.u32 q7,d29,d6[1] + + vmlal.u32 q8,d29,d8[1] + vmlal.u32 q5,d23,d8[1] + vmlal.u32 q9,d21,d7[1] + vmlal.u32 q6,d25,d8[1] + vmlal.u32 q7,d27,d8[1] + + vld4.32 {d21,d23,d25,d27},[r4] @ inp[2:3] (or 0) + add r4,r4,#64 + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ (hash+inp[0:1])*r^4 and accumulate + + vmlal.u32 q8,d26,d0[0] + vmlal.u32 q5,d20,d0[0] + 
vmlal.u32 q9,d28,d0[0] + vmlal.u32 q6,d22,d0[0] + vmlal.u32 q7,d24,d0[0] + vld1.32 d8[0],[r6,:32] + + vmlal.u32 q8,d24,d1[0] + vmlal.u32 q5,d28,d2[0] + vmlal.u32 q9,d26,d1[0] + vmlal.u32 q6,d20,d1[0] + vmlal.u32 q7,d22,d1[0] + + vmlal.u32 q8,d22,d3[0] + vmlal.u32 q5,d26,d4[0] + vmlal.u32 q9,d24,d3[0] + vmlal.u32 q6,d28,d4[0] + vmlal.u32 q7,d20,d3[0] + + vmlal.u32 q8,d20,d5[0] + vmlal.u32 q5,d24,d6[0] + vmlal.u32 q9,d22,d5[0] + vmlal.u32 q6,d26,d6[0] + vmlal.u32 q8,d28,d8[0] + + vmlal.u32 q7,d28,d6[0] + vmlal.u32 q5,d22,d8[0] + vmlal.u32 q9,d20,d7[0] + vmov.i32 q14,#1<<24 @ padbit, yes, always + vmlal.u32 q6,d24,d8[0] + vmlal.u32 q7,d26,d8[0] + + vld4.32 {d20,d22,d24,d26},[r1] @ inp[0:1] + add r1,r1,#64 +#ifdef __ARMEB__ + vrev32.8 q10,q10 + vrev32.8 q11,q11 + vrev32.8 q12,q12 + vrev32.8 q13,q13 +#endif + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ lazy reduction interleaved with base 2^32 -> base 2^26 of + @ inp[0:3] previously loaded to q10-q13 and smashed to q10-q14. 
+ + vshr.u64 q15,q8,#26 + vmovn.i64 d16,q8 + vshr.u64 q4,q5,#26 + vmovn.i64 d10,q5 + vadd.i64 q9,q9,q15 @ h3 -> h4 + vbic.i32 d16,#0xfc000000 + vsri.u32 q14,q13,#8 @ base 2^32 -> base 2^26 + vadd.i64 q6,q6,q4 @ h0 -> h1 + vshl.u32 q13,q13,#18 + vbic.i32 d10,#0xfc000000 + + vshrn.u64 d30,q9,#26 + vmovn.i64 d18,q9 + vshr.u64 q4,q6,#26 + vmovn.i64 d12,q6 + vadd.i64 q7,q7,q4 @ h1 -> h2 + vsri.u32 q13,q12,#14 + vbic.i32 d18,#0xfc000000 + vshl.u32 q12,q12,#12 + vbic.i32 d12,#0xfc000000 + + vadd.i32 d10,d10,d30 + vshl.u32 d30,d30,#2 + vbic.i32 q13,#0xfc000000 + vshrn.u64 d8,q7,#26 + vmovn.i64 d14,q7 + vaddl.u32 q5,d10,d30 @ h4 -> h0 [widen for a sec] + vsri.u32 q12,q11,#20 + vadd.i32 d16,d16,d8 @ h2 -> h3 + vshl.u32 q11,q11,#6 + vbic.i32 d14,#0xfc000000 + vbic.i32 q12,#0xfc000000 + + vshrn.u64 d30,q5,#26 @ re-narrow + vmovn.i64 d10,q5 + vsri.u32 q11,q10,#26 + vbic.i32 q10,#0xfc000000 + vshr.u32 d8,d16,#26 + vbic.i32 d16,#0xfc000000 + vbic.i32 d10,#0xfc000000 + vadd.i32 d12,d12,d30 @ h0 -> h1 + vadd.i32 d18,d18,d8 @ h3 -> h4 + vbic.i32 q11,#0xfc000000 + + bhi .Loop_neon + +.Lskip_loop: + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1 + + add r7,r0,#(48+0*9*4) + add r6,r0,#(48+1*9*4) + adds r2,r2,#32 + it ne + movne r2,#0 + bne .Long_tail + + vadd.i32 d25,d24,d14 @ add hash value and move to #hi + vadd.i32 d21,d20,d10 + vadd.i32 d27,d26,d16 + vadd.i32 d23,d22,d12 + vadd.i32 d29,d28,d18 + +.Long_tail: + vld4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]! @ load r^1 + vld4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]! @ load r^2 + + vadd.i32 d24,d24,d14 @ can be redundant + vmull.u32 q7,d25,d0 + vadd.i32 d20,d20,d10 + vmull.u32 q5,d21,d0 + vadd.i32 d26,d26,d16 + vmull.u32 q8,d27,d0 + vadd.i32 d22,d22,d12 + vmull.u32 q6,d23,d0 + vadd.i32 d28,d28,d18 + vmull.u32 q9,d29,d0 + + vmlal.u32 q5,d29,d2 + vld4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]! + vmlal.u32 q8,d25,d1 + vld4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]! 
+ vmlal.u32 q6,d21,d1 + vmlal.u32 q9,d27,d1 + vmlal.u32 q7,d23,d1 + + vmlal.u32 q8,d23,d3 + vld1.32 d8[1],[r7,:32] + vmlal.u32 q5,d27,d4 + vld1.32 d8[0],[r6,:32] + vmlal.u32 q9,d25,d3 + vmlal.u32 q6,d29,d4 + vmlal.u32 q7,d21,d3 + + vmlal.u32 q8,d21,d5 + it ne + addne r7,r0,#(48+2*9*4) + vmlal.u32 q5,d25,d6 + it ne + addne r6,r0,#(48+3*9*4) + vmlal.u32 q9,d23,d5 + vmlal.u32 q6,d27,d6 + vmlal.u32 q7,d29,d6 + + vmlal.u32 q8,d29,d8 + vorn q0,q0,q0 @ all-ones, can be redundant + vmlal.u32 q5,d23,d8 + vshr.u64 q0,q0,#38 + vmlal.u32 q9,d21,d7 + vmlal.u32 q6,d25,d8 + vmlal.u32 q7,d27,d8 + + beq .Lshort_tail + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ (hash+inp[0:1])*r^4:r^3 and accumulate + + vld4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]! @ load r^3 + vld4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]! @ load r^4 + + vmlal.u32 q7,d24,d0 + vmlal.u32 q5,d20,d0 + vmlal.u32 q8,d26,d0 + vmlal.u32 q6,d22,d0 + vmlal.u32 q9,d28,d0 + + vmlal.u32 q5,d28,d2 + vld4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]! + vmlal.u32 q8,d24,d1 + vld4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]! 
+ vmlal.u32 q6,d20,d1 + vmlal.u32 q9,d26,d1 + vmlal.u32 q7,d22,d1 + + vmlal.u32 q8,d22,d3 + vld1.32 d8[1],[r7,:32] + vmlal.u32 q5,d26,d4 + vld1.32 d8[0],[r6,:32] + vmlal.u32 q9,d24,d3 + vmlal.u32 q6,d28,d4 + vmlal.u32 q7,d20,d3 + + vmlal.u32 q8,d20,d5 + vmlal.u32 q5,d24,d6 + vmlal.u32 q9,d22,d5 + vmlal.u32 q6,d26,d6 + vmlal.u32 q7,d28,d6 + + vmlal.u32 q8,d28,d8 + vorn q0,q0,q0 @ all-ones + vmlal.u32 q5,d22,d8 + vshr.u64 q0,q0,#38 + vmlal.u32 q9,d20,d7 + vmlal.u32 q6,d24,d8 + vmlal.u32 q7,d26,d8 + +.Lshort_tail: + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ horizontal addition + + vadd.i64 d16,d16,d17 + vadd.i64 d10,d10,d11 + vadd.i64 d18,d18,d19 + vadd.i64 d12,d12,d13 + vadd.i64 d14,d14,d15 + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ lazy reduction, but without narrowing + + vshr.u64 q15,q8,#26 + vand.i64 q8,q8,q0 + vshr.u64 q4,q5,#26 + vand.i64 q5,q5,q0 + vadd.i64 q9,q9,q15 @ h3 -> h4 + vadd.i64 q6,q6,q4 @ h0 -> h1 + + vshr.u64 q15,q9,#26 + vand.i64 q9,q9,q0 + vshr.u64 q4,q6,#26 + vand.i64 q6,q6,q0 + vadd.i64 q7,q7,q4 @ h1 -> h2 + + vadd.i64 q5,q5,q15 + vshl.u64 q15,q15,#2 + vshr.u64 q4,q7,#26 + vand.i64 q7,q7,q0 + vadd.i64 q5,q5,q15 @ h4 -> h0 + vadd.i64 q8,q8,q4 @ h2 -> h3 + + vshr.u64 q15,q5,#26 + vand.i64 q5,q5,q0 + vshr.u64 q4,q8,#26 + vand.i64 q8,q8,q0 + vadd.i64 q6,q6,q15 @ h0 -> h1 + vadd.i64 q9,q9,q4 @ h3 -> h4 + + cmp r2,#0 + bne .Leven + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ store hash value + + vst4.32 {d10[0],d12[0],d14[0],d16[0]},[r0]! 
+ vst1.32 {d18[0]},[r0] + + vldmia sp!,{d8-d15} @ epilogue + ldmia sp!,{r4-r7} +.Lno_data_neon: + bx lr @ bx lr +.size poly1305_blocks_neon,.-poly1305_blocks_neon + +.align 5 +.globl poly1305_emit_neon +.type poly1305_emit_neon,%function +poly1305_emit_neon: @ r0 = ctx, r1 = mac (out), r2 = nonce + ldr ip,[r0,#36] @ is_base2_26 + + stmdb sp!,{r4-r11} + + tst ip,ip + beq .Lpoly1305_emit_enter @ flag clear: hash is not stored in base 2^26 + + ldmia r0,{r3-r7} @ h0..h4, five base 2^26 limbs + eor r8,r8,r8 @ r8 = 0 + + adds r3,r3,r4,lsl#26 @ base 2^26 -> base 2^32 + mov r4,r4,lsr#6 + adcs r4,r4,r5,lsl#20 + mov r5,r5,lsr#12 + adcs r5,r5,r6,lsl#14 + mov r6,r6,lsr#18 + adcs r6,r6,r7,lsl#8 + adc r7,r8,r7,lsr#24 @ can be partially reduced ... + + and r8,r7,#-4 @ ... so reduce + and r7,r7,#3 @ keep bits 128..129 of the top word (must read r7, not r6) + add r8,r8,r8,lsr#2 @ *= 5 + adds r3,r3,r8 + adcs r4,r4,#0 + adcs r5,r5,#0 + adcs r6,r6,#0 + adc r7,r7,#0 + + adds r8,r3,#5 @ compare to modulus + adcs r9,r4,#0 + adcs r10,r5,#0 + adcs r11,r6,#0 + adc r7,r7,#0 + tst r7,#4 @ did it carry/borrow? + + it ne + movne r3,r8 + ldr r8,[r2,#0] + it ne + movne r4,r9 + ldr r9,[r2,#4] + it ne + movne r5,r10 + ldr r10,[r2,#8] + it ne + movne r6,r11 + ldr r11,[r2,#12] + + adds r3,r3,r8 @ accumulate nonce + adcs r4,r4,r9 + adcs r5,r5,r10 + adc r6,r6,r11 + +#ifdef __ARMEB__ + rev r3,r3 + rev r4,r4 + rev r5,r5 + rev r6,r6 +#endif + str r3,[r1,#0] @ store the result + str r4,[r1,#4] + str r5,[r1,#8] + str r6,[r1,#12] + + ldmia sp!,{r4-r11} + bx lr @ bx lr +.size poly1305_emit_neon,.-poly1305_emit_neon + +.align 5 +.Lzeros: +.long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +#endif diff --git a/crypto/poly1305/poly1305-arm64.pl b/crypto/poly1305/poly1305-arm64.pl new file mode 100644 index 0000000..ac06457 --- /dev/null +++ b/crypto/poly1305/poly1305-arm64.pl @@ -0,0 +1,944 @@ +#! /usr/bin/env perl +# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. 
You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# This module implements Poly1305 hash for ARMv8. +# +# June 2015 +# +# Numbers are cycles per processed byte with poly1305_blocks alone. +# +# IALU/gcc-4.9 NEON +# +# Apple A7 1.86/+5% 0.72 +# Cortex-A53 2.69/+58% 1.47 +# Cortex-A57 2.70/+7% 1.14 +# Denver 1.64/+50% 1.18(*) +# X-Gene 2.13/+68% 2.27 +# Mongoose 1.77/+75% 1.12 +# Kryo 2.70/+55% 1.13 +# +# (*) estimate based on resources availability is less than 1.0, +# i.e. measured result is worse than expected, presumably binary +# translator is not almighty; + +$flavour=shift; # 1st argument, forwarded to arm-xlate.pl +$output=shift; # 2nd argument, forwarded to arm-xlate.pl + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or +die "can't locate arm-xlate.pl"; + +open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!"; # fail loudly instead of printing into a dead handle +*STDOUT=*OUT; + +my ($ctx,$inp,$len,$padbit) = map("x$_",(0..3)); +my ($mac,$nonce)=($inp,$len); + +my ($h0,$h1,$h2,$r0,$r1,$s1,$t0,$t1,$d0,$d1,$d2) = map("x$_",(4..14)); + +$code.=<<___; +#include "arm_arch.h" + +.text + +// forward "declarations" are required for Apple +.extern OPENSSL_armcap_P +.globl poly1305_blocks +.globl poly1305_emit + +.globl poly1305_init +.type poly1305_init,%function +.align 5 +poly1305_init: + cmp $inp,xzr + stp xzr,xzr,[$ctx] // zero hash value + stp xzr,xzr,[$ctx,#16] // [along with is_base2_26] + + csel x0,xzr,x0,eq + b.eq .Lno_key + +#ifdef __ILP32__ + ldrsw $t1,.LOPENSSL_armcap_P +#else + ldr $t1,.LOPENSSL_armcap_P +#endif + adr 
$t0,.LOPENSSL_armcap_P + + ldp $r0,$r1,[$inp] // load key + mov $s1,#0xfffffffc0fffffff + movk $s1,#0x0fff,lsl#48 + ldr w17,[$t0,$t1] +#ifdef __ARMEB__ + rev $r0,$r0 // flip bytes + rev $r1,$r1 +#endif + and $r0,$r0,$s1 // &=0ffffffc0fffffff + and $s1,$s1,#-4 + and $r1,$r1,$s1 // &=0ffffffc0ffffffc + stp $r0,$r1,[$ctx,#32] // save key value + + tst w17,#ARMV7_NEON + + adr $d0,poly1305_blocks + adr $r0,poly1305_blocks_neon + adr $d1,poly1305_emit + adr $r1,poly1305_emit_neon + + csel $d0,$d0,$r0,eq + csel $d1,$d1,$r1,eq + +#ifdef __ILP32__ + stp w12,w13,[$len] +#else + stp $d0,$d1,[$len] +#endif + + mov x0,#1 +.Lno_key: + ret +.size poly1305_init,.-poly1305_init + +.type poly1305_blocks,%function +.align 5 +poly1305_blocks: + ands $len,$len,#-16 + b.eq .Lno_data + + ldp $h0,$h1,[$ctx] // load hash value + ldp $r0,$r1,[$ctx,#32] // load key value + ldr $h2,[$ctx,#16] + add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2) + b .Loop + +.align 5 +.Loop: + ldp $t0,$t1,[$inp],#16 // load input + sub $len,$len,#16 +#ifdef __ARMEB__ + rev $t0,$t0 + rev $t1,$t1 +#endif + adds $h0,$h0,$t0 // accumulate input + adcs $h1,$h1,$t1 + + mul $d0,$h0,$r0 // h0*r0 + adc $h2,$h2,$padbit + umulh $d1,$h0,$r0 + + mul $t0,$h1,$s1 // h1*5*r1 + umulh $t1,$h1,$s1 + + adds $d0,$d0,$t0 + mul $t0,$h0,$r1 // h0*r1 + adc $d1,$d1,$t1 + umulh $d2,$h0,$r1 + + adds $d1,$d1,$t0 + mul $t0,$h1,$r0 // h1*r0 + adc $d2,$d2,xzr + umulh $t1,$h1,$r0 + + adds $d1,$d1,$t0 + mul $t0,$h2,$s1 // h2*5*r1 + adc $d2,$d2,$t1 + mul $t1,$h2,$r0 // h2*r0 + + adds $d1,$d1,$t0 + adc $d2,$d2,$t1 + + and $t0,$d2,#-4 // final reduction + and $h2,$d2,#3 + add $t0,$t0,$d2,lsr#2 + adds $h0,$d0,$t0 + adcs $h1,$d1,xzr + adc $h2,$h2,xzr + + cbnz $len,.Loop + + stp $h0,$h1,[$ctx] // store hash value + str $h2,[$ctx,#16] + +.Lno_data: + ret +.size poly1305_blocks,.-poly1305_blocks + +.type poly1305_emit,%function +.align 5 +poly1305_emit: + ldp $h0,$h1,[$ctx] // load hash base 2^64 + ldr $h2,[$ctx,#16] + ldp $t0,$t1,[$nonce] // load nonce + + 
adds $d0,$h0,#5 // compare to modulus + adcs $d1,$h1,xzr + adc $d2,$h2,xzr + + tst $d2,#-4 // see if it's carried/borrowed + + csel $h0,$h0,$d0,eq + csel $h1,$h1,$d1,eq + +#ifdef __ARMEB__ + ror $t0,$t0,#32 // flip nonce words + ror $t1,$t1,#32 +#endif + adds $h0,$h0,$t0 // accumulate nonce + adc $h1,$h1,$t1 +#ifdef __ARMEB__ + rev $h0,$h0 // flip output bytes + rev $h1,$h1 +#endif + stp $h0,$h1,[$mac] // write result + + ret +.size poly1305_emit,.-poly1305_emit +___ +my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("v$_.4s",(0..8)); +my ($IN01_0,$IN01_1,$IN01_2,$IN01_3,$IN01_4) = map("v$_.2s",(9..13)); +my ($IN23_0,$IN23_1,$IN23_2,$IN23_3,$IN23_4) = map("v$_.2s",(14..18)); +my ($ACC0,$ACC1,$ACC2,$ACC3,$ACC4) = map("v$_.2d",(19..23)); +my ($H0,$H1,$H2,$H3,$H4) = map("v$_.2s",(24..28)); +my ($T0,$T1,$MASK) = map("v$_",(29..31)); + +my ($in2,$zeros)=("x16","x17"); +my $is_base2_26 = $zeros; # borrow + +$code.=<<___; +.type poly1305_mult,%function +.align 5 +poly1305_mult: + mul $d0,$h0,$r0 // h0*r0 + umulh $d1,$h0,$r0 + + mul $t0,$h1,$s1 // h1*5*r1 + umulh $t1,$h1,$s1 + + adds $d0,$d0,$t0 + mul $t0,$h0,$r1 // h0*r1 + adc $d1,$d1,$t1 + umulh $d2,$h0,$r1 + + adds $d1,$d1,$t0 + mul $t0,$h1,$r0 // h1*r0 + adc $d2,$d2,xzr + umulh $t1,$h1,$r0 + + adds $d1,$d1,$t0 + mul $t0,$h2,$s1 // h2*5*r1 + adc $d2,$d2,$t1 + mul $t1,$h2,$r0 // h2*r0 + + adds $d1,$d1,$t0 + adc $d2,$d2,$t1 + + and $t0,$d2,#-4 // final reduction + and $h2,$d2,#3 + add $t0,$t0,$d2,lsr#2 + adds $h0,$d0,$t0 + adcs $h1,$d1,xzr + adc $h2,$h2,xzr + + ret +.size poly1305_mult,.-poly1305_mult + +.type poly1305_splat,%function +.align 5 +poly1305_splat: + and x12,$h0,#0x03ffffff // base 2^64 -> base 2^26 + ubfx x13,$h0,#26,#26 + extr x14,$h1,$h0,#52 + and x14,x14,#0x03ffffff + ubfx x15,$h1,#14,#26 + extr x16,$h2,$h1,#40 + + str w12,[$ctx,#16*0] // r0 + add w12,w13,w13,lsl#2 // r1*5 + str w13,[$ctx,#16*1] // r1 + add w13,w14,w14,lsl#2 // r2*5 + str w12,[$ctx,#16*2] // s1 + str w14,[$ctx,#16*3] // r2 + add 
w14,w15,w15,lsl#2 // r3*5 + str w13,[$ctx,#16*4] // s2 + str w15,[$ctx,#16*5] // r3 + add w15,w16,w16,lsl#2 // r4*5 + str w14,[$ctx,#16*6] // s3 + str w16,[$ctx,#16*7] // r4 + str w15,[$ctx,#16*8] // s4 + + ret +.size poly1305_splat,.-poly1305_splat + +.type poly1305_blocks_neon,%function +.align 5 +poly1305_blocks_neon: + ldr $is_base2_26,[$ctx,#24] + cmp $len,#128 + b.hs .Lblocks_neon + cbz $is_base2_26,poly1305_blocks + +.Lblocks_neon: + stp x29,x30,[sp,#-80]! + add x29,sp,#0 + + ands $len,$len,#-16 + b.eq .Lno_data_neon + + cbz $is_base2_26,.Lbase2_64_neon + + ldp w10,w11,[$ctx] // load hash value base 2^26 + ldp w12,w13,[$ctx,#8] + ldr w14,[$ctx,#16] + + tst $len,#31 + b.eq .Leven_neon + + ldp $r0,$r1,[$ctx,#32] // load key value + + add $h0,x10,x11,lsl#26 // base 2^26 -> base 2^64 + lsr $h1,x12,#12 + adds $h0,$h0,x12,lsl#52 + add $h1,$h1,x13,lsl#14 + adc $h1,$h1,xzr + lsr $h2,x14,#24 + adds $h1,$h1,x14,lsl#40 + adc $d2,$h2,xzr // can be partially reduced... + + ldp $d0,$d1,[$inp],#16 // load input + sub $len,$len,#16 + add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2) + + and $t0,$d2,#-4 // ... 
so reduce + and $h2,$d2,#3 + add $t0,$t0,$d2,lsr#2 + adds $h0,$h0,$t0 + adcs $h1,$h1,xzr + adc $h2,$h2,xzr + +#ifdef __ARMEB__ + rev $d0,$d0 + rev $d1,$d1 +#endif + adds $h0,$h0,$d0 // accumulate input + adcs $h1,$h1,$d1 + adc $h2,$h2,$padbit + + bl poly1305_mult + ldr x30,[sp,#8] + + cbz $padbit,.Lstore_base2_64_neon + + and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26 + ubfx x11,$h0,#26,#26 + extr x12,$h1,$h0,#52 + and x12,x12,#0x03ffffff + ubfx x13,$h1,#14,#26 + extr x14,$h2,$h1,#40 + + cbnz $len,.Leven_neon + + stp w10,w11,[$ctx] // store hash value base 2^26 + stp w12,w13,[$ctx,#8] + str w14,[$ctx,#16] + b .Lno_data_neon + +.align 4 +.Lstore_base2_64_neon: + stp $h0,$h1,[$ctx] // store hash value base 2^64 + stp $h2,xzr,[$ctx,#16] // note that is_base2_26 is zeroed + b .Lno_data_neon + +.align 4 +.Lbase2_64_neon: + ldp $r0,$r1,[$ctx,#32] // load key value + + ldp $h0,$h1,[$ctx] // load hash value base 2^64 + ldr $h2,[$ctx,#16] + + tst $len,#31 + b.eq .Linit_neon + + ldp $d0,$d1,[$inp],#16 // load input + sub $len,$len,#16 + add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2) +#ifdef __ARMEB__ + rev $d0,$d0 + rev $d1,$d1 +#endif + adds $h0,$h0,$d0 // accumulate input + adcs $h1,$h1,$d1 + adc $h2,$h2,$padbit + + bl poly1305_mult + +.Linit_neon: + and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26 + ubfx x11,$h0,#26,#26 + extr x12,$h1,$h0,#52 + and x12,x12,#0x03ffffff + ubfx x13,$h1,#14,#26 + extr x14,$h2,$h1,#40 + + stp d8,d9,[sp,#16] // meet ABI requirements + stp d10,d11,[sp,#32] + stp d12,d13,[sp,#48] + stp d14,d15,[sp,#64] + + fmov ${H0},x10 + fmov ${H1},x11 + fmov ${H2},x12 + fmov ${H3},x13 + fmov ${H4},x14 + + ////////////////////////////////// initialize r^n table + mov $h0,$r0 // r^1 + add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2) + mov $h1,$r1 + mov $h2,xzr + add $ctx,$ctx,#48+12 + bl poly1305_splat + + bl poly1305_mult // r^2 + sub $ctx,$ctx,#4 + bl poly1305_splat + + bl poly1305_mult // r^3 + sub $ctx,$ctx,#4 + bl poly1305_splat + + bl poly1305_mult // r^4 + 
sub $ctx,$ctx,#4 + bl poly1305_splat + ldr x30,[sp,#8] + + add $in2,$inp,#32 + adr $zeros,.Lzeros + subs $len,$len,#64 + csel $in2,$zeros,$in2,lo + + mov x4,#1 + str x4,[$ctx,#-24] // set is_base2_26 + sub $ctx,$ctx,#48 // restore original $ctx + b .Ldo_neon + +.align 4 +.Leven_neon: + add $in2,$inp,#32 + adr $zeros,.Lzeros + subs $len,$len,#64 + csel $in2,$zeros,$in2,lo + + stp d8,d9,[sp,#16] // meet ABI requirements + stp d10,d11,[sp,#32] + stp d12,d13,[sp,#48] + stp d14,d15,[sp,#64] + + fmov ${H0},x10 + fmov ${H1},x11 + fmov ${H2},x12 + fmov ${H3},x13 + fmov ${H4},x14 + +.Ldo_neon: + ldp x8,x12,[$in2],#16 // inp[2:3] (or zero) + ldp x9,x13,[$in2],#48 + + lsl $padbit,$padbit,#24 + add x15,$ctx,#48 + +#ifdef __ARMEB__ + rev x8,x8 + rev x12,x12 + rev x9,x9 + rev x13,x13 +#endif + and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 + and x5,x9,#0x03ffffff + ubfx x6,x8,#26,#26 + ubfx x7,x9,#26,#26 + add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 + extr x8,x12,x8,#52 + extr x9,x13,x9,#52 + add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 + fmov $IN23_0,x4 + and x8,x8,#0x03ffffff + and x9,x9,#0x03ffffff + ubfx x10,x12,#14,#26 + ubfx x11,x13,#14,#26 + add x12,$padbit,x12,lsr#40 + add x13,$padbit,x13,lsr#40 + add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 + fmov $IN23_1,x6 + add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 + add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 + fmov $IN23_2,x8 + fmov $IN23_3,x10 + fmov $IN23_4,x12 + + ldp x8,x12,[$inp],#16 // inp[0:1] + ldp x9,x13,[$inp],#48 + + ld1 {$R0,$R1,$S1,$R2},[x15],#64 + ld1 {$S2,$R3,$S3,$R4},[x15],#64 + ld1 {$S4},[x15] + +#ifdef __ARMEB__ + rev x8,x8 + rev x12,x12 + rev x9,x9 + rev x13,x13 +#endif + and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 + and x5,x9,#0x03ffffff + ubfx x6,x8,#26,#26 + ubfx x7,x9,#26,#26 + add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 + extr x8,x12,x8,#52 + extr x9,x13,x9,#52 + add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 + fmov $IN01_0,x4 + and x8,x8,#0x03ffffff + and x9,x9,#0x03ffffff + ubfx x10,x12,#14,#26 + ubfx 
x11,x13,#14,#26 + add x12,$padbit,x12,lsr#40 + add x13,$padbit,x13,lsr#40 + add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 + fmov $IN01_1,x6 + add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 + add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 + movi $MASK.2d,#-1 + fmov $IN01_2,x8 + fmov $IN01_3,x10 + fmov $IN01_4,x12 + ushr $MASK.2d,$MASK.2d,#38 + + b.ls .Lskip_loop + +.align 4 +.Loop_neon: + //////////////////////////////////////////////////////////////// + // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 + // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r + // \___________________/ + // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2 + // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r + // \___________________/ \____________________/ + // + // Note that we start with inp[2:3]*r^2. This is because it + // doesn't depend on reduction in previous iteration. + //////////////////////////////////////////////////////////////// + // d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0 + // d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4 + // d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3 + // d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2 + // d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1 + + subs $len,$len,#64 + umull $ACC4,$IN23_0,${R4}[2] + csel $in2,$zeros,$in2,lo + umull $ACC3,$IN23_0,${R3}[2] + umull $ACC2,$IN23_0,${R2}[2] + ldp x8,x12,[$in2],#16 // inp[2:3] (or zero) + umull $ACC1,$IN23_0,${R1}[2] + ldp x9,x13,[$in2],#48 + umull $ACC0,$IN23_0,${R0}[2] +#ifdef __ARMEB__ + rev x8,x8 + rev x12,x12 + rev x9,x9 + rev x13,x13 +#endif + + umlal $ACC4,$IN23_1,${R3}[2] + and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 + umlal $ACC3,$IN23_1,${R2}[2] + and x5,x9,#0x03ffffff + umlal $ACC2,$IN23_1,${R1}[2] + ubfx x6,x8,#26,#26 + umlal $ACC1,$IN23_1,${R0}[2] + ubfx x7,x9,#26,#26 + umlal $ACC0,$IN23_1,${S4}[2] + add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 + + umlal $ACC4,$IN23_2,${R2}[2] + extr x8,x12,x8,#52 + umlal $ACC3,$IN23_2,${R1}[2] + extr x9,x13,x9,#52 + umlal 
$ACC2,$IN23_2,${R0}[2] + add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 + umlal $ACC1,$IN23_2,${S4}[2] + fmov $IN23_0,x4 + umlal $ACC0,$IN23_2,${S3}[2] + and x8,x8,#0x03ffffff + + umlal $ACC4,$IN23_3,${R1}[2] + and x9,x9,#0x03ffffff + umlal $ACC3,$IN23_3,${R0}[2] + ubfx x10,x12,#14,#26 + umlal $ACC2,$IN23_3,${S4}[2] + ubfx x11,x13,#14,#26 + umlal $ACC1,$IN23_3,${S3}[2] + add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 + umlal $ACC0,$IN23_3,${S2}[2] + fmov $IN23_1,x6 + + add $IN01_2,$IN01_2,$H2 + add x12,$padbit,x12,lsr#40 + umlal $ACC4,$IN23_4,${R0}[2] + add x13,$padbit,x13,lsr#40 + umlal $ACC3,$IN23_4,${S4}[2] + add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 + umlal $ACC2,$IN23_4,${S3}[2] + add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 + umlal $ACC1,$IN23_4,${S2}[2] + fmov $IN23_2,x8 + umlal $ACC0,$IN23_4,${S1}[2] + fmov $IN23_3,x10 + + //////////////////////////////////////////////////////////////// + // (hash+inp[0:1])*r^4 and accumulate + + add $IN01_0,$IN01_0,$H0 + fmov $IN23_4,x12 + umlal $ACC3,$IN01_2,${R1}[0] + ldp x8,x12,[$inp],#16 // inp[0:1] + umlal $ACC0,$IN01_2,${S3}[0] + ldp x9,x13,[$inp],#48 + umlal $ACC4,$IN01_2,${R2}[0] + umlal $ACC1,$IN01_2,${S4}[0] + umlal $ACC2,$IN01_2,${R0}[0] +#ifdef __ARMEB__ + rev x8,x8 + rev x12,x12 + rev x9,x9 + rev x13,x13 +#endif + + add $IN01_1,$IN01_1,$H1 + umlal $ACC3,$IN01_0,${R3}[0] + umlal $ACC4,$IN01_0,${R4}[0] + and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 + umlal $ACC2,$IN01_0,${R2}[0] + and x5,x9,#0x03ffffff + umlal $ACC0,$IN01_0,${R0}[0] + ubfx x6,x8,#26,#26 + umlal $ACC1,$IN01_0,${R1}[0] + ubfx x7,x9,#26,#26 + + add $IN01_3,$IN01_3,$H3 + add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 + umlal $ACC3,$IN01_1,${R2}[0] + extr x8,x12,x8,#52 + umlal $ACC4,$IN01_1,${R3}[0] + extr x9,x13,x9,#52 + umlal $ACC0,$IN01_1,${S4}[0] + add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 + umlal $ACC2,$IN01_1,${R1}[0] + fmov $IN01_0,x4 + umlal $ACC1,$IN01_1,${R0}[0] + and x8,x8,#0x03ffffff + + add $IN01_4,$IN01_4,$H4 + and x9,x9,#0x03ffffff + umlal 
$ACC3,$IN01_3,${R0}[0] + ubfx x10,x12,#14,#26 + umlal $ACC0,$IN01_3,${S2}[0] + ubfx x11,x13,#14,#26 + umlal $ACC4,$IN01_3,${R1}[0] + add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 + umlal $ACC1,$IN01_3,${S3}[0] + fmov $IN01_1,x6 + umlal $ACC2,$IN01_3,${S4}[0] + add x12,$padbit,x12,lsr#40 + + umlal $ACC3,$IN01_4,${S4}[0] + add x13,$padbit,x13,lsr#40 + umlal $ACC0,$IN01_4,${S1}[0] + add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 + umlal $ACC4,$IN01_4,${R0}[0] + add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 + umlal $ACC1,$IN01_4,${S2}[0] + fmov $IN01_2,x8 + umlal $ACC2,$IN01_4,${S3}[0] + fmov $IN01_3,x10 + fmov $IN01_4,x12 + + ///////////////////////////////////////////////////////////////// + // lazy reduction as discussed in "NEON crypto" by D.J. Bernstein + // and P. Schwabe + // + // [see discussion in poly1305-armv4 module] + + ushr $T0.2d,$ACC3,#26 + xtn $H3,$ACC3 + ushr $T1.2d,$ACC0,#26 + and $ACC0,$ACC0,$MASK.2d + add $ACC4,$ACC4,$T0.2d // h3 -> h4 + bic $H3,#0xfc,lsl#24 // &=0x03ffffff + add $ACC1,$ACC1,$T1.2d // h0 -> h1 + + ushr $T0.2d,$ACC4,#26 + xtn $H4,$ACC4 + ushr $T1.2d,$ACC1,#26 + xtn $H1,$ACC1 + bic $H4,#0xfc,lsl#24 + add $ACC2,$ACC2,$T1.2d // h1 -> h2 + + add $ACC0,$ACC0,$T0.2d + shl $T0.2d,$T0.2d,#2 + shrn $T1.2s,$ACC2,#26 + xtn $H2,$ACC2 + add $ACC0,$ACC0,$T0.2d // h4 -> h0 + bic $H1,#0xfc,lsl#24 + add $H3,$H3,$T1.2s // h2 -> h3 + bic $H2,#0xfc,lsl#24 + + shrn $T0.2s,$ACC0,#26 + xtn $H0,$ACC0 + ushr $T1.2s,$H3,#26 + bic $H3,#0xfc,lsl#24 + bic $H0,#0xfc,lsl#24 + add $H1,$H1,$T0.2s // h0 -> h1 + add $H4,$H4,$T1.2s // h3 -> h4 + + b.hi .Loop_neon + +.Lskip_loop: + dup $IN23_2,${IN23_2}[0] + add $IN01_2,$IN01_2,$H2 + + //////////////////////////////////////////////////////////////// + // multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1 + + adds $len,$len,#32 + b.ne .Long_tail + + dup $IN23_2,${IN01_2}[0] + add $IN23_0,$IN01_0,$H0 + add $IN23_3,$IN01_3,$H3 + add $IN23_1,$IN01_1,$H1 + add $IN23_4,$IN01_4,$H4 + +.Long_tail: + dup $IN23_0,${IN23_0}[0] + umull2 
$ACC0,$IN23_2,${S3} + umull2 $ACC3,$IN23_2,${R1} + umull2 $ACC4,$IN23_2,${R2} + umull2 $ACC2,$IN23_2,${R0} + umull2 $ACC1,$IN23_2,${S4} + + dup $IN23_1,${IN23_1}[0] + umlal2 $ACC0,$IN23_0,${R0} + umlal2 $ACC2,$IN23_0,${R2} + umlal2 $ACC3,$IN23_0,${R3} + umlal2 $ACC4,$IN23_0,${R4} + umlal2 $ACC1,$IN23_0,${R1} + + dup $IN23_3,${IN23_3}[0] + umlal2 $ACC0,$IN23_1,${S4} + umlal2 $ACC3,$IN23_1,${R2} + umlal2 $ACC2,$IN23_1,${R1} + umlal2 $ACC4,$IN23_1,${R3} + umlal2 $ACC1,$IN23_1,${R0} + + dup $IN23_4,${IN23_4}[0] + umlal2 $ACC3,$IN23_3,${R0} + umlal2 $ACC4,$IN23_3,${R1} + umlal2 $ACC0,$IN23_3,${S2} + umlal2 $ACC1,$IN23_3,${S3} + umlal2 $ACC2,$IN23_3,${S4} + + umlal2 $ACC3,$IN23_4,${S4} + umlal2 $ACC0,$IN23_4,${S1} + umlal2 $ACC4,$IN23_4,${R0} + umlal2 $ACC1,$IN23_4,${S2} + umlal2 $ACC2,$IN23_4,${S3} + + b.eq .Lshort_tail + + //////////////////////////////////////////////////////////////// + // (hash+inp[0:1])*r^4:r^3 and accumulate + + add $IN01_0,$IN01_0,$H0 + umlal $ACC3,$IN01_2,${R1} + umlal $ACC0,$IN01_2,${S3} + umlal $ACC4,$IN01_2,${R2} + umlal $ACC1,$IN01_2,${S4} + umlal $ACC2,$IN01_2,${R0} + + add $IN01_1,$IN01_1,$H1 + umlal $ACC3,$IN01_0,${R3} + umlal $ACC0,$IN01_0,${R0} + umlal $ACC4,$IN01_0,${R4} + umlal $ACC1,$IN01_0,${R1} + umlal $ACC2,$IN01_0,${R2} + + add $IN01_3,$IN01_3,$H3 + umlal $ACC3,$IN01_1,${R2} + umlal $ACC0,$IN01_1,${S4} + umlal $ACC4,$IN01_1,${R3} + umlal $ACC1,$IN01_1,${R0} + umlal $ACC2,$IN01_1,${R1} + + add $IN01_4,$IN01_4,$H4 + umlal $ACC3,$IN01_3,${R0} + umlal $ACC0,$IN01_3,${S2} + umlal $ACC4,$IN01_3,${R1} + umlal $ACC1,$IN01_3,${S3} + umlal $ACC2,$IN01_3,${S4} + + umlal $ACC3,$IN01_4,${S4} + umlal $ACC0,$IN01_4,${S1} + umlal $ACC4,$IN01_4,${R0} + umlal $ACC1,$IN01_4,${S2} + umlal $ACC2,$IN01_4,${S3} + +.Lshort_tail: + //////////////////////////////////////////////////////////////// + // horizontal add + + addp $ACC3,$ACC3,$ACC3 + ldp d8,d9,[sp,#16] // meet ABI requirements + addp $ACC0,$ACC0,$ACC0 + ldp d10,d11,[sp,#32] + addp 
$ACC4,$ACC4,$ACC4 + ldp d12,d13,[sp,#48] + addp $ACC1,$ACC1,$ACC1 + ldp d14,d15,[sp,#64] + addp $ACC2,$ACC2,$ACC2 + + //////////////////////////////////////////////////////////////// + // lazy reduction, but without narrowing + + ushr $T0.2d,$ACC3,#26 + and $ACC3,$ACC3,$MASK.2d + ushr $T1.2d,$ACC0,#26 + and $ACC0,$ACC0,$MASK.2d + + add $ACC4,$ACC4,$T0.2d // h3 -> h4 + add $ACC1,$ACC1,$T1.2d // h0 -> h1 + + ushr $T0.2d,$ACC4,#26 + and $ACC4,$ACC4,$MASK.2d + ushr $T1.2d,$ACC1,#26 + and $ACC1,$ACC1,$MASK.2d + add $ACC2,$ACC2,$T1.2d // h1 -> h2 + + add $ACC0,$ACC0,$T0.2d + shl $T0.2d,$T0.2d,#2 + ushr $T1.2d,$ACC2,#26 + and $ACC2,$ACC2,$MASK.2d + add $ACC0,$ACC0,$T0.2d // h4 -> h0 + add $ACC3,$ACC3,$T1.2d // h2 -> h3 + + ushr $T0.2d,$ACC0,#26 + and $ACC0,$ACC0,$MASK.2d + ushr $T1.2d,$ACC3,#26 + and $ACC3,$ACC3,$MASK.2d + add $ACC1,$ACC1,$T0.2d // h0 -> h1 + add $ACC4,$ACC4,$T1.2d // h3 -> h4 + + //////////////////////////////////////////////////////////////// + // write the result, can be partially reduced + + st4 {$ACC0,$ACC1,$ACC2,$ACC3}[0],[$ctx],#16 + st1 {$ACC4}[0],[$ctx] + +.Lno_data_neon: + ldr x29,[sp],#80 + ret +.size poly1305_blocks_neon,.-poly1305_blocks_neon + +.type poly1305_emit_neon,%function +.align 5 +poly1305_emit_neon: + ldr $is_base2_26,[$ctx,#24] + cbz $is_base2_26,poly1305_emit + + ldp w10,w11,[$ctx] // load hash value base 2^26 + ldp w12,w13,[$ctx,#8] + ldr w14,[$ctx,#16] + + add $h0,x10,x11,lsl#26 // base 2^26 -> base 2^64 + lsr $h1,x12,#12 + adds $h0,$h0,x12,lsl#52 + add $h1,$h1,x13,lsl#14 + adc $h1,$h1,xzr + lsr $h2,x14,#24 + adds $h1,$h1,x14,lsl#40 + adc $h2,$h2,xzr // can be partially reduced... + + ldp $t0,$t1,[$nonce] // load nonce + + and $d0,$h2,#-4 // ... 
so reduce + add $d0,$d0,$h2,lsr#2 + and $h2,$h2,#3 + adds $h0,$h0,$d0 + adcs $h1,$h1,xzr + adc $h2,$h2,xzr + + adds $d0,$h0,#5 // compare to modulus + adcs $d1,$h1,xzr + adc $d2,$h2,xzr + + tst $d2,#-4 // see if it's carried/borrowed + + csel $h0,$h0,$d0,eq + csel $h1,$h1,$d1,eq + +#ifdef __ARMEB__ + ror $t0,$t0,#32 // flip nonce words + ror $t1,$t1,#32 +#endif + adds $h0,$h0,$t0 // accumulate nonce + adc $h1,$h1,$t1 +#ifdef __ARMEB__ + rev $h0,$h0 // flip output bytes + rev $h1,$h1 +#endif + stp $h0,$h1,[$mac] // write result + + ret +.size poly1305_emit_neon,.-poly1305_emit_neon + +.align 5 +.Lzeros: +.long 0,0,0,0,0,0,0,0 +.LOPENSSL_armcap_P: +#ifdef __ILP32__ +.long OPENSSL_armcap_P-. +#else +.quad OPENSSL_armcap_P-. +#endif +.asciz "Poly1305 for ARMv8, CRYPTOGAMS by " +.align 2 +___ + +foreach (split("\n",$code)) { # rewrite vector arrangement suffixes per instruction, one line at a time + s/\b(shrn\s+v[0-9]+)\.[24]d/$1.2s/ or + s/\b(fmov\s+)v([0-9]+)[^,]*,\s*x([0-9]+)/$1d$2,x$3/ or + (m/\bdup\b/ and (s/\.[24]s/.2d/g or 1)) or + (m/\b(eor|and)/ and (s/\.[248][sdh]/.16b/g or 1)) or + (m/\bum(ul|la)l\b/ and (s/\.4s/.2s/g or 1)) or + (m/\bum(ul|la)l2\b/ and (s/\.2s/.4s/g or 1)) or + (m/\bst[1-4]\s+{[^}]+}\[/ and (s/\.[24]d/.s/g or 1)); + + s/\.[124]([sd])\[/.$1\[/; + + print $_,"\n"; +} +close STDOUT or die "error closing STDOUT: $!"; # STDOUT is the pipe to arm-xlate.pl; report a failed or truncated run diff --git a/crypto/poly1305/poly1305-arm64.s b/crypto/poly1305/poly1305-arm64.s new file mode 100644 index 0000000..911b57e --- /dev/null +++ b/crypto/poly1305/poly1305-arm64.s @@ -0,0 +1,820 @@ +/* SPDX-License-Identifier: OpenSSL OR (BSD-3-Clause OR GPL-2.0) + * + * Copyright (C) 2015-2018 Jason A. Donenfeld . All Rights Reserved. + * Copyright 2016 The OpenSSL Project Authors. All Rights Reserved. 
+ */ + +#include +.text + +.align 5 +ENTRY(poly1305_init_arm) + cmp x1,xzr // x1 = key pointer; NULL means no key + stp xzr,xzr,[x0] // zero hash value + stp xzr,xzr,[x0,#16] // [along with is_base2_26] + + csel x0,xzr,x0,eq // x0 = 0 when key is NULL, otherwise left unchanged + b.eq .Lno_key + + ldp x7,x8,[x1] // load key + mov x9,#0xfffffffc0fffffff + movk x9,#0x0fff,lsl#48 // x9 = 0x0ffffffc0fffffff (clamp mask) +#ifdef __ARMEB__ + rev x7,x7 // flip bytes + rev x8,x8 +#endif + and x7,x7,x9 // &=0ffffffc0fffffff + and x9,x9,#-4 // also clear the two low bits of the mask + and x8,x8,x9 // &=0ffffffc0ffffffc + stp x7,x8,[x0,#32] // save key value + +.Lno_key: + ret +ENDPROC(poly1305_init_arm) + +.align 5 +ENTRY(poly1305_blocks_arm) + ands x2,x2,#-16 // x2 = length rounded down to a multiple of 16 + b.eq .Lno_data + + ldp x4,x5,[x0] // load hash value + ldp x7,x8,[x0,#32] // load key value + ldr x6,[x0,#16] // h2, third word of the hash + add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2) + b .Loop + +.align 5 +.Loop: + ldp x10,x11,[x1],#16 // load input + sub x2,x2,#16 // one 16-byte block consumed +#ifdef __ARMEB__ + rev x10,x10 + rev x11,x11 +#endif + adds x4,x4,x10 // accumulate input + adcs x5,x5,x11 + + mul x12,x4,x7 // h0*r0 + adc x6,x6,x3 // h2 += x3 + carry + umulh x13,x4,x7 + + mul x10,x5,x9 // h1*5*r1 + umulh x11,x5,x9 + + adds x12,x12,x10 + mul x10,x4,x8 // h0*r1 + adc x13,x13,x11 + umulh x14,x4,x8 + + adds x13,x13,x10 + mul x10,x5,x7 // h1*r0 + adc x14,x14,xzr + umulh x11,x5,x7 + + adds x13,x13,x10 + mul x10,x6,x9 // h2*5*r1 + adc x14,x14,x11 + mul x11,x6,x7 // h2*r0 + + adds x13,x13,x10 + adc x14,x14,x11 + + and x10,x14,#-4 // final reduction + and x6,x14,#3 + add x10,x10,x14,lsr#2 + adds x4,x12,x10 + adcs x5,x13,xzr + adc x6,x6,xzr + + cbnz x2,.Loop // loop while input remains + + stp x4,x5,[x0] // store hash value + str x6,[x0,#16] + +.Lno_data: + ret +ENDPROC(poly1305_blocks_arm) + +.align 5 +ENTRY(poly1305_emit_arm) + ldp x4,x5,[x0] // load hash base 2^64 + ldr x6,[x0,#16] + ldp x10,x11,[x2] // load nonce + + adds x12,x4,#5 // compare to modulus + adcs x13,x5,xzr + adc x14,x6,xzr + + tst x14,#-4 // see if it's carried/borrowed + + csel x4,x4,x12,eq + csel x5,x5,x13,eq + +#ifdef __ARMEB__ + ror x10,x10,#32 // flip nonce words + ror x11,x11,#32 +#endif + adds x4,x4,x10 // accumulate 
nonce + adc x5,x5,x11 +#ifdef __ARMEB__ + rev x4,x4 // flip output bytes + rev x5,x5 +#endif + stp x4,x5,[x1] // write result + + ret +ENDPROC(poly1305_emit_arm) + +.align 5 +__poly1305_mult: + mul x12,x4,x7 // h0*r0 + umulh x13,x4,x7 + + mul x10,x5,x9 // h1*5*r1 + umulh x11,x5,x9 + + adds x12,x12,x10 + mul x10,x4,x8 // h0*r1 + adc x13,x13,x11 + umulh x14,x4,x8 + + adds x13,x13,x10 + mul x10,x5,x7 // h1*r0 + adc x14,x14,xzr + umulh x11,x5,x7 + + adds x13,x13,x10 + mul x10,x6,x9 // h2*5*r1 + adc x14,x14,x11 + mul x11,x6,x7 // h2*r0 + + adds x13,x13,x10 + adc x14,x14,x11 + + and x10,x14,#-4 // final reduction + and x6,x14,#3 + add x10,x10,x14,lsr#2 + adds x4,x12,x10 + adcs x5,x13,xzr + adc x6,x6,xzr + + ret + +__poly1305_splat: + and x12,x4,#0x03ffffff // base 2^64 -> base 2^26 + ubfx x13,x4,#26,#26 + extr x14,x5,x4,#52 + and x14,x14,#0x03ffffff + ubfx x15,x5,#14,#26 + extr x16,x6,x5,#40 + + str w12,[x0,#16*0] // r0 + add w12,w13,w13,lsl#2 // r1*5 + str w13,[x0,#16*1] // r1 + add w13,w14,w14,lsl#2 // r2*5 + str w12,[x0,#16*2] // s1 + str w14,[x0,#16*3] // r2 + add w14,w15,w15,lsl#2 // r3*5 + str w13,[x0,#16*4] // s2 + str w15,[x0,#16*5] // r3 + add w15,w16,w16,lsl#2 // r4*5 + str w14,[x0,#16*6] // s3 + str w16,[x0,#16*7] // r4 + str w15,[x0,#16*8] // s4 + + ret + +.align 5 +ENTRY(poly1305_blocks_neon) + ldr x17,[x0,#24] + cmp x2,#128 + b.hs .Lblocks_neon + cbz x17,poly1305_blocks_arm + +.Lblocks_neon: + stp x29,x30,[sp,#-80]! + add x29,sp,#0 + + ands x2,x2,#-16 + b.eq .Lno_data_neon + + cbz x17,.Lbase2_64_neon + + ldp w10,w11,[x0] // load hash value base 2^26 + ldp w12,w13,[x0,#8] + ldr w14,[x0,#16] + + tst x2,#31 + b.eq .Leven_neon + + ldp x7,x8,[x0,#32] // load key value + + add x4,x10,x11,lsl#26 // base 2^26 -> base 2^64 + lsr x5,x12,#12 + adds x4,x4,x12,lsl#52 + add x5,x5,x13,lsl#14 + adc x5,x5,xzr + lsr x6,x14,#24 + adds x5,x5,x14,lsl#40 + adc x14,x6,xzr // can be partially reduced... 
+ + ldp x12,x13,[x1],#16 // load input + sub x2,x2,#16 + add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2) + + and x10,x14,#-4 // ... so reduce + and x6,x14,#3 + add x10,x10,x14,lsr#2 + adds x4,x4,x10 + adcs x5,x5,xzr + adc x6,x6,xzr + +#ifdef __ARMEB__ + rev x12,x12 + rev x13,x13 +#endif + adds x4,x4,x12 // accumulate input + adcs x5,x5,x13 + adc x6,x6,x3 + + bl __poly1305_mult + ldr x30,[sp,#8] + + cbz x3,.Lstore_base2_64_neon + + and x10,x4,#0x03ffffff // base 2^64 -> base 2^26 + ubfx x11,x4,#26,#26 + extr x12,x5,x4,#52 + and x12,x12,#0x03ffffff + ubfx x13,x5,#14,#26 + extr x14,x6,x5,#40 + + cbnz x2,.Leven_neon + + stp w10,w11,[x0] // store hash value base 2^26 + stp w12,w13,[x0,#8] + str w14,[x0,#16] + b .Lno_data_neon + +.align 4 +.Lstore_base2_64_neon: + stp x4,x5,[x0] // store hash value base 2^64 + stp x6,xzr,[x0,#16] // note that is_base2_26 is zeroed + b .Lno_data_neon + +.align 4 +.Lbase2_64_neon: + ldp x7,x8,[x0,#32] // load key value + + ldp x4,x5,[x0] // load hash value base 2^64 + ldr x6,[x0,#16] + + tst x2,#31 + b.eq .Linit_neon + + ldp x12,x13,[x1],#16 // load input + sub x2,x2,#16 + add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2) +#ifdef __ARMEB__ + rev x12,x12 + rev x13,x13 +#endif + adds x4,x4,x12 // accumulate input + adcs x5,x5,x13 + adc x6,x6,x3 + + bl __poly1305_mult + +.Linit_neon: + and x10,x4,#0x03ffffff // base 2^64 -> base 2^26 + ubfx x11,x4,#26,#26 + extr x12,x5,x4,#52 + and x12,x12,#0x03ffffff + ubfx x13,x5,#14,#26 + extr x14,x6,x5,#40 + + stp d8,d9,[sp,#16] // meet ABI requirements + stp d10,d11,[sp,#32] + stp d12,d13,[sp,#48] + stp d14,d15,[sp,#64] + + fmov d24,x10 + fmov d25,x11 + fmov d26,x12 + fmov d27,x13 + fmov d28,x14 + + ////////////////////////////////// initialize r^n table + mov x4,x7 // r^1 + add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2) + mov x5,x8 + mov x6,xzr + add x0,x0,#48+12 + bl __poly1305_splat + + bl __poly1305_mult // r^2 + sub x0,x0,#4 + bl __poly1305_splat + + bl __poly1305_mult // r^3 + sub x0,x0,#4 + bl __poly1305_splat + + 
bl __poly1305_mult // r^4 + sub x0,x0,#4 + bl __poly1305_splat + ldr x30,[sp,#8] + + add x16,x1,#32 + adr x17,.Lzeros + subs x2,x2,#64 + csel x16,x17,x16,lo + + mov x4,#1 + str x4,[x0,#-24] // set is_base2_26 + sub x0,x0,#48 // restore original x0 + b .Ldo_neon + +.align 4 +.Leven_neon: + add x16,x1,#32 + adr x17,.Lzeros + subs x2,x2,#64 + csel x16,x17,x16,lo + + stp d8,d9,[sp,#16] // meet ABI requirements + stp d10,d11,[sp,#32] + stp d12,d13,[sp,#48] + stp d14,d15,[sp,#64] + + fmov d24,x10 + fmov d25,x11 + fmov d26,x12 + fmov d27,x13 + fmov d28,x14 + +.Ldo_neon: + ldp x8,x12,[x16],#16 // inp[2:3] (or zero) + ldp x9,x13,[x16],#48 + + lsl x3,x3,#24 + add x15,x0,#48 + +#ifdef __ARMEB__ + rev x8,x8 + rev x12,x12 + rev x9,x9 + rev x13,x13 +#endif + and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 + and x5,x9,#0x03ffffff + ubfx x6,x8,#26,#26 + ubfx x7,x9,#26,#26 + add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 + extr x8,x12,x8,#52 + extr x9,x13,x9,#52 + add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 + fmov d14,x4 + and x8,x8,#0x03ffffff + and x9,x9,#0x03ffffff + ubfx x10,x12,#14,#26 + ubfx x11,x13,#14,#26 + add x12,x3,x12,lsr#40 + add x13,x3,x13,lsr#40 + add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 + fmov d15,x6 + add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 + add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 + fmov d16,x8 + fmov d17,x10 + fmov d18,x12 + + ldp x8,x12,[x1],#16 // inp[0:1] + ldp x9,x13,[x1],#48 + + ld1 {v0.4s,v1.4s,v2.4s,v3.4s},[x15],#64 + ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x15],#64 + ld1 {v8.4s},[x15] + +#ifdef __ARMEB__ + rev x8,x8 + rev x12,x12 + rev x9,x9 + rev x13,x13 +#endif + and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 + and x5,x9,#0x03ffffff + ubfx x6,x8,#26,#26 + ubfx x7,x9,#26,#26 + add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 + extr x8,x12,x8,#52 + extr x9,x13,x9,#52 + add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 + fmov d9,x4 + and x8,x8,#0x03ffffff + and x9,x9,#0x03ffffff + ubfx x10,x12,#14,#26 + ubfx x11,x13,#14,#26 + add x12,x3,x12,lsr#40 + add x13,x3,x13,lsr#40 + add 
x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 + fmov d10,x6 + add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 + add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 + movi v31.2d,#-1 + fmov d11,x8 + fmov d12,x10 + fmov d13,x12 + ushr v31.2d,v31.2d,#38 + + b.ls .Lskip_loop + +.align 4 +.Loop_neon: + //////////////////////////////////////////////////////////////// + // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 + // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r + // ___________________/ + // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2 + // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r + // ___________________/ ____________________/ + // + // Note that we start with inp[2:3]*r^2. This is because it + // doesn't depend on reduction in previous iteration. + //////////////////////////////////////////////////////////////// + // d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0 + // d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4 + // d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3 + // d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2 + // d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1 + + subs x2,x2,#64 + umull v23.2d,v14.2s,v7.s[2] + csel x16,x17,x16,lo + umull v22.2d,v14.2s,v5.s[2] + umull v21.2d,v14.2s,v3.s[2] + ldp x8,x12,[x16],#16 // inp[2:3] (or zero) + umull v20.2d,v14.2s,v1.s[2] + ldp x9,x13,[x16],#48 + umull v19.2d,v14.2s,v0.s[2] +#ifdef __ARMEB__ + rev x8,x8 + rev x12,x12 + rev x9,x9 + rev x13,x13 +#endif + + umlal v23.2d,v15.2s,v5.s[2] + and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 + umlal v22.2d,v15.2s,v3.s[2] + and x5,x9,#0x03ffffff + umlal v21.2d,v15.2s,v1.s[2] + ubfx x6,x8,#26,#26 + umlal v20.2d,v15.2s,v0.s[2] + ubfx x7,x9,#26,#26 + umlal v19.2d,v15.2s,v8.s[2] + add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 + + umlal v23.2d,v16.2s,v3.s[2] + extr x8,x12,x8,#52 + umlal v22.2d,v16.2s,v1.s[2] + extr x9,x13,x9,#52 + umlal v21.2d,v16.2s,v0.s[2] + add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 + umlal v20.2d,v16.2s,v8.s[2] + fmov d14,x4 + umlal v19.2d,v16.2s,v6.s[2] + 
and x8,x8,#0x03ffffff + + umlal v23.2d,v17.2s,v1.s[2] + and x9,x9,#0x03ffffff + umlal v22.2d,v17.2s,v0.s[2] + ubfx x10,x12,#14,#26 + umlal v21.2d,v17.2s,v8.s[2] + ubfx x11,x13,#14,#26 + umlal v20.2d,v17.2s,v6.s[2] + add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 + umlal v19.2d,v17.2s,v4.s[2] + fmov d15,x6 + + add v11.2s,v11.2s,v26.2s + add x12,x3,x12,lsr#40 + umlal v23.2d,v18.2s,v0.s[2] + add x13,x3,x13,lsr#40 + umlal v22.2d,v18.2s,v8.s[2] + add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 + umlal v21.2d,v18.2s,v6.s[2] + add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 + umlal v20.2d,v18.2s,v4.s[2] + fmov d16,x8 + umlal v19.2d,v18.2s,v2.s[2] + fmov d17,x10 + + //////////////////////////////////////////////////////////////// + // (hash+inp[0:1])*r^4 and accumulate + + add v9.2s,v9.2s,v24.2s + fmov d18,x12 + umlal v22.2d,v11.2s,v1.s[0] + ldp x8,x12,[x1],#16 // inp[0:1] + umlal v19.2d,v11.2s,v6.s[0] + ldp x9,x13,[x1],#48 + umlal v23.2d,v11.2s,v3.s[0] + umlal v20.2d,v11.2s,v8.s[0] + umlal v21.2d,v11.2s,v0.s[0] +#ifdef __ARMEB__ + rev x8,x8 + rev x12,x12 + rev x9,x9 + rev x13,x13 +#endif + + add v10.2s,v10.2s,v25.2s + umlal v22.2d,v9.2s,v5.s[0] + umlal v23.2d,v9.2s,v7.s[0] + and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 + umlal v21.2d,v9.2s,v3.s[0] + and x5,x9,#0x03ffffff + umlal v19.2d,v9.2s,v0.s[0] + ubfx x6,x8,#26,#26 + umlal v20.2d,v9.2s,v1.s[0] + ubfx x7,x9,#26,#26 + + add v12.2s,v12.2s,v27.2s + add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 + umlal v22.2d,v10.2s,v3.s[0] + extr x8,x12,x8,#52 + umlal v23.2d,v10.2s,v5.s[0] + extr x9,x13,x9,#52 + umlal v19.2d,v10.2s,v8.s[0] + add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 + umlal v21.2d,v10.2s,v1.s[0] + fmov d9,x4 + umlal v20.2d,v10.2s,v0.s[0] + and x8,x8,#0x03ffffff + + add v13.2s,v13.2s,v28.2s + and x9,x9,#0x03ffffff + umlal v22.2d,v12.2s,v0.s[0] + ubfx x10,x12,#14,#26 + umlal v19.2d,v12.2s,v4.s[0] + ubfx x11,x13,#14,#26 + umlal v23.2d,v12.2s,v1.s[0] + add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 + umlal v20.2d,v12.2s,v6.s[0] + fmov d10,x6 + 
umlal v21.2d,v12.2s,v8.s[0] + add x12,x3,x12,lsr#40 + + umlal v22.2d,v13.2s,v8.s[0] + add x13,x3,x13,lsr#40 + umlal v19.2d,v13.2s,v2.s[0] + add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 + umlal v23.2d,v13.2s,v0.s[0] + add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 + umlal v20.2d,v13.2s,v4.s[0] + fmov d11,x8 + umlal v21.2d,v13.2s,v6.s[0] + fmov d12,x10 + fmov d13,x12 + + ///////////////////////////////////////////////////////////////// + // lazy reduction as discussed in "NEON crypto" by D.J. Bernstein + // and P. Schwabe + // + // [see discussion in poly1305-armv4 module] + + ushr v29.2d,v22.2d,#26 + xtn v27.2s,v22.2d + ushr v30.2d,v19.2d,#26 + and v19.16b,v19.16b,v31.16b + add v23.2d,v23.2d,v29.2d // h3 -> h4 + bic v27.2s,#0xfc,lsl#24 // &=0x03ffffff + add v20.2d,v20.2d,v30.2d // h0 -> h1 + + ushr v29.2d,v23.2d,#26 + xtn v28.2s,v23.2d + ushr v30.2d,v20.2d,#26 + xtn v25.2s,v20.2d + bic v28.2s,#0xfc,lsl#24 + add v21.2d,v21.2d,v30.2d // h1 -> h2 + + add v19.2d,v19.2d,v29.2d + shl v29.2d,v29.2d,#2 + shrn v30.2s,v21.2d,#26 + xtn v26.2s,v21.2d + add v19.2d,v19.2d,v29.2d // h4 -> h0 + bic v25.2s,#0xfc,lsl#24 + add v27.2s,v27.2s,v30.2s // h2 -> h3 + bic v26.2s,#0xfc,lsl#24 + + shrn v29.2s,v19.2d,#26 + xtn v24.2s,v19.2d + ushr v30.2s,v27.2s,#26 + bic v27.2s,#0xfc,lsl#24 + bic v24.2s,#0xfc,lsl#24 + add v25.2s,v25.2s,v29.2s // h0 -> h1 + add v28.2s,v28.2s,v30.2s // h3 -> h4 + + b.hi .Loop_neon + +.Lskip_loop: + dup v16.2d,v16.d[0] + add v11.2s,v11.2s,v26.2s + + //////////////////////////////////////////////////////////////// + // multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1 + + adds x2,x2,#32 + b.ne .Long_tail + + dup v16.2d,v11.d[0] + add v14.2s,v9.2s,v24.2s + add v17.2s,v12.2s,v27.2s + add v15.2s,v10.2s,v25.2s + add v18.2s,v13.2s,v28.2s + +.Long_tail: + dup v14.2d,v14.d[0] + umull2 v19.2d,v16.4s,v6.4s + umull2 v22.2d,v16.4s,v1.4s + umull2 v23.2d,v16.4s,v3.4s + umull2 v21.2d,v16.4s,v0.4s + umull2 v20.2d,v16.4s,v8.4s + + dup v15.2d,v15.d[0] + umlal2 v19.2d,v14.4s,v0.4s + 
umlal2 v21.2d,v14.4s,v3.4s + umlal2 v22.2d,v14.4s,v5.4s + umlal2 v23.2d,v14.4s,v7.4s + umlal2 v20.2d,v14.4s,v1.4s + + dup v17.2d,v17.d[0] + umlal2 v19.2d,v15.4s,v8.4s + umlal2 v22.2d,v15.4s,v3.4s + umlal2 v21.2d,v15.4s,v1.4s + umlal2 v23.2d,v15.4s,v5.4s + umlal2 v20.2d,v15.4s,v0.4s + + dup v18.2d,v18.d[0] + umlal2 v22.2d,v17.4s,v0.4s + umlal2 v23.2d,v17.4s,v1.4s + umlal2 v19.2d,v17.4s,v4.4s + umlal2 v20.2d,v17.4s,v6.4s + umlal2 v21.2d,v17.4s,v8.4s + + umlal2 v22.2d,v18.4s,v8.4s + umlal2 v19.2d,v18.4s,v2.4s + umlal2 v23.2d,v18.4s,v0.4s + umlal2 v20.2d,v18.4s,v4.4s + umlal2 v21.2d,v18.4s,v6.4s + + b.eq .Lshort_tail + + //////////////////////////////////////////////////////////////// + // (hash+inp[0:1])*r^4:r^3 and accumulate + + add v9.2s,v9.2s,v24.2s + umlal v22.2d,v11.2s,v1.2s + umlal v19.2d,v11.2s,v6.2s + umlal v23.2d,v11.2s,v3.2s + umlal v20.2d,v11.2s,v8.2s + umlal v21.2d,v11.2s,v0.2s + + add v10.2s,v10.2s,v25.2s + umlal v22.2d,v9.2s,v5.2s + umlal v19.2d,v9.2s,v0.2s + umlal v23.2d,v9.2s,v7.2s + umlal v20.2d,v9.2s,v1.2s + umlal v21.2d,v9.2s,v3.2s + + add v12.2s,v12.2s,v27.2s + umlal v22.2d,v10.2s,v3.2s + umlal v19.2d,v10.2s,v8.2s + umlal v23.2d,v10.2s,v5.2s + umlal v20.2d,v10.2s,v0.2s + umlal v21.2d,v10.2s,v1.2s + + add v13.2s,v13.2s,v28.2s + umlal v22.2d,v12.2s,v0.2s + umlal v19.2d,v12.2s,v4.2s + umlal v23.2d,v12.2s,v1.2s + umlal v20.2d,v12.2s,v6.2s + umlal v21.2d,v12.2s,v8.2s + + umlal v22.2d,v13.2s,v8.2s + umlal v19.2d,v13.2s,v2.2s + umlal v23.2d,v13.2s,v0.2s + umlal v20.2d,v13.2s,v4.2s + umlal v21.2d,v13.2s,v6.2s + +.Lshort_tail: + //////////////////////////////////////////////////////////////// + // horizontal add + + addp v22.2d,v22.2d,v22.2d + ldp d8,d9,[sp,#16] // meet ABI requirements + addp v19.2d,v19.2d,v19.2d + ldp d10,d11,[sp,#32] + addp v23.2d,v23.2d,v23.2d + ldp d12,d13,[sp,#48] + addp v20.2d,v20.2d,v20.2d + ldp d14,d15,[sp,#64] + addp v21.2d,v21.2d,v21.2d + + //////////////////////////////////////////////////////////////// + // lazy reduction, but 
without narrowing + + ushr v29.2d,v22.2d,#26 + and v22.16b,v22.16b,v31.16b + ushr v30.2d,v19.2d,#26 + and v19.16b,v19.16b,v31.16b + + add v23.2d,v23.2d,v29.2d // h3 -> h4 + add v20.2d,v20.2d,v30.2d // h0 -> h1 + + ushr v29.2d,v23.2d,#26 + and v23.16b,v23.16b,v31.16b + ushr v30.2d,v20.2d,#26 + and v20.16b,v20.16b,v31.16b + add v21.2d,v21.2d,v30.2d // h1 -> h2 + + add v19.2d,v19.2d,v29.2d + shl v29.2d,v29.2d,#2 + ushr v30.2d,v21.2d,#26 + and v21.16b,v21.16b,v31.16b + add v19.2d,v19.2d,v29.2d // h4 -> h0 + add v22.2d,v22.2d,v30.2d // h2 -> h3 + + ushr v29.2d,v19.2d,#26 + and v19.16b,v19.16b,v31.16b + ushr v30.2d,v22.2d,#26 + and v22.16b,v22.16b,v31.16b + add v20.2d,v20.2d,v29.2d // h0 -> h1 + add v23.2d,v23.2d,v30.2d // h3 -> h4 + + //////////////////////////////////////////////////////////////// + // write the result, can be partially reduced + + st4 {v19.s,v20.s,v21.s,v22.s}[0],[x0],#16 + st1 {v23.s}[0],[x0] + +.Lno_data_neon: + ldr x29,[sp],#80 + ret +ENDPROC(poly1305_blocks_neon) + +.align 5 +ENTRY(poly1305_emit_neon) + ldr x17,[x0,#24] + cbz x17,poly1305_emit_arm + + ldp w10,w11,[x0] // load hash value base 2^26 + ldp w12,w13,[x0,#8] + ldr w14,[x0,#16] + + add x4,x10,x11,lsl#26 // base 2^26 -> base 2^64 + lsr x5,x12,#12 + adds x4,x4,x12,lsl#52 + add x5,x5,x13,lsl#14 + adc x5,x5,xzr + lsr x6,x14,#24 + adds x5,x5,x14,lsl#40 + adc x6,x6,xzr // can be partially reduced... + + ldp x10,x11,[x2] // load nonce + + and x12,x6,#-4 // ... 
so reduce + add x12,x12,x6,lsr#2 + and x6,x6,#3 + adds x4,x4,x12 + adcs x5,x5,xzr + adc x6,x6,xzr + + adds x12,x4,#5 // compare to modulus + adcs x13,x5,xzr + adc x14,x6,xzr + + tst x14,#-4 // see if it's carried/borrowed + + csel x4,x4,x12,eq + csel x5,x5,x13,eq + +#ifdef __ARMEB__ + ror x10,x10,#32 // flip nonce words + ror x11,x11,#32 +#endif + adds x4,x4,x10 // accumulate nonce + adc x5,x5,x11 +#ifdef __ARMEB__ + rev x4,x4 // flip output bytes + rev x5,x5 +#endif + stp x4,x5,[x1] // write result + + ret +ENDPROC(poly1305_emit_neon) + +.align 5 +.Lzeros: +.long 0,0,0,0,0,0,0,0 diff --git a/crypto/poly1305_x64_gas.s b/crypto/poly1305_x64_gas.s old mode 100644 new mode 100755 diff --git a/crypto_ops.h b/crypto_ops.h index 4c72280..09b598f 100644 --- a/crypto_ops.h +++ b/crypto_ops.h @@ -7,6 +7,7 @@ #include "tunsafe_types.h" #include + #if defined(COMPILER_MSVC) #include #endif // defined(COMPILER_MSVC) diff --git a/downarrow.bmp b/downarrow.bmp new file mode 100644 index 0000000000000000000000000000000000000000..02370440e2cc355990c1d271a8a6d9b23d0fdd9d GIT binary patch literal 142 zcmZ?r?PGv|G9aY^#B5N^$iM=YkN}cG+z>1PWP>mWI5{~n?A^PU;s5{tBx7SGV+LbE VV+JDzWd=hAVFm>TCZM^D3;^y)E#LqE literal 0 HcmV?d00001 diff --git a/installer/ChangeLog.txt b/installer/ChangeLog.txt index 3cb9202..22120bd 100644 --- a/installer/ChangeLog.txt +++ b/installer/ChangeLog.txt @@ -1,3 +1,26 @@ +2018-08-11 - TunSafe v1.4-rc1 +1.Subfolders in the Config/ directory now show up as submenus. +2.Added a way to run TunSafe as a Windows Service. + Foreground Mode: The service will disconnect when TunSafe closes. + Background Mode: The service will stay connected in the background. + No longer required to run the TunSafe client as Admin as long as + the service is running. +3.New config setting [Interface].ExcludedIPs to configure IPs that + should not be routed through TunSafe. 
+4.Can now automatically start TunSafe when Windows starts +5.New UI with tabs and graphs +6.Cache DNS queries to ensure DNS will succeed if connection fails +7.Recreate tray icon when explorer.exe restarts +8.Renamed window title to TunSafe instead of TunSafe VPN Client +9.Main window is now resizable +10.Disallow roaming endpoint when using AllowedIPs=0.0.0.0/0 + Only the original endpoint is added in the routing table so + this would result in an endless loop of packets. +11.Display approximate WireGuard framing overhead in stats +12.Preparations for protocol handling with multiple threads +13.Delete the routes we made when disconnecting +14.Fix error message about unable to delete a route when connecting + 2018-06-20 - TunSafe v1.3-rc3 Changes: diff --git a/installer/servicelib.nsh b/installer/servicelib.nsh new file mode 100644 index 0000000..7796a58 --- /dev/null +++ b/installer/servicelib.nsh @@ -0,0 +1,419 @@ +; NSIS SERVICE LIBRARY - servicelib.nsh +; Version 1.8.1 - Jun 21st, 2013 +; Questions/Comments - dselkirk@hotmail.com +; +; Description: +; Provides an interface to Windows services +; +; Inputs: +; action - systemlib action i.e. create, delete, start, stop, pause, +; continue, installed, running, status +; name - name of service to manipulate +; param - action parameters; usage: var1=value1;var2=value2;...etc. +; (don't forget to add a ';' after the last value!) +; +; Actions: +; create - creates a new Windows service +; Parameters: +; path - path to service executable +; autostart - automatically start with system i.e. 1|0 +; interact - interact with the desktop i.e.
1|0 +; depend - service dependencies +; user - user that runs the service +; password - password of the above user +; display - display name in service's console +; description - Description of service +; starttype - start type (supersedes autostart) +; servicetype - service type (supersedes interact) +; +; delete - deletes a windows service +; start - start a stopped windows service +; stop - stops a running windows service +; pause - pauses a running windows service +; continue - continues a paused windows service +; installed - is the provided service installed +; Parameters: +; action - if true then invokes the specified action +; running - is the provided service running +; Parameters: +; action - if true then invokes the specified action +; status - check the status of the provided service +; +; Usage: +; Method 1: +; Push "action" +; Push "name" +; Push "param" +; Call Service +; Pop $0 ;response +; +; Method 2: +; !insertmacro SERVICE "action" "name" "param" +; +; History: +; 1.0 - 09/15/2003 - Initial release +; 1.1 - 09/16/2003 - Changed &l to i, thx brainsucker +; 1.2 - 02/29/2004 - Fixed documentation. 
+; 1.3 - 01/05/2006 - Fixed interactive flag and pop order (Kichik)
+; 1.4 - 12/07/2006 - Added display and depend, fixed datatypes (Vitoco)
+; 1.5 - 06/25/2008 - Added description of service.(DeSafe.com/liuqixing#gmail.com)
+; 1.5.1 - 06/12/2009 - Added use of __UNINSTALL__
+; 1.6 - 08/02/2010 - Fixed description implementation (Anders)
+; 1.7 - 04/11/2010 - Added get running service process id (Nico)
+; 1.8 - 24/03/2011 - Added starttype and servicetype (Sergius)
+; 1.8.1 - 21/06/2013 - Added dynamic ASCII & Unicode support (Zinthose)
+
+!ifndef SERVICELIB
+  !define SERVICELIB
+
+  !define SC_MANAGER_ALL_ACCESS 0x3F
+  !define SC_STATUS_PROCESS_INFO 0x0
+  !define SERVICE_ALL_ACCESS 0xF01FF
+
+  !define SERVICE_CONTROL_STOP 1
+  !define SERVICE_CONTROL_PAUSE 2
+  !define SERVICE_CONTROL_CONTINUE 3
+
+  !define SERVICE_STOPPED 0x1
+  !define SERVICE_START_PENDING 0x2
+  !define SERVICE_STOP_PENDING 0x3
+  !define SERVICE_RUNNING 0x4
+  !define SERVICE_CONTINUE_PENDING 0x5
+  !define SERVICE_PAUSE_PENDING 0x6
+  !define SERVICE_PAUSED 0x7
+
+  !define SERVICE_KERNEL_DRIVER 0x00000001
+  !define SERVICE_FILE_SYSTEM_DRIVER 0x00000002
+  !define SERVICE_WIN32_OWN_PROCESS 0x00000010
+  !define SERVICE_WIN32_SHARE_PROCESS 0x00000020
+  !define SERVICE_INTERACTIVE_PROCESS 0x00000100
+
+
+  !define SERVICE_BOOT_START 0x00000000
+  !define SERVICE_SYSTEM_START 0x00000001
+  !define SERVICE_AUTO_START 0x00000002
+  !define SERVICE_DEMAND_START 0x00000003
+  !define SERVICE_DISABLED 0x00000004
+
+  ## Added by Zinthose for Native Unicode Support
+  !ifdef NSIS_UNICODE
+    !define APITAG "W"
+  !else
+    !define APITAG "A"
+  !endif
+
+  !macro SERVICE ACTION NAME PARAM
+    Push '${ACTION}' ; pushed in the order FUNC_SERVICE pops them back: action, name, param
+    Push '${NAME}'
+    Push '${PARAM}'
+    !ifdef __UNINSTALL__
+      Call un.Service
+    !else
+      Call Service
+    !endif
+  !macroend
+
+  !macro FUNC_GETPARAM
+    Push $0 ; save $0-$7; the two arguments end up below them on the stack
+    Push $1
+    Push $2
+    Push $3
+    Push $4
+    Push $5
+    Push $6
+    Push $7
+    Exch 8
+    Pop $1 ;name
+    Exch 8
+    Pop $2 ;source
+    StrCpy $0 "" ; result stays empty unless name= is found and terminated by a semicolon
+    StrLen $7 $2
+    StrCpy $3 0 ; $3 = scan position inside source
+    lbl_loop:
+      IntCmp $3 $7 0 0 lbl_done
+      StrLen $4 "$1="
+      StrCpy $5 $2 $4 $3
+      StrCmp $5 "$1=" 0 lbl_next
+      IntOp $5 $3 + $4 ; $5 = index of the first character of the value
+      StrCpy $3 $5
+      lbl_loop2:
+        IntCmp $3 $7 0 0 lbl_done
+        StrCpy $6 $2 1 $3
+        StrCmp $6 ";" 0 lbl_next2
+        IntOp $6 $3 - $5 ; value length = position of the semicolon - value start
+        StrCpy $0 $2 $6 $5
+        Goto lbl_done
+      lbl_next2:
+        IntOp $3 $3 + 1
+        Goto lbl_loop2
+    lbl_next:
+      IntOp $3 $3 + 1
+      Goto lbl_loop
+    lbl_done:
+      Pop $5
+      Pop $4
+      Pop $3
+      Pop $2
+      Pop $1
+      Exch 2
+      Pop $6
+      Pop $7
+      Exch $0 ; restore $0 and leave the result on the stack
+  !macroend
+
+  !macro CALL_GETPARAM VAR NAME DEFAULT LABEL
+    Push $1 ; $1 = the param string of the enclosing FUNC_SERVICE
+    Push ${NAME}
+    Call ${UN}GETPARAM
+    Pop $6 ; note: clobbers $6
+    StrCpy ${VAR} "${DEFAULT}"
+    StrCmp $6 "" "${LABEL}" 0 ; value absent: keep the default and jump to LABEL
+    StrCpy ${VAR} $6
+  !macroend
+
+  !macro FUNC_SERVICE UN
+    Push $0
+    Push $1
+    Push $2
+    Push $3
+    Push $4
+    Push $5
+    Push $6
+    Push $7
+    Exch 8
+    Pop $1 ;param
+    Exch 8
+    Pop $2 ;name
+    Exch 8
+    Pop $3 ;action
+    ;$0 return
+    ;$4 OpenSCManager
+    ;$5 OpenService
+
+    StrCpy $0 "false"
+    StrCpy $5 0 ; no service handle yet, so lbl_done never closes a stale caller value
+    System::Call 'advapi32::OpenSCManager${APITAG}(n, n, i ${SC_MANAGER_ALL_ACCESS}) i.r4'
+    IntCmp $4 0 lbl_done
+    StrCmp $3 "create" lbl_create
+    System::Call 'advapi32::OpenService${APITAG}(i r4, t r2, i ${SERVICE_ALL_ACCESS}) i.r5'
+    IntCmp $5 0 lbl_done
+
+    lbl_select:
+    StrCmp $3 "delete" lbl_delete
+    StrCmp $3 "start" lbl_start
+    StrCmp $3 "stop" lbl_stop
+    StrCmp $3 "pause" lbl_pause
+    StrCmp $3 "continue" lbl_continue
+    StrCmp $3 "installed" lbl_installed
+    StrCmp $3 "running" lbl_running
+    StrCmp $3 "status" lbl_status
+    StrCmp $3 "processid" lbl_processid
+    Goto lbl_done
+
+    ; create service
+    lbl_create:
+      Push $R1 ;depend
+      Push $R2 ;user
+      Push $R3 ;password
+      Push $R4 ;servicetype/interact
+      Push $R5 ;starttype/autostart
+      Push $R6 ;path
+      Push $R7 ;display
+      Push $R8 ;description
+
+      !insertmacro CALL_GETPARAM $R1 "depend" "n" "lbl_depend"
+        StrCpy $R1 't "$R1"'
+      lbl_depend:
+      StrCmp $R1 "n" 0 lbl_machine ;old name of depend param
+      !insertmacro CALL_GETPARAM $R1 "machine" "n" "lbl_machine"
+        StrCpy $R1 't "$R1"'
+      lbl_machine:
+
+      !insertmacro CALL_GETPARAM $R2 "user" "n" "lbl_user"
+        StrCpy $R2 't "$R2"'
+      lbl_user:
+
+      !insertmacro CALL_GETPARAM $R3 "password" "n" "lbl_password"
+        StrCpy $R3 't "$R3"'
+      lbl_password:
+
+      !insertmacro CALL_GETPARAM $R4 "interact" "${SERVICE_WIN32_OWN_PROCESS}" "lbl_interact"
+        StrCpy $6 ${SERVICE_WIN32_OWN_PROCESS}
+        IntCmp $R4 0 +2
+          IntOp $6 $6 | ${SERVICE_INTERACTIVE_PROCESS}
+        StrCpy $R4 $6
+      lbl_interact:
+
+      !insertmacro CALL_GETPARAM $R4 "servicetype" "$R4" "lbl_servicetype"
+      lbl_servicetype:
+
+      !insertmacro CALL_GETPARAM $R5 "autostart" "${SERVICE_DEMAND_START}" "lbl_autostart"
+        StrCpy $6 ${SERVICE_DEMAND_START}
+        IntCmp $R5 0 +2
+          StrCpy $6 ${SERVICE_AUTO_START}
+        StrCpy $R5 $6
+      lbl_autostart:
+
+      !insertmacro CALL_GETPARAM $R5 "starttype" "$R5" "lbl_starttype"
+      lbl_starttype:
+
+      !insertmacro CALL_GETPARAM $R6 "path" "n" "lbl_path"
+      lbl_path:
+
+      !insertmacro CALL_GETPARAM $R7 "display" "$2" "lbl_display"
+      lbl_display:
+
+      !insertmacro CALL_GETPARAM $R8 "description" "$2" "lbl_description"
+      lbl_description:
+
+      System::Call 'advapi32::CreateService${APITAG}(i r4, t r2, t R7, i ${SERVICE_ALL_ACCESS}, \
+        i R4, i R5, i 0, t R6, n, n, $R1, $R2, $R3) i.r6'
+
+      ; write description of service (SERVICE_CONFIG_DESCRIPTION)
+      System::Call 'advapi32::ChangeServiceConfig2${APITAG}(ir6,i1,*t "$R8")i.R7'
+      strcmp $R7 "error" 0 lbl_descriptioncomplete
+        WriteRegStr HKLM "SYSTEM\CurrentControlSet\Services\$2" "Description" $R8
+      lbl_descriptioncomplete:
+
+      Pop $R8
+      Pop $R7
+      Pop $R6
+      Pop $R5
+      Pop $R4
+      Pop $R3
+      Pop $R2
+      Pop $R1
+      StrCpy $5 $6 ; hand the new service handle (0 on failure) to lbl_done so it gets closed
+      StrCmp $6 0 lbl_done lbl_good
+
+    ; delete service
+    lbl_delete:
+      System::Call 'advapi32::DeleteService(i r5) i.r6'
+      StrCmp $6 0 lbl_done lbl_good
+
+    ; start service
+    lbl_start:
+      System::Call 'advapi32::StartService${APITAG}(i r5, i 0, i 0) i.r6'
+      StrCmp $6 0 lbl_done lbl_good
+
+    ; stop service
+    lbl_stop:
+      Push $R1
+      System::Call '*(i,i,i,i,i,i,i) i.R1'
+      System::Call 'advapi32::ControlService(i r5, i ${SERVICE_CONTROL_STOP}, i $R1) i.r6' ; keep the BOOL result for the check below
+      System::Free $R1
+      Pop $R1
+      StrCmp $6 0 lbl_done lbl_good
+
+    ; pause service
+    lbl_pause:
+      Push $R1
+      System::Call '*(i,i,i,i,i,i,i) i.R1'
+      System::Call 'advapi32::ControlService(i r5, i ${SERVICE_CONTROL_PAUSE}, i $R1) i.r6' ; keep the BOOL result for the check below
+      System::Free $R1
+      Pop $R1
+      StrCmp $6 0 lbl_done lbl_good
+
+    ; continue service
+    lbl_continue:
+      Push $R1
+      System::Call '*(i,i,i,i,i,i,i) i.R1'
+      System::Call 'advapi32::ControlService(i r5, i ${SERVICE_CONTROL_CONTINUE}, i $R1) i.r6' ; keep the BOOL result for the check below
+      System::Free $R1
+      Pop $R1
+      StrCmp $6 0 lbl_done lbl_good
+
+    ; is installed
+    lbl_installed:
+      !insertmacro CALL_GETPARAM $7 "action" "" "lbl_good"
+        StrCpy $3 $7
+        Goto lbl_select
+
+    ; is service running
+    lbl_running:
+      Push $R1
+      System::Call '*(i,i,i,i,i,i,i) i.R1'
+      System::Call 'advapi32::QueryServiceStatus(i r5, i $R1) i'
+      System::Call '*$R1(i, i.r6)' ; second int of the status buffer -> $6
+      System::Free $R1
+      Pop $R1
+      IntFmt $6 "0x%X" $6
+      StrCmp $6 ${SERVICE_RUNNING} 0 lbl_done
+      !insertmacro CALL_GETPARAM $7 "action" "" "lbl_good"
+        StrCpy $3 $7
+        Goto lbl_select
+
+    lbl_status:
+      Push $R1
+      System::Call '*(i,i,i,i,i,i,i) i.R1'
+      System::Call 'advapi32::QueryServiceStatus(i r5, i $R1) i'
+      System::Call '*$R1(i, i .r6)'
+      System::Free $R1
+      Pop $R1
+      IntFmt $6 "0x%X" $6
+      StrCpy $0 "running"
+      IntCmp $6 ${SERVICE_RUNNING} lbl_done
+      StrCpy $0 "stopped"
+      IntCmp $6 ${SERVICE_STOPPED} lbl_done
+      StrCpy $0 "start_pending"
+      IntCmp $6 ${SERVICE_START_PENDING} lbl_done
+      StrCpy $0 "stop_pending"
+      IntCmp $6 ${SERVICE_STOP_PENDING} lbl_done
+      StrCpy $0 "continue_pending"
+      IntCmp $6 ${SERVICE_CONTINUE_PENDING} lbl_done
+      StrCpy $0 "pause_pending"
+      IntCmp $6 ${SERVICE_PAUSE_PENDING} lbl_done
+      StrCpy $0 "paused"
+      IntCmp $6 ${SERVICE_PAUSED} lbl_done
+      StrCpy $0 "unknown"
+      Goto lbl_done
+
+    lbl_processid:
+      Push $R1
+      Push $R2
+      System::Call '*(i,i,i,i,i,i,i,i,i) i.R1'
+      System::Call '*(i 0) i.R2'
+      System::Call "advapi32::QueryServiceStatusEx(i r5, i ${SC_STATUS_PROCESS_INFO}, i $R1, i 36, i $R2) i"
+      System::Call "*$R1(i,i,i,i,i,i,i, i .r0)" ; eighth int of the buffer becomes the return value
+      System::Free $R2
+      System::Free $R1
+      Pop $R2
+      Pop $R1
+      Goto lbl_done
+
+    lbl_good:
+      StrCpy $0 "true"
+    lbl_done:
+      IntCmp $5 0 +2
+        System::Call 'advapi32::CloseServiceHandle(i r5) n'
+      IntCmp $4 0 +2
+        System::Call 'advapi32::CloseServiceHandle(i r4) n'
+      Pop $4
+      Pop $3
+      Pop $2
+      Pop $1
+      Exch 3
+      Pop $5
+      Pop $7
+      Pop $6
+      Exch $0
+  !macroend
+
+  Function Service
+    !insertmacro FUNC_SERVICE ""
+  FunctionEnd
+
+  Function un.Service
+    !insertmacro FUNC_SERVICE "un."
+  FunctionEnd
+
+  Function GetParam
+    !insertmacro FUNC_GETPARAM
+  FunctionEnd
+
+  Function un.GetParam
+    !insertmacro FUNC_GETPARAM
+  FunctionEnd
+
+  !undef APITAG
+!endif
\ No newline at end of file diff --git a/installer/tunsafe.nsi b/installer/tunsafe.nsi index 7b77322..044f485 100644 --- a/installer/tunsafe.nsi +++ b/installer/tunsafe.nsi @@ -9,6 +9,7 @@ SetCompressor /SOLID lzma !include "x64.nsh" !define MULTIUSER_EXECUTIONLEVEL Admin !include "MultiUser.nsh" +!include "servicelib.nsh" !insertmacro GetParameters !insertmacro GetOptions @@ -130,6 +131,7 @@ again: Sleep 500 Goto again done: + !insertmacro SERVICE stop TunSafeService "" FunctionEnd Function .onInit @@ -198,6 +200,10 @@ Function un.onInit FunctionEnd Section "Uninstall" + !insertmacro SERVICE stop "TunSafeService" "" + !insertmacro SERVICE delete "TunSafeService" "" + + Delete "$INSTDIR\TunSafe.exe" Delete "$INSTDIR\License.txt" Delete "$INSTDIR\ChangeLog.txt" diff --git a/ip_to_peer_map.cpp b/ip_to_peer_map.cpp new file mode 100644 index 0000000..4210e66 --- /dev/null +++ b/ip_to_peer_map.cpp @@ -0,0 +1,97 @@ +// SPDX-License-Identifier: AGPL-1.0-only +// Copyright (C) 2018 Ludvig Strigeus . All Rights Reserved.
+#include "stdafx.h"
+#include "ip_to_peer_map.h"
+#include "bit_ops.h"
+#include <string.h>
+
+IpToPeerMap::IpToPeerMap() {}
+
+IpToPeerMap::~IpToPeerMap() {}
+
+bool IpToPeerMap::InsertV4(const void *addr, int cidr, void *peer) {
+  if (cidr < 0 || cidr > 32)
+    return false;  // a shift by an out-of-range prefix length would be undefined
+  uint32 mask = cidr == 32 ? 0xffffffff : ~(0xffffffff >> cidr);
+  Entry4 e = {ReadBE32(addr) & mask, mask, peer};
+  ipv4_.push_back(e);
+  return true;
+}
+
+bool IpToPeerMap::InsertV6(const void *addr, int cidr, void *peer) {
+  if (cidr < 0 || cidr > 128)
+    return false;
+  Entry6 e;
+  e.cidr_len = (uint8)cidr;
+  e.peer = peer;
+  memcpy(e.ip, addr, 16);  // stored as given; host bits are not masked off
+  ipv6_.push_back(e);
+  return true;
+}
+
+void *IpToPeerMap::LookupV4(uint32 ip) {
+  uint32 best_mask = 0;
+  void *best_peer = NULL;
+  for (auto it = ipv4_.begin(); it != ipv4_.end(); ++it) {
+    if (it->ip == (ip & it->mask) && it->mask >= best_mask) {
+      best_mask = it->mask;
+      best_peer = it->peer;
+    }
+  }
+  return best_peer;
+}
+
+void *IpToPeerMap::LookupV4DefaultPeer() {
+  for (auto it = ipv4_.begin(); it != ipv4_.end(); ++it) {
+    if (it->mask == 0)
+      return it->peer;
+  }
+  return NULL;
+}
+
+void *IpToPeerMap::LookupV6DefaultPeer() {
+  for (auto it = ipv6_.begin(); it != ipv6_.end(); ++it) {
+    if (it->cidr_len == 0)
+      return it->peer;
+  }
+  return NULL;
+}
+
+// Number of leading bits on which the 16-byte addresses |a| and |b| agree.
+// NOTE(review): assumes FindHighestSetBit64() is 1-based and yields 0 for 0 - confirm in bit_ops.h.
+static int CalculateIPv6CommonPrefix(const uint8 *a, const uint8 *b) {
+  uint64 a_hi, a_lo, b_hi, b_lo;
+  memcpy(&a_hi, a, 8);  // memcpy instead of a cast: the pointers need not be 8-byte aligned
+  memcpy(&a_lo, a + 8, 8);
+  memcpy(&b_hi, b, 8);
+  memcpy(&b_lo, b + 8, 8);
+  uint64 x = ToBE64(a_hi ^ b_hi);
+  uint64 y = ToBE64(a_lo ^ b_lo);
+  return x ? 64 - FindHighestSetBit64(x) : 128 - FindHighestSetBit64(y);
+}
+
+void *IpToPeerMap::LookupV6(const void *addr) {
+  int best_len = 0;
+  void *best_peer = NULL;
+  for (auto it = ipv6_.begin(); it != ipv6_.end(); ++it) {
+    int len = CalculateIPv6CommonPrefix((const uint8*)addr, it->ip);
+    // Rank matches by their own prefix length. The number of bits that happen to agree with
+    // the stored address does not say which prefix is longer (e.g. fd00::/8 vs fd00::/64).
+    if (len >= it->cidr_len && it->cidr_len >= best_len) {
+      best_len = it->cidr_len;
+      best_peer = it->peer;
+    }
+  }
+  return best_peer;
+}
+
+void IpToPeerMap::RemovePeer(void *peer) {
+  // Compact by index: taking &v[0] of an empty vector would be undefined behaviour.
+  size_t n4 = 0, n6 = 0;
+  for (size_t i = 0; i != ipv4_.size(); i++)
+    if (ipv4_[i].peer != peer) ipv4_[n4++] = ipv4_[i];
+  ipv4_.resize(n4);
+  for (size_t i = 0; i != ipv6_.size(); i++)
+    if (ipv6_[i].peer != peer) ipv6_[n6++] = ipv6_[i];
+  ipv6_.resize(n6);
+}
\ No newline at end of file
diff --git a/ip_to_peer_map.h b/ip_to_peer_map.h
new file mode 100644
index 0000000..476f8cb
--- /dev/null
+++ b/ip_to_peer_map.h
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: AGPL-1.0-only
+// Copyright (C) 2018 Ludvig Strigeus . All Rights Reserved.
+#pragma once
+
+#include "tunsafe_types.h"
+#include <vector>
+
+// Maps CIDR addresses to a peer, always returning the longest match
+// Slow O(n) implementation
+class IpToPeerMap {
+public:
+  IpToPeerMap();
+  ~IpToPeerMap();
+
+  // Inserts an IP address of a given CIDR length into the lookup table, pointing to peer.
+  // |addr| points at 4 (V4) or 16 (V6) address bytes, most significant byte first.
+  // Returns false and inserts nothing if |cidr| is outside 0..32 (V4) or 0..128 (V6).
+  bool InsertV4(const void *addr, int cidr, void *peer);
+  bool InsertV6(const void *addr, int cidr, void *peer);
+
+  // Lookup the peer matching the IP Address. Returns the peer of the longest matching prefix
+  // (the most recently inserted one among equally long prefixes), or NULL if nothing matches.
+  // |ip| is the address as a host-order integer; |addr| points at 16 address bytes.
+  void *LookupV4(uint32 ip);
+  void *LookupV6(const void *addr);
+
+  // Returns the first peer that was inserted with a prefix length of 0, or NULL if none.
+  void *LookupV4DefaultPeer();
+  void *LookupV6DefaultPeer();
+
+  // Removes every entry that points to |peer|.
+  void RemovePeer(void *peer);
+private:
+  struct Entry4 {
+    uint32 ip;    // host-order address with the host bits already cleared
+    uint32 mask;  // netmask derived from the prefix length
+    void *peer;
+  };
+  struct Entry6 {
+    uint8 ip[16];    // address bytes exactly as passed to InsertV6
+    uint8 cidr_len;  // prefix length, 0..128
+    void *peer;
+  };
+  std::vector<Entry4> ipv4_;
+  std::vector<Entry6> ipv6_;
+};
diff --git a/ipzip2/ipzip2.cpp b/ipzip2/ipzip2.cpp
index 1b23962..5ed17a3 100644
--- a/ipzip2/ipzip2.cpp
+++ b/ipzip2/ipzip2.cpp
@@ -1 +1,2 @@
-// this is a placeholder for a packet compression algorithm not yet released.
\ No newline at end of file
+#include "stdafx.h"
+// this is a placeholder for a packet compression algorithm not yet released.
diff --git a/ipzip2/ipzip2.h b/ipzip2/ipzip2.h
new file mode 100644
index 0000000..bfcd94b
--- /dev/null
+++ b/ipzip2/ipzip2.h
@@ -0,0 +1 @@
+// this is a placeholder for a packet compression algorithm not yet released.
diff --git a/netapi.h b/netapi.h index 56af4f6..4dfc8e1 100644 --- a/netapi.h +++ b/netapi.h @@ -121,6 +121,9 @@ public: // This holds all cidr addresses to add as additional routing entries std::vector extra_routes; + // This holds all the ips to exclude + std::vector excluded_ips; + // This holds the pre/post commands PrePostCommands pre_post_commands; }; diff --git a/network_bsd_common.cpp b/network_bsd_common.cpp index 479dff1..161f70f 100644 --- a/network_bsd_common.cpp +++ b/network_bsd_common.cpp @@ -41,6 +41,11 @@ #include #endif +void tunsafe_die(const char *msg) { + fprintf(stderr, "%s\n", msg); + exit(1); +} + void SetThreadName(const char *name) { #if defined(OS_LINUX) prctl(PR_SET_NAME, name, 0, 0, 0); @@ -438,11 +443,11 @@ static void ComputeIpv6DefaultRoute(const uint8 *ipv6_address, uint8 ipv6_cidr, default_route_v6[15] ^= 3; } -void TunsafeBackendBsd::AddRoute(uint32 ip, uint32 cidr, uint32 gw) { +void TunsafeBackendBsd::AddRoute(uint32 ip, uint32 cidr, uint32 gw, const char *dev) { uint32 ip_be, gw_be; WriteBE32(&ip_be, ip); WriteBE32(&gw_be, gw); - AddRoute(AF_INET, &ip_be, cidr, &gw_be); + AddRoute(AF_INET, &ip_be, cidr, &gw_be, dev); } static void AddOrRemoveRoute(const RouteInfo &cd, bool remove) { @@ -452,13 +457,12 @@ static void AddOrRemoveRoute(const RouteInfo &cd, bool remove) { print_ip_prefix(buf2, cd.family, cd.gw, -1); #if defined(OS_LINUX) - const char *cmd = remove ? "delete" : "add"; - if (cd.family == AF_INET) { - const char *net_or_host = (cd.cidr == 32) ? "-host" : "-net"; - RunCommand("/sbin/route %s %s %s gw %s", cmd, net_or_host, buf1, buf2); + const char *cmd = remove ? "del" : "add"; + const char *proto = (cd.family == AF_INET) ? NULL : "-6"; + if (cd.dev.empty()) { + RunCommand("/sbin/ip %s route %s %s via %s", proto, cmd, buf1, buf2); } else { - const char *net_or_host = (cd.cidr == 128) ? 
"-host" : "-net"; - RunCommand("/sbin/route %s %s inet6 %s gw %s", cmd, net_or_host, buf1, buf2); + RunCommand("/sbin/ip %s route %s %s dev %s", proto, cmd, buf1, cd.dev.c_str()); } #elif defined(OS_MACOSX) || defined(OS_FREEBSD) const char *cmd = remove ? "delete" : "add"; @@ -470,9 +474,10 @@ static void AddOrRemoveRoute(const RouteInfo &cd, bool remove) { #endif } -bool TunsafeBackendBsd::AddRoute(int family, const void *dest, int dest_prefix, const void *gateway) { +bool TunsafeBackendBsd::AddRoute(int family, const void *dest, int dest_prefix, const void *gateway, const char *dev) { RouteInfo c; + c.dev = dev ? dev : ""; c.family = family; size_t len = (family == AF_INET) ? 4 : 16; memcpy(c.ip, dest, len); @@ -493,7 +498,6 @@ static bool IsIpv6AddressSet(const void *p) { // Called to initialize tun bool TunsafeBackendBsd::Initialize(const TunConfig &&config, TunConfigOut *out) override { - char def_iface[12]; char devname[16]; if (!RunPrePostCommand(config.pre_post_commands.pre_up)) { @@ -513,20 +517,24 @@ bool TunsafeBackendBsd::Initialize(const TunConfig &&config, TunConfigOut *out) uint32 default_route_v4 = ComputeIpv4DefaultRoute(config.ip, netmask); RunCommand("/sbin/ifconfig %s %A mtu %d %A netmask %A up", devname, config.ip, config.mtu, config.ip, netmask); - AddRoute(config.ip & netmask, config.cidr, config.ip); + AddRoute(config.ip & netmask, config.cidr, config.ip, devname); if (config.use_ipv4_default_route) { if (config.default_route_endpoint_v4) { - uint32 gw; - if (!GetDefaultRoute(def_iface, sizeof(def_iface), &gw)) { + uint32 ipv4_default_gw; + char default_iface[16]; + if (!GetDefaultRoute(default_iface, sizeof(default_iface), &ipv4_default_gw)) { RERROR("Unable to determine default interface."); return false; } - AddRoute(config.default_route_endpoint_v4, 32, gw); - + AddRoute(config.default_route_endpoint_v4, 32, ipv4_default_gw, NULL); + for (auto it = config.excluded_ips.begin(); it != config.excluded_ips.end(); ++it) { + if (it->size == 
32) + AddRoute(ReadBE32(it->addr), it->cidr, ipv4_default_gw, default_iface); + } } - AddRoute(0x00000000, 1, default_route_v4); - AddRoute(0x80000000, 1, default_route_v4); + AddRoute(0x00000000, 1, default_route_v4, devname); + AddRoute(0x80000000, 1, default_route_v4, devname); } uint8 default_route_v6[16]; @@ -537,23 +545,23 @@ bool TunsafeBackendBsd::Initialize(const TunConfig &&config, TunConfigOut *out) ComputeIpv6DefaultRoute(config.ipv6_address, config.ipv6_cidr, default_route_v6); - RunCommand("/sbin/ifconfig %s inet6 %s", devname, print_ip_prefix(buf, AF_INET6, config.ipv6_address, config.ipv6_cidr)); + RunCommand("/sbin/ifconfig %s inet6 add %s", devname, print_ip_prefix(buf, AF_INET6, config.ipv6_address, config.ipv6_cidr)); if (config.use_ipv6_default_route) { if (IsIpv6AddressSet(config.default_route_endpoint_v6)) { RERROR("default_route_endpoint_v6 not supported"); } - AddRoute(AF_INET6, matchall_1_route + 1, 1, default_route_v6); - AddRoute(AF_INET6, matchall_1_route + 0, 1, default_route_v6); + AddRoute(AF_INET6, matchall_1_route + 1, 1, default_route_v6, devname); + AddRoute(AF_INET6, matchall_1_route + 0, 1, default_route_v6, devname); } } // Add all the extra routes for (auto it = config.extra_routes.begin(); it != config.extra_routes.end(); ++it) { if (it->size == 32) { - AddRoute(ReadBE32(it->addr), it->cidr, default_route_v4); + AddRoute(ReadBE32(it->addr), it->cidr, default_route_v4, devname); } else if (it->size == 128 && config.ipv6_cidr) { - AddRoute(AF_INET6, it->addr, it->cidr, default_route_v6); + AddRoute(AF_INET6, it->addr, it->cidr, default_route_v6, devname); } } @@ -688,34 +696,38 @@ void InitCpuFeatures(); void Benchmark(); -uint32 g_ui_ip; - const char *print_ip(char buf[kSizeOfAddress], in_addr_t ip) { snprintf(buf, kSizeOfAddress, "%d.%d.%d.%d", (ip >> 24) & 0xff, (ip >> 16) & 0xff, (ip >> 8) & 0xff, (ip >> 0) & 0xff); return buf; } - class MyProcessorDelegate : public ProcessorDelegate { public: - virtual void 
OnConnected(in_addr_t my_ip) { - if (my_ip != g_ui_ip) { - if (my_ip) { - char buf[kSizeOfAddress]; - print_ip(buf, my_ip); - RINFO("Connection established. IP %s", buf); - } - g_ui_ip = my_ip; + MyProcessorDelegate() { + wg_processor_ = NULL; + is_connected_ = false; + } + + virtual void OnConnected() override { + if (!is_connected_) { + uint32 ipv4_ip = ReadBE32(wg_processor_->tun_addr().addr); + char buf[kSizeOfAddress]; + RINFO("Connection established. IP %s", print_ip(buf, ipv4_ip)); + is_connected_ = true; } } - virtual void OnDisconnected() { - MyProcessorDelegate::OnConnected(0); + virtual void OnConnectionRetry(uint32 attempts) override { + if (is_connected_ && attempts >= 3) { + is_connected_ = false; + RINFO("Reconnecting..."); + } } + + WireguardProcessor *wg_processor_; + bool is_connected_; }; int main(int argc, char **argv) { - bool exit_flag = false; - InitCpuFeatures(); if (argc == 2 && strcmp(argv[1], "--benchmark") == 0) { @@ -739,9 +751,12 @@ int main(int argc, char **argv) { MyProcessorDelegate my_procdel; TunsafeBackendBsd *socket_loop = CreateTunsafeBackendBsd(); WireguardProcessor wg(socket_loop, socket_loop, &my_procdel); + + my_procdel.wg_processor_ = &wg; socket_loop->SetProcessor(&wg); - if (!ParseWireGuardConfigFile(&wg, argv[1], &exit_flag)) return 1; + DnsResolver dns_resolver(NULL); + if (!ParseWireGuardConfigFile(&wg, argv[1], &dns_resolver)) return 1; if (!wg.Start()) return 1; socket_loop->RunLoop(); diff --git a/network_bsd_common.h b/network_bsd_common.h index 1db5646..cc14bc6 100644 --- a/network_bsd_common.h +++ b/network_bsd_common.h @@ -6,12 +6,14 @@ #include "netapi.h" #include "wireguard.h" #include "wireguard_config.h" +#include struct RouteInfo { uint8 family; uint8 cidr; uint8 ip[16]; uint8 gw[16]; + std::string dev; }; class TunsafeBackendBsd : public TunInterface, public UdpInterface { @@ -34,9 +36,9 @@ protected: virtual bool InitializeTun(char devname[16]) = 0; virtual void RunLoopInner() = 0; - void AddRoute(uint32 
ip, uint32 cidr, uint32 gw); + void AddRoute(uint32 ip, uint32 cidr, uint32 gw, const char *dev); void DelRoute(const RouteInfo &cd); - bool AddRoute(int family, const void *dest, int dest_prefix, const void *gateway); + bool AddRoute(int family, const void *dest, int dest_prefix, const void *gateway, const char *dev); bool RunPrePostCommand(const std::vector &vec); WireguardProcessor *processor_; diff --git a/network_bsd_mt.cpp b/network_bsd_mt.cpp index 30caaf9..f7134a8 100644 --- a/network_bsd_mt.cpp +++ b/network_bsd_mt.cpp @@ -1,8 +1,11 @@ // SPDX-License-Identifier: AGPL-1.0-only // Copyright (C) 2018 Ludvig Strigeus . All Rights Reserved. +// Note: This is an experimental implementation that doesn't work, there's no way +// for the alarm signal to interrupt the tunsafe main thread. #include "network_bsd_common.h" #include "tunsafe_endian.h" #include "tunsafe_config.h" +#include "tunsafe_threading.h" #include "util.h" #include @@ -91,7 +94,7 @@ private: bool shutting_down_; bool got_sig_alarm_; - pthread_mutex_t lock_; + Mutex lock_; pthread_cond_t cond_; }; @@ -120,7 +123,7 @@ private: bool shutting_down_; - pthread_mutex_t lock_; + Mutex lock_; pthread_cond_t cond_; }; @@ -147,7 +150,7 @@ private: WorkerLoop *worker_; pthread_t read_tid_, write_tid_; Packet *queue_, **queue_end_; - pthread_mutex_t lock_; + Mutex lock_; pthread_cond_t cond_; }; @@ -158,12 +161,11 @@ WorkerLoop::WorkerLoop() { shutting_down_ = false; got_sig_alarm_ = false; processor_ = NULL; - pthread_mutex_init(&lock_, NULL); - pthread_cond_init(&cond_, NULL); + if (pthread_cond_init(&cond_, NULL) != 0) + tunsafe_die("pthread_cond_init failed"); } WorkerLoop::~WorkerLoop() { - pthread_mutex_destroy(&lock_); pthread_cond_destroy(&cond_); } @@ -174,13 +176,14 @@ bool WorkerLoop::Initialize(WireguardProcessor *processor) { void WorkerLoop::StartThread() { assert(tid_ == 0); - pthread_create(&tid_, NULL, &ThreadMainStatic, this); + if (pthread_create(&tid_, NULL, &ThreadMainStatic, this) != 0) 
+ tunsafe_die("pthread_create failed"); } void WorkerLoop::StopThread() { - pthread_mutex_lock(&lock_); + lock_.Acquire(); shutting_down_ = true; - pthread_mutex_unlock(&lock_); + lock_.Release(); if (tid_) { void *x; @@ -198,16 +201,16 @@ void WorkerLoop::NotifyStop() { void WorkerLoop::HandlePacket(Packet *packet, int target) { // RINFO("WorkerLoop::HandlePacket"); packet->post_target = target; - pthread_mutex_lock(&lock_); + lock_.Acquire(); Packet *old_queue = queue_; *queue_end_ = packet; queue_end_ = &packet->next; packet->next = NULL; if (old_queue == NULL) { - pthread_mutex_unlock(&lock_); + lock_.Release(); pthread_cond_signal(&cond_); } else { - pthread_mutex_unlock(&lock_); + lock_.Release(); } } @@ -218,19 +221,19 @@ void *WorkerLoop::ThreadMainStatic(void *x) { void *WorkerLoop::ThreadMain() { Packet *packet_queue; - pthread_mutex_lock(&lock_); + lock_.Acquire(); for (;;) { // Grab the whole list for (;;) { while (got_sig_alarm_) { got_sig_alarm_ = false; - pthread_mutex_unlock(&lock_); + lock_.Release(); processor_->SecondLoop(); - pthread_mutex_lock(&lock_); + lock_.Acquire(); } if (shutting_down_ || queue_ != NULL) break; - pthread_cond_wait(&cond_, &lock_); + pthread_cond_wait(&cond_, lock_.impl()); } if (shutting_down_) break; @@ -238,7 +241,7 @@ void *WorkerLoop::ThreadMain() { queue_ = NULL; queue_end_ = &queue_; - pthread_mutex_unlock(&lock_); + lock_.Release(); // And send all items in the list while (packet_queue != NULL) { Packet *next = packet_queue->next; @@ -249,9 +252,9 @@ void *WorkerLoop::ThreadMain() { } packet_queue = next; } - pthread_mutex_lock(&lock_); + lock_.Acquire(); } - pthread_mutex_unlock(&lock_); + lock_.Release(); return NULL; } @@ -265,14 +268,13 @@ UdpLoop::UdpLoop() { worker_ = NULL; queue_ = NULL; queue_end_ = &queue_; - pthread_mutex_init(&lock_, NULL); - pthread_cond_init(&cond_, NULL); + if (pthread_cond_init(&cond_, NULL) != 0) + tunsafe_die("pthread_cond_init failed"); } UdpLoop::~UdpLoop() { if (fd_ != -1) 
close(fd_); - pthread_mutex_destroy(&lock_); pthread_cond_destroy(&cond_); } @@ -286,16 +288,18 @@ bool UdpLoop::Initialize(int listen_port, WorkerLoop *worker) { } void UdpLoop::Start() { - pthread_create(&read_tid_, NULL, &ReaderMainStatic, this); - pthread_create(&write_tid_, NULL, &WriterMainStatic, this); + if (pthread_create(&read_tid_, NULL, &ReaderMainStatic, this) != 0) + tunsafe_die("pthread_create failed"); + if (pthread_create(&write_tid_, NULL, &WriterMainStatic, this) != 0) + tunsafe_die("pthread_create failed"); } void UdpLoop::Stop() { void *x; - pthread_mutex_lock(&lock_); + lock_.Acquire(); shutting_down_ = true; - pthread_mutex_unlock(&lock_); + lock_.Release(); pthread_cond_signal(&cond_); pthread_kill(read_tid_, SIGUSR1); @@ -345,17 +349,17 @@ void *UdpLoop::ReaderMain() { void *UdpLoop::WriterMain() { Packet *queue; - pthread_mutex_lock(&lock_); + lock_.Acquire(); for (;;) { // Grab the whole list while (!shutting_down_ && queue_ == NULL) - pthread_cond_wait(&cond_, &lock_); + pthread_cond_wait(&cond_, lock_.impl()); if (shutting_down_) break; queue = queue_; queue_ = NULL; queue_end_ = &queue_; - pthread_mutex_unlock(&lock_); + lock_.Release(); // And send all items in the list while (queue != NULL) { int r = sendto(fd_, queue->data, queue->size, 0, @@ -370,9 +374,9 @@ void *UdpLoop::WriterMain() { queue = queue->next; FreePacket(to_free); } - pthread_mutex_lock(&lock_); + lock_.Acquire(); } - pthread_mutex_unlock(&lock_); + lock_.Release(); return NULL; } @@ -380,15 +384,15 @@ void UdpLoop::WriteUdpPacket(Packet *packet) { // RINFO("write udp packet to queue!"); packet->next = NULL; - pthread_mutex_lock(&lock_); + lock_.Acquire(); Packet *old_queue = queue_; *queue_end_ = packet; queue_end_ = &packet->next; if (old_queue == NULL) { - pthread_mutex_unlock(&lock_); + lock_.Release(); pthread_cond_signal(&cond_); } else { - pthread_mutex_unlock(&lock_); + lock_.Release(); } } @@ -400,14 +404,13 @@ TunLoop::TunLoop() { write_tid_ = 0; queue_ = 
NULL; queue_end_ = &queue_; - pthread_mutex_init(&lock_, NULL); - pthread_cond_init(&cond_, NULL); + if (pthread_cond_init(&cond_, NULL) != 0) + tunsafe_die("pthread_cond_init failed"); } TunLoop::~TunLoop() { if (fd_ != -1) close(fd_); - pthread_mutex_destroy(&lock_); pthread_cond_destroy(&cond_); } @@ -421,16 +424,18 @@ bool TunLoop::Initialize(char devname[16], WorkerLoop *worker) { } void TunLoop::Start() { - pthread_create(&read_tid_, NULL, &ReaderMainStatic, this); - pthread_create(&write_tid_, NULL, &WriterMainStatic, this); + if (pthread_create(&read_tid_, NULL, &ReaderMainStatic, this) != 0) + tunsafe_die("pthread_create failed"); + if (pthread_create(&write_tid_, NULL, &WriterMainStatic, this) != 0) + tunsafe_die("pthread_create failed"); } void TunLoop::Stop() { void *x; - pthread_mutex_lock(&lock_); + lock_.Acquire(); shutting_down_ = true; - pthread_mutex_unlock(&lock_); + lock_.Release(); pthread_kill(read_tid_, SIGUSR1); pthread_kill(write_tid_, SIGUSR1); @@ -469,18 +474,18 @@ void *TunLoop::ReaderMain() { void *TunLoop::WriterMain() { Packet *queue; - pthread_mutex_lock(&lock_); + lock_.Acquire(); for (;;) { // Grab the whole list while (!shutting_down_ && queue_ == NULL) { - pthread_cond_wait(&cond_, &lock_); + pthread_cond_wait(&cond_, lock_.impl()); } if (shutting_down_) break; queue = queue_; queue_ = NULL; queue_end_ = &queue_; - pthread_mutex_unlock(&lock_); + lock_.Release(); // And send all items in the list while (queue != NULL) { if (TUN_PREFIX_BYTES) @@ -494,24 +499,24 @@ void *TunLoop::WriterMain() { queue = queue->next; FreePacket(to_free); } - pthread_mutex_lock(&lock_); + lock_.Acquire(); } - pthread_mutex_unlock(&lock_); + lock_.Release(); return NULL; } void TunLoop::WriteTunPacket(Packet *packet) { packet->next = NULL; - pthread_mutex_lock(&lock_); + lock_.Acquire(); Packet *old_queue = queue_; *queue_end_ = packet; queue_end_ = &packet->next; if (old_queue == NULL) { - pthread_mutex_unlock(&lock_); + lock_.Release(); 
pthread_cond_signal(&cond_); } else { - pthread_mutex_unlock(&lock_); + lock_.Release(); } } diff --git a/network_win32.cpp b/network_win32.cpp index beb1b39..d364611 100644 --- a/network_win32.cpp +++ b/network_win32.cpp @@ -38,6 +38,8 @@ static SLIST_HEADER freelist_head; bool g_allow_pre_post; +static InternetBlockState GetInternetBlockState(bool *is_activated); + Packet *AllocPacket() { Packet *packet = (Packet*)InterlockedPopEntrySList(&freelist_head); if (packet == NULL) @@ -51,6 +53,40 @@ void FreePacket(Packet *packet) { InterlockedPushEntrySList(&freelist_head, &packet->list_entry); } +void OsGetRandomBytes(uint8 *data, size_t data_size) { + static BOOLEAN(APIENTRY *pfn)(void*, ULONG); + static bool resolved; + if (!resolved) { + pfn = (BOOLEAN(APIENTRY *)(void*, ULONG))GetProcAddress(LoadLibrary("ADVAPI32.DLL"), "SystemFunction036"); + resolved = true; + } + if (pfn && pfn(data, (ULONG)data_size)) + return; + size_t r = 0; + for (; r < data_size; r++) + data[r] = rand() >> 6; +} + +void OsInterruptibleSleep(int millis) { + SleepEx(millis, TRUE); +} + +uint64 OsGetMilliseconds() { + return GetTickCount64(); +} + +void OsGetTimestampTAI64N(uint8 dst[12]) { + SYSTEMTIME systime; + uint64 file_time_uint64 = 0; + GetSystemTime(&systime); + SystemTimeToFileTime(&systime, (FILETIME*)&file_time_uint64); + uint64 time_since_epoch_100ns = (file_time_uint64 - 116444736000000000); + uint64 secs_since_epoch = time_since_epoch_100ns / 10000000 + 0x400000000000000a; + uint32 nanos = (uint32)(time_since_epoch_100ns % 10000000) * 100; + WriteBE64(dst, secs_since_epoch); + WriteBE32(dst + 8, nanos); +} + extern "C" PSLIST_ENTRY __fastcall InterlockedPushListSList( IN PSLIST_HEADER ListHead, @@ -80,11 +116,6 @@ void InitPacketMutexes() { } } - -void CallbackUpdateUI(); -void CallbackTriggerReconnect(); -void CallbackSetPublicKey(const uint8 public_key[32]); - int tpq_last_qsize; int g_tun_reads, g_tun_writes; @@ -200,7 +231,7 @@ static bool GetTapAdapterGuid(char guid[64]) 
{ } // Open the TAP adapter -static HANDLE OpenTunAdapter(char guid[64], int retry_count, bool *exit_thread, DWORD open_flags) { +static HANDLE OpenTunAdapter(char guid[64], int retry_count, uint32 *exit_thread, DWORD open_flags) { char path[128]; HANDLE h; int retries = 0; @@ -221,7 +252,16 @@ RETRY: if ((error_code == ERROR_FILE_NOT_FOUND || error_code == ERROR_GEN_FAILURE) && retry_count != 0 && !*exit_thread) { RERROR("OpenTapAdapter: CreateFile failed: 0x%X... retrying", error_code); retry_count--; - Sleep(250 * ++retries); + + int sleep_amount = 250 * ++retries; + for(;;) { + if (*exit_thread) + return NULL; + if (sleep_amount == 0) + break; + Sleep(50); + sleep_amount -= 50; + } goto RETRY; } @@ -239,7 +279,7 @@ RETRY: static bool AddRoute(int family, const void *dest, int dest_prefix, const void *gateway, const NET_LUID *interface_luid, - std::vector *undo_array = NULL) { + std::vector *undo_array) { MIB_IPFORWARD_ROW2 row = {0}; char buf1[kSizeOfAddress], buf2[kSizeOfAddress]; @@ -261,11 +301,12 @@ static bool AddRoute(int family, row.Metric = 100; row.Protocol = MIB_IPPROTO_NETMGMT; - if (undo_array) - undo_array->push_back(row); - DWORD error = CreateIpForwardEntry2(&row); if (error == NO_ERROR || error == ERROR_OBJECT_ALREADY_EXISTS) { + + if (undo_array) + undo_array->push_back(row); + RINFO("Added Route %s => %s", print_ip_prefix(buf1, family, dest, dest_prefix), print_ip_prefix(buf2, family, gateway, -1)); return true; @@ -352,7 +393,7 @@ static bool GetDefaultRouteAndDeleteOldRoutes(int family, const NET_LUID *Interf for (unsigned i = 0; i < table->NumEntries; i++) { MIB_IPFORWARD_ROW2 *row = &table->Table[i]; if (InterfaceLuid && memcmp(&row->InterfaceLuid, InterfaceLuid, sizeof(NET_LUID)) == 0) { - if (row->Protocol == MIB_IPPROTO_NETMGMT) + if (row->Protocol == MIB_IPPROTO_NETMGMT && !row->AutoconfigureAddress) DeleteRouteOrPrintErr(row); } else if (IsRouteOriginatingFromNullRoute(row)) { ri->found_null_routes++; @@ -422,8 +463,6 @@ 
UdpSocketWin32::UdpSocketWin32() { thread_ = NULL; socket_ipv6_ = INVALID_SOCKET; completion_port_handle_ = NULL; - - InitializeCriticalSectionAndSpinCount(&mutex_, 1024); } UdpSocketWin32::~UdpSocketWin32() { @@ -432,7 +471,6 @@ UdpSocketWin32::~UdpSocketWin32() { closesocket(socket_ipv6_); CloseHandle(completion_port_handle_); FreePacketList(wqueue_); - DeleteCriticalSection(&mutex_); } bool UdpSocketWin32::Initialize(int listen_on_port) { @@ -613,11 +651,11 @@ restart_read_udp: if (!pending_writes) { if (!wqueue_) break; - EnterCriticalSection(&mutex_); + mutex_.Acquire(); pending_writes = wqueue_; wqueue_end_ = &wqueue_; wqueue_ = NULL; - LeaveCriticalSection(&mutex_); + mutex_.Release(); if (!pending_writes) break; } @@ -690,11 +728,11 @@ void UdpSocketWin32::WriteUdpPacket(Packet *packet) { packet->next = NULL; qs.udp_qsize2 += packet->size; - EnterCriticalSection(&mutex_); + mutex_.Acquire(); Packet *was_empty = wqueue_; *wqueue_end_ = packet; wqueue_end_ = &packet->next; - LeaveCriticalSection(&mutex_); + mutex_.Release(); if (was_empty == NULL) { // Notify the worker thread that it should attempt more writes @@ -722,10 +760,9 @@ void UdpSocketWin32::StopThread() { thread_ = NULL; } -ThreadedPacketQueue::ThreadedPacketQueue(WireguardProcessor *wg, NetworkStats *stats) { +ThreadedPacketQueue::ThreadedPacketQueue(WireguardProcessor *wg, TunsafeBackendWin32 *backend) { wg_ = wg; - stats_ = stats; - InitializeCriticalSectionAndSpinCount(&mutex_, 1024); + backend_ = backend; event_ = CreateEvent(NULL, FALSE, FALSE, NULL); last_ptr_ = &first_; @@ -743,7 +780,6 @@ ThreadedPacketQueue::~ThreadedPacketQueue() { assert(timer_handle_ == NULL); first_ = NULL; last_ptr_ = &first_; - DeleteCriticalSection(&mutex_); CloseHandle(event_); } @@ -755,23 +791,29 @@ DWORD WINAPI ThreadedPacketQueue::ThreadedPacketQueueLauncher(VOID *x) { DWORD ThreadedPacketQueue::ThreadMain() { int free_packets_ctr = 0; int overload = 0; + Packet *packet; - EnterCriticalSection(&mutex_); + 
wg_->dev().SetCurrentThreadAsMainThread(); + + mutex_.Acquire(); while (!exit_flag_) { if (timer_interrupt_) { timer_interrupt_ = false; need_notify_ = 0; - LeaveCriticalSection(&mutex_); + mutex_.Release(); wg_->SecondLoop(); - EnterCriticalSection(&stats_->mutex); - if (stats_->reset_stats) { - stats_->reset_stats = false; - wg_->ResetStats(); - } - stats_->packet_stats = wg_->GetStats(); - LeaveCriticalSection(&stats_->mutex); + backend_->stats_mutex_.Acquire(); + backend_->stats_ = wg_->GetStats(); + float data[2] = { + // unit is megabits/second + backend_->stats_.tun_bytes_in_per_second * (1.0f / 125000), + backend_->stats_.tun_bytes_out_per_second * (1.0f / 125000), + }; + backend_->stats_collector_.AddSamples(data); + backend_->stats_mutex_.Release(); - CallbackUpdateUI(); + backend_->delegate_->OnGraphAvailable(); + backend_->PushStats(); // Conserve memory every 10s if (free_packets_ctr++ == 10) { @@ -780,46 +822,38 @@ DWORD ThreadedPacketQueue::ThreadMain() { } if (overload) overload -= 1; - EnterCriticalSection(&mutex_); - continue; - } - - // Grab the elements of the queue - Packet *packet = first_; - if (packet == NULL) { + } else if ((packet = first_) == NULL) { need_notify_ = 1; - LeaveCriticalSection(&mutex_); + mutex_.Release(); WaitForSingleObject(event_, INFINITE); - EnterCriticalSection(&mutex_); + } else { + // Steal the whole work queue + first_ = NULL; + last_ptr_ = &first_; + int packets_in_queue = packets_in_queue_; + packets_in_queue_ = 0; + need_notify_ = 0; + mutex_.Release(); - //SleepConditionVariableCS(&cv_, &mutex, INFINITE); - continue; + tpq_last_qsize = packets_in_queue; + if (packets_in_queue >= 1024) + overload = 2; + bool is_overload = (overload != 0); + + WireguardProcessor *procint = wg_; + do { + Packet *next = packet->next; + if (packet->post_target == TARGET_PROCESSOR_UDP) + procint->HandleUdpPacket(packet, is_overload); + else + procint->HandleTunPacket(packet); + packet = next; + } while (packet); } - // Steal the whole 
work queue - first_ = NULL; - last_ptr_ = &first_; - int packets_in_queue = packets_in_queue_; - packets_in_queue_ = 0; - need_notify_ = 0; - LeaveCriticalSection(&mutex_); - - tpq_last_qsize = packets_in_queue; - if (packets_in_queue >= 1024) - overload = 2; - bool is_overload = (overload != 0); - - WireguardProcessor *procint = wg_; - do { - Packet *next = packet->next; - if (packet->post_target == TARGET_PROCESSOR_UDP) - procint->HandleUdpPacket(packet, is_overload); - else - procint->HandleTunPacket(packet); - packet = next; - } while (packet); - EnterCriticalSection(&mutex_); + wg_->RunAllMainThreadScheduled(); + mutex_.Acquire(); } - LeaveCriticalSection(&mutex_); + mutex_.Release(); return 0; } @@ -837,9 +871,9 @@ void ThreadedPacketQueue::Start() { } void ThreadedPacketQueue::Stop() { - EnterCriticalSection(&mutex_); + mutex_.Acquire(); exit_flag_ = true; - LeaveCriticalSection(&mutex_); + mutex_.Release(); SetEvent(event_); @@ -859,15 +893,15 @@ void ThreadedPacketQueue::Stop() { } void ThreadedPacketQueue::AbortingDriver() { - EnterCriticalSection(&mutex_); + mutex_.Acquire(); exit_flag_ = true; - LeaveCriticalSection(&mutex_); + mutex_.Release(); } void ThreadedPacketQueue::Post(Packet *packet, Packet **end, int count) { - EnterCriticalSection(&mutex_); + mutex_.Acquire(); if (packets_in_queue_ >= HARD_MAXIMUM_QUEUE_SIZE) { - LeaveCriticalSection(&mutex_); + mutex_.Release(); FreePackets(packet, end, count); return; } @@ -883,11 +917,11 @@ void ThreadedPacketQueue::Post(Packet *packet, Packet **end, int count) { } if (need_notify_) { need_notify_ = 0; - LeaveCriticalSection(&mutex_); + mutex_.Release(); SetEvent(event_); return; } - LeaveCriticalSection(&mutex_); + mutex_.Release(); } void CALLBACK ThreadedPacketQueue::TimerRoutine(LPVOID lpArgToCompletionRoutine, DWORD dwTimerLowValue, DWORD dwTimerHighValue) { @@ -895,15 +929,15 @@ void CALLBACK ThreadedPacketQueue::TimerRoutine(LPVOID lpArgToCompletionRoutine, } void 
ThreadedPacketQueue::PostTimerInterrupt() { - EnterCriticalSection(&mutex_); + mutex_.Acquire(); timer_interrupt_ = true; if (need_notify_) { need_notify_ = 0; - LeaveCriticalSection(&mutex_); + mutex_.Release(); SetEvent(event_); return; } - LeaveCriticalSection(&mutex_); + mutex_.Release(); } bool GetNetLuidFromGuid(const char *adapter_guid, NET_LUID *luid) { @@ -1052,64 +1086,28 @@ static void ComputeIpv6DefaultRoute(const uint8 *ipv6_address, uint8 ipv6_cidr, } -static bool AddMultipleCatchallRoutes(int inet, int bits, const uint8 *target, const NET_LUID &luid) { +static bool AddMultipleCatchallRoutes(int inet, int bits, const uint8 *target, const NET_LUID &luid, std::vector *undo_array) { uint8 tmp[16] = {0}; bool success = true; for (int i = 0; i < (1 << bits); i++) { tmp[0] = i << (8 - bits); - success &= AddRoute(inet, tmp, bits, target, &luid); + success &= AddRoute(inet, tmp, bits, target, &luid, undo_array); } return success; } -static uint8 GetInternetRouteBlockingState() { - if (internet_route_blocking_state == ROUTE_BLOCK_UNKNOWN) { - RouteInfo ri; - internet_route_blocking_state = - (GetDefaultRouteAndDeleteOldRoutes(AF_INET, NULL, TRUE, NULL, &ri) && ri.found_null_routes == 2) + ROUTE_BLOCK_OFF; - } - return internet_route_blocking_state; -} - -static void SetInternetRouteBlockingState(bool want) { - if (want) { - internet_route_blocking_state = ROUTE_BLOCK_PENDING; - } else if (internet_route_blocking_state != ROUTE_BLOCK_OFF) { - RouteInfo ri; - GetDefaultRouteAndDeleteOldRoutes(AF_INET, NULL, FALSE, NULL, &ri); - GetDefaultRouteAndDeleteOldRoutes(AF_INET6, NULL, FALSE, NULL, &ri); - internet_route_blocking_state = ROUTE_BLOCK_OFF; - } -} - -InternetBlockState GetInternetBlockState(bool *is_activated) { - int a = GetInternetRouteBlockingState(); - int b = GetInternetFwBlockingState(); - - if (is_activated) - *is_activated = (a == ROUTE_BLOCK_ON || b == IBS_ACTIVE); - - return (InternetBlockState)( - (a >= ROUTE_BLOCK_ON) * kBlockInternet_Route + - 
(b >= IBS_ACTIVE) * kBlockInternet_Firewall); -} - -void SetInternetBlockState(InternetBlockState s) { - SetInternetRouteBlockingState((s & kBlockInternet_Route) != 0); - SetInternetFwBlockingState((s & kBlockInternet_Firewall) != 0); -} - -TunWin32Adapter::TunWin32Adapter() { +TunWin32Adapter::TunWin32Adapter(DnsBlocker *dns_blocker) { handle_ = NULL; - current_dns_block_ = NULL; + dns_blocker_ = dns_blocker; } TunWin32Adapter::~TunWin32Adapter() { } -bool TunWin32Adapter::OpenAdapter(bool *exit_thread, DWORD open_flags) { - int retry_count = 10; +bool TunWin32Adapter::OpenAdapter(uint32 *exit_thread, DWORD open_flags) { + assert(handle_ == NULL); + int retry_count = 20; handle_ = OpenTunAdapter(guid_, retry_count, exit_thread, open_flags); return (handle_ != NULL); } @@ -1230,7 +1228,7 @@ bool TunWin32Adapter::InitAdapter(const TunInterface::TunConfig &&config, TunInt if (has_dns_setting && config.block_dns_on_adapters) { RINFO("Blocking standard DNS on all adapters"); - current_dns_block_ = BlockDnsExceptOnAdapter(InterfaceLuid, config.ipv6_cidr != 0); + dns_blocker_->BlockDnsExceptOnAdapter(InterfaceLuid, config.ipv6_cidr != 0); err = SetMetricOnNetworkAdapter(&InterfaceLuid, AF_INET, 2); if (err) @@ -1241,6 +1239,8 @@ bool TunWin32Adapter::InitAdapter(const TunInterface::TunConfig &&config, TunInt if (err) RERROR("SetMetricOnNetworkAdapter IPv6 failed: %d", err); } + } else { + dns_blocker_->RestoreDns(); } uint8 ibs = config.internet_blocking; @@ -1279,10 +1279,10 @@ bool TunWin32Adapter::InitAdapter(const TunInterface::TunConfig &&config, TunInt RERROR("Unable to get localhost luid - while adding route based blocking."); } else { uint32 dst[4] = {0}; - if (!AddMultipleCatchallRoutes(AF_INET, 1, (uint8*)&dst, localhost_luid)) + if (!AddMultipleCatchallRoutes(AF_INET, 1, (uint8*)&dst, localhost_luid, NULL)) RERROR("Unable to add routes for route based blocking."); if (config.ipv6_cidr) { - if (!AddMultipleCatchallRoutes(AF_INET6, 1, (uint8*)&dst, 
localhost_luid)) + if (!AddMultipleCatchallRoutes(AF_INET6, 1, (uint8*)&dst, localhost_luid, NULL)) RERROR("Unable to add IPv6 routes for route based blocking."); } } @@ -1312,7 +1312,7 @@ bool TunWin32Adapter::InitAdapter(const TunInterface::TunConfig &&config, TunInt } // Either add 4 routes or 2 routes, depending on if we use route blocking. uint32 be = ToBE32(default_route_v4); - if (!AddMultipleCatchallRoutes(AF_INET, block_all_traffic_route ? 2 : 1, (uint8*)&be, InterfaceLuid)) + if (!AddMultipleCatchallRoutes(AF_INET, block_all_traffic_route ? 2 : 1, (uint8*)&be, InterfaceLuid, &routes_to_undo_)) RERROR("Unable to add new default ipv4 route."); } @@ -1331,7 +1331,7 @@ bool TunWin32Adapter::InitAdapter(const TunInterface::TunConfig &&config, TunInt return false; } } - if (!AddMultipleCatchallRoutes(AF_INET6, block_all_traffic_route ? 2 : 1, default_route_v6, InterfaceLuid)) + if (!AddMultipleCatchallRoutes(AF_INET6, block_all_traffic_route ? 2 : 1, default_route_v6, InterfaceLuid, &routes_to_undo_)) RERROR("Unable to add new default ipv6 route."); } } @@ -1340,9 +1340,20 @@ bool TunWin32Adapter::InitAdapter(const TunInterface::TunConfig &&config, TunInt for (auto it = config.extra_routes.begin(); it != config.extra_routes.end(); ++it) { if (it->size == 32) { uint32 be = ToBE32(default_route_v4); - AddRoute(AF_INET, it->addr, it->cidr, &be, &InterfaceLuid); + AddRoute(AF_INET, it->addr, it->cidr, &be, &InterfaceLuid, &routes_to_undo_); } else if (it->size == 128 && config.ipv6_cidr) { - AddRoute(AF_INET6, it->addr, it->cidr, default_route_v6, &InterfaceLuid); + AddRoute(AF_INET6, it->addr, it->cidr, default_route_v6, &InterfaceLuid, &routes_to_undo_); + } + } + + // Add all the routes that should bypass vpn + for (auto it = config.excluded_ips.begin(); it != config.excluded_ips.end(); ++it) { + if (it->size == 32) { + if (ri.found_default_adapter) + AddRoute(AF_INET, it->addr, it->cidr, ri.default_gw, &ri.default_adapter, &routes_to_undo_); + } else if 
(it->size == 128 && config.ipv6_cidr) { + if (ri6.found_default_adapter) + AddRoute(AF_INET6, it->addr, it->cidr, ri6.default_gw, &ri6.default_adapter, &routes_to_undo_); } } @@ -1386,9 +1397,9 @@ void TunWin32Adapter::CloseAdapter() { DeleteRoute(&*it); routes_to_undo_.clear(); - RestoreDnsExceptOnAdapter(current_dns_block_); - current_dns_block_ = NULL; - + if (dns_blocker_) + dns_blocker_->RestoreDns(); + RunPrePostCommand(post_down_); } @@ -1445,7 +1456,7 @@ static bool RunOneCommand(const std::string &cmd) { char *nl2 = nl; if (nl != buf + bufstart && nl[-1] == '\r') nl--; - bufstart = nl2 - buf + 1; + bufstart = (DWORD)(nl2 - buf + 1); RINFO("%.*s", nl - st, st); } if (bufend - bufstart == sizeof(buf) || foundeof) { @@ -1496,14 +1507,13 @@ bool TunWin32Adapter::RunPrePostCommand(const std::vector &vec) { ////////////////////////////////////////////////////////////////////////////// -TunWin32Iocp::TunWin32Iocp() { +TunWin32Iocp::TunWin32Iocp(DnsBlocker *blocker, TunsafeBackendWin32 *backend) : adapter_(blocker), backend_(backend) { wqueue_end_ = &wqueue_; wqueue_ = NULL; thread_ = NULL; completion_port_handle_ = NULL; packet_handler_ = NULL; - InitializeCriticalSectionAndSpinCount(&mutex_, 1024); exit_thread_ = false; } @@ -1511,13 +1521,12 @@ TunWin32Iocp::~TunWin32Iocp() { //assert(num_reads_ == 0 && num_writes_ == 0); assert(thread_ == NULL); CloseTun(); - DeleteCriticalSection(&mutex_); } bool TunWin32Iocp::Initialize(const TunConfig &&config, TunConfigOut *out) { - CloseTun(); + assert(thread_ == NULL); - if (!adapter_.OpenAdapter(&exit_thread_, FILE_FLAG_OVERLAPPED)) + if (!adapter_.OpenAdapter(&backend_->stop_mode_, FILE_FLAG_OVERLAPPED)) return false; completion_port_handle_ = CreateIoCompletionPort(adapter_.handle(), NULL, NULL, 0); @@ -1568,13 +1577,13 @@ void TunWin32Iocp::ThreadMain() { RERROR("TunWin32: ReadFile failed 0x%X", err); - if (err == ERROR_OPERATION_ABORTED) { + if (err == ERROR_OPERATION_ABORTED || err == ERROR_FILE_NOT_FOUND) { 
packet_handler_->AbortingDriver(); RERROR("TAP driver stopped communicating. Attempting to restart.", err); // This can happen if we reinstall the TAP driver while there's an active connection. Wait a bit, then attempt to // restart. Sleep(1000); - CallbackTriggerReconnect(); + backend_->TunAdapterFailed(); goto EXIT; } } else { @@ -1642,11 +1651,11 @@ void TunWin32Iocp::ThreadMain() { if (!pending_writes) { if (!wqueue_) break; - EnterCriticalSection(&mutex_); + mutex_.Acquire(); pending_writes = wqueue_; wqueue_end_ = &wqueue_; wqueue_ = NULL; - LeaveCriticalSection(&mutex_); + mutex_.Release(); if (!pending_writes) break; } @@ -1711,11 +1720,11 @@ void TunWin32Iocp::StopThread() { void TunWin32Iocp::WriteTunPacket(Packet *packet) { packet->next = NULL; - EnterCriticalSection(&mutex_); + mutex_.Acquire(); Packet *was_empty = wqueue_; *wqueue_end_ = packet; wqueue_end_ = &packet->next; - LeaveCriticalSection(&mutex_); + mutex_.Release(); if (was_empty == NULL) { // Notify the worker thread that it should attempt more writes PostQueuedCompletionStatus(completion_port_handle_, NULL, NULL, NULL); @@ -1726,7 +1735,7 @@ void TunWin32Iocp::WriteTunPacket(Packet *packet) { ////////////////////////////////////////////////////////////////////////////// -TunWin32Overlapped::TunWin32Overlapped() { +TunWin32Overlapped::TunWin32Overlapped(DnsBlocker *blocker, TunsafeBackendWin32 *backend) : adapter_(blocker), backend_(backend) { wqueue_end_ = &wqueue_; wqueue_ = NULL; @@ -1737,13 +1746,11 @@ TunWin32Overlapped::TunWin32Overlapped() { wake_event_ = CreateEvent(NULL, FALSE, FALSE, NULL); packet_handler_ = NULL; - InitializeCriticalSectionAndSpinCount(&mutex_, 1024); exit_thread_ = false; } TunWin32Overlapped::~TunWin32Overlapped() { CloseTun(); - DeleteCriticalSection(&mutex_); CloseHandle(read_event_); CloseHandle(write_event_); CloseHandle(wake_event_); @@ -1751,7 +1758,7 @@ TunWin32Overlapped::~TunWin32Overlapped() { bool TunWin32Overlapped::Initialize(const TunConfig 
&&config, TunConfigOut *out) { CloseTun(); - return adapter_.OpenAdapter(&exit_thread_, FILE_FLAG_OVERLAPPED) && + return adapter_.OpenAdapter(&backend_->stop_mode_, FILE_FLAG_OVERLAPPED) && adapter_.InitAdapter(std::move(config), out); } @@ -1809,11 +1816,11 @@ void TunWin32Overlapped::ThreadMain() { if (write_packet == NULL) { if (!pending_writes) { - EnterCriticalSection(&mutex_); + mutex_.Acquire(); pending_writes = wqueue_; wqueue_end_ = &wqueue_; wqueue_ = NULL; - LeaveCriticalSection(&mutex_); + mutex_.Release(); } if (pending_writes) { // Then issue writes @@ -1859,98 +1866,427 @@ void TunWin32Overlapped::StopThread() { void TunWin32Overlapped::WriteTunPacket(Packet *packet) { packet->next = NULL; - EnterCriticalSection(&mutex_); + mutex_.Acquire(); Packet *was_empty = wqueue_; *wqueue_end_ = packet; wqueue_end_ = &packet->next; - LeaveCriticalSection(&mutex_); + mutex_.Release(); if (was_empty == NULL) SetEvent(wake_event_); } - - - +void TunsafeBackendWin32::SetPublicKey(const uint8 key[32]) { + memcpy(public_key_, key, 32); + delegate_->OnStateChanged(); +} DWORD WINAPI TunsafeBackendWin32::WorkerThread(void *bk) { TunsafeBackendWin32 *backend = (TunsafeBackendWin32*)bk; + int stop_mode; - TunWin32Iocp tun; - UdpSocketWin32 udp; - WireguardProcessor wg_proc(&udp, &tun, backend->procdel_); + for(;;) { + TunWin32Iocp tun(&backend->dns_blocker_, backend); + UdpSocketWin32 udp; + WireguardProcessor wg_proc(&udp, &tun, backend); - ThreadedPacketQueue queues_for_processor(&wg_proc, &backend->stats_); + ThreadedPacketQueue queues_for_processor(&wg_proc, backend); - qs.udp_qsize1 = qs.udp_qsize2 = 0; + qs.udp_qsize1 = qs.udp_qsize2 = 0; - udp.SetPacketHandler(&queues_for_processor); - tun.SetPacketHandler(&queues_for_processor); + udp.SetPacketHandler(&queues_for_processor); + tun.SetPacketHandler(&queues_for_processor); - if (!ParseWireGuardConfigFile(&wg_proc, backend->config_file_, &backend->exit_flag_)) - goto getout; + 
wg_proc.dev().SetCurrentThreadAsMainThread(); - if (!wg_proc.Start()) - goto getout; + if (!ParseWireGuardConfigFile(&wg_proc, backend->config_file_, &backend->dns_resolver_)) + goto getout_fail; - queues_for_processor.Start(); - udp.StartThread(); - tun.StartThread(); - - CallbackSetPublicKey(wg_proc.dev().public_key()); - - while (!backend->exit_flag_) { - SleepEx(INFINITE, TRUE); + if (!wg_proc.Start()) + goto getout_fail; + + // only for use in callbacks from wg + backend->wg_processor_ = &wg_proc; + + queues_for_processor.Start(); + udp.StartThread(); + tun.StartThread(); + + backend->SetPublicKey(wg_proc.dev().public_key()); + + while ((stop_mode = InterlockedExchange(&backend->stop_mode_, MODE_NONE)) == MODE_NONE) { + SleepEx(INFINITE, TRUE); + } + + // Keep DNS alive + if (stop_mode != MODE_EXIT) + tun.adapter().DisassociateDnsBlocker(); + else + backend->dns_resolver_.ClearCache(); + + udp.StopThread(); + tun.StopThread(); + queues_for_processor.Stop(); + + backend->wg_processor_ = NULL; + + FreeAllPackets(); + + if (stop_mode != MODE_TUN_FAILED) + return 0; + + uint32 last_fail = GetTickCount(); + bool permanent_fail = (last_fail - backend->last_tun_adapter_failed_) < 5000; + backend->last_tun_adapter_failed_ = last_fail; + + backend->status_ = permanent_fail ? 
TunsafeBackend::kErrorTunPermanent : TunsafeBackend::kStatusTunRetrying; + backend->delegate_->OnStatusCode(backend->status_); + + if (permanent_fail) { + RERROR("Too many automatic restarts..."); + goto getout_fail; + } } - - udp.StopThread(); - tun.StopThread(); - queues_for_processor.Stop(); - - FreeAllPackets(); -getout: +getout_fail: + backend->dns_blocker_.RestoreDns(); + backend->status_ = TunsafeBackend::kErrorInitialize; + backend->delegate_->OnStatusCode(TunsafeBackend::kErrorInitialize); return 0; } static void WINAPI ExitServiceAPC(ULONG_PTR a) { - *(bool*)a = true; } -TunsafeBackendWin32::TunsafeBackendWin32() { +TunsafeBackend::TunsafeBackend() { + is_started_ = false; + is_remote_ = false; + ipv4_ip_ = 0; + status_ = kStatusStopped; + memset(public_key_, 0, sizeof(public_key_)); +} + +TunsafeBackend::~TunsafeBackend() { + +} + + +TunsafeBackendWin32::TunsafeBackendWin32(Delegate *delegate) : delegate_(delegate), dns_resolver_(&dns_blocker_) { memset(&stats_, 0, sizeof(stats_)); + wg_processor_ = NULL; InitPacketMutexes(); - InitializeCriticalSectionAndSpinCount(&stats_.mutex, 1024); worker_thread_ = NULL; + stop_mode_ = MODE_NONE; + last_tun_adapter_failed_ = 0; + want_periodic_stats_ = false; + + internet_route_blocking_state = ROUTE_BLOCK_UNKNOWN; + ClearInternetFwBlockingStateCache(); + + delegate_->OnStateChanged(); } TunsafeBackendWin32::~TunsafeBackendWin32() { - DeleteCriticalSection(&stats_.mutex); + StopInner(false); } -ProcessorStats TunsafeBackendWin32::GetStats() { - EnterCriticalSection(&stats_.mutex); - ProcessorStats stats = stats_.packet_stats; - LeaveCriticalSection(&stats_.mutex); - return stats; +bool TunsafeBackendWin32::Initialize() { + // it's always initialized + + return true; } -void TunsafeBackendWin32::Start(ProcessorDelegate *procdel, const char *config_file) { - Stop(); - procdel_ = procdel; - exit_flag_ = false; +void TunsafeBackendWin32::Teardown() { + +} + + +void TunsafeBackendWin32::RequestStats(bool enable) { + 
want_periodic_stats_ = enable; + PushStats(); +} + +void TunsafeBackendWin32::PushStats() { + if (want_periodic_stats_) { + stats_mutex_.Acquire(); + WgProcessorStats stats = stats_; + stats_mutex_.Release(); + delegate_->OnGetStats(stats); + } +} + +void TunsafeBackendWin32::Stop() { + StopInner(false); + delegate_->OnStatusCode(status_); + delegate_->OnStateChanged(); +} + +void TunsafeBackendWin32::Start(const char *config_file) { + StopInner(true); + stop_mode_ = MODE_NONE; // this needs to be here cause it's not reset on config file errors + dns_resolver_.SetAbortFlag(false); + is_started_ = true; + memset(public_key_, 0, sizeof(public_key_)); + status_ = kStatusInitializing; + delegate_->OnStatusCode(kStatusInitializing); + delegate_->OnClearLog(); DWORD thread_id; config_file_ = _strdup(config_file); worker_thread_ = CreateThread(NULL, 0, &WorkerThread, this, 0, &thread_id); SetThreadPriority(worker_thread_, THREAD_PRIORITY_ABOVE_NORMAL); + delegate_->OnStateChanged(); } -void TunsafeBackendWin32::Stop() { +void TunsafeBackendWin32::TunAdapterFailed() { + InterlockedExchange(&stop_mode_, MODE_TUN_FAILED); + QueueUserAPC(&ExitServiceAPC, worker_thread_, NULL); +} + +void TunsafeBackendWin32::StopInner(bool is_restart) { if (worker_thread_) { - QueueUserAPC(&ExitServiceAPC, worker_thread_, (ULONG_PTR)&exit_flag_); + ipv4_ip_ = 0; + dns_resolver_.SetAbortFlag(true); + InterlockedExchange(&stop_mode_, is_restart ? 
MODE_RESTART : MODE_EXIT); + QueueUserAPC(&ExitServiceAPC, worker_thread_, NULL); WaitForSingleObject(worker_thread_, INFINITE); CloseHandle(worker_thread_); worker_thread_ = NULL; free(config_file_); config_file_ = NULL; + is_started_ = false; + status_ = kStatusStopped; } } +void TunsafeBackendWin32::ResetStats() { +} + +LinearizedGraph *TunsafeBackendWin32::GetGraph(int type) { + if (type < 0 || type >= 4) + return NULL; + + size_t size = sizeof(LinearizedGraph) + 2 * (sizeof(uint32) + sizeof(float) * 120); + LinearizedGraph *graph = (LinearizedGraph *)malloc(size); + if (graph) { + graph->total_size = (uint32)size; + graph->num_charts = 2; + graph->graph_type = type; + memset(graph->reserved, 0, sizeof(graph->reserved)); + stats_mutex_.Acquire(); + + uint8 *ptr = (uint8*)(graph + 1); + for (size_t i = 0; i < 2; i++) { + *(uint32*)ptr = 120; + ptr += 4; + const StatsCollector::TimeSeries *series = stats_collector_.GetTimeSeries((int)i, type); + memcpy(postinc(ptr, (series->size - series->shift) * sizeof(float)), + series->data + series->shift, + (series->size - series->shift) * sizeof(float)); + memcpy(postinc(ptr, series->shift * sizeof(float)), series->data, series->shift * sizeof(float)); + } + stats_mutex_.Release(); + } + return graph; +} + + +static uint8 GetInternetRouteBlockingState() { + if (internet_route_blocking_state == ROUTE_BLOCK_UNKNOWN) { + RouteInfo ri; + internet_route_blocking_state = + (GetDefaultRouteAndDeleteOldRoutes(AF_INET, NULL, TRUE, NULL, &ri) && ri.found_null_routes == 2) + ROUTE_BLOCK_OFF; + } + return internet_route_blocking_state; +} + +static void SetInternetRouteBlockingState(bool want) { + if (want) { + internet_route_blocking_state = ROUTE_BLOCK_PENDING; + } else if (internet_route_blocking_state != ROUTE_BLOCK_OFF) { + RouteInfo ri; + GetDefaultRouteAndDeleteOldRoutes(AF_INET, NULL, FALSE, NULL, &ri); + GetDefaultRouteAndDeleteOldRoutes(AF_INET6, NULL, FALSE, NULL, &ri); + internet_route_blocking_state = ROUTE_BLOCK_OFF; + } 
+} + +static InternetBlockState GetInternetBlockState(bool *is_activated) { + int a = GetInternetRouteBlockingState(); + int b = GetInternetFwBlockingState(); + + if (is_activated) + *is_activated = (a == ROUTE_BLOCK_ON || b == IBS_ACTIVE); + + return (InternetBlockState)( + (a >= ROUTE_BLOCK_ON) * kBlockInternet_Route + + (b >= IBS_ACTIVE) * kBlockInternet_Firewall); +} + +InternetBlockState TunsafeBackendWin32::GetInternetBlockState(bool *is_activated) { + return ::GetInternetBlockState(is_activated); +} + +void TunsafeBackendWin32::SetInternetBlockState(InternetBlockState s) { + SetInternetRouteBlockingState((s & kBlockInternet_Route) != 0); + SetInternetFwBlockingState((s & kBlockInternet_Firewall) != 0); +} + +void TunsafeBackendWin32::SetServiceStartupFlags(uint32 flags) { + // not used +} + +std::string TunsafeBackendWin32::GetConfigFileName() { + return std::string(); +} + +void TunsafeBackendWin32::OnConnected() { + if (status_ != TunsafeBackend::kStatusConnected) { + ipv4_ip_ = ReadBE32(wg_processor_->tun_addr().addr); + if (status_ != TunsafeBackend::kStatusReconnecting) { + char buf[kSizeOfAddress]; + RINFO("Connection established. 
IP %s", print_ip_prefix(buf, AF_INET, wg_processor_->tun_addr().addr, -1)); + } + status_ = TunsafeBackend::kStatusConnected; + delegate_->OnStatusCode(TunsafeBackend::kStatusConnected); + } +} + +void TunsafeBackendWin32::OnConnectionRetry(uint32 attempts) { + if (status_ == TunsafeBackend::kStatusInitializing) { + status_ = TunsafeBackend::kStatusConnecting; + delegate_->OnStatusCode(TunsafeBackend::kStatusConnecting); + } else if (attempts >= 3 && status_ == TunsafeBackend::kStatusConnected) { + status_ = TunsafeBackend::kStatusReconnecting; + delegate_->OnStatusCode(TunsafeBackend::kStatusReconnecting); + } +} + +void TunsafeBackend::Delegate::DoWork() { + // implemented by subclasses +} + +TunsafeBackendDelegateThreaded::TunsafeBackendDelegateThreaded(TunsafeBackend::Delegate *delegate, const std::function &callback) { + callback_ = callback; + delegate_ = delegate; +} + +TunsafeBackendDelegateThreaded::~TunsafeBackendDelegateThreaded() { + for (auto it = incoming_entry_.begin(); it != incoming_entry_.end(); ++it) + FreeEntry(&*it); +} + +void TunsafeBackendDelegateThreaded::FreeEntry(Entry *e) { + if (e->lparam) { + free((void*)e->lparam); + e->lparam = NULL; + } +} + +void TunsafeBackendDelegateThreaded::DoWork() { + mutex_.Acquire(); + std::swap(incoming_entry_, processing_entry_); + mutex_.Release(); + TunsafeBackend::Delegate *delegate = delegate_; + for (auto it = processing_entry_.begin(); it != processing_entry_.end(); ++it) { + switch (it->which) { + case Id_OnGetStats: delegate->OnGetStats(*(WgProcessorStats*)it->lparam); break; + case Id_OnStateChanged: delegate->OnStateChanged(); break; + case Id_OnLogLine: delegate->OnLogLine((const char**)&it->lparam); break; + case Id_OnStatusCode: delegate->OnStatusCode((TunsafeBackend::StatusCode)it->wparam); break; + case Id_OnClearLog: delegate->OnClearLog(); break; + case Id_OnGraphAvailable: delegate->OnGraphAvailable(); break; + } + FreeEntry(&*it); + } + processing_entry_.clear(); +} + +void 
TunsafeBackendDelegateThreaded::AddEntry(Which which, intptr_t lparam, uint32 wparam) { + mutex_.Acquire(); + bool was_empty = incoming_entry_.empty(); + incoming_entry_.emplace_back(which, wparam, lparam); + mutex_.Release(); + if (was_empty) + callback_(); +} + +void TunsafeBackendDelegateThreaded::OnGetStats(const WgProcessorStats &stats) { + AddEntry(Id_OnGetStats, (intptr_t)memdup(&stats, sizeof(stats))); +} + +void TunsafeBackendDelegateThreaded::OnGraphAvailable() { + AddEntry(Id_OnGraphAvailable); +} + +void TunsafeBackendDelegateThreaded::OnStateChanged() { + AddEntry(Id_OnStateChanged); +} + +void TunsafeBackendDelegateThreaded::OnLogLine(const char **s) { + const char *ss = *s; + *s = NULL; + AddEntry(Id_OnLogLine, (intptr_t)ss); +} + +void TunsafeBackendDelegateThreaded::OnStatusCode(TunsafeBackend::StatusCode status) { + AddEntry(Id_OnStatusCode, 0, status); +} + +void TunsafeBackendDelegateThreaded::OnClearLog() { + AddEntry(Id_OnClearLog); +} + +TunsafeBackend::Delegate::~Delegate() { +} + +TunsafeBackend *CreateNativeTunsafeBackend(TunsafeBackend::Delegate *delegate) { + return new TunsafeBackendWin32(delegate); +} + +TunsafeBackend::Delegate *CreateTunsafeBackendDelegateThreaded(TunsafeBackend::Delegate *delegate, const std::function &callback) { + return new TunsafeBackendDelegateThreaded(delegate, callback); +} + +/////////////////////////////////////////////////// + +void StatsCollector::Init() { + Accumulator *acc = &accum_[0][0]; + static const int kAccMax[TIMEVALS] = {5, 6, 10, 0}; + + // Initialize all stats channels + for (uint32 channel = 0; channel != CHANNELS; channel++) { + for (uint32 timeval = 0; timeval != TIMEVALS; timeval++, acc++) { + acc->acc = 0; + acc->dirty = false; + acc->acc_count = 0; + acc->acc_max = kAccMax[timeval]; + acc->data.size = 120; + acc->data.data = (float*)calloc(sizeof(float), acc->data.size); + acc->data.shift = 0; + } + } +} + +void StatsCollector::AddToGraphDataSource(StatsCollector::TimeSeries *ts, float 
value) { + ts->data[ts->shift] = value; + if (++ts->shift == ts->size) + ts->shift = 0; +} + +void StatsCollector::AddToAccumulators(StatsCollector::Accumulator *acc, float rval) { + for (;;) { + AddToGraphDataSource(&acc->data, rval); + acc->dirty = true; + acc->acc += rval; + if (acc->acc_max == 0 || ++acc->acc_count < acc->acc_max) + break; + rval = acc->acc / (float)acc->acc_count; + acc->acc_count = 0; + acc->acc = 0.0f; + acc++; + } +} + +void StatsCollector::AddSamples(float data[CHANNELS]) { + for (size_t i = 0; i < CHANNELS; i++) + AddToAccumulators(&accum_[i][0], data[i]); +} + diff --git a/network_win32.h b/network_win32.h index a67f226..162d9b3 100644 --- a/network_win32.h +++ b/network_win32.h @@ -6,14 +6,18 @@ #include "tunsafe_types.h" #include "netapi.h" #include "network_win32_api.h" +#include "network_win32_dnsblock.h" +#include "wireguard_config.h" +#include "tunsafe_threading.h" +#include struct Packet; class WireguardProcessor; - +class TunsafeBackendWin32; class ThreadedPacketQueue { public: - explicit ThreadedPacketQueue(WireguardProcessor *wg, NetworkStats *stats); + explicit ThreadedPacketQueue(WireguardProcessor *wg, TunsafeBackendWin32 *backend); ~ThreadedPacketQueue(); enum { @@ -39,7 +43,7 @@ private: Packet **last_ptr_; uint32 packets_in_queue_; uint32 need_notify_; - CRITICAL_SECTION mutex_; + Mutex mutex_; HANDLE event_; HANDLE timer_handle_; @@ -47,7 +51,7 @@ private: WireguardProcessor *wg_; bool exit_flag_; bool timer_interrupt_; - NetworkStats *stats_; + TunsafeBackendWin32 *backend_; }; // Encapsulates a UDP socket, optionally listening for incoming packets @@ -74,7 +78,7 @@ private: // All packets queued for writing. 
Locked by |mutex_| Packet *wqueue_, **wqueue_end_; - CRITICAL_SECTION mutex_; + Mutex mutex_; ThreadedPacketQueue *packet_handler_; SOCKET socket_; @@ -85,22 +89,26 @@ private: bool exit_thread_; }; +class DnsBlocker; + class TunWin32Adapter { public: - TunWin32Adapter(); + TunWin32Adapter(DnsBlocker *dns_blocker); ~TunWin32Adapter(); - bool OpenAdapter(bool *exit_thread, DWORD open_flags); + bool OpenAdapter(unsigned int *exit_thread, DWORD open_flags); bool InitAdapter(const TunInterface::TunConfig &&config, TunInterface::TunConfigOut *out); void CloseAdapter(); HANDLE handle() { return handle_; } + void DisassociateDnsBlocker() { dns_blocker_ = NULL; } + private: bool RunPrePostCommand(const std::vector &vec); HANDLE handle_; - HANDLE current_dns_block_; + DnsBlocker *dns_blocker_; std::vector routes_to_undo_; uint8 mac_adress_[6]; @@ -113,7 +121,7 @@ private: // Implementation of TUN interface handling using IO Completion Ports class TunWin32Iocp : public TunInterface { public: - explicit TunWin32Iocp(); + explicit TunWin32Iocp(DnsBlocker *blocker, TunsafeBackendWin32 *backend); ~TunWin32Iocp(); void SetPacketHandler(ThreadedPacketQueue *packet_handler) { packet_handler_ = packet_handler; } @@ -125,6 +133,8 @@ public: virtual bool Initialize(const TunConfig &&config, TunConfigOut *out) override; virtual void WriteTunPacket(Packet *packet) override; + TunWin32Adapter &adapter() { return adapter_; } + private: void CloseTun(); void ThreadMain(); @@ -134,20 +144,21 @@ private: HANDLE completion_port_handle_; HANDLE thread_; - CRITICAL_SECTION mutex_; + Mutex mutex_; bool exit_thread_; // All packets queued for writing Packet *wqueue_, **wqueue_end_; + TunsafeBackendWin32 *backend_; TunWin32Adapter adapter_; }; // Implementation of TUN interface handling using Overlapped IO class TunWin32Overlapped : public TunInterface { public: - explicit TunWin32Overlapped(); + explicit TunWin32Overlapped(DnsBlocker *blocker, TunsafeBackendWin32 *backend); ~TunWin32Overlapped(); 
void SetPacketHandler(ThreadedPacketQueue *packet_handler) { packet_handler_ = packet_handler; } @@ -167,7 +178,7 @@ private: ThreadedPacketQueue *packet_handler_; HANDLE thread_; - CRITICAL_SECTION mutex_; + Mutex mutex_; HANDLE read_event_, write_event_, wake_event_; @@ -176,4 +187,111 @@ private: Packet *wqueue_, **wqueue_end_; TunWin32Adapter adapter_; + + TunsafeBackendWin32 *backend_; }; + +class TunsafeBackendWin32 : public TunsafeBackend, public ProcessorDelegate { + friend class ThreadedPacketQueue; + friend class TunWin32Iocp; + friend class TunWin32Overlapped; +public: + TunsafeBackendWin32(Delegate *delegate); + ~TunsafeBackendWin32(); + + // -- from TunsafeBackend + virtual bool Initialize() override; + virtual void Teardown() override; + virtual void Start(const char *config_file) override; + virtual void Stop() override; + virtual void RequestStats(bool enable) override; + virtual void ResetStats() override; + virtual InternetBlockState GetInternetBlockState(bool *is_activated) override; + virtual void SetInternetBlockState(InternetBlockState s) override; + virtual void SetServiceStartupFlags(uint32 flags) override; + virtual LinearizedGraph *GetGraph(int type) override; + virtual std::string GetConfigFileName() override; + + // -- from ProcessorDelegate + virtual void OnConnected() override; + virtual void OnConnectionRetry(uint32 attempts) override; + + void SetPublicKey(const uint8 key[32]); + void TunAdapterFailed(); +private: + + void StopInner(bool is_restart); + static DWORD WINAPI WorkerThread(void *x); + void PushStats(); + + HANDLE worker_thread_; + + enum { + MODE_NONE = 0, + MODE_EXIT = 1, + MODE_RESTART = 2, + MODE_TUN_FAILED = 3, + }; + + bool want_periodic_stats_; + unsigned int stop_mode_; + + Delegate *delegate_; + char *config_file_; + + DnsBlocker dns_blocker_; + DnsResolver dns_resolver_; + + WireguardProcessor *wg_processor_; + + uint32 last_tun_adapter_failed_; + StatsCollector stats_collector_; + + Mutex stats_mutex_; + 
WgProcessorStats stats_; +}; + +// This class ensures that all callbacks get rescheduled to another thread +class TunsafeBackendDelegateThreaded : public TunsafeBackend::Delegate { +public: + TunsafeBackendDelegateThreaded(TunsafeBackend::Delegate *delegate, const std::function &callback); + ~TunsafeBackendDelegateThreaded(); + +private: + virtual void OnGetStats(const WgProcessorStats &stats); + virtual void OnGraphAvailable(); + virtual void OnStateChanged(); + virtual void OnClearLog(); + virtual void OnLogLine(const char **s); + virtual void OnStatusCode(TunsafeBackend::StatusCode status); + virtual void DoWork(); + + enum Which { + Id_OnGetStats, + Id_OnStateChanged, + Id_OnClearLog, + Id_OnLogLine, + Id_OnUpdateUI, + Id_OnStatusCode, + Id_OnGraphAvailable, + }; + + void AddEntry(Which which, intptr_t lparam = 0, uint32 wparam = 0); + + TunsafeBackend::Delegate *delegate_; + std::function callback_; + + struct Entry { + uint8 which; + uint32 wparam; + intptr_t lparam; + Entry(uint8 which, uint32 wparam, intptr_t lparam) : which(which), wparam(wparam), lparam(lparam) {} + }; + + static void FreeEntry(Entry *e); + + Mutex mutex_; + std::vector incoming_entry_; + std::vector processing_entry_; +}; + diff --git a/network_win32_api.h b/network_win32_api.h index dac9856..bf5cf88 100644 --- a/network_win32_api.h +++ b/network_win32_api.h @@ -6,44 +6,115 @@ #include "tunsafe_types.h" #include "wireguard.h" -struct NetworkStats { - bool reset_stats; - CRITICAL_SECTION mutex; - ProcessorStats packet_stats; -}; +#include -class TunsafeBackendWin32 { +struct StatsCollector { public: - TunsafeBackendWin32(); - ~TunsafeBackendWin32(); - - void Start(ProcessorDelegate *procdel, const char *config_file); - void Stop(); - - ProcessorStats GetStats(); - void ResetStats() { stats_.reset_stats = true; } - - bool is_started() const { return worker_thread_ != NULL; } - + enum { + CHANNELS = 2, + TIMEVALS = 4, + }; + StatsCollector() { Init(); } + void AddSamples(float 
data[CHANNELS]); + struct TimeSeries { + float *data; + int size; + int shift; + }; + const TimeSeries *GetTimeSeries(int channel, int timeval) { return &accum_[channel][timeval].data; } private: - static DWORD WINAPI WorkerThread(void *x); - - NetworkStats stats_; - HANDLE worker_thread_; - bool exit_flag_; - - ProcessorDelegate *procdel_; - char *config_file_; + struct Accumulator { + float acc; + int acc_count; + int acc_max; + bool dirty; + TimeSeries data; + }; + void Init(); + static void AddToGraphDataSource(StatsCollector::TimeSeries *ts, float value); + static void AddToAccumulators(StatsCollector::Accumulator *acc, float rval); + Accumulator accum_[CHANNELS][TIMEVALS]; }; +struct LinearizedGraph { + uint32 total_size; + uint32 graph_type; + uint8 num_charts; + uint8 reserved[7]; +}; +class TunsafeBackend { +public: + // All codes < 0 are permanent errors + enum StatusCode { + kStatusStopped = 0, + kStatusInitializing = 1, + kStatusConnecting = 2, + kStatusConnected = 3, + kStatusReconnecting = 4, + kStatusTunRetrying = 10, -InternetBlockState GetInternetBlockState(bool *is_activated); + kErrorInitialize = -1, + kErrorTunPermanent = -2, + kErrorServiceLost = -3, + }; -// Returns if reconnect is needed -void SetInternetBlockState(InternetBlockState s); + static bool IsPermanentError(StatusCode status) { + return (int32)status < 0; + } + class Delegate { + public: + virtual ~Delegate(); + virtual void OnGetStats(const WgProcessorStats &stats) = 0; + virtual void OnGraphAvailable() = 0; + virtual void OnStateChanged() = 0; + virtual void OnClearLog() = 0; + virtual void OnLogLine(const char **s) = 0; + virtual void OnStatusCode(TunsafeBackend::StatusCode status) = 0; + // This function is needed for CreateTunsafeBackendDelegateThreaded, + // It's expected to be called on the main thread and then all callbacks will arrive + // on the right thread. 
+ virtual void DoWork(); + }; + TunsafeBackend(); + virtual ~TunsafeBackend(); + + // Setup/teardown the connection to the local service (if any) + virtual bool Initialize() = 0; + virtual void Teardown() = 0; + + virtual void Start(const char *config_file) = 0; + virtual void Stop() = 0; + virtual void RequestStats(bool enable) = 0; + virtual void ResetStats() = 0; + + virtual InternetBlockState GetInternetBlockState(bool *is_activated) = 0; + virtual void SetInternetBlockState(InternetBlockState s) = 0; + virtual void SetServiceStartupFlags(uint32 flags) = 0; + + virtual std::string GetConfigFileName() = 0; + + virtual LinearizedGraph *GetGraph(int type) = 0; + + bool is_started() { return is_started_; } + bool is_remote() { return is_remote_; } + const uint8 *public_key() { return public_key_; } + + StatusCode status() { return status_; } + uint32 GetIP() { return ipv4_ip_; } + +protected: + bool is_started_; + bool is_remote_; + StatusCode status_; + uint32 ipv4_ip_; + uint8 public_key_[32]; +}; + +TunsafeBackend *CreateNativeTunsafeBackend(TunsafeBackend::Delegate *delegate); +TunsafeBackend::Delegate *CreateTunsafeBackendDelegateThreaded(TunsafeBackend::Delegate *delegate, const std::function &callback); extern int tpq_last_qsize; extern int g_tun_reads, g_tun_writes; diff --git a/network_win32_dnsblock.cpp b/network_win32_dnsblock.cpp index e17f09a..b76fb91 100644 --- a/network_win32_dnsblock.cpp +++ b/network_win32_dnsblock.cpp @@ -5,6 +5,7 @@ #include "network_win32_dnsblock.h" #include #include +#include #pragma comment (lib, "Fwpuclnt.lib") @@ -43,11 +44,19 @@ static inline bool FwpmFilterAddCheckedAleConnect(HANDLE handle, FWPM_FILTER0 *f return false; } } - return true; } -HANDLE BlockDnsExceptOnAdapter(const NET_LUID &luid, bool also_ipv6) { +DnsBlocker::DnsBlocker() { + also_ipv6_ = false; + handle_ = NULL; +} + +DnsBlocker::~DnsBlocker() { + RestoreDns(); +} + +bool DnsBlocker::BlockDnsExceptOnAdapter(const NET_LUID &luid, bool also_ipv6) { 
FWPM_SUBLAYER0 *sublayer = NULL; FWP_BYTE_BLOB *fwp_appid = NULL; @@ -56,6 +65,14 @@ HANDLE BlockDnsExceptOnAdapter(const NET_LUID &luid, bool also_ipv6) { DWORD err; HANDLE handle = NULL; + // Check if it already matches + if (handle_ != NULL) { + if (memcmp(&luid, &luid_, sizeof(luid)) == 0 && also_ipv6_) + return true; + FwpmEngineClose0(handle_); + handle_ = NULL; + } + { FWPM_SESSION0 session = {0}; session.flags = FWPM_SESSION_FLAG_DYNAMIC; @@ -69,7 +86,7 @@ HANDLE BlockDnsExceptOnAdapter(const NET_LUID &luid, bool also_ipv6) { { FWPM_SUBLAYER0 sublayer = {0}; sublayer.subLayerKey = TUNSAFE_DNS_SUBLAYER; - sublayer.displayData.name = L"TunSafe"; + sublayer.displayData.name = L"TunSafe DNS Block"; sublayer.weight = 0x100; err = FwpmSubLayerAdd0(handle, &sublayer, NULL); if (err != 0) { @@ -96,7 +113,7 @@ HANDLE BlockDnsExceptOnAdapter(const NET_LUID &luid, bool also_ipv6) { filter.filterCondition = filter_condition; filter.numFilterConditions = 2; filter.subLayerKey = TUNSAFE_DNS_SUBLAYER; - filter.displayData.name = L"TunSafe"; + filter.displayData.name = L"TunSafe DNS Block"; filter.weight.type = FWP_UINT8; filter.weight.uint8 = 15; filter.action.type = FWP_ACTION_PERMIT; @@ -127,15 +144,21 @@ getout: success: if (fwp_appid) FwpmFreeMemory0((void **)&fwp_appid); - return handle; + + handle_ = handle; + also_ipv6_ = also_ipv6; + luid_ = luid; + return handle != NULL; } -void RestoreDnsExceptOnAdapter(HANDLE h) { - if (h) +void DnsBlocker::RestoreDns() { + HANDLE h = handle_; + if (h) { + handle_ = NULL; FwpmEngineClose0(h); + } } - static bool RemovePersistentInternetBlockingInner(HANDLE handle) { FWPM_FILTER_ENUM_TEMPLATE0 enum_template = {0}; HANDLE enum_handle = NULL; @@ -337,6 +360,10 @@ getout: return false; } +void ClearInternetFwBlockingStateCache() { + internet_fw_blocking_state = 0; +} + uint8 GetInternetFwBlockingState() { if (internet_fw_blocking_state != 0) return internet_fw_blocking_state; diff --git a/network_win32_dnsblock.h 
b/network_win32_dnsblock.h index 1da7e64..3bc9c7f 100644 --- a/network_win32_dnsblock.h +++ b/network_win32_dnsblock.h @@ -2,13 +2,25 @@ // Copyright (C) 2018 Ludvig Strigeus . All Rights Reserved. #pragma once -HANDLE BlockDnsExceptOnAdapter(const NET_LUID &luid, bool also_ipv6 ); -void RestoreDnsExceptOnAdapter(HANDLE h); + +class DnsBlocker { +public: + DnsBlocker(); + ~DnsBlocker(); + + bool BlockDnsExceptOnAdapter(const NET_LUID &luid, bool also_ipv6); + void RestoreDns(); + bool IsActive() { return handle_ != NULL; } + + // Current state + NET_LUID luid_; + HANDLE handle_; + bool also_ipv6_; +}; + bool AddPersistentInternetBlocking(const NET_LUID *default_interface, const NET_LUID &luid_to_allow, bool also_ipv6); - - enum { IBS_UNKOWN, IBS_INACTIVE, @@ -18,3 +30,4 @@ enum { void SetInternetFwBlockingState(bool want); uint8 GetInternetFwBlockingState(); +void ClearInternetFwBlockingStateCache(); \ No newline at end of file diff --git a/resource.h b/resource.h index 3c10a98c62cf7bfd741a4a014722bde5c86dfa20..3e325674328bb1458331c27fb01f39caa2995964 100644 GIT binary patch literal 1979 zcmZ{lNsr<%5QXpj3QK)#sllOVJ33B5TO@YmIM6*8i-u&B5egE^ENabvp9-r(43{X* z$4lkCa=G9C_>js*kb;arask=Df4zR6B-LWG-mc3XnO58Vd`0-ib_}_H{Ndi>J*3S9=f3qO9tIh)7t$l^hWNk9J7v2T9Ila@G9;I;%h#)$VeFUY2qb1{>za zDA|R-Ib;UL*cpU2Wn<`|ba0EZQ03lgshiS4bpmfG6S^Zxs2kG(RoGye$~C_&fx0;z z&`_&c26iN3$Xe3Wwf~}e09zd1P}l#XYOMG~NCR4ev1)X&n7RRYP>pt+il$#~%?ewp zBcWl&;x_nsPugfFtuu;UD}fu!a-$9v1f8_bXd$^?LHBbs4%2a68s4 zeYuCa4sgq&DU%!`A3by(;8qL%0=N?c&lDQE4q&ZXOql_TTcK5Fb2D@uh-@TSti~Qm zMCrgS_RIzy$!38LT)VVQK<0kchILz(H#5uRS;#zl7FoAvLmR1?WEwZ4r;0XkX-k-(>y%vsmo54c2?8 zN~8!&Gm}R`Ss|xt0#UPGw;pIQZ ut}Itcf@ew_CC`IFaBUBlmPV`ihQ`dVqvr}^fp9B)YLKE{R%P$uc=-<>nA)cR literal 4828 zcmcJTT~FIE6o$|Br2U6TyPQNx_kGzEkuP!~S2%Cbe~arm`c~*!VAKY87r0-d=Mt^G2XB~z zWk#RL9mbUUsXM&o8=l?+Z*f1GcD>LnlK8d+R`+o#oIQM3Cj01%IZ{n;qNY*IMA}ZO zg2p8*%_E8YrdLjCAss!FF<)-Sx~WPE?Gq*F1+2DDgj`=cZ8ZwS6ZzIR9_m0o!VBW8 
zG+*-8Lp2Hvp1^(Qq0XRdiDQIWJAg-oC~C&M_fQ8q7RqfT4F40g9nAgUv2}tNptef7 zGS^x#dP8(@Q?05>PC)L0ox^hv&~E0Y$Q-WxQgJ(=PPM(wL^C24cZR9BN3ii!TRVa( zA?(zXV=6B540HTUTV232nzTqgS*GH4VdWGmvq#Agemq9Y=f|nIGeX6+_EyX$EmBWT zsJLC&H^tW<>4+;~YXMalg9`ma9J}vH#hn$xxKxOq{aEo-+<764OSKs}C-Cht4(n&` z%n-(?P^(f-}b~?AG5+$o!>V_{Kl-ljN_1qV^s8DHKUGs zxH|QH(Z4S?p3GCeu^5Xe6-Z=ZrpY$ySeLA{De>mOIo@BQl_wck`Gh#I6e3I-_ zn(PVeQHVL_wmKhh^@Xr=#1^BPbzjUo!q@}YTtsmzP29B~Ev>UV#8-^nO>9%G9KsXW z>IbTB*lyph=5hr4U9(o-Z`khKZ#1!b$MFr@T@zMzGc(0iu!ilAorjf6$Q)XQ4BMTr zX?LML`|v(K+ksiIiCv;f=BhV`cRXb-$#>OBmR*7#R?AsMOg{0|N|yZzx;)KpckEa; zxmwi2SBr5jd*BUp^%0k44ci}=6+6ZQt=>&>NiRHx7G|Xh&E&>z z!>;xcGW=ivsdo^#8?eMa!3Mhl+)?{|Pt z*_$w4xWahDOJHZN>ks-iS9IixjPKz5*IRA9s6GSNv5k^knfzR. All Rights Reserved. +#include "stdafx.h" +#include "service_win32.h" +#include +#include "util.h" +#include "network_win32_api.h" +#include +#include +#include +#include "util_win32.h" + +static const uint64 kTunsafeServiceProtocolVersion = 20180809001; + +static SERVICE_STATUS_HANDLE m_statusHandle; +static TunsafeServiceImpl *g_service; + +#define SERVICE_NAME L"TunSafeService" +#define SERVICE_NAMEA "TunSafeService" +#define SERVICE_START_TYPE SERVICE_AUTO_START +#define SERVICE_DEPENDENCIES L"tap0901\0dhcp\0" +#define SERVICE_ACCOUNT NULL +//L"NT AUTHORITY\\LocalService" +#define SERVICE_PASSWORD NULL +#define PIPE_NAME "\\\\.\\pipe\\TunSafe\\ServiceControl" + + +enum { + SERVICE_REQ_LOGIN = 0, + SERVICE_REQ_START = 1, + SERVICE_REQ_STOP = 2, + SERVICE_REQ_GETSTATS = 4, + SERVICE_REQ_SET_INTERNET_BLOCKSTATE = 5, + SERVICE_REQ_RESETSTATS = 6, + SERVICE_REQ_SET_STARTUP_FLAGS = 7, + + SERVICE_MSG_STATE = 8, + SERVICE_MSG_LOGLINE = 9, + SERVICE_MSG_STATS = 11, + SERVICE_MSG_CLEARLOG = 12, + SERVICE_MSG_STATUS_CODE = 14, + + SERVICE_REQ_GET_GRAPH = 15, + SERVICE_MSG_GRAPH = 16, +}; + +struct ServiceHandles { + SC_HANDLE manager; + SC_HANDLE service; + + ServiceHandles() : manager(NULL), service(NULL) {} + ~ServiceHandles() { + if (manager) + 
CloseServiceHandle(manager); + if (service) + CloseServiceHandle(service); + } + + bool Open(PWSTR pszServiceName, DWORD sc_rights, DWORD service_rights); + bool StopService(); + bool StartService(); +}; + + +static DWORD InstallService(PWSTR pszServiceName, + PWSTR pszDisplayName, + DWORD dwStartType, + PWSTR pszDependencies, + PWSTR pszAccount, + PWSTR pszPassword) { + wchar_t szPath[MAX_PATH + 32]; + ServiceHandles handles; + DWORD res; + + szPath[0] = '"'; + if (GetModuleFileNameW(NULL, szPath + 1, MAX_PATH) == 0) { + res = GetLastError(); + goto Cleanup; + } + size_t len = wcslen(szPath); + memcpy(szPath + len, L"\" --service", 12 * sizeof(wchar_t)); + + // Open the local default service control manager database + handles.manager = OpenSCManagerW(NULL, NULL, SC_MANAGER_CONNECT | + SC_MANAGER_CREATE_SERVICE); + if (handles.manager == NULL) { + res = GetLastError(); + goto Cleanup; + } + + // Install the service into SCM by calling CreateService + handles.service = CreateServiceW( + handles.manager, // SCManager database + pszServiceName, // Name of service + pszDisplayName, // Name to display + SERVICE_QUERY_STATUS, // Desired access + SERVICE_WIN32_OWN_PROCESS, // Service type + dwStartType, // Service start type + SERVICE_ERROR_NORMAL, // Error control type + szPath, // Service's binary + NULL, // No load ordering group + NULL, // No tag identifier + pszDependencies, // Dependencies + pszAccount, // Service running account + pszPassword // Password of the account + ); + if (handles.service == NULL) { + res = GetLastError(); + goto Cleanup; + } + { + SERVICE_DESCRIPTIONA desc; + desc.lpDescription = "TunSafe uses this service to connect to a VPN server in the background."; + ChangeServiceConfig2A(handles.service, SERVICE_CONFIG_DESCRIPTION, &desc); + } + res = 0; +Cleanup: + if (res && res != ERROR_SERVICE_EXISTS) + RERROR("TunSafe service installation failed: %d", res); + return res; +} + +bool ServiceHandles::Open(PWSTR pszServiceName, DWORD sc_rights, DWORD 
service_rights) { + manager = OpenSCManagerW(NULL, NULL, sc_rights); + if (manager == NULL) + return false; + service = OpenServiceW(manager, pszServiceName, service_rights); + return (service != NULL); +} + +bool ServiceHandles::StopService() { + SERVICE_STATUS ssSvcStatus = {}; + // Try to stop the service + if (ControlService(service, SERVICE_CONTROL_STOP, &ssSvcStatus)) { + Sleep(100); + while (QueryServiceStatus(service, &ssSvcStatus)) { + if (ssSvcStatus.dwCurrentState == SERVICE_STOP_PENDING) { + Sleep(100); + } else { + break; + } + } + } + return (ssSvcStatus.dwCurrentState == SERVICE_STOPPED); +} + +static wchar_t *GetUsernameOfCurrentUser(bool use_thread_token) { + HANDLE thread_token = NULL; + wchar_t *result = NULL; + DWORD len; + PTOKEN_USER token_user = NULL; + DWORD domain_len; + WCHAR username[256], domain[256]; + SID_NAME_USE sid_type; + + if (use_thread_token) { + if (!OpenThreadToken(GetCurrentThread(), TOKEN_ALL_ACCESS, FALSE, &thread_token)) + goto getout; + } else { + if (!OpenProcessToken(GetCurrentProcess(), TOKEN_ALL_ACCESS, &thread_token)) + goto getout; + + } + len = 0; + token_user = NULL; + while (!GetTokenInformation(thread_token, TokenUser, token_user, len, &len)) { + if (GetLastError() != ERROR_INSUFFICIENT_BUFFER) + goto getout; + token_user = (PTOKEN_USER)realloc(token_user, len); + if (!token_user) + goto getout; + } + if (!IsValidSid(token_user->User.Sid)) + goto getout; + domain_len = len = 256; + if (!LookupAccountSidW(NULL, token_user->User.Sid, username, &len, domain, &domain_len, &sid_type)) + goto getout; + + size_t alen = wcslen(username); + size_t blen = wcslen(domain); + + result = (wchar_t*)malloc((alen + blen + 2) * sizeof(wchar_t)); + if (result) { + result[alen] = '@'; + memcpy(result, username, alen * sizeof(wchar_t)); + memcpy(result + alen + 1, domain, (blen + 1) * sizeof(wchar_t)); + } +getout: + free(token_user); + CloseHandle(thread_token); + return result; +} + + +static DWORD 
GetNonTransientServiceStatus(SC_HANDLE service) { + SERVICE_STATUS ssSvcStatus = {}; + int delay = 100; + for(;;) { + if (!QueryServiceStatus(service, &ssSvcStatus)) + return 0; + + if (--delay == 0 || + ssSvcStatus.dwCurrentState != SERVICE_START_PENDING && + ssSvcStatus.dwCurrentState != SERVICE_STOP_PENDING) + return ssSvcStatus.dwCurrentState; + Sleep(100); + delay--; + } +} + + +bool ServiceHandles::StartService() { + DWORD state = GetNonTransientServiceStatus(service); + if (state == 0 || state == SERVICE_RUNNING) + return false; // service already running, no need to start + if (!::StartService(service, 0, NULL)) { +// if (GetLastError() == ERROR_SERVICE_ALREADY_RUNNING) +// return false; + return false; + } + return GetNonTransientServiceStatus(service) == SERVICE_RUNNING; +} + + +static bool StartTunsafeService() { + ServiceHandles handles; + + if (!handles.Open(SERVICE_NAME, SC_MANAGER_CONNECT, SERVICE_START | SERVICE_QUERY_STATUS)) + return false; + return handles.StartService(); +} + +bool IsTunsafeServiceRunning() { + ServiceHandles handles; + + if (!handles.Open(SERVICE_NAME, SC_MANAGER_CONNECT, SERVICE_QUERY_STATUS)) + return false; + + return GetNonTransientServiceStatus(handles.service) == SERVICE_RUNNING; +} + + +void StopTunsafeService() { + ServiceHandles handles; + if (!handles.Open(SERVICE_NAME, SC_MANAGER_CONNECT, + SERVICE_STOP | SERVICE_QUERY_STATUS)) + goto Cleanup; + handles.StopService(); +Cleanup: + return; +} + +static void SetTunsafeUserNameInRegistry() { + wchar_t *user = GetUsernameOfCurrentUser(false); + if (!user) { + RERROR("Unable to get current username"); + return; + } + HKEY hkey = NULL; + RegCreateKeyEx(HKEY_LOCAL_MACHINE, "Software\\TunSafe", NULL, NULL, 0, KEY_ALL_ACCESS, NULL, &hkey, NULL); + if (!hkey) { + RERROR("Unable to open registry key"); + return; + } + if (RegSetValueExW(hkey, L"AllowedUsername", NULL, REG_SZ, (BYTE*)user, (DWORD)(wcslen(user) + 1) * 2) != ERROR_SUCCESS) { + RERROR("Unable to set registry key"); 
+ } + RegCloseKey(hkey); +} + +void InstallTunSafeWindowsService() { + InstallService(SERVICE_NAME, L"TunSafe Service", SERVICE_START_TYPE, + SERVICE_DEPENDENCIES, SERVICE_ACCOUNT, SERVICE_PASSWORD); + StartTunsafeService(); + SetTunsafeUserNameInRegistry(); +} + +bool UninstallTunSafeWindowsService() { + ServiceHandles handles; + + if (!handles.Open(SERVICE_NAME, SC_MANAGER_CONNECT, + SERVICE_STOP | SERVICE_QUERY_STATUS | DELETE)) + goto Cleanup; + + handles.StopService(); + + if (!DeleteService(handles.service)) + goto Cleanup; + return true; +Cleanup: + return false; +} + +bool IsTunSafeServiceInstalled() { + ServiceHandles handles; + return handles.Open(SERVICE_NAME, SC_MANAGER_CONNECT, SERVICE_QUERY_STATUS); +} + + +static void WriteServiceLog(const char *pszFunction, WORD dwError) { + char szMessage[260]; + snprintf(szMessage, ARRAYSIZE(szMessage), "%s failed w/err 0x%08lx", pszFunction, dwError); + HANDLE hEventSource = NULL; + LPCSTR lpszStrings[2] = {NULL, NULL}; + hEventSource = RegisterEventSourceW(NULL, SERVICE_NAME); + if (hEventSource) { + lpszStrings[0] = SERVICE_NAMEA; + lpszStrings[1] = szMessage; + + ReportEventA(hEventSource, // Event log handle + dwError, // Event type + 0, // Event category + 0, // Event identifier + NULL, // No security identifier + 2, // Size of lpszStrings array + 0, // No binary data + lpszStrings, // Array of strings + NULL // No binary data + ); + DeregisterEventSource(hEventSource); + } +} + +static void SetServiceStatus(DWORD dwCurrentState, + DWORD dwWin32ExitCode = 0, + DWORD dwWaitHint = 0) { + static DWORD dwCheckPoint = 1; + + SERVICE_STATUS m_status; + m_status.dwServiceType = SERVICE_WIN32_OWN_PROCESS; + m_status.dwControlsAccepted = SERVICE_ACCEPT_STOP | SERVICE_ACCEPT_SHUTDOWN; + m_status.dwServiceSpecificExitCode = 0; + m_status.dwCurrentState = dwCurrentState; + m_status.dwWin32ExitCode = dwWin32ExitCode; + m_status.dwWaitHint = dwWaitHint; + m_status.dwCheckPoint = + ((dwCurrentState == SERVICE_RUNNING) || + 
(dwCurrentState == SERVICE_STOPPED)) ? + 0 : dwCheckPoint++; + // Report the status of the service to the SCM. + ::SetServiceStatus(m_statusHandle, &m_status); +} + +static void OnServiceStart(DWORD dwArgc, PWSTR *pszArgv) { + WriteServiceLog("Service Starting", EVENTLOG_INFORMATION_TYPE); + SetServiceStatus(SERVICE_START_PENDING); + DWORD rv = g_service->OnStart(dwArgc, pszArgv); + if (rv) { + SetServiceStatus(SERVICE_STOPPED, rv); + } else { + SetServiceStatus(SERVICE_RUNNING); + } +} + +static void OnServiceStop() { + WriteServiceLog("Service Stopping", EVENTLOG_INFORMATION_TYPE); + SetServiceStatus(SERVICE_STOP_PENDING); + g_service->OnStop(); + SetServiceStatus(SERVICE_STOPPED); +} + +static void OnServiceShutdown() { + g_service->OnShutdown(); + SetServiceStatus(SERVICE_STOPPED); +} + +static void WINAPI ServiceCtrlHandler(DWORD dwCtrl) { + switch (dwCtrl) { + case SERVICE_CONTROL_STOP: OnServiceStop(); break; +// case SERVICE_CONTROL_PAUSE: OnServicePause(); break; +// case SERVICE_CONTROL_CONTINUE: OnServiceContinue(); break; + case SERVICE_CONTROL_SHUTDOWN: OnServiceShutdown(); break; + case SERVICE_CONTROL_INTERROGATE: break; + default: break; + } +} + +static void WINAPI ServiceMain(DWORD dwArgc, PWSTR *pszArgv) { + // Register the handler function for the service + m_statusHandle = RegisterServiceCtrlHandlerW(SERVICE_NAME, ServiceCtrlHandler); + if (m_statusHandle == NULL) + throw GetLastError(); + // Start the service. 
+ OnServiceStart(dwArgc, pszArgv); +} + +static const SERVICE_TABLE_ENTRYW serviceTable[] = { + {SERVICE_NAME, ServiceMain}, + {NULL, NULL} +}; + +PipeMessageHandler::PipeMessageHandler(const char *pipe_name, bool is_server_pipe, Delegate *delegate) { + pipe_name_ = _strdup(pipe_name); + is_server_pipe_ = is_server_pipe; + delegate_ = delegate; + pipe_ = INVALID_HANDLE_VALUE; + wait_handles_[0] = CreateEvent(NULL, TRUE, FALSE, NULL); // for ReadFile + wait_handles_[1] = CreateEvent(NULL, FALSE, FALSE, NULL); // For Exit + wait_handles_[2] = CreateEvent(NULL, TRUE, FALSE, NULL); // for WriteFile + packets_ = NULL; + thread_ = NULL; + packets_end_ = &packets_; + write_overlapped_active_ = false; + exit_ = false; + connection_established_ = false; + thread_id_ = 0; +} + +PipeMessageHandler::~PipeMessageHandler() { + StopThread(); + CloseHandle(wait_handles_[0]); + CloseHandle(wait_handles_[1]); + CloseHandle(wait_handles_[2]); + free(pipe_name_); +} + +bool PipeMessageHandler::InitializeServerPipe() { + int BUFSIZE = 2048; + SECURITY_ATTRIBUTES saPipeSecurity = {0}; + uint8 buf[SECURITY_DESCRIPTOR_MIN_LENGTH]; + PSECURITY_DESCRIPTOR pPipeSD = (PSECURITY_DESCRIPTOR)buf; + + if (!InitializeSecurityDescriptor(pPipeSD, SECURITY_DESCRIPTOR_REVISION)) + return false; + + // set NULL DACL on the SD + if (!SetSecurityDescriptorDacl(pPipeSD, TRUE, (PACL)NULL, FALSE)) + return false; + + // now set up the security attributes + saPipeSecurity.nLength = sizeof(SECURITY_ATTRIBUTES); + saPipeSecurity.bInheritHandle = TRUE; + saPipeSecurity.lpSecurityDescriptor = pPipeSD; + + pipe_ = CreateNamedPipeW(L"\\\\.\\pipe\\TunSafe\\ServiceControl", + PIPE_ACCESS_DUPLEX | FILE_FLAG_OVERLAPPED, + PIPE_TYPE_MESSAGE | PIPE_READMODE_MESSAGE | PIPE_REJECT_REMOTE_CLIENTS | PIPE_WAIT, + PIPE_UNLIMITED_INSTANCES, + BUFSIZE, BUFSIZE, 0, &saPipeSecurity); + return pipe_ != INVALID_HANDLE_VALUE; +} + +bool PipeMessageHandler::InitializeClientPipe() { + assert(pipe_ == INVALID_HANDLE_VALUE); + pipe_ = 
CreateFile( + pipe_name_, + GENERIC_READ | GENERIC_WRITE, 0, + NULL, OPEN_EXISTING, FILE_FLAG_OVERLAPPED, NULL); + if (pipe_ == INVALID_HANDLE_VALUE) + return false; + DWORD mode = PIPE_READMODE_MESSAGE; + SetNamedPipeHandleState(pipe_, &mode, NULL, NULL); + return true; +} + +void PipeMessageHandler::ClosePipe() { + if (pipe_ != INVALID_HANDLE_VALUE) { + CancelIo(pipe_); + CloseHandle(pipe_); + pipe_ = INVALID_HANDLE_VALUE; + } + connection_established_ = false; + write_overlapped_active_ = false; + + packets_mutex_.Acquire(); + OutgoingPacket *packets = packets_; + packets_ = NULL; + packets_end_ = &packets_; + packets_mutex_.Release(); + while (packets) { + OutgoingPacket *p = packets; + packets = p->next; + free(p); + } +} + +bool PipeMessageHandler::WritePacket(int type, const uint8 *data, size_t data_size) { + OutgoingPacket *packet = (OutgoingPacket *)malloc(offsetof(OutgoingPacket, data[data_size + 1])); + if (packet) { + packet->size = (uint32)(data_size + 1); + packet->data[0] = type; + memcpy(packet->data + 1, data, data_size); + packet->next = NULL; + + packets_mutex_.Acquire(); + OutgoingPacket *was_empty = packets_; + // login messages are always queued up front + if (type == SERVICE_REQ_LOGIN) { + packet->next = packets_; + if (packet->next == NULL) + packets_end_ = &packet->next; + packets_ = packet; + } else { + *packets_end_ = packet; + packets_end_ = &packet->next; + } + packets_mutex_.Release(); + + if (was_empty == NULL) { + // Only allow the pipe thread to invoke the send + if (GetCurrentThreadId() == thread_id_) { + SendNextQueuedWrite(); + } else { + SetEvent(wait_handles_[1]); + } + } + } + return true; +} + +void PipeMessageHandler::SendNextQueuedWrite() { + assert(thread_id_ == GetCurrentThreadId()); + if (!write_overlapped_active_) { + OutgoingPacket *p = packets_; + if (p && connection_established_) { + memset(&write_overlapped_, 0, sizeof(write_overlapped_)); + write_overlapped_.hEvent = wait_handles_[2]; + if (WriteFile(pipe_, 
p->data, p->size, NULL, &write_overlapped_) || GetLastError() == ERROR_IO_PENDING) + write_overlapped_active_ = true; + } + } +} + +uint8 *PipeMessageHandler::ReadNamedPipeAsync(size_t *packet_size) { + OVERLAPPED ov = {0}; + uint8 *result = NULL; + DWORD bytes_waiting = 0; + DWORD rv; + ov.hEvent = wait_handles_[0]; + if (!ReadFile(pipe_, NULL, 0, NULL, &ov)) { + rv = GetLastError(); + if (rv != ERROR_IO_PENDING && rv != ERROR_MORE_DATA) + goto getout; + } + + if (!WaitAndHandleWrites(INFINITE)) { + CancelIo(pipe_); + write_overlapped_active_ = false; + goto getout; + } + + PeekNamedPipe(pipe_, NULL, 0, NULL, &bytes_waiting, NULL); + if (bytes_waiting == 0) + goto getout; // this is typically what happens when pipe closes. + + result = (uint8*)malloc(bytes_waiting); + if (!result) + goto getout; + + if (!ReadFile(pipe_, result, bytes_waiting, NULL, &ov)) { + rv = GetLastError(); + if (rv != ERROR_IO_PENDING) + goto getout; + } + if (!WaitAndHandleWrites(1000)) { + CancelIo(pipe_); + write_overlapped_active_ = false; + free(result); + result = NULL; + goto getout; + } + bytes_waiting = (uint32)ov.InternalHigh; + if (bytes_waiting == 0) { + free(result); + result = NULL; + goto getout; + } + *packet_size = bytes_waiting; +getout: + return result; +} + +bool PipeMessageHandler::ConnectNamedPipeAsync() { + OVERLAPPED ov = {0}; + DWORD rv; + bool result = false; + ov.hEvent = wait_handles_[0]; + if (!ConnectNamedPipe(pipe_, &ov)) { + rv = GetLastError(); + if (rv != ERROR_PIPE_CONNECTED && rv != ERROR_IO_PENDING) + goto getout; + } + if (!WaitAndHandleWrites(INFINITE)) { + CancelIo(pipe_); + write_overlapped_active_ = false; + goto getout; + } + result = true; +getout: + return result; +} + +bool PipeMessageHandler::WaitAndHandleWrites(int delay) { + DWORD rv; + assert(thread_id_ == GetCurrentThreadId()); + +again: + rv = WaitForMultipleObjects(2 + write_overlapped_active_, wait_handles_, FALSE, delay); + if (rv == WAIT_OBJECT_0 + 2) { + 
assert(write_overlapped_active_); + write_overlapped_active_ = false; + // Remove the packet from the front of the queue, now + // that it was sent. + packets_mutex_.Acquire(); + OutgoingPacket *p = packets_; + if ((packets_ = p->next) == NULL) + packets_end_ = &packets_; + packets_mutex_.Release(); + free(p); + SendNextQueuedWrite(); + goto again; + } + if (rv == WAIT_OBJECT_0 + 1) { + if (exit_ || !delegate_->HandleNotify()) + return false; + + SendNextQueuedWrite(); + goto again; + } + return rv == WAIT_OBJECT_0; +} + +DWORD WINAPI PipeMessageHandler::StaticThreadMain(void *x) { + return ((PipeMessageHandler*)x)->ThreadMain(); +} + +bool PipeMessageHandler::VerifyThread() { + return thread_id_ == GetCurrentThreadId(); +} + +DWORD PipeMessageHandler::ThreadMain() { + assert((thread_id_ = GetCurrentThreadId()) != 0); + + while (!exit_) { + // Create a named pipe and wait for connections from the UI process + if (is_server_pipe_) { + if (!InitializeServerPipe()) { + if (!exit_) + ExitProcess(1); + break; + } + // Wait for a client to connect to us. + if (!ConnectNamedPipeAsync()) { + if (!exit_) + ExitProcess(1); + break; + } + } else { + if (!InitializeClientPipe()) { + RINFO("Unable to connect to the TunSafe Service. 
Please make sure it's running."); + break; + } + } + + connection_established_ = true; + if (!delegate_->HandleNewConnection()) + goto closepipe; + + SendNextQueuedWrite(); + + // Read/Process each message + for (;;) { + size_t message_size; + uint8 *message = ReadNamedPipeAsync(&message_size); + if (!message) + break; + + if (message_size) { + if (!delegate_->HandleMessage(message[0], message + 1, message_size - 1)) { + FlushWrites(1000); + break; + } + } + free(message); + } + + if (exit_) + break; + + delegate_->HandleDisconnect(); + + if (!is_server_pipe_) + break; + +closepipe: + ClosePipe(); + } + + + ClosePipe(); + + return 0; +} + +void PipeMessageHandler::FlushWrites(int delay) { + ResetEvent(wait_handles_[0]); + WaitAndHandleWrites(1000); +} + +bool PipeMessageHandler::StartThread() { + DWORD thread_id; + assert(thread_ == NULL); + thread_ = CreateThread(NULL, 0, &StaticThreadMain, this, 0, &thread_id); + return thread_ != NULL; +} + +void PipeMessageHandler::StopThread() { + if (thread_ != NULL) { + exit_ = true; + SetEvent(wait_handles_[1]); + WaitForSingleObject(thread_, INFINITE); + CloseHandle(thread_); + thread_ = NULL; + } + ClosePipe(); +} + +TunsafeServiceImpl::TunsafeServiceImpl() + : message_handler_(PIPE_NAME, true, this) { + thread_delegate_ = CreateTunsafeBackendDelegateThreaded(this, [=] { + SetEvent(message_handler_.notify_handle()); + }); + + backend_ = CreateNativeTunsafeBackend(thread_delegate_); + historical_log_lines_count_ = historical_log_lines_pos_ = 0; + last_line_sent_ = 0; + did_send_getstate_ = false; + memset(historical_log_lines_, 0, sizeof(historical_log_lines_)); + hkey_ = NULL; + want_graph_type_ = 0xffffffff; + RegCreateKeyEx(HKEY_LOCAL_MACHINE, "Software\\TunSafe", NULL, NULL, 0, KEY_ALL_ACCESS, NULL, &hkey_, NULL); +} + +TunsafeServiceImpl::~TunsafeServiceImpl() { + RegCloseKey(hkey_); +} + +static wchar_t *RegReadStrW(HKEY hkey, const wchar_t *key, const wchar_t *def) { + wchar_t buf[1024]; + DWORD n = sizeof(buf) - 2; 
+ DWORD type = 0; + if (RegQueryValueExW(hkey, key, NULL, &type, (BYTE*)buf, &n) != ERROR_SUCCESS || type != REG_SZ) + return def ? _wcsdup(def) : NULL; + n >>= 1; + if (n && buf[n - 1] == 0) + n--; + buf[n] = 0; + return _wcsdup(buf); +} + +unsigned TunsafeServiceImpl::OnStart(int argc, wchar_t **argv) { + message_handler_.StartThread(); + + uint32 service_flags = RegReadInt(hkey_, "ServiceStartupFlags", 0); + if ( (service_flags & kStartupFlag_BackgroundService) && (service_flags & kStartupFlag_ConnectWhenWindowsStarts) ) { + char *conf = RegReadStr(hkey_, "LastUsedConfigFile", ""); + if (conf && *conf) { + current_filename_ = (char*)conf; + backend_->Start((char*)conf); + } + free(conf); + } + + return 0; +} + +bool TunsafeServiceImpl::AuthenticateUser() { + did_authenticate_user_ = true; + + if (!ImpersonateNamedPipeClient(message_handler_.pipe_handle())) + return false; + wchar_t *user = GetUsernameOfCurrentUser(true); + RevertToSelf(); + if (!user) + return false; + wchar_t *valid_user = RegReadStrW(hkey_, L"AllowedUsername", L""); + bool rv = valid_user && wcscmp(user, valid_user) == 0; + + free(user); + free(valid_user); + return rv; +} + +bool TunsafeServiceImpl::HandleMessage(int type, uint8 *data, size_t size) { + if (!did_authenticate_user_) { + if (type != SERVICE_REQ_LOGIN || size < 8 || *(uint64*)data != kTunsafeServiceProtocolVersion) { + const char *s = "Versioning Problem: The TunSafe service is a different version than the UI."; + message_handler_.WritePacket(SERVICE_MSG_LOGLINE, (uint8*)s, strlen(s)); + return false; + } + if (!AuthenticateUser()) { + const char *s = "Permission Problem: Your Windows account is different from the account\r\nthat installed the TunSafe Service. 
Please reinstall it.\r\n"; + message_handler_.WritePacket(SERVICE_MSG_LOGLINE, (uint8*)s, strlen(s)); + return false; + } + } + + switch (type) { + case SERVICE_REQ_START: + if (data[size - 1] != 0) + return false; + + // Don't allow reading arbitrary files on disk + if (!EnsureValidConfigPath((char*)data)) { + char buf[MAX_PATH]; + GetConfigPath(buf, sizeof(buf)); + char *s = str_cat_alloc("Permission Problem: The Config file is in an unsafe location.\r\n Must be in:", buf, "\r\n"); + message_handler_.WritePacket(SERVICE_MSG_LOGLINE, (uint8*)s, strlen(s)); + free(s); + return false; + } + + g_allow_pre_post = RegReadInt(hkey_, "AllowPrePost", 0) != 0; + + current_filename_ = (char*)data; + backend_->Start((char*)data); + RegWriteStr(hkey_, "LastUsedConfigFile", (char*)data); + + break; + + case SERVICE_REQ_STOP: + backend_->Stop(); + RegWriteStr(hkey_, "LastUsedConfigFile", ""); + OnStateChanged(); + break; + + case SERVICE_REQ_LOGIN: + did_send_getstate_ = true; + OnStatusCode(backend_->status()); + OnStateChanged(); + SendQueuedLogLines(); + break; + + case SERVICE_REQ_GETSTATS: + if (size < 1) return false; + backend_->RequestStats(data[0] != 0); + break; + + case SERVICE_REQ_SET_INTERNET_BLOCKSTATE: + if (size < 1) + return false; + backend_->SetInternetBlockState((InternetBlockState)data[0]); + OnStateChanged(); + break; + + case SERVICE_REQ_RESETSTATS: + backend_->ResetStats(); + break; + + case SERVICE_REQ_GET_GRAPH: + if (size < 4) return false; + want_graph_type_ = *(int*)data; + TunsafeServiceImpl::OnGraphAvailable(); + break; + + case SERVICE_REQ_SET_STARTUP_FLAGS: + if (size < 4) + return false; + RegSetValueEx(hkey_, "ServiceStartupFlags", NULL, REG_DWORD, (BYTE*)data, 4); + break; + + default: + return false; + } + return true; +} + +bool TunsafeServiceImpl::HandleNotify() { + thread_delegate_->DoWork(); + return true; +} + +bool TunsafeServiceImpl::HandleNewConnection() { + did_send_getstate_ = false; + did_authenticate_user_ = false; + 
last_line_sent_ = 0; + return true; +} + +void TunsafeServiceImpl::HandleDisconnect() { + want_graph_type_ = 0xffffffff; + backend_->RequestStats(false); + uint32 service_flags = RegReadInt(hkey_, "ServiceStartupFlags", 0); + if (!(service_flags & kStartupFlag_BackgroundService)) + backend_->Stop(); +} + +void TunsafeServiceImpl::OnGraphAvailable() { + if (want_graph_type_ != 0xffffffff) { + LinearizedGraph *graph = backend_->GetGraph(want_graph_type_); + if (graph) + message_handler_.WritePacket(SERVICE_MSG_GRAPH, (uint8*)graph, graph->total_size); + } +} + +void TunsafeServiceImpl::SendQueuedLogLines() { + assert(message_handler_.VerifyThread()); + uint32 maxi = std::min(historical_log_lines_count_, historical_log_lines_pos_ - last_line_sent_); + last_line_sent_ = historical_log_lines_pos_; + for (uint32 i = 0; i < maxi; i++) { + const char *s = historical_log_lines_[(historical_log_lines_pos_ - maxi + i) & (LOGLINE_COUNT - 1)]; + if (s) + message_handler_.WritePacket(SERVICE_MSG_LOGLINE, (uint8*)s, strlen(s)); + } +} + +void TunsafeServiceImpl::OnClearLog() { + historical_log_lines_pos_ = 0; + historical_log_lines_count_ = 0; + message_handler_.WritePacket(SERVICE_MSG_CLEARLOG, NULL, 0); +} + +void TunsafeServiceImpl::OnLogLine(const char **s) { + assert(message_handler_.VerifyThread()); + char *ss = (char*)*s; + *s = NULL; + char *&x = historical_log_lines_[historical_log_lines_pos_++ & (LOGLINE_COUNT - 1)]; + std::swap(x, ss); + if (historical_log_lines_count_ < LOGLINE_COUNT) + historical_log_lines_count_++; + free(ss); + if (did_send_getstate_) + SendQueuedLogLines(); +} + +void TunsafeServiceImpl::OnGetStats(const WgProcessorStats &stats) { + message_handler_.WritePacket(SERVICE_MSG_STATS, (uint8*)&stats, sizeof(stats)); +} + +void TunsafeServiceImpl::OnStateChanged() { + uint8 *temp = new uint8[current_filename_.size() + 1 + sizeof(ServiceState)]; + bool is_activated; + + memset(temp, 0, sizeof(ServiceState)); + + ServiceState *ss = (ServiceState *)temp; + 
ss->is_started = backend_->is_started(); + ss->internet_block_state = backend_->GetInternetBlockState(&is_activated); + ss->internet_block_state_active = is_activated; + ss->ipv4_ip = backend_->GetIP(); + memcpy(ss->public_key, backend_->public_key(), 32); + + memcpy(temp + sizeof(ServiceState), current_filename_.c_str(), current_filename_.size() + 1); + message_handler_.WritePacket(SERVICE_MSG_STATE, temp, current_filename_.size() + 1 + sizeof(ServiceState)); + delete[] temp; +} + +void TunsafeServiceImpl::OnStatusCode(TunsafeBackend::StatusCode status) { + if (status == TunsafeBackend::kStatusConnected) + OnStateChanged(); // ensure we know the ip first + uint32 v32 = (uint32)status; + message_handler_.WritePacket(SERVICE_MSG_STATUS_CODE, (uint8*)&v32, 4); +} + +void TunsafeServiceImpl::OnStop() { + message_handler_.StopThread(); + backend_->Stop(); +} + +void TunsafeServiceImpl::OnShutdown() { + +} + +static void PushServiceLine(const char *s) { + if (g_service) { + char buf[64]; + SYSTEMTIME t; + + size_t l = strlen(s); + GetLocalTime(&t); + snprintf(buf, sizeof(buf), "[%.2d:%.2d:%.2d] ", t.wHour, t.wMinute, t.wSecond); + size_t tl = strlen(buf); + + char *x = (char*) malloc(tl + l + 3); + memcpy(x, buf, tl); + memcpy(x + tl, s, l); + x[l + tl] = '\r'; + x[l + tl + 1] = '\n'; + x[l + tl + 2] = '\0'; + g_service->delegate()->OnLogLine((const char**)&x); + free(x); + } else { + size_t l = strlen(s); + char buf[1024]; + SYSTEMTIME t; + GetLocalTime(&t); + + snprintf(buf, sizeof(buf), "[%.2d:%.2d:%.2d] ", t.wHour, t.wMinute, t.wSecond); + size_t tl = strlen(buf); + + if (l >= ARRAYSIZE(buf) - tl - 1) + l = ARRAYSIZE(buf) - tl - 1; + + memcpy(buf + tl, s, l); + buf[l + tl] = '\0'; + + WriteServiceLog(buf, EVENTLOG_INFORMATION_TYPE); + } +} + +BOOL RunProcessAsTunsafeServiceProcess() { + g_service = new TunsafeServiceImpl; + g_logger = &PushServiceLine; + + //g_service->OnStart(NULL, 0); + + //MessageBoxA(0, "Service running", "Service running", 0); + //return TRUE; 
+// while (true)Sleep(1000); + + // Connects the main thread of a service process to the service control + // manager, which causes the thread to be the service control dispatcher + // thread for the calling process. This call returns when the service has + // stopped. The process should simply terminate when the call returns. + return StartServiceCtrlDispatcherW(serviceTable); +} +TunsafeServiceClient::TunsafeServiceClient(TunsafeBackend::Delegate *delegate) + : message_handler_(PIPE_NAME, false, this) { + is_remote_ = true; + got_state_from_control_ = false; + delegate_ = delegate; + cached_graph_ = 0; + last_graph_type_ = 0xffffffff; + memset(&service_state_, 0, sizeof(service_state_)); +} + +TunsafeServiceClient::~TunsafeServiceClient() { + message_handler_.StopThread(); +} + +bool TunsafeServiceClient::Initialize() { + // Wait for the service to start + last_graph_type_ = 0xffffffff; + return message_handler_.StartThread(); +} + +void TunsafeServiceClient::Start(const char *config_file) { + message_handler_.WritePacket(SERVICE_REQ_START, (uint8*)config_file, strlen(config_file) + 1); +} + +void TunsafeServiceClient::Stop() { + message_handler_.WritePacket(SERVICE_REQ_STOP, NULL, 0); +} + +void TunsafeServiceClient::RequestStats(bool enable) { + want_stats_ = enable; + if (message_handler_.is_connected()) + message_handler_.WritePacket(SERVICE_REQ_GETSTATS, &want_stats_, 1); +} + +void TunsafeServiceClient::ResetStats() { + message_handler_.WritePacket(SERVICE_REQ_RESETSTATS, NULL, 0); +} + +InternetBlockState TunsafeServiceClient::GetInternetBlockState(bool *is_activated) { + if (is_activated) + *is_activated = service_state_.internet_block_state_active; + return (InternetBlockState)service_state_.internet_block_state; +} + +void TunsafeServiceClient::SetInternetBlockState(InternetBlockState s) { + uint8 v = (uint8)s; + message_handler_.WritePacket(SERVICE_REQ_SET_INTERNET_BLOCKSTATE, &v, 1); +} + +void TunsafeServiceClient::SetServiceStartupFlags(uint32 
flags) { + message_handler_.WritePacket(SERVICE_REQ_SET_STARTUP_FLAGS, (uint8*)&flags, 4); +} + +LinearizedGraph *TunsafeServiceClient::GetGraph(int type) { + if (type != last_graph_type_) { + last_graph_type_ = type; + message_handler_.WritePacket(SERVICE_REQ_GET_GRAPH, (uint8*)&type, 4); + } + mutex_.Acquire(); + LinearizedGraph *graph = cached_graph_; + LinearizedGraph *new_graph = (graph && graph->graph_type == type) ? (LinearizedGraph*)memdup(graph, graph->total_size) : NULL; + mutex_.Release(); + return new_graph; +} + + +std::string TunsafeServiceClient::GetConfigFileName() { + mutex_.Acquire(); + std::string rv = config_file_; + mutex_.Release(); + return rv; +} + +bool TunsafeServiceClient::HandleMessage(int type, uint8 *data, size_t data_size) { + switch(type) { + case SERVICE_MSG_STATE: + if (data_size <= sizeof(service_state_) || data[data_size - 1]) + return false; + got_state_from_control_ = true; + + mutex_.Acquire(); + config_file_.assign((char*)data + sizeof(service_state_), data_size - 1 - sizeof(service_state_)); + memcpy(&service_state_, data, sizeof(service_state_)); + memcpy(public_key_, service_state_.public_key, 32); + is_started_ = service_state_.is_started; + ipv4_ip_ = service_state_.ipv4_ip; + mutex_.Release(); + delegate_->OnStateChanged(); + return true; + case SERVICE_MSG_LOGLINE: { + if (data_size == 0) + return false; + char *s = my_strndup((char*)data, data_size); + delegate_->OnLogLine((const char **)&s); + free(s); + return true; + } + case SERVICE_MSG_STATS: { + WgProcessorStats stats; + if (data_size != sizeof(WgProcessorStats)) + return false; + memcpy(&stats, data, sizeof(WgProcessorStats)); + delegate_->OnGetStats(stats); + return true; + } + case SERVICE_MSG_CLEARLOG: + delegate_->OnClearLog(); + return true; + + case SERVICE_MSG_STATUS_CODE: + if (data_size < 4) + return false; + status_ = (StatusCode)*(uint32*)data; + delegate_->OnStatusCode(status_); + return true; + + case SERVICE_MSG_GRAPH: + if (data_size < 4 || 
data_size != *(uint32*)data) + return false; + + LinearizedGraph *graph = (LinearizedGraph*)memdup(data, data_size); + mutex_.Acquire(); + std::swap(graph, cached_graph_); + mutex_.Release(); + free(graph); + delegate_->OnGraphAvailable(); + return true; + } + + return false; +} + +bool TunsafeServiceClient::HandleNotify() { + return true; +} + + +bool TunsafeServiceClient::HandleNewConnection() { + message_handler_.WritePacket(SERVICE_REQ_LOGIN, (uint8*)&kTunsafeServiceProtocolVersion, 8); + if (want_stats_) + message_handler_.WritePacket(SERVICE_REQ_GETSTATS, &want_stats_, 1); + return true; +} + +void TunsafeServiceClient::HandleDisconnect() { + status_ = TunsafeBackend::kErrorServiceLost; + delegate_->OnStatusCode(TunsafeBackend::kErrorServiceLost); +} + +void TunsafeServiceClient::Teardown() { + message_handler_.StopThread(); +} + +TunsafeBackend *CreateTunsafeServiceClient(TunsafeBackend::Delegate *delegate) { + TunsafeServiceClient *client = new TunsafeServiceClient(delegate); + if (client && !client->Initialize()) { + delete client; + client = NULL; + } + return client; +} + + diff --git a/service_win32.h b/service_win32.h new file mode 100644 index 0000000..eee61be --- /dev/null +++ b/service_win32.h @@ -0,0 +1,171 @@ +// SPDX-License-Identifier: AGPL-1.0-only +// Copyright (C) 2018 Ludvig Strigeus . All Rights Reserved. 
// Runs one end of the TunSafe service control pipe on a dedicated thread.
// Incoming messages are delivered to a Delegate; outgoing messages are queued
// with WritePacket and written by the pipe thread using overlapped I/O.
class PipeMessageHandler {
public:
  // Callbacks invoked by the pipe thread.
  class Delegate {
  public:
    // One received message: |type| is its first byte, |data|/|size| the rest.
    // Returning false ends the read loop for the current connection.
    virtual bool HandleMessage(int type, uint8 *data, size_t size) = 0;
    // Called when the notify event is signalled and no exit was requested.
    // Returning false aborts the wait that is currently in progress.
    virtual bool HandleNotify() = 0;
    // Called once a connection is established. Returning false closes it.
    virtual bool HandleNewConnection() = 0;
    // Called after the read loop of a connection has ended, unless the
    // thread is exiting.
    virtual void HandleDisconnect() = 0;
  };

  // |pipe_name| is copied. |is_server_pipe| selects whether the thread
  // creates the pipe and waits for clients, or connects to an existing pipe.
  PipeMessageHandler(const char *pipe_name, bool is_server_pipe, Delegate *delegate);
  ~PipeMessageHandler();

  // Starts the pipe thread; returns false if the thread cannot be created.
  bool StartThread();
  // Requests exit, waits for the pipe thread to finish and closes the pipe.
  void StopThread();

  // Queues a message made of the |type| byte followed by |data|.
  bool WritePacket(int type, const uint8 *data, size_t data_size);

  // Event that wakes the pipe thread (shared with the exit request).
  HANDLE notify_handle() { return wait_handles_[1]; }
  HANDLE pipe_handle() { return pipe_; }

  // True when called on the pipe thread.
  bool VerifyThread();

  // Waits for queued writes to be serviced.
  void FlushWrites(int delay);
  bool is_connected() { return connection_established_; }
private:
  bool InitializeServerPipe();
  bool InitializeClientPipe();
  void ClosePipe();
  DWORD ThreadMain();
  void SendNextQueuedWrite();
  uint8 *ReadNamedPipeAsync(size_t *packet_size);
  bool ConnectNamedPipeAsync();
  bool WaitAndHandleWrites(int delay);
  static DWORD WINAPI StaticThreadMain(void *x);

  Delegate *delegate_;

  HANDLE pipe_;
  HANDLE thread_;
  // [0] read/connect completion, [1] exit and notify, [2] write completion.
  HANDLE wait_handles_[3];
  OVERLAPPED write_overlapped_;
  // True while an overlapped WriteFile for the queue head is outstanding.
  bool write_overlapped_active_;
  bool exit_;
  bool is_server_pipe_;
  bool connection_established_;
  char *pipe_name_;

  // Node of the singly linked queue of outgoing messages; |data| holds
  // |size| bytes, the first of which is the message type.
  struct OutgoingPacket {
    OutgoingPacket *next;
    uint32 size;
    uint8 data[0];
  };
  // Queue head, and the address of the link where the next packet is appended.
  OutgoingPacket *packets_, **packets_end_;

  // Guards packets_ and packets_end_.
  Mutex packets_mutex_;

  // Id of the pipe thread, 0 until that thread has recorded it.
  DWORD thread_id_;
};
TunsafeServiceImpl(); + virtual ~TunsafeServiceImpl(); + + // -- from TunsafeBackend::Delegate + virtual void OnGetStats(const WgProcessorStats &stats); + virtual void OnClearLog(); + virtual void OnLogLine(const char **s); + virtual void OnStateChanged(); + virtual void OnStatusCode(TunsafeBackend::StatusCode status); + virtual void OnGraphAvailable(); + + // -- from PipeMessageHandler::Delegate + virtual bool HandleMessage(int type, uint8 *data, size_t size); + virtual bool HandleNotify(); + virtual bool HandleNewConnection(); + virtual void HandleDisconnect(); + + // virtual methods + virtual unsigned OnStart(int argc, wchar_t **argv); + virtual void OnStop(); + virtual void OnShutdown(); + + TunsafeBackend::Delegate *delegate() { return thread_delegate_; } + +private: + void SendQueuedLogLines(); + bool AuthenticateUser(); + + bool did_send_getstate_; + + bool did_authenticate_user_; + uint32 want_graph_type_; + + HKEY hkey_; + + TunsafeBackend *backend_; + TunsafeBackend::Delegate *thread_delegate_; + + PipeMessageHandler message_handler_; + + uint32 historical_log_lines_pos_; + uint32 historical_log_lines_count_; + uint32 last_line_sent_; + std::string current_filename_; + + enum { + LOGLINE_COUNT = 256 + }; + char *historical_log_lines_[LOGLINE_COUNT]; +}; + +class TunsafeServiceClient : public TunsafeBackend, public PipeMessageHandler::Delegate { +public: + TunsafeServiceClient(TunsafeBackend::Delegate *delegate); + virtual ~TunsafeServiceClient(); + virtual bool Initialize(); + virtual void Teardown(); + virtual void Start(const char *config_file); + virtual void Stop(); + virtual void RequestStats(bool enable); + virtual void ResetStats(); + virtual InternetBlockState GetInternetBlockState(bool *is_activated); + virtual void SetInternetBlockState(InternetBlockState s); + virtual std::string GetConfigFileName(); + virtual void SetServiceStartupFlags(uint32 flags); + virtual LinearizedGraph *GetGraph(int type); + + // -- from PipeMessageHandler::Delegate + 
// Bit flags stored in the "ServiceStartupFlags" registry value and sent with
// SERVICE_REQ_SET_STARTUP_FLAGS.
enum StartupFlags {
  // NOTE(review): presumably selects foreground mode, where the service
  // disconnects when the TunSafe UI closes - confirm against the UI code.
  kStartupFlag_ForegroundService = 1,
  // Keep the tunnel up when the UI disconnects from the service.
  kStartupFlag_BackgroundService = 2,
  // Together with kStartupFlag_BackgroundService: reconnect using the last
  // used config file when the service starts.
  kStartupFlag_ConnectWhenWindowsStarts = 4,
  // NOTE(review): not referenced by the service code; presumably consumed by
  // the UI - confirm.
  kStartupFlag_MinimizeToTrayWhenWindowsStarts = 8,
};
#pragma once -#define TUNSAFE_VERSION_STRING "TunSafe 1.3-rc3" +#define TUNSAFE_VERSION_STRING "TunSafe 1.4-rc1" #define WITH_HANDSHAKE_EXT 0 #define WITH_SHORT_HEADERS 0 diff --git a/tunsafe_cpu.cpp b/tunsafe_cpu.cpp index b1ee8cc..ec14aac 100644 --- a/tunsafe_cpu.cpp +++ b/tunsafe_cpu.cpp @@ -10,6 +10,16 @@ #include +static char *strcpy_e(char *dst, char *end, const char *copy) { + size_t len = strlen(copy); + if (len >= (size_t)(end - dst)) return end; + memcpy(dst, copy, len + 1); + return dst + len; +} + + +#if defined(ARCH_CPU_X86_FAMILY) + uint32 x86_pcap[3]; #if !defined(COMPILER_MSVC) @@ -22,6 +32,7 @@ static inline void __cpuid(int info[4], int func) { } #endif + void InitCpuFeatures() { unsigned nIds, nExIds; @@ -45,13 +56,6 @@ void InitCpuFeatures() { } } -static char *strcpy_e(char *dst, char *end, const char *copy) { - size_t len = strlen(copy); - if (len >= (size_t)(end - dst)) return end; - memcpy(dst, copy, len + 1); - return dst + len; -} - void PrintCpuFeatures() { char capbuf[2048], *end = capbuf + 2048, *s = capbuf; @@ -66,3 +70,22 @@ void PrintCpuFeatures() { RINFO("Using:%s", capbuf); } + +#endif // defined(ARCH_CPU_X86_FAMILY) + +#if defined(ARCH_CPU_ARM_FAMILY) + +uint32 arm_pcap[1]; + +void InitCpuFeatures() { + arm_pcap[0] = 0xffffffff; +} + +void PrintCpuFeatures() { + char capbuf[2048], *end = capbuf + 2048, *s = capbuf; + + if (ARM_PCAP_NEON) s = strcpy_e(s, end, " neon"); + + RINFO("Using:%s", capbuf); +} +#endif // defined(ARCH_CPU_ARM_FAMILY) diff --git a/tunsafe_cpu.h b/tunsafe_cpu.h index de97b6c..c19f6b1 100644 --- a/tunsafe_cpu.h +++ b/tunsafe_cpu.h @@ -5,6 +5,9 @@ #include "tunsafe_types.h" + +#if defined(ARCH_CPU_X86_FAMILY) + extern uint32 x86_pcap[3]; // cpuid 1, edx @@ -22,8 +25,19 @@ extern uint32 x86_pcap[3]; #define X86_PCAP_AVX512F (x86_pcap[2] & (1 << 16)) #define X86_PCAP_AVX512VL (x86_pcap[2] & (1 << 31)) +#endif // defined(ARCH_CPU_X86_FAMILY) + + +#if defined(ARCH_CPU_ARM_FAMILY) + +extern uint32 arm_pcap[1]; + 
+#define ARM_PCAP_NEON (arm_pcap[0] & (1 << 0)) + +#endif // defined(ARCH_CPU_ARM_FAMILY) + void InitCpuFeatures(); void PrintCpuFeatures(); -#endif // TUNSAFE_CPU_H_ \ No newline at end of file +#endif // TUNSAFE_CPU_H_ diff --git a/tunsafe_endian.h b/tunsafe_endian.h index 32bce5e..45cb316 100644 --- a/tunsafe_endian.h +++ b/tunsafe_endian.h @@ -70,6 +70,7 @@ #define ReadBE32Aligned(pt) ToBE32(*(uint32*)(pt)) #define WriteBE32Aligned(ct, st) (*(uint32*)(ct) = ToBE32(st)) +// todo: these need to support unaligned pointers #define ReadBE16(pt) ToBE16(*(uint16*)(pt)) #define WriteBE16(ct, st) (*(uint16*)(ct) = ToBE16(st)) #define ReadBE32(pt) ToBE32(*(uint32*)(pt)) diff --git a/tunsafe_threading.cpp b/tunsafe_threading.cpp new file mode 100644 index 0000000..af21db3 --- /dev/null +++ b/tunsafe_threading.cpp @@ -0,0 +1,57 @@ +// SPDX-License-Identifier: AGPL-1.0-only +// Copyright (C) 2018 Ludvig Strigeus . All Rights Reserved. +#include "stdafx.h" +#include "tunsafe_threading.h" +#include + +MultithreadedDelayedDelete::MultithreadedDelayedDelete() { + table_ = NULL; + num_threads_ = 0; +} + +MultithreadedDelayedDelete::~MultithreadedDelayedDelete() { + free(table_); +} + +void MultithreadedDelayedDelete::Initialize(uint32 num_threads) { + num_threads_ = num_threads; + table_ = (CheckpointData*)calloc(sizeof(CheckpointData), num_threads); +} + +void MultithreadedDelayedDelete::Add(DoDeleteFunc *func, void *param) { + if (num_threads_ == 0) { + func(param); + return; + } + lock_.Acquire(); + Entry e = {func, param}; + curr_.push_back(e); + lock_.Release(); +} + +void MultithreadedDelayedDelete::Checkpoint(uint32 thread_id) { + table_[thread_id].value.store(1); +} + +void MultithreadedDelayedDelete::MainCheckpoint() { + // Wait for all threads to signal that they reached the checkpoint + for (size_t i = 0; i < num_threads_; i++) { + if (table_[i].value.load() == 0) + return; + } + + // All threads reached the checkpoint, clear the values + for (size_t i = 0; i < 
num_threads_; i++) + table_[i].value.store(0); + + // Swap curr and next, and delete all nexts. + lock_.Acquire(); + std::swap(curr_, next_); + std::swap(curr_, to_delete_); + lock_.Release(); + + for (auto it = to_delete_.begin(); it != to_delete_.end(); ++it) { + it->func(it->param); + } + to_delete_.clear(); +} diff --git a/tunsafe_threading.h b/tunsafe_threading.h new file mode 100644 index 0000000..1362678 --- /dev/null +++ b/tunsafe_threading.h @@ -0,0 +1,175 @@ +// SPDX-License-Identifier: AGPL-1.0-only +// Copyright (C) 2018 Ludvig Strigeus . All Rights Reserved. +#pragma once +#include "tunsafe_types.h" +#include +#include +#include +#if !defined(OS_WIN) +#include +#endif // !defined(OS_WIN) + +#if defined(OS_WIN) + +class ReaderWriterLock { +public: + ReaderWriterLock() : lock_(SRWLOCK_INIT) {} + void AcquireExclusive() { AcquireSRWLockExclusive(&lock_); } + void AcquireShared() { AcquireSRWLockShared(&lock_); } + void ReleaseExclusive() { ReleaseSRWLockExclusive(&lock_); } + void ReleaseShared() { ReleaseSRWLockShared(&lock_); } +private: + SRWLOCK lock_; +}; + +class Mutex { +public: +#if defined(_DEBUG) + bool locked_; + bool IsLocked() { return locked_; } +#define Mutex_SETLOCKED(x) locked_ = x; +#else + bool IsLocked() { return false; } +#define Mutex_SETLOCKED(x) +#endif + Mutex() : lock_(SRWLOCK_INIT) { Mutex_SETLOCKED(false); } + ~Mutex() { } + void Acquire() { + AcquireSRWLockExclusive(&lock_); + Mutex_SETLOCKED(true); + } + void Release() { + Mutex_SETLOCKED(false); + ReleaseSRWLockExclusive(&lock_); + } +private: + SRWLOCK lock_; +}; + +typedef uint32 ThreadId; + +static inline bool CurrentThreadIdEquals(ThreadId thread_id) { + return thread_id == GetCurrentThreadId(); +} + +#else // defined(OS_WIN) + +class ReaderWriterLock { +public: + ReaderWriterLock() { + if (pthread_rwlock_init(&lock_, NULL) != 0) + tunsafe_die("pthread_rwlock_init failed"); + } + ~ReaderWriterLock() { + pthread_rwlock_destroy(&lock_); + } + void AcquireExclusive() { int 
rv = pthread_rwlock_wrlock(&lock_); assert(rv == 0); } + void AcquireShared() { int rv = pthread_rwlock_rdlock(&lock_); assert(rv == 0); } + void ReleaseExclusive() { int rv = pthread_rwlock_unlock(&lock_); assert(rv == 0); } + void ReleaseShared() { int rv = pthread_rwlock_unlock(&lock_); assert(rv == 0); } +private: + pthread_rwlock_t lock_; +}; + +class Mutex { +public: +#if defined(_DEBUG) + bool locked_; + bool IsLocked() { return locked_; } +#define Mutex_SETLOCKED(x) locked_ = x; +#else + bool IsLocked() { return false; } +#define Mutex_SETLOCKED(x) +#endif + Mutex() { + if (pthread_mutex_init(&lock_, NULL) != 0) + tunsafe_die("pthread_mutex_init failed"); + Mutex_SETLOCKED(false); + } + ~Mutex() { + pthread_mutex_destroy(&lock_); + } + void Acquire() { + int rv = pthread_mutex_lock(&lock_); + assert(rv == 0); + Mutex_SETLOCKED(true); + } + void Release() { + Mutex_SETLOCKED(false); + int rv = pthread_mutex_unlock(&lock_); + assert(rv == 0); + } + pthread_mutex_t *impl() { return &lock_; } +private: + pthread_mutex_t lock_; +}; + +typedef pthread_t ThreadId; + +static inline bool CurrentThreadIdEquals(ThreadId thread_id) { + return pthread_equal(thread_id, pthread_self()) != 0; +} + +static inline ThreadId GetCurrentThreadId() { + return pthread_self(); +} + +#endif // !defined(OS_WIN) + +class ScopedLockShared { +public: + ScopedLockShared(ReaderWriterLock *lock) : lock_(lock) { lock->AcquireShared(); } + ~ScopedLockShared() { lock_->ReleaseShared(); } +private: + ReaderWriterLock *lock_; +}; + +class ScopedLockExclusive { +public: + ScopedLockExclusive(ReaderWriterLock *lock) : lock_(lock) { lock->AcquireExclusive(); } + ~ScopedLockExclusive() { lock_->ReleaseExclusive(); } +private: + ReaderWriterLock *lock_; +}; + +class ScopedLock { +public: + ScopedLock(Mutex *lock) : lock_(lock) { lock->Acquire(); } + ~ScopedLock() { lock_->Release(); } +private: + Mutex *lock_; +}; + +// This class deletes objects delayed. 
All participating threads will call a function, +// and then once all threads did, all registered objects will get deleted. +class MultithreadedDelayedDelete { +public: + MultithreadedDelayedDelete(); + ~MultithreadedDelayedDelete(); + + typedef void DoDeleteFunc(void *x); + void Add(DoDeleteFunc *func, void *param); + + void Initialize(uint32 num_threads); + + void Checkpoint(uint32 thread_id); + + void MainCheckpoint(); + +private: + struct Entry { + DoDeleteFunc *func; + void *param; + }; + + struct CheckpointData { + std::atomic value; + uint8 align[60]; + }; + + uint32 num_threads_; + + std::vector curr_, next_, to_delete_; + CheckpointData *table_; + Mutex lock_; +}; diff --git a/tunsafe_types.h b/tunsafe_types.h index 9ddabab..7bdc600 100644 --- a/tunsafe_types.h +++ b/tunsafe_types.h @@ -68,6 +68,6 @@ static inline uint32 rol32(uint32 x, int8_t r) { void RERROR(const char *msg, ...); void RINFO(const char *msg, ...); - +void tunsafe_die(const char *msg); #endif // TINYVPN_TYPES_H_ diff --git a/tunsafe_win32.cpp b/tunsafe_win32.cpp index 846ce28..7a59d5e 100644 --- a/tunsafe_win32.cpp +++ b/tunsafe_win32.cpp @@ -23,8 +23,9 @@ #include #include #include "crypto/curve25519-donna.h" +#include "service_win32.h" +#include "util_win32.h" -#undef min #pragma comment(lib, "iphlpapi.lib") #pragma comment(lib, "rpcrt4.lib") #pragma comment(lib,"comctl32.lib") @@ -34,119 +35,103 @@ void InitCpuFeatures(); void PrintCpuFeatures(); void Benchmark(); static const char *GetCurrentConfigTitle(char *buf, size_t max_size); +static char *PrintMB(char *buf, int64 bytes); +static void LoadConfigFile(const char *filename, bool save, bool force_start); +static void SetCurrentConfigFilename(const char *filename); +static void CreateLocalOrRemoteBackend(bool remote); +static void UpdateGraphReq(); #pragma warning(disable: 4200) -static void MyPostMessage(int msg, WPARAM wparam, LPARAM lparam); - +static bool g_is_connected_to_server; +static bool g_notified_connected_server; static 
HWND g_ui_window; -static in_addr_t g_ui_ip; static HICON g_icons[2]; static bool g_minimize_on_connect; static bool g_ui_visible; static char *g_current_filename; -static HKEY g_reg_key; static HINSTANCE g_hinstance; -static TunsafeBackendWin32 *g_backend; -static bool g_last_popup_is_tray; +static TunsafeBackend *g_backend; +static TunsafeBackend::Delegate *g_backend_delegate; +static const char *g_cmdline_filename; +static bool g_first_state_msg; +static bool g_is_limited_uac_account; +static bool g_is_tunsafe_service_running; +static bool g_disable_connect_on_start; +static bool g_not_first_status_msg; +static HANDLE g_runonce_mutex; +static int g_startup_flags; +static HKEY g_reg_key; +static HKEY g_hklm_reg_key; +static HKEY g_hklm_readonly_reg_key; +static HWND hwndPaintBox, hwndStatus, hwndGraphBox, hwndTab, hwndAdvancedBox, hwndEdit; +static WgProcessorStats g_processor_stats; +static int g_large_fonts; +static TunsafeBackend::StatusCode g_status_code; +static UINT g_message_taskbar_created; +static int g_current_tab; +static bool wm_dropfiles_recursive; +static bool g_has_icon; +static int g_selected_graph_type; +static RECT comborect; +static HBITMAP arrowbitmap; +static uint32 g_timestamp_of_exit_menuloop; +enum UpdateIconWhy { + UIW_NONE = 0, + UIW_STOPPED_WORKING_FAIL = 1, + UIW_START = 2, +}; +static void UpdateIcon(UpdateIconWhy error); -int RegReadInt(const char *key, int def) { - DWORD value = def, n = sizeof(value); - RegQueryValueEx(g_reg_key, key, NULL, NULL, (BYTE*)&value, &n); - return value; + +int RescaleDpi(int size) { + return (g_large_fonts == 96) ? 
size : size * g_large_fonts / 96; } -void RegWriteInt(const char *key, int value) { - RegSetValueEx(g_reg_key, key, NULL, REG_DWORD, (BYTE*)&value, sizeof(value)); -} - -char *RegReadStr(const char *key, const char *def) { - char buf[1024]; - DWORD n = sizeof(buf) - 1; - DWORD type = 0; - if (RegQueryValueEx(g_reg_key, key, NULL, &type, (BYTE*)buf, &n) != ERROR_SUCCESS || type != REG_SZ) - return def ? _strdup(def) : NULL; - if (n && buf[n - 1] == 0) - n--; - buf[n] = 0; - return _strdup(buf); -} - -void RegWriteStr(const char *key, const char *v) { - RegSetValueEx(g_reg_key, key, NULL, REG_SZ, (BYTE*)v, (DWORD)strlen(v) + 1); -} - -void str_set(char **x, const char *s) { - free(*x); - *x = _strdup(s); -} - -char *str_cat_alloc(const char *a, const char *b) { - size_t al = strlen(a); - size_t bl = strlen(b); - char *r = (char *)malloc(al + bl + 1); - memcpy(r, a, al); - r[al + bl] = 0; - memcpy(r + al, b, bl); - return r; -} - -static const char *FindLastFolderSep(const char *s) { - size_t len = strlen(s); - for (;;) { - if (len == 0) - return NULL; - len--; - if (s[len] == '\\' || s[len] == '/') - break; +RECT RescaleDpiRect(const RECT &r) { + RECT rr = r; + if (g_large_fonts != 96) { + rr.left = rr.left * g_large_fonts / 96; + rr.top = rr.top * g_large_fonts / 96; + rr.right = rr.right * g_large_fonts / 96; + rr.bottom = rr.bottom * g_large_fonts / 96; } - return s + len; + return rr; } +static void SetUiVisibility(bool visible) { + g_ui_visible = visible; + ShowWindow(g_ui_window, visible ? 
SW_SHOW : SW_HIDE); + g_backend->RequestStats(visible); + UpdateGraphReq(); +} static bool GetConfigFullName(const char *basename, char *fullname, size_t fullname_size) { size_t len = strlen(basename); - if (FindLastFolderSep(basename)) { + if (FindFilenameComponent(basename)[0]) { if (len >= fullname_size) return false; memcpy(fullname, basename, len + 1); return true; } - if (!GetModuleFileName(NULL, fullname, (DWORD)fullname_size)) + size_t clen = GetConfigPath(fullname, fullname_size); + if (clen == 0 || clen + len >= fullname_size) return false; - char *last = (char *)FindLastFolderSep(fullname); - if (!last || last + len + 8 >= fullname + fullname_size) - return false; - memcpy(last + 1, "Config\\", 7 * sizeof(last[0])); - memcpy(last + 8, basename, (len + 1) * sizeof(last[0])); + memcpy(fullname + clen, basename, (len + 1) * sizeof(fullname[0])); return true; } -enum UpdateIconWhy { - UIW_NONE = 0, - UIW_STOPPED_WORKING_FAIL = 1, - UIW_STOPPED_WORKING_RETRY = 2, - UIW_EXITING = 3, -}; -static void UpdateIcon(UpdateIconWhy error); -static void UpdateButtons(); - - -void StopService(UpdateIconWhy error) { +void StopTunsafeBackend(UpdateIconWhy why) { if (g_backend->is_started()) { g_backend->Stop(); - - g_ui_ip = 0; - - if (error != UIW_EXITING) { - UpdateIcon(error); - RINFO("Disconnecting"); - UpdateButtons(); - RegWriteInt("IsConnected", 0); - } + if (g_is_connected_to_server) + RINFO("Disconnected"); + g_is_connected_to_server = false; + UpdateIcon(why); + RegWriteInt(g_reg_key, "IsConnected", 0); } } @@ -155,44 +140,146 @@ const char *print_ip(char buf[kSizeOfAddress], in_addr_t ip) { return buf; } -class MyProcessorDelegate : public ProcessorDelegate { -public: - virtual void OnConnected(in_addr_t my_ip) { - if (my_ip != g_ui_ip) { +void StartTunsafeBackend(UpdateIconWhy reason) { + if (!*g_current_filename) + return; - if (my_ip) { - char buf[kSizeOfAddress]; - print_ip(buf, my_ip); - RINFO("Connection established. 
IP %s", buf); + // recreate service connection + if (g_backend->status() == TunsafeBackend::kErrorServiceLost) + CreateLocalOrRemoteBackend(g_backend->is_remote()); + + if (g_backend->is_remote() && !EnsureValidConfigPath(g_current_filename)) { + RERROR("The config file needs to be in the Config-directory. Maybe the TunSafe\r\n process doesn't match with the running service. Try selecting 'Don't Use a Service'."); + StopTunsafeBackend(UIW_NONE); + return; + } + g_notified_connected_server = false; + g_is_connected_to_server = false; + g_backend->Start(g_current_filename); + RegWriteInt(g_reg_key, "IsConnected", 1); +} + +static void InvalidatePaintbox() { + InvalidateRect(hwndPaintBox, NULL, FALSE); +} + +class MyBackendDelegate : public TunsafeBackend::Delegate { +public: + virtual void OnGraphAvailable() { + InvalidateRect(hwndGraphBox, NULL, FALSE); + } + + virtual void OnGetStats(const WgProcessorStats &stats) { + g_processor_stats = stats; + InvalidatePaintbox(); + + char buf[64]; + uint32 mbs_in = (uint32)(stats.tun_bytes_out_per_second * (1.0 / 1250)); + uint32 gb_in = (uint32)(stats.tun_bytes_out * (1.0 / (1024 * 1024 * 1024 / 100))); + + snprintf(buf, ARRAYSIZE(buf), "D: %d.%.2d Mbps (%d.%.2d GB)", mbs_in / 100, mbs_in % 100, gb_in / 100, gb_in % 100); + SendMessage(hwndStatus, SB_SETTEXT, 1, (LPARAM)buf); + + uint32 mbs_out = (uint32)(stats.tun_bytes_in_per_second * (1.0 / 1250)); + uint32 gb_out = (uint32)(stats.tun_bytes_in * (1.0 / (1024 * 1024 * 1024 / 100))); + + snprintf(buf, ARRAYSIZE(buf), "U: %d.%.2d Mbps (%d.%.2d GB)", mbs_out / 100, mbs_out % 100, gb_out / 100, gb_out % 100); + SendMessage(hwndStatus, SB_SETTEXT, 2, (LPARAM)buf); + + InvalidateRect(hwndAdvancedBox, NULL, FALSE); + } + + virtual void OnLogLine(const char **s) { + CHARRANGE cr; + cr.cpMin = -1; + cr.cpMax = -1; + // hwnd = rich edit hwnd + SendMessage(hwndEdit, EM_EXSETSEL, 0, (LPARAM)&cr); + SendMessage(hwndEdit, EM_REPLACESEL, 0, (LPARAM)*s); + } + + virtual void 
OnStateChanged() { + if (!g_first_state_msg) { + g_first_state_msg = true; + char fullname[1024]; + + const char *filename = g_cmdline_filename; + if (filename) { + if (GetConfigFullName(filename, fullname, sizeof(fullname))) + SetCurrentConfigFilename(fullname); + } else { + std::string currconfig = g_backend->GetConfigFileName(); + if (currconfig.empty()) { + char *conf = RegReadStr(g_reg_key, "ConfigFile", "TunSafe.conf"); + if (GetConfigFullName(conf, fullname, sizeof(fullname))) + SetCurrentConfigFilename(fullname); + free(conf); + } else { + SetCurrentConfigFilename(currconfig.c_str()); + } } - g_ui_ip = my_ip; - MyPostMessage(WM_USER + 2, 0, 0); + + if (filename != NULL || !(g_startup_flags & kStartupFlag_BackgroundService) && !g_disable_connect_on_start && RegReadInt(g_reg_key, "IsConnected", 0)) { + StartTunsafeBackend(UIW_START); + } else { + if (!g_backend->is_started()) + RINFO("Press Connect to initiate a connection to the WireGuard server."); + } + } + + bool running = g_backend->is_started(); + SetDlgItemText(g_ui_window, ID_START, running ? "Re&connect" : "&Connect"); + InvalidatePaintbox(); + EnableWindow(GetDlgItem(g_ui_window, ID_STOP), running); + } + + virtual void OnStatusCode(TunsafeBackend::StatusCode status) override { + g_status_code = status; + if (TunsafeBackend::IsPermanentError(status)) { + UpdateIcon(g_is_connected_to_server ? 
UIW_STOPPED_WORKING_FAIL : UIW_NONE); + InvalidatePaintbox(); + return; + } + bool is_connected = (status == TunsafeBackend::kStatusConnected); + if (is_connected && g_minimize_on_connect) { + g_minimize_on_connect = false; + SetUiVisibility(false); + } + + bool not_first = g_not_first_status_msg; + g_not_first_status_msg = true; + + if (is_connected != g_is_connected_to_server) { + g_is_connected_to_server = is_connected; + // avoid showing a notice if service is already connected + if (is_connected > not_first && (g_startup_flags & kStartupFlag_BackgroundService)) + g_notified_connected_server = true; + UpdateIcon(UIW_NONE); + InvalidatePaintbox(); } } - virtual void OnDisconnected() { - MyProcessorDelegate::OnConnected(0); + + virtual void OnClearLog() override { + SetWindowText(hwndEdit, ""); } }; -static MyProcessorDelegate my_procdel; +static MyBackendDelegate my_procdel; -void StartService(bool skip_clear = false) { - char buf[1024]; - if (!GetConfigFullName(g_current_filename, buf, ARRAYSIZE(buf))) - return; - - if (!g_backend->is_started()) { - if (!skip_clear) - PostMessage(g_ui_window, WM_USER + 6, NULL, NULL); - - g_backend->Start(&my_procdel, buf); +static void CreateLocalOrRemoteBackend(bool remote) { + delete g_backend; - UpdateButtons(); - RegWriteInt("IsConnected", 1); + g_first_state_msg = false; + + if (!remote) { + g_backend = CreateNativeTunsafeBackend(g_backend_delegate); + } else { + RINFO("Connecting to the TunSafe Service..."); + g_backend = CreateTunsafeServiceClient(g_backend_delegate); } -} -static bool g_has_icon; + g_backend->RequestStats(g_ui_visible); +} static char *PrintMB(char *buf, int64 bytes) { char *bo = buf; @@ -215,55 +302,7 @@ static char *PrintMB(char *buf, int64 bytes) { return bo; } -static void UpdateStats() { - ProcessorStats stats = g_backend->GetStats(); - - char tmp[64], tmp2[64]; - char buf[512]; - snprintf(buf, 512, "%s received (%lld packets), %s sent (%lld packets)", - PrintMB(tmp, stats.udp_bytes_in), 
stats.udp_packets_in, - PrintMB(tmp2, stats.udp_bytes_out), stats.udp_packets_out/*, udp_qsize2 - udp_qsize1, g_tun_reads*/); - SetDlgItemText(g_ui_window, IDTXT_UDP, buf); - - snprintf(buf, 512, "%s received (%lld packets), %s sent (%lld packets)", - PrintMB(tmp, stats.tun_bytes_in), stats.tun_packets_in, - PrintMB(tmp2, stats.tun_bytes_out), stats.tun_packets_out/*, - tpq_last_qsize, g_tun_writes*/); - SetDlgItemText(g_ui_window, IDTXT_TUN, buf); - - char *d = buf; - if (stats.last_complete_handskake_timestamp) { - uint32 ago = (uint32)((OsGetMilliseconds() - stats.last_complete_handskake_timestamp) / 1000); - uint32 hours = ago / 3600; - uint32 minutes = (ago - hours * 3600) / 60; - uint32 seconds = (ago - hours * 3600 - minutes * 60); - - if (hours) - d += snprintf(d, 32, hours == 1 ? "%d hour, " : "%d hours, ", hours); - if (minutes) - d += snprintf(d, 32, minutes == 1 ? "%d minute, " : "%d minutes, ", minutes); - if (d == buf || seconds) - d += snprintf(d, 32, seconds == 1 ? "%d second, " : "%d seconds, ", seconds); - memcpy(d - 2, " ago", 5); - } else { - memcpy(buf, "(never)", 8); - } - SetDlgItemText(g_ui_window, IDTXT_HANDSHAKE, buf); -} - -void UpdatePublicKey(char *s) { - SetDlgItemText(g_ui_window, IDC_PUBLIC_KEY, s); - free(s); -} - -static void UpdateButtons() { - bool running = g_backend->is_started(); - SetDlgItemText(g_ui_window, ID_START, running ? "Re&connect" : "&Connect"); - EnableWindow(GetDlgItem(g_ui_window, ID_STOP), running); -} - static void UpdateIcon(UpdateIconWhy why) { - in_addr_t ip = g_ui_ip; NOTIFYICONDATA nid; memset(&nid, 0, sizeof(nid)); nid.cbSize = sizeof(nid); @@ -272,18 +311,22 @@ static void UpdateIcon(UpdateIconWhy why) { nid.uVersion = NOTIFYICON_VERSION; nid.uCallbackMessage = WM_USER + 1; nid.uFlags = NIF_MESSAGE | NIF_TIP | NIF_ICON; - nid.hIcon = g_icons[ip ? 0 : 1]; + nid.hIcon = g_icons[g_is_connected_to_server ? 
0 : 1]; char buf[kSizeOfAddress]; - char namebuf[64]; - if (ip != 0) { - snprintf(nid.szTip, sizeof(nid.szTip), "TunSafe [%s - %s]", GetCurrentConfigTitle(namebuf, sizeof(namebuf)), print_ip(buf, ip)); - nid.uFlags |= NIF_INFO; - snprintf(nid.szInfoTitle, sizeof(nid.szInfoTitle), "Connected to: %s", namebuf); - snprintf(nid.szInfo, sizeof(nid.szInfo), "IP: %s", buf); - nid.uTimeout = 5000; - nid.dwInfoFlags = NIIF_INFO; + char namebuf[128]; + if (g_is_connected_to_server) { + snprintf(nid.szTip, sizeof(nid.szTip), "TunSafe [%s - %s]", GetCurrentConfigTitle(namebuf, sizeof(namebuf)), print_ip(buf, g_backend->GetIP())); + if (!g_notified_connected_server) { + g_notified_connected_server = true; + nid.uFlags |= NIF_INFO; + snprintf(nid.szInfoTitle, sizeof(nid.szInfoTitle), "Connected to: %s", namebuf); + snprintf(nid.szInfo, sizeof(nid.szInfo), "IP: %s", buf); + nid.uTimeout = 5000; + nid.dwInfoFlags = NIIF_INFO; + } } else { + g_notified_connected_server = false; snprintf(nid.szTip, sizeof(nid.szTip), "TunSafe [%s]", "Disconnected"); if (why == UIW_STOPPED_WORKING_FAIL) { @@ -296,7 +339,7 @@ static void UpdateIcon(UpdateIconWhy why) { } Shell_NotifyIcon(g_has_icon ? NIM_MODIFY : NIM_ADD, &nid); - SendMessage(g_ui_window, WM_SETICON, ICON_SMALL, (LPARAM)g_icons[ip ? 0 : 1]); + SendMessage(g_ui_window, WM_SETICON, ICON_SMALL, (LPARAM)g_icons[g_is_connected_to_server ? 
0 : 1]); g_has_icon = true; } @@ -312,16 +355,10 @@ static void RemoveIcon() { } } -#define MAX_CONFIG_FILES 100 +#define MAX_CONFIG_FILES 1024 #define ID_POPUP_CONFIG_FILE 10000 char *config_filenames[MAX_CONFIG_FILES]; - -static void RestartService(UpdateIconWhy why, bool only_if_active) { - if (!only_if_active || g_backend->is_started()) { - StopService(why); - StartService(why != UIW_NONE); - } -} +uint8 config_filenames_indent[MAX_CONFIG_FILES]; static char *StripConfExtension(const char *src, char *target, size_t size) { size_t len = strlen(src); @@ -335,65 +372,121 @@ static char *StripConfExtension(const char *src, char *target, size_t size) { } static const char *GetCurrentConfigTitle(char *target, size_t size) { - const char *ll = FindLastFolderSep(g_current_filename); - return StripConfExtension(ll ? ll + 1 : g_current_filename, target, size); + const char *ll = FindFilenameComponent(g_current_filename); + return StripConfExtension(ll, target, size); } -static void LoadConfigFile(const char *filename, bool save, bool force_start) { +static void SetCurrentConfigFilename(const char *filename) { str_set(&g_current_filename, filename); char namebuf[64]; - char *f = str_cat_alloc("TunSafe VPN Client - ", GetCurrentConfigTitle(namebuf, sizeof(namebuf))); + char *f = str_cat_alloc("TunSafe - ", GetCurrentConfigTitle(namebuf, sizeof(namebuf))); SetWindowText(g_ui_window, f); free(f); - RestartService(UIW_NONE, !force_start); - if (save) - RegWriteStr("ConfigFile", filename); + + InvalidateRect(hwndPaintBox, NULL, FALSE); } -static void AddToAvailableFilesPopup(HMENU menu, int max_num_items, bool is_settings) { - char buf[1024]; - int nfiles = 0; - if (!GetConfigFullName("*.*", buf, ARRAYSIZE(buf))) + +static void LoadConfigFile(const char *filename, bool save, bool force_start) { + SetCurrentConfigFilename(filename); + + if (force_start || g_backend->is_started()) + StartTunsafeBackend(UIW_START); + + if (save) + RegWriteStr(g_reg_key, "ConfigFile", filename); 
+} + +class ConfigMenuBuilder { +public: + ConfigMenuBuilder(); + + void Recurse(); + + int depth_; + int nfiles_; + size_t bufpos_; + WIN32_FIND_DATA wfd_; + char buf_[1024]; +}; + +ConfigMenuBuilder::ConfigMenuBuilder() + : nfiles_(0), depth_(0) { + if (!GetConfigFullName("", buf_, sizeof(buf_))) + bufpos_ = sizeof(buf_); + else + bufpos_ = strlen(buf_); +} + +void ConfigMenuBuilder::Recurse() { + if (bufpos_ >= sizeof(buf_) - 4) return; - - int selected_item = -1; - WIN32_FIND_DATA wfd; - HANDLE handle = FindFirstFile(buf, &wfd); + memcpy(buf_ + bufpos_, "*.*", 4); + HANDLE handle = FindFirstFile(buf_, &wfd_); if (handle != INVALID_HANDLE_VALUE) { do { - if (wfd.cFileName[0] == '.') + if (wfd_.cFileName[0] == '.') continue; - if (strcmp(g_current_filename, wfd.cFileName) == 0) - selected_item = nfiles; - - str_set(&config_filenames[nfiles], wfd.cFileName); - - nfiles++; - if (nfiles == MAX_CONFIG_FILES) + size_t len = strlen(wfd_.cFileName); + if (bufpos_ + len >= sizeof(buf_) - 1) + continue; + size_t old_bufpos = bufpos_; + memcpy(buf_ + bufpos_, wfd_.cFileName, len + 1); + bufpos_ = bufpos_ + len + 1; + config_filenames_indent[nfiles_] = depth_ + !!(wfd_.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY); + str_set(&config_filenames[nfiles_], buf_); + nfiles_++; + if (nfiles_ == MAX_CONFIG_FILES) break; - } while (FindNextFile(handle, &wfd)); + if (wfd_.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) { + buf_[bufpos_ - 1] = '\\'; + depth_++; + if (depth_ < 16) + Recurse(); + depth_--; + if (nfiles_ == MAX_CONFIG_FILES) + break; + } + bufpos_ = old_bufpos; + } while (FindNextFile(handle, &wfd_)); FindClose(handle); } +} - HMENU where; + +static int AddToAvailableFilesPopup(HMENU menu, int max_num_items, bool is_settings) { + ConfigMenuBuilder menu_builder; + HMENU where[16] = {0}; + + menu_builder.Recurse(); bool is_connected = g_backend->is_started(); + uint32 last_indent = 0; + where[0] = menu; - where = menu; - for (int i = 0; i < nfiles; i++) { - if (i == 
max_num_items) { - where = CreatePopupMenu(); - AppendMenu(menu, MF_POPUP, (UINT_PTR)where, "&More"); + for (int i = 0; i < menu_builder.nfiles_; i++) { + uint32 indent = config_filenames_indent[i]; + if (indent > last_indent) { + HMENU n = CreatePopupMenu(); + where[indent] = n; + AppendMenu(where[last_indent], MF_POPUP, (UINT_PTR)n, FindFilenameComponent(config_filenames[i])); + } else { + bool selected_item = (strcmp(g_current_filename, config_filenames[i]) == 0); + AppendMenu(where[indent], (selected_item && is_connected) ? + MF_CHECKED : 0, ID_POPUP_CONFIG_FILE + i, + StripConfExtension( + FindFilenameComponent(config_filenames[i]), menu_builder.buf_, sizeof(menu_builder.buf_))); + if (selected_item) + SetMenuDefaultItem(where[indent], ID_POPUP_CONFIG_FILE + i, MF_BYCOMMAND); } - - AppendMenu(where, (i == selected_item && is_connected) ? MF_CHECKED : 0, ID_POPUP_CONFIG_FILE + i, StripConfExtension(config_filenames[i], buf, sizeof(buf))); - - if (i == selected_item) - SetMenuDefaultItem(where, ID_POPUP_CONFIG_FILE + i, MF_BYCOMMAND); + last_indent = indent; } - if (nfiles) - AppendMenu(menu, MF_SEPARATOR, 0, 0); + + if (menu_builder.nfiles_ == 0) + AppendMenu(menu, MF_GRAYED | MF_DISABLED, 0, "(no config files found)"); + + return menu_builder.nfiles_; } static void ShowSettingsMenu(HWND wnd) { @@ -401,102 +494,64 @@ static void ShowSettingsMenu(HWND wnd) { AddToAvailableFilesPopup(menu, 10, true); - AppendMenu(menu, 0, IDSETT_OPEN_FILE, "&Import File..."); - AppendMenu(menu, 0, IDSETT_BROWSE_FILES, "&Browse in Explorer"); + //POINT pt; + //GetCursorPos(&pt); + + RECT r = GetParentRect(GetDlgItem(g_ui_window, ID_START)); + + RECT r2 = GetParentRect(hwndPaintBox); + + POINT pt = {r2.left, r.bottom}; + + ClientToScreen(g_ui_window, &pt); - AppendMenu(menu, MF_SEPARATOR, 0, 0); - AppendMenu(menu, 0, IDSETT_KEYPAIR, "Generate &Key Pair..."); - AppendMenu(menu, MF_SEPARATOR, 0, 0); - HMENU blockinternet = CreatePopupMenu(); - AppendMenu(blockinternet, 0, 
IDSETT_BLOCKINTERNET_OFF, "Off"); - AppendMenu(blockinternet, MF_SEPARATOR, 0, 0); - AppendMenu(blockinternet, 0, IDSETT_BLOCKINTERNET_ROUTE, "Yes, with Routing Rules"); - AppendMenu(blockinternet, 0, IDSETT_BLOCKINTERNET_FIREWALL, "Yes, with Firewall Rules"); - AppendMenu(blockinternet, 0, IDSETT_BLOCKINTERNET_BOTH, "Yes, Both Methods"); - bool is_activated = false; - int value = GetInternetBlockState(&is_activated); - CheckMenuRadioItem(blockinternet, IDSETT_BLOCKINTERNET_OFF, IDSETT_BLOCKINTERNET_BOTH, IDSETT_BLOCKINTERNET_OFF + value, MF_BYCOMMAND); - AppendMenu(menu, MF_POPUP + is_activated * MF_CHECKED, (UINT_PTR)blockinternet, "Block &All Internet Traffic"); - - if (g_allow_pre_post || GetAsyncKeyState(VK_SHIFT) < 0) { - AppendMenu(menu, g_allow_pre_post ? MF_CHECKED : 0, IDSETT_PREPOST, "&Allow Pre/Post commands"); - } - AppendMenu(menu, MF_SEPARATOR, 0, 0); - AppendMenu(menu, 0, IDSETT_WEB_PAGE, "Go to &Web Page"); - AppendMenu(menu, 0, IDSETT_OPENSOURCE, "See Open Source Licenses"); - AppendMenu(menu, 0, IDSETT_ABOUT, "&About TunSafe..."); - - POINT pt; - GetCursorPos(&pt); - g_last_popup_is_tray = false; int rv = TrackPopupMenu(menu, 0, pt.x, pt.y, 0, wnd, NULL); DestroyMenu(menu); } -void FindDesktopFolderView(REFIID riid, void **ppv) { - CComPtr spShellWindows; - spShellWindows.CoCreateInstance(CLSID_ShellWindows); - - CComVariant vtLoc(CSIDL_DESKTOP); - CComVariant vtEmpty; - long lhwnd; - CComPtr spdisp; - spShellWindows->FindWindowSW( - &vtLoc, &vtEmpty, - SWC_DESKTOP, &lhwnd, SWFO_NEEDDISPATCH, &spdisp); - - CComPtr spBrowser; - CComQIPtr(spdisp)-> - QueryService(SID_STopLevelBrowser, - IID_PPV_ARGS(&spBrowser)); - - CComPtr spView; - spBrowser->QueryActiveShellView(&spView); - - spView->QueryInterface(riid, ppv); -} - -void GetDesktopAutomationObject(REFIID riid, void **ppv) { - CComPtr spsv; - FindDesktopFolderView(IID_PPV_ARGS(&spsv)); - CComPtr spdispView; - spsv->GetItemObject(SVGIO_BACKGROUND, IID_PPV_ARGS(&spdispView)); - 
spdispView->QueryInterface(riid, ppv); -} - -void ShellExecuteFromExplorer( - PCSTR pszFile, - PCSTR pszParameters = nullptr, - PCSTR pszDirectory = nullptr, - PCSTR pszOperation = nullptr, - int nShowCmd = SW_SHOWNORMAL) { - CComPtr spFolderView; - GetDesktopAutomationObject(IID_PPV_ARGS(&spFolderView)); - CComPtr spdispShell; - spFolderView->get_Application(&spdispShell); - - CComQIPtr(spdispShell) - ->ShellExecute(CComBSTR(pszFile), - CComVariant(pszParameters ? pszParameters : ""), - CComVariant(pszDirectory ? pszDirectory : ""), - CComVariant(pszOperation ? pszOperation : ""), - CComVariant(nShowCmd)); +static bool HasReadWriteAccess(const char *filename) { + HANDLE fileH = CreateFile(filename, + GENERIC_READ | GENERIC_WRITE, + FILE_SHARE_READ | FILE_SHARE_WRITE, // For Exclusive access + 0, + OPEN_EXISTING, + FILE_ATTRIBUTE_NORMAL, + NULL); + if (fileH != INVALID_HANDLE_VALUE) { + CloseHandle(fileH); + return true; + } + return false; } static void OpenEditor() { - char buf[MAX_PATH]; - if (GetConfigFullName(g_current_filename, buf, ARRAYSIZE(buf))) { - SHELLEXECUTEINFO shinfo = {0}; - shinfo.cbSize = sizeof(shinfo); - shinfo.fMask = SEE_MASK_CLASSNAME; - shinfo.lpFile = buf; - shinfo.lpParameters = ""; - shinfo.lpClass = ".txt"; - shinfo.nShow = SW_SHOWNORMAL; - ShellExecuteEx(&shinfo); + SHELLEXECUTEINFO shinfo = {0}; + shinfo.hwnd = g_ui_window; + shinfo.cbSize = sizeof(shinfo); + shinfo.nShow = SW_SHOWNORMAL; + + if (g_current_filename[0]) { + if (!HasReadWriteAccess(g_current_filename)) { + // Need to runas admin + char buf[1024]; + if (!ExpandEnvironmentStrings("%windir%\\system32\\notepad.exe", buf, sizeof(buf))) + return; + shinfo.lpFile = buf; + char *filename = str_cat_alloc("\"", g_current_filename, "\""); + shinfo.lpParameters = filename; + shinfo.lpVerb = "runas"; + ShellExecuteEx(&shinfo); + free(filename); + } else { + shinfo.fMask = SEE_MASK_CLASSNAME; + shinfo.lpFile = g_current_filename; + shinfo.lpParameters = ""; + shinfo.lpClass = ".txt"; 
+ ShellExecuteEx(&shinfo); + } } } @@ -509,127 +564,62 @@ static void BrowseFiles() { } } -bool FileExists(const CHAR *fileName) { - DWORD fileAttr = GetFileAttributes(fileName); - return (0xFFFFFFFF != fileAttr); -} - -__int64 FileSize(const char* name) { - WIN32_FILE_ATTRIBUTE_DATA fad; - if (!GetFileAttributesEx(name, GetFileExInfoStandard, &fad)) - return -1; // error condition, could call GetLastError to find out more - LARGE_INTEGER size; - size.HighPart = fad.nFileSizeHigh; - size.LowPart = fad.nFileSizeLow; - return size.QuadPart; -} - -static bool is_space(uint8_t c) { - return c == ' ' || c == '\r' || c == '\n' || c == '\t'; -} - -static bool is_valid(uint8_t c) { - return c >= ' ' || c == '\r' || c == '\n' || c == '\t'; -} - -bool SanityCheckBuf(uint8 *buf, size_t n) { - for (size_t i = 0; i < n; i++) { - if (!is_space(buf[i])) { - if (buf[i] != '[' && buf[i] != '#') - return false; - for (; i < n; i++) - if (!is_valid(buf[i])) - return false; - return true; - } - } - return false; -} - -uint8* LoadFileSane(const char *name, size_t *size) { - FILE *f = fopen(name, "rb"); - uint8 *new_file = NULL, *file = NULL; - size_t j, i, n; - if (!f) return false; - fseek(f, 0, SEEK_END); - long x = ftell(f); - fseek(f, 0, SEEK_SET); - if (x < 0 || x >= 65536) goto error; - file = (uint8*)malloc(x + 1); - if (!file) goto error; - n = fread(file, 1, x + 1, f); - if (n != x || !SanityCheckBuf(file, n)) - goto error; - // Convert the file to DOS new lines - for (i = j = 0; i < n; i++) - j += (file[i] == '\n'); - new_file = (uint8*)malloc(n + 1 + j); - if (!new_file) goto error; - for (i = j = 0; i < n; i++) { - uint8 c = file[i]; - if (c == '\r') - continue; - if (c == '\n') - new_file[j++] = '\r'; - new_file[j++] = c; - } - new_file[j] = 0; - *size = j; - -error: - fclose(f); - free(file); - return new_file; -} - -bool WriteOutFile(const char *filename, uint8 *filedata, size_t filesize) { - FILE *f = fopen(filename, "wb"); - if (!f) return false; - if (fwrite(filedata, 
1, filesize, f) != filesize) { - fclose(f); - return false; - } - fclose(f); - return true; -} - -void ImportFile(const char *s) { +bool ImportFile(const char *s, bool silent = false) { char buf[1024]; char mesg[1024]; size_t filesize; - const char *last = FindLastFolderSep(s); - if (!last || !GetConfigFullName(last + 1, buf, ARRAYSIZE(buf)) || _stricmp(buf, s) == 0) - return; + const char *last = FindFilenameComponent(s); + uint8 *filedata = NULL; + bool rv = false; + int filerv; - uint8 *filedata = LoadFileSane(s, &filesize); - if (!filedata) goto fail; + if (!*last || !GetConfigFullName(last, buf, ARRAYSIZE(buf)) || _stricmp(buf, s) == 0) + goto out; - if (FileExists(buf)) { - snprintf(mesg, ARRAYSIZE(mesg), "A file already exists with the name '%s' in the configuration folder. Do you want to overwrite it?", last + 1); - if (MessageBoxA(g_ui_window, mesg, "TunSafe", MB_OKCANCEL | MB_ICONEXCLAMATION) != IDOK) - goto out; - } else { - snprintf(mesg, ARRAYSIZE(mesg), "Do you want to import '%s' into TunSafe?", last + 1); - if (MessageBoxA(g_ui_window, mesg, "TunSafe", MB_OKCANCEL | MB_ICONQUESTION) != IDOK) - goto out; + filedata = LoadFileSane(s, &filesize); + if (!filedata) + goto out; + + if (!silent) { + if (FileExists(buf)) { + snprintf(mesg, ARRAYSIZE(mesg), "A file already exists with the name '%s' in the configuration folder. Do you want to overwrite it?", last); + if (MessageBoxA(g_ui_window, mesg, "TunSafe", MB_OKCANCEL | MB_ICONEXCLAMATION) != IDOK) + goto out; + } else { + snprintf(mesg, ARRAYSIZE(mesg), "Do you want to import '%s' into TunSafe?", last); + if (MessageBoxA(g_ui_window, mesg, "TunSafe", MB_OKCANCEL | MB_ICONQUESTION) != IDOK) + goto out; + } } - if (!WriteOutFile(buf, filedata, filesize)) { + filerv = WriteOutFile(buf, filedata, filesize); + + // elevate? 
+ if (filerv == kWriteOutFile_AccessError && g_is_limited_uac_account) { + char *args = str_cat_alloc("--import \"", s, "\""); + rv = RunProcessAsAdminWithArgs(args, true); + free(args); + return rv; + } + + rv = (filerv == kWriteOutFile_Ok); + if (!rv) DeleteFileA(buf); -fail: - MessageBoxA(g_ui_window, "There was a problem importing the file.", "TunSafe", MB_ICONEXCLAMATION); - } else { - LoadConfigFile(last + 1, true, false); - } out: free(filedata); + + if (!silent) { + if (rv) + LoadConfigFile(buf, true, false); + else + MessageBoxA(g_ui_window, "There was a problem importing the file.", "TunSafe", MB_ICONEXCLAMATION); + } + return !rv; } void ShowUI(HWND hWnd) { - g_ui_visible = true; - UpdateStats(); - ShowWindow(hWnd, SW_SHOW); + SetUiVisibility(true); BringWindowToTop(hWnd); SetForegroundWindow(hWnd); } @@ -717,77 +707,280 @@ static INT_PTR WINAPI KeyPairDlgProc(HWND hWnd, UINT message, WPARAM wParam, return FALSE; } -bool wm_dropfiles_recursive; -uint64 last_auto_service_restart; +static void SetStartupFlags(int new_flags) { + // Determine whether to autorun or not. + bool autorun = (new_flags & kStartupFlag_MinimizeToTrayWhenWindowsStarts) || + !(new_flags & kStartupFlag_BackgroundService) && (new_flags & kStartupFlag_ConnectWhenWindowsStarts); + + // Update the autorun key. 
+ HKEY hkey; + LSTATUS result; + result = RegOpenKeyEx(HKEY_CURRENT_USER, "Software\\Microsoft\\Windows\\CurrentVersion\\Run", 0, KEY_WRITE, &hkey); + if (result == 0) { + if (autorun) { + wchar_t buf[512 + 32]; + buf[0] = '"'; + DWORD len = GetModuleFileNameW(NULL, buf + 1, 512); + if (len < 512) { + memcpy(buf + len + 1, L"\" --autostart", sizeof(wchar_t) * 14); + result = RegSetValueExW(hkey, L"TunSafe", NULL, REG_SZ, (BYTE*)buf, (DWORD)(len + 15) * sizeof(wchar_t)); + } + } else { + RegDeleteValueW(hkey, L"TunSafe"); + } + RegCloseKey(hkey); + } + RegWriteInt(g_reg_key, "StartupFlags", new_flags); + + bool was_started = g_backend && g_backend->is_started(); + bool recreate_backend = false; + + if (!!(new_flags & (kStartupFlag_BackgroundService | kStartupFlag_ForegroundService))) { + // Want to run as a service - make sure service is installed and running. + if (!IsTunsafeServiceRunning()) { + g_backend->Stop(); + RINFO("Starting TunSafe service..."); + InstallTunSafeWindowsService(); + recreate_backend = true; + } +} else { + if (IsTunSafeServiceInstalled()) { + g_backend->Stop(); + g_backend->Teardown(); + + RINFO("Removing TunSafe service..."); + // Don't want to run as a service - Make sure we delete the service. + if (g_is_limited_uac_account) { + // Need to stop this early so service process is able to open. 
+ CloseHandle(g_runonce_mutex); + if (!RunProcessAsAdminWithArgs("--delete-service-and-start", false)) { + RINFO("Unable to stop and remove service"); + uint32 m = kStartupFlag_BackgroundService | kStartupFlag_ForegroundService; + new_flags = (g_startup_flags & m) | (new_flags & ~m); + } else { + PostQuitMessage(0); + return; + } + } else { + if (!UninstallTunSafeWindowsService()) { + RINFO("Unable to stop and remove service"); + uint32 m = kStartupFlag_BackgroundService | kStartupFlag_ForegroundService; + new_flags = (g_startup_flags & m) | (new_flags & ~m); + } + } + recreate_backend = true; + } + } + if (recreate_backend) { + CreateLocalOrRemoteBackend(!!(new_flags & (kStartupFlag_BackgroundService | kStartupFlag_ForegroundService))); + if (was_started) + StartTunsafeBackend(UIW_START); + } + g_startup_flags = new_flags; + g_backend->SetServiceStartupFlags(g_startup_flags); +} + +enum { + kTab_Logs = 0, + kTab_Charts = 1, + kTab_Advanced = 2, +}; + +static void UpdateGraphReq() { + if (g_backend && (g_current_tab != 1 || !g_ui_visible)) + g_backend->GetGraph(0); +} + +static void UpdateTabSelection() { + int tab = TabCtrl_GetCurSel(hwndTab); + HWND wnd = g_ui_window; + g_current_tab = tab; + ShowWindow(hwndEdit, (tab == kTab_Logs) ? SW_SHOW : SW_HIDE); + ShowWindow(hwndGraphBox, (tab == kTab_Charts) ? SW_SHOW : SW_HIDE); + ShowWindow(hwndAdvancedBox, (tab == kTab_Advanced) ? 
SW_SHOW : SW_HIDE); + UpdateGraphReq(); +} + +struct WindowSizingItem { + uint16 id; + uint16 edges; +}; + +enum { + WSI_LEFT = 1, + WSI_RIGHT = 2, + WSI_TOP = 4, + WSI_BOTTOM = 8, +}; + +static const WindowSizingItem kWindowSizing[] = { + {ID_START,WSI_LEFT | WSI_RIGHT}, + {ID_STOP,WSI_LEFT | WSI_RIGHT}, + {ID_EDITCONF,WSI_LEFT | WSI_RIGHT}, + {IDC_PAINTBOX,WSI_RIGHT}, + {IDC_TAB, WSI_RIGHT | WSI_BOTTOM}, +}; + +static void HandleWindowSizing() { + RECT wr; + + GetClientRect(g_ui_window, &wr); + + static int g_orig_w, g_orig_h; + static RECT g_orig_rects[ARRAYSIZE(kWindowSizing)]; + + if (g_orig_w == 0) { + g_orig_w = wr.right; + g_orig_h = wr.bottom; + for (size_t i = 0; i < ARRAYSIZE(kWindowSizing); i++) { + const WindowSizingItem *it = &kWindowSizing[i]; + g_orig_rects[i] = GetParentRect(GetDlgItem(g_ui_window, it->id)); + } + } + + int dx = wr.right - g_orig_w; + int dy = wr.bottom - g_orig_h; + + if (dx|dy) { + HDWP dwp = BeginDeferWindowPos(10), dwp_next; + for (size_t i = 0; i < ARRAYSIZE(kWindowSizing); i++) { + const WindowSizingItem *it = &kWindowSizing[i]; + HWND wnd = GetDlgItem(g_ui_window, it->id); + RECT r = g_orig_rects[i]; + if (it->edges & WSI_LEFT) r.left += dx; + if (it->edges & WSI_RIGHT) r.right += dx; + if (it->edges & WSI_TOP) r.top += dy; + if (it->edges & WSI_BOTTOM) r.bottom += dy; + if (r.right < r.left) r.right = r.left; + if (r.bottom < r.top) r.bottom = r.top; + dwp_next = DeferWindowPos(dwp, wnd, NULL, r.left, r.top, r.right - r.left, r.bottom - r.top, SWP_NOZORDER | SWP_NOREPOSITION | SWP_NOACTIVATE); + dwp = dwp_next ? 
dwp_next : dwp; + } + EndDeferWindowPos(dwp); + } + + RECT rect = GetParentRect(hwndTab); + TabCtrl_AdjustRect(hwndTab, false, &rect); + MoveWindow(hwndEdit, rect.left, rect.top, rect.right - rect.left, rect.bottom - rect.top, TRUE); + MoveWindow(hwndGraphBox, rect.left, rect.top, rect.right - rect.left, rect.bottom - rect.top, TRUE); + MoveWindow(hwndAdvancedBox, rect.left, rect.top, rect.right - rect.left, rect.bottom - rect.top, TRUE); + + int parts[3] = { + (int)(wr.right * 0.2f), + (int)(wr.right * 0.6f), + (int)-1, + }; + + SendMessage(hwndStatus, SB_SETPARTS, 3, (LPARAM)parts); + SendMessage(hwndStatus, WM_SIZE, 0, 0); + InvalidateRect(hwndStatus, NULL, TRUE); +} + +static void HandleClickedItem(HWND hWnd, int wParam) { + if (wParam >= ID_POPUP_CONFIG_FILE && wParam < ID_POPUP_CONFIG_FILE + MAX_CONFIG_FILES) { + const char *new_conf = config_filenames[wParam - ID_POPUP_CONFIG_FILE]; + if (!new_conf) + return; + + if (strcmp(new_conf, g_current_filename) == 0 && g_backend->is_started()) { + StopTunsafeBackend(UIW_NONE); + } else { + LoadConfigFile(new_conf, true, GetAsyncKeyState(VK_SHIFT) >= 0); + } + + return; + } + switch (wParam) { + case ID_START: StartTunsafeBackend(UIW_START); break; + case ID_STOP: StopTunsafeBackend(UIW_NONE); break; + case ID_EXIT: PostQuitMessage(0); break; + case ID_MORE_BUTTON: ShowSettingsMenu(hWnd); break; + case IDSETT_WEB_PAGE: ShellExecute(g_ui_window, NULL, "https://tunsafe.com/", NULL, NULL, 0); break; + case IDSETT_OPENSOURCE: ShellExecute(g_ui_window, NULL, "https://tunsafe.com/open-source", NULL, NULL, 0); break; + case ID_EDITCONF: OpenEditor(); break; + case IDSETT_BROWSE_FILES:BrowseFiles(); break; + case IDSETT_OPEN_FILE: BrowseFile(hWnd); break; + case IDSETT_ABOUT: + MessageBoxA(g_ui_window, TUNSAFE_VERSION_STRING "\r\n\r\nCopyright © 2018, Ludvig Strigeus\r\n\r\nThanks for choosing TunSafe!\r\n\r\nThis version was built on " __DATE__ " " __TIME__, "About TunSafe", MB_ICONINFORMATION); + break; + case 
IDSETT_KEYPAIR: + DialogBox(g_hinstance, MAKEINTRESOURCE(IDD_DIALOG2), hWnd, &KeyPairDlgProc); + break; + case IDSETT_BLOCKINTERNET_OFF: + case IDSETT_BLOCKINTERNET_ROUTE: + case IDSETT_BLOCKINTERNET_FIREWALL: + case IDSETT_BLOCKINTERNET_BOTH: + { + InternetBlockState old_state = g_backend->GetInternetBlockState(NULL); + InternetBlockState new_state = (InternetBlockState)(wParam - IDSETT_BLOCKINTERNET_OFF); + + if (old_state == kBlockInternet_Off && new_state != kBlockInternet_Off) { + if (MessageBoxA(g_ui_window, "Warning! All Internet traffic will be blocked until you restart your computer. Only traffic through TunSafe will be allowed.\r\n\r\nThe blocking is activated the next time you connect to a VPN server.\r\n\r\nDo you want to continue?", "TunSafe", MB_ICONWARNING | MB_OKCANCEL) == IDCANCEL) + return; + } + + g_backend->SetInternetBlockState(new_state); + + if ((~old_state & new_state) && g_backend->is_started()) + StartTunsafeBackend(UIW_START); + return; + } + case IDSETT_SERVICE_OFF: + case IDSETT_SERVICE_FOREGROUND: + case IDSETT_SERVICE_BACKGROUND: + SetStartupFlags((int)((g_startup_flags & ~3) + wParam - IDSETT_SERVICE_OFF)); + break; + case IDSETT_SERVICE_CONNECT_AUTO: + SetStartupFlags(g_startup_flags ^ kStartupFlag_ConnectWhenWindowsStarts); + break; + case IDSETT_SERVICE_MINIMIZE_AUTO: + SetStartupFlags(g_startup_flags ^ kStartupFlag_MinimizeToTrayWhenWindowsStarts); + break; + + case IDSETT_PREPOST: + { + if (!g_hklm_reg_key) { + if (!RunProcessAsAdminWithArgs(g_allow_pre_post ? 
"--set-allow-pre-post 0" : "--set-allow-pre-post 1", true)) + MessageBox(g_ui_window, "You need to run TunSafe as an Administrator to be able to change this setting.", "TunSafe", MB_ICONWARNING); + g_allow_pre_post = RegReadInt(g_hklm_readonly_reg_key, "AllowPrePost", 0) != 0; + return; + } + g_allow_pre_post = !g_allow_pre_post; + RegWriteInt(g_hklm_reg_key, "AllowPrePost", g_allow_pre_post); + return; + } + } +} + static INT_PTR WINAPI DlgProc(HWND hWnd, UINT message, WPARAM wParam, LPARAM lParam) { - switch(message) { + + switch (message) { case WM_INITDIALOG: + SetMenu(hWnd, LoadMenu(g_hinstance, MAKEINTRESOURCE(IDR_MENU1))); return TRUE; case WM_CLOSE: - g_ui_visible = false; - ShowWindow(hWnd, SW_HIDE); + SetUiVisibility(false); return TRUE; - case WM_COMMAND: - if (wParam >= ID_POPUP_CONFIG_FILE && wParam < ID_POPUP_CONFIG_FILE + MAX_CONFIG_FILES) { - const char *new_conf = config_filenames[wParam - ID_POPUP_CONFIG_FILE]; - if (!new_conf) + case WM_NOTIFY: { + UINT idFrom = (UINT)((NMHDR*)lParam)->idFrom; + switch (((NMHDR*)lParam)->code) { + case TCN_SELCHANGE: + switch (idFrom) { + case IDC_TAB: + UpdateTabSelection(); return TRUE; - - if (g_last_popup_is_tray && strcmp(new_conf, g_current_filename) == 0 && g_backend->is_started()) { - StopService(UIW_NONE); - } else { - LoadConfigFile(new_conf, true, g_last_popup_is_tray); } - - return TRUE; - } - switch(wParam) { - case ID_START: - StopService(UIW_NONE); - StartService(); break; - case ID_STOP: StopService(UIW_NONE); break; - case ID_EXIT: PostQuitMessage(0); break; - case ID_RESET: g_backend->ResetStats(); break; - case ID_MORE_BUTTON: ShowSettingsMenu(hWnd); break; - case IDSETT_WEB_PAGE: ShellExecute(NULL, NULL, "https://tunsafe.com/", NULL, NULL, 0); break; - case IDSETT_OPENSOURCE: ShellExecute(NULL, NULL, "https://tunsafe.com/open-source", NULL, NULL, 0); break; - case ID_EDITCONF: OpenEditor(); break; - case IDSETT_BROWSE_FILES:BrowseFiles(); break; - case IDSETT_OPEN_FILE: BrowseFile(hWnd); 
break; - case IDSETT_ABOUT: - MessageBoxA(g_ui_window, TUNSAFE_VERSION_STRING "\r\n\r\nCopyright © 2018, Ludvig Strigeus\r\n\r\nThanks for choosing TunSafe!\r\n\r\nThis version was built on " __DATE__ " " __TIME__, "About TunSafe", MB_ICONINFORMATION); - break; - case IDSETT_KEYPAIR: - DialogBox(g_hinstance, MAKEINTRESOURCE(IDD_DIALOG2), hWnd, &KeyPairDlgProc); - break; - case IDSETT_BLOCKINTERNET_OFF: - case IDSETT_BLOCKINTERNET_ROUTE: - case IDSETT_BLOCKINTERNET_FIREWALL: - case IDSETT_BLOCKINTERNET_BOTH: { - InternetBlockState old_state = GetInternetBlockState(NULL); - InternetBlockState new_state = (InternetBlockState)(wParam - IDSETT_BLOCKINTERNET_OFF); - - if (old_state == kBlockInternet_Off && new_state != kBlockInternet_Off) { - if (MessageBoxA(g_ui_window, "Warning! All Internet traffic will be blocked until you restart your computer. Only traffic through TunSafe will be allowed.\r\n\r\nThe blocking is activated the next time you connect to a VPN server.\r\n\r\nDo you want to continue?", "TunSafe", MB_ICONWARNING | MB_OKCANCEL) == IDCANCEL) - return TRUE; - } - - SetInternetBlockState(new_state); - - if ((~old_state & new_state) && g_backend->is_started()) { - StopService(UIW_NONE); - StartService(); - } - return TRUE; - } - case IDSETT_PREPOST: { - g_allow_pre_post = !g_allow_pre_post; - RegWriteInt("AllowPrePost", g_allow_pre_post); - return TRUE; } + break; + } + case WM_COMMAND: + switch (HIWORD(wParam)) { + case 0: + HandleClickedItem(hWnd, (int)wParam); + break; } break; case WM_DROPFILES: @@ -800,7 +993,8 @@ static INT_PTR WINAPI DlgProc(HWND hWnd, UINT message, WPARAM wParam, case WM_USER + 1: if (lParam == WM_RBUTTONUP) { HMENU menu = CreatePopupMenu(); - AddToAvailableFilesPopup(menu, 10, false); + if (AddToAvailableFilesPopup(menu, 10, false)) + AppendMenu(menu, MF_SEPARATOR, 0, 0); bool active = g_backend->is_started(); AppendMenu(menu, 0, ID_START, active ? 
"Re&connect" : "&Connect"); @@ -812,150 +1006,56 @@ static INT_PTR WINAPI DlgProc(HWND hWnd, UINT message, WPARAM wParam, SetForegroundWindow(hWnd); - g_last_popup_is_tray = true; - - int rv = TrackPopupMenu(menu, 0, pt.x, pt.y, 0, hWnd, NULL); + int rv = TrackPopupMenu(menu, 0, pt.x, pt.y, 0, hWnd, NULL); DestroyMenu(menu); } else if (lParam == WM_LBUTTONDBLCLK) { if (IsWindowVisible(hWnd)) { - g_ui_visible = false; - ShowWindow(hWnd, SW_HIDE); + SetUiVisibility(false); } else { ShowUI(hWnd); } } return TRUE; case WM_USER + 2: - if (g_ui_ip != 0 && g_minimize_on_connect) { - g_minimize_on_connect = false; - g_ui_visible = false; - ShowWindow(hWnd, SW_HIDE); - } - UpdateIcon(UIW_NONE); - return TRUE; - case WM_USER + 3: { - CHARRANGE cr; - cr.cpMin = -1; - cr.cpMax = -1; - // hwnd = rich edit hwnd - SendDlgItemMessage(hWnd, IDC_RICHEDIT21, EM_EXSETSEL, 0, (LPARAM)&cr); - SendDlgItemMessage(hWnd, IDC_RICHEDIT21, EM_REPLACESEL, 0, (LPARAM)lParam); - free( (void*) lParam); + g_backend_delegate->DoWork(); return true; + + case WM_INITMENU: { + HMENU menu = GetMenu(g_ui_window); + + CheckMenuItem(menu, IDSETT_SERVICE_CONNECT_AUTO, MF_CHECKED * !!(g_startup_flags & kStartupFlag_ConnectWhenWindowsStarts)); + CheckMenuItem(menu, IDSETT_SERVICE_MINIMIZE_AUTO, MF_CHECKED * !!(g_startup_flags & kStartupFlag_MinimizeToTrayWhenWindowsStarts)); + CheckMenuItem(menu, IDSETT_PREPOST, g_allow_pre_post ? 
MF_CHECKED : 0); + + bool is_activated = false; + int value = g_backend->GetInternetBlockState(&is_activated); + CheckMenuRadioItem(menu, IDSETT_BLOCKINTERNET_OFF, IDSETT_BLOCKINTERNET_BOTH, IDSETT_BLOCKINTERNET_OFF + value, MF_BYCOMMAND); + CheckMenuRadioItem(menu, IDSETT_SERVICE_OFF, IDSETT_SERVICE_BACKGROUND, IDSETT_SERVICE_OFF + (g_startup_flags & 3), MF_BYCOMMAND); + + break; } - case WM_USER + 6: - SetDlgItemText(hWnd, IDC_RICHEDIT21, ""); - return true; - case WM_USER + 5: - UpdatePublicKey((char*)lParam); - return true; - case WM_USER + 4: { - UpdateStats(); - return true; - } - case WM_USER + 10: + + case WM_SIZE: + if (wParam == SIZE_MAXIMIZED || wParam == SIZE_RESTORED) { + if (g_ui_window) + HandleWindowSizing(); + } break; - case WM_USER + 11: { - uint64 now = GetTickCount64(); - if (now < last_auto_service_restart + 5000) { - RERROR("Too many automatic restarts..."); - StopService(UIW_STOPPED_WORKING_FAIL); - } else { - last_auto_service_restart = now; - RestartService(UIW_STOPPED_WORKING_RETRY, true); + case WM_EXITMENULOOP: + g_timestamp_of_exit_menuloop = GetTickCount(); + break; + + default: + if (message == g_message_taskbar_created) { + g_has_icon = false; + UpdateIcon(UIW_NONE); } break; } - } return FALSE; } -struct PostMsg { - int msg; - WPARAM wparam; - LPARAM lparam; - PostMsg(int a, WPARAM b, LPARAM c) : msg(a), wparam(b), lparam(c) {} -}; - -static HANDLE msg_event; -static CRITICAL_SECTION msg_section; -static std::vector msgvect; - -static DWORD WINAPI MessageThread(void *x) { - std::vector proc; - for(;;) { - WaitForSingleObject(msg_event, INFINITE); - proc.clear(); - EnterCriticalSection(&msg_section); - std::swap(proc, msgvect); - LeaveCriticalSection(&msg_section); - for(size_t i = 0; i != proc.size(); i++) - PostMessage(g_ui_window, proc[i].msg, proc[i].wparam, proc[i].lparam); - } -} - -static void MyPostMessage(int msg, WPARAM wparam, LPARAM lparam) { - size_t count; - EnterCriticalSection(&msg_section); - count = msgvect.size(); 
- msgvect.emplace_back(msg, wparam, lparam); - LeaveCriticalSection(&msg_section); - if (count == 0) SetEvent(msg_event); -} - -static void InitMyPostMessage() { - msg_event = CreateEvent(NULL, FALSE, FALSE, NULL); - InitializeCriticalSection(&msg_section); - DWORD thread_id; - CloseHandle(CreateThread(NULL, 0, &MessageThread, NULL, 0, &thread_id)); -} - - -void OsGetRandomBytes(uint8 *data, size_t data_size) { -#if defined(OS_WIN) - static BOOLEAN(APIENTRY *pfn)(void*, ULONG); - static bool resolved; - if (!resolved) { - pfn = (BOOLEAN(APIENTRY *)(void*, ULONG))GetProcAddress(LoadLibrary("ADVAPI32.DLL"), "SystemFunction036"); - resolved = true; - } - if (pfn && pfn(data, (ULONG)data_size)) - return; - int r = 0; -#else - int fd = open("/dev/urandom", O_RDONLY); - int r = read(fd, data, data_size); - if (r < 0) r = 0; - close(fd); -#endif - for (; r < data_size; r++) - data[r] = rand() >> 6; -} - -void OsInterruptibleSleep(int millis) { - SleepEx(millis, TRUE); -} - - -uint64 OsGetMilliseconds() { - return GetTickCount64(); -} - -void OsGetTimestampTAI64N(uint8 dst[12]) { - SYSTEMTIME systime; - uint64 file_time_uint64 = 0; - GetSystemTime(&systime); - SystemTimeToFileTime(&systime, (FILETIME*)&file_time_uint64); - uint64 time_since_epoch_100ns = (file_time_uint64 - 116444736000000000); - uint64 secs_since_epoch = time_since_epoch_100ns / 10000000 + 0x400000000000000a; - uint32 nanos = (uint32)(time_since_epoch_100ns % 10000000) * 100; - WriteBE64(dst, secs_since_epoch); - WriteBE32(dst + 8, nanos); -} - - - void PushLine(const char *s) { size_t l = strlen(s); char buf[64]; @@ -973,7 +1073,8 @@ void PushLine(const char *s) { x[l + tl] = '\r'; x[l + tl + 1] = '\n'; x[l + tl + 2] = '\0'; - MyPostMessage(WM_USER + 3, 0, (LPARAM)x); + g_backend_delegate->OnLogLine((const char**)&x); + free(x); } void EnsureConfigDirCreated() { @@ -986,7 +1087,6 @@ void EnableControl(int wnd, bool b) { EnableWindow(GetDlgItem(g_ui_window, wnd), b); } - LRESULT CALLBACK 
NotifyWndProc(HWND hwnd, UINT uMsg, WPARAM wParam, LPARAM lParam) { switch (uMsg) { case WM_USER + 10: @@ -1012,27 +1112,649 @@ void CreateNotificationWindow() { CreateWindow("TunSafe-f19e092db01cbe0fb6aee132f8231e5b71c98f90", "TunSafe-f19e092db01cbe0fb6aee132f8231e5b71c98f90", 0, 0, 0, 0, 0, 0, 0, g_hinstance, NULL); } - -void CallbackUpdateUI() { - if (g_ui_visible) - MyPostMessage(WM_USER + 4, NULL, NULL); +HFONT CreateBoldUiFont() { + LOGFONT lf; + HFONT ffont = (HFONT)SendMessage(g_ui_window, WM_GETFONT, 0, 0); + GetObject(ffont, sizeof(lf), &lf); + lf.lfWeight = FW_BOLD; + HFONT font = CreateFontIndirect(&lf); + return font; } -void CallbackTriggerReconnect() { - PostMessage(g_ui_window, WM_USER + 11, 0, 0); +void FillRectColor(HDC dc, const RECT &r, COLORREF color) { + COLORREF old = ::SetBkColor(dc, color); + ExtTextOut(dc, 0, 0, ETO_OPAQUE, &r, NULL, 0, NULL); + ::SetBkColor(dc, old); } -void CallbackSetPublicKey(const uint8 public_key[32]) { - char *str = (char*)base64_encode(public_key, 32, NULL); - PostMessage(g_ui_window, WM_USER + 5, NULL, (LPARAM)str); +void DrawRectOutline(HDC dc, const RECT &r) { + POINT points[5] = { + {r.left, r.top}, + {r.right, r.top}, + {r.right, r.bottom}, + {r.left, r.bottom}, + {r.left, r.top} + }; + Polyline(dc, points, 5); } -int WINAPI WinMain (HINSTANCE hInstance, HINSTANCE hPrevInstance, LPSTR lpCmdLine, int nShowCmd) { +static HFONT CreateFontHelper(int size, byte flags, const char *face, int angle = 0) { + return CreateFontA(-RescaleDpi(size), 0, angle, angle, flags & 1 ? FW_BOLD : 0, FALSE, flags & 2 ? 
1 : 0, FALSE, DEFAULT_CHARSET, OUT_DEFAULT_PRECIS, + CLIP_DEFAULT_PRECIS, DEFAULT_QUALITY, DEFAULT_PITCH, face); +} + +static const char *StatusCodeToString(TunsafeBackend::StatusCode code) { + switch (code) { + case TunsafeBackend::kErrorInitialize: return "Configuration Error"; + case TunsafeBackend::kErrorTunPermanent: return "TUN Adapter Error"; + case TunsafeBackend::kErrorServiceLost: return "Service Lost"; + case TunsafeBackend::kStatusStopped: return "Disconnected"; + case TunsafeBackend::kStatusInitializing: return "Initializing"; + case TunsafeBackend::kStatusConnecting: return "Connecting..."; + case TunsafeBackend::kStatusReconnecting: return "Reconnecting..."; + case TunsafeBackend::kStatusConnected: return "Connected"; + case TunsafeBackend::kStatusTunRetrying: return "TUN Adapter Error, retrying..."; + default: + return "Unknown"; + } +} + +static void DrawInPaintBox(HDC hdc, int w, int h) { + RECT rect = {0, 0, w, h}; + FillRect(hdc, &rect, (HBRUSH)(COLOR_3DFACE + 1)); + + HFONT font = CreateBoldUiFont(); + + char namebuf[128]; + GetCurrentConfigTitle(namebuf, sizeof(namebuf)); + + RECT btrect = GetParentRect(GetDlgItem(g_ui_window, ID_START)); + + HPEN pen = CreatePen(PS_SOLID, 0, GetSysColor(COLOR_3DSHADOW)); + HBRUSH brush = GetSysColorBrush(COLOR_WINDOW); + + SelectObject(hdc, pen); + SelectObject(hdc, brush); + + comborect = MakeRect(0, btrect.top + 1, w, btrect.bottom - 1); + Rectangle(hdc, 0, btrect.top + 1, w, btrect.bottom - 1); + + if (arrowbitmap == NULL) + arrowbitmap = LoadBitmap(g_hinstance, MAKEINTRESOURCE(IDB_DOWNARROW)); + + int bw = RescaleDpi(6); + + HDC memdc = CreateCompatibleDC(hdc); + SelectObject(memdc, arrowbitmap); + StretchBlt(hdc, w - 1 - bw - 5, btrect.top + 1 + ((btrect.bottom - btrect.top - bw) >> 1), + bw, bw, memdc, 0, 0, 6, 6, SRCCOPY); + + int th = RescaleDpi(20); + + SelectObject(hdc, font); + SetBkColor(hdc, GetSysColor(COLOR_WINDOW)); + TextOut(hdc, RescaleDpi(4), btrect.top + RescaleDpi(4), namebuf, 
(int)strlen(namebuf)); + + int y = btrect.bottom + RescaleDpi(4); + + DeleteObject(pen); + + SelectObject(hdc, (HFONT)SendMessage(g_ui_window, WM_GETFONT, 0, 0)); + SetBkColor(hdc, GetSysColor(COLOR_3DFACE)); + + TunsafeBackend::StatusCode status = g_backend->status(); + my_strlcpy(namebuf, sizeof(namebuf) - 32, StatusCodeToString(status)); + if (status == TunsafeBackend::kStatusConnected || status == TunsafeBackend::kStatusReconnecting) { + uint64 when = g_processor_stats.first_complete_handshake_timestamp; + uint32 seconds = (when != 0) ? (uint32)((OsGetMilliseconds() - when + 500) / 1000) : 0; + snprintf(strchr(namebuf, 0), 32, ", %.2d:%.2d:%.2d", seconds / 3600, (seconds / 60) % 60, seconds % 60); + } + + int img = (status == TunsafeBackend::kStatusConnected) ? 0 : + g_backend->is_started() && !TunsafeBackend::IsPermanentError(status) ? 1 : 2; + + static const COLORREF kDotColors[3] = { + 0x51a600, + 0x00c0c0, + 0x0000c0, + }; + SetBkMode(hdc, TRANSPARENT); + COLORREF oldcolor = SetTextColor(hdc, kDotColors[img]); + HFONT oldfont = (HFONT)SelectObject(hdc, CreateFontHelper(18, 0, "Tahoma")); + wchar_t bullet = 0x25CF; + TextOutW(hdc, RescaleDpi(2), y - RescaleDpi(7), &bullet, 1); + DeleteObject(SelectObject(hdc, oldfont)); + SetTextColor(hdc, oldcolor); + + TextOut(hdc, RescaleDpi(2 + 14), y, namebuf, (int)strlen(namebuf)); + + y += RescaleDpi(18); + + uint32 ip = g_backend->GetIP(); + if (ip) { + print_ip(namebuf, ip); + TextOut(hdc, 2, y, namebuf, (int)strlen(namebuf)); + } + DeleteObject(font); + DeleteDC(memdc); +} + +typedef void DrawInPaintBoxFunc(HDC dc, int w, int h); +static void HandleWmPaintPaintbox(HWND hwnd, DrawInPaintBoxFunc *func) { + PAINTSTRUCT ps; + BeginPaint(hwnd, &ps); + + RECT r; + GetClientRect(hwnd, &r); + + HBITMAP bmp = CreateCompatibleBitmap(ps.hdc, r.right, r.bottom); + HDC dc = CreateCompatibleDC(ps.hdc); + SelectObject(dc, bmp); + + func(dc, r.right, r.bottom); + + BitBlt(ps.hdc, 0, 0, r.right, r.bottom, dc, 0, 0, SRCCOPY); + 
DeleteDC(dc); + DeleteObject(bmp); + EndPaint(hwnd, &ps); +} + +static LRESULT CALLBACK PaintBoxWndProc(HWND hwnd, UINT uMsg, WPARAM wParam, LPARAM lParam) { + switch (uMsg) { + case WM_PAINT: { + HandleWmPaintPaintbox(hwnd, &DrawInPaintBox); + return TRUE; + } + case WM_LBUTTONDOWN: { + POINT pt = {GET_X_LPARAM(lParam), GET_Y_LPARAM(lParam)}; + if (PtInRect(&comborect, pt)) { + // Avoid showing the menu again if clicking to close. + if (GetTickCount() - g_timestamp_of_exit_menuloop >= 50u) + ShowSettingsMenu(g_ui_window); + } + return TRUE; + } + } + return DefWindowProc(hwnd, uMsg, wParam, lParam); +} + +static void DrawGraph(HDC dc, const RECT *rr, StatsCollector::TimeSeries **sources, const COLORREF *colors, int num_source, const char *xcaption, const char *ycaption) { + RECT r = *rr; + FillRectColor(dc, r, 0xffffff); + + RECT margins = { 30, 10, -10, -15 }; + margins = RescaleDpiRect(margins); + + r.left += margins.left; + r.top += margins.top; + r.right += margins.right; + r.bottom += margins.bottom; + + HPEN borderpen = CreatePen(PS_SOLID, 1, 0x808080); + SelectObject(dc, borderpen); + DrawRectOutline(dc, r); + + static const uint8 bits[4] = {0x70, 0, 0, 0}; + HBITMAP bmp = CreateBitmap(4, 1, 1, 1, &bits); + HBRUSH brush = CreatePatternBrush(bmp); + DeleteObject(bmp); + + // Draw horizontal dotted lines + { + SetTextColor(dc, 0x808080); + SetBkColor(dc, 0xffffff); + int inc = (r.bottom - r.top) >> 2; + RECT r2 = {r.left + 1, r.top + inc * 1, r.right - 1, r.top + inc * 1 + 1}; + FillRect(dc, &r2, brush); + r2.top += inc; r2.bottom += inc; + FillRect(dc, &r2, brush); + r2.top += inc; r2.bottom += inc; + FillRect(dc, &r2, brush); + } + DeleteObject(brush); + + static const uint8 bits_vertical[16] = { + 0xff, 0x0, 0xff, 0, + 0xff, 0x0, 0x0, 0, + 0xff, 0x0, 0x0, 0, + 0x0, 0x0, 0x0, 0}; + bmp = CreateBitmap(1, 4, 1, 1, &bits_vertical); + brush = CreatePatternBrush(bmp); + DeleteObject(bmp); + + { + // Draw vertical dotted lines + for (int i = 1; i < 12; i++) { + 
int x = (r.right - r.left) * i / 12; + RECT r2 = {r.left + x, r.top + 1, r.left + x + 1, r.bottom - 1}; + FillRect(dc, &r2, brush); + } + } + + { + // Draw legend text + HFONT font = CreateFontHelper(10, 0, "Tahoma"); + SelectObject(dc, font); + SetTextColor(dc, 0x202020); + SetBkMode(dc, TRANSPARENT); + RECT r2 = {r.left + 1, r.bottom, r.right - 1, r.bottom + RescaleDpi(15)}; + DrawText(dc, xcaption, (int)strlen(xcaption), &r2, DT_CENTER | DT_SINGLELINE | DT_VCENTER); + DeleteObject(font); + } + DeleteObject(brush); + DeleteObject(borderpen); + + // Determine the scaling factor + float mx = 1; + for (size_t j = 0; j != num_source; j++) { + const StatsCollector::TimeSeries *src = sources[j]; + for (size_t i = 0; i != src->size; i++) + mx = max(mx, src->data[i]); + } + int topval = (int)(mx + 0.5f); + // round it appropriately + if (topval >= 500) + topval = (topval + 99) / 100 * 100; + else if (topval >= 200) + topval = (topval + 49) / 50 * 50; + else if (topval >= 50) + topval = (topval + 9) / 10 * 10; + else if (topval >= 20) + topval = (topval + 4) / 5 * 5; + if (topval > mx) + mx = (float)topval; + + { + RECT r2 = {r.left - RescaleDpi(30), r.top - RescaleDpi(2), r.left - RescaleDpi(2), r.bottom}; + char buf[30]; + sprintf(buf, "%d", topval); + DrawText(dc, buf, (int)strlen(buf), &r2, DT_RIGHT | DT_SINGLELINE); + r2.top = r.bottom - RescaleDpi(12); + DrawText(dc, "0", 1, &r2, DT_RIGHT | DT_SINGLELINE); + } + + float mx_f = (1.0f / mx) * (r.bottom - r.top); + + for (size_t k = 0; k != num_source; k++) { + HPEN borderpen = CreatePen(PS_SOLID, 2, colors[k]); + SelectObject(dc, borderpen); + const StatsCollector::TimeSeries *src = sources[k]; + POINT *points = new POINT[src->size]; + for (size_t i = 0, j = src->shift; i != src->size; i++) { + points[i].x = (int)(r.left + (r.right - r.left) * i / (src->size - 1)); + points[i].y = r.bottom - (int)((float)src->data[j] * mx_f); + if (++j == src->size) j = 0; + } + Polyline(dc, points, src->size); + delete points; + 
DeleteObject(borderpen); + } + + if (ycaption != NULL) { + HFONT font = CreateFontHelper(10, 0, "Tahoma", 900); + SelectObject(dc, font); + TextOut(dc, r.left - RescaleDpi(18), ((r.top + r.bottom) >> 1) + RescaleDpi(12), ycaption, (int)strlen(ycaption)); + DeleteObject(font); + } +} + +static const char * const kGraphStepNames[] = { + "1 second step", + "5 second step", + "30 second step", + "5 minute step", +}; + +static void DrawInGraphBox(HDC hdc, int w, int h) { + RECT r = {0, 0, w, h}; + + static const COLORREF color[4] = { + 0x00c000, + 0xc00000, + }; + + LinearizedGraph *graph = g_backend->GetGraph(g_selected_graph_type); + StatsCollector::TimeSeries *time_series_ptr[4]; + StatsCollector::TimeSeries time_series[4]; + + int num_charts = 0; + if (graph && graph->num_charts <= 4) { + uint8 *ptr = (uint8*)(graph + 1); + for (int i = 0; i < graph->num_charts; i++) { + time_series_ptr[i] = &time_series[i]; + time_series[i].shift = 0; + time_series[i].size = *(uint32*)ptr; + time_series[i].data = (float*)(ptr + 4); + ptr += 4 + *(uint32*)ptr * 4; + if (ptr - (uint8*)graph > graph->total_size) + break; + } + num_charts = graph->num_charts; + } + + char buf[256]; + snprintf(buf, sizeof(buf), "Time (%s)", kGraphStepNames[g_selected_graph_type]); + + DrawGraph(hdc, &r, time_series_ptr, color, num_charts, buf, "Mbps"); + + free(graph); +} + +static LRESULT CALLBACK GraphBoxWndProc(HWND hwnd, UINT uMsg, WPARAM wParam, LPARAM lParam) { + switch (uMsg) { + case WM_PAINT: { + HandleWmPaintPaintbox(hwnd, &DrawInGraphBox); + return TRUE; + } + case WM_RBUTTONDOWN: { + HMENU menu = CreatePopupMenu(); + for(int i = 0; i < ARRAYSIZE(kGraphStepNames); i++) + AppendMenu(menu, (i == g_selected_graph_type) * MF_CHECKED, i + 1, kGraphStepNames[i]); + POINT pt = {GET_X_LPARAM(lParam), GET_Y_LPARAM(lParam)}; + ClientToScreen(hwnd, &pt); + int rv = TrackPopupMenu(menu, TPM_NONOTIFY | TPM_RETURNCMD, pt.x, pt.y, 0, hwnd, NULL); + DestroyMenu(menu); + if (rv != 0) { + g_selected_graph_type 
= rv - 1; + InvalidateRect(hwnd, NULL, FALSE); + } + return TRUE; + } + } + return DefWindowProc(hwnd, uMsg, wParam, lParam); +} + +struct AdvancedTextInfo { + uint16 y; + uint8 indent; + const char *title; +}; + +static const AdvancedTextInfo ADVANCED_TEXT_INFOS[] = { +#define Y 26 + {Y + 19 * 0, 66, "Public Key:"}, + {Y + 19 * 1, 66, "Endpoint:"}, + {Y + 19 * 2, 66, "Transfer:"}, + {Y + 19 * 3, 66, "Handshake:"}, + {Y + 19 * 4, 66, ""}, + {Y + 19 * 5, 66, "Overhead:"}, +#undef Y +}; + +static char *PrintLastHandshakeAt(char buf[256], WgProcessorStats *ps) { + char *d = buf; + if (ps->last_complete_handshake_timestamp) { + uint32 ago = (uint32)((OsGetMilliseconds() - ps->last_complete_handshake_timestamp + 500) / 1000); + uint32 hours = ago / 3600; + uint32 minutes = (ago - hours * 3600) / 60; + uint32 seconds = (ago - hours * 3600 - minutes * 60); + if (hours) + d += snprintf(d, 32, hours == 1 ? "%d hour, " : "%d hours, ", hours); + if (minutes) + d += snprintf(d, 32, minutes == 1 ? "%d minute, " : "%d minutes, ", minutes); + if (d == buf || seconds) + d += snprintf(d, 32, seconds == 1 ? 
"%d second, " : "%d seconds, ", seconds); + memcpy(d - 2, " ago", 5); + } else { + memcpy(buf, "(never)", 8); + } + return buf; +} + +static const char *GetAdvancedInfoValue(char buffer[256], int i) { + char tmp[64], tmp2[64]; + WgProcessorStats *ps = &g_processor_stats; + switch (i) { + case 0: { + if (IsOnlyZeros(g_backend->public_key(), 32)) + return ""; + char *str = (char*)base64_encode(g_backend->public_key(), 32, NULL); + snprintf(buffer, 256, "%s", str); + free(str); + return buffer; + } + case 1: { + char ip[kSizeOfAddress]; + if (ps->endpoint.sin.sin_family == 0) + return ""; + PrintIpAddr(ps->endpoint, ip); + snprintf(buffer, 256, "%s:%d", ip, htons(ps->endpoint.sin.sin_port)); + return buffer; + } + + case 2: + snprintf(buffer, 256, "%s in (%lld packets), %s out (%lld packets)", + PrintMB(tmp, ps->udp_bytes_in), ps->udp_packets_in, + PrintMB(tmp2, ps->udp_bytes_out), ps->udp_packets_out/*, udp_qsize2 - udp_qsize1, g_tun_reads*/); + return buffer; + case 3: return PrintLastHandshakeAt(buffer, ps); + case 4: { + snprintf(buffer, 256, "%d handshakes in (%d failed), %d handshakes out (%d failed)", + ps->handshakes_in, ps->handshakes_in - ps->handshakes_in_success, + ps->handshakes_out, ps->handshakes_out - ps->handshakes_out_success); + return buffer; + } + case 5: { + uint64 overhead_in = ps->udp_bytes_in + ps->udp_packets_in * 40 - ps->tun_bytes_out; + uint32 overhead_in_pct = ps->tun_bytes_out ? (uint32)(overhead_in * 100000 / ps->tun_bytes_out) : 0; + + uint64 overhead_out = ps->udp_bytes_out + ps->udp_packets_out * 40 - ps->tun_bytes_in; + uint32 overhead_out_pct = ps->tun_bytes_in ? 
(uint32)(overhead_out * 100000 / ps->tun_bytes_in) : 0; + + snprintf(buffer, 256, "%d.%.3d%% in, %d.%.3d%% out", overhead_in_pct / 1000, overhead_in_pct % 1000, + overhead_out_pct / 1000, overhead_out_pct % 1000); + return buffer; + } + default: return ""; + } +} + +static void DrawInAdvancedBox(HDC dc, int w, int h) { + RECT r = {0, 0, w, h}; + + FillRectColor(dc, r, 0xffffff); + + SelectObject(dc, (HFONT)SendMessage(g_ui_window, WM_GETFONT, 0, 0)); + SetTextColor(dc, GetSysColor(COLOR_WINDOWTEXT)); + SetBkColor(dc, GetSysColor(COLOR_WINDOW)); + + const AdvancedTextInfo *tp = ADVANCED_TEXT_INFOS; + char buffer[256]; + + for (size_t i = 0; i != ARRAYSIZE(ADVANCED_TEXT_INFOS); i++, tp++) { + int x = 8; + + RECT r = {x, tp->y, x + tp->indent, tp->y + 19}; + r = RescaleDpiRect(r); + ::ExtTextOut(dc, r.left, r.top, ETO_CLIPPED | ETO_OPAQUE, &r, tp->title, (UINT)strlen(tp->title), NULL); + + const char *s = GetAdvancedInfoValue(buffer, (int)i); + r.left = r.right; + r.right = w; + ::ExtTextOut(dc, r.left, r.top, ETO_CLIPPED | ETO_OPAQUE, &r, s, (UINT)strlen(s), NULL); + } + + SetBkColor(dc, GetSysColor(COLOR_3DFACE)); + + static const int grouptop[1] = { + 2 + }; + static const char *grouptext[1] = { + "General", + }; + + HFONT font = CreateFontHelper(12, 1, "Tahoma"); + SelectObject(dc, font); + for (size_t i = 0; i != ARRAYSIZE(grouptext); i++) { + RECT r = {RescaleDpi(4), RescaleDpi(grouptop[i]), w - RescaleDpi(4), RescaleDpi(grouptop[i] + 18)}; + ::ExtTextOut(dc, RescaleDpi(8), r.top + 1, ETO_CLIPPED | ETO_OPAQUE, &r, grouptext[i], (UINT)strlen(grouptext[i]), NULL); + } + DeleteFont(font); +} + +static LRESULT CALLBACK AdvancedBoxWndProc(HWND hwnd, UINT uMsg, WPARAM wParam, LPARAM lParam) { + switch (uMsg) { + case WM_PAINT: { + HandleWmPaintPaintbox(hwnd, &DrawInAdvancedBox); + return TRUE; + } + case WM_ERASEBKGND: + return TRUE; + + case WM_RBUTTONDOWN: { + int x = GET_X_LPARAM(lParam), y = GET_Y_LPARAM(lParam); + char buffer[256]; + + const AdvancedTextInfo *tp 
= ADVANCED_TEXT_INFOS; + for (size_t i = 0; i != ARRAYSIZE(ADVANCED_TEXT_INFOS); i++, tp++) { + if (x >= RescaleDpi(tp->indent) && y >= RescaleDpi(tp->y) && y < RescaleDpi(tp->y + 19)) { + HMENU menu = CreatePopupMenu(); + AppendMenu(menu, 0, 1, "Copy"); + POINT pt = {x, y}; + ClientToScreen(hwnd, &pt); + int rv = TrackPopupMenu(menu, TPM_NONOTIFY | TPM_RETURNCMD, pt.x, pt.y, 0, hwnd, NULL); + DestroyMenu(menu); + if (rv == 1) + SetClipboardString(GetAdvancedInfoValue(buffer, (int)i)); + return TRUE; + } + } + return TRUE; + } + } + return DefWindowProc(hwnd, uMsg, wParam, lParam); +} + +void InitializeClass(WNDPROC wndproc, const char *name) { + WNDCLASSEX wce = {0}; + wce.cbSize = sizeof(wce); + wce.lpfnWndProc = wndproc; + wce.hInstance = g_hinstance; + wce.lpszClassName = name; + wce.style = CS_HREDRAW | CS_VREDRAW; + wce.hCursor = LoadCursor(NULL, IDC_ARROW); + RegisterClassEx(&wce); +} + +static bool CreateMainWindow() { + LoadLibrary(TEXT("Riched20.dll")); + INITCOMMONCONTROLSEX ccx; + ccx.dwSize = sizeof(INITCOMMONCONTROLSEX); + ccx.dwICC = ICC_TAB_CLASSES; + InitCommonControlsEx(&ccx); + + InitializeClass(&PaintBoxWndProc, "PaintBox"); + InitializeClass(&GraphBoxWndProc, "GraphBox"); + InitializeClass(&AdvancedBoxWndProc, "AdvancedBox"); + + HDC dc = GetDC(0); + g_large_fonts = GetDeviceCaps(dc, LOGPIXELSX); + ReleaseDC(0, dc); + + g_message_taskbar_created = RegisterWindowMessage(TEXT("TaskbarCreated")); + + g_icons[0] = LoadIcon(GetModuleHandle(NULL), MAKEINTRESOURCE(IDI_ICON1)); + g_icons[1] = LoadIcon(GetModuleHandle(NULL), MAKEINTRESOURCE(IDI_ICON0)); + g_ui_window = CreateDialog(GetModuleHandle(NULL), MAKEINTRESOURCE(IDD_DIALOG1), NULL, &DlgProc); + + if (!g_ui_window) + return false; + + DragAcceptFiles(g_ui_window, TRUE); + + ChangeWindowMessageFilter(WM_DROPFILES, MSGFLT_ADD); + ChangeWindowMessageFilter(WM_COPYDATA, MSGFLT_ADD); + ChangeWindowMessageFilter(0x0049, MSGFLT_ADD); + ChangeWindowMessageFilter(WM_USER + 10, MSGFLT_ADD); + + TCITEM 
tabitem; + HWND hwnd_tab = GetDlgItem(g_ui_window, IDC_TAB); + hwndTab = hwnd_tab; + tabitem.mask = TCIF_TEXT; + tabitem.pszText = "Logs"; + TabCtrl_InsertItem(hwnd_tab, 0, &tabitem); + tabitem.pszText = "Charts"; + TabCtrl_InsertItem(hwnd_tab, 1, &tabitem); + tabitem.pszText = "Advanced"; + TabCtrl_InsertItem(hwnd_tab, 2, &tabitem); + SetWindowLong(hwnd_tab, GWL_EXSTYLE, GetWindowLong(hwnd_tab, GWL_EXSTYLE) | WS_EX_COMPOSITED); + + + + hwndEdit = GetDlgItem(g_ui_window, IDC_RICHEDIT21); + hwndPaintBox = GetDlgItem(g_ui_window, IDC_PAINTBOX); + hwndGraphBox = GetDlgItem(g_ui_window, IDC_GRAPHBOX); + hwndAdvancedBox = GetDlgItem(g_ui_window, IDC_ADVANCEDBOX); + + SetWindowLong(hwndEdit, GWL_EXSTYLE, GetWindowLong(hwndEdit, GWL_EXSTYLE) &~ WS_EX_CLIENTEDGE); + + // Create the status bar. + hwndStatus = CreateWindowEx( + WS_EX_COMPOSITED, STATUSCLASSNAME, NULL, + WS_CHILD | WS_VISIBLE, 0, 0, 0, 0, g_ui_window, + (HMENU)IDC_STATUSBAR, g_hinstance, NULL); + + HandleWindowSizing(); + UpdateTabSelection(); + return true; +} + +int WINAPI WinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance, LPSTR lpCmdLine, int nShowCmd) { g_hinstance = hInstance; InitCpuFeatures(); + WSADATA wsaData = {0}; + WSAStartup(MAKEWORD(2, 2), &wsaData); + + bool minimize = false; + bool is_autostart = false; + const char *filename = NULL; + + for (int i = 1; i < __argc; i++) { + const char *arg = __argv[i]; + if (strcmp(arg, "/minimize") == 0) { + minimize = true; + } else if (strcmp(arg, "/minimize_on_connect") == 0) { + g_minimize_on_connect = true; + } else if (strcmp(arg, "/allow_pre_post") == 0) { + g_allow_pre_post = true; + } else if (strcmp(arg, "--service") == 0) { + RunProcessAsTunsafeServiceProcess(); + return 0; + } else if (strcmp(arg, "--delete-service-and-start") == 0) { + UninstallTunSafeWindowsService(); + } else if (strcmp(arg, "--autostart") == 0) { + is_autostart = true; + } else if (strcmp(arg, "--set-allow-pre-post") == 0) { + bool want = i + 1 < __argc && atoi(__argv[i + 
1]) != 0; + RegCreateKeyEx(HKEY_LOCAL_MACHINE, "Software\\TunSafe", NULL, NULL, 0, KEY_ALL_ACCESS, NULL, &g_hklm_reg_key, NULL); + RegWriteInt(g_hklm_reg_key, "AllowPrePost", want); + return 0; + } else if (strcmp(arg, "--import") == 0) { + if (i + 1 >= __argc) return 1; + const char *filename = __argv[i + 1]; + return ImportFile(filename, true); + } else { + filename = arg; + break; + } + } + + SetProcessDPIAware(); + + RegCreateKeyEx(HKEY_CURRENT_USER, "Software\\TunSafe", NULL, NULL, 0, KEY_ALL_ACCESS, NULL, &g_reg_key, NULL); + RegCreateKeyEx(HKEY_LOCAL_MACHINE, "Software\\TunSafe", NULL, NULL, 0, KEY_ALL_ACCESS, NULL, &g_hklm_reg_key, NULL); + RegOpenKeyEx(HKEY_LOCAL_MACHINE, "Software\\TunSafe", 0, KEY_READ, &g_hklm_readonly_reg_key); + + g_startup_flags = RegReadInt(g_reg_key, "StartupFlags", 0); + + if (is_autostart) { + g_disable_connect_on_start = !(g_startup_flags & kStartupFlag_ConnectWhenWindowsStarts); + minimize = !!(g_startup_flags & kStartupFlag_MinimizeToTrayWhenWindowsStarts); + } + // Check if the app is already running. - CreateMutexA(0, FALSE, "TunSafe-f19e092db01cbe0fb6aee132f8231e5b71c98f90"); + g_runonce_mutex = CreateMutexA(0, FALSE, "TunSafe-f19e092db01cbe0fb6aee132f8231e5b71c98f90"); if (GetLastError() == ERROR_ALREADY_EXISTS) { HWND window = FindWindow("TunSafe-f19e092db01cbe0fb6aee132f8231e5b71c98f90", NULL); DWORD_PTR result; @@ -1041,103 +1763,64 @@ int WINAPI WinMain (HINSTANCE hInstance, HINSTANCE hPrevInstance, LPSTR lpCmdLin } return 1; } + + TOKEN_ELEVATION_TYPE toktype; + g_is_limited_uac_account = (GetProcessElevationType(&toktype) && toktype == TokenElevationTypeLimited); + g_is_tunsafe_service_running = IsTunsafeServiceRunning(); + bool want_use_service = !!(g_startup_flags & (kStartupFlag_BackgroundService | kStartupFlag_ForegroundService)); + + // Re-launch the process as administrator if the TunSafe service isn't running. 
+ if ((!g_is_tunsafe_service_running || !want_use_service) && g_is_limited_uac_account) { + CloseHandle(g_runonce_mutex); + if (!RestartProcessAsAdministrator()) + MessageBoxA(0, "TunSafe needs to run as Administrator unless the TunSafe Service is started.", "TunSafe", MB_ICONWARNING); + return 0; + } + CreateNotificationWindow(); - WSADATA wsaData = {0}; - if (WSAStartup(MAKEWORD(2, 2), &wsaData) != 0) { - RERROR("WSAStartup failed"); - return 1; - } - - LoadLibrary(TEXT("Riched20.dll")); - - g_backend = new TunsafeBackendWin32(); - - InitMyPostMessage(); - InitCommonControls(); - - g_icons[0] = LoadIcon(GetModuleHandle(NULL), MAKEINTRESOURCE(IDI_ICON1)); - g_icons[1] = LoadIcon(GetModuleHandle(NULL), MAKEINTRESOURCE(IDI_ICON0)); - g_ui_window = CreateDialog(GetModuleHandle(NULL), MAKEINTRESOURCE(IDD_DIALOG1), NULL, &DlgProc); - - if (!g_ui_window) - return 1; - - RegCreateKeyEx(HKEY_CURRENT_USER, "Software\\TunSafe", NULL, NULL, 0, KEY_ALL_ACCESS, NULL, &g_reg_key, NULL); - DragAcceptFiles(g_ui_window, TRUE); - - ChangeWindowMessageFilter(WM_DROPFILES, MSGFLT_ADD); - ChangeWindowMessageFilter(WM_COPYDATA, MSGFLT_ADD); - ChangeWindowMessageFilter(0x0049, MSGFLT_ADD); - - static const int ctrls[] = {IDTXT_UDP, IDTXT_TUN, IDTXT_HANDSHAKE}; - for (int i = 0; i < 3; i++) { - HWND w = GetDlgItem(g_ui_window, ctrls[i]); - SetWindowLong(w, GWL_EXSTYLE, GetWindowLong(w, GWL_EXSTYLE) | WS_EX_COMPOSITED); - } - - g_allow_pre_post = RegReadInt("AllowPrePost", 0) != 0; - - bool minimize = false; - const char *filename = NULL; - - for (size_t i = 1; i < __argc; i++) { - const char *arg = __argv[i]; - - if (_stricmp(arg, "/minimize") == 0) { - minimize = true; - } else if (_stricmp(arg, "/minimize_on_connect") == 0) { - g_minimize_on_connect = true; - } else if (_stricmp(arg, "/allow_pre_post") == 0) { - g_allow_pre_post = true; - } else { - filename = arg; - break; - } - } - - if (!minimize) { - g_ui_visible = true; - ShowWindow(g_ui_window, SW_SHOW); - } - - 
UpdateIcon(UIW_NONE); - + g_backend_delegate = CreateTunsafeBackendDelegateThreaded(&my_procdel, []() { + if (g_ui_window) + PostMessage(g_ui_window, WM_USER + 2, 0, 0); + }); g_logger = &PushLine; + if (!CreateMainWindow()) + return 1; + + g_current_filename = _strdup(""); + g_cmdline_filename = filename; + + if (!g_allow_pre_post && g_hklm_readonly_reg_key) + g_allow_pre_post = RegReadInt(g_hklm_readonly_reg_key, "AllowPrePost", 0) != 0; + + // Attempt to start service... + if (want_use_service && !g_is_tunsafe_service_running) { + RINFO("Starting TunSafe service..."); + InstallTunSafeWindowsService(); + } + + CreateLocalOrRemoteBackend(want_use_service); + + if (!minimize) { + SetUiVisibility(true); + } + UpdateIcon(UIW_NONE); EnsureConfigDirCreated(); - if (filename) { - LoadConfigFile(filename, false, false); - } else { - char *conf = RegReadStr("ConfigFile", "TunSafe.conf"); - LoadConfigFile(conf, false, false); - free(conf); - } - - // PrintCpuFeatures(); - -// Benchmark(); - - if (filename != NULL || RegReadInt("IsConnected", 0)) { - StartService(); - } else { - RINFO("Press Connect to initiate a connection to the WireGuard server."); - } - MSG msg; - while (GetMessage(&msg, NULL, 0, 0)) { if (!IsDialogMessage(g_ui_window, &msg)) { TranslateMessage(&msg); DispatchMessage(&msg); } } - StopService(UIW_EXITING); + + if (!g_backend->is_remote()) + g_backend->Stop(); + + delete g_backend; RemoveIcon(); return 0; } - - - diff --git a/util.cpp b/util.cpp index a601a0b..2269a3f 100644 --- a/util.cpp +++ b/util.cpp @@ -17,6 +17,7 @@ #include #endif +#include #include "tunsafe_types.h" static char base64_alphabet[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; @@ -133,6 +134,7 @@ int RunCommand(const char *fmt, ...) { char *args[33]; char *envp[1] = {NULL}; int nargs = 0; + bool didadd = false; va_start(va, fmt); for (;;) { c = *fmt++; @@ -140,13 +142,14 @@ int RunCommand(const char *fmt, ...) 
{ c = *fmt++; if (c == 0) goto ZERO; if (c == 's') { - tmp += va_arg(va, char*); + char *arg = va_arg(va, char*); + if (arg != NULL) { + tmp += arg; + didadd = true; + } } else if (c == 'd') { snprintf(buf, 32, "%d", va_arg(va, int)); tmp += buf; - } else if (c == 'u') { - snprintf(buf, 32, "%u", va_arg(va, int)); - tmp += buf; } else if (c == '%') { tmp += '%'; } else if (c == 'A') { @@ -156,9 +159,12 @@ int RunCommand(const char *fmt, ...) { } } else if (c == ' ' || c == 0) { ZERO: - args[nargs++] = _strdup(tmp.c_str()); - tmp.clear(); - if (nargs == 32 || c == 0) break; + if (!tmp.empty() || didadd) { + args[nargs++] = _strdup(tmp.c_str()); + tmp.clear(); + if (nargs == 32 || c == 0) break; + } + didadd = false; } else { tmp += c; } @@ -187,7 +193,7 @@ ZERO: #endif if (ret != 0) - RERROR("Command %s failed %d!", fmt_org, ret); + RERROR("Command failed %d!", ret); return ret; } @@ -265,3 +271,29 @@ void RINFO(const char *msg, ...) { fputs("\n", stderr); } } + +void *memdup(const void *p, size_t size) { + void *x = malloc(size); + if (x) + memcpy(x, p, size); + return x; +} + +char *my_strndup(const char *p, size_t size) { + char *x = (char*)malloc(size + 1); + if (x) { + x[size] = 0; + memcpy(x, p, size); + } + return x; +} + +size_t my_strlcpy(char *dst, size_t dstsize, const char *src) { + size_t len = strlen(src); + if (dstsize) { + size_t lenx = std::min(dstsize - 1, len); + dst[lenx] = 0; + memcpy(dst, src, lenx); + } + return len; +} \ No newline at end of file diff --git a/util.h b/util.h index 48b8324..e0846f3 100644 --- a/util.h +++ b/util.h @@ -12,3 +12,14 @@ typedef void Logger(const char *msg); extern Logger *g_logger; +void *memdup(const void *p, size_t size); +char *my_strndup(const char *p, size_t size); + +size_t my_strlcpy(char *dst, size_t dstsize, const char *src); + + +template static inline T postinc(T&x, U v) { + T t = x; + x += v; + return t; +} diff --git a/util_win32.cpp b/util_win32.cpp new file mode 100644 index 0000000..1dec101 --- 
/dev/null +++ b/util_win32.cpp @@ -0,0 +1,378 @@ +// SPDX-License-Identifier: AGPL-1.0-only +// Copyright (C) 2018 Ludvig Strigeus . All Rights Reserved. +#include "stdafx.h" +#include "util_win32.h" +#include +#include +#include +#include +#include +#include + +const char *FindFilenameComponent(const char *s) { + size_t len = strlen(s); + for (;;) { + if (len == 0) + return ""; + len--; + if (s[len] == '\\' || s[len] == '/') + break; + } + return s + len + 1; +} + +void str_set(char **x, const char *s) { + free(*x); + *x = _strdup(s); +} + +char *str_cat_alloc(const char * const *a, size_t n) { + if (n > 32) return NULL; + size_t len[32], totlen = 0; + for (size_t i = 0; i < n; i++) { + len[i] = strlen(a[i]); + totlen += len[i]; + } + char *r = (char *)malloc(totlen + 1); + totlen = 0; + for (size_t i = 0; i < n; i++) { + size_t n = len[i]; + memcpy(r + totlen, a[i], n); + totlen += n; + } + r[totlen] = 0; + return r; +} + +char *str_cat_alloc(const char *a, const char *b) { + const char * x[2] = {a, b}; + return str_cat_alloc(x, 2); +} + +char *str_cat_alloc(const char *a, const char *b, const char *c) { + const char * x[3] = {a, b, c}; + return str_cat_alloc(x, 3); +} + + +int RegReadInt(HKEY hkey, const char *key, int def) { + DWORD value = def, n = sizeof(value); + RegQueryValueEx(hkey, key, NULL, NULL, (BYTE*)&value, &n); + return value; +} + +void RegWriteInt(HKEY hkey, const char *key, int value) { + RegSetValueEx(hkey, key, NULL, REG_DWORD, (BYTE*)&value, sizeof(value)); +} + +char *RegReadStr(HKEY hkey, const char *key, const char *def) { + char buf[1024]; + DWORD n = sizeof(buf) - 1; + DWORD type = 0; + if (RegQueryValueEx(hkey, key, NULL, &type, (BYTE*)buf, &n) != ERROR_SUCCESS || type != REG_SZ) + return def ? 
_strdup(def) : NULL; + if (n && buf[n - 1] == 0) + n--; + buf[n] = 0; + return _strdup(buf); +} + +void RegWriteStr(HKEY hkey, const char *key, const char *v) { + RegSetValueEx(hkey, key, NULL, REG_SZ, (BYTE*)v, (DWORD)strlen(v) + 1); +} + +bool GetProcessElevationType(TOKEN_ELEVATION_TYPE *pOutElevationType) { + *pOutElevationType = TokenElevationTypeDefault; + bool fResult = false; + HANDLE hProcToken = NULL; + if (::OpenProcessToken(::GetCurrentProcess(), TOKEN_QUERY, &hProcToken)) { + DWORD dwSize = 0; + TOKEN_ELEVATION_TYPE elevationType = TokenElevationTypeDefault; + if (::GetTokenInformation(hProcToken, TokenElevationType, &elevationType, sizeof(elevationType), &dwSize) + && dwSize == sizeof(elevationType)) { + *pOutElevationType = elevationType; + fResult = true; + } + ::CloseHandle(hProcToken); + } + return fResult; +} + +/*++ +Routine Description: This routine returns TRUE if the caller's +process is a member of the Administrators local group. Caller is NOT +expected to be impersonating anyone and is expected to be able to +open its own process and process token. +Arguments: None. +Return Value: +TRUE - Caller has Administrators local group. +FALSE - Caller does not have Administrators local group. 
-- +*/ + +BOOL IsUserAdmin(VOID) { + BOOL b; + SID_IDENTIFIER_AUTHORITY NtAuthority = SECURITY_NT_AUTHORITY; + PSID AdministratorsGroup; + b = AllocateAndInitializeSid( + &NtAuthority, + 2, + SECURITY_BUILTIN_DOMAIN_RID, + DOMAIN_ALIAS_RID_ADMINS, + 0, 0, 0, 0, 0, 0, + &AdministratorsGroup); + if (b) { + if (!CheckTokenMembership(NULL, AdministratorsGroup, &b)) { + b = FALSE; + } + FreeSid(AdministratorsGroup); + } + + return(b); +} + + +const wchar_t *SkipAppNameInCommandLineArgs(const wchar_t *s) { + if (*s == '\"') { + for (;;) { + s++; + if (*s == 0) return s; + if (*s == '\"') return s + 1; + } + } else { + for (;;) { + if (*s == 0) return s; + if (*s == ' ') return s + 1; + s++; + } + } +} + + +uint8* LoadFileSane(const char *name, size_t *size) { + FILE *f = fopen(name, "rb"); + uint8 *new_file = NULL, *file = NULL; + size_t j, i, n; + if (!f) return false; + fseek(f, 0, SEEK_END); + long x = ftell(f); + fseek(f, 0, SEEK_SET); + if (x < 0 || x >= 65536) goto error; + file = (uint8*)malloc(x + 1); + if (!file) goto error; + n = fread(file, 1, x + 1, f); + if (n != x || !SanityCheckBuf(file, n)) + goto error; + // Convert the file to DOS new lines + for (i = j = 0; i < n; i++) + j += (file[i] == '\n'); + new_file = (uint8*)malloc(n + 1 + j); + if (!new_file) goto error; + for (i = j = 0; i < n; i++) { + uint8 c = file[i]; + if (c == '\r') + continue; + if (c == '\n') + new_file[j++] = '\r'; + new_file[j++] = c; + } + new_file[j] = 0; + *size = j; + +error: + fclose(f); + free(file); + return new_file; +} + +int WriteOutFile(const char *filename, uint8 *filedata, size_t filesize) { + FILE *f = fopen(filename, "wb"); + if (!f) return kWriteOutFile_AccessError; + if (fwrite(filedata, 1, filesize, f) != filesize) { + fclose(f); + return kWriteOutFile_OtherError; + } + fclose(f); + return kWriteOutFile_Ok; +} + +bool FileExists(const CHAR *fileName) { + DWORD fileAttr = GetFileAttributes(fileName); + return (0xFFFFFFFF != fileAttr); +} + +__int64 FileSize(const 
char* name) { + WIN32_FILE_ATTRIBUTE_DATA fad; + if (!GetFileAttributesEx(name, GetFileExInfoStandard, &fad)) + return -1; // error condition, could call GetLastError to find out more + LARGE_INTEGER size; + size.HighPart = fad.nFileSizeHigh; + size.LowPart = fad.nFileSizeLow; + return size.QuadPart; +} + +static bool is_space(uint8_t c) { + return c == ' ' || c == '\r' || c == '\n' || c == '\t'; +} + +static bool is_valid(uint8_t c) { + return c >= ' ' || c == '\r' || c == '\n' || c == '\t'; +} + +bool SanityCheckBuf(uint8 *buf, size_t n) { + for (size_t i = 0; i < n; i++) { + if (!is_space(buf[i])) { + if (buf[i] != '[' && buf[i] != '#') + return false; + for (; i < n; i++) + if (!is_valid(buf[i])) + return false; + return true; + } + } + return false; +} + +void FindDesktopFolderView(REFIID riid, void **ppv) { + CComPtr spShellWindows; + spShellWindows.CoCreateInstance(CLSID_ShellWindows); + + CComVariant vtLoc(CSIDL_DESKTOP); + CComVariant vtEmpty; + long lhwnd; + CComPtr spdisp; + spShellWindows->FindWindowSW( + &vtLoc, &vtEmpty, + SWC_DESKTOP, &lhwnd, SWFO_NEEDDISPATCH, &spdisp); + + CComPtr spBrowser; + CComQIPtr(spdisp)-> + QueryService(SID_STopLevelBrowser, + IID_PPV_ARGS(&spBrowser)); + + CComPtr spView; + spBrowser->QueryActiveShellView(&spView); + + spView->QueryInterface(riid, ppv); +} + +void GetDesktopAutomationObject(REFIID riid, void **ppv) { + CComPtr spsv; + FindDesktopFolderView(IID_PPV_ARGS(&spsv)); + CComPtr spdispView; + spsv->GetItemObject(SVGIO_BACKGROUND, IID_PPV_ARGS(&spdispView)); + spdispView->QueryInterface(riid, ppv); +} + +void ShellExecuteFromExplorer( + PCSTR pszFile, + PCSTR pszParameters, + PCSTR pszDirectory, + PCSTR pszOperation, + int nShowCmd) { + CComPtr spFolderView; + GetDesktopAutomationObject(IID_PPV_ARGS(&spFolderView)); + CComPtr spdispShell; + spFolderView->get_Application(&spdispShell); + + CComQIPtr(spdispShell) + ->ShellExecute(CComBSTR(pszFile), + CComVariant(pszParameters ? 
pszParameters : ""), + CComVariant(pszDirectory ? pszDirectory : ""), + CComVariant(pszOperation ? pszOperation : ""), + CComVariant(nShowCmd)); +} + +size_t GetConfigPath(char *path, size_t path_size) { + + if (!GetModuleFileName(NULL, path, (DWORD)path_size)) { + *path = 0; + return 0; + } + char *last = (char *)FindFilenameComponent(path); + if (!*last || last + 8 > path + path_size) { + *path = 0; + return 0; + } + memcpy(last, "Config\\", 8 * sizeof(last[0])); + return last + 7 - path; +} + +static bool ContainsDotDot(const char *path) { + for (uint8 last = 0, cur; (cur = path[0]) != '\0'; last = cur, path++) + if (cur == '.' && last == cur) + return true; + return false; +} + +bool EnsureValidConfigPath(const char *path) { + char buf[1024]; + + size_t len = GetConfigPath(buf, sizeof(buf)); + return (len != 0) && (strlen(path) > len && memcmp(path, buf, len) == 0 && !ContainsDotDot(path + len)); +} + +bool RunProcessAsAdminWithArgs(const char *args, bool wait_for_exit) { + SHELLEXECUTEINFO shExecInfo = {0}; + char buf[1024]; + + if (!GetModuleFileName(NULL, buf, 1024)) + return false; + shExecInfo.cbSize = sizeof(shExecInfo); + shExecInfo.lpVerb = "runas"; + shExecInfo.lpFile = buf; + shExecInfo.lpParameters = args; + shExecInfo.nShow = SW_SHOW; + shExecInfo.fMask = SEE_MASK_NOASYNC | wait_for_exit * SEE_MASK_NOCLOSEPROCESS; + if (!ShellExecuteExA(&shExecInfo)) + return false; + if (shExecInfo.hProcess) { + WaitForSingleObject(shExecInfo.hProcess, 10000); + CloseHandle(shExecInfo.hProcess); + } + return true; +} + +bool RestartProcessAsAdministrator() { + SHELLEXECUTEINFOW shExecInfo = {0}; + wchar_t buf[1024]; + + if (!GetModuleFileNameW(NULL, buf, 1024)) + return false; + +// shExecInfo.hwnd = window; + shExecInfo.cbSize = sizeof(shExecInfo); + shExecInfo.lpVerb = L"runas"; + shExecInfo.lpFile = buf; + shExecInfo.lpParameters = SkipAppNameInCommandLineArgs(GetCommandLineW()); + shExecInfo.nShow = SW_SHOW; + + return ShellExecuteExW(&shExecInfo) != 0; +} + 
+bool SetClipboardString(const char *string) { + bool ok = false; + if (OpenClipboard(NULL)) { + HGLOBAL hglb; + size_t len = strlen(string); + hglb = GlobalAlloc(GMEM_SHARE | GMEM_MOVEABLE, (len + 1) * sizeof(char)); + LPSTR lptstr = (LPSTR)GlobalLock(hglb); + memcpy(lptstr, string, len + 1); + GlobalUnlock(hglb); + EmptyClipboard(); + ok = SetClipboardData(CF_TEXT, hglb) != 0; + CloseClipboard(); + } + return ok; +} + +RECT GetParentRect(HWND wnd) { + RECT btrect; + GetClientRect(wnd, &btrect); + MapWindowPoints(wnd, GetParent(wnd), (LPPOINT)&btrect, 2); + return btrect; +} + +RECT MakeRect(int l, int t, int r, int b) { + RECT rr = { l, t, r, b }; + return rr; +} diff --git a/util_win32.h b/util_win32.h new file mode 100644 index 0000000..8497903 --- /dev/null +++ b/util_win32.h @@ -0,0 +1,56 @@ +// SPDX-License-Identifier: AGPL-1.0-only +// Copyright (C) 2018 Ludvig Strigeus . All Rights Reserved. +#include "tunsafe_types.h" + +#pragma once +const char *FindFilenameComponent(const char *s); +void str_set(char **x, const char *s); + +char *str_cat_alloc(const char * const *a, size_t n); +char *str_cat_alloc(const char *a, const char *b); +char *str_cat_alloc(const char *a, const char *b, const char *c); + +int RegReadInt(HKEY hkey, const char *key, int def); +void RegWriteInt(HKEY hkey, const char *key, int value); +char *RegReadStr(HKEY hkey, const char *key, const char *def); +void RegWriteStr(HKEY hkey, const char *key, const char *v); + +// TokenElevationTypeDefault -- User is not using a split token. (e.g. UAC disabled or local admin "Administrator" account which UAC may not apply to.) +// TokenElevationTypeFull -- User has a split token, and the process is running elevated. +// TokenElevationTypeLimited -- User has a split token, but the process is not running elevated. 
+bool GetProcessElevationType(TOKEN_ELEVATION_TYPE *pOutElevationType); + + +const wchar_t *SkipAppNameInCommandLineArgs(const wchar_t *s); + +uint8* LoadFileSane(const char *name, size_t *size); + +enum { + kWriteOutFile_Ok = 0, + kWriteOutFile_AccessError = 1, + kWriteOutFile_OtherError = 2, +}; + +int WriteOutFile(const char *filename, uint8 *filedata, size_t filesize); + +bool SanityCheckBuf(uint8 *buf, size_t n); + +__int64 FileSize(const char* name); + +bool FileExists(const CHAR *fileName); + +void ShellExecuteFromExplorer( + PCSTR pszFile, + PCSTR pszParameters = nullptr, + PCSTR pszDirectory = nullptr, + PCSTR pszOperation = nullptr, + int nShowCmd = SW_SHOWNORMAL); + +size_t GetConfigPath(char *path, size_t path_size); +bool EnsureValidConfigPath(const char *path); + +bool RunProcessAsAdminWithArgs(const char *args, bool wait_for_exit); +bool RestartProcessAsAdministrator(); +bool SetClipboardString(const char *string); +RECT GetParentRect(HWND wnd); +RECT MakeRect(int l, int t, int r, int b); diff --git a/wireguard.cpp b/wireguard.cpp index ab9b393..2e0e72a 100644 --- a/wireguard.cpp +++ b/wireguard.cpp @@ -12,7 +12,9 @@ #include #include #include +#include "ipzip2/ipzip2.h" #include "wireguard.h" +#include "wireguard_config.h" uint64 OsGetMilliseconds(); @@ -35,11 +37,23 @@ WireguardProcessor::WireguardProcessor(UdpInterface *udp, TunInterface *tun, Pro dns_blocking_ = true; internet_blocking_ = kBlockInternet_Default; dns6_addr_.sin.sin_family = dns_addr_.sin.sin_family = 0; + + stats_last_bytes_in_ = 0; + stats_last_bytes_out_ = 0; + stats_last_ts_ = OsGetMilliseconds(); + + main_thread_scheduled_ = NULL; + main_thread_scheduled_last_ = &main_thread_scheduled_; } WireguardProcessor::~WireguardProcessor() { } +void WireguardProcessor::SetListenPort(int listen_port) { + listen_port_ = listen_port; +} + + bool WireguardProcessor::AddDnsServer(const IpAddr &sin) { IpAddr *target = (sin.sin.sin_family == AF_INET6) ? 
&dns6_addr_ : &dns_addr_; if (target->sin.sin_family != 0) @@ -48,7 +62,6 @@ bool WireguardProcessor::AddDnsServer(const IpAddr &sin) { return true; } - bool WireguardProcessor::SetTunAddress(const WgCidrAddr &addr) { WgCidrAddr *target = (addr.size == 128) ? &tun6_addr_ : &tun_addr_; if (target->size != 0) @@ -57,9 +70,37 @@ bool WireguardProcessor::SetTunAddress(const WgCidrAddr &addr) { return true; } +void WireguardProcessor::AddExcludedIp(const WgCidrAddr &cidr_addr) { + excluded_ips_.push_back(cidr_addr); +} -ProcessorStats WireguardProcessor::GetStats() { - stats_.last_complete_handskake_timestamp = dev_.last_complete_handskake_timestamp(); +void WireguardProcessor::SetMtu(int mtu) { + if (mtu >= 576 && mtu <= 10000) + mtu_ = mtu; +} + +void WireguardProcessor::SetAddRoutesMode(bool mode) { + add_routes_mode_ = mode; +} + +void WireguardProcessor::SetDnsBlocking(bool dns_blocking) { + dns_blocking_ = dns_blocking; +} + +void WireguardProcessor::SetInternetBlocking(InternetBlockState internet_blocking) { + internet_blocking_ = internet_blocking; +} + +void WireguardProcessor::SetHeaderObfuscation(const char *key) { + dev_.SetHeaderObfuscation(key); +} + +WgProcessorStats WireguardProcessor::GetStats() { + // todo: only supports one peer but i want this in the ui for now. + stats_.endpoint.sin.sin_family = 0; + WgPeer *peer = dev_.first_peer(); + if (peer) + stats_.endpoint = peer->endpoint_; return stats_; } @@ -92,6 +133,7 @@ static bool IsWgCidrAddrSubsetOf(const WgCidrAddr &inner, const WgCidrAddr &oute } bool WireguardProcessor::Start() { + assert(dev_.IsMainThread()); if (!udp_->Initialize(listen_port_)) return false; @@ -101,7 +143,7 @@ bool WireguardProcessor::Start() { } if (tun_addr_.cidr >= 31) { - RERROR("The TAP driver is not compatible with Address using CIDR /31 or /32. Changing to /24"); + RERROR("TAP is not compatible CIDR /31 or /32. 
Changing to /24"); tun_addr_.cidr = 24; } @@ -110,7 +152,8 @@ bool WireguardProcessor::Start() { config.cidr = tun_addr_.cidr; config.mtu = mtu_; config.pre_post_commands = pre_post_; - + config.excluded_ips = excluded_ips_; + uint32 netmask = tun_addr_.cidr == 32 ? 0xffffffff : 0xffffffff << (32 - tun_addr_.cidr); uint32 ipv4_broadcast_addr = (netmask == 0xffffffff) ? 0xffffffff : config.ip | ~netmask; @@ -130,6 +173,7 @@ bool WireguardProcessor::Start() { config.default_route_endpoint_v4 = (peer->endpoint_.sin.sin_family == AF_INET) ? ReadBE32(&peer->endpoint_.sin.sin_addr) : 0; // Set the default route to something config.use_ipv4_default_route = true; + peer->allow_endpoint_change_ = false; } // Also configure ipv6 gw? @@ -139,6 +183,7 @@ bool WireguardProcessor::Start() { if (peer->endpoint_.sin.sin_family == AF_INET6) memcpy(&config.default_route_endpoint_v6, &peer->endpoint_.sin6.sin6_addr, 16); config.use_ipv6_default_route = true; + peer->allow_endpoint_change_ = false; } } @@ -158,7 +203,8 @@ bool WireguardProcessor::Start() { uint8 dhcp_options[6]; - config.block_dns_on_adapters = dns_blocking_; + config.block_dns_on_adapters = dns_blocking_ && ((config.use_ipv4_default_route && dns_addr_.sin.sin_family == AF_INET) || + (config.use_ipv6_default_route && dns6_addr_.sin6.sin6_family == AF_INET6)); config.internet_blocking = internet_blocking_; if (dns_addr_.sin.sin_family == AF_INET) { @@ -187,7 +233,7 @@ bool WireguardProcessor::Start() { peer->ipv4_broadcast_addr_ = ipv4_broadcast_addr; if (peer->endpoint_.sin.sin_family != 0) { RINFO("Sending handshake..."); - SendHandshakeInitiationAndResetRetries(peer); + SendHandshakeInitiation(peer); } } @@ -222,10 +268,8 @@ struct ICMPv6NaPacketWithoutTarget { uint8 reserved[3]; uint8 target[16]; }; - #pragma pack (pop) - static uint16 ComputeIcmpv6Checksum(const uint8 *buf, int buf_size, const uint8 src_addr[16], const uint8 dst_addr[16]) { uint32 sum = 0; for (int i = 0; i < buf_size - 1; i += 2) @@ -242,28 
+286,25 @@ static uint16 ComputeIcmpv6Checksum(const uint8 *buf, int buf_size, const uint8 return ((uint16)~sum); } - bool WireguardProcessor::HandleIcmpv6NeighborSolicitation(const byte *data, size_t data_size) { if (data_size < 48 + 16) return false; // Filter out neighbor solicitation - if (data[40] != kICMPv6_NeighborSolicitation || data[41] != 0) - return false; - - if (!network_discovery_spoofing_) + if (data[40] != kICMPv6_NeighborSolicitation || data[41] != 0 || !network_discovery_spoofing_) return false; bool is_broadcast = true; - if (memcmp(data + 24, kIcmpv6NeighborMulticastPrefix, sizeof(kIcmpv6NeighborMulticastPrefix)) != 0) { if (memcmp(data + 24, data + 48, 16) != 0) return false; is_broadcast = false; } - + // Target address must match a peer's range. + WG_ACQUIRE_RWLOCK_SHARED(dev_.ip_to_peer_map_lock_); WgPeer *peer = (WgPeer*)dev_.ip_to_peer_map().LookupV6(data + 48); + WG_RELEASE_RWLOCK_SHARED(dev_.ip_to_peer_map_lock_) if (peer == NULL) return false; @@ -273,8 +314,7 @@ bool WireguardProcessor::HandleIcmpv6NeighborSolicitation(const byte *data, size return false; byte *odata = out->data; - - int packet_size = is_broadcast ? sizeof(ICMPv6NaPacket) : sizeof(ICMPv6NaPacketWithoutTarget); + size_t packet_size = is_broadcast ? 
sizeof(ICMPv6NaPacket) : sizeof(ICMPv6NaPacketWithoutTarget); memcpy(odata, data, 4); WriteBE16(odata + 4, packet_size); @@ -298,10 +338,10 @@ bool WireguardProcessor::HandleIcmpv6NeighborSolicitation(const byte *data, size // For some reason this is openvpn's 'related mac' ((ICMPv6NaPacket*)(odata + 40))->target_mac[2] += 1; } - uint16 checksum = ComputeIcmpv6Checksum(odata + 40, packet_size, odata + 8, odata + 24); + uint16 checksum = ComputeIcmpv6Checksum(odata + 40, (int)packet_size, odata + 8, odata + 24); WriteBE16(&((ICMPv6NaPacket*)(odata + 40))->checksum, checksum); - out->size = 40 + packet_size; + out->size = (unsigned)(40 + packet_size); tun_->WriteTunPacket(out); return true; } @@ -317,9 +357,6 @@ void WireguardProcessor::HandleTunPacket(Packet *packet) { unsigned ip_version, size_from_header; WgPeer *peer; - stats_.tun_bytes_in += data_size; - stats_.tun_packets_in++; - // Sanity check that it looks like a valid ipv4 or ipv6 packet, // and determine the destination peer from the ip header if (data_size < IPV4_HEADER_SIZE) @@ -328,7 +365,9 @@ void WireguardProcessor::HandleTunPacket(Packet *packet) { ip_version = *data >> 4; if (ip_version == 4) { uint32 ip = ReadBE32(data + 16); + WG_ACQUIRE_RWLOCK_SHARED(dev_.ip_to_peer_map_lock_); peer = (WgPeer*)dev_.ip_to_peer_map().LookupV4(ip); + WG_RELEASE_RWLOCK_SHARED(dev_.ip_to_peer_map_lock_) if (peer == NULL) goto getout; if ((ip >= (224 << 24) || ip == peer->ipv4_broadcast_addr_) && !peer->allow_multicast_through_peer_) @@ -346,7 +385,9 @@ void WireguardProcessor::HandleTunPacket(Packet *packet) { if (data[6] == kIpProto_ICMPv6 && HandleIcmpv6NeighborSolicitation(data, data_size)) goto getout; + WG_ACQUIRE_RWLOCK_SHARED(dev_.ip_to_peer_map_lock_); peer = (WgPeer*)dev_.ip_to_peer_map().LookupV6(data + 24); + WG_RELEASE_RWLOCK_SHARED(dev_.ip_to_peer_map_lock_) if (peer == NULL) goto getout; @@ -359,10 +400,10 @@ void WireguardProcessor::HandleTunPacket(Packet *packet) { } if (size_from_header > data_size) 
goto getout; - if (peer->endpoint_.sin.sin_family == 0) - goto getout; - WritePacketToUdp(peer, packet); + // WriteAndEncryptPacketToUdp needs a held lock + WG_ACQUIRE_LOCK(peer->mutex_); + WriteAndEncryptPacketToUdp_WillUnlock(peer, packet); return; getout: @@ -370,25 +411,52 @@ getout: FreePacket(packet); } -void WireguardProcessor::WritePacketToUdp(WgPeer *peer, Packet *packet) { - byte *data = packet->data; - size_t size = packet->size; +void WgPeer::AddPacketToPeerQueue(Packet *packet) { + assert(IsPeerLocked()); + // Keep only the first MAX_QUEUED_PACKETS packets. + while (num_queued_packets_ >= MAX_QUEUED_PACKETS_PER_PEER) { + Packet *packet = first_queued_packet_; + first_queued_packet_ = packet->next; + num_queued_packets_--; + FreePacket(packet); + } + // Add the packet to the out queue that will get sent once handshake completes + *last_queued_packet_ptr_ = packet; + last_queued_packet_ptr_ = &packet->next; + packet->next = NULL; + num_queued_packets_++; +} + +// This function must be called with the peer lock held. 
It will remove the lock +void WireguardProcessor::WriteAndEncryptPacketToUdp_WillUnlock(WgPeer *peer, Packet *packet) { + assert(peer->IsPeerLocked()); + uint8 *data = packet->data, *ad; + size_t size = packet->size, ad_len, orig_size = size; bool want_handshake; + WgKeypair *keypair; uint64 send_ctr; - WgKeypair *keypair = peer->curr_keypair_; - - if (keypair == NULL || - keypair->send_key_state == WgKeypair::KEY_INVALID || - keypair->send_ctr >= REJECT_AFTER_MESSAGES) - goto getout_handshake; - - want_handshake = (keypair->send_ctr >= REKEY_AFTER_MESSAGES || - keypair->send_key_state == WgKeypair::KEY_WANT_REFRESH); // Ensure packet will fit including the biggest padding - if (size > kPacketCapacity - 15 - CHACHA20POLY1305_AUTHTAGLEN) + if (peer->endpoint_.sin.sin_family == 0 || + size > kPacketCapacity - 15 - CHACHA20POLY1305_AUTHTAGLEN) goto getout_discard; + if ((keypair = peer->curr_keypair_) == NULL || + (send_ctr = keypair->send_ctr) >= REJECT_AFTER_MESSAGES) { + peer->AddPacketToPeerQueue(packet); + WG_RELEASE_LOCK(peer->mutex_); + ScheduleNewHandshake(peer); + return; + } + + stats_.tun_bytes_in += size; + stats_.tun_packets_in++; + + want_handshake = (send_ctr >= REKEY_AFTER_MESSAGES || + keypair->send_key_state == WgKeypair::KEY_WANT_REFRESH); + keypair->send_ctr = send_ctr + 1; + packet->addr = peer->endpoint_; + if (size == 0) { peer->OnKeepaliveSent(); } else { @@ -416,7 +484,6 @@ add_padding: size += padding; } } - send_ctr = keypair->send_ctr++; #if WITH_SHORT_HEADERS if (keypair->enabled_features[WG_FEATURE_ID_SHORT_HEADER]) { @@ -434,8 +501,9 @@ add_padding: WriteLE32(write -= 4, (uint32)next_expected_packet); inner_tag = WG_ACK_HEADER_COUNTER_4; } else { - WriteLE64(write -= 8, next_expected_packet); - inner_tag = WG_ACK_HEADER_COUNTER_8; + WriteLE32(write -= 4, (uint32)next_expected_packet); + WriteLE16(write -= 2, (uint16)(next_expected_packet>>32)); + inner_tag = WG_ACK_HEADER_COUNTER_6; } if (keypair->broadcast_short_key != 0) { inner_tag += 
keypair->addr_entry_slot; @@ -448,6 +516,7 @@ add_padding: *--write = keypair->addr_entry_slot; tag += WG_SHORT_HEADER_ACK; } + byte *write_after_ack_header = write; // Determine the distance from the most recently acked packet, // be conservative when picking a suitable packet length to send. @@ -471,61 +540,54 @@ add_padding: WriteLE32(write -= 4, keypair->remote_key_id); *--write = tag; + // Not using any fields from now on + WG_RELEASE_LOCK(peer->mutex_); header_size = data - write; - stats_.compression_wg_saved_out += (int64)16 - header_size; - packet->data = data - header_size; packet->size = (int)(size + header_size + keypair->auth_tag_length); - WgKeypairEncryptPayload(data, size, write, data - write, send_ctr, keypair); + + // todo: figure out what to actually use as ad. + ad = write_after_ack_header; + ad_len = data - write_after_ack_header; } else { need_big_packet: #else { #endif // #if WITH_SHORT_HEADERS + // Not using any fields from now on + WG_RELEASE_LOCK(peer->mutex_); + ((MessageData*)data)[-1].type = ToLE32(MESSAGE_DATA); ((MessageData*)data)[-1].receiver_id = keypair->remote_key_id; ((MessageData*)data)[-1].counter = ToLE64(send_ctr); packet->data = data - sizeof(MessageData); packet->size = (int)(size + sizeof(MessageData) + keypair->auth_tag_length); - WgKeypairEncryptPayload(data, size, NULL, 0, send_ctr, keypair); + ad = NULL; + ad_len = 0; } - packet->addr = peer->endpoint_; + WgKeypairEncryptPayload(data, size, ad, ad_len, send_ctr, keypair); + DoWriteUdpPacket(packet); if (want_handshake) - SendHandshakeInitiationAndResetRetries(peer); + ScheduleNewHandshake(peer); return; getout_discard: + WG_RELEASE_LOCK(peer->mutex_); FreePacket(packet); return; - -getout_handshake: - // Keep only the first MAX_QUEUED_PACKETS packets. 
- while (peer->num_queued_packets_ >= MAX_QUEUED_PACKETS_PER_PEER) { - Packet *packet = peer->first_queued_packet_; - peer->first_queued_packet_ = packet->next; - peer->num_queued_packets_--; - FreePacket(packet); - } - // Add the packet to the out queue that will get sent once handshake completes - *peer->last_queued_packet_ptr_ = packet; - peer->last_queued_packet_ptr_ = &packet->next; - packet->next = NULL; - peer->num_queued_packets_++; - - SendHandshakeInitiationAndResetRetries(peer); } // This scrambles the initial 16 bytes of the packet with the -// trailing 8 bytes of the packet. +// next 8 bytes of the packet as a seed. static void ScrambleUnscramblePacket(Packet *packet, ScramblerSiphashKeys *keys) { uint8 *data = packet->data; size_t data_size = packet->size; - if (data_size < 8) + if (data_size <= 8) return; uint64 last_uint64 = ReadLE64(data_size >= 24 ? data + 16 : data + data_size - 8); @@ -537,10 +599,12 @@ static void ScrambleUnscramblePacket(Packet *packet, ScramblerSiphashKeys *keys) ((uint64*)data)[0] ^= a; ((uint64*)data)[1] ^= b; } else { - struct { uint64 a, b; } scramblers = {a, b}; - uint8 *s = (uint8*)&scramblers; + union { + uint64 d[2]; + uint8 s[16]; + } scrambler = {{a,b}}; for (size_t i = 0; i < data_size - 8; i++) - data[i] ^= s[i]; + data[i] ^= scrambler.s[i]; } } @@ -560,38 +624,81 @@ void WireguardProcessor::DoWriteUdpPacket(Packet *packet) { ScrambleUnscrambleAndWrite(packet, &dev_.header_obfuscation_key_, udp_); } -void WireguardProcessor::SendHandshakeInitiationAndResetRetries(WgPeer *peer) { - peer->handshake_attempts_ = 0; - SendHandshakeInitiation(peer); +void WireguardProcessor::ScheduleNewHandshake(WgPeer *peer) { + if (peer->main_thread_scheduled_.fetch_or(WgPeer::kMainThreadScheduled_ScheduleHandshake) == 0) { + peer->main_thread_scheduled_next_ = NULL; + WG_ACQUIRE_LOCK(main_thread_scheduled_lock_); + *main_thread_scheduled_last_ = peer; + main_thread_scheduled_last_ = &peer->main_thread_scheduled_next_; + 
WG_RELEASE_LOCK(main_thread_scheduled_lock_); + // todo: in multithreaded impl need to trigger |RunAllMainThreadScheduled| to get called + } +} + +void WireguardProcessor::RunAllMainThreadScheduled() { + assert(dev_.IsMainThread()); + + if (main_thread_scheduled_ == NULL) + return; + + WG_ACQUIRE_LOCK(main_thread_scheduled_lock_); + WgPeer *peer = main_thread_scheduled_; + main_thread_scheduled_ = NULL; + main_thread_scheduled_last_ = &main_thread_scheduled_; + WG_RELEASE_LOCK(main_thread_scheduled_lock_); + + while (peer) { + // todo: for the multithreaded use case figure out whether to use atomic_thread_fence here. + WgPeer *next = peer->main_thread_scheduled_next_; + uint32 ev = peer->main_thread_scheduled_.exchange(0); + if (ev & WgPeer::kMainThreadScheduled_ScheduleHandshake) { + peer->handshake_attempts_ = 0; + SendHandshakeInitiation(peer); + } + peer = next; + } } void WireguardProcessor::SendHandshakeInitiation(WgPeer *peer) { - // Send out a handshake init packet to trigger the handshake procedure + assert(dev_.IsMainThread()); + if (!peer->CheckHandshakeRateLimit()) return; + stats_.handshakes_out++; Packet *packet = AllocPacket(); - if (!packet) - return; - peer->CreateMessageHandshakeInitiation(packet); + if (packet) { + peer->CreateMessageHandshakeInitiation(packet); + WG_ACQUIRE_LOCK(peer->mutex_); + int attempts = ++peer->total_handshake_attempts_; + if (procdel_) + procdel_->OnConnectionRetry(attempts); + peer->OnHandshakeInitSent(); + packet->addr = peer->endpoint_; + WG_RELEASE_LOCK(peer->mutex_); + DoWriteUdpPacket(packet); + if (attempts > 1 && attempts <= 20) + RINFO("Retrying handshake, attempt %d...%s", attempts, (attempts == 20) ? 
" (last notice)" : ""); + } +} - packet->addr = peer->endpoint_; - DoWriteUdpPacket(packet); - peer->OnHandshakeInitSent(); +bool WireguardProcessor::IsMainThreadPacket(Packet *packet) { + // TODO(ludde): Support header obfuscation + return packet->size == 0 || (packet->data[0] != MESSAGE_DATA && !(packet->data[0] & WG_SHORT_HEADER_BIT)); } // Handles an incoming WireGuard packet from the UDP side, decrypt etc. void WireguardProcessor::HandleUdpPacket(Packet *packet, bool overload) { uint32 type; - stats_.udp_bytes_in += packet->size; - stats_.udp_packets_in++; - // Unscramble incoming packets #if WITH_HEADER_OBFUSCATION if (dev_.header_obfuscation_) ScrambleUnscramblePacket(packet, &dev_.header_obfuscation_key_); #endif // WITH_HEADER_OBFUSCATION + stats_.udp_bytes_in += packet->size; + stats_.udp_packets_in++; + if (packet->size < sizeof(uint32)) goto invalid_size; type = ReadLE32((uint32*)packet->data); @@ -604,22 +711,23 @@ void WireguardProcessor::HandleUdpPacket(Packet *packet, bool overload) { HandleShortHeaderFormatPacket(type, packet); #endif // WITH_SHORT_HEADERS } else if (type == MESSAGE_HANDSHAKE_COOKIE) { + assert(dev_.IsMainThread()); if (packet->size != sizeof(MessageHandshakeCookie)) goto invalid_size; HandleHandshakeCookiePacket(packet); } else if (type == MESSAGE_HANDSHAKE_INITIATION) { + assert(dev_.IsMainThread()); if (WITH_HANDSHAKE_EXT ? (packet->size < sizeof(MessageHandshakeInitiation)) : (packet->size != sizeof(MessageHandshakeInitiation))) goto invalid_size; - - if (!CheckIncomingHandshakeRateLimit(packet, overload)) - return; - HandleHandshakeInitiationPacket(packet); + stats_.handshakes_in++; + if (CheckIncomingHandshakeRateLimit(packet, overload)) + HandleHandshakeInitiationPacket(packet); } else if (type == MESSAGE_HANDSHAKE_RESPONSE) { + assert(dev_.IsMainThread()); if (WITH_HANDSHAKE_EXT ? 
(packet->size < sizeof(MessageHandshakeResponse)) : (packet->size != sizeof(MessageHandshakeResponse))) goto invalid_size; - if (!CheckIncomingHandshakeRateLimit(packet, overload)) - return; - HandleHandshakeResponsePacket(packet); + if (CheckIncomingHandshakeRateLimit(packet, overload)) + HandleHandshakeResponsePacket(packet); } else { // unknown packet invalid_size: @@ -628,7 +736,7 @@ invalid_size: } // Returns nonzero if two endpoints are different. -static uint32 CompareEndpoint(const IpAddr *a, const IpAddr *b) { +static uint32 CompareIpAddr(const IpAddr *a, const IpAddr *b) { uint32 rv = b->sin.sin_family ^ a->sin.sin_family; if (b->sin.sin_family != AF_INET6) { rv |= b->sin.sin_addr.s_addr ^ a->sin.sin_addr.s_addr; @@ -642,9 +750,10 @@ static uint32 CompareEndpoint(const IpAddr *a, const IpAddr *b) { return rv; } -void WgPeer::CopyEndpointToPeer(WgKeypair *keypair, const IpAddr *addr) { +void WgPeer::CopyEndpointToPeer_Locked(WgKeypair *keypair, const IpAddr *addr) { // Remember how to send packets to this peer - if (CompareEndpoint(&keypair->peer->endpoint_, addr)) { + if (keypair->peer->allow_endpoint_change_ && + CompareIpAddr(&keypair->peer->endpoint_, addr)) { #if WITH_SHORT_HEADERS // When the endpoint changes, forget about using the short key. keypair->broadcast_short_key = 0; @@ -660,28 +769,21 @@ void WireguardProcessor::HandleShortHeaderFormatPacket(uint32 tag, Packet *packe size_t bytes_left = packet->size - 1; WgKeypair *keypair; uint64 counter, acked_counter; - uint8 ack_tag; + uint8 ack_tag, *ack_start; if ((tag & WG_SHORT_HEADER_KEY_ID_MASK) == 0x00) { // The key_id is explicitly included in the packet. 
if (bytes_left < 4) goto getout; uint32 key_id = ReadLE32(data); data += 4, bytes_left -= 4; - auto it = dev_.key_id_lookup().find(key_id); - if (it == dev_.key_id_lookup().end()) goto getout; - keypair = it->second.second; + keypair = dev_.LookupKeypairByKeyId(key_id); } else { // Lookup the packet source ip and port in the address mapping uint64 addr_id = packet->addr.sin.sin_addr.s_addr | ((uint64)packet->addr.sin.sin_port << 32); - auto it = dev_.addr_entry_map().find(addr_id); - if (it == dev_.addr_entry_map().end()) - goto getout; - WgAddrEntry *addr_entry = it->second; - keypair = addr_entry->keys[((tag / WG_SHORT_HEADER_KEY_ID) & 3) - 1]; + keypair = dev_.LookupKeypairInAddrEntryMap(addr_id, ((tag / WG_SHORT_HEADER_KEY_ID) & 3) - 1); } - if (!keypair || keypair->recv_key_state == WgKeypair::KEY_INVALID || - !keypair->enabled_features[WG_FEATURE_ID_SHORT_HEADER]) + if (!keypair || !keypair->enabled_features[WG_FEATURE_ID_SHORT_HEADER]) goto getout; // Pick the closest possible counter value with the same low bits. @@ -709,11 +811,13 @@ void WireguardProcessor::HandleShortHeaderFormatPacket(uint32 tag, Packet *packe acked_counter = 0; ack_tag = 0; + ack_start = data; // If the acknowledge header is present, then parse it so we may // get an ack for the highest seen packet. 
if (tag & WG_SHORT_HEADER_ACK) { if (bytes_left == 0) goto getout; ack_tag = *data; + if (ack_tag & 0xF0) goto getout; // undefined bits data += 1, bytes_left -= 1; switch (ack_tag & WG_ACK_HEADER_COUNTER_MASK) { @@ -727,83 +831,104 @@ void WireguardProcessor::HandleShortHeaderFormatPacket(uint32 tag, Packet *packe acked_counter = ReadLE32(data); data += 4, bytes_left -= 4; break; - case WG_ACK_HEADER_COUNTER_8: - if (bytes_left < 8) goto getout; - acked_counter = ReadLE64(data); - data += 8, bytes_left -= 8; + case WG_ACK_HEADER_COUNTER_6: + if (bytes_left < 6) goto getout; + acked_counter = ReadLE32(data) | ((uint64)ReadLE16(data + 4) << 32); + data += 6, bytes_left -= 6; break; default: - break; + goto getout; } } if (counter >= REJECT_AFTER_MESSAGES) goto getout; // Authenticate the packet before we can apply the state changes. - if (!WgKeypairDecryptPayload(data, bytes_left, packet->data, data - packet->data, counter, keypair)) + if (!WgKeypairDecryptPayload(data, bytes_left, ack_start, data - ack_start, counter, keypair)) goto getout; + WG_ACQUIRE_LOCK(keypair->peer->mutex_); + + if (keypair->recv_key_state == WgKeypair::KEY_INVALID) + goto getout_unlock; + if (!keypair->replay_detector.CheckReplay(counter)) - goto getout; + goto getout_unlock; stats_.compression_wg_saved_in += 16 - (data - packet->data); keypair->send_ctr_acked = std::max(keypair->send_ctr_acked, acked_counter); keypair->incoming_packet_count++; - WgPeer::CopyEndpointToPeer(keypair, &packet->addr); + WgPeer::CopyEndpointToPeer_Locked(keypair, &packet->addr); // Periodically broadcast out the short key if ((tag & WG_SHORT_HEADER_KEY_ID_MASK) == 0x00 && !keypair->did_attempt_remember_ip_port) { keypair->did_attempt_remember_ip_port = true; if (keypair->enabled_features[WG_FEATURE_ID_SKIP_KEYID_IN]) { uint64 addr_id = packet->addr.sin.sin_addr.s_addr | ((uint64)packet->addr.sin.sin_port << 32); - dev_.UpdateKeypairAddrEntry(addr_id, keypair); + dev_.UpdateKeypairAddrEntry_Locked(addr_id, 
keypair); } } - // Ack header may also signal that we can omit the key id in packets from now on. if (tag & WG_SHORT_HEADER_ACK) keypair->can_use_short_key_for_outgoing = (ack_tag & WG_ACK_HEADER_KEY_MASK) * WG_SHORT_HEADER_KEY_ID; - HandleAuthenticatedDataPacket(keypair, packet, data, bytes_left - keypair->auth_tag_length); + HandleAuthenticatedDataPacket_WillUnlock(keypair, packet, data, bytes_left - keypair->auth_tag_length); return; +getout_unlock: + WG_RELEASE_LOCK(keypair->peer->mutex_); getout: FreePacket(packet); return; } #endif // WITH_SHORT_HEADERS -void WireguardProcessor::HandleAuthenticatedDataPacket(WgKeypair *keypair, Packet *packet, uint8 *data, size_t data_size) { +void WireguardProcessor::NotifyHandshakeComplete() { + uint64 now = OsGetMilliseconds(); + + // todo: should lock something + stats_.last_complete_handshake_timestamp = now; + if (stats_.first_complete_handshake_timestamp == 0) + stats_.first_complete_handshake_timestamp = now; + + if (procdel_) + procdel_->OnConnected(); +} + +void WireguardProcessor::HandleAuthenticatedDataPacket_WillUnlock(WgKeypair *keypair, Packet *packet, uint8 *data, size_t data_size) { WgPeer *peer = keypair->peer; + assert(peer->IsPeerLocked()); // Promote the next key to the current key when we receive a data packet, // the handshake is now complete. 
- if (peer->CheckSwitchToNextKey(keypair)) { - if (procdel_) { - procdel_->OnConnected(ReadBE32(tun_addr_.addr)); - } + if (peer->CheckSwitchToNextKey_Locked(keypair)) { + stats_.handshakes_in_success++; peer->OnHandshakeFullyComplete(); - SendQueuedPackets(peer); + NotifyHandshakeComplete(); + SendQueuedPackets_Locked(peer); } // Refresh when current key gets too old - if (peer->curr_keypair_ && peer->curr_keypair_->recv_key_state == WgKeypair::KEY_WANT_REFRESH) { - peer->curr_keypair_->recv_key_state = WgKeypair::KEY_DID_REFRESH; - SendHandshakeInitiationAndResetRetries(peer); + WgKeypair *curr_keypair = peer->curr_keypair_; + if (curr_keypair && curr_keypair->recv_key_state == WgKeypair::KEY_WANT_REFRESH) { + curr_keypair->recv_key_state = WgKeypair::KEY_DID_REFRESH; + ScheduleNewHandshake(peer); } if (data_size == 0) { peer->OnKeepaliveReceived(); + WG_RELEASE_LOCK(peer->mutex_); goto getout; } peer->OnDataReceived(); + WG_RELEASE_LOCK(peer->mutex_); #if WITH_HANDSHAKE_EXT // Unpack the packet headers using ipzip if (keypair->enabled_features[WG_FEATURE_ID_IPZIP]) { uint32 rv = IpzipDecompress(data, (uint32)data_size, &keypair->ipzip_state_, IPZIP_RECV_BY_CLIENT); if (rv == (uint32)-1) - goto getout; // ipzip failed decompress + goto getout; stats_.compression_hdr_saved_in += (int64)rv - data_size; data -= (int64)rv - data_size, data_size = rv; } @@ -816,36 +941,30 @@ void WireguardProcessor::HandleAuthenticatedDataPacket(WgKeypair *keypair, Packe ip_version = *data >> 4; if (ip_version == 4) { - if (data_size < IPV4_HEADER_SIZE) { - // too small ipv4 header - goto getout; - } + if (data_size < IPV4_HEADER_SIZE) + goto getout_error_header; + WG_ACQUIRE_RWLOCK_SHARED(dev_.ip_to_peer_map_lock_); peer_from_header = (WgPeer*)dev_.ip_to_peer_map().LookupV4(ReadBE32(data + 12)); + WG_RELEASE_RWLOCK_SHARED(dev_.ip_to_peer_map_lock_) size_from_header = ReadBE16(data + 2); if (size_from_header < IPV4_HEADER_SIZE) { // too small packet? 
- goto getout; + goto getout_error_header; } } else if (ip_version == 6) { - if (data_size < IPV6_HEADER_SIZE) { - // too small ipv6 header - goto getout; - } + if (data_size < IPV6_HEADER_SIZE) + goto getout_error_header; + WG_ACQUIRE_RWLOCK_SHARED(dev_.ip_to_peer_map_lock_); peer_from_header = (WgPeer*)dev_.ip_to_peer_map().LookupV6(data + 8); + WG_RELEASE_RWLOCK_SHARED(dev_.ip_to_peer_map_lock_) size_from_header = IPV6_HEADER_SIZE + ReadBE16(data + 4); } else { // invalid ip version - goto getout; + goto getout_error_header; } - if (size_from_header > data_size) { - // oversized packet? - goto getout; - } - if (peer_from_header != peer) { - // source address mismatch? - goto getout; - } - //RINFO("Outgoing TUN packet of size %d", (int)size_from_header); + if (peer_from_header != peer || size_from_header > data_size) + goto getout_error_header; + packet->data = data; packet->size = size_from_header; @@ -855,9 +974,10 @@ void WireguardProcessor::HandleAuthenticatedDataPacket(WgKeypair *keypair, Packe tun_->WriteTunPacket(packet); return; +getout_error_header: + stats_.error_header++; getout: FreePacket(packet); - return; } void WireguardProcessor::HandleDataPacket(Packet *packet) { @@ -865,29 +985,33 @@ void WireguardProcessor::HandleDataPacket(Packet *packet) { size_t data_size = packet->size; uint32 key_id = ((MessageData*)data)->receiver_id; uint64 counter = ToLE64((((MessageData*)data)->counter)); - WgKeypair *keypair; - - auto it = dev_.key_id_lookup().find(key_id); - if (it == dev_.key_id_lookup().end() || - (keypair = it->second.second) == NULL || - keypair->recv_key_state == WgKeypair::KEY_INVALID) { + WgKeypair *keypair = dev_.LookupKeypairByKeyId(key_id); + if (keypair == NULL || counter >= REJECT_AFTER_MESSAGES) { + stats_.error_key_id++; getout: FreePacket(packet); return; } - if (counter >= REJECT_AFTER_MESSAGES) - goto getout; - if (!WgKeypairDecryptPayload(data + sizeof(MessageData), data_size - sizeof(MessageData), - NULL, 0, counter, keypair)) { + 
NULL, 0, counter, keypair)) { + stats_.error_mac++; goto getout; } - if (!keypair->replay_detector.CheckReplay(counter)) - goto getout; - WgPeer::CopyEndpointToPeer(keypair, &packet->addr); - HandleAuthenticatedDataPacket(keypair, packet, data + sizeof(MessageData), data_size - sizeof(MessageData) - keypair->auth_tag_length); + WG_ACQUIRE_LOCK(keypair->peer->mutex_); + if (keypair->recv_key_state == WgKeypair::KEY_INVALID) { + stats_.error_key_id++; + WG_RELEASE_LOCK(keypair->peer->mutex_); + goto getout; + } else if (!keypair->replay_detector.CheckReplay(counter)) { + stats_.error_duplicate++; + WG_RELEASE_LOCK(keypair->peer->mutex_); + goto getout; + } else { + WgPeer::CopyEndpointToPeer_Locked(keypair, &packet->addr); + HandleAuthenticatedDataPacket_WillUnlock(keypair, packet, data + sizeof(MessageData), data_size - sizeof(MessageData) - keypair->auth_tag_length); + } } static uint64 GetIpForRateLimit(Packet *packet) { @@ -899,54 +1023,55 @@ static uint64 GetIpForRateLimit(Packet *packet) { } bool WireguardProcessor::CheckIncomingHandshakeRateLimit(Packet *packet, bool overload) { + assert(dev_.IsMainThread()); WgRateLimit::RateLimitResult rr = dev_.rate_limiter()->CheckRateLimit(GetIpForRateLimit(packet)); if ((overload && rr.is_rate_limited()) || !dev_.CheckCookieMac1(packet)) { FreePacket(packet); return false; } + dev_.rate_limiter()->CommitResult(rr); if (overload && !rr.is_first_ip() && !dev_.CheckCookieMac2(packet)) { - dev_.rate_limiter()->CommitResult(rr); dev_.CreateCookieMessage((MessageHandshakeCookie*)packet->data, packet, ((MessageHandshakeInitiation*)packet->data)->sender_key_id); packet->size = sizeof(MessageHandshakeCookie); DoWriteUdpPacket(packet); return false; } - dev_.rate_limiter()->CommitResult(rr); return true; } // server receives this when client wants to setup a session void WireguardProcessor::HandleHandshakeInitiationPacket(Packet *packet) { + assert(dev_.IsMainThread()); WgPeer *peer = WgPeer::ParseMessageHandshakeInitiation(&dev_, 
packet); - if (!peer) { + if (peer) { + DoWriteUdpPacket(packet); + } else { FreePacket(packet); - return; } - peer->OnHandshakeAuthComplete(); - DoWriteUdpPacket(packet); } // client receives this after session is established void WireguardProcessor::HandleHandshakeResponsePacket(Packet *packet) { + assert(dev_.IsMainThread()); WgPeer *peer = WgPeer::ParseMessageHandshakeResponse(&dev_, packet); - if (!peer) { - FreePacket(packet); - return; + if (peer) { + stats_.handshakes_out_success++; + WG_SCOPED_LOCK(peer->mutex_); + if (peer->allow_endpoint_change_) + peer->endpoint_ = packet->addr; + peer->OnHandshakeAuthComplete(); + peer->OnHandshakeFullyComplete(); + NotifyHandshakeComplete(); + SendKeepalive_Locked(peer); } - peer->endpoint_ = packet->addr; FreePacket(packet); - peer->OnHandshakeAuthComplete(); - peer->OnHandshakeFullyComplete(); - if (procdel_) - procdel_->OnConnected(ReadBE32(tun_addr_.addr)); - SendKeepalive(peer); } -void WireguardProcessor::SendKeepalive(WgPeer *peer) { +void WireguardProcessor::SendKeepalive_Locked(WgPeer *peer) { + assert(dev_.IsMainThread() && peer->IsPeerLocked()); // can't send keepalive if no endpoint is configured if (peer->endpoint_.sin.sin_family == 0) return; - // If nothing is queued, insert a keepalive packet if (peer->first_queued_packet_ == NULL) { Packet *packet = AllocPacket(); @@ -956,43 +1081,70 @@ void WireguardProcessor::SendKeepalive(WgPeer *peer) { packet->next = NULL; peer->first_queued_packet_ = packet; } - SendQueuedPackets(peer); + SendQueuedPackets_Locked(peer); } -void WireguardProcessor::SendQueuedPackets(WgPeer *peer) { - // Steal the packets +void WireguardProcessor::SendQueuedPackets_Locked(WgPeer *peer) { + assert(peer->IsPeerLocked()); + // Steal the queue of all packets and send them all. 
Packet *packet = peer->first_queued_packet_; peer->first_queued_packet_ = NULL; peer->last_queued_packet_ptr_ = &peer->first_queued_packet_; peer->num_queued_packets_ = 0; - while (packet) { + while (packet != NULL) { Packet *next = packet->next; - WritePacketToUdp(peer, packet); + WriteAndEncryptPacketToUdp_WillUnlock(peer, packet); packet = next; + WG_ACQUIRE_LOCK(peer->mutex_); // WriteAndEncryptPacketToUdp_WillUnlock releases the lock } } void WireguardProcessor::HandleHandshakeCookiePacket(Packet *packet) { + assert(dev_.IsMainThread()); WgPeer::ParseMessageHandshakeCookie(&dev_, (MessageHandshakeCookie *)packet->data); } +// Only one thread may run the second loop void WireguardProcessor::SecondLoop() { + assert(dev_.IsMainThread()); uint64 now = OsGetMilliseconds(); + + uint64 bytes_in = stats_.tun_bytes_in - stats_last_bytes_in_; + uint64 bytes_out = stats_.tun_bytes_out - stats_last_bytes_out_; + + stats_last_bytes_in_ = stats_.tun_bytes_in; + stats_last_bytes_out_ = stats_.tun_bytes_out; + + uint64 millis = now - stats_last_ts_; + stats_last_ts_ = now; + + double f = 1000.0 / std::max((uint32)millis, 500); + + stats_.tun_bytes_in_per_second = (float)(bytes_in * f); + stats_.tun_bytes_out_per_second = (float)(bytes_out * f); + for (WgPeer *peer = dev_.first_peer(); peer; peer = peer->next_peer_) { + WgKeypair *keypair = peer->curr_keypair_; // Allow ip/port to be remembered again for this keypair - if (peer->curr_keypair_) - peer->curr_keypair_->did_attempt_remember_ip_port = false; + if (keypair) + keypair->did_attempt_remember_ip_port = false; - uint32 mask = peer->CheckTimeouts(now); - if (mask == 0) - continue; - if (mask & WgPeer::ACTION_SEND_KEEPALIVE) - SendKeepalive(peer); - if (mask & WgPeer::ACTION_SEND_HANDSHAKE) - SendHandshakeInitiation(peer); + // Avoid taking the lock if it seems unneccessary + if (now >= peer->time_of_next_key_event_ || peer->timers_ != 0) { + uint32 mask; + { + WG_SCOPED_LOCK(peer->mutex_); + mask = 
peer->CheckTimeouts(now); + if (mask == 0) + continue; + if (mask & WgPeer::ACTION_SEND_KEEPALIVE) + SendKeepalive_Locked(peer); + } + if (mask & WgPeer::ACTION_SEND_HANDSHAKE) + SendHandshakeInitiation(peer); + } } dev_.SecondLoop(now); } - diff --git a/wireguard.h b/wireguard.h index ef050c5..75a10a6 100644 --- a/wireguard.h +++ b/wireguard.h @@ -5,24 +5,50 @@ #include "tunsafe_types.h" #include "wireguard_proto.h" -struct ProcessorStats { - // Number of bytes sent/received over the physical UDP connections - int64 udp_bytes_in, udp_bytes_out; - int64 udp_packets_in, udp_packets_out; - // Number of bytes sent/received over the TUN interface - int64 tun_bytes_in, tun_bytes_out; - int64 tun_packets_in, tun_packets_out; - uint64 last_complete_handskake_timestamp; +// todo: for multithreaded use case need to use atomic ops. +struct WgProcessorStats { + // Number of bytes sent/received over the physical UDP connection + uint64 udp_bytes_in, udp_bytes_out; + uint64 udp_packets_in, udp_packets_out; + // Number of valid packets sent/received over the TUN interface + uint64 tun_bytes_in, tun_bytes_out; + uint64 tun_packets_in, tun_packets_out; + + // Error types + uint32 error_key_id; + uint32 error_mac; + uint32 error_duplicate; + uint32 error_source_addr; + uint32 error_header; + + // Current speed of TUN packets + float tun_bytes_in_per_second, tun_bytes_out_per_second; + + // Timestamp of handshakes + uint64 first_complete_handshake_timestamp; + uint64 last_complete_handshake_timestamp; + + // How much saved from header compression int64 compression_hdr_saved_in, compression_hdr_saved_out; - int64 compression_wg_saved_in, compression_wg_saved_out; + + // Number of handshakes received and sent + // Number of successful handshakes in and out + uint32 handshakes_in, handshakes_out; + uint32 handshakes_in_success, handshakes_out_success; + + // Key stuff + uint8 public_key[32]; + + // Address of the endpoint + IpAddr endpoint; }; class ProcessorDelegate { public: - 
virtual void OnConnected(in_addr_t my_ip) = 0; - virtual void OnDisconnected() = 0; + virtual void OnConnected() = 0; + virtual void OnConnectionRetry(uint32 attempts) = 0; }; enum InternetBlockState { @@ -42,62 +68,46 @@ public: WireguardProcessor(UdpInterface *udp, TunInterface *tun, ProcessorDelegate *procdel); ~WireguardProcessor(); - void SetListenPort(int listen_port) { - listen_port_ = listen_port; - } - - bool SetTunAddress(const WgCidrAddr &addr); - + void SetListenPort(int listen_port); bool AddDnsServer(const IpAddr &sin); - - void SetMtu(int mtu) { - if (mtu >= 576 && mtu <= 10000) - mtu_ = mtu; - } - - void SetAddRoutesMode(bool mode) { - add_routes_mode_ = mode; - } - - void SetDnsBlocking(bool dns_blocking) { - dns_blocking_ = dns_blocking; - } - - void SetInternetBlocking(InternetBlockState internet_blocking) { - internet_blocking_ = internet_blocking; - } - - void SetHeaderObfuscation(const char *key) { - dev_.SetHeaderObfuscation(key); - } + bool SetTunAddress(const WgCidrAddr &addr); + void AddExcludedIp(const WgCidrAddr &cidr_addr); + void SetMtu(int mtu); + void SetAddRoutesMode(bool mode); + void SetDnsBlocking(bool dns_blocking); + void SetInternetBlocking(InternetBlockState internet_blocking); + void SetHeaderObfuscation(const char *key); void HandleTunPacket(Packet *packet); void HandleUdpPacket(Packet *packet, bool overload); + static bool IsMainThreadPacket(Packet *packet); + void SecondLoop(); - ProcessorStats GetStats(); + WgProcessorStats GetStats(); void ResetStats(); bool Start(); WgDevice &dev() { return dev_; } - TunInterface::PrePostCommands &prepost() { return pre_post_; } + const WgCidrAddr &tun_addr() { return tun_addr_; } + void RunAllMainThreadScheduled(); private: void DoWriteUdpPacket(Packet *packet); - void WritePacketToUdp(WgPeer *peer, Packet *packet); + void WriteAndEncryptPacketToUdp_WillUnlock(WgPeer *peer, Packet *packet); void SendHandshakeInitiation(WgPeer *peer); - void 
SendHandshakeInitiationAndResetRetries(WgPeer *peer); - void SendKeepalive(WgPeer *peer); - void SendQueuedPackets(WgPeer *peer); + void ScheduleNewHandshake(WgPeer *peer); + void SendKeepalive_Locked(WgPeer *peer); + void SendQueuedPackets_Locked(WgPeer *peer); void HandleHandshakeInitiationPacket(Packet *packet); void HandleHandshakeResponsePacket(Packet *packet); void HandleHandshakeCookiePacket(Packet *packet); void HandleDataPacket(Packet *packet); - void HandleAuthenticatedDataPacket(WgKeypair *keypair, Packet *packet, uint8 *data, size_t data_size); + void HandleAuthenticatedDataPacket_WillUnlock(WgKeypair *keypair, Packet *packet, uint8 *data, size_t data_size); void HandleShortHeaderFormatPacket(uint32 tag, Packet *packet); @@ -106,6 +116,7 @@ private: bool HandleIcmpv6NeighborSolicitation(const byte *data, size_t data_size); void SetupCompressionHeader(WgPacketCompressionVer01 *c); + void NotifyHandshakeComplete(); int listen_port_; @@ -113,12 +124,13 @@ private: TunInterface *tun_; UdpInterface *udp_; int mtu_; - ProcessorStats stats_; + WgProcessorStats stats_; bool dns_blocking_; uint8 internet_blocking_; bool add_routes_mode_; bool network_discovery_spoofing_; + bool did_have_first_handshake_; uint8 network_discovery_mac_[6]; WgDevice dev_; @@ -129,5 +141,15 @@ private: IpAddr dns_addr_, dns6_addr_; TunInterface::PrePostCommands pre_post_; + + // Queue of things scheduled to run on the main thread. 
+ WG_DECLARE_LOCK(main_thread_scheduled_lock_); + WgPeer *main_thread_scheduled_, **main_thread_scheduled_last_; + + uint64 stats_last_bytes_in_, stats_last_bytes_out_; + uint64 stats_last_ts_; + + // IPs we want to map to the default route + std::vector excluded_ips_; }; diff --git a/wireguard_config.cpp b/wireguard_config.cpp index 3d51f62..dbc67b0 100644 --- a/wireguard_config.cpp +++ b/wireguard_config.cpp @@ -20,6 +20,10 @@ #include #endif +#if defined(OS_WIN) +#include "network_win32_dnsblock.h" +#endif + const char *print_ip_prefix(char buf[kSizeOfAddress], int family, const void *ip, int prefixlen) { if (!inet_ntop(family, ip, buf, kSizeOfAddress - 8)) { memcpy(buf, "unknown", 8); @@ -29,6 +33,17 @@ const char *print_ip_prefix(char buf[kSizeOfAddress], int family, const void *ip return buf; } +char *PrintIpAddr(const IpAddr &addr, char buf[kSizeOfAddress]) { + if (addr.sin.sin_family == AF_INET) { + print_ip_prefix(buf, addr.sin.sin_family, &addr.sin.sin_addr, -1); + } else if (addr.sin.sin_family == AF_INET6) { + print_ip_prefix(buf, addr.sin.sin_family, &addr.sin6.sin6_addr, -1); + } else { + buf[0] = 0; + } + return buf; +} + struct Addr { byte addr[4]; uint8 cidr; @@ -58,19 +73,71 @@ static bool ParseCidrAddr(char *s, WgCidrAddr *out) { return false; } -struct hostent *gethostbyname_retry_on_failure(const char * name, bool *exit_flag) { +DnsResolver::DnsResolver(DnsBlocker *dns_blocker) { + dns_blocker_ = dns_blocker; + abort_flag_ = false; +} + +DnsResolver::~DnsResolver() { +} + +void DnsResolver::ClearCache() { + cache_.clear(); +} + +bool DnsResolver::Resolve(const char *hostname, IpAddr *result) { int attempt = 0; - static const uint8 retry_delays[] = {1, 2, 3, 5, 10, 20, 40, 60}; + static const uint8 retry_delays[] = {1, 2, 3, 5, 10}; + char buf[kSizeOfAddress]; + + memset(result, 0, sizeof(IpAddr)); + if (inet_pton(AF_INET6, hostname, &result->sin6.sin6_addr) == 1) { + result->sin.sin_family = AF_INET6; + return true; + } + + if 
(inet_pton(AF_INET, hostname, &result->sin.sin_addr) == 1) { + result->sin.sin_family = AF_INET; + return true; + } + + // First check cache + for (auto it = cache_.begin(); it != cache_.end(); ++it) { + if (it->name == hostname) { + + *result = it->ip; + RINFO("Resolved %s to %s%s", hostname, PrintIpAddr(*result, buf), " (cached)"); + return true; + } + } + +#if defined(OS_WIN) + // Then disable dns blocker (otherwise the windows dns client service can't resolve) + if (dns_blocker_ && dns_blocker_->IsActive()) { + RINFO("Disabling DNS blocker to resolve %s", hostname); + dns_blocker_->RestoreDns(); + } +#endif // defined(OS_WIN) for (;;) { - hostent *he = gethostbyname(name); - if (he || exit_flag == NULL || *exit_flag) - return he; + hostent *he = gethostbyname(hostname); + if (abort_flag_) + return false; - RINFO("Unable to resolve %s. Trying again in %d second(s)", name, retry_delays[attempt]); + if (he) { + result->sin.sin_family = AF_INET; + result->sin.sin_port = 0; + memcpy(&result->sin.sin_addr, he->h_addr_list[0], 4); + // add to cache + cache_.emplace_back(hostname, *result); + RINFO("Resolved %s to %s%s", hostname, PrintIpAddr(*result, buf), ""); + return true; + } + + RINFO("Unable to resolve %s. 
Trying again in %d second(s)", hostname, retry_delays[attempt]); OsInterruptibleSleep(retry_delays[attempt] * 1000); - if (*exit_flag) - return NULL; + if (abort_flag_) + return false; if (attempt != ARRAY_SIZE(retry_delays) - 1) attempt++; @@ -78,7 +145,9 @@ struct hostent *gethostbyname_retry_on_failure(const char * name, bool *exit_fla } -static bool ParseSockaddrInWithPort(char *s, IpAddr *sin, bool *exit_flag) { + + +static bool ParseSockaddrInWithPort(char *s, IpAddr *sin, DnsResolver *resolver) { memset(sin, 0, sizeof(IpAddr)); if (*s == '[') { char *end = strchr(s, ']'); @@ -97,30 +166,20 @@ static bool ParseSockaddrInWithPort(char *s, IpAddr *sin, bool *exit_flag) { char *x = strchr(s, ':'); if (!x) return false; *x = 0; - hostent *he = gethostbyname_retry_on_failure(s, exit_flag); - if (!he) { + + if (!resolver->Resolve(s, sin)) { RERROR("Unable to resolve %s", s); return false; } - sin->sin.sin_family = AF_INET; sin->sin.sin_port = htons(atoi(x + 1)); - memcpy(&sin->sin.sin_addr, he->h_addr_list[0], 4); return true; } -static bool ParseSockaddrInWithoutPort(char *s, IpAddr *sin, bool *exit_flag) { - memset(sin, 0, sizeof(IpAddr)); - if (inet_pton(AF_INET6, s, &sin->sin6.sin6_addr) == 1) { - sin->sin.sin_family = AF_INET6; - return true; - } - hostent *he = gethostbyname_retry_on_failure(s, exit_flag); - if (!he) { +static bool ParseSockaddrInWithoutPort(char *s, IpAddr *sin, DnsResolver *resolver) { + if (!resolver->Resolve(s, sin)) { RERROR("Unable to resolve %s", s); return false; } - sin->sin.sin_family = AF_INET; - memcpy(&sin->sin.sin_addr, he->h_addr_list[0], 4); return true; } @@ -131,7 +190,7 @@ static bool ParseBase64Key(const char *s, uint8 key[32]) { class WgFileParser { public: - WgFileParser(WireguardProcessor *wg, bool *exit_flag) : wg_(wg), exit_flag_(exit_flag) {} + WgFileParser(WireguardProcessor *wg, DnsResolver *resolver) : wg_(wg), dns_resolver_(resolver) {} bool ParseFlag(const char *group, const char *key, char *value); 
WireguardProcessor *wg_; @@ -142,7 +201,7 @@ public: }; Peer pi_; WgPeer *peer_ = NULL; - bool *exit_flag_; + DnsResolver *dns_resolver_; bool had_interface_ = false; }; @@ -271,7 +330,7 @@ bool WgFileParser::ParseFlag(const char *group, const char *key, char *value) { } else if (strcmp(key, "DNS") == 0) { SplitString(value, ',', &ss); for (size_t i = 0; i < ss.size(); i++) { - if (!ParseSockaddrInWithoutPort(ss[i], &sin, exit_flag_)) + if (!ParseSockaddrInWithoutPort(ss[i], &sin, dns_resolver_)) return false; if (!wg_->AddDnsServer(sin)) { RERROR("Multiple DNS not allowed."); @@ -315,6 +374,13 @@ bool WgFileParser::ParseFlag(const char *group, const char *key, char *value) { wg_->prepost().pre_up.emplace_back(value); } else if (strcmp(key, "PreDown") == 0) { wg_->prepost().pre_down.emplace_back(value); + } else if (strcmp(key, "ExcludedIPs") == 0) { + SplitString(value, ',', &ss); + for (size_t i = 0; i < ss.size(); i++) { + if (!ParseCidrAddr(ss[i], &addr)) + return false; + wg_->AddExcludedIp(addr); + } } else { goto err; } @@ -344,7 +410,7 @@ bool WgFileParser::ParseFlag(const char *group, const char *key, char *value) { return false; } } else if (strcmp(key, "Endpoint") == 0) { - if (!ParseSockaddrInWithPort(value, &sin, exit_flag_)) + if (!ParseSockaddrInWithPort(value, &sin, dns_resolver_)) return false; peer_->SetEndpoint(sin); } else if (strcmp(key, "PersistentKeepalive") == 0) { @@ -384,11 +450,20 @@ err: return true; } -bool ParseWireGuardConfigFile(WireguardProcessor *wg, const char *filename, bool *exit_flag) { +static bool ContainsNonAsciiCharacter(const char *buf, size_t size) { + for (size_t i = 0; i < size; i++) { + uint8 c = buf[i]; + if (c < 32 && ((1 << c) & (1 << '\n' | 1 << '\r' | 1 << '\t')) == 0) + return true; + } + return false; +} + +bool ParseWireGuardConfigFile(WireguardProcessor *wg, const char *filename, DnsResolver *dns_resolver) { char buf[1024]; char group[32] = {0}; - WgFileParser file_parser(wg, exit_flag); + WgFileParser 
file_parser(wg, dns_resolver); RINFO("Loading file: %s", filename); @@ -400,6 +475,13 @@ bool ParseWireGuardConfigFile(WireguardProcessor *wg, const char *filename, bool while (fgets(buf, sizeof(buf), f)) { size_t l = strlen(buf); + + if (ContainsNonAsciiCharacter(buf, l)) { + RERROR("File is not a config file: %s", filename); + return false; + } + + while (l && is_space(buf[l - 1])) buf[--l] = 0; if (buf[0] == '#' || buf[0] == '\0') diff --git a/wireguard_config.h b/wireguard_config.h index 03d7899..01d9678 100644 --- a/wireguard_config.h +++ b/wireguard_config.h @@ -3,13 +3,38 @@ #ifndef TINYVPN_TINYVPN_H_ #define TINYVPN_TINYVPN_H_ -class WireguardProcessor; +#include "netapi.h" -bool ParseWireGuardConfigFile(WireguardProcessor *wg, const char *filename, bool *exit_flag); +class WireguardProcessor; +class DnsBlocker; + +class DnsResolver { +public: + explicit DnsResolver(DnsBlocker *dns_blocker); + ~DnsResolver(); + + bool Resolve(const char *hostname, IpAddr *result); + + void ClearCache(); + + void SetAbortFlag(bool v) { abort_flag_ = v; } +private: + struct Entry { + std::string name; + IpAddr ip; + Entry(const std::string &name, const IpAddr &ip) : name(name), ip(ip) {} + }; + std::vector cache_; + bool abort_flag_; + DnsBlocker *dns_blocker_; +}; + + +bool ParseWireGuardConfigFile(WireguardProcessor *wg, const char *filename, DnsResolver *dns_resolver); #define kSizeOfAddress 64 const char *print_ip_prefix(char buf[kSizeOfAddress], int family, const void *ip, int prefixlen); - +char *PrintIpAddr(const IpAddr &addr, char buf[kSizeOfAddress]); #endif // TINYVPN_TINYVPN_H_ diff --git a/wireguard_proto.cpp b/wireguard_proto.cpp index ad20a53..9d6b0ab 100644 --- a/wireguard_proto.cpp +++ b/wireguard_proto.cpp @@ -11,7 +11,7 @@ #include "util.h" #include "crypto_ops.h" #include "bit_ops.h" -#include "tunsafe_cpu.h" +#include "tunsafe_cpu.h" #include #include #include @@ -23,97 +23,6 @@ static const uint8 kWgInitHash[WG_HASH_LEN] = 
{0x22,0x11,0xb3,0x61,0x08,0x1a,0xc static const uint8 kWgInitChainingKey[WG_HASH_LEN] = {0x60,0xe2,0x6d,0xae,0xf3,0x27,0xef,0xc0,0x2e,0xc3,0x35,0xe2,0xa0,0x25,0xd2,0xd0,0x16,0xeb,0x42,0x06,0xf8,0x72,0x77,0xf5,0x2d,0x38,0xd1,0x98,0x8b,0x78,0xcd,0x36}; static const uint8 kCurve25519Basepoint[32] = {9}; -IpToPeerMap::IpToPeerMap() { - -} - -IpToPeerMap::~IpToPeerMap() { -} - -bool IpToPeerMap::InsertV4(const void *addr, int cidr, void *peer) { - uint32 mask = cidr == 32 ? 0xffffffff : ~(0xffffffff >> cidr); - Entry4 e = {ReadBE32(addr) & mask, mask, peer}; - ipv4_.push_back(e); - return true; -} - -bool IpToPeerMap::InsertV6(const void *addr, int cidr, void *peer) { - Entry6 e; - e.cidr_len = cidr; - e.peer = peer; - memcpy(e.ip, addr, 16); - ipv6_.push_back(e); - return true; -} - -void *IpToPeerMap::LookupV4(uint32 ip) { - uint32 best_mask = 0; - void *best_peer = NULL; - for (auto it = ipv4_.begin(); it != ipv4_.end(); ++it) { - if (it->ip == (ip & it->mask) && it->mask >= best_mask) { - best_mask = it->mask; - best_peer = it->peer; - } - } - return best_peer; -} - -void *IpToPeerMap::LookupV4DefaultPeer() { - for (auto it = ipv4_.begin(); it != ipv4_.end(); ++it) { - if (it->mask == 0) - return it->peer; - } - return NULL; -} - -void *IpToPeerMap::LookupV6DefaultPeer() { - for (auto it = ipv6_.begin(); it != ipv6_.end(); ++it) { - if (it->cidr_len == 0) - return it->peer; - } - return NULL; -} - -static int CalculateIPv6CommonPrefix(const uint8 *a, const uint8 *b) { - uint64 x = ToBE64(*(uint64*)&a[0] ^ *(uint64*)&b[0]); - uint64 y = ToBE64(*(uint64*)&a[8] ^ *(uint64*)&b[8]); - return x ? 
64 - FindHighestSetBit64(x) : 128 - FindHighestSetBit64(y); -} - -void *IpToPeerMap::LookupV6(const void *addr) { - int best_len = 0; - void *best_peer = NULL; - for (auto it = ipv6_.begin(); it != ipv6_.end(); ++it) { - int len = CalculateIPv6CommonPrefix((const uint8*)addr, it->ip); - if (len >= it->cidr_len && len >= best_len) { - best_len = len; - best_peer = it->peer; - } - } - return best_peer; -} - -void IpToPeerMap::RemovePeer(void *peer) { - { - size_t n = ipv4_.size(); - Entry4 *r = &ipv4_[0], *w = r; - for (size_t i = 0; i != n; i++, r++) { - if (r->peer != peer) - *w++ = *r; - } - ipv4_.resize(w - &ipv4_[0]); - } - { - size_t n = ipv6_.size(); - Entry6 *r = &ipv6_[0], *w = r; - for (size_t i = 0; i != n; i++, r++) { - if (r->peer != peer) - *w++ = *r; - } - ipv6_.resize(w - &ipv6_[0]); - } -} - ReplayDetector::ReplayDetector() { expected_seq_nr_ = 0; memset(bitmap_, 0, sizeof(bitmap_)); @@ -124,8 +33,9 @@ ReplayDetector::~ReplayDetector() { bool ReplayDetector::CheckReplay(uint64 seq_nr) { uint64 slot = seq_nr / BITS_PER_ENTRY; - if (seq_nr >= expected_seq_nr_) { - uint64 prev_slot = (expected_seq_nr_ + BITS_PER_ENTRY - 1) / BITS_PER_ENTRY - 1, n; + uint64 expected_seq_nr = expected_seq_nr_; + if (seq_nr >= expected_seq_nr) { + uint64 prev_slot = (expected_seq_nr + BITS_PER_ENTRY - 1) / BITS_PER_ENTRY - 1, n; if ((n = slot - prev_slot) != 0) { size_t nn = (size_t)std::min(n, BITMAP_SIZE); do { @@ -133,7 +43,7 @@ bool ReplayDetector::CheckReplay(uint64 seq_nr) { } while (--nn); } expected_seq_nr_ = seq_nr + 1; - } else if (seq_nr + WINDOW_SIZE <= expected_seq_nr_) { + } else if (seq_nr + WINDOW_SIZE <= expected_seq_nr) { return false; } uint32 mask = 1 << (seq_nr & (BITS_PER_ENTRY - 1)), prev; @@ -146,21 +56,21 @@ WgDevice::WgDevice() { peers_ = NULL; header_obfuscation_ = false; next_rng_slot_ = 0; - last_complete_handskake_timestamp_ = 0; memset(&compression_header_, 0, sizeof(compression_header_)); low_resolution_timestamp_ = cookie_secret_timestamp_ 
= OsGetMilliseconds(); OsGetRandomBytes(cookie_secret_, sizeof(cookie_secret_)); OsGetRandomBytes((uint8*)random_number_input_, sizeof(random_number_input_)); - + SetCurrentThreadAsMainThread(); } WgDevice::~WgDevice() { } void WgDevice::SecondLoop(uint64 now) { - low_resolution_timestamp_ = now; + assert(IsMainThread()); + low_resolution_timestamp_ = now; if (rate_limiter_.is_used()) { uint32 k[5]; for (size_t i = 0; i < ARRAY_SIZE(k); i++) @@ -170,11 +80,16 @@ void WgDevice::SecondLoop(uint64 now) { } uint32 WgDevice::InsertInKeyIdLookup(WgPeer *peer, WgKeypair *kp) { + assert(IsMainThread()); assert(peer); for (;;) { uint32 v = GetRandomNumber(); if (v == 0) continue; + + // Take the exclusive lock since we're modifying it. + WG_SCOPED_RWLOCK_EXCLUSIVE(key_id_lookup_lock_); + std::pair &peer_and_keypair = key_id_lookup_[v]; if (peer_and_keypair.first == NULL) { peer_and_keypair = std::make_pair(peer, kp); @@ -188,7 +103,24 @@ uint32 WgDevice::InsertInKeyIdLookup(WgPeer *peer, WgKeypair *kp) { } } +std::pair *WgDevice::LookupPeerInKeyIdLookup(uint32 key_id) { + // This function is only ever called by the main thread, so no need to lock, + // since the main thread is the only mutator. + assert(IsMainThread()); + auto it = key_id_lookup_.find(key_id); + return (it != key_id_lookup_.end() && it->second.second == NULL) ? &it->second : NULL; +} + +WgKeypair *WgDevice::LookupKeypairByKeyId(uint32 key_id) { + // This function can be called from any thread, so make sure to + // lock using the shared lock. + WG_SCOPED_RWLOCK_SHARED(key_id_lookup_lock_); + auto it = key_id_lookup_.find(key_id); + return (it != key_id_lookup_.end()) ? 
it->second.second : NULL; +} + uint32 WgDevice::GetRandomNumber() { + assert(IsMainThread()); size_t slot; if ((slot = next_rng_slot_) == 0) { blake2s(random_number_output_, sizeof(random_number_output_), random_number_input_, sizeof(random_number_input_), NULL, 0); @@ -232,6 +164,7 @@ void WgDevice::Initialize(const uint8 private_key[WG_PUBLIC_KEY_LEN]) { } WgPeer *WgDevice::AddPeer() { + assert(IsMainThread()); WgPeer *peer = new WgPeer(this); WgPeer **pp = &peers_; while (*pp) @@ -241,6 +174,8 @@ WgPeer *WgDevice::AddPeer() { } WgPeer *WgDevice::GetPeerFromPublicKey(uint8 public_key[WG_PUBLIC_KEY_LEN]) { + assert(IsMainThread()); + // todo: add O(1) lookup for (WgPeer *peer = peers_; peer; peer = peer->next_peer_) { if (memcmp(peer->s_remote_, public_key, WG_PUBLIC_KEY_LEN) == 0) return peer; @@ -249,15 +184,16 @@ WgPeer *WgDevice::GetPeerFromPublicKey(uint8 public_key[WG_PUBLIC_KEY_LEN]) { } bool WgDevice::CheckCookieMac1(Packet *packet) { + assert(IsMainThread()); uint8 mac[WG_COOKIE_LEN]; const uint8 *data = packet->data; size_t data_size = packet->size; - blake2s(mac, sizeof(mac), data, data_size - WG_COOKIE_LEN * 2, precomputed_mac1_key_, sizeof(precomputed_mac1_key_)); return !memcmp_crypto(mac, data + data_size - WG_COOKIE_LEN * 2, WG_COOKIE_LEN); } void WgDevice::MakeCookie(uint8 cookie[WG_COOKIE_LEN], Packet *packet) { + assert(IsMainThread()); blake2s_state b2s; uint64 now = OsGetMilliseconds(); if (now - cookie_secret_timestamp_ >= COOKIE_SECRET_MAX_AGE_MS) { @@ -274,6 +210,7 @@ void WgDevice::MakeCookie(uint8 cookie[WG_COOKIE_LEN], Packet *packet) { } bool WgDevice::CheckCookieMac2(Packet *packet) { + assert(IsMainThread()); uint8 cookie[WG_COOKIE_LEN]; uint8 mac[WG_COOKIE_LEN]; MakeCookie(cookie, packet); @@ -282,6 +219,7 @@ bool WgDevice::CheckCookieMac2(Packet *packet) { } void WgDevice::CreateCookieMessage(MessageHandshakeCookie *dst, Packet *packet, uint32 remote_key_id) { + assert(IsMainThread()); dst->type = MESSAGE_HANDSHAKE_COOKIE; 
dst->receiver_key_id = remote_key_id; MakeCookie(dst->cookie_enc, packet); @@ -290,7 +228,7 @@ void WgDevice::CreateCookieMessage(MessageHandshakeCookie *dst, Packet *packet, xchacha20poly1305_encrypt(dst->cookie_enc, dst->cookie_enc, WG_COOKIE_LEN, mac->mac1, WG_COOKIE_LEN, dst->nonce, precomputed_cookie_key_); } -void WgDevice::EraseKeypairAddrEntry(WgKeypair *kp) { +void WgDevice::EraseKeypairAddrEntry_Locked(WgKeypair *kp) { WgAddrEntry *ae = kp->addr_entry; assert(ae->ref_count >= 1); @@ -308,14 +246,28 @@ void WgDevice::EraseKeypairAddrEntry(WgKeypair *kp) { } } -void WgDevice::UpdateKeypairAddrEntry(uint64 addr_id, WgKeypair *keypair) { - if (keypair->addr_entry != NULL && keypair->addr_entry->addr_entry_id == addr_id) { - keypair->broadcast_short_key = 1; - return; +WgKeypair *WgDevice::LookupKeypairInAddrEntryMap(uint64 addr, uint32 slot) { + WG_SCOPED_RWLOCK_SHARED(addr_entry_lookup_lock_); + auto it = addr_entry_lookup_.find(addr); + if (it == addr_entry_lookup_.end()) + return NULL; + WgAddrEntry *addr_entry = it->second; + return addr_entry->keys[slot]; +} + +void WgDevice::UpdateKeypairAddrEntry_Locked(uint64 addr_id, WgKeypair *keypair) { + assert(keypair->peer->IsPeerLocked()); + { + WG_SCOPED_RWLOCK_SHARED(addr_entry_lookup_lock_); + if (keypair->addr_entry != NULL && keypair->addr_entry->addr_entry_id == addr_id) { + keypair->broadcast_short_key = 1; + return; + } } + WG_SCOPED_RWLOCK_EXCLUSIVE(addr_entry_lookup_lock_); if (keypair->addr_entry != NULL) - EraseKeypairAddrEntry(keypair); + EraseKeypairAddrEntry_Locked(keypair); WgAddrEntry **aep = &addr_entry_lookup_[addr_id], *ae; @@ -362,13 +314,16 @@ void WgDevice::SetHeaderObfuscation(const char *key) { WgPeer::WgPeer(WgDevice *dev) { + assert(dev->IsMainThread()); dev_ = dev; endpoint_.sin.sin_family = 0; next_peer_ = NULL; curr_keypair_ = next_keypair_ = prev_keypair_ = NULL; expect_cookie_reply_ = false; has_mac2_cookie_ = false; + pending_keepalive_ = false; allow_multicast_through_peer_ = 
false; + allow_endpoint_change_ = true; supports_handshake_extensions_ = true; local_key_id_during_hs_ = 0; last_handshake_init_timestamp_ = -1000000ll; @@ -380,20 +335,43 @@ WgPeer::WgPeer(WgDevice *dev) { last_queued_packet_ptr_ = &first_queued_packet_; num_queued_packets_ = 0; handshake_attempts_ = 0; + total_handshake_attempts_ = 0; num_ciphers_ = 0; cipher_prio_ = 0; + main_thread_scheduled_ = 0; memset(last_timestamp_, 0, sizeof(last_timestamp_)); ipv4_broadcast_addr_ = 0xffffffff; memset(features_, 0, sizeof(features_)); } WgPeer::~WgPeer() { - ClearKeys(); - ClearHandshake(); - ClearPacketQueue(); + assert(dev_->IsMainThread()); + WG_ACQUIRE_LOCK(mutex_); + ClearKeys_Locked(); + ClearHandshake_Locked(); + ClearPacketQueue_Locked(); + WG_RELEASE_LOCK(mutex_); } -void WgPeer::ClearPacketQueue() { +void WgPeer::ClearKeys_Locked() { + assert(dev_->IsMainThread() && IsPeerLocked()); + DeleteKeypair(&curr_keypair_); + DeleteKeypair(&next_keypair_); + DeleteKeypair(&prev_keypair_); +} + +void WgPeer::ClearHandshake_Locked() { + assert(dev_->IsMainThread() && IsPeerLocked()); + uint32 v = local_key_id_during_hs_; + if (v != 0) { + local_key_id_during_hs_ = 0; + WG_SCOPED_RWLOCK_EXCLUSIVE(dev_->key_id_lookup_lock_); + dev_->key_id_lookup_.erase(v); + } +} + +void WgPeer::ClearPacketQueue_Locked() { + assert(dev_->IsMainThread() && IsPeerLocked()); Packet *packet; while ((packet = first_queued_packet_) != NULL) { first_queued_packet_ = packet->next; @@ -422,6 +400,8 @@ void WgPeer::Initialize(const uint8 spub[WG_PUBLIC_KEY_LEN], const uint8 preshar // run on the client void WgPeer::CreateMessageHandshakeInitiation(Packet *packet) { + assert(dev_->IsMainThread()); + uint8 k[WG_SYMMETRIC_KEY_LEN]; MessageHandshakeInitiation *dst = (MessageHandshakeInitiation *)packet->data; @@ -463,7 +443,6 @@ void WgPeer::CreateMessageHandshakeInitiation(Packet *packet) { packet->size = (unsigned)(sizeof(MessageHandshakeInitiation) + extfield_size); - // Insert a pointer to this 
object, dst->sender_key_id = dev_->InsertInKeyIdLookup(this, NULL); dst->type = MESSAGE_HANDSHAKE_INITIATION; memzero_crypto(k, sizeof(k)); @@ -472,6 +451,7 @@ void WgPeer::CreateMessageHandshakeInitiation(Packet *packet) { // Parsed by server WgPeer *WgPeer::ParseMessageHandshakeInitiation(WgDevice *dev, Packet *packet) { // const MessageHandshakeInitiation *src, MessageHandshakeResponse *dst) { + assert(dev->IsMainThread()); // Copy values into handshake once we've validated it all. uint8 ci[WG_HASH_LEN]; uint8 hi[WG_HASH_LEN]; @@ -562,9 +542,14 @@ WgPeer *WgPeer::ParseMessageHandshakeInitiation(WgDevice *dev, Packet *packet) { BlakeMix(hi, t, sizeof(t)); dst->receiver_key_id = remote_key_id; - keypair = peer->CreateNewKeypair(false, ci, remote_key_id, extbuf + WG_TIMESTAMP_LEN, extfield_size); + keypair = WgPeer::CreateNewKeypair(false, ci, remote_key_id, extbuf + WG_TIMESTAMP_LEN, extfield_size); if (keypair) { - peer->InsertKeypairInPeer(keypair); + + WG_ACQUIRE_LOCK(peer->mutex_); + peer->InsertKeypairInPeer_Locked(keypair); + peer->OnHandshakeAuthComplete(); + WG_RELEASE_LOCK(peer->mutex_); + dst->sender_key_id = dev->InsertInKeyIdLookup(peer, keypair); size_t extfield_out_size = 0; @@ -593,15 +578,15 @@ getout: } WgPeer *WgPeer::ParseMessageHandshakeResponse(WgDevice *dev, const Packet *packet) { + assert(dev->IsMainThread()); MessageHandshakeResponse *src = (MessageHandshakeResponse *)packet->data; uint8 t[WG_HASH_LEN]; uint8 k[WG_SYMMETRIC_KEY_LEN]; WgKeypair *keypair; - auto it = dev->key_id_lookup().find(src->receiver_key_id); - if (it == dev->key_id_lookup().end() || it->second.second != NULL) + auto peer_and_keypair = dev->LookupPeerInKeyIdLookup(src->receiver_key_id); + if (peer_and_keypair == NULL) return NULL; - WgPeer *peer = it->second.first; - + WgPeer *peer = peer_and_keypair->first; assert(src->receiver_key_id == peer->local_key_id_during_hs_); HandshakeState hs = peer->hs_; @@ -626,16 +611,18 @@ WgPeer 
*WgPeer::ParseMessageHandshakeResponse(WgDevice *dev, const Packet *packe if (!chacha20poly1305_decrypt(src->empty_enc, src->empty_enc, extfield_size + sizeof(src->empty_enc), hs.hi, sizeof(hs.hi), 0, k)) goto getout; - keypair = peer->CreateNewKeypair(true, hs.ci, src->sender_key_id, src->empty_enc, extfield_size); + keypair = WgPeer::CreateNewKeypair(true, hs.ci, src->sender_key_id, src->empty_enc, extfield_size); if (!keypair) goto getout; - peer->InsertKeypairInPeer(keypair); - // Re-map the entry in the id table so it points at this keypair instead. keypair->local_key_id = peer->local_key_id_during_hs_; peer->local_key_id_during_hs_ = 0; - it->second.second = keypair; + peer_and_keypair->second = keypair; + + WG_ACQUIRE_LOCK(peer->mutex_); + peer->InsertKeypairInPeer_Locked(keypair); + WG_RELEASE_LOCK(peer->mutex_); if (0) { getout: @@ -650,11 +637,12 @@ getout: // This is parsed by the initiator, when it needs to re-send the handshake message with a better mac. void WgPeer::ParseMessageHandshakeCookie(WgDevice *dev, const MessageHandshakeCookie *src) { + assert(dev->IsMainThread()); uint8 cookie[WG_COOKIE_LEN]; - auto it = dev->key_id_lookup().find(src->receiver_key_id); - if (it == dev->key_id_lookup().end() || it->second.second != NULL) + auto peer_and_keypair = dev->LookupPeerInKeyIdLookup(src->receiver_key_id); + if (!peer_and_keypair) return; - WgPeer *peer = it->second.first; + WgPeer *peer = peer_and_keypair->first; if (!peer->expect_cookie_reply_) return; if (!xchacha20poly1305_decrypt(cookie, src->cookie_enc, sizeof(src->cookie_enc), @@ -756,6 +744,7 @@ void WgKeypairSetupCompressionExtension(WgKeypair *keypair, const WgPacketCompre state->server_addr_v4_subnet_bytes = (remotec->flags & 3); WriteLE32(&state->server_addr_v4_netmask, 0xffffffff >> ((remotec->flags & 3) * 8)); } + bool WgKeypairParseExtendedHandshake(WgKeypair *keypair, const uint8 *data, size_t data_size) { bool did_setup_compression = false; @@ -804,33 +793,29 @@ bool 
WgKeypairParseExtendedHandshake(WgKeypair *keypair, const uint8 *data, size #endif // WITH_HANDSHAKE_EXT -void WgPeer::ClearKeys() { - DeleteKeypair(&curr_keypair_); - DeleteKeypair(&next_keypair_); - DeleteKeypair(&prev_keypair_); -} - -void WgPeer::ClearHandshake() { - uint32 v = local_key_id_during_hs_; - if (v != 0) { - local_key_id_during_hs_ = 0; - dev_->key_id_lookup_.erase(v); - } +static void ActualFreeKeypair(void *x) { + WgKeypair *t = (WgKeypair*)x; + if (t->aes_gcm128_context_) + free(t->aes_gcm128_context_); + delete t; } void WgPeer::DeleteKeypair(WgKeypair **kp) { WgKeypair *t = *kp; *kp = NULL; if (t) { - if (t->addr_entry) - dev_->EraseKeypairAddrEntry(t); - - if (t->local_key_id) + assert(t->peer->IsPeerLocked()); + if (t->addr_entry) { + WG_SCOPED_RWLOCK_EXCLUSIVE(dev_->addr_entry_lookup_lock_); + dev_->EraseKeypairAddrEntry_Locked(t); + } + if (t->local_key_id) { + WG_SCOPED_RWLOCK_EXCLUSIVE(dev_->key_id_lookup_lock_); dev_->key_id_lookup_.erase(t->local_key_id); - - if (t->aes_gcm128_context_) - free(t->aes_gcm128_context_); - delete t; + t->local_key_id = 0; + } + t->recv_key_state = WgKeypair::KEY_INVALID; + dev_->delayed_delete_.Add(&ActualFreeKeypair, t); } } @@ -840,21 +825,24 @@ WgKeypair *WgPeer::CreateNewKeypair(bool is_initiator, const uint8 chaining_key[ if (!kp) return NULL; memset(kp, 0, offsetof(WgKeypair, replay_detector)); - kp->peer = this; kp->is_initiator = is_initiator; kp->remote_key_id = remote_key_id; kp->auth_tag_length = CHACHA20POLY1305_AUTHTAGLEN; #if WITH_HANDSHAKE_EXT - if (!WgKeypairParseExtendedHandshake(kp, extfield, extfield_size)) - goto fail; + if (!WgKeypairParseExtendedHandshake(kp, extfield, extfield_size)) { +fail: + delete kp; + return NULL; + } #endif // WITH_HANDSHAKE_EXT first_key = kp->send_key, second_key = kp->recv_key; if (!is_initiator) std::swap(first_key, second_key); blake2s_hkdf(first_key, sizeof(kp->send_key), second_key, sizeof(kp->recv_key), - kp->auth_tag_length != 
CHACHA20POLY1305_AUTHTAGLEN ? (uint8*)kp->compress_mac_keys : NULL, 32, NULL, 0, chaining_key, WG_HASH_LEN); + kp->auth_tag_length != CHACHA20POLY1305_AUTHTAGLEN ? (uint8*)kp->compress_mac_keys : NULL, 32, + NULL, 0, chaining_key, WG_HASH_LEN); if (!is_initiator) { std::swap(kp->compress_mac_keys[0][0], kp->compress_mac_keys[1][0]); @@ -870,25 +858,22 @@ WgKeypair *WgPeer::CreateNewKeypair(bool is_initiator, const uint8 chaining_key[ int key_size = (kp->cipher_suite == EXT_CIPHER_SUITE_AES128_GCM) ? 128 : 256; CRYPTO_gcm128_init(&kp->aes_gcm128_context_[0], kp->send_key, key_size); CRYPTO_gcm128_init(&kp->aes_gcm128_context_[1], kp->recv_key, key_size); -#else +#else // WITH_AESGCM goto fail; -#endif +#endif // WITH_AESGCM } #endif // WITH_HANDSHAKE_EXT kp->send_key_state = kp->recv_key_state = WgKeypair::KEY_VALID; - time_of_next_key_event_ = 0; kp->key_timestamp = OsGetMilliseconds(); - return kp; - -fail: - delete kp; - return NULL; } -void WgPeer::InsertKeypairInPeer(WgKeypair *kp) { - assert(kp->peer == this); +void WgPeer::InsertKeypairInPeer_Locked(WgKeypair *kp) { + assert(dev_->IsMainThread() && IsPeerLocked()); + assert(kp->peer == NULL); + kp->peer = this; + time_of_next_key_event_ = 0; DeleteKeypair(&prev_keypair_); if (kp->is_initiator) { // When we're the initator then we got the handshake and we can @@ -908,7 +893,8 @@ void WgPeer::InsertKeypairInPeer(WgKeypair *kp) { } } -bool WgPeer::CheckSwitchToNextKey(WgKeypair *keypair) { +bool WgPeer::CheckSwitchToNextKey_Locked(WgKeypair *keypair) { + assert(IsPeerLocked()); if (keypair != next_keypair_) return false; DeleteKeypair(&prev_keypair_); @@ -920,6 +906,7 @@ bool WgPeer::CheckSwitchToNextKey(WgKeypair *keypair) { } bool WgPeer::CheckHandshakeRateLimit() { + assert(dev_->IsMainThread()); uint64 now = OsGetMilliseconds(); if (now - last_handshake_init_timestamp_ < REKEY_TIMEOUT_MS) return false; @@ -928,6 +915,7 @@ bool WgPeer::CheckHandshakeRateLimit() { } void WgPeer::WriteMacToPacket(const uint8 
*data, MessageMacs *dst) { + assert(dev_->IsMainThread()); expect_cookie_reply_ = true; blake2s(dst->mac1, sizeof(dst->mac1), data, (uint8*)dst->mac1 - data, precomputed_mac1_key_, sizeof(precomputed_mac1_key_)); memcpy(sent_mac1_, dst->mac1, sizeof(sent_mac1_)); @@ -964,6 +952,7 @@ enum { #define WgSetTimer(x) (timers_ |= (32 << (x))) void WgPeer::OnDataSent() { + assert(IsPeerLocked()); WgClearTimer(TIMER_SEND_KEEPALIVE); if (!WgIsTimerActive(TIMER_NEW_HANDSHAKE)) WgSetTimer(TIMER_NEW_HANDSHAKE); @@ -971,10 +960,12 @@ void WgPeer::OnDataSent() { } void WgPeer::OnKeepaliveSent() { + assert(IsPeerLocked()); WgSetTimer(TIMER_PERSISTENT_KEEPALIVE); } void WgPeer::OnDataReceived() { + assert(IsPeerLocked()); WgClearTimer(TIMER_NEW_HANDSHAKE); if (!WgIsTimerActive(TIMER_SEND_KEEPALIVE)) WgSetTimer(TIMER_SEND_KEEPALIVE); @@ -984,16 +975,19 @@ void WgPeer::OnDataReceived() { } void WgPeer::OnKeepaliveReceived() { + assert(IsPeerLocked()); WgClearTimer(TIMER_NEW_HANDSHAKE); WgSetTimer(TIMER_PERSISTENT_KEEPALIVE); } void WgPeer::OnHandshakeInitSent() { + assert(IsPeerLocked()); WgClearTimer(TIMER_SEND_KEEPALIVE); WgSetTimer(TIMER_RETRANSMIT_HANDSHAKE); } void WgPeer::OnHandshakeAuthComplete() { + assert(IsPeerLocked()); WgClearTimer(TIMER_NEW_HANDSHAKE); WgSetTimer(TIMER_ZERO_KEYS); WgSetTimer(TIMER_PERSISTENT_KEEPALIVE); @@ -1007,8 +1001,11 @@ static const char * const kCipherSuites[] = { }; void WgPeer::OnHandshakeFullyComplete() { + assert(IsPeerLocked()); WgClearTimer(TIMER_RETRANSMIT_HANDSHAKE); - handshake_attempts_ = 0; + total_handshake_attempts_ = handshake_attempts_ = 0; + + uint64 now = OsGetMilliseconds(); if (last_complete_handskake_timestamp_ == 0) { bool any_feature = false; @@ -1022,17 +1019,15 @@ void WgPeer::OnHandshakeFullyComplete() { curr_keypair_->enabled_features[4] ? "skip_keyid_in" : "", curr_keypair_->enabled_features[5] ? 
"skip_keyid_out" : ""); } - - } - - last_complete_handskake_timestamp_ = OsGetMilliseconds(); - dev_->last_complete_handskake_timestamp_ = last_complete_handskake_timestamp_; + last_complete_handskake_timestamp_ = now; // RINFO("Connection established."); } // Check if any of the timeouts have expired uint32 WgPeer::CheckTimeouts(uint64 now) { + assert(IsPeerLocked()); + uint32 t, rv = 0; if (now >= time_of_next_key_event_) @@ -1056,11 +1051,9 @@ uint32 WgPeer::CheckTimeouts(uint64 now) { if ((t & (1 << TIMER_RETRANSMIT_HANDSHAKE)) && (now32 - timer_value_[TIMER_RETRANSMIT_HANDSHAKE]) >= REKEY_TIMEOUT_MS) { t ^= (1 << TIMER_RETRANSMIT_HANDSHAKE); if (handshake_attempts_ > MAX_HANDSHAKE_ATTEMPTS) { - RINFO("Too many handshake attempts. Stopping."); t &= ~(1 << TIMER_SEND_KEEPALIVE); - ClearPacketQueue(); + ClearPacketQueue_Locked(); } else { - RINFO("Retrying handshake, attempt %d...", handshake_attempts_ + 2); handshake_attempts_++; rv |= ACTION_SEND_HANDSHAKE; } @@ -1085,13 +1078,12 @@ uint32 WgPeer::CheckTimeouts(uint64 now) { t &= ~(1 << TIMER_NEW_HANDSHAKE); handshake_attempts_ = 0; rv |= ACTION_SEND_HANDSHAKE; - RINFO("Retrying handshake with peer"); } if ((t & (1 << TIMER_ZERO_KEYS)) && (now32 - timer_value_[TIMER_ZERO_KEYS]) >= REJECT_AFTER_TIME_MS * 3) { RINFO("Expiring all keys for peer"); t &= ~(1 << TIMER_ZERO_KEYS); - ClearKeys(); - ClearHandshake(); + ClearKeys_Locked(); + ClearHandshake_Locked(); } } timers_ = t; @@ -1100,6 +1092,7 @@ uint32 WgPeer::CheckTimeouts(uint64 now) { // Check all key stuff here to avoid calling possibly expensive timestamp routines in the packet handler void WgPeer::CheckAndUpdateTimeOfNextKeyEvent(uint64 now) { + assert(IsPeerLocked()); uint64 next_time = UINT64_MAX; uint32 rv = 0; @@ -1110,8 +1103,7 @@ void WgPeer::CheckAndUpdateTimeOfNextKeyEvent(uint64 now) { // if a peer is the initiator of a current secure session, WireGuard will send a handshake initiation // message to begin a new secure session if, after 
transmitting a transport data message, the current secure session // is REKEY_AFTER_TIME_MS old, or if after receiving a transport data message, the current secure session is - // (REKEY_AFTER_TIME_MS - KEEPALIVE_TIMEOUT_MS - REKEY_TIMEOUT_MS) seconds old and it has not yet acted upon - // this event. + // (REKEY_AFTER_TIME_MS - KEEPALIVE_TIMEOUT_MS - REKEY_TIMEOUT_MS) seconds old and it has not yet acted upon it. if (now >= curr_keypair_->key_timestamp + (REJECT_AFTER_TIME_MS - KEEPALIVE_TIMEOUT_MS - REKEY_TIMEOUT_MS)) { next_time = curr_keypair_->key_timestamp + REJECT_AFTER_TIME_MS; if (curr_keypair_->recv_key_state == WgKeypair::KEY_VALID) @@ -1153,16 +1145,22 @@ void WgPeer::SetPersistentKeepalive(int persistent_keepalive_secs) { } bool WgPeer::AddIp(const WgCidrAddr &cidr_addr) { + assert(dev_->IsMainThread()); + if (cidr_addr.size == 32) { if (cidr_addr.cidr > 32) return false; + WG_ACQUIRE_RWLOCK_EXCLUSIVE(dev_->ip_to_peer_map_lock_); dev_->ip_to_peer_map_.InsertV4(cidr_addr.addr, cidr_addr.cidr, this); + WG_RELEASE_RWLOCK_EXCLUSIVE(dev_->ip_to_peer_map_lock_); allowed_ips_.push_back(cidr_addr); return true; } else if (cidr_addr.size == 128) { if (cidr_addr.cidr > 128) return false; + WG_ACQUIRE_RWLOCK_EXCLUSIVE(dev_->ip_to_peer_map_lock_); dev_->ip_to_peer_map_.InsertV6(cidr_addr.addr, cidr_addr.cidr, this); + WG_RELEASE_RWLOCK_EXCLUSIVE(dev_->ip_to_peer_map_lock_); allowed_ips_.push_back(cidr_addr); return true; } else { @@ -1183,14 +1181,13 @@ bool WgPeer::AddCipher(int cipher) { return false; if (cipher == EXT_CIPHER_SUITE_AES128_GCM || cipher == EXT_CIPHER_SUITE_AES256_GCM) { -#if !WITH_AESGCM - return true; -#endif // !WITH_AESGCM +#if defined(ARCH_CPU_X86_FAMILY) && WITH_AESGCM if (!X86_PCAP_AES) return true; +#else + return true; +#endif // defined(ARCH_CPU_X86_FAMILY) && WITH_AESGCM } - - ciphers_[num_ciphers_++] = cipher; return true; } @@ -1214,15 +1211,10 @@ void WgRateLimit::Periodic(uint32 s[5]) { if (per_sec < 1) per_sec = 1; } - if 
((unsigned)per_sec > packets_per_sec_) per_sec = (per_sec + packets_per_sec_ + 1) >> 1; - -// if (per_sec != packets_per_sec_) { -// RINFO("Setting pps: %d", per_sec); - packets_per_sec_ = per_sec; -// } - + + packets_per_sec_ = per_sec; used_rate_limit_ = 0; rand_xor_ = s[4]; key2_[0] = key1_[0]; @@ -1278,7 +1270,8 @@ void WgKeypairEncryptPayload(uint8 *dst, const size_t src_len, bool WgKeypairDecryptPayload(uint8 *dst, size_t src_len, const uint8 *ad, size_t ad_len, const uint64 nonce, WgKeypair *keypair) { - uint8 mac[16]; + + __aligned(16) uint8 mac[16]; if (src_len < keypair->auth_tag_length) return false; diff --git a/wireguard_proto.h b/wireguard_proto.h index cd66901..9e5c12f 100644 --- a/wireguard_proto.h +++ b/wireguard_proto.h @@ -4,9 +4,40 @@ #include "tunsafe_types.h" #include "netapi.h" +#include "ipzip2/ipzip2.h" #include "tunsafe_config.h" +#include "tunsafe_threading.h" +#include "ip_to_peer_map.h" #include #include +#include + +// Threading macros that enable locks only in MT builds +#if WITH_WG_THREADING +#define WG_SCOPED_LOCK(name) AutoLock scoped_lock(&name) +#define WG_ACQUIRE_LOCK(name) name.Acquire() +#define WG_RELEASE_LOCK(name) name.Release() +#define WG_DECLARE_LOCK(name) Mutex name; +#define WG_DECLARE_RWLOCK(name) ReaderWriterLock name; +#define WG_ACQUIRE_RWLOCK_SHARED(name) name.AcquireShared() +#define WG_RELEASE_RWLOCK_SHARED(name) name.ReleaseShared() +#define WG_ACQUIRE_RWLOCK_EXCLUSIVE(name) name.AcquireExclusive() +#define WG_RELEASE_RWLOCK_EXCLUSIVE(name) name.ReleaseExclusive() +#define WG_SCOPED_RWLOCK_SHARED(name) ScopedLockShared scoped_lock(&name) +#define WG_SCOPED_RWLOCK_EXCLUSIVE(name) ScopedLockExclusive scoped_lock(&name) +#else // WITH_WG_THREADING +#define WG_SCOPED_LOCK(name) +#define WG_ACQUIRE_LOCK(name) +#define WG_RELEASE_LOCK(name) +#define WG_DECLARE_LOCK(name) +#define WG_DECLARE_RWLOCK(name) +#define WG_ACQUIRE_RWLOCK_SHARED(name) +#define WG_RELEASE_RWLOCK_SHARED(name) +#define 
WG_ACQUIRE_RWLOCK_EXCLUSIVE(name) +#define WG_RELEASE_RWLOCK_EXCLUSIVE(name) +#define WG_SCOPED_RWLOCK_SHARED(name) +#define WG_SCOPED_RWLOCK_EXCLUSIVE(name) +#endif // WITH_WG_THREADING enum ProtocolTimeouts { COOKIE_SECRET_MAX_AGE_MS = 120000, @@ -17,6 +48,8 @@ enum ProtocolTimeouts { REJECT_AFTER_TIME_MS = 180000, PERSISTENT_KEEPALIVE_MS = 25000, MIN_HANDSHAKE_INTERVAL_MS = 20, + + MAX_SIZE_OF_HANDSHAKE_EXTENSION = 1024, }; enum ProtocolLimits { @@ -26,7 +59,6 @@ enum ProtocolLimits { MAX_HANDSHAKE_ATTEMPTS = 20, MAX_QUEUED_PACKETS_PER_PEER = 128, MESSAGE_MINIMUM_SIZE = 16, - MAX_SIZE_OF_HANDSHAKE_EXTENSION = 1024, }; enum MessageType { @@ -61,7 +93,7 @@ enum { WG_ACK_HEADER_COUNTER_NONE = 0x00, WG_ACK_HEADER_COUNTER_2 = 0x04, WG_ACK_HEADER_COUNTER_4 = 0x08, - WG_ACK_HEADER_COUNTER_8 = 0x0C, + WG_ACK_HEADER_COUNTER_6 = 0x0C, WG_ACK_HEADER_KEY_MASK = 3, }; @@ -166,39 +198,6 @@ STATIC_ASSERT(sizeof(WgPacketCompressionVer01) == 24, WgPacketCompressionVer01_w struct WgKeypair; class WgPeer; -// Maps CIDR addresses to a peer, always returning the longest match -class IpToPeerMap { -public: - IpToPeerMap(); - ~IpToPeerMap(); - - // Inserts an IP address of a given CIDR length into the lookup table, pointing to peer. 
- bool InsertV4(const void *addr, int cidr, void *peer); - bool InsertV6(const void *addr, int cidr, void *peer); - - // Lookup the peer matching the IP Address - void *LookupV4(uint32 ip); - void *LookupV6(const void *addr); - - void *LookupV4DefaultPeer(); - void *LookupV6DefaultPeer(); - - // Remove a peer from the table - void RemovePeer(void *peer); -private: - struct Entry4 { - uint32 ip; - uint32 mask; - void *peer; - }; - struct Entry6 { - uint8 ip[16]; - uint8 cidr_len; - void *peer; - }; - std::vector ipv4_; - std::vector ipv6_; -}; class WgRateLimit { public: @@ -262,7 +261,6 @@ struct ScramblerSiphashKeys { uint64 keys[4]; }; -// Implementation of most business logic of Wireguard class WgDevice { friend class WgPeer; friend class WireguardProcessor; @@ -272,7 +270,8 @@ public: // Initialize with the private key, precompute all internal keys etc. void Initialize(const uint8 private_key[WG_PUBLIC_KEY_LEN]); - + + // Create a new peer WgPeer *AddPeer(); // Setup header obfuscation @@ -281,35 +280,26 @@ public: // Check whether Mac1 appears to be valid bool CheckCookieMac1(Packet *packet); - // Check whether Mac2 appears to be valid, this also uses - // the remote ip address + // Check whether Mac2 appears to be valid, this also uses the remote ip address bool CheckCookieMac2(Packet *packet); void CreateCookieMessage(MessageHandshakeCookie *dst, Packet *packet, uint32 remote_key_id); - - void UpdateKeypairAddrEntry(uint64 addr_id, WgKeypair *keypair); + void UpdateKeypairAddrEntry_Locked(uint64 addr_id, WgKeypair *keypair); + void SecondLoop(uint64 now); IpToPeerMap &ip_to_peer_map() { return ip_to_peer_map_; } - - std::unordered_map > &key_id_lookup() { return key_id_lookup_; } - WgPeer *first_peer() { return peers_; } - - uint64 last_complete_handskake_timestamp() const { - return last_complete_handskake_timestamp_; - } - const uint8 *public_key() const { return s_pub_; } - - void SecondLoop(uint64 now); - WgRateLimit *rate_limiter() { return 
&rate_limiter_; } - std::unordered_map &addr_entry_map() { return addr_entry_lookup_; } - - WgPacketCompressionVer01 *compression_header() { return &compression_header_; } + + bool IsMainThread() { return CurrentThreadIdEquals(main_thread_id_); } + void SetCurrentThreadAsMainThread() { main_thread_id_ = GetCurrentThreadId(); } private: + std::pair *LookupPeerInKeyIdLookup(uint32 key_id); + WgKeypair *LookupKeypairByKeyId(uint32 key_id); + WgKeypair *LookupKeypairInAddrEntryMap(uint64 addr, uint32 slot); // Return the peer matching the |public_key| or NULL WgPeer *GetPeerFromPublicKey(uint8 public_key[WG_PUBLIC_KEY_LEN]); // Create a cookie by inspecting the source address of the |packet| @@ -319,12 +309,19 @@ private: // Get a random number uint32 GetRandomNumber(); - void EraseKeypairAddrEntry(WgKeypair *kp); + void EraseKeypairAddrEntry_Locked(WgKeypair *kp); // Maps IP addresses to peers IpToPeerMap ip_to_peer_map_; + + // This lock protects |ip_to_peer_map_|. + WG_DECLARE_RWLOCK(ip_to_peer_map_lock_); + // For enumerating all peers WgPeer *peers_; + + // Lock that protects key_id_lookup_ + WG_DECLARE_RWLOCK(key_id_lookup_lock_); // Mapping from key-id to either an active keypair (if keypair is non-NULL), // or to a handshake. std::unordered_map > key_id_lookup_; @@ -332,6 +329,7 @@ private: // Mapping from IPV4 IP/PORT to WgPeer*, so we can find the peer when a key id is // not explicitly included. std::unordered_map addr_entry_lookup_; + WG_DECLARE_RWLOCK(addr_entry_lookup_lock_); // Counter for generating new indices in |keypair_lookup_| uint8 next_rng_slot_; @@ -339,7 +337,7 @@ private: // Whether packet obfuscation is enabled bool header_obfuscation_; - uint64 last_complete_handskake_timestamp_; + ThreadId main_thread_id_; uint64 low_resolution_timestamp_; @@ -360,9 +358,12 @@ private: WgRateLimit rate_limiter_; WgPacketCompressionVer01 compression_header_; + + // For defering deletes until all worker threads are guaranteed not to use an object. 
+ MultithreadedDelayedDelete delayed_delete_; }; -// State for Noise handshake +// State for peer class WgPeer { friend class WgDevice; friend class WireguardProcessor; @@ -387,10 +388,10 @@ public: static WgPeer *ParseMessageHandshakeResponse(WgDevice *dev, const Packet *packet); static void ParseMessageHandshakeCookie(WgDevice *dev, const MessageHandshakeCookie *src); void CreateMessageHandshakeInitiation(Packet *packet); - bool CheckSwitchToNextKey(WgKeypair *keypair); - void ClearKeys(); - void ClearHandshake(); - void ClearPacketQueue(); + bool CheckSwitchToNextKey_Locked(WgKeypair *keypair); + void ClearKeys_Locked(); + void ClearHandshake_Locked(); + void ClearPacketQueue_Locked(); bool CheckHandshakeRateLimit(); // Timer notifications @@ -408,23 +409,32 @@ public: }; uint32 CheckTimeouts(uint64 now); + void AddPacketToPeerQueue(Packet *packet); + +#if WITH_WG_THREADING + bool IsPeerLocked() { return mutex_.IsLocked(); } +#else // WITH_WG_THREADING + bool IsPeerLocked() { return true; } +#endif // WITH_WG_THREADING + private: - WgKeypair *CreateNewKeypair(bool is_initiator, const uint8 key[WG_HASH_LEN], uint32 send_key_id, const uint8 *extfield, size_t extfield_size); + static WgKeypair *CreateNewKeypair(bool is_initiator, const uint8 key[WG_HASH_LEN], uint32 send_key_id, const uint8 *extfield, size_t extfield_size); void WriteMacToPacket(const uint8 *data, MessageMacs *mac); void DeleteKeypair(WgKeypair **kp); void CheckAndUpdateTimeOfNextKeyEvent(uint64 now); - static void CopyEndpointToPeer(WgKeypair *keypair, const IpAddr *addr); + static void CopyEndpointToPeer_Locked(WgKeypair *keypair, const IpAddr *addr); size_t WriteHandshakeExtension(uint8 *dst, WgKeypair *keypair); - void InsertKeypairInPeer(WgKeypair *keypair); + void InsertKeypairInPeer_Locked(WgKeypair *keypair); WgDevice *dev_; WgPeer *next_peer_; // Keypairs, |curr_keypair_| is the used one, the other ones are // the old ones and the next one. 
- WgKeypair *curr_keypair_; - WgKeypair *prev_keypair_; - WgKeypair *next_keypair_; + WgKeypair *curr_keypair_, *prev_keypair_, *next_keypair_; + + // Protects shared variables of the WgPeer + WG_DECLARE_LOCK(mutex_); // Timestamp when the next key related event is going to occur. uint64 time_of_next_key_event_; @@ -433,23 +443,38 @@ private: uint32 timers_; uint32 timer_value_[5]; - // Holds the entry into the key id table during handshake + // Holds the entry into the key id table during handshake - mt only. uint32 local_key_id_during_hs_; + + // Address of peer IpAddr endpoint_; + enum { + kMainThreadScheduled_ScheduleHandshake = 1, + }; + std::atomic main_thread_scheduled_; + WgPeer *main_thread_scheduled_next_; + // The broadcast address of the IPv4 network, used to block broadcast traffic // from being sent out over the VPN link. uint32 ipv4_broadcast_addr_; + // Whether the tunsafe specific handshake extensions are supported bool supports_handshake_extensions_; + // Whether any data was sent since the keepalive timer was set bool pending_keepalive_; + + // Whether to change the endpoint on incoming packets. + bool allow_endpoint_change_; + + // Whether we've sent a mac to the peer so we may expect a cookie reply back. bool expect_cookie_reply_; // Whether we want to route incoming multicast/broadcast traffic to this peer. bool allow_multicast_through_peer_; - // Whether + // Whether |mac2_cookie_| is valid. bool has_mac2_cookie_; // Number of handshakes made so far, when this gets too high we stop connecting. 
@@ -462,11 +487,18 @@ private: uint8 num_queued_packets_; Packet *first_queued_packet_, **last_queued_packet_ptr_; + // For statistics uint64 last_handshake_init_timestamp_; uint64 last_complete_handskake_timestamp_; - uint64 last_handshake_init_recv_timestamp_; - enum { MAX_CIPHERS = 16 }; + // Timestamp to detect flooding of handshakes + uint64 last_handshake_init_recv_timestamp_; // main thread only + + // Number of handshake attempts since last successful handshake + uint32 total_handshake_attempts_; + + // For dynamic ciphers, holds the list of supported ciphers. + enum { MAX_CIPHERS = 4 }; uint8 cipher_prio_; uint8 num_ciphers_; uint8 ciphers_[MAX_CIPHERS]; @@ -482,19 +514,19 @@ private: uint8 e_priv[WG_PUBLIC_KEY_LEN]; }; HandshakeState hs_; - // Remote's static public key - Written only by Init + // Remote's static public key - init only. uint8 s_remote_[WG_PUBLIC_KEY_LEN]; - // Remote's preshared key - Written only by Init + // Remote's preshared key - init only. uint8 preshared_key_[WG_SYMMETRIC_KEY_LEN]; - // Precomputed DH(spriv_local, spub_remote). + // Precomputed DH(spriv_local, spub_remote) - init only. uint8 s_priv_pub_[WG_PUBLIC_KEY_LEN]; - // The most recent seen timestamp, only accept higher timestamps. - uint8 last_timestamp_[WG_TIMESTAMP_LEN]; - // Precomputed key for decrypting cookies from the peer. + // The most recent seen timestamp, only accept higher timestamps - mt only. + uint8 last_timestamp_[WG_TIMESTAMP_LEN]; + // Precomputed key for decrypting cookies from the peer - init only. uint8 precomputed_cookie_key_[WG_SYMMETRIC_KEY_LEN]; - // Precomputed key for sending MACs to the peer. + // Precomputed key for sending MACs to the peer - init only. uint8 precomputed_mac1_key_[WG_SYMMETRIC_KEY_LEN]; - // The last mac value sent, required to make cookies + // The last mac value sent, required to make cookies - mt only. 
uint8 sent_mac1_[WG_COOKIE_LEN]; // The mac2 cookie that gets appended to outgoing packets uint8 mac2_cookie_[WG_COOKIE_LEN]; @@ -520,10 +552,10 @@ public: BITMAP_MASK = BITMAP_SIZE - 1, }; - uint64 expected_seq_nr() const { return expected_seq_nr_; } + const uint64 expected_seq_nr() const { return expected_seq_nr_; } private: - uint64 expected_seq_nr_; + std::atomic expected_seq_nr_; uint32 bitmap_[BITMAP_SIZE]; }; @@ -574,7 +606,7 @@ struct WgKeypair { // Used so we know when to send out ack packets. uint32 incoming_packet_count; - // Id of the key in my map + // Id of the key in my map. (MainThread) uint32 local_key_id; // Id of the key in their map uint32 remote_key_id; @@ -602,7 +634,6 @@ struct WgKeypair { // State for packet compressor IpzipState ipzip_state_; #endif // WITH_HANDSHAKE_EXT - }; void WgKeypairEncryptPayload(uint8 *dst, const size_t src_len,