diff -rNu openssl-1.0.2e/Configure openssl-1.0.2e-modified/Configure --- openssl-1.0.2e/Configure 2015-12-03 15:04:23.000000000 +0100 +++ openssl-1.0.2e-modified/Configure 2016-02-08 16:12:00.592614754 +0100 @@ -143,25 +143,25 @@ my $bits1="THIRTY_TWO_BIT "; my $bits2="SIXTY_FOUR_BIT "; -my $x86_asm="x86cpuid.o:bn-586.o co-586.o x86-mont.o x86-gf2m.o::des-586.o crypt586.o:aes-586.o vpaes-x86.o aesni-x86.o:bf-586.o:md5-586.o:sha1-586.o sha256-586.o sha512-586.o:cast-586.o:rc4-586.o:rmd-586.o:rc5-586.o:wp_block.o wp-mmx.o:cmll-x86.o:ghash-x86.o:"; +my $x86_asm="x86cpuid.o:bn-586.o co-586.o x86-mont.o x86-gf2m.o::des-586.o crypt586.o:aes-586.o vpaes-x86.o aesni-x86.o:bf-586.o:md5-586.o:sha1-586.o sha256-586.o sha512-586.o:cast-586.o:rc4-586.o:rmd-586.o:rc5-586.o:wp_block.o wp-mmx.o:cmll-x86.o:ghash-x86.o::"; my $x86_elf_asm="$x86_asm:elf"; -my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o:ecp_nistz256.o ecp_nistz256-x86_64.o::aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o aesni-mb-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o sha1-mb-x86_64.o sha256-mb-x86_64.o::rc4-x86_64.o rc4-md5-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o:ghash-x86_64.o aesni-gcm-x86_64.o:"; -my $ia64_asm="ia64cpuid.o:bn-ia64.o ia64-mont.o:::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o::rc4-ia64.o rc4_skey.o:::::ghash-ia64.o::void"; -my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o vis3-mont.o sparct4-mont.o sparcv9-gf2m.o::des_enc-sparc.o fcrypt_b.o dest4-sparcv9.o:aes_core.o aes_cbc.o aes-sparcv9.o aest4-sparcv9.o::md5-sparcv9.o:sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o::::::camellia.o cmll_misc.o cmll_cbc.o cmllt4-sparcv9.o:ghash-sparcv9.o::void"; -my $sparcv8_asm=":sparcv8.o::des_enc-sparc.o fcrypt_b.o:::::::::::::void"; -my $alpha_asm="alphacpuid.o:bn_asm.o alpha-mont.o::::::sha1-alpha.o:::::::ghash-alpha.o::void"; -my $mips64_asm=":bn-mips.o mips-mont.o:::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o sha512-mips.o::::::::"; +my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o:ecp_nistz256.o ecp_nistz256-x86_64.o::aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o aesni-mb-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o sha1-mb-x86_64.o sha256-mb-x86_64.o::rc4-x86_64.o rc4-md5-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o:ghash-x86_64.o aesni-gcm-x86_64.o::chacha20_avx.o poly1305_avx.o chacha20_avx2.o poly1305_avx2.o"; +my $ia64_asm="ia64cpuid.o:bn-ia64.o ia64-mont.o:::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o::rc4-ia64.o rc4_skey.o:::::ghash-ia64.o:::void"; +my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o vis3-mont.o sparct4-mont.o sparcv9-gf2m.o::des_enc-sparc.o fcrypt_b.o dest4-sparcv9.o:aes_core.o aes_cbc.o aes-sparcv9.o aest4-sparcv9.o::md5-sparcv9.o:sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o::::::camellia.o cmll_misc.o cmll_cbc.o cmllt4-sparcv9.o:ghash-sparcv9.o:::void"; +my $sparcv8_asm=":sparcv8.o::des_enc-sparc.o fcrypt_b.o::::::::::::::void"; +my $alpha_asm="alphacpuid.o:bn_asm.o alpha-mont.o::::::sha1-alpha.o:::::::ghash-alpha.o:::void"; +my $mips64_asm=":bn-mips.o mips-mont.o:::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o 
sha512-mips.o:::::::::"; my $mips32_asm=$mips64_asm; $mips32_asm =~ s/\s*sha512\-mips\.o//; -my $s390x_asm="s390xcap.o s390xcpuid.o:bn-s390x.o s390x-mont.o s390x-gf2m.o:::aes-s390x.o aes-ctr.o aes-xts.o:::sha1-s390x.o sha256-s390x.o sha512-s390x.o::rc4-s390x.o:::::ghash-s390x.o:"; -my $armv4_asm="armcap.o armv4cpuid.o:bn_asm.o armv4-mont.o armv4-gf2m.o:::aes_cbc.o aes-armv4.o bsaes-armv7.o aesv8-armx.o:::sha1-armv4-large.o sha256-armv4.o sha512-armv4.o:::::::ghash-armv4.o ghashv8-armx.o::void"; -my $aarch64_asm="armcap.o arm64cpuid.o mem_clr.o::::aes_core.o aes_cbc.o aesv8-armx.o:::sha1-armv8.o sha256-armv8.o sha512-armv8.o:::::::ghashv8-armx.o:"; -my $parisc11_asm="pariscid.o:bn_asm.o parisc-mont.o:::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o::32"; -my $parisc20_asm="pariscid.o:pa-risc2W.o parisc-mont.o:::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o::64"; -my $ppc64_asm="ppccpuid.o ppccap.o:bn-ppc.o ppc-mont.o ppc64-mont.o:::aes_core.o aes_cbc.o aes-ppc.o vpaes-ppc.o aesp8-ppc.o:::sha1-ppc.o sha256-ppc.o sha512-ppc.o sha256p8-ppc.o sha512p8-ppc.o:::::::ghashp8-ppc.o:"; +my $s390x_asm="s390xcap.o s390xcpuid.o:bn-s390x.o s390x-mont.o s390x-gf2m.o:::aes-s390x.o aes-ctr.o aes-xts.o:::sha1-s390x.o sha256-s390x.o sha512-s390x.o::rc4-s390x.o:::::ghash-s390x.o::"; +my $armv4_asm="armcap.o armv4cpuid.o:bn_asm.o armv4-mont.o armv4-gf2m.o:::aes_cbc.o aes-armv4.o bsaes-armv7.o aesv8-armx.o:::sha1-armv4-large.o sha256-armv4.o sha512-armv4.o:::::::ghash-armv4.o ghashv8-armx.o:::void"; +my $aarch64_asm="armcap.o arm64cpuid.o mem_clr.o::::aes_core.o aes_cbc.o aesv8-armx.o:::sha1-armv8.o sha256-armv8.o sha512-armv8.o:::::::ghashv8-armx.o::"; +my $parisc11_asm="pariscid.o:bn_asm.o parisc-mont.o:::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o:::32"; +my $parisc20_asm="pariscid.o:pa-risc2W.o parisc-mont.o:::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o:::64"; +my $ppc64_asm="ppccpuid.o ppccap.o:bn-ppc.o ppc-mont.o ppc64-mont.o:::aes_core.o aes_cbc.o aes-ppc.o vpaes-ppc.o aesp8-ppc.o:::sha1-ppc.o sha256-ppc.o sha512-ppc.o sha256p8-ppc.o sha512p8-ppc.o:::::::ghashp8-ppc.o::"; my $ppc32_asm=$ppc64_asm; -my $no_asm="::::::::::::::::void"; +my $no_asm=":::::::::::::::::void"; # As for $BSDthreads. Idea is to maintain "collective" set of flags, # which would cover all BSD flavors. 
-pthread applies to them all, @@ -213,7 +213,7 @@ "debug-linux-ppro","gcc:-DBN_DEBUG -DREF_CHECK -DCONF_DEBUG -DBN_CTX_DEBUG -DCRYPTO_MDEBUG -DL_ENDIAN -g -mcpu=pentiumpro -Wall::-D_REENTRANT::-ldl:BN_LLONG ${x86_gcc_des} ${x86_gcc_opts}:${x86_elf_asm}:dlfcn", "debug-linux-elf","gcc:-DBN_DEBUG -DREF_CHECK -DCONF_DEBUG -DBN_CTX_DEBUG -DCRYPTO_MDEBUG -DL_ENDIAN -g -march=i486 -Wall::-D_REENTRANT::-lefence -ldl:BN_LLONG ${x86_gcc_des} ${x86_gcc_opts}:${x86_elf_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", "debug-linux-elf-noefence","gcc:-DBN_DEBUG -DREF_CHECK -DCONF_DEBUG -DBN_CTX_DEBUG -DCRYPTO_MDEBUG -DL_ENDIAN -g -march=i486 -Wall::-D_REENTRANT::-ldl:BN_LLONG ${x86_gcc_des} ${x86_gcc_opts}:${x86_elf_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", -"debug-linux-ia32-aes", "gcc:-DAES_EXPERIMENTAL -DL_ENDIAN -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:BN_LLONG ${x86_gcc_des} ${x86_gcc_opts}:x86cpuid.o:bn-586.o co-586.o x86-mont.o::des-586.o crypt586.o:aes_x86core.o aes_cbc.o aesni-x86.o:bf-586.o:md5-586.o:sha1-586.o sha256-586.o sha512-586.o:cast-586.o:rc4-586.o:rmd-586.o:rc5-586.o:wp_block.o wp-mmx.o::ghash-x86.o::elf:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", +"debug-linux-ia32-aes", "gcc:-DAES_EXPERIMENTAL -DL_ENDIAN -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:BN_LLONG ${x86_gcc_des} ${x86_gcc_opts}:x86cpuid.o:bn-586.o co-586.o x86-mont.o::des-586.o crypt586.o:aes_x86core.o aes_cbc.o aesni-x86.o:bf-586.o:md5-586.o:sha1-586.o sha256-586.o sha512-586.o:cast-586.o:rc4-586.o:rmd-586.o:rc5-586.o:wp_block.o wp-mmx.o::ghash-x86.o:::elf:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", "debug-linux-generic32","gcc:-DBN_DEBUG -DREF_CHECK -DCONF_DEBUG -DCRYPTO_MDEBUG -g -Wall::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${no_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", "debug-linux-generic64","gcc:-DBN_DEBUG -DREF_CHECK -DCONF_DEBUG -DCRYPTO_MDEBUG -DTERMIO -g -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${no_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", "debug-linux-x86_64","gcc:-DBN_DEBUG -DREF_CHECK -DCONF_DEBUG -DCRYPTO_MDEBUG -m64 -DL_ENDIAN -g -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL:${x86_64_asm}:elf:dlfcn:linux-shared:-fPIC:-m64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):::64", @@ -320,7 +320,7 @@ "hpux-parisc-gcc","gcc:-O3 -DB_ENDIAN -DBN_DIV2W::-D_REENTRANT::-Wl,+s -ldld:BN_LLONG DES_PTR DES_UNROLL DES_RISC1:${no_asm}:dl:hpux-shared:-fPIC:-shared:.sl.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", "hpux-parisc1_1-gcc","gcc:-O3 -DB_ENDIAN -DBN_DIV2W::-D_REENTRANT::-Wl,+s -ldld:BN_LLONG DES_PTR DES_UNROLL DES_RISC1:${parisc11_asm}:dl:hpux-shared:-fPIC:-shared:.sl.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):::/pa1.1", "hpux-parisc2-gcc","gcc:-march=2.0 -O3 -DB_ENDIAN -D_REENTRANT::::-Wl,+s -ldld:SIXTY_FOUR_BIT RC4_CHAR RC4_CHUNK DES_PTR DES_UNROLL DES_RISC1:".eval{my $asm=$parisc20_asm;$asm=~s/2W\./2\./;$asm=~s/:64/:32/;$asm}.":dl:hpux-shared:-fPIC:-shared:.sl.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):::/pa20_32", -"hpux64-parisc2-gcc","gcc:-O3 -DB_ENDIAN -D_REENTRANT::::-ldl:SIXTY_FOUR_BIT_LONG MD2_CHAR RC4_INDEX RC4_CHAR DES_UNROLL DES_RISC1 DES_INT::pa-risc2W.o:::::::::::::::void:dlfcn:hpux-shared:-fpic:-shared:.sl.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):::/pa20_64", +"hpux64-parisc2-gcc","gcc:-O3 -DB_ENDIAN -D_REENTRANT::::-ldl:SIXTY_FOUR_BIT_LONG MD2_CHAR RC4_INDEX RC4_CHAR DES_UNROLL 
DES_RISC1 DES_INT::pa-risc2W.o::::::::::::::::void:dlfcn:hpux-shared:-fpic:-shared:.sl.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):::/pa20_64", # More attempts at unified 10.X and 11.X targets for HP C compiler. # @@ -577,9 +577,9 @@ # Visual C targets # # Win64 targets, WIN64I denotes IA-64 and WIN64A - AMD64 -"VC-WIN64I","cl:-W3 -Gs0 -Gy -nologo -DOPENSSL_SYSNAME_WIN32 -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -DUNICODE -D_UNICODE -D_CRT_SECURE_NO_DEPRECATE:::WIN64I::SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN:ia64cpuid.o:ia64.o ia64-mont.o:::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o:::::::ghash-ia64.o::ias:win32", +"VC-WIN64I","cl:-W3 -Gs0 -Gy -nologo -DOPENSSL_SYSNAME_WIN32 -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -DUNICODE -D_UNICODE -D_CRT_SECURE_NO_DEPRECATE:::WIN64I::SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN:ia64cpuid.o:ia64.o ia64-mont.o:::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o::::::::ghash-ia64.o::ias:win32", "VC-WIN64A","cl:-W3 -Gs0 -Gy -nologo -DOPENSSL_SYSNAME_WIN32 -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -DUNICODE -D_UNICODE -D_CRT_SECURE_NO_DEPRECATE:::WIN64A::SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN:".eval{my $asm=$x86_64_asm;$asm=~s/x86_64-gcc\.o/bn_asm.o/;$asm}.":auto:win32", -"debug-VC-WIN64I","cl:-W3 -Gs0 -Gy -Zi -nologo -DOPENSSL_SYSNAME_WIN32 -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -DUNICODE -D_UNICODE -D_CRT_SECURE_NO_DEPRECATE:::WIN64I::SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN:ia64cpuid.o:ia64.o:::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o:::::::ghash-ia64.o::ias:win32", +"debug-VC-WIN64I","cl:-W3 -Gs0 -Gy -Zi -nologo -DOPENSSL_SYSNAME_WIN32 -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -DUNICODE -D_UNICODE -D_CRT_SECURE_NO_DEPRECATE:::WIN64I::SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN:ia64cpuid.o:ia64.o:::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o::::::::ghash-ia64.o::ias:win32", "debug-VC-WIN64A","cl:-W3 -Gs0 -Gy -Zi -nologo -DOPENSSL_SYSNAME_WIN32 -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -DUNICODE -D_UNICODE -D_CRT_SECURE_NO_DEPRECATE:::WIN64A::SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN:".eval{my $asm=$x86_64_asm;$asm=~s/x86_64-gcc\.o/bn_asm.o/;$asm}.":auto:win32", # x86 Win32 target defaults to ANSI API, if you want UNICODE, complement # 'perl Configure VC-WIN32' with '-DUNICODE -D_UNICODE' @@ -707,6 +707,7 @@ my $idx_cmll_obj = $idx++; my $idx_modes_obj = $idx++; my $idx_engines_obj = $idx++; +my $idx_chapoly_obj = $idx++; my $idx_perlasm_scheme = $idx++; my $idx_dso_scheme = $idx++; my $idx_shared_target = $idx++; @@ -749,6 +750,7 @@ my $bn_asm ="bn_asm.o"; my $des_enc="des_enc.o fcrypt_b.o"; my $aes_enc="aes_core.o aes_cbc.o"; +my $chapoly_enc=""; my $bf_enc ="bf_enc.o"; my $cast_enc="c_enc.o"; my $rc4_enc="rc4_enc.o rc4_skey.o"; @@ -1207,7 +1209,7 @@ print "IsMK1MF=$IsMK1MF\n"; -my @fields = split(/\s*:\s*/,$table{$target} . ":" x 30 , -1); +my @fields = split(/\s*:\s*/,$table{$target} . ":" x 31 , -1); my $cc = $fields[$idx_cc]; # Allow environment CC to override compiler... 
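# (Annotation, not part of the patch: each $table entry is a single
# ':'-separated record, and the new ChaCha20-Poly1305 object field sits
# between the engines objects and the perlasm scheme. That is why every asm
# string above gains one extra ':', why the pad in the split() above grows
# from ":" x 30 to ":" x 31, and why $idx_chapoly_obj picks the field out;
# targets without the AVX/AVX2 code leave the field empty.)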
if($ENV{CC}) { @@ -1236,6 +1238,7 @@ my $cmll_obj = $fields[$idx_cmll_obj]; my $modes_obj = $fields[$idx_modes_obj]; my $engines_obj = $fields[$idx_engines_obj]; +my $chapoly_obj = $fields[$idx_chapoly_obj]; my $perlasm_scheme = $fields[$idx_perlasm_scheme]; my $dso_scheme = $fields[$idx_dso_scheme]; my $shared_target = $fields[$idx_shared_target]; @@ -1402,7 +1405,7 @@ { $cpuid_obj=$bn_obj=$ec_obj= $des_obj=$aes_obj=$bf_obj=$cast_obj=$rc4_obj=$rc5_obj=$cmll_obj= - $modes_obj=$sha1_obj=$md5_obj=$rmd160_obj=$wp_obj=$engines_obj=""; + $modes_obj=$sha1_obj=$md5_obj=$rmd160_obj=$wp_obj=$engines_obj=$chapoly_obj=""; } if (!$no_shared) @@ -1555,6 +1558,14 @@ $cast_obj=$cast_enc unless ($cast_obj =~ /\.o$/); $rc4_obj=$rc4_enc unless ($rc4_obj =~ /\.o$/); $rc5_obj=$rc5_enc unless ($rc5_obj =~ /\.o$/); +if ($chapoly_obj =~ /\.o$/) + { + $cflags.=" -DCHAPOLY_x86_64_ASM"; + } +else + { + $chapoly_obj=$chapoly_enc; + } if ($sha1_obj =~ /\.o$/) { # $sha1_obj=$sha1_enc; @@ -1737,6 +1748,7 @@ s/^WP_ASM_OBJ=.*$/WP_ASM_OBJ= $wp_obj/; s/^CMLL_ENC=.*$/CMLL_ENC= $cmll_obj/; s/^MODES_ASM_OBJ.=*$/MODES_ASM_OBJ= $modes_obj/; + s/^CHAPOLY_ENC=.*$/CHAPOLY_ENC= $chapoly_obj/; s/^ENGINES_ASM_OBJ.=*$/ENGINES_ASM_OBJ= $engines_obj/; s/^PERLASM_SCHEME=.*$/PERLASM_SCHEME= $perlasm_scheme/; s/^PROCESSOR=.*/PROCESSOR= $processor/; @@ -1799,6 +1811,7 @@ print "CMLL_ENC =$cmll_obj\n"; print "MODES_OBJ =$modes_obj\n"; print "ENGINES_OBJ =$engines_obj\n"; +print "CHAPOLY_ENC =$chapoly_obj\n"; print "PROCESSOR =$processor\n"; print "RANLIB =$ranlib\n"; print "ARFLAGS =$arflags\n"; @@ -2197,7 +2210,7 @@ my ($cc, $cflags, $unistd, $thread_cflag, $sys_id, $lflags, $bn_ops, $cpuid_obj, $bn_obj, $ec_obj, $des_obj, $aes_obj, $bf_obj, $md5_obj, $sha1_obj, $cast_obj, $rc4_obj, $rmd160_obj, - $rc5_obj, $wp_obj, $cmll_obj, $modes_obj, $engines_obj, + $rc5_obj, $wp_obj, $cmll_obj, $modes_obj, $engines_obj, $chapoly_obj, $perlasm_scheme, $dso_scheme, $shared_target, $shared_cflag, $shared_ldflag, $shared_extension, $ranlib, $arflags, $multilib)= split(/\s*:\s*/,$table{$target} . ":" x 30 , -1); @@ -2228,6 +2241,7 @@ \$cmll_obj = $cmll_obj \$modes_obj = $modes_obj \$engines_obj = $engines_obj +\$chapoly_obj = $chapoly_obj \$perlasm_scheme = $perlasm_scheme \$dso_scheme = $dso_scheme \$shared_target= $shared_target diff -rNu openssl-1.0.2e/Makefile.org openssl-1.0.2e-modified/Makefile.org --- openssl-1.0.2e/Makefile.org 2015-12-03 15:04:23.000000000 +0100 +++ openssl-1.0.2e-modified/Makefile.org 2016-02-08 16:12:00.593614754 +0100 @@ -91,6 +91,7 @@ EC_ASM= DES_ENC= des_enc.o fcrypt_b.o AES_ENC= aes_core.o aes_cbc.o +CHAPOLY_ENC= BF_ENC= bf_enc.o CAST_ENC= c_enc.o RC4_ENC= rc4_enc.o @@ -148,7 +149,7 @@ bn ec rsa dsa ecdsa dh ecdh dso engine \ buffer bio stack lhash rand err \ evp asn1 pem x509 x509v3 conf txt_db pkcs7 pkcs12 comp ocsp ui krb5 \ - cms pqueue ts jpake srp store cmac + cms pqueue ts jpake srp store cmac chacha20poly1305 # keep in mind that the above list is adjusted by ./Configure # according to no-xxx arguments... 
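(Annotation, not part of the patch: the apps/speed.c hunk below drives the
new cipher through the stock EVP interface. A minimal sketch of that usage,
assuming only the EVP_chacha20_poly1305() entry point this patch adds;
"key32" is speed.c's static 32-byte test key.)

    EVP_CIPHER_CTX ctx;                  /* needs <openssl/evp.h> */
    EVP_CIPHER_CTX_init(&ctx);
    /* 256-bit key, NULL IV, enc=1, matching the benchmark call below */
    EVP_CipherInit_ex(&ctx, EVP_chacha20_poly1305(), NULL, key32, NULL, 1);
    /* ... then EVP_Cipher() over each buffer size ... */
    EVP_CIPHER_CTX_cleanup(&ctx);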
@@ -235,6 +236,7 @@ WP_ASM_OBJ='$(WP_ASM_OBJ)' \ MODES_ASM_OBJ='$(MODES_ASM_OBJ)' \ ENGINES_ASM_OBJ='$(ENGINES_ASM_OBJ)' \ + CHAPOLY_ENC='$(CHAPOLY_ENC)' \ PERLASM_SCHEME='$(PERLASM_SCHEME)' \ FIPSLIBDIR='${FIPSLIBDIR}' \ FIPSDIR='${FIPSDIR}' \ diff -rNu openssl-1.0.2e/apps/speed.c openssl-1.0.2e-modified/apps/speed.c --- openssl-1.0.2e/apps/speed.c 2015-12-03 15:04:23.000000000 +0100 +++ openssl-1.0.2e-modified/apps/speed.c 2016-02-08 16:12:00.594614754 +0100 @@ -226,7 +226,7 @@ # endif # undef BUFSIZE -# define BUFSIZE ((long)1024*8+1) +# define BUFSIZE ((long)1024*8+16) static volatile int run = 0; static int mr = 0; @@ -241,7 +241,7 @@ static int do_multi(int multi); # endif -# define ALGOR_NUM 30 +# define ALGOR_NUM 31 # define SIZE_NUM 5 # define RSA_NUM 4 # define DSA_NUM 3 @@ -256,7 +256,7 @@ "aes-128 cbc", "aes-192 cbc", "aes-256 cbc", "camellia-128 cbc", "camellia-192 cbc", "camellia-256 cbc", "evp", "sha256", "sha512", "whirlpool", - "aes-128 ige", "aes-192 ige", "aes-256 ige", "ghash" + "aes-128 ige", "aes-192 ige", "aes-256 ige", "ghash", "chacha20-poly1305" }; static double results[ALGOR_NUM][SIZE_NUM]; @@ -516,6 +516,7 @@ # define D_IGE_192_AES 27 # define D_IGE_256_AES 28 # define D_GHASH 29 +# define D_CHAPOLY 30 double d = 0.0; long c[ALGOR_NUM][SIZE_NUM]; # define R_DSA_512 0 @@ -972,6 +973,11 @@ doit[D_CBC_256_CML] = 1; } else # endif +# ifndef OPENSSL_NO_CHACHA_POLY + if (strcmp(*argv,"chacha20-poly1305") == 0) { + doit[D_CHAPOLY] = 1; + } else +# endif # ifndef OPENSSL_NO_RSA if (strcmp(*argv, "rsa") == 0) { rsa_doit[R_RSA_512] = 1; @@ -1139,7 +1145,9 @@ BIO_printf(bio_err, "rc4"); # endif BIO_printf(bio_err, "\n"); - +# ifndef OPENSSL_NO_CHACHA_POLY + BIO_printf(bio_err,"chacha20-poly1305\n"); +# endif # ifndef OPENSSL_NO_RSA BIO_printf(bio_err, "rsa512 rsa1024 rsa2048 rsa4096\n"); # endif @@ -1370,6 +1378,7 @@ c[D_IGE_192_AES][0] = count; c[D_IGE_256_AES][0] = count; c[D_GHASH][0] = count; + c[D_CHAPOLY][0] = count; for (i = 1; i < SIZE_NUM; i++) { c[D_MD2][i] = c[D_MD2][0] * 4 * lengths[0] / lengths[i]; @@ -1862,6 +1871,23 @@ } } # endif +# ifndef OPENSSL_NO_CHACHA_POLY + if (doit[D_CHAPOLY]) { + EVP_CIPHER_CTX ctx; + EVP_CIPHER_CTX_init(&ctx); + EVP_CipherInit_ex(&ctx,EVP_chacha20_poly1305(),NULL,key32,NULL,1); + for (j=0; j $@ +poly1305_avx.s:asm/poly1305_avx.pl + $(PERL) asm/poly1305_avx.pl $(PERLASM_SCHEME) > $@ +chacha20_avx2.s:asm/chacha20_avx2.pl + $(PERL) asm/chacha20_avx2.pl $(PERLASM_SCHEME) > $@ +poly1305_avx2.s:asm/poly1305_avx2.pl + $(PERL) asm/poly1305_avx2.pl $(PERLASM_SCHEME) > $@ + +files: + $(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO + +links: + @$(PERL) $(TOP)/util/mklink.pl ../../include/openssl $(EXHEADER) + @$(PERL) $(TOP)/util/mklink.pl ../../test $(TEST) + @$(PERL) $(TOP)/util/mklink.pl ../../apps $(APPS) + +install: + @[ -n "$(INSTALLTOP)" ] # should be set by top Makefile... + @headerlist="$(EXHEADER)"; for i in $$headerlist ; \ + do \ + (cp $$i $(INSTALL_PREFIX)$(INSTALLTOP)/include/openssl/$$i; \ + chmod 644 $(INSTALL_PREFIX)$(INSTALLTOP)/include/openssl/$$i ); \ + done; + +tags: + ctags $(SRC) + +tests: + +lint: + lint -DLINT $(INCLUDES) $(SRC)>fluff + +depend: + @[ -n "$(MAKEDEPEND)" ] # should be set by upper Makefile... 
+ $(MAKEDEPEND) -- $(CFLAG) $(INCLUDES) $(DEPFLAG) -- $(PROGS) $(LIBSRC) + +dclean: + $(PERL) -pe 'if (/^# DO NOT DELETE THIS LINE/) {print; exit(0);}' $(MAKEFILE) >Makefile.new + mv -f Makefile.new $(MAKEFILE) + +clean: + rm -f *.s *.o *.obj lib tags core .pure .nfs* *.old *.bak fluff + +# DO NOT DELETE THIS LINE -- make depend depends on it. + +chacha20.o: ../../include/openssl/chacha20poly1305.h chacha20.c +poly1305.o: ../../include/openssl/chacha20poly1305.h poly1305.c diff -rNu openssl-1.0.2e/crypto/chacha20poly1305/asm/chacha20_avx.pl openssl-1.0.2e-modified/crypto/chacha20poly1305/asm/chacha20_avx.pl --- openssl-1.0.2e/crypto/chacha20poly1305/asm/chacha20_avx.pl 1970-01-01 01:00:00.000000000 +0100 +++ openssl-1.0.2e-modified/crypto/chacha20poly1305/asm/chacha20_avx.pl 2016-02-08 16:12:00.595614754 +0100 @@ -0,0 +1,388 @@ +#!/usr/bin/env perl + +############################################################################## +# # +# Copyright 2014 Intel Corporation # +# # +# Licensed under the Apache License, Version 2.0 (the "License"); # +# you may not use this file except in compliance with the License. # +# You may obtain a copy of the License at # +# # +# http://www.apache.org/licenses/LICENSE-2.0 # +# # +# Unless required by applicable law or agreed to in writing, software # +# distributed under the License is distributed on an "AS IS" BASIS, # +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # +# See the License for the specific language governing permissions and # +# limitations under the License. # +# # +############################################################################## +# # +# Developers and authors: # +# Shay Gueron (1, 2), and Vlad Krasnov (1) # +# (1) Intel Corporation, Israel Development Center # +# (2) University of Haifa # +# # +# Related work: # +# M. Goll, S. Gueron, "Vectorization on ChaCha Stream Cipher", IEEE # +# Proceedings of 11th International Conference on Information # +# Technology: New Generations (ITNG 2014), 612-615 (2014). # +# M. Goll, S. Gueron, "Vectorization on Poly1305 Message Authentication Code"# +# to be published. # +# A. 
Langley, chacha20poly1305 for the AEAD head # +# https://git.openssl.org/gitweb/?p=openssl.git;a=commit;h=9a8646510b3d0a48e950748f7a2aaa12ed40d5e0 # +############################################################################## + + + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open OUT,"| \"$^X\" $xlate $flavour $output"; +*STDOUT=*OUT; + +if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` + =~ /GNU assembler version ([2-9]\.[0-9]+)/) { + $avx = ($1>=2.19) + ($1>=2.22); +} + +if ($win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && + `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { + $avx = ($1>=2.09) + ($1>=2.10); +} + +if ($win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && + `ml64 2>&1` =~ /Version ([0-9]+)\./) { + $avx = ($1>=10) + ($1>=11); +} + +if (`$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9])\.([0-9]+)/) { + my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10 + $avx = ($ver>=3.0) + ($ver>=3.01); +} + +if ($avx>=1) {{ + +sub chacha_qr { +my ($a,$b,$c,$d,$tmp)=@_; +$code.=<<___; + + vpaddd $b, $a, $a # a += b + vpxor $a, $d, $d # d ^= a + vpshufb .rol16(%rip), $d, $d # d <<<= 16 + + vpaddd $d, $c, $c # c += d + vpxor $c, $b, $b # b ^= c + vpslld \$12, $b, $tmp + vpsrld \$20, $b, $b + vpxor $tmp, $b, $b # b <<<= 12 + + vpaddd $b, $a, $a # a += b + vpxor $a, $d, $d # d ^= a + vpshufb .rol8(%rip), $d, $d # d <<<= 8 + + vpaddd $d, $c, $c # c += d + vpxor $c, $b, $b # b ^= c + + vpslld \$7, $b, $tmp + vpsrld \$25, $b, $b + vpxor $tmp, $b, $b # b <<<= 7 +___ +} + + +$code.=<<___; +.text +.align 16 +chacha20_consts: +.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' +.rol8: +.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 +.rol16: +.byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13 +.avxInc: +.quad 1,0 +___ + +{ +my ($state_4567, $state_89ab, $state_cdef, $tmp, + $v0, $v1, $v2, $v3, $v4, $v5, $v6, $v7, + $v8, $v9, $v10, $v11)=map("%xmm$_",(0..15)); + +my ($out, $in, $in_len, $key_ptr, $nonce_ptr, $counter, $nr) + =("%rdi", "%rsi", "%rdx", "%rcx", "%r8", "%r9", "%rax"); + +$code.=<<___; +.globl chacha_20_core_avx +.type chacha_20_core_avx ,\@function,2 +.align 64 +chacha_20_core_avx: + vzeroupper + + # Init state + vmovdqu 16*0($key_ptr), $state_4567 + vmovdqu 16*1($key_ptr), $state_89ab + vmovq $counter, $state_cdef + vpinsrq \$1, ($nonce_ptr), $state_cdef, $state_cdef +2: + cmp \$3*64, $in_len + jb 2f + + vmovdqa chacha20_consts(%rip), $v0 + vmovdqa chacha20_consts(%rip), $v4 + vmovdqa chacha20_consts(%rip), $v8 + + vmovdqa $state_4567, $v1 + vmovdqa $state_4567, $v5 + vmovdqa $state_4567, $v9 + + vmovdqa $state_89ab, $v2 + vmovdqa $state_89ab, $v6 + vmovdqa $state_89ab, $v10 + + vmovdqa $state_cdef, $v3 + vpaddq .avxInc(%rip), $v3, $v7 + vpaddq .avxInc(%rip), $v7, $v11 + + mov \$10, $nr + + 1: +___ + &chacha_qr($v0,$v1,$v2,$v3,$tmp); + &chacha_qr($v4,$v5,$v6,$v7,$tmp); + &chacha_qr($v8,$v9,$v10,$v11,$tmp); +$code.=<<___; + vpalignr \$4, $v1, $v1, $v1 + vpalignr \$8, $v2, $v2, $v2 + vpalignr \$12, $v3, $v3, $v3 + vpalignr \$4, $v5, $v5, $v5 + vpalignr \$8, $v6, $v6, $v6 + vpalignr \$12, $v7, $v7, $v7 + vpalignr \$4, $v9, $v9, $v9 + vpalignr \$8, $v10, $v10, $v10 + vpalignr \$12, $v11, $v11, $v11 +___ + 
&chacha_qr($v0,$v1,$v2,$v3,$tmp); + &chacha_qr($v4,$v5,$v6,$v7,$tmp); + &chacha_qr($v8,$v9,$v10,$v11,$tmp); +$code.=<<___; + vpalignr \$12, $v1, $v1, $v1 + vpalignr \$8, $v2, $v2, $v2 + vpalignr \$4, $v3, $v3, $v3 + vpalignr \$12, $v5, $v5, $v5 + vpalignr \$8, $v6, $v6, $v6 + vpalignr \$4, $v7, $v7, $v7 + vpalignr \$12, $v9, $v9, $v9 + vpalignr \$8, $v10, $v10, $v10 + vpalignr \$4, $v11, $v11, $v11 + + dec $nr + + jnz 1b + + vpaddd chacha20_consts(%rip), $v0, $v0 + vpaddd chacha20_consts(%rip), $v4, $v4 + vpaddd chacha20_consts(%rip), $v8, $v8 + + vpaddd $state_4567, $v1, $v1 + vpaddd $state_4567, $v5, $v5 + vpaddd $state_4567, $v9, $v9 + + vpaddd $state_89ab, $v2, $v2 + vpaddd $state_89ab, $v6, $v6 + vpaddd $state_89ab, $v10, $v10 + + vpaddd $state_cdef, $v3, $v3 + vpaddq .avxInc(%rip), $state_cdef, $state_cdef + vpaddd $state_cdef, $v7, $v7 + vpaddq .avxInc(%rip), $state_cdef, $state_cdef + vpaddd $state_cdef, $v11, $v11 + vpaddq .avxInc(%rip), $state_cdef, $state_cdef + + vpxor 16*0($in), $v0, $v0 + vpxor 16*1($in), $v1, $v1 + vpxor 16*2($in), $v2, $v2 + vpxor 16*3($in), $v3, $v3 + + vmovdqu $v0, 16*0($out) + vmovdqu $v1, 16*1($out) + vmovdqu $v2, 16*2($out) + vmovdqu $v3, 16*3($out) + + vpxor 16*4($in), $v4, $v4 + vpxor 16*5($in), $v5, $v5 + vpxor 16*6($in), $v6, $v6 + vpxor 16*7($in), $v7, $v7 + + vmovdqu $v4, 16*4($out) + vmovdqu $v5, 16*5($out) + vmovdqu $v6, 16*6($out) + vmovdqu $v7, 16*7($out) + + vpxor 16*8($in), $v8, $v8 + vpxor 16*9($in), $v9, $v9 + vpxor 16*10($in), $v10, $v10 + vpxor 16*11($in), $v11, $v11 + + vmovdqu $v8, 16*8($out) + vmovdqu $v9, 16*9($out) + vmovdqu $v10, 16*10($out) + vmovdqu $v11, 16*11($out) + + lea 16*12($in), $in + lea 16*12($out), $out + sub \$16*12, $in_len + + jmp 2b + +2: + cmp \$2*64, $in_len + jb 2f + + vmovdqa chacha20_consts(%rip), $v0 + vmovdqa chacha20_consts(%rip), $v4 + vmovdqa $state_4567, $v1 + vmovdqa $state_4567, $v5 + vmovdqa $state_89ab, $v2 + vmovdqa $state_89ab, $v6 + vmovdqa $state_89ab, $v10 + vmovdqa $state_cdef, $v3 + vpaddq .avxInc(%rip), $v3, $v7 + + mov \$10, $nr + + 1: +___ + &chacha_qr($v0,$v1,$v2,$v3,$tmp); + &chacha_qr($v4,$v5,$v6,$v7,$tmp); +$code.=<<___; + vpalignr \$4, $v1, $v1, $v1 + vpalignr \$8, $v2, $v2, $v2 + vpalignr \$12, $v3, $v3, $v3 + vpalignr \$4, $v5, $v5, $v5 + vpalignr \$8, $v6, $v6, $v6 + vpalignr \$12, $v7, $v7, $v7 +___ + &chacha_qr($v0,$v1,$v2,$v3,$tmp); + &chacha_qr($v4,$v5,$v6,$v7,$tmp); +$code.=<<___; + vpalignr \$12, $v1, $v1, $v1 + vpalignr \$8, $v2, $v2, $v2 + vpalignr \$4, $v3, $v3, $v3 + vpalignr \$12, $v5, $v5, $v5 + vpalignr \$8, $v6, $v6, $v6 + vpalignr \$4, $v7, $v7, $v7 + + dec $nr + + jnz 1b + + vpaddd chacha20_consts(%rip), $v0, $v0 + vpaddd chacha20_consts(%rip), $v4, $v4 + + vpaddd $state_4567, $v1, $v1 + vpaddd $state_4567, $v5, $v5 + + vpaddd $state_89ab, $v2, $v2 + vpaddd $state_89ab, $v6, $v6 + + vpaddd $state_cdef, $v3, $v3 + vpaddq .avxInc(%rip), $state_cdef, $state_cdef + vpaddd $state_cdef, $v7, $v7 + vpaddq .avxInc(%rip), $state_cdef, $state_cdef + + vpxor 16*0($in), $v0, $v0 + vpxor 16*1($in), $v1, $v1 + vpxor 16*2($in), $v2, $v2 + vpxor 16*3($in), $v3, $v3 + + vmovdqu $v0, 16*0($out) + vmovdqu $v1, 16*1($out) + vmovdqu $v2, 16*2($out) + vmovdqu $v3, 16*3($out) + + vpxor 16*4($in), $v4, $v4 + vpxor 16*5($in), $v5, $v5 + vpxor 16*6($in), $v6, $v6 + vpxor 16*7($in), $v7, $v7 + + vmovdqu $v4, 16*4($out) + vmovdqu $v5, 16*5($out) + vmovdqu $v6, 16*6($out) + vmovdqu $v7, 16*7($out) + + lea 16*8($in), $in + lea 16*8($out), $out + sub \$16*8, $in_len + + jmp 2b +2: + cmp \$64, 
$in_len + jb 2f + + vmovdqa chacha20_consts(%rip), $v0 + vmovdqa $state_4567, $v1 + vmovdqa $state_89ab, $v2 + vmovdqa $state_cdef, $v3 + + mov \$10, $nr + + 1: +___ + &chacha_qr($v0,$v1,$v2,$v3,$tmp); +$code.=<<___; + vpalignr \$4, $v1, $v1, $v1 + vpalignr \$8, $v2, $v2, $v2 + vpalignr \$12, $v3, $v3, $v3 +___ + &chacha_qr($v0,$v1,$v2,$v3,$tmp); +$code.=<<___; + vpalignr \$12, $v1, $v1, $v1 + vpalignr \$8, $v2, $v2, $v2 + vpalignr \$4, $v3, $v3, $v3 + + dec $nr + jnz 1b + + vpaddd chacha20_consts(%rip), $v0, $v0 + vpaddd $state_4567, $v1, $v1 + vpaddd $state_89ab, $v2, $v2 + vpaddd $state_cdef, $v3, $v3 + vpaddq .avxInc(%rip), $state_cdef, $state_cdef + + vpxor 16*0($in), $v0, $v0 + vpxor 16*1($in), $v1, $v1 + vpxor 16*2($in), $v2, $v2 + vpxor 16*3($in), $v3, $v3 + + vmovdqu $v0, 16*0($out) + vmovdqu $v1, 16*1($out) + vmovdqu $v2, 16*2($out) + vmovdqu $v3, 16*3($out) + + lea 16*4($in), $in + lea 16*4($out), $out + sub \$16*4, $in_len + jmp 2b +2: + vzeroupper + ret +.size chacha_20_core_avx,.-chacha_20_core_avx +___ +} +}} + + +$code =~ s/\`([^\`]*)\`/eval($1)/gem; + +print $code; + +close STDOUT; diff -rNu openssl-1.0.2e/crypto/chacha20poly1305/asm/chacha20_avx2.pl openssl-1.0.2e-modified/crypto/chacha20poly1305/asm/chacha20_avx2.pl --- openssl-1.0.2e/crypto/chacha20poly1305/asm/chacha20_avx2.pl 1970-01-01 01:00:00.000000000 +0100 +++ openssl-1.0.2e-modified/crypto/chacha20poly1305/asm/chacha20_avx2.pl 2016-02-08 16:12:00.595614754 +0100 @@ -0,0 +1,424 @@ +#!/usr/bin/env perl + +############################################################################## +# # +# Copyright 2014 Intel Corporation # +# # +# Licensed under the Apache License, Version 2.0 (the "License"); # +# you may not use this file except in compliance with the License. # +# You may obtain a copy of the License at # +# # +# http://www.apache.org/licenses/LICENSE-2.0 # +# # +# Unless required by applicable law or agreed to in writing, software # +# distributed under the License is distributed on an "AS IS" BASIS, # +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # +# See the License for the specific language governing permissions and # +# limitations under the License. # +# # +############################################################################## +# # +# Developers and authors: # +# Shay Gueron (1, 2), and Vlad Krasnov (1) # +# (1) Intel Corporation, Israel Development Center # +# (2) University of Haifa # +# # +# Related work: # +# M. Goll, S. Gueron, "Vectorization on ChaCha Stream Cipher", IEEE # +# Proceedings of 11th International Conference on Information # +# Technology: New Generations (ITNG 2014), 612-615 (2014). # +# M. Goll, S. Gueron, "Vectorization on Poly1305 Message Authentication Code"# +# to be published. # +# A. 
Langley, chacha20poly1305 for the AEAD head # +# https://git.openssl.org/gitweb/?p=openssl.git;a=commit;h=9a8646510b3d0a48e950748f7a2aaa12ed40d5e0 # +############################################################################## + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open OUT,"| \"$^X\" $xlate $flavour $output"; +*STDOUT=*OUT; + +if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` + =~ /GNU assembler version ([2-9]\.[0-9]+)/) { + $avx = ($1>=2.19) + ($1>=2.22); +} + +if ($win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && + `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { + $avx = ($1>=2.09) + ($1>=2.10); +} + +if ($win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && + `ml64 2>&1` =~ /Version ([0-9]+)\./) { + $avx = ($1>=10) + ($1>=11); +} + +if (`$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9])\.([0-9]+)/) { + my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10 + $avx = ($ver>=3.0) + ($ver>=3.01); +} + +if ($avx>=2) {{ + +sub chacha_qr { +my ($a,$b,$c,$d,$tmp)=@_; +$code.=<<___; + + vpaddd $b, $a, $a # a += b + vpxor $a, $d, $d # d ^= a + vpshufb .rol16(%rip), $d, $d # d <<<= 16 + + vpaddd $d, $c, $c # c += d + vpxor $c, $b, $b # b ^= c + vpslld \$12, $b, $tmp + vpsrld \$20, $b, $b + vpxor $tmp, $b, $b # b <<<= 12 + + vpaddd $b, $a, $a # a += b + vpxor $a, $d, $d # d ^= a + vpshufb .rol8(%rip), $d, $d # d <<<= 8 + + vpaddd $d, $c, $c # c += d + vpxor $c, $b, $b # b ^= c + + vpslld \$7, $b, $tmp + vpsrld \$25, $b, $b + vpxor $tmp, $b, $b # b <<<= 7 +___ +} + + +$code.=<<___; +.text +.align 32 +chacha20_consts: +.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' +.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' +.rol8: +.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 +.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 +.rol16: +.byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13 +.byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13 +.avx2Init: +.quad 0,0,1,0 +.avx2Inc: +.quad 2,0,2,0 +___ + +{ +my ($state_4567, $state_89ab, $state_cdef, $tmp, + $v0, $v1, $v2, $v3, $v4, $v5, $v6, $v7, + $v8, $v9, $v10, $v11)=map("%ymm$_",(0..15)); + +my $state_cdef_xmm="%xmm2"; + +my ($out, $in, $in_len, $key_ptr, $nonce_ptr, $counter, $nr) + =("%rdi", "%rsi", "%rdx", "%rcx", "%r8", "%r9", "%rax"); + +$code.=<<___; +.globl chacha_20_core_avx2 +.type chacha_20_core_avx2 ,\@function,2 +.align 64 +chacha_20_core_avx2: + vzeroupper + + # Init state + vbroadcasti128 16*0($key_ptr), $state_4567 + vbroadcasti128 16*1($key_ptr), $state_89ab + vmovq $counter, $state_cdef_xmm + vpinsrq \$1, ($nonce_ptr), $state_cdef_xmm, $state_cdef_xmm + vperm2i128 \$0x00, $state_cdef, $state_cdef, $state_cdef + vpaddq .avx2Init(%rip), $state_cdef, $state_cdef + +2: + cmp \$6*64, $in_len + jb 2f + + vmovdqa chacha20_consts(%rip), $v0 + vmovdqa chacha20_consts(%rip), $v4 + vmovdqa chacha20_consts(%rip), $v8 + + vmovdqa $state_4567, $v1 + vmovdqa $state_4567, $v5 + vmovdqa $state_4567, $v9 + + vmovdqa $state_89ab, $v2 + vmovdqa $state_89ab, $v6 + vmovdqa $state_89ab, $v10 + + vmovdqa $state_cdef, $v3 + vpaddq .avx2Inc(%rip), $v3, $v7 + vpaddq .avx2Inc(%rip), $v7, $v11 + + mov \$10, $nr + + 1: +___ + &chacha_qr($v0,$v1,$v2,$v3,$tmp); + 
&chacha_qr($v4,$v5,$v6,$v7,$tmp); + &chacha_qr($v8,$v9,$v10,$v11,$tmp); +$code.=<<___; + vpalignr \$4, $v1, $v1, $v1 + vpalignr \$8, $v2, $v2, $v2 + vpalignr \$12, $v3, $v3, $v3 + vpalignr \$4, $v5, $v5, $v5 + vpalignr \$8, $v6, $v6, $v6 + vpalignr \$12, $v7, $v7, $v7 + vpalignr \$4, $v9, $v9, $v9 + vpalignr \$8, $v10, $v10, $v10 + vpalignr \$12, $v11, $v11, $v11 +___ + &chacha_qr($v0,$v1,$v2,$v3,$tmp); + &chacha_qr($v4,$v5,$v6,$v7,$tmp); + &chacha_qr($v8,$v9,$v10,$v11,$tmp); +$code.=<<___; + vpalignr \$12, $v1, $v1, $v1 + vpalignr \$8, $v2, $v2, $v2 + vpalignr \$4, $v3, $v3, $v3 + vpalignr \$12, $v5, $v5, $v5 + vpalignr \$8, $v6, $v6, $v6 + vpalignr \$4, $v7, $v7, $v7 + vpalignr \$12, $v9, $v9, $v9 + vpalignr \$8, $v10, $v10, $v10 + vpalignr \$4, $v11, $v11, $v11 + + dec $nr + + jnz 1b + + vpaddd chacha20_consts(%rip), $v0, $v0 + vpaddd chacha20_consts(%rip), $v4, $v4 + vpaddd chacha20_consts(%rip), $v8, $v8 + + vpaddd $state_4567, $v1, $v1 + vpaddd $state_4567, $v5, $v5 + vpaddd $state_4567, $v9, $v9 + + vpaddd $state_89ab, $v2, $v2 + vpaddd $state_89ab, $v6, $v6 + vpaddd $state_89ab, $v10, $v10 + + vpaddd $state_cdef, $v3, $v3 + vpaddq .avx2Inc(%rip), $state_cdef, $state_cdef + vpaddd $state_cdef, $v7, $v7 + vpaddq .avx2Inc(%rip), $state_cdef, $state_cdef + vpaddd $state_cdef, $v11, $v11 + vpaddq .avx2Inc(%rip), $state_cdef, $state_cdef + + vperm2i128 \$0x02, $v0, $v1, $tmp + vpxor 32*0($in), $tmp, $tmp + vmovdqu $tmp, 32*0($out) + vperm2i128 \$0x02, $v2, $v3, $tmp + vpxor 32*1($in), $tmp, $tmp + vmovdqu $tmp, 32*1($out) + vperm2i128 \$0x13, $v0, $v1, $tmp + vpxor 32*2($in), $tmp, $tmp + vmovdqu $tmp, 32*2($out) + vperm2i128 \$0x13, $v2, $v3, $tmp + vpxor 32*3($in), $tmp, $tmp + vmovdqu $tmp, 32*3($out) + + vperm2i128 \$0x02, $v4, $v5, $v0 + vperm2i128 \$0x02, $v6, $v7, $v1 + vperm2i128 \$0x13, $v4, $v5, $v2 + vperm2i128 \$0x13, $v6, $v7, $v3 + + vpxor 32*4($in), $v0, $v0 + vpxor 32*5($in), $v1, $v1 + vpxor 32*6($in), $v2, $v2 + vpxor 32*7($in), $v3, $v3 + + vmovdqu $v0, 32*4($out) + vmovdqu $v1, 32*5($out) + vmovdqu $v2, 32*6($out) + vmovdqu $v3, 32*7($out) + + vperm2i128 \$0x02, $v8, $v9, $v0 + vperm2i128 \$0x02, $v10, $v11, $v1 + vperm2i128 \$0x13, $v8, $v9, $v2 + vperm2i128 \$0x13, $v10, $v11, $v3 + + vpxor 32*8($in), $v0, $v0 + vpxor 32*9($in), $v1, $v1 + vpxor 32*10($in), $v2, $v2 + vpxor 32*11($in), $v3, $v3 + + vmovdqu $v0, 32*8($out) + vmovdqu $v1, 32*9($out) + vmovdqu $v2, 32*10($out) + vmovdqu $v3, 32*11($out) + + lea 64*6($in), $in + lea 64*6($out), $out + sub \$64*6, $in_len + + jmp 2b + +2: + cmp \$4*64, $in_len + jb 2f + + vmovdqa chacha20_consts(%rip), $v0 + vmovdqa chacha20_consts(%rip), $v4 + vmovdqa $state_4567, $v1 + vmovdqa $state_4567, $v5 + vmovdqa $state_89ab, $v2 + vmovdqa $state_89ab, $v6 + vmovdqa $state_89ab, $v10 + vmovdqa $state_cdef, $v3 + vpaddq .avx2Inc(%rip), $v3, $v7 + + mov \$10, $nr + + 1: +___ + &chacha_qr($v0,$v1,$v2,$v3,$tmp); + &chacha_qr($v4,$v5,$v6,$v7,$tmp); +$code.=<<___; + vpalignr \$4, $v1, $v1, $v1 + vpalignr \$8, $v2, $v2, $v2 + vpalignr \$12, $v3, $v3, $v3 + vpalignr \$4, $v5, $v5, $v5 + vpalignr \$8, $v6, $v6, $v6 + vpalignr \$12, $v7, $v7, $v7 +___ + &chacha_qr($v0,$v1,$v2,$v3,$tmp); + &chacha_qr($v4,$v5,$v6,$v7,$tmp); +$code.=<<___; + vpalignr \$12, $v1, $v1, $v1 + vpalignr \$8, $v2, $v2, $v2 + vpalignr \$4, $v3, $v3, $v3 + vpalignr \$12, $v5, $v5, $v5 + vpalignr \$8, $v6, $v6, $v6 + vpalignr \$4, $v7, $v7, $v7 + + dec $nr + + jnz 1b + + vpaddd chacha20_consts(%rip), $v0, $v0 + vpaddd chacha20_consts(%rip), $v4, $v4 + + vpaddd 
$state_4567, $v1, $v1 + vpaddd $state_4567, $v5, $v5 + + vpaddd $state_89ab, $v2, $v2 + vpaddd $state_89ab, $v6, $v6 + + vpaddd $state_cdef, $v3, $v3 + vpaddq .avx2Inc(%rip), $state_cdef, $state_cdef + vpaddd $state_cdef, $v7, $v7 + vpaddq .avx2Inc(%rip), $state_cdef, $state_cdef + + vperm2i128 \$0x02, $v0, $v1, $v8 + vperm2i128 \$0x02, $v2, $v3, $v9 + vperm2i128 \$0x13, $v0, $v1, $v10 + vperm2i128 \$0x13, $v2, $v3, $v11 + + vpxor 32*0($in), $v8, $v8 + vpxor 32*1($in), $v9, $v9 + vpxor 32*2($in), $v10, $v10 + vpxor 32*3($in), $v11, $v11 + + vmovdqu $v8, 32*0($out) + vmovdqu $v9, 32*1($out) + vmovdqu $v10, 32*2($out) + vmovdqu $v11, 32*3($out) + + vperm2i128 \$0x02, $v4, $v5, $v0 + vperm2i128 \$0x02, $v6, $v7, $v1 + vperm2i128 \$0x13, $v4, $v5, $v2 + vperm2i128 \$0x13, $v6, $v7, $v3 + + vpxor 32*4($in), $v0, $v0 + vpxor 32*5($in), $v1, $v1 + vpxor 32*6($in), $v2, $v2 + vpxor 32*7($in), $v3, $v3 + + vmovdqu $v0, 32*4($out) + vmovdqu $v1, 32*5($out) + vmovdqu $v2, 32*6($out) + vmovdqu $v3, 32*7($out) + + lea 64*4($in), $in + lea 64*4($out), $out + sub \$64*4, $in_len + + jmp 2b +2: + cmp \$128, $in_len + jb 2f + + vmovdqa chacha20_consts(%rip), $v0 + vmovdqa $state_4567, $v1 + vmovdqa $state_89ab, $v2 + vmovdqa $state_cdef, $v3 + + mov \$10, $nr + + 1: +___ + &chacha_qr($v0,$v1,$v2,$v3,$tmp); +$code.=<<___; + vpalignr \$4, $v1, $v1, $v1 + vpalignr \$8, $v2, $v2, $v2 + vpalignr \$12, $v3, $v3, $v3 +___ + &chacha_qr($v0,$v1,$v2,$v3,$tmp); +$code.=<<___; + vpalignr \$12, $v1, $v1, $v1 + vpalignr \$8, $v2, $v2, $v2 + vpalignr \$4, $v3, $v3, $v3 + + dec $nr + jnz 1b + + vpaddd chacha20_consts(%rip), $v0, $v0 + vpaddd $state_4567, $v1, $v1 + vpaddd $state_89ab, $v2, $v2 + vpaddd $state_cdef, $v3, $v3 + vpaddq .avx2Inc(%rip), $state_cdef, $state_cdef + + vperm2i128 \$0x02, $v0, $v1, $v8 + vperm2i128 \$0x02, $v2, $v3, $v9 + vperm2i128 \$0x13, $v0, $v1, $v10 + vperm2i128 \$0x13, $v2, $v3, $v11 + + vpxor 32*0($in), $v8, $v8 + vpxor 32*1($in), $v9, $v9 + vpxor 32*2($in), $v10, $v10 + vpxor 32*3($in), $v11, $v11 + + vmovdqu $v8, 32*0($out) + vmovdqu $v9, 32*1($out) + vmovdqu $v10, 32*2($out) + vmovdqu $v11, 32*3($out) + + lea 64*2($in), $in + lea 64*2($out), $out + sub \$64*2, $in_len + jmp 2b +2: + vzeroupper + ret +.size chacha_20_core_avx2,.-chacha_20_core_avx2 +___ +} +}} + + +$code =~ s/\`([^\`]*)\`/eval($1)/gem; + +print $code; + +close STDOUT; diff -rNu openssl-1.0.2e/crypto/chacha20poly1305/asm/poly1305_avx.pl openssl-1.0.2e-modified/crypto/chacha20poly1305/asm/poly1305_avx.pl --- openssl-1.0.2e/crypto/chacha20poly1305/asm/poly1305_avx.pl 1970-01-01 01:00:00.000000000 +0100 +++ openssl-1.0.2e-modified/crypto/chacha20poly1305/asm/poly1305_avx.pl 2016-02-08 16:12:00.596614754 +0100 @@ -0,0 +1,717 @@ +############################################################################## +# # +# Copyright 2014 Intel Corporation # +# # +# Licensed under the Apache License, Version 2.0 (the "License"); # +# you may not use this file except in compliance with the License. # +# You may obtain a copy of the License at # +# # +# http://www.apache.org/licenses/LICENSE-2.0 # +# # +# Unless required by applicable law or agreed to in writing, software # +# distributed under the License is distributed on an "AS IS" BASIS, # +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # +# See the License for the specific language governing permissions and # +# limitations under the License. 
# +# # +############################################################################## +# # +# Developers and authors: # +# Shay Gueron (1, 2), and Vlad Krasnov (1) # +# (1) Intel Corporation, Israel Development Center # +# (2) University of Haifa # +# # +############################################################################## +# state: +# 0: r[0] || r^2[0] +# 16: r[1] || r^2[1] +# 32: r[2] || r^2[2] +# 48: r[3] || r^2[3] +# 64: r[4] || r^2[4] +# 80: r[1]*5 || r^2[1]*5 +# 96: r[2]*5 || r^2[2]*5 +#112: r[3]*5 || r^2[3]*5 +#128: r[4]*5 || r^2[4]*5 +#144: k +#160: A0 +#164: A1 +#168: A2 +#172: A3 +#176: A4 +#180: END + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open OUT,"| \"$^X\" $xlate $flavour $output"; +*STDOUT=*OUT; + +if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` + =~ /GNU assembler version ([2-9]\.[0-9]+)/) { + $avx = ($1>=2.19) + ($1>=2.22); +} + +if ($win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && + `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { + $avx = ($1>=2.09) + ($1>=2.10); +} + +if ($win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && + `ml64 2>&1` =~ /Version ([0-9]+)\./) { + $avx = ($1>=10) + ($1>=11); +} + +if (`$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9])\.([0-9]+)/) { + my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10 + $avx = ($ver>=3.0) + ($ver>=3.01); +} + +if ($avx>=1) {{ + +my ($_r0_, $_r1_, $_r2_, $_r3_, $_r4_, $_r1_x5, $_r2_x5, $_r3_x5, $_r4_x5, $_k_, $_A0_, $_A1_, $_A2_, $_A3_, $_A4_) += (0,16,32,48,64,80,96,112,128,144,160,164,168,172,176); + +$code.=<<___; +.text +.align 32 +.LandMask: +.quad 0x3FFFFFF, 0x3FFFFFF +.LsetBit: +.quad 0x1000000, 0x1000000 +.LrSet: +.quad 0xFFFFFFC0FFFFFFF, 0xFFFFFFC0FFFFFFF +.quad 0xFFFFFFC0FFFFFFC, 0xFFFFFFC0FFFFFFC +.Lone: +.quad 1,0 +___ + + +{ +my ($A0, $A1, $A2, $A3, $A4, + $r0, $r1, $r2, $r3, $r4, + $T0, $T1, $A5, $A6, $A7, $A8)=map("%xmm$_",(0..15)); +my ($state, $key) + =("%rdi", "%rsi"); + +$code.=<<___; +################################################################################ +# void poly1305_init_avx(void *state, uint8_t key[32]) + +.globl poly1305_init_avx +.type poly1305_init_avx, \@function, 2 +.align 64 +poly1305_init_avx: + vzeroupper + # load and convert r + vmovq 8*0($key), $r0 + vmovq 8*1($key), $T0 + vpand .LrSet(%rip), $r0, $r0 + vpand .LrSet+16(%rip), $T0, $T0 + + vpsrlq \$26, $r0, $r1 + vpand .LandMask(%rip), $r0, $r0 + vpsrlq \$26, $r1, $r2 + vpand .LandMask(%rip), $r1, $r1 + vpsllq \$12, $T0, $T1 + vpxor $T1, $r2, $r2 + vpsrlq \$26, $r2, $r3 + vpsrlq \$40, $T0, $r4 + vpand .LandMask(%rip), $r2, $r2 + vpand .LandMask(%rip), $r3, $r3 + + # SQR R + vpmuludq $r0, $r0, $A0 + vpmuludq $r1, $r0, $A1 + vpmuludq $r2, $r0, $A2 + vpmuludq $r3, $r0, $A3 + vpmuludq $r4, $r0, $A4 + + vpsllq \$1, $A1, $A1 + vpsllq \$1, $A2, $A2 + vpmuludq $r1, $r1, $T0 + vpaddq $T0, $A2, $A2 + vpmuludq $r2, $r1, $T0 + vpaddq $T0, $A3, $A3 + vpmuludq $r3, $r1, $T0 + vpaddq $T0, $A4, $A4 + vpmuludq $r4, $r1, $A5 + + vpsllq \$1, $A3, $A3 + vpsllq \$1, $A4, $A4 + vpmuludq $r2, $r2, $T0 + vpaddq $T0, $A4, $A4 + vpmuludq $r3, $r2, $T0 + vpaddq $T0, $A5, $A5 + vpmuludq $r4, $r2, $A6 + + vpsllq \$1, $A5, $A5 + vpsllq \$1, $A6, $A6 + vpmuludq $r3, 
$r3, $T0 + vpaddq $T0, $A6, $A6 + vpmuludq $r4, $r3, $A7 + + vpsllq \$1, $A7, $A7 + vpmuludq $r4, $r4, $A8 + + # Reduce + vpsrlq \$26, $A4, $T0 + vpand .LandMask(%rip), $A4, $A4 + vpaddq $T0, $A5, $A5 + + vpsllq \$2, $A5, $T0 + vpaddq $T0, $A5, $A5 + vpsllq \$2, $A6, $T0 + vpaddq $T0, $A6, $A6 + vpsllq \$2, $A7, $T0 + vpaddq $T0, $A7, $A7 + vpsllq \$2, $A8, $T0 + vpaddq $T0, $A8, $A8 + + vpaddq $A5, $A0, $A0 + vpaddq $A6, $A1, $A1 + vpaddq $A7, $A2, $A2 + vpaddq $A8, $A3, $A3 + + vpsrlq \$26, $A0, $T0 + vpand .LandMask(%rip), $A0, $A0 + vpaddq $T0, $A1, $A1 + vpsrlq \$26, $A1, $T0 + vpand .LandMask(%rip), $A1, $A1 + vpaddq $T0, $A2, $A2 + vpsrlq \$26, $A2, $T0 + vpand .LandMask(%rip), $A2, $A2 + vpaddq $T0, $A3, $A3 + vpsrlq \$26, $A3, $T0 + vpand .LandMask(%rip), $A3, $A3 + vpaddq $T0, $A4, $A4 + + vpunpcklqdq $r0, $A0, $r0 + vpunpcklqdq $r1, $A1, $r1 + vpunpcklqdq $r2, $A2, $r2 + vpunpcklqdq $r3, $A3, $r3 + vpunpcklqdq $r4, $A4, $r4 + + vmovdqu $r0, $_r0_($state) + vmovdqu $r1, $_r1_($state) + vmovdqu $r2, $_r2_($state) + vmovdqu $r3, $_r3_($state) + vmovdqu $r4, $_r4_($state) + + vpsllq \$2, $r1, $A1 + vpsllq \$2, $r2, $A2 + vpsllq \$2, $r3, $A3 + vpsllq \$2, $r4, $A4 + + vpaddq $A1, $r1, $A1 + vpaddq $A2, $r2, $A2 + vpaddq $A3, $r3, $A3 + vpaddq $A4, $r4, $A4 + + vmovdqu $A1, $_r1_x5($state) + vmovdqu $A2, $_r2_x5($state) + vmovdqu $A3, $_r3_x5($state) + vmovdqu $A4, $_r4_x5($state) + # Store k + vmovdqu 16*1($key), $T0 + vmovdqu $T0, $_k_($state) + # Init the MAC value + vpxor $T0, $T0, $T0 + vmovdqu $T0, $_A0_($state) + vmovd $T0, $_A4_($state) + vzeroupper + ret +.size poly1305_init_avx,.-poly1305_init_avx +___ +} + +{ + +my ($A0, $A1, $A2, $A3, $A4, + $T0, $T1, $R0, $R1, $R2, + $R3, $R4, $AND_MASK)=map("%xmm$_",(0..12)); + +my ($state, $in, $in_len)=("%rdi", "%rsi", "%rdx"); + +$code.=<<___; + +############################################################################### +# void* poly1305_update_avx(void* $state, void* in, uint64_t in_len) +.globl poly1305_update_avx +.type poly1305_update_avx, \@function, 2 +.align 64 +poly1305_update_avx: + + vzeroupper + vmovd $_A0_($state), $A0 + vmovd $_A1_($state), $A1 + vmovd $_A2_($state), $A2 + vmovd $_A3_($state), $A3 + vmovd $_A4_($state), $A4 + vmovdqa .LandMask(%rip), $AND_MASK + # Skip to single block case + cmp \$32, $in_len + jb 3f +1: + cmp \$16*4, $in_len + jb 1f + sub \$16*2, $in_len + # load the next two blocks + vmovdqu 16*0($in), $R2 + vmovdqu 16*1($in), $R3 + add \$16*2, $in + + vpunpcklqdq $R3, $R2, $R0 + vpunpckhqdq $R3, $R2, $R1 + + vpsrlq \$26, $R0, $R2 + vpand $AND_MASK, $R0, $R0 + vpaddq $R0, $A0, $A0 + + vpsrlq \$26, $R2, $R0 + vpand $AND_MASK, $R2, $R2 + vpaddq $R2, $A1, $A1 + + vpsllq \$12, $R1, $R2 + vpxor $R2, $R0, $R0 + vpand $AND_MASK, $R0, $R0 + vpaddq $R0, $A2, $A2 + + vpsrlq \$26, $R2, $R0 + vpsrlq \$40, $R1, $R2 + vpand $AND_MASK, $R0, $R0 + vpxor .LsetBit(%rip), $R2, $R2 + vpaddq $R0, $A3, $A3 + vpaddq $R2, $A4, $A4 + + # Multiply input by R[0] + vbroadcastss $_r0_($state), $T0 + vpmuludq $T0, $A0, $R0 + vpmuludq $T0, $A1, $R1 + vpmuludq $T0, $A2, $R2 + vpmuludq $T0, $A3, $R3 + vpmuludq $T0, $A4, $R4 + # Multiply input by R[1] (and R[1]*5) + vbroadcastss $_r1_x5($state), $T0 + vpmuludq $T0, $A4, $T1 + vpaddq $T1, $R0, $R0 + vbroadcastss $_r1_($state), $T0 + vpmuludq $T0, $A0, $T1 + vpaddq $T1, $R1, $R1 + vpmuludq $T0, $A1, $T1 + vpaddq $T1, $R2, $R2 + vpmuludq $T0, $A2, $T1 + vpaddq $T1, $R3, $R3 + vpmuludq $T0, $A3, $T1 + vpaddq $T1, $R4, $R4 + # Etc + vbroadcastss $_r2_x5($state), $T0 + vpmuludq $T0, 
$A3, $T1 + vpaddq $T1, $R0, $R0 + vpmuludq $T0, $A4, $T1 + vpaddq $T1, $R1, $R1 + vbroadcastss $_r2_($state), $T0 + vpmuludq $T0, $A0, $T1 + vpaddq $T1, $R2, $R2 + vpmuludq $T0, $A1, $T1 + vpaddq $T1, $R3, $R3 + vpmuludq $T0, $A2, $T1 + vpaddq $T1, $R4, $R4 + + vbroadcastss $_r3_x5($state), $T0 + vpmuludq $T0, $A2, $T1 + vpaddq $T1, $R0, $R0 + vpmuludq $T0, $A3, $T1 + vpaddq $T1, $R1, $R1 + vpmuludq $T0, $A4, $T1 + vpaddq $T1, $R2, $R2 + vbroadcastss $_r3_($state), $T0 + vpmuludq $T0, $A0, $T1 + vpaddq $T1, $R3, $R3 + vpmuludq $T0, $A1, $T1 + vpaddq $T1, $R4, $R4 + + vbroadcastss $_r4_x5($state), $T0 + vpmuludq $T0, $A1, $T1 + vpaddq $T1, $R0, $R0 + vpmuludq $T0, $A2, $T1 + vpaddq $T1, $R1, $R1 + vpmuludq $T0, $A3, $T1 + vpaddq $T1, $R2, $R2 + vpmuludq $T0, $A4, $T1 + vpaddq $T1, $R3, $R3 + vbroadcastss $_r4_($state), $T0 + vpmuludq $T0, $A0, $T1 + vpaddq $T1, $R4, $R4 + # Reduce + vpsrlq \$26, $R3, $T0 + vpaddq $T0, $R4, $R4 + vpand $AND_MASK, $R3, $R3 + + vpsrlq \$26, $R4, $T0 + vpsllq \$2, $T0, $T1 + vpaddq $T1, $T0, $T0 + vpaddq $T0, $R0, $R0 + vpand $AND_MASK, $R4, $R4 + + vpsrlq \$26, $R0, $T0 + vpand $AND_MASK, $R0, $A0 + vpaddq $T0, $R1, $R1 + vpsrlq \$26, $R1, $T0 + vpand $AND_MASK, $R1, $A1 + vpaddq $T0, $R2, $R2 + vpsrlq \$26, $R2, $T0 + vpand $AND_MASK, $R2, $A2 + vpaddq $T0, $R3, $R3 + vpsrlq \$26, $R3, $T0 + vpand $AND_MASK, $R3, $A3 + vpaddq $T0, $R4, $A4 + jmp 1b +1: + cmp \$16*2, $in_len + jb 1f + sub \$16*2, $in_len + # load the next two blocks + vmovdqu 16*0($in), $R2 + vmovdqu 16*1($in), $R3 + add \$16*2, $in + + vpunpcklqdq $R3, $R2, $R0 + vpunpckhqdq $R3, $R2, $R1 + + vpsrlq \$26, $R0, $R2 + vpand $AND_MASK, $R0, $R0 + vpaddq $R0, $A0, $A0 + + vpsrlq \$26, $R2, $R0 + vpand $AND_MASK, $R2, $R2 + vpaddq $R2, $A1, $A1 + + vpsllq \$12, $R1, $R2 + vpxor $R2, $R0, $R0 + vpand $AND_MASK, $R0, $R0 + vpaddq $R0, $A2, $A2 + + vpsrlq \$26, $R2, $R0 + vpsrlq \$40, $R1, $R2 + vpand $AND_MASK, $R0, $R0 + vpxor .LsetBit(%rip), $R2, $R2 + vpaddq $R0, $A3, $A3 + vpaddq $R2, $A4, $A4 + + # Multiply input by R[0] + vmovdqu $_r0_($state), $T0 + vpmuludq $T0, $A0, $R0 + vpmuludq $T0, $A1, $R1 + vpmuludq $T0, $A2, $R2 + vpmuludq $T0, $A3, $R3 + vpmuludq $T0, $A4, $R4 + # Multiply input by R[1] (and R[1]*5) + vmovdqu $_r1_x5($state), $T0 + vpmuludq $T0, $A4, $T1 + vpaddq $T1, $R0, $R0 + vmovdqu $_r1_($state), $T0 + vpmuludq $T0, $A0, $T1 + vpaddq $T1, $R1, $R1 + vpmuludq $T0, $A1, $T1 + vpaddq $T1, $R2, $R2 + vpmuludq $T0, $A2, $T1 + vpaddq $T1, $R3, $R3 + vpmuludq $T0, $A3, $T1 + vpaddq $T1, $R4, $R4 + # Etc + vmovdqu $_r2_x5($state), $T0 + vpmuludq $T0, $A3, $T1 + vpaddq $T1, $R0, $R0 + vpmuludq $T0, $A4, $T1 + vpaddq $T1, $R1, $R1 + vmovdqu $_r2_($state), $T0 + vpmuludq $T0, $A0, $T1 + vpaddq $T1, $R2, $R2 + vpmuludq $T0, $A1, $T1 + vpaddq $T1, $R3, $R3 + vpmuludq $T0, $A2, $T1 + vpaddq $T1, $R4, $R4 + + vmovdqu $_r3_x5($state), $T0 + vpmuludq $T0, $A2, $T1 + vpaddq $T1, $R0, $R0 + vpmuludq $T0, $A3, $T1 + vpaddq $T1, $R1, $R1 + vpmuludq $T0, $A4, $T1 + vpaddq $T1, $R2, $R2 + vmovdqu $_r3_($state), $T0 + vpmuludq $T0, $A0, $T1 + vpaddq $T1, $R3, $R3 + vpmuludq $T0, $A1, $T1 + vpaddq $T1, $R4, $R4 + + vmovdqu $_r4_x5($state), $T0 + vpmuludq $T0, $A1, $T1 + vpaddq $T1, $R0, $R0 + vpmuludq $T0, $A2, $T1 + vpaddq $T1, $R1, $R1 + vpmuludq $T0, $A3, $T1 + vpaddq $T1, $R2, $R2 + vpmuludq $T0, $A4, $T1 + vpaddq $T1, $R3, $R3 + vmovdqu $_r4_($state), $T0 + vpmuludq $T0, $A0, $T1 + vpaddq $T1, $R4, $R4 +1: + vpsrldq \$8, $R0, $A0 + vpsrldq \$8, $R1, $A1 + vpsrldq \$8, $R2, $A2 + vpsrldq \$8, $R3, 
$A3 + vpsrldq \$8, $R4, $A4 + + vpaddq $R0, $A0, $A0 + vpaddq $R1, $A1, $A1 + vpaddq $R2, $A2, $A2 + vpaddq $R3, $A3, $A3 + vpaddq $R4, $A4, $A4 + # Reduce + vpsrlq \$26, $A3, $T0 + vpaddq $T0, $A4, $A4 + vpand $AND_MASK, $A3, $A3 + vpsrlq \$26, $A4, $T0 + vpsllq \$2, $T0, $T1 + vpaddq $T1, $T0, $T0 + vpaddq $T0, $A0, $A0 + vpand $AND_MASK, $A4, $A4 + vpsrlq \$26, $A0, $T0 + vpand $AND_MASK, $A0, $A0 + vpaddq $T0, $A1, $A1 + vpsrlq \$26, $A1, $T0 + vpand $AND_MASK, $A1, $A1 + vpaddq $T0, $A2, $A2 + vpsrlq \$26, $A2, $T0 + vpand $AND_MASK, $A2, $A2 + vpaddq $T0, $A3, $A3 + vpsrlq \$26, $A3, $T0 + vpand $AND_MASK, $A3, $A3 + vpaddq $T0, $A4, $A4 +3: + cmp \$16, $in_len + jb 1f + + # load the next block + vmovq 8*0($in), $R0 + vmovq 8*1($in), $R1 + add \$16, $in + sub \$16, $in_len + + vpsrlq \$26, $R0, $R2 + vpand $AND_MASK, $R0, $R0 + vpaddq $R0, $A0, $A0 + + vpsrlq \$26, $R2, $R0 + vpand $AND_MASK, $R2, $R2 + vpaddq $R2, $A1, $A1 + + vpsllq \$12, $R1, $R2 + vpxor $R2, $R0, $R0 + vpand $AND_MASK, $R0, $R0 + vpaddq $R0, $A2, $A2 + + vpsrlq \$26, $R2, $R0 + vpsrlq \$40, $R1, $R2 + vpand $AND_MASK, $R0, $R0 + vpxor .LsetBit(%rip), $R2, $R2 + vpaddq $R0, $A3, $A3 + vpaddq $R2, $A4, $A4 +2: + # Multiply input by R[0] + vmovq $_r0_+8($state), $T0 + vpmuludq $T0, $A0, $R0 + vpmuludq $T0, $A1, $R1 + vpmuludq $T0, $A2, $R2 + vpmuludq $T0, $A3, $R3 + vpmuludq $T0, $A4, $R4 + # Multiply input by R[1] (and R[1]*5) + vmovq $_r1_x5+8($state), $T0 + vpmuludq $T0, $A4, $T1 + vpaddq $T1, $R0, $R0 + vmovq $_r1_+8($state), $T0 + vpmuludq $T0, $A0, $T1 + vpaddq $T1, $R1, $R1 + vpmuludq $T0, $A1, $T1 + vpaddq $T1, $R2, $R2 + vpmuludq $T0, $A2, $T1 + vpaddq $T1, $R3, $R3 + vpmuludq $T0, $A3, $T1 + vpaddq $T1, $R4, $R4 + # Etc + vmovq $_r2_x5+8($state), $T0 + vpmuludq $T0, $A3, $T1 + vpaddq $T1, $R0, $R0 + vpmuludq $T0, $A4, $T1 + vpaddq $T1, $R1, $R1 + vmovq $_r2_+8($state), $T0 + vpmuludq $T0, $A0, $T1 + vpaddq $T1, $R2, $R2 + vpmuludq $T0, $A1, $T1 + vpaddq $T1, $R3, $R3 + vpmuludq $T0, $A2, $T1 + vpaddq $T1, $R4, $R4 + + vmovq $_r3_x5+8($state), $T0 + vpmuludq $T0, $A2, $T1 + vpaddq $T1, $R0, $R0 + vpmuludq $T0, $A3, $T1 + vpaddq $T1, $R1, $R1 + vpmuludq $T0, $A4, $T1 + vpaddq $T1, $R2, $R2 + vmovq $_r3_+8($state), $T0 + vpmuludq $T0, $A0, $T1 + vpaddq $T1, $R3, $R3 + vpmuludq $T0, $A1, $T1 + vpaddq $T1, $R4, $R4 + + vmovq $_r4_x5+8($state), $T0 + vpmuludq $T0, $A1, $T1 + vpaddq $T1, $R0, $R0 + vpmuludq $T0, $A2, $T1 + vpaddq $T1, $R1, $R1 + vpmuludq $T0, $A3, $T1 + vpaddq $T1, $R2, $R2 + vpmuludq $T0, $A4, $T1 + vpaddq $T1, $R3, $R3 + vmovq $_r4_+8($state), $T0 + vpmuludq $T0, $A0, $T1 + vpaddq $T1, $R4, $R4 + + # Reduce + vpsrlq \$26, $R3, $T0 + vpaddq $T0, $R4, $R4 + vpand $AND_MASK, $R3, $R3 + vpsrlq \$26, $R4, $T0 + vpsllq \$2, $T0, $T1 + vpaddq $T1, $T0, $T0 + vpaddq $T0, $R0, $R0 + vpand $AND_MASK, $R4, $R4 + vpsrlq \$26, $R0, $T0 + vpand $AND_MASK, $R0, $A0 + vpaddq $T0, $R1, $R1 + vpsrlq \$26, $R1, $T0 + vpand $AND_MASK, $R1, $A1 + vpaddq $T0, $R2, $R2 + vpsrlq \$26, $R2, $T0 + vpand $AND_MASK, $R2, $A2 + vpaddq $T0, $R3, $R3 + vpsrlq \$26, $R3, $T0 + vpand $AND_MASK, $R3, $A3 + vpaddq $T0, $R4, $A4 + +1: + test $in_len, $in_len + jz 1f + + vmovdqa .Lone(%rip), $R0 +3: + dec $in_len + vpslldq \$1, $R0, $R0 + vpinsrb \$0, ($in, $in_len), $R0, $R0 + test $in_len, $in_len + jnz 3b + + vpsrldq \$8, $R0, $R1 + vpsrlq \$26, $R0, $R2 + vpand $AND_MASK, $R0, $R0 + vpaddq $R0, $A0, $A0 + + vpsrlq \$26, $R2, $R0 + vpand $AND_MASK, $R2, $R2 + vpaddq $R2, $A1, $A1 + + vpsllq \$12, $R1, $R2 + vpxor $R2, $R0, $R0 
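+	# (Annotation: the shifts and masks around this point split the final
+	# partial block into five 26-bit limbs A0..A4. Unlike the full-block
+	# paths above, no .LsetBit is XORed in here, because the vpinsrb loop
+	# above already planted the 2^(8*len) padding bit via .Lone.)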
+ vpand $AND_MASK, $R0, $R0 + vpaddq $R0, $A2, $A2 + + vpsrlq \$26, $R2, $R0 + vpsrlq \$40, $R1, $R2 + vpand $AND_MASK, $R0, $R0 + vpaddq $R0, $A3, $A3 + vpaddq $R2, $A4, $A4 + xor $in_len, $in_len + jmp 2b +1: + vmovd $A0, $_A0_($state) + vmovd $A1, $_A1_($state) + vmovd $A2, $_A2_($state) + vmovd $A3, $_A3_($state) + vmovd $A4, $_A4_($state) + + + mov $in, %rax + vzeroupper + ret +.size poly1305_update_avx,.-poly1305_update_avx +############################################################################### +# void poly1305_finish_avx(void* $state, uint64_t mac[2]); +.type poly1305_finish_avx,\@function, 2 +.globl poly1305_finish_avx +poly1305_finish_avx: +___ +my $mac="%rsi"; +$code.=<<___; + vzeroupper + vmovd $_A0_($state), $A0 + vmovd $_A1_($state), $A1 + vmovd $_A2_($state), $A2 + vmovd $_A3_($state), $A3 + vmovd $_A4_($state), $A4 + # Reduce one last time in case there was a carry from 130 bit + vpsrlq \$26, $A4, $T0 + vpsllq \$2, $T0, $T1 + vpaddq $T1, $T0, $T0 + vpaddq $T0, $A0, $A0 + vpand .LandMask(%rip), $A4, $A4 + + vpsrlq \$26, $A0, $T0 + vpand .LandMask(%rip), $A0, $A0 + vpaddq $T0, $A1, $A1 + vpsrlq \$26, $A1, $T0 + vpand .LandMask(%rip), $A1, $A1 + vpaddq $T0, $A2, $A2 + vpsrlq \$26, $A2, $T0 + vpand .LandMask(%rip), $A2, $A2 + vpaddq $T0, $A3, $A3 + vpsrlq \$26, $A3, $T0 + vpand .LandMask(%rip), $A3, $A3 + vpaddq $T0, $A4, $A4 + # Convert to normal + vpsllq \$26, $A1, $T0 + vpxor $T0, $A0, $A0 + vpsllq \$52, $A2, $T0 + vpxor $T0, $A0, $A0 + vpsrlq \$12, $A2, $A1 + vpsllq \$14, $A3, $T0 + vpxor $T0, $A1, $A1 + vpsllq \$40, $A4, $T0 + vpxor $T0, $A1, $A1 + vmovq $A0, %rax + vmovq $A1, %rdx + + add $_k_($state), %rax + adc $_k_+8($state), %rdx + mov %rax, ($mac) + mov %rdx, 8($mac) + vzeroupper + ret +.size poly1305_finish_avx,.-poly1305_finish_avx +___ +} +}} + +$code =~ s/\`([^\`]*)\`/eval($1)/gem; +print $code; +close STDOUT; diff -rNu openssl-1.0.2e/crypto/chacha20poly1305/asm/poly1305_avx2.pl openssl-1.0.2e-modified/crypto/chacha20poly1305/asm/poly1305_avx2.pl --- openssl-1.0.2e/crypto/chacha20poly1305/asm/poly1305_avx2.pl 1970-01-01 01:00:00.000000000 +0100 +++ openssl-1.0.2e-modified/crypto/chacha20poly1305/asm/poly1305_avx2.pl 2016-02-08 16:12:00.597614755 +0100 @@ -0,0 +1,918 @@ +############################################################################## +# # +# Copyright 2014 Intel Corporation # +# # +# Licensed under the Apache License, Version 2.0 (the "License"); # +# you may not use this file except in compliance with the License. # +# You may obtain a copy of the License at # +# # +# http://www.apache.org/licenses/LICENSE-2.0 # +# # +# Unless required by applicable law or agreed to in writing, software # +# distributed under the License is distributed on an "AS IS" BASIS, # +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # +# See the License for the specific language governing permissions and # +# limitations under the License. 
# +# # +############################################################################## +# # +# Developers and authors: # +# Shay Gueron (1, 2), and Vlad Krasnov (1) # +# (1) Intel Corporation, Israel Development Center # +# (2) University of Haifa # +# # +############################################################################## +# state: +# 0: r[0] || r^2[0] +# 16: r[1] || r^2[1] +# 32: r[2] || r^2[2] +# 48: r[3] || r^2[3] +# 64: r[4] || r^2[4] +# 80: r[1]*5 || r^2[1]*5 +# 96: r[2]*5 || r^2[2]*5 +#112: r[3]*5 || r^2[3]*5 +#128: r[4]*5 || r^2[4]*5 +#144: k +#160: A0 +#164: A1 +#168: A2 +#172: A3 +#176: A4 +#180: END + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open OUT,"| \"$^X\" $xlate $flavour $output"; +*STDOUT=*OUT; + +if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` + =~ /GNU assembler version ([2-9]\.[0-9]+)/) { + $avx = ($1>=2.19) + ($1>=2.22); +} + +if ($win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && + `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { + $avx = ($1>=2.09) + ($1>=2.10); +} + +if ($win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && + `ml64 2>&1` =~ /Version ([0-9]+)\./) { + $avx = ($1>=10) + ($1>=11); +} + +if (`$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9])\.([0-9]+)/) { + my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10 + $avx = ($ver>=3.0) + ($ver>=3.01); +} + +if ($avx>=1) {{ + +my ($_r0_, $_r1_, $_r2_, $_r3_, $_r4_, $_r1_x5, $_r2_x5, $_r3_x5, $_r4_x5, $_k_, $_A0_, $_A1_, $_A2_, $_A3_, $_A4_) += (0,32,64,96,128,160,192,224,256,288,304,308,312,316,320); + +$code.=<<___; +.text +.align 32 +.LandMask: +.quad 0x3FFFFFF, 0x3FFFFFF, 0x3FFFFFF, 0x3FFFFFF +.LsetBit: +.quad 0x1000000, 0x1000000, 0x1000000, 0x1000000 +.LrSet: +.quad 0xFFFFFFC0FFFFFFF, 0xFFFFFFC0FFFFFFF, 0xFFFFFFC0FFFFFFF, 0xFFFFFFC0FFFFFFF +.quad 0xFFFFFFC0FFFFFFC, 0xFFFFFFC0FFFFFFC, 0xFFFFFFC0FFFFFFC, 0xFFFFFFC0FFFFFFC + +.LpermFix: +.long 6,7,6,7,6,7,6,7 +.long 4,5,6,7,6,7,6,7 +.long 2,3,6,7,4,5,6,7 +.long 0,1,4,5,2,3,6,7 +___ + + +{ +my ($A0, $A1, $A2, $A3, $A4, + $r0, $r1, $r2, $r3, $r4, + $T0, $T1, $A5, $A6, $A7, $A8)=map("%xmm$_",(0..15)); +my ($A0_y, $A1_y, $A2_y, $A3_y, $A4_y, + $r0_y, $r1_y, $r2_y, $r3_y, $r4_y)=map("%ymm$_",(0..9)); +my ($state, $key) + =("%rdi", "%rsi"); + +$code.=<<___; +################################################################################ +# void poly1305_init_avx2(void *state, uint8_t key[32]) + +.globl poly1305_init_avx2 +.type poly1305_init_avx2, \@function, 2 +.align 64 +poly1305_init_avx2: + vzeroupper + + # Store k + vmovdqu 16*1($key), $T0 + vmovdqu $T0, $_k_($state) + # Init the MAC value + vpxor $T0, $T0, $T0 + vmovdqu $T0, $_A0_($state) + vmovd $T0, $_A4_($state) + # load and convert r + vmovq 8*0($key), $r0 + vmovq 8*1($key), $T0 + vpand .LrSet(%rip), $r0, $r0 + vpand .LrSet+32(%rip), $T0, $T0 + + vpsrlq \$26, $r0, $r1 + vpand .LandMask(%rip), $r0, $r0 + vpsrlq \$26, $r1, $r2 + vpand .LandMask(%rip), $r1, $r1 + vpsllq \$12, $T0, $T1 + vpxor $T1, $r2, $r2 + vpsrlq \$26, $r2, $r3 + vpsrlq \$40, $T0, $r4 + vpand .LandMask(%rip), $r2, $r2 + vpand .LandMask(%rip), $r3, $r3 + # SQR R + vpmuludq $r0, $r0, $A0 + vpmuludq $r1, $r0, $A1 + vpmuludq $r2, $r0, $A2 + vpmuludq $r3, $r0, 
$A3 + vpmuludq $r4, $r0, $A4 + + vpsllq \$1, $A1, $A1 + vpsllq \$1, $A2, $A2 + vpmuludq $r1, $r1, $T0 + vpaddq $T0, $A2, $A2 + vpmuludq $r2, $r1, $T0 + vpaddq $T0, $A3, $A3 + vpmuludq $r3, $r1, $T0 + vpaddq $T0, $A4, $A4 + vpmuludq $r4, $r1, $A5 + + vpsllq \$1, $A3, $A3 + vpsllq \$1, $A4, $A4 + vpmuludq $r2, $r2, $T0 + vpaddq $T0, $A4, $A4 + vpmuludq $r3, $r2, $T0 + vpaddq $T0, $A5, $A5 + vpmuludq $r4, $r2, $A6 + + vpsllq \$1, $A5, $A5 + vpsllq \$1, $A6, $A6 + vpmuludq $r3, $r3, $T0 + vpaddq $T0, $A6, $A6 + vpmuludq $r4, $r3, $A7 + + vpsllq \$1, $A7, $A7 + vpmuludq $r4, $r4, $A8 + + # Reduce + vpsrlq \$26, $A4, $T0 + vpand .LandMask(%rip), $A4, $A4 + vpaddq $T0, $A5, $A5 + + vpsllq \$2, $A5, $T0 + vpaddq $T0, $A5, $A5 + vpsllq \$2, $A6, $T0 + vpaddq $T0, $A6, $A6 + vpsllq \$2, $A7, $T0 + vpaddq $T0, $A7, $A7 + vpsllq \$2, $A8, $T0 + vpaddq $T0, $A8, $A8 + + vpaddq $A5, $A0, $A0 + vpaddq $A6, $A1, $A1 + vpaddq $A7, $A2, $A2 + vpaddq $A8, $A3, $A3 + + vpsrlq \$26, $A0, $T0 + vpand .LandMask(%rip), $A0, $A0 + vpaddq $T0, $A1, $A1 + vpsrlq \$26, $A1, $T0 + vpand .LandMask(%rip), $A1, $A1 + vpaddq $T0, $A2, $A2 + vpsrlq \$26, $A2, $T0 + vpand .LandMask(%rip), $A2, $A2 + vpaddq $T0, $A3, $A3 + vpsrlq \$26, $A3, $T0 + vpand .LandMask(%rip), $A3, $A3 + vpaddq $T0, $A4, $A4 + + vpunpcklqdq $r0, $A0, $r0 + vpunpcklqdq $r1, $A1, $r1 + vpunpcklqdq $r2, $A2, $r2 + vpunpcklqdq $r3, $A3, $r3 + vpunpcklqdq $r4, $A4, $r4 + + vmovdqu $r0, $_r0_+16($state) + vmovdqu $r1, $_r1_+16($state) + vmovdqu $r2, $_r2_+16($state) + vmovdqu $r3, $_r3_+16($state) + vmovdqu $r4, $_r4_+16($state) + + vpsllq \$2, $r1, $A1 + vpsllq \$2, $r2, $A2 + vpsllq \$2, $r3, $A3 + vpsllq \$2, $r4, $A4 + + vpaddq $A1, $r1, $A1 + vpaddq $A2, $r2, $A2 + vpaddq $A3, $r3, $A3 + vpaddq $A4, $r4, $A4 + + vmovdqu $A1, $_r1_x5+16($state) + vmovdqu $A2, $_r2_x5+16($state) + vmovdqu $A3, $_r3_x5+16($state) + vmovdqu $A4, $_r4_x5+16($state) + + # Compute r^3 and r^4 + vpshufd \$0x44, $r0, $A0 + vpshufd \$0x44, $r1, $A1 + vpshufd \$0x44, $r2, $A2 + vpshufd \$0x44, $r3, $A3 + vpshufd \$0x44, $r4, $A4 + + # Multiply input by R[0] + vmovdqu $_r0_+16($state), $T0 + vpmuludq $T0, $A0, $r0 + vpmuludq $T0, $A1, $r1 + vpmuludq $T0, $A2, $r2 + vpmuludq $T0, $A3, $r3 + vpmuludq $T0, $A4, $r4 + # Multiply input by R[1] (and R[1]*5) + vmovdqu $_r1_x5+16($state), $T0 + vpmuludq $T0, $A4, $T1 + vpaddq $T1, $r0, $r0 + vmovdqu $_r1_+16($state), $T0 + vpmuludq $T0, $A0, $T1 + vpaddq $T1, $r1, $r1 + vpmuludq $T0, $A1, $T1 + vpaddq $T1, $r2, $r2 + vpmuludq $T0, $A2, $T1 + vpaddq $T1, $r3, $r3 + vpmuludq $T0, $A3, $T1 + vpaddq $T1, $r4, $r4 + # Etc + vmovdqu $_r2_x5+16($state), $T0 + vpmuludq $T0, $A3, $T1 + vpaddq $T1, $r0, $r0 + vpmuludq $T0, $A4, $T1 + vpaddq $T1, $r1, $r1 + vmovdqu $_r2_+16($state), $T0 + vpmuludq $T0, $A0, $T1 + vpaddq $T1, $r2, $r2 + vpmuludq $T0, $A1, $T1 + vpaddq $T1, $r3, $r3 + vpmuludq $T0, $A2, $T1 + vpaddq $T1, $r4, $r4 + + vmovdqu $_r3_x5+16($state), $T0 + vpmuludq $T0, $A2, $T1 + vpaddq $T1, $r0, $r0 + vpmuludq $T0, $A3, $T1 + vpaddq $T1, $r1, $r1 + vpmuludq $T0, $A4, $T1 + vpaddq $T1, $r2, $r2 + vmovdqu $_r3_+16($state), $T0 + vpmuludq $T0, $A0, $T1 + vpaddq $T1, $r3, $r3 + vpmuludq $T0, $A1, $T1 + vpaddq $T1, $r4, $r4 + + vmovdqu $_r4_x5+16($state), $T0 + vpmuludq $T0, $A1, $T1 + vpaddq $T1, $r0, $r0 + vpmuludq $T0, $A2, $T1 + vpaddq $T1, $r1, $r1 + vpmuludq $T0, $A3, $T1 + vpaddq $T1, $r2, $r2 + vpmuludq $T0, $A4, $T1 + vpaddq $T1, $r3, $r3 + vmovdqu $_r4_+16($state), $T0 + vpmuludq $T0, $A0, $T1 + vpaddq $T1, $r4, $r4 + # Reduce + 
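+	# The reduction below works in radix 2^26 modulo p = 2^130 - 5. A
+	# carry c out of the top limb satisfies c*2^130 == 5*c (mod p), so
+	# it re-enters the bottom limb multiplied by 5; the shift-by-2 and
+	# add pair below computes 5*c as (c << 2) + c.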
vpsrlq \$26, $r3, $T0 + vpaddq $T0, $r4, $r4 + vpand .LandMask(%rip), $r3, $r3 + vpsrlq \$26, $r4, $T0 + vpsllq \$2, $T0, $T1 + vpaddq $T1, $T0, $T0 + vpaddq $T0, $r0, $r0 + vpand .LandMask(%rip), $r4, $r4 + vpsrlq \$26, $r0, $T0 + vpand .LandMask(%rip), $r0, $r0 + vpaddq $T0, $r1, $r1 + vpsrlq \$26, $r1, $T0 + vpand .LandMask(%rip), $r1, $r1 + vpaddq $T0, $r2, $r2 + vpsrlq \$26, $r2, $T0 + vpand .LandMask(%rip), $r2, $r2 + vpaddq $T0, $r3, $r3 + vpsrlq \$26, $r3, $T0 + vpand .LandMask(%rip), $r3, $r3 + vpaddq $T0, $r4, $r4 + + vmovdqu $r0, $_r0_($state) + vmovdqu $r1, $_r1_($state) + vmovdqu $r2, $_r2_($state) + vmovdqu $r3, $_r3_($state) + vmovdqu $r4, $_r4_($state) + + vpsllq \$2, $r1, $A1 + vpsllq \$2, $r2, $A2 + vpsllq \$2, $r3, $A3 + vpsllq \$2, $r4, $A4 + + vpaddq $A1, $r1, $A1 + vpaddq $A2, $r2, $A2 + vpaddq $A3, $r3, $A3 + vpaddq $A4, $r4, $A4 + + vmovdqu $A1, $_r1_x5($state) + vmovdqu $A2, $_r2_x5($state) + vmovdqu $A3, $_r3_x5($state) + vmovdqu $A4, $_r4_x5($state) + + ret +.size poly1305_init_avx2,.-poly1305_init_avx2 +___ +} + +{ + +my ($A0, $A1, $A2, $A3, $A4, + $T0, $T1, $R0, $R1, $R2, + $R3, $R4, $AND_MASK, $PERM_MASK, $SET_MASK)=map("%ymm$_",(0..14)); + +my ($A0_x, $A1_x, $A2_x, $A3_x, $A4_x, + $T0_x, $T1_x, $R0_x, $R1_x, $R2_x, + $R3_x, $R4_x, $AND_MASK_x, $PERM_MASK_x, $SET_MASK_x)=map("%xmm$_",(0..14)); + +my ($state, $in, $in_len, $hlp, $rsp_save)=("%rdi", "%rsi", "%rdx", "%rcx", "%rax"); + +$code.=<<___; + +############################################################################### +# void poly1305_update_avx2(void* $state, void* in, uint64_t in_len2) +.globl poly1305_update_avx2 +.type poly1305_update_avx2, \@function, 2 +.align 64 +poly1305_update_avx2: + + vmovd $_A0_($state), $A0_x + vmovd $_A1_($state), $A1_x + vmovd $_A2_($state), $A2_x + vmovd $_A3_($state), $A3_x + vmovd $_A4_($state), $A4_x + + vmovdqa .LandMask(%rip), $AND_MASK +1: + cmp \$32*4, $in_len + jb 1f + sub \$32*2, $in_len + + # load the next four blocks + vmovdqu 32*0($in), $R2 + vmovdqu 32*1($in), $R3 + add \$32*2, $in + + vpunpcklqdq $R3, $R2, $R0 + vpunpckhqdq $R3, $R2, $R1 + + vpermq \$0xD8, $R0, $R0 # it is possible to rearrange the precomputations, and save this shuffle + vpermq \$0xD8, $R1, $R1 + + vpsrlq \$26, $R0, $R2 + vpand $AND_MASK, $R0, $R0 + vpaddq $R0, $A0, $A0 + + vpsrlq \$26, $R2, $R0 + vpand $AND_MASK, $R2, $R2 + vpaddq $R2, $A1, $A1 + + vpsllq \$12, $R1, $R2 + vpxor $R2, $R0, $R0 + vpand $AND_MASK, $R0, $R0 + vpaddq $R0, $A2, $A2 + + vpsrlq \$26, $R2, $R0 + vpsrlq \$40, $R1, $R2 + vpand $AND_MASK, $R0, $R0 + vpxor .LsetBit(%rip), $R2, $R2 + vpaddq $R0, $A3, $A3 + vpaddq $R2, $A4, $A4 + + # Multiply input by R[0] + vpbroadcastq $_r0_($state), $T0 + vpmuludq $T0, $A0, $R0 + vpmuludq $T0, $A1, $R1 + vpmuludq $T0, $A2, $R2 + vpmuludq $T0, $A3, $R3 + vpmuludq $T0, $A4, $R4 + # Multiply input by R[1] (and R[1]*5) + vpbroadcastq $_r1_x5($state), $T0 + vpmuludq $T0, $A4, $T1 + vpaddq $T1, $R0, $R0 + vpbroadcastq $_r1_($state), $T0 + vpmuludq $T0, $A0, $T1 + vpaddq $T1, $R1, $R1 + vpmuludq $T0, $A1, $T1 + vpaddq $T1, $R2, $R2 + vpmuludq $T0, $A2, $T1 + vpaddq $T1, $R3, $R3 + vpmuludq $T0, $A3, $T1 + vpaddq $T1, $R4, $R4 + # Etc + vpbroadcastq $_r2_x5($state), $T0 + vpmuludq $T0, $A3, $T1 + vpaddq $T1, $R0, $R0 + vpmuludq $T0, $A4, $T1 + vpaddq $T1, $R1, $R1 + vpbroadcastq $_r2_($state), $T0 + vpmuludq $T0, $A0, $T1 + vpaddq $T1, $R2, $R2 + vpmuludq $T0, $A1, $T1 + vpaddq $T1, $R3, $R3 + vpmuludq $T0, $A2, $T1 + vpaddq $T1, $R4, $R4 + + vpbroadcastq $_r3_x5($state), $T0 + vpmuludq 
$T0, $A2, $T1 + vpaddq $T1, $R0, $R0 + vpmuludq $T0, $A3, $T1 + vpaddq $T1, $R1, $R1 + vpmuludq $T0, $A4, $T1 + vpaddq $T1, $R2, $R2 + vpbroadcastq $_r3_($state), $T0 + vpmuludq $T0, $A0, $T1 + vpaddq $T1, $R3, $R3 + vpmuludq $T0, $A1, $T1 + vpaddq $T1, $R4, $R4 + + vpbroadcastq $_r4_x5($state), $T0 + vpmuludq $T0, $A1, $T1 + vpaddq $T1, $R0, $R0 + vpmuludq $T0, $A2, $T1 + vpaddq $T1, $R1, $R1 + vpmuludq $T0, $A3, $T1 + vpaddq $T1, $R2, $R2 + vpmuludq $T0, $A4, $T1 + vpaddq $T1, $R3, $R3 + vpbroadcastq $_r4_($state), $T0 + vpmuludq $T0, $A0, $T1 + vpaddq $T1, $R4, $R4 + # Reduce + vpsrlq \$26, $R3, $T0 + vpaddq $T0, $R4, $R4 + vpand $AND_MASK, $R3, $R3 + + vpsrlq \$26, $R4, $T0 + vpsllq \$2, $T0, $T1 + vpaddq $T1, $T0, $T0 + vpaddq $T0, $R0, $R0 + vpand $AND_MASK, $R4, $R4 + + vpsrlq \$26, $R0, $T0 + vpand $AND_MASK, $R0, $A0 + vpaddq $T0, $R1, $R1 + vpsrlq \$26, $R1, $T0 + vpand $AND_MASK, $R1, $A1 + vpaddq $T0, $R2, $R2 + vpsrlq \$26, $R2, $T0 + vpand $AND_MASK, $R2, $A2 + vpaddq $T0, $R3, $R3 + vpsrlq \$26, $R3, $T0 + vpand $AND_MASK, $R3, $A3 + vpaddq $T0, $R4, $A4 + jmp 1b +1: + + cmp \$32*2, $in_len + jb 1f + sub \$32*2, $in_len + # load the next four blocks + vmovdqu 32*0($in), $R2 + vmovdqu 32*1($in), $R3 + add \$32*2, $in + + vpunpcklqdq $R3, $R2, $R0 + vpunpckhqdq $R3, $R2, $R1 + + vpermq \$0xD8, $R0, $R0 + vpermq \$0xD8, $R1, $R1 + + vpsrlq \$26, $R0, $R2 + vpand $AND_MASK, $R0, $R0 + vpaddq $R0, $A0, $A0 + + vpsrlq \$26, $R2, $R0 + vpand $AND_MASK, $R2, $R2 + vpaddq $R2, $A1, $A1 + + vpsllq \$12, $R1, $R2 + vpxor $R2, $R0, $R0 + vpand $AND_MASK, $R0, $R0 + vpaddq $R0, $A2, $A2 + + vpsrlq \$26, $R2, $R0 + vpsrlq \$40, $R1, $R2 + vpand $AND_MASK, $R0, $R0 + vpxor .LsetBit(%rip), $R2, $R2 + vpaddq $R0, $A3, $A3 + vpaddq $R2, $A4, $A4 + + # Multiply input by R[0] + vmovdqu $_r0_($state), $T0 + vpmuludq $T0, $A0, $R0 + vpmuludq $T0, $A1, $R1 + vpmuludq $T0, $A2, $R2 + vpmuludq $T0, $A3, $R3 + vpmuludq $T0, $A4, $R4 + # Multiply input by R[1] (and R[1]*5) + vmovdqu $_r1_x5($state), $T0 + vpmuludq $T0, $A4, $T1 + vpaddq $T1, $R0, $R0 + vmovdqu $_r1_($state), $T0 + vpmuludq $T0, $A0, $T1 + vpaddq $T1, $R1, $R1 + vpmuludq $T0, $A1, $T1 + vpaddq $T1, $R2, $R2 + vpmuludq $T0, $A2, $T1 + vpaddq $T1, $R3, $R3 + vpmuludq $T0, $A3, $T1 + vpaddq $T1, $R4, $R4 + # Etc + vmovdqu $_r2_x5($state), $T0 + vpmuludq $T0, $A3, $T1 + vpaddq $T1, $R0, $R0 + vpmuludq $T0, $A4, $T1 + vpaddq $T1, $R1, $R1 + vmovdqu $_r2_($state), $T0 + vpmuludq $T0, $A0, $T1 + vpaddq $T1, $R2, $R2 + vpmuludq $T0, $A1, $T1 + vpaddq $T1, $R3, $R3 + vpmuludq $T0, $A2, $T1 + vpaddq $T1, $R4, $R4 + + vmovdqu $_r3_x5($state), $T0 + vpmuludq $T0, $A2, $T1 + vpaddq $T1, $R0, $R0 + vpmuludq $T0, $A3, $T1 + vpaddq $T1, $R1, $R1 + vpmuludq $T0, $A4, $T1 + vpaddq $T1, $R2, $R2 + vmovdqu $_r3_($state), $T0 + vpmuludq $T0, $A0, $T1 + vpaddq $T1, $R3, $R3 + vpmuludq $T0, $A1, $T1 + vpaddq $T1, $R4, $R4 + + vmovdqu $_r4_x5($state), $T0 + vpmuludq $T0, $A1, $T1 + vpaddq $T1, $R0, $R0 + vpmuludq $T0, $A2, $T1 + vpaddq $T1, $R1, $R1 + vpmuludq $T0, $A3, $T1 + vpaddq $T1, $R2, $R2 + vpmuludq $T0, $A4, $T1 + vpaddq $T1, $R3, $R3 + vmovdqu $_r4_($state), $T0 + vpmuludq $T0, $A0, $T1 + vpaddq $T1, $R4, $R4 + # Reduce + vpsrlq \$26, $R3, $T0 + vpaddq $T0, $R4, $R4 + vpand $AND_MASK, $R3, $R3 + vpsrlq \$26, $R4, $T0 + vpsllq \$2, $T0, $T1 + vpaddq $T1, $T0, $T0 + vpaddq $T0, $R0, $R0 + vpand $AND_MASK, $R4, $R4 + vpsrlq \$26, $R0, $T0 + vpand $AND_MASK, $R0, $A0 + vpaddq $T0, $R1, $R1 + vpsrlq \$26, $R1, $T0 + vpand $AND_MASK, $R1, $A1 + vpaddq 
$T0, $R2, $R2 + vpsrlq \$26, $R2, $T0 + vpand $AND_MASK, $R2, $A2 + vpaddq $T0, $R3, $R3 + vpsrlq \$26, $R3, $T0 + vpand $AND_MASK, $R3, $A3 + vpaddq $T0, $R4, $A4 + + vpsrldq \$8, $A0, $R0 + vpsrldq \$8, $A1, $R1 + vpsrldq \$8, $A2, $R2 + vpsrldq \$8, $A3, $R3 + vpsrldq \$8, $A4, $R4 + + vpaddq $R0, $A0, $A0 + vpaddq $R1, $A1, $A1 + vpaddq $R2, $A2, $A2 + vpaddq $R3, $A3, $A3 + vpaddq $R4, $A4, $A4 + + vpermq \$0xAA, $A0, $R0 + vpermq \$0xAA, $A1, $R1 + vpermq \$0xAA, $A2, $R2 + vpermq \$0xAA, $A3, $R3 + vpermq \$0xAA, $A4, $R4 + + vpaddq $R0, $A0, $A0 + vpaddq $R1, $A1, $A1 + vpaddq $R2, $A2, $A2 + vpaddq $R3, $A3, $A3 + vpaddq $R4, $A4, $A4 +1: + test $in_len, $in_len + jz 5f + # In case 1,2 or 3 blocks remain, we want to multiply them correctly + vmovq $A0_x, $A0_x + vmovq $A1_x, $A1_x + vmovq $A2_x, $A2_x + vmovq $A3_x, $A3_x + vmovq $A4_x, $A4_x + + mov .LsetBit(%rip), $hlp + mov %rsp, $rsp_save + test \$15, $in_len + jz 1f + xor $hlp, $hlp + sub \$64, %rsp + vpxor $R0, $R0, $R0 + vmovdqu $R0, (%rsp) + vmovdqu $R0, 32(%rsp) +3: + movb ($in, $hlp), %r8b + movb %r8b, (%rsp, $hlp) + inc $hlp + cmp $hlp, $in_len + jne 3b + + movb \$1, (%rsp, $hlp) + xor $hlp, $hlp + mov %rsp, $in + +1: + + cmp \$16, $in_len + ja 2f + vmovq 8*0($in), $R0_x + vmovq 8*1($in), $R1_x + vmovq $hlp, $SET_MASK_x + vmovdqa .LpermFix(%rip), $PERM_MASK + jmp 1f +2: + cmp \$32, $in_len + ja 2f + vmovdqu 16*0($in), $R2_x + vmovdqu 16*1($in), $R3_x + vmovq .LsetBit(%rip), $SET_MASK_x + vpinsrq \$1, $hlp, $SET_MASK_x, $SET_MASK_x + vmovdqa .LpermFix+32(%rip), $PERM_MASK + + vpunpcklqdq $R3, $R2, $R0 + vpunpckhqdq $R3, $R2, $R1 + jmp 1f +2: + cmp \$48, $in_len + ja 2f + vmovdqu 32*0($in), $R2 + vmovdqu 32*1($in), $R3_x + vmovq .LsetBit(%rip), $SET_MASK_x + vpinsrq \$1, $hlp, $SET_MASK_x, $SET_MASK_x + vpermq \$0xc4, $SET_MASK, $SET_MASK + vmovdqa .LpermFix+64(%rip), $PERM_MASK + + vpunpcklqdq $R3, $R2, $R0 + vpunpckhqdq $R3, $R2, $R1 + jmp 1f +2: + vmovdqu 32*0($in), $R2 + vmovdqu 32*1($in), $R3 + vmovq .LsetBit(%rip), $SET_MASK_x + vpinsrq \$1, $hlp, $SET_MASK_x, $SET_MASK_x + vpermq \$0x40, $SET_MASK, $SET_MASK + vmovdqa .LpermFix+96(%rip), $PERM_MASK + + vpunpcklqdq $R3, $R2, $R0 + vpunpckhqdq $R3, $R2, $R1 + +1: + mov $rsp_save, %rsp + + vpsrlq \$26, $R0, $R2 + vpand $AND_MASK, $R0, $R0 + vpaddq $R0, $A0, $A0 + + vpsrlq \$26, $R2, $R0 + vpand $AND_MASK, $R2, $R2 + vpaddq $R2, $A1, $A1 + + vpsllq \$12, $R1, $R2 + vpxor $R2, $R0, $R0 + vpand $AND_MASK, $R0, $R0 + vpaddq $R0, $A2, $A2 + + vpsrlq \$26, $R2, $R0 + vpsrlq \$40, $R1, $R2 + vpand $AND_MASK, $R0, $R0 + vpxor $SET_MASK, $R2, $R2 + vpaddq $R0, $A3, $A3 + vpaddq $R2, $A4, $A4 + + # Multiply input by R[0] + vmovdqu $_r0_($state), $T0 + vpermd $T0, $PERM_MASK, $T0 + vpmuludq $T0, $A0, $R0 + vpmuludq $T0, $A1, $R1 + vpmuludq $T0, $A2, $R2 + vpmuludq $T0, $A3, $R3 + vpmuludq $T0, $A4, $R4 + # Multiply input by R[1] (and R[1]*5) + vmovdqu $_r1_x5($state), $T0 + vpermd $T0, $PERM_MASK, $T0 + vpmuludq $T0, $A4, $T1 + vpaddq $T1, $R0, $R0 + vmovdqu $_r1_($state), $T0 + vpermd $T0, $PERM_MASK, $T0 + vpmuludq $T0, $A0, $T1 + vpaddq $T1, $R1, $R1 + vpmuludq $T0, $A1, $T1 + vpaddq $T1, $R2, $R2 + vpmuludq $T0, $A2, $T1 + vpaddq $T1, $R3, $R3 + vpmuludq $T0, $A3, $T1 + vpaddq $T1, $R4, $R4 + # Etc + vmovdqu $_r2_x5($state), $T0 + vpermd $T0, $PERM_MASK, $T0 + vpmuludq $T0, $A3, $T1 + vpaddq $T1, $R0, $R0 + vpmuludq $T0, $A4, $T1 + vpaddq $T1, $R1, $R1 + vmovdqu $_r2_($state), $T0 + vpermd $T0, $PERM_MASK, $T0 + vpmuludq $T0, $A0, $T1 + vpaddq $T1, $R2, $R2 + vpmuludq $T0, 
$A1, $T1 + vpaddq $T1, $R3, $R3 + vpmuludq $T0, $A2, $T1 + vpaddq $T1, $R4, $R4 + + vmovdqu $_r3_x5($state), $T0 + vpermd $T0, $PERM_MASK, $T0 + vpmuludq $T0, $A2, $T1 + vpaddq $T1, $R0, $R0 + vpmuludq $T0, $A3, $T1 + vpaddq $T1, $R1, $R1 + vpmuludq $T0, $A4, $T1 + vpaddq $T1, $R2, $R2 + vmovdqu $_r3_($state), $T0 + vpermd $T0, $PERM_MASK, $T0 + vpmuludq $T0, $A0, $T1 + vpaddq $T1, $R3, $R3 + vpmuludq $T0, $A1, $T1 + vpaddq $T1, $R4, $R4 + + vmovdqu $_r4_x5($state), $T0 + vpermd $T0, $PERM_MASK, $T0 + vpmuludq $T0, $A1, $T1 + vpaddq $T1, $R0, $R0 + vpmuludq $T0, $A2, $T1 + vpaddq $T1, $R1, $R1 + vpmuludq $T0, $A3, $T1 + vpaddq $T1, $R2, $R2 + vpmuludq $T0, $A4, $T1 + vpaddq $T1, $R3, $R3 + vmovdqu $_r4_($state), $T0 + vpermd $T0, $PERM_MASK, $T0 + vpmuludq $T0, $A0, $T1 + vpaddq $T1, $R4, $R4 + # Reduce + vpsrlq \$26, $R3, $T0 + vpaddq $T0, $R4, $R4 + vpand $AND_MASK, $R3, $R3 + vpsrlq \$26, $R4, $T0 + vpsllq \$2, $T0, $T1 + vpaddq $T1, $T0, $T0 + vpaddq $T0, $R0, $R0 + vpand $AND_MASK, $R4, $R4 + vpsrlq \$26, $R0, $T0 + vpand $AND_MASK, $R0, $A0 + vpaddq $T0, $R1, $R1 + vpsrlq \$26, $R1, $T0 + vpand $AND_MASK, $R1, $A1 + vpaddq $T0, $R2, $R2 + vpsrlq \$26, $R2, $T0 + vpand $AND_MASK, $R2, $A2 + vpaddq $T0, $R3, $R3 + vpsrlq \$26, $R3, $T0 + vpand $AND_MASK, $R3, $A3 + vpaddq $T0, $R4, $A4 + + vpsrldq \$8, $A0, $R0 + vpsrldq \$8, $A1, $R1 + vpsrldq \$8, $A2, $R2 + vpsrldq \$8, $A3, $R3 + vpsrldq \$8, $A4, $R4 + + vpaddq $R0, $A0, $A0 + vpaddq $R1, $A1, $A1 + vpaddq $R2, $A2, $A2 + vpaddq $R3, $A3, $A3 + vpaddq $R4, $A4, $A4 + + vpermq \$0xAA, $A0, $R0 + vpermq \$0xAA, $A1, $R1 + vpermq \$0xAA, $A2, $R2 + vpermq \$0xAA, $A3, $R3 + vpermq \$0xAA, $A4, $R4 + + vpaddq $R0, $A0, $A0 + vpaddq $R1, $A1, $A1 + vpaddq $R2, $A2, $A2 + vpaddq $R3, $A3, $A3 + vpaddq $R4, $A4, $A4 + +5: + vmovd $A0_x, $_A0_($state) + vmovd $A1_x, $_A1_($state) + vmovd $A2_x, $_A2_($state) + vmovd $A3_x, $_A3_($state) + vmovd $A4_x, $_A4_($state) + + ret +.size poly1305_update_avx2,.-poly1305_update_avx2 +############################################################################### +# void poly1305_finish_avx2(void* $state, uint8_t mac[16]); +.type poly1305_finish_avx2,\@function,2 +.globl poly1305_finish_avx2 +poly1305_finish_avx2: +___ +my $mac="%rsi"; +my ($A0, $A1, $A2, $A3, $A4, $T0, $T1)=map("%xmm$_",(0..6)); + +$code.=<<___; + vmovd $_A0_($state), $A0 + vmovd $_A1_($state), $A1 + vmovd $_A2_($state), $A2 + vmovd $_A3_($state), $A3 + vmovd $_A4_($state), $A4 + # Reduce one last time in case there was a carry from 130 bit + vpsrlq \$26, $A4, $T0 + vpsllq \$2, $T0, $T1 + vpaddq $T1, $T0, $T0 + vpaddq $T0, $A0, $A0 + vpand .LandMask(%rip), $A4, $A4 + + vpsrlq \$26, $A0, $T0 + vpand .LandMask(%rip), $A0, $A0 + vpaddq $T0, $A1, $A1 + vpsrlq \$26, $A1, $T0 + vpand .LandMask(%rip), $A1, $A1 + vpaddq $T0, $A2, $A2 + vpsrlq \$26, $A2, $T0 + vpand .LandMask(%rip), $A2, $A2 + vpaddq $T0, $A3, $A3 + vpsrlq \$26, $A3, $T0 + vpand .LandMask(%rip), $A3, $A3 + vpaddq $T0, $A4, $A4 + # Convert to normal + vpsllq \$26, $A1, $T0 + vpxor $T0, $A0, $A0 + vpsllq \$52, $A2, $T0 + vpxor $T0, $A0, $A0 + vpsrlq \$12, $A2, $A1 + vpsllq \$14, $A3, $T0 + vpxor $T0, $A1, $A1 + vpsllq \$40, $A4, $T0 + vpxor $T0, $A1, $A1 + vmovq $A0, %rax + vmovq $A1, %rdx + + add $_k_($state), %rax + adc $_k_+8($state), %rdx + mov %rax, ($mac) + mov %rdx, 8($mac) + + ret +.size poly1305_finish_avx2,.-poly1305_finish_avx2 +___ +} +}} + +$code =~ s/\`([^\`]*)\`/eval(\$1)/gem; +print $code; +close STDOUT; diff -rNu 
openssl-1.0.2e/crypto/chacha20poly1305/chacha20.c openssl-1.0.2e-modified/crypto/chacha20poly1305/chacha20.c
--- openssl-1.0.2e/crypto/chacha20poly1305/chacha20.c	1970-01-01 01:00:00.000000000 +0100
+++ openssl-1.0.2e-modified/crypto/chacha20poly1305/chacha20.c	2016-02-08 16:12:00.597614755 +0100
@@ -0,0 +1,157 @@
+/* Copyright (c) 2014, Google Inc.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+
+/* Adapted from the public-domain eSTREAM code by D. Bernstein. */
+
+#include <openssl/chacha20poly1305.h>
+
+/* sigma contains the ChaCha constants, which happen to be an ASCII string. */
+static const char sigma[16] = "expand 32-byte k";
+
+#define ROTATE(v, n) (((v) << (n)) | ((v) >> (32 - (n))))
+#define XOR(v, w) ((v) ^ (w))
+#define PLUS(x, y) ((x) + (y))
+#define PLUSONE(v) (PLUS((v), 1))
+
+#define U32TO8_LITTLE(p, v)    \
+  {                            \
+    (p)[0] = (v >> 0) & 0xff;  \
+    (p)[1] = (v >> 8) & 0xff;  \
+    (p)[2] = (v >> 16) & 0xff; \
+    (p)[3] = (v >> 24) & 0xff; \
+  }
+
+#define U8TO32_LITTLE(p)                              \
+  (((uint32_t)((p)[0])) | ((uint32_t)((p)[1]) << 8) | \
+   ((uint32_t)((p)[2]) << 16) | ((uint32_t)((p)[3]) << 24))
+
+/* QUARTERROUND updates a, b, c, d with a ChaCha "quarter" round. */
+#define QUARTERROUND(a,b,c,d) \
+  x[a] = PLUS(x[a],x[b]); x[d] = ROTATE(XOR(x[d],x[a]),16); \
+  x[c] = PLUS(x[c],x[d]); x[b] = ROTATE(XOR(x[b],x[c]),12); \
+  x[a] = PLUS(x[a],x[b]); x[d] = ROTATE(XOR(x[d],x[a]), 8); \
+  x[c] = PLUS(x[c],x[d]); x[b] = ROTATE(XOR(x[b],x[c]), 7);
+
+/* chacha_core performs 20 rounds of ChaCha20 on the input words in
+ * |input| and writes the 64 output bytes to |output|.
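+ * The loop in chacha_core counts |i| down from 20 in steps of 2, so each
+ * iteration is one ChaCha "double round": four column QUARTERROUNDs
+ * followed by four diagonal QUARTERROUNDs.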
+ */
+static void chacha_core(uint8_t output[64], const uint32_t input[16]) {
+  uint32_t x[16];
+  int i;
+
+  memcpy(x, input, sizeof(uint32_t) * 16);
+  for (i = 20; i > 0; i -= 2) {
+    QUARTERROUND(0, 4, 8, 12)
+    QUARTERROUND(1, 5, 9, 13)
+    QUARTERROUND(2, 6, 10, 14)
+    QUARTERROUND(3, 7, 11, 15)
+    QUARTERROUND(0, 5, 10, 15)
+    QUARTERROUND(1, 6, 11, 12)
+    QUARTERROUND(2, 7, 8, 13)
+    QUARTERROUND(3, 4, 9, 14)
+  }
+
+  for (i = 0; i < 16; ++i) {
+    x[i] = PLUS(x[i], input[i]);
+  }
+  for (i = 0; i < 16; ++i) {
+    U32TO8_LITTLE(output + 4 * i, x[i]);
+  }
+}
+
+void CRYPTO_chacha_20(uint8_t *out, const uint8_t *in, size_t in_len,
+                      const uint8_t key[32], const uint8_t nonce[8],
+                      size_t counter) {
+#ifdef CHAPOLY_x86_64_ASM
+  uint8_t buf[256];
+  size_t buf_size, ctr_msk;
+  void (*core_func)(uint8_t *out, const uint8_t *in, size_t in_len,
+                    const uint8_t key[32], const uint8_t nonce[8],
+                    size_t counter) = NULL;
+#else
+  uint8_t buf[64];
+#endif
+  uint32_t input[16];
+  size_t todo, i;
+
+#ifdef CHAPOLY_x86_64_ASM
+
+  if ((OPENSSL_ia32cap_loc()[2] >> 5) & 1)
+    {
+    buf_size = 128;
+    core_func = chacha_20_core_avx2;
+    ctr_msk = -2;
+    }
+  else if ((OPENSSL_ia32cap_loc()[1] >> 28) & 1)
+    {
+    buf_size = 64;
+    core_func = chacha_20_core_avx;
+    ctr_msk = -1;
+    }
+  else goto do_legacy;
+
+  core_func(out, in, in_len, key, nonce, counter);
+  todo = in_len & (~(-buf_size));
+  if(todo)
+    {
+    out += in_len&(-buf_size);
+    in += in_len&(-buf_size);
+    counter += (in_len/64) & ctr_msk;
+    memcpy(buf, in, todo);
+    core_func(buf, buf, buf_size, key, nonce, counter);
+    memcpy(out, buf, todo);
+    memset(buf, 0, buf_size);
+    }
+  return;
+
+do_legacy:
+#endif
+
+  input[0] = U8TO32_LITTLE(sigma + 0);
+  input[1] = U8TO32_LITTLE(sigma + 4);
+  input[2] = U8TO32_LITTLE(sigma + 8);
+  input[3] = U8TO32_LITTLE(sigma + 12);
+
+  input[4] = U8TO32_LITTLE(key + 0);
+  input[5] = U8TO32_LITTLE(key + 4);
+  input[6] = U8TO32_LITTLE(key + 8);
+  input[7] = U8TO32_LITTLE(key + 12);
+
+  input[8] = U8TO32_LITTLE(key + 16);
+  input[9] = U8TO32_LITTLE(key + 20);
+  input[10] = U8TO32_LITTLE(key + 24);
+  input[11] = U8TO32_LITTLE(key + 28);
+
+  input[12] = counter;
+  input[13] = (uint64_t)counter >> 32;
+  input[14] = U8TO32_LITTLE(nonce + 0);
+  input[15] = U8TO32_LITTLE(nonce + 4);
+
+  while (in_len > 0) {
+    todo = 64;
+    if (in_len < todo) {
+      todo = in_len;
+    }
+
+    chacha_core(buf, input);
+    for (i = 0; i < todo; i++) {
+      out[i] = in[i] ^ buf[i];
+    }
+
+    out += todo;
+    in += todo;
+    in_len -= todo;
+
+    ((uint64_t*)input)[6]++;
+  }
+}
diff -rNu openssl-1.0.2e/crypto/chacha20poly1305/chacha20poly1305.h openssl-1.0.2e-modified/crypto/chacha20poly1305/chacha20poly1305.h
--- openssl-1.0.2e/crypto/chacha20poly1305/chacha20poly1305.h	1970-01-01 01:00:00.000000000 +0100
+++ openssl-1.0.2e-modified/crypto/chacha20poly1305/chacha20poly1305.h	2016-02-08 16:12:00.597614755 +0100
@@ -0,0 +1,63 @@
+#ifndef OPENSSL_HEADER_POLY1305_H
+#define OPENSSL_HEADER_POLY1305_H
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+#include "crypto.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define POLY1305_MAC_LEN (16)
+
+typedef unsigned char poly1305_state[512];
+
+
+/* CRYPTO_poly1305_init sets up |state| so that it can be used to calculate an
+ * authentication tag with the one-time key |key|. Note that |key| is a
+ * one-time key and therefore there is no `reset' method because that would
+ * enable several messages to be authenticated with the same key.
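+ *
+ * A typical one-shot computation looks like this (a sketch; |key|, |msg|
+ * and |msg_len| are the caller's):
+ *
+ *     poly1305_state st;
+ *     uint8_t mac[POLY1305_MAC_LEN];
+ *     CRYPTO_poly1305_init(&st, key);
+ *     CRYPTO_poly1305_update(&st, msg, msg_len);
+ *     CRYPTO_poly1305_finish(&st, mac);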
*/ +void CRYPTO_poly1305_init(poly1305_state* state, const uint8_t key[32]); + +/* CRYPTO_poly1305_update processes |in_len| bytes from |in|. It can be called + * zero or more times after poly1305_init. */ +void CRYPTO_poly1305_update(poly1305_state* state, const uint8_t* in, + size_t in_len); + +/* CRYPTO_poly1305_finish completes the poly1305 calculation and writes a 16 + * byte authentication tag to |mac|. */ +void CRYPTO_poly1305_finish(poly1305_state* state, uint8_t mac[16]); + +/* CRYPTO_chacha_20 encrypts |in_len| bytes from |in| with the given key and + * nonce and writes the result to |out|, which may be equal to |in|. The + * initial block counter is specified by |counter|. */ +void CRYPTO_chacha_20(uint8_t *out, const uint8_t *in, size_t in_len, + const uint8_t key[32], const uint8_t nonce[8], + size_t counter); + +#ifdef CHAPOLY_x86_64_ASM +void poly1305_init_avx(poly1305_state* state, const uint8_t key[32]); +void poly1305_update_avx(poly1305_state* state, const uint8_t *in, size_t in_len); +void poly1305_finish_avx(poly1305_state* state, uint8_t mac[16]); + +void poly1305_init_avx2(poly1305_state* state, const uint8_t key[32]); +void poly1305_update_avx2(poly1305_state* state, const uint8_t *in, size_t in_len); +void poly1305_finish_avx2(poly1305_state* state, uint8_t mac[16]); + +void chacha_20_core_avx(uint8_t *out, const uint8_t *in, size_t in_len, + const uint8_t key[32], const uint8_t nonce[8], + size_t counter); + +void chacha_20_core_avx2(uint8_t *out, const uint8_t *in, size_t in_len, + const uint8_t key[32], const uint8_t nonce[8], + size_t counter); +#endif + + +#if defined(__cplusplus) +} /* extern C */ +#endif + +#endif /* OPENSSL_HEADER_POLY1305_H */ diff -rNu openssl-1.0.2e/crypto/chacha20poly1305/chapolytest.c openssl-1.0.2e-modified/crypto/chacha20poly1305/chapolytest.c --- openssl-1.0.2e/crypto/chacha20poly1305/chapolytest.c 1970-01-01 01:00:00.000000000 +0100 +++ openssl-1.0.2e-modified/crypto/chacha20poly1305/chapolytest.c 2016-02-08 16:12:00.598614755 +0100 @@ -0,0 +1,287 @@ +/* ==================================================================== + * Copyright (c) 2011-2013 The OpenSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" + * + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + * endorse or promote products derived from this software without + * prior written permission. For written permission, please contact + * licensing@OpenSSL.org. + * + * 5. Products derived from this software may not be called "OpenSSL" + * nor may "OpenSSL" appear in their names without prior written + * permission of the OpenSSL Project. + * + * 6. 
Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+
+#include <openssl/chacha20poly1305.h>
+
+struct chacha_test {
+	const char *keyhex;
+	const char *noncehex;
+	const char *outhex;
+};
+
+struct poly1305_test
+	{
+	const char *inputhex;
+	const char *keyhex;
+	const char *outhex;
+	};
+
+static const struct chacha_test chacha_tests[] = {
+	{
+	"0000000000000000000000000000000000000000000000000000000000000000",
+	"0000000000000000",
+	"76b8e0ada0f13d90405d6ae55386bd28bdd219b8a08ded1aa836efcc8b770dc7da41597c5157488d7724e03fb8d84a376a43b8f41518a11cc387b669b2ee6586",
+	},
+	{
+	"0000000000000000000000000000000000000000000000000000000000000001",
+	"0000000000000000",
+	"4540f05a9f1fb296d7736e7b208e3c96eb4fe1834688d2604f450952ed432d41bbe2a0b6ea7566d2a5d1e7e20d42af2c53d792b1c43fea817e9ad275ae546963",
+	},
+	{
+	"0000000000000000000000000000000000000000000000000000000000000000",
+	"0000000000000001",
+	"de9cba7bf3d69ef5e786dc63973f653a0b49e015adbff7134fcb7df137821031e85a050278a7084527214f73efc7fa5b5277062eb7a0433e445f41e31afab757",
+	},
+	{
+	"0000000000000000000000000000000000000000000000000000000000000000",
+	"0100000000000000",
+	"ef3fdfd6c61578fbf5cf35bd3dd33b8009631634d21e42ac33960bd138e50d32111e4caf237ee53ca8ad6426194a88545ddc497a0b466e7d6bbdb0041b2f586b",
+	},
+	{
+	"000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f",
+	"0001020304050607",
+	"f798a189f195e66982105ffb640bb7757f579da31602fc93ec01ac56f85ac3c134a4547b733b46413042c9440049176905d3be59ea1c53f15916155c2be8241a38008b9a26bc35941e2444177c8ade6689de95264986d95889fb60e84629c9bd9a5acb1cc118be563eb9b3a4a472f82e09a7e778492b562ef7130e88dfe031c79db9d4f7c7a899151b9a475032b63fc385245fe054e3dd5a97a5f576fe064025d3ce042c566ab2c507b138db853e3d6959660996546cc9c4a6eafdc777c040d70eaf46f76dad3979e5c5360c3317166a1c894c94a371876a94df7628fe4eaaf2ccb27d5aaae0ad7ad0f9d4b6ad3b54098746d4524d38407a6deb",
+	},
+};
+
+static const struct poly1305_test poly1305_tests[] = {
+	{
+	"",
+	"c8afaac331ee372cd6082de134943b174710130e9f6fea8d72293850a667d86c",
+	"4710130e9f6fea8d72293850a667d86c",
+	},
+	{
+	"48656c6c6f20776f726c6421",
+	"746869732069732033322d62797465206b657920666f7220506f6c7931333035",
+	"a6f745008f81c916a20dcc74eef2b2f0",
+	},
+	{
+	"0000000000000000000000000000000000000000000000000000000000000000",
+	"746869732069732033322d62797465206b657920666f7220506f6c7931333035",
+	"49ec78090e481ec6c26b33b91ccc0307",
+	},
+};
+
+static unsigned char hex_digit(char h)
+	{
+	if (h >= '0' && h <= '9')
+		return h -
'0'; + else if (h >= 'a' && h <= 'f') + return h - 'a' + 10; + else if (h >= 'A' && h <= 'F') + return h - 'A' + 10; + else + abort(); + } + +static void hex_decode(unsigned char *out, const char* hex) + { + size_t j = 0; + + while (*hex != 0) + { + unsigned char v = hex_digit(*hex++); + v <<= 4; + v |= hex_digit(*hex++); + out[j++] = v; + } + } + +static void hexdump(unsigned char *a, size_t len) + { + size_t i; + + for (i = 0; i < len; i++) + printf("%02x", a[i]); + } + +/* misalign returns a pointer that points 0 to 15 bytes into |in| such that the + * returned pointer has alignment 1 mod 16. */ +static void* misalign(void* in) + { + intptr_t x = (intptr_t) in; + x += (17 - (x % 16)) % 16; + return (void*) x; + } + +int main() + { + unsigned num_tests = + sizeof(chacha_tests) / sizeof(struct chacha_test); + unsigned i; + unsigned char key_bytes[32 + 16]; + unsigned char nonce_bytes[8 + 16] = {0}; + + + for (i = 0; i < num_tests; i++) + { + unsigned char *key = misalign(key_bytes); + unsigned char *nonce = misalign(nonce_bytes); + + printf("ChaCha20 test #%d\n", i); + const struct chacha_test *test = &chacha_tests[i]; + unsigned char *expected, *out_bytes, *zero_bytes, *out, *zeros; + size_t len = strlen(test->outhex); + + if (strlen(test->keyhex) != 32*2 || + strlen(test->noncehex) != 8*2 || + (len & 1) == 1) + return 1; + + len /= 2; + + hex_decode(key, test->keyhex); + hex_decode(nonce, test->noncehex); + + expected = malloc(len); + out_bytes = malloc(len+16); + zero_bytes = malloc(len+16); + /* Attempt to test unaligned inputs. */ + out = misalign(out_bytes); + zeros = misalign(zero_bytes); + memset(zeros, 0, len); + + hex_decode(expected, test->outhex); + CRYPTO_chacha_20(out, zeros, len, key, nonce, 0); + + if (memcmp(out, expected, len) != 0) + { + printf("ChaCha20 test #%d failed.\n", i); + printf("got: "); + hexdump(out, len); + printf("\nexpected: "); + hexdump(expected, len); + printf("\n"); + return 1; + } + + /* The last test has a large output. We test whether the + * counter works as expected by skipping the first 64 bytes of + * it. 
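+	 * Because the block counter advances once per 64 bytes of keystream,
+	 * starting the same encryption at counter 1 must reproduce the
+	 * full-length output with its first 64-byte block removed.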
+	 */
+		if (i == num_tests - 1)
+			{
+			CRYPTO_chacha_20(out, zeros, len - 64, key, nonce, 1);
+			if (memcmp(out, expected + 64, len - 64) != 0)
+				{
+				printf("ChaCha20 skip test failed.\n");
+				return 1;
+				}
+			}
+
+		free(expected);
+		free(zero_bytes);
+		free(out_bytes);
+		}
+	num_tests =
+		sizeof(poly1305_tests) / sizeof(struct poly1305_test);
+	unsigned char key[32], out[16], expected[16];
+	poly1305_state poly1305;
+
+	for (i = 0; i < num_tests; i++)
+		{
+		printf("Poly1305 test #%d\n", i);
+		const struct poly1305_test *test = &poly1305_tests[i];
+		unsigned char *in;
+		size_t inlen = strlen(test->inputhex);
+
+		if (strlen(test->keyhex) != sizeof(key)*2 ||
+		    strlen(test->outhex) != sizeof(out)*2 ||
+		    (inlen & 1) == 1)
+			return 1;
+
+		inlen /= 2;
+
+		hex_decode(key, test->keyhex);
+		hex_decode(expected, test->outhex);
+
+		in = malloc(inlen);
+
+		hex_decode(in, test->inputhex);
+
+#ifdef CHAPOLY_x86_64_ASM
+		if((OPENSSL_ia32cap_loc()[2] >> 5) & 1) {
+			poly1305_init_avx2(&poly1305, key);
+			poly1305_update_avx2(&poly1305, in, inlen);
+			poly1305_finish_avx2(&poly1305, out);
+		}
+		else if ((OPENSSL_ia32cap_loc()[1] >> 28) & 1) {
+			poly1305_init_avx(&poly1305, key);
+			poly1305_update_avx(&poly1305, in, inlen);
+			poly1305_finish_avx(&poly1305, out);
+		}
+		else
+#endif
+		{
+			CRYPTO_poly1305_init(&poly1305, key);
+			CRYPTO_poly1305_update(&poly1305, in, inlen);
+			CRYPTO_poly1305_finish(&poly1305, out);
+		}
+		if (memcmp(out, expected, sizeof(expected)) != 0)
+			{
+			printf("Poly1305 test #%d failed.\n", i);
+			printf("got: ");
+			hexdump(out, sizeof(out));
+			printf("\nexpected: ");
+			hexdump(expected, sizeof(expected));
+			printf("\n");
+			return 1;
+			}
+
+		free(in);
+		}
+
+	printf("PASS\n");
+	return 0;
+	}
diff -rNu openssl-1.0.2e/crypto/chacha20poly1305/poly1305.c openssl-1.0.2e-modified/crypto/chacha20poly1305/poly1305.c
--- openssl-1.0.2e/crypto/chacha20poly1305/poly1305.c	1970-01-01 01:00:00.000000000 +0100
+++ openssl-1.0.2e-modified/crypto/chacha20poly1305/poly1305.c	2016-02-08 16:12:00.598614755 +0100
@@ -0,0 +1,285 @@
+/* Copyright (c) 2014, Google Inc.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+
+/* This implementation of poly1305 is by Andrew Moon
+ * (https://github.com/floodyberry/poly1305-donna) and released as public
+ * domain. */
+
+#include <string.h>
+#include <openssl/chacha20poly1305.h>
+
+#if !defined(B_ENDIAN)
+/* We can assume little-endian.
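+ * In that case U8TO32_LE and U32TO8_LE are plain memcpy calls, which
+ * compilers typically lower to single loads and stores.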
*/ +static uint32_t U8TO32_LE(const uint8_t *m) { + uint32_t r; + memcpy(&r, m, sizeof(r)); + return r; +} + +static void U32TO8_LE(uint8_t *m, uint32_t v) { memcpy(m, &v, sizeof(v)); } +#else +static uint32_t U8TO32_LE(const uint8_t *m) { + return (uint32_t)m[0] | (uint32_t)m[1] << 8 | (uint32_t)m[2] << 16 | + (uint32_t)m[3] << 24; +} + +static void U32TO8_LE(uint8_t *m, uint32_t v) { + m[0] = v; + m[1] = v >> 8; + m[2] = v >> 16; + m[3] = v >> 24; +} +#endif + +static uint64_t mul32x32_64(uint32_t a, uint32_t b) { return (uint64_t)a * b; } + +struct poly1305_state_st { + uint32_t r0, r1, r2, r3, r4; + uint32_t s1, s2, s3, s4; + uint32_t h0, h1, h2, h3, h4; + uint8_t buf[16]; + unsigned int buf_used; + uint8_t key[16]; +}; + +/* poly1305_blocks updates |state| given some amount of input data. This + * function may only be called with a |len| that is not a multiple of 16 at the + * end of the data. Otherwise the input must be buffered into 16 byte blocks. */ +static void poly1305_update(struct poly1305_state_st *state, const uint8_t *in, + size_t len) { + uint32_t t0, t1, t2, t3; + uint64_t t[5]; + uint32_t b; + uint64_t c; + size_t j; + uint8_t mp[16]; + + if (len < 16) { + goto poly1305_donna_atmost15bytes; + } + +poly1305_donna_16bytes: + t0 = U8TO32_LE(in); + t1 = U8TO32_LE(in + 4); + t2 = U8TO32_LE(in + 8); + t3 = U8TO32_LE(in + 12); + + in += 16; + len -= 16; + + state->h0 += t0 & 0x3ffffff; + state->h1 += ((((uint64_t)t1 << 32) | t0) >> 26) & 0x3ffffff; + state->h2 += ((((uint64_t)t2 << 32) | t1) >> 20) & 0x3ffffff; + state->h3 += ((((uint64_t)t3 << 32) | t2) >> 14) & 0x3ffffff; + state->h4 += (t3 >> 8) | (1 << 24); + +poly1305_donna_mul: + t[0] = mul32x32_64(state->h0, state->r0) + mul32x32_64(state->h1, state->s4) + + mul32x32_64(state->h2, state->s3) + mul32x32_64(state->h3, state->s2) + + mul32x32_64(state->h4, state->s1); + t[1] = mul32x32_64(state->h0, state->r1) + mul32x32_64(state->h1, state->r0) + + mul32x32_64(state->h2, state->s4) + mul32x32_64(state->h3, state->s3) + + mul32x32_64(state->h4, state->s2); + t[2] = mul32x32_64(state->h0, state->r2) + mul32x32_64(state->h1, state->r1) + + mul32x32_64(state->h2, state->r0) + mul32x32_64(state->h3, state->s4) + + mul32x32_64(state->h4, state->s3); + t[3] = mul32x32_64(state->h0, state->r3) + mul32x32_64(state->h1, state->r2) + + mul32x32_64(state->h2, state->r1) + mul32x32_64(state->h3, state->r0) + + mul32x32_64(state->h4, state->s4); + t[4] = mul32x32_64(state->h0, state->r4) + mul32x32_64(state->h1, state->r3) + + mul32x32_64(state->h2, state->r2) + mul32x32_64(state->h3, state->r1) + + mul32x32_64(state->h4, state->r0); + + state->h0 = (uint32_t)t[0] & 0x3ffffff; + c = (t[0] >> 26); + t[1] += c; + state->h1 = (uint32_t)t[1] & 0x3ffffff; + b = (uint32_t)(t[1] >> 26); + t[2] += b; + state->h2 = (uint32_t)t[2] & 0x3ffffff; + b = (uint32_t)(t[2] >> 26); + t[3] += b; + state->h3 = (uint32_t)t[3] & 0x3ffffff; + b = (uint32_t)(t[3] >> 26); + t[4] += b; + state->h4 = (uint32_t)t[4] & 0x3ffffff; + b = (uint32_t)(t[4] >> 26); + state->h0 += b * 5; + + if (len >= 16) + goto poly1305_donna_16bytes; + +/* final bytes */ +poly1305_donna_atmost15bytes: + if (!len) + return; + + for (j = 0; j < len; j++) + mp[j] = in[j]; + mp[j++] = 1; + for (; j < 16; j++) + mp[j] = 0; + len = 0; + + t0 = U8TO32_LE(mp + 0); + t1 = U8TO32_LE(mp + 4); + t2 = U8TO32_LE(mp + 8); + t3 = U8TO32_LE(mp + 12); + + state->h0 += t0 & 0x3ffffff; + state->h1 += ((((uint64_t)t1 << 32) | t0) >> 26) & 0x3ffffff; + state->h2 += ((((uint64_t)t2 << 32) | t1) >> 20) & 
0x3ffffff; + state->h3 += ((((uint64_t)t3 << 32) | t2) >> 14) & 0x3ffffff; + state->h4 += (t3 >> 8); + + goto poly1305_donna_mul; +} + +void CRYPTO_poly1305_init(poly1305_state *statep, const uint8_t key[32]) { + struct poly1305_state_st *state = (struct poly1305_state_st *)statep; + uint32_t t0, t1, t2, t3; + + t0 = U8TO32_LE(key + 0); + t1 = U8TO32_LE(key + 4); + t2 = U8TO32_LE(key + 8); + t3 = U8TO32_LE(key + 12); + + /* precompute multipliers */ + state->r0 = t0 & 0x3ffffff; + t0 >>= 26; + t0 |= t1 << 6; + state->r1 = t0 & 0x3ffff03; + t1 >>= 20; + t1 |= t2 << 12; + state->r2 = t1 & 0x3ffc0ff; + t2 >>= 14; + t2 |= t3 << 18; + state->r3 = t2 & 0x3f03fff; + t3 >>= 8; + state->r4 = t3 & 0x00fffff; + + state->s1 = state->r1 * 5; + state->s2 = state->r2 * 5; + state->s3 = state->r3 * 5; + state->s4 = state->r4 * 5; + + /* init state */ + state->h0 = 0; + state->h1 = 0; + state->h2 = 0; + state->h3 = 0; + state->h4 = 0; + + state->buf_used = 0; + memcpy(state->key, key + 16, sizeof(state->key)); +} + +void CRYPTO_poly1305_update(poly1305_state *statep, const uint8_t *in, + size_t in_len) { + unsigned int i; + struct poly1305_state_st *state = (struct poly1305_state_st *)statep; + + if (state->buf_used) { + unsigned int todo = 16 - state->buf_used; + if (todo > in_len) + todo = in_len; + for (i = 0; i < todo; i++) + state->buf[state->buf_used + i] = in[i]; + state->buf_used += todo; + in_len -= todo; + in += todo; + + if (state->buf_used == 16) { + poly1305_update(state, state->buf, 16); + state->buf_used = 0; + } + } + + if (in_len >= 16) { + size_t todo = in_len & ~0xf; + poly1305_update(state, in, todo); + in += todo; + in_len &= 0xf; + } + + if (in_len) { + for (i = 0; i < in_len; i++) + state->buf[i] = in[i]; + state->buf_used = in_len; + } +} + +void CRYPTO_poly1305_finish(poly1305_state *statep, uint8_t mac[16]) { + struct poly1305_state_st *state = (struct poly1305_state_st *)statep; + uint64_t f0, f1, f2, f3; + uint32_t g0, g1, g2, g3, g4; + uint32_t b, nb; + + if (state->buf_used) + poly1305_update(state, state->buf, state->buf_used); + + b = state->h0 >> 26; + state->h0 = state->h0 & 0x3ffffff; + state->h1 += b; + b = state->h1 >> 26; + state->h1 = state->h1 & 0x3ffffff; + state->h2 += b; + b = state->h2 >> 26; + state->h2 = state->h2 & 0x3ffffff; + state->h3 += b; + b = state->h3 >> 26; + state->h3 = state->h3 & 0x3ffffff; + state->h4 += b; + b = state->h4 >> 26; + state->h4 = state->h4 & 0x3ffffff; + state->h0 += b * 5; + + g0 = state->h0 + 5; + b = g0 >> 26; + g0 &= 0x3ffffff; + g1 = state->h1 + b; + b = g1 >> 26; + g1 &= 0x3ffffff; + g2 = state->h2 + b; + b = g2 >> 26; + g2 &= 0x3ffffff; + g3 = state->h3 + b; + b = g3 >> 26; + g3 &= 0x3ffffff; + g4 = state->h4 + b - (1 << 26); + + b = (g4 >> 31) - 1; + nb = ~b; + state->h0 = (state->h0 & nb) | (g0 & b); + state->h1 = (state->h1 & nb) | (g1 & b); + state->h2 = (state->h2 & nb) | (g2 & b); + state->h3 = (state->h3 & nb) | (g3 & b); + state->h4 = (state->h4 & nb) | (g4 & b); + + f0 = ((state->h0) | (state->h1 << 26)) + (uint64_t)U8TO32_LE(&state->key[0]); + f1 = ((state->h1 >> 6) | (state->h2 << 20)) + + (uint64_t)U8TO32_LE(&state->key[4]); + f2 = ((state->h2 >> 12) | (state->h3 << 14)) + + (uint64_t)U8TO32_LE(&state->key[8]); + f3 = ((state->h3 >> 18) | (state->h4 << 8)) + + (uint64_t)U8TO32_LE(&state->key[12]); + + U32TO8_LE(&mac[0], f0); + f1 += (f0 >> 32); + U32TO8_LE(&mac[4], f1); + f2 += (f1 >> 32); + U32TO8_LE(&mac[8], f2); + f3 += (f2 >> 32); + U32TO8_LE(&mac[12], f3); +} diff -rNu openssl-1.0.2e/crypto/cryptlib.c 
openssl-1.0.2e-modified/crypto/cryptlib.c --- openssl-1.0.2e/crypto/cryptlib.c 2015-12-03 15:04:23.000000000 +0100 +++ openssl-1.0.2e-modified/crypto/cryptlib.c 2016-02-08 16:12:00.599614755 +0100 @@ -654,19 +654,9 @@ defined(_M_AMD64) || defined(_M_X64) extern unsigned int OPENSSL_ia32cap_P[4]; -unsigned long *OPENSSL_ia32cap_loc(void) +unsigned int *OPENSSL_ia32cap_loc(void) { - if (sizeof(long) == 4) - /* - * If 32-bit application pulls address of OPENSSL_ia32cap_P[0] - * clear second element to maintain the illusion that vector - * is 32-bit. - */ - OPENSSL_ia32cap_P[1] = 0; - - OPENSSL_ia32cap_P[2] = 0; - - return (unsigned long *)OPENSSL_ia32cap_P; + return OPENSSL_ia32cap_P; } # if defined(OPENSSL_CPUID_OBJ) && !defined(OPENSSL_NO_ASM) && !defined(I386_ONLY) diff -rNu openssl-1.0.2e/crypto/crypto.h openssl-1.0.2e-modified/crypto/crypto.h --- openssl-1.0.2e/crypto/crypto.h 2015-12-03 15:04:23.000000000 +0100 +++ openssl-1.0.2e-modified/crypto/crypto.h 2016-02-08 16:12:00.599614755 +0100 @@ -590,7 +590,7 @@ void OpenSSLDie(const char *file, int line, const char *assertion); # define OPENSSL_assert(e) (void)((e) ? 0 : (OpenSSLDie(__FILE__, __LINE__, #e),1)) -unsigned long *OPENSSL_ia32cap_loc(void); +unsigned int *OPENSSL_ia32cap_loc(void); # define OPENSSL_ia32cap (*(OPENSSL_ia32cap_loc())) int OPENSSL_isservice(void); diff -rNu openssl-1.0.2e/crypto/evp/Makefile openssl-1.0.2e-modified/crypto/evp/Makefile --- openssl-1.0.2e/crypto/evp/Makefile 2015-12-03 15:44:23.000000000 +0100 +++ openssl-1.0.2e-modified/crypto/evp/Makefile 2016-02-08 16:12:00.600614755 +0100 @@ -29,7 +29,8 @@ c_all.c c_allc.c c_alld.c evp_lib.c bio_ok.c \ evp_pkey.c evp_pbe.c p5_crpt.c p5_crpt2.c \ e_old.c pmeth_lib.c pmeth_fn.c pmeth_gn.c m_sigver.c \ - e_aes_cbc_hmac_sha1.c e_aes_cbc_hmac_sha256.c e_rc4_hmac_md5.c + e_aes_cbc_hmac_sha1.c e_aes_cbc_hmac_sha256.c e_rc4_hmac_md5.c \ + e_chacha20poly1305.c LIBOBJ= encode.o digest.o evp_enc.o evp_key.o evp_acnf.o evp_cnf.o \ e_des.o e_bf.o e_idea.o e_des3.o e_camellia.o\ @@ -42,7 +43,8 @@ c_all.o c_allc.o c_alld.o evp_lib.o bio_ok.o \ evp_pkey.o evp_pbe.o p5_crpt.o p5_crpt2.o \ e_old.o pmeth_lib.o pmeth_fn.o pmeth_gn.o m_sigver.o \ - e_aes_cbc_hmac_sha1.o e_aes_cbc_hmac_sha256.o e_rc4_hmac_md5.o + e_aes_cbc_hmac_sha1.o e_aes_cbc_hmac_sha256.o e_rc4_hmac_md5.o \ + e_chacha20poly1305.o SRC= $(LIBSRC) @@ -263,6 +265,7 @@ e_cast.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h e_cast.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h e_cast.o: ../../include/openssl/symhacks.h ../cryptlib.h e_cast.c evp_locl.h +e_chacha20poly1305.o: ../../include/openssl/chacha20poly1305.h e_chacha20poly1305.c e_des.o: ../../e_os.h ../../include/openssl/asn1.h ../../include/openssl/bio.h e_des.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h e_des.o: ../../include/openssl/des.h ../../include/openssl/des_old.h diff -rNu openssl-1.0.2e/crypto/evp/e_chacha20poly1305.c openssl-1.0.2e-modified/crypto/evp/e_chacha20poly1305.c --- openssl-1.0.2e/crypto/evp/e_chacha20poly1305.c 1970-01-01 01:00:00.000000000 +0100 +++ openssl-1.0.2e-modified/crypto/evp/e_chacha20poly1305.c 2016-02-08 16:12:00.601614755 +0100 @@ -0,0 +1,323 @@ +/* ==================================================================== + * Copyright (c) 2001-2014 The OpenSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. 
Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ */
+
+#include <openssl/opensslconf.h>
+#ifndef OPENSSL_NO_CHACHA_POLY
+#include <openssl/evp.h>
+#include <openssl/objects.h>
+#include <openssl/crypto.h>
+#include "evp_locl.h"
+#include <openssl/chacha20poly1305.h>
+
+typedef struct
+	{
+	uint8_t key[32];
+	/* uint8_t salt[4]; */
+	uint8_t nonce[8];
+	poly1305_state poly_state;
+	size_t aad_l;
+	size_t ct_l;
+	int valid;
+#ifdef CHAPOLY_x86_64_ASM
+	void (*poly1305_init_ptr)(poly1305_state *, const uint8_t *);
+	void (*poly1305_update_ptr)(poly1305_state *, const uint8_t *, size_t);
+	void (*poly1305_finish_ptr)(poly1305_state *, uint8_t *);
+	#define poly_init aead_ctx->poly1305_init_ptr
+	#define poly_update poly1305_update_wrapper
+	#define poly_finish poly1305_finish_wrapper
+	#define FILL_BUFFER ((size_t)128)
+	uint8_t poly_buffer[FILL_BUFFER];
+	uint8_t chacha_buffer[FILL_BUFFER];
+	uint8_t poly_buffer_used;
+	uint8_t chacha_used;
+#else
+	#define poly_init CRYPTO_poly1305_init
+	#define poly_update(c,i,l) CRYPTO_poly1305_update(&c->poly_state,i,l)
+	#define poly_finish(c,m) CRYPTO_poly1305_finish(&c->poly_state,m)
+#endif
+	} EVP_CHACHA20_POLY1305_CTX;
+
+#ifdef CHAPOLY_x86_64_ASM
+static void poly1305_update_wrapper(EVP_CHACHA20_POLY1305_CTX *ctx, const uint8_t *in, size_t in_len)
+	{
+	int todo;
+	/* Attempt to fill as many bytes as possible before calling the update function */
+	if(in_len < FILL_BUFFER || ctx->poly_buffer_used)
+		{
+		todo = FILL_BUFFER - ctx->poly_buffer_used;
+		todo = in_len < todo ? in_len : todo;
+		memcpy(ctx->poly_buffer + ctx->poly_buffer_used, in, todo);
+		ctx->poly_buffer_used += todo;
+		in += todo;
+		in_len -= todo;
+		if(ctx->poly_buffer_used == FILL_BUFFER)
+			{
+			ctx->poly1305_update_ptr(&ctx->poly_state, ctx->poly_buffer, FILL_BUFFER);
+			ctx->poly_buffer_used = 0;
+			}
+		}
+	if(in_len >= FILL_BUFFER)
+		{
+		ctx->poly1305_update_ptr(&ctx->poly_state, in, in_len&(-FILL_BUFFER));
+		in += in_len&(-FILL_BUFFER);
+		in_len &= (FILL_BUFFER-1);
+		}
+	if(in_len)
+		{
+		memcpy(ctx->poly_buffer, in, in_len);
+		ctx->poly_buffer_used = in_len;
+		}
+	}
+
+static void poly1305_finish_wrapper(EVP_CHACHA20_POLY1305_CTX *ctx, uint8_t mac[16])
+	{
+	if(ctx->poly_buffer_used)
+		{
+		if(ctx->poly_buffer_used % 16)
+			{
+			memset(ctx->poly_buffer + ctx->poly_buffer_used, 0, 16 - (ctx->poly_buffer_used%16));
+			}
+		ctx->poly1305_update_ptr(&ctx->poly_state, ctx->poly_buffer, ctx->poly_buffer_used);
+		}
+	ctx->poly1305_finish_ptr(&ctx->poly_state, mac);
+	memset(ctx->poly_buffer, 0, FILL_BUFFER);
+	}
+#endif
+
+static int EVP_chacha20_poly1305_init(EVP_CIPHER_CTX *ctx, const unsigned char *key, const unsigned char *iv, int enc)
+	{
+	EVP_CHACHA20_POLY1305_CTX *aead_ctx = ctx->cipher_data;
+	/* Simply copy the ChaCha key and IV */
+	memcpy(aead_ctx->key, key, 32);
+	/* memcpy(aead_ctx->salt, iv, 4); */
+	aead_ctx->valid = 0;
+	return 1;
+	}
+
+static int EVP_chacha20_poly1305_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, const unsigned char *in, size_t inl)
+	{
+	EVP_CHACHA20_POLY1305_CTX *aead_ctx = ctx->cipher_data;
+	uint8_t poly_block[16];
+	uint64_t cl;
+	uint64_t cmp;
+	if(!aead_ctx->valid)
+		return 0;
+	if (inl < 16)
+		return -1;
+	/* The last 16 bytes of the input are the MAC */
+	inl -= 16;
+	/* Encryption */
+	if(ctx->encrypt)
+		{
+#ifdef FILL_BUFFER
+		/* Reuse the keystream bytes cached when the Poly1305 key was derived */
+		if(inl<=FILL_BUFFER-64)
+			{
+			int i;
+			for(i=0; i<inl; i++)
+				out[i] = in[i] ^ aead_ctx->chacha_buffer[i+64];
+			}
+		else
+#endif
+			CRYPTO_chacha_20(out, in, inl, aead_ctx->key, aead_ctx->nonce, 1);
+		poly_update(aead_ctx, out, inl);
+		aead_ctx->ct_l += inl;
+		cl = aead_ctx->ct_l;
+		poly_update(aead_ctx, (uint8_t*)&cl, sizeof(cl));
+		poly_finish(aead_ctx, &out[inl]);
+		aead_ctx->valid = 0;
+		return inl+16;
+		}
+	/* Decryption */
+	else
+		{
+		/* |inl| was already reduced by the MAC length above */
+		poly_update(aead_ctx, in, inl);
+#ifdef FILL_BUFFER
+		/* Reuse the keystream bytes cached when the Poly1305 key was derived */
+		if(inl<=FILL_BUFFER-64)
+			{
+			int i;
+			for(i=0; i<inl; i++)
+				out[i] = in[i] ^ aead_ctx->chacha_buffer[i+64];
+			}
+		else
+#endif
+			CRYPTO_chacha_20(out, in, inl, aead_ctx->key, aead_ctx->nonce, 1);
+		aead_ctx->ct_l += inl;
+		cl = aead_ctx->ct_l;
+		poly_update(aead_ctx, (uint8_t*)&cl, sizeof(cl));
+		poly_finish(aead_ctx, poly_block);
+
+		/* Constant-time comparison of the computed and received MACs */
+		cmp  = ((uint64_t*)poly_block)[0] ^ ((uint64_t*)(in + inl))[0];
+		cmp |= ((uint64_t*)poly_block)[1] ^ ((uint64_t*)(in + inl))[1];
+
+		/* if (memcmp(poly_block, in + inl, POLY1305_MAC_LEN)) */
+		if (cmp)
+			{
+			OPENSSL_cleanse(out, inl);
+			aead_ctx->valid = 0;
+			return -1;
+			}
+		aead_ctx->valid = 0;
+		return inl;
+		}
+	return 0;
+	}
+
+static int EVP_chacha20_poly1305_cleanup(EVP_CIPHER_CTX *ctx)
+	{
+	return 1;
+	}
+
+static int EVP_chacha20_poly1305_ctrl(EVP_CIPHER_CTX *ctx, int type, int arg, void *ptr)
+	{
+	EVP_CHACHA20_POLY1305_CTX *aead_ctx = ctx->cipher_data;
+#ifndef FILL_BUFFER
+	uint8_t poly1305_key[32];
+#endif
+	uint8_t aad[13 + 8];
+	uint64_t thirteen = 13;
+
+	switch(type)
+		{
+	case EVP_CTRL_AEAD_TLS1_AAD:
+		if(arg!=13)
+			return 0;
+		/* Initialize poly keys */
+#ifndef FILL_BUFFER
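+		/* Either way, the Poly1305 one-time key is the first 32 bytes of
+		 * the ChaCha20 keystream for this nonce at block counter 0;
+		 * encryption itself starts at counter 1, so the rest of block 0
+		 * is discarded. With FILL_BUFFER, 128 keystream bytes are
+		 * generated at once and bytes 64..127 are cached in
+		 * chacha_buffer, letting records of at most 64 bytes be
+		 * encrypted without running ChaCha20 again. */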
+		memset(poly1305_key, 0, sizeof(poly1305_key));
+#else
+		memset(aead_ctx->chacha_buffer, 0, FILL_BUFFER);
+#endif
+		/* Salt is the IV (not in draft) */
+		/* memcpy(aead_ctx->nonce, aead_ctx->salt, 4); */
+		/* Take sequence number from AAD */
+		/* memcpy(&aead_ctx->nonce[4], ptr, 8); */
+		memcpy(aead_ctx->nonce, ptr, 8);
+
+#ifdef CHAPOLY_x86_64_ASM
+		aead_ctx->poly_buffer_used = 0;
+		if((OPENSSL_ia32cap_loc()[2] >> 5) & 1) /* AVX2 */
+			{
+			aead_ctx->poly1305_init_ptr = poly1305_init_avx2;
+			aead_ctx->poly1305_update_ptr = poly1305_update_avx2;
+			aead_ctx->poly1305_finish_ptr = poly1305_finish_avx2;
+			}
+		else if ((OPENSSL_ia32cap_loc()[1] >> 28) & 1) /* AVX */
+			{
+			aead_ctx->poly1305_init_ptr = poly1305_init_avx;
+			aead_ctx->poly1305_update_ptr = poly1305_update_avx;
+			aead_ctx->poly1305_finish_ptr = poly1305_finish_avx;
+			}
+		else /* C fallback */
+			{
+			aead_ctx->poly1305_init_ptr = CRYPTO_poly1305_init;
+			aead_ctx->poly1305_update_ptr = CRYPTO_poly1305_update;
+			aead_ctx->poly1305_finish_ptr = CRYPTO_poly1305_finish;
+			}
+
+#endif
+#ifndef FILL_BUFFER
+		CRYPTO_chacha_20(poly1305_key, poly1305_key, sizeof(poly1305_key), aead_ctx->key, aead_ctx->nonce, 0);
+		poly_init(&aead_ctx->poly_state, poly1305_key);
+#else
+		CRYPTO_chacha_20(aead_ctx->chacha_buffer, aead_ctx->chacha_buffer, FILL_BUFFER, aead_ctx->key, aead_ctx->nonce, 0);
+		poly_init(&aead_ctx->poly_state, aead_ctx->chacha_buffer);
+		aead_ctx->chacha_used = 64; /* Keep 64 bytes of keystream for later use, to accelerate very short records */
+#endif
+		aead_ctx->aad_l = 0;
+		aead_ctx->ct_l = 0;
+		/* Absorb AAD */
+		memcpy(aad, ptr, arg);
+		memcpy(&aad[arg], &thirteen, sizeof(thirteen));
+		/* If decrypting, shorten the recorded length to exclude the tag */
+		if (!ctx->encrypt)
+			{
+			unsigned int len=aad[arg-2]<<8|aad[arg-1];
+			len -= POLY1305_MAC_LEN;
+			aad[arg-2] = len>>8;
+			aad[arg-1] = len & 0xff;
+			}
+		poly_update(aead_ctx, aad, arg + sizeof(thirteen));
+		/* aead_ctx->aad_l += arg; */
+		aead_ctx->valid = 1;
+		return POLY1305_MAC_LEN;
+		break;
+	default:
+		return 0;
+		break;
+		}
+	return 0;
+	}
+
+#define CUSTOM_FLAGS (\
+		EVP_CIPH_CUSTOM_IV | EVP_CIPH_FLAG_CUSTOM_CIPHER \
+		| EVP_CIPH_ALWAYS_CALL_INIT \
+		| EVP_CIPH_CUSTOM_COPY)
+
+static const EVP_CIPHER chacha20_poly1305 = {
+	NID_chacha20_poly1305,	/* nid */
+	1,			/* block size (stream cipher) */
+	32,			/* key len */
+	0,			/* iv len */
+	CUSTOM_FLAGS|EVP_CIPH_FLAG_AEAD_CIPHER,	/* flags */
+	EVP_chacha20_poly1305_init,
+	EVP_chacha20_poly1305_cipher,
+	EVP_chacha20_poly1305_cleanup,
+	sizeof(EVP_CHACHA20_POLY1305_CTX),	/* ctx size */
+	NULL, NULL,
+	EVP_chacha20_poly1305_ctrl,
+	NULL
+	};
+
+const EVP_CIPHER *EVP_chacha20_poly1305(void)
+{ return &chacha20_poly1305; }
+
+#endif
diff -rNu openssl-1.0.2e/crypto/evp/evp.h openssl-1.0.2e-modified/crypto/evp/evp.h
--- openssl-1.0.2e/crypto/evp/evp.h	2015-12-03 15:04:23.000000000 +0100
+++ openssl-1.0.2e-modified/crypto/evp/evp.h	2016-02-08 16:12:00.601614755 +0100
@@ -893,6 +893,9 @@
 # define EVP_camellia_256_cfb EVP_camellia_256_cfb128
 const EVP_CIPHER *EVP_camellia_256_ofb(void);
 # endif
+# ifndef OPENSSL_NO_CHACHA_POLY
+const EVP_CIPHER *EVP_chacha20_poly1305(void);
+# endif
 
 # ifndef OPENSSL_NO_SEED
 const EVP_CIPHER *EVP_seed_ecb(void);
diff -rNu openssl-1.0.2e/crypto/objects/obj_dat.h openssl-1.0.2e-modified/crypto/objects/obj_dat.h
--- openssl-1.0.2e/crypto/objects/obj_dat.h	2015-12-03 15:41:29.000000000 +0100
+++ openssl-1.0.2e-modified/crypto/objects/obj_dat.h	2016-02-08 16:12:00.603614755 +0100
@@ -62,9 +62,9 @@
  * [including the GNU Public Licence.]
diff -rNu openssl-1.0.2e/crypto/evp/evp.h openssl-1.0.2e-modified/crypto/evp/evp.h
--- openssl-1.0.2e/crypto/evp/evp.h	2015-12-03 15:04:23.000000000 +0100
+++ openssl-1.0.2e-modified/crypto/evp/evp.h	2016-02-08 16:12:00.601614755 +0100
@@ -893,6 +893,9 @@
 #  define EVP_camellia_256_cfb EVP_camellia_256_cfb128
 const EVP_CIPHER *EVP_camellia_256_ofb(void);
 # endif
+# ifndef OPENSSL_NO_CHACHA_POLY
+const EVP_CIPHER *EVP_chacha20_poly1305(void);
+# endif
 
 # ifndef OPENSSL_NO_SEED
 const EVP_CIPHER *EVP_seed_ecb(void);
diff -rNu openssl-1.0.2e/crypto/objects/obj_dat.h openssl-1.0.2e-modified/crypto/objects/obj_dat.h
--- openssl-1.0.2e/crypto/objects/obj_dat.h	2015-12-03 15:41:29.000000000 +0100
+++ openssl-1.0.2e-modified/crypto/objects/obj_dat.h	2016-02-08 16:12:00.603614755 +0100
@@ -62,9 +62,9 @@
  * [including the GNU Public Licence.]
  */
 
-#define NUM_NID 958
-#define NUM_SN 951
-#define NUM_LN 951
+#define NUM_NID 959
+#define NUM_SN 952
+#define NUM_LN 952
 #define NUM_OBJ 890
 
 static const unsigned char lvalues[6255]={
@@ -2514,6 +2514,8 @@
     NID_jurisdictionStateOrProvinceName,11,&(lvalues[6232]),0},
 {"jurisdictionC","jurisdictionCountryName",
     NID_jurisdictionCountryName,11,&(lvalues[6243]),0},
+{"id-chacha20-poly1305","chacha20-poly1305",NID_chacha20_poly1305,0,
+    NULL,0},
 };
 
 static const unsigned int sn_objs[NUM_SN]={
@@ -2954,6 +2956,7 @@
 362,	/* "id-cct-PKIResponse" */
 360,	/* "id-cct-crs" */
 81,	/* "id-ce" */
+958,	/* "id-chacha20-poly1305" */
 680,	/* "id-characteristic-two-basis" */
 263,	/* "id-cmc" */
 334,	/* "id-cmc-addExtensions" */
@@ -3728,6 +3731,7 @@
 677,	/* "certicom-arc" */
 517,	/* "certificate extensions" */
 883,	/* "certificateRevocationList" */
+958,	/* "chacha20-poly1305" */
 54,	/* "challengePassword" */
 407,	/* "characteristic-two-field" */
 395,	/* "clearance" */
diff -rNu openssl-1.0.2e/crypto/objects/obj_mac.h openssl-1.0.2e-modified/crypto/objects/obj_mac.h
--- openssl-1.0.2e/crypto/objects/obj_mac.h	2015-12-03 15:41:28.000000000 +0100
+++ openssl-1.0.2e-modified/crypto/objects/obj_mac.h	2016-02-08 16:12:00.604614755 +0100
@@ -4192,3 +4192,7 @@
 #define LN_jurisdictionCountryName "jurisdictionCountryName"
 #define NID_jurisdictionCountryName 957
 #define OBJ_jurisdictionCountryName 1L,3L,6L,1L,4L,1L,311L,60L,2L,1L,3L
+
+#define SN_chacha20_poly1305 "id-chacha20-poly1305"
+#define LN_chacha20_poly1305 "chacha20-poly1305"
+#define NID_chacha20_poly1305 958
diff -rNu openssl-1.0.2e/ssl/s3_lib.c openssl-1.0.2e-modified/ssl/s3_lib.c
--- openssl-1.0.2e/ssl/s3_lib.c	2015-12-03 15:04:23.000000000 +0100
+++ openssl-1.0.2e-modified/ssl/s3_lib.c	2016-02-08 16:12:00.605614755 +0100
@@ -2891,6 +2891,53 @@
      256},
 #endif
 
+    /* ChaCha20-Poly1305 draft cipher suites */
+#if !defined(OPENSSL_NO_CHACHA_POLY)
+    {
+     1,
+     TLS1_TXT_ECDHE_RSA_WITH_CHACHA20_POLY1305,
+     TLS1_CK_ECDHE_RSA_CHACHA20_POLY1305,
+     SSL_kEECDH,
+     SSL_aRSA,
+     SSL_CHACHA20POLY1305,
+     SSL_AEAD,
+     SSL_TLSV1_2,
+     SSL_NOT_EXP|SSL_HIGH,
+     SSL_HANDSHAKE_MAC_SHA256|TLS1_PRF_SHA256,
+     256,
+     0,
+     },
+
+    {
+     1,
+     TLS1_TXT_ECDHE_ECDSA_WITH_CHACHA20_POLY1305,
+     TLS1_CK_ECDHE_ECDSA_CHACHA20_POLY1305,
+     SSL_kEECDH,
+     SSL_aECDSA,
+     SSL_CHACHA20POLY1305,
+     SSL_AEAD,
+     SSL_TLSV1_2,
+     SSL_NOT_EXP|SSL_HIGH,
+     SSL_HANDSHAKE_MAC_SHA256|TLS1_PRF_SHA256,
+     256,
+     0,
+     },
+
+    {
+     1,
+     TLS1_TXT_DHE_RSA_WITH_CHACHA20_POLY1305,
+     TLS1_CK_DHE_RSA_CHACHA20_POLY1305,
+     SSL_kEDH,
+     SSL_aRSA,
+     SSL_CHACHA20POLY1305,
+     SSL_AEAD,
+     SSL_TLSV1_2,
+     SSL_NOT_EXP|SSL_HIGH,
+     SSL_HANDSHAKE_MAC_SHA256|TLS1_PRF_SHA256,
+     256,
+     0,
+     },
+#endif
+
     /* end of list */
 };
 
@@ -4047,6 +4094,7 @@
     int i, ii, ok;
     CERT *cert;
     unsigned long alg_k, alg_a, mask_k, mask_a, emask_k, emask_a;
+    int use_chacha = 0;
 
     /* Let's see which ciphers we can support */
     cert = s->cert;
@@ -4080,9 +4128,16 @@
     if (s->options & SSL_OP_CIPHER_SERVER_PREFERENCE || tls1_suiteb(s)) {
         prio = srvr;
         allow = clnt;
+        /* Use ChaCha20-Poly1305 only if it is the client's most preferred cipher suite */
+        if (sk_SSL_CIPHER_num(clnt) > 0) {
+            c = sk_SSL_CIPHER_value(clnt, 0);
+            if (c->algorithm_enc == SSL_CHACHA20POLY1305)
+                use_chacha = 1;
+        }
     } else {
         prio = clnt;
         allow = srvr;
+        use_chacha = 1;
     }
 
     tls1_set_cert_validity(s);
@@ -4094,6 +4149,11 @@
         if ((c->algorithm_ssl & SSL_TLSV1_2) && !SSL_USE_TLS1_2_CIPHERS(s))
             continue;
 
+        /* Skip ChaCha unless it is the client's top priority */
+        if ((c->algorithm_enc == SSL_CHACHA20POLY1305) &&
+            !use_chacha)
+            continue;
+
         ssl_set_cert_masks(cert, c);
         mask_k = cert->mask_k;
         mask_a = cert->mask_a;
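The ssl3_choose_cipher() change above implements a soft preference: when SSL_OP_CIPHER_SERVER_PREFERENCE is set, the server keeps its own ordering but only admits ChaCha20-Poly1305 if the client listed it first, which a client typically does when it lacks AES hardware. Restated as a standalone predicate, as a sketch against the internal ssl_locl.h types (chacha_allowed is a hypothetical refactoring, not part of the patch):

#include "ssl_locl.h"

static int chacha_allowed(const SSL *s, STACK_OF(SSL_CIPHER) *clnt)
    {
    const SSL_CIPHER *top;

    if (!(s->options & SSL_OP_CIPHER_SERVER_PREFERENCE) && !tls1_suiteb(s))
        return 1;               /* client ordering decides anyway */
    if (sk_SSL_CIPHER_num(clnt) <= 0)
        return 0;
    top = sk_SSL_CIPHER_value(clnt, 0);
    return top->algorithm_enc == SSL_CHACHA20POLY1305;
    }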
diff -rNu openssl-1.0.2e/ssl/ssl.h openssl-1.0.2e-modified/ssl/ssl.h
--- openssl-1.0.2e/ssl/ssl.h	2015-12-03 15:04:23.000000000 +0100
+++ openssl-1.0.2e-modified/ssl/ssl.h	2016-02-08 16:12:00.606614755 +0100
@@ -297,6 +297,7 @@
 # define SSL_TXT_CAMELLIA128 "CAMELLIA128"
 # define SSL_TXT_CAMELLIA256 "CAMELLIA256"
 # define SSL_TXT_CAMELLIA "CAMELLIA"
+# define SSL_TXT_CHACHA20 "CHACHA20"
 
 # define SSL_TXT_MD5 "MD5"
 # define SSL_TXT_SHA1 "SHA1"
diff -rNu openssl-1.0.2e/ssl/ssl_algs.c openssl-1.0.2e-modified/ssl/ssl_algs.c
--- openssl-1.0.2e/ssl/ssl_algs.c	2015-12-03 15:04:23.000000000 +0100
+++ openssl-1.0.2e-modified/ssl/ssl_algs.c	2016-02-08 16:12:00.606614755 +0100
@@ -106,6 +106,10 @@
     EVP_add_cipher(EVP_camellia_256_cbc());
 #endif
 
+#ifndef OPENSSL_NO_CHACHA_POLY
+    EVP_add_cipher(EVP_chacha20_poly1305());
+#endif
+
 #ifndef OPENSSL_NO_SEED
     EVP_add_cipher(EVP_seed_cbc());
 #endif
diff -rNu openssl-1.0.2e/ssl/ssl_ciph.c openssl-1.0.2e-modified/ssl/ssl_ciph.c
--- openssl-1.0.2e/ssl/ssl_ciph.c	2015-12-03 15:04:23.000000000 +0100
+++ openssl-1.0.2e-modified/ssl/ssl_ciph.c	2016-02-08 16:12:00.607614755 +0100
@@ -164,7 +164,8 @@
 #define SSL_ENC_SEED_IDX 11
 #define SSL_ENC_AES128GCM_IDX 12
 #define SSL_ENC_AES256GCM_IDX 13
-#define SSL_ENC_NUM_IDX 14
+#define SSL_ENC_CHACHA20POLY1305_IDX 14
+#define SSL_ENC_NUM_IDX 15
 
 static const EVP_CIPHER *ssl_cipher_methods[SSL_ENC_NUM_IDX] = {
     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
@@ -316,6 +317,7 @@
     {0, SSL_TXT_CAMELLIA256, 0, 0, 0, SSL_CAMELLIA256, 0, 0, 0, 0, 0, 0},
     {0, SSL_TXT_CAMELLIA, 0, 0, 0, SSL_CAMELLIA128 | SSL_CAMELLIA256, 0, 0, 0, 0, 0, 0},
+    {0, SSL_TXT_CHACHA20, 0, 0, 0, SSL_CHACHA20POLY1305, 0, 0, 0, 0, 0, 0},
 
     /* MAC aliases */
     {0, SSL_TXT_MD5, 0, 0, 0, 0, SSL_MD5, 0, 0, 0, 0, 0},
@@ -432,6 +434,9 @@
     ssl_cipher_methods[SSL_ENC_AES256GCM_IDX] =
         EVP_get_cipherbyname(SN_aes_256_gcm);
 
+    ssl_cipher_methods[SSL_ENC_CHACHA20POLY1305_IDX] =
+        EVP_get_cipherbyname(SN_chacha20_poly1305);
+
     ssl_digest_methods[SSL_MD_MD5_IDX] = EVP_get_digestbyname(SN_md5);
     ssl_mac_secret_size[SSL_MD_MD5_IDX] =
         EVP_MD_size(ssl_digest_methods[SSL_MD_MD5_IDX]);
@@ -582,6 +587,9 @@
     case SSL_AES256GCM:
         i = SSL_ENC_AES256GCM_IDX;
         break;
+    case SSL_CHACHA20POLY1305:
+        i = SSL_ENC_CHACHA20POLY1305_IDX;
+        break;
     default:
         i = -1;
         break;
@@ -806,6 +814,8 @@
         (ssl_cipher_methods[SSL_ENC_GOST89_IDX] == NULL) ? SSL_eGOST2814789CNT : 0;
     *enc |= (ssl_cipher_methods[SSL_ENC_SEED_IDX] == NULL) ? SSL_SEED : 0;
+    *enc |= (ssl_cipher_methods[SSL_ENC_CHACHA20POLY1305_IDX] ==
+             NULL) ? SSL_CHACHA20POLY1305 : 0;
 
     *mac |= (ssl_digest_methods[SSL_MD_MD5_IDX] == NULL) ? SSL_MD5 : 0;
     *mac |= (ssl_digest_methods[SSL_MD_SHA1_IDX] == NULL) ? SSL_SHA1 : 0;
@@ -1824,6 +1834,9 @@
     case SSL_eGOST2814789CNT:
         enc = "GOST89(256)";
         break;
+    case SSL_CHACHA20POLY1305:
+        enc = "CHACHA20-POLY1305(256)";
+        break;
     default:
         enc = "unknown";
         break;
diff -rNu openssl-1.0.2e/ssl/ssl_locl.h openssl-1.0.2e-modified/ssl/ssl_locl.h
--- openssl-1.0.2e/ssl/ssl_locl.h	2015-12-03 15:04:23.000000000 +0100
+++ openssl-1.0.2e-modified/ssl/ssl_locl.h	2016-02-08 16:12:00.608614755 +0100
@@ -354,6 +354,7 @@
 # define SSL_SEED 0x00000800L
 # define SSL_AES128GCM 0x00001000L
 # define SSL_AES256GCM 0x00002000L
+# define SSL_CHACHA20POLY1305 0x00004000L
 
 # define SSL_AES (SSL_AES128|SSL_AES256|SSL_AES128GCM|SSL_AES256GCM)
 # define SSL_CAMELLIA (SSL_CAMELLIA128|SSL_CAMELLIA256)
diff -rNu openssl-1.0.2e/ssl/tls1.h openssl-1.0.2e-modified/ssl/tls1.h
--- openssl-1.0.2e/ssl/tls1.h	2015-12-03 15:04:23.000000000 +0100
+++ openssl-1.0.2e-modified/ssl/tls1.h	2016-02-08 16:12:00.608614755 +0100
@@ -563,6 +563,11 @@
 # define TLS1_CK_ECDH_RSA_WITH_AES_128_GCM_SHA256 0x0300C031
 # define TLS1_CK_ECDH_RSA_WITH_AES_256_GCM_SHA384 0x0300C032
 
+/* ChaCha20-Poly1305 cipher suites from draft-agl-tls-chacha20poly1305-01 */
+# define TLS1_CK_ECDHE_RSA_CHACHA20_POLY1305 0x0300CC13
+# define TLS1_CK_ECDHE_ECDSA_CHACHA20_POLY1305 0x0300CC14
+# define TLS1_CK_DHE_RSA_CHACHA20_POLY1305 0x0300CC15
+
 /*
  * XXX
  * Backward compatibility alert:
 * Older versions of OpenSSL gave
 * some DHE ciphers names with "EDH"
 * instead of "DHE". Going forward, we
@@ -713,6 +718,11 @@
 # define TLS1_TXT_ECDH_RSA_WITH_AES_128_GCM_SHA256 "ECDH-RSA-AES128-GCM-SHA256"
 # define TLS1_TXT_ECDH_RSA_WITH_AES_256_GCM_SHA384 "ECDH-RSA-AES256-GCM-SHA384"
 
+/* ChaCha20-Poly1305 cipher suites from draft-agl-tls-chacha20poly1305-01 */
+#define TLS1_TXT_ECDHE_RSA_WITH_CHACHA20_POLY1305 "ECDHE-RSA-CHACHA20-POLY1305"
+#define TLS1_TXT_ECDHE_ECDSA_WITH_CHACHA20_POLY1305 "ECDHE-ECDSA-CHACHA20-POLY1305"
+#define TLS1_TXT_DHE_RSA_WITH_CHACHA20_POLY1305 "DHE-RSA-CHACHA20-POLY1305"
+
 # define TLS_CT_RSA_SIGN 1
 # define TLS_CT_DSS_SIGN 2
 # define TLS_CT_RSA_FIXED_DH 3
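With the SSL_TXT_CHACHA20 alias wired into the ssl_ciph.c tables above and the suite names defined in tls1.h, the new suites are reachable through the ordinary cipher-string interface. A minimal sketch (prefer_chacha is a hypothetical helper; error handling left to the caller):

#include <openssl/ssl.h>

int prefer_chacha(SSL_CTX *ctx)
    {
    /* "CHACHA20" expands to the SSL_CHACHA20POLY1305 suites defined above */
    return SSL_CTX_set_cipher_list(ctx, "ECDHE+CHACHA20:HIGH:!aNULL:!eNULL");
    }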
diff -rNu openssl-1.0.2e/test/Makefile openssl-1.0.2e-modified/test/Makefile
--- openssl-1.0.2e/test/Makefile	2015-12-03 15:44:31.000000000 +0100
+++ openssl-1.0.2e-modified/test/Makefile	2016-02-08 16:12:00.608614755 +0100
@@ -70,6 +70,7 @@
 CONSTTIMETEST= constant_time_test
 VERIFYEXTRATEST= verify_extra_test
 CLIENTHELLOTEST= clienthellotest
+CHAPOLYTEST= chapolytest
 
 TESTS= alltests
 
@@ -83,7 +84,7 @@
 	$(EVPTEST)$(EXE_EXT) $(EVPEXTRATEST)$(EXE_EXT) $(IGETEST)$(EXE_EXT) $(JPAKETEST)$(EXE_EXT) $(SRPTEST)$(EXE_EXT) \
 	$(ASN1TEST)$(EXE_EXT) $(V3NAMETEST)$(EXE_EXT) $(HEARTBEATTEST)$(EXE_EXT) \
 	$(CONSTTIMETEST)$(EXE_EXT) $(VERIFYEXTRATEST)$(EXE_EXT) \
-	$(CLIENTHELLOTEST)$(EXE_EXT)
+	$(CLIENTHELLOTEST)$(EXE_EXT) $(CHAPOLYTEST)$(EXE_EXT)
 
 # $(METHTEST)$(EXE_EXT)
 
@@ -97,7 +98,7 @@
 	$(BFTEST).o $(SSLTEST).o $(DSATEST).o $(EXPTEST).o $(RSATEST).o \
 	$(EVPTEST).o $(EVPEXTRATEST).o $(IGETEST).o $(JPAKETEST).o $(ASN1TEST).o $(V3NAMETEST).o \
 	$(HEARTBEATTEST).o $(CONSTTIMETEST).o $(VERIFYEXTRATEST).o \
-	$(CLIENTHELLOTEST).o
+	$(CLIENTHELLOTEST).o $(CHAPOLYTEST).o
 
 SRC=	$(BNTEST).c $(ECTEST).c $(ECDSATEST).c $(ECDHTEST).c $(IDEATEST).c \
 	$(MD2TEST).c $(MD4TEST).c $(MD5TEST).c \
@@ -108,7 +109,7 @@
 	$(BFTEST).c $(SSLTEST).c $(DSATEST).c $(EXPTEST).c $(RSATEST).c \
 	$(EVPTEST).c $(EVPEXTRATEST).c $(IGETEST).c $(JPAKETEST).c $(SRPTEST).c $(ASN1TEST).c \
 	$(V3NAMETEST).c $(HEARTBEATTEST).c $(CONSTTIMETEST).c $(VERIFYEXTRATEST).c \
-	$(CLIENTHELLOTEST).c
+	$(CLIENTHELLOTEST).c $(CHAPOLYTEST).c
 
 EXHEADER=
 HEADER=	testutil.h $(EXHEADER)
@@ -144,7 +145,7 @@
 	@(cd ..; $(MAKE) DIRS=apps all)
 
 alltests: \
-	test_des test_idea test_sha test_md4 test_md5 test_hmac \
+	test_des test_idea test_sha test_md4 test_md5 test_hmac test_chapoly \
 	test_md2 test_mdc2 test_wp \
 	test_rmd test_rc2 test_rc4 test_rc5 test_bf test_cast test_aes \
 	test_rand test_bn test_ec test_ecdsa test_ecdh \
@@ -361,6 +362,10 @@
 	@echo $(START) $@
 	../util/shlib_wrap.sh ./$(CLIENTHELLOTEST)
 
+test_chapoly: $(CHAPOLYTEST)$(EXE_EXT)
+	@echo "Test ChaCha20 and Poly1305"
+	../util/shlib_wrap.sh ./$(CHAPOLYTEST)
+
 lint:
 	lint -DLINT $(INCLUDES) $(SRC)>fluff
 
@@ -538,6 +543,9 @@
 $(CLIENTHELLOTEST)$(EXE_EXT): $(CLIENTHELLOTEST).o
 	@target=$(CLIENTHELLOTEST); $(BUILD_CMD)
 
+$(CHAPOLYTEST)$(EXE_EXT): $(CHAPOLYTEST).o
+	@target=$(CHAPOLYTEST); $(BUILD_CMD)
+
 #$(AESTEST).o: $(AESTEST).c
 #	$(CC) -c $(CFLAGS) -DINTERMEDIATE_VALUE_KAT -DTRACE_KAT_MCT $(AESTEST).c
 
@@ -606,6 +614,7 @@
 constant_time_test.o: ../crypto/constant_time_locl.h ../e_os.h
 constant_time_test.o: ../include/openssl/e_os2.h
 constant_time_test.o: ../include/openssl/opensslconf.h constant_time_test.c
+chapolytest.o: ../include/openssl/chacha20poly1305.h chapolytest.c
 destest.o: ../include/openssl/des.h ../include/openssl/des_old.h
 destest.o: ../include/openssl/e_os2.h ../include/openssl/opensslconf.h
 destest.o: ../include/openssl/ossl_typ.h ../include/openssl/safestack.h
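For reference, the data that the _ctrl and _cipher handlers above feed to Poly1305 follows draft-agl-tls-chacha20poly1305-01: the 13-byte TLS AAD, its length as a 64-bit little-endian integer, the ciphertext, and finally the ciphertext length, also 64-bit little-endian; the 32-byte Poly1305 key is the start of the ChaCha20 keystream at block counter 0, as produced by the CRYPTO_chacha_20(..., 0) call in the ctrl handler. A one-shot restatement using the patch's C primitives (draft_agl_mac is a hypothetical helper; the poly1305_state type name is assumed from the chacha20poly1305.h header this patch adds, and, like the code above, host byte order is assumed little-endian):

#include <stdint.h>
#include <openssl/chacha20poly1305.h>

static void draft_agl_mac(uint8_t mac[16], const uint8_t poly_key[32],
                          const uint8_t aad[13], const uint8_t *ct,
                          uint64_t ct_len)
    {
    poly1305_state st;
    uint64_t len = 13;

    CRYPTO_poly1305_init(&st, poly_key);
    CRYPTO_poly1305_update(&st, aad, 13);
    CRYPTO_poly1305_update(&st, (uint8_t *)&len, sizeof(len)); /* len(AAD), LE */
    CRYPTO_poly1305_update(&st, ct, ct_len);
    len = ct_len;
    CRYPTO_poly1305_update(&st, (uint8_t *)&len, sizeof(len)); /* len(ciphertext), LE */
    CRYPTO_poly1305_finish(&st, mac);
    }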