Merge git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6

Pull crypto updates from Herbert Xu: "Here is the crypto update for 4.3: API: - the AEAD interface transition is now complete. - add top-level skcipher interface. Drivers: - x86-64 acceleration for chacha20/poly1305. - add sunxi-ss Allwinner Security System crypto accelerator. - add RSA algorithm to qat driver. - add SRIOV support to qat driver. - add LS1021A support to caam. - add i.MX6 support to caam" * git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6: (163 commits) crypto: algif_aead - fix for multiple operations on AF_ALG sockets crypto: qat - enable legacy VFs MPI: Fix mpi_read_buffer crypto: qat - silence a static checker warning crypto: vmx - Fixing opcode issue crypto: caam - Use the preferred style for memory allocations crypto: caam - Propagate the real error code in caam_probe crypto: caam - Fix the error handling in caam_probe crypto: caam - fix writing to JQCR_MS when using service interface crypto: hash - Add AHASH_REQUEST_ON_STACK crypto: testmgr - Use new skcipher interface crypto: skcipher - Add top-level skcipher interface crypto: cmac - allow usage in FIPS mode crypto: sahara - Use dmam_alloc_coherent crypto: caam - Add support for LS1021A crypto: qat - Don't move data inside output buffer crypto: vmx - Fixing GHASH Key issue on little endian crypto: vmx - Fixing AES-CTR counter bug crypto: null - Add missing Kconfig tristate for NULL2 crypto: nx - Add forward declaration for struct crypto_aead ...
2026-03-09 10:07:04 -07:00 · 2015-08-31 17:38:39 -07:00
parent f36fc04e4c bf433416e6
commit d4c90396ed
154 changed files with 15742 additions and 7446 deletions
--- a/Documentation/DocBook/crypto-API.tmpl
+++ b/Documentation/DocBook/crypto-API.tmpl
@@ -585,7 +585,7 @@ kernel crypto API                                |   IPSEC Layer
 +-----------+                                    |
 |           |            (1)
 |   aead    | <-----------------------------------  esp_output
-| (seqniv)  | ---+
+|  (seqiv)  | ---+
 +-----------+    |
                 | (2)
 +-----------+    |
@@ -1101,7 +1101,7 @@ kernel crypto API            |       Caller
    </para>

    <para>
-     [1] http://www.chronox.de/libkcapi.html
+     [1] <ulink url="http://www.chronox.de/libkcapi.html">http://www.chronox.de/libkcapi.html</ulink>
    </para>

   </sect1>
@@ -1661,7 +1661,7 @@ read(opfd, out, outlen);
    </para>

    <para>
-     [1] http://www.chronox.de/libkcapi.html
+     [1] <ulink url="http://www.chronox.de/libkcapi.html">http://www.chronox.de/libkcapi.html</ulink>
    </para>

   </sect1>
@@ -1687,7 +1687,7 @@ read(opfd, out, outlen);
 !Pinclude/linux/crypto.h Block Cipher Algorithm Definitions
 !Finclude/linux/crypto.h crypto_alg
 !Finclude/linux/crypto.h ablkcipher_alg
-!Finclude/linux/crypto.h aead_alg
+!Finclude/crypto/aead.h aead_alg
 !Finclude/linux/crypto.h blkcipher_alg
 !Finclude/linux/crypto.h cipher_alg
 !Finclude/crypto/rng.h rng_alg
--- a/Documentation/devicetree/bindings/crypto/fsl-sec4.txt
+++ b/Documentation/devicetree/bindings/crypto/fsl-sec4.txt
@@ -106,6 +106,18 @@ PROPERTIES
          to the interrupt parent to which the child domain
          is being mapped.

+   - clocks
+      Usage: required if SEC 4.0 requires explicit enablement of clocks
+      Value type: <prop_encoded-array>
+      Definition:  A list of phandle and clock specifier pairs describing
+          the clocks required for enabling and disabling SEC 4.0.
+
+   - clock-names
+      Usage: required if SEC 4.0 requires explicit enablement of clocks
+      Value type: <string>
+      Definition: A list of clock name strings in the same order as the
+          clocks property.
+
   Note: All other standard properties (see the ePAPR) are allowed
   but are optional.

@@ -120,6 +132,11 @@ EXAMPLE
 		ranges = <0 0x300000 0x10000>;
 		interrupt-parent = <&mpic>;
 		interrupts = <92 2>;
+		clocks = <&clks IMX6QDL_CLK_CAAM_MEM>,
+			 <&clks IMX6QDL_CLK_CAAM_ACLK>,
+			 <&clks IMX6QDL_CLK_CAAM_IPG>,
+			 <&clks IMX6QDL_CLK_EIM_SLOW>;
+		clock-names = "mem", "aclk", "ipg", "emi_slow";
 	};

 =====================================================================
--- a/Documentation/devicetree/bindings/crypto/sun4i-ss.txt
+++ b/Documentation/devicetree/bindings/crypto/sun4i-ss.txt
@@ -0,0 +1,23 @@
+* Allwinner Security System found on A20 SoC
+
+Required properties:
+- compatible : Should be "allwinner,sun4i-a10-crypto".
+- reg: Should contain the Security System register location and length.
+- interrupts: Should contain the IRQ line for the Security System.
+- clocks : List of clock specifiers, corresponding to ahb and ss.
+- clock-names : Name of the functional clock, should be
+	* "ahb" : AHB gating clock
+	* "mod" : SS controller clock
+
+Optional properties:
+ - resets : phandle + reset specifier pair
+ - reset-names : must contain "ahb"
+
+Example:
+	crypto: crypto-engine@01c15000 {
+		compatible = "allwinner,sun4i-a10-crypto";
+		reg = <0x01c15000 0x1000>;
+		interrupts = <GIC_SPI 86 IRQ_TYPE_LEVEL_HIGH>;
+		clocks = <&ahb_gates 5>, <&ss_clk>;
+		clock-names = "ahb", "mod";
+	};
--- a/24
+++ b/24
@@ -556,6 +556,12 @@ S:	Maintained
 F:	Documentation/i2c/busses/i2c-ali1563
 F:	drivers/i2c/busses/i2c-ali1563.c

+ALLWINNER SECURITY SYSTEM
+M:	Corentin Labbe <clabbe.montjoie@gmail.com>
+L:	linux-crypto@vger.kernel.org
+S:	Maintained
+F:	drivers/crypto/sunxi-ss/
+
 ALPHA PORT
 M:	Richard Henderson <rth@twiddle.net>
 M:	Ivan Kokshaysky <ink@jurassic.park.msu.ru>
@@ -5078,9 +5084,21 @@ T:	git git://git.kernel.org/pub/scm/linux/kernel/git/aegl/linux.git
 S:	Maintained
 F:	arch/ia64/

+IBM Power VMX Cryptographic instructions
+M:	Leonidas S. Barbosa <leosilva@linux.vnet.ibm.com>
+M:	Paulo Flabiano Smorigo <pfsmorigo@linux.vnet.ibm.com>
+L:	linux-crypto@vger.kernel.org
+S:	Supported
+F:	drivers/crypto/vmx/Makefile
+F:	drivers/crypto/vmx/Kconfig
+F:	drivers/crypto/vmx/vmx.c
+F:	drivers/crypto/vmx/aes*
+F:	drivers/crypto/vmx/ghash*
+F:	drivers/crypto/vmx/ppc-xlate.pl
+
 IBM Power in-Nest Crypto Acceleration
-M:	Marcelo Henrique Cerri <mhcerri@linux.vnet.ibm.com>
-M:	Fionnuala Gunter <fin@linux.vnet.ibm.com>
+M:	Leonidas S. Barbosa <leosilva@linux.vnet.ibm.com>
+M:	Paulo Flabiano Smorigo <pfsmorigo@linux.vnet.ibm.com>
 L:	linux-crypto@vger.kernel.org
 S:	Supported
 F:	drivers/crypto/nx/Makefile
@@ -5092,7 +5110,7 @@ F:	drivers/crypto/nx/nx_csbcpb.h
 F:	drivers/crypto/nx/nx_debugfs.h

 IBM Power 842 compression accelerator
-M:	Dan Streetman <ddstreet@us.ibm.com>
+M:	Dan Streetman <ddstreet@ieee.org>
 S:	Supported
 F:	drivers/crypto/nx/Makefile
 F:	drivers/crypto/nx/Kconfig
--- a/arch/arm/boot/dts/imx6qdl.dtsi
+++ b/arch/arm/boot/dts/imx6qdl.dtsi
@@ -836,10 +836,31 @@
 			reg = <0x02100000 0x100000>;
 			ranges;

-			caam@02100000 {
-				reg = <0x02100000 0x40000>;
-				interrupts = <0 105 IRQ_TYPE_LEVEL_HIGH>,
-					     <0 106 IRQ_TYPE_LEVEL_HIGH>;
+			crypto: caam@2100000 {
+				compatible = "fsl,sec-v4.0";
+				fsl,sec-era = <4>;
+				#address-cells = <1>;
+				#size-cells = <1>;
+				reg = <0x2100000 0x10000>;
+				ranges = <0 0x2100000 0x10000>;
+				interrupt-parent = <&intc>;
+				clocks = <&clks IMX6QDL_CLK_CAAM_MEM>,
+					 <&clks IMX6QDL_CLK_CAAM_ACLK>,
+					 <&clks IMX6QDL_CLK_CAAM_IPG>,
+					 <&clks IMX6QDL_CLK_EIM_SLOW>;
+				clock-names = "mem", "aclk", "ipg", "emi_slow";
+
+				sec_jr0: jr0@1000 {
+					compatible = "fsl,sec-v4.0-job-ring";
+					reg = <0x1000 0x1000>;
+					interrupts = <GIC_SPI 105 IRQ_TYPE_LEVEL_HIGH>;
+				};
+
+				sec_jr1: jr1@2000 {
+					compatible = "fsl,sec-v4.0-job-ring";
+					reg = <0x2000 0x1000>;
+					interrupts = <GIC_SPI 106 IRQ_TYPE_LEVEL_HIGH>;
+				};
 			};

 			aipstz@0217c000 { /* AIPSTZ2 */
--- a/arch/arm/boot/dts/imx6sx.dtsi
+++ b/arch/arm/boot/dts/imx6sx.dtsi
@@ -738,6 +738,33 @@
 			reg = <0x02100000 0x100000>;
 			ranges;

+			crypto: caam@2100000 {
+				compatible = "fsl,sec-v4.0";
+				fsl,sec-era = <4>;
+				#address-cells = <1>;
+				#size-cells = <1>;
+				reg = <0x2100000 0x10000>;
+				ranges = <0 0x2100000 0x10000>;
+				interrupt-parent = <&intc>;
+				clocks = <&clks IMX6SX_CLK_CAAM_MEM>,
+					 <&clks IMX6SX_CLK_CAAM_ACLK>,
+					 <&clks IMX6SX_CLK_CAAM_IPG>,
+					 <&clks IMX6SX_CLK_EIM_SLOW>;
+				clock-names = "mem", "aclk", "ipg", "emi_slow";
+
+				sec_jr0: jr0@1000 {
+					compatible = "fsl,sec-v4.0-job-ring";
+					reg = <0x1000 0x1000>;
+					interrupts = <GIC_SPI 105 IRQ_TYPE_LEVEL_HIGH>;
+				};
+
+				sec_jr1: jr1@2000 {
+					compatible = "fsl,sec-v4.0-job-ring";
+					reg = <0x2000 0x1000>;
+					interrupts = <GIC_SPI 106 IRQ_TYPE_LEVEL_HIGH>;
+				};
+			};
+
 			usbotg1: usb@02184000 {
 				compatible = "fsl,imx6sx-usb", "fsl,imx27-usb";
 				reg = <0x02184000 0x200>;
--- a/arch/arm/boot/dts/sun4i-a10.dtsi
+++ b/arch/arm/boot/dts/sun4i-a10.dtsi
@@ -678,6 +678,14 @@
 			status = "disabled";
 		};

+		crypto: crypto-engine@01c15000 {
+			compatible = "allwinner,sun4i-a10-crypto";
+			reg = <0x01c15000 0x1000>;
+			interrupts = <86>;
+			clocks = <&ahb_gates 5>, <&ss_clk>;
+			clock-names = "ahb", "mod";
+		};
+
 		spi2: spi@01c17000 {
 			compatible = "allwinner,sun4i-a10-spi";
 			reg = <0x01c17000 0x1000>;
--- a/arch/arm/boot/dts/sun6i-a31.dtsi
+++ b/arch/arm/boot/dts/sun6i-a31.dtsi
@@ -367,6 +367,14 @@
 					     "mmc3_sample";
 		};

+		ss_clk: clk@01c2009c {
+			#clock-cells = <0>;
+			compatible = "allwinner,sun4i-a10-mod0-clk";
+			reg = <0x01c2009c 0x4>;
+			clocks = <&osc24M>, <&pll6 0>;
+			clock-output-names = "ss";
+		};
+
 		spi0_clk: clk@01c200a0 {
 			#clock-cells = <0>;
 			compatible = "allwinner,sun4i-a10-mod0-clk";
@@ -894,6 +902,16 @@
 			#size-cells = <0>;
 		};

+		crypto: crypto-engine@01c15000 {
+			compatible = "allwinner,sun4i-a10-crypto";
+			reg = <0x01c15000 0x1000>;
+			interrupts = <GIC_SPI 80 IRQ_TYPE_LEVEL_HIGH>;
+			clocks = <&ahb1_gates 5>, <&ss_clk>;
+			clock-names = "ahb", "mod";
+			resets = <&ahb1_rst 5>;
+			reset-names = "ahb";
+		};
+
 		timer@01c60000 {
 			compatible = "allwinner,sun6i-a31-hstimer",
 				     "allwinner,sun7i-a20-hstimer";
--- a/arch/arm/boot/dts/sun7i-a20.dtsi
+++ b/arch/arm/boot/dts/sun7i-a20.dtsi
@@ -754,6 +754,14 @@
 			status = "disabled";
 		};

+		crypto: crypto-engine@01c15000 {
+			compatible = "allwinner,sun4i-a10-crypto";
+			reg = <0x01c15000 0x1000>;
+			interrupts = <GIC_SPI 86 IRQ_TYPE_LEVEL_HIGH>;
+			clocks = <&ahb_gates 5>, <&ss_clk>;
+			clock-names = "ahb", "mod";
+		};
+
 		spi2: spi@01c17000 {
 			compatible = "allwinner,sun4i-a10-spi";
 			reg = <0x01c17000 0x1000>;
--- a/arch/arm/configs/imx_v6_v7_defconfig
+++ b/arch/arm/configs/imx_v6_v7_defconfig
@@ -354,8 +354,7 @@ CONFIG_PROVE_LOCKING=y
 # CONFIG_FTRACE is not set
 # CONFIG_ARM_UNWIND is not set
 CONFIG_SECURITYFS=y
-# CONFIG_CRYPTO_ANSI_CPRNG is not set
-# CONFIG_CRYPTO_HW is not set
+CONFIG_CRYPTO_DEV_FSL_CAAM=y
 CONFIG_CRC_CCITT=m
 CONFIG_CRC_T10DIF=y
 CONFIG_CRC7=m
--- a/arch/arm/crypto/.gitignore
+++ b/arch/arm/crypto/.gitignore
@@ -1 +1,3 @@
 aesbs-core.S
+sha256-core.S
+sha512-core.S
--- a/arch/arm64/crypto/aes-ce-ccm-glue.c
+++ b/arch/arm64/crypto/aes-ce-ccm-glue.c
@@ -124,7 +124,7 @@ static void ccm_calculate_auth_mac(struct aead_request *req, u8 mac[])

 	ce_aes_ccm_auth_data(mac, (u8 *)&ltag, ltag.len, &macp, ctx->key_enc,
 			     num_rounds(ctx));
-	scatterwalk_start(&walk, req->assoc);
+	scatterwalk_start(&walk, req->src);

 	do {
 		u32 n = scatterwalk_clamp(&walk, len);
@@ -151,6 +151,10 @@ static int ccm_encrypt(struct aead_request *req)
 	struct crypto_aes_ctx *ctx = crypto_aead_ctx(aead);
 	struct blkcipher_desc desc = { .info = req->iv };
 	struct blkcipher_walk walk;
+	struct scatterlist srcbuf[2];
+	struct scatterlist dstbuf[2];
+	struct scatterlist *src;
+	struct scatterlist *dst;
 	u8 __aligned(8) mac[AES_BLOCK_SIZE];
 	u8 buf[AES_BLOCK_SIZE];
 	u32 len = req->cryptlen;
@@ -168,7 +172,12 @@ static int ccm_encrypt(struct aead_request *req)
 	/* preserve the original iv for the final round */
 	memcpy(buf, req->iv, AES_BLOCK_SIZE);

-	blkcipher_walk_init(&walk, req->dst, req->src, len);
+	src = scatterwalk_ffwd(srcbuf, req->src, req->assoclen);
+	dst = src;
+	if (req->src != req->dst)
+		dst = scatterwalk_ffwd(dstbuf, req->dst, req->assoclen);
+
+	blkcipher_walk_init(&walk, dst, src, len);
 	err = blkcipher_aead_walk_virt_block(&desc, &walk, aead,
 					     AES_BLOCK_SIZE);

@@ -194,7 +203,7 @@ static int ccm_encrypt(struct aead_request *req)
 		return err;

 	/* copy authtag to end of dst */
-	scatterwalk_map_and_copy(mac, req->dst, req->cryptlen,
+	scatterwalk_map_and_copy(mac, dst, req->cryptlen,
 				 crypto_aead_authsize(aead), 1);

 	return 0;
@@ -207,6 +216,10 @@ static int ccm_decrypt(struct aead_request *req)
 	unsigned int authsize = crypto_aead_authsize(aead);
 	struct blkcipher_desc desc = { .info = req->iv };
 	struct blkcipher_walk walk;
+	struct scatterlist srcbuf[2];
+	struct scatterlist dstbuf[2];
+	struct scatterlist *src;
+	struct scatterlist *dst;
 	u8 __aligned(8) mac[AES_BLOCK_SIZE];
 	u8 buf[AES_BLOCK_SIZE];
 	u32 len = req->cryptlen - authsize;
@@ -224,7 +237,12 @@ static int ccm_decrypt(struct aead_request *req)
 	/* preserve the original iv for the final round */
 	memcpy(buf, req->iv, AES_BLOCK_SIZE);

-	blkcipher_walk_init(&walk, req->dst, req->src, len);
+	src = scatterwalk_ffwd(srcbuf, req->src, req->assoclen);
+	dst = src;
+	if (req->src != req->dst)
+		dst = scatterwalk_ffwd(dstbuf, req->dst, req->assoclen);
+
+	blkcipher_walk_init(&walk, dst, src, len);
 	err = blkcipher_aead_walk_virt_block(&desc, &walk, aead,
 					     AES_BLOCK_SIZE);

@@ -250,44 +268,42 @@ static int ccm_decrypt(struct aead_request *req)
 		return err;

 	/* compare calculated auth tag with the stored one */
-	scatterwalk_map_and_copy(buf, req->src, req->cryptlen - authsize,
+	scatterwalk_map_and_copy(buf, src, req->cryptlen - authsize,
 				 authsize, 0);

-	if (memcmp(mac, buf, authsize))
+	if (crypto_memneq(mac, buf, authsize))
 		return -EBADMSG;
 	return 0;
 }

-static struct crypto_alg ccm_aes_alg = {
-	.cra_name		= "ccm(aes)",
-	.cra_driver_name	= "ccm-aes-ce",
-	.cra_priority		= 300,
-	.cra_flags		= CRYPTO_ALG_TYPE_AEAD,
-	.cra_blocksize		= 1,
-	.cra_ctxsize		= sizeof(struct crypto_aes_ctx),
-	.cra_alignmask		= 7,
-	.cra_type		= &crypto_aead_type,
-	.cra_module		= THIS_MODULE,
-	.cra_aead = {
-		.ivsize		= AES_BLOCK_SIZE,
-		.maxauthsize	= AES_BLOCK_SIZE,
-		.setkey		= ccm_setkey,
-		.setauthsize	= ccm_setauthsize,
-		.encrypt	= ccm_encrypt,
-		.decrypt	= ccm_decrypt,
-	}
+static struct aead_alg ccm_aes_alg = {
+	.base = {
+		.cra_name		= "ccm(aes)",
+		.cra_driver_name	= "ccm-aes-ce",
+		.cra_priority		= 300,
+		.cra_blocksize		= 1,
+		.cra_ctxsize		= sizeof(struct crypto_aes_ctx),
+		.cra_alignmask		= 7,
+		.cra_module		= THIS_MODULE,
+	},
+	.ivsize		= AES_BLOCK_SIZE,
+	.maxauthsize	= AES_BLOCK_SIZE,
+	.setkey		= ccm_setkey,
+	.setauthsize	= ccm_setauthsize,
+	.encrypt	= ccm_encrypt,
+	.decrypt	= ccm_decrypt,
 };

 static int __init aes_mod_init(void)
 {
 	if (!(elf_hwcap & HWCAP_AES))
 		return -ENODEV;
-	return crypto_register_alg(&ccm_aes_alg);
+	return crypto_register_aead(&ccm_aes_alg);
 }

 static void __exit aes_mod_exit(void)
 {
-	crypto_unregister_alg(&ccm_aes_alg);
+	crypto_unregister_aead(&ccm_aes_alg);
 }

 module_init(aes_mod_init);
--- a/arch/powerpc/include/asm/switch_to.h
+++ b/arch/powerpc/include/asm/switch_to.h
@@ -29,6 +29,7 @@ static inline void save_early_sprs(struct thread_struct *prev) {}

 extern void enable_kernel_fp(void);
 extern void enable_kernel_altivec(void);
+extern void enable_kernel_vsx(void);
 extern int emulate_altivec(struct pt_regs *);
 extern void __giveup_vsx(struct task_struct *);
 extern void giveup_vsx(struct task_struct *);
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -204,8 +204,6 @@ EXPORT_SYMBOL_GPL(flush_altivec_to_thread);
 #endif /* CONFIG_ALTIVEC */

 #ifdef CONFIG_VSX
-#if 0
-/* not currently used, but some crazy RAID module might want to later */
 void enable_kernel_vsx(void)
 {
 	WARN_ON(preemptible());
@@ -220,7 +218,6 @@ void enable_kernel_vsx(void)
 #endif /* CONFIG_SMP */
 }
 EXPORT_SYMBOL(enable_kernel_vsx);
-#endif

 void giveup_vsx(struct task_struct *tsk)
 {
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -20,6 +20,7 @@ obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o
 obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
 obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o
 obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o
+obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha20-x86_64.o
 obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o
 obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
 obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
@@ -30,6 +31,7 @@ obj-$(CONFIG_CRYPTO_CRC32_PCLMUL) += crc32-pclmul.o
 obj-$(CONFIG_CRYPTO_SHA256_SSSE3) += sha256-ssse3.o
 obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o
 obj-$(CONFIG_CRYPTO_CRCT10DIF_PCLMUL) += crct10dif-pclmul.o
+obj-$(CONFIG_CRYPTO_POLY1305_X86_64) += poly1305-x86_64.o

 # These modules require assembler to support AVX.
 ifeq ($(avx_supported),yes)
@@ -60,6 +62,7 @@ blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o
 twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
 twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o
 salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o
+chacha20-x86_64-y := chacha20-ssse3-x86_64.o chacha20_glue.o
 serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o

 ifeq ($(avx_supported),yes)
@@ -75,6 +78,7 @@ endif

 ifeq ($(avx2_supported),yes)
 	camellia-aesni-avx2-y := camellia-aesni-avx2-asm_64.o camellia_aesni_avx2_glue.o
+	chacha20-x86_64-y += chacha20-avx2-x86_64.o
 	serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o
 endif

@@ -82,8 +86,10 @@ aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o
 aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o aes_ctrby8_avx-x86_64.o
 ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
 sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o
+poly1305-x86_64-y := poly1305-sse2-x86_64.o poly1305_glue.o
 ifeq ($(avx2_supported),yes)
 sha1-ssse3-y += sha1_avx2_x86_64_asm.o
+poly1305-x86_64-y += poly1305-avx2-x86_64.o
 endif
 crc32c-intel-y := crc32c-intel_glue.o
 crc32c-intel-$(CONFIG_64BIT) += crc32c-pcl-intel-asm_64.o
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -803,10 +803,7 @@ static int rfc4106_init(struct crypto_aead *aead)
 		return PTR_ERR(cryptd_tfm);

 	*ctx = cryptd_tfm;
-	crypto_aead_set_reqsize(
-		aead,
-		sizeof(struct aead_request) +
-		crypto_aead_reqsize(&cryptd_tfm->base));
+	crypto_aead_set_reqsize(aead, crypto_aead_reqsize(&cryptd_tfm->base));
 	return 0;
 }

@@ -955,8 +952,8 @@ static int helper_rfc4106_encrypt(struct aead_request *req)

 	/* Assuming we are supporting rfc4106 64-bit extended */
 	/* sequence numbers We need to have the AAD length equal */
-	/* to 8 or 12 bytes */
-	if (unlikely(req->assoclen != 8 && req->assoclen != 12))
+	/* to 16 or 20 bytes */
+	if (unlikely(req->assoclen != 16 && req->assoclen != 20))
 		return -EINVAL;

 	/* IV below built */
@@ -992,9 +989,9 @@ static int helper_rfc4106_encrypt(struct aead_request *req)
 	}

 	kernel_fpu_begin();
-	aesni_gcm_enc_tfm(aes_ctx, dst, src, (unsigned long)req->cryptlen, iv,
-		ctx->hash_subkey, assoc, (unsigned long)req->assoclen, dst
-		+ ((unsigned long)req->cryptlen), auth_tag_len);
+	aesni_gcm_enc_tfm(aes_ctx, dst, src, req->cryptlen, iv,
+			  ctx->hash_subkey, assoc, req->assoclen - 8,
+			  dst + req->cryptlen, auth_tag_len);
 	kernel_fpu_end();

 	/* The authTag (aka the Integrity Check Value) needs to be written
@@ -1033,12 +1030,12 @@ static int helper_rfc4106_decrypt(struct aead_request *req)
 	struct scatter_walk dst_sg_walk;
 	unsigned int i;

-	if (unlikely(req->assoclen != 8 && req->assoclen != 12))
+	if (unlikely(req->assoclen != 16 && req->assoclen != 20))
 		return -EINVAL;

 	/* Assuming we are supporting rfc4106 64-bit extended */
 	/* sequence numbers We need to have the AAD length */
-	/* equal to 8 or 12 bytes */
+	/* equal to 16 or 20 bytes */

 	tempCipherLen = (unsigned long)(req->cryptlen - auth_tag_len);
 	/* IV below built */
@@ -1075,8 +1072,8 @@ static int helper_rfc4106_decrypt(struct aead_request *req)

 	kernel_fpu_begin();
 	aesni_gcm_dec_tfm(aes_ctx, dst, src, tempCipherLen, iv,
-		ctx->hash_subkey, assoc, (unsigned long)req->assoclen,
-		authTag, auth_tag_len);
+			  ctx->hash_subkey, assoc, req->assoclen - 8,
+			  authTag, auth_tag_len);
 	kernel_fpu_end();

 	/* Compare generated tag with passed in tag. */
@@ -1105,19 +1102,12 @@ static int rfc4106_encrypt(struct aead_request *req)
 	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
 	struct cryptd_aead **ctx = crypto_aead_ctx(tfm);
 	struct cryptd_aead *cryptd_tfm = *ctx;
-	struct aead_request *subreq = aead_request_ctx(req);

-	aead_request_set_tfm(subreq, irq_fpu_usable() ?
-				     cryptd_aead_child(cryptd_tfm) :
-				     &cryptd_tfm->base);
+	aead_request_set_tfm(req, irq_fpu_usable() ?
+				  cryptd_aead_child(cryptd_tfm) :
+				  &cryptd_tfm->base);

-	aead_request_set_callback(subreq, req->base.flags,
-				  req->base.complete, req->base.data);
-	aead_request_set_crypt(subreq, req->src, req->dst,
-			       req->cryptlen, req->iv);
-	aead_request_set_ad(subreq, req->assoclen);
-
-	return crypto_aead_encrypt(subreq);
+	return crypto_aead_encrypt(req);
 }

 static int rfc4106_decrypt(struct aead_request *req)
@@ -1125,19 +1115,12 @@ static int rfc4106_decrypt(struct aead_request *req)
 	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
 	struct cryptd_aead **ctx = crypto_aead_ctx(tfm);
 	struct cryptd_aead *cryptd_tfm = *ctx;
-	struct aead_request *subreq = aead_request_ctx(req);

-	aead_request_set_tfm(subreq, irq_fpu_usable() ?
-				     cryptd_aead_child(cryptd_tfm) :
-				     &cryptd_tfm->base);
+	aead_request_set_tfm(req, irq_fpu_usable() ?
+				  cryptd_aead_child(cryptd_tfm) :
+				  &cryptd_tfm->base);

-	aead_request_set_callback(subreq, req->base.flags,
-				  req->base.complete, req->base.data);
-	aead_request_set_crypt(subreq, req->src, req->dst,
-			       req->cryptlen, req->iv);
-	aead_request_set_ad(subreq, req->assoclen);
-
-	return crypto_aead_decrypt(subreq);
+	return crypto_aead_decrypt(req);
 }
 #endif

--- a/arch/x86/crypto/chacha20-avx2-x86_64.S
+++ b/arch/x86/crypto/chacha20-avx2-x86_64.S
@@ -0,0 +1,443 @@
+/*
+ * ChaCha20 256-bit cipher algorithm, RFC7539, x64 AVX2 functions
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/linkage.h>
+
+.data
+.align 32
+
+ROT8:	.octa 0x0e0d0c0f0a09080b0605040702010003
+	.octa 0x0e0d0c0f0a09080b0605040702010003
+ROT16:	.octa 0x0d0c0f0e09080b0a0504070601000302
+	.octa 0x0d0c0f0e09080b0a0504070601000302
+CTRINC:	.octa 0x00000003000000020000000100000000
+	.octa 0x00000007000000060000000500000004
+
+.text
+
+ENTRY(chacha20_8block_xor_avx2)
+	# %rdi: Input state matrix, s
+	# %rsi: 8 data blocks output, o
+	# %rdx: 8 data blocks input, i
+
+	# This function encrypts eight consecutive ChaCha20 blocks by loading
+	# the state matrix in AVX registers eight times. As we need some
+	# scratch registers, we save the first four registers on the stack. The
+	# algorithm performs each operation on the corresponding word of each
+	# state matrix, hence requires no word shuffling. For final XORing step
+	# we transpose the matrix by interleaving 32-, 64- and then 128-bit
+	# words, which allows us to do XOR in AVX registers. 8/16-bit word
+	# rotation is done with the slightly better performing byte shuffling,
+	# 7/12-bit word rotation uses traditional shift+OR.
+
+	vzeroupper
+	# 4 * 32 byte stack, 32-byte aligned
+	mov		%rsp, %r8
+	and		$~31, %rsp
+	sub		$0x80, %rsp
+
+	# x0..15[0-7] = s[0..15]
+	vpbroadcastd	0x00(%rdi),%ymm0
+	vpbroadcastd	0x04(%rdi),%ymm1
+	vpbroadcastd	0x08(%rdi),%ymm2
+	vpbroadcastd	0x0c(%rdi),%ymm3
+	vpbroadcastd	0x10(%rdi),%ymm4
+	vpbroadcastd	0x14(%rdi),%ymm5
+	vpbroadcastd	0x18(%rdi),%ymm6
+	vpbroadcastd	0x1c(%rdi),%ymm7
+	vpbroadcastd	0x20(%rdi),%ymm8
+	vpbroadcastd	0x24(%rdi),%ymm9
+	vpbroadcastd	0x28(%rdi),%ymm10
+	vpbroadcastd	0x2c(%rdi),%ymm11
+	vpbroadcastd	0x30(%rdi),%ymm12
+	vpbroadcastd	0x34(%rdi),%ymm13
+	vpbroadcastd	0x38(%rdi),%ymm14
+	vpbroadcastd	0x3c(%rdi),%ymm15
+	# x0..3 on stack
+	vmovdqa		%ymm0,0x00(%rsp)
+	vmovdqa		%ymm1,0x20(%rsp)
+	vmovdqa		%ymm2,0x40(%rsp)
+	vmovdqa		%ymm3,0x60(%rsp)
+
+	vmovdqa		CTRINC(%rip),%ymm1
+	vmovdqa		ROT8(%rip),%ymm2
+	vmovdqa		ROT16(%rip),%ymm3
+
+	# x12 += counter values 0-3
+	vpaddd		%ymm1,%ymm12,%ymm12
+
+	mov		$10,%ecx
+
+.Ldoubleround8:
+	# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
+	vpaddd		0x00(%rsp),%ymm4,%ymm0
+	vmovdqa		%ymm0,0x00(%rsp)
+	vpxor		%ymm0,%ymm12,%ymm12
+	vpshufb		%ymm3,%ymm12,%ymm12
+	# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
+	vpaddd		0x20(%rsp),%ymm5,%ymm0
+	vmovdqa		%ymm0,0x20(%rsp)
+	vpxor		%ymm0,%ymm13,%ymm13
+	vpshufb		%ymm3,%ymm13,%ymm13
+	# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
+	vpaddd		0x40(%rsp),%ymm6,%ymm0
+	vmovdqa		%ymm0,0x40(%rsp)
+	vpxor		%ymm0,%ymm14,%ymm14
+	vpshufb		%ymm3,%ymm14,%ymm14
+	# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
+	vpaddd		0x60(%rsp),%ymm7,%ymm0
+	vmovdqa		%ymm0,0x60(%rsp)
+	vpxor		%ymm0,%ymm15,%ymm15
+	vpshufb		%ymm3,%ymm15,%ymm15
+
+	# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
+	vpaddd		%ymm12,%ymm8,%ymm8
+	vpxor		%ymm8,%ymm4,%ymm4
+	vpslld		$12,%ymm4,%ymm0
+	vpsrld		$20,%ymm4,%ymm4
+	vpor		%ymm0,%ymm4,%ymm4
+	# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
+	vpaddd		%ymm13,%ymm9,%ymm9
+	vpxor		%ymm9,%ymm5,%ymm5
+	vpslld		$12,%ymm5,%ymm0
+	vpsrld		$20,%ymm5,%ymm5
+	vpor		%ymm0,%ymm5,%ymm5
+	# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
+	vpaddd		%ymm14,%ymm10,%ymm10
+	vpxor		%ymm10,%ymm6,%ymm6
+	vpslld		$12,%ymm6,%ymm0
+	vpsrld		$20,%ymm6,%ymm6
+	vpor		%ymm0,%ymm6,%ymm6
+	# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
+	vpaddd		%ymm15,%ymm11,%ymm11
+	vpxor		%ymm11,%ymm7,%ymm7
+	vpslld		$12,%ymm7,%ymm0
+	vpsrld		$20,%ymm7,%ymm7
+	vpor		%ymm0,%ymm7,%ymm7
+
+	# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
+	vpaddd		0x00(%rsp),%ymm4,%ymm0
+	vmovdqa		%ymm0,0x00(%rsp)
+	vpxor		%ymm0,%ymm12,%ymm12
+	vpshufb		%ymm2,%ymm12,%ymm12
+	# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
+	vpaddd		0x20(%rsp),%ymm5,%ymm0
+	vmovdqa		%ymm0,0x20(%rsp)
+	vpxor		%ymm0,%ymm13,%ymm13
+	vpshufb		%ymm2,%ymm13,%ymm13
+	# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
+	vpaddd		0x40(%rsp),%ymm6,%ymm0
+	vmovdqa		%ymm0,0x40(%rsp)
+	vpxor		%ymm0,%ymm14,%ymm14
+	vpshufb		%ymm2,%ymm14,%ymm14
+	# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
+	vpaddd		0x60(%rsp),%ymm7,%ymm0
+	vmovdqa		%ymm0,0x60(%rsp)
+	vpxor		%ymm0,%ymm15,%ymm15
+	vpshufb		%ymm2,%ymm15,%ymm15
+
+	# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
+	vpaddd		%ymm12,%ymm8,%ymm8
+	vpxor		%ymm8,%ymm4,%ymm4
+	vpslld		$7,%ymm4,%ymm0
+	vpsrld		$25,%ymm4,%ymm4
+	vpor		%ymm0,%ymm4,%ymm4
+	# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
+	vpaddd		%ymm13,%ymm9,%ymm9
+	vpxor		%ymm9,%ymm5,%ymm5
+	vpslld		$7,%ymm5,%ymm0
+	vpsrld		$25,%ymm5,%ymm5
+	vpor		%ymm0,%ymm5,%ymm5
+	# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
+	vpaddd		%ymm14,%ymm10,%ymm10
+	vpxor		%ymm10,%ymm6,%ymm6
+	vpslld		$7,%ymm6,%ymm0
+	vpsrld		$25,%ymm6,%ymm6
+	vpor		%ymm0,%ymm6,%ymm6
+	# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
+	vpaddd		%ymm15,%ymm11,%ymm11
+	vpxor		%ymm11,%ymm7,%ymm7
+	vpslld		$7,%ymm7,%ymm0
+	vpsrld		$25,%ymm7,%ymm7
+	vpor		%ymm0,%ymm7,%ymm7
+
+	# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
+	vpaddd		0x00(%rsp),%ymm5,%ymm0
+	vmovdqa		%ymm0,0x00(%rsp)
+	vpxor		%ymm0,%ymm15,%ymm15
+	vpshufb		%ymm3,%ymm15,%ymm15
+	# x1 += x6, x12 = rotl32(x12 ^ x1, 16)%ymm0
+	vpaddd		0x20(%rsp),%ymm6,%ymm0
+	vmovdqa		%ymm0,0x20(%rsp)
+	vpxor		%ymm0,%ymm12,%ymm12
+	vpshufb		%ymm3,%ymm12,%ymm12
+	# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
+	vpaddd		0x40(%rsp),%ymm7,%ymm0
+	vmovdqa		%ymm0,0x40(%rsp)
+	vpxor		%ymm0,%ymm13,%ymm13
+	vpshufb		%ymm3,%ymm13,%ymm13
+	# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
+	vpaddd		0x60(%rsp),%ymm4,%ymm0
+	vmovdqa		%ymm0,0x60(%rsp)
+	vpxor		%ymm0,%ymm14,%ymm14
+	vpshufb		%ymm3,%ymm14,%ymm14
+
+	# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
+	vpaddd		%ymm15,%ymm10,%ymm10
+	vpxor		%ymm10,%ymm5,%ymm5
+	vpslld		$12,%ymm5,%ymm0
+	vpsrld		$20,%ymm5,%ymm5
+	vpor		%ymm0,%ymm5,%ymm5
+	# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
+	vpaddd		%ymm12,%ymm11,%ymm11
+	vpxor		%ymm11,%ymm6,%ymm6
+	vpslld		$12,%ymm6,%ymm0
+	vpsrld		$20,%ymm6,%ymm6
+	vpor		%ymm0,%ymm6,%ymm6
+	# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
+	vpaddd		%ymm13,%ymm8,%ymm8
+	vpxor		%ymm8,%ymm7,%ymm7
+	vpslld		$12,%ymm7,%ymm0
+	vpsrld		$20,%ymm7,%ymm7
+	vpor		%ymm0,%ymm7,%ymm7
+	# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
+	vpaddd		%ymm14,%ymm9,%ymm9
+	vpxor		%ymm9,%ymm4,%ymm4
+	vpslld		$12,%ymm4,%ymm0
+	vpsrld		$20,%ymm4,%ymm4
+	vpor		%ymm0,%ymm4,%ymm4
+
+	# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
+	vpaddd		0x00(%rsp),%ymm5,%ymm0
+	vmovdqa		%ymm0,0x00(%rsp)
+	vpxor		%ymm0,%ymm15,%ymm15
+	vpshufb		%ymm2,%ymm15,%ymm15
+	# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
+	vpaddd		0x20(%rsp),%ymm6,%ymm0
+	vmovdqa		%ymm0,0x20(%rsp)
+	vpxor		%ymm0,%ymm12,%ymm12
+	vpshufb		%ymm2,%ymm12,%ymm12
+	# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
+	vpaddd		0x40(%rsp),%ymm7,%ymm0
+	vmovdqa		%ymm0,0x40(%rsp)
+	vpxor		%ymm0,%ymm13,%ymm13
+	vpshufb		%ymm2,%ymm13,%ymm13
+	# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
+	vpaddd		0x60(%rsp),%ymm4,%ymm0
+	vmovdqa		%ymm0,0x60(%rsp)
+	vpxor		%ymm0,%ymm14,%ymm14
+	vpshufb		%ymm2,%ymm14,%ymm14
+
+	# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
+	vpaddd		%ymm15,%ymm10,%ymm10
+	vpxor		%ymm10,%ymm5,%ymm5
+	vpslld		$7,%ymm5,%ymm0
+	vpsrld		$25,%ymm5,%ymm5
+	vpor		%ymm0,%ymm5,%ymm5
+	# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
+	vpaddd		%ymm12,%ymm11,%ymm11
+	vpxor		%ymm11,%ymm6,%ymm6
+	vpslld		$7,%ymm6,%ymm0
+	vpsrld		$25,%ymm6,%ymm6
+	vpor		%ymm0,%ymm6,%ymm6
+	# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
+	vpaddd		%ymm13,%ymm8,%ymm8
+	vpxor		%ymm8,%ymm7,%ymm7
+	vpslld		$7,%ymm7,%ymm0
+	vpsrld		$25,%ymm7,%ymm7
+	vpor		%ymm0,%ymm7,%ymm7
+	# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
+	vpaddd		%ymm14,%ymm9,%ymm9
+	vpxor		%ymm9,%ymm4,%ymm4
+	vpslld		$7,%ymm4,%ymm0
+	vpsrld		$25,%ymm4,%ymm4
+	vpor		%ymm0,%ymm4,%ymm4
+
+	dec		%ecx
+	jnz		.Ldoubleround8
+
+	# x0..15[0-3] += s[0..15]
+	vpbroadcastd	0x00(%rdi),%ymm0
+	vpaddd		0x00(%rsp),%ymm0,%ymm0
+	vmovdqa		%ymm0,0x00(%rsp)
+	vpbroadcastd	0x04(%rdi),%ymm0
+	vpaddd		0x20(%rsp),%ymm0,%ymm0
+	vmovdqa		%ymm0,0x20(%rsp)
+	vpbroadcastd	0x08(%rdi),%ymm0
+	vpaddd		0x40(%rsp),%ymm0,%ymm0
+	vmovdqa		%ymm0,0x40(%rsp)
+	vpbroadcastd	0x0c(%rdi),%ymm0
+	vpaddd		0x60(%rsp),%ymm0,%ymm0
+	vmovdqa		%ymm0,0x60(%rsp)
+	vpbroadcastd	0x10(%rdi),%ymm0
+	vpaddd		%ymm0,%ymm4,%ymm4
+	vpbroadcastd	0x14(%rdi),%ymm0
+	vpaddd		%ymm0,%ymm5,%ymm5
+	vpbroadcastd	0x18(%rdi),%ymm0
+	vpaddd		%ymm0,%ymm6,%ymm6
+	vpbroadcastd	0x1c(%rdi),%ymm0
+	vpaddd		%ymm0,%ymm7,%ymm7
+	vpbroadcastd	0x20(%rdi),%ymm0
+	vpaddd		%ymm0,%ymm8,%ymm8
+	vpbroadcastd	0x24(%rdi),%ymm0
+	vpaddd		%ymm0,%ymm9,%ymm9
+	vpbroadcastd	0x28(%rdi),%ymm0
+	vpaddd		%ymm0,%ymm10,%ymm10
+	vpbroadcastd	0x2c(%rdi),%ymm0
+	vpaddd		%ymm0,%ymm11,%ymm11
+	vpbroadcastd	0x30(%rdi),%ymm0
+	vpaddd		%ymm0,%ymm12,%ymm12
+	vpbroadcastd	0x34(%rdi),%ymm0
+	vpaddd		%ymm0,%ymm13,%ymm13
+	vpbroadcastd	0x38(%rdi),%ymm0
+	vpaddd		%ymm0,%ymm14,%ymm14
+	vpbroadcastd	0x3c(%rdi),%ymm0
+	vpaddd		%ymm0,%ymm15,%ymm15
+
+	# x12 += counter values 0-3
+	vpaddd		%ymm1,%ymm12,%ymm12
+
+	# interleave 32-bit words in state n, n+1
+	vmovdqa		0x00(%rsp),%ymm0
+	vmovdqa		0x20(%rsp),%ymm1
+	vpunpckldq	%ymm1,%ymm0,%ymm2
+	vpunpckhdq	%ymm1,%ymm0,%ymm1
+	vmovdqa		%ymm2,0x00(%rsp)
+	vmovdqa		%ymm1,0x20(%rsp)
+	vmovdqa		0x40(%rsp),%ymm0
+	vmovdqa		0x60(%rsp),%ymm1
+	vpunpckldq	%ymm1,%ymm0,%ymm2
+	vpunpckhdq	%ymm1,%ymm0,%ymm1
+	vmovdqa		%ymm2,0x40(%rsp)
+	vmovdqa		%ymm1,0x60(%rsp)
+	vmovdqa		%ymm4,%ymm0
+	vpunpckldq	%ymm5,%ymm0,%ymm4
+	vpunpckhdq	%ymm5,%ymm0,%ymm5
+	vmovdqa		%ymm6,%ymm0
+	vpunpckldq	%ymm7,%ymm0,%ymm6
+	vpunpckhdq	%ymm7,%ymm0,%ymm7
+	vmovdqa		%ymm8,%ymm0
+	vpunpckldq	%ymm9,%ymm0,%ymm8
+	vpunpckhdq	%ymm9,%ymm0,%ymm9
+	vmovdqa		%ymm10,%ymm0
+	vpunpckldq	%ymm11,%ymm0,%ymm10
+	vpunpckhdq	%ymm11,%ymm0,%ymm11
+	vmovdqa		%ymm12,%ymm0
+	vpunpckldq	%ymm13,%ymm0,%ymm12
+	vpunpckhdq	%ymm13,%ymm0,%ymm13
+	vmovdqa		%ymm14,%ymm0
+	vpunpckldq	%ymm15,%ymm0,%ymm14
+	vpunpckhdq	%ymm15,%ymm0,%ymm15
+
+	# interleave 64-bit words in state n, n+2
+	vmovdqa		0x00(%rsp),%ymm0
+	vmovdqa		0x40(%rsp),%ymm2
+	vpunpcklqdq	%ymm2,%ymm0,%ymm1
+	vpunpckhqdq	%ymm2,%ymm0,%ymm2
+	vmovdqa		%ymm1,0x00(%rsp)
+	vmovdqa		%ymm2,0x40(%rsp)
+	vmovdqa		0x20(%rsp),%ymm0
+	vmovdqa		0x60(%rsp),%ymm2
+	vpunpcklqdq	%ymm2,%ymm0,%ymm1
+	vpunpckhqdq	%ymm2,%ymm0,%ymm2
+	vmovdqa		%ymm1,0x20(%rsp)
+	vmovdqa		%ymm2,0x60(%rsp)
+	vmovdqa		%ymm4,%ymm0
+	vpunpcklqdq	%ymm6,%ymm0,%ymm4
+	vpunpckhqdq	%ymm6,%ymm0,%ymm6
+	vmovdqa		%ymm5,%ymm0
+	vpunpcklqdq	%ymm7,%ymm0,%ymm5
+	vpunpckhqdq	%ymm7,%ymm0,%ymm7
+	vmovdqa		%ymm8,%ymm0
+	vpunpcklqdq	%ymm10,%ymm0,%ymm8
+	vpunpckhqdq	%ymm10,%ymm0,%ymm10
+	vmovdqa		%ymm9,%ymm0
+	vpunpcklqdq	%ymm11,%ymm0,%ymm9
+	vpunpckhqdq	%ymm11,%ymm0,%ymm11
+	vmovdqa		%ymm12,%ymm0
+	vpunpcklqdq	%ymm14,%ymm0,%ymm12
+	vpunpckhqdq	%ymm14,%ymm0,%ymm14
+	vmovdqa		%ymm13,%ymm0
+	vpunpcklqdq	%ymm15,%ymm0,%ymm13
+	vpunpckhqdq	%ymm15,%ymm0,%ymm15
+
+	# interleave 128-bit words in state n, n+4
+	vmovdqa		0x00(%rsp),%ymm0
+	vperm2i128	$0x20,%ymm4,%ymm0,%ymm1
+	vperm2i128	$0x31,%ymm4,%ymm0,%ymm4
+	vmovdqa		%ymm1,0x00(%rsp)
+	vmovdqa		0x20(%rsp),%ymm0
+	vperm2i128	$0x20,%ymm5,%ymm0,%ymm1
+	vperm2i128	$0x31,%ymm5,%ymm0,%ymm5
+	vmovdqa		%ymm1,0x20(%rsp)
+	vmovdqa		0x40(%rsp),%ymm0
+	vperm2i128	$0x20,%ymm6,%ymm0,%ymm1
+	vperm2i128	$0x31,%ymm6,%ymm0,%ymm6
+	vmovdqa		%ymm1,0x40(%rsp)
+	vmovdqa		0x60(%rsp),%ymm0
+	vperm2i128	$0x20,%ymm7,%ymm0,%ymm1
+	vperm2i128	$0x31,%ymm7,%ymm0,%ymm7
+	vmovdqa		%ymm1,0x60(%rsp)
+	vperm2i128	$0x20,%ymm12,%ymm8,%ymm0
+	vperm2i128	$0x31,%ymm12,%ymm8,%ymm12
+	vmovdqa		%ymm0,%ymm8
+	vperm2i128	$0x20,%ymm13,%ymm9,%ymm0
+	vperm2i128	$0x31,%ymm13,%ymm9,%ymm13
+	vmovdqa		%ymm0,%ymm9
+	vperm2i128	$0x20,%ymm14,%ymm10,%ymm0
+	vperm2i128	$0x31,%ymm14,%ymm10,%ymm14
+	vmovdqa		%ymm0,%ymm10
+	vperm2i128	$0x20,%ymm15,%ymm11,%ymm0
+	vperm2i128	$0x31,%ymm15,%ymm11,%ymm15
+	vmovdqa		%ymm0,%ymm11
+
+	# xor with corresponding input, write to output
+	vmovdqa		0x00(%rsp),%ymm0
+	vpxor		0x0000(%rdx),%ymm0,%ymm0
+	vmovdqu		%ymm0,0x0000(%rsi)
+	vmovdqa		0x20(%rsp),%ymm0
+	vpxor		0x0080(%rdx),%ymm0,%ymm0
+	vmovdqu		%ymm0,0x0080(%rsi)
+	vmovdqa		0x40(%rsp),%ymm0
+	vpxor		0x0040(%rdx),%ymm0,%ymm0
+	vmovdqu		%ymm0,0x0040(%rsi)
+	vmovdqa		0x60(%rsp),%ymm0
+	vpxor		0x00c0(%rdx),%ymm0,%ymm0
+	vmovdqu		%ymm0,0x00c0(%rsi)
+	vpxor		0x0100(%rdx),%ymm4,%ymm4
+	vmovdqu		%ymm4,0x0100(%rsi)
+	vpxor		0x0180(%rdx),%ymm5,%ymm5
+	vmovdqu		%ymm5,0x00180(%rsi)
+	vpxor		0x0140(%rdx),%ymm6,%ymm6
+	vmovdqu		%ymm6,0x0140(%rsi)
+	vpxor		0x01c0(%rdx),%ymm7,%ymm7
+	vmovdqu		%ymm7,0x01c0(%rsi)
+	vpxor		0x0020(%rdx),%ymm8,%ymm8
+	vmovdqu		%ymm8,0x0020(%rsi)
+	vpxor		0x00a0(%rdx),%ymm9,%ymm9
+	vmovdqu		%ymm9,0x00a0(%rsi)
+	vpxor		0x0060(%rdx),%ymm10,%ymm10
+	vmovdqu		%ymm10,0x0060(%rsi)
+	vpxor		0x00e0(%rdx),%ymm11,%ymm11
+	vmovdqu		%ymm11,0x00e0(%rsi)
+	vpxor		0x0120(%rdx),%ymm12,%ymm12
+	vmovdqu		%ymm12,0x0120(%rsi)
+	vpxor		0x01a0(%rdx),%ymm13,%ymm13
+	vmovdqu		%ymm13,0x01a0(%rsi)
+	vpxor		0x0160(%rdx),%ymm14,%ymm14
+	vmovdqu		%ymm14,0x0160(%rsi)
+	vpxor		0x01e0(%rdx),%ymm15,%ymm15
+	vmovdqu		%ymm15,0x01e0(%rsi)
+
+	vzeroupper
+	mov		%r8,%rsp
+	ret
+ENDPROC(chacha20_8block_xor_avx2)
--- a/arch/x86/crypto/chacha20-ssse3-x86_64.S
+++ b/arch/x86/crypto/chacha20-ssse3-x86_64.S
--- a/arch/x86/crypto/chacha20_glue.c
+++ b/arch/x86/crypto/chacha20_glue.c
@@ -0,0 +1,150 @@
+/*
+ * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <crypto/algapi.h>
+#include <crypto/chacha20.h>
+#include <linux/crypto.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <asm/fpu/api.h>
+#include <asm/simd.h>
+
+#define CHACHA20_STATE_ALIGN 16
+
+asmlinkage void chacha20_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src);
+asmlinkage void chacha20_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src);
+#ifdef CONFIG_AS_AVX2
+asmlinkage void chacha20_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src);
+static bool chacha20_use_avx2;
+#endif
+
+static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src,
+			    unsigned int bytes)
+{
+	u8 buf[CHACHA20_BLOCK_SIZE];
+
+#ifdef CONFIG_AS_AVX2
+	if (chacha20_use_avx2) {
+		while (bytes >= CHACHA20_BLOCK_SIZE * 8) {
+			chacha20_8block_xor_avx2(state, dst, src);
+			bytes -= CHACHA20_BLOCK_SIZE * 8;
+			src += CHACHA20_BLOCK_SIZE * 8;
+			dst += CHACHA20_BLOCK_SIZE * 8;
+			state[12] += 8;
+		}
+	}
+#endif
+	while (bytes >= CHACHA20_BLOCK_SIZE * 4) {
+		chacha20_4block_xor_ssse3(state, dst, src);
+		bytes -= CHACHA20_BLOCK_SIZE * 4;
+		src += CHACHA20_BLOCK_SIZE * 4;
+		dst += CHACHA20_BLOCK_SIZE * 4;
+		state[12] += 4;
+	}
+	while (bytes >= CHACHA20_BLOCK_SIZE) {
+		chacha20_block_xor_ssse3(state, dst, src);
+		bytes -= CHACHA20_BLOCK_SIZE;
+		src += CHACHA20_BLOCK_SIZE;
+		dst += CHACHA20_BLOCK_SIZE;
+		state[12]++;
+	}
+	if (bytes) {
+		memcpy(buf, src, bytes);
+		chacha20_block_xor_ssse3(state, buf, buf);
+		memcpy(dst, buf, bytes);
+	}
+}
+
+static int chacha20_simd(struct blkcipher_desc *desc, struct scatterlist *dst,
+			 struct scatterlist *src, unsigned int nbytes)
+{
+	u32 *state, state_buf[16 + (CHACHA20_STATE_ALIGN / sizeof(u32)) - 1];
+	struct blkcipher_walk walk;
+	int err;
+
+	if (!may_use_simd())
+		return crypto_chacha20_crypt(desc, dst, src, nbytes);
+
+	state = (u32 *)roundup((uintptr_t)state_buf, CHACHA20_STATE_ALIGN);
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt_block(desc, &walk, CHACHA20_BLOCK_SIZE);
+
+	crypto_chacha20_init(state, crypto_blkcipher_ctx(desc->tfm), walk.iv);
+
+	kernel_fpu_begin();
+
+	while (walk.nbytes >= CHACHA20_BLOCK_SIZE) {
+		chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
+				rounddown(walk.nbytes, CHACHA20_BLOCK_SIZE));
+		err = blkcipher_walk_done(desc, &walk,
+					  walk.nbytes % CHACHA20_BLOCK_SIZE);
+	}
+
+	if (walk.nbytes) {
+		chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
+				walk.nbytes);
+		err = blkcipher_walk_done(desc, &walk, 0);
+	}
+
+	kernel_fpu_end();
+
+	return err;
+}
+
+static struct crypto_alg alg = {
+	.cra_name		= "chacha20",
+	.cra_driver_name	= "chacha20-simd",
+	.cra_priority		= 300,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= 1,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_ctxsize		= sizeof(struct chacha20_ctx),
+	.cra_alignmask		= sizeof(u32) - 1,
+	.cra_module		= THIS_MODULE,
+	.cra_u			= {
+		.blkcipher = {
+			.min_keysize	= CHACHA20_KEY_SIZE,
+			.max_keysize	= CHACHA20_KEY_SIZE,
+			.ivsize		= CHACHA20_IV_SIZE,
+			.geniv		= "seqiv",
+			.setkey		= crypto_chacha20_setkey,
+			.encrypt	= chacha20_simd,
+			.decrypt	= chacha20_simd,
+		},
+	},
+};
+
+static int __init chacha20_simd_mod_init(void)
+{
+	if (!cpu_has_ssse3)
+		return -ENODEV;
+
+#ifdef CONFIG_AS_AVX2
+	chacha20_use_avx2 = cpu_has_avx && cpu_has_avx2 &&
+			    cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, NULL);
+#endif
+	return crypto_register_alg(&alg);
+}
+
+static void __exit chacha20_simd_mod_fini(void)
+{
+	crypto_unregister_alg(&alg);
+}
+
+module_init(chacha20_simd_mod_init);
+module_exit(chacha20_simd_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
+MODULE_DESCRIPTION("chacha20 cipher algorithm, SIMD accelerated");
+MODULE_ALIAS_CRYPTO("chacha20");
+MODULE_ALIAS_CRYPTO("chacha20-simd");
--- a/arch/x86/crypto/poly1305-avx2-x86_64.S
+++ b/arch/x86/crypto/poly1305-avx2-x86_64.S
@@ -0,0 +1,386 @@
+/*
+ * Poly1305 authenticator algorithm, RFC7539, x64 AVX2 functions
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/linkage.h>
+
+.data
+.align 32
+
+ANMASK:	.octa 0x0000000003ffffff0000000003ffffff
+	.octa 0x0000000003ffffff0000000003ffffff
+ORMASK:	.octa 0x00000000010000000000000001000000
+	.octa 0x00000000010000000000000001000000
+
+.text
+
+#define h0 0x00(%rdi)
+#define h1 0x04(%rdi)
+#define h2 0x08(%rdi)
+#define h3 0x0c(%rdi)
+#define h4 0x10(%rdi)
+#define r0 0x00(%rdx)
+#define r1 0x04(%rdx)
+#define r2 0x08(%rdx)
+#define r3 0x0c(%rdx)
+#define r4 0x10(%rdx)
+#define u0 0x00(%r8)
+#define u1 0x04(%r8)
+#define u2 0x08(%r8)
+#define u3 0x0c(%r8)
+#define u4 0x10(%r8)
+#define w0 0x14(%r8)
+#define w1 0x18(%r8)
+#define w2 0x1c(%r8)
+#define w3 0x20(%r8)
+#define w4 0x24(%r8)
+#define y0 0x28(%r8)
+#define y1 0x2c(%r8)
+#define y2 0x30(%r8)
+#define y3 0x34(%r8)
+#define y4 0x38(%r8)
+#define m %rsi
+#define hc0 %ymm0
+#define hc1 %ymm1
+#define hc2 %ymm2
+#define hc3 %ymm3
+#define hc4 %ymm4
+#define hc0x %xmm0
+#define hc1x %xmm1
+#define hc2x %xmm2
+#define hc3x %xmm3
+#define hc4x %xmm4
+#define t1 %ymm5
+#define t2 %ymm6
+#define t1x %xmm5
+#define t2x %xmm6
+#define ruwy0 %ymm7
+#define ruwy1 %ymm8
+#define ruwy2 %ymm9
+#define ruwy3 %ymm10
+#define ruwy4 %ymm11
+#define ruwy0x %xmm7
+#define ruwy1x %xmm8
+#define ruwy2x %xmm9
+#define ruwy3x %xmm10
+#define ruwy4x %xmm11
+#define svxz1 %ymm12
+#define svxz2 %ymm13
+#define svxz3 %ymm14
+#define svxz4 %ymm15
+#define d0 %r9
+#define d1 %r10
+#define d2 %r11
+#define d3 %r12
+#define d4 %r13
+
+ENTRY(poly1305_4block_avx2)
+	# %rdi: Accumulator h[5]
+	# %rsi: 64 byte input block m
+	# %rdx: Poly1305 key r[5]
+	# %rcx: Quadblock count
+	# %r8:  Poly1305 derived key r^2 u[5], r^3 w[5], r^4 y[5],
+
+	# This four-block variant uses loop unrolled block processing. It
+	# requires 4 Poly1305 keys: r, r^2, r^3 and r^4:
+	# h = (h + m) * r  =>  h = (h + m1) * r^4 + m2 * r^3 + m3 * r^2 + m4 * r
+
+	vzeroupper
+	push		%rbx
+	push		%r12
+	push		%r13
+
+	# combine r0,u0,w0,y0
+	vmovd		y0,ruwy0x
+	vmovd		w0,t1x
+	vpunpcklqdq	t1,ruwy0,ruwy0
+	vmovd		u0,t1x
+	vmovd		r0,t2x
+	vpunpcklqdq	t2,t1,t1
+	vperm2i128	$0x20,t1,ruwy0,ruwy0
+
+	# combine r1,u1,w1,y1 and s1=r1*5,v1=u1*5,x1=w1*5,z1=y1*5
+	vmovd		y1,ruwy1x
+	vmovd		w1,t1x
+	vpunpcklqdq	t1,ruwy1,ruwy1
+	vmovd		u1,t1x
+	vmovd		r1,t2x
+	vpunpcklqdq	t2,t1,t1
+	vperm2i128	$0x20,t1,ruwy1,ruwy1
+	vpslld		$2,ruwy1,svxz1
+	vpaddd		ruwy1,svxz1,svxz1
+
+	# combine r2,u2,w2,y2 and s2=r2*5,v2=u2*5,x2=w2*5,z2=y2*5
+	vmovd		y2,ruwy2x
+	vmovd		w2,t1x
+	vpunpcklqdq	t1,ruwy2,ruwy2
+	vmovd		u2,t1x
+	vmovd		r2,t2x
+	vpunpcklqdq	t2,t1,t1
+	vperm2i128	$0x20,t1,ruwy2,ruwy2
+	vpslld		$2,ruwy2,svxz2
+	vpaddd		ruwy2,svxz2,svxz2
+
+	# combine r3,u3,w3,y3 and s3=r3*5,v3=u3*5,x3=w3*5,z3=y3*5
+	vmovd		y3,ruwy3x
+	vmovd		w3,t1x
+	vpunpcklqdq	t1,ruwy3,ruwy3
+	vmovd		u3,t1x
+	vmovd		r3,t2x
+	vpunpcklqdq	t2,t1,t1
+	vperm2i128	$0x20,t1,ruwy3,ruwy3
+	vpslld		$2,ruwy3,svxz3
+	vpaddd		ruwy3,svxz3,svxz3
+
+	# combine r4,u4,w4,y4 and s4=r4*5,v4=u4*5,x4=w4*5,z4=y4*5
+	vmovd		y4,ruwy4x
+	vmovd		w4,t1x
+	vpunpcklqdq	t1,ruwy4,ruwy4
+	vmovd		u4,t1x
+	vmovd		r4,t2x
+	vpunpcklqdq	t2,t1,t1
+	vperm2i128	$0x20,t1,ruwy4,ruwy4
+	vpslld		$2,ruwy4,svxz4
+	vpaddd		ruwy4,svxz4,svxz4
+
+.Ldoblock4:
+	# hc0 = [m[48-51] & 0x3ffffff, m[32-35] & 0x3ffffff,
+	#	 m[16-19] & 0x3ffffff, m[ 0- 3] & 0x3ffffff + h0]
+	vmovd		0x00(m),hc0x
+	vmovd		0x10(m),t1x
+	vpunpcklqdq	t1,hc0,hc0
+	vmovd		0x20(m),t1x
+	vmovd		0x30(m),t2x
+	vpunpcklqdq	t2,t1,t1
+	vperm2i128	$0x20,t1,hc0,hc0
+	vpand		ANMASK(%rip),hc0,hc0
+	vmovd		h0,t1x
+	vpaddd		t1,hc0,hc0
+	# hc1 = [(m[51-54] >> 2) & 0x3ffffff, (m[35-38] >> 2) & 0x3ffffff,
+	#	 (m[19-22] >> 2) & 0x3ffffff, (m[ 3- 6] >> 2) & 0x3ffffff + h1]
+	vmovd		0x03(m),hc1x
+	vmovd		0x13(m),t1x
+	vpunpcklqdq	t1,hc1,hc1
+	vmovd		0x23(m),t1x
+	vmovd		0x33(m),t2x
+	vpunpcklqdq	t2,t1,t1
+	vperm2i128	$0x20,t1,hc1,hc1
+	vpsrld		$2,hc1,hc1
+	vpand		ANMASK(%rip),hc1,hc1
+	vmovd		h1,t1x
+	vpaddd		t1,hc1,hc1
+	# hc2 = [(m[54-57] >> 4) & 0x3ffffff, (m[38-41] >> 4) & 0x3ffffff,
+	#	 (m[22-25] >> 4) & 0x3ffffff, (m[ 6- 9] >> 4) & 0x3ffffff + h2]
+	vmovd		0x06(m),hc2x
+	vmovd		0x16(m),t1x
+	vpunpcklqdq	t1,hc2,hc2
+	vmovd		0x26(m),t1x
+	vmovd		0x36(m),t2x
+	vpunpcklqdq	t2,t1,t1
+	vperm2i128	$0x20,t1,hc2,hc2
+	vpsrld		$4,hc2,hc2
+	vpand		ANMASK(%rip),hc2,hc2
+	vmovd		h2,t1x
+	vpaddd		t1,hc2,hc2
+	# hc3 = [(m[57-60] >> 6) & 0x3ffffff, (m[41-44] >> 6) & 0x3ffffff,
+	#	 (m[25-28] >> 6) & 0x3ffffff, (m[ 9-12] >> 6) & 0x3ffffff + h3]
+	vmovd		0x09(m),hc3x
+	vmovd		0x19(m),t1x
+	vpunpcklqdq	t1,hc3,hc3
+	vmovd		0x29(m),t1x
+	vmovd		0x39(m),t2x
+	vpunpcklqdq	t2,t1,t1
+	vperm2i128	$0x20,t1,hc3,hc3
+	vpsrld		$6,hc3,hc3
+	vpand		ANMASK(%rip),hc3,hc3
+	vmovd		h3,t1x
+	vpaddd		t1,hc3,hc3
+	# hc4 = [(m[60-63] >> 8) | (1<<24), (m[44-47] >> 8) | (1<<24),
+	#	 (m[28-31] >> 8) | (1<<24), (m[12-15] >> 8) | (1<<24) + h4]
+	vmovd		0x0c(m),hc4x
+	vmovd		0x1c(m),t1x
+	vpunpcklqdq	t1,hc4,hc4
+	vmovd		0x2c(m),t1x
+	vmovd		0x3c(m),t2x
+	vpunpcklqdq	t2,t1,t1
+	vperm2i128	$0x20,t1,hc4,hc4
+	vpsrld		$8,hc4,hc4
+	vpor		ORMASK(%rip),hc4,hc4
+	vmovd		h4,t1x
+	vpaddd		t1,hc4,hc4
+
+	# t1 = [ hc0[3] * r0, hc0[2] * u0, hc0[1] * w0, hc0[0] * y0 ]
+	vpmuludq	hc0,ruwy0,t1
+	# t1 += [ hc1[3] * s4, hc1[2] * v4, hc1[1] * x4, hc1[0] * z4 ]
+	vpmuludq	hc1,svxz4,t2
+	vpaddq		t2,t1,t1
+	# t1 += [ hc2[3] * s3, hc2[2] * v3, hc2[1] * x3, hc2[0] * z3 ]
+	vpmuludq	hc2,svxz3,t2
+	vpaddq		t2,t1,t1
+	# t1 += [ hc3[3] * s2, hc3[2] * v2, hc3[1] * x2, hc3[0] * z2 ]
+	vpmuludq	hc3,svxz2,t2
+	vpaddq		t2,t1,t1
+	# t1 += [ hc4[3] * s1, hc4[2] * v1, hc4[1] * x1, hc4[0] * z1 ]
+	vpmuludq	hc4,svxz1,t2
+	vpaddq		t2,t1,t1
+	# d0 = t1[0] + t1[1] + t[2] + t[3]
+	vpermq		$0xee,t1,t2
+	vpaddq		t2,t1,t1
+	vpsrldq		$8,t1,t2
+	vpaddq		t2,t1,t1
+	vmovq		t1x,d0
+
+	# t1 = [ hc0[3] * r1, hc0[2] * u1,hc0[1] * w1, hc0[0] * y1 ]
+	vpmuludq	hc0,ruwy1,t1
+	# t1 += [ hc1[3] * r0, hc1[2] * u0, hc1[1] * w0, hc1[0] * y0 ]
+	vpmuludq	hc1,ruwy0,t2
+	vpaddq		t2,t1,t1
+	# t1 += [ hc2[3] * s4, hc2[2] * v4, hc2[1] * x4, hc2[0] * z4 ]
+	vpmuludq	hc2,svxz4,t2
+	vpaddq		t2,t1,t1
+	# t1 += [ hc3[3] * s3, hc3[2] * v3, hc3[1] * x3, hc3[0] * z3 ]
+	vpmuludq	hc3,svxz3,t2
+	vpaddq		t2,t1,t1
+	# t1 += [ hc4[3] * s2, hc4[2] * v2, hc4[1] * x2, hc4[0] * z2 ]
+	vpmuludq	hc4,svxz2,t2
+	vpaddq		t2,t1,t1
+	# d1 = t1[0] + t1[1] + t1[3] + t1[4]
+	vpermq		$0xee,t1,t2
+	vpaddq		t2,t1,t1
+	vpsrldq		$8,t1,t2
+	vpaddq		t2,t1,t1
+	vmovq		t1x,d1
+
+	# t1 = [ hc0[3] * r2, hc0[2] * u2, hc0[1] * w2, hc0[0] * y2 ]
+	vpmuludq	hc0,ruwy2,t1
+	# t1 += [ hc1[3] * r1, hc1[2] * u1, hc1[1] * w1, hc1[0] * y1 ]
+	vpmuludq	hc1,ruwy1,t2
+	vpaddq		t2,t1,t1
+	# t1 += [ hc2[3] * r0, hc2[2] * u0, hc2[1] * w0, hc2[0] * y0 ]
+	vpmuludq	hc2,ruwy0,t2
+	vpaddq		t2,t1,t1
+	# t1 += [ hc3[3] * s4, hc3[2] * v4, hc3[1] * x4, hc3[0] * z4 ]
+	vpmuludq	hc3,svxz4,t2
+	vpaddq		t2,t1,t1
+	# t1 += [ hc4[3] * s3, hc4[2] * v3, hc4[1] * x3, hc4[0] * z3 ]
+	vpmuludq	hc4,svxz3,t2
+	vpaddq		t2,t1,t1
+	# d2 = t1[0] + t1[1] + t1[2] + t1[3]
+	vpermq		$0xee,t1,t2
+	vpaddq		t2,t1,t1
+	vpsrldq		$8,t1,t2
+	vpaddq		t2,t1,t1
+	vmovq		t1x,d2
+
+	# t1 = [ hc0[3] * r3, hc0[2] * u3, hc0[1] * w3, hc0[0] * y3 ]
+	vpmuludq	hc0,ruwy3,t1
+	# t1 += [ hc1[3] * r2, hc1[2] * u2, hc1[1] * w2, hc1[0] * y2 ]
+	vpmuludq	hc1,ruwy2,t2
+	vpaddq		t2,t1,t1
+	# t1 += [ hc2[3] * r1, hc2[2] * u1, hc2[1] * w1, hc2[0] * y1 ]
+	vpmuludq	hc2,ruwy1,t2
+	vpaddq		t2,t1,t1
+	# t1 += [ hc3[3] * r0, hc3[2] * u0, hc3[1] * w0, hc3[0] * y0 ]
+	vpmuludq	hc3,ruwy0,t2
+	vpaddq		t2,t1,t1
+	# t1 += [ hc4[3] * s4, hc4[2] * v4, hc4[1] * x4, hc4[0] * z4 ]
+	vpmuludq	hc4,svxz4,t2
+	vpaddq		t2,t1,t1
+	# d3 = t1[0] + t1[1] + t1[2] + t1[3]
+	vpermq		$0xee,t1,t2
+	vpaddq		t2,t1,t1
+	vpsrldq		$8,t1,t2
+	vpaddq		t2,t1,t1
+	vmovq		t1x,d3
+
+	# t1 = [ hc0[3] * r4, hc0[2] * u4, hc0[1] * w4, hc0[0] * y4 ]
+	vpmuludq	hc0,ruwy4,t1
+	# t1 += [ hc1[3] * r3, hc1[2] * u3, hc1[1] * w3, hc1[0] * y3 ]
+	vpmuludq	hc1,ruwy3,t2
+	vpaddq		t2,t1,t1
+	# t1 += [ hc2[3] * r2, hc2[2] * u2, hc2[1] * w2, hc2[0] * y2 ]
+	vpmuludq	hc2,ruwy2,t2
+	vpaddq		t2,t1,t1
+	# t1 += [ hc3[3] * r1, hc3[2] * u1, hc3[1] * w1, hc3[0] * y1 ]
+	vpmuludq	hc3,ruwy1,t2
+	vpaddq		t2,t1,t1
+	# t1 += [ hc4[3] * r0, hc4[2] * u0, hc4[1] * w0, hc4[0] * y0 ]
+	vpmuludq	hc4,ruwy0,t2
+	vpaddq		t2,t1,t1
+	# d4 = t1[0] + t1[1] + t1[2] + t1[3]
+	vpermq		$0xee,t1,t2
+	vpaddq		t2,t1,t1
+	vpsrldq		$8,t1,t2
+	vpaddq		t2,t1,t1
+	vmovq		t1x,d4
+
+	# d1 += d0 >> 26
+	mov		d0,%rax
+	shr		$26,%rax
+	add		%rax,d1
+	# h0 = d0 & 0x3ffffff
+	mov		d0,%rbx
+	and		$0x3ffffff,%ebx
+
+	# d2 += d1 >> 26
+	mov		d1,%rax
+	shr		$26,%rax
+	add		%rax,d2
+	# h1 = d1 & 0x3ffffff
+	mov		d1,%rax
+	and		$0x3ffffff,%eax
+	mov		%eax,h1
+
+	# d3 += d2 >> 26
+	mov		d2,%rax
+	shr		$26,%rax
+	add		%rax,d3
+	# h2 = d2 & 0x3ffffff
+	mov		d2,%rax
+	and		$0x3ffffff,%eax
+	mov		%eax,h2
+
+	# d4 += d3 >> 26
+	mov		d3,%rax
+	shr		$26,%rax
+	add		%rax,d4
+	# h3 = d3 & 0x3ffffff
+	mov		d3,%rax
+	and		$0x3ffffff,%eax
+	mov		%eax,h3
+
+	# h0 += (d4 >> 26) * 5
+	mov		d4,%rax
+	shr		$26,%rax
+	lea		(%eax,%eax,4),%eax
+	add		%eax,%ebx
+	# h4 = d4 & 0x3ffffff
+	mov		d4,%rax
+	and		$0x3ffffff,%eax
+	mov		%eax,h4
+
+	# h1 += h0 >> 26
+	mov		%ebx,%eax
+	shr		$26,%eax
+	add		%eax,h1
+	# h0 = h0 & 0x3ffffff
+	andl		$0x3ffffff,%ebx
+	mov		%ebx,h0
+
+	add		$0x40,m
+	dec		%rcx
+	jnz		.Ldoblock4
+
+	vzeroupper
+	pop		%r13
+	pop		%r12
+	pop		%rbx
+	ret
+ENDPROC(poly1305_4block_avx2)
--- a/Show More
+++ b/Show More