mirror of
https://github.com/ukui/kernel.git
synced 2026-03-09 10:07:04 -07:00
Merge git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6
Pull crypto updates from Herbert Xu: "Here is the crypto update for 4.3: API: - the AEAD interface transition is now complete. - add top-level skcipher interface. Drivers: - x86-64 acceleration for chacha20/poly1305. - add sunxi-ss Allwinner Security System crypto accelerator. - add RSA algorithm to qat driver. - add SRIOV support to qat driver. - add LS1021A support to caam. - add i.MX6 support to caam" * git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6: (163 commits) crypto: algif_aead - fix for multiple operations on AF_ALG sockets crypto: qat - enable legacy VFs MPI: Fix mpi_read_buffer crypto: qat - silence a static checker warning crypto: vmx - Fixing opcode issue crypto: caam - Use the preferred style for memory allocations crypto: caam - Propagate the real error code in caam_probe crypto: caam - Fix the error handling in caam_probe crypto: caam - fix writing to JQCR_MS when using service interface crypto: hash - Add AHASH_REQUEST_ON_STACK crypto: testmgr - Use new skcipher interface crypto: skcipher - Add top-level skcipher interface crypto: cmac - allow usage in FIPS mode crypto: sahara - Use dmam_alloc_coherent crypto: caam - Add support for LS1021A crypto: qat - Don't move data inside output buffer crypto: vmx - Fixing GHASH Key issue on little endian crypto: vmx - Fixing AES-CTR counter bug crypto: null - Add missing Kconfig tristate for NULL2 crypto: nx - Add forward declaration for struct crypto_aead ...
This commit is contained in:
@@ -585,7 +585,7 @@ kernel crypto API | IPSEC Layer
|
||||
+-----------+ |
|
||||
| | (1)
|
||||
| aead | <----------------------------------- esp_output
|
||||
| (seqniv) | ---+
|
||||
| (seqiv) | ---+
|
||||
+-----------+ |
|
||||
| (2)
|
||||
+-----------+ |
|
||||
@@ -1101,7 +1101,7 @@ kernel crypto API | Caller
|
||||
</para>
|
||||
|
||||
<para>
|
||||
[1] http://www.chronox.de/libkcapi.html
|
||||
[1] <ulink url="http://www.chronox.de/libkcapi.html">http://www.chronox.de/libkcapi.html</ulink>
|
||||
</para>
|
||||
|
||||
</sect1>
|
||||
@@ -1661,7 +1661,7 @@ read(opfd, out, outlen);
|
||||
</para>
|
||||
|
||||
<para>
|
||||
[1] http://www.chronox.de/libkcapi.html
|
||||
[1] <ulink url="http://www.chronox.de/libkcapi.html">http://www.chronox.de/libkcapi.html</ulink>
|
||||
</para>
|
||||
|
||||
</sect1>
|
||||
@@ -1687,7 +1687,7 @@ read(opfd, out, outlen);
|
||||
!Pinclude/linux/crypto.h Block Cipher Algorithm Definitions
|
||||
!Finclude/linux/crypto.h crypto_alg
|
||||
!Finclude/linux/crypto.h ablkcipher_alg
|
||||
!Finclude/linux/crypto.h aead_alg
|
||||
!Finclude/crypto/aead.h aead_alg
|
||||
!Finclude/linux/crypto.h blkcipher_alg
|
||||
!Finclude/linux/crypto.h cipher_alg
|
||||
!Finclude/crypto/rng.h rng_alg
|
||||
|
||||
@@ -106,6 +106,18 @@ PROPERTIES
|
||||
to the interrupt parent to which the child domain
|
||||
is being mapped.
|
||||
|
||||
- clocks
|
||||
Usage: required if SEC 4.0 requires explicit enablement of clocks
|
||||
Value type: <prop_encoded-array>
|
||||
Definition: A list of phandle and clock specifier pairs describing
|
||||
the clocks required for enabling and disabling SEC 4.0.
|
||||
|
||||
- clock-names
|
||||
Usage: required if SEC 4.0 requires explicit enablement of clocks
|
||||
Value type: <string>
|
||||
Definition: A list of clock name strings in the same order as the
|
||||
clocks property.
|
||||
|
||||
Note: All other standard properties (see the ePAPR) are allowed
|
||||
but are optional.
|
||||
|
||||
@@ -120,6 +132,11 @@ EXAMPLE
|
||||
ranges = <0 0x300000 0x10000>;
|
||||
interrupt-parent = <&mpic>;
|
||||
interrupts = <92 2>;
|
||||
clocks = <&clks IMX6QDL_CLK_CAAM_MEM>,
|
||||
<&clks IMX6QDL_CLK_CAAM_ACLK>,
|
||||
<&clks IMX6QDL_CLK_CAAM_IPG>,
|
||||
<&clks IMX6QDL_CLK_EIM_SLOW>;
|
||||
clock-names = "mem", "aclk", "ipg", "emi_slow";
|
||||
};
|
||||
|
||||
=====================================================================
|
||||
|
||||
23
Documentation/devicetree/bindings/crypto/sun4i-ss.txt
Normal file
23
Documentation/devicetree/bindings/crypto/sun4i-ss.txt
Normal file
@@ -0,0 +1,23 @@
|
||||
* Allwinner Security System found on A20 SoC
|
||||
|
||||
Required properties:
|
||||
- compatible : Should be "allwinner,sun4i-a10-crypto".
|
||||
- reg: Should contain the Security System register location and length.
|
||||
- interrupts: Should contain the IRQ line for the Security System.
|
||||
- clocks : List of clock specifiers, corresponding to ahb and ss.
|
||||
- clock-names : Name of the functional clock, should be
|
||||
* "ahb" : AHB gating clock
|
||||
* "mod" : SS controller clock
|
||||
|
||||
Optional properties:
|
||||
- resets : phandle + reset specifier pair
|
||||
- reset-names : must contain "ahb"
|
||||
|
||||
Example:
|
||||
crypto: crypto-engine@01c15000 {
|
||||
compatible = "allwinner,sun4i-a10-crypto";
|
||||
reg = <0x01c15000 0x1000>;
|
||||
interrupts = <GIC_SPI 86 IRQ_TYPE_LEVEL_HIGH>;
|
||||
clocks = <&ahb_gates 5>, <&ss_clk>;
|
||||
clock-names = "ahb", "mod";
|
||||
};
|
||||
24
MAINTAINERS
24
MAINTAINERS
@@ -556,6 +556,12 @@ S: Maintained
|
||||
F: Documentation/i2c/busses/i2c-ali1563
|
||||
F: drivers/i2c/busses/i2c-ali1563.c
|
||||
|
||||
ALLWINNER SECURITY SYSTEM
|
||||
M: Corentin Labbe <clabbe.montjoie@gmail.com>
|
||||
L: linux-crypto@vger.kernel.org
|
||||
S: Maintained
|
||||
F: drivers/crypto/sunxi-ss/
|
||||
|
||||
ALPHA PORT
|
||||
M: Richard Henderson <rth@twiddle.net>
|
||||
M: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
|
||||
@@ -5078,9 +5084,21 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/aegl/linux.git
|
||||
S: Maintained
|
||||
F: arch/ia64/
|
||||
|
||||
IBM Power VMX Cryptographic instructions
|
||||
M: Leonidas S. Barbosa <leosilva@linux.vnet.ibm.com>
|
||||
M: Paulo Flabiano Smorigo <pfsmorigo@linux.vnet.ibm.com>
|
||||
L: linux-crypto@vger.kernel.org
|
||||
S: Supported
|
||||
F: drivers/crypto/vmx/Makefile
|
||||
F: drivers/crypto/vmx/Kconfig
|
||||
F: drivers/crypto/vmx/vmx.c
|
||||
F: drivers/crypto/vmx/aes*
|
||||
F: drivers/crypto/vmx/ghash*
|
||||
F: drivers/crypto/vmx/ppc-xlate.pl
|
||||
|
||||
IBM Power in-Nest Crypto Acceleration
|
||||
M: Marcelo Henrique Cerri <mhcerri@linux.vnet.ibm.com>
|
||||
M: Fionnuala Gunter <fin@linux.vnet.ibm.com>
|
||||
M: Leonidas S. Barbosa <leosilva@linux.vnet.ibm.com>
|
||||
M: Paulo Flabiano Smorigo <pfsmorigo@linux.vnet.ibm.com>
|
||||
L: linux-crypto@vger.kernel.org
|
||||
S: Supported
|
||||
F: drivers/crypto/nx/Makefile
|
||||
@@ -5092,7 +5110,7 @@ F: drivers/crypto/nx/nx_csbcpb.h
|
||||
F: drivers/crypto/nx/nx_debugfs.h
|
||||
|
||||
IBM Power 842 compression accelerator
|
||||
M: Dan Streetman <ddstreet@us.ibm.com>
|
||||
M: Dan Streetman <ddstreet@ieee.org>
|
||||
S: Supported
|
||||
F: drivers/crypto/nx/Makefile
|
||||
F: drivers/crypto/nx/Kconfig
|
||||
|
||||
@@ -836,10 +836,31 @@
|
||||
reg = <0x02100000 0x100000>;
|
||||
ranges;
|
||||
|
||||
caam@02100000 {
|
||||
reg = <0x02100000 0x40000>;
|
||||
interrupts = <0 105 IRQ_TYPE_LEVEL_HIGH>,
|
||||
<0 106 IRQ_TYPE_LEVEL_HIGH>;
|
||||
crypto: caam@2100000 {
|
||||
compatible = "fsl,sec-v4.0";
|
||||
fsl,sec-era = <4>;
|
||||
#address-cells = <1>;
|
||||
#size-cells = <1>;
|
||||
reg = <0x2100000 0x10000>;
|
||||
ranges = <0 0x2100000 0x10000>;
|
||||
interrupt-parent = <&intc>;
|
||||
clocks = <&clks IMX6QDL_CLK_CAAM_MEM>,
|
||||
<&clks IMX6QDL_CLK_CAAM_ACLK>,
|
||||
<&clks IMX6QDL_CLK_CAAM_IPG>,
|
||||
<&clks IMX6QDL_CLK_EIM_SLOW>;
|
||||
clock-names = "mem", "aclk", "ipg", "emi_slow";
|
||||
|
||||
sec_jr0: jr0@1000 {
|
||||
compatible = "fsl,sec-v4.0-job-ring";
|
||||
reg = <0x1000 0x1000>;
|
||||
interrupts = <GIC_SPI 105 IRQ_TYPE_LEVEL_HIGH>;
|
||||
};
|
||||
|
||||
sec_jr1: jr1@2000 {
|
||||
compatible = "fsl,sec-v4.0-job-ring";
|
||||
reg = <0x2000 0x1000>;
|
||||
interrupts = <GIC_SPI 106 IRQ_TYPE_LEVEL_HIGH>;
|
||||
};
|
||||
};
|
||||
|
||||
aipstz@0217c000 { /* AIPSTZ2 */
|
||||
|
||||
@@ -738,6 +738,33 @@
|
||||
reg = <0x02100000 0x100000>;
|
||||
ranges;
|
||||
|
||||
crypto: caam@2100000 {
|
||||
compatible = "fsl,sec-v4.0";
|
||||
fsl,sec-era = <4>;
|
||||
#address-cells = <1>;
|
||||
#size-cells = <1>;
|
||||
reg = <0x2100000 0x10000>;
|
||||
ranges = <0 0x2100000 0x10000>;
|
||||
interrupt-parent = <&intc>;
|
||||
clocks = <&clks IMX6SX_CLK_CAAM_MEM>,
|
||||
<&clks IMX6SX_CLK_CAAM_ACLK>,
|
||||
<&clks IMX6SX_CLK_CAAM_IPG>,
|
||||
<&clks IMX6SX_CLK_EIM_SLOW>;
|
||||
clock-names = "mem", "aclk", "ipg", "emi_slow";
|
||||
|
||||
sec_jr0: jr0@1000 {
|
||||
compatible = "fsl,sec-v4.0-job-ring";
|
||||
reg = <0x1000 0x1000>;
|
||||
interrupts = <GIC_SPI 105 IRQ_TYPE_LEVEL_HIGH>;
|
||||
};
|
||||
|
||||
sec_jr1: jr1@2000 {
|
||||
compatible = "fsl,sec-v4.0-job-ring";
|
||||
reg = <0x2000 0x1000>;
|
||||
interrupts = <GIC_SPI 106 IRQ_TYPE_LEVEL_HIGH>;
|
||||
};
|
||||
};
|
||||
|
||||
usbotg1: usb@02184000 {
|
||||
compatible = "fsl,imx6sx-usb", "fsl,imx27-usb";
|
||||
reg = <0x02184000 0x200>;
|
||||
|
||||
@@ -678,6 +678,14 @@
|
||||
status = "disabled";
|
||||
};
|
||||
|
||||
crypto: crypto-engine@01c15000 {
|
||||
compatible = "allwinner,sun4i-a10-crypto";
|
||||
reg = <0x01c15000 0x1000>;
|
||||
interrupts = <86>;
|
||||
clocks = <&ahb_gates 5>, <&ss_clk>;
|
||||
clock-names = "ahb", "mod";
|
||||
};
|
||||
|
||||
spi2: spi@01c17000 {
|
||||
compatible = "allwinner,sun4i-a10-spi";
|
||||
reg = <0x01c17000 0x1000>;
|
||||
|
||||
@@ -367,6 +367,14 @@
|
||||
"mmc3_sample";
|
||||
};
|
||||
|
||||
ss_clk: clk@01c2009c {
|
||||
#clock-cells = <0>;
|
||||
compatible = "allwinner,sun4i-a10-mod0-clk";
|
||||
reg = <0x01c2009c 0x4>;
|
||||
clocks = <&osc24M>, <&pll6 0>;
|
||||
clock-output-names = "ss";
|
||||
};
|
||||
|
||||
spi0_clk: clk@01c200a0 {
|
||||
#clock-cells = <0>;
|
||||
compatible = "allwinner,sun4i-a10-mod0-clk";
|
||||
@@ -894,6 +902,16 @@
|
||||
#size-cells = <0>;
|
||||
};
|
||||
|
||||
crypto: crypto-engine@01c15000 {
|
||||
compatible = "allwinner,sun4i-a10-crypto";
|
||||
reg = <0x01c15000 0x1000>;
|
||||
interrupts = <GIC_SPI 80 IRQ_TYPE_LEVEL_HIGH>;
|
||||
clocks = <&ahb1_gates 5>, <&ss_clk>;
|
||||
clock-names = "ahb", "mod";
|
||||
resets = <&ahb1_rst 5>;
|
||||
reset-names = "ahb";
|
||||
};
|
||||
|
||||
timer@01c60000 {
|
||||
compatible = "allwinner,sun6i-a31-hstimer",
|
||||
"allwinner,sun7i-a20-hstimer";
|
||||
|
||||
@@ -754,6 +754,14 @@
|
||||
status = "disabled";
|
||||
};
|
||||
|
||||
crypto: crypto-engine@01c15000 {
|
||||
compatible = "allwinner,sun4i-a10-crypto";
|
||||
reg = <0x01c15000 0x1000>;
|
||||
interrupts = <GIC_SPI 86 IRQ_TYPE_LEVEL_HIGH>;
|
||||
clocks = <&ahb_gates 5>, <&ss_clk>;
|
||||
clock-names = "ahb", "mod";
|
||||
};
|
||||
|
||||
spi2: spi@01c17000 {
|
||||
compatible = "allwinner,sun4i-a10-spi";
|
||||
reg = <0x01c17000 0x1000>;
|
||||
|
||||
@@ -354,8 +354,7 @@ CONFIG_PROVE_LOCKING=y
|
||||
# CONFIG_FTRACE is not set
|
||||
# CONFIG_ARM_UNWIND is not set
|
||||
CONFIG_SECURITYFS=y
|
||||
# CONFIG_CRYPTO_ANSI_CPRNG is not set
|
||||
# CONFIG_CRYPTO_HW is not set
|
||||
CONFIG_CRYPTO_DEV_FSL_CAAM=y
|
||||
CONFIG_CRC_CCITT=m
|
||||
CONFIG_CRC_T10DIF=y
|
||||
CONFIG_CRC7=m
|
||||
|
||||
2
arch/arm/crypto/.gitignore
vendored
2
arch/arm/crypto/.gitignore
vendored
@@ -1 +1,3 @@
|
||||
aesbs-core.S
|
||||
sha256-core.S
|
||||
sha512-core.S
|
||||
|
||||
@@ -124,7 +124,7 @@ static void ccm_calculate_auth_mac(struct aead_request *req, u8 mac[])
|
||||
|
||||
ce_aes_ccm_auth_data(mac, (u8 *)<ag, ltag.len, &macp, ctx->key_enc,
|
||||
num_rounds(ctx));
|
||||
scatterwalk_start(&walk, req->assoc);
|
||||
scatterwalk_start(&walk, req->src);
|
||||
|
||||
do {
|
||||
u32 n = scatterwalk_clamp(&walk, len);
|
||||
@@ -151,6 +151,10 @@ static int ccm_encrypt(struct aead_request *req)
|
||||
struct crypto_aes_ctx *ctx = crypto_aead_ctx(aead);
|
||||
struct blkcipher_desc desc = { .info = req->iv };
|
||||
struct blkcipher_walk walk;
|
||||
struct scatterlist srcbuf[2];
|
||||
struct scatterlist dstbuf[2];
|
||||
struct scatterlist *src;
|
||||
struct scatterlist *dst;
|
||||
u8 __aligned(8) mac[AES_BLOCK_SIZE];
|
||||
u8 buf[AES_BLOCK_SIZE];
|
||||
u32 len = req->cryptlen;
|
||||
@@ -168,7 +172,12 @@ static int ccm_encrypt(struct aead_request *req)
|
||||
/* preserve the original iv for the final round */
|
||||
memcpy(buf, req->iv, AES_BLOCK_SIZE);
|
||||
|
||||
blkcipher_walk_init(&walk, req->dst, req->src, len);
|
||||
src = scatterwalk_ffwd(srcbuf, req->src, req->assoclen);
|
||||
dst = src;
|
||||
if (req->src != req->dst)
|
||||
dst = scatterwalk_ffwd(dstbuf, req->dst, req->assoclen);
|
||||
|
||||
blkcipher_walk_init(&walk, dst, src, len);
|
||||
err = blkcipher_aead_walk_virt_block(&desc, &walk, aead,
|
||||
AES_BLOCK_SIZE);
|
||||
|
||||
@@ -194,7 +203,7 @@ static int ccm_encrypt(struct aead_request *req)
|
||||
return err;
|
||||
|
||||
/* copy authtag to end of dst */
|
||||
scatterwalk_map_and_copy(mac, req->dst, req->cryptlen,
|
||||
scatterwalk_map_and_copy(mac, dst, req->cryptlen,
|
||||
crypto_aead_authsize(aead), 1);
|
||||
|
||||
return 0;
|
||||
@@ -207,6 +216,10 @@ static int ccm_decrypt(struct aead_request *req)
|
||||
unsigned int authsize = crypto_aead_authsize(aead);
|
||||
struct blkcipher_desc desc = { .info = req->iv };
|
||||
struct blkcipher_walk walk;
|
||||
struct scatterlist srcbuf[2];
|
||||
struct scatterlist dstbuf[2];
|
||||
struct scatterlist *src;
|
||||
struct scatterlist *dst;
|
||||
u8 __aligned(8) mac[AES_BLOCK_SIZE];
|
||||
u8 buf[AES_BLOCK_SIZE];
|
||||
u32 len = req->cryptlen - authsize;
|
||||
@@ -224,7 +237,12 @@ static int ccm_decrypt(struct aead_request *req)
|
||||
/* preserve the original iv for the final round */
|
||||
memcpy(buf, req->iv, AES_BLOCK_SIZE);
|
||||
|
||||
blkcipher_walk_init(&walk, req->dst, req->src, len);
|
||||
src = scatterwalk_ffwd(srcbuf, req->src, req->assoclen);
|
||||
dst = src;
|
||||
if (req->src != req->dst)
|
||||
dst = scatterwalk_ffwd(dstbuf, req->dst, req->assoclen);
|
||||
|
||||
blkcipher_walk_init(&walk, dst, src, len);
|
||||
err = blkcipher_aead_walk_virt_block(&desc, &walk, aead,
|
||||
AES_BLOCK_SIZE);
|
||||
|
||||
@@ -250,44 +268,42 @@ static int ccm_decrypt(struct aead_request *req)
|
||||
return err;
|
||||
|
||||
/* compare calculated auth tag with the stored one */
|
||||
scatterwalk_map_and_copy(buf, req->src, req->cryptlen - authsize,
|
||||
scatterwalk_map_and_copy(buf, src, req->cryptlen - authsize,
|
||||
authsize, 0);
|
||||
|
||||
if (memcmp(mac, buf, authsize))
|
||||
if (crypto_memneq(mac, buf, authsize))
|
||||
return -EBADMSG;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct crypto_alg ccm_aes_alg = {
|
||||
.cra_name = "ccm(aes)",
|
||||
.cra_driver_name = "ccm-aes-ce",
|
||||
.cra_priority = 300,
|
||||
.cra_flags = CRYPTO_ALG_TYPE_AEAD,
|
||||
.cra_blocksize = 1,
|
||||
.cra_ctxsize = sizeof(struct crypto_aes_ctx),
|
||||
.cra_alignmask = 7,
|
||||
.cra_type = &crypto_aead_type,
|
||||
.cra_module = THIS_MODULE,
|
||||
.cra_aead = {
|
||||
.ivsize = AES_BLOCK_SIZE,
|
||||
.maxauthsize = AES_BLOCK_SIZE,
|
||||
.setkey = ccm_setkey,
|
||||
.setauthsize = ccm_setauthsize,
|
||||
.encrypt = ccm_encrypt,
|
||||
.decrypt = ccm_decrypt,
|
||||
}
|
||||
static struct aead_alg ccm_aes_alg = {
|
||||
.base = {
|
||||
.cra_name = "ccm(aes)",
|
||||
.cra_driver_name = "ccm-aes-ce",
|
||||
.cra_priority = 300,
|
||||
.cra_blocksize = 1,
|
||||
.cra_ctxsize = sizeof(struct crypto_aes_ctx),
|
||||
.cra_alignmask = 7,
|
||||
.cra_module = THIS_MODULE,
|
||||
},
|
||||
.ivsize = AES_BLOCK_SIZE,
|
||||
.maxauthsize = AES_BLOCK_SIZE,
|
||||
.setkey = ccm_setkey,
|
||||
.setauthsize = ccm_setauthsize,
|
||||
.encrypt = ccm_encrypt,
|
||||
.decrypt = ccm_decrypt,
|
||||
};
|
||||
|
||||
static int __init aes_mod_init(void)
|
||||
{
|
||||
if (!(elf_hwcap & HWCAP_AES))
|
||||
return -ENODEV;
|
||||
return crypto_register_alg(&ccm_aes_alg);
|
||||
return crypto_register_aead(&ccm_aes_alg);
|
||||
}
|
||||
|
||||
static void __exit aes_mod_exit(void)
|
||||
{
|
||||
crypto_unregister_alg(&ccm_aes_alg);
|
||||
crypto_unregister_aead(&ccm_aes_alg);
|
||||
}
|
||||
|
||||
module_init(aes_mod_init);
|
||||
|
||||
@@ -29,6 +29,7 @@ static inline void save_early_sprs(struct thread_struct *prev) {}
|
||||
|
||||
extern void enable_kernel_fp(void);
|
||||
extern void enable_kernel_altivec(void);
|
||||
extern void enable_kernel_vsx(void);
|
||||
extern int emulate_altivec(struct pt_regs *);
|
||||
extern void __giveup_vsx(struct task_struct *);
|
||||
extern void giveup_vsx(struct task_struct *);
|
||||
|
||||
@@ -204,8 +204,6 @@ EXPORT_SYMBOL_GPL(flush_altivec_to_thread);
|
||||
#endif /* CONFIG_ALTIVEC */
|
||||
|
||||
#ifdef CONFIG_VSX
|
||||
#if 0
|
||||
/* not currently used, but some crazy RAID module might want to later */
|
||||
void enable_kernel_vsx(void)
|
||||
{
|
||||
WARN_ON(preemptible());
|
||||
@@ -220,7 +218,6 @@ void enable_kernel_vsx(void)
|
||||
#endif /* CONFIG_SMP */
|
||||
}
|
||||
EXPORT_SYMBOL(enable_kernel_vsx);
|
||||
#endif
|
||||
|
||||
void giveup_vsx(struct task_struct *tsk)
|
||||
{
|
||||
|
||||
@@ -20,6 +20,7 @@ obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o
|
||||
obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
|
||||
obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o
|
||||
obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o
|
||||
obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha20-x86_64.o
|
||||
obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o
|
||||
obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
|
||||
obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
|
||||
@@ -30,6 +31,7 @@ obj-$(CONFIG_CRYPTO_CRC32_PCLMUL) += crc32-pclmul.o
|
||||
obj-$(CONFIG_CRYPTO_SHA256_SSSE3) += sha256-ssse3.o
|
||||
obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o
|
||||
obj-$(CONFIG_CRYPTO_CRCT10DIF_PCLMUL) += crct10dif-pclmul.o
|
||||
obj-$(CONFIG_CRYPTO_POLY1305_X86_64) += poly1305-x86_64.o
|
||||
|
||||
# These modules require assembler to support AVX.
|
||||
ifeq ($(avx_supported),yes)
|
||||
@@ -60,6 +62,7 @@ blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o
|
||||
twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
|
||||
twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o
|
||||
salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o
|
||||
chacha20-x86_64-y := chacha20-ssse3-x86_64.o chacha20_glue.o
|
||||
serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o
|
||||
|
||||
ifeq ($(avx_supported),yes)
|
||||
@@ -75,6 +78,7 @@ endif
|
||||
|
||||
ifeq ($(avx2_supported),yes)
|
||||
camellia-aesni-avx2-y := camellia-aesni-avx2-asm_64.o camellia_aesni_avx2_glue.o
|
||||
chacha20-x86_64-y += chacha20-avx2-x86_64.o
|
||||
serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o
|
||||
endif
|
||||
|
||||
@@ -82,8 +86,10 @@ aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o
|
||||
aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o aes_ctrby8_avx-x86_64.o
|
||||
ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
|
||||
sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o
|
||||
poly1305-x86_64-y := poly1305-sse2-x86_64.o poly1305_glue.o
|
||||
ifeq ($(avx2_supported),yes)
|
||||
sha1-ssse3-y += sha1_avx2_x86_64_asm.o
|
||||
poly1305-x86_64-y += poly1305-avx2-x86_64.o
|
||||
endif
|
||||
crc32c-intel-y := crc32c-intel_glue.o
|
||||
crc32c-intel-$(CONFIG_64BIT) += crc32c-pcl-intel-asm_64.o
|
||||
|
||||
@@ -803,10 +803,7 @@ static int rfc4106_init(struct crypto_aead *aead)
|
||||
return PTR_ERR(cryptd_tfm);
|
||||
|
||||
*ctx = cryptd_tfm;
|
||||
crypto_aead_set_reqsize(
|
||||
aead,
|
||||
sizeof(struct aead_request) +
|
||||
crypto_aead_reqsize(&cryptd_tfm->base));
|
||||
crypto_aead_set_reqsize(aead, crypto_aead_reqsize(&cryptd_tfm->base));
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -955,8 +952,8 @@ static int helper_rfc4106_encrypt(struct aead_request *req)
|
||||
|
||||
/* Assuming we are supporting rfc4106 64-bit extended */
|
||||
/* sequence numbers We need to have the AAD length equal */
|
||||
/* to 8 or 12 bytes */
|
||||
if (unlikely(req->assoclen != 8 && req->assoclen != 12))
|
||||
/* to 16 or 20 bytes */
|
||||
if (unlikely(req->assoclen != 16 && req->assoclen != 20))
|
||||
return -EINVAL;
|
||||
|
||||
/* IV below built */
|
||||
@@ -992,9 +989,9 @@ static int helper_rfc4106_encrypt(struct aead_request *req)
|
||||
}
|
||||
|
||||
kernel_fpu_begin();
|
||||
aesni_gcm_enc_tfm(aes_ctx, dst, src, (unsigned long)req->cryptlen, iv,
|
||||
ctx->hash_subkey, assoc, (unsigned long)req->assoclen, dst
|
||||
+ ((unsigned long)req->cryptlen), auth_tag_len);
|
||||
aesni_gcm_enc_tfm(aes_ctx, dst, src, req->cryptlen, iv,
|
||||
ctx->hash_subkey, assoc, req->assoclen - 8,
|
||||
dst + req->cryptlen, auth_tag_len);
|
||||
kernel_fpu_end();
|
||||
|
||||
/* The authTag (aka the Integrity Check Value) needs to be written
|
||||
@@ -1033,12 +1030,12 @@ static int helper_rfc4106_decrypt(struct aead_request *req)
|
||||
struct scatter_walk dst_sg_walk;
|
||||
unsigned int i;
|
||||
|
||||
if (unlikely(req->assoclen != 8 && req->assoclen != 12))
|
||||
if (unlikely(req->assoclen != 16 && req->assoclen != 20))
|
||||
return -EINVAL;
|
||||
|
||||
/* Assuming we are supporting rfc4106 64-bit extended */
|
||||
/* sequence numbers We need to have the AAD length */
|
||||
/* equal to 8 or 12 bytes */
|
||||
/* equal to 16 or 20 bytes */
|
||||
|
||||
tempCipherLen = (unsigned long)(req->cryptlen - auth_tag_len);
|
||||
/* IV below built */
|
||||
@@ -1075,8 +1072,8 @@ static int helper_rfc4106_decrypt(struct aead_request *req)
|
||||
|
||||
kernel_fpu_begin();
|
||||
aesni_gcm_dec_tfm(aes_ctx, dst, src, tempCipherLen, iv,
|
||||
ctx->hash_subkey, assoc, (unsigned long)req->assoclen,
|
||||
authTag, auth_tag_len);
|
||||
ctx->hash_subkey, assoc, req->assoclen - 8,
|
||||
authTag, auth_tag_len);
|
||||
kernel_fpu_end();
|
||||
|
||||
/* Compare generated tag with passed in tag. */
|
||||
@@ -1105,19 +1102,12 @@ static int rfc4106_encrypt(struct aead_request *req)
|
||||
struct crypto_aead *tfm = crypto_aead_reqtfm(req);
|
||||
struct cryptd_aead **ctx = crypto_aead_ctx(tfm);
|
||||
struct cryptd_aead *cryptd_tfm = *ctx;
|
||||
struct aead_request *subreq = aead_request_ctx(req);
|
||||
|
||||
aead_request_set_tfm(subreq, irq_fpu_usable() ?
|
||||
cryptd_aead_child(cryptd_tfm) :
|
||||
&cryptd_tfm->base);
|
||||
aead_request_set_tfm(req, irq_fpu_usable() ?
|
||||
cryptd_aead_child(cryptd_tfm) :
|
||||
&cryptd_tfm->base);
|
||||
|
||||
aead_request_set_callback(subreq, req->base.flags,
|
||||
req->base.complete, req->base.data);
|
||||
aead_request_set_crypt(subreq, req->src, req->dst,
|
||||
req->cryptlen, req->iv);
|
||||
aead_request_set_ad(subreq, req->assoclen);
|
||||
|
||||
return crypto_aead_encrypt(subreq);
|
||||
return crypto_aead_encrypt(req);
|
||||
}
|
||||
|
||||
static int rfc4106_decrypt(struct aead_request *req)
|
||||
@@ -1125,19 +1115,12 @@ static int rfc4106_decrypt(struct aead_request *req)
|
||||
struct crypto_aead *tfm = crypto_aead_reqtfm(req);
|
||||
struct cryptd_aead **ctx = crypto_aead_ctx(tfm);
|
||||
struct cryptd_aead *cryptd_tfm = *ctx;
|
||||
struct aead_request *subreq = aead_request_ctx(req);
|
||||
|
||||
aead_request_set_tfm(subreq, irq_fpu_usable() ?
|
||||
cryptd_aead_child(cryptd_tfm) :
|
||||
&cryptd_tfm->base);
|
||||
aead_request_set_tfm(req, irq_fpu_usable() ?
|
||||
cryptd_aead_child(cryptd_tfm) :
|
||||
&cryptd_tfm->base);
|
||||
|
||||
aead_request_set_callback(subreq, req->base.flags,
|
||||
req->base.complete, req->base.data);
|
||||
aead_request_set_crypt(subreq, req->src, req->dst,
|
||||
req->cryptlen, req->iv);
|
||||
aead_request_set_ad(subreq, req->assoclen);
|
||||
|
||||
return crypto_aead_decrypt(subreq);
|
||||
return crypto_aead_decrypt(req);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
443
arch/x86/crypto/chacha20-avx2-x86_64.S
Normal file
443
arch/x86/crypto/chacha20-avx2-x86_64.S
Normal file
@@ -0,0 +1,443 @@
|
||||
/*
|
||||
* ChaCha20 256-bit cipher algorithm, RFC7539, x64 AVX2 functions
|
||||
*
|
||||
* Copyright (C) 2015 Martin Willi
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*/
|
||||
|
||||
#include <linux/linkage.h>
|
||||
|
||||
.data
|
||||
.align 32
|
||||
|
||||
ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003
|
||||
.octa 0x0e0d0c0f0a09080b0605040702010003
|
||||
ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302
|
||||
.octa 0x0d0c0f0e09080b0a0504070601000302
|
||||
CTRINC: .octa 0x00000003000000020000000100000000
|
||||
.octa 0x00000007000000060000000500000004
|
||||
|
||||
.text
|
||||
|
||||
ENTRY(chacha20_8block_xor_avx2)
|
||||
# %rdi: Input state matrix, s
|
||||
# %rsi: 8 data blocks output, o
|
||||
# %rdx: 8 data blocks input, i
|
||||
|
||||
# This function encrypts eight consecutive ChaCha20 blocks by loading
|
||||
# the state matrix in AVX registers eight times. As we need some
|
||||
# scratch registers, we save the first four registers on the stack. The
|
||||
# algorithm performs each operation on the corresponding word of each
|
||||
# state matrix, hence requires no word shuffling. For final XORing step
|
||||
# we transpose the matrix by interleaving 32-, 64- and then 128-bit
|
||||
# words, which allows us to do XOR in AVX registers. 8/16-bit word
|
||||
# rotation is done with the slightly better performing byte shuffling,
|
||||
# 7/12-bit word rotation uses traditional shift+OR.
|
||||
|
||||
vzeroupper
|
||||
# 4 * 32 byte stack, 32-byte aligned
|
||||
mov %rsp, %r8
|
||||
and $~31, %rsp
|
||||
sub $0x80, %rsp
|
||||
|
||||
# x0..15[0-7] = s[0..15]
|
||||
vpbroadcastd 0x00(%rdi),%ymm0
|
||||
vpbroadcastd 0x04(%rdi),%ymm1
|
||||
vpbroadcastd 0x08(%rdi),%ymm2
|
||||
vpbroadcastd 0x0c(%rdi),%ymm3
|
||||
vpbroadcastd 0x10(%rdi),%ymm4
|
||||
vpbroadcastd 0x14(%rdi),%ymm5
|
||||
vpbroadcastd 0x18(%rdi),%ymm6
|
||||
vpbroadcastd 0x1c(%rdi),%ymm7
|
||||
vpbroadcastd 0x20(%rdi),%ymm8
|
||||
vpbroadcastd 0x24(%rdi),%ymm9
|
||||
vpbroadcastd 0x28(%rdi),%ymm10
|
||||
vpbroadcastd 0x2c(%rdi),%ymm11
|
||||
vpbroadcastd 0x30(%rdi),%ymm12
|
||||
vpbroadcastd 0x34(%rdi),%ymm13
|
||||
vpbroadcastd 0x38(%rdi),%ymm14
|
||||
vpbroadcastd 0x3c(%rdi),%ymm15
|
||||
# x0..3 on stack
|
||||
vmovdqa %ymm0,0x00(%rsp)
|
||||
vmovdqa %ymm1,0x20(%rsp)
|
||||
vmovdqa %ymm2,0x40(%rsp)
|
||||
vmovdqa %ymm3,0x60(%rsp)
|
||||
|
||||
vmovdqa CTRINC(%rip),%ymm1
|
||||
vmovdqa ROT8(%rip),%ymm2
|
||||
vmovdqa ROT16(%rip),%ymm3
|
||||
|
||||
# x12 += counter values 0-3
|
||||
vpaddd %ymm1,%ymm12,%ymm12
|
||||
|
||||
mov $10,%ecx
|
||||
|
||||
.Ldoubleround8:
|
||||
# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
|
||||
vpaddd 0x00(%rsp),%ymm4,%ymm0
|
||||
vmovdqa %ymm0,0x00(%rsp)
|
||||
vpxor %ymm0,%ymm12,%ymm12
|
||||
vpshufb %ymm3,%ymm12,%ymm12
|
||||
# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
|
||||
vpaddd 0x20(%rsp),%ymm5,%ymm0
|
||||
vmovdqa %ymm0,0x20(%rsp)
|
||||
vpxor %ymm0,%ymm13,%ymm13
|
||||
vpshufb %ymm3,%ymm13,%ymm13
|
||||
# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
|
||||
vpaddd 0x40(%rsp),%ymm6,%ymm0
|
||||
vmovdqa %ymm0,0x40(%rsp)
|
||||
vpxor %ymm0,%ymm14,%ymm14
|
||||
vpshufb %ymm3,%ymm14,%ymm14
|
||||
# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
|
||||
vpaddd 0x60(%rsp),%ymm7,%ymm0
|
||||
vmovdqa %ymm0,0x60(%rsp)
|
||||
vpxor %ymm0,%ymm15,%ymm15
|
||||
vpshufb %ymm3,%ymm15,%ymm15
|
||||
|
||||
# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
|
||||
vpaddd %ymm12,%ymm8,%ymm8
|
||||
vpxor %ymm8,%ymm4,%ymm4
|
||||
vpslld $12,%ymm4,%ymm0
|
||||
vpsrld $20,%ymm4,%ymm4
|
||||
vpor %ymm0,%ymm4,%ymm4
|
||||
# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
|
||||
vpaddd %ymm13,%ymm9,%ymm9
|
||||
vpxor %ymm9,%ymm5,%ymm5
|
||||
vpslld $12,%ymm5,%ymm0
|
||||
vpsrld $20,%ymm5,%ymm5
|
||||
vpor %ymm0,%ymm5,%ymm5
|
||||
# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
|
||||
vpaddd %ymm14,%ymm10,%ymm10
|
||||
vpxor %ymm10,%ymm6,%ymm6
|
||||
vpslld $12,%ymm6,%ymm0
|
||||
vpsrld $20,%ymm6,%ymm6
|
||||
vpor %ymm0,%ymm6,%ymm6
|
||||
# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
|
||||
vpaddd %ymm15,%ymm11,%ymm11
|
||||
vpxor %ymm11,%ymm7,%ymm7
|
||||
vpslld $12,%ymm7,%ymm0
|
||||
vpsrld $20,%ymm7,%ymm7
|
||||
vpor %ymm0,%ymm7,%ymm7
|
||||
|
||||
# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
|
||||
vpaddd 0x00(%rsp),%ymm4,%ymm0
|
||||
vmovdqa %ymm0,0x00(%rsp)
|
||||
vpxor %ymm0,%ymm12,%ymm12
|
||||
vpshufb %ymm2,%ymm12,%ymm12
|
||||
# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
|
||||
vpaddd 0x20(%rsp),%ymm5,%ymm0
|
||||
vmovdqa %ymm0,0x20(%rsp)
|
||||
vpxor %ymm0,%ymm13,%ymm13
|
||||
vpshufb %ymm2,%ymm13,%ymm13
|
||||
# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
|
||||
vpaddd 0x40(%rsp),%ymm6,%ymm0
|
||||
vmovdqa %ymm0,0x40(%rsp)
|
||||
vpxor %ymm0,%ymm14,%ymm14
|
||||
vpshufb %ymm2,%ymm14,%ymm14
|
||||
# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
|
||||
vpaddd 0x60(%rsp),%ymm7,%ymm0
|
||||
vmovdqa %ymm0,0x60(%rsp)
|
||||
vpxor %ymm0,%ymm15,%ymm15
|
||||
vpshufb %ymm2,%ymm15,%ymm15
|
||||
|
||||
# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
|
||||
vpaddd %ymm12,%ymm8,%ymm8
|
||||
vpxor %ymm8,%ymm4,%ymm4
|
||||
vpslld $7,%ymm4,%ymm0
|
||||
vpsrld $25,%ymm4,%ymm4
|
||||
vpor %ymm0,%ymm4,%ymm4
|
||||
# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
|
||||
vpaddd %ymm13,%ymm9,%ymm9
|
||||
vpxor %ymm9,%ymm5,%ymm5
|
||||
vpslld $7,%ymm5,%ymm0
|
||||
vpsrld $25,%ymm5,%ymm5
|
||||
vpor %ymm0,%ymm5,%ymm5
|
||||
# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
|
||||
vpaddd %ymm14,%ymm10,%ymm10
|
||||
vpxor %ymm10,%ymm6,%ymm6
|
||||
vpslld $7,%ymm6,%ymm0
|
||||
vpsrld $25,%ymm6,%ymm6
|
||||
vpor %ymm0,%ymm6,%ymm6
|
||||
# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
|
||||
vpaddd %ymm15,%ymm11,%ymm11
|
||||
vpxor %ymm11,%ymm7,%ymm7
|
||||
vpslld $7,%ymm7,%ymm0
|
||||
vpsrld $25,%ymm7,%ymm7
|
||||
vpor %ymm0,%ymm7,%ymm7
|
||||
|
||||
# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
|
||||
vpaddd 0x00(%rsp),%ymm5,%ymm0
|
||||
vmovdqa %ymm0,0x00(%rsp)
|
||||
vpxor %ymm0,%ymm15,%ymm15
|
||||
vpshufb %ymm3,%ymm15,%ymm15
|
||||
# x1 += x6, x12 = rotl32(x12 ^ x1, 16)%ymm0
|
||||
vpaddd 0x20(%rsp),%ymm6,%ymm0
|
||||
vmovdqa %ymm0,0x20(%rsp)
|
||||
vpxor %ymm0,%ymm12,%ymm12
|
||||
vpshufb %ymm3,%ymm12,%ymm12
|
||||
# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
|
||||
vpaddd 0x40(%rsp),%ymm7,%ymm0
|
||||
vmovdqa %ymm0,0x40(%rsp)
|
||||
vpxor %ymm0,%ymm13,%ymm13
|
||||
vpshufb %ymm3,%ymm13,%ymm13
|
||||
# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
|
||||
vpaddd 0x60(%rsp),%ymm4,%ymm0
|
||||
vmovdqa %ymm0,0x60(%rsp)
|
||||
vpxor %ymm0,%ymm14,%ymm14
|
||||
vpshufb %ymm3,%ymm14,%ymm14
|
||||
|
||||
# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
|
||||
vpaddd %ymm15,%ymm10,%ymm10
|
||||
vpxor %ymm10,%ymm5,%ymm5
|
||||
vpslld $12,%ymm5,%ymm0
|
||||
vpsrld $20,%ymm5,%ymm5
|
||||
vpor %ymm0,%ymm5,%ymm5
|
||||
# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
|
||||
vpaddd %ymm12,%ymm11,%ymm11
|
||||
vpxor %ymm11,%ymm6,%ymm6
|
||||
vpslld $12,%ymm6,%ymm0
|
||||
vpsrld $20,%ymm6,%ymm6
|
||||
vpor %ymm0,%ymm6,%ymm6
|
||||
# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
|
||||
vpaddd %ymm13,%ymm8,%ymm8
|
||||
vpxor %ymm8,%ymm7,%ymm7
|
||||
vpslld $12,%ymm7,%ymm0
|
||||
vpsrld $20,%ymm7,%ymm7
|
||||
vpor %ymm0,%ymm7,%ymm7
|
||||
# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
|
||||
vpaddd %ymm14,%ymm9,%ymm9
|
||||
vpxor %ymm9,%ymm4,%ymm4
|
||||
vpslld $12,%ymm4,%ymm0
|
||||
vpsrld $20,%ymm4,%ymm4
|
||||
vpor %ymm0,%ymm4,%ymm4
|
||||
|
||||
# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
|
||||
vpaddd 0x00(%rsp),%ymm5,%ymm0
|
||||
vmovdqa %ymm0,0x00(%rsp)
|
||||
vpxor %ymm0,%ymm15,%ymm15
|
||||
vpshufb %ymm2,%ymm15,%ymm15
|
||||
# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
|
||||
vpaddd 0x20(%rsp),%ymm6,%ymm0
|
||||
vmovdqa %ymm0,0x20(%rsp)
|
||||
vpxor %ymm0,%ymm12,%ymm12
|
||||
vpshufb %ymm2,%ymm12,%ymm12
|
||||
# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
|
||||
vpaddd 0x40(%rsp),%ymm7,%ymm0
|
||||
vmovdqa %ymm0,0x40(%rsp)
|
||||
vpxor %ymm0,%ymm13,%ymm13
|
||||
vpshufb %ymm2,%ymm13,%ymm13
|
||||
# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
|
||||
vpaddd 0x60(%rsp),%ymm4,%ymm0
|
||||
vmovdqa %ymm0,0x60(%rsp)
|
||||
vpxor %ymm0,%ymm14,%ymm14
|
||||
vpshufb %ymm2,%ymm14,%ymm14
|
||||
|
||||
# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
|
||||
vpaddd %ymm15,%ymm10,%ymm10
|
||||
vpxor %ymm10,%ymm5,%ymm5
|
||||
vpslld $7,%ymm5,%ymm0
|
||||
vpsrld $25,%ymm5,%ymm5
|
||||
vpor %ymm0,%ymm5,%ymm5
|
||||
# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
|
||||
vpaddd %ymm12,%ymm11,%ymm11
|
||||
vpxor %ymm11,%ymm6,%ymm6
|
||||
vpslld $7,%ymm6,%ymm0
|
||||
vpsrld $25,%ymm6,%ymm6
|
||||
vpor %ymm0,%ymm6,%ymm6
|
||||
# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
|
||||
vpaddd %ymm13,%ymm8,%ymm8
|
||||
vpxor %ymm8,%ymm7,%ymm7
|
||||
vpslld $7,%ymm7,%ymm0
|
||||
vpsrld $25,%ymm7,%ymm7
|
||||
vpor %ymm0,%ymm7,%ymm7
|
||||
# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
|
||||
vpaddd %ymm14,%ymm9,%ymm9
|
||||
vpxor %ymm9,%ymm4,%ymm4
|
||||
vpslld $7,%ymm4,%ymm0
|
||||
vpsrld $25,%ymm4,%ymm4
|
||||
vpor %ymm0,%ymm4,%ymm4
|
||||
|
||||
dec %ecx
|
||||
jnz .Ldoubleround8
|
||||
|
||||
# x0..15[0-3] += s[0..15]
|
||||
vpbroadcastd 0x00(%rdi),%ymm0
|
||||
vpaddd 0x00(%rsp),%ymm0,%ymm0
|
||||
vmovdqa %ymm0,0x00(%rsp)
|
||||
vpbroadcastd 0x04(%rdi),%ymm0
|
||||
vpaddd 0x20(%rsp),%ymm0,%ymm0
|
||||
vmovdqa %ymm0,0x20(%rsp)
|
||||
vpbroadcastd 0x08(%rdi),%ymm0
|
||||
vpaddd 0x40(%rsp),%ymm0,%ymm0
|
||||
vmovdqa %ymm0,0x40(%rsp)
|
||||
vpbroadcastd 0x0c(%rdi),%ymm0
|
||||
vpaddd 0x60(%rsp),%ymm0,%ymm0
|
||||
vmovdqa %ymm0,0x60(%rsp)
|
||||
vpbroadcastd 0x10(%rdi),%ymm0
|
||||
vpaddd %ymm0,%ymm4,%ymm4
|
||||
vpbroadcastd 0x14(%rdi),%ymm0
|
||||
vpaddd %ymm0,%ymm5,%ymm5
|
||||
vpbroadcastd 0x18(%rdi),%ymm0
|
||||
vpaddd %ymm0,%ymm6,%ymm6
|
||||
vpbroadcastd 0x1c(%rdi),%ymm0
|
||||
vpaddd %ymm0,%ymm7,%ymm7
|
||||
vpbroadcastd 0x20(%rdi),%ymm0
|
||||
vpaddd %ymm0,%ymm8,%ymm8
|
||||
vpbroadcastd 0x24(%rdi),%ymm0
|
||||
vpaddd %ymm0,%ymm9,%ymm9
|
||||
vpbroadcastd 0x28(%rdi),%ymm0
|
||||
vpaddd %ymm0,%ymm10,%ymm10
|
||||
vpbroadcastd 0x2c(%rdi),%ymm0
|
||||
vpaddd %ymm0,%ymm11,%ymm11
|
||||
vpbroadcastd 0x30(%rdi),%ymm0
|
||||
vpaddd %ymm0,%ymm12,%ymm12
|
||||
vpbroadcastd 0x34(%rdi),%ymm0
|
||||
vpaddd %ymm0,%ymm13,%ymm13
|
||||
vpbroadcastd 0x38(%rdi),%ymm0
|
||||
vpaddd %ymm0,%ymm14,%ymm14
|
||||
vpbroadcastd 0x3c(%rdi),%ymm0
|
||||
vpaddd %ymm0,%ymm15,%ymm15
|
||||
|
||||
# x12 += counter values 0-3
|
||||
vpaddd %ymm1,%ymm12,%ymm12
|
||||
|
||||
# interleave 32-bit words in state n, n+1
|
||||
vmovdqa 0x00(%rsp),%ymm0
|
||||
vmovdqa 0x20(%rsp),%ymm1
|
||||
vpunpckldq %ymm1,%ymm0,%ymm2
|
||||
vpunpckhdq %ymm1,%ymm0,%ymm1
|
||||
vmovdqa %ymm2,0x00(%rsp)
|
||||
vmovdqa %ymm1,0x20(%rsp)
|
||||
vmovdqa 0x40(%rsp),%ymm0
|
||||
vmovdqa 0x60(%rsp),%ymm1
|
||||
vpunpckldq %ymm1,%ymm0,%ymm2
|
||||
vpunpckhdq %ymm1,%ymm0,%ymm1
|
||||
vmovdqa %ymm2,0x40(%rsp)
|
||||
vmovdqa %ymm1,0x60(%rsp)
|
||||
vmovdqa %ymm4,%ymm0
|
||||
vpunpckldq %ymm5,%ymm0,%ymm4
|
||||
vpunpckhdq %ymm5,%ymm0,%ymm5
|
||||
vmovdqa %ymm6,%ymm0
|
||||
vpunpckldq %ymm7,%ymm0,%ymm6
|
||||
vpunpckhdq %ymm7,%ymm0,%ymm7
|
||||
vmovdqa %ymm8,%ymm0
|
||||
vpunpckldq %ymm9,%ymm0,%ymm8
|
||||
vpunpckhdq %ymm9,%ymm0,%ymm9
|
||||
vmovdqa %ymm10,%ymm0
|
||||
vpunpckldq %ymm11,%ymm0,%ymm10
|
||||
vpunpckhdq %ymm11,%ymm0,%ymm11
|
||||
vmovdqa %ymm12,%ymm0
|
||||
vpunpckldq %ymm13,%ymm0,%ymm12
|
||||
vpunpckhdq %ymm13,%ymm0,%ymm13
|
||||
vmovdqa %ymm14,%ymm0
|
||||
vpunpckldq %ymm15,%ymm0,%ymm14
|
||||
vpunpckhdq %ymm15,%ymm0,%ymm15
|
||||
|
||||
# interleave 64-bit words in state n, n+2
|
||||
vmovdqa 0x00(%rsp),%ymm0
|
||||
vmovdqa 0x40(%rsp),%ymm2
|
||||
vpunpcklqdq %ymm2,%ymm0,%ymm1
|
||||
vpunpckhqdq %ymm2,%ymm0,%ymm2
|
||||
vmovdqa %ymm1,0x00(%rsp)
|
||||
vmovdqa %ymm2,0x40(%rsp)
|
||||
vmovdqa 0x20(%rsp),%ymm0
|
||||
vmovdqa 0x60(%rsp),%ymm2
|
||||
vpunpcklqdq %ymm2,%ymm0,%ymm1
|
||||
vpunpckhqdq %ymm2,%ymm0,%ymm2
|
||||
vmovdqa %ymm1,0x20(%rsp)
|
||||
vmovdqa %ymm2,0x60(%rsp)
|
||||
vmovdqa %ymm4,%ymm0
|
||||
vpunpcklqdq %ymm6,%ymm0,%ymm4
|
||||
vpunpckhqdq %ymm6,%ymm0,%ymm6
|
||||
vmovdqa %ymm5,%ymm0
|
||||
vpunpcklqdq %ymm7,%ymm0,%ymm5
|
||||
vpunpckhqdq %ymm7,%ymm0,%ymm7
|
||||
vmovdqa %ymm8,%ymm0
|
||||
vpunpcklqdq %ymm10,%ymm0,%ymm8
|
||||
vpunpckhqdq %ymm10,%ymm0,%ymm10
|
||||
vmovdqa %ymm9,%ymm0
|
||||
vpunpcklqdq %ymm11,%ymm0,%ymm9
|
||||
vpunpckhqdq %ymm11,%ymm0,%ymm11
|
||||
vmovdqa %ymm12,%ymm0
|
||||
vpunpcklqdq %ymm14,%ymm0,%ymm12
|
||||
vpunpckhqdq %ymm14,%ymm0,%ymm14
|
||||
vmovdqa %ymm13,%ymm0
|
||||
vpunpcklqdq %ymm15,%ymm0,%ymm13
|
||||
vpunpckhqdq %ymm15,%ymm0,%ymm15
|
||||
|
||||
# interleave 128-bit words in state n, n+4
|
||||
vmovdqa 0x00(%rsp),%ymm0
|
||||
vperm2i128 $0x20,%ymm4,%ymm0,%ymm1
|
||||
vperm2i128 $0x31,%ymm4,%ymm0,%ymm4
|
||||
vmovdqa %ymm1,0x00(%rsp)
|
||||
vmovdqa 0x20(%rsp),%ymm0
|
||||
vperm2i128 $0x20,%ymm5,%ymm0,%ymm1
|
||||
vperm2i128 $0x31,%ymm5,%ymm0,%ymm5
|
||||
vmovdqa %ymm1,0x20(%rsp)
|
||||
vmovdqa 0x40(%rsp),%ymm0
|
||||
vperm2i128 $0x20,%ymm6,%ymm0,%ymm1
|
||||
vperm2i128 $0x31,%ymm6,%ymm0,%ymm6
|
||||
vmovdqa %ymm1,0x40(%rsp)
|
||||
vmovdqa 0x60(%rsp),%ymm0
|
||||
vperm2i128 $0x20,%ymm7,%ymm0,%ymm1
|
||||
vperm2i128 $0x31,%ymm7,%ymm0,%ymm7
|
||||
vmovdqa %ymm1,0x60(%rsp)
|
||||
vperm2i128 $0x20,%ymm12,%ymm8,%ymm0
|
||||
vperm2i128 $0x31,%ymm12,%ymm8,%ymm12
|
||||
vmovdqa %ymm0,%ymm8
|
||||
vperm2i128 $0x20,%ymm13,%ymm9,%ymm0
|
||||
vperm2i128 $0x31,%ymm13,%ymm9,%ymm13
|
||||
vmovdqa %ymm0,%ymm9
|
||||
vperm2i128 $0x20,%ymm14,%ymm10,%ymm0
|
||||
vperm2i128 $0x31,%ymm14,%ymm10,%ymm14
|
||||
vmovdqa %ymm0,%ymm10
|
||||
vperm2i128 $0x20,%ymm15,%ymm11,%ymm0
|
||||
vperm2i128 $0x31,%ymm15,%ymm11,%ymm15
|
||||
vmovdqa %ymm0,%ymm11
|
||||
|
||||
# xor with corresponding input, write to output
|
||||
vmovdqa 0x00(%rsp),%ymm0
|
||||
vpxor 0x0000(%rdx),%ymm0,%ymm0
|
||||
vmovdqu %ymm0,0x0000(%rsi)
|
||||
vmovdqa 0x20(%rsp),%ymm0
|
||||
vpxor 0x0080(%rdx),%ymm0,%ymm0
|
||||
vmovdqu %ymm0,0x0080(%rsi)
|
||||
vmovdqa 0x40(%rsp),%ymm0
|
||||
vpxor 0x0040(%rdx),%ymm0,%ymm0
|
||||
vmovdqu %ymm0,0x0040(%rsi)
|
||||
vmovdqa 0x60(%rsp),%ymm0
|
||||
vpxor 0x00c0(%rdx),%ymm0,%ymm0
|
||||
vmovdqu %ymm0,0x00c0(%rsi)
|
||||
vpxor 0x0100(%rdx),%ymm4,%ymm4
|
||||
vmovdqu %ymm4,0x0100(%rsi)
|
||||
vpxor 0x0180(%rdx),%ymm5,%ymm5
|
||||
vmovdqu %ymm5,0x00180(%rsi)
|
||||
vpxor 0x0140(%rdx),%ymm6,%ymm6
|
||||
vmovdqu %ymm6,0x0140(%rsi)
|
||||
vpxor 0x01c0(%rdx),%ymm7,%ymm7
|
||||
vmovdqu %ymm7,0x01c0(%rsi)
|
||||
vpxor 0x0020(%rdx),%ymm8,%ymm8
|
||||
vmovdqu %ymm8,0x0020(%rsi)
|
||||
vpxor 0x00a0(%rdx),%ymm9,%ymm9
|
||||
vmovdqu %ymm9,0x00a0(%rsi)
|
||||
vpxor 0x0060(%rdx),%ymm10,%ymm10
|
||||
vmovdqu %ymm10,0x0060(%rsi)
|
||||
vpxor 0x00e0(%rdx),%ymm11,%ymm11
|
||||
vmovdqu %ymm11,0x00e0(%rsi)
|
||||
vpxor 0x0120(%rdx),%ymm12,%ymm12
|
||||
vmovdqu %ymm12,0x0120(%rsi)
|
||||
vpxor 0x01a0(%rdx),%ymm13,%ymm13
|
||||
vmovdqu %ymm13,0x01a0(%rsi)
|
||||
vpxor 0x0160(%rdx),%ymm14,%ymm14
|
||||
vmovdqu %ymm14,0x0160(%rsi)
|
||||
vpxor 0x01e0(%rdx),%ymm15,%ymm15
|
||||
vmovdqu %ymm15,0x01e0(%rsi)
|
||||
|
||||
vzeroupper
|
||||
mov %r8,%rsp
|
||||
ret
|
||||
ENDPROC(chacha20_8block_xor_avx2)
|
||||
625
arch/x86/crypto/chacha20-ssse3-x86_64.S
Normal file
625
arch/x86/crypto/chacha20-ssse3-x86_64.S
Normal file
File diff suppressed because it is too large
Load Diff
150
arch/x86/crypto/chacha20_glue.c
Normal file
150
arch/x86/crypto/chacha20_glue.c
Normal file
@@ -0,0 +1,150 @@
|
||||
/*
|
||||
* ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code
|
||||
*
|
||||
* Copyright (C) 2015 Martin Willi
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*/
|
||||
|
||||
#include <crypto/algapi.h>
|
||||
#include <crypto/chacha20.h>
|
||||
#include <linux/crypto.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/module.h>
|
||||
#include <asm/fpu/api.h>
|
||||
#include <asm/simd.h>
|
||||
|
||||
#define CHACHA20_STATE_ALIGN 16
|
||||
|
||||
asmlinkage void chacha20_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src);
|
||||
asmlinkage void chacha20_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src);
|
||||
#ifdef CONFIG_AS_AVX2
|
||||
asmlinkage void chacha20_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src);
|
||||
static bool chacha20_use_avx2;
|
||||
#endif
|
||||
|
||||
static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src,
|
||||
unsigned int bytes)
|
||||
{
|
||||
u8 buf[CHACHA20_BLOCK_SIZE];
|
||||
|
||||
#ifdef CONFIG_AS_AVX2
|
||||
if (chacha20_use_avx2) {
|
||||
while (bytes >= CHACHA20_BLOCK_SIZE * 8) {
|
||||
chacha20_8block_xor_avx2(state, dst, src);
|
||||
bytes -= CHACHA20_BLOCK_SIZE * 8;
|
||||
src += CHACHA20_BLOCK_SIZE * 8;
|
||||
dst += CHACHA20_BLOCK_SIZE * 8;
|
||||
state[12] += 8;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
while (bytes >= CHACHA20_BLOCK_SIZE * 4) {
|
||||
chacha20_4block_xor_ssse3(state, dst, src);
|
||||
bytes -= CHACHA20_BLOCK_SIZE * 4;
|
||||
src += CHACHA20_BLOCK_SIZE * 4;
|
||||
dst += CHACHA20_BLOCK_SIZE * 4;
|
||||
state[12] += 4;
|
||||
}
|
||||
while (bytes >= CHACHA20_BLOCK_SIZE) {
|
||||
chacha20_block_xor_ssse3(state, dst, src);
|
||||
bytes -= CHACHA20_BLOCK_SIZE;
|
||||
src += CHACHA20_BLOCK_SIZE;
|
||||
dst += CHACHA20_BLOCK_SIZE;
|
||||
state[12]++;
|
||||
}
|
||||
if (bytes) {
|
||||
memcpy(buf, src, bytes);
|
||||
chacha20_block_xor_ssse3(state, buf, buf);
|
||||
memcpy(dst, buf, bytes);
|
||||
}
|
||||
}
|
||||
|
||||
static int chacha20_simd(struct blkcipher_desc *desc, struct scatterlist *dst,
|
||||
struct scatterlist *src, unsigned int nbytes)
|
||||
{
|
||||
u32 *state, state_buf[16 + (CHACHA20_STATE_ALIGN / sizeof(u32)) - 1];
|
||||
struct blkcipher_walk walk;
|
||||
int err;
|
||||
|
||||
if (!may_use_simd())
|
||||
return crypto_chacha20_crypt(desc, dst, src, nbytes);
|
||||
|
||||
state = (u32 *)roundup((uintptr_t)state_buf, CHACHA20_STATE_ALIGN);
|
||||
|
||||
blkcipher_walk_init(&walk, dst, src, nbytes);
|
||||
err = blkcipher_walk_virt_block(desc, &walk, CHACHA20_BLOCK_SIZE);
|
||||
|
||||
crypto_chacha20_init(state, crypto_blkcipher_ctx(desc->tfm), walk.iv);
|
||||
|
||||
kernel_fpu_begin();
|
||||
|
||||
while (walk.nbytes >= CHACHA20_BLOCK_SIZE) {
|
||||
chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
|
||||
rounddown(walk.nbytes, CHACHA20_BLOCK_SIZE));
|
||||
err = blkcipher_walk_done(desc, &walk,
|
||||
walk.nbytes % CHACHA20_BLOCK_SIZE);
|
||||
}
|
||||
|
||||
if (walk.nbytes) {
|
||||
chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
|
||||
walk.nbytes);
|
||||
err = blkcipher_walk_done(desc, &walk, 0);
|
||||
}
|
||||
|
||||
kernel_fpu_end();
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
static struct crypto_alg alg = {
|
||||
.cra_name = "chacha20",
|
||||
.cra_driver_name = "chacha20-simd",
|
||||
.cra_priority = 300,
|
||||
.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
|
||||
.cra_blocksize = 1,
|
||||
.cra_type = &crypto_blkcipher_type,
|
||||
.cra_ctxsize = sizeof(struct chacha20_ctx),
|
||||
.cra_alignmask = sizeof(u32) - 1,
|
||||
.cra_module = THIS_MODULE,
|
||||
.cra_u = {
|
||||
.blkcipher = {
|
||||
.min_keysize = CHACHA20_KEY_SIZE,
|
||||
.max_keysize = CHACHA20_KEY_SIZE,
|
||||
.ivsize = CHACHA20_IV_SIZE,
|
||||
.geniv = "seqiv",
|
||||
.setkey = crypto_chacha20_setkey,
|
||||
.encrypt = chacha20_simd,
|
||||
.decrypt = chacha20_simd,
|
||||
},
|
||||
},
|
||||
};
|
||||
|
||||
static int __init chacha20_simd_mod_init(void)
|
||||
{
|
||||
if (!cpu_has_ssse3)
|
||||
return -ENODEV;
|
||||
|
||||
#ifdef CONFIG_AS_AVX2
|
||||
chacha20_use_avx2 = cpu_has_avx && cpu_has_avx2 &&
|
||||
cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, NULL);
|
||||
#endif
|
||||
return crypto_register_alg(&alg);
|
||||
}
|
||||
|
||||
static void __exit chacha20_simd_mod_fini(void)
|
||||
{
|
||||
crypto_unregister_alg(&alg);
|
||||
}
|
||||
|
||||
module_init(chacha20_simd_mod_init);
|
||||
module_exit(chacha20_simd_mod_fini);
|
||||
|
||||
MODULE_LICENSE("GPL");
|
||||
MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
|
||||
MODULE_DESCRIPTION("chacha20 cipher algorithm, SIMD accelerated");
|
||||
MODULE_ALIAS_CRYPTO("chacha20");
|
||||
MODULE_ALIAS_CRYPTO("chacha20-simd");
|
||||
386
arch/x86/crypto/poly1305-avx2-x86_64.S
Normal file
386
arch/x86/crypto/poly1305-avx2-x86_64.S
Normal file
@@ -0,0 +1,386 @@
|
||||
/*
|
||||
* Poly1305 authenticator algorithm, RFC7539, x64 AVX2 functions
|
||||
*
|
||||
* Copyright (C) 2015 Martin Willi
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*/
|
||||
|
||||
#include <linux/linkage.h>
|
||||
|
||||
.data
|
||||
.align 32
|
||||
|
||||
ANMASK: .octa 0x0000000003ffffff0000000003ffffff
|
||||
.octa 0x0000000003ffffff0000000003ffffff
|
||||
ORMASK: .octa 0x00000000010000000000000001000000
|
||||
.octa 0x00000000010000000000000001000000
|
||||
|
||||
.text
|
||||
|
||||
#define h0 0x00(%rdi)
|
||||
#define h1 0x04(%rdi)
|
||||
#define h2 0x08(%rdi)
|
||||
#define h3 0x0c(%rdi)
|
||||
#define h4 0x10(%rdi)
|
||||
#define r0 0x00(%rdx)
|
||||
#define r1 0x04(%rdx)
|
||||
#define r2 0x08(%rdx)
|
||||
#define r3 0x0c(%rdx)
|
||||
#define r4 0x10(%rdx)
|
||||
#define u0 0x00(%r8)
|
||||
#define u1 0x04(%r8)
|
||||
#define u2 0x08(%r8)
|
||||
#define u3 0x0c(%r8)
|
||||
#define u4 0x10(%r8)
|
||||
#define w0 0x14(%r8)
|
||||
#define w1 0x18(%r8)
|
||||
#define w2 0x1c(%r8)
|
||||
#define w3 0x20(%r8)
|
||||
#define w4 0x24(%r8)
|
||||
#define y0 0x28(%r8)
|
||||
#define y1 0x2c(%r8)
|
||||
#define y2 0x30(%r8)
|
||||
#define y3 0x34(%r8)
|
||||
#define y4 0x38(%r8)
|
||||
#define m %rsi
|
||||
#define hc0 %ymm0
|
||||
#define hc1 %ymm1
|
||||
#define hc2 %ymm2
|
||||
#define hc3 %ymm3
|
||||
#define hc4 %ymm4
|
||||
#define hc0x %xmm0
|
||||
#define hc1x %xmm1
|
||||
#define hc2x %xmm2
|
||||
#define hc3x %xmm3
|
||||
#define hc4x %xmm4
|
||||
#define t1 %ymm5
|
||||
#define t2 %ymm6
|
||||
#define t1x %xmm5
|
||||
#define t2x %xmm6
|
||||
#define ruwy0 %ymm7
|
||||
#define ruwy1 %ymm8
|
||||
#define ruwy2 %ymm9
|
||||
#define ruwy3 %ymm10
|
||||
#define ruwy4 %ymm11
|
||||
#define ruwy0x %xmm7
|
||||
#define ruwy1x %xmm8
|
||||
#define ruwy2x %xmm9
|
||||
#define ruwy3x %xmm10
|
||||
#define ruwy4x %xmm11
|
||||
#define svxz1 %ymm12
|
||||
#define svxz2 %ymm13
|
||||
#define svxz3 %ymm14
|
||||
#define svxz4 %ymm15
|
||||
#define d0 %r9
|
||||
#define d1 %r10
|
||||
#define d2 %r11
|
||||
#define d3 %r12
|
||||
#define d4 %r13
|
||||
|
||||
ENTRY(poly1305_4block_avx2)
|
||||
# %rdi: Accumulator h[5]
|
||||
# %rsi: 64 byte input block m
|
||||
# %rdx: Poly1305 key r[5]
|
||||
# %rcx: Quadblock count
|
||||
# %r8: Poly1305 derived key r^2 u[5], r^3 w[5], r^4 y[5],
|
||||
|
||||
# This four-block variant uses loop unrolled block processing. It
|
||||
# requires 4 Poly1305 keys: r, r^2, r^3 and r^4:
|
||||
# h = (h + m) * r => h = (h + m1) * r^4 + m2 * r^3 + m3 * r^2 + m4 * r
|
||||
|
||||
vzeroupper
|
||||
push %rbx
|
||||
push %r12
|
||||
push %r13
|
||||
|
||||
# combine r0,u0,w0,y0
|
||||
vmovd y0,ruwy0x
|
||||
vmovd w0,t1x
|
||||
vpunpcklqdq t1,ruwy0,ruwy0
|
||||
vmovd u0,t1x
|
||||
vmovd r0,t2x
|
||||
vpunpcklqdq t2,t1,t1
|
||||
vperm2i128 $0x20,t1,ruwy0,ruwy0
|
||||
|
||||
# combine r1,u1,w1,y1 and s1=r1*5,v1=u1*5,x1=w1*5,z1=y1*5
|
||||
vmovd y1,ruwy1x
|
||||
vmovd w1,t1x
|
||||
vpunpcklqdq t1,ruwy1,ruwy1
|
||||
vmovd u1,t1x
|
||||
vmovd r1,t2x
|
||||
vpunpcklqdq t2,t1,t1
|
||||
vperm2i128 $0x20,t1,ruwy1,ruwy1
|
||||
vpslld $2,ruwy1,svxz1
|
||||
vpaddd ruwy1,svxz1,svxz1
|
||||
|
||||
# combine r2,u2,w2,y2 and s2=r2*5,v2=u2*5,x2=w2*5,z2=y2*5
|
||||
vmovd y2,ruwy2x
|
||||
vmovd w2,t1x
|
||||
vpunpcklqdq t1,ruwy2,ruwy2
|
||||
vmovd u2,t1x
|
||||
vmovd r2,t2x
|
||||
vpunpcklqdq t2,t1,t1
|
||||
vperm2i128 $0x20,t1,ruwy2,ruwy2
|
||||
vpslld $2,ruwy2,svxz2
|
||||
vpaddd ruwy2,svxz2,svxz2
|
||||
|
||||
# combine r3,u3,w3,y3 and s3=r3*5,v3=u3*5,x3=w3*5,z3=y3*5
|
||||
vmovd y3,ruwy3x
|
||||
vmovd w3,t1x
|
||||
vpunpcklqdq t1,ruwy3,ruwy3
|
||||
vmovd u3,t1x
|
||||
vmovd r3,t2x
|
||||
vpunpcklqdq t2,t1,t1
|
||||
vperm2i128 $0x20,t1,ruwy3,ruwy3
|
||||
vpslld $2,ruwy3,svxz3
|
||||
vpaddd ruwy3,svxz3,svxz3
|
||||
|
||||
# combine r4,u4,w4,y4 and s4=r4*5,v4=u4*5,x4=w4*5,z4=y4*5
|
||||
vmovd y4,ruwy4x
|
||||
vmovd w4,t1x
|
||||
vpunpcklqdq t1,ruwy4,ruwy4
|
||||
vmovd u4,t1x
|
||||
vmovd r4,t2x
|
||||
vpunpcklqdq t2,t1,t1
|
||||
vperm2i128 $0x20,t1,ruwy4,ruwy4
|
||||
vpslld $2,ruwy4,svxz4
|
||||
vpaddd ruwy4,svxz4,svxz4
|
||||
|
||||
.Ldoblock4:
|
||||
# hc0 = [m[48-51] & 0x3ffffff, m[32-35] & 0x3ffffff,
|
||||
# m[16-19] & 0x3ffffff, m[ 0- 3] & 0x3ffffff + h0]
|
||||
vmovd 0x00(m),hc0x
|
||||
vmovd 0x10(m),t1x
|
||||
vpunpcklqdq t1,hc0,hc0
|
||||
vmovd 0x20(m),t1x
|
||||
vmovd 0x30(m),t2x
|
||||
vpunpcklqdq t2,t1,t1
|
||||
vperm2i128 $0x20,t1,hc0,hc0
|
||||
vpand ANMASK(%rip),hc0,hc0
|
||||
vmovd h0,t1x
|
||||
vpaddd t1,hc0,hc0
|
||||
# hc1 = [(m[51-54] >> 2) & 0x3ffffff, (m[35-38] >> 2) & 0x3ffffff,
|
||||
# (m[19-22] >> 2) & 0x3ffffff, (m[ 3- 6] >> 2) & 0x3ffffff + h1]
|
||||
vmovd 0x03(m),hc1x
|
||||
vmovd 0x13(m),t1x
|
||||
vpunpcklqdq t1,hc1,hc1
|
||||
vmovd 0x23(m),t1x
|
||||
vmovd 0x33(m),t2x
|
||||
vpunpcklqdq t2,t1,t1
|
||||
vperm2i128 $0x20,t1,hc1,hc1
|
||||
vpsrld $2,hc1,hc1
|
||||
vpand ANMASK(%rip),hc1,hc1
|
||||
vmovd h1,t1x
|
||||
vpaddd t1,hc1,hc1
|
||||
# hc2 = [(m[54-57] >> 4) & 0x3ffffff, (m[38-41] >> 4) & 0x3ffffff,
|
||||
# (m[22-25] >> 4) & 0x3ffffff, (m[ 6- 9] >> 4) & 0x3ffffff + h2]
|
||||
vmovd 0x06(m),hc2x
|
||||
vmovd 0x16(m),t1x
|
||||
vpunpcklqdq t1,hc2,hc2
|
||||
vmovd 0x26(m),t1x
|
||||
vmovd 0x36(m),t2x
|
||||
vpunpcklqdq t2,t1,t1
|
||||
vperm2i128 $0x20,t1,hc2,hc2
|
||||
vpsrld $4,hc2,hc2
|
||||
vpand ANMASK(%rip),hc2,hc2
|
||||
vmovd h2,t1x
|
||||
vpaddd t1,hc2,hc2
|
||||
# hc3 = [(m[57-60] >> 6) & 0x3ffffff, (m[41-44] >> 6) & 0x3ffffff,
|
||||
# (m[25-28] >> 6) & 0x3ffffff, (m[ 9-12] >> 6) & 0x3ffffff + h3]
|
||||
vmovd 0x09(m),hc3x
|
||||
vmovd 0x19(m),t1x
|
||||
vpunpcklqdq t1,hc3,hc3
|
||||
vmovd 0x29(m),t1x
|
||||
vmovd 0x39(m),t2x
|
||||
vpunpcklqdq t2,t1,t1
|
||||
vperm2i128 $0x20,t1,hc3,hc3
|
||||
vpsrld $6,hc3,hc3
|
||||
vpand ANMASK(%rip),hc3,hc3
|
||||
vmovd h3,t1x
|
||||
vpaddd t1,hc3,hc3
|
||||
# hc4 = [(m[60-63] >> 8) | (1<<24), (m[44-47] >> 8) | (1<<24),
|
||||
# (m[28-31] >> 8) | (1<<24), (m[12-15] >> 8) | (1<<24) + h4]
|
||||
vmovd 0x0c(m),hc4x
|
||||
vmovd 0x1c(m),t1x
|
||||
vpunpcklqdq t1,hc4,hc4
|
||||
vmovd 0x2c(m),t1x
|
||||
vmovd 0x3c(m),t2x
|
||||
vpunpcklqdq t2,t1,t1
|
||||
vperm2i128 $0x20,t1,hc4,hc4
|
||||
vpsrld $8,hc4,hc4
|
||||
vpor ORMASK(%rip),hc4,hc4
|
||||
vmovd h4,t1x
|
||||
vpaddd t1,hc4,hc4
|
||||
|
||||
# t1 = [ hc0[3] * r0, hc0[2] * u0, hc0[1] * w0, hc0[0] * y0 ]
|
||||
vpmuludq hc0,ruwy0,t1
|
||||
# t1 += [ hc1[3] * s4, hc1[2] * v4, hc1[1] * x4, hc1[0] * z4 ]
|
||||
vpmuludq hc1,svxz4,t2
|
||||
vpaddq t2,t1,t1
|
||||
# t1 += [ hc2[3] * s3, hc2[2] * v3, hc2[1] * x3, hc2[0] * z3 ]
|
||||
vpmuludq hc2,svxz3,t2
|
||||
vpaddq t2,t1,t1
|
||||
# t1 += [ hc3[3] * s2, hc3[2] * v2, hc3[1] * x2, hc3[0] * z2 ]
|
||||
vpmuludq hc3,svxz2,t2
|
||||
vpaddq t2,t1,t1
|
||||
# t1 += [ hc4[3] * s1, hc4[2] * v1, hc4[1] * x1, hc4[0] * z1 ]
|
||||
vpmuludq hc4,svxz1,t2
|
||||
vpaddq t2,t1,t1
|
||||
# d0 = t1[0] + t1[1] + t[2] + t[3]
|
||||
vpermq $0xee,t1,t2
|
||||
vpaddq t2,t1,t1
|
||||
vpsrldq $8,t1,t2
|
||||
vpaddq t2,t1,t1
|
||||
vmovq t1x,d0
|
||||
|
||||
# t1 = [ hc0[3] * r1, hc0[2] * u1,hc0[1] * w1, hc0[0] * y1 ]
|
||||
vpmuludq hc0,ruwy1,t1
|
||||
# t1 += [ hc1[3] * r0, hc1[2] * u0, hc1[1] * w0, hc1[0] * y0 ]
|
||||
vpmuludq hc1,ruwy0,t2
|
||||
vpaddq t2,t1,t1
|
||||
# t1 += [ hc2[3] * s4, hc2[2] * v4, hc2[1] * x4, hc2[0] * z4 ]
|
||||
vpmuludq hc2,svxz4,t2
|
||||
vpaddq t2,t1,t1
|
||||
# t1 += [ hc3[3] * s3, hc3[2] * v3, hc3[1] * x3, hc3[0] * z3 ]
|
||||
vpmuludq hc3,svxz3,t2
|
||||
vpaddq t2,t1,t1
|
||||
# t1 += [ hc4[3] * s2, hc4[2] * v2, hc4[1] * x2, hc4[0] * z2 ]
|
||||
vpmuludq hc4,svxz2,t2
|
||||
vpaddq t2,t1,t1
|
||||
# d1 = t1[0] + t1[1] + t1[3] + t1[4]
|
||||
vpermq $0xee,t1,t2
|
||||
vpaddq t2,t1,t1
|
||||
vpsrldq $8,t1,t2
|
||||
vpaddq t2,t1,t1
|
||||
vmovq t1x,d1
|
||||
|
||||
# t1 = [ hc0[3] * r2, hc0[2] * u2, hc0[1] * w2, hc0[0] * y2 ]
|
||||
vpmuludq hc0,ruwy2,t1
|
||||
# t1 += [ hc1[3] * r1, hc1[2] * u1, hc1[1] * w1, hc1[0] * y1 ]
|
||||
vpmuludq hc1,ruwy1,t2
|
||||
vpaddq t2,t1,t1
|
||||
# t1 += [ hc2[3] * r0, hc2[2] * u0, hc2[1] * w0, hc2[0] * y0 ]
|
||||
vpmuludq hc2,ruwy0,t2
|
||||
vpaddq t2,t1,t1
|
||||
# t1 += [ hc3[3] * s4, hc3[2] * v4, hc3[1] * x4, hc3[0] * z4 ]
|
||||
vpmuludq hc3,svxz4,t2
|
||||
vpaddq t2,t1,t1
|
||||
# t1 += [ hc4[3] * s3, hc4[2] * v3, hc4[1] * x3, hc4[0] * z3 ]
|
||||
vpmuludq hc4,svxz3,t2
|
||||
vpaddq t2,t1,t1
|
||||
# d2 = t1[0] + t1[1] + t1[2] + t1[3]
|
||||
vpermq $0xee,t1,t2
|
||||
vpaddq t2,t1,t1
|
||||
vpsrldq $8,t1,t2
|
||||
vpaddq t2,t1,t1
|
||||
vmovq t1x,d2
|
||||
|
||||
# t1 = [ hc0[3] * r3, hc0[2] * u3, hc0[1] * w3, hc0[0] * y3 ]
|
||||
vpmuludq hc0,ruwy3,t1
|
||||
# t1 += [ hc1[3] * r2, hc1[2] * u2, hc1[1] * w2, hc1[0] * y2 ]
|
||||
vpmuludq hc1,ruwy2,t2
|
||||
vpaddq t2,t1,t1
|
||||
# t1 += [ hc2[3] * r1, hc2[2] * u1, hc2[1] * w1, hc2[0] * y1 ]
|
||||
vpmuludq hc2,ruwy1,t2
|
||||
vpaddq t2,t1,t1
|
||||
# t1 += [ hc3[3] * r0, hc3[2] * u0, hc3[1] * w0, hc3[0] * y0 ]
|
||||
vpmuludq hc3,ruwy0,t2
|
||||
vpaddq t2,t1,t1
|
||||
# t1 += [ hc4[3] * s4, hc4[2] * v4, hc4[1] * x4, hc4[0] * z4 ]
|
||||
vpmuludq hc4,svxz4,t2
|
||||
vpaddq t2,t1,t1
|
||||
# d3 = t1[0] + t1[1] + t1[2] + t1[3]
|
||||
vpermq $0xee,t1,t2
|
||||
vpaddq t2,t1,t1
|
||||
vpsrldq $8,t1,t2
|
||||
vpaddq t2,t1,t1
|
||||
vmovq t1x,d3
|
||||
|
||||
# t1 = [ hc0[3] * r4, hc0[2] * u4, hc0[1] * w4, hc0[0] * y4 ]
|
||||
vpmuludq hc0,ruwy4,t1
|
||||
# t1 += [ hc1[3] * r3, hc1[2] * u3, hc1[1] * w3, hc1[0] * y3 ]
|
||||
vpmuludq hc1,ruwy3,t2
|
||||
vpaddq t2,t1,t1
|
||||
# t1 += [ hc2[3] * r2, hc2[2] * u2, hc2[1] * w2, hc2[0] * y2 ]
|
||||
vpmuludq hc2,ruwy2,t2
|
||||
vpaddq t2,t1,t1
|
||||
# t1 += [ hc3[3] * r1, hc3[2] * u1, hc3[1] * w1, hc3[0] * y1 ]
|
||||
vpmuludq hc3,ruwy1,t2
|
||||
vpaddq t2,t1,t1
|
||||
# t1 += [ hc4[3] * r0, hc4[2] * u0, hc4[1] * w0, hc4[0] * y0 ]
|
||||
vpmuludq hc4,ruwy0,t2
|
||||
vpaddq t2,t1,t1
|
||||
# d4 = t1[0] + t1[1] + t1[2] + t1[3]
|
||||
vpermq $0xee,t1,t2
|
||||
vpaddq t2,t1,t1
|
||||
vpsrldq $8,t1,t2
|
||||
vpaddq t2,t1,t1
|
||||
vmovq t1x,d4
|
||||
|
||||
# d1 += d0 >> 26
|
||||
mov d0,%rax
|
||||
shr $26,%rax
|
||||
add %rax,d1
|
||||
# h0 = d0 & 0x3ffffff
|
||||
mov d0,%rbx
|
||||
and $0x3ffffff,%ebx
|
||||
|
||||
# d2 += d1 >> 26
|
||||
mov d1,%rax
|
||||
shr $26,%rax
|
||||
add %rax,d2
|
||||
# h1 = d1 & 0x3ffffff
|
||||
mov d1,%rax
|
||||
and $0x3ffffff,%eax
|
||||
mov %eax,h1
|
||||
|
||||
# d3 += d2 >> 26
|
||||
mov d2,%rax
|
||||
shr $26,%rax
|
||||
add %rax,d3
|
||||
# h2 = d2 & 0x3ffffff
|
||||
mov d2,%rax
|
||||
and $0x3ffffff,%eax
|
||||
mov %eax,h2
|
||||
|
||||
# d4 += d3 >> 26
|
||||
mov d3,%rax
|
||||
shr $26,%rax
|
||||
add %rax,d4
|
||||
# h3 = d3 & 0x3ffffff
|
||||
mov d3,%rax
|
||||
and $0x3ffffff,%eax
|
||||
mov %eax,h3
|
||||
|
||||
# h0 += (d4 >> 26) * 5
|
||||
mov d4,%rax
|
||||
shr $26,%rax
|
||||
lea (%eax,%eax,4),%eax
|
||||
add %eax,%ebx
|
||||
# h4 = d4 & 0x3ffffff
|
||||
mov d4,%rax
|
||||
and $0x3ffffff,%eax
|
||||
mov %eax,h4
|
||||
|
||||
# h1 += h0 >> 26
|
||||
mov %ebx,%eax
|
||||
shr $26,%eax
|
||||
add %eax,h1
|
||||
# h0 = h0 & 0x3ffffff
|
||||
andl $0x3ffffff,%ebx
|
||||
mov %ebx,h0
|
||||
|
||||
add $0x40,m
|
||||
dec %rcx
|
||||
jnz .Ldoblock4
|
||||
|
||||
vzeroupper
|
||||
pop %r13
|
||||
pop %r12
|
||||
pop %rbx
|
||||
ret
|
||||
ENDPROC(poly1305_4block_avx2)
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user