Mirror of https://github.com/armbian/linux-cix.git, synced 2026-01-06 12:30:45 -08:00.
Merge tag 'v5.20-p1' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6
Pull crypto updates from Herbert Xu:

 "API:
   - Make proc files report fips module name and version

  Algorithms:
   - Move generic SHA1 code into lib/crypto
   - Implement Chinese Remainder Theorem for RSA
   - Remove blake2s
   - Add XCTR with x86/arm64 acceleration
   - Add POLYVAL with x86/arm64 acceleration
   - Add HCTR2
   - Add ARIA

  Drivers:
   - Add support for new CCP/PSP device ID in ccp"

* tag 'v5.20-p1' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6: (89 commits)
  crypto: tcrypt - Remove the static variable initialisations to NULL
  crypto: arm64/poly1305 - fix a read out-of-bound
  crypto: hisilicon/zip - Use the bitmap API to allocate bitmaps
  crypto: hisilicon/sec - fix auth key size error
  crypto: ccree - Remove a useless dma_supported() call
  crypto: ccp - Add support for new CCP/PSP device ID
  crypto: inside-secure - Add missing MODULE_DEVICE_TABLE for of
  crypto: hisilicon/hpre - don't use GFP_KERNEL to alloc mem during softirq
  crypto: testmgr - some more fixes to RSA test vectors
  cyrpto: powerpc/aes - delete the rebundant word "block" in comments
  hwrng: via - Fix comment typo
  crypto: twofish - Fix comment typo
  crypto: rmd160 - fix Kconfig "its" grammar
  crypto: keembay-ocs-ecc - Drop if with an always false condition
  Documentation: qat: rewrite description
  Documentation: qat: Use code block for qat sysfs example
  crypto: lib - add module license to libsha1
  crypto: lib - make the sha1 library optional
  crypto: lib - move lib/sha1.c into lib/crypto/
  crypto: fips - make proc files report fips module name and version
  ...
Documentation/ABI/testing/sysfs-driver-qat (new file, 49 lines)
@@ -0,0 +1,49 @@
What:		/sys/bus/pci/devices/<BDF>/qat/state
Date:		June 2022
KernelVersion:	5.20
Contact:	qat-linux@intel.com
Description:	(RW) Reports the current state of the QAT device. Write to
		the file to start or stop the device.

		The values are:

		* up: the device is up and running
		* down: the device is down

		It is possible to transition the device from up to down only
		if the device is up and vice versa.

		This attribute is only available for qat_4xxx devices.

What:		/sys/bus/pci/devices/<BDF>/qat/cfg_services
Date:		June 2022
KernelVersion:	5.20
Contact:	qat-linux@intel.com
Description:	(RW) Reports the current configuration of the QAT device.
		Write to the file to change the configured services.

		The values are:

		* sym;asym: the device is configured for running crypto
		  services
		* dc: the device is configured for running compression services

		It is possible to set the configuration only if the device
		is in the `down` state (see /sys/bus/pci/devices/<BDF>/qat/state)

		The following example shows how to change the configuration of
		a device configured for running crypto services in order to
		run data compression::

			# cat /sys/bus/pci/devices/<BDF>/qat/state
			up
			# cat /sys/bus/pci/devices/<BDF>/qat/cfg_services
			sym;asym
			# echo down > /sys/bus/pci/devices/<BDF>/qat/state
			# echo dc > /sys/bus/pci/devices/<BDF>/qat/cfg_services
			# echo up > /sys/bus/pci/devices/<BDF>/qat/state
			# cat /sys/bus/pci/devices/<BDF>/qat/cfg_services
			dc

		This attribute is only available for qat_4xxx devices.
@@ -337,6 +337,7 @@ Currently, the following pairs of encryption modes are supported:

- AES-256-XTS for contents and AES-256-CTS-CBC for filenames
- AES-128-CBC for contents and AES-128-CTS-CBC for filenames
- Adiantum for both contents and filenames
- AES-256-XTS for contents and AES-256-HCTR2 for filenames (v2 policies only)

If unsure, you should use the (AES-256-XTS, AES-256-CTS-CBC) pair.
@@ -357,6 +358,17 @@ To use Adiantum, CONFIG_CRYPTO_ADIANTUM must be enabled. Also, fast
implementations of ChaCha and NHPoly1305 should be enabled, e.g.
CONFIG_CRYPTO_CHACHA20_NEON and CONFIG_CRYPTO_NHPOLY1305_NEON for ARM.

AES-256-HCTR2 is another true wide-block encryption mode that is intended for
use on CPUs with dedicated crypto instructions. AES-256-HCTR2 has the property
that a bitflip in the plaintext changes the entire ciphertext. This property
makes it desirable for filename encryption since initialization vectors are
reused within a directory. For more details on AES-256-HCTR2, see the paper
"Length-preserving encryption with HCTR2"
(https://eprint.iacr.org/2021/1441.pdf). To use AES-256-HCTR2,
CONFIG_CRYPTO_HCTR2 must be enabled. Also, fast implementations of XCTR and
POLYVAL should be enabled, e.g. CRYPTO_POLYVAL_ARM64_CE and
CRYPTO_AES_ARM64_CE_BLK for ARM64.
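For example, an arm64 build that provides all of the options named above could
carry a .config fragment along these lines (a minimal illustrative sketch; only
the symbols mentioned in this paragraph are listed, nothing else about the
configuration is implied)::

	CONFIG_CRYPTO_HCTR2=y
	CONFIG_CRYPTO_POLYVAL_ARM64_CE=y
	CONFIG_CRYPTO_AES_ARM64_CE_BLK=y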

New encryption modes can be added relatively easily, without changes
to individual filesystems. However, authenticated encryption (AE)
modes are not currently supported because of the difficulty of dealing
@@ -404,11 +416,11 @@ alternatively has the file's nonce (for `DIRECT_KEY policies`_) or
inode number (for `IV_INO_LBLK_64 policies`_) included in the IVs.
Thus, IV reuse is limited to within a single directory.

With CTS-CBC, the IV reuse means that when the plaintext filenames
share a common prefix at least as long as the cipher block size (16
bytes for AES), the corresponding encrypted filenames will also share
a common prefix. This is undesirable. Adiantum does not have this
weakness, as it is a wide-block encryption mode.
With CTS-CBC, the IV reuse means that when the plaintext filenames share a
common prefix at least as long as the cipher block size (16 bytes for AES), the
corresponding encrypted filenames will also share a common prefix. This is
undesirable. Adiantum and HCTR2 do not have this weakness, as they are
wide-block encryption modes.

All supported filenames encryption modes accept any plaintext length
>= 16 bytes; cipher block alignment is not required. However,
MAINTAINERS
@@ -9079,15 +9079,24 @@ S:	Supported
F:	Documentation/admin-guide/perf/hns3-pmu.rst
F:	drivers/perf/hisilicon/hns3_pmu.c

HISILICON QM AND ZIP Controller DRIVER
HISILICON QM DRIVER
M:	Weili Qian <qianweili@huawei.com>
M:	Zhou Wang <wangzhou1@hisilicon.com>
L:	linux-crypto@vger.kernel.org
S:	Maintained
F:	drivers/crypto/hisilicon/Kconfig
F:	drivers/crypto/hisilicon/Makefile
F:	drivers/crypto/hisilicon/qm.c
F:	drivers/crypto/hisilicon/sgl.c
F:	include/linux/hisi_acc_qm.h

HISILICON ZIP Controller DRIVER
M:	Yang Shen <shenyang39@huawei.com>
M:	Zhou Wang <wangzhou1@hisilicon.com>
L:	linux-crypto@vger.kernel.org
S:	Maintained
F:	Documentation/ABI/testing/debugfs-hisi-zip
F:	drivers/crypto/hisilicon/qm.c
F:	drivers/crypto/hisilicon/sgl.c
F:	drivers/crypto/hisilicon/zip/
F:	include/linux/hisi_acc_qm.h

HISILICON ROCE DRIVER
M:	Wenpeng Liang <liangwenpeng@huawei.com>
@@ -63,7 +63,7 @@ config CRYPTO_SHA512_ARM
	  using optimized ARM assembler and NEON, when available.

config CRYPTO_BLAKE2S_ARM
	tristate "BLAKE2s digest algorithm (ARM)"
	bool "BLAKE2s digest algorithm (ARM)"
	select CRYPTO_ARCH_HAVE_LIB_BLAKE2S
	help
	  BLAKE2s digest algorithm optimized with ARM scalar instructions. This
@@ -9,8 +9,7 @@ obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o
obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o
obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o
obj-$(CONFIG_CRYPTO_SHA512_ARM) += sha512-arm.o
obj-$(CONFIG_CRYPTO_BLAKE2S_ARM) += blake2s-arm.o
obj-$(if $(CONFIG_CRYPTO_BLAKE2S_ARM),y) += libblake2s-arm.o
obj-$(CONFIG_CRYPTO_BLAKE2S_ARM) += libblake2s-arm.o
obj-$(CONFIG_CRYPTO_BLAKE2B_NEON) += blake2b-neon.o
obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o
obj-$(CONFIG_CRYPTO_POLY1305_ARM) += poly1305-arm.o
@@ -32,7 +31,6 @@ sha256-arm-neon-$(CONFIG_KERNEL_MODE_NEON) := sha256_neon_glue.o
sha256-arm-y := sha256-core.o sha256_glue.o $(sha256-arm-neon-y)
sha512-arm-neon-$(CONFIG_KERNEL_MODE_NEON) := sha512-neon-glue.o
sha512-arm-y := sha512-core.o sha512-glue.o $(sha512-arm-neon-y)
blake2s-arm-y := blake2s-shash.o
libblake2s-arm-y:= blake2s-core.o blake2s-glue.o
blake2b-neon-y := blake2b-neon-core.o blake2b-neon-glue.o
sha1-arm-ce-y := sha1-ce-core.o sha1-ce-glue.o
@@ -1,75 +0,0 @@
|
||||
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||
/*
|
||||
* BLAKE2s digest algorithm, ARM scalar implementation
|
||||
*
|
||||
* Copyright 2020 Google LLC
|
||||
*/
|
||||
|
||||
#include <crypto/internal/blake2s.h>
|
||||
#include <crypto/internal/hash.h>
|
||||
|
||||
#include <linux/module.h>
|
||||
|
||||
static int crypto_blake2s_update_arm(struct shash_desc *desc,
|
||||
const u8 *in, unsigned int inlen)
|
||||
{
|
||||
return crypto_blake2s_update(desc, in, inlen, false);
|
||||
}
|
||||
|
||||
static int crypto_blake2s_final_arm(struct shash_desc *desc, u8 *out)
|
||||
{
|
||||
return crypto_blake2s_final(desc, out, false);
|
||||
}
|
||||
|
||||
#define BLAKE2S_ALG(name, driver_name, digest_size) \
|
||||
{ \
|
||||
.base.cra_name = name, \
|
||||
.base.cra_driver_name = driver_name, \
|
||||
.base.cra_priority = 200, \
|
||||
.base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, \
|
||||
.base.cra_blocksize = BLAKE2S_BLOCK_SIZE, \
|
||||
.base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx), \
|
||||
.base.cra_module = THIS_MODULE, \
|
||||
.digestsize = digest_size, \
|
||||
.setkey = crypto_blake2s_setkey, \
|
||||
.init = crypto_blake2s_init, \
|
||||
.update = crypto_blake2s_update_arm, \
|
||||
.final = crypto_blake2s_final_arm, \
|
||||
.descsize = sizeof(struct blake2s_state), \
|
||||
}
|
||||
|
||||
static struct shash_alg blake2s_arm_algs[] = {
|
||||
BLAKE2S_ALG("blake2s-128", "blake2s-128-arm", BLAKE2S_128_HASH_SIZE),
|
||||
BLAKE2S_ALG("blake2s-160", "blake2s-160-arm", BLAKE2S_160_HASH_SIZE),
|
||||
BLAKE2S_ALG("blake2s-224", "blake2s-224-arm", BLAKE2S_224_HASH_SIZE),
|
||||
BLAKE2S_ALG("blake2s-256", "blake2s-256-arm", BLAKE2S_256_HASH_SIZE),
|
||||
};
|
||||
|
||||
static int __init blake2s_arm_mod_init(void)
|
||||
{
|
||||
return IS_REACHABLE(CONFIG_CRYPTO_HASH) ?
|
||||
crypto_register_shashes(blake2s_arm_algs,
|
||||
ARRAY_SIZE(blake2s_arm_algs)) : 0;
|
||||
}
|
||||
|
||||
static void __exit blake2s_arm_mod_exit(void)
|
||||
{
|
||||
if (IS_REACHABLE(CONFIG_CRYPTO_HASH))
|
||||
crypto_unregister_shashes(blake2s_arm_algs,
|
||||
ARRAY_SIZE(blake2s_arm_algs));
|
||||
}
|
||||
|
||||
module_init(blake2s_arm_mod_init);
|
||||
module_exit(blake2s_arm_mod_exit);
|
||||
|
||||
MODULE_DESCRIPTION("BLAKE2s digest algorithm, ARM scalar implementation");
|
||||
MODULE_LICENSE("GPL");
|
||||
MODULE_AUTHOR("Eric Biggers <ebiggers@google.com>");
|
||||
MODULE_ALIAS_CRYPTO("blake2s-128");
|
||||
MODULE_ALIAS_CRYPTO("blake2s-128-arm");
|
||||
MODULE_ALIAS_CRYPTO("blake2s-160");
|
||||
MODULE_ALIAS_CRYPTO("blake2s-160-arm");
|
||||
MODULE_ALIAS_CRYPTO("blake2s-224");
|
||||
MODULE_ALIAS_CRYPTO("blake2s-224-arm");
|
||||
MODULE_ALIAS_CRYPTO("blake2s-256");
|
||||
MODULE_ALIAS_CRYPTO("blake2s-256-arm");
|
||||
@@ -71,6 +71,12 @@ config CRYPTO_GHASH_ARM64_CE
	select CRYPTO_HASH
	select CRYPTO_GF128MUL
	select CRYPTO_LIB_AES
	select CRYPTO_AEAD

config CRYPTO_POLYVAL_ARM64_CE
	tristate "POLYVAL using ARMv8 Crypto Extensions (for HCTR2)"
	depends on KERNEL_MODE_NEON
	select CRYPTO_POLYVAL

config CRYPTO_CRCT10DIF_ARM64_CE
	tristate "CRCT10DIF digest algorithm using PMULL instructions"
@@ -96,13 +102,13 @@ config CRYPTO_AES_ARM64_CE_CCM
	select CRYPTO_LIB_AES

config CRYPTO_AES_ARM64_CE_BLK
	tristate "AES in ECB/CBC/CTR/XTS modes using ARMv8 Crypto Extensions"
	tristate "AES in ECB/CBC/CTR/XTS/XCTR modes using ARMv8 Crypto Extensions"
	depends on KERNEL_MODE_NEON
	select CRYPTO_SKCIPHER
	select CRYPTO_AES_ARM64_CE

config CRYPTO_AES_ARM64_NEON_BLK
	tristate "AES in ECB/CBC/CTR/XTS modes using NEON instructions"
	tristate "AES in ECB/CBC/CTR/XTS/XCTR modes using NEON instructions"
	depends on KERNEL_MODE_NEON
	select CRYPTO_SKCIPHER
	select CRYPTO_LIB_AES
@@ -32,6 +32,9 @@ sm4-neon-y := sm4-neon-glue.o sm4-neon-core.o
obj-$(CONFIG_CRYPTO_GHASH_ARM64_CE) += ghash-ce.o
ghash-ce-y := ghash-ce-glue.o ghash-ce-core.o

obj-$(CONFIG_CRYPTO_POLYVAL_ARM64_CE) += polyval-ce.o
polyval-ce-y := polyval-ce-glue.o polyval-ce-core.o

obj-$(CONFIG_CRYPTO_CRCT10DIF_ARM64_CE) += crct10dif-ce.o
crct10dif-ce-y := crct10dif-ce-core.o crct10dif-ce-glue.o
@@ -34,10 +34,11 @@
|
||||
#define aes_essiv_cbc_encrypt ce_aes_essiv_cbc_encrypt
|
||||
#define aes_essiv_cbc_decrypt ce_aes_essiv_cbc_decrypt
|
||||
#define aes_ctr_encrypt ce_aes_ctr_encrypt
|
||||
#define aes_xctr_encrypt ce_aes_xctr_encrypt
|
||||
#define aes_xts_encrypt ce_aes_xts_encrypt
|
||||
#define aes_xts_decrypt ce_aes_xts_decrypt
|
||||
#define aes_mac_update ce_aes_mac_update
|
||||
MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 Crypto Extensions");
|
||||
MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS/XCTR using ARMv8 Crypto Extensions");
|
||||
#else
|
||||
#define MODE "neon"
|
||||
#define PRIO 200
|
||||
@@ -50,16 +51,18 @@ MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 Crypto Extensions");
|
||||
#define aes_essiv_cbc_encrypt neon_aes_essiv_cbc_encrypt
|
||||
#define aes_essiv_cbc_decrypt neon_aes_essiv_cbc_decrypt
|
||||
#define aes_ctr_encrypt neon_aes_ctr_encrypt
|
||||
#define aes_xctr_encrypt neon_aes_xctr_encrypt
|
||||
#define aes_xts_encrypt neon_aes_xts_encrypt
|
||||
#define aes_xts_decrypt neon_aes_xts_decrypt
|
||||
#define aes_mac_update neon_aes_mac_update
|
||||
MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 NEON");
|
||||
MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS/XCTR using ARMv8 NEON");
|
||||
#endif
|
||||
#if defined(USE_V8_CRYPTO_EXTENSIONS) || !IS_ENABLED(CONFIG_CRYPTO_AES_ARM64_BS)
|
||||
MODULE_ALIAS_CRYPTO("ecb(aes)");
|
||||
MODULE_ALIAS_CRYPTO("cbc(aes)");
|
||||
MODULE_ALIAS_CRYPTO("ctr(aes)");
|
||||
MODULE_ALIAS_CRYPTO("xts(aes)");
|
||||
MODULE_ALIAS_CRYPTO("xctr(aes)");
|
||||
#endif
|
||||
MODULE_ALIAS_CRYPTO("cts(cbc(aes))");
|
||||
MODULE_ALIAS_CRYPTO("essiv(cbc(aes),sha256)");
|
||||
@@ -89,6 +92,9 @@ asmlinkage void aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[],
|
||||
asmlinkage void aes_ctr_encrypt(u8 out[], u8 const in[], u32 const rk[],
|
||||
int rounds, int bytes, u8 ctr[]);
|
||||
|
||||
asmlinkage void aes_xctr_encrypt(u8 out[], u8 const in[], u32 const rk[],
|
||||
int rounds, int bytes, u8 ctr[], int byte_ctr);
|
||||
|
||||
asmlinkage void aes_xts_encrypt(u8 out[], u8 const in[], u32 const rk1[],
|
||||
int rounds, int bytes, u32 const rk2[], u8 iv[],
|
||||
int first);
|
||||
@@ -442,6 +448,52 @@ static int __maybe_unused essiv_cbc_decrypt(struct skcipher_request *req)
	return err ?: cbc_decrypt_walk(req, &walk);
}

static int __maybe_unused xctr_encrypt(struct skcipher_request *req)
{
	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
	struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm);
	int err, rounds = 6 + ctx->key_length / 4;
	struct skcipher_walk walk;
	unsigned int byte_ctr = 0;

	err = skcipher_walk_virt(&walk, req, false);

	while (walk.nbytes > 0) {
		const u8 *src = walk.src.virt.addr;
		unsigned int nbytes = walk.nbytes;
		u8 *dst = walk.dst.virt.addr;
		u8 buf[AES_BLOCK_SIZE];

		/*
		 * If given less than 16 bytes, we must copy the partial block
		 * into a temporary buffer of 16 bytes to avoid out of bounds
		 * reads and writes. Furthermore, this code is somewhat unusual
		 * in that it expects the end of the data to be at the end of
		 * the temporary buffer, rather than the start of the data at
		 * the start of the temporary buffer.
		 */
		if (unlikely(nbytes < AES_BLOCK_SIZE))
			src = dst = memcpy(buf + sizeof(buf) - nbytes,
					   src, nbytes);
		else if (nbytes < walk.total)
			nbytes &= ~(AES_BLOCK_SIZE - 1);

		kernel_neon_begin();
		aes_xctr_encrypt(dst, src, ctx->key_enc, rounds, nbytes,
				 walk.iv, byte_ctr);
		kernel_neon_end();

		if (unlikely(nbytes < AES_BLOCK_SIZE))
			memcpy(walk.dst.virt.addr,
			       buf + sizeof(buf) - nbytes, nbytes);
		byte_ctr += nbytes;

		err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
	}

	return err;
}
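The end-of-buffer convention described in the comment above is easy to get
wrong, so here is a small userspace sketch of the same copy-in/copy-out pattern.
Everything in it (the xor_keystream() stand-in, the buffer contents) is
illustrative only and is not part of the kernel code being merged:

```c
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define BLOCK_SIZE 16

/* Stand-in for the real cipher: XOR with a fixed dummy keystream block. */
static void xor_keystream(uint8_t *dst, const uint8_t *src, size_t len)
{
	for (size_t i = 0; i < len; i++)
		dst[i] = src[i] ^ (uint8_t)(0xA5 + i);
}

/*
 * Process a message tail shorter than one block the way xctr_encrypt() does:
 * copy it to the END of a 16-byte scratch buffer, run the block routine over
 * the whole scratch buffer, then copy the same tail region back out.
 */
static void encrypt_partial_tail(uint8_t *out, const uint8_t *in, size_t nbytes)
{
	uint8_t buf[BLOCK_SIZE] = { 0 };

	/* End-aligned copy-in: the data ends at the end of the buffer. */
	memcpy(buf + sizeof(buf) - nbytes, in, nbytes);

	/* The block routine always sees a full 16-byte buffer. */
	xor_keystream(buf, buf, sizeof(buf));

	/* Copy only the meaningful tail back out. */
	memcpy(out, buf + sizeof(buf) - nbytes, nbytes);
}

int main(void)
{
	const uint8_t msg[5] = "tail";
	uint8_t ct[5];

	encrypt_partial_tail(ct, msg, sizeof(msg));
	for (size_t i = 0; i < sizeof(ct); i++)
		printf("%02x", ct[i]);
	printf("\n");
	return 0;
}
```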

static int __maybe_unused ctr_encrypt(struct skcipher_request *req)
{
	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
@@ -457,6 +509,14 @@ static int __maybe_unused ctr_encrypt(struct skcipher_request *req)
|
||||
u8 *dst = walk.dst.virt.addr;
|
||||
u8 buf[AES_BLOCK_SIZE];
|
||||
|
||||
/*
|
||||
* If given less than 16 bytes, we must copy the partial block
|
||||
* into a temporary buffer of 16 bytes to avoid out of bounds
|
||||
* reads and writes. Furthermore, this code is somewhat unusual
|
||||
* in that it expects the end of the data to be at the end of
|
||||
* the temporary buffer, rather than the start of the data at
|
||||
* the start of the temporary buffer.
|
||||
*/
|
||||
if (unlikely(nbytes < AES_BLOCK_SIZE))
|
||||
src = dst = memcpy(buf + sizeof(buf) - nbytes,
|
||||
src, nbytes);
|
||||
@@ -669,6 +729,22 @@ static struct skcipher_alg aes_algs[] = { {
	.setkey		= skcipher_aes_setkey,
	.encrypt	= ctr_encrypt,
	.decrypt	= ctr_encrypt,
}, {
	.base = {
		.cra_name		= "xctr(aes)",
		.cra_driver_name	= "xctr-aes-" MODE,
		.cra_priority		= PRIO,
		.cra_blocksize		= 1,
		.cra_ctxsize		= sizeof(struct crypto_aes_ctx),
		.cra_module		= THIS_MODULE,
	},
	.min_keysize	= AES_MIN_KEY_SIZE,
	.max_keysize	= AES_MAX_KEY_SIZE,
	.ivsize		= AES_BLOCK_SIZE,
	.chunksize	= AES_BLOCK_SIZE,
	.setkey		= skcipher_aes_setkey,
	.encrypt	= xctr_encrypt,
	.decrypt	= xctr_encrypt,
}, {
	.base = {
		.cra_name		= "xts(aes)",
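With the "xctr(aes)" entry above registered, other kernel code can request the
mode by name through the generic skcipher API. The function below is only a
schematic sketch of that usage (xctr_demo() and its calling context are
hypothetical, error handling is minimal, and buf must be a linearly mapped
kernel buffer); it is not part of this merge:

```c
#include <crypto/aes.h>
#include <crypto/skcipher.h>
#include <linux/crypto.h>
#include <linux/err.h>
#include <linux/gfp.h>
#include <linux/scatterlist.h>
#include <linux/types.h>

/* Encrypt 'len' bytes of 'buf' in place (decryption is the same operation). */
static int xctr_demo(const u8 *key, unsigned int keylen,
		     u8 *buf, unsigned int len, u8 iv[AES_BLOCK_SIZE])
{
	struct crypto_skcipher *tfm;
	struct skcipher_request *req;
	struct scatterlist sg;
	DECLARE_CRYPTO_WAIT(wait);
	int err;

	/* Picks the highest-priority "xctr(aes)" provider on this system. */
	tfm = crypto_alloc_skcipher("xctr(aes)", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	err = crypto_skcipher_setkey(tfm, key, keylen);
	if (err)
		goto out_free_tfm;

	req = skcipher_request_alloc(tfm, GFP_KERNEL);
	if (!req) {
		err = -ENOMEM;
		goto out_free_tfm;
	}

	sg_init_one(&sg, buf, len);
	skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP,
				      crypto_req_done, &wait);
	skcipher_request_set_crypt(req, &sg, &sg, len, iv);

	/* Wait for completion even if the chosen implementation is async. */
	err = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);

	skcipher_request_free(req);
out_free_tfm:
	crypto_free_skcipher(tfm);
	return err;
}
```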
@@ -318,127 +318,211 @@ AES_FUNC_END(aes_cbc_cts_decrypt)
|
||||
.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
|
||||
.previous
|
||||
|
||||
|
||||
/*
|
||||
* aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
|
||||
* int bytes, u8 ctr[])
|
||||
* This macro generates the code for CTR and XCTR mode.
|
||||
*/
|
||||
.macro ctr_encrypt xctr
|
||||
// Arguments
|
||||
OUT .req x0
|
||||
IN .req x1
|
||||
KEY .req x2
|
||||
ROUNDS_W .req w3
|
||||
BYTES_W .req w4
|
||||
IV .req x5
|
||||
BYTE_CTR_W .req w6 // XCTR only
|
||||
// Intermediate values
|
||||
CTR_W .req w11 // XCTR only
|
||||
CTR .req x11 // XCTR only
|
||||
IV_PART .req x12
|
||||
BLOCKS .req x13
|
||||
BLOCKS_W .req w13
|
||||
|
||||
AES_FUNC_START(aes_ctr_encrypt)
|
||||
stp x29, x30, [sp, #-16]!
|
||||
mov x29, sp
|
||||
|
||||
enc_prepare w3, x2, x12
|
||||
ld1 {vctr.16b}, [x5]
|
||||
enc_prepare ROUNDS_W, KEY, IV_PART
|
||||
ld1 {vctr.16b}, [IV]
|
||||
|
||||
umov x12, vctr.d[1] /* keep swabbed ctr in reg */
|
||||
rev x12, x12
|
||||
/*
|
||||
* Keep 64 bits of the IV in a register. For CTR mode this lets us
|
||||
* easily increment the IV. For XCTR mode this lets us efficiently XOR
|
||||
* the 64-bit counter with the IV.
|
||||
*/
|
||||
.if \xctr
|
||||
umov IV_PART, vctr.d[0]
|
||||
lsr CTR_W, BYTE_CTR_W, #4
|
||||
.else
|
||||
umov IV_PART, vctr.d[1]
|
||||
rev IV_PART, IV_PART
|
||||
.endif
|
||||
|
||||
.LctrloopNx:
|
||||
add w7, w4, #15
|
||||
sub w4, w4, #MAX_STRIDE << 4
|
||||
lsr w7, w7, #4
|
||||
.LctrloopNx\xctr:
|
||||
add BLOCKS_W, BYTES_W, #15
|
||||
sub BYTES_W, BYTES_W, #MAX_STRIDE << 4
|
||||
lsr BLOCKS_W, BLOCKS_W, #4
|
||||
mov w8, #MAX_STRIDE
|
||||
cmp w7, w8
|
||||
csel w7, w7, w8, lt
|
||||
adds x12, x12, x7
|
||||
cmp BLOCKS_W, w8
|
||||
csel BLOCKS_W, BLOCKS_W, w8, lt
|
||||
|
||||
/*
|
||||
* Set up the counter values in v0-v{MAX_STRIDE-1}.
|
||||
*
|
||||
* If we are encrypting less than MAX_STRIDE blocks, the tail block
|
||||
* handling code expects the last keystream block to be in
|
||||
* v{MAX_STRIDE-1}. For example: if encrypting two blocks with
|
||||
* MAX_STRIDE=5, then v3 and v4 should have the next two counter blocks.
|
||||
*/
|
||||
.if \xctr
|
||||
add CTR, CTR, BLOCKS
|
||||
.else
|
||||
adds IV_PART, IV_PART, BLOCKS
|
||||
.endif
|
||||
mov v0.16b, vctr.16b
|
||||
mov v1.16b, vctr.16b
|
||||
mov v2.16b, vctr.16b
|
||||
mov v3.16b, vctr.16b
|
||||
ST5( mov v4.16b, vctr.16b )
|
||||
bcs 0f
|
||||
.if \xctr
|
||||
sub x6, CTR, #MAX_STRIDE - 1
|
||||
sub x7, CTR, #MAX_STRIDE - 2
|
||||
sub x8, CTR, #MAX_STRIDE - 3
|
||||
sub x9, CTR, #MAX_STRIDE - 4
|
||||
ST5( sub x10, CTR, #MAX_STRIDE - 5 )
|
||||
eor x6, x6, IV_PART
|
||||
eor x7, x7, IV_PART
|
||||
eor x8, x8, IV_PART
|
||||
eor x9, x9, IV_PART
|
||||
ST5( eor x10, x10, IV_PART )
|
||||
mov v0.d[0], x6
|
||||
mov v1.d[0], x7
|
||||
mov v2.d[0], x8
|
||||
mov v3.d[0], x9
|
||||
ST5( mov v4.d[0], x10 )
|
||||
.else
|
||||
bcs 0f
|
||||
.subsection 1
|
||||
/*
|
||||
* This subsection handles carries.
|
||||
*
|
||||
* Conditional branching here is allowed with respect to time
|
||||
* invariance since the branches are dependent on the IV instead
|
||||
* of the plaintext or key. This code is rarely executed in
|
||||
* practice anyway.
|
||||
*/
|
||||
|
||||
.subsection 1
|
||||
/* apply carry to outgoing counter */
|
||||
0: umov x8, vctr.d[0]
|
||||
rev x8, x8
|
||||
add x8, x8, #1
|
||||
rev x8, x8
|
||||
ins vctr.d[0], x8
|
||||
/* Apply carry to outgoing counter. */
|
||||
0: umov x8, vctr.d[0]
|
||||
rev x8, x8
|
||||
add x8, x8, #1
|
||||
rev x8, x8
|
||||
ins vctr.d[0], x8
|
||||
|
||||
/* apply carry to N counter blocks for N := x12 */
|
||||
cbz x12, 2f
|
||||
adr x16, 1f
|
||||
sub x16, x16, x12, lsl #3
|
||||
br x16
|
||||
bti c
|
||||
mov v0.d[0], vctr.d[0]
|
||||
bti c
|
||||
mov v1.d[0], vctr.d[0]
|
||||
bti c
|
||||
mov v2.d[0], vctr.d[0]
|
||||
bti c
|
||||
mov v3.d[0], vctr.d[0]
|
||||
ST5( bti c )
|
||||
ST5( mov v4.d[0], vctr.d[0] )
|
||||
1: b 2f
|
||||
.previous
|
||||
/*
|
||||
* Apply carry to counter blocks if needed.
|
||||
*
|
||||
* Since the carry flag was set, we know 0 <= IV_PART <
|
||||
* MAX_STRIDE. Using the value of IV_PART we can determine how
|
||||
* many counter blocks need to be updated.
|
||||
*/
|
||||
cbz IV_PART, 2f
|
||||
adr x16, 1f
|
||||
sub x16, x16, IV_PART, lsl #3
|
||||
br x16
|
||||
bti c
|
||||
mov v0.d[0], vctr.d[0]
|
||||
bti c
|
||||
mov v1.d[0], vctr.d[0]
|
||||
bti c
|
||||
mov v2.d[0], vctr.d[0]
|
||||
bti c
|
||||
mov v3.d[0], vctr.d[0]
|
||||
ST5( bti c )
|
||||
ST5( mov v4.d[0], vctr.d[0] )
|
||||
1: b 2f
|
||||
.previous
|
||||
|
||||
2: rev x7, x12
|
||||
ins vctr.d[1], x7
|
||||
sub x7, x12, #MAX_STRIDE - 1
|
||||
sub x8, x12, #MAX_STRIDE - 2
|
||||
sub x9, x12, #MAX_STRIDE - 3
|
||||
rev x7, x7
|
||||
rev x8, x8
|
||||
mov v1.d[1], x7
|
||||
rev x9, x9
|
||||
ST5( sub x10, x12, #MAX_STRIDE - 4 )
|
||||
mov v2.d[1], x8
|
||||
ST5( rev x10, x10 )
|
||||
mov v3.d[1], x9
|
||||
ST5( mov v4.d[1], x10 )
|
||||
tbnz w4, #31, .Lctrtail
|
||||
ld1 {v5.16b-v7.16b}, [x1], #48
|
||||
2: rev x7, IV_PART
|
||||
ins vctr.d[1], x7
|
||||
sub x7, IV_PART, #MAX_STRIDE - 1
|
||||
sub x8, IV_PART, #MAX_STRIDE - 2
|
||||
sub x9, IV_PART, #MAX_STRIDE - 3
|
||||
rev x7, x7
|
||||
rev x8, x8
|
||||
mov v1.d[1], x7
|
||||
rev x9, x9
|
||||
ST5( sub x10, IV_PART, #MAX_STRIDE - 4 )
|
||||
mov v2.d[1], x8
|
||||
ST5( rev x10, x10 )
|
||||
mov v3.d[1], x9
|
||||
ST5( mov v4.d[1], x10 )
|
||||
.endif
|
||||
|
||||
/*
|
||||
* If there are at least MAX_STRIDE blocks left, XOR the data with
|
||||
* keystream and store. Otherwise jump to tail handling.
|
||||
*/
|
||||
tbnz BYTES_W, #31, .Lctrtail\xctr
|
||||
ld1 {v5.16b-v7.16b}, [IN], #48
|
||||
ST4( bl aes_encrypt_block4x )
|
||||
ST5( bl aes_encrypt_block5x )
|
||||
eor v0.16b, v5.16b, v0.16b
|
||||
ST4( ld1 {v5.16b}, [x1], #16 )
|
||||
ST4( ld1 {v5.16b}, [IN], #16 )
|
||||
eor v1.16b, v6.16b, v1.16b
|
||||
ST5( ld1 {v5.16b-v6.16b}, [x1], #32 )
|
||||
ST5( ld1 {v5.16b-v6.16b}, [IN], #32 )
|
||||
eor v2.16b, v7.16b, v2.16b
|
||||
eor v3.16b, v5.16b, v3.16b
|
||||
ST5( eor v4.16b, v6.16b, v4.16b )
|
||||
st1 {v0.16b-v3.16b}, [x0], #64
|
||||
ST5( st1 {v4.16b}, [x0], #16 )
|
||||
cbz w4, .Lctrout
|
||||
b .LctrloopNx
|
||||
st1 {v0.16b-v3.16b}, [OUT], #64
|
||||
ST5( st1 {v4.16b}, [OUT], #16 )
|
||||
cbz BYTES_W, .Lctrout\xctr
|
||||
b .LctrloopNx\xctr
|
||||
|
||||
.Lctrout:
|
||||
st1 {vctr.16b}, [x5] /* return next CTR value */
|
||||
.Lctrout\xctr:
|
||||
.if !\xctr
|
||||
st1 {vctr.16b}, [IV] /* return next CTR value */
|
||||
.endif
|
||||
ldp x29, x30, [sp], #16
|
||||
ret
|
||||
|
||||
.Lctrtail:
|
||||
/* XOR up to MAX_STRIDE * 16 - 1 bytes of in/output with v0 ... v3/v4 */
|
||||
.Lctrtail\xctr:
|
||||
/*
|
||||
* Handle up to MAX_STRIDE * 16 - 1 bytes of plaintext
|
||||
*
|
||||
* This code expects the last keystream block to be in v{MAX_STRIDE-1}.
|
||||
* For example: if encrypting two blocks with MAX_STRIDE=5, then v3 and
|
||||
* v4 should have the next two counter blocks.
|
||||
*
|
||||
* This allows us to store the ciphertext by writing to overlapping
|
||||
* regions of memory. Any invalid ciphertext blocks get overwritten by
|
||||
* correctly computed blocks. This approach greatly simplifies the
|
||||
* logic for storing the ciphertext.
|
||||
*/
|
||||
mov x16, #16
|
||||
ands x6, x4, #0xf
|
||||
csel x13, x6, x16, ne
|
||||
ands w7, BYTES_W, #0xf
|
||||
csel x13, x7, x16, ne
|
||||
|
||||
ST5( cmp w4, #64 - (MAX_STRIDE << 4) )
|
||||
ST5( cmp BYTES_W, #64 - (MAX_STRIDE << 4))
|
||||
ST5( csel x14, x16, xzr, gt )
|
||||
cmp w4, #48 - (MAX_STRIDE << 4)
|
||||
cmp BYTES_W, #48 - (MAX_STRIDE << 4)
|
||||
csel x15, x16, xzr, gt
|
||||
cmp w4, #32 - (MAX_STRIDE << 4)
|
||||
cmp BYTES_W, #32 - (MAX_STRIDE << 4)
|
||||
csel x16, x16, xzr, gt
|
||||
cmp w4, #16 - (MAX_STRIDE << 4)
|
||||
cmp BYTES_W, #16 - (MAX_STRIDE << 4)
|
||||
|
||||
adr_l x12, .Lcts_permute_table
|
||||
add x12, x12, x13
|
||||
ble .Lctrtail1x
|
||||
adr_l x9, .Lcts_permute_table
|
||||
add x9, x9, x13
|
||||
ble .Lctrtail1x\xctr
|
||||
|
||||
ST5( ld1 {v5.16b}, [x1], x14 )
|
||||
ld1 {v6.16b}, [x1], x15
|
||||
ld1 {v7.16b}, [x1], x16
|
||||
ST5( ld1 {v5.16b}, [IN], x14 )
|
||||
ld1 {v6.16b}, [IN], x15
|
||||
ld1 {v7.16b}, [IN], x16
|
||||
|
||||
ST4( bl aes_encrypt_block4x )
|
||||
ST5( bl aes_encrypt_block5x )
|
||||
|
||||
ld1 {v8.16b}, [x1], x13
|
||||
ld1 {v9.16b}, [x1]
|
||||
ld1 {v10.16b}, [x12]
|
||||
ld1 {v8.16b}, [IN], x13
|
||||
ld1 {v9.16b}, [IN]
|
||||
ld1 {v10.16b}, [x9]
|
||||
|
||||
ST4( eor v6.16b, v6.16b, v0.16b )
|
||||
ST4( eor v7.16b, v7.16b, v1.16b )
|
||||
@@ -453,32 +537,91 @@ ST5( eor v7.16b, v7.16b, v2.16b )
|
||||
ST5( eor v8.16b, v8.16b, v3.16b )
|
||||
ST5( eor v9.16b, v9.16b, v4.16b )
|
||||
|
||||
ST5( st1 {v5.16b}, [x0], x14 )
|
||||
st1 {v6.16b}, [x0], x15
|
||||
st1 {v7.16b}, [x0], x16
|
||||
add x13, x13, x0
|
||||
ST5( st1 {v5.16b}, [OUT], x14 )
|
||||
st1 {v6.16b}, [OUT], x15
|
||||
st1 {v7.16b}, [OUT], x16
|
||||
add x13, x13, OUT
|
||||
st1 {v9.16b}, [x13] // overlapping stores
|
||||
st1 {v8.16b}, [x0]
|
||||
b .Lctrout
|
||||
st1 {v8.16b}, [OUT]
|
||||
b .Lctrout\xctr
|
||||
|
||||
.Lctrtail1x:
|
||||
sub x7, x6, #16
|
||||
csel x6, x6, x7, eq
|
||||
add x1, x1, x6
|
||||
add x0, x0, x6
|
||||
ld1 {v5.16b}, [x1]
|
||||
ld1 {v6.16b}, [x0]
|
||||
.Lctrtail1x\xctr:
|
||||
/*
|
||||
* Handle <= 16 bytes of plaintext
|
||||
*
|
||||
* This code always reads and writes 16 bytes. To avoid out of bounds
|
||||
* accesses, XCTR and CTR modes must use a temporary buffer when
|
||||
* encrypting/decrypting less than 16 bytes.
|
||||
*
|
||||
* This code is unusual in that it loads the input and stores the output
|
||||
* relative to the end of the buffers rather than relative to the start.
|
||||
* This causes unusual behaviour when encrypting/decrypting less than 16
|
||||
* bytes; the end of the data is expected to be at the end of the
|
||||
* temporary buffer rather than the start of the data being at the start
|
||||
* of the temporary buffer.
|
||||
*/
|
||||
sub x8, x7, #16
|
||||
csel x7, x7, x8, eq
|
||||
add IN, IN, x7
|
||||
add OUT, OUT, x7
|
||||
ld1 {v5.16b}, [IN]
|
||||
ld1 {v6.16b}, [OUT]
|
||||
ST5( mov v3.16b, v4.16b )
|
||||
encrypt_block v3, w3, x2, x8, w7
|
||||
ld1 {v10.16b-v11.16b}, [x12]
|
||||
encrypt_block v3, ROUNDS_W, KEY, x8, w7
|
||||
ld1 {v10.16b-v11.16b}, [x9]
|
||||
tbl v3.16b, {v3.16b}, v10.16b
|
||||
sshr v11.16b, v11.16b, #7
|
||||
eor v5.16b, v5.16b, v3.16b
|
||||
bif v5.16b, v6.16b, v11.16b
|
||||
st1 {v5.16b}, [x0]
|
||||
b .Lctrout
|
||||
st1 {v5.16b}, [OUT]
|
||||
b .Lctrout\xctr
|
||||
|
||||
// Arguments
|
||||
.unreq OUT
|
||||
.unreq IN
|
||||
.unreq KEY
|
||||
.unreq ROUNDS_W
|
||||
.unreq BYTES_W
|
||||
.unreq IV
|
||||
.unreq BYTE_CTR_W // XCTR only
|
||||
// Intermediate values
|
||||
.unreq CTR_W // XCTR only
|
||||
.unreq CTR // XCTR only
|
||||
.unreq IV_PART
|
||||
.unreq BLOCKS
|
||||
.unreq BLOCKS_W
|
||||
.endm
|
||||
|
||||
/*
|
||||
* aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
|
||||
* int bytes, u8 ctr[])
|
||||
*
|
||||
* The input and output buffers must always be at least 16 bytes even if
|
||||
* encrypting/decrypting less than 16 bytes. Otherwise out of bounds
|
||||
* accesses will occur. The data to be encrypted/decrypted is expected
|
||||
* to be at the end of this 16-byte temporary buffer rather than the
|
||||
* start.
|
||||
*/
|
||||
|
||||
AES_FUNC_START(aes_ctr_encrypt)
|
||||
ctr_encrypt 0
|
||||
AES_FUNC_END(aes_ctr_encrypt)
|
||||
|
||||
/*
|
||||
* aes_xctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
|
||||
* int bytes, u8 const iv[], int byte_ctr)
|
||||
*
|
||||
* The input and output buffers must always be at least 16 bytes even if
|
||||
* encrypting/decrypting less than 16 bytes. Otherwise out of bounds
|
||||
* accesses will occur. The data to be encrypted/decrypted is expected
|
||||
* to be at the end of this 16-byte temporary buffer rather than the
|
||||
* start.
|
||||
*/
|
||||
|
||||
AES_FUNC_START(aes_xctr_encrypt)
|
||||
ctr_encrypt 1
|
||||
AES_FUNC_END(aes_xctr_encrypt)
|
||||
|
||||
|
||||
/*
|
||||
* aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
|
||||
|
||||
@@ -66,7 +66,7 @@
	prepare	crypto_aes_inv_sbox, .LReverse_ShiftRows, \temp
	.endm

	/* apply SubBytes transformation using the the preloaded Sbox */
	/* apply SubBytes transformation using the preloaded Sbox */
	.macro	sub_bytes, in
	sub	v9.16b, \in\().16b, v15.16b
	tbl	\in\().16b, {v16.16b-v19.16b}, \in\().16b
@@ -52,7 +52,7 @@ static void neon_poly1305_blocks(struct poly1305_desc_ctx *dctx, const u8 *src,
{
	if (unlikely(!dctx->sset)) {
		if (!dctx->rset) {
			poly1305_init_arch(dctx, src);
			poly1305_init_arm64(&dctx->h, src);
			src += POLY1305_BLOCK_SIZE;
			len -= POLY1305_BLOCK_SIZE;
			dctx->rset = 1;
arch/arm64/crypto/polyval-ce-core.S (new file, 361 lines)
@@ -0,0 +1,361 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Implementation of POLYVAL using ARMv8 Crypto Extensions.
 *
 * Copyright 2021 Google LLC
 */
/*
 * This is an efficient implementation of POLYVAL using ARMv8 Crypto Extensions
 * It works on 8 blocks at a time, by precomputing the first 8 keys powers h^8,
 * ..., h^1 in the POLYVAL finite field. This precomputation allows us to split
 * finite field multiplication into two steps.
 *
 * In the first step, we consider h^i, m_i as normal polynomials of degree less
 * than 128. We then compute p(x) = h^8m_0 + ... + h^1m_7 where multiplication
 * is simply polynomial multiplication.
 *
 * In the second step, we compute the reduction of p(x) modulo the finite field
 * modulus g(x) = x^128 + x^127 + x^126 + x^121 + 1.
 *
 * This two step process is equivalent to computing h^8m_0 + ... + h^1m_7 where
 * multiplication is finite field multiplication. The advantage is that the
 * two-step process only requires 1 finite field reduction for every 8
 * polynomial multiplications. Further parallelism is gained by interleaving the
 * multiplications and polynomial reductions.
 */
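The karatsuba1/karatsuba2 macros later in this file rely on the standard
identity for splitting one 128x128-bit carry-less multiplication into three
64x64-bit ones. As a plain-C cross-check of that recombination (an illustrative
userspace sketch, not kernel code; the helper names are made up), the Karatsuba
result can be compared against a schoolbook product:

```c
#include <stdint.h>
#include <stdio.h>

/* Carry-less (GF(2)[x]) multiply of two 64-bit polynomials -> 128-bit product. */
static void clmul64(uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo)
{
	uint64_t h = 0, l = 0;

	for (int i = 0; i < 64; i++) {
		if ((b >> i) & 1) {
			l ^= a << i;
			if (i)
				h ^= a >> (64 - i);
		}
	}
	*hi = h;
	*lo = l;
}

/* r[0..3] = 256-bit product of X = x1:x0 and Y = y1:y0, Karatsuba style. */
static void karatsuba128(uint64_t x1, uint64_t x0, uint64_t y1, uint64_t y0,
			 uint64_t r[4])
{
	uint64_t lo1, lo0, hi1, hi0, mi1, mi0;

	clmul64(x0, y0, &lo1, &lo0);		/* LO = X_0 * Y_0           */
	clmul64(x1, y1, &hi1, &hi0);		/* HI = X_1 * Y_1           */
	clmul64(x0 ^ x1, y0 ^ y1, &mi1, &mi0);	/* MI = (X_0+X_1)*(Y_0+Y_1) */

	/* [HI_1 : HI_0+HI_1+MI_1+LO_1 : LO_1+HI_0+MI_0+LO_0 : LO_0] */
	r[0] = lo0;
	r[1] = lo1 ^ hi0 ^ mi0 ^ lo0;
	r[2] = hi0 ^ hi1 ^ mi1 ^ lo1;
	r[3] = hi1;
}

/* Same product, schoolbook: X1*Y1*x^128 + (X1*Y0 + X0*Y1)*x^64 + X0*Y0. */
static void schoolbook128(uint64_t x1, uint64_t x0, uint64_t y1, uint64_t y0,
			  uint64_t r[4])
{
	uint64_t h, l;

	r[0] = r[1] = r[2] = r[3] = 0;
	clmul64(x0, y0, &h, &l); r[0] ^= l; r[1] ^= h;
	clmul64(x1, y0, &h, &l); r[1] ^= l; r[2] ^= h;
	clmul64(x0, y1, &h, &l); r[1] ^= l; r[2] ^= h;
	clmul64(x1, y1, &h, &l); r[2] ^= l; r[3] ^= h;
}

int main(void)
{
	uint64_t a[4], b[4];

	karatsuba128(0x0123456789abcdefULL, 0xfedcba9876543210ULL,
		     0xdeadbeefcafebabeULL, 0x0f1e2d3c4b5a6978ULL, a);
	schoolbook128(0x0123456789abcdefULL, 0xfedcba9876543210ULL,
		      0xdeadbeefcafebabeULL, 0x0f1e2d3c4b5a6978ULL, b);

	for (int i = 0; i < 4; i++)
		printf("%016llx %016llx %s\n",
		       (unsigned long long)a[i], (unsigned long long)b[i],
		       a[i] == b[i] ? "ok" : "MISMATCH");
	return 0;
}
```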
#include <linux/linkage.h>
|
||||
#define STRIDE_BLOCKS 8
|
||||
|
||||
KEY_POWERS .req x0
|
||||
MSG .req x1
|
||||
BLOCKS_LEFT .req x2
|
||||
ACCUMULATOR .req x3
|
||||
KEY_START .req x10
|
||||
EXTRA_BYTES .req x11
|
||||
TMP .req x13
|
||||
|
||||
M0 .req v0
|
||||
M1 .req v1
|
||||
M2 .req v2
|
||||
M3 .req v3
|
||||
M4 .req v4
|
||||
M5 .req v5
|
||||
M6 .req v6
|
||||
M7 .req v7
|
||||
KEY8 .req v8
|
||||
KEY7 .req v9
|
||||
KEY6 .req v10
|
||||
KEY5 .req v11
|
||||
KEY4 .req v12
|
||||
KEY3 .req v13
|
||||
KEY2 .req v14
|
||||
KEY1 .req v15
|
||||
PL .req v16
|
||||
PH .req v17
|
||||
TMP_V .req v18
|
||||
LO .req v20
|
||||
MI .req v21
|
||||
HI .req v22
|
||||
SUM .req v23
|
||||
GSTAR .req v24
|
||||
|
||||
.text
|
||||
|
||||
.arch armv8-a+crypto
|
||||
.align 4
|
||||
|
||||
.Lgstar:
|
||||
.quad 0xc200000000000000, 0xc200000000000000
|
||||
|
||||
/*
|
||||
* Computes the product of two 128-bit polynomials in X and Y and XORs the
|
||||
* components of the 256-bit product into LO, MI, HI.
|
||||
*
|
||||
* Given:
|
||||
* X = [X_1 : X_0]
|
||||
* Y = [Y_1 : Y_0]
|
||||
*
|
||||
* We compute:
|
||||
* LO += X_0 * Y_0
|
||||
* MI += (X_0 + X_1) * (Y_0 + Y_1)
|
||||
* HI += X_1 * Y_1
|
||||
*
|
||||
* Later, the 256-bit result can be extracted as:
|
||||
* [HI_1 : HI_0 + HI_1 + MI_1 + LO_1 : LO_1 + HI_0 + MI_0 + LO_0 : LO_0]
|
||||
* This step is done when computing the polynomial reduction for efficiency
|
||||
* reasons.
|
||||
*
|
||||
* Karatsuba multiplication is used instead of Schoolbook multiplication because
|
||||
* it was found to be slightly faster on ARM64 CPUs.
|
||||
*
|
||||
*/
|
||||
.macro karatsuba1 X Y
|
||||
X .req \X
|
||||
Y .req \Y
|
||||
ext v25.16b, X.16b, X.16b, #8
|
||||
ext v26.16b, Y.16b, Y.16b, #8
|
||||
eor v25.16b, v25.16b, X.16b
|
||||
eor v26.16b, v26.16b, Y.16b
|
||||
pmull2 v28.1q, X.2d, Y.2d
|
||||
pmull v29.1q, X.1d, Y.1d
|
||||
pmull v27.1q, v25.1d, v26.1d
|
||||
eor HI.16b, HI.16b, v28.16b
|
||||
eor LO.16b, LO.16b, v29.16b
|
||||
eor MI.16b, MI.16b, v27.16b
|
||||
.unreq X
|
||||
.unreq Y
|
||||
.endm
|
||||
|
||||
/*
|
||||
* Same as karatsuba1, except overwrites HI, LO, MI rather than XORing into
|
||||
* them.
|
||||
*/
|
||||
.macro karatsuba1_store X Y
|
||||
X .req \X
|
||||
Y .req \Y
|
||||
ext v25.16b, X.16b, X.16b, #8
|
||||
ext v26.16b, Y.16b, Y.16b, #8
|
||||
eor v25.16b, v25.16b, X.16b
|
||||
eor v26.16b, v26.16b, Y.16b
|
||||
pmull2 HI.1q, X.2d, Y.2d
|
||||
pmull LO.1q, X.1d, Y.1d
|
||||
pmull MI.1q, v25.1d, v26.1d
|
||||
.unreq X
|
||||
.unreq Y
|
||||
.endm
|
||||
|
||||
/*
|
||||
* Computes the 256-bit polynomial represented by LO, HI, MI. Stores
|
||||
* the result in PL, PH.
|
||||
* [PH : PL] =
|
||||
* [HI_1 : HI_1 + HI_0 + MI_1 + LO_1 : HI_0 + MI_0 + LO_1 + LO_0 : LO_0]
|
||||
*/
|
||||
.macro karatsuba2
|
||||
// v4 = [HI_1 + MI_1 : HI_0 + MI_0]
|
||||
eor v4.16b, HI.16b, MI.16b
|
||||
// v4 = [HI_1 + MI_1 + LO_1 : HI_0 + MI_0 + LO_0]
|
||||
eor v4.16b, v4.16b, LO.16b
|
||||
// v5 = [HI_0 : LO_1]
|
||||
ext v5.16b, LO.16b, HI.16b, #8
|
||||
// v4 = [HI_1 + HI_0 + MI_1 + LO_1 : HI_0 + MI_0 + LO_1 + LO_0]
|
||||
eor v4.16b, v4.16b, v5.16b
|
||||
// HI = [HI_0 : HI_1]
|
||||
ext HI.16b, HI.16b, HI.16b, #8
|
||||
// LO = [LO_0 : LO_1]
|
||||
ext LO.16b, LO.16b, LO.16b, #8
|
||||
// PH = [HI_1 : HI_1 + HI_0 + MI_1 + LO_1]
|
||||
ext PH.16b, v4.16b, HI.16b, #8
|
||||
// PL = [HI_0 + MI_0 + LO_1 + LO_0 : LO_0]
|
||||
ext PL.16b, LO.16b, v4.16b, #8
|
||||
.endm
|
||||
|
||||
/*
|
||||
* Computes the 128-bit reduction of PH : PL. Stores the result in dest.
|
||||
*
|
||||
* This macro computes p(x) mod g(x) where p(x) is in montgomery form and g(x) =
|
||||
* x^128 + x^127 + x^126 + x^121 + 1.
|
||||
*
|
||||
* We have a 256-bit polynomial PH : PL = P_3 : P_2 : P_1 : P_0 that is the
|
||||
* product of two 128-bit polynomials in Montgomery form. We need to reduce it
|
||||
* mod g(x). Also, since polynomials in Montgomery form have an "extra" factor
|
||||
* of x^128, this product has two extra factors of x^128. To get it back into
|
||||
* Montgomery form, we need to remove one of these factors by dividing by x^128.
|
||||
*
|
||||
* To accomplish both of these goals, we add multiples of g(x) that cancel out
|
||||
* the low 128 bits P_1 : P_0, leaving just the high 128 bits. Since the low
|
||||
* bits are zero, the polynomial division by x^128 can be done by right
|
||||
* shifting.
|
||||
*
|
||||
* Since the only nonzero term in the low 64 bits of g(x) is the constant term,
|
||||
* the multiple of g(x) needed to cancel out P_0 is P_0 * g(x). The CPU can
|
||||
* only do 64x64 bit multiplications, so split P_0 * g(x) into x^128 * P_0 +
|
||||
* x^64 * g*(x) * P_0 + P_0, where g*(x) is bits 64-127 of g(x). Adding this to
|
||||
* the original polynomial gives P_3 : P_2 + P_0 + T_1 : P_1 + T_0 : 0, where T
|
||||
* = T_1 : T_0 = g*(x) * P_0. Thus, bits 0-63 got "folded" into bits 64-191.
|
||||
*
|
||||
* Repeating this same process on the next 64 bits "folds" bits 64-127 into bits
|
||||
* 128-255, giving the answer in bits 128-255. This time, we need to cancel P_1
|
||||
* + T_0 in bits 64-127. The multiple of g(x) required is (P_1 + T_0) * g(x) *
|
||||
* x^64. Adding this to our previous computation gives P_3 + P_1 + T_0 + V_1 :
|
||||
* P_2 + P_0 + T_1 + V_0 : 0 : 0, where V = V_1 : V_0 = g*(x) * (P_1 + T_0).
|
||||
*
|
||||
* So our final computation is:
|
||||
* T = T_1 : T_0 = g*(x) * P_0
|
||||
* V = V_1 : V_0 = g*(x) * (P_1 + T_0)
|
||||
* p(x) / x^{128} mod g(x) = P_3 + P_1 + T_0 + V_1 : P_2 + P_0 + T_1 + V_0
|
||||
*
|
||||
* The implementation below saves a XOR instruction by computing P_1 + T_0 : P_0
|
||||
* + T_1 and XORing into dest, rather than separately XORing P_1 : P_0 and T_0 :
|
||||
* T_1 into dest. This allows us to reuse P_1 + T_0 when computing V.
|
||||
*/
|
||||
.macro montgomery_reduction dest
|
||||
DEST .req \dest
|
||||
// TMP_V = T_1 : T_0 = P_0 * g*(x)
|
||||
pmull TMP_V.1q, PL.1d, GSTAR.1d
|
||||
// TMP_V = T_0 : T_1
|
||||
ext TMP_V.16b, TMP_V.16b, TMP_V.16b, #8
|
||||
// TMP_V = P_1 + T_0 : P_0 + T_1
|
||||
eor TMP_V.16b, PL.16b, TMP_V.16b
|
||||
// PH = P_3 + P_1 + T_0 : P_2 + P_0 + T_1
|
||||
eor PH.16b, PH.16b, TMP_V.16b
|
||||
// TMP_V = V_1 : V_0 = (P_1 + T_0) * g*(x)
|
||||
pmull2 TMP_V.1q, TMP_V.2d, GSTAR.2d
|
||||
eor DEST.16b, PH.16b, TMP_V.16b
|
||||
.unreq DEST
|
||||
.endm
|
||||
|
||||
/*
|
||||
* Compute Polyval on 8 blocks.
|
||||
*
|
||||
* If reduce is set, also computes the montgomery reduction of the
|
||||
* previous full_stride call and XORs with the first message block.
|
||||
* (m_0 + REDUCE(PL, PH))h^8 + ... + m_7h^1.
|
||||
* I.e., the first multiplication uses m_0 + REDUCE(PL, PH) instead of m_0.
|
||||
*
|
||||
* Sets PL, PH.
|
||||
*/
|
||||
.macro full_stride reduce
|
||||
eor LO.16b, LO.16b, LO.16b
|
||||
eor MI.16b, MI.16b, MI.16b
|
||||
eor HI.16b, HI.16b, HI.16b
|
||||
|
||||
ld1 {M0.16b, M1.16b, M2.16b, M3.16b}, [MSG], #64
|
||||
ld1 {M4.16b, M5.16b, M6.16b, M7.16b}, [MSG], #64
|
||||
|
||||
karatsuba1 M7 KEY1
|
||||
.if \reduce
|
||||
pmull TMP_V.1q, PL.1d, GSTAR.1d
|
||||
.endif
|
||||
|
||||
karatsuba1 M6 KEY2
|
||||
.if \reduce
|
||||
ext TMP_V.16b, TMP_V.16b, TMP_V.16b, #8
|
||||
.endif
|
||||
|
||||
karatsuba1 M5 KEY3
|
||||
.if \reduce
|
||||
eor TMP_V.16b, PL.16b, TMP_V.16b
|
||||
.endif
|
||||
|
||||
karatsuba1 M4 KEY4
|
||||
.if \reduce
|
||||
eor PH.16b, PH.16b, TMP_V.16b
|
||||
.endif
|
||||
|
||||
karatsuba1 M3 KEY5
|
||||
.if \reduce
|
||||
pmull2 TMP_V.1q, TMP_V.2d, GSTAR.2d
|
||||
.endif
|
||||
|
||||
karatsuba1 M2 KEY6
|
||||
.if \reduce
|
||||
eor SUM.16b, PH.16b, TMP_V.16b
|
||||
.endif
|
||||
|
||||
karatsuba1 M1 KEY7
|
||||
eor M0.16b, M0.16b, SUM.16b
|
||||
|
||||
karatsuba1 M0 KEY8
|
||||
karatsuba2
|
||||
.endm
|
||||
|
||||
/*
|
||||
* Handle any extra blocks after full_stride loop.
|
||||
*/
|
||||
.macro partial_stride
|
||||
add KEY_POWERS, KEY_START, #(STRIDE_BLOCKS << 4)
|
||||
sub KEY_POWERS, KEY_POWERS, BLOCKS_LEFT, lsl #4
|
||||
ld1 {KEY1.16b}, [KEY_POWERS], #16
|
||||
|
||||
ld1 {TMP_V.16b}, [MSG], #16
|
||||
eor SUM.16b, SUM.16b, TMP_V.16b
|
||||
karatsuba1_store KEY1 SUM
|
||||
sub BLOCKS_LEFT, BLOCKS_LEFT, #1
|
||||
|
||||
tst BLOCKS_LEFT, #4
|
||||
beq .Lpartial4BlocksDone
|
||||
ld1 {M0.16b, M1.16b, M2.16b, M3.16b}, [MSG], #64
|
||||
ld1 {KEY8.16b, KEY7.16b, KEY6.16b, KEY5.16b}, [KEY_POWERS], #64
|
||||
karatsuba1 M0 KEY8
|
||||
karatsuba1 M1 KEY7
|
||||
karatsuba1 M2 KEY6
|
||||
karatsuba1 M3 KEY5
|
||||
.Lpartial4BlocksDone:
|
||||
tst BLOCKS_LEFT, #2
|
||||
beq .Lpartial2BlocksDone
|
||||
ld1 {M0.16b, M1.16b}, [MSG], #32
|
||||
ld1 {KEY8.16b, KEY7.16b}, [KEY_POWERS], #32
|
||||
karatsuba1 M0 KEY8
|
||||
karatsuba1 M1 KEY7
|
||||
.Lpartial2BlocksDone:
|
||||
tst BLOCKS_LEFT, #1
|
||||
beq .LpartialDone
|
||||
ld1 {M0.16b}, [MSG], #16
|
||||
ld1 {KEY8.16b}, [KEY_POWERS], #16
|
||||
karatsuba1 M0 KEY8
|
||||
.LpartialDone:
|
||||
karatsuba2
|
||||
montgomery_reduction SUM
|
||||
.endm
|
||||
|
||||
/*
|
||||
* Perform montgomery multiplication in GF(2^128) and store result in op1.
|
||||
*
|
||||
* Computes op1*op2*x^{-128} mod x^128 + x^127 + x^126 + x^121 + 1
|
||||
* If op1, op2 are in montgomery form, this computes the montgomery
|
||||
* form of op1*op2.
|
||||
*
|
||||
* void pmull_polyval_mul(u8 *op1, const u8 *op2);
|
||||
*/
|
||||
SYM_FUNC_START(pmull_polyval_mul)
|
||||
adr TMP, .Lgstar
|
||||
ld1 {GSTAR.2d}, [TMP]
|
||||
ld1 {v0.16b}, [x0]
|
||||
ld1 {v1.16b}, [x1]
|
||||
karatsuba1_store v0 v1
|
||||
karatsuba2
|
||||
montgomery_reduction SUM
|
||||
st1 {SUM.16b}, [x0]
|
||||
ret
|
||||
SYM_FUNC_END(pmull_polyval_mul)
|
||||
|
||||
/*
|
||||
* Perform polynomial evaluation as specified by POLYVAL. This computes:
|
||||
* h^n * accumulator + h^n * m_0 + ... + h^1 * m_{n-1}
|
||||
* where n=nblocks, h is the hash key, and m_i are the message blocks.
|
||||
*
|
||||
* x0 - pointer to precomputed key powers h^8 ... h^1
|
||||
* x1 - pointer to message blocks
|
||||
* x2 - number of blocks to hash
|
||||
* x3 - pointer to accumulator
|
||||
*
|
||||
* void pmull_polyval_update(const struct polyval_ctx *ctx, const u8 *in,
|
||||
* size_t nblocks, u8 *accumulator);
|
||||
*/
|
||||
SYM_FUNC_START(pmull_polyval_update)
|
||||
adr TMP, .Lgstar
|
||||
mov KEY_START, KEY_POWERS
|
||||
ld1 {GSTAR.2d}, [TMP]
|
||||
ld1 {SUM.16b}, [ACCUMULATOR]
|
||||
subs BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS
|
||||
blt .LstrideLoopExit
|
||||
ld1 {KEY8.16b, KEY7.16b, KEY6.16b, KEY5.16b}, [KEY_POWERS], #64
|
||||
ld1 {KEY4.16b, KEY3.16b, KEY2.16b, KEY1.16b}, [KEY_POWERS], #64
|
||||
full_stride 0
|
||||
subs BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS
|
||||
blt .LstrideLoopExitReduce
|
||||
.LstrideLoop:
|
||||
full_stride 1
|
||||
subs BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS
|
||||
bge .LstrideLoop
|
||||
.LstrideLoopExitReduce:
|
||||
montgomery_reduction SUM
|
||||
.LstrideLoopExit:
|
||||
adds BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS
|
||||
beq .LskipPartial
|
||||
partial_stride
|
||||
.LskipPartial:
|
||||
st1 {SUM.16b}, [ACCUMULATOR]
|
||||
ret
|
||||
SYM_FUNC_END(pmull_polyval_update)
|
||||
arch/arm64/crypto/polyval-ce-glue.c (new file, 191 lines)
@@ -0,0 +1,191 @@
|
||||
// SPDX-License-Identifier: GPL-2.0-only
|
||||
/*
|
||||
* Glue code for POLYVAL using ARMv8 Crypto Extensions
|
||||
*
|
||||
* Copyright (c) 2007 Nokia Siemens Networks - Mikko Herranen <mh1@iki.fi>
|
||||
* Copyright (c) 2009 Intel Corp.
|
||||
* Author: Huang Ying <ying.huang@intel.com>
|
||||
* Copyright 2021 Google LLC
|
||||
*/
|
||||
|
||||
/*
|
||||
* Glue code based on ghash-clmulni-intel_glue.c.
|
||||
*
|
||||
* This implementation of POLYVAL uses montgomery multiplication accelerated by
|
||||
* ARMv8 Crypto Extensions instructions to implement the finite field operations.
|
||||
*/
|
||||
|
||||
#include <crypto/algapi.h>
|
||||
#include <crypto/internal/hash.h>
|
||||
#include <crypto/internal/simd.h>
|
||||
#include <crypto/polyval.h>
|
||||
#include <linux/crypto.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/cpufeature.h>
|
||||
#include <asm/neon.h>
|
||||
#include <asm/simd.h>
|
||||
|
||||
#define NUM_KEY_POWERS 8
|
||||
|
||||
struct polyval_tfm_ctx {
|
||||
/*
|
||||
* These powers must be in the order h^8, ..., h^1.
|
||||
*/
|
||||
u8 key_powers[NUM_KEY_POWERS][POLYVAL_BLOCK_SIZE];
|
||||
};
|
||||
|
||||
struct polyval_desc_ctx {
|
||||
u8 buffer[POLYVAL_BLOCK_SIZE];
|
||||
u32 bytes;
|
||||
};
|
||||
|
||||
asmlinkage void pmull_polyval_update(const struct polyval_tfm_ctx *keys,
|
||||
const u8 *in, size_t nblocks, u8 *accumulator);
|
||||
asmlinkage void pmull_polyval_mul(u8 *op1, const u8 *op2);
|
||||
|
||||
static void internal_polyval_update(const struct polyval_tfm_ctx *keys,
|
||||
const u8 *in, size_t nblocks, u8 *accumulator)
|
||||
{
|
||||
if (likely(crypto_simd_usable())) {
|
||||
kernel_neon_begin();
|
||||
pmull_polyval_update(keys, in, nblocks, accumulator);
|
||||
kernel_neon_end();
|
||||
} else {
|
||||
polyval_update_non4k(keys->key_powers[NUM_KEY_POWERS-1], in,
|
||||
nblocks, accumulator);
|
||||
}
|
||||
}
|
||||
|
||||
static void internal_polyval_mul(u8 *op1, const u8 *op2)
|
||||
{
|
||||
if (likely(crypto_simd_usable())) {
|
||||
kernel_neon_begin();
|
||||
pmull_polyval_mul(op1, op2);
|
||||
kernel_neon_end();
|
||||
} else {
|
||||
polyval_mul_non4k(op1, op2);
|
||||
}
|
||||
}
|
||||
|
||||
static int polyval_arm64_setkey(struct crypto_shash *tfm,
|
||||
const u8 *key, unsigned int keylen)
|
||||
{
|
||||
struct polyval_tfm_ctx *tctx = crypto_shash_ctx(tfm);
|
||||
int i;
|
||||
|
||||
if (keylen != POLYVAL_BLOCK_SIZE)
|
||||
return -EINVAL;
|
||||
|
||||
memcpy(tctx->key_powers[NUM_KEY_POWERS-1], key, POLYVAL_BLOCK_SIZE);
|
||||
|
||||
for (i = NUM_KEY_POWERS-2; i >= 0; i--) {
|
||||
memcpy(tctx->key_powers[i], key, POLYVAL_BLOCK_SIZE);
|
||||
internal_polyval_mul(tctx->key_powers[i],
|
||||
tctx->key_powers[i+1]);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int polyval_arm64_init(struct shash_desc *desc)
|
||||
{
|
||||
struct polyval_desc_ctx *dctx = shash_desc_ctx(desc);
|
||||
|
||||
memset(dctx, 0, sizeof(*dctx));
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int polyval_arm64_update(struct shash_desc *desc,
|
||||
const u8 *src, unsigned int srclen)
|
||||
{
|
||||
struct polyval_desc_ctx *dctx = shash_desc_ctx(desc);
|
||||
const struct polyval_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
|
||||
u8 *pos;
|
||||
unsigned int nblocks;
|
||||
unsigned int n;
|
||||
|
||||
if (dctx->bytes) {
|
||||
n = min(srclen, dctx->bytes);
|
||||
pos = dctx->buffer + POLYVAL_BLOCK_SIZE - dctx->bytes;
|
||||
|
||||
dctx->bytes -= n;
|
||||
srclen -= n;
|
||||
|
||||
while (n--)
|
||||
*pos++ ^= *src++;
|
||||
|
||||
if (!dctx->bytes)
|
||||
internal_polyval_mul(dctx->buffer,
|
||||
tctx->key_powers[NUM_KEY_POWERS-1]);
|
||||
}
|
||||
|
||||
while (srclen >= POLYVAL_BLOCK_SIZE) {
|
||||
/* allow rescheduling every 4K bytes */
|
||||
nblocks = min(srclen, 4096U) / POLYVAL_BLOCK_SIZE;
|
||||
internal_polyval_update(tctx, src, nblocks, dctx->buffer);
|
||||
srclen -= nblocks * POLYVAL_BLOCK_SIZE;
|
||||
src += nblocks * POLYVAL_BLOCK_SIZE;
|
||||
}
|
||||
|
||||
if (srclen) {
|
||||
dctx->bytes = POLYVAL_BLOCK_SIZE - srclen;
|
||||
pos = dctx->buffer;
|
||||
while (srclen--)
|
||||
*pos++ ^= *src++;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int polyval_arm64_final(struct shash_desc *desc, u8 *dst)
|
||||
{
|
||||
struct polyval_desc_ctx *dctx = shash_desc_ctx(desc);
|
||||
const struct polyval_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
|
||||
|
||||
if (dctx->bytes) {
|
||||
internal_polyval_mul(dctx->buffer,
|
||||
tctx->key_powers[NUM_KEY_POWERS-1]);
|
||||
}
|
||||
|
||||
memcpy(dst, dctx->buffer, POLYVAL_BLOCK_SIZE);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct shash_alg polyval_alg = {
|
||||
.digestsize = POLYVAL_DIGEST_SIZE,
|
||||
.init = polyval_arm64_init,
|
||||
.update = polyval_arm64_update,
|
||||
.final = polyval_arm64_final,
|
||||
.setkey = polyval_arm64_setkey,
|
||||
.descsize = sizeof(struct polyval_desc_ctx),
|
||||
.base = {
|
||||
.cra_name = "polyval",
|
||||
.cra_driver_name = "polyval-ce",
|
||||
.cra_priority = 200,
|
||||
.cra_blocksize = POLYVAL_BLOCK_SIZE,
|
||||
.cra_ctxsize = sizeof(struct polyval_tfm_ctx),
|
||||
.cra_module = THIS_MODULE,
|
||||
},
|
||||
};
|
||||
|
||||
static int __init polyval_ce_mod_init(void)
|
||||
{
|
||||
return crypto_register_shash(&polyval_alg);
|
||||
}
|
||||
|
||||
static void __exit polyval_ce_mod_exit(void)
|
||||
{
|
||||
crypto_unregister_shash(&polyval_alg);
|
||||
}
|
||||
|
||||
module_cpu_feature_match(PMULL, polyval_ce_mod_init)
|
||||
module_exit(polyval_ce_mod_exit);
|
||||
|
||||
MODULE_LICENSE("GPL");
|
||||
MODULE_DESCRIPTION("POLYVAL hash function accelerated by ARMv8 Crypto Extensions");
|
||||
MODULE_ALIAS_CRYPTO("polyval");
|
||||
MODULE_ALIAS_CRYPTO("polyval-ce");
|
||||
@@ -28,7 +28,7 @@
 * instructions per clock cycle using one 32/64 bit unit (SU1) and one 32
 * bit unit (SU2). One of these can be a memory access that is executed via
 * a single load and store unit (LSU). XTS-AES-256 takes ~780 operations per
 * 16 byte block block or 25 cycles per byte. Thus 768 bytes of input data
 * 16 byte block or 25 cycles per byte. Thus 768 bytes of input data
 * will need an estimated maximum of 20,000 cycles. Headroom for cache misses
 * included. Even with the low end model clocked at 667 MHz this equals to a
 * critical time window of less than 30us. The value has been chosen to
@@ -61,14 +61,15 @@ sha256-ssse3-$(CONFIG_AS_SHA256_NI) += sha256_ni_asm.o
|
||||
obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o
|
||||
sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o
|
||||
|
||||
obj-$(CONFIG_CRYPTO_BLAKE2S_X86) += blake2s-x86_64.o
|
||||
blake2s-x86_64-y := blake2s-shash.o
|
||||
obj-$(if $(CONFIG_CRYPTO_BLAKE2S_X86),y) += libblake2s-x86_64.o
|
||||
obj-$(CONFIG_CRYPTO_BLAKE2S_X86) += libblake2s-x86_64.o
|
||||
libblake2s-x86_64-y := blake2s-core.o blake2s-glue.o
|
||||
|
||||
obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
|
||||
ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
|
||||
|
||||
obj-$(CONFIG_CRYPTO_POLYVAL_CLMUL_NI) += polyval-clmulni.o
|
||||
polyval-clmulni-y := polyval-clmulni_asm.o polyval-clmulni_glue.o
|
||||
|
||||
obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o
|
||||
crc32c-intel-y := crc32c-intel_glue.o
|
||||
crc32c-intel-$(CONFIG_64BIT) += crc32c-pcl-intel-asm_64.o
|
||||
|
||||
@@ -23,6 +23,11 @@
|
||||
|
||||
#define VMOVDQ vmovdqu
|
||||
|
||||
/*
|
||||
* Note: the "x" prefix in these aliases means "this is an xmm register". The
|
||||
* alias prefixes have no relation to XCTR where the "X" prefix means "XOR
|
||||
* counter".
|
||||
*/
|
||||
#define xdata0 %xmm0
|
||||
#define xdata1 %xmm1
|
||||
#define xdata2 %xmm2
|
||||
@@ -31,8 +36,10 @@
|
||||
#define xdata5 %xmm5
|
||||
#define xdata6 %xmm6
|
||||
#define xdata7 %xmm7
|
||||
#define xcounter %xmm8
|
||||
#define xbyteswap %xmm9
|
||||
#define xcounter %xmm8 // CTR mode only
|
||||
#define xiv %xmm8 // XCTR mode only
|
||||
#define xbyteswap %xmm9 // CTR mode only
|
||||
#define xtmp %xmm9 // XCTR mode only
|
||||
#define xkey0 %xmm10
|
||||
#define xkey4 %xmm11
|
||||
#define xkey8 %xmm12
|
||||
@@ -45,7 +52,7 @@
|
||||
#define p_keys %rdx
|
||||
#define p_out %rcx
|
||||
#define num_bytes %r8
|
||||
|
||||
#define counter %r9 // XCTR mode only
|
||||
#define tmp %r10
|
||||
#define DDQ_DATA 0
|
||||
#define XDATA 1
|
||||
@@ -102,7 +109,7 @@ ddq_add_8:
|
||||
* do_aes num_in_par load_keys key_len
|
||||
* This increments p_in, but not p_out
|
||||
*/
|
||||
.macro do_aes b, k, key_len
|
||||
.macro do_aes b, k, key_len, xctr
|
||||
.set by, \b
|
||||
.set load_keys, \k
|
||||
.set klen, \key_len
|
||||
@@ -111,29 +118,48 @@ ddq_add_8:
|
||||
vmovdqa 0*16(p_keys), xkey0
|
||||
.endif
|
||||
|
||||
vpshufb xbyteswap, xcounter, xdata0
|
||||
|
||||
.set i, 1
|
||||
.rept (by - 1)
|
||||
club XDATA, i
|
||||
vpaddq (ddq_add_1 + 16 * (i - 1))(%rip), xcounter, var_xdata
|
||||
vptest ddq_low_msk(%rip), var_xdata
|
||||
jnz 1f
|
||||
vpaddq ddq_high_add_1(%rip), var_xdata, var_xdata
|
||||
vpaddq ddq_high_add_1(%rip), xcounter, xcounter
|
||||
1:
|
||||
vpshufb xbyteswap, var_xdata, var_xdata
|
||||
.set i, (i +1)
|
||||
.endr
|
||||
.if \xctr
|
||||
movq counter, xtmp
|
||||
.set i, 0
|
||||
.rept (by)
|
||||
club XDATA, i
|
||||
vpaddq (ddq_add_1 + 16 * i)(%rip), xtmp, var_xdata
|
||||
.set i, (i +1)
|
||||
.endr
|
||||
.set i, 0
|
||||
.rept (by)
|
||||
club XDATA, i
|
||||
vpxor xiv, var_xdata, var_xdata
|
||||
.set i, (i +1)
|
||||
.endr
|
||||
.else
|
||||
vpshufb xbyteswap, xcounter, xdata0
|
||||
.set i, 1
|
||||
.rept (by - 1)
|
||||
club XDATA, i
|
||||
vpaddq (ddq_add_1 + 16 * (i - 1))(%rip), xcounter, var_xdata
|
||||
vptest ddq_low_msk(%rip), var_xdata
|
||||
jnz 1f
|
||||
vpaddq ddq_high_add_1(%rip), var_xdata, var_xdata
|
||||
vpaddq ddq_high_add_1(%rip), xcounter, xcounter
|
||||
1:
|
||||
vpshufb xbyteswap, var_xdata, var_xdata
|
||||
.set i, (i +1)
|
||||
.endr
|
||||
.endif
|
||||
|
||||
vmovdqa 1*16(p_keys), xkeyA
|
||||
|
||||
vpxor xkey0, xdata0, xdata0
|
||||
vpaddq (ddq_add_1 + 16 * (by - 1))(%rip), xcounter, xcounter
|
||||
vptest ddq_low_msk(%rip), xcounter
|
||||
jnz 1f
|
||||
vpaddq ddq_high_add_1(%rip), xcounter, xcounter
|
||||
1:
|
||||
.if \xctr
|
||||
add $by, counter
|
||||
.else
|
||||
vpaddq (ddq_add_1 + 16 * (by - 1))(%rip), xcounter, xcounter
|
||||
vptest ddq_low_msk(%rip), xcounter
|
||||
jnz 1f
|
||||
vpaddq ddq_high_add_1(%rip), xcounter, xcounter
|
||||
1:
|
||||
.endif
|
||||
|
||||
.set i, 1
|
||||
.rept (by - 1)
|
||||
@@ -371,94 +397,99 @@ ddq_add_8:
.endr
.endm
.macro do_aes_load val, key_len
do_aes \val, 1, \key_len
.macro do_aes_load val, key_len, xctr
do_aes \val, 1, \key_len, \xctr
.endm
.macro do_aes_noload val, key_len
do_aes \val, 0, \key_len
.macro do_aes_noload val, key_len, xctr
do_aes \val, 0, \key_len, \xctr
.endm
/* main body of aes ctr load */
.macro do_aes_ctrmain key_len
.macro do_aes_ctrmain key_len, xctr
cmp $16, num_bytes
jb .Ldo_return2\key_len
jb .Ldo_return2\xctr\key_len
vmovdqa byteswap_const(%rip), xbyteswap
vmovdqu (p_iv), xcounter
vpshufb xbyteswap, xcounter, xcounter
.if \xctr
shr $4, counter
vmovdqu (p_iv), xiv
.else
vmovdqa byteswap_const(%rip), xbyteswap
vmovdqu (p_iv), xcounter
vpshufb xbyteswap, xcounter, xcounter
.endif
mov num_bytes, tmp
and $(7*16), tmp
jz .Lmult_of_8_blks\key_len
jz .Lmult_of_8_blks\xctr\key_len
/* 1 <= tmp <= 7 */
cmp $(4*16), tmp
jg .Lgt4\key_len
je .Leq4\key_len
jg .Lgt4\xctr\key_len
je .Leq4\xctr\key_len
.Llt4\key_len:
.Llt4\xctr\key_len:
cmp $(2*16), tmp
jg .Leq3\key_len
je .Leq2\key_len
jg .Leq3\xctr\key_len
je .Leq2\xctr\key_len
.Leq1\key_len:
do_aes_load 1, \key_len
.Leq1\xctr\key_len:
do_aes_load 1, \key_len, \xctr
add $(1*16), p_out
and $(~7*16), num_bytes
jz .Ldo_return2\key_len
jmp .Lmain_loop2\key_len
jz .Ldo_return2\xctr\key_len
jmp .Lmain_loop2\xctr\key_len
.Leq2\key_len:
do_aes_load 2, \key_len
.Leq2\xctr\key_len:
do_aes_load 2, \key_len, \xctr
add $(2*16), p_out
and $(~7*16), num_bytes
jz .Ldo_return2\key_len
jmp .Lmain_loop2\key_len
jz .Ldo_return2\xctr\key_len
jmp .Lmain_loop2\xctr\key_len
.Leq3\key_len:
do_aes_load 3, \key_len
.Leq3\xctr\key_len:
do_aes_load 3, \key_len, \xctr
add $(3*16), p_out
and $(~7*16), num_bytes
jz .Ldo_return2\key_len
jmp .Lmain_loop2\key_len
jz .Ldo_return2\xctr\key_len
jmp .Lmain_loop2\xctr\key_len
.Leq4\key_len:
do_aes_load 4, \key_len
.Leq4\xctr\key_len:
do_aes_load 4, \key_len, \xctr
add $(4*16), p_out
and $(~7*16), num_bytes
jz .Ldo_return2\key_len
jmp .Lmain_loop2\key_len
jz .Ldo_return2\xctr\key_len
jmp .Lmain_loop2\xctr\key_len
.Lgt4\key_len:
.Lgt4\xctr\key_len:
cmp $(6*16), tmp
jg .Leq7\key_len
je .Leq6\key_len
jg .Leq7\xctr\key_len
je .Leq6\xctr\key_len
.Leq5\key_len:
do_aes_load 5, \key_len
.Leq5\xctr\key_len:
do_aes_load 5, \key_len, \xctr
add $(5*16), p_out
and $(~7*16), num_bytes
jz .Ldo_return2\key_len
jmp .Lmain_loop2\key_len
jz .Ldo_return2\xctr\key_len
jmp .Lmain_loop2\xctr\key_len
.Leq6\key_len:
do_aes_load 6, \key_len
.Leq6\xctr\key_len:
do_aes_load 6, \key_len, \xctr
add $(6*16), p_out
and $(~7*16), num_bytes
jz .Ldo_return2\key_len
jmp .Lmain_loop2\key_len
jz .Ldo_return2\xctr\key_len
jmp .Lmain_loop2\xctr\key_len
.Leq7\key_len:
do_aes_load 7, \key_len
.Leq7\xctr\key_len:
do_aes_load 7, \key_len, \xctr
add $(7*16), p_out
and $(~7*16), num_bytes
jz .Ldo_return2\key_len
jmp .Lmain_loop2\key_len
jz .Ldo_return2\xctr\key_len
jmp .Lmain_loop2\xctr\key_len
.Lmult_of_8_blks\key_len:
.Lmult_of_8_blks\xctr\key_len:
.if (\key_len != KEY_128)
vmovdqa 0*16(p_keys), xkey0
vmovdqa 4*16(p_keys), xkey4
@@ -471,17 +502,19 @@ ddq_add_8:
vmovdqa 9*16(p_keys), xkey12
.endif
.align 16
.Lmain_loop2\key_len:
.Lmain_loop2\xctr\key_len:
/* num_bytes is a multiple of 8 and >0 */
do_aes_noload 8, \key_len
do_aes_noload 8, \key_len, \xctr
add $(8*16), p_out
sub $(8*16), num_bytes
jne .Lmain_loop2\key_len
jne .Lmain_loop2\xctr\key_len
.Ldo_return2\key_len:
/* return updated IV */
vpshufb xbyteswap, xcounter, xcounter
vmovdqu xcounter, (p_iv)
.Ldo_return2\xctr\key_len:
.if !\xctr
/* return updated IV */
vpshufb xbyteswap, xcounter, xcounter
vmovdqu xcounter, (p_iv)
.endif
RET
.endm
@@ -494,7 +527,7 @@ ddq_add_8:
*/
SYM_FUNC_START(aes_ctr_enc_128_avx_by8)
/* call the aes main loop */
do_aes_ctrmain KEY_128
do_aes_ctrmain KEY_128 0
SYM_FUNC_END(aes_ctr_enc_128_avx_by8)
@@ -507,7 +540,7 @@ SYM_FUNC_END(aes_ctr_enc_128_avx_by8)
*/
SYM_FUNC_START(aes_ctr_enc_192_avx_by8)
/* call the aes main loop */
do_aes_ctrmain KEY_192
do_aes_ctrmain KEY_192 0
SYM_FUNC_END(aes_ctr_enc_192_avx_by8)
@@ -520,6 +553,45 @@ SYM_FUNC_END(aes_ctr_enc_192_avx_by8)
*/
SYM_FUNC_START(aes_ctr_enc_256_avx_by8)
/* call the aes main loop */
do_aes_ctrmain KEY_256
do_aes_ctrmain KEY_256 0
SYM_FUNC_END(aes_ctr_enc_256_avx_by8)
/*
* routine to do AES128 XCTR enc/decrypt "by8"
* XMM registers are clobbered.
* Saving/restoring must be done at a higher level
* aes_xctr_enc_128_avx_by8(const u8 *in, const u8 *iv, const void *keys,
* u8* out, unsigned int num_bytes, unsigned int byte_ctr)
*/
SYM_FUNC_START(aes_xctr_enc_128_avx_by8)
/* call the aes main loop */
do_aes_ctrmain KEY_128 1
SYM_FUNC_END(aes_xctr_enc_128_avx_by8)
/*
* routine to do AES192 XCTR enc/decrypt "by8"
* XMM registers are clobbered.
* Saving/restoring must be done at a higher level
* aes_xctr_enc_192_avx_by8(const u8 *in, const u8 *iv, const void *keys,
* u8* out, unsigned int num_bytes, unsigned int byte_ctr)
*/
SYM_FUNC_START(aes_xctr_enc_192_avx_by8)
/* call the aes main loop */
do_aes_ctrmain KEY_192 1
SYM_FUNC_END(aes_xctr_enc_192_avx_by8)
/*
* routine to do AES256 XCTR enc/decrypt "by8"
* XMM registers are clobbered.
* Saving/restoring must be done at a higher level
* aes_xctr_enc_256_avx_by8(const u8 *in, const u8 *iv, const void *keys,
* u8* out, unsigned int num_bytes, unsigned int byte_ctr)
*/
SYM_FUNC_START(aes_xctr_enc_256_avx_by8)
/* call the aes main loop */
do_aes_ctrmain KEY_256 1
SYM_FUNC_END(aes_xctr_enc_256_avx_by8)
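For reference, the construction these "by8" routines implement is XCTR as used by HCTR2: keystream block i is AES_K(IV XOR i), with i encoded little-endian, counted from 1, and offset by the number of blocks already consumed (byte_ctr / 16). Below is a minimal userspace sketch of that construction, not kernel code; it leans on OpenSSL's legacy AES block API only to keep the example short, and the helper name xctr_sketch is invented for illustration.

/*
 * Userspace illustration of XCTR: keystream block i = AES_K(IV ^ le(i)),
 * with i counted from 1 and offset by byte_ctr / 16 blocks already done.
 * Not kernel code; OpenSSL's AES_encrypt() stands in for the AES-NI asm.
 */
#include <openssl/aes.h>
#include <stdint.h>
#include <string.h>

static void xctr_sketch(const AES_KEY *key, const uint8_t iv[16],
			uint64_t byte_ctr, const uint8_t *in, uint8_t *out,
			size_t len)
{
	uint64_t block = byte_ctr / 16;		/* blocks already consumed */
	uint8_t ctr[16], ks[16];

	while (len) {
		size_t n = len < 16 ? len : 16;
		uint64_t i = ++block;		/* XCTR counts blocks from 1 */

		/* counter block: IV XOR little-endian block index */
		memcpy(ctr, iv, 16);
		for (int b = 0; b < 8; b++)
			ctr[b] ^= (uint8_t)(i >> (8 * b));

		AES_encrypt(ctr, ks, key);	/* one keystream block */

		for (size_t b = 0; b < n; b++)
			out[b] = in[b] ^ ks[b];

		in += n;
		out += n;
		len -= n;
	}
}

In the kernel path the body of this loop runs eight blocks at a time in the AVX code above, while the final partial block is handled by the C glue (xctr_crypt) further down in this diff.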
@@ -135,6 +135,20 @@ asmlinkage void aes_ctr_enc_192_avx_by8(const u8 *in, u8 *iv,
void *keys, u8 *out, unsigned int num_bytes);
asmlinkage void aes_ctr_enc_256_avx_by8(const u8 *in, u8 *iv,
void *keys, u8 *out, unsigned int num_bytes);
asmlinkage void aes_xctr_enc_128_avx_by8(const u8 *in, const u8 *iv,
const void *keys, u8 *out, unsigned int num_bytes,
unsigned int byte_ctr);
asmlinkage void aes_xctr_enc_192_avx_by8(const u8 *in, const u8 *iv,
const void *keys, u8 *out, unsigned int num_bytes,
unsigned int byte_ctr);
asmlinkage void aes_xctr_enc_256_avx_by8(const u8 *in, const u8 *iv,
const void *keys, u8 *out, unsigned int num_bytes,
unsigned int byte_ctr);
/*
* asmlinkage void aesni_gcm_init_avx_gen2()
* gcm_data *my_ctx_data, context data
@@ -527,6 +541,59 @@ static int ctr_crypt(struct skcipher_request *req)
return err;
}
static void aesni_xctr_enc_avx_tfm(struct crypto_aes_ctx *ctx, u8 *out,
const u8 *in, unsigned int len, u8 *iv,
unsigned int byte_ctr)
{
if (ctx->key_length == AES_KEYSIZE_128)
aes_xctr_enc_128_avx_by8(in, iv, (void *)ctx, out, len,
byte_ctr);
else if (ctx->key_length == AES_KEYSIZE_192)
aes_xctr_enc_192_avx_by8(in, iv, (void *)ctx, out, len,
byte_ctr);
else
aes_xctr_enc_256_avx_by8(in, iv, (void *)ctx, out, len,
byte_ctr);
}
static int xctr_crypt(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm));
u8 keystream[AES_BLOCK_SIZE];
struct skcipher_walk walk;
unsigned int nbytes;
unsigned int byte_ctr = 0;
int err;
__le32 block[AES_BLOCK_SIZE / sizeof(__le32)];
err = skcipher_walk_virt(&walk, req, false);
while ((nbytes = walk.nbytes) > 0) {
kernel_fpu_begin();
if (nbytes & AES_BLOCK_MASK)
aesni_xctr_enc_avx_tfm(ctx, walk.dst.virt.addr,
walk.src.virt.addr, nbytes & AES_BLOCK_MASK,
walk.iv, byte_ctr);
nbytes &= ~AES_BLOCK_MASK;
byte_ctr += walk.nbytes - nbytes;
if (walk.nbytes == walk.total && nbytes > 0) {
memcpy(block, walk.iv, AES_BLOCK_SIZE);
block[0] ^= cpu_to_le32(1 + byte_ctr / AES_BLOCK_SIZE);
aesni_enc(ctx, keystream, (u8 *)block);
crypto_xor_cpy(walk.dst.virt.addr + walk.nbytes -
nbytes, walk.src.virt.addr + walk.nbytes
- nbytes, keystream, nbytes);
byte_ctr += nbytes;
nbytes = 0;
}
kernel_fpu_end();
err = skcipher_walk_done(&walk, nbytes);
}
return err;
}
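The one subtle step in xctr_crypt() is the tail: when the walk ends with a partial block, the counter block is built by copying the IV and XORing the block index into its first 32-bit word only. Because byte_ctr is an unsigned int, 1 + byte_ctr / AES_BLOCK_SIZE always fits in 32 bits, so this is equivalent to XORing the full little-endian index that XCTR specifies. A standalone restatement in plain C follows (illustration only; the helper name is invented).

#include <stdint.h>
#include <string.h>

/* Build the XCTR counter block for the final partial block: the IV with
 * the 1-based little-endian block index XORed into its first four bytes.
 * The caller then AES-encrypts this block and XORs the keystream into
 * the remaining (< 16) bytes, as aesni_enc()/crypto_xor_cpy() do above. */
static void xctr_tail_counter_block(uint8_t block[16], const uint8_t iv[16],
				    uint32_t byte_ctr)
{
	uint32_t idx = 1 + byte_ctr / 16;

	memcpy(block, iv, 16);
	for (int b = 0; b < 4; b++)
		block[b] ^= (uint8_t)(idx >> (8 * b));
}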
static int
rfc4106_set_hash_subkey(u8 *hash_subkey, const u8 *key, unsigned int key_len)
{
@@ -1050,6 +1117,33 @@ static struct skcipher_alg aesni_skciphers[] = {
static
struct simd_skcipher_alg *aesni_simd_skciphers[ARRAY_SIZE(aesni_skciphers)];
#ifdef CONFIG_X86_64
/*
* XCTR does not have a non-AVX implementation, so it must be enabled
* conditionally.
*/
static struct skcipher_alg aesni_xctr = {
.base = {
.cra_name = "__xctr(aes)",
.cra_driver_name = "__xctr-aes-aesni",
.cra_priority = 400,
.cra_flags = CRYPTO_ALG_INTERNAL,
.cra_blocksize = 1,
.cra_ctxsize = CRYPTO_AES_CTX_SIZE,
.cra_module = THIS_MODULE,
},
.min_keysize = AES_MIN_KEY_SIZE,
.max_keysize = AES_MAX_KEY_SIZE,
.ivsize = AES_BLOCK_SIZE,
.chunksize = AES_BLOCK_SIZE,
.setkey = aesni_skcipher_setkey,
.encrypt = xctr_crypt,
.decrypt = xctr_crypt,
};
static struct simd_skcipher_alg *aesni_simd_xctr;
#endif /* CONFIG_X86_64 */
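The "__xctr(aes)" algorithm defined above is marked CRYPTO_ALG_INTERNAL; the simd wrapper registered in aesni_init() below exposes it as "xctr(aes)", which users such as the HCTR2 template request through the normal skcipher API. A rough, untested kernel-module-style sketch of such a caller follows; the function and variable names are invented for the example and error handling is kept minimal.

#include <crypto/skcipher.h>
#include <linux/crypto.h>
#include <linux/err.h>
#include <linux/scatterlist.h>
#include <linux/slab.h>

/* Encrypt (or equivalently decrypt) len bytes of buf in place with
 * AES-XCTR; buf must not be stack memory because it is mapped through
 * a scatterlist. */
static int xctr_demo_one_shot(const u8 *key, unsigned int keylen,
			      u8 *buf, unsigned int len, u8 iv[16])
{
	struct crypto_skcipher *tfm;
	struct skcipher_request *req;
	struct scatterlist sg;
	DECLARE_CRYPTO_WAIT(wait);
	int err;

	tfm = crypto_alloc_skcipher("xctr(aes)", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	err = crypto_skcipher_setkey(tfm, key, keylen);
	if (err)
		goto out_free_tfm;

	req = skcipher_request_alloc(tfm, GFP_KERNEL);
	if (!req) {
		err = -ENOMEM;
		goto out_free_tfm;
	}

	sg_init_one(&sg, buf, len);
	skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG |
				      CRYPTO_TFM_REQ_MAY_SLEEP,
				      crypto_req_done, &wait);
	skcipher_request_set_crypt(req, &sg, &sg, len, iv);

	/* XCTR is a stream cipher, so encrypt and decrypt are the same op */
	err = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);

	skcipher_request_free(req);
out_free_tfm:
	crypto_free_skcipher(tfm);
	return err;
}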
#ifdef CONFIG_X86_64
static int generic_gcmaes_set_key(struct crypto_aead *aead, const u8 *key,
unsigned int key_len)
@@ -1163,7 +1257,7 @@ static int __init aesni_init(void)
static_call_update(aesni_ctr_enc_tfm, aesni_ctr_enc_avx_tfm);
pr_info("AES CTR mode by8 optimization enabled\n");
}
#endif
#endif /* CONFIG_X86_64 */
err = crypto_register_alg(&aesni_cipher_alg);
if (err)
@@ -1180,8 +1274,22 @@ static int __init aesni_init(void)
if (err)
goto unregister_skciphers;
#ifdef CONFIG_X86_64
if (boot_cpu_has(X86_FEATURE_AVX))
err = simd_register_skciphers_compat(&aesni_xctr, 1,
&aesni_simd_xctr);
if (err)
goto unregister_aeads;
#endif /* CONFIG_X86_64 */
return 0;
#ifdef CONFIG_X86_64
unregister_aeads:
simd_unregister_aeads(aesni_aeads, ARRAY_SIZE(aesni_aeads),
aesni_simd_aeads);
#endif /* CONFIG_X86_64 */
unregister_skciphers:
simd_unregister_skciphers(aesni_skciphers, ARRAY_SIZE(aesni_skciphers),
aesni_simd_skciphers);
@@ -1197,6 +1305,10 @@ static void __exit aesni_exit(void)
simd_unregister_skciphers(aesni_skciphers, ARRAY_SIZE(aesni_skciphers),
aesni_simd_skciphers);
crypto_unregister_alg(&aesni_cipher_alg);
#ifdef CONFIG_X86_64
if (boot_cpu_has(X86_FEATURE_AVX))
simd_unregister_skciphers(&aesni_xctr, 1, &aesni_simd_xctr);
#endif /* CONFIG_X86_64 */
}
late_initcall(aesni_init);
@@ -4,7 +4,6 @@
*/
#include <crypto/internal/blake2s.h>
#include <crypto/internal/simd.h>
#include <linux/types.h>
#include <linux/jump_label.h>
@@ -33,7 +32,7 @@ void blake2s_compress(struct blake2s_state *state, const u8 *block,
/* SIMD disables preemption, so relax after processing each page. */
BUILD_BUG_ON(SZ_4K / BLAKE2S_BLOCK_SIZE < 8);
if (!static_branch_likely(&blake2s_use_ssse3) || !crypto_simd_usable()) {
if (!static_branch_likely(&blake2s_use_ssse3) || !may_use_simd()) {
blake2s_compress_generic(state, block, nblocks, inc);
return;
}
@@ -1,77 +0,0 @@
// SPDX-License-Identifier: GPL-2.0 OR MIT
/*
* Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
*/
#include <crypto/internal/blake2s.h>
#include <crypto/internal/simd.h>
#include <crypto/internal/hash.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/sizes.h>
#include <asm/cpufeature.h>
#include <asm/processor.h>
static int crypto_blake2s_update_x86(struct shash_desc *desc,
const u8 *in, unsigned int inlen)
{
return crypto_blake2s_update(desc, in, inlen, false);
}
static int crypto_blake2s_final_x86(struct shash_desc *desc, u8 *out)
{
return crypto_blake2s_final(desc, out, false);
}
#define BLAKE2S_ALG(name, driver_name, digest_size) \
{ \
.base.cra_name = name, \
.base.cra_driver_name = driver_name, \
.base.cra_priority = 200, \
.base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, \
.base.cra_blocksize = BLAKE2S_BLOCK_SIZE, \
.base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx), \
.base.cra_module = THIS_MODULE, \
.digestsize = digest_size, \
.setkey = crypto_blake2s_setkey, \
.init = crypto_blake2s_init, \
.update = crypto_blake2s_update_x86, \
.final = crypto_blake2s_final_x86, \
.descsize = sizeof(struct blake2s_state), \
}
static struct shash_alg blake2s_algs[] = {
BLAKE2S_ALG("blake2s-128", "blake2s-128-x86", BLAKE2S_128_HASH_SIZE),
BLAKE2S_ALG("blake2s-160", "blake2s-160-x86", BLAKE2S_160_HASH_SIZE),
BLAKE2S_ALG("blake2s-224", "blake2s-224-x86", BLAKE2S_224_HASH_SIZE),
BLAKE2S_ALG("blake2s-256", "blake2s-256-x86", BLAKE2S_256_HASH_SIZE),
};
static int __init blake2s_mod_init(void)
{
if (IS_REACHABLE(CONFIG_CRYPTO_HASH) && boot_cpu_has(X86_FEATURE_SSSE3))
return crypto_register_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs));
return 0;
}
static void __exit blake2s_mod_exit(void)
{
if (IS_REACHABLE(CONFIG_CRYPTO_HASH) && boot_cpu_has(X86_FEATURE_SSSE3))
crypto_unregister_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs));
}
module_init(blake2s_mod_init);
module_exit(blake2s_mod_exit);
MODULE_ALIAS_CRYPTO("blake2s-128");
MODULE_ALIAS_CRYPTO("blake2s-128-x86");
MODULE_ALIAS_CRYPTO("blake2s-160");
MODULE_ALIAS_CRYPTO("blake2s-160-x86");
MODULE_ALIAS_CRYPTO("blake2s-224");
MODULE_ALIAS_CRYPTO("blake2s-224-x86");
MODULE_ALIAS_CRYPTO("blake2s-256");
MODULE_ALIAS_CRYPTO("blake2s-256-x86");
MODULE_LICENSE("GPL v2");
Some files were not shown because too many files have changed in this diff.