Merge tag 'v5.20-p1' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6

Pull crypto updates from Herbert Xu:
"API:

   - Make proc files report fips module name and version

  Algorithms:

   - Move generic SHA1 code into lib/crypto

   - Implement Chinese Remainder Theorem for RSA

   - Remove blake2s

   - Add XCTR with x86/arm64 acceleration

   - Add POLYVAL with x86/arm64 acceleration

   - Add HCTR2

   - Add ARIA

  Drivers:

   - Add support for new CCP/PSP device ID in ccp"

* tag 'v5.20-p1' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6: (89 commits)
  crypto: tcrypt - Remove the static variable initialisations to NULL
  crypto: arm64/poly1305 - fix a read out-of-bound
  crypto: hisilicon/zip - Use the bitmap API to allocate bitmaps
  crypto: hisilicon/sec - fix auth key size error
  crypto: ccree - Remove a useless dma_supported() call
  crypto: ccp - Add support for new CCP/PSP device ID
  crypto: inside-secure - Add missing MODULE_DEVICE_TABLE for of
  crypto: hisilicon/hpre - don't use GFP_KERNEL to alloc mem during softirq
  crypto: testmgr - some more fixes to RSA test vectors
  cyrpto: powerpc/aes - delete the rebundant word "block" in comments
  hwrng: via - Fix comment typo
  crypto: twofish - Fix comment typo
  crypto: rmd160 - fix Kconfig "its" grammar
  crypto: keembay-ocs-ecc - Drop if with an always false condition
  Documentation: qat: rewrite description
  Documentation: qat: Use code block for qat sysfs example
  crypto: lib - add module license to libsha1
  crypto: lib - make the sha1 library optional
  crypto: lib - move lib/sha1.c into lib/crypto/
  crypto: fips - make proc files report fips module name and version
  ...
This commit is contained in:
Linus Torvalds
2022-08-02 17:45:14 -07:00
114 changed files with 9136 additions and 1199 deletions

View File

@@ -0,0 +1,49 @@
What: /sys/bus/pci/devices/<BDF>/qat/state
Date: June 2022
KernelVersion: 5.20
Contact: qat-linux@intel.com
Description: (RW) Reports the current state of the QAT device. Write to
the file to start or stop the device.
The values are:
* up: the device is up and running
* down: the device is down
It is possible to transition the device from up to down only
if the device is up and vice versa.
This attribute is only available for qat_4xxx devices.
What: /sys/bus/pci/devices/<BDF>/qat/cfg_services
Date: June 2022
KernelVersion: 5.20
Contact: qat-linux@intel.com
Description: (RW) Reports the current configuration of the QAT device.
Write to the file to change the configured services.
The values are:
* sym;asym: the device is configured for running crypto
services
* dc: the device is configured for running compression services
It is possible to set the configuration only if the device
is in the `down` state (see /sys/bus/pci/devices/<BDF>/qat/state).
The following example shows how to change the configuration of
a device configured for running crypto services in order to
run data compression::
# cat /sys/bus/pci/devices/<BDF>/qat/state
up
# cat /sys/bus/pci/devices/<BDF>/qat/cfg_services
sym;asym
# echo down > /sys/bus/pci/devices/<BDF>/qat/state
# echo dc > /sys/bus/pci/devices/<BDF>/qat/cfg_services
# echo up > /sys/bus/pci/devices/<BDF>/qat/state
# cat /sys/bus/pci/devices/<BDF>/qat/cfg_services
dc
This attribute is only available for qat_4xxx devices.

View File

@@ -337,6 +337,7 @@ Currently, the following pairs of encryption modes are supported:
- AES-256-XTS for contents and AES-256-CTS-CBC for filenames
- AES-128-CBC for contents and AES-128-CTS-CBC for filenames
- Adiantum for both contents and filenames
- AES-256-XTS for contents and AES-256-HCTR2 for filenames (v2 policies only)
If unsure, you should use the (AES-256-XTS, AES-256-CTS-CBC) pair.
@@ -357,6 +358,17 @@ To use Adiantum, CONFIG_CRYPTO_ADIANTUM must be enabled. Also, fast
implementations of ChaCha and NHPoly1305 should be enabled, e.g.
CONFIG_CRYPTO_CHACHA20_NEON and CONFIG_CRYPTO_NHPOLY1305_NEON for ARM.
AES-256-HCTR2 is another true wide-block encryption mode that is intended for
use on CPUs with dedicated crypto instructions. AES-256-HCTR2 has the property
that a bitflip in the plaintext changes the entire ciphertext. This property
makes it desirable for filename encryption since initialization vectors are
reused within a directory. For more details on AES-256-HCTR2, see the paper
"Length-preserving encryption with HCTR2"
(https://eprint.iacr.org/2021/1441.pdf). To use AES-256-HCTR2,
CONFIG_CRYPTO_HCTR2 must be enabled. Also, fast implementations of XCTR and
POLYVAL should be enabled, e.g. CRYPTO_POLYVAL_ARM64_CE and
CRYPTO_AES_ARM64_CE_BLK for ARM64.
New encryption modes can be added relatively easily, without changes
to individual filesystems. However, authenticated encryption (AE)
modes are not currently supported because of the difficulty of dealing
@@ -404,11 +416,11 @@ alternatively has the file's nonce (for `DIRECT_KEY policies`_) or
inode number (for `IV_INO_LBLK_64 policies`_) included in the IVs.
Thus, IV reuse is limited to within a single directory.
With CTS-CBC, the IV reuse means that when the plaintext filenames
share a common prefix at least as long as the cipher block size (16
bytes for AES), the corresponding encrypted filenames will also share
a common prefix. This is undesirable. Adiantum does not have this
weakness, as it is a wide-block encryption mode.
With CTS-CBC, the IV reuse means that when the plaintext filenames share a
common prefix at least as long as the cipher block size (16 bytes for AES), the
corresponding encrypted filenames will also share a common prefix. This is
undesirable. Adiantum and HCTR2 do not have this weakness, as they are
wide-block encryption modes.
All supported filenames encryption modes accept any plaintext length
>= 16 bytes; cipher block alignment is not required. However,

View File

@@ -9079,15 +9079,24 @@ S: Supported
F: Documentation/admin-guide/perf/hns3-pmu.rst
F: drivers/perf/hisilicon/hns3_pmu.c
HISILICON QM AND ZIP Controller DRIVER
HISILICON QM DRIVER
M: Weili Qian <qianweili@huawei.com>
M: Zhou Wang <wangzhou1@hisilicon.com>
L: linux-crypto@vger.kernel.org
S: Maintained
F: drivers/crypto/hisilicon/Kconfig
F: drivers/crypto/hisilicon/Makefile
F: drivers/crypto/hisilicon/qm.c
F: drivers/crypto/hisilicon/sgl.c
F: include/linux/hisi_acc_qm.h
HISILICON ZIP Controller DRIVER
M: Yang Shen <shenyang39@huawei.com>
M: Zhou Wang <wangzhou1@hisilicon.com>
L: linux-crypto@vger.kernel.org
S: Maintained
F: Documentation/ABI/testing/debugfs-hisi-zip
F: drivers/crypto/hisilicon/qm.c
F: drivers/crypto/hisilicon/sgl.c
F: drivers/crypto/hisilicon/zip/
F: include/linux/hisi_acc_qm.h
HISILICON ROCE DRIVER
M: Wenpeng Liang <liangwenpeng@huawei.com>

View File

@@ -63,7 +63,7 @@ config CRYPTO_SHA512_ARM
using optimized ARM assembler and NEON, when available.
config CRYPTO_BLAKE2S_ARM
tristate "BLAKE2s digest algorithm (ARM)"
bool "BLAKE2s digest algorithm (ARM)"
select CRYPTO_ARCH_HAVE_LIB_BLAKE2S
help
BLAKE2s digest algorithm optimized with ARM scalar instructions. This

View File

@@ -9,8 +9,7 @@ obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o
obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o
obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o
obj-$(CONFIG_CRYPTO_SHA512_ARM) += sha512-arm.o
obj-$(CONFIG_CRYPTO_BLAKE2S_ARM) += blake2s-arm.o
obj-$(if $(CONFIG_CRYPTO_BLAKE2S_ARM),y) += libblake2s-arm.o
obj-$(CONFIG_CRYPTO_BLAKE2S_ARM) += libblake2s-arm.o
obj-$(CONFIG_CRYPTO_BLAKE2B_NEON) += blake2b-neon.o
obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o
obj-$(CONFIG_CRYPTO_POLY1305_ARM) += poly1305-arm.o
@@ -32,7 +31,6 @@ sha256-arm-neon-$(CONFIG_KERNEL_MODE_NEON) := sha256_neon_glue.o
sha256-arm-y := sha256-core.o sha256_glue.o $(sha256-arm-neon-y)
sha512-arm-neon-$(CONFIG_KERNEL_MODE_NEON) := sha512-neon-glue.o
sha512-arm-y := sha512-core.o sha512-glue.o $(sha512-arm-neon-y)
blake2s-arm-y := blake2s-shash.o
libblake2s-arm-y:= blake2s-core.o blake2s-glue.o
blake2b-neon-y := blake2b-neon-core.o blake2b-neon-glue.o
sha1-arm-ce-y := sha1-ce-core.o sha1-ce-glue.o

View File

@@ -1,75 +0,0 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* BLAKE2s digest algorithm, ARM scalar implementation
*
* Copyright 2020 Google LLC
*/
#include <crypto/internal/blake2s.h>
#include <crypto/internal/hash.h>
#include <linux/module.h>
/*
 * shash update hook used by every entry of blake2s_arm_algs.
 *
 * Forwards @inlen bytes at @in for @desc to crypto_blake2s_update(), always
 * passing false as that helper's last argument, and returns its result
 * unchanged.
 */
static int crypto_blake2s_update_arm(struct shash_desc *desc,
				     const u8 *in, unsigned int inlen)
{
	int ret;

	ret = crypto_blake2s_update(desc, in, inlen, false);
	return ret;
}
/*
 * shash final hook used by every entry of blake2s_arm_algs.
 *
 * Forwards @desc and the output buffer @out to crypto_blake2s_final(), always
 * passing false as that helper's last argument, and returns its result
 * unchanged.
 */
static int crypto_blake2s_final_arm(struct shash_desc *desc, u8 *out)
{
	int ret;

	ret = crypto_blake2s_final(desc, out, false);
	return ret;
}
/*
 * BLAKE2S_ALG() - expand to one brace-enclosed struct shash_alg initializer.
 *
 * @name:        string stored in .base.cra_name
 * @driver_name: string stored in .base.cra_driver_name
 * @digest_size: value stored in .digestsize
 *
 * Everything else is the same for each expansion: priority 200, the
 * CRYPTO_ALG_OPTIONAL_KEY flag, a block size of BLAKE2S_BLOCK_SIZE, a
 * struct blake2s_tfm_ctx as tfm context, a struct blake2s_state as
 * descriptor state, the shared crypto_blake2s_setkey/crypto_blake2s_init
 * helpers, and the ARM update/final wrappers defined in this file.
 */
#define BLAKE2S_ALG(name, driver_name, digest_size) \
{ \
.base.cra_name = name, \
.base.cra_driver_name = driver_name, \
.base.cra_priority = 200, \
.base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, \
.base.cra_blocksize = BLAKE2S_BLOCK_SIZE, \
.base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx), \
.base.cra_module = THIS_MODULE, \
.digestsize = digest_size, \
.setkey = crypto_blake2s_setkey, \
.init = crypto_blake2s_init, \
.update = crypto_blake2s_update_arm, \
.final = crypto_blake2s_final_arm, \
.descsize = sizeof(struct blake2s_state), \
}
/*
 * The algorithms handled by this module: one entry per BLAKE2s digest size
 * (128, 160, 224 and 256). Each entry pairs the generic name "blake2s-N"
 * with the driver name "blake2s-N-arm". The array is passed as a whole to
 * crypto_register_shashes()/crypto_unregister_shashes() by the module
 * init/exit functions.
 */
static struct shash_alg blake2s_arm_algs[] = {
BLAKE2S_ALG("blake2s-128", "blake2s-128-arm", BLAKE2S_128_HASH_SIZE),
BLAKE2S_ALG("blake2s-160", "blake2s-160-arm", BLAKE2S_160_HASH_SIZE),
BLAKE2S_ALG("blake2s-224", "blake2s-224-arm", BLAKE2S_224_HASH_SIZE),
BLAKE2S_ALG("blake2s-256", "blake2s-256-arm", BLAKE2S_256_HASH_SIZE),
};
/*
 * Module init: register every entry of blake2s_arm_algs.
 *
 * When CONFIG_CRYPTO_HASH is not reachable from this code nothing is
 * registered and 0 is returned; otherwise the result of
 * crypto_register_shashes() is returned.
 */
static int __init blake2s_arm_mod_init(void)
{
	if (!IS_REACHABLE(CONFIG_CRYPTO_HASH))
		return 0;

	return crypto_register_shashes(blake2s_arm_algs,
				       ARRAY_SIZE(blake2s_arm_algs));
}
/*
 * Module exit: undo blake2s_arm_mod_init().
 *
 * Uses the same IS_REACHABLE(CONFIG_CRYPTO_HASH) condition as the init
 * function, so the algorithms are only unregistered in configurations where
 * they were registered.
 */
static void __exit blake2s_arm_mod_exit(void)
{
	if (!IS_REACHABLE(CONFIG_CRYPTO_HASH))
		return;

	crypto_unregister_shashes(blake2s_arm_algs,
				  ARRAY_SIZE(blake2s_arm_algs));
}
/* Use the (un)registration functions above as the module entry/exit points. */
module_init(blake2s_arm_mod_init);
module_exit(blake2s_arm_mod_exit);
/* Module metadata. */
MODULE_DESCRIPTION("BLAKE2s digest algorithm, ARM scalar implementation");
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Eric Biggers <ebiggers@google.com>");
/*
 * One crypto alias for each cra_name and each cra_driver_name that appears
 * in blake2s_arm_algs.
 */
MODULE_ALIAS_CRYPTO("blake2s-128");
MODULE_ALIAS_CRYPTO("blake2s-128-arm");
MODULE_ALIAS_CRYPTO("blake2s-160");
MODULE_ALIAS_CRYPTO("blake2s-160-arm");
MODULE_ALIAS_CRYPTO("blake2s-224");
MODULE_ALIAS_CRYPTO("blake2s-224-arm");
MODULE_ALIAS_CRYPTO("blake2s-256");
MODULE_ALIAS_CRYPTO("blake2s-256-arm");

View File

@@ -71,6 +71,12 @@ config CRYPTO_GHASH_ARM64_CE
select CRYPTO_HASH
select CRYPTO_GF128MUL
select CRYPTO_LIB_AES
select CRYPTO_AEAD
config CRYPTO_POLYVAL_ARM64_CE
tristate "POLYVAL using ARMv8 Crypto Extensions (for HCTR2)"
depends on KERNEL_MODE_NEON
select CRYPTO_POLYVAL
config CRYPTO_CRCT10DIF_ARM64_CE
tristate "CRCT10DIF digest algorithm using PMULL instructions"
@@ -96,13 +102,13 @@ config CRYPTO_AES_ARM64_CE_CCM
select CRYPTO_LIB_AES
config CRYPTO_AES_ARM64_CE_BLK
tristate "AES in ECB/CBC/CTR/XTS modes using ARMv8 Crypto Extensions"
tristate "AES in ECB/CBC/CTR/XTS/XCTR modes using ARMv8 Crypto Extensions"
depends on KERNEL_MODE_NEON
select CRYPTO_SKCIPHER
select CRYPTO_AES_ARM64_CE
config CRYPTO_AES_ARM64_NEON_BLK
tristate "AES in ECB/CBC/CTR/XTS modes using NEON instructions"
tristate "AES in ECB/CBC/CTR/XTS/XCTR modes using NEON instructions"
depends on KERNEL_MODE_NEON
select CRYPTO_SKCIPHER
select CRYPTO_LIB_AES

View File

@@ -32,6 +32,9 @@ sm4-neon-y := sm4-neon-glue.o sm4-neon-core.o
obj-$(CONFIG_CRYPTO_GHASH_ARM64_CE) += ghash-ce.o
ghash-ce-y := ghash-ce-glue.o ghash-ce-core.o
obj-$(CONFIG_CRYPTO_POLYVAL_ARM64_CE) += polyval-ce.o
polyval-ce-y := polyval-ce-glue.o polyval-ce-core.o
obj-$(CONFIG_CRYPTO_CRCT10DIF_ARM64_CE) += crct10dif-ce.o
crct10dif-ce-y := crct10dif-ce-core.o crct10dif-ce-glue.o

View File

@@ -34,10 +34,11 @@
#define aes_essiv_cbc_encrypt ce_aes_essiv_cbc_encrypt
#define aes_essiv_cbc_decrypt ce_aes_essiv_cbc_decrypt
#define aes_ctr_encrypt ce_aes_ctr_encrypt
#define aes_xctr_encrypt ce_aes_xctr_encrypt
#define aes_xts_encrypt ce_aes_xts_encrypt
#define aes_xts_decrypt ce_aes_xts_decrypt
#define aes_mac_update ce_aes_mac_update
MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 Crypto Extensions");
MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS/XCTR using ARMv8 Crypto Extensions");
#else
#define MODE "neon"
#define PRIO 200
@@ -50,16 +51,18 @@ MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 Crypto Extensions");
#define aes_essiv_cbc_encrypt neon_aes_essiv_cbc_encrypt
#define aes_essiv_cbc_decrypt neon_aes_essiv_cbc_decrypt
#define aes_ctr_encrypt neon_aes_ctr_encrypt
#define aes_xctr_encrypt neon_aes_xctr_encrypt
#define aes_xts_encrypt neon_aes_xts_encrypt
#define aes_xts_decrypt neon_aes_xts_decrypt
#define aes_mac_update neon_aes_mac_update
MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 NEON");
MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS/XCTR using ARMv8 NEON");
#endif
#if defined(USE_V8_CRYPTO_EXTENSIONS) || !IS_ENABLED(CONFIG_CRYPTO_AES_ARM64_BS)
MODULE_ALIAS_CRYPTO("ecb(aes)");
MODULE_ALIAS_CRYPTO("cbc(aes)");
MODULE_ALIAS_CRYPTO("ctr(aes)");
MODULE_ALIAS_CRYPTO("xts(aes)");
MODULE_ALIAS_CRYPTO("xctr(aes)");
#endif
MODULE_ALIAS_CRYPTO("cts(cbc(aes))");
MODULE_ALIAS_CRYPTO("essiv(cbc(aes),sha256)");
@@ -89,6 +92,9 @@ asmlinkage void aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[],
asmlinkage void aes_ctr_encrypt(u8 out[], u8 const in[], u32 const rk[],
int rounds, int bytes, u8 ctr[]);
asmlinkage void aes_xctr_encrypt(u8 out[], u8 const in[], u32 const rk[],
int rounds, int bytes, u8 ctr[], int byte_ctr);
asmlinkage void aes_xts_encrypt(u8 out[], u8 const in[], u32 const rk1[],
int rounds, int bytes, u32 const rk2[], u8 iv[],
int first);
@@ -442,6 +448,52 @@ static int __maybe_unused essiv_cbc_decrypt(struct skcipher_request *req)
return err ?: cbc_decrypt_walk(req, &walk);
}
/*
 * skcipher encrypt/decrypt handler for "xctr(aes)".
 *
 * Walks the request with skcipher_walk and feeds each step to the
 * aes_xctr_encrypt() assembly routine inside a kernel_neon_begin()/
 * kernel_neon_end() section. The number of bytes already processed is passed
 * as the routine's byte_ctr argument and advanced after every step.
 *
 * Returns the status of skcipher_walk_virt() if the walk yields no data,
 * otherwise the status of the last skcipher_walk_done() call.
 */
static int __maybe_unused xctr_encrypt(struct skcipher_request *req)
{
	struct crypto_aes_ctx *aes_ctx =
		crypto_skcipher_ctx(crypto_skcipher_reqtfm(req));
	const int nrounds = 6 + aes_ctx->key_length / 4;
	unsigned int processed = 0;
	struct skcipher_walk walk;
	int ret;

	ret = skcipher_walk_virt(&walk, req, false);

	while (walk.nbytes > 0) {
		u8 bounce[AES_BLOCK_SIZE];
		unsigned int len = walk.nbytes;
		const u8 *in = walk.src.virt.addr;
		u8 *out = walk.dst.virt.addr;

		if (unlikely(len < AES_BLOCK_SIZE)) {
			/*
			 * A step shorter than one block goes through a
			 * 16-byte bounce buffer so the assembly routine
			 * cannot read or write out of bounds. The routine
			 * expects the data to END at the end of that buffer
			 * rather than to start at its beginning, so the
			 * bytes are placed flush against the tail.
			 */
			out = memcpy(bounce + sizeof(bounce) - len, in, len);
			in = out;
		} else if (len < walk.total) {
			/*
			 * This step does not cover all of walk.total: process
			 * whole blocks only. The leftover bytes are reported
			 * to skcipher_walk_done() below.
			 */
			len &= ~(AES_BLOCK_SIZE - 1);
		}

		kernel_neon_begin();
		aes_xctr_encrypt(out, in, aes_ctx->key_enc, nrounds, len,
				 walk.iv, processed);
		kernel_neon_end();

		/* Copy a bounced partial block back to the real destination. */
		if (unlikely(len < AES_BLOCK_SIZE))
			memcpy(walk.dst.virt.addr,
			       bounce + sizeof(bounce) - len, len);

		processed += len;
		ret = skcipher_walk_done(&walk, walk.nbytes - len);
	}

	return ret;
}
static int __maybe_unused ctr_encrypt(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
@@ -457,6 +509,14 @@ static int __maybe_unused ctr_encrypt(struct skcipher_request *req)
u8 *dst = walk.dst.virt.addr;
u8 buf[AES_BLOCK_SIZE];
/*
* If given less than 16 bytes, we must copy the partial block
* into a temporary buffer of 16 bytes to avoid out of bounds
* reads and writes. Furthermore, this code is somewhat unusual
* in that it expects the end of the data to be at the end of
* the temporary buffer, rather than the start of the data at
* the start of the temporary buffer.
*/
if (unlikely(nbytes < AES_BLOCK_SIZE))
src = dst = memcpy(buf + sizeof(buf) - nbytes,
src, nbytes);
@@ -669,6 +729,22 @@ static struct skcipher_alg aes_algs[] = { {
.setkey = skcipher_aes_setkey,
.encrypt = ctr_encrypt,
.decrypt = ctr_encrypt,
}, {
.base = {
.cra_name = "xctr(aes)",
.cra_driver_name = "xctr-aes-" MODE,
.cra_priority = PRIO,
.cra_blocksize = 1,
.cra_ctxsize = sizeof(struct crypto_aes_ctx),
.cra_module = THIS_MODULE,
},
.min_keysize = AES_MIN_KEY_SIZE,
.max_keysize = AES_MAX_KEY_SIZE,
.ivsize = AES_BLOCK_SIZE,
.chunksize = AES_BLOCK_SIZE,
.setkey = skcipher_aes_setkey,
.encrypt = xctr_encrypt,
.decrypt = xctr_encrypt,
}, {
.base = {
.cra_name = "xts(aes)",

View File

@@ -318,127 +318,211 @@ AES_FUNC_END(aes_cbc_cts_decrypt)
.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
.previous
/*
* aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
* int bytes, u8 ctr[])
* This macro generates the code for CTR and XCTR mode.
*/
.macro ctr_encrypt xctr
// Arguments
OUT .req x0
IN .req x1
KEY .req x2
ROUNDS_W .req w3
BYTES_W .req w4
IV .req x5
BYTE_CTR_W .req w6 // XCTR only
// Intermediate values
CTR_W .req w11 // XCTR only
CTR .req x11 // XCTR only
IV_PART .req x12
BLOCKS .req x13
BLOCKS_W .req w13
AES_FUNC_START(aes_ctr_encrypt)
stp x29, x30, [sp, #-16]!
mov x29, sp
enc_prepare w3, x2, x12
ld1 {vctr.16b}, [x5]
enc_prepare ROUNDS_W, KEY, IV_PART
ld1 {vctr.16b}, [IV]
umov x12, vctr.d[1] /* keep swabbed ctr in reg */
rev x12, x12
/*
* Keep 64 bits of the IV in a register. For CTR mode this lets us
* easily increment the IV. For XCTR mode this lets us efficiently XOR
* the 64-bit counter with the IV.
*/
.if \xctr
umov IV_PART, vctr.d[0]
lsr CTR_W, BYTE_CTR_W, #4
.else
umov IV_PART, vctr.d[1]
rev IV_PART, IV_PART
.endif
.LctrloopNx:
add w7, w4, #15
sub w4, w4, #MAX_STRIDE << 4
lsr w7, w7, #4
.LctrloopNx\xctr:
add BLOCKS_W, BYTES_W, #15
sub BYTES_W, BYTES_W, #MAX_STRIDE << 4
lsr BLOCKS_W, BLOCKS_W, #4
mov w8, #MAX_STRIDE
cmp w7, w8
csel w7, w7, w8, lt
adds x12, x12, x7
cmp BLOCKS_W, w8
csel BLOCKS_W, BLOCKS_W, w8, lt
/*
* Set up the counter values in v0-v{MAX_STRIDE-1}.
*
* If we are encrypting less than MAX_STRIDE blocks, the tail block
* handling code expects the last keystream block to be in
* v{MAX_STRIDE-1}. For example: if encrypting two blocks with
* MAX_STRIDE=5, then v3 and v4 should have the next two counter blocks.
*/
.if \xctr
add CTR, CTR, BLOCKS
.else
adds IV_PART, IV_PART, BLOCKS
.endif
mov v0.16b, vctr.16b
mov v1.16b, vctr.16b
mov v2.16b, vctr.16b
mov v3.16b, vctr.16b
ST5( mov v4.16b, vctr.16b )
bcs 0f
.if \xctr
sub x6, CTR, #MAX_STRIDE - 1
sub x7, CTR, #MAX_STRIDE - 2
sub x8, CTR, #MAX_STRIDE - 3
sub x9, CTR, #MAX_STRIDE - 4
ST5( sub x10, CTR, #MAX_STRIDE - 5 )
eor x6, x6, IV_PART
eor x7, x7, IV_PART
eor x8, x8, IV_PART
eor x9, x9, IV_PART
ST5( eor x10, x10, IV_PART )
mov v0.d[0], x6
mov v1.d[0], x7
mov v2.d[0], x8
mov v3.d[0], x9
ST5( mov v4.d[0], x10 )
.else
bcs 0f
.subsection 1
/*
* This subsection handles carries.
*
* Conditional branching here is allowed with respect to time
* invariance since the branches are dependent on the IV instead
* of the plaintext or key. This code is rarely executed in
* practice anyway.
*/
.subsection 1
/* apply carry to outgoing counter */
0: umov x8, vctr.d[0]
rev x8, x8
add x8, x8, #1
rev x8, x8
ins vctr.d[0], x8
/* Apply carry to outgoing counter. */
0: umov x8, vctr.d[0]
rev x8, x8
add x8, x8, #1
rev x8, x8
ins vctr.d[0], x8
/* apply carry to N counter blocks for N := x12 */
cbz x12, 2f
adr x16, 1f
sub x16, x16, x12, lsl #3
br x16
bti c
mov v0.d[0], vctr.d[0]
bti c
mov v1.d[0], vctr.d[0]
bti c
mov v2.d[0], vctr.d[0]
bti c
mov v3.d[0], vctr.d[0]
ST5( bti c )
ST5( mov v4.d[0], vctr.d[0] )
1: b 2f
.previous
/*
* Apply carry to counter blocks if needed.
*
* Since the carry flag was set, we know 0 <= IV_PART <
* MAX_STRIDE. Using the value of IV_PART we can determine how
* many counter blocks need to be updated.
*/
cbz IV_PART, 2f
adr x16, 1f
sub x16, x16, IV_PART, lsl #3
br x16
bti c
mov v0.d[0], vctr.d[0]
bti c
mov v1.d[0], vctr.d[0]
bti c
mov v2.d[0], vctr.d[0]
bti c
mov v3.d[0], vctr.d[0]
ST5( bti c )
ST5( mov v4.d[0], vctr.d[0] )
1: b 2f
.previous
2: rev x7, x12
ins vctr.d[1], x7
sub x7, x12, #MAX_STRIDE - 1
sub x8, x12, #MAX_STRIDE - 2
sub x9, x12, #MAX_STRIDE - 3
rev x7, x7
rev x8, x8
mov v1.d[1], x7
rev x9, x9
ST5( sub x10, x12, #MAX_STRIDE - 4 )
mov v2.d[1], x8
ST5( rev x10, x10 )
mov v3.d[1], x9
ST5( mov v4.d[1], x10 )
tbnz w4, #31, .Lctrtail
ld1 {v5.16b-v7.16b}, [x1], #48
2: rev x7, IV_PART
ins vctr.d[1], x7
sub x7, IV_PART, #MAX_STRIDE - 1
sub x8, IV_PART, #MAX_STRIDE - 2
sub x9, IV_PART, #MAX_STRIDE - 3
rev x7, x7
rev x8, x8
mov v1.d[1], x7
rev x9, x9
ST5( sub x10, IV_PART, #MAX_STRIDE - 4 )
mov v2.d[1], x8
ST5( rev x10, x10 )
mov v3.d[1], x9
ST5( mov v4.d[1], x10 )
.endif
/*
* If there are at least MAX_STRIDE blocks left, XOR the data with
* keystream and store. Otherwise jump to tail handling.
*/
tbnz BYTES_W, #31, .Lctrtail\xctr
ld1 {v5.16b-v7.16b}, [IN], #48
ST4( bl aes_encrypt_block4x )
ST5( bl aes_encrypt_block5x )
eor v0.16b, v5.16b, v0.16b
ST4( ld1 {v5.16b}, [x1], #16 )
ST4( ld1 {v5.16b}, [IN], #16 )
eor v1.16b, v6.16b, v1.16b
ST5( ld1 {v5.16b-v6.16b}, [x1], #32 )
ST5( ld1 {v5.16b-v6.16b}, [IN], #32 )
eor v2.16b, v7.16b, v2.16b
eor v3.16b, v5.16b, v3.16b
ST5( eor v4.16b, v6.16b, v4.16b )
st1 {v0.16b-v3.16b}, [x0], #64
ST5( st1 {v4.16b}, [x0], #16 )
cbz w4, .Lctrout
b .LctrloopNx
st1 {v0.16b-v3.16b}, [OUT], #64
ST5( st1 {v4.16b}, [OUT], #16 )
cbz BYTES_W, .Lctrout\xctr
b .LctrloopNx\xctr
.Lctrout:
st1 {vctr.16b}, [x5] /* return next CTR value */
.Lctrout\xctr:
.if !\xctr
st1 {vctr.16b}, [IV] /* return next CTR value */
.endif
ldp x29, x30, [sp], #16
ret
.Lctrtail:
/* XOR up to MAX_STRIDE * 16 - 1 bytes of in/output with v0 ... v3/v4 */
.Lctrtail\xctr:
/*
* Handle up to MAX_STRIDE * 16 - 1 bytes of plaintext
*
* This code expects the last keystream block to be in v{MAX_STRIDE-1}.
* For example: if encrypting two blocks with MAX_STRIDE=5, then v3 and
* v4 should have the next two counter blocks.
*
* This allows us to store the ciphertext by writing to overlapping
* regions of memory. Any invalid ciphertext blocks get overwritten by
* correctly computed blocks. This approach greatly simplifies the
* logic for storing the ciphertext.
*/
mov x16, #16
ands x6, x4, #0xf
csel x13, x6, x16, ne
ands w7, BYTES_W, #0xf
csel x13, x7, x16, ne
ST5( cmp w4, #64 - (MAX_STRIDE << 4) )
ST5( cmp BYTES_W, #64 - (MAX_STRIDE << 4))
ST5( csel x14, x16, xzr, gt )
cmp w4, #48 - (MAX_STRIDE << 4)
cmp BYTES_W, #48 - (MAX_STRIDE << 4)
csel x15, x16, xzr, gt
cmp w4, #32 - (MAX_STRIDE << 4)
cmp BYTES_W, #32 - (MAX_STRIDE << 4)
csel x16, x16, xzr, gt
cmp w4, #16 - (MAX_STRIDE << 4)
cmp BYTES_W, #16 - (MAX_STRIDE << 4)
adr_l x12, .Lcts_permute_table
add x12, x12, x13
ble .Lctrtail1x
adr_l x9, .Lcts_permute_table
add x9, x9, x13
ble .Lctrtail1x\xctr
ST5( ld1 {v5.16b}, [x1], x14 )
ld1 {v6.16b}, [x1], x15
ld1 {v7.16b}, [x1], x16
ST5( ld1 {v5.16b}, [IN], x14 )
ld1 {v6.16b}, [IN], x15
ld1 {v7.16b}, [IN], x16
ST4( bl aes_encrypt_block4x )
ST5( bl aes_encrypt_block5x )
ld1 {v8.16b}, [x1], x13
ld1 {v9.16b}, [x1]
ld1 {v10.16b}, [x12]
ld1 {v8.16b}, [IN], x13
ld1 {v9.16b}, [IN]
ld1 {v10.16b}, [x9]
ST4( eor v6.16b, v6.16b, v0.16b )
ST4( eor v7.16b, v7.16b, v1.16b )
@@ -453,32 +537,91 @@ ST5( eor v7.16b, v7.16b, v2.16b )
ST5( eor v8.16b, v8.16b, v3.16b )
ST5( eor v9.16b, v9.16b, v4.16b )
ST5( st1 {v5.16b}, [x0], x14 )
st1 {v6.16b}, [x0], x15
st1 {v7.16b}, [x0], x16
add x13, x13, x0
ST5( st1 {v5.16b}, [OUT], x14 )
st1 {v6.16b}, [OUT], x15
st1 {v7.16b}, [OUT], x16
add x13, x13, OUT
st1 {v9.16b}, [x13] // overlapping stores
st1 {v8.16b}, [x0]
b .Lctrout
st1 {v8.16b}, [OUT]
b .Lctrout\xctr
.Lctrtail1x:
sub x7, x6, #16
csel x6, x6, x7, eq
add x1, x1, x6
add x0, x0, x6
ld1 {v5.16b}, [x1]
ld1 {v6.16b}, [x0]
.Lctrtail1x\xctr:
/*
* Handle <= 16 bytes of plaintext
*
* This code always reads and writes 16 bytes. To avoid out of bounds
* accesses, XCTR and CTR modes must use a temporary buffer when
* encrypting/decrypting less than 16 bytes.
*
* This code is unusual in that it loads the input and stores the output
* relative to the end of the buffers rather than relative to the start.
* This causes unusual behaviour when encrypting/decrypting less than 16
* bytes; the end of the data is expected to be at the end of the
* temporary buffer rather than the start of the data being at the start
* of the temporary buffer.
*/
sub x8, x7, #16
csel x7, x7, x8, eq
add IN, IN, x7
add OUT, OUT, x7
ld1 {v5.16b}, [IN]
ld1 {v6.16b}, [OUT]
ST5( mov v3.16b, v4.16b )
encrypt_block v3, w3, x2, x8, w7
ld1 {v10.16b-v11.16b}, [x12]
encrypt_block v3, ROUNDS_W, KEY, x8, w7
ld1 {v10.16b-v11.16b}, [x9]
tbl v3.16b, {v3.16b}, v10.16b
sshr v11.16b, v11.16b, #7
eor v5.16b, v5.16b, v3.16b
bif v5.16b, v6.16b, v11.16b
st1 {v5.16b}, [x0]
b .Lctrout
st1 {v5.16b}, [OUT]
b .Lctrout\xctr
// Arguments
.unreq OUT
.unreq IN
.unreq KEY
.unreq ROUNDS_W
.unreq BYTES_W
.unreq IV
.unreq BYTE_CTR_W // XCTR only
// Intermediate values
.unreq CTR_W // XCTR only
.unreq CTR // XCTR only
.unreq IV_PART
.unreq BLOCKS
.unreq BLOCKS_W
.endm
/*
* aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
* int bytes, u8 ctr[])
*
* The input and output buffers must always be at least 16 bytes even if
* encrypting/decrypting less than 16 bytes. Otherwise out of bounds
* accesses will occur. The data to be encrypted/decrypted is expected
* to be at the end of this 16-byte temporary buffer rather than the
* start.
*/
AES_FUNC_START(aes_ctr_encrypt)
ctr_encrypt 0
AES_FUNC_END(aes_ctr_encrypt)
/*
* aes_xctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
* int bytes, u8 const iv[], int byte_ctr)
*
* The input and output buffers must always be at least 16 bytes even if
* encrypting/decrypting less than 16 bytes. Otherwise out of bounds
* accesses will occur. The data to be encrypted/decrypted is expected
* to be at the end of this 16-byte temporary buffer rather than the
* start.
*/
AES_FUNC_START(aes_xctr_encrypt)
ctr_encrypt 1
AES_FUNC_END(aes_xctr_encrypt)
/*
* aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,

View File

@@ -66,7 +66,7 @@
prepare crypto_aes_inv_sbox, .LReverse_ShiftRows, \temp
.endm
/* apply SubBytes transformation using the the preloaded Sbox */
/* apply SubBytes transformation using the preloaded Sbox */
.macro sub_bytes, in
sub v9.16b, \in\().16b, v15.16b
tbl \in\().16b, {v16.16b-v19.16b}, \in\().16b

View File

@@ -52,7 +52,7 @@ static void neon_poly1305_blocks(struct poly1305_desc_ctx *dctx, const u8 *src,
{
if (unlikely(!dctx->sset)) {
if (!dctx->rset) {
poly1305_init_arch(dctx, src);
poly1305_init_arm64(&dctx->h, src);
src += POLY1305_BLOCK_SIZE;
len -= POLY1305_BLOCK_SIZE;
dctx->rset = 1;

View File

@@ -0,0 +1,361 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Implementation of POLYVAL using ARMv8 Crypto Extensions.
*
* Copyright 2021 Google LLC
*/
/*
* This is an efficient implementation of POLYVAL using ARMv8 Crypto Extensions.
* It works on 8 blocks at a time, by precomputing the first 8 key powers h^8,
* ..., h^1 in the POLYVAL finite field. This precomputation allows us to split
* finite field multiplication into two steps.
*
* In the first step, we consider h^i, m_i as normal polynomials of degree less
* than 128. We then compute p(x) = h^8m_0 + ... + h^1m_7 where multiplication
* is simply polynomial multiplication.
*
* In the second step, we compute the reduction of p(x) modulo the finite field
* modulus g(x) = x^128 + x^127 + x^126 + x^121 + 1.
*
* This two step process is equivalent to computing h^8m_0 + ... + h^1m_7 where
* multiplication is finite field multiplication. The advantage is that the
* two-step process only requires 1 finite field reduction for every 8
* polynomial multiplications. Further parallelism is gained by interleaving the
* multiplications and polynomial reductions.
*/
#include <linux/linkage.h>
#define STRIDE_BLOCKS 8
KEY_POWERS .req x0
MSG .req x1
BLOCKS_LEFT .req x2
ACCUMULATOR .req x3
KEY_START .req x10
EXTRA_BYTES .req x11
TMP .req x13
M0 .req v0
M1 .req v1
M2 .req v2
M3 .req v3
M4 .req v4
M5 .req v5
M6 .req v6
M7 .req v7
KEY8 .req v8
KEY7 .req v9
KEY6 .req v10
KEY5 .req v11
KEY4 .req v12
KEY3 .req v13
KEY2 .req v14
KEY1 .req v15
PL .req v16
PH .req v17
TMP_V .req v18
LO .req v20
MI .req v21
HI .req v22
SUM .req v23
GSTAR .req v24
.text
.arch armv8-a+crypto
.align 4
.Lgstar:
.quad 0xc200000000000000, 0xc200000000000000
/*
* Computes the product of two 128-bit polynomials in X and Y and XORs the
* components of the 256-bit product into LO, MI, HI.
*
* Given:
* X = [X_1 : X_0]
* Y = [Y_1 : Y_0]
*
* We compute:
* LO += X_0 * Y_0
* MI += (X_0 + X_1) * (Y_0 + Y_1)
* HI += X_1 * Y_1
*
* Later, the 256-bit result can be extracted as:
* [HI_1 : HI_0 + HI_1 + MI_1 + LO_1 : LO_1 + HI_0 + MI_0 + LO_0 : LO_0]
* This step is done when computing the polynomial reduction for efficiency
* reasons.
*
* Karatsuba multiplication is used instead of Schoolbook multiplication because
* it was found to be slightly faster on ARM64 CPUs.
*
*/
.macro karatsuba1 X Y
// Alias the two macro arguments for the duration of this expansion.
// Scratch registers used below: v25-v29.
X .req \X
Y .req \Y
// v25 = [X_0 : X_1] (the two 64-bit halves of X swapped)
ext v25.16b, X.16b, X.16b, #8
// v26 = [Y_0 : Y_1] (the two 64-bit halves of Y swapped)
ext v26.16b, Y.16b, Y.16b, #8
// v25 = [X_0 + X_1 : X_0 + X_1]
eor v25.16b, v25.16b, X.16b
// v26 = [Y_0 + Y_1 : Y_0 + Y_1]
eor v26.16b, v26.16b, Y.16b
// v28 = X_1 * Y_1 (product of the high halves)
pmull2 v28.1q, X.2d, Y.2d
// v29 = X_0 * Y_0 (product of the low halves)
pmull v29.1q, X.1d, Y.1d
// v27 = (X_0 + X_1) * (Y_0 + Y_1)
pmull v27.1q, v25.1d, v26.1d
// XOR the three partial products into the running accumulators.
eor HI.16b, HI.16b, v28.16b
eor LO.16b, LO.16b, v29.16b
eor MI.16b, MI.16b, v27.16b
// Release the aliases at the end of the expansion.
.unreq X
.unreq Y
.endm
/*
* Same as karatsuba1, except overwrites HI, LO, MI rather than XORing into
* them.
*/
.macro karatsuba1_store X Y
// Alias the two macro arguments for the duration of this expansion.
// Scratch registers used below: v25, v26.
X .req \X
Y .req \Y
// v25 = [X_0 : X_1] (the two 64-bit halves of X swapped)
ext v25.16b, X.16b, X.16b, #8
// v26 = [Y_0 : Y_1] (the two 64-bit halves of Y swapped)
ext v26.16b, Y.16b, Y.16b, #8
// v25 = [X_0 + X_1 : X_0 + X_1]
eor v25.16b, v25.16b, X.16b
// v26 = [Y_0 + Y_1 : Y_0 + Y_1]
eor v26.16b, v26.16b, Y.16b
// Write the products straight into the accumulators, discarding whatever
// HI, LO and MI held before:
// HI = X_1 * Y_1
pmull2 HI.1q, X.2d, Y.2d
// LO = X_0 * Y_0
pmull LO.1q, X.1d, Y.1d
// MI = (X_0 + X_1) * (Y_0 + Y_1)
pmull MI.1q, v25.1d, v26.1d
// Release the aliases at the end of the expansion.
.unreq X
.unreq Y
.endm
/*
* Computes the 256-bit polynomial represented by LO, HI, MI. Stores
* the result in PL, PH.
* [PH : PL] =
* [HI_1 : HI_1 + HI_0 + MI_1 + LO_1 : HI_0 + MI_0 + LO_1 + LO_0 : LO_0]
*/
.macro karatsuba2
// Note: v4 and v5 are used as scratch here, and they are the same registers
// as M4 and M5. Any message blocks held there are overwritten. HI and LO are
// also left with their halves swapped.
// v4 = [HI_1 + MI_1 : HI_0 + MI_0]
eor v4.16b, HI.16b, MI.16b
// v4 = [HI_1 + MI_1 + LO_1 : HI_0 + MI_0 + LO_0]
eor v4.16b, v4.16b, LO.16b
// v5 = [HI_0 : LO_1]
ext v5.16b, LO.16b, HI.16b, #8
// v4 = [HI_1 + HI_0 + MI_1 + LO_1 : HI_0 + MI_0 + LO_1 + LO_0]
eor v4.16b, v4.16b, v5.16b
// HI = [HI_0 : HI_1]
ext HI.16b, HI.16b, HI.16b, #8
// LO = [LO_0 : LO_1]
ext LO.16b, LO.16b, LO.16b, #8
// PH = [HI_1 : HI_1 + HI_0 + MI_1 + LO_1]
ext PH.16b, v4.16b, HI.16b, #8
// PL = [HI_0 + MI_0 + LO_1 + LO_0 : LO_0]
ext PL.16b, LO.16b, v4.16b, #8
.endm
/*
* Computes the 128-bit reduction of PH : PL. Stores the result in dest.
*
* This macro computes p(x) mod g(x) where p(x) is in Montgomery form and g(x) =
* x^128 + x^127 + x^126 + x^121 + 1.
*
* We have a 256-bit polynomial PH : PL = P_3 : P_2 : P_1 : P_0 that is the
* product of two 128-bit polynomials in Montgomery form. We need to reduce it
* mod g(x). Also, since polynomials in Montgomery form have an "extra" factor
* of x^128, this product has two extra factors of x^128. To get it back into
* Montgomery form, we need to remove one of these factors by dividing by x^128.
*
* To accomplish both of these goals, we add multiples of g(x) that cancel out
* the low 128 bits P_1 : P_0, leaving just the high 128 bits. Since the low
* bits are zero, the polynomial division by x^128 can be done by right
* shifting.
*
* Since the only nonzero term in the low 64 bits of g(x) is the constant term,
* the multiple of g(x) needed to cancel out P_0 is P_0 * g(x). The CPU can
* only do 64x64 bit multiplications, so split P_0 * g(x) into x^128 * P_0 +
* x^64 * g*(x) * P_0 + P_0, where g*(x) is bits 64-127 of g(x). Adding this to
* the original polynomial gives P_3 : P_2 + P_0 + T_1 : P_1 + T_0 : 0, where T
* = T_1 : T_0 = g*(x) * P_0. Thus, bits 0-63 got "folded" into bits 64-191.
*
* Repeating this same process on the next 64 bits "folds" bits 64-127 into bits
* 128-255, giving the answer in bits 128-255. This time, we need to cancel P_1
* + T_0 in bits 64-127. The multiple of g(x) required is (P_1 + T_0) * g(x) *
* x^64. Adding this to our previous computation gives P_3 + P_1 + T_0 + V_1 :
* P_2 + P_0 + T_1 + V_0 : 0 : 0, where V = V_1 : V_0 = g*(x) * (P_1 + T_0).
*
* So our final computation is:
* T = T_1 : T_0 = g*(x) * P_0
* V = V_1 : V_0 = g*(x) * (P_1 + T_0)
* p(x) / x^{128} mod g(x) = P_3 + P_1 + T_0 + V_1 : P_2 + P_0 + T_1 + V_0
*
* The implementation below saves a XOR instruction by computing P_1 + T_0 : P_0
* + T_1 and XORing into dest, rather than separately XORing P_1 : P_0 and T_0 :
* T_1 into dest. This allows us to reuse P_1 + T_0 when computing V.
*/
.macro montgomery_reduction dest
// Alias the destination register for the duration of this expansion.
// Inputs are PH : PL and GSTAR; PH and TMP_V are overwritten along the way.
DEST .req \dest
// TMP_V = T_1 : T_0 = P_0 * g*(x)
pmull TMP_V.1q, PL.1d, GSTAR.1d
// TMP_V = T_0 : T_1
ext TMP_V.16b, TMP_V.16b, TMP_V.16b, #8
// TMP_V = P_1 + T_0 : P_0 + T_1
eor TMP_V.16b, PL.16b, TMP_V.16b
// PH = P_3 + P_1 + T_0 : P_2 + P_0 + T_1
eor PH.16b, PH.16b, TMP_V.16b
// TMP_V = V_1 : V_0 = (P_1 + T_0) * g*(x)
pmull2 TMP_V.1q, TMP_V.2d, GSTAR.2d
// DEST = P_3 + P_1 + T_0 + V_1 : P_2 + P_0 + T_1 + V_0
eor DEST.16b, PH.16b, TMP_V.16b
// Release the alias at the end of the expansion.
.unreq DEST
.endm
/*
* Compute Polyval on 8 blocks.
*
* If reduce is set, also computes the montgomery reduction of the
* previous full_stride call and XORs with the first message block.
* (m_0 + REDUCE(PL, PH))h^8 + ... + m_7h^1.
* I.e., the first multiplication uses m_0 + REDUCE(PL, PH) instead of m_0.
*
* Sets PL, PH.
*/
.macro full_stride reduce
// Zero LO, MI and HI (x eor x = 0). NOTE(review): presumably these are the
// partial-product accumulators that karatsuba1 adds into and karatsuba2
// combines; both macros are defined outside this section.
eor LO.16b, LO.16b, LO.16b
eor MI.16b, MI.16b, MI.16b
eor HI.16b, HI.16b, HI.16b
// Load the eight message blocks of this stride; MSG advances by 128 bytes.
ld1 {M0.16b, M1.16b, M2.16b, M3.16b}, [MSG], #64
ld1 {M4.16b, M5.16b, M6.16b, M7.16b}, [MSG], #64
// The instructions guarded by ".if \reduce" below are, in order, exactly the
// montgomery_reduction sequence with SUM as destination, applied to the
// PL/PH left by the previous stride. They are spread between the karatsuba1
// invocations (presumably so the reduction overlaps with the multiplies;
// the source does not state the reason).
karatsuba1 M7 KEY1
.if \reduce
// TMP_V = T_1 : T_0 = P_0 * g*(x)
pmull TMP_V.1q, PL.1d, GSTAR.1d
.endif
karatsuba1 M6 KEY2
.if \reduce
// TMP_V = T_0 : T_1
ext TMP_V.16b, TMP_V.16b, TMP_V.16b, #8
.endif
karatsuba1 M5 KEY3
.if \reduce
// TMP_V = P_1 + T_0 : P_0 + T_1
eor TMP_V.16b, PL.16b, TMP_V.16b
.endif
karatsuba1 M4 KEY4
.if \reduce
// PH = P_3 + P_1 + T_0 : P_2 + P_0 + T_1
eor PH.16b, PH.16b, TMP_V.16b
.endif
karatsuba1 M3 KEY5
.if \reduce
// TMP_V = V_1 : V_0 = (P_1 + T_0) * g*(x)
pmull2 TMP_V.1q, TMP_V.2d, GSTAR.2d
.endif
karatsuba1 M2 KEY6
.if \reduce
// SUM = reduced result of the previous stride
eor SUM.16b, PH.16b, TMP_V.16b
.endif
karatsuba1 M1 KEY7
// Fold SUM into the first message block before its multiplication. With
// reduce == 0 no reduction ran above, so SUM is whatever the caller left
// in it.
eor M0.16b, M0.16b, SUM.16b
karatsuba1 M0 KEY8
karatsuba2
.endm
/*
* Handle any extra blocks after full_stride loop.
*/
.macro partial_stride
// KEY_POWERS = KEY_START + 16 * (STRIDE_BLOCKS - BLOCKS_LEFT), i.e. skip the
// leading key powers that the remaining blocks do not need. (Assumes the
// h^8 ... h^1 layout and STRIDE_BLOCKS == 8, in which case this points at
// h^BLOCKS_LEFT -- STRIDE_BLOCKS is defined outside this section.)
add KEY_POWERS, KEY_START, #(STRIDE_BLOCKS << 4)
sub KEY_POWERS, KEY_POWERS, BLOCKS_LEFT, lsl #4
// The KEYn / Mn registers are used as plain scratch below; their names do
// not indicate which power or block they hold.
ld1 {KEY1.16b}, [KEY_POWERS], #16
// First remaining block: XOR it into SUM, then multiply by the first power.
ld1 {TMP_V.16b}, [MSG], #16
eor SUM.16b, SUM.16b, TMP_V.16b
// NOTE(review): karatsuba1_store is defined outside this section;
// presumably it starts a fresh accumulation, since LO/MI/HI are not
// cleared here.
karatsuba1_store KEY1 SUM
sub BLOCKS_LEFT, BLOCKS_LEFT, #1
// Consume the rest according to the bits of BLOCKS_LEFT: 4, then 2, then 1.
tst BLOCKS_LEFT, #4
beq .Lpartial4BlocksDone
ld1 {M0.16b, M1.16b, M2.16b, M3.16b}, [MSG], #64
ld1 {KEY8.16b, KEY7.16b, KEY6.16b, KEY5.16b}, [KEY_POWERS], #64
karatsuba1 M0 KEY8
karatsuba1 M1 KEY7
karatsuba1 M2 KEY6
karatsuba1 M3 KEY5
.Lpartial4BlocksDone:
tst BLOCKS_LEFT, #2
beq .Lpartial2BlocksDone
ld1 {M0.16b, M1.16b}, [MSG], #32
ld1 {KEY8.16b, KEY7.16b}, [KEY_POWERS], #32
karatsuba1 M0 KEY8
karatsuba1 M1 KEY7
.Lpartial2BlocksDone:
tst BLOCKS_LEFT, #1
beq .LpartialDone
ld1 {M0.16b}, [MSG], #16
ld1 {KEY8.16b}, [KEY_POWERS], #16
karatsuba1 M0 KEY8
.LpartialDone:
// Combine the accumulated products and reduce the result into SUM.
karatsuba2
montgomery_reduction SUM
.endm
/*
* Perform montgomery multiplication in GF(2^128) and store result in op1.
*
* Computes op1*op2*x^{-128} mod x^128 + x^127 + x^126 + x^121 + 1
* If op1, op2 are in montgomery form, this computes the montgomery
* form of op1*op2.
*
* void pmull_polyval_mul(u8 *op1, const u8 *op2);
*/
SYM_FUNC_START(pmull_polyval_mul)
// Load the reduction constant stored at .Lgstar into GSTAR.
adr TMP, .Lgstar
ld1 {GSTAR.2d}, [TMP]
// v0 = 16 bytes at op1 (x0), v1 = 16 bytes at op2 (x1)
ld1 {v0.16b}, [x0]
ld1 {v1.16b}, [x1]
// Multiply, reduce into SUM, and write the result back over op1.
karatsuba1_store v0 v1
karatsuba2
montgomery_reduction SUM
st1 {SUM.16b}, [x0]
ret
SYM_FUNC_END(pmull_polyval_mul)
/*
* Perform polynomial evaluation as specified by POLYVAL. This computes:
* h^n * accumulator + h^n * m_0 + ... + h^1 * m_{n-1}
* where n=nblocks, h is the hash key, and m_i are the message blocks.
*
* x0 - pointer to precomputed key powers h^8 ... h^1
* x1 - pointer to message blocks
* x2 - number of blocks to hash
* x3 - pointer to accumulator
*
* void pmull_polyval_update(const struct polyval_tfm_ctx *keys, const u8 *in,
* size_t nblocks, u8 *accumulator);
*/
SYM_FUNC_START(pmull_polyval_update)
adr TMP, .Lgstar
// Remember the start of the key powers; partial_stride indexes from it.
mov KEY_START, KEY_POWERS
ld1 {GSTAR.2d}, [TMP]
// SUM starts as the caller's accumulator.
ld1 {SUM.16b}, [ACCUMULATOR]
// Fewer than STRIDE_BLOCKS blocks in total: go straight to the partial path.
subs BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS
blt .LstrideLoopExit
// Load eight consecutive key powers; the first one in memory lands in KEY8.
ld1 {KEY8.16b, KEY7.16b, KEY6.16b, KEY5.16b}, [KEY_POWERS], #64
ld1 {KEY4.16b, KEY3.16b, KEY2.16b, KEY1.16b}, [KEY_POWERS], #64
// First stride: there is no earlier product to reduce.
full_stride 0
subs BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS
blt .LstrideLoopExitReduce
.LstrideLoop:
// Later strides reduce the previous stride's product while multiplying.
full_stride 1
subs BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS
bge .LstrideLoop
.LstrideLoopExitReduce:
// Reduce the product of the last full stride into SUM.
montgomery_reduction SUM
.LstrideLoopExit:
// Undo the last subtraction: BLOCKS_LEFT = blocks still to hash. Zero means
// there is no partial stride.
adds BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS
beq .LskipPartial
partial_stride
.LskipPartial:
// Write the updated value back to the accumulator.
st1 {SUM.16b}, [ACCUMULATOR]
ret
SYM_FUNC_END(pmull_polyval_update)

View File

@@ -0,0 +1,191 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* Glue code for POLYVAL using ARMv8 Crypto Extensions
*
* Copyright (c) 2007 Nokia Siemens Networks - Mikko Herranen <mh1@iki.fi>
* Copyright (c) 2009 Intel Corp.
* Author: Huang Ying <ying.huang@intel.com>
* Copyright 2021 Google LLC
*/
/*
* Glue code based on ghash-clmulni-intel_glue.c.
*
* This implementation of POLYVAL uses montgomery multiplication accelerated by
* ARMv8 Crypto Extensions instructions to implement the finite field operations.
*/
#include <crypto/algapi.h>
#include <crypto/internal/hash.h>
#include <crypto/internal/simd.h>
#include <crypto/polyval.h>
#include <linux/crypto.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/cpufeature.h>
#include <asm/neon.h>
#include <asm/simd.h>
#define NUM_KEY_POWERS 8
/*
 * Per-key context: NUM_KEY_POWERS precomputed powers of the hash key, one
 * POLYVAL_BLOCK_SIZE-byte entry each. polyval_arm64_setkey() stores the key
 * itself in the last entry and derives the preceding entries by repeated
 * multiplication.
 */
struct polyval_tfm_ctx {
/*
* These powers must be in the order h^8, ..., h^1.
*/
u8 key_powers[NUM_KEY_POWERS][POLYVAL_BLOCK_SIZE];
};
/*
 * Per-request hashing state.
 *
 * buffer: the running accumulator. Input bytes of a partial block are XORed
 *         directly into it.
 * bytes:  number of bytes still missing to complete the current partial
 *         block; 0 when no partial block is pending.
 */
struct polyval_desc_ctx {
u8 buffer[POLYVAL_BLOCK_SIZE];
u32 bytes;
};
/*
 * Assembly routines. In this file they are only called from
 * internal_polyval_update() and internal_polyval_mul(), bracketed by
 * kernel_neon_begin() / kernel_neon_end().
 */
asmlinkage void pmull_polyval_update(const struct polyval_tfm_ctx *keys,
const u8 *in, size_t nblocks, u8 *accumulator);
asmlinkage void pmull_polyval_mul(u8 *op1, const u8 *op2);
/*
 * Hash nblocks full blocks from in into accumulator.
 *
 * Uses the PMULL assembly routine when SIMD is usable in the current
 * context; otherwise falls back to polyval_update_non4k(), which is given
 * only the last key_powers entry (the key itself).
 */
static void internal_polyval_update(const struct polyval_tfm_ctx *keys,
	const u8 *in, size_t nblocks, u8 *accumulator)
{
	if (!likely(crypto_simd_usable())) {
		polyval_update_non4k(keys->key_powers[NUM_KEY_POWERS-1], in,
				     nblocks, accumulator);
		return;
	}

	kernel_neon_begin();
	pmull_polyval_update(keys, in, nblocks, accumulator);
	kernel_neon_end();
}
/*
 * Multiply op1 by op2 and leave the result in op1.
 *
 * Uses the PMULL assembly routine when SIMD is usable in the current
 * context, and polyval_mul_non4k() otherwise.
 */
static void internal_polyval_mul(u8 *op1, const u8 *op2)
{
	if (!likely(crypto_simd_usable())) {
		polyval_mul_non4k(op1, op2);
		return;
	}

	kernel_neon_begin();
	pmull_polyval_mul(op1, op2);
	kernel_neon_end();
}
/*
 * Install a key and precompute its powers.
 *
 * The key goes into the last key_powers entry; walking towards index 0, each
 * entry is a copy of the key multiplied by the entry that follows it.
 *
 * Returns 0 on success, -EINVAL if keylen is not POLYVAL_BLOCK_SIZE.
 */
static int polyval_arm64_setkey(struct crypto_shash *tfm,
	const u8 *key, unsigned int keylen)
{
	struct polyval_tfm_ctx *tctx = crypto_shash_ctx(tfm);
	int idx = NUM_KEY_POWERS - 1;

	if (keylen != POLYVAL_BLOCK_SIZE)
		return -EINVAL;

	memcpy(tctx->key_powers[idx], key, POLYVAL_BLOCK_SIZE);

	/* idx runs NUM_KEY_POWERS-2 ... 0 inside the loop body */
	while (idx-- > 0) {
		u8 *cur = tctx->key_powers[idx];
		const u8 *next = tctx->key_powers[idx + 1];

		memcpy(cur, key, POLYVAL_BLOCK_SIZE);
		internal_polyval_mul(cur, next);
	}

	return 0;
}
/*
 * Start a new hash: zero the whole descriptor context (accumulator and
 * pending-byte count). Always returns 0.
 */
static int polyval_arm64_init(struct shash_desc *desc)
{
	struct polyval_desc_ctx *state = shash_desc_ctx(desc);

	memset(state, 0, sizeof(struct polyval_desc_ctx));

	return 0;
}
/*
 * Absorb srclen bytes from src into the hash state.
 *
 * Three phases: (1) complete a partial block left by an earlier call,
 * (2) hash all whole blocks in chunks of at most 4096 bytes, and (3) XOR any
 * trailing partial block into the accumulator and record how many bytes it
 * still lacks. Always returns 0.
 */
static int polyval_arm64_update(struct shash_desc *desc,
	const u8 *src, unsigned int srclen)
{
	struct polyval_desc_ctx *dctx = shash_desc_ctx(desc);
	const struct polyval_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
	unsigned int i;

	if (dctx->bytes) {
		unsigned int take = min(srclen, dctx->bytes);
		u8 *tail = dctx->buffer + POLYVAL_BLOCK_SIZE - dctx->bytes;

		for (i = 0; i < take; i++)
			tail[i] ^= src[i];
		src += take;
		srclen -= take;
		dctx->bytes -= take;

		/* The pending block is now complete: multiply by the key. */
		if (dctx->bytes == 0)
			internal_polyval_mul(dctx->buffer,
					     tctx->key_powers[NUM_KEY_POWERS-1]);
	}

	while (srclen >= POLYVAL_BLOCK_SIZE) {
		/* allow rescheduling every 4K bytes */
		unsigned int nblocks = min(srclen, 4096U) / POLYVAL_BLOCK_SIZE;
		unsigned int chunk = nblocks * POLYVAL_BLOCK_SIZE;

		internal_polyval_update(tctx, src, nblocks, dctx->buffer);
		src += chunk;
		srclen -= chunk;
	}

	if (srclen) {
		/* Keep the leftover in the accumulator until it is completed. */
		dctx->bytes = POLYVAL_BLOCK_SIZE - srclen;
		for (i = 0; i < srclen; i++)
			dctx->buffer[i] ^= src[i];
	}

	return 0;
}
/*
 * Produce the digest.
 *
 * If a partial block is still pending (its bytes are already XORed into the
 * accumulator), multiply the accumulator by the key once more; then copy
 * POLYVAL_BLOCK_SIZE bytes of the accumulator to dst. Always returns 0.
 */
static int polyval_arm64_final(struct shash_desc *desc, u8 *dst)
{
	struct polyval_desc_ctx *dctx = shash_desc_ctx(desc);
	const struct polyval_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
	u8 *acc = dctx->buffer;

	if (dctx->bytes != 0)
		internal_polyval_mul(acc, tctx->key_powers[NUM_KEY_POWERS-1]);

	memcpy(dst, acc, POLYVAL_BLOCK_SIZE);

	return 0;
}
/* shash descriptor for the ARMv8 Crypto Extensions POLYVAL driver. */
static struct shash_alg polyval_alg = {
	.base.cra_name		= "polyval",
	.base.cra_driver_name	= "polyval-ce",
	.base.cra_priority	= 200,
	.base.cra_blocksize	= POLYVAL_BLOCK_SIZE,
	.base.cra_ctxsize	= sizeof(struct polyval_tfm_ctx),
	.base.cra_module	= THIS_MODULE,

	.digestsize		= POLYVAL_DIGEST_SIZE,
	.descsize		= sizeof(struct polyval_desc_ctx),

	.setkey			= polyval_arm64_setkey,
	.init			= polyval_arm64_init,
	.update			= polyval_arm64_update,
	.final			= polyval_arm64_final,
};
/* Module init: register polyval_alg; returns crypto_register_shash()'s result. */
static int __init polyval_ce_mod_init(void)
{
return crypto_register_shash(&polyval_alg);
}
/* Module exit: unregister polyval_alg. */
static void __exit polyval_ce_mod_exit(void)
{
crypto_unregister_shash(&polyval_alg);
}
/*
 * NOTE(review): module_cpu_feature_match() presumably ties running
 * polyval_ce_mod_init (and module autoloading) to CPUs that advertise the
 * PMULL feature -- confirm against <linux/cpufeature.h>.
 */
module_cpu_feature_match(PMULL, polyval_ce_mod_init)
module_exit(polyval_ce_mod_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("POLYVAL hash function accelerated by ARMv8 Crypto Extensions");
/* Aliases match cra_name and cra_driver_name in polyval_alg. */
MODULE_ALIAS_CRYPTO("polyval");
MODULE_ALIAS_CRYPTO("polyval-ce");

View File

@@ -28,7 +28,7 @@
* instructions per clock cycle using one 32/64 bit unit (SU1) and one 32
* bit unit (SU2). One of these can be a memory access that is executed via
* a single load and store unit (LSU). XTS-AES-256 takes ~780 operations per
* 16 byte block block or 25 cycles per byte. Thus 768 bytes of input data
* 16 byte block or 25 cycles per byte. Thus 768 bytes of input data
* will need an estimated maximum of 20,000 cycles. Headroom for cache misses
* included. Even with the low end model clocked at 667 MHz this equals to a
* critical time window of less than 30us. The value has been chosen to

View File

@@ -61,14 +61,15 @@ sha256-ssse3-$(CONFIG_AS_SHA256_NI) += sha256_ni_asm.o
obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o
sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o
obj-$(CONFIG_CRYPTO_BLAKE2S_X86) += blake2s-x86_64.o
blake2s-x86_64-y := blake2s-shash.o
obj-$(if $(CONFIG_CRYPTO_BLAKE2S_X86),y) += libblake2s-x86_64.o
obj-$(CONFIG_CRYPTO_BLAKE2S_X86) += libblake2s-x86_64.o
libblake2s-x86_64-y := blake2s-core.o blake2s-glue.o
obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
obj-$(CONFIG_CRYPTO_POLYVAL_CLMUL_NI) += polyval-clmulni.o
polyval-clmulni-y := polyval-clmulni_asm.o polyval-clmulni_glue.o
obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o
crc32c-intel-y := crc32c-intel_glue.o
crc32c-intel-$(CONFIG_64BIT) += crc32c-pcl-intel-asm_64.o

View File

@@ -23,6 +23,11 @@
#define VMOVDQ vmovdqu
/*
* Note: the "x" prefix in these aliases means "this is an xmm register". The
* alias prefixes have no relation to XCTR where the "X" prefix means "XOR
* counter".
*/
#define xdata0 %xmm0
#define xdata1 %xmm1
#define xdata2 %xmm2
@@ -31,8 +36,10 @@
#define xdata5 %xmm5
#define xdata6 %xmm6
#define xdata7 %xmm7
#define xcounter %xmm8
#define xbyteswap %xmm9
#define xcounter %xmm8 // CTR mode only
#define xiv %xmm8 // XCTR mode only
#define xbyteswap %xmm9 // CTR mode only
#define xtmp %xmm9 // XCTR mode only
#define xkey0 %xmm10
#define xkey4 %xmm11
#define xkey8 %xmm12
@@ -45,7 +52,7 @@
#define p_keys %rdx
#define p_out %rcx
#define num_bytes %r8
#define counter %r9 // XCTR mode only
#define tmp %r10
#define DDQ_DATA 0
#define XDATA 1
@@ -102,7 +109,7 @@ ddq_add_8:
* do_aes num_in_par load_keys key_len
* This increments p_in, but not p_out
*/
.macro do_aes b, k, key_len
.macro do_aes b, k, key_len, xctr
.set by, \b
.set load_keys, \k
.set klen, \key_len
@@ -111,29 +118,48 @@ ddq_add_8:
vmovdqa 0*16(p_keys), xkey0
.endif
vpshufb xbyteswap, xcounter, xdata0
.set i, 1
.rept (by - 1)
club XDATA, i
vpaddq (ddq_add_1 + 16 * (i - 1))(%rip), xcounter, var_xdata
vptest ddq_low_msk(%rip), var_xdata
jnz 1f
vpaddq ddq_high_add_1(%rip), var_xdata, var_xdata
vpaddq ddq_high_add_1(%rip), xcounter, xcounter
1:
vpshufb xbyteswap, var_xdata, var_xdata
.set i, (i +1)
.endr
.if \xctr
movq counter, xtmp
.set i, 0
.rept (by)
club XDATA, i
vpaddq (ddq_add_1 + 16 * i)(%rip), xtmp, var_xdata
.set i, (i +1)
.endr
.set i, 0
.rept (by)
club XDATA, i
vpxor xiv, var_xdata, var_xdata
.set i, (i +1)
.endr
.else
vpshufb xbyteswap, xcounter, xdata0
.set i, 1
.rept (by - 1)
club XDATA, i
vpaddq (ddq_add_1 + 16 * (i - 1))(%rip), xcounter, var_xdata
vptest ddq_low_msk(%rip), var_xdata
jnz 1f
vpaddq ddq_high_add_1(%rip), var_xdata, var_xdata
vpaddq ddq_high_add_1(%rip), xcounter, xcounter
1:
vpshufb xbyteswap, var_xdata, var_xdata
.set i, (i +1)
.endr
.endif
vmovdqa 1*16(p_keys), xkeyA
vpxor xkey0, xdata0, xdata0
vpaddq (ddq_add_1 + 16 * (by - 1))(%rip), xcounter, xcounter
vptest ddq_low_msk(%rip), xcounter
jnz 1f
vpaddq ddq_high_add_1(%rip), xcounter, xcounter
1:
.if \xctr
add $by, counter
.else
vpaddq (ddq_add_1 + 16 * (by - 1))(%rip), xcounter, xcounter
vptest ddq_low_msk(%rip), xcounter
jnz 1f
vpaddq ddq_high_add_1(%rip), xcounter, xcounter
1:
.endif
.set i, 1
.rept (by - 1)
@@ -371,94 +397,99 @@ ddq_add_8:
.endr
.endm
.macro do_aes_load val, key_len
do_aes \val, 1, \key_len
.macro do_aes_load val, key_len, xctr
do_aes \val, 1, \key_len, \xctr
.endm
.macro do_aes_noload val, key_len
do_aes \val, 0, \key_len
.macro do_aes_noload val, key_len, xctr
do_aes \val, 0, \key_len, \xctr
.endm
/* main body of aes ctr load */
.macro do_aes_ctrmain key_len
.macro do_aes_ctrmain key_len, xctr
cmp $16, num_bytes
jb .Ldo_return2\key_len
jb .Ldo_return2\xctr\key_len
vmovdqa byteswap_const(%rip), xbyteswap
vmovdqu (p_iv), xcounter
vpshufb xbyteswap, xcounter, xcounter
.if \xctr
shr $4, counter
vmovdqu (p_iv), xiv
.else
vmovdqa byteswap_const(%rip), xbyteswap
vmovdqu (p_iv), xcounter
vpshufb xbyteswap, xcounter, xcounter
.endif
mov num_bytes, tmp
and $(7*16), tmp
jz .Lmult_of_8_blks\key_len
jz .Lmult_of_8_blks\xctr\key_len
/* 1 <= tmp <= 7 */
cmp $(4*16), tmp
jg .Lgt4\key_len
je .Leq4\key_len
jg .Lgt4\xctr\key_len
je .Leq4\xctr\key_len
.Llt4\key_len:
.Llt4\xctr\key_len:
cmp $(2*16), tmp
jg .Leq3\key_len
je .Leq2\key_len
jg .Leq3\xctr\key_len
je .Leq2\xctr\key_len
.Leq1\key_len:
do_aes_load 1, \key_len
.Leq1\xctr\key_len:
do_aes_load 1, \key_len, \xctr
add $(1*16), p_out
and $(~7*16), num_bytes
jz .Ldo_return2\key_len
jmp .Lmain_loop2\key_len
jz .Ldo_return2\xctr\key_len
jmp .Lmain_loop2\xctr\key_len
.Leq2\key_len:
do_aes_load 2, \key_len
.Leq2\xctr\key_len:
do_aes_load 2, \key_len, \xctr
add $(2*16), p_out
and $(~7*16), num_bytes
jz .Ldo_return2\key_len
jmp .Lmain_loop2\key_len
jz .Ldo_return2\xctr\key_len
jmp .Lmain_loop2\xctr\key_len
.Leq3\key_len:
do_aes_load 3, \key_len
.Leq3\xctr\key_len:
do_aes_load 3, \key_len, \xctr
add $(3*16), p_out
and $(~7*16), num_bytes
jz .Ldo_return2\key_len
jmp .Lmain_loop2\key_len
jz .Ldo_return2\xctr\key_len
jmp .Lmain_loop2\xctr\key_len
.Leq4\key_len:
do_aes_load 4, \key_len
.Leq4\xctr\key_len:
do_aes_load 4, \key_len, \xctr
add $(4*16), p_out
and $(~7*16), num_bytes
jz .Ldo_return2\key_len
jmp .Lmain_loop2\key_len
jz .Ldo_return2\xctr\key_len
jmp .Lmain_loop2\xctr\key_len
.Lgt4\key_len:
.Lgt4\xctr\key_len:
cmp $(6*16), tmp
jg .Leq7\key_len
je .Leq6\key_len
jg .Leq7\xctr\key_len
je .Leq6\xctr\key_len
.Leq5\key_len:
do_aes_load 5, \key_len
.Leq5\xctr\key_len:
do_aes_load 5, \key_len, \xctr
add $(5*16), p_out
and $(~7*16), num_bytes
jz .Ldo_return2\key_len
jmp .Lmain_loop2\key_len
jz .Ldo_return2\xctr\key_len
jmp .Lmain_loop2\xctr\key_len
.Leq6\key_len:
do_aes_load 6, \key_len
.Leq6\xctr\key_len:
do_aes_load 6, \key_len, \xctr
add $(6*16), p_out
and $(~7*16), num_bytes
jz .Ldo_return2\key_len
jmp .Lmain_loop2\key_len
jz .Ldo_return2\xctr\key_len
jmp .Lmain_loop2\xctr\key_len
.Leq7\key_len:
do_aes_load 7, \key_len
.Leq7\xctr\key_len:
do_aes_load 7, \key_len, \xctr
add $(7*16), p_out
and $(~7*16), num_bytes
jz .Ldo_return2\key_len
jmp .Lmain_loop2\key_len
jz .Ldo_return2\xctr\key_len
jmp .Lmain_loop2\xctr\key_len
.Lmult_of_8_blks\key_len:
.Lmult_of_8_blks\xctr\key_len:
.if (\key_len != KEY_128)
vmovdqa 0*16(p_keys), xkey0
vmovdqa 4*16(p_keys), xkey4
@@ -471,17 +502,19 @@ ddq_add_8:
vmovdqa 9*16(p_keys), xkey12
.endif
.align 16
.Lmain_loop2\key_len:
.Lmain_loop2\xctr\key_len:
/* num_bytes is a multiple of 8 and >0 */
do_aes_noload 8, \key_len
do_aes_noload 8, \key_len, \xctr
add $(8*16), p_out
sub $(8*16), num_bytes
jne .Lmain_loop2\key_len
jne .Lmain_loop2\xctr\key_len
.Ldo_return2\key_len:
/* return updated IV */
vpshufb xbyteswap, xcounter, xcounter
vmovdqu xcounter, (p_iv)
.Ldo_return2\xctr\key_len:
.if !\xctr
/* return updated IV */
vpshufb xbyteswap, xcounter, xcounter
vmovdqu xcounter, (p_iv)
.endif
RET
.endm
@@ -494,7 +527,7 @@ ddq_add_8:
*/
SYM_FUNC_START(aes_ctr_enc_128_avx_by8)
/* call the aes main loop */
do_aes_ctrmain KEY_128
do_aes_ctrmain KEY_128 0
SYM_FUNC_END(aes_ctr_enc_128_avx_by8)
@@ -507,7 +540,7 @@ SYM_FUNC_END(aes_ctr_enc_128_avx_by8)
*/
SYM_FUNC_START(aes_ctr_enc_192_avx_by8)
/* call the aes main loop */
do_aes_ctrmain KEY_192
do_aes_ctrmain KEY_192 0
SYM_FUNC_END(aes_ctr_enc_192_avx_by8)
@@ -520,6 +553,45 @@ SYM_FUNC_END(aes_ctr_enc_192_avx_by8)
*/
SYM_FUNC_START(aes_ctr_enc_256_avx_by8)
/* call the aes main loop */
do_aes_ctrmain KEY_256
do_aes_ctrmain KEY_256 0
SYM_FUNC_END(aes_ctr_enc_256_avx_by8)
/*
* routine to do AES128 XCTR enc/decrypt "by8"
* XMM registers are clobbered.
* Saving/restoring must be done at a higher level
* aes_xctr_enc_128_avx_by8(const u8 *in, const u8 *iv, const void *keys,
* u8* out, unsigned int num_bytes, unsigned int byte_ctr)
*/
SYM_FUNC_START(aes_xctr_enc_128_avx_by8)
/* call the aes main loop */
/* second macro argument (xctr) = 1 selects the XCTR code paths */
do_aes_ctrmain KEY_128 1
SYM_FUNC_END(aes_xctr_enc_128_avx_by8)
/*
* routine to do AES192 XCTR enc/decrypt "by8"
* XMM registers are clobbered.
* Saving/restoring must be done at a higher level
* aes_xctr_enc_192_avx_by8(const u8 *in, const u8 *iv, const void *keys,
* u8* out, unsigned int num_bytes, unsigned int byte_ctr)
*/
SYM_FUNC_START(aes_xctr_enc_192_avx_by8)
/* call the aes main loop */
/* second macro argument (xctr) = 1 selects the XCTR code paths */
do_aes_ctrmain KEY_192 1
SYM_FUNC_END(aes_xctr_enc_192_avx_by8)
/*
* routine to do AES256 XCTR enc/decrypt "by8"
* XMM registers are clobbered.
* Saving/restoring must be done at a higher level
* aes_xctr_enc_256_avx_by8(const u8 *in, const u8 *iv, const void *keys,
* u8* out, unsigned int num_bytes, unsigned int byte_ctr)
*/
SYM_FUNC_START(aes_xctr_enc_256_avx_by8)
/* call the aes main loop */
/* second macro argument (xctr) = 1 selects the XCTR code paths */
do_aes_ctrmain KEY_256 1
SYM_FUNC_END(aes_xctr_enc_256_avx_by8)

View File

@@ -135,6 +135,20 @@ asmlinkage void aes_ctr_enc_192_avx_by8(const u8 *in, u8 *iv,
void *keys, u8 *out, unsigned int num_bytes);
asmlinkage void aes_ctr_enc_256_avx_by8(const u8 *in, u8 *iv,
void *keys, u8 *out, unsigned int num_bytes);
/*
 * XCTR assembly routines, one per AES key size. Compared with the
 * aes_ctr_enc_*_avx_by8() prototypes, iv and keys are const here and the
 * position in the stream is passed separately as byte_ctr.
 */
asmlinkage void aes_xctr_enc_128_avx_by8(const u8 *in, const u8 *iv,
const void *keys, u8 *out, unsigned int num_bytes,
unsigned int byte_ctr);
asmlinkage void aes_xctr_enc_192_avx_by8(const u8 *in, const u8 *iv,
const void *keys, u8 *out, unsigned int num_bytes,
unsigned int byte_ctr);
asmlinkage void aes_xctr_enc_256_avx_by8(const u8 *in, const u8 *iv,
const void *keys, u8 *out, unsigned int num_bytes,
unsigned int byte_ctr);
/*
* asmlinkage void aesni_gcm_init_avx_gen2()
* gcm_data *my_ctx_data, context data
@@ -527,6 +541,59 @@ static int ctr_crypt(struct skcipher_request *req)
return err;
}
/*
 * Dispatch to the XCTR assembly routine matching the key length stored in
 * ctx. Any key length other than 128 or 192 bits takes the 256-bit routine.
 */
static void aesni_xctr_enc_avx_tfm(struct crypto_aes_ctx *ctx, u8 *out,
	const u8 *in, unsigned int len, u8 *iv,
	unsigned int byte_ctr)
{
	switch (ctx->key_length) {
	case AES_KEYSIZE_128:
		aes_xctr_enc_128_avx_by8(in, iv, (void *)ctx, out, len,
					 byte_ctr);
		break;
	case AES_KEYSIZE_192:
		aes_xctr_enc_192_avx_by8(in, iv, (void *)ctx, out, len,
					 byte_ctr);
		break;
	default:
		aes_xctr_enc_256_avx_by8(in, iv, (void *)ctx, out, len,
					 byte_ctr);
		break;
	}
}
/*
 * XCTR encryption/decryption (the same routine is used for both).
 *
 * Walks the request; in each step the whole-block part is handed to the
 * assembly routine, and a trailing partial block -- only handled in the
 * step that covers the end of the request -- is processed with a single
 * aesni_enc() call. byte_ctr tracks how many bytes have been processed so
 * far. Returns the last status from the skcipher walk.
 */
static int xctr_crypt(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm));
u8 keystream[AES_BLOCK_SIZE];
struct skcipher_walk walk;
unsigned int nbytes;
unsigned int byte_ctr = 0;
int err;
__le32 block[AES_BLOCK_SIZE / sizeof(__le32)];
err = skcipher_walk_virt(&walk, req, false);
while ((nbytes = walk.nbytes) > 0) {
kernel_fpu_begin();
/* Whole blocks of this step go through the by8 assembly routine. */
if (nbytes & AES_BLOCK_MASK)
aesni_xctr_enc_avx_tfm(ctx, walk.dst.virt.addr,
walk.src.virt.addr, nbytes & AES_BLOCK_MASK,
walk.iv, byte_ctr);
/* nbytes becomes the sub-block remainder of this step. */
nbytes &= ~AES_BLOCK_MASK;
byte_ctr += walk.nbytes - nbytes;
/* Final partial block: only when this step reaches the end. */
if (walk.nbytes == walk.total && nbytes > 0) {
/*
 * Build the counter block: IV with the 1-based block index XORed
 * into its first little-endian 32-bit word.
 */
memcpy(block, walk.iv, AES_BLOCK_SIZE);
block[0] ^= cpu_to_le32(1 + byte_ctr / AES_BLOCK_SIZE);
aesni_enc(ctx, keystream, (u8 *)block);
/* XOR only the remaining nbytes of keystream into the tail. */
crypto_xor_cpy(walk.dst.virt.addr + walk.nbytes -
nbytes, walk.src.virt.addr + walk.nbytes
- nbytes, keystream, nbytes);
byte_ctr += nbytes;
nbytes = 0;
}
kernel_fpu_end();
/* nbytes is the number of bytes of this step left unprocessed. */
err = skcipher_walk_done(&walk, nbytes);
}
return err;
}
static int
rfc4106_set_hash_subkey(u8 *hash_subkey, const u8 *key, unsigned int key_len)
{
@@ -1050,6 +1117,33 @@ static struct skcipher_alg aesni_skciphers[] = {
static
struct simd_skcipher_alg *aesni_simd_skciphers[ARRAY_SIZE(aesni_skciphers)];
#ifdef CONFIG_X86_64
/*
* XCTR does not have a non-AVX implementation, so it must be enabled
* conditionally.
*/
/*
 * Internal (CRYPTO_ALG_INTERNAL) XCTR-AES skcipher. It is kept out of the
 * aesni_skciphers[] array so aesni_init() can register it separately, only
 * when the CPU has AVX.
 */
static struct skcipher_alg aesni_xctr = {
.base = {
.cra_name = "__xctr(aes)",
.cra_driver_name = "__xctr-aes-aesni",
.cra_priority = 400,
.cra_flags = CRYPTO_ALG_INTERNAL,
/* byte-granular lengths; data is still processed in 16-byte chunks */
.cra_blocksize = 1,
.cra_ctxsize = CRYPTO_AES_CTX_SIZE,
.cra_module = THIS_MODULE,
},
.min_keysize = AES_MIN_KEY_SIZE,
.max_keysize = AES_MAX_KEY_SIZE,
.ivsize = AES_BLOCK_SIZE,
.chunksize = AES_BLOCK_SIZE,
.setkey = aesni_skcipher_setkey,
/* encryption and decryption are the same operation */
.encrypt = xctr_crypt,
.decrypt = xctr_crypt,
};
static struct simd_skcipher_alg *aesni_simd_xctr;
#endif /* CONFIG_X86_64 */
#ifdef CONFIG_X86_64
static int generic_gcmaes_set_key(struct crypto_aead *aead, const u8 *key,
unsigned int key_len)
@@ -1163,7 +1257,7 @@ static int __init aesni_init(void)
static_call_update(aesni_ctr_enc_tfm, aesni_ctr_enc_avx_tfm);
pr_info("AES CTR mode by8 optimization enabled\n");
}
#endif
#endif /* CONFIG_X86_64 */
err = crypto_register_alg(&aesni_cipher_alg);
if (err)
@@ -1180,8 +1274,22 @@ static int __init aesni_init(void)
if (err)
goto unregister_skciphers;
#ifdef CONFIG_X86_64
if (boot_cpu_has(X86_FEATURE_AVX))
err = simd_register_skciphers_compat(&aesni_xctr, 1,
&aesni_simd_xctr);
if (err)
goto unregister_aeads;
#endif /* CONFIG_X86_64 */
return 0;
#ifdef CONFIG_X86_64
unregister_aeads:
simd_unregister_aeads(aesni_aeads, ARRAY_SIZE(aesni_aeads),
aesni_simd_aeads);
#endif /* CONFIG_X86_64 */
unregister_skciphers:
simd_unregister_skciphers(aesni_skciphers, ARRAY_SIZE(aesni_skciphers),
aesni_simd_skciphers);
@@ -1197,6 +1305,10 @@ static void __exit aesni_exit(void)
simd_unregister_skciphers(aesni_skciphers, ARRAY_SIZE(aesni_skciphers),
aesni_simd_skciphers);
crypto_unregister_alg(&aesni_cipher_alg);
#ifdef CONFIG_X86_64
if (boot_cpu_has(X86_FEATURE_AVX))
simd_unregister_skciphers(&aesni_xctr, 1, &aesni_simd_xctr);
#endif /* CONFIG_X86_64 */
}
late_initcall(aesni_init);

View File

@@ -4,7 +4,6 @@
*/
#include <crypto/internal/blake2s.h>
#include <crypto/internal/simd.h>
#include <linux/types.h>
#include <linux/jump_label.h>
@@ -33,7 +32,7 @@ void blake2s_compress(struct blake2s_state *state, const u8 *block,
/* SIMD disables preemption, so relax after processing each page. */
BUILD_BUG_ON(SZ_4K / BLAKE2S_BLOCK_SIZE < 8);
if (!static_branch_likely(&blake2s_use_ssse3) || !crypto_simd_usable()) {
if (!static_branch_likely(&blake2s_use_ssse3) || !may_use_simd()) {
blake2s_compress_generic(state, block, nblocks, inc);
return;
}

View File

@@ -1,77 +0,0 @@
// SPDX-License-Identifier: GPL-2.0 OR MIT
/*
* Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
*/
#include <crypto/internal/blake2s.h>
#include <crypto/internal/simd.h>
#include <crypto/internal/hash.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/sizes.h>
#include <asm/cpufeature.h>
#include <asm/processor.h>
static int crypto_blake2s_update_x86(struct shash_desc *desc,
const u8 *in, unsigned int inlen)
{
return crypto_blake2s_update(desc, in, inlen, false);
}
static int crypto_blake2s_final_x86(struct shash_desc *desc, u8 *out)
{
return crypto_blake2s_final(desc, out, false);
}
#define BLAKE2S_ALG(name, driver_name, digest_size) \
{ \
.base.cra_name = name, \
.base.cra_driver_name = driver_name, \
.base.cra_priority = 200, \
.base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, \
.base.cra_blocksize = BLAKE2S_BLOCK_SIZE, \
.base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx), \
.base.cra_module = THIS_MODULE, \
.digestsize = digest_size, \
.setkey = crypto_blake2s_setkey, \
.init = crypto_blake2s_init, \
.update = crypto_blake2s_update_x86, \
.final = crypto_blake2s_final_x86, \
.descsize = sizeof(struct blake2s_state), \
}
static struct shash_alg blake2s_algs[] = {
BLAKE2S_ALG("blake2s-128", "blake2s-128-x86", BLAKE2S_128_HASH_SIZE),
BLAKE2S_ALG("blake2s-160", "blake2s-160-x86", BLAKE2S_160_HASH_SIZE),
BLAKE2S_ALG("blake2s-224", "blake2s-224-x86", BLAKE2S_224_HASH_SIZE),
BLAKE2S_ALG("blake2s-256", "blake2s-256-x86", BLAKE2S_256_HASH_SIZE),
};
static int __init blake2s_mod_init(void)
{
if (IS_REACHABLE(CONFIG_CRYPTO_HASH) && boot_cpu_has(X86_FEATURE_SSSE3))
return crypto_register_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs));
return 0;
}
static void __exit blake2s_mod_exit(void)
{
if (IS_REACHABLE(CONFIG_CRYPTO_HASH) && boot_cpu_has(X86_FEATURE_SSSE3))
crypto_unregister_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs));
}
module_init(blake2s_mod_init);
module_exit(blake2s_mod_exit);
MODULE_ALIAS_CRYPTO("blake2s-128");
MODULE_ALIAS_CRYPTO("blake2s-128-x86");
MODULE_ALIAS_CRYPTO("blake2s-160");
MODULE_ALIAS_CRYPTO("blake2s-160-x86");
MODULE_ALIAS_CRYPTO("blake2s-224");
MODULE_ALIAS_CRYPTO("blake2s-224-x86");
MODULE_ALIAS_CRYPTO("blake2s-256");
MODULE_ALIAS_CRYPTO("blake2s-256-x86");
MODULE_LICENSE("GPL v2");

Some files were not shown because too many files have changed in this diff Show More