From 116db2704c193fff6d73ea6c2219625f0c9bdfc8 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 19 Dec 2022 21:40:40 -0800 Subject: [PATCH 001/156] crypto: x86/ghash - fix unaligned access in ghash_setkey() The key can be unaligned, so use the unaligned memory access helpers. Fixes: 8ceee72808d1 ("crypto: ghash-clmulni-intel - use C implementation for setkey()") Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- arch/x86/crypto/ghash-clmulni-intel_glue.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c index 1f1a95f3dd0c..c0ab0ff4af65 100644 --- a/arch/x86/crypto/ghash-clmulni-intel_glue.c +++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c @@ -19,6 +19,7 @@ #include #include #include +#include #define GHASH_BLOCK_SIZE 16 #define GHASH_DIGEST_SIZE 16 @@ -54,15 +55,14 @@ static int ghash_setkey(struct crypto_shash *tfm, const u8 *key, unsigned int keylen) { struct ghash_ctx *ctx = crypto_shash_ctx(tfm); - be128 *x = (be128 *)key; u64 a, b; if (keylen != GHASH_BLOCK_SIZE) return -EINVAL; /* perform multiplication by 'x' in GF(2^128) */ - a = be64_to_cpu(x->a); - b = be64_to_cpu(x->b); + a = get_unaligned_be64(key); + b = get_unaligned_be64(key + 8); ctx->shash.a = (b << 1) | (a >> 63); ctx->shash.b = (a << 1) | (b >> 63); From f1740751f793d1ee5f0bd0639f68f49c4ccb94a9 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 19 Dec 2022 21:40:41 -0800 Subject: [PATCH 002/156] crypto: x86/ghash - use le128 instead of u128 The u128 struct type is going away, so make ghash-clmulni-intel use le128 instead. Note that the field names a and b swapped, as they were backwards with u128. (a is meant to be high-order and b low-order.) Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- arch/x86/crypto/ghash-clmulni-intel_asm.S | 4 ++-- arch/x86/crypto/ghash-clmulni-intel_glue.c | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/x86/crypto/ghash-clmulni-intel_asm.S b/arch/x86/crypto/ghash-clmulni-intel_asm.S index 2bf871899920..9dfeb4d31b92 100644 --- a/arch/x86/crypto/ghash-clmulni-intel_asm.S +++ b/arch/x86/crypto/ghash-clmulni-intel_asm.S @@ -88,7 +88,7 @@ SYM_FUNC_START_LOCAL(__clmul_gf128mul_ble) RET SYM_FUNC_END(__clmul_gf128mul_ble) -/* void clmul_ghash_mul(char *dst, const u128 *shash) */ +/* void clmul_ghash_mul(char *dst, const le128 *shash) */ SYM_FUNC_START(clmul_ghash_mul) FRAME_BEGIN movups (%rdi), DATA @@ -104,7 +104,7 @@ SYM_FUNC_END(clmul_ghash_mul) /* * void clmul_ghash_update(char *dst, const char *src, unsigned int srclen, - * const u128 *shash); + * const le128 *shash); */ SYM_FUNC_START(clmul_ghash_update) FRAME_BEGIN diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c index c0ab0ff4af65..9453b094bb3b 100644 --- a/arch/x86/crypto/ghash-clmulni-intel_glue.c +++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c @@ -24,17 +24,17 @@ #define GHASH_BLOCK_SIZE 16 #define GHASH_DIGEST_SIZE 16 -void clmul_ghash_mul(char *dst, const u128 *shash); +void clmul_ghash_mul(char *dst, const le128 *shash); void clmul_ghash_update(char *dst, const char *src, unsigned int srclen, - const u128 *shash); + const le128 *shash); struct ghash_async_ctx { struct cryptd_ahash *cryptd_tfm; }; struct ghash_ctx { - u128 shash; + le128 shash; }; struct ghash_desc_ctx { @@ -64,11 +64,11 @@ static int ghash_setkey(struct crypto_shash *tfm, a = get_unaligned_be64(key); b = get_unaligned_be64(key + 8); - ctx->shash.a = (b << 1) | (a >> 63); - ctx->shash.b = (a << 1) | (b >> 63); + ctx->shash.a = cpu_to_le64((a << 1) | (b >> 63)); + ctx->shash.b = cpu_to_le64((b << 1) | (a >> 63)); if (a >> 63) - ctx->shash.b ^= ((u64)0xc2) << 56; + ctx->shash.a ^= cpu_to_le64((u64)0xc2 << 56); return 0; } From 750426d63368c98404c917500b3687d7aed1a484 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 19 Dec 2022 21:40:42 -0800 Subject: [PATCH 003/156] crypto: x86/ghash - add comment and fix broken link Add a comment that explains what ghash_setkey() is doing, as it's hard to understand otherwise. Also fix a broken hyperlink. Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- arch/x86/crypto/ghash-clmulni-intel_asm.S | 2 +- arch/x86/crypto/ghash-clmulni-intel_glue.c | 27 ++++++++++++++++++---- 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/arch/x86/crypto/ghash-clmulni-intel_asm.S b/arch/x86/crypto/ghash-clmulni-intel_asm.S index 9dfeb4d31b92..257ed9446f3e 100644 --- a/arch/x86/crypto/ghash-clmulni-intel_asm.S +++ b/arch/x86/crypto/ghash-clmulni-intel_asm.S @@ -4,7 +4,7 @@ * instructions. This file contains accelerated part of ghash * implementation. More information about PCLMULQDQ can be found at: * - * http://software.intel.com/en-us/articles/carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/ + * https://www.intel.com/content/dam/develop/external/us/en/documents/clmul-wp-rev-2-02-2014-04-20.pdf * * Copyright (c) 2009 Intel Corp. * Author: Huang Ying diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c index 9453b094bb3b..700ecaee9a08 100644 --- a/arch/x86/crypto/ghash-clmulni-intel_glue.c +++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c @@ -60,16 +60,35 @@ static int ghash_setkey(struct crypto_shash *tfm, if (keylen != GHASH_BLOCK_SIZE) return -EINVAL; - /* perform multiplication by 'x' in GF(2^128) */ + /* + * GHASH maps bits to polynomial coefficients backwards, which makes it + * hard to implement. But it can be shown that the GHASH multiplication + * + * D * K (mod x^128 + x^7 + x^2 + x + 1) + * + * (where D is a data block and K is the key) is equivalent to: + * + * bitreflect(D) * bitreflect(K) * x^(-127) + * (mod x^128 + x^127 + x^126 + x^121 + 1) + * + * So, the code below precomputes: + * + * bitreflect(K) * x^(-127) (mod x^128 + x^127 + x^126 + x^121 + 1) + * + * ... but in Montgomery form (so that Montgomery multiplication can be + * used), i.e. with an extra x^128 factor, which means actually: + * + * bitreflect(K) * x (mod x^128 + x^127 + x^126 + x^121 + 1) + * + * The within-a-byte part of bitreflect() cancels out GHASH's built-in + * reflection, and thus bitreflect() is actually a byteswap. + */ a = get_unaligned_be64(key); b = get_unaligned_be64(key + 8); - ctx->shash.a = cpu_to_le64((a << 1) | (b >> 63)); ctx->shash.b = cpu_to_le64((b << 1) | (a >> 63)); - if (a >> 63) ctx->shash.a ^= cpu_to_le64((u64)0xc2 << 56); - return 0; } From 4838c5195af75373104d1e2895810c1f95c00ec4 Mon Sep 17 00:00:00 2001 From: Sergiu Moga Date: Wed, 7 Dec 2022 15:59:55 +0200 Subject: [PATCH 004/156] crypto: atmel - Add capability case for the 0x600 SHA and AES IP versions In order for the driver to be made aware of the capabilities of the SHA and AES IP versions 0x600 , such as those present on the SAM9X60 SoC's, add a corresponding switch case to the capability method of the respective drivers. Without this, besides the capabilities not being correctly set, the self tests may hang since the driver is endlessly waiting for a completion to be set by a never occurring DMA interrupt handler. Signed-off-by: Sergiu Moga Acked-by: Nicolas Ferre Signed-off-by: Herbert Xu --- drivers/crypto/atmel-aes.c | 1 + drivers/crypto/atmel-sha.c | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/crypto/atmel-aes.c b/drivers/crypto/atmel-aes.c index 886bf258544c..063394cfa874 100644 --- a/drivers/crypto/atmel-aes.c +++ b/drivers/crypto/atmel-aes.c @@ -2510,6 +2510,7 @@ static void atmel_aes_get_cap(struct atmel_aes_dev *dd) /* keep only major version number */ switch (dd->hw_version & 0xff0) { case 0x700: + case 0x600: case 0x500: dd->caps.has_dualbuff = 1; dd->caps.has_cfb64 = 1; diff --git a/drivers/crypto/atmel-sha.c b/drivers/crypto/atmel-sha.c index ca4b01926d1b..00be792e605c 100644 --- a/drivers/crypto/atmel-sha.c +++ b/drivers/crypto/atmel-sha.c @@ -2509,6 +2509,7 @@ static void atmel_sha_get_cap(struct atmel_sha_dev *dd) /* keep only major version number */ switch (dd->hw_version & 0xff0) { case 0x700: + case 0x600: case 0x510: dd->caps.has_dma = 1; dd->caps.has_dualbuff = 1; From b517b0fbfec836391427649b207735afb698fcf3 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 13 Dec 2022 18:43:30 +0800 Subject: [PATCH 005/156] crypto: arm/sha1 - Fix clang function cast warnings Instead of casting the function which upsets clang for some reason, change the assembly function siganture instead. Reported-by: kernel test robot Signed-off-by: Herbert Xu Reviewed-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm/crypto/sha1_glue.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/arch/arm/crypto/sha1_glue.c b/arch/arm/crypto/sha1_glue.c index 6c2b849e459d..95a727bcd664 100644 --- a/arch/arm/crypto/sha1_glue.c +++ b/arch/arm/crypto/sha1_glue.c @@ -21,31 +21,29 @@ #include "sha1.h" -asmlinkage void sha1_block_data_order(u32 *digest, - const unsigned char *data, unsigned int rounds); +asmlinkage void sha1_block_data_order(struct sha1_state *digest, + const u8 *data, int rounds); int sha1_update_arm(struct shash_desc *desc, const u8 *data, unsigned int len) { - /* make sure casting to sha1_block_fn() is safe */ + /* make sure signature matches sha1_block_fn() */ BUILD_BUG_ON(offsetof(struct sha1_state, state) != 0); - return sha1_base_do_update(desc, data, len, - (sha1_block_fn *)sha1_block_data_order); + return sha1_base_do_update(desc, data, len, sha1_block_data_order); } EXPORT_SYMBOL_GPL(sha1_update_arm); static int sha1_final(struct shash_desc *desc, u8 *out) { - sha1_base_do_finalize(desc, (sha1_block_fn *)sha1_block_data_order); + sha1_base_do_finalize(desc, sha1_block_data_order); return sha1_base_finish(desc, out); } int sha1_finup_arm(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { - sha1_base_do_update(desc, data, len, - (sha1_block_fn *)sha1_block_data_order); + sha1_base_do_update(desc, data, len, sha1_block_data_order); return sha1_final(desc, out); } EXPORT_SYMBOL_GPL(sha1_finup_arm); From aa9695157f65c55e5c85a1c194859d3c03e68018 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 13 Dec 2022 17:13:10 +0100 Subject: [PATCH 006/156] crypto: scatterwalk - use kmap_local() not kmap_atomic() kmap_atomic() is used to create short-lived mappings of pages that may not be accessible via the kernel direct map. This is only needed on 32-bit architectures that implement CONFIG_HIGHMEM, but it can be used on 64-bit other architectures too, where the returned mapping is simply the kernel direct address of the page. However, kmap_atomic() does not support migration on CONFIG_HIGHMEM configurations, due to the use of per-CPU kmap slots, and so it disables preemption on all architectures, not just the 32-bit ones. This implies that all scatterwalk based crypto routines essentially execute with preemption disabled all the time, which is less than ideal. So let's switch scatterwalk_map/_unmap and the shash/ahash routines to kmap_local() instead, which serves a similar purpose, but without the resulting impact on preemption on architectures that have no need for CONFIG_HIGHMEM. Cc: Eric Biggers Cc: Herbert Xu Cc: "Elliott, Robert (Servers)" Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- crypto/ahash.c | 4 ++-- crypto/shash.c | 4 ++-- include/crypto/scatterwalk.h | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/crypto/ahash.c b/crypto/ahash.c index c2ca631a111f..4b089f1b770f 100644 --- a/crypto/ahash.c +++ b/crypto/ahash.c @@ -45,7 +45,7 @@ static int hash_walk_next(struct crypto_hash_walk *walk) unsigned int nbytes = min(walk->entrylen, ((unsigned int)(PAGE_SIZE)) - offset); - walk->data = kmap_atomic(walk->pg); + walk->data = kmap_local_page(walk->pg); walk->data += offset; if (offset & alignmask) { @@ -95,7 +95,7 @@ int crypto_hash_walk_done(struct crypto_hash_walk *walk, int err) } } - kunmap_atomic(walk->data); + kunmap_local(walk->data); crypto_yield(walk->flags); if (err) diff --git a/crypto/shash.c b/crypto/shash.c index 868b6ba2b3b7..58b46f198449 100644 --- a/crypto/shash.c +++ b/crypto/shash.c @@ -320,10 +320,10 @@ int shash_ahash_digest(struct ahash_request *req, struct shash_desc *desc) nbytes <= min(sg->length, ((unsigned int)(PAGE_SIZE)) - offset))) { void *data; - data = kmap_atomic(sg_page(sg)); + data = kmap_local_page(sg_page(sg)); err = crypto_shash_digest(desc, data + offset, nbytes, req->result); - kunmap_atomic(data); + kunmap_local(data); } else err = crypto_shash_init(desc) ?: shash_ahash_finup(req, desc); diff --git a/include/crypto/scatterwalk.h b/include/crypto/scatterwalk.h index f2c42b4111b1..32fc4473175b 100644 --- a/include/crypto/scatterwalk.h +++ b/include/crypto/scatterwalk.h @@ -53,7 +53,7 @@ static inline struct page *scatterwalk_page(struct scatter_walk *walk) static inline void scatterwalk_unmap(void *vaddr) { - kunmap_atomic(vaddr); + kunmap_local(vaddr); } static inline void scatterwalk_start(struct scatter_walk *walk, @@ -65,7 +65,7 @@ static inline void scatterwalk_start(struct scatter_walk *walk, static inline void *scatterwalk_map(struct scatter_walk *walk) { - return kmap_atomic(scatterwalk_page(walk)) + + return kmap_local_page(scatterwalk_page(walk)) + offset_in_page(walk->offset); } From 8031d1f678c2b06733c8f9028b413194b47c33ab Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 15 Dec 2022 17:29:38 +0100 Subject: [PATCH 007/156] crypto: wp512 - disable kmsan checks in wp512_process_buffer() The memory sanitizer causes excessive register spills in this function: crypto/wp512.c:782:13: error: stack frame size (2104) exceeds limit (2048) in 'wp512_process_buffer' [-Werror,-Wframe-larger-than] Assume that this one is safe, and mark it as needing no checks to get the stack usage back down to the normal level. Signed-off-by: Arnd Bergmann Signed-off-by: Herbert Xu --- crypto/wp512.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crypto/wp512.c b/crypto/wp512.c index 5e820afa3c78..07994e5ebf4e 100644 --- a/crypto/wp512.c +++ b/crypto/wp512.c @@ -779,7 +779,7 @@ static const u64 rc[WHIRLPOOL_ROUNDS] = { * The core Whirlpool transform. */ -static void wp512_process_buffer(struct wp512_ctx *wctx) { +static __no_kmsan_checks void wp512_process_buffer(struct wp512_ctx *wctx) { int i, r; u64 K[8]; /* the round key */ u64 block[8]; /* mu(buffer) */ From 49bc6a7786b7d03eab6912a88d09a7991a32174e Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Tue, 20 Dec 2022 09:05:36 +0100 Subject: [PATCH 008/156] crypto: ux500 - update debug config after ux500 cryp driver removal Commit 453de3eb08c4 ("crypto: ux500/cryp - delete driver") removes the config CRYPTO_DEV_UX500_CRYP, but leaves an obsolete reference in the dependencies of config CRYPTO_DEV_UX500_DEBUG. Remove that obsolete reference, and adjust the description while at it. Fixes: 453de3eb08c4 ("crypto: ux500/cryp - delete driver") Signed-off-by: Lukas Bulwahn Acked-by: Linus Walleij Signed-off-by: Herbert Xu --- drivers/crypto/ux500/Kconfig | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/crypto/ux500/Kconfig b/drivers/crypto/ux500/Kconfig index dcbd7404768f..ac89cd2de12a 100644 --- a/drivers/crypto/ux500/Kconfig +++ b/drivers/crypto/ux500/Kconfig @@ -15,8 +15,7 @@ config CRYPTO_DEV_UX500_HASH Depends on UX500/STM DMA if running in DMA mode. config CRYPTO_DEV_UX500_DEBUG - bool "Activate ux500 platform debug-mode for crypto and hash block" - depends on CRYPTO_DEV_UX500_CRYP || CRYPTO_DEV_UX500_HASH + bool "Activate debug-mode for UX500 crypto driver for HASH block" + depends on CRYPTO_DEV_UX500_HASH help - Say Y if you want to add debug prints to ux500_hash and - ux500_cryp devices. + Say Y if you want to add debug prints to ux500_hash devices. From e20d5a22bd241c5084184bcf69dfb6eddc35417f Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Wed, 21 Dec 2022 14:58:08 +0800 Subject: [PATCH 009/156] crypto: lib/blake2s - Split up test function to halve stack usage Reduce the stack usage further by splitting up the test function. Also squash blocks and unaligned_blocks into one array. Reported-by: kernel test robot Signed-off-by: Herbert Xu --- lib/crypto/blake2s-selftest.c | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/lib/crypto/blake2s-selftest.c b/lib/crypto/blake2s-selftest.c index 7d77dea15587..d0634ed6a937 100644 --- a/lib/crypto/blake2s-selftest.c +++ b/lib/crypto/blake2s-selftest.c @@ -545,7 +545,7 @@ static const u8 blake2s_testvecs[][BLAKE2S_HASH_SIZE] __initconst = { 0xd6, 0x98, 0x6b, 0x07, 0x10, 0x65, 0x52, 0x65, }, }; -bool __init blake2s_selftest(void) +static bool __init noinline_for_stack blake2s_digest_test(void) { u8 key[BLAKE2S_KEY_SIZE]; u8 buf[ARRAY_SIZE(blake2s_testvecs)]; @@ -589,11 +589,20 @@ bool __init blake2s_selftest(void) } } + return success; +} + +static bool __init noinline_for_stack blake2s_random_test(void) +{ + struct blake2s_state state; + bool success = true; + int i, l; + for (i = 0; i < 32; ++i) { enum { TEST_ALIGNMENT = 16 }; - u8 unaligned_block[BLAKE2S_BLOCK_SIZE + TEST_ALIGNMENT - 1] + u8 blocks[BLAKE2S_BLOCK_SIZE * 2 + TEST_ALIGNMENT - 1] __aligned(TEST_ALIGNMENT); - u8 blocks[BLAKE2S_BLOCK_SIZE * 2]; + u8 *unaligned_block = blocks + BLAKE2S_BLOCK_SIZE; struct blake2s_state state1, state2; get_random_bytes(blocks, sizeof(blocks)); @@ -630,3 +639,13 @@ bool __init blake2s_selftest(void) return success; } + +bool __init blake2s_selftest(void) +{ + bool success; + + success = blake2s_digest_test(); + success &= blake2s_random_test(); + + return success; +} From 7361d1bc307b926cbca214ab67b641123c2d6357 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 27 Dec 2022 15:27:39 +0100 Subject: [PATCH 010/156] lib/mpi: Fix buffer overrun when SG is too long The helper mpi_read_raw_from_sgl sets the number of entries in the SG list according to nbytes. However, if the last entry in the SG list contains more data than nbytes, then it may overrun the buffer because it only allocates enough memory for nbytes. Fixes: 2d4d1eea540b ("lib/mpi: Add mpi sgl helpers") Reported-by: Roberto Sassu Signed-off-by: Herbert Xu Reviewed-by: Eric Biggers Signed-off-by: Herbert Xu --- lib/mpi/mpicoder.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/mpi/mpicoder.c b/lib/mpi/mpicoder.c index 39c4c6731094..3cb6bd148fa9 100644 --- a/lib/mpi/mpicoder.c +++ b/lib/mpi/mpicoder.c @@ -504,7 +504,8 @@ MPI mpi_read_raw_from_sgl(struct scatterlist *sgl, unsigned int nbytes) while (sg_miter_next(&miter)) { buff = miter.addr; - len = miter.length; + len = min_t(unsigned, miter.length, nbytes); + nbytes -= len; for (x = 0; x < len; x++) { a <<= 8; From 4f289826fee03571c2784272dd530aae1fd77566 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Wed, 28 Dec 2022 19:03:32 +0800 Subject: [PATCH 011/156] crypto: caam - Avoid GCC memset bug warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Certain versions of gcc don't like the memcpy with a NULL dst (which only happens with a zero length). This only happens when debugging is enabled so add an if clause to work around these warnings. A similar warning used to be generated by sparse but that was fixed years ago. Link: https://lore.kernel.org/lkml/202210290446.qBayTfzl-lkp@intel.com Reported-by: kernel test robot Reported-by: Kees Cook Reported-by: Uwe Kleine-König Tested-by: Uwe Kleine-König Signed-off-by: Herbert Xu --- drivers/crypto/caam/desc_constr.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/crypto/caam/desc_constr.h b/drivers/crypto/caam/desc_constr.h index 62ce6421bb3f..824c94d44f94 100644 --- a/drivers/crypto/caam/desc_constr.h +++ b/drivers/crypto/caam/desc_constr.h @@ -163,7 +163,8 @@ static inline void append_data(u32 * const desc, const void *data, int len) { u32 *offset = desc_end(desc); - if (len) /* avoid sparse warning: memcpy with byte count of 0 */ + /* Avoid gcc warning: memcpy with data == NULL */ + if (!IS_ENABLED(CONFIG_CRYPTO_DEV_FSL_CAAM_DEBUG) || data) memcpy(offset, data, len); (*desc) = cpu_to_caam32(caam32_to_cpu(*desc) + From 39a76cf1f5cecec2256ab2d20cf714573c5d994c Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 29 Dec 2022 16:58:21 +0800 Subject: [PATCH 012/156] crypto: sun8i-ss - Remove GFP_DMA and add DMA alignment padding GFP_DMA does not guarantee that the returned memory is aligned for DMA. In fact for sun8i-ss it is superfluous and can be removed. However, kmalloc may start returning DMA-unaligned memory in future so fix this by adding the alignment by hand. Signed-off-by: Herbert Xu Tested-by: Corentin Labbe Acked-by: Corentin Labbe Signed-off-by: Herbert Xu --- drivers/crypto/allwinner/sun8i-ss/sun8i-ss-cipher.c | 4 ++-- drivers/crypto/allwinner/sun8i-ss/sun8i-ss-core.c | 13 ++++++++----- drivers/crypto/allwinner/sun8i-ss/sun8i-ss-hash.c | 4 ++-- drivers/crypto/allwinner/sun8i-ss/sun8i-ss-prng.c | 11 +++++++++-- 4 files changed, 21 insertions(+), 11 deletions(-) diff --git a/drivers/crypto/allwinner/sun8i-ss/sun8i-ss-cipher.c b/drivers/crypto/allwinner/sun8i-ss/sun8i-ss-cipher.c index 902f6be057ec..83c6dfad77e1 100644 --- a/drivers/crypto/allwinner/sun8i-ss/sun8i-ss-cipher.c +++ b/drivers/crypto/allwinner/sun8i-ss/sun8i-ss-cipher.c @@ -452,7 +452,7 @@ int sun8i_ss_aes_setkey(struct crypto_skcipher *tfm, const u8 *key, } kfree_sensitive(op->key); op->keylen = keylen; - op->key = kmemdup(key, keylen, GFP_KERNEL | GFP_DMA); + op->key = kmemdup(key, keylen, GFP_KERNEL); if (!op->key) return -ENOMEM; @@ -475,7 +475,7 @@ int sun8i_ss_des3_setkey(struct crypto_skcipher *tfm, const u8 *key, kfree_sensitive(op->key); op->keylen = keylen; - op->key = kmemdup(key, keylen, GFP_KERNEL | GFP_DMA); + op->key = kmemdup(key, keylen, GFP_KERNEL); if (!op->key) return -ENOMEM; diff --git a/drivers/crypto/allwinner/sun8i-ss/sun8i-ss-core.c b/drivers/crypto/allwinner/sun8i-ss/sun8i-ss-core.c index ac2329e2b0e5..c9dc06f97857 100644 --- a/drivers/crypto/allwinner/sun8i-ss/sun8i-ss-core.c +++ b/drivers/crypto/allwinner/sun8i-ss/sun8i-ss-core.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -527,7 +528,7 @@ static int allocate_flows(struct sun8i_ss_dev *ss) init_completion(&ss->flows[i].complete); ss->flows[i].biv = devm_kmalloc(ss->dev, AES_BLOCK_SIZE, - GFP_KERNEL | GFP_DMA); + GFP_KERNEL); if (!ss->flows[i].biv) { err = -ENOMEM; goto error_engine; @@ -535,7 +536,7 @@ static int allocate_flows(struct sun8i_ss_dev *ss) for (j = 0; j < MAX_SG; j++) { ss->flows[i].iv[j] = devm_kmalloc(ss->dev, AES_BLOCK_SIZE, - GFP_KERNEL | GFP_DMA); + GFP_KERNEL); if (!ss->flows[i].iv[j]) { err = -ENOMEM; goto error_engine; @@ -544,13 +545,15 @@ static int allocate_flows(struct sun8i_ss_dev *ss) /* the padding could be up to two block. */ ss->flows[i].pad = devm_kmalloc(ss->dev, MAX_PAD_SIZE, - GFP_KERNEL | GFP_DMA); + GFP_KERNEL); if (!ss->flows[i].pad) { err = -ENOMEM; goto error_engine; } - ss->flows[i].result = devm_kmalloc(ss->dev, SHA256_DIGEST_SIZE, - GFP_KERNEL | GFP_DMA); + ss->flows[i].result = + devm_kmalloc(ss->dev, max(SHA256_DIGEST_SIZE, + dma_get_cache_alignment()), + GFP_KERNEL); if (!ss->flows[i].result) { err = -ENOMEM; goto error_engine; diff --git a/drivers/crypto/allwinner/sun8i-ss/sun8i-ss-hash.c b/drivers/crypto/allwinner/sun8i-ss/sun8i-ss-hash.c index 36a82b22953c..577bf636f7fb 100644 --- a/drivers/crypto/allwinner/sun8i-ss/sun8i-ss-hash.c +++ b/drivers/crypto/allwinner/sun8i-ss/sun8i-ss-hash.c @@ -79,10 +79,10 @@ int sun8i_ss_hmac_setkey(struct crypto_ahash *ahash, const u8 *key, memcpy(tfmctx->key, key, keylen); } - tfmctx->ipad = kzalloc(bs, GFP_KERNEL | GFP_DMA); + tfmctx->ipad = kzalloc(bs, GFP_KERNEL); if (!tfmctx->ipad) return -ENOMEM; - tfmctx->opad = kzalloc(bs, GFP_KERNEL | GFP_DMA); + tfmctx->opad = kzalloc(bs, GFP_KERNEL); if (!tfmctx->opad) { ret = -ENOMEM; goto err_opad; diff --git a/drivers/crypto/allwinner/sun8i-ss/sun8i-ss-prng.c b/drivers/crypto/allwinner/sun8i-ss/sun8i-ss-prng.c index dd677e9ed06f..70c7b5d571b8 100644 --- a/drivers/crypto/allwinner/sun8i-ss/sun8i-ss-prng.c +++ b/drivers/crypto/allwinner/sun8i-ss/sun8i-ss-prng.c @@ -11,6 +11,8 @@ */ #include "sun8i-ss.h" #include +#include +#include #include #include @@ -25,7 +27,7 @@ int sun8i_ss_prng_seed(struct crypto_rng *tfm, const u8 *seed, ctx->seed = NULL; } if (!ctx->seed) - ctx->seed = kmalloc(slen, GFP_KERNEL | GFP_DMA); + ctx->seed = kmalloc(slen, GFP_KERNEL); if (!ctx->seed) return -ENOMEM; @@ -58,6 +60,7 @@ int sun8i_ss_prng_generate(struct crypto_rng *tfm, const u8 *src, struct sun8i_ss_rng_tfm_ctx *ctx = crypto_rng_ctx(tfm); struct rng_alg *alg = crypto_rng_alg(tfm); struct sun8i_ss_alg_template *algt; + unsigned int todo_with_padding; struct sun8i_ss_dev *ss; dma_addr_t dma_iv, dma_dst; unsigned int todo; @@ -81,7 +84,11 @@ int sun8i_ss_prng_generate(struct crypto_rng *tfm, const u8 *src, todo = dlen + PRNG_SEED_SIZE + PRNG_DATA_SIZE; todo -= todo % PRNG_DATA_SIZE; - d = kzalloc(todo, GFP_KERNEL | GFP_DMA); + todo_with_padding = ALIGN(todo, dma_get_cache_alignment()); + if (todo_with_padding < todo || todo < dlen) + return -EOVERFLOW; + + d = kzalloc(todo_with_padding, GFP_KERNEL); if (!d) return -ENOMEM; From 1c4428b295884316eaff16be9c1d85f7c2361696 Mon Sep 17 00:00:00 2001 From: Nicolai Stange Date: Thu, 29 Dec 2022 22:17:05 +0100 Subject: [PATCH 013/156] crypto: xts - restrict key lengths to approved values in FIPS mode According to FIPS 140-3 IG C.I., only (total) key lengths of either 256 bits or 512 bits are allowed with xts(aes). Make xts_verify_key() to reject anything else in FIPS mode. As xts(aes) is the only approved xts() template instantiation in FIPS mode, the new restriction implemented in xts_verify_key() effectively only applies to this particular construction. Signed-off-by: Nicolai Stange Signed-off-by: Vladis Dronov Signed-off-by: Herbert Xu --- include/crypto/xts.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/include/crypto/xts.h b/include/crypto/xts.h index 0f8dba69feb4..a233c1054df2 100644 --- a/include/crypto/xts.h +++ b/include/crypto/xts.h @@ -35,6 +35,13 @@ static inline int xts_verify_key(struct crypto_skcipher *tfm, if (keylen % 2) return -EINVAL; + /* + * In FIPS mode only a combined key length of either 256 or + * 512 bits is allowed, c.f. FIPS 140-3 IG C.I. + */ + if (fips_enabled && keylen != 32 && keylen != 64) + return -EINVAL; + /* ensure that the AES and tweak key are not identical */ if ((fips_enabled || (crypto_skcipher_get_flags(tfm) & CRYPTO_TFM_REQ_FORBID_WEAK_KEYS)) && From 0ee433676e4f499cb65f3d375a60a0ac54af4c47 Mon Sep 17 00:00:00 2001 From: Vladis Dronov Date: Thu, 29 Dec 2022 22:17:06 +0100 Subject: [PATCH 014/156] crypto: xts - drop xts_check_key() xts_check_key() is obsoleted by xts_verify_key(). Over time XTS crypto drivers adopted the newer xts_verify_key() variant, but xts_check_key() is still used by a number of drivers. Switch drivers to use the newer xts_verify_key() and make a couple of cleanups. This allows us to drop xts_check_key() completely and avoid redundancy. Signed-off-by: Vladis Dronov Reviewed-by: Eric Biggers Reviewed-by: Nicolai Stange Signed-off-by: Herbert Xu --- arch/s390/crypto/paes_s390.c | 2 +- drivers/crypto/atmel-aes.c | 2 +- drivers/crypto/axis/artpec6_crypto.c | 2 +- drivers/crypto/cavium/cpt/cptvf_algs.c | 8 +++---- .../crypto/cavium/nitrox/nitrox_skcipher.c | 8 +++---- drivers/crypto/ccree/cc_cipher.c | 2 +- .../crypto/marvell/octeontx/otx_cptvf_algs.c | 2 +- .../marvell/octeontx2/otx2_cptvf_algs.c | 2 +- include/crypto/xts.h | 22 ++++--------------- 9 files changed, 16 insertions(+), 34 deletions(-) diff --git a/arch/s390/crypto/paes_s390.c b/arch/s390/crypto/paes_s390.c index a279b7d23a5e..29dc827e0fe8 100644 --- a/arch/s390/crypto/paes_s390.c +++ b/arch/s390/crypto/paes_s390.c @@ -474,7 +474,7 @@ static int xts_paes_set_key(struct crypto_skcipher *tfm, const u8 *in_key, return rc; /* - * xts_check_key verifies the key length is not odd and makes + * xts_verify_key verifies the key length is not odd and makes * sure that the two keys are not the same. This can be done * on the two protected keys as well */ diff --git a/drivers/crypto/atmel-aes.c b/drivers/crypto/atmel-aes.c index 063394cfa874..e90e4a6cc37a 100644 --- a/drivers/crypto/atmel-aes.c +++ b/drivers/crypto/atmel-aes.c @@ -1879,7 +1879,7 @@ static int atmel_aes_xts_setkey(struct crypto_skcipher *tfm, const u8 *key, struct atmel_aes_xts_ctx *ctx = crypto_skcipher_ctx(tfm); int err; - err = xts_check_key(crypto_skcipher_tfm(tfm), key, keylen); + err = xts_verify_key(tfm, key, keylen); if (err) return err; diff --git a/drivers/crypto/axis/artpec6_crypto.c b/drivers/crypto/axis/artpec6_crypto.c index 51c66afbe677..f6f41e316dfe 100644 --- a/drivers/crypto/axis/artpec6_crypto.c +++ b/drivers/crypto/axis/artpec6_crypto.c @@ -1621,7 +1621,7 @@ artpec6_crypto_xts_set_key(struct crypto_skcipher *cipher, const u8 *key, crypto_skcipher_ctx(cipher); int ret; - ret = xts_check_key(&cipher->base, key, keylen); + ret = xts_verify_key(cipher, key, keylen); if (ret) return ret; diff --git a/drivers/crypto/cavium/cpt/cptvf_algs.c b/drivers/crypto/cavium/cpt/cptvf_algs.c index 9eca0c302186..0b38c2600b86 100644 --- a/drivers/crypto/cavium/cpt/cptvf_algs.c +++ b/drivers/crypto/cavium/cpt/cptvf_algs.c @@ -232,13 +232,12 @@ static int cvm_decrypt(struct skcipher_request *req) static int cvm_xts_setkey(struct crypto_skcipher *cipher, const u8 *key, u32 keylen) { - struct crypto_tfm *tfm = crypto_skcipher_tfm(cipher); - struct cvm_enc_ctx *ctx = crypto_tfm_ctx(tfm); + struct cvm_enc_ctx *ctx = crypto_skcipher_ctx(cipher); int err; const u8 *key1 = key; const u8 *key2 = key + (keylen / 2); - err = xts_check_key(tfm, key, keylen); + err = xts_verify_key(cipher, key, keylen); if (err) return err; ctx->key_len = keylen; @@ -289,8 +288,7 @@ static int cvm_validate_keylen(struct cvm_enc_ctx *ctx, u32 keylen) static int cvm_setkey(struct crypto_skcipher *cipher, const u8 *key, u32 keylen, u8 cipher_type) { - struct crypto_tfm *tfm = crypto_skcipher_tfm(cipher); - struct cvm_enc_ctx *ctx = crypto_tfm_ctx(tfm); + struct cvm_enc_ctx *ctx = crypto_skcipher_ctx(cipher); ctx->cipher_type = cipher_type; if (!cvm_validate_keylen(ctx, keylen)) { diff --git a/drivers/crypto/cavium/nitrox/nitrox_skcipher.c b/drivers/crypto/cavium/nitrox/nitrox_skcipher.c index 248b4fff1c72..138261dcd032 100644 --- a/drivers/crypto/cavium/nitrox/nitrox_skcipher.c +++ b/drivers/crypto/cavium/nitrox/nitrox_skcipher.c @@ -337,12 +337,11 @@ static int nitrox_3des_decrypt(struct skcipher_request *skreq) static int nitrox_aes_xts_setkey(struct crypto_skcipher *cipher, const u8 *key, unsigned int keylen) { - struct crypto_tfm *tfm = crypto_skcipher_tfm(cipher); - struct nitrox_crypto_ctx *nctx = crypto_tfm_ctx(tfm); + struct nitrox_crypto_ctx *nctx = crypto_skcipher_ctx(cipher); struct flexi_crypto_context *fctx; int aes_keylen, ret; - ret = xts_check_key(tfm, key, keylen); + ret = xts_verify_key(cipher, key, keylen); if (ret) return ret; @@ -362,8 +361,7 @@ static int nitrox_aes_xts_setkey(struct crypto_skcipher *cipher, static int nitrox_aes_ctr_rfc3686_setkey(struct crypto_skcipher *cipher, const u8 *key, unsigned int keylen) { - struct crypto_tfm *tfm = crypto_skcipher_tfm(cipher); - struct nitrox_crypto_ctx *nctx = crypto_tfm_ctx(tfm); + struct nitrox_crypto_ctx *nctx = crypto_skcipher_ctx(cipher); struct flexi_crypto_context *fctx; int aes_keylen; diff --git a/drivers/crypto/ccree/cc_cipher.c b/drivers/crypto/ccree/cc_cipher.c index 309da6334a0a..2cd44d7457a4 100644 --- a/drivers/crypto/ccree/cc_cipher.c +++ b/drivers/crypto/ccree/cc_cipher.c @@ -460,7 +460,7 @@ static int cc_cipher_setkey(struct crypto_skcipher *sktfm, const u8 *key, } if (ctx_p->cipher_mode == DRV_CIPHER_XTS && - xts_check_key(tfm, key, keylen)) { + xts_verify_key(sktfm, key, keylen)) { dev_dbg(dev, "weak XTS key"); return -EINVAL; } diff --git a/drivers/crypto/marvell/octeontx/otx_cptvf_algs.c b/drivers/crypto/marvell/octeontx/otx_cptvf_algs.c index 80ba77c793a7..83493dd0416f 100644 --- a/drivers/crypto/marvell/octeontx/otx_cptvf_algs.c +++ b/drivers/crypto/marvell/octeontx/otx_cptvf_algs.c @@ -398,7 +398,7 @@ static int otx_cpt_skcipher_xts_setkey(struct crypto_skcipher *tfm, const u8 *key1 = key; int ret; - ret = xts_check_key(crypto_skcipher_tfm(tfm), key, keylen); + ret = xts_verify_key(tfm, key, keylen); if (ret) return ret; ctx->key_len = keylen; diff --git a/drivers/crypto/marvell/octeontx2/otx2_cptvf_algs.c b/drivers/crypto/marvell/octeontx2/otx2_cptvf_algs.c index 30b423605c9c..443202caa140 100644 --- a/drivers/crypto/marvell/octeontx2/otx2_cptvf_algs.c +++ b/drivers/crypto/marvell/octeontx2/otx2_cptvf_algs.c @@ -412,7 +412,7 @@ static int otx2_cpt_skcipher_xts_setkey(struct crypto_skcipher *tfm, const u8 *key1 = key; int ret; - ret = xts_check_key(crypto_skcipher_tfm(tfm), key, keylen); + ret = xts_verify_key(tfm, key, keylen); if (ret) return ret; ctx->key_len = keylen; diff --git a/include/crypto/xts.h b/include/crypto/xts.h index a233c1054df2..15b16c4853d8 100644 --- a/include/crypto/xts.h +++ b/include/crypto/xts.h @@ -8,23 +8,6 @@ #define XTS_BLOCK_SIZE 16 -static inline int xts_check_key(struct crypto_tfm *tfm, - const u8 *key, unsigned int keylen) -{ - /* - * key consists of keys of equal size concatenated, therefore - * the length must be even. - */ - if (keylen % 2) - return -EINVAL; - - /* ensure that the AES and tweak key are not identical */ - if (fips_enabled && !crypto_memneq(key, key + (keylen / 2), keylen / 2)) - return -EINVAL; - - return 0; -} - static inline int xts_verify_key(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) { @@ -42,7 +25,10 @@ static inline int xts_verify_key(struct crypto_skcipher *tfm, if (fips_enabled && keylen != 32 && keylen != 64) return -EINVAL; - /* ensure that the AES and tweak key are not identical */ + /* + * Ensure that the AES and tweak key are not identical when + * in FIPS mode or the FORBID_WEAK_KEYS flag is set. + */ if ((fips_enabled || (crypto_skcipher_get_flags(tfm) & CRYPTO_TFM_REQ_FORBID_WEAK_KEYS)) && !crypto_memneq(key, key + (keylen / 2), keylen / 2)) From b6f5278003c4b250f19499a6561d6437b2c94c8e Mon Sep 17 00:00:00 2001 From: Vladis Dronov Date: Thu, 29 Dec 2022 22:17:07 +0100 Subject: [PATCH 015/156] crypto: s390/aes - drop redundant xts key check xts_fallback_setkey() in xts_aes_set_key() will now enforce key size rule in FIPS mode when setting up the fallback algorithm keys, which makes the check in xts_aes_set_key() redundant or unreachable. So just drop this check. xts_fallback_setkey() now makes a key size check in xts_verify_key(): xts_fallback_setkey() crypto_skcipher_setkey() [ skcipher_setkey_unaligned() ] cipher->setkey() { .setkey = xts_setkey } xts_setkey() xts_verify_key() Signed-off-by: Vladis Dronov Signed-off-by: Herbert Xu --- arch/s390/crypto/aes_s390.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/arch/s390/crypto/aes_s390.c b/arch/s390/crypto/aes_s390.c index 526c3f40f6a2..c773820e4af9 100644 --- a/arch/s390/crypto/aes_s390.c +++ b/arch/s390/crypto/aes_s390.c @@ -398,10 +398,6 @@ static int xts_aes_set_key(struct crypto_skcipher *tfm, const u8 *in_key, if (err) return err; - /* In fips mode only 128 bit or 256 bit keys are valid */ - if (fips_enabled && key_len != 32 && key_len != 64) - return -EINVAL; - /* Pick the correct function code based on the key length */ fc = (key_len == 32) ? CPACF_KM_XTS_128 : (key_len == 64) ? CPACF_KM_XTS_256 : 0; From 1ce94a8c2c3721be1d9bc85fd38fc8c520aa37d6 Mon Sep 17 00:00:00 2001 From: Nicolai Stange Date: Thu, 29 Dec 2022 22:17:08 +0100 Subject: [PATCH 016/156] crypto: testmgr - disallow plain cbcmac(aes) in FIPS mode cbcmac(aes) may be used only as part of the ccm(aes) construction in FIPS mode. Since commit d6097b8d5d55 ("crypto: api - allow algs only in specific constructions in FIPS mode") there's support for using spawns which by itself are marked as non-approved from approved template instantiations. So simply mark plain cbcmac(aes) as non-approved in testmgr to block any attempts of direct instantiations in FIPS mode. Signed-off-by: Nicolai Stange Signed-off-by: Vladis Dronov Signed-off-by: Herbert Xu --- crypto/testmgr.c | 1 - 1 file changed, 1 deletion(-) diff --git a/crypto/testmgr.c b/crypto/testmgr.c index 4476ac97baa5..562463a77a76 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -4501,7 +4501,6 @@ static const struct alg_test_desc alg_test_descs[] = { }, { #endif .alg = "cbcmac(aes)", - .fips_allowed = 1, .test = alg_test_hash, .suite = { .hash = __VECS(aes_cbcmac_tv_template) From 2912eb9b17ac29facd799ffe05fdb7cf10017e82 Mon Sep 17 00:00:00 2001 From: Nicolai Stange Date: Thu, 29 Dec 2022 22:17:09 +0100 Subject: [PATCH 017/156] crypto: testmgr - disallow plain ghash in FIPS mode ghash may be used only as part of the gcm(aes) construction in FIPS mode. Since commit d6097b8d5d55 ("crypto: api - allow algs only in specific constructions in FIPS mode") there's support for using spawns which by itself are marked as non-approved from approved template instantiations. So simply mark plain ghash as non-approved in testmgr to block any attempts of direct instantiations in FIPS mode. Signed-off-by: Nicolai Stange Signed-off-by: Vladis Dronov Signed-off-by: Herbert Xu --- crypto/testmgr.c | 1 - 1 file changed, 1 deletion(-) diff --git a/crypto/testmgr.c b/crypto/testmgr.c index 562463a77a76..a223cf5f3626 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -5125,7 +5125,6 @@ static const struct alg_test_desc alg_test_descs[] = { }, { .alg = "ghash", .test = alg_test_hash, - .fips_allowed = 1, .suite = { .hash = __VECS(ghash_tv_template) } From c27b2d2012e1826674255b9e45b61c172a267e1c Mon Sep 17 00:00:00 2001 From: Nicolai Stange Date: Thu, 29 Dec 2022 22:17:10 +0100 Subject: [PATCH 018/156] crypto: testmgr - allow ecdsa-nist-p256 and -p384 in FIPS mode The kernel provides implementations of the NIST ECDSA signature verification primitives. For key sizes of 256 and 384 bits respectively they are approved and can be enabled in FIPS mode. Do so. Signed-off-by: Nicolai Stange Signed-off-by: Vladis Dronov Signed-off-by: Herbert Xu --- crypto/testmgr.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/crypto/testmgr.c b/crypto/testmgr.c index a223cf5f3626..795c4858c741 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -5034,12 +5034,14 @@ static const struct alg_test_desc alg_test_descs[] = { }, { .alg = "ecdsa-nist-p256", .test = alg_test_akcipher, + .fips_allowed = 1, .suite = { .akcipher = __VECS(ecdsa_nist_p256_tv_template) } }, { .alg = "ecdsa-nist-p384", .test = alg_test_akcipher, + .fips_allowed = 1, .suite = { .akcipher = __VECS(ecdsa_nist_p384_tv_template) } From 199354d7fb6eaa2cc5bb650af0bca624baffee35 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 30 Dec 2022 13:21:38 +0800 Subject: [PATCH 019/156] crypto: caam - Remove GFP_DMA and add DMA alignment padding GFP_DMA does not guarantee that the returned memory is aligned for DMA. It should be removed where it is superfluous. However, kmalloc may start returning DMA-unaligned memory in future so fix this by adding the alignment by hand. Signed-off-by: Herbert Xu --- drivers/crypto/caam/blob_gen.c | 2 +- drivers/crypto/caam/caamalg.c | 16 +++++++---- drivers/crypto/caam/caamalg_qi.c | 16 +++++++---- drivers/crypto/caam/caamalg_qi2.c | 48 +++++++++++++++++++------------ drivers/crypto/caam/caamalg_qi2.h | 10 +++---- drivers/crypto/caam/caamhash.c | 14 +++++++-- drivers/crypto/caam/caampkc.c | 31 +++++++++++--------- drivers/crypto/caam/caamprng.c | 12 ++++++-- drivers/crypto/caam/caamrng.c | 11 ++++--- drivers/crypto/caam/ctrl.c | 4 +-- drivers/crypto/caam/key_gen.c | 2 +- drivers/crypto/caam/qi.c | 4 +-- drivers/crypto/caam/qi.h | 12 +++++--- 13 files changed, 111 insertions(+), 71 deletions(-) diff --git a/drivers/crypto/caam/blob_gen.c b/drivers/crypto/caam/blob_gen.c index f46b161d2cda..87781c1534ee 100644 --- a/drivers/crypto/caam/blob_gen.c +++ b/drivers/crypto/caam/blob_gen.c @@ -83,7 +83,7 @@ int caam_process_blob(struct caam_blob_priv *priv, output_len = info->input_len - CAAM_BLOB_OVERHEAD; } - desc = kzalloc(CAAM_BLOB_DESC_BYTES_MAX, GFP_KERNEL | GFP_DMA); + desc = kzalloc(CAAM_BLOB_DESC_BYTES_MAX, GFP_KERNEL); if (!desc) return -ENOMEM; diff --git a/drivers/crypto/caam/caamalg.c b/drivers/crypto/caam/caamalg.c index ecc15bc521db..4a9b998a8d26 100644 --- a/drivers/crypto/caam/caamalg.c +++ b/drivers/crypto/caam/caamalg.c @@ -59,6 +59,8 @@ #include #include #include +#include +#include /* * crypto alg @@ -1379,8 +1381,7 @@ static struct aead_edesc *aead_edesc_alloc(struct aead_request *req, sec4_sg_bytes = sec4_sg_len * sizeof(struct sec4_sg_entry); /* allocate space for base edesc and hw desc commands, link tables */ - edesc = kzalloc(sizeof(*edesc) + desc_bytes + sec4_sg_bytes, - GFP_DMA | flags); + edesc = kzalloc(sizeof(*edesc) + desc_bytes + sec4_sg_bytes, flags); if (!edesc) { caam_unmap(jrdev, req->src, req->dst, src_nents, dst_nents, 0, 0, 0, 0); @@ -1608,6 +1609,7 @@ static struct skcipher_edesc *skcipher_edesc_alloc(struct skcipher_request *req, u8 *iv; int ivsize = crypto_skcipher_ivsize(skcipher); int dst_sg_idx, sec4_sg_ents, sec4_sg_bytes; + unsigned int aligned_size; src_nents = sg_nents_for_len(req->src, req->cryptlen); if (unlikely(src_nents < 0)) { @@ -1681,15 +1683,18 @@ static struct skcipher_edesc *skcipher_edesc_alloc(struct skcipher_request *req, /* * allocate space for base edesc and hw desc commands, link tables, IV */ - edesc = kzalloc(sizeof(*edesc) + desc_bytes + sec4_sg_bytes + ivsize, - GFP_DMA | flags); - if (!edesc) { + aligned_size = ALIGN(ivsize, __alignof__(*edesc)); + aligned_size += sizeof(*edesc) + desc_bytes + sec4_sg_bytes; + aligned_size = ALIGN(aligned_size, dma_get_cache_alignment()); + iv = kzalloc(aligned_size, flags); + if (!iv) { dev_err(jrdev, "could not allocate extended descriptor\n"); caam_unmap(jrdev, req->src, req->dst, src_nents, dst_nents, 0, 0, 0, 0); return ERR_PTR(-ENOMEM); } + edesc = (void *)(iv + ALIGN(ivsize, __alignof__(*edesc))); edesc->src_nents = src_nents; edesc->dst_nents = dst_nents; edesc->mapped_src_nents = mapped_src_nents; @@ -1701,7 +1706,6 @@ static struct skcipher_edesc *skcipher_edesc_alloc(struct skcipher_request *req, /* Make sure IV is located in a DMAable area */ if (ivsize) { - iv = (u8 *)edesc->sec4_sg + sec4_sg_bytes; memcpy(iv, req->iv, ivsize); iv_dma = dma_map_single(jrdev, iv, ivsize, DMA_BIDIRECTIONAL); diff --git a/drivers/crypto/caam/caamalg_qi.c b/drivers/crypto/caam/caamalg_qi.c index c37b67be0492..5e218bf20d5b 100644 --- a/drivers/crypto/caam/caamalg_qi.c +++ b/drivers/crypto/caam/caamalg_qi.c @@ -20,6 +20,8 @@ #include "caamalg_desc.h" #include #include +#include +#include /* * crypto alg @@ -959,7 +961,7 @@ static struct aead_edesc *aead_edesc_alloc(struct aead_request *req, return (struct aead_edesc *)drv_ctx; /* allocate space for base edesc and hw desc commands, link tables */ - edesc = qi_cache_alloc(GFP_DMA | flags); + edesc = qi_cache_alloc(flags); if (unlikely(!edesc)) { dev_err(qidev, "could not allocate extended descriptor\n"); return ERR_PTR(-ENOMEM); @@ -1317,8 +1319,9 @@ static struct skcipher_edesc *skcipher_edesc_alloc(struct skcipher_request *req, qm_sg_ents = 1 + pad_sg_nents(qm_sg_ents); qm_sg_bytes = qm_sg_ents * sizeof(struct qm_sg_entry); - if (unlikely(offsetof(struct skcipher_edesc, sgt) + qm_sg_bytes + - ivsize > CAAM_QI_MEMCACHE_SIZE)) { + if (unlikely(ALIGN(ivsize, __alignof__(*edesc)) + + offsetof(struct skcipher_edesc, sgt) + qm_sg_bytes > + CAAM_QI_MEMCACHE_SIZE)) { dev_err(qidev, "No space for %d S/G entries and/or %dB IV\n", qm_sg_ents, ivsize); caam_unmap(qidev, req->src, req->dst, src_nents, dst_nents, 0, @@ -1327,17 +1330,18 @@ static struct skcipher_edesc *skcipher_edesc_alloc(struct skcipher_request *req, } /* allocate space for base edesc, link tables and IV */ - edesc = qi_cache_alloc(GFP_DMA | flags); - if (unlikely(!edesc)) { + iv = qi_cache_alloc(flags); + if (unlikely(!iv)) { dev_err(qidev, "could not allocate extended descriptor\n"); caam_unmap(qidev, req->src, req->dst, src_nents, dst_nents, 0, 0, DMA_NONE, 0, 0); return ERR_PTR(-ENOMEM); } + edesc = (void *)(iv + ALIGN(ivsize, __alignof__(*edesc))); + /* Make sure IV is located in a DMAable area */ sg_table = &edesc->sgt[0]; - iv = (u8 *)(sg_table + qm_sg_ents); memcpy(iv, req->iv, ivsize); iv_dma = dma_map_single(qidev, iv, ivsize, DMA_BIDIRECTIONAL); diff --git a/drivers/crypto/caam/caamalg_qi2.c b/drivers/crypto/caam/caamalg_qi2.c index 1b0dd742c53f..0ddef9a033a1 100644 --- a/drivers/crypto/caam/caamalg_qi2.c +++ b/drivers/crypto/caam/caamalg_qi2.c @@ -16,7 +16,9 @@ #include "caamalg_desc.h" #include "caamhash_desc.h" #include "dpseci-debugfs.h" +#include #include +#include #include #include #include @@ -370,7 +372,7 @@ static struct aead_edesc *aead_edesc_alloc(struct aead_request *req, struct dpaa2_sg_entry *sg_table; /* allocate space for base edesc, link tables and IV */ - edesc = qi_cache_zalloc(GFP_DMA | flags); + edesc = qi_cache_zalloc(flags); if (unlikely(!edesc)) { dev_err(dev, "could not allocate extended descriptor\n"); return ERR_PTR(-ENOMEM); @@ -1189,7 +1191,7 @@ static struct skcipher_edesc *skcipher_edesc_alloc(struct skcipher_request *req) } /* allocate space for base edesc, link tables and IV */ - edesc = qi_cache_zalloc(GFP_DMA | flags); + edesc = qi_cache_zalloc(flags); if (unlikely(!edesc)) { dev_err(dev, "could not allocate extended descriptor\n"); caam_unmap(dev, req->src, req->dst, src_nents, dst_nents, 0, @@ -3220,14 +3222,14 @@ static int hash_digest_key(struct caam_hash_ctx *ctx, u32 *keylen, u8 *key, int ret = -ENOMEM; struct dpaa2_fl_entry *in_fle, *out_fle; - req_ctx = kzalloc(sizeof(*req_ctx), GFP_KERNEL | GFP_DMA); + req_ctx = kzalloc(sizeof(*req_ctx), GFP_KERNEL); if (!req_ctx) return -ENOMEM; in_fle = &req_ctx->fd_flt[1]; out_fle = &req_ctx->fd_flt[0]; - flc = kzalloc(sizeof(*flc), GFP_KERNEL | GFP_DMA); + flc = kzalloc(sizeof(*flc), GFP_KERNEL); if (!flc) goto err_flc; @@ -3316,7 +3318,13 @@ static int ahash_setkey(struct crypto_ahash *ahash, const u8 *key, dev_dbg(ctx->dev, "keylen %d blocksize %d\n", keylen, blocksize); if (keylen > blocksize) { - hashed_key = kmemdup(key, keylen, GFP_KERNEL | GFP_DMA); + unsigned int aligned_len = + ALIGN(keylen, dma_get_cache_alignment()); + + if (aligned_len < keylen) + return -EOVERFLOW; + + hashed_key = kmemdup(key, aligned_len, GFP_KERNEL); if (!hashed_key) return -ENOMEM; ret = hash_digest_key(ctx, &keylen, hashed_key, digestsize); @@ -3560,7 +3568,7 @@ static int ahash_update_ctx(struct ahash_request *req) } /* allocate space for base edesc and link tables */ - edesc = qi_cache_zalloc(GFP_DMA | flags); + edesc = qi_cache_zalloc(flags); if (!edesc) { dma_unmap_sg(ctx->dev, req->src, src_nents, DMA_TO_DEVICE); @@ -3654,7 +3662,7 @@ static int ahash_final_ctx(struct ahash_request *req) int ret; /* allocate space for base edesc and link tables */ - edesc = qi_cache_zalloc(GFP_DMA | flags); + edesc = qi_cache_zalloc(flags); if (!edesc) return -ENOMEM; @@ -3743,7 +3751,7 @@ static int ahash_finup_ctx(struct ahash_request *req) } /* allocate space for base edesc and link tables */ - edesc = qi_cache_zalloc(GFP_DMA | flags); + edesc = qi_cache_zalloc(flags); if (!edesc) { dma_unmap_sg(ctx->dev, req->src, src_nents, DMA_TO_DEVICE); return -ENOMEM; @@ -3836,7 +3844,7 @@ static int ahash_digest(struct ahash_request *req) } /* allocate space for base edesc and link tables */ - edesc = qi_cache_zalloc(GFP_DMA | flags); + edesc = qi_cache_zalloc(flags); if (!edesc) { dma_unmap_sg(ctx->dev, req->src, src_nents, DMA_TO_DEVICE); return ret; @@ -3913,7 +3921,7 @@ static int ahash_final_no_ctx(struct ahash_request *req) int ret = -ENOMEM; /* allocate space for base edesc and link tables */ - edesc = qi_cache_zalloc(GFP_DMA | flags); + edesc = qi_cache_zalloc(flags); if (!edesc) return ret; @@ -4012,7 +4020,7 @@ static int ahash_update_no_ctx(struct ahash_request *req) } /* allocate space for base edesc and link tables */ - edesc = qi_cache_zalloc(GFP_DMA | flags); + edesc = qi_cache_zalloc(flags); if (!edesc) { dma_unmap_sg(ctx->dev, req->src, src_nents, DMA_TO_DEVICE); @@ -4125,7 +4133,7 @@ static int ahash_finup_no_ctx(struct ahash_request *req) } /* allocate space for base edesc and link tables */ - edesc = qi_cache_zalloc(GFP_DMA | flags); + edesc = qi_cache_zalloc(flags); if (!edesc) { dma_unmap_sg(ctx->dev, req->src, src_nents, DMA_TO_DEVICE); return ret; @@ -4230,7 +4238,7 @@ static int ahash_update_first(struct ahash_request *req) } /* allocate space for base edesc and link tables */ - edesc = qi_cache_zalloc(GFP_DMA | flags); + edesc = qi_cache_zalloc(flags); if (!edesc) { dma_unmap_sg(ctx->dev, req->src, src_nents, DMA_TO_DEVICE); @@ -4926,6 +4934,7 @@ static int dpaa2_dpseci_congestion_setup(struct dpaa2_caam_priv *priv, { struct dpseci_congestion_notification_cfg cong_notif_cfg = { 0 }; struct device *dev = priv->dev; + unsigned int alignmask; int err; /* @@ -4936,13 +4945,14 @@ static int dpaa2_dpseci_congestion_setup(struct dpaa2_caam_priv *priv, !(priv->dpseci_attr.options & DPSECI_OPT_HAS_CG)) return 0; - priv->cscn_mem = kzalloc(DPAA2_CSCN_SIZE + DPAA2_CSCN_ALIGN, - GFP_KERNEL | GFP_DMA); + alignmask = DPAA2_CSCN_ALIGN - 1; + alignmask |= dma_get_cache_alignment() - 1; + priv->cscn_mem = kzalloc(ALIGN(DPAA2_CSCN_SIZE, alignmask + 1), + GFP_KERNEL); if (!priv->cscn_mem) return -ENOMEM; - priv->cscn_mem_aligned = PTR_ALIGN(priv->cscn_mem, DPAA2_CSCN_ALIGN); - priv->cscn_dma = dma_map_single(dev, priv->cscn_mem_aligned, + priv->cscn_dma = dma_map_single(dev, priv->cscn_mem, DPAA2_CSCN_SIZE, DMA_FROM_DEVICE); if (dma_mapping_error(dev, priv->cscn_dma)) { dev_err(dev, "Error mapping CSCN memory area\n"); @@ -5174,7 +5184,7 @@ static int dpaa2_caam_probe(struct fsl_mc_device *dpseci_dev) priv->domain = iommu_get_domain_for_dev(dev); qi_cache = kmem_cache_create("dpaa2_caamqicache", CAAM_QI_MEMCACHE_SIZE, - 0, SLAB_CACHE_DMA, NULL); + 0, 0, NULL); if (!qi_cache) { dev_err(dev, "Can't allocate SEC cache\n"); return -ENOMEM; @@ -5451,7 +5461,7 @@ int dpaa2_caam_enqueue(struct device *dev, struct caam_request *req) dma_sync_single_for_cpu(priv->dev, priv->cscn_dma, DPAA2_CSCN_SIZE, DMA_FROM_DEVICE); - if (unlikely(dpaa2_cscn_state_congested(priv->cscn_mem_aligned))) { + if (unlikely(dpaa2_cscn_state_congested(priv->cscn_mem))) { dev_dbg_ratelimited(dev, "Dropping request\n"); return -EBUSY; } diff --git a/drivers/crypto/caam/caamalg_qi2.h b/drivers/crypto/caam/caamalg_qi2.h index d35253407ade..abb502bb675c 100644 --- a/drivers/crypto/caam/caamalg_qi2.h +++ b/drivers/crypto/caam/caamalg_qi2.h @@ -7,13 +7,14 @@ #ifndef _CAAMALG_QI2_H_ #define _CAAMALG_QI2_H_ +#include +#include #include #include #include #include #include "dpseci.h" #include "desc_constr.h" -#include #define DPAA2_CAAM_STORE_SIZE 16 /* NAPI weight *must* be a multiple of the store size. */ @@ -36,8 +37,6 @@ * @tx_queue_attr: array of Tx queue attributes * @cscn_mem: pointer to memory region containing the congestion SCN * it's size is larger than to accommodate alignment - * @cscn_mem_aligned: pointer to congestion SCN; it is computed as - * PTR_ALIGN(cscn_mem, DPAA2_CSCN_ALIGN) * @cscn_dma: dma address used by the QMAN to write CSCN messages * @dev: device associated with the DPSECI object * @mc_io: pointer to MC portal's I/O object @@ -58,7 +57,6 @@ struct dpaa2_caam_priv { /* congestion */ void *cscn_mem; - void *cscn_mem_aligned; dma_addr_t cscn_dma; struct device *dev; @@ -158,7 +156,7 @@ struct ahash_edesc { struct caam_flc { u32 flc[16]; u32 sh_desc[MAX_SDLEN]; -} ____cacheline_aligned; +} __aligned(CRYPTO_DMA_ALIGN); enum optype { ENCRYPT = 0, @@ -180,7 +178,7 @@ enum optype { * @edesc: extended descriptor; points to one of {skcipher,aead}_edesc */ struct caam_request { - struct dpaa2_fl_entry fd_flt[2]; + struct dpaa2_fl_entry fd_flt[2] __aligned(CRYPTO_DMA_ALIGN); dma_addr_t fd_flt_dma; struct caam_flc *flc; dma_addr_t flc_dma; diff --git a/drivers/crypto/caam/caamhash.c b/drivers/crypto/caam/caamhash.c index 1050e965a438..1f357f48c473 100644 --- a/drivers/crypto/caam/caamhash.c +++ b/drivers/crypto/caam/caamhash.c @@ -66,6 +66,8 @@ #include "key_gen.h" #include "caamhash_desc.h" #include +#include +#include #define CAAM_CRA_PRIORITY 3000 @@ -365,7 +367,7 @@ static int hash_digest_key(struct caam_hash_ctx *ctx, u32 *keylen, u8 *key, dma_addr_t key_dma; int ret; - desc = kmalloc(CAAM_CMD_SZ * 8 + CAAM_PTR_SZ * 2, GFP_KERNEL | GFP_DMA); + desc = kmalloc(CAAM_CMD_SZ * 8 + CAAM_PTR_SZ * 2, GFP_KERNEL); if (!desc) { dev_err(jrdev, "unable to allocate key input memory\n"); return -ENOMEM; @@ -432,7 +434,13 @@ static int ahash_setkey(struct crypto_ahash *ahash, dev_dbg(jrdev, "keylen %d\n", keylen); if (keylen > blocksize) { - hashed_key = kmemdup(key, keylen, GFP_KERNEL | GFP_DMA); + unsigned int aligned_len = + ALIGN(keylen, dma_get_cache_alignment()); + + if (aligned_len < keylen) + return -EOVERFLOW; + + hashed_key = kmemdup(key, keylen, GFP_KERNEL); if (!hashed_key) return -ENOMEM; ret = hash_digest_key(ctx, &keylen, hashed_key, digestsize); @@ -702,7 +710,7 @@ static struct ahash_edesc *ahash_edesc_alloc(struct ahash_request *req, struct ahash_edesc *edesc; unsigned int sg_size = sg_num * sizeof(struct sec4_sg_entry); - edesc = kzalloc(sizeof(*edesc) + sg_size, GFP_DMA | flags); + edesc = kzalloc(sizeof(*edesc) + sg_size, flags); if (!edesc) { dev_err(ctx->jrdev, "could not allocate extended descriptor\n"); return NULL; diff --git a/drivers/crypto/caam/caampkc.c b/drivers/crypto/caam/caampkc.c index aef031946f33..e40614fef39d 100644 --- a/drivers/crypto/caam/caampkc.c +++ b/drivers/crypto/caam/caampkc.c @@ -16,6 +16,8 @@ #include "desc_constr.h" #include "sg_sw_sec4.h" #include "caampkc.h" +#include +#include #define DESC_RSA_PUB_LEN (2 * CAAM_CMD_SZ + SIZEOF_RSA_PUB_PDB) #define DESC_RSA_PRIV_F1_LEN (2 * CAAM_CMD_SZ + \ @@ -310,8 +312,7 @@ static struct rsa_edesc *rsa_edesc_alloc(struct akcipher_request *req, sec4_sg_bytes = sec4_sg_len * sizeof(struct sec4_sg_entry); /* allocate space for base edesc, hw desc commands and link tables */ - edesc = kzalloc(sizeof(*edesc) + desclen + sec4_sg_bytes, - GFP_DMA | flags); + edesc = kzalloc(sizeof(*edesc) + desclen + sec4_sg_bytes, flags); if (!edesc) goto dst_fail; @@ -898,7 +899,7 @@ static u8 *caam_read_rsa_crt(const u8 *ptr, size_t nbytes, size_t dstlen) if (!nbytes) return NULL; - dst = kzalloc(dstlen, GFP_DMA | GFP_KERNEL); + dst = kzalloc(dstlen, GFP_KERNEL); if (!dst) return NULL; @@ -910,7 +911,7 @@ static u8 *caam_read_rsa_crt(const u8 *ptr, size_t nbytes, size_t dstlen) /** * caam_read_raw_data - Read a raw byte stream as a positive integer. * The function skips buffer's leading zeros, copies the remained data - * to a buffer allocated in the GFP_DMA | GFP_KERNEL zone and returns + * to a buffer allocated in the GFP_KERNEL zone and returns * the address of the new buffer. * * @buf : The data to read @@ -923,7 +924,7 @@ static inline u8 *caam_read_raw_data(const u8 *buf, size_t *nbytes) if (!*nbytes) return NULL; - return kmemdup(buf, *nbytes, GFP_DMA | GFP_KERNEL); + return kmemdup(buf, *nbytes, GFP_KERNEL); } static int caam_rsa_check_key_length(unsigned int len) @@ -949,13 +950,13 @@ static int caam_rsa_set_pub_key(struct crypto_akcipher *tfm, const void *key, return ret; /* Copy key in DMA zone */ - rsa_key->e = kmemdup(raw_key.e, raw_key.e_sz, GFP_DMA | GFP_KERNEL); + rsa_key->e = kmemdup(raw_key.e, raw_key.e_sz, GFP_KERNEL); if (!rsa_key->e) goto err; /* * Skip leading zeros and copy the positive integer to a buffer - * allocated in the GFP_DMA | GFP_KERNEL zone. The decryption descriptor + * allocated in the GFP_KERNEL zone. The decryption descriptor * expects a positive integer for the RSA modulus and uses its length as * decryption output length. */ @@ -983,6 +984,7 @@ static void caam_rsa_set_priv_key_form(struct caam_rsa_ctx *ctx, struct caam_rsa_key *rsa_key = &ctx->key; size_t p_sz = raw_key->p_sz; size_t q_sz = raw_key->q_sz; + unsigned aligned_size; rsa_key->p = caam_read_raw_data(raw_key->p, &p_sz); if (!rsa_key->p) @@ -994,11 +996,13 @@ static void caam_rsa_set_priv_key_form(struct caam_rsa_ctx *ctx, goto free_p; rsa_key->q_sz = q_sz; - rsa_key->tmp1 = kzalloc(raw_key->p_sz, GFP_DMA | GFP_KERNEL); + aligned_size = ALIGN(raw_key->p_sz, dma_get_cache_alignment()); + rsa_key->tmp1 = kzalloc(aligned_size, GFP_KERNEL); if (!rsa_key->tmp1) goto free_q; - rsa_key->tmp2 = kzalloc(raw_key->q_sz, GFP_DMA | GFP_KERNEL); + aligned_size = ALIGN(raw_key->q_sz, dma_get_cache_alignment()); + rsa_key->tmp2 = kzalloc(aligned_size, GFP_KERNEL); if (!rsa_key->tmp2) goto free_tmp1; @@ -1051,17 +1055,17 @@ static int caam_rsa_set_priv_key(struct crypto_akcipher *tfm, const void *key, return ret; /* Copy key in DMA zone */ - rsa_key->d = kmemdup(raw_key.d, raw_key.d_sz, GFP_DMA | GFP_KERNEL); + rsa_key->d = kmemdup(raw_key.d, raw_key.d_sz, GFP_KERNEL); if (!rsa_key->d) goto err; - rsa_key->e = kmemdup(raw_key.e, raw_key.e_sz, GFP_DMA | GFP_KERNEL); + rsa_key->e = kmemdup(raw_key.e, raw_key.e_sz, GFP_KERNEL); if (!rsa_key->e) goto err; /* * Skip leading zeros and copy the positive integer to a buffer - * allocated in the GFP_DMA | GFP_KERNEL zone. The decryption descriptor + * allocated in the GFP_KERNEL zone. The decryption descriptor * expects a positive integer for the RSA modulus and uses its length as * decryption output length. */ @@ -1185,8 +1189,7 @@ int caam_pkc_init(struct device *ctrldev) return 0; /* allocate zero buffer, used for padding input */ - zero_buffer = kzalloc(CAAM_RSA_MAX_INPUT_SIZE - 1, GFP_DMA | - GFP_KERNEL); + zero_buffer = kzalloc(CAAM_RSA_MAX_INPUT_SIZE - 1, GFP_KERNEL); if (!zero_buffer) return -ENOMEM; diff --git a/drivers/crypto/caam/caamprng.c b/drivers/crypto/caam/caamprng.c index 4839e66300a2..6e4c1191cb28 100644 --- a/drivers/crypto/caam/caamprng.c +++ b/drivers/crypto/caam/caamprng.c @@ -8,6 +8,8 @@ #include #include +#include +#include #include "compat.h" #include "regs.h" #include "intern.h" @@ -75,6 +77,7 @@ static int caam_prng_generate(struct crypto_rng *tfm, const u8 *src, unsigned int slen, u8 *dst, unsigned int dlen) { + unsigned int aligned_dlen = ALIGN(dlen, dma_get_cache_alignment()); struct caam_prng_ctx ctx; struct device *jrdev; dma_addr_t dst_dma; @@ -82,7 +85,10 @@ static int caam_prng_generate(struct crypto_rng *tfm, u8 *buf; int ret; - buf = kzalloc(dlen, GFP_KERNEL); + if (aligned_dlen < dlen) + return -EOVERFLOW; + + buf = kzalloc(aligned_dlen, GFP_KERNEL); if (!buf) return -ENOMEM; @@ -94,7 +100,7 @@ static int caam_prng_generate(struct crypto_rng *tfm, return ret; } - desc = kzalloc(CAAM_PRNG_MAX_DESC_LEN, GFP_KERNEL | GFP_DMA); + desc = kzalloc(CAAM_PRNG_MAX_DESC_LEN, GFP_KERNEL); if (!desc) { ret = -ENOMEM; goto out1; @@ -156,7 +162,7 @@ static int caam_prng_seed(struct crypto_rng *tfm, return ret; } - desc = kzalloc(CAAM_PRNG_MAX_DESC_LEN, GFP_KERNEL | GFP_DMA); + desc = kzalloc(CAAM_PRNG_MAX_DESC_LEN, GFP_KERNEL); if (!desc) { caam_jr_free(jrdev); return -ENOMEM; diff --git a/drivers/crypto/caam/caamrng.c b/drivers/crypto/caam/caamrng.c index 1f0e82050976..1fd8ff965006 100644 --- a/drivers/crypto/caam/caamrng.c +++ b/drivers/crypto/caam/caamrng.c @@ -12,6 +12,8 @@ #include #include #include +#include +#include #include #include "compat.h" @@ -176,17 +178,18 @@ static int caam_init(struct hwrng *rng) int err; ctx->desc_sync = devm_kzalloc(ctx->ctrldev, CAAM_RNG_DESC_LEN, - GFP_DMA | GFP_KERNEL); + GFP_KERNEL); if (!ctx->desc_sync) return -ENOMEM; ctx->desc_async = devm_kzalloc(ctx->ctrldev, CAAM_RNG_DESC_LEN, - GFP_DMA | GFP_KERNEL); + GFP_KERNEL); if (!ctx->desc_async) return -ENOMEM; - if (kfifo_alloc(&ctx->fifo, CAAM_RNG_MAX_FIFO_STORE_SIZE, - GFP_DMA | GFP_KERNEL)) + if (kfifo_alloc(&ctx->fifo, ALIGN(CAAM_RNG_MAX_FIFO_STORE_SIZE, + dma_get_cache_alignment()), + GFP_KERNEL)) return -ENOMEM; INIT_WORK(&ctx->worker, caam_rng_worker); diff --git a/drivers/crypto/caam/ctrl.c b/drivers/crypto/caam/ctrl.c index 32253a064d0f..6278afb951c3 100644 --- a/drivers/crypto/caam/ctrl.c +++ b/drivers/crypto/caam/ctrl.c @@ -199,7 +199,7 @@ static int deinstantiate_rng(struct device *ctrldev, int state_handle_mask) u32 *desc, status; int sh_idx, ret = 0; - desc = kmalloc(CAAM_CMD_SZ * 3, GFP_KERNEL | GFP_DMA); + desc = kmalloc(CAAM_CMD_SZ * 3, GFP_KERNEL); if (!desc) return -ENOMEM; @@ -276,7 +276,7 @@ static int instantiate_rng(struct device *ctrldev, int state_handle_mask, int ret = 0, sh_idx; ctrl = (struct caam_ctrl __iomem *)ctrlpriv->ctrl; - desc = kmalloc(CAAM_CMD_SZ * 7, GFP_KERNEL | GFP_DMA); + desc = kmalloc(CAAM_CMD_SZ * 7, GFP_KERNEL); if (!desc) return -ENOMEM; diff --git a/drivers/crypto/caam/key_gen.c b/drivers/crypto/caam/key_gen.c index b0e8a4939b4f..88cc4fe2a585 100644 --- a/drivers/crypto/caam/key_gen.c +++ b/drivers/crypto/caam/key_gen.c @@ -64,7 +64,7 @@ int gen_split_key(struct device *jrdev, u8 *key_out, if (local_max > max_keylen) return -EINVAL; - desc = kmalloc(CAAM_CMD_SZ * 6 + CAAM_PTR_SZ * 2, GFP_KERNEL | GFP_DMA); + desc = kmalloc(CAAM_CMD_SZ * 6 + CAAM_PTR_SZ * 2, GFP_KERNEL); if (!desc) { dev_err(jrdev, "unable to allocate key input memory\n"); return ret; diff --git a/drivers/crypto/caam/qi.c b/drivers/crypto/caam/qi.c index c36f27376d7e..4c52c9365558 100644 --- a/drivers/crypto/caam/qi.c +++ b/drivers/crypto/caam/qi.c @@ -614,7 +614,7 @@ static int alloc_rsp_fq_cpu(struct device *qidev, unsigned int cpu) struct qman_fq *fq; int ret; - fq = kzalloc(sizeof(*fq), GFP_KERNEL | GFP_DMA); + fq = kzalloc(sizeof(*fq), GFP_KERNEL); if (!fq) return -ENOMEM; @@ -756,7 +756,7 @@ int caam_qi_init(struct platform_device *caam_pdev) } qi_cache = kmem_cache_create("caamqicache", CAAM_QI_MEMCACHE_SIZE, 0, - SLAB_CACHE_DMA, NULL); + 0, NULL); if (!qi_cache) { dev_err(qidev, "Can't allocate CAAM cache\n"); free_rsp_fqs(); diff --git a/drivers/crypto/caam/qi.h b/drivers/crypto/caam/qi.h index 5894f16f8fe3..a96e3d213c06 100644 --- a/drivers/crypto/caam/qi.h +++ b/drivers/crypto/caam/qi.h @@ -9,6 +9,8 @@ #ifndef __QI_H__ #define __QI_H__ +#include +#include #include #include "compat.h" #include "desc.h" @@ -58,8 +60,10 @@ enum optype { * @qidev: device pointer for CAAM/QI backend */ struct caam_drv_ctx { - u32 prehdr[2]; - u32 sh_desc[MAX_SDLEN]; + struct { + u32 prehdr[2]; + u32 sh_desc[MAX_SDLEN]; + } __aligned(CRYPTO_DMA_ALIGN); dma_addr_t context_a; struct qman_fq *req_fq; struct qman_fq *rsp_fq; @@ -67,7 +71,7 @@ struct caam_drv_ctx { int cpu; enum optype op_type; struct device *qidev; -} ____cacheline_aligned; +}; /** * caam_drv_req - The request structure the driver application should fill while @@ -88,7 +92,7 @@ struct caam_drv_req { struct caam_drv_ctx *drv_ctx; caam_qi_cbk cbk; void *app_ctx; -} ____cacheline_aligned; +} __aligned(CRYPTO_DMA_ALIGN); /** * caam_drv_ctx_init - Initialise a CAAM/QI driver context From 8e613cec25196b51601dfac50c5bf229acd72bc6 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 30 Dec 2022 15:31:33 +0800 Subject: [PATCH 020/156] crypto: talitos - Remove GFP_DMA and add DMA alignment padding GFP_DMA does not guarantee that the returned memory is aligned for DMA. It should be removed where it is superfluous. However, kmalloc may start returning DMA-unaligned memory in future so fix this by adding the alignment by hand. Signed-off-by: Herbert Xu --- drivers/crypto/talitos.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/crypto/talitos.c b/drivers/crypto/talitos.c index 71db6450b6aa..d62ec68e3183 100644 --- a/drivers/crypto/talitos.c +++ b/drivers/crypto/talitos.c @@ -1393,7 +1393,7 @@ static struct talitos_edesc *talitos_edesc_alloc(struct device *dev, alloc_len += sizeof(struct talitos_desc); alloc_len += ivsize; - edesc = kmalloc(alloc_len, GFP_DMA | flags); + edesc = kmalloc(ALIGN(alloc_len, dma_get_cache_alignment()), flags); if (!edesc) return ERR_PTR(-ENOMEM); if (ivsize) { From 91dfd98216d817ec5f1c55890bacb7b4fe9b068a Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Fri, 30 Dec 2022 14:18:46 -0800 Subject: [PATCH 021/156] crypto: ccp - Avoid page allocation failure warning for SEV_GET_ID2 For SEV_GET_ID2, the user provided length does not have a specified limitation because the length of the ID may change in the future. The kernel memory allocation, however, is implicitly limited to 4MB on x86 by the page allocator, otherwise the kzalloc() will fail. When this happens, it is best not to spam the kernel log with the warning. Simply fail the allocation and return ENOMEM to the user. Fixes: d6112ea0cb34 ("crypto: ccp - introduce SEV_GET_ID2 command") Reported-by: Andy Nguyen Reported-by: Peter Gonda Suggested-by: Herbert Xu Signed-off-by: David Rientjes Acked-by: Tom Lendacky Signed-off-by: Herbert Xu --- drivers/crypto/ccp/sev-dev.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c index 06fc7156c04f..56998bc579d6 100644 --- a/drivers/crypto/ccp/sev-dev.c +++ b/drivers/crypto/ccp/sev-dev.c @@ -881,7 +881,14 @@ static int sev_ioctl_do_get_id2(struct sev_issue_cmd *argp) input_address = (void __user *)input.address; if (input.address && input.length) { - id_blob = kzalloc(input.length, GFP_KERNEL); + /* + * The length of the ID shouldn't be assumed by software since + * it may change in the future. The allocation size is limited + * to 1 << (PAGE_SHIFT + MAX_ORDER - 1) by the page allocator. + * If the allocation fails, simply return ENOMEM rather than + * warning in the kernel log. + */ + id_blob = kzalloc(input.length, GFP_KERNEL | __GFP_NOWARN); if (!id_blob) return -ENOMEM; From 8e7d7ce2e3947013a7b12452db42cb73021e13a0 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Sun, 1 Jan 2023 09:12:49 +0000 Subject: [PATCH 022/156] crypto: x86/aria - add keystream array into request ctx avx accelerated aria module used local keystream array. But, keystream array size is too big. So, it puts the keystream array into request ctx. Signed-off-by: Taehee Yoo Signed-off-by: Herbert Xu --- arch/x86/crypto/aria_aesni_avx_glue.c | 39 ++++++++++++++++++--------- 1 file changed, 26 insertions(+), 13 deletions(-) diff --git a/arch/x86/crypto/aria_aesni_avx_glue.c b/arch/x86/crypto/aria_aesni_avx_glue.c index c561ea4fefa5..5f97e442349f 100644 --- a/arch/x86/crypto/aria_aesni_avx_glue.c +++ b/arch/x86/crypto/aria_aesni_avx_glue.c @@ -33,6 +33,10 @@ asmlinkage void aria_aesni_avx_gfni_ctr_crypt_16way(const void *ctx, u8 *dst, static struct aria_avx_ops aria_ops; +struct aria_avx_request_ctx { + u8 keystream[ARIA_AESNI_PARALLEL_BLOCK_SIZE]; +}; + static int ecb_do_encrypt(struct skcipher_request *req, const u32 *rkey) { ECB_WALK_START(req, ARIA_BLOCK_SIZE, ARIA_AESNI_PARALLEL_BLOCKS); @@ -73,6 +77,7 @@ static int aria_avx_set_key(struct crypto_skcipher *tfm, const u8 *key, static int aria_avx_ctr_encrypt(struct skcipher_request *req) { + struct aria_avx_request_ctx *req_ctx = skcipher_request_ctx(req); struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct aria_ctx *ctx = crypto_skcipher_ctx(tfm); struct skcipher_walk walk; @@ -86,10 +91,9 @@ static int aria_avx_ctr_encrypt(struct skcipher_request *req) u8 *dst = walk.dst.virt.addr; while (nbytes >= ARIA_AESNI_PARALLEL_BLOCK_SIZE) { - u8 keystream[ARIA_AESNI_PARALLEL_BLOCK_SIZE]; - kernel_fpu_begin(); - aria_ops.aria_ctr_crypt_16way(ctx, dst, src, keystream, + aria_ops.aria_ctr_crypt_16way(ctx, dst, src, + &req_ctx->keystream[0], walk.iv); kernel_fpu_end(); dst += ARIA_AESNI_PARALLEL_BLOCK_SIZE; @@ -98,28 +102,29 @@ static int aria_avx_ctr_encrypt(struct skcipher_request *req) } while (nbytes >= ARIA_BLOCK_SIZE) { - u8 keystream[ARIA_BLOCK_SIZE]; - - memcpy(keystream, walk.iv, ARIA_BLOCK_SIZE); + memcpy(&req_ctx->keystream[0], walk.iv, ARIA_BLOCK_SIZE); crypto_inc(walk.iv, ARIA_BLOCK_SIZE); - aria_encrypt(ctx, keystream, keystream); + aria_encrypt(ctx, &req_ctx->keystream[0], + &req_ctx->keystream[0]); - crypto_xor_cpy(dst, src, keystream, ARIA_BLOCK_SIZE); + crypto_xor_cpy(dst, src, &req_ctx->keystream[0], + ARIA_BLOCK_SIZE); dst += ARIA_BLOCK_SIZE; src += ARIA_BLOCK_SIZE; nbytes -= ARIA_BLOCK_SIZE; } if (walk.nbytes == walk.total && nbytes > 0) { - u8 keystream[ARIA_BLOCK_SIZE]; - - memcpy(keystream, walk.iv, ARIA_BLOCK_SIZE); + memcpy(&req_ctx->keystream[0], walk.iv, + ARIA_BLOCK_SIZE); crypto_inc(walk.iv, ARIA_BLOCK_SIZE); - aria_encrypt(ctx, keystream, keystream); + aria_encrypt(ctx, &req_ctx->keystream[0], + &req_ctx->keystream[0]); - crypto_xor_cpy(dst, src, keystream, nbytes); + crypto_xor_cpy(dst, src, &req_ctx->keystream[0], + nbytes); dst += nbytes; src += nbytes; nbytes = 0; @@ -130,6 +135,13 @@ static int aria_avx_ctr_encrypt(struct skcipher_request *req) return err; } +static int aria_avx_init_tfm(struct crypto_skcipher *tfm) +{ + crypto_skcipher_set_reqsize(tfm, sizeof(struct aria_avx_request_ctx)); + + return 0; +} + static struct skcipher_alg aria_algs[] = { { .base.cra_name = "__ecb(aria)", @@ -160,6 +172,7 @@ static struct skcipher_alg aria_algs[] = { .setkey = aria_avx_set_key, .encrypt = aria_avx_ctr_encrypt, .decrypt = aria_avx_ctr_encrypt, + .init = aria_avx_init_tfm, } }; From 35344cf30f2b080f75b4097eebcd7567e54bce22 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Sun, 1 Jan 2023 09:12:50 +0000 Subject: [PATCH 023/156] crypto: x86/aria - do not use magic number offsets of aria_ctx aria-avx assembly code accesses members of aria_ctx with magic number offset. If the shape of struct aria_ctx is changed carelessly, aria-avx will not work. So, we need to ensure accessing members of aria_ctx with correct offset values, not with magic numbers. It adds ARIA_CTX_enc_key, ARIA_CTX_dec_key, and ARIA_CTX_rounds in the asm-offsets.c So, correct offset definitions will be generated. aria-avx assembly code can access members of aria_ctx safely with these definitions. Signed-off-by: Taehee Yoo Signed-off-by: Herbert Xu --- arch/x86/crypto/aria-aesni-avx-asm_64.S | 26 +++++++++++-------------- arch/x86/kernel/asm-offsets.c | 8 ++++++++ crypto/aria_generic.c | 4 ++++ 3 files changed, 23 insertions(+), 15 deletions(-) diff --git a/arch/x86/crypto/aria-aesni-avx-asm_64.S b/arch/x86/crypto/aria-aesni-avx-asm_64.S index 03ae4cd1d976..be6adc6e7458 100644 --- a/arch/x86/crypto/aria-aesni-avx-asm_64.S +++ b/arch/x86/crypto/aria-aesni-avx-asm_64.S @@ -8,13 +8,9 @@ #include #include +#include #include -/* struct aria_ctx: */ -#define enc_key 0 -#define dec_key 272 -#define rounds 544 - /* register macros */ #define CTX %rdi @@ -874,7 +870,7 @@ SYM_FUNC_START_LOCAL(__aria_aesni_avx_crypt_16way) aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %rax, %r9, 10); - cmpl $12, rounds(CTX); + cmpl $12, ARIA_CTX_rounds(CTX); jne .Laria_192; aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, @@ -887,7 +883,7 @@ SYM_FUNC_START_LOCAL(__aria_aesni_avx_crypt_16way) aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %rax, %r9, 12); - cmpl $14, rounds(CTX); + cmpl $14, ARIA_CTX_rounds(CTX); jne .Laria_256; aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, @@ -923,7 +919,7 @@ SYM_TYPED_FUNC_START(aria_aesni_avx_encrypt_16way) FRAME_BEGIN - leaq enc_key(CTX), %r9; + leaq ARIA_CTX_enc_key(CTX), %r9; inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, @@ -948,7 +944,7 @@ SYM_TYPED_FUNC_START(aria_aesni_avx_decrypt_16way) FRAME_BEGIN - leaq dec_key(CTX), %r9; + leaq ARIA_CTX_dec_key(CTX), %r9; inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, @@ -1056,7 +1052,7 @@ SYM_TYPED_FUNC_START(aria_aesni_avx_ctr_crypt_16way) leaq (%rdx), %r11; leaq (%rcx), %rsi; leaq (%rcx), %rdx; - leaq enc_key(CTX), %r9; + leaq ARIA_CTX_enc_key(CTX), %r9; call __aria_aesni_avx_crypt_16way; @@ -1157,7 +1153,7 @@ SYM_FUNC_START_LOCAL(__aria_aesni_avx_gfni_crypt_16way) %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %rax, %r9, 10); - cmpl $12, rounds(CTX); + cmpl $12, ARIA_CTX_rounds(CTX); jne .Laria_gfni_192; aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, @@ -1174,7 +1170,7 @@ SYM_FUNC_START_LOCAL(__aria_aesni_avx_gfni_crypt_16way) %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %rax, %r9, 12); - cmpl $14, rounds(CTX); + cmpl $14, ARIA_CTX_rounds(CTX); jne .Laria_gfni_256; aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7, @@ -1218,7 +1214,7 @@ SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_encrypt_16way) FRAME_BEGIN - leaq enc_key(CTX), %r9; + leaq ARIA_CTX_enc_key(CTX), %r9; inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, @@ -1243,7 +1239,7 @@ SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_decrypt_16way) FRAME_BEGIN - leaq dec_key(CTX), %r9; + leaq ARIA_CTX_dec_key(CTX), %r9; inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, @@ -1275,7 +1271,7 @@ SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_ctr_crypt_16way) leaq (%rdx), %r11; leaq (%rcx), %rsi; leaq (%rcx), %rdx; - leaq enc_key(CTX), %r9; + leaq ARIA_CTX_enc_key(CTX), %r9; call __aria_aesni_avx_gfni_crypt_16way; diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 82c783da16a8..ef9e951415c5 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -7,6 +7,7 @@ #define COMPILE_OFFSETS #include +#include #include #include #include @@ -111,5 +112,12 @@ static void __used common(void) #ifdef CONFIG_CALL_DEPTH_TRACKING OFFSET(X86_call_depth, pcpu_hot, call_depth); #endif +#if IS_ENABLED(CONFIG_CRYPTO_ARIA_AESNI_AVX_X86_64) + /* Offset for fields in aria_ctx */ + BLANK(); + OFFSET(ARIA_CTX_enc_key, aria_ctx, enc_key); + OFFSET(ARIA_CTX_dec_key, aria_ctx, dec_key); + OFFSET(ARIA_CTX_rounds, aria_ctx, rounds); +#endif } diff --git a/crypto/aria_generic.c b/crypto/aria_generic.c index 4cc29b82b99d..d96dfc4fdde6 100644 --- a/crypto/aria_generic.c +++ b/crypto/aria_generic.c @@ -178,6 +178,10 @@ int aria_set_key(struct crypto_tfm *tfm, const u8 *in_key, unsigned int key_len) if (key_len != 16 && key_len != 24 && key_len != 32) return -EINVAL; + BUILD_BUG_ON(sizeof(ctx->enc_key) != 272); + BUILD_BUG_ON(sizeof(ctx->dec_key) != 272); + BUILD_BUG_ON(sizeof(int) != sizeof(ctx->rounds)); + ctx->key_length = key_len; ctx->rounds = (key_len + 32) / 4; From 37d8d3ae7a58cb16fa3f4f1992d2ee36bc621438 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Sun, 1 Jan 2023 09:12:51 +0000 Subject: [PATCH 024/156] crypto: x86/aria - implement aria-avx2 aria-avx2 implementation uses AVX2, AES-NI, and GFNI. It supports 32way parallel processing. So, byteslicing code is changed to support 32way parallel. And it exports some aria-avx functions such as encrypt() and decrypt(). There are two main logics, s-box layer and diffusion layer. These codes are the same as aria-avx implementation. But some instruction are exchanged because they don't support 256bit registers. Also, AES-NI doesn't support 256bit register. So, aesenclast and aesdeclast are used twice like below: vextracti128 $1, ymm0, xmm6; vaesenclast xmm7, xmm0, xmm0; vaesenclast xmm7, xmm6, xmm6; vinserti128 $1, xmm6, ymm0, ymm0; Benchmark with modprobe tcrypt mode=610 num_mb=8192, i3-12100: ARIA-AVX2 with GFNI(128bit and 256bit) testing speed of multibuffer ecb(aria) (ecb-aria-avx2) encryption tcrypt: 1 operation in 2003 cycles (1024 bytes) tcrypt: 1 operation in 5867 cycles (4096 bytes) tcrypt: 1 operation in 2358 cycles (1024 bytes) tcrypt: 1 operation in 7295 cycles (4096 bytes) testing speed of multibuffer ecb(aria) (ecb-aria-avx2) decryption tcrypt: 1 operation in 2004 cycles (1024 bytes) tcrypt: 1 operation in 5956 cycles (4096 bytes) tcrypt: 1 operation in 2409 cycles (1024 bytes) tcrypt: 1 operation in 7564 cycles (4096 bytes) ARIA-AVX with GFNI(128bit and 256bit) testing speed of multibuffer ecb(aria) (ecb-aria-avx) encryption tcrypt: 1 operation in 2761 cycles (1024 bytes) tcrypt: 1 operation in 9390 cycles (4096 bytes) tcrypt: 1 operation in 3401 cycles (1024 bytes) tcrypt: 1 operation in 11876 cycles (4096 bytes) testing speed of multibuffer ecb(aria) (ecb-aria-avx) decryption tcrypt: 1 operation in 2735 cycles (1024 bytes) tcrypt: 1 operation in 9424 cycles (4096 bytes) tcrypt: 1 operation in 3369 cycles (1024 bytes) tcrypt: 1 operation in 11954 cycles (4096 bytes) Signed-off-by: Taehee Yoo Signed-off-by: Herbert Xu --- arch/x86/crypto/Kconfig | 19 + arch/x86/crypto/Makefile | 3 + arch/x86/crypto/aria-aesni-avx2-asm_64.S | 1433 ++++++++++++++++++++++ arch/x86/crypto/aria-avx.h | 40 +- arch/x86/crypto/aria_aesni_avx2_glue.c | 252 ++++ arch/x86/crypto/aria_aesni_avx_glue.c | 6 + 6 files changed, 1752 insertions(+), 1 deletion(-) create mode 100644 arch/x86/crypto/aria-aesni-avx2-asm_64.S create mode 100644 arch/x86/crypto/aria_aesni_avx2_glue.c diff --git a/arch/x86/crypto/Kconfig b/arch/x86/crypto/Kconfig index 71c4c473d34b..3837ba8b78c5 100644 --- a/arch/x86/crypto/Kconfig +++ b/arch/x86/crypto/Kconfig @@ -304,6 +304,25 @@ config CRYPTO_ARIA_AESNI_AVX_X86_64 Processes 16 blocks in parallel. +config CRYPTO_ARIA_AESNI_AVX2_X86_64 + tristate "Ciphers: ARIA with modes: ECB, CTR (AES-NI/AVX2/GFNI)" + depends on X86 && 64BIT + select CRYPTO_SKCIPHER + select CRYPTO_SIMD + select CRYPTO_ALGAPI + select CRYPTO_ARIA + select CRYPTO_ARIA_AESNI_AVX_X86_64 + help + Length-preserving cipher: ARIA cipher algorithms + (RFC 5794) with ECB and CTR modes + + Architecture: x86_64 using: + - AES-NI (AES New Instructions) + - AVX2 (Advanced Vector Extensions) + - GFNI (Galois Field New Instructions) + + Processes 32 blocks in parallel. + config CRYPTO_CHACHA20_X86_64 tristate "Ciphers: ChaCha20, XChaCha20, XChaCha12 (SSSE3/AVX2/AVX-512VL)" depends on X86 && 64BIT diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index 3e7a329235bd..8b834aa410b1 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -103,6 +103,9 @@ sm4-aesni-avx2-x86_64-y := sm4-aesni-avx2-asm_64.o sm4_aesni_avx2_glue.o obj-$(CONFIG_CRYPTO_ARIA_AESNI_AVX_X86_64) += aria-aesni-avx-x86_64.o aria-aesni-avx-x86_64-y := aria-aesni-avx-asm_64.o aria_aesni_avx_glue.o +obj-$(CONFIG_CRYPTO_ARIA_AESNI_AVX2_X86_64) += aria-aesni-avx2-x86_64.o +aria-aesni-avx2-x86_64-y := aria-aesni-avx2-asm_64.o aria_aesni_avx2_glue.o + quiet_cmd_perlasm = PERLASM $@ cmd_perlasm = $(PERL) $< > $@ $(obj)/%.S: $(src)/%.pl FORCE diff --git a/arch/x86/crypto/aria-aesni-avx2-asm_64.S b/arch/x86/crypto/aria-aesni-avx2-asm_64.S new file mode 100644 index 000000000000..b6cac9a40f2c --- /dev/null +++ b/arch/x86/crypto/aria-aesni-avx2-asm_64.S @@ -0,0 +1,1433 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * ARIA Cipher 32-way parallel algorithm (AVX2) + * + * Copyright (c) 2022 Taehee Yoo + * + */ + +#include +#include +#include +#include + +/* register macros */ +#define CTX %rdi + +#define ymm0_x xmm0 +#define ymm1_x xmm1 +#define ymm2_x xmm2 +#define ymm3_x xmm3 +#define ymm4_x xmm4 +#define ymm5_x xmm5 +#define ymm6_x xmm6 +#define ymm7_x xmm7 +#define ymm8_x xmm8 +#define ymm9_x xmm9 +#define ymm10_x xmm10 +#define ymm11_x xmm11 +#define ymm12_x xmm12 +#define ymm13_x xmm13 +#define ymm14_x xmm14 +#define ymm15_x xmm15 + +#define BV8(a0, a1, a2, a3, a4, a5, a6, a7) \ + ( (((a0) & 1) << 0) | \ + (((a1) & 1) << 1) | \ + (((a2) & 1) << 2) | \ + (((a3) & 1) << 3) | \ + (((a4) & 1) << 4) | \ + (((a5) & 1) << 5) | \ + (((a6) & 1) << 6) | \ + (((a7) & 1) << 7) ) + +#define BM8X8(l0, l1, l2, l3, l4, l5, l6, l7) \ + ( ((l7) << (0 * 8)) | \ + ((l6) << (1 * 8)) | \ + ((l5) << (2 * 8)) | \ + ((l4) << (3 * 8)) | \ + ((l3) << (4 * 8)) | \ + ((l2) << (5 * 8)) | \ + ((l1) << (6 * 8)) | \ + ((l0) << (7 * 8)) ) + +#define inc_le128(x, minus_one, tmp) \ + vpcmpeqq minus_one, x, tmp; \ + vpsubq minus_one, x, x; \ + vpslldq $8, tmp, tmp; \ + vpsubq tmp, x, x; + +#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \ + vpand x, mask4bit, tmp0; \ + vpandn x, mask4bit, x; \ + vpsrld $4, x, x; \ + \ + vpshufb tmp0, lo_t, tmp0; \ + vpshufb x, hi_t, x; \ + vpxor tmp0, x, x; + +#define transpose_4x4(x0, x1, x2, x3, t1, t2) \ + vpunpckhdq x1, x0, t2; \ + vpunpckldq x1, x0, x0; \ + \ + vpunpckldq x3, x2, t1; \ + vpunpckhdq x3, x2, x2; \ + \ + vpunpckhqdq t1, x0, x1; \ + vpunpcklqdq t1, x0, x0; \ + \ + vpunpckhqdq x2, t2, x3; \ + vpunpcklqdq x2, t2, x2; + +#define byteslice_16x16b(a0, b0, c0, d0, \ + a1, b1, c1, d1, \ + a2, b2, c2, d2, \ + a3, b3, c3, d3, \ + st0, st1) \ + vmovdqu d2, st0; \ + vmovdqu d3, st1; \ + transpose_4x4(a0, a1, a2, a3, d2, d3); \ + transpose_4x4(b0, b1, b2, b3, d2, d3); \ + vmovdqu st0, d2; \ + vmovdqu st1, d3; \ + \ + vmovdqu a0, st0; \ + vmovdqu a1, st1; \ + transpose_4x4(c0, c1, c2, c3, a0, a1); \ + transpose_4x4(d0, d1, d2, d3, a0, a1); \ + \ + vbroadcasti128 .Lshufb_16x16b, a0; \ + vmovdqu st1, a1; \ + vpshufb a0, a2, a2; \ + vpshufb a0, a3, a3; \ + vpshufb a0, b0, b0; \ + vpshufb a0, b1, b1; \ + vpshufb a0, b2, b2; \ + vpshufb a0, b3, b3; \ + vpshufb a0, a1, a1; \ + vpshufb a0, c0, c0; \ + vpshufb a0, c1, c1; \ + vpshufb a0, c2, c2; \ + vpshufb a0, c3, c3; \ + vpshufb a0, d0, d0; \ + vpshufb a0, d1, d1; \ + vpshufb a0, d2, d2; \ + vpshufb a0, d3, d3; \ + vmovdqu d3, st1; \ + vmovdqu st0, d3; \ + vpshufb a0, d3, a0; \ + vmovdqu d2, st0; \ + \ + transpose_4x4(a0, b0, c0, d0, d2, d3); \ + transpose_4x4(a1, b1, c1, d1, d2, d3); \ + vmovdqu st0, d2; \ + vmovdqu st1, d3; \ + \ + vmovdqu b0, st0; \ + vmovdqu b1, st1; \ + transpose_4x4(a2, b2, c2, d2, b0, b1); \ + transpose_4x4(a3, b3, c3, d3, b0, b1); \ + vmovdqu st0, b0; \ + vmovdqu st1, b1; \ + /* does not adjust output bytes inside vectors */ + +#define debyteslice_16x16b(a0, b0, c0, d0, \ + a1, b1, c1, d1, \ + a2, b2, c2, d2, \ + a3, b3, c3, d3, \ + st0, st1) \ + vmovdqu d2, st0; \ + vmovdqu d3, st1; \ + transpose_4x4(a0, a1, a2, a3, d2, d3); \ + transpose_4x4(b0, b1, b2, b3, d2, d3); \ + vmovdqu st0, d2; \ + vmovdqu st1, d3; \ + \ + vmovdqu a0, st0; \ + vmovdqu a1, st1; \ + transpose_4x4(c0, c1, c2, c3, a0, a1); \ + transpose_4x4(d0, d1, d2, d3, a0, a1); \ + \ + vbroadcasti128 .Lshufb_16x16b, a0; \ + vmovdqu st1, a1; \ + vpshufb a0, a2, a2; \ + vpshufb a0, a3, a3; \ + vpshufb a0, b0, b0; \ + vpshufb a0, b1, b1; \ + vpshufb a0, b2, b2; \ + vpshufb a0, b3, b3; \ + vpshufb a0, a1, a1; \ + vpshufb a0, c0, c0; \ + vpshufb a0, c1, c1; \ + vpshufb a0, c2, c2; \ + vpshufb a0, c3, c3; \ + vpshufb a0, d0, d0; \ + vpshufb a0, d1, d1; \ + vpshufb a0, d2, d2; \ + vpshufb a0, d3, d3; \ + vmovdqu d3, st1; \ + vmovdqu st0, d3; \ + vpshufb a0, d3, a0; \ + vmovdqu d2, st0; \ + \ + transpose_4x4(c0, d0, a0, b0, d2, d3); \ + transpose_4x4(c1, d1, a1, b1, d2, d3); \ + vmovdqu st0, d2; \ + vmovdqu st1, d3; \ + \ + vmovdqu b0, st0; \ + vmovdqu b1, st1; \ + transpose_4x4(c2, d2, a2, b2, b0, b1); \ + transpose_4x4(c3, d3, a3, b3, b0, b1); \ + vmovdqu st0, b0; \ + vmovdqu st1, b1; \ + /* does not adjust output bytes inside vectors */ + +/* load blocks to registers and apply pre-whitening */ +#define inpack16_pre(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + rio) \ + vmovdqu (0 * 32)(rio), x0; \ + vmovdqu (1 * 32)(rio), x1; \ + vmovdqu (2 * 32)(rio), x2; \ + vmovdqu (3 * 32)(rio), x3; \ + vmovdqu (4 * 32)(rio), x4; \ + vmovdqu (5 * 32)(rio), x5; \ + vmovdqu (6 * 32)(rio), x6; \ + vmovdqu (7 * 32)(rio), x7; \ + vmovdqu (8 * 32)(rio), y0; \ + vmovdqu (9 * 32)(rio), y1; \ + vmovdqu (10 * 32)(rio), y2; \ + vmovdqu (11 * 32)(rio), y3; \ + vmovdqu (12 * 32)(rio), y4; \ + vmovdqu (13 * 32)(rio), y5; \ + vmovdqu (14 * 32)(rio), y6; \ + vmovdqu (15 * 32)(rio), y7; + +/* byteslice pre-whitened blocks and store to temporary memory */ +#define inpack16_post(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + mem_ab, mem_cd) \ + byteslice_16x16b(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + (mem_ab), (mem_cd)); \ + \ + vmovdqu x0, 0 * 32(mem_ab); \ + vmovdqu x1, 1 * 32(mem_ab); \ + vmovdqu x2, 2 * 32(mem_ab); \ + vmovdqu x3, 3 * 32(mem_ab); \ + vmovdqu x4, 4 * 32(mem_ab); \ + vmovdqu x5, 5 * 32(mem_ab); \ + vmovdqu x6, 6 * 32(mem_ab); \ + vmovdqu x7, 7 * 32(mem_ab); \ + vmovdqu y0, 0 * 32(mem_cd); \ + vmovdqu y1, 1 * 32(mem_cd); \ + vmovdqu y2, 2 * 32(mem_cd); \ + vmovdqu y3, 3 * 32(mem_cd); \ + vmovdqu y4, 4 * 32(mem_cd); \ + vmovdqu y5, 5 * 32(mem_cd); \ + vmovdqu y6, 6 * 32(mem_cd); \ + vmovdqu y7, 7 * 32(mem_cd); + +#define write_output(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + mem) \ + vmovdqu x0, 0 * 32(mem); \ + vmovdqu x1, 1 * 32(mem); \ + vmovdqu x2, 2 * 32(mem); \ + vmovdqu x3, 3 * 32(mem); \ + vmovdqu x4, 4 * 32(mem); \ + vmovdqu x5, 5 * 32(mem); \ + vmovdqu x6, 6 * 32(mem); \ + vmovdqu x7, 7 * 32(mem); \ + vmovdqu y0, 8 * 32(mem); \ + vmovdqu y1, 9 * 32(mem); \ + vmovdqu y2, 10 * 32(mem); \ + vmovdqu y3, 11 * 32(mem); \ + vmovdqu y4, 12 * 32(mem); \ + vmovdqu y5, 13 * 32(mem); \ + vmovdqu y6, 14 * 32(mem); \ + vmovdqu y7, 15 * 32(mem); \ + +#define aria_store_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, idx) \ + vmovdqu x0, ((idx + 0) * 32)(mem_tmp); \ + vmovdqu x1, ((idx + 1) * 32)(mem_tmp); \ + vmovdqu x2, ((idx + 2) * 32)(mem_tmp); \ + vmovdqu x3, ((idx + 3) * 32)(mem_tmp); \ + vmovdqu x4, ((idx + 4) * 32)(mem_tmp); \ + vmovdqu x5, ((idx + 5) * 32)(mem_tmp); \ + vmovdqu x6, ((idx + 6) * 32)(mem_tmp); \ + vmovdqu x7, ((idx + 7) * 32)(mem_tmp); + +#define aria_load_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, idx) \ + vmovdqu ((idx + 0) * 32)(mem_tmp), x0; \ + vmovdqu ((idx + 1) * 32)(mem_tmp), x1; \ + vmovdqu ((idx + 2) * 32)(mem_tmp), x2; \ + vmovdqu ((idx + 3) * 32)(mem_tmp), x3; \ + vmovdqu ((idx + 4) * 32)(mem_tmp), x4; \ + vmovdqu ((idx + 5) * 32)(mem_tmp), x5; \ + vmovdqu ((idx + 6) * 32)(mem_tmp), x6; \ + vmovdqu ((idx + 7) * 32)(mem_tmp), x7; + +#define aria_ark_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + t0, rk, idx, round) \ + /* AddRoundKey */ \ + vpbroadcastb ((round * 16) + idx + 3)(rk), t0; \ + vpxor t0, x0, x0; \ + vpbroadcastb ((round * 16) + idx + 2)(rk), t0; \ + vpxor t0, x1, x1; \ + vpbroadcastb ((round * 16) + idx + 1)(rk), t0; \ + vpxor t0, x2, x2; \ + vpbroadcastb ((round * 16) + idx + 0)(rk), t0; \ + vpxor t0, x3, x3; \ + vpbroadcastb ((round * 16) + idx + 7)(rk), t0; \ + vpxor t0, x4, x4; \ + vpbroadcastb ((round * 16) + idx + 6)(rk), t0; \ + vpxor t0, x5, x5; \ + vpbroadcastb ((round * 16) + idx + 5)(rk), t0; \ + vpxor t0, x6, x6; \ + vpbroadcastb ((round * 16) + idx + 4)(rk), t0; \ + vpxor t0, x7, x7; + +#define aria_sbox_8way_gfni(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + t0, t1, t2, t3, \ + t4, t5, t6, t7) \ + vpbroadcastq .Ltf_s2_bitmatrix, t0; \ + vpbroadcastq .Ltf_inv_bitmatrix, t1; \ + vpbroadcastq .Ltf_id_bitmatrix, t2; \ + vpbroadcastq .Ltf_aff_bitmatrix, t3; \ + vpbroadcastq .Ltf_x2_bitmatrix, t4; \ + vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1; \ + vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5; \ + vgf2p8affineqb $(tf_inv_const), t1, x2, x2; \ + vgf2p8affineqb $(tf_inv_const), t1, x6, x6; \ + vgf2p8affineinvqb $0, t2, x2, x2; \ + vgf2p8affineinvqb $0, t2, x6, x6; \ + vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0; \ + vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4; \ + vgf2p8affineqb $(tf_x2_const), t4, x3, x3; \ + vgf2p8affineqb $(tf_x2_const), t4, x7, x7; \ + vgf2p8affineinvqb $0, t2, x3, x3; \ + vgf2p8affineinvqb $0, t2, x7, x7 + +#define aria_sbox_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + t0, t1, t2, t3, \ + t4, t5, t6, t7) \ + vpxor t7, t7, t7; \ + vpxor t6, t6, t6; \ + vbroadcasti128 .Linv_shift_row, t0; \ + vbroadcasti128 .Lshift_row, t1; \ + vbroadcasti128 .Ltf_lo__inv_aff__and__s2, t2; \ + vbroadcasti128 .Ltf_hi__inv_aff__and__s2, t3; \ + vbroadcasti128 .Ltf_lo__x2__and__fwd_aff, t4; \ + vbroadcasti128 .Ltf_hi__x2__and__fwd_aff, t5; \ + \ + vextracti128 $1, x0, t6##_x; \ + vaesenclast t7##_x, x0##_x, x0##_x; \ + vaesenclast t7##_x, t6##_x, t6##_x; \ + vinserti128 $1, t6##_x, x0, x0; \ + \ + vextracti128 $1, x4, t6##_x; \ + vaesenclast t7##_x, x4##_x, x4##_x; \ + vaesenclast t7##_x, t6##_x, t6##_x; \ + vinserti128 $1, t6##_x, x4, x4; \ + \ + vextracti128 $1, x1, t6##_x; \ + vaesenclast t7##_x, x1##_x, x1##_x; \ + vaesenclast t7##_x, t6##_x, t6##_x; \ + vinserti128 $1, t6##_x, x1, x1; \ + \ + vextracti128 $1, x5, t6##_x; \ + vaesenclast t7##_x, x5##_x, x5##_x; \ + vaesenclast t7##_x, t6##_x, t6##_x; \ + vinserti128 $1, t6##_x, x5, x5; \ + \ + vextracti128 $1, x2, t6##_x; \ + vaesdeclast t7##_x, x2##_x, x2##_x; \ + vaesdeclast t7##_x, t6##_x, t6##_x; \ + vinserti128 $1, t6##_x, x2, x2; \ + \ + vextracti128 $1, x6, t6##_x; \ + vaesdeclast t7##_x, x6##_x, x6##_x; \ + vaesdeclast t7##_x, t6##_x, t6##_x; \ + vinserti128 $1, t6##_x, x6, x6; \ + \ + vpbroadcastd .L0f0f0f0f, t6; \ + \ + /* AES inverse shift rows */ \ + vpshufb t0, x0, x0; \ + vpshufb t0, x4, x4; \ + vpshufb t0, x1, x1; \ + vpshufb t0, x5, x5; \ + vpshufb t1, x3, x3; \ + vpshufb t1, x7, x7; \ + vpshufb t1, x2, x2; \ + vpshufb t1, x6, x6; \ + \ + /* affine transformation for S2 */ \ + filter_8bit(x1, t2, t3, t6, t0); \ + /* affine transformation for S2 */ \ + filter_8bit(x5, t2, t3, t6, t0); \ + \ + /* affine transformation for X2 */ \ + filter_8bit(x3, t4, t5, t6, t0); \ + /* affine transformation for X2 */ \ + filter_8bit(x7, t4, t5, t6, t0); \ + \ + vpxor t6, t6, t6; \ + vextracti128 $1, x3, t6##_x; \ + vaesdeclast t7##_x, x3##_x, x3##_x; \ + vaesdeclast t7##_x, t6##_x, t6##_x; \ + vinserti128 $1, t6##_x, x3, x3; \ + \ + vextracti128 $1, x7, t6##_x; \ + vaesdeclast t7##_x, x7##_x, x7##_x; \ + vaesdeclast t7##_x, t6##_x, t6##_x; \ + vinserti128 $1, t6##_x, x7, x7; \ + +#define aria_diff_m(x0, x1, x2, x3, \ + t0, t1, t2, t3) \ + /* T = rotr32(X, 8); */ \ + /* X ^= T */ \ + vpxor x0, x3, t0; \ + vpxor x1, x0, t1; \ + vpxor x2, x1, t2; \ + vpxor x3, x2, t3; \ + /* X = T ^ rotr(X, 16); */ \ + vpxor t2, x0, x0; \ + vpxor x1, t3, t3; \ + vpxor t0, x2, x2; \ + vpxor t1, x3, x1; \ + vmovdqu t3, x3; + +#define aria_diff_word(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7) \ + /* t1 ^= t2; */ \ + vpxor y0, x4, x4; \ + vpxor y1, x5, x5; \ + vpxor y2, x6, x6; \ + vpxor y3, x7, x7; \ + \ + /* t2 ^= t3; */ \ + vpxor y4, y0, y0; \ + vpxor y5, y1, y1; \ + vpxor y6, y2, y2; \ + vpxor y7, y3, y3; \ + \ + /* t0 ^= t1; */ \ + vpxor x4, x0, x0; \ + vpxor x5, x1, x1; \ + vpxor x6, x2, x2; \ + vpxor x7, x3, x3; \ + \ + /* t3 ^= t1; */ \ + vpxor x4, y4, y4; \ + vpxor x5, y5, y5; \ + vpxor x6, y6, y6; \ + vpxor x7, y7, y7; \ + \ + /* t2 ^= t0; */ \ + vpxor x0, y0, y0; \ + vpxor x1, y1, y1; \ + vpxor x2, y2, y2; \ + vpxor x3, y3, y3; \ + \ + /* t1 ^= t2; */ \ + vpxor y0, x4, x4; \ + vpxor y1, x5, x5; \ + vpxor y2, x6, x6; \ + vpxor y3, x7, x7; + +#define aria_fe(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + mem_tmp, rk, round) \ + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, rk, 8, round); \ + \ + aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \ + y0, y1, y2, y3, y4, y5, y6, y7); \ + \ + aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \ + aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \ + aria_store_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, 8); \ + \ + aria_load_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, 0); \ + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, rk, 0, round); \ + \ + aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \ + y0, y1, y2, y3, y4, y5, y6, y7); \ + \ + aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \ + aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \ + aria_store_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, 0); \ + aria_load_state_8way(y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + mem_tmp, 8); \ + aria_diff_word(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7); \ + /* aria_diff_byte() \ + * T3 = ABCD -> BADC \ + * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 \ + * T0 = ABCD -> CDAB \ + * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 \ + * T1 = ABCD -> DCBA \ + * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4 \ + */ \ + aria_diff_word(x2, x3, x0, x1, \ + x7, x6, x5, x4, \ + y0, y1, y2, y3, \ + y5, y4, y7, y6); \ + aria_store_state_8way(x3, x2, x1, x0, \ + x6, x7, x4, x5, \ + mem_tmp, 0); + +#define aria_fo(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + mem_tmp, rk, round) \ + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, rk, 8, round); \ + \ + aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, y1, y2, y3, y4, y5, y6, y7); \ + \ + aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \ + aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \ + aria_store_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, 8); \ + \ + aria_load_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, 0); \ + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, rk, 0, round); \ + \ + aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, y1, y2, y3, y4, y5, y6, y7); \ + \ + aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \ + aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \ + aria_store_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, 0); \ + aria_load_state_8way(y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + mem_tmp, 8); \ + aria_diff_word(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7); \ + /* aria_diff_byte() \ + * T1 = ABCD -> BADC \ + * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6 \ + * T2 = ABCD -> CDAB \ + * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 \ + * T3 = ABCD -> DCBA \ + * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 \ + */ \ + aria_diff_word(x0, x1, x2, x3, \ + x5, x4, x7, x6, \ + y2, y3, y0, y1, \ + y7, y6, y5, y4); \ + aria_store_state_8way(x3, x2, x1, x0, \ + x6, x7, x4, x5, \ + mem_tmp, 0); + +#define aria_ff(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + mem_tmp, rk, round, last_round) \ + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, rk, 8, round); \ + \ + aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \ + y0, y1, y2, y3, y4, y5, y6, y7); \ + \ + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, rk, 8, last_round); \ + \ + aria_store_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, 8); \ + \ + aria_load_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, 0); \ + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, rk, 0, round); \ + \ + aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \ + y0, y1, y2, y3, y4, y5, y6, y7); \ + \ + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, rk, 0, last_round); \ + \ + aria_load_state_8way(y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + mem_tmp, 8); + +#define aria_fe_gfni(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + mem_tmp, rk, round) \ + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, rk, 8, round); \ + \ + aria_sbox_8way_gfni(x2, x3, x0, x1, \ + x6, x7, x4, x5, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7); \ + \ + aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \ + aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \ + aria_store_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, 8); \ + \ + aria_load_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, 0); \ + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, rk, 0, round); \ + \ + aria_sbox_8way_gfni(x2, x3, x0, x1, \ + x6, x7, x4, x5, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7); \ + \ + aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \ + aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \ + aria_store_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, 0); \ + aria_load_state_8way(y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + mem_tmp, 8); \ + aria_diff_word(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7); \ + /* aria_diff_byte() \ + * T3 = ABCD -> BADC \ + * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 \ + * T0 = ABCD -> CDAB \ + * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 \ + * T1 = ABCD -> DCBA \ + * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4 \ + */ \ + aria_diff_word(x2, x3, x0, x1, \ + x7, x6, x5, x4, \ + y0, y1, y2, y3, \ + y5, y4, y7, y6); \ + aria_store_state_8way(x3, x2, x1, x0, \ + x6, x7, x4, x5, \ + mem_tmp, 0); + +#define aria_fo_gfni(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + mem_tmp, rk, round) \ + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, rk, 8, round); \ + \ + aria_sbox_8way_gfni(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7); \ + \ + aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \ + aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \ + aria_store_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, 8); \ + \ + aria_load_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, 0); \ + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, rk, 0, round); \ + \ + aria_sbox_8way_gfni(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7); \ + \ + aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \ + aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \ + aria_store_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, 0); \ + aria_load_state_8way(y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + mem_tmp, 8); \ + aria_diff_word(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7); \ + /* aria_diff_byte() \ + * T1 = ABCD -> BADC \ + * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6 \ + * T2 = ABCD -> CDAB \ + * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 \ + * T3 = ABCD -> DCBA \ + * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 \ + */ \ + aria_diff_word(x0, x1, x2, x3, \ + x5, x4, x7, x6, \ + y2, y3, y0, y1, \ + y7, y6, y5, y4); \ + aria_store_state_8way(x3, x2, x1, x0, \ + x6, x7, x4, x5, \ + mem_tmp, 0); + +#define aria_ff_gfni(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + mem_tmp, rk, round, last_round) \ + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, rk, 8, round); \ + \ + aria_sbox_8way_gfni(x2, x3, x0, x1, \ + x6, x7, x4, x5, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7); \ + \ + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, rk, 8, last_round); \ + \ + aria_store_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, 8); \ + \ + aria_load_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, 0); \ + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, rk, 0, round); \ + \ + aria_sbox_8way_gfni(x2, x3, x0, x1, \ + x6, x7, x4, x5, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7); \ + \ + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, rk, 0, last_round); \ + \ + aria_load_state_8way(y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + mem_tmp, 8); + +.section .rodata.cst32.shufb_16x16b, "aM", @progbits, 32 +.align 32 +#define SHUFB_BYTES(idx) \ + 0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx) +.Lshufb_16x16b: + .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3) + .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3) + +.section .rodata.cst16, "aM", @progbits, 16 +.align 16 +/* For isolating SubBytes from AESENCLAST, inverse shift row */ +.Linv_shift_row: + .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b + .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03 +.Lshift_row: + .byte 0x00, 0x05, 0x0a, 0x0f, 0x04, 0x09, 0x0e, 0x03 + .byte 0x08, 0x0d, 0x02, 0x07, 0x0c, 0x01, 0x06, 0x0b +/* For CTR-mode IV byteswap */ +.Lbswap128_mask: + .byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08 + .byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00 + +/* AES inverse affine and S2 combined: + * 1 1 0 0 0 0 0 1 x0 0 + * 0 1 0 0 1 0 0 0 x1 0 + * 1 1 0 0 1 1 1 1 x2 0 + * 0 1 1 0 1 0 0 1 x3 1 + * 0 1 0 0 1 1 0 0 * x4 + 0 + * 0 1 0 1 1 0 0 0 x5 0 + * 0 0 0 0 0 1 0 1 x6 0 + * 1 1 1 0 0 1 1 1 x7 1 + */ +.Ltf_lo__inv_aff__and__s2: + .octa 0x92172DA81A9FA520B2370D883ABF8500 +.Ltf_hi__inv_aff__and__s2: + .octa 0x2B15FFC1AF917B45E6D8320C625CB688 + +/* X2 and AES forward affine combined: + * 1 0 1 1 0 0 0 1 x0 0 + * 0 1 1 1 1 0 1 1 x1 0 + * 0 0 0 1 1 0 1 0 x2 1 + * 0 1 0 0 0 1 0 0 x3 0 + * 0 0 1 1 1 0 1 1 * x4 + 0 + * 0 1 0 0 1 0 0 0 x5 0 + * 1 1 0 1 0 0 1 1 x6 0 + * 0 1 0 0 1 0 1 0 x7 0 + */ +.Ltf_lo__x2__and__fwd_aff: + .octa 0xEFAE0544FCBD1657B8F95213ABEA4100 +.Ltf_hi__x2__and__fwd_aff: + .octa 0x3F893781E95FE1576CDA64D2BA0CB204 + +.section .rodata.cst8, "aM", @progbits, 8 +.align 8 +/* AES affine: */ +#define tf_aff_const BV8(1, 1, 0, 0, 0, 1, 1, 0) +.Ltf_aff_bitmatrix: + .quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1), + BV8(1, 1, 0, 0, 0, 1, 1, 1), + BV8(1, 1, 1, 0, 0, 0, 1, 1), + BV8(1, 1, 1, 1, 0, 0, 0, 1), + BV8(1, 1, 1, 1, 1, 0, 0, 0), + BV8(0, 1, 1, 1, 1, 1, 0, 0), + BV8(0, 0, 1, 1, 1, 1, 1, 0), + BV8(0, 0, 0, 1, 1, 1, 1, 1)) + +/* AES inverse affine: */ +#define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0) +.Ltf_inv_bitmatrix: + .quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1), + BV8(1, 0, 0, 1, 0, 0, 1, 0), + BV8(0, 1, 0, 0, 1, 0, 0, 1), + BV8(1, 0, 1, 0, 0, 1, 0, 0), + BV8(0, 1, 0, 1, 0, 0, 1, 0), + BV8(0, 0, 1, 0, 1, 0, 0, 1), + BV8(1, 0, 0, 1, 0, 1, 0, 0), + BV8(0, 1, 0, 0, 1, 0, 1, 0)) + +/* S2: */ +#define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1) +.Ltf_s2_bitmatrix: + .quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1), + BV8(0, 0, 1, 1, 1, 1, 1, 1), + BV8(1, 1, 1, 0, 1, 1, 0, 1), + BV8(1, 1, 0, 0, 0, 0, 1, 1), + BV8(0, 1, 0, 0, 0, 0, 1, 1), + BV8(1, 1, 0, 0, 1, 1, 1, 0), + BV8(0, 1, 1, 0, 0, 0, 1, 1), + BV8(1, 1, 1, 1, 0, 1, 1, 0)) + +/* X2: */ +#define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0) +.Ltf_x2_bitmatrix: + .quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0), + BV8(0, 0, 1, 0, 0, 1, 1, 0), + BV8(0, 0, 0, 0, 1, 0, 1, 0), + BV8(1, 1, 1, 0, 0, 0, 1, 1), + BV8(1, 1, 1, 0, 1, 1, 0, 0), + BV8(0, 1, 1, 0, 1, 0, 1, 1), + BV8(1, 0, 1, 1, 1, 1, 0, 1), + BV8(1, 0, 0, 1, 0, 0, 1, 1)) + +/* Identity matrix: */ +.Ltf_id_bitmatrix: + .quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0), + BV8(0, 1, 0, 0, 0, 0, 0, 0), + BV8(0, 0, 1, 0, 0, 0, 0, 0), + BV8(0, 0, 0, 1, 0, 0, 0, 0), + BV8(0, 0, 0, 0, 1, 0, 0, 0), + BV8(0, 0, 0, 0, 0, 1, 0, 0), + BV8(0, 0, 0, 0, 0, 0, 1, 0), + BV8(0, 0, 0, 0, 0, 0, 0, 1)) + +/* 4-bit mask */ +.section .rodata.cst4.L0f0f0f0f, "aM", @progbits, 4 +.align 4 +.L0f0f0f0f: + .long 0x0f0f0f0f + +.text + +SYM_FUNC_START_LOCAL(__aria_aesni_avx2_crypt_32way) + /* input: + * %r9: rk + * %rsi: dst + * %rdx: src + * %ymm0..%ymm15: byte-sliced blocks + */ + + FRAME_BEGIN + + movq %rsi, %rax; + leaq 8 * 32(%rax), %r8; + + inpack16_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, + %ymm15, %rax, %r8); + aria_fo(%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, + %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, + %rax, %r9, 0); + aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7, + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, + %ymm15, %rax, %r9, 1); + aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15, + %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, + %rax, %r9, 2); + aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7, + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, + %ymm15, %rax, %r9, 3); + aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15, + %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, + %rax, %r9, 4); + aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7, + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, + %ymm15, %rax, %r9, 5); + aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15, + %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, + %rax, %r9, 6); + aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7, + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, + %ymm15, %rax, %r9, 7); + aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15, + %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, + %rax, %r9, 8); + aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7, + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, + %ymm15, %rax, %r9, 9); + aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15, + %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, + %rax, %r9, 10); + cmpl $12, ARIA_CTX_rounds(CTX); + jne .Laria_192; + aria_ff(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7, + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, + %ymm15, %rax, %r9, 11, 12); + jmp .Laria_end; +.Laria_192: + aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7, + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, + %ymm15, %rax, %r9, 11); + aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15, + %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, + %rax, %r9, 12); + cmpl $14, ARIA_CTX_rounds(CTX); + jne .Laria_256; + aria_ff(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7, + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, + %ymm15, %rax, %r9, 13, 14); + jmp .Laria_end; +.Laria_256: + aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7, + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, + %ymm15, %rax, %r9, 13); + aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15, + %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, + %rax, %r9, 14); + aria_ff(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7, + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, + %ymm15, %rax, %r9, 15, 16); +.Laria_end: + debyteslice_16x16b(%ymm8, %ymm12, %ymm1, %ymm4, + %ymm9, %ymm13, %ymm0, %ymm5, + %ymm10, %ymm14, %ymm3, %ymm6, + %ymm11, %ymm15, %ymm2, %ymm7, + (%rax), (%r8)); + + FRAME_END + RET; +SYM_FUNC_END(__aria_aesni_avx2_crypt_32way) + +SYM_TYPED_FUNC_START(aria_aesni_avx2_encrypt_32way) + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + */ + + FRAME_BEGIN + + leaq ARIA_CTX_enc_key(CTX), %r9; + + inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, + %ymm15, %rdx); + + call __aria_aesni_avx2_crypt_32way; + + write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7, + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, + %ymm15, %rax); + + FRAME_END + RET; +SYM_FUNC_END(aria_aesni_avx2_encrypt_32way) + +SYM_TYPED_FUNC_START(aria_aesni_avx2_decrypt_32way) + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + */ + + FRAME_BEGIN + + leaq ARIA_CTX_dec_key(CTX), %r9; + + inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, + %ymm15, %rdx); + + call __aria_aesni_avx2_crypt_32way; + + write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7, + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, + %ymm15, %rax); + + FRAME_END + RET; +SYM_FUNC_END(aria_aesni_avx2_decrypt_32way) + +SYM_FUNC_START_LOCAL(__aria_aesni_avx2_ctr_gen_keystream_32way) + /* input: + * %rdi: ctx + * %rsi: dst + * %rdx: src + * %rcx: keystream + * %r8: iv (big endian, 128bit) + */ + + FRAME_BEGIN + movq 8(%r8), %r11; + bswapq %r11; + + vbroadcasti128 .Lbswap128_mask (%rip), %ymm6; + vpcmpeqd %ymm0, %ymm0, %ymm0; + vpsrldq $8, %ymm0, %ymm0; /* ab: -1:0 ; cd: -1:0 */ + vpaddq %ymm0, %ymm0, %ymm5; /* ab: -2:0 ; cd: -2:0 */ + + /* load IV and byteswap */ + vmovdqu (%r8), %xmm7; + vpshufb %xmm6, %xmm7, %xmm7; + vmovdqa %xmm7, %xmm3; + inc_le128(%xmm7, %xmm0, %xmm4); + vinserti128 $1, %xmm7, %ymm3, %ymm3; + vpshufb %ymm6, %ymm3, %ymm8; /* +1 ; +0 */ + + /* check need for handling 64-bit overflow and carry */ + cmpq $(0xffffffffffffffff - 32), %r11; + ja .Lhandle_ctr_carry; + + /* construct IVs */ + vpsubq %ymm5, %ymm3, %ymm3; /* +3 ; +2 */ + vpshufb %ymm6, %ymm3, %ymm9; + vpsubq %ymm5, %ymm3, %ymm3; /* +5 ; +4 */ + vpshufb %ymm6, %ymm3, %ymm10; + vpsubq %ymm5, %ymm3, %ymm3; /* +7 ; +6 */ + vpshufb %ymm6, %ymm3, %ymm11; + vpsubq %ymm5, %ymm3, %ymm3; /* +9 ; +8 */ + vpshufb %ymm6, %ymm3, %ymm12; + vpsubq %ymm5, %ymm3, %ymm3; /* +11 ; +10 */ + vpshufb %ymm6, %ymm3, %ymm13; + vpsubq %ymm5, %ymm3, %ymm3; /* +13 ; +12 */ + vpshufb %ymm6, %ymm3, %ymm14; + vpsubq %ymm5, %ymm3, %ymm3; /* +15 ; +14 */ + vpshufb %ymm6, %ymm3, %ymm15; + vmovdqu %ymm8, (0 * 32)(%rcx); + vmovdqu %ymm9, (1 * 32)(%rcx); + vmovdqu %ymm10, (2 * 32)(%rcx); + vmovdqu %ymm11, (3 * 32)(%rcx); + vmovdqu %ymm12, (4 * 32)(%rcx); + vmovdqu %ymm13, (5 * 32)(%rcx); + vmovdqu %ymm14, (6 * 32)(%rcx); + vmovdqu %ymm15, (7 * 32)(%rcx); + + vpsubq %ymm5, %ymm3, %ymm3; /* +17 ; +16 */ + vpshufb %ymm6, %ymm3, %ymm8; + vpsubq %ymm5, %ymm3, %ymm3; /* +19 ; +18 */ + vpshufb %ymm6, %ymm3, %ymm9; + vpsubq %ymm5, %ymm3, %ymm3; /* +21 ; +20 */ + vpshufb %ymm6, %ymm3, %ymm10; + vpsubq %ymm5, %ymm3, %ymm3; /* +23 ; +22 */ + vpshufb %ymm6, %ymm3, %ymm11; + vpsubq %ymm5, %ymm3, %ymm3; /* +25 ; +24 */ + vpshufb %ymm6, %ymm3, %ymm12; + vpsubq %ymm5, %ymm3, %ymm3; /* +27 ; +26 */ + vpshufb %ymm6, %ymm3, %ymm13; + vpsubq %ymm5, %ymm3, %ymm3; /* +29 ; +28 */ + vpshufb %ymm6, %ymm3, %ymm14; + vpsubq %ymm5, %ymm3, %ymm3; /* +31 ; +30 */ + vpshufb %ymm6, %ymm3, %ymm15; + vpsubq %ymm5, %ymm3, %ymm3; /* +32 */ + vpshufb %xmm6, %xmm3, %xmm3; + vmovdqu %xmm3, (%r8); + vmovdqu (0 * 32)(%rcx), %ymm0; + vmovdqu (1 * 32)(%rcx), %ymm1; + vmovdqu (2 * 32)(%rcx), %ymm2; + vmovdqu (3 * 32)(%rcx), %ymm3; + vmovdqu (4 * 32)(%rcx), %ymm4; + vmovdqu (5 * 32)(%rcx), %ymm5; + vmovdqu (6 * 32)(%rcx), %ymm6; + vmovdqu (7 * 32)(%rcx), %ymm7; + jmp .Lctr_carry_done; + + .Lhandle_ctr_carry: + /* construct IVs */ + inc_le128(%ymm3, %ymm0, %ymm4); + inc_le128(%ymm3, %ymm0, %ymm4); + vpshufb %ymm6, %ymm3, %ymm9; /* +3 ; +2 */ + inc_le128(%ymm3, %ymm0, %ymm4); + inc_le128(%ymm3, %ymm0, %ymm4); + vpshufb %ymm6, %ymm3, %ymm10; /* +5 ; +4 */ + inc_le128(%ymm3, %ymm0, %ymm4); + inc_le128(%ymm3, %ymm0, %ymm4); + vpshufb %ymm6, %ymm3, %ymm11; /* +7 ; +6 */ + inc_le128(%ymm3, %ymm0, %ymm4); + inc_le128(%ymm3, %ymm0, %ymm4); + vpshufb %ymm6, %ymm3, %ymm12; /* +9 ; +8 */ + inc_le128(%ymm3, %ymm0, %ymm4); + inc_le128(%ymm3, %ymm0, %ymm4); + vpshufb %ymm6, %ymm3, %ymm13; /* +11 ; +10 */ + inc_le128(%ymm3, %ymm0, %ymm4); + inc_le128(%ymm3, %ymm0, %ymm4); + vpshufb %ymm6, %ymm3, %ymm14; /* +13 ; +12 */ + inc_le128(%ymm3, %ymm0, %ymm4); + inc_le128(%ymm3, %ymm0, %ymm4); + vpshufb %ymm6, %ymm3, %ymm15; /* +15 ; +14 */ + vmovdqu %ymm8, (0 * 32)(%rcx); + vmovdqu %ymm9, (1 * 32)(%rcx); + vmovdqu %ymm10, (2 * 32)(%rcx); + vmovdqu %ymm11, (3 * 32)(%rcx); + vmovdqu %ymm12, (4 * 32)(%rcx); + vmovdqu %ymm13, (5 * 32)(%rcx); + vmovdqu %ymm14, (6 * 32)(%rcx); + vmovdqu %ymm15, (7 * 32)(%rcx); + + inc_le128(%ymm3, %ymm0, %ymm4); + inc_le128(%ymm3, %ymm0, %ymm4); + vpshufb %ymm6, %ymm3, %ymm8; /* +17 ; +16 */ + inc_le128(%ymm3, %ymm0, %ymm4); + inc_le128(%ymm3, %ymm0, %ymm4); + vpshufb %ymm6, %ymm3, %ymm9; /* +19 ; +18 */ + inc_le128(%ymm3, %ymm0, %ymm4); + inc_le128(%ymm3, %ymm0, %ymm4); + vpshufb %ymm6, %ymm3, %ymm10; /* +21 ; +20 */ + inc_le128(%ymm3, %ymm0, %ymm4); + inc_le128(%ymm3, %ymm0, %ymm4); + vpshufb %ymm6, %ymm3, %ymm11; /* +23 ; +22 */ + inc_le128(%ymm3, %ymm0, %ymm4); + inc_le128(%ymm3, %ymm0, %ymm4); + vpshufb %ymm6, %ymm3, %ymm12; /* +25 ; +24 */ + inc_le128(%ymm3, %ymm0, %ymm4); + inc_le128(%ymm3, %ymm0, %ymm4); + vpshufb %ymm6, %ymm3, %ymm13; /* +27 ; +26 */ + inc_le128(%ymm3, %ymm0, %ymm4); + inc_le128(%ymm3, %ymm0, %ymm4); + vpshufb %ymm6, %ymm3, %ymm14; /* +29 ; +28 */ + inc_le128(%ymm3, %ymm0, %ymm4); + inc_le128(%ymm3, %ymm0, %ymm4); + vpshufb %ymm6, %ymm3, %ymm15; /* +31 ; +30 */ + inc_le128(%ymm3, %ymm0, %ymm4); + vextracti128 $1, %ymm3, %xmm3; + vpshufb %xmm6, %xmm3, %xmm3; /* +32 */ + vmovdqu %xmm3, (%r8); + vmovdqu (0 * 32)(%rcx), %ymm0; + vmovdqu (1 * 32)(%rcx), %ymm1; + vmovdqu (2 * 32)(%rcx), %ymm2; + vmovdqu (3 * 32)(%rcx), %ymm3; + vmovdqu (4 * 32)(%rcx), %ymm4; + vmovdqu (5 * 32)(%rcx), %ymm5; + vmovdqu (6 * 32)(%rcx), %ymm6; + vmovdqu (7 * 32)(%rcx), %ymm7; + + .Lctr_carry_done: + + FRAME_END + RET; +SYM_FUNC_END(__aria_aesni_avx2_ctr_gen_keystream_32way) + +SYM_TYPED_FUNC_START(aria_aesni_avx2_ctr_crypt_32way) + /* input: + * %rdi: ctx + * %rsi: dst + * %rdx: src + * %rcx: keystream + * %r8: iv (big endian, 128bit) + */ + FRAME_BEGIN + + call __aria_aesni_avx2_ctr_gen_keystream_32way; + + leaq (%rsi), %r10; + leaq (%rdx), %r11; + leaq (%rcx), %rsi; + leaq (%rcx), %rdx; + leaq ARIA_CTX_enc_key(CTX), %r9; + + call __aria_aesni_avx2_crypt_32way; + + vpxor (0 * 32)(%r11), %ymm1, %ymm1; + vpxor (1 * 32)(%r11), %ymm0, %ymm0; + vpxor (2 * 32)(%r11), %ymm3, %ymm3; + vpxor (3 * 32)(%r11), %ymm2, %ymm2; + vpxor (4 * 32)(%r11), %ymm4, %ymm4; + vpxor (5 * 32)(%r11), %ymm5, %ymm5; + vpxor (6 * 32)(%r11), %ymm6, %ymm6; + vpxor (7 * 32)(%r11), %ymm7, %ymm7; + vpxor (8 * 32)(%r11), %ymm8, %ymm8; + vpxor (9 * 32)(%r11), %ymm9, %ymm9; + vpxor (10 * 32)(%r11), %ymm10, %ymm10; + vpxor (11 * 32)(%r11), %ymm11, %ymm11; + vpxor (12 * 32)(%r11), %ymm12, %ymm12; + vpxor (13 * 32)(%r11), %ymm13, %ymm13; + vpxor (14 * 32)(%r11), %ymm14, %ymm14; + vpxor (15 * 32)(%r11), %ymm15, %ymm15; + write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7, + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, + %ymm15, %r10); + + FRAME_END + RET; +SYM_FUNC_END(aria_aesni_avx2_ctr_crypt_32way) + +SYM_FUNC_START_LOCAL(__aria_aesni_avx2_gfni_crypt_32way) + /* input: + * %r9: rk + * %rsi: dst + * %rdx: src + * %ymm0..%ymm15: 16 byte-sliced blocks + */ + + FRAME_BEGIN + + movq %rsi, %rax; + leaq 8 * 32(%rax), %r8; + + inpack16_post(%ymm0, %ymm1, %ymm2, %ymm3, + %ymm4, %ymm5, %ymm6, %ymm7, + %ymm8, %ymm9, %ymm10, %ymm11, + %ymm12, %ymm13, %ymm14, + %ymm15, %rax, %r8); + aria_fo_gfni(%ymm8, %ymm9, %ymm10, %ymm11, + %ymm12, %ymm13, %ymm14, %ymm15, + %ymm0, %ymm1, %ymm2, %ymm3, + %ymm4, %ymm5, %ymm6, %ymm7, + %rax, %r9, 0); + aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2, + %ymm4, %ymm5, %ymm6, %ymm7, + %ymm8, %ymm9, %ymm10, %ymm11, + %ymm12, %ymm13, %ymm14, + %ymm15, %rax, %r9, 1); + aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10, + %ymm12, %ymm13, %ymm14, %ymm15, + %ymm0, %ymm1, %ymm2, %ymm3, + %ymm4, %ymm5, %ymm6, %ymm7, + %rax, %r9, 2); + aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2, + %ymm4, %ymm5, %ymm6, %ymm7, + %ymm8, %ymm9, %ymm10, %ymm11, + %ymm12, %ymm13, %ymm14, + %ymm15, %rax, %r9, 3); + aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10, + %ymm12, %ymm13, %ymm14, %ymm15, + %ymm0, %ymm1, %ymm2, %ymm3, + %ymm4, %ymm5, %ymm6, %ymm7, + %rax, %r9, 4); + aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2, + %ymm4, %ymm5, %ymm6, %ymm7, + %ymm8, %ymm9, %ymm10, %ymm11, + %ymm12, %ymm13, %ymm14, + %ymm15, %rax, %r9, 5); + aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10, + %ymm12, %ymm13, %ymm14, %ymm15, + %ymm0, %ymm1, %ymm2, %ymm3, + %ymm4, %ymm5, %ymm6, %ymm7, + %rax, %r9, 6); + aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2, + %ymm4, %ymm5, %ymm6, %ymm7, + %ymm8, %ymm9, %ymm10, %ymm11, + %ymm12, %ymm13, %ymm14, + %ymm15, %rax, %r9, 7); + aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10, + %ymm12, %ymm13, %ymm14, %ymm15, + %ymm0, %ymm1, %ymm2, %ymm3, + %ymm4, %ymm5, %ymm6, %ymm7, + %rax, %r9, 8); + aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2, + %ymm4, %ymm5, %ymm6, %ymm7, + %ymm8, %ymm9, %ymm10, %ymm11, + %ymm12, %ymm13, %ymm14, + %ymm15, %rax, %r9, 9); + aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10, + %ymm12, %ymm13, %ymm14, %ymm15, + %ymm0, %ymm1, %ymm2, %ymm3, + %ymm4, %ymm5, %ymm6, %ymm7, + %rax, %r9, 10); + cmpl $12, ARIA_CTX_rounds(CTX); + jne .Laria_gfni_192; + aria_ff_gfni(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7, + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, + %ymm15, %rax, %r9, 11, 12); + jmp .Laria_gfni_end; +.Laria_gfni_192: + aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2, + %ymm4, %ymm5, %ymm6, %ymm7, + %ymm8, %ymm9, %ymm10, %ymm11, + %ymm12, %ymm13, %ymm14, + %ymm15, %rax, %r9, 11); + aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10, + %ymm12, %ymm13, %ymm14, %ymm15, + %ymm0, %ymm1, %ymm2, %ymm3, + %ymm4, %ymm5, %ymm6, %ymm7, + %rax, %r9, 12); + cmpl $14, ARIA_CTX_rounds(CTX); + jne .Laria_gfni_256; + aria_ff_gfni(%ymm1, %ymm0, %ymm3, %ymm2, + %ymm4, %ymm5, %ymm6, %ymm7, + %ymm8, %ymm9, %ymm10, %ymm11, + %ymm12, %ymm13, %ymm14, + %ymm15, %rax, %r9, 13, 14); + jmp .Laria_gfni_end; +.Laria_gfni_256: + aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2, + %ymm4, %ymm5, %ymm6, %ymm7, + %ymm8, %ymm9, %ymm10, %ymm11, + %ymm12, %ymm13, %ymm14, + %ymm15, %rax, %r9, 13); + aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10, + %ymm12, %ymm13, %ymm14, %ymm15, + %ymm0, %ymm1, %ymm2, %ymm3, + %ymm4, %ymm5, %ymm6, %ymm7, + %rax, %r9, 14); + aria_ff_gfni(%ymm1, %ymm0, %ymm3, %ymm2, + %ymm4, %ymm5, %ymm6, %ymm7, + %ymm8, %ymm9, %ymm10, %ymm11, + %ymm12, %ymm13, %ymm14, + %ymm15, %rax, %r9, 15, 16); +.Laria_gfni_end: + debyteslice_16x16b(%ymm8, %ymm12, %ymm1, %ymm4, + %ymm9, %ymm13, %ymm0, %ymm5, + %ymm10, %ymm14, %ymm3, %ymm6, + %ymm11, %ymm15, %ymm2, %ymm7, + (%rax), (%r8)); + + FRAME_END + RET; +SYM_FUNC_END(__aria_aesni_avx2_gfni_crypt_32way) + +SYM_TYPED_FUNC_START(aria_aesni_avx2_gfni_encrypt_32way) + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + */ + + FRAME_BEGIN + + leaq ARIA_CTX_enc_key(CTX), %r9; + + inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, + %ymm15, %rdx); + + call __aria_aesni_avx2_gfni_crypt_32way; + + write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7, + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, + %ymm15, %rax); + + FRAME_END + RET; +SYM_FUNC_END(aria_aesni_avx2_gfni_encrypt_32way) + +SYM_TYPED_FUNC_START(aria_aesni_avx2_gfni_decrypt_32way) + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + */ + + FRAME_BEGIN + + leaq ARIA_CTX_dec_key(CTX), %r9; + + inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, + %ymm15, %rdx); + + call __aria_aesni_avx2_gfni_crypt_32way; + + write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7, + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, + %ymm15, %rax); + + FRAME_END + RET; +SYM_FUNC_END(aria_aesni_avx2_gfni_decrypt_32way) + +SYM_TYPED_FUNC_START(aria_aesni_avx2_gfni_ctr_crypt_32way) + /* input: + * %rdi: ctx + * %rsi: dst + * %rdx: src + * %rcx: keystream + * %r8: iv (big endian, 128bit) + */ + FRAME_BEGIN + + call __aria_aesni_avx2_ctr_gen_keystream_32way + + leaq (%rsi), %r10; + leaq (%rdx), %r11; + leaq (%rcx), %rsi; + leaq (%rcx), %rdx; + leaq ARIA_CTX_enc_key(CTX), %r9; + + call __aria_aesni_avx2_gfni_crypt_32way; + + vpxor (0 * 32)(%r11), %ymm1, %ymm1; + vpxor (1 * 32)(%r11), %ymm0, %ymm0; + vpxor (2 * 32)(%r11), %ymm3, %ymm3; + vpxor (3 * 32)(%r11), %ymm2, %ymm2; + vpxor (4 * 32)(%r11), %ymm4, %ymm4; + vpxor (5 * 32)(%r11), %ymm5, %ymm5; + vpxor (6 * 32)(%r11), %ymm6, %ymm6; + vpxor (7 * 32)(%r11), %ymm7, %ymm7; + vpxor (8 * 32)(%r11), %ymm8, %ymm8; + vpxor (9 * 32)(%r11), %ymm9, %ymm9; + vpxor (10 * 32)(%r11), %ymm10, %ymm10; + vpxor (11 * 32)(%r11), %ymm11, %ymm11; + vpxor (12 * 32)(%r11), %ymm12, %ymm12; + vpxor (13 * 32)(%r11), %ymm13, %ymm13; + vpxor (14 * 32)(%r11), %ymm14, %ymm14; + vpxor (15 * 32)(%r11), %ymm15, %ymm15; + write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7, + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, + %ymm15, %r10); + + FRAME_END + RET; +SYM_FUNC_END(aria_aesni_avx2_gfni_ctr_crypt_32way) diff --git a/arch/x86/crypto/aria-avx.h b/arch/x86/crypto/aria-avx.h index 01e9a01dc157..9f2d847c460a 100644 --- a/arch/x86/crypto/aria-avx.h +++ b/arch/x86/crypto/aria-avx.h @@ -5,12 +5,50 @@ #include #define ARIA_AESNI_PARALLEL_BLOCKS 16 -#define ARIA_AESNI_PARALLEL_BLOCK_SIZE (ARIA_BLOCK_SIZE * 16) +#define ARIA_AESNI_PARALLEL_BLOCK_SIZE (ARIA_BLOCK_SIZE * ARIA_AESNI_PARALLEL_BLOCKS) + +#define ARIA_AESNI_AVX2_PARALLEL_BLOCKS 32 +#define ARIA_AESNI_AVX2_PARALLEL_BLOCK_SIZE (ARIA_BLOCK_SIZE * ARIA_AESNI_AVX2_PARALLEL_BLOCKS) + +asmlinkage void aria_aesni_avx_encrypt_16way(const void *ctx, u8 *dst, + const u8 *src); +asmlinkage void aria_aesni_avx_decrypt_16way(const void *ctx, u8 *dst, + const u8 *src); +asmlinkage void aria_aesni_avx_ctr_crypt_16way(const void *ctx, u8 *dst, + const u8 *src, + u8 *keystream, u8 *iv); +asmlinkage void aria_aesni_avx_gfni_encrypt_16way(const void *ctx, u8 *dst, + const u8 *src); +asmlinkage void aria_aesni_avx_gfni_decrypt_16way(const void *ctx, u8 *dst, + const u8 *src); +asmlinkage void aria_aesni_avx_gfni_ctr_crypt_16way(const void *ctx, u8 *dst, + const u8 *src, + u8 *keystream, u8 *iv); + +asmlinkage void aria_aesni_avx2_encrypt_32way(const void *ctx, u8 *dst, + const u8 *src); +asmlinkage void aria_aesni_avx2_decrypt_32way(const void *ctx, u8 *dst, + const u8 *src); +asmlinkage void aria_aesni_avx2_ctr_crypt_32way(const void *ctx, u8 *dst, + const u8 *src, + u8 *keystream, u8 *iv); +asmlinkage void aria_aesni_avx2_gfni_encrypt_32way(const void *ctx, u8 *dst, + const u8 *src); +asmlinkage void aria_aesni_avx2_gfni_decrypt_32way(const void *ctx, u8 *dst, + const u8 *src); +asmlinkage void aria_aesni_avx2_gfni_ctr_crypt_32way(const void *ctx, u8 *dst, + const u8 *src, + u8 *keystream, u8 *iv); struct aria_avx_ops { void (*aria_encrypt_16way)(const void *ctx, u8 *dst, const u8 *src); void (*aria_decrypt_16way)(const void *ctx, u8 *dst, const u8 *src); void (*aria_ctr_crypt_16way)(const void *ctx, u8 *dst, const u8 *src, u8 *keystream, u8 *iv); + void (*aria_encrypt_32way)(const void *ctx, u8 *dst, const u8 *src); + void (*aria_decrypt_32way)(const void *ctx, u8 *dst, const u8 *src); + void (*aria_ctr_crypt_32way)(const void *ctx, u8 *dst, const u8 *src, + u8 *keystream, u8 *iv); + }; #endif diff --git a/arch/x86/crypto/aria_aesni_avx2_glue.c b/arch/x86/crypto/aria_aesni_avx2_glue.c new file mode 100644 index 000000000000..95fccc6dc420 --- /dev/null +++ b/arch/x86/crypto/aria_aesni_avx2_glue.c @@ -0,0 +1,252 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Glue Code for the AVX2/AES-NI/GFNI assembler implementation of the ARIA Cipher + * + * Copyright (c) 2022 Taehee Yoo + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "ecb_cbc_helpers.h" +#include "aria-avx.h" + +asmlinkage void aria_aesni_avx2_encrypt_32way(const void *ctx, u8 *dst, + const u8 *src); +EXPORT_SYMBOL_GPL(aria_aesni_avx2_encrypt_32way); +asmlinkage void aria_aesni_avx2_decrypt_32way(const void *ctx, u8 *dst, + const u8 *src); +EXPORT_SYMBOL_GPL(aria_aesni_avx2_decrypt_32way); +asmlinkage void aria_aesni_avx2_ctr_crypt_32way(const void *ctx, u8 *dst, + const u8 *src, + u8 *keystream, u8 *iv); +EXPORT_SYMBOL_GPL(aria_aesni_avx2_ctr_crypt_32way); +asmlinkage void aria_aesni_avx2_gfni_encrypt_32way(const void *ctx, u8 *dst, + const u8 *src); +EXPORT_SYMBOL_GPL(aria_aesni_avx2_gfni_encrypt_32way); +asmlinkage void aria_aesni_avx2_gfni_decrypt_32way(const void *ctx, u8 *dst, + const u8 *src); +EXPORT_SYMBOL_GPL(aria_aesni_avx2_gfni_decrypt_32way); +asmlinkage void aria_aesni_avx2_gfni_ctr_crypt_32way(const void *ctx, u8 *dst, + const u8 *src, + u8 *keystream, u8 *iv); +EXPORT_SYMBOL_GPL(aria_aesni_avx2_gfni_ctr_crypt_32way); + +static struct aria_avx_ops aria_ops; + +struct aria_avx2_request_ctx { + u8 keystream[ARIA_AESNI_AVX2_PARALLEL_BLOCK_SIZE]; +}; + +static int ecb_do_encrypt(struct skcipher_request *req, const u32 *rkey) +{ + ECB_WALK_START(req, ARIA_BLOCK_SIZE, ARIA_AESNI_PARALLEL_BLOCKS); + ECB_BLOCK(ARIA_AESNI_AVX2_PARALLEL_BLOCKS, aria_ops.aria_encrypt_32way); + ECB_BLOCK(ARIA_AESNI_PARALLEL_BLOCKS, aria_ops.aria_encrypt_16way); + ECB_BLOCK(1, aria_encrypt); + ECB_WALK_END(); +} + +static int ecb_do_decrypt(struct skcipher_request *req, const u32 *rkey) +{ + ECB_WALK_START(req, ARIA_BLOCK_SIZE, ARIA_AESNI_PARALLEL_BLOCKS); + ECB_BLOCK(ARIA_AESNI_AVX2_PARALLEL_BLOCKS, aria_ops.aria_decrypt_32way); + ECB_BLOCK(ARIA_AESNI_PARALLEL_BLOCKS, aria_ops.aria_decrypt_16way); + ECB_BLOCK(1, aria_decrypt); + ECB_WALK_END(); +} + +static int aria_avx2_ecb_encrypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct aria_ctx *ctx = crypto_skcipher_ctx(tfm); + + return ecb_do_encrypt(req, ctx->enc_key[0]); +} + +static int aria_avx2_ecb_decrypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct aria_ctx *ctx = crypto_skcipher_ctx(tfm); + + return ecb_do_decrypt(req, ctx->dec_key[0]); +} + +static int aria_avx2_set_key(struct crypto_skcipher *tfm, const u8 *key, + unsigned int keylen) +{ + return aria_set_key(&tfm->base, key, keylen); +} + +static int aria_avx2_ctr_encrypt(struct skcipher_request *req) +{ + struct aria_avx2_request_ctx *req_ctx = skcipher_request_ctx(req); + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct aria_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; + unsigned int nbytes; + int err; + + err = skcipher_walk_virt(&walk, req, false); + + while ((nbytes = walk.nbytes) > 0) { + const u8 *src = walk.src.virt.addr; + u8 *dst = walk.dst.virt.addr; + + while (nbytes >= ARIA_AESNI_AVX2_PARALLEL_BLOCK_SIZE) { + kernel_fpu_begin(); + aria_ops.aria_ctr_crypt_32way(ctx, dst, src, + &req_ctx->keystream[0], + walk.iv); + kernel_fpu_end(); + dst += ARIA_AESNI_AVX2_PARALLEL_BLOCK_SIZE; + src += ARIA_AESNI_AVX2_PARALLEL_BLOCK_SIZE; + nbytes -= ARIA_AESNI_AVX2_PARALLEL_BLOCK_SIZE; + } + + while (nbytes >= ARIA_AESNI_PARALLEL_BLOCK_SIZE) { + kernel_fpu_begin(); + aria_ops.aria_ctr_crypt_16way(ctx, dst, src, + &req_ctx->keystream[0], + walk.iv); + kernel_fpu_end(); + dst += ARIA_AESNI_PARALLEL_BLOCK_SIZE; + src += ARIA_AESNI_PARALLEL_BLOCK_SIZE; + nbytes -= ARIA_AESNI_PARALLEL_BLOCK_SIZE; + } + + while (nbytes >= ARIA_BLOCK_SIZE) { + memcpy(&req_ctx->keystream[0], walk.iv, ARIA_BLOCK_SIZE); + crypto_inc(walk.iv, ARIA_BLOCK_SIZE); + + aria_encrypt(ctx, &req_ctx->keystream[0], + &req_ctx->keystream[0]); + + crypto_xor_cpy(dst, src, &req_ctx->keystream[0], + ARIA_BLOCK_SIZE); + dst += ARIA_BLOCK_SIZE; + src += ARIA_BLOCK_SIZE; + nbytes -= ARIA_BLOCK_SIZE; + } + + if (walk.nbytes == walk.total && nbytes > 0) { + memcpy(&req_ctx->keystream[0], walk.iv, + ARIA_BLOCK_SIZE); + crypto_inc(walk.iv, ARIA_BLOCK_SIZE); + + aria_encrypt(ctx, &req_ctx->keystream[0], + &req_ctx->keystream[0]); + + crypto_xor_cpy(dst, src, &req_ctx->keystream[0], + nbytes); + dst += nbytes; + src += nbytes; + nbytes = 0; + } + err = skcipher_walk_done(&walk, nbytes); + } + + return err; +} + +static int aria_avx2_init_tfm(struct crypto_skcipher *tfm) +{ + crypto_skcipher_set_reqsize(tfm, sizeof(struct aria_avx2_request_ctx)); + + return 0; +} + +static struct skcipher_alg aria_algs[] = { + { + .base.cra_name = "__ecb(aria)", + .base.cra_driver_name = "__ecb-aria-avx2", + .base.cra_priority = 500, + .base.cra_flags = CRYPTO_ALG_INTERNAL, + .base.cra_blocksize = ARIA_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct aria_ctx), + .base.cra_module = THIS_MODULE, + .min_keysize = ARIA_MIN_KEY_SIZE, + .max_keysize = ARIA_MAX_KEY_SIZE, + .setkey = aria_avx2_set_key, + .encrypt = aria_avx2_ecb_encrypt, + .decrypt = aria_avx2_ecb_decrypt, + }, { + .base.cra_name = "__ctr(aria)", + .base.cra_driver_name = "__ctr-aria-avx2", + .base.cra_priority = 500, + .base.cra_flags = CRYPTO_ALG_INTERNAL | + CRYPTO_ALG_SKCIPHER_REQSIZE_LARGE, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct aria_ctx), + .base.cra_module = THIS_MODULE, + .min_keysize = ARIA_MIN_KEY_SIZE, + .max_keysize = ARIA_MAX_KEY_SIZE, + .ivsize = ARIA_BLOCK_SIZE, + .chunksize = ARIA_BLOCK_SIZE, + .setkey = aria_avx2_set_key, + .encrypt = aria_avx2_ctr_encrypt, + .decrypt = aria_avx2_ctr_encrypt, + .init = aria_avx2_init_tfm, + } +}; + +static struct simd_skcipher_alg *aria_simd_algs[ARRAY_SIZE(aria_algs)]; + +static int __init aria_avx2_init(void) +{ + const char *feature_name; + + if (!boot_cpu_has(X86_FEATURE_AVX) || + !boot_cpu_has(X86_FEATURE_AVX2) || + !boot_cpu_has(X86_FEATURE_AES) || + !boot_cpu_has(X86_FEATURE_OSXSAVE)) { + pr_info("AVX2 or AES-NI instructions are not detected.\n"); + return -ENODEV; + } + + if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, + &feature_name)) { + pr_info("CPU feature '%s' is not supported.\n", feature_name); + return -ENODEV; + } + + if (boot_cpu_has(X86_FEATURE_GFNI)) { + aria_ops.aria_encrypt_16way = aria_aesni_avx_gfni_encrypt_16way; + aria_ops.aria_decrypt_16way = aria_aesni_avx_gfni_decrypt_16way; + aria_ops.aria_ctr_crypt_16way = aria_aesni_avx_gfni_ctr_crypt_16way; + aria_ops.aria_encrypt_32way = aria_aesni_avx2_gfni_encrypt_32way; + aria_ops.aria_decrypt_32way = aria_aesni_avx2_gfni_decrypt_32way; + aria_ops.aria_ctr_crypt_32way = aria_aesni_avx2_gfni_ctr_crypt_32way; + } else { + aria_ops.aria_encrypt_16way = aria_aesni_avx_encrypt_16way; + aria_ops.aria_decrypt_16way = aria_aesni_avx_decrypt_16way; + aria_ops.aria_ctr_crypt_16way = aria_aesni_avx_ctr_crypt_16way; + aria_ops.aria_encrypt_32way = aria_aesni_avx2_encrypt_32way; + aria_ops.aria_decrypt_32way = aria_aesni_avx2_decrypt_32way; + aria_ops.aria_ctr_crypt_32way = aria_aesni_avx2_ctr_crypt_32way; + } + + return simd_register_skciphers_compat(aria_algs, + ARRAY_SIZE(aria_algs), + aria_simd_algs); +} + +static void __exit aria_avx2_exit(void) +{ + simd_unregister_skciphers(aria_algs, ARRAY_SIZE(aria_algs), + aria_simd_algs); +} + +module_init(aria_avx2_init); +module_exit(aria_avx2_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Taehee Yoo "); +MODULE_DESCRIPTION("ARIA Cipher Algorithm, AVX2/AES-NI/GFNI optimized"); +MODULE_ALIAS_CRYPTO("aria"); +MODULE_ALIAS_CRYPTO("aria-aesni-avx2"); diff --git a/arch/x86/crypto/aria_aesni_avx_glue.c b/arch/x86/crypto/aria_aesni_avx_glue.c index 5f97e442349f..487094d64863 100644 --- a/arch/x86/crypto/aria_aesni_avx_glue.c +++ b/arch/x86/crypto/aria_aesni_avx_glue.c @@ -18,18 +18,24 @@ asmlinkage void aria_aesni_avx_encrypt_16way(const void *ctx, u8 *dst, const u8 *src); +EXPORT_SYMBOL_GPL(aria_aesni_avx_encrypt_16way); asmlinkage void aria_aesni_avx_decrypt_16way(const void *ctx, u8 *dst, const u8 *src); +EXPORT_SYMBOL_GPL(aria_aesni_avx_decrypt_16way); asmlinkage void aria_aesni_avx_ctr_crypt_16way(const void *ctx, u8 *dst, const u8 *src, u8 *keystream, u8 *iv); +EXPORT_SYMBOL_GPL(aria_aesni_avx_ctr_crypt_16way); asmlinkage void aria_aesni_avx_gfni_encrypt_16way(const void *ctx, u8 *dst, const u8 *src); +EXPORT_SYMBOL_GPL(aria_aesni_avx_gfni_encrypt_16way); asmlinkage void aria_aesni_avx_gfni_decrypt_16way(const void *ctx, u8 *dst, const u8 *src); +EXPORT_SYMBOL_GPL(aria_aesni_avx_gfni_decrypt_16way); asmlinkage void aria_aesni_avx_gfni_ctr_crypt_16way(const void *ctx, u8 *dst, const u8 *src, u8 *keystream, u8 *iv); +EXPORT_SYMBOL_GPL(aria_aesni_avx_gfni_ctr_crypt_16way); static struct aria_avx_ops aria_ops; From c970d42001f2dffd587cc38458fea4130796be43 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Sun, 1 Jan 2023 09:12:52 +0000 Subject: [PATCH 025/156] crypto: x86/aria - implement aria-avx512 aria-avx512 implementation uses AVX512 and GFNI. It supports 64way parallel processing. So, byteslicing code is changed to support 64way parallel. And it exports some aria-avx2 functions such as encrypt() and decrypt(). AVX and AVX2 have 16 registers. They should use memory to store/load state because of lack of registers. But AVX512 supports 32 registers. So, it doesn't require store/load in the s-box layer. It means that it can reduce overhead of store/load in the s-box layer. Also code become much simpler. Benchmark with modprobe tcrypt mode=610 num_mb=8192, i3-12100: ARIA-AVX512(128bit and 256bit) testing speed of multibuffer ecb(aria) (ecb-aria-avx512) encryption tcrypt: 1 operation in 1504 cycles (1024 bytes) tcrypt: 1 operation in 4595 cycles (4096 bytes) tcrypt: 1 operation in 1763 cycles (1024 bytes) tcrypt: 1 operation in 5540 cycles (4096 bytes) testing speed of multibuffer ecb(aria) (ecb-aria-avx512) decryption tcrypt: 1 operation in 1502 cycles (1024 bytes) tcrypt: 1 operation in 4615 cycles (4096 bytes) tcrypt: 1 operation in 1759 cycles (1024 bytes) tcrypt: 1 operation in 5554 cycles (4096 bytes) ARIA-AVX2 with GFNI(128bit and 256bit) testing speed of multibuffer ecb(aria) (ecb-aria-avx2) encryption tcrypt: 1 operation in 2003 cycles (1024 bytes) tcrypt: 1 operation in 5867 cycles (4096 bytes) tcrypt: 1 operation in 2358 cycles (1024 bytes) tcrypt: 1 operation in 7295 cycles (4096 bytes) testing speed of multibuffer ecb(aria) (ecb-aria-avx2) decryption tcrypt: 1 operation in 2004 cycles (1024 bytes) tcrypt: 1 operation in 5956 cycles (4096 bytes) tcrypt: 1 operation in 2409 cycles (1024 bytes) tcrypt: 1 operation in 7564 cycles (4096 bytes) Signed-off-by: Taehee Yoo Signed-off-by: Herbert Xu --- arch/x86/crypto/Kconfig | 19 + arch/x86/crypto/Makefile | 3 + arch/x86/crypto/aria-avx.h | 8 + arch/x86/crypto/aria-gfni-avx512-asm_64.S | 971 ++++++++++++++++++++++ arch/x86/crypto/aria_gfni_avx512_glue.c | 250 ++++++ 5 files changed, 1251 insertions(+) create mode 100644 arch/x86/crypto/aria-gfni-avx512-asm_64.S create mode 100644 arch/x86/crypto/aria_gfni_avx512_glue.c diff --git a/arch/x86/crypto/Kconfig b/arch/x86/crypto/Kconfig index 3837ba8b78c5..688e848f740d 100644 --- a/arch/x86/crypto/Kconfig +++ b/arch/x86/crypto/Kconfig @@ -323,6 +323,25 @@ config CRYPTO_ARIA_AESNI_AVX2_X86_64 Processes 32 blocks in parallel. +config CRYPTO_ARIA_GFNI_AVX512_X86_64 + tristate "Ciphers: ARIA with modes: ECB, CTR (AVX512/GFNI)" + depends on X86 && 64BIT + select CRYPTO_SKCIPHER + select CRYPTO_SIMD + select CRYPTO_ALGAPI + select CRYPTO_ARIA + select CRYPTO_ARIA_AESNI_AVX_X86_64 + select CRYPTO_ARIA_AESNI_AVX2_X86_64 + help + Length-preserving cipher: ARIA cipher algorithms + (RFC 5794) with ECB and CTR modes + + Architecture: x86_64 using: + - AVX512 (Advanced Vector Extensions) + - GFNI (Galois Field New Instructions) + + Processes 64 blocks in parallel. + config CRYPTO_CHACHA20_X86_64 tristate "Ciphers: ChaCha20, XChaCha20, XChaCha12 (SSSE3/AVX2/AVX-512VL)" depends on X86 && 64BIT diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index 8b834aa410b1..9aa46093c91b 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -106,6 +106,9 @@ aria-aesni-avx-x86_64-y := aria-aesni-avx-asm_64.o aria_aesni_avx_glue.o obj-$(CONFIG_CRYPTO_ARIA_AESNI_AVX2_X86_64) += aria-aesni-avx2-x86_64.o aria-aesni-avx2-x86_64-y := aria-aesni-avx2-asm_64.o aria_aesni_avx2_glue.o +obj-$(CONFIG_CRYPTO_ARIA_GFNI_AVX512_X86_64) += aria-gfni-avx512-x86_64.o +aria-gfni-avx512-x86_64-y := aria-gfni-avx512-asm_64.o aria_gfni_avx512_glue.o + quiet_cmd_perlasm = PERLASM $@ cmd_perlasm = $(PERL) $< > $@ $(obj)/%.S: $(src)/%.pl FORCE diff --git a/arch/x86/crypto/aria-avx.h b/arch/x86/crypto/aria-avx.h index 9f2d847c460a..6e1b2d8a31ed 100644 --- a/arch/x86/crypto/aria-avx.h +++ b/arch/x86/crypto/aria-avx.h @@ -10,6 +10,9 @@ #define ARIA_AESNI_AVX2_PARALLEL_BLOCKS 32 #define ARIA_AESNI_AVX2_PARALLEL_BLOCK_SIZE (ARIA_BLOCK_SIZE * ARIA_AESNI_AVX2_PARALLEL_BLOCKS) +#define ARIA_GFNI_AVX512_PARALLEL_BLOCKS 64 +#define ARIA_GFNI_AVX512_PARALLEL_BLOCK_SIZE (ARIA_BLOCK_SIZE * ARIA_GFNI_AVX512_PARALLEL_BLOCKS) + asmlinkage void aria_aesni_avx_encrypt_16way(const void *ctx, u8 *dst, const u8 *src); asmlinkage void aria_aesni_avx_decrypt_16way(const void *ctx, u8 *dst, @@ -49,6 +52,11 @@ struct aria_avx_ops { void (*aria_decrypt_32way)(const void *ctx, u8 *dst, const u8 *src); void (*aria_ctr_crypt_32way)(const void *ctx, u8 *dst, const u8 *src, u8 *keystream, u8 *iv); + void (*aria_encrypt_64way)(const void *ctx, u8 *dst, const u8 *src); + void (*aria_decrypt_64way)(const void *ctx, u8 *dst, const u8 *src); + void (*aria_ctr_crypt_64way)(const void *ctx, u8 *dst, const u8 *src, + u8 *keystream, u8 *iv); + }; #endif diff --git a/arch/x86/crypto/aria-gfni-avx512-asm_64.S b/arch/x86/crypto/aria-gfni-avx512-asm_64.S new file mode 100644 index 000000000000..3193f0701450 --- /dev/null +++ b/arch/x86/crypto/aria-gfni-avx512-asm_64.S @@ -0,0 +1,971 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * ARIA Cipher 64-way parallel algorithm (AVX512) + * + * Copyright (c) 2022 Taehee Yoo + * + */ + +#include +#include +#include +#include + +/* register macros */ +#define CTX %rdi + + +#define BV8(a0, a1, a2, a3, a4, a5, a6, a7) \ + ( (((a0) & 1) << 0) | \ + (((a1) & 1) << 1) | \ + (((a2) & 1) << 2) | \ + (((a3) & 1) << 3) | \ + (((a4) & 1) << 4) | \ + (((a5) & 1) << 5) | \ + (((a6) & 1) << 6) | \ + (((a7) & 1) << 7) ) + +#define BM8X8(l0, l1, l2, l3, l4, l5, l6, l7) \ + ( ((l7) << (0 * 8)) | \ + ((l6) << (1 * 8)) | \ + ((l5) << (2 * 8)) | \ + ((l4) << (3 * 8)) | \ + ((l3) << (4 * 8)) | \ + ((l2) << (5 * 8)) | \ + ((l1) << (6 * 8)) | \ + ((l0) << (7 * 8)) ) + +#define add_le128(out, in, lo_counter, hi_counter1) \ + vpaddq lo_counter, in, out; \ + vpcmpuq $1, lo_counter, out, %k1; \ + kaddb %k1, %k1, %k1; \ + vpaddq hi_counter1, out, out{%k1}; + +#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \ + vpandq x, mask4bit, tmp0; \ + vpandqn x, mask4bit, x; \ + vpsrld $4, x, x; \ + \ + vpshufb tmp0, lo_t, tmp0; \ + vpshufb x, hi_t, x; \ + vpxorq tmp0, x, x; + +#define transpose_4x4(x0, x1, x2, x3, t1, t2) \ + vpunpckhdq x1, x0, t2; \ + vpunpckldq x1, x0, x0; \ + \ + vpunpckldq x3, x2, t1; \ + vpunpckhdq x3, x2, x2; \ + \ + vpunpckhqdq t1, x0, x1; \ + vpunpcklqdq t1, x0, x0; \ + \ + vpunpckhqdq x2, t2, x3; \ + vpunpcklqdq x2, t2, x2; + +#define byteslice_16x16b(a0, b0, c0, d0, \ + a1, b1, c1, d1, \ + a2, b2, c2, d2, \ + a3, b3, c3, d3, \ + st0, st1) \ + vmovdqu64 d2, st0; \ + vmovdqu64 d3, st1; \ + transpose_4x4(a0, a1, a2, a3, d2, d3); \ + transpose_4x4(b0, b1, b2, b3, d2, d3); \ + vmovdqu64 st0, d2; \ + vmovdqu64 st1, d3; \ + \ + vmovdqu64 a0, st0; \ + vmovdqu64 a1, st1; \ + transpose_4x4(c0, c1, c2, c3, a0, a1); \ + transpose_4x4(d0, d1, d2, d3, a0, a1); \ + \ + vbroadcasti64x2 .Lshufb_16x16b, a0; \ + vmovdqu64 st1, a1; \ + vpshufb a0, a2, a2; \ + vpshufb a0, a3, a3; \ + vpshufb a0, b0, b0; \ + vpshufb a0, b1, b1; \ + vpshufb a0, b2, b2; \ + vpshufb a0, b3, b3; \ + vpshufb a0, a1, a1; \ + vpshufb a0, c0, c0; \ + vpshufb a0, c1, c1; \ + vpshufb a0, c2, c2; \ + vpshufb a0, c3, c3; \ + vpshufb a0, d0, d0; \ + vpshufb a0, d1, d1; \ + vpshufb a0, d2, d2; \ + vpshufb a0, d3, d3; \ + vmovdqu64 d3, st1; \ + vmovdqu64 st0, d3; \ + vpshufb a0, d3, a0; \ + vmovdqu64 d2, st0; \ + \ + transpose_4x4(a0, b0, c0, d0, d2, d3); \ + transpose_4x4(a1, b1, c1, d1, d2, d3); \ + vmovdqu64 st0, d2; \ + vmovdqu64 st1, d3; \ + \ + vmovdqu64 b0, st0; \ + vmovdqu64 b1, st1; \ + transpose_4x4(a2, b2, c2, d2, b0, b1); \ + transpose_4x4(a3, b3, c3, d3, b0, b1); \ + vmovdqu64 st0, b0; \ + vmovdqu64 st1, b1; \ + /* does not adjust output bytes inside vectors */ + +#define debyteslice_16x16b(a0, b0, c0, d0, \ + a1, b1, c1, d1, \ + a2, b2, c2, d2, \ + a3, b3, c3, d3, \ + st0, st1) \ + vmovdqu64 d2, st0; \ + vmovdqu64 d3, st1; \ + transpose_4x4(a0, a1, a2, a3, d2, d3); \ + transpose_4x4(b0, b1, b2, b3, d2, d3); \ + vmovdqu64 st0, d2; \ + vmovdqu64 st1, d3; \ + \ + vmovdqu64 a0, st0; \ + vmovdqu64 a1, st1; \ + transpose_4x4(c0, c1, c2, c3, a0, a1); \ + transpose_4x4(d0, d1, d2, d3, a0, a1); \ + \ + vbroadcasti64x2 .Lshufb_16x16b, a0; \ + vmovdqu64 st1, a1; \ + vpshufb a0, a2, a2; \ + vpshufb a0, a3, a3; \ + vpshufb a0, b0, b0; \ + vpshufb a0, b1, b1; \ + vpshufb a0, b2, b2; \ + vpshufb a0, b3, b3; \ + vpshufb a0, a1, a1; \ + vpshufb a0, c0, c0; \ + vpshufb a0, c1, c1; \ + vpshufb a0, c2, c2; \ + vpshufb a0, c3, c3; \ + vpshufb a0, d0, d0; \ + vpshufb a0, d1, d1; \ + vpshufb a0, d2, d2; \ + vpshufb a0, d3, d3; \ + vmovdqu64 d3, st1; \ + vmovdqu64 st0, d3; \ + vpshufb a0, d3, a0; \ + vmovdqu64 d2, st0; \ + \ + transpose_4x4(c0, d0, a0, b0, d2, d3); \ + transpose_4x4(c1, d1, a1, b1, d2, d3); \ + vmovdqu64 st0, d2; \ + vmovdqu64 st1, d3; \ + \ + vmovdqu64 b0, st0; \ + vmovdqu64 b1, st1; \ + transpose_4x4(c2, d2, a2, b2, b0, b1); \ + transpose_4x4(c3, d3, a3, b3, b0, b1); \ + vmovdqu64 st0, b0; \ + vmovdqu64 st1, b1; \ + /* does not adjust output bytes inside vectors */ + +/* load blocks to registers and apply pre-whitening */ +#define inpack16_pre(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + rio) \ + vmovdqu64 (0 * 64)(rio), x0; \ + vmovdqu64 (1 * 64)(rio), x1; \ + vmovdqu64 (2 * 64)(rio), x2; \ + vmovdqu64 (3 * 64)(rio), x3; \ + vmovdqu64 (4 * 64)(rio), x4; \ + vmovdqu64 (5 * 64)(rio), x5; \ + vmovdqu64 (6 * 64)(rio), x6; \ + vmovdqu64 (7 * 64)(rio), x7; \ + vmovdqu64 (8 * 64)(rio), y0; \ + vmovdqu64 (9 * 64)(rio), y1; \ + vmovdqu64 (10 * 64)(rio), y2; \ + vmovdqu64 (11 * 64)(rio), y3; \ + vmovdqu64 (12 * 64)(rio), y4; \ + vmovdqu64 (13 * 64)(rio), y5; \ + vmovdqu64 (14 * 64)(rio), y6; \ + vmovdqu64 (15 * 64)(rio), y7; + +/* byteslice pre-whitened blocks and store to temporary memory */ +#define inpack16_post(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + mem_ab, mem_cd) \ + byteslice_16x16b(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + (mem_ab), (mem_cd)); \ + \ + vmovdqu64 x0, 0 * 64(mem_ab); \ + vmovdqu64 x1, 1 * 64(mem_ab); \ + vmovdqu64 x2, 2 * 64(mem_ab); \ + vmovdqu64 x3, 3 * 64(mem_ab); \ + vmovdqu64 x4, 4 * 64(mem_ab); \ + vmovdqu64 x5, 5 * 64(mem_ab); \ + vmovdqu64 x6, 6 * 64(mem_ab); \ + vmovdqu64 x7, 7 * 64(mem_ab); \ + vmovdqu64 y0, 0 * 64(mem_cd); \ + vmovdqu64 y1, 1 * 64(mem_cd); \ + vmovdqu64 y2, 2 * 64(mem_cd); \ + vmovdqu64 y3, 3 * 64(mem_cd); \ + vmovdqu64 y4, 4 * 64(mem_cd); \ + vmovdqu64 y5, 5 * 64(mem_cd); \ + vmovdqu64 y6, 6 * 64(mem_cd); \ + vmovdqu64 y7, 7 * 64(mem_cd); + +#define write_output(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + mem) \ + vmovdqu64 x0, 0 * 64(mem); \ + vmovdqu64 x1, 1 * 64(mem); \ + vmovdqu64 x2, 2 * 64(mem); \ + vmovdqu64 x3, 3 * 64(mem); \ + vmovdqu64 x4, 4 * 64(mem); \ + vmovdqu64 x5, 5 * 64(mem); \ + vmovdqu64 x6, 6 * 64(mem); \ + vmovdqu64 x7, 7 * 64(mem); \ + vmovdqu64 y0, 8 * 64(mem); \ + vmovdqu64 y1, 9 * 64(mem); \ + vmovdqu64 y2, 10 * 64(mem); \ + vmovdqu64 y3, 11 * 64(mem); \ + vmovdqu64 y4, 12 * 64(mem); \ + vmovdqu64 y5, 13 * 64(mem); \ + vmovdqu64 y6, 14 * 64(mem); \ + vmovdqu64 y7, 15 * 64(mem); \ + +#define aria_store_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, idx) \ + vmovdqu64 x0, ((idx + 0) * 64)(mem_tmp); \ + vmovdqu64 x1, ((idx + 1) * 64)(mem_tmp); \ + vmovdqu64 x2, ((idx + 2) * 64)(mem_tmp); \ + vmovdqu64 x3, ((idx + 3) * 64)(mem_tmp); \ + vmovdqu64 x4, ((idx + 4) * 64)(mem_tmp); \ + vmovdqu64 x5, ((idx + 5) * 64)(mem_tmp); \ + vmovdqu64 x6, ((idx + 6) * 64)(mem_tmp); \ + vmovdqu64 x7, ((idx + 7) * 64)(mem_tmp); + +#define aria_load_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, idx) \ + vmovdqu64 ((idx + 0) * 64)(mem_tmp), x0; \ + vmovdqu64 ((idx + 1) * 64)(mem_tmp), x1; \ + vmovdqu64 ((idx + 2) * 64)(mem_tmp), x2; \ + vmovdqu64 ((idx + 3) * 64)(mem_tmp), x3; \ + vmovdqu64 ((idx + 4) * 64)(mem_tmp), x4; \ + vmovdqu64 ((idx + 5) * 64)(mem_tmp), x5; \ + vmovdqu64 ((idx + 6) * 64)(mem_tmp), x6; \ + vmovdqu64 ((idx + 7) * 64)(mem_tmp), x7; + +#define aria_ark_16way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + t0, rk, round) \ + /* AddRoundKey */ \ + vpbroadcastb ((round * 16) + 3)(rk), t0; \ + vpxorq t0, x0, x0; \ + vpbroadcastb ((round * 16) + 2)(rk), t0; \ + vpxorq t0, x1, x1; \ + vpbroadcastb ((round * 16) + 1)(rk), t0; \ + vpxorq t0, x2, x2; \ + vpbroadcastb ((round * 16) + 0)(rk), t0; \ + vpxorq t0, x3, x3; \ + vpbroadcastb ((round * 16) + 7)(rk), t0; \ + vpxorq t0, x4, x4; \ + vpbroadcastb ((round * 16) + 6)(rk), t0; \ + vpxorq t0, x5, x5; \ + vpbroadcastb ((round * 16) + 5)(rk), t0; \ + vpxorq t0, x6, x6; \ + vpbroadcastb ((round * 16) + 4)(rk), t0; \ + vpxorq t0, x7, x7; \ + vpbroadcastb ((round * 16) + 11)(rk), t0; \ + vpxorq t0, y0, y0; \ + vpbroadcastb ((round * 16) + 10)(rk), t0; \ + vpxorq t0, y1, y1; \ + vpbroadcastb ((round * 16) + 9)(rk), t0; \ + vpxorq t0, y2, y2; \ + vpbroadcastb ((round * 16) + 8)(rk), t0; \ + vpxorq t0, y3, y3; \ + vpbroadcastb ((round * 16) + 15)(rk), t0; \ + vpxorq t0, y4, y4; \ + vpbroadcastb ((round * 16) + 14)(rk), t0; \ + vpxorq t0, y5, y5; \ + vpbroadcastb ((round * 16) + 13)(rk), t0; \ + vpxorq t0, y6, y6; \ + vpbroadcastb ((round * 16) + 12)(rk), t0; \ + vpxorq t0, y7, y7; + +#define aria_sbox_8way_gfni(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + t0, t1, t2, t3, \ + t4, t5, t6, t7) \ + vpbroadcastq .Ltf_s2_bitmatrix, t0; \ + vpbroadcastq .Ltf_inv_bitmatrix, t1; \ + vpbroadcastq .Ltf_id_bitmatrix, t2; \ + vpbroadcastq .Ltf_aff_bitmatrix, t3; \ + vpbroadcastq .Ltf_x2_bitmatrix, t4; \ + vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1; \ + vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5; \ + vgf2p8affineqb $(tf_inv_const), t1, x2, x2; \ + vgf2p8affineqb $(tf_inv_const), t1, x6, x6; \ + vgf2p8affineinvqb $0, t2, x2, x2; \ + vgf2p8affineinvqb $0, t2, x6, x6; \ + vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0; \ + vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4; \ + vgf2p8affineqb $(tf_x2_const), t4, x3, x3; \ + vgf2p8affineqb $(tf_x2_const), t4, x7, x7; \ + vgf2p8affineinvqb $0, t2, x3, x3; \ + vgf2p8affineinvqb $0, t2, x7, x7; + +#define aria_sbox_16way_gfni(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + t0, t1, t2, t3, \ + t4, t5, t6, t7) \ + vpbroadcastq .Ltf_s2_bitmatrix, t0; \ + vpbroadcastq .Ltf_inv_bitmatrix, t1; \ + vpbroadcastq .Ltf_id_bitmatrix, t2; \ + vpbroadcastq .Ltf_aff_bitmatrix, t3; \ + vpbroadcastq .Ltf_x2_bitmatrix, t4; \ + vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1; \ + vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5; \ + vgf2p8affineqb $(tf_inv_const), t1, x2, x2; \ + vgf2p8affineqb $(tf_inv_const), t1, x6, x6; \ + vgf2p8affineinvqb $0, t2, x2, x2; \ + vgf2p8affineinvqb $0, t2, x6, x6; \ + vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0; \ + vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4; \ + vgf2p8affineqb $(tf_x2_const), t4, x3, x3; \ + vgf2p8affineqb $(tf_x2_const), t4, x7, x7; \ + vgf2p8affineinvqb $0, t2, x3, x3; \ + vgf2p8affineinvqb $0, t2, x7, x7; \ + vgf2p8affineinvqb $(tf_s2_const), t0, y1, y1; \ + vgf2p8affineinvqb $(tf_s2_const), t0, y5, y5; \ + vgf2p8affineqb $(tf_inv_const), t1, y2, y2; \ + vgf2p8affineqb $(tf_inv_const), t1, y6, y6; \ + vgf2p8affineinvqb $0, t2, y2, y2; \ + vgf2p8affineinvqb $0, t2, y6, y6; \ + vgf2p8affineinvqb $(tf_aff_const), t3, y0, y0; \ + vgf2p8affineinvqb $(tf_aff_const), t3, y4, y4; \ + vgf2p8affineqb $(tf_x2_const), t4, y3, y3; \ + vgf2p8affineqb $(tf_x2_const), t4, y7, y7; \ + vgf2p8affineinvqb $0, t2, y3, y3; \ + vgf2p8affineinvqb $0, t2, y7, y7; + + +#define aria_diff_m(x0, x1, x2, x3, \ + t0, t1, t2, t3) \ + /* T = rotr32(X, 8); */ \ + /* X ^= T */ \ + vpxorq x0, x3, t0; \ + vpxorq x1, x0, t1; \ + vpxorq x2, x1, t2; \ + vpxorq x3, x2, t3; \ + /* X = T ^ rotr(X, 16); */ \ + vpxorq t2, x0, x0; \ + vpxorq x1, t3, t3; \ + vpxorq t0, x2, x2; \ + vpxorq t1, x3, x1; \ + vmovdqu64 t3, x3; + +#define aria_diff_word(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7) \ + /* t1 ^= t2; */ \ + vpxorq y0, x4, x4; \ + vpxorq y1, x5, x5; \ + vpxorq y2, x6, x6; \ + vpxorq y3, x7, x7; \ + \ + /* t2 ^= t3; */ \ + vpxorq y4, y0, y0; \ + vpxorq y5, y1, y1; \ + vpxorq y6, y2, y2; \ + vpxorq y7, y3, y3; \ + \ + /* t0 ^= t1; */ \ + vpxorq x4, x0, x0; \ + vpxorq x5, x1, x1; \ + vpxorq x6, x2, x2; \ + vpxorq x7, x3, x3; \ + \ + /* t3 ^= t1; */ \ + vpxorq x4, y4, y4; \ + vpxorq x5, y5, y5; \ + vpxorq x6, y6, y6; \ + vpxorq x7, y7, y7; \ + \ + /* t2 ^= t0; */ \ + vpxorq x0, y0, y0; \ + vpxorq x1, y1, y1; \ + vpxorq x2, y2, y2; \ + vpxorq x3, y3, y3; \ + \ + /* t1 ^= t2; */ \ + vpxorq y0, x4, x4; \ + vpxorq y1, x5, x5; \ + vpxorq y2, x6, x6; \ + vpxorq y3, x7, x7; + +#define aria_fe_gfni(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + z0, z1, z2, z3, \ + z4, z5, z6, z7, \ + mem_tmp, rk, round) \ + aria_ark_16way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, y1, y2, y3, y4, y5, y6, y7, \ + z0, rk, round); \ + \ + aria_sbox_16way_gfni(x2, x3, x0, x1, \ + x6, x7, x4, x5, \ + y2, y3, y0, y1, \ + y6, y7, y4, y5, \ + z0, z1, z2, z3, \ + z4, z5, z6, z7); \ + \ + aria_diff_m(x0, x1, x2, x3, z0, z1, z2, z3); \ + aria_diff_m(x4, x5, x6, x7, z0, z1, z2, z3); \ + aria_diff_m(y0, y1, y2, y3, z0, z1, z2, z3); \ + aria_diff_m(y4, y5, y6, y7, z0, z1, z2, z3); \ + aria_diff_word(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7); \ + /* aria_diff_byte() \ + * T3 = ABCD -> BADC \ + * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 \ + * T0 = ABCD -> CDAB \ + * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 \ + * T1 = ABCD -> DCBA \ + * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4 \ + */ \ + aria_diff_word(x2, x3, x0, x1, \ + x7, x6, x5, x4, \ + y0, y1, y2, y3, \ + y5, y4, y7, y6); \ + + +#define aria_fo_gfni(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + z0, z1, z2, z3, \ + z4, z5, z6, z7, \ + mem_tmp, rk, round) \ + aria_ark_16way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, y1, y2, y3, y4, y5, y6, y7, \ + z0, rk, round); \ + \ + aria_sbox_16way_gfni(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + z0, z1, z2, z3, \ + z4, z5, z6, z7); \ + \ + aria_diff_m(x0, x1, x2, x3, z0, z1, z2, z3); \ + aria_diff_m(x4, x5, x6, x7, z0, z1, z2, z3); \ + aria_diff_m(y0, y1, y2, y3, z0, z1, z2, z3); \ + aria_diff_m(y4, y5, y6, y7, z0, z1, z2, z3); \ + aria_diff_word(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7); \ + /* aria_diff_byte() \ + * T1 = ABCD -> BADC \ + * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6 \ + * T2 = ABCD -> CDAB \ + * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 \ + * T3 = ABCD -> DCBA \ + * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 \ + */ \ + aria_diff_word(x0, x1, x2, x3, \ + x5, x4, x7, x6, \ + y2, y3, y0, y1, \ + y7, y6, y5, y4); + +#define aria_ff_gfni(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + z0, z1, z2, z3, \ + z4, z5, z6, z7, \ + mem_tmp, rk, round, last_round) \ + aria_ark_16way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + z0, rk, round); \ + aria_sbox_16way_gfni(x2, x3, x0, x1, \ + x6, x7, x4, x5, \ + y2, y3, y0, y1, \ + y6, y7, y4, y5, \ + z0, z1, z2, z3, \ + z4, z5, z6, z7); \ + aria_ark_16way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + z0, rk, last_round); + + +.section .rodata.cst64, "aM", @progbits, 64 +.align 64 +.Lcounter0123_lo: + .quad 0, 0 + .quad 1, 0 + .quad 2, 0 + .quad 3, 0 + +.section .rodata.cst32.shufb_16x16b, "aM", @progbits, 32 +.align 32 +#define SHUFB_BYTES(idx) \ + 0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx) +.Lshufb_16x16b: + .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3) + .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3) + +.section .rodata.cst16, "aM", @progbits, 16 +.align 16 + +.Lcounter4444_lo: + .quad 4, 0 +.Lcounter8888_lo: + .quad 8, 0 +.Lcounter16161616_lo: + .quad 16, 0 +.Lcounter1111_hi: + .quad 0, 1 + +/* For CTR-mode IV byteswap */ +.Lbswap128_mask: + .byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08 + .byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00 + +.section .rodata.cst8, "aM", @progbits, 8 +.align 8 +/* AES affine: */ +#define tf_aff_const BV8(1, 1, 0, 0, 0, 1, 1, 0) +.Ltf_aff_bitmatrix: + .quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1), + BV8(1, 1, 0, 0, 0, 1, 1, 1), + BV8(1, 1, 1, 0, 0, 0, 1, 1), + BV8(1, 1, 1, 1, 0, 0, 0, 1), + BV8(1, 1, 1, 1, 1, 0, 0, 0), + BV8(0, 1, 1, 1, 1, 1, 0, 0), + BV8(0, 0, 1, 1, 1, 1, 1, 0), + BV8(0, 0, 0, 1, 1, 1, 1, 1)) + +/* AES inverse affine: */ +#define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0) +.Ltf_inv_bitmatrix: + .quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1), + BV8(1, 0, 0, 1, 0, 0, 1, 0), + BV8(0, 1, 0, 0, 1, 0, 0, 1), + BV8(1, 0, 1, 0, 0, 1, 0, 0), + BV8(0, 1, 0, 1, 0, 0, 1, 0), + BV8(0, 0, 1, 0, 1, 0, 0, 1), + BV8(1, 0, 0, 1, 0, 1, 0, 0), + BV8(0, 1, 0, 0, 1, 0, 1, 0)) + +/* S2: */ +#define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1) +.Ltf_s2_bitmatrix: + .quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1), + BV8(0, 0, 1, 1, 1, 1, 1, 1), + BV8(1, 1, 1, 0, 1, 1, 0, 1), + BV8(1, 1, 0, 0, 0, 0, 1, 1), + BV8(0, 1, 0, 0, 0, 0, 1, 1), + BV8(1, 1, 0, 0, 1, 1, 1, 0), + BV8(0, 1, 1, 0, 0, 0, 1, 1), + BV8(1, 1, 1, 1, 0, 1, 1, 0)) + +/* X2: */ +#define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0) +.Ltf_x2_bitmatrix: + .quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0), + BV8(0, 0, 1, 0, 0, 1, 1, 0), + BV8(0, 0, 0, 0, 1, 0, 1, 0), + BV8(1, 1, 1, 0, 0, 0, 1, 1), + BV8(1, 1, 1, 0, 1, 1, 0, 0), + BV8(0, 1, 1, 0, 1, 0, 1, 1), + BV8(1, 0, 1, 1, 1, 1, 0, 1), + BV8(1, 0, 0, 1, 0, 0, 1, 1)) + +/* Identity matrix: */ +.Ltf_id_bitmatrix: + .quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0), + BV8(0, 1, 0, 0, 0, 0, 0, 0), + BV8(0, 0, 1, 0, 0, 0, 0, 0), + BV8(0, 0, 0, 1, 0, 0, 0, 0), + BV8(0, 0, 0, 0, 1, 0, 0, 0), + BV8(0, 0, 0, 0, 0, 1, 0, 0), + BV8(0, 0, 0, 0, 0, 0, 1, 0), + BV8(0, 0, 0, 0, 0, 0, 0, 1)) + +.text +SYM_FUNC_START_LOCAL(__aria_gfni_avx512_crypt_64way) + /* input: + * %r9: rk + * %rsi: dst + * %rdx: src + * %zmm0..%zmm15: byte-sliced blocks + */ + + FRAME_BEGIN + + movq %rsi, %rax; + leaq 8 * 64(%rax), %r8; + + inpack16_post(%zmm0, %zmm1, %zmm2, %zmm3, + %zmm4, %zmm5, %zmm6, %zmm7, + %zmm8, %zmm9, %zmm10, %zmm11, + %zmm12, %zmm13, %zmm14, + %zmm15, %rax, %r8); + aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3, + %zmm4, %zmm5, %zmm6, %zmm7, + %zmm8, %zmm9, %zmm10, %zmm11, + %zmm12, %zmm13, %zmm14, %zmm15, + %zmm24, %zmm25, %zmm26, %zmm27, + %zmm28, %zmm29, %zmm30, %zmm31, + %rax, %r9, 0); + aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0, + %zmm6, %zmm7, %zmm4, %zmm5, + %zmm9, %zmm8, %zmm11, %zmm10, + %zmm12, %zmm13, %zmm14, %zmm15, + %zmm24, %zmm25, %zmm26, %zmm27, + %zmm28, %zmm29, %zmm30, %zmm31, + %rax, %r9, 1); + aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3, + %zmm4, %zmm5, %zmm6, %zmm7, + %zmm8, %zmm9, %zmm10, %zmm11, + %zmm12, %zmm13, %zmm14, %zmm15, + %zmm24, %zmm25, %zmm26, %zmm27, + %zmm28, %zmm29, %zmm30, %zmm31, + %rax, %r9, 2); + aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0, + %zmm6, %zmm7, %zmm4, %zmm5, + %zmm9, %zmm8, %zmm11, %zmm10, + %zmm12, %zmm13, %zmm14, %zmm15, + %zmm24, %zmm25, %zmm26, %zmm27, + %zmm28, %zmm29, %zmm30, %zmm31, + %rax, %r9, 3); + aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3, + %zmm4, %zmm5, %zmm6, %zmm7, + %zmm8, %zmm9, %zmm10, %zmm11, + %zmm12, %zmm13, %zmm14, %zmm15, + %zmm24, %zmm25, %zmm26, %zmm27, + %zmm28, %zmm29, %zmm30, %zmm31, + %rax, %r9, 4); + aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0, + %zmm6, %zmm7, %zmm4, %zmm5, + %zmm9, %zmm8, %zmm11, %zmm10, + %zmm12, %zmm13, %zmm14, %zmm15, + %zmm24, %zmm25, %zmm26, %zmm27, + %zmm28, %zmm29, %zmm30, %zmm31, + %rax, %r9, 5); + aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3, + %zmm4, %zmm5, %zmm6, %zmm7, + %zmm8, %zmm9, %zmm10, %zmm11, + %zmm12, %zmm13, %zmm14, %zmm15, + %zmm24, %zmm25, %zmm26, %zmm27, + %zmm28, %zmm29, %zmm30, %zmm31, + %rax, %r9, 6); + aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0, + %zmm6, %zmm7, %zmm4, %zmm5, + %zmm9, %zmm8, %zmm11, %zmm10, + %zmm12, %zmm13, %zmm14, %zmm15, + %zmm24, %zmm25, %zmm26, %zmm27, + %zmm28, %zmm29, %zmm30, %zmm31, + %rax, %r9, 7); + aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3, + %zmm4, %zmm5, %zmm6, %zmm7, + %zmm8, %zmm9, %zmm10, %zmm11, + %zmm12, %zmm13, %zmm14, %zmm15, + %zmm24, %zmm25, %zmm26, %zmm27, + %zmm28, %zmm29, %zmm30, %zmm31, + %rax, %r9, 8); + aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0, + %zmm6, %zmm7, %zmm4, %zmm5, + %zmm9, %zmm8, %zmm11, %zmm10, + %zmm12, %zmm13, %zmm14, %zmm15, + %zmm24, %zmm25, %zmm26, %zmm27, + %zmm28, %zmm29, %zmm30, %zmm31, + %rax, %r9, 9); + aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3, + %zmm4, %zmm5, %zmm6, %zmm7, + %zmm8, %zmm9, %zmm10, %zmm11, + %zmm12, %zmm13, %zmm14, %zmm15, + %zmm24, %zmm25, %zmm26, %zmm27, + %zmm28, %zmm29, %zmm30, %zmm31, + %rax, %r9, 10); + cmpl $12, ARIA_CTX_rounds(CTX); + jne .Laria_gfni_192; + aria_ff_gfni(%zmm3, %zmm2, %zmm1, %zmm0, + %zmm6, %zmm7, %zmm4, %zmm5, + %zmm9, %zmm8, %zmm11, %zmm10, + %zmm12, %zmm13, %zmm14, %zmm15, + %zmm24, %zmm25, %zmm26, %zmm27, + %zmm28, %zmm29, %zmm30, %zmm31, + %rax, %r9, 11, 12); + jmp .Laria_gfni_end; +.Laria_gfni_192: + aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0, + %zmm6, %zmm7, %zmm4, %zmm5, + %zmm9, %zmm8, %zmm11, %zmm10, + %zmm12, %zmm13, %zmm14, %zmm15, + %zmm24, %zmm25, %zmm26, %zmm27, + %zmm28, %zmm29, %zmm30, %zmm31, + %rax, %r9, 11); + aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3, + %zmm4, %zmm5, %zmm6, %zmm7, + %zmm8, %zmm9, %zmm10, %zmm11, + %zmm12, %zmm13, %zmm14, %zmm15, + %zmm24, %zmm25, %zmm26, %zmm27, + %zmm28, %zmm29, %zmm30, %zmm31, + %rax, %r9, 12); + cmpl $14, ARIA_CTX_rounds(CTX); + jne .Laria_gfni_256; + aria_ff_gfni(%zmm3, %zmm2, %zmm1, %zmm0, + %zmm6, %zmm7, %zmm4, %zmm5, + %zmm9, %zmm8, %zmm11, %zmm10, + %zmm12, %zmm13, %zmm14, %zmm15, + %zmm24, %zmm25, %zmm26, %zmm27, + %zmm28, %zmm29, %zmm30, %zmm31, + %rax, %r9, 13, 14); + jmp .Laria_gfni_end; +.Laria_gfni_256: + aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0, + %zmm6, %zmm7, %zmm4, %zmm5, + %zmm9, %zmm8, %zmm11, %zmm10, + %zmm12, %zmm13, %zmm14, %zmm15, + %zmm24, %zmm25, %zmm26, %zmm27, + %zmm28, %zmm29, %zmm30, %zmm31, + %rax, %r9, 13); + aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3, + %zmm4, %zmm5, %zmm6, %zmm7, + %zmm8, %zmm9, %zmm10, %zmm11, + %zmm12, %zmm13, %zmm14, %zmm15, + %zmm24, %zmm25, %zmm26, %zmm27, + %zmm28, %zmm29, %zmm30, %zmm31, + %rax, %r9, 14); + aria_ff_gfni(%zmm3, %zmm2, %zmm1, %zmm0, + %zmm6, %zmm7, %zmm4, %zmm5, + %zmm9, %zmm8, %zmm11, %zmm10, + %zmm12, %zmm13, %zmm14, %zmm15, + %zmm24, %zmm25, %zmm26, %zmm27, + %zmm28, %zmm29, %zmm30, %zmm31, + %rax, %r9, 15, 16); +.Laria_gfni_end: + debyteslice_16x16b(%zmm9, %zmm12, %zmm3, %zmm6, + %zmm8, %zmm13, %zmm2, %zmm7, + %zmm11, %zmm14, %zmm1, %zmm4, + %zmm10, %zmm15, %zmm0, %zmm5, + (%rax), (%r8)); + FRAME_END + RET; +SYM_FUNC_END(__aria_gfni_avx512_crypt_64way) + +SYM_TYPED_FUNC_START(aria_gfni_avx512_encrypt_64way) + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + */ + + FRAME_BEGIN + + leaq ARIA_CTX_enc_key(CTX), %r9; + + inpack16_pre(%zmm0, %zmm1, %zmm2, %zmm3, %zmm4, %zmm5, %zmm6, %zmm7, + %zmm8, %zmm9, %zmm10, %zmm11, %zmm12, %zmm13, %zmm14, + %zmm15, %rdx); + + call __aria_gfni_avx512_crypt_64way; + + write_output(%zmm3, %zmm2, %zmm1, %zmm0, %zmm6, %zmm7, %zmm4, %zmm5, + %zmm9, %zmm8, %zmm11, %zmm10, %zmm12, %zmm13, %zmm14, + %zmm15, %rax); + + FRAME_END + RET; +SYM_FUNC_END(aria_gfni_avx512_encrypt_64way) + +SYM_TYPED_FUNC_START(aria_gfni_avx512_decrypt_64way) + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + */ + + FRAME_BEGIN + + leaq ARIA_CTX_dec_key(CTX), %r9; + + inpack16_pre(%zmm0, %zmm1, %zmm2, %zmm3, %zmm4, %zmm5, %zmm6, %zmm7, + %zmm8, %zmm9, %zmm10, %zmm11, %zmm12, %zmm13, %zmm14, + %zmm15, %rdx); + + call __aria_gfni_avx512_crypt_64way; + + write_output(%zmm3, %zmm2, %zmm1, %zmm0, %zmm6, %zmm7, %zmm4, %zmm5, + %zmm9, %zmm8, %zmm11, %zmm10, %zmm12, %zmm13, %zmm14, + %zmm15, %rax); + + FRAME_END + RET; +SYM_FUNC_END(aria_gfni_avx512_decrypt_64way) + +SYM_FUNC_START_LOCAL(__aria_gfni_avx512_ctr_gen_keystream_64way) + /* input: + * %rdi: ctx + * %rsi: dst + * %rdx: src + * %rcx: keystream + * %r8: iv (big endian, 128bit) + */ + + FRAME_BEGIN + + vbroadcasti64x2 .Lbswap128_mask (%rip), %zmm19; + vmovdqa64 .Lcounter0123_lo (%rip), %zmm21; + vbroadcasti64x2 .Lcounter4444_lo (%rip), %zmm22; + vbroadcasti64x2 .Lcounter8888_lo (%rip), %zmm23; + vbroadcasti64x2 .Lcounter16161616_lo (%rip), %zmm24; + vbroadcasti64x2 .Lcounter1111_hi (%rip), %zmm25; + + /* load IV and byteswap */ + movq 8(%r8), %r11; + movq (%r8), %r10; + bswapq %r11; + bswapq %r10; + vbroadcasti64x2 (%r8), %zmm20; + vpshufb %zmm19, %zmm20, %zmm20; + + /* check need for handling 64-bit overflow and carry */ + cmpq $(0xffffffffffffffff - 64), %r11; + ja .Lload_ctr_carry; + + /* construct IVs */ + vpaddq %zmm21, %zmm20, %zmm0; /* +0:+1:+2:+3 */ + vpaddq %zmm22, %zmm0, %zmm1; /* +4:+5:+6:+7 */ + vpaddq %zmm23, %zmm0, %zmm2; /* +8:+9:+10:+11 */ + vpaddq %zmm23, %zmm1, %zmm3; /* +12:+13:+14:+15 */ + vpaddq %zmm24, %zmm0, %zmm4; /* +16... */ + vpaddq %zmm24, %zmm1, %zmm5; /* +20... */ + vpaddq %zmm24, %zmm2, %zmm6; /* +24... */ + vpaddq %zmm24, %zmm3, %zmm7; /* +28... */ + vpaddq %zmm24, %zmm4, %zmm8; /* +32... */ + vpaddq %zmm24, %zmm5, %zmm9; /* +36... */ + vpaddq %zmm24, %zmm6, %zmm10; /* +40... */ + vpaddq %zmm24, %zmm7, %zmm11; /* +44... */ + vpaddq %zmm24, %zmm8, %zmm12; /* +48... */ + vpaddq %zmm24, %zmm9, %zmm13; /* +52... */ + vpaddq %zmm24, %zmm10, %zmm14; /* +56... */ + vpaddq %zmm24, %zmm11, %zmm15; /* +60... */ + jmp .Lload_ctr_done; + +.Lload_ctr_carry: + /* construct IVs */ + add_le128(%zmm0, %zmm20, %zmm21, %zmm25); /* +0:+1:+2:+3 */ + add_le128(%zmm1, %zmm0, %zmm22, %zmm25); /* +4:+5:+6:+7 */ + add_le128(%zmm2, %zmm0, %zmm23, %zmm25); /* +8:+9:+10:+11 */ + add_le128(%zmm3, %zmm1, %zmm23, %zmm25); /* +12:+13:+14:+15 */ + add_le128(%zmm4, %zmm0, %zmm24, %zmm25); /* +16... */ + add_le128(%zmm5, %zmm1, %zmm24, %zmm25); /* +20... */ + add_le128(%zmm6, %zmm2, %zmm24, %zmm25); /* +24... */ + add_le128(%zmm7, %zmm3, %zmm24, %zmm25); /* +28... */ + add_le128(%zmm8, %zmm4, %zmm24, %zmm25); /* +32... */ + add_le128(%zmm9, %zmm5, %zmm24, %zmm25); /* +36... */ + add_le128(%zmm10, %zmm6, %zmm24, %zmm25); /* +40... */ + add_le128(%zmm11, %zmm7, %zmm24, %zmm25); /* +44... */ + add_le128(%zmm12, %zmm8, %zmm24, %zmm25); /* +48... */ + add_le128(%zmm13, %zmm9, %zmm24, %zmm25); /* +52... */ + add_le128(%zmm14, %zmm10, %zmm24, %zmm25); /* +56... */ + add_le128(%zmm15, %zmm11, %zmm24, %zmm25); /* +60... */ + +.Lload_ctr_done: + /* Byte-swap IVs and update counter. */ + addq $64, %r11; + adcq $0, %r10; + vpshufb %zmm19, %zmm15, %zmm15; + vpshufb %zmm19, %zmm14, %zmm14; + vpshufb %zmm19, %zmm13, %zmm13; + vpshufb %zmm19, %zmm12, %zmm12; + vpshufb %zmm19, %zmm11, %zmm11; + vpshufb %zmm19, %zmm10, %zmm10; + vpshufb %zmm19, %zmm9, %zmm9; + vpshufb %zmm19, %zmm8, %zmm8; + bswapq %r11; + bswapq %r10; + vpshufb %zmm19, %zmm7, %zmm7; + vpshufb %zmm19, %zmm6, %zmm6; + vpshufb %zmm19, %zmm5, %zmm5; + vpshufb %zmm19, %zmm4, %zmm4; + vpshufb %zmm19, %zmm3, %zmm3; + vpshufb %zmm19, %zmm2, %zmm2; + vpshufb %zmm19, %zmm1, %zmm1; + vpshufb %zmm19, %zmm0, %zmm0; + movq %r11, 8(%r8); + movq %r10, (%r8); + + FRAME_END + RET; +SYM_FUNC_END(__aria_gfni_avx512_ctr_gen_keystream_64way) + +SYM_TYPED_FUNC_START(aria_gfni_avx512_ctr_crypt_64way) + /* input: + * %rdi: ctx + * %rsi: dst + * %rdx: src + * %rcx: keystream + * %r8: iv (big endian, 128bit) + */ + FRAME_BEGIN + + call __aria_gfni_avx512_ctr_gen_keystream_64way + + leaq (%rsi), %r10; + leaq (%rdx), %r11; + leaq (%rcx), %rsi; + leaq (%rcx), %rdx; + leaq ARIA_CTX_enc_key(CTX), %r9; + + call __aria_gfni_avx512_crypt_64way; + + vpxorq (0 * 64)(%r11), %zmm3, %zmm3; + vpxorq (1 * 64)(%r11), %zmm2, %zmm2; + vpxorq (2 * 64)(%r11), %zmm1, %zmm1; + vpxorq (3 * 64)(%r11), %zmm0, %zmm0; + vpxorq (4 * 64)(%r11), %zmm6, %zmm6; + vpxorq (5 * 64)(%r11), %zmm7, %zmm7; + vpxorq (6 * 64)(%r11), %zmm4, %zmm4; + vpxorq (7 * 64)(%r11), %zmm5, %zmm5; + vpxorq (8 * 64)(%r11), %zmm9, %zmm9; + vpxorq (9 * 64)(%r11), %zmm8, %zmm8; + vpxorq (10 * 64)(%r11), %zmm11, %zmm11; + vpxorq (11 * 64)(%r11), %zmm10, %zmm10; + vpxorq (12 * 64)(%r11), %zmm12, %zmm12; + vpxorq (13 * 64)(%r11), %zmm13, %zmm13; + vpxorq (14 * 64)(%r11), %zmm14, %zmm14; + vpxorq (15 * 64)(%r11), %zmm15, %zmm15; + write_output(%zmm3, %zmm2, %zmm1, %zmm0, %zmm6, %zmm7, %zmm4, %zmm5, + %zmm9, %zmm8, %zmm11, %zmm10, %zmm12, %zmm13, %zmm14, + %zmm15, %r10); + + FRAME_END + RET; +SYM_FUNC_END(aria_gfni_avx512_ctr_crypt_64way) diff --git a/arch/x86/crypto/aria_gfni_avx512_glue.c b/arch/x86/crypto/aria_gfni_avx512_glue.c new file mode 100644 index 000000000000..f4a2208d2638 --- /dev/null +++ b/arch/x86/crypto/aria_gfni_avx512_glue.c @@ -0,0 +1,250 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Glue Code for the AVX512/GFNI assembler implementation of the ARIA Cipher + * + * Copyright (c) 2022 Taehee Yoo + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "ecb_cbc_helpers.h" +#include "aria-avx.h" + +asmlinkage void aria_gfni_avx512_encrypt_64way(const void *ctx, u8 *dst, + const u8 *src); +asmlinkage void aria_gfni_avx512_decrypt_64way(const void *ctx, u8 *dst, + const u8 *src); +asmlinkage void aria_gfni_avx512_ctr_crypt_64way(const void *ctx, u8 *dst, + const u8 *src, + u8 *keystream, u8 *iv); + +static struct aria_avx_ops aria_ops; + +struct aria_avx512_request_ctx { + u8 keystream[ARIA_GFNI_AVX512_PARALLEL_BLOCK_SIZE]; +}; + +static int ecb_do_encrypt(struct skcipher_request *req, const u32 *rkey) +{ + ECB_WALK_START(req, ARIA_BLOCK_SIZE, ARIA_AESNI_PARALLEL_BLOCKS); + ECB_BLOCK(ARIA_GFNI_AVX512_PARALLEL_BLOCKS, aria_ops.aria_encrypt_64way); + ECB_BLOCK(ARIA_AESNI_AVX2_PARALLEL_BLOCKS, aria_ops.aria_encrypt_32way); + ECB_BLOCK(ARIA_AESNI_PARALLEL_BLOCKS, aria_ops.aria_encrypt_16way); + ECB_BLOCK(1, aria_encrypt); + ECB_WALK_END(); +} + +static int ecb_do_decrypt(struct skcipher_request *req, const u32 *rkey) +{ + ECB_WALK_START(req, ARIA_BLOCK_SIZE, ARIA_AESNI_PARALLEL_BLOCKS); + ECB_BLOCK(ARIA_GFNI_AVX512_PARALLEL_BLOCKS, aria_ops.aria_decrypt_64way); + ECB_BLOCK(ARIA_AESNI_AVX2_PARALLEL_BLOCKS, aria_ops.aria_decrypt_32way); + ECB_BLOCK(ARIA_AESNI_PARALLEL_BLOCKS, aria_ops.aria_decrypt_16way); + ECB_BLOCK(1, aria_decrypt); + ECB_WALK_END(); +} + +static int aria_avx512_ecb_encrypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct aria_ctx *ctx = crypto_skcipher_ctx(tfm); + + return ecb_do_encrypt(req, ctx->enc_key[0]); +} + +static int aria_avx512_ecb_decrypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct aria_ctx *ctx = crypto_skcipher_ctx(tfm); + + return ecb_do_decrypt(req, ctx->dec_key[0]); +} + +static int aria_avx512_set_key(struct crypto_skcipher *tfm, const u8 *key, + unsigned int keylen) +{ + return aria_set_key(&tfm->base, key, keylen); +} + +static int aria_avx512_ctr_encrypt(struct skcipher_request *req) +{ + struct aria_avx512_request_ctx *req_ctx = skcipher_request_ctx(req); + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct aria_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; + unsigned int nbytes; + int err; + + err = skcipher_walk_virt(&walk, req, false); + + while ((nbytes = walk.nbytes) > 0) { + const u8 *src = walk.src.virt.addr; + u8 *dst = walk.dst.virt.addr; + + while (nbytes >= ARIA_GFNI_AVX512_PARALLEL_BLOCK_SIZE) { + kernel_fpu_begin(); + aria_ops.aria_ctr_crypt_64way(ctx, dst, src, + &req_ctx->keystream[0], + walk.iv); + kernel_fpu_end(); + dst += ARIA_GFNI_AVX512_PARALLEL_BLOCK_SIZE; + src += ARIA_GFNI_AVX512_PARALLEL_BLOCK_SIZE; + nbytes -= ARIA_GFNI_AVX512_PARALLEL_BLOCK_SIZE; + } + + while (nbytes >= ARIA_AESNI_AVX2_PARALLEL_BLOCK_SIZE) { + kernel_fpu_begin(); + aria_ops.aria_ctr_crypt_32way(ctx, dst, src, + &req_ctx->keystream[0], + walk.iv); + kernel_fpu_end(); + dst += ARIA_AESNI_AVX2_PARALLEL_BLOCK_SIZE; + src += ARIA_AESNI_AVX2_PARALLEL_BLOCK_SIZE; + nbytes -= ARIA_AESNI_AVX2_PARALLEL_BLOCK_SIZE; + } + + while (nbytes >= ARIA_AESNI_PARALLEL_BLOCK_SIZE) { + kernel_fpu_begin(); + aria_ops.aria_ctr_crypt_16way(ctx, dst, src, + &req_ctx->keystream[0], + walk.iv); + kernel_fpu_end(); + dst += ARIA_AESNI_PARALLEL_BLOCK_SIZE; + src += ARIA_AESNI_PARALLEL_BLOCK_SIZE; + nbytes -= ARIA_AESNI_PARALLEL_BLOCK_SIZE; + } + + while (nbytes >= ARIA_BLOCK_SIZE) { + memcpy(&req_ctx->keystream[0], walk.iv, + ARIA_BLOCK_SIZE); + crypto_inc(walk.iv, ARIA_BLOCK_SIZE); + + aria_encrypt(ctx, &req_ctx->keystream[0], + &req_ctx->keystream[0]); + + crypto_xor_cpy(dst, src, &req_ctx->keystream[0], + ARIA_BLOCK_SIZE); + dst += ARIA_BLOCK_SIZE; + src += ARIA_BLOCK_SIZE; + nbytes -= ARIA_BLOCK_SIZE; + } + + if (walk.nbytes == walk.total && nbytes > 0) { + memcpy(&req_ctx->keystream[0], walk.iv, + ARIA_BLOCK_SIZE); + crypto_inc(walk.iv, ARIA_BLOCK_SIZE); + + aria_encrypt(ctx, &req_ctx->keystream[0], + &req_ctx->keystream[0]); + + crypto_xor_cpy(dst, src, &req_ctx->keystream[0], + nbytes); + dst += nbytes; + src += nbytes; + nbytes = 0; + } + err = skcipher_walk_done(&walk, nbytes); + } + + return err; +} + +static int aria_avx512_init_tfm(struct crypto_skcipher *tfm) +{ + crypto_skcipher_set_reqsize(tfm, + sizeof(struct aria_avx512_request_ctx)); + + return 0; +} + +static struct skcipher_alg aria_algs[] = { + { + .base.cra_name = "__ecb(aria)", + .base.cra_driver_name = "__ecb-aria-avx512", + .base.cra_priority = 600, + .base.cra_flags = CRYPTO_ALG_INTERNAL, + .base.cra_blocksize = ARIA_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct aria_ctx), + .base.cra_module = THIS_MODULE, + .min_keysize = ARIA_MIN_KEY_SIZE, + .max_keysize = ARIA_MAX_KEY_SIZE, + .setkey = aria_avx512_set_key, + .encrypt = aria_avx512_ecb_encrypt, + .decrypt = aria_avx512_ecb_decrypt, + }, { + .base.cra_name = "__ctr(aria)", + .base.cra_driver_name = "__ctr-aria-avx512", + .base.cra_priority = 600, + .base.cra_flags = CRYPTO_ALG_INTERNAL | + CRYPTO_ALG_SKCIPHER_REQSIZE_LARGE, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct aria_ctx), + .base.cra_module = THIS_MODULE, + .min_keysize = ARIA_MIN_KEY_SIZE, + .max_keysize = ARIA_MAX_KEY_SIZE, + .ivsize = ARIA_BLOCK_SIZE, + .chunksize = ARIA_BLOCK_SIZE, + .setkey = aria_avx512_set_key, + .encrypt = aria_avx512_ctr_encrypt, + .decrypt = aria_avx512_ctr_encrypt, + .init = aria_avx512_init_tfm, + } +}; + +static struct simd_skcipher_alg *aria_simd_algs[ARRAY_SIZE(aria_algs)]; + +static int __init aria_avx512_init(void) +{ + const char *feature_name; + + if (!boot_cpu_has(X86_FEATURE_AVX) || + !boot_cpu_has(X86_FEATURE_AVX2) || + !boot_cpu_has(X86_FEATURE_AVX512F) || + !boot_cpu_has(X86_FEATURE_AVX512VL) || + !boot_cpu_has(X86_FEATURE_GFNI) || + !boot_cpu_has(X86_FEATURE_OSXSAVE)) { + pr_info("AVX512/GFNI instructions are not detected.\n"); + return -ENODEV; + } + + if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | + XFEATURE_MASK_AVX512, &feature_name)) { + pr_info("CPU feature '%s' is not supported.\n", feature_name); + return -ENODEV; + } + + aria_ops.aria_encrypt_16way = aria_aesni_avx_gfni_encrypt_16way; + aria_ops.aria_decrypt_16way = aria_aesni_avx_gfni_decrypt_16way; + aria_ops.aria_ctr_crypt_16way = aria_aesni_avx_gfni_ctr_crypt_16way; + aria_ops.aria_encrypt_32way = aria_aesni_avx2_gfni_encrypt_32way; + aria_ops.aria_decrypt_32way = aria_aesni_avx2_gfni_decrypt_32way; + aria_ops.aria_ctr_crypt_32way = aria_aesni_avx2_gfni_ctr_crypt_32way; + aria_ops.aria_encrypt_64way = aria_gfni_avx512_encrypt_64way; + aria_ops.aria_decrypt_64way = aria_gfni_avx512_decrypt_64way; + aria_ops.aria_ctr_crypt_64way = aria_gfni_avx512_ctr_crypt_64way; + + return simd_register_skciphers_compat(aria_algs, + ARRAY_SIZE(aria_algs), + aria_simd_algs); +} + +static void __exit aria_avx512_exit(void) +{ + simd_unregister_skciphers(aria_algs, ARRAY_SIZE(aria_algs), + aria_simd_algs); +} + +module_init(aria_avx512_init); +module_exit(aria_avx512_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Taehee Yoo "); +MODULE_DESCRIPTION("ARIA Cipher Algorithm, AVX512/GFNI optimized"); +MODULE_ALIAS_CRYPTO("aria"); +MODULE_ALIAS_CRYPTO("aria-gfni-avx512"); From 3c657e8689ab50dd201abfe16c86653fda74bf25 Mon Sep 17 00:00:00 2001 From: Danny Tsen Date: Sun, 4 Dec 2022 19:34:53 -0500 Subject: [PATCH 026/156] crypto: p10-aes-gcm - Update Kconfig and Makefile Defined CRYPTO_P10_AES_GCM in Kconfig to support AES/GCM stitched implementation for Power10+ CPU. Added a new module driver p10-aes-gcm-crypto. Signed-off-by: Danny Tsen Signed-off-by: Herbert Xu --- arch/powerpc/crypto/Kconfig | 11 +++++++++++ arch/powerpc/crypto/Makefile | 10 ++++++++++ 2 files changed, 21 insertions(+) diff --git a/arch/powerpc/crypto/Kconfig b/arch/powerpc/crypto/Kconfig index c1b964447401..db7d99383993 100644 --- a/arch/powerpc/crypto/Kconfig +++ b/arch/powerpc/crypto/Kconfig @@ -94,4 +94,15 @@ config CRYPTO_AES_PPC_SPE architecture specific assembler implementations that work on 1KB tables or 256 bytes S-boxes. +config CRYPTO_P10_AES_GCM + tristate "Stitched AES/GCM acceleration support on P10+ CPU (PPC)" + depends on PPC64 + select CRYPTO_LIB_AES + select CRYPTO_ALGAPI + select CRYPTO_AEAD + default m + help + Support for cryptographic acceleration instructions on Power10+ CPU. + This module supports stitched acceleration for AES/GCM in hardware. + endmenu diff --git a/arch/powerpc/crypto/Makefile b/arch/powerpc/crypto/Makefile index 4808d97fede5..5b8252013abd 100644 --- a/arch/powerpc/crypto/Makefile +++ b/arch/powerpc/crypto/Makefile @@ -13,6 +13,7 @@ obj-$(CONFIG_CRYPTO_SHA256_PPC_SPE) += sha256-ppc-spe.o obj-$(CONFIG_CRYPTO_CRC32C_VPMSUM) += crc32c-vpmsum.o obj-$(CONFIG_CRYPTO_CRCT10DIF_VPMSUM) += crct10dif-vpmsum.o obj-$(CONFIG_CRYPTO_VPMSUM_TESTER) += crc-vpmsum_test.o +obj-$(CONFIG_CRYPTO_P10_AES_GCM) += p10-aes-gcm-crypto.o aes-ppc-spe-y := aes-spe-core.o aes-spe-keys.o aes-tab-4k.o aes-spe-modes.o aes-spe-glue.o md5-ppc-y := md5-asm.o md5-glue.o @@ -21,3 +22,12 @@ sha1-ppc-spe-y := sha1-spe-asm.o sha1-spe-glue.o sha256-ppc-spe-y := sha256-spe-asm.o sha256-spe-glue.o crc32c-vpmsum-y := crc32c-vpmsum_asm.o crc32c-vpmsum_glue.o crct10dif-vpmsum-y := crct10dif-vpmsum_asm.o crct10dif-vpmsum_glue.o +p10-aes-gcm-crypto-y := p10-aes-gcm-glue.o p10_aes_gcm.o ghashp8-ppc.o aesp8-ppc.o + +quiet_cmd_perl = PERL $@ + cmd_perl = $(PERL) $< $(if $(CONFIG_CPU_LITTLE_ENDIAN), linux-ppc64le, linux-ppc64) > $@ + +targets += aesp8-ppc.S ghashp8-ppc.S + +$(obj)/aesp8-ppc.S $(obj)/ghashp8-ppc.S: $(obj)/%.S: $(src)/%.pl FORCE + $(call if_changed,perl) From cc40379b6e19d85bd389f979dc185b5bb478a385 Mon Sep 17 00:00:00 2001 From: Danny Tsen Date: Sun, 4 Dec 2022 19:34:54 -0500 Subject: [PATCH 027/156] crypto: p10-aes-gcm - Glue code for AES/GCM stitched implementation Signed-off-by: Danny Tsen Signed-off-by: Herbert Xu --- arch/powerpc/crypto/p10-aes-gcm-glue.c | 345 +++++++++++++++++++++++++ 1 file changed, 345 insertions(+) create mode 100644 arch/powerpc/crypto/p10-aes-gcm-glue.c diff --git a/arch/powerpc/crypto/p10-aes-gcm-glue.c b/arch/powerpc/crypto/p10-aes-gcm-glue.c new file mode 100644 index 000000000000..777e6b5254da --- /dev/null +++ b/arch/powerpc/crypto/p10-aes-gcm-glue.c @@ -0,0 +1,345 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Glue code for accelerated AES-GCM stitched implementation for ppc64le. + * + * Copyright 2022- IBM Inc. All rights reserved + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define PPC_MODULE_FEATURE_P10 (32 + ilog2(PPC_FEATURE2_ARCH_3_1)) +#define PPC_ALIGN 16 +#define GCM_IV_SIZE 12 + +MODULE_DESCRIPTION("PPC64le AES-GCM with Stitched implementation"); +MODULE_AUTHOR("Danny Tsen aadLen = alen; + i = alen & ~0xf; + if (i) { + gcm_ghash_p8(nXi, hash->Htable+32, aad, i); + aad += i; + alen -= i; + } + if (alen) { + for (i = 0; i < alen; i++) + nXi[i] ^= aad[i]; + + memset(gctx->aad_hash, 0, 16); + gcm_ghash_p8(gctx->aad_hash, hash->Htable+32, nXi, 16); + } else { + memcpy(gctx->aad_hash, nXi, 16); + } + + memcpy(hash->Htable, gctx->aad_hash, 16); +} + +static void gcmp10_init(struct gcm_ctx *gctx, u8 *iv, unsigned char *rdkey, + struct Hash_ctx *hash, u8 *assoc, unsigned int assoclen) +{ + __be32 counter = cpu_to_be32(1); + + aes_p8_encrypt(hash->H, hash->H, rdkey); + set_subkey(hash->H); + gcm_init_htable(hash->Htable+32, hash->H); + + *((__be32 *)(iv+12)) = counter; + + gctx->Plen = 0; + + /* + * Encrypt counter vector as iv tag and increment counter. + */ + aes_p8_encrypt(iv, gctx->ivtag, rdkey); + + counter = cpu_to_be32(2); + *((__be32 *)(iv+12)) = counter; + memcpy(gctx->iv, iv, 16); + + gctx->aadLen = assoclen; + memset(gctx->aad_hash, 0, 16); + if (assoclen) + set_aad(gctx, hash, assoc, assoclen); +} + +static void finish_tag(struct gcm_ctx *gctx, struct Hash_ctx *hash, int len) +{ + int i; + unsigned char len_ac[16 + PPC_ALIGN]; + unsigned char *aclen = PTR_ALIGN((void *)len_ac, PPC_ALIGN); + __be64 clen = cpu_to_be64(len << 3); + __be64 alen = cpu_to_be64(gctx->aadLen << 3); + + if (len == 0 && gctx->aadLen == 0) { + memcpy(hash->Htable, gctx->ivtag, 16); + return; + } + + /* + * Len is in bits. + */ + *((__be64 *)(aclen)) = alen; + *((__be64 *)(aclen+8)) = clen; + + /* + * hash (AAD len and len) + */ + gcm_ghash_p8(hash->Htable, hash->Htable+32, aclen, 16); + + for (i = 0; i < 16; i++) + hash->Htable[i] ^= gctx->ivtag[i]; +} + +static int set_authsize(struct crypto_aead *tfm, unsigned int authsize) +{ + switch (authsize) { + case 4: + case 8: + case 12: + case 13: + case 14: + case 15: + case 16: + break; + default: + return -EINVAL; + } + + return 0; +} + +static int p10_aes_gcm_setkey(struct crypto_aead *aead, const u8 *key, + unsigned int keylen) +{ + struct crypto_tfm *tfm = crypto_aead_tfm(aead); + struct p10_aes_gcm_ctx *ctx = crypto_tfm_ctx(tfm); + int ret; + + vsx_begin(); + ret = aes_p8_set_encrypt_key(key, keylen * 8, &ctx->enc_key); + vsx_end(); + + return ret ? -EINVAL : 0; +} + +static int p10_aes_gcm_crypt(struct aead_request *req, int enc) +{ + struct crypto_tfm *tfm = req->base.tfm; + struct p10_aes_gcm_ctx *ctx = crypto_tfm_ctx(tfm); + u8 databuf[sizeof(struct gcm_ctx) + PPC_ALIGN]; + struct gcm_ctx *gctx = PTR_ALIGN((void *)databuf, PPC_ALIGN); + u8 hashbuf[sizeof(struct Hash_ctx) + PPC_ALIGN]; + struct Hash_ctx *hash = PTR_ALIGN((void *)hashbuf, PPC_ALIGN); + struct scatter_walk assoc_sg_walk; + struct skcipher_walk walk; + u8 *assocmem = NULL; + u8 *assoc; + unsigned int assoclen = req->assoclen; + unsigned int cryptlen = req->cryptlen; + unsigned char ivbuf[AES_BLOCK_SIZE+PPC_ALIGN]; + unsigned char *iv = PTR_ALIGN((void *)ivbuf, PPC_ALIGN); + int ret; + unsigned long auth_tag_len = crypto_aead_authsize(__crypto_aead_cast(tfm)); + u8 otag[16]; + int total_processed = 0; + + memset(databuf, 0, sizeof(databuf)); + memset(hashbuf, 0, sizeof(hashbuf)); + memset(ivbuf, 0, sizeof(ivbuf)); + memcpy(iv, req->iv, GCM_IV_SIZE); + + /* Linearize assoc, if not already linear */ + if (req->src->length >= assoclen && req->src->length) { + scatterwalk_start(&assoc_sg_walk, req->src); + assoc = scatterwalk_map(&assoc_sg_walk); + } else { + gfp_t flags = (req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP) ? + GFP_KERNEL : GFP_ATOMIC; + + /* assoc can be any length, so must be on heap */ + assocmem = kmalloc(assoclen, flags); + if (unlikely(!assocmem)) + return -ENOMEM; + assoc = assocmem; + + scatterwalk_map_and_copy(assoc, req->src, 0, assoclen, 0); + } + + vsx_begin(); + gcmp10_init(gctx, iv, (unsigned char *) &ctx->enc_key, hash, assoc, assoclen); + vsx_end(); + + if (!assocmem) + scatterwalk_unmap(assoc); + else + kfree(assocmem); + + if (enc) + ret = skcipher_walk_aead_encrypt(&walk, req, false); + else + ret = skcipher_walk_aead_decrypt(&walk, req, false); + if (ret) + return ret; + + while (walk.nbytes > 0 && ret == 0) { + + vsx_begin(); + if (enc) + aes_p10_gcm_encrypt(walk.src.virt.addr, + walk.dst.virt.addr, + walk.nbytes, + &ctx->enc_key, gctx->iv, hash->Htable); + else + aes_p10_gcm_decrypt(walk.src.virt.addr, + walk.dst.virt.addr, + walk.nbytes, + &ctx->enc_key, gctx->iv, hash->Htable); + vsx_end(); + + total_processed += walk.nbytes; + ret = skcipher_walk_done(&walk, 0); + } + + if (ret) + return ret; + + /* Finalize hash */ + vsx_begin(); + finish_tag(gctx, hash, total_processed); + vsx_end(); + + /* copy Xi to end of dst */ + if (enc) + scatterwalk_map_and_copy(hash->Htable, req->dst, req->assoclen + cryptlen, + auth_tag_len, 1); + else { + scatterwalk_map_and_copy(otag, req->src, + req->assoclen + cryptlen - auth_tag_len, + auth_tag_len, 0); + + if (crypto_memneq(otag, hash->Htable, auth_tag_len)) { + memzero_explicit(hash->Htable, 16); + return -EBADMSG; + } + } + + return 0; +} + +static int p10_aes_gcm_encrypt(struct aead_request *req) +{ + return p10_aes_gcm_crypt(req, 1); +} + +static int p10_aes_gcm_decrypt(struct aead_request *req) +{ + return p10_aes_gcm_crypt(req, 0); +} + +static struct aead_alg gcm_aes_alg = { + .ivsize = GCM_IV_SIZE, + .maxauthsize = 16, + + .setauthsize = set_authsize, + .setkey = p10_aes_gcm_setkey, + .encrypt = p10_aes_gcm_encrypt, + .decrypt = p10_aes_gcm_decrypt, + + .base.cra_name = "gcm(aes)", + .base.cra_driver_name = "p10_aes_gcm", + .base.cra_priority = 2100, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct p10_aes_gcm_ctx), + .base.cra_module = THIS_MODULE, +}; + +static int __init p10_init(void) +{ + return crypto_register_aead(&gcm_aes_alg); +} + +static void __exit p10_exit(void) +{ + crypto_unregister_aead(&gcm_aes_alg); +} + +module_cpu_feature_match(PPC_MODULE_FEATURE_P10, p10_init); +module_exit(p10_exit); From ca68a96c37eb93791373c5debb0991df0410da9a Mon Sep 17 00:00:00 2001 From: Danny Tsen Date: Sun, 4 Dec 2022 19:34:55 -0500 Subject: [PATCH 028/156] crypto: p10-aes-gcm - An accelerated AES/GCM stitched implementation Improve overall performance of AES/GCM encrypt and decrypt operations for Power10+ CPU. Signed-off-by: Danny Tsen Signed-off-by: Herbert Xu --- arch/powerpc/crypto/p10_aes_gcm.S | 1519 +++++++++++++++++++++++++++++ 1 file changed, 1519 insertions(+) create mode 100644 arch/powerpc/crypto/p10_aes_gcm.S diff --git a/arch/powerpc/crypto/p10_aes_gcm.S b/arch/powerpc/crypto/p10_aes_gcm.S new file mode 100644 index 000000000000..2306ad7c5e36 --- /dev/null +++ b/arch/powerpc/crypto/p10_aes_gcm.S @@ -0,0 +1,1519 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + # + # Accelerated AES-GCM stitched implementation for ppc64le. + # + # Copyright 2022- IBM Inc. All rights reserved + # + #=================================================================================== + # Written by Danny Tsen + # + # GHASH is based on the Karatsuba multiplication method. + # + # Xi xor X1 + # + # X1 * H^4 + X2 * H^3 + x3 * H^2 + X4 * H = + # (X1.h * H4.h + xX.l * H4.l + X1 * H4) + + # (X2.h * H3.h + X2.l * H3.l + X2 * H3) + + # (X3.h * H2.h + X3.l * H2.l + X3 * H2) + + # (X4.h * H.h + X4.l * H.l + X4 * H) + # + # Xi = v0 + # H Poly = v2 + # Hash keys = v3 - v14 + # ( H.l, H, H.h) + # ( H^2.l, H^2, H^2.h) + # ( H^3.l, H^3, H^3.h) + # ( H^4.l, H^4, H^4.h) + # + # v30 is IV + # v31 - counter 1 + # + # AES used, + # vs0 - vs14 for round keys + # v15, v16, v17, v18, v19, v20, v21, v22 for 8 blocks (encrypted) + # + # This implementation uses stitched AES-GCM approach to improve overall performance. + # AES is implemented with 8x blocks and GHASH is using 2 4x blocks. + # + # =================================================================================== + # + +.machine "any" +.abiversion 1 +.text + + # 4x loops + # v15 - v18 - input states + # vs1 - vs9 - round keys + # +.macro Loop_aes_middle4x + xxlor 19+32, 1, 1 + xxlor 20+32, 2, 2 + xxlor 21+32, 3, 3 + xxlor 22+32, 4, 4 + + vcipher 15, 15, 19 + vcipher 16, 16, 19 + vcipher 17, 17, 19 + vcipher 18, 18, 19 + + vcipher 15, 15, 20 + vcipher 16, 16, 20 + vcipher 17, 17, 20 + vcipher 18, 18, 20 + + vcipher 15, 15, 21 + vcipher 16, 16, 21 + vcipher 17, 17, 21 + vcipher 18, 18, 21 + + vcipher 15, 15, 22 + vcipher 16, 16, 22 + vcipher 17, 17, 22 + vcipher 18, 18, 22 + + xxlor 19+32, 5, 5 + xxlor 20+32, 6, 6 + xxlor 21+32, 7, 7 + xxlor 22+32, 8, 8 + + vcipher 15, 15, 19 + vcipher 16, 16, 19 + vcipher 17, 17, 19 + vcipher 18, 18, 19 + + vcipher 15, 15, 20 + vcipher 16, 16, 20 + vcipher 17, 17, 20 + vcipher 18, 18, 20 + + vcipher 15, 15, 21 + vcipher 16, 16, 21 + vcipher 17, 17, 21 + vcipher 18, 18, 21 + + vcipher 15, 15, 22 + vcipher 16, 16, 22 + vcipher 17, 17, 22 + vcipher 18, 18, 22 + + xxlor 23+32, 9, 9 + vcipher 15, 15, 23 + vcipher 16, 16, 23 + vcipher 17, 17, 23 + vcipher 18, 18, 23 +.endm + + # 8x loops + # v15 - v22 - input states + # vs1 - vs9 - round keys + # +.macro Loop_aes_middle8x + xxlor 23+32, 1, 1 + xxlor 24+32, 2, 2 + xxlor 25+32, 3, 3 + xxlor 26+32, 4, 4 + + vcipher 15, 15, 23 + vcipher 16, 16, 23 + vcipher 17, 17, 23 + vcipher 18, 18, 23 + vcipher 19, 19, 23 + vcipher 20, 20, 23 + vcipher 21, 21, 23 + vcipher 22, 22, 23 + + vcipher 15, 15, 24 + vcipher 16, 16, 24 + vcipher 17, 17, 24 + vcipher 18, 18, 24 + vcipher 19, 19, 24 + vcipher 20, 20, 24 + vcipher 21, 21, 24 + vcipher 22, 22, 24 + + vcipher 15, 15, 25 + vcipher 16, 16, 25 + vcipher 17, 17, 25 + vcipher 18, 18, 25 + vcipher 19, 19, 25 + vcipher 20, 20, 25 + vcipher 21, 21, 25 + vcipher 22, 22, 25 + + vcipher 15, 15, 26 + vcipher 16, 16, 26 + vcipher 17, 17, 26 + vcipher 18, 18, 26 + vcipher 19, 19, 26 + vcipher 20, 20, 26 + vcipher 21, 21, 26 + vcipher 22, 22, 26 + + xxlor 23+32, 5, 5 + xxlor 24+32, 6, 6 + xxlor 25+32, 7, 7 + xxlor 26+32, 8, 8 + + vcipher 15, 15, 23 + vcipher 16, 16, 23 + vcipher 17, 17, 23 + vcipher 18, 18, 23 + vcipher 19, 19, 23 + vcipher 20, 20, 23 + vcipher 21, 21, 23 + vcipher 22, 22, 23 + + vcipher 15, 15, 24 + vcipher 16, 16, 24 + vcipher 17, 17, 24 + vcipher 18, 18, 24 + vcipher 19, 19, 24 + vcipher 20, 20, 24 + vcipher 21, 21, 24 + vcipher 22, 22, 24 + + vcipher 15, 15, 25 + vcipher 16, 16, 25 + vcipher 17, 17, 25 + vcipher 18, 18, 25 + vcipher 19, 19, 25 + vcipher 20, 20, 25 + vcipher 21, 21, 25 + vcipher 22, 22, 25 + + vcipher 15, 15, 26 + vcipher 16, 16, 26 + vcipher 17, 17, 26 + vcipher 18, 18, 26 + vcipher 19, 19, 26 + vcipher 20, 20, 26 + vcipher 21, 21, 26 + vcipher 22, 22, 26 + + xxlor 23+32, 9, 9 + vcipher 15, 15, 23 + vcipher 16, 16, 23 + vcipher 17, 17, 23 + vcipher 18, 18, 23 + vcipher 19, 19, 23 + vcipher 20, 20, 23 + vcipher 21, 21, 23 + vcipher 22, 22, 23 +.endm + +.macro Loop_aes_middle_1x + xxlor 19+32, 1, 1 + xxlor 20+32, 2, 2 + xxlor 21+32, 3, 3 + xxlor 22+32, 4, 4 + + vcipher 15, 15, 19 + vcipher 15, 15, 20 + vcipher 15, 15, 21 + vcipher 15, 15, 22 + + xxlor 19+32, 5, 5 + xxlor 20+32, 6, 6 + xxlor 21+32, 7, 7 + xxlor 22+32, 8, 8 + + vcipher 15, 15, 19 + vcipher 15, 15, 20 + vcipher 15, 15, 21 + vcipher 15, 15, 22 + + xxlor 19+32, 9, 9 + vcipher 15, 15, 19 +.endm + + # + # Compute 4x hash values based on Karatsuba method. + # +ppc_aes_gcm_ghash: + vxor 15, 15, 0 + + vpmsumd 23, 12, 15 # H4.L * X.L + vpmsumd 24, 9, 16 + vpmsumd 25, 6, 17 + vpmsumd 26, 3, 18 + + vxor 23, 23, 24 + vxor 23, 23, 25 + vxor 23, 23, 26 # L + + vpmsumd 24, 13, 15 # H4.L * X.H + H4.H * X.L + vpmsumd 25, 10, 16 # H3.L * X1.H + H3.H * X1.L + vpmsumd 26, 7, 17 + vpmsumd 27, 4, 18 + + vxor 24, 24, 25 + vxor 24, 24, 26 + vxor 24, 24, 27 # M + + # sum hash and reduction with H Poly + vpmsumd 28, 23, 2 # reduction + + vxor 29, 29, 29 + vsldoi 26, 24, 29, 8 # mL + vsldoi 29, 29, 24, 8 # mH + vxor 23, 23, 26 # mL + L + + vsldoi 23, 23, 23, 8 # swap + vxor 23, 23, 28 + + vpmsumd 24, 14, 15 # H4.H * X.H + vpmsumd 25, 11, 16 + vpmsumd 26, 8, 17 + vpmsumd 27, 5, 18 + + vxor 24, 24, 25 + vxor 24, 24, 26 + vxor 24, 24, 27 + + vxor 24, 24, 29 + + # sum hash and reduction with H Poly + vsldoi 27, 23, 23, 8 # swap + vpmsumd 23, 23, 2 + vxor 27, 27, 24 + vxor 23, 23, 27 + + xxlor 32, 23+32, 23+32 # update hash + + blr + + # + # Combine two 4x ghash + # v15 - v22 - input blocks + # +.macro ppc_aes_gcm_ghash2_4x + # first 4x hash + vxor 15, 15, 0 # Xi + X + + vpmsumd 23, 12, 15 # H4.L * X.L + vpmsumd 24, 9, 16 + vpmsumd 25, 6, 17 + vpmsumd 26, 3, 18 + + vxor 23, 23, 24 + vxor 23, 23, 25 + vxor 23, 23, 26 # L + + vpmsumd 24, 13, 15 # H4.L * X.H + H4.H * X.L + vpmsumd 25, 10, 16 # H3.L * X1.H + H3.H * X1.L + vpmsumd 26, 7, 17 + vpmsumd 27, 4, 18 + + vxor 24, 24, 25 + vxor 24, 24, 26 + + # sum hash and reduction with H Poly + vpmsumd 28, 23, 2 # reduction + + vxor 29, 29, 29 + + vxor 24, 24, 27 # M + vsldoi 26, 24, 29, 8 # mL + vsldoi 29, 29, 24, 8 # mH + vxor 23, 23, 26 # mL + L + + vsldoi 23, 23, 23, 8 # swap + vxor 23, 23, 28 + + vpmsumd 24, 14, 15 # H4.H * X.H + vpmsumd 25, 11, 16 + vpmsumd 26, 8, 17 + vpmsumd 27, 5, 18 + + vxor 24, 24, 25 + vxor 24, 24, 26 + vxor 24, 24, 27 # H + + vxor 24, 24, 29 # H + mH + + # sum hash and reduction with H Poly + vsldoi 27, 23, 23, 8 # swap + vpmsumd 23, 23, 2 + vxor 27, 27, 24 + vxor 27, 23, 27 # 1st Xi + + # 2nd 4x hash + vpmsumd 24, 9, 20 + vpmsumd 25, 6, 21 + vpmsumd 26, 3, 22 + vxor 19, 19, 27 # Xi + X + vpmsumd 23, 12, 19 # H4.L * X.L + + vxor 23, 23, 24 + vxor 23, 23, 25 + vxor 23, 23, 26 # L + + vpmsumd 24, 13, 19 # H4.L * X.H + H4.H * X.L + vpmsumd 25, 10, 20 # H3.L * X1.H + H3.H * X1.L + vpmsumd 26, 7, 21 + vpmsumd 27, 4, 22 + + vxor 24, 24, 25 + vxor 24, 24, 26 + + # sum hash and reduction with H Poly + vpmsumd 28, 23, 2 # reduction + + vxor 29, 29, 29 + + vxor 24, 24, 27 # M + vsldoi 26, 24, 29, 8 # mL + vsldoi 29, 29, 24, 8 # mH + vxor 23, 23, 26 # mL + L + + vsldoi 23, 23, 23, 8 # swap + vxor 23, 23, 28 + + vpmsumd 24, 14, 19 # H4.H * X.H + vpmsumd 25, 11, 20 + vpmsumd 26, 8, 21 + vpmsumd 27, 5, 22 + + vxor 24, 24, 25 + vxor 24, 24, 26 + vxor 24, 24, 27 # H + + vxor 24, 24, 29 # H + mH + + # sum hash and reduction with H Poly + vsldoi 27, 23, 23, 8 # swap + vpmsumd 23, 23, 2 + vxor 27, 27, 24 + vxor 23, 23, 27 + + xxlor 32, 23+32, 23+32 # update hash + +.endm + + # + # Compute update single hash + # +.macro ppc_update_hash_1x + vxor 28, 28, 0 + + vxor 19, 19, 19 + + vpmsumd 22, 3, 28 # L + vpmsumd 23, 4, 28 # M + vpmsumd 24, 5, 28 # H + + vpmsumd 27, 22, 2 # reduction + + vsldoi 25, 23, 19, 8 # mL + vsldoi 26, 19, 23, 8 # mH + vxor 22, 22, 25 # LL + LL + vxor 24, 24, 26 # HH + HH + + vsldoi 22, 22, 22, 8 # swap + vxor 22, 22, 27 + + vsldoi 20, 22, 22, 8 # swap + vpmsumd 22, 22, 2 # reduction + vxor 20, 20, 24 + vxor 22, 22, 20 + + vmr 0, 22 # update hash + +.endm + +.macro SAVE_REGS + stdu 1,-640(1) + mflr 0 + + std 14,112(1) + std 15,120(1) + std 16,128(1) + std 17,136(1) + std 18,144(1) + std 19,152(1) + std 20,160(1) + std 21,168(1) + li 9, 256 + stvx 20, 9, 1 + addi 9, 9, 16 + stvx 21, 9, 1 + addi 9, 9, 16 + stvx 22, 9, 1 + addi 9, 9, 16 + stvx 23, 9, 1 + addi 9, 9, 16 + stvx 24, 9, 1 + addi 9, 9, 16 + stvx 25, 9, 1 + addi 9, 9, 16 + stvx 26, 9, 1 + addi 9, 9, 16 + stvx 27, 9, 1 + addi 9, 9, 16 + stvx 28, 9, 1 + addi 9, 9, 16 + stvx 29, 9, 1 + addi 9, 9, 16 + stvx 30, 9, 1 + addi 9, 9, 16 + stvx 31, 9, 1 + stxv 14, 464(1) + stxv 15, 480(1) + stxv 16, 496(1) + stxv 17, 512(1) + stxv 18, 528(1) + stxv 19, 544(1) + stxv 20, 560(1) + stxv 21, 576(1) + stxv 22, 592(1) + std 0, 656(1) +.endm + +.macro RESTORE_REGS + lxv 14, 464(1) + lxv 15, 480(1) + lxv 16, 496(1) + lxv 17, 512(1) + lxv 18, 528(1) + lxv 19, 544(1) + lxv 20, 560(1) + lxv 21, 576(1) + lxv 22, 592(1) + li 9, 256 + lvx 20, 9, 1 + addi 9, 9, 16 + lvx 21, 9, 1 + addi 9, 9, 16 + lvx 22, 9, 1 + addi 9, 9, 16 + lvx 23, 9, 1 + addi 9, 9, 16 + lvx 24, 9, 1 + addi 9, 9, 16 + lvx 25, 9, 1 + addi 9, 9, 16 + lvx 26, 9, 1 + addi 9, 9, 16 + lvx 27, 9, 1 + addi 9, 9, 16 + lvx 28, 9, 1 + addi 9, 9, 16 + lvx 29, 9, 1 + addi 9, 9, 16 + lvx 30, 9, 1 + addi 9, 9, 16 + lvx 31, 9, 1 + + ld 0, 656(1) + ld 14,112(1) + ld 15,120(1) + ld 16,128(1) + ld 17,136(1) + ld 18,144(1) + ld 19,152(1) + ld 20,160(1) + ld 21,168(1) + + mtlr 0 + addi 1, 1, 640 +.endm + +.macro LOAD_HASH_TABLE + # Load Xi + lxvb16x 32, 0, 8 # load Xi + + # load Hash - h^4, h^3, h^2, h + li 10, 32 + lxvd2x 2+32, 10, 8 # H Poli + li 10, 48 + lxvd2x 3+32, 10, 8 # Hl + li 10, 64 + lxvd2x 4+32, 10, 8 # H + li 10, 80 + lxvd2x 5+32, 10, 8 # Hh + + li 10, 96 + lxvd2x 6+32, 10, 8 # H^2l + li 10, 112 + lxvd2x 7+32, 10, 8 # H^2 + li 10, 128 + lxvd2x 8+32, 10, 8 # H^2h + + li 10, 144 + lxvd2x 9+32, 10, 8 # H^3l + li 10, 160 + lxvd2x 10+32, 10, 8 # H^3 + li 10, 176 + lxvd2x 11+32, 10, 8 # H^3h + + li 10, 192 + lxvd2x 12+32, 10, 8 # H^4l + li 10, 208 + lxvd2x 13+32, 10, 8 # H^4 + li 10, 224 + lxvd2x 14+32, 10, 8 # H^4h +.endm + + # + # aes_p10_gcm_encrypt (const void *inp, void *out, size_t len, + # const char *rk, unsigned char iv[16], void *Xip); + # + # r3 - inp + # r4 - out + # r5 - len + # r6 - AES round keys + # r7 - iv and other data + # r8 - Xi, HPoli, hash keys + # + # rounds is at offset 240 in rk + # Xi is at 0 in gcm_table (Xip). + # +.global aes_p10_gcm_encrypt +.align 5 +aes_p10_gcm_encrypt: + + SAVE_REGS + + LOAD_HASH_TABLE + + # initialize ICB: GHASH( IV ), IV - r7 + lxvb16x 30+32, 0, 7 # load IV - v30 + + mr 12, 5 # length + li 11, 0 # block index + + # counter 1 + vxor 31, 31, 31 + vspltisb 22, 1 + vsldoi 31, 31, 22,1 # counter 1 + + # load round key to VSR + lxv 0, 0(6) + lxv 1, 0x10(6) + lxv 2, 0x20(6) + lxv 3, 0x30(6) + lxv 4, 0x40(6) + lxv 5, 0x50(6) + lxv 6, 0x60(6) + lxv 7, 0x70(6) + lxv 8, 0x80(6) + lxv 9, 0x90(6) + lxv 10, 0xa0(6) + + # load rounds - 10 (128), 12 (192), 14 (256) + lwz 9,240(6) + + # + # vxor state, state, w # addroundkey + xxlor 32+29, 0, 0 + vxor 15, 30, 29 # IV + round key - add round key 0 + + cmpdi 9, 10 + beq Loop_aes_gcm_8x + + # load 2 more round keys (v11, v12) + lxv 11, 0xb0(6) + lxv 12, 0xc0(6) + + cmpdi 9, 12 + beq Loop_aes_gcm_8x + + # load 2 more round keys (v11, v12, v13, v14) + lxv 13, 0xd0(6) + lxv 14, 0xe0(6) + cmpdi 9, 14 + beq Loop_aes_gcm_8x + + b aes_gcm_out + +.align 5 +Loop_aes_gcm_8x: + mr 14, 3 + mr 9, 4 + + # + # check partial block + # +Continue_partial_check: + ld 15, 56(7) + cmpdi 15, 0 + beq Continue + bgt Final_block + cmpdi 15, 16 + blt Final_block + +Continue: + # n blcoks + li 10, 128 + divdu 10, 12, 10 # n 128 bytes-blocks + cmpdi 10, 0 + beq Loop_last_block + + vaddudm 30, 30, 31 # IV + counter + vxor 16, 30, 29 + vaddudm 30, 30, 31 + vxor 17, 30, 29 + vaddudm 30, 30, 31 + vxor 18, 30, 29 + vaddudm 30, 30, 31 + vxor 19, 30, 29 + vaddudm 30, 30, 31 + vxor 20, 30, 29 + vaddudm 30, 30, 31 + vxor 21, 30, 29 + vaddudm 30, 30, 31 + vxor 22, 30, 29 + + mtctr 10 + + li 15, 16 + li 16, 32 + li 17, 48 + li 18, 64 + li 19, 80 + li 20, 96 + li 21, 112 + + lwz 10, 240(6) + +Loop_8x_block: + + lxvb16x 15, 0, 14 # load block + lxvb16x 16, 15, 14 # load block + lxvb16x 17, 16, 14 # load block + lxvb16x 18, 17, 14 # load block + lxvb16x 19, 18, 14 # load block + lxvb16x 20, 19, 14 # load block + lxvb16x 21, 20, 14 # load block + lxvb16x 22, 21, 14 # load block + addi 14, 14, 128 + + Loop_aes_middle8x + + xxlor 23+32, 10, 10 + + cmpdi 10, 10 + beq Do_next_ghash + + # 192 bits + xxlor 24+32, 11, 11 + + vcipher 15, 15, 23 + vcipher 16, 16, 23 + vcipher 17, 17, 23 + vcipher 18, 18, 23 + vcipher 19, 19, 23 + vcipher 20, 20, 23 + vcipher 21, 21, 23 + vcipher 22, 22, 23 + + vcipher 15, 15, 24 + vcipher 16, 16, 24 + vcipher 17, 17, 24 + vcipher 18, 18, 24 + vcipher 19, 19, 24 + vcipher 20, 20, 24 + vcipher 21, 21, 24 + vcipher 22, 22, 24 + + xxlor 23+32, 12, 12 + + cmpdi 10, 12 + beq Do_next_ghash + + # 256 bits + xxlor 24+32, 13, 13 + + vcipher 15, 15, 23 + vcipher 16, 16, 23 + vcipher 17, 17, 23 + vcipher 18, 18, 23 + vcipher 19, 19, 23 + vcipher 20, 20, 23 + vcipher 21, 21, 23 + vcipher 22, 22, 23 + + vcipher 15, 15, 24 + vcipher 16, 16, 24 + vcipher 17, 17, 24 + vcipher 18, 18, 24 + vcipher 19, 19, 24 + vcipher 20, 20, 24 + vcipher 21, 21, 24 + vcipher 22, 22, 24 + + xxlor 23+32, 14, 14 + + cmpdi 10, 14 + beq Do_next_ghash + b aes_gcm_out + +Do_next_ghash: + + # + # last round + vcipherlast 15, 15, 23 + vcipherlast 16, 16, 23 + + xxlxor 47, 47, 15 + stxvb16x 47, 0, 9 # store output + xxlxor 48, 48, 16 + stxvb16x 48, 15, 9 # store output + + vcipherlast 17, 17, 23 + vcipherlast 18, 18, 23 + + xxlxor 49, 49, 17 + stxvb16x 49, 16, 9 # store output + xxlxor 50, 50, 18 + stxvb16x 50, 17, 9 # store output + + vcipherlast 19, 19, 23 + vcipherlast 20, 20, 23 + + xxlxor 51, 51, 19 + stxvb16x 51, 18, 9 # store output + xxlxor 52, 52, 20 + stxvb16x 52, 19, 9 # store output + + vcipherlast 21, 21, 23 + vcipherlast 22, 22, 23 + + xxlxor 53, 53, 21 + stxvb16x 53, 20, 9 # store output + xxlxor 54, 54, 22 + stxvb16x 54, 21, 9 # store output + + addi 9, 9, 128 + + # ghash here + ppc_aes_gcm_ghash2_4x + + xxlor 27+32, 0, 0 + vaddudm 30, 30, 31 # IV + counter + vmr 29, 30 + vxor 15, 30, 27 # add round key + vaddudm 30, 30, 31 + vxor 16, 30, 27 + vaddudm 30, 30, 31 + vxor 17, 30, 27 + vaddudm 30, 30, 31 + vxor 18, 30, 27 + vaddudm 30, 30, 31 + vxor 19, 30, 27 + vaddudm 30, 30, 31 + vxor 20, 30, 27 + vaddudm 30, 30, 31 + vxor 21, 30, 27 + vaddudm 30, 30, 31 + vxor 22, 30, 27 + + addi 12, 12, -128 + addi 11, 11, 128 + + bdnz Loop_8x_block + + vmr 30, 29 + stxvb16x 30+32, 0, 7 # update IV + +Loop_last_block: + cmpdi 12, 0 + beq aes_gcm_out + + # loop last few blocks + li 10, 16 + divdu 10, 12, 10 + + mtctr 10 + + lwz 10, 240(6) + + cmpdi 12, 16 + blt Final_block + +Next_rem_block: + lxvb16x 15, 0, 14 # load block + + Loop_aes_middle_1x + + xxlor 23+32, 10, 10 + + cmpdi 10, 10 + beq Do_next_1x + + # 192 bits + xxlor 24+32, 11, 11 + + vcipher 15, 15, 23 + vcipher 15, 15, 24 + + xxlor 23+32, 12, 12 + + cmpdi 10, 12 + beq Do_next_1x + + # 256 bits + xxlor 24+32, 13, 13 + + vcipher 15, 15, 23 + vcipher 15, 15, 24 + + xxlor 23+32, 14, 14 + + cmpdi 10, 14 + beq Do_next_1x + +Do_next_1x: + vcipherlast 15, 15, 23 + + xxlxor 47, 47, 15 + stxvb16x 47, 0, 9 # store output + addi 14, 14, 16 + addi 9, 9, 16 + + vmr 28, 15 + ppc_update_hash_1x + + addi 12, 12, -16 + addi 11, 11, 16 + xxlor 19+32, 0, 0 + vaddudm 30, 30, 31 # IV + counter + vxor 15, 30, 19 # add round key + + bdnz Next_rem_block + + li 15, 0 + std 15, 56(7) # clear partial? + stxvb16x 30+32, 0, 7 # update IV + cmpdi 12, 0 + beq aes_gcm_out + +Final_block: + lwz 10, 240(6) + Loop_aes_middle_1x + + xxlor 23+32, 10, 10 + + cmpdi 10, 10 + beq Do_final_1x + + # 192 bits + xxlor 24+32, 11, 11 + + vcipher 15, 15, 23 + vcipher 15, 15, 24 + + xxlor 23+32, 12, 12 + + cmpdi 10, 12 + beq Do_final_1x + + # 256 bits + xxlor 24+32, 13, 13 + + vcipher 15, 15, 23 + vcipher 15, 15, 24 + + xxlor 23+32, 14, 14 + + cmpdi 10, 14 + beq Do_final_1x + +Do_final_1x: + vcipherlast 15, 15, 23 + + # check partial block + li 21, 0 # encrypt + ld 15, 56(7) # partial? + cmpdi 15, 0 + beq Normal_block + bl Do_partial_block + + cmpdi 12, 0 + ble aes_gcm_out + + b Continue_partial_check + +Normal_block: + lxvb16x 15, 0, 14 # load last block + xxlxor 47, 47, 15 + + # create partial block mask + li 15, 16 + sub 15, 15, 12 # index to the mask + + vspltisb 16, -1 # first 16 bytes - 0xffff...ff + vspltisb 17, 0 # second 16 bytes - 0x0000...00 + li 10, 192 + stvx 16, 10, 1 + addi 10, 10, 16 + stvx 17, 10, 1 + + addi 10, 1, 192 + lxvb16x 16, 15, 10 # load partial block mask + xxland 47, 47, 16 + + vmr 28, 15 + ppc_update_hash_1x + + # * should store only the remaining bytes. + bl Write_partial_block + + stxvb16x 30+32, 0, 7 # update IV + std 12, 56(7) # update partial? + li 16, 16 + + stxvb16x 32, 0, 8 # write out Xi + stxvb16x 32, 16, 8 # write out Xi + b aes_gcm_out + + # + # Compute data mask + # +.macro GEN_MASK _mask _start _end + vspltisb 16, -1 # first 16 bytes - 0xffff...ff + vspltisb 17, 0 # second 16 bytes - 0x0000...00 + li 10, 192 + stxvb16x 17+32, 10, 1 + add 10, 10, \_start + stxvb16x 16+32, 10, 1 + add 10, 10, \_end + stxvb16x 17+32, 10, 1 + + addi 10, 1, 192 + lxvb16x \_mask, 0, 10 # load partial block mask +.endm + + # + # Handle multiple partial blocks for encrypt and decrypt + # operations. + # +Do_partial_block: + add 17, 15, 5 + cmpdi 17, 16 + bgt Big_block + GEN_MASK 18, 15, 5 + b _Partial +Big_block: + li 16, 16 + GEN_MASK 18, 15, 16 + +_Partial: + lxvb16x 17+32, 0, 14 # load last block + sldi 16, 15, 3 + mtvsrdd 32+16, 0, 16 + vsro 17, 17, 16 + xxlxor 47, 47, 17+32 + xxland 47, 47, 18 + + vxor 0, 0, 0 # clear Xi + vmr 28, 15 + + cmpdi 21, 0 # encrypt/decrypt ops? + beq Skip_decrypt + xxland 32+28, 32+17, 18 + +Skip_decrypt: + + ppc_update_hash_1x + + li 16, 16 + lxvb16x 32+29, 16, 8 + vxor 0, 0, 29 + stxvb16x 32, 0, 8 # save Xi + stxvb16x 32, 16, 8 # save Xi + + # store partial block + # loop the rest of the stream if any + sldi 16, 15, 3 + mtvsrdd 32+16, 0, 16 + vslo 15, 15, 16 + #stxvb16x 15+32, 0, 9 # last block + + li 16, 16 + sub 17, 16, 15 # 16 - partial + + add 16, 15, 5 + cmpdi 16, 16 + bgt Larger_16 + mr 17, 5 +Larger_16: + + # write partial + li 10, 192 + stxvb16x 15+32, 10, 1 # save current block + + addi 10, 9, -1 + addi 16, 1, 191 + mtctr 17 # move partial byte count + +Write_last_partial: + lbzu 18, 1(16) + stbu 18, 1(10) + bdnz Write_last_partial + # Complete loop partial + + add 14, 14, 17 + add 9, 9, 17 + sub 12, 12, 17 + add 11, 11, 17 + + add 15, 15, 5 + cmpdi 15, 16 + blt Save_partial + + vaddudm 30, 30, 31 + stxvb16x 30+32, 0, 7 # update IV + xxlor 32+29, 0, 0 + vxor 15, 30, 29 # IV + round key - add round key 0 + li 15, 0 + std 15, 56(7) # partial done - clear + b Partial_done +Save_partial: + std 15, 56(7) # partial + +Partial_done: + blr + + # + # Write partial block + # r9 - output + # r12 - remaining bytes + # v15 - partial input data + # +Write_partial_block: + li 10, 192 + stxvb16x 15+32, 10, 1 # last block + + addi 10, 9, -1 + addi 16, 1, 191 + + mtctr 12 # remaining bytes + li 15, 0 + +Write_last_byte: + lbzu 14, 1(16) + stbu 14, 1(10) + bdnz Write_last_byte + blr + +aes_gcm_out: + # out = state + stxvb16x 32, 0, 8 # write out Xi + add 3, 11, 12 # return count + + RESTORE_REGS + blr + + # + # 8x Decrypt + # +.global aes_p10_gcm_decrypt +.align 5 +aes_p10_gcm_decrypt: + + SAVE_REGS + + LOAD_HASH_TABLE + + # initialize ICB: GHASH( IV ), IV - r7 + lxvb16x 30+32, 0, 7 # load IV - v30 + + mr 12, 5 # length + li 11, 0 # block index + + # counter 1 + vxor 31, 31, 31 + vspltisb 22, 1 + vsldoi 31, 31, 22,1 # counter 1 + + # load round key to VSR + lxv 0, 0(6) + lxv 1, 0x10(6) + lxv 2, 0x20(6) + lxv 3, 0x30(6) + lxv 4, 0x40(6) + lxv 5, 0x50(6) + lxv 6, 0x60(6) + lxv 7, 0x70(6) + lxv 8, 0x80(6) + lxv 9, 0x90(6) + lxv 10, 0xa0(6) + + # load rounds - 10 (128), 12 (192), 14 (256) + lwz 9,240(6) + + # + # vxor state, state, w # addroundkey + xxlor 32+29, 0, 0 + vxor 15, 30, 29 # IV + round key - add round key 0 + + cmpdi 9, 10 + beq Loop_aes_gcm_8x_dec + + # load 2 more round keys (v11, v12) + lxv 11, 0xb0(6) + lxv 12, 0xc0(6) + + cmpdi 9, 12 + beq Loop_aes_gcm_8x_dec + + # load 2 more round keys (v11, v12, v13, v14) + lxv 13, 0xd0(6) + lxv 14, 0xe0(6) + cmpdi 9, 14 + beq Loop_aes_gcm_8x_dec + + b aes_gcm_out + +.align 5 +Loop_aes_gcm_8x_dec: + mr 14, 3 + mr 9, 4 + + # + # check partial block + # +Continue_partial_check_dec: + ld 15, 56(7) + cmpdi 15, 0 + beq Continue_dec + bgt Final_block_dec + cmpdi 15, 16 + blt Final_block_dec + +Continue_dec: + # n blcoks + li 10, 128 + divdu 10, 12, 10 # n 128 bytes-blocks + cmpdi 10, 0 + beq Loop_last_block_dec + + vaddudm 30, 30, 31 # IV + counter + vxor 16, 30, 29 + vaddudm 30, 30, 31 + vxor 17, 30, 29 + vaddudm 30, 30, 31 + vxor 18, 30, 29 + vaddudm 30, 30, 31 + vxor 19, 30, 29 + vaddudm 30, 30, 31 + vxor 20, 30, 29 + vaddudm 30, 30, 31 + vxor 21, 30, 29 + vaddudm 30, 30, 31 + vxor 22, 30, 29 + + mtctr 10 + + li 15, 16 + li 16, 32 + li 17, 48 + li 18, 64 + li 19, 80 + li 20, 96 + li 21, 112 + + lwz 10, 240(6) + +Loop_8x_block_dec: + + lxvb16x 15, 0, 14 # load block + lxvb16x 16, 15, 14 # load block + lxvb16x 17, 16, 14 # load block + lxvb16x 18, 17, 14 # load block + lxvb16x 19, 18, 14 # load block + lxvb16x 20, 19, 14 # load block + lxvb16x 21, 20, 14 # load block + lxvb16x 22, 21, 14 # load block + addi 14, 14, 128 + + Loop_aes_middle8x + + xxlor 23+32, 10, 10 + + cmpdi 10, 10 + beq Do_next_ghash_dec + + # 192 bits + xxlor 24+32, 11, 11 + + vcipher 15, 15, 23 + vcipher 16, 16, 23 + vcipher 17, 17, 23 + vcipher 18, 18, 23 + vcipher 19, 19, 23 + vcipher 20, 20, 23 + vcipher 21, 21, 23 + vcipher 22, 22, 23 + + vcipher 15, 15, 24 + vcipher 16, 16, 24 + vcipher 17, 17, 24 + vcipher 18, 18, 24 + vcipher 19, 19, 24 + vcipher 20, 20, 24 + vcipher 21, 21, 24 + vcipher 22, 22, 24 + + xxlor 23+32, 12, 12 + + cmpdi 10, 12 + beq Do_next_ghash_dec + + # 256 bits + xxlor 24+32, 13, 13 + + vcipher 15, 15, 23 + vcipher 16, 16, 23 + vcipher 17, 17, 23 + vcipher 18, 18, 23 + vcipher 19, 19, 23 + vcipher 20, 20, 23 + vcipher 21, 21, 23 + vcipher 22, 22, 23 + + vcipher 15, 15, 24 + vcipher 16, 16, 24 + vcipher 17, 17, 24 + vcipher 18, 18, 24 + vcipher 19, 19, 24 + vcipher 20, 20, 24 + vcipher 21, 21, 24 + vcipher 22, 22, 24 + + xxlor 23+32, 14, 14 + + cmpdi 10, 14 + beq Do_next_ghash_dec + b aes_gcm_out + +Do_next_ghash_dec: + + # + # last round + vcipherlast 15, 15, 23 + vcipherlast 16, 16, 23 + + xxlxor 47, 47, 15 + stxvb16x 47, 0, 9 # store output + xxlxor 48, 48, 16 + stxvb16x 48, 15, 9 # store output + + vcipherlast 17, 17, 23 + vcipherlast 18, 18, 23 + + xxlxor 49, 49, 17 + stxvb16x 49, 16, 9 # store output + xxlxor 50, 50, 18 + stxvb16x 50, 17, 9 # store output + + vcipherlast 19, 19, 23 + vcipherlast 20, 20, 23 + + xxlxor 51, 51, 19 + stxvb16x 51, 18, 9 # store output + xxlxor 52, 52, 20 + stxvb16x 52, 19, 9 # store output + + vcipherlast 21, 21, 23 + vcipherlast 22, 22, 23 + + xxlxor 53, 53, 21 + stxvb16x 53, 20, 9 # store output + xxlxor 54, 54, 22 + stxvb16x 54, 21, 9 # store output + + addi 9, 9, 128 + + xxlor 15+32, 15, 15 + xxlor 16+32, 16, 16 + xxlor 17+32, 17, 17 + xxlor 18+32, 18, 18 + xxlor 19+32, 19, 19 + xxlor 20+32, 20, 20 + xxlor 21+32, 21, 21 + xxlor 22+32, 22, 22 + + # ghash here + ppc_aes_gcm_ghash2_4x + + xxlor 27+32, 0, 0 + vaddudm 30, 30, 31 # IV + counter + vmr 29, 30 + vxor 15, 30, 27 # add round key + vaddudm 30, 30, 31 + vxor 16, 30, 27 + vaddudm 30, 30, 31 + vxor 17, 30, 27 + vaddudm 30, 30, 31 + vxor 18, 30, 27 + vaddudm 30, 30, 31 + vxor 19, 30, 27 + vaddudm 30, 30, 31 + vxor 20, 30, 27 + vaddudm 30, 30, 31 + vxor 21, 30, 27 + vaddudm 30, 30, 31 + vxor 22, 30, 27 + + addi 12, 12, -128 + addi 11, 11, 128 + + bdnz Loop_8x_block_dec + + vmr 30, 29 + stxvb16x 30+32, 0, 7 # update IV + +Loop_last_block_dec: + cmpdi 12, 0 + beq aes_gcm_out + + # loop last few blocks + li 10, 16 + divdu 10, 12, 10 + + mtctr 10 + + lwz 10, 240(6) + + cmpdi 12, 16 + blt Final_block_dec + +Next_rem_block_dec: + lxvb16x 15, 0, 14 # load block + + Loop_aes_middle_1x + + xxlor 23+32, 10, 10 + + cmpdi 10, 10 + beq Do_next_1x_dec + + # 192 bits + xxlor 24+32, 11, 11 + + vcipher 15, 15, 23 + vcipher 15, 15, 24 + + xxlor 23+32, 12, 12 + + cmpdi 10, 12 + beq Do_next_1x_dec + + # 256 bits + xxlor 24+32, 13, 13 + + vcipher 15, 15, 23 + vcipher 15, 15, 24 + + xxlor 23+32, 14, 14 + + cmpdi 10, 14 + beq Do_next_1x_dec + +Do_next_1x_dec: + vcipherlast 15, 15, 23 + + xxlxor 47, 47, 15 + stxvb16x 47, 0, 9 # store output + addi 14, 14, 16 + addi 9, 9, 16 + + xxlor 28+32, 15, 15 + #vmr 28, 15 + ppc_update_hash_1x + + addi 12, 12, -16 + addi 11, 11, 16 + xxlor 19+32, 0, 0 + vaddudm 30, 30, 31 # IV + counter + vxor 15, 30, 19 # add round key + + bdnz Next_rem_block_dec + + li 15, 0 + std 15, 56(7) # clear partial? + stxvb16x 30+32, 0, 7 # update IV + cmpdi 12, 0 + beq aes_gcm_out + +Final_block_dec: + lwz 10, 240(6) + Loop_aes_middle_1x + + xxlor 23+32, 10, 10 + + cmpdi 10, 10 + beq Do_final_1x_dec + + # 192 bits + xxlor 24+32, 11, 11 + + vcipher 15, 15, 23 + vcipher 15, 15, 24 + + xxlor 23+32, 12, 12 + + cmpdi 10, 12 + beq Do_final_1x_dec + + # 256 bits + xxlor 24+32, 13, 13 + + vcipher 15, 15, 23 + vcipher 15, 15, 24 + + xxlor 23+32, 14, 14 + + cmpdi 10, 14 + beq Do_final_1x_dec + +Do_final_1x_dec: + vcipherlast 15, 15, 23 + + # check partial block + li 21, 1 # decrypt + ld 15, 56(7) # partial? + cmpdi 15, 0 + beq Normal_block_dec + bl Do_partial_block + cmpdi 12, 0 + ble aes_gcm_out + + b Continue_partial_check_dec + +Normal_block_dec: + lxvb16x 15, 0, 14 # load last block + xxlxor 47, 47, 15 + + # create partial block mask + li 15, 16 + sub 15, 15, 12 # index to the mask + + vspltisb 16, -1 # first 16 bytes - 0xffff...ff + vspltisb 17, 0 # second 16 bytes - 0x0000...00 + li 10, 192 + stvx 16, 10, 1 + addi 10, 10, 16 + stvx 17, 10, 1 + + addi 10, 1, 192 + lxvb16x 16, 15, 10 # load partial block mask + xxland 47, 47, 16 + + xxland 32+28, 15, 16 + #vmr 28, 15 + ppc_update_hash_1x + + # * should store only the remaining bytes. + bl Write_partial_block + + stxvb16x 30+32, 0, 7 # update IV + std 12, 56(7) # update partial? + li 16, 16 + + stxvb16x 32, 0, 8 # write out Xi + stxvb16x 32, 16, 8 # write out Xi + b aes_gcm_out From 3b47eccaaff4e4bb9e0c3684428c852422bd9bac Mon Sep 17 00:00:00 2001 From: Danny Tsen Date: Sun, 4 Dec 2022 19:34:56 -0500 Subject: [PATCH 029/156] crypto: p10-aes-gcm - Supporting functions for AES This code is taken from CRYPTOGAMs[1]. The following functions are used, aes_p8_set_encrypt_key is used to generate AES round keys and aes_p8_encrypt is used to encrypt single block. Signed-off-by: Danny Tsen Signed-off-by: Herbert Xu --- arch/powerpc/crypto/aesp8-ppc.pl | 3846 ++++++++++++++++++++++++++++++ 1 file changed, 3846 insertions(+) create mode 100644 arch/powerpc/crypto/aesp8-ppc.pl diff --git a/arch/powerpc/crypto/aesp8-ppc.pl b/arch/powerpc/crypto/aesp8-ppc.pl new file mode 100644 index 000000000000..50a0a18f35da --- /dev/null +++ b/arch/powerpc/crypto/aesp8-ppc.pl @@ -0,0 +1,3846 @@ +#! /usr/bin/env perl +# SPDX-License-Identifier: GPL-2.0 + +# This code is taken from CRYPTOGAMs[1] and is included here using the option +# in the license to distribute the code under the GPL. Therefore this program +# is free software; you can redistribute it and/or modify it under the terms of +# the GNU General Public License version 2 as published by the Free Software +# Foundation. +# +# [1] https://www.openssl.org/~appro/cryptogams/ + +# Copyright (c) 2006-2017, CRYPTOGAMS by +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain copyright notices, +# this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# * Neither the name of the CRYPTOGAMS nor the names of its +# copyright holder and contributors may be used to endorse or +# promote products derived from this software without specific +# prior written permission. +# +# ALTERNATIVELY, provided that this notice is retained in full, this +# product may be distributed under the terms of the GNU General Public +# License (GPL), in which case the provisions of the GPL apply INSTEAD OF +# those given above. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see https://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# This module implements support for AES instructions as per PowerISA +# specification version 2.07, first implemented by POWER8 processor. +# The module is endian-agnostic in sense that it supports both big- +# and little-endian cases. Data alignment in parallelizable modes is +# handled with VSX loads and stores, which implies MSR.VSX flag being +# set. It should also be noted that ISA specification doesn't prohibit +# alignment exceptions for these instructions on page boundaries. +# Initially alignment was handled in pure AltiVec/VMX way [when data +# is aligned programmatically, which in turn guarantees exception- +# free execution], but it turned to hamper performance when vcipher +# instructions are interleaved. It's reckoned that eventual +# misalignment penalties at page boundaries are in average lower +# than additional overhead in pure AltiVec approach. +# +# May 2016 +# +# Add XTS subroutine, 9x on little- and 12x improvement on big-endian +# systems were measured. +# +###################################################################### +# Current large-block performance in cycles per byte processed with +# 128-bit key (less is better). +# +# CBC en-/decrypt CTR XTS +# POWER8[le] 3.96/0.72 0.74 1.1 +# POWER8[be] 3.75/0.65 0.66 1.0 + +$flavour = shift; + +if ($flavour =~ /64/) { + $SIZE_T =8; + $LRSAVE =2*$SIZE_T; + $STU ="stdu"; + $POP ="ld"; + $PUSH ="std"; + $UCMP ="cmpld"; + $SHL ="sldi"; +} elsif ($flavour =~ /32/) { + $SIZE_T =4; + $LRSAVE =$SIZE_T; + $STU ="stwu"; + $POP ="lwz"; + $PUSH ="stw"; + $UCMP ="cmplw"; + $SHL ="slwi"; +} else { die "nonsense $flavour"; } + +$LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0; + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or +die "can't locate ppc-xlate.pl"; + +open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!"; + +$FRAME=8*$SIZE_T; +$prefix="aes_p8"; + +$sp="r1"; +$vrsave="r12"; + +######################################################################### +{{{ # Key setup procedures # +my ($inp,$bits,$out,$ptr,$cnt,$rounds)=map("r$_",(3..8)); +my ($zero,$in0,$in1,$key,$rcon,$mask,$tmp)=map("v$_",(0..6)); +my ($stage,$outperm,$outmask,$outhead,$outtail)=map("v$_",(7..11)); + +$code.=<<___; +.machine "any" + +.text + +.align 7 +rcon: +.long 0x01000000, 0x01000000, 0x01000000, 0x01000000 ?rev +.long 0x1b000000, 0x1b000000, 0x1b000000, 0x1b000000 ?rev +.long 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c ?rev +.long 0,0,0,0 ?asis +Lconsts: + mflr r0 + bcl 20,31,\$+4 + mflr $ptr #vvvvv "distance between . and rcon + addi $ptr,$ptr,-0x48 + mtlr r0 + blr + .long 0 + .byte 0,12,0x14,0,0,0,0,0 +.asciz "AES for PowerISA 2.07, CRYPTOGAMS by " + +.globl .${prefix}_set_encrypt_key +Lset_encrypt_key: + mflr r11 + $PUSH r11,$LRSAVE($sp) + + li $ptr,-1 + ${UCMP}i $inp,0 + beq- Lenc_key_abort # if ($inp==0) return -1; + ${UCMP}i $out,0 + beq- Lenc_key_abort # if ($out==0) return -1; + li $ptr,-2 + cmpwi $bits,128 + blt- Lenc_key_abort + cmpwi $bits,256 + bgt- Lenc_key_abort + andi. r0,$bits,0x3f + bne- Lenc_key_abort + + lis r0,0xfff0 + mfspr $vrsave,256 + mtspr 256,r0 + + bl Lconsts + mtlr r11 + + neg r9,$inp + lvx $in0,0,$inp + addi $inp,$inp,15 # 15 is not typo + lvsr $key,0,r9 # borrow $key + li r8,0x20 + cmpwi $bits,192 + lvx $in1,0,$inp + le?vspltisb $mask,0x0f # borrow $mask + lvx $rcon,0,$ptr + le?vxor $key,$key,$mask # adjust for byte swap + lvx $mask,r8,$ptr + addi $ptr,$ptr,0x10 + vperm $in0,$in0,$in1,$key # align [and byte swap in LE] + li $cnt,8 + vxor $zero,$zero,$zero + mtctr $cnt + + ?lvsr $outperm,0,$out + vspltisb $outmask,-1 + lvx $outhead,0,$out + ?vperm $outmask,$zero,$outmask,$outperm + + blt Loop128 + addi $inp,$inp,8 + beq L192 + addi $inp,$inp,8 + b L256 + +.align 4 +Loop128: + vperm $key,$in0,$in0,$mask # rotate-n-splat + vsldoi $tmp,$zero,$in0,12 # >>32 + vperm $outtail,$in0,$in0,$outperm # rotate + vsel $stage,$outhead,$outtail,$outmask + vmr $outhead,$outtail + vcipherlast $key,$key,$rcon + stvx $stage,0,$out + addi $out,$out,16 + + vxor $in0,$in0,$tmp + vsldoi $tmp,$zero,$tmp,12 # >>32 + vxor $in0,$in0,$tmp + vsldoi $tmp,$zero,$tmp,12 # >>32 + vxor $in0,$in0,$tmp + vadduwm $rcon,$rcon,$rcon + vxor $in0,$in0,$key + bdnz Loop128 + + lvx $rcon,0,$ptr # last two round keys + + vperm $key,$in0,$in0,$mask # rotate-n-splat + vsldoi $tmp,$zero,$in0,12 # >>32 + vperm $outtail,$in0,$in0,$outperm # rotate + vsel $stage,$outhead,$outtail,$outmask + vmr $outhead,$outtail + vcipherlast $key,$key,$rcon + stvx $stage,0,$out + addi $out,$out,16 + + vxor $in0,$in0,$tmp + vsldoi $tmp,$zero,$tmp,12 # >>32 + vxor $in0,$in0,$tmp + vsldoi $tmp,$zero,$tmp,12 # >>32 + vxor $in0,$in0,$tmp + vadduwm $rcon,$rcon,$rcon + vxor $in0,$in0,$key + + vperm $key,$in0,$in0,$mask # rotate-n-splat + vsldoi $tmp,$zero,$in0,12 # >>32 + vperm $outtail,$in0,$in0,$outperm # rotate + vsel $stage,$outhead,$outtail,$outmask + vmr $outhead,$outtail + vcipherlast $key,$key,$rcon + stvx $stage,0,$out + addi $out,$out,16 + + vxor $in0,$in0,$tmp + vsldoi $tmp,$zero,$tmp,12 # >>32 + vxor $in0,$in0,$tmp + vsldoi $tmp,$zero,$tmp,12 # >>32 + vxor $in0,$in0,$tmp + vxor $in0,$in0,$key + vperm $outtail,$in0,$in0,$outperm # rotate + vsel $stage,$outhead,$outtail,$outmask + vmr $outhead,$outtail + stvx $stage,0,$out + + addi $inp,$out,15 # 15 is not typo + addi $out,$out,0x50 + + li $rounds,10 + b Ldone + +.align 4 +L192: + lvx $tmp,0,$inp + li $cnt,4 + vperm $outtail,$in0,$in0,$outperm # rotate + vsel $stage,$outhead,$outtail,$outmask + vmr $outhead,$outtail + stvx $stage,0,$out + addi $out,$out,16 + vperm $in1,$in1,$tmp,$key # align [and byte swap in LE] + vspltisb $key,8 # borrow $key + mtctr $cnt + vsububm $mask,$mask,$key # adjust the mask + +Loop192: + vperm $key,$in1,$in1,$mask # roate-n-splat + vsldoi $tmp,$zero,$in0,12 # >>32 + vcipherlast $key,$key,$rcon + + vxor $in0,$in0,$tmp + vsldoi $tmp,$zero,$tmp,12 # >>32 + vxor $in0,$in0,$tmp + vsldoi $tmp,$zero,$tmp,12 # >>32 + vxor $in0,$in0,$tmp + + vsldoi $stage,$zero,$in1,8 + vspltw $tmp,$in0,3 + vxor $tmp,$tmp,$in1 + vsldoi $in1,$zero,$in1,12 # >>32 + vadduwm $rcon,$rcon,$rcon + vxor $in1,$in1,$tmp + vxor $in0,$in0,$key + vxor $in1,$in1,$key + vsldoi $stage,$stage,$in0,8 + + vperm $key,$in1,$in1,$mask # rotate-n-splat + vsldoi $tmp,$zero,$in0,12 # >>32 + vperm $outtail,$stage,$stage,$outperm # rotate + vsel $stage,$outhead,$outtail,$outmask + vmr $outhead,$outtail + vcipherlast $key,$key,$rcon + stvx $stage,0,$out + addi $out,$out,16 + + vsldoi $stage,$in0,$in1,8 + vxor $in0,$in0,$tmp + vsldoi $tmp,$zero,$tmp,12 # >>32 + vperm $outtail,$stage,$stage,$outperm # rotate + vsel $stage,$outhead,$outtail,$outmask + vmr $outhead,$outtail + vxor $in0,$in0,$tmp + vsldoi $tmp,$zero,$tmp,12 # >>32 + vxor $in0,$in0,$tmp + stvx $stage,0,$out + addi $out,$out,16 + + vspltw $tmp,$in0,3 + vxor $tmp,$tmp,$in1 + vsldoi $in1,$zero,$in1,12 # >>32 + vadduwm $rcon,$rcon,$rcon + vxor $in1,$in1,$tmp + vxor $in0,$in0,$key + vxor $in1,$in1,$key + vperm $outtail,$in0,$in0,$outperm # rotate + vsel $stage,$outhead,$outtail,$outmask + vmr $outhead,$outtail + stvx $stage,0,$out + addi $inp,$out,15 # 15 is not typo + addi $out,$out,16 + bdnz Loop192 + + li $rounds,12 + addi $out,$out,0x20 + b Ldone + +.align 4 +L256: + lvx $tmp,0,$inp + li $cnt,7 + li $rounds,14 + vperm $outtail,$in0,$in0,$outperm # rotate + vsel $stage,$outhead,$outtail,$outmask + vmr $outhead,$outtail + stvx $stage,0,$out + addi $out,$out,16 + vperm $in1,$in1,$tmp,$key # align [and byte swap in LE] + mtctr $cnt + +Loop256: + vperm $key,$in1,$in1,$mask # rotate-n-splat + vsldoi $tmp,$zero,$in0,12 # >>32 + vperm $outtail,$in1,$in1,$outperm # rotate + vsel $stage,$outhead,$outtail,$outmask + vmr $outhead,$outtail + vcipherlast $key,$key,$rcon + stvx $stage,0,$out + addi $out,$out,16 + + vxor $in0,$in0,$tmp + vsldoi $tmp,$zero,$tmp,12 # >>32 + vxor $in0,$in0,$tmp + vsldoi $tmp,$zero,$tmp,12 # >>32 + vxor $in0,$in0,$tmp + vadduwm $rcon,$rcon,$rcon + vxor $in0,$in0,$key + vperm $outtail,$in0,$in0,$outperm # rotate + vsel $stage,$outhead,$outtail,$outmask + vmr $outhead,$outtail + stvx $stage,0,$out + addi $inp,$out,15 # 15 is not typo + addi $out,$out,16 + bdz Ldone + + vspltw $key,$in0,3 # just splat + vsldoi $tmp,$zero,$in1,12 # >>32 + vsbox $key,$key + + vxor $in1,$in1,$tmp + vsldoi $tmp,$zero,$tmp,12 # >>32 + vxor $in1,$in1,$tmp + vsldoi $tmp,$zero,$tmp,12 # >>32 + vxor $in1,$in1,$tmp + + vxor $in1,$in1,$key + b Loop256 + +.align 4 +Ldone: + lvx $in1,0,$inp # redundant in aligned case + vsel $in1,$outhead,$in1,$outmask + stvx $in1,0,$inp + li $ptr,0 + mtspr 256,$vrsave + stw $rounds,0($out) + +Lenc_key_abort: + mr r3,$ptr + blr + .long 0 + .byte 0,12,0x14,1,0,0,3,0 + .long 0 +.size .${prefix}_set_encrypt_key,.-.${prefix}_set_encrypt_key + +.globl .${prefix}_set_decrypt_key + $STU $sp,-$FRAME($sp) + mflr r10 + $PUSH r10,$FRAME+$LRSAVE($sp) + bl Lset_encrypt_key + mtlr r10 + + cmpwi r3,0 + bne- Ldec_key_abort + + slwi $cnt,$rounds,4 + subi $inp,$out,240 # first round key + srwi $rounds,$rounds,1 + add $out,$inp,$cnt # last round key + mtctr $rounds + +Ldeckey: + lwz r0, 0($inp) + lwz r6, 4($inp) + lwz r7, 8($inp) + lwz r8, 12($inp) + addi $inp,$inp,16 + lwz r9, 0($out) + lwz r10,4($out) + lwz r11,8($out) + lwz r12,12($out) + stw r0, 0($out) + stw r6, 4($out) + stw r7, 8($out) + stw r8, 12($out) + subi $out,$out,16 + stw r9, -16($inp) + stw r10,-12($inp) + stw r11,-8($inp) + stw r12,-4($inp) + bdnz Ldeckey + + xor r3,r3,r3 # return value +Ldec_key_abort: + addi $sp,$sp,$FRAME + blr + .long 0 + .byte 0,12,4,1,0x80,0,3,0 + .long 0 +.size .${prefix}_set_decrypt_key,.-.${prefix}_set_decrypt_key +___ +}}} +######################################################################### +{{{ # Single block en- and decrypt procedures # +sub gen_block () { +my $dir = shift; +my $n = $dir eq "de" ? "n" : ""; +my ($inp,$out,$key,$rounds,$idx)=map("r$_",(3..7)); + +$code.=<<___; +.globl .${prefix}_${dir}crypt + lwz $rounds,240($key) + lis r0,0xfc00 + mfspr $vrsave,256 + li $idx,15 # 15 is not typo + mtspr 256,r0 + + lvx v0,0,$inp + neg r11,$out + lvx v1,$idx,$inp + lvsl v2,0,$inp # inpperm + le?vspltisb v4,0x0f + ?lvsl v3,0,r11 # outperm + le?vxor v2,v2,v4 + li $idx,16 + vperm v0,v0,v1,v2 # align [and byte swap in LE] + lvx v1,0,$key + ?lvsl v5,0,$key # keyperm + srwi $rounds,$rounds,1 + lvx v2,$idx,$key + addi $idx,$idx,16 + subi $rounds,$rounds,1 + ?vperm v1,v1,v2,v5 # align round key + + vxor v0,v0,v1 + lvx v1,$idx,$key + addi $idx,$idx,16 + mtctr $rounds + +Loop_${dir}c: + ?vperm v2,v2,v1,v5 + v${n}cipher v0,v0,v2 + lvx v2,$idx,$key + addi $idx,$idx,16 + ?vperm v1,v1,v2,v5 + v${n}cipher v0,v0,v1 + lvx v1,$idx,$key + addi $idx,$idx,16 + bdnz Loop_${dir}c + + ?vperm v2,v2,v1,v5 + v${n}cipher v0,v0,v2 + lvx v2,$idx,$key + ?vperm v1,v1,v2,v5 + v${n}cipherlast v0,v0,v1 + + vspltisb v2,-1 + vxor v1,v1,v1 + li $idx,15 # 15 is not typo + ?vperm v2,v1,v2,v3 # outmask + le?vxor v3,v3,v4 + lvx v1,0,$out # outhead + vperm v0,v0,v0,v3 # rotate [and byte swap in LE] + vsel v1,v1,v0,v2 + lvx v4,$idx,$out + stvx v1,0,$out + vsel v0,v0,v4,v2 + stvx v0,$idx,$out + + mtspr 256,$vrsave + blr + .long 0 + .byte 0,12,0x14,0,0,0,3,0 + .long 0 +.size .${prefix}_${dir}crypt,.-.${prefix}_${dir}crypt +___ +} +&gen_block("en"); +&gen_block("de"); +}}} +######################################################################### +{{{ # CBC en- and decrypt procedures # +my ($inp,$out,$len,$key,$ivp,$enc,$rounds,$idx)=map("r$_",(3..10)); +my ($rndkey0,$rndkey1,$inout,$tmp)= map("v$_",(0..3)); +my ($ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm)= + map("v$_",(4..10)); +$code.=<<___; +.globl .${prefix}_cbc_encrypt + ${UCMP}i $len,16 + bltlr- + + cmpwi $enc,0 # test direction + lis r0,0xffe0 + mfspr $vrsave,256 + mtspr 256,r0 + + li $idx,15 + vxor $rndkey0,$rndkey0,$rndkey0 + le?vspltisb $tmp,0x0f + + lvx $ivec,0,$ivp # load [unaligned] iv + lvsl $inpperm,0,$ivp + lvx $inptail,$idx,$ivp + le?vxor $inpperm,$inpperm,$tmp + vperm $ivec,$ivec,$inptail,$inpperm + + neg r11,$inp + ?lvsl $keyperm,0,$key # prepare for unaligned key + lwz $rounds,240($key) + + lvsr $inpperm,0,r11 # prepare for unaligned load + lvx $inptail,0,$inp + addi $inp,$inp,15 # 15 is not typo + le?vxor $inpperm,$inpperm,$tmp + + ?lvsr $outperm,0,$out # prepare for unaligned store + vspltisb $outmask,-1 + lvx $outhead,0,$out + ?vperm $outmask,$rndkey0,$outmask,$outperm + le?vxor $outperm,$outperm,$tmp + + srwi $rounds,$rounds,1 + li $idx,16 + subi $rounds,$rounds,1 + beq Lcbc_dec + +Lcbc_enc: + vmr $inout,$inptail + lvx $inptail,0,$inp + addi $inp,$inp,16 + mtctr $rounds + subi $len,$len,16 # len-=16 + + lvx $rndkey0,0,$key + vperm $inout,$inout,$inptail,$inpperm + lvx $rndkey1,$idx,$key + addi $idx,$idx,16 + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vxor $inout,$inout,$rndkey0 + lvx $rndkey0,$idx,$key + addi $idx,$idx,16 + vxor $inout,$inout,$ivec + +Loop_cbc_enc: + ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm + vcipher $inout,$inout,$rndkey1 + lvx $rndkey1,$idx,$key + addi $idx,$idx,16 + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vcipher $inout,$inout,$rndkey0 + lvx $rndkey0,$idx,$key + addi $idx,$idx,16 + bdnz Loop_cbc_enc + + ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm + vcipher $inout,$inout,$rndkey1 + lvx $rndkey1,$idx,$key + li $idx,16 + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vcipherlast $ivec,$inout,$rndkey0 + ${UCMP}i $len,16 + + vperm $tmp,$ivec,$ivec,$outperm + vsel $inout,$outhead,$tmp,$outmask + vmr $outhead,$tmp + stvx $inout,0,$out + addi $out,$out,16 + bge Lcbc_enc + + b Lcbc_done + +.align 4 +Lcbc_dec: + ${UCMP}i $len,128 + bge _aesp8_cbc_decrypt8x + vmr $tmp,$inptail + lvx $inptail,0,$inp + addi $inp,$inp,16 + mtctr $rounds + subi $len,$len,16 # len-=16 + + lvx $rndkey0,0,$key + vperm $tmp,$tmp,$inptail,$inpperm + lvx $rndkey1,$idx,$key + addi $idx,$idx,16 + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vxor $inout,$tmp,$rndkey0 + lvx $rndkey0,$idx,$key + addi $idx,$idx,16 + +Loop_cbc_dec: + ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm + vncipher $inout,$inout,$rndkey1 + lvx $rndkey1,$idx,$key + addi $idx,$idx,16 + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vncipher $inout,$inout,$rndkey0 + lvx $rndkey0,$idx,$key + addi $idx,$idx,16 + bdnz Loop_cbc_dec + + ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm + vncipher $inout,$inout,$rndkey1 + lvx $rndkey1,$idx,$key + li $idx,16 + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vncipherlast $inout,$inout,$rndkey0 + ${UCMP}i $len,16 + + vxor $inout,$inout,$ivec + vmr $ivec,$tmp + vperm $tmp,$inout,$inout,$outperm + vsel $inout,$outhead,$tmp,$outmask + vmr $outhead,$tmp + stvx $inout,0,$out + addi $out,$out,16 + bge Lcbc_dec + +Lcbc_done: + addi $out,$out,-1 + lvx $inout,0,$out # redundant in aligned case + vsel $inout,$outhead,$inout,$outmask + stvx $inout,0,$out + + neg $enc,$ivp # write [unaligned] iv + li $idx,15 # 15 is not typo + vxor $rndkey0,$rndkey0,$rndkey0 + vspltisb $outmask,-1 + le?vspltisb $tmp,0x0f + ?lvsl $outperm,0,$enc + ?vperm $outmask,$rndkey0,$outmask,$outperm + le?vxor $outperm,$outperm,$tmp + lvx $outhead,0,$ivp + vperm $ivec,$ivec,$ivec,$outperm + vsel $inout,$outhead,$ivec,$outmask + lvx $inptail,$idx,$ivp + stvx $inout,0,$ivp + vsel $inout,$ivec,$inptail,$outmask + stvx $inout,$idx,$ivp + + mtspr 256,$vrsave + blr + .long 0 + .byte 0,12,0x14,0,0,0,6,0 + .long 0 +___ +######################################################################### +{{ # Optimized CBC decrypt procedure # +my $key_="r11"; +my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31)); +my ($in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7 )=map("v$_",(0..3,10..13)); +my ($out0,$out1,$out2,$out3,$out4,$out5,$out6,$out7)=map("v$_",(14..21)); +my $rndkey0="v23"; # v24-v25 rotating buffer for first found keys + # v26-v31 last 6 round keys +my ($tmp,$keyperm)=($in3,$in4); # aliases with "caller", redundant assignment + +$code.=<<___; +.align 5 +_aesp8_cbc_decrypt8x: + $STU $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp) + li r10,`$FRAME+8*16+15` + li r11,`$FRAME+8*16+31` + stvx v20,r10,$sp # ABI says so + addi r10,r10,32 + stvx v21,r11,$sp + addi r11,r11,32 + stvx v22,r10,$sp + addi r10,r10,32 + stvx v23,r11,$sp + addi r11,r11,32 + stvx v24,r10,$sp + addi r10,r10,32 + stvx v25,r11,$sp + addi r11,r11,32 + stvx v26,r10,$sp + addi r10,r10,32 + stvx v27,r11,$sp + addi r11,r11,32 + stvx v28,r10,$sp + addi r10,r10,32 + stvx v29,r11,$sp + addi r11,r11,32 + stvx v30,r10,$sp + stvx v31,r11,$sp + li r0,-1 + stw $vrsave,`$FRAME+21*16-4`($sp) # save vrsave + li $x10,0x10 + $PUSH r26,`$FRAME+21*16+0*$SIZE_T`($sp) + li $x20,0x20 + $PUSH r27,`$FRAME+21*16+1*$SIZE_T`($sp) + li $x30,0x30 + $PUSH r28,`$FRAME+21*16+2*$SIZE_T`($sp) + li $x40,0x40 + $PUSH r29,`$FRAME+21*16+3*$SIZE_T`($sp) + li $x50,0x50 + $PUSH r30,`$FRAME+21*16+4*$SIZE_T`($sp) + li $x60,0x60 + $PUSH r31,`$FRAME+21*16+5*$SIZE_T`($sp) + li $x70,0x70 + mtspr 256,r0 + + subi $rounds,$rounds,3 # -4 in total + subi $len,$len,128 # bias + + lvx $rndkey0,$x00,$key # load key schedule + lvx v30,$x10,$key + addi $key,$key,0x20 + lvx v31,$x00,$key + ?vperm $rndkey0,$rndkey0,v30,$keyperm + addi $key_,$sp,$FRAME+15 + mtctr $rounds + +Load_cbc_dec_key: + ?vperm v24,v30,v31,$keyperm + lvx v30,$x10,$key + addi $key,$key,0x20 + stvx v24,$x00,$key_ # off-load round[1] + ?vperm v25,v31,v30,$keyperm + lvx v31,$x00,$key + stvx v25,$x10,$key_ # off-load round[2] + addi $key_,$key_,0x20 + bdnz Load_cbc_dec_key + + lvx v26,$x10,$key + ?vperm v24,v30,v31,$keyperm + lvx v27,$x20,$key + stvx v24,$x00,$key_ # off-load round[3] + ?vperm v25,v31,v26,$keyperm + lvx v28,$x30,$key + stvx v25,$x10,$key_ # off-load round[4] + addi $key_,$sp,$FRAME+15 # rewind $key_ + ?vperm v26,v26,v27,$keyperm + lvx v29,$x40,$key + ?vperm v27,v27,v28,$keyperm + lvx v30,$x50,$key + ?vperm v28,v28,v29,$keyperm + lvx v31,$x60,$key + ?vperm v29,v29,v30,$keyperm + lvx $out0,$x70,$key # borrow $out0 + ?vperm v30,v30,v31,$keyperm + lvx v24,$x00,$key_ # pre-load round[1] + ?vperm v31,v31,$out0,$keyperm + lvx v25,$x10,$key_ # pre-load round[2] + + #lvx $inptail,0,$inp # "caller" already did this + #addi $inp,$inp,15 # 15 is not typo + subi $inp,$inp,15 # undo "caller" + + le?li $idx,8 + lvx_u $in0,$x00,$inp # load first 8 "words" + le?lvsl $inpperm,0,$idx + le?vspltisb $tmp,0x0f + lvx_u $in1,$x10,$inp + le?vxor $inpperm,$inpperm,$tmp # transform for lvx_u/stvx_u + lvx_u $in2,$x20,$inp + le?vperm $in0,$in0,$in0,$inpperm + lvx_u $in3,$x30,$inp + le?vperm $in1,$in1,$in1,$inpperm + lvx_u $in4,$x40,$inp + le?vperm $in2,$in2,$in2,$inpperm + vxor $out0,$in0,$rndkey0 + lvx_u $in5,$x50,$inp + le?vperm $in3,$in3,$in3,$inpperm + vxor $out1,$in1,$rndkey0 + lvx_u $in6,$x60,$inp + le?vperm $in4,$in4,$in4,$inpperm + vxor $out2,$in2,$rndkey0 + lvx_u $in7,$x70,$inp + addi $inp,$inp,0x80 + le?vperm $in5,$in5,$in5,$inpperm + vxor $out3,$in3,$rndkey0 + le?vperm $in6,$in6,$in6,$inpperm + vxor $out4,$in4,$rndkey0 + le?vperm $in7,$in7,$in7,$inpperm + vxor $out5,$in5,$rndkey0 + vxor $out6,$in6,$rndkey0 + vxor $out7,$in7,$rndkey0 + + mtctr $rounds + b Loop_cbc_dec8x +.align 5 +Loop_cbc_dec8x: + vncipher $out0,$out0,v24 + vncipher $out1,$out1,v24 + vncipher $out2,$out2,v24 + vncipher $out3,$out3,v24 + vncipher $out4,$out4,v24 + vncipher $out5,$out5,v24 + vncipher $out6,$out6,v24 + vncipher $out7,$out7,v24 + lvx v24,$x20,$key_ # round[3] + addi $key_,$key_,0x20 + + vncipher $out0,$out0,v25 + vncipher $out1,$out1,v25 + vncipher $out2,$out2,v25 + vncipher $out3,$out3,v25 + vncipher $out4,$out4,v25 + vncipher $out5,$out5,v25 + vncipher $out6,$out6,v25 + vncipher $out7,$out7,v25 + lvx v25,$x10,$key_ # round[4] + bdnz Loop_cbc_dec8x + + subic $len,$len,128 # $len-=128 + vncipher $out0,$out0,v24 + vncipher $out1,$out1,v24 + vncipher $out2,$out2,v24 + vncipher $out3,$out3,v24 + vncipher $out4,$out4,v24 + vncipher $out5,$out5,v24 + vncipher $out6,$out6,v24 + vncipher $out7,$out7,v24 + + subfe. r0,r0,r0 # borrow?-1:0 + vncipher $out0,$out0,v25 + vncipher $out1,$out1,v25 + vncipher $out2,$out2,v25 + vncipher $out3,$out3,v25 + vncipher $out4,$out4,v25 + vncipher $out5,$out5,v25 + vncipher $out6,$out6,v25 + vncipher $out7,$out7,v25 + + and r0,r0,$len + vncipher $out0,$out0,v26 + vncipher $out1,$out1,v26 + vncipher $out2,$out2,v26 + vncipher $out3,$out3,v26 + vncipher $out4,$out4,v26 + vncipher $out5,$out5,v26 + vncipher $out6,$out6,v26 + vncipher $out7,$out7,v26 + + add $inp,$inp,r0 # $inp is adjusted in such + # way that at exit from the + # loop inX-in7 are loaded + # with last "words" + vncipher $out0,$out0,v27 + vncipher $out1,$out1,v27 + vncipher $out2,$out2,v27 + vncipher $out3,$out3,v27 + vncipher $out4,$out4,v27 + vncipher $out5,$out5,v27 + vncipher $out6,$out6,v27 + vncipher $out7,$out7,v27 + + addi $key_,$sp,$FRAME+15 # rewind $key_ + vncipher $out0,$out0,v28 + vncipher $out1,$out1,v28 + vncipher $out2,$out2,v28 + vncipher $out3,$out3,v28 + vncipher $out4,$out4,v28 + vncipher $out5,$out5,v28 + vncipher $out6,$out6,v28 + vncipher $out7,$out7,v28 + lvx v24,$x00,$key_ # re-pre-load round[1] + + vncipher $out0,$out0,v29 + vncipher $out1,$out1,v29 + vncipher $out2,$out2,v29 + vncipher $out3,$out3,v29 + vncipher $out4,$out4,v29 + vncipher $out5,$out5,v29 + vncipher $out6,$out6,v29 + vncipher $out7,$out7,v29 + lvx v25,$x10,$key_ # re-pre-load round[2] + + vncipher $out0,$out0,v30 + vxor $ivec,$ivec,v31 # xor with last round key + vncipher $out1,$out1,v30 + vxor $in0,$in0,v31 + vncipher $out2,$out2,v30 + vxor $in1,$in1,v31 + vncipher $out3,$out3,v30 + vxor $in2,$in2,v31 + vncipher $out4,$out4,v30 + vxor $in3,$in3,v31 + vncipher $out5,$out5,v30 + vxor $in4,$in4,v31 + vncipher $out6,$out6,v30 + vxor $in5,$in5,v31 + vncipher $out7,$out7,v30 + vxor $in6,$in6,v31 + + vncipherlast $out0,$out0,$ivec + vncipherlast $out1,$out1,$in0 + lvx_u $in0,$x00,$inp # load next input block + vncipherlast $out2,$out2,$in1 + lvx_u $in1,$x10,$inp + vncipherlast $out3,$out3,$in2 + le?vperm $in0,$in0,$in0,$inpperm + lvx_u $in2,$x20,$inp + vncipherlast $out4,$out4,$in3 + le?vperm $in1,$in1,$in1,$inpperm + lvx_u $in3,$x30,$inp + vncipherlast $out5,$out5,$in4 + le?vperm $in2,$in2,$in2,$inpperm + lvx_u $in4,$x40,$inp + vncipherlast $out6,$out6,$in5 + le?vperm $in3,$in3,$in3,$inpperm + lvx_u $in5,$x50,$inp + vncipherlast $out7,$out7,$in6 + le?vperm $in4,$in4,$in4,$inpperm + lvx_u $in6,$x60,$inp + vmr $ivec,$in7 + le?vperm $in5,$in5,$in5,$inpperm + lvx_u $in7,$x70,$inp + addi $inp,$inp,0x80 + + le?vperm $out0,$out0,$out0,$inpperm + le?vperm $out1,$out1,$out1,$inpperm + stvx_u $out0,$x00,$out + le?vperm $in6,$in6,$in6,$inpperm + vxor $out0,$in0,$rndkey0 + le?vperm $out2,$out2,$out2,$inpperm + stvx_u $out1,$x10,$out + le?vperm $in7,$in7,$in7,$inpperm + vxor $out1,$in1,$rndkey0 + le?vperm $out3,$out3,$out3,$inpperm + stvx_u $out2,$x20,$out + vxor $out2,$in2,$rndkey0 + le?vperm $out4,$out4,$out4,$inpperm + stvx_u $out3,$x30,$out + vxor $out3,$in3,$rndkey0 + le?vperm $out5,$out5,$out5,$inpperm + stvx_u $out4,$x40,$out + vxor $out4,$in4,$rndkey0 + le?vperm $out6,$out6,$out6,$inpperm + stvx_u $out5,$x50,$out + vxor $out5,$in5,$rndkey0 + le?vperm $out7,$out7,$out7,$inpperm + stvx_u $out6,$x60,$out + vxor $out6,$in6,$rndkey0 + stvx_u $out7,$x70,$out + addi $out,$out,0x80 + vxor $out7,$in7,$rndkey0 + + mtctr $rounds + beq Loop_cbc_dec8x # did $len-=128 borrow? + + addic. $len,$len,128 + beq Lcbc_dec8x_done + nop + nop + +Loop_cbc_dec8x_tail: # up to 7 "words" tail... + vncipher $out1,$out1,v24 + vncipher $out2,$out2,v24 + vncipher $out3,$out3,v24 + vncipher $out4,$out4,v24 + vncipher $out5,$out5,v24 + vncipher $out6,$out6,v24 + vncipher $out7,$out7,v24 + lvx v24,$x20,$key_ # round[3] + addi $key_,$key_,0x20 + + vncipher $out1,$out1,v25 + vncipher $out2,$out2,v25 + vncipher $out3,$out3,v25 + vncipher $out4,$out4,v25 + vncipher $out5,$out5,v25 + vncipher $out6,$out6,v25 + vncipher $out7,$out7,v25 + lvx v25,$x10,$key_ # round[4] + bdnz Loop_cbc_dec8x_tail + + vncipher $out1,$out1,v24 + vncipher $out2,$out2,v24 + vncipher $out3,$out3,v24 + vncipher $out4,$out4,v24 + vncipher $out5,$out5,v24 + vncipher $out6,$out6,v24 + vncipher $out7,$out7,v24 + + vncipher $out1,$out1,v25 + vncipher $out2,$out2,v25 + vncipher $out3,$out3,v25 + vncipher $out4,$out4,v25 + vncipher $out5,$out5,v25 + vncipher $out6,$out6,v25 + vncipher $out7,$out7,v25 + + vncipher $out1,$out1,v26 + vncipher $out2,$out2,v26 + vncipher $out3,$out3,v26 + vncipher $out4,$out4,v26 + vncipher $out5,$out5,v26 + vncipher $out6,$out6,v26 + vncipher $out7,$out7,v26 + + vncipher $out1,$out1,v27 + vncipher $out2,$out2,v27 + vncipher $out3,$out3,v27 + vncipher $out4,$out4,v27 + vncipher $out5,$out5,v27 + vncipher $out6,$out6,v27 + vncipher $out7,$out7,v27 + + vncipher $out1,$out1,v28 + vncipher $out2,$out2,v28 + vncipher $out3,$out3,v28 + vncipher $out4,$out4,v28 + vncipher $out5,$out5,v28 + vncipher $out6,$out6,v28 + vncipher $out7,$out7,v28 + + vncipher $out1,$out1,v29 + vncipher $out2,$out2,v29 + vncipher $out3,$out3,v29 + vncipher $out4,$out4,v29 + vncipher $out5,$out5,v29 + vncipher $out6,$out6,v29 + vncipher $out7,$out7,v29 + + vncipher $out1,$out1,v30 + vxor $ivec,$ivec,v31 # last round key + vncipher $out2,$out2,v30 + vxor $in1,$in1,v31 + vncipher $out3,$out3,v30 + vxor $in2,$in2,v31 + vncipher $out4,$out4,v30 + vxor $in3,$in3,v31 + vncipher $out5,$out5,v30 + vxor $in4,$in4,v31 + vncipher $out6,$out6,v30 + vxor $in5,$in5,v31 + vncipher $out7,$out7,v30 + vxor $in6,$in6,v31 + + cmplwi $len,32 # switch($len) + blt Lcbc_dec8x_one + nop + beq Lcbc_dec8x_two + cmplwi $len,64 + blt Lcbc_dec8x_three + nop + beq Lcbc_dec8x_four + cmplwi $len,96 + blt Lcbc_dec8x_five + nop + beq Lcbc_dec8x_six + +Lcbc_dec8x_seven: + vncipherlast $out1,$out1,$ivec + vncipherlast $out2,$out2,$in1 + vncipherlast $out3,$out3,$in2 + vncipherlast $out4,$out4,$in3 + vncipherlast $out5,$out5,$in4 + vncipherlast $out6,$out6,$in5 + vncipherlast $out7,$out7,$in6 + vmr $ivec,$in7 + + le?vperm $out1,$out1,$out1,$inpperm + le?vperm $out2,$out2,$out2,$inpperm + stvx_u $out1,$x00,$out + le?vperm $out3,$out3,$out3,$inpperm + stvx_u $out2,$x10,$out + le?vperm $out4,$out4,$out4,$inpperm + stvx_u $out3,$x20,$out + le?vperm $out5,$out5,$out5,$inpperm + stvx_u $out4,$x30,$out + le?vperm $out6,$out6,$out6,$inpperm + stvx_u $out5,$x40,$out + le?vperm $out7,$out7,$out7,$inpperm + stvx_u $out6,$x50,$out + stvx_u $out7,$x60,$out + addi $out,$out,0x70 + b Lcbc_dec8x_done + +.align 5 +Lcbc_dec8x_six: + vncipherlast $out2,$out2,$ivec + vncipherlast $out3,$out3,$in2 + vncipherlast $out4,$out4,$in3 + vncipherlast $out5,$out5,$in4 + vncipherlast $out6,$out6,$in5 + vncipherlast $out7,$out7,$in6 + vmr $ivec,$in7 + + le?vperm $out2,$out2,$out2,$inpperm + le?vperm $out3,$out3,$out3,$inpperm + stvx_u $out2,$x00,$out + le?vperm $out4,$out4,$out4,$inpperm + stvx_u $out3,$x10,$out + le?vperm $out5,$out5,$out5,$inpperm + stvx_u $out4,$x20,$out + le?vperm $out6,$out6,$out6,$inpperm + stvx_u $out5,$x30,$out + le?vperm $out7,$out7,$out7,$inpperm + stvx_u $out6,$x40,$out + stvx_u $out7,$x50,$out + addi $out,$out,0x60 + b Lcbc_dec8x_done + +.align 5 +Lcbc_dec8x_five: + vncipherlast $out3,$out3,$ivec + vncipherlast $out4,$out4,$in3 + vncipherlast $out5,$out5,$in4 + vncipherlast $out6,$out6,$in5 + vncipherlast $out7,$out7,$in6 + vmr $ivec,$in7 + + le?vperm $out3,$out3,$out3,$inpperm + le?vperm $out4,$out4,$out4,$inpperm + stvx_u $out3,$x00,$out + le?vperm $out5,$out5,$out5,$inpperm + stvx_u $out4,$x10,$out + le?vperm $out6,$out6,$out6,$inpperm + stvx_u $out5,$x20,$out + le?vperm $out7,$out7,$out7,$inpperm + stvx_u $out6,$x30,$out + stvx_u $out7,$x40,$out + addi $out,$out,0x50 + b Lcbc_dec8x_done + +.align 5 +Lcbc_dec8x_four: + vncipherlast $out4,$out4,$ivec + vncipherlast $out5,$out5,$in4 + vncipherlast $out6,$out6,$in5 + vncipherlast $out7,$out7,$in6 + vmr $ivec,$in7 + + le?vperm $out4,$out4,$out4,$inpperm + le?vperm $out5,$out5,$out5,$inpperm + stvx_u $out4,$x00,$out + le?vperm $out6,$out6,$out6,$inpperm + stvx_u $out5,$x10,$out + le?vperm $out7,$out7,$out7,$inpperm + stvx_u $out6,$x20,$out + stvx_u $out7,$x30,$out + addi $out,$out,0x40 + b Lcbc_dec8x_done + +.align 5 +Lcbc_dec8x_three: + vncipherlast $out5,$out5,$ivec + vncipherlast $out6,$out6,$in5 + vncipherlast $out7,$out7,$in6 + vmr $ivec,$in7 + + le?vperm $out5,$out5,$out5,$inpperm + le?vperm $out6,$out6,$out6,$inpperm + stvx_u $out5,$x00,$out + le?vperm $out7,$out7,$out7,$inpperm + stvx_u $out6,$x10,$out + stvx_u $out7,$x20,$out + addi $out,$out,0x30 + b Lcbc_dec8x_done + +.align 5 +Lcbc_dec8x_two: + vncipherlast $out6,$out6,$ivec + vncipherlast $out7,$out7,$in6 + vmr $ivec,$in7 + + le?vperm $out6,$out6,$out6,$inpperm + le?vperm $out7,$out7,$out7,$inpperm + stvx_u $out6,$x00,$out + stvx_u $out7,$x10,$out + addi $out,$out,0x20 + b Lcbc_dec8x_done + +.align 5 +Lcbc_dec8x_one: + vncipherlast $out7,$out7,$ivec + vmr $ivec,$in7 + + le?vperm $out7,$out7,$out7,$inpperm + stvx_u $out7,0,$out + addi $out,$out,0x10 + +Lcbc_dec8x_done: + le?vperm $ivec,$ivec,$ivec,$inpperm + stvx_u $ivec,0,$ivp # write [unaligned] iv + + li r10,`$FRAME+15` + li r11,`$FRAME+31` + stvx $inpperm,r10,$sp # wipe copies of round keys + addi r10,r10,32 + stvx $inpperm,r11,$sp + addi r11,r11,32 + stvx $inpperm,r10,$sp + addi r10,r10,32 + stvx $inpperm,r11,$sp + addi r11,r11,32 + stvx $inpperm,r10,$sp + addi r10,r10,32 + stvx $inpperm,r11,$sp + addi r11,r11,32 + stvx $inpperm,r10,$sp + addi r10,r10,32 + stvx $inpperm,r11,$sp + addi r11,r11,32 + + mtspr 256,$vrsave + lvx v20,r10,$sp # ABI says so + addi r10,r10,32 + lvx v21,r11,$sp + addi r11,r11,32 + lvx v22,r10,$sp + addi r10,r10,32 + lvx v23,r11,$sp + addi r11,r11,32 + lvx v24,r10,$sp + addi r10,r10,32 + lvx v25,r11,$sp + addi r11,r11,32 + lvx v26,r10,$sp + addi r10,r10,32 + lvx v27,r11,$sp + addi r11,r11,32 + lvx v28,r10,$sp + addi r10,r10,32 + lvx v29,r11,$sp + addi r11,r11,32 + lvx v30,r10,$sp + lvx v31,r11,$sp + $POP r26,`$FRAME+21*16+0*$SIZE_T`($sp) + $POP r27,`$FRAME+21*16+1*$SIZE_T`($sp) + $POP r28,`$FRAME+21*16+2*$SIZE_T`($sp) + $POP r29,`$FRAME+21*16+3*$SIZE_T`($sp) + $POP r30,`$FRAME+21*16+4*$SIZE_T`($sp) + $POP r31,`$FRAME+21*16+5*$SIZE_T`($sp) + addi $sp,$sp,`$FRAME+21*16+6*$SIZE_T` + blr + .long 0 + .byte 0,12,0x14,0,0x80,6,6,0 + .long 0 +.size .${prefix}_cbc_encrypt,.-.${prefix}_cbc_encrypt +___ +}} }}} + +######################################################################### +{{{ # CTR procedure[s] # + +####################### WARNING: Here be dragons! ####################### +# +# This code is written as 'ctr32', based on a 32-bit counter used +# upstream. The kernel does *not* use a 32-bit counter. The kernel uses +# a 128-bit counter. +# +# This leads to subtle changes from the upstream code: the counter +# is incremented with vaddu_q_m rather than vaddu_w_m. This occurs in +# both the bulk (8 blocks at a time) path, and in the individual block +# path. Be aware of this when doing updates. +# +# See: +# 1d4aa0b4c181 ("crypto: vmx - Fixing AES-CTR counter bug") +# 009b30ac7444 ("crypto: vmx - CTR: always increment IV as quadword") +# https://github.com/openssl/openssl/pull/8942 +# +######################################################################### +my ($inp,$out,$len,$key,$ivp,$x10,$rounds,$idx)=map("r$_",(3..10)); +my ($rndkey0,$rndkey1,$inout,$tmp)= map("v$_",(0..3)); +my ($ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm,$one)= + map("v$_",(4..11)); +my $dat=$tmp; + +$code.=<<___; +.globl .${prefix}_ctr32_encrypt_blocks + ${UCMP}i $len,1 + bltlr- + + lis r0,0xfff0 + mfspr $vrsave,256 + mtspr 256,r0 + + li $idx,15 + vxor $rndkey0,$rndkey0,$rndkey0 + le?vspltisb $tmp,0x0f + + lvx $ivec,0,$ivp # load [unaligned] iv + lvsl $inpperm,0,$ivp + lvx $inptail,$idx,$ivp + vspltisb $one,1 + le?vxor $inpperm,$inpperm,$tmp + vperm $ivec,$ivec,$inptail,$inpperm + vsldoi $one,$rndkey0,$one,1 + + neg r11,$inp + ?lvsl $keyperm,0,$key # prepare for unaligned key + lwz $rounds,240($key) + + lvsr $inpperm,0,r11 # prepare for unaligned load + lvx $inptail,0,$inp + addi $inp,$inp,15 # 15 is not typo + le?vxor $inpperm,$inpperm,$tmp + + srwi $rounds,$rounds,1 + li $idx,16 + subi $rounds,$rounds,1 + + ${UCMP}i $len,8 + bge _aesp8_ctr32_encrypt8x + + ?lvsr $outperm,0,$out # prepare for unaligned store + vspltisb $outmask,-1 + lvx $outhead,0,$out + ?vperm $outmask,$rndkey0,$outmask,$outperm + le?vxor $outperm,$outperm,$tmp + + lvx $rndkey0,0,$key + mtctr $rounds + lvx $rndkey1,$idx,$key + addi $idx,$idx,16 + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vxor $inout,$ivec,$rndkey0 + lvx $rndkey0,$idx,$key + addi $idx,$idx,16 + b Loop_ctr32_enc + +.align 5 +Loop_ctr32_enc: + ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm + vcipher $inout,$inout,$rndkey1 + lvx $rndkey1,$idx,$key + addi $idx,$idx,16 + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vcipher $inout,$inout,$rndkey0 + lvx $rndkey0,$idx,$key + addi $idx,$idx,16 + bdnz Loop_ctr32_enc + + vadduqm $ivec,$ivec,$one # Kernel change for 128-bit + vmr $dat,$inptail + lvx $inptail,0,$inp + addi $inp,$inp,16 + subic. $len,$len,1 # blocks-- + + ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm + vcipher $inout,$inout,$rndkey1 + lvx $rndkey1,$idx,$key + vperm $dat,$dat,$inptail,$inpperm + li $idx,16 + ?vperm $rndkey1,$rndkey0,$rndkey1,$keyperm + lvx $rndkey0,0,$key + vxor $dat,$dat,$rndkey1 # last round key + vcipherlast $inout,$inout,$dat + + lvx $rndkey1,$idx,$key + addi $idx,$idx,16 + vperm $inout,$inout,$inout,$outperm + vsel $dat,$outhead,$inout,$outmask + mtctr $rounds + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vmr $outhead,$inout + vxor $inout,$ivec,$rndkey0 + lvx $rndkey0,$idx,$key + addi $idx,$idx,16 + stvx $dat,0,$out + addi $out,$out,16 + bne Loop_ctr32_enc + + addi $out,$out,-1 + lvx $inout,0,$out # redundant in aligned case + vsel $inout,$outhead,$inout,$outmask + stvx $inout,0,$out + + mtspr 256,$vrsave + blr + .long 0 + .byte 0,12,0x14,0,0,0,6,0 + .long 0 +___ +######################################################################### +{{ # Optimized CTR procedure # +my $key_="r11"; +my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31)); +my ($in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7 )=map("v$_",(0..3,10,12..14)); +my ($out0,$out1,$out2,$out3,$out4,$out5,$out6,$out7)=map("v$_",(15..22)); +my $rndkey0="v23"; # v24-v25 rotating buffer for first found keys + # v26-v31 last 6 round keys +my ($tmp,$keyperm)=($in3,$in4); # aliases with "caller", redundant assignment +my ($two,$three,$four)=($outhead,$outperm,$outmask); + +$code.=<<___; +.align 5 +_aesp8_ctr32_encrypt8x: + $STU $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp) + li r10,`$FRAME+8*16+15` + li r11,`$FRAME+8*16+31` + stvx v20,r10,$sp # ABI says so + addi r10,r10,32 + stvx v21,r11,$sp + addi r11,r11,32 + stvx v22,r10,$sp + addi r10,r10,32 + stvx v23,r11,$sp + addi r11,r11,32 + stvx v24,r10,$sp + addi r10,r10,32 + stvx v25,r11,$sp + addi r11,r11,32 + stvx v26,r10,$sp + addi r10,r10,32 + stvx v27,r11,$sp + addi r11,r11,32 + stvx v28,r10,$sp + addi r10,r10,32 + stvx v29,r11,$sp + addi r11,r11,32 + stvx v30,r10,$sp + stvx v31,r11,$sp + li r0,-1 + stw $vrsave,`$FRAME+21*16-4`($sp) # save vrsave + li $x10,0x10 + $PUSH r26,`$FRAME+21*16+0*$SIZE_T`($sp) + li $x20,0x20 + $PUSH r27,`$FRAME+21*16+1*$SIZE_T`($sp) + li $x30,0x30 + $PUSH r28,`$FRAME+21*16+2*$SIZE_T`($sp) + li $x40,0x40 + $PUSH r29,`$FRAME+21*16+3*$SIZE_T`($sp) + li $x50,0x50 + $PUSH r30,`$FRAME+21*16+4*$SIZE_T`($sp) + li $x60,0x60 + $PUSH r31,`$FRAME+21*16+5*$SIZE_T`($sp) + li $x70,0x70 + mtspr 256,r0 + + subi $rounds,$rounds,3 # -4 in total + + lvx $rndkey0,$x00,$key # load key schedule + lvx v30,$x10,$key + addi $key,$key,0x20 + lvx v31,$x00,$key + ?vperm $rndkey0,$rndkey0,v30,$keyperm + addi $key_,$sp,$FRAME+15 + mtctr $rounds + +Load_ctr32_enc_key: + ?vperm v24,v30,v31,$keyperm + lvx v30,$x10,$key + addi $key,$key,0x20 + stvx v24,$x00,$key_ # off-load round[1] + ?vperm v25,v31,v30,$keyperm + lvx v31,$x00,$key + stvx v25,$x10,$key_ # off-load round[2] + addi $key_,$key_,0x20 + bdnz Load_ctr32_enc_key + + lvx v26,$x10,$key + ?vperm v24,v30,v31,$keyperm + lvx v27,$x20,$key + stvx v24,$x00,$key_ # off-load round[3] + ?vperm v25,v31,v26,$keyperm + lvx v28,$x30,$key + stvx v25,$x10,$key_ # off-load round[4] + addi $key_,$sp,$FRAME+15 # rewind $key_ + ?vperm v26,v26,v27,$keyperm + lvx v29,$x40,$key + ?vperm v27,v27,v28,$keyperm + lvx v30,$x50,$key + ?vperm v28,v28,v29,$keyperm + lvx v31,$x60,$key + ?vperm v29,v29,v30,$keyperm + lvx $out0,$x70,$key # borrow $out0 + ?vperm v30,v30,v31,$keyperm + lvx v24,$x00,$key_ # pre-load round[1] + ?vperm v31,v31,$out0,$keyperm + lvx v25,$x10,$key_ # pre-load round[2] + + vadduqm $two,$one,$one + subi $inp,$inp,15 # undo "caller" + $SHL $len,$len,4 + + vadduqm $out1,$ivec,$one # counter values ... + vadduqm $out2,$ivec,$two # (do all ctr adds as 128-bit) + vxor $out0,$ivec,$rndkey0 # ... xored with rndkey[0] + le?li $idx,8 + vadduqm $out3,$out1,$two + vxor $out1,$out1,$rndkey0 + le?lvsl $inpperm,0,$idx + vadduqm $out4,$out2,$two + vxor $out2,$out2,$rndkey0 + le?vspltisb $tmp,0x0f + vadduqm $out5,$out3,$two + vxor $out3,$out3,$rndkey0 + le?vxor $inpperm,$inpperm,$tmp # transform for lvx_u/stvx_u + vadduqm $out6,$out4,$two + vxor $out4,$out4,$rndkey0 + vadduqm $out7,$out5,$two + vxor $out5,$out5,$rndkey0 + vadduqm $ivec,$out6,$two # next counter value + vxor $out6,$out6,$rndkey0 + vxor $out7,$out7,$rndkey0 + + mtctr $rounds + b Loop_ctr32_enc8x +.align 5 +Loop_ctr32_enc8x: + vcipher $out0,$out0,v24 + vcipher $out1,$out1,v24 + vcipher $out2,$out2,v24 + vcipher $out3,$out3,v24 + vcipher $out4,$out4,v24 + vcipher $out5,$out5,v24 + vcipher $out6,$out6,v24 + vcipher $out7,$out7,v24 +Loop_ctr32_enc8x_middle: + lvx v24,$x20,$key_ # round[3] + addi $key_,$key_,0x20 + + vcipher $out0,$out0,v25 + vcipher $out1,$out1,v25 + vcipher $out2,$out2,v25 + vcipher $out3,$out3,v25 + vcipher $out4,$out4,v25 + vcipher $out5,$out5,v25 + vcipher $out6,$out6,v25 + vcipher $out7,$out7,v25 + lvx v25,$x10,$key_ # round[4] + bdnz Loop_ctr32_enc8x + + subic r11,$len,256 # $len-256, borrow $key_ + vcipher $out0,$out0,v24 + vcipher $out1,$out1,v24 + vcipher $out2,$out2,v24 + vcipher $out3,$out3,v24 + vcipher $out4,$out4,v24 + vcipher $out5,$out5,v24 + vcipher $out6,$out6,v24 + vcipher $out7,$out7,v24 + + subfe r0,r0,r0 # borrow?-1:0 + vcipher $out0,$out0,v25 + vcipher $out1,$out1,v25 + vcipher $out2,$out2,v25 + vcipher $out3,$out3,v25 + vcipher $out4,$out4,v25 + vcipher $out5,$out5,v25 + vcipher $out6,$out6,v25 + vcipher $out7,$out7,v25 + + and r0,r0,r11 + addi $key_,$sp,$FRAME+15 # rewind $key_ + vcipher $out0,$out0,v26 + vcipher $out1,$out1,v26 + vcipher $out2,$out2,v26 + vcipher $out3,$out3,v26 + vcipher $out4,$out4,v26 + vcipher $out5,$out5,v26 + vcipher $out6,$out6,v26 + vcipher $out7,$out7,v26 + lvx v24,$x00,$key_ # re-pre-load round[1] + + subic $len,$len,129 # $len-=129 + vcipher $out0,$out0,v27 + addi $len,$len,1 # $len-=128 really + vcipher $out1,$out1,v27 + vcipher $out2,$out2,v27 + vcipher $out3,$out3,v27 + vcipher $out4,$out4,v27 + vcipher $out5,$out5,v27 + vcipher $out6,$out6,v27 + vcipher $out7,$out7,v27 + lvx v25,$x10,$key_ # re-pre-load round[2] + + vcipher $out0,$out0,v28 + lvx_u $in0,$x00,$inp # load input + vcipher $out1,$out1,v28 + lvx_u $in1,$x10,$inp + vcipher $out2,$out2,v28 + lvx_u $in2,$x20,$inp + vcipher $out3,$out3,v28 + lvx_u $in3,$x30,$inp + vcipher $out4,$out4,v28 + lvx_u $in4,$x40,$inp + vcipher $out5,$out5,v28 + lvx_u $in5,$x50,$inp + vcipher $out6,$out6,v28 + lvx_u $in6,$x60,$inp + vcipher $out7,$out7,v28 + lvx_u $in7,$x70,$inp + addi $inp,$inp,0x80 + + vcipher $out0,$out0,v29 + le?vperm $in0,$in0,$in0,$inpperm + vcipher $out1,$out1,v29 + le?vperm $in1,$in1,$in1,$inpperm + vcipher $out2,$out2,v29 + le?vperm $in2,$in2,$in2,$inpperm + vcipher $out3,$out3,v29 + le?vperm $in3,$in3,$in3,$inpperm + vcipher $out4,$out4,v29 + le?vperm $in4,$in4,$in4,$inpperm + vcipher $out5,$out5,v29 + le?vperm $in5,$in5,$in5,$inpperm + vcipher $out6,$out6,v29 + le?vperm $in6,$in6,$in6,$inpperm + vcipher $out7,$out7,v29 + le?vperm $in7,$in7,$in7,$inpperm + + add $inp,$inp,r0 # $inp is adjusted in such + # way that at exit from the + # loop inX-in7 are loaded + # with last "words" + subfe. r0,r0,r0 # borrow?-1:0 + vcipher $out0,$out0,v30 + vxor $in0,$in0,v31 # xor with last round key + vcipher $out1,$out1,v30 + vxor $in1,$in1,v31 + vcipher $out2,$out2,v30 + vxor $in2,$in2,v31 + vcipher $out3,$out3,v30 + vxor $in3,$in3,v31 + vcipher $out4,$out4,v30 + vxor $in4,$in4,v31 + vcipher $out5,$out5,v30 + vxor $in5,$in5,v31 + vcipher $out6,$out6,v30 + vxor $in6,$in6,v31 + vcipher $out7,$out7,v30 + vxor $in7,$in7,v31 + + bne Lctr32_enc8x_break # did $len-129 borrow? + + vcipherlast $in0,$out0,$in0 + vcipherlast $in1,$out1,$in1 + vadduqm $out1,$ivec,$one # counter values ... + vcipherlast $in2,$out2,$in2 + vadduqm $out2,$ivec,$two + vxor $out0,$ivec,$rndkey0 # ... xored with rndkey[0] + vcipherlast $in3,$out3,$in3 + vadduqm $out3,$out1,$two + vxor $out1,$out1,$rndkey0 + vcipherlast $in4,$out4,$in4 + vadduqm $out4,$out2,$two + vxor $out2,$out2,$rndkey0 + vcipherlast $in5,$out5,$in5 + vadduqm $out5,$out3,$two + vxor $out3,$out3,$rndkey0 + vcipherlast $in6,$out6,$in6 + vadduqm $out6,$out4,$two + vxor $out4,$out4,$rndkey0 + vcipherlast $in7,$out7,$in7 + vadduqm $out7,$out5,$two + vxor $out5,$out5,$rndkey0 + le?vperm $in0,$in0,$in0,$inpperm + vadduqm $ivec,$out6,$two # next counter value + vxor $out6,$out6,$rndkey0 + le?vperm $in1,$in1,$in1,$inpperm + vxor $out7,$out7,$rndkey0 + mtctr $rounds + + vcipher $out0,$out0,v24 + stvx_u $in0,$x00,$out + le?vperm $in2,$in2,$in2,$inpperm + vcipher $out1,$out1,v24 + stvx_u $in1,$x10,$out + le?vperm $in3,$in3,$in3,$inpperm + vcipher $out2,$out2,v24 + stvx_u $in2,$x20,$out + le?vperm $in4,$in4,$in4,$inpperm + vcipher $out3,$out3,v24 + stvx_u $in3,$x30,$out + le?vperm $in5,$in5,$in5,$inpperm + vcipher $out4,$out4,v24 + stvx_u $in4,$x40,$out + le?vperm $in6,$in6,$in6,$inpperm + vcipher $out5,$out5,v24 + stvx_u $in5,$x50,$out + le?vperm $in7,$in7,$in7,$inpperm + vcipher $out6,$out6,v24 + stvx_u $in6,$x60,$out + vcipher $out7,$out7,v24 + stvx_u $in7,$x70,$out + addi $out,$out,0x80 + + b Loop_ctr32_enc8x_middle + +.align 5 +Lctr32_enc8x_break: + cmpwi $len,-0x60 + blt Lctr32_enc8x_one + nop + beq Lctr32_enc8x_two + cmpwi $len,-0x40 + blt Lctr32_enc8x_three + nop + beq Lctr32_enc8x_four + cmpwi $len,-0x20 + blt Lctr32_enc8x_five + nop + beq Lctr32_enc8x_six + cmpwi $len,0x00 + blt Lctr32_enc8x_seven + +Lctr32_enc8x_eight: + vcipherlast $out0,$out0,$in0 + vcipherlast $out1,$out1,$in1 + vcipherlast $out2,$out2,$in2 + vcipherlast $out3,$out3,$in3 + vcipherlast $out4,$out4,$in4 + vcipherlast $out5,$out5,$in5 + vcipherlast $out6,$out6,$in6 + vcipherlast $out7,$out7,$in7 + + le?vperm $out0,$out0,$out0,$inpperm + le?vperm $out1,$out1,$out1,$inpperm + stvx_u $out0,$x00,$out + le?vperm $out2,$out2,$out2,$inpperm + stvx_u $out1,$x10,$out + le?vperm $out3,$out3,$out3,$inpperm + stvx_u $out2,$x20,$out + le?vperm $out4,$out4,$out4,$inpperm + stvx_u $out3,$x30,$out + le?vperm $out5,$out5,$out5,$inpperm + stvx_u $out4,$x40,$out + le?vperm $out6,$out6,$out6,$inpperm + stvx_u $out5,$x50,$out + le?vperm $out7,$out7,$out7,$inpperm + stvx_u $out6,$x60,$out + stvx_u $out7,$x70,$out + addi $out,$out,0x80 + b Lctr32_enc8x_done + +.align 5 +Lctr32_enc8x_seven: + vcipherlast $out0,$out0,$in1 + vcipherlast $out1,$out1,$in2 + vcipherlast $out2,$out2,$in3 + vcipherlast $out3,$out3,$in4 + vcipherlast $out4,$out4,$in5 + vcipherlast $out5,$out5,$in6 + vcipherlast $out6,$out6,$in7 + + le?vperm $out0,$out0,$out0,$inpperm + le?vperm $out1,$out1,$out1,$inpperm + stvx_u $out0,$x00,$out + le?vperm $out2,$out2,$out2,$inpperm + stvx_u $out1,$x10,$out + le?vperm $out3,$out3,$out3,$inpperm + stvx_u $out2,$x20,$out + le?vperm $out4,$out4,$out4,$inpperm + stvx_u $out3,$x30,$out + le?vperm $out5,$out5,$out5,$inpperm + stvx_u $out4,$x40,$out + le?vperm $out6,$out6,$out6,$inpperm + stvx_u $out5,$x50,$out + stvx_u $out6,$x60,$out + addi $out,$out,0x70 + b Lctr32_enc8x_done + +.align 5 +Lctr32_enc8x_six: + vcipherlast $out0,$out0,$in2 + vcipherlast $out1,$out1,$in3 + vcipherlast $out2,$out2,$in4 + vcipherlast $out3,$out3,$in5 + vcipherlast $out4,$out4,$in6 + vcipherlast $out5,$out5,$in7 + + le?vperm $out0,$out0,$out0,$inpperm + le?vperm $out1,$out1,$out1,$inpperm + stvx_u $out0,$x00,$out + le?vperm $out2,$out2,$out2,$inpperm + stvx_u $out1,$x10,$out + le?vperm $out3,$out3,$out3,$inpperm + stvx_u $out2,$x20,$out + le?vperm $out4,$out4,$out4,$inpperm + stvx_u $out3,$x30,$out + le?vperm $out5,$out5,$out5,$inpperm + stvx_u $out4,$x40,$out + stvx_u $out5,$x50,$out + addi $out,$out,0x60 + b Lctr32_enc8x_done + +.align 5 +Lctr32_enc8x_five: + vcipherlast $out0,$out0,$in3 + vcipherlast $out1,$out1,$in4 + vcipherlast $out2,$out2,$in5 + vcipherlast $out3,$out3,$in6 + vcipherlast $out4,$out4,$in7 + + le?vperm $out0,$out0,$out0,$inpperm + le?vperm $out1,$out1,$out1,$inpperm + stvx_u $out0,$x00,$out + le?vperm $out2,$out2,$out2,$inpperm + stvx_u $out1,$x10,$out + le?vperm $out3,$out3,$out3,$inpperm + stvx_u $out2,$x20,$out + le?vperm $out4,$out4,$out4,$inpperm + stvx_u $out3,$x30,$out + stvx_u $out4,$x40,$out + addi $out,$out,0x50 + b Lctr32_enc8x_done + +.align 5 +Lctr32_enc8x_four: + vcipherlast $out0,$out0,$in4 + vcipherlast $out1,$out1,$in5 + vcipherlast $out2,$out2,$in6 + vcipherlast $out3,$out3,$in7 + + le?vperm $out0,$out0,$out0,$inpperm + le?vperm $out1,$out1,$out1,$inpperm + stvx_u $out0,$x00,$out + le?vperm $out2,$out2,$out2,$inpperm + stvx_u $out1,$x10,$out + le?vperm $out3,$out3,$out3,$inpperm + stvx_u $out2,$x20,$out + stvx_u $out3,$x30,$out + addi $out,$out,0x40 + b Lctr32_enc8x_done + +.align 5 +Lctr32_enc8x_three: + vcipherlast $out0,$out0,$in5 + vcipherlast $out1,$out1,$in6 + vcipherlast $out2,$out2,$in7 + + le?vperm $out0,$out0,$out0,$inpperm + le?vperm $out1,$out1,$out1,$inpperm + stvx_u $out0,$x00,$out + le?vperm $out2,$out2,$out2,$inpperm + stvx_u $out1,$x10,$out + stvx_u $out2,$x20,$out + addi $out,$out,0x30 + b Lctr32_enc8x_done + +.align 5 +Lctr32_enc8x_two: + vcipherlast $out0,$out0,$in6 + vcipherlast $out1,$out1,$in7 + + le?vperm $out0,$out0,$out0,$inpperm + le?vperm $out1,$out1,$out1,$inpperm + stvx_u $out0,$x00,$out + stvx_u $out1,$x10,$out + addi $out,$out,0x20 + b Lctr32_enc8x_done + +.align 5 +Lctr32_enc8x_one: + vcipherlast $out0,$out0,$in7 + + le?vperm $out0,$out0,$out0,$inpperm + stvx_u $out0,0,$out + addi $out,$out,0x10 + +Lctr32_enc8x_done: + li r10,`$FRAME+15` + li r11,`$FRAME+31` + stvx $inpperm,r10,$sp # wipe copies of round keys + addi r10,r10,32 + stvx $inpperm,r11,$sp + addi r11,r11,32 + stvx $inpperm,r10,$sp + addi r10,r10,32 + stvx $inpperm,r11,$sp + addi r11,r11,32 + stvx $inpperm,r10,$sp + addi r10,r10,32 + stvx $inpperm,r11,$sp + addi r11,r11,32 + stvx $inpperm,r10,$sp + addi r10,r10,32 + stvx $inpperm,r11,$sp + addi r11,r11,32 + + mtspr 256,$vrsave + lvx v20,r10,$sp # ABI says so + addi r10,r10,32 + lvx v21,r11,$sp + addi r11,r11,32 + lvx v22,r10,$sp + addi r10,r10,32 + lvx v23,r11,$sp + addi r11,r11,32 + lvx v24,r10,$sp + addi r10,r10,32 + lvx v25,r11,$sp + addi r11,r11,32 + lvx v26,r10,$sp + addi r10,r10,32 + lvx v27,r11,$sp + addi r11,r11,32 + lvx v28,r10,$sp + addi r10,r10,32 + lvx v29,r11,$sp + addi r11,r11,32 + lvx v30,r10,$sp + lvx v31,r11,$sp + $POP r26,`$FRAME+21*16+0*$SIZE_T`($sp) + $POP r27,`$FRAME+21*16+1*$SIZE_T`($sp) + $POP r28,`$FRAME+21*16+2*$SIZE_T`($sp) + $POP r29,`$FRAME+21*16+3*$SIZE_T`($sp) + $POP r30,`$FRAME+21*16+4*$SIZE_T`($sp) + $POP r31,`$FRAME+21*16+5*$SIZE_T`($sp) + addi $sp,$sp,`$FRAME+21*16+6*$SIZE_T` + blr + .long 0 + .byte 0,12,0x14,0,0x80,6,6,0 + .long 0 +.size .${prefix}_ctr32_encrypt_blocks,.-.${prefix}_ctr32_encrypt_blocks +___ +}} }}} + +######################################################################### +{{{ # XTS procedures # +# int aes_p8_xts_[en|de]crypt(const char *inp, char *out, size_t len, # +# const AES_KEY *key1, const AES_KEY *key2, # +# [const] unsigned char iv[16]); # +# If $key2 is NULL, then a "tweak chaining" mode is engaged, in which # +# input tweak value is assumed to be encrypted already, and last tweak # +# value, one suitable for consecutive call on same chunk of data, is # +# written back to original buffer. In addition, in "tweak chaining" # +# mode only complete input blocks are processed. # + +my ($inp,$out,$len,$key1,$key2,$ivp,$rounds,$idx) = map("r$_",(3..10)); +my ($rndkey0,$rndkey1,$inout) = map("v$_",(0..2)); +my ($output,$inptail,$inpperm,$leperm,$keyperm) = map("v$_",(3..7)); +my ($tweak,$seven,$eighty7,$tmp,$tweak1) = map("v$_",(8..12)); +my $taillen = $key2; + + ($inp,$idx) = ($idx,$inp); # reassign + +$code.=<<___; +.globl .${prefix}_xts_encrypt + mr $inp,r3 # reassign + li r3,-1 + ${UCMP}i $len,16 + bltlr- + + lis r0,0xfff0 + mfspr r12,256 # save vrsave + li r11,0 + mtspr 256,r0 + + vspltisb $seven,0x07 # 0x070707..07 + le?lvsl $leperm,r11,r11 + le?vspltisb $tmp,0x0f + le?vxor $leperm,$leperm,$seven + + li $idx,15 + lvx $tweak,0,$ivp # load [unaligned] iv + lvsl $inpperm,0,$ivp + lvx $inptail,$idx,$ivp + le?vxor $inpperm,$inpperm,$tmp + vperm $tweak,$tweak,$inptail,$inpperm + + neg r11,$inp + lvsr $inpperm,0,r11 # prepare for unaligned load + lvx $inout,0,$inp + addi $inp,$inp,15 # 15 is not typo + le?vxor $inpperm,$inpperm,$tmp + + ${UCMP}i $key2,0 # key2==NULL? + beq Lxts_enc_no_key2 + + ?lvsl $keyperm,0,$key2 # prepare for unaligned key + lwz $rounds,240($key2) + srwi $rounds,$rounds,1 + subi $rounds,$rounds,1 + li $idx,16 + + lvx $rndkey0,0,$key2 + lvx $rndkey1,$idx,$key2 + addi $idx,$idx,16 + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vxor $tweak,$tweak,$rndkey0 + lvx $rndkey0,$idx,$key2 + addi $idx,$idx,16 + mtctr $rounds + +Ltweak_xts_enc: + ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm + vcipher $tweak,$tweak,$rndkey1 + lvx $rndkey1,$idx,$key2 + addi $idx,$idx,16 + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vcipher $tweak,$tweak,$rndkey0 + lvx $rndkey0,$idx,$key2 + addi $idx,$idx,16 + bdnz Ltweak_xts_enc + + ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm + vcipher $tweak,$tweak,$rndkey1 + lvx $rndkey1,$idx,$key2 + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vcipherlast $tweak,$tweak,$rndkey0 + + li $ivp,0 # don't chain the tweak + b Lxts_enc + +Lxts_enc_no_key2: + li $idx,-16 + and $len,$len,$idx # in "tweak chaining" + # mode only complete + # blocks are processed +Lxts_enc: + lvx $inptail,0,$inp + addi $inp,$inp,16 + + ?lvsl $keyperm,0,$key1 # prepare for unaligned key + lwz $rounds,240($key1) + srwi $rounds,$rounds,1 + subi $rounds,$rounds,1 + li $idx,16 + + vslb $eighty7,$seven,$seven # 0x808080..80 + vor $eighty7,$eighty7,$seven # 0x878787..87 + vspltisb $tmp,1 # 0x010101..01 + vsldoi $eighty7,$eighty7,$tmp,15 # 0x870101..01 + + ${UCMP}i $len,96 + bge _aesp8_xts_encrypt6x + + andi. $taillen,$len,15 + subic r0,$len,32 + subi $taillen,$taillen,16 + subfe r0,r0,r0 + and r0,r0,$taillen + add $inp,$inp,r0 + + lvx $rndkey0,0,$key1 + lvx $rndkey1,$idx,$key1 + addi $idx,$idx,16 + vperm $inout,$inout,$inptail,$inpperm + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vxor $inout,$inout,$tweak + vxor $inout,$inout,$rndkey0 + lvx $rndkey0,$idx,$key1 + addi $idx,$idx,16 + mtctr $rounds + b Loop_xts_enc + +.align 5 +Loop_xts_enc: + ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm + vcipher $inout,$inout,$rndkey1 + lvx $rndkey1,$idx,$key1 + addi $idx,$idx,16 + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vcipher $inout,$inout,$rndkey0 + lvx $rndkey0,$idx,$key1 + addi $idx,$idx,16 + bdnz Loop_xts_enc + + ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm + vcipher $inout,$inout,$rndkey1 + lvx $rndkey1,$idx,$key1 + li $idx,16 + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vxor $rndkey0,$rndkey0,$tweak + vcipherlast $output,$inout,$rndkey0 + + le?vperm $tmp,$output,$output,$leperm + be?nop + le?stvx_u $tmp,0,$out + be?stvx_u $output,0,$out + addi $out,$out,16 + + subic. $len,$len,16 + beq Lxts_enc_done + + vmr $inout,$inptail + lvx $inptail,0,$inp + addi $inp,$inp,16 + lvx $rndkey0,0,$key1 + lvx $rndkey1,$idx,$key1 + addi $idx,$idx,16 + + subic r0,$len,32 + subfe r0,r0,r0 + and r0,r0,$taillen + add $inp,$inp,r0 + + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + vand $tmp,$tmp,$eighty7 + vxor $tweak,$tweak,$tmp + + vperm $inout,$inout,$inptail,$inpperm + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vxor $inout,$inout,$tweak + vxor $output,$output,$rndkey0 # just in case $len<16 + vxor $inout,$inout,$rndkey0 + lvx $rndkey0,$idx,$key1 + addi $idx,$idx,16 + + mtctr $rounds + ${UCMP}i $len,16 + bge Loop_xts_enc + + vxor $output,$output,$tweak + lvsr $inpperm,0,$len # $inpperm is no longer needed + vxor $inptail,$inptail,$inptail # $inptail is no longer needed + vspltisb $tmp,-1 + vperm $inptail,$inptail,$tmp,$inpperm + vsel $inout,$inout,$output,$inptail + + subi r11,$out,17 + subi $out,$out,16 + mtctr $len + li $len,16 +Loop_xts_enc_steal: + lbzu r0,1(r11) + stb r0,16(r11) + bdnz Loop_xts_enc_steal + + mtctr $rounds + b Loop_xts_enc # one more time... + +Lxts_enc_done: + ${UCMP}i $ivp,0 + beq Lxts_enc_ret + + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + vand $tmp,$tmp,$eighty7 + vxor $tweak,$tweak,$tmp + + le?vperm $tweak,$tweak,$tweak,$leperm + stvx_u $tweak,0,$ivp + +Lxts_enc_ret: + mtspr 256,r12 # restore vrsave + li r3,0 + blr + .long 0 + .byte 0,12,0x04,0,0x80,6,6,0 + .long 0 +.size .${prefix}_xts_encrypt,.-.${prefix}_xts_encrypt + +.globl .${prefix}_xts_decrypt + mr $inp,r3 # reassign + li r3,-1 + ${UCMP}i $len,16 + bltlr- + + lis r0,0xfff8 + mfspr r12,256 # save vrsave + li r11,0 + mtspr 256,r0 + + andi. r0,$len,15 + neg r0,r0 + andi. r0,r0,16 + sub $len,$len,r0 + + vspltisb $seven,0x07 # 0x070707..07 + le?lvsl $leperm,r11,r11 + le?vspltisb $tmp,0x0f + le?vxor $leperm,$leperm,$seven + + li $idx,15 + lvx $tweak,0,$ivp # load [unaligned] iv + lvsl $inpperm,0,$ivp + lvx $inptail,$idx,$ivp + le?vxor $inpperm,$inpperm,$tmp + vperm $tweak,$tweak,$inptail,$inpperm + + neg r11,$inp + lvsr $inpperm,0,r11 # prepare for unaligned load + lvx $inout,0,$inp + addi $inp,$inp,15 # 15 is not typo + le?vxor $inpperm,$inpperm,$tmp + + ${UCMP}i $key2,0 # key2==NULL? + beq Lxts_dec_no_key2 + + ?lvsl $keyperm,0,$key2 # prepare for unaligned key + lwz $rounds,240($key2) + srwi $rounds,$rounds,1 + subi $rounds,$rounds,1 + li $idx,16 + + lvx $rndkey0,0,$key2 + lvx $rndkey1,$idx,$key2 + addi $idx,$idx,16 + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vxor $tweak,$tweak,$rndkey0 + lvx $rndkey0,$idx,$key2 + addi $idx,$idx,16 + mtctr $rounds + +Ltweak_xts_dec: + ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm + vcipher $tweak,$tweak,$rndkey1 + lvx $rndkey1,$idx,$key2 + addi $idx,$idx,16 + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vcipher $tweak,$tweak,$rndkey0 + lvx $rndkey0,$idx,$key2 + addi $idx,$idx,16 + bdnz Ltweak_xts_dec + + ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm + vcipher $tweak,$tweak,$rndkey1 + lvx $rndkey1,$idx,$key2 + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vcipherlast $tweak,$tweak,$rndkey0 + + li $ivp,0 # don't chain the tweak + b Lxts_dec + +Lxts_dec_no_key2: + neg $idx,$len + andi. $idx,$idx,15 + add $len,$len,$idx # in "tweak chaining" + # mode only complete + # blocks are processed +Lxts_dec: + lvx $inptail,0,$inp + addi $inp,$inp,16 + + ?lvsl $keyperm,0,$key1 # prepare for unaligned key + lwz $rounds,240($key1) + srwi $rounds,$rounds,1 + subi $rounds,$rounds,1 + li $idx,16 + + vslb $eighty7,$seven,$seven # 0x808080..80 + vor $eighty7,$eighty7,$seven # 0x878787..87 + vspltisb $tmp,1 # 0x010101..01 + vsldoi $eighty7,$eighty7,$tmp,15 # 0x870101..01 + + ${UCMP}i $len,96 + bge _aesp8_xts_decrypt6x + + lvx $rndkey0,0,$key1 + lvx $rndkey1,$idx,$key1 + addi $idx,$idx,16 + vperm $inout,$inout,$inptail,$inpperm + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vxor $inout,$inout,$tweak + vxor $inout,$inout,$rndkey0 + lvx $rndkey0,$idx,$key1 + addi $idx,$idx,16 + mtctr $rounds + + ${UCMP}i $len,16 + blt Ltail_xts_dec + be?b Loop_xts_dec + +.align 5 +Loop_xts_dec: + ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm + vncipher $inout,$inout,$rndkey1 + lvx $rndkey1,$idx,$key1 + addi $idx,$idx,16 + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vncipher $inout,$inout,$rndkey0 + lvx $rndkey0,$idx,$key1 + addi $idx,$idx,16 + bdnz Loop_xts_dec + + ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm + vncipher $inout,$inout,$rndkey1 + lvx $rndkey1,$idx,$key1 + li $idx,16 + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vxor $rndkey0,$rndkey0,$tweak + vncipherlast $output,$inout,$rndkey0 + + le?vperm $tmp,$output,$output,$leperm + be?nop + le?stvx_u $tmp,0,$out + be?stvx_u $output,0,$out + addi $out,$out,16 + + subic. $len,$len,16 + beq Lxts_dec_done + + vmr $inout,$inptail + lvx $inptail,0,$inp + addi $inp,$inp,16 + lvx $rndkey0,0,$key1 + lvx $rndkey1,$idx,$key1 + addi $idx,$idx,16 + + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + vand $tmp,$tmp,$eighty7 + vxor $tweak,$tweak,$tmp + + vperm $inout,$inout,$inptail,$inpperm + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vxor $inout,$inout,$tweak + vxor $inout,$inout,$rndkey0 + lvx $rndkey0,$idx,$key1 + addi $idx,$idx,16 + + mtctr $rounds + ${UCMP}i $len,16 + bge Loop_xts_dec + +Ltail_xts_dec: + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak1,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + vand $tmp,$tmp,$eighty7 + vxor $tweak1,$tweak1,$tmp + + subi $inp,$inp,16 + add $inp,$inp,$len + + vxor $inout,$inout,$tweak # :-( + vxor $inout,$inout,$tweak1 # :-) + +Loop_xts_dec_short: + ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm + vncipher $inout,$inout,$rndkey1 + lvx $rndkey1,$idx,$key1 + addi $idx,$idx,16 + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vncipher $inout,$inout,$rndkey0 + lvx $rndkey0,$idx,$key1 + addi $idx,$idx,16 + bdnz Loop_xts_dec_short + + ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm + vncipher $inout,$inout,$rndkey1 + lvx $rndkey1,$idx,$key1 + li $idx,16 + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vxor $rndkey0,$rndkey0,$tweak1 + vncipherlast $output,$inout,$rndkey0 + + le?vperm $tmp,$output,$output,$leperm + be?nop + le?stvx_u $tmp,0,$out + be?stvx_u $output,0,$out + + vmr $inout,$inptail + lvx $inptail,0,$inp + #addi $inp,$inp,16 + lvx $rndkey0,0,$key1 + lvx $rndkey1,$idx,$key1 + addi $idx,$idx,16 + vperm $inout,$inout,$inptail,$inpperm + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + + lvsr $inpperm,0,$len # $inpperm is no longer needed + vxor $inptail,$inptail,$inptail # $inptail is no longer needed + vspltisb $tmp,-1 + vperm $inptail,$inptail,$tmp,$inpperm + vsel $inout,$inout,$output,$inptail + + vxor $rndkey0,$rndkey0,$tweak + vxor $inout,$inout,$rndkey0 + lvx $rndkey0,$idx,$key1 + addi $idx,$idx,16 + + subi r11,$out,1 + mtctr $len + li $len,16 +Loop_xts_dec_steal: + lbzu r0,1(r11) + stb r0,16(r11) + bdnz Loop_xts_dec_steal + + mtctr $rounds + b Loop_xts_dec # one more time... + +Lxts_dec_done: + ${UCMP}i $ivp,0 + beq Lxts_dec_ret + + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + vand $tmp,$tmp,$eighty7 + vxor $tweak,$tweak,$tmp + + le?vperm $tweak,$tweak,$tweak,$leperm + stvx_u $tweak,0,$ivp + +Lxts_dec_ret: + mtspr 256,r12 # restore vrsave + li r3,0 + blr + .long 0 + .byte 0,12,0x04,0,0x80,6,6,0 + .long 0 +.size .${prefix}_xts_decrypt,.-.${prefix}_xts_decrypt +___ +######################################################################### +{{ # Optimized XTS procedures # +my $key_=$key2; +my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,3,26..31)); + $x00=0 if ($flavour =~ /osx/); +my ($in0, $in1, $in2, $in3, $in4, $in5 )=map("v$_",(0..5)); +my ($out0, $out1, $out2, $out3, $out4, $out5)=map("v$_",(7,12..16)); +my ($twk0, $twk1, $twk2, $twk3, $twk4, $twk5)=map("v$_",(17..22)); +my $rndkey0="v23"; # v24-v25 rotating buffer for first found keys + # v26-v31 last 6 round keys +my ($keyperm)=($out0); # aliases with "caller", redundant assignment +my $taillen=$x70; + +$code.=<<___; +.align 5 +_aesp8_xts_encrypt6x: + $STU $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp) + mflr r11 + li r7,`$FRAME+8*16+15` + li r3,`$FRAME+8*16+31` + $PUSH r11,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp) + stvx v20,r7,$sp # ABI says so + addi r7,r7,32 + stvx v21,r3,$sp + addi r3,r3,32 + stvx v22,r7,$sp + addi r7,r7,32 + stvx v23,r3,$sp + addi r3,r3,32 + stvx v24,r7,$sp + addi r7,r7,32 + stvx v25,r3,$sp + addi r3,r3,32 + stvx v26,r7,$sp + addi r7,r7,32 + stvx v27,r3,$sp + addi r3,r3,32 + stvx v28,r7,$sp + addi r7,r7,32 + stvx v29,r3,$sp + addi r3,r3,32 + stvx v30,r7,$sp + stvx v31,r3,$sp + li r0,-1 + stw $vrsave,`$FRAME+21*16-4`($sp) # save vrsave + li $x10,0x10 + $PUSH r26,`$FRAME+21*16+0*$SIZE_T`($sp) + li $x20,0x20 + $PUSH r27,`$FRAME+21*16+1*$SIZE_T`($sp) + li $x30,0x30 + $PUSH r28,`$FRAME+21*16+2*$SIZE_T`($sp) + li $x40,0x40 + $PUSH r29,`$FRAME+21*16+3*$SIZE_T`($sp) + li $x50,0x50 + $PUSH r30,`$FRAME+21*16+4*$SIZE_T`($sp) + li $x60,0x60 + $PUSH r31,`$FRAME+21*16+5*$SIZE_T`($sp) + li $x70,0x70 + mtspr 256,r0 + + subi $rounds,$rounds,3 # -4 in total + + lvx $rndkey0,$x00,$key1 # load key schedule + lvx v30,$x10,$key1 + addi $key1,$key1,0x20 + lvx v31,$x00,$key1 + ?vperm $rndkey0,$rndkey0,v30,$keyperm + addi $key_,$sp,$FRAME+15 + mtctr $rounds + +Load_xts_enc_key: + ?vperm v24,v30,v31,$keyperm + lvx v30,$x10,$key1 + addi $key1,$key1,0x20 + stvx v24,$x00,$key_ # off-load round[1] + ?vperm v25,v31,v30,$keyperm + lvx v31,$x00,$key1 + stvx v25,$x10,$key_ # off-load round[2] + addi $key_,$key_,0x20 + bdnz Load_xts_enc_key + + lvx v26,$x10,$key1 + ?vperm v24,v30,v31,$keyperm + lvx v27,$x20,$key1 + stvx v24,$x00,$key_ # off-load round[3] + ?vperm v25,v31,v26,$keyperm + lvx v28,$x30,$key1 + stvx v25,$x10,$key_ # off-load round[4] + addi $key_,$sp,$FRAME+15 # rewind $key_ + ?vperm v26,v26,v27,$keyperm + lvx v29,$x40,$key1 + ?vperm v27,v27,v28,$keyperm + lvx v30,$x50,$key1 + ?vperm v28,v28,v29,$keyperm + lvx v31,$x60,$key1 + ?vperm v29,v29,v30,$keyperm + lvx $twk5,$x70,$key1 # borrow $twk5 + ?vperm v30,v30,v31,$keyperm + lvx v24,$x00,$key_ # pre-load round[1] + ?vperm v31,v31,$twk5,$keyperm + lvx v25,$x10,$key_ # pre-load round[2] + + vperm $in0,$inout,$inptail,$inpperm + subi $inp,$inp,31 # undo "caller" + vxor $twk0,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + vand $tmp,$tmp,$eighty7 + vxor $out0,$in0,$twk0 + vxor $tweak,$tweak,$tmp + + lvx_u $in1,$x10,$inp + vxor $twk1,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + le?vperm $in1,$in1,$in1,$leperm + vand $tmp,$tmp,$eighty7 + vxor $out1,$in1,$twk1 + vxor $tweak,$tweak,$tmp + + lvx_u $in2,$x20,$inp + andi. $taillen,$len,15 + vxor $twk2,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + le?vperm $in2,$in2,$in2,$leperm + vand $tmp,$tmp,$eighty7 + vxor $out2,$in2,$twk2 + vxor $tweak,$tweak,$tmp + + lvx_u $in3,$x30,$inp + sub $len,$len,$taillen + vxor $twk3,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + le?vperm $in3,$in3,$in3,$leperm + vand $tmp,$tmp,$eighty7 + vxor $out3,$in3,$twk3 + vxor $tweak,$tweak,$tmp + + lvx_u $in4,$x40,$inp + subi $len,$len,0x60 + vxor $twk4,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + le?vperm $in4,$in4,$in4,$leperm + vand $tmp,$tmp,$eighty7 + vxor $out4,$in4,$twk4 + vxor $tweak,$tweak,$tmp + + lvx_u $in5,$x50,$inp + addi $inp,$inp,0x60 + vxor $twk5,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + le?vperm $in5,$in5,$in5,$leperm + vand $tmp,$tmp,$eighty7 + vxor $out5,$in5,$twk5 + vxor $tweak,$tweak,$tmp + + vxor v31,v31,$rndkey0 + mtctr $rounds + b Loop_xts_enc6x + +.align 5 +Loop_xts_enc6x: + vcipher $out0,$out0,v24 + vcipher $out1,$out1,v24 + vcipher $out2,$out2,v24 + vcipher $out3,$out3,v24 + vcipher $out4,$out4,v24 + vcipher $out5,$out5,v24 + lvx v24,$x20,$key_ # round[3] + addi $key_,$key_,0x20 + + vcipher $out0,$out0,v25 + vcipher $out1,$out1,v25 + vcipher $out2,$out2,v25 + vcipher $out3,$out3,v25 + vcipher $out4,$out4,v25 + vcipher $out5,$out5,v25 + lvx v25,$x10,$key_ # round[4] + bdnz Loop_xts_enc6x + + subic $len,$len,96 # $len-=96 + vxor $in0,$twk0,v31 # xor with last round key + vcipher $out0,$out0,v24 + vcipher $out1,$out1,v24 + vsrab $tmp,$tweak,$seven # next tweak value + vxor $twk0,$tweak,$rndkey0 + vaddubm $tweak,$tweak,$tweak + vcipher $out2,$out2,v24 + vcipher $out3,$out3,v24 + vsldoi $tmp,$tmp,$tmp,15 + vcipher $out4,$out4,v24 + vcipher $out5,$out5,v24 + + subfe. r0,r0,r0 # borrow?-1:0 + vand $tmp,$tmp,$eighty7 + vcipher $out0,$out0,v25 + vcipher $out1,$out1,v25 + vxor $tweak,$tweak,$tmp + vcipher $out2,$out2,v25 + vcipher $out3,$out3,v25 + vxor $in1,$twk1,v31 + vsrab $tmp,$tweak,$seven # next tweak value + vxor $twk1,$tweak,$rndkey0 + vcipher $out4,$out4,v25 + vcipher $out5,$out5,v25 + + and r0,r0,$len + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + vcipher $out0,$out0,v26 + vcipher $out1,$out1,v26 + vand $tmp,$tmp,$eighty7 + vcipher $out2,$out2,v26 + vcipher $out3,$out3,v26 + vxor $tweak,$tweak,$tmp + vcipher $out4,$out4,v26 + vcipher $out5,$out5,v26 + + add $inp,$inp,r0 # $inp is adjusted in such + # way that at exit from the + # loop inX-in5 are loaded + # with last "words" + vxor $in2,$twk2,v31 + vsrab $tmp,$tweak,$seven # next tweak value + vxor $twk2,$tweak,$rndkey0 + vaddubm $tweak,$tweak,$tweak + vcipher $out0,$out0,v27 + vcipher $out1,$out1,v27 + vsldoi $tmp,$tmp,$tmp,15 + vcipher $out2,$out2,v27 + vcipher $out3,$out3,v27 + vand $tmp,$tmp,$eighty7 + vcipher $out4,$out4,v27 + vcipher $out5,$out5,v27 + + addi $key_,$sp,$FRAME+15 # rewind $key_ + vxor $tweak,$tweak,$tmp + vcipher $out0,$out0,v28 + vcipher $out1,$out1,v28 + vxor $in3,$twk3,v31 + vsrab $tmp,$tweak,$seven # next tweak value + vxor $twk3,$tweak,$rndkey0 + vcipher $out2,$out2,v28 + vcipher $out3,$out3,v28 + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + vcipher $out4,$out4,v28 + vcipher $out5,$out5,v28 + lvx v24,$x00,$key_ # re-pre-load round[1] + vand $tmp,$tmp,$eighty7 + + vcipher $out0,$out0,v29 + vcipher $out1,$out1,v29 + vxor $tweak,$tweak,$tmp + vcipher $out2,$out2,v29 + vcipher $out3,$out3,v29 + vxor $in4,$twk4,v31 + vsrab $tmp,$tweak,$seven # next tweak value + vxor $twk4,$tweak,$rndkey0 + vcipher $out4,$out4,v29 + vcipher $out5,$out5,v29 + lvx v25,$x10,$key_ # re-pre-load round[2] + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + + vcipher $out0,$out0,v30 + vcipher $out1,$out1,v30 + vand $tmp,$tmp,$eighty7 + vcipher $out2,$out2,v30 + vcipher $out3,$out3,v30 + vxor $tweak,$tweak,$tmp + vcipher $out4,$out4,v30 + vcipher $out5,$out5,v30 + vxor $in5,$twk5,v31 + vsrab $tmp,$tweak,$seven # next tweak value + vxor $twk5,$tweak,$rndkey0 + + vcipherlast $out0,$out0,$in0 + lvx_u $in0,$x00,$inp # load next input block + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + vcipherlast $out1,$out1,$in1 + lvx_u $in1,$x10,$inp + vcipherlast $out2,$out2,$in2 + le?vperm $in0,$in0,$in0,$leperm + lvx_u $in2,$x20,$inp + vand $tmp,$tmp,$eighty7 + vcipherlast $out3,$out3,$in3 + le?vperm $in1,$in1,$in1,$leperm + lvx_u $in3,$x30,$inp + vcipherlast $out4,$out4,$in4 + le?vperm $in2,$in2,$in2,$leperm + lvx_u $in4,$x40,$inp + vxor $tweak,$tweak,$tmp + vcipherlast $tmp,$out5,$in5 # last block might be needed + # in stealing mode + le?vperm $in3,$in3,$in3,$leperm + lvx_u $in5,$x50,$inp + addi $inp,$inp,0x60 + le?vperm $in4,$in4,$in4,$leperm + le?vperm $in5,$in5,$in5,$leperm + + le?vperm $out0,$out0,$out0,$leperm + le?vperm $out1,$out1,$out1,$leperm + stvx_u $out0,$x00,$out # store output + vxor $out0,$in0,$twk0 + le?vperm $out2,$out2,$out2,$leperm + stvx_u $out1,$x10,$out + vxor $out1,$in1,$twk1 + le?vperm $out3,$out3,$out3,$leperm + stvx_u $out2,$x20,$out + vxor $out2,$in2,$twk2 + le?vperm $out4,$out4,$out4,$leperm + stvx_u $out3,$x30,$out + vxor $out3,$in3,$twk3 + le?vperm $out5,$tmp,$tmp,$leperm + stvx_u $out4,$x40,$out + vxor $out4,$in4,$twk4 + le?stvx_u $out5,$x50,$out + be?stvx_u $tmp, $x50,$out + vxor $out5,$in5,$twk5 + addi $out,$out,0x60 + + mtctr $rounds + beq Loop_xts_enc6x # did $len-=96 borrow? + + addic. $len,$len,0x60 + beq Lxts_enc6x_zero + cmpwi $len,0x20 + blt Lxts_enc6x_one + nop + beq Lxts_enc6x_two + cmpwi $len,0x40 + blt Lxts_enc6x_three + nop + beq Lxts_enc6x_four + +Lxts_enc6x_five: + vxor $out0,$in1,$twk0 + vxor $out1,$in2,$twk1 + vxor $out2,$in3,$twk2 + vxor $out3,$in4,$twk3 + vxor $out4,$in5,$twk4 + + bl _aesp8_xts_enc5x + + le?vperm $out0,$out0,$out0,$leperm + vmr $twk0,$twk5 # unused tweak + le?vperm $out1,$out1,$out1,$leperm + stvx_u $out0,$x00,$out # store output + le?vperm $out2,$out2,$out2,$leperm + stvx_u $out1,$x10,$out + le?vperm $out3,$out3,$out3,$leperm + stvx_u $out2,$x20,$out + vxor $tmp,$out4,$twk5 # last block prep for stealing + le?vperm $out4,$out4,$out4,$leperm + stvx_u $out3,$x30,$out + stvx_u $out4,$x40,$out + addi $out,$out,0x50 + bne Lxts_enc6x_steal + b Lxts_enc6x_done + +.align 4 +Lxts_enc6x_four: + vxor $out0,$in2,$twk0 + vxor $out1,$in3,$twk1 + vxor $out2,$in4,$twk2 + vxor $out3,$in5,$twk3 + vxor $out4,$out4,$out4 + + bl _aesp8_xts_enc5x + + le?vperm $out0,$out0,$out0,$leperm + vmr $twk0,$twk4 # unused tweak + le?vperm $out1,$out1,$out1,$leperm + stvx_u $out0,$x00,$out # store output + le?vperm $out2,$out2,$out2,$leperm + stvx_u $out1,$x10,$out + vxor $tmp,$out3,$twk4 # last block prep for stealing + le?vperm $out3,$out3,$out3,$leperm + stvx_u $out2,$x20,$out + stvx_u $out3,$x30,$out + addi $out,$out,0x40 + bne Lxts_enc6x_steal + b Lxts_enc6x_done + +.align 4 +Lxts_enc6x_three: + vxor $out0,$in3,$twk0 + vxor $out1,$in4,$twk1 + vxor $out2,$in5,$twk2 + vxor $out3,$out3,$out3 + vxor $out4,$out4,$out4 + + bl _aesp8_xts_enc5x + + le?vperm $out0,$out0,$out0,$leperm + vmr $twk0,$twk3 # unused tweak + le?vperm $out1,$out1,$out1,$leperm + stvx_u $out0,$x00,$out # store output + vxor $tmp,$out2,$twk3 # last block prep for stealing + le?vperm $out2,$out2,$out2,$leperm + stvx_u $out1,$x10,$out + stvx_u $out2,$x20,$out + addi $out,$out,0x30 + bne Lxts_enc6x_steal + b Lxts_enc6x_done + +.align 4 +Lxts_enc6x_two: + vxor $out0,$in4,$twk0 + vxor $out1,$in5,$twk1 + vxor $out2,$out2,$out2 + vxor $out3,$out3,$out3 + vxor $out4,$out4,$out4 + + bl _aesp8_xts_enc5x + + le?vperm $out0,$out0,$out0,$leperm + vmr $twk0,$twk2 # unused tweak + vxor $tmp,$out1,$twk2 # last block prep for stealing + le?vperm $out1,$out1,$out1,$leperm + stvx_u $out0,$x00,$out # store output + stvx_u $out1,$x10,$out + addi $out,$out,0x20 + bne Lxts_enc6x_steal + b Lxts_enc6x_done + +.align 4 +Lxts_enc6x_one: + vxor $out0,$in5,$twk0 + nop +Loop_xts_enc1x: + vcipher $out0,$out0,v24 + lvx v24,$x20,$key_ # round[3] + addi $key_,$key_,0x20 + + vcipher $out0,$out0,v25 + lvx v25,$x10,$key_ # round[4] + bdnz Loop_xts_enc1x + + add $inp,$inp,$taillen + cmpwi $taillen,0 + vcipher $out0,$out0,v24 + + subi $inp,$inp,16 + vcipher $out0,$out0,v25 + + lvsr $inpperm,0,$taillen + vcipher $out0,$out0,v26 + + lvx_u $in0,0,$inp + vcipher $out0,$out0,v27 + + addi $key_,$sp,$FRAME+15 # rewind $key_ + vcipher $out0,$out0,v28 + lvx v24,$x00,$key_ # re-pre-load round[1] + + vcipher $out0,$out0,v29 + lvx v25,$x10,$key_ # re-pre-load round[2] + vxor $twk0,$twk0,v31 + + le?vperm $in0,$in0,$in0,$leperm + vcipher $out0,$out0,v30 + + vperm $in0,$in0,$in0,$inpperm + vcipherlast $out0,$out0,$twk0 + + vmr $twk0,$twk1 # unused tweak + vxor $tmp,$out0,$twk1 # last block prep for stealing + le?vperm $out0,$out0,$out0,$leperm + stvx_u $out0,$x00,$out # store output + addi $out,$out,0x10 + bne Lxts_enc6x_steal + b Lxts_enc6x_done + +.align 4 +Lxts_enc6x_zero: + cmpwi $taillen,0 + beq Lxts_enc6x_done + + add $inp,$inp,$taillen + subi $inp,$inp,16 + lvx_u $in0,0,$inp + lvsr $inpperm,0,$taillen # $in5 is no more + le?vperm $in0,$in0,$in0,$leperm + vperm $in0,$in0,$in0,$inpperm + vxor $tmp,$tmp,$twk0 +Lxts_enc6x_steal: + vxor $in0,$in0,$twk0 + vxor $out0,$out0,$out0 + vspltisb $out1,-1 + vperm $out0,$out0,$out1,$inpperm + vsel $out0,$in0,$tmp,$out0 # $tmp is last block, remember? + + subi r30,$out,17 + subi $out,$out,16 + mtctr $taillen +Loop_xts_enc6x_steal: + lbzu r0,1(r30) + stb r0,16(r30) + bdnz Loop_xts_enc6x_steal + + li $taillen,0 + mtctr $rounds + b Loop_xts_enc1x # one more time... + +.align 4 +Lxts_enc6x_done: + ${UCMP}i $ivp,0 + beq Lxts_enc6x_ret + + vxor $tweak,$twk0,$rndkey0 + le?vperm $tweak,$tweak,$tweak,$leperm + stvx_u $tweak,0,$ivp + +Lxts_enc6x_ret: + mtlr r11 + li r10,`$FRAME+15` + li r11,`$FRAME+31` + stvx $seven,r10,$sp # wipe copies of round keys + addi r10,r10,32 + stvx $seven,r11,$sp + addi r11,r11,32 + stvx $seven,r10,$sp + addi r10,r10,32 + stvx $seven,r11,$sp + addi r11,r11,32 + stvx $seven,r10,$sp + addi r10,r10,32 + stvx $seven,r11,$sp + addi r11,r11,32 + stvx $seven,r10,$sp + addi r10,r10,32 + stvx $seven,r11,$sp + addi r11,r11,32 + + mtspr 256,$vrsave + lvx v20,r10,$sp # ABI says so + addi r10,r10,32 + lvx v21,r11,$sp + addi r11,r11,32 + lvx v22,r10,$sp + addi r10,r10,32 + lvx v23,r11,$sp + addi r11,r11,32 + lvx v24,r10,$sp + addi r10,r10,32 + lvx v25,r11,$sp + addi r11,r11,32 + lvx v26,r10,$sp + addi r10,r10,32 + lvx v27,r11,$sp + addi r11,r11,32 + lvx v28,r10,$sp + addi r10,r10,32 + lvx v29,r11,$sp + addi r11,r11,32 + lvx v30,r10,$sp + lvx v31,r11,$sp + $POP r26,`$FRAME+21*16+0*$SIZE_T`($sp) + $POP r27,`$FRAME+21*16+1*$SIZE_T`($sp) + $POP r28,`$FRAME+21*16+2*$SIZE_T`($sp) + $POP r29,`$FRAME+21*16+3*$SIZE_T`($sp) + $POP r30,`$FRAME+21*16+4*$SIZE_T`($sp) + $POP r31,`$FRAME+21*16+5*$SIZE_T`($sp) + addi $sp,$sp,`$FRAME+21*16+6*$SIZE_T` + blr + .long 0 + .byte 0,12,0x04,1,0x80,6,6,0 + .long 0 + +.align 5 +_aesp8_xts_enc5x: + vcipher $out0,$out0,v24 + vcipher $out1,$out1,v24 + vcipher $out2,$out2,v24 + vcipher $out3,$out3,v24 + vcipher $out4,$out4,v24 + lvx v24,$x20,$key_ # round[3] + addi $key_,$key_,0x20 + + vcipher $out0,$out0,v25 + vcipher $out1,$out1,v25 + vcipher $out2,$out2,v25 + vcipher $out3,$out3,v25 + vcipher $out4,$out4,v25 + lvx v25,$x10,$key_ # round[4] + bdnz _aesp8_xts_enc5x + + add $inp,$inp,$taillen + cmpwi $taillen,0 + vcipher $out0,$out0,v24 + vcipher $out1,$out1,v24 + vcipher $out2,$out2,v24 + vcipher $out3,$out3,v24 + vcipher $out4,$out4,v24 + + subi $inp,$inp,16 + vcipher $out0,$out0,v25 + vcipher $out1,$out1,v25 + vcipher $out2,$out2,v25 + vcipher $out3,$out3,v25 + vcipher $out4,$out4,v25 + vxor $twk0,$twk0,v31 + + vcipher $out0,$out0,v26 + lvsr $inpperm,r0,$taillen # $in5 is no more + vcipher $out1,$out1,v26 + vcipher $out2,$out2,v26 + vcipher $out3,$out3,v26 + vcipher $out4,$out4,v26 + vxor $in1,$twk1,v31 + + vcipher $out0,$out0,v27 + lvx_u $in0,0,$inp + vcipher $out1,$out1,v27 + vcipher $out2,$out2,v27 + vcipher $out3,$out3,v27 + vcipher $out4,$out4,v27 + vxor $in2,$twk2,v31 + + addi $key_,$sp,$FRAME+15 # rewind $key_ + vcipher $out0,$out0,v28 + vcipher $out1,$out1,v28 + vcipher $out2,$out2,v28 + vcipher $out3,$out3,v28 + vcipher $out4,$out4,v28 + lvx v24,$x00,$key_ # re-pre-load round[1] + vxor $in3,$twk3,v31 + + vcipher $out0,$out0,v29 + le?vperm $in0,$in0,$in0,$leperm + vcipher $out1,$out1,v29 + vcipher $out2,$out2,v29 + vcipher $out3,$out3,v29 + vcipher $out4,$out4,v29 + lvx v25,$x10,$key_ # re-pre-load round[2] + vxor $in4,$twk4,v31 + + vcipher $out0,$out0,v30 + vperm $in0,$in0,$in0,$inpperm + vcipher $out1,$out1,v30 + vcipher $out2,$out2,v30 + vcipher $out3,$out3,v30 + vcipher $out4,$out4,v30 + + vcipherlast $out0,$out0,$twk0 + vcipherlast $out1,$out1,$in1 + vcipherlast $out2,$out2,$in2 + vcipherlast $out3,$out3,$in3 + vcipherlast $out4,$out4,$in4 + blr + .long 0 + .byte 0,12,0x14,0,0,0,0,0 + +.align 5 +_aesp8_xts_decrypt6x: + $STU $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp) + mflr r11 + li r7,`$FRAME+8*16+15` + li r3,`$FRAME+8*16+31` + $PUSH r11,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp) + stvx v20,r7,$sp # ABI says so + addi r7,r7,32 + stvx v21,r3,$sp + addi r3,r3,32 + stvx v22,r7,$sp + addi r7,r7,32 + stvx v23,r3,$sp + addi r3,r3,32 + stvx v24,r7,$sp + addi r7,r7,32 + stvx v25,r3,$sp + addi r3,r3,32 + stvx v26,r7,$sp + addi r7,r7,32 + stvx v27,r3,$sp + addi r3,r3,32 + stvx v28,r7,$sp + addi r7,r7,32 + stvx v29,r3,$sp + addi r3,r3,32 + stvx v30,r7,$sp + stvx v31,r3,$sp + li r0,-1 + stw $vrsave,`$FRAME+21*16-4`($sp) # save vrsave + li $x10,0x10 + $PUSH r26,`$FRAME+21*16+0*$SIZE_T`($sp) + li $x20,0x20 + $PUSH r27,`$FRAME+21*16+1*$SIZE_T`($sp) + li $x30,0x30 + $PUSH r28,`$FRAME+21*16+2*$SIZE_T`($sp) + li $x40,0x40 + $PUSH r29,`$FRAME+21*16+3*$SIZE_T`($sp) + li $x50,0x50 + $PUSH r30,`$FRAME+21*16+4*$SIZE_T`($sp) + li $x60,0x60 + $PUSH r31,`$FRAME+21*16+5*$SIZE_T`($sp) + li $x70,0x70 + mtspr 256,r0 + + subi $rounds,$rounds,3 # -4 in total + + lvx $rndkey0,$x00,$key1 # load key schedule + lvx v30,$x10,$key1 + addi $key1,$key1,0x20 + lvx v31,$x00,$key1 + ?vperm $rndkey0,$rndkey0,v30,$keyperm + addi $key_,$sp,$FRAME+15 + mtctr $rounds + +Load_xts_dec_key: + ?vperm v24,v30,v31,$keyperm + lvx v30,$x10,$key1 + addi $key1,$key1,0x20 + stvx v24,$x00,$key_ # off-load round[1] + ?vperm v25,v31,v30,$keyperm + lvx v31,$x00,$key1 + stvx v25,$x10,$key_ # off-load round[2] + addi $key_,$key_,0x20 + bdnz Load_xts_dec_key + + lvx v26,$x10,$key1 + ?vperm v24,v30,v31,$keyperm + lvx v27,$x20,$key1 + stvx v24,$x00,$key_ # off-load round[3] + ?vperm v25,v31,v26,$keyperm + lvx v28,$x30,$key1 + stvx v25,$x10,$key_ # off-load round[4] + addi $key_,$sp,$FRAME+15 # rewind $key_ + ?vperm v26,v26,v27,$keyperm + lvx v29,$x40,$key1 + ?vperm v27,v27,v28,$keyperm + lvx v30,$x50,$key1 + ?vperm v28,v28,v29,$keyperm + lvx v31,$x60,$key1 + ?vperm v29,v29,v30,$keyperm + lvx $twk5,$x70,$key1 # borrow $twk5 + ?vperm v30,v30,v31,$keyperm + lvx v24,$x00,$key_ # pre-load round[1] + ?vperm v31,v31,$twk5,$keyperm + lvx v25,$x10,$key_ # pre-load round[2] + + vperm $in0,$inout,$inptail,$inpperm + subi $inp,$inp,31 # undo "caller" + vxor $twk0,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + vand $tmp,$tmp,$eighty7 + vxor $out0,$in0,$twk0 + vxor $tweak,$tweak,$tmp + + lvx_u $in1,$x10,$inp + vxor $twk1,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + le?vperm $in1,$in1,$in1,$leperm + vand $tmp,$tmp,$eighty7 + vxor $out1,$in1,$twk1 + vxor $tweak,$tweak,$tmp + + lvx_u $in2,$x20,$inp + andi. $taillen,$len,15 + vxor $twk2,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + le?vperm $in2,$in2,$in2,$leperm + vand $tmp,$tmp,$eighty7 + vxor $out2,$in2,$twk2 + vxor $tweak,$tweak,$tmp + + lvx_u $in3,$x30,$inp + sub $len,$len,$taillen + vxor $twk3,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + le?vperm $in3,$in3,$in3,$leperm + vand $tmp,$tmp,$eighty7 + vxor $out3,$in3,$twk3 + vxor $tweak,$tweak,$tmp + + lvx_u $in4,$x40,$inp + subi $len,$len,0x60 + vxor $twk4,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + le?vperm $in4,$in4,$in4,$leperm + vand $tmp,$tmp,$eighty7 + vxor $out4,$in4,$twk4 + vxor $tweak,$tweak,$tmp + + lvx_u $in5,$x50,$inp + addi $inp,$inp,0x60 + vxor $twk5,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + le?vperm $in5,$in5,$in5,$leperm + vand $tmp,$tmp,$eighty7 + vxor $out5,$in5,$twk5 + vxor $tweak,$tweak,$tmp + + vxor v31,v31,$rndkey0 + mtctr $rounds + b Loop_xts_dec6x + +.align 5 +Loop_xts_dec6x: + vncipher $out0,$out0,v24 + vncipher $out1,$out1,v24 + vncipher $out2,$out2,v24 + vncipher $out3,$out3,v24 + vncipher $out4,$out4,v24 + vncipher $out5,$out5,v24 + lvx v24,$x20,$key_ # round[3] + addi $key_,$key_,0x20 + + vncipher $out0,$out0,v25 + vncipher $out1,$out1,v25 + vncipher $out2,$out2,v25 + vncipher $out3,$out3,v25 + vncipher $out4,$out4,v25 + vncipher $out5,$out5,v25 + lvx v25,$x10,$key_ # round[4] + bdnz Loop_xts_dec6x + + subic $len,$len,96 # $len-=96 + vxor $in0,$twk0,v31 # xor with last round key + vncipher $out0,$out0,v24 + vncipher $out1,$out1,v24 + vsrab $tmp,$tweak,$seven # next tweak value + vxor $twk0,$tweak,$rndkey0 + vaddubm $tweak,$tweak,$tweak + vncipher $out2,$out2,v24 + vncipher $out3,$out3,v24 + vsldoi $tmp,$tmp,$tmp,15 + vncipher $out4,$out4,v24 + vncipher $out5,$out5,v24 + + subfe. r0,r0,r0 # borrow?-1:0 + vand $tmp,$tmp,$eighty7 + vncipher $out0,$out0,v25 + vncipher $out1,$out1,v25 + vxor $tweak,$tweak,$tmp + vncipher $out2,$out2,v25 + vncipher $out3,$out3,v25 + vxor $in1,$twk1,v31 + vsrab $tmp,$tweak,$seven # next tweak value + vxor $twk1,$tweak,$rndkey0 + vncipher $out4,$out4,v25 + vncipher $out5,$out5,v25 + + and r0,r0,$len + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + vncipher $out0,$out0,v26 + vncipher $out1,$out1,v26 + vand $tmp,$tmp,$eighty7 + vncipher $out2,$out2,v26 + vncipher $out3,$out3,v26 + vxor $tweak,$tweak,$tmp + vncipher $out4,$out4,v26 + vncipher $out5,$out5,v26 + + add $inp,$inp,r0 # $inp is adjusted in such + # way that at exit from the + # loop inX-in5 are loaded + # with last "words" + vxor $in2,$twk2,v31 + vsrab $tmp,$tweak,$seven # next tweak value + vxor $twk2,$tweak,$rndkey0 + vaddubm $tweak,$tweak,$tweak + vncipher $out0,$out0,v27 + vncipher $out1,$out1,v27 + vsldoi $tmp,$tmp,$tmp,15 + vncipher $out2,$out2,v27 + vncipher $out3,$out3,v27 + vand $tmp,$tmp,$eighty7 + vncipher $out4,$out4,v27 + vncipher $out5,$out5,v27 + + addi $key_,$sp,$FRAME+15 # rewind $key_ + vxor $tweak,$tweak,$tmp + vncipher $out0,$out0,v28 + vncipher $out1,$out1,v28 + vxor $in3,$twk3,v31 + vsrab $tmp,$tweak,$seven # next tweak value + vxor $twk3,$tweak,$rndkey0 + vncipher $out2,$out2,v28 + vncipher $out3,$out3,v28 + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + vncipher $out4,$out4,v28 + vncipher $out5,$out5,v28 + lvx v24,$x00,$key_ # re-pre-load round[1] + vand $tmp,$tmp,$eighty7 + + vncipher $out0,$out0,v29 + vncipher $out1,$out1,v29 + vxor $tweak,$tweak,$tmp + vncipher $out2,$out2,v29 + vncipher $out3,$out3,v29 + vxor $in4,$twk4,v31 + vsrab $tmp,$tweak,$seven # next tweak value + vxor $twk4,$tweak,$rndkey0 + vncipher $out4,$out4,v29 + vncipher $out5,$out5,v29 + lvx v25,$x10,$key_ # re-pre-load round[2] + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + + vncipher $out0,$out0,v30 + vncipher $out1,$out1,v30 + vand $tmp,$tmp,$eighty7 + vncipher $out2,$out2,v30 + vncipher $out3,$out3,v30 + vxor $tweak,$tweak,$tmp + vncipher $out4,$out4,v30 + vncipher $out5,$out5,v30 + vxor $in5,$twk5,v31 + vsrab $tmp,$tweak,$seven # next tweak value + vxor $twk5,$tweak,$rndkey0 + + vncipherlast $out0,$out0,$in0 + lvx_u $in0,$x00,$inp # load next input block + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + vncipherlast $out1,$out1,$in1 + lvx_u $in1,$x10,$inp + vncipherlast $out2,$out2,$in2 + le?vperm $in0,$in0,$in0,$leperm + lvx_u $in2,$x20,$inp + vand $tmp,$tmp,$eighty7 + vncipherlast $out3,$out3,$in3 + le?vperm $in1,$in1,$in1,$leperm + lvx_u $in3,$x30,$inp + vncipherlast $out4,$out4,$in4 + le?vperm $in2,$in2,$in2,$leperm + lvx_u $in4,$x40,$inp + vxor $tweak,$tweak,$tmp + vncipherlast $out5,$out5,$in5 + le?vperm $in3,$in3,$in3,$leperm + lvx_u $in5,$x50,$inp + addi $inp,$inp,0x60 + le?vperm $in4,$in4,$in4,$leperm + le?vperm $in5,$in5,$in5,$leperm + + le?vperm $out0,$out0,$out0,$leperm + le?vperm $out1,$out1,$out1,$leperm + stvx_u $out0,$x00,$out # store output + vxor $out0,$in0,$twk0 + le?vperm $out2,$out2,$out2,$leperm + stvx_u $out1,$x10,$out + vxor $out1,$in1,$twk1 + le?vperm $out3,$out3,$out3,$leperm + stvx_u $out2,$x20,$out + vxor $out2,$in2,$twk2 + le?vperm $out4,$out4,$out4,$leperm + stvx_u $out3,$x30,$out + vxor $out3,$in3,$twk3 + le?vperm $out5,$out5,$out5,$leperm + stvx_u $out4,$x40,$out + vxor $out4,$in4,$twk4 + stvx_u $out5,$x50,$out + vxor $out5,$in5,$twk5 + addi $out,$out,0x60 + + mtctr $rounds + beq Loop_xts_dec6x # did $len-=96 borrow? + + addic. $len,$len,0x60 + beq Lxts_dec6x_zero + cmpwi $len,0x20 + blt Lxts_dec6x_one + nop + beq Lxts_dec6x_two + cmpwi $len,0x40 + blt Lxts_dec6x_three + nop + beq Lxts_dec6x_four + +Lxts_dec6x_five: + vxor $out0,$in1,$twk0 + vxor $out1,$in2,$twk1 + vxor $out2,$in3,$twk2 + vxor $out3,$in4,$twk3 + vxor $out4,$in5,$twk4 + + bl _aesp8_xts_dec5x + + le?vperm $out0,$out0,$out0,$leperm + vmr $twk0,$twk5 # unused tweak + vxor $twk1,$tweak,$rndkey0 + le?vperm $out1,$out1,$out1,$leperm + stvx_u $out0,$x00,$out # store output + vxor $out0,$in0,$twk1 + le?vperm $out2,$out2,$out2,$leperm + stvx_u $out1,$x10,$out + le?vperm $out3,$out3,$out3,$leperm + stvx_u $out2,$x20,$out + le?vperm $out4,$out4,$out4,$leperm + stvx_u $out3,$x30,$out + stvx_u $out4,$x40,$out + addi $out,$out,0x50 + bne Lxts_dec6x_steal + b Lxts_dec6x_done + +.align 4 +Lxts_dec6x_four: + vxor $out0,$in2,$twk0 + vxor $out1,$in3,$twk1 + vxor $out2,$in4,$twk2 + vxor $out3,$in5,$twk3 + vxor $out4,$out4,$out4 + + bl _aesp8_xts_dec5x + + le?vperm $out0,$out0,$out0,$leperm + vmr $twk0,$twk4 # unused tweak + vmr $twk1,$twk5 + le?vperm $out1,$out1,$out1,$leperm + stvx_u $out0,$x00,$out # store output + vxor $out0,$in0,$twk5 + le?vperm $out2,$out2,$out2,$leperm + stvx_u $out1,$x10,$out + le?vperm $out3,$out3,$out3,$leperm + stvx_u $out2,$x20,$out + stvx_u $out3,$x30,$out + addi $out,$out,0x40 + bne Lxts_dec6x_steal + b Lxts_dec6x_done + +.align 4 +Lxts_dec6x_three: + vxor $out0,$in3,$twk0 + vxor $out1,$in4,$twk1 + vxor $out2,$in5,$twk2 + vxor $out3,$out3,$out3 + vxor $out4,$out4,$out4 + + bl _aesp8_xts_dec5x + + le?vperm $out0,$out0,$out0,$leperm + vmr $twk0,$twk3 # unused tweak + vmr $twk1,$twk4 + le?vperm $out1,$out1,$out1,$leperm + stvx_u $out0,$x00,$out # store output + vxor $out0,$in0,$twk4 + le?vperm $out2,$out2,$out2,$leperm + stvx_u $out1,$x10,$out + stvx_u $out2,$x20,$out + addi $out,$out,0x30 + bne Lxts_dec6x_steal + b Lxts_dec6x_done + +.align 4 +Lxts_dec6x_two: + vxor $out0,$in4,$twk0 + vxor $out1,$in5,$twk1 + vxor $out2,$out2,$out2 + vxor $out3,$out3,$out3 + vxor $out4,$out4,$out4 + + bl _aesp8_xts_dec5x + + le?vperm $out0,$out0,$out0,$leperm + vmr $twk0,$twk2 # unused tweak + vmr $twk1,$twk3 + le?vperm $out1,$out1,$out1,$leperm + stvx_u $out0,$x00,$out # store output + vxor $out0,$in0,$twk3 + stvx_u $out1,$x10,$out + addi $out,$out,0x20 + bne Lxts_dec6x_steal + b Lxts_dec6x_done + +.align 4 +Lxts_dec6x_one: + vxor $out0,$in5,$twk0 + nop +Loop_xts_dec1x: + vncipher $out0,$out0,v24 + lvx v24,$x20,$key_ # round[3] + addi $key_,$key_,0x20 + + vncipher $out0,$out0,v25 + lvx v25,$x10,$key_ # round[4] + bdnz Loop_xts_dec1x + + subi r0,$taillen,1 + vncipher $out0,$out0,v24 + + andi. r0,r0,16 + cmpwi $taillen,0 + vncipher $out0,$out0,v25 + + sub $inp,$inp,r0 + vncipher $out0,$out0,v26 + + lvx_u $in0,0,$inp + vncipher $out0,$out0,v27 + + addi $key_,$sp,$FRAME+15 # rewind $key_ + vncipher $out0,$out0,v28 + lvx v24,$x00,$key_ # re-pre-load round[1] + + vncipher $out0,$out0,v29 + lvx v25,$x10,$key_ # re-pre-load round[2] + vxor $twk0,$twk0,v31 + + le?vperm $in0,$in0,$in0,$leperm + vncipher $out0,$out0,v30 + + mtctr $rounds + vncipherlast $out0,$out0,$twk0 + + vmr $twk0,$twk1 # unused tweak + vmr $twk1,$twk2 + le?vperm $out0,$out0,$out0,$leperm + stvx_u $out0,$x00,$out # store output + addi $out,$out,0x10 + vxor $out0,$in0,$twk2 + bne Lxts_dec6x_steal + b Lxts_dec6x_done + +.align 4 +Lxts_dec6x_zero: + cmpwi $taillen,0 + beq Lxts_dec6x_done + + lvx_u $in0,0,$inp + le?vperm $in0,$in0,$in0,$leperm + vxor $out0,$in0,$twk1 +Lxts_dec6x_steal: + vncipher $out0,$out0,v24 + lvx v24,$x20,$key_ # round[3] + addi $key_,$key_,0x20 + + vncipher $out0,$out0,v25 + lvx v25,$x10,$key_ # round[4] + bdnz Lxts_dec6x_steal + + add $inp,$inp,$taillen + vncipher $out0,$out0,v24 + + cmpwi $taillen,0 + vncipher $out0,$out0,v25 + + lvx_u $in0,0,$inp + vncipher $out0,$out0,v26 + + lvsr $inpperm,0,$taillen # $in5 is no more + vncipher $out0,$out0,v27 + + addi $key_,$sp,$FRAME+15 # rewind $key_ + vncipher $out0,$out0,v28 + lvx v24,$x00,$key_ # re-pre-load round[1] + + vncipher $out0,$out0,v29 + lvx v25,$x10,$key_ # re-pre-load round[2] + vxor $twk1,$twk1,v31 + + le?vperm $in0,$in0,$in0,$leperm + vncipher $out0,$out0,v30 + + vperm $in0,$in0,$in0,$inpperm + vncipherlast $tmp,$out0,$twk1 + + le?vperm $out0,$tmp,$tmp,$leperm + le?stvx_u $out0,0,$out + be?stvx_u $tmp,0,$out + + vxor $out0,$out0,$out0 + vspltisb $out1,-1 + vperm $out0,$out0,$out1,$inpperm + vsel $out0,$in0,$tmp,$out0 + vxor $out0,$out0,$twk0 + + subi r30,$out,1 + mtctr $taillen +Loop_xts_dec6x_steal: + lbzu r0,1(r30) + stb r0,16(r30) + bdnz Loop_xts_dec6x_steal + + li $taillen,0 + mtctr $rounds + b Loop_xts_dec1x # one more time... + +.align 4 +Lxts_dec6x_done: + ${UCMP}i $ivp,0 + beq Lxts_dec6x_ret + + vxor $tweak,$twk0,$rndkey0 + le?vperm $tweak,$tweak,$tweak,$leperm + stvx_u $tweak,0,$ivp + +Lxts_dec6x_ret: + mtlr r11 + li r10,`$FRAME+15` + li r11,`$FRAME+31` + stvx $seven,r10,$sp # wipe copies of round keys + addi r10,r10,32 + stvx $seven,r11,$sp + addi r11,r11,32 + stvx $seven,r10,$sp + addi r10,r10,32 + stvx $seven,r11,$sp + addi r11,r11,32 + stvx $seven,r10,$sp + addi r10,r10,32 + stvx $seven,r11,$sp + addi r11,r11,32 + stvx $seven,r10,$sp + addi r10,r10,32 + stvx $seven,r11,$sp + addi r11,r11,32 + + mtspr 256,$vrsave + lvx v20,r10,$sp # ABI says so + addi r10,r10,32 + lvx v21,r11,$sp + addi r11,r11,32 + lvx v22,r10,$sp + addi r10,r10,32 + lvx v23,r11,$sp + addi r11,r11,32 + lvx v24,r10,$sp + addi r10,r10,32 + lvx v25,r11,$sp + addi r11,r11,32 + lvx v26,r10,$sp + addi r10,r10,32 + lvx v27,r11,$sp + addi r11,r11,32 + lvx v28,r10,$sp + addi r10,r10,32 + lvx v29,r11,$sp + addi r11,r11,32 + lvx v30,r10,$sp + lvx v31,r11,$sp + $POP r26,`$FRAME+21*16+0*$SIZE_T`($sp) + $POP r27,`$FRAME+21*16+1*$SIZE_T`($sp) + $POP r28,`$FRAME+21*16+2*$SIZE_T`($sp) + $POP r29,`$FRAME+21*16+3*$SIZE_T`($sp) + $POP r30,`$FRAME+21*16+4*$SIZE_T`($sp) + $POP r31,`$FRAME+21*16+5*$SIZE_T`($sp) + addi $sp,$sp,`$FRAME+21*16+6*$SIZE_T` + blr + .long 0 + .byte 0,12,0x04,1,0x80,6,6,0 + .long 0 + +.align 5 +_aesp8_xts_dec5x: + vncipher $out0,$out0,v24 + vncipher $out1,$out1,v24 + vncipher $out2,$out2,v24 + vncipher $out3,$out3,v24 + vncipher $out4,$out4,v24 + lvx v24,$x20,$key_ # round[3] + addi $key_,$key_,0x20 + + vncipher $out0,$out0,v25 + vncipher $out1,$out1,v25 + vncipher $out2,$out2,v25 + vncipher $out3,$out3,v25 + vncipher $out4,$out4,v25 + lvx v25,$x10,$key_ # round[4] + bdnz _aesp8_xts_dec5x + + subi r0,$taillen,1 + vncipher $out0,$out0,v24 + vncipher $out1,$out1,v24 + vncipher $out2,$out2,v24 + vncipher $out3,$out3,v24 + vncipher $out4,$out4,v24 + + andi. r0,r0,16 + cmpwi $taillen,0 + vncipher $out0,$out0,v25 + vncipher $out1,$out1,v25 + vncipher $out2,$out2,v25 + vncipher $out3,$out3,v25 + vncipher $out4,$out4,v25 + vxor $twk0,$twk0,v31 + + sub $inp,$inp,r0 + vncipher $out0,$out0,v26 + vncipher $out1,$out1,v26 + vncipher $out2,$out2,v26 + vncipher $out3,$out3,v26 + vncipher $out4,$out4,v26 + vxor $in1,$twk1,v31 + + vncipher $out0,$out0,v27 + lvx_u $in0,0,$inp + vncipher $out1,$out1,v27 + vncipher $out2,$out2,v27 + vncipher $out3,$out3,v27 + vncipher $out4,$out4,v27 + vxor $in2,$twk2,v31 + + addi $key_,$sp,$FRAME+15 # rewind $key_ + vncipher $out0,$out0,v28 + vncipher $out1,$out1,v28 + vncipher $out2,$out2,v28 + vncipher $out3,$out3,v28 + vncipher $out4,$out4,v28 + lvx v24,$x00,$key_ # re-pre-load round[1] + vxor $in3,$twk3,v31 + + vncipher $out0,$out0,v29 + le?vperm $in0,$in0,$in0,$leperm + vncipher $out1,$out1,v29 + vncipher $out2,$out2,v29 + vncipher $out3,$out3,v29 + vncipher $out4,$out4,v29 + lvx v25,$x10,$key_ # re-pre-load round[2] + vxor $in4,$twk4,v31 + + vncipher $out0,$out0,v30 + vncipher $out1,$out1,v30 + vncipher $out2,$out2,v30 + vncipher $out3,$out3,v30 + vncipher $out4,$out4,v30 + + vncipherlast $out0,$out0,$twk0 + vncipherlast $out1,$out1,$in1 + vncipherlast $out2,$out2,$in2 + vncipherlast $out3,$out3,$in3 + vncipherlast $out4,$out4,$in4 + mtctr $rounds + blr + .long 0 + .byte 0,12,0x14,0,0,0,0,0 +___ +}} }}} + +my $consts=1; +foreach(split("\n",$code)) { + s/\`([^\`]*)\`/eval($1)/geo; + + # constants table endian-specific conversion + if ($consts && m/\.(long|byte)\s+(.+)\s+(\?[a-z]*)$/o) { + my $conv=$3; + my @bytes=(); + + # convert to endian-agnostic format + if ($1 eq "long") { + foreach (split(/,\s*/,$2)) { + my $l = /^0/?oct:int; + push @bytes,($l>>24)&0xff,($l>>16)&0xff,($l>>8)&0xff,$l&0xff; + } + } else { + @bytes = map(/^0/?oct:int,split(/,\s*/,$2)); + } + + # little-endian conversion + if ($flavour =~ /le$/o) { + SWITCH: for($conv) { + /\?inv/ && do { @bytes=map($_^0xf,@bytes); last; }; + /\?rev/ && do { @bytes=reverse(@bytes); last; }; + } + } + + #emit + print ".byte\t",join(',',map (sprintf("0x%02x",$_),@bytes)),"\n"; + next; + } + $consts=0 if (m/Lconsts:/o); # end of table + + # instructions prefixed with '?' are endian-specific and need + # to be adjusted accordingly... + if ($flavour =~ /le$/o) { # little-endian + s/le\?//o or + s/be\?/#be#/o or + s/\?lvsr/lvsl/o or + s/\?lvsl/lvsr/o or + s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/o or + s/\?(vsldoi\s+v[0-9]+,\s*)(v[0-9]+,)\s*(v[0-9]+,\s*)([0-9]+)/$1$3$2 16-$4/o or + s/\?(vspltw\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9])/$1$2 3-$3/o; + } else { # big-endian + s/le\?/#le#/o or + s/be\?//o or + s/\?([a-z]+)/$1/o; + } + + print $_,"\n"; +} + +close STDOUT; From 41a6437ab415e5daa7dfcd4b1b91dd934306d4e9 Mon Sep 17 00:00:00 2001 From: Danny Tsen Date: Sun, 4 Dec 2022 19:34:57 -0500 Subject: [PATCH 030/156] crypto: p10-aes-gcm - Supporting functions for ghash This perl code is taken from the OpenSSL project and added gcm_init_htable function used in the p10-aes-gcm-glue.c code to initialize hash table. gcm_hash_p8 is used to hash encrypted data blocks. Signed-off-by: Danny Tsen Signed-off-by: Herbert Xu --- arch/powerpc/crypto/ghashp8-ppc.pl | 370 +++++++++++++++++++++++++++++ 1 file changed, 370 insertions(+) create mode 100644 arch/powerpc/crypto/ghashp8-ppc.pl diff --git a/arch/powerpc/crypto/ghashp8-ppc.pl b/arch/powerpc/crypto/ghashp8-ppc.pl new file mode 100644 index 000000000000..b56603b4a893 --- /dev/null +++ b/arch/powerpc/crypto/ghashp8-ppc.pl @@ -0,0 +1,370 @@ +#!/usr/bin/env perl +# SPDX-License-Identifier: GPL-2.0 + +# This code is taken from the OpenSSL project but the author (Andy Polyakov) +# has relicensed it under the GPLv2. Therefore this program is free software; +# you can redistribute it and/or modify it under the terms of the GNU General +# Public License version 2 as published by the Free Software Foundation. +# +# The original headers, including the original license headers, are +# included below for completeness. + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see https://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# GHASH for PowerISA v2.07. +# +# July 2014 +# +# Accurate performance measurements are problematic, because it's +# always virtualized setup with possibly throttled processor. +# Relative comparison is therefore more informative. This initial +# version is ~2.1x slower than hardware-assisted AES-128-CTR, ~12x +# faster than "4-bit" integer-only compiler-generated 64-bit code. +# "Initial version" means that there is room for futher improvement. + +$flavour=shift; +$output =shift; + +if ($flavour =~ /64/) { + $SIZE_T=8; + $LRSAVE=2*$SIZE_T; + $STU="stdu"; + $POP="ld"; + $PUSH="std"; +} elsif ($flavour =~ /32/) { + $SIZE_T=4; + $LRSAVE=$SIZE_T; + $STU="stwu"; + $POP="lwz"; + $PUSH="stw"; +} else { die "nonsense $flavour"; } + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or +die "can't locate ppc-xlate.pl"; + +open STDOUT,"| $^X $xlate $flavour $output" || die "can't call $xlate: $!"; + +my ($Xip,$Htbl,$inp,$len)=map("r$_",(3..6)); # argument block + +my ($Xl,$Xm,$Xh,$IN)=map("v$_",(0..3)); +my ($zero,$t0,$t1,$t2,$xC2,$H,$Hh,$Hl,$lemask)=map("v$_",(4..12)); +my ($Xl1,$Xm1,$Xh1,$IN1,$H2,$H2h,$H2l)=map("v$_",(13..19)); +my $vrsave="r12"; +my ($t4,$t5,$t6) = ($Hl,$H,$Hh); + +$code=<<___; +.machine "any" + +.text + +.globl .gcm_init_p8 + lis r0,0xfff0 + li r8,0x10 + mfspr $vrsave,256 + li r9,0x20 + mtspr 256,r0 + li r10,0x30 + lvx_u $H,0,r4 # load H + le?xor r7,r7,r7 + le?addi r7,r7,0x8 # need a vperm start with 08 + le?lvsr 5,0,r7 + le?vspltisb 6,0x0f + le?vxor 5,5,6 # set a b-endian mask + le?vperm $H,$H,$H,5 + + vspltisb $xC2,-16 # 0xf0 + vspltisb $t0,1 # one + vaddubm $xC2,$xC2,$xC2 # 0xe0 + vxor $zero,$zero,$zero + vor $xC2,$xC2,$t0 # 0xe1 + vsldoi $xC2,$xC2,$zero,15 # 0xe1... + vsldoi $t1,$zero,$t0,1 # ...1 + vaddubm $xC2,$xC2,$xC2 # 0xc2... + vspltisb $t2,7 + vor $xC2,$xC2,$t1 # 0xc2....01 + vspltb $t1,$H,0 # most significant byte + vsl $H,$H,$t0 # H<<=1 + vsrab $t1,$t1,$t2 # broadcast carry bit + vand $t1,$t1,$xC2 + vxor $H,$H,$t1 # twisted H + + vsldoi $H,$H,$H,8 # twist even more ... + vsldoi $xC2,$zero,$xC2,8 # 0xc2.0 + vsldoi $Hl,$zero,$H,8 # ... and split + vsldoi $Hh,$H,$zero,8 + + stvx_u $xC2,0,r3 # save pre-computed table + stvx_u $Hl,r8,r3 + stvx_u $H, r9,r3 + stvx_u $Hh,r10,r3 + + mtspr 256,$vrsave + blr + .long 0 + .byte 0,12,0x14,0,0,0,2,0 + .long 0 +.size .gcm_init_p8,.-.gcm_init_p8 + +.globl .gcm_init_htable + lis r0,0xfff0 + li r8,0x10 + mfspr $vrsave,256 + li r9,0x20 + mtspr 256,r0 + li r10,0x30 + lvx_u $H,0,r4 # load H + + vspltisb $xC2,-16 # 0xf0 + vspltisb $t0,1 # one + vaddubm $xC2,$xC2,$xC2 # 0xe0 + vxor $zero,$zero,$zero + vor $xC2,$xC2,$t0 # 0xe1 + vsldoi $xC2,$xC2,$zero,15 # 0xe1... + vsldoi $t1,$zero,$t0,1 # ...1 + vaddubm $xC2,$xC2,$xC2 # 0xc2... + vspltisb $t2,7 + vor $xC2,$xC2,$t1 # 0xc2....01 + vspltb $t1,$H,0 # most significant byte + vsl $H,$H,$t0 # H<<=1 + vsrab $t1,$t1,$t2 # broadcast carry bit + vand $t1,$t1,$xC2 + vxor $IN,$H,$t1 # twisted H + + vsldoi $H,$IN,$IN,8 # twist even more ... + vsldoi $xC2,$zero,$xC2,8 # 0xc2.0 + vsldoi $Hl,$zero,$H,8 # ... and split + vsldoi $Hh,$H,$zero,8 + + stvx_u $xC2,0,r3 # save pre-computed table + stvx_u $Hl,r8,r3 + li r8,0x40 + stvx_u $H, r9,r3 + li r9,0x50 + stvx_u $Hh,r10,r3 + li r10,0x60 + + vpmsumd $Xl,$IN,$Hl # H.lo·H.lo + vpmsumd $Xm,$IN,$H # H.hi·H.lo+H.lo·H.hi + vpmsumd $Xh,$IN,$Hh # H.hi·H.hi + + vpmsumd $t2,$Xl,$xC2 # 1st reduction phase + + vsldoi $t0,$Xm,$zero,8 + vsldoi $t1,$zero,$Xm,8 + vxor $Xl,$Xl,$t0 + vxor $Xh,$Xh,$t1 + + vsldoi $Xl,$Xl,$Xl,8 + vxor $Xl,$Xl,$t2 + + vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase + vpmsumd $Xl,$Xl,$xC2 + vxor $t1,$t1,$Xh + vxor $IN1,$Xl,$t1 + + vsldoi $H2,$IN1,$IN1,8 + vsldoi $H2l,$zero,$H2,8 + vsldoi $H2h,$H2,$zero,8 + + stvx_u $H2l,r8,r3 # save H^2 + li r8,0x70 + stvx_u $H2,r9,r3 + li r9,0x80 + stvx_u $H2h,r10,r3 + li r10,0x90 + + vpmsumd $Xl,$IN,$H2l # H.lo·H^2.lo + vpmsumd $Xl1,$IN1,$H2l # H^2.lo·H^2.lo + vpmsumd $Xm,$IN,$H2 # H.hi·H^2.lo+H.lo·H^2.hi + vpmsumd $Xm1,$IN1,$H2 # H^2.hi·H^2.lo+H^2.lo·H^2.hi + vpmsumd $Xh,$IN,$H2h # H.hi·H^2.hi + vpmsumd $Xh1,$IN1,$H2h # H^2.hi·H^2.hi + + vpmsumd $t2,$Xl,$xC2 # 1st reduction phase + vpmsumd $t6,$Xl1,$xC2 # 1st reduction phase + + vsldoi $t0,$Xm,$zero,8 + vsldoi $t1,$zero,$Xm,8 + vsldoi $t4,$Xm1,$zero,8 + vsldoi $t5,$zero,$Xm1,8 + vxor $Xl,$Xl,$t0 + vxor $Xh,$Xh,$t1 + vxor $Xl1,$Xl1,$t4 + vxor $Xh1,$Xh1,$t5 + + vsldoi $Xl,$Xl,$Xl,8 + vsldoi $Xl1,$Xl1,$Xl1,8 + vxor $Xl,$Xl,$t2 + vxor $Xl1,$Xl1,$t6 + + vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase + vsldoi $t5,$Xl1,$Xl1,8 # 2nd reduction phase + vpmsumd $Xl,$Xl,$xC2 + vpmsumd $Xl1,$Xl1,$xC2 + vxor $t1,$t1,$Xh + vxor $t5,$t5,$Xh1 + vxor $Xl,$Xl,$t1 + vxor $Xl1,$Xl1,$t5 + + vsldoi $H,$Xl,$Xl,8 + vsldoi $H2,$Xl1,$Xl1,8 + vsldoi $Hl,$zero,$H,8 + vsldoi $Hh,$H,$zero,8 + vsldoi $H2l,$zero,$H2,8 + vsldoi $H2h,$H2,$zero,8 + + stvx_u $Hl,r8,r3 # save H^3 + li r8,0xa0 + stvx_u $H,r9,r3 + li r9,0xb0 + stvx_u $Hh,r10,r3 + li r10,0xc0 + stvx_u $H2l,r8,r3 # save H^4 + stvx_u $H2,r9,r3 + stvx_u $H2h,r10,r3 + + mtspr 256,$vrsave + blr + .long 0 + .byte 0,12,0x14,0,0,0,2,0 + .long 0 +.size .gcm_init_htable,.-.gcm_init_htable + +.globl .gcm_gmult_p8 + lis r0,0xfff8 + li r8,0x10 + mfspr $vrsave,256 + li r9,0x20 + mtspr 256,r0 + li r10,0x30 + lvx_u $IN,0,$Xip # load Xi + + lvx_u $Hl,r8,$Htbl # load pre-computed table + le?lvsl $lemask,r0,r0 + lvx_u $H, r9,$Htbl + le?vspltisb $t0,0x07 + lvx_u $Hh,r10,$Htbl + le?vxor $lemask,$lemask,$t0 + lvx_u $xC2,0,$Htbl + le?vperm $IN,$IN,$IN,$lemask + vxor $zero,$zero,$zero + + vpmsumd $Xl,$IN,$Hl # H.lo·Xi.lo + vpmsumd $Xm,$IN,$H # H.hi·Xi.lo+H.lo·Xi.hi + vpmsumd $Xh,$IN,$Hh # H.hi·Xi.hi + + vpmsumd $t2,$Xl,$xC2 # 1st phase + + vsldoi $t0,$Xm,$zero,8 + vsldoi $t1,$zero,$Xm,8 + vxor $Xl,$Xl,$t0 + vxor $Xh,$Xh,$t1 + + vsldoi $Xl,$Xl,$Xl,8 + vxor $Xl,$Xl,$t2 + + vsldoi $t1,$Xl,$Xl,8 # 2nd phase + vpmsumd $Xl,$Xl,$xC2 + vxor $t1,$t1,$Xh + vxor $Xl,$Xl,$t1 + + le?vperm $Xl,$Xl,$Xl,$lemask + stvx_u $Xl,0,$Xip # write out Xi + + mtspr 256,$vrsave + blr + .long 0 + .byte 0,12,0x14,0,0,0,2,0 + .long 0 +.size .gcm_gmult_p8,.-.gcm_gmult_p8 + +.globl .gcm_ghash_p8 + lis r0,0xfff8 + li r8,0x10 + mfspr $vrsave,256 + li r9,0x20 + mtspr 256,r0 + li r10,0x30 + lvx_u $Xl,0,$Xip # load Xi + + lvx_u $Hl,r8,$Htbl # load pre-computed table + le?lvsl $lemask,r0,r0 + lvx_u $H, r9,$Htbl + le?vspltisb $t0,0x07 + lvx_u $Hh,r10,$Htbl + le?vxor $lemask,$lemask,$t0 + lvx_u $xC2,0,$Htbl + le?vperm $Xl,$Xl,$Xl,$lemask + vxor $zero,$zero,$zero + + lvx_u $IN,0,$inp + addi $inp,$inp,16 + subi $len,$len,16 + le?vperm $IN,$IN,$IN,$lemask + vxor $IN,$IN,$Xl + b Loop + +.align 5 +Loop: + subic $len,$len,16 + vpmsumd $Xl,$IN,$Hl # H.lo·Xi.lo + subfe. r0,r0,r0 # borrow?-1:0 + vpmsumd $Xm,$IN,$H # H.hi·Xi.lo+H.lo·Xi.hi + and r0,r0,$len + vpmsumd $Xh,$IN,$Hh # H.hi·Xi.hi + add $inp,$inp,r0 + + vpmsumd $t2,$Xl,$xC2 # 1st phase + + vsldoi $t0,$Xm,$zero,8 + vsldoi $t1,$zero,$Xm,8 + vxor $Xl,$Xl,$t0 + vxor $Xh,$Xh,$t1 + + vsldoi $Xl,$Xl,$Xl,8 + vxor $Xl,$Xl,$t2 + lvx_u $IN,0,$inp + addi $inp,$inp,16 + + vsldoi $t1,$Xl,$Xl,8 # 2nd phase + vpmsumd $Xl,$Xl,$xC2 + le?vperm $IN,$IN,$IN,$lemask + vxor $t1,$t1,$Xh + vxor $IN,$IN,$t1 + vxor $IN,$IN,$Xl + beq Loop # did $len-=16 borrow? + + vxor $Xl,$Xl,$t1 + le?vperm $Xl,$Xl,$Xl,$lemask + stvx_u $Xl,0,$Xip # write out Xi + + mtspr 256,$vrsave + blr + .long 0 + .byte 0,12,0x14,0,0,0,4,0 + .long 0 +.size .gcm_ghash_p8,.-.gcm_ghash_p8 + +.asciz "GHASH for PowerISA 2.07, CRYPTOGAMS by " +.align 2 +___ + +foreach (split("\n",$code)) { + if ($flavour =~ /le$/o) { # little-endian + s/le\?//o or + s/be\?/#be#/o; + } else { + s/le\?/#le#/o or + s/be\?//o; + } + print $_,"\n"; +} + +close STDOUT; # enforce flush From 0781bbd7eaca70611bc2314001f82e1903aeff48 Mon Sep 17 00:00:00 2001 From: Danny Tsen Date: Sun, 4 Dec 2022 19:34:58 -0500 Subject: [PATCH 031/156] crypto: p10-aes-gcm - A perl script to process PowerPC assembler source Signed-off-by: Danny Tsen Signed-off-by: Herbert Xu --- arch/powerpc/crypto/ppc-xlate.pl | 229 +++++++++++++++++++++++++++++++ 1 file changed, 229 insertions(+) create mode 100644 arch/powerpc/crypto/ppc-xlate.pl diff --git a/arch/powerpc/crypto/ppc-xlate.pl b/arch/powerpc/crypto/ppc-xlate.pl new file mode 100644 index 000000000000..23cca703ce29 --- /dev/null +++ b/arch/powerpc/crypto/ppc-xlate.pl @@ -0,0 +1,229 @@ +#!/usr/bin/env perl +# SPDX-License-Identifier: GPL-2.0 + +# PowerPC assembler distiller by . + +my $flavour = shift; +my $output = shift; +open STDOUT,">$output" || die "can't open $output: $!"; + +my %GLOBALS; +my $dotinlocallabels=($flavour=~/linux/)?1:0; + +################################################################ +# directives which need special treatment on different platforms +################################################################ +my $globl = sub { + my $junk = shift; + my $name = shift; + my $global = \$GLOBALS{$name}; + my $ret; + + $name =~ s|^[\.\_]||; + + SWITCH: for ($flavour) { + /aix/ && do { $name = ".$name"; + last; + }; + /osx/ && do { $name = "_$name"; + last; + }; + /linux/ + && do { $ret = "_GLOBAL($name)"; + last; + }; + } + + $ret = ".globl $name\nalign 5\n$name:" if (!$ret); + $$global = $name; + $ret; +}; +my $text = sub { + my $ret = ($flavour =~ /aix/) ? ".csect\t.text[PR],7" : ".text"; + $ret = ".abiversion 2\n".$ret if ($flavour =~ /linux.*64le/); + $ret; +}; +my $machine = sub { + my $junk = shift; + my $arch = shift; + if ($flavour =~ /osx/) + { $arch =~ s/\"//g; + $arch = ($flavour=~/64/) ? "ppc970-64" : "ppc970" if ($arch eq "any"); + } + ".machine $arch"; +}; +my $size = sub { + if ($flavour =~ /linux/) + { shift; + my $name = shift; $name =~ s|^[\.\_]||; + my $ret = ".size $name,.-".($flavour=~/64$/?".":"").$name; + $ret .= "\n.size .$name,.-.$name" if ($flavour=~/64$/); + $ret; + } + else + { ""; } +}; +my $asciz = sub { + shift; + my $line = join(",",@_); + if ($line =~ /^"(.*)"$/) + { ".byte " . join(",",unpack("C*",$1),0) . "\n.align 2"; } + else + { ""; } +}; +my $quad = sub { + shift; + my @ret; + my ($hi,$lo); + for (@_) { + if (/^0x([0-9a-f]*?)([0-9a-f]{1,8})$/io) + { $hi=$1?"0x$1":"0"; $lo="0x$2"; } + elsif (/^([0-9]+)$/o) + { $hi=$1>>32; $lo=$1&0xffffffff; } # error-prone with 32-bit perl + else + { $hi=undef; $lo=$_; } + + if (defined($hi)) + { push(@ret,$flavour=~/le$/o?".long\t$lo,$hi":".long\t$hi,$lo"); } + else + { push(@ret,".quad $lo"); } + } + join("\n",@ret); +}; + +################################################################ +# simplified mnemonics not handled by at least one assembler +################################################################ +my $cmplw = sub { + my $f = shift; + my $cr = 0; $cr = shift if ($#_>1); + # Some out-of-date 32-bit GNU assembler just can't handle cmplw... + ($flavour =~ /linux.*32/) ? + " .long ".sprintf "0x%x",31<<26|$cr<<23|$_[0]<<16|$_[1]<<11|64 : + " cmplw ".join(',',$cr,@_); +}; +my $bdnz = sub { + my $f = shift; + my $bo = $f=~/[\+\-]/ ? 16+9 : 16; # optional "to be taken" hint + " bc $bo,0,".shift; +} if ($flavour!~/linux/); +my $bltlr = sub { + my $f = shift; + my $bo = $f=~/\-/ ? 12+2 : 12; # optional "not to be taken" hint + ($flavour =~ /linux/) ? # GNU as doesn't allow most recent hints + " .long ".sprintf "0x%x",19<<26|$bo<<21|16<<1 : + " bclr $bo,0"; +}; +my $bnelr = sub { + my $f = shift; + my $bo = $f=~/\-/ ? 4+2 : 4; # optional "not to be taken" hint + ($flavour =~ /linux/) ? # GNU as doesn't allow most recent hints + " .long ".sprintf "0x%x",19<<26|$bo<<21|2<<16|16<<1 : + " bclr $bo,2"; +}; +my $beqlr = sub { + my $f = shift; + my $bo = $f=~/-/ ? 12+2 : 12; # optional "not to be taken" hint + ($flavour =~ /linux/) ? # GNU as doesn't allow most recent hints + " .long ".sprintf "0x%X",19<<26|$bo<<21|2<<16|16<<1 : + " bclr $bo,2"; +}; +# GNU assembler can't handle extrdi rA,rS,16,48, or when sum of last two +# arguments is 64, with "operand out of range" error. +my $extrdi = sub { + my ($f,$ra,$rs,$n,$b) = @_; + $b = ($b+$n)&63; $n = 64-$n; + " rldicl $ra,$rs,$b,$n"; +}; +my $vmr = sub { + my ($f,$vx,$vy) = @_; + " vor $vx,$vy,$vy"; +}; + +# Some ABIs specify vrsave, special-purpose register #256, as reserved +# for system use. +my $no_vrsave = ($flavour =~ /linux-ppc64le/); +my $mtspr = sub { + my ($f,$idx,$ra) = @_; + if ($idx == 256 && $no_vrsave) { + " or $ra,$ra,$ra"; + } else { + " mtspr $idx,$ra"; + } +}; +my $mfspr = sub { + my ($f,$rd,$idx) = @_; + if ($idx == 256 && $no_vrsave) { + " li $rd,-1"; + } else { + " mfspr $rd,$idx"; + } +}; + +# PowerISA 2.06 stuff +sub vsxmem_op { + my ($f, $vrt, $ra, $rb, $op) = @_; + " .long ".sprintf "0x%X",(31<<26)|($vrt<<21)|($ra<<16)|($rb<<11)|($op*2+1); +} +# made-up unaligned memory reference AltiVec/VMX instructions +my $lvx_u = sub { vsxmem_op(@_, 844); }; # lxvd2x +my $stvx_u = sub { vsxmem_op(@_, 972); }; # stxvd2x +my $lvdx_u = sub { vsxmem_op(@_, 588); }; # lxsdx +my $stvdx_u = sub { vsxmem_op(@_, 716); }; # stxsdx +my $lvx_4w = sub { vsxmem_op(@_, 780); }; # lxvw4x +my $stvx_4w = sub { vsxmem_op(@_, 908); }; # stxvw4x + +# PowerISA 2.07 stuff +sub vcrypto_op { + my ($f, $vrt, $vra, $vrb, $op) = @_; + " .long ".sprintf "0x%X",(4<<26)|($vrt<<21)|($vra<<16)|($vrb<<11)|$op; +} +my $vcipher = sub { vcrypto_op(@_, 1288); }; +my $vcipherlast = sub { vcrypto_op(@_, 1289); }; +my $vncipher = sub { vcrypto_op(@_, 1352); }; +my $vncipherlast= sub { vcrypto_op(@_, 1353); }; +my $vsbox = sub { vcrypto_op(@_, 0, 1480); }; +my $vshasigmad = sub { my ($st,$six)=splice(@_,-2); vcrypto_op(@_, $st<<4|$six, 1730); }; +my $vshasigmaw = sub { my ($st,$six)=splice(@_,-2); vcrypto_op(@_, $st<<4|$six, 1666); }; +my $vpmsumb = sub { vcrypto_op(@_, 1032); }; +my $vpmsumd = sub { vcrypto_op(@_, 1224); }; +my $vpmsubh = sub { vcrypto_op(@_, 1096); }; +my $vpmsumw = sub { vcrypto_op(@_, 1160); }; +my $vaddudm = sub { vcrypto_op(@_, 192); }; +my $vadduqm = sub { vcrypto_op(@_, 256); }; + +my $mtsle = sub { + my ($f, $arg) = @_; + " .long ".sprintf "0x%X",(31<<26)|($arg<<21)|(147*2); +}; + +print "#include \n" if $flavour =~ /linux/; + +while($line=<>) { + + $line =~ s|[#!;].*$||; # get rid of asm-style comments... + $line =~ s|/\*.*\*/||; # ... and C-style comments... + $line =~ s|^\s+||; # ... and skip white spaces in beginning... + $line =~ s|\s+$||; # ... and at the end + + { + $line =~ s|\b\.L(\w+)|L$1|g; # common denominator for Locallabel + $line =~ s|\bL(\w+)|\.L$1|g if ($dotinlocallabels); + } + + { + $line =~ s|^\s*(\.?)(\w+)([\.\+\-]?)\s*||; + my $c = $1; $c = "\t" if ($c eq ""); + my $mnemonic = $2; + my $f = $3; + my $opcode = eval("\$$mnemonic"); + $line =~ s/\b(c?[rf]|v|vs)([0-9]+)\b/$2/g if ($c ne "." and $flavour !~ /osx/); + if (ref($opcode) eq 'CODE') { $line = &$opcode($f,split(',',$line)); } + elsif ($mnemonic) { $line = $c.$mnemonic.$f."\t".$line; } + } + + print $line if ($line); + print "\n"; +} + +close STDOUT; From d07bd950b91efb4d6a960347c603f8424a1125d1 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 2 Jan 2023 11:18:46 +0100 Subject: [PATCH 032/156] crypto: skcipher - Use scatterwalk (un)map interface for dst and src buffers The skcipher walk API implementation avoids scatterwalk_map() for mapping the source and destination buffers, and invokes kmap_atomic() directly if the buffer in question is not in low memory (which can only happen on 32-bit architectures). This avoids some overhead on 64-bit architectures, and most notably, permits the skcipher code to run with preemption enabled. Now that scatterwalk_map() has been updated to use kmap_local(), none of this is needed, so we can simply use scatterwalk_map/unmap instead. Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- crypto/skcipher.c | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/crypto/skcipher.c b/crypto/skcipher.c index 0ecab31cfe79..7bf4871fec80 100644 --- a/crypto/skcipher.c +++ b/crypto/skcipher.c @@ -42,38 +42,24 @@ struct skcipher_walk_buffer { static int skcipher_walk_next(struct skcipher_walk *walk); -static inline void skcipher_unmap(struct scatter_walk *walk, void *vaddr) -{ - if (PageHighMem(scatterwalk_page(walk))) - kunmap_atomic(vaddr); -} - -static inline void *skcipher_map(struct scatter_walk *walk) -{ - struct page *page = scatterwalk_page(walk); - - return (PageHighMem(page) ? kmap_atomic(page) : page_address(page)) + - offset_in_page(walk->offset); -} - static inline void skcipher_map_src(struct skcipher_walk *walk) { - walk->src.virt.addr = skcipher_map(&walk->in); + walk->src.virt.addr = scatterwalk_map(&walk->in); } static inline void skcipher_map_dst(struct skcipher_walk *walk) { - walk->dst.virt.addr = skcipher_map(&walk->out); + walk->dst.virt.addr = scatterwalk_map(&walk->out); } static inline void skcipher_unmap_src(struct skcipher_walk *walk) { - skcipher_unmap(&walk->in, walk->src.virt.addr); + scatterwalk_unmap(walk->src.virt.addr); } static inline void skcipher_unmap_dst(struct skcipher_walk *walk) { - skcipher_unmap(&walk->out, walk->dst.virt.addr); + scatterwalk_unmap(walk->dst.virt.addr); } static inline gfp_t skcipher_walk_gfp(struct skcipher_walk *walk) From 2f1cf4e50c956f882c9fc209c7cded832b67b8a3 Mon Sep 17 00:00:00 2001 From: Neal Liu Date: Wed, 4 Jan 2023 09:34:33 +0800 Subject: [PATCH 033/156] crypto: aspeed - Add ACRY RSA driver ACRY Engine is designed to accelerate the throughput of ECDSA/RSA signature and verification. This patch aims to add ACRY RSA engine driver for hardware acceleration. Signed-off-by: Neal Liu Signed-off-by: Herbert Xu --- drivers/crypto/aspeed/Kconfig | 11 + drivers/crypto/aspeed/Makefile | 2 + drivers/crypto/aspeed/aspeed-acry.c | 828 ++++++++++++++++++++++++++++ 3 files changed, 841 insertions(+) create mode 100644 drivers/crypto/aspeed/aspeed-acry.c diff --git a/drivers/crypto/aspeed/Kconfig b/drivers/crypto/aspeed/Kconfig index ae2710ae8d8f..db6c5b4cdc40 100644 --- a/drivers/crypto/aspeed/Kconfig +++ b/drivers/crypto/aspeed/Kconfig @@ -46,3 +46,14 @@ config CRYPTO_DEV_ASPEED_HACE_CRYPTO crypto driver. Supports AES/DES symmetric-key encryption and decryption with ECB/CBC/CFB/OFB/CTR options. + +config CRYPTO_DEV_ASPEED_ACRY + bool "Enable Aspeed ACRY RSA Engine" + depends on CRYPTO_DEV_ASPEED + select CRYPTO_ENGINE + select CRYPTO_RSA + help + Select here to enable Aspeed ECC/RSA Engine (ACRY) + RSA driver. + Supports 256 bits to 4096 bits RSA encryption/decryption + and signature/verification. diff --git a/drivers/crypto/aspeed/Makefile b/drivers/crypto/aspeed/Makefile index a0ed40ddaad1..24284d812b79 100644 --- a/drivers/crypto/aspeed/Makefile +++ b/drivers/crypto/aspeed/Makefile @@ -5,3 +5,5 @@ obj-$(CONFIG_CRYPTO_DEV_ASPEED) += aspeed_crypto.o aspeed_crypto-objs := aspeed-hace.o \ $(hace-hash-y) \ $(hace-crypto-y) + +obj-$(CONFIG_CRYPTO_DEV_ASPEED_ACRY) += aspeed-acry.o diff --git a/drivers/crypto/aspeed/aspeed-acry.c b/drivers/crypto/aspeed/aspeed-acry.c new file mode 100644 index 000000000000..6d3790583f8b --- /dev/null +++ b/drivers/crypto/aspeed/aspeed-acry.c @@ -0,0 +1,828 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Copyright 2021 Aspeed Technology Inc. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_CRYPTO_DEV_ASPEED_DEBUG +#define ACRY_DBG(d, fmt, ...) \ + dev_info((d)->dev, "%s() " fmt, __func__, ##__VA_ARGS__) +#else +#define ACRY_DBG(d, fmt, ...) \ + dev_dbg((d)->dev, "%s() " fmt, __func__, ##__VA_ARGS__) +#endif + +/***************************** + * * + * ACRY register definitions * + * * + * ***************************/ +#define ASPEED_ACRY_TRIGGER 0x000 /* ACRY Engine Control: trigger */ +#define ASPEED_ACRY_DMA_CMD 0x048 /* ACRY Engine Control: Command */ +#define ASPEED_ACRY_DMA_SRC_BASE 0x04C /* ACRY DRAM base address for DMA */ +#define ASPEED_ACRY_DMA_LEN 0x050 /* ACRY Data Length of DMA */ +#define ASPEED_ACRY_RSA_KEY_LEN 0x058 /* ACRY RSA Exp/Mod Key Length (Bits) */ +#define ASPEED_ACRY_INT_MASK 0x3F8 /* ACRY Interrupt Mask */ +#define ASPEED_ACRY_STATUS 0x3FC /* ACRY Interrupt Status */ + +/* rsa trigger */ +#define ACRY_CMD_RSA_TRIGGER BIT(0) +#define ACRY_CMD_DMA_RSA_TRIGGER BIT(1) + +/* rsa dma cmd */ +#define ACRY_CMD_DMA_SRAM_MODE_RSA (0x3 << 4) +#define ACRY_CMD_DMEM_AHB BIT(8) +#define ACRY_CMD_DMA_SRAM_AHB_ENGINE 0 + +/* rsa key len */ +#define RSA_E_BITS_LEN(x) ((x) << 16) +#define RSA_M_BITS_LEN(x) (x) + +/* acry isr */ +#define ACRY_RSA_ISR BIT(1) + +#define ASPEED_ACRY_BUFF_SIZE 0x1800 /* DMA buffer size */ +#define ASPEED_ACRY_SRAM_MAX_LEN 2048 /* ACRY SRAM maximum length (Bytes) */ +#define ASPEED_ACRY_RSA_MAX_KEY_LEN 512 /* ACRY RSA maximum key length (Bytes) */ + +#define CRYPTO_FLAGS_BUSY BIT(1) +#define BYTES_PER_DWORD 4 + +/***************************** + * * + * AHBC register definitions * + * * + * ***************************/ +#define AHBC_REGION_PROT 0x240 +#define REGION_ACRYM BIT(23) + +#define ast_acry_write(acry, val, offset) \ + writel((val), (acry)->regs + (offset)) + +#define ast_acry_read(acry, offset) \ + readl((acry)->regs + (offset)) + +struct aspeed_acry_dev; + +typedef int (*aspeed_acry_fn_t)(struct aspeed_acry_dev *); + +struct aspeed_acry_dev { + void __iomem *regs; + struct device *dev; + int irq; + struct clk *clk; + struct regmap *ahbc; + + struct akcipher_request *req; + struct tasklet_struct done_task; + aspeed_acry_fn_t resume; + unsigned long flags; + + /* ACRY output SRAM buffer */ + void __iomem *acry_sram; + + /* ACRY input DMA buffer */ + void *buf_addr; + dma_addr_t buf_dma_addr; + + struct crypto_engine *crypt_engine_rsa; + + /* ACRY SRAM memory mapped */ + int exp_dw_mapping[ASPEED_ACRY_RSA_MAX_KEY_LEN]; + int mod_dw_mapping[ASPEED_ACRY_RSA_MAX_KEY_LEN]; + int data_byte_mapping[ASPEED_ACRY_SRAM_MAX_LEN]; +}; + +struct aspeed_acry_ctx { + struct crypto_engine_ctx enginectx; + struct aspeed_acry_dev *acry_dev; + + struct rsa_key key; + int enc; + u8 *n; + u8 *e; + u8 *d; + size_t n_sz; + size_t e_sz; + size_t d_sz; + + aspeed_acry_fn_t trigger; + + struct crypto_akcipher *fallback_tfm; +}; + +struct aspeed_acry_alg { + struct aspeed_acry_dev *acry_dev; + struct akcipher_alg akcipher; +}; + +enum aspeed_rsa_key_mode { + ASPEED_RSA_EXP_MODE = 0, + ASPEED_RSA_MOD_MODE, + ASPEED_RSA_DATA_MODE, +}; + +static inline struct akcipher_request * + akcipher_request_cast(struct crypto_async_request *req) +{ + return container_of(req, struct akcipher_request, base); +} + +static int aspeed_acry_do_fallback(struct akcipher_request *req) +{ + struct crypto_akcipher *cipher = crypto_akcipher_reqtfm(req); + struct aspeed_acry_ctx *ctx = akcipher_tfm_ctx(cipher); + int err; + + akcipher_request_set_tfm(req, ctx->fallback_tfm); + + if (ctx->enc) + err = crypto_akcipher_encrypt(req); + else + err = crypto_akcipher_decrypt(req); + + akcipher_request_set_tfm(req, cipher); + + return err; +} + +static bool aspeed_acry_need_fallback(struct akcipher_request *req) +{ + struct crypto_akcipher *cipher = crypto_akcipher_reqtfm(req); + struct aspeed_acry_ctx *ctx = akcipher_tfm_ctx(cipher); + + return ctx->key.n_sz > ASPEED_ACRY_RSA_MAX_KEY_LEN; +} + +static int aspeed_acry_handle_queue(struct aspeed_acry_dev *acry_dev, + struct akcipher_request *req) +{ + if (aspeed_acry_need_fallback(req)) { + ACRY_DBG(acry_dev, "SW fallback\n"); + return aspeed_acry_do_fallback(req); + } + + return crypto_transfer_akcipher_request_to_engine(acry_dev->crypt_engine_rsa, req); +} + +static int aspeed_acry_do_request(struct crypto_engine *engine, void *areq) +{ + struct akcipher_request *req = akcipher_request_cast(areq); + struct crypto_akcipher *cipher = crypto_akcipher_reqtfm(req); + struct aspeed_acry_ctx *ctx = akcipher_tfm_ctx(cipher); + struct aspeed_acry_dev *acry_dev = ctx->acry_dev; + + acry_dev->req = req; + acry_dev->flags |= CRYPTO_FLAGS_BUSY; + + return ctx->trigger(acry_dev); +} + +static int aspeed_acry_complete(struct aspeed_acry_dev *acry_dev, int err) +{ + struct akcipher_request *req = acry_dev->req; + + acry_dev->flags &= ~CRYPTO_FLAGS_BUSY; + + crypto_finalize_akcipher_request(acry_dev->crypt_engine_rsa, req, err); + + return err; +} + +/* + * Copy Data to DMA buffer for engine used. + */ +static void aspeed_acry_rsa_sg_copy_to_buffer(struct aspeed_acry_dev *acry_dev, + u8 *buf, struct scatterlist *src, + size_t nbytes) +{ + static u8 dram_buffer[ASPEED_ACRY_SRAM_MAX_LEN]; + int i = 0, j; + int data_idx; + + ACRY_DBG(acry_dev, "\n"); + + scatterwalk_map_and_copy(dram_buffer, src, 0, nbytes, 0); + + for (j = nbytes - 1; j >= 0; j--) { + data_idx = acry_dev->data_byte_mapping[i]; + buf[data_idx] = dram_buffer[j]; + i++; + } + + for (; i < ASPEED_ACRY_SRAM_MAX_LEN; i++) { + data_idx = acry_dev->data_byte_mapping[i]; + buf[data_idx] = 0; + } +} + +/* + * Copy Exp/Mod to DMA buffer for engine used. + * + * Params: + * - mode 0 : Exponential + * - mode 1 : Modulus + * + * Example: + * - DRAM memory layout: + * D[0], D[4], D[8], D[12] + * - ACRY SRAM memory layout should reverse the order of source data: + * D[12], D[8], D[4], D[0] + */ +static int aspeed_acry_rsa_ctx_copy(struct aspeed_acry_dev *acry_dev, void *buf, + const void *xbuf, size_t nbytes, + enum aspeed_rsa_key_mode mode) +{ + const u8 *src = xbuf; + u32 *dw_buf = (u32 *)buf; + int nbits, ndw; + int i, j, idx; + u32 data = 0; + + ACRY_DBG(acry_dev, "nbytes:%zu, mode:%d\n", nbytes, mode); + + if (nbytes > ASPEED_ACRY_RSA_MAX_KEY_LEN) + return -ENOMEM; + + /* Remove the leading zeros */ + while (nbytes > 0 && src[0] == 0) { + src++; + nbytes--; + } + + nbits = nbytes * 8; + if (nbytes > 0) + nbits -= count_leading_zeros(src[0]) - (BITS_PER_LONG - 8); + + /* double-world alignment */ + ndw = DIV_ROUND_UP(nbytes, BYTES_PER_DWORD); + + if (nbytes > 0) { + i = BYTES_PER_DWORD - nbytes % BYTES_PER_DWORD; + i %= BYTES_PER_DWORD; + + for (j = ndw; j > 0; j--) { + for (; i < BYTES_PER_DWORD; i++) { + data <<= 8; + data |= *src++; + } + + i = 0; + + if (mode == ASPEED_RSA_EXP_MODE) + idx = acry_dev->exp_dw_mapping[j - 1]; + else if (mode == ASPEED_RSA_MOD_MODE) + idx = acry_dev->mod_dw_mapping[j - 1]; + + dw_buf[idx] = cpu_to_le32(data); + } + } + + return nbits; +} + +static int aspeed_acry_rsa_transfer(struct aspeed_acry_dev *acry_dev) +{ + struct akcipher_request *req = acry_dev->req; + u8 *sram_buffer = (u8 *)acry_dev->acry_sram; + struct scatterlist *out_sg = req->dst; + static u8 dram_buffer[ASPEED_ACRY_SRAM_MAX_LEN]; + int leading_zero = 1; + int result_nbytes; + int i = 0, j; + int data_idx; + + /* Set Data Memory to AHB(CPU) Access Mode */ + ast_acry_write(acry_dev, ACRY_CMD_DMEM_AHB, ASPEED_ACRY_DMA_CMD); + + /* Disable ACRY SRAM protection */ + regmap_update_bits(acry_dev->ahbc, AHBC_REGION_PROT, + REGION_ACRYM, 0); + + result_nbytes = ASPEED_ACRY_SRAM_MAX_LEN; + + for (j = ASPEED_ACRY_SRAM_MAX_LEN - 1; j >= 0; j--) { + data_idx = acry_dev->data_byte_mapping[j]; + if (sram_buffer[data_idx] == 0 && leading_zero) { + result_nbytes--; + } else { + leading_zero = 0; + dram_buffer[i] = sram_buffer[data_idx]; + i++; + } + } + + ACRY_DBG(acry_dev, "result_nbytes:%d, req->dst_len:%d\n", + result_nbytes, req->dst_len); + + if (result_nbytes <= req->dst_len) { + scatterwalk_map_and_copy(dram_buffer, out_sg, 0, result_nbytes, + 1); + req->dst_len = result_nbytes; + + } else { + dev_err(acry_dev->dev, "RSA engine error!\n"); + } + + memzero_explicit(acry_dev->buf_addr, ASPEED_ACRY_BUFF_SIZE); + + return aspeed_acry_complete(acry_dev, 0); +} + +static int aspeed_acry_rsa_trigger(struct aspeed_acry_dev *acry_dev) +{ + struct akcipher_request *req = acry_dev->req; + struct crypto_akcipher *cipher = crypto_akcipher_reqtfm(req); + struct aspeed_acry_ctx *ctx = akcipher_tfm_ctx(cipher); + int ne, nm; + + if (!ctx->n || !ctx->n_sz) { + dev_err(acry_dev->dev, "%s: key n is not set\n", __func__); + return -EINVAL; + } + + memzero_explicit(acry_dev->buf_addr, ASPEED_ACRY_BUFF_SIZE); + + /* Copy source data to DMA buffer */ + aspeed_acry_rsa_sg_copy_to_buffer(acry_dev, acry_dev->buf_addr, + req->src, req->src_len); + + nm = aspeed_acry_rsa_ctx_copy(acry_dev, acry_dev->buf_addr, ctx->n, + ctx->n_sz, ASPEED_RSA_MOD_MODE); + if (ctx->enc) { + if (!ctx->e || !ctx->e_sz) { + dev_err(acry_dev->dev, "%s: key e is not set\n", + __func__); + return -EINVAL; + } + /* Copy key e to DMA buffer */ + ne = aspeed_acry_rsa_ctx_copy(acry_dev, acry_dev->buf_addr, + ctx->e, ctx->e_sz, + ASPEED_RSA_EXP_MODE); + } else { + if (!ctx->d || !ctx->d_sz) { + dev_err(acry_dev->dev, "%s: key d is not set\n", + __func__); + return -EINVAL; + } + /* Copy key d to DMA buffer */ + ne = aspeed_acry_rsa_ctx_copy(acry_dev, acry_dev->buf_addr, + ctx->key.d, ctx->key.d_sz, + ASPEED_RSA_EXP_MODE); + } + + ast_acry_write(acry_dev, acry_dev->buf_dma_addr, + ASPEED_ACRY_DMA_SRC_BASE); + ast_acry_write(acry_dev, (ne << 16) + nm, + ASPEED_ACRY_RSA_KEY_LEN); + ast_acry_write(acry_dev, ASPEED_ACRY_BUFF_SIZE, + ASPEED_ACRY_DMA_LEN); + + acry_dev->resume = aspeed_acry_rsa_transfer; + + /* Enable ACRY SRAM protection */ + regmap_update_bits(acry_dev->ahbc, AHBC_REGION_PROT, + REGION_ACRYM, REGION_ACRYM); + + ast_acry_write(acry_dev, ACRY_RSA_ISR, ASPEED_ACRY_INT_MASK); + ast_acry_write(acry_dev, ACRY_CMD_DMA_SRAM_MODE_RSA | + ACRY_CMD_DMA_SRAM_AHB_ENGINE, ASPEED_ACRY_DMA_CMD); + + /* Trigger RSA engines */ + ast_acry_write(acry_dev, ACRY_CMD_RSA_TRIGGER | + ACRY_CMD_DMA_RSA_TRIGGER, ASPEED_ACRY_TRIGGER); + + return 0; +} + +static int aspeed_acry_rsa_enc(struct akcipher_request *req) +{ + struct crypto_akcipher *cipher = crypto_akcipher_reqtfm(req); + struct aspeed_acry_ctx *ctx = akcipher_tfm_ctx(cipher); + struct aspeed_acry_dev *acry_dev = ctx->acry_dev; + + ctx->trigger = aspeed_acry_rsa_trigger; + ctx->enc = 1; + + return aspeed_acry_handle_queue(acry_dev, req); +} + +static int aspeed_acry_rsa_dec(struct akcipher_request *req) +{ + struct crypto_akcipher *cipher = crypto_akcipher_reqtfm(req); + struct aspeed_acry_ctx *ctx = akcipher_tfm_ctx(cipher); + struct aspeed_acry_dev *acry_dev = ctx->acry_dev; + + ctx->trigger = aspeed_acry_rsa_trigger; + ctx->enc = 0; + + return aspeed_acry_handle_queue(acry_dev, req); +} + +static u8 *aspeed_rsa_key_copy(u8 *src, size_t len) +{ + return kmemdup(src, len, GFP_KERNEL); +} + +static int aspeed_rsa_set_n(struct aspeed_acry_ctx *ctx, u8 *value, + size_t len) +{ + ctx->n_sz = len; + ctx->n = aspeed_rsa_key_copy(value, len); + if (!ctx->n) + return -ENOMEM; + + return 0; +} + +static int aspeed_rsa_set_e(struct aspeed_acry_ctx *ctx, u8 *value, + size_t len) +{ + ctx->e_sz = len; + ctx->e = aspeed_rsa_key_copy(value, len); + if (!ctx->e) + return -ENOMEM; + + return 0; +} + +static int aspeed_rsa_set_d(struct aspeed_acry_ctx *ctx, u8 *value, + size_t len) +{ + ctx->d_sz = len; + ctx->d = aspeed_rsa_key_copy(value, len); + if (!ctx->d) + return -ENOMEM; + + return 0; +} + +static void aspeed_rsa_key_free(struct aspeed_acry_ctx *ctx) +{ + kfree_sensitive(ctx->n); + kfree_sensitive(ctx->e); + kfree_sensitive(ctx->d); + ctx->n_sz = 0; + ctx->e_sz = 0; + ctx->d_sz = 0; +} + +static int aspeed_acry_rsa_setkey(struct crypto_akcipher *tfm, const void *key, + unsigned int keylen, int priv) +{ + struct aspeed_acry_ctx *ctx = akcipher_tfm_ctx(tfm); + struct aspeed_acry_dev *acry_dev = ctx->acry_dev; + int ret; + + if (priv) + ret = rsa_parse_priv_key(&ctx->key, key, keylen); + else + ret = rsa_parse_pub_key(&ctx->key, key, keylen); + + if (ret) { + dev_err(acry_dev->dev, "rsa parse key failed, ret:0x%x\n", + ret); + return ret; + } + + /* Aspeed engine supports up to 4096 bits, + * Use software fallback instead. + */ + if (ctx->key.n_sz > ASPEED_ACRY_RSA_MAX_KEY_LEN) + return 0; + + ret = aspeed_rsa_set_n(ctx, (u8 *)ctx->key.n, ctx->key.n_sz); + if (ret) + goto err; + + ret = aspeed_rsa_set_e(ctx, (u8 *)ctx->key.e, ctx->key.e_sz); + if (ret) + goto err; + + if (priv) { + ret = aspeed_rsa_set_d(ctx, (u8 *)ctx->key.d, ctx->key.d_sz); + if (ret) + goto err; + } + + return 0; + +err: + dev_err(acry_dev->dev, "rsa set key failed\n"); + aspeed_rsa_key_free(ctx); + + return ret; +} + +static int aspeed_acry_rsa_set_pub_key(struct crypto_akcipher *tfm, + const void *key, + unsigned int keylen) +{ + struct aspeed_acry_ctx *ctx = akcipher_tfm_ctx(tfm); + int ret; + + ret = crypto_akcipher_set_pub_key(ctx->fallback_tfm, key, keylen); + if (ret) + return ret; + + return aspeed_acry_rsa_setkey(tfm, key, keylen, 0); +} + +static int aspeed_acry_rsa_set_priv_key(struct crypto_akcipher *tfm, + const void *key, + unsigned int keylen) +{ + struct aspeed_acry_ctx *ctx = akcipher_tfm_ctx(tfm); + int ret; + + ret = crypto_akcipher_set_priv_key(ctx->fallback_tfm, key, keylen); + if (ret) + return ret; + + return aspeed_acry_rsa_setkey(tfm, key, keylen, 1); +} + +static unsigned int aspeed_acry_rsa_max_size(struct crypto_akcipher *tfm) +{ + struct aspeed_acry_ctx *ctx = akcipher_tfm_ctx(tfm); + + if (ctx->key.n_sz > ASPEED_ACRY_RSA_MAX_KEY_LEN) + return crypto_akcipher_maxsize(ctx->fallback_tfm); + + return ctx->n_sz; +} + +static int aspeed_acry_rsa_init_tfm(struct crypto_akcipher *tfm) +{ + struct aspeed_acry_ctx *ctx = akcipher_tfm_ctx(tfm); + struct akcipher_alg *alg = crypto_akcipher_alg(tfm); + const char *name = crypto_tfm_alg_name(&tfm->base); + struct aspeed_acry_alg *acry_alg; + + acry_alg = container_of(alg, struct aspeed_acry_alg, akcipher); + + ctx->acry_dev = acry_alg->acry_dev; + + ctx->fallback_tfm = crypto_alloc_akcipher(name, 0, CRYPTO_ALG_ASYNC | + CRYPTO_ALG_NEED_FALLBACK); + if (IS_ERR(ctx->fallback_tfm)) { + dev_err(ctx->acry_dev->dev, "ERROR: Cannot allocate fallback for %s %ld\n", + name, PTR_ERR(ctx->fallback_tfm)); + return PTR_ERR(ctx->fallback_tfm); + } + + ctx->enginectx.op.do_one_request = aspeed_acry_do_request; + ctx->enginectx.op.prepare_request = NULL; + ctx->enginectx.op.unprepare_request = NULL; + + return 0; +} + +static void aspeed_acry_rsa_exit_tfm(struct crypto_akcipher *tfm) +{ + struct aspeed_acry_ctx *ctx = akcipher_tfm_ctx(tfm); + + crypto_free_akcipher(ctx->fallback_tfm); +} + +struct aspeed_acry_alg aspeed_acry_akcipher_algs[] = { + { + .akcipher = { + .encrypt = aspeed_acry_rsa_enc, + .decrypt = aspeed_acry_rsa_dec, + .sign = aspeed_acry_rsa_dec, + .verify = aspeed_acry_rsa_enc, + .set_pub_key = aspeed_acry_rsa_set_pub_key, + .set_priv_key = aspeed_acry_rsa_set_priv_key, + .max_size = aspeed_acry_rsa_max_size, + .init = aspeed_acry_rsa_init_tfm, + .exit = aspeed_acry_rsa_exit_tfm, + .base = { + .cra_name = "rsa", + .cra_driver_name = "aspeed-rsa", + .cra_priority = 300, + .cra_flags = CRYPTO_ALG_TYPE_AKCIPHER | + CRYPTO_ALG_ASYNC | + CRYPTO_ALG_KERN_DRIVER_ONLY | + CRYPTO_ALG_NEED_FALLBACK, + .cra_module = THIS_MODULE, + .cra_ctxsize = sizeof(struct aspeed_acry_ctx), + }, + }, + }, +}; + +static void aspeed_acry_register(struct aspeed_acry_dev *acry_dev) +{ + int i, rc; + + for (i = 0; i < ARRAY_SIZE(aspeed_acry_akcipher_algs); i++) { + aspeed_acry_akcipher_algs[i].acry_dev = acry_dev; + rc = crypto_register_akcipher(&aspeed_acry_akcipher_algs[i].akcipher); + if (rc) { + ACRY_DBG(acry_dev, "Failed to register %s\n", + aspeed_acry_akcipher_algs[i].akcipher.base.cra_name); + } + } +} + +static void aspeed_acry_unregister(struct aspeed_acry_dev *acry_dev) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(aspeed_acry_akcipher_algs); i++) + crypto_unregister_akcipher(&aspeed_acry_akcipher_algs[i].akcipher); +} + +/* ACRY interrupt service routine. */ +static irqreturn_t aspeed_acry_irq(int irq, void *dev) +{ + struct aspeed_acry_dev *acry_dev = (struct aspeed_acry_dev *)dev; + u32 sts; + + sts = ast_acry_read(acry_dev, ASPEED_ACRY_STATUS); + ast_acry_write(acry_dev, sts, ASPEED_ACRY_STATUS); + + ACRY_DBG(acry_dev, "irq sts:0x%x\n", sts); + + if (sts & ACRY_RSA_ISR) { + /* Stop RSA engine */ + ast_acry_write(acry_dev, 0, ASPEED_ACRY_TRIGGER); + + if (acry_dev->flags & CRYPTO_FLAGS_BUSY) + tasklet_schedule(&acry_dev->done_task); + else + dev_err(acry_dev->dev, "RSA no active requests.\n"); + } + + return IRQ_HANDLED; +} + +/* + * ACRY SRAM has its own memory layout. + * Set the DRAM to SRAM indexing for future used. + */ +static void aspeed_acry_sram_mapping(struct aspeed_acry_dev *acry_dev) +{ + int i, j = 0; + + for (i = 0; i < (ASPEED_ACRY_SRAM_MAX_LEN / BYTES_PER_DWORD); i++) { + acry_dev->exp_dw_mapping[i] = j; + acry_dev->mod_dw_mapping[i] = j + 4; + acry_dev->data_byte_mapping[(i * 4)] = (j + 8) * 4; + acry_dev->data_byte_mapping[(i * 4) + 1] = (j + 8) * 4 + 1; + acry_dev->data_byte_mapping[(i * 4) + 2] = (j + 8) * 4 + 2; + acry_dev->data_byte_mapping[(i * 4) + 3] = (j + 8) * 4 + 3; + j++; + j = j % 4 ? j : j + 8; + } +} + +static void aspeed_acry_done_task(unsigned long data) +{ + struct aspeed_acry_dev *acry_dev = (struct aspeed_acry_dev *)data; + + (void)acry_dev->resume(acry_dev); +} + +static const struct of_device_id aspeed_acry_of_matches[] = { + { .compatible = "aspeed,ast2600-acry", }, + {}, +}; + +static int aspeed_acry_probe(struct platform_device *pdev) +{ + struct aspeed_acry_dev *acry_dev; + struct device *dev = &pdev->dev; + struct resource *res; + int rc; + + acry_dev = devm_kzalloc(dev, sizeof(struct aspeed_acry_dev), + GFP_KERNEL); + if (!acry_dev) + return -ENOMEM; + + acry_dev->dev = dev; + + platform_set_drvdata(pdev, acry_dev); + + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + acry_dev->regs = devm_ioremap_resource(dev, res); + if (IS_ERR(acry_dev->regs)) + return PTR_ERR(acry_dev->regs); + + res = platform_get_resource(pdev, IORESOURCE_MEM, 1); + acry_dev->acry_sram = devm_ioremap_resource(dev, res); + if (IS_ERR(acry_dev->acry_sram)) + return PTR_ERR(acry_dev->acry_sram); + + /* Get irq number and register it */ + acry_dev->irq = platform_get_irq(pdev, 0); + if (acry_dev->irq < 0) + return -ENXIO; + + rc = devm_request_irq(dev, acry_dev->irq, aspeed_acry_irq, 0, + dev_name(dev), acry_dev); + if (rc) { + dev_err(dev, "Failed to request irq.\n"); + return rc; + } + + acry_dev->clk = devm_clk_get_enabled(dev, NULL); + if (IS_ERR(acry_dev->clk)) { + dev_err(dev, "Failed to get acry clk\n"); + return PTR_ERR(acry_dev->clk); + } + + acry_dev->ahbc = syscon_regmap_lookup_by_phandle(dev->of_node, + "aspeed,ahbc"); + if (IS_ERR(acry_dev->ahbc)) { + dev_err(dev, "Failed to get AHBC regmap\n"); + return -ENODEV; + } + + /* Initialize crypto hardware engine structure for RSA */ + acry_dev->crypt_engine_rsa = crypto_engine_alloc_init(dev, true); + if (!acry_dev->crypt_engine_rsa) { + rc = -ENOMEM; + goto clk_exit; + } + + rc = crypto_engine_start(acry_dev->crypt_engine_rsa); + if (rc) + goto err_engine_rsa_start; + + tasklet_init(&acry_dev->done_task, aspeed_acry_done_task, + (unsigned long)acry_dev); + + /* Set Data Memory to AHB(CPU) Access Mode */ + ast_acry_write(acry_dev, ACRY_CMD_DMEM_AHB, ASPEED_ACRY_DMA_CMD); + + /* Initialize ACRY SRAM index */ + aspeed_acry_sram_mapping(acry_dev); + + acry_dev->buf_addr = dmam_alloc_coherent(dev, ASPEED_ACRY_BUFF_SIZE, + &acry_dev->buf_dma_addr, + GFP_KERNEL); + memzero_explicit(acry_dev->buf_addr, ASPEED_ACRY_BUFF_SIZE); + + aspeed_acry_register(acry_dev); + + dev_info(dev, "Aspeed ACRY Accelerator successfully registered\n"); + + return 0; + +err_engine_rsa_start: + crypto_engine_exit(acry_dev->crypt_engine_rsa); +clk_exit: + clk_disable_unprepare(acry_dev->clk); + + return rc; +} + +static int aspeed_acry_remove(struct platform_device *pdev) +{ + struct aspeed_acry_dev *acry_dev = platform_get_drvdata(pdev); + + aspeed_acry_unregister(acry_dev); + crypto_engine_exit(acry_dev->crypt_engine_rsa); + tasklet_kill(&acry_dev->done_task); + clk_disable_unprepare(acry_dev->clk); + + return 0; +} + +MODULE_DEVICE_TABLE(of, aspeed_acry_of_matches); + +static struct platform_driver aspeed_acry_driver = { + .probe = aspeed_acry_probe, + .remove = aspeed_acry_remove, + .driver = { + .name = KBUILD_MODNAME, + .of_match_table = aspeed_acry_of_matches, + }, +}; + +module_platform_driver(aspeed_acry_driver); + +MODULE_AUTHOR("Neal Liu "); +MODULE_DESCRIPTION("ASPEED ACRY driver for hardware RSA Engine"); +MODULE_LICENSE("GPL"); From 62462a525b49e5b3096895b28a217fd3ec0b3caa Mon Sep 17 00:00:00 2001 From: Neal Liu Date: Wed, 4 Jan 2023 09:34:34 +0800 Subject: [PATCH 034/156] ARM: dts: aspeed: Add ACRY/AHBC device controller node Add acry & ahbc node to device tree for AST2600. Signed-off-by: Neal Liu Signed-off-by: Herbert Xu --- arch/arm/boot/dts/aspeed-g6.dtsi | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/arch/arm/boot/dts/aspeed-g6.dtsi b/arch/arm/boot/dts/aspeed-g6.dtsi index cc2f8b785917..8246a60de0d0 100644 --- a/arch/arm/boot/dts/aspeed-g6.dtsi +++ b/arch/arm/boot/dts/aspeed-g6.dtsi @@ -98,6 +98,11 @@ <0x40466000 0x2000>; }; + ahbc: bus@1e600000 { + compatible = "aspeed,ast2600-ahbc", "syscon"; + reg = <0x1e600000 0x100>; + }; + fmc: spi@1e620000 { reg = <0x1e620000 0xc4>, <0x20000000 0x10000000>; #address-cells = <1>; @@ -431,6 +436,14 @@ reg = <0x1e6f2000 0x1000>; }; + acry: crypto@1e6fa000 { + compatible = "aspeed,ast2600-acry"; + reg = <0x1e6fa000 0x400>, <0x1e710000 0x1800>; + interrupts = ; + clocks = <&syscon ASPEED_CLK_GATE_RSACLK>; + aspeed,ahbc = <&ahbc>; + }; + video: video@1e700000 { compatible = "aspeed,ast2600-video-engine"; reg = <0x1e700000 0x1000>; From ce43b4b20e2b4aa209956c5773ac97d1beea62c4 Mon Sep 17 00:00:00 2001 From: Neal Liu Date: Wed, 4 Jan 2023 09:34:35 +0800 Subject: [PATCH 035/156] dt-bindings: crypto: add documentation for Aspeed ACRY Add device tree binding documentation for the Aspeed ECDSA/RSA ACRY Engines Controller. Signed-off-by: Neal Liu Reviewed-by: Rob Herring Signed-off-by: Herbert Xu --- .../bindings/crypto/aspeed,ast2600-acry.yaml | 49 +++++++++++++++++++ MAINTAINERS | 2 +- 2 files changed, 50 insertions(+), 1 deletion(-) create mode 100644 Documentation/devicetree/bindings/crypto/aspeed,ast2600-acry.yaml diff --git a/Documentation/devicetree/bindings/crypto/aspeed,ast2600-acry.yaml b/Documentation/devicetree/bindings/crypto/aspeed,ast2600-acry.yaml new file mode 100644 index 000000000000..b18f178aac06 --- /dev/null +++ b/Documentation/devicetree/bindings/crypto/aspeed,ast2600-acry.yaml @@ -0,0 +1,49 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/crypto/aspeed,ast2600-acry.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: ASPEED ACRY ECDSA/RSA Hardware Accelerator Engines + +maintainers: + - Neal Liu + +description: + The ACRY ECDSA/RSA engines is designed to accelerate the throughput + of ECDSA/RSA signature and verification. Basically, ACRY can be + divided into two independent engines - ECC Engine and RSA Engine. + +properties: + compatible: + enum: + - aspeed,ast2600-acry + + reg: + items: + - description: acry base address & size + - description: acry sram base address & size + + clocks: + maxItems: 1 + + interrupts: + maxItems: 1 + +required: + - compatible + - reg + - clocks + - interrupts + +additionalProperties: false + +examples: + - | + #include + acry: crypto@1e6fa000 { + compatible = "aspeed,ast2600-acry"; + reg = <0x1e6fa000 0x400>, <0x1e710000 0x1800>; + interrupts = <160>; + clocks = <&syscon ASPEED_CLK_GATE_RSACLK>; + }; diff --git a/MAINTAINERS b/MAINTAINERS index f61eb221415b..c4b4b9283d18 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3310,7 +3310,7 @@ ASPEED CRYPTO DRIVER M: Neal Liu L: linux-aspeed@lists.ozlabs.org (moderated for non-subscribers) S: Maintained -F: Documentation/devicetree/bindings/crypto/aspeed,ast2500-hace.yaml +F: Documentation/devicetree/bindings/crypto/aspeed,* F: drivers/crypto/aspeed/ ASUS NOTEBOOKS AND EEEPC ACPI/WMI EXTRAS DRIVERS From 55ef6c811b8481b65e54217010a62c4074e19302 Mon Sep 17 00:00:00 2001 From: Neal Liu Date: Wed, 4 Jan 2023 09:34:36 +0800 Subject: [PATCH 036/156] dt-bindings: bus: add documentation for Aspeed AHBC Add device tree binding documentation for the Aspeed Advanced High-Performance Bus (AHB) Controller. Signed-off-by: Neal Liu Reviewed-by: Rob Herring Signed-off-by: Herbert Xu --- .../bindings/bus/aspeed,ast2600-ahbc.yaml | 37 +++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 Documentation/devicetree/bindings/bus/aspeed,ast2600-ahbc.yaml diff --git a/Documentation/devicetree/bindings/bus/aspeed,ast2600-ahbc.yaml b/Documentation/devicetree/bindings/bus/aspeed,ast2600-ahbc.yaml new file mode 100644 index 000000000000..2894256c976d --- /dev/null +++ b/Documentation/devicetree/bindings/bus/aspeed,ast2600-ahbc.yaml @@ -0,0 +1,37 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/bus/aspeed,ast2600-ahbc.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: ASPEED Advanced High-Performance Bus Controller (AHBC) + +maintainers: + - Neal Liu + - Chia-Wei Wang + +description: | + Advanced High-performance Bus Controller (AHBC) supports plenty of mechanisms + including a priority arbiter, an address decoder and a data multiplexer + to control the overall operations of Advanced High-performance Bus (AHB). + +properties: + compatible: + enum: + - aspeed,ast2600-ahbc + + reg: + maxItems: 1 + +required: + - compatible + - reg + +additionalProperties: false + +examples: + - | + ahbc@1e600000 { + compatible = "aspeed,ast2600-ahbc"; + reg = <0x1e600000 0x100>; + }; From 197286f86012250191bfe616dc4e9b8e34380c35 Mon Sep 17 00:00:00 2001 From: Samuel Holland Date: Sat, 31 Dec 2022 16:01:43 -0600 Subject: [PATCH 037/156] dt-bindings: crypto: sun8i-ce: Add compatible for D1 D1 has a crypto engine similar to the one in other Allwinner SoCs. Like H6, it has a separate MBUS clock gate. It also requires the internal RC oscillator to be enabled for the TRNG to return data, presumably because noise from the oscillator is used as an entropy source. This is likely the case for earlier variants as well, but it really only matters for H616 and newer SoCs, as H6 provides no way to disable the internal oscillator. Signed-off-by: Samuel Holland Reviewed-by: Krzysztof Kozlowski Acked-by: Corentin Labbe Signed-off-by: Herbert Xu --- .../bindings/crypto/allwinner,sun8i-ce.yaml | 33 ++++++++++++++----- 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/Documentation/devicetree/bindings/crypto/allwinner,sun8i-ce.yaml b/Documentation/devicetree/bindings/crypto/allwinner,sun8i-ce.yaml index 026a9f9e1aeb..4287678aa79f 100644 --- a/Documentation/devicetree/bindings/crypto/allwinner,sun8i-ce.yaml +++ b/Documentation/devicetree/bindings/crypto/allwinner,sun8i-ce.yaml @@ -14,6 +14,7 @@ properties: enum: - allwinner,sun8i-h3-crypto - allwinner,sun8i-r40-crypto + - allwinner,sun20i-d1-crypto - allwinner,sun50i-a64-crypto - allwinner,sun50i-h5-crypto - allwinner,sun50i-h6-crypto @@ -29,6 +30,7 @@ properties: - description: Bus clock - description: Module clock - description: MBus clock + - description: TRNG clock (RC oscillator) minItems: 2 clock-names: @@ -36,6 +38,7 @@ properties: - const: bus - const: mod - const: ram + - const: trng minItems: 2 resets: @@ -44,19 +47,33 @@ properties: if: properties: compatible: - const: allwinner,sun50i-h6-crypto + enum: + - allwinner,sun20i-d1-crypto then: properties: clocks: - minItems: 3 + minItems: 4 clock-names: - minItems: 3 + minItems: 4 else: - properties: - clocks: - maxItems: 2 - clock-names: - maxItems: 2 + if: + properties: + compatible: + const: allwinner,sun50i-h6-crypto + then: + properties: + clocks: + minItems: 3 + maxItems: 3 + clock-names: + minItems: 3 + maxItems: 3 + else: + properties: + clocks: + maxItems: 2 + clock-names: + maxItems: 2 required: - compatible From f81c1d4a6d3f6b9f31cb8feec13a4357b68a95d6 Mon Sep 17 00:00:00 2001 From: Samuel Holland Date: Sat, 31 Dec 2022 16:01:44 -0600 Subject: [PATCH 038/156] crypto: sun8i-ce - Add TRNG clock to the D1 variant At least the D1 variant requires a separate clock for the TRNG. Without this clock enabled, reading from /dev/hwrng reports: sun8i-ce 3040000.crypto: DMA timeout for TRNG (tm=96) on flow 3 Experimentation shows that the necessary clock is the SoC's internal RC oscillator. This makes sense, as noise from the oscillator can be used as a source of entropy. Signed-off-by: Samuel Holland Reviewed-by: Jernej Skrabec Acked-by: Corentin Labbe Signed-off-by: Herbert Xu --- drivers/crypto/allwinner/sun8i-ce/sun8i-ce-core.c | 1 + drivers/crypto/allwinner/sun8i-ce/sun8i-ce.h | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-core.c b/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-core.c index 9f6594699835..a6865ff4d400 100644 --- a/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-core.c +++ b/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-core.c @@ -118,6 +118,7 @@ static const struct ce_variant ce_d1_variant = { { "bus", 0, 200000000 }, { "mod", 300000000, 0 }, { "ram", 0, 400000000 }, + { "trng", 0, 0 }, }, .esr = ESR_D1, .prng = CE_ALG_PRNG, diff --git a/drivers/crypto/allwinner/sun8i-ce/sun8i-ce.h b/drivers/crypto/allwinner/sun8i-ce/sun8i-ce.h index 8177aaba4434..27029fb77e29 100644 --- a/drivers/crypto/allwinner/sun8i-ce/sun8i-ce.h +++ b/drivers/crypto/allwinner/sun8i-ce/sun8i-ce.h @@ -105,7 +105,7 @@ #define MAX_SG 8 -#define CE_MAX_CLOCKS 3 +#define CE_MAX_CLOCKS 4 #define MAXFLOW 4 From 596f674dc9a7b5456f6b01de3211e0b820786dc8 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Wed, 18 Jan 2023 17:07:19 +0800 Subject: [PATCH 039/156] crypto: p10-aes-gcm - Revert implementation Revert the changes that added p10-aes-gcm: 0781bbd7eaca ("crypto: p10-aes-gcm - A perl script to process PowerPC assembler source") 41a6437ab415 ("crypto: p10-aes-gcm - Supporting functions for ghash") 3b47eccaaff4 ("crypto: p10-aes-gcm - Supporting functions for AES") ca68a96c37eb ("crypto: p10-aes-gcm - An accelerated AES/GCM stitched implementation") cc40379b6e19 ("crypto: p10-aes-gcm - Glue code for AES/GCM stitched implementation") 3c657e8689ab ("crypto: p10-aes-gcm - Update Kconfig and Makefile") These changes fail to build in many configurations and are not ready for prime time. Signed-off-by: Herbert Xu --- arch/powerpc/crypto/Kconfig | 11 - arch/powerpc/crypto/Makefile | 10 - arch/powerpc/crypto/aesp8-ppc.pl | 3846 ------------------------ arch/powerpc/crypto/ghashp8-ppc.pl | 370 --- arch/powerpc/crypto/p10-aes-gcm-glue.c | 345 --- arch/powerpc/crypto/p10_aes_gcm.S | 1519 ---------- arch/powerpc/crypto/ppc-xlate.pl | 229 -- 7 files changed, 6330 deletions(-) delete mode 100644 arch/powerpc/crypto/aesp8-ppc.pl delete mode 100644 arch/powerpc/crypto/ghashp8-ppc.pl delete mode 100644 arch/powerpc/crypto/p10-aes-gcm-glue.c delete mode 100644 arch/powerpc/crypto/p10_aes_gcm.S delete mode 100644 arch/powerpc/crypto/ppc-xlate.pl diff --git a/arch/powerpc/crypto/Kconfig b/arch/powerpc/crypto/Kconfig index db7d99383993..c1b964447401 100644 --- a/arch/powerpc/crypto/Kconfig +++ b/arch/powerpc/crypto/Kconfig @@ -94,15 +94,4 @@ config CRYPTO_AES_PPC_SPE architecture specific assembler implementations that work on 1KB tables or 256 bytes S-boxes. -config CRYPTO_P10_AES_GCM - tristate "Stitched AES/GCM acceleration support on P10+ CPU (PPC)" - depends on PPC64 - select CRYPTO_LIB_AES - select CRYPTO_ALGAPI - select CRYPTO_AEAD - default m - help - Support for cryptographic acceleration instructions on Power10+ CPU. - This module supports stitched acceleration for AES/GCM in hardware. - endmenu diff --git a/arch/powerpc/crypto/Makefile b/arch/powerpc/crypto/Makefile index 5b8252013abd..4808d97fede5 100644 --- a/arch/powerpc/crypto/Makefile +++ b/arch/powerpc/crypto/Makefile @@ -13,7 +13,6 @@ obj-$(CONFIG_CRYPTO_SHA256_PPC_SPE) += sha256-ppc-spe.o obj-$(CONFIG_CRYPTO_CRC32C_VPMSUM) += crc32c-vpmsum.o obj-$(CONFIG_CRYPTO_CRCT10DIF_VPMSUM) += crct10dif-vpmsum.o obj-$(CONFIG_CRYPTO_VPMSUM_TESTER) += crc-vpmsum_test.o -obj-$(CONFIG_CRYPTO_P10_AES_GCM) += p10-aes-gcm-crypto.o aes-ppc-spe-y := aes-spe-core.o aes-spe-keys.o aes-tab-4k.o aes-spe-modes.o aes-spe-glue.o md5-ppc-y := md5-asm.o md5-glue.o @@ -22,12 +21,3 @@ sha1-ppc-spe-y := sha1-spe-asm.o sha1-spe-glue.o sha256-ppc-spe-y := sha256-spe-asm.o sha256-spe-glue.o crc32c-vpmsum-y := crc32c-vpmsum_asm.o crc32c-vpmsum_glue.o crct10dif-vpmsum-y := crct10dif-vpmsum_asm.o crct10dif-vpmsum_glue.o -p10-aes-gcm-crypto-y := p10-aes-gcm-glue.o p10_aes_gcm.o ghashp8-ppc.o aesp8-ppc.o - -quiet_cmd_perl = PERL $@ - cmd_perl = $(PERL) $< $(if $(CONFIG_CPU_LITTLE_ENDIAN), linux-ppc64le, linux-ppc64) > $@ - -targets += aesp8-ppc.S ghashp8-ppc.S - -$(obj)/aesp8-ppc.S $(obj)/ghashp8-ppc.S: $(obj)/%.S: $(src)/%.pl FORCE - $(call if_changed,perl) diff --git a/arch/powerpc/crypto/aesp8-ppc.pl b/arch/powerpc/crypto/aesp8-ppc.pl deleted file mode 100644 index 50a0a18f35da..000000000000 --- a/arch/powerpc/crypto/aesp8-ppc.pl +++ /dev/null @@ -1,3846 +0,0 @@ -#! /usr/bin/env perl -# SPDX-License-Identifier: GPL-2.0 - -# This code is taken from CRYPTOGAMs[1] and is included here using the option -# in the license to distribute the code under the GPL. Therefore this program -# is free software; you can redistribute it and/or modify it under the terms of -# the GNU General Public License version 2 as published by the Free Software -# Foundation. -# -# [1] https://www.openssl.org/~appro/cryptogams/ - -# Copyright (c) 2006-2017, CRYPTOGAMS by -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# -# * Redistributions of source code must retain copyright notices, -# this list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above -# copyright notice, this list of conditions and the following -# disclaimer in the documentation and/or other materials -# provided with the distribution. -# -# * Neither the name of the CRYPTOGAMS nor the names of its -# copyright holder and contributors may be used to endorse or -# promote products derived from this software without specific -# prior written permission. -# -# ALTERNATIVELY, provided that this notice is retained in full, this -# product may be distributed under the terms of the GNU General Public -# License (GPL), in which case the provisions of the GPL apply INSTEAD OF -# those given above. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -# ==================================================================== -# Written by Andy Polyakov for the OpenSSL -# project. The module is, however, dual licensed under OpenSSL and -# CRYPTOGAMS licenses depending on where you obtain it. For further -# details see https://www.openssl.org/~appro/cryptogams/. -# ==================================================================== -# -# This module implements support for AES instructions as per PowerISA -# specification version 2.07, first implemented by POWER8 processor. -# The module is endian-agnostic in sense that it supports both big- -# and little-endian cases. Data alignment in parallelizable modes is -# handled with VSX loads and stores, which implies MSR.VSX flag being -# set. It should also be noted that ISA specification doesn't prohibit -# alignment exceptions for these instructions on page boundaries. -# Initially alignment was handled in pure AltiVec/VMX way [when data -# is aligned programmatically, which in turn guarantees exception- -# free execution], but it turned to hamper performance when vcipher -# instructions are interleaved. It's reckoned that eventual -# misalignment penalties at page boundaries are in average lower -# than additional overhead in pure AltiVec approach. -# -# May 2016 -# -# Add XTS subroutine, 9x on little- and 12x improvement on big-endian -# systems were measured. -# -###################################################################### -# Current large-block performance in cycles per byte processed with -# 128-bit key (less is better). -# -# CBC en-/decrypt CTR XTS -# POWER8[le] 3.96/0.72 0.74 1.1 -# POWER8[be] 3.75/0.65 0.66 1.0 - -$flavour = shift; - -if ($flavour =~ /64/) { - $SIZE_T =8; - $LRSAVE =2*$SIZE_T; - $STU ="stdu"; - $POP ="ld"; - $PUSH ="std"; - $UCMP ="cmpld"; - $SHL ="sldi"; -} elsif ($flavour =~ /32/) { - $SIZE_T =4; - $LRSAVE =$SIZE_T; - $STU ="stwu"; - $POP ="lwz"; - $PUSH ="stw"; - $UCMP ="cmplw"; - $SHL ="slwi"; -} else { die "nonsense $flavour"; } - -$LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0; - -$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; -( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or -( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or -die "can't locate ppc-xlate.pl"; - -open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!"; - -$FRAME=8*$SIZE_T; -$prefix="aes_p8"; - -$sp="r1"; -$vrsave="r12"; - -######################################################################### -{{{ # Key setup procedures # -my ($inp,$bits,$out,$ptr,$cnt,$rounds)=map("r$_",(3..8)); -my ($zero,$in0,$in1,$key,$rcon,$mask,$tmp)=map("v$_",(0..6)); -my ($stage,$outperm,$outmask,$outhead,$outtail)=map("v$_",(7..11)); - -$code.=<<___; -.machine "any" - -.text - -.align 7 -rcon: -.long 0x01000000, 0x01000000, 0x01000000, 0x01000000 ?rev -.long 0x1b000000, 0x1b000000, 0x1b000000, 0x1b000000 ?rev -.long 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c ?rev -.long 0,0,0,0 ?asis -Lconsts: - mflr r0 - bcl 20,31,\$+4 - mflr $ptr #vvvvv "distance between . and rcon - addi $ptr,$ptr,-0x48 - mtlr r0 - blr - .long 0 - .byte 0,12,0x14,0,0,0,0,0 -.asciz "AES for PowerISA 2.07, CRYPTOGAMS by " - -.globl .${prefix}_set_encrypt_key -Lset_encrypt_key: - mflr r11 - $PUSH r11,$LRSAVE($sp) - - li $ptr,-1 - ${UCMP}i $inp,0 - beq- Lenc_key_abort # if ($inp==0) return -1; - ${UCMP}i $out,0 - beq- Lenc_key_abort # if ($out==0) return -1; - li $ptr,-2 - cmpwi $bits,128 - blt- Lenc_key_abort - cmpwi $bits,256 - bgt- Lenc_key_abort - andi. r0,$bits,0x3f - bne- Lenc_key_abort - - lis r0,0xfff0 - mfspr $vrsave,256 - mtspr 256,r0 - - bl Lconsts - mtlr r11 - - neg r9,$inp - lvx $in0,0,$inp - addi $inp,$inp,15 # 15 is not typo - lvsr $key,0,r9 # borrow $key - li r8,0x20 - cmpwi $bits,192 - lvx $in1,0,$inp - le?vspltisb $mask,0x0f # borrow $mask - lvx $rcon,0,$ptr - le?vxor $key,$key,$mask # adjust for byte swap - lvx $mask,r8,$ptr - addi $ptr,$ptr,0x10 - vperm $in0,$in0,$in1,$key # align [and byte swap in LE] - li $cnt,8 - vxor $zero,$zero,$zero - mtctr $cnt - - ?lvsr $outperm,0,$out - vspltisb $outmask,-1 - lvx $outhead,0,$out - ?vperm $outmask,$zero,$outmask,$outperm - - blt Loop128 - addi $inp,$inp,8 - beq L192 - addi $inp,$inp,8 - b L256 - -.align 4 -Loop128: - vperm $key,$in0,$in0,$mask # rotate-n-splat - vsldoi $tmp,$zero,$in0,12 # >>32 - vperm $outtail,$in0,$in0,$outperm # rotate - vsel $stage,$outhead,$outtail,$outmask - vmr $outhead,$outtail - vcipherlast $key,$key,$rcon - stvx $stage,0,$out - addi $out,$out,16 - - vxor $in0,$in0,$tmp - vsldoi $tmp,$zero,$tmp,12 # >>32 - vxor $in0,$in0,$tmp - vsldoi $tmp,$zero,$tmp,12 # >>32 - vxor $in0,$in0,$tmp - vadduwm $rcon,$rcon,$rcon - vxor $in0,$in0,$key - bdnz Loop128 - - lvx $rcon,0,$ptr # last two round keys - - vperm $key,$in0,$in0,$mask # rotate-n-splat - vsldoi $tmp,$zero,$in0,12 # >>32 - vperm $outtail,$in0,$in0,$outperm # rotate - vsel $stage,$outhead,$outtail,$outmask - vmr $outhead,$outtail - vcipherlast $key,$key,$rcon - stvx $stage,0,$out - addi $out,$out,16 - - vxor $in0,$in0,$tmp - vsldoi $tmp,$zero,$tmp,12 # >>32 - vxor $in0,$in0,$tmp - vsldoi $tmp,$zero,$tmp,12 # >>32 - vxor $in0,$in0,$tmp - vadduwm $rcon,$rcon,$rcon - vxor $in0,$in0,$key - - vperm $key,$in0,$in0,$mask # rotate-n-splat - vsldoi $tmp,$zero,$in0,12 # >>32 - vperm $outtail,$in0,$in0,$outperm # rotate - vsel $stage,$outhead,$outtail,$outmask - vmr $outhead,$outtail - vcipherlast $key,$key,$rcon - stvx $stage,0,$out - addi $out,$out,16 - - vxor $in0,$in0,$tmp - vsldoi $tmp,$zero,$tmp,12 # >>32 - vxor $in0,$in0,$tmp - vsldoi $tmp,$zero,$tmp,12 # >>32 - vxor $in0,$in0,$tmp - vxor $in0,$in0,$key - vperm $outtail,$in0,$in0,$outperm # rotate - vsel $stage,$outhead,$outtail,$outmask - vmr $outhead,$outtail - stvx $stage,0,$out - - addi $inp,$out,15 # 15 is not typo - addi $out,$out,0x50 - - li $rounds,10 - b Ldone - -.align 4 -L192: - lvx $tmp,0,$inp - li $cnt,4 - vperm $outtail,$in0,$in0,$outperm # rotate - vsel $stage,$outhead,$outtail,$outmask - vmr $outhead,$outtail - stvx $stage,0,$out - addi $out,$out,16 - vperm $in1,$in1,$tmp,$key # align [and byte swap in LE] - vspltisb $key,8 # borrow $key - mtctr $cnt - vsububm $mask,$mask,$key # adjust the mask - -Loop192: - vperm $key,$in1,$in1,$mask # roate-n-splat - vsldoi $tmp,$zero,$in0,12 # >>32 - vcipherlast $key,$key,$rcon - - vxor $in0,$in0,$tmp - vsldoi $tmp,$zero,$tmp,12 # >>32 - vxor $in0,$in0,$tmp - vsldoi $tmp,$zero,$tmp,12 # >>32 - vxor $in0,$in0,$tmp - - vsldoi $stage,$zero,$in1,8 - vspltw $tmp,$in0,3 - vxor $tmp,$tmp,$in1 - vsldoi $in1,$zero,$in1,12 # >>32 - vadduwm $rcon,$rcon,$rcon - vxor $in1,$in1,$tmp - vxor $in0,$in0,$key - vxor $in1,$in1,$key - vsldoi $stage,$stage,$in0,8 - - vperm $key,$in1,$in1,$mask # rotate-n-splat - vsldoi $tmp,$zero,$in0,12 # >>32 - vperm $outtail,$stage,$stage,$outperm # rotate - vsel $stage,$outhead,$outtail,$outmask - vmr $outhead,$outtail - vcipherlast $key,$key,$rcon - stvx $stage,0,$out - addi $out,$out,16 - - vsldoi $stage,$in0,$in1,8 - vxor $in0,$in0,$tmp - vsldoi $tmp,$zero,$tmp,12 # >>32 - vperm $outtail,$stage,$stage,$outperm # rotate - vsel $stage,$outhead,$outtail,$outmask - vmr $outhead,$outtail - vxor $in0,$in0,$tmp - vsldoi $tmp,$zero,$tmp,12 # >>32 - vxor $in0,$in0,$tmp - stvx $stage,0,$out - addi $out,$out,16 - - vspltw $tmp,$in0,3 - vxor $tmp,$tmp,$in1 - vsldoi $in1,$zero,$in1,12 # >>32 - vadduwm $rcon,$rcon,$rcon - vxor $in1,$in1,$tmp - vxor $in0,$in0,$key - vxor $in1,$in1,$key - vperm $outtail,$in0,$in0,$outperm # rotate - vsel $stage,$outhead,$outtail,$outmask - vmr $outhead,$outtail - stvx $stage,0,$out - addi $inp,$out,15 # 15 is not typo - addi $out,$out,16 - bdnz Loop192 - - li $rounds,12 - addi $out,$out,0x20 - b Ldone - -.align 4 -L256: - lvx $tmp,0,$inp - li $cnt,7 - li $rounds,14 - vperm $outtail,$in0,$in0,$outperm # rotate - vsel $stage,$outhead,$outtail,$outmask - vmr $outhead,$outtail - stvx $stage,0,$out - addi $out,$out,16 - vperm $in1,$in1,$tmp,$key # align [and byte swap in LE] - mtctr $cnt - -Loop256: - vperm $key,$in1,$in1,$mask # rotate-n-splat - vsldoi $tmp,$zero,$in0,12 # >>32 - vperm $outtail,$in1,$in1,$outperm # rotate - vsel $stage,$outhead,$outtail,$outmask - vmr $outhead,$outtail - vcipherlast $key,$key,$rcon - stvx $stage,0,$out - addi $out,$out,16 - - vxor $in0,$in0,$tmp - vsldoi $tmp,$zero,$tmp,12 # >>32 - vxor $in0,$in0,$tmp - vsldoi $tmp,$zero,$tmp,12 # >>32 - vxor $in0,$in0,$tmp - vadduwm $rcon,$rcon,$rcon - vxor $in0,$in0,$key - vperm $outtail,$in0,$in0,$outperm # rotate - vsel $stage,$outhead,$outtail,$outmask - vmr $outhead,$outtail - stvx $stage,0,$out - addi $inp,$out,15 # 15 is not typo - addi $out,$out,16 - bdz Ldone - - vspltw $key,$in0,3 # just splat - vsldoi $tmp,$zero,$in1,12 # >>32 - vsbox $key,$key - - vxor $in1,$in1,$tmp - vsldoi $tmp,$zero,$tmp,12 # >>32 - vxor $in1,$in1,$tmp - vsldoi $tmp,$zero,$tmp,12 # >>32 - vxor $in1,$in1,$tmp - - vxor $in1,$in1,$key - b Loop256 - -.align 4 -Ldone: - lvx $in1,0,$inp # redundant in aligned case - vsel $in1,$outhead,$in1,$outmask - stvx $in1,0,$inp - li $ptr,0 - mtspr 256,$vrsave - stw $rounds,0($out) - -Lenc_key_abort: - mr r3,$ptr - blr - .long 0 - .byte 0,12,0x14,1,0,0,3,0 - .long 0 -.size .${prefix}_set_encrypt_key,.-.${prefix}_set_encrypt_key - -.globl .${prefix}_set_decrypt_key - $STU $sp,-$FRAME($sp) - mflr r10 - $PUSH r10,$FRAME+$LRSAVE($sp) - bl Lset_encrypt_key - mtlr r10 - - cmpwi r3,0 - bne- Ldec_key_abort - - slwi $cnt,$rounds,4 - subi $inp,$out,240 # first round key - srwi $rounds,$rounds,1 - add $out,$inp,$cnt # last round key - mtctr $rounds - -Ldeckey: - lwz r0, 0($inp) - lwz r6, 4($inp) - lwz r7, 8($inp) - lwz r8, 12($inp) - addi $inp,$inp,16 - lwz r9, 0($out) - lwz r10,4($out) - lwz r11,8($out) - lwz r12,12($out) - stw r0, 0($out) - stw r6, 4($out) - stw r7, 8($out) - stw r8, 12($out) - subi $out,$out,16 - stw r9, -16($inp) - stw r10,-12($inp) - stw r11,-8($inp) - stw r12,-4($inp) - bdnz Ldeckey - - xor r3,r3,r3 # return value -Ldec_key_abort: - addi $sp,$sp,$FRAME - blr - .long 0 - .byte 0,12,4,1,0x80,0,3,0 - .long 0 -.size .${prefix}_set_decrypt_key,.-.${prefix}_set_decrypt_key -___ -}}} -######################################################################### -{{{ # Single block en- and decrypt procedures # -sub gen_block () { -my $dir = shift; -my $n = $dir eq "de" ? "n" : ""; -my ($inp,$out,$key,$rounds,$idx)=map("r$_",(3..7)); - -$code.=<<___; -.globl .${prefix}_${dir}crypt - lwz $rounds,240($key) - lis r0,0xfc00 - mfspr $vrsave,256 - li $idx,15 # 15 is not typo - mtspr 256,r0 - - lvx v0,0,$inp - neg r11,$out - lvx v1,$idx,$inp - lvsl v2,0,$inp # inpperm - le?vspltisb v4,0x0f - ?lvsl v3,0,r11 # outperm - le?vxor v2,v2,v4 - li $idx,16 - vperm v0,v0,v1,v2 # align [and byte swap in LE] - lvx v1,0,$key - ?lvsl v5,0,$key # keyperm - srwi $rounds,$rounds,1 - lvx v2,$idx,$key - addi $idx,$idx,16 - subi $rounds,$rounds,1 - ?vperm v1,v1,v2,v5 # align round key - - vxor v0,v0,v1 - lvx v1,$idx,$key - addi $idx,$idx,16 - mtctr $rounds - -Loop_${dir}c: - ?vperm v2,v2,v1,v5 - v${n}cipher v0,v0,v2 - lvx v2,$idx,$key - addi $idx,$idx,16 - ?vperm v1,v1,v2,v5 - v${n}cipher v0,v0,v1 - lvx v1,$idx,$key - addi $idx,$idx,16 - bdnz Loop_${dir}c - - ?vperm v2,v2,v1,v5 - v${n}cipher v0,v0,v2 - lvx v2,$idx,$key - ?vperm v1,v1,v2,v5 - v${n}cipherlast v0,v0,v1 - - vspltisb v2,-1 - vxor v1,v1,v1 - li $idx,15 # 15 is not typo - ?vperm v2,v1,v2,v3 # outmask - le?vxor v3,v3,v4 - lvx v1,0,$out # outhead - vperm v0,v0,v0,v3 # rotate [and byte swap in LE] - vsel v1,v1,v0,v2 - lvx v4,$idx,$out - stvx v1,0,$out - vsel v0,v0,v4,v2 - stvx v0,$idx,$out - - mtspr 256,$vrsave - blr - .long 0 - .byte 0,12,0x14,0,0,0,3,0 - .long 0 -.size .${prefix}_${dir}crypt,.-.${prefix}_${dir}crypt -___ -} -&gen_block("en"); -&gen_block("de"); -}}} -######################################################################### -{{{ # CBC en- and decrypt procedures # -my ($inp,$out,$len,$key,$ivp,$enc,$rounds,$idx)=map("r$_",(3..10)); -my ($rndkey0,$rndkey1,$inout,$tmp)= map("v$_",(0..3)); -my ($ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm)= - map("v$_",(4..10)); -$code.=<<___; -.globl .${prefix}_cbc_encrypt - ${UCMP}i $len,16 - bltlr- - - cmpwi $enc,0 # test direction - lis r0,0xffe0 - mfspr $vrsave,256 - mtspr 256,r0 - - li $idx,15 - vxor $rndkey0,$rndkey0,$rndkey0 - le?vspltisb $tmp,0x0f - - lvx $ivec,0,$ivp # load [unaligned] iv - lvsl $inpperm,0,$ivp - lvx $inptail,$idx,$ivp - le?vxor $inpperm,$inpperm,$tmp - vperm $ivec,$ivec,$inptail,$inpperm - - neg r11,$inp - ?lvsl $keyperm,0,$key # prepare for unaligned key - lwz $rounds,240($key) - - lvsr $inpperm,0,r11 # prepare for unaligned load - lvx $inptail,0,$inp - addi $inp,$inp,15 # 15 is not typo - le?vxor $inpperm,$inpperm,$tmp - - ?lvsr $outperm,0,$out # prepare for unaligned store - vspltisb $outmask,-1 - lvx $outhead,0,$out - ?vperm $outmask,$rndkey0,$outmask,$outperm - le?vxor $outperm,$outperm,$tmp - - srwi $rounds,$rounds,1 - li $idx,16 - subi $rounds,$rounds,1 - beq Lcbc_dec - -Lcbc_enc: - vmr $inout,$inptail - lvx $inptail,0,$inp - addi $inp,$inp,16 - mtctr $rounds - subi $len,$len,16 # len-=16 - - lvx $rndkey0,0,$key - vperm $inout,$inout,$inptail,$inpperm - lvx $rndkey1,$idx,$key - addi $idx,$idx,16 - ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm - vxor $inout,$inout,$rndkey0 - lvx $rndkey0,$idx,$key - addi $idx,$idx,16 - vxor $inout,$inout,$ivec - -Loop_cbc_enc: - ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm - vcipher $inout,$inout,$rndkey1 - lvx $rndkey1,$idx,$key - addi $idx,$idx,16 - ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm - vcipher $inout,$inout,$rndkey0 - lvx $rndkey0,$idx,$key - addi $idx,$idx,16 - bdnz Loop_cbc_enc - - ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm - vcipher $inout,$inout,$rndkey1 - lvx $rndkey1,$idx,$key - li $idx,16 - ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm - vcipherlast $ivec,$inout,$rndkey0 - ${UCMP}i $len,16 - - vperm $tmp,$ivec,$ivec,$outperm - vsel $inout,$outhead,$tmp,$outmask - vmr $outhead,$tmp - stvx $inout,0,$out - addi $out,$out,16 - bge Lcbc_enc - - b Lcbc_done - -.align 4 -Lcbc_dec: - ${UCMP}i $len,128 - bge _aesp8_cbc_decrypt8x - vmr $tmp,$inptail - lvx $inptail,0,$inp - addi $inp,$inp,16 - mtctr $rounds - subi $len,$len,16 # len-=16 - - lvx $rndkey0,0,$key - vperm $tmp,$tmp,$inptail,$inpperm - lvx $rndkey1,$idx,$key - addi $idx,$idx,16 - ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm - vxor $inout,$tmp,$rndkey0 - lvx $rndkey0,$idx,$key - addi $idx,$idx,16 - -Loop_cbc_dec: - ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm - vncipher $inout,$inout,$rndkey1 - lvx $rndkey1,$idx,$key - addi $idx,$idx,16 - ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm - vncipher $inout,$inout,$rndkey0 - lvx $rndkey0,$idx,$key - addi $idx,$idx,16 - bdnz Loop_cbc_dec - - ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm - vncipher $inout,$inout,$rndkey1 - lvx $rndkey1,$idx,$key - li $idx,16 - ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm - vncipherlast $inout,$inout,$rndkey0 - ${UCMP}i $len,16 - - vxor $inout,$inout,$ivec - vmr $ivec,$tmp - vperm $tmp,$inout,$inout,$outperm - vsel $inout,$outhead,$tmp,$outmask - vmr $outhead,$tmp - stvx $inout,0,$out - addi $out,$out,16 - bge Lcbc_dec - -Lcbc_done: - addi $out,$out,-1 - lvx $inout,0,$out # redundant in aligned case - vsel $inout,$outhead,$inout,$outmask - stvx $inout,0,$out - - neg $enc,$ivp # write [unaligned] iv - li $idx,15 # 15 is not typo - vxor $rndkey0,$rndkey0,$rndkey0 - vspltisb $outmask,-1 - le?vspltisb $tmp,0x0f - ?lvsl $outperm,0,$enc - ?vperm $outmask,$rndkey0,$outmask,$outperm - le?vxor $outperm,$outperm,$tmp - lvx $outhead,0,$ivp - vperm $ivec,$ivec,$ivec,$outperm - vsel $inout,$outhead,$ivec,$outmask - lvx $inptail,$idx,$ivp - stvx $inout,0,$ivp - vsel $inout,$ivec,$inptail,$outmask - stvx $inout,$idx,$ivp - - mtspr 256,$vrsave - blr - .long 0 - .byte 0,12,0x14,0,0,0,6,0 - .long 0 -___ -######################################################################### -{{ # Optimized CBC decrypt procedure # -my $key_="r11"; -my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31)); -my ($in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7 )=map("v$_",(0..3,10..13)); -my ($out0,$out1,$out2,$out3,$out4,$out5,$out6,$out7)=map("v$_",(14..21)); -my $rndkey0="v23"; # v24-v25 rotating buffer for first found keys - # v26-v31 last 6 round keys -my ($tmp,$keyperm)=($in3,$in4); # aliases with "caller", redundant assignment - -$code.=<<___; -.align 5 -_aesp8_cbc_decrypt8x: - $STU $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp) - li r10,`$FRAME+8*16+15` - li r11,`$FRAME+8*16+31` - stvx v20,r10,$sp # ABI says so - addi r10,r10,32 - stvx v21,r11,$sp - addi r11,r11,32 - stvx v22,r10,$sp - addi r10,r10,32 - stvx v23,r11,$sp - addi r11,r11,32 - stvx v24,r10,$sp - addi r10,r10,32 - stvx v25,r11,$sp - addi r11,r11,32 - stvx v26,r10,$sp - addi r10,r10,32 - stvx v27,r11,$sp - addi r11,r11,32 - stvx v28,r10,$sp - addi r10,r10,32 - stvx v29,r11,$sp - addi r11,r11,32 - stvx v30,r10,$sp - stvx v31,r11,$sp - li r0,-1 - stw $vrsave,`$FRAME+21*16-4`($sp) # save vrsave - li $x10,0x10 - $PUSH r26,`$FRAME+21*16+0*$SIZE_T`($sp) - li $x20,0x20 - $PUSH r27,`$FRAME+21*16+1*$SIZE_T`($sp) - li $x30,0x30 - $PUSH r28,`$FRAME+21*16+2*$SIZE_T`($sp) - li $x40,0x40 - $PUSH r29,`$FRAME+21*16+3*$SIZE_T`($sp) - li $x50,0x50 - $PUSH r30,`$FRAME+21*16+4*$SIZE_T`($sp) - li $x60,0x60 - $PUSH r31,`$FRAME+21*16+5*$SIZE_T`($sp) - li $x70,0x70 - mtspr 256,r0 - - subi $rounds,$rounds,3 # -4 in total - subi $len,$len,128 # bias - - lvx $rndkey0,$x00,$key # load key schedule - lvx v30,$x10,$key - addi $key,$key,0x20 - lvx v31,$x00,$key - ?vperm $rndkey0,$rndkey0,v30,$keyperm - addi $key_,$sp,$FRAME+15 - mtctr $rounds - -Load_cbc_dec_key: - ?vperm v24,v30,v31,$keyperm - lvx v30,$x10,$key - addi $key,$key,0x20 - stvx v24,$x00,$key_ # off-load round[1] - ?vperm v25,v31,v30,$keyperm - lvx v31,$x00,$key - stvx v25,$x10,$key_ # off-load round[2] - addi $key_,$key_,0x20 - bdnz Load_cbc_dec_key - - lvx v26,$x10,$key - ?vperm v24,v30,v31,$keyperm - lvx v27,$x20,$key - stvx v24,$x00,$key_ # off-load round[3] - ?vperm v25,v31,v26,$keyperm - lvx v28,$x30,$key - stvx v25,$x10,$key_ # off-load round[4] - addi $key_,$sp,$FRAME+15 # rewind $key_ - ?vperm v26,v26,v27,$keyperm - lvx v29,$x40,$key - ?vperm v27,v27,v28,$keyperm - lvx v30,$x50,$key - ?vperm v28,v28,v29,$keyperm - lvx v31,$x60,$key - ?vperm v29,v29,v30,$keyperm - lvx $out0,$x70,$key # borrow $out0 - ?vperm v30,v30,v31,$keyperm - lvx v24,$x00,$key_ # pre-load round[1] - ?vperm v31,v31,$out0,$keyperm - lvx v25,$x10,$key_ # pre-load round[2] - - #lvx $inptail,0,$inp # "caller" already did this - #addi $inp,$inp,15 # 15 is not typo - subi $inp,$inp,15 # undo "caller" - - le?li $idx,8 - lvx_u $in0,$x00,$inp # load first 8 "words" - le?lvsl $inpperm,0,$idx - le?vspltisb $tmp,0x0f - lvx_u $in1,$x10,$inp - le?vxor $inpperm,$inpperm,$tmp # transform for lvx_u/stvx_u - lvx_u $in2,$x20,$inp - le?vperm $in0,$in0,$in0,$inpperm - lvx_u $in3,$x30,$inp - le?vperm $in1,$in1,$in1,$inpperm - lvx_u $in4,$x40,$inp - le?vperm $in2,$in2,$in2,$inpperm - vxor $out0,$in0,$rndkey0 - lvx_u $in5,$x50,$inp - le?vperm $in3,$in3,$in3,$inpperm - vxor $out1,$in1,$rndkey0 - lvx_u $in6,$x60,$inp - le?vperm $in4,$in4,$in4,$inpperm - vxor $out2,$in2,$rndkey0 - lvx_u $in7,$x70,$inp - addi $inp,$inp,0x80 - le?vperm $in5,$in5,$in5,$inpperm - vxor $out3,$in3,$rndkey0 - le?vperm $in6,$in6,$in6,$inpperm - vxor $out4,$in4,$rndkey0 - le?vperm $in7,$in7,$in7,$inpperm - vxor $out5,$in5,$rndkey0 - vxor $out6,$in6,$rndkey0 - vxor $out7,$in7,$rndkey0 - - mtctr $rounds - b Loop_cbc_dec8x -.align 5 -Loop_cbc_dec8x: - vncipher $out0,$out0,v24 - vncipher $out1,$out1,v24 - vncipher $out2,$out2,v24 - vncipher $out3,$out3,v24 - vncipher $out4,$out4,v24 - vncipher $out5,$out5,v24 - vncipher $out6,$out6,v24 - vncipher $out7,$out7,v24 - lvx v24,$x20,$key_ # round[3] - addi $key_,$key_,0x20 - - vncipher $out0,$out0,v25 - vncipher $out1,$out1,v25 - vncipher $out2,$out2,v25 - vncipher $out3,$out3,v25 - vncipher $out4,$out4,v25 - vncipher $out5,$out5,v25 - vncipher $out6,$out6,v25 - vncipher $out7,$out7,v25 - lvx v25,$x10,$key_ # round[4] - bdnz Loop_cbc_dec8x - - subic $len,$len,128 # $len-=128 - vncipher $out0,$out0,v24 - vncipher $out1,$out1,v24 - vncipher $out2,$out2,v24 - vncipher $out3,$out3,v24 - vncipher $out4,$out4,v24 - vncipher $out5,$out5,v24 - vncipher $out6,$out6,v24 - vncipher $out7,$out7,v24 - - subfe. r0,r0,r0 # borrow?-1:0 - vncipher $out0,$out0,v25 - vncipher $out1,$out1,v25 - vncipher $out2,$out2,v25 - vncipher $out3,$out3,v25 - vncipher $out4,$out4,v25 - vncipher $out5,$out5,v25 - vncipher $out6,$out6,v25 - vncipher $out7,$out7,v25 - - and r0,r0,$len - vncipher $out0,$out0,v26 - vncipher $out1,$out1,v26 - vncipher $out2,$out2,v26 - vncipher $out3,$out3,v26 - vncipher $out4,$out4,v26 - vncipher $out5,$out5,v26 - vncipher $out6,$out6,v26 - vncipher $out7,$out7,v26 - - add $inp,$inp,r0 # $inp is adjusted in such - # way that at exit from the - # loop inX-in7 are loaded - # with last "words" - vncipher $out0,$out0,v27 - vncipher $out1,$out1,v27 - vncipher $out2,$out2,v27 - vncipher $out3,$out3,v27 - vncipher $out4,$out4,v27 - vncipher $out5,$out5,v27 - vncipher $out6,$out6,v27 - vncipher $out7,$out7,v27 - - addi $key_,$sp,$FRAME+15 # rewind $key_ - vncipher $out0,$out0,v28 - vncipher $out1,$out1,v28 - vncipher $out2,$out2,v28 - vncipher $out3,$out3,v28 - vncipher $out4,$out4,v28 - vncipher $out5,$out5,v28 - vncipher $out6,$out6,v28 - vncipher $out7,$out7,v28 - lvx v24,$x00,$key_ # re-pre-load round[1] - - vncipher $out0,$out0,v29 - vncipher $out1,$out1,v29 - vncipher $out2,$out2,v29 - vncipher $out3,$out3,v29 - vncipher $out4,$out4,v29 - vncipher $out5,$out5,v29 - vncipher $out6,$out6,v29 - vncipher $out7,$out7,v29 - lvx v25,$x10,$key_ # re-pre-load round[2] - - vncipher $out0,$out0,v30 - vxor $ivec,$ivec,v31 # xor with last round key - vncipher $out1,$out1,v30 - vxor $in0,$in0,v31 - vncipher $out2,$out2,v30 - vxor $in1,$in1,v31 - vncipher $out3,$out3,v30 - vxor $in2,$in2,v31 - vncipher $out4,$out4,v30 - vxor $in3,$in3,v31 - vncipher $out5,$out5,v30 - vxor $in4,$in4,v31 - vncipher $out6,$out6,v30 - vxor $in5,$in5,v31 - vncipher $out7,$out7,v30 - vxor $in6,$in6,v31 - - vncipherlast $out0,$out0,$ivec - vncipherlast $out1,$out1,$in0 - lvx_u $in0,$x00,$inp # load next input block - vncipherlast $out2,$out2,$in1 - lvx_u $in1,$x10,$inp - vncipherlast $out3,$out3,$in2 - le?vperm $in0,$in0,$in0,$inpperm - lvx_u $in2,$x20,$inp - vncipherlast $out4,$out4,$in3 - le?vperm $in1,$in1,$in1,$inpperm - lvx_u $in3,$x30,$inp - vncipherlast $out5,$out5,$in4 - le?vperm $in2,$in2,$in2,$inpperm - lvx_u $in4,$x40,$inp - vncipherlast $out6,$out6,$in5 - le?vperm $in3,$in3,$in3,$inpperm - lvx_u $in5,$x50,$inp - vncipherlast $out7,$out7,$in6 - le?vperm $in4,$in4,$in4,$inpperm - lvx_u $in6,$x60,$inp - vmr $ivec,$in7 - le?vperm $in5,$in5,$in5,$inpperm - lvx_u $in7,$x70,$inp - addi $inp,$inp,0x80 - - le?vperm $out0,$out0,$out0,$inpperm - le?vperm $out1,$out1,$out1,$inpperm - stvx_u $out0,$x00,$out - le?vperm $in6,$in6,$in6,$inpperm - vxor $out0,$in0,$rndkey0 - le?vperm $out2,$out2,$out2,$inpperm - stvx_u $out1,$x10,$out - le?vperm $in7,$in7,$in7,$inpperm - vxor $out1,$in1,$rndkey0 - le?vperm $out3,$out3,$out3,$inpperm - stvx_u $out2,$x20,$out - vxor $out2,$in2,$rndkey0 - le?vperm $out4,$out4,$out4,$inpperm - stvx_u $out3,$x30,$out - vxor $out3,$in3,$rndkey0 - le?vperm $out5,$out5,$out5,$inpperm - stvx_u $out4,$x40,$out - vxor $out4,$in4,$rndkey0 - le?vperm $out6,$out6,$out6,$inpperm - stvx_u $out5,$x50,$out - vxor $out5,$in5,$rndkey0 - le?vperm $out7,$out7,$out7,$inpperm - stvx_u $out6,$x60,$out - vxor $out6,$in6,$rndkey0 - stvx_u $out7,$x70,$out - addi $out,$out,0x80 - vxor $out7,$in7,$rndkey0 - - mtctr $rounds - beq Loop_cbc_dec8x # did $len-=128 borrow? - - addic. $len,$len,128 - beq Lcbc_dec8x_done - nop - nop - -Loop_cbc_dec8x_tail: # up to 7 "words" tail... - vncipher $out1,$out1,v24 - vncipher $out2,$out2,v24 - vncipher $out3,$out3,v24 - vncipher $out4,$out4,v24 - vncipher $out5,$out5,v24 - vncipher $out6,$out6,v24 - vncipher $out7,$out7,v24 - lvx v24,$x20,$key_ # round[3] - addi $key_,$key_,0x20 - - vncipher $out1,$out1,v25 - vncipher $out2,$out2,v25 - vncipher $out3,$out3,v25 - vncipher $out4,$out4,v25 - vncipher $out5,$out5,v25 - vncipher $out6,$out6,v25 - vncipher $out7,$out7,v25 - lvx v25,$x10,$key_ # round[4] - bdnz Loop_cbc_dec8x_tail - - vncipher $out1,$out1,v24 - vncipher $out2,$out2,v24 - vncipher $out3,$out3,v24 - vncipher $out4,$out4,v24 - vncipher $out5,$out5,v24 - vncipher $out6,$out6,v24 - vncipher $out7,$out7,v24 - - vncipher $out1,$out1,v25 - vncipher $out2,$out2,v25 - vncipher $out3,$out3,v25 - vncipher $out4,$out4,v25 - vncipher $out5,$out5,v25 - vncipher $out6,$out6,v25 - vncipher $out7,$out7,v25 - - vncipher $out1,$out1,v26 - vncipher $out2,$out2,v26 - vncipher $out3,$out3,v26 - vncipher $out4,$out4,v26 - vncipher $out5,$out5,v26 - vncipher $out6,$out6,v26 - vncipher $out7,$out7,v26 - - vncipher $out1,$out1,v27 - vncipher $out2,$out2,v27 - vncipher $out3,$out3,v27 - vncipher $out4,$out4,v27 - vncipher $out5,$out5,v27 - vncipher $out6,$out6,v27 - vncipher $out7,$out7,v27 - - vncipher $out1,$out1,v28 - vncipher $out2,$out2,v28 - vncipher $out3,$out3,v28 - vncipher $out4,$out4,v28 - vncipher $out5,$out5,v28 - vncipher $out6,$out6,v28 - vncipher $out7,$out7,v28 - - vncipher $out1,$out1,v29 - vncipher $out2,$out2,v29 - vncipher $out3,$out3,v29 - vncipher $out4,$out4,v29 - vncipher $out5,$out5,v29 - vncipher $out6,$out6,v29 - vncipher $out7,$out7,v29 - - vncipher $out1,$out1,v30 - vxor $ivec,$ivec,v31 # last round key - vncipher $out2,$out2,v30 - vxor $in1,$in1,v31 - vncipher $out3,$out3,v30 - vxor $in2,$in2,v31 - vncipher $out4,$out4,v30 - vxor $in3,$in3,v31 - vncipher $out5,$out5,v30 - vxor $in4,$in4,v31 - vncipher $out6,$out6,v30 - vxor $in5,$in5,v31 - vncipher $out7,$out7,v30 - vxor $in6,$in6,v31 - - cmplwi $len,32 # switch($len) - blt Lcbc_dec8x_one - nop - beq Lcbc_dec8x_two - cmplwi $len,64 - blt Lcbc_dec8x_three - nop - beq Lcbc_dec8x_four - cmplwi $len,96 - blt Lcbc_dec8x_five - nop - beq Lcbc_dec8x_six - -Lcbc_dec8x_seven: - vncipherlast $out1,$out1,$ivec - vncipherlast $out2,$out2,$in1 - vncipherlast $out3,$out3,$in2 - vncipherlast $out4,$out4,$in3 - vncipherlast $out5,$out5,$in4 - vncipherlast $out6,$out6,$in5 - vncipherlast $out7,$out7,$in6 - vmr $ivec,$in7 - - le?vperm $out1,$out1,$out1,$inpperm - le?vperm $out2,$out2,$out2,$inpperm - stvx_u $out1,$x00,$out - le?vperm $out3,$out3,$out3,$inpperm - stvx_u $out2,$x10,$out - le?vperm $out4,$out4,$out4,$inpperm - stvx_u $out3,$x20,$out - le?vperm $out5,$out5,$out5,$inpperm - stvx_u $out4,$x30,$out - le?vperm $out6,$out6,$out6,$inpperm - stvx_u $out5,$x40,$out - le?vperm $out7,$out7,$out7,$inpperm - stvx_u $out6,$x50,$out - stvx_u $out7,$x60,$out - addi $out,$out,0x70 - b Lcbc_dec8x_done - -.align 5 -Lcbc_dec8x_six: - vncipherlast $out2,$out2,$ivec - vncipherlast $out3,$out3,$in2 - vncipherlast $out4,$out4,$in3 - vncipherlast $out5,$out5,$in4 - vncipherlast $out6,$out6,$in5 - vncipherlast $out7,$out7,$in6 - vmr $ivec,$in7 - - le?vperm $out2,$out2,$out2,$inpperm - le?vperm $out3,$out3,$out3,$inpperm - stvx_u $out2,$x00,$out - le?vperm $out4,$out4,$out4,$inpperm - stvx_u $out3,$x10,$out - le?vperm $out5,$out5,$out5,$inpperm - stvx_u $out4,$x20,$out - le?vperm $out6,$out6,$out6,$inpperm - stvx_u $out5,$x30,$out - le?vperm $out7,$out7,$out7,$inpperm - stvx_u $out6,$x40,$out - stvx_u $out7,$x50,$out - addi $out,$out,0x60 - b Lcbc_dec8x_done - -.align 5 -Lcbc_dec8x_five: - vncipherlast $out3,$out3,$ivec - vncipherlast $out4,$out4,$in3 - vncipherlast $out5,$out5,$in4 - vncipherlast $out6,$out6,$in5 - vncipherlast $out7,$out7,$in6 - vmr $ivec,$in7 - - le?vperm $out3,$out3,$out3,$inpperm - le?vperm $out4,$out4,$out4,$inpperm - stvx_u $out3,$x00,$out - le?vperm $out5,$out5,$out5,$inpperm - stvx_u $out4,$x10,$out - le?vperm $out6,$out6,$out6,$inpperm - stvx_u $out5,$x20,$out - le?vperm $out7,$out7,$out7,$inpperm - stvx_u $out6,$x30,$out - stvx_u $out7,$x40,$out - addi $out,$out,0x50 - b Lcbc_dec8x_done - -.align 5 -Lcbc_dec8x_four: - vncipherlast $out4,$out4,$ivec - vncipherlast $out5,$out5,$in4 - vncipherlast $out6,$out6,$in5 - vncipherlast $out7,$out7,$in6 - vmr $ivec,$in7 - - le?vperm $out4,$out4,$out4,$inpperm - le?vperm $out5,$out5,$out5,$inpperm - stvx_u $out4,$x00,$out - le?vperm $out6,$out6,$out6,$inpperm - stvx_u $out5,$x10,$out - le?vperm $out7,$out7,$out7,$inpperm - stvx_u $out6,$x20,$out - stvx_u $out7,$x30,$out - addi $out,$out,0x40 - b Lcbc_dec8x_done - -.align 5 -Lcbc_dec8x_three: - vncipherlast $out5,$out5,$ivec - vncipherlast $out6,$out6,$in5 - vncipherlast $out7,$out7,$in6 - vmr $ivec,$in7 - - le?vperm $out5,$out5,$out5,$inpperm - le?vperm $out6,$out6,$out6,$inpperm - stvx_u $out5,$x00,$out - le?vperm $out7,$out7,$out7,$inpperm - stvx_u $out6,$x10,$out - stvx_u $out7,$x20,$out - addi $out,$out,0x30 - b Lcbc_dec8x_done - -.align 5 -Lcbc_dec8x_two: - vncipherlast $out6,$out6,$ivec - vncipherlast $out7,$out7,$in6 - vmr $ivec,$in7 - - le?vperm $out6,$out6,$out6,$inpperm - le?vperm $out7,$out7,$out7,$inpperm - stvx_u $out6,$x00,$out - stvx_u $out7,$x10,$out - addi $out,$out,0x20 - b Lcbc_dec8x_done - -.align 5 -Lcbc_dec8x_one: - vncipherlast $out7,$out7,$ivec - vmr $ivec,$in7 - - le?vperm $out7,$out7,$out7,$inpperm - stvx_u $out7,0,$out - addi $out,$out,0x10 - -Lcbc_dec8x_done: - le?vperm $ivec,$ivec,$ivec,$inpperm - stvx_u $ivec,0,$ivp # write [unaligned] iv - - li r10,`$FRAME+15` - li r11,`$FRAME+31` - stvx $inpperm,r10,$sp # wipe copies of round keys - addi r10,r10,32 - stvx $inpperm,r11,$sp - addi r11,r11,32 - stvx $inpperm,r10,$sp - addi r10,r10,32 - stvx $inpperm,r11,$sp - addi r11,r11,32 - stvx $inpperm,r10,$sp - addi r10,r10,32 - stvx $inpperm,r11,$sp - addi r11,r11,32 - stvx $inpperm,r10,$sp - addi r10,r10,32 - stvx $inpperm,r11,$sp - addi r11,r11,32 - - mtspr 256,$vrsave - lvx v20,r10,$sp # ABI says so - addi r10,r10,32 - lvx v21,r11,$sp - addi r11,r11,32 - lvx v22,r10,$sp - addi r10,r10,32 - lvx v23,r11,$sp - addi r11,r11,32 - lvx v24,r10,$sp - addi r10,r10,32 - lvx v25,r11,$sp - addi r11,r11,32 - lvx v26,r10,$sp - addi r10,r10,32 - lvx v27,r11,$sp - addi r11,r11,32 - lvx v28,r10,$sp - addi r10,r10,32 - lvx v29,r11,$sp - addi r11,r11,32 - lvx v30,r10,$sp - lvx v31,r11,$sp - $POP r26,`$FRAME+21*16+0*$SIZE_T`($sp) - $POP r27,`$FRAME+21*16+1*$SIZE_T`($sp) - $POP r28,`$FRAME+21*16+2*$SIZE_T`($sp) - $POP r29,`$FRAME+21*16+3*$SIZE_T`($sp) - $POP r30,`$FRAME+21*16+4*$SIZE_T`($sp) - $POP r31,`$FRAME+21*16+5*$SIZE_T`($sp) - addi $sp,$sp,`$FRAME+21*16+6*$SIZE_T` - blr - .long 0 - .byte 0,12,0x14,0,0x80,6,6,0 - .long 0 -.size .${prefix}_cbc_encrypt,.-.${prefix}_cbc_encrypt -___ -}} }}} - -######################################################################### -{{{ # CTR procedure[s] # - -####################### WARNING: Here be dragons! ####################### -# -# This code is written as 'ctr32', based on a 32-bit counter used -# upstream. The kernel does *not* use a 32-bit counter. The kernel uses -# a 128-bit counter. -# -# This leads to subtle changes from the upstream code: the counter -# is incremented with vaddu_q_m rather than vaddu_w_m. This occurs in -# both the bulk (8 blocks at a time) path, and in the individual block -# path. Be aware of this when doing updates. -# -# See: -# 1d4aa0b4c181 ("crypto: vmx - Fixing AES-CTR counter bug") -# 009b30ac7444 ("crypto: vmx - CTR: always increment IV as quadword") -# https://github.com/openssl/openssl/pull/8942 -# -######################################################################### -my ($inp,$out,$len,$key,$ivp,$x10,$rounds,$idx)=map("r$_",(3..10)); -my ($rndkey0,$rndkey1,$inout,$tmp)= map("v$_",(0..3)); -my ($ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm,$one)= - map("v$_",(4..11)); -my $dat=$tmp; - -$code.=<<___; -.globl .${prefix}_ctr32_encrypt_blocks - ${UCMP}i $len,1 - bltlr- - - lis r0,0xfff0 - mfspr $vrsave,256 - mtspr 256,r0 - - li $idx,15 - vxor $rndkey0,$rndkey0,$rndkey0 - le?vspltisb $tmp,0x0f - - lvx $ivec,0,$ivp # load [unaligned] iv - lvsl $inpperm,0,$ivp - lvx $inptail,$idx,$ivp - vspltisb $one,1 - le?vxor $inpperm,$inpperm,$tmp - vperm $ivec,$ivec,$inptail,$inpperm - vsldoi $one,$rndkey0,$one,1 - - neg r11,$inp - ?lvsl $keyperm,0,$key # prepare for unaligned key - lwz $rounds,240($key) - - lvsr $inpperm,0,r11 # prepare for unaligned load - lvx $inptail,0,$inp - addi $inp,$inp,15 # 15 is not typo - le?vxor $inpperm,$inpperm,$tmp - - srwi $rounds,$rounds,1 - li $idx,16 - subi $rounds,$rounds,1 - - ${UCMP}i $len,8 - bge _aesp8_ctr32_encrypt8x - - ?lvsr $outperm,0,$out # prepare for unaligned store - vspltisb $outmask,-1 - lvx $outhead,0,$out - ?vperm $outmask,$rndkey0,$outmask,$outperm - le?vxor $outperm,$outperm,$tmp - - lvx $rndkey0,0,$key - mtctr $rounds - lvx $rndkey1,$idx,$key - addi $idx,$idx,16 - ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm - vxor $inout,$ivec,$rndkey0 - lvx $rndkey0,$idx,$key - addi $idx,$idx,16 - b Loop_ctr32_enc - -.align 5 -Loop_ctr32_enc: - ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm - vcipher $inout,$inout,$rndkey1 - lvx $rndkey1,$idx,$key - addi $idx,$idx,16 - ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm - vcipher $inout,$inout,$rndkey0 - lvx $rndkey0,$idx,$key - addi $idx,$idx,16 - bdnz Loop_ctr32_enc - - vadduqm $ivec,$ivec,$one # Kernel change for 128-bit - vmr $dat,$inptail - lvx $inptail,0,$inp - addi $inp,$inp,16 - subic. $len,$len,1 # blocks-- - - ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm - vcipher $inout,$inout,$rndkey1 - lvx $rndkey1,$idx,$key - vperm $dat,$dat,$inptail,$inpperm - li $idx,16 - ?vperm $rndkey1,$rndkey0,$rndkey1,$keyperm - lvx $rndkey0,0,$key - vxor $dat,$dat,$rndkey1 # last round key - vcipherlast $inout,$inout,$dat - - lvx $rndkey1,$idx,$key - addi $idx,$idx,16 - vperm $inout,$inout,$inout,$outperm - vsel $dat,$outhead,$inout,$outmask - mtctr $rounds - ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm - vmr $outhead,$inout - vxor $inout,$ivec,$rndkey0 - lvx $rndkey0,$idx,$key - addi $idx,$idx,16 - stvx $dat,0,$out - addi $out,$out,16 - bne Loop_ctr32_enc - - addi $out,$out,-1 - lvx $inout,0,$out # redundant in aligned case - vsel $inout,$outhead,$inout,$outmask - stvx $inout,0,$out - - mtspr 256,$vrsave - blr - .long 0 - .byte 0,12,0x14,0,0,0,6,0 - .long 0 -___ -######################################################################### -{{ # Optimized CTR procedure # -my $key_="r11"; -my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31)); -my ($in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7 )=map("v$_",(0..3,10,12..14)); -my ($out0,$out1,$out2,$out3,$out4,$out5,$out6,$out7)=map("v$_",(15..22)); -my $rndkey0="v23"; # v24-v25 rotating buffer for first found keys - # v26-v31 last 6 round keys -my ($tmp,$keyperm)=($in3,$in4); # aliases with "caller", redundant assignment -my ($two,$three,$four)=($outhead,$outperm,$outmask); - -$code.=<<___; -.align 5 -_aesp8_ctr32_encrypt8x: - $STU $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp) - li r10,`$FRAME+8*16+15` - li r11,`$FRAME+8*16+31` - stvx v20,r10,$sp # ABI says so - addi r10,r10,32 - stvx v21,r11,$sp - addi r11,r11,32 - stvx v22,r10,$sp - addi r10,r10,32 - stvx v23,r11,$sp - addi r11,r11,32 - stvx v24,r10,$sp - addi r10,r10,32 - stvx v25,r11,$sp - addi r11,r11,32 - stvx v26,r10,$sp - addi r10,r10,32 - stvx v27,r11,$sp - addi r11,r11,32 - stvx v28,r10,$sp - addi r10,r10,32 - stvx v29,r11,$sp - addi r11,r11,32 - stvx v30,r10,$sp - stvx v31,r11,$sp - li r0,-1 - stw $vrsave,`$FRAME+21*16-4`($sp) # save vrsave - li $x10,0x10 - $PUSH r26,`$FRAME+21*16+0*$SIZE_T`($sp) - li $x20,0x20 - $PUSH r27,`$FRAME+21*16+1*$SIZE_T`($sp) - li $x30,0x30 - $PUSH r28,`$FRAME+21*16+2*$SIZE_T`($sp) - li $x40,0x40 - $PUSH r29,`$FRAME+21*16+3*$SIZE_T`($sp) - li $x50,0x50 - $PUSH r30,`$FRAME+21*16+4*$SIZE_T`($sp) - li $x60,0x60 - $PUSH r31,`$FRAME+21*16+5*$SIZE_T`($sp) - li $x70,0x70 - mtspr 256,r0 - - subi $rounds,$rounds,3 # -4 in total - - lvx $rndkey0,$x00,$key # load key schedule - lvx v30,$x10,$key - addi $key,$key,0x20 - lvx v31,$x00,$key - ?vperm $rndkey0,$rndkey0,v30,$keyperm - addi $key_,$sp,$FRAME+15 - mtctr $rounds - -Load_ctr32_enc_key: - ?vperm v24,v30,v31,$keyperm - lvx v30,$x10,$key - addi $key,$key,0x20 - stvx v24,$x00,$key_ # off-load round[1] - ?vperm v25,v31,v30,$keyperm - lvx v31,$x00,$key - stvx v25,$x10,$key_ # off-load round[2] - addi $key_,$key_,0x20 - bdnz Load_ctr32_enc_key - - lvx v26,$x10,$key - ?vperm v24,v30,v31,$keyperm - lvx v27,$x20,$key - stvx v24,$x00,$key_ # off-load round[3] - ?vperm v25,v31,v26,$keyperm - lvx v28,$x30,$key - stvx v25,$x10,$key_ # off-load round[4] - addi $key_,$sp,$FRAME+15 # rewind $key_ - ?vperm v26,v26,v27,$keyperm - lvx v29,$x40,$key - ?vperm v27,v27,v28,$keyperm - lvx v30,$x50,$key - ?vperm v28,v28,v29,$keyperm - lvx v31,$x60,$key - ?vperm v29,v29,v30,$keyperm - lvx $out0,$x70,$key # borrow $out0 - ?vperm v30,v30,v31,$keyperm - lvx v24,$x00,$key_ # pre-load round[1] - ?vperm v31,v31,$out0,$keyperm - lvx v25,$x10,$key_ # pre-load round[2] - - vadduqm $two,$one,$one - subi $inp,$inp,15 # undo "caller" - $SHL $len,$len,4 - - vadduqm $out1,$ivec,$one # counter values ... - vadduqm $out2,$ivec,$two # (do all ctr adds as 128-bit) - vxor $out0,$ivec,$rndkey0 # ... xored with rndkey[0] - le?li $idx,8 - vadduqm $out3,$out1,$two - vxor $out1,$out1,$rndkey0 - le?lvsl $inpperm,0,$idx - vadduqm $out4,$out2,$two - vxor $out2,$out2,$rndkey0 - le?vspltisb $tmp,0x0f - vadduqm $out5,$out3,$two - vxor $out3,$out3,$rndkey0 - le?vxor $inpperm,$inpperm,$tmp # transform for lvx_u/stvx_u - vadduqm $out6,$out4,$two - vxor $out4,$out4,$rndkey0 - vadduqm $out7,$out5,$two - vxor $out5,$out5,$rndkey0 - vadduqm $ivec,$out6,$two # next counter value - vxor $out6,$out6,$rndkey0 - vxor $out7,$out7,$rndkey0 - - mtctr $rounds - b Loop_ctr32_enc8x -.align 5 -Loop_ctr32_enc8x: - vcipher $out0,$out0,v24 - vcipher $out1,$out1,v24 - vcipher $out2,$out2,v24 - vcipher $out3,$out3,v24 - vcipher $out4,$out4,v24 - vcipher $out5,$out5,v24 - vcipher $out6,$out6,v24 - vcipher $out7,$out7,v24 -Loop_ctr32_enc8x_middle: - lvx v24,$x20,$key_ # round[3] - addi $key_,$key_,0x20 - - vcipher $out0,$out0,v25 - vcipher $out1,$out1,v25 - vcipher $out2,$out2,v25 - vcipher $out3,$out3,v25 - vcipher $out4,$out4,v25 - vcipher $out5,$out5,v25 - vcipher $out6,$out6,v25 - vcipher $out7,$out7,v25 - lvx v25,$x10,$key_ # round[4] - bdnz Loop_ctr32_enc8x - - subic r11,$len,256 # $len-256, borrow $key_ - vcipher $out0,$out0,v24 - vcipher $out1,$out1,v24 - vcipher $out2,$out2,v24 - vcipher $out3,$out3,v24 - vcipher $out4,$out4,v24 - vcipher $out5,$out5,v24 - vcipher $out6,$out6,v24 - vcipher $out7,$out7,v24 - - subfe r0,r0,r0 # borrow?-1:0 - vcipher $out0,$out0,v25 - vcipher $out1,$out1,v25 - vcipher $out2,$out2,v25 - vcipher $out3,$out3,v25 - vcipher $out4,$out4,v25 - vcipher $out5,$out5,v25 - vcipher $out6,$out6,v25 - vcipher $out7,$out7,v25 - - and r0,r0,r11 - addi $key_,$sp,$FRAME+15 # rewind $key_ - vcipher $out0,$out0,v26 - vcipher $out1,$out1,v26 - vcipher $out2,$out2,v26 - vcipher $out3,$out3,v26 - vcipher $out4,$out4,v26 - vcipher $out5,$out5,v26 - vcipher $out6,$out6,v26 - vcipher $out7,$out7,v26 - lvx v24,$x00,$key_ # re-pre-load round[1] - - subic $len,$len,129 # $len-=129 - vcipher $out0,$out0,v27 - addi $len,$len,1 # $len-=128 really - vcipher $out1,$out1,v27 - vcipher $out2,$out2,v27 - vcipher $out3,$out3,v27 - vcipher $out4,$out4,v27 - vcipher $out5,$out5,v27 - vcipher $out6,$out6,v27 - vcipher $out7,$out7,v27 - lvx v25,$x10,$key_ # re-pre-load round[2] - - vcipher $out0,$out0,v28 - lvx_u $in0,$x00,$inp # load input - vcipher $out1,$out1,v28 - lvx_u $in1,$x10,$inp - vcipher $out2,$out2,v28 - lvx_u $in2,$x20,$inp - vcipher $out3,$out3,v28 - lvx_u $in3,$x30,$inp - vcipher $out4,$out4,v28 - lvx_u $in4,$x40,$inp - vcipher $out5,$out5,v28 - lvx_u $in5,$x50,$inp - vcipher $out6,$out6,v28 - lvx_u $in6,$x60,$inp - vcipher $out7,$out7,v28 - lvx_u $in7,$x70,$inp - addi $inp,$inp,0x80 - - vcipher $out0,$out0,v29 - le?vperm $in0,$in0,$in0,$inpperm - vcipher $out1,$out1,v29 - le?vperm $in1,$in1,$in1,$inpperm - vcipher $out2,$out2,v29 - le?vperm $in2,$in2,$in2,$inpperm - vcipher $out3,$out3,v29 - le?vperm $in3,$in3,$in3,$inpperm - vcipher $out4,$out4,v29 - le?vperm $in4,$in4,$in4,$inpperm - vcipher $out5,$out5,v29 - le?vperm $in5,$in5,$in5,$inpperm - vcipher $out6,$out6,v29 - le?vperm $in6,$in6,$in6,$inpperm - vcipher $out7,$out7,v29 - le?vperm $in7,$in7,$in7,$inpperm - - add $inp,$inp,r0 # $inp is adjusted in such - # way that at exit from the - # loop inX-in7 are loaded - # with last "words" - subfe. r0,r0,r0 # borrow?-1:0 - vcipher $out0,$out0,v30 - vxor $in0,$in0,v31 # xor with last round key - vcipher $out1,$out1,v30 - vxor $in1,$in1,v31 - vcipher $out2,$out2,v30 - vxor $in2,$in2,v31 - vcipher $out3,$out3,v30 - vxor $in3,$in3,v31 - vcipher $out4,$out4,v30 - vxor $in4,$in4,v31 - vcipher $out5,$out5,v30 - vxor $in5,$in5,v31 - vcipher $out6,$out6,v30 - vxor $in6,$in6,v31 - vcipher $out7,$out7,v30 - vxor $in7,$in7,v31 - - bne Lctr32_enc8x_break # did $len-129 borrow? - - vcipherlast $in0,$out0,$in0 - vcipherlast $in1,$out1,$in1 - vadduqm $out1,$ivec,$one # counter values ... - vcipherlast $in2,$out2,$in2 - vadduqm $out2,$ivec,$two - vxor $out0,$ivec,$rndkey0 # ... xored with rndkey[0] - vcipherlast $in3,$out3,$in3 - vadduqm $out3,$out1,$two - vxor $out1,$out1,$rndkey0 - vcipherlast $in4,$out4,$in4 - vadduqm $out4,$out2,$two - vxor $out2,$out2,$rndkey0 - vcipherlast $in5,$out5,$in5 - vadduqm $out5,$out3,$two - vxor $out3,$out3,$rndkey0 - vcipherlast $in6,$out6,$in6 - vadduqm $out6,$out4,$two - vxor $out4,$out4,$rndkey0 - vcipherlast $in7,$out7,$in7 - vadduqm $out7,$out5,$two - vxor $out5,$out5,$rndkey0 - le?vperm $in0,$in0,$in0,$inpperm - vadduqm $ivec,$out6,$two # next counter value - vxor $out6,$out6,$rndkey0 - le?vperm $in1,$in1,$in1,$inpperm - vxor $out7,$out7,$rndkey0 - mtctr $rounds - - vcipher $out0,$out0,v24 - stvx_u $in0,$x00,$out - le?vperm $in2,$in2,$in2,$inpperm - vcipher $out1,$out1,v24 - stvx_u $in1,$x10,$out - le?vperm $in3,$in3,$in3,$inpperm - vcipher $out2,$out2,v24 - stvx_u $in2,$x20,$out - le?vperm $in4,$in4,$in4,$inpperm - vcipher $out3,$out3,v24 - stvx_u $in3,$x30,$out - le?vperm $in5,$in5,$in5,$inpperm - vcipher $out4,$out4,v24 - stvx_u $in4,$x40,$out - le?vperm $in6,$in6,$in6,$inpperm - vcipher $out5,$out5,v24 - stvx_u $in5,$x50,$out - le?vperm $in7,$in7,$in7,$inpperm - vcipher $out6,$out6,v24 - stvx_u $in6,$x60,$out - vcipher $out7,$out7,v24 - stvx_u $in7,$x70,$out - addi $out,$out,0x80 - - b Loop_ctr32_enc8x_middle - -.align 5 -Lctr32_enc8x_break: - cmpwi $len,-0x60 - blt Lctr32_enc8x_one - nop - beq Lctr32_enc8x_two - cmpwi $len,-0x40 - blt Lctr32_enc8x_three - nop - beq Lctr32_enc8x_four - cmpwi $len,-0x20 - blt Lctr32_enc8x_five - nop - beq Lctr32_enc8x_six - cmpwi $len,0x00 - blt Lctr32_enc8x_seven - -Lctr32_enc8x_eight: - vcipherlast $out0,$out0,$in0 - vcipherlast $out1,$out1,$in1 - vcipherlast $out2,$out2,$in2 - vcipherlast $out3,$out3,$in3 - vcipherlast $out4,$out4,$in4 - vcipherlast $out5,$out5,$in5 - vcipherlast $out6,$out6,$in6 - vcipherlast $out7,$out7,$in7 - - le?vperm $out0,$out0,$out0,$inpperm - le?vperm $out1,$out1,$out1,$inpperm - stvx_u $out0,$x00,$out - le?vperm $out2,$out2,$out2,$inpperm - stvx_u $out1,$x10,$out - le?vperm $out3,$out3,$out3,$inpperm - stvx_u $out2,$x20,$out - le?vperm $out4,$out4,$out4,$inpperm - stvx_u $out3,$x30,$out - le?vperm $out5,$out5,$out5,$inpperm - stvx_u $out4,$x40,$out - le?vperm $out6,$out6,$out6,$inpperm - stvx_u $out5,$x50,$out - le?vperm $out7,$out7,$out7,$inpperm - stvx_u $out6,$x60,$out - stvx_u $out7,$x70,$out - addi $out,$out,0x80 - b Lctr32_enc8x_done - -.align 5 -Lctr32_enc8x_seven: - vcipherlast $out0,$out0,$in1 - vcipherlast $out1,$out1,$in2 - vcipherlast $out2,$out2,$in3 - vcipherlast $out3,$out3,$in4 - vcipherlast $out4,$out4,$in5 - vcipherlast $out5,$out5,$in6 - vcipherlast $out6,$out6,$in7 - - le?vperm $out0,$out0,$out0,$inpperm - le?vperm $out1,$out1,$out1,$inpperm - stvx_u $out0,$x00,$out - le?vperm $out2,$out2,$out2,$inpperm - stvx_u $out1,$x10,$out - le?vperm $out3,$out3,$out3,$inpperm - stvx_u $out2,$x20,$out - le?vperm $out4,$out4,$out4,$inpperm - stvx_u $out3,$x30,$out - le?vperm $out5,$out5,$out5,$inpperm - stvx_u $out4,$x40,$out - le?vperm $out6,$out6,$out6,$inpperm - stvx_u $out5,$x50,$out - stvx_u $out6,$x60,$out - addi $out,$out,0x70 - b Lctr32_enc8x_done - -.align 5 -Lctr32_enc8x_six: - vcipherlast $out0,$out0,$in2 - vcipherlast $out1,$out1,$in3 - vcipherlast $out2,$out2,$in4 - vcipherlast $out3,$out3,$in5 - vcipherlast $out4,$out4,$in6 - vcipherlast $out5,$out5,$in7 - - le?vperm $out0,$out0,$out0,$inpperm - le?vperm $out1,$out1,$out1,$inpperm - stvx_u $out0,$x00,$out - le?vperm $out2,$out2,$out2,$inpperm - stvx_u $out1,$x10,$out - le?vperm $out3,$out3,$out3,$inpperm - stvx_u $out2,$x20,$out - le?vperm $out4,$out4,$out4,$inpperm - stvx_u $out3,$x30,$out - le?vperm $out5,$out5,$out5,$inpperm - stvx_u $out4,$x40,$out - stvx_u $out5,$x50,$out - addi $out,$out,0x60 - b Lctr32_enc8x_done - -.align 5 -Lctr32_enc8x_five: - vcipherlast $out0,$out0,$in3 - vcipherlast $out1,$out1,$in4 - vcipherlast $out2,$out2,$in5 - vcipherlast $out3,$out3,$in6 - vcipherlast $out4,$out4,$in7 - - le?vperm $out0,$out0,$out0,$inpperm - le?vperm $out1,$out1,$out1,$inpperm - stvx_u $out0,$x00,$out - le?vperm $out2,$out2,$out2,$inpperm - stvx_u $out1,$x10,$out - le?vperm $out3,$out3,$out3,$inpperm - stvx_u $out2,$x20,$out - le?vperm $out4,$out4,$out4,$inpperm - stvx_u $out3,$x30,$out - stvx_u $out4,$x40,$out - addi $out,$out,0x50 - b Lctr32_enc8x_done - -.align 5 -Lctr32_enc8x_four: - vcipherlast $out0,$out0,$in4 - vcipherlast $out1,$out1,$in5 - vcipherlast $out2,$out2,$in6 - vcipherlast $out3,$out3,$in7 - - le?vperm $out0,$out0,$out0,$inpperm - le?vperm $out1,$out1,$out1,$inpperm - stvx_u $out0,$x00,$out - le?vperm $out2,$out2,$out2,$inpperm - stvx_u $out1,$x10,$out - le?vperm $out3,$out3,$out3,$inpperm - stvx_u $out2,$x20,$out - stvx_u $out3,$x30,$out - addi $out,$out,0x40 - b Lctr32_enc8x_done - -.align 5 -Lctr32_enc8x_three: - vcipherlast $out0,$out0,$in5 - vcipherlast $out1,$out1,$in6 - vcipherlast $out2,$out2,$in7 - - le?vperm $out0,$out0,$out0,$inpperm - le?vperm $out1,$out1,$out1,$inpperm - stvx_u $out0,$x00,$out - le?vperm $out2,$out2,$out2,$inpperm - stvx_u $out1,$x10,$out - stvx_u $out2,$x20,$out - addi $out,$out,0x30 - b Lctr32_enc8x_done - -.align 5 -Lctr32_enc8x_two: - vcipherlast $out0,$out0,$in6 - vcipherlast $out1,$out1,$in7 - - le?vperm $out0,$out0,$out0,$inpperm - le?vperm $out1,$out1,$out1,$inpperm - stvx_u $out0,$x00,$out - stvx_u $out1,$x10,$out - addi $out,$out,0x20 - b Lctr32_enc8x_done - -.align 5 -Lctr32_enc8x_one: - vcipherlast $out0,$out0,$in7 - - le?vperm $out0,$out0,$out0,$inpperm - stvx_u $out0,0,$out - addi $out,$out,0x10 - -Lctr32_enc8x_done: - li r10,`$FRAME+15` - li r11,`$FRAME+31` - stvx $inpperm,r10,$sp # wipe copies of round keys - addi r10,r10,32 - stvx $inpperm,r11,$sp - addi r11,r11,32 - stvx $inpperm,r10,$sp - addi r10,r10,32 - stvx $inpperm,r11,$sp - addi r11,r11,32 - stvx $inpperm,r10,$sp - addi r10,r10,32 - stvx $inpperm,r11,$sp - addi r11,r11,32 - stvx $inpperm,r10,$sp - addi r10,r10,32 - stvx $inpperm,r11,$sp - addi r11,r11,32 - - mtspr 256,$vrsave - lvx v20,r10,$sp # ABI says so - addi r10,r10,32 - lvx v21,r11,$sp - addi r11,r11,32 - lvx v22,r10,$sp - addi r10,r10,32 - lvx v23,r11,$sp - addi r11,r11,32 - lvx v24,r10,$sp - addi r10,r10,32 - lvx v25,r11,$sp - addi r11,r11,32 - lvx v26,r10,$sp - addi r10,r10,32 - lvx v27,r11,$sp - addi r11,r11,32 - lvx v28,r10,$sp - addi r10,r10,32 - lvx v29,r11,$sp - addi r11,r11,32 - lvx v30,r10,$sp - lvx v31,r11,$sp - $POP r26,`$FRAME+21*16+0*$SIZE_T`($sp) - $POP r27,`$FRAME+21*16+1*$SIZE_T`($sp) - $POP r28,`$FRAME+21*16+2*$SIZE_T`($sp) - $POP r29,`$FRAME+21*16+3*$SIZE_T`($sp) - $POP r30,`$FRAME+21*16+4*$SIZE_T`($sp) - $POP r31,`$FRAME+21*16+5*$SIZE_T`($sp) - addi $sp,$sp,`$FRAME+21*16+6*$SIZE_T` - blr - .long 0 - .byte 0,12,0x14,0,0x80,6,6,0 - .long 0 -.size .${prefix}_ctr32_encrypt_blocks,.-.${prefix}_ctr32_encrypt_blocks -___ -}} }}} - -######################################################################### -{{{ # XTS procedures # -# int aes_p8_xts_[en|de]crypt(const char *inp, char *out, size_t len, # -# const AES_KEY *key1, const AES_KEY *key2, # -# [const] unsigned char iv[16]); # -# If $key2 is NULL, then a "tweak chaining" mode is engaged, in which # -# input tweak value is assumed to be encrypted already, and last tweak # -# value, one suitable for consecutive call on same chunk of data, is # -# written back to original buffer. In addition, in "tweak chaining" # -# mode only complete input blocks are processed. # - -my ($inp,$out,$len,$key1,$key2,$ivp,$rounds,$idx) = map("r$_",(3..10)); -my ($rndkey0,$rndkey1,$inout) = map("v$_",(0..2)); -my ($output,$inptail,$inpperm,$leperm,$keyperm) = map("v$_",(3..7)); -my ($tweak,$seven,$eighty7,$tmp,$tweak1) = map("v$_",(8..12)); -my $taillen = $key2; - - ($inp,$idx) = ($idx,$inp); # reassign - -$code.=<<___; -.globl .${prefix}_xts_encrypt - mr $inp,r3 # reassign - li r3,-1 - ${UCMP}i $len,16 - bltlr- - - lis r0,0xfff0 - mfspr r12,256 # save vrsave - li r11,0 - mtspr 256,r0 - - vspltisb $seven,0x07 # 0x070707..07 - le?lvsl $leperm,r11,r11 - le?vspltisb $tmp,0x0f - le?vxor $leperm,$leperm,$seven - - li $idx,15 - lvx $tweak,0,$ivp # load [unaligned] iv - lvsl $inpperm,0,$ivp - lvx $inptail,$idx,$ivp - le?vxor $inpperm,$inpperm,$tmp - vperm $tweak,$tweak,$inptail,$inpperm - - neg r11,$inp - lvsr $inpperm,0,r11 # prepare for unaligned load - lvx $inout,0,$inp - addi $inp,$inp,15 # 15 is not typo - le?vxor $inpperm,$inpperm,$tmp - - ${UCMP}i $key2,0 # key2==NULL? - beq Lxts_enc_no_key2 - - ?lvsl $keyperm,0,$key2 # prepare for unaligned key - lwz $rounds,240($key2) - srwi $rounds,$rounds,1 - subi $rounds,$rounds,1 - li $idx,16 - - lvx $rndkey0,0,$key2 - lvx $rndkey1,$idx,$key2 - addi $idx,$idx,16 - ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm - vxor $tweak,$tweak,$rndkey0 - lvx $rndkey0,$idx,$key2 - addi $idx,$idx,16 - mtctr $rounds - -Ltweak_xts_enc: - ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm - vcipher $tweak,$tweak,$rndkey1 - lvx $rndkey1,$idx,$key2 - addi $idx,$idx,16 - ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm - vcipher $tweak,$tweak,$rndkey0 - lvx $rndkey0,$idx,$key2 - addi $idx,$idx,16 - bdnz Ltweak_xts_enc - - ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm - vcipher $tweak,$tweak,$rndkey1 - lvx $rndkey1,$idx,$key2 - ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm - vcipherlast $tweak,$tweak,$rndkey0 - - li $ivp,0 # don't chain the tweak - b Lxts_enc - -Lxts_enc_no_key2: - li $idx,-16 - and $len,$len,$idx # in "tweak chaining" - # mode only complete - # blocks are processed -Lxts_enc: - lvx $inptail,0,$inp - addi $inp,$inp,16 - - ?lvsl $keyperm,0,$key1 # prepare for unaligned key - lwz $rounds,240($key1) - srwi $rounds,$rounds,1 - subi $rounds,$rounds,1 - li $idx,16 - - vslb $eighty7,$seven,$seven # 0x808080..80 - vor $eighty7,$eighty7,$seven # 0x878787..87 - vspltisb $tmp,1 # 0x010101..01 - vsldoi $eighty7,$eighty7,$tmp,15 # 0x870101..01 - - ${UCMP}i $len,96 - bge _aesp8_xts_encrypt6x - - andi. $taillen,$len,15 - subic r0,$len,32 - subi $taillen,$taillen,16 - subfe r0,r0,r0 - and r0,r0,$taillen - add $inp,$inp,r0 - - lvx $rndkey0,0,$key1 - lvx $rndkey1,$idx,$key1 - addi $idx,$idx,16 - vperm $inout,$inout,$inptail,$inpperm - ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm - vxor $inout,$inout,$tweak - vxor $inout,$inout,$rndkey0 - lvx $rndkey0,$idx,$key1 - addi $idx,$idx,16 - mtctr $rounds - b Loop_xts_enc - -.align 5 -Loop_xts_enc: - ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm - vcipher $inout,$inout,$rndkey1 - lvx $rndkey1,$idx,$key1 - addi $idx,$idx,16 - ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm - vcipher $inout,$inout,$rndkey0 - lvx $rndkey0,$idx,$key1 - addi $idx,$idx,16 - bdnz Loop_xts_enc - - ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm - vcipher $inout,$inout,$rndkey1 - lvx $rndkey1,$idx,$key1 - li $idx,16 - ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm - vxor $rndkey0,$rndkey0,$tweak - vcipherlast $output,$inout,$rndkey0 - - le?vperm $tmp,$output,$output,$leperm - be?nop - le?stvx_u $tmp,0,$out - be?stvx_u $output,0,$out - addi $out,$out,16 - - subic. $len,$len,16 - beq Lxts_enc_done - - vmr $inout,$inptail - lvx $inptail,0,$inp - addi $inp,$inp,16 - lvx $rndkey0,0,$key1 - lvx $rndkey1,$idx,$key1 - addi $idx,$idx,16 - - subic r0,$len,32 - subfe r0,r0,r0 - and r0,r0,$taillen - add $inp,$inp,r0 - - vsrab $tmp,$tweak,$seven # next tweak value - vaddubm $tweak,$tweak,$tweak - vsldoi $tmp,$tmp,$tmp,15 - vand $tmp,$tmp,$eighty7 - vxor $tweak,$tweak,$tmp - - vperm $inout,$inout,$inptail,$inpperm - ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm - vxor $inout,$inout,$tweak - vxor $output,$output,$rndkey0 # just in case $len<16 - vxor $inout,$inout,$rndkey0 - lvx $rndkey0,$idx,$key1 - addi $idx,$idx,16 - - mtctr $rounds - ${UCMP}i $len,16 - bge Loop_xts_enc - - vxor $output,$output,$tweak - lvsr $inpperm,0,$len # $inpperm is no longer needed - vxor $inptail,$inptail,$inptail # $inptail is no longer needed - vspltisb $tmp,-1 - vperm $inptail,$inptail,$tmp,$inpperm - vsel $inout,$inout,$output,$inptail - - subi r11,$out,17 - subi $out,$out,16 - mtctr $len - li $len,16 -Loop_xts_enc_steal: - lbzu r0,1(r11) - stb r0,16(r11) - bdnz Loop_xts_enc_steal - - mtctr $rounds - b Loop_xts_enc # one more time... - -Lxts_enc_done: - ${UCMP}i $ivp,0 - beq Lxts_enc_ret - - vsrab $tmp,$tweak,$seven # next tweak value - vaddubm $tweak,$tweak,$tweak - vsldoi $tmp,$tmp,$tmp,15 - vand $tmp,$tmp,$eighty7 - vxor $tweak,$tweak,$tmp - - le?vperm $tweak,$tweak,$tweak,$leperm - stvx_u $tweak,0,$ivp - -Lxts_enc_ret: - mtspr 256,r12 # restore vrsave - li r3,0 - blr - .long 0 - .byte 0,12,0x04,0,0x80,6,6,0 - .long 0 -.size .${prefix}_xts_encrypt,.-.${prefix}_xts_encrypt - -.globl .${prefix}_xts_decrypt - mr $inp,r3 # reassign - li r3,-1 - ${UCMP}i $len,16 - bltlr- - - lis r0,0xfff8 - mfspr r12,256 # save vrsave - li r11,0 - mtspr 256,r0 - - andi. r0,$len,15 - neg r0,r0 - andi. r0,r0,16 - sub $len,$len,r0 - - vspltisb $seven,0x07 # 0x070707..07 - le?lvsl $leperm,r11,r11 - le?vspltisb $tmp,0x0f - le?vxor $leperm,$leperm,$seven - - li $idx,15 - lvx $tweak,0,$ivp # load [unaligned] iv - lvsl $inpperm,0,$ivp - lvx $inptail,$idx,$ivp - le?vxor $inpperm,$inpperm,$tmp - vperm $tweak,$tweak,$inptail,$inpperm - - neg r11,$inp - lvsr $inpperm,0,r11 # prepare for unaligned load - lvx $inout,0,$inp - addi $inp,$inp,15 # 15 is not typo - le?vxor $inpperm,$inpperm,$tmp - - ${UCMP}i $key2,0 # key2==NULL? - beq Lxts_dec_no_key2 - - ?lvsl $keyperm,0,$key2 # prepare for unaligned key - lwz $rounds,240($key2) - srwi $rounds,$rounds,1 - subi $rounds,$rounds,1 - li $idx,16 - - lvx $rndkey0,0,$key2 - lvx $rndkey1,$idx,$key2 - addi $idx,$idx,16 - ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm - vxor $tweak,$tweak,$rndkey0 - lvx $rndkey0,$idx,$key2 - addi $idx,$idx,16 - mtctr $rounds - -Ltweak_xts_dec: - ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm - vcipher $tweak,$tweak,$rndkey1 - lvx $rndkey1,$idx,$key2 - addi $idx,$idx,16 - ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm - vcipher $tweak,$tweak,$rndkey0 - lvx $rndkey0,$idx,$key2 - addi $idx,$idx,16 - bdnz Ltweak_xts_dec - - ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm - vcipher $tweak,$tweak,$rndkey1 - lvx $rndkey1,$idx,$key2 - ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm - vcipherlast $tweak,$tweak,$rndkey0 - - li $ivp,0 # don't chain the tweak - b Lxts_dec - -Lxts_dec_no_key2: - neg $idx,$len - andi. $idx,$idx,15 - add $len,$len,$idx # in "tweak chaining" - # mode only complete - # blocks are processed -Lxts_dec: - lvx $inptail,0,$inp - addi $inp,$inp,16 - - ?lvsl $keyperm,0,$key1 # prepare for unaligned key - lwz $rounds,240($key1) - srwi $rounds,$rounds,1 - subi $rounds,$rounds,1 - li $idx,16 - - vslb $eighty7,$seven,$seven # 0x808080..80 - vor $eighty7,$eighty7,$seven # 0x878787..87 - vspltisb $tmp,1 # 0x010101..01 - vsldoi $eighty7,$eighty7,$tmp,15 # 0x870101..01 - - ${UCMP}i $len,96 - bge _aesp8_xts_decrypt6x - - lvx $rndkey0,0,$key1 - lvx $rndkey1,$idx,$key1 - addi $idx,$idx,16 - vperm $inout,$inout,$inptail,$inpperm - ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm - vxor $inout,$inout,$tweak - vxor $inout,$inout,$rndkey0 - lvx $rndkey0,$idx,$key1 - addi $idx,$idx,16 - mtctr $rounds - - ${UCMP}i $len,16 - blt Ltail_xts_dec - be?b Loop_xts_dec - -.align 5 -Loop_xts_dec: - ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm - vncipher $inout,$inout,$rndkey1 - lvx $rndkey1,$idx,$key1 - addi $idx,$idx,16 - ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm - vncipher $inout,$inout,$rndkey0 - lvx $rndkey0,$idx,$key1 - addi $idx,$idx,16 - bdnz Loop_xts_dec - - ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm - vncipher $inout,$inout,$rndkey1 - lvx $rndkey1,$idx,$key1 - li $idx,16 - ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm - vxor $rndkey0,$rndkey0,$tweak - vncipherlast $output,$inout,$rndkey0 - - le?vperm $tmp,$output,$output,$leperm - be?nop - le?stvx_u $tmp,0,$out - be?stvx_u $output,0,$out - addi $out,$out,16 - - subic. $len,$len,16 - beq Lxts_dec_done - - vmr $inout,$inptail - lvx $inptail,0,$inp - addi $inp,$inp,16 - lvx $rndkey0,0,$key1 - lvx $rndkey1,$idx,$key1 - addi $idx,$idx,16 - - vsrab $tmp,$tweak,$seven # next tweak value - vaddubm $tweak,$tweak,$tweak - vsldoi $tmp,$tmp,$tmp,15 - vand $tmp,$tmp,$eighty7 - vxor $tweak,$tweak,$tmp - - vperm $inout,$inout,$inptail,$inpperm - ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm - vxor $inout,$inout,$tweak - vxor $inout,$inout,$rndkey0 - lvx $rndkey0,$idx,$key1 - addi $idx,$idx,16 - - mtctr $rounds - ${UCMP}i $len,16 - bge Loop_xts_dec - -Ltail_xts_dec: - vsrab $tmp,$tweak,$seven # next tweak value - vaddubm $tweak1,$tweak,$tweak - vsldoi $tmp,$tmp,$tmp,15 - vand $tmp,$tmp,$eighty7 - vxor $tweak1,$tweak1,$tmp - - subi $inp,$inp,16 - add $inp,$inp,$len - - vxor $inout,$inout,$tweak # :-( - vxor $inout,$inout,$tweak1 # :-) - -Loop_xts_dec_short: - ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm - vncipher $inout,$inout,$rndkey1 - lvx $rndkey1,$idx,$key1 - addi $idx,$idx,16 - ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm - vncipher $inout,$inout,$rndkey0 - lvx $rndkey0,$idx,$key1 - addi $idx,$idx,16 - bdnz Loop_xts_dec_short - - ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm - vncipher $inout,$inout,$rndkey1 - lvx $rndkey1,$idx,$key1 - li $idx,16 - ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm - vxor $rndkey0,$rndkey0,$tweak1 - vncipherlast $output,$inout,$rndkey0 - - le?vperm $tmp,$output,$output,$leperm - be?nop - le?stvx_u $tmp,0,$out - be?stvx_u $output,0,$out - - vmr $inout,$inptail - lvx $inptail,0,$inp - #addi $inp,$inp,16 - lvx $rndkey0,0,$key1 - lvx $rndkey1,$idx,$key1 - addi $idx,$idx,16 - vperm $inout,$inout,$inptail,$inpperm - ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm - - lvsr $inpperm,0,$len # $inpperm is no longer needed - vxor $inptail,$inptail,$inptail # $inptail is no longer needed - vspltisb $tmp,-1 - vperm $inptail,$inptail,$tmp,$inpperm - vsel $inout,$inout,$output,$inptail - - vxor $rndkey0,$rndkey0,$tweak - vxor $inout,$inout,$rndkey0 - lvx $rndkey0,$idx,$key1 - addi $idx,$idx,16 - - subi r11,$out,1 - mtctr $len - li $len,16 -Loop_xts_dec_steal: - lbzu r0,1(r11) - stb r0,16(r11) - bdnz Loop_xts_dec_steal - - mtctr $rounds - b Loop_xts_dec # one more time... - -Lxts_dec_done: - ${UCMP}i $ivp,0 - beq Lxts_dec_ret - - vsrab $tmp,$tweak,$seven # next tweak value - vaddubm $tweak,$tweak,$tweak - vsldoi $tmp,$tmp,$tmp,15 - vand $tmp,$tmp,$eighty7 - vxor $tweak,$tweak,$tmp - - le?vperm $tweak,$tweak,$tweak,$leperm - stvx_u $tweak,0,$ivp - -Lxts_dec_ret: - mtspr 256,r12 # restore vrsave - li r3,0 - blr - .long 0 - .byte 0,12,0x04,0,0x80,6,6,0 - .long 0 -.size .${prefix}_xts_decrypt,.-.${prefix}_xts_decrypt -___ -######################################################################### -{{ # Optimized XTS procedures # -my $key_=$key2; -my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,3,26..31)); - $x00=0 if ($flavour =~ /osx/); -my ($in0, $in1, $in2, $in3, $in4, $in5 )=map("v$_",(0..5)); -my ($out0, $out1, $out2, $out3, $out4, $out5)=map("v$_",(7,12..16)); -my ($twk0, $twk1, $twk2, $twk3, $twk4, $twk5)=map("v$_",(17..22)); -my $rndkey0="v23"; # v24-v25 rotating buffer for first found keys - # v26-v31 last 6 round keys -my ($keyperm)=($out0); # aliases with "caller", redundant assignment -my $taillen=$x70; - -$code.=<<___; -.align 5 -_aesp8_xts_encrypt6x: - $STU $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp) - mflr r11 - li r7,`$FRAME+8*16+15` - li r3,`$FRAME+8*16+31` - $PUSH r11,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp) - stvx v20,r7,$sp # ABI says so - addi r7,r7,32 - stvx v21,r3,$sp - addi r3,r3,32 - stvx v22,r7,$sp - addi r7,r7,32 - stvx v23,r3,$sp - addi r3,r3,32 - stvx v24,r7,$sp - addi r7,r7,32 - stvx v25,r3,$sp - addi r3,r3,32 - stvx v26,r7,$sp - addi r7,r7,32 - stvx v27,r3,$sp - addi r3,r3,32 - stvx v28,r7,$sp - addi r7,r7,32 - stvx v29,r3,$sp - addi r3,r3,32 - stvx v30,r7,$sp - stvx v31,r3,$sp - li r0,-1 - stw $vrsave,`$FRAME+21*16-4`($sp) # save vrsave - li $x10,0x10 - $PUSH r26,`$FRAME+21*16+0*$SIZE_T`($sp) - li $x20,0x20 - $PUSH r27,`$FRAME+21*16+1*$SIZE_T`($sp) - li $x30,0x30 - $PUSH r28,`$FRAME+21*16+2*$SIZE_T`($sp) - li $x40,0x40 - $PUSH r29,`$FRAME+21*16+3*$SIZE_T`($sp) - li $x50,0x50 - $PUSH r30,`$FRAME+21*16+4*$SIZE_T`($sp) - li $x60,0x60 - $PUSH r31,`$FRAME+21*16+5*$SIZE_T`($sp) - li $x70,0x70 - mtspr 256,r0 - - subi $rounds,$rounds,3 # -4 in total - - lvx $rndkey0,$x00,$key1 # load key schedule - lvx v30,$x10,$key1 - addi $key1,$key1,0x20 - lvx v31,$x00,$key1 - ?vperm $rndkey0,$rndkey0,v30,$keyperm - addi $key_,$sp,$FRAME+15 - mtctr $rounds - -Load_xts_enc_key: - ?vperm v24,v30,v31,$keyperm - lvx v30,$x10,$key1 - addi $key1,$key1,0x20 - stvx v24,$x00,$key_ # off-load round[1] - ?vperm v25,v31,v30,$keyperm - lvx v31,$x00,$key1 - stvx v25,$x10,$key_ # off-load round[2] - addi $key_,$key_,0x20 - bdnz Load_xts_enc_key - - lvx v26,$x10,$key1 - ?vperm v24,v30,v31,$keyperm - lvx v27,$x20,$key1 - stvx v24,$x00,$key_ # off-load round[3] - ?vperm v25,v31,v26,$keyperm - lvx v28,$x30,$key1 - stvx v25,$x10,$key_ # off-load round[4] - addi $key_,$sp,$FRAME+15 # rewind $key_ - ?vperm v26,v26,v27,$keyperm - lvx v29,$x40,$key1 - ?vperm v27,v27,v28,$keyperm - lvx v30,$x50,$key1 - ?vperm v28,v28,v29,$keyperm - lvx v31,$x60,$key1 - ?vperm v29,v29,v30,$keyperm - lvx $twk5,$x70,$key1 # borrow $twk5 - ?vperm v30,v30,v31,$keyperm - lvx v24,$x00,$key_ # pre-load round[1] - ?vperm v31,v31,$twk5,$keyperm - lvx v25,$x10,$key_ # pre-load round[2] - - vperm $in0,$inout,$inptail,$inpperm - subi $inp,$inp,31 # undo "caller" - vxor $twk0,$tweak,$rndkey0 - vsrab $tmp,$tweak,$seven # next tweak value - vaddubm $tweak,$tweak,$tweak - vsldoi $tmp,$tmp,$tmp,15 - vand $tmp,$tmp,$eighty7 - vxor $out0,$in0,$twk0 - vxor $tweak,$tweak,$tmp - - lvx_u $in1,$x10,$inp - vxor $twk1,$tweak,$rndkey0 - vsrab $tmp,$tweak,$seven # next tweak value - vaddubm $tweak,$tweak,$tweak - vsldoi $tmp,$tmp,$tmp,15 - le?vperm $in1,$in1,$in1,$leperm - vand $tmp,$tmp,$eighty7 - vxor $out1,$in1,$twk1 - vxor $tweak,$tweak,$tmp - - lvx_u $in2,$x20,$inp - andi. $taillen,$len,15 - vxor $twk2,$tweak,$rndkey0 - vsrab $tmp,$tweak,$seven # next tweak value - vaddubm $tweak,$tweak,$tweak - vsldoi $tmp,$tmp,$tmp,15 - le?vperm $in2,$in2,$in2,$leperm - vand $tmp,$tmp,$eighty7 - vxor $out2,$in2,$twk2 - vxor $tweak,$tweak,$tmp - - lvx_u $in3,$x30,$inp - sub $len,$len,$taillen - vxor $twk3,$tweak,$rndkey0 - vsrab $tmp,$tweak,$seven # next tweak value - vaddubm $tweak,$tweak,$tweak - vsldoi $tmp,$tmp,$tmp,15 - le?vperm $in3,$in3,$in3,$leperm - vand $tmp,$tmp,$eighty7 - vxor $out3,$in3,$twk3 - vxor $tweak,$tweak,$tmp - - lvx_u $in4,$x40,$inp - subi $len,$len,0x60 - vxor $twk4,$tweak,$rndkey0 - vsrab $tmp,$tweak,$seven # next tweak value - vaddubm $tweak,$tweak,$tweak - vsldoi $tmp,$tmp,$tmp,15 - le?vperm $in4,$in4,$in4,$leperm - vand $tmp,$tmp,$eighty7 - vxor $out4,$in4,$twk4 - vxor $tweak,$tweak,$tmp - - lvx_u $in5,$x50,$inp - addi $inp,$inp,0x60 - vxor $twk5,$tweak,$rndkey0 - vsrab $tmp,$tweak,$seven # next tweak value - vaddubm $tweak,$tweak,$tweak - vsldoi $tmp,$tmp,$tmp,15 - le?vperm $in5,$in5,$in5,$leperm - vand $tmp,$tmp,$eighty7 - vxor $out5,$in5,$twk5 - vxor $tweak,$tweak,$tmp - - vxor v31,v31,$rndkey0 - mtctr $rounds - b Loop_xts_enc6x - -.align 5 -Loop_xts_enc6x: - vcipher $out0,$out0,v24 - vcipher $out1,$out1,v24 - vcipher $out2,$out2,v24 - vcipher $out3,$out3,v24 - vcipher $out4,$out4,v24 - vcipher $out5,$out5,v24 - lvx v24,$x20,$key_ # round[3] - addi $key_,$key_,0x20 - - vcipher $out0,$out0,v25 - vcipher $out1,$out1,v25 - vcipher $out2,$out2,v25 - vcipher $out3,$out3,v25 - vcipher $out4,$out4,v25 - vcipher $out5,$out5,v25 - lvx v25,$x10,$key_ # round[4] - bdnz Loop_xts_enc6x - - subic $len,$len,96 # $len-=96 - vxor $in0,$twk0,v31 # xor with last round key - vcipher $out0,$out0,v24 - vcipher $out1,$out1,v24 - vsrab $tmp,$tweak,$seven # next tweak value - vxor $twk0,$tweak,$rndkey0 - vaddubm $tweak,$tweak,$tweak - vcipher $out2,$out2,v24 - vcipher $out3,$out3,v24 - vsldoi $tmp,$tmp,$tmp,15 - vcipher $out4,$out4,v24 - vcipher $out5,$out5,v24 - - subfe. r0,r0,r0 # borrow?-1:0 - vand $tmp,$tmp,$eighty7 - vcipher $out0,$out0,v25 - vcipher $out1,$out1,v25 - vxor $tweak,$tweak,$tmp - vcipher $out2,$out2,v25 - vcipher $out3,$out3,v25 - vxor $in1,$twk1,v31 - vsrab $tmp,$tweak,$seven # next tweak value - vxor $twk1,$tweak,$rndkey0 - vcipher $out4,$out4,v25 - vcipher $out5,$out5,v25 - - and r0,r0,$len - vaddubm $tweak,$tweak,$tweak - vsldoi $tmp,$tmp,$tmp,15 - vcipher $out0,$out0,v26 - vcipher $out1,$out1,v26 - vand $tmp,$tmp,$eighty7 - vcipher $out2,$out2,v26 - vcipher $out3,$out3,v26 - vxor $tweak,$tweak,$tmp - vcipher $out4,$out4,v26 - vcipher $out5,$out5,v26 - - add $inp,$inp,r0 # $inp is adjusted in such - # way that at exit from the - # loop inX-in5 are loaded - # with last "words" - vxor $in2,$twk2,v31 - vsrab $tmp,$tweak,$seven # next tweak value - vxor $twk2,$tweak,$rndkey0 - vaddubm $tweak,$tweak,$tweak - vcipher $out0,$out0,v27 - vcipher $out1,$out1,v27 - vsldoi $tmp,$tmp,$tmp,15 - vcipher $out2,$out2,v27 - vcipher $out3,$out3,v27 - vand $tmp,$tmp,$eighty7 - vcipher $out4,$out4,v27 - vcipher $out5,$out5,v27 - - addi $key_,$sp,$FRAME+15 # rewind $key_ - vxor $tweak,$tweak,$tmp - vcipher $out0,$out0,v28 - vcipher $out1,$out1,v28 - vxor $in3,$twk3,v31 - vsrab $tmp,$tweak,$seven # next tweak value - vxor $twk3,$tweak,$rndkey0 - vcipher $out2,$out2,v28 - vcipher $out3,$out3,v28 - vaddubm $tweak,$tweak,$tweak - vsldoi $tmp,$tmp,$tmp,15 - vcipher $out4,$out4,v28 - vcipher $out5,$out5,v28 - lvx v24,$x00,$key_ # re-pre-load round[1] - vand $tmp,$tmp,$eighty7 - - vcipher $out0,$out0,v29 - vcipher $out1,$out1,v29 - vxor $tweak,$tweak,$tmp - vcipher $out2,$out2,v29 - vcipher $out3,$out3,v29 - vxor $in4,$twk4,v31 - vsrab $tmp,$tweak,$seven # next tweak value - vxor $twk4,$tweak,$rndkey0 - vcipher $out4,$out4,v29 - vcipher $out5,$out5,v29 - lvx v25,$x10,$key_ # re-pre-load round[2] - vaddubm $tweak,$tweak,$tweak - vsldoi $tmp,$tmp,$tmp,15 - - vcipher $out0,$out0,v30 - vcipher $out1,$out1,v30 - vand $tmp,$tmp,$eighty7 - vcipher $out2,$out2,v30 - vcipher $out3,$out3,v30 - vxor $tweak,$tweak,$tmp - vcipher $out4,$out4,v30 - vcipher $out5,$out5,v30 - vxor $in5,$twk5,v31 - vsrab $tmp,$tweak,$seven # next tweak value - vxor $twk5,$tweak,$rndkey0 - - vcipherlast $out0,$out0,$in0 - lvx_u $in0,$x00,$inp # load next input block - vaddubm $tweak,$tweak,$tweak - vsldoi $tmp,$tmp,$tmp,15 - vcipherlast $out1,$out1,$in1 - lvx_u $in1,$x10,$inp - vcipherlast $out2,$out2,$in2 - le?vperm $in0,$in0,$in0,$leperm - lvx_u $in2,$x20,$inp - vand $tmp,$tmp,$eighty7 - vcipherlast $out3,$out3,$in3 - le?vperm $in1,$in1,$in1,$leperm - lvx_u $in3,$x30,$inp - vcipherlast $out4,$out4,$in4 - le?vperm $in2,$in2,$in2,$leperm - lvx_u $in4,$x40,$inp - vxor $tweak,$tweak,$tmp - vcipherlast $tmp,$out5,$in5 # last block might be needed - # in stealing mode - le?vperm $in3,$in3,$in3,$leperm - lvx_u $in5,$x50,$inp - addi $inp,$inp,0x60 - le?vperm $in4,$in4,$in4,$leperm - le?vperm $in5,$in5,$in5,$leperm - - le?vperm $out0,$out0,$out0,$leperm - le?vperm $out1,$out1,$out1,$leperm - stvx_u $out0,$x00,$out # store output - vxor $out0,$in0,$twk0 - le?vperm $out2,$out2,$out2,$leperm - stvx_u $out1,$x10,$out - vxor $out1,$in1,$twk1 - le?vperm $out3,$out3,$out3,$leperm - stvx_u $out2,$x20,$out - vxor $out2,$in2,$twk2 - le?vperm $out4,$out4,$out4,$leperm - stvx_u $out3,$x30,$out - vxor $out3,$in3,$twk3 - le?vperm $out5,$tmp,$tmp,$leperm - stvx_u $out4,$x40,$out - vxor $out4,$in4,$twk4 - le?stvx_u $out5,$x50,$out - be?stvx_u $tmp, $x50,$out - vxor $out5,$in5,$twk5 - addi $out,$out,0x60 - - mtctr $rounds - beq Loop_xts_enc6x # did $len-=96 borrow? - - addic. $len,$len,0x60 - beq Lxts_enc6x_zero - cmpwi $len,0x20 - blt Lxts_enc6x_one - nop - beq Lxts_enc6x_two - cmpwi $len,0x40 - blt Lxts_enc6x_three - nop - beq Lxts_enc6x_four - -Lxts_enc6x_five: - vxor $out0,$in1,$twk0 - vxor $out1,$in2,$twk1 - vxor $out2,$in3,$twk2 - vxor $out3,$in4,$twk3 - vxor $out4,$in5,$twk4 - - bl _aesp8_xts_enc5x - - le?vperm $out0,$out0,$out0,$leperm - vmr $twk0,$twk5 # unused tweak - le?vperm $out1,$out1,$out1,$leperm - stvx_u $out0,$x00,$out # store output - le?vperm $out2,$out2,$out2,$leperm - stvx_u $out1,$x10,$out - le?vperm $out3,$out3,$out3,$leperm - stvx_u $out2,$x20,$out - vxor $tmp,$out4,$twk5 # last block prep for stealing - le?vperm $out4,$out4,$out4,$leperm - stvx_u $out3,$x30,$out - stvx_u $out4,$x40,$out - addi $out,$out,0x50 - bne Lxts_enc6x_steal - b Lxts_enc6x_done - -.align 4 -Lxts_enc6x_four: - vxor $out0,$in2,$twk0 - vxor $out1,$in3,$twk1 - vxor $out2,$in4,$twk2 - vxor $out3,$in5,$twk3 - vxor $out4,$out4,$out4 - - bl _aesp8_xts_enc5x - - le?vperm $out0,$out0,$out0,$leperm - vmr $twk0,$twk4 # unused tweak - le?vperm $out1,$out1,$out1,$leperm - stvx_u $out0,$x00,$out # store output - le?vperm $out2,$out2,$out2,$leperm - stvx_u $out1,$x10,$out - vxor $tmp,$out3,$twk4 # last block prep for stealing - le?vperm $out3,$out3,$out3,$leperm - stvx_u $out2,$x20,$out - stvx_u $out3,$x30,$out - addi $out,$out,0x40 - bne Lxts_enc6x_steal - b Lxts_enc6x_done - -.align 4 -Lxts_enc6x_three: - vxor $out0,$in3,$twk0 - vxor $out1,$in4,$twk1 - vxor $out2,$in5,$twk2 - vxor $out3,$out3,$out3 - vxor $out4,$out4,$out4 - - bl _aesp8_xts_enc5x - - le?vperm $out0,$out0,$out0,$leperm - vmr $twk0,$twk3 # unused tweak - le?vperm $out1,$out1,$out1,$leperm - stvx_u $out0,$x00,$out # store output - vxor $tmp,$out2,$twk3 # last block prep for stealing - le?vperm $out2,$out2,$out2,$leperm - stvx_u $out1,$x10,$out - stvx_u $out2,$x20,$out - addi $out,$out,0x30 - bne Lxts_enc6x_steal - b Lxts_enc6x_done - -.align 4 -Lxts_enc6x_two: - vxor $out0,$in4,$twk0 - vxor $out1,$in5,$twk1 - vxor $out2,$out2,$out2 - vxor $out3,$out3,$out3 - vxor $out4,$out4,$out4 - - bl _aesp8_xts_enc5x - - le?vperm $out0,$out0,$out0,$leperm - vmr $twk0,$twk2 # unused tweak - vxor $tmp,$out1,$twk2 # last block prep for stealing - le?vperm $out1,$out1,$out1,$leperm - stvx_u $out0,$x00,$out # store output - stvx_u $out1,$x10,$out - addi $out,$out,0x20 - bne Lxts_enc6x_steal - b Lxts_enc6x_done - -.align 4 -Lxts_enc6x_one: - vxor $out0,$in5,$twk0 - nop -Loop_xts_enc1x: - vcipher $out0,$out0,v24 - lvx v24,$x20,$key_ # round[3] - addi $key_,$key_,0x20 - - vcipher $out0,$out0,v25 - lvx v25,$x10,$key_ # round[4] - bdnz Loop_xts_enc1x - - add $inp,$inp,$taillen - cmpwi $taillen,0 - vcipher $out0,$out0,v24 - - subi $inp,$inp,16 - vcipher $out0,$out0,v25 - - lvsr $inpperm,0,$taillen - vcipher $out0,$out0,v26 - - lvx_u $in0,0,$inp - vcipher $out0,$out0,v27 - - addi $key_,$sp,$FRAME+15 # rewind $key_ - vcipher $out0,$out0,v28 - lvx v24,$x00,$key_ # re-pre-load round[1] - - vcipher $out0,$out0,v29 - lvx v25,$x10,$key_ # re-pre-load round[2] - vxor $twk0,$twk0,v31 - - le?vperm $in0,$in0,$in0,$leperm - vcipher $out0,$out0,v30 - - vperm $in0,$in0,$in0,$inpperm - vcipherlast $out0,$out0,$twk0 - - vmr $twk0,$twk1 # unused tweak - vxor $tmp,$out0,$twk1 # last block prep for stealing - le?vperm $out0,$out0,$out0,$leperm - stvx_u $out0,$x00,$out # store output - addi $out,$out,0x10 - bne Lxts_enc6x_steal - b Lxts_enc6x_done - -.align 4 -Lxts_enc6x_zero: - cmpwi $taillen,0 - beq Lxts_enc6x_done - - add $inp,$inp,$taillen - subi $inp,$inp,16 - lvx_u $in0,0,$inp - lvsr $inpperm,0,$taillen # $in5 is no more - le?vperm $in0,$in0,$in0,$leperm - vperm $in0,$in0,$in0,$inpperm - vxor $tmp,$tmp,$twk0 -Lxts_enc6x_steal: - vxor $in0,$in0,$twk0 - vxor $out0,$out0,$out0 - vspltisb $out1,-1 - vperm $out0,$out0,$out1,$inpperm - vsel $out0,$in0,$tmp,$out0 # $tmp is last block, remember? - - subi r30,$out,17 - subi $out,$out,16 - mtctr $taillen -Loop_xts_enc6x_steal: - lbzu r0,1(r30) - stb r0,16(r30) - bdnz Loop_xts_enc6x_steal - - li $taillen,0 - mtctr $rounds - b Loop_xts_enc1x # one more time... - -.align 4 -Lxts_enc6x_done: - ${UCMP}i $ivp,0 - beq Lxts_enc6x_ret - - vxor $tweak,$twk0,$rndkey0 - le?vperm $tweak,$tweak,$tweak,$leperm - stvx_u $tweak,0,$ivp - -Lxts_enc6x_ret: - mtlr r11 - li r10,`$FRAME+15` - li r11,`$FRAME+31` - stvx $seven,r10,$sp # wipe copies of round keys - addi r10,r10,32 - stvx $seven,r11,$sp - addi r11,r11,32 - stvx $seven,r10,$sp - addi r10,r10,32 - stvx $seven,r11,$sp - addi r11,r11,32 - stvx $seven,r10,$sp - addi r10,r10,32 - stvx $seven,r11,$sp - addi r11,r11,32 - stvx $seven,r10,$sp - addi r10,r10,32 - stvx $seven,r11,$sp - addi r11,r11,32 - - mtspr 256,$vrsave - lvx v20,r10,$sp # ABI says so - addi r10,r10,32 - lvx v21,r11,$sp - addi r11,r11,32 - lvx v22,r10,$sp - addi r10,r10,32 - lvx v23,r11,$sp - addi r11,r11,32 - lvx v24,r10,$sp - addi r10,r10,32 - lvx v25,r11,$sp - addi r11,r11,32 - lvx v26,r10,$sp - addi r10,r10,32 - lvx v27,r11,$sp - addi r11,r11,32 - lvx v28,r10,$sp - addi r10,r10,32 - lvx v29,r11,$sp - addi r11,r11,32 - lvx v30,r10,$sp - lvx v31,r11,$sp - $POP r26,`$FRAME+21*16+0*$SIZE_T`($sp) - $POP r27,`$FRAME+21*16+1*$SIZE_T`($sp) - $POP r28,`$FRAME+21*16+2*$SIZE_T`($sp) - $POP r29,`$FRAME+21*16+3*$SIZE_T`($sp) - $POP r30,`$FRAME+21*16+4*$SIZE_T`($sp) - $POP r31,`$FRAME+21*16+5*$SIZE_T`($sp) - addi $sp,$sp,`$FRAME+21*16+6*$SIZE_T` - blr - .long 0 - .byte 0,12,0x04,1,0x80,6,6,0 - .long 0 - -.align 5 -_aesp8_xts_enc5x: - vcipher $out0,$out0,v24 - vcipher $out1,$out1,v24 - vcipher $out2,$out2,v24 - vcipher $out3,$out3,v24 - vcipher $out4,$out4,v24 - lvx v24,$x20,$key_ # round[3] - addi $key_,$key_,0x20 - - vcipher $out0,$out0,v25 - vcipher $out1,$out1,v25 - vcipher $out2,$out2,v25 - vcipher $out3,$out3,v25 - vcipher $out4,$out4,v25 - lvx v25,$x10,$key_ # round[4] - bdnz _aesp8_xts_enc5x - - add $inp,$inp,$taillen - cmpwi $taillen,0 - vcipher $out0,$out0,v24 - vcipher $out1,$out1,v24 - vcipher $out2,$out2,v24 - vcipher $out3,$out3,v24 - vcipher $out4,$out4,v24 - - subi $inp,$inp,16 - vcipher $out0,$out0,v25 - vcipher $out1,$out1,v25 - vcipher $out2,$out2,v25 - vcipher $out3,$out3,v25 - vcipher $out4,$out4,v25 - vxor $twk0,$twk0,v31 - - vcipher $out0,$out0,v26 - lvsr $inpperm,r0,$taillen # $in5 is no more - vcipher $out1,$out1,v26 - vcipher $out2,$out2,v26 - vcipher $out3,$out3,v26 - vcipher $out4,$out4,v26 - vxor $in1,$twk1,v31 - - vcipher $out0,$out0,v27 - lvx_u $in0,0,$inp - vcipher $out1,$out1,v27 - vcipher $out2,$out2,v27 - vcipher $out3,$out3,v27 - vcipher $out4,$out4,v27 - vxor $in2,$twk2,v31 - - addi $key_,$sp,$FRAME+15 # rewind $key_ - vcipher $out0,$out0,v28 - vcipher $out1,$out1,v28 - vcipher $out2,$out2,v28 - vcipher $out3,$out3,v28 - vcipher $out4,$out4,v28 - lvx v24,$x00,$key_ # re-pre-load round[1] - vxor $in3,$twk3,v31 - - vcipher $out0,$out0,v29 - le?vperm $in0,$in0,$in0,$leperm - vcipher $out1,$out1,v29 - vcipher $out2,$out2,v29 - vcipher $out3,$out3,v29 - vcipher $out4,$out4,v29 - lvx v25,$x10,$key_ # re-pre-load round[2] - vxor $in4,$twk4,v31 - - vcipher $out0,$out0,v30 - vperm $in0,$in0,$in0,$inpperm - vcipher $out1,$out1,v30 - vcipher $out2,$out2,v30 - vcipher $out3,$out3,v30 - vcipher $out4,$out4,v30 - - vcipherlast $out0,$out0,$twk0 - vcipherlast $out1,$out1,$in1 - vcipherlast $out2,$out2,$in2 - vcipherlast $out3,$out3,$in3 - vcipherlast $out4,$out4,$in4 - blr - .long 0 - .byte 0,12,0x14,0,0,0,0,0 - -.align 5 -_aesp8_xts_decrypt6x: - $STU $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp) - mflr r11 - li r7,`$FRAME+8*16+15` - li r3,`$FRAME+8*16+31` - $PUSH r11,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp) - stvx v20,r7,$sp # ABI says so - addi r7,r7,32 - stvx v21,r3,$sp - addi r3,r3,32 - stvx v22,r7,$sp - addi r7,r7,32 - stvx v23,r3,$sp - addi r3,r3,32 - stvx v24,r7,$sp - addi r7,r7,32 - stvx v25,r3,$sp - addi r3,r3,32 - stvx v26,r7,$sp - addi r7,r7,32 - stvx v27,r3,$sp - addi r3,r3,32 - stvx v28,r7,$sp - addi r7,r7,32 - stvx v29,r3,$sp - addi r3,r3,32 - stvx v30,r7,$sp - stvx v31,r3,$sp - li r0,-1 - stw $vrsave,`$FRAME+21*16-4`($sp) # save vrsave - li $x10,0x10 - $PUSH r26,`$FRAME+21*16+0*$SIZE_T`($sp) - li $x20,0x20 - $PUSH r27,`$FRAME+21*16+1*$SIZE_T`($sp) - li $x30,0x30 - $PUSH r28,`$FRAME+21*16+2*$SIZE_T`($sp) - li $x40,0x40 - $PUSH r29,`$FRAME+21*16+3*$SIZE_T`($sp) - li $x50,0x50 - $PUSH r30,`$FRAME+21*16+4*$SIZE_T`($sp) - li $x60,0x60 - $PUSH r31,`$FRAME+21*16+5*$SIZE_T`($sp) - li $x70,0x70 - mtspr 256,r0 - - subi $rounds,$rounds,3 # -4 in total - - lvx $rndkey0,$x00,$key1 # load key schedule - lvx v30,$x10,$key1 - addi $key1,$key1,0x20 - lvx v31,$x00,$key1 - ?vperm $rndkey0,$rndkey0,v30,$keyperm - addi $key_,$sp,$FRAME+15 - mtctr $rounds - -Load_xts_dec_key: - ?vperm v24,v30,v31,$keyperm - lvx v30,$x10,$key1 - addi $key1,$key1,0x20 - stvx v24,$x00,$key_ # off-load round[1] - ?vperm v25,v31,v30,$keyperm - lvx v31,$x00,$key1 - stvx v25,$x10,$key_ # off-load round[2] - addi $key_,$key_,0x20 - bdnz Load_xts_dec_key - - lvx v26,$x10,$key1 - ?vperm v24,v30,v31,$keyperm - lvx v27,$x20,$key1 - stvx v24,$x00,$key_ # off-load round[3] - ?vperm v25,v31,v26,$keyperm - lvx v28,$x30,$key1 - stvx v25,$x10,$key_ # off-load round[4] - addi $key_,$sp,$FRAME+15 # rewind $key_ - ?vperm v26,v26,v27,$keyperm - lvx v29,$x40,$key1 - ?vperm v27,v27,v28,$keyperm - lvx v30,$x50,$key1 - ?vperm v28,v28,v29,$keyperm - lvx v31,$x60,$key1 - ?vperm v29,v29,v30,$keyperm - lvx $twk5,$x70,$key1 # borrow $twk5 - ?vperm v30,v30,v31,$keyperm - lvx v24,$x00,$key_ # pre-load round[1] - ?vperm v31,v31,$twk5,$keyperm - lvx v25,$x10,$key_ # pre-load round[2] - - vperm $in0,$inout,$inptail,$inpperm - subi $inp,$inp,31 # undo "caller" - vxor $twk0,$tweak,$rndkey0 - vsrab $tmp,$tweak,$seven # next tweak value - vaddubm $tweak,$tweak,$tweak - vsldoi $tmp,$tmp,$tmp,15 - vand $tmp,$tmp,$eighty7 - vxor $out0,$in0,$twk0 - vxor $tweak,$tweak,$tmp - - lvx_u $in1,$x10,$inp - vxor $twk1,$tweak,$rndkey0 - vsrab $tmp,$tweak,$seven # next tweak value - vaddubm $tweak,$tweak,$tweak - vsldoi $tmp,$tmp,$tmp,15 - le?vperm $in1,$in1,$in1,$leperm - vand $tmp,$tmp,$eighty7 - vxor $out1,$in1,$twk1 - vxor $tweak,$tweak,$tmp - - lvx_u $in2,$x20,$inp - andi. $taillen,$len,15 - vxor $twk2,$tweak,$rndkey0 - vsrab $tmp,$tweak,$seven # next tweak value - vaddubm $tweak,$tweak,$tweak - vsldoi $tmp,$tmp,$tmp,15 - le?vperm $in2,$in2,$in2,$leperm - vand $tmp,$tmp,$eighty7 - vxor $out2,$in2,$twk2 - vxor $tweak,$tweak,$tmp - - lvx_u $in3,$x30,$inp - sub $len,$len,$taillen - vxor $twk3,$tweak,$rndkey0 - vsrab $tmp,$tweak,$seven # next tweak value - vaddubm $tweak,$tweak,$tweak - vsldoi $tmp,$tmp,$tmp,15 - le?vperm $in3,$in3,$in3,$leperm - vand $tmp,$tmp,$eighty7 - vxor $out3,$in3,$twk3 - vxor $tweak,$tweak,$tmp - - lvx_u $in4,$x40,$inp - subi $len,$len,0x60 - vxor $twk4,$tweak,$rndkey0 - vsrab $tmp,$tweak,$seven # next tweak value - vaddubm $tweak,$tweak,$tweak - vsldoi $tmp,$tmp,$tmp,15 - le?vperm $in4,$in4,$in4,$leperm - vand $tmp,$tmp,$eighty7 - vxor $out4,$in4,$twk4 - vxor $tweak,$tweak,$tmp - - lvx_u $in5,$x50,$inp - addi $inp,$inp,0x60 - vxor $twk5,$tweak,$rndkey0 - vsrab $tmp,$tweak,$seven # next tweak value - vaddubm $tweak,$tweak,$tweak - vsldoi $tmp,$tmp,$tmp,15 - le?vperm $in5,$in5,$in5,$leperm - vand $tmp,$tmp,$eighty7 - vxor $out5,$in5,$twk5 - vxor $tweak,$tweak,$tmp - - vxor v31,v31,$rndkey0 - mtctr $rounds - b Loop_xts_dec6x - -.align 5 -Loop_xts_dec6x: - vncipher $out0,$out0,v24 - vncipher $out1,$out1,v24 - vncipher $out2,$out2,v24 - vncipher $out3,$out3,v24 - vncipher $out4,$out4,v24 - vncipher $out5,$out5,v24 - lvx v24,$x20,$key_ # round[3] - addi $key_,$key_,0x20 - - vncipher $out0,$out0,v25 - vncipher $out1,$out1,v25 - vncipher $out2,$out2,v25 - vncipher $out3,$out3,v25 - vncipher $out4,$out4,v25 - vncipher $out5,$out5,v25 - lvx v25,$x10,$key_ # round[4] - bdnz Loop_xts_dec6x - - subic $len,$len,96 # $len-=96 - vxor $in0,$twk0,v31 # xor with last round key - vncipher $out0,$out0,v24 - vncipher $out1,$out1,v24 - vsrab $tmp,$tweak,$seven # next tweak value - vxor $twk0,$tweak,$rndkey0 - vaddubm $tweak,$tweak,$tweak - vncipher $out2,$out2,v24 - vncipher $out3,$out3,v24 - vsldoi $tmp,$tmp,$tmp,15 - vncipher $out4,$out4,v24 - vncipher $out5,$out5,v24 - - subfe. r0,r0,r0 # borrow?-1:0 - vand $tmp,$tmp,$eighty7 - vncipher $out0,$out0,v25 - vncipher $out1,$out1,v25 - vxor $tweak,$tweak,$tmp - vncipher $out2,$out2,v25 - vncipher $out3,$out3,v25 - vxor $in1,$twk1,v31 - vsrab $tmp,$tweak,$seven # next tweak value - vxor $twk1,$tweak,$rndkey0 - vncipher $out4,$out4,v25 - vncipher $out5,$out5,v25 - - and r0,r0,$len - vaddubm $tweak,$tweak,$tweak - vsldoi $tmp,$tmp,$tmp,15 - vncipher $out0,$out0,v26 - vncipher $out1,$out1,v26 - vand $tmp,$tmp,$eighty7 - vncipher $out2,$out2,v26 - vncipher $out3,$out3,v26 - vxor $tweak,$tweak,$tmp - vncipher $out4,$out4,v26 - vncipher $out5,$out5,v26 - - add $inp,$inp,r0 # $inp is adjusted in such - # way that at exit from the - # loop inX-in5 are loaded - # with last "words" - vxor $in2,$twk2,v31 - vsrab $tmp,$tweak,$seven # next tweak value - vxor $twk2,$tweak,$rndkey0 - vaddubm $tweak,$tweak,$tweak - vncipher $out0,$out0,v27 - vncipher $out1,$out1,v27 - vsldoi $tmp,$tmp,$tmp,15 - vncipher $out2,$out2,v27 - vncipher $out3,$out3,v27 - vand $tmp,$tmp,$eighty7 - vncipher $out4,$out4,v27 - vncipher $out5,$out5,v27 - - addi $key_,$sp,$FRAME+15 # rewind $key_ - vxor $tweak,$tweak,$tmp - vncipher $out0,$out0,v28 - vncipher $out1,$out1,v28 - vxor $in3,$twk3,v31 - vsrab $tmp,$tweak,$seven # next tweak value - vxor $twk3,$tweak,$rndkey0 - vncipher $out2,$out2,v28 - vncipher $out3,$out3,v28 - vaddubm $tweak,$tweak,$tweak - vsldoi $tmp,$tmp,$tmp,15 - vncipher $out4,$out4,v28 - vncipher $out5,$out5,v28 - lvx v24,$x00,$key_ # re-pre-load round[1] - vand $tmp,$tmp,$eighty7 - - vncipher $out0,$out0,v29 - vncipher $out1,$out1,v29 - vxor $tweak,$tweak,$tmp - vncipher $out2,$out2,v29 - vncipher $out3,$out3,v29 - vxor $in4,$twk4,v31 - vsrab $tmp,$tweak,$seven # next tweak value - vxor $twk4,$tweak,$rndkey0 - vncipher $out4,$out4,v29 - vncipher $out5,$out5,v29 - lvx v25,$x10,$key_ # re-pre-load round[2] - vaddubm $tweak,$tweak,$tweak - vsldoi $tmp,$tmp,$tmp,15 - - vncipher $out0,$out0,v30 - vncipher $out1,$out1,v30 - vand $tmp,$tmp,$eighty7 - vncipher $out2,$out2,v30 - vncipher $out3,$out3,v30 - vxor $tweak,$tweak,$tmp - vncipher $out4,$out4,v30 - vncipher $out5,$out5,v30 - vxor $in5,$twk5,v31 - vsrab $tmp,$tweak,$seven # next tweak value - vxor $twk5,$tweak,$rndkey0 - - vncipherlast $out0,$out0,$in0 - lvx_u $in0,$x00,$inp # load next input block - vaddubm $tweak,$tweak,$tweak - vsldoi $tmp,$tmp,$tmp,15 - vncipherlast $out1,$out1,$in1 - lvx_u $in1,$x10,$inp - vncipherlast $out2,$out2,$in2 - le?vperm $in0,$in0,$in0,$leperm - lvx_u $in2,$x20,$inp - vand $tmp,$tmp,$eighty7 - vncipherlast $out3,$out3,$in3 - le?vperm $in1,$in1,$in1,$leperm - lvx_u $in3,$x30,$inp - vncipherlast $out4,$out4,$in4 - le?vperm $in2,$in2,$in2,$leperm - lvx_u $in4,$x40,$inp - vxor $tweak,$tweak,$tmp - vncipherlast $out5,$out5,$in5 - le?vperm $in3,$in3,$in3,$leperm - lvx_u $in5,$x50,$inp - addi $inp,$inp,0x60 - le?vperm $in4,$in4,$in4,$leperm - le?vperm $in5,$in5,$in5,$leperm - - le?vperm $out0,$out0,$out0,$leperm - le?vperm $out1,$out1,$out1,$leperm - stvx_u $out0,$x00,$out # store output - vxor $out0,$in0,$twk0 - le?vperm $out2,$out2,$out2,$leperm - stvx_u $out1,$x10,$out - vxor $out1,$in1,$twk1 - le?vperm $out3,$out3,$out3,$leperm - stvx_u $out2,$x20,$out - vxor $out2,$in2,$twk2 - le?vperm $out4,$out4,$out4,$leperm - stvx_u $out3,$x30,$out - vxor $out3,$in3,$twk3 - le?vperm $out5,$out5,$out5,$leperm - stvx_u $out4,$x40,$out - vxor $out4,$in4,$twk4 - stvx_u $out5,$x50,$out - vxor $out5,$in5,$twk5 - addi $out,$out,0x60 - - mtctr $rounds - beq Loop_xts_dec6x # did $len-=96 borrow? - - addic. $len,$len,0x60 - beq Lxts_dec6x_zero - cmpwi $len,0x20 - blt Lxts_dec6x_one - nop - beq Lxts_dec6x_two - cmpwi $len,0x40 - blt Lxts_dec6x_three - nop - beq Lxts_dec6x_four - -Lxts_dec6x_five: - vxor $out0,$in1,$twk0 - vxor $out1,$in2,$twk1 - vxor $out2,$in3,$twk2 - vxor $out3,$in4,$twk3 - vxor $out4,$in5,$twk4 - - bl _aesp8_xts_dec5x - - le?vperm $out0,$out0,$out0,$leperm - vmr $twk0,$twk5 # unused tweak - vxor $twk1,$tweak,$rndkey0 - le?vperm $out1,$out1,$out1,$leperm - stvx_u $out0,$x00,$out # store output - vxor $out0,$in0,$twk1 - le?vperm $out2,$out2,$out2,$leperm - stvx_u $out1,$x10,$out - le?vperm $out3,$out3,$out3,$leperm - stvx_u $out2,$x20,$out - le?vperm $out4,$out4,$out4,$leperm - stvx_u $out3,$x30,$out - stvx_u $out4,$x40,$out - addi $out,$out,0x50 - bne Lxts_dec6x_steal - b Lxts_dec6x_done - -.align 4 -Lxts_dec6x_four: - vxor $out0,$in2,$twk0 - vxor $out1,$in3,$twk1 - vxor $out2,$in4,$twk2 - vxor $out3,$in5,$twk3 - vxor $out4,$out4,$out4 - - bl _aesp8_xts_dec5x - - le?vperm $out0,$out0,$out0,$leperm - vmr $twk0,$twk4 # unused tweak - vmr $twk1,$twk5 - le?vperm $out1,$out1,$out1,$leperm - stvx_u $out0,$x00,$out # store output - vxor $out0,$in0,$twk5 - le?vperm $out2,$out2,$out2,$leperm - stvx_u $out1,$x10,$out - le?vperm $out3,$out3,$out3,$leperm - stvx_u $out2,$x20,$out - stvx_u $out3,$x30,$out - addi $out,$out,0x40 - bne Lxts_dec6x_steal - b Lxts_dec6x_done - -.align 4 -Lxts_dec6x_three: - vxor $out0,$in3,$twk0 - vxor $out1,$in4,$twk1 - vxor $out2,$in5,$twk2 - vxor $out3,$out3,$out3 - vxor $out4,$out4,$out4 - - bl _aesp8_xts_dec5x - - le?vperm $out0,$out0,$out0,$leperm - vmr $twk0,$twk3 # unused tweak - vmr $twk1,$twk4 - le?vperm $out1,$out1,$out1,$leperm - stvx_u $out0,$x00,$out # store output - vxor $out0,$in0,$twk4 - le?vperm $out2,$out2,$out2,$leperm - stvx_u $out1,$x10,$out - stvx_u $out2,$x20,$out - addi $out,$out,0x30 - bne Lxts_dec6x_steal - b Lxts_dec6x_done - -.align 4 -Lxts_dec6x_two: - vxor $out0,$in4,$twk0 - vxor $out1,$in5,$twk1 - vxor $out2,$out2,$out2 - vxor $out3,$out3,$out3 - vxor $out4,$out4,$out4 - - bl _aesp8_xts_dec5x - - le?vperm $out0,$out0,$out0,$leperm - vmr $twk0,$twk2 # unused tweak - vmr $twk1,$twk3 - le?vperm $out1,$out1,$out1,$leperm - stvx_u $out0,$x00,$out # store output - vxor $out0,$in0,$twk3 - stvx_u $out1,$x10,$out - addi $out,$out,0x20 - bne Lxts_dec6x_steal - b Lxts_dec6x_done - -.align 4 -Lxts_dec6x_one: - vxor $out0,$in5,$twk0 - nop -Loop_xts_dec1x: - vncipher $out0,$out0,v24 - lvx v24,$x20,$key_ # round[3] - addi $key_,$key_,0x20 - - vncipher $out0,$out0,v25 - lvx v25,$x10,$key_ # round[4] - bdnz Loop_xts_dec1x - - subi r0,$taillen,1 - vncipher $out0,$out0,v24 - - andi. r0,r0,16 - cmpwi $taillen,0 - vncipher $out0,$out0,v25 - - sub $inp,$inp,r0 - vncipher $out0,$out0,v26 - - lvx_u $in0,0,$inp - vncipher $out0,$out0,v27 - - addi $key_,$sp,$FRAME+15 # rewind $key_ - vncipher $out0,$out0,v28 - lvx v24,$x00,$key_ # re-pre-load round[1] - - vncipher $out0,$out0,v29 - lvx v25,$x10,$key_ # re-pre-load round[2] - vxor $twk0,$twk0,v31 - - le?vperm $in0,$in0,$in0,$leperm - vncipher $out0,$out0,v30 - - mtctr $rounds - vncipherlast $out0,$out0,$twk0 - - vmr $twk0,$twk1 # unused tweak - vmr $twk1,$twk2 - le?vperm $out0,$out0,$out0,$leperm - stvx_u $out0,$x00,$out # store output - addi $out,$out,0x10 - vxor $out0,$in0,$twk2 - bne Lxts_dec6x_steal - b Lxts_dec6x_done - -.align 4 -Lxts_dec6x_zero: - cmpwi $taillen,0 - beq Lxts_dec6x_done - - lvx_u $in0,0,$inp - le?vperm $in0,$in0,$in0,$leperm - vxor $out0,$in0,$twk1 -Lxts_dec6x_steal: - vncipher $out0,$out0,v24 - lvx v24,$x20,$key_ # round[3] - addi $key_,$key_,0x20 - - vncipher $out0,$out0,v25 - lvx v25,$x10,$key_ # round[4] - bdnz Lxts_dec6x_steal - - add $inp,$inp,$taillen - vncipher $out0,$out0,v24 - - cmpwi $taillen,0 - vncipher $out0,$out0,v25 - - lvx_u $in0,0,$inp - vncipher $out0,$out0,v26 - - lvsr $inpperm,0,$taillen # $in5 is no more - vncipher $out0,$out0,v27 - - addi $key_,$sp,$FRAME+15 # rewind $key_ - vncipher $out0,$out0,v28 - lvx v24,$x00,$key_ # re-pre-load round[1] - - vncipher $out0,$out0,v29 - lvx v25,$x10,$key_ # re-pre-load round[2] - vxor $twk1,$twk1,v31 - - le?vperm $in0,$in0,$in0,$leperm - vncipher $out0,$out0,v30 - - vperm $in0,$in0,$in0,$inpperm - vncipherlast $tmp,$out0,$twk1 - - le?vperm $out0,$tmp,$tmp,$leperm - le?stvx_u $out0,0,$out - be?stvx_u $tmp,0,$out - - vxor $out0,$out0,$out0 - vspltisb $out1,-1 - vperm $out0,$out0,$out1,$inpperm - vsel $out0,$in0,$tmp,$out0 - vxor $out0,$out0,$twk0 - - subi r30,$out,1 - mtctr $taillen -Loop_xts_dec6x_steal: - lbzu r0,1(r30) - stb r0,16(r30) - bdnz Loop_xts_dec6x_steal - - li $taillen,0 - mtctr $rounds - b Loop_xts_dec1x # one more time... - -.align 4 -Lxts_dec6x_done: - ${UCMP}i $ivp,0 - beq Lxts_dec6x_ret - - vxor $tweak,$twk0,$rndkey0 - le?vperm $tweak,$tweak,$tweak,$leperm - stvx_u $tweak,0,$ivp - -Lxts_dec6x_ret: - mtlr r11 - li r10,`$FRAME+15` - li r11,`$FRAME+31` - stvx $seven,r10,$sp # wipe copies of round keys - addi r10,r10,32 - stvx $seven,r11,$sp - addi r11,r11,32 - stvx $seven,r10,$sp - addi r10,r10,32 - stvx $seven,r11,$sp - addi r11,r11,32 - stvx $seven,r10,$sp - addi r10,r10,32 - stvx $seven,r11,$sp - addi r11,r11,32 - stvx $seven,r10,$sp - addi r10,r10,32 - stvx $seven,r11,$sp - addi r11,r11,32 - - mtspr 256,$vrsave - lvx v20,r10,$sp # ABI says so - addi r10,r10,32 - lvx v21,r11,$sp - addi r11,r11,32 - lvx v22,r10,$sp - addi r10,r10,32 - lvx v23,r11,$sp - addi r11,r11,32 - lvx v24,r10,$sp - addi r10,r10,32 - lvx v25,r11,$sp - addi r11,r11,32 - lvx v26,r10,$sp - addi r10,r10,32 - lvx v27,r11,$sp - addi r11,r11,32 - lvx v28,r10,$sp - addi r10,r10,32 - lvx v29,r11,$sp - addi r11,r11,32 - lvx v30,r10,$sp - lvx v31,r11,$sp - $POP r26,`$FRAME+21*16+0*$SIZE_T`($sp) - $POP r27,`$FRAME+21*16+1*$SIZE_T`($sp) - $POP r28,`$FRAME+21*16+2*$SIZE_T`($sp) - $POP r29,`$FRAME+21*16+3*$SIZE_T`($sp) - $POP r30,`$FRAME+21*16+4*$SIZE_T`($sp) - $POP r31,`$FRAME+21*16+5*$SIZE_T`($sp) - addi $sp,$sp,`$FRAME+21*16+6*$SIZE_T` - blr - .long 0 - .byte 0,12,0x04,1,0x80,6,6,0 - .long 0 - -.align 5 -_aesp8_xts_dec5x: - vncipher $out0,$out0,v24 - vncipher $out1,$out1,v24 - vncipher $out2,$out2,v24 - vncipher $out3,$out3,v24 - vncipher $out4,$out4,v24 - lvx v24,$x20,$key_ # round[3] - addi $key_,$key_,0x20 - - vncipher $out0,$out0,v25 - vncipher $out1,$out1,v25 - vncipher $out2,$out2,v25 - vncipher $out3,$out3,v25 - vncipher $out4,$out4,v25 - lvx v25,$x10,$key_ # round[4] - bdnz _aesp8_xts_dec5x - - subi r0,$taillen,1 - vncipher $out0,$out0,v24 - vncipher $out1,$out1,v24 - vncipher $out2,$out2,v24 - vncipher $out3,$out3,v24 - vncipher $out4,$out4,v24 - - andi. r0,r0,16 - cmpwi $taillen,0 - vncipher $out0,$out0,v25 - vncipher $out1,$out1,v25 - vncipher $out2,$out2,v25 - vncipher $out3,$out3,v25 - vncipher $out4,$out4,v25 - vxor $twk0,$twk0,v31 - - sub $inp,$inp,r0 - vncipher $out0,$out0,v26 - vncipher $out1,$out1,v26 - vncipher $out2,$out2,v26 - vncipher $out3,$out3,v26 - vncipher $out4,$out4,v26 - vxor $in1,$twk1,v31 - - vncipher $out0,$out0,v27 - lvx_u $in0,0,$inp - vncipher $out1,$out1,v27 - vncipher $out2,$out2,v27 - vncipher $out3,$out3,v27 - vncipher $out4,$out4,v27 - vxor $in2,$twk2,v31 - - addi $key_,$sp,$FRAME+15 # rewind $key_ - vncipher $out0,$out0,v28 - vncipher $out1,$out1,v28 - vncipher $out2,$out2,v28 - vncipher $out3,$out3,v28 - vncipher $out4,$out4,v28 - lvx v24,$x00,$key_ # re-pre-load round[1] - vxor $in3,$twk3,v31 - - vncipher $out0,$out0,v29 - le?vperm $in0,$in0,$in0,$leperm - vncipher $out1,$out1,v29 - vncipher $out2,$out2,v29 - vncipher $out3,$out3,v29 - vncipher $out4,$out4,v29 - lvx v25,$x10,$key_ # re-pre-load round[2] - vxor $in4,$twk4,v31 - - vncipher $out0,$out0,v30 - vncipher $out1,$out1,v30 - vncipher $out2,$out2,v30 - vncipher $out3,$out3,v30 - vncipher $out4,$out4,v30 - - vncipherlast $out0,$out0,$twk0 - vncipherlast $out1,$out1,$in1 - vncipherlast $out2,$out2,$in2 - vncipherlast $out3,$out3,$in3 - vncipherlast $out4,$out4,$in4 - mtctr $rounds - blr - .long 0 - .byte 0,12,0x14,0,0,0,0,0 -___ -}} }}} - -my $consts=1; -foreach(split("\n",$code)) { - s/\`([^\`]*)\`/eval($1)/geo; - - # constants table endian-specific conversion - if ($consts && m/\.(long|byte)\s+(.+)\s+(\?[a-z]*)$/o) { - my $conv=$3; - my @bytes=(); - - # convert to endian-agnostic format - if ($1 eq "long") { - foreach (split(/,\s*/,$2)) { - my $l = /^0/?oct:int; - push @bytes,($l>>24)&0xff,($l>>16)&0xff,($l>>8)&0xff,$l&0xff; - } - } else { - @bytes = map(/^0/?oct:int,split(/,\s*/,$2)); - } - - # little-endian conversion - if ($flavour =~ /le$/o) { - SWITCH: for($conv) { - /\?inv/ && do { @bytes=map($_^0xf,@bytes); last; }; - /\?rev/ && do { @bytes=reverse(@bytes); last; }; - } - } - - #emit - print ".byte\t",join(',',map (sprintf("0x%02x",$_),@bytes)),"\n"; - next; - } - $consts=0 if (m/Lconsts:/o); # end of table - - # instructions prefixed with '?' are endian-specific and need - # to be adjusted accordingly... - if ($flavour =~ /le$/o) { # little-endian - s/le\?//o or - s/be\?/#be#/o or - s/\?lvsr/lvsl/o or - s/\?lvsl/lvsr/o or - s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/o or - s/\?(vsldoi\s+v[0-9]+,\s*)(v[0-9]+,)\s*(v[0-9]+,\s*)([0-9]+)/$1$3$2 16-$4/o or - s/\?(vspltw\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9])/$1$2 3-$3/o; - } else { # big-endian - s/le\?/#le#/o or - s/be\?//o or - s/\?([a-z]+)/$1/o; - } - - print $_,"\n"; -} - -close STDOUT; diff --git a/arch/powerpc/crypto/ghashp8-ppc.pl b/arch/powerpc/crypto/ghashp8-ppc.pl deleted file mode 100644 index b56603b4a893..000000000000 --- a/arch/powerpc/crypto/ghashp8-ppc.pl +++ /dev/null @@ -1,370 +0,0 @@ -#!/usr/bin/env perl -# SPDX-License-Identifier: GPL-2.0 - -# This code is taken from the OpenSSL project but the author (Andy Polyakov) -# has relicensed it under the GPLv2. Therefore this program is free software; -# you can redistribute it and/or modify it under the terms of the GNU General -# Public License version 2 as published by the Free Software Foundation. -# -# The original headers, including the original license headers, are -# included below for completeness. - -# ==================================================================== -# Written by Andy Polyakov for the OpenSSL -# project. The module is, however, dual licensed under OpenSSL and -# CRYPTOGAMS licenses depending on where you obtain it. For further -# details see https://www.openssl.org/~appro/cryptogams/. -# ==================================================================== -# -# GHASH for PowerISA v2.07. -# -# July 2014 -# -# Accurate performance measurements are problematic, because it's -# always virtualized setup with possibly throttled processor. -# Relative comparison is therefore more informative. This initial -# version is ~2.1x slower than hardware-assisted AES-128-CTR, ~12x -# faster than "4-bit" integer-only compiler-generated 64-bit code. -# "Initial version" means that there is room for futher improvement. - -$flavour=shift; -$output =shift; - -if ($flavour =~ /64/) { - $SIZE_T=8; - $LRSAVE=2*$SIZE_T; - $STU="stdu"; - $POP="ld"; - $PUSH="std"; -} elsif ($flavour =~ /32/) { - $SIZE_T=4; - $LRSAVE=$SIZE_T; - $STU="stwu"; - $POP="lwz"; - $PUSH="stw"; -} else { die "nonsense $flavour"; } - -$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; -( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or -( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or -die "can't locate ppc-xlate.pl"; - -open STDOUT,"| $^X $xlate $flavour $output" || die "can't call $xlate: $!"; - -my ($Xip,$Htbl,$inp,$len)=map("r$_",(3..6)); # argument block - -my ($Xl,$Xm,$Xh,$IN)=map("v$_",(0..3)); -my ($zero,$t0,$t1,$t2,$xC2,$H,$Hh,$Hl,$lemask)=map("v$_",(4..12)); -my ($Xl1,$Xm1,$Xh1,$IN1,$H2,$H2h,$H2l)=map("v$_",(13..19)); -my $vrsave="r12"; -my ($t4,$t5,$t6) = ($Hl,$H,$Hh); - -$code=<<___; -.machine "any" - -.text - -.globl .gcm_init_p8 - lis r0,0xfff0 - li r8,0x10 - mfspr $vrsave,256 - li r9,0x20 - mtspr 256,r0 - li r10,0x30 - lvx_u $H,0,r4 # load H - le?xor r7,r7,r7 - le?addi r7,r7,0x8 # need a vperm start with 08 - le?lvsr 5,0,r7 - le?vspltisb 6,0x0f - le?vxor 5,5,6 # set a b-endian mask - le?vperm $H,$H,$H,5 - - vspltisb $xC2,-16 # 0xf0 - vspltisb $t0,1 # one - vaddubm $xC2,$xC2,$xC2 # 0xe0 - vxor $zero,$zero,$zero - vor $xC2,$xC2,$t0 # 0xe1 - vsldoi $xC2,$xC2,$zero,15 # 0xe1... - vsldoi $t1,$zero,$t0,1 # ...1 - vaddubm $xC2,$xC2,$xC2 # 0xc2... - vspltisb $t2,7 - vor $xC2,$xC2,$t1 # 0xc2....01 - vspltb $t1,$H,0 # most significant byte - vsl $H,$H,$t0 # H<<=1 - vsrab $t1,$t1,$t2 # broadcast carry bit - vand $t1,$t1,$xC2 - vxor $H,$H,$t1 # twisted H - - vsldoi $H,$H,$H,8 # twist even more ... - vsldoi $xC2,$zero,$xC2,8 # 0xc2.0 - vsldoi $Hl,$zero,$H,8 # ... and split - vsldoi $Hh,$H,$zero,8 - - stvx_u $xC2,0,r3 # save pre-computed table - stvx_u $Hl,r8,r3 - stvx_u $H, r9,r3 - stvx_u $Hh,r10,r3 - - mtspr 256,$vrsave - blr - .long 0 - .byte 0,12,0x14,0,0,0,2,0 - .long 0 -.size .gcm_init_p8,.-.gcm_init_p8 - -.globl .gcm_init_htable - lis r0,0xfff0 - li r8,0x10 - mfspr $vrsave,256 - li r9,0x20 - mtspr 256,r0 - li r10,0x30 - lvx_u $H,0,r4 # load H - - vspltisb $xC2,-16 # 0xf0 - vspltisb $t0,1 # one - vaddubm $xC2,$xC2,$xC2 # 0xe0 - vxor $zero,$zero,$zero - vor $xC2,$xC2,$t0 # 0xe1 - vsldoi $xC2,$xC2,$zero,15 # 0xe1... - vsldoi $t1,$zero,$t0,1 # ...1 - vaddubm $xC2,$xC2,$xC2 # 0xc2... - vspltisb $t2,7 - vor $xC2,$xC2,$t1 # 0xc2....01 - vspltb $t1,$H,0 # most significant byte - vsl $H,$H,$t0 # H<<=1 - vsrab $t1,$t1,$t2 # broadcast carry bit - vand $t1,$t1,$xC2 - vxor $IN,$H,$t1 # twisted H - - vsldoi $H,$IN,$IN,8 # twist even more ... - vsldoi $xC2,$zero,$xC2,8 # 0xc2.0 - vsldoi $Hl,$zero,$H,8 # ... and split - vsldoi $Hh,$H,$zero,8 - - stvx_u $xC2,0,r3 # save pre-computed table - stvx_u $Hl,r8,r3 - li r8,0x40 - stvx_u $H, r9,r3 - li r9,0x50 - stvx_u $Hh,r10,r3 - li r10,0x60 - - vpmsumd $Xl,$IN,$Hl # H.lo·H.lo - vpmsumd $Xm,$IN,$H # H.hi·H.lo+H.lo·H.hi - vpmsumd $Xh,$IN,$Hh # H.hi·H.hi - - vpmsumd $t2,$Xl,$xC2 # 1st reduction phase - - vsldoi $t0,$Xm,$zero,8 - vsldoi $t1,$zero,$Xm,8 - vxor $Xl,$Xl,$t0 - vxor $Xh,$Xh,$t1 - - vsldoi $Xl,$Xl,$Xl,8 - vxor $Xl,$Xl,$t2 - - vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase - vpmsumd $Xl,$Xl,$xC2 - vxor $t1,$t1,$Xh - vxor $IN1,$Xl,$t1 - - vsldoi $H2,$IN1,$IN1,8 - vsldoi $H2l,$zero,$H2,8 - vsldoi $H2h,$H2,$zero,8 - - stvx_u $H2l,r8,r3 # save H^2 - li r8,0x70 - stvx_u $H2,r9,r3 - li r9,0x80 - stvx_u $H2h,r10,r3 - li r10,0x90 - - vpmsumd $Xl,$IN,$H2l # H.lo·H^2.lo - vpmsumd $Xl1,$IN1,$H2l # H^2.lo·H^2.lo - vpmsumd $Xm,$IN,$H2 # H.hi·H^2.lo+H.lo·H^2.hi - vpmsumd $Xm1,$IN1,$H2 # H^2.hi·H^2.lo+H^2.lo·H^2.hi - vpmsumd $Xh,$IN,$H2h # H.hi·H^2.hi - vpmsumd $Xh1,$IN1,$H2h # H^2.hi·H^2.hi - - vpmsumd $t2,$Xl,$xC2 # 1st reduction phase - vpmsumd $t6,$Xl1,$xC2 # 1st reduction phase - - vsldoi $t0,$Xm,$zero,8 - vsldoi $t1,$zero,$Xm,8 - vsldoi $t4,$Xm1,$zero,8 - vsldoi $t5,$zero,$Xm1,8 - vxor $Xl,$Xl,$t0 - vxor $Xh,$Xh,$t1 - vxor $Xl1,$Xl1,$t4 - vxor $Xh1,$Xh1,$t5 - - vsldoi $Xl,$Xl,$Xl,8 - vsldoi $Xl1,$Xl1,$Xl1,8 - vxor $Xl,$Xl,$t2 - vxor $Xl1,$Xl1,$t6 - - vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase - vsldoi $t5,$Xl1,$Xl1,8 # 2nd reduction phase - vpmsumd $Xl,$Xl,$xC2 - vpmsumd $Xl1,$Xl1,$xC2 - vxor $t1,$t1,$Xh - vxor $t5,$t5,$Xh1 - vxor $Xl,$Xl,$t1 - vxor $Xl1,$Xl1,$t5 - - vsldoi $H,$Xl,$Xl,8 - vsldoi $H2,$Xl1,$Xl1,8 - vsldoi $Hl,$zero,$H,8 - vsldoi $Hh,$H,$zero,8 - vsldoi $H2l,$zero,$H2,8 - vsldoi $H2h,$H2,$zero,8 - - stvx_u $Hl,r8,r3 # save H^3 - li r8,0xa0 - stvx_u $H,r9,r3 - li r9,0xb0 - stvx_u $Hh,r10,r3 - li r10,0xc0 - stvx_u $H2l,r8,r3 # save H^4 - stvx_u $H2,r9,r3 - stvx_u $H2h,r10,r3 - - mtspr 256,$vrsave - blr - .long 0 - .byte 0,12,0x14,0,0,0,2,0 - .long 0 -.size .gcm_init_htable,.-.gcm_init_htable - -.globl .gcm_gmult_p8 - lis r0,0xfff8 - li r8,0x10 - mfspr $vrsave,256 - li r9,0x20 - mtspr 256,r0 - li r10,0x30 - lvx_u $IN,0,$Xip # load Xi - - lvx_u $Hl,r8,$Htbl # load pre-computed table - le?lvsl $lemask,r0,r0 - lvx_u $H, r9,$Htbl - le?vspltisb $t0,0x07 - lvx_u $Hh,r10,$Htbl - le?vxor $lemask,$lemask,$t0 - lvx_u $xC2,0,$Htbl - le?vperm $IN,$IN,$IN,$lemask - vxor $zero,$zero,$zero - - vpmsumd $Xl,$IN,$Hl # H.lo·Xi.lo - vpmsumd $Xm,$IN,$H # H.hi·Xi.lo+H.lo·Xi.hi - vpmsumd $Xh,$IN,$Hh # H.hi·Xi.hi - - vpmsumd $t2,$Xl,$xC2 # 1st phase - - vsldoi $t0,$Xm,$zero,8 - vsldoi $t1,$zero,$Xm,8 - vxor $Xl,$Xl,$t0 - vxor $Xh,$Xh,$t1 - - vsldoi $Xl,$Xl,$Xl,8 - vxor $Xl,$Xl,$t2 - - vsldoi $t1,$Xl,$Xl,8 # 2nd phase - vpmsumd $Xl,$Xl,$xC2 - vxor $t1,$t1,$Xh - vxor $Xl,$Xl,$t1 - - le?vperm $Xl,$Xl,$Xl,$lemask - stvx_u $Xl,0,$Xip # write out Xi - - mtspr 256,$vrsave - blr - .long 0 - .byte 0,12,0x14,0,0,0,2,0 - .long 0 -.size .gcm_gmult_p8,.-.gcm_gmult_p8 - -.globl .gcm_ghash_p8 - lis r0,0xfff8 - li r8,0x10 - mfspr $vrsave,256 - li r9,0x20 - mtspr 256,r0 - li r10,0x30 - lvx_u $Xl,0,$Xip # load Xi - - lvx_u $Hl,r8,$Htbl # load pre-computed table - le?lvsl $lemask,r0,r0 - lvx_u $H, r9,$Htbl - le?vspltisb $t0,0x07 - lvx_u $Hh,r10,$Htbl - le?vxor $lemask,$lemask,$t0 - lvx_u $xC2,0,$Htbl - le?vperm $Xl,$Xl,$Xl,$lemask - vxor $zero,$zero,$zero - - lvx_u $IN,0,$inp - addi $inp,$inp,16 - subi $len,$len,16 - le?vperm $IN,$IN,$IN,$lemask - vxor $IN,$IN,$Xl - b Loop - -.align 5 -Loop: - subic $len,$len,16 - vpmsumd $Xl,$IN,$Hl # H.lo·Xi.lo - subfe. r0,r0,r0 # borrow?-1:0 - vpmsumd $Xm,$IN,$H # H.hi·Xi.lo+H.lo·Xi.hi - and r0,r0,$len - vpmsumd $Xh,$IN,$Hh # H.hi·Xi.hi - add $inp,$inp,r0 - - vpmsumd $t2,$Xl,$xC2 # 1st phase - - vsldoi $t0,$Xm,$zero,8 - vsldoi $t1,$zero,$Xm,8 - vxor $Xl,$Xl,$t0 - vxor $Xh,$Xh,$t1 - - vsldoi $Xl,$Xl,$Xl,8 - vxor $Xl,$Xl,$t2 - lvx_u $IN,0,$inp - addi $inp,$inp,16 - - vsldoi $t1,$Xl,$Xl,8 # 2nd phase - vpmsumd $Xl,$Xl,$xC2 - le?vperm $IN,$IN,$IN,$lemask - vxor $t1,$t1,$Xh - vxor $IN,$IN,$t1 - vxor $IN,$IN,$Xl - beq Loop # did $len-=16 borrow? - - vxor $Xl,$Xl,$t1 - le?vperm $Xl,$Xl,$Xl,$lemask - stvx_u $Xl,0,$Xip # write out Xi - - mtspr 256,$vrsave - blr - .long 0 - .byte 0,12,0x14,0,0,0,4,0 - .long 0 -.size .gcm_ghash_p8,.-.gcm_ghash_p8 - -.asciz "GHASH for PowerISA 2.07, CRYPTOGAMS by " -.align 2 -___ - -foreach (split("\n",$code)) { - if ($flavour =~ /le$/o) { # little-endian - s/le\?//o or - s/be\?/#be#/o; - } else { - s/le\?/#le#/o or - s/be\?//o; - } - print $_,"\n"; -} - -close STDOUT; # enforce flush diff --git a/arch/powerpc/crypto/p10-aes-gcm-glue.c b/arch/powerpc/crypto/p10-aes-gcm-glue.c deleted file mode 100644 index 777e6b5254da..000000000000 --- a/arch/powerpc/crypto/p10-aes-gcm-glue.c +++ /dev/null @@ -1,345 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Glue code for accelerated AES-GCM stitched implementation for ppc64le. - * - * Copyright 2022- IBM Inc. All rights reserved - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define PPC_MODULE_FEATURE_P10 (32 + ilog2(PPC_FEATURE2_ARCH_3_1)) -#define PPC_ALIGN 16 -#define GCM_IV_SIZE 12 - -MODULE_DESCRIPTION("PPC64le AES-GCM with Stitched implementation"); -MODULE_AUTHOR("Danny Tsen aadLen = alen; - i = alen & ~0xf; - if (i) { - gcm_ghash_p8(nXi, hash->Htable+32, aad, i); - aad += i; - alen -= i; - } - if (alen) { - for (i = 0; i < alen; i++) - nXi[i] ^= aad[i]; - - memset(gctx->aad_hash, 0, 16); - gcm_ghash_p8(gctx->aad_hash, hash->Htable+32, nXi, 16); - } else { - memcpy(gctx->aad_hash, nXi, 16); - } - - memcpy(hash->Htable, gctx->aad_hash, 16); -} - -static void gcmp10_init(struct gcm_ctx *gctx, u8 *iv, unsigned char *rdkey, - struct Hash_ctx *hash, u8 *assoc, unsigned int assoclen) -{ - __be32 counter = cpu_to_be32(1); - - aes_p8_encrypt(hash->H, hash->H, rdkey); - set_subkey(hash->H); - gcm_init_htable(hash->Htable+32, hash->H); - - *((__be32 *)(iv+12)) = counter; - - gctx->Plen = 0; - - /* - * Encrypt counter vector as iv tag and increment counter. - */ - aes_p8_encrypt(iv, gctx->ivtag, rdkey); - - counter = cpu_to_be32(2); - *((__be32 *)(iv+12)) = counter; - memcpy(gctx->iv, iv, 16); - - gctx->aadLen = assoclen; - memset(gctx->aad_hash, 0, 16); - if (assoclen) - set_aad(gctx, hash, assoc, assoclen); -} - -static void finish_tag(struct gcm_ctx *gctx, struct Hash_ctx *hash, int len) -{ - int i; - unsigned char len_ac[16 + PPC_ALIGN]; - unsigned char *aclen = PTR_ALIGN((void *)len_ac, PPC_ALIGN); - __be64 clen = cpu_to_be64(len << 3); - __be64 alen = cpu_to_be64(gctx->aadLen << 3); - - if (len == 0 && gctx->aadLen == 0) { - memcpy(hash->Htable, gctx->ivtag, 16); - return; - } - - /* - * Len is in bits. - */ - *((__be64 *)(aclen)) = alen; - *((__be64 *)(aclen+8)) = clen; - - /* - * hash (AAD len and len) - */ - gcm_ghash_p8(hash->Htable, hash->Htable+32, aclen, 16); - - for (i = 0; i < 16; i++) - hash->Htable[i] ^= gctx->ivtag[i]; -} - -static int set_authsize(struct crypto_aead *tfm, unsigned int authsize) -{ - switch (authsize) { - case 4: - case 8: - case 12: - case 13: - case 14: - case 15: - case 16: - break; - default: - return -EINVAL; - } - - return 0; -} - -static int p10_aes_gcm_setkey(struct crypto_aead *aead, const u8 *key, - unsigned int keylen) -{ - struct crypto_tfm *tfm = crypto_aead_tfm(aead); - struct p10_aes_gcm_ctx *ctx = crypto_tfm_ctx(tfm); - int ret; - - vsx_begin(); - ret = aes_p8_set_encrypt_key(key, keylen * 8, &ctx->enc_key); - vsx_end(); - - return ret ? -EINVAL : 0; -} - -static int p10_aes_gcm_crypt(struct aead_request *req, int enc) -{ - struct crypto_tfm *tfm = req->base.tfm; - struct p10_aes_gcm_ctx *ctx = crypto_tfm_ctx(tfm); - u8 databuf[sizeof(struct gcm_ctx) + PPC_ALIGN]; - struct gcm_ctx *gctx = PTR_ALIGN((void *)databuf, PPC_ALIGN); - u8 hashbuf[sizeof(struct Hash_ctx) + PPC_ALIGN]; - struct Hash_ctx *hash = PTR_ALIGN((void *)hashbuf, PPC_ALIGN); - struct scatter_walk assoc_sg_walk; - struct skcipher_walk walk; - u8 *assocmem = NULL; - u8 *assoc; - unsigned int assoclen = req->assoclen; - unsigned int cryptlen = req->cryptlen; - unsigned char ivbuf[AES_BLOCK_SIZE+PPC_ALIGN]; - unsigned char *iv = PTR_ALIGN((void *)ivbuf, PPC_ALIGN); - int ret; - unsigned long auth_tag_len = crypto_aead_authsize(__crypto_aead_cast(tfm)); - u8 otag[16]; - int total_processed = 0; - - memset(databuf, 0, sizeof(databuf)); - memset(hashbuf, 0, sizeof(hashbuf)); - memset(ivbuf, 0, sizeof(ivbuf)); - memcpy(iv, req->iv, GCM_IV_SIZE); - - /* Linearize assoc, if not already linear */ - if (req->src->length >= assoclen && req->src->length) { - scatterwalk_start(&assoc_sg_walk, req->src); - assoc = scatterwalk_map(&assoc_sg_walk); - } else { - gfp_t flags = (req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP) ? - GFP_KERNEL : GFP_ATOMIC; - - /* assoc can be any length, so must be on heap */ - assocmem = kmalloc(assoclen, flags); - if (unlikely(!assocmem)) - return -ENOMEM; - assoc = assocmem; - - scatterwalk_map_and_copy(assoc, req->src, 0, assoclen, 0); - } - - vsx_begin(); - gcmp10_init(gctx, iv, (unsigned char *) &ctx->enc_key, hash, assoc, assoclen); - vsx_end(); - - if (!assocmem) - scatterwalk_unmap(assoc); - else - kfree(assocmem); - - if (enc) - ret = skcipher_walk_aead_encrypt(&walk, req, false); - else - ret = skcipher_walk_aead_decrypt(&walk, req, false); - if (ret) - return ret; - - while (walk.nbytes > 0 && ret == 0) { - - vsx_begin(); - if (enc) - aes_p10_gcm_encrypt(walk.src.virt.addr, - walk.dst.virt.addr, - walk.nbytes, - &ctx->enc_key, gctx->iv, hash->Htable); - else - aes_p10_gcm_decrypt(walk.src.virt.addr, - walk.dst.virt.addr, - walk.nbytes, - &ctx->enc_key, gctx->iv, hash->Htable); - vsx_end(); - - total_processed += walk.nbytes; - ret = skcipher_walk_done(&walk, 0); - } - - if (ret) - return ret; - - /* Finalize hash */ - vsx_begin(); - finish_tag(gctx, hash, total_processed); - vsx_end(); - - /* copy Xi to end of dst */ - if (enc) - scatterwalk_map_and_copy(hash->Htable, req->dst, req->assoclen + cryptlen, - auth_tag_len, 1); - else { - scatterwalk_map_and_copy(otag, req->src, - req->assoclen + cryptlen - auth_tag_len, - auth_tag_len, 0); - - if (crypto_memneq(otag, hash->Htable, auth_tag_len)) { - memzero_explicit(hash->Htable, 16); - return -EBADMSG; - } - } - - return 0; -} - -static int p10_aes_gcm_encrypt(struct aead_request *req) -{ - return p10_aes_gcm_crypt(req, 1); -} - -static int p10_aes_gcm_decrypt(struct aead_request *req) -{ - return p10_aes_gcm_crypt(req, 0); -} - -static struct aead_alg gcm_aes_alg = { - .ivsize = GCM_IV_SIZE, - .maxauthsize = 16, - - .setauthsize = set_authsize, - .setkey = p10_aes_gcm_setkey, - .encrypt = p10_aes_gcm_encrypt, - .decrypt = p10_aes_gcm_decrypt, - - .base.cra_name = "gcm(aes)", - .base.cra_driver_name = "p10_aes_gcm", - .base.cra_priority = 2100, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct p10_aes_gcm_ctx), - .base.cra_module = THIS_MODULE, -}; - -static int __init p10_init(void) -{ - return crypto_register_aead(&gcm_aes_alg); -} - -static void __exit p10_exit(void) -{ - crypto_unregister_aead(&gcm_aes_alg); -} - -module_cpu_feature_match(PPC_MODULE_FEATURE_P10, p10_init); -module_exit(p10_exit); diff --git a/arch/powerpc/crypto/p10_aes_gcm.S b/arch/powerpc/crypto/p10_aes_gcm.S deleted file mode 100644 index 2306ad7c5e36..000000000000 --- a/arch/powerpc/crypto/p10_aes_gcm.S +++ /dev/null @@ -1,1519 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ - # - # Accelerated AES-GCM stitched implementation for ppc64le. - # - # Copyright 2022- IBM Inc. All rights reserved - # - #=================================================================================== - # Written by Danny Tsen - # - # GHASH is based on the Karatsuba multiplication method. - # - # Xi xor X1 - # - # X1 * H^4 + X2 * H^3 + x3 * H^2 + X4 * H = - # (X1.h * H4.h + xX.l * H4.l + X1 * H4) + - # (X2.h * H3.h + X2.l * H3.l + X2 * H3) + - # (X3.h * H2.h + X3.l * H2.l + X3 * H2) + - # (X4.h * H.h + X4.l * H.l + X4 * H) - # - # Xi = v0 - # H Poly = v2 - # Hash keys = v3 - v14 - # ( H.l, H, H.h) - # ( H^2.l, H^2, H^2.h) - # ( H^3.l, H^3, H^3.h) - # ( H^4.l, H^4, H^4.h) - # - # v30 is IV - # v31 - counter 1 - # - # AES used, - # vs0 - vs14 for round keys - # v15, v16, v17, v18, v19, v20, v21, v22 for 8 blocks (encrypted) - # - # This implementation uses stitched AES-GCM approach to improve overall performance. - # AES is implemented with 8x blocks and GHASH is using 2 4x blocks. - # - # =================================================================================== - # - -.machine "any" -.abiversion 1 -.text - - # 4x loops - # v15 - v18 - input states - # vs1 - vs9 - round keys - # -.macro Loop_aes_middle4x - xxlor 19+32, 1, 1 - xxlor 20+32, 2, 2 - xxlor 21+32, 3, 3 - xxlor 22+32, 4, 4 - - vcipher 15, 15, 19 - vcipher 16, 16, 19 - vcipher 17, 17, 19 - vcipher 18, 18, 19 - - vcipher 15, 15, 20 - vcipher 16, 16, 20 - vcipher 17, 17, 20 - vcipher 18, 18, 20 - - vcipher 15, 15, 21 - vcipher 16, 16, 21 - vcipher 17, 17, 21 - vcipher 18, 18, 21 - - vcipher 15, 15, 22 - vcipher 16, 16, 22 - vcipher 17, 17, 22 - vcipher 18, 18, 22 - - xxlor 19+32, 5, 5 - xxlor 20+32, 6, 6 - xxlor 21+32, 7, 7 - xxlor 22+32, 8, 8 - - vcipher 15, 15, 19 - vcipher 16, 16, 19 - vcipher 17, 17, 19 - vcipher 18, 18, 19 - - vcipher 15, 15, 20 - vcipher 16, 16, 20 - vcipher 17, 17, 20 - vcipher 18, 18, 20 - - vcipher 15, 15, 21 - vcipher 16, 16, 21 - vcipher 17, 17, 21 - vcipher 18, 18, 21 - - vcipher 15, 15, 22 - vcipher 16, 16, 22 - vcipher 17, 17, 22 - vcipher 18, 18, 22 - - xxlor 23+32, 9, 9 - vcipher 15, 15, 23 - vcipher 16, 16, 23 - vcipher 17, 17, 23 - vcipher 18, 18, 23 -.endm - - # 8x loops - # v15 - v22 - input states - # vs1 - vs9 - round keys - # -.macro Loop_aes_middle8x - xxlor 23+32, 1, 1 - xxlor 24+32, 2, 2 - xxlor 25+32, 3, 3 - xxlor 26+32, 4, 4 - - vcipher 15, 15, 23 - vcipher 16, 16, 23 - vcipher 17, 17, 23 - vcipher 18, 18, 23 - vcipher 19, 19, 23 - vcipher 20, 20, 23 - vcipher 21, 21, 23 - vcipher 22, 22, 23 - - vcipher 15, 15, 24 - vcipher 16, 16, 24 - vcipher 17, 17, 24 - vcipher 18, 18, 24 - vcipher 19, 19, 24 - vcipher 20, 20, 24 - vcipher 21, 21, 24 - vcipher 22, 22, 24 - - vcipher 15, 15, 25 - vcipher 16, 16, 25 - vcipher 17, 17, 25 - vcipher 18, 18, 25 - vcipher 19, 19, 25 - vcipher 20, 20, 25 - vcipher 21, 21, 25 - vcipher 22, 22, 25 - - vcipher 15, 15, 26 - vcipher 16, 16, 26 - vcipher 17, 17, 26 - vcipher 18, 18, 26 - vcipher 19, 19, 26 - vcipher 20, 20, 26 - vcipher 21, 21, 26 - vcipher 22, 22, 26 - - xxlor 23+32, 5, 5 - xxlor 24+32, 6, 6 - xxlor 25+32, 7, 7 - xxlor 26+32, 8, 8 - - vcipher 15, 15, 23 - vcipher 16, 16, 23 - vcipher 17, 17, 23 - vcipher 18, 18, 23 - vcipher 19, 19, 23 - vcipher 20, 20, 23 - vcipher 21, 21, 23 - vcipher 22, 22, 23 - - vcipher 15, 15, 24 - vcipher 16, 16, 24 - vcipher 17, 17, 24 - vcipher 18, 18, 24 - vcipher 19, 19, 24 - vcipher 20, 20, 24 - vcipher 21, 21, 24 - vcipher 22, 22, 24 - - vcipher 15, 15, 25 - vcipher 16, 16, 25 - vcipher 17, 17, 25 - vcipher 18, 18, 25 - vcipher 19, 19, 25 - vcipher 20, 20, 25 - vcipher 21, 21, 25 - vcipher 22, 22, 25 - - vcipher 15, 15, 26 - vcipher 16, 16, 26 - vcipher 17, 17, 26 - vcipher 18, 18, 26 - vcipher 19, 19, 26 - vcipher 20, 20, 26 - vcipher 21, 21, 26 - vcipher 22, 22, 26 - - xxlor 23+32, 9, 9 - vcipher 15, 15, 23 - vcipher 16, 16, 23 - vcipher 17, 17, 23 - vcipher 18, 18, 23 - vcipher 19, 19, 23 - vcipher 20, 20, 23 - vcipher 21, 21, 23 - vcipher 22, 22, 23 -.endm - -.macro Loop_aes_middle_1x - xxlor 19+32, 1, 1 - xxlor 20+32, 2, 2 - xxlor 21+32, 3, 3 - xxlor 22+32, 4, 4 - - vcipher 15, 15, 19 - vcipher 15, 15, 20 - vcipher 15, 15, 21 - vcipher 15, 15, 22 - - xxlor 19+32, 5, 5 - xxlor 20+32, 6, 6 - xxlor 21+32, 7, 7 - xxlor 22+32, 8, 8 - - vcipher 15, 15, 19 - vcipher 15, 15, 20 - vcipher 15, 15, 21 - vcipher 15, 15, 22 - - xxlor 19+32, 9, 9 - vcipher 15, 15, 19 -.endm - - # - # Compute 4x hash values based on Karatsuba method. - # -ppc_aes_gcm_ghash: - vxor 15, 15, 0 - - vpmsumd 23, 12, 15 # H4.L * X.L - vpmsumd 24, 9, 16 - vpmsumd 25, 6, 17 - vpmsumd 26, 3, 18 - - vxor 23, 23, 24 - vxor 23, 23, 25 - vxor 23, 23, 26 # L - - vpmsumd 24, 13, 15 # H4.L * X.H + H4.H * X.L - vpmsumd 25, 10, 16 # H3.L * X1.H + H3.H * X1.L - vpmsumd 26, 7, 17 - vpmsumd 27, 4, 18 - - vxor 24, 24, 25 - vxor 24, 24, 26 - vxor 24, 24, 27 # M - - # sum hash and reduction with H Poly - vpmsumd 28, 23, 2 # reduction - - vxor 29, 29, 29 - vsldoi 26, 24, 29, 8 # mL - vsldoi 29, 29, 24, 8 # mH - vxor 23, 23, 26 # mL + L - - vsldoi 23, 23, 23, 8 # swap - vxor 23, 23, 28 - - vpmsumd 24, 14, 15 # H4.H * X.H - vpmsumd 25, 11, 16 - vpmsumd 26, 8, 17 - vpmsumd 27, 5, 18 - - vxor 24, 24, 25 - vxor 24, 24, 26 - vxor 24, 24, 27 - - vxor 24, 24, 29 - - # sum hash and reduction with H Poly - vsldoi 27, 23, 23, 8 # swap - vpmsumd 23, 23, 2 - vxor 27, 27, 24 - vxor 23, 23, 27 - - xxlor 32, 23+32, 23+32 # update hash - - blr - - # - # Combine two 4x ghash - # v15 - v22 - input blocks - # -.macro ppc_aes_gcm_ghash2_4x - # first 4x hash - vxor 15, 15, 0 # Xi + X - - vpmsumd 23, 12, 15 # H4.L * X.L - vpmsumd 24, 9, 16 - vpmsumd 25, 6, 17 - vpmsumd 26, 3, 18 - - vxor 23, 23, 24 - vxor 23, 23, 25 - vxor 23, 23, 26 # L - - vpmsumd 24, 13, 15 # H4.L * X.H + H4.H * X.L - vpmsumd 25, 10, 16 # H3.L * X1.H + H3.H * X1.L - vpmsumd 26, 7, 17 - vpmsumd 27, 4, 18 - - vxor 24, 24, 25 - vxor 24, 24, 26 - - # sum hash and reduction with H Poly - vpmsumd 28, 23, 2 # reduction - - vxor 29, 29, 29 - - vxor 24, 24, 27 # M - vsldoi 26, 24, 29, 8 # mL - vsldoi 29, 29, 24, 8 # mH - vxor 23, 23, 26 # mL + L - - vsldoi 23, 23, 23, 8 # swap - vxor 23, 23, 28 - - vpmsumd 24, 14, 15 # H4.H * X.H - vpmsumd 25, 11, 16 - vpmsumd 26, 8, 17 - vpmsumd 27, 5, 18 - - vxor 24, 24, 25 - vxor 24, 24, 26 - vxor 24, 24, 27 # H - - vxor 24, 24, 29 # H + mH - - # sum hash and reduction with H Poly - vsldoi 27, 23, 23, 8 # swap - vpmsumd 23, 23, 2 - vxor 27, 27, 24 - vxor 27, 23, 27 # 1st Xi - - # 2nd 4x hash - vpmsumd 24, 9, 20 - vpmsumd 25, 6, 21 - vpmsumd 26, 3, 22 - vxor 19, 19, 27 # Xi + X - vpmsumd 23, 12, 19 # H4.L * X.L - - vxor 23, 23, 24 - vxor 23, 23, 25 - vxor 23, 23, 26 # L - - vpmsumd 24, 13, 19 # H4.L * X.H + H4.H * X.L - vpmsumd 25, 10, 20 # H3.L * X1.H + H3.H * X1.L - vpmsumd 26, 7, 21 - vpmsumd 27, 4, 22 - - vxor 24, 24, 25 - vxor 24, 24, 26 - - # sum hash and reduction with H Poly - vpmsumd 28, 23, 2 # reduction - - vxor 29, 29, 29 - - vxor 24, 24, 27 # M - vsldoi 26, 24, 29, 8 # mL - vsldoi 29, 29, 24, 8 # mH - vxor 23, 23, 26 # mL + L - - vsldoi 23, 23, 23, 8 # swap - vxor 23, 23, 28 - - vpmsumd 24, 14, 19 # H4.H * X.H - vpmsumd 25, 11, 20 - vpmsumd 26, 8, 21 - vpmsumd 27, 5, 22 - - vxor 24, 24, 25 - vxor 24, 24, 26 - vxor 24, 24, 27 # H - - vxor 24, 24, 29 # H + mH - - # sum hash and reduction with H Poly - vsldoi 27, 23, 23, 8 # swap - vpmsumd 23, 23, 2 - vxor 27, 27, 24 - vxor 23, 23, 27 - - xxlor 32, 23+32, 23+32 # update hash - -.endm - - # - # Compute update single hash - # -.macro ppc_update_hash_1x - vxor 28, 28, 0 - - vxor 19, 19, 19 - - vpmsumd 22, 3, 28 # L - vpmsumd 23, 4, 28 # M - vpmsumd 24, 5, 28 # H - - vpmsumd 27, 22, 2 # reduction - - vsldoi 25, 23, 19, 8 # mL - vsldoi 26, 19, 23, 8 # mH - vxor 22, 22, 25 # LL + LL - vxor 24, 24, 26 # HH + HH - - vsldoi 22, 22, 22, 8 # swap - vxor 22, 22, 27 - - vsldoi 20, 22, 22, 8 # swap - vpmsumd 22, 22, 2 # reduction - vxor 20, 20, 24 - vxor 22, 22, 20 - - vmr 0, 22 # update hash - -.endm - -.macro SAVE_REGS - stdu 1,-640(1) - mflr 0 - - std 14,112(1) - std 15,120(1) - std 16,128(1) - std 17,136(1) - std 18,144(1) - std 19,152(1) - std 20,160(1) - std 21,168(1) - li 9, 256 - stvx 20, 9, 1 - addi 9, 9, 16 - stvx 21, 9, 1 - addi 9, 9, 16 - stvx 22, 9, 1 - addi 9, 9, 16 - stvx 23, 9, 1 - addi 9, 9, 16 - stvx 24, 9, 1 - addi 9, 9, 16 - stvx 25, 9, 1 - addi 9, 9, 16 - stvx 26, 9, 1 - addi 9, 9, 16 - stvx 27, 9, 1 - addi 9, 9, 16 - stvx 28, 9, 1 - addi 9, 9, 16 - stvx 29, 9, 1 - addi 9, 9, 16 - stvx 30, 9, 1 - addi 9, 9, 16 - stvx 31, 9, 1 - stxv 14, 464(1) - stxv 15, 480(1) - stxv 16, 496(1) - stxv 17, 512(1) - stxv 18, 528(1) - stxv 19, 544(1) - stxv 20, 560(1) - stxv 21, 576(1) - stxv 22, 592(1) - std 0, 656(1) -.endm - -.macro RESTORE_REGS - lxv 14, 464(1) - lxv 15, 480(1) - lxv 16, 496(1) - lxv 17, 512(1) - lxv 18, 528(1) - lxv 19, 544(1) - lxv 20, 560(1) - lxv 21, 576(1) - lxv 22, 592(1) - li 9, 256 - lvx 20, 9, 1 - addi 9, 9, 16 - lvx 21, 9, 1 - addi 9, 9, 16 - lvx 22, 9, 1 - addi 9, 9, 16 - lvx 23, 9, 1 - addi 9, 9, 16 - lvx 24, 9, 1 - addi 9, 9, 16 - lvx 25, 9, 1 - addi 9, 9, 16 - lvx 26, 9, 1 - addi 9, 9, 16 - lvx 27, 9, 1 - addi 9, 9, 16 - lvx 28, 9, 1 - addi 9, 9, 16 - lvx 29, 9, 1 - addi 9, 9, 16 - lvx 30, 9, 1 - addi 9, 9, 16 - lvx 31, 9, 1 - - ld 0, 656(1) - ld 14,112(1) - ld 15,120(1) - ld 16,128(1) - ld 17,136(1) - ld 18,144(1) - ld 19,152(1) - ld 20,160(1) - ld 21,168(1) - - mtlr 0 - addi 1, 1, 640 -.endm - -.macro LOAD_HASH_TABLE - # Load Xi - lxvb16x 32, 0, 8 # load Xi - - # load Hash - h^4, h^3, h^2, h - li 10, 32 - lxvd2x 2+32, 10, 8 # H Poli - li 10, 48 - lxvd2x 3+32, 10, 8 # Hl - li 10, 64 - lxvd2x 4+32, 10, 8 # H - li 10, 80 - lxvd2x 5+32, 10, 8 # Hh - - li 10, 96 - lxvd2x 6+32, 10, 8 # H^2l - li 10, 112 - lxvd2x 7+32, 10, 8 # H^2 - li 10, 128 - lxvd2x 8+32, 10, 8 # H^2h - - li 10, 144 - lxvd2x 9+32, 10, 8 # H^3l - li 10, 160 - lxvd2x 10+32, 10, 8 # H^3 - li 10, 176 - lxvd2x 11+32, 10, 8 # H^3h - - li 10, 192 - lxvd2x 12+32, 10, 8 # H^4l - li 10, 208 - lxvd2x 13+32, 10, 8 # H^4 - li 10, 224 - lxvd2x 14+32, 10, 8 # H^4h -.endm - - # - # aes_p10_gcm_encrypt (const void *inp, void *out, size_t len, - # const char *rk, unsigned char iv[16], void *Xip); - # - # r3 - inp - # r4 - out - # r5 - len - # r6 - AES round keys - # r7 - iv and other data - # r8 - Xi, HPoli, hash keys - # - # rounds is at offset 240 in rk - # Xi is at 0 in gcm_table (Xip). - # -.global aes_p10_gcm_encrypt -.align 5 -aes_p10_gcm_encrypt: - - SAVE_REGS - - LOAD_HASH_TABLE - - # initialize ICB: GHASH( IV ), IV - r7 - lxvb16x 30+32, 0, 7 # load IV - v30 - - mr 12, 5 # length - li 11, 0 # block index - - # counter 1 - vxor 31, 31, 31 - vspltisb 22, 1 - vsldoi 31, 31, 22,1 # counter 1 - - # load round key to VSR - lxv 0, 0(6) - lxv 1, 0x10(6) - lxv 2, 0x20(6) - lxv 3, 0x30(6) - lxv 4, 0x40(6) - lxv 5, 0x50(6) - lxv 6, 0x60(6) - lxv 7, 0x70(6) - lxv 8, 0x80(6) - lxv 9, 0x90(6) - lxv 10, 0xa0(6) - - # load rounds - 10 (128), 12 (192), 14 (256) - lwz 9,240(6) - - # - # vxor state, state, w # addroundkey - xxlor 32+29, 0, 0 - vxor 15, 30, 29 # IV + round key - add round key 0 - - cmpdi 9, 10 - beq Loop_aes_gcm_8x - - # load 2 more round keys (v11, v12) - lxv 11, 0xb0(6) - lxv 12, 0xc0(6) - - cmpdi 9, 12 - beq Loop_aes_gcm_8x - - # load 2 more round keys (v11, v12, v13, v14) - lxv 13, 0xd0(6) - lxv 14, 0xe0(6) - cmpdi 9, 14 - beq Loop_aes_gcm_8x - - b aes_gcm_out - -.align 5 -Loop_aes_gcm_8x: - mr 14, 3 - mr 9, 4 - - # - # check partial block - # -Continue_partial_check: - ld 15, 56(7) - cmpdi 15, 0 - beq Continue - bgt Final_block - cmpdi 15, 16 - blt Final_block - -Continue: - # n blcoks - li 10, 128 - divdu 10, 12, 10 # n 128 bytes-blocks - cmpdi 10, 0 - beq Loop_last_block - - vaddudm 30, 30, 31 # IV + counter - vxor 16, 30, 29 - vaddudm 30, 30, 31 - vxor 17, 30, 29 - vaddudm 30, 30, 31 - vxor 18, 30, 29 - vaddudm 30, 30, 31 - vxor 19, 30, 29 - vaddudm 30, 30, 31 - vxor 20, 30, 29 - vaddudm 30, 30, 31 - vxor 21, 30, 29 - vaddudm 30, 30, 31 - vxor 22, 30, 29 - - mtctr 10 - - li 15, 16 - li 16, 32 - li 17, 48 - li 18, 64 - li 19, 80 - li 20, 96 - li 21, 112 - - lwz 10, 240(6) - -Loop_8x_block: - - lxvb16x 15, 0, 14 # load block - lxvb16x 16, 15, 14 # load block - lxvb16x 17, 16, 14 # load block - lxvb16x 18, 17, 14 # load block - lxvb16x 19, 18, 14 # load block - lxvb16x 20, 19, 14 # load block - lxvb16x 21, 20, 14 # load block - lxvb16x 22, 21, 14 # load block - addi 14, 14, 128 - - Loop_aes_middle8x - - xxlor 23+32, 10, 10 - - cmpdi 10, 10 - beq Do_next_ghash - - # 192 bits - xxlor 24+32, 11, 11 - - vcipher 15, 15, 23 - vcipher 16, 16, 23 - vcipher 17, 17, 23 - vcipher 18, 18, 23 - vcipher 19, 19, 23 - vcipher 20, 20, 23 - vcipher 21, 21, 23 - vcipher 22, 22, 23 - - vcipher 15, 15, 24 - vcipher 16, 16, 24 - vcipher 17, 17, 24 - vcipher 18, 18, 24 - vcipher 19, 19, 24 - vcipher 20, 20, 24 - vcipher 21, 21, 24 - vcipher 22, 22, 24 - - xxlor 23+32, 12, 12 - - cmpdi 10, 12 - beq Do_next_ghash - - # 256 bits - xxlor 24+32, 13, 13 - - vcipher 15, 15, 23 - vcipher 16, 16, 23 - vcipher 17, 17, 23 - vcipher 18, 18, 23 - vcipher 19, 19, 23 - vcipher 20, 20, 23 - vcipher 21, 21, 23 - vcipher 22, 22, 23 - - vcipher 15, 15, 24 - vcipher 16, 16, 24 - vcipher 17, 17, 24 - vcipher 18, 18, 24 - vcipher 19, 19, 24 - vcipher 20, 20, 24 - vcipher 21, 21, 24 - vcipher 22, 22, 24 - - xxlor 23+32, 14, 14 - - cmpdi 10, 14 - beq Do_next_ghash - b aes_gcm_out - -Do_next_ghash: - - # - # last round - vcipherlast 15, 15, 23 - vcipherlast 16, 16, 23 - - xxlxor 47, 47, 15 - stxvb16x 47, 0, 9 # store output - xxlxor 48, 48, 16 - stxvb16x 48, 15, 9 # store output - - vcipherlast 17, 17, 23 - vcipherlast 18, 18, 23 - - xxlxor 49, 49, 17 - stxvb16x 49, 16, 9 # store output - xxlxor 50, 50, 18 - stxvb16x 50, 17, 9 # store output - - vcipherlast 19, 19, 23 - vcipherlast 20, 20, 23 - - xxlxor 51, 51, 19 - stxvb16x 51, 18, 9 # store output - xxlxor 52, 52, 20 - stxvb16x 52, 19, 9 # store output - - vcipherlast 21, 21, 23 - vcipherlast 22, 22, 23 - - xxlxor 53, 53, 21 - stxvb16x 53, 20, 9 # store output - xxlxor 54, 54, 22 - stxvb16x 54, 21, 9 # store output - - addi 9, 9, 128 - - # ghash here - ppc_aes_gcm_ghash2_4x - - xxlor 27+32, 0, 0 - vaddudm 30, 30, 31 # IV + counter - vmr 29, 30 - vxor 15, 30, 27 # add round key - vaddudm 30, 30, 31 - vxor 16, 30, 27 - vaddudm 30, 30, 31 - vxor 17, 30, 27 - vaddudm 30, 30, 31 - vxor 18, 30, 27 - vaddudm 30, 30, 31 - vxor 19, 30, 27 - vaddudm 30, 30, 31 - vxor 20, 30, 27 - vaddudm 30, 30, 31 - vxor 21, 30, 27 - vaddudm 30, 30, 31 - vxor 22, 30, 27 - - addi 12, 12, -128 - addi 11, 11, 128 - - bdnz Loop_8x_block - - vmr 30, 29 - stxvb16x 30+32, 0, 7 # update IV - -Loop_last_block: - cmpdi 12, 0 - beq aes_gcm_out - - # loop last few blocks - li 10, 16 - divdu 10, 12, 10 - - mtctr 10 - - lwz 10, 240(6) - - cmpdi 12, 16 - blt Final_block - -Next_rem_block: - lxvb16x 15, 0, 14 # load block - - Loop_aes_middle_1x - - xxlor 23+32, 10, 10 - - cmpdi 10, 10 - beq Do_next_1x - - # 192 bits - xxlor 24+32, 11, 11 - - vcipher 15, 15, 23 - vcipher 15, 15, 24 - - xxlor 23+32, 12, 12 - - cmpdi 10, 12 - beq Do_next_1x - - # 256 bits - xxlor 24+32, 13, 13 - - vcipher 15, 15, 23 - vcipher 15, 15, 24 - - xxlor 23+32, 14, 14 - - cmpdi 10, 14 - beq Do_next_1x - -Do_next_1x: - vcipherlast 15, 15, 23 - - xxlxor 47, 47, 15 - stxvb16x 47, 0, 9 # store output - addi 14, 14, 16 - addi 9, 9, 16 - - vmr 28, 15 - ppc_update_hash_1x - - addi 12, 12, -16 - addi 11, 11, 16 - xxlor 19+32, 0, 0 - vaddudm 30, 30, 31 # IV + counter - vxor 15, 30, 19 # add round key - - bdnz Next_rem_block - - li 15, 0 - std 15, 56(7) # clear partial? - stxvb16x 30+32, 0, 7 # update IV - cmpdi 12, 0 - beq aes_gcm_out - -Final_block: - lwz 10, 240(6) - Loop_aes_middle_1x - - xxlor 23+32, 10, 10 - - cmpdi 10, 10 - beq Do_final_1x - - # 192 bits - xxlor 24+32, 11, 11 - - vcipher 15, 15, 23 - vcipher 15, 15, 24 - - xxlor 23+32, 12, 12 - - cmpdi 10, 12 - beq Do_final_1x - - # 256 bits - xxlor 24+32, 13, 13 - - vcipher 15, 15, 23 - vcipher 15, 15, 24 - - xxlor 23+32, 14, 14 - - cmpdi 10, 14 - beq Do_final_1x - -Do_final_1x: - vcipherlast 15, 15, 23 - - # check partial block - li 21, 0 # encrypt - ld 15, 56(7) # partial? - cmpdi 15, 0 - beq Normal_block - bl Do_partial_block - - cmpdi 12, 0 - ble aes_gcm_out - - b Continue_partial_check - -Normal_block: - lxvb16x 15, 0, 14 # load last block - xxlxor 47, 47, 15 - - # create partial block mask - li 15, 16 - sub 15, 15, 12 # index to the mask - - vspltisb 16, -1 # first 16 bytes - 0xffff...ff - vspltisb 17, 0 # second 16 bytes - 0x0000...00 - li 10, 192 - stvx 16, 10, 1 - addi 10, 10, 16 - stvx 17, 10, 1 - - addi 10, 1, 192 - lxvb16x 16, 15, 10 # load partial block mask - xxland 47, 47, 16 - - vmr 28, 15 - ppc_update_hash_1x - - # * should store only the remaining bytes. - bl Write_partial_block - - stxvb16x 30+32, 0, 7 # update IV - std 12, 56(7) # update partial? - li 16, 16 - - stxvb16x 32, 0, 8 # write out Xi - stxvb16x 32, 16, 8 # write out Xi - b aes_gcm_out - - # - # Compute data mask - # -.macro GEN_MASK _mask _start _end - vspltisb 16, -1 # first 16 bytes - 0xffff...ff - vspltisb 17, 0 # second 16 bytes - 0x0000...00 - li 10, 192 - stxvb16x 17+32, 10, 1 - add 10, 10, \_start - stxvb16x 16+32, 10, 1 - add 10, 10, \_end - stxvb16x 17+32, 10, 1 - - addi 10, 1, 192 - lxvb16x \_mask, 0, 10 # load partial block mask -.endm - - # - # Handle multiple partial blocks for encrypt and decrypt - # operations. - # -Do_partial_block: - add 17, 15, 5 - cmpdi 17, 16 - bgt Big_block - GEN_MASK 18, 15, 5 - b _Partial -Big_block: - li 16, 16 - GEN_MASK 18, 15, 16 - -_Partial: - lxvb16x 17+32, 0, 14 # load last block - sldi 16, 15, 3 - mtvsrdd 32+16, 0, 16 - vsro 17, 17, 16 - xxlxor 47, 47, 17+32 - xxland 47, 47, 18 - - vxor 0, 0, 0 # clear Xi - vmr 28, 15 - - cmpdi 21, 0 # encrypt/decrypt ops? - beq Skip_decrypt - xxland 32+28, 32+17, 18 - -Skip_decrypt: - - ppc_update_hash_1x - - li 16, 16 - lxvb16x 32+29, 16, 8 - vxor 0, 0, 29 - stxvb16x 32, 0, 8 # save Xi - stxvb16x 32, 16, 8 # save Xi - - # store partial block - # loop the rest of the stream if any - sldi 16, 15, 3 - mtvsrdd 32+16, 0, 16 - vslo 15, 15, 16 - #stxvb16x 15+32, 0, 9 # last block - - li 16, 16 - sub 17, 16, 15 # 16 - partial - - add 16, 15, 5 - cmpdi 16, 16 - bgt Larger_16 - mr 17, 5 -Larger_16: - - # write partial - li 10, 192 - stxvb16x 15+32, 10, 1 # save current block - - addi 10, 9, -1 - addi 16, 1, 191 - mtctr 17 # move partial byte count - -Write_last_partial: - lbzu 18, 1(16) - stbu 18, 1(10) - bdnz Write_last_partial - # Complete loop partial - - add 14, 14, 17 - add 9, 9, 17 - sub 12, 12, 17 - add 11, 11, 17 - - add 15, 15, 5 - cmpdi 15, 16 - blt Save_partial - - vaddudm 30, 30, 31 - stxvb16x 30+32, 0, 7 # update IV - xxlor 32+29, 0, 0 - vxor 15, 30, 29 # IV + round key - add round key 0 - li 15, 0 - std 15, 56(7) # partial done - clear - b Partial_done -Save_partial: - std 15, 56(7) # partial - -Partial_done: - blr - - # - # Write partial block - # r9 - output - # r12 - remaining bytes - # v15 - partial input data - # -Write_partial_block: - li 10, 192 - stxvb16x 15+32, 10, 1 # last block - - addi 10, 9, -1 - addi 16, 1, 191 - - mtctr 12 # remaining bytes - li 15, 0 - -Write_last_byte: - lbzu 14, 1(16) - stbu 14, 1(10) - bdnz Write_last_byte - blr - -aes_gcm_out: - # out = state - stxvb16x 32, 0, 8 # write out Xi - add 3, 11, 12 # return count - - RESTORE_REGS - blr - - # - # 8x Decrypt - # -.global aes_p10_gcm_decrypt -.align 5 -aes_p10_gcm_decrypt: - - SAVE_REGS - - LOAD_HASH_TABLE - - # initialize ICB: GHASH( IV ), IV - r7 - lxvb16x 30+32, 0, 7 # load IV - v30 - - mr 12, 5 # length - li 11, 0 # block index - - # counter 1 - vxor 31, 31, 31 - vspltisb 22, 1 - vsldoi 31, 31, 22,1 # counter 1 - - # load round key to VSR - lxv 0, 0(6) - lxv 1, 0x10(6) - lxv 2, 0x20(6) - lxv 3, 0x30(6) - lxv 4, 0x40(6) - lxv 5, 0x50(6) - lxv 6, 0x60(6) - lxv 7, 0x70(6) - lxv 8, 0x80(6) - lxv 9, 0x90(6) - lxv 10, 0xa0(6) - - # load rounds - 10 (128), 12 (192), 14 (256) - lwz 9,240(6) - - # - # vxor state, state, w # addroundkey - xxlor 32+29, 0, 0 - vxor 15, 30, 29 # IV + round key - add round key 0 - - cmpdi 9, 10 - beq Loop_aes_gcm_8x_dec - - # load 2 more round keys (v11, v12) - lxv 11, 0xb0(6) - lxv 12, 0xc0(6) - - cmpdi 9, 12 - beq Loop_aes_gcm_8x_dec - - # load 2 more round keys (v11, v12, v13, v14) - lxv 13, 0xd0(6) - lxv 14, 0xe0(6) - cmpdi 9, 14 - beq Loop_aes_gcm_8x_dec - - b aes_gcm_out - -.align 5 -Loop_aes_gcm_8x_dec: - mr 14, 3 - mr 9, 4 - - # - # check partial block - # -Continue_partial_check_dec: - ld 15, 56(7) - cmpdi 15, 0 - beq Continue_dec - bgt Final_block_dec - cmpdi 15, 16 - blt Final_block_dec - -Continue_dec: - # n blcoks - li 10, 128 - divdu 10, 12, 10 # n 128 bytes-blocks - cmpdi 10, 0 - beq Loop_last_block_dec - - vaddudm 30, 30, 31 # IV + counter - vxor 16, 30, 29 - vaddudm 30, 30, 31 - vxor 17, 30, 29 - vaddudm 30, 30, 31 - vxor 18, 30, 29 - vaddudm 30, 30, 31 - vxor 19, 30, 29 - vaddudm 30, 30, 31 - vxor 20, 30, 29 - vaddudm 30, 30, 31 - vxor 21, 30, 29 - vaddudm 30, 30, 31 - vxor 22, 30, 29 - - mtctr 10 - - li 15, 16 - li 16, 32 - li 17, 48 - li 18, 64 - li 19, 80 - li 20, 96 - li 21, 112 - - lwz 10, 240(6) - -Loop_8x_block_dec: - - lxvb16x 15, 0, 14 # load block - lxvb16x 16, 15, 14 # load block - lxvb16x 17, 16, 14 # load block - lxvb16x 18, 17, 14 # load block - lxvb16x 19, 18, 14 # load block - lxvb16x 20, 19, 14 # load block - lxvb16x 21, 20, 14 # load block - lxvb16x 22, 21, 14 # load block - addi 14, 14, 128 - - Loop_aes_middle8x - - xxlor 23+32, 10, 10 - - cmpdi 10, 10 - beq Do_next_ghash_dec - - # 192 bits - xxlor 24+32, 11, 11 - - vcipher 15, 15, 23 - vcipher 16, 16, 23 - vcipher 17, 17, 23 - vcipher 18, 18, 23 - vcipher 19, 19, 23 - vcipher 20, 20, 23 - vcipher 21, 21, 23 - vcipher 22, 22, 23 - - vcipher 15, 15, 24 - vcipher 16, 16, 24 - vcipher 17, 17, 24 - vcipher 18, 18, 24 - vcipher 19, 19, 24 - vcipher 20, 20, 24 - vcipher 21, 21, 24 - vcipher 22, 22, 24 - - xxlor 23+32, 12, 12 - - cmpdi 10, 12 - beq Do_next_ghash_dec - - # 256 bits - xxlor 24+32, 13, 13 - - vcipher 15, 15, 23 - vcipher 16, 16, 23 - vcipher 17, 17, 23 - vcipher 18, 18, 23 - vcipher 19, 19, 23 - vcipher 20, 20, 23 - vcipher 21, 21, 23 - vcipher 22, 22, 23 - - vcipher 15, 15, 24 - vcipher 16, 16, 24 - vcipher 17, 17, 24 - vcipher 18, 18, 24 - vcipher 19, 19, 24 - vcipher 20, 20, 24 - vcipher 21, 21, 24 - vcipher 22, 22, 24 - - xxlor 23+32, 14, 14 - - cmpdi 10, 14 - beq Do_next_ghash_dec - b aes_gcm_out - -Do_next_ghash_dec: - - # - # last round - vcipherlast 15, 15, 23 - vcipherlast 16, 16, 23 - - xxlxor 47, 47, 15 - stxvb16x 47, 0, 9 # store output - xxlxor 48, 48, 16 - stxvb16x 48, 15, 9 # store output - - vcipherlast 17, 17, 23 - vcipherlast 18, 18, 23 - - xxlxor 49, 49, 17 - stxvb16x 49, 16, 9 # store output - xxlxor 50, 50, 18 - stxvb16x 50, 17, 9 # store output - - vcipherlast 19, 19, 23 - vcipherlast 20, 20, 23 - - xxlxor 51, 51, 19 - stxvb16x 51, 18, 9 # store output - xxlxor 52, 52, 20 - stxvb16x 52, 19, 9 # store output - - vcipherlast 21, 21, 23 - vcipherlast 22, 22, 23 - - xxlxor 53, 53, 21 - stxvb16x 53, 20, 9 # store output - xxlxor 54, 54, 22 - stxvb16x 54, 21, 9 # store output - - addi 9, 9, 128 - - xxlor 15+32, 15, 15 - xxlor 16+32, 16, 16 - xxlor 17+32, 17, 17 - xxlor 18+32, 18, 18 - xxlor 19+32, 19, 19 - xxlor 20+32, 20, 20 - xxlor 21+32, 21, 21 - xxlor 22+32, 22, 22 - - # ghash here - ppc_aes_gcm_ghash2_4x - - xxlor 27+32, 0, 0 - vaddudm 30, 30, 31 # IV + counter - vmr 29, 30 - vxor 15, 30, 27 # add round key - vaddudm 30, 30, 31 - vxor 16, 30, 27 - vaddudm 30, 30, 31 - vxor 17, 30, 27 - vaddudm 30, 30, 31 - vxor 18, 30, 27 - vaddudm 30, 30, 31 - vxor 19, 30, 27 - vaddudm 30, 30, 31 - vxor 20, 30, 27 - vaddudm 30, 30, 31 - vxor 21, 30, 27 - vaddudm 30, 30, 31 - vxor 22, 30, 27 - - addi 12, 12, -128 - addi 11, 11, 128 - - bdnz Loop_8x_block_dec - - vmr 30, 29 - stxvb16x 30+32, 0, 7 # update IV - -Loop_last_block_dec: - cmpdi 12, 0 - beq aes_gcm_out - - # loop last few blocks - li 10, 16 - divdu 10, 12, 10 - - mtctr 10 - - lwz 10, 240(6) - - cmpdi 12, 16 - blt Final_block_dec - -Next_rem_block_dec: - lxvb16x 15, 0, 14 # load block - - Loop_aes_middle_1x - - xxlor 23+32, 10, 10 - - cmpdi 10, 10 - beq Do_next_1x_dec - - # 192 bits - xxlor 24+32, 11, 11 - - vcipher 15, 15, 23 - vcipher 15, 15, 24 - - xxlor 23+32, 12, 12 - - cmpdi 10, 12 - beq Do_next_1x_dec - - # 256 bits - xxlor 24+32, 13, 13 - - vcipher 15, 15, 23 - vcipher 15, 15, 24 - - xxlor 23+32, 14, 14 - - cmpdi 10, 14 - beq Do_next_1x_dec - -Do_next_1x_dec: - vcipherlast 15, 15, 23 - - xxlxor 47, 47, 15 - stxvb16x 47, 0, 9 # store output - addi 14, 14, 16 - addi 9, 9, 16 - - xxlor 28+32, 15, 15 - #vmr 28, 15 - ppc_update_hash_1x - - addi 12, 12, -16 - addi 11, 11, 16 - xxlor 19+32, 0, 0 - vaddudm 30, 30, 31 # IV + counter - vxor 15, 30, 19 # add round key - - bdnz Next_rem_block_dec - - li 15, 0 - std 15, 56(7) # clear partial? - stxvb16x 30+32, 0, 7 # update IV - cmpdi 12, 0 - beq aes_gcm_out - -Final_block_dec: - lwz 10, 240(6) - Loop_aes_middle_1x - - xxlor 23+32, 10, 10 - - cmpdi 10, 10 - beq Do_final_1x_dec - - # 192 bits - xxlor 24+32, 11, 11 - - vcipher 15, 15, 23 - vcipher 15, 15, 24 - - xxlor 23+32, 12, 12 - - cmpdi 10, 12 - beq Do_final_1x_dec - - # 256 bits - xxlor 24+32, 13, 13 - - vcipher 15, 15, 23 - vcipher 15, 15, 24 - - xxlor 23+32, 14, 14 - - cmpdi 10, 14 - beq Do_final_1x_dec - -Do_final_1x_dec: - vcipherlast 15, 15, 23 - - # check partial block - li 21, 1 # decrypt - ld 15, 56(7) # partial? - cmpdi 15, 0 - beq Normal_block_dec - bl Do_partial_block - cmpdi 12, 0 - ble aes_gcm_out - - b Continue_partial_check_dec - -Normal_block_dec: - lxvb16x 15, 0, 14 # load last block - xxlxor 47, 47, 15 - - # create partial block mask - li 15, 16 - sub 15, 15, 12 # index to the mask - - vspltisb 16, -1 # first 16 bytes - 0xffff...ff - vspltisb 17, 0 # second 16 bytes - 0x0000...00 - li 10, 192 - stvx 16, 10, 1 - addi 10, 10, 16 - stvx 17, 10, 1 - - addi 10, 1, 192 - lxvb16x 16, 15, 10 # load partial block mask - xxland 47, 47, 16 - - xxland 32+28, 15, 16 - #vmr 28, 15 - ppc_update_hash_1x - - # * should store only the remaining bytes. - bl Write_partial_block - - stxvb16x 30+32, 0, 7 # update IV - std 12, 56(7) # update partial? - li 16, 16 - - stxvb16x 32, 0, 8 # write out Xi - stxvb16x 32, 16, 8 # write out Xi - b aes_gcm_out diff --git a/arch/powerpc/crypto/ppc-xlate.pl b/arch/powerpc/crypto/ppc-xlate.pl deleted file mode 100644 index 23cca703ce29..000000000000 --- a/arch/powerpc/crypto/ppc-xlate.pl +++ /dev/null @@ -1,229 +0,0 @@ -#!/usr/bin/env perl -# SPDX-License-Identifier: GPL-2.0 - -# PowerPC assembler distiller by . - -my $flavour = shift; -my $output = shift; -open STDOUT,">$output" || die "can't open $output: $!"; - -my %GLOBALS; -my $dotinlocallabels=($flavour=~/linux/)?1:0; - -################################################################ -# directives which need special treatment on different platforms -################################################################ -my $globl = sub { - my $junk = shift; - my $name = shift; - my $global = \$GLOBALS{$name}; - my $ret; - - $name =~ s|^[\.\_]||; - - SWITCH: for ($flavour) { - /aix/ && do { $name = ".$name"; - last; - }; - /osx/ && do { $name = "_$name"; - last; - }; - /linux/ - && do { $ret = "_GLOBAL($name)"; - last; - }; - } - - $ret = ".globl $name\nalign 5\n$name:" if (!$ret); - $$global = $name; - $ret; -}; -my $text = sub { - my $ret = ($flavour =~ /aix/) ? ".csect\t.text[PR],7" : ".text"; - $ret = ".abiversion 2\n".$ret if ($flavour =~ /linux.*64le/); - $ret; -}; -my $machine = sub { - my $junk = shift; - my $arch = shift; - if ($flavour =~ /osx/) - { $arch =~ s/\"//g; - $arch = ($flavour=~/64/) ? "ppc970-64" : "ppc970" if ($arch eq "any"); - } - ".machine $arch"; -}; -my $size = sub { - if ($flavour =~ /linux/) - { shift; - my $name = shift; $name =~ s|^[\.\_]||; - my $ret = ".size $name,.-".($flavour=~/64$/?".":"").$name; - $ret .= "\n.size .$name,.-.$name" if ($flavour=~/64$/); - $ret; - } - else - { ""; } -}; -my $asciz = sub { - shift; - my $line = join(",",@_); - if ($line =~ /^"(.*)"$/) - { ".byte " . join(",",unpack("C*",$1),0) . "\n.align 2"; } - else - { ""; } -}; -my $quad = sub { - shift; - my @ret; - my ($hi,$lo); - for (@_) { - if (/^0x([0-9a-f]*?)([0-9a-f]{1,8})$/io) - { $hi=$1?"0x$1":"0"; $lo="0x$2"; } - elsif (/^([0-9]+)$/o) - { $hi=$1>>32; $lo=$1&0xffffffff; } # error-prone with 32-bit perl - else - { $hi=undef; $lo=$_; } - - if (defined($hi)) - { push(@ret,$flavour=~/le$/o?".long\t$lo,$hi":".long\t$hi,$lo"); } - else - { push(@ret,".quad $lo"); } - } - join("\n",@ret); -}; - -################################################################ -# simplified mnemonics not handled by at least one assembler -################################################################ -my $cmplw = sub { - my $f = shift; - my $cr = 0; $cr = shift if ($#_>1); - # Some out-of-date 32-bit GNU assembler just can't handle cmplw... - ($flavour =~ /linux.*32/) ? - " .long ".sprintf "0x%x",31<<26|$cr<<23|$_[0]<<16|$_[1]<<11|64 : - " cmplw ".join(',',$cr,@_); -}; -my $bdnz = sub { - my $f = shift; - my $bo = $f=~/[\+\-]/ ? 16+9 : 16; # optional "to be taken" hint - " bc $bo,0,".shift; -} if ($flavour!~/linux/); -my $bltlr = sub { - my $f = shift; - my $bo = $f=~/\-/ ? 12+2 : 12; # optional "not to be taken" hint - ($flavour =~ /linux/) ? # GNU as doesn't allow most recent hints - " .long ".sprintf "0x%x",19<<26|$bo<<21|16<<1 : - " bclr $bo,0"; -}; -my $bnelr = sub { - my $f = shift; - my $bo = $f=~/\-/ ? 4+2 : 4; # optional "not to be taken" hint - ($flavour =~ /linux/) ? # GNU as doesn't allow most recent hints - " .long ".sprintf "0x%x",19<<26|$bo<<21|2<<16|16<<1 : - " bclr $bo,2"; -}; -my $beqlr = sub { - my $f = shift; - my $bo = $f=~/-/ ? 12+2 : 12; # optional "not to be taken" hint - ($flavour =~ /linux/) ? # GNU as doesn't allow most recent hints - " .long ".sprintf "0x%X",19<<26|$bo<<21|2<<16|16<<1 : - " bclr $bo,2"; -}; -# GNU assembler can't handle extrdi rA,rS,16,48, or when sum of last two -# arguments is 64, with "operand out of range" error. -my $extrdi = sub { - my ($f,$ra,$rs,$n,$b) = @_; - $b = ($b+$n)&63; $n = 64-$n; - " rldicl $ra,$rs,$b,$n"; -}; -my $vmr = sub { - my ($f,$vx,$vy) = @_; - " vor $vx,$vy,$vy"; -}; - -# Some ABIs specify vrsave, special-purpose register #256, as reserved -# for system use. -my $no_vrsave = ($flavour =~ /linux-ppc64le/); -my $mtspr = sub { - my ($f,$idx,$ra) = @_; - if ($idx == 256 && $no_vrsave) { - " or $ra,$ra,$ra"; - } else { - " mtspr $idx,$ra"; - } -}; -my $mfspr = sub { - my ($f,$rd,$idx) = @_; - if ($idx == 256 && $no_vrsave) { - " li $rd,-1"; - } else { - " mfspr $rd,$idx"; - } -}; - -# PowerISA 2.06 stuff -sub vsxmem_op { - my ($f, $vrt, $ra, $rb, $op) = @_; - " .long ".sprintf "0x%X",(31<<26)|($vrt<<21)|($ra<<16)|($rb<<11)|($op*2+1); -} -# made-up unaligned memory reference AltiVec/VMX instructions -my $lvx_u = sub { vsxmem_op(@_, 844); }; # lxvd2x -my $stvx_u = sub { vsxmem_op(@_, 972); }; # stxvd2x -my $lvdx_u = sub { vsxmem_op(@_, 588); }; # lxsdx -my $stvdx_u = sub { vsxmem_op(@_, 716); }; # stxsdx -my $lvx_4w = sub { vsxmem_op(@_, 780); }; # lxvw4x -my $stvx_4w = sub { vsxmem_op(@_, 908); }; # stxvw4x - -# PowerISA 2.07 stuff -sub vcrypto_op { - my ($f, $vrt, $vra, $vrb, $op) = @_; - " .long ".sprintf "0x%X",(4<<26)|($vrt<<21)|($vra<<16)|($vrb<<11)|$op; -} -my $vcipher = sub { vcrypto_op(@_, 1288); }; -my $vcipherlast = sub { vcrypto_op(@_, 1289); }; -my $vncipher = sub { vcrypto_op(@_, 1352); }; -my $vncipherlast= sub { vcrypto_op(@_, 1353); }; -my $vsbox = sub { vcrypto_op(@_, 0, 1480); }; -my $vshasigmad = sub { my ($st,$six)=splice(@_,-2); vcrypto_op(@_, $st<<4|$six, 1730); }; -my $vshasigmaw = sub { my ($st,$six)=splice(@_,-2); vcrypto_op(@_, $st<<4|$six, 1666); }; -my $vpmsumb = sub { vcrypto_op(@_, 1032); }; -my $vpmsumd = sub { vcrypto_op(@_, 1224); }; -my $vpmsubh = sub { vcrypto_op(@_, 1096); }; -my $vpmsumw = sub { vcrypto_op(@_, 1160); }; -my $vaddudm = sub { vcrypto_op(@_, 192); }; -my $vadduqm = sub { vcrypto_op(@_, 256); }; - -my $mtsle = sub { - my ($f, $arg) = @_; - " .long ".sprintf "0x%X",(31<<26)|($arg<<21)|(147*2); -}; - -print "#include \n" if $flavour =~ /linux/; - -while($line=<>) { - - $line =~ s|[#!;].*$||; # get rid of asm-style comments... - $line =~ s|/\*.*\*/||; # ... and C-style comments... - $line =~ s|^\s+||; # ... and skip white spaces in beginning... - $line =~ s|\s+$||; # ... and at the end - - { - $line =~ s|\b\.L(\w+)|L$1|g; # common denominator for Locallabel - $line =~ s|\bL(\w+)|\.L$1|g if ($dotinlocallabels); - } - - { - $line =~ s|^\s*(\.?)(\w+)([\.\+\-]?)\s*||; - my $c = $1; $c = "\t" if ($c eq ""); - my $mnemonic = $2; - my $f = $3; - my $opcode = eval("\$$mnemonic"); - $line =~ s/\b(c?[rf]|v|vs)([0-9]+)\b/$2/g if ($c ne "." and $flavour !~ /osx/); - if (ref($opcode) eq 'CODE') { $line = &$opcode($f,split(',',$line)); } - elsif ($mnemonic) { $line = $c.$mnemonic.$f."\t".$line; } - } - - print $line if ($line); - print "\n"; -} - -close STDOUT; From 9e3457112b9d410f0cad760f91728251fef7e3e3 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 14 Dec 2022 18:19:55 +0100 Subject: [PATCH 040/156] crypto: arm64/gcm - add RFC4106 support Add support for RFC4106 ESP encapsulation to the accelerated GCM implementation. This results in a ~10% speedup for IPsec frames of typical size (~1420 bytes) on Cortex-A53. Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm64/crypto/ghash-ce-glue.c | 145 ++++++++++++++++++++++-------- 1 file changed, 107 insertions(+), 38 deletions(-) diff --git a/arch/arm64/crypto/ghash-ce-glue.c b/arch/arm64/crypto/ghash-ce-glue.c index e5e9adc1fcf4..97331b454ea8 100644 --- a/arch/arm64/crypto/ghash-ce-glue.c +++ b/arch/arm64/crypto/ghash-ce-glue.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -28,7 +29,8 @@ MODULE_ALIAS_CRYPTO("ghash"); #define GHASH_BLOCK_SIZE 16 #define GHASH_DIGEST_SIZE 16 -#define GCM_IV_SIZE 12 + +#define RFC4106_NONCE_SIZE 4 struct ghash_key { be128 k; @@ -43,6 +45,7 @@ struct ghash_desc_ctx { struct gcm_aes_ctx { struct crypto_aes_ctx aes_key; + u8 nonce[RFC4106_NONCE_SIZE]; struct ghash_key ghash_key; }; @@ -226,8 +229,8 @@ static int num_rounds(struct crypto_aes_ctx *ctx) return 6 + ctx->key_length / 4; } -static int gcm_setkey(struct crypto_aead *tfm, const u8 *inkey, - unsigned int keylen) +static int gcm_aes_setkey(struct crypto_aead *tfm, const u8 *inkey, + unsigned int keylen) { struct gcm_aes_ctx *ctx = crypto_aead_ctx(tfm); u8 key[GHASH_BLOCK_SIZE]; @@ -258,17 +261,9 @@ static int gcm_setkey(struct crypto_aead *tfm, const u8 *inkey, return 0; } -static int gcm_setauthsize(struct crypto_aead *tfm, unsigned int authsize) +static int gcm_aes_setauthsize(struct crypto_aead *tfm, unsigned int authsize) { - switch (authsize) { - case 4: - case 8: - case 12 ... 16: - break; - default: - return -EINVAL; - } - return 0; + return crypto_gcm_check_authsize(authsize); } static void gcm_update_mac(u64 dg[], const u8 *src, int count, u8 buf[], @@ -302,13 +297,12 @@ static void gcm_update_mac(u64 dg[], const u8 *src, int count, u8 buf[], } } -static void gcm_calculate_auth_mac(struct aead_request *req, u64 dg[]) +static void gcm_calculate_auth_mac(struct aead_request *req, u64 dg[], u32 len) { struct crypto_aead *aead = crypto_aead_reqtfm(req); struct gcm_aes_ctx *ctx = crypto_aead_ctx(aead); u8 buf[GHASH_BLOCK_SIZE]; struct scatter_walk walk; - u32 len = req->assoclen; int buf_count = 0; scatterwalk_start(&walk, req->src); @@ -338,27 +332,25 @@ static void gcm_calculate_auth_mac(struct aead_request *req, u64 dg[]) } } -static int gcm_encrypt(struct aead_request *req) +static int gcm_encrypt(struct aead_request *req, char *iv, int assoclen) { struct crypto_aead *aead = crypto_aead_reqtfm(req); struct gcm_aes_ctx *ctx = crypto_aead_ctx(aead); int nrounds = num_rounds(&ctx->aes_key); struct skcipher_walk walk; u8 buf[AES_BLOCK_SIZE]; - u8 iv[AES_BLOCK_SIZE]; u64 dg[2] = {}; be128 lengths; u8 *tag; int err; - lengths.a = cpu_to_be64(req->assoclen * 8); + lengths.a = cpu_to_be64(assoclen * 8); lengths.b = cpu_to_be64(req->cryptlen * 8); - if (req->assoclen) - gcm_calculate_auth_mac(req, dg); + if (assoclen) + gcm_calculate_auth_mac(req, dg, assoclen); - memcpy(iv, req->iv, GCM_IV_SIZE); - put_unaligned_be32(2, iv + GCM_IV_SIZE); + put_unaligned_be32(2, iv + GCM_AES_IV_SIZE); err = skcipher_walk_aead_encrypt(&walk, req, false); @@ -403,7 +395,7 @@ static int gcm_encrypt(struct aead_request *req) return 0; } -static int gcm_decrypt(struct aead_request *req) +static int gcm_decrypt(struct aead_request *req, char *iv, int assoclen) { struct crypto_aead *aead = crypto_aead_reqtfm(req); struct gcm_aes_ctx *ctx = crypto_aead_ctx(aead); @@ -412,21 +404,19 @@ static int gcm_decrypt(struct aead_request *req) struct skcipher_walk walk; u8 otag[AES_BLOCK_SIZE]; u8 buf[AES_BLOCK_SIZE]; - u8 iv[AES_BLOCK_SIZE]; u64 dg[2] = {}; be128 lengths; u8 *tag; int ret; int err; - lengths.a = cpu_to_be64(req->assoclen * 8); + lengths.a = cpu_to_be64(assoclen * 8); lengths.b = cpu_to_be64((req->cryptlen - authsize) * 8); - if (req->assoclen) - gcm_calculate_auth_mac(req, dg); + if (assoclen) + gcm_calculate_auth_mac(req, dg, assoclen); - memcpy(iv, req->iv, GCM_IV_SIZE); - put_unaligned_be32(2, iv + GCM_IV_SIZE); + put_unaligned_be32(2, iv + GCM_AES_IV_SIZE); scatterwalk_map_and_copy(otag, req->src, req->assoclen + req->cryptlen - authsize, @@ -471,14 +461,76 @@ static int gcm_decrypt(struct aead_request *req) return ret ? -EBADMSG : 0; } -static struct aead_alg gcm_aes_alg = { - .ivsize = GCM_IV_SIZE, +static int gcm_aes_encrypt(struct aead_request *req) +{ + u8 iv[AES_BLOCK_SIZE]; + + memcpy(iv, req->iv, GCM_AES_IV_SIZE); + return gcm_encrypt(req, iv, req->assoclen); +} + +static int gcm_aes_decrypt(struct aead_request *req) +{ + u8 iv[AES_BLOCK_SIZE]; + + memcpy(iv, req->iv, GCM_AES_IV_SIZE); + return gcm_decrypt(req, iv, req->assoclen); +} + +static int rfc4106_setkey(struct crypto_aead *tfm, const u8 *inkey, + unsigned int keylen) +{ + struct gcm_aes_ctx *ctx = crypto_aead_ctx(tfm); + int err; + + keylen -= RFC4106_NONCE_SIZE; + err = gcm_aes_setkey(tfm, inkey, keylen); + if (err) + return err; + + memcpy(ctx->nonce, inkey + keylen, RFC4106_NONCE_SIZE); + return 0; +} + +static int rfc4106_setauthsize(struct crypto_aead *tfm, unsigned int authsize) +{ + return crypto_rfc4106_check_authsize(authsize); +} + +static int rfc4106_encrypt(struct aead_request *req) +{ + struct crypto_aead *aead = crypto_aead_reqtfm(req); + struct gcm_aes_ctx *ctx = crypto_aead_ctx(aead); + u8 iv[AES_BLOCK_SIZE]; + + memcpy(iv, ctx->nonce, RFC4106_NONCE_SIZE); + memcpy(iv + RFC4106_NONCE_SIZE, req->iv, GCM_RFC4106_IV_SIZE); + + return crypto_ipsec_check_assoclen(req->assoclen) ?: + gcm_encrypt(req, iv, req->assoclen - GCM_RFC4106_IV_SIZE); +} + +static int rfc4106_decrypt(struct aead_request *req) +{ + struct crypto_aead *aead = crypto_aead_reqtfm(req); + struct gcm_aes_ctx *ctx = crypto_aead_ctx(aead); + u8 iv[AES_BLOCK_SIZE]; + + memcpy(iv, ctx->nonce, RFC4106_NONCE_SIZE); + memcpy(iv + RFC4106_NONCE_SIZE, req->iv, GCM_RFC4106_IV_SIZE); + + return crypto_ipsec_check_assoclen(req->assoclen) ?: + gcm_decrypt(req, iv, req->assoclen - GCM_RFC4106_IV_SIZE); +} + +static struct aead_alg gcm_aes_algs[] = {{ + .ivsize = GCM_AES_IV_SIZE, .chunksize = AES_BLOCK_SIZE, .maxauthsize = AES_BLOCK_SIZE, - .setkey = gcm_setkey, - .setauthsize = gcm_setauthsize, - .encrypt = gcm_encrypt, - .decrypt = gcm_decrypt, + .setkey = gcm_aes_setkey, + .setauthsize = gcm_aes_setauthsize, + .encrypt = gcm_aes_encrypt, + .decrypt = gcm_aes_decrypt, .base.cra_name = "gcm(aes)", .base.cra_driver_name = "gcm-aes-ce", @@ -487,7 +539,23 @@ static struct aead_alg gcm_aes_alg = { .base.cra_ctxsize = sizeof(struct gcm_aes_ctx) + 4 * sizeof(u64[2]), .base.cra_module = THIS_MODULE, -}; +}, { + .ivsize = GCM_RFC4106_IV_SIZE, + .chunksize = AES_BLOCK_SIZE, + .maxauthsize = AES_BLOCK_SIZE, + .setkey = rfc4106_setkey, + .setauthsize = rfc4106_setauthsize, + .encrypt = rfc4106_encrypt, + .decrypt = rfc4106_decrypt, + + .base.cra_name = "rfc4106(gcm(aes))", + .base.cra_driver_name = "rfc4106-gcm-aes-ce", + .base.cra_priority = 300, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct gcm_aes_ctx) + + 4 * sizeof(u64[2]), + .base.cra_module = THIS_MODULE, +}}; static int __init ghash_ce_mod_init(void) { @@ -495,7 +563,8 @@ static int __init ghash_ce_mod_init(void) return -ENODEV; if (cpu_have_named_feature(PMULL)) - return crypto_register_aead(&gcm_aes_alg); + return crypto_register_aeads(gcm_aes_algs, + ARRAY_SIZE(gcm_aes_algs)); return crypto_register_shash(&ghash_alg); } @@ -503,7 +572,7 @@ static int __init ghash_ce_mod_init(void) static void __exit ghash_ce_mod_exit(void) { if (cpu_have_named_feature(PMULL)) - crypto_unregister_aead(&gcm_aes_alg); + crypto_unregister_aeads(gcm_aes_algs, ARRAY_SIZE(gcm_aes_algs)); else crypto_unregister_shash(&ghash_alg); } From 425359aef47972b2d14e15b6702d60d86779af65 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 14 Dec 2022 18:19:56 +0100 Subject: [PATCH 041/156] crypto: tcrypt - include larger key sizes in RFC4106 benchmark RFC4106 wraps AES in GCM mode, and can be used with larger key sizes than 128/160 bits, just like AES itself. So add these to the tcrypt recipe so they will be benchmarked as well. Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- crypto/tcrypt.c | 8 ++++---- crypto/tcrypt.h | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c index a0833654ce94..6521feec7756 100644 --- a/crypto/tcrypt.c +++ b/crypto/tcrypt.c @@ -2044,11 +2044,11 @@ static int do_test(const char *alg, u32 type, u32 mask, int m, u32 num_mb) case 211: test_aead_speed("rfc4106(gcm(aes))", ENCRYPT, sec, - NULL, 0, 16, 16, aead_speed_template_20); + NULL, 0, 16, 16, aead_speed_template_20_28_36); test_aead_speed("gcm(aes)", ENCRYPT, sec, NULL, 0, 16, 8, speed_template_16_24_32); test_aead_speed("rfc4106(gcm(aes))", DECRYPT, sec, - NULL, 0, 16, 16, aead_speed_template_20); + NULL, 0, 16, 16, aead_speed_template_20_28_36); test_aead_speed("gcm(aes)", DECRYPT, sec, NULL, 0, 16, 8, speed_template_16_24_32); break; @@ -2074,11 +2074,11 @@ static int do_test(const char *alg, u32 type, u32 mask, int m, u32 num_mb) case 215: test_mb_aead_speed("rfc4106(gcm(aes))", ENCRYPT, sec, NULL, - 0, 16, 16, aead_speed_template_20, num_mb); + 0, 16, 16, aead_speed_template_20_28_36, num_mb); test_mb_aead_speed("gcm(aes)", ENCRYPT, sec, NULL, 0, 16, 8, speed_template_16_24_32, num_mb); test_mb_aead_speed("rfc4106(gcm(aes))", DECRYPT, sec, NULL, - 0, 16, 16, aead_speed_template_20, num_mb); + 0, 16, 16, aead_speed_template_20_28_36, num_mb); test_mb_aead_speed("gcm(aes)", DECRYPT, sec, NULL, 0, 16, 8, speed_template_16_24_32, num_mb); break; diff --git a/crypto/tcrypt.h b/crypto/tcrypt.h index 9f654677172a..96c843a24607 100644 --- a/crypto/tcrypt.h +++ b/crypto/tcrypt.h @@ -62,7 +62,7 @@ static u8 speed_template_32[] = {32, 0}; * AEAD speed tests */ static u8 aead_speed_template_19[] = {19, 0}; -static u8 aead_speed_template_20[] = {20, 0}; +static u8 aead_speed_template_20_28_36[] = {20, 28, 36, 0}; static u8 aead_speed_template_36[] = {36, 0}; /* From 2b0c954010873166fdf6a8468f25ff24dcc8aa05 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 14 Dec 2022 18:19:57 +0100 Subject: [PATCH 042/156] crypto: aead - fix inaccurate documentation The AEAD documentation conflates associated data and authentication tags: the former (along with the ciphertext) is authenticated by the latter. Fix the doc accordingly. Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- include/crypto/aead.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/include/crypto/aead.h b/include/crypto/aead.h index 14db3bee0519..4a2b7e6e0c1f 100644 --- a/include/crypto/aead.h +++ b/include/crypto/aead.h @@ -27,15 +27,12 @@ * * For example: authenc(hmac(sha256), cbc(aes)) * - * The example code provided for the symmetric key cipher operation - * applies here as well. Naturally all *skcipher* symbols must be exchanged - * the *aead* pendants discussed in the following. In addition, for the AEAD - * operation, the aead_request_set_ad function must be used to set the - * pointer to the associated data memory location before performing the - * encryption or decryption operation. In case of an encryption, the associated - * data memory is filled during the encryption operation. For decryption, the - * associated data memory must contain data that is used to verify the integrity - * of the decrypted data. Another deviation from the asynchronous block cipher + * The example code provided for the symmetric key cipher operation applies + * here as well. Naturally all *skcipher* symbols must be exchanged the *aead* + * pendants discussed in the following. In addition, for the AEAD operation, + * the aead_request_set_ad function must be used to set the pointer to the + * associated data memory location before performing the encryption or + * decryption operation. Another deviation from the asynchronous block cipher * operation is that the caller should explicitly check for -EBADMSG of the * crypto_aead_decrypt. That error indicates an authentication error, i.e. * a breach in the integrity of the message. In essence, that -EBADMSG error @@ -49,7 +46,10 @@ * * The destination scatterlist has the same layout, except that the plaintext * (resp. ciphertext) will grow (resp. shrink) by the authentication tag size - * during encryption (resp. decryption). + * during encryption (resp. decryption). The authentication tag is generated + * during the encryption operation and appended to the ciphertext. During + * decryption, the authentication tag is consumed along with the ciphertext and + * used to verify the integrity of the plaintext and the associated data. * * In-place encryption/decryption is enabled by using the same scatterlist * pointer for both the source and destination. From 299bf602b3f92f1456aef59c6413591fb02e762a Mon Sep 17 00:00:00 2001 From: Koba Ko Date: Mon, 9 Jan 2023 10:15:02 +0800 Subject: [PATCH 043/156] crypto: ccp - Failure on re-initialization due to duplicate sysfs filename The following warning appears during the CCP module re-initialization: [ 140.965403] sysfs: cannot create duplicate filename '/devices/pci0000:00/0000:00:07.1/0000:03:00.2/dma/dma0chan0' [ 140.975736] CPU: 0 PID: 388 Comm: kworker/0:2 Kdump: loaded Not tainted 6.2.0-0.rc2.18.eln124.x86_64 #1 [ 140.985185] Hardware name: HPE ProLiant DL325 Gen10/ProLiant DL325 Gen10, BIOS A41 07/17/2020 [ 140.993761] Workqueue: events work_for_cpu_fn [ 140.998151] Call Trace: [ 141.000613] [ 141.002726] dump_stack_lvl+0x33/0x46 [ 141.006415] sysfs_warn_dup.cold+0x17/0x23 [ 141.010542] sysfs_create_dir_ns+0xba/0xd0 [ 141.014670] kobject_add_internal+0xba/0x260 [ 141.018970] kobject_add+0x81/0xb0 [ 141.022395] device_add+0xdc/0x7e0 [ 141.025822] ? complete_all+0x20/0x90 [ 141.029510] __dma_async_device_channel_register+0xc9/0x130 [ 141.035119] dma_async_device_register+0x19e/0x3b0 [ 141.039943] ccp_dmaengine_register+0x334/0x3f0 [ccp] [ 141.045042] ccp5_init+0x662/0x6a0 [ccp] [ 141.049000] ? devm_kmalloc+0x40/0xd0 [ 141.052688] ccp_dev_init+0xbb/0xf0 [ccp] [ 141.056732] ? __pci_set_master+0x56/0xd0 [ 141.060768] sp_init+0x70/0x90 [ccp] [ 141.064377] sp_pci_probe+0x186/0x1b0 [ccp] [ 141.068596] local_pci_probe+0x41/0x80 [ 141.072374] work_for_cpu_fn+0x16/0x20 [ 141.076145] process_one_work+0x1c8/0x380 [ 141.080181] worker_thread+0x1ab/0x380 [ 141.083953] ? __pfx_worker_thread+0x10/0x10 [ 141.088250] kthread+0xda/0x100 [ 141.091413] ? __pfx_kthread+0x10/0x10 [ 141.095185] ret_from_fork+0x2c/0x50 [ 141.098788] [ 141.100996] kobject_add_internal failed for dma0chan0 with -EEXIST, don't try to register things with the same name in the same directory. [ 141.113703] ccp 0000:03:00.2: ccp initialization failed The /dma/dma0chan0 sysfs file is not removed since dma_chan object has been released in ccp_dma_release() before releasing dma device. A correct procedure would be: release dma channels first => unregister dma device => release ccp dma object. Link: https://bugzilla.kernel.org/show_bug.cgi?id=216888 Fixes: 68dbe80f5b51 ("crypto: ccp - Release dma channels before dmaengine unrgister") Tested-by: Vladis Dronov Signed-off-by: Koba Ko Reviewed-by: Vladis Dronov Acked-by: Tom Lendacky Signed-off-by: Herbert Xu --- drivers/crypto/ccp/ccp-dmaengine.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/drivers/crypto/ccp/ccp-dmaengine.c b/drivers/crypto/ccp/ccp-dmaengine.c index 9f753cb4f5f1..b386a7063818 100644 --- a/drivers/crypto/ccp/ccp-dmaengine.c +++ b/drivers/crypto/ccp/ccp-dmaengine.c @@ -642,14 +642,26 @@ static void ccp_dma_release(struct ccp_device *ccp) chan = ccp->ccp_dma_chan + i; dma_chan = &chan->dma_chan; - if (dma_chan->client_count) - dma_release_channel(dma_chan); - tasklet_kill(&chan->cleanup_tasklet); list_del_rcu(&dma_chan->device_node); } } +static void ccp_dma_release_channels(struct ccp_device *ccp) +{ + struct ccp_dma_chan *chan; + struct dma_chan *dma_chan; + unsigned int i; + + for (i = 0; i < ccp->cmd_q_count; i++) { + chan = ccp->ccp_dma_chan + i; + dma_chan = &chan->dma_chan; + + if (dma_chan->client_count) + dma_release_channel(dma_chan); + } +} + int ccp_dmaengine_register(struct ccp_device *ccp) { struct ccp_dma_chan *chan; @@ -770,8 +782,9 @@ void ccp_dmaengine_unregister(struct ccp_device *ccp) if (!dmaengine) return; - ccp_dma_release(ccp); + ccp_dma_release_channels(ccp); dma_async_device_unregister(dma_dev); + ccp_dma_release(ccp); kmem_cache_destroy(ccp->dma_desc_cache); kmem_cache_destroy(ccp->dma_cmd_cache); From f104b2169e6841b1629ff8cea15b751bc6f9a0e7 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 9 Jan 2023 19:39:17 -0600 Subject: [PATCH 044/156] crypto: aspeed - Replace zero-length array with flexible-array member Zero-length arrays are deprecated[1] and we are moving towards adopting C99 flexible-array members instead. So, replace zero-length array declaration in struct aspeed_sham_ctx with flex-array member. This helps with the ongoing efforts to tighten the FORTIFY_SOURCE routines on memcpy() and help us make progress towards globally enabling -fstrict-flex-arrays=3 [2]. Link: https://www.kernel.org/doc/html/latest/process/deprecated.html#zero-length-and-one-element-arrays [1] Link: https://gcc.gnu.org/pipermail/gcc-patches/2022-October/602902.html [2] Link: https://github.com/KSPP/linux/issues/78 Signed-off-by: Gustavo A. R. Silva Reviewed-by: Neal Liu Reviewed-by: Kees Cook Signed-off-by: Herbert Xu --- drivers/crypto/aspeed/aspeed-hace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/crypto/aspeed/aspeed-hace.h b/drivers/crypto/aspeed/aspeed-hace.h index f2cde23b56ae..05d0a15d546d 100644 --- a/drivers/crypto/aspeed/aspeed-hace.h +++ b/drivers/crypto/aspeed/aspeed-hace.h @@ -183,7 +183,7 @@ struct aspeed_sham_ctx { struct aspeed_hace_dev *hace_dev; unsigned long flags; /* hmac flag */ - struct aspeed_sha_hmac_ctx base[0]; + struct aspeed_sha_hmac_ctx base[]; }; struct aspeed_sham_reqctx { From 319ad16d62d3da695faebd2d5def6be0d542f586 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Tue, 10 Jan 2023 20:43:07 +0100 Subject: [PATCH 045/156] crypto: stm32 - Use accelerated readsl/writesl When reading or writing crypto buffers the inner loops can be replaced with readsl and writesl which will on ARM result in a tight assembly loop, speeding up encryption/decryption a little bit. This optimization was in the Ux500 driver so let's carry it over to the STM32 driver. Cc: Maxime Coquelin Cc: Alexandre Torgue Cc: Lionel Debieve Signed-off-by: Linus Walleij Signed-off-by: Herbert Xu --- drivers/crypto/stm32/stm32-cryp.c | 37 +++++++++---------------------- 1 file changed, 11 insertions(+), 26 deletions(-) diff --git a/drivers/crypto/stm32/stm32-cryp.c b/drivers/crypto/stm32/stm32-cryp.c index 4208338e72b6..6b8d731092a4 100644 --- a/drivers/crypto/stm32/stm32-cryp.c +++ b/drivers/crypto/stm32/stm32-cryp.c @@ -597,7 +597,6 @@ static void stm32_crypt_gcmccm_end_header(struct stm32_cryp *cryp) static void stm32_cryp_write_ccm_first_header(struct stm32_cryp *cryp) { - unsigned int i; size_t written; size_t len; u32 alen = cryp->areq->assoclen; @@ -623,8 +622,8 @@ static void stm32_cryp_write_ccm_first_header(struct stm32_cryp *cryp) written = min_t(size_t, AES_BLOCK_SIZE - len, alen); scatterwalk_copychunks((char *)block + len, &cryp->in_walk, written, 0); - for (i = 0; i < AES_BLOCK_32; i++) - stm32_cryp_write(cryp, cryp->caps->din, block[i]); + + writesl(cryp->regs + cryp->caps->din, block, AES_BLOCK_32); cryp->header_in -= written; @@ -1363,18 +1362,14 @@ static int stm32_cryp_read_auth_tag(struct stm32_cryp *cryp) u32 out_tag[AES_BLOCK_32]; /* Get and write tag */ - for (i = 0; i < AES_BLOCK_32; i++) - out_tag[i] = stm32_cryp_read(cryp, cryp->caps->dout); - + readsl(cryp->regs + cryp->caps->dout, out_tag, AES_BLOCK_32); scatterwalk_copychunks(out_tag, &cryp->out_walk, cryp->authsize, 1); } else { /* Get and check tag */ u32 in_tag[AES_BLOCK_32], out_tag[AES_BLOCK_32]; scatterwalk_copychunks(in_tag, &cryp->in_walk, cryp->authsize, 0); - - for (i = 0; i < AES_BLOCK_32; i++) - out_tag[i] = stm32_cryp_read(cryp, cryp->caps->dout); + readsl(cryp->regs + cryp->caps->dout, out_tag, AES_BLOCK_32); if (crypto_memneq(in_tag, out_tag, cryp->authsize)) ret = -EBADMSG; @@ -1415,12 +1410,9 @@ static void stm32_cryp_check_ctr_counter(struct stm32_cryp *cryp) static void stm32_cryp_irq_read_data(struct stm32_cryp *cryp) { - unsigned int i; u32 block[AES_BLOCK_32]; - for (i = 0; i < cryp->hw_blocksize / sizeof(u32); i++) - block[i] = stm32_cryp_read(cryp, cryp->caps->dout); - + readsl(cryp->regs + cryp->caps->dout, block, cryp->hw_blocksize / sizeof(u32)); scatterwalk_copychunks(block, &cryp->out_walk, min_t(size_t, cryp->hw_blocksize, cryp->payload_out), 1); cryp->payload_out -= min_t(size_t, cryp->hw_blocksize, @@ -1429,14 +1421,11 @@ static void stm32_cryp_irq_read_data(struct stm32_cryp *cryp) static void stm32_cryp_irq_write_block(struct stm32_cryp *cryp) { - unsigned int i; u32 block[AES_BLOCK_32] = {0}; scatterwalk_copychunks(block, &cryp->in_walk, min_t(size_t, cryp->hw_blocksize, cryp->payload_in), 0); - for (i = 0; i < cryp->hw_blocksize / sizeof(u32); i++) - stm32_cryp_write(cryp, cryp->caps->din, block[i]); - + writesl(cryp->regs + cryp->caps->din, block, cryp->hw_blocksize / sizeof(u32)); cryp->payload_in -= min_t(size_t, cryp->hw_blocksize, cryp->payload_in); } @@ -1480,8 +1469,7 @@ static void stm32_cryp_irq_write_gcm_padded_data(struct stm32_cryp *cryp) * Same code as stm32_cryp_irq_read_data(), but we want to store * block value */ - for (i = 0; i < cryp->hw_blocksize / sizeof(u32); i++) - block[i] = stm32_cryp_read(cryp, cryp->caps->dout); + readsl(cryp->regs + cryp->caps->dout, block, cryp->hw_blocksize / sizeof(u32)); scatterwalk_copychunks(block, &cryp->out_walk, min_t(size_t, cryp->hw_blocksize, cryp->payload_out), 1); @@ -1499,8 +1487,7 @@ static void stm32_cryp_irq_write_gcm_padded_data(struct stm32_cryp *cryp) stm32_cryp_write(cryp, cryp->caps->cr, cfg); /* f) write padded data */ - for (i = 0; i < AES_BLOCK_32; i++) - stm32_cryp_write(cryp, cryp->caps->din, block[i]); + writesl(cryp->regs + cryp->caps->din, block, AES_BLOCK_32); /* g) Empty fifo out */ err = stm32_cryp_wait_output(cryp); @@ -1580,8 +1567,7 @@ static void stm32_cryp_irq_write_ccm_padded_data(struct stm32_cryp *cryp) * Same code as stm32_cryp_irq_read_data(), but we want to store * block value */ - for (i = 0; i < cryp->hw_blocksize / sizeof(u32); i++) - block[i] = stm32_cryp_read(cryp, cryp->caps->dout); + readsl(cryp->regs + cryp->caps->dout, block, cryp->hw_blocksize / sizeof(u32)); scatterwalk_copychunks(block, &cryp->out_walk, min_t(size_t, cryp->hw_blocksize, cryp->payload_out), 1); @@ -1660,15 +1646,14 @@ static void stm32_cryp_irq_write_data(struct stm32_cryp *cryp) static void stm32_cryp_irq_write_gcmccm_header(struct stm32_cryp *cryp) { - unsigned int i; u32 block[AES_BLOCK_32] = {0}; size_t written; written = min_t(size_t, AES_BLOCK_SIZE, cryp->header_in); scatterwalk_copychunks(block, &cryp->in_walk, written, 0); - for (i = 0; i < AES_BLOCK_32; i++) - stm32_cryp_write(cryp, cryp->caps->din, block[i]); + + writesl(cryp->regs + cryp->caps->din, block, AES_BLOCK_32); cryp->header_in -= written; From 00bef64ac3c95ab0f73b5d352743bf2bd6f5fc99 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Wed, 11 Jan 2023 13:02:03 +0100 Subject: [PATCH 046/156] crypto: hisilicon - remove redundant config PCI dependency for some CRYPTO_DEV_HISI configs While reviewing dependencies in some Kconfig files, I noticed the redundant dependency "depends on PCI && PCI_MSI". The config PCI_MSI has always, since its introduction, been dependent on the config PCI. So, it is sufficient to just depend on PCI_MSI, and know that the dependency on PCI is implicitly implied. Reduce the dependencies of configs CRYPTO_DEV_HISI_SEC2, CRYPTO_DEV_HISI_QM, CRYPTO_DEV_HISI_ZIP and CRYPTO_DEV_HISI_HPRE. No functional change and effective change of Kconfig dependendencies. Signed-off-by: Lukas Bulwahn Acked-by: Weili Qian Signed-off-by: Herbert Xu --- drivers/crypto/hisilicon/Kconfig | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/crypto/hisilicon/Kconfig b/drivers/crypto/hisilicon/Kconfig index 743ce4fc3158..4137a8bf131f 100644 --- a/drivers/crypto/hisilicon/Kconfig +++ b/drivers/crypto/hisilicon/Kconfig @@ -27,7 +27,7 @@ config CRYPTO_DEV_HISI_SEC2 select CRYPTO_SHA256 select CRYPTO_SHA512 select CRYPTO_SM4_GENERIC - depends on PCI && PCI_MSI + depends on PCI_MSI depends on UACCE || UACCE=n depends on ARM64 || (COMPILE_TEST && 64BIT) depends on ACPI @@ -42,7 +42,7 @@ config CRYPTO_DEV_HISI_SEC2 config CRYPTO_DEV_HISI_QM tristate depends on ARM64 || COMPILE_TEST - depends on PCI && PCI_MSI + depends on PCI_MSI depends on UACCE || UACCE=n depends on ACPI help @@ -51,7 +51,7 @@ config CRYPTO_DEV_HISI_QM config CRYPTO_DEV_HISI_ZIP tristate "Support for HiSilicon ZIP accelerator" - depends on PCI && PCI_MSI + depends on PCI_MSI depends on ARM64 || (COMPILE_TEST && 64BIT) depends on !CPU_BIG_ENDIAN || COMPILE_TEST depends on UACCE || UACCE=n @@ -62,7 +62,7 @@ config CRYPTO_DEV_HISI_ZIP config CRYPTO_DEV_HISI_HPRE tristate "Support for HISI HPRE accelerator" - depends on PCI && PCI_MSI + depends on PCI_MSI depends on UACCE || UACCE=n depends on ARM64 || (COMPILE_TEST && 64BIT) depends on ACPI From 692ed5d4b270d1b12aa7f00f25f4f1e914831cb3 Mon Sep 17 00:00:00 2001 From: Meadhbh Fitzpatrick Date: Thu, 12 Jan 2023 15:51:54 +0100 Subject: [PATCH 047/156] crypto: qat - fix spelling mistakes from 'bufer' to 'buffer' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix spelling mistakes from 'bufer' to 'buffer' in qat_common. Also fix indentation issue caused by the spelling change. Signed-off-by: Meadhbh Fitzpatrick Reviewed-by: Ilpo Järvinen Reviewed-by: Giovanni Cabiddu Signed-off-by: Herbert Xu --- .../qat_common/adf_transport_access_macros.h | 2 +- drivers/crypto/qat/qat_common/qat_bl.c | 86 +++++++++---------- drivers/crypto/qat/qat_common/qat_bl.h | 2 +- 3 files changed, 45 insertions(+), 45 deletions(-) diff --git a/drivers/crypto/qat/qat_common/adf_transport_access_macros.h b/drivers/crypto/qat/qat_common/adf_transport_access_macros.h index 3b6b0267bbec..d3667dbd9826 100644 --- a/drivers/crypto/qat/qat_common/adf_transport_access_macros.h +++ b/drivers/crypto/qat/qat_common/adf_transport_access_macros.h @@ -37,7 +37,7 @@ #define ADF_SIZE_TO_RING_SIZE_IN_BYTES(SIZE) ((1 << (SIZE - 1)) << 7) #define ADF_RING_SIZE_IN_BYTES_TO_SIZE(SIZE) ((1 << (SIZE - 1)) >> 7) -/* Minimum ring bufer size for memory allocation */ +/* Minimum ring buffer size for memory allocation */ #define ADF_RING_SIZE_BYTES_MIN(SIZE) \ ((SIZE < ADF_SIZE_TO_RING_SIZE_IN_BYTES(ADF_RING_SIZE_4K)) ? \ ADF_SIZE_TO_RING_SIZE_IN_BYTES(ADF_RING_SIZE_4K) : SIZE) diff --git a/drivers/crypto/qat/qat_common/qat_bl.c b/drivers/crypto/qat/qat_common/qat_bl.c index 2e89ff08041b..c72831fa025d 100644 --- a/drivers/crypto/qat/qat_common/qat_bl.c +++ b/drivers/crypto/qat/qat_common/qat_bl.c @@ -26,8 +26,8 @@ void qat_bl_free_bufl(struct adf_accel_dev *accel_dev, bl_dma_dir = blp != blpout ? DMA_TO_DEVICE : DMA_BIDIRECTIONAL; for (i = 0; i < bl->num_bufs; i++) - dma_unmap_single(dev, bl->bufers[i].addr, - bl->bufers[i].len, bl_dma_dir); + dma_unmap_single(dev, bl->buffers[i].addr, + bl->buffers[i].len, bl_dma_dir); dma_unmap_single(dev, blp, sz, DMA_TO_DEVICE); @@ -36,8 +36,8 @@ void qat_bl_free_bufl(struct adf_accel_dev *accel_dev, if (blp != blpout) { for (i = 0; i < blout->num_mapped_bufs; i++) { - dma_unmap_single(dev, blout->bufers[i].addr, - blout->bufers[i].len, + dma_unmap_single(dev, blout->buffers[i].addr, + blout->buffers[i].len, DMA_FROM_DEVICE); } dma_unmap_single(dev, blpout, sz_out, DMA_TO_DEVICE); @@ -63,7 +63,7 @@ static int __qat_bl_sgl_to_bufl(struct adf_accel_dev *accel_dev, dma_addr_t blp = DMA_MAPPING_ERROR; dma_addr_t bloutp = DMA_MAPPING_ERROR; struct scatterlist *sg; - size_t sz_out, sz = struct_size(bufl, bufers, n); + size_t sz_out, sz = struct_size(bufl, buffers, n); int node = dev_to_node(&GET_DEV(accel_dev)); int bufl_dma_dir; @@ -86,7 +86,7 @@ static int __qat_bl_sgl_to_bufl(struct adf_accel_dev *accel_dev, bufl_dma_dir = sgl != sglout ? DMA_TO_DEVICE : DMA_BIDIRECTIONAL; for (i = 0; i < n; i++) - bufl->bufers[i].addr = DMA_MAPPING_ERROR; + bufl->buffers[i].addr = DMA_MAPPING_ERROR; for_each_sg(sgl, sg, n, i) { int y = sg_nctr; @@ -94,11 +94,11 @@ static int __qat_bl_sgl_to_bufl(struct adf_accel_dev *accel_dev, if (!sg->length) continue; - bufl->bufers[y].addr = dma_map_single(dev, sg_virt(sg), - sg->length, - bufl_dma_dir); - bufl->bufers[y].len = sg->length; - if (unlikely(dma_mapping_error(dev, bufl->bufers[y].addr))) + bufl->buffers[y].addr = dma_map_single(dev, sg_virt(sg), + sg->length, + bufl_dma_dir); + bufl->buffers[y].len = sg->length; + if (unlikely(dma_mapping_error(dev, bufl->buffers[y].addr))) goto err_in; sg_nctr++; } @@ -111,12 +111,12 @@ static int __qat_bl_sgl_to_bufl(struct adf_accel_dev *accel_dev, buf->sz = sz; /* Handle out of place operation */ if (sgl != sglout) { - struct qat_alg_buf *bufers; + struct qat_alg_buf *buffers; int extra_buff = extra_dst_buff ? 1 : 0; int n_sglout = sg_nents(sglout); n = n_sglout + extra_buff; - sz_out = struct_size(buflout, bufers, n); + sz_out = struct_size(buflout, buffers, n); sg_nctr = 0; if (n > QAT_MAX_BUFF_DESC) { @@ -129,9 +129,9 @@ static int __qat_bl_sgl_to_bufl(struct adf_accel_dev *accel_dev, buf->sgl_dst_valid = true; } - bufers = buflout->bufers; + buffers = buflout->buffers; for (i = 0; i < n; i++) - bufers[i].addr = DMA_MAPPING_ERROR; + buffers[i].addr = DMA_MAPPING_ERROR; for_each_sg(sglout, sg, n_sglout, i) { int y = sg_nctr; @@ -139,17 +139,17 @@ static int __qat_bl_sgl_to_bufl(struct adf_accel_dev *accel_dev, if (!sg->length) continue; - bufers[y].addr = dma_map_single(dev, sg_virt(sg), - sg->length, - DMA_FROM_DEVICE); - if (unlikely(dma_mapping_error(dev, bufers[y].addr))) + buffers[y].addr = dma_map_single(dev, sg_virt(sg), + sg->length, + DMA_FROM_DEVICE); + if (unlikely(dma_mapping_error(dev, buffers[y].addr))) goto err_out; - bufers[y].len = sg->length; + buffers[y].len = sg->length; sg_nctr++; } if (extra_buff) { - bufers[sg_nctr].addr = extra_dst_buff; - bufers[sg_nctr].len = sz_extra_dst_buff; + buffers[sg_nctr].addr = extra_dst_buff; + buffers[sg_nctr].len = sz_extra_dst_buff; } buflout->num_bufs = sg_nctr; @@ -174,11 +174,11 @@ err_out: n = sg_nents(sglout); for (i = 0; i < n; i++) { - if (buflout->bufers[i].addr == extra_dst_buff) + if (buflout->buffers[i].addr == extra_dst_buff) break; - if (!dma_mapping_error(dev, buflout->bufers[i].addr)) - dma_unmap_single(dev, buflout->bufers[i].addr, - buflout->bufers[i].len, + if (!dma_mapping_error(dev, buflout->buffers[i].addr)) + dma_unmap_single(dev, buflout->buffers[i].addr, + buflout->buffers[i].len, DMA_FROM_DEVICE); } @@ -191,9 +191,9 @@ err_in: n = sg_nents(sgl); for (i = 0; i < n; i++) - if (!dma_mapping_error(dev, bufl->bufers[i].addr)) - dma_unmap_single(dev, bufl->bufers[i].addr, - bufl->bufers[i].len, + if (!dma_mapping_error(dev, bufl->buffers[i].addr)) + dma_unmap_single(dev, bufl->buffers[i].addr, + bufl->buffers[i].len, bufl_dma_dir); if (!buf->sgl_src_valid) @@ -231,9 +231,9 @@ static void qat_bl_sgl_unmap(struct adf_accel_dev *accel_dev, int i; for (i = 0; i < n; i++) - if (!dma_mapping_error(dev, bl->bufers[i].addr)) - dma_unmap_single(dev, bl->bufers[i].addr, - bl->bufers[i].len, DMA_FROM_DEVICE); + if (!dma_mapping_error(dev, bl->buffers[i].addr)) + dma_unmap_single(dev, bl->buffers[i].addr, + bl->buffers[i].len, DMA_FROM_DEVICE); } static int qat_bl_sgl_map(struct adf_accel_dev *accel_dev, @@ -248,13 +248,13 @@ static int qat_bl_sgl_map(struct adf_accel_dev *accel_dev, size_t sz; n = sg_nents(sgl); - sz = struct_size(bufl, bufers, n); + sz = struct_size(bufl, buffers, n); bufl = kzalloc_node(sz, GFP_KERNEL, node); if (unlikely(!bufl)) return -ENOMEM; for (i = 0; i < n; i++) - bufl->bufers[i].addr = DMA_MAPPING_ERROR; + bufl->buffers[i].addr = DMA_MAPPING_ERROR; sg_nctr = 0; for_each_sg(sgl, sg, n, i) { @@ -263,11 +263,11 @@ static int qat_bl_sgl_map(struct adf_accel_dev *accel_dev, if (!sg->length) continue; - bufl->bufers[y].addr = dma_map_single(dev, sg_virt(sg), - sg->length, - DMA_FROM_DEVICE); - bufl->bufers[y].len = sg->length; - if (unlikely(dma_mapping_error(dev, bufl->bufers[y].addr))) + bufl->buffers[y].addr = dma_map_single(dev, sg_virt(sg), + sg->length, + DMA_FROM_DEVICE); + bufl->buffers[y].len = sg->length; + if (unlikely(dma_mapping_error(dev, bufl->buffers[y].addr))) goto err_map; sg_nctr++; } @@ -280,9 +280,9 @@ static int qat_bl_sgl_map(struct adf_accel_dev *accel_dev, err_map: for (i = 0; i < n; i++) - if (!dma_mapping_error(dev, bufl->bufers[i].addr)) - dma_unmap_single(dev, bufl->bufers[i].addr, - bufl->bufers[i].len, + if (!dma_mapping_error(dev, bufl->buffers[i].addr)) + dma_unmap_single(dev, bufl->buffers[i].addr, + bufl->buffers[i].len, DMA_FROM_DEVICE); kfree(bufl); *bl = NULL; @@ -351,7 +351,7 @@ int qat_bl_realloc_map_new_dst(struct adf_accel_dev *accel_dev, if (ret) return ret; - new_bl_size = struct_size(new_bl, bufers, new_bl->num_bufs); + new_bl_size = struct_size(new_bl, buffers, new_bl->num_bufs); /* Map new firmware SGL descriptor */ new_blp = dma_map_single(dev, new_bl, new_bl_size, DMA_TO_DEVICE); diff --git a/drivers/crypto/qat/qat_common/qat_bl.h b/drivers/crypto/qat/qat_common/qat_bl.h index 8ca5e52ee9e2..1479fef3b634 100644 --- a/drivers/crypto/qat/qat_common/qat_bl.h +++ b/drivers/crypto/qat/qat_common/qat_bl.h @@ -18,7 +18,7 @@ struct qat_alg_buf_list { u64 resrvd; u32 num_bufs; u32 num_mapped_bufs; - struct qat_alg_buf bufers[]; + struct qat_alg_buf buffers[]; } __packed; struct qat_alg_fixed_buf_list { From b0f4f74631979afaff6b5b7b8e54eb1cf2bd5cfa Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Fri, 13 Jan 2023 08:47:15 +0100 Subject: [PATCH 048/156] crypto: atmel-i2c - avoid defines prefixed with CONFIG Defines prefixed with "CONFIG" should be limited to proper Kconfig options, that are introduced in a Kconfig file. Here, a definition for the driver's configuration zone is named CONFIG_ZONE. Rename this local definition to CONFIGURATION_ZONE to avoid defines prefixed with "CONFIG". No functional change. Signed-off-by: Lukas Bulwahn Signed-off-by: Herbert Xu --- drivers/crypto/atmel-i2c.c | 2 +- drivers/crypto/atmel-i2c.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/crypto/atmel-i2c.c b/drivers/crypto/atmel-i2c.c index 81ce09bedda8..e0896224513a 100644 --- a/drivers/crypto/atmel-i2c.c +++ b/drivers/crypto/atmel-i2c.c @@ -59,7 +59,7 @@ void atmel_i2c_init_read_cmd(struct atmel_i2c_cmd *cmd) * Read the word from Configuration zone that contains the lock bytes * (UserExtra, Selector, LockValue, LockConfig). */ - cmd->param1 = CONFIG_ZONE; + cmd->param1 = CONFIGURATION_ZONE; cmd->param2 = cpu_to_le16(DEVICE_LOCK_ADDR); cmd->count = READ_COUNT; diff --git a/drivers/crypto/atmel-i2c.h b/drivers/crypto/atmel-i2c.h index 48929efe2a5b..2872ed8cf3b5 100644 --- a/drivers/crypto/atmel-i2c.h +++ b/drivers/crypto/atmel-i2c.h @@ -63,7 +63,7 @@ struct atmel_i2c_cmd { #define STATUS_WAKE_SUCCESSFUL 0x11 /* Definitions for eeprom organization */ -#define CONFIG_ZONE 0 +#define CONFIGURATION_ZONE 0 /* Definitions for Indexes common to all commands */ #define RSP_DATA_IDX 1 /* buffer index of data in response */ From b5a772adf45a32c68bef28e60621f12617161556 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 13 Jan 2023 18:24:09 +0800 Subject: [PATCH 049/156] crypto: essiv - Handle EBUSY correctly As it is essiv only handles the special return value of EINPROGERSS, which means that in all other cases it will free data related to the request. However, as the caller of essiv may specify MAY_BACKLOG, we also need to expect EBUSY and treat it in the same way. Otherwise backlogged requests will trigger a use-after-free. Fixes: be1eb7f78aa8 ("crypto: essiv - create wrapper template...") Signed-off-by: Herbert Xu Acked-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- crypto/essiv.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/crypto/essiv.c b/crypto/essiv.c index e33369df9034..307eba74b901 100644 --- a/crypto/essiv.c +++ b/crypto/essiv.c @@ -171,7 +171,12 @@ static void essiv_aead_done(struct crypto_async_request *areq, int err) struct aead_request *req = areq->data; struct essiv_aead_request_ctx *rctx = aead_request_ctx(req); + if (err == -EINPROGRESS) + goto out; + kfree(rctx->assoc); + +out: aead_request_complete(req, err); } @@ -247,7 +252,7 @@ static int essiv_aead_crypt(struct aead_request *req, bool enc) err = enc ? crypto_aead_encrypt(subreq) : crypto_aead_decrypt(subreq); - if (rctx->assoc && err != -EINPROGRESS) + if (rctx->assoc && err != -EINPROGRESS && err != -EBUSY) kfree(rctx->assoc); return err; } From 32e62025e5e52fbe4812ef044759de7010b15dbc Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 13 Jan 2023 18:27:51 +0800 Subject: [PATCH 050/156] crypto: seqiv - Handle EBUSY correctly As it is seqiv only handles the special return value of EINPROGERSS, which means that in all other cases it will free data related to the request. However, as the caller of seqiv may specify MAY_BACKLOG, we also need to expect EBUSY and treat it in the same way. Otherwise backlogged requests will trigger a use-after-free. Fixes: 0a270321dbf9 ("[CRYPTO] seqiv: Add Sequence Number IV Generator") Signed-off-by: Herbert Xu --- crypto/seqiv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crypto/seqiv.c b/crypto/seqiv.c index 0899d527c284..b1bcfe537daf 100644 --- a/crypto/seqiv.c +++ b/crypto/seqiv.c @@ -23,7 +23,7 @@ static void seqiv_aead_encrypt_complete2(struct aead_request *req, int err) struct aead_request *subreq = aead_request_ctx(req); struct crypto_aead *geniv; - if (err == -EINPROGRESS) + if (err == -EINPROGRESS || err == -EBUSY) return; if (err) From e3cf2f8794b031ec0e640cb2dff95f45139ac4e9 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Sun, 15 Jan 2023 12:15:34 +0000 Subject: [PATCH 051/156] crypto: x86/aria-avx - fix build failure with old binutils The minimum version of binutils for kernel build is currently 2.23 and it doesn't support GFNI. So, it fails to build the aria-avx if the old binutils is used. The code using GFNI is an optional part of aria-avx. So, it disables GFNI part in it when the old binutils is used. In order to check whether the using binutils is supporting GFNI or not, AS_GFNI is added. Fixes: ba3579e6e45c ("crypto: aria-avx - add AES-NI/AVX/x86_64/GFNI assembler implementation of aria cipher") Reported-by: Jan Beulich Signed-off-by: Taehee Yoo Signed-off-by: Herbert Xu --- arch/x86/Kconfig.assembler | 5 +++++ arch/x86/crypto/aria-aesni-avx-asm_64.S | 10 ++++++++++ arch/x86/crypto/aria_aesni_avx_glue.c | 4 +++- 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/arch/x86/Kconfig.assembler b/arch/x86/Kconfig.assembler index 26b8c08e2fc4..b88f784cb02e 100644 --- a/arch/x86/Kconfig.assembler +++ b/arch/x86/Kconfig.assembler @@ -19,3 +19,8 @@ config AS_TPAUSE def_bool $(as-instr,tpause %ecx) help Supported by binutils >= 2.31.1 and LLVM integrated assembler >= V7 + +config AS_GFNI + def_bool $(as-instr,vgf2p8mulb %xmm0$(comma)%xmm1$(comma)%xmm2) + help + Supported by binutils >= 2.30 and LLVM integrated assembler diff --git a/arch/x86/crypto/aria-aesni-avx-asm_64.S b/arch/x86/crypto/aria-aesni-avx-asm_64.S index be6adc6e7458..fe0d84a7ced1 100644 --- a/arch/x86/crypto/aria-aesni-avx-asm_64.S +++ b/arch/x86/crypto/aria-aesni-avx-asm_64.S @@ -286,6 +286,7 @@ vpbroadcastb ((round * 16) + idx + 4)(rk), t0; \ vpxor t0, x7, x7; +#ifdef CONFIG_AS_GFNI #define aria_sbox_8way_gfni(x0, x1, x2, x3, \ x4, x5, x6, x7, \ t0, t1, t2, t3, \ @@ -308,6 +309,8 @@ vgf2p8affineinvqb $0, t2, x3, x3; \ vgf2p8affineinvqb $0, t2, x7, x7 +#endif /* CONFIG_AS_GFNI */ + #define aria_sbox_8way(x0, x1, x2, x3, \ x4, x5, x6, x7, \ t0, t1, t2, t3, \ @@ -547,6 +550,7 @@ y4, y5, y6, y7, \ mem_tmp, 8); +#ifdef CONFIG_AS_GFNI #define aria_fe_gfni(x0, x1, x2, x3, \ x4, x5, x6, x7, \ y0, y1, y2, y3, \ @@ -701,6 +705,8 @@ y4, y5, y6, y7, \ mem_tmp, 8); +#endif /* CONFIG_AS_GFNI */ + /* NB: section is mergeable, all elements must be aligned 16-byte blocks */ .section .rodata.cst16, "aM", @progbits, 16 .align 16 @@ -752,6 +758,7 @@ .Ltf_hi__x2__and__fwd_aff: .octa 0x3F893781E95FE1576CDA64D2BA0CB204 +#ifdef CONFIG_AS_GFNI .section .rodata.cst8, "aM", @progbits, 8 .align 8 /* AES affine: */ @@ -812,6 +819,7 @@ BV8(0, 0, 0, 0, 0, 1, 0, 0), BV8(0, 0, 0, 0, 0, 0, 1, 0), BV8(0, 0, 0, 0, 0, 0, 0, 1)) +#endif /* CONFIG_AS_GFNI */ /* 4-bit mask */ .section .rodata.cst4.L0f0f0f0f, "aM", @progbits, 4 @@ -1080,6 +1088,7 @@ SYM_TYPED_FUNC_START(aria_aesni_avx_ctr_crypt_16way) RET; SYM_FUNC_END(aria_aesni_avx_ctr_crypt_16way) +#ifdef CONFIG_AS_GFNI SYM_FUNC_START_LOCAL(__aria_aesni_avx_gfni_crypt_16way) /* input: * %r9: rk @@ -1298,3 +1307,4 @@ SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_ctr_crypt_16way) FRAME_END RET; SYM_FUNC_END(aria_aesni_avx_gfni_ctr_crypt_16way) +#endif /* CONFIG_AS_GFNI */ diff --git a/arch/x86/crypto/aria_aesni_avx_glue.c b/arch/x86/crypto/aria_aesni_avx_glue.c index 487094d64863..4e1516b76669 100644 --- a/arch/x86/crypto/aria_aesni_avx_glue.c +++ b/arch/x86/crypto/aria_aesni_avx_glue.c @@ -26,6 +26,7 @@ asmlinkage void aria_aesni_avx_ctr_crypt_16way(const void *ctx, u8 *dst, const u8 *src, u8 *keystream, u8 *iv); EXPORT_SYMBOL_GPL(aria_aesni_avx_ctr_crypt_16way); +#ifdef CONFIG_AS_GFNI asmlinkage void aria_aesni_avx_gfni_encrypt_16way(const void *ctx, u8 *dst, const u8 *src); EXPORT_SYMBOL_GPL(aria_aesni_avx_gfni_encrypt_16way); @@ -36,6 +37,7 @@ asmlinkage void aria_aesni_avx_gfni_ctr_crypt_16way(const void *ctx, u8 *dst, const u8 *src, u8 *keystream, u8 *iv); EXPORT_SYMBOL_GPL(aria_aesni_avx_gfni_ctr_crypt_16way); +#endif /* CONFIG_AS_GFNI */ static struct aria_avx_ops aria_ops; @@ -201,7 +203,7 @@ static int __init aria_avx_init(void) return -ENODEV; } - if (boot_cpu_has(X86_FEATURE_GFNI)) { + if (boot_cpu_has(X86_FEATURE_GFNI) && IS_ENABLED(CONFIG_AS_GFNI)) { aria_ops.aria_encrypt_16way = aria_aesni_avx_gfni_encrypt_16way; aria_ops.aria_decrypt_16way = aria_aesni_avx_gfni_decrypt_16way; aria_ops.aria_ctr_crypt_16way = aria_aesni_avx_gfni_ctr_crypt_16way; From 1eb468b3c7199cf7fe0c34ee37b95ae0e9bcc5fe Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Sun, 15 Jan 2023 12:15:35 +0000 Subject: [PATCH 052/156] crypto: x86/aria-avx2 - fix build failure with old binutils The minimum version of binutils for kernel build is currently 2.23 and it doesn't support GFNI. So, it fails to build the aria-avx2 if the old binutils is used. The code using GFNI is an optional part of aria-avx2. So, it disables GFNI part in it when the old binutils is used. Fixes: 37d8d3ae7a58 ("crypto: x86/aria - implement aria-avx2") Reported-by: Jan Beulich Signed-off-by: Taehee Yoo Signed-off-by: Herbert Xu --- arch/x86/crypto/aria-aesni-avx2-asm_64.S | 10 +++++++++- arch/x86/crypto/aria_aesni_avx2_glue.c | 4 +++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/arch/x86/crypto/aria-aesni-avx2-asm_64.S b/arch/x86/crypto/aria-aesni-avx2-asm_64.S index b6cac9a40f2c..82a14b4ad920 100644 --- a/arch/x86/crypto/aria-aesni-avx2-asm_64.S +++ b/arch/x86/crypto/aria-aesni-avx2-asm_64.S @@ -302,6 +302,7 @@ vpbroadcastb ((round * 16) + idx + 4)(rk), t0; \ vpxor t0, x7, x7; +#ifdef CONFIG_AS_GFNI #define aria_sbox_8way_gfni(x0, x1, x2, x3, \ x4, x5, x6, x7, \ t0, t1, t2, t3, \ @@ -324,6 +325,7 @@ vgf2p8affineinvqb $0, t2, x3, x3; \ vgf2p8affineinvqb $0, t2, x7, x7 +#endif /* CONFIG_AS_GFNI */ #define aria_sbox_8way(x0, x1, x2, x3, \ x4, x5, x6, x7, \ t0, t1, t2, t3, \ @@ -596,7 +598,7 @@ aria_load_state_8way(y0, y1, y2, y3, \ y4, y5, y6, y7, \ mem_tmp, 8); - +#ifdef CONFIG_AS_GFNI #define aria_fe_gfni(x0, x1, x2, x3, \ x4, x5, x6, x7, \ y0, y1, y2, y3, \ @@ -750,6 +752,7 @@ aria_load_state_8way(y0, y1, y2, y3, \ y4, y5, y6, y7, \ mem_tmp, 8); +#endif /* CONFIG_AS_GFNI */ .section .rodata.cst32.shufb_16x16b, "aM", @progbits, 32 .align 32 @@ -803,6 +806,7 @@ .Ltf_hi__x2__and__fwd_aff: .octa 0x3F893781E95FE1576CDA64D2BA0CB204 +#ifdef CONFIG_AS_GFNI .section .rodata.cst8, "aM", @progbits, 8 .align 8 /* AES affine: */ @@ -864,6 +868,8 @@ BV8(0, 0, 0, 0, 0, 0, 1, 0), BV8(0, 0, 0, 0, 0, 0, 0, 1)) +#endif /* CONFIG_AS_GFNI */ + /* 4-bit mask */ .section .rodata.cst4.L0f0f0f0f, "aM", @progbits, 4 .align 4 @@ -1213,6 +1219,7 @@ SYM_TYPED_FUNC_START(aria_aesni_avx2_ctr_crypt_32way) RET; SYM_FUNC_END(aria_aesni_avx2_ctr_crypt_32way) +#ifdef CONFIG_AS_GFNI SYM_FUNC_START_LOCAL(__aria_aesni_avx2_gfni_crypt_32way) /* input: * %r9: rk @@ -1431,3 +1438,4 @@ SYM_TYPED_FUNC_START(aria_aesni_avx2_gfni_ctr_crypt_32way) FRAME_END RET; SYM_FUNC_END(aria_aesni_avx2_gfni_ctr_crypt_32way) +#endif /* CONFIG_AS_GFNI */ diff --git a/arch/x86/crypto/aria_aesni_avx2_glue.c b/arch/x86/crypto/aria_aesni_avx2_glue.c index 95fccc6dc420..87a11804fc77 100644 --- a/arch/x86/crypto/aria_aesni_avx2_glue.c +++ b/arch/x86/crypto/aria_aesni_avx2_glue.c @@ -26,6 +26,7 @@ asmlinkage void aria_aesni_avx2_ctr_crypt_32way(const void *ctx, u8 *dst, const u8 *src, u8 *keystream, u8 *iv); EXPORT_SYMBOL_GPL(aria_aesni_avx2_ctr_crypt_32way); +#ifdef CONFIG_AS_GFNI asmlinkage void aria_aesni_avx2_gfni_encrypt_32way(const void *ctx, u8 *dst, const u8 *src); EXPORT_SYMBOL_GPL(aria_aesni_avx2_gfni_encrypt_32way); @@ -36,6 +37,7 @@ asmlinkage void aria_aesni_avx2_gfni_ctr_crypt_32way(const void *ctx, u8 *dst, const u8 *src, u8 *keystream, u8 *iv); EXPORT_SYMBOL_GPL(aria_aesni_avx2_gfni_ctr_crypt_32way); +#endif /* CONFIG_AS_GFNI */ static struct aria_avx_ops aria_ops; @@ -215,7 +217,7 @@ static int __init aria_avx2_init(void) return -ENODEV; } - if (boot_cpu_has(X86_FEATURE_GFNI)) { + if (boot_cpu_has(X86_FEATURE_GFNI) && IS_ENABLED(CONFIG_AS_GFNI)) { aria_ops.aria_encrypt_16way = aria_aesni_avx_gfni_encrypt_16way; aria_ops.aria_decrypt_16way = aria_aesni_avx_gfni_decrypt_16way; aria_ops.aria_ctr_crypt_16way = aria_aesni_avx_gfni_ctr_crypt_16way; From d6b7ec11062e116f44a504f73f16c97e1cc53bd0 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Sun, 15 Jan 2023 12:15:36 +0000 Subject: [PATCH 053/156] crypto: x86/aria-avx512 - fix build failure with old binutils The minimum version of binutils for kernel build is currently 2.23 and it doesn't support GFNI. So, it fails to build the aria-avx512 if the old binutils is used. aria-avx512 requires GFNI, so it should not be allowed to build if the old binutils is used. The AS_AVX512 and AS_GFNI are added to the Kconfig to disable build aria-avx512 if the old binutils is used. Fixes: c970d42001f2 ("crypto: x86/aria - implement aria-avx512") Reported-by: Jan Beulich Signed-off-by: Taehee Yoo Signed-off-by: Herbert Xu --- arch/x86/crypto/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/crypto/Kconfig b/arch/x86/crypto/Kconfig index 688e848f740d..9bbfd01cfa2f 100644 --- a/arch/x86/crypto/Kconfig +++ b/arch/x86/crypto/Kconfig @@ -325,7 +325,7 @@ config CRYPTO_ARIA_AESNI_AVX2_X86_64 config CRYPTO_ARIA_GFNI_AVX512_X86_64 tristate "Ciphers: ARIA with modes: ECB, CTR (AVX512/GFNI)" - depends on X86 && 64BIT + depends on X86 && 64BIT && AS_AVX512 && AS_GFNI select CRYPTO_SKCIPHER select CRYPTO_SIMD select CRYPTO_ALGAPI From 675c39196ce3b47307dfc339dade6e6946ee1e1b Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 16 Jan 2023 14:03:26 -0600 Subject: [PATCH 054/156] crypto: ccp - Provide MMIO register naming for documenation Add comments next to the version data MMIO register values to identify the register name being used. Signed-off-by: Tom Lendacky Signed-off-by: Herbert Xu --- drivers/crypto/ccp/sp-pci.c | 46 ++++++++++++++++++------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/drivers/crypto/ccp/sp-pci.c b/drivers/crypto/ccp/sp-pci.c index 084d052fddcc..cde33b2ac71b 100644 --- a/drivers/crypto/ccp/sp-pci.c +++ b/drivers/crypto/ccp/sp-pci.c @@ -342,52 +342,52 @@ static int __maybe_unused sp_pci_resume(struct device *dev) #ifdef CONFIG_CRYPTO_DEV_SP_PSP static const struct sev_vdata sevv1 = { - .cmdresp_reg = 0x10580, - .cmdbuff_addr_lo_reg = 0x105e0, - .cmdbuff_addr_hi_reg = 0x105e4, + .cmdresp_reg = 0x10580, /* C2PMSG_32 */ + .cmdbuff_addr_lo_reg = 0x105e0, /* C2PMSG_56 */ + .cmdbuff_addr_hi_reg = 0x105e4, /* C2PMSG_57 */ }; static const struct sev_vdata sevv2 = { - .cmdresp_reg = 0x10980, - .cmdbuff_addr_lo_reg = 0x109e0, - .cmdbuff_addr_hi_reg = 0x109e4, + .cmdresp_reg = 0x10980, /* C2PMSG_32 */ + .cmdbuff_addr_lo_reg = 0x109e0, /* C2PMSG_56 */ + .cmdbuff_addr_hi_reg = 0x109e4, /* C2PMSG_57 */ }; static const struct tee_vdata teev1 = { - .cmdresp_reg = 0x10544, - .cmdbuff_addr_lo_reg = 0x10548, - .cmdbuff_addr_hi_reg = 0x1054c, - .ring_wptr_reg = 0x10550, - .ring_rptr_reg = 0x10554, + .cmdresp_reg = 0x10544, /* C2PMSG_17 */ + .cmdbuff_addr_lo_reg = 0x10548, /* C2PMSG_18 */ + .cmdbuff_addr_hi_reg = 0x1054c, /* C2PMSG_19 */ + .ring_wptr_reg = 0x10550, /* C2PMSG_20 */ + .ring_rptr_reg = 0x10554, /* C2PMSG_21 */ }; static const struct psp_vdata pspv1 = { .sev = &sevv1, - .feature_reg = 0x105fc, - .inten_reg = 0x10610, - .intsts_reg = 0x10614, + .feature_reg = 0x105fc, /* C2PMSG_63 */ + .inten_reg = 0x10610, /* P2CMSG_INTEN */ + .intsts_reg = 0x10614, /* P2CMSG_INTSTS */ }; static const struct psp_vdata pspv2 = { .sev = &sevv2, - .feature_reg = 0x109fc, - .inten_reg = 0x10690, - .intsts_reg = 0x10694, + .feature_reg = 0x109fc, /* C2PMSG_63 */ + .inten_reg = 0x10690, /* P2CMSG_INTEN */ + .intsts_reg = 0x10694, /* P2CMSG_INTSTS */ }; static const struct psp_vdata pspv3 = { .tee = &teev1, - .feature_reg = 0x109fc, - .inten_reg = 0x10690, - .intsts_reg = 0x10694, + .feature_reg = 0x109fc, /* C2PMSG_63 */ + .inten_reg = 0x10690, /* P2CMSG_INTEN */ + .intsts_reg = 0x10694, /* P2CMSG_INTSTS */ }; static const struct psp_vdata pspv4 = { .sev = &sevv2, .tee = &teev1, - .feature_reg = 0x109fc, - .inten_reg = 0x10690, - .intsts_reg = 0x10694, + .feature_reg = 0x109fc, /* C2PMSG_63 */ + .inten_reg = 0x10690, /* P2CMSG_INTEN */ + .intsts_reg = 0x10694, /* P2CMSG_INTSTS */ }; #endif From 4fc790d7d6b15eb4b90d297eb8c93589748e0c49 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 16 Jan 2023 14:04:40 -0600 Subject: [PATCH 055/156] crypto: ccp - Add a firmware definition for EPYC gen 4 processors Add a new MODULE_FIRMWARE() entry for 4th generation EPYC processors. Signed-off-by: Tom Lendacky Signed-off-by: Herbert Xu --- drivers/crypto/ccp/sev-dev.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c index 56998bc579d6..13f1c88810d8 100644 --- a/drivers/crypto/ccp/sev-dev.c +++ b/drivers/crypto/ccp/sev-dev.c @@ -56,6 +56,7 @@ MODULE_PARM_DESC(psp_init_on_probe, " if true, the PSP will be initialized on m MODULE_FIRMWARE("amd/amd_sev_fam17h_model0xh.sbin"); /* 1st gen EPYC */ MODULE_FIRMWARE("amd/amd_sev_fam17h_model3xh.sbin"); /* 2nd gen EPYC */ MODULE_FIRMWARE("amd/amd_sev_fam19h_model0xh.sbin"); /* 3rd gen EPYC */ +MODULE_FIRMWARE("amd/amd_sev_fam19h_model1xh.sbin"); /* 4th gen EPYC */ static bool psp_dead; static int psp_timeout; From a482b02d6ae0d82c3c616ec0fb2a7b3080707e48 Mon Sep 17 00:00:00 2001 From: Jia Jie Ho Date: Tue, 17 Jan 2023 09:54:43 +0800 Subject: [PATCH 056/156] dt-bindings: rng: Add StarFive TRNG module Add documentation to describe Starfive true random number generator module. Co-developed-by: Jenny Zhang Signed-off-by: Jenny Zhang Signed-off-by: Jia Jie Ho Reviewed-by: Krzysztof Kozlowski Reviewed-by: Conor Dooley Signed-off-by: Herbert Xu --- .../bindings/rng/starfive,jh7110-trng.yaml | 55 +++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 Documentation/devicetree/bindings/rng/starfive,jh7110-trng.yaml diff --git a/Documentation/devicetree/bindings/rng/starfive,jh7110-trng.yaml b/Documentation/devicetree/bindings/rng/starfive,jh7110-trng.yaml new file mode 100644 index 000000000000..2b76ce25acc4 --- /dev/null +++ b/Documentation/devicetree/bindings/rng/starfive,jh7110-trng.yaml @@ -0,0 +1,55 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/rng/starfive,jh7110-trng.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: StarFive SoC TRNG Module + +maintainers: + - Jia Jie Ho + +properties: + compatible: + const: starfive,jh7110-trng + + reg: + maxItems: 1 + + clocks: + items: + - description: Hardware reference clock + - description: AHB reference clock + + clock-names: + items: + - const: hclk + - const: ahb + + resets: + maxItems: 1 + + interrupts: + maxItems: 1 + +required: + - compatible + - reg + - clocks + - clock-names + - resets + - interrupts + +additionalProperties: false + +examples: + - | + rng: rng@1600C000 { + compatible = "starfive,jh7110-trng"; + reg = <0x1600C000 0x4000>; + clocks = <&clk 15>, <&clk 16>; + clock-names = "hclk", "ahb"; + resets = <&reset 3>; + interrupts = <30>; + }; +... From c388f458bc34eb3a5728b67f6614f9375cd99087 Mon Sep 17 00:00:00 2001 From: Jia Jie Ho Date: Tue, 17 Jan 2023 09:54:44 +0800 Subject: [PATCH 057/156] hwrng: starfive - Add TRNG driver for StarFive SoC This adds driver support for the hardware random number generator in Starfive SoCs and adds StarFive TRNG entry to MAINTAINERS. Co-developed-by: Jenny Zhang Signed-off-by: Jenny Zhang Signed-off-by: Jia Jie Ho Signed-off-by: Herbert Xu --- MAINTAINERS | 6 + drivers/char/hw_random/Kconfig | 11 + drivers/char/hw_random/Makefile | 1 + drivers/char/hw_random/jh7110-trng.c | 393 +++++++++++++++++++++++++++ 4 files changed, 411 insertions(+) create mode 100644 drivers/char/hw_random/jh7110-trng.c diff --git a/MAINTAINERS b/MAINTAINERS index c4b4b9283d18..4f59559597ab 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -19905,6 +19905,12 @@ F: Documentation/devicetree/bindings/reset/starfive,jh7100-reset.yaml F: drivers/reset/reset-starfive-jh7100.c F: include/dt-bindings/reset/starfive-jh7100.h +STARFIVE TRNG DRIVER +M: Jia Jie Ho +S: Supported +F: Documentation/devicetree/bindings/rng/starfive* +F: drivers/char/hw_random/starfive-trng.c + STATIC BRANCH/CALL M: Peter Zijlstra M: Josh Poimboeuf diff --git a/drivers/char/hw_random/Kconfig b/drivers/char/hw_random/Kconfig index 3da8e85f8aae..be202959efa6 100644 --- a/drivers/char/hw_random/Kconfig +++ b/drivers/char/hw_random/Kconfig @@ -549,6 +549,17 @@ config HW_RANDOM_CN10K To compile this driver as a module, choose M here. The module will be called cn10k_rng. If unsure, say Y. +config HW_RANDOM_JH7110 + tristate "StarFive JH7110 Random Number Generator support" + depends on SOC_STARFIVE + depends on HW_RANDOM + help + This driver provides support for the True Random Number + Generator in StarFive JH7110 SoCs. + + To compile this driver as a module, choose M here. + The module will be called jh7110-trng. + endif # HW_RANDOM config UML_RANDOM diff --git a/drivers/char/hw_random/Makefile b/drivers/char/hw_random/Makefile index 3e948cf04476..09bde4a0f971 100644 --- a/drivers/char/hw_random/Makefile +++ b/drivers/char/hw_random/Makefile @@ -47,3 +47,4 @@ obj-$(CONFIG_HW_RANDOM_XIPHERA) += xiphera-trng.o obj-$(CONFIG_HW_RANDOM_ARM_SMCCC_TRNG) += arm_smccc_trng.o obj-$(CONFIG_HW_RANDOM_CN10K) += cn10k-rng.o obj-$(CONFIG_HW_RANDOM_POLARFIRE_SOC) += mpfs-rng.o +obj-$(CONFIG_HW_RANDOM_JH7110) += jh7110-trng.o diff --git a/drivers/char/hw_random/jh7110-trng.c b/drivers/char/hw_random/jh7110-trng.c new file mode 100644 index 000000000000..38474d48a25e --- /dev/null +++ b/drivers/char/hw_random/jh7110-trng.c @@ -0,0 +1,393 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * TRNG driver for the StarFive JH7110 SoC + * + * Copyright (C) 2022 StarFive Technology Co. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* trng register offset */ +#define STARFIVE_CTRL 0x00 +#define STARFIVE_STAT 0x04 +#define STARFIVE_MODE 0x08 +#define STARFIVE_SMODE 0x0C +#define STARFIVE_IE 0x10 +#define STARFIVE_ISTAT 0x14 +#define STARFIVE_RAND0 0x20 +#define STARFIVE_RAND1 0x24 +#define STARFIVE_RAND2 0x28 +#define STARFIVE_RAND3 0x2C +#define STARFIVE_RAND4 0x30 +#define STARFIVE_RAND5 0x34 +#define STARFIVE_RAND6 0x38 +#define STARFIVE_RAND7 0x3C +#define STARFIVE_AUTO_RQSTS 0x60 +#define STARFIVE_AUTO_AGE 0x64 + +/* CTRL CMD */ +#define STARFIVE_CTRL_EXEC_NOP 0x0 +#define STARFIVE_CTRL_GENE_RANDNUM 0x1 +#define STARFIVE_CTRL_EXEC_RANDRESEED 0x2 + +/* STAT */ +#define STARFIVE_STAT_NONCE_MODE BIT(2) +#define STARFIVE_STAT_R256 BIT(3) +#define STARFIVE_STAT_MISSION_MODE BIT(8) +#define STARFIVE_STAT_SEEDED BIT(9) +#define STARFIVE_STAT_LAST_RESEED(x) ((x) << 16) +#define STARFIVE_STAT_SRVC_RQST BIT(27) +#define STARFIVE_STAT_RAND_GENERATING BIT(30) +#define STARFIVE_STAT_RAND_SEEDING BIT(31) + +/* MODE */ +#define STARFIVE_MODE_R256 BIT(3) + +/* SMODE */ +#define STARFIVE_SMODE_NONCE_MODE BIT(2) +#define STARFIVE_SMODE_MISSION_MODE BIT(8) +#define STARFIVE_SMODE_MAX_REJECTS(x) ((x) << 16) + +/* IE */ +#define STARFIVE_IE_RAND_RDY_EN BIT(0) +#define STARFIVE_IE_SEED_DONE_EN BIT(1) +#define STARFIVE_IE_LFSR_LOCKUP_EN BIT(4) +#define STARFIVE_IE_GLBL_EN BIT(31) + +#define STARFIVE_IE_ALL (STARFIVE_IE_GLBL_EN | \ + STARFIVE_IE_RAND_RDY_EN | \ + STARFIVE_IE_SEED_DONE_EN | \ + STARFIVE_IE_LFSR_LOCKUP_EN) + +/* ISTAT */ +#define STARFIVE_ISTAT_RAND_RDY BIT(0) +#define STARFIVE_ISTAT_SEED_DONE BIT(1) +#define STARFIVE_ISTAT_LFSR_LOCKUP BIT(4) + +#define STARFIVE_RAND_LEN sizeof(u32) + +#define to_trng(p) container_of(p, struct starfive_trng, rng) + +enum reseed { + RANDOM_RESEED, + NONCE_RESEED, +}; + +enum mode { + PRNG_128BIT, + PRNG_256BIT, +}; + +struct starfive_trng { + struct device *dev; + void __iomem *base; + struct clk *hclk; + struct clk *ahb; + struct reset_control *rst; + struct hwrng rng; + struct completion random_done; + struct completion reseed_done; + u32 mode; + u32 mission; + u32 reseed; + /* protects against concurrent write to ctrl register */ + spinlock_t write_lock; +}; + +static u16 autoreq; +module_param(autoreq, ushort, 0); +MODULE_PARM_DESC(autoreq, "Auto-reseeding after random number requests by host reaches specified counter:\n" + " 0 - disable counter\n" + " other - reload value for internal counter"); + +static u16 autoage; +module_param(autoage, ushort, 0); +MODULE_PARM_DESC(autoage, "Auto-reseeding after specified timer countdowns to 0:\n" + " 0 - disable timer\n" + " other - reload value for internal timer"); + +static inline int starfive_trng_wait_idle(struct starfive_trng *trng) +{ + u32 stat; + + return readl_relaxed_poll_timeout(trng->base + STARFIVE_STAT, stat, + !(stat & (STARFIVE_STAT_RAND_GENERATING | + STARFIVE_STAT_RAND_SEEDING)), + 10, 100000); +} + +static inline void starfive_trng_irq_mask_clear(struct starfive_trng *trng) +{ + /* clear register: ISTAT */ + u32 data = readl(trng->base + STARFIVE_ISTAT); + + writel(data, trng->base + STARFIVE_ISTAT); +} + +static int starfive_trng_cmd(struct starfive_trng *trng, u32 cmd, bool wait) +{ + int wait_time = 1000; + + /* allow up to 40 us for wait == 0 */ + if (!wait) + wait_time = 40; + + switch (cmd) { + case STARFIVE_CTRL_GENE_RANDNUM: + reinit_completion(&trng->random_done); + spin_lock_irq(&trng->write_lock); + writel(cmd, trng->base + STARFIVE_CTRL); + spin_unlock_irq(&trng->write_lock); + if (!wait_for_completion_timeout(&trng->random_done, usecs_to_jiffies(wait_time))) + return -ETIMEDOUT; + break; + case STARFIVE_CTRL_EXEC_RANDRESEED: + reinit_completion(&trng->reseed_done); + spin_lock_irq(&trng->write_lock); + writel(cmd, trng->base + STARFIVE_CTRL); + spin_unlock_irq(&trng->write_lock); + if (!wait_for_completion_timeout(&trng->reseed_done, usecs_to_jiffies(wait_time))) + return -ETIMEDOUT; + break; + default: + return -EINVAL; + } + + return 0; +} + +static int starfive_trng_init(struct hwrng *rng) +{ + struct starfive_trng *trng = to_trng(rng); + u32 mode, intr = 0; + + /* setup Auto Request/Age register */ + writel(autoage, trng->base + STARFIVE_AUTO_AGE); + writel(autoreq, trng->base + STARFIVE_AUTO_RQSTS); + + /* clear register: ISTAT */ + starfive_trng_irq_mask_clear(trng); + + intr |= STARFIVE_IE_ALL; + writel(intr, trng->base + STARFIVE_IE); + + mode = readl(trng->base + STARFIVE_MODE); + + switch (trng->mode) { + case PRNG_128BIT: + mode &= ~STARFIVE_MODE_R256; + break; + case PRNG_256BIT: + mode |= STARFIVE_MODE_R256; + break; + default: + mode |= STARFIVE_MODE_R256; + break; + } + + writel(mode, trng->base + STARFIVE_MODE); + + return starfive_trng_cmd(trng, STARFIVE_CTRL_EXEC_RANDRESEED, 1); +} + +static irqreturn_t starfive_trng_irq(int irq, void *priv) +{ + u32 status; + struct starfive_trng *trng = (struct starfive_trng *)priv; + + status = readl(trng->base + STARFIVE_ISTAT); + if (status & STARFIVE_ISTAT_RAND_RDY) { + writel(STARFIVE_ISTAT_RAND_RDY, trng->base + STARFIVE_ISTAT); + complete(&trng->random_done); + } + + if (status & STARFIVE_ISTAT_SEED_DONE) { + writel(STARFIVE_ISTAT_SEED_DONE, trng->base + STARFIVE_ISTAT); + complete(&trng->reseed_done); + } + + if (status & STARFIVE_ISTAT_LFSR_LOCKUP) { + writel(STARFIVE_ISTAT_LFSR_LOCKUP, trng->base + STARFIVE_ISTAT); + /* SEU occurred, reseeding required*/ + spin_lock(&trng->write_lock); + writel(STARFIVE_CTRL_EXEC_RANDRESEED, trng->base + STARFIVE_CTRL); + spin_unlock(&trng->write_lock); + } + + return IRQ_HANDLED; +} + +static void starfive_trng_cleanup(struct hwrng *rng) +{ + struct starfive_trng *trng = to_trng(rng); + + writel(0, trng->base + STARFIVE_CTRL); + + reset_control_assert(trng->rst); + clk_disable_unprepare(trng->hclk); + clk_disable_unprepare(trng->ahb); +} + +static int starfive_trng_read(struct hwrng *rng, void *buf, size_t max, bool wait) +{ + struct starfive_trng *trng = to_trng(rng); + int ret; + + pm_runtime_get_sync(trng->dev); + + if (trng->mode == PRNG_256BIT) + max = min_t(size_t, max, (STARFIVE_RAND_LEN * 8)); + else + max = min_t(size_t, max, (STARFIVE_RAND_LEN * 4)); + + if (wait) { + ret = starfive_trng_wait_idle(trng); + if (ret) + return -ETIMEDOUT; + } + + ret = starfive_trng_cmd(trng, STARFIVE_CTRL_GENE_RANDNUM, wait); + if (ret) + return ret; + + memcpy_fromio(buf, trng->base + STARFIVE_RAND0, max); + + pm_runtime_put_sync_autosuspend(trng->dev); + + return max; +} + +static int starfive_trng_probe(struct platform_device *pdev) +{ + int ret; + int irq; + struct starfive_trng *trng; + + trng = devm_kzalloc(&pdev->dev, sizeof(*trng), GFP_KERNEL); + if (!trng) + return -ENOMEM; + + platform_set_drvdata(pdev, trng); + trng->dev = &pdev->dev; + + trng->base = devm_platform_ioremap_resource(pdev, 0); + if (IS_ERR(trng->base)) + return dev_err_probe(&pdev->dev, PTR_ERR(trng->base), + "Error remapping memory for platform device.\n"); + + irq = platform_get_irq(pdev, 0); + if (irq < 0) + return irq; + + init_completion(&trng->random_done); + init_completion(&trng->reseed_done); + spin_lock_init(&trng->write_lock); + + ret = devm_request_irq(&pdev->dev, irq, starfive_trng_irq, 0, pdev->name, + (void *)trng); + if (ret) + return dev_err_probe(&pdev->dev, irq, + "Failed to register interrupt handler\n"); + + trng->hclk = devm_clk_get(&pdev->dev, "hclk"); + if (IS_ERR(trng->hclk)) + return dev_err_probe(&pdev->dev, PTR_ERR(trng->hclk), + "Error getting hardware reference clock\n"); + + trng->ahb = devm_clk_get(&pdev->dev, "ahb"); + if (IS_ERR(trng->ahb)) + return dev_err_probe(&pdev->dev, PTR_ERR(trng->ahb), + "Error getting ahb reference clock\n"); + + trng->rst = devm_reset_control_get_shared(&pdev->dev, NULL); + if (IS_ERR(trng->rst)) + return dev_err_probe(&pdev->dev, PTR_ERR(trng->rst), + "Error getting hardware reset line\n"); + + clk_prepare_enable(trng->hclk); + clk_prepare_enable(trng->ahb); + reset_control_deassert(trng->rst); + + trng->rng.name = dev_driver_string(&pdev->dev); + trng->rng.init = starfive_trng_init; + trng->rng.cleanup = starfive_trng_cleanup; + trng->rng.read = starfive_trng_read; + + trng->mode = PRNG_256BIT; + trng->mission = 1; + trng->reseed = RANDOM_RESEED; + + pm_runtime_use_autosuspend(&pdev->dev); + pm_runtime_set_autosuspend_delay(&pdev->dev, 100); + pm_runtime_enable(&pdev->dev); + + ret = devm_hwrng_register(&pdev->dev, &trng->rng); + if (ret) { + pm_runtime_disable(&pdev->dev); + + reset_control_assert(trng->rst); + clk_disable_unprepare(trng->ahb); + clk_disable_unprepare(trng->hclk); + + return dev_err_probe(&pdev->dev, ret, "Failed to register hwrng\n"); + } + + return 0; +} + +static int __maybe_unused starfive_trng_suspend(struct device *dev) +{ + struct starfive_trng *trng = dev_get_drvdata(dev); + + clk_disable_unprepare(trng->hclk); + clk_disable_unprepare(trng->ahb); + + return 0; +} + +static int __maybe_unused starfive_trng_resume(struct device *dev) +{ + struct starfive_trng *trng = dev_get_drvdata(dev); + + clk_prepare_enable(trng->hclk); + clk_prepare_enable(trng->ahb); + + return 0; +} + +static DEFINE_SIMPLE_DEV_PM_OPS(starfive_trng_pm_ops, starfive_trng_suspend, + starfive_trng_resume); + +static const struct of_device_id trng_dt_ids[] __maybe_unused = { + { .compatible = "starfive,jh7110-trng" }, + { } +}; +MODULE_DEVICE_TABLE(of, trng_dt_ids); + +static struct platform_driver starfive_trng_driver = { + .probe = starfive_trng_probe, + .driver = { + .name = "jh7110-trng", + .pm = &starfive_trng_pm_ops, + .of_match_table = of_match_ptr(trng_dt_ids), + }, +}; + +module_platform_driver(starfive_trng_driver); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("StarFive True Random Number Generator"); From 1b4744e107e3d6eeda6473fefee5e2b904b85b8b Mon Sep 17 00:00:00 2001 From: Vladis Dronov Date: Tue, 17 Jan 2023 18:20:06 +0100 Subject: [PATCH 058/156] crypto: testmgr - disallow certain DRBG hash functions in FIPS mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit According to FIPS 140-3 IG, section D.R "Hash Functions Acceptable for Use in the SP 800-90A DRBGs", modules certified after May 16th, 2023 must not support the use of: SHA-224, SHA-384, SHA512-224, SHA512-256, SHA3-224, SHA3-384. Disallow HMAC and HASH DRBGs using SHA-384 in FIPS mode. Signed-off-by: Vladis Dronov Reviewed-by: Stephan Müller Signed-off-by: Herbert Xu --- crypto/testmgr.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/crypto/testmgr.c b/crypto/testmgr.c index 795c4858c741..dd748832ed4a 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -4781,7 +4781,6 @@ static const struct alg_test_desc alg_test_descs[] = { }, { /* covered by drbg_nopr_hmac_sha256 test */ .alg = "drbg_nopr_hmac_sha384", - .fips_allowed = 1, .test = alg_test_null, }, { .alg = "drbg_nopr_hmac_sha512", @@ -4804,7 +4803,6 @@ static const struct alg_test_desc alg_test_descs[] = { }, { /* covered by drbg_nopr_sha256 test */ .alg = "drbg_nopr_sha384", - .fips_allowed = 1, .test = alg_test_null, }, { .alg = "drbg_nopr_sha512", @@ -4840,7 +4838,6 @@ static const struct alg_test_desc alg_test_descs[] = { }, { /* covered by drbg_pr_hmac_sha256 test */ .alg = "drbg_pr_hmac_sha384", - .fips_allowed = 1, .test = alg_test_null, }, { .alg = "drbg_pr_hmac_sha512", @@ -4860,7 +4857,6 @@ static const struct alg_test_desc alg_test_descs[] = { }, { /* covered by drbg_pr_sha256 test */ .alg = "drbg_pr_sha384", - .fips_allowed = 1, .test = alg_test_null, }, { .alg = "drbg_pr_sha512", From aaf16cdca641267c93aa0826de5f13582304bcb8 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Thu, 19 Jan 2023 09:48:59 +0800 Subject: [PATCH 059/156] crypto: aspeed - change aspeed_acry_akcipher_algs to static aspeed_acry_akcipher_algs is only used in aspeed-acry.c now, change it to static. Signed-off-by: Yang Yingliang Acked-by: Andrew Jeffery Reviewed-by: Neal Liu Signed-off-by: Herbert Xu --- drivers/crypto/aspeed/aspeed-acry.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/crypto/aspeed/aspeed-acry.c b/drivers/crypto/aspeed/aspeed-acry.c index 6d3790583f8b..164c524015f0 100644 --- a/drivers/crypto/aspeed/aspeed-acry.c +++ b/drivers/crypto/aspeed/aspeed-acry.c @@ -603,7 +603,7 @@ static void aspeed_acry_rsa_exit_tfm(struct crypto_akcipher *tfm) crypto_free_akcipher(ctx->fallback_tfm); } -struct aspeed_acry_alg aspeed_acry_akcipher_algs[] = { +static struct aspeed_acry_alg aspeed_acry_akcipher_algs[] = { { .akcipher = { .encrypt = aspeed_acry_rsa_enc, From e9040736d24852a618602300beec1f35fe0c42f8 Mon Sep 17 00:00:00 2001 From: ye xingchen Date: Thu, 19 Jan 2023 15:36:49 +0800 Subject: [PATCH 060/156] crypto: aspeed - Use devm_platform_get_and_ioremap_resource() Convert platform_get_resource(), devm_ioremap_resource() to a single call to devm_platform_get_and_ioremap_resource(), as this is exactly what this function does. Signed-off-by: ye xingchen Signed-off-by: Herbert Xu --- drivers/crypto/aspeed/aspeed-hace.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/crypto/aspeed/aspeed-hace.c b/drivers/crypto/aspeed/aspeed-hace.c index 656cb92c8bb6..d2871e1de9c2 100644 --- a/drivers/crypto/aspeed/aspeed-hace.c +++ b/drivers/crypto/aspeed/aspeed-hace.c @@ -99,7 +99,6 @@ static int aspeed_hace_probe(struct platform_device *pdev) const struct of_device_id *hace_dev_id; struct aspeed_engine_hash *hash_engine; struct aspeed_hace_dev *hace_dev; - struct resource *res; int rc; hace_dev = devm_kzalloc(&pdev->dev, sizeof(struct aspeed_hace_dev), @@ -118,11 +117,9 @@ static int aspeed_hace_probe(struct platform_device *pdev) hash_engine = &hace_dev->hash_engine; crypto_engine = &hace_dev->crypto_engine; - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - platform_set_drvdata(pdev, hace_dev); - hace_dev->regs = devm_ioremap_resource(&pdev->dev, res); + hace_dev->regs = devm_platform_get_and_ioremap_resource(pdev, 0, NULL); if (IS_ERR(hace_dev->regs)) return PTR_ERR(hace_dev->regs); From e16dda2b69ba36c01522735221f08e9dd8f438d3 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 19 Jan 2023 17:01:39 +0800 Subject: [PATCH 061/156] crypto: cryptd - Remove unnecessary skcipher_request_zero Previously the child skcipher request was stored on the stack and therefore needed to be zeroed. As it is now dynamically allocated we no longer need to do so. Signed-off-by: Herbert Xu --- crypto/cryptd.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/crypto/cryptd.c b/crypto/cryptd.c index ca3a40fc7da9..1ff58a021d57 100644 --- a/crypto/cryptd.c +++ b/crypto/cryptd.c @@ -272,7 +272,6 @@ static void cryptd_skcipher_encrypt(struct crypto_async_request *base, req->iv); err = crypto_skcipher_encrypt(subreq); - skcipher_request_zero(subreq); req->base.complete = rctx->complete; @@ -300,7 +299,6 @@ static void cryptd_skcipher_decrypt(struct crypto_async_request *base, req->iv); err = crypto_skcipher_decrypt(subreq); - skcipher_request_zero(subreq); req->base.complete = rctx->complete; From 654627ad0beeb8658cbb809028a0e17df961ee9d Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 19 Jan 2023 18:36:58 +0800 Subject: [PATCH 062/156] crypto: bcm - Use subrequest for fallback Instead of doing saving and restoring on the AEAD request object for fallback processing, use a subrequest instead. Signed-off-by: Herbert Xu --- drivers/crypto/bcm/cipher.c | 96 +++++++++++++------------------------ drivers/crypto/bcm/cipher.h | 7 +-- 2 files changed, 35 insertions(+), 68 deletions(-) diff --git a/drivers/crypto/bcm/cipher.c b/drivers/crypto/bcm/cipher.c index c8c799428fe0..f8e035039aeb 100644 --- a/drivers/crypto/bcm/cipher.c +++ b/drivers/crypto/bcm/cipher.c @@ -2570,66 +2570,29 @@ static int aead_need_fallback(struct aead_request *req) return payload_len > ctx->max_payload; } -static void aead_complete(struct crypto_async_request *areq, int err) -{ - struct aead_request *req = - container_of(areq, struct aead_request, base); - struct iproc_reqctx_s *rctx = aead_request_ctx(req); - struct crypto_aead *aead = crypto_aead_reqtfm(req); - - flow_log("%s() err:%d\n", __func__, err); - - areq->tfm = crypto_aead_tfm(aead); - - areq->complete = rctx->old_complete; - areq->data = rctx->old_data; - - areq->complete(areq, err); -} - static int aead_do_fallback(struct aead_request *req, bool is_encrypt) { struct crypto_aead *aead = crypto_aead_reqtfm(req); struct crypto_tfm *tfm = crypto_aead_tfm(aead); struct iproc_reqctx_s *rctx = aead_request_ctx(req); struct iproc_ctx_s *ctx = crypto_tfm_ctx(tfm); - int err; - u32 req_flags; + struct aead_request *subreq; flow_log("%s() enc:%u\n", __func__, is_encrypt); - if (ctx->fallback_cipher) { - /* Store the cipher tfm and then use the fallback tfm */ - rctx->old_tfm = tfm; - aead_request_set_tfm(req, ctx->fallback_cipher); - /* - * Save the callback and chain ourselves in, so we can restore - * the tfm - */ - rctx->old_complete = req->base.complete; - rctx->old_data = req->base.data; - req_flags = aead_request_flags(req); - aead_request_set_callback(req, req_flags, aead_complete, req); - err = is_encrypt ? crypto_aead_encrypt(req) : - crypto_aead_decrypt(req); + if (!ctx->fallback_cipher) + return -EINVAL; - if (err == 0) { - /* - * fallback was synchronous (did not return - * -EINPROGRESS). So restore request state here. - */ - aead_request_set_callback(req, req_flags, - rctx->old_complete, req); - req->base.data = rctx->old_data; - aead_request_set_tfm(req, aead); - flow_log("%s() fallback completed successfully\n\n", - __func__); - } - } else { - err = -EINVAL; - } + subreq = &rctx->req; + aead_request_set_tfm(subreq, ctx->fallback_cipher); + aead_request_set_callback(subreq, aead_request_flags(req), + req->base.complete, req->base.data); + aead_request_set_crypt(subreq, req->src, req->dst, req->cryptlen, + req->iv); + aead_request_set_ad(subreq, req->assoclen); - return err; + return is_encrypt ? crypto_aead_encrypt(req) : + crypto_aead_decrypt(req); } static int aead_enqueue(struct aead_request *req, bool is_encrypt) @@ -4243,6 +4206,7 @@ static int ahash_cra_init(struct crypto_tfm *tfm) static int aead_cra_init(struct crypto_aead *aead) { + unsigned int reqsize = sizeof(struct iproc_reqctx_s); struct crypto_tfm *tfm = crypto_aead_tfm(aead); struct iproc_ctx_s *ctx = crypto_tfm_ctx(tfm); struct crypto_alg *alg = tfm->__crt_alg; @@ -4254,7 +4218,6 @@ static int aead_cra_init(struct crypto_aead *aead) flow_log("%s()\n", __func__); - crypto_aead_set_reqsize(aead, sizeof(struct iproc_reqctx_s)); ctx->is_esp = false; ctx->salt_len = 0; ctx->salt_offset = 0; @@ -4263,22 +4226,29 @@ static int aead_cra_init(struct crypto_aead *aead) get_random_bytes(ctx->iv, MAX_IV_SIZE); flow_dump(" iv: ", ctx->iv, MAX_IV_SIZE); - if (!err) { - if (alg->cra_flags & CRYPTO_ALG_NEED_FALLBACK) { - flow_log("%s() creating fallback cipher\n", __func__); + if (err) + goto out; - ctx->fallback_cipher = - crypto_alloc_aead(alg->cra_name, 0, - CRYPTO_ALG_ASYNC | - CRYPTO_ALG_NEED_FALLBACK); - if (IS_ERR(ctx->fallback_cipher)) { - pr_err("%s() Error: failed to allocate fallback for %s\n", - __func__, alg->cra_name); - return PTR_ERR(ctx->fallback_cipher); - } - } + if (!(alg->cra_flags & CRYPTO_ALG_NEED_FALLBACK)) + goto reqsize; + + flow_log("%s() creating fallback cipher\n", __func__); + + ctx->fallback_cipher = crypto_alloc_aead(alg->cra_name, 0, + CRYPTO_ALG_ASYNC | + CRYPTO_ALG_NEED_FALLBACK); + if (IS_ERR(ctx->fallback_cipher)) { + pr_err("%s() Error: failed to allocate fallback for %s\n", + __func__, alg->cra_name); + return PTR_ERR(ctx->fallback_cipher); } + reqsize += crypto_aead_reqsize(ctx->fallback_cipher); + +reqsize: + crypto_aead_set_reqsize(aead, reqsize); + +out: return err; } diff --git a/drivers/crypto/bcm/cipher.h b/drivers/crypto/bcm/cipher.h index d6d87332140a..e36881c983cf 100644 --- a/drivers/crypto/bcm/cipher.h +++ b/drivers/crypto/bcm/cipher.h @@ -339,15 +339,12 @@ struct iproc_reqctx_s { /* hmac context */ bool is_sw_hmac; - /* aead context */ - struct crypto_tfm *old_tfm; - crypto_completion_t old_complete; - void *old_data; - gfp_t gfp; /* Buffers used to build SPU request and response messages */ struct spu_msg_buf msg_buf; + + struct aead_request req; }; /* From 4bc713a49d7f97f7328e5a595b8f5d231d9ebfca Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sun, 22 Jan 2023 15:32:03 +0800 Subject: [PATCH 063/156] crypto: caam - Use ahash_request_complete Instead of calling the base completion function directly, use the correct ahash helper which is ahash_request_complete. Signed-off-by: Herbert Xu Reviewed-by: Gaurav Jain Signed-off-by: Herbert Xu --- drivers/crypto/caam/caamalg_qi2.c | 8 ++++---- drivers/crypto/caam/caamhash.c | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/crypto/caam/caamalg_qi2.c b/drivers/crypto/caam/caamalg_qi2.c index 0ddef9a033a1..5c8d35edaa1c 100644 --- a/drivers/crypto/caam/caamalg_qi2.c +++ b/drivers/crypto/caam/caamalg_qi2.c @@ -3419,7 +3419,7 @@ static void ahash_done(void *cbk_ctx, u32 status) DUMP_PREFIX_ADDRESS, 16, 4, state->caam_ctx, ctx->ctx_len, 1); - req->base.complete(&req->base, ecode); + ahash_request_complete(req, ecode); } static void ahash_done_bi(void *cbk_ctx, u32 status) @@ -3457,7 +3457,7 @@ static void ahash_done_bi(void *cbk_ctx, u32 status) DUMP_PREFIX_ADDRESS, 16, 4, req->result, crypto_ahash_digestsize(ahash), 1); - req->base.complete(&req->base, ecode); + ahash_request_complete(req, ecode); } static void ahash_done_ctx_src(void *cbk_ctx, u32 status) @@ -3484,7 +3484,7 @@ static void ahash_done_ctx_src(void *cbk_ctx, u32 status) DUMP_PREFIX_ADDRESS, 16, 4, state->caam_ctx, ctx->ctx_len, 1); - req->base.complete(&req->base, ecode); + ahash_request_complete(req, ecode); } static void ahash_done_ctx_dst(void *cbk_ctx, u32 status) @@ -3522,7 +3522,7 @@ static void ahash_done_ctx_dst(void *cbk_ctx, u32 status) DUMP_PREFIX_ADDRESS, 16, 4, req->result, crypto_ahash_digestsize(ahash), 1); - req->base.complete(&req->base, ecode); + ahash_request_complete(req, ecode); } static int ahash_update_ctx(struct ahash_request *req) diff --git a/drivers/crypto/caam/caamhash.c b/drivers/crypto/caam/caamhash.c index 1f357f48c473..82d3c730a502 100644 --- a/drivers/crypto/caam/caamhash.c +++ b/drivers/crypto/caam/caamhash.c @@ -614,7 +614,7 @@ static inline void ahash_done_cpy(struct device *jrdev, u32 *desc, u32 err, * by CAAM, not crypto engine. */ if (!has_bklog) - req->base.complete(&req->base, ecode); + ahash_request_complete(req, ecode); else crypto_finalize_hash_request(jrp->engine, req, ecode); } @@ -676,7 +676,7 @@ static inline void ahash_done_switch(struct device *jrdev, u32 *desc, u32 err, * by CAAM, not crypto engine. */ if (!has_bklog) - req->base.complete(&req->base, ecode); + ahash_request_complete(req, ecode); else crypto_finalize_hash_request(jrp->engine, req, ecode); From 51c082514c2dedf2711c99d93c196cc4eedceb40 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sun, 22 Jan 2023 16:07:37 +0800 Subject: [PATCH 064/156] crypto: xts - Handle EBUSY correctly As it is xts only handles the special return value of EINPROGRESS, which means that in all other cases it will free data related to the request. However, as the caller of xts may specify MAY_BACKLOG, we also need to expect EBUSY and treat it in the same way. Otherwise backlogged requests will trigger a use-after-free. Fixes: 8083b1bf8163 ("crypto: xts - add support for ciphertext stealing") Signed-off-by: Herbert Xu Acked-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- crypto/xts.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/crypto/xts.c b/crypto/xts.c index 63c85b9e64e0..de6cbcf69bbd 100644 --- a/crypto/xts.c +++ b/crypto/xts.c @@ -203,12 +203,12 @@ static void xts_encrypt_done(struct crypto_async_request *areq, int err) if (!err) { struct xts_request_ctx *rctx = skcipher_request_ctx(req); - rctx->subreq.base.flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + rctx->subreq.base.flags &= CRYPTO_TFM_REQ_MAY_BACKLOG; err = xts_xor_tweak_post(req, true); if (!err && unlikely(req->cryptlen % XTS_BLOCK_SIZE)) { err = xts_cts_final(req, crypto_skcipher_encrypt); - if (err == -EINPROGRESS) + if (err == -EINPROGRESS || err == -EBUSY) return; } } @@ -223,12 +223,12 @@ static void xts_decrypt_done(struct crypto_async_request *areq, int err) if (!err) { struct xts_request_ctx *rctx = skcipher_request_ctx(req); - rctx->subreq.base.flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + rctx->subreq.base.flags &= CRYPTO_TFM_REQ_MAY_BACKLOG; err = xts_xor_tweak_post(req, false); if (!err && unlikely(req->cryptlen % XTS_BLOCK_SIZE)) { err = xts_cts_final(req, crypto_skcipher_decrypt); - if (err == -EINPROGRESS) + if (err == -EINPROGRESS || err == -EBUSY) return; } } From 42a9a08b9aa69134981d1564f95eb06c0469eff3 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sun, 22 Jan 2023 16:56:02 +0800 Subject: [PATCH 065/156] crypto: engine - Fix excess parameter doc warning The engine parameter should not be marked for kernel doc as it triggers a warning. Signed-off-by: Herbert Xu --- crypto/crypto_engine.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crypto/crypto_engine.c b/crypto/crypto_engine.c index bb8e77077f02..64dc9aa3ca24 100644 --- a/crypto/crypto_engine.c +++ b/crypto/crypto_engine.c @@ -499,7 +499,7 @@ EXPORT_SYMBOL_GPL(crypto_engine_stop); * This has the form: * callback(struct crypto_engine *engine) * where: - * @engine: the crypto engine structure. + * engine: the crypto engine structure. * @rt: whether this queue is set to run as a realtime task * @qlen: maximum size of the crypto-engine queue * From 8a1955f95883cc0cb35b74f0cf58a283680323ac Mon Sep 17 00:00:00 2001 From: Peter Lafreniere Date: Sat, 21 Jan 2023 13:34:50 -0500 Subject: [PATCH 066/156] crypto: x86 - exit fpu context earlier in ECB/CBC macros Currently the ecb/cbc macros hold fpu context unnecessarily when using scalar cipher routines (e.g. when handling odd sizes of blocks per walk). Change the macros to drop fpu context as soon as the fpu is out of use. No performance impact found (on Intel Haswell). Signed-off-by: Peter Lafreniere Acked-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/x86/crypto/ecb_cbc_helpers.h | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/arch/x86/crypto/ecb_cbc_helpers.h b/arch/x86/crypto/ecb_cbc_helpers.h index eaa15c7b29d6..11955bd01af1 100644 --- a/arch/x86/crypto/ecb_cbc_helpers.h +++ b/arch/x86/crypto/ecb_cbc_helpers.h @@ -13,13 +13,14 @@ #define ECB_WALK_START(req, bsize, fpu_blocks) do { \ void *ctx = crypto_skcipher_ctx(crypto_skcipher_reqtfm(req)); \ + const int __fpu_blocks = (fpu_blocks); \ const int __bsize = (bsize); \ struct skcipher_walk walk; \ int err = skcipher_walk_virt(&walk, (req), false); \ while (walk.nbytes > 0) { \ unsigned int nbytes = walk.nbytes; \ - bool do_fpu = (fpu_blocks) != -1 && \ - nbytes >= (fpu_blocks) * __bsize; \ + bool do_fpu = __fpu_blocks != -1 && \ + nbytes >= __fpu_blocks * __bsize; \ const u8 *src = walk.src.virt.addr; \ u8 *dst = walk.dst.virt.addr; \ u8 __maybe_unused buf[(bsize)]; \ @@ -35,7 +36,12 @@ } while (0) #define ECB_BLOCK(blocks, func) do { \ - while (nbytes >= (blocks) * __bsize) { \ + const int __blocks = (blocks); \ + if (do_fpu && __blocks < __fpu_blocks) { \ + kernel_fpu_end(); \ + do_fpu = false; \ + } \ + while (nbytes >= __blocks * __bsize) { \ (func)(ctx, dst, src); \ ECB_WALK_ADVANCE(blocks); \ } \ @@ -53,7 +59,12 @@ } while (0) #define CBC_DEC_BLOCK(blocks, func) do { \ - while (nbytes >= (blocks) * __bsize) { \ + const int __blocks = (blocks); \ + if (do_fpu && __blocks < __fpu_blocks) { \ + kernel_fpu_end(); \ + do_fpu = false; \ + } \ + while (nbytes >= __blocks * __bsize) { \ const u8 *__iv = src + ((blocks) - 1) * __bsize; \ if (dst == src) \ __iv = memcpy(buf, __iv, __bsize); \ From d52b0c780c1f8cdd0cef9c6e683ab568d04bb19d Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 23 Jan 2023 18:08:56 +0800 Subject: [PATCH 067/156] Revert "crypto: rsa-pkcs1pad - Replace GFP_ATOMIC with GFP_KERNEL in pkcs1pad_encrypt_sign_complete" This reverts commit 1ca2809897155f1adc43e4859b4a3582e235c09a. While the akcipher API as a whole is designed to be called only from thread context, its completion path is still called from softirq context as usual. Therefore we must not use GFP_KERNEL on that path. Signed-off-by: Herbert Xu --- crypto/rsa-pkcs1pad.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crypto/rsa-pkcs1pad.c b/crypto/rsa-pkcs1pad.c index 6ee5b8a060c0..141f7e08df6a 100644 --- a/crypto/rsa-pkcs1pad.c +++ b/crypto/rsa-pkcs1pad.c @@ -190,7 +190,7 @@ static int pkcs1pad_encrypt_sign_complete(struct akcipher_request *req, int err) if (likely(!pad_len)) goto out; - out_buf = kzalloc(ctx->key_size, GFP_KERNEL); + out_buf = kzalloc(ctx->key_size, GFP_ATOMIC); err = -ENOMEM; if (!out_buf) goto out; From c1e98807b7049dfc04a971b937ca127faa5514f4 Mon Sep 17 00:00:00 2001 From: Lucas Segarra Fernandez Date: Mon, 23 Jan 2023 11:42:21 +0100 Subject: [PATCH 068/156] crypto: qat - extend buffer list logic interface Extend qat_bl_sgl_to_bufl() to allow skipping the mapping of a region of the source and the destination scatter lists starting from byte zero. This is to support the ZLIB format (RFC 1950) in the qat driver. The ZLIB format is made of deflate compressed data surrounded by a header and a footer. The QAT accelerators support only the deflate algorithm, therefore the header should not be mapped since it is inserted in software. Signed-off-by: Lucas Segarra Fernandez Reviewed-by: Giovanni Cabiddu Signed-off-by: Herbert Xu --- drivers/crypto/qat/qat_common/qat_bl.c | 37 ++++++++++++++++--- drivers/crypto/qat/qat_common/qat_bl.h | 2 + drivers/crypto/qat/qat_common/qat_comp_algs.c | 3 ++ 3 files changed, 37 insertions(+), 5 deletions(-) diff --git a/drivers/crypto/qat/qat_common/qat_bl.c b/drivers/crypto/qat/qat_common/qat_bl.c index c72831fa025d..76baed0a76c0 100644 --- a/drivers/crypto/qat/qat_common/qat_bl.c +++ b/drivers/crypto/qat/qat_common/qat_bl.c @@ -53,6 +53,8 @@ static int __qat_bl_sgl_to_bufl(struct adf_accel_dev *accel_dev, struct qat_request_buffs *buf, dma_addr_t extra_dst_buff, size_t sz_extra_dst_buff, + unsigned int sskip, + unsigned int dskip, gfp_t flags) { struct device *dev = &GET_DEV(accel_dev); @@ -65,6 +67,7 @@ static int __qat_bl_sgl_to_bufl(struct adf_accel_dev *accel_dev, struct scatterlist *sg; size_t sz_out, sz = struct_size(bufl, buffers, n); int node = dev_to_node(&GET_DEV(accel_dev)); + unsigned int left; int bufl_dma_dir; if (unlikely(!n)) @@ -88,19 +91,29 @@ static int __qat_bl_sgl_to_bufl(struct adf_accel_dev *accel_dev, for (i = 0; i < n; i++) bufl->buffers[i].addr = DMA_MAPPING_ERROR; + left = sskip; + for_each_sg(sgl, sg, n, i) { int y = sg_nctr; if (!sg->length) continue; - bufl->buffers[y].addr = dma_map_single(dev, sg_virt(sg), - sg->length, + if (left >= sg->length) { + left -= sg->length; + continue; + } + bufl->buffers[y].addr = dma_map_single(dev, sg_virt(sg) + left, + sg->length - left, bufl_dma_dir); bufl->buffers[y].len = sg->length; if (unlikely(dma_mapping_error(dev, bufl->buffers[y].addr))) goto err_in; sg_nctr++; + if (left) { + bufl->buffers[y].len -= left; + left = 0; + } } bufl->num_bufs = sg_nctr; blp = dma_map_single(dev, bufl, sz, DMA_TO_DEVICE); @@ -117,6 +130,8 @@ static int __qat_bl_sgl_to_bufl(struct adf_accel_dev *accel_dev, n = n_sglout + extra_buff; sz_out = struct_size(buflout, buffers, n); + left = dskip; + sg_nctr = 0; if (n > QAT_MAX_BUFF_DESC) { @@ -139,13 +154,21 @@ static int __qat_bl_sgl_to_bufl(struct adf_accel_dev *accel_dev, if (!sg->length) continue; - buffers[y].addr = dma_map_single(dev, sg_virt(sg), - sg->length, + if (left >= sg->length) { + left -= sg->length; + continue; + } + buffers[y].addr = dma_map_single(dev, sg_virt(sg) + left, + sg->length - left, DMA_FROM_DEVICE); if (unlikely(dma_mapping_error(dev, buffers[y].addr))) goto err_out; buffers[y].len = sg->length; sg_nctr++; + if (left) { + buffers[y].len -= left; + left = 0; + } } if (extra_buff) { buffers[sg_nctr].addr = extra_dst_buff; @@ -212,15 +235,19 @@ int qat_bl_sgl_to_bufl(struct adf_accel_dev *accel_dev, { dma_addr_t extra_dst_buff = 0; size_t sz_extra_dst_buff = 0; + unsigned int sskip = 0; + unsigned int dskip = 0; if (params) { extra_dst_buff = params->extra_dst_buff; sz_extra_dst_buff = params->sz_extra_dst_buff; + sskip = params->sskip; + dskip = params->dskip; } return __qat_bl_sgl_to_bufl(accel_dev, sgl, sglout, buf, extra_dst_buff, sz_extra_dst_buff, - flags); + sskip, dskip, flags); } static void qat_bl_sgl_unmap(struct adf_accel_dev *accel_dev, diff --git a/drivers/crypto/qat/qat_common/qat_bl.h b/drivers/crypto/qat/qat_common/qat_bl.h index 1479fef3b634..d87e4f35ac39 100644 --- a/drivers/crypto/qat/qat_common/qat_bl.h +++ b/drivers/crypto/qat/qat_common/qat_bl.h @@ -42,6 +42,8 @@ struct qat_request_buffs { struct qat_sgl_to_bufl_params { dma_addr_t extra_dst_buff; size_t sz_extra_dst_buff; + unsigned int sskip; + unsigned int dskip; }; void qat_bl_free_bufl(struct adf_accel_dev *accel_dev, diff --git a/drivers/crypto/qat/qat_common/qat_comp_algs.c b/drivers/crypto/qat/qat_common/qat_comp_algs.c index 1480d36a8d2b..12d5e0fc3a95 100644 --- a/drivers/crypto/qat/qat_common/qat_comp_algs.c +++ b/drivers/crypto/qat/qat_common/qat_comp_algs.c @@ -233,6 +233,9 @@ static int qat_comp_alg_compress_decompress(struct acomp_req *areq, size_t ovf_buff_sz; int ret; + params.sskip = 0; + params.dskip = 0; + if (!areq->src || !slen) return -EINVAL; From 7ce515e636ec1807723c30fd3a6181b4ccac6ec6 Mon Sep 17 00:00:00 2001 From: Lucas Segarra Fernandez Date: Mon, 23 Jan 2023 11:42:22 +0100 Subject: [PATCH 069/156] crypto: qat - add qat_zlib_deflate The ZLIB format (RFC 1950) is made of deflate compressed data surrounded by a header and a footer. The QAT accelerators support only the deflate algorithm, therefore the header and the footer need to be inserted in software. This adds logic in the QAT driver to support the ZLIB format. In particular: * Generalize the function qat_comp_alg_compress_decompress() to allow skipping an initial region (header) of the source and/or destination scatter lists. * Add logic to register the qat_zlib_deflate algorithm into the acomp framework. * For ZLIB compression, skip the initial portion of the destination buffer before sending the job to the QAT accelerator and insert the ZLIB header and footer in the callback, after the QAT request has been processed. * For ZLIB decompression, parse the header in the input buffer provided by the user and verify its validity before attempting the decompression of the buffer with QAT. Then submit the buffer to QAT for decompression. In the callback verify the correctness of the footer by comparing the value of the ADLER produced by QAT with the one in the destination buffer. Signed-off-by: Lucas Segarra Fernandez Reviewed-by: Giovanni Cabiddu Signed-off-by: Herbert Xu --- drivers/crypto/qat/qat_common/qat_comp_algs.c | 166 ++++++++++++++++-- 1 file changed, 154 insertions(+), 12 deletions(-) diff --git a/drivers/crypto/qat/qat_common/qat_comp_algs.c b/drivers/crypto/qat/qat_common/qat_comp_algs.c index 12d5e0fc3a95..8e54158c11f9 100644 --- a/drivers/crypto/qat/qat_common/qat_comp_algs.c +++ b/drivers/crypto/qat/qat_common/qat_comp_algs.c @@ -13,6 +13,15 @@ #include "qat_compression.h" #include "qat_algs_send.h" +#define QAT_RFC_1950_HDR_SIZE 2 +#define QAT_RFC_1950_FOOTER_SIZE 4 +#define QAT_RFC_1950_CM_DEFLATE 8 +#define QAT_RFC_1950_CM_DEFLATE_CINFO_32K 7 +#define QAT_RFC_1950_CM_MASK 0x0f +#define QAT_RFC_1950_CM_OFFSET 4 +#define QAT_RFC_1950_DICT_MASK 0x20 +#define QAT_RFC_1950_COMP_HDR 0x785e + static DEFINE_MUTEX(algs_lock); static unsigned int active_devs; @@ -21,9 +30,12 @@ enum direction { COMPRESSION = 1, }; +struct qat_compression_req; + struct qat_compression_ctx { u8 comp_ctx[QAT_COMP_CTX_SIZE]; struct qat_compression_instance *inst; + int (*qat_comp_callback)(struct qat_compression_req *qat_req, void *resp); }; struct qat_dst { @@ -97,6 +109,69 @@ err: areq->base.complete(&areq->base, ret); } +static int parse_zlib_header(u16 zlib_h) +{ + int ret = -EINVAL; + __be16 header; + u8 *header_p; + u8 cmf, flg; + + header = cpu_to_be16(zlib_h); + header_p = (u8 *)&header; + + flg = header_p[0]; + cmf = header_p[1]; + + if (cmf >> QAT_RFC_1950_CM_OFFSET > QAT_RFC_1950_CM_DEFLATE_CINFO_32K) + return ret; + + if ((cmf & QAT_RFC_1950_CM_MASK) != QAT_RFC_1950_CM_DEFLATE) + return ret; + + if (flg & QAT_RFC_1950_DICT_MASK) + return ret; + + return 0; +} + +static int qat_comp_rfc1950_callback(struct qat_compression_req *qat_req, + void *resp) +{ + struct acomp_req *areq = qat_req->acompress_req; + enum direction dir = qat_req->dir; + __be32 qat_produced_adler; + + qat_produced_adler = cpu_to_be32(qat_comp_get_produced_adler32(resp)); + + if (dir == COMPRESSION) { + __be16 zlib_header; + + zlib_header = cpu_to_be16(QAT_RFC_1950_COMP_HDR); + scatterwalk_map_and_copy(&zlib_header, areq->dst, 0, QAT_RFC_1950_HDR_SIZE, 1); + areq->dlen += QAT_RFC_1950_HDR_SIZE; + + scatterwalk_map_and_copy(&qat_produced_adler, areq->dst, areq->dlen, + QAT_RFC_1950_FOOTER_SIZE, 1); + areq->dlen += QAT_RFC_1950_FOOTER_SIZE; + } else { + __be32 decomp_adler; + int footer_offset; + int consumed; + + consumed = qat_comp_get_consumed_ctr(resp); + footer_offset = consumed + QAT_RFC_1950_HDR_SIZE; + if (footer_offset + QAT_RFC_1950_FOOTER_SIZE > areq->slen) + return -EBADMSG; + + scatterwalk_map_and_copy(&decomp_adler, areq->src, footer_offset, + QAT_RFC_1950_FOOTER_SIZE, 0); + + if (qat_produced_adler != decomp_adler) + return -EBADMSG; + } + return 0; +} + static void qat_comp_generic_callback(struct qat_compression_req *qat_req, void *resp) { @@ -167,6 +242,9 @@ static void qat_comp_generic_callback(struct qat_compression_req *qat_req, res = 0; areq->dlen = produced; + if (ctx->qat_comp_callback) + res = ctx->qat_comp_callback(qat_req, resp); + end: qat_bl_free_bufl(accel_dev, &qat_req->buf); areq->base.complete(&areq->base, res); @@ -215,26 +293,38 @@ static void qat_comp_alg_exit_tfm(struct crypto_acomp *acomp_tfm) memset(ctx, 0, sizeof(*ctx)); } -static int qat_comp_alg_compress_decompress(struct acomp_req *areq, - enum direction dir) +static int qat_comp_alg_rfc1950_init_tfm(struct crypto_acomp *acomp_tfm) +{ + struct crypto_tfm *tfm = crypto_acomp_tfm(acomp_tfm); + struct qat_compression_ctx *ctx = crypto_tfm_ctx(tfm); + int ret; + + ret = qat_comp_alg_init_tfm(acomp_tfm); + ctx->qat_comp_callback = &qat_comp_rfc1950_callback; + + return ret; +} + +static int qat_comp_alg_compress_decompress(struct acomp_req *areq, enum direction dir, + unsigned int shdr, unsigned int sftr, + unsigned int dhdr, unsigned int dftr) { struct qat_compression_req *qat_req = acomp_request_ctx(areq); struct crypto_acomp *acomp_tfm = crypto_acomp_reqtfm(areq); struct crypto_tfm *tfm = crypto_acomp_tfm(acomp_tfm); struct qat_compression_ctx *ctx = crypto_tfm_ctx(tfm); struct qat_compression_instance *inst = ctx->inst; - struct qat_sgl_to_bufl_params *p_params = NULL; gfp_t f = qat_algs_alloc_flags(&areq->base); - struct qat_sgl_to_bufl_params params; - unsigned int slen = areq->slen; - unsigned int dlen = areq->dlen; + struct qat_sgl_to_bufl_params params = {0}; + int slen = areq->slen - shdr - sftr; + int dlen = areq->dlen - dhdr - dftr; dma_addr_t sfbuf, dfbuf; u8 *req = qat_req->req; size_t ovf_buff_sz; int ret; - params.sskip = 0; - params.dskip = 0; + params.sskip = shdr; + params.dskip = dhdr; if (!areq->src || !slen) return -EINVAL; @@ -257,6 +347,7 @@ static int qat_comp_alg_compress_decompress(struct acomp_req *areq, if (!areq->dst) return -ENOMEM; + dlen -= dhdr + dftr; areq->dlen = dlen; qat_req->dst.resubmitted = false; } @@ -265,11 +356,10 @@ static int qat_comp_alg_compress_decompress(struct acomp_req *areq, params.extra_dst_buff = inst->dc_data->ovf_buff_p; ovf_buff_sz = inst->dc_data->ovf_buff_sz; params.sz_extra_dst_buff = ovf_buff_sz; - p_params = ¶ms; } ret = qat_bl_sgl_to_bufl(ctx->inst->accel_dev, areq->src, areq->dst, - &qat_req->buf, p_params, f); + &qat_req->buf, ¶ms, f); if (unlikely(ret)) return ret; @@ -302,12 +392,49 @@ static int qat_comp_alg_compress_decompress(struct acomp_req *areq, static int qat_comp_alg_compress(struct acomp_req *req) { - return qat_comp_alg_compress_decompress(req, COMPRESSION); + return qat_comp_alg_compress_decompress(req, COMPRESSION, 0, 0, 0, 0); } static int qat_comp_alg_decompress(struct acomp_req *req) { - return qat_comp_alg_compress_decompress(req, DECOMPRESSION); + return qat_comp_alg_compress_decompress(req, DECOMPRESSION, 0, 0, 0, 0); +} + +static int qat_comp_alg_rfc1950_compress(struct acomp_req *req) +{ + if (!req->dst && req->dlen != 0) + return -EINVAL; + + if (req->dst && req->dlen <= QAT_RFC_1950_HDR_SIZE + QAT_RFC_1950_FOOTER_SIZE) + return -EINVAL; + + return qat_comp_alg_compress_decompress(req, COMPRESSION, 0, 0, + QAT_RFC_1950_HDR_SIZE, + QAT_RFC_1950_FOOTER_SIZE); +} + +static int qat_comp_alg_rfc1950_decompress(struct acomp_req *req) +{ + struct crypto_acomp *acomp_tfm = crypto_acomp_reqtfm(req); + struct crypto_tfm *tfm = crypto_acomp_tfm(acomp_tfm); + struct qat_compression_ctx *ctx = crypto_tfm_ctx(tfm); + struct adf_accel_dev *accel_dev = ctx->inst->accel_dev; + u16 zlib_header; + int ret; + + if (req->slen <= QAT_RFC_1950_HDR_SIZE + QAT_RFC_1950_FOOTER_SIZE) + return -EBADMSG; + + scatterwalk_map_and_copy(&zlib_header, req->src, 0, QAT_RFC_1950_HDR_SIZE, 0); + + ret = parse_zlib_header(zlib_header); + if (ret) { + dev_dbg(&GET_DEV(accel_dev), "Error parsing zlib header\n"); + return ret; + } + + return qat_comp_alg_compress_decompress(req, DECOMPRESSION, QAT_RFC_1950_HDR_SIZE, + QAT_RFC_1950_FOOTER_SIZE, 0, 0); } static struct acomp_alg qat_acomp[] = { { @@ -325,6 +452,21 @@ static struct acomp_alg qat_acomp[] = { { .decompress = qat_comp_alg_decompress, .dst_free = sgl_free, .reqsize = sizeof(struct qat_compression_req), +}, { + .base = { + .cra_name = "zlib-deflate", + .cra_driver_name = "qat_zlib_deflate", + .cra_priority = 4001, + .cra_flags = CRYPTO_ALG_ASYNC, + .cra_ctxsize = sizeof(struct qat_compression_ctx), + .cra_module = THIS_MODULE, + }, + .init = qat_comp_alg_rfc1950_init_tfm, + .exit = qat_comp_alg_exit_tfm, + .compress = qat_comp_alg_rfc1950_compress, + .decompress = qat_comp_alg_rfc1950_decompress, + .dst_free = sgl_free, + .reqsize = sizeof(struct qat_compression_req), } }; int qat_comp_algs_register(void) From 46a334a98f585ef78d51d8f5736596887bdd7f54 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 23 Jan 2023 16:53:08 -0600 Subject: [PATCH 070/156] crypto: ccp - Flush the SEV-ES TMR memory before giving it to firmware Perform a cache flush on the SEV-ES TMR memory after allocation to prevent any possibility of the firmware encountering an error should dirty cache lines be present. Use clflush_cache_range() to flush the SEV-ES TMR memory. Fixes: 97f9ac3db661 ("crypto: ccp - Add support for SEV-ES to the PSP driver") Signed-off-by: Tom Lendacky Signed-off-by: Herbert Xu --- drivers/crypto/ccp/sev-dev.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c index 13f1c88810d8..e2f25926eb51 100644 --- a/drivers/crypto/ccp/sev-dev.c +++ b/drivers/crypto/ccp/sev-dev.c @@ -26,6 +26,7 @@ #include #include +#include #include "psp-dev.h" #include "sev-dev.h" @@ -1335,7 +1336,10 @@ void sev_pci_init(void) /* Obtain the TMR memory area for SEV-ES use */ sev_es_tmr = sev_fw_alloc(SEV_ES_TMR_SIZE); - if (!sev_es_tmr) + if (sev_es_tmr) + /* Must flush the cache before giving it to the firmware */ + clflush_cache_range(sev_es_tmr, SEV_ES_TMR_SIZE); + else dev_warn(sev->dev, "SEV: TMR allocation failed, SEV-ES support unavailable\n"); From 3ce2d5971522adfdec9b17182ce9e513a26707e1 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 24 Jan 2023 16:28:39 +0800 Subject: [PATCH 071/156] crypto: img-hash - Fix sparse endianness warning Use cpu_to_be32 instead of be32_to_cpu in img_hash_read_result_queue to silence sparse. The generated code should be identical. Signed-off-by: Herbert Xu --- drivers/crypto/img-hash.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/crypto/img-hash.c b/drivers/crypto/img-hash.c index 9629e98bd68b..e5ed098a154d 100644 --- a/drivers/crypto/img-hash.c +++ b/drivers/crypto/img-hash.c @@ -157,9 +157,9 @@ static inline void img_hash_write(struct img_hash_dev *hdev, writel_relaxed(value, hdev->io_base + offset); } -static inline u32 img_hash_read_result_queue(struct img_hash_dev *hdev) +static inline __be32 img_hash_read_result_queue(struct img_hash_dev *hdev) { - return be32_to_cpu(img_hash_read(hdev, CR_RESULT_QUEUE)); + return cpu_to_be32(img_hash_read(hdev, CR_RESULT_QUEUE)); } static void img_hash_start(struct img_hash_dev *hdev, bool dma) @@ -283,10 +283,10 @@ static int img_hash_finish(struct ahash_request *req) static void img_hash_copy_hash(struct ahash_request *req) { struct img_hash_request_ctx *ctx = ahash_request_ctx(req); - u32 *hash = (u32 *)ctx->digest; + __be32 *hash = (__be32 *)ctx->digest; int i; - for (i = (ctx->digsize / sizeof(u32)) - 1; i >= 0; i--) + for (i = (ctx->digsize / sizeof(*hash)) - 1; i >= 0; i--) hash[i] = img_hash_read_result_queue(ctx->hdev); } From 5efa7186696e64cd4decb4419f174188ee06c4cb Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 24 Jan 2023 17:11:11 +0800 Subject: [PATCH 072/156] crypto: marvell/cesa - Use crypto_wait_req This patch replaces the custom crypto completion function with crypto_req_done. Signed-off-by: Herbert Xu --- drivers/crypto/marvell/cesa/hash.c | 41 ++++++------------------------ 1 file changed, 8 insertions(+), 33 deletions(-) diff --git a/drivers/crypto/marvell/cesa/hash.c b/drivers/crypto/marvell/cesa/hash.c index c72b0672fc71..8d84ad45571c 100644 --- a/drivers/crypto/marvell/cesa/hash.c +++ b/drivers/crypto/marvell/cesa/hash.c @@ -1104,47 +1104,27 @@ struct ahash_alg mv_sha256_alg = { } }; -struct mv_cesa_ahash_result { - struct completion completion; - int error; -}; - -static void mv_cesa_hmac_ahash_complete(struct crypto_async_request *req, - int error) -{ - struct mv_cesa_ahash_result *result = req->data; - - if (error == -EINPROGRESS) - return; - - result->error = error; - complete(&result->completion); -} - static int mv_cesa_ahmac_iv_state_init(struct ahash_request *req, u8 *pad, void *state, unsigned int blocksize) { - struct mv_cesa_ahash_result result; + DECLARE_CRYPTO_WAIT(result); struct scatterlist sg; int ret; ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, - mv_cesa_hmac_ahash_complete, &result); + crypto_req_done, &result); sg_init_one(&sg, pad, blocksize); ahash_request_set_crypt(req, &sg, pad, blocksize); - init_completion(&result.completion); ret = crypto_ahash_init(req); if (ret) return ret; ret = crypto_ahash_update(req); - if (ret && ret != -EINPROGRESS) - return ret; + ret = crypto_wait_req(ret, &result); - wait_for_completion_interruptible(&result.completion); - if (result.error) - return result.error; + if (ret) + return ret; ret = crypto_ahash_export(req, state); if (ret) @@ -1158,7 +1138,7 @@ static int mv_cesa_ahmac_pad_init(struct ahash_request *req, u8 *ipad, u8 *opad, unsigned int blocksize) { - struct mv_cesa_ahash_result result; + DECLARE_CRYPTO_WAIT(result); struct scatterlist sg; int ret; int i; @@ -1172,17 +1152,12 @@ static int mv_cesa_ahmac_pad_init(struct ahash_request *req, return -ENOMEM; ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, - mv_cesa_hmac_ahash_complete, - &result); + crypto_req_done, &result); sg_init_one(&sg, keydup, keylen); ahash_request_set_crypt(req, &sg, ipad, keylen); - init_completion(&result.completion); ret = crypto_ahash_digest(req); - if (ret == -EINPROGRESS) { - wait_for_completion_interruptible(&result.completion); - ret = result.error; - } + ret = crypto_wait_req(ret, &result); /* Set the memory region to 0 to avoid any leak. */ kfree_sensitive(keydup); From 85f3fc5446e5315197529736ddfbda61c2777e06 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Wed, 25 Jan 2023 01:23:06 +0100 Subject: [PATCH 073/156] dt-bindings: crypto: Let STM32 define Ux500 HASH This adds device tree bindings for the Ux500 HASH block as a compatible in the STM32 HASH bindings. The Ux500 HASH binding has been used for ages in the kernel device tree for Ux500 but was never documented, so fill in the gap by making it a sibling of the STM32 HASH block, which is what it is. The relationship to the existing STM32 HASH block is pretty obvious when looking at the register map, and I have written patches to reuse the STM32 HASH driver on the Ux500. The main difference from the outside is that the Ux500 HASH lacks the interrupt line, so some special if-clauses are needed to accomodate this in the binding. Reviewed-by: Rob Herring Signed-off-by: Linus Walleij Signed-off-by: Herbert Xu --- .../bindings/crypto/st,stm32-hash.yaml | 23 ++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/crypto/st,stm32-hash.yaml b/Documentation/devicetree/bindings/crypto/st,stm32-hash.yaml index 4ccb335e8063..b767ec72a999 100644 --- a/Documentation/devicetree/bindings/crypto/st,stm32-hash.yaml +++ b/Documentation/devicetree/bindings/crypto/st,stm32-hash.yaml @@ -6,12 +6,18 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: STMicroelectronics STM32 HASH +description: The STM32 HASH block is built on the HASH block found in + the STn8820 SoC introduced in 2007, and subsequently used in the U8500 + SoC in 2010. + maintainers: - Lionel Debieve properties: compatible: enum: + - st,stn8820-hash + - stericsson,ux500-hash - st,stm32f456-hash - st,stm32f756-hash @@ -41,11 +47,26 @@ properties: maximum: 2 default: 0 + power-domains: + maxItems: 1 + required: - compatible - reg - clocks - - interrupts + +allOf: + - if: + properties: + compatible: + items: + const: stericsson,ux500-hash + then: + properties: + interrupts: false + else: + required: + - interrupts additionalProperties: false From 2d27267b379f589836949481aec9c9f39dbe1f26 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Wed, 25 Jan 2023 01:23:07 +0100 Subject: [PATCH 074/156] crypto: stm32/hash - Simplify code We are passing (rctx->flags & HASH_FLAGS_FINUP) as indicator for the final request but we already know this to be true since we are in the (final) arm of an if-statement set from the same flag. Just open-code it as true. Acked-by: Lionel Debieve Signed-off-by: Linus Walleij Signed-off-by: Herbert Xu --- drivers/crypto/stm32/stm32-hash.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/crypto/stm32/stm32-hash.c b/drivers/crypto/stm32/stm32-hash.c index d33006d43f76..0473ced7b4ea 100644 --- a/drivers/crypto/stm32/stm32-hash.c +++ b/drivers/crypto/stm32/stm32-hash.c @@ -399,8 +399,7 @@ static int stm32_hash_update_cpu(struct stm32_hash_dev *hdev) if (final) { bufcnt = rctx->bufcnt; rctx->bufcnt = 0; - err = stm32_hash_xmit_cpu(hdev, rctx->buffer, bufcnt, - (rctx->flags & HASH_FLAGS_FINUP)); + err = stm32_hash_xmit_cpu(hdev, rctx->buffer, bufcnt, 1); } return err; From 727f083ff48ae94cd7422e42bb9513ab61160e8a Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Wed, 25 Jan 2023 01:23:08 +0100 Subject: [PATCH 075/156] crypto: stm32/hash - Use existing busy poll function When exporting state we are waiting indefinitely in the same was as the ordinary stm32_hash_wait_busy() poll-for-completion function but without a timeout, which means we could hang in an eternal loop. Fix this by waiting for completion like the rest of the code. Acked-by: Lionel Debieve Signed-off-by: Linus Walleij Signed-off-by: Herbert Xu --- drivers/crypto/stm32/stm32-hash.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/crypto/stm32/stm32-hash.c b/drivers/crypto/stm32/stm32-hash.c index 0473ced7b4ea..cc0a4e413a82 100644 --- a/drivers/crypto/stm32/stm32-hash.c +++ b/drivers/crypto/stm32/stm32-hash.c @@ -960,11 +960,13 @@ static int stm32_hash_export(struct ahash_request *req, void *out) struct stm32_hash_dev *hdev = stm32_hash_find_dev(ctx); u32 *preg; unsigned int i; + int ret; pm_runtime_get_sync(hdev->dev); - while ((stm32_hash_read(hdev, HASH_SR) & HASH_SR_BUSY)) - cpu_relax(); + ret = stm32_hash_wait_busy(hdev); + if (ret) + return ret; rctx->hw_context = kmalloc_array(3 + HASH_CSR_REGISTER_NUMBER, sizeof(u32), From 5a2d52b54065c2aeb0012d050734ead78252215a Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Wed, 25 Jan 2023 01:23:09 +0100 Subject: [PATCH 076/156] crypto: stm32/hash - Wait for idle before final CPU xmit When calculating the hash using the CPU, right before the final hash calculation, heavy testing on Ux500 reveals that it is wise to wait for the hardware to go idle before calculating the final hash. The default test vectors mostly worked fine, but when I used the extensive tests and stress the hardware I ran into this problem. Acked-by: Lionel Debieve Signed-off-by: Linus Walleij Signed-off-by: Herbert Xu --- drivers/crypto/stm32/stm32-hash.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/crypto/stm32/stm32-hash.c b/drivers/crypto/stm32/stm32-hash.c index cc0a4e413a82..d4eefd8292ff 100644 --- a/drivers/crypto/stm32/stm32-hash.c +++ b/drivers/crypto/stm32/stm32-hash.c @@ -362,6 +362,9 @@ static int stm32_hash_xmit_cpu(struct stm32_hash_dev *hdev, stm32_hash_write(hdev, HASH_DIN, buffer[count]); if (final) { + if (stm32_hash_wait_busy(hdev)) + return -ETIMEDOUT; + stm32_hash_set_nblw(hdev, length); reg = stm32_hash_read(hdev, HASH_STR); reg |= HASH_STR_DCAL; From b56403a25af703064bed0d7e81d2c9ffb30511ff Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Wed, 25 Jan 2023 01:23:10 +0100 Subject: [PATCH 077/156] crypto: stm32/hash - Support Ux500 hash The Ux500 has a hash block which is an ancestor to the STM32 hash block. With some minor code path additions we can support also this variant in the STM32 driver. Differences: - Ux500 only supports SHA1 and SHA256 (+/- MAC) so we split up the algorithm registration per-algorithm and register each algorithm along with its MAC variant separately. - Ux500 does not have an interrupt to indicate that hash calculation is complete, so we add code paths to handle polling for completion if the interrupt is missing in the device tree. - Ux500 is lacking the SR status register, to check if an operating is complete, we need to poll the HASH_STR_DCAL bit in the HASH_STR register instead. - Ux500 had the resulting hash at address offset 0x0c and 8 32bit registers ahead. We account for this with a special code path when reading out the hash digest. - Ux500 need a special bit set in the control register before performing the final hash calculation on an empty message. - Ux500 hashes on empty messages will be performed if the above bit is set, but are incorrect. For this reason we just make an inline synchronous hash using a fallback hash. Tested on the Ux500 Golden device with the extended tests. Acked-by: Lionel Debieve Signed-off-by: Linus Walleij Signed-off-by: Herbert Xu --- drivers/crypto/stm32/stm32-hash.c | 254 ++++++++++++++++++++++++++---- 1 file changed, 219 insertions(+), 35 deletions(-) diff --git a/drivers/crypto/stm32/stm32-hash.c b/drivers/crypto/stm32/stm32-hash.c index d4eefd8292ff..7bf805563ac2 100644 --- a/drivers/crypto/stm32/stm32-hash.c +++ b/drivers/crypto/stm32/stm32-hash.c @@ -32,6 +32,7 @@ #define HASH_CR 0x00 #define HASH_DIN 0x04 #define HASH_STR 0x08 +#define HASH_UX500_HREG(x) (0x0c + ((x) * 0x04)) #define HASH_IMR 0x20 #define HASH_SR 0x24 #define HASH_CSR(x) (0x0F8 + ((x) * 0x04)) @@ -54,6 +55,10 @@ #define HASH_CR_ALGO_SHA224 0x40000 #define HASH_CR_ALGO_SHA256 0x40080 +#define HASH_CR_UX500_EMPTYMSG BIT(20) +#define HASH_CR_UX500_ALGO_SHA1 BIT(7) +#define HASH_CR_UX500_ALGO_SHA256 0x0 + /* Interrupt */ #define HASH_DINIE BIT(0) #define HASH_DCIE BIT(1) @@ -115,6 +120,7 @@ enum stm32_hash_data_format { struct stm32_hash_ctx { struct crypto_engine_ctx enginectx; struct stm32_hash_dev *hdev; + struct crypto_shash *xtfm; unsigned long flags; u8 key[HASH_MAX_KEY_SIZE]; @@ -157,6 +163,10 @@ struct stm32_hash_algs_info { struct stm32_hash_pdata { struct stm32_hash_algs_info *algs_info; size_t algs_info_size; + bool has_sr; + bool has_mdmat; + bool broken_emptymsg; + bool ux500; }; struct stm32_hash_dev { @@ -168,6 +178,7 @@ struct stm32_hash_dev { phys_addr_t phys_base; u32 dma_mode; u32 dma_maxburst; + bool polled; struct ahash_request *req; struct crypto_engine *engine; @@ -208,6 +219,11 @@ static inline int stm32_hash_wait_busy(struct stm32_hash_dev *hdev) { u32 status; + /* The Ux500 lacks the special status register, we poll the DCAL bit instead */ + if (!hdev->pdata->has_sr) + return readl_relaxed_poll_timeout(hdev->io_base + HASH_STR, status, + !(status & HASH_STR_DCAL), 10, 10000); + return readl_relaxed_poll_timeout(hdev->io_base + HASH_SR, status, !(status & HASH_SR_BUSY), 10, 10000); } @@ -249,7 +265,7 @@ static int stm32_hash_write_key(struct stm32_hash_dev *hdev) return 0; } -static void stm32_hash_write_ctrl(struct stm32_hash_dev *hdev) +static void stm32_hash_write_ctrl(struct stm32_hash_dev *hdev, int bufcnt) { struct stm32_hash_request_ctx *rctx = ahash_request_ctx(hdev->req); struct crypto_ahash *tfm = crypto_ahash_reqtfm(hdev->req); @@ -263,13 +279,19 @@ static void stm32_hash_write_ctrl(struct stm32_hash_dev *hdev) reg |= HASH_CR_ALGO_MD5; break; case HASH_FLAGS_SHA1: - reg |= HASH_CR_ALGO_SHA1; + if (hdev->pdata->ux500) + reg |= HASH_CR_UX500_ALGO_SHA1; + else + reg |= HASH_CR_ALGO_SHA1; break; case HASH_FLAGS_SHA224: reg |= HASH_CR_ALGO_SHA224; break; case HASH_FLAGS_SHA256: - reg |= HASH_CR_ALGO_SHA256; + if (hdev->pdata->ux500) + reg |= HASH_CR_UX500_ALGO_SHA256; + else + reg |= HASH_CR_ALGO_SHA256; break; default: reg |= HASH_CR_ALGO_MD5; @@ -284,7 +306,15 @@ static void stm32_hash_write_ctrl(struct stm32_hash_dev *hdev) reg |= HASH_CR_LKEY; } - stm32_hash_write(hdev, HASH_IMR, HASH_DCIE); + /* + * On the Ux500 we need to set a special flag to indicate that + * the message is zero length. + */ + if (hdev->pdata->ux500 && bufcnt == 0) + reg |= HASH_CR_UX500_EMPTYMSG; + + if (!hdev->polled) + stm32_hash_write(hdev, HASH_IMR, HASH_DCIE); stm32_hash_write(hdev, HASH_CR, reg); @@ -345,7 +375,7 @@ static int stm32_hash_xmit_cpu(struct stm32_hash_dev *hdev, hdev->flags |= HASH_FLAGS_CPU; - stm32_hash_write_ctrl(hdev); + stm32_hash_write_ctrl(hdev, length); if (stm32_hash_wait_busy(hdev)) return -ETIMEDOUT; @@ -403,6 +433,14 @@ static int stm32_hash_update_cpu(struct stm32_hash_dev *hdev) bufcnt = rctx->bufcnt; rctx->bufcnt = 0; err = stm32_hash_xmit_cpu(hdev, rctx->buffer, bufcnt, 1); + + /* If we have an IRQ, wait for that, else poll for completion */ + if (hdev->polled) { + if (stm32_hash_wait_busy(hdev)) + return -ETIMEDOUT; + hdev->flags |= HASH_FLAGS_OUTPUT_READY; + err = 0; + } } return err; @@ -433,11 +471,12 @@ static int stm32_hash_xmit_dma(struct stm32_hash_dev *hdev, reg = stm32_hash_read(hdev, HASH_CR); - if (mdma) - reg |= HASH_CR_MDMAT; - else - reg &= ~HASH_CR_MDMAT; - + if (!hdev->pdata->has_mdmat) { + if (mdma) + reg |= HASH_CR_MDMAT; + else + reg &= ~HASH_CR_MDMAT; + } reg |= HASH_CR_DMAE; stm32_hash_write(hdev, HASH_CR, reg); @@ -558,7 +597,7 @@ static int stm32_hash_dma_send(struct stm32_hash_dev *hdev) if (rctx->nents < 0) return -EINVAL; - stm32_hash_write_ctrl(hdev); + stm32_hash_write_ctrl(hdev, rctx->total); if (hdev->flags & HASH_FLAGS_HMAC) { err = stm32_hash_hmac_dma_send(hdev); @@ -745,16 +784,57 @@ static int stm32_hash_final_req(struct stm32_hash_dev *hdev) else err = stm32_hash_xmit_cpu(hdev, rctx->buffer, buflen, 1); + /* If we have an IRQ, wait for that, else poll for completion */ + if (hdev->polled) { + if (stm32_hash_wait_busy(hdev)) + return -ETIMEDOUT; + hdev->flags |= HASH_FLAGS_OUTPUT_READY; + /* Caller will call stm32_hash_finish_req() */ + err = 0; + } return err; } +static void stm32_hash_emptymsg_fallback(struct ahash_request *req) +{ + struct crypto_ahash *ahash = crypto_ahash_reqtfm(req); + struct stm32_hash_ctx *ctx = crypto_ahash_ctx(ahash); + struct stm32_hash_request_ctx *rctx = ahash_request_ctx(req); + struct stm32_hash_dev *hdev = rctx->hdev; + int ret; + + dev_dbg(hdev->dev, "use fallback message size 0 key size %d\n", + ctx->keylen); + + if (!ctx->xtfm) { + dev_err(hdev->dev, "no fallback engine\n"); + return; + } + + if (ctx->keylen) { + ret = crypto_shash_setkey(ctx->xtfm, ctx->key, ctx->keylen); + if (ret) { + dev_err(hdev->dev, "failed to set key ret=%d\n", ret); + return; + } + } + + ret = crypto_shash_tfm_digest(ctx->xtfm, NULL, 0, rctx->digest); + if (ret) + dev_err(hdev->dev, "shash digest error\n"); +} + static void stm32_hash_copy_hash(struct ahash_request *req) { struct stm32_hash_request_ctx *rctx = ahash_request_ctx(req); + struct stm32_hash_dev *hdev = rctx->hdev; __be32 *hash = (void *)rctx->digest; unsigned int i, hashsize; + if (hdev->pdata->broken_emptymsg && !req->nbytes) + return stm32_hash_emptymsg_fallback(req); + switch (rctx->flags & HASH_FLAGS_ALGO_MASK) { case HASH_FLAGS_MD5: hashsize = MD5_DIGEST_SIZE; @@ -772,9 +852,14 @@ static void stm32_hash_copy_hash(struct ahash_request *req) return; } - for (i = 0; i < hashsize / sizeof(u32); i++) - hash[i] = cpu_to_be32(stm32_hash_read(rctx->hdev, - HASH_HREG(i))); + for (i = 0; i < hashsize / sizeof(u32); i++) { + if (hdev->pdata->ux500) + hash[i] = cpu_to_be32(stm32_hash_read(hdev, + HASH_UX500_HREG(i))); + else + hash[i] = cpu_to_be32(stm32_hash_read(hdev, + HASH_HREG(i))); + } } static int stm32_hash_finish(struct ahash_request *req) @@ -977,7 +1062,8 @@ static int stm32_hash_export(struct ahash_request *req, void *out) preg = rctx->hw_context; - *preg++ = stm32_hash_read(hdev, HASH_IMR); + if (!hdev->pdata->ux500) + *preg++ = stm32_hash_read(hdev, HASH_IMR); *preg++ = stm32_hash_read(hdev, HASH_STR); *preg++ = stm32_hash_read(hdev, HASH_CR); for (i = 0; i < HASH_CSR_REGISTER_NUMBER; i++) @@ -1006,7 +1092,8 @@ static int stm32_hash_import(struct ahash_request *req, const void *in) pm_runtime_get_sync(hdev->dev); - stm32_hash_write(hdev, HASH_IMR, *preg++); + if (!hdev->pdata->ux500) + stm32_hash_write(hdev, HASH_IMR, *preg++); stm32_hash_write(hdev, HASH_STR, *preg++); stm32_hash_write(hdev, HASH_CR, *preg); reg = *preg++ | HASH_CR_INIT; @@ -1038,6 +1125,29 @@ static int stm32_hash_setkey(struct crypto_ahash *tfm, return 0; } +static int stm32_hash_init_fallback(struct crypto_tfm *tfm) +{ + struct stm32_hash_ctx *ctx = crypto_tfm_ctx(tfm); + struct stm32_hash_dev *hdev = stm32_hash_find_dev(ctx); + const char *name = crypto_tfm_alg_name(tfm); + struct crypto_shash *xtfm; + + /* The fallback is only needed on Ux500 */ + if (!hdev->pdata->ux500) + return 0; + + xtfm = crypto_alloc_shash(name, 0, CRYPTO_ALG_NEED_FALLBACK); + if (IS_ERR(xtfm)) { + dev_err(hdev->dev, "failed to allocate %s fallback\n", + name); + return PTR_ERR(xtfm); + } + dev_info(hdev->dev, "allocated %s fallback\n", name); + ctx->xtfm = xtfm; + + return 0; +} + static int stm32_hash_cra_init_algs(struct crypto_tfm *tfm, const char *algs_hmac_name) { @@ -1054,7 +1164,8 @@ static int stm32_hash_cra_init_algs(struct crypto_tfm *tfm, ctx->enginectx.op.do_one_request = stm32_hash_one_request; ctx->enginectx.op.prepare_request = stm32_hash_prepare_req; ctx->enginectx.op.unprepare_request = NULL; - return 0; + + return stm32_hash_init_fallback(tfm); } static int stm32_hash_cra_init(struct crypto_tfm *tfm) @@ -1082,6 +1193,14 @@ static int stm32_hash_cra_sha256_init(struct crypto_tfm *tfm) return stm32_hash_cra_init_algs(tfm, "sha256"); } +static void stm32_hash_cra_exit(struct crypto_tfm *tfm) +{ + struct stm32_hash_ctx *ctx = crypto_tfm_ctx(tfm); + + if (ctx->xtfm) + crypto_free_shash(ctx->xtfm); +} + static irqreturn_t stm32_hash_irq_thread(int irq, void *dev_id) { struct stm32_hash_dev *hdev = dev_id; @@ -1125,7 +1244,7 @@ static irqreturn_t stm32_hash_irq_handler(int irq, void *dev_id) return IRQ_NONE; } -static struct ahash_alg algs_md5_sha1[] = { +static struct ahash_alg algs_md5[] = { { .init = stm32_hash_init, .update = stm32_hash_update, @@ -1147,6 +1266,7 @@ static struct ahash_alg algs_md5_sha1[] = { .cra_ctxsize = sizeof(struct stm32_hash_ctx), .cra_alignmask = 3, .cra_init = stm32_hash_cra_init, + .cra_exit = stm32_hash_cra_exit, .cra_module = THIS_MODULE, } } @@ -1173,10 +1293,14 @@ static struct ahash_alg algs_md5_sha1[] = { .cra_ctxsize = sizeof(struct stm32_hash_ctx), .cra_alignmask = 3, .cra_init = stm32_hash_cra_md5_init, + .cra_exit = stm32_hash_cra_exit, .cra_module = THIS_MODULE, } } }, +}; + +static struct ahash_alg algs_sha1[] = { { .init = stm32_hash_init, .update = stm32_hash_update, @@ -1198,6 +1322,7 @@ static struct ahash_alg algs_md5_sha1[] = { .cra_ctxsize = sizeof(struct stm32_hash_ctx), .cra_alignmask = 3, .cra_init = stm32_hash_cra_init, + .cra_exit = stm32_hash_cra_exit, .cra_module = THIS_MODULE, } } @@ -1224,13 +1349,14 @@ static struct ahash_alg algs_md5_sha1[] = { .cra_ctxsize = sizeof(struct stm32_hash_ctx), .cra_alignmask = 3, .cra_init = stm32_hash_cra_sha1_init, + .cra_exit = stm32_hash_cra_exit, .cra_module = THIS_MODULE, } } }, }; -static struct ahash_alg algs_sha224_sha256[] = { +static struct ahash_alg algs_sha224[] = { { .init = stm32_hash_init, .update = stm32_hash_update, @@ -1252,6 +1378,7 @@ static struct ahash_alg algs_sha224_sha256[] = { .cra_ctxsize = sizeof(struct stm32_hash_ctx), .cra_alignmask = 3, .cra_init = stm32_hash_cra_init, + .cra_exit = stm32_hash_cra_exit, .cra_module = THIS_MODULE, } } @@ -1278,10 +1405,14 @@ static struct ahash_alg algs_sha224_sha256[] = { .cra_ctxsize = sizeof(struct stm32_hash_ctx), .cra_alignmask = 3, .cra_init = stm32_hash_cra_sha224_init, + .cra_exit = stm32_hash_cra_exit, .cra_module = THIS_MODULE, } } }, +}; + +static struct ahash_alg algs_sha256[] = { { .init = stm32_hash_init, .update = stm32_hash_update, @@ -1303,6 +1434,7 @@ static struct ahash_alg algs_sha224_sha256[] = { .cra_ctxsize = sizeof(struct stm32_hash_ctx), .cra_alignmask = 3, .cra_init = stm32_hash_cra_init, + .cra_exit = stm32_hash_cra_exit, .cra_module = THIS_MODULE, } } @@ -1329,6 +1461,7 @@ static struct ahash_alg algs_sha224_sha256[] = { .cra_ctxsize = sizeof(struct stm32_hash_ctx), .cra_alignmask = 3, .cra_init = stm32_hash_cra_sha256_init, + .cra_exit = stm32_hash_cra_exit, .cra_module = THIS_MODULE, } } @@ -1374,35 +1507,73 @@ static int stm32_hash_unregister_algs(struct stm32_hash_dev *hdev) return 0; } +static struct stm32_hash_algs_info stm32_hash_algs_info_ux500[] = { + { + .algs_list = algs_sha1, + .size = ARRAY_SIZE(algs_sha1), + }, + { + .algs_list = algs_sha256, + .size = ARRAY_SIZE(algs_sha256), + }, +}; + +static const struct stm32_hash_pdata stm32_hash_pdata_ux500 = { + .algs_info = stm32_hash_algs_info_ux500, + .algs_info_size = ARRAY_SIZE(stm32_hash_algs_info_ux500), + .broken_emptymsg = true, + .ux500 = true, +}; + static struct stm32_hash_algs_info stm32_hash_algs_info_stm32f4[] = { { - .algs_list = algs_md5_sha1, - .size = ARRAY_SIZE(algs_md5_sha1), + .algs_list = algs_md5, + .size = ARRAY_SIZE(algs_md5), + }, + { + .algs_list = algs_sha1, + .size = ARRAY_SIZE(algs_sha1), }, }; static const struct stm32_hash_pdata stm32_hash_pdata_stm32f4 = { .algs_info = stm32_hash_algs_info_stm32f4, .algs_info_size = ARRAY_SIZE(stm32_hash_algs_info_stm32f4), + .has_sr = true, + .has_mdmat = true, }; static struct stm32_hash_algs_info stm32_hash_algs_info_stm32f7[] = { { - .algs_list = algs_md5_sha1, - .size = ARRAY_SIZE(algs_md5_sha1), + .algs_list = algs_md5, + .size = ARRAY_SIZE(algs_md5), }, { - .algs_list = algs_sha224_sha256, - .size = ARRAY_SIZE(algs_sha224_sha256), + .algs_list = algs_sha1, + .size = ARRAY_SIZE(algs_sha1), + }, + { + .algs_list = algs_sha224, + .size = ARRAY_SIZE(algs_sha224), + }, + { + .algs_list = algs_sha256, + .size = ARRAY_SIZE(algs_sha256), }, }; static const struct stm32_hash_pdata stm32_hash_pdata_stm32f7 = { .algs_info = stm32_hash_algs_info_stm32f7, .algs_info_size = ARRAY_SIZE(stm32_hash_algs_info_stm32f7), + .has_sr = true, + .has_mdmat = true, }; static const struct of_device_id stm32_hash_of_match[] = { + { + .compatible = "stericsson,ux500-hash", + .data = &stm32_hash_pdata_ux500, + }, { .compatible = "st,stm32f456-hash", .data = &stm32_hash_pdata_stm32f4, @@ -1456,16 +1627,23 @@ static int stm32_hash_probe(struct platform_device *pdev) if (ret) return ret; - irq = platform_get_irq(pdev, 0); - if (irq < 0) + irq = platform_get_irq_optional(pdev, 0); + if (irq < 0 && irq != -ENXIO) return irq; - ret = devm_request_threaded_irq(dev, irq, stm32_hash_irq_handler, - stm32_hash_irq_thread, IRQF_ONESHOT, - dev_name(dev), hdev); - if (ret) { - dev_err(dev, "Cannot grab IRQ\n"); - return ret; + if (irq > 0) { + ret = devm_request_threaded_irq(dev, irq, + stm32_hash_irq_handler, + stm32_hash_irq_thread, + IRQF_ONESHOT, + dev_name(dev), hdev); + if (ret) { + dev_err(dev, "Cannot grab IRQ\n"); + return ret; + } + } else { + dev_info(dev, "No IRQ, use polling mode\n"); + hdev->polled = true; } hdev->clk = devm_clk_get(&pdev->dev, NULL); @@ -1507,9 +1685,11 @@ static int stm32_hash_probe(struct platform_device *pdev) case 0: break; case -ENOENT: - dev_dbg(dev, "DMA mode not available\n"); + case -ENODEV: + dev_info(dev, "DMA mode not available\n"); break; default: + dev_err(dev, "DMA init error %d\n", ret); goto err_dma; } @@ -1528,7 +1708,11 @@ static int stm32_hash_probe(struct platform_device *pdev) if (ret) goto err_engine_start; - hdev->dma_mode = stm32_hash_read(hdev, HASH_HWCFGR); + if (hdev->pdata->ux500) + /* FIXME: implement DMA mode for Ux500 */ + hdev->dma_mode = 0; + else + hdev->dma_mode = stm32_hash_read(hdev, HASH_HWCFGR); /* Register algos */ ret = stm32_hash_register_algs(hdev); From dd7b7972cb896c33f20915f7c89173b3927340e5 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Wed, 25 Jan 2023 01:23:11 +0100 Subject: [PATCH 078/156] crypto: ux500/hash - delete driver It turns out we can just modify the newer STM32 HASH driver to be used with Ux500 and now that we have done that, delete the old and sparsely maintained Ux500 HASH driver. Signed-off-by: Linus Walleij Signed-off-by: Herbert Xu --- drivers/crypto/Kconfig | 10 - drivers/crypto/Makefile | 1 - drivers/crypto/ux500/Kconfig | 21 - drivers/crypto/ux500/Makefile | 7 - drivers/crypto/ux500/hash/Makefile | 11 - drivers/crypto/ux500/hash/hash_alg.h | 398 ----- drivers/crypto/ux500/hash/hash_core.c | 1966 ------------------------- 7 files changed, 2414 deletions(-) delete mode 100644 drivers/crypto/ux500/Kconfig delete mode 100644 drivers/crypto/ux500/Makefile delete mode 100644 drivers/crypto/ux500/hash/Makefile delete mode 100644 drivers/crypto/ux500/hash/hash_alg.h delete mode 100644 drivers/crypto/ux500/hash/hash_core.c diff --git a/drivers/crypto/Kconfig b/drivers/crypto/Kconfig index dfb103f81a64..3b2516d1433f 100644 --- a/drivers/crypto/Kconfig +++ b/drivers/crypto/Kconfig @@ -390,16 +390,6 @@ if CRYPTO_DEV_NX source "drivers/crypto/nx/Kconfig" endif -config CRYPTO_DEV_UX500 - tristate "Driver for ST-Ericsson UX500 crypto hardware acceleration" - depends on ARCH_U8500 - help - Driver for ST-Ericsson UX500 crypto engine. - -if CRYPTO_DEV_UX500 - source "drivers/crypto/ux500/Kconfig" -endif # if CRYPTO_DEV_UX500 - config CRYPTO_DEV_ATMEL_AUTHENC bool "Support for Atmel IPSEC/SSL hw accelerator" depends on ARCH_AT91 || COMPILE_TEST diff --git a/drivers/crypto/Makefile b/drivers/crypto/Makefile index fa8bf1be1a8c..476f1a25ca32 100644 --- a/drivers/crypto/Makefile +++ b/drivers/crypto/Makefile @@ -43,7 +43,6 @@ obj-$(CONFIG_CRYPTO_DEV_SAHARA) += sahara.o obj-$(CONFIG_CRYPTO_DEV_SL3516) += gemini/ obj-y += stm32/ obj-$(CONFIG_CRYPTO_DEV_TALITOS) += talitos.o -obj-$(CONFIG_CRYPTO_DEV_UX500) += ux500/ obj-$(CONFIG_CRYPTO_DEV_VIRTIO) += virtio/ obj-$(CONFIG_CRYPTO_DEV_VMX) += vmx/ obj-$(CONFIG_CRYPTO_DEV_BCM_SPU) += bcm/ diff --git a/drivers/crypto/ux500/Kconfig b/drivers/crypto/ux500/Kconfig deleted file mode 100644 index ac89cd2de12a..000000000000 --- a/drivers/crypto/ux500/Kconfig +++ /dev/null @@ -1,21 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -# -# Copyright (C) ST-Ericsson SA 2010 -# Author: Shujuan Chen (shujuan.chen@stericsson.com) -# - -config CRYPTO_DEV_UX500_HASH - tristate "UX500 crypto driver for HASH block" - depends on CRYPTO_DEV_UX500 - select CRYPTO_HASH - select CRYPTO_SHA1 - select CRYPTO_SHA256 - help - This selects the hash driver for the UX500_HASH hardware. - Depends on UX500/STM DMA if running in DMA mode. - -config CRYPTO_DEV_UX500_DEBUG - bool "Activate debug-mode for UX500 crypto driver for HASH block" - depends on CRYPTO_DEV_UX500_HASH - help - Say Y if you want to add debug prints to ux500_hash devices. diff --git a/drivers/crypto/ux500/Makefile b/drivers/crypto/ux500/Makefile deleted file mode 100644 index f1aa4edf66f4..000000000000 --- a/drivers/crypto/ux500/Makefile +++ /dev/null @@ -1,7 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -# -# Copyright (C) ST-Ericsson SA 2010 -# Author: Shujuan Chen (shujuan.chen@stericsson.com) -# - -obj-$(CONFIG_CRYPTO_DEV_UX500_HASH) += hash/ diff --git a/drivers/crypto/ux500/hash/Makefile b/drivers/crypto/ux500/hash/Makefile deleted file mode 100644 index a8f088724772..000000000000 --- a/drivers/crypto/ux500/hash/Makefile +++ /dev/null @@ -1,11 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -# -# Copyright (C) ST-Ericsson SA 2010 -# Author: Shujuan Chen (shujuan.chen@stericsson.com) -# -ifdef CONFIG_CRYPTO_DEV_UX500_DEBUG -CFLAGS_hash_core.o := -DDEBUG -endif - -obj-$(CONFIG_CRYPTO_DEV_UX500_HASH) += ux500_hash.o -ux500_hash-objs := hash_core.o diff --git a/drivers/crypto/ux500/hash/hash_alg.h b/drivers/crypto/ux500/hash/hash_alg.h deleted file mode 100644 index 7c9bcc15125f..000000000000 --- a/drivers/crypto/ux500/hash/hash_alg.h +++ /dev/null @@ -1,398 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) ST-Ericsson SA 2010 - * Author: Shujuan Chen (shujuan.chen@stericsson.com) - * Author: Joakim Bech (joakim.xx.bech@stericsson.com) - * Author: Berne Hebark (berne.hebark@stericsson.com)) - */ -#ifndef _HASH_ALG_H -#define _HASH_ALG_H - -#include - -#define HASH_BLOCK_SIZE 64 -#define HASH_DMA_FIFO 4 -#define HASH_DMA_ALIGN_SIZE 4 -#define HASH_DMA_PERFORMANCE_MIN_SIZE 1024 -#define HASH_BYTES_PER_WORD 4 - -/* Maximum value of the length's high word */ -#define HASH_HIGH_WORD_MAX_VAL 0xFFFFFFFFUL - -/* Power on Reset values HASH registers */ -#define HASH_RESET_CR_VALUE 0x0 -#define HASH_RESET_STR_VALUE 0x0 - -/* Number of context swap registers */ -#define HASH_CSR_COUNT 52 - -#define HASH_RESET_CSRX_REG_VALUE 0x0 -#define HASH_RESET_CSFULL_REG_VALUE 0x0 -#define HASH_RESET_CSDATAIN_REG_VALUE 0x0 - -#define HASH_RESET_INDEX_VAL 0x0 -#define HASH_RESET_BIT_INDEX_VAL 0x0 -#define HASH_RESET_BUFFER_VAL 0x0 -#define HASH_RESET_LEN_HIGH_VAL 0x0 -#define HASH_RESET_LEN_LOW_VAL 0x0 - -/* Control register bitfields */ -#define HASH_CR_RESUME_MASK 0x11FCF - -#define HASH_CR_SWITCHON_POS 31 -#define HASH_CR_SWITCHON_MASK BIT(31) - -#define HASH_CR_EMPTYMSG_POS 20 -#define HASH_CR_EMPTYMSG_MASK BIT(20) - -#define HASH_CR_DINF_POS 12 -#define HASH_CR_DINF_MASK BIT(12) - -#define HASH_CR_NBW_POS 8 -#define HASH_CR_NBW_MASK 0x00000F00UL - -#define HASH_CR_LKEY_POS 16 -#define HASH_CR_LKEY_MASK BIT(16) - -#define HASH_CR_ALGO_POS 7 -#define HASH_CR_ALGO_MASK BIT(7) - -#define HASH_CR_MODE_POS 6 -#define HASH_CR_MODE_MASK BIT(6) - -#define HASH_CR_DATAFORM_POS 4 -#define HASH_CR_DATAFORM_MASK (BIT(4) | BIT(5)) - -#define HASH_CR_DMAE_POS 3 -#define HASH_CR_DMAE_MASK BIT(3) - -#define HASH_CR_INIT_POS 2 -#define HASH_CR_INIT_MASK BIT(2) - -#define HASH_CR_PRIVN_POS 1 -#define HASH_CR_PRIVN_MASK BIT(1) - -#define HASH_CR_SECN_POS 0 -#define HASH_CR_SECN_MASK BIT(0) - -/* Start register bitfields */ -#define HASH_STR_DCAL_POS 8 -#define HASH_STR_DCAL_MASK BIT(8) -#define HASH_STR_DEFAULT 0x0 - -#define HASH_STR_NBLW_POS 0 -#define HASH_STR_NBLW_MASK 0x0000001FUL - -#define HASH_NBLW_MAX_VAL 0x1F - -/* PrimeCell IDs */ -#define HASH_P_ID0 0xE0 -#define HASH_P_ID1 0x05 -#define HASH_P_ID2 0x38 -#define HASH_P_ID3 0x00 -#define HASH_CELL_ID0 0x0D -#define HASH_CELL_ID1 0xF0 -#define HASH_CELL_ID2 0x05 -#define HASH_CELL_ID3 0xB1 - -#define HASH_SET_BITS(reg_name, mask) \ - writel_relaxed((readl_relaxed(reg_name) | mask), reg_name) - -#define HASH_CLEAR_BITS(reg_name, mask) \ - writel_relaxed((readl_relaxed(reg_name) & ~mask), reg_name) - -#define HASH_PUT_BITS(reg, val, shift, mask) \ - writel_relaxed(((readl(reg) & ~(mask)) | \ - (((u32)val << shift) & (mask))), reg) - -#define HASH_SET_DIN(val, len) writesl(&device_data->base->din, (val), (len)) - -#define HASH_INITIALIZE \ - HASH_PUT_BITS( \ - &device_data->base->cr, \ - 0x01, HASH_CR_INIT_POS, \ - HASH_CR_INIT_MASK) - -#define HASH_SET_DATA_FORMAT(data_format) \ - HASH_PUT_BITS( \ - &device_data->base->cr, \ - (u32) (data_format), HASH_CR_DATAFORM_POS, \ - HASH_CR_DATAFORM_MASK) -#define HASH_SET_NBLW(val) \ - HASH_PUT_BITS( \ - &device_data->base->str, \ - (u32) (val), HASH_STR_NBLW_POS, \ - HASH_STR_NBLW_MASK) -#define HASH_SET_DCAL \ - HASH_PUT_BITS( \ - &device_data->base->str, \ - 0x01, HASH_STR_DCAL_POS, \ - HASH_STR_DCAL_MASK) - -/* Hardware access method */ -enum hash_mode { - HASH_MODE_CPU, - HASH_MODE_DMA -}; - -/** - * struct uint64 - Structure to handle 64 bits integers. - * @high_word: Most significant bits. - * @low_word: Least significant bits. - * - * Used to handle 64 bits integers. - */ -struct uint64 { - u32 high_word; - u32 low_word; -}; - -/** - * struct hash_register - Contains all registers in ux500 hash hardware. - * @cr: HASH control register (0x000). - * @din: HASH data input register (0x004). - * @str: HASH start register (0x008). - * @hx: HASH digest register 0..7 (0x00c-0x01C). - * @padding0: Reserved (0x02C). - * @itcr: Integration test control register (0x080). - * @itip: Integration test input register (0x084). - * @itop: Integration test output register (0x088). - * @padding1: Reserved (0x08C). - * @csfull: HASH context full register (0x0F8). - * @csdatain: HASH context swap data input register (0x0FC). - * @csrx: HASH context swap register 0..51 (0x100-0x1CC). - * @padding2: Reserved (0x1D0). - * @periphid0: HASH peripheral identification register 0 (0xFE0). - * @periphid1: HASH peripheral identification register 1 (0xFE4). - * @periphid2: HASH peripheral identification register 2 (0xFE8). - * @periphid3: HASH peripheral identification register 3 (0xFEC). - * @cellid0: HASH PCell identification register 0 (0xFF0). - * @cellid1: HASH PCell identification register 1 (0xFF4). - * @cellid2: HASH PCell identification register 2 (0xFF8). - * @cellid3: HASH PCell identification register 3 (0xFFC). - * - * The device communicates to the HASH via 32-bit-wide control registers - * accessible via the 32-bit width AMBA rev. 2.0 AHB Bus. Below is a structure - * with the registers used. - */ -struct hash_register { - u32 cr; - u32 din; - u32 str; - u32 hx[8]; - - u32 padding0[(0x080 - 0x02C) / sizeof(u32)]; - - u32 itcr; - u32 itip; - u32 itop; - - u32 padding1[(0x0F8 - 0x08C) / sizeof(u32)]; - - u32 csfull; - u32 csdatain; - u32 csrx[HASH_CSR_COUNT]; - - u32 padding2[(0xFE0 - 0x1D0) / sizeof(u32)]; - - u32 periphid0; - u32 periphid1; - u32 periphid2; - u32 periphid3; - - u32 cellid0; - u32 cellid1; - u32 cellid2; - u32 cellid3; -}; - -/** - * struct hash_state - Hash context state. - * @temp_cr: Temporary HASH Control Register. - * @str_reg: HASH Start Register. - * @din_reg: HASH Data Input Register. - * @csr[52]: HASH Context Swap Registers 0-39. - * @csfull: HASH Context Swap Registers 40 ie Status flags. - * @csdatain: HASH Context Swap Registers 41 ie Input data. - * @buffer: Working buffer for messages going to the hardware. - * @length: Length of the part of message hashed so far (floor(N/64) * 64). - * @index: Valid number of bytes in buffer (N % 64). - * @bit_index: Valid number of bits in buffer (N % 8). - * - * This structure is used between context switches, i.e. when ongoing jobs are - * interupted with new jobs. When this happens we need to store intermediate - * results in software. - * - * WARNING: "index" is the member of the structure, to be sure that "buffer" - * is aligned on a 4-bytes boundary. This is highly implementation dependent - * and MUST be checked whenever this code is ported on new platforms. - */ -struct hash_state { - u32 temp_cr; - u32 str_reg; - u32 din_reg; - u32 csr[52]; - u32 csfull; - u32 csdatain; - u32 buffer[HASH_BLOCK_SIZE / sizeof(u32)]; - struct uint64 length; - u8 index; - u8 bit_index; -}; - -/** - * enum hash_device_id - HASH device ID. - * @HASH_DEVICE_ID_0: Hash hardware with ID 0 - * @HASH_DEVICE_ID_1: Hash hardware with ID 1 - */ -enum hash_device_id { - HASH_DEVICE_ID_0 = 0, - HASH_DEVICE_ID_1 = 1 -}; - -/** - * enum hash_data_format - HASH data format. - * @HASH_DATA_32_BITS: 32 bits data format - * @HASH_DATA_16_BITS: 16 bits data format - * @HASH_DATA_8_BITS: 8 bits data format. - * @HASH_DATA_1_BITS: 1 bit data format. - */ -enum hash_data_format { - HASH_DATA_32_BITS = 0x0, - HASH_DATA_16_BITS = 0x1, - HASH_DATA_8_BITS = 0x2, - HASH_DATA_1_BIT = 0x3 -}; - -/** - * enum hash_algo - Enumeration for selecting between SHA1 or SHA2 algorithm. - * @HASH_ALGO_SHA1: Indicates that SHA1 is used. - * @HASH_ALGO_SHA2: Indicates that SHA2 (SHA256) is used. - */ -enum hash_algo { - HASH_ALGO_SHA1 = 0x0, - HASH_ALGO_SHA256 = 0x1 -}; - -/** - * enum hash_op - Enumeration for selecting between HASH or HMAC mode. - * @HASH_OPER_MODE_HASH: Indicates usage of normal HASH mode. - * @HASH_OPER_MODE_HMAC: Indicates usage of HMAC. - */ -enum hash_op { - HASH_OPER_MODE_HASH = 0x0, - HASH_OPER_MODE_HMAC = 0x1 -}; - -/** - * struct hash_config - Configuration data for the hardware. - * @data_format: Format of data entered into the hash data in register. - * @algorithm: Algorithm selection bit. - * @oper_mode: Operating mode selection bit. - */ -struct hash_config { - int data_format; - int algorithm; - int oper_mode; -}; - -/** - * struct hash_dma - Structure used for dma. - * @mask: DMA capabilities bitmap mask. - * @complete: Used to maintain state for a "completion". - * @chan_mem2hash: DMA channel. - * @cfg_mem2hash: DMA channel configuration. - * @sg_len: Scatterlist length. - * @sg: Scatterlist. - * @nents: Number of sg entries. - */ -struct hash_dma { - dma_cap_mask_t mask; - struct completion complete; - struct dma_chan *chan_mem2hash; - void *cfg_mem2hash; - int sg_len; - struct scatterlist *sg; - int nents; -}; - -/** - * struct hash_ctx - The context used for hash calculations. - * @key: The key used in the operation. - * @keylen: The length of the key. - * @state: The state of the current calculations. - * @config: The current configuration. - * @digestsize: The size of current digest. - * @device: Pointer to the device structure. - */ -struct hash_ctx { - u8 *key; - u32 keylen; - struct hash_config config; - int digestsize; - struct hash_device_data *device; -}; - -/** - * struct hash_ctx - The request context used for hash calculations. - * @state: The state of the current calculations. - * @dma_mode: Used in special cases (workaround), e.g. need to change to - * cpu mode, if not supported/working in dma mode. - * @updated: Indicates if hardware is initialized for new operations. - */ -struct hash_req_ctx { - struct hash_state state; - bool dma_mode; - u8 updated; -}; - -/** - * struct hash_device_data - structure for a hash device. - * @base: Pointer to virtual base address of the hash device. - * @phybase: Pointer to physical memory location of the hash device. - * @list_node: For inclusion in klist. - * @dev: Pointer to the device dev structure. - * @ctx_lock: Spinlock for current_ctx. - * @current_ctx: Pointer to the currently allocated context. - * @power_state: TRUE = power state on, FALSE = power state off. - * @power_state_lock: Spinlock for power_state. - * @regulator: Pointer to the device's power control. - * @clk: Pointer to the device's clock control. - * @restore_dev_state: TRUE = saved state, FALSE = no saved state. - * @dma: Structure used for dma. - */ -struct hash_device_data { - struct hash_register __iomem *base; - phys_addr_t phybase; - struct klist_node list_node; - struct device *dev; - spinlock_t ctx_lock; - struct hash_ctx *current_ctx; - bool power_state; - spinlock_t power_state_lock; - struct regulator *regulator; - struct clk *clk; - bool restore_dev_state; - struct hash_state state; /* Used for saving and resuming state */ - struct hash_dma dma; -}; - -int hash_check_hw(struct hash_device_data *device_data); - -int hash_setconfiguration(struct hash_device_data *device_data, - struct hash_config *config); - -void hash_begin(struct hash_device_data *device_data, struct hash_ctx *ctx); - -void hash_get_digest(struct hash_device_data *device_data, - u8 *digest, int algorithm); - -int hash_hw_update(struct ahash_request *req); - -int hash_save_state(struct hash_device_data *device_data, - struct hash_state *state); - -int hash_resume_state(struct hash_device_data *device_data, - const struct hash_state *state); - -#endif diff --git a/drivers/crypto/ux500/hash/hash_core.c b/drivers/crypto/ux500/hash/hash_core.c deleted file mode 100644 index f104e8a43036..000000000000 --- a/drivers/crypto/ux500/hash/hash_core.c +++ /dev/null @@ -1,1966 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Cryptographic API. - * Support for Nomadik hardware crypto engine. - - * Copyright (C) ST-Ericsson SA 2010 - * Author: Shujuan Chen for ST-Ericsson - * Author: Joakim Bech for ST-Ericsson - * Author: Berne Hebark for ST-Ericsson. - * Author: Niklas Hernaeus for ST-Ericsson. - * Author: Andreas Westin for ST-Ericsson. - */ - -#define pr_fmt(fmt) "hashX hashX: " fmt - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#include -#include -#include - -#include - -#include "hash_alg.h" - -static int hash_mode; -module_param(hash_mode, int, 0); -MODULE_PARM_DESC(hash_mode, "CPU or DMA mode. CPU = 0 (default), DMA = 1"); - -/* HMAC-SHA1, no key */ -static const u8 zero_message_hmac_sha1[SHA1_DIGEST_SIZE] = { - 0xfb, 0xdb, 0x1d, 0x1b, 0x18, 0xaa, 0x6c, 0x08, - 0x32, 0x4b, 0x7d, 0x64, 0xb7, 0x1f, 0xb7, 0x63, - 0x70, 0x69, 0x0e, 0x1d -}; - -/* HMAC-SHA256, no key */ -static const u8 zero_message_hmac_sha256[SHA256_DIGEST_SIZE] = { - 0xb6, 0x13, 0x67, 0x9a, 0x08, 0x14, 0xd9, 0xec, - 0x77, 0x2f, 0x95, 0xd7, 0x78, 0xc3, 0x5f, 0xc5, - 0xff, 0x16, 0x97, 0xc4, 0x93, 0x71, 0x56, 0x53, - 0xc6, 0xc7, 0x12, 0x14, 0x42, 0x92, 0xc5, 0xad -}; - -/** - * struct hash_driver_data - data specific to the driver. - * - * @device_list: A list of registered devices to choose from. - * @device_allocation: A semaphore initialized with number of devices. - */ -struct hash_driver_data { - struct klist device_list; - struct semaphore device_allocation; -}; - -static struct hash_driver_data driver_data; - -/* Declaration of functions */ -/** - * hash_messagepad - Pads a message and write the nblw bits. - * @device_data: Structure for the hash device. - * @message: Last word of a message - * @index_bytes: The number of bytes in the last message - * - * This function manages the final part of the digest calculation, when less - * than 512 bits (64 bytes) remain in message. This means index_bytes < 64. - * - */ -static void hash_messagepad(struct hash_device_data *device_data, - const u32 *message, u8 index_bytes); - -/** - * release_hash_device - Releases a previously allocated hash device. - * @device_data: Structure for the hash device. - * - */ -static void release_hash_device(struct hash_device_data *device_data) -{ - spin_lock(&device_data->ctx_lock); - device_data->current_ctx->device = NULL; - device_data->current_ctx = NULL; - spin_unlock(&device_data->ctx_lock); - - /* - * The down_interruptible part for this semaphore is called in - * cryp_get_device_data. - */ - up(&driver_data.device_allocation); -} - -static void hash_dma_setup_channel(struct hash_device_data *device_data, - struct device *dev) -{ - struct hash_platform_data *platform_data = dev->platform_data; - struct dma_slave_config conf = { - .direction = DMA_MEM_TO_DEV, - .dst_addr = device_data->phybase + HASH_DMA_FIFO, - .dst_addr_width = DMA_SLAVE_BUSWIDTH_2_BYTES, - .dst_maxburst = 16, - }; - - dma_cap_zero(device_data->dma.mask); - dma_cap_set(DMA_SLAVE, device_data->dma.mask); - - device_data->dma.cfg_mem2hash = platform_data->mem_to_engine; - device_data->dma.chan_mem2hash = - dma_request_channel(device_data->dma.mask, - platform_data->dma_filter, - device_data->dma.cfg_mem2hash); - - dmaengine_slave_config(device_data->dma.chan_mem2hash, &conf); - - init_completion(&device_data->dma.complete); -} - -static void hash_dma_callback(void *data) -{ - struct hash_ctx *ctx = data; - - complete(&ctx->device->dma.complete); -} - -static int hash_set_dma_transfer(struct hash_ctx *ctx, struct scatterlist *sg, - int len, enum dma_data_direction direction) -{ - struct dma_async_tx_descriptor *desc = NULL; - struct dma_chan *channel = NULL; - - if (direction != DMA_TO_DEVICE) { - dev_err(ctx->device->dev, "%s: Invalid DMA direction\n", - __func__); - return -EFAULT; - } - - sg->length = ALIGN(sg->length, HASH_DMA_ALIGN_SIZE); - - channel = ctx->device->dma.chan_mem2hash; - ctx->device->dma.sg = sg; - ctx->device->dma.sg_len = dma_map_sg(channel->device->dev, - ctx->device->dma.sg, ctx->device->dma.nents, - direction); - - if (!ctx->device->dma.sg_len) { - dev_err(ctx->device->dev, "%s: Could not map the sg list (TO_DEVICE)\n", - __func__); - return -EFAULT; - } - - dev_dbg(ctx->device->dev, "%s: Setting up DMA for buffer (TO_DEVICE)\n", - __func__); - desc = dmaengine_prep_slave_sg(channel, - ctx->device->dma.sg, ctx->device->dma.sg_len, - DMA_MEM_TO_DEV, DMA_CTRL_ACK | DMA_PREP_INTERRUPT); - if (!desc) { - dev_err(ctx->device->dev, - "%s: dmaengine_prep_slave_sg() failed!\n", __func__); - return -EFAULT; - } - - desc->callback = hash_dma_callback; - desc->callback_param = ctx; - - dmaengine_submit(desc); - dma_async_issue_pending(channel); - - return 0; -} - -static void hash_dma_done(struct hash_ctx *ctx) -{ - struct dma_chan *chan; - - chan = ctx->device->dma.chan_mem2hash; - dmaengine_terminate_all(chan); - dma_unmap_sg(chan->device->dev, ctx->device->dma.sg, - ctx->device->dma.nents, DMA_TO_DEVICE); -} - -static int hash_dma_write(struct hash_ctx *ctx, - struct scatterlist *sg, int len) -{ - int error = hash_set_dma_transfer(ctx, sg, len, DMA_TO_DEVICE); - if (error) { - dev_dbg(ctx->device->dev, - "%s: hash_set_dma_transfer() failed\n", __func__); - return error; - } - - return len; -} - -/** - * get_empty_message_digest - Returns a pre-calculated digest for - * the empty message. - * @device_data: Structure for the hash device. - * @zero_hash: Buffer to return the empty message digest. - * @zero_hash_size: Hash size of the empty message digest. - * @zero_digest: True if zero_digest returned. - */ -static int get_empty_message_digest( - struct hash_device_data *device_data, - u8 *zero_hash, u32 *zero_hash_size, bool *zero_digest) -{ - int ret = 0; - struct hash_ctx *ctx = device_data->current_ctx; - *zero_digest = false; - - /** - * Caller responsible for ctx != NULL. - */ - - if (HASH_OPER_MODE_HASH == ctx->config.oper_mode) { - if (HASH_ALGO_SHA1 == ctx->config.algorithm) { - memcpy(zero_hash, &sha1_zero_message_hash[0], - SHA1_DIGEST_SIZE); - *zero_hash_size = SHA1_DIGEST_SIZE; - *zero_digest = true; - } else if (HASH_ALGO_SHA256 == - ctx->config.algorithm) { - memcpy(zero_hash, &sha256_zero_message_hash[0], - SHA256_DIGEST_SIZE); - *zero_hash_size = SHA256_DIGEST_SIZE; - *zero_digest = true; - } else { - dev_err(device_data->dev, "%s: Incorrect algorithm!\n", - __func__); - ret = -EINVAL; - goto out; - } - } else if (HASH_OPER_MODE_HMAC == ctx->config.oper_mode) { - if (!ctx->keylen) { - if (HASH_ALGO_SHA1 == ctx->config.algorithm) { - memcpy(zero_hash, &zero_message_hmac_sha1[0], - SHA1_DIGEST_SIZE); - *zero_hash_size = SHA1_DIGEST_SIZE; - *zero_digest = true; - } else if (HASH_ALGO_SHA256 == ctx->config.algorithm) { - memcpy(zero_hash, &zero_message_hmac_sha256[0], - SHA256_DIGEST_SIZE); - *zero_hash_size = SHA256_DIGEST_SIZE; - *zero_digest = true; - } else { - dev_err(device_data->dev, "%s: Incorrect algorithm!\n", - __func__); - ret = -EINVAL; - goto out; - } - } else { - dev_dbg(device_data->dev, - "%s: Continue hash calculation, since hmac key available\n", - __func__); - } - } -out: - - return ret; -} - -/** - * hash_disable_power - Request to disable power and clock. - * @device_data: Structure for the hash device. - * @save_device_state: If true, saves the current hw state. - * - * This function request for disabling power (regulator) and clock, - * and could also save current hw state. - */ -static int hash_disable_power(struct hash_device_data *device_data, - bool save_device_state) -{ - int ret = 0; - struct device *dev = device_data->dev; - - spin_lock(&device_data->power_state_lock); - if (!device_data->power_state) - goto out; - - if (save_device_state) { - hash_save_state(device_data, - &device_data->state); - device_data->restore_dev_state = true; - } - - clk_disable(device_data->clk); - ret = regulator_disable(device_data->regulator); - if (ret) - dev_err(dev, "%s: regulator_disable() failed!\n", __func__); - - device_data->power_state = false; - -out: - spin_unlock(&device_data->power_state_lock); - - return ret; -} - -/** - * hash_enable_power - Request to enable power and clock. - * @device_data: Structure for the hash device. - * @restore_device_state: If true, restores a previous saved hw state. - * - * This function request for enabling power (regulator) and clock, - * and could also restore a previously saved hw state. - */ -static int hash_enable_power(struct hash_device_data *device_data, - bool restore_device_state) -{ - int ret = 0; - struct device *dev = device_data->dev; - - spin_lock(&device_data->power_state_lock); - if (!device_data->power_state) { - ret = regulator_enable(device_data->regulator); - if (ret) { - dev_err(dev, "%s: regulator_enable() failed!\n", - __func__); - goto out; - } - ret = clk_enable(device_data->clk); - if (ret) { - dev_err(dev, "%s: clk_enable() failed!\n", __func__); - ret = regulator_disable( - device_data->regulator); - goto out; - } - device_data->power_state = true; - } - - if (device_data->restore_dev_state) { - if (restore_device_state) { - device_data->restore_dev_state = false; - hash_resume_state(device_data, &device_data->state); - } - } -out: - spin_unlock(&device_data->power_state_lock); - - return ret; -} - -/** - * hash_get_device_data - Checks for an available hash device and return it. - * @ctx: Structure for the hash context. - * @device_data: Structure for the hash device. - * - * This function check for an available hash device and return it to - * the caller. - * Note! Caller need to release the device, calling up(). - */ -static int hash_get_device_data(struct hash_ctx *ctx, - struct hash_device_data **device_data) -{ - int ret; - struct klist_iter device_iterator; - struct klist_node *device_node; - struct hash_device_data *local_device_data = NULL; - - /* Wait until a device is available */ - ret = down_interruptible(&driver_data.device_allocation); - if (ret) - return ret; /* Interrupted */ - - /* Select a device */ - klist_iter_init(&driver_data.device_list, &device_iterator); - device_node = klist_next(&device_iterator); - while (device_node) { - local_device_data = container_of(device_node, - struct hash_device_data, list_node); - spin_lock(&local_device_data->ctx_lock); - /* current_ctx allocates a device, NULL = unallocated */ - if (local_device_data->current_ctx) { - device_node = klist_next(&device_iterator); - } else { - local_device_data->current_ctx = ctx; - ctx->device = local_device_data; - spin_unlock(&local_device_data->ctx_lock); - break; - } - spin_unlock(&local_device_data->ctx_lock); - } - klist_iter_exit(&device_iterator); - - if (!device_node) { - /** - * No free device found. - * Since we allocated a device with down_interruptible, this - * should not be able to happen. - * Number of available devices, which are contained in - * device_allocation, is therefore decremented by not doing - * an up(device_allocation). - */ - return -EBUSY; - } - - *device_data = local_device_data; - - return 0; -} - -/** - * hash_hw_write_key - Writes the key to the hardware registries. - * - * @device_data: Structure for the hash device. - * @key: Key to be written. - * @keylen: The lengt of the key. - * - * Note! This function DOES NOT write to the NBLW registry, even though - * specified in the hw design spec. Either due to incorrect info in the - * spec or due to a bug in the hw. - */ -static void hash_hw_write_key(struct hash_device_data *device_data, - const u8 *key, unsigned int keylen) -{ - u32 word = 0; - int nwords = 1; - - HASH_CLEAR_BITS(&device_data->base->str, HASH_STR_NBLW_MASK); - - while (keylen >= 4) { - u32 *key_word = (u32 *)key; - - HASH_SET_DIN(key_word, nwords); - keylen -= 4; - key += 4; - } - - /* Take care of the remaining bytes in the last word */ - if (keylen) { - word = 0; - while (keylen) { - word |= (key[keylen - 1] << (8 * (keylen - 1))); - keylen--; - } - - HASH_SET_DIN(&word, nwords); - } - - while (readl(&device_data->base->str) & HASH_STR_DCAL_MASK) - cpu_relax(); - - HASH_SET_DCAL; - - while (readl(&device_data->base->str) & HASH_STR_DCAL_MASK) - cpu_relax(); -} - -/** - * init_hash_hw - Initialise the hash hardware for a new calculation. - * @device_data: Structure for the hash device. - * @ctx: The hash context. - * - * This function will enable the bits needed to clear and start a new - * calculation. - */ -static int init_hash_hw(struct hash_device_data *device_data, - struct hash_ctx *ctx) -{ - int ret = 0; - - ret = hash_setconfiguration(device_data, &ctx->config); - if (ret) { - dev_err(device_data->dev, "%s: hash_setconfiguration() failed!\n", - __func__); - return ret; - } - - hash_begin(device_data, ctx); - - if (ctx->config.oper_mode == HASH_OPER_MODE_HMAC) - hash_hw_write_key(device_data, ctx->key, ctx->keylen); - - return ret; -} - -/** - * hash_get_nents - Return number of entries (nents) in scatterlist (sg). - * - * @sg: Scatterlist. - * @size: Size in bytes. - * @aligned: True if sg data aligned to work in DMA mode. - * - */ -static int hash_get_nents(struct scatterlist *sg, int size, bool *aligned) -{ - int nents = 0; - bool aligned_data = true; - - while (size > 0 && sg) { - nents++; - size -= sg->length; - - /* hash_set_dma_transfer will align last nent */ - if ((aligned && !IS_ALIGNED(sg->offset, HASH_DMA_ALIGN_SIZE)) || - (!IS_ALIGNED(sg->length, HASH_DMA_ALIGN_SIZE) && size > 0)) - aligned_data = false; - - sg = sg_next(sg); - } - - if (aligned) - *aligned = aligned_data; - - if (size != 0) - return -EFAULT; - - return nents; -} - -/** - * hash_dma_valid_data - checks for dma valid sg data. - * @sg: Scatterlist. - * @datasize: Datasize in bytes. - * - * NOTE! This function checks for dma valid sg data, since dma - * only accept datasizes of even wordsize. - */ -static bool hash_dma_valid_data(struct scatterlist *sg, int datasize) -{ - bool aligned; - - /* Need to include at least one nent, else error */ - if (hash_get_nents(sg, datasize, &aligned) < 1) - return false; - - return aligned; -} - -/** - * ux500_hash_init - Common hash init function for SHA1/SHA2 (SHA256). - * @req: The hash request for the job. - * - * Initialize structures. - */ -static int ux500_hash_init(struct ahash_request *req) -{ - struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); - struct hash_ctx *ctx = crypto_ahash_ctx(tfm); - struct hash_req_ctx *req_ctx = ahash_request_ctx(req); - - if (!ctx->key) - ctx->keylen = 0; - - memset(&req_ctx->state, 0, sizeof(struct hash_state)); - req_ctx->updated = 0; - if (hash_mode == HASH_MODE_DMA) { - if (req->nbytes < HASH_DMA_ALIGN_SIZE) { - req_ctx->dma_mode = false; /* Don't use DMA */ - - pr_debug("%s: DMA mode, but direct to CPU mode for data size < %d\n", - __func__, HASH_DMA_ALIGN_SIZE); - } else { - if (req->nbytes >= HASH_DMA_PERFORMANCE_MIN_SIZE && - hash_dma_valid_data(req->src, req->nbytes)) { - req_ctx->dma_mode = true; - } else { - req_ctx->dma_mode = false; - pr_debug("%s: DMA mode, but use CPU mode for datalength < %d or non-aligned data, except in last nent\n", - __func__, - HASH_DMA_PERFORMANCE_MIN_SIZE); - } - } - } - return 0; -} - -/** - * hash_processblock - This function processes a single block of 512 bits (64 - * bytes), word aligned, starting at message. - * @device_data: Structure for the hash device. - * @message: Block (512 bits) of message to be written to - * the HASH hardware. - * @length: Message length - * - */ -static void hash_processblock(struct hash_device_data *device_data, - const u32 *message, int length) -{ - int len = length / HASH_BYTES_PER_WORD; - /* - * NBLW bits. Reset the number of bits in last word (NBLW). - */ - HASH_CLEAR_BITS(&device_data->base->str, HASH_STR_NBLW_MASK); - - /* - * Write message data to the HASH_DIN register. - */ - HASH_SET_DIN(message, len); -} - -/** - * hash_messagepad - Pads a message and write the nblw bits. - * @device_data: Structure for the hash device. - * @message: Last word of a message. - * @index_bytes: The number of bytes in the last message. - * - * This function manages the final part of the digest calculation, when less - * than 512 bits (64 bytes) remain in message. This means index_bytes < 64. - * - */ -static void hash_messagepad(struct hash_device_data *device_data, - const u32 *message, u8 index_bytes) -{ - int nwords = 1; - - /* - * Clear hash str register, only clear NBLW - * since DCAL will be reset by hardware. - */ - HASH_CLEAR_BITS(&device_data->base->str, HASH_STR_NBLW_MASK); - - /* Main loop */ - while (index_bytes >= 4) { - HASH_SET_DIN(message, nwords); - index_bytes -= 4; - message++; - } - - if (index_bytes) - HASH_SET_DIN(message, nwords); - - while (readl(&device_data->base->str) & HASH_STR_DCAL_MASK) - cpu_relax(); - - /* num_of_bytes == 0 => NBLW <- 0 (32 bits valid in DATAIN) */ - HASH_SET_NBLW(index_bytes * 8); - dev_dbg(device_data->dev, "%s: DIN=0x%08x NBLW=%lu\n", - __func__, readl_relaxed(&device_data->base->din), - readl_relaxed(&device_data->base->str) & HASH_STR_NBLW_MASK); - HASH_SET_DCAL; - dev_dbg(device_data->dev, "%s: after dcal -> DIN=0x%08x NBLW=%lu\n", - __func__, readl_relaxed(&device_data->base->din), - readl_relaxed(&device_data->base->str) & HASH_STR_NBLW_MASK); - - while (readl(&device_data->base->str) & HASH_STR_DCAL_MASK) - cpu_relax(); -} - -/** - * hash_incrementlength - Increments the length of the current message. - * @ctx: Hash context - * @incr: Length of message processed already - * - * Overflow cannot occur, because conditions for overflow are checked in - * hash_hw_update. - */ -static void hash_incrementlength(struct hash_req_ctx *ctx, u32 incr) -{ - ctx->state.length.low_word += incr; - - /* Check for wrap-around */ - if (ctx->state.length.low_word < incr) - ctx->state.length.high_word++; -} - -/** - * hash_setconfiguration - Sets the required configuration for the hash - * hardware. - * @device_data: Structure for the hash device. - * @config: Pointer to a configuration structure. - */ -int hash_setconfiguration(struct hash_device_data *device_data, - struct hash_config *config) -{ - int ret = 0; - - if (config->algorithm != HASH_ALGO_SHA1 && - config->algorithm != HASH_ALGO_SHA256) - return -EPERM; - - /* - * DATAFORM bits. Set the DATAFORM bits to 0b11, which means the data - * to be written to HASH_DIN is considered as 32 bits. - */ - HASH_SET_DATA_FORMAT(config->data_format); - - /* - * ALGO bit. Set to 0b1 for SHA-1 and 0b0 for SHA-256 - */ - switch (config->algorithm) { - case HASH_ALGO_SHA1: - HASH_SET_BITS(&device_data->base->cr, HASH_CR_ALGO_MASK); - break; - - case HASH_ALGO_SHA256: - HASH_CLEAR_BITS(&device_data->base->cr, HASH_CR_ALGO_MASK); - break; - - default: - dev_err(device_data->dev, "%s: Incorrect algorithm\n", - __func__); - return -EPERM; - } - - /* - * MODE bit. This bit selects between HASH or HMAC mode for the - * selected algorithm. 0b0 = HASH and 0b1 = HMAC. - */ - if (HASH_OPER_MODE_HASH == config->oper_mode) - HASH_CLEAR_BITS(&device_data->base->cr, - HASH_CR_MODE_MASK); - else if (HASH_OPER_MODE_HMAC == config->oper_mode) { - HASH_SET_BITS(&device_data->base->cr, HASH_CR_MODE_MASK); - if (device_data->current_ctx->keylen > HASH_BLOCK_SIZE) { - /* Truncate key to blocksize */ - dev_dbg(device_data->dev, "%s: LKEY set\n", __func__); - HASH_SET_BITS(&device_data->base->cr, - HASH_CR_LKEY_MASK); - } else { - dev_dbg(device_data->dev, "%s: LKEY cleared\n", - __func__); - HASH_CLEAR_BITS(&device_data->base->cr, - HASH_CR_LKEY_MASK); - } - } else { /* Wrong hash mode */ - ret = -EPERM; - dev_err(device_data->dev, "%s: HASH_INVALID_PARAMETER!\n", - __func__); - } - return ret; -} - -/** - * hash_begin - This routine resets some globals and initializes the hash - * hardware. - * @device_data: Structure for the hash device. - * @ctx: Hash context. - */ -void hash_begin(struct hash_device_data *device_data, struct hash_ctx *ctx) -{ - /* HW and SW initializations */ - /* Note: there is no need to initialize buffer and digest members */ - - while (readl(&device_data->base->str) & HASH_STR_DCAL_MASK) - cpu_relax(); - - /* - * INIT bit. Set this bit to 0b1 to reset the HASH processor core and - * prepare the initialize the HASH accelerator to compute the message - * digest of a new message. - */ - HASH_INITIALIZE; - - /* - * NBLW bits. Reset the number of bits in last word (NBLW). - */ - HASH_CLEAR_BITS(&device_data->base->str, HASH_STR_NBLW_MASK); -} - -static int hash_process_data(struct hash_device_data *device_data, - struct hash_ctx *ctx, struct hash_req_ctx *req_ctx, - int msg_length, u8 *data_buffer, u8 *buffer, - u8 *index) -{ - int ret = 0; - u32 count; - - do { - if ((*index + msg_length) < HASH_BLOCK_SIZE) { - for (count = 0; count < msg_length; count++) { - buffer[*index + count] = - *(data_buffer + count); - } - *index += msg_length; - msg_length = 0; - } else { - if (req_ctx->updated) { - ret = hash_resume_state(device_data, - &device_data->state); - memmove(req_ctx->state.buffer, - device_data->state.buffer, - HASH_BLOCK_SIZE); - if (ret) { - dev_err(device_data->dev, - "%s: hash_resume_state() failed!\n", - __func__); - goto out; - } - } else { - ret = init_hash_hw(device_data, ctx); - if (ret) { - dev_err(device_data->dev, - "%s: init_hash_hw() failed!\n", - __func__); - goto out; - } - req_ctx->updated = 1; - } - /* - * If 'data_buffer' is four byte aligned and - * local buffer does not have any data, we can - * write data directly from 'data_buffer' to - * HW peripheral, otherwise we first copy data - * to a local buffer - */ - if (IS_ALIGNED((unsigned long)data_buffer, 4) && - (0 == *index)) - hash_processblock(device_data, - (const u32 *)data_buffer, - HASH_BLOCK_SIZE); - else { - for (count = 0; - count < (u32)(HASH_BLOCK_SIZE - *index); - count++) { - buffer[*index + count] = - *(data_buffer + count); - } - hash_processblock(device_data, - (const u32 *)buffer, - HASH_BLOCK_SIZE); - } - hash_incrementlength(req_ctx, HASH_BLOCK_SIZE); - data_buffer += (HASH_BLOCK_SIZE - *index); - - msg_length -= (HASH_BLOCK_SIZE - *index); - *index = 0; - - ret = hash_save_state(device_data, - &device_data->state); - - memmove(device_data->state.buffer, - req_ctx->state.buffer, - HASH_BLOCK_SIZE); - if (ret) { - dev_err(device_data->dev, "%s: hash_save_state() failed!\n", - __func__); - goto out; - } - } - } while (msg_length != 0); -out: - - return ret; -} - -/** - * hash_dma_final - The hash dma final function for SHA1/SHA256. - * @req: The hash request for the job. - */ -static int hash_dma_final(struct ahash_request *req) -{ - int ret = 0; - struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); - struct hash_ctx *ctx = crypto_ahash_ctx(tfm); - struct hash_req_ctx *req_ctx = ahash_request_ctx(req); - struct hash_device_data *device_data; - u8 digest[SHA256_DIGEST_SIZE]; - int bytes_written = 0; - - ret = hash_get_device_data(ctx, &device_data); - if (ret) - return ret; - - dev_dbg(device_data->dev, "%s: (ctx=0x%lx)!\n", __func__, - (unsigned long)ctx); - - if (req_ctx->updated) { - ret = hash_resume_state(device_data, &device_data->state); - - if (ret) { - dev_err(device_data->dev, "%s: hash_resume_state() failed!\n", - __func__); - goto out; - } - } else { - ret = hash_setconfiguration(device_data, &ctx->config); - if (ret) { - dev_err(device_data->dev, - "%s: hash_setconfiguration() failed!\n", - __func__); - goto out; - } - - /* Enable DMA input */ - if (hash_mode != HASH_MODE_DMA || !req_ctx->dma_mode) { - HASH_CLEAR_BITS(&device_data->base->cr, - HASH_CR_DMAE_MASK); - } else { - HASH_SET_BITS(&device_data->base->cr, - HASH_CR_DMAE_MASK); - HASH_SET_BITS(&device_data->base->cr, - HASH_CR_PRIVN_MASK); - } - - HASH_INITIALIZE; - - if (ctx->config.oper_mode == HASH_OPER_MODE_HMAC) - hash_hw_write_key(device_data, ctx->key, ctx->keylen); - - /* Number of bits in last word = (nbytes * 8) % 32 */ - HASH_SET_NBLW((req->nbytes * 8) % 32); - req_ctx->updated = 1; - } - - /* Store the nents in the dma struct. */ - ctx->device->dma.nents = hash_get_nents(req->src, req->nbytes, NULL); - if (!ctx->device->dma.nents) { - dev_err(device_data->dev, "%s: ctx->device->dma.nents = 0\n", - __func__); - ret = ctx->device->dma.nents; - goto out; - } - - bytes_written = hash_dma_write(ctx, req->src, req->nbytes); - if (bytes_written != req->nbytes) { - dev_err(device_data->dev, "%s: hash_dma_write() failed!\n", - __func__); - ret = bytes_written; - goto out; - } - - wait_for_completion(&ctx->device->dma.complete); - hash_dma_done(ctx); - - while (readl(&device_data->base->str) & HASH_STR_DCAL_MASK) - cpu_relax(); - - if (ctx->config.oper_mode == HASH_OPER_MODE_HMAC && ctx->key) { - unsigned int keylen = ctx->keylen; - u8 *key = ctx->key; - - dev_dbg(device_data->dev, "%s: keylen: %d\n", - __func__, ctx->keylen); - hash_hw_write_key(device_data, key, keylen); - } - - hash_get_digest(device_data, digest, ctx->config.algorithm); - memcpy(req->result, digest, ctx->digestsize); - -out: - release_hash_device(device_data); - - /** - * Allocated in setkey, and only used in HMAC. - */ - kfree(ctx->key); - - return ret; -} - -/** - * hash_hw_final - The final hash calculation function - * @req: The hash request for the job. - */ -static int hash_hw_final(struct ahash_request *req) -{ - int ret = 0; - struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); - struct hash_ctx *ctx = crypto_ahash_ctx(tfm); - struct hash_req_ctx *req_ctx = ahash_request_ctx(req); - struct hash_device_data *device_data; - u8 digest[SHA256_DIGEST_SIZE]; - - ret = hash_get_device_data(ctx, &device_data); - if (ret) - return ret; - - dev_dbg(device_data->dev, "%s: (ctx=0x%lx)!\n", __func__, - (unsigned long)ctx); - - if (req_ctx->updated) { - ret = hash_resume_state(device_data, &device_data->state); - - if (ret) { - dev_err(device_data->dev, - "%s: hash_resume_state() failed!\n", __func__); - goto out; - } - } else if (req->nbytes == 0 && ctx->keylen == 0) { - u8 zero_hash[SHA256_DIGEST_SIZE]; - u32 zero_hash_size = 0; - bool zero_digest = false; - /** - * Use a pre-calculated empty message digest - * (workaround since hw return zeroes, hw bug!?) - */ - ret = get_empty_message_digest(device_data, &zero_hash[0], - &zero_hash_size, &zero_digest); - if (!ret && likely(zero_hash_size == ctx->digestsize) && - zero_digest) { - memcpy(req->result, &zero_hash[0], ctx->digestsize); - goto out; - } else if (!ret && !zero_digest) { - dev_dbg(device_data->dev, - "%s: HMAC zero msg with key, continue...\n", - __func__); - } else { - dev_err(device_data->dev, - "%s: ret=%d, or wrong digest size? %s\n", - __func__, ret, - zero_hash_size == ctx->digestsize ? - "true" : "false"); - /* Return error */ - goto out; - } - } else if (req->nbytes == 0 && ctx->keylen > 0) { - ret = -EPERM; - dev_err(device_data->dev, "%s: Empty message with keylength > 0, NOT supported\n", - __func__); - goto out; - } - - if (!req_ctx->updated) { - ret = init_hash_hw(device_data, ctx); - if (ret) { - dev_err(device_data->dev, - "%s: init_hash_hw() failed!\n", __func__); - goto out; - } - } - - if (req_ctx->state.index) { - hash_messagepad(device_data, req_ctx->state.buffer, - req_ctx->state.index); - } else { - HASH_SET_DCAL; - while (readl(&device_data->base->str) & HASH_STR_DCAL_MASK) - cpu_relax(); - } - - if (ctx->config.oper_mode == HASH_OPER_MODE_HMAC && ctx->key) { - unsigned int keylen = ctx->keylen; - u8 *key = ctx->key; - - dev_dbg(device_data->dev, "%s: keylen: %d\n", - __func__, ctx->keylen); - hash_hw_write_key(device_data, key, keylen); - } - - hash_get_digest(device_data, digest, ctx->config.algorithm); - memcpy(req->result, digest, ctx->digestsize); - -out: - release_hash_device(device_data); - - /** - * Allocated in setkey, and only used in HMAC. - */ - kfree(ctx->key); - - return ret; -} - -/** - * hash_hw_update - Updates current HASH computation hashing another part of - * the message. - * @req: Byte array containing the message to be hashed (caller - * allocated). - */ -int hash_hw_update(struct ahash_request *req) -{ - int ret = 0; - u8 index = 0; - u8 *buffer; - struct hash_device_data *device_data; - u8 *data_buffer; - struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); - struct hash_ctx *ctx = crypto_ahash_ctx(tfm); - struct hash_req_ctx *req_ctx = ahash_request_ctx(req); - struct crypto_hash_walk walk; - int msg_length; - - index = req_ctx->state.index; - buffer = (u8 *)req_ctx->state.buffer; - - ret = hash_get_device_data(ctx, &device_data); - if (ret) - return ret; - - msg_length = crypto_hash_walk_first(req, &walk); - - /* Empty message ("") is correct indata */ - if (msg_length == 0) { - ret = 0; - goto release_dev; - } - - /* Check if ctx->state.length + msg_length - overflows */ - if (msg_length > (req_ctx->state.length.low_word + msg_length) && - HASH_HIGH_WORD_MAX_VAL == req_ctx->state.length.high_word) { - pr_err("%s: HASH_MSG_LENGTH_OVERFLOW!\n", __func__); - ret = crypto_hash_walk_done(&walk, -EPERM); - goto release_dev; - } - - /* Main loop */ - while (0 != msg_length) { - data_buffer = walk.data; - ret = hash_process_data(device_data, ctx, req_ctx, msg_length, - data_buffer, buffer, &index); - - if (ret) { - dev_err(device_data->dev, "%s: hash_internal_hw_update() failed!\n", - __func__); - crypto_hash_walk_done(&walk, ret); - goto release_dev; - } - - msg_length = crypto_hash_walk_done(&walk, 0); - } - - req_ctx->state.index = index; - dev_dbg(device_data->dev, "%s: indata length=%d, bin=%d\n", - __func__, req_ctx->state.index, req_ctx->state.bit_index); - -release_dev: - release_hash_device(device_data); - - return ret; -} - -/** - * hash_resume_state - Function that resumes the state of an calculation. - * @device_data: Pointer to the device structure. - * @device_state: The state to be restored in the hash hardware - */ -int hash_resume_state(struct hash_device_data *device_data, - const struct hash_state *device_state) -{ - u32 temp_cr; - s32 count; - int hash_mode = HASH_OPER_MODE_HASH; - - if (NULL == device_state) { - dev_err(device_data->dev, "%s: HASH_INVALID_PARAMETER!\n", - __func__); - return -EPERM; - } - - /* Check correctness of index and length members */ - if (device_state->index > HASH_BLOCK_SIZE || - (device_state->length.low_word % HASH_BLOCK_SIZE) != 0) { - dev_err(device_data->dev, "%s: HASH_INVALID_PARAMETER!\n", - __func__); - return -EPERM; - } - - /* - * INIT bit. Set this bit to 0b1 to reset the HASH processor core and - * prepare the initialize the HASH accelerator to compute the message - * digest of a new message. - */ - HASH_INITIALIZE; - - temp_cr = device_state->temp_cr; - writel_relaxed(temp_cr & HASH_CR_RESUME_MASK, &device_data->base->cr); - - if (readl(&device_data->base->cr) & HASH_CR_MODE_MASK) - hash_mode = HASH_OPER_MODE_HMAC; - else - hash_mode = HASH_OPER_MODE_HASH; - - for (count = 0; count < HASH_CSR_COUNT; count++) { - if ((count >= 36) && (hash_mode == HASH_OPER_MODE_HASH)) - break; - - writel_relaxed(device_state->csr[count], - &device_data->base->csrx[count]); - } - - writel_relaxed(device_state->csfull, &device_data->base->csfull); - writel_relaxed(device_state->csdatain, &device_data->base->csdatain); - - writel_relaxed(device_state->str_reg, &device_data->base->str); - writel_relaxed(temp_cr, &device_data->base->cr); - - return 0; -} - -/** - * hash_save_state - Function that saves the state of hardware. - * @device_data: Pointer to the device structure. - * @device_state: The strucure where the hardware state should be saved. - */ -int hash_save_state(struct hash_device_data *device_data, - struct hash_state *device_state) -{ - u32 temp_cr; - u32 count; - int hash_mode = HASH_OPER_MODE_HASH; - - if (NULL == device_state) { - dev_err(device_data->dev, "%s: HASH_INVALID_PARAMETER!\n", - __func__); - return -ENOTSUPP; - } - - /* Write dummy value to force digest intermediate calculation. This - * actually makes sure that there isn't any ongoing calculation in the - * hardware. - */ - while (readl(&device_data->base->str) & HASH_STR_DCAL_MASK) - cpu_relax(); - - temp_cr = readl_relaxed(&device_data->base->cr); - - device_state->str_reg = readl_relaxed(&device_data->base->str); - - device_state->din_reg = readl_relaxed(&device_data->base->din); - - if (readl(&device_data->base->cr) & HASH_CR_MODE_MASK) - hash_mode = HASH_OPER_MODE_HMAC; - else - hash_mode = HASH_OPER_MODE_HASH; - - for (count = 0; count < HASH_CSR_COUNT; count++) { - if ((count >= 36) && (hash_mode == HASH_OPER_MODE_HASH)) - break; - - device_state->csr[count] = - readl_relaxed(&device_data->base->csrx[count]); - } - - device_state->csfull = readl_relaxed(&device_data->base->csfull); - device_state->csdatain = readl_relaxed(&device_data->base->csdatain); - - device_state->temp_cr = temp_cr; - - return 0; -} - -/** - * hash_check_hw - This routine checks for peripheral Ids and PCell Ids. - * @device_data: - * - */ -int hash_check_hw(struct hash_device_data *device_data) -{ - /* Checking Peripheral Ids */ - if (HASH_P_ID0 == readl_relaxed(&device_data->base->periphid0) && - HASH_P_ID1 == readl_relaxed(&device_data->base->periphid1) && - HASH_P_ID2 == readl_relaxed(&device_data->base->periphid2) && - HASH_P_ID3 == readl_relaxed(&device_data->base->periphid3) && - HASH_CELL_ID0 == readl_relaxed(&device_data->base->cellid0) && - HASH_CELL_ID1 == readl_relaxed(&device_data->base->cellid1) && - HASH_CELL_ID2 == readl_relaxed(&device_data->base->cellid2) && - HASH_CELL_ID3 == readl_relaxed(&device_data->base->cellid3)) { - return 0; - } - - dev_err(device_data->dev, "%s: HASH_UNSUPPORTED_HW!\n", __func__); - return -ENOTSUPP; -} - -/** - * hash_get_digest - Gets the digest. - * @device_data: Pointer to the device structure. - * @digest: User allocated byte array for the calculated digest. - * @algorithm: The algorithm in use. - */ -void hash_get_digest(struct hash_device_data *device_data, - u8 *digest, int algorithm) -{ - u32 temp_hx_val, count; - int loop_ctr; - - if (algorithm != HASH_ALGO_SHA1 && algorithm != HASH_ALGO_SHA256) { - dev_err(device_data->dev, "%s: Incorrect algorithm %d\n", - __func__, algorithm); - return; - } - - if (algorithm == HASH_ALGO_SHA1) - loop_ctr = SHA1_DIGEST_SIZE / sizeof(u32); - else - loop_ctr = SHA256_DIGEST_SIZE / sizeof(u32); - - dev_dbg(device_data->dev, "%s: digest array:(0x%lx)\n", - __func__, (unsigned long)digest); - - /* Copy result into digest array */ - for (count = 0; count < loop_ctr; count++) { - temp_hx_val = readl_relaxed(&device_data->base->hx[count]); - digest[count * 4] = (u8) ((temp_hx_val >> 24) & 0xFF); - digest[count * 4 + 1] = (u8) ((temp_hx_val >> 16) & 0xFF); - digest[count * 4 + 2] = (u8) ((temp_hx_val >> 8) & 0xFF); - digest[count * 4 + 3] = (u8) ((temp_hx_val >> 0) & 0xFF); - } -} - -/** - * ahash_update - The hash update function for SHA1/SHA2 (SHA256). - * @req: The hash request for the job. - */ -static int ahash_update(struct ahash_request *req) -{ - int ret = 0; - struct hash_req_ctx *req_ctx = ahash_request_ctx(req); - - if (hash_mode != HASH_MODE_DMA || !req_ctx->dma_mode) - ret = hash_hw_update(req); - /* Skip update for DMA, all data will be passed to DMA in final */ - - if (ret) { - pr_err("%s: hash_hw_update() failed!\n", __func__); - } - - return ret; -} - -/** - * ahash_final - The hash final function for SHA1/SHA2 (SHA256). - * @req: The hash request for the job. - */ -static int ahash_final(struct ahash_request *req) -{ - int ret = 0; - struct hash_req_ctx *req_ctx = ahash_request_ctx(req); - - pr_debug("%s: data size: %d\n", __func__, req->nbytes); - - if ((hash_mode == HASH_MODE_DMA) && req_ctx->dma_mode) - ret = hash_dma_final(req); - else - ret = hash_hw_final(req); - - if (ret) { - pr_err("%s: hash_hw/dma_final() failed\n", __func__); - } - - return ret; -} - -static int hash_setkey(struct crypto_ahash *tfm, - const u8 *key, unsigned int keylen, int alg) -{ - int ret = 0; - struct hash_ctx *ctx = crypto_ahash_ctx(tfm); - - /** - * Freed in final. - */ - ctx->key = kmemdup(key, keylen, GFP_KERNEL); - if (!ctx->key) { - pr_err("%s: Failed to allocate ctx->key for %d\n", - __func__, alg); - return -ENOMEM; - } - ctx->keylen = keylen; - - return ret; -} - -static int ahash_sha1_init(struct ahash_request *req) -{ - struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); - struct hash_ctx *ctx = crypto_ahash_ctx(tfm); - - ctx->config.data_format = HASH_DATA_8_BITS; - ctx->config.algorithm = HASH_ALGO_SHA1; - ctx->config.oper_mode = HASH_OPER_MODE_HASH; - ctx->digestsize = SHA1_DIGEST_SIZE; - - return ux500_hash_init(req); -} - -static int ahash_sha256_init(struct ahash_request *req) -{ - struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); - struct hash_ctx *ctx = crypto_ahash_ctx(tfm); - - ctx->config.data_format = HASH_DATA_8_BITS; - ctx->config.algorithm = HASH_ALGO_SHA256; - ctx->config.oper_mode = HASH_OPER_MODE_HASH; - ctx->digestsize = SHA256_DIGEST_SIZE; - - return ux500_hash_init(req); -} - -static int ahash_sha1_digest(struct ahash_request *req) -{ - int ret2, ret1; - - ret1 = ahash_sha1_init(req); - if (ret1) - goto out; - - ret1 = ahash_update(req); - ret2 = ahash_final(req); - -out: - return ret1 ? ret1 : ret2; -} - -static int ahash_sha256_digest(struct ahash_request *req) -{ - int ret2, ret1; - - ret1 = ahash_sha256_init(req); - if (ret1) - goto out; - - ret1 = ahash_update(req); - ret2 = ahash_final(req); - -out: - return ret1 ? ret1 : ret2; -} - -static int ahash_noimport(struct ahash_request *req, const void *in) -{ - return -ENOSYS; -} - -static int ahash_noexport(struct ahash_request *req, void *out) -{ - return -ENOSYS; -} - -static int hmac_sha1_init(struct ahash_request *req) -{ - struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); - struct hash_ctx *ctx = crypto_ahash_ctx(tfm); - - ctx->config.data_format = HASH_DATA_8_BITS; - ctx->config.algorithm = HASH_ALGO_SHA1; - ctx->config.oper_mode = HASH_OPER_MODE_HMAC; - ctx->digestsize = SHA1_DIGEST_SIZE; - - return ux500_hash_init(req); -} - -static int hmac_sha256_init(struct ahash_request *req) -{ - struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); - struct hash_ctx *ctx = crypto_ahash_ctx(tfm); - - ctx->config.data_format = HASH_DATA_8_BITS; - ctx->config.algorithm = HASH_ALGO_SHA256; - ctx->config.oper_mode = HASH_OPER_MODE_HMAC; - ctx->digestsize = SHA256_DIGEST_SIZE; - - return ux500_hash_init(req); -} - -static int hmac_sha1_digest(struct ahash_request *req) -{ - int ret2, ret1; - - ret1 = hmac_sha1_init(req); - if (ret1) - goto out; - - ret1 = ahash_update(req); - ret2 = ahash_final(req); - -out: - return ret1 ? ret1 : ret2; -} - -static int hmac_sha256_digest(struct ahash_request *req) -{ - int ret2, ret1; - - ret1 = hmac_sha256_init(req); - if (ret1) - goto out; - - ret1 = ahash_update(req); - ret2 = ahash_final(req); - -out: - return ret1 ? ret1 : ret2; -} - -static int hmac_sha1_setkey(struct crypto_ahash *tfm, - const u8 *key, unsigned int keylen) -{ - return hash_setkey(tfm, key, keylen, HASH_ALGO_SHA1); -} - -static int hmac_sha256_setkey(struct crypto_ahash *tfm, - const u8 *key, unsigned int keylen) -{ - return hash_setkey(tfm, key, keylen, HASH_ALGO_SHA256); -} - -struct hash_algo_template { - struct hash_config conf; - struct ahash_alg hash; -}; - -static int hash_cra_init(struct crypto_tfm *tfm) -{ - struct hash_ctx *ctx = crypto_tfm_ctx(tfm); - struct crypto_alg *alg = tfm->__crt_alg; - struct hash_algo_template *hash_alg; - - hash_alg = container_of(__crypto_ahash_alg(alg), - struct hash_algo_template, - hash); - - crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm), - sizeof(struct hash_req_ctx)); - - ctx->config.data_format = HASH_DATA_8_BITS; - ctx->config.algorithm = hash_alg->conf.algorithm; - ctx->config.oper_mode = hash_alg->conf.oper_mode; - - ctx->digestsize = hash_alg->hash.halg.digestsize; - - return 0; -} - -static struct hash_algo_template hash_algs[] = { - { - .conf.algorithm = HASH_ALGO_SHA1, - .conf.oper_mode = HASH_OPER_MODE_HASH, - .hash = { - .init = ux500_hash_init, - .update = ahash_update, - .final = ahash_final, - .digest = ahash_sha1_digest, - .export = ahash_noexport, - .import = ahash_noimport, - .halg.digestsize = SHA1_DIGEST_SIZE, - .halg.statesize = sizeof(struct hash_ctx), - .halg.base = { - .cra_name = "sha1", - .cra_driver_name = "sha1-ux500", - .cra_flags = CRYPTO_ALG_ASYNC, - .cra_blocksize = SHA1_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct hash_ctx), - .cra_init = hash_cra_init, - .cra_module = THIS_MODULE, - } - } - }, - { - .conf.algorithm = HASH_ALGO_SHA256, - .conf.oper_mode = HASH_OPER_MODE_HASH, - .hash = { - .init = ux500_hash_init, - .update = ahash_update, - .final = ahash_final, - .digest = ahash_sha256_digest, - .export = ahash_noexport, - .import = ahash_noimport, - .halg.digestsize = SHA256_DIGEST_SIZE, - .halg.statesize = sizeof(struct hash_ctx), - .halg.base = { - .cra_name = "sha256", - .cra_driver_name = "sha256-ux500", - .cra_flags = CRYPTO_ALG_ASYNC, - .cra_blocksize = SHA256_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct hash_ctx), - .cra_init = hash_cra_init, - .cra_module = THIS_MODULE, - } - } - }, - { - .conf.algorithm = HASH_ALGO_SHA1, - .conf.oper_mode = HASH_OPER_MODE_HMAC, - .hash = { - .init = ux500_hash_init, - .update = ahash_update, - .final = ahash_final, - .digest = hmac_sha1_digest, - .setkey = hmac_sha1_setkey, - .export = ahash_noexport, - .import = ahash_noimport, - .halg.digestsize = SHA1_DIGEST_SIZE, - .halg.statesize = sizeof(struct hash_ctx), - .halg.base = { - .cra_name = "hmac(sha1)", - .cra_driver_name = "hmac-sha1-ux500", - .cra_flags = CRYPTO_ALG_ASYNC, - .cra_blocksize = SHA1_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct hash_ctx), - .cra_init = hash_cra_init, - .cra_module = THIS_MODULE, - } - } - }, - { - .conf.algorithm = HASH_ALGO_SHA256, - .conf.oper_mode = HASH_OPER_MODE_HMAC, - .hash = { - .init = ux500_hash_init, - .update = ahash_update, - .final = ahash_final, - .digest = hmac_sha256_digest, - .setkey = hmac_sha256_setkey, - .export = ahash_noexport, - .import = ahash_noimport, - .halg.digestsize = SHA256_DIGEST_SIZE, - .halg.statesize = sizeof(struct hash_ctx), - .halg.base = { - .cra_name = "hmac(sha256)", - .cra_driver_name = "hmac-sha256-ux500", - .cra_flags = CRYPTO_ALG_ASYNC, - .cra_blocksize = SHA256_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct hash_ctx), - .cra_init = hash_cra_init, - .cra_module = THIS_MODULE, - } - } - } -}; - -static int ahash_algs_register_all(struct hash_device_data *device_data) -{ - int ret; - int i; - int count; - - for (i = 0; i < ARRAY_SIZE(hash_algs); i++) { - ret = crypto_register_ahash(&hash_algs[i].hash); - if (ret) { - count = i; - dev_err(device_data->dev, "%s: alg registration failed\n", - hash_algs[i].hash.halg.base.cra_driver_name); - goto unreg; - } - } - return 0; -unreg: - for (i = 0; i < count; i++) - crypto_unregister_ahash(&hash_algs[i].hash); - return ret; -} - -static void ahash_algs_unregister_all(struct hash_device_data *device_data) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(hash_algs); i++) - crypto_unregister_ahash(&hash_algs[i].hash); -} - -/** - * ux500_hash_probe - Function that probes the hash hardware. - * @pdev: The platform device. - */ -static int ux500_hash_probe(struct platform_device *pdev) -{ - int ret = 0; - struct resource *res = NULL; - struct hash_device_data *device_data; - struct device *dev = &pdev->dev; - - device_data = devm_kzalloc(dev, sizeof(*device_data), GFP_KERNEL); - if (!device_data) { - ret = -ENOMEM; - goto out; - } - - device_data->dev = dev; - device_data->current_ctx = NULL; - - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (!res) { - dev_dbg(dev, "%s: platform_get_resource() failed!\n", __func__); - ret = -ENODEV; - goto out; - } - - device_data->phybase = res->start; - device_data->base = devm_ioremap_resource(dev, res); - if (IS_ERR(device_data->base)) { - ret = PTR_ERR(device_data->base); - goto out; - } - spin_lock_init(&device_data->ctx_lock); - spin_lock_init(&device_data->power_state_lock); - - /* Enable power for HASH1 hardware block */ - device_data->regulator = regulator_get(dev, "v-ape"); - if (IS_ERR(device_data->regulator)) { - dev_err(dev, "%s: regulator_get() failed!\n", __func__); - ret = PTR_ERR(device_data->regulator); - device_data->regulator = NULL; - goto out; - } - - /* Enable the clock for HASH1 hardware block */ - device_data->clk = devm_clk_get(dev, NULL); - if (IS_ERR(device_data->clk)) { - dev_err(dev, "%s: clk_get() failed!\n", __func__); - ret = PTR_ERR(device_data->clk); - goto out_regulator; - } - - ret = clk_prepare(device_data->clk); - if (ret) { - dev_err(dev, "%s: clk_prepare() failed!\n", __func__); - goto out_regulator; - } - - /* Enable device power (and clock) */ - ret = hash_enable_power(device_data, false); - if (ret) { - dev_err(dev, "%s: hash_enable_power() failed!\n", __func__); - goto out_clk_unprepare; - } - - ret = hash_check_hw(device_data); - if (ret) { - dev_err(dev, "%s: hash_check_hw() failed!\n", __func__); - goto out_power; - } - - if (hash_mode == HASH_MODE_DMA) - hash_dma_setup_channel(device_data, dev); - - platform_set_drvdata(pdev, device_data); - - /* Put the new device into the device list... */ - klist_add_tail(&device_data->list_node, &driver_data.device_list); - /* ... and signal that a new device is available. */ - up(&driver_data.device_allocation); - - ret = ahash_algs_register_all(device_data); - if (ret) { - dev_err(dev, "%s: ahash_algs_register_all() failed!\n", - __func__); - goto out_power; - } - - dev_info(dev, "successfully registered\n"); - return 0; - -out_power: - hash_disable_power(device_data, false); - -out_clk_unprepare: - clk_unprepare(device_data->clk); - -out_regulator: - regulator_put(device_data->regulator); - -out: - return ret; -} - -/** - * ux500_hash_remove - Function that removes the hash device from the platform. - * @pdev: The platform device. - */ -static int ux500_hash_remove(struct platform_device *pdev) -{ - struct hash_device_data *device_data; - struct device *dev = &pdev->dev; - - device_data = platform_get_drvdata(pdev); - if (!device_data) { - dev_err(dev, "%s: platform_get_drvdata() failed!\n", __func__); - return -ENOMEM; - } - - /* Try to decrease the number of available devices. */ - if (down_trylock(&driver_data.device_allocation)) - return -EBUSY; - - /* Check that the device is free */ - spin_lock(&device_data->ctx_lock); - /* current_ctx allocates a device, NULL = unallocated */ - if (device_data->current_ctx) { - /* The device is busy */ - spin_unlock(&device_data->ctx_lock); - /* Return the device to the pool. */ - up(&driver_data.device_allocation); - return -EBUSY; - } - - spin_unlock(&device_data->ctx_lock); - - /* Remove the device from the list */ - if (klist_node_attached(&device_data->list_node)) - klist_remove(&device_data->list_node); - - /* If this was the last device, remove the services */ - if (list_empty(&driver_data.device_list.k_list)) - ahash_algs_unregister_all(device_data); - - if (hash_disable_power(device_data, false)) - dev_err(dev, "%s: hash_disable_power() failed\n", - __func__); - - clk_unprepare(device_data->clk); - regulator_put(device_data->regulator); - - return 0; -} - -/** - * ux500_hash_shutdown - Function that shutdown the hash device. - * @pdev: The platform device - */ -static void ux500_hash_shutdown(struct platform_device *pdev) -{ - struct hash_device_data *device_data; - - device_data = platform_get_drvdata(pdev); - if (!device_data) { - dev_err(&pdev->dev, "%s: platform_get_drvdata() failed!\n", - __func__); - return; - } - - /* Check that the device is free */ - spin_lock(&device_data->ctx_lock); - /* current_ctx allocates a device, NULL = unallocated */ - if (!device_data->current_ctx) { - if (down_trylock(&driver_data.device_allocation)) - dev_dbg(&pdev->dev, "%s: Cryp still in use! Shutting down anyway...\n", - __func__); - /** - * (Allocate the device) - * Need to set this to non-null (dummy) value, - * to avoid usage if context switching. - */ - device_data->current_ctx++; - } - spin_unlock(&device_data->ctx_lock); - - /* Remove the device from the list */ - if (klist_node_attached(&device_data->list_node)) - klist_remove(&device_data->list_node); - - /* If this was the last device, remove the services */ - if (list_empty(&driver_data.device_list.k_list)) - ahash_algs_unregister_all(device_data); - - if (hash_disable_power(device_data, false)) - dev_err(&pdev->dev, "%s: hash_disable_power() failed\n", - __func__); -} - -#ifdef CONFIG_PM_SLEEP -/** - * ux500_hash_suspend - Function that suspends the hash device. - * @dev: Device to suspend. - */ -static int ux500_hash_suspend(struct device *dev) -{ - int ret; - struct hash_device_data *device_data; - struct hash_ctx *temp_ctx = NULL; - - device_data = dev_get_drvdata(dev); - if (!device_data) { - dev_err(dev, "%s: platform_get_drvdata() failed!\n", __func__); - return -ENOMEM; - } - - spin_lock(&device_data->ctx_lock); - if (!device_data->current_ctx) - device_data->current_ctx++; - spin_unlock(&device_data->ctx_lock); - - if (device_data->current_ctx == ++temp_ctx) { - if (down_interruptible(&driver_data.device_allocation)) - dev_dbg(dev, "%s: down_interruptible() failed\n", - __func__); - ret = hash_disable_power(device_data, false); - - } else { - ret = hash_disable_power(device_data, true); - } - - if (ret) - dev_err(dev, "%s: hash_disable_power()\n", __func__); - - return ret; -} - -/** - * ux500_hash_resume - Function that resume the hash device. - * @dev: Device to resume. - */ -static int ux500_hash_resume(struct device *dev) -{ - int ret = 0; - struct hash_device_data *device_data; - struct hash_ctx *temp_ctx = NULL; - - device_data = dev_get_drvdata(dev); - if (!device_data) { - dev_err(dev, "%s: platform_get_drvdata() failed!\n", __func__); - return -ENOMEM; - } - - spin_lock(&device_data->ctx_lock); - if (device_data->current_ctx == ++temp_ctx) - device_data->current_ctx = NULL; - spin_unlock(&device_data->ctx_lock); - - if (!device_data->current_ctx) - up(&driver_data.device_allocation); - else - ret = hash_enable_power(device_data, true); - - if (ret) - dev_err(dev, "%s: hash_enable_power() failed!\n", __func__); - - return ret; -} -#endif - -static SIMPLE_DEV_PM_OPS(ux500_hash_pm, ux500_hash_suspend, ux500_hash_resume); - -static const struct of_device_id ux500_hash_match[] = { - { .compatible = "stericsson,ux500-hash" }, - { }, -}; -MODULE_DEVICE_TABLE(of, ux500_hash_match); - -static struct platform_driver hash_driver = { - .probe = ux500_hash_probe, - .remove = ux500_hash_remove, - .shutdown = ux500_hash_shutdown, - .driver = { - .name = "hash1", - .of_match_table = ux500_hash_match, - .pm = &ux500_hash_pm, - } -}; - -/** - * ux500_hash_mod_init - The kernel module init function. - */ -static int __init ux500_hash_mod_init(void) -{ - klist_init(&driver_data.device_list, NULL, NULL); - /* Initialize the semaphore to 0 devices (locked state) */ - sema_init(&driver_data.device_allocation, 0); - - return platform_driver_register(&hash_driver); -} - -/** - * ux500_hash_mod_fini - The kernel module exit function. - */ -static void __exit ux500_hash_mod_fini(void) -{ - platform_driver_unregister(&hash_driver); -} - -module_init(ux500_hash_mod_init); -module_exit(ux500_hash_mod_fini); - -MODULE_DESCRIPTION("Driver for ST-Ericsson UX500 HASH engine."); -MODULE_LICENSE("GPL"); - -MODULE_ALIAS_CRYPTO("sha1-all"); -MODULE_ALIAS_CRYPTO("sha256-all"); -MODULE_ALIAS_CRYPTO("hmac-sha1-all"); -MODULE_ALIAS_CRYPTO("hmac-sha256-all"); From c7410b425de40e9b163eef781e1bdacf1bf2962e Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 27 Jan 2023 19:03:21 +0800 Subject: [PATCH 079/156] hwrng: starfive - Enable compile testing Enable compile testing for jh7110. Also remove the dependency on HW_RANDOM. Signed-off-by: Herbert Xu Reviewed-by: Conor Dooley Reviewed-by: Jia Jie Ho Signed-off-by: Herbert Xu --- drivers/char/hw_random/Kconfig | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/char/hw_random/Kconfig b/drivers/char/hw_random/Kconfig index be202959efa6..4fdf07ae3c54 100644 --- a/drivers/char/hw_random/Kconfig +++ b/drivers/char/hw_random/Kconfig @@ -551,8 +551,7 @@ config HW_RANDOM_CN10K config HW_RANDOM_JH7110 tristate "StarFive JH7110 Random Number Generator support" - depends on SOC_STARFIVE - depends on HW_RANDOM + depends on SOC_STARFIVE || COMPILE_TEST help This driver provides support for the True Random Number Generator in StarFive JH7110 SoCs. From 1d273983fa83d7bc5b3d3b784ec774fb7e52983e Mon Sep 17 00:00:00 2001 From: Meadhbh Date: Thu, 19 Jan 2023 09:05:08 +0100 Subject: [PATCH 080/156] Documentation: qat: change kernel version Change kernel version from 5.20 to 6.0, as 5.20 is not a release. Signed-off-by: Meadhbh Fitzpatrick Reviewed-by: Giovanni Cabiddu Reviewed-by: Vladis Dronov Signed-off-by: Herbert Xu --- Documentation/ABI/testing/sysfs-driver-qat | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-driver-qat b/Documentation/ABI/testing/sysfs-driver-qat index 185f81a2aab3..087842b1969e 100644 --- a/Documentation/ABI/testing/sysfs-driver-qat +++ b/Documentation/ABI/testing/sysfs-driver-qat @@ -1,6 +1,6 @@ What: /sys/bus/pci/devices//qat/state Date: June 2022 -KernelVersion: 5.20 +KernelVersion: 6.0 Contact: qat-linux@intel.com Description: (RW) Reports the current state of the QAT device. Write to the file to start or stop the device. @@ -18,7 +18,7 @@ Description: (RW) Reports the current state of the QAT device. Write to What: /sys/bus/pci/devices//qat/cfg_services Date: June 2022 -KernelVersion: 5.20 +KernelVersion: 6.0 Contact: qat-linux@intel.com Description: (RW) Reports the current configuration of the QAT device. Write to the file to change the configured services. From 808d065ad7367e9552b054aa3e092b78027bdf3e Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Mon, 30 Jan 2023 08:31:09 +0100 Subject: [PATCH 081/156] MAINTAINERS: repair file entry for STARFIVE TRNG DRIVER Commit c388f458bc34 ("hwrng: starfive - Add TRNG driver for StarFive SoC") adds the STARFIVE TRNG DRIVER section to MAINTAINERS, but refers to the non-existing file drivers/char/hw_random/starfive-trng.c rather than to the actually added file drivers/char/hw_random/jh7110-trng.c in this commit. Hence, ./scripts/get_maintainer.pl --self-test=patterns complains about a broken reference. Repair this file entry in STARFIVE TRNG DRIVER. Fixes: c388f458bc34 ("hwrng: starfive - Add TRNG driver for StarFive SoC") Signed-off-by: Lukas Bulwahn Acked-by: Jia Jie Ho Signed-off-by: Herbert Xu --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 4f59559597ab..7c90bc235a0a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -19909,7 +19909,7 @@ STARFIVE TRNG DRIVER M: Jia Jie Ho S: Supported F: Documentation/devicetree/bindings/rng/starfive* -F: drivers/char/hw_random/starfive-trng.c +F: drivers/char/hw_random/jh7110-trng.c STATIC BRANCH/CALL M: Peter Zijlstra From 57ead1bf1c5430c5f663dfd3399040d90db9c7ab Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 30 Jan 2023 16:58:51 +0800 Subject: [PATCH 082/156] crypto: arm64/aes-ccm - Rewrite skcipher walker loop An often overlooked aspect of the skcipher walker API is that an error is not just indicated by a non-zero return value, but by the fact that walk->nbytes is zero. Thus it is an error to call skcipher_walk_done after getting back walk->nbytes == 0 from the previous interaction with the walker. This is because when walk->nbytes is zero the walker is left in an undefined state and any further calls to it may try to free uninitialised stack memory. The arm64 ccm code has to deal with zero-length messages, and it needs to process data even when walk->nbytes == 0 is returned. It doesn't have this bug because there is an explicit check for walk->nbytes != 0 prior to the skcipher_walk_done call. However, the loop is still sufficiently different from the usual layout and it appears to have been copied into other code which then ended up with this bug. This patch rewrites it to follow the usual convention of checking walk->nbytes. Signed-off-by: Herbert Xu Tested-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm64/crypto/aes-ce-ccm-glue.c | 57 +++++++++++++---------------- 1 file changed, 26 insertions(+), 31 deletions(-) diff --git a/arch/arm64/crypto/aes-ce-ccm-glue.c b/arch/arm64/crypto/aes-ce-ccm-glue.c index c4f14415f5f0..25cd3808ecbe 100644 --- a/arch/arm64/crypto/aes-ce-ccm-glue.c +++ b/arch/arm64/crypto/aes-ce-ccm-glue.c @@ -161,43 +161,39 @@ static int ccm_encrypt(struct aead_request *req) memcpy(buf, req->iv, AES_BLOCK_SIZE); err = skcipher_walk_aead_encrypt(&walk, req, false); - if (unlikely(err)) - return err; kernel_neon_begin(); if (req->assoclen) ccm_calculate_auth_mac(req, mac); - do { + while (walk.nbytes) { u32 tail = walk.nbytes % AES_BLOCK_SIZE; + bool final = walk.nbytes == walk.total; - if (walk.nbytes == walk.total) + if (final) tail = 0; ce_aes_ccm_encrypt(walk.dst.virt.addr, walk.src.virt.addr, walk.nbytes - tail, ctx->key_enc, num_rounds(ctx), mac, walk.iv); - if (walk.nbytes == walk.total) - ce_aes_ccm_final(mac, buf, ctx->key_enc, num_rounds(ctx)); + if (!final) + kernel_neon_end(); + err = skcipher_walk_done(&walk, tail); + if (!final) + kernel_neon_begin(); + } - kernel_neon_end(); + ce_aes_ccm_final(mac, buf, ctx->key_enc, num_rounds(ctx)); - if (walk.nbytes) { - err = skcipher_walk_done(&walk, tail); - if (unlikely(err)) - return err; - if (unlikely(walk.nbytes)) - kernel_neon_begin(); - } - } while (walk.nbytes); + kernel_neon_end(); /* copy authtag to end of dst */ scatterwalk_map_and_copy(mac, req->dst, req->assoclen + req->cryptlen, crypto_aead_authsize(aead), 1); - return 0; + return err; } static int ccm_decrypt(struct aead_request *req) @@ -219,37 +215,36 @@ static int ccm_decrypt(struct aead_request *req) memcpy(buf, req->iv, AES_BLOCK_SIZE); err = skcipher_walk_aead_decrypt(&walk, req, false); - if (unlikely(err)) - return err; kernel_neon_begin(); if (req->assoclen) ccm_calculate_auth_mac(req, mac); - do { + while (walk.nbytes) { u32 tail = walk.nbytes % AES_BLOCK_SIZE; + bool final = walk.nbytes == walk.total; - if (walk.nbytes == walk.total) + if (final) tail = 0; ce_aes_ccm_decrypt(walk.dst.virt.addr, walk.src.virt.addr, walk.nbytes - tail, ctx->key_enc, num_rounds(ctx), mac, walk.iv); - if (walk.nbytes == walk.total) - ce_aes_ccm_final(mac, buf, ctx->key_enc, num_rounds(ctx)); + if (!final) + kernel_neon_end(); + err = skcipher_walk_done(&walk, tail); + if (!final) + kernel_neon_begin(); + } - kernel_neon_end(); + ce_aes_ccm_final(mac, buf, ctx->key_enc, num_rounds(ctx)); - if (walk.nbytes) { - err = skcipher_walk_done(&walk, tail); - if (unlikely(err)) - return err; - if (unlikely(walk.nbytes)) - kernel_neon_begin(); - } - } while (walk.nbytes); + kernel_neon_end(); + + if (unlikely(err)) + return err; /* compare calculated auth tag with the stored one */ scatterwalk_map_and_copy(buf, req->src, From b529ea65931cb8731c668f1699c845b1eb9909a8 Mon Sep 17 00:00:00 2001 From: Peter Lafreniere Date: Mon, 30 Jan 2023 20:27:14 -0500 Subject: [PATCH 083/156] crypto: x86/blowfish - Remove unused encode parameter The blowfish-x86_64 encryption functions have an unused argument. Remove it. This involves: 1 - Removing xor_block() macros. 2 - Removing handling of fourth argument from __blowfish_enc_blk{,_4way}() functions. 3 - Renaming __blowfish_enc_blk{,_4way}() to blowfish_enc_blk{,_4way}(). 4 - Removing the blowfish_enc_blk{,_4way}() wrappers from blowfish_glue.c 5 - Temporarily using SYM_TYPED_FUNC_START for now indirectly-callable encode functions. Signed-off-by: Peter Lafreniere Signed-off-by: Herbert Xu --- arch/x86/crypto/blowfish-x86_64-asm_64.S | 46 +++--------------------- arch/x86/crypto/blowfish_glue.c | 18 ++-------- 2 files changed, 7 insertions(+), 57 deletions(-) diff --git a/arch/x86/crypto/blowfish-x86_64-asm_64.S b/arch/x86/crypto/blowfish-x86_64-asm_64.S index 4a43e072d2d1..4c5d4bc28ac4 100644 --- a/arch/x86/crypto/blowfish-x86_64-asm_64.S +++ b/arch/x86/crypto/blowfish-x86_64-asm_64.S @@ -100,16 +100,11 @@ bswapq RX0; \ movq RX0, (RIO); -#define xor_block() \ - bswapq RX0; \ - xorq RX0, (RIO); - -SYM_FUNC_START(__blowfish_enc_blk) +SYM_TYPED_FUNC_START(blowfish_enc_blk) /* input: * %rdi: ctx * %rsi: dst * %rdx: src - * %rcx: bool, if true: xor output */ movq %r12, %r11; @@ -130,17 +125,11 @@ SYM_FUNC_START(__blowfish_enc_blk) add_roundkey_enc(16); movq %r11, %r12; - movq %r10, RIO; - test %cl, %cl; - jnz .L__enc_xor; write_block(); RET; -.L__enc_xor: - xor_block(); - RET; -SYM_FUNC_END(__blowfish_enc_blk) +SYM_FUNC_END(blowfish_enc_blk) SYM_TYPED_FUNC_START(blowfish_dec_blk) /* input: @@ -271,29 +260,14 @@ SYM_FUNC_END(blowfish_dec_blk) bswapq RX3; \ movq RX3, 24(RIO); -#define xor_block4() \ - bswapq RX0; \ - xorq RX0, (RIO); \ - \ - bswapq RX1; \ - xorq RX1, 8(RIO); \ - \ - bswapq RX2; \ - xorq RX2, 16(RIO); \ - \ - bswapq RX3; \ - xorq RX3, 24(RIO); - -SYM_FUNC_START(__blowfish_enc_blk_4way) +SYM_TYPED_FUNC_START(blowfish_enc_blk_4way) /* input: * %rdi: ctx * %rsi: dst * %rdx: src - * %rcx: bool, if true: xor output */ pushq %r12; pushq %rbx; - pushq %rcx; movq %rdi, CTX movq %rsi, %r11; @@ -313,25 +287,13 @@ SYM_FUNC_START(__blowfish_enc_blk_4way) round_enc4(14); add_preloaded_roundkey4(); - popq %r12; movq %r11, RIO; - - test %r12b, %r12b; - jnz .L__enc_xor4; - write_block4(); popq %rbx; popq %r12; RET; - -.L__enc_xor4: - xor_block4(); - - popq %rbx; - popq %r12; - RET; -SYM_FUNC_END(__blowfish_enc_blk_4way) +SYM_FUNC_END(blowfish_enc_blk_4way) SYM_TYPED_FUNC_START(blowfish_dec_blk_4way) /* input: diff --git a/arch/x86/crypto/blowfish_glue.c b/arch/x86/crypto/blowfish_glue.c index 019c64c1340a..13a6664a89f3 100644 --- a/arch/x86/crypto/blowfish_glue.c +++ b/arch/x86/crypto/blowfish_glue.c @@ -17,27 +17,15 @@ #include /* regular block cipher functions */ -asmlinkage void __blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src, - bool xor); +asmlinkage void blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src); asmlinkage void blowfish_dec_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src); /* 4-way parallel cipher functions */ -asmlinkage void __blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst, - const u8 *src, bool xor); +asmlinkage void blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst, + const u8 *src); asmlinkage void blowfish_dec_blk_4way(struct bf_ctx *ctx, u8 *dst, const u8 *src); -static inline void blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src) -{ - __blowfish_enc_blk(ctx, dst, src, false); -} - -static inline void blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst, - const u8 *src) -{ - __blowfish_enc_blk_4way(ctx, dst, src, false); -} - static void blowfish_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) { blowfish_enc_blk(crypto_tfm_ctx(tfm), dst, src); From bc3f42acc4eefc5e7e300596f0836e0d9ad9f4a8 Mon Sep 17 00:00:00 2001 From: Peter Lafreniere Date: Mon, 30 Jan 2023 20:27:57 -0500 Subject: [PATCH 084/156] crypto: x86/blowfish - Convert to use ECB/CBC helpers We can simplify the blowfish-x86_64 glue code by using the preexisting ECB/CBC helper macros. Additionally, this allows for easier reuse of asm functions in later x86 implementations of blowfish. This involves: 1 - Modifying blowfish_dec_blk_4way() to xor outputs when a flag is passed. 2 - Renaming blowfish_dec_blk_4way() to __blowfish_dec_blk_4way(). 3 - Creating two wrapper functions around __blowfish_dec_blk_4way() for use in the ECB/CBC macros. 4 - Removing the custom ecb_encrypt() and cbc_encrypt() routines in favor of macro-based routines. Signed-off-by: Peter Lafreniere Signed-off-by: Herbert Xu --- arch/x86/crypto/blowfish-x86_64-asm_64.S | 30 +++- arch/x86/crypto/blowfish_glue.c | 196 ++++------------------- 2 files changed, 58 insertions(+), 168 deletions(-) diff --git a/arch/x86/crypto/blowfish-x86_64-asm_64.S b/arch/x86/crypto/blowfish-x86_64-asm_64.S index 4c5d4bc28ac4..9a9924c32883 100644 --- a/arch/x86/crypto/blowfish-x86_64-asm_64.S +++ b/arch/x86/crypto/blowfish-x86_64-asm_64.S @@ -260,6 +260,19 @@ SYM_FUNC_END(blowfish_dec_blk) bswapq RX3; \ movq RX3, 24(RIO); +#define xor_block4() \ + movq (RIO), RT0; \ + bswapq RT0; \ + xorq RT0, RX1; \ + \ + movq 8(RIO), RT2; \ + bswapq RT2; \ + xorq RT2, RX2; \ + \ + movq 16(RIO), RT3; \ + bswapq RT3; \ + xorq RT3, RX3; + SYM_TYPED_FUNC_START(blowfish_enc_blk_4way) /* input: * %rdi: ctx @@ -295,17 +308,20 @@ SYM_TYPED_FUNC_START(blowfish_enc_blk_4way) RET; SYM_FUNC_END(blowfish_enc_blk_4way) -SYM_TYPED_FUNC_START(blowfish_dec_blk_4way) +SYM_TYPED_FUNC_START(__blowfish_dec_blk_4way) /* input: * %rdi: ctx * %rsi: dst * %rdx: src + * %rcx: cbc (bool) */ pushq %r12; pushq %rbx; + pushq %rcx; + pushq %rdx; movq %rdi, CTX; - movq %rsi, %r11 + movq %rsi, %r11; movq %rdx, RIO; preload_roundkey_dec(17); @@ -321,6 +337,14 @@ SYM_TYPED_FUNC_START(blowfish_dec_blk_4way) round_dec4(3); add_preloaded_roundkey4(); + popq RIO; + popq %r12; + testq %r12, %r12; + jz .L_no_cbc_xor; + + xor_block4(); + +.L_no_cbc_xor: movq %r11, RIO; write_block4(); @@ -328,4 +352,4 @@ SYM_TYPED_FUNC_START(blowfish_dec_blk_4way) popq %r12; RET; -SYM_FUNC_END(blowfish_dec_blk_4way) +SYM_FUNC_END(__blowfish_dec_blk_4way) diff --git a/arch/x86/crypto/blowfish_glue.c b/arch/x86/crypto/blowfish_glue.c index 13a6664a89f3..552f2df0643f 100644 --- a/arch/x86/crypto/blowfish_glue.c +++ b/arch/x86/crypto/blowfish_glue.c @@ -16,6 +16,8 @@ #include #include +#include "ecb_cbc_helpers.h" + /* regular block cipher functions */ asmlinkage void blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src); asmlinkage void blowfish_dec_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src); @@ -23,8 +25,20 @@ asmlinkage void blowfish_dec_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src); /* 4-way parallel cipher functions */ asmlinkage void blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst, const u8 *src); -asmlinkage void blowfish_dec_blk_4way(struct bf_ctx *ctx, u8 *dst, - const u8 *src); +asmlinkage void __blowfish_dec_blk_4way(struct bf_ctx *ctx, u8 *dst, + const u8 *src, bool cbc); + +static inline void blowfish_dec_ecb_4way(struct bf_ctx *ctx, u8 *dst, + const u8 *src) +{ + return __blowfish_dec_blk_4way(ctx, dst, src, false); +} + +static inline void blowfish_dec_cbc_4way(struct bf_ctx *ctx, u8 *dst, + const u8 *src) +{ + return __blowfish_dec_blk_4way(ctx, dst, src, true); +} static void blowfish_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) { @@ -42,183 +56,35 @@ static int blowfish_setkey_skcipher(struct crypto_skcipher *tfm, return blowfish_setkey(&tfm->base, key, keylen); } -static int ecb_crypt(struct skcipher_request *req, - void (*fn)(struct bf_ctx *, u8 *, const u8 *), - void (*fn_4way)(struct bf_ctx *, u8 *, const u8 *)) -{ - unsigned int bsize = BF_BLOCK_SIZE; - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct bf_ctx *ctx = crypto_skcipher_ctx(tfm); - struct skcipher_walk walk; - unsigned int nbytes; - int err; - - err = skcipher_walk_virt(&walk, req, false); - - while ((nbytes = walk.nbytes)) { - u8 *wsrc = walk.src.virt.addr; - u8 *wdst = walk.dst.virt.addr; - - /* Process four block batch */ - if (nbytes >= bsize * 4) { - do { - fn_4way(ctx, wdst, wsrc); - - wsrc += bsize * 4; - wdst += bsize * 4; - nbytes -= bsize * 4; - } while (nbytes >= bsize * 4); - - if (nbytes < bsize) - goto done; - } - - /* Handle leftovers */ - do { - fn(ctx, wdst, wsrc); - - wsrc += bsize; - wdst += bsize; - nbytes -= bsize; - } while (nbytes >= bsize); - -done: - err = skcipher_walk_done(&walk, nbytes); - } - - return err; -} - static int ecb_encrypt(struct skcipher_request *req) { - return ecb_crypt(req, blowfish_enc_blk, blowfish_enc_blk_4way); + ECB_WALK_START(req, BF_BLOCK_SIZE, -1); + ECB_BLOCK(4, blowfish_enc_blk_4way); + ECB_BLOCK(1, blowfish_enc_blk); + ECB_WALK_END(); } static int ecb_decrypt(struct skcipher_request *req) { - return ecb_crypt(req, blowfish_dec_blk, blowfish_dec_blk_4way); -} - -static unsigned int __cbc_encrypt(struct bf_ctx *ctx, - struct skcipher_walk *walk) -{ - unsigned int bsize = BF_BLOCK_SIZE; - unsigned int nbytes = walk->nbytes; - u64 *src = (u64 *)walk->src.virt.addr; - u64 *dst = (u64 *)walk->dst.virt.addr; - u64 *iv = (u64 *)walk->iv; - - do { - *dst = *src ^ *iv; - blowfish_enc_blk(ctx, (u8 *)dst, (u8 *)dst); - iv = dst; - - src += 1; - dst += 1; - nbytes -= bsize; - } while (nbytes >= bsize); - - *(u64 *)walk->iv = *iv; - return nbytes; + ECB_WALK_START(req, BF_BLOCK_SIZE, -1); + ECB_BLOCK(4, blowfish_dec_ecb_4way); + ECB_BLOCK(1, blowfish_dec_blk); + ECB_WALK_END(); } static int cbc_encrypt(struct skcipher_request *req) { - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct bf_ctx *ctx = crypto_skcipher_ctx(tfm); - struct skcipher_walk walk; - unsigned int nbytes; - int err; - - err = skcipher_walk_virt(&walk, req, false); - - while (walk.nbytes) { - nbytes = __cbc_encrypt(ctx, &walk); - err = skcipher_walk_done(&walk, nbytes); - } - - return err; -} - -static unsigned int __cbc_decrypt(struct bf_ctx *ctx, - struct skcipher_walk *walk) -{ - unsigned int bsize = BF_BLOCK_SIZE; - unsigned int nbytes = walk->nbytes; - u64 *src = (u64 *)walk->src.virt.addr; - u64 *dst = (u64 *)walk->dst.virt.addr; - u64 ivs[4 - 1]; - u64 last_iv; - - /* Start of the last block. */ - src += nbytes / bsize - 1; - dst += nbytes / bsize - 1; - - last_iv = *src; - - /* Process four block batch */ - if (nbytes >= bsize * 4) { - do { - nbytes -= bsize * 4 - bsize; - src -= 4 - 1; - dst -= 4 - 1; - - ivs[0] = src[0]; - ivs[1] = src[1]; - ivs[2] = src[2]; - - blowfish_dec_blk_4way(ctx, (u8 *)dst, (u8 *)src); - - dst[1] ^= ivs[0]; - dst[2] ^= ivs[1]; - dst[3] ^= ivs[2]; - - nbytes -= bsize; - if (nbytes < bsize) - goto done; - - *dst ^= *(src - 1); - src -= 1; - dst -= 1; - } while (nbytes >= bsize * 4); - } - - /* Handle leftovers */ - for (;;) { - blowfish_dec_blk(ctx, (u8 *)dst, (u8 *)src); - - nbytes -= bsize; - if (nbytes < bsize) - break; - - *dst ^= *(src - 1); - src -= 1; - dst -= 1; - } - -done: - *dst ^= *(u64 *)walk->iv; - *(u64 *)walk->iv = last_iv; - - return nbytes; + CBC_WALK_START(req, BF_BLOCK_SIZE, -1); + CBC_ENC_BLOCK(blowfish_enc_blk); + CBC_WALK_END(); } static int cbc_decrypt(struct skcipher_request *req) { - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct bf_ctx *ctx = crypto_skcipher_ctx(tfm); - struct skcipher_walk walk; - unsigned int nbytes; - int err; - - err = skcipher_walk_virt(&walk, req, false); - - while (walk.nbytes) { - nbytes = __cbc_decrypt(ctx, &walk); - err = skcipher_walk_done(&walk, nbytes); - } - - return err; + CBC_WALK_START(req, BF_BLOCK_SIZE, -1); + CBC_DEC_BLOCK(4, blowfish_dec_cbc_4way); + CBC_DEC_BLOCK(1, blowfish_dec_blk); + CBC_WALK_END(); } static struct crypto_alg bf_cipher_alg = { From c9adc75d320c72247ea77b2bf255f1c3c1ca3d6e Mon Sep 17 00:00:00 2001 From: Peter Lafreniere Date: Mon, 30 Jan 2023 20:28:40 -0500 Subject: [PATCH 085/156] crypto: x86/blowfish - Eliminate use of SYM_TYPED_FUNC_START in asm Now that we use the ECB/CBC macros, none of the asm functions in blowfish-x86_64 are called indirectly. So we can safely use SYM_FUNC_START instead of SYM_TYPED_FUNC_START with no effect, allowing us to remove an include. Signed-off-by: Peter Lafreniere Signed-off-by: Herbert Xu --- arch/x86/crypto/blowfish-x86_64-asm_64.S | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/arch/x86/crypto/blowfish-x86_64-asm_64.S b/arch/x86/crypto/blowfish-x86_64-asm_64.S index 9a9924c32883..e88c8e4f013c 100644 --- a/arch/x86/crypto/blowfish-x86_64-asm_64.S +++ b/arch/x86/crypto/blowfish-x86_64-asm_64.S @@ -6,7 +6,6 @@ */ #include -#include .file "blowfish-x86_64-asm.S" .text @@ -100,7 +99,7 @@ bswapq RX0; \ movq RX0, (RIO); -SYM_TYPED_FUNC_START(blowfish_enc_blk) +SYM_FUNC_START(blowfish_enc_blk) /* input: * %rdi: ctx * %rsi: dst @@ -131,7 +130,7 @@ SYM_TYPED_FUNC_START(blowfish_enc_blk) RET; SYM_FUNC_END(blowfish_enc_blk) -SYM_TYPED_FUNC_START(blowfish_dec_blk) +SYM_FUNC_START(blowfish_dec_blk) /* input: * %rdi: ctx * %rsi: dst @@ -273,7 +272,7 @@ SYM_FUNC_END(blowfish_dec_blk) bswapq RT3; \ xorq RT3, RX3; -SYM_TYPED_FUNC_START(blowfish_enc_blk_4way) +SYM_FUNC_START(blowfish_enc_blk_4way) /* input: * %rdi: ctx * %rsi: dst @@ -308,7 +307,7 @@ SYM_TYPED_FUNC_START(blowfish_enc_blk_4way) RET; SYM_FUNC_END(blowfish_enc_blk_4way) -SYM_TYPED_FUNC_START(__blowfish_dec_blk_4way) +SYM_FUNC_START(__blowfish_dec_blk_4way) /* input: * %rdi: ctx * %rsi: dst From acc3f5504489691dabc6199e294b40aabe8eccfa Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 31 Jan 2023 13:44:05 +0800 Subject: [PATCH 086/156] crypto: safexcel - Use crypto_wait_req This patch replaces the custom crypto completion function with crypto_req_done. Signed-off-by: Herbert Xu --- drivers/crypto/inside-secure/safexcel.c | 11 ---- drivers/crypto/inside-secure/safexcel.h | 6 --- .../crypto/inside-secure/safexcel_cipher.c | 21 ++++---- drivers/crypto/inside-secure/safexcel_hash.c | 54 +++++-------------- 4 files changed, 24 insertions(+), 68 deletions(-) diff --git a/drivers/crypto/inside-secure/safexcel.c b/drivers/crypto/inside-secure/safexcel.c index ae6110376e21..baff0123f919 100644 --- a/drivers/crypto/inside-secure/safexcel.c +++ b/drivers/crypto/inside-secure/safexcel.c @@ -970,17 +970,6 @@ void safexcel_complete(struct safexcel_crypto_priv *priv, int ring) } while (!cdesc->last_seg); } -void safexcel_inv_complete(struct crypto_async_request *req, int error) -{ - struct safexcel_inv_result *result = req->data; - - if (error == -EINPROGRESS) - return; - - result->error = error; - complete(&result->completion); -} - int safexcel_invalidate_cache(struct crypto_async_request *async, struct safexcel_crypto_priv *priv, dma_addr_t ctxr_dma, int ring) diff --git a/drivers/crypto/inside-secure/safexcel.h b/drivers/crypto/inside-secure/safexcel.h index 6c2fc662f64f..47ef6c7cd02c 100644 --- a/drivers/crypto/inside-secure/safexcel.h +++ b/drivers/crypto/inside-secure/safexcel.h @@ -884,11 +884,6 @@ struct safexcel_alg_template { } alg; }; -struct safexcel_inv_result { - struct completion completion; - int error; -}; - void safexcel_dequeue(struct safexcel_crypto_priv *priv, int ring); int safexcel_rdesc_check_errors(struct safexcel_crypto_priv *priv, void *rdp); @@ -927,7 +922,6 @@ void safexcel_rdr_req_set(struct safexcel_crypto_priv *priv, struct crypto_async_request *req); inline struct crypto_async_request * safexcel_rdr_req_get(struct safexcel_crypto_priv *priv, int ring); -void safexcel_inv_complete(struct crypto_async_request *req, int error); int safexcel_hmac_setkey(struct safexcel_context *base, const u8 *key, unsigned int keylen, const char *alg, unsigned int state_sz); diff --git a/drivers/crypto/inside-secure/safexcel_cipher.c b/drivers/crypto/inside-secure/safexcel_cipher.c index 32a37e3850c5..272c28b5a088 100644 --- a/drivers/crypto/inside-secure/safexcel_cipher.c +++ b/drivers/crypto/inside-secure/safexcel_cipher.c @@ -1091,13 +1091,12 @@ static int safexcel_aead_send(struct crypto_async_request *async, int ring, static int safexcel_cipher_exit_inv(struct crypto_tfm *tfm, struct crypto_async_request *base, struct safexcel_cipher_req *sreq, - struct safexcel_inv_result *result) + struct crypto_wait *result) { struct safexcel_cipher_ctx *ctx = crypto_tfm_ctx(tfm); struct safexcel_crypto_priv *priv = ctx->base.priv; int ring = ctx->base.ring; - - init_completion(&result->completion); + int err; ctx = crypto_tfm_ctx(base->tfm); ctx->base.exit_inv = true; @@ -1110,13 +1109,13 @@ static int safexcel_cipher_exit_inv(struct crypto_tfm *tfm, queue_work(priv->ring[ring].workqueue, &priv->ring[ring].work_data.work); - wait_for_completion(&result->completion); + err = crypto_wait_req(-EINPROGRESS, result); - if (result->error) { + if (err) { dev_warn(priv->dev, "cipher: sync: invalidate: completion error %d\n", - result->error); - return result->error; + err); + return err; } return 0; @@ -1126,12 +1125,12 @@ static int safexcel_skcipher_exit_inv(struct crypto_tfm *tfm) { EIP197_REQUEST_ON_STACK(req, skcipher, EIP197_SKCIPHER_REQ_SIZE); struct safexcel_cipher_req *sreq = skcipher_request_ctx(req); - struct safexcel_inv_result result = {}; + DECLARE_CRYPTO_WAIT(result); memset(req, 0, sizeof(struct skcipher_request)); skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, - safexcel_inv_complete, &result); + crypto_req_done, &result); skcipher_request_set_tfm(req, __crypto_skcipher_cast(tfm)); return safexcel_cipher_exit_inv(tfm, &req->base, sreq, &result); @@ -1141,12 +1140,12 @@ static int safexcel_aead_exit_inv(struct crypto_tfm *tfm) { EIP197_REQUEST_ON_STACK(req, aead, EIP197_AEAD_REQ_SIZE); struct safexcel_cipher_req *sreq = aead_request_ctx(req); - struct safexcel_inv_result result = {}; + DECLARE_CRYPTO_WAIT(result); memset(req, 0, sizeof(struct aead_request)); aead_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, - safexcel_inv_complete, &result); + crypto_req_done, &result); aead_request_set_tfm(req, __crypto_aead_cast(tfm)); return safexcel_cipher_exit_inv(tfm, &req->base, sreq, &result); diff --git a/drivers/crypto/inside-secure/safexcel_hash.c b/drivers/crypto/inside-secure/safexcel_hash.c index ca46328472d4..e17577b785c3 100644 --- a/drivers/crypto/inside-secure/safexcel_hash.c +++ b/drivers/crypto/inside-secure/safexcel_hash.c @@ -625,15 +625,16 @@ static int safexcel_ahash_exit_inv(struct crypto_tfm *tfm) struct safexcel_crypto_priv *priv = ctx->base.priv; EIP197_REQUEST_ON_STACK(req, ahash, EIP197_AHASH_REQ_SIZE); struct safexcel_ahash_req *rctx = ahash_request_ctx_dma(req); - struct safexcel_inv_result result = {}; + DECLARE_CRYPTO_WAIT(result); int ring = ctx->base.ring; + int err; memset(req, 0, EIP197_AHASH_REQ_SIZE); /* create invalidation request */ init_completion(&result.completion); ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, - safexcel_inv_complete, &result); + crypto_req_done, &result); ahash_request_set_tfm(req, __crypto_ahash_cast(tfm)); ctx = crypto_tfm_ctx(req->base.tfm); @@ -647,12 +648,11 @@ static int safexcel_ahash_exit_inv(struct crypto_tfm *tfm) queue_work(priv->ring[ring].workqueue, &priv->ring[ring].work_data.work); - wait_for_completion(&result.completion); + err = crypto_wait_req(-EINPROGRESS, &result); - if (result.error) { - dev_warn(priv->dev, "hash: completion error (%d)\n", - result.error); - return result.error; + if (err) { + dev_warn(priv->dev, "hash: completion error (%d)\n", err); + return err; } return 0; @@ -1042,27 +1042,11 @@ static int safexcel_hmac_sha1_digest(struct ahash_request *areq) return safexcel_ahash_finup(areq); } -struct safexcel_ahash_result { - struct completion completion; - int error; -}; - -static void safexcel_ahash_complete(struct crypto_async_request *req, int error) -{ - struct safexcel_ahash_result *result = req->data; - - if (error == -EINPROGRESS) - return; - - result->error = error; - complete(&result->completion); -} - static int safexcel_hmac_init_pad(struct ahash_request *areq, unsigned int blocksize, const u8 *key, unsigned int keylen, u8 *ipad, u8 *opad) { - struct safexcel_ahash_result result; + DECLARE_CRYPTO_WAIT(result); struct scatterlist sg; int ret, i; u8 *keydup; @@ -1075,16 +1059,12 @@ static int safexcel_hmac_init_pad(struct ahash_request *areq, return -ENOMEM; ahash_request_set_callback(areq, CRYPTO_TFM_REQ_MAY_BACKLOG, - safexcel_ahash_complete, &result); + crypto_req_done, &result); sg_init_one(&sg, keydup, keylen); ahash_request_set_crypt(areq, &sg, ipad, keylen); - init_completion(&result.completion); ret = crypto_ahash_digest(areq); - if (ret == -EINPROGRESS || ret == -EBUSY) { - wait_for_completion_interruptible(&result.completion); - ret = result.error; - } + ret = crypto_wait_req(ret, &result); /* Avoid leaking */ kfree_sensitive(keydup); @@ -1109,16 +1089,15 @@ static int safexcel_hmac_init_pad(struct ahash_request *areq, static int safexcel_hmac_init_iv(struct ahash_request *areq, unsigned int blocksize, u8 *pad, void *state) { - struct safexcel_ahash_result result; struct safexcel_ahash_req *req; + DECLARE_CRYPTO_WAIT(result); struct scatterlist sg; int ret; ahash_request_set_callback(areq, CRYPTO_TFM_REQ_MAY_BACKLOG, - safexcel_ahash_complete, &result); + crypto_req_done, &result); sg_init_one(&sg, pad, blocksize); ahash_request_set_crypt(areq, &sg, pad, blocksize); - init_completion(&result.completion); ret = crypto_ahash_init(areq); if (ret) @@ -1129,14 +1108,9 @@ static int safexcel_hmac_init_iv(struct ahash_request *areq, req->last_req = true; ret = crypto_ahash_update(areq); - if (ret && ret != -EINPROGRESS && ret != -EBUSY) - return ret; + ret = crypto_wait_req(ret, &result); - wait_for_completion_interruptible(&result.completion); - if (result.error) - return result.error; - - return crypto_ahash_export(areq, state); + return ret ?: crypto_ahash_export(areq, state); } static int __safexcel_hmac_setkey(const char *alg, const u8 *key, From d58fa987be772a9a1e76302d80c1a1e0e15c7d8a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Tue, 31 Jan 2023 09:13:51 +0100 Subject: [PATCH 087/156] crypto: atmel - Drop unused id parameter from atmel_i2c_probe() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit id is unused in atmel_i2c_probe() and the callers have extra efforts to determine the right parameter. So drop the parameter simplifying both atmel_i2c_probe() and its callers. Signed-off-by: Uwe Kleine-König Reviewed-by: Tudor Ambarus Signed-off-by: Herbert Xu --- drivers/crypto/atmel-ecc.c | 3 +-- drivers/crypto/atmel-i2c.c | 2 +- drivers/crypto/atmel-i2c.h | 2 +- drivers/crypto/atmel-sha204a.c | 3 +-- 4 files changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/crypto/atmel-ecc.c b/drivers/crypto/atmel-ecc.c index 53100fb9b07b..8a72afb5f52f 100644 --- a/drivers/crypto/atmel-ecc.c +++ b/drivers/crypto/atmel-ecc.c @@ -313,11 +313,10 @@ static struct kpp_alg atmel_ecdh_nist_p256 = { static int atmel_ecc_probe(struct i2c_client *client) { - const struct i2c_device_id *id = i2c_client_get_device_id(client); struct atmel_i2c_client_priv *i2c_priv; int ret; - ret = atmel_i2c_probe(client, id); + ret = atmel_i2c_probe(client); if (ret) return ret; diff --git a/drivers/crypto/atmel-i2c.c b/drivers/crypto/atmel-i2c.c index e0896224513a..4c2d4d89ca42 100644 --- a/drivers/crypto/atmel-i2c.c +++ b/drivers/crypto/atmel-i2c.c @@ -324,7 +324,7 @@ free_cmd: return ret; } -int atmel_i2c_probe(struct i2c_client *client, const struct i2c_device_id *id) +int atmel_i2c_probe(struct i2c_client *client) { struct atmel_i2c_client_priv *i2c_priv; struct device *dev = &client->dev; diff --git a/drivers/crypto/atmel-i2c.h b/drivers/crypto/atmel-i2c.h index 2872ed8cf3b5..610d7dddd922 100644 --- a/drivers/crypto/atmel-i2c.h +++ b/drivers/crypto/atmel-i2c.h @@ -167,7 +167,7 @@ struct atmel_i2c_work_data { struct atmel_i2c_cmd cmd; }; -int atmel_i2c_probe(struct i2c_client *client, const struct i2c_device_id *id); +int atmel_i2c_probe(struct i2c_client *client); void atmel_i2c_enqueue(struct atmel_i2c_work_data *work_data, void (*cbk)(struct atmel_i2c_work_data *work_data, diff --git a/drivers/crypto/atmel-sha204a.c b/drivers/crypto/atmel-sha204a.c index 272a06f0b588..4403dbb0f0b1 100644 --- a/drivers/crypto/atmel-sha204a.c +++ b/drivers/crypto/atmel-sha204a.c @@ -93,11 +93,10 @@ static int atmel_sha204a_rng_read(struct hwrng *rng, void *data, size_t max, static int atmel_sha204a_probe(struct i2c_client *client) { - const struct i2c_device_id *id = i2c_client_get_device_id(client); struct atmel_i2c_client_priv *i2c_priv; int ret; - ret = atmel_i2c_probe(client, id); + ret = atmel_i2c_probe(client); if (ret) return ret; From 3b9d902153f31998a30b0cf4a0aeb090a05005d3 Mon Sep 17 00:00:00 2001 From: Tianjia Zhang Date: Wed, 1 Feb 2023 20:32:07 +0800 Subject: [PATCH 088/156] crypto: arm64/sm4-ccm - Rewrite skcipher walker loop The fact that an error in the skcipher walker API are indicated not only by a non-zero return value, but also by the fact that walk->nbytes is zero, causes the layout of the skcipher walker loop to be sufficiently different from the usual layout, which is not a problem in itself, but it is likely to cause reading confusion and difficulty in code maintenance. This patch rewrites skcipher walker loop, and separates the last chunk cryption from the loop to avoid wrong calls to the skcipher walker API. In addition to following the usual convention of checking walk->nbytes, it also makes the loop execute logic clearer and easier to understand. Signed-off-by: Tianjia Zhang Signed-off-by: Herbert Xu --- arch/arm64/crypto/sm4-ce-ccm-glue.c | 48 ++++++++++++++++------------- 1 file changed, 26 insertions(+), 22 deletions(-) diff --git a/arch/arm64/crypto/sm4-ce-ccm-glue.c b/arch/arm64/crypto/sm4-ce-ccm-glue.c index f2cec7b52efc..5e7e17bbec81 100644 --- a/arch/arm64/crypto/sm4-ce-ccm-glue.c +++ b/arch/arm64/crypto/sm4-ce-ccm-glue.c @@ -166,7 +166,7 @@ static int ccm_crypt(struct aead_request *req, struct skcipher_walk *walk, unsigned int nbytes, u8 *mac)) { u8 __aligned(8) ctr0[SM4_BLOCK_SIZE]; - int err; + int err = 0; /* preserve the initial ctr0 for the TAG */ memcpy(ctr0, walk->iv, SM4_BLOCK_SIZE); @@ -177,33 +177,37 @@ static int ccm_crypt(struct aead_request *req, struct skcipher_walk *walk, if (req->assoclen) ccm_calculate_auth_mac(req, mac); - do { + while (walk->nbytes && walk->nbytes != walk->total) { unsigned int tail = walk->nbytes % SM4_BLOCK_SIZE; - const u8 *src = walk->src.virt.addr; - u8 *dst = walk->dst.virt.addr; - if (walk->nbytes == walk->total) - tail = 0; - - if (walk->nbytes - tail) - sm4_ce_ccm_crypt(rkey_enc, dst, src, walk->iv, - walk->nbytes - tail, mac); - - if (walk->nbytes == walk->total) - sm4_ce_ccm_final(rkey_enc, ctr0, mac); + sm4_ce_ccm_crypt(rkey_enc, walk->dst.virt.addr, + walk->src.virt.addr, walk->iv, + walk->nbytes - tail, mac); kernel_neon_end(); - if (walk->nbytes) { - err = skcipher_walk_done(walk, tail); - if (err) - return err; - if (walk->nbytes) - kernel_neon_begin(); - } - } while (walk->nbytes > 0); + err = skcipher_walk_done(walk, tail); - return 0; + kernel_neon_begin(); + } + + if (walk->nbytes) { + sm4_ce_ccm_crypt(rkey_enc, walk->dst.virt.addr, + walk->src.virt.addr, walk->iv, + walk->nbytes, mac); + + sm4_ce_ccm_final(rkey_enc, ctr0, mac); + + kernel_neon_end(); + + err = skcipher_walk_done(walk, 0); + } else { + sm4_ce_ccm_final(rkey_enc, ctr0, mac); + + kernel_neon_end(); + } + + return err; } static int ccm_encrypt(struct aead_request *req) From f6044cc3030e139f60c281386f28bda6e3049d66 Mon Sep 17 00:00:00 2001 From: Giovanni Cabiddu Date: Wed, 1 Feb 2023 15:59:44 +0000 Subject: [PATCH 089/156] crypto: qat - fix out-of-bounds read When preparing an AER-CTR request, the driver copies the key provided by the user into a data structure that is accessible by the firmware. If the target device is QAT GEN4, the key size is rounded up by 16 since a rounded up size is expected by the device. If the key size is rounded up before the copy, the size used for copying the key might be bigger than the size of the region containing the key, causing an out-of-bounds read. Fix by doing the copy first and then update the keylen. This is to fix the following warning reported by KASAN: [ 138.150574] BUG: KASAN: global-out-of-bounds in qat_alg_skcipher_init_com.isra.0+0x197/0x250 [intel_qat] [ 138.150641] Read of size 32 at addr ffffffff88c402c0 by task cryptomgr_test/2340 [ 138.150651] CPU: 15 PID: 2340 Comm: cryptomgr_test Not tainted 6.2.0-rc1+ #45 [ 138.150659] Hardware name: Intel Corporation ArcherCity/ArcherCity, BIOS EGSDCRB1.86B.0087.D13.2208261706 08/26/2022 [ 138.150663] Call Trace: [ 138.150668] [ 138.150922] kasan_check_range+0x13a/0x1c0 [ 138.150931] memcpy+0x1f/0x60 [ 138.150940] qat_alg_skcipher_init_com.isra.0+0x197/0x250 [intel_qat] [ 138.151006] qat_alg_skcipher_init_sessions+0xc1/0x240 [intel_qat] [ 138.151073] crypto_skcipher_setkey+0x82/0x160 [ 138.151085] ? prepare_keybuf+0xa2/0xd0 [ 138.151095] test_skcipher_vec_cfg+0x2b8/0x800 Fixes: 67916c951689 ("crypto: qat - add AES-CTR support for QAT GEN4 devices") Cc: Reported-by: Vladis Dronov Signed-off-by: Giovanni Cabiddu Reviewed-by: Fiona Trahe Reviewed-by: Vladis Dronov Tested-by: Vladis Dronov Signed-off-by: Herbert Xu --- drivers/crypto/qat/qat_common/qat_algs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/crypto/qat/qat_common/qat_algs.c b/drivers/crypto/qat/qat_common/qat_algs.c index b4b9f0aa59b9..b61ada559158 100644 --- a/drivers/crypto/qat/qat_common/qat_algs.c +++ b/drivers/crypto/qat/qat_common/qat_algs.c @@ -435,8 +435,8 @@ static void qat_alg_skcipher_init_com(struct qat_alg_skcipher_ctx *ctx, } else if (aes_v2_capable && mode == ICP_QAT_HW_CIPHER_CTR_MODE) { ICP_QAT_FW_LA_SLICE_TYPE_SET(header->serv_specif_flags, ICP_QAT_FW_LA_USE_UCS_SLICE_TYPE); - keylen = round_up(keylen, 16); memcpy(cd->ucs_aes.key, key, keylen); + keylen = round_up(keylen, 16); } else { memcpy(cd->aes.key, key, keylen); } From 0ceb587dbb94e13bb70988872f1fc0cf412f5f87 Mon Sep 17 00:00:00 2001 From: Giovanni Cabiddu Date: Wed, 1 Feb 2023 17:04:41 +0000 Subject: [PATCH 090/156] crypto: qat - drop log level of msg in get_instance_node() The functions qat_crypto_get_instance_node() and qat_compression_get_instance_node() allow to get a QAT instance (ring pair) on a device close to the node specified as input parameter. When this is not possible, and a QAT device is available in the system, these function return an instance on a remote node and they print a message reporting that it is not possible to find a device on the specified node. This is interpreted by people as an error rather than an info. The print "Could not find a device on node" indicates that a kernel application is running on a core in a socket that does not have a QAT device directly attached to it and performance might suffer. Due to the nature of the message, this can be considered as a debug message, therefore drop the severity to debug and report it only once to avoid flooding. Suggested-by: Vladis Dronov Signed-off-by: Giovanni Cabiddu Reviewed-by: Fiona Trahe Reviewed-by: Vladis Dronov Tested-by: Vladis Dronov Signed-off-by: Herbert Xu --- drivers/crypto/qat/qat_common/qat_compression.c | 2 +- drivers/crypto/qat/qat_common/qat_crypto.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/crypto/qat/qat_common/qat_compression.c b/drivers/crypto/qat/qat_common/qat_compression.c index 9fd10f4242f8..3f1f35283266 100644 --- a/drivers/crypto/qat/qat_common/qat_compression.c +++ b/drivers/crypto/qat/qat_common/qat_compression.c @@ -72,7 +72,7 @@ struct qat_compression_instance *qat_compression_get_instance_node(int node) } if (!accel_dev) { - pr_info("QAT: Could not find a device on node %d\n", node); + pr_debug_ratelimited("QAT: Could not find a device on node %d\n", node); /* Get any started device */ list_for_each(itr, adf_devmgr_get_head()) { struct adf_accel_dev *tmp_dev; diff --git a/drivers/crypto/qat/qat_common/qat_crypto.c b/drivers/crypto/qat/qat_common/qat_crypto.c index e31199eade5b..40c8e74d1cf9 100644 --- a/drivers/crypto/qat/qat_common/qat_crypto.c +++ b/drivers/crypto/qat/qat_common/qat_crypto.c @@ -70,7 +70,7 @@ struct qat_crypto_instance *qat_crypto_get_instance_node(int node) } if (!accel_dev) { - pr_info("QAT: Could not find a device on node %d\n", node); + pr_debug_ratelimited("QAT: Could not find a device on node %d\n", node); /* Get any started device */ list_for_each_entry(tmp_dev, adf_devmgr_get_head(), list) { if (adf_dev_started(tmp_dev) && From 4e4a08868f15897ca236528771c3733fded42c62 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 2 Feb 2023 16:33:47 +0800 Subject: [PATCH 091/156] crypto: arm64/sm4-gcm - Fix possible crash in GCM cryption An often overlooked aspect of the skcipher walker API is that an error is not just indicated by a non-zero return value, but by the fact that walk->nbytes is zero. Thus it is an error to call skcipher_walk_done after getting back walk->nbytes == 0 from the previous interaction with the walker. This is because when walk->nbytes is zero the walker is left in an undefined state and any further calls to it may try to free uninitialised stack memory. The sm4 arm64 ccm code gets this wrong and ends up calling skcipher_walk_done even when walk->nbytes is zero. This patch rewrites the loop in a form that resembles other callers. Reported-by: Tianjia Zhang Fixes: ae1b83c7d572 ("crypto: arm64/sm4 - add CE implementation for GCM mode") Signed-off-by: Herbert Xu Tested-by: Tianjia Zhang Signed-off-by: Herbert Xu --- arch/arm64/crypto/sm4-ce-gcm-glue.c | 51 ++++++++++++++--------------- 1 file changed, 25 insertions(+), 26 deletions(-) diff --git a/arch/arm64/crypto/sm4-ce-gcm-glue.c b/arch/arm64/crypto/sm4-ce-gcm-glue.c index c450a2025ca9..73bfb6972d3a 100644 --- a/arch/arm64/crypto/sm4-ce-gcm-glue.c +++ b/arch/arm64/crypto/sm4-ce-gcm-glue.c @@ -135,22 +135,23 @@ static void gcm_calculate_auth_mac(struct aead_request *req, u8 ghash[]) } static int gcm_crypt(struct aead_request *req, struct skcipher_walk *walk, - struct sm4_gcm_ctx *ctx, u8 ghash[], + u8 ghash[], int err, void (*sm4_ce_pmull_gcm_crypt)(const u32 *rkey_enc, u8 *dst, const u8 *src, u8 *iv, unsigned int nbytes, u8 *ghash, const u8 *ghash_table, const u8 *lengths)) { + struct crypto_aead *aead = crypto_aead_reqtfm(req); + struct sm4_gcm_ctx *ctx = crypto_aead_ctx(aead); u8 __aligned(8) iv[SM4_BLOCK_SIZE]; be128 __aligned(8) lengths; - int err; memset(ghash, 0, SM4_BLOCK_SIZE); lengths.a = cpu_to_be64(req->assoclen * 8); lengths.b = cpu_to_be64(walk->total * 8); - memcpy(iv, walk->iv, GCM_IV_SIZE); + memcpy(iv, req->iv, GCM_IV_SIZE); put_unaligned_be32(2, iv + GCM_IV_SIZE); kernel_neon_begin(); @@ -158,49 +159,51 @@ static int gcm_crypt(struct aead_request *req, struct skcipher_walk *walk, if (req->assoclen) gcm_calculate_auth_mac(req, ghash); - do { + while (walk->nbytes) { unsigned int tail = walk->nbytes % SM4_BLOCK_SIZE; const u8 *src = walk->src.virt.addr; u8 *dst = walk->dst.virt.addr; if (walk->nbytes == walk->total) { - tail = 0; - sm4_ce_pmull_gcm_crypt(ctx->key.rkey_enc, dst, src, iv, walk->nbytes, ghash, ctx->ghash_table, (const u8 *)&lengths); - } else if (walk->nbytes - tail) { - sm4_ce_pmull_gcm_crypt(ctx->key.rkey_enc, dst, src, iv, - walk->nbytes - tail, ghash, - ctx->ghash_table, NULL); + + kernel_neon_end(); + + return skcipher_walk_done(walk, 0); } + sm4_ce_pmull_gcm_crypt(ctx->key.rkey_enc, dst, src, iv, + walk->nbytes - tail, ghash, + ctx->ghash_table, NULL); + kernel_neon_end(); err = skcipher_walk_done(walk, tail); - if (err) - return err; - if (walk->nbytes) - kernel_neon_begin(); - } while (walk->nbytes > 0); - return 0; + kernel_neon_begin(); + } + + sm4_ce_pmull_gcm_crypt(ctx->key.rkey_enc, NULL, NULL, iv, + walk->nbytes, ghash, ctx->ghash_table, + (const u8 *)&lengths); + + kernel_neon_end(); + + return err; } static int gcm_encrypt(struct aead_request *req) { struct crypto_aead *aead = crypto_aead_reqtfm(req); - struct sm4_gcm_ctx *ctx = crypto_aead_ctx(aead); u8 __aligned(8) ghash[SM4_BLOCK_SIZE]; struct skcipher_walk walk; int err; err = skcipher_walk_aead_encrypt(&walk, req, false); - if (err) - return err; - - err = gcm_crypt(req, &walk, ctx, ghash, sm4_ce_pmull_gcm_enc); + err = gcm_crypt(req, &walk, ghash, err, sm4_ce_pmull_gcm_enc); if (err) return err; @@ -215,17 +218,13 @@ static int gcm_decrypt(struct aead_request *req) { struct crypto_aead *aead = crypto_aead_reqtfm(req); unsigned int authsize = crypto_aead_authsize(aead); - struct sm4_gcm_ctx *ctx = crypto_aead_ctx(aead); u8 __aligned(8) ghash[SM4_BLOCK_SIZE]; u8 authtag[SM4_BLOCK_SIZE]; struct skcipher_walk walk; int err; err = skcipher_walk_aead_decrypt(&walk, req, false); - if (err) - return err; - - err = gcm_crypt(req, &walk, ctx, ghash, sm4_ce_pmull_gcm_dec); + err = gcm_crypt(req, &walk, ghash, err, sm4_ce_pmull_gcm_dec); if (err) return err; From acd4045db64cdebc6ce4be35abe01000311664aa Mon Sep 17 00:00:00 2001 From: Zhang Yiqun Date: Thu, 2 Feb 2023 16:38:05 +0800 Subject: [PATCH 092/156] crypto: testmgr - add diff-splits of src/dst into default cipher config This type of request is often happened in AF_ALG cases. So add this vector in default cipher config array. Signed-off-by: Zhang Yiqun Reviewed-by: Eric Biggers Signed-off-by: Herbert Xu --- crypto/testmgr.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/crypto/testmgr.c b/crypto/testmgr.c index dd748832ed4a..c91e93ece20b 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -356,6 +356,14 @@ static const struct testvec_config default_cipher_testvec_configs[] = { { .proportion_of_total = 5000 }, { .proportion_of_total = 5000 }, }, + }, { + .name = "one src, two even splits dst", + .inplace_mode = OUT_OF_PLACE, + .src_divs = { { .proportion_of_total = 10000 } }, + .dst_divs = { + { .proportion_of_total = 5000 }, + { .proportion_of_total = 5000 }, + }, }, { .name = "uneven misaligned splits, may sleep", .req_flags = CRYPTO_TFM_REQ_MAY_SLEEP, From eaf05e829f389d172ca7e553f22d49d104b6861c Mon Sep 17 00:00:00 2001 From: Neal Liu Date: Fri, 3 Feb 2023 11:35:12 +0800 Subject: [PATCH 093/156] crypto: aspeed - fix type warnings This patch fixes following warnings: 1. sparse: incorrect type in assignment (different base types) Fix: change to __le32 type. 2. sparse: cast removes address space '__iomem' of expression Fix: use readb to avoid dereferencing the memory. Signed-off-by: Neal Liu Signed-off-by: Herbert Xu --- drivers/crypto/aspeed/aspeed-acry.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/crypto/aspeed/aspeed-acry.c b/drivers/crypto/aspeed/aspeed-acry.c index 164c524015f0..1f77ebd73489 100644 --- a/drivers/crypto/aspeed/aspeed-acry.c +++ b/drivers/crypto/aspeed/aspeed-acry.c @@ -252,7 +252,7 @@ static int aspeed_acry_rsa_ctx_copy(struct aspeed_acry_dev *acry_dev, void *buf, enum aspeed_rsa_key_mode mode) { const u8 *src = xbuf; - u32 *dw_buf = (u32 *)buf; + __le32 *dw_buf = buf; int nbits, ndw; int i, j, idx; u32 data = 0; @@ -302,7 +302,7 @@ static int aspeed_acry_rsa_ctx_copy(struct aspeed_acry_dev *acry_dev, void *buf, static int aspeed_acry_rsa_transfer(struct aspeed_acry_dev *acry_dev) { struct akcipher_request *req = acry_dev->req; - u8 *sram_buffer = (u8 *)acry_dev->acry_sram; + u8 __iomem *sram_buffer = acry_dev->acry_sram; struct scatterlist *out_sg = req->dst; static u8 dram_buffer[ASPEED_ACRY_SRAM_MAX_LEN]; int leading_zero = 1; @@ -321,11 +321,11 @@ static int aspeed_acry_rsa_transfer(struct aspeed_acry_dev *acry_dev) for (j = ASPEED_ACRY_SRAM_MAX_LEN - 1; j >= 0; j--) { data_idx = acry_dev->data_byte_mapping[j]; - if (sram_buffer[data_idx] == 0 && leading_zero) { + if (readb(sram_buffer + data_idx) == 0 && leading_zero) { result_nbytes--; } else { leading_zero = 0; - dram_buffer[i] = sram_buffer[data_idx]; + dram_buffer[i] = readb(sram_buffer + data_idx); i++; } } From 4409c08d806721f0be80bf1c6537a983289272ed Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 4 Feb 2023 21:54:08 +0100 Subject: [PATCH 094/156] crypto: virtio/akcipher - Do not use GFP_ATOMIC when not needed There is no need to use GFP_ATOMIC here. GFP_KERNEL is already used for another memory allocation just the line after. Signed-off-by: Christophe JAILLET Signed-off-by: Herbert Xu --- drivers/crypto/virtio/virtio_crypto_akcipher_algs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/crypto/virtio/virtio_crypto_akcipher_algs.c b/drivers/crypto/virtio/virtio_crypto_akcipher_algs.c index b2979be613b8..6963344f6a3a 100644 --- a/drivers/crypto/virtio/virtio_crypto_akcipher_algs.c +++ b/drivers/crypto/virtio/virtio_crypto_akcipher_algs.c @@ -116,7 +116,7 @@ static int virtio_crypto_alg_akcipher_init_session(struct virtio_crypto_akcipher struct virtio_crypto_session_input *input; struct virtio_crypto_ctrl_request *vc_ctrl_req; - pkey = kmemdup(key, keylen, GFP_ATOMIC); + pkey = kmemdup(key, keylen, GFP_KERNEL); if (!pkey) return -ENOMEM; From c35e03eaece71101ff6cbf776b86403860ac8cc3 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 31 Jan 2023 16:01:45 +0800 Subject: [PATCH 095/156] crypto: api - Add scaffolding to change completion function signature The crypto completion function currently takes a pointer to a struct crypto_async_request object. However, in reality the API does not allow the use of any part of the object apart from the data field. For example, ahash/shash will create a fake object on the stack to pass along a different data field. This leads to potential bugs where the user may try to dereference or otherwise use the crypto_async_request object. This patch adds some temporary scaffolding so that the completion function can take a void * instead. Once affected users have been converted this can be removed. The helper crypto_request_complete will remain even after the conversion is complete. It should be used instead of calling the completion function directly. Signed-off-by: Herbert Xu Reviewed-by: Giovanni Cabiddu Signed-off-by: Herbert Xu --- include/crypto/algapi.h | 7 +++++++ include/linux/crypto.h | 6 ++++++ 2 files changed, 13 insertions(+) diff --git a/include/crypto/algapi.h b/include/crypto/algapi.h index 61b327206b55..1fd81e74a174 100644 --- a/include/crypto/algapi.h +++ b/include/crypto/algapi.h @@ -302,4 +302,11 @@ enum { CRYPTO_MSG_ALG_LOADED, }; +static inline void crypto_request_complete(struct crypto_async_request *req, + int err) +{ + crypto_completion_t complete = req->complete; + complete(req, err); +} + #endif /* _CRYPTO_ALGAPI_H */ diff --git a/include/linux/crypto.h b/include/linux/crypto.h index 5d1e961f810e..b18f6e669fb1 100644 --- a/include/linux/crypto.h +++ b/include/linux/crypto.h @@ -176,6 +176,7 @@ struct crypto_async_request; struct crypto_tfm; struct crypto_type; +typedef struct crypto_async_request crypto_completion_data_t; typedef void (*crypto_completion_t)(struct crypto_async_request *req, int err); /** @@ -595,6 +596,11 @@ struct crypto_wait { /* * Async ops completion helper functioons */ +static inline void *crypto_get_completion_data(crypto_completion_data_t *req) +{ + return req->data; +} + void crypto_req_done(struct crypto_async_request *req, int err); static inline int crypto_wait_req(int err, struct crypto_wait *wait) From 96747228b72509e981c146b06f652e0f17a26675 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 6 Feb 2023 18:22:12 +0800 Subject: [PATCH 096/156] dm: Add scaffolding to change completion function signature This patch adds temporary scaffolding so that the Crypto API completion function can take a void * instead of crypto_async_request. Once affected users have been converted this can be removed. Signed-off-by: Herbert Xu Acked-by: Mike Snitzer Signed-off-by: Herbert Xu --- drivers/md/dm-crypt.c | 8 +++----- drivers/md/dm-integrity.c | 4 ++-- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 2653516bcdef..7609fe39ab8c 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1458,8 +1458,7 @@ static int crypt_convert_block_skcipher(struct crypt_config *cc, return r; } -static void kcryptd_async_done(struct crypto_async_request *async_req, - int error); +static void kcryptd_async_done(crypto_completion_data_t *async_req, int error); static int crypt_alloc_req_skcipher(struct crypt_config *cc, struct convert_context *ctx) @@ -2147,10 +2146,9 @@ static void kcryptd_crypt_read_convert(struct dm_crypt_io *io) crypt_dec_pending(io); } -static void kcryptd_async_done(struct crypto_async_request *async_req, - int error) +static void kcryptd_async_done(crypto_completion_data_t *data, int error) { - struct dm_crypt_request *dmreq = async_req->data; + struct dm_crypt_request *dmreq = crypto_get_completion_data(data); struct convert_context *ctx = dmreq->ctx; struct dm_crypt_io *io = container_of(ctx, struct dm_crypt_io, ctx); struct crypt_config *cc = io->cc; diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 1388ee35571e..eefe25ed841e 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -955,9 +955,9 @@ static void xor_journal(struct dm_integrity_c *ic, bool encrypt, unsigned sectio async_tx_issue_pending_all(); } -static void complete_journal_encrypt(struct crypto_async_request *req, int err) +static void complete_journal_encrypt(crypto_completion_data_t *data, int err) { - struct journal_completion *comp = req->data; + struct journal_completion *comp = crypto_get_completion_data(data); if (unlikely(err)) { if (likely(err == -EINPROGRESS)) { complete(&comp->ic->crypto_backoff); From 30859e97e06ac15d77ee2fc26bfe2583c01f3fc7 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 6 Feb 2023 18:22:15 +0800 Subject: [PATCH 097/156] net: macsec: Add scaffolding to change completion function signature This patch adds temporary scaffolding so that the Crypto API completion function can take a void * instead of crypto_async_request. Once affected users have been converted this can be removed. Signed-off-by: Herbert Xu Acked-by: Jarkko Sakkinen Signed-off-by: Herbert Xu --- drivers/net/macsec.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c index bf8ac7a3ded7..b7d9d487ccd2 100644 --- a/drivers/net/macsec.c +++ b/drivers/net/macsec.c @@ -528,9 +528,9 @@ static void count_tx(struct net_device *dev, int ret, int len) } } -static void macsec_encrypt_done(struct crypto_async_request *base, int err) +static void macsec_encrypt_done(crypto_completion_data_t *data, int err) { - struct sk_buff *skb = base->data; + struct sk_buff *skb = crypto_get_completion_data(data); struct net_device *dev = skb->dev; struct macsec_dev *macsec = macsec_priv(dev); struct macsec_tx_sa *sa = macsec_skb_cb(skb)->tx_sa; @@ -835,9 +835,9 @@ static void count_rx(struct net_device *dev, int len) u64_stats_update_end(&stats->syncp); } -static void macsec_decrypt_done(struct crypto_async_request *base, int err) +static void macsec_decrypt_done(crypto_completion_data_t *data, int err) { - struct sk_buff *skb = base->data; + struct sk_buff *skb = crypto_get_completion_data(data); struct net_device *dev = skb->dev; struct macsec_dev *macsec = macsec_priv(dev); struct macsec_rx_sa *rx_sa = macsec_skb_cb(skb)->rx_sa; From 20066bf70034007874c546e456c9546bfad3759a Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 6 Feb 2023 18:22:17 +0800 Subject: [PATCH 098/156] fs: ecryptfs: Use crypto_wait_req This patch replaces the custom crypto completion function with crypto_req_done. Signed-off-by: Herbert Xu Acked-by: Jarkko Sakkinen Signed-off-by: Herbert Xu --- fs/ecryptfs/crypto.c | 30 +++--------------------------- 1 file changed, 3 insertions(+), 27 deletions(-) diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index e3f5d7f3c8a0..c3057539f088 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -260,22 +260,6 @@ int virt_to_scatterlist(const void *addr, int size, struct scatterlist *sg, return i; } -struct extent_crypt_result { - struct completion completion; - int rc; -}; - -static void extent_crypt_complete(struct crypto_async_request *req, int rc) -{ - struct extent_crypt_result *ecr = req->data; - - if (rc == -EINPROGRESS) - return; - - ecr->rc = rc; - complete(&ecr->completion); -} - /** * crypt_scatterlist * @crypt_stat: Pointer to the crypt_stat struct to initialize. @@ -293,7 +277,7 @@ static int crypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat, unsigned char *iv, int op) { struct skcipher_request *req = NULL; - struct extent_crypt_result ecr; + DECLARE_CRYPTO_WAIT(ecr); int rc = 0; if (unlikely(ecryptfs_verbosity > 0)) { @@ -303,8 +287,6 @@ static int crypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat, crypt_stat->key_size); } - init_completion(&ecr.completion); - mutex_lock(&crypt_stat->cs_tfm_mutex); req = skcipher_request_alloc(crypt_stat->tfm, GFP_NOFS); if (!req) { @@ -315,7 +297,7 @@ static int crypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat, skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - extent_crypt_complete, &ecr); + crypto_req_done, &ecr); /* Consider doing this once, when the file is opened */ if (!(crypt_stat->flags & ECRYPTFS_KEY_SET)) { rc = crypto_skcipher_setkey(crypt_stat->tfm, crypt_stat->key, @@ -334,13 +316,7 @@ static int crypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat, skcipher_request_set_crypt(req, src_sg, dst_sg, size, iv); rc = op == ENCRYPT ? crypto_skcipher_encrypt(req) : crypto_skcipher_decrypt(req); - if (rc == -EINPROGRESS || rc == -EBUSY) { - struct extent_crypt_result *ecr = req->base.data; - - wait_for_completion(&ecr->completion); - rc = ecr->rc; - reinit_completion(&ecr->completion); - } + rc = crypto_wait_req(rc, &ecr); out: skcipher_request_free(req); return rc; From fe93d841dda6885f4cdf40de676f1adf81b4fb45 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 6 Feb 2023 18:22:19 +0800 Subject: [PATCH 099/156] Bluetooth: Use crypto_wait_req This patch replaces the custom crypto completion function with crypto_req_done. Signed-off-by: Herbert Xu --- net/bluetooth/ecdh_helper.c | 37 ++++++------------------------------- 1 file changed, 6 insertions(+), 31 deletions(-) diff --git a/net/bluetooth/ecdh_helper.c b/net/bluetooth/ecdh_helper.c index 989401f116e9..0efc93fdae8a 100644 --- a/net/bluetooth/ecdh_helper.c +++ b/net/bluetooth/ecdh_helper.c @@ -25,22 +25,6 @@ #include #include -struct ecdh_completion { - struct completion completion; - int err; -}; - -static void ecdh_complete(struct crypto_async_request *req, int err) -{ - struct ecdh_completion *res = req->data; - - if (err == -EINPROGRESS) - return; - - res->err = err; - complete(&res->completion); -} - static inline void swap_digits(u64 *in, u64 *out, unsigned int ndigits) { int i; @@ -60,9 +44,9 @@ static inline void swap_digits(u64 *in, u64 *out, unsigned int ndigits) int compute_ecdh_secret(struct crypto_kpp *tfm, const u8 public_key[64], u8 secret[32]) { + DECLARE_CRYPTO_WAIT(result); struct kpp_request *req; u8 *tmp; - struct ecdh_completion result; struct scatterlist src, dst; int err; @@ -76,8 +60,6 @@ int compute_ecdh_secret(struct crypto_kpp *tfm, const u8 public_key[64], goto free_tmp; } - init_completion(&result.completion); - swap_digits((u64 *)public_key, (u64 *)tmp, 4); /* x */ swap_digits((u64 *)&public_key[32], (u64 *)&tmp[32], 4); /* y */ @@ -86,12 +68,9 @@ int compute_ecdh_secret(struct crypto_kpp *tfm, const u8 public_key[64], kpp_request_set_input(req, &src, 64); kpp_request_set_output(req, &dst, 32); kpp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, - ecdh_complete, &result); + crypto_req_done, &result); err = crypto_kpp_compute_shared_secret(req); - if (err == -EINPROGRESS) { - wait_for_completion(&result.completion); - err = result.err; - } + err = crypto_wait_req(err, &result); if (err < 0) { pr_err("alg: ecdh: compute shared secret failed. err %d\n", err); @@ -165,9 +144,9 @@ free_tmp: */ int generate_ecdh_public_key(struct crypto_kpp *tfm, u8 public_key[64]) { + DECLARE_CRYPTO_WAIT(result); struct kpp_request *req; u8 *tmp; - struct ecdh_completion result; struct scatterlist dst; int err; @@ -181,18 +160,14 @@ int generate_ecdh_public_key(struct crypto_kpp *tfm, u8 public_key[64]) goto free_tmp; } - init_completion(&result.completion); sg_init_one(&dst, tmp, 64); kpp_request_set_input(req, NULL, 0); kpp_request_set_output(req, &dst, 64); kpp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, - ecdh_complete, &result); + crypto_req_done, &result); err = crypto_kpp_generate_public_key(req); - if (err == -EINPROGRESS) { - wait_for_completion(&result.completion); - err = result.err; - } + err = crypto_wait_req(err, &result); if (err < 0) goto free_all; From 14d3109c9c5aa993c46c2219d222fc40eaff113e Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 6 Feb 2023 18:22:21 +0800 Subject: [PATCH 100/156] net: ipv4: Add scaffolding to change completion function signature This patch adds temporary scaffolding so that the Crypto API completion function can take a void * instead of crypto_async_request. Once affected users have been converted this can be removed. Signed-off-by: Herbert Xu --- net/ipv4/ah4.c | 8 ++++---- net/ipv4/esp4.c | 20 ++++++++++---------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index ee4e578c7f20..1fc0231eb1ee 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -117,11 +117,11 @@ static int ip_clear_mutable_options(const struct iphdr *iph, __be32 *daddr) return 0; } -static void ah_output_done(struct crypto_async_request *base, int err) +static void ah_output_done(crypto_completion_data_t *data, int err) { u8 *icv; struct iphdr *iph; - struct sk_buff *skb = base->data; + struct sk_buff *skb = crypto_get_completion_data(data); struct xfrm_state *x = skb_dst(skb)->xfrm; struct ah_data *ahp = x->data; struct iphdr *top_iph = ip_hdr(skb); @@ -262,12 +262,12 @@ out: return err; } -static void ah_input_done(struct crypto_async_request *base, int err) +static void ah_input_done(crypto_completion_data_t *data, int err) { u8 *auth_data; u8 *icv; struct iphdr *work_iph; - struct sk_buff *skb = base->data; + struct sk_buff *skb = crypto_get_completion_data(data); struct xfrm_state *x = xfrm_input_state(skb); struct ah_data *ahp = x->data; struct ip_auth_hdr *ah = ip_auth_hdr(skb); diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 52c8047efedb..8abe07c1ff28 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -244,9 +244,9 @@ static int esp_output_tail_tcp(struct xfrm_state *x, struct sk_buff *skb) } #endif -static void esp_output_done(struct crypto_async_request *base, int err) +static void esp_output_done(crypto_completion_data_t *data, int err) { - struct sk_buff *skb = base->data; + struct sk_buff *skb = crypto_get_completion_data(data); struct xfrm_offload *xo = xfrm_offload(skb); void *tmp; struct xfrm_state *x; @@ -332,12 +332,12 @@ static struct ip_esp_hdr *esp_output_set_extra(struct sk_buff *skb, return esph; } -static void esp_output_done_esn(struct crypto_async_request *base, int err) +static void esp_output_done_esn(crypto_completion_data_t *data, int err) { - struct sk_buff *skb = base->data; + struct sk_buff *skb = crypto_get_completion_data(data); esp_output_restore_header(skb); - esp_output_done(base, err); + esp_output_done(data, err); } static struct ip_esp_hdr *esp_output_udp_encap(struct sk_buff *skb, @@ -830,9 +830,9 @@ out: } EXPORT_SYMBOL_GPL(esp_input_done2); -static void esp_input_done(struct crypto_async_request *base, int err) +static void esp_input_done(crypto_completion_data_t *data, int err) { - struct sk_buff *skb = base->data; + struct sk_buff *skb = crypto_get_completion_data(data); xfrm_input_resume(skb, esp_input_done2(skb, err)); } @@ -860,12 +860,12 @@ static void esp_input_set_header(struct sk_buff *skb, __be32 *seqhi) } } -static void esp_input_done_esn(struct crypto_async_request *base, int err) +static void esp_input_done_esn(crypto_completion_data_t *data, int err) { - struct sk_buff *skb = base->data; + struct sk_buff *skb = crypto_get_completion_data(data); esp_input_restore_header(skb); - esp_input_done(base, err); + esp_input_done(data, err); } /* From ec2964e807d1d53dbc7a23d7b2c9019245ae121b Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 6 Feb 2023 18:22:23 +0800 Subject: [PATCH 101/156] net: ipv6: Add scaffolding to change completion function signature This patch adds temporary scaffolding so that the Crypto API completion function can take a void * instead of crypto_async_request. Once affected users have been converted this can be removed. Signed-off-by: Herbert Xu --- net/ipv6/ah6.c | 8 ++++---- net/ipv6/esp6.c | 20 ++++++++++---------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index 5228d2716289..e43735578a76 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c @@ -281,12 +281,12 @@ static int ipv6_clear_mutable_options(struct ipv6hdr *iph, int len, int dir) return 0; } -static void ah6_output_done(struct crypto_async_request *base, int err) +static void ah6_output_done(crypto_completion_data_t *data, int err) { int extlen; u8 *iph_base; u8 *icv; - struct sk_buff *skb = base->data; + struct sk_buff *skb = crypto_get_completion_data(data); struct xfrm_state *x = skb_dst(skb)->xfrm; struct ah_data *ahp = x->data; struct ipv6hdr *top_iph = ipv6_hdr(skb); @@ -451,12 +451,12 @@ out: return err; } -static void ah6_input_done(struct crypto_async_request *base, int err) +static void ah6_input_done(crypto_completion_data_t *data, int err) { u8 *auth_data; u8 *icv; u8 *work_iph; - struct sk_buff *skb = base->data; + struct sk_buff *skb = crypto_get_completion_data(data); struct xfrm_state *x = xfrm_input_state(skb); struct ah_data *ahp = x->data; struct ip_auth_hdr *ah = ip_auth_hdr(skb); diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 14ed868680c6..b9ee81c7dfcf 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -278,9 +278,9 @@ static void esp_output_encap_csum(struct sk_buff *skb) } } -static void esp_output_done(struct crypto_async_request *base, int err) +static void esp_output_done(crypto_completion_data_t *data, int err) { - struct sk_buff *skb = base->data; + struct sk_buff *skb = crypto_get_completion_data(data); struct xfrm_offload *xo = xfrm_offload(skb); void *tmp; struct xfrm_state *x; @@ -368,12 +368,12 @@ static struct ip_esp_hdr *esp_output_set_esn(struct sk_buff *skb, return esph; } -static void esp_output_done_esn(struct crypto_async_request *base, int err) +static void esp_output_done_esn(crypto_completion_data_t *data, int err) { - struct sk_buff *skb = base->data; + struct sk_buff *skb = crypto_get_completion_data(data); esp_output_restore_header(skb); - esp_output_done(base, err); + esp_output_done(data, err); } static struct ip_esp_hdr *esp6_output_udp_encap(struct sk_buff *skb, @@ -879,9 +879,9 @@ out: } EXPORT_SYMBOL_GPL(esp6_input_done2); -static void esp_input_done(struct crypto_async_request *base, int err) +static void esp_input_done(crypto_completion_data_t *data, int err) { - struct sk_buff *skb = base->data; + struct sk_buff *skb = crypto_get_completion_data(data); xfrm_input_resume(skb, esp6_input_done2(skb, err)); } @@ -909,12 +909,12 @@ static void esp_input_set_header(struct sk_buff *skb, __be32 *seqhi) } } -static void esp_input_done_esn(struct crypto_async_request *base, int err) +static void esp_input_done_esn(crypto_completion_data_t *data, int err) { - struct sk_buff *skb = base->data; + struct sk_buff *skb = crypto_get_completion_data(data); esp_input_restore_header(skb); - esp_input_done(base, err); + esp_input_done(data, err); } static int esp6_input(struct xfrm_state *x, struct sk_buff *skb) From 1dbab1312254602175c805863eaace59ebf162ba Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 6 Feb 2023 18:22:25 +0800 Subject: [PATCH 102/156] tipc: Add scaffolding to change completion function signature This patch adds temporary scaffolding so that the Crypto API completion function can take a void * instead of crypto_async_request. Once affected users have been converted this can be removed. Signed-off-by: Herbert Xu --- net/tipc/crypto.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/net/tipc/crypto.c b/net/tipc/crypto.c index d67440de011e..ab356e7a3870 100644 --- a/net/tipc/crypto.c +++ b/net/tipc/crypto.c @@ -267,10 +267,10 @@ static int tipc_aead_encrypt(struct tipc_aead *aead, struct sk_buff *skb, struct tipc_bearer *b, struct tipc_media_addr *dst, struct tipc_node *__dnode); -static void tipc_aead_encrypt_done(struct crypto_async_request *base, int err); +static void tipc_aead_encrypt_done(crypto_completion_data_t *data, int err); static int tipc_aead_decrypt(struct net *net, struct tipc_aead *aead, struct sk_buff *skb, struct tipc_bearer *b); -static void tipc_aead_decrypt_done(struct crypto_async_request *base, int err); +static void tipc_aead_decrypt_done(crypto_completion_data_t *data, int err); static inline int tipc_ehdr_size(struct tipc_ehdr *ehdr); static int tipc_ehdr_build(struct net *net, struct tipc_aead *aead, u8 tx_key, struct sk_buff *skb, @@ -830,9 +830,9 @@ exit: return rc; } -static void tipc_aead_encrypt_done(struct crypto_async_request *base, int err) +static void tipc_aead_encrypt_done(crypto_completion_data_t *data, int err) { - struct sk_buff *skb = base->data; + struct sk_buff *skb = crypto_get_completion_data(data); struct tipc_crypto_tx_ctx *tx_ctx = TIPC_SKB_CB(skb)->crypto_ctx; struct tipc_bearer *b = tx_ctx->bearer; struct tipc_aead *aead = tx_ctx->aead; @@ -954,9 +954,9 @@ exit: return rc; } -static void tipc_aead_decrypt_done(struct crypto_async_request *base, int err) +static void tipc_aead_decrypt_done(crypto_completion_data_t *data, int err) { - struct sk_buff *skb = base->data; + struct sk_buff *skb = crypto_get_completion_data(data); struct tipc_crypto_rx_ctx *rx_ctx = TIPC_SKB_CB(skb)->crypto_ctx; struct tipc_bearer *b = rx_ctx->bearer; struct tipc_aead *aead = rx_ctx->aead; From 8d338c76f7cfe0eb4bc46078b1c09c8c5fc75353 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 6 Feb 2023 18:22:27 +0800 Subject: [PATCH 103/156] tls: Only use data field in crypto completion function The crypto_async_request passed to the completion is not guaranteed to be the original request object. Only the data field can be relied upon. Fix this by storing the socket pointer with the AEAD request. Signed-off-by: Herbert Xu --- net/tls/tls.h | 2 ++ net/tls/tls_sw.c | 40 +++++++++++++++++++++++++++++----------- 2 files changed, 31 insertions(+), 11 deletions(-) diff --git a/net/tls/tls.h b/net/tls/tls.h index 0e840a0c3437..804c3880d028 100644 --- a/net/tls/tls.h +++ b/net/tls/tls.h @@ -70,6 +70,8 @@ struct tls_rec { char content_type; struct scatterlist sg_content_type; + struct sock *sk; + char aad_space[TLS_AAD_SPACE_SIZE]; u8 iv_data[MAX_IV_SIZE]; struct aead_request aead_req; diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 9ed978634125..5b7f67a7d394 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include @@ -57,6 +58,7 @@ struct tls_decrypt_arg { }; struct tls_decrypt_ctx { + struct sock *sk; u8 iv[MAX_IV_SIZE]; u8 aad[TLS_MAX_AAD_SIZE]; u8 tail; @@ -177,18 +179,25 @@ static int tls_padding_length(struct tls_prot_info *prot, struct sk_buff *skb, return sub; } -static void tls_decrypt_done(struct crypto_async_request *req, int err) +static void tls_decrypt_done(crypto_completion_data_t *data, int err) { - struct aead_request *aead_req = (struct aead_request *)req; + struct aead_request *aead_req = crypto_get_completion_data(data); + struct crypto_aead *aead = crypto_aead_reqtfm(aead_req); struct scatterlist *sgout = aead_req->dst; struct scatterlist *sgin = aead_req->src; struct tls_sw_context_rx *ctx; + struct tls_decrypt_ctx *dctx; struct tls_context *tls_ctx; struct scatterlist *sg; unsigned int pages; struct sock *sk; + int aead_size; - sk = (struct sock *)req->data; + aead_size = sizeof(*aead_req) + crypto_aead_reqsize(aead); + aead_size = ALIGN(aead_size, __alignof__(*dctx)); + dctx = (void *)((u8 *)aead_req + aead_size); + + sk = dctx->sk; tls_ctx = tls_get_ctx(sk); ctx = tls_sw_ctx_rx(tls_ctx); @@ -240,7 +249,7 @@ static int tls_do_decryption(struct sock *sk, if (darg->async) { aead_request_set_callback(aead_req, CRYPTO_TFM_REQ_MAY_BACKLOG, - tls_decrypt_done, sk); + tls_decrypt_done, aead_req); atomic_inc(&ctx->decrypt_pending); } else { aead_request_set_callback(aead_req, @@ -336,6 +345,8 @@ static struct tls_rec *tls_get_rec(struct sock *sk) sg_set_buf(&rec->sg_aead_out[0], rec->aad_space, prot->aad_size); sg_unmark_end(&rec->sg_aead_out[1]); + rec->sk = sk; + return rec; } @@ -417,22 +428,27 @@ tx_err: return rc; } -static void tls_encrypt_done(struct crypto_async_request *req, int err) +static void tls_encrypt_done(crypto_completion_data_t *data, int err) { - struct aead_request *aead_req = (struct aead_request *)req; - struct sock *sk = req->data; - struct tls_context *tls_ctx = tls_get_ctx(sk); - struct tls_prot_info *prot = &tls_ctx->prot_info; - struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); + struct aead_request *aead_req = crypto_get_completion_data(data); + struct tls_sw_context_tx *ctx; + struct tls_context *tls_ctx; + struct tls_prot_info *prot; struct scatterlist *sge; struct sk_msg *msg_en; struct tls_rec *rec; bool ready = false; + struct sock *sk; int pending; rec = container_of(aead_req, struct tls_rec, aead_req); msg_en = &rec->msg_encrypted; + sk = rec->sk; + tls_ctx = tls_get_ctx(sk); + prot = &tls_ctx->prot_info; + ctx = tls_sw_ctx_tx(tls_ctx); + sge = sk_msg_elem(msg_en, msg_en->sg.curr); sge->offset -= prot->prepend_size; sge->length += prot->prepend_size; @@ -520,7 +536,7 @@ static int tls_do_encryption(struct sock *sk, data_len, rec->iv_data); aead_request_set_callback(aead_req, CRYPTO_TFM_REQ_MAY_BACKLOG, - tls_encrypt_done, sk); + tls_encrypt_done, aead_req); /* Add the record in tx_list */ list_add_tail((struct list_head *)&rec->list, &ctx->tx_list); @@ -1485,6 +1501,7 @@ static int tls_decrypt_sg(struct sock *sk, struct iov_iter *out_iov, * Both structs are variable length. */ aead_size = sizeof(*aead_req) + crypto_aead_reqsize(ctx->aead_recv); + aead_size = ALIGN(aead_size, __alignof__(*dctx)); mem = kmalloc(aead_size + struct_size(dctx, sg, n_sgin + n_sgout), sk->sk_allocation); if (!mem) { @@ -1495,6 +1512,7 @@ static int tls_decrypt_sg(struct sock *sk, struct iov_iter *out_iov, /* Segment the allocated memory */ aead_req = (struct aead_request *)mem; dctx = (struct tls_decrypt_ctx *)(mem + aead_size); + dctx->sk = sk; sgin = &dctx->sg[0]; sgout = &dctx->sg[n_sgin]; From 5419f2b27ea594f97cbeb743789448a9872f283f Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 6 Feb 2023 18:22:29 +0800 Subject: [PATCH 104/156] KEYS: DH: Use crypto_wait_req This patch replaces the custom crypto completion function with crypto_req_done. Signed-off-by: Herbert Xu --- security/keys/dh.c | 30 +++++------------------------- 1 file changed, 5 insertions(+), 25 deletions(-) diff --git a/security/keys/dh.c b/security/keys/dh.c index b339760a31dd..da64c358474b 100644 --- a/security/keys/dh.c +++ b/security/keys/dh.c @@ -64,22 +64,6 @@ static void dh_free_data(struct dh *dh) kfree_sensitive(dh->g); } -struct dh_completion { - struct completion completion; - int err; -}; - -static void dh_crypto_done(struct crypto_async_request *req, int err) -{ - struct dh_completion *compl = req->data; - - if (err == -EINPROGRESS) - return; - - compl->err = err; - complete(&compl->completion); -} - static int kdf_alloc(struct crypto_shash **hash, char *hashname) { struct crypto_shash *tfm; @@ -146,7 +130,7 @@ long __keyctl_dh_compute(struct keyctl_dh_params __user *params, struct keyctl_dh_params pcopy; struct dh dh_inputs; struct scatterlist outsg; - struct dh_completion compl; + DECLARE_CRYPTO_WAIT(compl); struct crypto_kpp *tfm; struct kpp_request *req; uint8_t *secret; @@ -266,22 +250,18 @@ long __keyctl_dh_compute(struct keyctl_dh_params __user *params, kpp_request_set_input(req, NULL, 0); kpp_request_set_output(req, &outsg, outlen); - init_completion(&compl.completion); kpp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - dh_crypto_done, &compl); + crypto_req_done, &compl); /* * For DH, generate_public_key and generate_shared_secret are * the same calculation */ ret = crypto_kpp_generate_public_key(req); - if (ret == -EINPROGRESS) { - wait_for_completion(&compl.completion); - ret = compl.err; - if (ret) - goto out6; - } + ret = crypto_wait_req(ret, &compl); + if (ret) + goto out6; if (kdfcopy) { /* From 256f9e53ae129c9536226b485a8c4e98223088d0 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Wed, 8 Feb 2023 13:53:24 +0800 Subject: [PATCH 105/156] crypto: cryptd - Use subreq for AEAD AEAD reuses the existing request object for its child. This is error-prone and unnecessary. This patch adds a subrequest object just like we do for skcipher and hash. This patch also restores the original completion function as we do for skcipher/hash. Signed-off-by: Herbert Xu --- crypto/cryptd.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/crypto/cryptd.c b/crypto/cryptd.c index 1ff58a021d57..9d60acc920cb 100644 --- a/crypto/cryptd.c +++ b/crypto/cryptd.c @@ -93,6 +93,7 @@ struct cryptd_aead_ctx { struct cryptd_aead_request_ctx { crypto_completion_t complete; + struct aead_request req; }; static void cryptd_queue_worker(struct work_struct *work); @@ -715,6 +716,7 @@ static void cryptd_aead_crypt(struct aead_request *req, int (*crypt)(struct aead_request *req)) { struct cryptd_aead_request_ctx *rctx; + struct aead_request *subreq; struct cryptd_aead_ctx *ctx; crypto_completion_t compl; struct crypto_aead *tfm; @@ -722,13 +724,23 @@ static void cryptd_aead_crypt(struct aead_request *req, rctx = aead_request_ctx(req); compl = rctx->complete; + subreq = &rctx->req; tfm = crypto_aead_reqtfm(req); if (unlikely(err == -EINPROGRESS)) goto out; - aead_request_set_tfm(req, child); - err = crypt( req ); + + aead_request_set_tfm(subreq, child); + aead_request_set_callback(subreq, CRYPTO_TFM_REQ_MAY_SLEEP, + NULL, NULL); + aead_request_set_crypt(subreq, req->src, req->dst, req->cryptlen, + req->iv); + aead_request_set_ad(subreq, req->assoclen); + + err = crypt(subreq); + + req->base.complete = compl; out: ctx = crypto_aead_ctx(tfm); @@ -798,8 +810,8 @@ static int cryptd_aead_init_tfm(struct crypto_aead *tfm) ctx->child = cipher; crypto_aead_set_reqsize( - tfm, max((unsigned)sizeof(struct cryptd_aead_request_ctx), - crypto_aead_reqsize(cipher))); + tfm, sizeof(struct cryptd_aead_request_ctx) + + crypto_aead_reqsize(cipher)); return 0; } From 4cc01c7f3df10a4f7aa11266e8406f7e16898769 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 31 Jan 2023 16:01:49 +0800 Subject: [PATCH 106/156] crypto: acompress - Use crypto_request_complete Use the crypto_request_complete helper instead of calling the completion function directly. Signed-off-by: Herbert Xu Reviewed-by: Giovanni Cabiddu Signed-off-by: Herbert Xu --- include/crypto/internal/acompress.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/crypto/internal/acompress.h b/include/crypto/internal/acompress.h index 49339003bd2c..978b57a3f4f0 100644 --- a/include/crypto/internal/acompress.h +++ b/include/crypto/internal/acompress.h @@ -28,7 +28,7 @@ static inline void *acomp_tfm_ctx(struct crypto_acomp *tfm) static inline void acomp_request_complete(struct acomp_req *req, int err) { - req->base.complete(&req->base, err); + crypto_request_complete(&req->base, err); } static inline const char *acomp_alg_name(struct crypto_acomp *tfm) From 372e6b80bafe1269f43c750885aa32baa34cd017 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 31 Jan 2023 16:01:51 +0800 Subject: [PATCH 107/156] crypto: aead - Use crypto_request_complete Use the crypto_request_complete helper instead of calling the completion function directly. Signed-off-by: Herbert Xu --- include/crypto/internal/aead.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/crypto/internal/aead.h b/include/crypto/internal/aead.h index cd8cb1e921b7..28a95eb3182d 100644 --- a/include/crypto/internal/aead.h +++ b/include/crypto/internal/aead.h @@ -82,7 +82,7 @@ static inline void *aead_request_ctx_dma(struct aead_request *req) static inline void aead_request_complete(struct aead_request *req, int err) { - req->base.complete(&req->base, err); + crypto_request_complete(&req->base, err); } static inline u32 aead_request_flags(struct aead_request *req) From 700d507805727635e8f57882a7f9641e8ca1eb19 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 31 Jan 2023 16:01:53 +0800 Subject: [PATCH 108/156] crypto: akcipher - Use crypto_request_complete Use the crypto_request_complete helper instead of calling the completion function directly. Signed-off-by: Herbert Xu --- include/crypto/internal/akcipher.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/crypto/internal/akcipher.h b/include/crypto/internal/akcipher.h index aaf1092b93b8..a0fba4b2eccf 100644 --- a/include/crypto/internal/akcipher.h +++ b/include/crypto/internal/akcipher.h @@ -69,7 +69,7 @@ static inline void *akcipher_tfm_ctx_dma(struct crypto_akcipher *tfm) static inline void akcipher_request_complete(struct akcipher_request *req, int err) { - req->base.complete(&req->base, err); + crypto_request_complete(&req->base, err); } static inline const char *akcipher_alg_name(struct crypto_akcipher *tfm) From d9588045f540157e36d8d6aeb54d6ce769a89b5b Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 10 Feb 2023 20:20:20 +0800 Subject: [PATCH 109/156] crypto: hash - Use crypto_request_complete Use the crypto_request_complete helper instead of calling the completion function directly. This patch also removes the voodoo programming previously used for unaligned ahash operations and replaces it with a sub-request. Signed-off-by: Herbert Xu --- crypto/ahash.c | 173 ++++++++++++++------------------- include/crypto/internal/hash.h | 2 +- 2 files changed, 72 insertions(+), 103 deletions(-) diff --git a/crypto/ahash.c b/crypto/ahash.c index 4b089f1b770f..19241b18a4d1 100644 --- a/crypto/ahash.c +++ b/crypto/ahash.c @@ -190,133 +190,98 @@ int crypto_ahash_setkey(struct crypto_ahash *tfm, const u8 *key, } EXPORT_SYMBOL_GPL(crypto_ahash_setkey); -static inline unsigned int ahash_align_buffer_size(unsigned len, - unsigned long mask) -{ - return len + (mask & ~(crypto_tfm_ctx_alignment() - 1)); -} - -static int ahash_save_req(struct ahash_request *req, crypto_completion_t cplt) +static int ahash_save_req(struct ahash_request *req, crypto_completion_t cplt, + bool has_state) { struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); unsigned long alignmask = crypto_ahash_alignmask(tfm); unsigned int ds = crypto_ahash_digestsize(tfm); - struct ahash_request_priv *priv; + struct ahash_request *subreq; + unsigned int subreq_size; + unsigned int reqsize; + u8 *result; + gfp_t gfp; + u32 flags; - priv = kmalloc(sizeof(*priv) + ahash_align_buffer_size(ds, alignmask), - (req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP) ? - GFP_KERNEL : GFP_ATOMIC); - if (!priv) + subreq_size = sizeof(*subreq); + reqsize = crypto_ahash_reqsize(tfm); + reqsize = ALIGN(reqsize, crypto_tfm_ctx_alignment()); + subreq_size += reqsize; + subreq_size += ds; + subreq_size += alignmask & ~(crypto_tfm_ctx_alignment() - 1); + + flags = ahash_request_flags(req); + gfp = (flags & CRYPTO_TFM_REQ_MAY_SLEEP) ? GFP_KERNEL : GFP_ATOMIC; + subreq = kmalloc(subreq_size, gfp); + if (!subreq) return -ENOMEM; - /* - * WARNING: Voodoo programming below! - * - * The code below is obscure and hard to understand, thus explanation - * is necessary. See include/crypto/hash.h and include/linux/crypto.h - * to understand the layout of structures used here! - * - * The code here will replace portions of the ORIGINAL request with - * pointers to new code and buffers so the hashing operation can store - * the result in aligned buffer. We will call the modified request - * an ADJUSTED request. - * - * The newly mangled request will look as such: - * - * req { - * .result = ADJUSTED[new aligned buffer] - * .base.complete = ADJUSTED[pointer to completion function] - * .base.data = ADJUSTED[*req (pointer to self)] - * .priv = ADJUSTED[new priv] { - * .result = ORIGINAL(result) - * .complete = ORIGINAL(base.complete) - * .data = ORIGINAL(base.data) - * } - */ + ahash_request_set_tfm(subreq, tfm); + ahash_request_set_callback(subreq, flags, cplt, req); - priv->result = req->result; - priv->complete = req->base.complete; - priv->data = req->base.data; - priv->flags = req->base.flags; + result = (u8 *)(subreq + 1) + reqsize; + result = PTR_ALIGN(result, alignmask + 1); - /* - * WARNING: We do not backup req->priv here! The req->priv - * is for internal use of the Crypto API and the - * user must _NOT_ _EVER_ depend on it's content! - */ + ahash_request_set_crypt(subreq, req->src, result, req->nbytes); - req->result = PTR_ALIGN((u8 *)priv->ubuf, alignmask + 1); - req->base.complete = cplt; - req->base.data = req; - req->priv = priv; + if (has_state) { + void *state; + + state = kmalloc(crypto_ahash_statesize(tfm), gfp); + if (!state) { + kfree(subreq); + return -ENOMEM; + } + + crypto_ahash_export(req, state); + crypto_ahash_import(subreq, state); + kfree_sensitive(state); + } + + req->priv = subreq; return 0; } static void ahash_restore_req(struct ahash_request *req, int err) { - struct ahash_request_priv *priv = req->priv; + struct ahash_request *subreq = req->priv; if (!err) - memcpy(priv->result, req->result, + memcpy(req->result, subreq->result, crypto_ahash_digestsize(crypto_ahash_reqtfm(req))); - /* Restore the original crypto request. */ - req->result = priv->result; - - ahash_request_set_callback(req, priv->flags, - priv->complete, priv->data); req->priv = NULL; - /* Free the req->priv.priv from the ADJUSTED request. */ - kfree_sensitive(priv); -} - -static void ahash_notify_einprogress(struct ahash_request *req) -{ - struct ahash_request_priv *priv = req->priv; - struct crypto_async_request oreq; - - oreq.data = priv->data; - - priv->complete(&oreq, -EINPROGRESS); + kfree_sensitive(subreq); } static void ahash_op_unaligned_done(struct crypto_async_request *req, int err) { struct ahash_request *areq = req->data; - if (err == -EINPROGRESS) { - ahash_notify_einprogress(areq); - return; - } - - /* - * Restore the original request, see ahash_op_unaligned() for what - * goes where. - * - * The "struct ahash_request *req" here is in fact the "req.base" - * from the ADJUSTED request from ahash_op_unaligned(), thus as it - * is a pointer to self, it is also the ADJUSTED "req" . - */ + if (err == -EINPROGRESS) + goto out; /* First copy req->result into req->priv.result */ ahash_restore_req(areq, err); +out: /* Complete the ORIGINAL request. */ - areq->base.complete(&areq->base, err); + ahash_request_complete(areq, err); } static int ahash_op_unaligned(struct ahash_request *req, - int (*op)(struct ahash_request *)) + int (*op)(struct ahash_request *), + bool has_state) { int err; - err = ahash_save_req(req, ahash_op_unaligned_done); + err = ahash_save_req(req, ahash_op_unaligned_done, has_state); if (err) return err; - err = op(req); + err = op(req->priv); if (err == -EINPROGRESS || err == -EBUSY) return err; @@ -326,13 +291,14 @@ static int ahash_op_unaligned(struct ahash_request *req, } static int crypto_ahash_op(struct ahash_request *req, - int (*op)(struct ahash_request *)) + int (*op)(struct ahash_request *), + bool has_state) { struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); unsigned long alignmask = crypto_ahash_alignmask(tfm); if ((unsigned long)req->result & alignmask) - return ahash_op_unaligned(req, op); + return ahash_op_unaligned(req, op, has_state); return op(req); } @@ -345,7 +311,7 @@ int crypto_ahash_final(struct ahash_request *req) int ret; crypto_stats_get(alg); - ret = crypto_ahash_op(req, crypto_ahash_reqtfm(req)->final); + ret = crypto_ahash_op(req, crypto_ahash_reqtfm(req)->final, true); crypto_stats_ahash_final(nbytes, ret, alg); return ret; } @@ -359,7 +325,7 @@ int crypto_ahash_finup(struct ahash_request *req) int ret; crypto_stats_get(alg); - ret = crypto_ahash_op(req, crypto_ahash_reqtfm(req)->finup); + ret = crypto_ahash_op(req, crypto_ahash_reqtfm(req)->finup, true); crypto_stats_ahash_final(nbytes, ret, alg); return ret; } @@ -376,7 +342,7 @@ int crypto_ahash_digest(struct ahash_request *req) if (crypto_ahash_get_flags(tfm) & CRYPTO_TFM_NEED_KEY) ret = -ENOKEY; else - ret = crypto_ahash_op(req, tfm->digest); + ret = crypto_ahash_op(req, tfm->digest, false); crypto_stats_ahash_final(nbytes, ret, alg); return ret; } @@ -391,17 +357,19 @@ static void ahash_def_finup_done2(struct crypto_async_request *req, int err) ahash_restore_req(areq, err); - areq->base.complete(&areq->base, err); + ahash_request_complete(areq, err); } static int ahash_def_finup_finish1(struct ahash_request *req, int err) { + struct ahash_request *subreq = req->priv; + if (err) goto out; - req->base.complete = ahash_def_finup_done2; + subreq->base.complete = ahash_def_finup_done2; - err = crypto_ahash_reqtfm(req)->final(req); + err = crypto_ahash_reqtfm(req)->final(subreq); if (err == -EINPROGRESS || err == -EBUSY) return err; @@ -413,19 +381,20 @@ out: static void ahash_def_finup_done1(struct crypto_async_request *req, int err) { struct ahash_request *areq = req->data; + struct ahash_request *subreq; - if (err == -EINPROGRESS) { - ahash_notify_einprogress(areq); - return; - } + if (err == -EINPROGRESS) + goto out; - areq->base.flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + subreq = areq->priv; + subreq->base.flags &= CRYPTO_TFM_REQ_MAY_BACKLOG; err = ahash_def_finup_finish1(areq, err); - if (areq->priv) + if (err == -EINPROGRESS || err == -EBUSY) return; - areq->base.complete(&areq->base, err); +out: + ahash_request_complete(areq, err); } static int ahash_def_finup(struct ahash_request *req) @@ -433,11 +402,11 @@ static int ahash_def_finup(struct ahash_request *req) struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); int err; - err = ahash_save_req(req, ahash_def_finup_done1); + err = ahash_save_req(req, ahash_def_finup_done1, true); if (err) return err; - err = tfm->update(req); + err = tfm->update(req->priv); if (err == -EINPROGRESS || err == -EBUSY) return err; diff --git a/include/crypto/internal/hash.h b/include/crypto/internal/hash.h index 1a2a41b79253..0b259dbb97af 100644 --- a/include/crypto/internal/hash.h +++ b/include/crypto/internal/hash.h @@ -199,7 +199,7 @@ static inline void *ahash_request_ctx_dma(struct ahash_request *req) static inline void ahash_request_complete(struct ahash_request *req, int err) { - req->base.complete(&req->base, err); + crypto_request_complete(&req->base, err); } static inline u32 ahash_request_flags(struct ahash_request *req) From ba354b2fdb1068495ffeabc9f48b799407300f92 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 31 Jan 2023 16:01:58 +0800 Subject: [PATCH 110/156] crypto: kpp - Use crypto_request_complete Use the crypto_request_complete helper instead of calling the completion function directly. Signed-off-by: Herbert Xu --- include/crypto/internal/kpp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/crypto/internal/kpp.h b/include/crypto/internal/kpp.h index 3c9726e89f53..0a6db8c4a9a0 100644 --- a/include/crypto/internal/kpp.h +++ b/include/crypto/internal/kpp.h @@ -85,7 +85,7 @@ static inline void *kpp_tfm_ctx_dma(struct crypto_kpp *tfm) static inline void kpp_request_complete(struct kpp_request *req, int err) { - req->base.complete(&req->base, err); + crypto_request_complete(&req->base, err); } static inline const char *kpp_alg_name(struct crypto_kpp *tfm) From d5770679ad5abc8d62dc5aff7b3f888dfe8a05fa Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 31 Jan 2023 16:02:00 +0800 Subject: [PATCH 111/156] crypto: skcipher - Use crypto_request_complete Use the crypto_request_complete helper instead of calling the completion function directly. Signed-off-by: Herbert Xu --- include/crypto/internal/skcipher.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/crypto/internal/skcipher.h b/include/crypto/internal/skcipher.h index 06d0a5491cf3..fb3d9e899f52 100644 --- a/include/crypto/internal/skcipher.h +++ b/include/crypto/internal/skcipher.h @@ -94,7 +94,7 @@ static inline void *skcipher_instance_ctx(struct skcipher_instance *inst) static inline void skcipher_request_complete(struct skcipher_request *req, int err) { - req->base.complete(&req->base, err); + crypto_request_complete(&req->base, err); } int crypto_grab_skcipher(struct crypto_skcipher_spawn *spawn, From 6909823d47c17cba84e9244d04050b5db8d53789 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 31 Jan 2023 16:02:02 +0800 Subject: [PATCH 112/156] crypto: engine - Use crypto_request_complete Use the crypto_request_complete helper instead of calling the completion function directly. Signed-off-by: Herbert Xu --- crypto/crypto_engine.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/crypto/crypto_engine.c b/crypto/crypto_engine.c index 64dc9aa3ca24..21f791615114 100644 --- a/crypto/crypto_engine.c +++ b/crypto/crypto_engine.c @@ -54,7 +54,7 @@ static void crypto_finalize_request(struct crypto_engine *engine, } } lockdep_assert_in_softirq(); - req->complete(req, err); + crypto_request_complete(req, err); kthread_queue_work(engine->kworker, &engine->pump_requests); } @@ -130,7 +130,7 @@ start_request: engine->cur_req = async_req; if (backlog) - backlog->complete(backlog, -EINPROGRESS); + crypto_request_complete(backlog, -EINPROGRESS); if (engine->busy) was_busy = true; @@ -214,7 +214,7 @@ req_err_1: } req_err_2: - async_req->complete(async_req, ret); + crypto_request_complete(async_req, ret); retry: /* If retry mechanism is supported, send new requests to engine */ From 564cabc0ca0bdfa8f0fc1ae74b24d0a7554522c5 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 31 Jan 2023 16:02:04 +0800 Subject: [PATCH 113/156] crypto: rsa-pkcs1pad - Use akcipher_request_complete Use the akcipher_request_complete helper instead of calling the completion function directly. In fact the previous code was buggy in that EINPROGRESS was never passed back to the original caller. Fixes: 3d5b1ecdea6f ("crypto: rsa - RSA padding algorithm") Signed-off-by: Herbert Xu --- crypto/rsa-pkcs1pad.c | 34 +++++++++++++++------------------- 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/crypto/rsa-pkcs1pad.c b/crypto/rsa-pkcs1pad.c index 141f7e08df6a..02028670331d 100644 --- a/crypto/rsa-pkcs1pad.c +++ b/crypto/rsa-pkcs1pad.c @@ -214,16 +214,14 @@ static void pkcs1pad_encrypt_sign_complete_cb( struct crypto_async_request *child_async_req, int err) { struct akcipher_request *req = child_async_req->data; - struct crypto_async_request async_req; if (err == -EINPROGRESS) - return; + goto out; - async_req.data = req->base.data; - async_req.tfm = crypto_akcipher_tfm(crypto_akcipher_reqtfm(req)); - async_req.flags = child_async_req->flags; - req->base.complete(&async_req, - pkcs1pad_encrypt_sign_complete(req, err)); + err = pkcs1pad_encrypt_sign_complete(req, err); + +out: + akcipher_request_complete(req, err); } static int pkcs1pad_encrypt(struct akcipher_request *req) @@ -332,15 +330,14 @@ static void pkcs1pad_decrypt_complete_cb( struct crypto_async_request *child_async_req, int err) { struct akcipher_request *req = child_async_req->data; - struct crypto_async_request async_req; if (err == -EINPROGRESS) - return; + goto out; - async_req.data = req->base.data; - async_req.tfm = crypto_akcipher_tfm(crypto_akcipher_reqtfm(req)); - async_req.flags = child_async_req->flags; - req->base.complete(&async_req, pkcs1pad_decrypt_complete(req, err)); + err = pkcs1pad_decrypt_complete(req, err); + +out: + akcipher_request_complete(req, err); } static int pkcs1pad_decrypt(struct akcipher_request *req) @@ -513,15 +510,14 @@ static void pkcs1pad_verify_complete_cb( struct crypto_async_request *child_async_req, int err) { struct akcipher_request *req = child_async_req->data; - struct crypto_async_request async_req; if (err == -EINPROGRESS) - return; + goto out; - async_req.data = req->base.data; - async_req.tfm = crypto_akcipher_tfm(crypto_akcipher_reqtfm(req)); - async_req.flags = child_async_req->flags; - req->base.complete(&async_req, pkcs1pad_verify_complete(req, err)); + err = pkcs1pad_verify_complete(req, err); + +out: + akcipher_request_complete(req, err); } /* From f27c94aac5b071f402c0ff8ef439d4b2058502fd Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Wed, 8 Feb 2023 13:56:00 +0800 Subject: [PATCH 114/156] crypto: cryptd - Use request_complete helpers Use the request_complete helpers instead of calling the completion function directly. Signed-off-by: Herbert Xu --- crypto/cryptd.c | 234 ++++++++++++++++++++++++++---------------------- 1 file changed, 126 insertions(+), 108 deletions(-) diff --git a/crypto/cryptd.c b/crypto/cryptd.c index 9d60acc920cb..29890fc0eab7 100644 --- a/crypto/cryptd.c +++ b/crypto/cryptd.c @@ -72,7 +72,6 @@ struct cryptd_skcipher_ctx { }; struct cryptd_skcipher_request_ctx { - crypto_completion_t complete; struct skcipher_request req; }; @@ -83,6 +82,7 @@ struct cryptd_hash_ctx { struct cryptd_hash_request_ctx { crypto_completion_t complete; + void *data; struct shash_desc desc; }; @@ -92,7 +92,6 @@ struct cryptd_aead_ctx { }; struct cryptd_aead_request_ctx { - crypto_completion_t complete; struct aead_request req; }; @@ -178,8 +177,8 @@ static void cryptd_queue_worker(struct work_struct *work) return; if (backlog) - backlog->complete(backlog, -EINPROGRESS); - req->complete(req, 0); + crypto_request_complete(backlog, -EINPROGRESS); + crypto_request_complete(req, 0); if (cpu_queue->queue.qlen) queue_work(cryptd_wq, &cpu_queue->work); @@ -238,18 +237,51 @@ static int cryptd_skcipher_setkey(struct crypto_skcipher *parent, return crypto_skcipher_setkey(child, key, keylen); } -static void cryptd_skcipher_complete(struct skcipher_request *req, int err) +static struct skcipher_request *cryptd_skcipher_prepare( + struct skcipher_request *req, int err) { + struct cryptd_skcipher_request_ctx *rctx = skcipher_request_ctx(req); + struct skcipher_request *subreq = &rctx->req; + struct cryptd_skcipher_ctx *ctx; + struct crypto_skcipher *child; + + req->base.complete = subreq->base.complete; + req->base.data = subreq->base.data; + + if (unlikely(err == -EINPROGRESS)) + return NULL; + + ctx = crypto_skcipher_ctx(crypto_skcipher_reqtfm(req)); + child = ctx->child; + + skcipher_request_set_tfm(subreq, child); + skcipher_request_set_callback(subreq, CRYPTO_TFM_REQ_MAY_SLEEP, + NULL, NULL); + skcipher_request_set_crypt(subreq, req->src, req->dst, req->cryptlen, + req->iv); + + return subreq; +} + +static void cryptd_skcipher_complete(struct skcipher_request *req, int err, + crypto_completion_t complete) +{ + struct cryptd_skcipher_request_ctx *rctx = skcipher_request_ctx(req); struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct cryptd_skcipher_ctx *ctx = crypto_skcipher_ctx(tfm); - struct cryptd_skcipher_request_ctx *rctx = skcipher_request_ctx(req); + struct skcipher_request *subreq = &rctx->req; int refcnt = refcount_read(&ctx->refcnt); local_bh_disable(); - rctx->complete(&req->base, err); + skcipher_request_complete(req, err); local_bh_enable(); - if (err != -EINPROGRESS && refcnt && refcount_dec_and_test(&ctx->refcnt)) + if (unlikely(err == -EINPROGRESS)) { + subreq->base.complete = req->base.complete; + subreq->base.data = req->base.data; + req->base.complete = complete; + req->base.data = req; + } else if (refcnt && refcount_dec_and_test(&ctx->refcnt)) crypto_free_skcipher(tfm); } @@ -257,54 +289,26 @@ static void cryptd_skcipher_encrypt(struct crypto_async_request *base, int err) { struct skcipher_request *req = skcipher_request_cast(base); - struct cryptd_skcipher_request_ctx *rctx = skcipher_request_ctx(req); - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct cryptd_skcipher_ctx *ctx = crypto_skcipher_ctx(tfm); - struct skcipher_request *subreq = &rctx->req; - struct crypto_skcipher *child = ctx->child; + struct skcipher_request *subreq; - if (unlikely(err == -EINPROGRESS)) - goto out; + subreq = cryptd_skcipher_prepare(req, err); + if (likely(subreq)) + err = crypto_skcipher_encrypt(subreq); - skcipher_request_set_tfm(subreq, child); - skcipher_request_set_callback(subreq, CRYPTO_TFM_REQ_MAY_SLEEP, - NULL, NULL); - skcipher_request_set_crypt(subreq, req->src, req->dst, req->cryptlen, - req->iv); - - err = crypto_skcipher_encrypt(subreq); - - req->base.complete = rctx->complete; - -out: - cryptd_skcipher_complete(req, err); + cryptd_skcipher_complete(req, err, cryptd_skcipher_encrypt); } static void cryptd_skcipher_decrypt(struct crypto_async_request *base, int err) { struct skcipher_request *req = skcipher_request_cast(base); - struct cryptd_skcipher_request_ctx *rctx = skcipher_request_ctx(req); - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct cryptd_skcipher_ctx *ctx = crypto_skcipher_ctx(tfm); - struct skcipher_request *subreq = &rctx->req; - struct crypto_skcipher *child = ctx->child; + struct skcipher_request *subreq; - if (unlikely(err == -EINPROGRESS)) - goto out; + subreq = cryptd_skcipher_prepare(req, err); + if (likely(subreq)) + err = crypto_skcipher_decrypt(subreq); - skcipher_request_set_tfm(subreq, child); - skcipher_request_set_callback(subreq, CRYPTO_TFM_REQ_MAY_SLEEP, - NULL, NULL); - skcipher_request_set_crypt(subreq, req->src, req->dst, req->cryptlen, - req->iv); - - err = crypto_skcipher_decrypt(subreq); - - req->base.complete = rctx->complete; - -out: - cryptd_skcipher_complete(req, err); + cryptd_skcipher_complete(req, err, cryptd_skcipher_decrypt); } static int cryptd_skcipher_enqueue(struct skcipher_request *req, @@ -312,11 +316,14 @@ static int cryptd_skcipher_enqueue(struct skcipher_request *req, { struct cryptd_skcipher_request_ctx *rctx = skcipher_request_ctx(req); struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct skcipher_request *subreq = &rctx->req; struct cryptd_queue *queue; queue = cryptd_get_queue(crypto_skcipher_tfm(tfm)); - rctx->complete = req->base.complete; + subreq->base.complete = req->base.complete; + subreq->base.data = req->base.data; req->base.complete = compl; + req->base.data = req; return cryptd_enqueue_request(queue, &req->base); } @@ -469,45 +476,63 @@ static int cryptd_hash_enqueue(struct ahash_request *req, cryptd_get_queue(crypto_ahash_tfm(tfm)); rctx->complete = req->base.complete; + rctx->data = req->base.data; req->base.complete = compl; + req->base.data = req; return cryptd_enqueue_request(queue, &req->base); } -static void cryptd_hash_complete(struct ahash_request *req, int err) +static struct shash_desc *cryptd_hash_prepare(struct ahash_request *req, + int err) +{ + struct cryptd_hash_request_ctx *rctx = ahash_request_ctx(req); + + req->base.complete = rctx->complete; + req->base.data = rctx->data; + + if (unlikely(err == -EINPROGRESS)) + return NULL; + + return &rctx->desc; +} + +static void cryptd_hash_complete(struct ahash_request *req, int err, + crypto_completion_t complete) { struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); struct cryptd_hash_ctx *ctx = crypto_ahash_ctx(tfm); - struct cryptd_hash_request_ctx *rctx = ahash_request_ctx(req); int refcnt = refcount_read(&ctx->refcnt); local_bh_disable(); - rctx->complete(&req->base, err); + ahash_request_complete(req, err); local_bh_enable(); - if (err != -EINPROGRESS && refcnt && refcount_dec_and_test(&ctx->refcnt)) + if (err == -EINPROGRESS) { + req->base.complete = complete; + req->base.data = req; + } else if (refcnt && refcount_dec_and_test(&ctx->refcnt)) crypto_free_ahash(tfm); } static void cryptd_hash_init(struct crypto_async_request *req_async, int err) { - struct cryptd_hash_ctx *ctx = crypto_tfm_ctx(req_async->tfm); - struct crypto_shash *child = ctx->child; struct ahash_request *req = ahash_request_cast(req_async); - struct cryptd_hash_request_ctx *rctx = ahash_request_ctx(req); - struct shash_desc *desc = &rctx->desc; + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct cryptd_hash_ctx *ctx = crypto_ahash_ctx(tfm); + struct crypto_shash *child = ctx->child; + struct shash_desc *desc; - if (unlikely(err == -EINPROGRESS)) + desc = cryptd_hash_prepare(req, err); + if (unlikely(!desc)) goto out; desc->tfm = child; err = crypto_shash_init(desc); - req->base.complete = rctx->complete; - out: - cryptd_hash_complete(req, err); + cryptd_hash_complete(req, err, cryptd_hash_init); } static int cryptd_hash_init_enqueue(struct ahash_request *req) @@ -518,19 +543,13 @@ static int cryptd_hash_init_enqueue(struct ahash_request *req) static void cryptd_hash_update(struct crypto_async_request *req_async, int err) { struct ahash_request *req = ahash_request_cast(req_async); - struct cryptd_hash_request_ctx *rctx; + struct shash_desc *desc; - rctx = ahash_request_ctx(req); + desc = cryptd_hash_prepare(req, err); + if (likely(desc)) + err = shash_ahash_update(req, desc); - if (unlikely(err == -EINPROGRESS)) - goto out; - - err = shash_ahash_update(req, &rctx->desc); - - req->base.complete = rctx->complete; - -out: - cryptd_hash_complete(req, err); + cryptd_hash_complete(req, err, cryptd_hash_update); } static int cryptd_hash_update_enqueue(struct ahash_request *req) @@ -541,17 +560,13 @@ static int cryptd_hash_update_enqueue(struct ahash_request *req) static void cryptd_hash_final(struct crypto_async_request *req_async, int err) { struct ahash_request *req = ahash_request_cast(req_async); - struct cryptd_hash_request_ctx *rctx = ahash_request_ctx(req); + struct shash_desc *desc; - if (unlikely(err == -EINPROGRESS)) - goto out; + desc = cryptd_hash_prepare(req, err); + if (likely(desc)) + err = crypto_shash_final(desc, req->result); - err = crypto_shash_final(&rctx->desc, req->result); - - req->base.complete = rctx->complete; - -out: - cryptd_hash_complete(req, err); + cryptd_hash_complete(req, err, cryptd_hash_final); } static int cryptd_hash_final_enqueue(struct ahash_request *req) @@ -562,17 +577,13 @@ static int cryptd_hash_final_enqueue(struct ahash_request *req) static void cryptd_hash_finup(struct crypto_async_request *req_async, int err) { struct ahash_request *req = ahash_request_cast(req_async); - struct cryptd_hash_request_ctx *rctx = ahash_request_ctx(req); + struct shash_desc *desc; - if (unlikely(err == -EINPROGRESS)) - goto out; + desc = cryptd_hash_prepare(req, err); + if (likely(desc)) + err = shash_ahash_finup(req, desc); - err = shash_ahash_finup(req, &rctx->desc); - - req->base.complete = rctx->complete; - -out: - cryptd_hash_complete(req, err); + cryptd_hash_complete(req, err, cryptd_hash_finup); } static int cryptd_hash_finup_enqueue(struct ahash_request *req) @@ -582,23 +593,22 @@ static int cryptd_hash_finup_enqueue(struct ahash_request *req) static void cryptd_hash_digest(struct crypto_async_request *req_async, int err) { - struct cryptd_hash_ctx *ctx = crypto_tfm_ctx(req_async->tfm); - struct crypto_shash *child = ctx->child; struct ahash_request *req = ahash_request_cast(req_async); - struct cryptd_hash_request_ctx *rctx = ahash_request_ctx(req); - struct shash_desc *desc = &rctx->desc; + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct cryptd_hash_ctx *ctx = crypto_ahash_ctx(tfm); + struct crypto_shash *child = ctx->child; + struct shash_desc *desc; - if (unlikely(err == -EINPROGRESS)) + desc = cryptd_hash_prepare(req, err); + if (unlikely(!desc)) goto out; desc->tfm = child; err = shash_ahash_digest(req, desc); - req->base.complete = rctx->complete; - out: - cryptd_hash_complete(req, err); + cryptd_hash_complete(req, err, cryptd_hash_digest); } static int cryptd_hash_digest_enqueue(struct ahash_request *req) @@ -711,20 +721,20 @@ static int cryptd_aead_setauthsize(struct crypto_aead *parent, } static void cryptd_aead_crypt(struct aead_request *req, - struct crypto_aead *child, - int err, - int (*crypt)(struct aead_request *req)) + struct crypto_aead *child, int err, + int (*crypt)(struct aead_request *req), + crypto_completion_t compl) { struct cryptd_aead_request_ctx *rctx; struct aead_request *subreq; struct cryptd_aead_ctx *ctx; - crypto_completion_t compl; struct crypto_aead *tfm; int refcnt; rctx = aead_request_ctx(req); - compl = rctx->complete; subreq = &rctx->req; + req->base.complete = subreq->base.complete; + req->base.data = subreq->base.data; tfm = crypto_aead_reqtfm(req); @@ -740,17 +750,20 @@ static void cryptd_aead_crypt(struct aead_request *req, err = crypt(subreq); - req->base.complete = compl; - out: ctx = crypto_aead_ctx(tfm); refcnt = refcount_read(&ctx->refcnt); local_bh_disable(); - compl(&req->base, err); + aead_request_complete(req, err); local_bh_enable(); - if (err != -EINPROGRESS && refcnt && refcount_dec_and_test(&ctx->refcnt)) + if (err == -EINPROGRESS) { + subreq->base.complete = req->base.complete; + subreq->base.data = req->base.data; + req->base.complete = compl; + req->base.data = req; + } else if (refcnt && refcount_dec_and_test(&ctx->refcnt)) crypto_free_aead(tfm); } @@ -761,7 +774,8 @@ static void cryptd_aead_encrypt(struct crypto_async_request *areq, int err) struct aead_request *req; req = container_of(areq, struct aead_request, base); - cryptd_aead_crypt(req, child, err, crypto_aead_alg(child)->encrypt); + cryptd_aead_crypt(req, child, err, crypto_aead_alg(child)->encrypt, + cryptd_aead_encrypt); } static void cryptd_aead_decrypt(struct crypto_async_request *areq, int err) @@ -771,7 +785,8 @@ static void cryptd_aead_decrypt(struct crypto_async_request *areq, int err) struct aead_request *req; req = container_of(areq, struct aead_request, base); - cryptd_aead_crypt(req, child, err, crypto_aead_alg(child)->decrypt); + cryptd_aead_crypt(req, child, err, crypto_aead_alg(child)->decrypt, + cryptd_aead_decrypt); } static int cryptd_aead_enqueue(struct aead_request *req, @@ -780,9 +795,12 @@ static int cryptd_aead_enqueue(struct aead_request *req, struct cryptd_aead_request_ctx *rctx = aead_request_ctx(req); struct crypto_aead *tfm = crypto_aead_reqtfm(req); struct cryptd_queue *queue = cryptd_get_queue(crypto_aead_tfm(tfm)); + struct aead_request *subreq = &rctx->req; - rctx->complete = req->base.complete; + subreq->base.complete = req->base.complete; + subreq->base.data = req->base.data; req->base.complete = compl; + req->base.data = req; return cryptd_enqueue_request(queue, &req->base); } From 7d19abdcb7b3443021ce744ded8db1a0203f1594 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 31 Jan 2023 16:02:08 +0800 Subject: [PATCH 115/156] crypto: atmel - Use request_complete helpers Use the request_complete helpers instead of calling the completion function directly. Signed-off-by: Herbert Xu --- drivers/crypto/atmel-aes.c | 4 ++-- drivers/crypto/atmel-sha.c | 4 ++-- drivers/crypto/atmel-tdes.c | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/crypto/atmel-aes.c b/drivers/crypto/atmel-aes.c index e90e4a6cc37a..ed10f2ae4523 100644 --- a/drivers/crypto/atmel-aes.c +++ b/drivers/crypto/atmel-aes.c @@ -554,7 +554,7 @@ static inline int atmel_aes_complete(struct atmel_aes_dev *dd, int err) } if (dd->is_async) - dd->areq->complete(dd->areq, err); + crypto_request_complete(dd->areq, err); tasklet_schedule(&dd->queue_task); @@ -955,7 +955,7 @@ static int atmel_aes_handle_queue(struct atmel_aes_dev *dd, return ret; if (backlog) - backlog->complete(backlog, -EINPROGRESS); + crypto_request_complete(backlog, -EINPROGRESS); ctx = crypto_tfm_ctx(areq->tfm); diff --git a/drivers/crypto/atmel-sha.c b/drivers/crypto/atmel-sha.c index 00be792e605c..a77cf0da0816 100644 --- a/drivers/crypto/atmel-sha.c +++ b/drivers/crypto/atmel-sha.c @@ -292,7 +292,7 @@ static inline int atmel_sha_complete(struct atmel_sha_dev *dd, int err) clk_disable(dd->iclk); if ((dd->is_async || dd->force_complete) && req->base.complete) - req->base.complete(&req->base, err); + ahash_request_complete(req, err); /* handle new request */ tasklet_schedule(&dd->queue_task); @@ -1080,7 +1080,7 @@ static int atmel_sha_handle_queue(struct atmel_sha_dev *dd, return ret; if (backlog) - backlog->complete(backlog, -EINPROGRESS); + crypto_request_complete(backlog, -EINPROGRESS); ctx = crypto_tfm_ctx(async_req->tfm); diff --git a/drivers/crypto/atmel-tdes.c b/drivers/crypto/atmel-tdes.c index 8b7bc1076e0d..b2d48c1649b9 100644 --- a/drivers/crypto/atmel-tdes.c +++ b/drivers/crypto/atmel-tdes.c @@ -590,7 +590,7 @@ static void atmel_tdes_finish_req(struct atmel_tdes_dev *dd, int err) if (!err && (rctx->mode & TDES_FLAGS_OPMODE_MASK) != TDES_FLAGS_ECB) atmel_tdes_set_iv_as_last_ciphertext_block(dd); - req->base.complete(&req->base, err); + skcipher_request_complete(req, err); } static int atmel_tdes_handle_queue(struct atmel_tdes_dev *dd, @@ -619,7 +619,7 @@ static int atmel_tdes_handle_queue(struct atmel_tdes_dev *dd, return ret; if (backlog) - backlog->complete(backlog, -EINPROGRESS); + crypto_request_complete(backlog, -EINPROGRESS); req = skcipher_request_cast(async_req); From 51bdb2fe01dfad5447b9cb6f565c73b32b519b2e Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 31 Jan 2023 16:02:10 +0800 Subject: [PATCH 116/156] crypto: artpec6 - Use request_complete helpers Use the request_complete helpers instead of calling the completion function directly. Signed-off-by: Herbert Xu Acked-by: Jesper Nilsson Signed-off-by: Herbert Xu --- drivers/crypto/axis/artpec6_crypto.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/crypto/axis/artpec6_crypto.c b/drivers/crypto/axis/artpec6_crypto.c index f6f41e316dfe..8493a45e1bd4 100644 --- a/drivers/crypto/axis/artpec6_crypto.c +++ b/drivers/crypto/axis/artpec6_crypto.c @@ -2143,13 +2143,13 @@ static void artpec6_crypto_task(unsigned long data) list_for_each_entry_safe(req, n, &complete_in_progress, complete_in_progress) { - req->req->complete(req->req, -EINPROGRESS); + crypto_request_complete(req->req, -EINPROGRESS); } } static void artpec6_crypto_complete_crypto(struct crypto_async_request *req) { - req->complete(req, 0); + crypto_request_complete(req, 0); } static void @@ -2161,7 +2161,7 @@ artpec6_crypto_complete_cbc_decrypt(struct crypto_async_request *req) scatterwalk_map_and_copy(cipher_req->iv, cipher_req->src, cipher_req->cryptlen - AES_BLOCK_SIZE, AES_BLOCK_SIZE, 0); - req->complete(req, 0); + skcipher_request_complete(cipher_req, 0); } static void @@ -2173,7 +2173,7 @@ artpec6_crypto_complete_cbc_encrypt(struct crypto_async_request *req) scatterwalk_map_and_copy(cipher_req->iv, cipher_req->dst, cipher_req->cryptlen - AES_BLOCK_SIZE, AES_BLOCK_SIZE, 0); - req->complete(req, 0); + skcipher_request_complete(cipher_req, 0); } static void artpec6_crypto_complete_aead(struct crypto_async_request *req) @@ -2211,12 +2211,12 @@ static void artpec6_crypto_complete_aead(struct crypto_async_request *req) } } - req->complete(req, result); + aead_request_complete(areq, result); } static void artpec6_crypto_complete_hash(struct crypto_async_request *req) { - req->complete(req, 0); + crypto_request_complete(req, 0); } From 33ccbfd2e02a04d2fdbf8f48a80230f6235493bd Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 31 Jan 2023 16:02:12 +0800 Subject: [PATCH 117/156] crypto: bcm - Use request_complete helpers Use the request_complete helpers instead of calling the completion function directly. Signed-off-by: Herbert Xu --- drivers/crypto/bcm/cipher.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/crypto/bcm/cipher.c b/drivers/crypto/bcm/cipher.c index f8e035039aeb..70b911baab26 100644 --- a/drivers/crypto/bcm/cipher.c +++ b/drivers/crypto/bcm/cipher.c @@ -1614,7 +1614,7 @@ static void finish_req(struct iproc_reqctx_s *rctx, int err) spu_chunk_cleanup(rctx); if (areq) - areq->complete(areq, err); + crypto_request_complete(areq, err); } /** From b34a6416728812b5ca283598100516852646a133 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 31 Jan 2023 16:02:15 +0800 Subject: [PATCH 118/156] crypto: cpt - Use request_complete helpers Use the request_complete helpers instead of calling the completion function directly. Signed-off-by: Herbert Xu --- drivers/crypto/cavium/cpt/cptvf_algs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/crypto/cavium/cpt/cptvf_algs.c b/drivers/crypto/cavium/cpt/cptvf_algs.c index 0b38c2600b86..ee476c6c7f82 100644 --- a/drivers/crypto/cavium/cpt/cptvf_algs.c +++ b/drivers/crypto/cavium/cpt/cptvf_algs.c @@ -28,7 +28,7 @@ static void cvm_callback(u32 status, void *arg) { struct crypto_async_request *req = (struct crypto_async_request *)arg; - req->complete(req, !status); + crypto_request_complete(req, !status); } static inline void update_input_iv(struct cpt_request_info *req_info, From ea2fbe3b1a317498d2c66d6b97bc281db1a18b45 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 31 Jan 2023 16:02:17 +0800 Subject: [PATCH 119/156] crypto: nitrox - Use request_complete helpers Use the request_complete helpers instead of calling the completion function directly. Signed-off-by: Herbert Xu --- drivers/crypto/cavium/nitrox/nitrox_aead.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/crypto/cavium/nitrox/nitrox_aead.c b/drivers/crypto/cavium/nitrox/nitrox_aead.c index 0653484df23f..b0e53034164a 100644 --- a/drivers/crypto/cavium/nitrox/nitrox_aead.c +++ b/drivers/crypto/cavium/nitrox/nitrox_aead.c @@ -199,7 +199,7 @@ static void nitrox_aead_callback(void *arg, int err) err = -EINVAL; } - areq->base.complete(&areq->base, err); + aead_request_complete(areq, err); } static inline bool nitrox_aes_gcm_assoclen_supported(unsigned int assoclen) @@ -434,7 +434,7 @@ static void nitrox_rfc4106_callback(void *arg, int err) err = -EINVAL; } - areq->base.complete(&areq->base, err); + aead_request_complete(areq, err); } static int nitrox_rfc4106_enc(struct aead_request *areq) From 0c18d05463315ce144c2d72a2699fa54ef429f52 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 31 Jan 2023 16:02:19 +0800 Subject: [PATCH 120/156] crypto: ccp - Use request_complete helpers Use the request_complete helpers instead of calling the completion function directly. Signed-off-by: Herbert Xu Acked-by: Tom Lendacky Signed-off-by: Herbert Xu --- drivers/crypto/ccp/ccp-crypto-main.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/crypto/ccp/ccp-crypto-main.c b/drivers/crypto/ccp/ccp-crypto-main.c index 73442a382f68..ecd58b38c46e 100644 --- a/drivers/crypto/ccp/ccp-crypto-main.c +++ b/drivers/crypto/ccp/ccp-crypto-main.c @@ -146,7 +146,7 @@ static void ccp_crypto_complete(void *data, int err) /* Only propagate the -EINPROGRESS if necessary */ if (crypto_cmd->ret == -EBUSY) { crypto_cmd->ret = -EINPROGRESS; - req->complete(req, -EINPROGRESS); + crypto_request_complete(req, -EINPROGRESS); } return; @@ -159,18 +159,18 @@ static void ccp_crypto_complete(void *data, int err) held = ccp_crypto_cmd_complete(crypto_cmd, &backlog); if (backlog) { backlog->ret = -EINPROGRESS; - backlog->req->complete(backlog->req, -EINPROGRESS); + crypto_request_complete(backlog->req, -EINPROGRESS); } /* Transition the state from -EBUSY to -EINPROGRESS first */ if (crypto_cmd->ret == -EBUSY) - req->complete(req, -EINPROGRESS); + crypto_request_complete(req, -EINPROGRESS); /* Completion callbacks */ ret = err; if (ctx->complete) ret = ctx->complete(req, ret); - req->complete(req, ret); + crypto_request_complete(req, ret); /* Submit the next cmd */ while (held) { @@ -186,12 +186,12 @@ static void ccp_crypto_complete(void *data, int err) ctx = crypto_tfm_ctx_dma(held->req->tfm); if (ctx->complete) ret = ctx->complete(held->req, ret); - held->req->complete(held->req, ret); + crypto_request_complete(held->req, ret); next = ccp_crypto_cmd_complete(held, &backlog); if (backlog) { backlog->ret = -EINPROGRESS; - backlog->req->complete(backlog->req, -EINPROGRESS); + crypto_request_complete(backlog->req, -EINPROGRESS); } kfree(held); From 13c20754c664796a697859d5d329810236cdc131 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 31 Jan 2023 16:02:21 +0800 Subject: [PATCH 121/156] crypto: chelsio - Use request_complete helpers Use the request_complete helpers instead of calling the completion function directly. Signed-off-by: Herbert Xu --- drivers/crypto/chelsio/chcr_algo.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/crypto/chelsio/chcr_algo.c b/drivers/crypto/chelsio/chcr_algo.c index 68d65773ef2b..0eade4fa6695 100644 --- a/drivers/crypto/chelsio/chcr_algo.c +++ b/drivers/crypto/chelsio/chcr_algo.c @@ -220,7 +220,7 @@ static inline int chcr_handle_aead_resp(struct aead_request *req, reqctx->verify = VERIFY_HW; } chcr_dec_wrcount(dev); - req->base.complete(&req->base, err); + aead_request_complete(req, err); return err; } @@ -1235,7 +1235,7 @@ complete: complete(&ctx->cbc_aes_aio_done); } chcr_dec_wrcount(dev); - req->base.complete(&req->base, err); + skcipher_request_complete(req, err); return err; } @@ -2132,7 +2132,7 @@ unmap: out: chcr_dec_wrcount(dev); - req->base.complete(&req->base, err); + ahash_request_complete(req, err); } /* From 17fcc82eebd92a510b2124ae5544491af950a315 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 31 Jan 2023 16:02:23 +0800 Subject: [PATCH 122/156] crypto: hifn_795x - Use request_complete helpers Use the request_complete helpers instead of calling the completion function directly. Signed-off-by: Herbert Xu --- drivers/crypto/hifn_795x.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/crypto/hifn_795x.c b/drivers/crypto/hifn_795x.c index 7e7a8f01ea6b..5a7f6611803c 100644 --- a/drivers/crypto/hifn_795x.c +++ b/drivers/crypto/hifn_795x.c @@ -1705,7 +1705,7 @@ static void hifn_process_ready(struct skcipher_request *req, int error) hifn_cipher_walk_exit(&rctx->walk); } - req->base.complete(&req->base, error); + skcipher_request_complete(req, error); } static void hifn_clear_rings(struct hifn_device *dev, int error) @@ -2054,7 +2054,7 @@ static int hifn_process_queue(struct hifn_device *dev) break; if (backlog) - backlog->complete(backlog, -EINPROGRESS); + crypto_request_complete(backlog, -EINPROGRESS); req = skcipher_request_cast(async_req); From e2b537b7706284afba9b780e21f879e1c0a0ec20 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 31 Jan 2023 16:02:25 +0800 Subject: [PATCH 123/156] crypto: hisilicon - Use request_complete helpers Use the request_complete helpers instead of calling the completion function directly. Signed-off-by: Herbert Xu --- drivers/crypto/hisilicon/sec/sec_algs.c | 6 +++--- drivers/crypto/hisilicon/sec2/sec_crypto.c | 10 ++++------ 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/drivers/crypto/hisilicon/sec/sec_algs.c b/drivers/crypto/hisilicon/sec/sec_algs.c index 490e1542305e..1189effcdad0 100644 --- a/drivers/crypto/hisilicon/sec/sec_algs.c +++ b/drivers/crypto/hisilicon/sec/sec_algs.c @@ -504,8 +504,8 @@ static void sec_skcipher_alg_callback(struct sec_bd_info *sec_resp, kfifo_avail(&ctx->queue->softqueue) > backlog_req->num_elements)) { sec_send_request(backlog_req, ctx->queue); - backlog_req->req_base->complete(backlog_req->req_base, - -EINPROGRESS); + crypto_request_complete(backlog_req->req_base, + -EINPROGRESS); list_del(&backlog_req->backlog_head); } } @@ -534,7 +534,7 @@ static void sec_skcipher_alg_callback(struct sec_bd_info *sec_resp, if (skreq->src != skreq->dst) dma_unmap_sg(dev, skreq->dst, sec_req->len_out, DMA_BIDIRECTIONAL); - skreq->base.complete(&skreq->base, sec_req->err); + skcipher_request_complete(skreq, sec_req->err); } } diff --git a/drivers/crypto/hisilicon/sec2/sec_crypto.c b/drivers/crypto/hisilicon/sec2/sec_crypto.c index f5bfc9755a4a..074e50ef512c 100644 --- a/drivers/crypto/hisilicon/sec2/sec_crypto.c +++ b/drivers/crypto/hisilicon/sec2/sec_crypto.c @@ -1459,12 +1459,11 @@ static void sec_skcipher_callback(struct sec_ctx *ctx, struct sec_req *req, break; backlog_sk_req = backlog_req->c_req.sk_req; - backlog_sk_req->base.complete(&backlog_sk_req->base, - -EINPROGRESS); + skcipher_request_complete(backlog_sk_req, -EINPROGRESS); atomic64_inc(&ctx->sec->debug.dfx.recv_busy_cnt); } - sk_req->base.complete(&sk_req->base, err); + skcipher_request_complete(sk_req, err); } static void set_aead_auth_iv(struct sec_ctx *ctx, struct sec_req *req) @@ -1736,12 +1735,11 @@ static void sec_aead_callback(struct sec_ctx *c, struct sec_req *req, int err) break; backlog_aead_req = backlog_req->aead_req.aead_req; - backlog_aead_req->base.complete(&backlog_aead_req->base, - -EINPROGRESS); + aead_request_complete(backlog_aead_req, -EINPROGRESS); atomic64_inc(&c->sec->debug.dfx.recv_busy_cnt); } - a_req->base.complete(&a_req->base, err); + aead_request_complete(a_req, err); } static void sec_request_uninit(struct sec_ctx *ctx, struct sec_req *req) From a712bff02333821639ff4b19ceed5a988ea8c50a Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 31 Jan 2023 16:02:27 +0800 Subject: [PATCH 124/156] crypto: img-hash - Use request_complete helpers Use the request_complete helpers instead of calling the completion function directly. Signed-off-by: Herbert Xu --- drivers/crypto/img-hash.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/crypto/img-hash.c b/drivers/crypto/img-hash.c index e5ed098a154d..fe93d19e3044 100644 --- a/drivers/crypto/img-hash.c +++ b/drivers/crypto/img-hash.c @@ -308,7 +308,7 @@ static void img_hash_finish_req(struct ahash_request *req, int err) DRIVER_FLAGS_CPU | DRIVER_FLAGS_BUSY | DRIVER_FLAGS_FINAL); if (req->base.complete) - req->base.complete(&req->base, err); + ahash_request_complete(req, err); } static int img_hash_write_via_dma(struct img_hash_dev *hdev) @@ -526,7 +526,7 @@ static int img_hash_handle_queue(struct img_hash_dev *hdev, return res; if (backlog) - backlog->complete(backlog, -EINPROGRESS); + crypto_request_complete(backlog, -EINPROGRESS); req = ahash_request_cast(async_req); hdev->req = req; From 47c32286fbc6399812b8b10f4f45473f82f16730 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 31 Jan 2023 16:02:29 +0800 Subject: [PATCH 125/156] crypto: safexcel - Use request_complete helpers Use the request_complete helpers instead of calling the completion function directly. Signed-off-by: Herbert Xu --- drivers/crypto/inside-secure/safexcel.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/crypto/inside-secure/safexcel.c b/drivers/crypto/inside-secure/safexcel.c index baff0123f919..6858753af6b3 100644 --- a/drivers/crypto/inside-secure/safexcel.c +++ b/drivers/crypto/inside-secure/safexcel.c @@ -850,7 +850,7 @@ handle_req: goto request_failed; if (backlog) - backlog->complete(backlog, -EINPROGRESS); + crypto_request_complete(backlog, -EINPROGRESS); /* In case the send() helper did not issue any command to push * to the engine because the input data was cached, continue to @@ -1039,7 +1039,7 @@ handle_results: if (should_complete) { local_bh_disable(); - req->complete(req, ret); + crypto_request_complete(req, ret); local_bh_enable(); } From 0d07ae6ae9cb902214c925778d748af805a0c7f0 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 31 Jan 2023 16:02:31 +0800 Subject: [PATCH 126/156] crypto: ixp4xx - Use request_complete helpers Use the request_complete helpers instead of calling the completion function directly. Signed-off-by: Herbert Xu --- drivers/crypto/ixp4xx_crypto.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/crypto/ixp4xx_crypto.c b/drivers/crypto/ixp4xx_crypto.c index 984b3cc0237c..b63e2359a133 100644 --- a/drivers/crypto/ixp4xx_crypto.c +++ b/drivers/crypto/ixp4xx_crypto.c @@ -382,7 +382,7 @@ static void one_packet(dma_addr_t phys) if (req_ctx->hmac_virt) finish_scattered_hmac(crypt); - req->base.complete(&req->base, failed); + aead_request_complete(req, failed); break; } case CTL_FLAG_PERFORM_ABLK: { @@ -407,7 +407,7 @@ static void one_packet(dma_addr_t phys) free_buf_chain(dev, req_ctx->dst, crypt->dst_buf); free_buf_chain(dev, req_ctx->src, crypt->src_buf); - req->base.complete(&req->base, failed); + skcipher_request_complete(req, failed); break; } case CTL_FLAG_GEN_ICV: From 25e3159c79fc1c35ed6bea076d151786e198cf64 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 31 Jan 2023 16:02:33 +0800 Subject: [PATCH 127/156] crypto: marvell/cesa - Use request_complete helpers Use the request_complete helpers instead of calling the completion function directly. Signed-off-by: Herbert Xu --- drivers/crypto/marvell/cesa/cesa.c | 4 ++-- drivers/crypto/marvell/cesa/tdma.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/crypto/marvell/cesa/cesa.c b/drivers/crypto/marvell/cesa/cesa.c index 5cd332880653..b61e35b932e5 100644 --- a/drivers/crypto/marvell/cesa/cesa.c +++ b/drivers/crypto/marvell/cesa/cesa.c @@ -66,7 +66,7 @@ static void mv_cesa_rearm_engine(struct mv_cesa_engine *engine) return; if (backlog) - backlog->complete(backlog, -EINPROGRESS); + crypto_request_complete(backlog, -EINPROGRESS); ctx = crypto_tfm_ctx(req->tfm); ctx->ops->step(req); @@ -106,7 +106,7 @@ mv_cesa_complete_req(struct mv_cesa_ctx *ctx, struct crypto_async_request *req, { ctx->ops->cleanup(req); local_bh_disable(); - req->complete(req, res); + crypto_request_complete(req, res); local_bh_enable(); } diff --git a/drivers/crypto/marvell/cesa/tdma.c b/drivers/crypto/marvell/cesa/tdma.c index f0b5537038c2..388a06e180d6 100644 --- a/drivers/crypto/marvell/cesa/tdma.c +++ b/drivers/crypto/marvell/cesa/tdma.c @@ -168,7 +168,7 @@ int mv_cesa_tdma_process(struct mv_cesa_engine *engine, u32 status) req); if (backlog) - backlog->complete(backlog, -EINPROGRESS); + crypto_request_complete(backlog, -EINPROGRESS); } if (res || tdma->cur_dma == tdma_cur) From d80bcdf297e8b03caaaa868941cab47ec2768750 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 31 Jan 2023 16:02:35 +0800 Subject: [PATCH 128/156] crypto: octeontx - Use request_complete helpers Use the request_complete helpers instead of calling the completion function directly. Signed-off-by: Herbert Xu --- drivers/crypto/marvell/octeontx/otx_cptvf_algs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/crypto/marvell/octeontx/otx_cptvf_algs.c b/drivers/crypto/marvell/octeontx/otx_cptvf_algs.c index 83493dd0416f..1c2c870e887a 100644 --- a/drivers/crypto/marvell/octeontx/otx_cptvf_algs.c +++ b/drivers/crypto/marvell/octeontx/otx_cptvf_algs.c @@ -138,7 +138,7 @@ static void otx_cpt_aead_callback(int status, void *arg1, void *arg2) complete: if (areq) - areq->complete(areq, status); + crypto_request_complete(areq, status); } static void output_iv_copyback(struct crypto_async_request *areq) @@ -188,7 +188,7 @@ static void otx_cpt_skcipher_callback(int status, void *arg1, void *arg2) pdev = cpt_info->pdev; do_request_cleanup(pdev, cpt_info); } - areq->complete(areq, status); + crypto_request_complete(areq, status); } } From 25085ba5a77b80dbf3b2b06c64452cf748c3cf01 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 31 Jan 2023 16:02:38 +0800 Subject: [PATCH 129/156] crypto: octeontx2 - Use request_complete helpers Use the request_complete helpers instead of calling the completion function directly. Signed-off-by: Herbert Xu --- drivers/crypto/marvell/octeontx2/otx2_cptvf_algs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/crypto/marvell/octeontx2/otx2_cptvf_algs.c b/drivers/crypto/marvell/octeontx2/otx2_cptvf_algs.c index 443202caa140..e27ddd3c4e55 100644 --- a/drivers/crypto/marvell/octeontx2/otx2_cptvf_algs.c +++ b/drivers/crypto/marvell/octeontx2/otx2_cptvf_algs.c @@ -120,7 +120,7 @@ static void otx2_cpt_aead_callback(int status, void *arg1, void *arg2) otx2_cpt_info_destroy(pdev, inst_info); } if (areq) - areq->complete(areq, status); + crypto_request_complete(areq, status); } static void output_iv_copyback(struct crypto_async_request *areq) @@ -170,7 +170,7 @@ static void otx2_cpt_skcipher_callback(int status, void *arg1, void *arg2) pdev = inst_info->pdev; otx2_cpt_info_destroy(pdev, inst_info); } - areq->complete(areq, status); + crypto_request_complete(areq, status); } } From 25c9d2c358aec584bbbc7dfd09a55f123c4b8258 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 31 Jan 2023 16:02:40 +0800 Subject: [PATCH 130/156] crypto: mxs-dcp - Use request_complete helpers Use the request_complete helpers instead of calling the completion function directly. Signed-off-by: Herbert Xu --- drivers/crypto/mxs-dcp.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/crypto/mxs-dcp.c b/drivers/crypto/mxs-dcp.c index d6f9e2fe863d..1c11946a4f0b 100644 --- a/drivers/crypto/mxs-dcp.c +++ b/drivers/crypto/mxs-dcp.c @@ -413,11 +413,11 @@ static int dcp_chan_thread_aes(void *data) set_current_state(TASK_RUNNING); if (backlog) - backlog->complete(backlog, -EINPROGRESS); + crypto_request_complete(backlog, -EINPROGRESS); if (arq) { ret = mxs_dcp_aes_block_crypt(arq); - arq->complete(arq, ret); + crypto_request_complete(arq, ret); } } @@ -709,11 +709,11 @@ static int dcp_chan_thread_sha(void *data) set_current_state(TASK_RUNNING); if (backlog) - backlog->complete(backlog, -EINPROGRESS); + crypto_request_complete(backlog, -EINPROGRESS); if (arq) { ret = dcp_sha_req_to_buf(arq); - arq->complete(arq, ret); + crypto_request_complete(arq, ret); } } From 6c621864554c9b767df6839999d5e0465c8be097 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 31 Jan 2023 16:02:42 +0800 Subject: [PATCH 131/156] crypto: qat - Use request_complete helpers Use the request_complete helpers instead of calling the completion function directly. Signed-off-by: Herbert Xu Reviewed-by: Giovanni Cabiddu Signed-off-by: Herbert Xu --- drivers/crypto/qat/qat_common/qat_algs.c | 4 ++-- drivers/crypto/qat/qat_common/qat_algs_send.c | 3 ++- drivers/crypto/qat/qat_common/qat_comp_algs.c | 4 ++-- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/crypto/qat/qat_common/qat_algs.c b/drivers/crypto/qat/qat_common/qat_algs.c index b61ada559158..538dcbfbcd26 100644 --- a/drivers/crypto/qat/qat_common/qat_algs.c +++ b/drivers/crypto/qat/qat_common/qat_algs.c @@ -676,7 +676,7 @@ static void qat_aead_alg_callback(struct icp_qat_fw_la_resp *qat_resp, qat_bl_free_bufl(inst->accel_dev, &qat_req->buf); if (unlikely(qat_res != ICP_QAT_FW_COMN_STATUS_FLAG_OK)) res = -EBADMSG; - areq->base.complete(&areq->base, res); + aead_request_complete(areq, res); } static void qat_alg_update_iv_ctr_mode(struct qat_crypto_request *qat_req) @@ -752,7 +752,7 @@ static void qat_skcipher_alg_callback(struct icp_qat_fw_la_resp *qat_resp, memcpy(sreq->iv, qat_req->iv, AES_BLOCK_SIZE); - sreq->base.complete(&sreq->base, res); + skcipher_request_complete(sreq, res); } void qat_alg_callback(void *resp) diff --git a/drivers/crypto/qat/qat_common/qat_algs_send.c b/drivers/crypto/qat/qat_common/qat_algs_send.c index ff5b4347f783..bb80455b3e81 100644 --- a/drivers/crypto/qat/qat_common/qat_algs_send.c +++ b/drivers/crypto/qat/qat_common/qat_algs_send.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: (BSD-3-Clause OR GPL-2.0-only) /* Copyright(c) 2022 Intel Corporation */ +#include #include "adf_transport.h" #include "qat_algs_send.h" #include "qat_crypto.h" @@ -34,7 +35,7 @@ void qat_alg_send_backlog(struct qat_instance_backlog *backlog) break; } list_del(&req->list); - req->base->complete(req->base, -EINPROGRESS); + crypto_request_complete(req->base, -EINPROGRESS); } spin_unlock_bh(&backlog->lock); } diff --git a/drivers/crypto/qat/qat_common/qat_comp_algs.c b/drivers/crypto/qat/qat_common/qat_comp_algs.c index 8e54158c11f9..b533984906ec 100644 --- a/drivers/crypto/qat/qat_common/qat_comp_algs.c +++ b/drivers/crypto/qat/qat_common/qat_comp_algs.c @@ -106,7 +106,7 @@ static void qat_comp_resubmit(struct work_struct *work) err: qat_bl_free_bufl(accel_dev, qat_bufs); - areq->base.complete(&areq->base, ret); + acomp_request_complete(areq, ret); } static int parse_zlib_header(u16 zlib_h) @@ -247,7 +247,7 @@ static void qat_comp_generic_callback(struct qat_compression_req *qat_req, end: qat_bl_free_bufl(accel_dev, &qat_req->buf); - areq->base.complete(&areq->base, res); + acomp_request_complete(areq, res); } void qat_comp_alg_callback(void *resp) From 0cbe89d5d17393f4ef3d5c25af8bc730fd507cf5 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 31 Jan 2023 16:02:44 +0800 Subject: [PATCH 132/156] crypto: qce - Use request_complete helpers Use the request_complete helpers instead of calling the completion function directly. Signed-off-by: Herbert Xu --- drivers/crypto/qce/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/crypto/qce/core.c b/drivers/crypto/qce/core.c index d3780be44a76..74deca4f96e0 100644 --- a/drivers/crypto/qce/core.c +++ b/drivers/crypto/qce/core.c @@ -107,7 +107,7 @@ static int qce_handle_queue(struct qce_device *qce, if (backlog) { spin_lock_bh(&qce->lock); - backlog->complete(backlog, -EINPROGRESS); + crypto_request_complete(backlog, -EINPROGRESS); spin_unlock_bh(&qce->lock); } @@ -132,7 +132,7 @@ static void qce_tasklet_req_done(unsigned long data) spin_unlock_irqrestore(&qce->lock, flags); if (req) - req->complete(req, qce->result); + crypto_request_complete(req, qce->result); qce_handle_queue(qce, NULL); } From e94c1c9b43bcb6a6f094b36fcee0bdf6d0712cd5 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 31 Jan 2023 16:02:46 +0800 Subject: [PATCH 133/156] crypto: s5p-sss - Use request_complete helpers Use the request_complete helpers instead of calling the completion function directly. Signed-off-by: Herbert Xu --- drivers/crypto/s5p-sss.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/crypto/s5p-sss.c b/drivers/crypto/s5p-sss.c index b79e49aa724f..1c4d5fb05d69 100644 --- a/drivers/crypto/s5p-sss.c +++ b/drivers/crypto/s5p-sss.c @@ -499,7 +499,7 @@ static void s5p_sg_done(struct s5p_aes_dev *dev) /* Calls the completion. Cannot be called with dev->lock hold. */ static void s5p_aes_complete(struct skcipher_request *req, int err) { - req->base.complete(&req->base, err); + skcipher_request_complete(req, err); } static void s5p_unset_outdata(struct s5p_aes_dev *dev) @@ -1355,7 +1355,7 @@ static void s5p_hash_finish_req(struct ahash_request *req, int err) spin_unlock_irqrestore(&dd->hash_lock, flags); if (req->base.complete) - req->base.complete(&req->base, err); + ahash_request_complete(req, err); } /** @@ -1397,7 +1397,7 @@ retry: return ret; if (backlog) - backlog->complete(backlog, -EINPROGRESS); + crypto_request_complete(backlog, -EINPROGRESS); req = ahash_request_cast(async_req); dd->hash_req = req; @@ -1991,7 +1991,7 @@ static void s5p_tasklet_cb(unsigned long data) spin_unlock_irqrestore(&dev->lock, flags); if (backlog) - backlog->complete(backlog, -EINPROGRESS); + crypto_request_complete(backlog, -EINPROGRESS); dev->req = skcipher_request_cast(async_req); dev->ctx = crypto_tfm_ctx(dev->req->base.tfm); From 555c5661317e9c3090b9d181106d8bc31dd8e29a Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 31 Jan 2023 16:02:48 +0800 Subject: [PATCH 134/156] crypto: sahara - Use request_complete helpers Use the request_complete helpers instead of calling the completion function directly. Signed-off-by: Herbert Xu --- drivers/crypto/sahara.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/crypto/sahara.c b/drivers/crypto/sahara.c index 7ab20fb95166..dd4c703cd855 100644 --- a/drivers/crypto/sahara.c +++ b/drivers/crypto/sahara.c @@ -1049,7 +1049,7 @@ static int sahara_queue_manage(void *data) spin_unlock_bh(&dev->queue_spinlock); if (backlog) - backlog->complete(backlog, -EINPROGRESS); + crypto_request_complete(backlog, -EINPROGRESS); if (async_req) { if (crypto_tfm_alg_type(async_req->tfm) == @@ -1065,7 +1065,7 @@ static int sahara_queue_manage(void *data) ret = sahara_aes_process(req); } - async_req->complete(async_req, ret); + crypto_request_complete(async_req, ret); continue; } From 234650bd22b4a875a359c83066fd723704aaa7b2 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 31 Jan 2023 16:02:50 +0800 Subject: [PATCH 135/156] crypto: talitos - Use request_complete helpers Use the request_complete helpers instead of calling the completion function directly. Signed-off-by: Herbert Xu --- drivers/crypto/talitos.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/crypto/talitos.c b/drivers/crypto/talitos.c index d62ec68e3183..bb27f011cf31 100644 --- a/drivers/crypto/talitos.c +++ b/drivers/crypto/talitos.c @@ -1560,7 +1560,7 @@ static void skcipher_done(struct device *dev, kfree(edesc); - areq->base.complete(&areq->base, err); + skcipher_request_complete(areq, err); } static int common_nonsnoop(struct talitos_edesc *edesc, @@ -1759,7 +1759,7 @@ static void ahash_done(struct device *dev, kfree(edesc); - areq->base.complete(&areq->base, err); + ahash_request_complete(areq, err); } /* From 255e48eb17684157336bd6dd98d22c1b2d9e3f43 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Wed, 8 Feb 2023 13:58:44 +0800 Subject: [PATCH 136/156] crypto: api - Use data directly in completion function This patch does the final flag day conversion of all completion functions which are now all contained in the Crypto API. Signed-off-by: Herbert Xu --- crypto/adiantum.c | 5 ++-- crypto/af_alg.c | 6 ++--- crypto/ahash.c | 12 ++++----- crypto/api.c | 4 +-- crypto/authenc.c | 14 +++++----- crypto/authencesn.c | 15 +++++------ crypto/ccm.c | 9 +++---- crypto/chacha20poly1305.c | 40 ++++++++++++++--------------- crypto/cryptd.c | 52 +++++++++++++++++++------------------- crypto/cts.c | 12 ++++----- crypto/dh.c | 5 ++-- crypto/essiv.c | 8 +++--- crypto/gcm.c | 36 +++++++++++++------------- crypto/hctr2.c | 5 ++-- crypto/lrw.c | 4 +-- crypto/pcrypt.c | 4 +-- crypto/rsa-pkcs1pad.c | 15 +++++------ crypto/seqiv.c | 5 ++-- crypto/xts.c | 12 ++++----- drivers/crypto/atmel-sha.c | 5 ++-- include/crypto/algapi.h | 3 +-- include/crypto/if_alg.h | 4 +-- include/linux/crypto.h | 10 ++++---- 23 files changed, 133 insertions(+), 152 deletions(-) diff --git a/crypto/adiantum.c b/crypto/adiantum.c index 84450130cb6b..c33ba22a6638 100644 --- a/crypto/adiantum.c +++ b/crypto/adiantum.c @@ -308,10 +308,9 @@ static int adiantum_finish(struct skcipher_request *req) return 0; } -static void adiantum_streamcipher_done(struct crypto_async_request *areq, - int err) +static void adiantum_streamcipher_done(void *data, int err) { - struct skcipher_request *req = areq->data; + struct skcipher_request *req = data; if (!err) err = adiantum_finish(req); diff --git a/crypto/af_alg.c b/crypto/af_alg.c index 0a4fa2a429e2..5f7252a5b7b4 100644 --- a/crypto/af_alg.c +++ b/crypto/af_alg.c @@ -1186,7 +1186,7 @@ EXPORT_SYMBOL_GPL(af_alg_free_resources); /** * af_alg_async_cb - AIO callback handler - * @_req: async request info + * @data: async request completion data * @err: if non-zero, error result to be returned via ki_complete(); * otherwise return the AIO output length via ki_complete(). * @@ -1196,9 +1196,9 @@ EXPORT_SYMBOL_GPL(af_alg_free_resources); * The number of bytes to be generated with the AIO operation must be set * in areq->outlen before the AIO callback handler is invoked. */ -void af_alg_async_cb(struct crypto_async_request *_req, int err) +void af_alg_async_cb(void *data, int err) { - struct af_alg_async_req *areq = _req->data; + struct af_alg_async_req *areq = data; struct sock *sk = areq->sk; struct kiocb *iocb = areq->iocb; unsigned int resultlen; diff --git a/crypto/ahash.c b/crypto/ahash.c index 19241b18a4d1..ff8c79d975c1 100644 --- a/crypto/ahash.c +++ b/crypto/ahash.c @@ -256,9 +256,9 @@ static void ahash_restore_req(struct ahash_request *req, int err) kfree_sensitive(subreq); } -static void ahash_op_unaligned_done(struct crypto_async_request *req, int err) +static void ahash_op_unaligned_done(void *data, int err) { - struct ahash_request *areq = req->data; + struct ahash_request *areq = data; if (err == -EINPROGRESS) goto out; @@ -348,9 +348,9 @@ int crypto_ahash_digest(struct ahash_request *req) } EXPORT_SYMBOL_GPL(crypto_ahash_digest); -static void ahash_def_finup_done2(struct crypto_async_request *req, int err) +static void ahash_def_finup_done2(void *data, int err) { - struct ahash_request *areq = req->data; + struct ahash_request *areq = data; if (err == -EINPROGRESS) return; @@ -378,9 +378,9 @@ out: return err; } -static void ahash_def_finup_done1(struct crypto_async_request *req, int err) +static void ahash_def_finup_done1(void *data, int err) { - struct ahash_request *areq = req->data; + struct ahash_request *areq = data; struct ahash_request *subreq; if (err == -EINPROGRESS) diff --git a/crypto/api.c b/crypto/api.c index b022702f6436..e67cc63368ed 100644 --- a/crypto/api.c +++ b/crypto/api.c @@ -643,9 +643,9 @@ int crypto_has_alg(const char *name, u32 type, u32 mask) } EXPORT_SYMBOL_GPL(crypto_has_alg); -void crypto_req_done(struct crypto_async_request *req, int err) +void crypto_req_done(void *data, int err) { - struct crypto_wait *wait = req->data; + struct crypto_wait *wait = data; if (err == -EINPROGRESS) return; diff --git a/crypto/authenc.c b/crypto/authenc.c index 17f674a7cdff..3326c7343e86 100644 --- a/crypto/authenc.c +++ b/crypto/authenc.c @@ -109,9 +109,9 @@ out: return err; } -static void authenc_geniv_ahash_done(struct crypto_async_request *areq, int err) +static void authenc_geniv_ahash_done(void *data, int err) { - struct aead_request *req = areq->data; + struct aead_request *req = data; struct crypto_aead *authenc = crypto_aead_reqtfm(req); struct aead_instance *inst = aead_alg_instance(authenc); struct authenc_instance_ctx *ictx = aead_instance_ctx(inst); @@ -160,10 +160,9 @@ static int crypto_authenc_genicv(struct aead_request *req, unsigned int flags) return 0; } -static void crypto_authenc_encrypt_done(struct crypto_async_request *req, - int err) +static void crypto_authenc_encrypt_done(void *data, int err) { - struct aead_request *areq = req->data; + struct aead_request *areq = data; if (err) goto out; @@ -261,10 +260,9 @@ static int crypto_authenc_decrypt_tail(struct aead_request *req, return crypto_skcipher_decrypt(skreq); } -static void authenc_verify_ahash_done(struct crypto_async_request *areq, - int err) +static void authenc_verify_ahash_done(void *data, int err) { - struct aead_request *req = areq->data; + struct aead_request *req = data; if (err) goto out; diff --git a/crypto/authencesn.c b/crypto/authencesn.c index b60e61b1904c..91424e791d5c 100644 --- a/crypto/authencesn.c +++ b/crypto/authencesn.c @@ -107,10 +107,9 @@ static int crypto_authenc_esn_genicv_tail(struct aead_request *req, return 0; } -static void authenc_esn_geniv_ahash_done(struct crypto_async_request *areq, - int err) +static void authenc_esn_geniv_ahash_done(void *data, int err) { - struct aead_request *req = areq->data; + struct aead_request *req = data; err = err ?: crypto_authenc_esn_genicv_tail(req, 0); aead_request_complete(req, err); @@ -153,10 +152,9 @@ static int crypto_authenc_esn_genicv(struct aead_request *req, } -static void crypto_authenc_esn_encrypt_done(struct crypto_async_request *req, - int err) +static void crypto_authenc_esn_encrypt_done(void *data, int err) { - struct aead_request *areq = req->data; + struct aead_request *areq = data; if (!err) err = crypto_authenc_esn_genicv(areq, 0); @@ -258,10 +256,9 @@ decrypt: return crypto_skcipher_decrypt(skreq); } -static void authenc_esn_verify_ahash_done(struct crypto_async_request *areq, - int err) +static void authenc_esn_verify_ahash_done(void *data, int err) { - struct aead_request *req = areq->data; + struct aead_request *req = data; err = err ?: crypto_authenc_esn_decrypt_tail(req, 0); authenc_esn_request_complete(req, err); diff --git a/crypto/ccm.c b/crypto/ccm.c index 30dbae72728f..a9453129c51c 100644 --- a/crypto/ccm.c +++ b/crypto/ccm.c @@ -224,9 +224,9 @@ out: return err; } -static void crypto_ccm_encrypt_done(struct crypto_async_request *areq, int err) +static void crypto_ccm_encrypt_done(void *data, int err) { - struct aead_request *req = areq->data; + struct aead_request *req = data; struct crypto_aead *aead = crypto_aead_reqtfm(req); struct crypto_ccm_req_priv_ctx *pctx = crypto_ccm_reqctx(req); u8 *odata = pctx->odata; @@ -320,10 +320,9 @@ static int crypto_ccm_encrypt(struct aead_request *req) return err; } -static void crypto_ccm_decrypt_done(struct crypto_async_request *areq, - int err) +static void crypto_ccm_decrypt_done(void *data, int err) { - struct aead_request *req = areq->data; + struct aead_request *req = data; struct crypto_ccm_req_priv_ctx *pctx = crypto_ccm_reqctx(req); struct crypto_aead *aead = crypto_aead_reqtfm(req); unsigned int authsize = crypto_aead_authsize(aead); diff --git a/crypto/chacha20poly1305.c b/crypto/chacha20poly1305.c index 97bbb135e9a6..3a905c5d8f53 100644 --- a/crypto/chacha20poly1305.c +++ b/crypto/chacha20poly1305.c @@ -115,9 +115,9 @@ static int poly_copy_tag(struct aead_request *req) return 0; } -static void chacha_decrypt_done(struct crypto_async_request *areq, int err) +static void chacha_decrypt_done(void *data, int err) { - async_done_continue(areq->data, err, poly_verify_tag); + async_done_continue(data, err, poly_verify_tag); } static int chacha_decrypt(struct aead_request *req) @@ -161,9 +161,9 @@ static int poly_tail_continue(struct aead_request *req) return chacha_decrypt(req); } -static void poly_tail_done(struct crypto_async_request *areq, int err) +static void poly_tail_done(void *data, int err) { - async_done_continue(areq->data, err, poly_tail_continue); + async_done_continue(data, err, poly_tail_continue); } static int poly_tail(struct aead_request *req) @@ -191,9 +191,9 @@ static int poly_tail(struct aead_request *req) return poly_tail_continue(req); } -static void poly_cipherpad_done(struct crypto_async_request *areq, int err) +static void poly_cipherpad_done(void *data, int err) { - async_done_continue(areq->data, err, poly_tail); + async_done_continue(data, err, poly_tail); } static int poly_cipherpad(struct aead_request *req) @@ -220,9 +220,9 @@ static int poly_cipherpad(struct aead_request *req) return poly_tail(req); } -static void poly_cipher_done(struct crypto_async_request *areq, int err) +static void poly_cipher_done(void *data, int err) { - async_done_continue(areq->data, err, poly_cipherpad); + async_done_continue(data, err, poly_cipherpad); } static int poly_cipher(struct aead_request *req) @@ -250,9 +250,9 @@ static int poly_cipher(struct aead_request *req) return poly_cipherpad(req); } -static void poly_adpad_done(struct crypto_async_request *areq, int err) +static void poly_adpad_done(void *data, int err) { - async_done_continue(areq->data, err, poly_cipher); + async_done_continue(data, err, poly_cipher); } static int poly_adpad(struct aead_request *req) @@ -279,9 +279,9 @@ static int poly_adpad(struct aead_request *req) return poly_cipher(req); } -static void poly_ad_done(struct crypto_async_request *areq, int err) +static void poly_ad_done(void *data, int err) { - async_done_continue(areq->data, err, poly_adpad); + async_done_continue(data, err, poly_adpad); } static int poly_ad(struct aead_request *req) @@ -303,9 +303,9 @@ static int poly_ad(struct aead_request *req) return poly_adpad(req); } -static void poly_setkey_done(struct crypto_async_request *areq, int err) +static void poly_setkey_done(void *data, int err) { - async_done_continue(areq->data, err, poly_ad); + async_done_continue(data, err, poly_ad); } static int poly_setkey(struct aead_request *req) @@ -329,9 +329,9 @@ static int poly_setkey(struct aead_request *req) return poly_ad(req); } -static void poly_init_done(struct crypto_async_request *areq, int err) +static void poly_init_done(void *data, int err) { - async_done_continue(areq->data, err, poly_setkey); + async_done_continue(data, err, poly_setkey); } static int poly_init(struct aead_request *req) @@ -352,9 +352,9 @@ static int poly_init(struct aead_request *req) return poly_setkey(req); } -static void poly_genkey_done(struct crypto_async_request *areq, int err) +static void poly_genkey_done(void *data, int err) { - async_done_continue(areq->data, err, poly_init); + async_done_continue(data, err, poly_init); } static int poly_genkey(struct aead_request *req) @@ -391,9 +391,9 @@ static int poly_genkey(struct aead_request *req) return poly_init(req); } -static void chacha_encrypt_done(struct crypto_async_request *areq, int err) +static void chacha_encrypt_done(void *data, int err) { - async_done_continue(areq->data, err, poly_genkey); + async_done_continue(data, err, poly_genkey); } static int chacha_encrypt(struct aead_request *req) diff --git a/crypto/cryptd.c b/crypto/cryptd.c index 29890fc0eab7..37365ed30b38 100644 --- a/crypto/cryptd.c +++ b/crypto/cryptd.c @@ -285,10 +285,9 @@ static void cryptd_skcipher_complete(struct skcipher_request *req, int err, crypto_free_skcipher(tfm); } -static void cryptd_skcipher_encrypt(struct crypto_async_request *base, - int err) +static void cryptd_skcipher_encrypt(void *data, int err) { - struct skcipher_request *req = skcipher_request_cast(base); + struct skcipher_request *req = data; struct skcipher_request *subreq; subreq = cryptd_skcipher_prepare(req, err); @@ -298,10 +297,9 @@ static void cryptd_skcipher_encrypt(struct crypto_async_request *base, cryptd_skcipher_complete(req, err, cryptd_skcipher_encrypt); } -static void cryptd_skcipher_decrypt(struct crypto_async_request *base, - int err) +static void cryptd_skcipher_decrypt(void *data, int err) { - struct skcipher_request *req = skcipher_request_cast(base); + struct skcipher_request *req = data; struct skcipher_request *subreq; subreq = cryptd_skcipher_prepare(req, err); @@ -515,9 +513,9 @@ static void cryptd_hash_complete(struct ahash_request *req, int err, crypto_free_ahash(tfm); } -static void cryptd_hash_init(struct crypto_async_request *req_async, int err) +static void cryptd_hash_init(void *data, int err) { - struct ahash_request *req = ahash_request_cast(req_async); + struct ahash_request *req = data; struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); struct cryptd_hash_ctx *ctx = crypto_ahash_ctx(tfm); struct crypto_shash *child = ctx->child; @@ -540,9 +538,9 @@ static int cryptd_hash_init_enqueue(struct ahash_request *req) return cryptd_hash_enqueue(req, cryptd_hash_init); } -static void cryptd_hash_update(struct crypto_async_request *req_async, int err) +static void cryptd_hash_update(void *data, int err) { - struct ahash_request *req = ahash_request_cast(req_async); + struct ahash_request *req = data; struct shash_desc *desc; desc = cryptd_hash_prepare(req, err); @@ -557,9 +555,9 @@ static int cryptd_hash_update_enqueue(struct ahash_request *req) return cryptd_hash_enqueue(req, cryptd_hash_update); } -static void cryptd_hash_final(struct crypto_async_request *req_async, int err) +static void cryptd_hash_final(void *data, int err) { - struct ahash_request *req = ahash_request_cast(req_async); + struct ahash_request *req = data; struct shash_desc *desc; desc = cryptd_hash_prepare(req, err); @@ -574,9 +572,9 @@ static int cryptd_hash_final_enqueue(struct ahash_request *req) return cryptd_hash_enqueue(req, cryptd_hash_final); } -static void cryptd_hash_finup(struct crypto_async_request *req_async, int err) +static void cryptd_hash_finup(void *data, int err) { - struct ahash_request *req = ahash_request_cast(req_async); + struct ahash_request *req = data; struct shash_desc *desc; desc = cryptd_hash_prepare(req, err); @@ -591,9 +589,9 @@ static int cryptd_hash_finup_enqueue(struct ahash_request *req) return cryptd_hash_enqueue(req, cryptd_hash_finup); } -static void cryptd_hash_digest(struct crypto_async_request *req_async, int err) +static void cryptd_hash_digest(void *data, int err) { - struct ahash_request *req = ahash_request_cast(req_async); + struct ahash_request *req = data; struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); struct cryptd_hash_ctx *ctx = crypto_ahash_ctx(tfm); struct crypto_shash *child = ctx->child; @@ -767,24 +765,26 @@ out: crypto_free_aead(tfm); } -static void cryptd_aead_encrypt(struct crypto_async_request *areq, int err) +static void cryptd_aead_encrypt(void *data, int err) { - struct cryptd_aead_ctx *ctx = crypto_tfm_ctx(areq->tfm); - struct crypto_aead *child = ctx->child; - struct aead_request *req; + struct aead_request *req = data; + struct cryptd_aead_ctx *ctx; + struct crypto_aead *child; - req = container_of(areq, struct aead_request, base); + ctx = crypto_aead_ctx(crypto_aead_reqtfm(req)); + child = ctx->child; cryptd_aead_crypt(req, child, err, crypto_aead_alg(child)->encrypt, cryptd_aead_encrypt); } -static void cryptd_aead_decrypt(struct crypto_async_request *areq, int err) +static void cryptd_aead_decrypt(void *data, int err) { - struct cryptd_aead_ctx *ctx = crypto_tfm_ctx(areq->tfm); - struct crypto_aead *child = ctx->child; - struct aead_request *req; + struct aead_request *req = data; + struct cryptd_aead_ctx *ctx; + struct crypto_aead *child; - req = container_of(areq, struct aead_request, base); + ctx = crypto_aead_ctx(crypto_aead_reqtfm(req)); + child = ctx->child; cryptd_aead_crypt(req, child, err, crypto_aead_alg(child)->decrypt, cryptd_aead_decrypt); } diff --git a/crypto/cts.c b/crypto/cts.c index 3766d47ebcc0..8f604f6554b1 100644 --- a/crypto/cts.c +++ b/crypto/cts.c @@ -85,9 +85,9 @@ static int crypto_cts_setkey(struct crypto_skcipher *parent, const u8 *key, return crypto_skcipher_setkey(child, key, keylen); } -static void cts_cbc_crypt_done(struct crypto_async_request *areq, int err) +static void cts_cbc_crypt_done(void *data, int err) { - struct skcipher_request *req = areq->data; + struct skcipher_request *req = data; if (err == -EINPROGRESS) return; @@ -125,9 +125,9 @@ static int cts_cbc_encrypt(struct skcipher_request *req) return crypto_skcipher_encrypt(subreq); } -static void crypto_cts_encrypt_done(struct crypto_async_request *areq, int err) +static void crypto_cts_encrypt_done(void *data, int err) { - struct skcipher_request *req = areq->data; + struct skcipher_request *req = data; if (err) goto out; @@ -219,9 +219,9 @@ static int cts_cbc_decrypt(struct skcipher_request *req) return crypto_skcipher_decrypt(subreq); } -static void crypto_cts_decrypt_done(struct crypto_async_request *areq, int err) +static void crypto_cts_decrypt_done(void *data, int err) { - struct skcipher_request *req = areq->data; + struct skcipher_request *req = data; if (err) goto out; diff --git a/crypto/dh.c b/crypto/dh.c index e39c1bde1ac0..0fcad279e6fe 100644 --- a/crypto/dh.c +++ b/crypto/dh.c @@ -503,10 +503,9 @@ out: return err; } -static void dh_safe_prime_complete_req(struct crypto_async_request *dh_req, - int err) +static void dh_safe_prime_complete_req(void *data, int err) { - struct kpp_request *req = dh_req->data; + struct kpp_request *req = data; kpp_request_complete(req, err); } diff --git a/crypto/essiv.c b/crypto/essiv.c index 307eba74b901..f7d4ef4837e5 100644 --- a/crypto/essiv.c +++ b/crypto/essiv.c @@ -131,9 +131,9 @@ static int essiv_aead_setauthsize(struct crypto_aead *tfm, return crypto_aead_setauthsize(tctx->u.aead, authsize); } -static void essiv_skcipher_done(struct crypto_async_request *areq, int err) +static void essiv_skcipher_done(void *data, int err) { - struct skcipher_request *req = areq->data; + struct skcipher_request *req = data; skcipher_request_complete(req, err); } @@ -166,9 +166,9 @@ static int essiv_skcipher_decrypt(struct skcipher_request *req) return essiv_skcipher_crypt(req, false); } -static void essiv_aead_done(struct crypto_async_request *areq, int err) +static void essiv_aead_done(void *data, int err) { - struct aead_request *req = areq->data; + struct aead_request *req = data; struct essiv_aead_request_ctx *rctx = aead_request_ctx(req); if (err == -EINPROGRESS) diff --git a/crypto/gcm.c b/crypto/gcm.c index 338ee0769747..4ba624450c3f 100644 --- a/crypto/gcm.c +++ b/crypto/gcm.c @@ -197,7 +197,7 @@ static inline unsigned int gcm_remain(unsigned int len) return len ? 16 - len : 0; } -static void gcm_hash_len_done(struct crypto_async_request *areq, int err); +static void gcm_hash_len_done(void *data, int err); static int gcm_hash_update(struct aead_request *req, crypto_completion_t compl, @@ -246,9 +246,9 @@ static int gcm_hash_len_continue(struct aead_request *req, u32 flags) return gctx->complete(req, flags); } -static void gcm_hash_len_done(struct crypto_async_request *areq, int err) +static void gcm_hash_len_done(void *data, int err) { - struct aead_request *req = areq->data; + struct aead_request *req = data; if (err) goto out; @@ -267,10 +267,9 @@ static int gcm_hash_crypt_remain_continue(struct aead_request *req, u32 flags) gcm_hash_len_continue(req, flags); } -static void gcm_hash_crypt_remain_done(struct crypto_async_request *areq, - int err) +static void gcm_hash_crypt_remain_done(void *data, int err) { - struct aead_request *req = areq->data; + struct aead_request *req = data; if (err) goto out; @@ -298,9 +297,9 @@ static int gcm_hash_crypt_continue(struct aead_request *req, u32 flags) return gcm_hash_crypt_remain_continue(req, flags); } -static void gcm_hash_crypt_done(struct crypto_async_request *areq, int err) +static void gcm_hash_crypt_done(void *data, int err) { - struct aead_request *req = areq->data; + struct aead_request *req = data; if (err) goto out; @@ -326,10 +325,9 @@ static int gcm_hash_assoc_remain_continue(struct aead_request *req, u32 flags) return gcm_hash_crypt_remain_continue(req, flags); } -static void gcm_hash_assoc_remain_done(struct crypto_async_request *areq, - int err) +static void gcm_hash_assoc_remain_done(void *data, int err) { - struct aead_request *req = areq->data; + struct aead_request *req = data; if (err) goto out; @@ -355,9 +353,9 @@ static int gcm_hash_assoc_continue(struct aead_request *req, u32 flags) return gcm_hash_assoc_remain_continue(req, flags); } -static void gcm_hash_assoc_done(struct crypto_async_request *areq, int err) +static void gcm_hash_assoc_done(void *data, int err) { - struct aead_request *req = areq->data; + struct aead_request *req = data; if (err) goto out; @@ -380,9 +378,9 @@ static int gcm_hash_init_continue(struct aead_request *req, u32 flags) return gcm_hash_assoc_remain_continue(req, flags); } -static void gcm_hash_init_done(struct crypto_async_request *areq, int err) +static void gcm_hash_init_done(void *data, int err) { - struct aead_request *req = areq->data; + struct aead_request *req = data; if (err) goto out; @@ -433,9 +431,9 @@ static int gcm_encrypt_continue(struct aead_request *req, u32 flags) return gcm_hash(req, flags); } -static void gcm_encrypt_done(struct crypto_async_request *areq, int err) +static void gcm_encrypt_done(void *data, int err) { - struct aead_request *req = areq->data; + struct aead_request *req = data; if (err) goto out; @@ -477,9 +475,9 @@ static int crypto_gcm_verify(struct aead_request *req) return crypto_memneq(iauth_tag, auth_tag, authsize) ? -EBADMSG : 0; } -static void gcm_decrypt_done(struct crypto_async_request *areq, int err) +static void gcm_decrypt_done(void *data, int err) { - struct aead_request *req = areq->data; + struct aead_request *req = data; if (!err) err = crypto_gcm_verify(req); diff --git a/crypto/hctr2.c b/crypto/hctr2.c index 7d00a3bcb667..6f4c1884d0e9 100644 --- a/crypto/hctr2.c +++ b/crypto/hctr2.c @@ -252,10 +252,9 @@ static int hctr2_finish(struct skcipher_request *req) return 0; } -static void hctr2_xctr_done(struct crypto_async_request *areq, - int err) +static void hctr2_xctr_done(void *data, int err) { - struct skcipher_request *req = areq->data; + struct skcipher_request *req = data; if (!err) err = hctr2_finish(req); diff --git a/crypto/lrw.c b/crypto/lrw.c index 8d59a66b6525..1b0f76ba3eb5 100644 --- a/crypto/lrw.c +++ b/crypto/lrw.c @@ -205,9 +205,9 @@ static int lrw_xor_tweak_post(struct skcipher_request *req) return lrw_xor_tweak(req, true); } -static void lrw_crypt_done(struct crypto_async_request *areq, int err) +static void lrw_crypt_done(void *data, int err) { - struct skcipher_request *req = areq->data; + struct skcipher_request *req = data; if (!err) { struct lrw_request_ctx *rctx = skcipher_request_ctx(req); diff --git a/crypto/pcrypt.c b/crypto/pcrypt.c index 9d10b846ccf7..8c1d0ca41213 100644 --- a/crypto/pcrypt.c +++ b/crypto/pcrypt.c @@ -63,9 +63,9 @@ static void pcrypt_aead_serial(struct padata_priv *padata) aead_request_complete(req->base.data, padata->info); } -static void pcrypt_aead_done(struct crypto_async_request *areq, int err) +static void pcrypt_aead_done(void *data, int err) { - struct aead_request *req = areq->data; + struct aead_request *req = data; struct pcrypt_request *preq = aead_request_ctx(req); struct padata_priv *padata = pcrypt_request_padata(preq); diff --git a/crypto/rsa-pkcs1pad.c b/crypto/rsa-pkcs1pad.c index 02028670331d..d2e5e104f8cf 100644 --- a/crypto/rsa-pkcs1pad.c +++ b/crypto/rsa-pkcs1pad.c @@ -210,10 +210,9 @@ out: return err; } -static void pkcs1pad_encrypt_sign_complete_cb( - struct crypto_async_request *child_async_req, int err) +static void pkcs1pad_encrypt_sign_complete_cb(void *data, int err) { - struct akcipher_request *req = child_async_req->data; + struct akcipher_request *req = data; if (err == -EINPROGRESS) goto out; @@ -326,10 +325,9 @@ done: return err; } -static void pkcs1pad_decrypt_complete_cb( - struct crypto_async_request *child_async_req, int err) +static void pkcs1pad_decrypt_complete_cb(void *data, int err) { - struct akcipher_request *req = child_async_req->data; + struct akcipher_request *req = data; if (err == -EINPROGRESS) goto out; @@ -506,10 +504,9 @@ done: return err; } -static void pkcs1pad_verify_complete_cb( - struct crypto_async_request *child_async_req, int err) +static void pkcs1pad_verify_complete_cb(void *data, int err) { - struct akcipher_request *req = child_async_req->data; + struct akcipher_request *req = data; if (err == -EINPROGRESS) goto out; diff --git a/crypto/seqiv.c b/crypto/seqiv.c index b1bcfe537daf..17e11d51ddc3 100644 --- a/crypto/seqiv.c +++ b/crypto/seqiv.c @@ -36,10 +36,9 @@ out: kfree_sensitive(subreq->iv); } -static void seqiv_aead_encrypt_complete(struct crypto_async_request *base, - int err) +static void seqiv_aead_encrypt_complete(void *data, int err) { - struct aead_request *req = base->data; + struct aead_request *req = data; seqiv_aead_encrypt_complete2(req, err); aead_request_complete(req, err); diff --git a/crypto/xts.c b/crypto/xts.c index de6cbcf69bbd..09be909a6a1a 100644 --- a/crypto/xts.c +++ b/crypto/xts.c @@ -140,9 +140,9 @@ static int xts_xor_tweak_post(struct skcipher_request *req, bool enc) return xts_xor_tweak(req, true, enc); } -static void xts_cts_done(struct crypto_async_request *areq, int err) +static void xts_cts_done(void *data, int err) { - struct skcipher_request *req = areq->data; + struct skcipher_request *req = data; le128 b; if (!err) { @@ -196,9 +196,9 @@ static int xts_cts_final(struct skcipher_request *req, return 0; } -static void xts_encrypt_done(struct crypto_async_request *areq, int err) +static void xts_encrypt_done(void *data, int err) { - struct skcipher_request *req = areq->data; + struct skcipher_request *req = data; if (!err) { struct xts_request_ctx *rctx = skcipher_request_ctx(req); @@ -216,9 +216,9 @@ static void xts_encrypt_done(struct crypto_async_request *areq, int err) skcipher_request_complete(req, err); } -static void xts_decrypt_done(struct crypto_async_request *areq, int err) +static void xts_decrypt_done(void *data, int err) { - struct skcipher_request *req = areq->data; + struct skcipher_request *req = data; if (!err) { struct xts_request_ctx *rctx = skcipher_request_ctx(req); diff --git a/drivers/crypto/atmel-sha.c b/drivers/crypto/atmel-sha.c index a77cf0da0816..e7c1db2739ec 100644 --- a/drivers/crypto/atmel-sha.c +++ b/drivers/crypto/atmel-sha.c @@ -2099,10 +2099,9 @@ struct atmel_sha_authenc_reqctx { unsigned int digestlen; }; -static void atmel_sha_authenc_complete(struct crypto_async_request *areq, - int err) +static void atmel_sha_authenc_complete(void *data, int err) { - struct ahash_request *req = areq->data; + struct ahash_request *req = data; struct atmel_sha_authenc_reqctx *authctx = ahash_request_ctx(req); authctx->cb(authctx->aes_dev, err, authctx->base.dd->is_async); diff --git a/include/crypto/algapi.h b/include/crypto/algapi.h index 1fd81e74a174..fede394ae2ab 100644 --- a/include/crypto/algapi.h +++ b/include/crypto/algapi.h @@ -305,8 +305,7 @@ enum { static inline void crypto_request_complete(struct crypto_async_request *req, int err) { - crypto_completion_t complete = req->complete; - complete(req, err); + req->complete(req->data, err); } #endif /* _CRYPTO_ALGAPI_H */ diff --git a/include/crypto/if_alg.h b/include/crypto/if_alg.h index a5db86670bdf..7e76623f9ec3 100644 --- a/include/crypto/if_alg.h +++ b/include/crypto/if_alg.h @@ -21,8 +21,6 @@ #define ALG_MAX_PAGES 16 -struct crypto_async_request; - struct alg_sock { /* struct sock must be the first member of struct alg_sock */ struct sock sk; @@ -235,7 +233,7 @@ int af_alg_sendmsg(struct socket *sock, struct msghdr *msg, size_t size, ssize_t af_alg_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags); void af_alg_free_resources(struct af_alg_async_req *areq); -void af_alg_async_cb(struct crypto_async_request *_req, int err); +void af_alg_async_cb(void *data, int err); __poll_t af_alg_poll(struct file *file, struct socket *sock, poll_table *wait); struct af_alg_async_req *af_alg_alloc_areq(struct sock *sk, diff --git a/include/linux/crypto.h b/include/linux/crypto.h index b18f6e669fb1..80f6350fb588 100644 --- a/include/linux/crypto.h +++ b/include/linux/crypto.h @@ -176,8 +176,8 @@ struct crypto_async_request; struct crypto_tfm; struct crypto_type; -typedef struct crypto_async_request crypto_completion_data_t; -typedef void (*crypto_completion_t)(struct crypto_async_request *req, int err); +typedef void crypto_completion_data_t; +typedef void (*crypto_completion_t)(void *req, int err); /** * DOC: Block Cipher Context Data Structures @@ -596,12 +596,12 @@ struct crypto_wait { /* * Async ops completion helper functioons */ -static inline void *crypto_get_completion_data(crypto_completion_data_t *req) +static inline void *crypto_get_completion_data(void *data) { - return req->data; + return data; } -void crypto_req_done(struct crypto_async_request *req, int err); +void crypto_req_done(void *req, int err); static inline int crypto_wait_req(int err, struct crypto_wait *wait) { From dcfe653d7cd4f3891aa0a471a1afdae884941bee Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 6 Feb 2023 18:22:34 +0800 Subject: [PATCH 137/156] dm: Remove completion function scaffolding This patch removes the temporary scaffolding now that the comletion function signature has been converted. Signed-off-by: Herbert Xu Acked-by: Mike Snitzer Signed-off-by: Herbert Xu --- drivers/md/dm-crypt.c | 6 +++--- drivers/md/dm-integrity.c | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 7609fe39ab8c..3aeeb8f2802f 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1458,7 +1458,7 @@ static int crypt_convert_block_skcipher(struct crypt_config *cc, return r; } -static void kcryptd_async_done(crypto_completion_data_t *async_req, int error); +static void kcryptd_async_done(void *async_req, int error); static int crypt_alloc_req_skcipher(struct crypt_config *cc, struct convert_context *ctx) @@ -2146,9 +2146,9 @@ static void kcryptd_crypt_read_convert(struct dm_crypt_io *io) crypt_dec_pending(io); } -static void kcryptd_async_done(crypto_completion_data_t *data, int error) +static void kcryptd_async_done(void *data, int error) { - struct dm_crypt_request *dmreq = crypto_get_completion_data(data); + struct dm_crypt_request *dmreq = data; struct convert_context *ctx = dmreq->ctx; struct dm_crypt_io *io = container_of(ctx, struct dm_crypt_io, ctx); struct crypt_config *cc = io->cc; diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index eefe25ed841e..c58156deb2b1 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -955,9 +955,9 @@ static void xor_journal(struct dm_integrity_c *ic, bool encrypt, unsigned sectio async_tx_issue_pending_all(); } -static void complete_journal_encrypt(crypto_completion_data_t *data, int err) +static void complete_journal_encrypt(void *data, int err) { - struct journal_completion *comp = crypto_get_completion_data(data); + struct journal_completion *comp = data; if (unlikely(err)) { if (likely(err == -EINPROGRESS)) { complete(&comp->ic->crypto_backoff); From 23b8b93ba94bff36fb35454df1c93481fc47c160 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 6 Feb 2023 18:22:36 +0800 Subject: [PATCH 138/156] net: macsec: Remove completion function scaffolding This patch removes the temporary scaffolding now that the comletion function signature has been converted. Signed-off-by: Herbert Xu --- drivers/net/macsec.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c index b7d9d487ccd2..becb04123d3e 100644 --- a/drivers/net/macsec.c +++ b/drivers/net/macsec.c @@ -528,9 +528,9 @@ static void count_tx(struct net_device *dev, int ret, int len) } } -static void macsec_encrypt_done(crypto_completion_data_t *data, int err) +static void macsec_encrypt_done(void *data, int err) { - struct sk_buff *skb = crypto_get_completion_data(data); + struct sk_buff *skb = data; struct net_device *dev = skb->dev; struct macsec_dev *macsec = macsec_priv(dev); struct macsec_tx_sa *sa = macsec_skb_cb(skb)->tx_sa; @@ -835,9 +835,9 @@ static void count_rx(struct net_device *dev, int len) u64_stats_update_end(&stats->syncp); } -static void macsec_decrypt_done(crypto_completion_data_t *data, int err) +static void macsec_decrypt_done(void *data, int err) { - struct sk_buff *skb = crypto_get_completion_data(data); + struct sk_buff *skb = data; struct net_device *dev = skb->dev; struct macsec_dev *macsec = macsec_priv(dev); struct macsec_rx_sa *rx_sa = macsec_skb_cb(skb)->rx_sa; From fd5dabf764f79ced0b128ddad39fb272e884a5b8 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 6 Feb 2023 18:22:38 +0800 Subject: [PATCH 139/156] net: ipv4: Remove completion function scaffolding This patch removes the temporary scaffolding now that the comletion function signature has been converted. Signed-off-by: Herbert Xu --- net/ipv4/ah4.c | 8 ++++---- net/ipv4/esp4.c | 16 ++++++++-------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index 1fc0231eb1ee..015c0f4ec5ba 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -117,11 +117,11 @@ static int ip_clear_mutable_options(const struct iphdr *iph, __be32 *daddr) return 0; } -static void ah_output_done(crypto_completion_data_t *data, int err) +static void ah_output_done(void *data, int err) { u8 *icv; struct iphdr *iph; - struct sk_buff *skb = crypto_get_completion_data(data); + struct sk_buff *skb = data; struct xfrm_state *x = skb_dst(skb)->xfrm; struct ah_data *ahp = x->data; struct iphdr *top_iph = ip_hdr(skb); @@ -262,12 +262,12 @@ out: return err; } -static void ah_input_done(crypto_completion_data_t *data, int err) +static void ah_input_done(void *data, int err) { u8 *auth_data; u8 *icv; struct iphdr *work_iph; - struct sk_buff *skb = crypto_get_completion_data(data); + struct sk_buff *skb = data; struct xfrm_state *x = xfrm_input_state(skb); struct ah_data *ahp = x->data; struct ip_auth_hdr *ah = ip_auth_hdr(skb); diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 8abe07c1ff28..ba06ed42e428 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -244,9 +244,9 @@ static int esp_output_tail_tcp(struct xfrm_state *x, struct sk_buff *skb) } #endif -static void esp_output_done(crypto_completion_data_t *data, int err) +static void esp_output_done(void *data, int err) { - struct sk_buff *skb = crypto_get_completion_data(data); + struct sk_buff *skb = data; struct xfrm_offload *xo = xfrm_offload(skb); void *tmp; struct xfrm_state *x; @@ -332,9 +332,9 @@ static struct ip_esp_hdr *esp_output_set_extra(struct sk_buff *skb, return esph; } -static void esp_output_done_esn(crypto_completion_data_t *data, int err) +static void esp_output_done_esn(void *data, int err) { - struct sk_buff *skb = crypto_get_completion_data(data); + struct sk_buff *skb = data; esp_output_restore_header(skb); esp_output_done(data, err); @@ -830,9 +830,9 @@ out: } EXPORT_SYMBOL_GPL(esp_input_done2); -static void esp_input_done(crypto_completion_data_t *data, int err) +static void esp_input_done(void *data, int err) { - struct sk_buff *skb = crypto_get_completion_data(data); + struct sk_buff *skb = data; xfrm_input_resume(skb, esp_input_done2(skb, err)); } @@ -860,9 +860,9 @@ static void esp_input_set_header(struct sk_buff *skb, __be32 *seqhi) } } -static void esp_input_done_esn(crypto_completion_data_t *data, int err) +static void esp_input_done_esn(void *data, int err) { - struct sk_buff *skb = crypto_get_completion_data(data); + struct sk_buff *skb = data; esp_input_restore_header(skb); esp_input_done(data, err); From 6002e20dd0f85cfaedc2f2f54a60119f84a60838 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 6 Feb 2023 18:22:40 +0800 Subject: [PATCH 140/156] net: ipv6: Remove completion function scaffolding This patch removes the temporary scaffolding now that the comletion function signature has been converted. Signed-off-by: Herbert Xu --- net/ipv6/ah6.c | 8 ++++---- net/ipv6/esp6.c | 16 ++++++++-------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index e43735578a76..01005035ad10 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c @@ -281,12 +281,12 @@ static int ipv6_clear_mutable_options(struct ipv6hdr *iph, int len, int dir) return 0; } -static void ah6_output_done(crypto_completion_data_t *data, int err) +static void ah6_output_done(void *data, int err) { int extlen; u8 *iph_base; u8 *icv; - struct sk_buff *skb = crypto_get_completion_data(data); + struct sk_buff *skb = data; struct xfrm_state *x = skb_dst(skb)->xfrm; struct ah_data *ahp = x->data; struct ipv6hdr *top_iph = ipv6_hdr(skb); @@ -451,12 +451,12 @@ out: return err; } -static void ah6_input_done(crypto_completion_data_t *data, int err) +static void ah6_input_done(void *data, int err) { u8 *auth_data; u8 *icv; u8 *work_iph; - struct sk_buff *skb = crypto_get_completion_data(data); + struct sk_buff *skb = data; struct xfrm_state *x = xfrm_input_state(skb); struct ah_data *ahp = x->data; struct ip_auth_hdr *ah = ip_auth_hdr(skb); diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index b9ee81c7dfcf..fddd0cbdede1 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -278,9 +278,9 @@ static void esp_output_encap_csum(struct sk_buff *skb) } } -static void esp_output_done(crypto_completion_data_t *data, int err) +static void esp_output_done(void *data, int err) { - struct sk_buff *skb = crypto_get_completion_data(data); + struct sk_buff *skb = data; struct xfrm_offload *xo = xfrm_offload(skb); void *tmp; struct xfrm_state *x; @@ -368,9 +368,9 @@ static struct ip_esp_hdr *esp_output_set_esn(struct sk_buff *skb, return esph; } -static void esp_output_done_esn(crypto_completion_data_t *data, int err) +static void esp_output_done_esn(void *data, int err) { - struct sk_buff *skb = crypto_get_completion_data(data); + struct sk_buff *skb = data; esp_output_restore_header(skb); esp_output_done(data, err); @@ -879,9 +879,9 @@ out: } EXPORT_SYMBOL_GPL(esp6_input_done2); -static void esp_input_done(crypto_completion_data_t *data, int err) +static void esp_input_done(void *data, int err) { - struct sk_buff *skb = crypto_get_completion_data(data); + struct sk_buff *skb = data; xfrm_input_resume(skb, esp6_input_done2(skb, err)); } @@ -909,9 +909,9 @@ static void esp_input_set_header(struct sk_buff *skb, __be32 *seqhi) } } -static void esp_input_done_esn(crypto_completion_data_t *data, int err) +static void esp_input_done_esn(void *data, int err) { - struct sk_buff *skb = crypto_get_completion_data(data); + struct sk_buff *skb = data; esp_input_restore_header(skb); esp_input_done(data, err); From 65cb4657bac776967f1f794fb45d2fa419565c63 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 6 Feb 2023 18:22:42 +0800 Subject: [PATCH 141/156] tipc: Remove completion function scaffolding This patch removes the temporary scaffolding now that the comletion function signature has been converted. Signed-off-by: Herbert Xu --- net/tipc/crypto.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/net/tipc/crypto.c b/net/tipc/crypto.c index ab356e7a3870..577fa5af33ec 100644 --- a/net/tipc/crypto.c +++ b/net/tipc/crypto.c @@ -267,10 +267,10 @@ static int tipc_aead_encrypt(struct tipc_aead *aead, struct sk_buff *skb, struct tipc_bearer *b, struct tipc_media_addr *dst, struct tipc_node *__dnode); -static void tipc_aead_encrypt_done(crypto_completion_data_t *data, int err); +static void tipc_aead_encrypt_done(void *data, int err); static int tipc_aead_decrypt(struct net *net, struct tipc_aead *aead, struct sk_buff *skb, struct tipc_bearer *b); -static void tipc_aead_decrypt_done(crypto_completion_data_t *data, int err); +static void tipc_aead_decrypt_done(void *data, int err); static inline int tipc_ehdr_size(struct tipc_ehdr *ehdr); static int tipc_ehdr_build(struct net *net, struct tipc_aead *aead, u8 tx_key, struct sk_buff *skb, @@ -830,9 +830,9 @@ exit: return rc; } -static void tipc_aead_encrypt_done(crypto_completion_data_t *data, int err) +static void tipc_aead_encrypt_done(void *data, int err) { - struct sk_buff *skb = crypto_get_completion_data(data); + struct sk_buff *skb = data; struct tipc_crypto_tx_ctx *tx_ctx = TIPC_SKB_CB(skb)->crypto_ctx; struct tipc_bearer *b = tx_ctx->bearer; struct tipc_aead *aead = tx_ctx->aead; @@ -954,9 +954,9 @@ exit: return rc; } -static void tipc_aead_decrypt_done(crypto_completion_data_t *data, int err) +static void tipc_aead_decrypt_done(void *data, int err) { - struct sk_buff *skb = crypto_get_completion_data(data); + struct sk_buff *skb = data; struct tipc_crypto_rx_ctx *rx_ctx = TIPC_SKB_CB(skb)->crypto_ctx; struct tipc_bearer *b = rx_ctx->bearer; struct tipc_aead *aead = rx_ctx->aead; From 8580e55aa85f5000065450add82707a5ee7f88f5 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 6 Feb 2023 18:22:44 +0800 Subject: [PATCH 142/156] tls: Remove completion function scaffolding This patch removes the temporary scaffolding now that the comletion function signature has been converted. Signed-off-by: Herbert Xu --- net/tls/tls_sw.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 5b7f67a7d394..0515cda32fe2 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -179,9 +179,9 @@ static int tls_padding_length(struct tls_prot_info *prot, struct sk_buff *skb, return sub; } -static void tls_decrypt_done(crypto_completion_data_t *data, int err) +static void tls_decrypt_done(void *data, int err) { - struct aead_request *aead_req = crypto_get_completion_data(data); + struct aead_request *aead_req = data; struct crypto_aead *aead = crypto_aead_reqtfm(aead_req); struct scatterlist *sgout = aead_req->dst; struct scatterlist *sgin = aead_req->src; @@ -428,9 +428,9 @@ tx_err: return rc; } -static void tls_encrypt_done(crypto_completion_data_t *data, int err) +static void tls_encrypt_done(void *data, int err) { - struct aead_request *aead_req = crypto_get_completion_data(data); + struct aead_request *aead_req = data; struct tls_sw_context_tx *ctx; struct tls_context *tls_ctx; struct tls_prot_info *prot; From 846366b0df0ad19051fbdd38734b1f7690d814c3 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 6 Feb 2023 18:22:46 +0800 Subject: [PATCH 143/156] crypto: api - Remove completion function scaffolding This patch removes the temporary scaffolding now that the comletion function signature has been converted. Signed-off-by: Herbert Xu Acked-by: Jarkko Sakkinen Signed-off-by: Herbert Xu --- include/linux/crypto.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/include/linux/crypto.h b/include/linux/crypto.h index 80f6350fb588..bb1d9b0e1647 100644 --- a/include/linux/crypto.h +++ b/include/linux/crypto.h @@ -176,7 +176,6 @@ struct crypto_async_request; struct crypto_tfm; struct crypto_type; -typedef void crypto_completion_data_t; typedef void (*crypto_completion_t)(void *req, int err); /** @@ -596,11 +595,6 @@ struct crypto_wait { /* * Async ops completion helper functioons */ -static inline void *crypto_get_completion_data(void *data) -{ - return data; -} - void crypto_req_done(void *req, int err); static inline int crypto_wait_req(int err, struct crypto_wait *wait) From d3777ceaad080716f6d2c1a4c62020d494df00db Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 7 Feb 2023 16:18:36 +0800 Subject: [PATCH 144/156] tls: Pass rec instead of aead_req into tls_encrypt_done The function tls_encrypt_done only uses aead_req to get ahold of the tls_rec object. So we could pass that in instead of aead_req to simplify the code. Suggested-by: Jakub Kicinski Signed-off-by: Herbert Xu Reviewed-by: Jakub Kicinski Signed-off-by: Herbert Xu --- net/tls/tls_sw.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 0515cda32fe2..6dfec2e8fdfa 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -430,18 +430,16 @@ tx_err: static void tls_encrypt_done(void *data, int err) { - struct aead_request *aead_req = data; struct tls_sw_context_tx *ctx; struct tls_context *tls_ctx; struct tls_prot_info *prot; + struct tls_rec *rec = data; struct scatterlist *sge; struct sk_msg *msg_en; - struct tls_rec *rec; bool ready = false; struct sock *sk; int pending; - rec = container_of(aead_req, struct tls_rec, aead_req); msg_en = &rec->msg_encrypted; sk = rec->sk; @@ -536,7 +534,7 @@ static int tls_do_encryption(struct sock *sk, data_len, rec->iv_data); aead_request_set_callback(aead_req, CRYPTO_TFM_REQ_MAY_BACKLOG, - tls_encrypt_done, aead_req); + tls_encrypt_done, rec); /* Add the record in tx_list */ list_add_tail((struct list_head *)&rec->list, &ctx->tx_list); From 5072b1c211107fca80b75f0ac0ca6f90e3385837 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 6 Feb 2023 12:53:38 +0800 Subject: [PATCH 145/156] crypto: ecc - Silence sparse warning Rewrite the bitwise operations to silence the sparse warnings: CHECK ../crypto/ecc.c ../crypto/ecc.c:1387:39: warning: dubious: !x | y ../crypto/ecc.c:1397:47: warning: dubious: !x | y Signed-off-by: Herbert Xu Reviewed-by: Vitaly Chikunov Signed-off-by: Herbert Xu --- crypto/ecc.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/crypto/ecc.c b/crypto/ecc.c index 7315217c8f73..f53fb4d6af99 100644 --- a/crypto/ecc.c +++ b/crypto/ecc.c @@ -1384,7 +1384,8 @@ void ecc_point_mult_shamir(const struct ecc_point *result, num_bits = max(vli_num_bits(u1, ndigits), vli_num_bits(u2, ndigits)); i = num_bits - 1; - idx = (!!vli_test_bit(u1, i)) | ((!!vli_test_bit(u2, i)) << 1); + idx = !!vli_test_bit(u1, i); + idx |= (!!vli_test_bit(u2, i)) << 1; point = points[idx]; vli_set(rx, point->x, ndigits); @@ -1394,7 +1395,8 @@ void ecc_point_mult_shamir(const struct ecc_point *result, for (--i; i >= 0; i--) { ecc_point_double_jacobian(rx, ry, z, curve); - idx = (!!vli_test_bit(u1, i)) | ((!!vli_test_bit(u2, i)) << 1); + idx = !!vli_test_bit(u1, i); + idx |= (!!vli_test_bit(u2, i)) << 1; point = points[idx]; if (point) { u64 tx[ECC_MAX_DIGITS]; From 6084466e761754815bea11d0fa67dbf79e2b0666 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 6 Feb 2023 13:08:48 +0800 Subject: [PATCH 146/156] crypto: nx - Fix sparse warnings This driver generates a large number of sparse warnings due to two issues. First of all the structure nx842_devdata is defined inline causing the __rcu tag to be added to all users of it. This easily fixed by splitting up the struct definition. The second issue is with kdoc markers being incomplete. The trivial case of nx842_exec_vas has been fixed, while the other incomplete documentation has simply been downgraded to normal C comments. Signed-off-by: Herbert Xu --- drivers/crypto/nx/nx-common-powernv.c | 13 +++++-------- drivers/crypto/nx/nx-common-pseries.c | 6 ++++-- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/drivers/crypto/nx/nx-common-powernv.c b/drivers/crypto/nx/nx-common-powernv.c index f34c75a862f2..8c859872c183 100644 --- a/drivers/crypto/nx/nx-common-powernv.c +++ b/drivers/crypto/nx/nx-common-powernv.c @@ -72,7 +72,7 @@ static int (*nx842_powernv_exec)(const unsigned char *in, unsigned int inlen, unsigned char *out, unsigned int *outlenp, void *workmem, int fc); -/** +/* * setup_indirect_dde - Setup an indirect DDE * * The DDE is setup with the DDE count, byte count, and address of @@ -89,7 +89,7 @@ static void setup_indirect_dde(struct data_descriptor_entry *dde, dde->address = cpu_to_be64(nx842_get_pa(ddl)); } -/** +/* * setup_direct_dde - Setup single DDE from buffer * * The DDE is setup with the buffer and length. The buffer must be properly @@ -111,7 +111,7 @@ static unsigned int setup_direct_dde(struct data_descriptor_entry *dde, return l; } -/** +/* * setup_ddl - Setup DDL from buffer * * Returns: @@ -181,9 +181,6 @@ static int setup_ddl(struct data_descriptor_entry *dde, CSB_ERR(csb, msg " at %lx", ##__VA_ARGS__, \ (unsigned long)be64_to_cpu((csb)->address)) -/** - * wait_for_csb - */ static int wait_for_csb(struct nx842_workmem *wmem, struct coprocessor_status_block *csb) { @@ -632,8 +629,8 @@ static int nx842_exec_vas(const unsigned char *in, unsigned int inlen, * @inlen: input buffer size * @out: output buffer pointer * @outlenp: output buffer size pointer - * @workmem: working memory buffer pointer, size determined by - * nx842_powernv_driver.workmem_size + * @wmem: working memory buffer pointer, size determined by + * nx842_powernv_driver.workmem_size * * Returns: see @nx842_powernv_exec() */ diff --git a/drivers/crypto/nx/nx-common-pseries.c b/drivers/crypto/nx/nx-common-pseries.c index 3ea334b7f820..35f2d0d8507e 100644 --- a/drivers/crypto/nx/nx-common-pseries.c +++ b/drivers/crypto/nx/nx-common-pseries.c @@ -123,14 +123,16 @@ struct ibm_nx842_counters { atomic64_t decomp_times[32]; }; -static struct nx842_devdata { +struct nx842_devdata { struct vio_dev *vdev; struct device *dev; struct ibm_nx842_counters *counters; unsigned int max_sg_len; unsigned int max_sync_size; unsigned int max_sync_sg; -} __rcu *devdata; +}; + +static struct nx842_devdata __rcu *devdata; static DEFINE_SPINLOCK(devdata_mutex); #define NX842_COUNTER_INC(_x) \ From 72bc4e71dbeedee0a446bcbc37c9bb25449072b7 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Mon, 6 Feb 2023 13:18:50 +0800 Subject: [PATCH 147/156] crypto: octeontx2 - Fix objects shared between several modules cn10k_cpt.o, otx2_cptlf.o and otx2_cpt_mbox_common.o are linked into both rvu_cptpf and rvu_cptvf modules: > scripts/Makefile.build:252: ./drivers/crypto/marvell/octeontx2/Makefile: > cn10k_cpt.o is added to multiple modules: rvu_cptpf rvu_cptvf > scripts/Makefile.build:252: ./drivers/crypto/marvell/octeontx2/Makefile: > otx2_cptlf.o is added to multiple modules: rvu_cptpf rvu_cptvf > scripts/Makefile.build:252: ./drivers/crypto/marvell/octeontx2/Makefile: > otx2_cpt_mbox_common.o is added to multiple modules: rvu_cptpf rvu_cptvf Despite they're build under the same Kconfig option (CONFIG_CRYPTO_DEV_OCTEONTX2_CPT), it's better do link the common code into a standalone module and export the shared functions. Under certain circumstances, this can lead to the same situation as fixed by commit 637a642f5ca5 ("zstd: Fixing mixed module-builtin objects"). Plus, those three common object files are relatively big to duplicate them several times. Introduce the new module, rvu_cptcommon, to provide the common functions to both modules. Fixes: 19d8e8c7be15 ("crypto: octeontx2 - add virtual function driver support") Suggested-by: Masahiro Yamada Signed-off-by: Alexander Lobakin Reviewed-by: Masahiro Yamada Signed-off-by: Herbert Xu --- drivers/crypto/marvell/octeontx2/Makefile | 11 +++++------ drivers/crypto/marvell/octeontx2/cn10k_cpt.c | 9 +++++++-- drivers/crypto/marvell/octeontx2/cn10k_cpt.h | 2 -- drivers/crypto/marvell/octeontx2/otx2_cpt_common.h | 2 -- .../marvell/octeontx2/otx2_cpt_mbox_common.c | 14 ++++++++++++-- drivers/crypto/marvell/octeontx2/otx2_cptlf.c | 11 +++++++++++ drivers/crypto/marvell/octeontx2/otx2_cptpf_main.c | 2 ++ drivers/crypto/marvell/octeontx2/otx2_cptvf_main.c | 2 ++ 8 files changed, 39 insertions(+), 14 deletions(-) diff --git a/drivers/crypto/marvell/octeontx2/Makefile b/drivers/crypto/marvell/octeontx2/Makefile index 965297e96954..f0f2942c1d27 100644 --- a/drivers/crypto/marvell/octeontx2/Makefile +++ b/drivers/crypto/marvell/octeontx2/Makefile @@ -1,11 +1,10 @@ # SPDX-License-Identifier: GPL-2.0-only -obj-$(CONFIG_CRYPTO_DEV_OCTEONTX2_CPT) += rvu_cptpf.o rvu_cptvf.o +obj-$(CONFIG_CRYPTO_DEV_OCTEONTX2_CPT) += rvu_cptcommon.o rvu_cptpf.o rvu_cptvf.o +rvu_cptcommon-objs := cn10k_cpt.o otx2_cptlf.o otx2_cpt_mbox_common.o rvu_cptpf-objs := otx2_cptpf_main.o otx2_cptpf_mbox.o \ - otx2_cpt_mbox_common.o otx2_cptpf_ucode.o otx2_cptlf.o \ - cn10k_cpt.o otx2_cpt_devlink.o -rvu_cptvf-objs := otx2_cptvf_main.o otx2_cptvf_mbox.o otx2_cptlf.o \ - otx2_cpt_mbox_common.o otx2_cptvf_reqmgr.o \ - otx2_cptvf_algs.o cn10k_cpt.o + otx2_cptpf_ucode.o otx2_cpt_devlink.o +rvu_cptvf-objs := otx2_cptvf_main.o otx2_cptvf_mbox.o \ + otx2_cptvf_reqmgr.o otx2_cptvf_algs.o ccflags-y += -I$(srctree)/drivers/net/ethernet/marvell/octeontx2/af diff --git a/drivers/crypto/marvell/octeontx2/cn10k_cpt.c b/drivers/crypto/marvell/octeontx2/cn10k_cpt.c index 1499ef75b5c2..93d22b328991 100644 --- a/drivers/crypto/marvell/octeontx2/cn10k_cpt.c +++ b/drivers/crypto/marvell/octeontx2/cn10k_cpt.c @@ -7,6 +7,9 @@ #include "otx2_cptlf.h" #include "cn10k_cpt.h" +static void cn10k_cpt_send_cmd(union otx2_cpt_inst_s *cptinst, u32 insts_num, + struct otx2_cptlf_info *lf); + static struct cpt_hw_ops otx2_hw_ops = { .send_cmd = otx2_cpt_send_cmd, .cpt_get_compcode = otx2_cpt_get_compcode, @@ -19,8 +22,8 @@ static struct cpt_hw_ops cn10k_hw_ops = { .cpt_get_uc_compcode = cn10k_cpt_get_uc_compcode, }; -void cn10k_cpt_send_cmd(union otx2_cpt_inst_s *cptinst, u32 insts_num, - struct otx2_cptlf_info *lf) +static void cn10k_cpt_send_cmd(union otx2_cpt_inst_s *cptinst, u32 insts_num, + struct otx2_cptlf_info *lf) { void __iomem *lmtline = lf->lmtline; u64 val = (lf->slot & 0x7FF); @@ -68,6 +71,7 @@ int cn10k_cptpf_lmtst_init(struct otx2_cptpf_dev *cptpf) return 0; } +EXPORT_SYMBOL_NS_GPL(cn10k_cptpf_lmtst_init, CRYPTO_DEV_OCTEONTX2_CPT); int cn10k_cptvf_lmtst_init(struct otx2_cptvf_dev *cptvf) { @@ -91,3 +95,4 @@ int cn10k_cptvf_lmtst_init(struct otx2_cptvf_dev *cptvf) return 0; } +EXPORT_SYMBOL_NS_GPL(cn10k_cptvf_lmtst_init, CRYPTO_DEV_OCTEONTX2_CPT); diff --git a/drivers/crypto/marvell/octeontx2/cn10k_cpt.h b/drivers/crypto/marvell/octeontx2/cn10k_cpt.h index c091392b47e0..aaefc7e38e06 100644 --- a/drivers/crypto/marvell/octeontx2/cn10k_cpt.h +++ b/drivers/crypto/marvell/octeontx2/cn10k_cpt.h @@ -28,8 +28,6 @@ static inline u8 otx2_cpt_get_uc_compcode(union otx2_cpt_res_s *result) return ((struct cn9k_cpt_res_s *)result)->uc_compcode; } -void cn10k_cpt_send_cmd(union otx2_cpt_inst_s *cptinst, u32 insts_num, - struct otx2_cptlf_info *lf); int cn10k_cptpf_lmtst_init(struct otx2_cptpf_dev *cptpf); int cn10k_cptvf_lmtst_init(struct otx2_cptvf_dev *cptvf); diff --git a/drivers/crypto/marvell/octeontx2/otx2_cpt_common.h b/drivers/crypto/marvell/octeontx2/otx2_cpt_common.h index 5012b7e669f0..6019066a6451 100644 --- a/drivers/crypto/marvell/octeontx2/otx2_cpt_common.h +++ b/drivers/crypto/marvell/octeontx2/otx2_cpt_common.h @@ -145,8 +145,6 @@ int otx2_cpt_send_mbox_msg(struct otx2_mbox *mbox, struct pci_dev *pdev); int otx2_cpt_send_af_reg_requests(struct otx2_mbox *mbox, struct pci_dev *pdev); -int otx2_cpt_add_read_af_reg(struct otx2_mbox *mbox, struct pci_dev *pdev, - u64 reg, u64 *val, int blkaddr); int otx2_cpt_add_write_af_reg(struct otx2_mbox *mbox, struct pci_dev *pdev, u64 reg, u64 val, int blkaddr); int otx2_cpt_read_af_reg(struct otx2_mbox *mbox, struct pci_dev *pdev, diff --git a/drivers/crypto/marvell/octeontx2/otx2_cpt_mbox_common.c b/drivers/crypto/marvell/octeontx2/otx2_cpt_mbox_common.c index a317319696ef..115997475beb 100644 --- a/drivers/crypto/marvell/octeontx2/otx2_cpt_mbox_common.c +++ b/drivers/crypto/marvell/octeontx2/otx2_cpt_mbox_common.c @@ -19,6 +19,7 @@ int otx2_cpt_send_mbox_msg(struct otx2_mbox *mbox, struct pci_dev *pdev) } return ret; } +EXPORT_SYMBOL_NS_GPL(otx2_cpt_send_mbox_msg, CRYPTO_DEV_OCTEONTX2_CPT); int otx2_cpt_send_ready_msg(struct otx2_mbox *mbox, struct pci_dev *pdev) { @@ -36,14 +37,17 @@ int otx2_cpt_send_ready_msg(struct otx2_mbox *mbox, struct pci_dev *pdev) return otx2_cpt_send_mbox_msg(mbox, pdev); } +EXPORT_SYMBOL_NS_GPL(otx2_cpt_send_ready_msg, CRYPTO_DEV_OCTEONTX2_CPT); int otx2_cpt_send_af_reg_requests(struct otx2_mbox *mbox, struct pci_dev *pdev) { return otx2_cpt_send_mbox_msg(mbox, pdev); } +EXPORT_SYMBOL_NS_GPL(otx2_cpt_send_af_reg_requests, CRYPTO_DEV_OCTEONTX2_CPT); -int otx2_cpt_add_read_af_reg(struct otx2_mbox *mbox, struct pci_dev *pdev, - u64 reg, u64 *val, int blkaddr) +static int otx2_cpt_add_read_af_reg(struct otx2_mbox *mbox, + struct pci_dev *pdev, u64 reg, + u64 *val, int blkaddr) { struct cpt_rd_wr_reg_msg *reg_msg; @@ -91,6 +95,7 @@ int otx2_cpt_add_write_af_reg(struct otx2_mbox *mbox, struct pci_dev *pdev, return 0; } +EXPORT_SYMBOL_NS_GPL(otx2_cpt_add_write_af_reg, CRYPTO_DEV_OCTEONTX2_CPT); int otx2_cpt_read_af_reg(struct otx2_mbox *mbox, struct pci_dev *pdev, u64 reg, u64 *val, int blkaddr) @@ -103,6 +108,7 @@ int otx2_cpt_read_af_reg(struct otx2_mbox *mbox, struct pci_dev *pdev, return otx2_cpt_send_mbox_msg(mbox, pdev); } +EXPORT_SYMBOL_NS_GPL(otx2_cpt_read_af_reg, CRYPTO_DEV_OCTEONTX2_CPT); int otx2_cpt_write_af_reg(struct otx2_mbox *mbox, struct pci_dev *pdev, u64 reg, u64 val, int blkaddr) @@ -115,6 +121,7 @@ int otx2_cpt_write_af_reg(struct otx2_mbox *mbox, struct pci_dev *pdev, return otx2_cpt_send_mbox_msg(mbox, pdev); } +EXPORT_SYMBOL_NS_GPL(otx2_cpt_write_af_reg, CRYPTO_DEV_OCTEONTX2_CPT); int otx2_cpt_attach_rscrs_msg(struct otx2_cptlfs_info *lfs) { @@ -170,6 +177,7 @@ int otx2_cpt_detach_rsrcs_msg(struct otx2_cptlfs_info *lfs) return ret; } +EXPORT_SYMBOL_NS_GPL(otx2_cpt_detach_rsrcs_msg, CRYPTO_DEV_OCTEONTX2_CPT); int otx2_cpt_msix_offset_msg(struct otx2_cptlfs_info *lfs) { @@ -202,6 +210,7 @@ int otx2_cpt_msix_offset_msg(struct otx2_cptlfs_info *lfs) } return ret; } +EXPORT_SYMBOL_NS_GPL(otx2_cpt_msix_offset_msg, CRYPTO_DEV_OCTEONTX2_CPT); int otx2_cpt_sync_mbox_msg(struct otx2_mbox *mbox) { @@ -216,3 +225,4 @@ int otx2_cpt_sync_mbox_msg(struct otx2_mbox *mbox) return otx2_mbox_check_rsp_msgs(mbox, 0); } +EXPORT_SYMBOL_NS_GPL(otx2_cpt_sync_mbox_msg, CRYPTO_DEV_OCTEONTX2_CPT); diff --git a/drivers/crypto/marvell/octeontx2/otx2_cptlf.c b/drivers/crypto/marvell/octeontx2/otx2_cptlf.c index c8350fcd60fa..71e5f79431af 100644 --- a/drivers/crypto/marvell/octeontx2/otx2_cptlf.c +++ b/drivers/crypto/marvell/octeontx2/otx2_cptlf.c @@ -274,6 +274,8 @@ void otx2_cptlf_unregister_interrupts(struct otx2_cptlfs_info *lfs) } cptlf_disable_intrs(lfs); } +EXPORT_SYMBOL_NS_GPL(otx2_cptlf_unregister_interrupts, + CRYPTO_DEV_OCTEONTX2_CPT); static int cptlf_do_register_interrrupts(struct otx2_cptlfs_info *lfs, int lf_num, int irq_offset, @@ -321,6 +323,7 @@ free_irq: otx2_cptlf_unregister_interrupts(lfs); return ret; } +EXPORT_SYMBOL_NS_GPL(otx2_cptlf_register_interrupts, CRYPTO_DEV_OCTEONTX2_CPT); void otx2_cptlf_free_irqs_affinity(struct otx2_cptlfs_info *lfs) { @@ -334,6 +337,7 @@ void otx2_cptlf_free_irqs_affinity(struct otx2_cptlfs_info *lfs) free_cpumask_var(lfs->lf[slot].affinity_mask); } } +EXPORT_SYMBOL_NS_GPL(otx2_cptlf_free_irqs_affinity, CRYPTO_DEV_OCTEONTX2_CPT); int otx2_cptlf_set_irqs_affinity(struct otx2_cptlfs_info *lfs) { @@ -366,6 +370,7 @@ free_affinity_mask: otx2_cptlf_free_irqs_affinity(lfs); return ret; } +EXPORT_SYMBOL_NS_GPL(otx2_cptlf_set_irqs_affinity, CRYPTO_DEV_OCTEONTX2_CPT); int otx2_cptlf_init(struct otx2_cptlfs_info *lfs, u8 eng_grp_mask, int pri, int lfs_num) @@ -422,6 +427,7 @@ clear_lfs_num: lfs->lfs_num = 0; return ret; } +EXPORT_SYMBOL_NS_GPL(otx2_cptlf_init, CRYPTO_DEV_OCTEONTX2_CPT); void otx2_cptlf_shutdown(struct otx2_cptlfs_info *lfs) { @@ -431,3 +437,8 @@ void otx2_cptlf_shutdown(struct otx2_cptlfs_info *lfs) /* Send request to detach LFs */ otx2_cpt_detach_rsrcs_msg(lfs); } +EXPORT_SYMBOL_NS_GPL(otx2_cptlf_shutdown, CRYPTO_DEV_OCTEONTX2_CPT); + +MODULE_AUTHOR("Marvell"); +MODULE_DESCRIPTION("Marvell RVU CPT Common module"); +MODULE_LICENSE("GPL"); diff --git a/drivers/crypto/marvell/octeontx2/otx2_cptpf_main.c b/drivers/crypto/marvell/octeontx2/otx2_cptpf_main.c index a402ccfac557..ddf6e913c1c4 100644 --- a/drivers/crypto/marvell/octeontx2/otx2_cptpf_main.c +++ b/drivers/crypto/marvell/octeontx2/otx2_cptpf_main.c @@ -831,6 +831,8 @@ static struct pci_driver otx2_cpt_pci_driver = { module_pci_driver(otx2_cpt_pci_driver); +MODULE_IMPORT_NS(CRYPTO_DEV_OCTEONTX2_CPT); + MODULE_AUTHOR("Marvell"); MODULE_DESCRIPTION(OTX2_CPT_DRV_STRING); MODULE_LICENSE("GPL v2"); diff --git a/drivers/crypto/marvell/octeontx2/otx2_cptvf_main.c b/drivers/crypto/marvell/octeontx2/otx2_cptvf_main.c index 3411e664cf50..392e9fee05e8 100644 --- a/drivers/crypto/marvell/octeontx2/otx2_cptvf_main.c +++ b/drivers/crypto/marvell/octeontx2/otx2_cptvf_main.c @@ -429,6 +429,8 @@ static struct pci_driver otx2_cptvf_pci_driver = { module_pci_driver(otx2_cptvf_pci_driver); +MODULE_IMPORT_NS(CRYPTO_DEV_OCTEONTX2_CPT); + MODULE_AUTHOR("Marvell"); MODULE_DESCRIPTION("Marvell RVU CPT Virtual Function Driver"); MODULE_LICENSE("GPL v2"); From bcdda4301bdc4955d45f7e1ffefb6207967b067e Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 6 Feb 2023 14:01:53 +0800 Subject: [PATCH 148/156] crypto: crypto4xx - Call dma_unmap_page when done In crypto4xx_cipher_done, we should be unmapping the dst page, not mapping it. This was flagged by a sparse warning about the unused addr variable. While we're at it, also fix a sparse warning regarding the unused ctx variable in crypto4xx_ahash_done (by actually using it). Fixes: 049359d65527 ("crypto: amcc - Add crypt4xx driver") Signed-off-by: Herbert Xu Tested-by: Christian Lamparter Signed-off-by: Herbert Xu --- drivers/crypto/amcc/crypto4xx_core.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/crypto/amcc/crypto4xx_core.c b/drivers/crypto/amcc/crypto4xx_core.c index 280f4b0e7133..50dc783821b6 100644 --- a/drivers/crypto/amcc/crypto4xx_core.c +++ b/drivers/crypto/amcc/crypto4xx_core.c @@ -522,7 +522,6 @@ static void crypto4xx_cipher_done(struct crypto4xx_device *dev, { struct skcipher_request *req; struct scatterlist *dst; - dma_addr_t addr; req = skcipher_request_cast(pd_uinfo->async_req); @@ -531,8 +530,8 @@ static void crypto4xx_cipher_done(struct crypto4xx_device *dev, req->cryptlen, req->dst); } else { dst = pd_uinfo->dest_va; - addr = dma_map_page(dev->core_dev->device, sg_page(dst), - dst->offset, dst->length, DMA_FROM_DEVICE); + dma_unmap_page(dev->core_dev->device, pd->dest, dst->length, + DMA_FROM_DEVICE); } if (pd_uinfo->sa_va->sa_command_0.bf.save_iv == SA_SAVE_IV) { @@ -557,10 +556,9 @@ static void crypto4xx_ahash_done(struct crypto4xx_device *dev, struct ahash_request *ahash_req; ahash_req = ahash_request_cast(pd_uinfo->async_req); - ctx = crypto_tfm_ctx(ahash_req->base.tfm); + ctx = crypto_ahash_ctx(crypto_ahash_reqtfm(ahash_req)); - crypto4xx_copy_digest_to_dst(ahash_req->result, pd_uinfo, - crypto_tfm_ctx(ahash_req->base.tfm)); + crypto4xx_copy_digest_to_dst(ahash_req->result, pd_uinfo, ctx); crypto4xx_ret_sg_desc(dev, pd_uinfo); if (pd_uinfo->state & PD_ENTRY_BUSY) From c43cc8823d56b1d34744581e42d6eeaca028f1ab Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 9 Feb 2023 09:16:45 +0800 Subject: [PATCH 149/156] crypto: proc - Print fips status As FIPS may disable algorithms it is useful to show their status in /proc/crypto. Signed-off-by: Herbert Xu Acked-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- crypto/proc.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/crypto/proc.c b/crypto/proc.c index 12fccb9c5205..56c7c78df297 100644 --- a/crypto/proc.c +++ b/crypto/proc.c @@ -11,6 +11,7 @@ #include #include #include +#include #include /* for module_name() */ #include #include @@ -48,6 +49,11 @@ static int c_show(struct seq_file *m, void *p) seq_printf(m, "internal : %s\n", (alg->cra_flags & CRYPTO_ALG_INTERNAL) ? "yes" : "no"); + if (fips_enabled) { + seq_printf(m, "fips : %s\n", + (alg->cra_flags & CRYPTO_ALG_FIPS_INTERNAL) ? + "no" : "yes"); + } if (alg->cra_flags & CRYPTO_ALG_LARVAL) { seq_printf(m, "type : larval\n"); From a292f2534fb2caec7f3ad32045338f8feac28112 Mon Sep 17 00:00:00 2001 From: Weili Qian Date: Thu, 9 Feb 2023 20:36:13 +0800 Subject: [PATCH 150/156] crypto: hisilicon/qm - remove some unused defines 1. Remove some macros define since it is not used. 2. Remove enum QM_HW_UNKNOWN since it is not used. 3. Remove unused member 'is_frozen' in 'hisi_qm' structure. Signed-off-by: Weili Qian Signed-off-by: Herbert Xu --- drivers/crypto/hisilicon/qm.c | 10 ---------- include/linux/hisi_acc_qm.h | 2 -- 2 files changed, 12 deletions(-) diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c index 007ac7a69ce7..547626718484 100644 --- a/drivers/crypto/hisilicon/qm.c +++ b/drivers/crypto/hisilicon/qm.c @@ -95,8 +95,6 @@ #define QM_VFT_CFG_RDY 0x10006c #define QM_VFT_CFG_OP_WR 0x100058 #define QM_VFT_CFG_TYPE 0x10005c -#define QM_SQC_VFT 0x0 -#define QM_CQC_VFT 0x1 #define QM_VFT_CFG 0x100060 #define QM_VFT_CFG_OP_ENABLE 0x100054 #define QM_PM_CTRL 0x100148 @@ -164,7 +162,6 @@ /* interfunction communication */ #define QM_IFC_READY_STATUS 0x100128 -#define QM_IFC_C_STS_M 0x10012C #define QM_IFC_INT_SET_P 0x100130 #define QM_IFC_INT_CFG 0x100134 #define QM_IFC_INT_SOURCE_P 0x100138 @@ -198,7 +195,6 @@ #define PCI_BAR_2 2 #define PCI_BAR_4 4 -#define QM_SQE_DATA_ALIGN_MASK GENMASK(6, 0) #define QMC_ALIGN(sz) ALIGN(sz, 32) #define QM_DBG_READ_LEN 256 @@ -212,8 +208,6 @@ #define QM_DRIVER_REMOVING 0 #define QM_RST_SCHED 1 #define QM_QOS_PARAM_NUM 2 -#define QM_QOS_VAL_NUM 1 -#define QM_QOS_BDF_PARAM_NUM 4 #define QM_QOS_MAX_VAL 1000 #define QM_QOS_RATE 100 #define QM_QOS_EXPAND_RATE 1000 @@ -225,18 +219,14 @@ #define QM_SHAPER_FACTOR_CBS_B_SHIFT 15 #define QM_SHAPER_FACTOR_CBS_S_SHIFT 19 #define QM_SHAPER_CBS_B 1 -#define QM_SHAPER_CBS_S 16 #define QM_SHAPER_VFT_OFFSET 6 -#define WAIT_FOR_QOS_VF 100 #define QM_QOS_MIN_ERROR_RATE 5 -#define QM_QOS_TYPICAL_NUM 8 #define QM_SHAPER_MIN_CBS_S 8 #define QM_QOS_TICK 0x300U #define QM_QOS_DIVISOR_CLK 0x1f40U #define QM_QOS_MAX_CIR_B 200 #define QM_QOS_MIN_CIR_B 100 #define QM_QOS_MAX_CIR_U 6 -#define QM_QOS_MAX_CIR_S 11 #define QM_AUTOSUSPEND_DELAY 3000 #define QM_MK_CQC_DW3_V1(hop_num, pg_sz, buf_sz, cqe_sz) \ diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h index be3aedaa96dc..df5be7e6db04 100644 --- a/include/linux/hisi_acc_qm.h +++ b/include/linux/hisi_acc_qm.h @@ -122,7 +122,6 @@ enum qp_state { }; enum qm_hw_ver { - QM_HW_UNKNOWN = -1, QM_HW_V1 = 0x20, QM_HW_V2 = 0x21, QM_HW_V3 = 0x30, @@ -332,7 +331,6 @@ struct hisi_qm { const char *algs; bool use_sva; - bool is_frozen; resource_size_t phys_base; resource_size_t db_phys_base; From f8de067cbe05a43a3de5d8873cd63c17abb1c7c8 Mon Sep 17 00:00:00 2001 From: Weili Qian Date: Thu, 9 Feb 2023 20:36:14 +0800 Subject: [PATCH 151/156] crypto: hisilicon/qm - use min() instead of min_t() 'act_q_num = min_t(int, act_q_num, max_qp_num)', the type of 'act_q_num' and 'max_qp_num' are both 'u32', so use min() instead of min_t(). Signed-off-by: Weili Qian Signed-off-by: Herbert Xu --- drivers/crypto/hisilicon/qm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c index 547626718484..93b2f07b567d 100644 --- a/drivers/crypto/hisilicon/qm.c +++ b/drivers/crypto/hisilicon/qm.c @@ -3361,7 +3361,7 @@ static int qm_vf_q_assign(struct hisi_qm *qm, u32 num_vfs) act_q_num = q_num; } - act_q_num = min_t(int, act_q_num, max_qp_num); + act_q_num = min(act_q_num, max_qp_num); ret = hisi_qm_set_vft(qm, i, q_base, act_q_num); if (ret) { for (j = num_vfs; j > i; j--) From ac80056f2e7bd80184312ccd91a40532eb95b811 Mon Sep 17 00:00:00 2001 From: Weili Qian Date: Thu, 9 Feb 2023 20:36:15 +0800 Subject: [PATCH 152/156] crypto: hisilicon/qm - change function names The accelerator devices support multiple interrupts. To better reflect purpose of each interrupt function, change function name 'qm_irq' to 'qm_eq_irq' and 'do_qm_irq' to 'do_qm_eq_irq'. Signed-off-by: Weili Qian Signed-off-by: Herbert Xu --- drivers/crypto/hisilicon/qm.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c index 93b2f07b567d..5c989f135b4d 100644 --- a/drivers/crypto/hisilicon/qm.c +++ b/drivers/crypto/hisilicon/qm.c @@ -895,7 +895,7 @@ static void qm_work_process(struct work_struct *work) } } -static bool do_qm_irq(struct hisi_qm *qm) +static bool do_qm_eq_irq(struct hisi_qm *qm) { struct qm_eqe *eqe = qm->eqe + qm->status.eq_head; struct hisi_qm_poll_data *poll_data; @@ -915,12 +915,12 @@ static bool do_qm_irq(struct hisi_qm *qm) return false; } -static irqreturn_t qm_irq(int irq, void *data) +static irqreturn_t qm_eq_irq(int irq, void *data) { struct hisi_qm *qm = data; bool ret; - ret = do_qm_irq(qm); + ret = do_qm_eq_irq(qm); if (ret) return IRQ_HANDLED; @@ -4893,7 +4893,7 @@ static int qm_register_eq_irq(struct hisi_qm *qm) return 0; irq_vector = val & QM_IRQ_VECTOR_MASK; - ret = request_irq(pci_irq_vector(pdev, irq_vector), qm_irq, 0, qm->dev_name, qm); + ret = request_irq(pci_irq_vector(pdev, irq_vector), qm_eq_irq, 0, qm->dev_name, qm); if (ret) dev_err(&pdev->dev, "failed to request eq irq, ret = %d", ret); From 9b4eb8f8b8eccc313663a99904dcfe2660c41b6c Mon Sep 17 00:00:00 2001 From: Weili Qian Date: Thu, 9 Feb 2023 20:36:16 +0800 Subject: [PATCH 153/156] crypto: hisilicon/qm - update comments to match function The return values of some functions have been modified, but the comments have not been modified together. The comments must be updated to be consistent with the functions. Also move comments over the codes instead of right place to ensure consistent coding styles. Signed-off-by: Weili Qian Signed-off-by: Herbert Xu --- drivers/crypto/hisilicon/qm.c | 5 ++--- include/linux/hisi_acc_qm.h | 3 ++- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c index 5c989f135b4d..2f888e5e2bfa 100644 --- a/drivers/crypto/hisilicon/qm.c +++ b/drivers/crypto/hisilicon/qm.c @@ -1882,8 +1882,7 @@ static struct hisi_qp *qm_create_qp_nolock(struct hisi_qm *qm, u8 alg_type) * @qm: The qm we create a qp from. * @alg_type: Accelerator specific algorithm type in sqc. * - * return created qp, -EBUSY if all qps in qm allocated, -ENOMEM if allocating - * qp memory fails. + * Return created qp, negative error code if failed. */ static struct hisi_qp *hisi_qm_create_qp(struct hisi_qm *qm, u8 alg_type) { @@ -2052,7 +2051,7 @@ static int qm_start_qp_nolock(struct hisi_qp *qp, unsigned long arg) * @arg: Accelerator specific argument. * * After this function, qp can receive request from user. Return 0 if - * successful, Return -EBUSY if failed. + * successful, negative error code if failed. */ int hisi_qm_start_qp(struct hisi_qp *qp, unsigned long arg) { diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h index df5be7e6db04..a59fbffa238f 100644 --- a/include/linux/hisi_acc_qm.h +++ b/include/linux/hisi_acc_qm.h @@ -308,7 +308,8 @@ struct hisi_qm { const struct hisi_qm_err_ini *err_ini; struct hisi_qm_err_info err_info; struct hisi_qm_err_status err_status; - unsigned long misc_ctl; /* driver removing and reset sched */ + /* driver removing and reset sched */ + unsigned long misc_ctl; /* Device capability bit */ unsigned long caps; From ced18fd1794787d57acff1a4d1b2816d5ec99fbc Mon Sep 17 00:00:00 2001 From: Weili Qian Date: Thu, 9 Feb 2023 20:36:17 +0800 Subject: [PATCH 154/156] crypto: hisilicon/qm - fix coding style issues 1. Remove extra blank lines. 2. Remove extra spaces. 3. Use spaces instead of tabs around '=' and '\', to ensure consistent coding styles. 4. Macros should be capital letters, change 'QM_SQC_VFT_NUM_MASK_v2' to 'QM_SQC_VFT_NUM_MASK_V2'. Signed-off-by: Weili Qian Signed-off-by: Herbert Xu --- drivers/crypto/hisilicon/qm.c | 29 ++++++++++++----------------- drivers/crypto/hisilicon/sgl.c | 1 - 2 files changed, 12 insertions(+), 18 deletions(-) diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c index 2f888e5e2bfa..59823ad1d9ae 100644 --- a/drivers/crypto/hisilicon/qm.c +++ b/drivers/crypto/hisilicon/qm.c @@ -116,7 +116,7 @@ #define QM_SQC_VFT_BASE_SHIFT_V2 28 #define QM_SQC_VFT_BASE_MASK_V2 GENMASK(15, 0) #define QM_SQC_VFT_NUM_SHIFT_V2 45 -#define QM_SQC_VFT_NUM_MASK_v2 GENMASK(9, 0) +#define QM_SQC_VFT_NUM_MASK_V2 GENMASK(9, 0) #define QM_ABNORMAL_INT_SOURCE 0x100000 #define QM_ABNORMAL_INT_MASK 0x100004 @@ -230,23 +230,23 @@ #define QM_AUTOSUSPEND_DELAY 3000 #define QM_MK_CQC_DW3_V1(hop_num, pg_sz, buf_sz, cqe_sz) \ - (((hop_num) << QM_CQ_HOP_NUM_SHIFT) | \ - ((pg_sz) << QM_CQ_PAGE_SIZE_SHIFT) | \ - ((buf_sz) << QM_CQ_BUF_SIZE_SHIFT) | \ + (((hop_num) << QM_CQ_HOP_NUM_SHIFT) | \ + ((pg_sz) << QM_CQ_PAGE_SIZE_SHIFT) | \ + ((buf_sz) << QM_CQ_BUF_SIZE_SHIFT) | \ ((cqe_sz) << QM_CQ_CQE_SIZE_SHIFT)) #define QM_MK_CQC_DW3_V2(cqe_sz, cq_depth) \ ((((u32)cq_depth) - 1) | ((cqe_sz) << QM_CQ_CQE_SIZE_SHIFT)) #define QM_MK_SQC_W13(priority, orders, alg_type) \ - (((priority) << QM_SQ_PRIORITY_SHIFT) | \ - ((orders) << QM_SQ_ORDERS_SHIFT) | \ + (((priority) << QM_SQ_PRIORITY_SHIFT) | \ + ((orders) << QM_SQ_ORDERS_SHIFT) | \ (((alg_type) & QM_SQ_TYPE_MASK) << QM_SQ_TYPE_SHIFT)) #define QM_MK_SQC_DW3_V1(hop_num, pg_sz, buf_sz, sqe_sz) \ - (((hop_num) << QM_SQ_HOP_NUM_SHIFT) | \ - ((pg_sz) << QM_SQ_PAGE_SIZE_SHIFT) | \ - ((buf_sz) << QM_SQ_BUF_SIZE_SHIFT) | \ + (((hop_num) << QM_SQ_HOP_NUM_SHIFT) | \ + ((pg_sz) << QM_SQ_PAGE_SIZE_SHIFT) | \ + ((buf_sz) << QM_SQ_BUF_SIZE_SHIFT) | \ ((u32)ilog2(sqe_sz) << QM_SQ_SQE_SIZE_SHIFT)) #define QM_MK_SQC_DW3_V2(sqe_sz, sq_depth) \ @@ -696,7 +696,7 @@ static void qm_db_v2(struct hisi_qm *qm, u16 qn, u8 cmd, u16 index, u8 priority) doorbell = qn | ((u64)cmd << QM_DB_CMD_SHIFT_V2) | ((u64)randata << QM_DB_RAND_SHIFT_V2) | - ((u64)index << QM_DB_INDEX_SHIFT_V2) | + ((u64)index << QM_DB_INDEX_SHIFT_V2) | ((u64)priority << QM_DB_PRIORITY_SHIFT_V2); writeq(doorbell, io_base); @@ -1294,7 +1294,7 @@ static int qm_get_vft_v2(struct hisi_qm *qm, u32 *base, u32 *number) sqc_vft = readl(qm->io_base + QM_MB_CMD_DATA_ADDR_L) | ((u64)readl(qm->io_base + QM_MB_CMD_DATA_ADDR_H) << 32); *base = QM_SQC_VFT_BASE_MASK_V2 & (sqc_vft >> QM_SQC_VFT_BASE_SHIFT_V2); - *number = (QM_SQC_VFT_NUM_MASK_v2 & + *number = (QM_SQC_VFT_NUM_MASK_V2 & (sqc_vft >> QM_SQC_VFT_NUM_SHIFT_V2)) + 1; return 0; @@ -3063,7 +3063,6 @@ static int qm_stop_started_qp(struct hisi_qm *qm) return 0; } - /** * qm_clear_queues() - Clear all queues memory in a qm. * @qm: The qm in which the queues will be cleared. @@ -3547,7 +3546,7 @@ static ssize_t qm_algqos_read(struct file *filp, char __user *buf, qos_val = ir / QM_QOS_RATE; ret = scnprintf(tbuf, QM_DBG_READ_LEN, "%u\n", qos_val); - ret = simple_read_from_buffer(buf, count, pos, tbuf, ret); + ret = simple_read_from_buffer(buf, count, pos, tbuf, ret); err_get_status: clear_bit(QM_RESETTING, &qm->misc_ctl); @@ -4038,13 +4037,10 @@ static void qm_dev_ecc_mbit_handle(struct hisi_qm *qm) if (!qm->err_status.is_dev_ecc_mbit && qm->err_status.is_qm_ecc_mbit && qm->err_ini->close_axi_master_ooo) { - qm->err_ini->close_axi_master_ooo(qm); - } else if (qm->err_status.is_dev_ecc_mbit && !qm->err_status.is_qm_ecc_mbit && !qm->err_ini->close_axi_master_ooo) { - nfe_enb = readl(qm->io_base + QM_RAS_NFE_ENABLE); writel(nfe_enb & QM_RAS_NFE_MBIT_DISABLE, qm->io_base + QM_RAS_NFE_ENABLE); @@ -4488,7 +4484,6 @@ static irqreturn_t qm_abnormal_irq(int irq, void *data) return IRQ_HANDLED; } - /** * hisi_qm_dev_shutdown() - Shutdown device. * @pdev: The device will be shutdown. diff --git a/drivers/crypto/hisilicon/sgl.c b/drivers/crypto/hisilicon/sgl.c index 2b6f2281cfd6..a07257a7e75f 100644 --- a/drivers/crypto/hisilicon/sgl.c +++ b/drivers/crypto/hisilicon/sgl.c @@ -250,7 +250,6 @@ hisi_acc_sg_buf_map_to_hw_sgl(struct device *dev, dev_err(dev, "Get SGL error!\n"); dma_unmap_sg(dev, sgl, sg_n, DMA_BIDIRECTIONAL); return ERR_PTR(-ENOMEM); - } curr_hw_sgl->entry_length_in_sgl = cpu_to_le16(pool->sge_nr); curr_hw_sge = curr_hw_sgl->sge_entries; From eb33108858b64ebc0f79a582123731c2585ae63b Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 10 Feb 2023 17:40:57 +0800 Subject: [PATCH 155/156] crypto: aspeed - Fix modular aspeed-acry When aspeed-acry is enabled as a module it doesn't get built at all. Fix this by adding it to obj-m. Signed-off-by: Herbert Xu Reviewed-by: Neal Liu Signed-off-by: Herbert Xu --- drivers/crypto/aspeed/Makefile | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/crypto/aspeed/Makefile b/drivers/crypto/aspeed/Makefile index 24284d812b79..15862752c053 100644 --- a/drivers/crypto/aspeed/Makefile +++ b/drivers/crypto/aspeed/Makefile @@ -6,4 +6,6 @@ aspeed_crypto-objs := aspeed-hace.o \ $(hace-hash-y) \ $(hace-crypto-y) -obj-$(CONFIG_CRYPTO_DEV_ASPEED_ACRY) += aspeed-acry.o +aspeed_acry-$(CONFIG_CRYPTO_DEV_ASPEED_ACRY) += aspeed-acry.o + +obj-$(CONFIG_CRYPTO_DEV_ASPEED) += $(aspeed_acry-y) From 8b84475318641c2b89320859332544cf187e1cbd Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Fri, 10 Feb 2023 18:15:41 +0000 Subject: [PATCH 156/156] crypto: x86/aria-avx - Do not use avx2 instructions vpbroadcastb and vpbroadcastd are not AVX instructions. But the aria-avx assembly code contains these instructions. So, kernel panic will occur if the aria-avx works on AVX2 unsupported CPU. vbroadcastss, and vpshufb are used to avoid using vpbroadcastb in it. Unfortunately, this change reduces performance by about 5%. Also, vpbroadcastd is simply replaced by vmovdqa in it. Fixes: ba3579e6e45c ("crypto: aria-avx - add AES-NI/AVX/x86_64/GFNI assembler implementation of aria cipher") Reported-by: Herbert Xu Reported-by: Erhard F. Signed-off-by: Taehee Yoo Signed-off-by: Herbert Xu --- arch/x86/crypto/aria-aesni-avx-asm_64.S | 134 +++++++++++++++++------- 1 file changed, 94 insertions(+), 40 deletions(-) diff --git a/arch/x86/crypto/aria-aesni-avx-asm_64.S b/arch/x86/crypto/aria-aesni-avx-asm_64.S index fe0d84a7ced1..9243f6289d34 100644 --- a/arch/x86/crypto/aria-aesni-avx-asm_64.S +++ b/arch/x86/crypto/aria-aesni-avx-asm_64.S @@ -267,35 +267,44 @@ #define aria_ark_8way(x0, x1, x2, x3, \ x4, x5, x6, x7, \ - t0, rk, idx, round) \ + t0, t1, t2, rk, \ + idx, round) \ /* AddRoundKey */ \ - vpbroadcastb ((round * 16) + idx + 3)(rk), t0; \ - vpxor t0, x0, x0; \ - vpbroadcastb ((round * 16) + idx + 2)(rk), t0; \ - vpxor t0, x1, x1; \ - vpbroadcastb ((round * 16) + idx + 1)(rk), t0; \ - vpxor t0, x2, x2; \ - vpbroadcastb ((round * 16) + idx + 0)(rk), t0; \ - vpxor t0, x3, x3; \ - vpbroadcastb ((round * 16) + idx + 7)(rk), t0; \ - vpxor t0, x4, x4; \ - vpbroadcastb ((round * 16) + idx + 6)(rk), t0; \ - vpxor t0, x5, x5; \ - vpbroadcastb ((round * 16) + idx + 5)(rk), t0; \ - vpxor t0, x6, x6; \ - vpbroadcastb ((round * 16) + idx + 4)(rk), t0; \ - vpxor t0, x7, x7; + vbroadcastss ((round * 16) + idx + 0)(rk), t0; \ + vpsrld $24, t0, t2; \ + vpshufb t1, t2, t2; \ + vpxor t2, x0, x0; \ + vpsrld $16, t0, t2; \ + vpshufb t1, t2, t2; \ + vpxor t2, x1, x1; \ + vpsrld $8, t0, t2; \ + vpshufb t1, t2, t2; \ + vpxor t2, x2, x2; \ + vpshufb t1, t0, t2; \ + vpxor t2, x3, x3; \ + vbroadcastss ((round * 16) + idx + 4)(rk), t0; \ + vpsrld $24, t0, t2; \ + vpshufb t1, t2, t2; \ + vpxor t2, x4, x4; \ + vpsrld $16, t0, t2; \ + vpshufb t1, t2, t2; \ + vpxor t2, x5, x5; \ + vpsrld $8, t0, t2; \ + vpshufb t1, t2, t2; \ + vpxor t2, x6, x6; \ + vpshufb t1, t0, t2; \ + vpxor t2, x7, x7; #ifdef CONFIG_AS_GFNI #define aria_sbox_8way_gfni(x0, x1, x2, x3, \ x4, x5, x6, x7, \ t0, t1, t2, t3, \ t4, t5, t6, t7) \ - vpbroadcastq .Ltf_s2_bitmatrix, t0; \ - vpbroadcastq .Ltf_inv_bitmatrix, t1; \ - vpbroadcastq .Ltf_id_bitmatrix, t2; \ - vpbroadcastq .Ltf_aff_bitmatrix, t3; \ - vpbroadcastq .Ltf_x2_bitmatrix, t4; \ + vmovdqa .Ltf_s2_bitmatrix, t0; \ + vmovdqa .Ltf_inv_bitmatrix, t1; \ + vmovdqa .Ltf_id_bitmatrix, t2; \ + vmovdqa .Ltf_aff_bitmatrix, t3; \ + vmovdqa .Ltf_x2_bitmatrix, t4; \ vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1; \ vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5; \ vgf2p8affineqb $(tf_inv_const), t1, x2, x2; \ @@ -315,10 +324,9 @@ x4, x5, x6, x7, \ t0, t1, t2, t3, \ t4, t5, t6, t7) \ - vpxor t7, t7, t7; \ vmovdqa .Linv_shift_row, t0; \ vmovdqa .Lshift_row, t1; \ - vpbroadcastd .L0f0f0f0f, t6; \ + vbroadcastss .L0f0f0f0f, t6; \ vmovdqa .Ltf_lo__inv_aff__and__s2, t2; \ vmovdqa .Ltf_hi__inv_aff__and__s2, t3; \ vmovdqa .Ltf_lo__x2__and__fwd_aff, t4; \ @@ -413,8 +421,9 @@ y0, y1, y2, y3, \ y4, y5, y6, y7, \ mem_tmp, rk, round) \ + vpxor y7, y7, y7; \ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ - y0, rk, 8, round); \ + y0, y7, y2, rk, 8, round); \ \ aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \ y0, y1, y2, y3, y4, y5, y6, y7); \ @@ -429,7 +438,7 @@ x4, x5, x6, x7, \ mem_tmp, 0); \ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ - y0, rk, 0, round); \ + y0, y7, y2, rk, 0, round); \ \ aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \ y0, y1, y2, y3, y4, y5, y6, y7); \ @@ -467,8 +476,9 @@ y0, y1, y2, y3, \ y4, y5, y6, y7, \ mem_tmp, rk, round) \ + vpxor y7, y7, y7; \ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ - y0, rk, 8, round); \ + y0, y7, y2, rk, 8, round); \ \ aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ y0, y1, y2, y3, y4, y5, y6, y7); \ @@ -483,7 +493,7 @@ x4, x5, x6, x7, \ mem_tmp, 0); \ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ - y0, rk, 0, round); \ + y0, y7, y2, rk, 0, round); \ \ aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ y0, y1, y2, y3, y4, y5, y6, y7); \ @@ -521,14 +531,15 @@ y0, y1, y2, y3, \ y4, y5, y6, y7, \ mem_tmp, rk, round, last_round) \ + vpxor y7, y7, y7; \ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ - y0, rk, 8, round); \ + y0, y7, y2, rk, 8, round); \ \ aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \ y0, y1, y2, y3, y4, y5, y6, y7); \ \ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ - y0, rk, 8, last_round); \ + y0, y7, y2, rk, 8, last_round); \ \ aria_store_state_8way(x0, x1, x2, x3, \ x4, x5, x6, x7, \ @@ -538,13 +549,13 @@ x4, x5, x6, x7, \ mem_tmp, 0); \ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ - y0, rk, 0, round); \ + y0, y7, y2, rk, 0, round); \ \ aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \ y0, y1, y2, y3, y4, y5, y6, y7); \ \ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ - y0, rk, 0, last_round); \ + y0, y7, y2, rk, 0, last_round); \ \ aria_load_state_8way(y0, y1, y2, y3, \ y4, y5, y6, y7, \ @@ -556,8 +567,9 @@ y0, y1, y2, y3, \ y4, y5, y6, y7, \ mem_tmp, rk, round) \ + vpxor y7, y7, y7; \ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ - y0, rk, 8, round); \ + y0, y7, y2, rk, 8, round); \ \ aria_sbox_8way_gfni(x2, x3, x0, x1, \ x6, x7, x4, x5, \ @@ -574,7 +586,7 @@ x4, x5, x6, x7, \ mem_tmp, 0); \ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ - y0, rk, 0, round); \ + y0, y7, y2, rk, 0, round); \ \ aria_sbox_8way_gfni(x2, x3, x0, x1, \ x6, x7, x4, x5, \ @@ -614,8 +626,9 @@ y0, y1, y2, y3, \ y4, y5, y6, y7, \ mem_tmp, rk, round) \ + vpxor y7, y7, y7; \ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ - y0, rk, 8, round); \ + y0, y7, y2, rk, 8, round); \ \ aria_sbox_8way_gfni(x0, x1, x2, x3, \ x4, x5, x6, x7, \ @@ -632,7 +645,7 @@ x4, x5, x6, x7, \ mem_tmp, 0); \ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ - y0, rk, 0, round); \ + y0, y7, y2, rk, 0, round); \ \ aria_sbox_8way_gfni(x0, x1, x2, x3, \ x4, x5, x6, x7, \ @@ -672,8 +685,9 @@ y0, y1, y2, y3, \ y4, y5, y6, y7, \ mem_tmp, rk, round, last_round) \ + vpxor y7, y7, y7; \ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ - y0, rk, 8, round); \ + y0, y7, y2, rk, 8, round); \ \ aria_sbox_8way_gfni(x2, x3, x0, x1, \ x6, x7, x4, x5, \ @@ -681,7 +695,7 @@ y4, y5, y6, y7); \ \ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ - y0, rk, 8, last_round); \ + y0, y7, y2, rk, 8, last_round); \ \ aria_store_state_8way(x0, x1, x2, x3, \ x4, x5, x6, x7, \ @@ -691,7 +705,7 @@ x4, x5, x6, x7, \ mem_tmp, 0); \ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ - y0, rk, 0, round); \ + y0, y7, y2, rk, 0, round); \ \ aria_sbox_8way_gfni(x2, x3, x0, x1, \ x6, x7, x4, x5, \ @@ -699,7 +713,7 @@ y4, y5, y6, y7); \ \ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ - y0, rk, 0, last_round); \ + y0, y7, y2, rk, 0, last_round); \ \ aria_load_state_8way(y0, y1, y2, y3, \ y4, y5, y6, y7, \ @@ -772,6 +786,14 @@ BV8(0, 1, 1, 1, 1, 1, 0, 0), BV8(0, 0, 1, 1, 1, 1, 1, 0), BV8(0, 0, 0, 1, 1, 1, 1, 1)) + .quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1), + BV8(1, 1, 0, 0, 0, 1, 1, 1), + BV8(1, 1, 1, 0, 0, 0, 1, 1), + BV8(1, 1, 1, 1, 0, 0, 0, 1), + BV8(1, 1, 1, 1, 1, 0, 0, 0), + BV8(0, 1, 1, 1, 1, 1, 0, 0), + BV8(0, 0, 1, 1, 1, 1, 1, 0), + BV8(0, 0, 0, 1, 1, 1, 1, 1)) /* AES inverse affine: */ #define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0) @@ -784,6 +806,14 @@ BV8(0, 0, 1, 0, 1, 0, 0, 1), BV8(1, 0, 0, 1, 0, 1, 0, 0), BV8(0, 1, 0, 0, 1, 0, 1, 0)) + .quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1), + BV8(1, 0, 0, 1, 0, 0, 1, 0), + BV8(0, 1, 0, 0, 1, 0, 0, 1), + BV8(1, 0, 1, 0, 0, 1, 0, 0), + BV8(0, 1, 0, 1, 0, 0, 1, 0), + BV8(0, 0, 1, 0, 1, 0, 0, 1), + BV8(1, 0, 0, 1, 0, 1, 0, 0), + BV8(0, 1, 0, 0, 1, 0, 1, 0)) /* S2: */ #define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1) @@ -796,6 +826,14 @@ BV8(1, 1, 0, 0, 1, 1, 1, 0), BV8(0, 1, 1, 0, 0, 0, 1, 1), BV8(1, 1, 1, 1, 0, 1, 1, 0)) + .quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1), + BV8(0, 0, 1, 1, 1, 1, 1, 1), + BV8(1, 1, 1, 0, 1, 1, 0, 1), + BV8(1, 1, 0, 0, 0, 0, 1, 1), + BV8(0, 1, 0, 0, 0, 0, 1, 1), + BV8(1, 1, 0, 0, 1, 1, 1, 0), + BV8(0, 1, 1, 0, 0, 0, 1, 1), + BV8(1, 1, 1, 1, 0, 1, 1, 0)) /* X2: */ #define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0) @@ -808,6 +846,14 @@ BV8(0, 1, 1, 0, 1, 0, 1, 1), BV8(1, 0, 1, 1, 1, 1, 0, 1), BV8(1, 0, 0, 1, 0, 0, 1, 1)) + .quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0), + BV8(0, 0, 1, 0, 0, 1, 1, 0), + BV8(0, 0, 0, 0, 1, 0, 1, 0), + BV8(1, 1, 1, 0, 0, 0, 1, 1), + BV8(1, 1, 1, 0, 1, 1, 0, 0), + BV8(0, 1, 1, 0, 1, 0, 1, 1), + BV8(1, 0, 1, 1, 1, 1, 0, 1), + BV8(1, 0, 0, 1, 0, 0, 1, 1)) /* Identity matrix: */ .Ltf_id_bitmatrix: @@ -819,6 +865,14 @@ BV8(0, 0, 0, 0, 0, 1, 0, 0), BV8(0, 0, 0, 0, 0, 0, 1, 0), BV8(0, 0, 0, 0, 0, 0, 0, 1)) + .quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0), + BV8(0, 1, 0, 0, 0, 0, 0, 0), + BV8(0, 0, 1, 0, 0, 0, 0, 0), + BV8(0, 0, 0, 1, 0, 0, 0, 0), + BV8(0, 0, 0, 0, 1, 0, 0, 0), + BV8(0, 0, 0, 0, 0, 1, 0, 0), + BV8(0, 0, 0, 0, 0, 0, 1, 0), + BV8(0, 0, 0, 0, 0, 0, 0, 1)) #endif /* CONFIG_AS_GFNI */ /* 4-bit mask */