Merge branch 'linus' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6
Pull crypto update from Herbert Xu:
 "API:
   - Restrict crypto_cipher to internal API users only.

  Algorithms:
   - Add x86 aesni acceleration for cts.
   - Improve x86 aesni acceleration for xts.
   - Remove x86 acceleration of some uncommon algorithms.
   - Remove RIPE-MD, Tiger and Salsa20.
   - Remove tnepres.
   - Add ARM acceleration for BLAKE2s and BLAKE2b.

  Drivers:
   - Add Keem Bay OCS HCU driver.
   - Add Marvell OcteonTX2 CPT PF driver.
   - Remove PicoXcell driver.
   - Remove mediatek driver"

* 'linus' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6: (154 commits)
  hwrng: timeriomem - Use device-managed registration API
  crypto: hisilicon/qm - fix printing format issue
  crypto: hisilicon/qm - do not reset hardware when CE happens
  crypto: hisilicon/qm - update irqflag
  crypto: hisilicon/qm - fix the value of 'QM_SQC_VFT_BASE_MASK_V2'
  crypto: hisilicon/qm - fix request missing error
  crypto: hisilicon/qm - removing driver after reset
  crypto: octeontx2 - fix -Wpointer-bool-conversion warning
  crypto: hisilicon/hpre - enable Elliptic curve cryptography
  crypto: hisilicon - PASID fixed on Kunpeng 930
  crypto: hisilicon/qm - fix use of 'dma_map_single'
  crypto: hisilicon/hpre - tiny fix
  crypto: hisilicon/hpre - adapt the number of clusters
  crypto: cpt - remove casting dma_alloc_coherent
  crypto: keembay-ocs-aes - Fix 'q' assignment during CCM B0 generation
  crypto: xor - Fix typo of optimization
  hwrng: optee - Use device-managed registration API
  crypto: arm64/crc-t10dif - move NEON yield to C code
  crypto: arm64/aes-ce-mac - simplify NEON yield
  crypto: arm64/aes-neonbs - remove NEON yield calls
  ...
.mailmap | 1
@@ -174,7 +174,6 @@ Juha Yrjola <at solidboot.com>
Juha Yrjola <juha.yrjola@nokia.com>
Juha Yrjola <juha.yrjola@solidboot.com>
Julien Thierry <julien.thierry.kdev@gmail.com> <julien.thierry@arm.com>
Kamil Konieczny <k.konieczny@samsung.com> <k.konieczny@partner.samsung.com>
Kay Sievers <kay.sievers@vrfy.org>
Kees Cook <keescook@chromium.org> <kees.cook@canonical.com>
Kees Cook <keescook@chromium.org> <keescook@google.com>
@@ -143,8 +143,8 @@ recalculate
journal_crypt:algorithm(:key) (the key is optional)
        Encrypt the journal using given algorithm to make sure that the
        attacker can't read the journal. You can use a block cipher here
        (such as "cbc(aes)") or a stream cipher (for example "chacha20",
        "salsa20" or "ctr(aes)").
        (such as "cbc(aes)") or a stream cipher (for example "chacha20"
        or "ctr(aes)").

        The journal contains history of last writes to the block device,
        an attacker reading the journal could see the last sector numbers
@@ -28,8 +28,8 @@ Symmetric Key Cipher Request Handle
Single Block Cipher API
-----------------------

.. kernel-doc:: include/linux/crypto.h
.. kernel-doc:: include/crypto/internal/cipher.h
   :doc: Single Block Cipher API

.. kernel-doc:: include/linux/crypto.h
.. kernel-doc:: include/crypto/internal/cipher.h
   :functions: crypto_alloc_cipher crypto_free_cipher crypto_has_cipher crypto_cipher_blocksize crypto_cipher_setkey crypto_cipher_encrypt_one crypto_cipher_decrypt_one
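With crypto_cipher now restricted to internal API users, the kernel-doc above moves to include/crypto/internal/cipher.h. A driver that legitimately still needs the single-block interface would use it roughly as follows (a minimal sketch; demo_one_block() is a hypothetical helper and error paths are trimmed):

#include <crypto/internal/cipher.h>
#include <linux/err.h>
#include <linux/module.h>

MODULE_IMPORT_NS(CRYPTO_INTERNAL); /* required now that crypto_cipher is internal-only */

static int demo_one_block(const u8 *key, unsigned int keylen,
                          const u8 *in, u8 *out)
{
        struct crypto_cipher *tfm;
        int err;

        tfm = crypto_alloc_cipher("aes", 0, 0);
        if (IS_ERR(tfm))
                return PTR_ERR(tfm);

        err = crypto_cipher_setkey(tfm, key, keylen);
        if (!err)
                /* in and out are exactly crypto_cipher_blocksize(tfm) bytes */
                crypto_cipher_encrypt_one(tfm, out, in);

        crypto_free_cipher(tfm);
        return err;
}

The MODULE_IMPORT_NS(CRYPTO_INTERNAL) line is the same one the aes-neonbs glue gains later in this diff.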
@@ -0,0 +1,46 @@
# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
%YAML 1.2
---
$id: http://devicetree.org/schemas/crypto/intel,keembay-ocs-hcu.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#

title: Intel Keem Bay OCS HCU Device Tree Bindings

maintainers:
  - Declan Murphy <declan.murphy@intel.com>
  - Daniele Alessandrelli <daniele.alessandrelli@intel.com>

description:
  The Intel Keem Bay Offload and Crypto Subsystem (OCS) Hash Control Unit (HCU)
  provides hardware-accelerated hashing and HMAC.

properties:
  compatible:
    const: intel,keembay-ocs-hcu

  reg:
    maxItems: 1

  interrupts:
    maxItems: 1

  clocks:
    maxItems: 1

required:
  - compatible
  - reg
  - interrupts
  - clocks

additionalProperties: false

examples:
  - |
    #include <dt-bindings/interrupt-controller/arm-gic.h>
    crypto@3000b000 {
      compatible = "intel,keembay-ocs-hcu";
      reg = <0x3000b000 0x1000>;
      interrupts = <GIC_SPI 121 IRQ_TYPE_LEVEL_HIGH>;
      clocks = <&scmi_clk 94>;
    };
@@ -8,7 +8,6 @@ title: Samsung Exynos SoC SlimSSS (Slim Security SubSystem) module

maintainers:
  - Krzysztof Kozlowski <krzk@kernel.org>
  - Kamil Konieczny <k.konieczny@partner.samsung.com>

description: |+
  The SlimSSS module in Exynos5433 SoC supports the following:

@@ -8,7 +8,6 @@ title: Samsung Exynos SoC SSS (Security SubSystem) module

maintainers:
  - Krzysztof Kozlowski <krzk@kernel.org>
  - Kamil Konieczny <k.konieczny@partner.samsung.com>

description: |+
  The SSS module in S5PV210 SoC supports the following:
MAINTAINERS | 12
@@ -9032,6 +9032,17 @@ F: drivers/crypto/keembay/keembay-ocs-aes-core.c
F: drivers/crypto/keembay/ocs-aes.c
F: drivers/crypto/keembay/ocs-aes.h

INTEL KEEM BAY OCS HCU CRYPTO DRIVER
M: Daniele Alessandrelli <daniele.alessandrelli@intel.com>
M: Declan Murphy <declan.murphy@intel.com>
S: Maintained
F: Documentation/devicetree/bindings/crypto/intel,keembay-ocs-hcu.yaml
F: drivers/crypto/keembay/Kconfig
F: drivers/crypto/keembay/Makefile
F: drivers/crypto/keembay/keembay-ocs-hcu-core.c
F: drivers/crypto/keembay/ocs-hcu.c
F: drivers/crypto/keembay/ocs-hcu.h

INTEL MANAGEMENT ENGINE (mei)
M: Tomas Winkler <tomas.winkler@intel.com>
L: linux-kernel@vger.kernel.org
@@ -15683,7 +15694,6 @@ F: drivers/media/i2c/s5k5baf.c
SAMSUNG S5P Security SubSystem (SSS) DRIVER
M: Krzysztof Kozlowski <krzk@kernel.org>
M: Vladimir Zapolskiy <vz@mleia.com>
M: Kamil Konieczny <k.konieczny@samsung.com>
L: linux-crypto@vger.kernel.org
L: linux-samsung-soc@vger.kernel.org
S: Maintained
@@ -62,6 +62,25 @@ config CRYPTO_SHA512_ARM
          SHA-512 secure hash standard (DFIPS 180-2) implemented
          using optimized ARM assembler and NEON, when available.

config CRYPTO_BLAKE2S_ARM
        tristate "BLAKE2s digest algorithm (ARM)"
        select CRYPTO_ARCH_HAVE_LIB_BLAKE2S
        help
          BLAKE2s digest algorithm optimized with ARM scalar instructions. This
          is faster than the generic implementations of BLAKE2s and BLAKE2b, but
          slower than the NEON implementation of BLAKE2b. (There is no NEON
          implementation of BLAKE2s, since NEON doesn't really help with it.)

config CRYPTO_BLAKE2B_NEON
        tristate "BLAKE2b digest algorithm (ARM NEON)"
        depends on KERNEL_MODE_NEON
        select CRYPTO_BLAKE2B
        help
          BLAKE2b digest algorithm optimized with ARM NEON instructions.
          On ARM processors that have NEON support but not the ARMv8
          Crypto Extensions, typically this BLAKE2b implementation is
          much faster than SHA-2 and slightly faster than SHA-1.

config CRYPTO_AES_ARM
        tristate "Scalar AES cipher for ARM"
        select CRYPTO_ALGAPI
@@ -9,6 +9,8 @@ obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o
obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o
obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o
obj-$(CONFIG_CRYPTO_SHA512_ARM) += sha512-arm.o
obj-$(CONFIG_CRYPTO_BLAKE2S_ARM) += blake2s-arm.o
obj-$(CONFIG_CRYPTO_BLAKE2B_NEON) += blake2b-neon.o
obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o
obj-$(CONFIG_CRYPTO_POLY1305_ARM) += poly1305-arm.o
obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o
@@ -29,6 +31,8 @@ sha256-arm-neon-$(CONFIG_KERNEL_MODE_NEON) := sha256_neon_glue.o
sha256-arm-y := sha256-core.o sha256_glue.o $(sha256-arm-neon-y)
sha512-arm-neon-$(CONFIG_KERNEL_MODE_NEON) := sha512-neon-glue.o
sha512-arm-y := sha512-core.o sha512-glue.o $(sha512-arm-neon-y)
blake2s-arm-y := blake2s-core.o blake2s-glue.o
blake2b-neon-y := blake2b-neon-core.o blake2b-neon-glue.o
sha1-arm-ce-y := sha1-ce-core.o sha1-ce-glue.o
sha2-arm-ce-y := sha2-ce-core.o sha2-ce-glue.o
aes-arm-ce-y := aes-ce-core.o aes-ce-glue.o
@@ -9,6 +9,7 @@
#include <asm/simd.h>
#include <crypto/aes.h>
#include <crypto/ctr.h>
#include <crypto/internal/cipher.h>
#include <crypto/internal/simd.h>
#include <crypto/internal/skcipher.h>
#include <crypto/scatterwalk.h>
@@ -23,6 +24,8 @@ MODULE_ALIAS_CRYPTO("cbc(aes)-all");
MODULE_ALIAS_CRYPTO("ctr(aes)");
MODULE_ALIAS_CRYPTO("xts(aes)");

MODULE_IMPORT_NS(CRYPTO_INTERNAL);

asmlinkage void aesbs_convert_key(u8 out[], u32 const rk[], int rounds);

asmlinkage void aesbs_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[],
arch/arm/crypto/blake2b-neon-core.S | 347 (new file)
@@ -0,0 +1,347 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * BLAKE2b digest algorithm, NEON accelerated
 *
 * Copyright 2020 Google LLC
 *
 * Author: Eric Biggers <ebiggers@google.com>
 */

#include <linux/linkage.h>

        .text
        .fpu neon

        // The arguments to blake2b_compress_neon()
        STATE .req r0
        BLOCK .req r1
        NBLOCKS .req r2
        INC .req r3

        // Pointers to the rotation tables
        ROR24_TABLE .req r4
        ROR16_TABLE .req r5

        // The original stack pointer
        ORIG_SP .req r6

        // NEON registers which contain the message words of the current block.
        // M_0-M_3 are occasionally used for other purposes too.
        M_0 .req d16
        M_1 .req d17
        M_2 .req d18
        M_3 .req d19
        M_4 .req d20
        M_5 .req d21
        M_6 .req d22
        M_7 .req d23
        M_8 .req d24
        M_9 .req d25
        M_10 .req d26
        M_11 .req d27
        M_12 .req d28
        M_13 .req d29
        M_14 .req d30
        M_15 .req d31

        .align 4
        // Tables for computing ror64(x, 24) and ror64(x, 16) using the vtbl.8
        // instruction. This is the most efficient way to implement these
        // rotation amounts with NEON. (On Cortex-A53 it's the same speed as
        // vshr.u64 + vsli.u64, while on Cortex-A7 it's faster.)
.Lror24_table:
        .byte 3, 4, 5, 6, 7, 0, 1, 2
.Lror16_table:
        .byte 2, 3, 4, 5, 6, 7, 0, 1
        // The BLAKE2b initialization vector
.Lblake2b_IV:
        .quad 0x6a09e667f3bcc908, 0xbb67ae8584caa73b
        .quad 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1
        .quad 0x510e527fade682d1, 0x9b05688c2b3e6c1f
        .quad 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179

// Execute one round of BLAKE2b by updating the state matrix v[0..15] in the
// NEON registers q0-q7. The message block is in q8..q15 (M_0-M_15). The stack
// pointer points to a 32-byte aligned buffer containing a copy of q8 and q9
// (M_0-M_3), so that they can be reloaded if they are used as temporary
// registers. The macro arguments s0-s15 give the order in which the message
// words are used in this round. 'final' is 1 if this is the final round.
.macro _blake2b_round s0, s1, s2, s3, s4, s5, s6, s7, \
                      s8, s9, s10, s11, s12, s13, s14, s15, final=0

        // Mix the columns:
        // (v[0], v[4], v[8], v[12]), (v[1], v[5], v[9], v[13]),
        // (v[2], v[6], v[10], v[14]), and (v[3], v[7], v[11], v[15]).

        // a += b + m[blake2b_sigma[r][2*i + 0]];
        vadd.u64 q0, q0, q2
        vadd.u64 q1, q1, q3
        vadd.u64 d0, d0, M_\s0
        vadd.u64 d1, d1, M_\s2
        vadd.u64 d2, d2, M_\s4
        vadd.u64 d3, d3, M_\s6

        // d = ror64(d ^ a, 32);
        veor q6, q6, q0
        veor q7, q7, q1
        vrev64.32 q6, q6
        vrev64.32 q7, q7

        // c += d;
        vadd.u64 q4, q4, q6
        vadd.u64 q5, q5, q7

        // b = ror64(b ^ c, 24);
        vld1.8 {M_0}, [ROR24_TABLE, :64]
        veor q2, q2, q4
        veor q3, q3, q5
        vtbl.8 d4, {d4}, M_0
        vtbl.8 d5, {d5}, M_0
        vtbl.8 d6, {d6}, M_0
        vtbl.8 d7, {d7}, M_0

        // a += b + m[blake2b_sigma[r][2*i + 1]];
        //
        // M_0 got clobbered above, so we have to reload it if any of the four
        // message words this step needs happens to be M_0. Otherwise we don't
        // need to reload it here, as it will just get clobbered again below.
.if \s1 == 0 || \s3 == 0 || \s5 == 0 || \s7 == 0
        vld1.8 {M_0}, [sp, :64]
.endif
        vadd.u64 q0, q0, q2
        vadd.u64 q1, q1, q3
        vadd.u64 d0, d0, M_\s1
        vadd.u64 d1, d1, M_\s3
        vadd.u64 d2, d2, M_\s5
        vadd.u64 d3, d3, M_\s7

        // d = ror64(d ^ a, 16);
        vld1.8 {M_0}, [ROR16_TABLE, :64]
        veor q6, q6, q0
        veor q7, q7, q1
        vtbl.8 d12, {d12}, M_0
        vtbl.8 d13, {d13}, M_0
        vtbl.8 d14, {d14}, M_0
        vtbl.8 d15, {d15}, M_0

        // c += d;
        vadd.u64 q4, q4, q6
        vadd.u64 q5, q5, q7

        // b = ror64(b ^ c, 63);
        //
        // This rotation amount isn't a multiple of 8, so it has to be
        // implemented using a pair of shifts, which requires temporary
        // registers. Use q8-q9 (M_0-M_3) for this, and reload them afterwards.
        veor q8, q2, q4
        veor q9, q3, q5
        vshr.u64 q2, q8, #63
        vshr.u64 q3, q9, #63
        vsli.u64 q2, q8, #1
        vsli.u64 q3, q9, #1
        vld1.8 {q8-q9}, [sp, :256]

        // Mix the diagonals:
        // (v[0], v[5], v[10], v[15]), (v[1], v[6], v[11], v[12]),
        // (v[2], v[7], v[8], v[13]), and (v[3], v[4], v[9], v[14]).
        //
        // There are two possible ways to do this: use 'vext' instructions to
        // shift the rows of the matrix so that the diagonals become columns,
        // and undo it afterwards; or just use 64-bit operations on 'd'
        // registers instead of 128-bit operations on 'q' registers. We use the
        // latter approach, as it performs much better on Cortex-A7.

        // a += b + m[blake2b_sigma[r][2*i + 0]];
        vadd.u64 d0, d0, d5
        vadd.u64 d1, d1, d6
        vadd.u64 d2, d2, d7
        vadd.u64 d3, d3, d4
        vadd.u64 d0, d0, M_\s8
        vadd.u64 d1, d1, M_\s10
        vadd.u64 d2, d2, M_\s12
        vadd.u64 d3, d3, M_\s14

        // d = ror64(d ^ a, 32);
        veor d15, d15, d0
        veor d12, d12, d1
        veor d13, d13, d2
        veor d14, d14, d3
        vrev64.32 d15, d15
        vrev64.32 d12, d12
        vrev64.32 d13, d13
        vrev64.32 d14, d14

        // c += d;
        vadd.u64 d10, d10, d15
        vadd.u64 d11, d11, d12
        vadd.u64 d8, d8, d13
        vadd.u64 d9, d9, d14

        // b = ror64(b ^ c, 24);
        vld1.8 {M_0}, [ROR24_TABLE, :64]
        veor d5, d5, d10
        veor d6, d6, d11
        veor d7, d7, d8
        veor d4, d4, d9
        vtbl.8 d5, {d5}, M_0
        vtbl.8 d6, {d6}, M_0
        vtbl.8 d7, {d7}, M_0
        vtbl.8 d4, {d4}, M_0

        // a += b + m[blake2b_sigma[r][2*i + 1]];
.if \s9 == 0 || \s11 == 0 || \s13 == 0 || \s15 == 0
        vld1.8 {M_0}, [sp, :64]
.endif
        vadd.u64 d0, d0, d5
        vadd.u64 d1, d1, d6
        vadd.u64 d2, d2, d7
        vadd.u64 d3, d3, d4
        vadd.u64 d0, d0, M_\s9
        vadd.u64 d1, d1, M_\s11
        vadd.u64 d2, d2, M_\s13
        vadd.u64 d3, d3, M_\s15

        // d = ror64(d ^ a, 16);
        vld1.8 {M_0}, [ROR16_TABLE, :64]
        veor d15, d15, d0
        veor d12, d12, d1
        veor d13, d13, d2
        veor d14, d14, d3
        vtbl.8 d12, {d12}, M_0
        vtbl.8 d13, {d13}, M_0
        vtbl.8 d14, {d14}, M_0
        vtbl.8 d15, {d15}, M_0

        // c += d;
        vadd.u64 d10, d10, d15
        vadd.u64 d11, d11, d12
        vadd.u64 d8, d8, d13
        vadd.u64 d9, d9, d14

        // b = ror64(b ^ c, 63);
        veor d16, d4, d9
        veor d17, d5, d10
        veor d18, d6, d11
        veor d19, d7, d8
        vshr.u64 q2, q8, #63
        vshr.u64 q3, q9, #63
        vsli.u64 q2, q8, #1
        vsli.u64 q3, q9, #1
        // Reloading q8-q9 can be skipped on the final round.
.if ! \final
        vld1.8 {q8-q9}, [sp, :256]
.endif
.endm

//
// void blake2b_compress_neon(struct blake2b_state *state,
//                            const u8 *block, size_t nblocks, u32 inc);
//
// Only the first three fields of struct blake2b_state are used:
//      u64 h[8];       (inout)
//      u64 t[2];       (inout)
//      u64 f[2];       (in)
//
        .align 5
ENTRY(blake2b_compress_neon)
        push {r4-r10}

        // Allocate a 32-byte stack buffer that is 32-byte aligned.
        mov ORIG_SP, sp
        sub ip, sp, #32
        bic ip, ip, #31
        mov sp, ip

        adr ROR24_TABLE, .Lror24_table
        adr ROR16_TABLE, .Lror16_table

        mov ip, STATE
        vld1.64 {q0-q1}, [ip]!          // Load h[0..3]
        vld1.64 {q2-q3}, [ip]!          // Load h[4..7]
.Lnext_block:
        adr r10, .Lblake2b_IV
        vld1.64 {q14-q15}, [ip]         // Load t[0..1] and f[0..1]
        vld1.64 {q4-q5}, [r10]!         // Load IV[0..3]
        vmov r7, r8, d28                // Copy t[0] to (r7, r8)
        vld1.64 {q6-q7}, [r10]          // Load IV[4..7]
        adds r7, r7, INC                // Increment counter
        bcs .Lslow_inc_ctr
        vmov.i32 d28[0], r7
        vst1.64 {d28}, [ip]             // Update t[0]
.Linc_ctr_done:

        // Load the next message block and finish initializing the state matrix
        // 'v'. Fortunately, there are exactly enough NEON registers to fit the
        // entire state matrix in q0-q7 and the entire message block in q8-15.
        //
        // However, _blake2b_round also needs some extra registers for rotates,
        // so we have to spill some registers. It's better to spill the message
        // registers than the state registers, as the message doesn't change.
        // Therefore we store a copy of the first 32 bytes of the message block
        // (q8-q9) in an aligned buffer on the stack so that they can be
        // reloaded when needed. (We could just reload directly from the
        // message buffer, but it's faster to use aligned loads.)
        vld1.8 {q8-q9}, [BLOCK]!
        veor q6, q6, q14                // v[12..13] = IV[4..5] ^ t[0..1]
        vld1.8 {q10-q11}, [BLOCK]!
        veor q7, q7, q15                // v[14..15] = IV[6..7] ^ f[0..1]
        vld1.8 {q12-q13}, [BLOCK]!
        vst1.8 {q8-q9}, [sp, :256]
        mov ip, STATE
        vld1.8 {q14-q15}, [BLOCK]!

        // Execute the rounds. Each round is provided the order in which it
        // needs to use the message words.
        _blake2b_round 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
        _blake2b_round 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3
        _blake2b_round 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4
        _blake2b_round 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8
        _blake2b_round 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13
        _blake2b_round 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9
        _blake2b_round 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11
        _blake2b_round 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10
        _blake2b_round 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5
        _blake2b_round 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0
        _blake2b_round 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
        _blake2b_round 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 \
                       final=1

        // Fold the final state matrix into the hash chaining value:
        //
        //      for (i = 0; i < 8; i++)
        //              h[i] ^= v[i] ^ v[i + 8];
        //
        vld1.64 {q8-q9}, [ip]!          // Load old h[0..3]
        veor q0, q0, q4                 // v[0..1] ^= v[8..9]
        veor q1, q1, q5                 // v[2..3] ^= v[10..11]
        vld1.64 {q10-q11}, [ip]         // Load old h[4..7]
        veor q2, q2, q6                 // v[4..5] ^= v[12..13]
        veor q3, q3, q7                 // v[6..7] ^= v[14..15]
        veor q0, q0, q8                 // v[0..1] ^= h[0..1]
        veor q1, q1, q9                 // v[2..3] ^= h[2..3]
        mov ip, STATE
        subs NBLOCKS, NBLOCKS, #1       // nblocks--
        vst1.64 {q0-q1}, [ip]!          // Store new h[0..3]
        veor q2, q2, q10                // v[4..5] ^= h[4..5]
        veor q3, q3, q11                // v[6..7] ^= h[6..7]
        vst1.64 {q2-q3}, [ip]!          // Store new h[4..7]

        // Advance to the next block, if there is one.
        bne .Lnext_block                // nblocks != 0?

        mov sp, ORIG_SP
        pop {r4-r10}
        mov pc, lr

.Lslow_inc_ctr:
        // Handle the case where the counter overflowed its low 32 bits, by
        // carrying the overflow bit into the full 128-bit counter.
        vmov r9, r10, d29
        adcs r8, r8, #0
        adcs r9, r9, #0
        adc r10, r10, #0
        vmov d28, r7, r8
        vmov d29, r9, r10
        vst1.64 {q14}, [ip]             // Update t[0] and t[1]
        b .Linc_ctr_done
ENDPROC(blake2b_compress_neon)
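The round macro above is the NEON form of the standard BLAKE2b mixing function G, with the two byte-aligned rotations done as vtbl.8 table lookups and the 63-bit rotation as a vshr/vsli pair. For cross-checking, this is the reference form in C (a sketch after RFC 7693; blake2b_g() is illustrative and not part of this patch):

#include <linux/bitops.h>       /* ror64() */

static void blake2b_g(u64 v[16], int a, int b, int c, int d, u64 x, u64 y)
{
        v[a] += v[b] + x;
        v[d] = ror64(v[d] ^ v[a], 32);  /* done with vrev64.32 above */
        v[c] += v[d];
        v[b] = ror64(v[b] ^ v[c], 24);  /* the first vtbl.8 table rotate */
        v[a] += v[b] + y;
        v[d] = ror64(v[d] ^ v[a], 16);  /* the second vtbl.8 table rotate */
        v[c] += v[d];
        v[b] = ror64(v[b] ^ v[c], 63);  /* the vshr.u64/vsli.u64 pair */
}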
arch/arm/crypto/blake2b-neon-glue.c | 105 (new file)
@@ -0,0 +1,105 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * BLAKE2b digest algorithm, NEON accelerated
 *
 * Copyright 2020 Google LLC
 */

#include <crypto/internal/blake2b.h>
#include <crypto/internal/hash.h>
#include <crypto/internal/simd.h>

#include <linux/module.h>
#include <linux/sizes.h>

#include <asm/neon.h>
#include <asm/simd.h>

asmlinkage void blake2b_compress_neon(struct blake2b_state *state,
                                      const u8 *block, size_t nblocks, u32 inc);

static void blake2b_compress_arch(struct blake2b_state *state,
                                  const u8 *block, size_t nblocks, u32 inc)
{
        if (!crypto_simd_usable()) {
                blake2b_compress_generic(state, block, nblocks, inc);
                return;
        }

        do {
                const size_t blocks = min_t(size_t, nblocks,
                                            SZ_4K / BLAKE2B_BLOCK_SIZE);

                kernel_neon_begin();
                blake2b_compress_neon(state, block, blocks, inc);
                kernel_neon_end();

                nblocks -= blocks;
                block += blocks * BLAKE2B_BLOCK_SIZE;
        } while (nblocks);
}

static int crypto_blake2b_update_neon(struct shash_desc *desc,
                                      const u8 *in, unsigned int inlen)
{
        return crypto_blake2b_update(desc, in, inlen, blake2b_compress_arch);
}

static int crypto_blake2b_final_neon(struct shash_desc *desc, u8 *out)
{
        return crypto_blake2b_final(desc, out, blake2b_compress_arch);
}

#define BLAKE2B_ALG(name, driver_name, digest_size) \
        { \
                .base.cra_name = name, \
                .base.cra_driver_name = driver_name, \
                .base.cra_priority = 200, \
                .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, \
                .base.cra_blocksize = BLAKE2B_BLOCK_SIZE, \
                .base.cra_ctxsize = sizeof(struct blake2b_tfm_ctx), \
                .base.cra_module = THIS_MODULE, \
                .digestsize = digest_size, \
                .setkey = crypto_blake2b_setkey, \
                .init = crypto_blake2b_init, \
                .update = crypto_blake2b_update_neon, \
                .final = crypto_blake2b_final_neon, \
                .descsize = sizeof(struct blake2b_state), \
        }

static struct shash_alg blake2b_neon_algs[] = {
        BLAKE2B_ALG("blake2b-160", "blake2b-160-neon", BLAKE2B_160_HASH_SIZE),
        BLAKE2B_ALG("blake2b-256", "blake2b-256-neon", BLAKE2B_256_HASH_SIZE),
        BLAKE2B_ALG("blake2b-384", "blake2b-384-neon", BLAKE2B_384_HASH_SIZE),
        BLAKE2B_ALG("blake2b-512", "blake2b-512-neon", BLAKE2B_512_HASH_SIZE),
};

static int __init blake2b_neon_mod_init(void)
{
        if (!(elf_hwcap & HWCAP_NEON))
                return -ENODEV;

        return crypto_register_shashes(blake2b_neon_algs,
                                       ARRAY_SIZE(blake2b_neon_algs));
}

static void __exit blake2b_neon_mod_exit(void)
{
        crypto_unregister_shashes(blake2b_neon_algs,
                                  ARRAY_SIZE(blake2b_neon_algs));
}

module_init(blake2b_neon_mod_init);
module_exit(blake2b_neon_mod_exit);

MODULE_DESCRIPTION("BLAKE2b digest algorithm, NEON accelerated");
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Eric Biggers <ebiggers@google.com>");
MODULE_ALIAS_CRYPTO("blake2b-160");
MODULE_ALIAS_CRYPTO("blake2b-160-neon");
MODULE_ALIAS_CRYPTO("blake2b-256");
MODULE_ALIAS_CRYPTO("blake2b-256-neon");
MODULE_ALIAS_CRYPTO("blake2b-384");
MODULE_ALIAS_CRYPTO("blake2b-384-neon");
MODULE_ALIAS_CRYPTO("blake2b-512");
MODULE_ALIAS_CRYPTO("blake2b-512-neon");
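Once this module is loaded, the ordinary shash API picks the NEON implementation automatically for "blake2b-*" requests, since its cra_priority of 200 beats the generic implementation. A rough usage sketch (demo_blake2b_digest() is a hypothetical helper, not from this series):

#include <crypto/hash.h>
#include <linux/err.h>

static int demo_blake2b_digest(const u8 *data, unsigned int len, u8 out[32])
{
        struct crypto_shash *tfm;
        int err;

        /* resolves to blake2b-256-neon when NEON is usable */
        tfm = crypto_alloc_shash("blake2b-256", 0, 0);
        if (IS_ERR(tfm))
                return PTR_ERR(tfm);

        err = crypto_shash_tfm_digest(tfm, data, len, out);
        crypto_free_shash(tfm);
        return err;
}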
arch/arm/crypto/blake2s-core.S | 285 (new file)
@@ -0,0 +1,285 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * BLAKE2s digest algorithm, ARM scalar implementation
 *
 * Copyright 2020 Google LLC
 *
 * Author: Eric Biggers <ebiggers@google.com>
 */

#include <linux/linkage.h>

        // Registers used to hold message words temporarily. There aren't
        // enough ARM registers to hold the whole message block, so we have to
        // load the words on-demand.
        M_0 .req r12
        M_1 .req r14

        // The BLAKE2s initialization vector
.Lblake2s_IV:
        .word 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A
        .word 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19

.macro __ldrd a, b, src, offset
#if __LINUX_ARM_ARCH__ >= 6
        ldrd \a, \b, [\src, #\offset]
#else
        ldr \a, [\src, #\offset]
        ldr \b, [\src, #\offset + 4]
#endif
.endm

.macro __strd a, b, dst, offset
#if __LINUX_ARM_ARCH__ >= 6
        strd \a, \b, [\dst, #\offset]
#else
        str \a, [\dst, #\offset]
        str \b, [\dst, #\offset + 4]
#endif
.endm

// Execute a quarter-round of BLAKE2s by mixing two columns or two diagonals.
// (a0, b0, c0, d0) and (a1, b1, c1, d1) give the registers containing the two
// columns/diagonals. s0-s1 are the word offsets to the message words the first
// column/diagonal needs, and likewise s2-s3 for the second column/diagonal.
// M_0 and M_1 are free to use, and the message block can be found at sp + 32.
//
// Note that to save instructions, the rotations don't happen when the
// pseudocode says they should, but rather they are delayed until the values are
// used. See the comment above _blake2s_round().
.macro _blake2s_quarterround a0, b0, c0, d0, a1, b1, c1, d1, s0, s1, s2, s3

        ldr M_0, [sp, #32 + 4 * \s0]
        ldr M_1, [sp, #32 + 4 * \s2]

        // a += b + m[blake2s_sigma[r][2*i + 0]];
        add \a0, \a0, \b0, ror #brot
        add \a1, \a1, \b1, ror #brot
        add \a0, \a0, M_0
        add \a1, \a1, M_1

        // d = ror32(d ^ a, 16);
        eor \d0, \a0, \d0, ror #drot
        eor \d1, \a1, \d1, ror #drot

        // c += d;
        add \c0, \c0, \d0, ror #16
        add \c1, \c1, \d1, ror #16

        // b = ror32(b ^ c, 12);
        eor \b0, \c0, \b0, ror #brot
        eor \b1, \c1, \b1, ror #brot

        ldr M_0, [sp, #32 + 4 * \s1]
        ldr M_1, [sp, #32 + 4 * \s3]

        // a += b + m[blake2s_sigma[r][2*i + 1]];
        add \a0, \a0, \b0, ror #12
        add \a1, \a1, \b1, ror #12
        add \a0, \a0, M_0
        add \a1, \a1, M_1

        // d = ror32(d ^ a, 8);
        eor \d0, \a0, \d0, ror#16
        eor \d1, \a1, \d1, ror#16

        // c += d;
        add \c0, \c0, \d0, ror#8
        add \c1, \c1, \d1, ror#8

        // b = ror32(b ^ c, 7);
        eor \b0, \c0, \b0, ror#12
        eor \b1, \c1, \b1, ror#12
.endm

// Execute one round of BLAKE2s by updating the state matrix v[0..15]. v[0..9]
// are in r0..r9. The stack pointer points to 8 bytes of scratch space for
// spilling v[8..9], then to v[9..15], then to the message block. r10-r12 and
// r14 are free to use. The macro arguments s0-s15 give the order in which the
// message words are used in this round.
//
// All rotates are performed using the implicit rotate operand accepted by the
// 'add' and 'eor' instructions. This is faster than using explicit rotate
// instructions. To make this work, we allow the values in the second and last
// rows of the BLAKE2s state matrix (rows 'b' and 'd') to temporarily have the
// wrong rotation amount. The rotation amount is then fixed up just in time
// when the values are used. 'brot' is the number of bits the values in row 'b'
// need to be rotated right to arrive at the correct values, and 'drot'
// similarly for row 'd'. (brot, drot) start out as (0, 0) but we make it such
// that they end up as (7, 8) after every round.
.macro _blake2s_round s0, s1, s2, s3, s4, s5, s6, s7, \
                      s8, s9, s10, s11, s12, s13, s14, s15

        // Mix first two columns:
        // (v[0], v[4], v[8], v[12]) and (v[1], v[5], v[9], v[13]).
        __ldrd r10, r11, sp, 16         // load v[12] and v[13]
        _blake2s_quarterround r0, r4, r8, r10, r1, r5, r9, r11, \
                              \s0, \s1, \s2, \s3
        __strd r8, r9, sp, 0
        __strd r10, r11, sp, 16

        // Mix second two columns:
        // (v[2], v[6], v[10], v[14]) and (v[3], v[7], v[11], v[15]).
        __ldrd r8, r9, sp, 8            // load v[10] and v[11]
        __ldrd r10, r11, sp, 24         // load v[14] and v[15]
        _blake2s_quarterround r2, r6, r8, r10, r3, r7, r9, r11, \
                              \s4, \s5, \s6, \s7
        str r10, [sp, #24]              // store v[14]
        // v[10], v[11], and v[15] are used below, so no need to store them yet.

        .set brot, 7
        .set drot, 8

        // Mix first two diagonals:
        // (v[0], v[5], v[10], v[15]) and (v[1], v[6], v[11], v[12]).
        ldr r10, [sp, #16]              // load v[12]
        _blake2s_quarterround r0, r5, r8, r11, r1, r6, r9, r10, \
                              \s8, \s9, \s10, \s11
        __strd r8, r9, sp, 8
        str r11, [sp, #28]
        str r10, [sp, #16]

        // Mix second two diagonals:
        // (v[2], v[7], v[8], v[13]) and (v[3], v[4], v[9], v[14]).
        __ldrd r8, r9, sp, 0            // load v[8] and v[9]
        __ldrd r10, r11, sp, 20         // load v[13] and v[14]
        _blake2s_quarterround r2, r7, r8, r10, r3, r4, r9, r11, \
                              \s12, \s13, \s14, \s15
        __strd r10, r11, sp, 20
.endm

//
// void blake2s_compress_arch(struct blake2s_state *state,
//                            const u8 *block, size_t nblocks, u32 inc);
//
// Only the first three fields of struct blake2s_state are used:
//      u32 h[8];       (inout)
//      u32 t[2];       (inout)
//      u32 f[2];       (in)
//
        .align 5
ENTRY(blake2s_compress_arch)
        push {r0-r2,r4-r11,lr}          // keep this an even number

.Lnext_block:
        // r0 is 'state'
        // r1 is 'block'
        // r3 is 'inc'

        // Load and increment the counter t[0..1].
        __ldrd r10, r11, r0, 32
        adds r10, r10, r3
        adc r11, r11, #0
        __strd r10, r11, r0, 32

        // _blake2s_round is very short on registers, so copy the message block
        // to the stack to save a register during the rounds. This also has the
        // advantage that misalignment only needs to be dealt with in one place.
        sub sp, sp, #64
        mov r12, sp
        tst r1, #3
        bne .Lcopy_block_misaligned
        ldmia r1!, {r2-r9}
        stmia r12!, {r2-r9}
        ldmia r1!, {r2-r9}
        stmia r12, {r2-r9}
.Lcopy_block_done:
        str r1, [sp, #68]               // Update message pointer

        // Calculate v[8..15]. Push v[9..15] onto the stack, and leave space
        // for spilling v[8..9]. Leave v[8..9] in r8-r9.
        mov r14, r0                     // r14 = state
        adr r12, .Lblake2s_IV
        ldmia r12!, {r8-r9}             // load IV[0..1]
        __ldrd r0, r1, r14, 40          // load f[0..1]
        ldm r12, {r2-r7}                // load IV[2..7]
        eor r4, r4, r10                 // v[12] = IV[4] ^ t[0]
        eor r5, r5, r11                 // v[13] = IV[5] ^ t[1]
        eor r6, r6, r0                  // v[14] = IV[6] ^ f[0]
        eor r7, r7, r1                  // v[15] = IV[7] ^ f[1]
        push {r2-r7}                    // push v[9..15]
        sub sp, sp, #8                  // leave space for v[8..9]

        // Load h[0..7] == v[0..7].
        ldm r14, {r0-r7}

        // Execute the rounds. Each round is provided the order in which it
        // needs to use the message words.
        .set brot, 0
        .set drot, 0
        _blake2s_round 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
        _blake2s_round 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3
        _blake2s_round 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4
        _blake2s_round 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8
        _blake2s_round 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13
        _blake2s_round 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9
        _blake2s_round 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11
        _blake2s_round 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10
        _blake2s_round 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5
        _blake2s_round 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0

        // Fold the final state matrix into the hash chaining value:
        //
        //      for (i = 0; i < 8; i++)
        //              h[i] ^= v[i] ^ v[i + 8];
        //
        ldr r14, [sp, #96]              // r14 = &h[0]
        add sp, sp, #8                  // v[8..9] are already loaded.
        pop {r10-r11}                   // load v[10..11]
        eor r0, r0, r8
        eor r1, r1, r9
        eor r2, r2, r10
        eor r3, r3, r11
        ldm r14, {r8-r11}               // load h[0..3]
        eor r0, r0, r8
        eor r1, r1, r9
        eor r2, r2, r10
        eor r3, r3, r11
        stmia r14!, {r0-r3}             // store new h[0..3]
        ldm r14, {r0-r3}                // load old h[4..7]
        pop {r8-r11}                    // load v[12..15]
        eor r0, r0, r4, ror #brot
        eor r1, r1, r5, ror #brot
        eor r2, r2, r6, ror #brot
        eor r3, r3, r7, ror #brot
        eor r0, r0, r8, ror #drot
        eor r1, r1, r9, ror #drot
        eor r2, r2, r10, ror #drot
        eor r3, r3, r11, ror #drot
        add sp, sp, #64                 // skip copy of message block
        stm r14, {r0-r3}                // store new h[4..7]

        // Advance to the next block, if there is one. Note that if there are
        // multiple blocks, then 'inc' (the counter increment amount) must be
        // 64. So we can simply set it to 64 without re-loading it.
        ldm sp, {r0, r1, r2}            // load (state, block, nblocks)
        mov r3, #64                     // set 'inc'
        subs r2, r2, #1                 // nblocks--
        str r2, [sp, #8]
        bne .Lnext_block                // nblocks != 0?

        pop {r0-r2,r4-r11,pc}

        // The next message block (pointed to by r1) isn't 4-byte aligned, so it
        // can't be loaded using ldmia. Copy it to the stack buffer (pointed to
        // by r12) using an alternative method. r2-r9 are free to use.
.Lcopy_block_misaligned:
        mov r2, #64
1:
#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
        ldr r3, [r1], #4
#else
        ldrb r3, [r1, #0]
        ldrb r4, [r1, #1]
        ldrb r5, [r1, #2]
        ldrb r6, [r1, #3]
        add r1, r1, #4
        orr r3, r3, r4, lsl #8
        orr r3, r3, r5, lsl #16
        orr r3, r3, r6, lsl #24
#endif
        subs r2, r2, #4
        str r3, [r12], #4
        bne 1b
        b .Lcopy_block_done
ENDPROC(blake2s_compress_arch)
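The delayed-rotation trick described above can be modelled in C at the register level. This sketch (g_deferred() is illustrative only, not kernel code) shows why (brot, drot) end up as (7, 8) after every quarter-round:

static inline u32 ror32(u32 x, unsigned int n)
{
        return (x >> n) | (x << (32 - n));
}

/* 'b' and 'd' arrive under-rotated by brot and drot bits; each use applies
 * the pending rotation via the implicit-rotate operand, as in the asm. */
static void g_deferred(u32 *a, u32 *b, u32 *c, u32 *d, u32 m0, u32 m1,
                       unsigned int brot, unsigned int drot)
{
        *a += ror32(*b, brot) + m0;     /* a += b + m[s0] */
        *d = *a ^ ror32(*d, drot);      /* d ^= a; its ror 16 is deferred */
        *c += ror32(*d, 16);            /* c += ror32(d ^ a, 16) */
        *b = *c ^ ror32(*b, brot);      /* b ^= c; its ror 12 is deferred */
        *a += ror32(*b, 12) + m1;       /* a += b + m[s1] */
        *d = *a ^ ror32(*d, 16);        /* d ^= a; its ror 8 is deferred */
        *c += ror32(*d, 8);             /* c += ror32(d ^ a, 8) */
        *b = *c ^ ror32(*b, 12);        /* b ^= c; its ror 7 is deferred */
        /* on exit b is short by 7 bits and d by 8, hence brot=7, drot=8 */
}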
arch/arm/crypto/blake2s-glue.c | 78 (new file)
@@ -0,0 +1,78 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * BLAKE2s digest algorithm, ARM scalar implementation
 *
 * Copyright 2020 Google LLC
 */

#include <crypto/internal/blake2s.h>
#include <crypto/internal/hash.h>

#include <linux/module.h>

/* defined in blake2s-core.S */
EXPORT_SYMBOL(blake2s_compress_arch);

static int crypto_blake2s_update_arm(struct shash_desc *desc,
                                     const u8 *in, unsigned int inlen)
{
        return crypto_blake2s_update(desc, in, inlen, blake2s_compress_arch);
}

static int crypto_blake2s_final_arm(struct shash_desc *desc, u8 *out)
{
        return crypto_blake2s_final(desc, out, blake2s_compress_arch);
}

#define BLAKE2S_ALG(name, driver_name, digest_size) \
        { \
                .base.cra_name = name, \
                .base.cra_driver_name = driver_name, \
                .base.cra_priority = 200, \
                .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, \
                .base.cra_blocksize = BLAKE2S_BLOCK_SIZE, \
                .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx), \
                .base.cra_module = THIS_MODULE, \
                .digestsize = digest_size, \
                .setkey = crypto_blake2s_setkey, \
                .init = crypto_blake2s_init, \
                .update = crypto_blake2s_update_arm, \
                .final = crypto_blake2s_final_arm, \
                .descsize = sizeof(struct blake2s_state), \
        }

static struct shash_alg blake2s_arm_algs[] = {
        BLAKE2S_ALG("blake2s-128", "blake2s-128-arm", BLAKE2S_128_HASH_SIZE),
        BLAKE2S_ALG("blake2s-160", "blake2s-160-arm", BLAKE2S_160_HASH_SIZE),
        BLAKE2S_ALG("blake2s-224", "blake2s-224-arm", BLAKE2S_224_HASH_SIZE),
        BLAKE2S_ALG("blake2s-256", "blake2s-256-arm", BLAKE2S_256_HASH_SIZE),
};

static int __init blake2s_arm_mod_init(void)
{
        return IS_REACHABLE(CONFIG_CRYPTO_HASH) ?
                crypto_register_shashes(blake2s_arm_algs,
                                        ARRAY_SIZE(blake2s_arm_algs)) : 0;
}

static void __exit blake2s_arm_mod_exit(void)
{
        if (IS_REACHABLE(CONFIG_CRYPTO_HASH))
                crypto_unregister_shashes(blake2s_arm_algs,
                                          ARRAY_SIZE(blake2s_arm_algs));
}

module_init(blake2s_arm_mod_init);
module_exit(blake2s_arm_mod_exit);

MODULE_DESCRIPTION("BLAKE2s digest algorithm, ARM scalar implementation");
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Eric Biggers <ebiggers@google.com>");
MODULE_ALIAS_CRYPTO("blake2s-128");
MODULE_ALIAS_CRYPTO("blake2s-128-arm");
MODULE_ALIAS_CRYPTO("blake2s-160");
MODULE_ALIAS_CRYPTO("blake2s-160-arm");
MODULE_ALIAS_CRYPTO("blake2s-224");
MODULE_ALIAS_CRYPTO("blake2s-224-arm");
MODULE_ALIAS_CRYPTO("blake2s-256");
MODULE_ALIAS_CRYPTO("blake2s-256-arm");
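Because the Kconfig entry selects CRYPTO_ARCH_HAVE_LIB_BLAKE2S and this file exports blake2s_compress_arch(), the BLAKE2s library interface should also pick up the ARM compression function without going through the shash layer. A rough caller-side sketch (demo_blake2s_mac() is hypothetical):

#include <crypto/blake2s.h>

static void demo_blake2s_mac(const u8 *msg, size_t len,
                             const u8 key[BLAKE2S_KEY_SIZE],
                             u8 mac[BLAKE2S_HASH_SIZE])
{
        /* one-shot keyed BLAKE2s-256; the library's compression step
         * dispatches to blake2s_compress_arch() on this architecture */
        blake2s(mac, msg, key, BLAKE2S_HASH_SIZE, len, BLAKE2S_KEY_SIZE);
}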
@@ -24,6 +24,7 @@
#ifdef USE_V8_CRYPTO_EXTENSIONS
#define MODE "ce"
#define PRIO 300
#define STRIDE 5
#define aes_expandkey ce_aes_expandkey
#define aes_ecb_encrypt ce_aes_ecb_encrypt
#define aes_ecb_decrypt ce_aes_ecb_decrypt
@@ -41,6 +42,7 @@ MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 Crypto Extensions");
#else
#define MODE "neon"
#define PRIO 200
#define STRIDE 4
#define aes_ecb_encrypt neon_aes_ecb_encrypt
#define aes_ecb_decrypt neon_aes_ecb_decrypt
#define aes_cbc_encrypt neon_aes_cbc_encrypt
@@ -55,7 +57,7 @@ MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 Crypto Extensions");
#define aes_mac_update neon_aes_mac_update
MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 NEON");
#endif
#if defined(USE_V8_CRYPTO_EXTENSIONS) || !defined(CONFIG_CRYPTO_AES_ARM64_BS)
#if defined(USE_V8_CRYPTO_EXTENSIONS) || !IS_ENABLED(CONFIG_CRYPTO_AES_ARM64_BS)
MODULE_ALIAS_CRYPTO("ecb(aes)");
MODULE_ALIAS_CRYPTO("cbc(aes)");
MODULE_ALIAS_CRYPTO("ctr(aes)");
@@ -87,7 +89,7 @@ asmlinkage void aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[],
                                    int rounds, int bytes, u8 const iv[]);

asmlinkage void aes_ctr_encrypt(u8 out[], u8 const in[], u32 const rk[],
                                int rounds, int blocks, u8 ctr[]);
                                int rounds, int bytes, u8 ctr[], u8 finalbuf[]);

asmlinkage void aes_xts_encrypt(u8 out[], u8 const in[], u32 const rk1[],
                                int rounds, int bytes, u32 const rk2[], u8 iv[],
@@ -103,9 +105,9 @@ asmlinkage void aes_essiv_cbc_decrypt(u8 out[], u8 const in[], u32 const rk1[],
                                      int rounds, int blocks, u8 iv[],
                                      u32 const rk2[]);

asmlinkage void aes_mac_update(u8 const in[], u32 const rk[], int rounds,
                               int blocks, u8 dg[], int enc_before,
                               int enc_after);
asmlinkage int aes_mac_update(u8 const in[], u32 const rk[], int rounds,
                              int blocks, u8 dg[], int enc_before,
                              int enc_after);

struct crypto_aes_xts_ctx {
        struct crypto_aes_ctx key1;
@@ -448,34 +450,36 @@ static int ctr_encrypt(struct skcipher_request *req)
        struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm);
        int err, rounds = 6 + ctx->key_length / 4;
        struct skcipher_walk walk;
        int blocks;

        err = skcipher_walk_virt(&walk, req, false);

        while ((blocks = (walk.nbytes / AES_BLOCK_SIZE))) {
                kernel_neon_begin();
                aes_ctr_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
                                ctx->key_enc, rounds, blocks, walk.iv);
                kernel_neon_end();
                err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
        }
        if (walk.nbytes) {
                u8 __aligned(8) tail[AES_BLOCK_SIZE];
        while (walk.nbytes > 0) {
                const u8 *src = walk.src.virt.addr;
                unsigned int nbytes = walk.nbytes;
                u8 *tdst = walk.dst.virt.addr;
                u8 *tsrc = walk.src.virt.addr;
                u8 *dst = walk.dst.virt.addr;
                u8 buf[AES_BLOCK_SIZE];
                unsigned int tail;

                /*
                 * Tell aes_ctr_encrypt() to process a tail block.
                 */
                blocks = -1;
                if (unlikely(nbytes < AES_BLOCK_SIZE))
                        src = memcpy(buf, src, nbytes);
                else if (nbytes < walk.total)
                        nbytes &= ~(AES_BLOCK_SIZE - 1);

                kernel_neon_begin();
                aes_ctr_encrypt(tail, NULL, ctx->key_enc, rounds,
                                blocks, walk.iv);
                aes_ctr_encrypt(dst, src, ctx->key_enc, rounds, nbytes,
                                walk.iv, buf);
                kernel_neon_end();
                crypto_xor_cpy(tdst, tsrc, tail, nbytes);
                err = skcipher_walk_done(&walk, 0);

                tail = nbytes % (STRIDE * AES_BLOCK_SIZE);
                if (tail > 0 && tail < AES_BLOCK_SIZE)
                        /*
                         * The final partial block could not be returned using
                         * an overlapping store, so it was passed via buf[]
                         * instead.
                         */
                        memcpy(dst + nbytes - tail, buf, tail);

                err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
        }

        return err;
@@ -650,7 +654,7 @@ static int __maybe_unused xts_decrypt(struct skcipher_request *req)
}

static struct skcipher_alg aes_algs[] = { {
#if defined(USE_V8_CRYPTO_EXTENSIONS) || !defined(CONFIG_CRYPTO_AES_ARM64_BS)
#if defined(USE_V8_CRYPTO_EXTENSIONS) || !IS_ENABLED(CONFIG_CRYPTO_AES_ARM64_BS)
        .base = {
                .cra_name = "__ecb(aes)",
                .cra_driver_name = "__ecb-aes-" MODE,
@@ -852,10 +856,17 @@ static void mac_do_update(struct crypto_aes_ctx *ctx, u8 const in[], int blocks,
        int rounds = 6 + ctx->key_length / 4;

        if (crypto_simd_usable()) {
                kernel_neon_begin();
                aes_mac_update(in, ctx->key_enc, rounds, blocks, dg, enc_before,
                               enc_after);
                kernel_neon_end();
                int rem;

                do {
                        kernel_neon_begin();
                        rem = aes_mac_update(in, ctx->key_enc, rounds, blocks,
                                             dg, enc_before, enc_after);
                        kernel_neon_end();
                        in += (blocks - rem) * AES_BLOCK_SIZE;
                        blocks = rem;
                        enc_before = 0;
                } while (blocks);
        } else {
                if (enc_before)
                        aes_encrypt(ctx, dg, dg);
@@ -321,42 +321,76 @@ AES_FUNC_END(aes_cbc_cts_decrypt)

        /*
         * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
         *                 int blocks, u8 ctr[])
         *                 int bytes, u8 ctr[], u8 finalbuf[])
         */

AES_FUNC_START(aes_ctr_encrypt)
        stp x29, x30, [sp, #-16]!
        mov x29, sp

        enc_prepare w3, x2, x6
        enc_prepare w3, x2, x12
        ld1 {vctr.16b}, [x5]

        umov x6, vctr.d[1]              /* keep swabbed ctr in reg */
        rev x6, x6
        cmn w6, w4                      /* 32 bit overflow? */
        bcs .Lctrloop
        umov x12, vctr.d[1]             /* keep swabbed ctr in reg */
        rev x12, x12

.LctrloopNx:
        subs w4, w4, #MAX_STRIDE
        bmi .Lctr1x
        add w7, w6, #1
        add w7, w4, #15
        sub w4, w4, #MAX_STRIDE << 4
        lsr w7, w7, #4
        mov w8, #MAX_STRIDE
        cmp w7, w8
        csel w7, w7, w8, lt
        adds x12, x12, x7

        mov v0.16b, vctr.16b
        add w8, w6, #2
        mov v1.16b, vctr.16b
        add w9, w6, #3
        mov v2.16b, vctr.16b
        add w9, w6, #3
        rev w7, w7
        mov v3.16b, vctr.16b
        rev w8, w8
ST5(    mov v4.16b, vctr.16b )
        mov v1.s[3], w7
        rev w9, w9
ST5(    add w10, w6, #4 )
        mov v2.s[3], w8
ST5(    rev w10, w10 )
        mov v3.s[3], w9
ST5(    mov v4.s[3], w10 )
        ld1 {v5.16b-v7.16b}, [x1], #48  /* get 3 input blocks */
        bcs 0f

        .subsection 1
        /* apply carry to outgoing counter */
0:      umov x8, vctr.d[0]
        rev x8, x8
        add x8, x8, #1
        rev x8, x8
        ins vctr.d[0], x8

        /* apply carry to N counter blocks for N := x12 */
        adr x16, 1f
        sub x16, x16, x12, lsl #3
        br x16
        hint 34                         // bti c
        mov v0.d[0], vctr.d[0]
        hint 34                         // bti c
        mov v1.d[0], vctr.d[0]
        hint 34                         // bti c
        mov v2.d[0], vctr.d[0]
        hint 34                         // bti c
        mov v3.d[0], vctr.d[0]
ST5(    hint 34 )
ST5(    mov v4.d[0], vctr.d[0] )
1:      b 2f
        .previous

2:      rev x7, x12
        ins vctr.d[1], x7
        sub x7, x12, #MAX_STRIDE - 1
        sub x8, x12, #MAX_STRIDE - 2
        sub x9, x12, #MAX_STRIDE - 3
        rev x7, x7
        rev x8, x8
        mov v1.d[1], x7
        rev x9, x9
ST5(    sub x10, x12, #MAX_STRIDE - 4 )
        mov v2.d[1], x8
ST5(    rev x10, x10 )
        mov v3.d[1], x9
ST5(    mov v4.d[1], x10 )
        tbnz w4, #31, .Lctrtail
        ld1 {v5.16b-v7.16b}, [x1], #48
ST4(    bl aes_encrypt_block4x )
ST5(    bl aes_encrypt_block5x )
        eor v0.16b, v5.16b, v0.16b
@@ -368,47 +402,72 @@ ST5( ld1 {v5.16b-v6.16b}, [x1], #32 )
ST5(    eor v4.16b, v6.16b, v4.16b )
        st1 {v0.16b-v3.16b}, [x0], #64
ST5(    st1 {v4.16b}, [x0], #16 )
        add x6, x6, #MAX_STRIDE
        rev x7, x6
        ins vctr.d[1], x7
        cbz w4, .Lctrout
        b .LctrloopNx
.Lctr1x:
        adds w4, w4, #MAX_STRIDE
        beq .Lctrout
.Lctrloop:
        mov v0.16b, vctr.16b
        encrypt_block v0, w3, x2, x8, w7

        adds x6, x6, #1                 /* increment BE ctr */
        rev x7, x6
        ins vctr.d[1], x7
        bcs .Lctrcarry                  /* overflow? */

.Lctrcarrydone:
        subs w4, w4, #1
        bmi .Lctrtailblock              /* blocks <0 means tail block */
        ld1 {v3.16b}, [x1], #16
        eor v3.16b, v0.16b, v3.16b
        st1 {v3.16b}, [x0], #16
        bne .Lctrloop

.Lctrout:
        st1 {vctr.16b}, [x5]            /* return next CTR value */
        ldp x29, x30, [sp], #16
        ret

.Lctrtailblock:
        st1 {v0.16b}, [x0]
.Lctrtail:
        /* XOR up to MAX_STRIDE * 16 - 1 bytes of in/output with v0 ... v3/v4 */
        mov x16, #16
        ands x13, x4, #0xf
        csel x13, x13, x16, ne

ST5(    cmp w4, #64 - (MAX_STRIDE << 4) )
ST5(    csel x14, x16, xzr, gt )
        cmp w4, #48 - (MAX_STRIDE << 4)
        csel x15, x16, xzr, gt
        cmp w4, #32 - (MAX_STRIDE << 4)
        csel x16, x16, xzr, gt
        cmp w4, #16 - (MAX_STRIDE << 4)
        ble .Lctrtail1x

        adr_l x12, .Lcts_permute_table
        add x12, x12, x13

ST5(    ld1 {v5.16b}, [x1], x14 )
        ld1 {v6.16b}, [x1], x15
        ld1 {v7.16b}, [x1], x16

ST4(    bl aes_encrypt_block4x )
ST5(    bl aes_encrypt_block5x )

        ld1 {v8.16b}, [x1], x13
        ld1 {v9.16b}, [x1]
        ld1 {v10.16b}, [x12]

ST4(    eor v6.16b, v6.16b, v0.16b )
ST4(    eor v7.16b, v7.16b, v1.16b )
ST4(    tbl v3.16b, {v3.16b}, v10.16b )
ST4(    eor v8.16b, v8.16b, v2.16b )
ST4(    eor v9.16b, v9.16b, v3.16b )

ST5(    eor v5.16b, v5.16b, v0.16b )
ST5(    eor v6.16b, v6.16b, v1.16b )
ST5(    tbl v4.16b, {v4.16b}, v10.16b )
ST5(    eor v7.16b, v7.16b, v2.16b )
ST5(    eor v8.16b, v8.16b, v3.16b )
ST5(    eor v9.16b, v9.16b, v4.16b )

ST5(    st1 {v5.16b}, [x0], x14 )
        st1 {v6.16b}, [x0], x15
        st1 {v7.16b}, [x0], x16
        add x13, x13, x0
        st1 {v9.16b}, [x13]             // overlapping stores
        st1 {v8.16b}, [x0]
        b .Lctrout

.Lctrcarry:
        umov x7, vctr.d[0]              /* load upper word of ctr */
        rev x7, x7                      /* ... to handle the carry */
        add x7, x7, #1
        rev x7, x7
        ins vctr.d[0], x7
        b .Lctrcarrydone
.Lctrtail1x:
        csel x0, x0, x6, eq             // use finalbuf if less than a full block
        ld1 {v5.16b}, [x1]
ST5(    mov v3.16b, v4.16b )
        encrypt_block v3, w3, x2, x8, w7
        eor v5.16b, v5.16b, v3.16b
        st1 {v5.16b}, [x0]
        b .Lctrout
AES_FUNC_END(aes_ctr_encrypt)


@@ -619,61 +678,47 @@ AES_FUNC_END(aes_xts_decrypt)
 * int blocks, u8 dg[], int enc_before, int enc_after)
 */
AES_FUNC_START(aes_mac_update)
        frame_push 6

        mov x19, x0
        mov x20, x1
        mov x21, x2
        mov x22, x3
        mov x23, x4
        mov x24, x6

        ld1 {v0.16b}, [x23]             /* get dg */
        ld1 {v0.16b}, [x4]              /* get dg */
        enc_prepare w2, x1, x7
        cbz w5, .Lmacloop4x

        encrypt_block v0, w2, x1, x7, w8

.Lmacloop4x:
        subs w22, w22, #4
        subs w3, w3, #4
        bmi .Lmac1x
        ld1 {v1.16b-v4.16b}, [x19], #64 /* get next pt block */
        ld1 {v1.16b-v4.16b}, [x0], #64  /* get next pt block */
        eor v0.16b, v0.16b, v1.16b      /* ..and xor with dg */
        encrypt_block v0, w21, x20, x7, w8
        encrypt_block v0, w2, x1, x7, w8
        eor v0.16b, v0.16b, v2.16b
        encrypt_block v0, w21, x20, x7, w8
        encrypt_block v0, w2, x1, x7, w8
        eor v0.16b, v0.16b, v3.16b
        encrypt_block v0, w21, x20, x7, w8
        encrypt_block v0, w2, x1, x7, w8
        eor v0.16b, v0.16b, v4.16b
        cmp w22, wzr
        csinv x5, x24, xzr, eq
        cmp w3, wzr
        csinv x5, x6, xzr, eq
        cbz w5, .Lmacout
        encrypt_block v0, w21, x20, x7, w8
        st1 {v0.16b}, [x23]             /* return dg */
        cond_yield_neon .Lmacrestart
        encrypt_block v0, w2, x1, x7, w8
        st1 {v0.16b}, [x4]              /* return dg */
        cond_yield .Lmacout, x7
        b .Lmacloop4x
.Lmac1x:
        add w22, w22, #4
        add w3, w3, #4
.Lmacloop:
        cbz w22, .Lmacout
        ld1 {v1.16b}, [x19], #16        /* get next pt block */
        cbz w3, .Lmacout
        ld1 {v1.16b}, [x0], #16         /* get next pt block */
        eor v0.16b, v0.16b, v1.16b      /* ..and xor with dg */

        subs w22, w22, #1
        csinv x5, x24, xzr, eq
        subs w3, w3, #1
        csinv x5, x6, xzr, eq
        cbz w5, .Lmacout

.Lmacenc:
        encrypt_block v0, w21, x20, x7, w8
        encrypt_block v0, w2, x1, x7, w8
        b .Lmacloop

.Lmacout:
        st1 {v0.16b}, [x23]             /* return dg */
        frame_pop
        st1 {v0.16b}, [x4]              /* return dg */
        mov w0, w3
        ret

.Lmacrestart:
        ld1 {v0.16b}, [x23]             /* get dg */
        enc_prepare w21, x20, x0
        b .Lmacloop4x
AES_FUNC_END(aes_mac_update)
@@ -613,7 +613,6 @@ SYM_FUNC_END(aesbs_decrypt8)
        st1 {\o7\().16b}, [x19], #16

        cbz x23, 1f
        cond_yield_neon
        b 99b

1:      frame_pop
@@ -715,7 +714,6 @@ SYM_FUNC_START(aesbs_cbc_decrypt)
1:      st1 {v24.16b}, [x24]            // store IV

        cbz x23, 2f
        cond_yield_neon
        b 99b

2:      frame_pop
@@ -801,7 +799,7 @@ SYM_FUNC_END(__xts_crypt8)
        mov x23, x4
        mov x24, x5

0:      movi v30.2s, #0x1
        movi v30.2s, #0x1
        movi v25.2s, #0x87
        uzp1 v30.4s, v30.4s, v25.4s
        ld1 {v25.16b}, [x24]
@@ -846,7 +844,6 @@ SYM_FUNC_END(__xts_crypt8)
        cbz x23, 1f
        st1 {v25.16b}, [x24]

        cond_yield_neon 0b
        b 99b

1:      st1 {v25.16b}, [x24]
@@ -889,7 +886,7 @@ SYM_FUNC_START(aesbs_ctr_encrypt)
        cset x26, ne
        add x23, x23, x26               // do one extra block if final

98:     ldp x7, x8, [x24]
        ldp x7, x8, [x24]
        ld1 {v0.16b}, [x24]
CPU_LE( rev x7, x7 )
CPU_LE( rev x8, x8 )
@@ -967,7 +964,6 @@ CPU_LE( rev x8, x8 )
        st1 {v0.16b}, [x24]
        cbz x23, .Lctr_done

        cond_yield_neon 98b
        b 99b

.Lctr_done:
@@ -68,10 +68,10 @@
        .text
        .arch armv8-a+crypto

        init_crc .req w19
        buf .req x20
        len .req x21
        fold_consts_ptr .req x22
        init_crc .req w0
        buf .req x1
        len .req x2
        fold_consts_ptr .req x3

        fold_consts .req v10

@@ -257,12 +257,6 @@ CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 )
.endm

.macro crc_t10dif_pmull, p
        frame_push 4, 128

        mov init_crc, w0
        mov buf, x1
        mov len, x2

        __pmull_init_\p

        // For sizes less than 256 bytes, we can't fold 128 bytes at a time.
@@ -317,26 +311,7 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
        fold_32_bytes \p, v6, v7

        subs len, len, #128
        b.lt .Lfold_128_bytes_loop_done_\@

        if_will_cond_yield_neon
        stp q0, q1, [sp, #.Lframe_local_offset]
        stp q2, q3, [sp, #.Lframe_local_offset + 32]
        stp q4, q5, [sp, #.Lframe_local_offset + 64]
        stp q6, q7, [sp, #.Lframe_local_offset + 96]
        do_cond_yield_neon
        ldp q0, q1, [sp, #.Lframe_local_offset]
        ldp q2, q3, [sp, #.Lframe_local_offset + 32]
        ldp q4, q5, [sp, #.Lframe_local_offset + 64]
        ldp q6, q7, [sp, #.Lframe_local_offset + 96]
        ld1 {fold_consts.2d}, [fold_consts_ptr]
        __pmull_init_\p
        __pmull_pre_\p fold_consts
        endif_yield_neon

        b .Lfold_128_bytes_loop_\@

.Lfold_128_bytes_loop_done_\@:
        b.ge .Lfold_128_bytes_loop_\@

        // Now fold the 112 bytes in v0-v6 into the 16 bytes in v7.

@@ -453,7 +428,9 @@ CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
        // Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of v0.

        umov w0, v0.h[0]
        frame_pop
.ifc \p, p8
        ldp x29, x30, [sp], #16
.endif
        ret

.Lless_than_256_bytes_\@:
@@ -489,7 +466,9 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
// Assumes len >= 16.
//
SYM_FUNC_START(crc_t10dif_pmull_p8)
        crc_t10dif_pmull p8
        stp x29, x30, [sp, #-16]!
        mov x29, sp
        crc_t10dif_pmull p8
SYM_FUNC_END(crc_t10dif_pmull_p8)

        .align 5
@@ -37,9 +37,18 @@ static int crct10dif_update_pmull_p8(struct shash_desc *desc, const u8 *data,
        u16 *crc = shash_desc_ctx(desc);

        if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) {
                kernel_neon_begin();
                *crc = crc_t10dif_pmull_p8(*crc, data, length);
                kernel_neon_end();
                do {
                        unsigned int chunk = length;

                        if (chunk > SZ_4K + CRC_T10DIF_PMULL_CHUNK_SIZE)
                                chunk = SZ_4K;

                        kernel_neon_begin();
                        *crc = crc_t10dif_pmull_p8(*crc, data, chunk);
                        kernel_neon_end();
                        data += chunk;
                        length -= chunk;
                } while (length);
        } else {
                *crc = crc_t10dif_generic(*crc, data, length);
        }
@@ -53,9 +62,18 @@ static int crct10dif_update_pmull_p64(struct shash_desc *desc, const u8 *data,
        u16 *crc = shash_desc_ctx(desc);

        if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) {
                kernel_neon_begin();
                *crc = crc_t10dif_pmull_p64(*crc, data, length);
                kernel_neon_end();
                do {
                        unsigned int chunk = length;

                        if (chunk > SZ_4K + CRC_T10DIF_PMULL_CHUNK_SIZE)
                                chunk = SZ_4K;

                        kernel_neon_begin();
                        *crc = crc_t10dif_pmull_p64(*crc, data, chunk);
                        kernel_neon_end();
                        data += chunk;
                        length -= chunk;
                } while (length);
        } else {
                *crc = crc_t10dif_generic(*crc, data, length);
        }
@@ -62,40 +62,34 @@
.endm

/*
 * void sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src,
 *                        int blocks)
 * int sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src,
 *                       int blocks)
 */
SYM_FUNC_START(sha1_ce_transform)
        frame_push 3

        mov x19, x0
        mov x20, x1
        mov x21, x2

        /* load round constants */
0:      loadrc k0.4s, 0x5a827999, w6
        loadrc k0.4s, 0x5a827999, w6
        loadrc k1.4s, 0x6ed9eba1, w6
        loadrc k2.4s, 0x8f1bbcdc, w6
        loadrc k3.4s, 0xca62c1d6, w6

        /* load state */
        ld1 {dgav.4s}, [x19]
        ldr dgb, [x19, #16]
        ld1 {dgav.4s}, [x0]
        ldr dgb, [x0, #16]

        /* load sha1_ce_state::finalize */
        ldr_l w4, sha1_ce_offsetof_finalize, x4
        ldr w4, [x19, x4]
        ldr w4, [x0, x4]

        /* load input */
1:      ld1 {v8.4s-v11.4s}, [x20], #64
        sub w21, w21, #1
0:      ld1 {v8.4s-v11.4s}, [x1], #64
        sub w2, w2, #1

CPU_LE( rev32 v8.16b, v8.16b )
CPU_LE( rev32 v9.16b, v9.16b )
CPU_LE( rev32 v10.16b, v10.16b )
CPU_LE( rev32 v11.16b, v11.16b )

2:      add t0.4s, v8.4s, k0.4s
1:      add t0.4s, v8.4s, k0.4s
        mov dg0v.16b, dgav.16b

        add_update c, ev, k0, 8, 9, 10, 11, dgb
@@ -126,25 +120,18 @@ CPU_LE( rev32 v11.16b, v11.16b )
        add dgbv.2s, dgbv.2s, dg1v.2s
        add dgav.4s, dgav.4s, dg0v.4s

        cbz w21, 3f

        if_will_cond_yield_neon
        st1 {dgav.4s}, [x19]
        str dgb, [x19, #16]
        do_cond_yield_neon
        cbz w2, 2f
        cond_yield 3f, x5
        b 0b
        endif_yield_neon

        b 1b

        /*
         * Final block: add padding and total bit count.
         * Skip if the input size was not a round multiple of the block size,
         * the padding is handled by the C code in that case.
         */
3:      cbz x4, 4f
2:      cbz x4, 3f
        ldr_l w4, sha1_ce_offsetof_count, x4
        ldr x4, [x19, x4]
        ldr x4, [x0, x4]
        movi v9.2d, #0
        mov x8, #0x80000000
        movi v10.2d, #0
@@ -153,11 +140,11 @@ CPU_LE( rev32 v11.16b, v11.16b )
        mov x4, #0
        mov v11.d[0], xzr
        mov v11.d[1], x7
        b 2b
        b 1b

        /* store new state */
4:      st1 {dgav.4s}, [x19]
        str dgb, [x19, #16]
        frame_pop
3:      st1 {dgav.4s}, [x0]
        str dgb, [x0, #16]
        mov w0, w2
        ret
SYM_FUNC_END(sha1_ce_transform)
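With sha1_ce_transform() now returning the number of unprocessed blocks, the NEON-yield retry loop moves into the C glue, following the same pattern as the mac_do_update() hunk earlier in this diff. The caller side looks roughly like this (a sketch of the pattern, not necessarily the exact glue code):

static void __sha1_ce_transform(struct sha1_state *sst, u8 const *src,
                                int blocks)
{
        while (blocks) {
                int rem;

                kernel_neon_begin();
                /* returns how many blocks remain after a cond_yield */
                rem = sha1_ce_transform(container_of(sst,
                                                     struct sha1_ce_state,
                                                     sst),
                                        src, blocks);
                kernel_neon_end();
                src += (blocks - rem) * SHA1_BLOCK_SIZE;
                blocks = rem;
        }
}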
Some files were not shown because too many files have changed in this diff.