Merge branch 'linus' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6

Pull crypto update from Herbert Xu:
 "API:
   - Restrict crypto_cipher to internal API users only.

  Algorithms:
   - Add x86 aesni acceleration for cts.
   - Improve x86 aesni acceleration for xts.
   - Remove x86 acceleration of some uncommon algorithms.
   - Remove RIPE-MD, Tiger and Salsa20.
   - Remove tnepres.
   - Add ARM acceleration for BLAKE2s and BLAKE2b.

  Drivers:
   - Add Keem Bay OCS HCU driver.
   - Add Marvell OcteonTX2 CPT PF driver.
   - Remove PicoXcell driver.
   - Remove mediatek driver"

* 'linus' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6: (154 commits)
  hwrng: timeriomem - Use device-managed registration API
  crypto: hisilicon/qm - fix printing format issue
  crypto: hisilicon/qm - do not reset hardware when CE happens
  crypto: hisilicon/qm - update irqflag
  crypto: hisilicon/qm - fix the value of 'QM_SQC_VFT_BASE_MASK_V2'
  crypto: hisilicon/qm - fix request missing error
  crypto: hisilicon/qm - removing driver after reset
  crypto: octeontx2 - fix -Wpointer-bool-conversion warning
  crypto: hisilicon/hpre - enable Elliptic curve cryptography
  crypto: hisilicon - PASID fixed on Kunpeng 930
  crypto: hisilicon/qm - fix use of 'dma_map_single'
  crypto: hisilicon/hpre - tiny fix
  crypto: hisilicon/hpre - adapt the number of clusters
  crypto: cpt - remove casting dma_alloc_coherent
  crypto: keembay-ocs-aes - Fix 'q' assignment during CCM B0 generation
  crypto: xor - Fix typo of optimization
  hwrng: optee - Use device-managed registration API
  crypto: arm64/crc-t10dif - move NEON yield to C code
  crypto: arm64/aes-ce-mac - simplify NEON yield
  crypto: arm64/aes-neonbs - remove NEON yield calls
  ...
Linus Torvalds committed on 2021-02-21 17:23:56 -08:00
207 changed files with 13973 additions and 15324 deletions


@@ -174,7 +174,6 @@ Juha Yrjola <at solidboot.com>
Juha Yrjola <juha.yrjola@nokia.com>
Juha Yrjola <juha.yrjola@solidboot.com>
Julien Thierry <julien.thierry.kdev@gmail.com> <julien.thierry@arm.com>
Kamil Konieczny <k.konieczny@samsung.com> <k.konieczny@partner.samsung.com>
Kay Sievers <kay.sievers@vrfy.org>
Kees Cook <keescook@chromium.org> <kees.cook@canonical.com>
Kees Cook <keescook@chromium.org> <keescook@google.com>


@@ -143,8 +143,8 @@ recalculate
journal_crypt:algorithm(:key) (the key is optional)
Encrypt the journal using given algorithm to make sure that the
attacker can't read the journal. You can use a block cipher here
(such as "cbc(aes)") or a stream cipher (for example "chacha20",
"salsa20" or "ctr(aes)").
(such as "cbc(aes)") or a stream cipher (for example "chacha20"
or "ctr(aes)").
The journal contains history of last writes to the block device,
an attacker reading the journal could see the last sector numbers


@@ -28,8 +28,8 @@ Symmetric Key Cipher Request Handle
Single Block Cipher API
-----------------------
.. kernel-doc:: include/linux/crypto.h
.. kernel-doc:: include/crypto/internal/cipher.h
:doc: Single Block Cipher API
.. kernel-doc:: include/linux/crypto.h
.. kernel-doc:: include/crypto/internal/cipher.h
:functions: crypto_alloc_cipher crypto_free_cipher crypto_has_cipher crypto_cipher_blocksize crypto_cipher_setkey crypto_cipher_encrypt_one crypto_cipher_decrypt_one
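For context on the API change called out at the top of this pull ("Restrict crypto_cipher to internal API users only"): the single block cipher interface now lives in include/crypto/internal/cipher.h, and a module that still uses it must import the CRYPTO_INTERNAL symbol namespace, as the arm64 aes-neonbs glue further down does. A minimal, hedged sketch of such an internal user; the helper name and the choice of "aes" are illustrative, and error handling is trimmed to the essentials:

#include <crypto/internal/cipher.h>
#include <linux/err.h>
#include <linux/module.h>
#include <linux/types.h>

MODULE_IMPORT_NS(CRYPTO_INTERNAL);      /* required to reach crypto_cipher now */

/* Hypothetical helper: encrypt exactly one cipher block with "aes". */
static int one_block_encrypt(const u8 *key, unsigned int keylen,
                             const u8 *in, u8 *out)
{
        struct crypto_cipher *tfm;
        int err;

        tfm = crypto_alloc_cipher("aes", 0, 0);
        if (IS_ERR(tfm))
                return PTR_ERR(tfm);

        err = crypto_cipher_setkey(tfm, key, keylen);
        if (!err)
                crypto_cipher_encrypt_one(tfm, out, in);

        crypto_free_cipher(tfm);
        return err;
}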


@@ -0,0 +1,46 @@
# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
%YAML 1.2
---
$id: http://devicetree.org/schemas/crypto/intel,keembay-ocs-hcu.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#

title: Intel Keem Bay OCS HCU Device Tree Bindings

maintainers:
  - Declan Murphy <declan.murphy@intel.com>
  - Daniele Alessandrelli <daniele.alessandrelli@intel.com>

description:
  The Intel Keem Bay Offload and Crypto Subsystem (OCS) Hash Control Unit (HCU)
  provides hardware-accelerated hashing and HMAC.

properties:
  compatible:
    const: intel,keembay-ocs-hcu

  reg:
    maxItems: 1

  interrupts:
    maxItems: 1

  clocks:
    maxItems: 1

required:
  - compatible
  - reg
  - interrupts
  - clocks

additionalProperties: false

examples:
  - |
    #include <dt-bindings/interrupt-controller/arm-gic.h>
    crypto@3000b000 {
      compatible = "intel,keembay-ocs-hcu";
      reg = <0x3000b000 0x1000>;
      interrupts = <GIC_SPI 121 IRQ_TYPE_LEVEL_HIGH>;
      clocks = <&scmi_clk 94>;
    };


@@ -8,7 +8,6 @@ title: Samsung Exynos SoC SlimSSS (Slim Security SubSystem) module
maintainers:
- Krzysztof Kozlowski <krzk@kernel.org>
- Kamil Konieczny <k.konieczny@partner.samsung.com>
description: |+
The SlimSSS module in Exynos5433 SoC supports the following:


@@ -8,7 +8,6 @@ title: Samsung Exynos SoC SSS (Security SubSystem) module
maintainers:
- Krzysztof Kozlowski <krzk@kernel.org>
- Kamil Konieczny <k.konieczny@partner.samsung.com>
description: |+
The SSS module in S5PV210 SoC supports the following:


@@ -9032,6 +9032,17 @@ F: drivers/crypto/keembay/keembay-ocs-aes-core.c
F: drivers/crypto/keembay/ocs-aes.c
F: drivers/crypto/keembay/ocs-aes.h
INTEL KEEM BAY OCS HCU CRYPTO DRIVER
M: Daniele Alessandrelli <daniele.alessandrelli@intel.com>
M: Declan Murphy <declan.murphy@intel.com>
S: Maintained
F: Documentation/devicetree/bindings/crypto/intel,keembay-ocs-hcu.yaml
F: drivers/crypto/keembay/Kconfig
F: drivers/crypto/keembay/Makefile
F: drivers/crypto/keembay/keembay-ocs-hcu-core.c
F: drivers/crypto/keembay/ocs-hcu.c
F: drivers/crypto/keembay/ocs-hcu.h
INTEL MANAGEMENT ENGINE (mei)
M: Tomas Winkler <tomas.winkler@intel.com>
L: linux-kernel@vger.kernel.org
@@ -15683,7 +15694,6 @@ F: drivers/media/i2c/s5k5baf.c
SAMSUNG S5P Security SubSystem (SSS) DRIVER
M: Krzysztof Kozlowski <krzk@kernel.org>
M: Vladimir Zapolskiy <vz@mleia.com>
M: Kamil Konieczny <k.konieczny@samsung.com>
L: linux-crypto@vger.kernel.org
L: linux-samsung-soc@vger.kernel.org
S: Maintained


@@ -62,6 +62,25 @@ config CRYPTO_SHA512_ARM
SHA-512 secure hash standard (DFIPS 180-2) implemented
using optimized ARM assembler and NEON, when available.
config CRYPTO_BLAKE2S_ARM
tristate "BLAKE2s digest algorithm (ARM)"
select CRYPTO_ARCH_HAVE_LIB_BLAKE2S
help
BLAKE2s digest algorithm optimized with ARM scalar instructions. This
is faster than the generic implementations of BLAKE2s and BLAKE2b, but
slower than the NEON implementation of BLAKE2b. (There is no NEON
implementation of BLAKE2s, since NEON doesn't really help with it.)
config CRYPTO_BLAKE2B_NEON
tristate "BLAKE2b digest algorithm (ARM NEON)"
depends on KERNEL_MODE_NEON
select CRYPTO_BLAKE2B
help
BLAKE2b digest algorithm optimized with ARM NEON instructions.
On ARM processors that have NEON support but not the ARMv8
Crypto Extensions, typically this BLAKE2b implementation is
much faster than SHA-2 and slightly faster than SHA-1.
config CRYPTO_AES_ARM
tristate "Scalar AES cipher for ARM"
select CRYPTO_ALGAPI


@@ -9,6 +9,8 @@ obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o
obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o
obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o
obj-$(CONFIG_CRYPTO_SHA512_ARM) += sha512-arm.o
obj-$(CONFIG_CRYPTO_BLAKE2S_ARM) += blake2s-arm.o
obj-$(CONFIG_CRYPTO_BLAKE2B_NEON) += blake2b-neon.o
obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o
obj-$(CONFIG_CRYPTO_POLY1305_ARM) += poly1305-arm.o
obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o
@@ -29,6 +31,8 @@ sha256-arm-neon-$(CONFIG_KERNEL_MODE_NEON) := sha256_neon_glue.o
sha256-arm-y := sha256-core.o sha256_glue.o $(sha256-arm-neon-y)
sha512-arm-neon-$(CONFIG_KERNEL_MODE_NEON) := sha512-neon-glue.o
sha512-arm-y := sha512-core.o sha512-glue.o $(sha512-arm-neon-y)
blake2s-arm-y := blake2s-core.o blake2s-glue.o
blake2b-neon-y := blake2b-neon-core.o blake2b-neon-glue.o
sha1-arm-ce-y := sha1-ce-core.o sha1-ce-glue.o
sha2-arm-ce-y := sha2-ce-core.o sha2-ce-glue.o
aes-arm-ce-y := aes-ce-core.o aes-ce-glue.o


@@ -9,6 +9,7 @@
#include <asm/simd.h>
#include <crypto/aes.h>
#include <crypto/ctr.h>
#include <crypto/internal/cipher.h>
#include <crypto/internal/simd.h>
#include <crypto/internal/skcipher.h>
#include <crypto/scatterwalk.h>
@@ -23,6 +24,8 @@ MODULE_ALIAS_CRYPTO("cbc(aes)-all");
MODULE_ALIAS_CRYPTO("ctr(aes)");
MODULE_ALIAS_CRYPTO("xts(aes)");
MODULE_IMPORT_NS(CRYPTO_INTERNAL);
asmlinkage void aesbs_convert_key(u8 out[], u32 const rk[], int rounds);
asmlinkage void aesbs_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[],


@@ -0,0 +1,347 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* BLAKE2b digest algorithm, NEON accelerated
*
* Copyright 2020 Google LLC
*
* Author: Eric Biggers <ebiggers@google.com>
*/
#include <linux/linkage.h>
.text
.fpu neon
// The arguments to blake2b_compress_neon()
STATE .req r0
BLOCK .req r1
NBLOCKS .req r2
INC .req r3
// Pointers to the rotation tables
ROR24_TABLE .req r4
ROR16_TABLE .req r5
// The original stack pointer
ORIG_SP .req r6
// NEON registers which contain the message words of the current block.
// M_0-M_3 are occasionally used for other purposes too.
M_0 .req d16
M_1 .req d17
M_2 .req d18
M_3 .req d19
M_4 .req d20
M_5 .req d21
M_6 .req d22
M_7 .req d23
M_8 .req d24
M_9 .req d25
M_10 .req d26
M_11 .req d27
M_12 .req d28
M_13 .req d29
M_14 .req d30
M_15 .req d31
.align 4
// Tables for computing ror64(x, 24) and ror64(x, 16) using the vtbl.8
// instruction. This is the most efficient way to implement these
// rotation amounts with NEON. (On Cortex-A53 it's the same speed as
// vshr.u64 + vsli.u64, while on Cortex-A7 it's faster.)
.Lror24_table:
.byte 3, 4, 5, 6, 7, 0, 1, 2
.Lror16_table:
.byte 2, 3, 4, 5, 6, 7, 0, 1
// The BLAKE2b initialization vector
.Lblake2b_IV:
.quad 0x6a09e667f3bcc908, 0xbb67ae8584caa73b
.quad 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1
.quad 0x510e527fade682d1, 0x9b05688c2b3e6c1f
.quad 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179
// Execute one round of BLAKE2b by updating the state matrix v[0..15] in the
// NEON registers q0-q7. The message block is in q8..q15 (M_0-M_15). The stack
// pointer points to a 32-byte aligned buffer containing a copy of q8 and q9
// (M_0-M_3), so that they can be reloaded if they are used as temporary
// registers. The macro arguments s0-s15 give the order in which the message
// words are used in this round. 'final' is 1 if this is the final round.
.macro _blake2b_round s0, s1, s2, s3, s4, s5, s6, s7, \
s8, s9, s10, s11, s12, s13, s14, s15, final=0
// Mix the columns:
// (v[0], v[4], v[8], v[12]), (v[1], v[5], v[9], v[13]),
// (v[2], v[6], v[10], v[14]), and (v[3], v[7], v[11], v[15]).
// a += b + m[blake2b_sigma[r][2*i + 0]];
vadd.u64 q0, q0, q2
vadd.u64 q1, q1, q3
vadd.u64 d0, d0, M_\s0
vadd.u64 d1, d1, M_\s2
vadd.u64 d2, d2, M_\s4
vadd.u64 d3, d3, M_\s6
// d = ror64(d ^ a, 32);
veor q6, q6, q0
veor q7, q7, q1
vrev64.32 q6, q6
vrev64.32 q7, q7
// c += d;
vadd.u64 q4, q4, q6
vadd.u64 q5, q5, q7
// b = ror64(b ^ c, 24);
vld1.8 {M_0}, [ROR24_TABLE, :64]
veor q2, q2, q4
veor q3, q3, q5
vtbl.8 d4, {d4}, M_0
vtbl.8 d5, {d5}, M_0
vtbl.8 d6, {d6}, M_0
vtbl.8 d7, {d7}, M_0
// a += b + m[blake2b_sigma[r][2*i + 1]];
//
// M_0 got clobbered above, so we have to reload it if any of the four
// message words this step needs happens to be M_0. Otherwise we don't
// need to reload it here, as it will just get clobbered again below.
.if \s1 == 0 || \s3 == 0 || \s5 == 0 || \s7 == 0
vld1.8 {M_0}, [sp, :64]
.endif
vadd.u64 q0, q0, q2
vadd.u64 q1, q1, q3
vadd.u64 d0, d0, M_\s1
vadd.u64 d1, d1, M_\s3
vadd.u64 d2, d2, M_\s5
vadd.u64 d3, d3, M_\s7
// d = ror64(d ^ a, 16);
vld1.8 {M_0}, [ROR16_TABLE, :64]
veor q6, q6, q0
veor q7, q7, q1
vtbl.8 d12, {d12}, M_0
vtbl.8 d13, {d13}, M_0
vtbl.8 d14, {d14}, M_0
vtbl.8 d15, {d15}, M_0
// c += d;
vadd.u64 q4, q4, q6
vadd.u64 q5, q5, q7
// b = ror64(b ^ c, 63);
//
// This rotation amount isn't a multiple of 8, so it has to be
// implemented using a pair of shifts, which requires temporary
// registers. Use q8-q9 (M_0-M_3) for this, and reload them afterwards.
veor q8, q2, q4
veor q9, q3, q5
vshr.u64 q2, q8, #63
vshr.u64 q3, q9, #63
vsli.u64 q2, q8, #1
vsli.u64 q3, q9, #1
vld1.8 {q8-q9}, [sp, :256]
// Mix the diagonals:
// (v[0], v[5], v[10], v[15]), (v[1], v[6], v[11], v[12]),
// (v[2], v[7], v[8], v[13]), and (v[3], v[4], v[9], v[14]).
//
// There are two possible ways to do this: use 'vext' instructions to
// shift the rows of the matrix so that the diagonals become columns,
// and undo it afterwards; or just use 64-bit operations on 'd'
// registers instead of 128-bit operations on 'q' registers. We use the
// latter approach, as it performs much better on Cortex-A7.
// a += b + m[blake2b_sigma[r][2*i + 0]];
vadd.u64 d0, d0, d5
vadd.u64 d1, d1, d6
vadd.u64 d2, d2, d7
vadd.u64 d3, d3, d4
vadd.u64 d0, d0, M_\s8
vadd.u64 d1, d1, M_\s10
vadd.u64 d2, d2, M_\s12
vadd.u64 d3, d3, M_\s14
// d = ror64(d ^ a, 32);
veor d15, d15, d0
veor d12, d12, d1
veor d13, d13, d2
veor d14, d14, d3
vrev64.32 d15, d15
vrev64.32 d12, d12
vrev64.32 d13, d13
vrev64.32 d14, d14
// c += d;
vadd.u64 d10, d10, d15
vadd.u64 d11, d11, d12
vadd.u64 d8, d8, d13
vadd.u64 d9, d9, d14
// b = ror64(b ^ c, 24);
vld1.8 {M_0}, [ROR24_TABLE, :64]
veor d5, d5, d10
veor d6, d6, d11
veor d7, d7, d8
veor d4, d4, d9
vtbl.8 d5, {d5}, M_0
vtbl.8 d6, {d6}, M_0
vtbl.8 d7, {d7}, M_0
vtbl.8 d4, {d4}, M_0
// a += b + m[blake2b_sigma[r][2*i + 1]];
.if \s9 == 0 || \s11 == 0 || \s13 == 0 || \s15 == 0
vld1.8 {M_0}, [sp, :64]
.endif
vadd.u64 d0, d0, d5
vadd.u64 d1, d1, d6
vadd.u64 d2, d2, d7
vadd.u64 d3, d3, d4
vadd.u64 d0, d0, M_\s9
vadd.u64 d1, d1, M_\s11
vadd.u64 d2, d2, M_\s13
vadd.u64 d3, d3, M_\s15
// d = ror64(d ^ a, 16);
vld1.8 {M_0}, [ROR16_TABLE, :64]
veor d15, d15, d0
veor d12, d12, d1
veor d13, d13, d2
veor d14, d14, d3
vtbl.8 d12, {d12}, M_0
vtbl.8 d13, {d13}, M_0
vtbl.8 d14, {d14}, M_0
vtbl.8 d15, {d15}, M_0
// c += d;
vadd.u64 d10, d10, d15
vadd.u64 d11, d11, d12
vadd.u64 d8, d8, d13
vadd.u64 d9, d9, d14
// b = ror64(b ^ c, 63);
veor d16, d4, d9
veor d17, d5, d10
veor d18, d6, d11
veor d19, d7, d8
vshr.u64 q2, q8, #63
vshr.u64 q3, q9, #63
vsli.u64 q2, q8, #1
vsli.u64 q3, q9, #1
// Reloading q8-q9 can be skipped on the final round.
.if ! \final
vld1.8 {q8-q9}, [sp, :256]
.endif
.endm
//
// void blake2b_compress_neon(struct blake2b_state *state,
// const u8 *block, size_t nblocks, u32 inc);
//
// Only the first three fields of struct blake2b_state are used:
// u64 h[8]; (inout)
// u64 t[2]; (inout)
// u64 f[2]; (in)
//
.align 5
ENTRY(blake2b_compress_neon)
push {r4-r10}
// Allocate a 32-byte stack buffer that is 32-byte aligned.
mov ORIG_SP, sp
sub ip, sp, #32
bic ip, ip, #31
mov sp, ip
adr ROR24_TABLE, .Lror24_table
adr ROR16_TABLE, .Lror16_table
mov ip, STATE
vld1.64 {q0-q1}, [ip]! // Load h[0..3]
vld1.64 {q2-q3}, [ip]! // Load h[4..7]
.Lnext_block:
adr r10, .Lblake2b_IV
vld1.64 {q14-q15}, [ip] // Load t[0..1] and f[0..1]
vld1.64 {q4-q5}, [r10]! // Load IV[0..3]
vmov r7, r8, d28 // Copy t[0] to (r7, r8)
vld1.64 {q6-q7}, [r10] // Load IV[4..7]
adds r7, r7, INC // Increment counter
bcs .Lslow_inc_ctr
vmov.i32 d28[0], r7
vst1.64 {d28}, [ip] // Update t[0]
.Linc_ctr_done:
// Load the next message block and finish initializing the state matrix
// 'v'. Fortunately, there are exactly enough NEON registers to fit the
// entire state matrix in q0-q7 and the entire message block in q8-15.
//
// However, _blake2b_round also needs some extra registers for rotates,
// so we have to spill some registers. It's better to spill the message
// registers than the state registers, as the message doesn't change.
// Therefore we store a copy of the first 32 bytes of the message block
// (q8-q9) in an aligned buffer on the stack so that they can be
// reloaded when needed. (We could just reload directly from the
// message buffer, but it's faster to use aligned loads.)
vld1.8 {q8-q9}, [BLOCK]!
veor q6, q6, q14 // v[12..13] = IV[4..5] ^ t[0..1]
vld1.8 {q10-q11}, [BLOCK]!
veor q7, q7, q15 // v[14..15] = IV[6..7] ^ f[0..1]
vld1.8 {q12-q13}, [BLOCK]!
vst1.8 {q8-q9}, [sp, :256]
mov ip, STATE
vld1.8 {q14-q15}, [BLOCK]!
// Execute the rounds. Each round is provided the order in which it
// needs to use the message words.
_blake2b_round 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
_blake2b_round 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3
_blake2b_round 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4
_blake2b_round 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8
_blake2b_round 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13
_blake2b_round 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9
_blake2b_round 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11
_blake2b_round 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10
_blake2b_round 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5
_blake2b_round 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0
_blake2b_round 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
_blake2b_round 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 \
final=1
// Fold the final state matrix into the hash chaining value:
//
// for (i = 0; i < 8; i++)
// h[i] ^= v[i] ^ v[i + 8];
//
vld1.64 {q8-q9}, [ip]! // Load old h[0..3]
veor q0, q0, q4 // v[0..1] ^= v[8..9]
veor q1, q1, q5 // v[2..3] ^= v[10..11]
vld1.64 {q10-q11}, [ip] // Load old h[4..7]
veor q2, q2, q6 // v[4..5] ^= v[12..13]
veor q3, q3, q7 // v[6..7] ^= v[14..15]
veor q0, q0, q8 // v[0..1] ^= h[0..1]
veor q1, q1, q9 // v[2..3] ^= h[2..3]
mov ip, STATE
subs NBLOCKS, NBLOCKS, #1 // nblocks--
vst1.64 {q0-q1}, [ip]! // Store new h[0..3]
veor q2, q2, q10 // v[4..5] ^= h[4..5]
veor q3, q3, q11 // v[6..7] ^= h[6..7]
vst1.64 {q2-q3}, [ip]! // Store new h[4..7]
// Advance to the next block, if there is one.
bne .Lnext_block // nblocks != 0?
mov sp, ORIG_SP
pop {r4-r10}
mov pc, lr
.Lslow_inc_ctr:
// Handle the case where the counter overflowed its low 32 bits, by
// carrying the overflow bit into the full 128-bit counter.
vmov r9, r10, d29
adcs r8, r8, #0
adcs r9, r9, #0
adc r10, r10, #0
vmov d28, r7, r8
vmov d29, r9, r10
vst1.64 {q14}, [ip] // Update t[0] and t[1]
b .Linc_ctr_done
ENDPROC(blake2b_compress_neon)
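The .Lror24_table/.Lror16_table trick in the file above works because rotating a 64-bit lane right by a multiple of 8 is just a byte permutation, which vtbl.8 performs in a single instruction per d-register. A stand-alone C sketch of the equivalence, not kernel code and assuming a little-endian host (as the NEON code targets):

#include <assert.h>
#include <stdint.h>
#include <string.h>

/* ror64(x, 24) expressed as the byte permutation encoded by .Lror24_table.
 * Output byte i is taken from input byte tbl[i], which is what
 * "vtbl.8 d4, {d4}, M_0" computes lane by lane. */
static uint64_t ror24_by_table(uint64_t x)
{
        static const uint8_t tbl[8] = { 3, 4, 5, 6, 7, 0, 1, 2 };
        uint8_t in[8], out[8];
        uint64_t r;
        int i;

        memcpy(in, &x, 8);              /* little-endian byte order assumed */
        for (i = 0; i < 8; i++)
                out[i] = in[tbl[i]];
        memcpy(&r, out, 8);
        return r;
}

int main(void)
{
        uint64_t x = 0x0123456789abcdefULL;

        assert(ror24_by_table(x) == ((x >> 24) | (x << 40)));
        return 0;
}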


@@ -0,0 +1,105 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* BLAKE2b digest algorithm, NEON accelerated
*
* Copyright 2020 Google LLC
*/
#include <crypto/internal/blake2b.h>
#include <crypto/internal/hash.h>
#include <crypto/internal/simd.h>
#include <linux/module.h>
#include <linux/sizes.h>
#include <asm/neon.h>
#include <asm/simd.h>
asmlinkage void blake2b_compress_neon(struct blake2b_state *state,
const u8 *block, size_t nblocks, u32 inc);
static void blake2b_compress_arch(struct blake2b_state *state,
const u8 *block, size_t nblocks, u32 inc)
{
if (!crypto_simd_usable()) {
blake2b_compress_generic(state, block, nblocks, inc);
return;
}
do {
const size_t blocks = min_t(size_t, nblocks,
SZ_4K / BLAKE2B_BLOCK_SIZE);
kernel_neon_begin();
blake2b_compress_neon(state, block, blocks, inc);
kernel_neon_end();
nblocks -= blocks;
block += blocks * BLAKE2B_BLOCK_SIZE;
} while (nblocks);
}
static int crypto_blake2b_update_neon(struct shash_desc *desc,
const u8 *in, unsigned int inlen)
{
return crypto_blake2b_update(desc, in, inlen, blake2b_compress_arch);
}
static int crypto_blake2b_final_neon(struct shash_desc *desc, u8 *out)
{
return crypto_blake2b_final(desc, out, blake2b_compress_arch);
}
#define BLAKE2B_ALG(name, driver_name, digest_size) \
{ \
.base.cra_name = name, \
.base.cra_driver_name = driver_name, \
.base.cra_priority = 200, \
.base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, \
.base.cra_blocksize = BLAKE2B_BLOCK_SIZE, \
.base.cra_ctxsize = sizeof(struct blake2b_tfm_ctx), \
.base.cra_module = THIS_MODULE, \
.digestsize = digest_size, \
.setkey = crypto_blake2b_setkey, \
.init = crypto_blake2b_init, \
.update = crypto_blake2b_update_neon, \
.final = crypto_blake2b_final_neon, \
.descsize = sizeof(struct blake2b_state), \
}
static struct shash_alg blake2b_neon_algs[] = {
BLAKE2B_ALG("blake2b-160", "blake2b-160-neon", BLAKE2B_160_HASH_SIZE),
BLAKE2B_ALG("blake2b-256", "blake2b-256-neon", BLAKE2B_256_HASH_SIZE),
BLAKE2B_ALG("blake2b-384", "blake2b-384-neon", BLAKE2B_384_HASH_SIZE),
BLAKE2B_ALG("blake2b-512", "blake2b-512-neon", BLAKE2B_512_HASH_SIZE),
};
static int __init blake2b_neon_mod_init(void)
{
if (!(elf_hwcap & HWCAP_NEON))
return -ENODEV;
return crypto_register_shashes(blake2b_neon_algs,
ARRAY_SIZE(blake2b_neon_algs));
}
static void __exit blake2b_neon_mod_exit(void)
{
return crypto_unregister_shashes(blake2b_neon_algs,
ARRAY_SIZE(blake2b_neon_algs));
}
module_init(blake2b_neon_mod_init);
module_exit(blake2b_neon_mod_exit);
MODULE_DESCRIPTION("BLAKE2b digest algorithm, NEON accelerated");
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Eric Biggers <ebiggers@google.com>");
MODULE_ALIAS_CRYPTO("blake2b-160");
MODULE_ALIAS_CRYPTO("blake2b-160-neon");
MODULE_ALIAS_CRYPTO("blake2b-256");
MODULE_ALIAS_CRYPTO("blake2b-256-neon");
MODULE_ALIAS_CRYPTO("blake2b-384");
MODULE_ALIAS_CRYPTO("blake2b-384-neon");
MODULE_ALIAS_CRYPTO("blake2b-512");
MODULE_ALIAS_CRYPTO("blake2b-512-neon");


@@ -0,0 +1,285 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* BLAKE2s digest algorithm, ARM scalar implementation
*
* Copyright 2020 Google LLC
*
* Author: Eric Biggers <ebiggers@google.com>
*/
#include <linux/linkage.h>
// Registers used to hold message words temporarily. There aren't
// enough ARM registers to hold the whole message block, so we have to
// load the words on-demand.
M_0 .req r12
M_1 .req r14
// The BLAKE2s initialization vector
.Lblake2s_IV:
.word 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A
.word 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
.macro __ldrd a, b, src, offset
#if __LINUX_ARM_ARCH__ >= 6
ldrd \a, \b, [\src, #\offset]
#else
ldr \a, [\src, #\offset]
ldr \b, [\src, #\offset + 4]
#endif
.endm
.macro __strd a, b, dst, offset
#if __LINUX_ARM_ARCH__ >= 6
strd \a, \b, [\dst, #\offset]
#else
str \a, [\dst, #\offset]
str \b, [\dst, #\offset + 4]
#endif
.endm
// Execute a quarter-round of BLAKE2s by mixing two columns or two diagonals.
// (a0, b0, c0, d0) and (a1, b1, c1, d1) give the registers containing the two
// columns/diagonals. s0-s1 are the word offsets to the message words the first
// column/diagonal needs, and likewise s2-s3 for the second column/diagonal.
// M_0 and M_1 are free to use, and the message block can be found at sp + 32.
//
// Note that to save instructions, the rotations don't happen when the
// pseudocode says they should, but rather they are delayed until the values are
// used. See the comment above _blake2s_round().
.macro _blake2s_quarterround a0, b0, c0, d0, a1, b1, c1, d1, s0, s1, s2, s3
ldr M_0, [sp, #32 + 4 * \s0]
ldr M_1, [sp, #32 + 4 * \s2]
// a += b + m[blake2s_sigma[r][2*i + 0]];
add \a0, \a0, \b0, ror #brot
add \a1, \a1, \b1, ror #brot
add \a0, \a0, M_0
add \a1, \a1, M_1
// d = ror32(d ^ a, 16);
eor \d0, \a0, \d0, ror #drot
eor \d1, \a1, \d1, ror #drot
// c += d;
add \c0, \c0, \d0, ror #16
add \c1, \c1, \d1, ror #16
// b = ror32(b ^ c, 12);
eor \b0, \c0, \b0, ror #brot
eor \b1, \c1, \b1, ror #brot
ldr M_0, [sp, #32 + 4 * \s1]
ldr M_1, [sp, #32 + 4 * \s3]
// a += b + m[blake2s_sigma[r][2*i + 1]];
add \a0, \a0, \b0, ror #12
add \a1, \a1, \b1, ror #12
add \a0, \a0, M_0
add \a1, \a1, M_1
// d = ror32(d ^ a, 8);
eor \d0, \a0, \d0, ror#16
eor \d1, \a1, \d1, ror#16
// c += d;
add \c0, \c0, \d0, ror#8
add \c1, \c1, \d1, ror#8
// b = ror32(b ^ c, 7);
eor \b0, \c0, \b0, ror#12
eor \b1, \c1, \b1, ror#12
.endm
// Execute one round of BLAKE2s by updating the state matrix v[0..15]. v[0..9]
// are in r0..r9. The stack pointer points to 8 bytes of scratch space for
// spilling v[8..9], then to v[9..15], then to the message block. r10-r12 and
// r14 are free to use. The macro arguments s0-s15 give the order in which the
// message words are used in this round.
//
// All rotates are performed using the implicit rotate operand accepted by the
// 'add' and 'eor' instructions. This is faster than using explicit rotate
// instructions. To make this work, we allow the values in the second and last
// rows of the BLAKE2s state matrix (rows 'b' and 'd') to temporarily have the
// wrong rotation amount. The rotation amount is then fixed up just in time
// when the values are used. 'brot' is the number of bits the values in row 'b'
// need to be rotated right to arrive at the correct values, and 'drot'
// similarly for row 'd'. (brot, drot) start out as (0, 0) but we make it such
// that they end up as (7, 8) after every round.
.macro _blake2s_round s0, s1, s2, s3, s4, s5, s6, s7, \
s8, s9, s10, s11, s12, s13, s14, s15
// Mix first two columns:
// (v[0], v[4], v[8], v[12]) and (v[1], v[5], v[9], v[13]).
__ldrd r10, r11, sp, 16 // load v[12] and v[13]
_blake2s_quarterround r0, r4, r8, r10, r1, r5, r9, r11, \
\s0, \s1, \s2, \s3
__strd r8, r9, sp, 0
__strd r10, r11, sp, 16
// Mix second two columns:
// (v[2], v[6], v[10], v[14]) and (v[3], v[7], v[11], v[15]).
__ldrd r8, r9, sp, 8 // load v[10] and v[11]
__ldrd r10, r11, sp, 24 // load v[14] and v[15]
_blake2s_quarterround r2, r6, r8, r10, r3, r7, r9, r11, \
\s4, \s5, \s6, \s7
str r10, [sp, #24] // store v[14]
// v[10], v[11], and v[15] are used below, so no need to store them yet.
.set brot, 7
.set drot, 8
// Mix first two diagonals:
// (v[0], v[5], v[10], v[15]) and (v[1], v[6], v[11], v[12]).
ldr r10, [sp, #16] // load v[12]
_blake2s_quarterround r0, r5, r8, r11, r1, r6, r9, r10, \
\s8, \s9, \s10, \s11
__strd r8, r9, sp, 8
str r11, [sp, #28]
str r10, [sp, #16]
// Mix second two diagonals:
// (v[2], v[7], v[8], v[13]) and (v[3], v[4], v[9], v[14]).
__ldrd r8, r9, sp, 0 // load v[8] and v[9]
__ldrd r10, r11, sp, 20 // load v[13] and v[14]
_blake2s_quarterround r2, r7, r8, r10, r3, r4, r9, r11, \
\s12, \s13, \s14, \s15
__strd r10, r11, sp, 20
.endm
//
// void blake2s_compress_arch(struct blake2s_state *state,
// const u8 *block, size_t nblocks, u32 inc);
//
// Only the first three fields of struct blake2s_state are used:
// u32 h[8]; (inout)
// u32 t[2]; (inout)
// u32 f[2]; (in)
//
.align 5
ENTRY(blake2s_compress_arch)
push {r0-r2,r4-r11,lr} // keep this an even number
.Lnext_block:
// r0 is 'state'
// r1 is 'block'
// r3 is 'inc'
// Load and increment the counter t[0..1].
__ldrd r10, r11, r0, 32
adds r10, r10, r3
adc r11, r11, #0
__strd r10, r11, r0, 32
// _blake2s_round is very short on registers, so copy the message block
// to the stack to save a register during the rounds. This also has the
// advantage that misalignment only needs to be dealt with in one place.
sub sp, sp, #64
mov r12, sp
tst r1, #3
bne .Lcopy_block_misaligned
ldmia r1!, {r2-r9}
stmia r12!, {r2-r9}
ldmia r1!, {r2-r9}
stmia r12, {r2-r9}
.Lcopy_block_done:
str r1, [sp, #68] // Update message pointer
// Calculate v[8..15]. Push v[9..15] onto the stack, and leave space
// for spilling v[8..9]. Leave v[8..9] in r8-r9.
mov r14, r0 // r14 = state
adr r12, .Lblake2s_IV
ldmia r12!, {r8-r9} // load IV[0..1]
__ldrd r0, r1, r14, 40 // load f[0..1]
ldm r12, {r2-r7} // load IV[3..7]
eor r4, r4, r10 // v[12] = IV[4] ^ t[0]
eor r5, r5, r11 // v[13] = IV[5] ^ t[1]
eor r6, r6, r0 // v[14] = IV[6] ^ f[0]
eor r7, r7, r1 // v[15] = IV[7] ^ f[1]
push {r2-r7} // push v[9..15]
sub sp, sp, #8 // leave space for v[8..9]
// Load h[0..7] == v[0..7].
ldm r14, {r0-r7}
// Execute the rounds. Each round is provided the order in which it
// needs to use the message words.
.set brot, 0
.set drot, 0
_blake2s_round 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
_blake2s_round 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3
_blake2s_round 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4
_blake2s_round 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8
_blake2s_round 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13
_blake2s_round 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9
_blake2s_round 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11
_blake2s_round 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10
_blake2s_round 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5
_blake2s_round 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0
// Fold the final state matrix into the hash chaining value:
//
// for (i = 0; i < 8; i++)
// h[i] ^= v[i] ^ v[i + 8];
//
ldr r14, [sp, #96] // r14 = &h[0]
add sp, sp, #8 // v[8..9] are already loaded.
pop {r10-r11} // load v[10..11]
eor r0, r0, r8
eor r1, r1, r9
eor r2, r2, r10
eor r3, r3, r11
ldm r14, {r8-r11} // load h[0..3]
eor r0, r0, r8
eor r1, r1, r9
eor r2, r2, r10
eor r3, r3, r11
stmia r14!, {r0-r3} // store new h[0..3]
ldm r14, {r0-r3} // load old h[4..7]
pop {r8-r11} // load v[12..15]
eor r0, r0, r4, ror #brot
eor r1, r1, r5, ror #brot
eor r2, r2, r6, ror #brot
eor r3, r3, r7, ror #brot
eor r0, r0, r8, ror #drot
eor r1, r1, r9, ror #drot
eor r2, r2, r10, ror #drot
eor r3, r3, r11, ror #drot
add sp, sp, #64 // skip copy of message block
stm r14, {r0-r3} // store new h[4..7]
// Advance to the next block, if there is one. Note that if there are
// multiple blocks, then 'inc' (the counter increment amount) must be
// 64. So we can simply set it to 64 without re-loading it.
ldm sp, {r0, r1, r2} // load (state, block, nblocks)
mov r3, #64 // set 'inc'
subs r2, r2, #1 // nblocks--
str r2, [sp, #8]
bne .Lnext_block // nblocks != 0?
pop {r0-r2,r4-r11,pc}
// The next message block (pointed to by r1) isn't 4-byte aligned, so it
// can't be loaded using ldmia. Copy it to the stack buffer (pointed to
// by r12) using an alternative method. r2-r9 are free to use.
.Lcopy_block_misaligned:
mov r2, #64
1:
#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
ldr r3, [r1], #4
#else
ldrb r3, [r1, #0]
ldrb r4, [r1, #1]
ldrb r5, [r1, #2]
ldrb r6, [r1, #3]
add r1, r1, #4
orr r3, r3, r4, lsl #8
orr r3, r3, r5, lsl #16
orr r3, r3, r6, lsl #24
#endif
subs r2, r2, #4
str r3, [r12], #4
bne 1b
b .Lcopy_block_done
ENDPROC(blake2s_compress_arch)
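The "delayed rotation" convention described in the comments above (rows b and d are stored un-rotated, and the pending ror amount brot/drot is applied for free by the next add/eor that consumes them) is easier to see in scalar form. A rough C model of one column/diagonal mix under that convention; illustrative only, not the kernel's code:

#include <stdint.h>

static inline uint32_t ror32(uint32_t x, unsigned int n)
{
        return (x >> n) | (x << ((32 - n) & 31));
}

/* One BLAKE2s quarter-round the way the ARM code performs it: b and d are
 * kept without their final rotation, and the caller tracks how many bits
 * of right-rotation are still owed (brot, drot).  Each use of b or d
 * applies the pending rotation on the fly, just like
 * "add rd, rn, rm, ror #imm" does in the assembly. */
static void g_deferred(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d,
                       uint32_t m0, uint32_t m1,
                       unsigned int *brot, unsigned int *drot)
{
        *a += ror32(*b, *brot) + m0;    /* a += b + m[2i] */
        *d = *a ^ ror32(*d, *drot);     /* d ^= a; ror by 16 deferred */
        *c += ror32(*d, 16);            /* c += ror32(d, 16) */
        *b = *c ^ ror32(*b, *brot);     /* b ^= c; ror by 12 deferred */

        *a += ror32(*b, 12) + m1;       /* a += b + m[2i+1] */
        *d = *a ^ ror32(*d, 16);        /* d ^= a; ror by 8 deferred */
        *c += ror32(*d, 8);             /* c += ror32(d, 8) */
        *b = *c ^ ror32(*b, 12);        /* b ^= c; ror by 7 deferred */

        *brot = 7;                      /* b still owes a ror by 7 */
        *drot = 8;                      /* d still owes a ror by 8 */
}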


@@ -0,0 +1,78 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* BLAKE2s digest algorithm, ARM scalar implementation
*
* Copyright 2020 Google LLC
*/
#include <crypto/internal/blake2s.h>
#include <crypto/internal/hash.h>
#include <linux/module.h>
/* defined in blake2s-core.S */
EXPORT_SYMBOL(blake2s_compress_arch);
static int crypto_blake2s_update_arm(struct shash_desc *desc,
const u8 *in, unsigned int inlen)
{
return crypto_blake2s_update(desc, in, inlen, blake2s_compress_arch);
}
static int crypto_blake2s_final_arm(struct shash_desc *desc, u8 *out)
{
return crypto_blake2s_final(desc, out, blake2s_compress_arch);
}
#define BLAKE2S_ALG(name, driver_name, digest_size) \
{ \
.base.cra_name = name, \
.base.cra_driver_name = driver_name, \
.base.cra_priority = 200, \
.base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, \
.base.cra_blocksize = BLAKE2S_BLOCK_SIZE, \
.base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx), \
.base.cra_module = THIS_MODULE, \
.digestsize = digest_size, \
.setkey = crypto_blake2s_setkey, \
.init = crypto_blake2s_init, \
.update = crypto_blake2s_update_arm, \
.final = crypto_blake2s_final_arm, \
.descsize = sizeof(struct blake2s_state), \
}
static struct shash_alg blake2s_arm_algs[] = {
BLAKE2S_ALG("blake2s-128", "blake2s-128-arm", BLAKE2S_128_HASH_SIZE),
BLAKE2S_ALG("blake2s-160", "blake2s-160-arm", BLAKE2S_160_HASH_SIZE),
BLAKE2S_ALG("blake2s-224", "blake2s-224-arm", BLAKE2S_224_HASH_SIZE),
BLAKE2S_ALG("blake2s-256", "blake2s-256-arm", BLAKE2S_256_HASH_SIZE),
};
static int __init blake2s_arm_mod_init(void)
{
return IS_REACHABLE(CONFIG_CRYPTO_HASH) ?
crypto_register_shashes(blake2s_arm_algs,
ARRAY_SIZE(blake2s_arm_algs)) : 0;
}
static void __exit blake2s_arm_mod_exit(void)
{
if (IS_REACHABLE(CONFIG_CRYPTO_HASH))
crypto_unregister_shashes(blake2s_arm_algs,
ARRAY_SIZE(blake2s_arm_algs));
}
module_init(blake2s_arm_mod_init);
module_exit(blake2s_arm_mod_exit);
MODULE_DESCRIPTION("BLAKE2s digest algorithm, ARM scalar implementation");
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Eric Biggers <ebiggers@google.com>");
MODULE_ALIAS_CRYPTO("blake2s-128");
MODULE_ALIAS_CRYPTO("blake2s-128-arm");
MODULE_ALIAS_CRYPTO("blake2s-160");
MODULE_ALIAS_CRYPTO("blake2s-160-arm");
MODULE_ALIAS_CRYPTO("blake2s-224");
MODULE_ALIAS_CRYPTO("blake2s-224-arm");
MODULE_ALIAS_CRYPTO("blake2s-256");
MODULE_ALIAS_CRYPTO("blake2s-256-arm");


@@ -24,6 +24,7 @@
#ifdef USE_V8_CRYPTO_EXTENSIONS
#define MODE "ce"
#define PRIO 300
#define STRIDE 5
#define aes_expandkey ce_aes_expandkey
#define aes_ecb_encrypt ce_aes_ecb_encrypt
#define aes_ecb_decrypt ce_aes_ecb_decrypt
@@ -41,6 +42,7 @@ MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 Crypto Extensions");
#else
#define MODE "neon"
#define PRIO 200
#define STRIDE 4
#define aes_ecb_encrypt neon_aes_ecb_encrypt
#define aes_ecb_decrypt neon_aes_ecb_decrypt
#define aes_cbc_encrypt neon_aes_cbc_encrypt
@@ -55,7 +57,7 @@ MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 Crypto Extensions");
#define aes_mac_update neon_aes_mac_update
MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 NEON");
#endif
#if defined(USE_V8_CRYPTO_EXTENSIONS) || !defined(CONFIG_CRYPTO_AES_ARM64_BS)
#if defined(USE_V8_CRYPTO_EXTENSIONS) || !IS_ENABLED(CONFIG_CRYPTO_AES_ARM64_BS)
MODULE_ALIAS_CRYPTO("ecb(aes)");
MODULE_ALIAS_CRYPTO("cbc(aes)");
MODULE_ALIAS_CRYPTO("ctr(aes)");
@@ -87,7 +89,7 @@ asmlinkage void aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[],
int rounds, int bytes, u8 const iv[]);
asmlinkage void aes_ctr_encrypt(u8 out[], u8 const in[], u32 const rk[],
int rounds, int blocks, u8 ctr[]);
int rounds, int bytes, u8 ctr[], u8 finalbuf[]);
asmlinkage void aes_xts_encrypt(u8 out[], u8 const in[], u32 const rk1[],
int rounds, int bytes, u32 const rk2[], u8 iv[],
@@ -103,9 +105,9 @@ asmlinkage void aes_essiv_cbc_decrypt(u8 out[], u8 const in[], u32 const rk1[],
int rounds, int blocks, u8 iv[],
u32 const rk2[]);
asmlinkage void aes_mac_update(u8 const in[], u32 const rk[], int rounds,
int blocks, u8 dg[], int enc_before,
int enc_after);
asmlinkage int aes_mac_update(u8 const in[], u32 const rk[], int rounds,
int blocks, u8 dg[], int enc_before,
int enc_after);
struct crypto_aes_xts_ctx {
struct crypto_aes_ctx key1;
@@ -448,34 +450,36 @@ static int ctr_encrypt(struct skcipher_request *req)
struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm);
int err, rounds = 6 + ctx->key_length / 4;
struct skcipher_walk walk;
int blocks;
err = skcipher_walk_virt(&walk, req, false);
while ((blocks = (walk.nbytes / AES_BLOCK_SIZE))) {
kernel_neon_begin();
aes_ctr_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
ctx->key_enc, rounds, blocks, walk.iv);
kernel_neon_end();
err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
}
if (walk.nbytes) {
u8 __aligned(8) tail[AES_BLOCK_SIZE];
while (walk.nbytes > 0) {
const u8 *src = walk.src.virt.addr;
unsigned int nbytes = walk.nbytes;
u8 *tdst = walk.dst.virt.addr;
u8 *tsrc = walk.src.virt.addr;
u8 *dst = walk.dst.virt.addr;
u8 buf[AES_BLOCK_SIZE];
unsigned int tail;
/*
* Tell aes_ctr_encrypt() to process a tail block.
*/
blocks = -1;
if (unlikely(nbytes < AES_BLOCK_SIZE))
src = memcpy(buf, src, nbytes);
else if (nbytes < walk.total)
nbytes &= ~(AES_BLOCK_SIZE - 1);
kernel_neon_begin();
aes_ctr_encrypt(tail, NULL, ctx->key_enc, rounds,
blocks, walk.iv);
aes_ctr_encrypt(dst, src, ctx->key_enc, rounds, nbytes,
walk.iv, buf);
kernel_neon_end();
crypto_xor_cpy(tdst, tsrc, tail, nbytes);
err = skcipher_walk_done(&walk, 0);
tail = nbytes % (STRIDE * AES_BLOCK_SIZE);
if (tail > 0 && tail < AES_BLOCK_SIZE)
/*
* The final partial block could not be returned using
* an overlapping store, so it was passed via buf[]
* instead.
*/
memcpy(dst + nbytes - tail, buf, tail);
err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
}
return err;
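To make the new tail handling in ctr_encrypt() above concrete: the assembly only falls back to returning the partial block through buf[] when the bytes left for its final stride are themselves shorter than one AES block, so no overlapping store is possible. A small stand-alone illustration of that condition (plain C, not kernel code; STRIDE is 5 for the Crypto Extensions build and 4 for the NEON build, per the defines earlier in this file):

#include <stdio.h>

#define AES_BLOCK_SIZE  16

/* Mirrors "tail = nbytes % (STRIDE * AES_BLOCK_SIZE)" and the check that
 * follows it in ctr_encrypt(). */
static int tail_returned_via_buf(unsigned int nbytes, unsigned int stride)
{
        unsigned int tail = nbytes % (stride * AES_BLOCK_SIZE);

        return tail > 0 && tail < AES_BLOCK_SIZE;
}

int main(void)
{
        /* 70 bytes, stride 5: the last stride still contains full blocks
         * to overlap with, so the assembly stores everything in place. */
        printf("%d\n", tail_returned_via_buf(70, 5));   /* prints 0 */

        /* 86 bytes, stride 5: one full 80-byte stride, then only 6 bytes
         * remain -- less than a block, so they come back through buf[]. */
        printf("%d\n", tail_returned_via_buf(86, 5));   /* prints 1 */
        return 0;
}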
@@ -650,7 +654,7 @@ static int __maybe_unused xts_decrypt(struct skcipher_request *req)
}
static struct skcipher_alg aes_algs[] = { {
#if defined(USE_V8_CRYPTO_EXTENSIONS) || !defined(CONFIG_CRYPTO_AES_ARM64_BS)
#if defined(USE_V8_CRYPTO_EXTENSIONS) || !IS_ENABLED(CONFIG_CRYPTO_AES_ARM64_BS)
.base = {
.cra_name = "__ecb(aes)",
.cra_driver_name = "__ecb-aes-" MODE,
@@ -852,10 +856,17 @@ static void mac_do_update(struct crypto_aes_ctx *ctx, u8 const in[], int blocks,
int rounds = 6 + ctx->key_length / 4;
if (crypto_simd_usable()) {
kernel_neon_begin();
aes_mac_update(in, ctx->key_enc, rounds, blocks, dg, enc_before,
enc_after);
kernel_neon_end();
int rem;
do {
kernel_neon_begin();
rem = aes_mac_update(in, ctx->key_enc, rounds, blocks,
dg, enc_before, enc_after);
kernel_neon_end();
in += (blocks - rem) * AES_BLOCK_SIZE;
blocks = rem;
enc_before = 0;
} while (blocks);
} else {
if (enc_before)
aes_encrypt(ctx, dg, dg);


@@ -321,42 +321,76 @@ AES_FUNC_END(aes_cbc_cts_decrypt)
/*
* aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
* int blocks, u8 ctr[])
* int bytes, u8 ctr[], u8 finalbuf[])
*/
AES_FUNC_START(aes_ctr_encrypt)
stp x29, x30, [sp, #-16]!
mov x29, sp
enc_prepare w3, x2, x6
enc_prepare w3, x2, x12
ld1 {vctr.16b}, [x5]
umov x6, vctr.d[1] /* keep swabbed ctr in reg */
rev x6, x6
cmn w6, w4 /* 32 bit overflow? */
bcs .Lctrloop
umov x12, vctr.d[1] /* keep swabbed ctr in reg */
rev x12, x12
.LctrloopNx:
subs w4, w4, #MAX_STRIDE
bmi .Lctr1x
add w7, w6, #1
add w7, w4, #15
sub w4, w4, #MAX_STRIDE << 4
lsr w7, w7, #4
mov w8, #MAX_STRIDE
cmp w7, w8
csel w7, w7, w8, lt
adds x12, x12, x7
mov v0.16b, vctr.16b
add w8, w6, #2
mov v1.16b, vctr.16b
add w9, w6, #3
mov v2.16b, vctr.16b
add w9, w6, #3
rev w7, w7
mov v3.16b, vctr.16b
rev w8, w8
ST5( mov v4.16b, vctr.16b )
mov v1.s[3], w7
rev w9, w9
ST5( add w10, w6, #4 )
mov v2.s[3], w8
ST5( rev w10, w10 )
mov v3.s[3], w9
ST5( mov v4.s[3], w10 )
ld1 {v5.16b-v7.16b}, [x1], #48 /* get 3 input blocks */
bcs 0f
.subsection 1
/* apply carry to outgoing counter */
0: umov x8, vctr.d[0]
rev x8, x8
add x8, x8, #1
rev x8, x8
ins vctr.d[0], x8
/* apply carry to N counter blocks for N := x12 */
adr x16, 1f
sub x16, x16, x12, lsl #3
br x16
hint 34 // bti c
mov v0.d[0], vctr.d[0]
hint 34 // bti c
mov v1.d[0], vctr.d[0]
hint 34 // bti c
mov v2.d[0], vctr.d[0]
hint 34 // bti c
mov v3.d[0], vctr.d[0]
ST5( hint 34 )
ST5( mov v4.d[0], vctr.d[0] )
1: b 2f
.previous
2: rev x7, x12
ins vctr.d[1], x7
sub x7, x12, #MAX_STRIDE - 1
sub x8, x12, #MAX_STRIDE - 2
sub x9, x12, #MAX_STRIDE - 3
rev x7, x7
rev x8, x8
mov v1.d[1], x7
rev x9, x9
ST5( sub x10, x12, #MAX_STRIDE - 4 )
mov v2.d[1], x8
ST5( rev x10, x10 )
mov v3.d[1], x9
ST5( mov v4.d[1], x10 )
tbnz w4, #31, .Lctrtail
ld1 {v5.16b-v7.16b}, [x1], #48
ST4( bl aes_encrypt_block4x )
ST5( bl aes_encrypt_block5x )
eor v0.16b, v5.16b, v0.16b
@@ -368,47 +402,72 @@ ST5( ld1 {v5.16b-v6.16b}, [x1], #32 )
ST5( eor v4.16b, v6.16b, v4.16b )
st1 {v0.16b-v3.16b}, [x0], #64
ST5( st1 {v4.16b}, [x0], #16 )
add x6, x6, #MAX_STRIDE
rev x7, x6
ins vctr.d[1], x7
cbz w4, .Lctrout
b .LctrloopNx
.Lctr1x:
adds w4, w4, #MAX_STRIDE
beq .Lctrout
.Lctrloop:
mov v0.16b, vctr.16b
encrypt_block v0, w3, x2, x8, w7
adds x6, x6, #1 /* increment BE ctr */
rev x7, x6
ins vctr.d[1], x7
bcs .Lctrcarry /* overflow? */
.Lctrcarrydone:
subs w4, w4, #1
bmi .Lctrtailblock /* blocks <0 means tail block */
ld1 {v3.16b}, [x1], #16
eor v3.16b, v0.16b, v3.16b
st1 {v3.16b}, [x0], #16
bne .Lctrloop
.Lctrout:
st1 {vctr.16b}, [x5] /* return next CTR value */
ldp x29, x30, [sp], #16
ret
.Lctrtailblock:
st1 {v0.16b}, [x0]
.Lctrtail:
/* XOR up to MAX_STRIDE * 16 - 1 bytes of in/output with v0 ... v3/v4 */
mov x16, #16
ands x13, x4, #0xf
csel x13, x13, x16, ne
ST5( cmp w4, #64 - (MAX_STRIDE << 4) )
ST5( csel x14, x16, xzr, gt )
cmp w4, #48 - (MAX_STRIDE << 4)
csel x15, x16, xzr, gt
cmp w4, #32 - (MAX_STRIDE << 4)
csel x16, x16, xzr, gt
cmp w4, #16 - (MAX_STRIDE << 4)
ble .Lctrtail1x
adr_l x12, .Lcts_permute_table
add x12, x12, x13
ST5( ld1 {v5.16b}, [x1], x14 )
ld1 {v6.16b}, [x1], x15
ld1 {v7.16b}, [x1], x16
ST4( bl aes_encrypt_block4x )
ST5( bl aes_encrypt_block5x )
ld1 {v8.16b}, [x1], x13
ld1 {v9.16b}, [x1]
ld1 {v10.16b}, [x12]
ST4( eor v6.16b, v6.16b, v0.16b )
ST4( eor v7.16b, v7.16b, v1.16b )
ST4( tbl v3.16b, {v3.16b}, v10.16b )
ST4( eor v8.16b, v8.16b, v2.16b )
ST4( eor v9.16b, v9.16b, v3.16b )
ST5( eor v5.16b, v5.16b, v0.16b )
ST5( eor v6.16b, v6.16b, v1.16b )
ST5( tbl v4.16b, {v4.16b}, v10.16b )
ST5( eor v7.16b, v7.16b, v2.16b )
ST5( eor v8.16b, v8.16b, v3.16b )
ST5( eor v9.16b, v9.16b, v4.16b )
ST5( st1 {v5.16b}, [x0], x14 )
st1 {v6.16b}, [x0], x15
st1 {v7.16b}, [x0], x16
add x13, x13, x0
st1 {v9.16b}, [x13] // overlapping stores
st1 {v8.16b}, [x0]
b .Lctrout
.Lctrcarry:
umov x7, vctr.d[0] /* load upper word of ctr */
rev x7, x7 /* ... to handle the carry */
add x7, x7, #1
rev x7, x7
ins vctr.d[0], x7
b .Lctrcarrydone
.Lctrtail1x:
csel x0, x0, x6, eq // use finalbuf if less than a full block
ld1 {v5.16b}, [x1]
ST5( mov v3.16b, v4.16b )
encrypt_block v3, w3, x2, x8, w7
eor v5.16b, v5.16b, v3.16b
st1 {v5.16b}, [x0]
b .Lctrout
AES_FUNC_END(aes_ctr_encrypt)
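The .Lctrtail path above handles a length that is not a multiple of 16 without a scalar byte loop: the last 16 bytes of output are produced as one more block whose store overlaps the block stored just before it ("overlapping stores"). A hedged scalar sketch of the same idea for a position-independent block transform; the real CTR code additionally runs its keystream through .Lcts_permute_table because CTR output does depend on position, and both assume non-aliasing in/out buffers:

#include <stddef.h>

#define BLK 16

/* Toy position-independent block transform standing in for "encrypt one
 * vector register": XOR each byte with a constant. */
static void xform_block(unsigned char *out, const unsigned char *in)
{
        int i;

        for (i = 0; i < BLK; i++)
                out[i] = (unsigned char)(in[i] ^ 0x5a);
}

/* Process len >= BLK bytes in BLK-byte units.  The ragged tail is handled
 * by one extra block ending exactly at 'len'; its store overlaps bytes
 * already written, which is harmless here because the transform is
 * position independent and simply rewrites them with the same values.
 * Assumes dst and src do not alias. */
static void xform_buf_overlap(unsigned char *dst, const unsigned char *src,
                              size_t len)
{
        size_t done;

        for (done = 0; done + BLK <= len; done += BLK)
                xform_block(dst + done, src + done);

        if (done < len)
                xform_block(dst + len - BLK, src + len - BLK);
}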
@@ -619,61 +678,47 @@ AES_FUNC_END(aes_xts_decrypt)
* int blocks, u8 dg[], int enc_before, int enc_after)
*/
AES_FUNC_START(aes_mac_update)
frame_push 6
mov x19, x0
mov x20, x1
mov x21, x2
mov x22, x3
mov x23, x4
mov x24, x6
ld1 {v0.16b}, [x23] /* get dg */
ld1 {v0.16b}, [x4] /* get dg */
enc_prepare w2, x1, x7
cbz w5, .Lmacloop4x
encrypt_block v0, w2, x1, x7, w8
.Lmacloop4x:
subs w22, w22, #4
subs w3, w3, #4
bmi .Lmac1x
ld1 {v1.16b-v4.16b}, [x19], #64 /* get next pt block */
ld1 {v1.16b-v4.16b}, [x0], #64 /* get next pt block */
eor v0.16b, v0.16b, v1.16b /* ..and xor with dg */
encrypt_block v0, w21, x20, x7, w8
encrypt_block v0, w2, x1, x7, w8
eor v0.16b, v0.16b, v2.16b
encrypt_block v0, w21, x20, x7, w8
encrypt_block v0, w2, x1, x7, w8
eor v0.16b, v0.16b, v3.16b
encrypt_block v0, w21, x20, x7, w8
encrypt_block v0, w2, x1, x7, w8
eor v0.16b, v0.16b, v4.16b
cmp w22, wzr
csinv x5, x24, xzr, eq
cmp w3, wzr
csinv x5, x6, xzr, eq
cbz w5, .Lmacout
encrypt_block v0, w21, x20, x7, w8
st1 {v0.16b}, [x23] /* return dg */
cond_yield_neon .Lmacrestart
encrypt_block v0, w2, x1, x7, w8
st1 {v0.16b}, [x4] /* return dg */
cond_yield .Lmacout, x7
b .Lmacloop4x
.Lmac1x:
add w22, w22, #4
add w3, w3, #4
.Lmacloop:
cbz w22, .Lmacout
ld1 {v1.16b}, [x19], #16 /* get next pt block */
cbz w3, .Lmacout
ld1 {v1.16b}, [x0], #16 /* get next pt block */
eor v0.16b, v0.16b, v1.16b /* ..and xor with dg */
subs w22, w22, #1
csinv x5, x24, xzr, eq
subs w3, w3, #1
csinv x5, x6, xzr, eq
cbz w5, .Lmacout
.Lmacenc:
encrypt_block v0, w21, x20, x7, w8
encrypt_block v0, w2, x1, x7, w8
b .Lmacloop
.Lmacout:
st1 {v0.16b}, [x23] /* return dg */
frame_pop
st1 {v0.16b}, [x4] /* return dg */
mov w0, w3
ret
.Lmacrestart:
ld1 {v0.16b}, [x23] /* get dg */
enc_prepare w21, x20, x0
b .Lmacloop4x
AES_FUNC_END(aes_mac_update)


@@ -613,7 +613,6 @@ SYM_FUNC_END(aesbs_decrypt8)
st1 {\o7\().16b}, [x19], #16
cbz x23, 1f
cond_yield_neon
b 99b
1: frame_pop
@@ -715,7 +714,6 @@ SYM_FUNC_START(aesbs_cbc_decrypt)
1: st1 {v24.16b}, [x24] // store IV
cbz x23, 2f
cond_yield_neon
b 99b
2: frame_pop
@@ -801,7 +799,7 @@ SYM_FUNC_END(__xts_crypt8)
mov x23, x4
mov x24, x5
0: movi v30.2s, #0x1
movi v30.2s, #0x1
movi v25.2s, #0x87
uzp1 v30.4s, v30.4s, v25.4s
ld1 {v25.16b}, [x24]
@@ -846,7 +844,6 @@ SYM_FUNC_END(__xts_crypt8)
cbz x23, 1f
st1 {v25.16b}, [x24]
cond_yield_neon 0b
b 99b
1: st1 {v25.16b}, [x24]
@@ -889,7 +886,7 @@ SYM_FUNC_START(aesbs_ctr_encrypt)
cset x26, ne
add x23, x23, x26 // do one extra block if final
98: ldp x7, x8, [x24]
ldp x7, x8, [x24]
ld1 {v0.16b}, [x24]
CPU_LE( rev x7, x7 )
CPU_LE( rev x8, x8 )
@@ -967,7 +964,6 @@ CPU_LE( rev x8, x8 )
st1 {v0.16b}, [x24]
cbz x23, .Lctr_done
cond_yield_neon 98b
b 99b
.Lctr_done:


@@ -68,10 +68,10 @@
.text
.arch armv8-a+crypto
init_crc .req w19
buf .req x20
len .req x21
fold_consts_ptr .req x22
init_crc .req w0
buf .req x1
len .req x2
fold_consts_ptr .req x3
fold_consts .req v10
@@ -257,12 +257,6 @@ CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 )
.endm
.macro crc_t10dif_pmull, p
frame_push 4, 128
mov init_crc, w0
mov buf, x1
mov len, x2
__pmull_init_\p
// For sizes less than 256 bytes, we can't fold 128 bytes at a time.
@@ -317,26 +311,7 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
fold_32_bytes \p, v6, v7
subs len, len, #128
b.lt .Lfold_128_bytes_loop_done_\@
if_will_cond_yield_neon
stp q0, q1, [sp, #.Lframe_local_offset]
stp q2, q3, [sp, #.Lframe_local_offset + 32]
stp q4, q5, [sp, #.Lframe_local_offset + 64]
stp q6, q7, [sp, #.Lframe_local_offset + 96]
do_cond_yield_neon
ldp q0, q1, [sp, #.Lframe_local_offset]
ldp q2, q3, [sp, #.Lframe_local_offset + 32]
ldp q4, q5, [sp, #.Lframe_local_offset + 64]
ldp q6, q7, [sp, #.Lframe_local_offset + 96]
ld1 {fold_consts.2d}, [fold_consts_ptr]
__pmull_init_\p
__pmull_pre_\p fold_consts
endif_yield_neon
b .Lfold_128_bytes_loop_\@
.Lfold_128_bytes_loop_done_\@:
b.ge .Lfold_128_bytes_loop_\@
// Now fold the 112 bytes in v0-v6 into the 16 bytes in v7.
@@ -453,7 +428,9 @@ CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
// Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of v0.
umov w0, v0.h[0]
frame_pop
.ifc \p, p8
ldp x29, x30, [sp], #16
.endif
ret
.Lless_than_256_bytes_\@:
@@ -489,7 +466,9 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
// Assumes len >= 16.
//
SYM_FUNC_START(crc_t10dif_pmull_p8)
crc_t10dif_pmull p8
stp x29, x30, [sp, #-16]!
mov x29, sp
crc_t10dif_pmull p8
SYM_FUNC_END(crc_t10dif_pmull_p8)
.align 5


@@ -37,9 +37,18 @@ static int crct10dif_update_pmull_p8(struct shash_desc *desc, const u8 *data,
u16 *crc = shash_desc_ctx(desc);
if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) {
kernel_neon_begin();
*crc = crc_t10dif_pmull_p8(*crc, data, length);
kernel_neon_end();
do {
unsigned int chunk = length;
if (chunk > SZ_4K + CRC_T10DIF_PMULL_CHUNK_SIZE)
chunk = SZ_4K;
kernel_neon_begin();
*crc = crc_t10dif_pmull_p8(*crc, data, chunk);
kernel_neon_end();
data += chunk;
length -= chunk;
} while (length);
} else {
*crc = crc_t10dif_generic(*crc, data, length);
}
@@ -53,9 +62,18 @@ static int crct10dif_update_pmull_p64(struct shash_desc *desc, const u8 *data,
u16 *crc = shash_desc_ctx(desc);
if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) {
kernel_neon_begin();
*crc = crc_t10dif_pmull_p64(*crc, data, length);
kernel_neon_end();
do {
unsigned int chunk = length;
if (chunk > SZ_4K + CRC_T10DIF_PMULL_CHUNK_SIZE)
chunk = SZ_4K;
kernel_neon_begin();
*crc = crc_t10dif_pmull_p64(*crc, data, chunk);
kernel_neon_end();
data += chunk;
length -= chunk;
} while (length);
} else {
*crc = crc_t10dif_generic(*crc, data, length);
}
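The loop above caps each kernel_neon_begin()/kernel_neon_end() section at 4 KiB so preemption is never disabled across an arbitrarily large buffer, the same pattern other updates in this pull use in place of assembly-level NEON yields. The threshold is SZ_4K + CRC_T10DIF_PMULL_CHUNK_SIZE rather than SZ_4K so the leftover handed to the next iteration can never drop below the 16-byte minimum the PMULL routine assumes. A quick stand-alone check of that invariant (plain C, illustrative):

#include <assert.h>

#define SZ_4K                           4096
#define CRC_T10DIF_PMULL_CHUNK_SIZE     16U

int main(void)
{
        unsigned int length;

        /* For any starting length, the first trimmed chunk never leaves a
         * remainder shorter than CRC_T10DIF_PMULL_CHUNK_SIZE; later
         * iterations repeat the same rule. */
        for (length = CRC_T10DIF_PMULL_CHUNK_SIZE; length < 3 * SZ_4K; length++) {
                unsigned int chunk = length;

                if (chunk > SZ_4K + CRC_T10DIF_PMULL_CHUNK_SIZE)
                        chunk = SZ_4K;

                assert(length - chunk == 0 ||
                       length - chunk >= CRC_T10DIF_PMULL_CHUNK_SIZE);
        }
        return 0;
}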


@@ -62,40 +62,34 @@
.endm
/*
* void sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src,
* int blocks)
* int sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src,
* int blocks)
*/
SYM_FUNC_START(sha1_ce_transform)
frame_push 3
mov x19, x0
mov x20, x1
mov x21, x2
/* load round constants */
0: loadrc k0.4s, 0x5a827999, w6
loadrc k0.4s, 0x5a827999, w6
loadrc k1.4s, 0x6ed9eba1, w6
loadrc k2.4s, 0x8f1bbcdc, w6
loadrc k3.4s, 0xca62c1d6, w6
/* load state */
ld1 {dgav.4s}, [x19]
ldr dgb, [x19, #16]
ld1 {dgav.4s}, [x0]
ldr dgb, [x0, #16]
/* load sha1_ce_state::finalize */
ldr_l w4, sha1_ce_offsetof_finalize, x4
ldr w4, [x19, x4]
ldr w4, [x0, x4]
/* load input */
1: ld1 {v8.4s-v11.4s}, [x20], #64
sub w21, w21, #1
0: ld1 {v8.4s-v11.4s}, [x1], #64
sub w2, w2, #1
CPU_LE( rev32 v8.16b, v8.16b )
CPU_LE( rev32 v9.16b, v9.16b )
CPU_LE( rev32 v10.16b, v10.16b )
CPU_LE( rev32 v11.16b, v11.16b )
2: add t0.4s, v8.4s, k0.4s
1: add t0.4s, v8.4s, k0.4s
mov dg0v.16b, dgav.16b
add_update c, ev, k0, 8, 9, 10, 11, dgb
@@ -126,25 +120,18 @@ CPU_LE( rev32 v11.16b, v11.16b )
add dgbv.2s, dgbv.2s, dg1v.2s
add dgav.4s, dgav.4s, dg0v.4s
cbz w21, 3f
if_will_cond_yield_neon
st1 {dgav.4s}, [x19]
str dgb, [x19, #16]
do_cond_yield_neon
cbz w2, 2f
cond_yield 3f, x5
b 0b
endif_yield_neon
b 1b
/*
* Final block: add padding and total bit count.
* Skip if the input size was not a round multiple of the block size,
* the padding is handled by the C code in that case.
*/
3: cbz x4, 4f
2: cbz x4, 3f
ldr_l w4, sha1_ce_offsetof_count, x4
ldr x4, [x19, x4]
ldr x4, [x0, x4]
movi v9.2d, #0
mov x8, #0x80000000
movi v10.2d, #0
@@ -153,11 +140,11 @@ CPU_LE( rev32 v11.16b, v11.16b )
mov x4, #0
mov v11.d[0], xzr
mov v11.d[1], x7
b 2b
b 1b
/* store new state */
4: st1 {dgav.4s}, [x19]
str dgb, [x19, #16]
frame_pop
3: st1 {dgav.4s}, [x0]
str dgb, [x0, #16]
mov w0, w2
ret
SYM_FUNC_END(sha1_ce_transform)
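With the yield moved out of the assembly, sha1_ce_transform() now returns the number of blocks it did not process (w0 is set from w2 on exit) and relies on its C caller to loop, leaving kernel-mode NEON between calls so a pending reschedule can happen. That glue change is not part of this excerpt; the following is a hedged sketch of the calling pattern, mirroring the aes_mac_update loop shown earlier, with the wrapper name invented for illustration:

#include <asm/neon.h>
#include <crypto/sha1.h>
#include <linux/linkage.h>
#include <linux/types.h>

struct sha1_ce_state;   /* defined in the sha1-ce glue code */

asmlinkage int sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src,
                                 int blocks);

/* Hypothetical wrapper: keep calling the assembly until it reports zero
 * unprocessed blocks, dropping out of kernel-mode NEON between calls. */
static void demo_sha1_ce_do_blocks(struct sha1_ce_state *sst, const u8 *src,
                                   int blocks)
{
        while (blocks) {
                int rem;

                kernel_neon_begin();
                rem = sha1_ce_transform(sst, src, blocks);
                kernel_neon_end();

                src += (blocks - rem) * SHA1_BLOCK_SIZE;
                blocks = rem;
        }
}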

Some files were not shown because too many files have changed in this diff.