Backported ARM/NEON optimized crypto routines from kernel 3.13.y

This commit is contained in:
Omonar
2017-10-12 21:41:45 +03:00
parent e4daaacfe7
commit 6a01e2ff18
17 changed files with 5202 additions and 1 deletions

View File

@@ -2501,6 +2501,13 @@ config NEON
Say Y to include support code for NEON, the ARMv7 Advanced SIMD
Extension.
config KERNEL_MODE_NEON
bool "Support for NEON in kernel mode"
default n
depends on NEON
help
Say Y to include support for NEON in kernel mode.
endmenu
menu "Userspace binary formats"

View File

@@ -100,7 +100,7 @@ tune-$(CONFIG_CPU_V6) :=$(call cc-option,-mtune=arm1136j-s,-mtune=strongarm)
tune-$(CONFIG_CPU_V6K) :=$(call cc-option,-mtune=arm1136j-s,-mtune=strongarm)
ifeq ($(CONFIG_AEABI),y)
CFLAGS_ABI :=-mabi=aapcs-linux -mno-thumb-interwork
CFLAGS_ABI :=-mabi=aapcs-linux -mno-thumb-interwork -mfpu=vfp
else
CFLAGS_ABI :=$(call cc-option,-mapcs-32,-mabi=apcs-gnu) $(call cc-option,-mno-thumb-interwork,)
endif
@@ -118,6 +118,9 @@ AFLAGS_THUMB2 :=$(CFLAGS_THUMB2) -Wa$(comma)-mthumb
ifeq ($(CONFIG_THUMB2_AVOID_R_ARM_THM_JUMP11),y)
CFLAGS_MODULE +=-fno-optimize-sibling-calls
endif
else
CFLAGS_ISA :=$(call cc-option,-marm,)
AFLAGS_ISA :=$(CFLAGS_ISA)
endif
# Need -Uarm for gcc < 3.x
@@ -263,6 +266,7 @@ core-$(CONFIG_VFP) += arch/arm/vfp/
# If we have a machine-specific directory, then include it in the build.
core-y += arch/arm/kernel/ arch/arm/mm/ arch/arm/common/
core-y += arch/arm/net/
core-y += arch/arm/crypto/
core-y += $(machdirs) $(platdirs)
drivers-$(CONFIG_OPROFILE) += arch/arm/oprofile/

11
arch/arm/crypto/Makefile Normal file
View File

@@ -0,0 +1,11 @@
#
# Arch-specific CryptoAPI modules.
#
obj-$(CONFIG_CRYPTO_AES_ARM) += aes-arm.o
obj-$(CONFIG_CRYPTO_AES_ARM_BS) += aes-arm-bs.o
obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o
aes-arm-y := aes-armv4.o aes_glue.o
aes-arm-bs-y := aesbs-core.o aesbs-glue.o
sha1-arm-y := sha1-armv4-large.o sha1_glue.o

1088
arch/arm/crypto/aes-armv4.S Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,98 @@
/*
* Glue Code for the asm optimized version of the AES Cipher Algorithm
*/
#include <linux/module.h>
#include <linux/crypto.h>
#include <crypto/aes.h>
#include "aes_glue.h"
EXPORT_SYMBOL(AES_encrypt);
EXPORT_SYMBOL(AES_decrypt);
EXPORT_SYMBOL(private_AES_set_encrypt_key);
EXPORT_SYMBOL(private_AES_set_decrypt_key);
static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
{
struct AES_CTX *ctx = crypto_tfm_ctx(tfm);
AES_encrypt(src, dst, &ctx->enc_key);
}
static void aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
{
struct AES_CTX *ctx = crypto_tfm_ctx(tfm);
AES_decrypt(src, dst, &ctx->dec_key);
}
static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key,
unsigned int key_len)
{
struct AES_CTX *ctx = crypto_tfm_ctx(tfm);
switch (key_len) {
case AES_KEYSIZE_128:
key_len = 128;
break;
case AES_KEYSIZE_192:
key_len = 192;
break;
case AES_KEYSIZE_256:
key_len = 256;
break;
default:
tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
return -EINVAL;
}
if (private_AES_set_encrypt_key(in_key, key_len, &ctx->enc_key) == -1) {
tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
return -EINVAL;
}
/* private_AES_set_decrypt_key expects an encryption key as input */
ctx->dec_key = ctx->enc_key;
if (private_AES_set_decrypt_key(in_key, key_len, &ctx->dec_key) == -1) {
tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
return -EINVAL;
}
return 0;
}
static struct crypto_alg aes_alg = {
.cra_name = "aes",
.cra_driver_name = "aes-asm",
.cra_priority = 200,
.cra_flags = CRYPTO_ALG_TYPE_CIPHER,
.cra_blocksize = AES_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct AES_CTX),
.cra_module = THIS_MODULE,
.cra_list = LIST_HEAD_INIT(aes_alg.cra_list),
.cra_u = {
.cipher = {
.cia_min_keysize = AES_MIN_KEY_SIZE,
.cia_max_keysize = AES_MAX_KEY_SIZE,
.cia_setkey = aes_set_key,
.cia_encrypt = aes_encrypt,
.cia_decrypt = aes_decrypt
}
}
};
static int __init aes_init(void)
{
return crypto_register_alg(&aes_alg);
}
static void __exit aes_fini(void)
{
crypto_unregister_alg(&aes_alg);
}
module_init(aes_init);
module_exit(aes_fini);
MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm (ASM)");
MODULE_LICENSE("GPL");
MODULE_ALIAS("aes");
MODULE_ALIAS("aes-asm");
MODULE_AUTHOR("David McCullough <ucdevel@gmail.com>");

View File

@@ -0,0 +1,19 @@
#define AES_MAXNR 14
struct AES_KEY {
unsigned int rd_key[4 * (AES_MAXNR + 1)];
int rounds;
};
struct AES_CTX {
struct AES_KEY enc_key;
struct AES_KEY dec_key;
};
asmlinkage void AES_encrypt(const u8 *in, u8 *out, struct AES_KEY *ctx);
asmlinkage void AES_decrypt(const u8 *in, u8 *out, struct AES_KEY *ctx);
asmlinkage int private_AES_set_decrypt_key(const unsigned char *userKey,
const int bits, struct AES_KEY *key);
asmlinkage int private_AES_set_encrypt_key(const unsigned char *userKey,
const int bits, struct AES_KEY *key);

2532
arch/arm/crypto/aesbs-core.S Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,434 @@
/*
* linux/arch/arm/crypto/aesbs-glue.c - glue code for NEON bit sliced AES
*
* Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
#include <asm/neon.h>
#include <crypto/aes.h>
#include <crypto/ablk_helper.h>
#include <crypto/algapi.h>
#include <linux/module.h>
#include "aes_glue.h"
#define BIT_SLICED_KEY_MAXSIZE (128 * (AES_MAXNR - 1) + 2 * AES_BLOCK_SIZE)
struct BS_KEY {
struct AES_KEY rk;
int converted;
u8 __aligned(8) bs[BIT_SLICED_KEY_MAXSIZE];
} __aligned(8);
asmlinkage void bsaes_enc_key_convert(u8 out[], struct AES_KEY const *in);
asmlinkage void bsaes_dec_key_convert(u8 out[], struct AES_KEY const *in);
asmlinkage void bsaes_cbc_encrypt(u8 const in[], u8 out[], u32 bytes,
struct BS_KEY *key, u8 iv[]);
asmlinkage void bsaes_ctr32_encrypt_blocks(u8 const in[], u8 out[], u32 blocks,
struct BS_KEY *key, u8 const iv[]);
asmlinkage void bsaes_xts_encrypt(u8 const in[], u8 out[], u32 bytes,
struct BS_KEY *key, u8 tweak[]);
asmlinkage void bsaes_xts_decrypt(u8 const in[], u8 out[], u32 bytes,
struct BS_KEY *key, u8 tweak[]);
struct aesbs_cbc_ctx {
struct AES_KEY enc;
struct BS_KEY dec;
};
struct aesbs_ctr_ctx {
struct BS_KEY enc;
};
struct aesbs_xts_ctx {
struct BS_KEY enc;
struct BS_KEY dec;
struct AES_KEY twkey;
};
static int aesbs_cbc_set_key(struct crypto_tfm *tfm, const u8 *in_key,
unsigned int key_len)
{
struct aesbs_cbc_ctx *ctx = crypto_tfm_ctx(tfm);
int bits = key_len * 8;
if (private_AES_set_encrypt_key(in_key, bits, &ctx->enc)) {
tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
return -EINVAL;
}
ctx->dec.rk = ctx->enc;
private_AES_set_decrypt_key(in_key, bits, &ctx->dec.rk);
ctx->dec.converted = 0;
return 0;
}
static int aesbs_ctr_set_key(struct crypto_tfm *tfm, const u8 *in_key,
unsigned int key_len)
{
struct aesbs_ctr_ctx *ctx = crypto_tfm_ctx(tfm);
int bits = key_len * 8;
if (private_AES_set_encrypt_key(in_key, bits, &ctx->enc.rk)) {
tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
return -EINVAL;
}
ctx->enc.converted = 0;
return 0;
}
static int aesbs_xts_set_key(struct crypto_tfm *tfm, const u8 *in_key,
unsigned int key_len)
{
struct aesbs_xts_ctx *ctx = crypto_tfm_ctx(tfm);
int bits = key_len * 4;
if (private_AES_set_encrypt_key(in_key, bits, &ctx->enc.rk)) {
tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
return -EINVAL;
}
ctx->dec.rk = ctx->enc.rk;
private_AES_set_decrypt_key(in_key, bits, &ctx->dec.rk);
private_AES_set_encrypt_key(in_key + key_len / 2, bits, &ctx->twkey);
ctx->enc.converted = ctx->dec.converted = 0;
return 0;
}
static int aesbs_cbc_encrypt(struct blkcipher_desc *desc,
struct scatterlist *dst,
struct scatterlist *src, unsigned int nbytes)
{
struct aesbs_cbc_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
struct blkcipher_walk walk;
int err;
blkcipher_walk_init(&walk, dst, src, nbytes);
err = blkcipher_walk_virt(desc, &walk);
while (walk.nbytes) {
u32 blocks = walk.nbytes / AES_BLOCK_SIZE;
u8 *src = walk.src.virt.addr;
if (walk.dst.virt.addr == walk.src.virt.addr) {
u8 *iv = walk.iv;
do {
crypto_xor(src, iv, AES_BLOCK_SIZE);
AES_encrypt(src, src, &ctx->enc);
iv = src;
src += AES_BLOCK_SIZE;
} while (--blocks);
memcpy(walk.iv, iv, AES_BLOCK_SIZE);
} else {
u8 *dst = walk.dst.virt.addr;
do {
crypto_xor(walk.iv, src, AES_BLOCK_SIZE);
AES_encrypt(walk.iv, dst, &ctx->enc);
memcpy(walk.iv, dst, AES_BLOCK_SIZE);
src += AES_BLOCK_SIZE;
dst += AES_BLOCK_SIZE;
} while (--blocks);
}
err = blkcipher_walk_done(desc, &walk, 0);
}
return err;
}
static int aesbs_cbc_decrypt(struct blkcipher_desc *desc,
struct scatterlist *dst,
struct scatterlist *src, unsigned int nbytes)
{
struct aesbs_cbc_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
struct blkcipher_walk walk;
int err;
blkcipher_walk_init(&walk, dst, src, nbytes);
err = blkcipher_walk_virt_block(desc, &walk, 8 * AES_BLOCK_SIZE);
while ((walk.nbytes / AES_BLOCK_SIZE) >= 8) {
kernel_neon_begin();
bsaes_cbc_encrypt(walk.src.virt.addr, walk.dst.virt.addr,
walk.nbytes, &ctx->dec, walk.iv);
kernel_neon_end();
err = blkcipher_walk_done(desc, &walk, 0);
}
while (walk.nbytes) {
u32 blocks = walk.nbytes / AES_BLOCK_SIZE;
u8 *dst = walk.dst.virt.addr;
u8 *src = walk.src.virt.addr;
u8 bk[2][AES_BLOCK_SIZE];
u8 *iv = walk.iv;
do {
if (walk.dst.virt.addr == walk.src.virt.addr)
memcpy(bk[blocks & 1], src, AES_BLOCK_SIZE);
AES_decrypt(src, dst, &ctx->dec.rk);
crypto_xor(dst, iv, AES_BLOCK_SIZE);
if (walk.dst.virt.addr == walk.src.virt.addr)
iv = bk[blocks & 1];
else
iv = src;
dst += AES_BLOCK_SIZE;
src += AES_BLOCK_SIZE;
} while (--blocks);
err = blkcipher_walk_done(desc, &walk, 0);
}
return err;
}
static void inc_be128_ctr(__be32 ctr[], u32 addend)
{
int i;
for (i = 3; i >= 0; i--, addend = 1) {
u32 n = be32_to_cpu(ctr[i]) + addend;
ctr[i] = cpu_to_be32(n);
if (n >= addend)
break;
}
}
static int aesbs_ctr_encrypt(struct blkcipher_desc *desc,
struct scatterlist *dst, struct scatterlist *src,
unsigned int nbytes)
{
struct aesbs_ctr_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
struct blkcipher_walk walk;
u32 blocks;
int err;
blkcipher_walk_init(&walk, dst, src, nbytes);
err = blkcipher_walk_virt_block(desc, &walk, 8 * AES_BLOCK_SIZE);
while ((blocks = walk.nbytes / AES_BLOCK_SIZE)) {
u32 tail = walk.nbytes % AES_BLOCK_SIZE;
__be32 *ctr = (__be32 *)walk.iv;
u32 headroom = UINT_MAX - be32_to_cpu(ctr[3]);
/* avoid 32 bit counter overflow in the NEON code */
if (unlikely(headroom < blocks)) {
blocks = headroom + 1;
tail = walk.nbytes - blocks * AES_BLOCK_SIZE;
}
kernel_neon_begin();
bsaes_ctr32_encrypt_blocks(walk.src.virt.addr,
walk.dst.virt.addr, blocks,
&ctx->enc, walk.iv);
kernel_neon_end();
inc_be128_ctr(ctr, blocks);
nbytes -= blocks * AES_BLOCK_SIZE;
if (nbytes && nbytes == tail && nbytes <= AES_BLOCK_SIZE)
break;
err = blkcipher_walk_done(desc, &walk, tail);
}
if (walk.nbytes) {
u8 *tdst = walk.dst.virt.addr + blocks * AES_BLOCK_SIZE;
u8 *tsrc = walk.src.virt.addr + blocks * AES_BLOCK_SIZE;
u8 ks[AES_BLOCK_SIZE];
AES_encrypt(walk.iv, ks, &ctx->enc.rk);
if (tdst != tsrc)
memcpy(tdst, tsrc, nbytes);
crypto_xor(tdst, ks, nbytes);
err = blkcipher_walk_done(desc, &walk, 0);
}
return err;
}
static int aesbs_xts_encrypt(struct blkcipher_desc *desc,
struct scatterlist *dst,
struct scatterlist *src, unsigned int nbytes)
{
struct aesbs_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
struct blkcipher_walk walk;
int err;
blkcipher_walk_init(&walk, dst, src, nbytes);
err = blkcipher_walk_virt_block(desc, &walk, 8 * AES_BLOCK_SIZE);
/* generate the initial tweak */
AES_encrypt(walk.iv, walk.iv, &ctx->twkey);
while (walk.nbytes) {
kernel_neon_begin();
bsaes_xts_encrypt(walk.src.virt.addr, walk.dst.virt.addr,
walk.nbytes, &ctx->enc, walk.iv);
kernel_neon_end();
err = blkcipher_walk_done(desc, &walk, 0);
}
return err;
}
static int aesbs_xts_decrypt(struct blkcipher_desc *desc,
struct scatterlist *dst,
struct scatterlist *src, unsigned int nbytes)
{
struct aesbs_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
struct blkcipher_walk walk;
int err;
blkcipher_walk_init(&walk, dst, src, nbytes);
err = blkcipher_walk_virt_block(desc, &walk, 8 * AES_BLOCK_SIZE);
/* generate the initial tweak */
AES_encrypt(walk.iv, walk.iv, &ctx->twkey);
while (walk.nbytes) {
kernel_neon_begin();
bsaes_xts_decrypt(walk.src.virt.addr, walk.dst.virt.addr,
walk.nbytes, &ctx->dec, walk.iv);
kernel_neon_end();
err = blkcipher_walk_done(desc, &walk, 0);
}
return err;
}
static struct crypto_alg aesbs_algs[] = { {
.cra_name = "__cbc-aes-neonbs",
.cra_driver_name = "__driver-cbc-aes-neonbs",
.cra_priority = 0,
.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
.cra_blocksize = AES_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct aesbs_cbc_ctx),
.cra_alignmask = 7,
.cra_type = &crypto_blkcipher_type,
.cra_module = THIS_MODULE,
.cra_blkcipher = {
.min_keysize = AES_MIN_KEY_SIZE,
.max_keysize = AES_MAX_KEY_SIZE,
.ivsize = AES_BLOCK_SIZE,
.setkey = aesbs_cbc_set_key,
.encrypt = aesbs_cbc_encrypt,
.decrypt = aesbs_cbc_decrypt,
},
}, {
.cra_name = "__ctr-aes-neonbs",
.cra_driver_name = "__driver-ctr-aes-neonbs",
.cra_priority = 0,
.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
.cra_blocksize = 1,
.cra_ctxsize = sizeof(struct aesbs_ctr_ctx),
.cra_alignmask = 7,
.cra_type = &crypto_blkcipher_type,
.cra_module = THIS_MODULE,
.cra_blkcipher = {
.min_keysize = AES_MIN_KEY_SIZE,
.max_keysize = AES_MAX_KEY_SIZE,
.ivsize = AES_BLOCK_SIZE,
.setkey = aesbs_ctr_set_key,
.encrypt = aesbs_ctr_encrypt,
.decrypt = aesbs_ctr_encrypt,
},
}, {
.cra_name = "__xts-aes-neonbs",
.cra_driver_name = "__driver-xts-aes-neonbs",
.cra_priority = 0,
.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
.cra_blocksize = AES_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct aesbs_xts_ctx),
.cra_alignmask = 7,
.cra_type = &crypto_blkcipher_type,
.cra_module = THIS_MODULE,
.cra_blkcipher = {
.min_keysize = 2 * AES_MIN_KEY_SIZE,
.max_keysize = 2 * AES_MAX_KEY_SIZE,
.ivsize = AES_BLOCK_SIZE,
.setkey = aesbs_xts_set_key,
.encrypt = aesbs_xts_encrypt,
.decrypt = aesbs_xts_decrypt,
},
}, {
.cra_name = "cbc(aes)",
.cra_driver_name = "cbc-aes-neonbs",
.cra_priority = 300,
.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
.cra_blocksize = AES_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct async_helper_ctx),
.cra_alignmask = 7,
.cra_type = &crypto_ablkcipher_type,
.cra_module = THIS_MODULE,
.cra_init = ablk_init,
.cra_exit = ablk_exit,
.cra_ablkcipher = {
.min_keysize = AES_MIN_KEY_SIZE,
.max_keysize = AES_MAX_KEY_SIZE,
.ivsize = AES_BLOCK_SIZE,
.setkey = ablk_set_key,
.encrypt = __ablk_encrypt,
.decrypt = ablk_decrypt,
}
}, {
.cra_name = "ctr(aes)",
.cra_driver_name = "ctr-aes-neonbs",
.cra_priority = 300,
.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
.cra_blocksize = 1,
.cra_ctxsize = sizeof(struct async_helper_ctx),
.cra_alignmask = 7,
.cra_type = &crypto_ablkcipher_type,
.cra_module = THIS_MODULE,
.cra_init = ablk_init,
.cra_exit = ablk_exit,
.cra_ablkcipher = {
.min_keysize = AES_MIN_KEY_SIZE,
.max_keysize = AES_MAX_KEY_SIZE,
.ivsize = AES_BLOCK_SIZE,
.setkey = ablk_set_key,
.encrypt = ablk_encrypt,
.decrypt = ablk_decrypt,
}
}, {
.cra_name = "xts(aes)",
.cra_driver_name = "xts-aes-neonbs",
.cra_priority = 300,
.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
.cra_blocksize = AES_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct async_helper_ctx),
.cra_alignmask = 7,
.cra_type = &crypto_ablkcipher_type,
.cra_module = THIS_MODULE,
.cra_init = ablk_init,
.cra_exit = ablk_exit,
.cra_ablkcipher = {
.min_keysize = 2 * AES_MIN_KEY_SIZE,
.max_keysize = 2 * AES_MAX_KEY_SIZE,
.ivsize = AES_BLOCK_SIZE,
.setkey = ablk_set_key,
.encrypt = ablk_encrypt,
.decrypt = ablk_decrypt,
}
} };
static int __init aesbs_mod_init(void)
{
if (!cpu_has_neon())
return -ENODEV;
return crypto_register_algs(aesbs_algs, ARRAY_SIZE(aesbs_algs));
}
static void __exit aesbs_mod_exit(void)
{
crypto_unregister_algs(aesbs_algs, ARRAY_SIZE(aesbs_algs));
}
module_init(aesbs_mod_init);
module_exit(aesbs_mod_exit);
MODULE_DESCRIPTION("Bit sliced AES in CBC/CTR/XTS modes using NEON");
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
MODULE_LICENSE("GPL");

View File

@@ -0,0 +1,497 @@
#define __ARM_ARCH__ __LINUX_ARM_ARCH__
@ ====================================================================
@ Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
@ project. The module is, however, dual licensed under OpenSSL and
@ CRYPTOGAMS licenses depending on where you obtain it. For further
@ details see http://www.openssl.org/~appro/cryptogams/.
@ ====================================================================
@ sha1_block procedure for ARMv4.
@
@ January 2007.
@ Size/performance trade-off
@ ====================================================================
@ impl size in bytes comp cycles[*] measured performance
@ ====================================================================
@ thumb 304 3212 4420
@ armv4-small 392/+29% 1958/+64% 2250/+96%
@ armv4-compact 740/+89% 1552/+26% 1840/+22%
@ armv4-large 1420/+92% 1307/+19% 1370/+34%[***]
@ full unroll ~5100/+260% ~1260/+4% ~1300/+5%
@ ====================================================================
@ thumb = same as 'small' but in Thumb instructions[**] and
@ with recurring code in two private functions;
@ small = detached Xload/update, loops are folded;
@ compact = detached Xload/update, 5x unroll;
@ large = interleaved Xload/update, 5x unroll;
@ full unroll = interleaved Xload/update, full unroll, estimated[!];
@
@ [*] Manually counted instructions in "grand" loop body. Measured
@ performance is affected by prologue and epilogue overhead,
@ i-cache availability, branch penalties, etc.
@ [**] While each Thumb instruction is twice smaller, they are not as
@ diverse as ARM ones: e.g., there are only two arithmetic
@ instructions with 3 arguments, no [fixed] rotate, addressing
@ modes are limited. As result it takes more instructions to do
@ the same job in Thumb, therefore the code is never twice as
@ small and always slower.
@ [***] which is also ~35% better than compiler generated code. Dual-
@ issue Cortex A8 core was measured to process input block in
@ ~990 cycles.
@ August 2010.
@
@ Rescheduling for dual-issue pipeline resulted in 13% improvement on
@ Cortex A8 core and in absolute terms ~870 cycles per input block
@ [or 13.6 cycles per byte].
@ February 2011.
@
@ Profiler-assisted and platform-specific optimization resulted in 10%
@ improvement on Cortex A8 core and 12.2 cycles per byte.
#include <linux/linkage.h>
.text
.align 2
ENTRY(sha1_block_data_order)
stmdb sp!,{r4-r12,lr}
add r2,r1,r2,lsl#6 @ r2 to point at the end of r1
ldmia r0,{r3,r4,r5,r6,r7}
.Lloop:
ldr r8,.LK_00_19
mov r14,sp
sub sp,sp,#15*4
mov r5,r5,ror#30
mov r6,r6,ror#30
mov r7,r7,ror#30 @ [6]
.L_00_15:
#if __ARM_ARCH__<7
ldrb r10,[r1,#2]
ldrb r9,[r1,#3]
ldrb r11,[r1,#1]
add r7,r8,r7,ror#2 @ E+=K_00_19
ldrb r12,[r1],#4
orr r9,r9,r10,lsl#8
eor r10,r5,r6 @ F_xx_xx
orr r9,r9,r11,lsl#16
add r7,r7,r3,ror#27 @ E+=ROR(A,27)
orr r9,r9,r12,lsl#24
#else
ldr r9,[r1],#4 @ handles unaligned
add r7,r8,r7,ror#2 @ E+=K_00_19
eor r10,r5,r6 @ F_xx_xx
add r7,r7,r3,ror#27 @ E+=ROR(A,27)
#ifdef __ARMEL__
rev r9,r9 @ byte swap
#endif
#endif
and r10,r4,r10,ror#2
add r7,r7,r9 @ E+=X[i]
eor r10,r10,r6,ror#2 @ F_00_19(B,C,D)
str r9,[r14,#-4]!
add r7,r7,r10 @ E+=F_00_19(B,C,D)
#if __ARM_ARCH__<7
ldrb r10,[r1,#2]
ldrb r9,[r1,#3]
ldrb r11,[r1,#1]
add r6,r8,r6,ror#2 @ E+=K_00_19
ldrb r12,[r1],#4
orr r9,r9,r10,lsl#8
eor r10,r4,r5 @ F_xx_xx
orr r9,r9,r11,lsl#16
add r6,r6,r7,ror#27 @ E+=ROR(A,27)
orr r9,r9,r12,lsl#24
#else
ldr r9,[r1],#4 @ handles unaligned
add r6,r8,r6,ror#2 @ E+=K_00_19
eor r10,r4,r5 @ F_xx_xx
add r6,r6,r7,ror#27 @ E+=ROR(A,27)
#ifdef __ARMEL__
rev r9,r9 @ byte swap
#endif
#endif
and r10,r3,r10,ror#2
add r6,r6,r9 @ E+=X[i]
eor r10,r10,r5,ror#2 @ F_00_19(B,C,D)
str r9,[r14,#-4]!
add r6,r6,r10 @ E+=F_00_19(B,C,D)
#if __ARM_ARCH__<7
ldrb r10,[r1,#2]
ldrb r9,[r1,#3]
ldrb r11,[r1,#1]
add r5,r8,r5,ror#2 @ E+=K_00_19
ldrb r12,[r1],#4
orr r9,r9,r10,lsl#8
eor r10,r3,r4 @ F_xx_xx
orr r9,r9,r11,lsl#16
add r5,r5,r6,ror#27 @ E+=ROR(A,27)
orr r9,r9,r12,lsl#24
#else
ldr r9,[r1],#4 @ handles unaligned
add r5,r8,r5,ror#2 @ E+=K_00_19
eor r10,r3,r4 @ F_xx_xx
add r5,r5,r6,ror#27 @ E+=ROR(A,27)
#ifdef __ARMEL__
rev r9,r9 @ byte swap
#endif
#endif
and r10,r7,r10,ror#2
add r5,r5,r9 @ E+=X[i]
eor r10,r10,r4,ror#2 @ F_00_19(B,C,D)
str r9,[r14,#-4]!
add r5,r5,r10 @ E+=F_00_19(B,C,D)
#if __ARM_ARCH__<7
ldrb r10,[r1,#2]
ldrb r9,[r1,#3]
ldrb r11,[r1,#1]
add r4,r8,r4,ror#2 @ E+=K_00_19
ldrb r12,[r1],#4
orr r9,r9,r10,lsl#8
eor r10,r7,r3 @ F_xx_xx
orr r9,r9,r11,lsl#16
add r4,r4,r5,ror#27 @ E+=ROR(A,27)
orr r9,r9,r12,lsl#24
#else
ldr r9,[r1],#4 @ handles unaligned
add r4,r8,r4,ror#2 @ E+=K_00_19
eor r10,r7,r3 @ F_xx_xx
add r4,r4,r5,ror#27 @ E+=ROR(A,27)
#ifdef __ARMEL__
rev r9,r9 @ byte swap
#endif
#endif
and r10,r6,r10,ror#2
add r4,r4,r9 @ E+=X[i]
eor r10,r10,r3,ror#2 @ F_00_19(B,C,D)
str r9,[r14,#-4]!
add r4,r4,r10 @ E+=F_00_19(B,C,D)
#if __ARM_ARCH__<7
ldrb r10,[r1,#2]
ldrb r9,[r1,#3]
ldrb r11,[r1,#1]
add r3,r8,r3,ror#2 @ E+=K_00_19
ldrb r12,[r1],#4
orr r9,r9,r10,lsl#8
eor r10,r6,r7 @ F_xx_xx
orr r9,r9,r11,lsl#16
add r3,r3,r4,ror#27 @ E+=ROR(A,27)
orr r9,r9,r12,lsl#24
#else
ldr r9,[r1],#4 @ handles unaligned
add r3,r8,r3,ror#2 @ E+=K_00_19
eor r10,r6,r7 @ F_xx_xx
add r3,r3,r4,ror#27 @ E+=ROR(A,27)
#ifdef __ARMEL__
rev r9,r9 @ byte swap
#endif
#endif
and r10,r5,r10,ror#2
add r3,r3,r9 @ E+=X[i]
eor r10,r10,r7,ror#2 @ F_00_19(B,C,D)
str r9,[r14,#-4]!
add r3,r3,r10 @ E+=F_00_19(B,C,D)
cmp r14,sp
bne .L_00_15 @ [((11+4)*5+2)*3]
sub sp,sp,#25*4
#if __ARM_ARCH__<7
ldrb r10,[r1,#2]
ldrb r9,[r1,#3]
ldrb r11,[r1,#1]
add r7,r8,r7,ror#2 @ E+=K_00_19
ldrb r12,[r1],#4
orr r9,r9,r10,lsl#8
eor r10,r5,r6 @ F_xx_xx
orr r9,r9,r11,lsl#16
add r7,r7,r3,ror#27 @ E+=ROR(A,27)
orr r9,r9,r12,lsl#24
#else
ldr r9,[r1],#4 @ handles unaligned
add r7,r8,r7,ror#2 @ E+=K_00_19
eor r10,r5,r6 @ F_xx_xx
add r7,r7,r3,ror#27 @ E+=ROR(A,27)
#ifdef __ARMEL__
rev r9,r9 @ byte swap
#endif
#endif
and r10,r4,r10,ror#2
add r7,r7,r9 @ E+=X[i]
eor r10,r10,r6,ror#2 @ F_00_19(B,C,D)
str r9,[r14,#-4]!
add r7,r7,r10 @ E+=F_00_19(B,C,D)
ldr r9,[r14,#15*4]
ldr r10,[r14,#13*4]
ldr r11,[r14,#7*4]
add r6,r8,r6,ror#2 @ E+=K_xx_xx
ldr r12,[r14,#2*4]
eor r9,r9,r10
eor r11,r11,r12 @ 1 cycle stall
eor r10,r4,r5 @ F_xx_xx
mov r9,r9,ror#31
add r6,r6,r7,ror#27 @ E+=ROR(A,27)
eor r9,r9,r11,ror#31
str r9,[r14,#-4]!
and r10,r3,r10,ror#2 @ F_xx_xx
@ F_xx_xx
add r6,r6,r9 @ E+=X[i]
eor r10,r10,r5,ror#2 @ F_00_19(B,C,D)
add r6,r6,r10 @ E+=F_00_19(B,C,D)
ldr r9,[r14,#15*4]
ldr r10,[r14,#13*4]
ldr r11,[r14,#7*4]
add r5,r8,r5,ror#2 @ E+=K_xx_xx
ldr r12,[r14,#2*4]
eor r9,r9,r10
eor r11,r11,r12 @ 1 cycle stall
eor r10,r3,r4 @ F_xx_xx
mov r9,r9,ror#31
add r5,r5,r6,ror#27 @ E+=ROR(A,27)
eor r9,r9,r11,ror#31
str r9,[r14,#-4]!
and r10,r7,r10,ror#2 @ F_xx_xx
@ F_xx_xx
add r5,r5,r9 @ E+=X[i]
eor r10,r10,r4,ror#2 @ F_00_19(B,C,D)
add r5,r5,r10 @ E+=F_00_19(B,C,D)
ldr r9,[r14,#15*4]
ldr r10,[r14,#13*4]
ldr r11,[r14,#7*4]
add r4,r8,r4,ror#2 @ E+=K_xx_xx
ldr r12,[r14,#2*4]
eor r9,r9,r10
eor r11,r11,r12 @ 1 cycle stall
eor r10,r7,r3 @ F_xx_xx
mov r9,r9,ror#31
add r4,r4,r5,ror#27 @ E+=ROR(A,27)
eor r9,r9,r11,ror#31
str r9,[r14,#-4]!
and r10,r6,r10,ror#2 @ F_xx_xx
@ F_xx_xx
add r4,r4,r9 @ E+=X[i]
eor r10,r10,r3,ror#2 @ F_00_19(B,C,D)
add r4,r4,r10 @ E+=F_00_19(B,C,D)
ldr r9,[r14,#15*4]
ldr r10,[r14,#13*4]
ldr r11,[r14,#7*4]
add r3,r8,r3,ror#2 @ E+=K_xx_xx
ldr r12,[r14,#2*4]
eor r9,r9,r10
eor r11,r11,r12 @ 1 cycle stall
eor r10,r6,r7 @ F_xx_xx
mov r9,r9,ror#31
add r3,r3,r4,ror#27 @ E+=ROR(A,27)
eor r9,r9,r11,ror#31
str r9,[r14,#-4]!
and r10,r5,r10,ror#2 @ F_xx_xx
@ F_xx_xx
add r3,r3,r9 @ E+=X[i]
eor r10,r10,r7,ror#2 @ F_00_19(B,C,D)
add r3,r3,r10 @ E+=F_00_19(B,C,D)
ldr r8,.LK_20_39 @ [+15+16*4]
cmn sp,#0 @ [+3], clear carry to denote 20_39
.L_20_39_or_60_79:
ldr r9,[r14,#15*4]
ldr r10,[r14,#13*4]
ldr r11,[r14,#7*4]
add r7,r8,r7,ror#2 @ E+=K_xx_xx
ldr r12,[r14,#2*4]
eor r9,r9,r10
eor r11,r11,r12 @ 1 cycle stall
eor r10,r5,r6 @ F_xx_xx
mov r9,r9,ror#31
add r7,r7,r3,ror#27 @ E+=ROR(A,27)
eor r9,r9,r11,ror#31
str r9,[r14,#-4]!
eor r10,r4,r10,ror#2 @ F_xx_xx
@ F_xx_xx
add r7,r7,r9 @ E+=X[i]
add r7,r7,r10 @ E+=F_20_39(B,C,D)
ldr r9,[r14,#15*4]
ldr r10,[r14,#13*4]
ldr r11,[r14,#7*4]
add r6,r8,r6,ror#2 @ E+=K_xx_xx
ldr r12,[r14,#2*4]
eor r9,r9,r10
eor r11,r11,r12 @ 1 cycle stall
eor r10,r4,r5 @ F_xx_xx
mov r9,r9,ror#31
add r6,r6,r7,ror#27 @ E+=ROR(A,27)
eor r9,r9,r11,ror#31
str r9,[r14,#-4]!
eor r10,r3,r10,ror#2 @ F_xx_xx
@ F_xx_xx
add r6,r6,r9 @ E+=X[i]
add r6,r6,r10 @ E+=F_20_39(B,C,D)
ldr r9,[r14,#15*4]
ldr r10,[r14,#13*4]
ldr r11,[r14,#7*4]
add r5,r8,r5,ror#2 @ E+=K_xx_xx
ldr r12,[r14,#2*4]
eor r9,r9,r10
eor r11,r11,r12 @ 1 cycle stall
eor r10,r3,r4 @ F_xx_xx
mov r9,r9,ror#31
add r5,r5,r6,ror#27 @ E+=ROR(A,27)
eor r9,r9,r11,ror#31
str r9,[r14,#-4]!
eor r10,r7,r10,ror#2 @ F_xx_xx
@ F_xx_xx
add r5,r5,r9 @ E+=X[i]
add r5,r5,r10 @ E+=F_20_39(B,C,D)
ldr r9,[r14,#15*4]
ldr r10,[r14,#13*4]
ldr r11,[r14,#7*4]
add r4,r8,r4,ror#2 @ E+=K_xx_xx
ldr r12,[r14,#2*4]
eor r9,r9,r10
eor r11,r11,r12 @ 1 cycle stall
eor r10,r7,r3 @ F_xx_xx
mov r9,r9,ror#31
add r4,r4,r5,ror#27 @ E+=ROR(A,27)
eor r9,r9,r11,ror#31
str r9,[r14,#-4]!
eor r10,r6,r10,ror#2 @ F_xx_xx
@ F_xx_xx
add r4,r4,r9 @ E+=X[i]
add r4,r4,r10 @ E+=F_20_39(B,C,D)
ldr r9,[r14,#15*4]
ldr r10,[r14,#13*4]
ldr r11,[r14,#7*4]
add r3,r8,r3,ror#2 @ E+=K_xx_xx
ldr r12,[r14,#2*4]
eor r9,r9,r10
eor r11,r11,r12 @ 1 cycle stall
eor r10,r6,r7 @ F_xx_xx
mov r9,r9,ror#31
add r3,r3,r4,ror#27 @ E+=ROR(A,27)
eor r9,r9,r11,ror#31
str r9,[r14,#-4]!
eor r10,r5,r10,ror#2 @ F_xx_xx
@ F_xx_xx
add r3,r3,r9 @ E+=X[i]
add r3,r3,r10 @ E+=F_20_39(B,C,D)
ARM( teq r14,sp ) @ preserve carry
THUMB( mov r11,sp )
THUMB( teq r14,r11 ) @ preserve carry
bne .L_20_39_or_60_79 @ [+((12+3)*5+2)*4]
bcs .L_done @ [+((12+3)*5+2)*4], spare 300 bytes
ldr r8,.LK_40_59
sub sp,sp,#20*4 @ [+2]
.L_40_59:
ldr r9,[r14,#15*4]
ldr r10,[r14,#13*4]
ldr r11,[r14,#7*4]
add r7,r8,r7,ror#2 @ E+=K_xx_xx
ldr r12,[r14,#2*4]
eor r9,r9,r10
eor r11,r11,r12 @ 1 cycle stall
eor r10,r5,r6 @ F_xx_xx
mov r9,r9,ror#31
add r7,r7,r3,ror#27 @ E+=ROR(A,27)
eor r9,r9,r11,ror#31
str r9,[r14,#-4]!
and r10,r4,r10,ror#2 @ F_xx_xx
and r11,r5,r6 @ F_xx_xx
add r7,r7,r9 @ E+=X[i]
add r7,r7,r10 @ E+=F_40_59(B,C,D)
add r7,r7,r11,ror#2
ldr r9,[r14,#15*4]
ldr r10,[r14,#13*4]
ldr r11,[r14,#7*4]
add r6,r8,r6,ror#2 @ E+=K_xx_xx
ldr r12,[r14,#2*4]
eor r9,r9,r10
eor r11,r11,r12 @ 1 cycle stall
eor r10,r4,r5 @ F_xx_xx
mov r9,r9,ror#31
add r6,r6,r7,ror#27 @ E+=ROR(A,27)
eor r9,r9,r11,ror#31
str r9,[r14,#-4]!
and r10,r3,r10,ror#2 @ F_xx_xx
and r11,r4,r5 @ F_xx_xx
add r6,r6,r9 @ E+=X[i]
add r6,r6,r10 @ E+=F_40_59(B,C,D)
add r6,r6,r11,ror#2
ldr r9,[r14,#15*4]
ldr r10,[r14,#13*4]
ldr r11,[r14,#7*4]
add r5,r8,r5,ror#2 @ E+=K_xx_xx
ldr r12,[r14,#2*4]
eor r9,r9,r10
eor r11,r11,r12 @ 1 cycle stall
eor r10,r3,r4 @ F_xx_xx
mov r9,r9,ror#31
add r5,r5,r6,ror#27 @ E+=ROR(A,27)
eor r9,r9,r11,ror#31
str r9,[r14,#-4]!
and r10,r7,r10,ror#2 @ F_xx_xx
and r11,r3,r4 @ F_xx_xx
add r5,r5,r9 @ E+=X[i]
add r5,r5,r10 @ E+=F_40_59(B,C,D)
add r5,r5,r11,ror#2
ldr r9,[r14,#15*4]
ldr r10,[r14,#13*4]
ldr r11,[r14,#7*4]
add r4,r8,r4,ror#2 @ E+=K_xx_xx
ldr r12,[r14,#2*4]
eor r9,r9,r10
eor r11,r11,r12 @ 1 cycle stall
eor r10,r7,r3 @ F_xx_xx
mov r9,r9,ror#31
add r4,r4,r5,ror#27 @ E+=ROR(A,27)
eor r9,r9,r11,ror#31
str r9,[r14,#-4]!
and r10,r6,r10,ror#2 @ F_xx_xx
and r11,r7,r3 @ F_xx_xx
add r4,r4,r9 @ E+=X[i]
add r4,r4,r10 @ E+=F_40_59(B,C,D)
add r4,r4,r11,ror#2
ldr r9,[r14,#15*4]
ldr r10,[r14,#13*4]
ldr r11,[r14,#7*4]
add r3,r8,r3,ror#2 @ E+=K_xx_xx
ldr r12,[r14,#2*4]
eor r9,r9,r10
eor r11,r11,r12 @ 1 cycle stall
eor r10,r6,r7 @ F_xx_xx
mov r9,r9,ror#31
add r3,r3,r4,ror#27 @ E+=ROR(A,27)
eor r9,r9,r11,ror#31
str r9,[r14,#-4]!
and r10,r5,r10,ror#2 @ F_xx_xx
and r11,r6,r7 @ F_xx_xx
add r3,r3,r9 @ E+=X[i]
add r3,r3,r10 @ E+=F_40_59(B,C,D)
add r3,r3,r11,ror#2
cmp r14,sp
bne .L_40_59 @ [+((12+5)*5+2)*4]
ldr r8,.LK_60_79
sub sp,sp,#20*4
cmp sp,#0 @ set carry to denote 60_79
b .L_20_39_or_60_79 @ [+4], spare 300 bytes
.L_done:
add sp,sp,#80*4 @ "deallocate" stack frame
ldmia r0,{r8,r9,r10,r11,r12}
add r3,r8,r3
add r4,r9,r4
add r5,r10,r5,ror#2
add r6,r11,r6,ror#2
add r7,r12,r7,ror#2
stmia r0,{r3,r4,r5,r6,r7}
teq r1,r2
bne .Lloop @ [+18], total 1307
ldmia sp!,{r4-r12,pc}
.align 2
.LK_00_19: .word 0x5a827999
.LK_20_39: .word 0x6ed9eba1
.LK_40_59: .word 0x8f1bbcdc
.LK_60_79: .word 0xca62c1d6
ENDPROC(sha1_block_data_order)
.asciz "SHA1 block transform for ARMv4, CRYPTOGAMS by <appro@openssl.org>"
.align 2

179
arch/arm/crypto/sha1_glue.c Normal file
View File

@@ -0,0 +1,179 @@
/*
* Cryptographic API.
* Glue code for the SHA1 Secure Hash Algorithm assembler implementation
*
* This file is based on sha1_generic.c and sha1_ssse3_glue.c
*
* Copyright (c) Alan Smithee.
* Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
* Copyright (c) Jean-Francois Dive <jef@linuxbe.org>
* Copyright (c) Mathias Krause <minipli@googlemail.com>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
*/
#include <crypto/internal/hash.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/cryptohash.h>
#include <linux/types.h>
#include <crypto/sha.h>
#include <asm/byteorder.h>
struct SHA1_CTX {
uint32_t h0,h1,h2,h3,h4;
u64 count;
u8 data[SHA1_BLOCK_SIZE];
};
asmlinkage void sha1_block_data_order(struct SHA1_CTX *digest,
const unsigned char *data, unsigned int rounds);
static int sha1_init(struct shash_desc *desc)
{
struct SHA1_CTX *sctx = shash_desc_ctx(desc);
memset(sctx, 0, sizeof(*sctx));
sctx->h0 = SHA1_H0;
sctx->h1 = SHA1_H1;
sctx->h2 = SHA1_H2;
sctx->h3 = SHA1_H3;
sctx->h4 = SHA1_H4;
return 0;
}
static int __sha1_update(struct SHA1_CTX *sctx, const u8 *data,
unsigned int len, unsigned int partial)
{
unsigned int done = 0;
sctx->count += len;
if (partial) {
done = SHA1_BLOCK_SIZE - partial;
memcpy(sctx->data + partial, data, done);
sha1_block_data_order(sctx, sctx->data, 1);
}
if (len - done >= SHA1_BLOCK_SIZE) {
const unsigned int rounds = (len - done) / SHA1_BLOCK_SIZE;
sha1_block_data_order(sctx, data + done, rounds);
done += rounds * SHA1_BLOCK_SIZE;
}
memcpy(sctx->data, data + done, len - done);
return 0;
}
static int sha1_update(struct shash_desc *desc, const u8 *data,
unsigned int len)
{
struct SHA1_CTX *sctx = shash_desc_ctx(desc);
unsigned int partial = sctx->count % SHA1_BLOCK_SIZE;
int res;
/* Handle the fast case right here */
if (partial + len < SHA1_BLOCK_SIZE) {
sctx->count += len;
memcpy(sctx->data + partial, data, len);
return 0;
}
res = __sha1_update(sctx, data, len, partial);
return res;
}
/* Add padding and return the message digest. */
static int sha1_final(struct shash_desc *desc, u8 *out)
{
struct SHA1_CTX *sctx = shash_desc_ctx(desc);
unsigned int i, index, padlen;
__be32 *dst = (__be32 *)out;
__be64 bits;
static const u8 padding[SHA1_BLOCK_SIZE] = { 0x80, };
bits = cpu_to_be64(sctx->count << 3);
/* Pad out to 56 mod 64 and append length */
index = sctx->count % SHA1_BLOCK_SIZE;
padlen = (index < 56) ? (56 - index) : ((SHA1_BLOCK_SIZE+56) - index);
/* We need to fill a whole block for __sha1_update() */
if (padlen <= 56) {
sctx->count += padlen;
memcpy(sctx->data + index, padding, padlen);
} else {
__sha1_update(sctx, padding, padlen, index);
}
__sha1_update(sctx, (const u8 *)&bits, sizeof(bits), 56);
/* Store state in digest */
for (i = 0; i < 5; i++)
dst[i] = cpu_to_be32(((u32 *)sctx)[i]);
/* Wipe context */
memset(sctx, 0, sizeof(*sctx));
return 0;
}
static int sha1_export(struct shash_desc *desc, void *out)
{
struct SHA1_CTX *sctx = shash_desc_ctx(desc);
memcpy(out, sctx, sizeof(*sctx));
return 0;
}
static int sha1_import(struct shash_desc *desc, const void *in)
{
struct SHA1_CTX *sctx = shash_desc_ctx(desc);
memcpy(sctx, in, sizeof(*sctx));
return 0;
}
static struct shash_alg alg = {
.digestsize = SHA1_DIGEST_SIZE,
.init = sha1_init,
.update = sha1_update,
.final = sha1_final,
.export = sha1_export,
.import = sha1_import,
.descsize = sizeof(struct SHA1_CTX),
.statesize = sizeof(struct SHA1_CTX),
.base = {
.cra_name = "sha1",
.cra_driver_name= "sha1-asm",
.cra_priority = 150,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA1_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
};
static int __init sha1_mod_init(void)
{
return crypto_register_shash(&alg);
}
static void __exit sha1_mod_fini(void)
{
crypto_unregister_shash(&alg);
}
module_init(sha1_mod_init);
module_exit(sha1_mod_fini);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm (ARM)");
MODULE_ALIAS("sha1");
MODULE_AUTHOR("David McCullough <ucdevel@gmail.com>");

View File

@@ -0,0 +1,36 @@
/*
* linux/arch/arm/include/asm/neon.h
*
* Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
#include <asm/hwcap.h>
#define cpu_has_neon() (!!(elf_hwcap & HWCAP_NEON))
#ifdef __ARM_NEON__
/*
* If you are affected by the BUILD_BUG below, it probably means that you are
* using NEON code /and/ calling the kernel_neon_begin() function from the same
* compilation unit. To prevent issues that may arise from GCC reordering or
* generating(1) NEON instructions outside of these begin/end functions, the
* only supported way of using NEON code in the kernel is by isolating it in a
* separate compilation unit, and calling it from another unit from inside a
* kernel_neon_begin/kernel_neon_end pair.
*
* (1) Current GCC (4.7) might generate NEON instructions at O3 level if
* -mpfu=neon is set.
*/
#define kernel_neon_begin() \
BUILD_BUG_ON_MSG(1, "kernel_neon_begin() called from NEON code")
#else
void kernel_neon_begin(void);
#endif
void kernel_neon_end(void);

View File

@@ -0,0 +1,14 @@
#include <linux/hardirq.h>
/*
* may_use_simd - whether it is allowable at this time to issue SIMD
* instructions or access the SIMD register file
*
* As architectures typically don't preserve the SIMD register file when
* taking an interrupt, !in_interrupt() should be a reasonable default.
*/
static __must_check inline bool may_use_simd(void)
{
return !in_interrupt();
}

View File

@@ -20,6 +20,7 @@
#include <linux/init.h>
#include <linux/uaccess.h>
#include <linux/user.h>
#include <linux/export.h>
#include <asm/cp15.h>
#include <asm/cputype.h>
@@ -648,6 +649,52 @@ static int vfp_hotplug(struct notifier_block *b, unsigned long action,
return NOTIFY_OK;
}
#ifdef CONFIG_KERNEL_MODE_NEON
/*
* Kernel-side NEON support functions
*/
void kernel_neon_begin(void)
{
struct thread_info *thread = current_thread_info();
unsigned int cpu;
u32 fpexc;
/*
* Kernel mode NEON is only allowed outside of interrupt context
* with preemption disabled. This will make sure that the kernel
* mode NEON register contents never need to be preserved.
*/
BUG_ON(in_interrupt());
cpu = get_cpu();
fpexc = fmrx(FPEXC) | FPEXC_EN;
fmxr(FPEXC, fpexc);
/*
* Save the userland NEON/VFP state. Under UP,
* the owner could be a task other than 'current'
*/
if (vfp_state_in_hw(cpu, thread))
vfp_save_state(&thread->vfpstate, fpexc);
#ifndef CONFIG_SMP
else if (vfp_current_hw_state[cpu] != NULL)
vfp_save_state(vfp_current_hw_state[cpu], fpexc);
#endif
vfp_current_hw_state[cpu] = NULL;
}
EXPORT_SYMBOL(kernel_neon_begin);
void kernel_neon_end(void)
{
/* Disable the NEON/VFP unit. */
fmxr(FPEXC, fmrx(FPEXC) & ~FPEXC_EN);
put_cpu();
}
EXPORT_SYMBOL(kernel_neon_end);
#endif /* CONFIG_KERNEL_MODE_NEON */
/*
* VFP support code initialisation.
*/

View File

@@ -174,6 +174,10 @@ config CRYPTO_TEST
help
Quick & dirty crypto test module.
config CRYPTO_ABLK_HELPER
tristate
select CRYPTO_CRYPTD
comment "Authenticated Encryption with Associated Data"
config CRYPTO_CCM
@@ -423,6 +427,15 @@ config CRYPTO_SHA1_SSSE3
using Supplemental SSE3 (SSSE3) instructions or Advanced Vector
Extensions (AVX), when available.
config CRYPTO_SHA1_ARM
tristate "SHA1 digest algorithm (ARM-asm)"
depends on ARM
select CRYPTO_SHA1
select CRYPTO_HASH
help
SHA-1 secure hash standard (FIPS 180-1/DFIPS 180-2) implemented
using optimized ARM assembler.
config CRYPTO_SHA256
tristate "SHA224 and SHA256 digest algorithm"
select CRYPTO_HASH
@@ -577,6 +590,46 @@ config CRYPTO_AES_NI_INTEL
ECB, CBC, LRW, PCBC, XTS. The 64 bit version has additional
acceleration for CTR.
config CRYPTO_AES_ARM
tristate "AES cipher algorithms (ARM-asm)"
depends on ARM
select CRYPTO_ALGAPI
select CRYPTO_AES
help
Use optimized AES assembler routines for ARM platforms.
AES cipher algorithms (FIPS-197). AES uses the Rijndael
algorithm.
Rijndael appears to be consistently a very good performer in
both hardware and software across a wide range of computing
environments regardless of its use in feedback or non-feedback
modes. Its key setup time is excellent, and its key agility is
good. Rijndael's very low memory requirements make it very well
suited for restricted-space environments, in which it also
demonstrates excellent performance. Rijndael's operations are
among the easiest to defend against power and timing attacks.
The AES specifies three key sizes: 128, 192 and 256 bits
See <http://csrc.nist.gov/encryption/aes/> for more information.
config CRYPTO_AES_ARM_BS
tristate "Bit sliced AES using NEON instructions"
depends on ARM && KERNEL_MODE_NEON
select CRYPTO_ALGAPI
select CRYPTO_AES_ARM
select CRYPTO_ABLK_HELPER
help
Use a faster and more secure NEON based implementation of AES in CBC,
CTR and XTS modes
Bit sliced AES gives around 45% speedup on Cortex-A15 for CTR mode
and for XTS mode encryption, CBC and XTS mode decryption speedup is
around 25%. (CBC encryption speed is not affected by this driver.)
This implementation does not rely on any lookup tables so it is
believed to be invulnerable to cache timing attacks.
config CRYPTO_ANUBIS
tristate "Anubis cipher algorithm"
select CRYPTO_ALGAPI

View File

@@ -98,3 +98,4 @@ obj-$(CONFIG_CRYPTO_USER_API_RNG) += algif_rng.o
obj-$(CONFIG_XOR_BLOCKS) += xor.o
obj-$(CONFIG_ASYNC_CORE) += async_tx/
obj-$(CONFIG_ASYMMETRIC_KEY_TYPE) += asymmetric_keys/
obj-$(CONFIG_CRYPTO_ABLK_HELPER) += ablk_helper.o

150
crypto/ablk_helper.c Normal file
View File

@@ -0,0 +1,150 @@
/*
* Shared async block cipher helpers
*
* Copyright (c) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
*
* Based on aesni-intel_glue.c by:
* Copyright (C) 2008, Intel Corp.
* Author: Huang Ying <ying.huang@intel.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
* USA
*
*/
#include <linux/kernel.h>
#include <linux/crypto.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/hardirq.h>
#include <crypto/algapi.h>
#include <crypto/cryptd.h>
#include <crypto/ablk_helper.h>
#include <asm/simd.h>
int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key,
unsigned int key_len)
{
struct async_helper_ctx *ctx = crypto_ablkcipher_ctx(tfm);
struct crypto_ablkcipher *child = &ctx->cryptd_tfm->base;
int err;
crypto_ablkcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK);
crypto_ablkcipher_set_flags(child, crypto_ablkcipher_get_flags(tfm)
& CRYPTO_TFM_REQ_MASK);
err = crypto_ablkcipher_setkey(child, key, key_len);
crypto_ablkcipher_set_flags(tfm, crypto_ablkcipher_get_flags(child)
& CRYPTO_TFM_RES_MASK);
return err;
}
EXPORT_SYMBOL_GPL(ablk_set_key);
int __ablk_encrypt(struct ablkcipher_request *req)
{
struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
struct async_helper_ctx *ctx = crypto_ablkcipher_ctx(tfm);
struct blkcipher_desc desc;
desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm);
desc.info = req->info;
desc.flags = 0;
return crypto_blkcipher_crt(desc.tfm)->encrypt(
&desc, req->dst, req->src, req->nbytes);
}
EXPORT_SYMBOL_GPL(__ablk_encrypt);
int ablk_encrypt(struct ablkcipher_request *req)
{
struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
struct async_helper_ctx *ctx = crypto_ablkcipher_ctx(tfm);
if (!may_use_simd()) {
struct ablkcipher_request *cryptd_req =
ablkcipher_request_ctx(req);
*cryptd_req = *req;
ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
return crypto_ablkcipher_encrypt(cryptd_req);
} else {
return __ablk_encrypt(req);
}
}
EXPORT_SYMBOL_GPL(ablk_encrypt);
int ablk_decrypt(struct ablkcipher_request *req)
{
struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
struct async_helper_ctx *ctx = crypto_ablkcipher_ctx(tfm);
if (!may_use_simd()) {
struct ablkcipher_request *cryptd_req =
ablkcipher_request_ctx(req);
*cryptd_req = *req;
ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
return crypto_ablkcipher_decrypt(cryptd_req);
} else {
struct blkcipher_desc desc;
desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm);
desc.info = req->info;
desc.flags = 0;
return crypto_blkcipher_crt(desc.tfm)->decrypt(
&desc, req->dst, req->src, req->nbytes);
}
}
EXPORT_SYMBOL_GPL(ablk_decrypt);
void ablk_exit(struct crypto_tfm *tfm)
{
struct async_helper_ctx *ctx = crypto_tfm_ctx(tfm);
cryptd_free_ablkcipher(ctx->cryptd_tfm);
}
EXPORT_SYMBOL_GPL(ablk_exit);
int ablk_init_common(struct crypto_tfm *tfm, const char *drv_name)
{
struct async_helper_ctx *ctx = crypto_tfm_ctx(tfm);
struct cryptd_ablkcipher *cryptd_tfm;
cryptd_tfm = cryptd_alloc_ablkcipher(drv_name, 0, 0);
if (IS_ERR(cryptd_tfm))
return PTR_ERR(cryptd_tfm);
ctx->cryptd_tfm = cryptd_tfm;
tfm->crt_ablkcipher.reqsize = sizeof(struct ablkcipher_request) +
crypto_ablkcipher_reqsize(&cryptd_tfm->base);
return 0;
}
EXPORT_SYMBOL_GPL(ablk_init_common);
int ablk_init(struct crypto_tfm *tfm)
{
char drv_name[CRYPTO_MAX_ALG_NAME];
snprintf(drv_name, sizeof(drv_name), "__driver-%s",
crypto_tfm_alg_driver_name(tfm));
return ablk_init_common(tfm, drv_name);
}
EXPORT_SYMBOL_GPL(ablk_init);
MODULE_LICENSE("GPL");

View File

@@ -0,0 +1,31 @@
/*
* Shared async block cipher helpers
*/
#ifndef _CRYPTO_ABLK_HELPER_H
#define _CRYPTO_ABLK_HELPER_H
#include <linux/crypto.h>
#include <linux/kernel.h>
#include <crypto/cryptd.h>
struct async_helper_ctx {
struct cryptd_ablkcipher *cryptd_tfm;
};
extern int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key,
unsigned int key_len);
extern int __ablk_encrypt(struct ablkcipher_request *req);
extern int ablk_encrypt(struct ablkcipher_request *req);
extern int ablk_decrypt(struct ablkcipher_request *req);
extern void ablk_exit(struct crypto_tfm *tfm);
extern int ablk_init_common(struct crypto_tfm *tfm, const char *drv_name);
extern int ablk_init(struct crypto_tfm *tfm);
#endif /* _CRYPTO_ABLK_HELPER_H */