Commit 6b5360a5 authored by Tianjia Zhang, committed by Herbert Xu
Browse files

crypto: arm64/sm4 - add CE implementation for cmac/xcbc/cbcmac



This patch adds a CE-optimized assembly implementation of cmac/xcbc/cbcmac.

Benchmarked on a T-Head Yitian-710 at 2.75 GHz. The data comes from mode 300
of tcrypt and compares the performance before and after this patch (the driver
used before this patch is XXXmac(sm4-ce)). The columns are the different
update sizes. The data is tabulated and the unit is Mb/s:

Before:

update-size    |      16      64     256    1024    2048    4096    8192
---------------+--------------------------------------------------------
cmac(sm4-ce)   |  293.33  403.69  503.76  527.78  531.10  535.46  535.81
xcbc(sm4-ce)   |  292.83  402.50  504.02  529.08  529.87  536.55  538.24
cbcmac(sm4-ce) |  318.42  415.79  497.12  515.05  523.15  521.19  523.01

After:

update-size    |      16      64     256    1024    2048    4096    8192
---------------+--------------------------------------------------------
cmac-sm4-ce    |  371.99  675.28  903.56  971.65  980.57  990.40  991.04
xcbc-sm4-ce    |  372.11  674.55  903.47  971.61  980.96  990.42  991.10
cbcmac-sm4-ce  |  371.63  675.33  903.23  972.07  981.42  990.93  991.45

Signed-off-by: Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
parent 01f63311
Loading
Loading
Loading
Loading
+70 −0
Original line number Diff line number Diff line
@@ -35,6 +35,7 @@
#define RTMP3	v19

#define RIV	v20
/* RMAC names the same vector register (v20) as RIV. */
#define RMAC	v20
#define RMASK	v21


@@ -1007,6 +1008,75 @@ SYM_FUNC_START(sm4_ce_xts_dec)
	ret
SYM_FUNC_END(sm4_ce_xts_dec)

.align 3
SYM_FUNC_START(sm4_ce_mac_update)
	/*
	 * Absorb nblocks 16-byte blocks from src into the running MAC value
	 * stored at digest: each block is XORed into RMAC and RMAC is then
	 * passed through SM4_CRYPT_BLK (defined outside this view; presumably
	 * one SM4 block encryption with the round keys set up by SM4_PREPARE).
	 *
	 * input:
	 *   x0: round key array, CTX
	 *   x1: digest (loaded into RMAC on entry, stored back on exit)
	 *   x2: src
	 *   w3: nblocks
	 *   w4: enc_before - non-zero: run SM4_CRYPT_BLK on the digest before
	 *                    any src block is absorbed
	 *   w5: enc_after  - non-zero: run SM4_CRYPT_BLK after the last block
	 *                    as well; zero: the last block is only XORed in
	 */
	SM4_PREPARE(x0)

	ld1		{RMAC.16b}, [x1]

	/* skip the leading cipher pass unless enc_before is set */
	cbz		w4, .Lmac_update

	SM4_CRYPT_BLK(RMAC)

.Lmac_update:
	/* no source blocks: just write the digest back */
	cbz		w3, .Lmac_ret

	/*
	 * w3 = enc_after ? nblocks : nblocks - 1
	 * i.e. the number of blocks that get the full XOR + cipher step;
	 * with enc_after == 0 the final block is handled at .Lmac_end.
	 */
	sub		w6, w3, #1
	cmp		w5, wzr
	csel		w3, w3, w6, ne

	cbz		w3, .Lmac_end

.Lmac_loop_4x:
	/* fewer than four blocks left: finish them one at a time */
	cmp		w3, #4
	blt		.Lmac_loop_1x

	sub		w3, w3, #4

	/* one 64-byte load, then four chained XOR + cipher steps */
	ld1		{v0.16b-v3.16b}, [x2], #64

	eor		RMAC.16b, RMAC.16b, v0.16b
	SM4_CRYPT_BLK(RMAC)
	eor		RMAC.16b, RMAC.16b, v1.16b
	SM4_CRYPT_BLK(RMAC)
	eor		RMAC.16b, RMAC.16b, v2.16b
	SM4_CRYPT_BLK(RMAC)
	eor		RMAC.16b, RMAC.16b, v3.16b
	SM4_CRYPT_BLK(RMAC)

	cbz		w3, .Lmac_end
	b		.Lmac_loop_4x

.Lmac_loop_1x:
	/* reached with 1..3 blocks remaining in w3 */
	sub		w3, w3, #1

	ld1		{v0.16b}, [x2], #16

	eor		RMAC.16b, RMAC.16b, v0.16b
	SM4_CRYPT_BLK(RMAC)

	cbnz		w3, .Lmac_loop_1x


.Lmac_end:
	/* enc_after set: every block has already been absorbed above */
	cbnz		w5, .Lmac_ret

	/* enc_after clear: XOR the last block in but leave it unciphered */
	ld1		{v0.16b}, [x2], #16
	eor		RMAC.16b, RMAC.16b, v0.16b

.Lmac_ret:
	st1		{RMAC.16b}, [x1]
	ret
SYM_FUNC_END(sm4_ce_mac_update)


	.section	".rodata", "a"
	.align 4
+266 −1
Original line number Diff line number Diff line
@@ -14,8 +14,10 @@
#include <linux/cpufeature.h>
#include <linux/string.h>
#include <asm/neon.h>
#include <asm/simd.h>
#include <crypto/b128ops.h>
#include <crypto/internal/simd.h>
#include <crypto/internal/skcipher.h>
#include <crypto/internal/hash.h>
#include <crypto/scatterwalk.h>
#include <crypto/xts.h>
#include <crypto/sm4.h>
@@ -47,6 +49,9 @@ asmlinkage void sm4_ce_xts_enc(const u32 *rkey1, u8 *dst, const u8 *src,
asmlinkage void sm4_ce_xts_dec(const u32 *rkey1, u8 *dst, const u8 *src,
			       u8 *tweak, unsigned int nbytes,
			       const u32 *rkey2_enc);
/*
 * Assembly MAC primitive: absorbs @nblocks 16-byte blocks from @src into
 * the 16-byte running value at @digest, which is updated in place.
 * @enc_before: run the block cipher over @digest before absorbing any block.
 * @enc_after:  run the block cipher after the last block as well; when false
 *              the last block is only XORed into @digest.
 */
asmlinkage void sm4_ce_mac_update(const u32 *rkey_enc, u8 *digest,
				  const u8 *src, unsigned int nblocks,
				  bool enc_before, bool enc_after);

EXPORT_SYMBOL(sm4_ce_expand_key);
EXPORT_SYMBOL(sm4_ce_crypt_block);
@@ -58,6 +63,16 @@ struct sm4_xts_ctx {
	struct sm4_ctx key2;
};

/*
 * Per-transform MAC state.
 * @key:    expanded SM4 round keys, filled in by the setkey handlers.
 * @consts: trailing storage for the subkey blocks used by the final step.
 *          The cmac and xcbc algorithms reserve 2 * SM4_BLOCK_SIZE bytes
 *          here through cra_ctxsize; cbcmac reserves none.
 */
struct sm4_mac_tfm_ctx {
	struct sm4_ctx key;
	u8 __aligned(8) consts[];
};

/*
 * Per-request MAC state.
 * @len:    number of bytes already XORed into @digest that have not yet
 *          been run through the cipher (0 .. SM4_BLOCK_SIZE; the value
 *          SM4_BLOCK_SIZE means a complete block is still pending).
 * @digest: running MAC value.
 */
struct sm4_mac_desc_ctx {
	unsigned int len;
	u8 digest[SM4_BLOCK_SIZE];
};

static int sm4_setkey(struct crypto_skcipher *tfm, const u8 *key,
		      unsigned int key_len)
{
@@ -594,13 +609,260 @@ static struct skcipher_alg sm4_algs[] = {
	}
};

/*
 * Install the key for cbcmac(sm4): expand @key into the encryption and
 * decryption round keys kept in the transform context.
 *
 * Returns 0 on success or -EINVAL when @key_len is not SM4_KEY_SIZE.
 */
static int sm4_cbcmac_setkey(struct crypto_shash *tfm, const u8 *key,
			     unsigned int key_len)
{
	struct sm4_mac_tfm_ctx *tctx;

	if (key_len != SM4_KEY_SIZE)
		return -EINVAL;

	tctx = crypto_shash_ctx(tfm);

	/* the key expansion runs inside a kernel NEON section */
	kernel_neon_begin();
	sm4_ce_expand_key(key, tctx->key.rkey_enc, tctx->key.rkey_dec,
			  crypto_sm4_fk, crypto_sm4_ck);
	kernel_neon_end();

	return 0;
}

/*
 * Shift the 128-bit big-endian value *@src left by one bit and, when the
 * bit shifted out was set, fold it back in by XORing 0x87 into the low
 * byte (multiplication by u in GF(2^128)).  @dst may alias @src: both
 * halves are read before anything is written.
 */
static void sm4_cmac_gf128_mul_u(be128 *dst, const be128 *src)
{
	u64 hi = be64_to_cpu(src->a);
	u64 lo = be64_to_cpu(src->b);

	dst->a = cpu_to_be64((hi << 1) | (lo >> 63));
	dst->b = cpu_to_be64((lo << 1) ^ ((hi >> 63) ? 0x87 : 0));
}

/*
 * Install the key for cmac(sm4): expand @key and derive the two subkey
 * blocks stored in the context's consts area.  The first is the encrypted
 * all-zero block multiplied by u, the second is that value multiplied by
 * u once more.
 *
 * Returns 0 on success or -EINVAL when @key_len is not SM4_KEY_SIZE.
 */
static int sm4_cmac_setkey(struct crypto_shash *tfm, const u8 *key,
			   unsigned int key_len)
{
	struct sm4_mac_tfm_ctx *ctx = crypto_shash_ctx(tfm);
	be128 *subkeys = (be128 *)ctx->consts;

	if (key_len != SM4_KEY_SIZE)
		return -EINVAL;

	/* start from the all-zero block; it is encrypted in place below */
	memset(subkeys, 0, SM4_BLOCK_SIZE);

	kernel_neon_begin();
	sm4_ce_expand_key(key, ctx->key.rkey_enc, ctx->key.rkey_dec,
			  crypto_sm4_fk, crypto_sm4_ck);
	sm4_ce_crypt_block(ctx->key.rkey_enc, (u8 *)subkeys,
			   (const u8 *)subkeys);
	kernel_neon_end();

	/* subkeys[0] = L * u, subkeys[1] = L * u^2 */
	sm4_cmac_gf128_mul_u(&subkeys[0], &subkeys[0]);
	sm4_cmac_gf128_mul_u(&subkeys[1], &subkeys[0]);

	return 0;
}

/*
 * Install the key for xcbc(sm4).
 *
 * Three values are derived from the user key by encrypting constant blocks
 * under it: the block of 0x01 bytes yields the key actually used for MAC
 * computation (key2), and the blocks of 0x02 and 0x03 bytes yield the two
 * subkey blocks stored in the context's consts area.  The round keys in the
 * context are then replaced by the expansion of key2.
 *
 * Returns 0 on success or -EINVAL when @key_len is not SM4_KEY_SIZE.
 */
static int sm4_xcbc_setkey(struct crypto_shash *tfm, const u8 *key,
			   unsigned int key_len)
{
	struct sm4_mac_tfm_ctx *ctx = crypto_shash_ctx(tfm);
	u8 __aligned(8) key2[SM4_BLOCK_SIZE];
	static u8 const ks[3][SM4_BLOCK_SIZE] = {
		{ [0 ... SM4_BLOCK_SIZE - 1] = 0x1},
		{ [0 ... SM4_BLOCK_SIZE - 1] = 0x2},
		{ [0 ... SM4_BLOCK_SIZE - 1] = 0x3},
	};

	if (key_len != SM4_KEY_SIZE)
		return -EINVAL;

	kernel_neon_begin();

	/* temporary schedule of the user key, only used for the derivation */
	sm4_ce_expand_key(key, ctx->key.rkey_enc, ctx->key.rkey_dec,
			  crypto_sm4_fk, crypto_sm4_ck);

	sm4_ce_crypt_block(ctx->key.rkey_enc, key2, ks[0]);
	/* ks[1] and ks[2] are contiguous: two blocks go into consts */
	sm4_ce_crypt(ctx->key.rkey_enc, ctx->consts, ks[1], 2);

	/* from here on the context holds the schedule of the derived key */
	sm4_ce_expand_key(key2, ctx->key.rkey_enc, ctx->key.rkey_dec,
			  crypto_sm4_fk, crypto_sm4_ck);

	kernel_neon_end();

	/* key2 is secret key material: do not leave a copy on the stack */
	memzero_explicit(key2, sizeof(key2));

	return 0;
}

/*
 * Start a new MAC computation: no bytes buffered and an all-zero running
 * digest.  Always returns 0.
 */
static int sm4_mac_init(struct shash_desc *desc)
{
	struct sm4_mac_desc_ctx *state = shash_desc_ctx(desc);

	state->len = 0;
	memset(state->digest, 0, sizeof(state->digest));

	return 0;
}

/*
 * Feed @len bytes at @p into the running MAC.
 *
 * Input is XORed straight into the digest; state->len counts how many bytes
 * of the current block have been XORed in but not yet run through the
 * cipher.  A completed block is deliberately left unencrypted (state->len ==
 * SM4_BLOCK_SIZE) until more data arrives, so that the final handlers can
 * still treat it as the last block.
 *
 * Always returns 0.
 */
static int sm4_mac_update(struct shash_desc *desc, const u8 *p,
			  unsigned int len)
{
	struct sm4_mac_desc_ctx *state = shash_desc_ctx(desc);
	struct sm4_mac_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
	unsigned int chunk, full_blocks;

	if (!len)
		return 0;

	/*
	 * Top up a partially filled block, or absorb input that is too
	 * short to complete one.
	 */
	if (state->len != 0 || state->len + len < SM4_BLOCK_SIZE) {
		chunk = min(len, SM4_BLOCK_SIZE - state->len);

		crypto_xor(state->digest + state->len, p, chunk);
		state->len += chunk;
		p += chunk;
		len -= chunk;
	}

	/* nothing left, or still in the middle of a block */
	if (!len || state->len % SM4_BLOCK_SIZE != 0)
		return 0;

	kernel_neon_begin();

	if (state->len == SM4_BLOCK_SIZE && len < SM4_BLOCK_SIZE) {
		/* only a tail follows: flush the pending full block first */
		sm4_ce_crypt_block(tctx->key.rkey_enc, state->digest,
				   state->digest);
		state->len = 0;
	} else {
		full_blocks = len / SM4_BLOCK_SIZE;
		len -= full_blocks * SM4_BLOCK_SIZE;

		/*
		 * Encrypt the pending block first if there is one; encrypt
		 * after the last full block only when a tail follows.
		 */
		sm4_ce_mac_update(tctx->key.rkey_enc, state->digest, p,
				  full_blocks,
				  (state->len == SM4_BLOCK_SIZE),
				  (len != 0));

		p += full_blocks * SM4_BLOCK_SIZE;

		/* no tail: the last full block stays pending */
		if (!len)
			state->len = SM4_BLOCK_SIZE;
	}

	kernel_neon_end();

	/* start the next block with the remaining tail bytes */
	if (len) {
		crypto_xor(state->digest, p, len);
		state->len = len;
	}

	return 0;
}

/*
 * Finish a cmac/xcbc computation and copy the SM4_BLOCK_SIZE-byte tag to
 * @out.
 *
 * A complete last block is combined with the first subkey block in consts.
 * An incomplete (or empty) last block gets a 0x80 padding byte XORed in at
 * the current position and is combined with the second subkey block.
 *
 * Always returns 0.
 */
static int sm4_cmac_final(struct shash_desc *desc, u8 *out)
{
	struct sm4_mac_desc_ctx *state = shash_desc_ctx(desc);
	struct sm4_mac_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
	const u8 *subkey;

	if (state->len == SM4_BLOCK_SIZE) {
		subkey = tctx->consts;
	} else {
		state->digest[state->len] ^= 0x80;
		subkey = tctx->consts + SM4_BLOCK_SIZE;
	}

	/* XOR the subkey in as one more block and encrypt the result */
	kernel_neon_begin();
	sm4_ce_mac_update(tctx->key.rkey_enc, state->digest, subkey, 1,
			  false, true);
	kernel_neon_end();

	memcpy(out, state->digest, SM4_BLOCK_SIZE);

	return 0;
}

/*
 * Finish a cbcmac computation and copy the SM4_BLOCK_SIZE-byte tag to @out.
 *
 * If any bytes have been XORed into the digest without being encrypted
 * yet, one more cipher pass is applied; otherwise the digest is returned
 * as it stands.
 *
 * Always returns 0.
 */
static int sm4_cbcmac_final(struct shash_desc *desc, u8 *out)
{
	struct sm4_mac_desc_ctx *state = shash_desc_ctx(desc);
	struct sm4_mac_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);

	if (state->len != 0) {
		kernel_neon_begin();
		sm4_ce_crypt_block(tctx->key.rkey_enc, state->digest,
				   state->digest);
		kernel_neon_end();
	}

	memcpy(out, state->digest, SM4_BLOCK_SIZE);

	return 0;
}

/*
 * MAC algorithms provided by this driver.  All three share the init and
 * update handlers.  cmac and xcbc also share the final handler and differ
 * only in how setkey derives the two subkey blocks, for which cra_ctxsize
 * reserves 2 * SM4_BLOCK_SIZE extra bytes.  cbcmac has no subkeys and its
 * own final handler.
 */
static struct shash_alg sm4_mac_algs[] = {
	{
		/* CMAC */
		.base = {
			.cra_name		= "cmac(sm4)",
			.cra_driver_name	= "cmac-sm4-ce",
			.cra_priority		= 400,
			.cra_blocksize		= SM4_BLOCK_SIZE,
			.cra_ctxsize		= sizeof(struct sm4_mac_tfm_ctx)
							+ SM4_BLOCK_SIZE * 2,
			.cra_module		= THIS_MODULE,
		},
		.digestsize	= SM4_BLOCK_SIZE,
		.init		= sm4_mac_init,
		.update		= sm4_mac_update,
		.final		= sm4_cmac_final,
		.setkey		= sm4_cmac_setkey,
		.descsize	= sizeof(struct sm4_mac_desc_ctx),
	}, {
		/* XCBC: same finalisation as CMAC, different subkey derivation */
		.base = {
			.cra_name		= "xcbc(sm4)",
			.cra_driver_name	= "xcbc-sm4-ce",
			.cra_priority		= 400,
			.cra_blocksize		= SM4_BLOCK_SIZE,
			.cra_ctxsize		= sizeof(struct sm4_mac_tfm_ctx)
							+ SM4_BLOCK_SIZE * 2,
			.cra_module		= THIS_MODULE,
		},
		.digestsize	= SM4_BLOCK_SIZE,
		.init		= sm4_mac_init,
		.update		= sm4_mac_update,
		.final		= sm4_cmac_final,
		.setkey		= sm4_xcbc_setkey,
		.descsize	= sizeof(struct sm4_mac_desc_ctx),
	}, {
		/* plain CBC-MAC: no subkey storage, block size advertised as 1 */
		.base = {
			.cra_name		= "cbcmac(sm4)",
			.cra_driver_name	= "cbcmac-sm4-ce",
			.cra_priority		= 400,
			.cra_blocksize		= 1,
			.cra_ctxsize		= sizeof(struct sm4_mac_tfm_ctx),
			.cra_module		= THIS_MODULE,
		},
		.digestsize	= SM4_BLOCK_SIZE,
		.init		= sm4_mac_init,
		.update		= sm4_mac_update,
		.final		= sm4_cbcmac_final,
		.setkey		= sm4_cbcmac_setkey,
		.descsize	= sizeof(struct sm4_mac_desc_ctx),
	}
};

/*
 * Module init: register the skcipher algorithms, then the MAC (shash)
 * algorithms.  If the MAC registration fails, the skciphers registered
 * just before are unregistered again so that nothing stays half-installed.
 *
 * Returns 0 on success or the error code of the failing registration.
 */
static int __init sm4_init(void)
{
	int err;

	err = crypto_register_skciphers(sm4_algs, ARRAY_SIZE(sm4_algs));
	if (err)
		return err;

	err = crypto_register_shashes(sm4_mac_algs, ARRAY_SIZE(sm4_mac_algs));
	if (err)
		goto out_err;

	return 0;

out_err:
	crypto_unregister_skciphers(sm4_algs, ARRAY_SIZE(sm4_algs));
	return err;
}

/*
 * Module exit: unregister the MAC (shash) algorithms first, then the
 * skcipher algorithms.
 */
static void __exit sm4_exit(void)
{
	crypto_unregister_shashes(sm4_mac_algs, ARRAY_SIZE(sm4_mac_algs));
	crypto_unregister_skciphers(sm4_algs, ARRAY_SIZE(sm4_algs));
}

@@ -616,5 +878,8 @@ MODULE_ALIAS_CRYPTO("cfb(sm4)");
MODULE_ALIAS_CRYPTO("ctr(sm4)");
MODULE_ALIAS_CRYPTO("cts(cbc(sm4))");
MODULE_ALIAS_CRYPTO("xts(sm4)");
MODULE_ALIAS_CRYPTO("cmac(sm4)");
MODULE_ALIAS_CRYPTO("xcbc(sm4)");
MODULE_ALIAS_CRYPTO("cbcmac(sm4)");
MODULE_AUTHOR("Tianjia Zhang <tianjia.zhang@linux.alibaba.com>");
MODULE_LICENSE("GPL v2");