crypto: arm/crct10dif - Implement plain NEON variant (e7c1d1c9) · Commits · git / linux-nf

arch/arm/crypto/crct10dif-ce-core.S

+94 −4

Original line number	Diff line number	Diff line
		@@ -112,6 +112,82 @@
		FOLD_CONST_L .req q10l
		FOLD_CONST_H .req q10h

		/*
		* Pairwise long polynomial multiplication of two 16-bit values
		*
		* { w0, w1 }, { y0, y1 }
		*
		* by two 64-bit values
		*
		* { x0, x1, x2, x3, x4, x5, x6, x7 }, { z0, z1, z2, z3, z4, z5, z6, z7 }
		*
		* where each vector element is a byte, ordered from least to most
		* significant. The resulting 80-bit vectors are XOR'ed together.
		*
		* This can be implemented using 8x8 long polynomial multiplication, by
		* reorganizing the input so that each pairwise 8x8 multiplication
		* produces one of the terms from the decomposition below, and
		* combining the results of each rank and shifting them into place.
		*
		* Rank
		*  0            w0x0 ^              |        y0z0 ^
		*  1       (w0x1 ^ w1x0) <<  8 ^    |   (y0z1 ^ y1z0) <<  8 ^
		*  2       (w0x2 ^ w1x1) << 16 ^    |   (y0z2 ^ y1z1) << 16 ^
		*  3       (w0x3 ^ w1x2) << 24 ^    |   (y0z3 ^ y1z2) << 24 ^
		*  4       (w0x4 ^ w1x3) << 32 ^    |   (y0z4 ^ y1z3) << 32 ^
		*  5       (w0x5 ^ w1x4) << 40 ^    |   (y0z5 ^ y1z4) << 40 ^
		*  6       (w0x6 ^ w1x5) << 48 ^    |   (y0z6 ^ y1z5) << 48 ^
		*  7       (w0x7 ^ w1x6) << 56 ^    |   (y0z7 ^ y1z6) << 56 ^
		*  8            w1x7 << 64          |        y1z7 << 64
		*
		* The inputs can be reorganized into
		*
		* { w0, w0, w0, w0, y0, y0, y0, y0 }, { w1, w1, w1, w1, y1, y1, y1, y1 }
		* { x0, x2, x4, x6, z0, z2, z4, z6 }, { x1, x3, x5, x7, z1, z3, z5, z7 }
		*
		* and after performing 8x8->16 bit long polynomial multiplication of
		* each of the halves of the first vector with those of the second one,
		* we obtain the following four vectors of 16-bit elements:
		*
		* a := { w0x0, w0x2, w0x4, w0x6 }, { y0z0, y0z2, y0z4, y0z6 }
		* b := { w0x1, w0x3, w0x5, w0x7 }, { y0z1, y0z3, y0z5, y0z7 }
		* c := { w1x0, w1x2, w1x4, w1x6 }, { y1z0, y1z2, y1z4, y1z6 }
		* d := { w1x1, w1x3, w1x5, w1x7 }, { y1z1, y1z3, y1z5, y1z7 }
		*
		* Results b and c can be XORed together, as the vector elements have
		* matching ranks. Then, the final XOR can be pulled forward, and
		* applied between the halves of each of the remaining three vectors,
		* which are then shifted into place, and XORed together to produce the
		* final 80-bit result.
		*/
		/*
		 * pmull16x64_p8 - 16x64 bit pairwise polynomial multiply built from
		 * 8x8 vmull.p8 only, following the decomposition described above.
		 *
		 * \v16: name prefix of the register pair holding the two 16-bit
		 *       multiplicands; it is accessed as \v16\()_L and \v16\()_H
		 *       (e.g. FOLD_CONST -> FOLD_CONST_L / FOLD_CONST_H)
		 * \v64: q register holding the two 64-bit values; it is overwritten
		 *       with the XOR-combined product
		 *
		 * Expects r4 to point at a 128-bit aligned 16-byte vtbl index vector
		 * (crc_t10dif_pmull8 loads it with the address of .L16x64perm).
		 * Writes q11 and q12 here, q12-q15 in the helper, and lr via 'bl'.
		 */
		.macro pmull16x64_p8, v16, v64
		// q11 = \v64 rotated down by one byte, so that the odd bytes of \v64
		// land on even positions
		vext.8 q11, \v64, \v64, #1
		// q12 = byte permutation indices for the vtbl lookups below
		vld1.64 {q12}, [r4, :128]
		// de-interleave: q11 receives the even-position bytes of q11 followed
		// by those of \v64, i.e. d22 = odd bytes and d23 = even bytes of the
		// original \v64
		vuzp.8 q11, \v64
		// replicate the bytes of the 16-bit multiplicands as selected by the
		// permutation: d24 from the first index vector, d25 from the second
		vtbl.8 d24, {\v16\()_L-\v16\()_H}, d24
		vtbl.8 d25, {\v16\()_L-\v16\()_H}, d25
		// the helper takes its inputs in q11/q12 and returns two partial
		// results in q12 and q14
		bl __pmull16x64_p8
		// final combination of the two partial results
		veor \v64, q12, q14
		.endm

		/*
		 * Out-of-line body shared by all expansions of pmull16x64_p8.
		 *
		 * In:  d22/d23 (q11) and d24/d25 (q12) as prepared by the macro
		 * Out: q12 and q14, which the macro XORs together
		 * Also writes q13 and q15. Returns with 'bx lr'.
		 *
		 * The a/b/c/d names below refer to the vectors in the description
		 * that precedes the pmull16x64_p8 macro.
		 */
		__pmull16x64_p8:
		// four 8x8->16 polynomial multiplies: q13 = a, q14 = c, q15 = b,
		// q12 = d (q12's inputs d24/d25 are consumed by the first three
		// multiplies before it is overwritten by the last one)
		vmull.p8 q13, d23, d24
		vmull.p8 q14, d23, d25
		vmull.p8 q15, d22, d24
		vmull.p8 q12, d22, d25

		// b and c have matching ranks, so merge them first
		veor q14, q14, q15
		// fold the upper half of each remaining vector into its lower half
		veor d24, d24, d25
		veor d26, d26, d27
		veor d28, d28, d29
		// clear the upper halves so the rotations below act as byte shifts
		vmov.i32 d25, #0
		vmov.i32 d29, #0
		// rotate by 14 bytes: moves d up by 2 bytes (16 bits)
		vext.8 q12, q12, q12, #14
		// rotate by 15 bytes: moves b ^ c up by 1 byte (8 bits)
		vext.8 q14, q14, q14, #15
		// a is not shifted; merge it into the low 64 bits of q12
		veor d24, d24, d26
		bx lr
		ENDPROC(__pmull16x64_p8)

		.macro pmull16x64_p64, v16, v64
		vmull.p64 q11, \v64\()l, \v16\()_L
		vmull.p64 \v64, \v64\()h, \v16\()_H
		@@ -249,9 +325,9 @@ CPU_LE( vrev64.8 q0, q0 )
		vswp q0l, q0h

		// q1 = high order part of second chunk: q7 left-shifted by 'len' bytes.
		mov_l r3, .Lbyteshift_table + 16
		sub r3, r3, len
		vld1.8 {q2}, [r3]
		mov_l r1, .Lbyteshift_table + 16
		sub r1, r1, len
		vld1.8 {q2}, [r1]
		vtbl.8 q1l, {q7l-q7h}, q2l
		vtbl.8 q1h, {q7l-q7h}, q2h

		@@ -341,9 +417,20 @@ ENTRY(crc_t10dif_pmull64)

		vmov.u16 r0, q0l[0]
		bx lr

		ENDPROC(crc_t10dif_pmull64)

		/*
		 * crc_t10dif_pmull8 - entry point for the plain NEON (vmull.p8) variant.
		 *
		 * C prototype in the glue code:
		 *   void crc_t10dif_pmull8(u16 init_crc, const u8 *buf, size_t len,
		 *                          u8 out[16]);
		 *
		 * Unlike crc_t10dif_pmull64, no CRC is returned in r0: the contents
		 * of q7 are stored to the buffer addressed by r3 (presumably 'out',
		 * the fourth argument - confirm against the calling convention).
		 * The store uses a :128 alignment qualifier, so that buffer must be
		 * 16-byte aligned.
		 */
		ENTRY(crc_t10dif_pmull8)
		// r4 is used by pmull16x64_p8 as the pointer to the permutation
		// vector; lr is saved because that macro issues 'bl'
		push {r4, lr}
		mov_l r4, .L16x64perm

		// expand the shared CRC body with the 'p8' parameter
		crct10dif p8

		// on little-endian, reverse the bytes within each 64-bit half of q7,
		// then swap the two halves (a full 128-bit byte reversal on LE)
		CPU_LE( vrev64.8 q7, q7 )
		vswp q7l, q7h
		// write the 16 bytes of q7 to [r3]
		vst1.64 {q7}, [r3, :128]
		// restore r4 and return by popping the saved lr into pc
		pop {r4, pc}
		ENDPROC(crc_t10dif_pmull8)

		.section ".rodata", "a"
		.align 4

		@@ -376,3 +463,6 @@ ENDPROC(crc_t10dif_pmull64)
		.byte 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
		.byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
		.byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe , 0x0

		// vtbl.8 index vector loaded through r4 by pmull16x64_p8. As byte
		// lanes (least significant first) the first quad is
		// { 0, 0, 0, 0, 8, 8, 8, 8 } and the second { 1, 1, 1, 1, 9, 9, 9, 9 },
		// i.e. bytes 0 and 1 of the first table register and bytes 0 and 1 of
		// the second one, each replicated four times.
		.L16x64perm:
		.quad 0x808080800000000, 0x909090901010101

arch/arm/crypto/crct10dif-ce-glue.c

+40 −5

Original line number	Diff line number	Diff line
		@@ -20,6 +20,8 @@
		#define CRC_T10DIF_PMULL_CHUNK_SIZE 16U

		asmlinkage u16 crc_t10dif_pmull64(u16 init_crc, const u8 *buf, size_t len);
		/*
		 * Plain NEON variant implemented in crct10dif-ce-core.S. It does not
		 * return a CRC; it stores 16 bytes to @out using a 128-bit aligned
		 * store, so @out must be 16-byte aligned. The caller in this file
		 * passes those 16 bytes through crc_t10dif_generic() with a zero seed
		 * to obtain the CRC.
		 */
		asmlinkage void crc_t10dif_pmull8(u16 init_crc, const u8 *buf, size_t len,
		u8 out[16]);

		static int crct10dif_init(struct shash_desc *desc)
		{
		@@ -45,6 +47,27 @@ static int crct10dif_update_ce(struct shash_desc desc, const u8 data,
		return 0;
		}

		/*
		 * crct10dif_update_neon - .update handler of the plain NEON driver
		 * @desc:   hash descriptor; its context holds the running 16-bit CRC
		 * @data:   input bytes to fold into the CRC
		 * @length: number of bytes at @data
		 *
		 * Inputs longer than CRC_T10DIF_PMULL_CHUNK_SIZE are processed by
		 * crc_t10dif_pmull8() when SIMD is usable in the current context. That
		 * routine leaves a 16-byte intermediate value in a local buffer, which
		 * is then run through crc_t10dif_generic() with a zero seed. Shorter
		 * inputs, or contexts where SIMD is not usable, go to
		 * crc_t10dif_generic() directly.
		 *
		 * Return: always 0.
		 */
		static int crct10dif_update_neon(struct shash_desc *desc, const u8 *data,
						 unsigned int length)
		{
			u16 *crcp = shash_desc_ctx(desc);
			/* the assembly stores to this buffer with a 128-bit aligned store */
			u8 buf[16] __aligned(16);
			u16 crc = *crcp;

			if (length > CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) {
				kernel_neon_begin();
				crc_t10dif_pmull8(crc, data, length, buf);
				kernel_neon_end();

				/*
				 * The running CRC has been consumed by the NEON code, so
				 * restart from zero and finish over the 16 bytes it left.
				 */
				crc = 0;
				data = buf;
				length = sizeof(buf);
			}

			*crcp = crc_t10dif_generic(crc, data, length);
			return 0;
		}

		static int crct10dif_final(struct shash_desc desc, u8 out)
		{
		u16 *crc = shash_desc_ctx(desc);
		@@ -53,7 +76,19 @@ static int crct10dif_final(struct shash_desc desc, u8 out)
		return 0;
		}

		static struct shash_alg crc_t10dif_alg = {
		static struct shash_alg algs[] = {{
		.digestsize = CRC_T10DIF_DIGEST_SIZE,
		.init = crct10dif_init,
		.update = crct10dif_update_neon,
		.final = crct10dif_final,
		.descsize = CRC_T10DIF_DIGEST_SIZE,

		.base.cra_name = "crct10dif",
		.base.cra_driver_name = "crct10dif-arm-neon",
		.base.cra_priority = 150,
		.base.cra_blocksize = CRC_T10DIF_BLOCK_SIZE,
		.base.cra_module = THIS_MODULE,
		}, {
		.digestsize = CRC_T10DIF_DIGEST_SIZE,
		.init = crct10dif_init,
		.update = crct10dif_update_ce,
		@@ -65,19 +100,19 @@ static struct shash_alg crc_t10dif_alg = {
		.base.cra_priority = 200,
		.base.cra_blocksize = CRC_T10DIF_BLOCK_SIZE,
		.base.cra_module = THIS_MODULE,
		};
		}};

		/*
		 * Module init: refuse to load without NEON, otherwise register the
		 * driver(s) in algs[].
		 *
		 * The first entry of algs[] (crct10dif_update_neon) only needs NEON and
		 * is always registered. The second entry (crct10dif_update_ce) is added
		 * only when the CPU advertises HWCAP2_PMULL.
		 *
		 * Return: 0 on success, -ENODEV without NEON, or the error returned by
		 * crypto_register_shashes().
		 */
		static int __init crc_t10dif_mod_init(void)
		{
			if (!(elf_hwcap & HWCAP_NEON))
				return -ENODEV;

			return crypto_register_shashes(algs, 1 + !!(elf_hwcap2 & HWCAP2_PMULL));
		}

		/*
		 * Module exit: unregister exactly the entries of algs[] that module init
		 * registered. The count is recomputed from the same HWCAP2_PMULL test
		 * that init uses.
		 */
		static void __exit crc_t10dif_mod_exit(void)
		{
			crypto_unregister_shashes(algs, 1 + !!(elf_hwcap2 & HWCAP2_PMULL));
		}

		module_init(crc_t10dif_mod_init);