Commit 779cee82, authored by Ard Biesheuvel, committed by Herbert Xu
Browse files

crypto: arm64/crct10dif - Remove remaining 64x64 PMULL fallback code



The only remaining user of the fallback implementation of 64x64
polynomial multiplication using 8x8 PMULL instructions is the final
reduction from a 16 byte vector to a 16-bit CRC.

The fallback code is complicated and messy, and this reduction has
little impact on the overall performance, so instead, let's calculate
the final CRC by passing the 16 byte vector to the generic CRC-T10DIF
implementation when running the fallback version.

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
parent 67dfb1b7
Loading
Loading
Loading
Loading
+56 −188
Original line number Diff line number Diff line
@@ -74,137 +74,18 @@
	// Incoming arguments: w0 = initial CRC, x1 = data pointer,
	// x2 = byte count (matches the C prototypes for these routines).
	init_crc	.req	w0
	buf		.req	x1
	len		.req	x2
	// NOTE(review): fold_consts_ptr is aliased twice below (x3, then x5).
	// crc_t10dif_pmull_p8 stores its result through x3, so only the x5
	// alias looks intended to remain -- confirm.
	fold_consts_ptr	.req	x3
	fold_consts_ptr	.req	x5

	// Vector holding the pair of folding constants currently in use.
	fold_consts	.req	v10

	// Copy of the multiplicand consumed by __pmull_p8_core.
	ad		.req	v14

	// Bit masks built by __pmull_init_p8 and applied in __pmull_p8_core.
	k00_16		.req	v15
	k32_48		.req	v16

	// Scratch vectors for the 8x8 PMULL partial products.
	t3		.req	v17
	t4		.req	v18
	t5		.req	v19
	t6		.req	v20
	t7		.req	v21
	t8		.req	v22
	t9		.req	v23

	// TBL index vectors (byte rotations) built by __pmull_init_p8.
	perm1		.req	v24
	perm2		.req	v25
	perm3		.req	v26
	perm4		.req	v27

	// Byte-rotated copies of the multiplier, filled by __pmull_pre_p8.
	bd1		.req	v28
	bd2		.req	v29
	bd3		.req	v30
	bd4		.req	v31

	// One-time setup hook for the p64 variant. It expands to nothing; it
	// exists so that `__pmull_init_\p` resolves for both variants.
	.macro		__pmull_init_p64
	.endm

	// Per-constant preparation hook for the p64 variant. Also empty; \bd is
	// accepted only so the call site `__pmull_pre_\p fold_consts` is valid
	// for both variants.
	.macro		__pmull_pre_p64, bd
	.endm

	// One-time setup for the p8 (8x8 PMULL) variant: builds the two bit
	// masks, the four TBL rotation vectors and the byte-duplication vector.
	// Clobbers x5 (scratch for mov_q), k00_16, k32_48, perm1-perm4 and bd1.
	.macro		__pmull_init_p8
	// k00_16 := 0x0000000000000000_000000000000ffff
	// k32_48 := 0x00000000ffffffff_0000ffffffffffff
	movi		k32_48.2d, #0xffffffff
	mov		k32_48.h[2], k32_48.h[0]
	ushr		k00_16.2d, k32_48.2d, #32

	// prepare the permutation vectors
	// perm4.8b = 8 in the low eight bytes only (upper half zeroed), so the
	// EOR turns the low lane of perm1 into indices {1,...,7,0} while the
	// high lane stays {9,...,15,8}: a one-byte rotation of each 64-bit lane.
	mov_q		x5, 0x080f0e0d0c0b0a09
	movi		perm4.8b, #8
	dup		perm1.2d, x5
	eor		perm1.16b, perm1.16b, perm4.16b
	// Each ushr/sli pair rotates perm1's 64-bit lanes right by 8, 16 and
	// 24 bits, giving rotation-by-2, -3 and -4-byte index vectors.
	ushr		perm2.2d, perm1.2d, #8
	ushr		perm3.2d, perm1.2d, #16
	ushr		perm4.2d, perm1.2d, #24
	sli		perm2.2d, perm1.2d, #56
	sli		perm3.2d, perm1.2d, #48
	sli		perm4.2d, perm1.2d, #40

	// Compose { 0,0,0,0, 8,8,8,8, 1,1,1,1, 9,9,9,9 }
	// (used as the TBL index vector in pmull16x64_p8)
	movi		bd1.4h, #8, lsl #8
	orr		bd1.2s, #1, lsl #16
	orr		bd1.2s, #1, lsl #24
	zip1		bd1.16b, bd1.16b, bd1.16b
	zip1		bd1.16b, bd1.16b, bd1.16b
	.endm

	// Prepare multiplier \bd for __pmull_p8_core: bd1..bd4 receive copies of
	// \bd permuted through perm1..perm4. Overwrites all four bdN vectors,
	// including the index vector that __pmull_init_p8 left in bd1.
	.macro		__pmull_pre_p8, bd
	tbl		bd1.16b, {\bd\().16b}, perm1.16b
	tbl		bd2.16b, {\bd\().16b}, perm2.16b
	tbl		bd3.16b, {\bd\().16b}, perm3.16b
	tbl		bd4.16b, {\bd\().16b}, perm4.16b
	.endm

//
// Helper for the 64x64 polynomial multiply built from 8x8 PMULL
// instructions. Two entry points share the tail at local label 0:
//   .L__pmull_p8_core  - works on the low 8 bytes (pmull, ext rotations)
//   .L__pmull_p8_core2 - works on the high 8 bytes (pmull2, tbl rotations)
//
// In:      ad, fold_consts, bd1-bd4, perm1-perm3, k00_16, k32_48
// Out:     t4 and t6 (the __pmull_p8 macro XORs both into its result)
// Scratch: t3, t5, t7, t8, t9
//
SYM_FUNC_START_LOCAL(__pmull_p8_core)
.L__pmull_p8_core:
	// Low-half entry: rotate the low 8 bytes of ad by 1, 2 and 3 bytes.
	ext		t4.8b, ad.8b, ad.8b, #1			// A1
	ext		t5.8b, ad.8b, ad.8b, #2			// A2
	ext		t6.8b, ad.8b, ad.8b, #3			// A3

	pmull		t4.8h, t4.8b, fold_consts.8b		// F = A1*B
	pmull		t8.8h, ad.8b, bd1.8b			// E = A*B1
	pmull		t5.8h, t5.8b, fold_consts.8b		// H = A2*B
	pmull		t7.8h, ad.8b, bd2.8b			// G = A*B2
	pmull		t6.8h, t6.8b, fold_consts.8b		// J = A3*B
	pmull		t9.8h, ad.8b, bd3.8b			// I = A*B3
	pmull		t3.8h, ad.8b, bd4.8b			// K = A*B4
	b		0f

.L__pmull_p8_core2:
	// High-half entry: same rotations via the perm vectors, because
	// pmull2 reads the upper 8 bytes of each source.
	tbl		t4.16b, {ad.16b}, perm1.16b		// A1
	tbl		t5.16b, {ad.16b}, perm2.16b		// A2
	tbl		t6.16b, {ad.16b}, perm3.16b		// A3

	pmull2		t4.8h, t4.16b, fold_consts.16b		// F = A1*B
	pmull2		t8.8h, ad.16b, bd1.16b			// E = A*B1
	pmull2		t5.8h, t5.16b, fold_consts.16b		// H = A2*B
	pmull2		t7.8h, ad.16b, bd2.16b			// G = A*B2
	pmull2		t6.8h, t6.16b, fold_consts.16b		// J = A3*B
	pmull2		t9.8h, ad.16b, bd3.16b			// I = A*B3
	pmull2		t3.8h, ad.16b, bd4.16b			// K = A*B4

	// Common tail: combine the partial products pairwise.
0:	eor		t4.16b, t4.16b, t8.16b			// L = E + F
	eor		t5.16b, t5.16b, t7.16b			// M = G + H
	eor		t6.16b, t6.16b, t9.16b			// N = I + J

	// Regroup: low 64-bit halves into t8/t7, high halves into t4/t6.
	uzp1		t8.2d, t4.2d, t5.2d
	uzp2		t4.2d, t4.2d, t5.2d
	uzp1		t7.2d, t6.2d, t3.2d
	uzp2		t6.2d, t6.2d, t3.2d

	// t4 = (L) (P0 + P1) << 8
	// t5 = (M) (P2 + P3) << 16
	eor		t8.16b, t8.16b, t4.16b
	and		t4.16b, t4.16b, k32_48.16b

	// t6 = (N) (P4 + P5) << 24
	// t7 = (K) (P6 + P7) << 32
	eor		t7.16b, t7.16b, t6.16b
	and		t6.16b, t6.16b, k00_16.16b

	eor		t8.16b, t8.16b, t4.16b
	eor		t7.16b, t7.16b, t6.16b

	// Undo the uzp regrouping so each of t4, t5, t6, t3 is one full
	// 128-bit term again.
	zip2		t5.2d, t8.2d, t4.2d
	zip1		t4.2d, t8.2d, t4.2d
	zip2		t3.2d, t7.2d, t6.2d
	zip1		t6.2d, t7.2d, t6.2d

	// Byte-rotate each term by 15/14/13/12, presumably applying the
	// << 8 / << 16 / << 24 / << 32 weights named in the comments above.
	ext		t4.16b, t4.16b, t4.16b, #15
	ext		t5.16b, t5.16b, t5.16b, #14
	ext		t6.16b, t6.16b, t6.16b, #13
	ext		t3.16b, t3.16b, t3.16b, #12

	// Leave the result as two vectors (t4, t6) for the caller to XOR in.
	eor		t4.16b, t4.16b, t5.16b
	eor		t6.16b, t6.16b, t3.16b
	ret
SYM_FUNC_END(__pmull_p8_core)
	perm		.req	v27

	.macro		pmull16x64_p64, a16, b64, c64
	pmull2		\c64\().1q, \a16\().2d, \b64\().2d
@@ -266,7 +147,7 @@ SYM_FUNC_END(__pmull_p8_core)
	 */
	.macro		pmull16x64_p8, a16, b64, c64
	ext		t7.16b, \b64\().16b, \b64\().16b, #1
	tbl		t5.16b, {\a16\().16b}, bd1.16b
	tbl		t5.16b, {\a16\().16b}, perm.16b
	uzp1		t7.16b, \b64\().16b, t7.16b
	bl		__pmull_p8_16x64
	ext		\b64\().16b, t4.16b, t4.16b, #15
@@ -292,22 +173,6 @@ SYM_FUNC_START_LOCAL(__pmull_p8_16x64)
	ret
SYM_FUNC_END(__pmull_p8_16x64)

	// Polynomial multiply for the p8 variant, built on __pmull_p8_core.
	//   rq - destination vector
	//   ad - multiplicand vector (copied into the `ad` register alias)
	//   bd - multiplier; must be literally `fold_consts`, otherwise the
	//        assembler raises an error via .err
	//   i  - blank: multiply the low halves (pmull, .L__pmull_p8_core)
	//        2:     multiply the high halves (pmull2, .L__pmull_p8_core2)
	// Uses `bl`, so the link register is overwritten; also clobbers the
	// `ad` alias and whatever scratch vectors the core routine uses.
	.macro		__pmull_p8, rq, ad, bd, i
	.ifnc		\bd, fold_consts
	.err
	.endif
	mov		ad.16b, \ad\().16b
	.ifb		\i
	pmull		\rq\().8h, \ad\().8b, \bd\().8b		// D = A*B
	.else
	pmull2		\rq\().8h, \ad\().16b, \bd\().16b	// D = A*B
	.endif

	bl		.L__pmull_p8_core\i

	// Fold the two vectors returned by the core into D.
	eor		\rq\().16b, \rq\().16b, t4.16b
	eor		\rq\().16b, \rq\().16b, t6.16b
	.endm

	// Fold reg1, reg2 into the next 32 data bytes, storing the result back
	// into reg1, reg2.
@@ -340,16 +205,7 @@ CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 )
	eor		\dst_reg\().16b, \dst_reg\().16b, \src_reg\().16b
	.endm

	// Polynomial multiply for the p64 variant: a single 64x64 PMULL.
	//   rd - destination vector (128-bit product)
	//   rn, rm - source vectors
	//   n  - blank: multiply the low 64-bit lanes (pmull)
	//        non-blank: multiply the high 64-bit lanes (pmull2)
	.macro		__pmull_p64, rd, rn, rm, n
	.ifb		\n
	pmull		\rd\().1q, \rn\().1d, \rm\().1d
	.else
	pmull2		\rd\().1q, \rn\().2d, \rm\().2d
	.endif
	.endm

	.macro		crc_t10dif_pmull, p
	__pmull_init_\p

	// For sizes less than 256 bytes, we can't fold 128 bytes at a time.
	cmp		len, #256
@@ -479,47 +335,7 @@ CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
	pmull16x64_\p	fold_consts, v3, v0
	eor		v7.16b, v3.16b, v0.16b
	eor		v7.16b, v7.16b, v2.16b

.Lreduce_final_16_bytes_\@:
	// Reduce the 128-bit value M(x), stored in v7, to the final 16-bit CRC.

	movi		v2.16b, #0		// init zero register

	// Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
	ld1		{fold_consts.2d}, [fold_consts_ptr], #16
	__pmull_pre_\p	fold_consts

	// Fold the high 64 bits into the low 64 bits, while also multiplying by
	// x^64.  This produces a 128-bit value congruent to x^64 * M(x) and
	// whose low 48 bits are 0.
	ext		v0.16b, v2.16b, v7.16b, #8
	__pmull_\p	v7, v7, fold_consts, 2	// high bits * x^48 * (x^80 mod G(x))
	eor		v0.16b, v0.16b, v7.16b	// + low bits * x^64

	// Fold the high 32 bits into the low 96 bits.  This produces a 96-bit
	// value congruent to x^64 * M(x) and whose low 48 bits are 0.
	ext		v1.16b, v0.16b, v2.16b, #12	// extract high 32 bits
	mov		v0.s[3], v2.s[0]	// zero high 32 bits
	__pmull_\p	v1, v1, fold_consts	// high 32 bits * x^48 * (x^48 mod G(x))
	eor		v0.16b, v0.16b, v1.16b	// + low bits

	// Load G(x) and floor(x^48 / G(x)).
	ld1		{fold_consts.2d}, [fold_consts_ptr]
	__pmull_pre_\p	fold_consts

	// Use Barrett reduction to compute the final CRC value.
	__pmull_\p	v1, v0, fold_consts, 2	// high 32 bits * floor(x^48 / G(x))
	ushr		v1.2d, v1.2d, #32	// /= x^32
	__pmull_\p	v1, v1, fold_consts	// *= G(x)
	ushr		v0.2d, v0.2d, #48
	eor		v0.16b, v0.16b, v1.16b	// + low 16 nonzero bits
	// Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of v0.

	umov		w0, v0.h[0]
	.ifc		\p, p8
	frame_pop
	.endif
	ret
	b		.Lreduce_final_16_bytes_\@

.Lless_than_256_bytes_\@:
	// Checksumming a buffer of length 16...255 bytes
@@ -545,6 +361,8 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
	b.ge		.Lfold_16_bytes_loop_\@		// 32 <= len <= 255
	add		len, len, #16
	b		.Lhandle_partial_segment_\@	// 17 <= len <= 31

.Lreduce_final_16_bytes_\@:
	.endm

//
@@ -554,7 +372,22 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
//
SYM_FUNC_START(crc_t10dif_pmull_p8)
	// C view: void crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf,
	//                                  size_t len, u8 out[16])
	// The folded 16-byte vector is written to `out` (x3); no CRC is
	// returned in w0. A frame is pushed because the folding macro issues
	// `bl` to a local helper.
	frame_push	1

	// Compose { 0,0,0,0, 8,8,8,8, 1,1,1,1, 9,9,9,9 }
	// (TBL index vector consumed by pmull16x64_p8)
	movi		perm.4h, #8, lsl #8
	orr		perm.2s, #1, lsl #16
	orr		perm.2s, #1, lsl #24
	zip1		perm.16b, perm.16b, perm.16b
	zip1		perm.16b, perm.16b, perm.16b

	// Fold the input; the macro ends at its .Lreduce_final_16_bytes label
	// with the 128-bit remainder in v7 and falls through to here.
	crc_t10dif_pmull p8

	// On little-endian builds, reverse all 16 bytes of v7 (rev64 swaps
	// bytes within each 64-bit lane, ext #8 swaps the lanes) --
	// presumably restoring memory byte order for the generic CRC code.
CPU_LE(	rev64		v7.16b, v7.16b			)
CPU_LE(	ext		v7.16b, v7.16b, v7.16b, #8	)
	str		q7, [x3]			// out[0..15] = v7

	frame_pop
	ret
SYM_FUNC_END(crc_t10dif_pmull_p8)

	.align		5
@@ -565,6 +398,41 @@ SYM_FUNC_END(crc_t10dif_pmull_p8)
//
SYM_FUNC_START(crc_t10dif_pmull_p64)
	// C view: u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf,
	//                                  size_t len)
	// The folding macro ends at its .Lreduce_final_16_bytes label, so
	// execution falls through into the reduction below with the folded
	// value in v7.
	crc_t10dif_pmull	p64

	// Reduce the 128-bit value M(x), stored in v7, to the final 16-bit CRC.

	movi		v2.16b, #0		// init zero register

	// Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
	// The post-increment leaves fold_consts_ptr at the next 16-byte entry.
	ld1		{fold_consts.2d}, [fold_consts_ptr], #16

	// Fold the high 64 bits into the low 64 bits, while also multiplying by
	// x^64.  This produces a 128-bit value congruent to x^64 * M(x) and
	// whose low 48 bits are 0.
	ext		v0.16b, v2.16b, v7.16b, #8
	pmull2		v7.1q, v7.2d, fold_consts.2d	// high bits * x^48 * (x^80 mod G(x))
	eor		v0.16b, v0.16b, v7.16b		// + low bits * x^64

	// Fold the high 32 bits into the low 96 bits.  This produces a 96-bit
	// value congruent to x^64 * M(x) and whose low 48 bits are 0.
	ext		v1.16b, v0.16b, v2.16b, #12	// extract high 32 bits
	mov		v0.s[3], v2.s[0]		// zero high 32 bits
	pmull		v1.1q, v1.1d, fold_consts.1d	// high 32 bits * x^48 * (x^48 mod G(x))
	eor		v0.16b, v0.16b, v1.16b		// + low bits

	// Load G(x) and floor(x^48 / G(x)).
	ld1		{fold_consts.2d}, [fold_consts_ptr]

	// Use Barrett reduction to compute the final CRC value.
	pmull2		v1.1q, v0.2d, fold_consts.2d	// high 32 bits * floor(x^48 / G(x))
	ushr		v1.2d, v1.2d, #32		// /= x^32
	pmull		v1.1q, v1.1d, fold_consts.1d	// *= G(x)
	ushr		v0.2d, v0.2d, #48
	eor		v0.16b, v0.16b, v1.16b		// + low 16 nonzero bits
	// Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of v0.

	umov		w0, v0.h[0]			// return value: the 16-bit CRC
	ret
SYM_FUNC_END(crc_t10dif_pmull_p64)

	.section	".rodata", "a"
+12 −6
Original line number Diff line number Diff line
@@ -20,7 +20,8 @@

/* Length threshold the update routine tests before taking the PMULL path. */
#define CRC_T10DIF_PMULL_CHUNK_SIZE	16U

/*
 * Assembly entry points.
 *
 * NOTE(review): two prototypes for crc_t10dif_pmull_p8 are visible below.
 * The assembly stores a 16-byte vector through its fourth argument and
 * returns nothing, which matches the void/out[16] form -- confirm that the
 * u16-returning form is the one being dropped.
 */
asmlinkage u16 crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len);
asmlinkage void crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len,
				    u8 out[16]);
asmlinkage u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len);

static int crct10dif_init(struct shash_desc *desc)
@@ -34,16 +35,21 @@ static int crct10dif_init(struct shash_desc *desc)
/*
 * shash .update handler for the 8-bit PMULL fallback.
 *
 * Folds @length bytes from @data into the running CRC-T10DIF kept in the
 * descriptor context. The assembly routine is only called when the input
 * passes the CRC_T10DIF_PMULL_CHUNK_SIZE test and crypto_simd_usable()
 * is true, and always inside a kernel_neon_begin()/kernel_neon_end()
 * section; otherwise the data goes to crc_t10dif_generic().
 *
 * Always returns 0.
 *
 * NOTE(review): this listing shows statements from both the old and the
 * new form of the function side by side (e.g. two `if` conditions, two
 * declarations of the context pointer) -- confirm which lines remain.
 */
static int crct10dif_update_pmull_p8(struct shash_desc *desc, const u8 *data,
			    unsigned int length)
{
	u16 *crc = shash_desc_ctx(desc);
	u16 *crcp = shash_desc_ctx(desc);
	u16 crc = *crcp;
	u8 buf[16];	/* receives the 16-byte folded vector from the asm */

	if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) {
	if (length > CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) {
		kernel_neon_begin();
		*crc = crc_t10dif_pmull_p8(*crc, data, length);
		crc_t10dif_pmull_p8(crc, data, length, buf);
		kernel_neon_end();
	} else {
		*crc = crc_t10dif_generic(*crc, data, length);

		/*
		 * Redirect the generic pass below to the 16-byte vector, with
		 * a zero seed: the initial CRC was already passed to the asm.
		 */
		crc = 0;
		data = buf;
		length = sizeof(buf);
	}

	/* Final (or only) pass through the generic implementation. */
	*crcp = crc_t10dif_generic(crc, data, length);
	return 0;
}