Commit 802d8d11 authored by Ard Biesheuvel, committed by Herbert Xu
Browse files

crypto: arm/crct10dif - Macroify PMULL asm code



To allow an alternative version to be created of the PMULL based
CRC-T10DIF algorithm, turn the bulk of it into a macro, except for the
final reduction, which will only be used by the existing version.

Reviewed-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
parent fcf27785
Loading
Loading
Loading
Loading
+78 −76
Original line number Diff line number Diff line
@@ -112,48 +112,42 @@
	FOLD_CONST_L	.req	q10l
	FOLD_CONST_H	.req	q10h

        // Multiply the two 64-bit halves of q register v64 by a pair of 64-bit
        // constants using the 64x64 -> 128 bit polynomial multiply
        // (vmull.p64), and XOR the two 128-bit products together.
        //
        //   v16: name prefix of the constant pair; the macro appends _L and _H
        //        to it (e.g. FOLD_CONST -> FOLD_CONST_L / FOLD_CONST_H).
        //   v64: q register holding the value to fold; overwritten with the
        //        result.
        //
        // The low half of v64 is multiplied by the _L constant and the high
        // half by the _H constant.  Clobbers q11, which is used as scratch for
        // the low-half product.
        .macro		pmull16x64_p64, v16, v64
	vmull.p64	q11, \v64\()l, \v16\()_L
	vmull.p64	\v64, \v64\()h, \v16\()_H
	veor		\v64, \v64, q11
	.endm

	// Fold reg1, reg2 into the next 32 data bytes, storing the result back
	// into reg1, reg2.
	.macro		fold_32_bytes, reg1, reg2
	vld1.64		{q11-q12}, [buf]!
	.macro		fold_32_bytes, reg1, reg2, p
	vld1.64		{q8-q9}, [buf]!

	vmull.p64	q8, \reg1\()h, FOLD_CONST_H
	vmull.p64	\reg1, \reg1\()l, FOLD_CONST_L
	vmull.p64	q9, \reg2\()h, FOLD_CONST_H
	vmull.p64	\reg2, \reg2\()l, FOLD_CONST_L
	pmull16x64_\p	FOLD_CONST, \reg1
	pmull16x64_\p	FOLD_CONST, \reg2

CPU_LE(	vrev64.8	q11, q11	)
CPU_LE(	vrev64.8	q12, q12	)
	vswp		q11l, q11h
	vswp		q12l, q12h
CPU_LE(	vrev64.8	q8, q8	)
CPU_LE(	vrev64.8	q9, q9	)
	vswp		q8l, q8h
	vswp		q9l, q9h

	veor.8		\reg1, \reg1, q8
	veor.8		\reg2, \reg2, q9
	veor.8		\reg1, \reg1, q11
	veor.8		\reg2, \reg2, q12
	.endm

	// Fold src_reg into dst_reg, optionally loading the next fold constants
	.macro		fold_16_bytes, src_reg, dst_reg, load_next_consts
	vmull.p64	q8, \src_reg\()l, FOLD_CONST_L
	vmull.p64	\src_reg, \src_reg\()h, FOLD_CONST_H
	.macro		fold_16_bytes, src_reg, dst_reg, p, load_next_consts
	pmull16x64_\p	FOLD_CONST, \src_reg
	.ifnb		\load_next_consts
	vld1.64		{FOLD_CONSTS}, [fold_consts_ptr, :128]!
	.endif
	veor.8		\dst_reg, \dst_reg, q8
	veor.8		\dst_reg, \dst_reg, \src_reg
	.endm

//
// u16 crc_t10dif_pmull(u16 init_crc, const u8 *buf, size_t len);
//
// Assumes len >= 16.
//
ENTRY(crc_t10dif_pmull)

	.macro		crct10dif, p
	// For sizes less than 256 bytes, we can't fold 128 bytes at a time.
	cmp		len, #256
	blt		.Lless_than_256_bytes
	blt		.Lless_than_256_bytes\@

	mov_l		fold_consts_ptr, .Lfold_across_128_bytes_consts

@@ -194,27 +188,27 @@ CPU_LE( vrev64.8 q7, q7 )

	// While >= 128 data bytes remain (not counting q0-q7), fold the 128
	// bytes q0-q7 into them, storing the result back into q0-q7.
.Lfold_128_bytes_loop:
	fold_32_bytes	q0, q1
	fold_32_bytes	q2, q3
	fold_32_bytes	q4, q5
	fold_32_bytes	q6, q7
.Lfold_128_bytes_loop\@:
	fold_32_bytes	q0, q1, \p
	fold_32_bytes	q2, q3, \p
	fold_32_bytes	q4, q5, \p
	fold_32_bytes	q6, q7, \p
	subs		len, len, #128
	bge		.Lfold_128_bytes_loop
	bge		.Lfold_128_bytes_loop\@

	// Now fold the 112 bytes in q0-q6 into the 16 bytes in q7.

	// Fold across 64 bytes.
	vld1.64		{FOLD_CONSTS}, [fold_consts_ptr, :128]!
	fold_16_bytes	q0, q4
	fold_16_bytes	q1, q5
	fold_16_bytes	q2, q6
	fold_16_bytes	q3, q7, 1
	fold_16_bytes	q0, q4, \p
	fold_16_bytes	q1, q5, \p
	fold_16_bytes	q2, q6, \p
	fold_16_bytes	q3, q7, \p, 1
	// Fold across 32 bytes.
	fold_16_bytes	q4, q6
	fold_16_bytes	q5, q7, 1
	fold_16_bytes	q4, q6, \p
	fold_16_bytes	q5, q7, \p, 1
	// Fold across 16 bytes.
	fold_16_bytes	q6, q7
	fold_16_bytes	q6, q7, \p

	// Add 128 to get the correct number of data bytes remaining in 0...127
	// (not counting q7), following the previous extra subtraction by 128.
@@ -224,25 +218,23 @@ CPU_LE( vrev64.8 q7, q7 )

	// While >= 16 data bytes remain (not counting q7), fold the 16 bytes q7
	// into them, storing the result back into q7.
	blt		.Lfold_16_bytes_loop_done
.Lfold_16_bytes_loop:
	vmull.p64	q8, q7l, FOLD_CONST_L
	vmull.p64	q7, q7h, FOLD_CONST_H
	veor.8		q7, q7, q8
	blt		.Lfold_16_bytes_loop_done\@
.Lfold_16_bytes_loop\@:
	pmull16x64_\p	FOLD_CONST, q7
	vld1.64		{q0}, [buf]!
CPU_LE(	vrev64.8	q0, q0	)
	vswp		q0l, q0h
	veor.8		q7, q7, q0
	subs		len, len, #16
	bge		.Lfold_16_bytes_loop
	bge		.Lfold_16_bytes_loop\@

.Lfold_16_bytes_loop_done:
.Lfold_16_bytes_loop_done\@:
	// Add 16 to get the correct number of data bytes remaining in 0...15
	// (not counting q7), following the previous extra subtraction by 16.
	adds		len, len, #16
	beq		.Lreduce_final_16_bytes
	beq		.Lreduce_final_16_bytes\@

.Lhandle_partial_segment:
.Lhandle_partial_segment\@:
	// Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first
	// 16 bytes are in q7 and the rest are the remaining data in 'buf'.  To
	// do this without needing a fold constant for each possible 'len',
@@ -277,12 +269,46 @@ CPU_LE( vrev64.8 q0, q0 )
	vbsl.8		q2, q1, q0

	// Fold the first chunk into the second chunk, storing the result in q7.
	vmull.p64	q0, q3l, FOLD_CONST_L
	vmull.p64	q7, q3h, FOLD_CONST_H
	veor.8		q7, q7, q0
	veor.8		q7, q7, q2
	pmull16x64_\p	FOLD_CONST, q3
	veor.8		q7, q3, q2
	b		.Lreduce_final_16_bytes\@

.Lless_than_256_bytes\@:
	// Checksumming a buffer of length 16...255 bytes

	mov_l		fold_consts_ptr, .Lfold_across_16_bytes_consts

	// Load the first 16 data bytes.
	vld1.64		{q7}, [buf]!
CPU_LE(	vrev64.8	q7, q7	)
	vswp		q7l, q7h

	// XOR the first 16 data *bits* with the initial CRC value.
	vmov.i8		q0h, #0
	vmov.u16	q0h[3], init_crc
	veor.8		q7h, q7h, q0h

	// Load the fold-across-16-bytes constants.
	vld1.64		{FOLD_CONSTS}, [fold_consts_ptr, :128]!

	cmp		len, #16
	beq		.Lreduce_final_16_bytes\@	// len == 16
	subs		len, len, #32
	addlt		len, len, #16
	blt		.Lhandle_partial_segment\@	// 17 <= len <= 31
	b		.Lfold_16_bytes_loop\@		// 32 <= len <= 255

.Lreduce_final_16_bytes\@:
	.endm

//
// u16 crc_t10dif_pmull64(u16 init_crc, const u8 *buf, size_t len);
//
// Assumes len >= 16.
//
ENTRY(crc_t10dif_pmull64)
	crct10dif	p64

.Lreduce_final_16_bytes:
	// Reduce the 128-bit value M(x), stored in q7, to the final 16-bit CRC.

	// Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
@@ -316,31 +342,7 @@ CPU_LE( vrev64.8 q0, q0 )
	vmov.u16	r0, q0l[0]
	bx		lr

.Lless_than_256_bytes:
	// Checksumming a buffer of length 16...255 bytes

	mov_l		fold_consts_ptr, .Lfold_across_16_bytes_consts

	// Load the first 16 data bytes.
	vld1.64		{q7}, [buf]!
CPU_LE(	vrev64.8	q7, q7	)
	vswp		q7l, q7h

	// XOR the first 16 data *bits* with the initial CRC value.
	vmov.i8		q0h, #0
	vmov.u16	q0h[3], init_crc
	veor.8		q7h, q7h, q0h

	// Load the fold-across-16-bytes constants.
	vld1.64		{FOLD_CONSTS}, [fold_consts_ptr, :128]!

	cmp		len, #16
	beq		.Lreduce_final_16_bytes		// len == 16
	subs		len, len, #32
	addlt		len, len, #16
	blt		.Lhandle_partial_segment	// 17 <= len <= 31
	b		.Lfold_16_bytes_loop		// 32 <= len <= 255
ENDPROC(crc_t10dif_pmull)
ENDPROC(crc_t10dif_pmull64)

	.section	".rodata", "a"
	.align		4
+5 −5
Original line number Diff line number Diff line
@@ -19,7 +19,7 @@

#define CRC_T10DIF_PMULL_CHUNK_SIZE	16U

asmlinkage u16 crc_t10dif_pmull(u16 init_crc, const u8 *buf, size_t len);
asmlinkage u16 crc_t10dif_pmull64(u16 init_crc, const u8 *buf, size_t len);

static int crct10dif_init(struct shash_desc *desc)
{
@@ -29,14 +29,14 @@ static int crct10dif_init(struct shash_desc *desc)
	return 0;
}

static int crct10dif_update(struct shash_desc *desc, const u8 *data,
static int crct10dif_update_ce(struct shash_desc *desc, const u8 *data,
			       unsigned int length)
{
	u16 *crc = shash_desc_ctx(desc);

	if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) {
		kernel_neon_begin();
		*crc = crc_t10dif_pmull(*crc, data, length);
		*crc = crc_t10dif_pmull64(*crc, data, length);
		kernel_neon_end();
	} else {
		*crc = crc_t10dif_generic(*crc, data, length);
@@ -56,7 +56,7 @@ static int crct10dif_final(struct shash_desc *desc, u8 *out)
static struct shash_alg crc_t10dif_alg = {
	.digestsize		= CRC_T10DIF_DIGEST_SIZE,
	.init			= crct10dif_init,
	.update			= crct10dif_update,
	.update			= crct10dif_update_ce,
	.final			= crct10dif_final,
	.descsize		= CRC_T10DIF_DIGEST_SIZE,