Commit 64e3586c authored by Eric Biggers
Browse files

x86/crc32: update prototype for crc_pcl()



- Change the len parameter from unsigned int to size_t, so that the
  library function which takes a size_t can safely use this code.

- Rename to crc32c_x86_3way() which is much clearer.

- Move the crc parameter to the front, as this is the usual convention.

Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20241202010844.144356-12-ebiggers@kernel.org


Signed-off-by: Eric Biggers <ebiggers@google.com>
parent 0f60a8ac
Loading
Loading
Loading
Loading
+3 −4
Original line number Diff line number Diff line
@@ -41,8 +41,7 @@
 */
#define CRC32C_PCL_BREAKEVEN	512

asmlinkage unsigned int crc_pcl(const u8 *buffer, unsigned int len,
				unsigned int crc_init);
asmlinkage u32 crc32c_x86_3way(u32 crc, const u8 *buffer, size_t len);
#endif /* CONFIG_X86_64 */

static u32 crc32c_intel_le_hw_byte(u32 crc, unsigned char const *data, size_t length)
@@ -159,7 +158,7 @@ static int crc32c_pcl_intel_update(struct shash_desc *desc, const u8 *data,
	 */
	if (len >= CRC32C_PCL_BREAKEVEN && crypto_simd_usable()) {
		kernel_fpu_begin();
		*crcp = crc_pcl(data, len, *crcp);
		*crcp = crc32c_x86_3way(*crcp, data, len);
		kernel_fpu_end();
	} else
		*crcp = crc32c_intel_le_hw(*crcp, data, len);
@@ -171,7 +170,7 @@ static int __crc32c_pcl_intel_finup(u32 *crcp, const u8 *data, unsigned int len,
{
	if (len >= CRC32C_PCL_BREAKEVEN && crypto_simd_usable()) {
		kernel_fpu_begin();
		*(__le32 *)out = ~cpu_to_le32(crc_pcl(data, len, *crcp));
		*(__le32 *)out = ~cpu_to_le32(crc32c_x86_3way(*crcp, data, len));
		kernel_fpu_end();
	} else
		*(__le32 *)out =
+32 −31
Original line number Diff line number Diff line
@@ -52,15 +52,16 @@
# regular CRC code that does not interleave the CRC instructions.
#define SMALL_SIZE 200

# unsigned int crc_pcl(const u8 *buffer, unsigned int len, unsigned int crc_init);
# u32 crc32c_x86_3way(u32 crc, const u8 *buffer, size_t len);

.text
SYM_FUNC_START(crc_pcl)
#define    bufp		  %rdi
#define    bufp_d	  %edi
#define    len		  %esi
#define    crc_init	  %edx
#define    crc_init_q	  %rdx
SYM_FUNC_START(crc32c_x86_3way)
#define    crc0		  %edi
#define    crc0_q	  %rdi
#define    bufp		  %rsi
#define    bufp_d	  %esi
#define    len		  %rdx
#define    len_dw	  %edx
#define    n_misaligned	  %ecx /* overlaps chunk_bytes! */
#define    n_misaligned_q %rcx
#define    chunk_bytes	  %ecx /* overlaps n_misaligned! */
@@ -85,9 +86,9 @@ SYM_FUNC_START(crc_pcl)
.Ldo_align:
	movq	(bufp), %rax
	add	n_misaligned_q, bufp
	sub	n_misaligned, len
	sub	n_misaligned_q, len
.Lalign_loop:
	crc32b	%al, crc_init		# compute crc32 of 1-byte
	crc32b	%al, crc0		# compute crc32 of 1-byte
	shr	$8, %rax		# get next byte
	dec	n_misaligned
	jne     .Lalign_loop
@@ -102,7 +103,7 @@ SYM_FUNC_START(crc_pcl)

.Lpartial_block:
	# Compute floor(len / 24) to get num qwords to process from each lane.
	imul	$2731, len, %eax	# 2731 = ceil(2^16 / 24)
	imul	$2731, len_dw, %eax	# 2731 = ceil(2^16 / 24)
	shr	$16, %eax
	jmp	.Lcrc_3lanes

@@ -125,16 +126,16 @@ SYM_FUNC_START(crc_pcl)
	# Unroll the loop by a factor of 4 to reduce the overhead of the loop
	# bookkeeping instructions, which can compete with crc32q for the ALUs.
.Lcrc_3lanes_4x_loop:
	crc32q	(bufp), crc_init_q
	crc32q	(bufp), crc0_q
	crc32q	(bufp,chunk_bytes_q), crc1
	crc32q	(bufp,chunk_bytes_q,2), crc2
	crc32q	8(bufp), crc_init_q
	crc32q	8(bufp), crc0_q
	crc32q	8(bufp,chunk_bytes_q), crc1
	crc32q	8(bufp,chunk_bytes_q,2), crc2
	crc32q	16(bufp), crc_init_q
	crc32q	16(bufp), crc0_q
	crc32q	16(bufp,chunk_bytes_q), crc1
	crc32q	16(bufp,chunk_bytes_q,2), crc2
	crc32q	24(bufp), crc_init_q
	crc32q	24(bufp), crc0_q
	crc32q	24(bufp,chunk_bytes_q), crc1
	crc32q	24(bufp,chunk_bytes_q,2), crc2
	add	$32, bufp
@@ -146,7 +147,7 @@ SYM_FUNC_START(crc_pcl)
	jz	.Lcrc_3lanes_last_qword

.Lcrc_3lanes_1x_loop:
	crc32q	(bufp), crc_init_q
	crc32q	(bufp), crc0_q
	crc32q	(bufp,chunk_bytes_q), crc1
	crc32q	(bufp,chunk_bytes_q,2), crc2
	add	$8, bufp
@@ -154,7 +155,7 @@ SYM_FUNC_START(crc_pcl)
	jnz	.Lcrc_3lanes_1x_loop

.Lcrc_3lanes_last_qword:
	crc32q	(bufp), crc_init_q
	crc32q	(bufp), crc0_q
	crc32q	(bufp,chunk_bytes_q), crc1
# SKIP  crc32q	(bufp,chunk_bytes_q,2), crc2	; Don't do this one yet

@@ -165,9 +166,9 @@ SYM_FUNC_START(crc_pcl)
	lea	(K_table-8)(%rip), %rax		# first entry is for idx 1
	pmovzxdq (%rax,chunk_bytes_q), %xmm0	# 2 consts: K1:K2
	lea	(chunk_bytes,chunk_bytes,2), %eax # chunk_bytes * 3
	sub	%eax, len			# len -= chunk_bytes * 3
	sub	%rax, len			# len -= chunk_bytes * 3

	movq	crc_init_q, %xmm1		# CRC for block 1
	movq	crc0_q, %xmm1			# CRC for block 1
	pclmulqdq $0x00, %xmm0, %xmm1		# Multiply by K2

	movq    crc1, %xmm2			# CRC for block 2
@@ -176,8 +177,8 @@ SYM_FUNC_START(crc_pcl)
	pxor    %xmm2,%xmm1
	movq    %xmm1, %rax
	xor	(bufp,chunk_bytes_q,2), %rax
	mov	crc2, crc_init_q
	crc32	%rax, crc_init_q
	mov	crc2, crc0_q
	crc32	%rax, crc0_q
	lea	8(bufp,chunk_bytes_q,2), bufp

	################################################################
@@ -193,34 +194,34 @@ SYM_FUNC_START(crc_pcl)
	## 6) Process any remainder without interleaving:
	#######################################################################
.Lsmall:
	test	len, len
	test	len_dw, len_dw
	jz	.Ldone
	mov	len, %eax
	mov	len_dw, %eax
	shr	$3, %eax
	jz	.Ldo_dword
.Ldo_qwords:
	crc32q	(bufp), crc_init_q
	crc32q	(bufp), crc0_q
	add	$8, bufp
	dec	%eax
	jnz	.Ldo_qwords
.Ldo_dword:
	test	$4, len
	test	$4, len_dw
	jz	.Ldo_word
	crc32l	(bufp), crc_init
	crc32l	(bufp), crc0
	add	$4, bufp
.Ldo_word:
	test	$2, len
	test	$2, len_dw
	jz	.Ldo_byte
	crc32w	(bufp), crc_init
	crc32w	(bufp), crc0
	add	$2, bufp
.Ldo_byte:
	test	$1, len
	test	$1, len_dw
	jz	.Ldone
	crc32b	(bufp), crc_init
	crc32b	(bufp), crc0
.Ldone:
	mov	crc_init, %eax
	mov	crc0, %eax
        RET
SYM_FUNC_END(crc_pcl)
SYM_FUNC_END(crc32c_x86_3way)

.section	.rodata, "a", @progbits
	################################################################