Commit a7acd77e authored by Eric Biggers
Browse files

lib/crypto: x86/blake2s: Improve readability



Various cleanups for readability.  No change to the generated code:

- Add some comments
- Add #defines for arguments
- Rename some labels
- Use decimal constants instead of hex where it makes sense.
  (The pshufd immediates intentionally remain as hex.)
- Add blank lines when there's a logical break

The round loop still could use some work, but this is at least a start.

Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20251102234209.62133-5-ebiggers@kernel.org


Signed-off-by: Eric Biggers <ebiggers@kernel.org>
parent 83c1a867
Loading
Loading
Loading
Loading
+134 −97
Original line number Diff line number Diff line
@@ -50,34 +50,52 @@
.byte 15,  5,  4, 13, 10,  7,  3, 11, 12,  2,  0,  6,  9,  8,  1, 14
.byte  8,  7, 14, 11, 13, 15,  0, 12, 10,  4,  5,  6,  3,  2,  1,  9

// Symbolic names for the four function arguments.  Both
// blake2s_compress_ssse3() and blake2s_compress_avx512() are declared as
// (ctx, data, nblocks, inc), which arrive in %rdi, %rsi, %rdx and %ecx.
//
// Note: INC is only meaningful at function entry.  Both functions copy it
// into a vector register first and then reuse %rcx / %cl as a scratch
// register (sigma pointer in the SSSE3 round loop, round counter in the
// AVX-512 round loop).

// struct blake2s_ctx *ctx
#define CTX		%rdi
// const u8 *data; advanced by 64 bytes per block processed
#define DATA		%rsi
// size_t nblocks; decremented once per block until it reaches zero
#define NBLOCKS		%rdx
// u32 inc; amount added to the counter t for each block
#define INC		%ecx

.text
//
// void blake2s_compress_ssse3(struct blake2s_ctx *ctx,
//			       const u8 *data, size_t nblocks, u32 inc);
//
// Only the first three fields of struct blake2s_ctx are used:
//	u32 h[8];	(inout)
//	u32 t[2];	(inout)
//	u32 f[2];	(in)
//
SYM_FUNC_START(blake2s_compress_ssse3)
	movdqu		(%rdi),%xmm0
	movdqu		0x10(%rdi),%xmm1
	movdqu		(CTX),%xmm0		// Load h[0..3]
	movdqu		16(CTX),%xmm1		// Load h[4..7]
	movdqa		.Lror16(%rip),%xmm12
	movdqa		.Lror8(%rip),%xmm13
	movdqu		0x20(%rdi),%xmm14
	movd		%ecx,%xmm15
	leaq		.Lsigma+0xa0(%rip),%r8
	jmp		.Lbeginofloop
	movdqu		32(CTX),%xmm14		// Load t and f
	movd		INC,%xmm15		// Load inc
	leaq		.Lsigma+160(%rip),%r8
	jmp		.Lssse3_mainloop

	.align		32
.Lbeginofloop:
	movdqa		%xmm0,%xmm10
	movdqa		%xmm1,%xmm11
	paddq		%xmm15,%xmm14
	movdqa		.Liv(%rip),%xmm2
.Lssse3_mainloop:
	// Main loop: each iteration processes one 64-byte block.
	movdqa		%xmm0,%xmm10		// Save h[0..3] and let v[0..3] = h[0..3]
	movdqa		%xmm1,%xmm11		// Save h[4..7] and let v[4..7] = h[4..7]
	paddq		%xmm15,%xmm14		// t += inc (64-bit addition)
	movdqa		.Liv(%rip),%xmm2	// v[8..11] = iv[0..3]
	movdqa		%xmm14,%xmm3
	pxor		.Liv+0x10(%rip),%xmm3
	pxor		.Liv+16(%rip),%xmm3	// v[12..15] = iv[4..7] ^ [t, f]
	leaq		.Lsigma(%rip),%rcx
.Lroundloop:

.Lssse3_roundloop:
	// Round loop: each iteration does 1 round (of 10 rounds total).
	movzbl		(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	movzbl		0x1(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	movzbl		0x2(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	movzbl		0x3(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	movd		(DATA,%rax,4),%xmm4
	movzbl		1(%rcx),%eax
	movd		(DATA,%rax,4),%xmm5
	movzbl		2(%rcx),%eax
	movd		(DATA,%rax,4),%xmm6
	movzbl		3(%rcx),%eax
	movd		(DATA,%rax,4),%xmm7
	punpckldq	%xmm5,%xmm4
	punpckldq	%xmm7,%xmm6
	punpcklqdq	%xmm6,%xmm4
@@ -88,17 +106,17 @@ SYM_FUNC_START(blake2s_compress_ssse3)
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$0xc,%xmm1
	pslld		$0x14,%xmm8
	psrld		$12,%xmm1
	pslld		$20,%xmm8
	por		%xmm8,%xmm1
	movzbl		0x4(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	movzbl		0x5(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	movzbl		0x6(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	movzbl		0x7(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	movzbl		4(%rcx),%eax
	movd		(DATA,%rax,4),%xmm5
	movzbl		5(%rcx),%eax
	movd		(DATA,%rax,4),%xmm6
	movzbl		6(%rcx),%eax
	movd		(DATA,%rax,4),%xmm7
	movzbl		7(%rcx),%eax
	movd		(DATA,%rax,4),%xmm4
	punpckldq	%xmm6,%xmm5
	punpckldq	%xmm4,%xmm7
	punpcklqdq	%xmm7,%xmm5
@@ -109,20 +127,20 @@ SYM_FUNC_START(blake2s_compress_ssse3)
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$0x7,%xmm1
	pslld		$0x19,%xmm8
	psrld		$7,%xmm1
	pslld		$25,%xmm8
	por		%xmm8,%xmm1
	pshufd		$0x93,%xmm0,%xmm0
	pshufd		$0x4e,%xmm3,%xmm3
	pshufd		$0x39,%xmm2,%xmm2
	movzbl		0x8(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	movzbl		0x9(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	movzbl		0xa(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	movzbl		0xb(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	movzbl		8(%rcx),%eax
	movd		(DATA,%rax,4),%xmm6
	movzbl		9(%rcx),%eax
	movd		(DATA,%rax,4),%xmm7
	movzbl		10(%rcx),%eax
	movd		(DATA,%rax,4),%xmm4
	movzbl		11(%rcx),%eax
	movd		(DATA,%rax,4),%xmm5
	punpckldq	%xmm7,%xmm6
	punpckldq	%xmm5,%xmm4
	punpcklqdq	%xmm4,%xmm6
@@ -133,17 +151,17 @@ SYM_FUNC_START(blake2s_compress_ssse3)
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$0xc,%xmm1
	pslld		$0x14,%xmm8
	psrld		$12,%xmm1
	pslld		$20,%xmm8
	por		%xmm8,%xmm1
	movzbl		0xc(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	movzbl		0xd(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	movzbl		0xe(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	movzbl		0xf(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	movzbl		12(%rcx),%eax
	movd		(DATA,%rax,4),%xmm7
	movzbl		13(%rcx),%eax
	movd		(DATA,%rax,4),%xmm4
	movzbl		14(%rcx),%eax
	movd		(DATA,%rax,4),%xmm5
	movzbl		15(%rcx),%eax
	movd		(DATA,%rax,4),%xmm6
	punpckldq	%xmm4,%xmm7
	punpckldq	%xmm6,%xmm5
	punpcklqdq	%xmm5,%xmm7
@@ -154,52 +172,68 @@ SYM_FUNC_START(blake2s_compress_ssse3)
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$0x7,%xmm1
	pslld		$0x19,%xmm8
	psrld		$7,%xmm1
	pslld		$25,%xmm8
	por		%xmm8,%xmm1
	pshufd		$0x39,%xmm0,%xmm0
	pshufd		$0x4e,%xmm3,%xmm3
	pshufd		$0x93,%xmm2,%xmm2
	addq		$0x10,%rcx
	addq		$16,%rcx
	cmpq		%r8,%rcx
	jnz		.Lroundloop
	jnz		.Lssse3_roundloop

	// Compute the new h: h[0..7] ^= v[0..7] ^ v[8..15]
	pxor		%xmm2,%xmm0
	pxor		%xmm3,%xmm1
	pxor		%xmm10,%xmm0
	pxor		%xmm11,%xmm1
	addq		$0x40,%rsi
	decq		%rdx
	jnz		.Lbeginofloop
	movdqu		%xmm0,(%rdi)
	movdqu		%xmm1,0x10(%rdi)
	movdqu		%xmm14,0x20(%rdi)
	addq		$64,DATA
	decq		NBLOCKS
	jnz		.Lssse3_mainloop

	movdqu		%xmm0,(CTX)		// Store new h[0..3]
	movdqu		%xmm1,16(CTX)		// Store new h[4..7]
	movdqu		%xmm14,32(CTX)		// Store new t and f
	RET
SYM_FUNC_END(blake2s_compress_ssse3)

//
// void blake2s_compress_avx512(struct blake2s_ctx *ctx,
//				const u8 *data, size_t nblocks, u32 inc);
//
// Only the first three fields of struct blake2s_ctx are used:
//	u32 h[8];	(inout)
//	u32 t[2];	(inout)
//	u32 f[2];	(in)
//
SYM_FUNC_START(blake2s_compress_avx512)
	vmovdqu		(%rdi),%xmm0
	vmovdqu		0x10(%rdi),%xmm1
	vmovdqu		0x20(%rdi),%xmm4
	vmovd		%ecx,%xmm5
	vmovdqa		.Liv(%rip),%xmm14
	vmovdqa		.Liv+16(%rip),%xmm15
	jmp		.Lblake2s_compress_avx512_mainloop
	vmovdqu		(CTX),%xmm0		// Load h[0..3]
	vmovdqu		16(CTX),%xmm1		// Load h[4..7]
	vmovdqu		32(CTX),%xmm4		// Load t and f
	vmovd		INC,%xmm5		// Load inc
	vmovdqa		.Liv(%rip),%xmm14	// Load iv[0..3]
	vmovdqa		.Liv+16(%rip),%xmm15	// Load iv[4..7]
	jmp		.Lavx512_mainloop

	.align		32
.Lblake2s_compress_avx512_mainloop:
	vmovdqa		%xmm0,%xmm10
	vmovdqa		%xmm1,%xmm11
	vpaddq		%xmm5,%xmm4,%xmm4
	vmovdqa		%xmm14,%xmm2
	vpxor		%xmm15,%xmm4,%xmm3
	vmovdqu		(%rsi),%ymm6
	vmovdqu		0x20(%rsi),%ymm7
	addq		$0x40,%rsi
.Lavx512_mainloop:
	// Main loop: each iteration processes one 64-byte block.
	vmovdqa		%xmm0,%xmm10		// Save h[0..3] and let v[0..3] = h[0..3]
	vmovdqa		%xmm1,%xmm11		// Save h[4..7] and let v[4..7] = h[4..7]
	vpaddq		%xmm5,%xmm4,%xmm4	// t += inc (64-bit addition)
	vmovdqa		%xmm14,%xmm2		// v[8..11] = iv[0..3]
	vpxor		%xmm15,%xmm4,%xmm3	// v[12..15] = iv[4..7] ^ [t, f]
	vmovdqu		(DATA),%ymm6		// Load first 8 data words
	vmovdqu		32(DATA),%ymm7		// Load second 8 data words
	addq		$64,DATA
	leaq		.Lsigma2(%rip),%rax
	movb		$0xa,%cl
.Lblake2s_compress_avx512_roundloop:
	movb		$10,%cl			// Set num rounds remaining

.Lavx512_roundloop:
	// Round loop: each iteration does 1 round (of 10 rounds total).
	vpmovzxbd	(%rax),%ymm8
	vpmovzxbd	0x8(%rax),%ymm9
	addq		$0x10,%rax
	vpmovzxbd	8(%rax),%ymm9
	addq		$16,%rax
	vpermi2d	%ymm7,%ymm6,%ymm8
	vpermi2d	%ymm7,%ymm6,%ymm9
	vmovdqa		%ymm8,%ymm6
@@ -207,50 +241,53 @@ SYM_FUNC_START(blake2s_compress_avx512)
	vpaddd		%xmm8,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x10,%xmm3,%xmm3
	vprord		$16,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0xc,%xmm1,%xmm1
	vextracti128	$0x1,%ymm8,%xmm8
	vprord		$12,%xmm1,%xmm1
	vextracti128	$1,%ymm8,%xmm8
	vpaddd		%xmm8,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x8,%xmm3,%xmm3
	vprord		$8,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0x7,%xmm1,%xmm1
	vprord		$7,%xmm1,%xmm1
	vpshufd		$0x93,%xmm0,%xmm0
	vpshufd		$0x4e,%xmm3,%xmm3
	vpshufd		$0x39,%xmm2,%xmm2
	vpaddd		%xmm9,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x10,%xmm3,%xmm3
	vprord		$16,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0xc,%xmm1,%xmm1
	vextracti128	$0x1,%ymm9,%xmm9
	vprord		$12,%xmm1,%xmm1
	vextracti128	$1,%ymm9,%xmm9
	vpaddd		%xmm9,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x8,%xmm3,%xmm3
	vprord		$8,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0x7,%xmm1,%xmm1
	vprord		$7,%xmm1,%xmm1
	vpshufd		$0x39,%xmm0,%xmm0
	vpshufd		$0x4e,%xmm3,%xmm3
	vpshufd		$0x93,%xmm2,%xmm2
	decb		%cl
	jne		.Lblake2s_compress_avx512_roundloop
	jne		.Lavx512_roundloop

	// Compute the new h: h[0..7] ^= v[0..7] ^ v[8..15]
	vpxor		%xmm10,%xmm0,%xmm0
	vpxor		%xmm11,%xmm1,%xmm1
	vpxor		%xmm2,%xmm0,%xmm0
	vpxor		%xmm3,%xmm1,%xmm1
	decq		%rdx
	jne		.Lblake2s_compress_avx512_mainloop
	vmovdqu		%xmm0,(%rdi)
	vmovdqu		%xmm1,0x10(%rdi)
	vmovdqu		%xmm4,0x20(%rdi)
	decq		NBLOCKS
	jne		.Lavx512_mainloop

	vmovdqu		%xmm0,(CTX)		// Store new h[0..3]
	vmovdqu		%xmm1,16(CTX)		// Store new h[4..7]
	vmovdqu		%xmm4,32(CTX)		// Store new t and f
	vzeroupper
	RET
SYM_FUNC_END(blake2s_compress_avx512)