Commit 570ef50a authored by Eric Biggers, committed by Herbert Xu
Browse files

crypto: x86/aes-xts - optimize _compute_first_set_of_tweaks for AVX-512



Optimize the AVX-512 version of _compute_first_set_of_tweaks by using
vectorized shifts to compute the first vector of tweak blocks, and by
using byte-aligned shifts when multiplying by x^8.

AES-XTS performance on AMD Ryzen 9 9950X (Zen 5) improves by about 2%
for 4096-byte messages or 6% for 512-byte messages.  AES-XTS performance
on Intel Sapphire Rapids improves by about 1% for 4096-byte messages or
3% for 512-byte messages.  Code size decreases by 75 bytes which
outweighs the increase in rodata size of 16 bytes.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
parent bc23fe6d
Loading
Loading
Loading
Loading
+62 −28
Original line number Diff line number Diff line
@@ -100,6 +100,17 @@
	// exists when there's a carry out of the low 64 bits of the tweak.
	.quad	0x87, 1

	// These are the shift amounts that are needed when multiplying by [x^0,
	// x^1, x^2, x^3] to compute the first vector of tweaks when VL=64.
	//
	// Each table holds eight byte-sized counts.  They are loaded with
	// 'vpmovzxbq', which zero-extends every byte to its own 64-bit element,
	// giving one shift count per qword of the 512-bit vector.  The counts
	// come in pairs because both qwords of a 128-bit tweak block are
	// multiplied by the same power of x.
	//
	// For a left shift by k, the matching right shift is 64 - k: it extracts
	// exactly the k bits that the left shift pushes out of that qword.
	//
	// The right shifts by 64 are expected to zeroize the destination.
	// 'vpsrlvq' is indeed defined to do that; i.e. it doesn't truncate the
	// amount to 64 & 63 = 0 like the 'shr' scalar shift instruction would.
.Lrshift_amounts:
	.byte	64, 64, 63, 63, 62, 62, 61, 61
.Llshift_amounts:
	.byte	0, 0, 1, 1, 2, 2, 3, 3

	// This table contains constants for vpshufb and vpblendvb, used to
	// handle variable byte shifts and blending during ciphertext stealing
	// on CPUs that don't support AVX512-style masking.
@@ -294,52 +305,75 @@
// Given the first XTS tweak at (TWEAK), compute the first set of tweaks and
// store them in the vector registers TWEAK0-TWEAK3.  Clobbers V0-V5.
//
// NOTE(review): as rendered here, this body interleaves the pre-change and
// post-change lines of the diff without +/- markers.  Some loads therefore
// appear more than once and the .if/.elseif/.else nesting is not consistent
// (there is a second .else for the outermost .if).  Read the VL == 16 path,
// the VL == 32 path and the final .else path (the one using the VL=64 shift
// tables) against the commit itself before relying on this listing.
//
// General technique used by the vectorized paths, for multiplying each
// 128-bit tweak block by x^k:
//   1. Right-shift each qword by 64 - k to get the k bits that a left shift
//      by k pushes out of that qword.
//   2. Carryless-multiply the bits pushed out of the high qword by the low
//      qword of GF_POLY to get the reduction term.
//   3. Move the bits pushed out of the low qword up into the high qword.
//   4. Left-shift each qword by k and XOR in the results of steps 2 and 3.
.macro	_compute_first_set_of_tweaks
	vmovdqu		(TWEAK), TWEAK0_XMM
	_vbroadcast128	.Lgf_poly(%rip), GF_POLY
.if VL == 16
	// With VL=16, multiplying by x serially is fastest.
	vmovdqu		(TWEAK), TWEAK0_XMM
	vmovdqu		.Lgf_poly(%rip), GF_POLY
	// Each tweak is derived from the previous one.  %xmm0 is passed as the
	// middle operand each time (presumably a scratch register -- confirm
	// against the _next_tweak definition, which is not visible here).
	_next_tweak	TWEAK0, %xmm0, TWEAK1
	_next_tweak	TWEAK1, %xmm0, TWEAK2
	_next_tweak	TWEAK2, %xmm0, TWEAK3
.else
.if VL == 32
	// Compute the second block of TWEAK0.
.elseif VL == 32
	vmovdqu		(TWEAK), TWEAK0_XMM
	vbroadcasti128	.Lgf_poly(%rip), GF_POLY

	// Compute the first vector of tweaks.
	// The second block is computed into %xmm1 and then inserted as the
	// upper 128 bits of TWEAK0.
	_next_tweak	TWEAK0_XMM, %xmm0, %xmm1
	vinserti128	$1, %xmm1, TWEAK0, TWEAK0
.elseif VL == 64
	// Compute the remaining blocks of TWEAK0.
	// Blocks 1-3 are produced serially in %xmm1-%xmm3 and then inserted
	// into 128-bit lanes 1-3 of TWEAK0.
	_next_tweak	TWEAK0_XMM, %xmm0, %xmm1
	_next_tweak	%xmm1, %xmm0, %xmm2
	_next_tweak	%xmm2, %xmm0, %xmm3
	vinserti32x4	$1, %xmm1, TWEAK0, TWEAK0
	vinserti32x4	$2, %xmm2, TWEAK0, TWEAK0
	vinserti32x4	$3, %xmm3, TWEAK0, TWEAK0
.endif
	// Compute TWEAK[1-3] from TWEAK0.
	// A vector holds VL/16 blocks, so vector i is TWEAK0 multiplied by
	// x^(i*VL/16); the shift counts below are written in that generic form.
	vpsrlq		$64 - 1*VL/16, TWEAK0, V0
	vpsrlq		$64 - 2*VL/16, TWEAK0, V2
	vpsrlq		$64 - 3*VL/16, TWEAK0, V4

	// Compute the next three vectors of tweaks:
	//	TWEAK1 = TWEAK0 * [x^2, x^2]
	//	TWEAK2 = TWEAK0 * [x^4, x^4]
	//	TWEAK3 = TWEAK0 * [x^6, x^6]
	// V0/V2/V4 = the bits that the matching left shift pushes out of each
	// qword of TWEAK0.
	vpsrlq		$64 - 2, TWEAK0, V0
	vpsrlq		$64 - 4, TWEAK0, V2
	vpsrlq		$64 - 6, TWEAK0, V4
	// Immediate $0x01 selects the high qword of V0/V2/V4 and the low qword
	// of GF_POLY, so V1/V3/V5 = reduction term for the bits carried out of
	// the top of each 128-bit block.
	vpclmulqdq	$0x01, GF_POLY, V0, V1
	vpclmulqdq	$0x01, GF_POLY, V2, V3
	vpclmulqdq	$0x01, GF_POLY, V4, V5
	// Shift each 128-bit lane left by 8 bytes: the bits carried out of the
	// low qword now sit in the high qword, and the low qword becomes zero.
	vpslldq		$8, V0, V0
	vpslldq		$8, V2, V2
	vpslldq		$8, V4, V4
	vpsllq		$1*VL/16, TWEAK0, TWEAK1
	vpsllq		$2*VL/16, TWEAK0, TWEAK2
	vpsllq		$3*VL/16, TWEAK0, TWEAK3
.if USE_AVX512
	// $0x96 is the truth table of a three-input XOR, so each instruction
	// XORs both correction terms into the shifted tweak at once.
	vpternlogd	$0x96, V0, V1, TWEAK1
	vpternlogd	$0x96, V2, V3, TWEAK2
	vpternlogd	$0x96, V4, V5, TWEAK3
.else
	// Shift each qword left by the power of x, then XOR in the carry from
	// the low qword (V0/V2/V4) and the reduction term (V1/V3/V5).
	vpsllq		$2, TWEAK0, TWEAK1
	vpsllq		$4, TWEAK0, TWEAK2
	vpsllq		$6, TWEAK0, TWEAK3
	vpxor		V0, TWEAK1, TWEAK1
	vpxor		V2, TWEAK2, TWEAK2
	vpxor		V4, TWEAK3, TWEAK3
	vpxor		V1, TWEAK1, TWEAK1
	vpxor		V3, TWEAK2, TWEAK2
	vpxor		V5, TWEAK3, TWEAK3
.endif
.else
	// Replicate the 128-bit tweak and the reduction constant into all four
	// 128-bit lanes.
	vbroadcasti32x4	(TWEAK), TWEAK0
	vbroadcasti32x4	.Lgf_poly(%rip), GF_POLY

	// Compute the first vector of tweaks:
	//	TWEAK0 = broadcast128(TWEAK) * [x^0, x^1, x^2, x^3]
	// Zero-extend the eight byte-sized right-shift counts to one count per
	// qword, then do a per-qword variable shift.  The count of 64 used for
	// lane 0 yields zero, since multiplying by x^0 carries nothing out.
	vpmovzxbq	.Lrshift_amounts(%rip), V4
	vpsrlvq		V4, TWEAK0, V0
	// V1 = reduction term from the bits carried out of each high qword.
	vpclmulqdq	$0x01, GF_POLY, V0, V1
	// V4 is reused for the left-shift counts; the right-shift counts are no
	// longer needed once V0 has been computed.
	vpmovzxbq	.Llshift_amounts(%rip), V4
	vpslldq		$8, V0, V0
	vpsllvq		V4, TWEAK0, TWEAK0
	// TWEAK0 ^= V0 ^ V1 ($0x96 = three-input XOR).
	vpternlogd	$0x96, V0, V1, TWEAK0

	// Compute the next three vectors of tweaks:
	//	TWEAK1 = TWEAK0 * [x^4, x^4, x^4, x^4]
	//	TWEAK2 = TWEAK0 * [x^8, x^8, x^8, x^8]
	//	TWEAK3 = TWEAK0 * [x^12, x^12, x^12, x^12]
	// x^8 only needs byte-aligned shifts, so optimize accordingly.
	// For x^8, vpsrldq shifts each whole 128-bit lane right by 7 bytes, which
	// leaves the top 8 bits of the block in the high qword of V2, exactly
	// where vpclmulqdq with immediate $0x01 reads its first operand.
	vpsrlq		$64 - 4, TWEAK0, V0
	vpsrldq		$(64 - 8) / 8, TWEAK0, V2
	vpsrlq		$64 - 12, TWEAK0, V4
	vpclmulqdq	$0x01, GF_POLY, V0, V1
	vpclmulqdq	$0x01, GF_POLY, V2, V3
	vpclmulqdq	$0x01, GF_POLY, V4, V5
	// No vpslldq is needed for V2: the one-byte lane shift that forms TWEAK2
	// below already carries bits across the qword boundary.
	vpslldq		$8, V0, V0
	vpslldq		$8, V4, V4
	vpsllq		$4, TWEAK0, TWEAK1
	vpslldq		$8 / 8, TWEAK0, TWEAK2
	vpsllq		$12, TWEAK0, TWEAK3
	// TWEAK1 and TWEAK3 take two correction terms (three-input XOR); TWEAK2
	// only needs the reduction term V3.
	vpternlogd	$0x96, V0, V1, TWEAK1
	vpxord		V3, TWEAK2, TWEAK2
	vpternlogd	$0x96, V4, V5, TWEAK3
.endif
.endm