Commit 543ea178 authored by Eric Biggers's avatar Eric Biggers Committed by Herbert Xu
Browse files

crypto: x86/aes-xts - optimize size of instructions operating on lengths



x86_64 has the "interesting" property that the instruction size is
generally a bit shorter for instructions that operate on the 32-bit (or
less) part of registers, or registers that are in the original set of 8.

This patch adjusts the AES-XTS code to take advantage of that property
by changing the LEN parameter from size_t to unsigned int (which is all
that's needed and is what the non-AVX implementation uses) and using the
%eax register for KEYLEN.

This decreases the size of aes-xts-avx-x86_64.o by 1.2%.

Note that changing the kmovq to kmovd was going to be needed anyway to
make the AVX10/256 code really work on CPUs that don't support 512-bit
vectors (since the AVX10 spec says that 64-bit opmask instructions will
only be supported on processors that support 512-bit vectors).

Signed-off-by: default avatarEric Biggers <ebiggers@google.com>
Signed-off-by: default avatarHerbert Xu <herbert@gondor.apana.org.au>
parent e619723a
Loading
Loading
Loading
Loading
+21 −19
Original line number Diff line number Diff line
@@ -85,14 +85,16 @@
				// advanced to point to 7th-from-last round key
.set	SRC,		%rsi	// Pointer to next source data
.set	DST,		%rdx	// Pointer to next destination data
.set	LEN,		%rcx	// Remaining length in bytes
.set	LEN,		%ecx	// Remaining length in bytes
.set	LEN8,		%cl
.set	LEN64,		%rcx
.set	TWEAK,		%r8	// Pointer to next tweak

// %r9 holds the AES key length in bytes.
.set	KEYLEN,		%r9d
.set	KEYLEN64,	%r9
// %rax holds the AES key length in bytes.
.set	KEYLEN,		%eax
.set	KEYLEN64,	%rax

// %rax and %r10-r11 are available as temporaries.
// %r9-r11 are available as temporaries.

.macro	_define_Vi	i
.if VL == 16
@@ -565,9 +567,9 @@
	// subtracting 16 from LEN.  This is needed because ciphertext stealing
	// decryption uses the last two tweaks in reverse order.  We'll handle
	// the last full block and the partial block specially at the end.
	lea		-16(LEN), %rax
	test		$15, LEN
	cmovnz		%rax, LEN
	lea		-16(LEN), %eax
	test		$15, LEN8
	cmovnz		%eax, LEN
.endif

	// Load the AES key length: 16 (AES-128), 24 (AES-192), or 32 (AES-256).
@@ -650,7 +652,7 @@
	// Check for the uncommon case where the data length isn't a multiple of
	// 4*VL.  Handle it out-of-line in order to optimize for the common
	// case.  In the common case, just fall through to the ret.
	test		$4*VL-1, LEN
	test		$4*VL-1, LEN8
	jnz		.Lhandle_remainder\@
.Ldone\@:
	// Store the next tweak back to *TWEAK to support continuation calls.
@@ -718,9 +720,9 @@

.if USE_AVX10
	// Create a mask that has the first LEN bits set.
	mov		$-1, %rax
	bzhi		LEN, %rax, %rax
	kmovq		%rax, %k1
	mov		$-1, %r9d
	bzhi		LEN, %r9d, %r9d
	kmovd		%r9d, %k1

	// Swap the first LEN bytes of the en/decryption of the last full block
	// with the partial block.  Note that to support in-place en/decryption,
@@ -730,23 +732,23 @@
	vmovdqu8	16(SRC), %xmm0{%k1}
	vmovdqu8	%xmm1, 16(DST){%k1}
.else
	lea		.Lcts_permute_table(%rip), %rax
	lea		.Lcts_permute_table(%rip), %r9

	// Load the src partial block, left-aligned.  Note that to support
	// in-place en/decryption, this must happen before the store to the dst
	// partial block.
	vmovdqu		(SRC, LEN, 1), %xmm1
	vmovdqu		(SRC, LEN64, 1), %xmm1

	// Shift the first LEN bytes of the en/decryption of the last full block
	// to the end of a register, then store it to DST+LEN.  This stores the
	// dst partial block.  It also writes to the second part of the dst last
	// full block, but that part is overwritten later.
	vpshufb		(%rax, LEN, 1), %xmm0, %xmm2
	vmovdqu		%xmm2, (DST, LEN, 1)
	vpshufb		(%r9, LEN64, 1), %xmm0, %xmm2
	vmovdqu		%xmm2, (DST, LEN64, 1)

	// Make xmm3 contain [16-LEN,16-LEN+1,...,14,15,0x80,0x80,...].
	sub		LEN, %rax
	vmovdqu		32(%rax), %xmm3
	sub		LEN64, %r9
	vmovdqu		32(%r9), %xmm3

	// Shift the src partial block to the beginning of its register.
	vpshufb		%xmm3, %xmm1, %xmm1
@@ -795,7 +797,7 @@ SYM_FUNC_END(aes_xts_encrypt_iv)
// instantiated from the above macro.  They all have the following prototype:
//
// void (*xts_asm_func)(const struct crypto_aes_ctx *key,
//			const u8 *src, u8 *dst, size_t len,
//			const u8 *src, u8 *dst, unsigned int len,
//			u8 tweak[AES_BLOCK_SIZE]);
//
// |key| is the data key.  |tweak| contains the next tweak; the encryption of
+9 −9
Original line number Diff line number Diff line
@@ -899,7 +899,7 @@ static int xts_setkey_aesni(struct crypto_skcipher *tfm, const u8 *key,
typedef void (*xts_encrypt_iv_func)(const struct crypto_aes_ctx *tweak_key,
				    u8 iv[AES_BLOCK_SIZE]);
typedef void (*xts_crypt_func)(const struct crypto_aes_ctx *key,
			       const u8 *src, u8 *dst, size_t len,
			       const u8 *src, u8 *dst, unsigned int len,
			       u8 tweak[AES_BLOCK_SIZE]);

/* This handles cases where the source and/or destination span pages. */
@@ -1021,14 +1021,14 @@ static void aesni_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key,
}

static void aesni_xts_encrypt(const struct crypto_aes_ctx *key,
			      const u8 *src, u8 *dst, size_t len,
			      const u8 *src, u8 *dst, unsigned int len,
			      u8 tweak[AES_BLOCK_SIZE])
{
	aesni_xts_enc(key, dst, src, len, tweak);
}

static void aesni_xts_decrypt(const struct crypto_aes_ctx *key,
			      const u8 *src, u8 *dst, size_t len,
			      const u8 *src, u8 *dst, unsigned int len,
			      u8 tweak[AES_BLOCK_SIZE])
{
	aesni_xts_dec(key, dst, src, len, tweak);
@@ -1185,12 +1185,12 @@ asmlinkage void aes_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key,

#define DEFINE_XTS_ALG(suffix, driver_name, priority)			       \
									       \
asmlinkage void aes_xts_encrypt_##suffix(const struct crypto_aes_ctx *key,     \
					 const u8 *src, u8 *dst, size_t len,   \
					 u8 tweak[AES_BLOCK_SIZE]);	       \
asmlinkage void aes_xts_decrypt_##suffix(const struct crypto_aes_ctx *key,     \
					 const u8 *src, u8 *dst, size_t len,   \
					 u8 tweak[AES_BLOCK_SIZE]);	       \
asmlinkage void								       \
aes_xts_encrypt_##suffix(const struct crypto_aes_ctx *key, const u8 *src,      \
			 u8 *dst, unsigned int len, u8 tweak[AES_BLOCK_SIZE]); \
asmlinkage void								       \
aes_xts_decrypt_##suffix(const struct crypto_aes_ctx *key, const u8 *src,      \
			 u8 *dst, unsigned int len, u8 tweak[AES_BLOCK_SIZE]); \
									       \
static int xts_encrypt_##suffix(struct skcipher_request *req)		       \
{									       \