crypto: x86/aes - drop the avx10_256 AES-XTS and AES-CTR code (7d14fbc5) · Commits · git / linux-net

arch/x86/crypto/aes-ctr-avx-x86_64.S

+14 −33

Original line number	Diff line number	Diff line
		@@ -48,8 +48,7 @@
		// using the following sets of CPU features:
		// - AES-NI && AVX
		// - VAES && AVX2
		// - VAES && (AVX10/256 || (AVX512BW && AVX512VL)) && BMI2
		// - VAES && (AVX10/512 || (AVX512BW && AVX512VL)) && BMI2
		// - VAES && AVX512BW && AVX512VL && BMI2
		//
		// See the function definitions at the bottom of the file for more information.

		@@ -76,7 +75,6 @@
		.text

		// Move a vector between memory and a register.
		// The register operand must be in the first 16 vector registers.
		.macro _vmovdqu src, dst
		.if VL < 64
		vmovdqu \src, \dst
		@@ -86,7 +84,6 @@
		.endm

		// Move a vector between registers.
		// The registers must be in the first 16 vector registers.
		.macro _vmovdqa src, dst
		.if VL < 64
		vmovdqa \src, \dst
		@@ -96,7 +93,7 @@
		.endm

		// Broadcast a 128-bit value from memory to all 128-bit lanes of a vector
		// register. The register operand must be in the first 16 vector registers.
		// register.
		.macro _vbroadcast128 src, dst
		.if VL == 16
		vmovdqu \src, \dst
		@@ -108,7 +105,6 @@
		.endm

		// XOR two vectors together.
		// Any register operands must be in the first 16 vector registers.
		.macro _vpxor src1, src2, dst
		.if VL < 64
		vpxor \src1, \src2, \dst
		@@ -199,8 +195,8 @@
		// XOR each with the zero-th round key. Also update LE_CTR if !\final.
		.macro _prepare_2_ctr_vecs is_xctr, i0, i1, final=0
		.if \is_xctr
		.if USE_AVX10
		_vmovdqa LE_CTR, AESDATA\i0
		.if USE_AVX512
		vmovdqa64 LE_CTR, AESDATA\i0
		vpternlogd $0x96, XCTR_IV, RNDKEY0, AESDATA\i0
		.else
		vpxor XCTR_IV, LE_CTR, AESDATA\i0
		@@ -208,7 +204,7 @@
		.endif
		vpaddq LE_CTR_INC1, LE_CTR, AESDATA\i1

		.if USE_AVX10
		.if USE_AVX512
		vpternlogd $0x96, XCTR_IV, RNDKEY0, AESDATA\i1
		.else
		vpxor XCTR_IV, AESDATA\i1, AESDATA\i1
		@@ -481,18 +477,12 @@
		.Lxor_tail_partial_vec_0\@:
		// XOR the remaining 1 <= LEN < VL bytes. It's easy if masked
		// loads/stores are available; otherwise it's a bit harder...
		.if USE_AVX10
		.if VL <= 32
		mov $-1, %eax
		bzhi LEN, %eax, %eax
		kmovd %eax, %k1
		.else
		.if USE_AVX512
		mov $-1, %rax
		bzhi LEN64, %rax, %rax
		kmovq %rax, %k1
		.endif
		vmovdqu8 (SRC), AESDATA1{%k1}{z}
		_vpxor AESDATA1, AESDATA0, AESDATA0
		vpxord AESDATA1, AESDATA0, AESDATA0
		vmovdqu8 AESDATA0, (DST){%k1}
		.else
		.if VL == 32
		@@ -554,7 +544,7 @@
		// eliminates carries. |ctr| is the per-message block counter starting at 1.

		.set VL, 16
		.set USE_AVX10, 0
		.set USE_AVX512, 0
		SYM_TYPED_FUNC_START(aes_ctr64_crypt_aesni_avx)
		_aes_ctr_crypt 0
		SYM_FUNC_END(aes_ctr64_crypt_aesni_avx)
		@@ -564,7 +554,7 @@ SYM_FUNC_END(aes_xctr_crypt_aesni_avx)

		#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ)
		.set VL, 32
		.set USE_AVX10, 0
		.set USE_AVX512, 0
		SYM_TYPED_FUNC_START(aes_ctr64_crypt_vaes_avx2)
		_aes_ctr_crypt 0
		SYM_FUNC_END(aes_ctr64_crypt_vaes_avx2)
		@@ -572,21 +562,12 @@ SYM_TYPED_FUNC_START(aes_xctr_crypt_vaes_avx2)
		_aes_ctr_crypt 1
		SYM_FUNC_END(aes_xctr_crypt_vaes_avx2)

		.set VL, 32
		.set USE_AVX10, 1
		SYM_TYPED_FUNC_START(aes_ctr64_crypt_vaes_avx10_256)
		_aes_ctr_crypt 0
		SYM_FUNC_END(aes_ctr64_crypt_vaes_avx10_256)
		SYM_TYPED_FUNC_START(aes_xctr_crypt_vaes_avx10_256)
		_aes_ctr_crypt 1
		SYM_FUNC_END(aes_xctr_crypt_vaes_avx10_256)

		.set VL, 64
		.set USE_AVX10, 1
		SYM_TYPED_FUNC_START(aes_ctr64_crypt_vaes_avx10_512)
		.set USE_AVX512, 1
		SYM_TYPED_FUNC_START(aes_ctr64_crypt_vaes_avx512)
		_aes_ctr_crypt 0
		SYM_FUNC_END(aes_ctr64_crypt_vaes_avx10_512)
		SYM_TYPED_FUNC_START(aes_xctr_crypt_vaes_avx10_512)
		SYM_FUNC_END(aes_ctr64_crypt_vaes_avx512)
		SYM_TYPED_FUNC_START(aes_xctr_crypt_vaes_avx512)
		_aes_ctr_crypt 1
		SYM_FUNC_END(aes_xctr_crypt_vaes_avx10_512)
		SYM_FUNC_END(aes_xctr_crypt_vaes_avx512)
		#endif // CONFIG_AS_VAES && CONFIG_AS_VPCLMULQDQ

arch/x86/crypto/aes-xts-avx-x86_64.S

+50 −68

Original line number	Diff line number	Diff line
		@@ -52,32 +52,25 @@
		* different code, it uses a macro to generate several implementations that
		* share similar source code but are targeted at different CPUs, listed below:
		*
		* AES-NI + AVX
		* AES-NI && AVX
		* - 128-bit vectors (1 AES block per vector)
		* - VEX-coded instructions
		* - xmm0-xmm15
		* - This is for older CPUs that lack VAES but do have AVX.
		*
		* VAES + VPCLMULQDQ + AVX2
		* VAES && VPCLMULQDQ && AVX2
		* - 256-bit vectors (2 AES blocks per vector)
		* - VEX-coded instructions
		* - ymm0-ymm15
		* - This is for CPUs that have VAES but lack AVX512 or AVX10,
		* e.g. Intel's Alder Lake and AMD's Zen 3.
		* - This is for CPUs that have VAES but either lack AVX512 (e.g. Intel's
		* Alder Lake and AMD's Zen 3) or downclock too eagerly when using zmm
		* registers (e.g. Intel's Ice Lake).
		*
		* VAES + VPCLMULQDQ + AVX10/256 + BMI2
		* - 256-bit vectors (2 AES blocks per vector)
		* VAES && VPCLMULQDQ && AVX512BW && AVX512VL && BMI2
		* - 512-bit vectors (4 AES blocks per vector)
		* - EVEX-coded instructions
		* - ymm0-ymm31
		* - This is for CPUs that have AVX512 but where using zmm registers causes
		* downclocking, and for CPUs that have AVX10/256 but not AVX10/512.
		* - By "AVX10/256" we really mean (AVX512BW + AVX512VL) || AVX10/256.
		* To avoid confusion with 512-bit, we just write AVX10/256.
		*
		* VAES + VPCLMULQDQ + AVX10/512 + BMI2
		* - Same as the previous one, but upgrades to 512-bit vectors
		* (4 AES blocks per vector) in zmm0-zmm31.
		* - This is for CPUs that have good AVX512 or AVX10/512 support.
		* - zmm0-zmm31
		* - This is for CPUs that have good AVX512 support.
		*
		* This file doesn't have an implementation for AES-NI alone (without AVX), as
		* the lack of VEX would make all the assembly code different.
		@@ -109,7 +102,7 @@

		// This table contains constants for vpshufb and vpblendvb, used to
		// handle variable byte shifts and blending during ciphertext stealing
		// on CPUs that don't support AVX10-style masking.
		// on CPUs that don't support AVX512-style masking.
		.Lcts_permute_table:
		.byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
		.byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
		@@ -138,7 +131,7 @@
		.irp i, 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
		_define_Vi \i
		.endr
		.if USE_AVX10
		.if USE_AVX512
		.irp i, 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
		_define_Vi \i
		.endr
		@@ -193,7 +186,7 @@
		// keys to the end of this register range. I.e., AES-128 uses
		// KEY5-KEY14, AES-192 uses KEY3-KEY14, and AES-256 uses KEY1-KEY14.
		// (All also use KEY0 for the XOR-only "round" at the beginning.)
		.if USE_AVX10
		.if USE_AVX512
		.set KEY1_XMM, %xmm16
		.set KEY1, V16
		.set KEY2_XMM, %xmm17
		@@ -227,7 +220,6 @@
		.endm

		// Move a vector between memory and a register.
		// The register operand must be in the first 16 vector registers.
		.macro _vmovdqu src, dst
		.if VL < 64
		vmovdqu \src, \dst
		@@ -238,9 +230,9 @@

		// Broadcast a 128-bit value into a vector.
		.macro _vbroadcast128 src, dst
		.if VL == 16 && !USE_AVX10
		.if VL == 16
		vmovdqu \src, \dst
		.elseif VL == 32 && !USE_AVX10
		.elseif VL == 32
		vbroadcasti128 \src, \dst
		.else
		vbroadcasti32x4 \src, \dst
		@@ -248,7 +240,6 @@
		.endm

		// XOR two vectors together.
		// Any register operands must be in the first 16 vector registers.
		.macro _vpxor src1, src2, dst
		.if VL < 64
		vpxor \src1, \src2, \dst
		@@ -259,7 +250,7 @@

		// XOR three vectors together.
		.macro _xor3 src1, src2, src3_and_dst
		.if USE_AVX10
		.if USE_AVX512
		// vpternlogd with immediate 0x96 is a three-argument XOR.
		vpternlogd $0x96, \src1, \src2, \src3_and_dst
		.else
		@@ -274,7 +265,7 @@
		vpshufd $0x13, \src, \tmp
		vpaddq \src, \src, \dst
		vpsrad $31, \tmp, \tmp
		.if USE_AVX10
		.if USE_AVX512
		vpternlogd $0x78, GF_POLY_XMM, \tmp, \dst
		.else
		vpand GF_POLY_XMM, \tmp, \tmp
		@@ -337,7 +328,7 @@
		vpsllq $1*VL/16, TWEAK0, TWEAK1
		vpsllq $2*VL/16, TWEAK0, TWEAK2
		vpsllq $3*VL/16, TWEAK0, TWEAK3
		.if USE_AVX10
		.if USE_AVX512
		vpternlogd $0x96, V0, V1, TWEAK1
		vpternlogd $0x96, V2, V3, TWEAK2
		vpternlogd $0x96, V4, V5, TWEAK3
		@@ -474,26 +465,26 @@
		lea OFFS-16(KEY, KEYLEN64, 4), KEY

		// If all 32 SIMD registers are available, cache all the round keys.
		.if USE_AVX10
		.if USE_AVX512
		cmp $24, KEYLEN
		jl .Laes128\@
		je .Laes192\@
		_vbroadcast128 -6*16(KEY), KEY1
		_vbroadcast128 -5*16(KEY), KEY2
		vbroadcasti32x4 -6*16(KEY), KEY1
		vbroadcasti32x4 -5*16(KEY), KEY2
		.Laes192\@:
		_vbroadcast128 -4*16(KEY), KEY3
		_vbroadcast128 -3*16(KEY), KEY4
		vbroadcasti32x4 -4*16(KEY), KEY3
		vbroadcasti32x4 -3*16(KEY), KEY4
		.Laes128\@:
		_vbroadcast128 -2*16(KEY), KEY5
		_vbroadcast128 -1*16(KEY), KEY6
		_vbroadcast128 0*16(KEY), KEY7
		_vbroadcast128 1*16(KEY), KEY8
		_vbroadcast128 2*16(KEY), KEY9
		_vbroadcast128 3*16(KEY), KEY10
		_vbroadcast128 4*16(KEY), KEY11
		_vbroadcast128 5*16(KEY), KEY12
		_vbroadcast128 6*16(KEY), KEY13
		_vbroadcast128 7*16(KEY), KEY14
		vbroadcasti32x4 -2*16(KEY), KEY5
		vbroadcasti32x4 -1*16(KEY), KEY6
		vbroadcasti32x4 0*16(KEY), KEY7
		vbroadcasti32x4 1*16(KEY), KEY8
		vbroadcasti32x4 2*16(KEY), KEY9
		vbroadcasti32x4 3*16(KEY), KEY10
		vbroadcasti32x4 4*16(KEY), KEY11
		vbroadcasti32x4 5*16(KEY), KEY12
		vbroadcasti32x4 6*16(KEY), KEY13
		vbroadcasti32x4 7*16(KEY), KEY14
		.endif
		.endm

		@@ -521,7 +512,7 @@
		// using the same key for all block(s). The round key is loaded from the
		// appropriate register or memory location for round \i. May clobber \tmp.
		.macro _vaes_1x enc, i, xmm_suffix, data, tmp
		.if USE_AVX10
		.if USE_AVX512
		_vaes \enc, KEY\i\xmm_suffix, \data
		.else
		.ifnb \xmm_suffix
		@@ -538,7 +529,7 @@
		// appropriate register or memory location for round \i. In addition, does two
		// steps of the computation of the next set of tweaks. May clobber V4 and V5.
		.macro _vaes_4x enc, i
		.if USE_AVX10
		.if USE_AVX512
		_tweak_step (2*(\i-5))
		_vaes \enc, KEY\i, V0
		_vaes \enc, KEY\i, V1
		@@ -574,7 +565,7 @@
		.irp i, 5,6,7,8,9,10,11,12,13
		_vaes_1x \enc, \i, \xmm_suffix, \data, tmp=\tmp
		.endr
		.if USE_AVX10
		.if USE_AVX512
		vpxord KEY14\xmm_suffix, \tweak, \tmp
		.else
		.ifnb \xmm_suffix
		@@ -617,11 +608,11 @@
		// This is the main loop, en/decrypting 4*VL bytes per iteration.

		// XOR each source block with its tweak and the zero-th round key.
		.if USE_AVX10
		_vmovdqu 0*VL(SRC), V0
		_vmovdqu 1*VL(SRC), V1
		_vmovdqu 2*VL(SRC), V2
		_vmovdqu 3*VL(SRC), V3
		.if USE_AVX512
		vmovdqu8 0*VL(SRC), V0
		vmovdqu8 1*VL(SRC), V1
		vmovdqu8 2*VL(SRC), V2
		vmovdqu8 3*VL(SRC), V3
		vpternlogd $0x96, TWEAK0, KEY0, V0
		vpternlogd $0x96, TWEAK1, KEY0, V1
		vpternlogd $0x96, TWEAK2, KEY0, V2
		@@ -654,7 +645,7 @@
		// Reduce latency by doing the XOR before the vaesenclast, utilizing the
		// property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a)
		// (and likewise for vaesdeclast).
		.if USE_AVX10
		.if USE_AVX512
		_tweak_step 18
		_tweak_step 19
		vpxord TWEAK0, KEY14, V4
		@@ -762,7 +753,7 @@
		_aes_crypt \enc, _XMM, TWEAK1_XMM, %xmm0, tmp=%xmm1
		.endif

		.if USE_AVX10
		.if USE_AVX512
		// Create a mask that has the first LEN bits set.
		mov $-1, %r9d
		bzhi LEN, %r9d, %r9d
		@@ -811,7 +802,7 @@
		// u8 iv[AES_BLOCK_SIZE]);
		//
		// Encrypt |iv| using the AES key |tweak_key| to get the first tweak. Assumes
		// that the CPU supports AES-NI and AVX, but not necessarily VAES or AVX10.
		// that the CPU supports AES-NI and AVX, but not necessarily VAES or AVX512.
		SYM_TYPED_FUNC_START(aes_xts_encrypt_iv)
		.set TWEAK_KEY, %rdi
		.set IV, %rsi
		@@ -853,7 +844,7 @@ SYM_FUNC_END(aes_xts_encrypt_iv)
		// multiple of 16, then this function updates |tweak| to contain the next tweak.

		.set VL, 16
		.set USE_AVX10, 0
		.set USE_AVX512, 0
		SYM_TYPED_FUNC_START(aes_xts_encrypt_aesni_avx)
		_aes_xts_crypt 1
		SYM_FUNC_END(aes_xts_encrypt_aesni_avx)
		@@ -863,7 +854,7 @@ SYM_FUNC_END(aes_xts_decrypt_aesni_avx)

		#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ)
		.set VL, 32
		.set USE_AVX10, 0
		.set USE_AVX512, 0
		SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx2)
		_aes_xts_crypt 1
		SYM_FUNC_END(aes_xts_encrypt_vaes_avx2)
		@@ -871,21 +862,12 @@ SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx2)
		_aes_xts_crypt 0
		SYM_FUNC_END(aes_xts_decrypt_vaes_avx2)

		.set VL, 32
		.set USE_AVX10, 1
		SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx10_256)
		_aes_xts_crypt 1
		SYM_FUNC_END(aes_xts_encrypt_vaes_avx10_256)
		SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx10_256)
		_aes_xts_crypt 0
		SYM_FUNC_END(aes_xts_decrypt_vaes_avx10_256)

		.set VL, 64
		.set USE_AVX10, 1
		SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx10_512)
		.set USE_AVX512, 1
		SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx512)
		_aes_xts_crypt 1
		SYM_FUNC_END(aes_xts_encrypt_vaes_avx10_512)
		SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx10_512)
		SYM_FUNC_END(aes_xts_encrypt_vaes_avx512)
		SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx512)
		_aes_xts_crypt 0
		SYM_FUNC_END(aes_xts_decrypt_vaes_avx10_512)
		SYM_FUNC_END(aes_xts_decrypt_vaes_avx512)
		#endif /* CONFIG_AS_VAES && CONFIG_AS_VPCLMULQDQ */

arch/x86/crypto/aesni-intel_glue.c

+10 −20

Original line number	Diff line number	Diff line
		@@ -844,8 +844,7 @@ simd_skcipher_algs_##suffix[ARRAY_SIZE(skcipher_algs_##suffix)]
		DEFINE_AVX_SKCIPHER_ALGS(aesni_avx, "aesni-avx", 500);
		#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ)
		DEFINE_AVX_SKCIPHER_ALGS(vaes_avx2, "vaes-avx2", 600);
		DEFINE_AVX_SKCIPHER_ALGS(vaes_avx10_256, "vaes-avx10_256", 700);
		DEFINE_AVX_SKCIPHER_ALGS(vaes_avx10_512, "vaes-avx10_512", 800);
		DEFINE_AVX_SKCIPHER_ALGS(vaes_avx512, "vaes-avx512", 800);
		#endif

		/* The common part of the x86_64 AES-GCM key struct */
		@@ -1592,11 +1591,6 @@ static int __init register_avx_algs(void)
		XFEATURE_MASK_AVX512, NULL))
		return 0;

		err = simd_register_skciphers_compat(skcipher_algs_vaes_avx10_256,
		ARRAY_SIZE(skcipher_algs_vaes_avx10_256),
		simd_skcipher_algs_vaes_avx10_256);
		if (err)
		return err;
		err = simd_register_aeads_compat(aes_gcm_algs_vaes_avx10_256,
		ARRAY_SIZE(aes_gcm_algs_vaes_avx10_256),
		aes_gcm_simdalgs_vaes_avx10_256);
		@@ -1606,15 +1600,15 @@ static int __init register_avx_algs(void)
		if (boot_cpu_has(X86_FEATURE_PREFER_YMM)) {
		int i;

		for (i = 0; i < ARRAY_SIZE(skcipher_algs_vaes_avx10_512); i++)
		skcipher_algs_vaes_avx10_512[i].base.cra_priority = 1;
		for (i = 0; i < ARRAY_SIZE(skcipher_algs_vaes_avx512); i++)
		skcipher_algs_vaes_avx512[i].base.cra_priority = 1;
		for (i = 0; i < ARRAY_SIZE(aes_gcm_algs_vaes_avx10_512); i++)
		aes_gcm_algs_vaes_avx10_512[i].base.cra_priority = 1;
		}

		err = simd_register_skciphers_compat(skcipher_algs_vaes_avx10_512,
		ARRAY_SIZE(skcipher_algs_vaes_avx10_512),
		simd_skcipher_algs_vaes_avx10_512);
		err = simd_register_skciphers_compat(skcipher_algs_vaes_avx512,
		ARRAY_SIZE(skcipher_algs_vaes_avx512),
		simd_skcipher_algs_vaes_avx512);
		if (err)
		return err;
		err = simd_register_aeads_compat(aes_gcm_algs_vaes_avx10_512,
		@@ -1641,18 +1635,14 @@ static void unregister_avx_algs(void)
		simd_unregister_skciphers(skcipher_algs_vaes_avx2,
		ARRAY_SIZE(skcipher_algs_vaes_avx2),
		simd_skcipher_algs_vaes_avx2);
		if (simd_skcipher_algs_vaes_avx10_256[0])
		simd_unregister_skciphers(skcipher_algs_vaes_avx10_256,
		ARRAY_SIZE(skcipher_algs_vaes_avx10_256),
		simd_skcipher_algs_vaes_avx10_256);
		if (aes_gcm_simdalgs_vaes_avx10_256[0])
		simd_unregister_aeads(aes_gcm_algs_vaes_avx10_256,
		ARRAY_SIZE(aes_gcm_algs_vaes_avx10_256),
		aes_gcm_simdalgs_vaes_avx10_256);
		if (simd_skcipher_algs_vaes_avx10_512[0])
		simd_unregister_skciphers(skcipher_algs_vaes_avx10_512,
		ARRAY_SIZE(skcipher_algs_vaes_avx10_512),
		simd_skcipher_algs_vaes_avx10_512);
		if (simd_skcipher_algs_vaes_avx512[0])
		simd_unregister_skciphers(skcipher_algs_vaes_avx512,
		ARRAY_SIZE(skcipher_algs_vaes_avx512),
		simd_skcipher_algs_vaes_avx512);
		if (aes_gcm_simdalgs_vaes_avx10_512[0])
		simd_unregister_aeads(aes_gcm_algs_vaes_avx10_512,
		ARRAY_SIZE(aes_gcm_algs_vaes_avx10_512),