Commit 12b11e47 authored by Eric Biggers
Browse files

lib/crypto: arm64: Assume a little-endian kernel



Since support for big-endian arm64 kernels was removed, the CPU_LE()
macro now unconditionally emits the code it is passed, and the CPU_BE()
macro now unconditionally discards the code it is passed.

Simplify the assembly code in lib/crypto/arm64/ accordingly.

Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20260401003331.144065-1-ebiggers@kernel.org


Signed-off-by: Eric Biggers <ebiggers@kernel.org>
parent 180e92df
Loading
Loading
Loading
Loading
+0 −10
Original line number Diff line number Diff line
@@ -87,11 +87,6 @@
	ldp		w8, w9, [rk], #16
	ldp		w10, w11, [rk, #-8]

CPU_BE(	rev		w4, w4		)
CPU_BE(	rev		w5, w5		)
CPU_BE(	rev		w6, w6		)
CPU_BE(	rev		w7, w7		)

	eor		w4, w4, w8
	eor		w5, w5, w9
	eor		w6, w6, w10
@@ -112,11 +107,6 @@ CPU_BE( rev w7, w7 )
3:	adr_l		tt, \ltab
	\round		w4, w5, w6, w7, w8, w9, w10, w11, \bsz, b

CPU_BE(	rev		w4, w4		)
CPU_BE(	rev		w5, w5		)
CPU_BE(	rev		w6, w6		)
CPU_BE(	rev		w7, w7		)

	stp		w4, w5, [out]
	stp		w6, w7, [out, #8]
	ret
+0 −16
Original line number Diff line number Diff line
@@ -531,10 +531,6 @@ SYM_FUNC_START(chacha_4block_xor_neon)
	add		v3.4s, v3.4s, v19.4s
	  add		a2, a2, w8
	  add		a3, a3, w9
CPU_BE(	  rev		a0, a0		)
CPU_BE(	  rev		a1, a1		)
CPU_BE(	  rev		a2, a2		)
CPU_BE(	  rev		a3, a3		)

	ld4r		{v24.4s-v27.4s}, [x0], #16
	ld4r		{v28.4s-v31.4s}, [x0]
@@ -555,10 +551,6 @@ CPU_BE( rev a3, a3 )
	add		v7.4s, v7.4s, v23.4s
	  add		a6, a6, w8
	  add		a7, a7, w9
CPU_BE(	  rev		a4, a4		)
CPU_BE(	  rev		a5, a5		)
CPU_BE(	  rev		a6, a6		)
CPU_BE(	  rev		a7, a7		)

	// x8[0-3] += s2[0]
	// x9[0-3] += s2[1]
@@ -576,10 +568,6 @@ CPU_BE( rev a7, a7 )
	add		v11.4s, v11.4s, v27.4s
	  add		a10, a10, w8
	  add		a11, a11, w9
CPU_BE(	  rev		a8, a8		)
CPU_BE(	  rev		a9, a9		)
CPU_BE(	  rev		a10, a10	)
CPU_BE(	  rev		a11, a11	)

	// x12[0-3] += s3[0]
	// x13[0-3] += s3[1]
@@ -597,10 +585,6 @@ CPU_BE( rev a11, a11 )
	add		v15.4s, v15.4s, v31.4s
	  add		a14, a14, w8
	  add		a15, a15, w9
CPU_BE(	  rev		a12, a12	)
CPU_BE(	  rev		a13, a13	)
CPU_BE(	  rev		a14, a14	)
CPU_BE(	  rev		a15, a15	)

	// interleave 32-bit words in state n, n+1
	  ldp		w6, w7, [x2], #64
+1 −1
Original line number Diff line number Diff line
@@ -192,7 +192,7 @@ SYM_FUNC_START(pmull_ghash_update_p8)
	sub		x0, x0, #1

	/* multiply XL by SHASH in GF(2^128) */
CPU_LE(	rev64		T1.16b, T1.16b	)
	rev64		T1.16b, T1.16b

	ext		T2.16b, XL.16b, XL.16b, #8
	ext		IN1.16b, T1.16b, T1.16b, #8
+4 −4
Original line number Diff line number Diff line
@@ -80,10 +80,10 @@ SYM_FUNC_START(sha1_ce_transform)
0:	ld1		{v8.4s-v11.4s}, [x1], #64
	sub		x2, x2, #1

CPU_LE(	rev32		v8.16b, v8.16b		)
CPU_LE(	rev32		v9.16b, v9.16b		)
CPU_LE(	rev32		v10.16b, v10.16b	)
CPU_LE(	rev32		v11.16b, v11.16b	)
	rev32		v8.16b, v8.16b
	rev32		v9.16b, v9.16b
	rev32		v10.16b, v10.16b
	rev32		v11.16b, v11.16b

	add		t0.4s, v8.4s, k0.4s
	mov		dg0v.16b, dgav.16b
+19 −22
Original line number Diff line number Diff line
@@ -94,10 +94,10 @@ SYM_FUNC_START(sha256_ce_transform)
0:	ld1		{v16.4s-v19.4s}, [x1], #64
	sub		x2, x2, #1

CPU_LE(	rev32		v16.16b, v16.16b	)
CPU_LE(	rev32		v17.16b, v17.16b	)
CPU_LE(	rev32		v18.16b, v18.16b	)
CPU_LE(	rev32		v19.16b, v19.16b	)
	rev32		v16.16b, v16.16b
	rev32		v17.16b, v17.16b
	rev32		v18.16b, v18.16b
	rev32		v19.16b, v19.16b

	add		t0.4s, v16.4s, v0.4s
	mov		dg0v.16b, dgav.16b
@@ -289,14 +289,14 @@ SYM_FUNC_START(sha256_ce_finup2x)
	ld1		{v20.4s-v23.4s}, [data2], #64
.Lfinup2x_loop_have_data:
	// Convert the words of the data blocks from big endian.
CPU_LE(	rev32		v16.16b, v16.16b	)
CPU_LE(	rev32		v17.16b, v17.16b	)
CPU_LE(	rev32		v18.16b, v18.16b	)
CPU_LE(	rev32		v19.16b, v19.16b	)
CPU_LE(	rev32		v20.16b, v20.16b	)
CPU_LE(	rev32		v21.16b, v21.16b	)
CPU_LE(	rev32		v22.16b, v22.16b	)
CPU_LE(	rev32		v23.16b, v23.16b	)
	rev32		v16.16b, v16.16b
	rev32		v17.16b, v17.16b
	rev32		v18.16b, v18.16b
	rev32		v19.16b, v19.16b
	rev32		v20.16b, v20.16b
	rev32		v21.16b, v21.16b
	rev32		v22.16b, v22.16b
	rev32		v23.16b, v23.16b
.Lfinup2x_loop_have_bswapped_data:

	// Save the original state for each block.
@@ -336,11 +336,8 @@ CPU_LE( rev32 v23.16b, v23.16b )
	sub		w8, len, #64		// w8 = len - 64
	add		data1, data1, w8, sxtw	// data1 += len - 64
	add		data2, data2, w8, sxtw	// data2 += len - 64
CPU_LE(	mov		x9, #0x80		)
CPU_LE(	fmov		d16, x9			)
CPU_BE(	movi		v16.16b, #0		)
CPU_BE(	mov		x9, #0x8000000000000000	)
CPU_BE(	mov		v16.d[1], x9		)
	mov		x9, #0x80
	fmov		d16, x9
	movi		v17.16b, #0
	stp		q16, q17, [sp, #64]
	stp		q17, q17, [sp, #96]
@@ -348,7 +345,7 @@ CPU_BE( mov v16.d[1], x9 )
	cmp		len, #56
	b.ge		1f		// will count spill into its own block?
	lsl		count, count, #3
CPU_LE(	rev		count, count		)
	rev		count, count
	str		count, [x9, #56]
	mov		final_step, #2	// won't need count-only block
	b		2f
@@ -393,10 +390,10 @@ CPU_LE( rev count, count )

.Lfinup2x_done:
	// Write the two digests with all bytes in the correct order.
CPU_LE(	rev32		state0_a.16b, state0_a.16b	)
CPU_LE(	rev32		state1_a.16b, state1_a.16b	)
CPU_LE(	rev32		state0_b.16b, state0_b.16b	)
CPU_LE(	rev32		state1_b.16b, state1_b.16b	)
	rev32		state0_a.16b, state0_a.16b
	rev32		state1_a.16b, state1_a.16b
	rev32		state0_b.16b, state0_b.16b
	rev32		state1_b.16b, state1_b.16b
	st1		{state0_a.4s-state1_a.4s}, [out1]
	st1		{state0_b.4s-state1_b.4s}, [out2]
	add		sp, sp, #128
Loading