lib/crypto: arm64/sha256: Add support for 2-way interleaved hashing (34c3f1e3) · Commits · git / linux-nf

lib/crypto/arm64/sha256-ce.S

+278 −6

Original line number	Diff line number	Diff line
		@@ -70,18 +70,22 @@
		.word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
		.word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2

		.macro load_round_constants tmp
		adr_l \tmp, .Lsha2_rcon
		ld1 { v0.4s- v3.4s}, [\tmp], #64
		ld1 { v4.4s- v7.4s}, [\tmp], #64
		ld1 { v8.4s-v11.4s}, [\tmp], #64
		ld1 {v12.4s-v15.4s}, [\tmp]
		.endm

		/*
		* size_t __sha256_ce_transform(struct sha256_block_state *state,
		* const u8 *data, size_t nblocks);
		*/
		.text
		SYM_FUNC_START(__sha256_ce_transform)
		/* load round constants */
		adr_l x8, .Lsha2_rcon
		ld1 { v0.4s- v3.4s}, [x8], #64
		ld1 { v4.4s- v7.4s}, [x8], #64
		ld1 { v8.4s-v11.4s}, [x8], #64
		ld1 {v12.4s-v15.4s}, [x8]

		load_round_constants x8

		/* load state */
		ld1 {dgav.4s, dgbv.4s}, [x0]
		@@ -134,3 +138,271 @@ CPU_LE( rev32 v19.16b, v19.16b )
		mov x0, x2
		ret
		SYM_FUNC_END(__sha256_ce_transform)

		.unreq dga
		.unreq dgav
		.unreq dgb
		.unreq dgbv
		.unreq t0
		.unreq t1
		.unreq dg0q
		.unreq dg0v
		.unreq dg1q
		.unreq dg1v
		.unreq dg2q
		.unreq dg2v

		// parameters for sha256_ce_finup2x()
		ctx .req x0
		data1 .req x1
		data2 .req x2
		len .req w3
		out1 .req x4
		out2 .req x5

		// other scalar variables
		count .req x6
		final_step .req w7

		// x8-x9 are used as temporaries.

		// v0-v15 are used to cache the SHA-256 round constants.
		// v16-v19 are used for the message schedule for the first message.
		// v20-v23 are used for the message schedule for the second message.
		// v24-v31 are used for the state and temporaries as given below.
		// _a are for the first message and _b for the second.
		state0_a_q .req q24
		state0_a .req v24
		state1_a_q .req q25
		state1_a .req v25
		state0_b_q .req q26
		state0_b .req v26
		state1_b_q .req q27
		state1_b .req v27
		t0_a .req v28
		t0_b .req v29
		t1_a_q .req q30
		t1_a .req v30
		t1_b_q .req q31
		t1_b .req v31

		#define OFFSETOF_BYTECOUNT 32 // offsetof(struct __sha256_ctx, bytecount)
		#define OFFSETOF_BUF 40 // offsetof(struct __sha256_ctx, buf)
		// offsetof(struct __sha256_ctx, state) is assumed to be 0.

		// Do 4 rounds of SHA-256 for each of two messages (interleaved). m0_a
		// and m0_b contain the current 4 message schedule words for the first
		// and second message respectively.
		//
		// If not all the message schedule words have been computed yet, then
		// this also computes 4 more message schedule words for each message.
		// m1_a-m3_a contain the next 3 groups of 4 message schedule words for
		// the first message, and likewise m1_b-m3_b for the second. After
		// consuming the current value of m0_a, this macro computes the group
		// after m3_a and writes it to m0_a, and likewise for *_b. This means
		// that the next (m0_a, m1_a, m2_a, m3_a) is the current (m1_a, m2_a,
		// m3_a, m0_a), and likewise for *_b, so the caller must cycle through
		// the registers accordingly.
		.macro do_4rounds_2x i, k, m0_a, m1_a, m2_a, m3_a, \
		m0_b, m1_b, m2_b, m3_b
		add t0_a\().4s, \m0_a\().4s, \k\().4s
		add t0_b\().4s, \m0_b\().4s, \k\().4s
		.if \i < 48
		sha256su0 \m0_a\().4s, \m1_a\().4s
		sha256su0 \m0_b\().4s, \m1_b\().4s
		sha256su1 \m0_a\().4s, \m2_a\().4s, \m3_a\().4s
		sha256su1 \m0_b\().4s, \m2_b\().4s, \m3_b\().4s
		.endif
		mov t1_a.16b, state0_a.16b
		mov t1_b.16b, state0_b.16b
		sha256h state0_a_q, state1_a_q, t0_a\().4s
		sha256h state0_b_q, state1_b_q, t0_b\().4s
		sha256h2 state1_a_q, t1_a_q, t0_a\().4s
		sha256h2 state1_b_q, t1_b_q, t0_b\().4s
		.endm

		.macro do_16rounds_2x i, k0, k1, k2, k3
		do_4rounds_2x \i + 0, \k0, v16, v17, v18, v19, v20, v21, v22, v23
		do_4rounds_2x \i + 4, \k1, v17, v18, v19, v16, v21, v22, v23, v20
		do_4rounds_2x \i + 8, \k2, v18, v19, v16, v17, v22, v23, v20, v21
		do_4rounds_2x \i + 12, \k3, v19, v16, v17, v18, v23, v20, v21, v22
		.endm

		//
		// void sha256_ce_finup2x(const struct __sha256_ctx *ctx,
		// const u8 data1, const u8 data2, int len,
		// u8 out1[SHA256_DIGEST_SIZE],
		// u8 out2[SHA256_DIGEST_SIZE]);
		//
		// This function computes the SHA-256 digests of two messages \|data1\| and
		// \|data2\| that are both \|len\| bytes long, starting from the initial context
		// \|ctx\|. \|len\| must be at least SHA256_BLOCK_SIZE.
		//
		// The instructions for the two SHA-256 operations are interleaved. On many
		// CPUs, this is almost twice as fast as hashing each message individually due
		// to taking better advantage of the CPU's SHA-256 and SIMD throughput.
		//
		SYM_FUNC_START(sha256_ce_finup2x)
		sub sp, sp, #128
		mov final_step, #0
		load_round_constants x8

		// Load the initial state from ctx->state.
		ld1 {state0_a.4s-state1_a.4s}, [ctx]

		// Load ctx->bytecount. Take the mod 64 of it to get the number of
		// bytes that are buffered in ctx->buf. Also save it in a register with
		// len added to it.
		ldr x8, [ctx, #OFFSETOF_BYTECOUNT]
		add count, x8, len, sxtw
		and x8, x8, #63
		cbz x8, .Lfinup2x_enter_loop // No bytes buffered?

		// x8 bytes (1 to 63) are currently buffered in ctx->buf. Load them
		// followed by the first 64 - x8 bytes of data. Since len >= 64, we
		// just load 64 bytes from each of ctx->buf, data1, and data2
		// unconditionally and rearrange the data as needed.
		add x9, ctx, #OFFSETOF_BUF
		ld1 {v16.16b-v19.16b}, [x9]
		st1 {v16.16b-v19.16b}, [sp]

		ld1 {v16.16b-v19.16b}, [data1], #64
		add x9, sp, x8
		st1 {v16.16b-v19.16b}, [x9]
		ld1 {v16.4s-v19.4s}, [sp]

		ld1 {v20.16b-v23.16b}, [data2], #64
		st1 {v20.16b-v23.16b}, [x9]
		ld1 {v20.4s-v23.4s}, [sp]

		sub len, len, #64
		sub data1, data1, x8
		sub data2, data2, x8
		add len, len, w8
		mov state0_b.16b, state0_a.16b
		mov state1_b.16b, state1_a.16b
		b .Lfinup2x_loop_have_data

		.Lfinup2x_enter_loop:
		sub len, len, #64
		mov state0_b.16b, state0_a.16b
		mov state1_b.16b, state1_a.16b
		.Lfinup2x_loop:
		// Load the next two data blocks.
		ld1 {v16.4s-v19.4s}, [data1], #64
		ld1 {v20.4s-v23.4s}, [data2], #64
		.Lfinup2x_loop_have_data:
		// Convert the words of the data blocks from big endian.
		CPU_LE( rev32 v16.16b, v16.16b )
		CPU_LE( rev32 v17.16b, v17.16b )
		CPU_LE( rev32 v18.16b, v18.16b )
		CPU_LE( rev32 v19.16b, v19.16b )
		CPU_LE( rev32 v20.16b, v20.16b )
		CPU_LE( rev32 v21.16b, v21.16b )
		CPU_LE( rev32 v22.16b, v22.16b )
		CPU_LE( rev32 v23.16b, v23.16b )
		.Lfinup2x_loop_have_bswapped_data:

		// Save the original state for each block.
		st1 {state0_a.4s-state1_b.4s}, [sp]

		// Do the SHA-256 rounds on each block.
		do_16rounds_2x 0, v0, v1, v2, v3
		do_16rounds_2x 16, v4, v5, v6, v7
		do_16rounds_2x 32, v8, v9, v10, v11
		do_16rounds_2x 48, v12, v13, v14, v15

		// Add the original state for each block.
		ld1 {v16.4s-v19.4s}, [sp]
		add state0_a.4s, state0_a.4s, v16.4s
		add state1_a.4s, state1_a.4s, v17.4s
		add state0_b.4s, state0_b.4s, v18.4s
		add state1_b.4s, state1_b.4s, v19.4s

		// Update len and loop back if more blocks remain.
		sub len, len, #64
		tbz len, #31, .Lfinup2x_loop // len >= 0?

		// Check if any final blocks need to be handled.
		// final_step = 2: all done
		// final_step = 1: need to do count-only padding block
		// final_step = 0: need to do the block with 0x80 padding byte
		tbnz final_step, #1, .Lfinup2x_done
		tbnz final_step, #0, .Lfinup2x_finalize_countonly
		add len, len, #64
		cbz len, .Lfinup2x_finalize_blockaligned

		// Not block-aligned; 1 <= len <= 63 data bytes remain. Pad the block.
		// To do this, write the padding starting with the 0x80 byte to
		// &sp[64]. Then for each message, copy the last 64 data bytes to sp
		// and load from &sp[64 - len] to get the needed padding block. This
		// code relies on the data buffers being >= 64 bytes in length.
		sub w8, len, #64 // w8 = len - 64
		add data1, data1, w8, sxtw // data1 += len - 64
		add data2, data2, w8, sxtw // data2 += len - 64
		CPU_LE( mov x9, #0x80 )
		CPU_LE( fmov d16, x9 )
		CPU_BE( movi v16.16b, #0 )
		CPU_BE( mov x9, #0x8000000000000000 )
		CPU_BE( mov v16.d[1], x9 )
		movi v17.16b, #0
		stp q16, q17, [sp, #64]
		stp q17, q17, [sp, #96]
		sub x9, sp, w8, sxtw // x9 = &sp[64 - len]
		cmp len, #56
		b.ge 1f // will count spill into its own block?
		lsl count, count, #3
		CPU_LE( rev count, count )
		str count, [x9, #56]
		mov final_step, #2 // won't need count-only block
		b 2f
		1:
		mov final_step, #1 // will need count-only block
		2:
		ld1 {v16.16b-v19.16b}, [data1]
		st1 {v16.16b-v19.16b}, [sp]
		ld1 {v16.4s-v19.4s}, [x9]
		ld1 {v20.16b-v23.16b}, [data2]
		st1 {v20.16b-v23.16b}, [sp]
		ld1 {v20.4s-v23.4s}, [x9]
		b .Lfinup2x_loop_have_data

		// Prepare a padding block, either:
		//
		// {0x80, 0, 0, 0, ..., count (as __be64)}
		// This is for a block aligned message.
		//
		// { 0, 0, 0, 0, ..., count (as __be64)}
		// This is for a message whose length mod 64 is >= 56.
		//
		// Pre-swap the endianness of the words.
		.Lfinup2x_finalize_countonly:
		movi v16.2d, #0
		b 1f
		.Lfinup2x_finalize_blockaligned:
		mov x8, #0x80000000
		fmov d16, x8
		1:
		movi v17.2d, #0
		movi v18.2d, #0
		ror count, count, #29 // ror(lsl(count, 3), 32)
		mov v19.d[0], xzr
		mov v19.d[1], count
		mov v20.16b, v16.16b
		movi v21.2d, #0
		movi v22.2d, #0
		mov v23.16b, v19.16b
		mov final_step, #2
		b .Lfinup2x_loop_have_bswapped_data

		.Lfinup2x_done:
		// Write the two digests with all bytes in the correct order.
		CPU_LE( rev32 state0_a.16b, state0_a.16b )
		CPU_LE( rev32 state1_a.16b, state1_a.16b )
		CPU_LE( rev32 state0_b.16b, state0_b.16b )
		CPU_LE( rev32 state1_b.16b, state1_b.16b )
		st1 {state0_a.4s-state1_a.4s}, [out1]
		st1 {state0_b.4s-state1_b.4s}, [out2]
		add sp, sp, #128
		ret
		SYM_FUNC_END(sha256_ce_finup2x)

lib/crypto/arm64/sha256.h

+37 −0

Original line number	Diff line number	Diff line
		@@ -44,6 +44,43 @@ static void sha256_blocks(struct sha256_block_state *state,
		}
		}

		static_assert(offsetof(struct __sha256_ctx, state) == 0);
		static_assert(offsetof(struct __sha256_ctx, bytecount) == 32);
		static_assert(offsetof(struct __sha256_ctx, buf) == 40);
		asmlinkage void sha256_ce_finup2x(const struct __sha256_ctx *ctx,
		const u8 data1, const u8 data2, int len,
		u8 out1[SHA256_DIGEST_SIZE],
		u8 out2[SHA256_DIGEST_SIZE]);

		#define sha256_finup_2x_arch sha256_finup_2x_arch
		static bool sha256_finup_2x_arch(const struct __sha256_ctx *ctx,
		const u8 data1, const u8 data2, size_t len,
		u8 out1[SHA256_DIGEST_SIZE],
		u8 out2[SHA256_DIGEST_SIZE])
		{
		/*
		* The assembly requires len >= SHA256_BLOCK_SIZE && len <= INT_MAX.
		* Further limit len to 65536 to avoid spending too long with preemption
		* disabled. (Of course, in practice len is nearly always 4096 anyway.)
		*/
		if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
		static_branch_likely(&have_ce) && len >= SHA256_BLOCK_SIZE &&
		len <= 65536 && likely(may_use_simd())) {
		kernel_neon_begin();
		sha256_ce_finup2x(ctx, data1, data2, len, out1, out2);
		kernel_neon_end();
		kmsan_unpoison_memory(out1, SHA256_DIGEST_SIZE);
		kmsan_unpoison_memory(out2, SHA256_DIGEST_SIZE);
		return true;
		}
		return false;
		}

		static bool sha256_finup_2x_is_optimized_arch(void)
		{
		return static_key_enabled(&have_ce);
		}

		#ifdef CONFIG_KERNEL_MODE_NEON
		#define sha256_mod_init_arch sha256_mod_init_arch
		static inline void sha256_mod_init_arch(void)