Merge tag 'fsverity-for-linus' of git://git.kernel.org/pub/scm/fs/fsverity/linux

Pull interleaved SHA-256 hashing support from Eric Biggers:
 "Optimize fsverity with 2-way interleaved hashing

  Add support for 2-way interleaved SHA-256 hashing to lib/crypto/, and
  make fsverity use it for faster file data verification. This improves
  fsverity performance on many x86_64 and arm64 processors.

  Later, I plan to make dm-verity use this too"

* tag 'fsverity-for-linus' of git://git.kernel.org/pub/scm/fs/fsverity/linux:
  fsverity: Use 2-way interleaved SHA-256 hashing when supported
  fsverity: Remove inode parameter from fsverity_hash_block()
  lib/crypto: tests: Add tests and benchmark for sha256_finup_2x()
  lib/crypto: x86/sha256: Add support for 2-way interleaved hashing
  lib/crypto: arm64/sha256: Add support for 2-way interleaved hashing
  lib/crypto: sha256: Add support for 2-way interleaved hashing
Linus Torvalds 2025-09-29 15:55:20 -07:00
commit 1896ce8eb6
11 changed files with 1147 additions and 56 deletions
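
As a quick illustration of the new API (a sketch only: the hash_pair()
helper and its parameters are made up for this note, while the sha256_*
functions and types are the ones added or already declared in
<crypto/sha2.h>), a caller with two equal-length buffers that share a
hashed prefix such as a salt can obtain both digests in one call:

#include <crypto/sha2.h>

/*
 * Hash two equal-length buffers after a common salted prefix.
 * sha256_finup_2x() transparently falls back to hashing the two messages
 * sequentially when no 2-way interleaved implementation is available, so
 * sha256_finup_2x_is_optimized() only matters for deciding whether
 * batching blocks in pairs is worthwhile, as the fsverity change does.
 */
static void hash_pair(const u8 *salt, size_t salt_len,
		      const u8 *data1, const u8 *data2, size_t len,
		      u8 out1[SHA256_DIGEST_SIZE], u8 out2[SHA256_DIGEST_SIZE])
{
	struct sha256_ctx ctx;

	sha256_init(&ctx);
	sha256_update(&ctx, salt, salt_len);
	sha256_finup_2x(&ctx, data1, data2, len, out1, out2);
}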

View File

@@ -19,8 +19,7 @@ struct block_buffer {
};
/* Hash a block, writing the result to the next level's pending block buffer. */
static int hash_one_block(struct inode *inode,
const struct merkle_tree_params *params,
static int hash_one_block(const struct merkle_tree_params *params,
struct block_buffer *cur)
{
struct block_buffer *next = cur + 1;
@@ -36,8 +35,7 @@ static int hash_one_block(struct inode *inode,
/* Zero-pad the block if it's shorter than the block size. */
memset(&cur->data[cur->filled], 0, params->block_size - cur->filled);
fsverity_hash_block(params, inode, cur->data,
&next->data[next->filled]);
fsverity_hash_block(params, cur->data, &next->data[next->filled]);
next->filled += params->digest_size;
cur->filled = 0;
return 0;
@@ -123,7 +121,7 @@ static int build_merkle_tree(struct file *filp,
fsverity_err(inode, "Short read of file data");
goto out;
}
err = hash_one_block(inode, params, &buffers[-1]);
err = hash_one_block(params, &buffers[-1]);
if (err)
goto out;
for (level = 0; level < num_levels; level++) {
@@ -134,7 +132,7 @@ static int build_merkle_tree(struct file *filp,
}
/* Next block at @level is full */
err = hash_one_block(inode, params, &buffers[level]);
err = hash_one_block(params, &buffers[level]);
if (err)
goto out;
err = write_merkle_tree_block(inode,
@@ -154,7 +152,7 @@ static int build_merkle_tree(struct file *filp,
/* Finish all nonempty pending tree blocks. */
for (level = 0; level < num_levels; level++) {
if (buffers[level].filled != 0) {
err = hash_one_block(inode, params, &buffers[level]);
err = hash_one_block(params, &buffers[level]);
if (err)
goto out;
err = write_merkle_tree_block(inode,

View File

@@ -90,7 +90,7 @@ union fsverity_hash_ctx *
fsverity_prepare_hash_state(const struct fsverity_hash_alg *alg,
const u8 *salt, size_t salt_size);
void fsverity_hash_block(const struct merkle_tree_params *params,
const struct inode *inode, const void *data, u8 *out);
const void *data, u8 *out);
void fsverity_hash_buffer(const struct fsverity_hash_alg *alg,
const void *data, size_t size, u8 *out);
void __init fsverity_check_hash_algs(void);

View File

@@ -94,7 +94,6 @@ fsverity_prepare_hash_state(const struct fsverity_hash_alg *alg,
/**
* fsverity_hash_block() - hash a single data or hash block
* @params: the Merkle tree's parameters
* @inode: inode for which the hashing is being done
* @data: virtual address of a buffer containing the block to hash
* @out: output digest, size 'params->digest_size' bytes
*
@@ -102,7 +101,7 @@ fsverity_prepare_hash_state(const struct fsverity_hash_alg *alg,
* in the Merkle tree parameters.
*/
void fsverity_hash_block(const struct merkle_tree_params *params,
const struct inode *inode, const void *data, u8 *out)
const void *data, u8 *out)
{
union fsverity_hash_ctx ctx;

View File

@@ -10,6 +10,31 @@
#include <linux/bio.h>
#include <linux/export.h>
#define FS_VERITY_MAX_PENDING_BLOCKS 2
struct fsverity_pending_block {
const void *data;
u64 pos;
u8 real_hash[FS_VERITY_MAX_DIGEST_SIZE];
};
struct fsverity_verification_context {
struct inode *inode;
struct fsverity_info *vi;
unsigned long max_ra_pages;
/*
* This is the queue of data blocks that are pending verification. When
* the crypto layer supports interleaved hashing, we allow multiple
* blocks to be queued up in order to utilize it. This can improve
* performance significantly vs. sequential hashing of each block.
*/
int num_pending;
int max_pending;
struct fsverity_pending_block
pending_blocks[FS_VERITY_MAX_PENDING_BLOCKS];
};
static struct workqueue_struct *fsverity_read_workqueue;
/*
@@ -79,7 +104,7 @@ static bool is_hash_block_verified(struct fsverity_info *vi, struct page *hpage,
}
/*
* Verify a single data block against the file's Merkle tree.
* Verify the hash of a single data block against the file's Merkle tree.
*
* In principle, we need to verify the entire path to the root node. However,
* for efficiency the filesystem may cache the hash blocks. Therefore we need
@@ -88,10 +113,11 @@ static bool is_hash_block_verified(struct fsverity_info *vi, struct page *hpage,
*
* Return: %true if the data block is valid, else %false.
*/
static bool
verify_data_block(struct inode *inode, struct fsverity_info *vi,
const void *data, u64 data_pos, unsigned long max_ra_pages)
static bool verify_data_block(struct inode *inode, struct fsverity_info *vi,
const struct fsverity_pending_block *dblock,
unsigned long max_ra_pages)
{
const u64 data_pos = dblock->pos;
const struct merkle_tree_params *params = &vi->tree_params;
const unsigned int hsize = params->digest_size;
int level;
@@ -115,8 +141,12 @@ verify_data_block(struct inode *inode, struct fsverity_info *vi,
*/
u64 hidx = data_pos >> params->log_blocksize;
/* Up to 1 + FS_VERITY_MAX_LEVELS pages may be mapped at once */
BUILD_BUG_ON(1 + FS_VERITY_MAX_LEVELS > KM_MAX_IDX);
/*
* Up to FS_VERITY_MAX_PENDING_BLOCKS + FS_VERITY_MAX_LEVELS pages may
* be mapped at once.
*/
static_assert(FS_VERITY_MAX_PENDING_BLOCKS + FS_VERITY_MAX_LEVELS <=
KM_MAX_IDX);
if (unlikely(data_pos >= inode->i_size)) {
/*
@@ -127,7 +157,7 @@ verify_data_block(struct inode *inode, struct fsverity_info *vi,
* any part past EOF should be all zeroes. Therefore, we need
* to verify that any data blocks fully past EOF are all zeroes.
*/
if (memchr_inv(data, 0, params->block_size)) {
if (memchr_inv(dblock->data, 0, params->block_size)) {
fsverity_err(inode,
"FILE CORRUPTED! Data past EOF is not zeroed");
return false;
@@ -202,7 +232,7 @@ descend:
unsigned long hblock_idx = hblocks[level - 1].index;
unsigned int hoffset = hblocks[level - 1].hoffset;
fsverity_hash_block(params, inode, haddr, real_hash);
fsverity_hash_block(params, haddr, real_hash);
if (memcmp(want_hash, real_hash, hsize) != 0)
goto corrupted;
/*
@@ -220,18 +250,18 @@ descend:
put_page(hpage);
}
/* Finally, verify the data block. */
fsverity_hash_block(params, inode, data, real_hash);
if (memcmp(want_hash, real_hash, hsize) != 0)
/* Finally, verify the hash of the data block. */
if (memcmp(want_hash, dblock->real_hash, hsize) != 0)
goto corrupted;
return true;
corrupted:
fsverity_err(inode,
"FILE CORRUPTED! pos=%llu, level=%d, want_hash=%s:%*phN, real_hash=%s:%*phN",
data_pos, level - 1,
params->hash_alg->name, hsize, want_hash,
params->hash_alg->name, hsize, real_hash);
fsverity_err(
inode,
"FILE CORRUPTED! pos=%llu, level=%d, want_hash=%s:%*phN, real_hash=%s:%*phN",
data_pos, level - 1, params->hash_alg->name, hsize, want_hash,
params->hash_alg->name, hsize,
level == 0 ? dblock->real_hash : real_hash);
error:
for (; level > 0; level--) {
kunmap_local(hblocks[level - 1].addr);
@@ -240,13 +270,73 @@ error:
return false;
}
static bool
verify_data_blocks(struct folio *data_folio, size_t len, size_t offset,
unsigned long max_ra_pages)
static void
fsverity_init_verification_context(struct fsverity_verification_context *ctx,
struct inode *inode,
unsigned long max_ra_pages)
{
struct inode *inode = data_folio->mapping->host;
struct fsverity_info *vi = *fsverity_info_addr(inode);
const unsigned int block_size = vi->tree_params.block_size;
ctx->inode = inode;
ctx->vi = vi;
ctx->max_ra_pages = max_ra_pages;
ctx->num_pending = 0;
if (vi->tree_params.hash_alg->algo_id == HASH_ALGO_SHA256 &&
sha256_finup_2x_is_optimized())
ctx->max_pending = 2;
else
ctx->max_pending = 1;
}
static void
fsverity_clear_pending_blocks(struct fsverity_verification_context *ctx)
{
int i;
for (i = ctx->num_pending - 1; i >= 0; i--) {
kunmap_local(ctx->pending_blocks[i].data);
ctx->pending_blocks[i].data = NULL;
}
ctx->num_pending = 0;
}
static bool
fsverity_verify_pending_blocks(struct fsverity_verification_context *ctx)
{
struct fsverity_info *vi = ctx->vi;
const struct merkle_tree_params *params = &vi->tree_params;
int i;
if (ctx->num_pending == 2) {
/* num_pending == 2 implies that the algorithm is SHA-256 */
sha256_finup_2x(params->hashstate ? &params->hashstate->sha256 :
NULL,
ctx->pending_blocks[0].data,
ctx->pending_blocks[1].data, params->block_size,
ctx->pending_blocks[0].real_hash,
ctx->pending_blocks[1].real_hash);
} else {
for (i = 0; i < ctx->num_pending; i++)
fsverity_hash_block(params, ctx->pending_blocks[i].data,
ctx->pending_blocks[i].real_hash);
}
for (i = 0; i < ctx->num_pending; i++) {
if (!verify_data_block(ctx->inode, vi, &ctx->pending_blocks[i],
ctx->max_ra_pages))
return false;
}
fsverity_clear_pending_blocks(ctx);
return true;
}
static bool fsverity_add_data_blocks(struct fsverity_verification_context *ctx,
struct folio *data_folio, size_t len,
size_t offset)
{
struct fsverity_info *vi = ctx->vi;
const struct merkle_tree_params *params = &vi->tree_params;
const unsigned int block_size = params->block_size;
u64 pos = (u64)data_folio->index << PAGE_SHIFT;
if (WARN_ON_ONCE(len <= 0 || !IS_ALIGNED(len | offset, block_size)))
@@ -255,14 +345,11 @@ verify_data_blocks(struct folio *data_folio, size_t len, size_t offset,
folio_test_uptodate(data_folio)))
return false;
do {
void *data;
bool valid;
data = kmap_local_folio(data_folio, offset);
valid = verify_data_block(inode, vi, data, pos + offset,
max_ra_pages);
kunmap_local(data);
if (!valid)
ctx->pending_blocks[ctx->num_pending].data =
kmap_local_folio(data_folio, offset);
ctx->pending_blocks[ctx->num_pending].pos = pos + offset;
if (++ctx->num_pending == ctx->max_pending &&
!fsverity_verify_pending_blocks(ctx))
return false;
offset += block_size;
len -= block_size;
@@ -284,7 +371,15 @@ verify_data_blocks(struct folio *data_folio, size_t len, size_t offset,
*/
bool fsverity_verify_blocks(struct folio *folio, size_t len, size_t offset)
{
return verify_data_blocks(folio, len, offset, 0);
struct fsverity_verification_context ctx;
fsverity_init_verification_context(&ctx, folio->mapping->host, 0);
if (fsverity_add_data_blocks(&ctx, folio, len, offset) &&
fsverity_verify_pending_blocks(&ctx))
return true;
fsverity_clear_pending_blocks(&ctx);
return false;
}
EXPORT_SYMBOL_GPL(fsverity_verify_blocks);
@@ -305,6 +400,8 @@ EXPORT_SYMBOL_GPL(fsverity_verify_blocks);
*/
void fsverity_verify_bio(struct bio *bio)
{
struct inode *inode = bio_first_folio_all(bio)->mapping->host;
struct fsverity_verification_context ctx;
struct folio_iter fi;
unsigned long max_ra_pages = 0;
@@ -321,13 +418,21 @@ void fsverity_verify_bio(struct bio *bio)
max_ra_pages = bio->bi_iter.bi_size >> (PAGE_SHIFT + 2);
}
fsverity_init_verification_context(&ctx, inode, max_ra_pages);
bio_for_each_folio_all(fi, bio) {
if (!verify_data_blocks(fi.folio, fi.length, fi.offset,
max_ra_pages)) {
bio->bi_status = BLK_STS_IOERR;
break;
}
if (!fsverity_add_data_blocks(&ctx, fi.folio, fi.length,
fi.offset))
goto ioerr;
}
if (!fsverity_verify_pending_blocks(&ctx))
goto ioerr;
return;
ioerr:
fsverity_clear_pending_blocks(&ctx);
bio->bi_status = BLK_STS_IOERR;
}
EXPORT_SYMBOL_GPL(fsverity_verify_bio);
#endif /* CONFIG_BLOCK */

View File

@@ -375,6 +375,34 @@ void sha256_final(struct sha256_ctx *ctx, u8 out[SHA256_DIGEST_SIZE]);
*/
void sha256(const u8 *data, size_t len, u8 out[SHA256_DIGEST_SIZE]);
/**
* sha256_finup_2x() - Compute two SHA-256 digests from a common initial
* context. On some CPUs, this is faster than sequentially
* computing each digest.
* @ctx: an optional initial context, which may have already processed data. If
* NULL, a default initial context is used (equivalent to sha256_init()).
* @data1: data for the first message
* @data2: data for the second message
* @len: the length of each of @data1 and @data2, in bytes
* @out1: (output) the first SHA-256 message digest
* @out2: (output) the second SHA-256 message digest
*
* Context: Any context.
*/
void sha256_finup_2x(const struct sha256_ctx *ctx, const u8 *data1,
const u8 *data2, size_t len, u8 out1[SHA256_DIGEST_SIZE],
u8 out2[SHA256_DIGEST_SIZE]);
/**
* sha256_finup_2x_is_optimized() - Check if sha256_finup_2x() is using a real
* interleaved implementation, as opposed to a
* sequential fallback
* @return: true if optimized
*
* Context: Any context.
*/
bool sha256_finup_2x_is_optimized(void);
/**
* struct hmac_sha256_key - Prepared key for HMAC-SHA256
* @key: private
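
A small consistency check of the contract documented above (a sketch, not
part of this merge; the helper name is made up): with ctx == NULL,
sha256_finup_2x() must produce the same digests as two independent
one-shot sha256() calls.

#include <crypto/sha2.h>
#include <linux/string.h>

static bool finup_2x_matches_one_shot(const u8 *a, const u8 *b, size_t len)
{
	u8 d1[SHA256_DIGEST_SIZE], d2[SHA256_DIGEST_SIZE];
	u8 ref1[SHA256_DIGEST_SIZE], ref2[SHA256_DIGEST_SIZE];

	/* NULL ctx means the default initial context (as after sha256_init()). */
	sha256_finup_2x(NULL, a, b, len, d1, d2);
	sha256(a, len, ref1);
	sha256(b, len, ref2);
	return memcmp(d1, ref1, sizeof(d1)) == 0 &&
	       memcmp(d2, ref2, sizeof(d2)) == 0;
}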

View File

@@ -70,18 +70,22 @@
.word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
.word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
.macro load_round_constants tmp
adr_l \tmp, .Lsha2_rcon
ld1 { v0.4s- v3.4s}, [\tmp], #64
ld1 { v4.4s- v7.4s}, [\tmp], #64
ld1 { v8.4s-v11.4s}, [\tmp], #64
ld1 {v12.4s-v15.4s}, [\tmp]
.endm
/*
* size_t __sha256_ce_transform(struct sha256_block_state *state,
* const u8 *data, size_t nblocks);
*/
.text
SYM_FUNC_START(__sha256_ce_transform)
/* load round constants */
adr_l x8, .Lsha2_rcon
ld1 { v0.4s- v3.4s}, [x8], #64
ld1 { v4.4s- v7.4s}, [x8], #64
ld1 { v8.4s-v11.4s}, [x8], #64
ld1 {v12.4s-v15.4s}, [x8]
load_round_constants x8
/* load state */
ld1 {dgav.4s, dgbv.4s}, [x0]
@@ -134,3 +138,271 @@ CPU_LE( rev32 v19.16b, v19.16b )
mov x0, x2
ret
SYM_FUNC_END(__sha256_ce_transform)
.unreq dga
.unreq dgav
.unreq dgb
.unreq dgbv
.unreq t0
.unreq t1
.unreq dg0q
.unreq dg0v
.unreq dg1q
.unreq dg1v
.unreq dg2q
.unreq dg2v
// parameters for sha256_ce_finup2x()
ctx .req x0
data1 .req x1
data2 .req x2
len .req w3
out1 .req x4
out2 .req x5
// other scalar variables
count .req x6
final_step .req w7
// x8-x9 are used as temporaries.
// v0-v15 are used to cache the SHA-256 round constants.
// v16-v19 are used for the message schedule for the first message.
// v20-v23 are used for the message schedule for the second message.
// v24-v31 are used for the state and temporaries as given below.
// *_a are for the first message and *_b for the second.
state0_a_q .req q24
state0_a .req v24
state1_a_q .req q25
state1_a .req v25
state0_b_q .req q26
state0_b .req v26
state1_b_q .req q27
state1_b .req v27
t0_a .req v28
t0_b .req v29
t1_a_q .req q30
t1_a .req v30
t1_b_q .req q31
t1_b .req v31
#define OFFSETOF_BYTECOUNT 32 // offsetof(struct __sha256_ctx, bytecount)
#define OFFSETOF_BUF 40 // offsetof(struct __sha256_ctx, buf)
// offsetof(struct __sha256_ctx, state) is assumed to be 0.
// Do 4 rounds of SHA-256 for each of two messages (interleaved). m0_a
// and m0_b contain the current 4 message schedule words for the first
// and second message respectively.
//
// If not all the message schedule words have been computed yet, then
// this also computes 4 more message schedule words for each message.
// m1_a-m3_a contain the next 3 groups of 4 message schedule words for
// the first message, and likewise m1_b-m3_b for the second. After
// consuming the current value of m0_a, this macro computes the group
// after m3_a and writes it to m0_a, and likewise for *_b. This means
// that the next (m0_a, m1_a, m2_a, m3_a) is the current (m1_a, m2_a,
// m3_a, m0_a), and likewise for *_b, so the caller must cycle through
// the registers accordingly.
.macro do_4rounds_2x i, k, m0_a, m1_a, m2_a, m3_a, \
m0_b, m1_b, m2_b, m3_b
add t0_a\().4s, \m0_a\().4s, \k\().4s
add t0_b\().4s, \m0_b\().4s, \k\().4s
.if \i < 48
sha256su0 \m0_a\().4s, \m1_a\().4s
sha256su0 \m0_b\().4s, \m1_b\().4s
sha256su1 \m0_a\().4s, \m2_a\().4s, \m3_a\().4s
sha256su1 \m0_b\().4s, \m2_b\().4s, \m3_b\().4s
.endif
mov t1_a.16b, state0_a.16b
mov t1_b.16b, state0_b.16b
sha256h state0_a_q, state1_a_q, t0_a\().4s
sha256h state0_b_q, state1_b_q, t0_b\().4s
sha256h2 state1_a_q, t1_a_q, t0_a\().4s
sha256h2 state1_b_q, t1_b_q, t0_b\().4s
.endm
.macro do_16rounds_2x i, k0, k1, k2, k3
do_4rounds_2x \i + 0, \k0, v16, v17, v18, v19, v20, v21, v22, v23
do_4rounds_2x \i + 4, \k1, v17, v18, v19, v16, v21, v22, v23, v20
do_4rounds_2x \i + 8, \k2, v18, v19, v16, v17, v22, v23, v20, v21
do_4rounds_2x \i + 12, \k3, v19, v16, v17, v18, v23, v20, v21, v22
.endm
//
// void sha256_ce_finup2x(const struct __sha256_ctx *ctx,
// const u8 *data1, const u8 *data2, int len,
// u8 out1[SHA256_DIGEST_SIZE],
// u8 out2[SHA256_DIGEST_SIZE]);
//
// This function computes the SHA-256 digests of two messages |data1| and
// |data2| that are both |len| bytes long, starting from the initial context
// |ctx|. |len| must be at least SHA256_BLOCK_SIZE.
//
// The instructions for the two SHA-256 operations are interleaved. On many
// CPUs, this is almost twice as fast as hashing each message individually due
// to taking better advantage of the CPU's SHA-256 and SIMD throughput.
//
SYM_FUNC_START(sha256_ce_finup2x)
sub sp, sp, #128
mov final_step, #0
load_round_constants x8
// Load the initial state from ctx->state.
ld1 {state0_a.4s-state1_a.4s}, [ctx]
// Load ctx->bytecount. Take the mod 64 of it to get the number of
// bytes that are buffered in ctx->buf. Also save it in a register with
// len added to it.
ldr x8, [ctx, #OFFSETOF_BYTECOUNT]
add count, x8, len, sxtw
and x8, x8, #63
cbz x8, .Lfinup2x_enter_loop // No bytes buffered?
// x8 bytes (1 to 63) are currently buffered in ctx->buf. Load them
// followed by the first 64 - x8 bytes of data. Since len >= 64, we
// just load 64 bytes from each of ctx->buf, data1, and data2
// unconditionally and rearrange the data as needed.
add x9, ctx, #OFFSETOF_BUF
ld1 {v16.16b-v19.16b}, [x9]
st1 {v16.16b-v19.16b}, [sp]
ld1 {v16.16b-v19.16b}, [data1], #64
add x9, sp, x8
st1 {v16.16b-v19.16b}, [x9]
ld1 {v16.4s-v19.4s}, [sp]
ld1 {v20.16b-v23.16b}, [data2], #64
st1 {v20.16b-v23.16b}, [x9]
ld1 {v20.4s-v23.4s}, [sp]
sub len, len, #64
sub data1, data1, x8
sub data2, data2, x8
add len, len, w8
mov state0_b.16b, state0_a.16b
mov state1_b.16b, state1_a.16b
b .Lfinup2x_loop_have_data
.Lfinup2x_enter_loop:
sub len, len, #64
mov state0_b.16b, state0_a.16b
mov state1_b.16b, state1_a.16b
.Lfinup2x_loop:
// Load the next two data blocks.
ld1 {v16.4s-v19.4s}, [data1], #64
ld1 {v20.4s-v23.4s}, [data2], #64
.Lfinup2x_loop_have_data:
// Convert the words of the data blocks from big endian.
CPU_LE( rev32 v16.16b, v16.16b )
CPU_LE( rev32 v17.16b, v17.16b )
CPU_LE( rev32 v18.16b, v18.16b )
CPU_LE( rev32 v19.16b, v19.16b )
CPU_LE( rev32 v20.16b, v20.16b )
CPU_LE( rev32 v21.16b, v21.16b )
CPU_LE( rev32 v22.16b, v22.16b )
CPU_LE( rev32 v23.16b, v23.16b )
.Lfinup2x_loop_have_bswapped_data:
// Save the original state for each block.
st1 {state0_a.4s-state1_b.4s}, [sp]
// Do the SHA-256 rounds on each block.
do_16rounds_2x 0, v0, v1, v2, v3
do_16rounds_2x 16, v4, v5, v6, v7
do_16rounds_2x 32, v8, v9, v10, v11
do_16rounds_2x 48, v12, v13, v14, v15
// Add the original state for each block.
ld1 {v16.4s-v19.4s}, [sp]
add state0_a.4s, state0_a.4s, v16.4s
add state1_a.4s, state1_a.4s, v17.4s
add state0_b.4s, state0_b.4s, v18.4s
add state1_b.4s, state1_b.4s, v19.4s
// Update len and loop back if more blocks remain.
sub len, len, #64
tbz len, #31, .Lfinup2x_loop // len >= 0?
// Check if any final blocks need to be handled.
// final_step = 2: all done
// final_step = 1: need to do count-only padding block
// final_step = 0: need to do the block with 0x80 padding byte
tbnz final_step, #1, .Lfinup2x_done
tbnz final_step, #0, .Lfinup2x_finalize_countonly
add len, len, #64
cbz len, .Lfinup2x_finalize_blockaligned
// Not block-aligned; 1 <= len <= 63 data bytes remain. Pad the block.
// To do this, write the padding starting with the 0x80 byte to
// &sp[64]. Then for each message, copy the last 64 data bytes to sp
// and load from &sp[64 - len] to get the needed padding block. This
// code relies on the data buffers being >= 64 bytes in length.
sub w8, len, #64 // w8 = len - 64
add data1, data1, w8, sxtw // data1 += len - 64
add data2, data2, w8, sxtw // data2 += len - 64
CPU_LE( mov x9, #0x80 )
CPU_LE( fmov d16, x9 )
CPU_BE( movi v16.16b, #0 )
CPU_BE( mov x9, #0x8000000000000000 )
CPU_BE( mov v16.d[1], x9 )
movi v17.16b, #0
stp q16, q17, [sp, #64]
stp q17, q17, [sp, #96]
sub x9, sp, w8, sxtw // x9 = &sp[64 - len]
cmp len, #56
b.ge 1f // will count spill into its own block?
lsl count, count, #3
CPU_LE( rev count, count )
str count, [x9, #56]
mov final_step, #2 // won't need count-only block
b 2f
1:
mov final_step, #1 // will need count-only block
2:
ld1 {v16.16b-v19.16b}, [data1]
st1 {v16.16b-v19.16b}, [sp]
ld1 {v16.4s-v19.4s}, [x9]
ld1 {v20.16b-v23.16b}, [data2]
st1 {v20.16b-v23.16b}, [sp]
ld1 {v20.4s-v23.4s}, [x9]
b .Lfinup2x_loop_have_data
// Prepare a padding block, either:
//
// {0x80, 0, 0, 0, ..., count (as __be64)}
// This is for a block aligned message.
//
// { 0, 0, 0, 0, ..., count (as __be64)}
// This is for a message whose length mod 64 is >= 56.
//
// Pre-swap the endianness of the words.
.Lfinup2x_finalize_countonly:
movi v16.2d, #0
b 1f
.Lfinup2x_finalize_blockaligned:
mov x8, #0x80000000
fmov d16, x8
1:
movi v17.2d, #0
movi v18.2d, #0
ror count, count, #29 // ror(lsl(count, 3), 32)
mov v19.d[0], xzr
mov v19.d[1], count
mov v20.16b, v16.16b
movi v21.2d, #0
movi v22.2d, #0
mov v23.16b, v19.16b
mov final_step, #2
b .Lfinup2x_loop_have_bswapped_data
.Lfinup2x_done:
// Write the two digests with all bytes in the correct order.
CPU_LE( rev32 state0_a.16b, state0_a.16b )
CPU_LE( rev32 state1_a.16b, state1_a.16b )
CPU_LE( rev32 state0_b.16b, state0_b.16b )
CPU_LE( rev32 state1_b.16b, state1_b.16b )
st1 {state0_a.4s-state1_a.4s}, [out1]
st1 {state0_b.4s-state1_b.4s}, [out2]
add sp, sp, #128
ret
SYM_FUNC_END(sha256_ce_finup2x)
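
For readers following the final_step logic above: the padding the code
builds on the stack is just standard SHA-256 (FIPS 180-4) finalization.
A rough C model of it (illustrative only, not kernel code) is:

#include <stdint.h>
#include <string.h>

/*
 * Copy the trailing (total_bytes % 64) data bytes, append the 0x80 byte,
 * zero-fill, and store the total bit count big-endian in the last 8 bytes.
 * Returns how many 64-byte blocks (1 or 2) remain to be compressed; the
 * 2-block case corresponds to final_step = 1, the "count-only" block.
 */
static size_t sha256_final_blocks(uint8_t blocks[128], const uint8_t *tail,
				  size_t tail_len, uint64_t total_bytes)
{
	size_t nblocks = (tail_len >= 56) ? 2 : 1;
	uint64_t bitcount = total_bytes << 3;

	memset(blocks, 0, 128);
	memcpy(blocks, tail, tail_len);		/* tail_len = total_bytes % 64 */
	blocks[tail_len] = 0x80;
	for (int i = 0; i < 8; i++)
		blocks[nblocks * 64 - 1 - i] = (uint8_t)(bitcount >> (8 * i));
	return nblocks;
}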

View File

@@ -44,6 +44,43 @@ static void sha256_blocks(struct sha256_block_state *state,
}
}
static_assert(offsetof(struct __sha256_ctx, state) == 0);
static_assert(offsetof(struct __sha256_ctx, bytecount) == 32);
static_assert(offsetof(struct __sha256_ctx, buf) == 40);
asmlinkage void sha256_ce_finup2x(const struct __sha256_ctx *ctx,
const u8 *data1, const u8 *data2, int len,
u8 out1[SHA256_DIGEST_SIZE],
u8 out2[SHA256_DIGEST_SIZE]);
#define sha256_finup_2x_arch sha256_finup_2x_arch
static bool sha256_finup_2x_arch(const struct __sha256_ctx *ctx,
const u8 *data1, const u8 *data2, size_t len,
u8 out1[SHA256_DIGEST_SIZE],
u8 out2[SHA256_DIGEST_SIZE])
{
/*
* The assembly requires len >= SHA256_BLOCK_SIZE && len <= INT_MAX.
* Further limit len to 65536 to avoid spending too long with preemption
* disabled. (Of course, in practice len is nearly always 4096 anyway.)
*/
if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
static_branch_likely(&have_ce) && len >= SHA256_BLOCK_SIZE &&
len <= 65536 && likely(may_use_simd())) {
kernel_neon_begin();
sha256_ce_finup2x(ctx, data1, data2, len, out1, out2);
kernel_neon_end();
kmsan_unpoison_memory(out1, SHA256_DIGEST_SIZE);
kmsan_unpoison_memory(out2, SHA256_DIGEST_SIZE);
return true;
}
return false;
}
static bool sha256_finup_2x_is_optimized_arch(void)
{
return static_key_enabled(&have_ce);
}
#ifdef CONFIG_KERNEL_MODE_NEON
#define sha256_mod_init_arch sha256_mod_init_arch
static void sha256_mod_init_arch(void)

View File

@@ -25,13 +25,20 @@ static const struct sha256_block_state sha224_iv = {
},
};
static const struct sha256_block_state sha256_iv = {
.h = {
SHA256_H0, SHA256_H1, SHA256_H2, SHA256_H3,
SHA256_H4, SHA256_H5, SHA256_H6, SHA256_H7,
static const struct sha256_ctx initial_sha256_ctx = {
.ctx = {
.state = {
.h = {
SHA256_H0, SHA256_H1, SHA256_H2, SHA256_H3,
SHA256_H4, SHA256_H5, SHA256_H6, SHA256_H7,
},
},
.bytecount = 0,
},
};
#define sha256_iv (initial_sha256_ctx.ctx.state)
static const u32 sha256_K[64] = {
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1,
0x923f82a4, 0xab1c5ed5, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
@@ -261,8 +268,62 @@ void sha256(const u8 *data, size_t len, u8 out[SHA256_DIGEST_SIZE])
}
EXPORT_SYMBOL(sha256);
/* pre-boot environment (as indicated by __DISABLE_EXPORTS) doesn't need HMAC */
/*
* Pre-boot environment (as indicated by __DISABLE_EXPORTS being defined)
* doesn't need either HMAC support or interleaved hashing support
*/
#ifndef __DISABLE_EXPORTS
#ifndef sha256_finup_2x_arch
static bool sha256_finup_2x_arch(const struct __sha256_ctx *ctx,
const u8 *data1, const u8 *data2, size_t len,
u8 out1[SHA256_DIGEST_SIZE],
u8 out2[SHA256_DIGEST_SIZE])
{
return false;
}
static bool sha256_finup_2x_is_optimized_arch(void)
{
return false;
}
#endif
/* Sequential fallback implementation of sha256_finup_2x() */
static noinline_for_stack void sha256_finup_2x_sequential(
const struct __sha256_ctx *ctx, const u8 *data1, const u8 *data2,
size_t len, u8 out1[SHA256_DIGEST_SIZE], u8 out2[SHA256_DIGEST_SIZE])
{
struct __sha256_ctx mut_ctx;
mut_ctx = *ctx;
__sha256_update(&mut_ctx, data1, len);
__sha256_final(&mut_ctx, out1, SHA256_DIGEST_SIZE);
mut_ctx = *ctx;
__sha256_update(&mut_ctx, data2, len);
__sha256_final(&mut_ctx, out2, SHA256_DIGEST_SIZE);
}
void sha256_finup_2x(const struct sha256_ctx *ctx, const u8 *data1,
const u8 *data2, size_t len, u8 out1[SHA256_DIGEST_SIZE],
u8 out2[SHA256_DIGEST_SIZE])
{
if (ctx == NULL)
ctx = &initial_sha256_ctx;
if (likely(sha256_finup_2x_arch(&ctx->ctx, data1, data2, len, out1,
out2)))
return;
sha256_finup_2x_sequential(&ctx->ctx, data1, data2, len, out1, out2);
}
EXPORT_SYMBOL_GPL(sha256_finup_2x);
bool sha256_finup_2x_is_optimized(void)
{
return sha256_finup_2x_is_optimized_arch();
}
EXPORT_SYMBOL_GPL(sha256_finup_2x_is_optimized);
static void __hmac_sha256_preparekey(struct sha256_block_state *istate,
struct sha256_block_state *ostate,
const u8 *raw_key, size_t raw_key_len,

View File

@@ -5,6 +5,7 @@
#include <crypto/sha2.h>
#include "sha256-testvecs.h"
/* Generate the HASH_KUNIT_CASES using hash-test-template.h. */
#define HASH sha256
#define HASH_CTX sha256_ctx
#define HASH_SIZE SHA256_DIGEST_SIZE
@@ -21,9 +22,192 @@
#define HMAC_USINGRAWKEY hmac_sha256_usingrawkey
#include "hash-test-template.h"
static void free_guarded_buf(void *buf)
{
vfree(buf);
}
/*
* Allocate a KUnit-managed buffer that has length @len bytes immediately
* followed by an unmapped page, and assert that the allocation succeeds.
*/
static void *alloc_guarded_buf(struct kunit *test, size_t len)
{
size_t full_len = round_up(len, PAGE_SIZE);
void *buf = vmalloc(full_len);
KUNIT_ASSERT_NOT_NULL(test, buf);
KUNIT_ASSERT_EQ(test, 0,
kunit_add_action_or_reset(test, free_guarded_buf, buf));
return buf + full_len - len;
}
/*
* Test for sha256_finup_2x(). Specifically, choose various data lengths and
* salt lengths, and for each one, verify that sha256_finup_2x() produces the
* same results as sha256_update() and sha256_final().
*
* Use guarded buffers for all inputs and outputs to reliably detect any
* out-of-bounds reads or writes, even if they occur in assembly code.
*/
static void test_sha256_finup_2x(struct kunit *test)
{
const size_t max_data_len = 16384;
u8 *data1_buf, *data2_buf, *hash1, *hash2;
u8 expected_hash1[SHA256_DIGEST_SIZE];
u8 expected_hash2[SHA256_DIGEST_SIZE];
u8 salt[SHA256_BLOCK_SIZE];
struct sha256_ctx *ctx;
data1_buf = alloc_guarded_buf(test, max_data_len);
data2_buf = alloc_guarded_buf(test, max_data_len);
hash1 = alloc_guarded_buf(test, SHA256_DIGEST_SIZE);
hash2 = alloc_guarded_buf(test, SHA256_DIGEST_SIZE);
ctx = alloc_guarded_buf(test, sizeof(*ctx));
rand_bytes(data1_buf, max_data_len);
rand_bytes(data2_buf, max_data_len);
rand_bytes(salt, sizeof(salt));
for (size_t i = 0; i < 500; i++) {
size_t salt_len = rand_length(sizeof(salt));
size_t data_len = rand_length(max_data_len);
const u8 *data1 = data1_buf + max_data_len - data_len;
const u8 *data2 = data2_buf + max_data_len - data_len;
struct sha256_ctx orig_ctx;
sha256_init(ctx);
sha256_update(ctx, salt, salt_len);
orig_ctx = *ctx;
sha256_finup_2x(ctx, data1, data2, data_len, hash1, hash2);
KUNIT_ASSERT_MEMEQ_MSG(
test, ctx, &orig_ctx, sizeof(*ctx),
"sha256_finup_2x() modified its ctx argument");
sha256_update(ctx, data1, data_len);
sha256_final(ctx, expected_hash1);
sha256_update(&orig_ctx, data2, data_len);
sha256_final(&orig_ctx, expected_hash2);
KUNIT_ASSERT_MEMEQ_MSG(
test, hash1, expected_hash1, SHA256_DIGEST_SIZE,
"Wrong hash1 with salt_len=%zu data_len=%zu", salt_len,
data_len);
KUNIT_ASSERT_MEMEQ_MSG(
test, hash2, expected_hash2, SHA256_DIGEST_SIZE,
"Wrong hash2 with salt_len=%zu data_len=%zu", salt_len,
data_len);
}
}
/* Test sha256_finup_2x() with ctx == NULL */
static void test_sha256_finup_2x_defaultctx(struct kunit *test)
{
const size_t data_len = 128;
struct sha256_ctx ctx;
u8 hash1_a[SHA256_DIGEST_SIZE];
u8 hash2_a[SHA256_DIGEST_SIZE];
u8 hash1_b[SHA256_DIGEST_SIZE];
u8 hash2_b[SHA256_DIGEST_SIZE];
rand_bytes(test_buf, 2 * data_len);
sha256_init(&ctx);
sha256_finup_2x(&ctx, test_buf, &test_buf[data_len], data_len, hash1_a,
hash2_a);
sha256_finup_2x(NULL, test_buf, &test_buf[data_len], data_len, hash1_b,
hash2_b);
KUNIT_ASSERT_MEMEQ(test, hash1_a, hash1_b, SHA256_DIGEST_SIZE);
KUNIT_ASSERT_MEMEQ(test, hash2_a, hash2_b, SHA256_DIGEST_SIZE);
}
/*
* Test that sha256_finup_2x() and sha256_update/final() produce consistent
* results with total message lengths that require more than 32 bits.
*/
static void test_sha256_finup_2x_hugelen(struct kunit *test)
{
const size_t data_len = 4 * SHA256_BLOCK_SIZE;
struct sha256_ctx ctx = {};
u8 expected_hash[SHA256_DIGEST_SIZE];
u8 hash[SHA256_DIGEST_SIZE];
rand_bytes(test_buf, data_len);
for (size_t align = 0; align < SHA256_BLOCK_SIZE; align++) {
sha256_init(&ctx);
ctx.ctx.bytecount = 0x123456789abcd00 + align;
sha256_finup_2x(&ctx, test_buf, test_buf, data_len, hash, hash);
sha256_update(&ctx, test_buf, data_len);
sha256_final(&ctx, expected_hash);
KUNIT_ASSERT_MEMEQ(test, hash, expected_hash,
SHA256_DIGEST_SIZE);
}
}
/* Benchmark for sha256_finup_2x() */
static void benchmark_sha256_finup_2x(struct kunit *test)
{
/*
* Try a few different salt lengths, since sha256_finup_2x() performance
* may vary slightly for the same data_len depending on how many bytes
* were already processed in the initial context.
*/
static const size_t salt_lens_to_test[] = { 0, 32, 64 };
const size_t data_len = 4096;
const size_t num_iters = 4096;
struct sha256_ctx ctx;
u8 hash1[SHA256_DIGEST_SIZE];
u8 hash2[SHA256_DIGEST_SIZE];
if (!IS_ENABLED(CONFIG_CRYPTO_LIB_BENCHMARK))
kunit_skip(test, "not enabled");
if (!sha256_finup_2x_is_optimized())
kunit_skip(test, "not relevant");
rand_bytes(test_buf, data_len * 2);
/* Warm-up */
for (size_t i = 0; i < num_iters; i++)
sha256_finup_2x(NULL, &test_buf[0], &test_buf[data_len],
data_len, hash1, hash2);
for (size_t i = 0; i < ARRAY_SIZE(salt_lens_to_test); i++) {
size_t salt_len = salt_lens_to_test[i];
u64 t0, t1;
/*
* Prepare the initial context. The time to process the salt is
* not measured; we're just interested in sha256_finup_2x().
*/
sha256_init(&ctx);
sha256_update(&ctx, test_buf, salt_len);
preempt_disable();
t0 = ktime_get_ns();
for (size_t j = 0; j < num_iters; j++)
sha256_finup_2x(&ctx, &test_buf[0], &test_buf[data_len],
data_len, hash1, hash2);
t1 = ktime_get_ns();
preempt_enable();
kunit_info(test, "data_len=%zu salt_len=%zu: %llu MB/s",
data_len, salt_len,
div64_u64((u64)data_len * 2 * num_iters * 1000,
t1 - t0 ?: 1));
}
}
static struct kunit_case hash_test_cases[] = {
HASH_KUNIT_CASES,
KUNIT_CASE(test_sha256_finup_2x),
KUNIT_CASE(test_sha256_finup_2x_defaultctx),
KUNIT_CASE(test_sha256_finup_2x_hugelen),
KUNIT_CASE(benchmark_hash),
KUNIT_CASE(benchmark_sha256_finup_2x),
{},
};

View File

@@ -165,6 +165,374 @@ SYM_FUNC_START(sha256_ni_transform)
RET
SYM_FUNC_END(sha256_ni_transform)
#undef DIGEST_PTR
#undef DATA_PTR
#undef NUM_BLKS
#undef SHA256CONSTANTS
#undef MSG
#undef STATE0
#undef STATE1
#undef MSG0
#undef MSG1
#undef MSG2
#undef MSG3
#undef TMP
#undef SHUF_MASK
#undef ABEF_SAVE
#undef CDGH_SAVE
// parameters for sha256_ni_finup2x()
#define CTX %rdi
#define DATA1 %rsi
#define DATA2 %rdx
#define LEN %ecx
#define LEN8 %cl
#define LEN64 %rcx
#define OUT1 %r8
#define OUT2 %r9
// other scalar variables
#define SHA256CONSTANTS %rax
#define COUNT %r10
#define COUNT32 %r10d
#define FINAL_STEP %r11d
// rbx is used as a temporary.
#define MSG %xmm0 // sha256rnds2 implicit operand
#define STATE0_A %xmm1
#define STATE1_A %xmm2
#define STATE0_B %xmm3
#define STATE1_B %xmm4
#define TMP_A %xmm5
#define TMP_B %xmm6
#define MSG0_A %xmm7
#define MSG1_A %xmm8
#define MSG2_A %xmm9
#define MSG3_A %xmm10
#define MSG0_B %xmm11
#define MSG1_B %xmm12
#define MSG2_B %xmm13
#define MSG3_B %xmm14
#define SHUF_MASK %xmm15
#define OFFSETOF_STATE 0 // offsetof(struct __sha256_ctx, state)
#define OFFSETOF_BYTECOUNT 32 // offsetof(struct __sha256_ctx, bytecount)
#define OFFSETOF_BUF 40 // offsetof(struct __sha256_ctx, buf)
// Do 4 rounds of SHA-256 for each of two messages (interleaved). m0_a and m0_b
// contain the current 4 message schedule words for the first and second message
// respectively.
//
// If not all the message schedule words have been computed yet, then this also
// computes 4 more message schedule words for each message. m1_a-m3_a contain
// the next 3 groups of 4 message schedule words for the first message, and
// likewise m1_b-m3_b for the second. After consuming the current value of
// m0_a, this macro computes the group after m3_a and writes it to m0_a, and
// likewise for *_b. This means that the next (m0_a, m1_a, m2_a, m3_a) is the
// current (m1_a, m2_a, m3_a, m0_a), and likewise for *_b, so the caller must
// cycle through the registers accordingly.
.macro do_4rounds_2x i, m0_a, m1_a, m2_a, m3_a, m0_b, m1_b, m2_b, m3_b
movdqa (\i-32)*4(SHA256CONSTANTS), TMP_A
movdqa TMP_A, TMP_B
paddd \m0_a, TMP_A
paddd \m0_b, TMP_B
.if \i < 48
sha256msg1 \m1_a, \m0_a
sha256msg1 \m1_b, \m0_b
.endif
movdqa TMP_A, MSG
sha256rnds2 STATE0_A, STATE1_A
movdqa TMP_B, MSG
sha256rnds2 STATE0_B, STATE1_B
pshufd $0x0E, TMP_A, MSG
sha256rnds2 STATE1_A, STATE0_A
pshufd $0x0E, TMP_B, MSG
sha256rnds2 STATE1_B, STATE0_B
.if \i < 48
movdqa \m3_a, TMP_A
movdqa \m3_b, TMP_B
palignr $4, \m2_a, TMP_A
palignr $4, \m2_b, TMP_B
paddd TMP_A, \m0_a
paddd TMP_B, \m0_b
sha256msg2 \m3_a, \m0_a
sha256msg2 \m3_b, \m0_b
.endif
.endm
//
// void sha256_ni_finup2x(const struct __sha256_ctx *ctx,
// const u8 *data1, const u8 *data2, int len,
// u8 out1[SHA256_DIGEST_SIZE],
// u8 out2[SHA256_DIGEST_SIZE]);
//
// This function computes the SHA-256 digests of two messages |data1| and
// |data2| that are both |len| bytes long, starting from the initial context
// |ctx|. |len| must be at least SHA256_BLOCK_SIZE.
//
// The instructions for the two SHA-256 operations are interleaved. On many
// CPUs, this is almost twice as fast as hashing each message individually due
// to taking better advantage of the CPU's SHA-256 and SIMD throughput.
//
SYM_FUNC_START(sha256_ni_finup2x)
// Allocate 128 bytes of stack space, 16-byte aligned.
push %rbx
push %rbp
mov %rsp, %rbp
sub $128, %rsp
and $~15, %rsp
// Load the shuffle mask for swapping the endianness of 32-bit words.
movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK
// Set up pointer to the round constants.
lea K256+32*4(%rip), SHA256CONSTANTS
// Initially we're not processing the final blocks.
xor FINAL_STEP, FINAL_STEP
// Load the initial state from ctx->state.
movdqu OFFSETOF_STATE+0*16(CTX), STATE0_A // DCBA
movdqu OFFSETOF_STATE+1*16(CTX), STATE1_A // HGFE
movdqa STATE0_A, TMP_A
punpcklqdq STATE1_A, STATE0_A // FEBA
punpckhqdq TMP_A, STATE1_A // DCHG
pshufd $0x1B, STATE0_A, STATE0_A // ABEF
pshufd $0xB1, STATE1_A, STATE1_A // CDGH
// Load ctx->bytecount. Take the mod 64 of it to get the number of
// bytes that are buffered in ctx->buf. Also save it in a register with
// LEN added to it.
mov LEN, LEN
mov OFFSETOF_BYTECOUNT(CTX), %rbx
lea (%rbx, LEN64, 1), COUNT
and $63, %ebx
jz .Lfinup2x_enter_loop // No bytes buffered?
// %ebx bytes (1 to 63) are currently buffered in ctx->buf. Load them
// followed by the first 64 - %ebx bytes of data. Since LEN >= 64, we
// just load 64 bytes from each of ctx->buf, DATA1, and DATA2
// unconditionally and rearrange the data as needed.
movdqu OFFSETOF_BUF+0*16(CTX), MSG0_A
movdqu OFFSETOF_BUF+1*16(CTX), MSG1_A
movdqu OFFSETOF_BUF+2*16(CTX), MSG2_A
movdqu OFFSETOF_BUF+3*16(CTX), MSG3_A
movdqa MSG0_A, 0*16(%rsp)
movdqa MSG1_A, 1*16(%rsp)
movdqa MSG2_A, 2*16(%rsp)
movdqa MSG3_A, 3*16(%rsp)
movdqu 0*16(DATA1), MSG0_A
movdqu 1*16(DATA1), MSG1_A
movdqu 2*16(DATA1), MSG2_A
movdqu 3*16(DATA1), MSG3_A
movdqu MSG0_A, 0*16(%rsp,%rbx)
movdqu MSG1_A, 1*16(%rsp,%rbx)
movdqu MSG2_A, 2*16(%rsp,%rbx)
movdqu MSG3_A, 3*16(%rsp,%rbx)
movdqa 0*16(%rsp), MSG0_A
movdqa 1*16(%rsp), MSG1_A
movdqa 2*16(%rsp), MSG2_A
movdqa 3*16(%rsp), MSG3_A
movdqu 0*16(DATA2), MSG0_B
movdqu 1*16(DATA2), MSG1_B
movdqu 2*16(DATA2), MSG2_B
movdqu 3*16(DATA2), MSG3_B
movdqu MSG0_B, 0*16(%rsp,%rbx)
movdqu MSG1_B, 1*16(%rsp,%rbx)
movdqu MSG2_B, 2*16(%rsp,%rbx)
movdqu MSG3_B, 3*16(%rsp,%rbx)
movdqa 0*16(%rsp), MSG0_B
movdqa 1*16(%rsp), MSG1_B
movdqa 2*16(%rsp), MSG2_B
movdqa 3*16(%rsp), MSG3_B
sub $64, %rbx // rbx = buffered - 64
sub %rbx, DATA1 // DATA1 += 64 - buffered
sub %rbx, DATA2 // DATA2 += 64 - buffered
add %ebx, LEN // LEN += buffered - 64
movdqa STATE0_A, STATE0_B
movdqa STATE1_A, STATE1_B
jmp .Lfinup2x_loop_have_data
.Lfinup2x_enter_loop:
sub $64, LEN
movdqa STATE0_A, STATE0_B
movdqa STATE1_A, STATE1_B
.Lfinup2x_loop:
// Load the next two data blocks.
movdqu 0*16(DATA1), MSG0_A
movdqu 0*16(DATA2), MSG0_B
movdqu 1*16(DATA1), MSG1_A
movdqu 1*16(DATA2), MSG1_B
movdqu 2*16(DATA1), MSG2_A
movdqu 2*16(DATA2), MSG2_B
movdqu 3*16(DATA1), MSG3_A
movdqu 3*16(DATA2), MSG3_B
add $64, DATA1
add $64, DATA2
.Lfinup2x_loop_have_data:
// Convert the words of the data blocks from big endian.
pshufb SHUF_MASK, MSG0_A
pshufb SHUF_MASK, MSG0_B
pshufb SHUF_MASK, MSG1_A
pshufb SHUF_MASK, MSG1_B
pshufb SHUF_MASK, MSG2_A
pshufb SHUF_MASK, MSG2_B
pshufb SHUF_MASK, MSG3_A
pshufb SHUF_MASK, MSG3_B
.Lfinup2x_loop_have_bswapped_data:
// Save the original state for each block.
movdqa STATE0_A, 0*16(%rsp)
movdqa STATE0_B, 1*16(%rsp)
movdqa STATE1_A, 2*16(%rsp)
movdqa STATE1_B, 3*16(%rsp)
// Do the SHA-256 rounds on each block.
.irp i, 0, 16, 32, 48
do_4rounds_2x (\i + 0), MSG0_A, MSG1_A, MSG2_A, MSG3_A, \
MSG0_B, MSG1_B, MSG2_B, MSG3_B
do_4rounds_2x (\i + 4), MSG1_A, MSG2_A, MSG3_A, MSG0_A, \
MSG1_B, MSG2_B, MSG3_B, MSG0_B
do_4rounds_2x (\i + 8), MSG2_A, MSG3_A, MSG0_A, MSG1_A, \
MSG2_B, MSG3_B, MSG0_B, MSG1_B
do_4rounds_2x (\i + 12), MSG3_A, MSG0_A, MSG1_A, MSG2_A, \
MSG3_B, MSG0_B, MSG1_B, MSG2_B
.endr
// Add the original state for each block.
paddd 0*16(%rsp), STATE0_A
paddd 1*16(%rsp), STATE0_B
paddd 2*16(%rsp), STATE1_A
paddd 3*16(%rsp), STATE1_B
// Update LEN and loop back if more blocks remain.
sub $64, LEN
jge .Lfinup2x_loop
// Check if any final blocks need to be handled.
// FINAL_STEP = 2: all done
// FINAL_STEP = 1: need to do count-only padding block
// FINAL_STEP = 0: need to do the block with 0x80 padding byte
cmp $1, FINAL_STEP
jg .Lfinup2x_done
je .Lfinup2x_finalize_countonly
add $64, LEN
jz .Lfinup2x_finalize_blockaligned
// Not block-aligned; 1 <= LEN <= 63 data bytes remain. Pad the block.
// To do this, write the padding starting with the 0x80 byte to
// &sp[64]. Then for each message, copy the last 64 data bytes to sp
// and load from &sp[64 - LEN] to get the needed padding block. This
// code relies on the data buffers being >= 64 bytes in length.
mov $64, %ebx
sub LEN, %ebx // ebx = 64 - LEN
sub %rbx, DATA1 // DATA1 -= 64 - LEN
sub %rbx, DATA2 // DATA2 -= 64 - LEN
mov $0x80, FINAL_STEP // using FINAL_STEP as a temporary
movd FINAL_STEP, MSG0_A
pxor MSG1_A, MSG1_A
movdqa MSG0_A, 4*16(%rsp)
movdqa MSG1_A, 5*16(%rsp)
movdqa MSG1_A, 6*16(%rsp)
movdqa MSG1_A, 7*16(%rsp)
cmp $56, LEN
jge 1f // will COUNT spill into its own block?
shl $3, COUNT
bswap COUNT
mov COUNT, 56(%rsp,%rbx)
mov $2, FINAL_STEP // won't need count-only block
jmp 2f
1:
mov $1, FINAL_STEP // will need count-only block
2:
movdqu 0*16(DATA1), MSG0_A
movdqu 1*16(DATA1), MSG1_A
movdqu 2*16(DATA1), MSG2_A
movdqu 3*16(DATA1), MSG3_A
movdqa MSG0_A, 0*16(%rsp)
movdqa MSG1_A, 1*16(%rsp)
movdqa MSG2_A, 2*16(%rsp)
movdqa MSG3_A, 3*16(%rsp)
movdqu 0*16(%rsp,%rbx), MSG0_A
movdqu 1*16(%rsp,%rbx), MSG1_A
movdqu 2*16(%rsp,%rbx), MSG2_A
movdqu 3*16(%rsp,%rbx), MSG3_A
movdqu 0*16(DATA2), MSG0_B
movdqu 1*16(DATA2), MSG1_B
movdqu 2*16(DATA2), MSG2_B
movdqu 3*16(DATA2), MSG3_B
movdqa MSG0_B, 0*16(%rsp)
movdqa MSG1_B, 1*16(%rsp)
movdqa MSG2_B, 2*16(%rsp)
movdqa MSG3_B, 3*16(%rsp)
movdqu 0*16(%rsp,%rbx), MSG0_B
movdqu 1*16(%rsp,%rbx), MSG1_B
movdqu 2*16(%rsp,%rbx), MSG2_B
movdqu 3*16(%rsp,%rbx), MSG3_B
jmp .Lfinup2x_loop_have_data
// Prepare a padding block, either:
//
// {0x80, 0, 0, 0, ..., count (as __be64)}
// This is for a block aligned message.
//
// { 0, 0, 0, 0, ..., count (as __be64)}
// This is for a message whose length mod 64 is >= 56.
//
// Pre-swap the endianness of the words.
.Lfinup2x_finalize_countonly:
pxor MSG0_A, MSG0_A
jmp 1f
.Lfinup2x_finalize_blockaligned:
mov $0x80000000, %ebx
movd %ebx, MSG0_A
1:
pxor MSG1_A, MSG1_A
pxor MSG2_A, MSG2_A
ror $29, COUNT
movq COUNT, MSG3_A
pslldq $8, MSG3_A
movdqa MSG0_A, MSG0_B
pxor MSG1_B, MSG1_B
pxor MSG2_B, MSG2_B
movdqa MSG3_A, MSG3_B
mov $2, FINAL_STEP
jmp .Lfinup2x_loop_have_bswapped_data
.Lfinup2x_done:
// Write the two digests with all bytes in the correct order.
movdqa STATE0_A, TMP_A
movdqa STATE0_B, TMP_B
punpcklqdq STATE1_A, STATE0_A // GHEF
punpcklqdq STATE1_B, STATE0_B
punpckhqdq TMP_A, STATE1_A // ABCD
punpckhqdq TMP_B, STATE1_B
pshufd $0xB1, STATE0_A, STATE0_A // HGFE
pshufd $0xB1, STATE0_B, STATE0_B
pshufd $0x1B, STATE1_A, STATE1_A // DCBA
pshufd $0x1B, STATE1_B, STATE1_B
pshufb SHUF_MASK, STATE0_A
pshufb SHUF_MASK, STATE0_B
pshufb SHUF_MASK, STATE1_A
pshufb SHUF_MASK, STATE1_B
movdqu STATE0_A, 1*16(OUT1)
movdqu STATE0_B, 1*16(OUT2)
movdqu STATE1_A, 0*16(OUT1)
movdqu STATE1_B, 0*16(OUT2)
mov %rbp, %rsp
pop %rbp
pop %rbx
RET
SYM_FUNC_END(sha256_ni_finup2x)
.section .rodata.cst256.K256, "aM", @progbits, 256
.align 64
K256:

View File

@@ -7,6 +7,8 @@
#include <asm/fpu/api.h>
#include <linux/static_call.h>
static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha_ni);
DEFINE_STATIC_CALL(sha256_blocks_x86, sha256_blocks_generic);
#define DEFINE_X86_SHA256_FN(c_fn, asm_fn) \
@@ -35,11 +37,48 @@ static void sha256_blocks(struct sha256_block_state *state,
static_call(sha256_blocks_x86)(state, data, nblocks);
}
static_assert(offsetof(struct __sha256_ctx, state) == 0);
static_assert(offsetof(struct __sha256_ctx, bytecount) == 32);
static_assert(offsetof(struct __sha256_ctx, buf) == 40);
asmlinkage void sha256_ni_finup2x(const struct __sha256_ctx *ctx,
const u8 *data1, const u8 *data2, int len,
u8 out1[SHA256_DIGEST_SIZE],
u8 out2[SHA256_DIGEST_SIZE]);
#define sha256_finup_2x_arch sha256_finup_2x_arch
static bool sha256_finup_2x_arch(const struct __sha256_ctx *ctx,
const u8 *data1, const u8 *data2, size_t len,
u8 out1[SHA256_DIGEST_SIZE],
u8 out2[SHA256_DIGEST_SIZE])
{
/*
* The assembly requires len >= SHA256_BLOCK_SIZE && len <= INT_MAX.
* Further limit len to 65536 to avoid spending too long with preemption
* disabled. (Of course, in practice len is nearly always 4096 anyway.)
*/
if (static_branch_likely(&have_sha_ni) && len >= SHA256_BLOCK_SIZE &&
len <= 65536 && likely(irq_fpu_usable())) {
kernel_fpu_begin();
sha256_ni_finup2x(ctx, data1, data2, len, out1, out2);
kernel_fpu_end();
kmsan_unpoison_memory(out1, SHA256_DIGEST_SIZE);
kmsan_unpoison_memory(out2, SHA256_DIGEST_SIZE);
return true;
}
return false;
}
static bool sha256_finup_2x_is_optimized_arch(void)
{
return static_key_enabled(&have_sha_ni);
}
#define sha256_mod_init_arch sha256_mod_init_arch
static void sha256_mod_init_arch(void)
{
if (boot_cpu_has(X86_FEATURE_SHA_NI)) {
static_call_update(sha256_blocks_x86, sha256_blocks_ni);
static_branch_enable(&have_sha_ni);
} else if (cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM,
NULL) &&
boot_cpu_has(X86_FEATURE_AVX)) {