Merge tag 'fsverity-for-linus' of git://git.kernel.org/pub/scm/fs/fsverity/linux

Pull interleaved SHA-256 hashing support from Eric Biggers:
 "Optimize fsverity with 2-way interleaved hashing

  Add support for 2-way interleaved SHA-256 hashing to lib/crypto/, and
  make fsverity use it for faster file data verification. This improves
  fsverity performance on many x86_64 and arm64 processors.

  Later, I plan to make dm-verity use this too"

* tag 'fsverity-for-linus' of git://git.kernel.org/pub/scm/fs/fsverity/linux:
  fsverity: Use 2-way interleaved SHA-256 hashing when supported
  fsverity: Remove inode parameter from fsverity_hash_block()
  lib/crypto: tests: Add tests and benchmark for sha256_finup_2x()
  lib/crypto: x86/sha256: Add support for 2-way interleaved hashing
  lib/crypto: arm64/sha256: Add support for 2-way interleaved hashing
  lib/crypto: sha256: Add support for 2-way interleaved hashing
Linus Torvalds 2025-09-29 15:55:20 -07:00
commit 1896ce8eb6
11 changed files with 1147 additions and 56 deletions
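
As a quick illustration of the new API (a sketch only: the hash_pair()
helper and its parameters are made up for this note, while the sha256_*
functions and types are the ones added or already declared in
<crypto/sha2.h>), a caller with two equal-length buffers that share a
hashed prefix such as a salt can obtain both digests in one call:

#include <crypto/sha2.h>

/*
 * Hash two equal-length buffers after a common salted prefix.
 * sha256_finup_2x() transparently falls back to hashing the two messages
 * sequentially when no 2-way interleaved implementation is available, so
 * sha256_finup_2x_is_optimized() only matters for deciding whether
 * batching blocks in pairs is worthwhile, as the fsverity change does.
 */
static void hash_pair(const u8 *salt, size_t salt_len,
		      const u8 *data1, const u8 *data2, size_t len,
		      u8 out1[SHA256_DIGEST_SIZE], u8 out2[SHA256_DIGEST_SIZE])
{
	struct sha256_ctx ctx;

	sha256_init(&ctx);
	sha256_update(&ctx, salt, salt_len);
	sha256_finup_2x(&ctx, data1, data2, len, out1, out2);
}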

View File

@@ -19,8 +19,7 @@ struct block_buffer {
};
/* Hash a block, writing the result to the next level's pending block buffer. */
static int hash_one_block(struct inode *inode,
const struct merkle_tree_params *params,
static int hash_one_block(const struct merkle_tree_params *params,
struct block_buffer *cur)
{
struct block_buffer *next = cur + 1;
@@ -36,8 +35,7 @@ static int hash_one_block(struct inode *inode,
/* Zero-pad the block if it's shorter than the block size. */
memset(&cur->data[cur->filled], 0, params->block_size - cur->filled);
fsverity_hash_block(params, inode, cur->data,
&next->data[next->filled]);
fsverity_hash_block(params, cur->data, &next->data[next->filled]);
next->filled += params->digest_size;
cur->filled = 0;
return 0;
@@ -123,7 +121,7 @@ static int build_merkle_tree(struct file *filp,
fsverity_err(inode, "Short read of file data");
goto out;
}
err = hash_one_block(inode, params, &buffers[-1]);
err = hash_one_block(params, &buffers[-1]);
if (err)
goto out;
for (level = 0; level < num_levels; level++) {
@@ -134,7 +132,7 @@ static int build_merkle_tree(struct file *filp,
}
/* Next block at @level is full */
err = hash_one_block(inode, params, &buffers[level]);
err = hash_one_block(params, &buffers[level]);
if (err)
goto out;
err = write_merkle_tree_block(inode,
@@ -154,7 +152,7 @@ static int build_merkle_tree(struct file *filp,
/* Finish all nonempty pending tree blocks. */
for (level = 0; level < num_levels; level++) {
if (buffers[level].filled != 0) {
err = hash_one_block(inode, params, &buffers[level]);
err = hash_one_block(params, &buffers[level]);
if (err)
goto out;
err = write_merkle_tree_block(inode,

View File

@@ -90,7 +90,7 @@ union fsverity_hash_ctx *
fsverity_prepare_hash_state(const struct fsverity_hash_alg *alg,
const u8 *salt, size_t salt_size);
void fsverity_hash_block(const struct merkle_tree_params *params,
const struct inode *inode, const void *data, u8 *out);
const void *data, u8 *out);
void fsverity_hash_buffer(const struct fsverity_hash_alg *alg,
const void *data, size_t size, u8 *out);
void __init fsverity_check_hash_algs(void);

View File

@@ -94,7 +94,6 @@ fsverity_prepare_hash_state(const struct fsverity_hash_alg *alg,
/**
* fsverity_hash_block() - hash a single data or hash block
* @params: the Merkle tree's parameters
* @inode: inode for which the hashing is being done
* @data: virtual address of a buffer containing the block to hash
* @out: output digest, size 'params->digest_size' bytes
*
@@ -102,7 +101,7 @@ fsverity_prepare_hash_state(const struct fsverity_hash_alg *alg,
* in the Merkle tree parameters.
*/
void fsverity_hash_block(const struct merkle_tree_params *params,
const struct inode *inode, const void *data, u8 *out)
const void *data, u8 *out)
{
union fsverity_hash_ctx ctx;

View File

@@ -10,6 +10,31 @@
#include <linux/bio.h>
#include <linux/export.h>
#define FS_VERITY_MAX_PENDING_BLOCKS 2
struct fsverity_pending_block {
const void *data;
u64 pos;
u8 real_hash[FS_VERITY_MAX_DIGEST_SIZE];
};
struct fsverity_verification_context {
struct inode *inode;
struct fsverity_info *vi;
unsigned long max_ra_pages;
/*
* This is the queue of data blocks that are pending verification. When
* the crypto layer supports interleaved hashing, we allow multiple
* blocks to be queued up in order to utilize it. This can improve
* performance significantly vs. sequential hashing of each block.
*/
int num_pending;
int max_pending;
struct fsverity_pending_block
pending_blocks[FS_VERITY_MAX_PENDING_BLOCKS];
};
static struct workqueue_struct *fsverity_read_workqueue;
/*
@@ -79,7 +104,7 @@ static bool is_hash_block_verified(struct fsverity_info *vi, struct page *hpage,
}
/*
* Verify a single data block against the file's Merkle tree.
* Verify the hash of a single data block against the file's Merkle tree.
*
* In principle, we need to verify the entire path to the root node. However,
* for efficiency the filesystem may cache the hash blocks. Therefore we need
@@ -88,10 +113,11 @@ static bool is_hash_block_verified(struct fsverity_info *vi, struct page *hpage,
*
* Return: %true if the data block is valid, else %false.
*/
static bool
verify_data_block(struct inode *inode, struct fsverity_info *vi,
const void *data, u64 data_pos, unsigned long max_ra_pages)
static bool verify_data_block(struct inode *inode, struct fsverity_info *vi,
const struct fsverity_pending_block *dblock,
unsigned long max_ra_pages)
{
const u64 data_pos = dblock->pos;
const struct merkle_tree_params *params = &vi->tree_params;
const unsigned int hsize = params->digest_size;
int level;
@@ -115,8 +141,12 @@ verify_data_block(struct inode *inode, struct fsverity_info *vi,
*/
u64 hidx = data_pos >> params->log_blocksize;
/* Up to 1 + FS_VERITY_MAX_LEVELS pages may be mapped at once */
BUILD_BUG_ON(1 + FS_VERITY_MAX_LEVELS > KM_MAX_IDX);
/*
* Up to FS_VERITY_MAX_PENDING_BLOCKS + FS_VERITY_MAX_LEVELS pages may
* be mapped at once.
*/
static_assert(FS_VERITY_MAX_PENDING_BLOCKS + FS_VERITY_MAX_LEVELS <=
KM_MAX_IDX);
if (unlikely(data_pos >= inode->i_size)) {
/*
@@ -127,7 +157,7 @@ verify_data_block(struct inode *inode, struct fsverity_info *vi,
* any part past EOF should be all zeroes. Therefore, we need
* to verify that any data blocks fully past EOF are all zeroes.
*/
if (memchr_inv(data, 0, params->block_size)) {
if (memchr_inv(dblock->data, 0, params->block_size)) {
fsverity_err(inode,
"FILE CORRUPTED! Data past EOF is not zeroed");
return false;
@@ -202,7 +232,7 @@ descend:
unsigned long hblock_idx = hblocks[level - 1].index;
unsigned int hoffset = hblocks[level - 1].hoffset;
fsverity_hash_block(params, inode, haddr, real_hash);
fsverity_hash_block(params, haddr, real_hash);
if (memcmp(want_hash, real_hash, hsize) != 0)
goto corrupted;
/*
@@ -220,18 +250,18 @@ descend:
put_page(hpage);
}
/* Finally, verify the data block. */
fsverity_hash_block(params, inode, data, real_hash);
if (memcmp(want_hash, real_hash, hsize) != 0)
/* Finally, verify the hash of the data block. */
if (memcmp(want_hash, dblock->real_hash, hsize) != 0)
goto corrupted;
return true;
corrupted:
fsverity_err(inode,
"FILE CORRUPTED! pos=%llu, level=%d, want_hash=%s:%*phN, real_hash=%s:%*phN",
data_pos, level - 1,
params->hash_alg->name, hsize, want_hash,
params->hash_alg->name, hsize, real_hash);
fsverity_err(
inode,
"FILE CORRUPTED! pos=%llu, level=%d, want_hash=%s:%*phN, real_hash=%s:%*phN",
data_pos, level - 1, params->hash_alg->name, hsize, want_hash,
params->hash_alg->name, hsize,
level == 0 ? dblock->real_hash : real_hash);
error:
for (; level > 0; level--) {
kunmap_local(hblocks[level - 1].addr);
@@ -240,13 +270,73 @@ error:
return false;
}
static bool
verify_data_blocks(struct folio *data_folio, size_t len, size_t offset,
unsigned long max_ra_pages)
static void
fsverity_init_verification_context(struct fsverity_verification_context *ctx,
struct inode *inode,
unsigned long max_ra_pages)
{
struct inode *inode = data_folio->mapping->host;
struct fsverity_info *vi = *fsverity_info_addr(inode);
const unsigned int block_size = vi->tree_params.block_size;
ctx->inode = inode;
ctx->vi = vi;
ctx->max_ra_pages = max_ra_pages;
ctx->num_pending = 0;
if (vi->tree_params.hash_alg->algo_id == HASH_ALGO_SHA256 &&
sha256_finup_2x_is_optimized())
ctx->max_pending = 2;
else
ctx->max_pending = 1;
}
static void
fsverity_clear_pending_blocks(struct fsverity_verification_context *ctx)
{
int i;
for (i = ctx->num_pending - 1; i >= 0; i--) {
kunmap_local(ctx->pending_blocks[i].data);
ctx->pending_blocks[i].data = NULL;
}
ctx->num_pending = 0;
}
static bool
fsverity_verify_pending_blocks(struct fsverity_verification_context *ctx)
{
struct fsverity_info *vi = ctx->vi;
const struct merkle_tree_params *params = &vi->tree_params;
int i;
if (ctx->num_pending == 2) {
/* num_pending == 2 implies that the algorithm is SHA-256 */
sha256_finup_2x(params->hashstate ? &params->hashstate->sha256 :
NULL,
ctx->pending_blocks[0].data,
ctx->pending_blocks[1].data, params->block_size,
ctx->pending_blocks[0].real_hash,
ctx->pending_blocks[1].real_hash);
} else {
for (i = 0; i < ctx->num_pending; i++)
fsverity_hash_block(params, ctx->pending_blocks[i].data,
ctx->pending_blocks[i].real_hash);
}
for (i = 0; i < ctx->num_pending; i++) {
if (!verify_data_block(ctx->inode, vi, &ctx->pending_blocks[i],
ctx->max_ra_pages))
return false;
}
fsverity_clear_pending_blocks(ctx);
return true;
}
static bool fsverity_add_data_blocks(struct fsverity_verification_context *ctx,
struct folio *data_folio, size_t len,
size_t offset)
{
struct fsverity_info *vi = ctx->vi;
const struct merkle_tree_params *params = &vi->tree_params;
const unsigned int block_size = params->block_size;
u64 pos = (u64)data_folio->index << PAGE_SHIFT;
if (WARN_ON_ONCE(len <= 0 || !IS_ALIGNED(len | offset, block_size)))
@@ -255,14 +345,11 @@ verify_data_blocks(struct folio *data_folio, size_t len, size_t offset,
folio_test_uptodate(data_folio)))
return false;
do {
void *data;
bool valid;
data = kmap_local_folio(data_folio, offset);
valid = verify_data_block(inode, vi, data, pos + offset,
max_ra_pages);
kunmap_local(data);
if (!valid)
ctx->pending_blocks[ctx->num_pending].data =
kmap_local_folio(data_folio, offset);
ctx->pending_blocks[ctx->num_pending].pos = pos + offset;
if (++ctx->num_pending == ctx->max_pending &&
!fsverity_verify_pending_blocks(ctx))
return false;
offset += block_size;
len -= block_size;
@@ -284,7 +371,15 @@ verify_data_blocks(struct folio *data_folio, size_t len, size_t offset,
*/
bool fsverity_verify_blocks(struct folio *folio, size_t len, size_t offset)
{
return verify_data_blocks(folio, len, offset, 0);
struct fsverity_verification_context ctx;
fsverity_init_verification_context(&ctx, folio->mapping->host, 0);
if (fsverity_add_data_blocks(&ctx, folio, len, offset) &&
fsverity_verify_pending_blocks(&ctx))
return true;
fsverity_clear_pending_blocks(&ctx);
return false;
}
EXPORT_SYMBOL_GPL(fsverity_verify_blocks);
@@ -305,6 +400,8 @@ EXPORT_SYMBOL_GPL(fsverity_verify_blocks);
*/
void fsverity_verify_bio(struct bio *bio)
{
struct inode *inode = bio_first_folio_all(bio)->mapping->host;
struct fsverity_verification_context ctx;
struct folio_iter fi;
unsigned long max_ra_pages = 0;
@@ -321,13 +418,21 @@ void fsverity_verify_bio(struct bio *bio)
max_ra_pages = bio->bi_iter.bi_size >> (PAGE_SHIFT + 2);
}
fsverity_init_verification_context(&ctx, inode, max_ra_pages);
bio_for_each_folio_all(fi, bio) {
if (!verify_data_blocks(fi.folio, fi.length, fi.offset,
max_ra_pages)) {
bio->bi_status = BLK_STS_IOERR;
break;
}
if (!fsverity_add_data_blocks(&ctx, fi.folio, fi.length,
fi.offset))
goto ioerr;
}
if (!fsverity_verify_pending_blocks(&ctx))
goto ioerr;
return;
ioerr:
fsverity_clear_pending_blocks(&ctx);
bio->bi_status = BLK_STS_IOERR;
}
EXPORT_SYMBOL_GPL(fsverity_verify_bio);
#endif /* CONFIG_BLOCK */

View File

@@ -375,6 +375,34 @@ void sha256_final(struct sha256_ctx *ctx, u8 out[SHA256_DIGEST_SIZE]);
*/
void sha256(const u8 *data, size_t len, u8 out[SHA256_DIGEST_SIZE]);
/**
* sha256_finup_2x() - Compute two SHA-256 digests from a common initial
* context. On some CPUs, this is faster than sequentially
* computing each digest.
* @ctx: an optional initial context, which may have already processed data. If
* NULL, a default initial context is used (equivalent to sha256_init()).
* @data1: data for the first message
* @data2: data for the second message
* @len: the length of each of @data1 and @data2, in bytes
* @out1: (output) the first SHA-256 message digest
* @out2: (output) the second SHA-256 message digest
*
* Context: Any context.
*/
void sha256_finup_2x(const struct sha256_ctx *ctx, const u8 *data1,
const u8 *data2, size_t len, u8 out1[SHA256_DIGEST_SIZE],
u8 out2[SHA256_DIGEST_SIZE]);
/**
* sha256_finup_2x_is_optimized() - Check if sha256_finup_2x() is using a real
* interleaved implementation, as opposed to a
* sequential fallback
* @return: true if optimized
*
* Context: Any context.
*/
bool sha256_finup_2x_is_optimized(void);
/**
* struct hmac_sha256_key - Prepared key for HMAC-SHA256
* @key: private
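
A small consistency check of the contract documented above (a sketch, not
part of this merge; the helper name is made up): with ctx == NULL,
sha256_finup_2x() must produce the same digests as two independent
one-shot sha256() calls.

#include <crypto/sha2.h>
#include <linux/string.h>

static bool finup_2x_matches_one_shot(const u8 *a, const u8 *b, size_t len)
{
	u8 d1[SHA256_DIGEST_SIZE], d2[SHA256_DIGEST_SIZE];
	u8 ref1[SHA256_DIGEST_SIZE], ref2[SHA256_DIGEST_SIZE];

	/* NULL ctx means the default initial context (as after sha256_init()). */
	sha256_finup_2x(NULL, a, b, len, d1, d2);
	sha256(a, len, ref1);
	sha256(b, len, ref2);
	return memcmp(d1, ref1, sizeof(d1)) == 0 &&
	       memcmp(d2, ref2, sizeof(d2)) == 0;
}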

View File

@@ -70,18 +70,22 @@
.word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
.word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
.macro load_round_constants tmp
adr_l \tmp, .Lsha2_rcon
ld1 { v0.4s- v3.4s}, [\tmp], #64
ld1 { v4.4s- v7.4s}, [\tmp], #64
ld1 { v8.4s-v11.4s}, [\tmp], #64
ld1 {v12.4s-v15.4s}, [\tmp]
.endm
/*
* size_t __sha256_ce_transform(struct sha256_block_state *state,
* const u8 *data, size_t nblocks);
*/
.text
SYM_FUNC_START(__sha256_ce_transform)
/* load round constants */
adr_l x8, .Lsha2_rcon
ld1 { v0.4s- v3.4s}, [x8], #64
ld1 { v4.4s- v7.4s}, [x8], #64
ld1 { v8.4s-v11.4s}, [x8], #64
ld1 {v12.4s-v15.4s}, [x8]
load_round_constants x8
/* load state */
ld1 {dgav.4s, dgbv.4s}, [x0]
@@ -134,3 +138,271 @@ CPU_LE( rev32 v19.16b, v19.16b )
mov x0, x2
ret
SYM_FUNC_END(__sha256_ce_transform)
.unreq dga
.unreq dgav
.unreq dgb
.unreq dgbv
.unreq t0
.unreq t1
.unreq dg0q
.unreq dg0v
.unreq dg1q
.unreq dg1v
.unreq dg2q
.unreq dg2v
// parameters for sha256_ce_finup2x()
ctx .req x0
data1 .req x1
data2 .req x2
len .req w3
out1 .req x4
out2 .req x5
// other scalar variables
count .req x6
final_step .req w7
// x8-x9 are used as temporaries.
// v0-v15 are used to cache the SHA-256 round constants.
// v16-v19 are used for the message schedule for the first message.
// v20-v23 are used for the message schedule for the second message.
// v24-v31 are used for the state and temporaries as given below.
// *_a are for the first message and *_b for the second.
state0_a_q .req q24
state0_a .req v24
state1_a_q .req q25
state1_a .req v25
state0_b_q .req q26
state0_b .req v26
state1_b_q .req q27
state1_b .req v27
t0_a .req v28
t0_b .req v29
t1_a_q .req q30
t1_a .req v30
t1_b_q .req q31
t1_b .req v31
#define OFFSETOF_BYTECOUNT 32 // offsetof(struct __sha256_ctx, bytecount)
#define OFFSETOF_BUF 40 // offsetof(struct __sha256_ctx, buf)
// offsetof(struct __sha256_ctx, state) is assumed to be 0.
// Do 4 rounds of SHA-256 for each of two messages (interleaved). m0_a
// and m0_b contain the current 4 message schedule words for the first
// and second message respectively.
//
// If not all the message schedule words have been computed yet, then
// this also computes 4 more message schedule words for each message.
// m1_a-m3_a contain the next 3 groups of 4 message schedule words for
// the first message, and likewise m1_b-m3_b for the second. After
// consuming the current value of m0_a, this macro computes the group
// after m3_a and writes it to m0_a, and likewise for *_b. This means
// that the next (m0_a, m1_a, m2_a, m3_a) is the current (m1_a, m2_a,
// m3_a, m0_a), and likewise for *_b, so the caller must cycle through
// the registers accordingly.
.macro do_4rounds_2x i, k, m0_a, m1_a, m2_a, m3_a, \
m0_b, m1_b, m2_b, m3_b
add t0_a\().4s, \m0_a\().4s, \k\().4s
add t0_b\().4s, \m0_b\().4s, \k\().4s
.if \i < 48
sha256su0 \m0_a\().4s, \m1_a\().4s
sha256su0 \m0_b\().4s, \m1_b\().4s
sha256su1 \m0_a\().4s, \m2_a\().4s, \m3_a\().4s
sha256su1 \m0_b\().4s, \m2_b\().4s, \m3_b\().4s
.endif
mov t1_a.16b, state0_a.16b
mov t1_b.16b, state0_b.16b
sha256h state0_a_q, state1_a_q, t0_a\().4s
sha256h state0_b_q, state1_b_q, t0_b\().4s
sha256h2 state1_a_q, t1_a_q, t0_a\().4s
sha256h2 state1_b_q, t1_b_q, t0_b\().4s
.endm
.macro do_16rounds_2x i, k0, k1, k2, k3
do_4rounds_2x \i + 0, \k0, v16, v17, v18, v19, v20, v21, v22, v23
do_4rounds_2x \i + 4, \k1, v17, v18, v19, v16, v21, v22, v23, v20
do_4rounds_2x \i + 8, \k2, v18, v19, v16, v17, v22, v23, v20, v21
do_4rounds_2x \i + 12, \k3, v19, v16, v17, v18, v23, v20, v21, v22
.endm
//
// void sha256_ce_finup2x(const struct __sha256_ctx *ctx,
// const u8 *data1, const u8 *data2, int len,
// u8 out1[SHA256_DIGEST_SIZE],
// u8 out2[SHA256_DIGEST_SIZE]);
//
// This function computes the SHA-256 digests of two messages |data1| and
// |data2| that are both |len| bytes long, starting from the initial context
// |ctx|. |len| must be at least SHA256_BLOCK_SIZE.
//
// The instructions for the two SHA-256 operations are interleaved. On many
// CPUs, this is almost twice as fast as hashing each message individually due
// to taking better advantage of the CPU's SHA-256 and SIMD throughput.
//
SYM_FUNC_START(sha256_ce_finup2x)
sub sp, sp, #128
mov final_step, #0
load_round_constants x8
// Load the initial state from ctx->state.
ld1 {state0_a.4s-state1_a.4s}, [ctx]
// Load ctx->bytecount. Take the mod 64 of it to get the number of
// bytes that are buffered in ctx->buf. Also save it in a register with
// len added to it.
ldr x8, [ctx, #OFFSETOF_BYTECOUNT]
add count, x8, len, sxtw
and x8, x8, #63
cbz x8, .Lfinup2x_enter_loop // No bytes buffered?
// x8 bytes (1 to 63) are currently buffered in ctx->buf. Load them
// followed by the first 64 - x8 bytes of data. Since len >= 64, we
// just load 64 bytes from each of ctx->buf, data1, and data2
// unconditionally and rearrange the data as needed.
add x9, ctx, #OFFSETOF_BUF
ld1 {v16.16b-v19.16b}, [x9]
st1 {v16.16b-v19.16b}, [sp]
ld1 {v16.16b-v19.16b}, [data1], #64
add x9, sp, x8
st1 {v16.16b-v19.16b}, [x9]
ld1 {v16.4s-v19.4s}, [sp]
ld1 {v20.16b-v23.16b}, [data2], #64
st1 {v20.16b-v23.16b}, [x9]
ld1 {v20.4s-v23.4s}, [sp]
sub len, len, #64
sub data1, data1, x8
sub data2, data2, x8
add len, len, w8
mov state0_b.16b, state0_a.16b
mov state1_b.16b, state1_a.16b
b .Lfinup2x_loop_have_data
.Lfinup2x_enter_loop:
sub len, len, #64
mov state0_b.16b, state0_a.16b
mov state1_b.16b, state1_a.16b
.Lfinup2x_loop:
// Load the next two data blocks.
ld1 {v16.4s-v19.4s}, [data1], #64
ld1 {v20.4s-v23.4s}, [data2], #64
.Lfinup2x_loop_have_data:
// Convert the words of the data blocks from big endian.
CPU_LE( rev32 v16.16b, v16.16b )
CPU_LE( rev32 v17.16b, v17.16b )
CPU_LE( rev32 v18.16b, v18.16b )
CPU_LE( rev32 v19.16b, v19.16b )
CPU_LE( rev32 v20.16b, v20.16b )
CPU_LE( rev32 v21.16b, v21.16b )
CPU_LE( rev32 v22.16b, v22.16b )
CPU_LE( rev32 v23.16b, v23.16b )
.Lfinup2x_loop_have_bswapped_data:
// Save the original state for each block.
st1 {state0_a.4s-state1_b.4s}, [sp]
// Do the SHA-256 rounds on each block.
do_16rounds_2x 0, v0, v1, v2, v3
do_16rounds_2x 16, v4, v5, v6, v7
do_16rounds_2x 32, v8, v9, v10, v11
do_16rounds_2x 48, v12, v13, v14, v15
// Add the original state for each block.
ld1 {v16.4s-v19.4s}, [sp]
add state0_a.4s, state0_a.4s, v16.4s
add state1_a.4s, state1_a.4s, v17.4s
add state0_b.4s, state0_b.4s, v18.4s
add state1_b.4s, state1_b.4s, v19.4s
// Update len and loop back if more blocks remain.
sub len, len, #64
tbz len, #31, .Lfinup2x_loop // len >= 0?
// Check if any final blocks need to be handled.
// final_step = 2: all done
// final_step = 1: need to do count-only padding block
// final_step = 0: need to do the block with 0x80 padding byte
tbnz final_step, #1, .Lfinup2x_done
tbnz final_step, #0, .Lfinup2x_finalize_countonly
add len, len, #64
cbz len, .Lfinup2x_finalize_blockaligned
// Not block-aligned; 1 <= len <= 63 data bytes remain. Pad the block.
// To do this, write the padding starting with the 0x80 byte to
// &sp[64]. Then for each message, copy the last 64 data bytes to sp
// and load from &sp[64 - len] to get the needed padding block. This
// code relies on the data buffers being >= 64 bytes in length.
sub w8, len, #64 // w8 = len - 64
add data1, data1, w8, sxtw // data1 += len - 64
add data2, data2, w8, sxtw // data2 += len - 64
CPU_LE( mov x9, #0x80 )
CPU_LE( fmov d16, x9 )
CPU_BE( movi v16.16b, #0 )
CPU_BE( mov x9, #0x8000000000000000 )
CPU_BE( mov v16.d[1], x9 )
movi v17.16b, #0
stp q16, q17, [sp, #64]
stp q17, q17, [sp, #96]
sub x9, sp, w8, sxtw // x9 = &sp[64 - len]
cmp len, #56
b.ge 1f // will count spill into its own block?
lsl count, count, #3
CPU_LE( rev count, count )
str count, [x9, #56]
mov final_step, #2 // won't need count-only block
b 2f
1:
mov final_step, #1 // will need count-only block
2:
ld1 {v16.16b-v19.16b}, [data1]
st1 {v16.16b-v19.16b}, [sp]
ld1 {v16.4s-v19.4s}, [x9]
ld1 {v20.16b-v23.16b}, [data2]
st1 {v20.16b-v23.16b}, [sp]
ld1 {v20.4s-v23.4s}, [x9]
b .Lfinup2x_loop_have_data
// Prepare a padding block, either:
//
// {0x80, 0, 0, 0, ..., count (as __be64)}
// This is for a block aligned message.
//
// { 0, 0, 0, 0, ..., count (as __be64)}
// This is for a message whose length mod 64 is >= 56.
//
// Pre-swap the endianness of the words.
.Lfinup2x_finalize_countonly:
movi v16.2d, #0
b 1f
.Lfinup2x_finalize_blockaligned:
mov x8, #0x80000000
fmov d16, x8
1:
movi v17.2d, #0
movi v18.2d, #0
ror count, count, #29 // ror(lsl(count, 3), 32)
mov v19.d[0], xzr
mov v19.d[1], count
mov v20.16b, v16.16b
movi v21.2d, #0
movi v22.2d, #0
mov v23.16b, v19.16b
mov final_step, #2
b .Lfinup2x_loop_have_bswapped_data
.Lfinup2x_done:
// Write the two digests with all bytes in the correct order.
CPU_LE( rev32 state0_a.16b, state0_a.16b )
CPU_LE( rev32 state1_a.16b, state1_a.16b )
CPU_LE( rev32 state0_b.16b, state0_b.16b )
CPU_LE( rev32 state1_b.16b, state1_b.16b )
st1 {state0_a.4s-state1_a.4s}, [out1]
st1 {state0_b.4s-state1_b.4s}, [out2]
add sp, sp, #128
ret
SYM_FUNC_END(sha256_ce_finup2x)
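
For readers following the final_step logic above: the padding the code
builds on the stack is just standard SHA-256 (FIPS 180-4) finalization.
A rough C model of it (illustrative only, not kernel code) is:

#include <stdint.h>
#include <string.h>

/*
 * Copy the trailing (total_bytes % 64) data bytes, append the 0x80 byte,
 * zero-fill, and store the total bit count big-endian in the last 8 bytes.
 * Returns how many 64-byte blocks (1 or 2) remain to be compressed; the
 * 2-block case corresponds to final_step = 1, the "count-only" block.
 */
static size_t sha256_final_blocks(uint8_t blocks[128], const uint8_t *tail,
				  size_t tail_len, uint64_t total_bytes)
{
	size_t nblocks = (tail_len >= 56) ? 2 : 1;
	uint64_t bitcount = total_bytes << 3;

	memset(blocks, 0, 128);
	memcpy(blocks, tail, tail_len);		/* tail_len = total_bytes % 64 */
	blocks[tail_len] = 0x80;
	for (int i = 0; i < 8; i++)
		blocks[nblocks * 64 - 1 - i] = (uint8_t)(bitcount >> (8 * i));
	return nblocks;
}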

View File

@@ -44,6 +44,43 @@ static void sha256_blocks(struct sha256_block_state *state,
}
}
static_assert(offsetof(struct __sha256_ctx, state) == 0);
static_assert(offsetof(struct __sha256_ctx, bytecount) == 32);
static_assert(offsetof(struct __sha256_ctx, buf) == 40);
asmlinkage void sha256_ce_finup2x(const struct __sha256_ctx *ctx,
const u8 *data1, const u8 *data2, int len,
u8 out1[SHA256_DIGEST_SIZE],
u8 out2[SHA256_DIGEST_SIZE]);
#define sha256_finup_2x_arch sha256_finup_2x_arch
static bool sha256_finup_2x_arch(const struct __sha256_ctx *ctx,
const u8 *data1, const u8 *data2, size_t len,
u8 out1[SHA256_DIGEST_SIZE],
u8 out2[SHA256_DIGEST_SIZE])
{
/*
* The assembly requires len >= SHA256_BLOCK_SIZE && len <= INT_MAX.
* Further limit len to 65536 to avoid spending too long with preemption
* disabled. (Of course, in practice len is nearly always 4096 anyway.)
*/
if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
static_branch_likely(&have_ce) && len >= SHA256_BLOCK_SIZE &&
len <= 65536 && likely(may_use_simd())) {
kernel_neon_begin();
sha256_ce_finup2x(ctx, data1, data2, len, out1, out2);
kernel_neon_end();
kmsan_unpoison_memory(out1, SHA256_DIGEST_SIZE);
kmsan_unpoison_memory(out2, SHA256_DIGEST_SIZE);
return true;
}
return false;
}
static bool sha256_finup_2x_is_optimized_arch(void)
{
return static_key_enabled(&have_ce);
}
#ifdef CONFIG_KERNEL_MODE_NEON
#define sha256_mod_init_arch sha256_mod_init_arch
static void sha256_mod_init_arch(void)

View File

@@ -25,13 +25,20 @@ static const struct sha256_block_state sha224_iv = {
},
};
static const struct sha256_block_state sha256_iv = {
.h = {
SHA256_H0, SHA256_H1, SHA256_H2, SHA256_H3,
SHA256_H4, SHA256_H5, SHA256_H6, SHA256_H7,
static const struct sha256_ctx initial_sha256_ctx = {
.ctx = {
.state = {
.h = {
SHA256_H0, SHA256_H1, SHA256_H2, SHA256_H3,
SHA256_H4, SHA256_H5, SHA256_H6, SHA256_H7,
},
},
.bytecount = 0,
},
};
#define sha256_iv (initial_sha256_ctx.ctx.state)
static const u32 sha256_K[64] = {
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1,
0x923f82a4, 0xab1c5ed5, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
@@ -261,8 +268,62 @@ void sha256(const u8 *data, size_t len, u8 out[SHA256_DIGEST_SIZE])
}
EXPORT_SYMBOL(sha256);
/* pre-boot environment (as indicated by __DISABLE_EXPORTS) doesn't need HMAC */
/*
* Pre-boot environment (as indicated by __DISABLE_EXPORTS being defined)
* doesn't need either HMAC support or interleaved hashing support
*/
#ifndef __DISABLE_EXPORTS
#ifndef sha256_finup_2x_arch
static bool sha256_finup_2x_arch(const struct __sha256_ctx *ctx,
const u8 *data1, const u8 *data2, size_t len,
u8 out1[SHA256_DIGEST_SIZE],
u8 out2[SHA256_DIGEST_SIZE])
{
return false;
}
static bool sha256_finup_2x_is_optimized_arch(void)
{
return false;
}
#endif
/* Sequential fallback implementation of sha256_finup_2x() */
static noinline_for_stack void sha256_finup_2x_sequential(
const struct __sha256_ctx *ctx, const u8 *data1, const u8 *data2,
size_t len, u8 out1[SHA256_DIGEST_SIZE], u8 out2[SHA256_DIGEST_SIZE])
{
struct __sha256_ctx mut_ctx;
mut_ctx = *ctx;
__sha256_update(&mut_ctx, data1, len);
__sha256_final(&mut_ctx, out1, SHA256_DIGEST_SIZE);
mut_ctx = *ctx;
__sha256_update(&mut_ctx, data2, len);
__sha256_final(&mut_ctx, out2, SHA256_DIGEST_SIZE);
}
void sha256_finup_2x(const struct sha256_ctx *ctx, const u8 *data1,
const u8 *data2, size_t len, u8 out1[SHA256_DIGEST_SIZE],
u8 out2[SHA256_DIGEST_SIZE])
{
if (ctx == NULL)
ctx = &initial_sha256_ctx;
if (likely(sha256_finup_2x_arch(&ctx->ctx, data1, data2, len, out1,
out2)))
return;
sha256_finup_2x_sequential(&ctx->ctx, data1, data2, len, out1, out2);
}
EXPORT_SYMBOL_GPL(sha256_finup_2x);
bool sha256_finup_2x_is_optimized(void)
{
return sha256_finup_2x_is_optimized_arch();
}
EXPORT_SYMBOL_GPL(sha256_finup_2x_is_optimized);
static void __hmac_sha256_preparekey(struct sha256_block_state *istate,
struct sha256_block_state *ostate,
const u8 *raw_key, size_t raw_key_len,

View File

@@ -5,6 +5,7 @@
#include <crypto/sha2.h>
#include "sha256-testvecs.h"
/* Generate the HASH_KUNIT_CASES using hash-test-template.h. */
#define HASH sha256
#define HASH_CTX sha256_ctx
#define HASH_SIZE SHA256_DIGEST_SIZE
@@ -21,9 +22,192 @@
#define HMAC_USINGRAWKEY hmac_sha256_usingrawkey
#include "hash-test-template.h"
static void free_guarded_buf(void *buf)
{
vfree(buf);
}
/*
* Allocate a KUnit-managed buffer that has length @len bytes immediately
* followed by an unmapped page, and assert that the allocation succeeds.
*/
static void *alloc_guarded_buf(struct kunit *test, size_t len)
{
size_t full_len = round_up(len, PAGE_SIZE);
void *buf = vmalloc(full_len);
KUNIT_ASSERT_NOT_NULL(test, buf);
KUNIT_ASSERT_EQ(test, 0,
kunit_add_action_or_reset(test, free_guarded_buf, buf));
return buf + full_len - len;
}
/*
* Test for sha256_finup_2x(). Specifically, choose various data lengths and
* salt lengths, and for each one, verify that sha256_finup_2x() produces the
* same results as sha256_update() and sha256_final().
*
* Use guarded buffers for all inputs and outputs to reliably detect any
* out-of-bounds reads or writes, even if they occur in assembly code.
*/
static void test_sha256_finup_2x(struct kunit *test)
{
const size_t max_data_len = 16384;
u8 *data1_buf, *data2_buf, *hash1, *hash2;
u8 expected_hash1[SHA256_DIGEST_SIZE];
u8 expected_hash2[SHA256_DIGEST_SIZE];
u8 salt[SHA256_BLOCK_SIZE];
struct sha256_ctx *ctx;
data1_buf = alloc_guarded_buf(test, max_data_len);
data2_buf = alloc_guarded_buf(test, max_data_len);
hash1 = alloc_guarded_buf(test, SHA256_DIGEST_SIZE);
hash2 = alloc_guarded_buf(test, SHA256_DIGEST_SIZE);
ctx = alloc_guarded_buf(test, sizeof(*ctx));
rand_bytes(data1_buf, max_data_len);
rand_bytes(data2_buf, max_data_len);
rand_bytes(salt, sizeof(salt));
for (size_t i = 0; i < 500; i++) {
size_t salt_len = rand_length(sizeof(salt));
size_t data_len = rand_length(max_data_len);
const u8 *data1 = data1_buf + max_data_len - data_len;
const u8 *data2 = data2_buf + max_data_len - data_len;
struct sha256_ctx orig_ctx;
sha256_init(ctx);
sha256_update(ctx, salt, salt_len);
orig_ctx = *ctx;
sha256_finup_2x(ctx, data1, data2, data_len, hash1, hash2);
KUNIT_ASSERT_MEMEQ_MSG(
test, ctx, &orig_ctx, sizeof(*ctx),
"sha256_finup_2x() modified its ctx argument");
sha256_update(ctx, data1, data_len);
sha256_final(ctx, expected_hash1);
sha256_update(&orig_ctx, data2, data_len);
sha256_final(&orig_ctx, expected_hash2);
KUNIT_ASSERT_MEMEQ_MSG(
test, hash1, expected_hash1, SHA256_DIGEST_SIZE,
"Wrong hash1 with salt_len=%zu data_len=%zu", salt_len,
data_len);
KUNIT_ASSERT_MEMEQ_MSG(
test, hash2, expected_hash2, SHA256_DIGEST_SIZE,
"Wrong hash2 with salt_len=%zu data_len=%zu", salt_len,
data_len);
}
}
/* Test sha256_finup_2x() with ctx == NULL */
static void test_sha256_finup_2x_defaultctx(struct kunit *test)
{
const size_t data_len = 128;
struct sha256_ctx ctx;
u8 hash1_a[SHA256_DIGEST_SIZE];
u8 hash2_a[SHA256_DIGEST_SIZE];
u8 hash1_b[SHA256_DIGEST_SIZE];
u8 hash2_b[SHA256_DIGEST_SIZE];
rand_bytes(test_buf, 2 * data_len);
sha256_init(&ctx);
sha256_finup_2x(&ctx, test_buf, &test_buf[data_len], data_len, hash1_a,
hash2_a);
sha256_finup_2x(NULL, test_buf, &test_buf[data_len], data_len, hash1_b,
hash2_b);
KUNIT_ASSERT_MEMEQ(test, hash1_a, hash1_b, SHA256_DIGEST_SIZE);
KUNIT_ASSERT_MEMEQ(test, hash2_a, hash2_b, SHA256_DIGEST_SIZE);
}
/*
* Test that sha256_finup_2x() and sha256_update/final() produce consistent
* results with total message lengths that require more than 32 bits.
*/
static void test_sha256_finup_2x_hugelen(struct kunit *test)
{
const size_t data_len = 4 * SHA256_BLOCK_SIZE;
struct sha256_ctx ctx = {};
u8 expected_hash[SHA256_DIGEST_SIZE];
u8 hash[SHA256_DIGEST_SIZE];
rand_bytes(test_buf, data_len);
for (size_t align = 0; align < SHA256_BLOCK_SIZE; align++) {
sha256_init(&ctx);
ctx.ctx.bytecount = 0x123456789abcd00 + align;
sha256_finup_2x(&ctx, test_buf, test_buf, data_len, hash, hash);
sha256_update(&ctx, test_buf, data_len);
sha256_final(&ctx, expected_hash);
KUNIT_ASSERT_MEMEQ(test, hash, expected_hash,
SHA256_DIGEST_SIZE);
}
}
/* Benchmark for sha256_finup_2x() */
static void benchmark_sha256_finup_2x(struct kunit *test)
{
/*
* Try a few different salt lengths, since sha256_finup_2x() performance
* may vary slightly for the same data_len depending on how many bytes
* were already processed in the initial context.
*/
static const size_t salt_lens_to_test[] = { 0, 32, 64 };
const size_t data_len = 4096;
const size_t num_iters = 4096;
struct sha256_ctx ctx;
u8 hash1[SHA256_DIGEST_SIZE];
u8 hash2[SHA256_DIGEST_SIZE];
if (!IS_ENABLED(CONFIG_CRYPTO_LIB_BENCHMARK))
kunit_skip(test, "not enabled");
if (!sha256_finup_2x_is_optimized())
kunit_skip(test, "not relevant");
rand_bytes(test_buf, data_len * 2);
/* Warm-up */
for (size_t i = 0; i < num_iters; i++)
sha256_finup_2x(NULL, &test_buf[0], &test_buf[data_len],
data_len, hash1, hash2);
for (size_t i = 0; i < ARRAY_SIZE(salt_lens_to_test); i++) {
size_t salt_len = salt_lens_to_test[i];
u64 t0, t1;
/*
* Prepare the initial context. The time to process the salt is
* not measured; we're just interested in sha256_finup_2x().
*/
sha256_init(&ctx);
sha256_update(&ctx, test_buf, salt_len);
preempt_disable();
t0 = ktime_get_ns();
for (size_t j = 0; j < num_iters; j++)
sha256_finup_2x(&ctx, &test_buf[0], &test_buf[data_len],
data_len, hash1, hash2);
t1 = ktime_get_ns();
preempt_enable();
kunit_info(test, "data_len=%zu salt_len=%zu: %llu MB/s",
data_len, salt_len,
div64_u64((u64)data_len * 2 * num_iters * 1000,
t1 - t0 ?: 1));
}
}
static struct kunit_case hash_test_cases[] = {
HASH_KUNIT_CASES,
KUNIT_CASE(test_sha256_finup_2x),
KUNIT_CASE(test_sha256_finup_2x_defaultctx),
KUNIT_CASE(test_sha256_finup_2x_hugelen),
KUNIT_CASE(benchmark_hash),
KUNIT_CASE(benchmark_sha256_finup_2x),
{},
};

View File

@@ -165,6 +165,374 @@ SYM_FUNC_START(sha256_ni_transform)
RET
SYM_FUNC_END(sha256_ni_transform)
#undef DIGEST_PTR
#undef DATA_PTR
#undef NUM_BLKS
#undef SHA256CONSTANTS
#undef MSG
#undef STATE0
#undef STATE1
#undef MSG0
#undef MSG1
#undef MSG2
#undef MSG3
#undef TMP
#undef SHUF_MASK
#undef ABEF_SAVE
#undef CDGH_SAVE
// parameters for sha256_ni_finup2x()
#define CTX %rdi
#define DATA1 %rsi
#define DATA2 %rdx
#define LEN %ecx
#define LEN8 %cl
#define LEN64 %rcx
#define OUT1 %r8
#define OUT2 %r9
// other scalar variables
#define SHA256CONSTANTS %rax
#define COUNT %r10
#define COUNT32 %r10d
#define FINAL_STEP %r11d
// rbx is used as a temporary.
#define MSG %xmm0 // sha256rnds2 implicit operand
#define STATE0_A %xmm1
#define STATE1_A %xmm2
#define STATE0_B %xmm3
#define STATE1_B %xmm4
#define TMP_A %xmm5
#define TMP_B %xmm6
#define MSG0_A %xmm7
#define MSG1_A %xmm8
#define MSG2_A %xmm9
#define MSG3_A %xmm10
#define MSG0_B %xmm11
#define MSG1_B %xmm12
#define MSG2_B %xmm13
#define MSG3_B %xmm14
#define SHUF_MASK %xmm15
#define OFFSETOF_STATE 0 // offsetof(struct __sha256_ctx, state)
#define OFFSETOF_BYTECOUNT 32 // offsetof(struct __sha256_ctx, bytecount)
#define OFFSETOF_BUF 40 // offsetof(struct __sha256_ctx, buf)
// Do 4 rounds of SHA-256 for each of two messages (interleaved). m0_a and m0_b
// contain the current 4 message schedule words for the first and second message
// respectively.
//
// If not all the message schedule words have been computed yet, then this also
// computes 4 more message schedule words for each message. m1_a-m3_a contain
// the next 3 groups of 4 message schedule words for the first message, and
// likewise m1_b-m3_b for the second. After consuming the current value of
// m0_a, this macro computes the group after m3_a and writes it to m0_a, and
// likewise for *_b. This means that the next (m0_a, m1_a, m2_a, m3_a) is the
// current (m1_a, m2_a, m3_a, m0_a), and likewise for *_b, so the caller must
// cycle through the registers accordingly.
.macro do_4rounds_2x i, m0_a, m1_a, m2_a, m3_a, m0_b, m1_b, m2_b, m3_b
movdqa (\i-32)*4(SHA256CONSTANTS), TMP_A
movdqa TMP_A, TMP_B
paddd \m0_a, TMP_A
paddd \m0_b, TMP_B
.if \i < 48
sha256msg1 \m1_a, \m0_a
sha256msg1 \m1_b, \m0_b
.endif
movdqa TMP_A, MSG
sha256rnds2 STATE0_A, STATE1_A
movdqa TMP_B, MSG
sha256rnds2 STATE0_B, STATE1_B
pshufd $0x0E, TMP_A, MSG
sha256rnds2 STATE1_A, STATE0_A
pshufd $0x0E, TMP_B, MSG
sha256rnds2 STATE1_B, STATE0_B
.if \i < 48
movdqa \m3_a, TMP_A
movdqa \m3_b, TMP_B
palignr $4, \m2_a, TMP_A
palignr $4, \m2_b, TMP_B
paddd TMP_A, \m0_a
paddd TMP_B, \m0_b
sha256msg2 \m3_a, \m0_a
sha256msg2 \m3_b, \m0_b
.endif
.endm
//
// void sha256_ni_finup2x(const struct __sha256_ctx *ctx,
// const u8 *data1, const u8 *data2, int len,
// u8 out1[SHA256_DIGEST_SIZE],
// u8 out2[SHA256_DIGEST_SIZE]);
//
// This function computes the SHA-256 digests of two messages |data1| and
// |data2| that are both |len| bytes long, starting from the initial context
// |ctx|. |len| must be at least SHA256_BLOCK_SIZE.
//
// The instructions for the two SHA-256 operations are interleaved. On many
// CPUs, this is almost twice as fast as hashing each message individually due
// to taking better advantage of the CPU's SHA-256 and SIMD throughput.
//
SYM_FUNC_START(sha256_ni_finup2x)
// Allocate 128 bytes of stack space, 16-byte aligned.
push %rbx
push %rbp
mov %rsp, %rbp
sub $128, %rsp
and $~15, %rsp
// Load the shuffle mask for swapping the endianness of 32-bit words.
movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK
// Set up pointer to the round constants.
lea K256+32*4(%rip), SHA256CONSTANTS
// Initially we're not processing the final blocks.
xor FINAL_STEP, FINAL_STEP
// Load the initial state from ctx->state.
movdqu OFFSETOF_STATE+0*16(CTX), STATE0_A // DCBA
movdqu OFFSETOF_STATE+1*16(CTX), STATE1_A // HGFE
movdqa STATE0_A, TMP_A
punpcklqdq STATE1_A, STATE0_A // FEBA
punpckhqdq TMP_A, STATE1_A // DCHG
pshufd $0x1B, STATE0_A, STATE0_A // ABEF
pshufd $0xB1, STATE1_A, STATE1_A // CDGH
// Load ctx->bytecount. Take the mod 64 of it to get the number of
// bytes that are buffered in ctx->buf. Also save it in a register with
// LEN added to it.
mov LEN, LEN
mov OFFSETOF_BYTECOUNT(CTX), %rbx
lea (%rbx, LEN64, 1), COUNT
and $63, %ebx
jz .Lfinup2x_enter_loop // No bytes buffered?
// %ebx bytes (1 to 63) are currently buffered in ctx->buf. Load them
// followed by the first 64 - %ebx bytes of data. Since LEN >= 64, we
// just load 64 bytes from each of ctx->buf, DATA1, and DATA2
// unconditionally and rearrange the data as needed.
movdqu OFFSETOF_BUF+0*16(CTX), MSG0_A
movdqu OFFSETOF_BUF+1*16(CTX), MSG1_A
movdqu OFFSETOF_BUF+2*16(CTX), MSG2_A
movdqu OFFSETOF_BUF+3*16(CTX), MSG3_A
movdqa MSG0_A, 0*16(%rsp)
movdqa MSG1_A, 1*16(%rsp)
movdqa MSG2_A, 2*16(%rsp)
movdqa MSG3_A, 3*16(%rsp)
movdqu 0*16(DATA1), MSG0_A
movdqu 1*16(DATA1), MSG1_A
movdqu 2*16(DATA1), MSG2_A
movdqu 3*16(DATA1), MSG3_A
movdqu MSG0_A, 0*16(%rsp,%rbx)
movdqu MSG1_A, 1*16(%rsp,%rbx)
movdqu MSG2_A, 2*16(%rsp,%rbx)
movdqu MSG3_A, 3*16(%rsp,%rbx)
movdqa 0*16(%rsp), MSG0_A
movdqa 1*16(%rsp), MSG1_A
movdqa 2*16(%rsp), MSG2_A
movdqa 3*16(%rsp), MSG3_A
movdqu 0*16(DATA2), MSG0_B
movdqu 1*16(DATA2), MSG1_B
movdqu 2*16(DATA2), MSG2_B
movdqu 3*16(DATA2), MSG3_B
movdqu MSG0_B, 0*16(%rsp,%rbx)
movdqu MSG1_B, 1*16(%rsp,%rbx)
movdqu MSG2_B, 2*16(%rsp,%rbx)
movdqu MSG3_B, 3*16(%rsp,%rbx)
movdqa 0*16(%rsp), MSG0_B
movdqa 1*16(%rsp), MSG1_B
movdqa 2*16(%rsp), MSG2_B
movdqa 3*16(%rsp), MSG3_B
sub $64, %rbx // rbx = buffered - 64
sub %rbx, DATA1 // DATA1 += 64 - buffered
sub %rbx, DATA2 // DATA2 += 64 - buffered
add %ebx, LEN // LEN += buffered - 64
movdqa STATE0_A, STATE0_B
movdqa STATE1_A, STATE1_B
jmp .Lfinup2x_loop_have_data
.Lfinup2x_enter_loop:
sub $64, LEN
movdqa STATE0_A, STATE0_B
movdqa STATE1_A, STATE1_B
.Lfinup2x_loop:
// Load the next two data blocks.
movdqu 0*16(DATA1), MSG0_A
movdqu 0*16(DATA2), MSG0_B
movdqu 1*16(DATA1), MSG1_A
movdqu 1*16(DATA2), MSG1_B
movdqu 2*16(DATA1), MSG2_A
movdqu 2*16(DATA2), MSG2_B
movdqu 3*16(DATA1), MSG3_A
movdqu 3*16(DATA2), MSG3_B
add $64, DATA1
add $64, DATA2
.Lfinup2x_loop_have_data:
// Convert the words of the data blocks from big endian.
pshufb SHUF_MASK, MSG0_A
pshufb SHUF_MASK, MSG0_B
pshufb SHUF_MASK, MSG1_A
pshufb SHUF_MASK, MSG1_B
pshufb SHUF_MASK, MSG2_A
pshufb SHUF_MASK, MSG2_B
pshufb SHUF_MASK, MSG3_A
pshufb SHUF_MASK, MSG3_B
.Lfinup2x_loop_have_bswapped_data:
// Save the original state for each block.
movdqa STATE0_A, 0*16(%rsp)
movdqa STATE0_B, 1*16(%rsp)
movdqa STATE1_A, 2*16(%rsp)
movdqa STATE1_B, 3*16(%rsp)
// Do the SHA-256 rounds on each block.
.irp i, 0, 16, 32, 48
do_4rounds_2x (\i + 0), MSG0_A, MSG1_A, MSG2_A, MSG3_A, \
MSG0_B, MSG1_B, MSG2_B, MSG3_B
do_4rounds_2x (\i + 4), MSG1_A, MSG2_A, MSG3_A, MSG0_A, \
MSG1_B, MSG2_B, MSG3_B, MSG0_B
do_4rounds_2x (\i + 8), MSG2_A, MSG3_A, MSG0_A, MSG1_A, \
MSG2_B, MSG3_B, MSG0_B, MSG1_B
do_4rounds_2x (\i + 12), MSG3_A, MSG0_A, MSG1_A, MSG2_A, \
MSG3_B, MSG0_B, MSG1_B, MSG2_B
.endr
// Add the original state for each block.
paddd 0*16(%rsp), STATE0_A
paddd 1*16(%rsp), STATE0_B
paddd 2*16(%rsp), STATE1_A
paddd 3*16(%rsp), STATE1_B
// Update LEN and loop back if more blocks remain.
sub $64, LEN
jge .Lfinup2x_loop
// Check if any final blocks need to be handled.
// FINAL_STEP = 2: all done
// FINAL_STEP = 1: need to do count-only padding block
// FINAL_STEP = 0: need to do the block with 0x80 padding byte
cmp $1, FINAL_STEP
jg .Lfinup2x_done
je .Lfinup2x_finalize_countonly
add $64, LEN
jz .Lfinup2x_finalize_blockaligned
// Not block-aligned; 1 <= LEN <= 63 data bytes remain. Pad the block.
// To do this, write the padding starting with the 0x80 byte to
// &sp[64]. Then for each message, copy the last 64 data bytes to sp
// and load from &sp[64 - LEN] to get the needed padding block. This
// code relies on the data buffers being >= 64 bytes in length.
mov $64, %ebx
sub LEN, %ebx // ebx = 64 - LEN
sub %rbx, DATA1 // DATA1 -= 64 - LEN
sub %rbx, DATA2 // DATA2 -= 64 - LEN
mov $0x80, FINAL_STEP // using FINAL_STEP as a temporary
movd FINAL_STEP, MSG0_A
pxor MSG1_A, MSG1_A
movdqa MSG0_A, 4*16(%rsp)
movdqa MSG1_A, 5*16(%rsp)
movdqa MSG1_A, 6*16(%rsp)
movdqa MSG1_A, 7*16(%rsp)
cmp $56, LEN
jge 1f // will COUNT spill into its own block?
shl $3, COUNT
bswap COUNT
mov COUNT, 56(%rsp,%rbx)
mov $2, FINAL_STEP // won't need count-only block
jmp 2f
1:
mov $1, FINAL_STEP // will need count-only block
2:
movdqu 0*16(DATA1), MSG0_A
movdqu 1*16(DATA1), MSG1_A
movdqu 2*16(DATA1), MSG2_A
movdqu 3*16(DATA1), MSG3_A
movdqa MSG0_A, 0*16(%rsp)
movdqa MSG1_A, 1*16(%rsp)
movdqa MSG2_A, 2*16(%rsp)
movdqa MSG3_A, 3*16(%rsp)
movdqu 0*16(%rsp,%rbx), MSG0_A
movdqu 1*16(%rsp,%rbx), MSG1_A
movdqu 2*16(%rsp,%rbx), MSG2_A
movdqu 3*16(%rsp,%rbx), MSG3_A
movdqu 0*16(DATA2), MSG0_B
movdqu 1*16(DATA2), MSG1_B
movdqu 2*16(DATA2), MSG2_B
movdqu 3*16(DATA2), MSG3_B
movdqa MSG0_B, 0*16(%rsp)
movdqa MSG1_B, 1*16(%rsp)
movdqa MSG2_B, 2*16(%rsp)
movdqa MSG3_B, 3*16(%rsp)
movdqu 0*16(%rsp,%rbx), MSG0_B
movdqu 1*16(%rsp,%rbx), MSG1_B
movdqu 2*16(%rsp,%rbx), MSG2_B
movdqu 3*16(%rsp,%rbx), MSG3_B
jmp .Lfinup2x_loop_have_data
// Prepare a padding block, either:
//
// {0x80, 0, 0, 0, ..., count (as __be64)}
// This is for a block aligned message.
//
// { 0, 0, 0, 0, ..., count (as __be64)}
// This is for a message whose length mod 64 is >= 56.
//
// Pre-swap the endianness of the words.
.Lfinup2x_finalize_countonly:
pxor MSG0_A, MSG0_A
jmp 1f
.Lfinup2x_finalize_blockaligned:
mov $0x80000000, %ebx
movd %ebx, MSG0_A
1:
pxor MSG1_A, MSG1_A
pxor MSG2_A, MSG2_A
ror $29, COUNT
movq COUNT, MSG3_A
pslldq $8, MSG3_A
movdqa MSG0_A, MSG0_B
pxor MSG1_B, MSG1_B
pxor MSG2_B, MSG2_B
movdqa MSG3_A, MSG3_B
mov $2, FINAL_STEP
jmp .Lfinup2x_loop_have_bswapped_data
.Lfinup2x_done:
// Write the two digests with all bytes in the correct order.
movdqa STATE0_A, TMP_A
movdqa STATE0_B, TMP_B
punpcklqdq STATE1_A, STATE0_A // GHEF
punpcklqdq STATE1_B, STATE0_B
punpckhqdq TMP_A, STATE1_A // ABCD
punpckhqdq TMP_B, STATE1_B
pshufd $0xB1, STATE0_A, STATE0_A // HGFE
pshufd $0xB1, STATE0_B, STATE0_B
pshufd $0x1B, STATE1_A, STATE1_A // DCBA
pshufd $0x1B, STATE1_B, STATE1_B
pshufb SHUF_MASK, STATE0_A
pshufb SHUF_MASK, STATE0_B
pshufb SHUF_MASK, STATE1_A
pshufb SHUF_MASK, STATE1_B
movdqu STATE0_A, 1*16(OUT1)
movdqu STATE0_B, 1*16(OUT2)
movdqu STATE1_A, 0*16(OUT1)
movdqu STATE1_B, 0*16(OUT2)
mov %rbp, %rsp
pop %rbp
pop %rbx
RET
SYM_FUNC_END(sha256_ni_finup2x)
.section .rodata.cst256.K256, "aM", @progbits, 256
.align 64
K256:

View File

@@ -7,6 +7,8 @@
#include <asm/fpu/api.h>
#include <linux/static_call.h>
static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha_ni);
DEFINE_STATIC_CALL(sha256_blocks_x86, sha256_blocks_generic);
#define DEFINE_X86_SHA256_FN(c_fn, asm_fn) \
@@ -35,11 +37,48 @@ static void sha256_blocks(struct sha256_block_state *state,
static_call(sha256_blocks_x86)(state, data, nblocks);
}
static_assert(offsetof(struct __sha256_ctx, state) == 0);
static_assert(offsetof(struct __sha256_ctx, bytecount) == 32);
static_assert(offsetof(struct __sha256_ctx, buf) == 40);
asmlinkage void sha256_ni_finup2x(const struct __sha256_ctx *ctx,
const u8 *data1, const u8 *data2, int len,
u8 out1[SHA256_DIGEST_SIZE],
u8 out2[SHA256_DIGEST_SIZE]);
#define sha256_finup_2x_arch sha256_finup_2x_arch
static bool sha256_finup_2x_arch(const struct __sha256_ctx *ctx,
const u8 *data1, const u8 *data2, size_t len,
u8 out1[SHA256_DIGEST_SIZE],
u8 out2[SHA256_DIGEST_SIZE])
{
/*
* The assembly requires len >= SHA256_BLOCK_SIZE && len <= INT_MAX.
* Further limit len to 65536 to avoid spending too long with preemption
* disabled. (Of course, in practice len is nearly always 4096 anyway.)
*/
if (static_branch_likely(&have_sha_ni) && len >= SHA256_BLOCK_SIZE &&
len <= 65536 && likely(irq_fpu_usable())) {
kernel_fpu_begin();
sha256_ni_finup2x(ctx, data1, data2, len, out1, out2);
kernel_fpu_end();
kmsan_unpoison_memory(out1, SHA256_DIGEST_SIZE);
kmsan_unpoison_memory(out2, SHA256_DIGEST_SIZE);
return true;
}
return false;
}
static bool sha256_finup_2x_is_optimized_arch(void)
{
return static_key_enabled(&have_sha_ni);
}
#define sha256_mod_init_arch sha256_mod_init_arch
static void sha256_mod_init_arch(void)
{
if (boot_cpu_has(X86_FEATURE_SHA_NI)) {
static_call_update(sha256_blocks_x86, sha256_blocks_ni);
static_branch_enable(&have_sha_ni);
} else if (cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM,
NULL) &&
boot_cpu_has(X86_FEATURE_AVX)) {