Optimize fsverity with 2-way interleaved hashing
Add support for 2-way interleaved SHA-256 hashing to lib/crypto/, and make
fsverity use it for faster file data verification. This improves fsverity
performance on many x86_64 and arm64 processors. Later, I plan to make
dm-verity use this too.

-----BEGIN PGP SIGNATURE-----
iIoEABYIADIWIQSacvsUNc7UX4ntmEPzXCl4vpKOKwUCaNg4/RQcZWJpZ2dlcnNA
a2VybmVsLm9yZwAKCRDzXCl4vpKOK4fMAP9Xz00JNDfJ+mOVHIYOhAlWFGnug0X1
cvoRf4QXchNlbwD9HTJQQDQXnbsPy3QPrUVfl2FqCW7c6vRlBJijhD6j4wE=
=6dCR
-----END PGP SIGNATURE-----

Merge tag 'fsverity-for-linus' of git://git.kernel.org/pub/scm/fs/fsverity/linux

Pull interleaved SHA-256 hashing support from Eric Biggers:
 "Optimize fsverity with 2-way interleaved hashing

  Add support for 2-way interleaved SHA-256 hashing to lib/crypto/, and
  make fsverity use it for faster file data verification. This improves
  fsverity performance on many x86_64 and arm64 processors. Later, I
  plan to make dm-verity use this too"

* tag 'fsverity-for-linus' of git://git.kernel.org/pub/scm/fs/fsverity/linux:
  fsverity: Use 2-way interleaved SHA-256 hashing when supported
  fsverity: Remove inode parameter from fsverity_hash_block()
  lib/crypto: tests: Add tests and benchmark for sha256_finup_2x()
  lib/crypto: x86/sha256: Add support for 2-way interleaved hashing
  lib/crypto: arm64/sha256: Add support for 2-way interleaved hashing
  lib/crypto: sha256: Add support for 2-way interleaved hashing
commit 1896ce8eb6
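Before the diff, a brief illustration may help. The new lib/crypto API added by this pull is sha256_finup_2x(), which finishes two same-length messages from one shared starting context, plus sha256_finup_2x_is_optimized() for callers that only want to batch work when a real interleaved implementation is available. The sketch below is illustrative only (the wrapper functions and buffer names are not from this series); it shows the calling pattern that the fsverity verification code in this diff adopts for pairs of data blocks.

#include <crypto/sha2.h>

/* Illustrative sketch: hash two equal-sized blocks with one call. */
static void hash_two_blocks(const u8 *blk_a, const u8 *blk_b, size_t blk_size,
			    u8 digest_a[SHA256_DIGEST_SIZE],
			    u8 digest_b[SHA256_DIGEST_SIZE])
{
	/*
	 * A NULL ctx means "plain SHA-256 with no salt prefix".  The call is
	 * always correct; it simply falls back to hashing the two messages
	 * sequentially when no interleaved implementation is available.
	 */
	sha256_finup_2x(NULL, blk_a, blk_b, blk_size, digest_a, digest_b);
}

/*
 * Callers that queue work (as the fsverity verification code below does) can
 * use sha256_finup_2x_is_optimized() to decide whether batching two blocks
 * per flush is worthwhile.
 */
static int choose_batch_size(void)
{
	return sha256_finup_2x_is_optimized() ? 2 : 1;
}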
@@ -19,8 +19,7 @@ struct block_buffer {
 };
 
 /* Hash a block, writing the result to the next level's pending block buffer. */
-static int hash_one_block(struct inode *inode,
-			  const struct merkle_tree_params *params,
+static int hash_one_block(const struct merkle_tree_params *params,
 			  struct block_buffer *cur)
 {
 	struct block_buffer *next = cur + 1;
@@ -36,8 +35,7 @@ static int hash_one_block(struct inode *inode,
 	/* Zero-pad the block if it's shorter than the block size. */
 	memset(&cur->data[cur->filled], 0, params->block_size - cur->filled);
 
-	fsverity_hash_block(params, inode, cur->data,
-			    &next->data[next->filled]);
+	fsverity_hash_block(params, cur->data, &next->data[next->filled]);
 	next->filled += params->digest_size;
 	cur->filled = 0;
 	return 0;
@@ -123,7 +121,7 @@ static int build_merkle_tree(struct file *filp,
 			fsverity_err(inode, "Short read of file data");
 			goto out;
 		}
-		err = hash_one_block(inode, params, &buffers[-1]);
+		err = hash_one_block(params, &buffers[-1]);
 		if (err)
 			goto out;
 		for (level = 0; level < num_levels; level++) {
@@ -134,7 +132,7 @@ static int build_merkle_tree(struct file *filp,
 			}
 			/* Next block at @level is full */
 
-			err = hash_one_block(inode, params, &buffers[level]);
+			err = hash_one_block(params, &buffers[level]);
 			if (err)
 				goto out;
 			err = write_merkle_tree_block(inode,
@@ -154,7 +152,7 @@ static int build_merkle_tree(struct file *filp,
 	/* Finish all nonempty pending tree blocks. */
 	for (level = 0; level < num_levels; level++) {
 		if (buffers[level].filled != 0) {
-			err = hash_one_block(inode, params, &buffers[level]);
+			err = hash_one_block(params, &buffers[level]);
 			if (err)
 				goto out;
 			err = write_merkle_tree_block(inode,

@@ -90,7 +90,7 @@ union fsverity_hash_ctx *
 fsverity_prepare_hash_state(const struct fsverity_hash_alg *alg,
 			    const u8 *salt, size_t salt_size);
 void fsverity_hash_block(const struct merkle_tree_params *params,
-			 const struct inode *inode, const void *data, u8 *out);
+			 const void *data, u8 *out);
 void fsverity_hash_buffer(const struct fsverity_hash_alg *alg,
 			  const void *data, size_t size, u8 *out);
 void __init fsverity_check_hash_algs(void);

@@ -94,7 +94,6 @@ fsverity_prepare_hash_state(const struct fsverity_hash_alg *alg,
 /**
  * fsverity_hash_block() - hash a single data or hash block
  * @params: the Merkle tree's parameters
- * @inode: inode for which the hashing is being done
  * @data: virtual address of a buffer containing the block to hash
  * @out: output digest, size 'params->digest_size' bytes
  *
@@ -102,7 +101,7 @@ fsverity_prepare_hash_state(const struct fsverity_hash_alg *alg,
  * in the Merkle tree parameters.
  */
 void fsverity_hash_block(const struct merkle_tree_params *params,
-			 const struct inode *inode, const void *data, u8 *out)
+			 const void *data, u8 *out)
 {
 	union fsverity_hash_ctx ctx;
 

@@ -10,6 +10,31 @@
 #include <linux/bio.h>
 #include <linux/export.h>
 
+#define FS_VERITY_MAX_PENDING_BLOCKS	2
+
+struct fsverity_pending_block {
+	const void *data;
+	u64 pos;
+	u8 real_hash[FS_VERITY_MAX_DIGEST_SIZE];
+};
+
+struct fsverity_verification_context {
+	struct inode *inode;
+	struct fsverity_info *vi;
+	unsigned long max_ra_pages;
+
+	/*
+	 * This is the queue of data blocks that are pending verification. When
+	 * the crypto layer supports interleaved hashing, we allow multiple
+	 * blocks to be queued up in order to utilize it. This can improve
+	 * performance significantly vs. sequential hashing of each block.
+	 */
+	int num_pending;
+	int max_pending;
+	struct fsverity_pending_block
+		pending_blocks[FS_VERITY_MAX_PENDING_BLOCKS];
+};
+
 static struct workqueue_struct *fsverity_read_workqueue;
 
 /*
@@ -79,7 +104,7 @@ static bool is_hash_block_verified(struct fsverity_info *vi, struct page *hpage,
 }
 
 /*
- * Verify a single data block against the file's Merkle tree.
+ * Verify the hash of a single data block against the file's Merkle tree.
  *
  * In principle, we need to verify the entire path to the root node. However,
  * for efficiency the filesystem may cache the hash blocks. Therefore we need
@@ -88,10 +113,11 @@ static bool is_hash_block_verified(struct fsverity_info *vi, struct page *hpage,
  *
  * Return: %true if the data block is valid, else %false.
  */
-static bool
-verify_data_block(struct inode *inode, struct fsverity_info *vi,
-		  const void *data, u64 data_pos, unsigned long max_ra_pages)
+static bool verify_data_block(struct inode *inode, struct fsverity_info *vi,
+			      const struct fsverity_pending_block *dblock,
+			      unsigned long max_ra_pages)
 {
+	const u64 data_pos = dblock->pos;
 	const struct merkle_tree_params *params = &vi->tree_params;
 	const unsigned int hsize = params->digest_size;
 	int level;
@@ -115,8 +141,12 @@ verify_data_block(struct inode *inode, struct fsverity_info *vi,
 	 */
 	u64 hidx = data_pos >> params->log_blocksize;
 
-	/* Up to 1 + FS_VERITY_MAX_LEVELS pages may be mapped at once */
-	BUILD_BUG_ON(1 + FS_VERITY_MAX_LEVELS > KM_MAX_IDX);
+	/*
+	 * Up to FS_VERITY_MAX_PENDING_BLOCKS + FS_VERITY_MAX_LEVELS pages may
+	 * be mapped at once.
+	 */
+	static_assert(FS_VERITY_MAX_PENDING_BLOCKS + FS_VERITY_MAX_LEVELS <=
+		      KM_MAX_IDX);
 
 	if (unlikely(data_pos >= inode->i_size)) {
 		/*
@@ -127,7 +157,7 @@ verify_data_block(struct inode *inode, struct fsverity_info *vi,
 		 * any part past EOF should be all zeroes. Therefore, we need
 		 * to verify that any data blocks fully past EOF are all zeroes.
 		 */
-		if (memchr_inv(data, 0, params->block_size)) {
+		if (memchr_inv(dblock->data, 0, params->block_size)) {
 			fsverity_err(inode,
 				     "FILE CORRUPTED! Data past EOF is not zeroed");
 			return false;
@@ -202,7 +232,7 @@ descend:
 		unsigned long hblock_idx = hblocks[level - 1].index;
 		unsigned int hoffset = hblocks[level - 1].hoffset;
 
-		fsverity_hash_block(params, inode, haddr, real_hash);
+		fsverity_hash_block(params, haddr, real_hash);
 		if (memcmp(want_hash, real_hash, hsize) != 0)
 			goto corrupted;
 		/*
@@ -220,18 +250,18 @@ descend:
 		put_page(hpage);
 	}
 
-	/* Finally, verify the data block. */
-	fsverity_hash_block(params, inode, data, real_hash);
-	if (memcmp(want_hash, real_hash, hsize) != 0)
+	/* Finally, verify the hash of the data block. */
+	if (memcmp(want_hash, dblock->real_hash, hsize) != 0)
 		goto corrupted;
 	return true;
 
 corrupted:
-	fsverity_err(inode,
-		     "FILE CORRUPTED! pos=%llu, level=%d, want_hash=%s:%*phN, real_hash=%s:%*phN",
-		     data_pos, level - 1,
-		     params->hash_alg->name, hsize, want_hash,
-		     params->hash_alg->name, hsize, real_hash);
+	fsverity_err(
+		inode,
+		"FILE CORRUPTED! pos=%llu, level=%d, want_hash=%s:%*phN, real_hash=%s:%*phN",
+		data_pos, level - 1, params->hash_alg->name, hsize, want_hash,
+		params->hash_alg->name, hsize,
+		level == 0 ? dblock->real_hash : real_hash);
 error:
 	for (; level > 0; level--) {
 		kunmap_local(hblocks[level - 1].addr);
@@ -240,13 +270,73 @@ error:
 	return false;
 }
 
-static bool
-verify_data_blocks(struct folio *data_folio, size_t len, size_t offset,
-		   unsigned long max_ra_pages)
+static void
+fsverity_init_verification_context(struct fsverity_verification_context *ctx,
+				   struct inode *inode,
+				   unsigned long max_ra_pages)
 {
-	struct inode *inode = data_folio->mapping->host;
 	struct fsverity_info *vi = *fsverity_info_addr(inode);
-	const unsigned int block_size = vi->tree_params.block_size;
+
+	ctx->inode = inode;
+	ctx->vi = vi;
+	ctx->max_ra_pages = max_ra_pages;
+	ctx->num_pending = 0;
+	if (vi->tree_params.hash_alg->algo_id == HASH_ALGO_SHA256 &&
+	    sha256_finup_2x_is_optimized())
+		ctx->max_pending = 2;
+	else
+		ctx->max_pending = 1;
+}
+
+static void
+fsverity_clear_pending_blocks(struct fsverity_verification_context *ctx)
+{
+	int i;
+
+	for (i = ctx->num_pending - 1; i >= 0; i--) {
+		kunmap_local(ctx->pending_blocks[i].data);
+		ctx->pending_blocks[i].data = NULL;
+	}
+	ctx->num_pending = 0;
+}
+
+static bool
+fsverity_verify_pending_blocks(struct fsverity_verification_context *ctx)
+{
+	struct fsverity_info *vi = ctx->vi;
+	const struct merkle_tree_params *params = &vi->tree_params;
+	int i;
+
+	if (ctx->num_pending == 2) {
+		/* num_pending == 2 implies that the algorithm is SHA-256 */
+		sha256_finup_2x(params->hashstate ? &params->hashstate->sha256 :
+						    NULL,
+				ctx->pending_blocks[0].data,
+				ctx->pending_blocks[1].data, params->block_size,
+				ctx->pending_blocks[0].real_hash,
+				ctx->pending_blocks[1].real_hash);
+	} else {
+		for (i = 0; i < ctx->num_pending; i++)
+			fsverity_hash_block(params, ctx->pending_blocks[i].data,
+					    ctx->pending_blocks[i].real_hash);
+	}
+
+	for (i = 0; i < ctx->num_pending; i++) {
+		if (!verify_data_block(ctx->inode, vi, &ctx->pending_blocks[i],
+				       ctx->max_ra_pages))
+			return false;
+	}
+	fsverity_clear_pending_blocks(ctx);
+	return true;
+}
+
+static bool fsverity_add_data_blocks(struct fsverity_verification_context *ctx,
+				     struct folio *data_folio, size_t len,
+				     size_t offset)
+{
+	struct fsverity_info *vi = ctx->vi;
+	const struct merkle_tree_params *params = &vi->tree_params;
+	const unsigned int block_size = params->block_size;
 	u64 pos = (u64)data_folio->index << PAGE_SHIFT;
 
 	if (WARN_ON_ONCE(len <= 0 || !IS_ALIGNED(len | offset, block_size)))
@@ -255,14 +345,11 @@ verify_data_blocks(struct folio *data_folio, size_t len, size_t offset,
 			 folio_test_uptodate(data_folio)))
 		return false;
 	do {
-		void *data;
-		bool valid;
-
-		data = kmap_local_folio(data_folio, offset);
-		valid = verify_data_block(inode, vi, data, pos + offset,
-					  max_ra_pages);
-		kunmap_local(data);
-		if (!valid)
+		ctx->pending_blocks[ctx->num_pending].data =
+			kmap_local_folio(data_folio, offset);
+		ctx->pending_blocks[ctx->num_pending].pos = pos + offset;
+		if (++ctx->num_pending == ctx->max_pending &&
+		    !fsverity_verify_pending_blocks(ctx))
 			return false;
 		offset += block_size;
 		len -= block_size;
@@ -284,7 +371,15 @@ verify_data_blocks(struct folio *data_folio, size_t len, size_t offset,
  */
 bool fsverity_verify_blocks(struct folio *folio, size_t len, size_t offset)
 {
-	return verify_data_blocks(folio, len, offset, 0);
+	struct fsverity_verification_context ctx;
+
+	fsverity_init_verification_context(&ctx, folio->mapping->host, 0);
+
+	if (fsverity_add_data_blocks(&ctx, folio, len, offset) &&
+	    fsverity_verify_pending_blocks(&ctx))
+		return true;
+	fsverity_clear_pending_blocks(&ctx);
+	return false;
 }
 EXPORT_SYMBOL_GPL(fsverity_verify_blocks);
 
@@ -305,6 +400,8 @@ EXPORT_SYMBOL_GPL(fsverity_verify_blocks);
  */
 void fsverity_verify_bio(struct bio *bio)
 {
+	struct inode *inode = bio_first_folio_all(bio)->mapping->host;
+	struct fsverity_verification_context ctx;
 	struct folio_iter fi;
 	unsigned long max_ra_pages = 0;
 
@@ -321,13 +418,21 @@ void fsverity_verify_bio(struct bio *bio)
 		max_ra_pages = bio->bi_iter.bi_size >> (PAGE_SHIFT + 2);
 	}
 
+	fsverity_init_verification_context(&ctx, inode, max_ra_pages);
+
 	bio_for_each_folio_all(fi, bio) {
-		if (!verify_data_blocks(fi.folio, fi.length, fi.offset,
-					max_ra_pages)) {
-			bio->bi_status = BLK_STS_IOERR;
-			break;
-		}
+		if (!fsverity_add_data_blocks(&ctx, fi.folio, fi.length,
+					      fi.offset))
+			goto ioerr;
 	}
+
+	if (!fsverity_verify_pending_blocks(&ctx))
+		goto ioerr;
+	return;
+
+ioerr:
+	fsverity_clear_pending_blocks(&ctx);
+	bio->bi_status = BLK_STS_IOERR;
 }
 EXPORT_SYMBOL_GPL(fsverity_verify_bio);
 #endif /* CONFIG_BLOCK */

@@ -375,6 +375,34 @@ void sha256_final(struct sha256_ctx *ctx, u8 out[SHA256_DIGEST_SIZE]);
  */
 void sha256(const u8 *data, size_t len, u8 out[SHA256_DIGEST_SIZE]);
 
+/**
+ * sha256_finup_2x() - Compute two SHA-256 digests from a common initial
+ *		       context. On some CPUs, this is faster than sequentially
+ *		       computing each digest.
+ * @ctx: an optional initial context, which may have already processed data. If
+ *	 NULL, a default initial context is used (equivalent to sha256_init()).
+ * @data1: data for the first message
+ * @data2: data for the second message
+ * @len: the length of each of @data1 and @data2, in bytes
+ * @out1: (output) the first SHA-256 message digest
+ * @out2: (output) the second SHA-256 message digest
+ *
+ * Context: Any context.
+ */
+void sha256_finup_2x(const struct sha256_ctx *ctx, const u8 *data1,
+		     const u8 *data2, size_t len, u8 out1[SHA256_DIGEST_SIZE],
+		     u8 out2[SHA256_DIGEST_SIZE]);
+
+/**
+ * sha256_finup_2x_is_optimized() - Check if sha256_finup_2x() is using a real
+ *				    interleaved implementation, as opposed to a
+ *				    sequential fallback
+ * @return: true if optimized
+ *
+ * Context: Any context.
+ */
+bool sha256_finup_2x_is_optimized(void);
+
 /**
  * struct hmac_sha256_key - Prepared key for HMAC-SHA256
  * @key: private

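As a reading aid (not part of the patch): the contract documented above is that sha256_finup_2x() yields exactly the digests you would get by finishing two copies of the same context, one per message. A rough equivalent in terms of the existing one-shot API, assuming a non-NULL ctx, is sketched below; the sequential fallback added to the generic SHA-256 code later in this pull does the same thing using the internal struct __sha256_ctx.

/* Reference behaviour of sha256_finup_2x() for a non-NULL ctx (sketch only). */
static void sha256_finup_2x_reference(const struct sha256_ctx *ctx,
				      const u8 *data1, const u8 *data2,
				      size_t len, u8 out1[SHA256_DIGEST_SIZE],
				      u8 out2[SHA256_DIGEST_SIZE])
{
	struct sha256_ctx tmp;

	tmp = *ctx;			/* shared salt/prefix state */
	sha256_update(&tmp, data1, len);
	sha256_final(&tmp, out1);

	tmp = *ctx;			/* restart from the same state */
	sha256_update(&tmp, data2, len);
	sha256_final(&tmp, out2);
}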
@ -70,18 +70,22 @@
|
|||
.word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
|
||||
.word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
|
||||
|
||||
.macro load_round_constants tmp
|
||||
adr_l \tmp, .Lsha2_rcon
|
||||
ld1 { v0.4s- v3.4s}, [\tmp], #64
|
||||
ld1 { v4.4s- v7.4s}, [\tmp], #64
|
||||
ld1 { v8.4s-v11.4s}, [\tmp], #64
|
||||
ld1 {v12.4s-v15.4s}, [\tmp]
|
||||
.endm
|
||||
|
||||
/*
|
||||
* size_t __sha256_ce_transform(struct sha256_block_state *state,
|
||||
* const u8 *data, size_t nblocks);
|
||||
*/
|
||||
.text
|
||||
SYM_FUNC_START(__sha256_ce_transform)
|
||||
/* load round constants */
|
||||
adr_l x8, .Lsha2_rcon
|
||||
ld1 { v0.4s- v3.4s}, [x8], #64
|
||||
ld1 { v4.4s- v7.4s}, [x8], #64
|
||||
ld1 { v8.4s-v11.4s}, [x8], #64
|
||||
ld1 {v12.4s-v15.4s}, [x8]
|
||||
|
||||
load_round_constants x8
|
||||
|
||||
/* load state */
|
||||
ld1 {dgav.4s, dgbv.4s}, [x0]
|
||||
|
@ -134,3 +138,271 @@ CPU_LE( rev32 v19.16b, v19.16b )
|
|||
mov x0, x2
|
||||
ret
|
||||
SYM_FUNC_END(__sha256_ce_transform)
|
||||
|
||||
.unreq dga
|
||||
.unreq dgav
|
||||
.unreq dgb
|
||||
.unreq dgbv
|
||||
.unreq t0
|
||||
.unreq t1
|
||||
.unreq dg0q
|
||||
.unreq dg0v
|
||||
.unreq dg1q
|
||||
.unreq dg1v
|
||||
.unreq dg2q
|
||||
.unreq dg2v
|
||||
|
||||
// parameters for sha256_ce_finup2x()
|
||||
ctx .req x0
|
||||
data1 .req x1
|
||||
data2 .req x2
|
||||
len .req w3
|
||||
out1 .req x4
|
||||
out2 .req x5
|
||||
|
||||
// other scalar variables
|
||||
count .req x6
|
||||
final_step .req w7
|
||||
|
||||
// x8-x9 are used as temporaries.
|
||||
|
||||
// v0-v15 are used to cache the SHA-256 round constants.
|
||||
// v16-v19 are used for the message schedule for the first message.
|
||||
// v20-v23 are used for the message schedule for the second message.
|
||||
// v24-v31 are used for the state and temporaries as given below.
|
||||
// *_a are for the first message and *_b for the second.
|
||||
state0_a_q .req q24
|
||||
state0_a .req v24
|
||||
state1_a_q .req q25
|
||||
state1_a .req v25
|
||||
state0_b_q .req q26
|
||||
state0_b .req v26
|
||||
state1_b_q .req q27
|
||||
state1_b .req v27
|
||||
t0_a .req v28
|
||||
t0_b .req v29
|
||||
t1_a_q .req q30
|
||||
t1_a .req v30
|
||||
t1_b_q .req q31
|
||||
t1_b .req v31
|
||||
|
||||
#define OFFSETOF_BYTECOUNT 32 // offsetof(struct __sha256_ctx, bytecount)
|
||||
#define OFFSETOF_BUF 40 // offsetof(struct __sha256_ctx, buf)
|
||||
// offsetof(struct __sha256_ctx, state) is assumed to be 0.
|
||||
|
||||
// Do 4 rounds of SHA-256 for each of two messages (interleaved). m0_a
|
||||
// and m0_b contain the current 4 message schedule words for the first
|
||||
// and second message respectively.
|
||||
//
|
||||
// If not all the message schedule words have been computed yet, then
|
||||
// this also computes 4 more message schedule words for each message.
|
||||
// m1_a-m3_a contain the next 3 groups of 4 message schedule words for
|
||||
// the first message, and likewise m1_b-m3_b for the second. After
|
||||
// consuming the current value of m0_a, this macro computes the group
|
||||
// after m3_a and writes it to m0_a, and likewise for *_b. This means
|
||||
// that the next (m0_a, m1_a, m2_a, m3_a) is the current (m1_a, m2_a,
|
||||
// m3_a, m0_a), and likewise for *_b, so the caller must cycle through
|
||||
// the registers accordingly.
|
||||
.macro do_4rounds_2x i, k, m0_a, m1_a, m2_a, m3_a, \
|
||||
m0_b, m1_b, m2_b, m3_b
|
||||
add t0_a\().4s, \m0_a\().4s, \k\().4s
|
||||
add t0_b\().4s, \m0_b\().4s, \k\().4s
|
||||
.if \i < 48
|
||||
sha256su0 \m0_a\().4s, \m1_a\().4s
|
||||
sha256su0 \m0_b\().4s, \m1_b\().4s
|
||||
sha256su1 \m0_a\().4s, \m2_a\().4s, \m3_a\().4s
|
||||
sha256su1 \m0_b\().4s, \m2_b\().4s, \m3_b\().4s
|
||||
.endif
|
||||
mov t1_a.16b, state0_a.16b
|
||||
mov t1_b.16b, state0_b.16b
|
||||
sha256h state0_a_q, state1_a_q, t0_a\().4s
|
||||
sha256h state0_b_q, state1_b_q, t0_b\().4s
|
||||
sha256h2 state1_a_q, t1_a_q, t0_a\().4s
|
||||
sha256h2 state1_b_q, t1_b_q, t0_b\().4s
|
||||
.endm
|
||||
|
||||
.macro do_16rounds_2x i, k0, k1, k2, k3
|
||||
do_4rounds_2x \i + 0, \k0, v16, v17, v18, v19, v20, v21, v22, v23
|
||||
do_4rounds_2x \i + 4, \k1, v17, v18, v19, v16, v21, v22, v23, v20
|
||||
do_4rounds_2x \i + 8, \k2, v18, v19, v16, v17, v22, v23, v20, v21
|
||||
do_4rounds_2x \i + 12, \k3, v19, v16, v17, v18, v23, v20, v21, v22
|
||||
.endm
|
||||
|
||||
//
|
||||
// void sha256_ce_finup2x(const struct __sha256_ctx *ctx,
|
||||
// const u8 *data1, const u8 *data2, int len,
|
||||
// u8 out1[SHA256_DIGEST_SIZE],
|
||||
// u8 out2[SHA256_DIGEST_SIZE]);
|
||||
//
|
||||
// This function computes the SHA-256 digests of two messages |data1| and
|
||||
// |data2| that are both |len| bytes long, starting from the initial context
|
||||
// |ctx|. |len| must be at least SHA256_BLOCK_SIZE.
|
||||
//
|
||||
// The instructions for the two SHA-256 operations are interleaved. On many
|
||||
// CPUs, this is almost twice as fast as hashing each message individually due
|
||||
// to taking better advantage of the CPU's SHA-256 and SIMD throughput.
|
||||
//
|
||||
SYM_FUNC_START(sha256_ce_finup2x)
|
||||
sub sp, sp, #128
|
||||
mov final_step, #0
|
||||
load_round_constants x8
|
||||
|
||||
// Load the initial state from ctx->state.
|
||||
ld1 {state0_a.4s-state1_a.4s}, [ctx]
|
||||
|
||||
// Load ctx->bytecount. Take the mod 64 of it to get the number of
|
||||
// bytes that are buffered in ctx->buf. Also save it in a register with
|
||||
// len added to it.
|
||||
ldr x8, [ctx, #OFFSETOF_BYTECOUNT]
|
||||
add count, x8, len, sxtw
|
||||
and x8, x8, #63
|
||||
cbz x8, .Lfinup2x_enter_loop // No bytes buffered?
|
||||
|
||||
// x8 bytes (1 to 63) are currently buffered in ctx->buf. Load them
|
||||
// followed by the first 64 - x8 bytes of data. Since len >= 64, we
|
||||
// just load 64 bytes from each of ctx->buf, data1, and data2
|
||||
// unconditionally and rearrange the data as needed.
|
||||
add x9, ctx, #OFFSETOF_BUF
|
||||
ld1 {v16.16b-v19.16b}, [x9]
|
||||
st1 {v16.16b-v19.16b}, [sp]
|
||||
|
||||
ld1 {v16.16b-v19.16b}, [data1], #64
|
||||
add x9, sp, x8
|
||||
st1 {v16.16b-v19.16b}, [x9]
|
||||
ld1 {v16.4s-v19.4s}, [sp]
|
||||
|
||||
ld1 {v20.16b-v23.16b}, [data2], #64
|
||||
st1 {v20.16b-v23.16b}, [x9]
|
||||
ld1 {v20.4s-v23.4s}, [sp]
|
||||
|
||||
sub len, len, #64
|
||||
sub data1, data1, x8
|
||||
sub data2, data2, x8
|
||||
add len, len, w8
|
||||
mov state0_b.16b, state0_a.16b
|
||||
mov state1_b.16b, state1_a.16b
|
||||
b .Lfinup2x_loop_have_data
|
||||
|
||||
.Lfinup2x_enter_loop:
|
||||
sub len, len, #64
|
||||
mov state0_b.16b, state0_a.16b
|
||||
mov state1_b.16b, state1_a.16b
|
||||
.Lfinup2x_loop:
|
||||
// Load the next two data blocks.
|
||||
ld1 {v16.4s-v19.4s}, [data1], #64
|
||||
ld1 {v20.4s-v23.4s}, [data2], #64
|
||||
.Lfinup2x_loop_have_data:
|
||||
// Convert the words of the data blocks from big endian.
|
||||
CPU_LE( rev32 v16.16b, v16.16b )
|
||||
CPU_LE( rev32 v17.16b, v17.16b )
|
||||
CPU_LE( rev32 v18.16b, v18.16b )
|
||||
CPU_LE( rev32 v19.16b, v19.16b )
|
||||
CPU_LE( rev32 v20.16b, v20.16b )
|
||||
CPU_LE( rev32 v21.16b, v21.16b )
|
||||
CPU_LE( rev32 v22.16b, v22.16b )
|
||||
CPU_LE( rev32 v23.16b, v23.16b )
|
||||
.Lfinup2x_loop_have_bswapped_data:
|
||||
|
||||
// Save the original state for each block.
|
||||
st1 {state0_a.4s-state1_b.4s}, [sp]
|
||||
|
||||
// Do the SHA-256 rounds on each block.
|
||||
do_16rounds_2x 0, v0, v1, v2, v3
|
||||
do_16rounds_2x 16, v4, v5, v6, v7
|
||||
do_16rounds_2x 32, v8, v9, v10, v11
|
||||
do_16rounds_2x 48, v12, v13, v14, v15
|
||||
|
||||
// Add the original state for each block.
|
||||
ld1 {v16.4s-v19.4s}, [sp]
|
||||
add state0_a.4s, state0_a.4s, v16.4s
|
||||
add state1_a.4s, state1_a.4s, v17.4s
|
||||
add state0_b.4s, state0_b.4s, v18.4s
|
||||
add state1_b.4s, state1_b.4s, v19.4s
|
||||
|
||||
// Update len and loop back if more blocks remain.
|
||||
sub len, len, #64
|
||||
tbz len, #31, .Lfinup2x_loop // len >= 0?
|
||||
|
||||
// Check if any final blocks need to be handled.
|
||||
// final_step = 2: all done
|
||||
// final_step = 1: need to do count-only padding block
|
||||
// final_step = 0: need to do the block with 0x80 padding byte
|
||||
tbnz final_step, #1, .Lfinup2x_done
|
||||
tbnz final_step, #0, .Lfinup2x_finalize_countonly
|
||||
add len, len, #64
|
||||
cbz len, .Lfinup2x_finalize_blockaligned
|
||||
|
||||
// Not block-aligned; 1 <= len <= 63 data bytes remain. Pad the block.
|
||||
// To do this, write the padding starting with the 0x80 byte to
|
||||
// &sp[64]. Then for each message, copy the last 64 data bytes to sp
|
||||
// and load from &sp[64 - len] to get the needed padding block. This
|
||||
// code relies on the data buffers being >= 64 bytes in length.
|
||||
sub w8, len, #64 // w8 = len - 64
|
||||
add data1, data1, w8, sxtw // data1 += len - 64
|
||||
add data2, data2, w8, sxtw // data2 += len - 64
|
||||
CPU_LE( mov x9, #0x80 )
|
||||
CPU_LE( fmov d16, x9 )
|
||||
CPU_BE( movi v16.16b, #0 )
|
||||
CPU_BE( mov x9, #0x8000000000000000 )
|
||||
CPU_BE( mov v16.d[1], x9 )
|
||||
movi v17.16b, #0
|
||||
stp q16, q17, [sp, #64]
|
||||
stp q17, q17, [sp, #96]
|
||||
sub x9, sp, w8, sxtw // x9 = &sp[64 - len]
|
||||
cmp len, #56
|
||||
b.ge 1f // will count spill into its own block?
|
||||
lsl count, count, #3
|
||||
CPU_LE( rev count, count )
|
||||
str count, [x9, #56]
|
||||
mov final_step, #2 // won't need count-only block
|
||||
b 2f
|
||||
1:
|
||||
mov final_step, #1 // will need count-only block
|
||||
2:
|
||||
ld1 {v16.16b-v19.16b}, [data1]
|
||||
st1 {v16.16b-v19.16b}, [sp]
|
||||
ld1 {v16.4s-v19.4s}, [x9]
|
||||
ld1 {v20.16b-v23.16b}, [data2]
|
||||
st1 {v20.16b-v23.16b}, [sp]
|
||||
ld1 {v20.4s-v23.4s}, [x9]
|
||||
b .Lfinup2x_loop_have_data
|
||||
|
||||
// Prepare a padding block, either:
|
||||
//
|
||||
// {0x80, 0, 0, 0, ..., count (as __be64)}
|
||||
// This is for a block aligned message.
|
||||
//
|
||||
// { 0, 0, 0, 0, ..., count (as __be64)}
|
||||
// This is for a message whose length mod 64 is >= 56.
|
||||
//
|
||||
// Pre-swap the endianness of the words.
|
||||
.Lfinup2x_finalize_countonly:
|
||||
movi v16.2d, #0
|
||||
b 1f
|
||||
.Lfinup2x_finalize_blockaligned:
|
||||
mov x8, #0x80000000
|
||||
fmov d16, x8
|
||||
1:
|
||||
movi v17.2d, #0
|
||||
movi v18.2d, #0
|
||||
ror count, count, #29 // ror(lsl(count, 3), 32)
|
||||
mov v19.d[0], xzr
|
||||
mov v19.d[1], count
|
||||
mov v20.16b, v16.16b
|
||||
movi v21.2d, #0
|
||||
movi v22.2d, #0
|
||||
mov v23.16b, v19.16b
|
||||
mov final_step, #2
|
||||
b .Lfinup2x_loop_have_bswapped_data
|
||||
|
||||
.Lfinup2x_done:
|
||||
// Write the two digests with all bytes in the correct order.
|
||||
CPU_LE( rev32 state0_a.16b, state0_a.16b )
|
||||
CPU_LE( rev32 state1_a.16b, state1_a.16b )
|
||||
CPU_LE( rev32 state0_b.16b, state0_b.16b )
|
||||
CPU_LE( rev32 state1_b.16b, state1_b.16b )
|
||||
st1 {state0_a.4s-state1_a.4s}, [out1]
|
||||
st1 {state0_b.4s-state1_b.4s}, [out2]
|
||||
add sp, sp, #128
|
||||
ret
|
||||
SYM_FUNC_END(sha256_ce_finup2x)
|
||||
|
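A note on the final_step handling used by sha256_ce_finup2x() above (and by the x86 version later in this series): it is ordinary SHA-256 finalization, just performed for two messages at once. The message is terminated with a 0x80 byte, zero padding, and the total length in bits as a big-endian 64-bit value; when the data already occupies 56 or more bytes of the last block, the length no longer fits and must spill into an extra, count-only padding block. The helper below is an illustrative sketch of that rule only, not code from this series.

/* Illustrative only: how many bytes SHA-256 processes after padding. */
static u64 sha256_padded_len(u64 total_bytes)
{
	u64 last = total_bytes % 64;	/* data bytes in the final block */

	/*
	 * The 0x80 terminator plus the 8-byte big-endian bit count need 9
	 * bytes; if they do not fit after the data, a second, count-only
	 * padding block is required (final_step = 1 in the assembly above).
	 */
	if (last >= 56)
		return total_bytes - last + 128;
	return total_bytes - last + 64;
}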
|
|
@ -44,6 +44,43 @@ static void sha256_blocks(struct sha256_block_state *state,
|
|||
}
|
||||
}
|
||||
|
||||
static_assert(offsetof(struct __sha256_ctx, state) == 0);
|
||||
static_assert(offsetof(struct __sha256_ctx, bytecount) == 32);
|
||||
static_assert(offsetof(struct __sha256_ctx, buf) == 40);
|
||||
asmlinkage void sha256_ce_finup2x(const struct __sha256_ctx *ctx,
|
||||
const u8 *data1, const u8 *data2, int len,
|
||||
u8 out1[SHA256_DIGEST_SIZE],
|
||||
u8 out2[SHA256_DIGEST_SIZE]);
|
||||
|
||||
#define sha256_finup_2x_arch sha256_finup_2x_arch
|
||||
static bool sha256_finup_2x_arch(const struct __sha256_ctx *ctx,
|
||||
const u8 *data1, const u8 *data2, size_t len,
|
||||
u8 out1[SHA256_DIGEST_SIZE],
|
||||
u8 out2[SHA256_DIGEST_SIZE])
|
||||
{
|
||||
/*
|
||||
* The assembly requires len >= SHA256_BLOCK_SIZE && len <= INT_MAX.
|
||||
* Further limit len to 65536 to avoid spending too long with preemption
|
||||
* disabled. (Of course, in practice len is nearly always 4096 anyway.)
|
||||
*/
|
||||
if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
|
||||
static_branch_likely(&have_ce) && len >= SHA256_BLOCK_SIZE &&
|
||||
len <= 65536 && likely(may_use_simd())) {
|
||||
kernel_neon_begin();
|
||||
sha256_ce_finup2x(ctx, data1, data2, len, out1, out2);
|
||||
kernel_neon_end();
|
||||
kmsan_unpoison_memory(out1, SHA256_DIGEST_SIZE);
|
||||
kmsan_unpoison_memory(out2, SHA256_DIGEST_SIZE);
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
static bool sha256_finup_2x_is_optimized_arch(void)
|
||||
{
|
||||
return static_key_enabled(&have_ce);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_KERNEL_MODE_NEON
|
||||
#define sha256_mod_init_arch sha256_mod_init_arch
|
||||
static void sha256_mod_init_arch(void)
|
||||
|
|
|
@@ -25,13 +25,20 @@ static const struct sha256_block_state sha224_iv = {
 	},
 };
 
-static const struct sha256_block_state sha256_iv = {
-	.h = {
-		SHA256_H0, SHA256_H1, SHA256_H2, SHA256_H3,
-		SHA256_H4, SHA256_H5, SHA256_H6, SHA256_H7,
+static const struct sha256_ctx initial_sha256_ctx = {
+	.ctx = {
+		.state = {
+			.h = {
+				SHA256_H0, SHA256_H1, SHA256_H2, SHA256_H3,
+				SHA256_H4, SHA256_H5, SHA256_H6, SHA256_H7,
+			},
+		},
+		.bytecount = 0,
 	},
 };
 
+#define sha256_iv	(initial_sha256_ctx.ctx.state)
+
 static const u32 sha256_K[64] = {
 	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1,
 	0x923f82a4, 0xab1c5ed5, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
@@ -261,8 +268,62 @@ void sha256(const u8 *data, size_t len, u8 out[SHA256_DIGEST_SIZE])
 }
 EXPORT_SYMBOL(sha256);
 
-/* pre-boot environment (as indicated by __DISABLE_EXPORTS) doesn't need HMAC */
+/*
+ * Pre-boot environment (as indicated by __DISABLE_EXPORTS being defined)
+ * doesn't need either HMAC support or interleaved hashing support
+ */
 #ifndef __DISABLE_EXPORTS
+
+#ifndef sha256_finup_2x_arch
+static bool sha256_finup_2x_arch(const struct __sha256_ctx *ctx,
+				 const u8 *data1, const u8 *data2, size_t len,
+				 u8 out1[SHA256_DIGEST_SIZE],
+				 u8 out2[SHA256_DIGEST_SIZE])
+{
+	return false;
+}
+static bool sha256_finup_2x_is_optimized_arch(void)
+{
+	return false;
+}
+#endif
+
+/* Sequential fallback implementation of sha256_finup_2x() */
+static noinline_for_stack void sha256_finup_2x_sequential(
+	const struct __sha256_ctx *ctx, const u8 *data1, const u8 *data2,
+	size_t len, u8 out1[SHA256_DIGEST_SIZE], u8 out2[SHA256_DIGEST_SIZE])
+{
+	struct __sha256_ctx mut_ctx;
+
+	mut_ctx = *ctx;
+	__sha256_update(&mut_ctx, data1, len);
+	__sha256_final(&mut_ctx, out1, SHA256_DIGEST_SIZE);
+
+	mut_ctx = *ctx;
+	__sha256_update(&mut_ctx, data2, len);
+	__sha256_final(&mut_ctx, out2, SHA256_DIGEST_SIZE);
+}
+
+void sha256_finup_2x(const struct sha256_ctx *ctx, const u8 *data1,
+		     const u8 *data2, size_t len, u8 out1[SHA256_DIGEST_SIZE],
+		     u8 out2[SHA256_DIGEST_SIZE])
+{
+	if (ctx == NULL)
+		ctx = &initial_sha256_ctx;
+
+	if (likely(sha256_finup_2x_arch(&ctx->ctx, data1, data2, len, out1,
+					out2)))
+		return;
+	sha256_finup_2x_sequential(&ctx->ctx, data1, data2, len, out1, out2);
+}
+EXPORT_SYMBOL_GPL(sha256_finup_2x);
+
+bool sha256_finup_2x_is_optimized(void)
+{
+	return sha256_finup_2x_is_optimized_arch();
+}
+EXPORT_SYMBOL_GPL(sha256_finup_2x_is_optimized);
+
 static void __hmac_sha256_preparekey(struct sha256_block_state *istate,
 				     struct sha256_block_state *ostate,
 				     const u8 *raw_key, size_t raw_key_len,

@ -5,6 +5,7 @@
|
|||
#include <crypto/sha2.h>
|
||||
#include "sha256-testvecs.h"
|
||||
|
||||
/* Generate the HASH_KUNIT_CASES using hash-test-template.h. */
|
||||
#define HASH sha256
|
||||
#define HASH_CTX sha256_ctx
|
||||
#define HASH_SIZE SHA256_DIGEST_SIZE
|
||||
|
@ -21,9 +22,192 @@
|
|||
#define HMAC_USINGRAWKEY hmac_sha256_usingrawkey
|
||||
#include "hash-test-template.h"
|
||||
|
||||
static void free_guarded_buf(void *buf)
|
||||
{
|
||||
vfree(buf);
|
||||
}
|
||||
|
||||
/*
|
||||
* Allocate a KUnit-managed buffer that has length @len bytes immediately
|
||||
* followed by an unmapped page, and assert that the allocation succeeds.
|
||||
*/
|
||||
static void *alloc_guarded_buf(struct kunit *test, size_t len)
|
||||
{
|
||||
size_t full_len = round_up(len, PAGE_SIZE);
|
||||
void *buf = vmalloc(full_len);
|
||||
|
||||
KUNIT_ASSERT_NOT_NULL(test, buf);
|
||||
KUNIT_ASSERT_EQ(test, 0,
|
||||
kunit_add_action_or_reset(test, free_guarded_buf, buf));
|
||||
return buf + full_len - len;
|
||||
}
|
||||
|
||||
/*
|
||||
* Test for sha256_finup_2x(). Specifically, choose various data lengths and
|
||||
* salt lengths, and for each one, verify that sha256_finup_2x() produces the
|
||||
* same results as sha256_update() and sha256_final().
|
||||
*
|
||||
* Use guarded buffers for all inputs and outputs to reliably detect any
|
||||
* out-of-bounds reads or writes, even if they occur in assembly code.
|
||||
*/
|
||||
static void test_sha256_finup_2x(struct kunit *test)
|
||||
{
|
||||
const size_t max_data_len = 16384;
|
||||
u8 *data1_buf, *data2_buf, *hash1, *hash2;
|
||||
u8 expected_hash1[SHA256_DIGEST_SIZE];
|
||||
u8 expected_hash2[SHA256_DIGEST_SIZE];
|
||||
u8 salt[SHA256_BLOCK_SIZE];
|
||||
struct sha256_ctx *ctx;
|
||||
|
||||
data1_buf = alloc_guarded_buf(test, max_data_len);
|
||||
data2_buf = alloc_guarded_buf(test, max_data_len);
|
||||
hash1 = alloc_guarded_buf(test, SHA256_DIGEST_SIZE);
|
||||
hash2 = alloc_guarded_buf(test, SHA256_DIGEST_SIZE);
|
||||
ctx = alloc_guarded_buf(test, sizeof(*ctx));
|
||||
|
||||
rand_bytes(data1_buf, max_data_len);
|
||||
rand_bytes(data2_buf, max_data_len);
|
||||
rand_bytes(salt, sizeof(salt));
|
||||
|
||||
for (size_t i = 0; i < 500; i++) {
|
||||
size_t salt_len = rand_length(sizeof(salt));
|
||||
size_t data_len = rand_length(max_data_len);
|
||||
const u8 *data1 = data1_buf + max_data_len - data_len;
|
||||
const u8 *data2 = data2_buf + max_data_len - data_len;
|
||||
struct sha256_ctx orig_ctx;
|
||||
|
||||
sha256_init(ctx);
|
||||
sha256_update(ctx, salt, salt_len);
|
||||
orig_ctx = *ctx;
|
||||
|
||||
sha256_finup_2x(ctx, data1, data2, data_len, hash1, hash2);
|
||||
KUNIT_ASSERT_MEMEQ_MSG(
|
||||
test, ctx, &orig_ctx, sizeof(*ctx),
|
||||
"sha256_finup_2x() modified its ctx argument");
|
||||
|
||||
sha256_update(ctx, data1, data_len);
|
||||
sha256_final(ctx, expected_hash1);
|
||||
sha256_update(&orig_ctx, data2, data_len);
|
||||
sha256_final(&orig_ctx, expected_hash2);
|
||||
KUNIT_ASSERT_MEMEQ_MSG(
|
||||
test, hash1, expected_hash1, SHA256_DIGEST_SIZE,
|
||||
"Wrong hash1 with salt_len=%zu data_len=%zu", salt_len,
|
||||
data_len);
|
||||
KUNIT_ASSERT_MEMEQ_MSG(
|
||||
test, hash2, expected_hash2, SHA256_DIGEST_SIZE,
|
||||
"Wrong hash2 with salt_len=%zu data_len=%zu", salt_len,
|
||||
data_len);
|
||||
}
|
||||
}
|
||||
|
||||
/* Test sha256_finup_2x() with ctx == NULL */
|
||||
static void test_sha256_finup_2x_defaultctx(struct kunit *test)
|
||||
{
|
||||
const size_t data_len = 128;
|
||||
struct sha256_ctx ctx;
|
||||
u8 hash1_a[SHA256_DIGEST_SIZE];
|
||||
u8 hash2_a[SHA256_DIGEST_SIZE];
|
||||
u8 hash1_b[SHA256_DIGEST_SIZE];
|
||||
u8 hash2_b[SHA256_DIGEST_SIZE];
|
||||
|
||||
rand_bytes(test_buf, 2 * data_len);
|
||||
|
||||
sha256_init(&ctx);
|
||||
sha256_finup_2x(&ctx, test_buf, &test_buf[data_len], data_len, hash1_a,
|
||||
hash2_a);
|
||||
|
||||
sha256_finup_2x(NULL, test_buf, &test_buf[data_len], data_len, hash1_b,
|
||||
hash2_b);
|
||||
|
||||
KUNIT_ASSERT_MEMEQ(test, hash1_a, hash1_b, SHA256_DIGEST_SIZE);
|
||||
KUNIT_ASSERT_MEMEQ(test, hash2_a, hash2_b, SHA256_DIGEST_SIZE);
|
||||
}
|
||||
|
||||
/*
|
||||
* Test that sha256_finup_2x() and sha256_update/final() produce consistent
|
||||
* results with total message lengths that require more than 32 bits.
|
||||
*/
|
||||
static void test_sha256_finup_2x_hugelen(struct kunit *test)
|
||||
{
|
||||
const size_t data_len = 4 * SHA256_BLOCK_SIZE;
|
||||
struct sha256_ctx ctx = {};
|
||||
u8 expected_hash[SHA256_DIGEST_SIZE];
|
||||
u8 hash[SHA256_DIGEST_SIZE];
|
||||
|
||||
rand_bytes(test_buf, data_len);
|
||||
for (size_t align = 0; align < SHA256_BLOCK_SIZE; align++) {
|
||||
sha256_init(&ctx);
|
||||
ctx.ctx.bytecount = 0x123456789abcd00 + align;
|
||||
|
||||
sha256_finup_2x(&ctx, test_buf, test_buf, data_len, hash, hash);
|
||||
|
||||
sha256_update(&ctx, test_buf, data_len);
|
||||
sha256_final(&ctx, expected_hash);
|
||||
|
||||
KUNIT_ASSERT_MEMEQ(test, hash, expected_hash,
|
||||
SHA256_DIGEST_SIZE);
|
||||
}
|
||||
}
|
||||
|
||||
/* Benchmark for sha256_finup_2x() */
|
||||
static void benchmark_sha256_finup_2x(struct kunit *test)
|
||||
{
|
||||
/*
|
||||
* Try a few different salt lengths, since sha256_finup_2x() performance
|
||||
* may vary slightly for the same data_len depending on how many bytes
|
||||
* were already processed in the initial context.
|
||||
*/
|
||||
static const size_t salt_lens_to_test[] = { 0, 32, 64 };
|
||||
const size_t data_len = 4096;
|
||||
const size_t num_iters = 4096;
|
||||
struct sha256_ctx ctx;
|
||||
u8 hash1[SHA256_DIGEST_SIZE];
|
||||
u8 hash2[SHA256_DIGEST_SIZE];
|
||||
|
||||
if (!IS_ENABLED(CONFIG_CRYPTO_LIB_BENCHMARK))
|
||||
kunit_skip(test, "not enabled");
|
||||
if (!sha256_finup_2x_is_optimized())
|
||||
kunit_skip(test, "not relevant");
|
||||
|
||||
rand_bytes(test_buf, data_len * 2);
|
||||
|
||||
/* Warm-up */
|
||||
for (size_t i = 0; i < num_iters; i++)
|
||||
sha256_finup_2x(NULL, &test_buf[0], &test_buf[data_len],
|
||||
data_len, hash1, hash2);
|
||||
|
||||
for (size_t i = 0; i < ARRAY_SIZE(salt_lens_to_test); i++) {
|
||||
size_t salt_len = salt_lens_to_test[i];
|
||||
u64 t0, t1;
|
||||
|
||||
/*
|
||||
* Prepare the initial context. The time to process the salt is
|
||||
* not measured; we're just interested in sha256_finup_2x().
|
||||
*/
|
||||
sha256_init(&ctx);
|
||||
sha256_update(&ctx, test_buf, salt_len);
|
||||
|
||||
preempt_disable();
|
||||
t0 = ktime_get_ns();
|
||||
for (size_t j = 0; j < num_iters; j++)
|
||||
sha256_finup_2x(&ctx, &test_buf[0], &test_buf[data_len],
|
||||
data_len, hash1, hash2);
|
||||
t1 = ktime_get_ns();
|
||||
preempt_enable();
|
||||
kunit_info(test, "data_len=%zu salt_len=%zu: %llu MB/s",
|
||||
data_len, salt_len,
|
||||
div64_u64((u64)data_len * 2 * num_iters * 1000,
|
||||
t1 - t0 ?: 1));
|
||||
}
|
||||
}
|
||||
|
||||
static struct kunit_case hash_test_cases[] = {
|
||||
HASH_KUNIT_CASES,
|
||||
KUNIT_CASE(test_sha256_finup_2x),
|
||||
KUNIT_CASE(test_sha256_finup_2x_defaultctx),
|
||||
KUNIT_CASE(test_sha256_finup_2x_hugelen),
|
||||
KUNIT_CASE(benchmark_hash),
|
||||
KUNIT_CASE(benchmark_sha256_finup_2x),
|
||||
{},
|
||||
};
|
||||
|
||||
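For reference, the MB/s figure printed by benchmark_sha256_finup_2x() above counts both messages: each call hashes 2 * data_len bytes, so the total is 2 * data_len * num_iters bytes, and bytes * 1000 / nanoseconds equals MB/s (1 MB/s = 10^6 bytes per 10^9 ns). As a worked example, with data_len = 4096 and num_iters = 4096, that is 33,554,432 bytes; if those iterations took 8,388,608 ns, the reported throughput would be 33,554,432 * 1000 / 8,388,608 = 4000 MB/s.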
|
|
|
@ -165,6 +165,374 @@ SYM_FUNC_START(sha256_ni_transform)
|
|||
RET
|
||||
SYM_FUNC_END(sha256_ni_transform)
|
||||
|
||||
#undef DIGEST_PTR
|
||||
#undef DATA_PTR
|
||||
#undef NUM_BLKS
|
||||
#undef SHA256CONSTANTS
|
||||
#undef MSG
|
||||
#undef STATE0
|
||||
#undef STATE1
|
||||
#undef MSG0
|
||||
#undef MSG1
|
||||
#undef MSG2
|
||||
#undef MSG3
|
||||
#undef TMP
|
||||
#undef SHUF_MASK
|
||||
#undef ABEF_SAVE
|
||||
#undef CDGH_SAVE
|
||||
|
||||
// parameters for sha256_ni_finup2x()
|
||||
#define CTX %rdi
|
||||
#define DATA1 %rsi
|
||||
#define DATA2 %rdx
|
||||
#define LEN %ecx
|
||||
#define LEN8 %cl
|
||||
#define LEN64 %rcx
|
||||
#define OUT1 %r8
|
||||
#define OUT2 %r9
|
||||
|
||||
// other scalar variables
|
||||
#define SHA256CONSTANTS %rax
|
||||
#define COUNT %r10
|
||||
#define COUNT32 %r10d
|
||||
#define FINAL_STEP %r11d
|
||||
|
||||
// rbx is used as a temporary.
|
||||
|
||||
#define MSG %xmm0 // sha256rnds2 implicit operand
|
||||
#define STATE0_A %xmm1
|
||||
#define STATE1_A %xmm2
|
||||
#define STATE0_B %xmm3
|
||||
#define STATE1_B %xmm4
|
||||
#define TMP_A %xmm5
|
||||
#define TMP_B %xmm6
|
||||
#define MSG0_A %xmm7
|
||||
#define MSG1_A %xmm8
|
||||
#define MSG2_A %xmm9
|
||||
#define MSG3_A %xmm10
|
||||
#define MSG0_B %xmm11
|
||||
#define MSG1_B %xmm12
|
||||
#define MSG2_B %xmm13
|
||||
#define MSG3_B %xmm14
|
||||
#define SHUF_MASK %xmm15
|
||||
|
||||
#define OFFSETOF_STATE 0 // offsetof(struct __sha256_ctx, state)
|
||||
#define OFFSETOF_BYTECOUNT 32 // offsetof(struct __sha256_ctx, bytecount)
|
||||
#define OFFSETOF_BUF 40 // offsetof(struct __sha256_ctx, buf)
|
||||
|
||||
// Do 4 rounds of SHA-256 for each of two messages (interleaved). m0_a and m0_b
|
||||
// contain the current 4 message schedule words for the first and second message
|
||||
// respectively.
|
||||
//
|
||||
// If not all the message schedule words have been computed yet, then this also
|
||||
// computes 4 more message schedule words for each message. m1_a-m3_a contain
|
||||
// the next 3 groups of 4 message schedule words for the first message, and
|
||||
// likewise m1_b-m3_b for the second. After consuming the current value of
|
||||
// m0_a, this macro computes the group after m3_a and writes it to m0_a, and
|
||||
// likewise for *_b. This means that the next (m0_a, m1_a, m2_a, m3_a) is the
|
||||
// current (m1_a, m2_a, m3_a, m0_a), and likewise for *_b, so the caller must
|
||||
// cycle through the registers accordingly.
|
||||
.macro do_4rounds_2x i, m0_a, m1_a, m2_a, m3_a, m0_b, m1_b, m2_b, m3_b
|
||||
movdqa (\i-32)*4(SHA256CONSTANTS), TMP_A
|
||||
movdqa TMP_A, TMP_B
|
||||
paddd \m0_a, TMP_A
|
||||
paddd \m0_b, TMP_B
|
||||
.if \i < 48
|
||||
sha256msg1 \m1_a, \m0_a
|
||||
sha256msg1 \m1_b, \m0_b
|
||||
.endif
|
||||
movdqa TMP_A, MSG
|
||||
sha256rnds2 STATE0_A, STATE1_A
|
||||
movdqa TMP_B, MSG
|
||||
sha256rnds2 STATE0_B, STATE1_B
|
||||
pshufd $0x0E, TMP_A, MSG
|
||||
sha256rnds2 STATE1_A, STATE0_A
|
||||
pshufd $0x0E, TMP_B, MSG
|
||||
sha256rnds2 STATE1_B, STATE0_B
|
||||
.if \i < 48
|
||||
movdqa \m3_a, TMP_A
|
||||
movdqa \m3_b, TMP_B
|
||||
palignr $4, \m2_a, TMP_A
|
||||
palignr $4, \m2_b, TMP_B
|
||||
paddd TMP_A, \m0_a
|
||||
paddd TMP_B, \m0_b
|
||||
sha256msg2 \m3_a, \m0_a
|
||||
sha256msg2 \m3_b, \m0_b
|
||||
.endif
|
||||
.endm
|
||||
|
||||
//
|
||||
// void sha256_ni_finup2x(const struct __sha256_ctx *ctx,
|
||||
// const u8 *data1, const u8 *data2, int len,
|
||||
// u8 out1[SHA256_DIGEST_SIZE],
|
||||
// u8 out2[SHA256_DIGEST_SIZE]);
|
||||
//
|
||||
// This function computes the SHA-256 digests of two messages |data1| and
|
||||
// |data2| that are both |len| bytes long, starting from the initial context
|
||||
// |ctx|. |len| must be at least SHA256_BLOCK_SIZE.
|
||||
//
|
||||
// The instructions for the two SHA-256 operations are interleaved. On many
|
||||
// CPUs, this is almost twice as fast as hashing each message individually due
|
||||
// to taking better advantage of the CPU's SHA-256 and SIMD throughput.
|
||||
//
|
||||
SYM_FUNC_START(sha256_ni_finup2x)
|
||||
// Allocate 128 bytes of stack space, 16-byte aligned.
|
||||
push %rbx
|
||||
push %rbp
|
||||
mov %rsp, %rbp
|
||||
sub $128, %rsp
|
||||
and $~15, %rsp
|
||||
|
||||
// Load the shuffle mask for swapping the endianness of 32-bit words.
|
||||
movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK
|
||||
|
||||
// Set up pointer to the round constants.
|
||||
lea K256+32*4(%rip), SHA256CONSTANTS
|
||||
|
||||
// Initially we're not processing the final blocks.
|
||||
xor FINAL_STEP, FINAL_STEP
|
||||
|
||||
// Load the initial state from ctx->state.
|
||||
movdqu OFFSETOF_STATE+0*16(CTX), STATE0_A // DCBA
|
||||
movdqu OFFSETOF_STATE+1*16(CTX), STATE1_A // HGFE
|
||||
movdqa STATE0_A, TMP_A
|
||||
punpcklqdq STATE1_A, STATE0_A // FEBA
|
||||
punpckhqdq TMP_A, STATE1_A // DCHG
|
||||
pshufd $0x1B, STATE0_A, STATE0_A // ABEF
|
||||
pshufd $0xB1, STATE1_A, STATE1_A // CDGH
|
||||
|
||||
// Load ctx->bytecount. Take the mod 64 of it to get the number of
|
||||
// bytes that are buffered in ctx->buf. Also save it in a register with
|
||||
// LEN added to it.
|
||||
mov LEN, LEN
|
||||
mov OFFSETOF_BYTECOUNT(CTX), %rbx
|
||||
lea (%rbx, LEN64, 1), COUNT
|
||||
and $63, %ebx
|
||||
jz .Lfinup2x_enter_loop // No bytes buffered?
|
||||
|
||||
// %ebx bytes (1 to 63) are currently buffered in ctx->buf. Load them
|
||||
// followed by the first 64 - %ebx bytes of data. Since LEN >= 64, we
|
||||
// just load 64 bytes from each of ctx->buf, DATA1, and DATA2
|
||||
// unconditionally and rearrange the data as needed.
|
||||
|
||||
movdqu OFFSETOF_BUF+0*16(CTX), MSG0_A
|
||||
movdqu OFFSETOF_BUF+1*16(CTX), MSG1_A
|
||||
movdqu OFFSETOF_BUF+2*16(CTX), MSG2_A
|
||||
movdqu OFFSETOF_BUF+3*16(CTX), MSG3_A
|
||||
movdqa MSG0_A, 0*16(%rsp)
|
||||
movdqa MSG1_A, 1*16(%rsp)
|
||||
movdqa MSG2_A, 2*16(%rsp)
|
||||
movdqa MSG3_A, 3*16(%rsp)
|
||||
|
||||
movdqu 0*16(DATA1), MSG0_A
|
||||
movdqu 1*16(DATA1), MSG1_A
|
||||
movdqu 2*16(DATA1), MSG2_A
|
||||
movdqu 3*16(DATA1), MSG3_A
|
||||
movdqu MSG0_A, 0*16(%rsp,%rbx)
|
||||
movdqu MSG1_A, 1*16(%rsp,%rbx)
|
||||
movdqu MSG2_A, 2*16(%rsp,%rbx)
|
||||
movdqu MSG3_A, 3*16(%rsp,%rbx)
|
||||
movdqa 0*16(%rsp), MSG0_A
|
||||
movdqa 1*16(%rsp), MSG1_A
|
||||
movdqa 2*16(%rsp), MSG2_A
|
||||
movdqa 3*16(%rsp), MSG3_A
|
||||
|
||||
movdqu 0*16(DATA2), MSG0_B
|
||||
movdqu 1*16(DATA2), MSG1_B
|
||||
movdqu 2*16(DATA2), MSG2_B
|
||||
movdqu 3*16(DATA2), MSG3_B
|
||||
movdqu MSG0_B, 0*16(%rsp,%rbx)
|
||||
movdqu MSG1_B, 1*16(%rsp,%rbx)
|
||||
movdqu MSG2_B, 2*16(%rsp,%rbx)
|
||||
movdqu MSG3_B, 3*16(%rsp,%rbx)
|
||||
movdqa 0*16(%rsp), MSG0_B
|
||||
movdqa 1*16(%rsp), MSG1_B
|
||||
movdqa 2*16(%rsp), MSG2_B
|
||||
movdqa 3*16(%rsp), MSG3_B
|
||||
|
||||
sub $64, %rbx // rbx = buffered - 64
|
||||
sub %rbx, DATA1 // DATA1 += 64 - buffered
|
||||
sub %rbx, DATA2 // DATA2 += 64 - buffered
|
||||
add %ebx, LEN // LEN += buffered - 64
|
||||
movdqa STATE0_A, STATE0_B
|
||||
movdqa STATE1_A, STATE1_B
|
||||
jmp .Lfinup2x_loop_have_data
|
||||
|
||||
.Lfinup2x_enter_loop:
|
||||
sub $64, LEN
|
||||
movdqa STATE0_A, STATE0_B
|
||||
movdqa STATE1_A, STATE1_B
|
||||
.Lfinup2x_loop:
|
||||
// Load the next two data blocks.
|
||||
movdqu 0*16(DATA1), MSG0_A
|
||||
movdqu 0*16(DATA2), MSG0_B
|
||||
movdqu 1*16(DATA1), MSG1_A
|
||||
movdqu 1*16(DATA2), MSG1_B
|
||||
movdqu 2*16(DATA1), MSG2_A
|
||||
movdqu 2*16(DATA2), MSG2_B
|
||||
movdqu 3*16(DATA1), MSG3_A
|
||||
movdqu 3*16(DATA2), MSG3_B
|
||||
add $64, DATA1
|
||||
add $64, DATA2
|
||||
.Lfinup2x_loop_have_data:
|
||||
// Convert the words of the data blocks from big endian.
|
||||
pshufb SHUF_MASK, MSG0_A
|
||||
pshufb SHUF_MASK, MSG0_B
|
||||
pshufb SHUF_MASK, MSG1_A
|
||||
pshufb SHUF_MASK, MSG1_B
|
||||
pshufb SHUF_MASK, MSG2_A
|
||||
pshufb SHUF_MASK, MSG2_B
|
||||
pshufb SHUF_MASK, MSG3_A
|
||||
pshufb SHUF_MASK, MSG3_B
|
||||
.Lfinup2x_loop_have_bswapped_data:
|
||||
|
||||
// Save the original state for each block.
|
||||
movdqa STATE0_A, 0*16(%rsp)
|
||||
movdqa STATE0_B, 1*16(%rsp)
|
||||
movdqa STATE1_A, 2*16(%rsp)
|
||||
movdqa STATE1_B, 3*16(%rsp)
|
||||
|
||||
// Do the SHA-256 rounds on each block.
|
||||
.irp i, 0, 16, 32, 48
|
||||
do_4rounds_2x (\i + 0), MSG0_A, MSG1_A, MSG2_A, MSG3_A, \
|
||||
MSG0_B, MSG1_B, MSG2_B, MSG3_B
|
||||
do_4rounds_2x (\i + 4), MSG1_A, MSG2_A, MSG3_A, MSG0_A, \
|
||||
MSG1_B, MSG2_B, MSG3_B, MSG0_B
|
||||
do_4rounds_2x (\i + 8), MSG2_A, MSG3_A, MSG0_A, MSG1_A, \
|
||||
MSG2_B, MSG3_B, MSG0_B, MSG1_B
|
||||
do_4rounds_2x (\i + 12), MSG3_A, MSG0_A, MSG1_A, MSG2_A, \
|
||||
MSG3_B, MSG0_B, MSG1_B, MSG2_B
|
||||
.endr
|
||||
|
||||
// Add the original state for each block.
|
||||
paddd 0*16(%rsp), STATE0_A
|
||||
paddd 1*16(%rsp), STATE0_B
|
||||
paddd 2*16(%rsp), STATE1_A
|
||||
paddd 3*16(%rsp), STATE1_B
|
||||
|
||||
// Update LEN and loop back if more blocks remain.
|
||||
sub $64, LEN
|
||||
jge .Lfinup2x_loop
|
||||
|
||||
// Check if any final blocks need to be handled.
|
||||
// FINAL_STEP = 2: all done
|
||||
// FINAL_STEP = 1: need to do count-only padding block
|
||||
// FINAL_STEP = 0: need to do the block with 0x80 padding byte
|
||||
cmp $1, FINAL_STEP
|
||||
jg .Lfinup2x_done
|
||||
je .Lfinup2x_finalize_countonly
|
||||
add $64, LEN
|
||||
jz .Lfinup2x_finalize_blockaligned
|
||||
|
||||
// Not block-aligned; 1 <= LEN <= 63 data bytes remain. Pad the block.
|
||||
// To do this, write the padding starting with the 0x80 byte to
|
||||
// &sp[64]. Then for each message, copy the last 64 data bytes to sp
|
||||
// and load from &sp[64 - LEN] to get the needed padding block. This
|
||||
// code relies on the data buffers being >= 64 bytes in length.
|
||||
mov $64, %ebx
|
||||
sub LEN, %ebx // ebx = 64 - LEN
|
||||
sub %rbx, DATA1 // DATA1 -= 64 - LEN
|
||||
sub %rbx, DATA2 // DATA2 -= 64 - LEN
|
||||
mov $0x80, FINAL_STEP // using FINAL_STEP as a temporary
|
||||
movd FINAL_STEP, MSG0_A
|
||||
pxor MSG1_A, MSG1_A
|
||||
movdqa MSG0_A, 4*16(%rsp)
|
||||
movdqa MSG1_A, 5*16(%rsp)
|
||||
movdqa MSG1_A, 6*16(%rsp)
|
||||
movdqa MSG1_A, 7*16(%rsp)
|
||||
cmp $56, LEN
|
||||
jge 1f // will COUNT spill into its own block?
|
||||
shl $3, COUNT
|
||||
bswap COUNT
|
||||
mov COUNT, 56(%rsp,%rbx)
|
||||
mov $2, FINAL_STEP // won't need count-only block
|
||||
jmp 2f
|
||||
1:
|
||||
mov $1, FINAL_STEP // will need count-only block
|
||||
2:
|
||||
movdqu 0*16(DATA1), MSG0_A
|
||||
movdqu 1*16(DATA1), MSG1_A
|
||||
movdqu 2*16(DATA1), MSG2_A
|
||||
movdqu 3*16(DATA1), MSG3_A
|
||||
movdqa MSG0_A, 0*16(%rsp)
|
||||
movdqa MSG1_A, 1*16(%rsp)
|
||||
movdqa MSG2_A, 2*16(%rsp)
|
||||
movdqa MSG3_A, 3*16(%rsp)
|
||||
movdqu 0*16(%rsp,%rbx), MSG0_A
|
||||
movdqu 1*16(%rsp,%rbx), MSG1_A
|
||||
movdqu 2*16(%rsp,%rbx), MSG2_A
|
||||
movdqu 3*16(%rsp,%rbx), MSG3_A
|
||||
|
||||
movdqu 0*16(DATA2), MSG0_B
|
||||
movdqu 1*16(DATA2), MSG1_B
|
||||
movdqu 2*16(DATA2), MSG2_B
|
||||
movdqu 3*16(DATA2), MSG3_B
|
||||
movdqa MSG0_B, 0*16(%rsp)
|
||||
movdqa MSG1_B, 1*16(%rsp)
|
||||
movdqa MSG2_B, 2*16(%rsp)
|
||||
movdqa MSG3_B, 3*16(%rsp)
|
||||
movdqu 0*16(%rsp,%rbx), MSG0_B
|
||||
movdqu 1*16(%rsp,%rbx), MSG1_B
|
||||
movdqu 2*16(%rsp,%rbx), MSG2_B
|
||||
movdqu 3*16(%rsp,%rbx), MSG3_B
|
||||
jmp .Lfinup2x_loop_have_data
|
||||
|
||||
// Prepare a padding block, either:
|
||||
//
|
||||
// {0x80, 0, 0, 0, ..., count (as __be64)}
|
||||
// This is for a block aligned message.
|
||||
//
|
||||
// { 0, 0, 0, 0, ..., count (as __be64)}
|
||||
// This is for a message whose length mod 64 is >= 56.
|
||||
//
|
||||
// Pre-swap the endianness of the words.
|
||||
.Lfinup2x_finalize_countonly:
|
||||
pxor MSG0_A, MSG0_A
|
||||
jmp 1f
|
||||
|
||||
.Lfinup2x_finalize_blockaligned:
|
||||
mov $0x80000000, %ebx
|
||||
movd %ebx, MSG0_A
|
||||
1:
|
||||
pxor MSG1_A, MSG1_A
|
||||
pxor MSG2_A, MSG2_A
|
||||
ror $29, COUNT
|
||||
movq COUNT, MSG3_A
|
||||
pslldq $8, MSG3_A
|
||||
movdqa MSG0_A, MSG0_B
|
||||
pxor MSG1_B, MSG1_B
|
||||
pxor MSG2_B, MSG2_B
|
||||
movdqa MSG3_A, MSG3_B
|
||||
mov $2, FINAL_STEP
|
||||
jmp .Lfinup2x_loop_have_bswapped_data
|
||||
|
||||
.Lfinup2x_done:
|
||||
// Write the two digests with all bytes in the correct order.
|
||||
movdqa STATE0_A, TMP_A
|
||||
movdqa STATE0_B, TMP_B
|
||||
punpcklqdq STATE1_A, STATE0_A // GHEF
|
||||
punpcklqdq STATE1_B, STATE0_B
|
||||
punpckhqdq TMP_A, STATE1_A // ABCD
|
||||
punpckhqdq TMP_B, STATE1_B
|
||||
pshufd $0xB1, STATE0_A, STATE0_A // HGFE
|
||||
pshufd $0xB1, STATE0_B, STATE0_B
|
||||
pshufd $0x1B, STATE1_A, STATE1_A // DCBA
|
||||
pshufd $0x1B, STATE1_B, STATE1_B
|
||||
pshufb SHUF_MASK, STATE0_A
|
||||
pshufb SHUF_MASK, STATE0_B
|
||||
pshufb SHUF_MASK, STATE1_A
|
||||
pshufb SHUF_MASK, STATE1_B
|
||||
movdqu STATE0_A, 1*16(OUT1)
|
||||
movdqu STATE0_B, 1*16(OUT2)
|
||||
movdqu STATE1_A, 0*16(OUT1)
|
||||
movdqu STATE1_B, 0*16(OUT2)
|
||||
|
||||
mov %rbp, %rsp
|
||||
pop %rbp
|
||||
pop %rbx
|
||||
RET
|
||||
SYM_FUNC_END(sha256_ni_finup2x)
|
||||
|
||||
.section .rodata.cst256.K256, "aM", @progbits, 256
|
||||
.align 64
|
||||
K256:
|
||||
|
|
|
@ -7,6 +7,8 @@
|
|||
#include <asm/fpu/api.h>
|
||||
#include <linux/static_call.h>
|
||||
|
||||
static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha_ni);
|
||||
|
||||
DEFINE_STATIC_CALL(sha256_blocks_x86, sha256_blocks_generic);
|
||||
|
||||
#define DEFINE_X86_SHA256_FN(c_fn, asm_fn) \
|
||||
|
@ -35,11 +37,48 @@ static void sha256_blocks(struct sha256_block_state *state,
|
|||
static_call(sha256_blocks_x86)(state, data, nblocks);
|
||||
}
|
||||
|
||||
static_assert(offsetof(struct __sha256_ctx, state) == 0);
|
||||
static_assert(offsetof(struct __sha256_ctx, bytecount) == 32);
|
||||
static_assert(offsetof(struct __sha256_ctx, buf) == 40);
|
||||
asmlinkage void sha256_ni_finup2x(const struct __sha256_ctx *ctx,
|
||||
const u8 *data1, const u8 *data2, int len,
|
||||
u8 out1[SHA256_DIGEST_SIZE],
|
||||
u8 out2[SHA256_DIGEST_SIZE]);
|
||||
|
||||
#define sha256_finup_2x_arch sha256_finup_2x_arch
|
||||
static bool sha256_finup_2x_arch(const struct __sha256_ctx *ctx,
|
||||
const u8 *data1, const u8 *data2, size_t len,
|
||||
u8 out1[SHA256_DIGEST_SIZE],
|
||||
u8 out2[SHA256_DIGEST_SIZE])
|
||||
{
|
||||
/*
|
||||
* The assembly requires len >= SHA256_BLOCK_SIZE && len <= INT_MAX.
|
||||
* Further limit len to 65536 to avoid spending too long with preemption
|
||||
* disabled. (Of course, in practice len is nearly always 4096 anyway.)
|
||||
*/
|
||||
if (static_branch_likely(&have_sha_ni) && len >= SHA256_BLOCK_SIZE &&
|
||||
len <= 65536 && likely(irq_fpu_usable())) {
|
||||
kernel_fpu_begin();
|
||||
sha256_ni_finup2x(ctx, data1, data2, len, out1, out2);
|
||||
kernel_fpu_end();
|
||||
kmsan_unpoison_memory(out1, SHA256_DIGEST_SIZE);
|
||||
kmsan_unpoison_memory(out2, SHA256_DIGEST_SIZE);
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
static bool sha256_finup_2x_is_optimized_arch(void)
|
||||
{
|
||||
return static_key_enabled(&have_sha_ni);
|
||||
}
|
||||
|
||||
#define sha256_mod_init_arch sha256_mod_init_arch
|
||||
static void sha256_mod_init_arch(void)
|
||||
{
|
||||
if (boot_cpu_has(X86_FEATURE_SHA_NI)) {
|
||||
static_call_update(sha256_blocks_x86, sha256_blocks_ni);
|
||||
static_branch_enable(&have_sha_ni);
|
||||
} else if (cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM,
|
||||
NULL) &&
|
||||
boot_cpu_has(X86_FEATURE_AVX)) {
|
||||
|
|