/*
 * Intel SHA Extensions optimized implementation of a SHA-256 update function
 *
 * This file is provided under a dual BSD/GPLv2 license.  When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * Copyright(c) 2015 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * Contact Information:
 *	Sean Gulley <sean.m.gulley@intel.com>
 *	Tim Chen <tim.c.chen@linux.intel.com>
 *
 * BSD LICENSE
 *
 * Copyright(c) 2015 Intel Corporation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *	* Redistributions of source code must retain the above copyright
 *	  notice, this list of conditions and the following disclaimer.
 *	* Redistributions in binary form must reproduce the above copyright
 *	  notice, this list of conditions and the following disclaimer in
 *	  the documentation and/or other materials provided with the
 *	  distribution.
 *	* Neither the name of Intel Corporation nor the names of its
 *	  contributors may be used to endorse or promote products derived
 *	  from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <linux/linkage.h>

#define STATE_PTR	%rdi	/* 1st arg */
#define DATA_PTR	%rsi	/* 2nd arg */
#define NUM_BLKS	%rdx	/* 3rd arg */

#define SHA256CONSTANTS	%rax

#define MSG		%xmm0	/* sha256rnds2 implicit operand */
#define STATE0		%xmm1
#define STATE1		%xmm2
#define MSG0		%xmm3
#define MSG1		%xmm4
#define MSG2		%xmm5
#define MSG3		%xmm6
#define TMP		%xmm7

#define SHUF_MASK	%xmm8

#define ABEF_SAVE	%xmm9
#define CDGH_SAVE	%xmm10
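
/*
 * Do 4 rounds of SHA-256.  \m0 holds the group of 4 message schedule words
 * consumed by these rounds; for rounds 0-15 it is first loaded (and
 * byte-swapped) from DATA_PTR.  \m1-\m3 hold the three following groups, and
 * while schedule words are still needed, the sha256msg1/sha256msg2 work for
 * those later groups is interleaved with the rounds.  The caller rotates the
 * four MSG registers through the \m0-\m3 arguments from one invocation to
 * the next.
 */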
.macro do_4rounds	i, m0, m1, m2, m3
.if \i < 16
	movdqu		\i*4(DATA_PTR), \m0
	pshufb		SHUF_MASK, \m0
.endif
	movdqa		(\i-32)*4(SHA256CONSTANTS), MSG
	paddd		\m0, MSG
	sha256rnds2	STATE0, STATE1
.if \i >= 12 && \i < 60
	movdqa		\m0, TMP
	palignr		$4, \m3, TMP
	paddd		TMP, \m1
	sha256msg2	\m0, \m1
.endif
	punpckhqdq	MSG, MSG	/* put the next 2 rounds' input in the low qword */
	sha256rnds2	STATE1, STATE0
.if \i >= 4 && \i < 52
	sha256msg1	\m0, \m3
.endif
.endm

/*
 * Intel SHA Extensions optimized implementation of a SHA-256 block function
 *
 * This function takes a pointer to the current SHA-256 state, a pointer to
 * the input data, and the number of 64-byte blocks to process.  Once all
 * blocks have been processed, the state is updated with the new state.  This
 * function only processes complete blocks.  State initialization, buffering
 * of partial blocks, and digest finalization are expected to be handled
 * elsewhere.
 *
 * void sha256_ni_transform(struct sha256_block_state *state,
 *			    const u8 *data, size_t nblocks);
 */
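
/*
 * Illustrative caller (a sketch only, not code from this file; in-kernel SSE
 * use must sit inside a kernel_fpu_begin()/kernel_fpu_end() section):
 *
 *	kernel_fpu_begin();
 *	sha256_ni_transform(state, data, nblocks);
 *	kernel_fpu_end();
 */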

.text
SYM_FUNC_START(sha256_ni_transform)

	shl		$6, NUM_BLKS		/* convert to bytes */
	add		DATA_PTR, NUM_BLKS	/* pointer to end of data */

	/*
	 * Load the initial hash values, which need to be reordered:
	 * DCBA, HGFE -> ABEF, CDGH
	 */
	movdqu		0*16(STATE_PTR), STATE0		/* DCBA */
	movdqu		1*16(STATE_PTR), STATE1		/* HGFE */

	movdqa		STATE0, TMP
	punpcklqdq	STATE1, STATE0			/* FEBA */
	punpckhqdq	TMP, STATE1			/* DCHG */
	pshufd		$0x1B, STATE0, STATE0		/* ABEF */
	pshufd		$0xB1, STATE1, STATE1		/* CDGH */
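
	/*
	 * Worked example of the reordering, with each register written as
	 * words 3..0 (high to low):
	 *
	 *	load:        STATE0 = {d,c,b,a}, STATE1 = {h,g,f,e}
	 *	punpcklqdq:  STATE0 = {f,e,b,a}		(FEBA)
	 *	punpckhqdq:  STATE1 = {d,c,h,g}		(DCHG)
	 *	pshufd 0x1B: STATE0 = {a,b,e,f}		(ABEF)
	 *	pshufd 0xB1: STATE1 = {c,d,g,h}		(CDGH)
	 *
	 * i.e. sha256rnds2 gets the state as the two halves {A,B,E,F} and
	 * {C,D,G,H} that it requires.
	 */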

	movdqa		PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK
	lea		K256+32*4(%rip), SHA256CONSTANTS
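
	/*
	 * SHA256CONSTANTS is biased by 32*4 bytes so that the (\i-32)*4
	 * displacements used in do_4rounds fall in -128..112, which fits in a
	 * sign-extended 8-bit displacement and keeps the code smaller.
	 */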

.Lloop0:
	/* Save the hash values for addition after the rounds */
	movdqa		STATE0, ABEF_SAVE
	movdqa		STATE1, CDGH_SAVE

.irp i, 0, 16, 32, 48
	do_4rounds	(\i + 0), MSG0, MSG1, MSG2, MSG3
	do_4rounds	(\i + 4), MSG1, MSG2, MSG3, MSG0
	do_4rounds	(\i + 8), MSG2, MSG3, MSG0, MSG1
	do_4rounds	(\i + 12), MSG3, MSG0, MSG1, MSG2
.endr

	/* Add the previously saved hash values to the current state */
	paddd		ABEF_SAVE, STATE0
	paddd		CDGH_SAVE, STATE1

	/* Increment the data pointer and loop if more blocks remain */
	add		$64, DATA_PTR
	cmp		NUM_BLKS, DATA_PTR
	jne		.Lloop0

	/* Write the hash values back in the correct order */
	movdqa		STATE0, TMP
	punpcklqdq	STATE1, STATE0		/* GHEF */
	punpckhqdq	TMP, STATE1		/* ABCD */
	pshufd		$0xB1, STATE0, STATE0	/* HGFE */
	pshufd		$0x1B, STATE1, STATE1	/* DCBA */

	movdqu		STATE1, 0*16(STATE_PTR)
	movdqu		STATE0, 1*16(STATE_PTR)

	RET
SYM_FUNC_END(sha256_ni_transform)

#undef STATE_PTR
#undef DATA_PTR
#undef NUM_BLKS
#undef SHA256CONSTANTS
#undef MSG
#undef STATE0
#undef STATE1
#undef MSG0
#undef MSG1
#undef MSG2
#undef MSG3
#undef TMP
#undef SHUF_MASK
#undef ABEF_SAVE
#undef CDGH_SAVE

// parameters for sha256_ni_finup2x()
#define CTX		%rdi
#define DATA1		%rsi
#define DATA2		%rdx
#define LEN		%ecx
#define LEN8		%cl
#define LEN64		%rcx
#define OUT1		%r8
#define OUT2		%r9

// other scalar variables
#define SHA256CONSTANTS	%rax
#define COUNT		%r10
#define COUNT32		%r10d
#define FINAL_STEP	%r11d

// rbx is used as a temporary.

#define MSG		%xmm0	// sha256rnds2 implicit operand
#define STATE0_A	%xmm1
#define STATE1_A	%xmm2
#define STATE0_B	%xmm3
#define STATE1_B	%xmm4
#define TMP_A		%xmm5
#define TMP_B		%xmm6
#define MSG0_A		%xmm7
#define MSG1_A		%xmm8
#define MSG2_A		%xmm9
#define MSG3_A		%xmm10
#define MSG0_B		%xmm11
#define MSG1_B		%xmm12
#define MSG2_B		%xmm13
#define MSG3_B		%xmm14
#define SHUF_MASK	%xmm15

#define OFFSETOF_STATE		0	// offsetof(struct __sha256_ctx, state)
#define OFFSETOF_BYTECOUNT	32	// offsetof(struct __sha256_ctx, bytecount)
#define OFFSETOF_BUF		40	// offsetof(struct __sha256_ctx, buf)
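
// The offsets above assume a C-side context layout along these lines (a
// sketch for reference only; the authoritative definition is the C header):
//
//	struct __sha256_ctx {
//		u32 state[8];		// offset 0
//		u64 bytecount;		// offset 32
//		u8 buf[64];		// offset 40
//	};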

// Do 4 rounds of SHA-256 for each of two messages (interleaved).  m0_a and m0_b
// contain the current 4 message schedule words for the first and second message
// respectively.
//
// If not all the message schedule words have been computed yet, then this also
// computes 4 more message schedule words for each message.  m1_a-m3_a contain
// the next 3 groups of 4 message schedule words for the first message, and
// likewise m1_b-m3_b for the second.  After consuming the current value of
// m0_a, this macro computes the group after m3_a and writes it to m0_a, and
// likewise for *_b.  This means that the next (m0_a, m1_a, m2_a, m3_a) is the
// current (m1_a, m2_a, m3_a, m0_a), and likewise for *_b, so the caller must
// cycle through the registers accordingly.
.macro	do_4rounds_2x	i, m0_a, m1_a, m2_a, m3_a, m0_b, m1_b, m2_b, m3_b
	movdqa		(\i-32)*4(SHA256CONSTANTS), TMP_A
	movdqa		TMP_A, TMP_B
	paddd		\m0_a, TMP_A
	paddd		\m0_b, TMP_B
.if \i < 48
	sha256msg1	\m1_a, \m0_a
	sha256msg1	\m1_b, \m0_b
.endif
	movdqa		TMP_A, MSG
	sha256rnds2	STATE0_A, STATE1_A
	movdqa		TMP_B, MSG
	sha256rnds2	STATE0_B, STATE1_B
	pshufd		$0x0E, TMP_A, MSG	// move the high qword to the low qword
	sha256rnds2	STATE1_A, STATE0_A
	pshufd		$0x0E, TMP_B, MSG
	sha256rnds2	STATE1_B, STATE0_B
.if \i < 48
	movdqa		\m3_a, TMP_A
	movdqa		\m3_b, TMP_B
	palignr		$4, \m2_a, TMP_A
	palignr		$4, \m2_b, TMP_B
	paddd		TMP_A, \m0_a
	paddd		TMP_B, \m0_b
	sha256msg2	\m3_a, \m0_a
	sha256msg2	\m3_b, \m0_b
.endif
.endm

//
// void sha256_ni_finup2x(const struct __sha256_ctx *ctx,
//			  const u8 *data1, const u8 *data2, int len,
//			  u8 out1[SHA256_DIGEST_SIZE],
//			  u8 out2[SHA256_DIGEST_SIZE]);
//
// This function computes the SHA-256 digests of two messages |data1| and
// |data2| that are both |len| bytes long, starting from the initial context
// |ctx|.  |len| must be at least SHA256_BLOCK_SIZE.
//
// The instructions for the two SHA-256 operations are interleaved.  On many
// CPUs, this is almost twice as fast as hashing each message individually due
// to taking better advantage of the CPU's SHA-256 and SIMD throughput.
//
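// Semantically the call is equivalent to the following sketch (the
// update/final helpers named here are illustrative, not this file's API):
//
//	struct __sha256_ctx ctx1 = *ctx, ctx2 = *ctx;
//	sha256_update(&ctx1, data1, len);  sha256_final(&ctx1, out1);
//	sha256_update(&ctx2, data2, len);  sha256_final(&ctx2, out2);
//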
SYM_FUNC_START(sha256_ni_finup2x)
	// Allocate 128 bytes of stack space, 16-byte aligned.
	push		%rbx
	push		%rbp
	mov		%rsp, %rbp
	sub		$128, %rsp
	and		$~15, %rsp
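	// %rbp preserves the original %rsp, since the 'and' above makes the
	// stack adjustment non-constant; it is restored before returning.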

	// Load the shuffle mask for swapping the endianness of 32-bit words.
	movdqa		PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK

	// Set up pointer to the round constants.
	lea		K256+32*4(%rip), SHA256CONSTANTS

	// Initially we're not processing the final blocks.
	xor		FINAL_STEP, FINAL_STEP

	// Load the initial state from ctx->state.
	movdqu		OFFSETOF_STATE+0*16(CTX), STATE0_A	// DCBA
	movdqu		OFFSETOF_STATE+1*16(CTX), STATE1_A	// HGFE
	movdqa		STATE0_A, TMP_A
	punpcklqdq	STATE1_A, STATE0_A			// FEBA
	punpckhqdq	TMP_A, STATE1_A				// DCHG
	pshufd		$0x1B, STATE0_A, STATE0_A		// ABEF
	pshufd		$0xB1, STATE1_A, STATE1_A		// CDGH

	// Load ctx->bytecount.  Take the mod 64 of it to get the number of
	// bytes that are buffered in ctx->buf.  Also save it in a register
	// with LEN added to it.
	mov		LEN, LEN	// zero-extend LEN into LEN64
	mov		OFFSETOF_BYTECOUNT(CTX), %rbx
	lea		(%rbx, LEN64, 1), COUNT
	and		$63, %ebx
	jz		.Lfinup2x_enter_loop	// No bytes buffered?

	// %ebx bytes (1 to 63) are currently buffered in ctx->buf.  Load them
	// followed by the first 64 - %ebx bytes of data.  Since LEN >= 64, we
	// just load 64 bytes from each of ctx->buf, DATA1, and DATA2
	// unconditionally and rearrange the data as needed.

	movdqu		OFFSETOF_BUF+0*16(CTX), MSG0_A
	movdqu		OFFSETOF_BUF+1*16(CTX), MSG1_A
	movdqu		OFFSETOF_BUF+2*16(CTX), MSG2_A
	movdqu		OFFSETOF_BUF+3*16(CTX), MSG3_A
	movdqa		MSG0_A, 0*16(%rsp)
	movdqa		MSG1_A, 1*16(%rsp)
	movdqa		MSG2_A, 2*16(%rsp)
	movdqa		MSG3_A, 3*16(%rsp)

	movdqu		0*16(DATA1), MSG0_A
	movdqu		1*16(DATA1), MSG1_A
	movdqu		2*16(DATA1), MSG2_A
	movdqu		3*16(DATA1), MSG3_A
	movdqu		MSG0_A, 0*16(%rsp,%rbx)
	movdqu		MSG1_A, 1*16(%rsp,%rbx)
	movdqu		MSG2_A, 2*16(%rsp,%rbx)
	movdqu		MSG3_A, 3*16(%rsp,%rbx)
	movdqa		0*16(%rsp), MSG0_A
	movdqa		1*16(%rsp), MSG1_A
	movdqa		2*16(%rsp), MSG2_A
	movdqa		3*16(%rsp), MSG3_A

	movdqu		0*16(DATA2), MSG0_B
	movdqu		1*16(DATA2), MSG1_B
	movdqu		2*16(DATA2), MSG2_B
	movdqu		3*16(DATA2), MSG3_B
	movdqu		MSG0_B, 0*16(%rsp,%rbx)
	movdqu		MSG1_B, 1*16(%rsp,%rbx)
	movdqu		MSG2_B, 2*16(%rsp,%rbx)
	movdqu		MSG3_B, 3*16(%rsp,%rbx)
	movdqa		0*16(%rsp), MSG0_B
	movdqa		1*16(%rsp), MSG1_B
	movdqa		2*16(%rsp), MSG2_B
	movdqa		3*16(%rsp), MSG3_B

	sub		$64, %rbx	// rbx = buffered - 64
	sub		%rbx, DATA1	// DATA1 += 64 - buffered
	sub		%rbx, DATA2	// DATA2 += 64 - buffered
	add		%ebx, LEN	// LEN += buffered - 64
	movdqa		STATE0_A, STATE0_B
	movdqa		STATE1_A, STATE1_B
	jmp		.Lfinup2x_loop_have_data
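
	// Worked example of the rearrangement above: with %ebx = 8 buffered
	// bytes, ctx->buf lands at %rsp[0..63] and the 64 bytes from DATA1 at
	// %rsp[8..71], so reloading %rsp[0..63] yields the next block,
	// buf[0..7] || data1[0..55].  The same stack area is then reused for
	// DATA2.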

.Lfinup2x_enter_loop:
	sub		$64, LEN
	movdqa		STATE0_A, STATE0_B
	movdqa		STATE1_A, STATE1_B
.Lfinup2x_loop:
	// Load the next two data blocks.
	movdqu		0*16(DATA1), MSG0_A
	movdqu		0*16(DATA2), MSG0_B
	movdqu		1*16(DATA1), MSG1_A
	movdqu		1*16(DATA2), MSG1_B
	movdqu		2*16(DATA1), MSG2_A
	movdqu		2*16(DATA2), MSG2_B
	movdqu		3*16(DATA1), MSG3_A
	movdqu		3*16(DATA2), MSG3_B
	add		$64, DATA1
	add		$64, DATA2
.Lfinup2x_loop_have_data:
	// Convert the words of the data blocks from big endian.
	pshufb		SHUF_MASK, MSG0_A
	pshufb		SHUF_MASK, MSG0_B
	pshufb		SHUF_MASK, MSG1_A
	pshufb		SHUF_MASK, MSG1_B
	pshufb		SHUF_MASK, MSG2_A
	pshufb		SHUF_MASK, MSG2_B
	pshufb		SHUF_MASK, MSG3_A
	pshufb		SHUF_MASK, MSG3_B
.Lfinup2x_loop_have_bswapped_data:

	// Save the original state for each block.
	movdqa		STATE0_A, 0*16(%rsp)
	movdqa		STATE0_B, 1*16(%rsp)
	movdqa		STATE1_A, 2*16(%rsp)
	movdqa		STATE1_B, 3*16(%rsp)

	// Do the SHA-256 rounds on each block.
.irp i, 0, 16, 32, 48
	do_4rounds_2x	(\i + 0),  MSG0_A, MSG1_A, MSG2_A, MSG3_A, \
				   MSG0_B, MSG1_B, MSG2_B, MSG3_B
	do_4rounds_2x	(\i + 4),  MSG1_A, MSG2_A, MSG3_A, MSG0_A, \
				   MSG1_B, MSG2_B, MSG3_B, MSG0_B
	do_4rounds_2x	(\i + 8),  MSG2_A, MSG3_A, MSG0_A, MSG1_A, \
				   MSG2_B, MSG3_B, MSG0_B, MSG1_B
	do_4rounds_2x	(\i + 12), MSG3_A, MSG0_A, MSG1_A, MSG2_A, \
				   MSG3_B, MSG0_B, MSG1_B, MSG2_B
.endr

	// Add the original state for each block.
	paddd		0*16(%rsp), STATE0_A
	paddd		1*16(%rsp), STATE0_B
	paddd		2*16(%rsp), STATE1_A
	paddd		3*16(%rsp), STATE1_B

	// Update LEN and loop back if more blocks remain.
	sub		$64, LEN
	jge		.Lfinup2x_loop

	// Check if any final blocks need to be handled.
	// FINAL_STEP = 2: all done
	// FINAL_STEP = 1: need to do count-only padding block
	// FINAL_STEP = 0: need to do the block with 0x80 padding byte
	cmp		$1, FINAL_STEP
	jg		.Lfinup2x_done
	je		.Lfinup2x_finalize_countonly
	add		$64, LEN
	jz		.Lfinup2x_finalize_blockaligned

	// Not block-aligned; 1 <= LEN <= 63 data bytes remain.  Pad the block.
	// To do this, write the padding starting with the 0x80 byte to
	// &sp[64].  Then for each message, copy the last 64 data bytes to sp
	// and load from &sp[64 - LEN] to get the needed padding block.  This
	// code relies on the data buffers being >= 64 bytes in length.
	mov		$64, %ebx
	sub		LEN, %ebx		// ebx = 64 - LEN
	sub		%rbx, DATA1		// DATA1 -= 64 - LEN
	sub		%rbx, DATA2		// DATA2 -= 64 - LEN
	mov		$0x80, FINAL_STEP	// using FINAL_STEP as a temporary
	movd		FINAL_STEP, MSG0_A
	pxor		MSG1_A, MSG1_A
	movdqa		MSG0_A, 4*16(%rsp)
	movdqa		MSG1_A, 5*16(%rsp)
	movdqa		MSG1_A, 6*16(%rsp)
	movdqa		MSG1_A, 7*16(%rsp)
	cmp		$56, LEN
	jge		1f	// will COUNT spill into its own block?
	shl		$3, COUNT
	bswap		COUNT
	mov		COUNT, 56(%rsp,%rbx)
	mov		$2, FINAL_STEP	// won't need count-only block
	jmp		2f
1:
	mov		$1, FINAL_STEP	// will need count-only block
2:
	movdqu		0*16(DATA1), MSG0_A
	movdqu		1*16(DATA1), MSG1_A
	movdqu		2*16(DATA1), MSG2_A
	movdqu		3*16(DATA1), MSG3_A
	movdqa		MSG0_A, 0*16(%rsp)
	movdqa		MSG1_A, 1*16(%rsp)
	movdqa		MSG2_A, 2*16(%rsp)
	movdqa		MSG3_A, 3*16(%rsp)
	movdqu		0*16(%rsp,%rbx), MSG0_A
	movdqu		1*16(%rsp,%rbx), MSG1_A
	movdqu		2*16(%rsp,%rbx), MSG2_A
	movdqu		3*16(%rsp,%rbx), MSG3_A

	movdqu		0*16(DATA2), MSG0_B
	movdqu		1*16(DATA2), MSG1_B
	movdqu		2*16(DATA2), MSG2_B
	movdqu		3*16(DATA2), MSG3_B
	movdqa		MSG0_B, 0*16(%rsp)
	movdqa		MSG1_B, 1*16(%rsp)
	movdqa		MSG2_B, 2*16(%rsp)
	movdqa		MSG3_B, 3*16(%rsp)
	movdqu		0*16(%rsp,%rbx), MSG0_B
	movdqu		1*16(%rsp,%rbx), MSG1_B
	movdqu		2*16(%rsp,%rbx), MSG2_B
	movdqu		3*16(%rsp,%rbx), MSG3_B
	jmp		.Lfinup2x_loop_have_data
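
	// Worked example of the padding above: with LEN = 5, %rbx = 59 and
	// DATA1 was rewound by 59 bytes, so %rsp[0..63] holds the last 64
	// data bytes and %rsp[64..127] holds 0x80 followed by zeroes (plus,
	// since LEN < 56, the __be64 bit count stored at %rsp[59+56]).
	// Loading 64 bytes from %rsp+59 therefore yields the final padded
	// block: data1[len-5..len-1] || 0x80 || zeroes || bit count.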

	// Prepare a padding block: either
	//
	//	{0x80, 0, 0, 0, ..., count (as __be64)}
	//	for a block-aligned message, or
	//
	//	{   0, 0, 0, 0, ..., count (as __be64)}
	//	for a message whose length mod 64 is >= 56.
	//
	// Pre-swap the endianness of the words.
.Lfinup2x_finalize_countonly:
	pxor		MSG0_A, MSG0_A
	jmp		1f

.Lfinup2x_finalize_blockaligned:
	mov		$0x80000000, %ebx
	movd		%ebx, MSG0_A
1:
	pxor		MSG1_A, MSG1_A
	pxor		MSG2_A, MSG2_A
	// This block enters the loop via the have_bswapped_data path, so the
	// count must already look as it would after the per-word byte-swap:
	// the native bit count with its 32-bit halves swapped.  ror $29
	// (== rol $35) computes exactly that from the byte count: a shift
	// left by 3 (bytes -> bits; byte counts are far below 2^61, so no
	// bits are lost) combined with a rotate by 32 that swaps the halves.
	ror		$29, COUNT
	movq		COUNT, MSG3_A
	pslldq		$8, MSG3_A
	movdqa		MSG0_A, MSG0_B
	pxor		MSG1_B, MSG1_B
	pxor		MSG2_B, MSG2_B
	movdqa		MSG3_A, MSG3_B
	mov		$2, FINAL_STEP
	jmp		.Lfinup2x_loop_have_bswapped_data

.Lfinup2x_done:
	// Write the two digests with all bytes in the correct order.
	movdqa		STATE0_A, TMP_A
	movdqa		STATE0_B, TMP_B
	punpcklqdq	STATE1_A, STATE0_A		// GHEF
	punpcklqdq	STATE1_B, STATE0_B
	punpckhqdq	TMP_A, STATE1_A			// ABCD
	punpckhqdq	TMP_B, STATE1_B
	pshufd		$0xB1, STATE0_A, STATE0_A	// HGFE
	pshufd		$0xB1, STATE0_B, STATE0_B
	pshufd		$0x1B, STATE1_A, STATE1_A	// DCBA
	pshufd		$0x1B, STATE1_B, STATE1_B
	pshufb		SHUF_MASK, STATE0_A
	pshufb		SHUF_MASK, STATE0_B
	pshufb		SHUF_MASK, STATE1_A
	pshufb		SHUF_MASK, STATE1_B
	movdqu		STATE0_A, 1*16(OUT1)
	movdqu		STATE0_B, 1*16(OUT2)
	movdqu		STATE1_A, 0*16(OUT1)
	movdqu		STATE1_B, 0*16(OUT2)

	mov		%rbp, %rsp
	pop		%rbp
	pop		%rbx
	RET
SYM_FUNC_END(sha256_ni_finup2x)

.section	.rodata.cst256.K256, "aM", @progbits, 256
.align 64
K256:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

.section	.rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
.align 16
PSHUFFLE_BYTE_FLIP_MASK:
	.octa 0x0c0d0e0f08090a0b0405060700010203