Unverified Commit eeb7a893 authored by Palmer Dabbelt's avatar Palmer Dabbelt
Browse files

Merge patch series "riscv: mm: Extend mappable memory up to hint address"

Charlie Jenkins <charlie@rivosinc.com> says:

On riscv, mmap currently returns an address from the largest address
space that can fit entirely inside of the hint address. This makes it
such that the hint address is almost never returned. This patch raises
the mappable area up to and including the hint address. This allows mmap
to often return the hint address, which allows a performance improvement
over searching for a valid address as well as making the behavior more
similar to other architectures.

Note that a previous patch gave riscv mmap stronger semantics than other
architectures have. On riscv, mmap will not return an address that uses
more of the upper bits of the virtual address than the hint address
does. On other architectures, a random address is returned in the
address space requested. On all architectures the hint address will be
returned if it is available. This allows riscv applications to configure
how many bits in the virtual address should be left empty. This has two
benefits: applications can request address spaces that are smaller than
the default, and they do not need to know the page table layout of
riscv.

* b4-shazam-merge:
  docs: riscv: Define behavior of mmap
  selftests: riscv: Generalize mm selftests
  riscv: mm: Use hint address in mmap if available

Link: https://lore.kernel.org/r/20240130-use_mmap_hint_address-v3-0-8a655cfa8bcb@rivosinc.com


Signed-off-by: default avatarPalmer Dabbelt <palmer@rivosinc.com>
parents 2b2ca354 cd6c916c
Loading
Loading
Loading
Loading
+5 −11
Original line number Diff line number Diff line
@@ -144,14 +144,8 @@ passing 0 into the hint address parameter of mmap. On CPUs with an address space
smaller than sv48, the CPU maximum supported address space will be the default.

Software can "opt-in" to receiving VAs from another VA space by providing
a hint address to mmap. A hint address passed to mmap will cause the largest
address space that fits entirely into the hint to be used, unless there is no
space left in the address space. If there is no space available in the requested
address space, an address in the next smallest available address space will be
returned.

For example, in order to obtain 48-bit VA space, a hint address greater than
:code:`1 << 47` must be provided. Note that this is 47 due to sv48 userspace
ending at :code:`1 << 47` and the addresses beyond this are reserved for the
kernel. Similarly, to obtain 57-bit VA space addresses, a hint address greater
than or equal to :code:`1 << 56` must be provided.
a hint address to mmap. When a hint address is passed to mmap, the returned
address will never use more bits than the hint address. For example, if a hint
address of :code:`1 << 40` is passed to mmap, a valid returned address will
never use bits 41 through 63. If no mappable addresses are available in that
range, mmap will return :code:`MAP_FAILED`.
+1 −0
Original line number Diff line number Diff line
@@ -44,6 +44,7 @@ CONFIG_CPU_FREQ_GOV_USERSPACE=y
CONFIG_CPU_FREQ_GOV_ONDEMAND=y
CONFIG_CPU_FREQ_GOV_CONSERVATIVE=m
CONFIG_CPUFREQ_DT=y
CONFIG_ACPI_CPPC_CPUFREQ=m
CONFIG_VIRTUALIZATION=y
CONFIG_KVM=m
CONFIG_ACPI=y
+2 −2
Original line number Diff line number Diff line
@@ -3,14 +3,14 @@
menu "Accelerated Cryptographic Algorithms for CPU (riscv)"

config CRYPTO_AES_RISCV64
	tristate "Ciphers: AES, modes: ECB, CBC, CTR, XTS"
	tristate "Ciphers: AES, modes: ECB, CBC, CTS, CTR, XTS"
	depends on 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO
	select CRYPTO_ALGAPI
	select CRYPTO_LIB_AES
	select CRYPTO_SKCIPHER
	help
	  Block cipher: AES cipher algorithms
	  Length-preserving ciphers: AES with ECB, CBC, CTR, XTS
	  Length-preserving ciphers: AES with ECB, CBC, CTS, CTR, XTS

	  Architecture: riscv64 using:
	  - Zvkned vector crypto extension
+90 −3
Original line number Diff line number Diff line
// SPDX-License-Identifier: GPL-2.0-only
/*
 * AES using the RISC-V vector crypto extensions.  Includes the bare block
 * cipher and the ECB, CBC, CTR, and XTS modes.
 * cipher and the ECB, CBC, CBC-CTS, CTR, and XTS modes.
 *
 * Copyright (C) 2023 VRULL GmbH
 * Author: Heiko Stuebner <heiko.stuebner@vrull.eu>
 *
 * Copyright (C) 2023 SiFive, Inc.
 * Author: Jerry Shih <jerry.shih@sifive.com>
 *
 * Copyright 2024 Google LLC
 */

#include <asm/simd.h>
@@ -40,6 +42,10 @@ asmlinkage void aes_cbc_decrypt_zvkned(const struct crypto_aes_ctx *key,
				       const u8 *in, u8 *out, size_t len,
				       u8 iv[AES_BLOCK_SIZE]);

asmlinkage void aes_cbc_cts_crypt_zvkned(const struct crypto_aes_ctx *key,
					 const u8 *in, u8 *out, size_t len,
					 const u8 iv[AES_BLOCK_SIZE], bool enc);

asmlinkage void aes_ctr32_crypt_zvkned_zvkb(const struct crypto_aes_ctx *key,
					    const u8 *in, u8 *out, size_t len,
					    u8 iv[AES_BLOCK_SIZE]);
@@ -164,7 +170,7 @@ static int riscv64_aes_ecb_decrypt(struct skcipher_request *req)

/* AES-CBC */

static inline int riscv64_aes_cbc_crypt(struct skcipher_request *req, bool enc)
static int riscv64_aes_cbc_crypt(struct skcipher_request *req, bool enc)
{
	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
	const struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm);
@@ -202,6 +208,70 @@ static int riscv64_aes_cbc_decrypt(struct skcipher_request *req)
	return riscv64_aes_cbc_crypt(req, false);
}

/* AES-CBC-CTS */

/*
 * Common worker for CBC-CTS encryption (@enc true) and decryption (@enc
 * false) of @req.
 *
 * Returns -EINVAL if the message is shorter than one AES block, otherwise the
 * first error reported by the skcipher walk or by the plain-CBC helper, or
 * the result of skcipher_walk_done() once the CTS step has run.
 */
static int riscv64_aes_cbc_cts_crypt(struct skcipher_request *req, bool enc)
{
	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
	const struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm);
	struct scatterlist src_tail[2], dst_tail[2];
	struct scatterlist *tail_src, *tail_dst;
	struct skcipher_request tail_req;
	struct skcipher_walk walk;
	unsigned int head_len;
	int ret;

	if (req->cryptlen < AES_BLOCK_SIZE)
		return -EINVAL;

	ret = skcipher_walk_virt(&walk, req, false);
	if (ret)
		return ret;

	/*
	 * Fast path: the walk exposes the entire message in a single step, so
	 * one call into the CBC-CTS assembly routine handles everything.  This
	 * keeps the overhead low, which matters most for short messages.
	 */
	if (likely(walk.nbytes == req->cryptlen))
		goto do_cts;

	/*
	 * Slow path: run ordinary CBC over everything except the last two
	 * blocks (the final one possibly partial), then restart the walk on
	 * just that tail so the assembly only has to do the ciphertext
	 * stealing.
	 */
	skcipher_walk_abort(&walk);
	head_len = round_down(req->cryptlen - AES_BLOCK_SIZE - 1,
			      AES_BLOCK_SIZE);

	skcipher_request_set_tfm(&tail_req, tfm);
	skcipher_request_set_callback(&tail_req, skcipher_request_flags(req),
				      NULL, NULL);
	skcipher_request_set_crypt(&tail_req, req->src, req->dst, head_len,
				   req->iv);
	ret = riscv64_aes_cbc_crypt(&tail_req, enc);
	if (ret)
		return ret;

	/* Skip past the CBC part; share one list when operating in place. */
	tail_src = scatterwalk_ffwd(src_tail, req->src, head_len);
	tail_dst = (req->dst == req->src) ?
			tail_src :
			scatterwalk_ffwd(dst_tail, req->dst, head_len);

	skcipher_request_set_crypt(&tail_req, tail_src, tail_dst,
				   req->cryptlen - head_len, req->iv);
	ret = skcipher_walk_virt(&walk, &tail_req, false);
	if (ret)
		return ret;

do_cts:
	kernel_vector_begin();
	aes_cbc_cts_crypt_zvkned(ctx, walk.src.virt.addr, walk.dst.virt.addr,
				 walk.nbytes, req->iv, enc);
	kernel_vector_end();

	return skcipher_walk_done(&walk, 0);
}

static int riscv64_aes_cbc_cts_encrypt(struct skcipher_request *req)
{
	return riscv64_aes_cbc_cts_crypt(req, true);
}

static int riscv64_aes_cbc_cts_decrypt(struct skcipher_request *req)
{
	return riscv64_aes_cbc_cts_crypt(req, false);
}

/* AES-CTR */

static int riscv64_aes_ctr_crypt(struct skcipher_request *req)
@@ -434,6 +504,22 @@ static struct skcipher_alg riscv64_zvkned_aes_skcipher_algs[] = {
			.cra_driver_name = "cbc-aes-riscv64-zvkned",
			.cra_module = THIS_MODULE,
		},
	}, {
		.setkey = riscv64_aes_setkey_skcipher,
		.encrypt = riscv64_aes_cbc_cts_encrypt,
		.decrypt = riscv64_aes_cbc_cts_decrypt,
		.min_keysize = AES_MIN_KEY_SIZE,
		.max_keysize = AES_MAX_KEY_SIZE,
		.ivsize = AES_BLOCK_SIZE,
		.walksize = 4 * AES_BLOCK_SIZE, /* matches LMUL=4 */
		.base = {
			.cra_blocksize = AES_BLOCK_SIZE,
			.cra_ctxsize = sizeof(struct crypto_aes_ctx),
			.cra_priority = 300,
			.cra_name = "cts(cbc(aes))",
			.cra_driver_name = "cts-cbc-aes-riscv64-zvkned",
			.cra_module = THIS_MODULE,
		},
	}
};

@@ -540,11 +626,12 @@ static void __exit riscv64_aes_mod_exit(void)
module_init(riscv64_aes_mod_init);
module_exit(riscv64_aes_mod_exit);

MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS (RISC-V accelerated)");
MODULE_DESCRIPTION("AES-ECB/CBC/CTS/CTR/XTS (RISC-V accelerated)");
MODULE_AUTHOR("Jerry Shih <jerry.shih@sifive.com>");
MODULE_LICENSE("GPL");
MODULE_ALIAS_CRYPTO("aes");
MODULE_ALIAS_CRYPTO("ecb(aes)");
MODULE_ALIAS_CRYPTO("cbc(aes)");
MODULE_ALIAS_CRYPTO("cts(cbc(aes))");
MODULE_ALIAS_CRYPTO("ctr(aes)");
MODULE_ALIAS_CRYPTO("xts(aes)");
+168 −9
Original line number Diff line number Diff line
@@ -139,19 +139,25 @@ SYM_FUNC_END(aes_ecb_decrypt_zvkned)
.endm

.macro	aes_cbc_decrypt	keylen
	srli		LEN, LEN, 2	// Convert LEN from bytes to words
	vle32.v		v16, (IVP)	// Load IV
1:
	vle32.v		v17, (INP)	// Load ciphertext block
	vmv.v.v		v18, v17	// Save ciphertext block
	aes_decrypt	v17, \keylen	// Decrypt
	vxor.vv		v17, v17, v16	// XOR with IV or prev ciphertext block
	vse32.v		v17, (OUTP)	// Store plaintext block
	vmv.v.v		v16, v18	// Next "IV" is prev ciphertext block
	addi		INP, INP, 16
	addi		OUTP, OUTP, 16
	addi		LEN, LEN, -16
	vsetvli		t0, LEN, e32, m4, ta, ma
	vle32.v		v20, (INP)	// Load ciphertext blocks
	vslideup.vi	v16, v20, 4	// Setup prev ciphertext blocks
	addi		t1, t0, -4
	vslidedown.vx	v24, v20, t1	// Save last ciphertext block
	aes_decrypt	v20, \keylen	// Decrypt the blocks
	vxor.vv		v20, v20, v16	// XOR with prev ciphertext blocks
	vse32.v		v20, (OUTP)	// Store plaintext blocks
	vmv.v.v		v16, v24	// Next "IV" is last ciphertext block
	slli		t1, t0, 2	// Words to bytes
	add		INP, INP, t1
	add		OUTP, OUTP, t1
	sub		LEN, LEN, t0
	bnez		LEN, 1b

	vsetivli	zero, 4, e32, m1, ta, ma
	vse32.v		v16, (IVP)	// Store next IV
	ret
.endm
@@ -178,3 +184,156 @@ SYM_FUNC_START(aes_cbc_decrypt_zvkned)
192:
	aes_cbc_decrypt	192
SYM_FUNC_END(aes_cbc_decrypt_zvkned)

// Encryption half of CBC-CTS for one AES key length (\keylen).  The expansion
// ends in 'ret', so control never falls out of the bottom of this macro.
//
// On entry v16 holds the IV (aes_cbc_cts_crypt loads it before expanding this
// macro).  INP and OUTP are the input and output pointers and LEN is the
// message length in bytes; the C glue code rejects messages shorter than one
// 16-byte block before calling in.
.macro	aes_cbc_cts_encrypt	keylen

	// CBC-encrypt all blocks except the last.  But don't store the
	// second-to-last block to the output buffer yet, since it will be
	// handled specially in the ciphertext stealing step.  Exception: if the
	// message is single-block, still encrypt the last (and only) block.
	li		t0, 16		// Block size, used by the loop-exit compare
	j		2f		// Nothing to store before the first block
1:
	vse32.v		v16, (OUTP)	// Store ciphertext block
	addi		OUTP, OUTP, 16
2:
	vle32.v		v17, (INP)	// Load plaintext block
	vxor.vv		v16, v16, v17	// XOR with IV or prev ciphertext block
	aes_encrypt	v16, \keylen	// Encrypt
	addi		INP, INP, 16
	addi		LEN, LEN, -16
	bgt		LEN, t0, 1b	// Repeat if more than one block remains

	// Here LEN is the number of bytes not yet consumed: 0 for a
	// single-block message, otherwise 1..16 (the final block).

	// Special case: if the message is a single block, just do CBC.
	beqz		LEN, .Lcts_encrypt_done\@

	// Encrypt the last two blocks using ciphertext stealing as follows:
	//	C[n-1] = Encrypt(Encrypt(P[n-1] ^ C[n-2]) ^ P[n])
	//	C[n] = Encrypt(P[n-1] ^ C[n-2])[0..LEN]
	//
	// C[i] denotes the i'th ciphertext block, and likewise P[i] the i'th
	// plaintext block.  Block n, the last block, may be partial; its length
	// is 1 <= LEN <= 16.  If there are only 2 blocks, C[n-2] means the IV.
	//
	// v16 already contains Encrypt(P[n-1] ^ C[n-2]).
	// INP points to P[n].  OUTP points to where C[n-1] should go.
	// To support in-place encryption, load P[n] before storing C[n].
	addi		t0, OUTP, 16	// Get pointer to where C[n] should go
	// Operate on LEN bytes only.  With the tail-undisturbed ("tu") policy
	// the XOR below leaves bytes LEN..15 of v16 as they were, which is the
	// same as XOR'ing with a zero-padded P[n].
	vsetvli		zero, LEN, e8, m1, tu, ma
	vle8.v		v17, (INP)	// Load P[n]
	vse8.v		v16, (t0)	// Store C[n]
	vxor.vv		v16, v16, v17	// v16 = Encrypt(P[n-1] ^ C[n-2]) ^ P[n]
	vsetivli	zero, 4, e32, m1, ta, ma	// Back to one full 16-byte block
	aes_encrypt	v16, \keylen
.Lcts_encrypt_done\@:
	vse32.v		v16, (OUTP)	// Store C[n-1] (or C[n] in single-block case)
	ret
.endm

// Scratch registers used only by aes_cbc_cts_decrypt.
#define LEN32		t4 // Length of remaining full blocks in 32-bit words
#define LEN_MOD16	t5 // Length of message in bytes mod 16

// Decryption half of CBC-CTS for one AES key length (\keylen).  The expansion
// ends in 'ret'.
//
// On entry v16 holds the IV (aes_cbc_cts_crypt loads it before expanding this
// macro).  INP and OUTP are the input and output pointers and LEN is the
// message length in bytes.  All full blocks are CBC-decrypted in a vectorized
// loop first; the last one or two output blocks are then fixed up for
// ciphertext stealing.
.macro	aes_cbc_cts_decrypt	keylen
	andi		LEN32, LEN, ~15		// Bytes in full blocks ...
	srli		LEN32, LEN32, 2		// ... converted to 32-bit words
	andi		LEN_MOD16, LEN, 15	// Bytes in the partial block, if any

	// Save C[n-2] in v28 so that it's available later during the ciphertext
	// stealing step.  If there are fewer than three blocks, C[n-2] means
	// the IV, otherwise it means the third-to-last ciphertext block.
	vmv.v.v		v28, v16	// IV
	add		t0, LEN, -33	// Negative iff LEN <= 32, i.e. < 3 blocks
	bltz		t0, .Lcts_decrypt_loop\@
	andi		t0, t0, ~15	// Byte offset of C[n-2] within the input
	add		t0, t0, INP
	vle32.v		v28, (t0)

	// CBC-decrypt all full blocks.  For the last full block, or the last 2
	// full blocks if the message is block-aligned, this doesn't write the
	// correct output blocks (unless the message is only a single block),
	// because it XORs the wrong values with the raw AES plaintexts.  But we
	// fix this after this loop without redoing the AES decryptions.  This
	// approach allows more of the AES decryptions to be parallelized.
.Lcts_decrypt_loop\@:
	vsetvli		t0, LEN32, e32, m4, ta, ma	// t0 = words handled this pass
	addi		t1, t0, -4	// Word index of the last block in this set
	vle32.v		v20, (INP)	// Load next set of ciphertext blocks
	vmv.v.v		v24, v16	// Get IV or last ciphertext block of prev set
	vslideup.vi	v24, v20, 4	// Setup prev ciphertext blocks
	vslidedown.vx	v16, v20, t1	// Save last ciphertext block of this set
	aes_decrypt	v20, \keylen	// Decrypt this set of blocks
	vxor.vv		v24, v24, v20	// XOR prev ciphertext blocks with decrypted blocks
	vse32.v		v24, (OUTP)	// Store this set of plaintext blocks
	sub		LEN32, LEN32, t0
	slli		t0, t0, 2	// Words to bytes
	add		INP, INP, t0
	add		OUTP, OUTP, t0
	bnez		LEN32, .Lcts_decrypt_loop\@

	// t1 still holds the word index of the last block of the final set, so
	// the slide below moves that block's raw AES output to element 0.
	vsetivli	zero, 4, e32, m4, ta, ma
	vslidedown.vx	v20, v20, t1	// Extract raw plaintext of last full block
	addi		t0, OUTP, -16	// Get pointer to last full plaintext block
	bnez		LEN_MOD16, .Lcts_decrypt_non_block_aligned\@

	// Special case: if the message is a single block, just do CBC.
	li		t1, 16
	beq		LEN, t1, .Lcts_decrypt_done\@

	// Block-aligned message.  Just fix up the last 2 blocks.  We need:
	//
	//	P[n-1] = Decrypt(C[n]) ^ C[n-2]
	//	P[n] = Decrypt(C[n-1]) ^ C[n]
	//
	// We have C[n] in v16, Decrypt(C[n]) in v20, and C[n-2] in v28.
	// Together with Decrypt(C[n-1]) ^ C[n-2] from the output buffer, this
	// is everything needed to fix the output without re-decrypting blocks.
	addi		t1, OUTP, -32	// Get pointer to where P[n-1] should go
	vxor.vv		v20, v20, v28	// Decrypt(C[n]) ^ C[n-2] == P[n-1]
	vle32.v		v24, (t1)	// Decrypt(C[n-1]) ^ C[n-2]
	vse32.v		v20, (t1)	// Store P[n-1]
	vxor.vv		v20, v24, v16	// Decrypt(C[n-1]) ^ C[n-2] ^ C[n] == P[n] ^ C[n-2]
	j		.Lcts_decrypt_finish\@	// Final XOR with C[n-2] yields P[n]

.Lcts_decrypt_non_block_aligned\@:
	// Decrypt the last two blocks using ciphertext stealing as follows:
	//
	//	P[n-1] = Decrypt(C[n] || Decrypt(C[n-1])[LEN_MOD16..16]) ^ C[n-2]
	//	P[n] = (Decrypt(C[n-1]) ^ C[n])[0..LEN_MOD16]
	//
	// We already have Decrypt(C[n-1]) in v20 and C[n-2] in v28.
	vmv.v.v		v16, v20	// v16 = Decrypt(C[n-1])
	// Only LEN_MOD16 bytes are touched from here; the tail-undisturbed
	// ("tu") policy keeps the remaining bytes of v20 as Decrypt(C[n-1]).
	vsetvli		zero, LEN_MOD16, e8, m1, tu, ma
	vle8.v		v20, (INP)	// v20 = C[n] || Decrypt(C[n-1])[LEN_MOD16..16]
	vxor.vv		v16, v16, v20	// v16 = Decrypt(C[n-1]) ^ C[n]
	vse8.v		v16, (OUTP)	// Store P[n]
	vsetivli	zero, 4, e32, m1, ta, ma	// Back to one full 16-byte block
	aes_decrypt	v20, \keylen	// v20 = Decrypt(C[n] || Decrypt(C[n-1])[LEN_MOD16..16])
.Lcts_decrypt_finish\@:
	vxor.vv		v20, v20, v28	// XOR with C[n-2]
	vse32.v		v20, (t0)	// Store last full plaintext block
.Lcts_decrypt_done\@:
	ret
.endm

// Load the IV into v16, then dispatch to the encrypt or decrypt macro for the
// given key length.  a5 is the register carrying the sixth argument, 'enc', of
// aes_cbc_cts_crypt_zvkned.  Both sub-macros end in 'ret', so the encrypt
// expansion does not fall through into the decrypt expansion that follows it.
.macro	aes_cbc_cts_crypt	keylen
	vle32.v		v16, (IVP)	// Load IV
	beqz		a5, .Lcts_decrypt\@	// enc == false: decrypt
	aes_cbc_cts_encrypt \keylen
.Lcts_decrypt\@:
	aes_cbc_cts_decrypt \keylen
.endm

// void aes_cbc_cts_crypt_zvkned(const struct crypto_aes_ctx *key,
//			         const u8 *in, u8 *out, size_t len,
//				 const u8 iv[16], bool enc);
//
// Encrypts or decrypts a message with the CS3 variant of AES-CBC-CTS.
// This is the variant that unconditionally swaps the last two blocks.
//
// The IV is only read here (the prototype takes it as const), unlike the
// plain CBC routines, which take a writable IV.
//
// NOTE(review): aes_begin is defined outside this view; it presumably loads
// the round keys and branches to 128f or 192f for those key sizes, falling
// through for 256-bit keys -- confirm against its definition.  Each
// aes_cbc_cts_crypt expansion ends in 'ret', so the three variants below do
// not fall through into one another.
SYM_FUNC_START(aes_cbc_cts_crypt_zvkned)
	aes_begin	KEYP, 128f, 192f
	aes_cbc_cts_crypt 256
128:
	aes_cbc_cts_crypt 128
192:
	aes_cbc_cts_crypt 192
SYM_FUNC_END(aes_cbc_cts_crypt_zvkned)
Loading