Unverified Commit 503638e0 authored by Alexandre Ghiti, committed by Palmer Dabbelt
Browse files

riscv: Stop emitting preventive sfence.vma for new vmalloc mappings

In 6.5, we removed the vmalloc fault path because that can't work (see
[1] [2]). Then in order to make sure that new page table entries were
seen by the page table walker, we had to preventively emit a sfence.vma
on all harts [3] but this solution is very costly since it relies on IPI.

And even then, we could end up in a loop of vmalloc faults if a vmalloc
allocation is done in the IPI path (for example if it is traced, see
[4]), which could result in a kernel stack overflow.

Those preventive sfence.vma needed to be emitted because:

- if the uarch caches invalid entries, the new mapping may not be
  observed by the page table walker and an invalidation may be needed.
- if the uarch does not cache invalid entries, a reordered access
  could "miss" the new mapping and trap: in that case, we would actually
  only need to retry the access, no sfence.vma is required.

So this patch removes those preventive sfence.vma and actually handles
the possible (and unlikely) exceptions. And since the kernel stacks
mappings lie in the vmalloc area, this handling must be done very early
when the trap is taken, at the very beginning of handle_exception: this
also rules out the vmalloc allocations in the fault path.

Link: https://lore.kernel.org/linux-riscv/20230531093817.665799-1-bjorn@kernel.org/ [1]
Link: https://lore.kernel.org/linux-riscv/20230801090927.2018653-1-dylan@andestech.com [2]
Link: https://lore.kernel.org/linux-riscv/20230725132246.817726-1-alexghiti@rivosinc.com/ [3]
Link: https://lore.kernel.org/lkml/20200508144043.13893-1-joro@8bytes.org/ [4]
Signed-off-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Reviewed-by: Yunhui Cui <cuiyunhui@bytedance.com>
Link: https://lore.kernel.org/r/20240717060125.139416-4-alexghiti@rivosinc.com


Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
parent d25599b5
Loading
Loading
Loading
Loading
+17 −1
Original line number Diff line number Diff line
@@ -46,7 +46,23 @@ do { \
} while (0)

#ifdef CONFIG_64BIT
#define flush_cache_vmap(start, end)		flush_tlb_kernel_range(start, end)
extern u64 new_vmalloc[NR_CPUS / sizeof(u64) + 1];
extern char _end[];
#define flush_cache_vmap flush_cache_vmap
static inline void flush_cache_vmap(unsigned long start, unsigned long end)
{
	if (is_vmalloc_or_module_addr((void *)start)) {
		int i;

		/*
		 * We don't care if concurrently a cpu resets this value since
		 * the only place this can happen is in handle_exception() where
		 * an sfence.vma is emitted.
		 */
		for (i = 0; i < ARRAY_SIZE(new_vmalloc); ++i)
			new_vmalloc[i] = -1ULL;
	}
}
#define flush_cache_vmap_early(start, end)	local_flush_tlb_kernel_range(start, end)
#endif

+7 −0
Original line number Diff line number Diff line
@@ -60,6 +60,13 @@ struct thread_info {
	void			*scs_base;
	void			*scs_sp;
#endif
#ifdef CONFIG_64BIT
	/*
	 * Used in handle_exception() to save a0, a1 and a2 before knowing if we
	 * can access the kernel stack.
	 */
	unsigned long		a0, a1, a2;
#endif
};

#ifdef CONFIG_SHADOW_CALL_STACK
+7 −0
Original line number Diff line number Diff line
@@ -36,6 +36,8 @@ void asm_offsets(void)
	OFFSET(TASK_THREAD_S9, task_struct, thread.s[9]);
	OFFSET(TASK_THREAD_S10, task_struct, thread.s[10]);
	OFFSET(TASK_THREAD_S11, task_struct, thread.s[11]);

	OFFSET(TASK_TI_CPU, task_struct, thread_info.cpu);
	OFFSET(TASK_TI_FLAGS, task_struct, thread_info.flags);
	OFFSET(TASK_TI_PREEMPT_COUNT, task_struct, thread_info.preempt_count);
	OFFSET(TASK_TI_KERNEL_SP, task_struct, thread_info.kernel_sp);
@@ -43,6 +45,11 @@ void asm_offsets(void)
#ifdef CONFIG_SHADOW_CALL_STACK
	OFFSET(TASK_TI_SCS_SP, task_struct, thread_info.scs_sp);
#endif
#ifdef CONFIG_64BIT
	OFFSET(TASK_TI_A0, task_struct, thread_info.a0);
	OFFSET(TASK_TI_A1, task_struct, thread_info.a1);
	OFFSET(TASK_TI_A2, task_struct, thread_info.a2);
#endif

	OFFSET(TASK_TI_CPU_NUM, task_struct, thread_info.cpu);
	OFFSET(TASK_THREAD_F0,  task_struct, thread.fstate.f[0]);
+87 −0
Original line number Diff line number Diff line
@@ -19,6 +19,79 @@

	.section .irqentry.text, "ax"

/*
 * new_vmalloc_check - retry a fault that a new vmalloc mapping can explain.
 *
 * Expects tp to point to the current task, since a0, a1 and a2 are spilled
 * to the TASK_TI_A0/A1/A2 slots of its thread_info (no stack access is made).
 *
 * The macro falls through, with a0-a2 restored, when any of these holds:
 *  - the trap is an interrupt (CSR_CAUSE is negative),
 *  - the cause is not a load/store/instruction page fault,
 *  - the faulting address in CSR_TVAL does not have its top bit set,
 *  - the current cpu's bit in new_vmalloc is clear.
 *
 * Otherwise the cpu's bit is atomically cleared, an sfence.vma is emitted
 * (patched to a nop when RISCV_ISA_EXT_SVVPTC is present), a0-a2 are
 * restored and the macro returns with sret so the access is retried.
 *
 * The labels below are not macro-local, so this macro can only be expanded
 * once per object file.
 */
.macro new_vmalloc_check
	REG_S	a0, TASK_TI_A0(tp)
	csrr	a0, CSR_CAUSE
	/* Exclude IRQs */
	blt	a0, zero, _new_vmalloc_restore_context_a0

	REG_S	a1, TASK_TI_A1(tp)
	/* Only check new_vmalloc if we are in page/protection fault */
	li	a1, EXC_LOAD_PAGE_FAULT
	beq	a0, a1, _new_vmalloc_kernel_address
	li	a1, EXC_STORE_PAGE_FAULT
	beq	a0, a1, _new_vmalloc_kernel_address
	li	a1, EXC_INST_PAGE_FAULT
	bne	a0, a1, _new_vmalloc_restore_context_a1

_new_vmalloc_kernel_address:
	/* Is it a kernel address? */
	csrr	a0, CSR_TVAL
	bge	a0, zero, _new_vmalloc_restore_context_a1

	/* Check if a new vmalloc mapping appeared that could explain the trap */
	REG_S	a2, TASK_TI_A2(tp)
	/*
	 * Computes:
	 * a0 = &new_vmalloc[BIT_WORD(cpu)]
	 * a1 = BIT_MASK(cpu)
	 *
	 * thread_info.cpu is a 32-bit int, so it must be read with lw: an
	 * XLEN-wide REG_L would also pull in the four bytes that follow it
	 * and feed them into the index and shift computations below.
	 */
	lw	a2, TASK_TI_CPU(tp)
	/*
	 * Compute the new_vmalloc element position:
	 * (cpu / 64) * 8 = (cpu >> 6) << 3
	 */
	srli	a1, a2, 6
	slli	a1, a1, 3
	la	a0, new_vmalloc
	add	a0, a0, a1
	/*
	 * Compute the bit position in the new_vmalloc element:
	 * bit_pos = cpu % 64 = cpu - (cpu / 64) * 64 = cpu - (cpu >> 6) << 6
	 * 	   = cpu - ((cpu >> 6) << 3) << 3
	 */
	slli	a1, a1, 3
	sub	a1, a2, a1
	/* Compute the "get mask": 1 << bit_pos */
	li	a2, 1
	sll	a1, a2, a1

	/* Check the value of new_vmalloc for this cpu */
	REG_L	a2, 0(a0)
	and	a2, a2, a1
	beq	a2, zero, _new_vmalloc_restore_context

	/*
	 * Atomically reset the current cpu bit in new_vmalloc: the bit was
	 * just observed set, so xor-ing with the mask clears it.
	 */
	amoxor.d	a0, a1, (a0)

	/* Only emit a sfence.vma if the uarch caches invalid entries */
	ALTERNATIVE("sfence.vma", "nop", 0, RISCV_ISA_EXT_SVVPTC, 1)

	REG_L	a0, TASK_TI_A0(tp)
	REG_L	a1, TASK_TI_A1(tp)
	REG_L	a2, TASK_TI_A2(tp)
	/*
	 * NOTE(review): CSR_SCRATCH is zeroed before returning, presumably to
	 * restore the value it held when the trap was taken from kernel
	 * context -- confirm against the entry sequence of handle_exception.
	 */
	csrw	CSR_SCRATCH, x0
	/* Return to the faulting instruction so that the access is retried */
	sret

	/* Fall-through exits: restore only the registers spilled so far */
_new_vmalloc_restore_context:
	REG_L	a2, TASK_TI_A2(tp)
_new_vmalloc_restore_context_a1:
	REG_L	a1, TASK_TI_A1(tp)
_new_vmalloc_restore_context_a0:
	REG_L	a0, TASK_TI_A0(tp)
.endm


SYM_CODE_START(handle_exception)
	/*
	 * If coming from userspace, preserve the user thread pointer and load
@@ -30,6 +103,20 @@ SYM_CODE_START(handle_exception)

.Lrestore_kernel_tpsp:
	csrr tp, CSR_SCRATCH

#ifdef CONFIG_64BIT
	/*
	 * The RISC-V kernel does not eagerly emit a sfence.vma after each
	 * new vmalloc mapping, which may result in exceptions:
	 * - if the uarch caches invalid entries, the new mapping would not be
	 *   observed by the page table walker and an invalidation is needed.
	 * - if the uarch does not cache invalid entries, a reordered access
	 *   could "miss" the new mapping and traps: in that case, we only need
	 *   to retry the access, no sfence.vma is required.
	 */
	new_vmalloc_check
#endif

	REG_S sp, TASK_TI_KERNEL_SP(tp)

#ifdef CONFIG_VMAP_STACK
+2 −0
Original line number Diff line number Diff line
@@ -36,6 +36,8 @@

#include "../kernel/head.h"

/*
 * Bitmap with one bit per cpu: flush_cache_vmap() sets every bit when a new
 * vmalloc/module mapping is created, and new_vmalloc_check (in
 * handle_exception) tests and clears bit (cpu % 64) of word (cpu / 64) for
 * the faulting cpu.
 *
 * NOTE(review): the length divides NR_CPUS by sizeof(u64) (bytes per word)
 * whereas the consumer indexes by cpu / 64 (bits per word), so the array
 * holds more words than are ever indexed. Any resize must be mirrored in
 * the extern declaration next to flush_cache_vmap().
 */
u64 new_vmalloc[NR_CPUS / sizeof(u64) + 1];

struct kernel_mapping kernel_map __ro_after_init;
EXPORT_SYMBOL(kernel_map);
#ifdef CONFIG_XIP_KERNEL