Commit 05066caf authored by Claudio Imbrenda, committed by Heiko Carstens
Browse files

s390/mm/fault: Handle guest-related program interrupts in KVM



Any program interrupt that happens in the host during the execution of
a KVM guest will now short circuit the fault handler and return to KVM
immediately. Guest fault handling (including pfault) will happen
entirely inside KVM.

When sie64a() returns zero, current->thread.gmap_int_code will contain
the program interrupt number that caused the exit, or zero if the exit
was not caused by a host program interrupt.

KVM will now take care of handling all guest faults in vcpu_post_run().

Since gmap faults will not be visible to the rest of the kernel, remove
GMAP_FAULT, the Linux fault handlers for secure execution faults, the
exception table entries for the sie instruction, the nop padding after
the sie instruction, and all other references to guest faults from the
s390 code.

Signed-off-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
Co-developed-by: Heiko Carstens <hca@linux.ibm.com>
Link: https://lore.kernel.org/r/20241022120601.167009-6-imbrenda@linux.ibm.com


Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
parent 473aaf52
Loading
Loading
Loading
Loading
+3 −0
Original line number Diff line number Diff line
@@ -527,6 +527,9 @@ struct kvm_vcpu_stat {
#define PGM_REGION_FIRST_TRANS		0x39
#define PGM_REGION_SECOND_TRANS		0x3a
#define PGM_REGION_THIRD_TRANS		0x3b
#define PGM_SECURE_STORAGE_ACCESS	0x3d
#define PGM_NON_SECURE_STORAGE_ACCESS	0x3e
#define PGM_SECURE_STORAGE_VIOLATION	0x3f
#define PGM_MONITOR			0x40
#define PGM_PER				0x80
#define PGM_CRYPTO_OPERATION		0x119
+2 −3
Original line number Diff line number Diff line
@@ -39,6 +39,7 @@
#include <asm/runtime_instr.h>
#include <asm/irqflags.h>
#include <asm/alternative.h>
#include <asm/fault.h>

struct pcpu {
	unsigned long ec_mask;		/* bit mask for ec_xxx functions */
@@ -187,10 +188,8 @@ struct thread_struct {
	unsigned long hardirq_timer;		/* task cputime in hardirq context */
	unsigned long softirq_timer;		/* task cputime in softirq context */
	const sys_call_ptr_t *sys_call_table;	/* system call table address */
	unsigned long gmap_addr;		/* address of last gmap fault. */
	unsigned int gmap_write_flag;		/* gmap fault write indication */
	union teid gmap_teid;			/* address and flags of last gmap fault */
	unsigned int gmap_int_code;		/* int code of last gmap fault */
	unsigned int gmap_pfault;		/* signal of a pending guest pfault */
	int ufpu_flags;				/* user fpu flags */
	int kfpu_flags;				/* kernel fpu flags */

+1 −23
Original line number Diff line number Diff line
@@ -222,17 +222,6 @@ SYM_FUNC_START(__sie64a)
	lctlg	%c1,%c1,__LC_KERNEL_ASCE(%r14)	# load primary asce
	lg	%r14,__LC_CURRENT(%r14)
	mvi	__TI_sie(%r14),0
# some program checks are suppressing. C code (e.g. do_protection_exception)
# will rewind the PSW by the ILC, which is often 4 bytes in case of SIE. There
# are some corner cases (e.g. runtime instrumentation) where ILC is unpredictable.
# Other instructions between __sie64a and .Lsie_done should not cause program
# interrupts. So lets use 3 nops as a landing pad for all possible rewinds.
.Lrewind_pad6:
	nopr	7
.Lrewind_pad4:
	nopr	7
.Lrewind_pad2:
	nopr	7
SYM_INNER_LABEL(sie_exit, SYM_L_GLOBAL)
	lg	%r14,__SF_SIE_SAVEAREA(%r15)	# load guest register save area
	stmg	%r0,%r13,0(%r14)		# save guest gprs 0-13
@@ -244,15 +233,6 @@ SYM_INNER_LABEL(sie_exit, SYM_L_GLOBAL)
	lmg	%r6,%r14,__SF_GPRS(%r15)	# restore kernel registers
	lg	%r2,__SF_SIE_REASON(%r15)	# return exit reason code
	BR_EX	%r14
.Lsie_fault:
	lghi	%r14,-EFAULT
	stg	%r14,__SF_SIE_REASON(%r15)	# set exit reason code
	j	sie_exit

	EX_TABLE(.Lrewind_pad6,.Lsie_fault)
	EX_TABLE(.Lrewind_pad4,.Lsie_fault)
	EX_TABLE(.Lrewind_pad2,.Lsie_fault)
	EX_TABLE(sie_exit,.Lsie_fault)
SYM_FUNC_END(__sie64a)
EXPORT_SYMBOL(__sie64a)
EXPORT_SYMBOL(sie_exit)
@@ -329,7 +309,6 @@ SYM_CODE_START(pgm_check_handler)
	BPOFF
	lmg	%r8,%r9,__LC_PGM_OLD_PSW(%r13)
	xgr	%r10,%r10
	xgr	%r12,%r12
	tmhh	%r8,0x0001		# coming from user space?
	jno	.Lpgm_skip_asce
	lctlg	%c1,%c1,__LC_KERNEL_ASCE(%r13)
@@ -341,7 +320,6 @@ SYM_CODE_START(pgm_check_handler)
	jz	1f
	BPENTER	__SF_SIE_FLAGS(%r15),_TIF_ISOLATE_BP_GUEST
	SIEEXIT __SF_SIE_CONTROL(%r15),%r13
	lg	%r12,__SF_SIE_GUEST_ASCE(%r15)
	lghi	%r10,_PIF_GUEST_FAULT
#endif
1:	tmhh	%r8,0x4000		# PER bit set in old PSW ?
@@ -355,7 +333,6 @@ SYM_CODE_START(pgm_check_handler)
3:	lg	%r15,__LC_KERNEL_STACK(%r13)
4:	la	%r11,STACK_FRAME_OVERHEAD(%r15)
	stg	%r10,__PT_FLAGS(%r11)
	stg	%r12,__PT_CR1(%r11)
	xc	__SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15)
	stmg	%r0,%r7,__PT_R0(%r11)
	mvc	__PT_R8(64,%r11),__LC_SAVE_AREA(%r13)
@@ -369,6 +346,7 @@ SYM_CODE_START(pgm_check_handler)
	xgr	%r5,%r5
	xgr	%r6,%r6
	xgr	%r7,%r7
	xgr	%r12,%r12
	lgr	%r2,%r11
	brasl	%r14,__do_pgm_check
	tmhh	%r8,0x0001		# returning to user space?
+19 −5
Original line number Diff line number Diff line
@@ -31,6 +31,7 @@
#include <asm/asm-extable.h>
#include <asm/vtime.h>
#include <asm/fpu.h>
#include <asm/fault.h>
#include "entry.h"

static inline void __user *get_trap_ip(struct pt_regs *regs)
@@ -317,9 +318,24 @@ void noinstr __do_pgm_check(struct pt_regs *regs)
	struct lowcore *lc = get_lowcore();
	irqentry_state_t state;
	unsigned int trapnr;
	union teid teid;

	teid.val = lc->trans_exc_code;
	regs->int_code = lc->pgm_int_code;
	regs->int_parm_long = lc->trans_exc_code;
	regs->int_parm_long = teid.val;

	/*
	 * In case of a guest fault, short-circuit the fault handler and return.
	 * This way the sie64a() function will return 0; fault address and
	 * other relevant bits are saved in current->thread.gmap_teid, and
	 * the fault number in current->thread.gmap_int_code. KVM will be
	 * able to use this information to handle the fault.
	 */
	if (test_pt_regs_flag(regs, PIF_GUEST_FAULT)) {
		current->thread.gmap_teid.val = regs->int_parm_long;
		current->thread.gmap_int_code = regs->int_code & 0xffff;
		return;
	}

	state = irqentry_enter(regs);

@@ -408,8 +424,8 @@ static void (*pgm_check_table[128])(struct pt_regs *regs) = {
	[0x3b]		= do_dat_exception,
	[0x3c]		= default_trap_handler,
	[0x3d]		= do_secure_storage_access,
	[0x3e]		= do_non_secure_storage_access,
	[0x3f]		= do_secure_storage_violation,
	[0x3e]		= default_trap_handler,
	[0x3f]		= default_trap_handler,
	[0x40]		= monitor_event_exception,
	[0x41 ... 0x7f] = default_trap_handler,
};
@@ -420,5 +436,3 @@ static void (*pgm_check_table[128])(struct pt_regs *regs) = {
	__stringify(default_trap_handler))

COND_TRAP(do_secure_storage_access);
COND_TRAP(do_non_secure_storage_access);
COND_TRAP(do_secure_storage_violation);
+103 −22
Original line number Diff line number Diff line
@@ -4646,12 +4646,11 @@ static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu)
	if (!vcpu->arch.gmap->pfault_enabled)
		return false;

	hva = gfn_to_hva(vcpu->kvm, gpa_to_gfn(current->thread.gmap_addr));
	hva += current->thread.gmap_addr & ~PAGE_MASK;
	hva = gfn_to_hva(vcpu->kvm, current->thread.gmap_teid.addr);
	if (read_guest_real(vcpu, vcpu->arch.pfault_token, &arch.pfault_token, 8))
		return false;

	return kvm_setup_async_pf(vcpu, current->thread.gmap_addr, hva, &arch);
	return kvm_setup_async_pf(vcpu, current->thread.gmap_teid.addr * PAGE_SIZE, hva, &arch);
}

static int vcpu_pre_run(struct kvm_vcpu *vcpu)
@@ -4689,6 +4688,7 @@ static int vcpu_pre_run(struct kvm_vcpu *vcpu)
	clear_bit(vcpu->vcpu_idx, vcpu->kvm->arch.gisa_int.kicked_mask);

	vcpu->arch.sie_block->icptcode = 0;
	current->thread.gmap_int_code = 0;
	cpuflags = atomic_read(&vcpu->arch.sie_block->cpuflags);
	VCPU_EVENT(vcpu, 6, "entering sie flags %x", cpuflags);
	trace_kvm_s390_sie_enter(vcpu, cpuflags);
@@ -4696,7 +4696,7 @@ static int vcpu_pre_run(struct kvm_vcpu *vcpu)
	return 0;
}

static int vcpu_post_run_fault_in_sie(struct kvm_vcpu *vcpu)
static int vcpu_post_run_addressing_exception(struct kvm_vcpu *vcpu)
{
	struct kvm_s390_pgm_info pgm_info = {
		.code = PGM_ADDRESSING,
@@ -4732,10 +4732,106 @@ static int vcpu_post_run_fault_in_sie(struct kvm_vcpu *vcpu)
	return kvm_s390_inject_prog_irq(vcpu, &pgm_info);
}

/*
 * vcpu_post_run_handle_fault() - handle the program interrupt recorded for
 * the last SIE exit.
 * @vcpu: the vcpu whose run just ended
 *
 * Dispatches on current->thread.gmap_int_code and uses the address and
 * address-space bits stored in current->thread.gmap_teid:
 *
 *  - 0: no host program interrupt was recorded; only the exit_null
 *    statistic is bumped.
 *  - PGM_NON_SECURE_STORAGE_ACCESS: try to convert the page to secure;
 *    SIGSEGV is sent to current if the conversion returns -EINVAL.
 *  - PGM_SECURE_STORAGE_ACCESS / PGM_SECURE_STORAGE_VIOLATION: try to
 *    destroy the page; on failure a rate-limited warning is printed and
 *    SIGSEGV is sent to current.
 *  - protection / translation exceptions: resolve the fault through
 *    gmap_fault(), going through the async pfault path first when pfault
 *    is enabled for the gmap.
 *  - anything else: flagged with KVM_BUG() and SIGSEGV is sent to current.
 *
 * Return: 0 when the fault was handled or needs no further action here,
 * -EREMOTE when a ucontrol fault has to be forwarded via vcpu->run, the
 * result of vcpu_post_run_addressing_exception() when gmap_fault() reports
 * -EFAULT for a non-ucontrol guest, or otherwise the value returned by
 * gmap_fault().
 */
static int vcpu_post_run_handle_fault(struct kvm_vcpu *vcpu)
{
	unsigned long gaddr;
	/*
	 * Must start out as 0: FAULT_FLAG_WRITE is only set for write faults,
	 * and the value is OR-ed with other flags and handed to gmap_fault()
	 * below. Leaving it uninitialized would pass an indeterminate value
	 * for every non-write fault.
	 */
	unsigned int flags = 0;
	int rc = 0;

	/* The TEID holds a page frame number; turn it into a byte address. */
	gaddr = current->thread.gmap_teid.addr * PAGE_SIZE;
	if (kvm_s390_cur_gmap_fault_is_write())
		flags = FAULT_FLAG_WRITE;

	switch (current->thread.gmap_int_code) {
	case 0:
		vcpu->stat.exit_null++;
		break;
	case PGM_NON_SECURE_STORAGE_ACCESS:
		KVM_BUG(current->thread.gmap_teid.as != PSW_BITS_AS_PRIMARY, vcpu->kvm,
			"Unexpected program interrupt 0x%x, TEID 0x%016lx",
			current->thread.gmap_int_code, current->thread.gmap_teid.val);
		/*
		 * This is normal operation; a page belonging to a protected
		 * guest has not been imported yet. Try to import the page into
		 * the protected guest.
		 */
		if (gmap_convert_to_secure(vcpu->arch.gmap, gaddr) == -EINVAL)
			send_sig(SIGSEGV, current, 0);
		break;
	case PGM_SECURE_STORAGE_ACCESS:
	case PGM_SECURE_STORAGE_VIOLATION:
		KVM_BUG(current->thread.gmap_teid.as != PSW_BITS_AS_PRIMARY, vcpu->kvm,
			"Unexpected program interrupt 0x%x, TEID 0x%016lx",
			current->thread.gmap_int_code, current->thread.gmap_teid.val);
		/*
		 * This can happen after a reboot with asynchronous teardown;
		 * the new guest (normal or protected) will run on top of the
		 * previous protected guest. The old pages need to be destroyed
		 * so the new guest can use them.
		 */
		if (gmap_destroy_page(vcpu->arch.gmap, gaddr)) {
			/*
			 * Either KVM messed up the secure guest mapping or the
			 * same page is mapped into multiple secure guests.
			 *
			 * This exception is only triggered when a guest 2 is
			 * running and can therefore never occur in kernel
			 * context.
			 */
			pr_warn_ratelimited("Secure storage violation (%x) in task: %s, pid %d\n",
					    current->thread.gmap_int_code, current->comm,
					    current->pid);
			send_sig(SIGSEGV, current, 0);
		}
		break;
	case PGM_PROTECTION:
	case PGM_SEGMENT_TRANSLATION:
	case PGM_PAGE_TRANSLATION:
	case PGM_ASCE_TYPE:
	case PGM_REGION_FIRST_TRANS:
	case PGM_REGION_SECOND_TRANS:
	case PGM_REGION_THIRD_TRANS:
		KVM_BUG(current->thread.gmap_teid.as != PSW_BITS_AS_PRIMARY, vcpu->kvm,
			"Unexpected program interrupt 0x%x, TEID 0x%016lx",
			current->thread.gmap_int_code, current->thread.gmap_teid.val);
		if (vcpu->arch.gmap->pfault_enabled) {
			/* First attempt must not wait, so a pfault can be set up. */
			rc = gmap_fault(vcpu->arch.gmap, gaddr, flags | FAULT_FLAG_RETRY_NOWAIT);
			if (rc == -EFAULT)
				return vcpu_post_run_addressing_exception(vcpu);
			if (rc == -EAGAIN) {
				trace_kvm_s390_major_guest_pfault(vcpu);
				if (kvm_arch_setup_async_pf(vcpu))
					return 0;
				/* Async pfault not possible: resolve synchronously below. */
				vcpu->stat.pfault_sync++;
			} else {
				return rc;
			}
		}
		rc = gmap_fault(vcpu->arch.gmap, gaddr, flags);
		if (rc == -EFAULT) {
			if (kvm_is_ucontrol(vcpu->kvm)) {
				/* Hand the fault to userspace through the run structure. */
				vcpu->run->exit_reason = KVM_EXIT_S390_UCONTROL;
				vcpu->run->s390_ucontrol.trans_exc_code = gaddr;
				vcpu->run->s390_ucontrol.pgm_code = 0x10;
				return -EREMOTE;
			}
			return vcpu_post_run_addressing_exception(vcpu);
		}
		break;
	default:
		KVM_BUG(1, vcpu->kvm, "Unexpected program interrupt 0x%x, TEID 0x%016lx",
			current->thread.gmap_int_code, current->thread.gmap_teid.val);
		send_sig(SIGSEGV, current, 0);
		break;
	}
	return rc;
}

static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason)
{
	struct mcck_volatile_info *mcck_info;
	struct sie_page *sie_page;
	int rc;

	VCPU_EVENT(vcpu, 6, "exit sie icptcode %d",
		   vcpu->arch.sie_block->icptcode);
@@ -4757,7 +4853,7 @@ static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason)
	}

	if (vcpu->arch.sie_block->icptcode > 0) {
		int rc = kvm_handle_sie_intercept(vcpu);
		rc = kvm_handle_sie_intercept(vcpu);

		if (rc != -EOPNOTSUPP)
			return rc;
@@ -4766,24 +4862,9 @@ static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason)
		vcpu->run->s390_sieic.ipa = vcpu->arch.sie_block->ipa;
		vcpu->run->s390_sieic.ipb = vcpu->arch.sie_block->ipb;
		return -EREMOTE;
	} else if (exit_reason != -EFAULT) {
		vcpu->stat.exit_null++;
		return 0;
	} else if (kvm_is_ucontrol(vcpu->kvm)) {
		vcpu->run->exit_reason = KVM_EXIT_S390_UCONTROL;
		vcpu->run->s390_ucontrol.trans_exc_code =
						current->thread.gmap_addr;
		vcpu->run->s390_ucontrol.pgm_code = 0x10;
		return -EREMOTE;
	} else if (current->thread.gmap_pfault) {
		trace_kvm_s390_major_guest_pfault(vcpu);
		current->thread.gmap_pfault = 0;
		if (kvm_arch_setup_async_pf(vcpu))
			return 0;
		vcpu->stat.pfault_sync++;
		return gmap_fault(vcpu->arch.gmap, current->thread.gmap_addr, FAULT_FLAG_WRITE);
	}
	return vcpu_post_run_fault_in_sie(vcpu);

	return vcpu_post_run_handle_fault(vcpu);
}

#define PSW_INT_MASK (PSW_MASK_EXT | PSW_MASK_IO | PSW_MASK_MCHECK)
Loading