Commit 83409986 authored by Thomas Gleixner, committed by Ingo Molnar
Browse files

rseq, virt: Retrigger RSEQ after vcpu_run()



Hypervisors invoke resume_user_mode_work() before entering the guest, which
clears TIF_NOTIFY_RESUME. The @regs argument is NULL as there is no user
space context available to them, so the rseq notify handler skips
inspecting the critical section, but updates the CPU/MM CID values
unconditionally so that the eventual pending rseq event is not lost on the
way to user space.

This is a pointless exercise as the task might be rescheduled before
actually returning to user space and it creates unnecessary work in the
vcpu_run() loops.

It's way more efficient to ignore that invocation based on @regs == NULL
and let the hypervisors re-raise TIF_NOTIFY_RESUME after returning from the
vcpu_run() loop before returning from the ioctl().

This ensures that a pending RSEQ update is not lost and the IDs are updated
before returning to user space.

Once the RSEQ handling is decoupled from TIF_NOTIFY_RESUME, this turns into
a NOOP.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Acked-by: Sean Christopherson <seanjc@google.com>
Link: https://patch.msgid.link/20251027084306.399495855@linutronix.de
parent d923739e
Loading
Loading
Loading
Loading
+3 −0
Original line number Diff line number Diff line
@@ -29,6 +29,7 @@
#include <linux/crash_dump.h>
#include <linux/panic_notifier.h>
#include <linux/vmalloc.h>
#include <linux/rseq.h>

#include "mshv_eventfd.h"
#include "mshv.h"
@@ -560,6 +561,8 @@ static long mshv_run_vp_with_root_scheduler(struct mshv_vp *vp)
		}
	} while (!vp->run.flags.intercept_suspend);

	rseq_virt_userspace_exit();

	return ret;
}

+17 −0
Original line number Diff line number Diff line
@@ -37,6 +37,22 @@ static __always_inline void rseq_exit_to_user_mode(void)
	}
}

/*
 * Hypervisors (KVM, Hyper-V) run resume_user_mode_work() ahead of guest
 * entry, and that call clears TIF_NOTIFY_RESUME. As it is invoked with a
 * NULL register argument, that entry invocation is ignored rather than
 * updating user space RSEQ only to do it again on the way out.
 *
 * Hypervisors therefore have to call this function once guest mode has
 * been left: if an rseq event is still pending, TIF_NOTIFY_RESUME is
 * raised again so it is not lost before returning to user space.
 */
static inline void rseq_virt_userspace_exit(void)
{
	/* Nothing pending, nothing to re-raise. */
	if (!current->rseq_event_pending)
		return;

	set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
}

/*
 * If parent process has a registered restartable sequences area, the
 * child inherits. Unregister rseq for a clone with CLONE_VM set.
@@ -68,6 +84,7 @@ static inline void rseq_execve(struct task_struct *t)
/*
 * Empty inline stubs for the rseq hooks: every call compiles to nothing,
 * so call sites need no conditional compilation of their own.
 * NOTE(review): presumably the !CONFIG_RSEQ variants - the guarding
 * preprocessor condition is outside this hunk, confirm.
 */
static inline void rseq_handle_notify_resume(struct pt_regs *regs) { }
static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { }
static inline void rseq_sched_switch_event(struct task_struct *t) { }
static inline void rseq_virt_userspace_exit(void) { }
static inline void rseq_fork(struct task_struct *t, u64 clone_flags) { }
static inline void rseq_execve(struct task_struct *t) { }
static inline void rseq_exit_to_user_mode(void) { }
+41 −37
Original line number Diff line number Diff line
@@ -422,19 +422,25 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
{
	struct task_struct *t = current;
	int ret, sig;
	bool event;

	/*
	 * If invoked from hypervisors before entering the guest via
	 * resume_user_mode_work(), then @regs is a NULL pointer.
	 *
	 * resume_user_mode_work() clears TIF_NOTIFY_RESUME. The hypervisor
	 * re-raises it via rseq_virt_userspace_exit() before returning from
	 * the ioctl() to user space when an rseq event is still pending.
	 *
	 * So it's safe to ignore here instead of pointlessly updating it
	 * in the vcpu_run() loop.
	 */
	if (!regs)
		return;

	if (unlikely(t->flags & PF_EXITING))
		return;

	/*
	 * If invoked from hypervisors or IO-URING, then @regs is a NULL
	 * pointer, so fixup cannot be done. If the syscall which led to
	 * this invocation was invoked inside a critical section, then it
	 * will either end up in this code again or a possible violation of
	 * a syscall inside a critical region can only be detected by the
	 * debug code in rseq_syscall() in a debug enabled kernel.
	 */
	if (regs) {
	/*
	 * Read and clear the event pending bit first. If the task
	 * was not preempted or migrated or a signal is on the way,
@@ -453,8 +459,6 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
	 * with the result handed in to allow the detection of
	 * inconsistencies.
	 */
		bool event;

	scoped_guard(RSEQ_EVENT_GUARD) {
		event = t->rseq_event_pending;
		t->rseq_event_pending = false;
@@ -465,7 +469,7 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
		if (unlikely(ret < 0))
			goto error;
	}
	}

	if (unlikely(rseq_update_cpu_node_id(t)))
		goto error;
	return;
+7 −0
Original line number Diff line number Diff line
@@ -49,6 +49,7 @@
#include <linux/lockdep.h>
#include <linux/kthread.h>
#include <linux/suspend.h>
#include <linux/rseq.h>

#include <asm/processor.h>
#include <asm/ioctl.h>
@@ -4476,6 +4477,12 @@ static long kvm_vcpu_ioctl(struct file *filp,
		r = kvm_arch_vcpu_ioctl_run(vcpu);
		vcpu->wants_to_run = false;

		/*
		 * FIXME: Remove this hack once all KVM architectures
		 * support the generic TIF bits, i.e. a dedicated TIF_RSEQ.
		 */
		rseq_virt_userspace_exit();

		trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
		break;
	}