KVM: x86/xen: improve accuracy of Xen timers (451a7078) · Commits · git / linux-net

arch/x86/kvm/x86.c

+56 −5

Original line number	Diff line number	Diff line
		@@ -2862,7 +2862,11 @@ static inline u64 vgettsc(struct pvclock_clock clock, u64 tsc_timestamp,
		return v * clock->mult;
		}

		static int do_monotonic_raw(s64 t, u64 tsc_timestamp)
		/*
		* As with get_kvmclock_base_ns(), this counts from boot time, at the
		* frequency of CLOCK_MONOTONIC_RAW (hence adding gtos->offs_boot).
		*/
		static int do_kvmclock_base(s64 t, u64 tsc_timestamp)
		{
		struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
		unsigned long seq;
		@@ -2881,6 +2885,29 @@ static int do_monotonic_raw(s64 t, u64 tsc_timestamp)
		return mode;
		}

		/*
		* This calculates CLOCK_MONOTONIC at the time of the TSC snapshot, with
		* no boot time offset.
		*/
		static int do_monotonic(s64 t, u64 tsc_timestamp)
		{
		struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
		unsigned long seq;
		int mode;
		u64 ns;

		do {
		seq = read_seqcount_begin(&gtod->seq);
		ns = gtod->clock.base_cycles;
		ns += vgettsc(&gtod->clock, tsc_timestamp, &mode);
		ns >>= gtod->clock.shift;
		ns += ktime_to_ns(gtod->clock.offset);
		} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
		*t = ns;

		return mode;
		}

		static int do_realtime(struct timespec64 ts, u64 tsc_timestamp)
		{
		struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
		@@ -2902,18 +2929,42 @@ static int do_realtime(struct timespec64 ts, u64 tsc_timestamp)
		return mode;
		}

		/* returns true if host is using TSC based clocksource */
		/*
		* Calculates the kvmclock_base_ns (CLOCK_MONOTONIC_RAW + boot time) and
		* reports the TSC value from which it do so. Returns true if host is
		* using TSC based clocksource.
		*/
		static bool kvm_get_time_and_clockread(s64 kernel_ns, u64 tsc_timestamp)
		{
		/* checked again under seqlock below */
		if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
		return false;

		return gtod_is_based_on_tsc(do_monotonic_raw(kernel_ns,
		return gtod_is_based_on_tsc(do_kvmclock_base(kernel_ns,
		tsc_timestamp));
		}

		/*
		* Calculates CLOCK_MONOTONIC and reports the TSC value from which it did
		* so. Returns true if host is using TSC based clocksource.
		*/
		bool kvm_get_monotonic_and_clockread(s64 kernel_ns, u64 tsc_timestamp)
		{
		/* checked again under seqlock below */
		if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
		return false;

		return gtod_is_based_on_tsc(do_monotonic(kernel_ns,
		tsc_timestamp));
		}

		/* returns true if host is using TSC based clocksource */
		/*
		* Calculates CLOCK_REALTIME and reports the TSC value from which it did
		* so. Returns true if host is using TSC based clocksource.
		*
		* DO NOT USE this for anything related to migration. You want CLOCK_TAI
		* for that.
		*/
		static bool kvm_get_walltime_and_clockread(struct timespec64 *ts,
		u64 *tsc_timestamp)
		{

arch/x86/kvm/x86.h

+1 −0

Original line number	Diff line number	Diff line
		@@ -294,6 +294,7 @@ void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);

		u64 get_kvmclock_ns(struct kvm *kvm);
		uint64_t kvm_get_wall_clock_epoch(struct kvm *kvm);
		bool kvm_get_monotonic_and_clockread(s64 kernel_ns, u64 tsc_timestamp);

		int kvm_read_guest_virt(struct kvm_vcpu *vcpu,
		gva_t addr, void *val, unsigned int bytes,

arch/x86/kvm/xen.c

+95 −35

Original line number	Diff line number	Diff line
		@@ -24,6 +24,7 @@
		#include <xen/interface/sched.h>

		#include <asm/xen/cpuid.h>
		#include <asm/pvclock.h>

		#include "cpuid.h"
		#include "trace.h"
		@@ -149,8 +150,93 @@ static enum hrtimer_restart xen_timer_callback(struct hrtimer *timer)
		return HRTIMER_NORESTART;
		}

		static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs, s64 delta_ns)
		static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs,
		bool linux_wa)
		{
		int64_t kernel_now, delta;
		uint64_t guest_now;

		/*
		* The guest provides the requested timeout in absolute nanoseconds
		* of the KVM clock — as it sees it, based on the scaled TSC and
		* the pvclock information provided by KVM.
		*
		* The kernel doesn't support hrtimers based on CLOCK_MONOTONIC_RAW
		* so use CLOCK_MONOTONIC. In the timescales covered by timers, the
		* difference won't matter much as there is no cumulative effect.
		*
		* Calculate the time for some arbitrary point in time around "now"
		* in terms of both kvmclock and CLOCK_MONOTONIC. Calculate the
		* delta between the kvmclock "now" value and the guest's requested
		* timeout, apply the "Linux workaround" described below, and add
		* the resulting delta to the CLOCK_MONOTONIC "now" value, to get
		* the absolute CLOCK_MONOTONIC time at which the timer should
		* fire.
		*/
		if (vcpu->arch.hv_clock.version && vcpu->kvm->arch.use_master_clock &&
		static_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
		uint64_t host_tsc, guest_tsc;

		if (!IS_ENABLED(CONFIG_64BIT) \|\|
		!kvm_get_monotonic_and_clockread(&kernel_now, &host_tsc)) {
		/*
		* Don't fall back to get_kvmclock_ns() because it's
		* broken; it has a systemic error in its results
		* because it scales directly from host TSC to
		* nanoseconds, and doesn't scale first to guest TSC
		* and then to nanoseconds as the guest does.
		*
		* There is a small error introduced here because time
		* continues to elapse between the ktime_get() and the
		* subsequent rdtsc(). But not the systemic drift due
		* to get_kvmclock_ns().
		*/
		kernel_now = ktime_get(); /* This is CLOCK_MONOTONIC */
		host_tsc = rdtsc();
		}

		/* Calculate the guest kvmclock as the guest would do it. */
		guest_tsc = kvm_read_l1_tsc(vcpu, host_tsc);
		guest_now = __pvclock_read_cycles(&vcpu->arch.hv_clock,
		guest_tsc);
		} else {
		/*
		* Without CONSTANT_TSC, get_kvmclock_ns() is the only option.
		*
		* Also if the guest PV clock hasn't been set up yet, as is
		* likely to be the case during migration when the vCPU has
		* not been run yet. It would be possible to calculate the
		* scaling factors properly in that case but there's not much
		* point in doing so. The get_kvmclock_ns() drift accumulates
		* over time, so it's OK to use it at startup. Besides, on
		* migration there's going to be a little bit of skew in the
		* precise moment at which timers fire anyway. Often they'll
		* be in the "past" by the time the VM is running again after
		* migration.
		*/
		guest_now = get_kvmclock_ns(vcpu->kvm);
		kernel_now = ktime_get();
		}

		delta = guest_abs - guest_now;

		/*
		* Xen has a 'Linux workaround' in do_set_timer_op() which checks for
		* negative absolute timeout values (caused by integer overflow), and
		* for values about 13 days in the future (2^50ns) which would be
		* caused by jiffies overflow. For those cases, Xen sets the timeout
		* 100ms in the future (not too soon, since if a guest really did
		* set a long timeout on purpose we don't want to keep churning CPU
		* time by waking it up). Emulate Xen's workaround when starting the
		* timer in response to __HYPERVISOR_set_timer_op.
		*/
		if (linux_wa &&
		unlikely((int64_t)guest_abs < 0 \|\|
		(delta > 0 && (uint32_t) (delta >> 50) != 0))) {
		delta = 100 * NSEC_PER_MSEC;
		guest_abs = guest_now + delta;
		}

		/*
		* Avoid races with the old timer firing. Checking timer_expires
		* to avoid calling hrtimer_cancel() will only have false positives
		@@ -162,15 +248,13 @@ static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs, s64 delta_
		atomic_set(&vcpu->arch.xen.timer_pending, 0);
		vcpu->arch.xen.timer_expires = guest_abs;

		if (delta_ns <= 0) {
		if (delta <= 0)
		xen_timer_callback(&vcpu->arch.xen.timer);
		} else {
		ktime_t ktime_now = ktime_get();
		else
		hrtimer_start(&vcpu->arch.xen.timer,
		ktime_add_ns(ktime_now, delta_ns),
		ktime_add_ns(kernel_now, delta),
		HRTIMER_MODE_ABS_HARD);
		}
		}

		static void kvm_xen_stop_timer(struct kvm_vcpu *vcpu)
		{
		@@ -997,9 +1081,7 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu vcpu, struct kvm_xen_vcpu_attr data)

		/* Start the timer if the new value has a valid vector+expiry. */
		if (data->u.timer.port && data->u.timer.expires_ns)
		kvm_xen_start_timer(vcpu, data->u.timer.expires_ns,
		data->u.timer.expires_ns -
		get_kvmclock_ns(vcpu->kvm));
		kvm_xen_start_timer(vcpu, data->u.timer.expires_ns, false);

		r = 0;
		break;
		@@ -1472,7 +1554,6 @@ static bool kvm_xen_hcall_vcpu_op(struct kvm_vcpu *vcpu, bool longmode, int cmd,
		{
		struct vcpu_set_singleshot_timer oneshot;
		struct x86_exception e;
		s64 delta;

		if (!kvm_xen_timer_enabled(vcpu))
		return false;
		@@ -1506,9 +1587,7 @@ static bool kvm_xen_hcall_vcpu_op(struct kvm_vcpu *vcpu, bool longmode, int cmd,
		return true;
		}

		/* A delta <= 0 results in an immediate callback, which is what we want */
		delta = oneshot.timeout_abs_ns - get_kvmclock_ns(vcpu->kvm);
		kvm_xen_start_timer(vcpu, oneshot.timeout_abs_ns, delta);
		kvm_xen_start_timer(vcpu, oneshot.timeout_abs_ns, false);
		*r = 0;
		return true;

		@@ -1531,29 +1610,10 @@ static bool kvm_xen_hcall_set_timer_op(struct kvm_vcpu *vcpu, uint64_t timeout,
		if (!kvm_xen_timer_enabled(vcpu))
		return false;

		if (timeout) {
		uint64_t guest_now = get_kvmclock_ns(vcpu->kvm);
		int64_t delta = timeout - guest_now;

		/* Xen has a 'Linux workaround' in do_set_timer_op() which
		* checks for negative absolute timeout values (caused by
		* integer overflow), and for values about 13 days in the
		* future (2^50ns) which would be caused by jiffies
		* overflow. For those cases, it sets the timeout 100ms in
		* the future (not too soon, since if a guest really did
		* set a long timeout on purpose we don't want to keep
		* churning CPU time by waking it up).
		*/
		if (unlikely((int64_t)timeout < 0 \|\|
		(delta > 0 && (uint32_t) (delta >> 50) != 0))) {
		delta = 100 * NSEC_PER_MSEC;
		timeout = guest_now + delta;
		}

		kvm_xen_start_timer(vcpu, timeout, delta);
		} else {
		if (timeout)
		kvm_xen_start_timer(vcpu, timeout, true);
		else
		kvm_xen_stop_timer(vcpu);
		}

		*r = 0;
		return true;