Commit fcce7c1e authored by Paolo Bonzini's avatar Paolo Bonzini
Browse files

Merge tag 'kvm-x86-pvclock-6.15' of https://github.com/kvm-x86/linux into HEAD

KVM PV clock changes for 6.15:

 - Don't take kvm->lock when iterating over vCPUs in the suspend notifier to
   fix a largely theoretical deadlock.

 - Use the vCPU's actual Xen PV clock information when starting the Xen timer,
   as the cached state in arch.hv_clock can be stale/bogus.

 - Fix a bug where KVM could bleed PVCLOCK_GUEST_STOPPED across different
   PV clocks.

 - Restrict PVCLOCK_GUEST_STOPPED to kvmclock, as KVM's suspend notifier only
   accounts for kvmclock, and there's no evidence that the flag is actually
   supported by Xen guests.

 - Clean up the per-vCPU "cache" of its reference pvclock, and instead only
   track the vCPU's TSC scaling (multipler+shift) metadata (which is moderately
   expensive to compute, and rarely changes for modern setups).
parents 9b093f5b 1b3c3805
Loading
Loading
Loading
Loading
+2 −1
Original line number Diff line number Diff line
@@ -911,7 +911,8 @@ struct kvm_vcpu_arch {
	int (*complete_userspace_io)(struct kvm_vcpu *vcpu);

	gpa_t time;
	struct pvclock_vcpu_time_info hv_clock;
	s8  pvclock_tsc_shift;
	u32 pvclock_tsc_mul;
	unsigned int hw_tsc_khz;
	struct gfn_to_pfn_cache pv_time;
	/* set guest stopped flag in pvclock flags field */
+57 −58
Original line number Diff line number Diff line
@@ -3121,15 +3121,17 @@ u64 get_kvmclock_ns(struct kvm *kvm)
	return data.clock;
}

static void kvm_setup_guest_pvclock(struct kvm_vcpu *v,
static void kvm_setup_guest_pvclock(struct pvclock_vcpu_time_info *ref_hv_clock,
				    struct kvm_vcpu *vcpu,
				    struct gfn_to_pfn_cache *gpc,
				    unsigned int offset,
				    bool force_tsc_unstable)
				    unsigned int offset)
{
	struct kvm_vcpu_arch *vcpu = &v->arch;
	struct pvclock_vcpu_time_info *guest_hv_clock;
	struct pvclock_vcpu_time_info hv_clock;
	unsigned long flags;

	memcpy(&hv_clock, ref_hv_clock, sizeof(hv_clock));

	read_lock_irqsave(&gpc->lock, flags);
	while (!kvm_gpc_check(gpc, offset + sizeof(*guest_hv_clock))) {
		read_unlock_irqrestore(&gpc->lock, flags);
@@ -3149,52 +3151,34 @@ static void kvm_setup_guest_pvclock(struct kvm_vcpu *v,
	 * it is consistent.
	 */

	guest_hv_clock->version = vcpu->hv_clock.version = (guest_hv_clock->version + 1) | 1;
	guest_hv_clock->version = hv_clock.version = (guest_hv_clock->version + 1) | 1;
	smp_wmb();

	/* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
	vcpu->hv_clock.flags |= (guest_hv_clock->flags & PVCLOCK_GUEST_STOPPED);

	if (vcpu->pvclock_set_guest_stopped_request) {
		vcpu->hv_clock.flags |= PVCLOCK_GUEST_STOPPED;
		vcpu->pvclock_set_guest_stopped_request = false;
	}

	memcpy(guest_hv_clock, &vcpu->hv_clock, sizeof(*guest_hv_clock));
	hv_clock.flags |= (guest_hv_clock->flags & PVCLOCK_GUEST_STOPPED);

	if (force_tsc_unstable)
		guest_hv_clock->flags &= ~PVCLOCK_TSC_STABLE_BIT;
	memcpy(guest_hv_clock, &hv_clock, sizeof(*guest_hv_clock));

	smp_wmb();

	guest_hv_clock->version = ++vcpu->hv_clock.version;
	guest_hv_clock->version = ++hv_clock.version;

	kvm_gpc_mark_dirty_in_slot(gpc);
	read_unlock_irqrestore(&gpc->lock, flags);

	trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
	trace_kvm_pvclock_update(vcpu->vcpu_id, &hv_clock);
}

static int kvm_guest_time_update(struct kvm_vcpu *v)
{
	struct pvclock_vcpu_time_info hv_clock = {};
	unsigned long flags, tgt_tsc_khz;
	unsigned seq;
	struct kvm_vcpu_arch *vcpu = &v->arch;
	struct kvm_arch *ka = &v->kvm->arch;
	s64 kernel_ns;
	u64 tsc_timestamp, host_tsc;
	u8 pvclock_flags;
	bool use_master_clock;
#ifdef CONFIG_KVM_XEN
	/*
	 * For Xen guests we may need to override PVCLOCK_TSC_STABLE_BIT as unless
	 * explicitly told to use TSC as its clocksource Xen will not set this bit.
	 * This default behaviour led to bugs in some guest kernels which cause
	 * problems if they observe PVCLOCK_TSC_STABLE_BIT in the pvclock flags.
	 */
	bool xen_pvclock_tsc_unstable =
		ka->xen_hvm_config.flags & KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE;
#endif

	kernel_ns = 0;
	host_tsc = 0;
@@ -3255,35 +3239,58 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)

	if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) {
		kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL,
				   &vcpu->hv_clock.tsc_shift,
				   &vcpu->hv_clock.tsc_to_system_mul);
				   &vcpu->pvclock_tsc_shift,
				   &vcpu->pvclock_tsc_mul);
		vcpu->hw_tsc_khz = tgt_tsc_khz;
		kvm_xen_update_tsc_info(v);
	}

	vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
	vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
	hv_clock.tsc_shift = vcpu->pvclock_tsc_shift;
	hv_clock.tsc_to_system_mul = vcpu->pvclock_tsc_mul;
	hv_clock.tsc_timestamp = tsc_timestamp;
	hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
	vcpu->last_guest_tsc = tsc_timestamp;

	/* If the host uses TSC clocksource, then it is stable */
	pvclock_flags = 0;
	hv_clock.flags = 0;
	if (use_master_clock)
		pvclock_flags |= PVCLOCK_TSC_STABLE_BIT;
		hv_clock.flags |= PVCLOCK_TSC_STABLE_BIT;

	if (vcpu->pv_time.active) {
		/*
		 * GUEST_STOPPED is only supported by kvmclock, and KVM's
		 * historic behavior is to only process the request if kvmclock
		 * is active/enabled.
		 */
		if (vcpu->pvclock_set_guest_stopped_request) {
			hv_clock.flags |= PVCLOCK_GUEST_STOPPED;
			vcpu->pvclock_set_guest_stopped_request = false;
		}
		kvm_setup_guest_pvclock(&hv_clock, v, &vcpu->pv_time, 0);

		hv_clock.flags &= ~PVCLOCK_GUEST_STOPPED;
	}

	vcpu->hv_clock.flags = pvclock_flags;
	kvm_hv_setup_tsc_page(v->kvm, &hv_clock);

	if (vcpu->pv_time.active)
		kvm_setup_guest_pvclock(v, &vcpu->pv_time, 0, false);
#ifdef CONFIG_KVM_XEN
	/*
	 * For Xen guests we may need to override PVCLOCK_TSC_STABLE_BIT as unless
	 * explicitly told to use TSC as its clocksource Xen will not set this bit.
	 * This default behaviour led to bugs in some guest kernels which cause
	 * problems if they observe PVCLOCK_TSC_STABLE_BIT in the pvclock flags.
	 *
	 * Note!  Clear TSC_STABLE only for Xen clocks, i.e. the order matters!
	 */
	if (ka->xen_hvm_config.flags & KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE)
		hv_clock.flags &= ~PVCLOCK_TSC_STABLE_BIT;

	if (vcpu->xen.vcpu_info_cache.active)
		kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_info_cache,
					offsetof(struct compat_vcpu_info, time),
					xen_pvclock_tsc_unstable);
		kvm_setup_guest_pvclock(&hv_clock, v, &vcpu->xen.vcpu_info_cache,
					offsetof(struct compat_vcpu_info, time));
	if (vcpu->xen.vcpu_time_info_cache.active)
		kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_time_info_cache, 0,
					xen_pvclock_tsc_unstable);
		kvm_setup_guest_pvclock(&hv_clock, v, &vcpu->xen.vcpu_time_info_cache, 0);
#endif
	kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
	return 0;
}

@@ -6910,23 +6917,15 @@ static int kvm_arch_suspend_notifier(struct kvm *kvm)
{
	struct kvm_vcpu *vcpu;
	unsigned long i;
	int ret = 0;

	mutex_lock(&kvm->lock);
	kvm_for_each_vcpu(i, vcpu, kvm) {
		if (!vcpu->arch.pv_time.active)
			continue;

		ret = kvm_set_guest_paused(vcpu);
		if (ret) {
			kvm_err("Failed to pause guest VCPU%d: %d\n",
				vcpu->vcpu_id, ret);
			break;
		}
	}
	mutex_unlock(&kvm->lock);
	/*
	 * Ignore the return, marking the guest paused only "fails" if the vCPU
	 * isn't using kvmclock; continuing on is correct and desirable.
	 */
	kvm_for_each_vcpu(i, vcpu, kvm)
		(void)kvm_set_guest_paused(vcpu);

	return ret ? NOTIFY_BAD : NOTIFY_DONE;
	return NOTIFY_DONE;
}

int kvm_arch_pm_notifier(struct kvm *kvm, unsigned long state)
+62 −7
Original line number Diff line number Diff line
@@ -150,11 +150,46 @@ static enum hrtimer_restart xen_timer_callback(struct hrtimer *timer)
	return HRTIMER_NORESTART;
}

static int xen_get_guest_pvclock(struct kvm_vcpu *vcpu,
				 struct pvclock_vcpu_time_info *hv_clock,
				 struct gfn_to_pfn_cache *gpc,
				 unsigned int offset)
{
	unsigned long flags;
	int r;

	read_lock_irqsave(&gpc->lock, flags);
	while (!kvm_gpc_check(gpc, offset + sizeof(*hv_clock))) {
		read_unlock_irqrestore(&gpc->lock, flags);

		r = kvm_gpc_refresh(gpc, offset + sizeof(*hv_clock));
		if (r)
			return r;

		read_lock_irqsave(&gpc->lock, flags);
	}

	memcpy(hv_clock, gpc->khva + offset, sizeof(*hv_clock));
	read_unlock_irqrestore(&gpc->lock, flags);

	/*
	 * Sanity check TSC shift+multiplier to verify the guest's view of time
	 * is more or less consistent.
	 */
	if (hv_clock->tsc_shift != vcpu->arch.pvclock_tsc_shift ||
	    hv_clock->tsc_to_system_mul != vcpu->arch.pvclock_tsc_mul)
		return -EINVAL;

	return 0;
}

static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs,
				bool linux_wa)
{
	struct kvm_vcpu_xen *xen = &vcpu->arch.xen;
	int64_t kernel_now, delta;
	uint64_t guest_now;
	int r = -EOPNOTSUPP;

	/*
	 * The guest provides the requested timeout in absolute nanoseconds
@@ -173,10 +208,29 @@ static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs,
	 * the absolute CLOCK_MONOTONIC time at which the timer should
	 * fire.
	 */
	if (vcpu->arch.hv_clock.version && vcpu->kvm->arch.use_master_clock &&
	    static_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
	do {
		struct pvclock_vcpu_time_info hv_clock;
		uint64_t host_tsc, guest_tsc;

		if (!static_cpu_has(X86_FEATURE_CONSTANT_TSC) ||
		    !vcpu->kvm->arch.use_master_clock)
			break;

		/*
		 * If both Xen PV clocks are active, arbitrarily try to use the
		 * compat clock first, but also try to use the non-compat clock
		 * if the compat clock is unusable.  The two PV clocks hold the
		 * same information, but it's possible one (or both) is stale
		 * and/or currently unreachable.
		 */
		if (xen->vcpu_info_cache.active)
			r = xen_get_guest_pvclock(vcpu, &hv_clock, &xen->vcpu_info_cache,
						  offsetof(struct compat_vcpu_info, time));
		if (r && xen->vcpu_time_info_cache.active)
			r = xen_get_guest_pvclock(vcpu, &hv_clock, &xen->vcpu_time_info_cache, 0);
		if (r)
			break;

		if (!IS_ENABLED(CONFIG_64BIT) ||
		    !kvm_get_monotonic_and_clockread(&kernel_now, &host_tsc)) {
			/*
@@ -197,9 +251,10 @@ static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs,

		/* Calculate the guest kvmclock as the guest would do it. */
		guest_tsc = kvm_read_l1_tsc(vcpu, host_tsc);
		guest_now = __pvclock_read_cycles(&vcpu->arch.hv_clock,
						  guest_tsc);
	} else {
		guest_now = __pvclock_read_cycles(&hv_clock, guest_tsc);
	} while (0);

	if (r) {
		/*
		 * Without CONSTANT_TSC, get_kvmclock_ns() is the only option.
		 *
@@ -2261,8 +2316,8 @@ void kvm_xen_update_tsc_info(struct kvm_vcpu *vcpu)

	entry = kvm_find_cpuid_entry_index(vcpu, function, 1);
	if (entry) {
		entry->ecx = vcpu->arch.hv_clock.tsc_to_system_mul;
		entry->edx = vcpu->arch.hv_clock.tsc_shift;
		entry->ecx = vcpu->arch.pvclock_tsc_mul;
		entry->edx = vcpu->arch.pvclock_tsc_shift;
	}

	entry = kvm_find_cpuid_entry_index(vcpu, function, 2);