Commit 954a209f authored by Linus Torvalds's avatar Linus Torvalds
Browse files
Pull kvm fixes from Paolo Bonzini:
 "ARM:

   - Correctly clean the BSS to the PoC before allowing EL2 to access it
     on nVHE/hVHE/protected configurations

   - Propagate ownership of debug registers in protected mode after the
     rework that landed in 6.14-rc1

   - Stop pretending that we can run the protected mode without a GICv3
     being present on the host

   - Fix a use-after-free situation that can occur if a vcpu fails to
     initialise the NV shadow S2 MMU contexts

   - Always evaluate the need to arm a background timer for fully
     emulated guest timers

   - Fix the emulation of EL1 timers in the absence of FEAT_ECV

   - Correctly handle the EL2 virtual timer, especially when HCR_EL2.E2H==0

  s390:

   - move some of the guest page table (gmap) logic into KVM itself,
     inching towards the final goal of completely removing gmap from the
     non-kvm memory management code.

     As an initial set of cleanups, move some code from mm/gmap into kvm
     and start using __kvm_faultin_pfn() to fault-in pages as needed;
     but especially stop abusing page->index and page->lru to aid in the
     pgdesc conversion.

  x86:

   - Add missing check in the fix to defer starting the huge page
     recovery vhost_task

   - SRSO_USER_KERNEL_NO does not need SYNTHESIZED_F"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (31 commits)
  KVM: x86/mmu: Ensure NX huge page recovery thread is alive before waking
  KVM: remove kvm_arch_post_init_vm
  KVM: selftests: Fix spelling mistake "initally" -> "initially"
  kvm: x86: SRSO_USER_KERNEL_NO is not synthesized
  KVM: arm64: timer: Don't adjust the EL2 virtual timer offset
  KVM: arm64: timer: Correctly handle EL1 timer emulation when !FEAT_ECV
  KVM: arm64: timer: Always evaluate the need for a soft timer
  KVM: arm64: Fix nested S2 MMU structures reallocation
  KVM: arm64: Fail protected mode init if no vgic hardware is present
  KVM: arm64: Flush/sync debug state in protected mode
  KVM: s390: selftests: Streamline uc_skey test to issue iske after sske
  KVM: s390: remove the last user of page->index
  KVM: s390: move PGSTE softbits
  KVM: s390: remove useless page->index usage
  KVM: s390: move gmap_shadow_pgt_lookup() into kvm
  KVM: s390: stop using lists to keep track of used dat tables
  KVM: s390: stop using page->index for non-shadow gmaps
  KVM: s390: move some gmap shadowing functions away from mm/gmap.c
  KVM: s390: get rid of gmap_translate()
  KVM: s390: get rid of gmap_fault()
  ...
parents 9946eaf5 43fb96ae
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -1419,7 +1419,7 @@ fetch) is injected in the guest.
S390:
^^^^^

Returns -EINVAL if the VM has the KVM_VM_S390_UCONTROL flag set.
Returns -EINVAL or -EEXIST if the VM has the KVM_VM_S390_UCONTROL flag set.
Returns -EINVAL if called on a protected VM.

4.36 KVM_SET_TSS_ADDR
+11 −38
Original line number Diff line number Diff line
@@ -471,10 +471,8 @@ static void timer_emulate(struct arch_timer_context *ctx)

	trace_kvm_timer_emulate(ctx, should_fire);

	if (should_fire != ctx->irq.level) {
	if (should_fire != ctx->irq.level)
		kvm_timer_update_irq(ctx->vcpu, should_fire, ctx);
		return;
	}

	kvm_timer_update_status(ctx, should_fire);

@@ -761,21 +759,6 @@ static void kvm_timer_vcpu_load_nested_switch(struct kvm_vcpu *vcpu,
					    timer_irq(map->direct_ptimer),
					    &arch_timer_irq_ops);
		WARN_ON_ONCE(ret);

		/*
		 * The virtual offset behaviour is "interesting", as it
		 * always applies when HCR_EL2.E2H==0, but only when
		 * accessed from EL1 when HCR_EL2.E2H==1. So make sure we
		 * track E2H when putting the HV timer in "direct" mode.
		 */
		if (map->direct_vtimer == vcpu_hvtimer(vcpu)) {
			struct arch_timer_offset *offs = &map->direct_vtimer->offset;

			if (vcpu_el2_e2h_is_set(vcpu))
				offs->vcpu_offset = NULL;
			else
				offs->vcpu_offset = &__vcpu_sys_reg(vcpu, CNTVOFF_EL2);
		}
	}
}

@@ -976,31 +959,21 @@ void kvm_timer_sync_nested(struct kvm_vcpu *vcpu)
	 * which allows trapping of the timer registers even with NV2.
	 * Still, this is still worse than FEAT_NV on its own. Meh.
	 */
	if (!vcpu_el2_e2h_is_set(vcpu)) {
		if (cpus_have_final_cap(ARM64_HAS_ECV))
			return;

		/*
		 * A non-VHE guest hypervisor doesn't have any direct access
		 * to its timers: the EL2 registers trap (and the HW is
		 * fully emulated), while the EL0 registers access memory
		 * despite the access being notionally direct. Boo.
		 *
		 * We update the hardware timer registers with the
		 * latest value written by the guest to the VNCR page
		 * and let the hardware take care of the rest.
		 */
		write_sysreg_el0(__vcpu_sys_reg(vcpu, CNTV_CTL_EL0),  SYS_CNTV_CTL);
		write_sysreg_el0(__vcpu_sys_reg(vcpu, CNTV_CVAL_EL0), SYS_CNTV_CVAL);
		write_sysreg_el0(__vcpu_sys_reg(vcpu, CNTP_CTL_EL0),  SYS_CNTP_CTL);
		write_sysreg_el0(__vcpu_sys_reg(vcpu, CNTP_CVAL_EL0), SYS_CNTP_CVAL);
	} else {
	if (!cpus_have_final_cap(ARM64_HAS_ECV)) {
		/*
		 * For a VHE guest hypervisor, the EL2 state is directly
		 * stored in the host EL1 timers, while the emulated EL0
		 * stored in the host EL1 timers, while the emulated EL1
		 * state is stored in the VNCR page. The latter could have
		 * been updated behind our back, and we must reset the
		 * emulation of the timers.
		 *
		 * A non-VHE guest hypervisor doesn't have any direct access
		 * to its timers: the EL2 registers trap despite being
		 * notionally direct (we use the EL1 HW, as for VHE), while
		 * the EL1 registers access memory.
		 *
		 * In both cases, process the emulated timers on each guest
		 * exit. Boo.
		 */
		struct timer_map map;
		get_timer_map(vcpu, &map);
+20 −0
Original line number Diff line number Diff line
@@ -2290,6 +2290,19 @@ static int __init init_subsystems(void)
		break;
	case -ENODEV:
	case -ENXIO:
		/*
		 * No VGIC? No pKVM for you.
		 *
		 * Protected mode assumes that VGICv3 is present, so no point
		 * in trying to hobble along if vgic initialization fails.
		 */
		if (is_protected_kvm_enabled())
			goto out;

		/*
		 * Otherwise, userspace could choose to implement a GIC for its
		 * guest on non-cooperative hardware.
		 */
		vgic_present = false;
		err = 0;
		break;
@@ -2400,6 +2413,13 @@ static void kvm_hyp_init_symbols(void)
	kvm_nvhe_sym(id_aa64smfr0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64SMFR0_EL1);
	kvm_nvhe_sym(__icache_flags) = __icache_flags;
	kvm_nvhe_sym(kvm_arm_vmid_bits) = kvm_arm_vmid_bits;

	/*
	 * Flush entire BSS since part of its data containing init symbols is read
	 * while the MMU is off.
	 */
	kvm_flush_dcache_to_poc(kvm_ksym_ref(__hyp_bss_start),
				kvm_ksym_ref(__hyp_bss_end) - kvm_ksym_ref(__hyp_bss_start));
}

static int __init kvm_hyp_init_protection(u32 hyp_va_bits)
+24 −0
Original line number Diff line number Diff line
@@ -91,11 +91,34 @@ static void fpsimd_sve_sync(struct kvm_vcpu *vcpu)
	*host_data_ptr(fp_owner) = FP_STATE_HOST_OWNED;
}

/*
 * Bring the hypervisor's copy of the vCPU debug state up to date with the
 * host's copy.
 *
 * The debug ownership is copied first; the ownership helpers are then
 * evaluated on the hypervisor's vCPU to pick which register state (guest
 * or external) gets copied from the host vCPU. If neither helper reports
 * ownership, only the owner field is updated.
 */
static void flush_debug_state(struct pkvm_hyp_vcpu *hyp_vcpu)
{
	struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu;
	struct kvm_vcpu *vcpu = &hyp_vcpu->vcpu;

	vcpu->arch.debug_owner = host_vcpu->arch.debug_owner;

	if (kvm_guest_owns_debug_regs(vcpu)) {
		vcpu->arch.vcpu_debug_state = host_vcpu->arch.vcpu_debug_state;
		return;
	}

	if (kvm_host_owns_debug_regs(vcpu))
		vcpu->arch.external_debug_state = host_vcpu->arch.external_debug_state;
}

/*
 * Copy debug register state from the hypervisor's vCPU back to the host's
 * vCPU.
 *
 * Which state is copied (guest or external) is decided by the ownership
 * helpers evaluated on the hypervisor's vCPU. The owner field itself is
 * not written back, and nothing is copied when neither helper reports
 * ownership.
 */
static void sync_debug_state(struct pkvm_hyp_vcpu *hyp_vcpu)
{
	struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu;
	struct kvm_vcpu *vcpu = &hyp_vcpu->vcpu;

	if (kvm_guest_owns_debug_regs(vcpu)) {
		host_vcpu->arch.vcpu_debug_state = vcpu->arch.vcpu_debug_state;
		return;
	}

	if (kvm_host_owns_debug_regs(vcpu))
		host_vcpu->arch.external_debug_state = vcpu->arch.external_debug_state;
}

static void flush_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu)
{
	struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu;

	fpsimd_sve_flush();
	flush_debug_state(hyp_vcpu);

	hyp_vcpu->vcpu.arch.ctxt	= host_vcpu->arch.ctxt;

@@ -123,6 +146,7 @@ static void sync_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu)
	unsigned int i;

	fpsimd_sve_sync(&hyp_vcpu->vcpu);
	sync_debug_state(hyp_vcpu);

	host_vcpu->arch.ctxt		= hyp_vcpu->vcpu.arch.ctxt;

+5 −4
Original line number Diff line number Diff line
@@ -67,26 +67,27 @@ int kvm_vcpu_init_nested(struct kvm_vcpu *vcpu)
	if (!tmp)
		return -ENOMEM;

	swap(kvm->arch.nested_mmus, tmp);

	/*
	 * If we went through a reallocation, adjust the MMU back-pointers in
	 * the previously initialised kvm_pgtable structures.
	 */
	if (kvm->arch.nested_mmus != tmp)
		for (int i = 0; i < kvm->arch.nested_mmus_size; i++)
			tmp[i].pgt->mmu = &tmp[i];
			kvm->arch.nested_mmus[i].pgt->mmu = &kvm->arch.nested_mmus[i];

	for (int i = kvm->arch.nested_mmus_size; !ret && i < num_mmus; i++)
		ret = init_nested_s2_mmu(kvm, &tmp[i]);
		ret = init_nested_s2_mmu(kvm, &kvm->arch.nested_mmus[i]);

	if (ret) {
		for (int i = kvm->arch.nested_mmus_size; i < num_mmus; i++)
			kvm_free_stage2_pgd(&tmp[i]);
			kvm_free_stage2_pgd(&kvm->arch.nested_mmus[i]);

		return ret;
	}

	kvm->arch.nested_mmus_size = num_mmus;
	kvm->arch.nested_mmus = tmp;

	return 0;
}
Loading