Commit f3826aa9 authored by Linus Torvalds's avatar Linus Torvalds
Browse files
Pull kvm updates from Paolo Bonzini:
 "This excludes the bulk of the x86 changes, which I will send
  separately. They have two not complex but relatively unusual conflicts
  so I will wait for other dust to settle.

  guest_memfd:

   - Add support for host userspace mapping of guest_memfd-backed memory
     for VM types that do NOT support KVM_MEMORY_ATTRIBUTE_PRIVATE
     (which isn't precisely the same thing as CoCo VMs, since x86's
     SEV-MEM and SEV-ES have no way to detect private vs. shared).

     This lays the groundwork for removal of guest memory from the
     kernel direct map, as well as for limited mmap() for
     guest_memfd-backed memory.

     For more information see:
       - commit a6ad5413 ("Merge branch 'guest-memfd-mmap' into HEAD")
       - guest_memfd in Firecracker:
           https://github.com/firecracker-microvm/firecracker/tree/feature/secret-hiding
       - direct map removal:
           https://lore.kernel.org/all/20250221160728.1584559-1-roypat@amazon.co.uk/
       - mmap support:
           https://lore.kernel.org/all/20250328153133.3504118-1-tabba@google.com/

  ARM:

   - Add support for FF-A 1.2 as the secure memory conduit for pKVM,
     allowing more registers to be used as part of the message payload.

   - Change the way pKVM allocates its VM handles, making sure that the
     privileged hypervisor is never tricked into using uninitialised
     data.

   - Speed up MMIO range registration by avoiding unnecessary RCU
     synchronisation, which results in VMs starting much quicker.

   - Add the dump of the instruction stream when panicking in the EL2
     payload, just like the rest of the kernel has always done. This
     will hopefully help debugging non-VHE setups.

   - Add 52bit PA support to the stage-1 page-table walker, and make use
     of it to populate the fault level reported to the guest on failing
     to translate a stage-1 walk.

   - Add NV support to the GICv3-on-GICv5 emulation code, ensuring
     feature parity for guests, irrespective of the host platform.

   - Fix some really ugly architecture problems when dealing with debug
     in a nested VM. This has some bad performance impacts, but is at
     least correct.

   - Add enough infrastructure to be able to disable EL2 features and
     give effective values to the EL2 control registers. This then
     allows a bunch of features to be turned off, which helps cross-host
     migration.

   - Large rework of the selftest infrastructure to allow most tests to
     transparently run at EL2. This is the first step towards enabling
     NV testing.

   - Various fixes and improvements all over the map, including one BE
     fix, just in time for the removal of the feature.

  LoongArch:

   - Detect page table walk feature on new hardware

   - Add sign extension with kernel MMIO/IOCSR emulation

   - Improve in-kernel IPI emulation

   - Improve in-kernel PCH-PIC emulation

   - Move kvm_iocsr tracepoint out of generic code

  RISC-V:

   - Added SBI FWFT extension for Guest/VM with misaligned delegation
     and pointer masking PMLEN features

   - Added ONE_REG interface for SBI FWFT extension

   - Added Zicbop and bfloat16 extensions for Guest/VM

   - Enabled more common KVM selftests for RISC-V

   - Added SBI v3.0 PMU enhancements in KVM and perf driver

  s390:

   - Improve the selection of the CPU to wake up for an interrupt, in
     particular the heuristic to decide which vCPU to deliver a floating
     interrupt to.

   - Clear the PTE when discarding a swapped page because of CMMA; this
     bug was introduced in 6.16 when refactoring gmap code.

  x86 selftests:

   - Add #DE coverage in the fastops test (the only exception that's
     guest-triggerable in fastop-emulated instructions).

   - Fix PMU selftests errors encountered on Granite Rapids (GNR),
     Sierra Forest (SRF) and Clearwater Forest (CWF).

   - Minor cleanups and improvements

  x86 (guest side):

   - Force the legacy PCI hole (memory between TOLUD and 4GiB) to UC when
     overriding guest MTRR for TDX/SNP to fix an issue where ACPI
     auto-mapping could map devices as WB and prevent the device drivers
     from mapping their devices with UC/UC-.

   - Make kvm_async_pf_task_wake() a local static helper and remove its
     export.

   - Use native qspinlocks when running in a VM with dedicated
     vCPU=>pCPU bindings even when PV_UNHALT is unsupported.

  Generic:

   - Remove a redundant __GFP_NOWARN from kvm_setup_async_pf() as
     __GFP_NOWARN is now included in GFP_NOWAIT.

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (178 commits)
  KVM: s390: Fix to clear PTE when discarding a swapped page
  KVM: arm64: selftests: Cover ID_AA64ISAR3_EL1 in set_id_regs
  KVM: arm64: selftests: Remove a duplicate register listing in set_id_regs
  KVM: arm64: selftests: Cope with arch silliness in EL2 selftest
  KVM: arm64: selftests: Add basic test for running in VHE EL2
  KVM: arm64: selftests: Enable EL2 by default
  KVM: arm64: selftests: Initialize HCR_EL2
  KVM: arm64: selftests: Use the vCPU attr for setting nr of PMU counters
  KVM: arm64: selftests: Use hyp timer IRQs when test runs at EL2
  KVM: arm64: selftests: Select SMCCC conduit based on current EL
  KVM: arm64: selftests: Provide helper for getting default vCPU target
  KVM: arm64: selftests: Alias EL1 registers to EL2 counterparts
  KVM: arm64: selftests: Create a VGICv3 for 'default' VMs
  KVM: arm64: selftests: Add unsanitised helpers for VGICv3 creation
  KVM: arm64: selftests: Add helper to check for VGICv3 support
  KVM: arm64: selftests: Initialize VGICv3 only once
  KVM: arm64: selftests: Provide kvm_arch_vm_post_create() in library code
  KVM: selftests: Add ex_str() to print human friendly name of exception vectors
  selftests/kvm: remove stale TODO in xapic_state_test
  KVM: selftests: Handle Intel Atom errata that leads to PMU event overcount
  ...
parents bf897d26 99cab802
Loading
Loading
Loading
Loading
+9 −0
Original line number Diff line number Diff line
@@ -6414,6 +6414,15 @@ most one mapping per page, i.e. binding multiple memory regions to a single
guest_memfd range is not allowed (any number of memory regions can be bound to
a single guest_memfd file, but the bound ranges must not overlap).

When the capability KVM_CAP_GUEST_MEMFD_MMAP is supported, the 'flags' field
supports GUEST_MEMFD_FLAG_MMAP.  Setting this flag on guest_memfd creation
enables mmap() and faulting of guest_memfd memory to host userspace.

When the KVM MMU performs a PFN lookup to service a guest fault and the backing
guest_memfd has the GUEST_MEMFD_FLAG_MMAP set, then the fault will always be
consumed from guest_memfd, regardless of whether it is a shared or a private
fault.

See KVM_SET_USER_MEMORY_REGION2 for additional details.

4.143 KVM_PRE_FAULT_MEMORY
+2 −0
Original line number Diff line number Diff line
@@ -81,6 +81,8 @@ enum __kvm_host_smccc_func {
	__KVM_HOST_SMCCC_FUNC___kvm_timer_set_cntvoff,
	__KVM_HOST_SMCCC_FUNC___vgic_v3_save_vmcr_aprs,
	__KVM_HOST_SMCCC_FUNC___vgic_v3_restore_vmcr_aprs,
	__KVM_HOST_SMCCC_FUNC___pkvm_reserve_vm,
	__KVM_HOST_SMCCC_FUNC___pkvm_unreserve_vm,
	__KVM_HOST_SMCCC_FUNC___pkvm_init_vm,
	__KVM_HOST_SMCCC_FUNC___pkvm_init_vcpu,
	__KVM_HOST_SMCCC_FUNC___pkvm_teardown_vm,
+28 −6
Original line number Diff line number Diff line
@@ -220,6 +220,20 @@ static inline bool vcpu_el2_tge_is_set(const struct kvm_vcpu *vcpu)

/*
 * Report whether HCR_EL2.AMO should be treated as set for @vcpu.
 *
 * Returns true either when the vCPU is at EL2 with E2H set and TGE clear
 * (see below), or when the AMO bit is set in the HCR_EL2 value held in the
 * vCPU's register context.
 */
static inline bool vcpu_el2_amo_is_set(const struct kvm_vcpu *vcpu)
{
	/*
	 * DDI0487L.b Known Issue D22105
	 *
	 * At EL2 with HCR_EL2.{E2H,TGE} = {1, 0}, the effective value of
	 * HCR_EL2.AMO is IMPLEMENTATION DEFINED: either the programmed
	 * value or 1.
	 *
	 * We pick 1. Later changes to TGE or AMO that would make the SError
	 * deliverable cannot be caught, so honouring the programmed value
	 * is not an option here.
	 */
	const bool el2_e2h_without_tge = vcpu_is_el2(vcpu) &&
					 vcpu_el2_e2h_is_set(vcpu) &&
					 !vcpu_el2_tge_is_set(vcpu);

	/* Short-circuits: HCR_EL2 is only consulted when the override does not apply. */
	return el2_e2h_without_tge ||
	       (ctxt_sys_reg(&vcpu->arch.ctxt, HCR_EL2) & HCR_AMO);
}

@@ -511,21 +525,29 @@ static inline void kvm_vcpu_set_be(struct kvm_vcpu *vcpu)
	if (vcpu_mode_is_32bit(vcpu)) {
		*vcpu_cpsr(vcpu) |= PSR_AA32_E_BIT;
	} else {
		u64 sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1);
		enum vcpu_sysreg r;
		u64 sctlr;

		r = vcpu_has_nv(vcpu) ? SCTLR_EL2 : SCTLR_EL1;

		sctlr = vcpu_read_sys_reg(vcpu, r);
		sctlr |= SCTLR_ELx_EE;
		vcpu_write_sys_reg(vcpu, sctlr, SCTLR_EL1);
		vcpu_write_sys_reg(vcpu, sctlr, r);
	}
}

static inline bool kvm_vcpu_is_be(struct kvm_vcpu *vcpu)
{
	enum vcpu_sysreg r;
	u64 bit;

	if (vcpu_mode_is_32bit(vcpu))
		return !!(*vcpu_cpsr(vcpu) & PSR_AA32_E_BIT);

	if (vcpu_mode_priv(vcpu))
		return !!(vcpu_read_sys_reg(vcpu, SCTLR_EL1) & SCTLR_ELx_EE);
	else
		return !!(vcpu_read_sys_reg(vcpu, SCTLR_EL1) & SCTLR_EL1_E0E);
	r = is_hyp_ctxt(vcpu) ? SCTLR_EL2 : SCTLR_EL1;
	bit = vcpu_mode_priv(vcpu) ? SCTLR_ELx_EE : SCTLR_EL1_E0E;

	return vcpu_read_sys_reg(vcpu, r) & bit;
}

static inline unsigned long vcpu_data_guest_to_host(struct kvm_vcpu *vcpu,
+3 −2
Original line number Diff line number Diff line
@@ -252,7 +252,8 @@ struct kvm_protected_vm {
	pkvm_handle_t handle;
	struct kvm_hyp_memcache teardown_mc;
	struct kvm_hyp_memcache stage2_teardown_mc;
	bool enabled;
	bool is_protected;
	bool is_created;
};

struct kvm_mpidr_data {
@@ -1442,7 +1443,7 @@ struct kvm *kvm_arch_alloc_vm(void);

#define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLBS_RANGE

#define kvm_vm_is_protected(kvm)	(is_protected_kvm_enabled() && (kvm)->arch.pkvm.enabled)
#define kvm_vm_is_protected(kvm)	(is_protected_kvm_enabled() && (kvm)->arch.pkvm.is_protected)

#define vcpu_is_protected(vcpu)		kvm_vm_is_protected((vcpu)->kvm)

+25 −2
Original line number Diff line number Diff line
@@ -83,6 +83,8 @@ extern void check_nested_vcpu_requests(struct kvm_vcpu *vcpu);
extern void kvm_nested_flush_hwstate(struct kvm_vcpu *vcpu);
extern void kvm_nested_sync_hwstate(struct kvm_vcpu *vcpu);

extern void kvm_nested_setup_mdcr_el2(struct kvm_vcpu *vcpu);

struct kvm_s2_trans {
	phys_addr_t output;
	unsigned long block_size;
@@ -265,7 +267,7 @@ static inline u64 decode_range_tlbi(u64 val, u64 *range, u16 *asid)
	return base;
}

static inline unsigned int ps_to_output_size(unsigned int ps)
static inline unsigned int ps_to_output_size(unsigned int ps, bool pa52bit)
{
	switch (ps) {
	case 0: return 32;
@@ -273,7 +275,10 @@ static inline unsigned int ps_to_output_size(unsigned int ps)
	case 2: return 40;
	case 3: return 42;
	case 4: return 44;
	case 5:
	case 5: return 48;
	case 6: if (pa52bit)
			return 52;
		fallthrough;
	default:
		return 48;
	}
@@ -285,13 +290,28 @@ enum trans_regime {
	TR_EL2,
};

struct s1_walk_info;

/*
 * Context handed to an s1_walk_filter callback (the callback's first
 * argument is a pointer to this structure).
 */
struct s1_walk_context {
	/* Walk parameters this context refers to. */
	struct s1_walk_info	*wi;
	/* NOTE(review): presumably the IPA of the table being visited - confirm. */
	u64			table_ipa;
	/* NOTE(review): presumably the page-table level being visited - confirm. */
	int			level;
};

/*
 * Callback plus private data that can be attached to a stage-1 walk
 * through the 'filter' member of struct s1_walk_info.
 */
struct s1_walk_filter {
	/*
	 * Callback taking a walk context and an opaque pointer.
	 * NOTE(review): the meaning of the int return value is not visible
	 * here - check the walker before relying on it.
	 */
	int	(*fn)(struct s1_walk_context *, void *);
	/* NOTE(review): presumably passed as the second argument of fn - confirm. */
	void	*priv;
};

struct s1_walk_info {
	struct s1_walk_filter	*filter;
	u64	     		baddr;
	enum trans_regime	regime;
	unsigned int		max_oa_bits;
	unsigned int		pgshift;
	unsigned int		txsz;
	int 	     		sl;
	u8			sh;
	bool			as_el0;
	bool	     		hpd;
	bool			e0poe;
@@ -299,6 +319,7 @@ struct s1_walk_info {
	bool			pan;
	bool	     		be;
	bool	     		s2;
	bool			pa52bit;
};

struct s1_walk_result {
@@ -334,6 +355,8 @@ struct s1_walk_result {

int __kvm_translate_va(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
		       struct s1_walk_result *wr, u64 va);
int __kvm_find_s1_desc_level(struct kvm_vcpu *vcpu, u64 va, u64 ipa,
			     int *level);

/* VNCR management */
int kvm_vcpu_allocate_vncr_tlb(struct kvm_vcpu *vcpu);
Loading