Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm (02e5f74e) · Commits · git / linux-net

Documentation/virt/kvm/api.rst

+17 −3

Original line number	Diff line number	Diff line
		@@ -1229,6 +1229,9 @@ It is not possible to read back a pending external abort (injected via
		KVM_SET_VCPU_EVENTS or otherwise) because such an exception is always delivered
		directly to the virtual CPU.

		Calling this ioctl on a vCPU that hasn't been initialized will return
		-ENOEXEC.

		::

		struct kvm_vcpu_events {
		@@ -1309,6 +1312,8 @@ exceptions by manipulating individual registers using the KVM_SET_ONE_REG API.

		See KVM_GET_VCPU_EVENTS for the data structure.

		Calling this ioctl on a vCPU that hasn't been initialized will return
		-ENOEXEC.

		4.33 KVM_GET_DEBUGREGS
		----------------------
		@@ -6432,9 +6437,18 @@ most one mapping per page, i.e. binding multiple memory regions to a single
		guest_memfd range is not allowed (any number of memory regions can be bound to
		a single guest_memfd file, but the bound ranges must not overlap).

		When the capability KVM_CAP_GUEST_MEMFD_MMAP is supported, the 'flags' field
		supports GUEST_MEMFD_FLAG_MMAP. Setting this flag on guest_memfd creation
		enables mmap() and faulting of guest_memfd memory to host userspace.
		The capability KVM_CAP_GUEST_MEMFD_FLAGS enumerates the `flags` that can be
		specified via KVM_CREATE_GUEST_MEMFD. Currently defined flags:

		============================ ================================================
		GUEST_MEMFD_FLAG_MMAP        Enable using mmap() on the guest_memfd file
		                             descriptor.
		GUEST_MEMFD_FLAG_INIT_SHARED Make all memory in the file shared during
		                             KVM_CREATE_GUEST_MEMFD (memory files created
		                             without INIT_SHARED will be marked private).
		                             Shared memory can be faulted into host userspace
		                             page tables. Private memory cannot.
		============================ ================================================

		When the KVM MMU performs a PFN lookup to service a guest fault and the backing
		guest_memfd has the GUEST_MEMFD_FLAG_MMAP set, then the fault will always be

Documentation/virt/kvm/devices/arm-vgic-v3.rst

+2 −1

Original line number	Diff line number	Diff line
		@@ -13,7 +13,8 @@ will act as the VM interrupt controller, requiring emulated user-space devices
		to inject interrupts to the VGIC instead of directly to CPUs. It is not
		possible to create both a GICv3 and GICv2 on the same VM.

		Creating a guest GICv3 device requires a host GICv3 as well.
		Creating a guest GICv3 device requires a GICv3 host, or a GICv5 host with
		support for FEAT_GCIE_LEGACY.


		Groups:

arch/arm64/include/asm/el2_setup.h

+32 −6

Original line number	Diff line number	Diff line
		@@ -24,22 +24,48 @@
		* ID_AA64MMFR4_EL1.E2H0 < 0. On such CPUs HCR_EL2.E2H is RES1, but it
		* can reset into an UNKNOWN state and might not read as 1 until it has
		* been initialized explicitly.
		*
		* Fruity CPUs seem to have HCR_EL2.E2H set to RAO/WI, but
		* don't advertise it (they predate this relaxation).
		*
		* Initialize HCR_EL2.E2H so that later code can rely upon HCR_EL2.E2H
		* indicating whether the CPU is running in E2H mode.
		*/
		mrs_s x1, SYS_ID_AA64MMFR4_EL1
		sbfx x1, x1, #ID_AA64MMFR4_EL1_E2H0_SHIFT, #ID_AA64MMFR4_EL1_E2H0_WIDTH
		cmp x1, #0
		b.ge .LnVHE_\@
		b.lt .LnE2H0_\@

		/*
		* Unfortunately, HCR_EL2.E2H can be RES1 even if not advertised
		* as such via ID_AA64MMFR4_EL1.E2H0:
		*
		* - Fruity CPUs predate the !FEAT_E2H0 relaxation, and seem to
		* have HCR_EL2.E2H implemented as RAO/WI.
		*
		* - On CPUs that lack FEAT_FGT, a hypervisor can't trap guest
		* reads of ID_AA64MMFR4_EL1 to advertise !FEAT_E2H0. NV
		* guests on these hosts can write to HCR_EL2.E2H without
		* trapping to the hypervisor, but these writes have no
		* functional effect.
		*
		* Handle both cases by checking for an essential VHE property
		* (system register remapping) to decide whether we're
		* effectively VHE-only or not.
		*/
		msr_hcr_el2 x0 // Setup HCR_EL2 as nVHE
		isb
		mov x1, #1 // Write something to FAR_EL1
		msr far_el1, x1
		isb
		mov x1, #2 // Try to overwrite it via FAR_EL2
		msr far_el2, x1
		isb
		mrs x1, far_el1 // If we see the latest write in FAR_EL1,
		cmp x1, #2 // we can safely assume we are VHE only.
		b.ne .LnVHE_\@ // Otherwise, we know that nVHE works.

		.LnE2H0_\@:
		orr x0, x0, #HCR_E2H
		.LnVHE_\@:
		msr_hcr_el2 x0
		isb
		.LnVHE_\@:
		.endm

		.macro __init_el2_sctlr

arch/arm64/include/asm/kvm_host.h

+50 −0

Original line number	Diff line number	Diff line
		@@ -816,6 +816,11 @@ struct kvm_vcpu_arch {
		u64 hcrx_el2;
		u64 mdcr_el2;

		struct {
		u64 r;
		u64 w;
		} fgt[__NR_FGT_GROUP_IDS__];

		/* Exception Information */
		struct kvm_vcpu_fault_info fault;

		@@ -1600,6 +1605,51 @@ static inline bool kvm_arch_has_irq_bypass(void)
		/*
		 * Prototypes only; the definitions are not part of this excerpt.
		 *
		 * get_reg_fixed_bits() takes @kvm, @res0 and @res1 by pointer. The
		 * by-value spelling previously shown here was an artefact of '*'
		 * characters being swallowed as emphasis markers.
		 * NOTE(review): @res0/@res1 are presumably written by the callee --
		 * confirm against the definition.
		 */
		void compute_fgu(struct kvm *kvm, enum fgt_group_id fgt);
		void get_reg_fixed_bits(struct kvm *kvm, enum vcpu_sysreg reg, u64 *res0, u64 *res1);
		void check_feature_map(void);
		void kvm_vcpu_load_fgt(struct kvm_vcpu *vcpu);

		/*
		 * Translate a fine-grained-trap register identifier (@reg) into the
		 * fgt_group_id that it belongs to.
		 *
		 * Registers listed together under one return statement share a group:
		 * e.g. HFGRTR_EL2 and HFGWTR_EL2 both map to HFGRTR_GROUP, while
		 * HFGITR_EL2, HAFGRTR_EL2 and HFGITR2_EL2 each map to a group of their own.
		 *
		 * NOTE(review): the default arm contains only BUILD_BUG_ON(1) and no
		 * return. This presumably relies on the function being force-inlined
		 * with a compile-time constant @reg so that the arm is discarded for
		 * every listed register and breaks the build otherwise -- confirm.
		 */
		static __always_inline enum fgt_group_id __fgt_reg_to_group_id(enum vcpu_sysreg reg)
		{
		switch (reg) {
		case HFGRTR_EL2:
		case HFGWTR_EL2:
		return HFGRTR_GROUP;
		case HFGITR_EL2:
		return HFGITR_GROUP;
		case HDFGRTR_EL2:
		case HDFGWTR_EL2:
		return HDFGRTR_GROUP;
		case HAFGRTR_EL2:
		return HAFGRTR_GROUP;
		case HFGRTR2_EL2:
		case HFGWTR2_EL2:
		return HFGRTR2_GROUP;
		case HFGITR2_EL2:
		return HFGITR2_GROUP;
		case HDFGRTR2_EL2:
		case HDFGWTR2_EL2:
		return HDFGRTR2_GROUP;
		default:
		/* Not a register this helper knows how to classify. */
		BUILD_BUG_ON(1);
		}
		}

		/*
		 * vcpu_fgt(vcpu, reg) - locate the per-vCPU slot backing FGT register @reg.
		 *
		 * Evaluates (via a statement expression) to a 'u64 *' pointing into
		 * (vcpu)->arch.fgt[id], where id is __fgt_reg_to_group_id(reg):
		 *   - the '.w' member for HFGWTR_EL2, HDFGWTR_EL2, HFGWTR2_EL2 and
		 *     HDFGWTR2_EL2;
		 *   - the '.r' member for every other register.
		 *
		 * The result is a pointer, so callers dereference it to read or write
		 * the value. @reg is expanded twice (group lookup and switch), so it
		 * must not have side effects.
		 */
		#define vcpu_fgt(vcpu, reg) \
		({ \
		enum fgt_group_id id = __fgt_reg_to_group_id(reg); \
		u64 *p; \
		switch (reg) { \
		case HFGWTR_EL2: \
		case HDFGWTR_EL2: \
		case HFGWTR2_EL2: \
		case HDFGWTR2_EL2: \
		p = &(vcpu)->arch.fgt[id].w; \
		break; \
		default: \
		p = &(vcpu)->arch.fgt[id].r; \
		break; \
		} \
		\
		p; \
		})

		#endif /* __ARM64_KVM_HOST_H__ */

arch/arm64/kvm/arch_timer.c

+14 −91

Original line number	Diff line number	Diff line
		@@ -66,7 +66,7 @@ static int nr_timers(struct kvm_vcpu *vcpu)

		u32 timer_get_ctl(struct arch_timer_context *ctxt)
		{
		struct kvm_vcpu *vcpu = ctxt->vcpu;
		struct kvm_vcpu *vcpu = timer_context_to_vcpu(ctxt);

		switch(arch_timer_ctx_index(ctxt)) {
		case TIMER_VTIMER:
		@@ -85,7 +85,7 @@ u32 timer_get_ctl(struct arch_timer_context *ctxt)

		u64 timer_get_cval(struct arch_timer_context *ctxt)
		{
		struct kvm_vcpu *vcpu = ctxt->vcpu;
		struct kvm_vcpu *vcpu = timer_context_to_vcpu(ctxt);

		switch(arch_timer_ctx_index(ctxt)) {
		case TIMER_VTIMER:
		@@ -104,7 +104,7 @@ u64 timer_get_cval(struct arch_timer_context *ctxt)

		static void timer_set_ctl(struct arch_timer_context *ctxt, u32 ctl)
		{
		struct kvm_vcpu *vcpu = ctxt->vcpu;
		struct kvm_vcpu *vcpu = timer_context_to_vcpu(ctxt);

		switch(arch_timer_ctx_index(ctxt)) {
		case TIMER_VTIMER:
		@@ -126,7 +126,7 @@ static void timer_set_ctl(struct arch_timer_context *ctxt, u32 ctl)

		static void timer_set_cval(struct arch_timer_context *ctxt, u64 cval)
		{
		struct kvm_vcpu *vcpu = ctxt->vcpu;
		struct kvm_vcpu *vcpu = timer_context_to_vcpu(ctxt);

		switch(arch_timer_ctx_index(ctxt)) {
		case TIMER_VTIMER:
		@@ -146,16 +146,6 @@ static void timer_set_cval(struct arch_timer_context *ctxt, u64 cval)
		}
		}

		/*
		 * Store @offset through the timer context's vm_offset pointer.
		 *
		 * When the context has no vm_offset pointer nothing is stored; WARN() is
		 * invoked with @offset as its condition, naming the timer index.
		 */
		static void timer_set_offset(struct arch_timer_context *ctxt, u64 offset)
		{
			if (ctxt->offset.vm_offset)
				WRITE_ONCE(*ctxt->offset.vm_offset, offset);
			else
				WARN(offset, "timer %ld\n", arch_timer_ctx_index(ctxt));
		}

		u64 kvm_phys_timer_read(void)
		{
		return timecounter->cc->read(timecounter->cc);
		@@ -343,7 +333,7 @@ static enum hrtimer_restart kvm_hrtimer_expire(struct hrtimer *hrt)
		u64 ns;

		ctx = container_of(hrt, struct arch_timer_context, hrtimer);
		vcpu = ctx->vcpu;
		vcpu = timer_context_to_vcpu(ctx);

		trace_kvm_timer_hrtimer_expire(ctx);

		@@ -436,8 +426,9 @@ static void kvm_timer_update_status(struct arch_timer_context *ctx, bool level)
		*
		* But hey, it's fast, right?
		*/
		if (is_hyp_ctxt(ctx->vcpu) &&
		(ctx == vcpu_vtimer(ctx->vcpu) || ctx == vcpu_ptimer(ctx->vcpu))) {
		struct kvm_vcpu *vcpu = timer_context_to_vcpu(ctx);
		if (is_hyp_ctxt(vcpu) &&
		(ctx == vcpu_vtimer(vcpu) || ctx == vcpu_ptimer(vcpu))) {
		unsigned long val = timer_get_ctl(ctx);
		__assign_bit(__ffs(ARCH_TIMER_CTRL_IT_STAT), &val, level);
		timer_set_ctl(ctx, val);
		@@ -470,7 +461,7 @@ static void timer_emulate(struct arch_timer_context *ctx)
		trace_kvm_timer_emulate(ctx, should_fire);

		if (should_fire != ctx->irq.level)
		kvm_timer_update_irq(ctx->vcpu, should_fire, ctx);
		kvm_timer_update_irq(timer_context_to_vcpu(ctx), should_fire, ctx);

		kvm_timer_update_status(ctx, should_fire);

		@@ -498,7 +489,7 @@ static void set_cntpoff(u64 cntpoff)

		static void timer_save_state(struct arch_timer_context *ctx)
		{
		struct arch_timer_cpu *timer = vcpu_timer(ctx->vcpu);
		struct arch_timer_cpu *timer = vcpu_timer(timer_context_to_vcpu(ctx));
		enum kvm_arch_timers index = arch_timer_ctx_index(ctx);
		unsigned long flags;

		@@ -609,7 +600,7 @@ static void kvm_timer_unblocking(struct kvm_vcpu *vcpu)

		static void timer_restore_state(struct arch_timer_context *ctx)
		{
		struct arch_timer_cpu *timer = vcpu_timer(ctx->vcpu);
		struct arch_timer_cpu *timer = vcpu_timer(timer_context_to_vcpu(ctx));
		enum kvm_arch_timers index = arch_timer_ctx_index(ctx);
		unsigned long flags;

		@@ -668,7 +659,7 @@ static inline void set_timer_irq_phys_active(struct arch_timer_context *ctx, boo

		static void kvm_timer_vcpu_load_gic(struct arch_timer_context *ctx)
		{
		struct kvm_vcpu *vcpu = ctx->vcpu;
		struct kvm_vcpu *vcpu = timer_context_to_vcpu(ctx);
		bool phys_active = false;

		/*
		@@ -677,7 +668,7 @@ static void kvm_timer_vcpu_load_gic(struct arch_timer_context *ctx)
		* this point and the register restoration, we'll take the
		* interrupt anyway.
		*/
		kvm_timer_update_irq(ctx->vcpu, kvm_timer_should_fire(ctx), ctx);
		kvm_timer_update_irq(vcpu, kvm_timer_should_fire(ctx), ctx);

		if (irqchip_in_kernel(vcpu->kvm))
		phys_active = kvm_vgic_map_is_active(vcpu, timer_irq(ctx));
		@@ -1063,7 +1054,7 @@ static void timer_context_init(struct kvm_vcpu *vcpu, int timerid)
		struct arch_timer_context *ctxt = vcpu_get_timer(vcpu, timerid);
		struct kvm *kvm = vcpu->kvm;

		ctxt->vcpu = vcpu;
		ctxt->timer_id = timerid;

		if (timerid == TIMER_VTIMER)
		ctxt->offset.vm_offset = &kvm->arch.timer_data.voffset;
		@@ -1121,49 +1112,6 @@ void kvm_timer_cpu_down(void)
		disable_percpu_irq(host_ptimer_irq);
		}

		/*
		 * Write @value to the timer register identified by @regid.
		 *
		 * The KVM_REG_ARM_TIMER_* ids act on vcpu_vtimer(vcpu) and the
		 * KVM_REG_ARM_PTIMER_* ids on vcpu_ptimer(vcpu). CTL and CVAL are
		 * forwarded to kvm_arm_timer_write(). CNT is turned into an offset
		 * (kvm_phys_timer_read() - value) and is ignored when
		 * KVM_ARCH_FLAG_VM_COUNTER_OFFSET is set in the VM's arch flags.
		 *
		 * Returns 0 on success, -1 if @regid is not one of the six known ids.
		 */
		int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value)
		{
			struct arch_timer_context *timer;

			/* First decide which timer context the id refers to. */
			switch (regid) {
			case KVM_REG_ARM_TIMER_CTL:
			case KVM_REG_ARM_TIMER_CNT:
			case KVM_REG_ARM_TIMER_CVAL:
				timer = vcpu_vtimer(vcpu);
				break;
			case KVM_REG_ARM_PTIMER_CTL:
			case KVM_REG_ARM_PTIMER_CNT:
			case KVM_REG_ARM_PTIMER_CVAL:
				timer = vcpu_ptimer(vcpu);
				break;
			default:
				return -1;
			}

			/* Then apply the operation, which is the same for both timers. */
			switch (regid) {
			case KVM_REG_ARM_TIMER_CTL:
			case KVM_REG_ARM_PTIMER_CTL:
				kvm_arm_timer_write(vcpu, timer, TIMER_REG_CTL, value);
				break;
			case KVM_REG_ARM_TIMER_CVAL:
			case KVM_REG_ARM_PTIMER_CVAL:
				kvm_arm_timer_write(vcpu, timer, TIMER_REG_CVAL, value);
				break;
			default:
				/* Only the two CNT ids can reach this point. */
				if (!test_bit(KVM_ARCH_FLAG_VM_COUNTER_OFFSET,
					      &vcpu->kvm->arch.flags))
					timer_set_offset(timer, kvm_phys_timer_read() - value);
				break;
			}

			return 0;
		}

		static u64 read_timer_ctl(struct arch_timer_context *timer)
		{
		/*
		@@ -1180,31 +1128,6 @@ static u64 read_timer_ctl(struct arch_timer_context *timer)
		return ctl;
		}

		/*
		 * Read the timer register identified by @regid.
		 *
		 * The id selects both the timer context (vcpu_vtimer() for
		 * KVM_REG_ARM_TIMER_*, vcpu_ptimer() for KVM_REG_ARM_PTIMER_*) and the
		 * register (CTL, CNT or CVAL); the value comes from kvm_arm_timer_read().
		 *
		 * Returns (u64)-1 if @regid is not one of the six known ids.
		 */
		u64 kvm_arm_timer_get_reg(struct kvm_vcpu *vcpu, u64 regid)
		{
			struct arch_timer_context *timer;
			enum kvm_arch_timer_regs treg;

			switch (regid) {
			case KVM_REG_ARM_TIMER_CTL:
				timer = vcpu_vtimer(vcpu);
				treg = TIMER_REG_CTL;
				break;
			case KVM_REG_ARM_TIMER_CNT:
				timer = vcpu_vtimer(vcpu);
				treg = TIMER_REG_CNT;
				break;
			case KVM_REG_ARM_TIMER_CVAL:
				timer = vcpu_vtimer(vcpu);
				treg = TIMER_REG_CVAL;
				break;
			case KVM_REG_ARM_PTIMER_CTL:
				timer = vcpu_ptimer(vcpu);
				treg = TIMER_REG_CTL;
				break;
			case KVM_REG_ARM_PTIMER_CNT:
				timer = vcpu_ptimer(vcpu);
				treg = TIMER_REG_CNT;
				break;
			case KVM_REG_ARM_PTIMER_CVAL:
				timer = vcpu_ptimer(vcpu);
				treg = TIMER_REG_CVAL;
				break;
			default:
				return (u64)-1;
			}

			return kvm_arm_timer_read(vcpu, timer, treg);
		}

		static u64 kvm_arm_timer_read(struct kvm_vcpu *vcpu,
		struct arch_timer_context *timer,
		enum kvm_arch_timer_regs treg)