Commit 0bb933a9 authored by Linus Torvalds
Browse files
Pull x86 kvm fixes from Paolo Bonzini:

 - Avoid freeing stack-allocated node in kvm_async_pf_queue_task

 - Clear XSTATE_BV[i] in guest XSAVE state whenever XFD[i]=1

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm:
  selftests: kvm: Verify TILELOADD actually #NM faults when XFD[18]=1
  selftests: kvm: try getting XFD and XSAVE state out of sync
  selftests: kvm: replace numbered sync points with actions
  x86/fpu: Clear XSTATE_BV[i] in guest XSAVE state whenever XFD[i]=1
  x86/kvm: Avoid freeing stack-allocated node in kvm_async_pf_queue_task
parents afd12f91 3611ca7c
Loading
Loading
Loading
Loading
+29 −3
Original line number Diff line number Diff line
@@ -319,10 +319,29 @@ EXPORT_SYMBOL_FOR_KVM(fpu_enable_guest_xfd_features);
#ifdef CONFIG_X86_64
/*
 * Install a new XFD value for a guest FPU.
 *
 * @guest_fpu:	guest FPU container whose fpstate is updated
 * @xfd:	new XFD value; bit i set means XSAVE component i is disabled
 *
 * The new value is recorded in the guest fpstate and, if that fpstate is
 * currently in use, propagated via xfd_update_state().  Everything is done
 * between fpregs_lock() and fpregs_unlock().
 */
void fpu_update_guest_xfd(struct fpu_guest *guest_fpu, u64 xfd)
{
	struct fpstate *fpstate = guest_fpu->fpstate;

	fpregs_lock();

	/*
	 * KVM's guest ABI is that setting XFD[i]=1 *can* immediately revert the
	 * save state to its initial configuration.  Likewise, KVM_GET_XSAVE does
	 * the same as XSAVE and returns XSTATE_BV[i]=0 whenever XFD[i]=1.
	 *
	 * If the guest's FPU state is in hardware, just update XFD: the XSAVE
	 * in fpu_swap_kvm_fpstate will clear XSTATE_BV[i] whenever XFD[i]=1.
	 *
	 * If however the guest's FPU state is NOT resident in hardware, clear
	 * disabled components in XSTATE_BV now, or a subsequent XRSTOR will
	 * attempt to load disabled components and generate #NM _in the host_.
	 */
	if (xfd && test_thread_flag(TIF_NEED_FPU_LOAD))
		fpstate->regs.xsave.header.xfeatures &= ~xfd;

	/* Record the new value once, then sync it if this fpstate is live. */
	fpstate->xfd = xfd;
	if (fpstate->in_use)
		xfd_update_state(fpstate);

	fpregs_unlock();
}
EXPORT_SYMBOL_FOR_KVM(fpu_update_guest_xfd);
@@ -430,6 +449,13 @@ int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf,
	if (ustate->xsave.header.xfeatures & ~xcr0)
		return -EINVAL;

	/*
	 * Disabled features must be in their initial state, otherwise XRSTOR
	 * causes an exception.
	 */
	if (WARN_ON_ONCE(ustate->xsave.header.xfeatures & kstate->xfd))
		return -EINVAL;

	/*
	 * Nullify @vpkru to preserve its current value if PKRU's bit isn't set
	 * in the header.  KVM's odd ABI is to leave PKRU untouched in this
+16 −3
Original line number Diff line number Diff line
@@ -89,6 +89,7 @@ struct kvm_task_sleep_node {
	struct swait_queue_head wq;
	u32 token;
	int cpu;
	bool dummy;
};

static struct kvm_task_sleep_head {
@@ -120,15 +121,26 @@ static bool kvm_async_pf_queue_task(u32 token, struct kvm_task_sleep_node *n)
	raw_spin_lock(&b->lock);
	e = _find_apf_task(b, token);
	if (e) {
		/* dummy entry exist -> wake up was delivered ahead of PF */
		struct kvm_task_sleep_node *dummy = NULL;

		/*
		 * The entry can either be a 'dummy' entry (which is put on the
		 * list when wake-up happens ahead of APF handling completion)
		 * or a token from another task which should not be touched.
		 */
		if (e->dummy) {
			hlist_del(&e->link);
			dummy = e;
		}

		raw_spin_unlock(&b->lock);
		kfree(e);
		kfree(dummy);
		return false;
	}

	n->token = token;
	n->cpu = smp_processor_id();
	n->dummy = false;
	init_swait_queue_head(&n->wq);
	hlist_add_head(&n->link, &b->list);
	raw_spin_unlock(&b->lock);
@@ -231,6 +243,7 @@ static void kvm_async_pf_task_wake(u32 token)
		}
		dummy->token = token;
		dummy->cpu = smp_processor_id();
		dummy->dummy = true;
		init_swait_queue_head(&dummy->wq);
		hlist_add_head(&dummy->link, &b->list);
		dummy = NULL;
+9 −0
Original line number Diff line number Diff line
@@ -5807,9 +5807,18 @@ static int kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
					struct kvm_xsave *guest_xsave)
{
	union fpregs_state *xstate = (union fpregs_state *)guest_xsave->region;

	if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
		return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0;

	/*
	 * For backwards compatibility, do not expect disabled features to be in
	 * their initial state.  XSTATE_BV[i] must still be cleared whenever
	 * XFD[i]=1, or XRSTOR would cause a #NM.
	 */
	xstate->xsave.header.xfeatures &= ~vcpu->arch.guest_fpu.fpstate->xfd;

	return fpu_copy_uabi_to_guest_fpstate(&vcpu->arch.guest_fpu,
					      guest_xsave->region,
					      kvm_caps.supported_xcr0,
+85 −59
Original line number Diff line number Diff line
@@ -69,6 +69,12 @@ static inline void __tileloadd(void *tile)
		     : : "a"(tile), "d"(0));
}

/*
 * Run the hand-encoded TILELOADD instruction through kvm_asm_safe(), with
 * @tile bound to the "a" register constraint and 0 to "d", and hand back
 * whatever kvm_asm_safe() evaluates to (callers compare it against an
 * exception vector such as NM_VECTOR).
 */
static inline int tileloadd_safe(void *tile)
{
	int vector;

	vector = kvm_asm_safe(".byte 0xc4,0xe2,0x7b,0x4b,0x04,0x10",
			      "a"(tile), "d"(0));

	return vector;
}

static inline void __tilerelease(void)
{
	asm volatile(".byte 0xc4, 0xe2, 0x78, 0x49, 0xc0" ::);
@@ -124,27 +130,52 @@ static void set_tilecfg(struct tile_config *cfg)
	}
}

/*
 * Actions the guest requests from the host through GUEST_SYNC().  Each action
 * is a distinct bit so that several can be OR'd into a single sync point; the
 * host tests every bit independently.
 */
enum {
	/* Fetch TMM0 from the guest and keep it for TEST_RESTORE_TILEDATA */
	TEST_SAVE_TILEDATA	= 1 << 0,

	/* Compare TMM0 with the tiledata buffer */
	TEST_COMPARE_TILEDATA	= 1 << 1,

	/* Write the previously saved TMM0 back */
	TEST_RESTORE_TILEDATA	= 1 << 2,

	/* Save and restore the complete VM state */
	TEST_SAVE_RESTORE	= 1 << 3,
};

static void __attribute__((__flatten__)) guest_code(struct tile_config *amx_cfg,
						    struct tile_data *tiledata,
						    struct xstate *xstate)
{
	int vector;

	GUEST_ASSERT(this_cpu_has(X86_FEATURE_XSAVE) &&
		     this_cpu_has(X86_FEATURE_OSXSAVE));
	check_xtile_info();
	GUEST_SYNC(1);
	GUEST_SYNC(TEST_SAVE_RESTORE);

	/* xfd=0, enable amx */
	wrmsr(MSR_IA32_XFD, 0);
	GUEST_SYNC(2);
	GUEST_SYNC(TEST_SAVE_RESTORE);
	GUEST_ASSERT(rdmsr(MSR_IA32_XFD) == 0);
	set_tilecfg(amx_cfg);
	__ldtilecfg(amx_cfg);
	GUEST_SYNC(3);
	GUEST_SYNC(TEST_SAVE_RESTORE);
	/* Check save/restore when trap to userspace */
	__tileloadd(tiledata);
	GUEST_SYNC(4);
	GUEST_SYNC(TEST_SAVE_TILEDATA | TEST_COMPARE_TILEDATA | TEST_SAVE_RESTORE);

	/* xfd=0x40000, disable amx tiledata */
	wrmsr(MSR_IA32_XFD, XFEATURE_MASK_XTILE_DATA);

	/* host tries setting tiledata while guest XFD is set */
	GUEST_SYNC(TEST_RESTORE_TILEDATA);
	GUEST_SYNC(TEST_SAVE_RESTORE);

	wrmsr(MSR_IA32_XFD, 0);
	__tilerelease();
	GUEST_SYNC(5);
	GUEST_SYNC(TEST_SAVE_RESTORE);
	/*
	 * After XSAVEC, XTILEDATA is cleared in the xstate_bv but is set in
	 * the xcomp_bv.
@@ -154,6 +185,8 @@ static void __attribute__((__flatten__)) guest_code(struct tile_config *amx_cfg,
	GUEST_ASSERT(!(xstate->header.xstate_bv & XFEATURE_MASK_XTILE_DATA));
	GUEST_ASSERT(xstate->header.xcomp_bv & XFEATURE_MASK_XTILE_DATA);

	/* #NM test */

	/* xfd=0x40000, disable amx tiledata */
	wrmsr(MSR_IA32_XFD, XFEATURE_MASK_XTILE_DATA);

@@ -166,32 +199,33 @@ static void __attribute__((__flatten__)) guest_code(struct tile_config *amx_cfg,
	GUEST_ASSERT(!(xstate->header.xstate_bv & XFEATURE_MASK_XTILE_DATA));
	GUEST_ASSERT((xstate->header.xcomp_bv & XFEATURE_MASK_XTILE_DATA));

	GUEST_SYNC(6);
	GUEST_SYNC(TEST_SAVE_RESTORE);
	GUEST_ASSERT(rdmsr(MSR_IA32_XFD) == XFEATURE_MASK_XTILE_DATA);
	set_tilecfg(amx_cfg);
	__ldtilecfg(amx_cfg);
	/* Trigger #NM exception */
	__tileloadd(tiledata);
	GUEST_SYNC(10);

	GUEST_DONE();
}
	/* Trigger #NM exception */
	vector = tileloadd_safe(tiledata);
	__GUEST_ASSERT(vector == NM_VECTOR,
		       "Wanted #NM on tileloadd with XFD[18]=1, got %s",
		       ex_str(vector));

void guest_nm_handler(struct ex_regs *regs)
{
	/* Check if #NM is triggered by XFEATURE_MASK_XTILE_DATA */
	GUEST_SYNC(7);
	GUEST_ASSERT(!(get_cr0() & X86_CR0_TS));
	GUEST_ASSERT(rdmsr(MSR_IA32_XFD_ERR) == XFEATURE_MASK_XTILE_DATA);
	GUEST_ASSERT(rdmsr(MSR_IA32_XFD) == XFEATURE_MASK_XTILE_DATA);
	GUEST_SYNC(8);
	GUEST_SYNC(TEST_SAVE_RESTORE);
	GUEST_ASSERT(rdmsr(MSR_IA32_XFD_ERR) == XFEATURE_MASK_XTILE_DATA);
	GUEST_ASSERT(rdmsr(MSR_IA32_XFD) == XFEATURE_MASK_XTILE_DATA);
	/* Clear xfd_err */
	wrmsr(MSR_IA32_XFD_ERR, 0);
	/* xfd=0, enable amx */
	wrmsr(MSR_IA32_XFD, 0);
	GUEST_SYNC(9);
	GUEST_SYNC(TEST_SAVE_RESTORE);

	__tileloadd(tiledata);
	GUEST_SYNC(TEST_COMPARE_TILEDATA | TEST_SAVE_RESTORE);

	GUEST_DONE();
}

int main(int argc, char *argv[])
@@ -200,10 +234,10 @@ int main(int argc, char *argv[])
	struct kvm_vcpu *vcpu;
	struct kvm_vm *vm;
	struct kvm_x86_state *state;
	struct kvm_x86_state *tile_state = NULL;
	int xsave_restore_size;
	vm_vaddr_t amx_cfg, tiledata, xstate;
	struct ucall uc;
	u32 amx_offset;
	int ret;

	/*
@@ -228,9 +262,6 @@ int main(int argc, char *argv[])

	vcpu_regs_get(vcpu, &regs1);

	/* Register #NM handler */
	vm_install_exception_handler(vm, NM_VECTOR, guest_nm_handler);

	/* amx cfg for guest_code */
	amx_cfg = vm_vaddr_alloc_page(vm);
	memset(addr_gva2hva(vm, amx_cfg), 0x0, getpagesize());
@@ -244,6 +275,7 @@ int main(int argc, char *argv[])
	memset(addr_gva2hva(vm, xstate), 0, PAGE_SIZE * DIV_ROUND_UP(XSAVE_SIZE, PAGE_SIZE));
	vcpu_args_set(vcpu, 3, amx_cfg, tiledata, xstate);

	int iter = 0;
	for (;;) {
		vcpu_run(vcpu);
		TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
@@ -253,46 +285,31 @@ int main(int argc, char *argv[])
			REPORT_GUEST_ASSERT(uc);
			/* NOT REACHED */
		case UCALL_SYNC:
			switch (uc.args[1]) {
			case 1:
			case 2:
			case 3:
			case 5:
			case 6:
			case 7:
			case 8:
				fprintf(stderr, "GUEST_SYNC(%ld)\n", uc.args[1]);
				break;
			case 4:
			case 10:
				fprintf(stderr,
				"GUEST_SYNC(%ld), check save/restore status\n", uc.args[1]);
			++iter;
			if (uc.args[1] & TEST_SAVE_TILEDATA) {
				fprintf(stderr, "GUEST_SYNC #%d, save tiledata\n", iter);
				tile_state = vcpu_save_state(vcpu);
			}
			if (uc.args[1] & TEST_COMPARE_TILEDATA) {
				fprintf(stderr, "GUEST_SYNC #%d, check TMM0 contents\n", iter);

				/* Compacted mode: the AMX offset is the XSAVE
				 * area size minus the 8K AMX tile data size.
				 */
				amx_offset = xsave_restore_size - NUM_TILES*TILE_SIZE;
				state = vcpu_save_state(vcpu);
				void *amx_start = (void *)state->xsave + amx_offset;
				u32 amx_offset = xsave_restore_size - NUM_TILES*TILE_SIZE;
				void *amx_start = (void *)tile_state->xsave + amx_offset;
				void *tiles_data = (void *)addr_gva2hva(vm, tiledata);
				/* Only check TMM0 register, 1 tile */
				ret = memcmp(amx_start, tiles_data, TILE_SIZE);
				TEST_ASSERT(ret == 0, "memcmp failed, ret=%d", ret);
				kvm_x86_state_cleanup(state);
				break;
			case 9:
				fprintf(stderr,
				"GUEST_SYNC(%ld), #NM exception and enable amx\n", uc.args[1]);
				break;
			}
			break;
		case UCALL_DONE:
			fprintf(stderr, "UCALL_DONE\n");
			goto done;
		default:
			TEST_FAIL("Unknown ucall %lu", uc.cmd);
			if (uc.args[1] & TEST_RESTORE_TILEDATA) {
				fprintf(stderr, "GUEST_SYNC #%d, before KVM_SET_XSAVE\n", iter);
				vcpu_xsave_set(vcpu, tile_state->xsave);
				fprintf(stderr, "GUEST_SYNC #%d, after KVM_SET_XSAVE\n", iter);
			}

			if (uc.args[1] & TEST_SAVE_RESTORE) {
				fprintf(stderr, "GUEST_SYNC #%d, save/restore VM state\n", iter);
				state = vcpu_save_state(vcpu);
				memset(&regs1, 0, sizeof(regs1));
				vcpu_regs_get(vcpu, &regs1);
@@ -310,6 +327,15 @@ int main(int argc, char *argv[])
					    "Unexpected register values after vcpu_load_state; rdi: %lx rsi: %lx",
					    (ulong) regs2.rdi, (ulong) regs2.rsi);
			}
			break;
		case UCALL_DONE:
			fprintf(stderr, "UCALL_DONE\n");
			goto done;
		default:
			TEST_FAIL("Unknown ucall %lu", uc.cmd);
		}

	}
done:
	kvm_vm_free(vm);
}