Commit ac354b5c authored by Linus Torvalds
Browse files
Pull kvm fixes from Paolo Bonzini:
 "s390:

   - Lots of small and not-so-small fixes for the newly rewritten gmap,
     mostly affecting the handling of nested guests.

  x86:

   - Fix an issue with shadow paging, which causes KVM to install an
     MMIO PTE in the shadow page tables without first zapping a non-MMIO
     SPTE if KVM didn't see the write that modified the shadowed guest
     PTE.

     While commit a54aa15c ("KVM: x86/mmu: Handle MMIO SPTEs
     directly in mmu_set_spte()") was right about it being impossible to
     miss such a write if it was coming from the guest, it failed to
     account for writes to guest memory that are outside the scope of
     KVM: if userspace modifies the guest PTE, and then the guest hits a
     relevant page fault, KVM will get confused"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm:
  KVM: x86/mmu: Only WARN in direct MMUs when overwriting shadow-present SPTE
  KVM: x86/mmu: Drop/zap existing present SPTE even when creating an MMIO SPTE
  KVM: s390: Fix KVM_S390_VCPU_FAULT ioctl
  KVM: s390: vsie: Fix guest page tables protection
  KVM: s390: vsie: Fix unshadowing while shadowing
  KVM: s390: vsie: Fix refcount overflow for shadow gmaps
  KVM: s390: vsie: Fix nested guest memory shadowing
  KVM: s390: Correctly handle guest mappings without struct page
  KVM: s390: Fix gmap_link()
  KVM: s390: vsie: Fix check for pre-existing shadow mapping
  KVM: s390: Remove non-atomic dat_crstep_xchg()
  KVM: s390: vsie: Fix dat_split_ste()
parents b8a3bc85 df837460
Loading
Loading
Loading
Loading
+15 −85
Original line number Diff line number Diff line
@@ -134,32 +134,6 @@ int dat_set_asce_limit(struct kvm_s390_mmu_cache *mc, union asce *asce, int newt
	return 0;
}

/**
 * dat_crstep_xchg() - Exchange a gmap CRSTE with another.
 * @crstep: Pointer to the CRST entry
 * @new: Replacement entry.
 * @gfn: The affected guest address.
 * @asce: The ASCE of the address space.
 *
 * The current contents of @crstep are read inside this function; the caller
 * does not supply an expected old value and nothing is returned, so the
 * caller cannot learn whether the entry changed between its own read and
 * this exchange.
 *
 * Context: This function is assumed to be called with kvm->mmu_lock held.
 */
void dat_crstep_xchg(union crste *crstep, union crste new, gfn_t gfn, union asce asce)
{
	if (crstep->h.i) {
		/*
		 * The entry is currently marked invalid: store the new value
		 * directly, without issuing any of the instructions below.
		 */
		WRITE_ONCE(*crstep, new);
		return;
	} else if (cpu_has_edat2()) {
		/*
		 * EDAT-2 is available: delegate to crdte_crste(), passing the
		 * entry's current contents as the old value. Any result of
		 * crdte_crste() is ignored here.
		 */
		crdte_crste(crstep, *crstep, new, gfn, asce);
		return;
	}

	/*
	 * Fallback: call idte_crste() first, with the guest ASCE if
	 * machine_has_tlb_guest() and with NULL_ASCE otherwise, and only then
	 * store the new value. These are two separate steps.
	 */
	if (machine_has_tlb_guest())
		idte_crste(crstep, gfn, IDTE_GUEST_ASCE, asce, IDTE_GLOBAL);
	else
		idte_crste(crstep, gfn, 0, NULL_ASCE, IDTE_GLOBAL);
	WRITE_ONCE(*crstep, new);
}

/**
 * dat_crstep_xchg_atomic() - Atomically exchange a gmap CRSTE with another.
 * @crstep: Pointer to the CRST entry.
@@ -175,8 +149,8 @@ void dat_crstep_xchg(union crste *crstep, union crste new, gfn_t gfn, union asce
 *
 * Return: %true if the exchange was successful.
 */
bool dat_crstep_xchg_atomic(union crste *crstep, union crste old, union crste new, gfn_t gfn,
			    union asce asce)
bool __must_check dat_crstep_xchg_atomic(union crste *crstep, union crste old, union crste new,
					 gfn_t gfn, union asce asce)
{
	if (old.h.i)
		return arch_try_cmpxchg((long *)crstep, &old.val, new.val);
@@ -292,6 +266,7 @@ static int dat_split_ste(struct kvm_s390_mmu_cache *mc, union pmd *pmdp, gfn_t g
				pt->ptes[i].val = init.val | i * PAGE_SIZE;
			/* No need to take locks as the page table is not installed yet. */
			pgste_init.prefix_notif = old.s.fc1.prefix_notif;
			pgste_init.vsie_notif = old.s.fc1.vsie_notif;
			pgste_init.pcl = uses_skeys && init.h.i;
			dat_init_pgstes(pt, pgste_init.val);
		} else {
@@ -893,7 +868,8 @@ static long _dat_slot_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct d

	/* This table entry needs to be updated. */
	if (walk->start <= gfn && walk->end >= next) {
		dat_crstep_xchg_atomic(crstep, crste, new_crste, gfn, walk->asce);
		if (!dat_crstep_xchg_atomic(crstep, crste, new_crste, gfn, walk->asce))
			return -EINVAL;
		/* A lower level table was present, needs to be freed. */
		if (!crste.h.fc && !crste.h.i) {
			if (is_pmd(crste))
@@ -1021,67 +997,21 @@ bool dat_test_age_gfn(union asce asce, gfn_t start, gfn_t end)
	return _dat_walk_gfn_range(start, end, asce, &test_age_ops, 0, NULL) > 0;
}

/**
 * dat_link() - Install a mapping for a resolved guest fault.
 * @mc: Cache passed to dat_entry_walk() for the walk with
 *      DAT_WALK_ALLOC_CONTINUE.
 * @asce: The ASCE of the address space.
 * @level: The table level requested for the walk.
 * @uses_skeys: Passed through to __dat_ptep_xchg() on the PTE path.
 * @f: The fault being resolved; f->gfn, f->pfn, f->writable,
 *     f->write_attempt, f->page and f->callback are read, and f->crstep /
 *     f->ptep are filled in by the walk.
 *
 * Return:
 * * %0 if the entry was installed,
 * * %-EINVAL or %-ENOMEM if returned by dat_entry_walk(), or %-EINVAL if the
 *   walk ended at a level above @level,
 * * %-EAGAIN if the walk failed in any other way, if the existing entry maps
 *   a different page, or if the atomic CRSTE exchange did not succeed.
 */
int dat_link(struct kvm_s390_mmu_cache *mc, union asce asce, int level,
	     bool uses_skeys, struct guest_fault *f)
{
	union crste oldval, newval;
	union pte newpte, oldpte;
	union pgste pgste;
	int rc = 0;

	rc = dat_entry_walk(mc, f->gfn, asce, DAT_WALK_ALLOC_CONTINUE, level, &f->crstep, &f->ptep);
	/* -EINVAL and -ENOMEM are passed through; any other failure becomes -EAGAIN. */
	if (rc == -EINVAL || rc == -ENOMEM)
		return rc;
	if (rc)
		return -EAGAIN;

	/* The walk must not stop at a level above the one requested. */
	if (WARN_ON_ONCE(unlikely(get_level(f->crstep, f->ptep) > level)))
		return -EINVAL;

	if (f->ptep) {
		/* PTE path: the entry is inspected and replaced under the PGSTE lock. */
		pgste = pgste_get_lock(f->ptep);
		oldpte = *f->ptep;
		/*
		 * The new PTE is dirty if this is a write attempt or the old
		 * PTE was already dirty. The last argument is !f->page
		 * (presumably the "special" flag for mappings without a
		 * struct page -- confirm against _pte()).
		 */
		newpte = _pte(f->pfn, f->writable, f->write_attempt | oldpte.s.d, !f->page);
		/* Carry over soft-dirty, then clear it in the local copy so it
		 * does not affect the comparison with _PTE_EMPTY below. */
		newpte.s.sd = oldpte.s.sd;
		oldpte.s.sd = 0;
		/* Only install over an empty entry or one that already maps the same frame. */
		if (oldpte.val == _PTE_EMPTY.val || oldpte.h.pfra == f->pfn) {
			pgste = __dat_ptep_xchg(f->ptep, pgste, newpte, f->gfn, asce, uses_skeys);
			if (f->callback)
				f->callback(f);
		} else {
			rc = -EAGAIN;
		}
		pgste_set_unlock(f->ptep, pgste);
	} else {
		/* CRSTE path: build a large (fc1) entry from a snapshot of the old one. */
		oldval = READ_ONCE(*f->crstep);
		newval = _crste_fc1(f->pfn, oldval.h.tt, f->writable,
				    f->write_attempt | oldval.s.fc1.d);
		newval.s.fc1.sd = oldval.s.fc1.sd;
		/* A non-empty entry pointing at a different origin is not overwritten. */
		if (oldval.val != _CRSTE_EMPTY(oldval.h.tt).val &&
		    crste_origin_large(oldval) != crste_origin_large(newval))
			return -EAGAIN;
		/* Single attempt: if the entry changed since the snapshot, give up with -EAGAIN. */
		if (!dat_crstep_xchg_atomic(f->crstep, oldval, newval, f->gfn, asce))
			return -EAGAIN;
		if (f->callback)
			f->callback(f);
	}

	return rc;
}

static long dat_set_pn_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
{
	union crste crste = READ_ONCE(*crstep);
	union crste newcrste, oldcrste;
	int *n = walk->priv;

	if (!crste.h.fc || crste.h.i || crste.h.p)
	do {
		oldcrste = READ_ONCE(*crstep);
		if (!oldcrste.h.fc || oldcrste.h.i || oldcrste.h.p)
			return 0;

		if (oldcrste.s.fc1.prefix_notif)
			break;
		newcrste = oldcrste;
		newcrste.s.fc1.prefix_notif = 1;
	} while (!dat_crstep_xchg_atomic(crstep, oldcrste, newcrste, gfn, walk->asce));
	*n = 2;
	if (crste.s.fc1.prefix_notif)
		return 0;
	crste.s.fc1.prefix_notif = 1;
	dat_crstep_xchg(crstep, crste, gfn, walk->asce);
	return 0;
}

+12 −11
Original line number Diff line number Diff line
@@ -160,14 +160,14 @@ union pmd {
			unsigned long              :44; /* HW */
			unsigned long              : 3; /* Unused */
			unsigned long              : 1; /* HW */
			unsigned long s            : 1; /* Special */
			unsigned long w            : 1; /* Writable soft-bit */
			unsigned long r            : 1; /* Readable soft-bit */
			unsigned long d            : 1; /* Dirty */
			unsigned long y            : 1; /* Young */
			unsigned long prefix_notif : 1; /* Guest prefix invalidation notification */
			unsigned long              : 3; /* HW */
			unsigned long prefix_notif : 1; /* Guest prefix invalidation notification */
			unsigned long vsie_notif   : 1; /* Referenced in a shadow table */
			unsigned long              : 1; /* Unused */
			unsigned long              : 4; /* HW */
			unsigned long sd           : 1; /* Soft-Dirty */
			unsigned long pr           : 1; /* Present */
@@ -183,14 +183,14 @@ union pud {
			unsigned long              :33; /* HW */
			unsigned long              :14; /* Unused */
			unsigned long              : 1; /* HW */
			unsigned long s            : 1; /* Special */
			unsigned long w            : 1; /* Writable soft-bit */
			unsigned long r            : 1; /* Readable soft-bit */
			unsigned long d            : 1; /* Dirty */
			unsigned long y            : 1; /* Young */
			unsigned long prefix_notif : 1; /* Guest prefix invalidation notification */
			unsigned long              : 3; /* HW */
			unsigned long prefix_notif : 1; /* Guest prefix invalidation notification */
			unsigned long vsie_notif   : 1; /* Referenced in a shadow table */
			unsigned long              : 1; /* Unused */
			unsigned long              : 4; /* HW */
			unsigned long sd           : 1; /* Soft-Dirty */
			unsigned long pr           : 1; /* Present */
@@ -254,14 +254,14 @@ union crste {
		struct {
			unsigned long              :47;
			unsigned long              : 1; /* HW (should be 0) */
			unsigned long s            : 1; /* Special */
			unsigned long w            : 1; /* Writable */
			unsigned long r            : 1; /* Readable */
			unsigned long d            : 1; /* Dirty */
			unsigned long y            : 1; /* Young */
			unsigned long prefix_notif : 1; /* Guest prefix invalidation notification */
			unsigned long              : 3; /* HW */
			unsigned long prefix_notif : 1; /* Guest prefix invalidation notification */
			unsigned long vsie_notif   : 1; /* Referenced in a shadow table */
			unsigned long              : 1;
			unsigned long              : 4; /* HW */
			unsigned long sd           : 1; /* Soft-Dirty */
			unsigned long pr           : 1; /* Present */
@@ -540,8 +540,6 @@ int dat_set_slot(struct kvm_s390_mmu_cache *mc, union asce asce, gfn_t start, gf
		 u16 type, u16 param);
int dat_set_prefix_notif_bit(union asce asce, gfn_t gfn);
bool dat_test_age_gfn(union asce asce, gfn_t start, gfn_t end);
int dat_link(struct kvm_s390_mmu_cache *mc, union asce asce, int level,
	     bool uses_skeys, struct guest_fault *f);

int dat_perform_essa(union asce asce, gfn_t gfn, int orc, union essa_state *state, bool *dirty);
long dat_reset_cmma(union asce asce, gfn_t start_gfn);
@@ -938,11 +936,14 @@ static inline bool dat_pudp_xchg_atomic(union pud *pudp, union pud old, union pu
	return dat_crstep_xchg_atomic(_CRSTEP(pudp), _CRSTE(old), _CRSTE(new), gfn, asce);
}

static inline void dat_crstep_clear(union crste *crstep, gfn_t gfn, union asce asce)
static inline union crste dat_crstep_clear_atomic(union crste *crstep, gfn_t gfn, union asce asce)
{
	union crste newcrste = _CRSTE_EMPTY(crstep->h.tt);
	union crste oldcrste, empty = _CRSTE_EMPTY(crstep->h.tt);

	dat_crstep_xchg(crstep, newcrste, gfn, asce);
	do {
		oldcrste = READ_ONCE(*crstep);
	} while (!dat_crstep_xchg_atomic(crstep, oldcrste, empty, gfn, asce));
	return oldcrste;
}

static inline int get_level(union crste *crstep, union pte *ptep)
+51 −20
Original line number Diff line number Diff line
@@ -1436,13 +1436,21 @@ static int _do_shadow_pte(struct gmap *sg, gpa_t raddr, union pte *ptep_h, union

	if (!pgste_get_trylock(ptep_h, &pgste))
		return -EAGAIN;
	newpte = _pte(f->pfn, f->writable, !p, 0);
	newpte.s.d |= ptep->s.d;
	newpte.s.sd |= ptep->s.sd;
	newpte.h.p &= ptep->h.p;
	newpte = _pte(f->pfn, f->writable, !p, ptep_h->s.s);
	newpte.s.d |= ptep_h->s.d;
	newpte.s.sd |= ptep_h->s.sd;
	newpte.h.p &= ptep_h->h.p;
	if (!newpte.h.p && !f->writable) {
		rc = -EOPNOTSUPP;
	} else {
		pgste = _gmap_ptep_xchg(sg->parent, ptep_h, newpte, pgste, f->gfn, false);
		pgste.vsie_notif = 1;
	}
	pgste_set_unlock(ptep_h, pgste);
	if (rc)
		return rc;
	if (!sg->parent)
		return -EAGAIN;

	newpte = _pte(f->pfn, 0, !p, 0);
	if (!pgste_get_trylock(ptep, &pgste))
@@ -1456,7 +1464,7 @@ static int _do_shadow_pte(struct gmap *sg, gpa_t raddr, union pte *ptep_h, union
static int _do_shadow_crste(struct gmap *sg, gpa_t raddr, union crste *host, union crste *table,
			    struct guest_fault *f, bool p)
{
	union crste newcrste;
	union crste newcrste, oldcrste;
	gfn_t gfn;
	int rc;

@@ -1469,16 +1477,28 @@ static int _do_shadow_crste(struct gmap *sg, gpa_t raddr, union crste *host, uni
	if (rc)
		return rc;

	newcrste = _crste_fc1(f->pfn, host->h.tt, f->writable, !p);
	newcrste.s.fc1.d |= host->s.fc1.d;
	newcrste.s.fc1.sd |= host->s.fc1.sd;
	newcrste.h.p &= host->h.p;
	do {
		/* _gmap_crstep_xchg_atomic() could have unshadowed this shadow gmap */
		if (!sg->parent)
			return -EAGAIN;
		oldcrste = READ_ONCE(*host);
		newcrste = _crste_fc1(f->pfn, oldcrste.h.tt, f->writable, !p);
		newcrste.s.fc1.d |= oldcrste.s.fc1.d;
		newcrste.s.fc1.sd |= oldcrste.s.fc1.sd;
		newcrste.h.p &= oldcrste.h.p;
		newcrste.s.fc1.vsie_notif = 1;
	newcrste.s.fc1.prefix_notif = host->s.fc1.prefix_notif;
	_gmap_crstep_xchg(sg->parent, host, newcrste, f->gfn, false);
		newcrste.s.fc1.prefix_notif = oldcrste.s.fc1.prefix_notif;
		newcrste.s.fc1.s = oldcrste.s.fc1.s;
		if (!newcrste.h.p && !f->writable)
			return -EOPNOTSUPP;
	} while (!_gmap_crstep_xchg_atomic(sg->parent, host, oldcrste, newcrste, f->gfn, false));
	if (!sg->parent)
		return -EAGAIN;

	newcrste = _crste_fc1(f->pfn, host->h.tt, 0, !p);
	dat_crstep_xchg(table, newcrste, gpa_to_gfn(raddr), sg->asce);
	newcrste = _crste_fc1(f->pfn, oldcrste.h.tt, 0, !p);
	gfn = gpa_to_gfn(raddr);
	while (!dat_crstep_xchg_atomic(table, READ_ONCE(*table), newcrste, gfn, sg->asce))
		;
	return 0;
}

@@ -1502,21 +1522,31 @@ static int _gaccess_do_shadow(struct kvm_s390_mmu_cache *mc, struct gmap *sg,
	if (rc)
		return rc;

	/* A race occourred. The shadow mapping is already valid, nothing to do */
	if ((ptep && !ptep->h.i) || (!ptep && crste_leaf(*table)))
	/* A race occurred. The shadow mapping is already valid, nothing to do */
	if ((ptep && !ptep->h.i && ptep->h.p == w->p) ||
	    (!ptep && crste_leaf(*table) && !table->h.i && table->h.p == w->p))
		return 0;

	gl = get_level(table, ptep);

	/* In case of a real address space */
	if (w->level <= LEVEL_MEM) {
		l = TABLE_TYPE_PAGE_TABLE;
		hl = TABLE_TYPE_REGION1;
		goto real_address_space;
	}

	/*
	 * Skip levels that are already protected. For each level, protect
	 * only the page containing the entry, not the whole table.
	 */
	for (i = gl ; i >= w->level; i--) {
		rc = gmap_protect_rmap(mc, sg, entries[i - 1].gfn, gpa_to_gfn(saddr),
				       entries[i - 1].pfn, i, entries[i - 1].writable);
		rc = gmap_protect_rmap(mc, sg, entries[i].gfn, gpa_to_gfn(saddr),
				       entries[i].pfn, i + 1, entries[i].writable);
		if (rc)
			return rc;
		if (!sg->parent)
			return -EAGAIN;
	}

	rc = dat_entry_walk(NULL, entries[LEVEL_MEM].gfn, sg->parent->asce, DAT_WALK_LEAF,
@@ -1528,6 +1558,7 @@ static int _gaccess_do_shadow(struct kvm_s390_mmu_cache *mc, struct gmap *sg,
	/* Get the smallest granularity */
	l = min3(gl, hl, w->level);

real_address_space:
	flags = DAT_WALK_SPLIT_ALLOC | (uses_skeys(sg->parent) ? DAT_WALK_USES_SKEYS : 0);
	/* If necessary, create the shadow mapping */
	if (l < gl) {
+114 −46
Original line number Diff line number Diff line
@@ -313,13 +313,16 @@ static long gmap_clear_young_crste(union crste *crstep, gfn_t gfn, gfn_t end, st
	struct clear_young_pte_priv *priv = walk->priv;
	union crste crste, new;

	do {
		crste = READ_ONCE(*crstep);

		if (!crste.h.fc)
			return 0;
		if (!crste.s.fc1.y && crste.h.i)
			return 0;
	if (!crste_prefix(crste) || gmap_mkold_prefix(priv->gmap, gfn, end)) {
		if (crste_prefix(crste) && !gmap_mkold_prefix(priv->gmap, gfn, end))
			break;

		new = crste;
		new.h.i = 1;
		new.s.fc1.y = 0;
@@ -328,8 +331,8 @@ static long gmap_clear_young_crste(union crste *crstep, gfn_t gfn, gfn_t end, st
			folio_set_dirty(phys_to_folio(crste_origin_large(crste)));
		new.s.fc1.d = 0;
		new.h.p = 1;
		dat_crstep_xchg(crstep, new, gfn, walk->asce);
	}
	} while (!dat_crstep_xchg_atomic(crstep, crste, new, gfn, walk->asce));

	priv->young = 1;
	return 0;
}
@@ -391,14 +394,18 @@ static long _gmap_unmap_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct
{
	struct gmap_unmap_priv *priv = walk->priv;
	struct folio *folio = NULL;
	union crste old = *crstep;

	if (crstep->h.fc) {
		if (crstep->s.fc1.pr && test_bit(GMAP_FLAG_EXPORT_ON_UNMAP, &priv->gmap->flags))
			folio = phys_to_folio(crste_origin_large(*crstep));
		gmap_crstep_xchg(priv->gmap, crstep, _CRSTE_EMPTY(crstep->h.tt), gfn);
	if (!old.h.fc)
		return 0;

	if (old.s.fc1.pr && test_bit(GMAP_FLAG_EXPORT_ON_UNMAP, &priv->gmap->flags))
		folio = phys_to_folio(crste_origin_large(old));
	/* No races should happen because kvm->mmu_lock is held in write mode */
	KVM_BUG_ON(!gmap_crstep_xchg_atomic(priv->gmap, crstep, old, _CRSTE_EMPTY(old.h.tt), gfn),
		   priv->gmap->kvm);
	if (folio)
		uv_convert_from_secure_folio(folio);
	}

	return 0;
}
@@ -474,6 +481,7 @@ static long _crste_test_and_clear_softdirty(union crste *table, gfn_t gfn, gfn_t

	if (fatal_signal_pending(current))
		return 1;
	do {
		crste = READ_ONCE(*table);
		if (!crste.h.fc)
			return 0;
@@ -485,12 +493,12 @@ static long _crste_test_and_clear_softdirty(union crste *table, gfn_t gfn, gfn_t
		 * currently running, do not reset the protection, leave it marked as
		 * dirty.
		 */
	if (!crste.s.fc1.prefix_notif || gmap_mkold_prefix(gmap, gfn, end)) {
		if (crste.s.fc1.prefix_notif && !gmap_mkold_prefix(gmap, gfn, end))
			break;
		new = crste;
		new.h.p = 1;
		new.s.fc1.sd = 0;
		gmap_crstep_xchg(gmap, table, new, gfn);
	}
	} while (!gmap_crstep_xchg_atomic(gmap, table, crste, new, gfn));

	for ( ; gfn < end; gfn++)
		mark_page_dirty(gmap->kvm, gfn);
@@ -511,7 +519,7 @@ void gmap_sync_dirty_log(struct gmap *gmap, gfn_t start, gfn_t end)
	_dat_walk_gfn_range(start, end, gmap->asce, &walk_ops, 0, gmap);
}

static int gmap_handle_minor_crste_fault(union asce asce, struct guest_fault *f)
static int gmap_handle_minor_crste_fault(struct gmap *gmap, struct guest_fault *f)
{
	union crste newcrste, oldcrste = READ_ONCE(*f->crstep);

@@ -536,10 +544,8 @@ static int gmap_handle_minor_crste_fault(union asce asce, struct guest_fault *f)
			newcrste.s.fc1.d = 1;
			newcrste.s.fc1.sd = 1;
		}
		if (!oldcrste.s.fc1.d && newcrste.s.fc1.d)
			SetPageDirty(phys_to_page(crste_origin_large(newcrste)));
		/* In case of races, let the slow path deal with it. */
		return !dat_crstep_xchg_atomic(f->crstep, oldcrste, newcrste, f->gfn, asce);
		return !gmap_crstep_xchg_atomic(gmap, f->crstep, oldcrste, newcrste, f->gfn);
	}
	/* Trying to write on a read-only page, let the slow path deal with it. */
	return 1;
@@ -568,8 +574,6 @@ static int _gmap_handle_minor_pte_fault(struct gmap *gmap, union pgste *pgste,
		newpte.s.d = 1;
		newpte.s.sd = 1;
	}
	if (!oldpte.s.d && newpte.s.d)
		SetPageDirty(pfn_to_page(newpte.h.pfra));
	*pgste = gmap_ptep_xchg(gmap, f->ptep, newpte, *pgste, f->gfn);

	return 0;
@@ -606,7 +610,7 @@ int gmap_try_fixup_minor(struct gmap *gmap, struct guest_fault *fault)
			fault->callback(fault);
		pgste_set_unlock(fault->ptep, pgste);
	} else {
		rc = gmap_handle_minor_crste_fault(gmap->asce, fault);
		rc = gmap_handle_minor_crste_fault(gmap, fault);
		if (!rc && fault->callback)
			fault->callback(fault);
	}
@@ -623,10 +627,61 @@ static inline bool gmap_1m_allowed(struct gmap *gmap, gfn_t gfn)
	return test_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &gmap->flags);
}

/**
 * _gmap_link() - Install a mapping for a resolved guest fault in a gmap.
 * @mc: Cache passed to dat_entry_walk() for the walk with
 *      DAT_WALK_ALLOC_CONTINUE.
 * @gmap: The gmap whose ASCE is walked and whose entry is replaced.
 * @level: The table level requested for the walk.
 * @f: The fault being resolved; f->gfn, f->pfn, f->writable,
 *     f->write_attempt, f->page and f->callback are read, and f->crstep /
 *     f->ptep are filled in by the walk.
 *
 * Entries are replaced through gmap_ptep_xchg() / gmap_crstep_xchg_atomic().
 *
 * Return:
 * * %0 if the entry was installed,
 * * %-ENOMEM if returned by dat_entry_walk(),
 * * %-EINVAL (together with KVM_BUG_ON()) if the walk returned -EINVAL or
 *   ended at a level above @level,
 * * %-EAGAIN if the walk failed in any other way or the existing entry maps
 *   a different page.
 */
static int _gmap_link(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, int level,
		      struct guest_fault *f)
{
	union crste oldval, newval;
	union pte newpte, oldpte;
	union pgste pgste;
	int rc = 0;

	rc = dat_entry_walk(mc, f->gfn, gmap->asce, DAT_WALK_ALLOC_CONTINUE, level,
			    &f->crstep, &f->ptep);
	if (rc == -ENOMEM)
		return rc;
	/* -EINVAL from the walk is treated as a KVM bug and returned as is. */
	if (KVM_BUG_ON(rc == -EINVAL, gmap->kvm))
		return rc;
	/* Any other walk failure is reported as -EAGAIN. */
	if (rc)
		return -EAGAIN;
	/* The walk must not stop at a level above the one requested. */
	if (KVM_BUG_ON(get_level(f->crstep, f->ptep) > level, gmap->kvm))
		return -EINVAL;

	if (f->ptep) {
		/* PTE path: the entry is inspected and replaced under the PGSTE lock. */
		pgste = pgste_get_lock(f->ptep);
		oldpte = *f->ptep;
		/*
		 * The new PTE is dirty if this is a write attempt or the old
		 * PTE was already dirty. The last argument is !f->page
		 * (presumably the "special" flag for mappings without a
		 * struct page -- confirm against _pte()).
		 */
		newpte = _pte(f->pfn, f->writable, f->write_attempt | oldpte.s.d, !f->page);
		/* Carry over soft-dirty, then clear it in the local copy so it
		 * does not affect the comparison with _PTE_EMPTY below. */
		newpte.s.sd = oldpte.s.sd;
		oldpte.s.sd = 0;
		/* Only install over an empty entry or one that already maps the same frame. */
		if (oldpte.val == _PTE_EMPTY.val || oldpte.h.pfra == f->pfn) {
			pgste = gmap_ptep_xchg(gmap, f->ptep, newpte, pgste, f->gfn);
			if (f->callback)
				f->callback(f);
		} else {
			rc = -EAGAIN;
		}
		pgste_set_unlock(f->ptep, pgste);
	} else {
		/*
		 * CRSTE path: rebuild the new entry from a fresh snapshot on
		 * every iteration and retry until the atomic exchange succeeds.
		 */
		do {
			oldval = READ_ONCE(*f->crstep);
			newval = _crste_fc1(f->pfn, oldval.h.tt, f->writable,
					    f->write_attempt | oldval.s.fc1.d);
			/* The s bit is set when the fault has no f->page. */
			newval.s.fc1.s = !f->page;
			newval.s.fc1.sd = oldval.s.fc1.sd;
			/* A non-empty entry pointing at a different origin is not overwritten. */
			if (oldval.val != _CRSTE_EMPTY(oldval.h.tt).val &&
			    crste_origin_large(oldval) != crste_origin_large(newval))
				return -EAGAIN;
		} while (!gmap_crstep_xchg_atomic(gmap, f->crstep, oldval, newval, f->gfn));
		if (f->callback)
			f->callback(f);
	}

	return rc;
}

int gmap_link(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, struct guest_fault *f)
{
	unsigned int order;
	int rc, level;
	int level;

	lockdep_assert_held(&gmap->kvm->mmu_lock);

@@ -638,16 +693,14 @@ int gmap_link(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, struct guest_fau
		else if (order >= get_order(_SEGMENT_SIZE) && gmap_1m_allowed(gmap, f->gfn))
			level = TABLE_TYPE_SEGMENT;
	}
	rc = dat_link(mc, gmap->asce, level, uses_skeys(gmap), f);
	KVM_BUG_ON(rc == -EINVAL, gmap->kvm);
	return rc;
	return _gmap_link(mc, gmap, level, f);
}

static int gmap_ucas_map_one(struct kvm_s390_mmu_cache *mc, struct gmap *gmap,
			     gfn_t p_gfn, gfn_t c_gfn, bool force_alloc)
{
	union crste newcrste, oldcrste;
	struct page_table *pt;
	union crste newcrste;
	union crste *crstep;
	union pte *ptep;
	int rc;
@@ -673,7 +726,11 @@ static int gmap_ucas_map_one(struct kvm_s390_mmu_cache *mc, struct gmap *gmap,
			    &crstep, &ptep);
	if (rc)
		return rc;
	dat_crstep_xchg(crstep, newcrste, c_gfn, gmap->asce);
	do {
		oldcrste = READ_ONCE(*crstep);
		if (oldcrste.val == newcrste.val)
			break;
	} while (!dat_crstep_xchg_atomic(crstep, oldcrste, newcrste, c_gfn, gmap->asce));
	return 0;
}

@@ -777,8 +834,10 @@ static void gmap_ucas_unmap_one(struct gmap *gmap, gfn_t c_gfn)
	int rc;

	rc = dat_entry_walk(NULL, c_gfn, gmap->asce, 0, TABLE_TYPE_SEGMENT, &crstep, &ptep);
	if (!rc)
		dat_crstep_xchg(crstep, _PMD_EMPTY, c_gfn, gmap->asce);
	if (rc)
		return;
	while (!dat_crstep_xchg_atomic(crstep, READ_ONCE(*crstep), _PMD_EMPTY, c_gfn, gmap->asce))
		;
}

void gmap_ucas_unmap(struct gmap *gmap, gfn_t c_gfn, unsigned long count)
@@ -1017,8 +1076,8 @@ static void gmap_unshadow_level(struct gmap *sg, gfn_t r_gfn, int level)
			dat_ptep_xchg(ptep, _PTE_EMPTY, r_gfn, sg->asce, uses_skeys(sg));
		return;
	}
	crste = READ_ONCE(*crstep);
	dat_crstep_clear(crstep, r_gfn, sg->asce);

	crste = dat_crstep_clear_atomic(crstep, r_gfn, sg->asce);
	if (crste_leaf(crste) || crste.h.i)
		return;
	if (is_pmd(crste))
@@ -1101,6 +1160,7 @@ struct gmap_protect_asce_top_level {
static inline int __gmap_protect_asce_top_level(struct kvm_s390_mmu_cache *mc, struct gmap *sg,
						struct gmap_protect_asce_top_level *context)
{
	struct gmap *parent;
	int rc, i;

	guard(write_lock)(&sg->kvm->mmu_lock);
@@ -1108,7 +1168,12 @@ static inline int __gmap_protect_asce_top_level(struct kvm_s390_mmu_cache *mc, s
	if (kvm_s390_array_needs_retry_safe(sg->kvm, context->seq, context->f))
		return -EAGAIN;

	scoped_guard(spinlock, &sg->parent->children_lock) {
	parent = READ_ONCE(sg->parent);
	if (!parent)
		return -EAGAIN;
	scoped_guard(spinlock, &parent->children_lock) {
		if (READ_ONCE(sg->parent) != parent)
			return -EAGAIN;
		for (i = 0; i < CRST_TABLE_PAGES; i++) {
			if (!context->f[i].valid)
				continue;
@@ -1191,6 +1256,9 @@ struct gmap *gmap_create_shadow(struct kvm_s390_mmu_cache *mc, struct gmap *pare
	struct gmap *sg, *new;
	int rc;

	if (WARN_ON(!parent))
		return ERR_PTR(-EINVAL);

	scoped_guard(spinlock, &parent->children_lock) {
		sg = gmap_find_shadow(parent, asce, edat_level);
		if (sg) {
+21 −12
Original line number Diff line number Diff line
@@ -185,6 +185,8 @@ static inline union pgste _gmap_ptep_xchg(struct gmap *gmap, union pte *ptep, un
		else
			_gmap_handle_vsie_unshadow_event(gmap, gfn);
	}
	if (!ptep->s.d && newpte.s.d && !newpte.s.s)
		SetPageDirty(pfn_to_page(newpte.h.pfra));
	return __dat_ptep_xchg(ptep, pgste, newpte, gfn, gmap->asce, uses_skeys(gmap));
}

@@ -194,35 +196,42 @@ static inline union pgste gmap_ptep_xchg(struct gmap *gmap, union pte *ptep, uni
	return _gmap_ptep_xchg(gmap, ptep, newpte, pgste, gfn, true);
}

static inline void _gmap_crstep_xchg(struct gmap *gmap, union crste *crstep, union crste ne,
static inline bool __must_check _gmap_crstep_xchg_atomic(struct gmap *gmap, union crste *crstep,
							 union crste oldcrste, union crste newcrste,
							 gfn_t gfn, bool needs_lock)
{
	unsigned long align = 8 + (is_pmd(*crstep) ? 0 : 11);
	unsigned long align = is_pmd(newcrste) ? _PAGE_ENTRIES : _PAGE_ENTRIES * _CRST_ENTRIES;

	if (KVM_BUG_ON(crstep->h.tt != oldcrste.h.tt || newcrste.h.tt != oldcrste.h.tt, gmap->kvm))
		return true;

	lockdep_assert_held(&gmap->kvm->mmu_lock);
	if (!needs_lock)
		lockdep_assert_held(&gmap->children_lock);

	gfn = ALIGN_DOWN(gfn, align);
	if (crste_prefix(*crstep) && (ne.h.p || ne.h.i || !crste_prefix(ne))) {
		ne.s.fc1.prefix_notif = 0;
	if (crste_prefix(oldcrste) && (newcrste.h.p || newcrste.h.i || !crste_prefix(newcrste))) {
		newcrste.s.fc1.prefix_notif = 0;
		gmap_unmap_prefix(gmap, gfn, gfn + align);
	}
	if (crste_leaf(*crstep) && crstep->s.fc1.vsie_notif &&
	    (ne.h.p || ne.h.i || !ne.s.fc1.vsie_notif)) {
		ne.s.fc1.vsie_notif = 0;
	if (crste_leaf(oldcrste) && oldcrste.s.fc1.vsie_notif &&
	    (newcrste.h.p || newcrste.h.i || !newcrste.s.fc1.vsie_notif)) {
		newcrste.s.fc1.vsie_notif = 0;
		if (needs_lock)
			gmap_handle_vsie_unshadow_event(gmap, gfn);
		else
			_gmap_handle_vsie_unshadow_event(gmap, gfn);
	}
	dat_crstep_xchg(crstep, ne, gfn, gmap->asce);
	if (!oldcrste.s.fc1.d && newcrste.s.fc1.d && !newcrste.s.fc1.s)
		SetPageDirty(phys_to_page(crste_origin_large(newcrste)));
	return dat_crstep_xchg_atomic(crstep, oldcrste, newcrste, gfn, gmap->asce);
}

static inline void gmap_crstep_xchg(struct gmap *gmap, union crste *crstep, union crste ne,
static inline bool __must_check gmap_crstep_xchg_atomic(struct gmap *gmap, union crste *crstep,
							union crste oldcrste, union crste newcrste,
							gfn_t gfn)
{
	return _gmap_crstep_xchg(gmap, crstep, ne, gfn, true);
	return _gmap_crstep_xchg_atomic(gmap, crstep, oldcrste, newcrste, gfn, true);
}

/**
Loading