Commit b827ef02 authored by Claudio Imbrenda's avatar Claudio Imbrenda
Browse files

KVM: s390: Remove non-atomic dat_crstep_xchg()



In practice dat_crstep_xchg() is racy and hard to use correctly. Simply
remove it and replace its uses with dat_crstep_xchg_atomic().

This solves some actual races that lead to system hangs / crashes.

Opportunistically fix an alignment issue in _gmap_crstep_xchg_atomic().

Fixes: 589071ea ("KVM: s390: KVM page table management functions: clear and replace")
Fixes: 94fd9b16 ("KVM: s390: KVM page table management functions: lifecycle management")
Reviewed-by: default avatarSteffen Eiden <seiden@linux.ibm.com>
Signed-off-by: default avatarClaudio Imbrenda <imbrenda@linux.ibm.com>
parent 0f547553
Loading
Loading
Loading
Loading
+14 −37
Original line number Diff line number Diff line
@@ -134,32 +134,6 @@ int dat_set_asce_limit(struct kvm_s390_mmu_cache *mc, union asce *asce, int newt
	return 0;
}

/**
 * dat_crstep_xchg() - Exchange a gmap CRSTE with another.
 * @crstep: Pointer to the CRST entry
 * @new: Replacement entry.
 * @gfn: The affected guest address.
 * @asce: The ASCE of the address space.
 *
 * Context: This function is assumed to be called with kvm->mmu_lock held.
 */
void dat_crstep_xchg(union crste *crstep, union crste new, gfn_t gfn, union asce asce)
{
	if (crstep->h.i) {
		WRITE_ONCE(*crstep, new);
		return;
	} else if (cpu_has_edat2()) {
		crdte_crste(crstep, *crstep, new, gfn, asce);
		return;
	}

	if (machine_has_tlb_guest())
		idte_crste(crstep, gfn, IDTE_GUEST_ASCE, asce, IDTE_GLOBAL);
	else
		idte_crste(crstep, gfn, 0, NULL_ASCE, IDTE_GLOBAL);
	WRITE_ONCE(*crstep, new);
}

/**
 * dat_crstep_xchg_atomic() - Atomically exchange a gmap CRSTE with another.
 * @crstep: Pointer to the CRST entry.
@@ -175,8 +149,8 @@ void dat_crstep_xchg(union crste *crstep, union crste new, gfn_t gfn, union asce
 *
 * Return: %true if the exchange was successful.
 */
bool dat_crstep_xchg_atomic(union crste *crstep, union crste old, union crste new, gfn_t gfn,
			    union asce asce)
bool __must_check dat_crstep_xchg_atomic(union crste *crstep, union crste old, union crste new,
					 gfn_t gfn, union asce asce)
{
	if (old.h.i)
		return arch_try_cmpxchg((long *)crstep, &old.val, new.val);
@@ -894,7 +868,8 @@ static long _dat_slot_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct d

	/* This table entry needs to be updated. */
	if (walk->start <= gfn && walk->end >= next) {
		dat_crstep_xchg_atomic(crstep, crste, new_crste, gfn, walk->asce);
		if (!dat_crstep_xchg_atomic(crstep, crste, new_crste, gfn, walk->asce))
			return -EINVAL;
		/* A lower level table was present, needs to be freed. */
		if (!crste.h.fc && !crste.h.i) {
			if (is_pmd(crste))
@@ -1072,17 +1047,19 @@ int dat_link(struct kvm_s390_mmu_cache *mc, union asce asce, int level,

static long dat_set_pn_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
{
	union crste crste = READ_ONCE(*crstep);
	union crste newcrste, oldcrste;
	int *n = walk->priv;

	if (!crste.h.fc || crste.h.i || crste.h.p)
	do {
		oldcrste = READ_ONCE(*crstep);
		if (!oldcrste.h.fc || oldcrste.h.i || oldcrste.h.p)
			return 0;

		if (oldcrste.s.fc1.prefix_notif)
			break;
		newcrste = oldcrste;
		newcrste.s.fc1.prefix_notif = 1;
	} while (!dat_crstep_xchg_atomic(crstep, oldcrste, newcrste, gfn, walk->asce));
	*n = 2;
	if (crste.s.fc1.prefix_notif)
		return 0;
	crste.s.fc1.prefix_notif = 1;
	dat_crstep_xchg(crstep, crste, gfn, walk->asce);
	return 0;
}

+6 −3
Original line number Diff line number Diff line
@@ -938,11 +938,14 @@ static inline bool dat_pudp_xchg_atomic(union pud *pudp, union pud old, union pu
	return dat_crstep_xchg_atomic(_CRSTEP(pudp), _CRSTE(old), _CRSTE(new), gfn, asce);
}

static inline void dat_crstep_clear(union crste *crstep, gfn_t gfn, union asce asce)
static inline union crste dat_crstep_clear_atomic(union crste *crstep, gfn_t gfn, union asce asce)
{
	union crste newcrste = _CRSTE_EMPTY(crstep->h.tt);
	union crste oldcrste, empty = _CRSTE_EMPTY(crstep->h.tt);

	dat_crstep_xchg(crstep, newcrste, gfn, asce);
	do {
		oldcrste = READ_ONCE(*crstep);
	} while (!dat_crstep_xchg_atomic(crstep, oldcrste, empty, gfn, asce));
	return oldcrste;
}

static inline int get_level(union crste *crstep, union pte *ptep)
+15 −11
Original line number Diff line number Diff line
@@ -1456,7 +1456,7 @@ static int _do_shadow_pte(struct gmap *sg, gpa_t raddr, union pte *ptep_h, union
static int _do_shadow_crste(struct gmap *sg, gpa_t raddr, union crste *host, union crste *table,
			    struct guest_fault *f, bool p)
{
	union crste newcrste;
	union crste newcrste, oldcrste;
	gfn_t gfn;
	int rc;

@@ -1469,16 +1469,20 @@ static int _do_shadow_crste(struct gmap *sg, gpa_t raddr, union crste *host, uni
	if (rc)
		return rc;

	newcrste = _crste_fc1(f->pfn, host->h.tt, f->writable, !p);
	newcrste.s.fc1.d |= host->s.fc1.d;
	newcrste.s.fc1.sd |= host->s.fc1.sd;
	newcrste.h.p &= host->h.p;
	do {
		oldcrste = READ_ONCE(*host);
		newcrste = _crste_fc1(f->pfn, oldcrste.h.tt, f->writable, !p);
		newcrste.s.fc1.d |= oldcrste.s.fc1.d;
		newcrste.s.fc1.sd |= oldcrste.s.fc1.sd;
		newcrste.h.p &= oldcrste.h.p;
		newcrste.s.fc1.vsie_notif = 1;
	newcrste.s.fc1.prefix_notif = host->s.fc1.prefix_notif;
	_gmap_crstep_xchg(sg->parent, host, newcrste, f->gfn, false);
		newcrste.s.fc1.prefix_notif = oldcrste.s.fc1.prefix_notif;
	} while (!_gmap_crstep_xchg_atomic(sg->parent, host, oldcrste, newcrste, f->gfn, false));

	newcrste = _crste_fc1(f->pfn, host->h.tt, 0, !p);
	dat_crstep_xchg(table, newcrste, gpa_to_gfn(raddr), sg->asce);
	newcrste = _crste_fc1(f->pfn, oldcrste.h.tt, 0, !p);
	gfn = gpa_to_gfn(raddr);
	while (!dat_crstep_xchg_atomic(table, READ_ONCE(*table), newcrste, gfn, sg->asce))
		;
	return 0;
}

+48 −34
Original line number Diff line number Diff line
@@ -313,13 +313,16 @@ static long gmap_clear_young_crste(union crste *crstep, gfn_t gfn, gfn_t end, st
	struct clear_young_pte_priv *priv = walk->priv;
	union crste crste, new;

	do {
		crste = READ_ONCE(*crstep);

		if (!crste.h.fc)
			return 0;
		if (!crste.s.fc1.y && crste.h.i)
			return 0;
	if (!crste_prefix(crste) || gmap_mkold_prefix(priv->gmap, gfn, end)) {
		if (crste_prefix(crste) && !gmap_mkold_prefix(priv->gmap, gfn, end))
			break;

		new = crste;
		new.h.i = 1;
		new.s.fc1.y = 0;
@@ -328,8 +331,8 @@ static long gmap_clear_young_crste(union crste *crstep, gfn_t gfn, gfn_t end, st
			folio_set_dirty(phys_to_folio(crste_origin_large(crste)));
		new.s.fc1.d = 0;
		new.h.p = 1;
		dat_crstep_xchg(crstep, new, gfn, walk->asce);
	}
	} while (!dat_crstep_xchg_atomic(crstep, crste, new, gfn, walk->asce));

	priv->young = 1;
	return 0;
}
@@ -391,14 +394,18 @@ static long _gmap_unmap_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct
{
	struct gmap_unmap_priv *priv = walk->priv;
	struct folio *folio = NULL;
	union crste old = *crstep;

	if (crstep->h.fc) {
		if (crstep->s.fc1.pr && test_bit(GMAP_FLAG_EXPORT_ON_UNMAP, &priv->gmap->flags))
			folio = phys_to_folio(crste_origin_large(*crstep));
		gmap_crstep_xchg(priv->gmap, crstep, _CRSTE_EMPTY(crstep->h.tt), gfn);
	if (!old.h.fc)
		return 0;

	if (old.s.fc1.pr && test_bit(GMAP_FLAG_EXPORT_ON_UNMAP, &priv->gmap->flags))
		folio = phys_to_folio(crste_origin_large(old));
	/* No races should happen because kvm->mmu_lock is held in write mode */
	KVM_BUG_ON(!gmap_crstep_xchg_atomic(priv->gmap, crstep, old, _CRSTE_EMPTY(old.h.tt), gfn),
		   priv->gmap->kvm);
	if (folio)
		uv_convert_from_secure_folio(folio);
	}

	return 0;
}
@@ -474,6 +481,7 @@ static long _crste_test_and_clear_softdirty(union crste *table, gfn_t gfn, gfn_t

	if (fatal_signal_pending(current))
		return 1;
	do {
		crste = READ_ONCE(*table);
		if (!crste.h.fc)
			return 0;
@@ -485,12 +493,12 @@ static long _crste_test_and_clear_softdirty(union crste *table, gfn_t gfn, gfn_t
		 * currently running, do not reset the protection, leave it marked as
		 * dirty.
		 */
	if (!crste.s.fc1.prefix_notif || gmap_mkold_prefix(gmap, gfn, end)) {
		if (crste.s.fc1.prefix_notif && !gmap_mkold_prefix(gmap, gfn, end))
			break;
		new = crste;
		new.h.p = 1;
		new.s.fc1.sd = 0;
		gmap_crstep_xchg(gmap, table, new, gfn);
	}
	} while (!gmap_crstep_xchg_atomic(gmap, table, crste, new, gfn));

	for ( ; gfn < end; gfn++)
		mark_page_dirty(gmap->kvm, gfn);
@@ -646,8 +654,8 @@ int gmap_link(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, struct guest_fau
static int gmap_ucas_map_one(struct kvm_s390_mmu_cache *mc, struct gmap *gmap,
			     gfn_t p_gfn, gfn_t c_gfn, bool force_alloc)
{
	union crste newcrste, oldcrste;
	struct page_table *pt;
	union crste newcrste;
	union crste *crstep;
	union pte *ptep;
	int rc;
@@ -673,7 +681,11 @@ static int gmap_ucas_map_one(struct kvm_s390_mmu_cache *mc, struct gmap *gmap,
			    &crstep, &ptep);
	if (rc)
		return rc;
	dat_crstep_xchg(crstep, newcrste, c_gfn, gmap->asce);
	do {
		oldcrste = READ_ONCE(*crstep);
		if (oldcrste.val == newcrste.val)
			break;
	} while (!dat_crstep_xchg_atomic(crstep, oldcrste, newcrste, c_gfn, gmap->asce));
	return 0;
}

@@ -777,8 +789,10 @@ static void gmap_ucas_unmap_one(struct gmap *gmap, gfn_t c_gfn)
	int rc;

	rc = dat_entry_walk(NULL, c_gfn, gmap->asce, 0, TABLE_TYPE_SEGMENT, &crstep, &ptep);
	if (!rc)
		dat_crstep_xchg(crstep, _PMD_EMPTY, c_gfn, gmap->asce);
	if (rc)
		return;
	while (!dat_crstep_xchg_atomic(crstep, READ_ONCE(*crstep), _PMD_EMPTY, c_gfn, gmap->asce))
		;
}

void gmap_ucas_unmap(struct gmap *gmap, gfn_t c_gfn, unsigned long count)
@@ -1017,8 +1031,8 @@ static void gmap_unshadow_level(struct gmap *sg, gfn_t r_gfn, int level)
			dat_ptep_xchg(ptep, _PTE_EMPTY, r_gfn, sg->asce, uses_skeys(sg));
		return;
	}
	crste = READ_ONCE(*crstep);
	dat_crstep_clear(crstep, r_gfn, sg->asce);

	crste = dat_crstep_clear_atomic(crstep, r_gfn, sg->asce);
	if (crste_leaf(crste) || crste.h.i)
		return;
	if (is_pmd(crste))
+17 −12
Original line number Diff line number Diff line
@@ -194,35 +194,40 @@ static inline union pgste gmap_ptep_xchg(struct gmap *gmap, union pte *ptep, uni
	return _gmap_ptep_xchg(gmap, ptep, newpte, pgste, gfn, true);
}

static inline void _gmap_crstep_xchg(struct gmap *gmap, union crste *crstep, union crste ne,
static inline bool __must_check _gmap_crstep_xchg_atomic(struct gmap *gmap, union crste *crstep,
							 union crste oldcrste, union crste newcrste,
							 gfn_t gfn, bool needs_lock)
{
	unsigned long align = 8 + (is_pmd(*crstep) ? 0 : 11);
	unsigned long align = is_pmd(newcrste) ? _PAGE_ENTRIES : _PAGE_ENTRIES * _CRST_ENTRIES;

	if (KVM_BUG_ON(crstep->h.tt != oldcrste.h.tt || newcrste.h.tt != oldcrste.h.tt, gmap->kvm))
		return true;

	lockdep_assert_held(&gmap->kvm->mmu_lock);
	if (!needs_lock)
		lockdep_assert_held(&gmap->children_lock);

	gfn = ALIGN_DOWN(gfn, align);
	if (crste_prefix(*crstep) && (ne.h.p || ne.h.i || !crste_prefix(ne))) {
		ne.s.fc1.prefix_notif = 0;
	if (crste_prefix(oldcrste) && (newcrste.h.p || newcrste.h.i || !crste_prefix(newcrste))) {
		newcrste.s.fc1.prefix_notif = 0;
		gmap_unmap_prefix(gmap, gfn, gfn + align);
	}
	if (crste_leaf(*crstep) && crstep->s.fc1.vsie_notif &&
	    (ne.h.p || ne.h.i || !ne.s.fc1.vsie_notif)) {
		ne.s.fc1.vsie_notif = 0;
	if (crste_leaf(oldcrste) && oldcrste.s.fc1.vsie_notif &&
	    (newcrste.h.p || newcrste.h.i || !newcrste.s.fc1.vsie_notif)) {
		newcrste.s.fc1.vsie_notif = 0;
		if (needs_lock)
			gmap_handle_vsie_unshadow_event(gmap, gfn);
		else
			_gmap_handle_vsie_unshadow_event(gmap, gfn);
	}
	dat_crstep_xchg(crstep, ne, gfn, gmap->asce);
	return dat_crstep_xchg_atomic(crstep, oldcrste, newcrste, gfn, gmap->asce);
}

static inline void gmap_crstep_xchg(struct gmap *gmap, union crste *crstep, union crste ne,
static inline bool __must_check gmap_crstep_xchg_atomic(struct gmap *gmap, union crste *crstep,
							union crste oldcrste, union crste newcrste,
							gfn_t gfn)
{
	return _gmap_crstep_xchg(gmap, crstep, ne, gfn, true);
	return _gmap_crstep_xchg_atomic(gmap, crstep, oldcrste, newcrste, gfn, true);
}

/**