Commit c3b1edea authored by Lu Baolu's avatar Lu Baolu Committed by Joerg Roedel
Browse files

iommu/vt-d: Fix race condition during PASID entry replacement



The Intel VT-d PASID table entry is 512 bits (64 bytes). When replacing
an active PASID entry (e.g., during domain replacement), the current
implementation calculates a new entry on the stack and copies it to the
table using a single structure assignment.

        struct pasid_entry *pte, new_pte;

        pte = intel_pasid_get_entry(dev, pasid);
        pasid_pte_config_first_level(iommu, &new_pte, ...);
        *pte = new_pte;

Because the hardware may fetch the 512-bit PASID entry in multiple
128-bit chunks, updating the entire entry while it is active (Present
bit set) risks a "torn" read. In this scenario, the IOMMU hardware
could observe an inconsistent state — partially new data and partially
old data — leading to unpredictable behavior or spurious faults.

Fix this by removing the unsafe "replace" helpers and following the
"clear-then-update" flow, which ensures the Present bit is cleared and
the required invalidation handshake is completed before the new
configuration is applied.

Fixes: 7543ee63 ("iommu/vt-d: Add pasid replace helpers")
Signed-off-by: default avatarLu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: default avatarSamiullah Khawaja <skhawaja@google.com>
Reviewed-by: default avatarKevin Tian <kevin.tian@intel.com>
Link: https://lore.kernel.org/r/20260120061816.2132558-4-baolu.lu@linux.intel.com


Signed-off-by: default avatarJoerg Roedel <joerg.roedel@amd.com>
parent c1e4f1dc
Loading
Loading
Loading
Loading
+12 −17
Original line number Diff line number Diff line
@@ -1252,12 +1252,10 @@ int __domain_setup_first_level(struct intel_iommu *iommu, struct device *dev,
			       ioasid_t pasid, u16 did, phys_addr_t fsptptr,
			       int flags, struct iommu_domain *old)
{
	if (!old)
		return intel_pasid_setup_first_level(iommu, dev, fsptptr, pasid,
						     did, flags);
	return intel_pasid_replace_first_level(iommu, dev, fsptptr, pasid, did,
					       iommu_domain_did(old, iommu),
					       flags);
	if (old)
		intel_pasid_tear_down_entry(iommu, dev, pasid, false);

	return intel_pasid_setup_first_level(iommu, dev, fsptptr, pasid, did, flags);
}

static int domain_setup_second_level(struct intel_iommu *iommu,
@@ -1265,23 +1263,20 @@ static int domain_setup_second_level(struct intel_iommu *iommu,
				     struct device *dev, ioasid_t pasid,
				     struct iommu_domain *old)
{
	if (!old)
		return intel_pasid_setup_second_level(iommu, domain,
						      dev, pasid);
	return intel_pasid_replace_second_level(iommu, domain, dev,
						iommu_domain_did(old, iommu),
						pasid);
	if (old)
		intel_pasid_tear_down_entry(iommu, dev, pasid, false);

	return intel_pasid_setup_second_level(iommu, domain, dev, pasid);
}

static int domain_setup_passthrough(struct intel_iommu *iommu,
				    struct device *dev, ioasid_t pasid,
				    struct iommu_domain *old)
{
	if (!old)
	if (old)
		intel_pasid_tear_down_entry(iommu, dev, pasid, false);

	return intel_pasid_setup_pass_through(iommu, dev, pasid);
	return intel_pasid_replace_pass_through(iommu, dev,
						iommu_domain_did(old, iommu),
						pasid);
}

static int domain_setup_first_level(struct intel_iommu *iommu,
+4 −5
Original line number Diff line number Diff line
@@ -136,11 +136,10 @@ static int domain_setup_nested(struct intel_iommu *iommu,
			       struct device *dev, ioasid_t pasid,
			       struct iommu_domain *old)
{
	if (!old)
	if (old)
		intel_pasid_tear_down_entry(iommu, dev, pasid, false);

	return intel_pasid_setup_nested(iommu, dev, pasid, domain);
	return intel_pasid_replace_nested(iommu, dev, pasid,
					  iommu_domain_did(old, iommu),
					  domain);
}

static int intel_nested_set_dev_pasid(struct iommu_domain *domain,
+0 −184
Original line number Diff line number Diff line
@@ -417,50 +417,6 @@ int intel_pasid_setup_first_level(struct intel_iommu *iommu, struct device *dev,
	return 0;
}

int intel_pasid_replace_first_level(struct intel_iommu *iommu,
				    struct device *dev, phys_addr_t fsptptr,
				    u32 pasid, u16 did, u16 old_did,
				    int flags)
{
	struct pasid_entry *pte, new_pte;

	if (!ecap_flts(iommu->ecap)) {
		pr_err("No first level translation support on %s\n",
		       iommu->name);
		return -EINVAL;
	}

	if ((flags & PASID_FLAG_FL5LP) && !cap_fl5lp_support(iommu->cap)) {
		pr_err("No 5-level paging support for first-level on %s\n",
		       iommu->name);
		return -EINVAL;
	}

	pasid_pte_config_first_level(iommu, &new_pte, fsptptr, did, flags);

	spin_lock(&iommu->lock);
	pte = intel_pasid_get_entry(dev, pasid);
	if (!pte) {
		spin_unlock(&iommu->lock);
		return -ENODEV;
	}

	if (!pasid_pte_is_present(pte)) {
		spin_unlock(&iommu->lock);
		return -EINVAL;
	}

	WARN_ON(old_did != pasid_get_domain_id(pte));

	*pte = new_pte;
	spin_unlock(&iommu->lock);

	intel_pasid_flush_present(iommu, dev, pasid, old_did, pte);
	intel_iommu_drain_pasid_prq(dev, pasid);

	return 0;
}

/*
 * Set up the scalable mode pasid entry for second only translation type.
 */
@@ -527,51 +483,6 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu,
	return 0;
}

int intel_pasid_replace_second_level(struct intel_iommu *iommu,
				     struct dmar_domain *domain,
				     struct device *dev, u16 old_did,
				     u32 pasid)
{
	struct pasid_entry *pte, new_pte;
	u16 did;

	/*
	 * If hardware advertises no support for second level
	 * translation, return directly.
	 */
	if (!ecap_slts(iommu->ecap)) {
		pr_err("No second level translation support on %s\n",
		       iommu->name);
		return -EINVAL;
	}

	did = domain_id_iommu(domain, iommu);

	pasid_pte_config_second_level(iommu, &new_pte, domain, did);

	spin_lock(&iommu->lock);
	pte = intel_pasid_get_entry(dev, pasid);
	if (!pte) {
		spin_unlock(&iommu->lock);
		return -ENODEV;
	}

	if (!pasid_pte_is_present(pte)) {
		spin_unlock(&iommu->lock);
		return -EINVAL;
	}

	WARN_ON(old_did != pasid_get_domain_id(pte));

	*pte = new_pte;
	spin_unlock(&iommu->lock);

	intel_pasid_flush_present(iommu, dev, pasid, old_did, pte);
	intel_iommu_drain_pasid_prq(dev, pasid);

	return 0;
}

/*
 * Set up dirty tracking on a second only or nested translation type.
 */
@@ -684,38 +595,6 @@ int intel_pasid_setup_pass_through(struct intel_iommu *iommu,
	return 0;
}

int intel_pasid_replace_pass_through(struct intel_iommu *iommu,
				     struct device *dev, u16 old_did,
				     u32 pasid)
{
	struct pasid_entry *pte, new_pte;
	u16 did = FLPT_DEFAULT_DID;

	pasid_pte_config_pass_through(iommu, &new_pte, did);

	spin_lock(&iommu->lock);
	pte = intel_pasid_get_entry(dev, pasid);
	if (!pte) {
		spin_unlock(&iommu->lock);
		return -ENODEV;
	}

	if (!pasid_pte_is_present(pte)) {
		spin_unlock(&iommu->lock);
		return -EINVAL;
	}

	WARN_ON(old_did != pasid_get_domain_id(pte));

	*pte = new_pte;
	spin_unlock(&iommu->lock);

	intel_pasid_flush_present(iommu, dev, pasid, old_did, pte);
	intel_iommu_drain_pasid_prq(dev, pasid);

	return 0;
}

/*
 * Set the page snoop control for a pasid entry which has been set up.
 */
@@ -849,69 +728,6 @@ int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev,
	return 0;
}

int intel_pasid_replace_nested(struct intel_iommu *iommu,
			       struct device *dev, u32 pasid,
			       u16 old_did, struct dmar_domain *domain)
{
	struct iommu_hwpt_vtd_s1 *s1_cfg = &domain->s1_cfg;
	struct dmar_domain *s2_domain = domain->s2_domain;
	u16 did = domain_id_iommu(domain, iommu);
	struct pasid_entry *pte, new_pte;

	/* Address width should match the address width supported by hardware */
	switch (s1_cfg->addr_width) {
	case ADDR_WIDTH_4LEVEL:
		break;
	case ADDR_WIDTH_5LEVEL:
		if (!cap_fl5lp_support(iommu->cap)) {
			dev_err_ratelimited(dev,
					    "5-level paging not supported\n");
			return -EINVAL;
		}
		break;
	default:
		dev_err_ratelimited(dev, "Invalid stage-1 address width %d\n",
				    s1_cfg->addr_width);
		return -EINVAL;
	}

	if ((s1_cfg->flags & IOMMU_VTD_S1_SRE) && !ecap_srs(iommu->ecap)) {
		pr_err_ratelimited("No supervisor request support on %s\n",
				   iommu->name);
		return -EINVAL;
	}

	if ((s1_cfg->flags & IOMMU_VTD_S1_EAFE) && !ecap_eafs(iommu->ecap)) {
		pr_err_ratelimited("No extended access flag support on %s\n",
				   iommu->name);
		return -EINVAL;
	}

	pasid_pte_config_nestd(iommu, &new_pte, s1_cfg, s2_domain, did);

	spin_lock(&iommu->lock);
	pte = intel_pasid_get_entry(dev, pasid);
	if (!pte) {
		spin_unlock(&iommu->lock);
		return -ENODEV;
	}

	if (!pasid_pte_is_present(pte)) {
		spin_unlock(&iommu->lock);
		return -EINVAL;
	}

	WARN_ON(old_did != pasid_get_domain_id(pte));

	*pte = new_pte;
	spin_unlock(&iommu->lock);

	intel_pasid_flush_present(iommu, dev, pasid, old_did, pte);
	intel_iommu_drain_pasid_prq(dev, pasid);

	return 0;
}

/*
 * Interfaces to setup or teardown a pasid table to the scalable-mode
 * context table entry:
+0 −14
Original line number Diff line number Diff line
@@ -316,20 +316,6 @@ int intel_pasid_setup_pass_through(struct intel_iommu *iommu,
				   struct device *dev, u32 pasid);
int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev,
			     u32 pasid, struct dmar_domain *domain);
int intel_pasid_replace_first_level(struct intel_iommu *iommu,
				    struct device *dev, phys_addr_t fsptptr,
				    u32 pasid, u16 did, u16 old_did, int flags);
int intel_pasid_replace_second_level(struct intel_iommu *iommu,
				     struct dmar_domain *domain,
				     struct device *dev, u16 old_did,
				     u32 pasid);
int intel_pasid_replace_pass_through(struct intel_iommu *iommu,
				     struct device *dev, u16 old_did,
				     u32 pasid);
int intel_pasid_replace_nested(struct intel_iommu *iommu,
			       struct device *dev, u32 pasid,
			       u16 old_did, struct dmar_domain *domain);

void intel_pasid_tear_down_entry(struct intel_iommu *iommu,
				 struct device *dev, u32 pasid,
				 bool fault_ignore);