Commit 8b3f7873 authored by Suravee Suthikulpanit, committed by Joerg Roedel
Browse files

iommu/amd: Introduce helper function to update 256-bit DTE



The current implementation does not follow the 128-bit write requirement
for updating the DTE, as specified in the AMD I/O Virtualization Technology
(IOMMU) Specification.

Therefore, modify struct dev_table_entry to contain a union with a u128 data
array, and introduce a helper function, update_dte256(), which updates the
256-bit DTE using two 128-bit cmpxchg operations on the modified structure.
The helper takes the DTE[V, GV] bits into account when programming the DTE
to ensure the proper ordering of DTE programming and flushing.

In addition, introduce a per-DTE spinlock, struct iommu_dev_data.dte_lock, to
provide synchronization when updating the DTE and to prevent cmpxchg128
failures.

Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
Suggested-by: Uros Bizjak <ubizjak@gmail.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Uros Bizjak <ubizjak@gmail.com>
Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
Link: https://lore.kernel.org/r/20241118054937.5203-5-suravee.suthikulpanit@amd.com


Signed-off-by: Joerg Roedel <jroedel@suse.de>
parent 7bea695a
Loading
Loading
Loading
Loading
+9 −1
Original line number Diff line number Diff line
@@ -427,9 +427,13 @@
#define DTE_GCR3_SHIFT_C	43

/*
 * Guest page table level field, bits 55:54. update_dte256() extracts it
 * from DTE data[2] to compare the level count of the old and new entries.
 */
#define DTE_GPT_LEVEL_SHIFT	54
#define DTE_GPT_LEVEL_MASK	GENMASK_ULL(55, 54)

#define GCR3_VALID		0x01ULL

/*
 * DTE[128:179] | DTE[184:191]
 *
 * Every bit of DTE data[2] except bits 55:52. write_dte_upper128() carries
 * these bits over from the existing entry when it installs a new upper half.
 * The expansion is parenthesized so the macro stays a single operand
 * wherever it is used.
 */
#define DTE_DATA2_INTR_MASK	(~GENMASK_ULL(55, 52))

/* Bits 51:12 set: clears the low 12 bits and bits 63:52 of a 64-bit value. */
#define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL)
/* Non-zero when IOMMU_PTE_PR is set in @pte (the masked bit, not a normalized boolean). */
#define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_PR)
/* Non-zero when IOMMU_PTE_HD is set in @pte (the masked bit, not a normalized boolean). */
#define IOMMU_PTE_DIRTY(pte) ((pte) & IOMMU_PTE_HD)
@@ -842,6 +846,7 @@ struct devid_map {
struct iommu_dev_data {
	/*Protect against attach/detach races */
	struct mutex mutex;
	spinlock_t dte_lock;              /* DTE lock for 256-bit access */

	struct list_head list;		  /* For domain->dev_list */
	struct llist_node dev_data_list;  /* For global dev_data_list */
@@ -886,7 +891,10 @@ extern struct list_head amd_iommu_list;
 * Structure defining one entry in the device table
 */
struct dev_table_entry {
	/* Two views of the same 256-bit entry. */
	union {
		/* Four 64-bit words; data[0]/data[1] overlay data128[0]. */
		u64 data[4];
		/* Two 128-bit halves, the unit written by write_dte_{lower,upper}128(). */
		u128 data128[2];
	};
};

/*
+123 −0
Original line number Diff line number Diff line
@@ -83,12 +83,125 @@ static int amd_iommu_attach_device(struct iommu_domain *dom,
static void set_dte_entry(struct amd_iommu *iommu,
			  struct iommu_dev_data *dev_data);

/* Declared early because update_dte256() calls it before its definition. */
static void iommu_flush_dte_sync(struct amd_iommu *iommu, u16 devid);

/****************************************************************************
 *
 * Helper functions
 *
 ****************************************************************************/

/*
 * Store @val into *@ptr as one 128-bit access.
 *
 * arch_cmpxchg128_local() serves here only as a 128-bit store (cmpxchg16b):
 * the current content of *@ptr is passed as the expected value, so the
 * compare is a formality. That is sufficient because callers already
 * serialize updates of this DTE with a spin_lock, which is also why neither
 * LOCK_PREFIX nor a retry loop is needed.
 */
static __always_inline void amd_iommu_atomic128_set(__int128 *ptr, __int128 val)
{
	__int128 expected = *ptr;

	arch_cmpxchg128_local(ptr, expected, val);
}

static void write_dte_upper128(struct dev_table_entry *ptr, struct dev_table_entry *new)
{
	struct dev_table_entry old;

	old.data128[1] = ptr->data128[1];
	/*
	 * Preserve DTE_DATA2_INTR_MASK. This needs to be
	 * done here since it requires to be inside
	 * spin_lock(&dev_data->dte_lock) context.
	 */
	new->data[2] &= ~DTE_DATA2_INTR_MASK;
	new->data[2] |= old.data[2] & DTE_DATA2_INTR_MASK;

	amd_iommu_atomic128_set(&ptr->data128[1], new->data128[1]);
}

/*
 * Install the lower 128 bits of @new (data128[0], i.e. data[0] and data[1])
 * into the device table entry @ptr with a single 128-bit store.
 *
 * Unlike write_dte_upper128(), nothing is carried over from the existing
 * entry and @new is not modified. update_dte256() calls this while holding
 * dev_data->dte_lock.
 */
static void write_dte_lower128(struct dev_table_entry *ptr, struct dev_table_entry *new)
{
	amd_iommu_atomic128_set(&ptr->data128[0], new->data128[0]);
}

/*
 * update_dte256() - install @new as the device table entry of @dev_data.
 * @iommu:    IOMMU whose device table is updated and flushed.
 * @dev_data: device whose entry (indexed by dev_data->devid) is rewritten.
 * @new:      new entry contents. Its data[2] may be modified, because
 *            write_dte_upper128() merges preserved bits into it.
 *
 * Note:
 * IOMMU reads the entire Device Table entry in a single 256-bit transaction
 * but the driver is programming DTE using 2 128-bit cmpxchg. So, the driver
 * needs to ensure the following:
 *   - DTE[V|GV] bit is being written last when setting.
 *   - DTE[V|GV] bit is being written first when clearing.
 *
 * V and GV are tested in data[0], which belongs to the lower 128 bits.
 * Hence "upper then lower" writes V/GV last, and "lower then upper" writes
 * them first.
 *
 * This function is used only by code, which updates DMA translation part of the DTE.
 * So, only consider control bits related to DMA when updating the entry.
 *
 * The whole sequence runs under dev_data->dte_lock, taken with
 * spin_lock_irqsave().
 */
static void update_dte256(struct amd_iommu *iommu, struct iommu_dev_data *dev_data,
			  struct dev_table_entry *new)
{
	unsigned long flags;
	struct dev_table_entry *dev_table = get_dev_table(iommu);
	struct dev_table_entry *ptr = &dev_table[dev_data->devid];

	spin_lock_irqsave(&dev_data->dte_lock, flags);

	if (!(ptr->data[0] & DTE_FLAG_V)) {
		/*
		 * Existing DTE is not valid.
		 * Write the upper half first so that V (lower half) is
		 * written last.
		 */
		write_dte_upper128(ptr, new);
		write_dte_lower128(ptr, new);
		iommu_flush_dte_sync(iommu, dev_data->devid);
	} else if (!(new->data[0] & DTE_FLAG_V)) {
		/*
		 * Existing DTE is valid. New DTE is not valid.
		 * Write the lower half first so that V is cleared before the
		 * upper half changes.
		 */
		write_dte_lower128(ptr, new);
		write_dte_upper128(ptr, new);
		iommu_flush_dte_sync(iommu, dev_data->devid);
	} else if (!FIELD_GET(DTE_FLAG_GV, ptr->data[0])) {
		/*
		 * Both DTEs are valid.
		 * Existing DTE has no guest page table.
		 * GV (lower half) is written last.
		 */
		write_dte_upper128(ptr, new);
		write_dte_lower128(ptr, new);
		iommu_flush_dte_sync(iommu, dev_data->devid);
	} else if (!FIELD_GET(DTE_FLAG_GV, new->data[0])) {
		/*
		 * Both DTEs are valid.
		 * Existing DTE has guest page table,
		 * new DTE has no guest page table.
		 * GV (lower half) is cleared first.
		 */
		write_dte_lower128(ptr, new);
		write_dte_upper128(ptr, new);
		iommu_flush_dte_sync(iommu, dev_data->devid);
	} else if (FIELD_GET(DTE_GPT_LEVEL_MASK, ptr->data[2]) !=
		   FIELD_GET(DTE_GPT_LEVEL_MASK, new->data[2])) {
		/*
		 * Both DTEs are valid and have guest page table,
		 * but have different number of levels. So, we need
		 * to update both upper and lower 128-bit value, which
		 * require disabling and flushing.
		 */
		struct dev_table_entry clear = {};

		/* First disable DTE: zero the lower half (clears V and GV) and flush */
		write_dte_lower128(ptr, &clear);
		iommu_flush_dte_sync(iommu, dev_data->devid);

		/* Then update DTE: upper half first, V/GV last, and flush again */
		write_dte_upper128(ptr, new);
		write_dte_lower128(ptr, new);
		iommu_flush_dte_sync(iommu, dev_data->devid);
	} else {
		/*
		 * Both DTEs are valid and have guest page table,
		 * and same number of levels. We just need to only
		 * update the lower 128-bit. So no need to disable DTE.
		 *
		 * NOTE(review): unlike every other branch, no
		 * iommu_flush_dte_sync() is issued here; presumably the
		 * caller flushes the DTE afterwards - confirm.
		 */
		write_dte_lower128(ptr, new);
	}

	spin_unlock_irqrestore(&dev_data->dte_lock, flags);
}

static inline bool pdom_is_v2_pgtbl_mode(struct protection_domain *pdom)
{
	return (pdom && (pdom->pd_mode == PD_MODE_V2));
@@ -209,6 +322,7 @@ static struct iommu_dev_data *alloc_dev_data(struct amd_iommu *iommu, u16 devid)
		return NULL;

	mutex_init(&dev_data->mutex);
	spin_lock_init(&dev_data->dte_lock);
	dev_data->devid = devid;
	ratelimit_default_init(&dev_data->rs);

@@ -1261,6 +1375,15 @@ static int iommu_flush_dte(struct amd_iommu *iommu, u16 devid)
	return iommu_queue_command(iommu, &cmd);
}

/*
 * Queue a DTE flush for @devid on @iommu and, if queuing succeeded, wait
 * for command completion. When iommu_flush_dte() reports an error the wait
 * is skipped and the error is not propagated to the caller.
 */
static void iommu_flush_dte_sync(struct amd_iommu *iommu, u16 devid)
{
	if (iommu_flush_dte(iommu, devid))
		return;

	iommu_completion_wait(iommu);
}

static void amd_iommu_flush_dte_all(struct amd_iommu *iommu)
{
	u32 devid;