Commit 40f5175d authored by Jason Gunthorpe
Browse files

iommufd: Implement sw_msi support natively

iommufd has a model where the iommu_domain can be changed while the VFIO
device is attached. In this case, the MSI should continue to work. This
corner case has not worked because the dma-iommu implementation of sw_msi
is tied to a single domain.

Implement the sw_msi mapping directly and use a global per-fd table to
associate assigned IOVA to the MSI pages. This allows the MSI pages to
be loaded into a domain before it is attached ensuring that MSI is not
disrupted.

Link: https://patch.msgid.link/r/e13d23eeacd67c0a692fc468c85b483f4dd51c57.1740014950.git.nicolinc@nvidia.com


Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
parent 748706d7
Loading
Loading
Loading
Loading
+139 −22
Original line number Diff line number Diff line
@@ -5,6 +5,7 @@
#include <linux/iommufd.h>
#include <linux/slab.h>
#include <uapi/linux/iommufd.h>
#include <linux/msi.h>

#include "../iommu-priv.h"
#include "io_pagetable.h"
@@ -293,36 +294,152 @@ u32 iommufd_device_to_id(struct iommufd_device *idev)
}
EXPORT_SYMBOL_NS_GPL(iommufd_device_to_id, "IOMMUFD");

static int iommufd_group_setup_msi(struct iommufd_group *igroup,
				   struct iommufd_hwpt_paging *hwpt_paging)
/*
 * Get a iommufd_sw_msi_map for the msi physical address requested by the irq
 * layer. The mapping to IOVA is global to the iommufd file descriptor, every
 * domain that is attached to a device using the same MSI parameters will use
 * the same IOVA.
 */
static __maybe_unused struct iommufd_sw_msi_map *
iommufd_sw_msi_get_map(struct iommufd_ctx *ictx, phys_addr_t msi_addr,
		       phys_addr_t sw_msi_start)
{
	phys_addr_t sw_msi_start = igroup->sw_msi_start;
	struct iommufd_sw_msi_map *cur;
	unsigned int max_pgoff = 0;

	lockdep_assert_held(&ictx->sw_msi_lock);

	list_for_each_entry(cur, &ictx->sw_msi_list, sw_msi_item) {
		if (cur->sw_msi_start != sw_msi_start)
			continue;
		max_pgoff = max(max_pgoff, cur->pgoff + 1);
		if (cur->msi_addr == msi_addr)
			return cur;
	}

	if (ictx->sw_msi_id >=
	    BITS_PER_BYTE * sizeof_field(struct iommufd_sw_msi_maps, bitmap))
		return ERR_PTR(-EOVERFLOW);

	cur = kzalloc(sizeof(*cur), GFP_KERNEL);
	if (!cur)
		return ERR_PTR(-ENOMEM);

	cur->sw_msi_start = sw_msi_start;
	cur->msi_addr = msi_addr;
	cur->pgoff = max_pgoff;
	cur->id = ictx->sw_msi_id++;
	list_add_tail(&cur->sw_msi_item, &ictx->sw_msi_list);
	return cur;
}

/*
 * Ensure the MSI page described by @msi_map is mapped into the domain of
 * @hwpt_paging. The per-hwpt present_sw_msi bitmap records which map ids are
 * already installed so the page is mapped at most once per domain.
 *
 * Must be called with ictx->sw_msi_lock held. Returns 0 if the page is (now)
 * present in the domain, otherwise the error returned by iommu_map().
 */
static int iommufd_sw_msi_install(struct iommufd_ctx *ictx,
				  struct iommufd_hwpt_paging *hwpt_paging,
				  struct iommufd_sw_msi_map *msi_map)
{
	unsigned long msi_iova;
	int ret;

	lockdep_assert_held(&ictx->sw_msi_lock);

	/* Already mapped into this domain, nothing to do */
	if (test_bit(msi_map->id, hwpt_paging->present_sw_msi.bitmap))
		return 0;

	/* Each map occupies one page of the window, indexed by pgoff */
	msi_iova = msi_map->sw_msi_start + msi_map->pgoff * PAGE_SIZE;
	ret = iommu_map(hwpt_paging->common.domain, msi_iova,
			msi_map->msi_addr, PAGE_SIZE,
			IOMMU_WRITE | IOMMU_READ | IOMMU_MMIO,
			GFP_KERNEL_ACCOUNT);
	if (ret)
		return ret;

	__set_bit(msi_map->id, hwpt_paging->present_sw_msi.bitmap);
	return 0;
}

/*
 * Called by the irq code if the platform translates the MSI address through the
 * IOMMU. msi_addr is the physical address of the MSI page. iommufd will
 * allocate a fd global iova for the physical page that is the same on all
 * domains and devices.
 *
 * Returns 0 without touching @desc when the device has no attach handle or
 * its group has no software MSI window. Otherwise the MSI page is mapped into
 * @domain, recorded as required by the group, and the resulting IOVA is stored
 * in @desc. Errors from looking up/creating the map or from mapping the page
 * are returned to the caller.
 */
#ifdef CONFIG_IRQ_MSI_IOMMU
int iommufd_sw_msi(struct iommu_domain *domain, struct msi_desc *desc,
		   phys_addr_t msi_addr)
{
	struct device *dev = msi_desc_to_dev(desc);
	struct iommufd_hwpt_paging *hwpt_paging;
	struct iommu_attach_handle *raw_handle;
	struct iommufd_attach_handle *handle;
	struct iommufd_sw_msi_map *msi_map;
	struct iommufd_ctx *ictx;
	unsigned long iova;
	int rc;

	/*
	 * It is safe to call iommu_attach_handle_get() here because the iommu
	 * core code invokes this under the group mutex which also prevents any
	 * change of the attach handle for the duration of this function.
	 */
	iommu_group_mutex_assert(dev);

	raw_handle =
		iommu_attach_handle_get(dev->iommu_group, IOMMU_NO_PASID, 0);
	if (IS_ERR(raw_handle))
		return 0;
	hwpt_paging = find_hwpt_paging(domain->iommufd_hwpt);

	handle = to_iommufd_handle(raw_handle);
	/* No IOMMU_RESV_SW_MSI means no change to the msi_msg */
	if (handle->idev->igroup->sw_msi_start == PHYS_ADDR_MAX)
		return 0;

	ictx = handle->idev->ictx;
	guard(mutex)(&ictx->sw_msi_lock);
	/*
	 * The input msi_addr is the exact byte offset of the MSI doorbell, we
	 * assume the caller has checked that it is contained within a MMIO
	 * region that is secure to map at PAGE_SIZE.
	 */
	msi_map = iommufd_sw_msi_get_map(ictx, msi_addr & PAGE_MASK,
					 handle->idev->igroup->sw_msi_start);
	if (IS_ERR(msi_map))
		return PTR_ERR(msi_map);

	rc = iommufd_sw_msi_install(ictx, hwpt_paging, msi_map);
	if (rc)
		return rc;
	/* Remember the page so it is installed into domains attached later */
	__set_bit(msi_map->id, handle->idev->igroup->required_sw_msi.bitmap);

	iova = msi_map->sw_msi_start + msi_map->pgoff * PAGE_SIZE;
	msi_desc_set_iommu_msi_iova(desc, iova, PAGE_SHIFT);
	return 0;
}
#endif

/*
 * Prepare @hwpt_paging for MSI delivery from the devices of @igroup by mapping
 * every MSI page the group has been marked as requiring into the hwpt's domain.
 *
 * Does nothing and returns 0 when the group has no software MSI window
 * (sw_msi_start == PHYS_ADDR_MAX). Otherwise returns 0 on success or the first
 * error reported by iommufd_sw_msi_install().
 */
static int iommufd_group_setup_msi(struct iommufd_group *igroup,
				   struct iommufd_hwpt_paging *hwpt_paging)
{
	struct iommufd_ctx *ictx = igroup->ictx;
	struct iommufd_sw_msi_map *cur;

	if (igroup->sw_msi_start == PHYS_ADDR_MAX)
		return 0;

	/*
	 * Install all the MSI pages the device has been using into the domain
	 */
	guard(mutex)(&ictx->sw_msi_lock);
	list_for_each_entry(cur, &ictx->sw_msi_list, sw_msi_item) {
		int rc;

		/* Skip maps of other windows and maps this group never used */
		if (cur->sw_msi_start != igroup->sw_msi_start ||
		    !test_bit(cur->id, igroup->required_sw_msi.bitmap))
			continue;

		rc = iommufd_sw_msi_install(ictx, hwpt_paging, cur);
		if (rc)
			return rc;
	}
	return 0;
}
+3 −0
Original line number Diff line number Diff line
@@ -156,6 +156,7 @@ iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas,
			goto out_abort;
		}
	}
	iommu_domain_set_sw_msi(hwpt->domain, iommufd_sw_msi);

	/*
	 * Set the coherency mode before we do iopt_table_add_domain() as some
@@ -251,6 +252,7 @@ iommufd_hwpt_nested_alloc(struct iommufd_ctx *ictx,
		goto out_abort;
	}
	hwpt->domain->owner = ops;
	iommu_domain_set_sw_msi(hwpt->domain, iommufd_sw_msi);

	if (WARN_ON_ONCE(hwpt->domain->type != IOMMU_DOMAIN_NESTED)) {
		rc = -EINVAL;
@@ -307,6 +309,7 @@ iommufd_viommu_alloc_hwpt_nested(struct iommufd_viommu *viommu, u32 flags,
		goto out_abort;
	}
	hwpt->domain->owner = viommu->iommu_dev->ops;
	iommu_domain_set_sw_msi(hwpt->domain, iommufd_sw_msi);

	if (WARN_ON_ONCE(hwpt->domain->type != IOMMU_DOMAIN_NESTED)) {
		rc = -EINVAL;
+22 −1
Original line number Diff line number Diff line
@@ -19,6 +19,22 @@ struct iommu_group;
struct iommu_option;
struct iommufd_device;

/*
 * One MSI page known to an iommufd_ctx. The IOVA used for the page is
 * sw_msi_start + pgoff * PAGE_SIZE.
 */
struct iommufd_sw_msi_map {
	/* Entry on iommufd_ctx::sw_msi_list */
	struct list_head sw_msi_item;
	/* Base of the software MSI window this page is placed in */
	phys_addr_t sw_msi_start;
	/* Physical address of the MSI page that gets mapped */
	phys_addr_t msi_addr;
	/* Page index of this entry inside the window */
	unsigned int pgoff;
	/* Bit number of this entry in struct iommufd_sw_msi_maps bitmaps */
	unsigned int id;
};

/*
 * Bitmap of struct iommufd_sw_msi_map::id. The bitmap size bounds how many
 * ids can exist per iommufd_ctx; iommufd_sw_msi_get_map() fails with
 * -EOVERFLOW once sw_msi_id reaches the number of bits available here.
 */
struct iommufd_sw_msi_maps {
	DECLARE_BITMAP(bitmap, 64);
};

int iommufd_sw_msi(struct iommu_domain *domain, struct msi_desc *desc,
		   phys_addr_t msi_addr);

struct iommufd_ctx {
	struct file *file;
	struct xarray objects;
@@ -26,6 +42,10 @@ struct iommufd_ctx {
	wait_queue_head_t destroy_wait;
	struct rw_semaphore ioas_creation_lock;

	struct mutex sw_msi_lock;
	struct list_head sw_msi_list;
	unsigned int sw_msi_id;

	u8 account_mode;
	/* Compatibility with VFIO no iommu */
	u8 no_iommu_mode;
@@ -283,10 +303,10 @@ struct iommufd_hwpt_paging {
	struct iommufd_ioas *ioas;
	bool auto_domain : 1;
	bool enforce_cache_coherency : 1;
	bool msi_cookie : 1;
	bool nest_parent : 1;
	/* Head at iommufd_ioas::hwpt_list */
	struct list_head hwpt_item;
	struct iommufd_sw_msi_maps present_sw_msi;
};

struct iommufd_hwpt_nested {
@@ -383,6 +403,7 @@ struct iommufd_group {
	struct iommu_group *group;
	struct iommufd_hw_pagetable *hwpt;
	struct list_head device_list;
	struct iommufd_sw_msi_maps required_sw_msi;
	phys_addr_t sw_msi_start;
};

+9 −0
Original line number Diff line number Diff line
@@ -227,6 +227,8 @@ static int iommufd_fops_open(struct inode *inode, struct file *filp)
	xa_init(&ictx->groups);
	ictx->file = filp;
	init_waitqueue_head(&ictx->destroy_wait);
	mutex_init(&ictx->sw_msi_lock);
	INIT_LIST_HEAD(&ictx->sw_msi_list);
	filp->private_data = ictx;
	return 0;
}
@@ -234,6 +236,8 @@ static int iommufd_fops_open(struct inode *inode, struct file *filp)
static int iommufd_fops_release(struct inode *inode, struct file *filp)
{
	struct iommufd_ctx *ictx = filp->private_data;
	struct iommufd_sw_msi_map *next;
	struct iommufd_sw_msi_map *cur;
	struct iommufd_object *obj;

	/*
@@ -262,6 +266,11 @@ static int iommufd_fops_release(struct inode *inode, struct file *filp)
			break;
	}
	WARN_ON(!xa_empty(&ictx->groups));

	mutex_destroy(&ictx->sw_msi_lock);
	list_for_each_entry_safe(cur, next, &ictx->sw_msi_list, sw_msi_item)
		kfree(cur);

	kfree(ictx);
	return 0;
}