Commit 463f46e1 authored by Linus Torvalds's avatar Linus Torvalds
Browse files
Pull iommufd updates from Jason Gunthorpe:
 "This brings three new iommufd capabilities:

   - Dirty tracking for DMA.

     AMD/ARM/Intel CPUs can now record if a DMA writes to a page in the
     IOPTEs within the IO page table. This can be used to generate a
     record of what memory is being dirtied by DMA activities during a
     VM migration process. A VMM like qemu will combine the IOMMU dirty
     bits with the CPU's dirty log to determine what memory to transfer.

     VFIO already has a DMA dirty tracking framework that requires PCI
     devices to implement tracking HW internally. The iommufd version
     provides an alternative that the VMM can select, if available. The
     two are designed to have very similar APIs.

   - Userspace controlled attributes for hardware page tables
     (HWPT/iommu_domain). There are currently a few generic attributes
     for HWPTs (support dirty tracking, and parent of a nest). This is
     an entry point for the userspace iommu driver to control the HW in
     detail.

   - Nested translation support for HWPTs. This is a 2D translation
     scheme similar to the CPU where a DMA goes through a first stage to
     determine an intermediate address which is then translated through a
     second stage to a physical address.

     Like for CPU translation the first stage table would exist in VM
     controlled memory and the second stage is in the kernel and matches
     the VM's guest to physical map.

     As every IOMMU has a unique set of parameters to describe the S1 IO
     page table and its associated parameters the userspace IOMMU driver
     has to marshal the information into the correct format.

     This is 1/3 of the feature: it allows creating the nested
     translation and binding it to VFIO devices, however the API to
     support IOTLB and ATC invalidation of the stage 1 io page table,
     and forwarding of IO faults are still in progress.

  The series includes AMD and Intel support for dirty tracking, and Intel
  support for nested translation.

  Along the way are a number of internal items:

   - New iommu core items: ops->domain_alloc_user(),
     ops->set_dirty_tracking, ops->read_and_clear_dirty(),
     IOMMU_DOMAIN_NESTED, and iommu_copy_struct_from_user

   - UAF fix in iopt_area_split()

   - Spelling fixes and some test suite improvements"

* tag 'for-linus-iommufd' of git://git.kernel.org/pub/scm/linux/kernel/git/jgg/iommufd: (52 commits)
  iommufd: Organize the mock domain alloc functions closer to Joerg's tree
  iommufd/selftest: Fix page-size check in iommufd_test_dirty()
  iommufd: Add iopt_area_alloc()
  iommufd: Fix missing update of domains_itree after splitting iopt_area
  iommu/vt-d: Disallow read-only mappings to nest parent domain
  iommu/vt-d: Add nested domain allocation
  iommu/vt-d: Set the nested domain to a device
  iommu/vt-d: Make domain attach helpers to be extern
  iommu/vt-d: Add helper to setup pasid nested translation
  iommu/vt-d: Add helper for nested domain allocation
  iommu/vt-d: Extend dmar_domain to support nested domain
  iommufd: Add data structure for Intel VT-d stage-1 domain allocation
  iommu/vt-d: Enhance capability check for nested parent domain allocation
  iommufd/selftest: Add coverage for IOMMU_HWPT_ALLOC with nested HWPTs
  iommufd/selftest: Add nested domain allocation for mock domain
  iommu: Add iommu_copy_struct_from_user helper
  iommufd: Add a nested HW pagetable object
  iommu: Pass in parent domain with user_data to domain_alloc_user op
  iommufd: Share iommufd_hwpt_alloc with IOMMUFD_OBJ_HWPT_NESTED
  iommufd: Derive iommufd_hwpt_paging from iommufd_hw_pagetable
  ...
parents ff269e2c b2b67c99
Loading
Loading
Loading
Loading
+4 −0
Original line number Diff line number Diff line
@@ -7,6 +7,10 @@ config IOMMU_IOVA
config IOMMU_API
	bool

# Promptless symbol: it cannot be set by the user and is only turned on by
# drivers through "select" (e.g. AMD_IOMMU selects it when IOMMUFD is set).
config IOMMUFD_DRIVER
	bool
	default n

menuconfig IOMMU_SUPPORT
	bool "IOMMU Hardware Support"
	depends on MMU
+1 −0
Original line number Diff line number Diff line
@@ -10,6 +10,7 @@ config AMD_IOMMU
	select IOMMU_API
	select IOMMU_IOVA
	select IOMMU_IO_PGTABLE
	select IOMMUFD_DRIVER if IOMMUFD
	depends on X86_64 && PCI && ACPI && HAVE_CMPXCHG_DOUBLE
	help
	  With this option you can enable support for AMD IOMMU hardware in
+12 −0
Original line number Diff line number Diff line
@@ -97,7 +97,9 @@
#define FEATURE_GATS_MASK	(3ULL)
#define FEATURE_GAM_VAPIC	BIT_ULL(21)
#define FEATURE_GIOSUP		BIT_ULL(48)
#define FEATURE_HASUP		BIT_ULL(49)
#define FEATURE_EPHSUP		BIT_ULL(50)
#define FEATURE_HDSUP		BIT_ULL(52)
#define FEATURE_SNP		BIT_ULL(63)

#define FEATURE_PASID_SHIFT	32
@@ -212,6 +214,7 @@
/* macros and definitions for device table entries */
#define DEV_ENTRY_VALID         0x00
#define DEV_ENTRY_TRANSLATION   0x01
#define DEV_ENTRY_HAD           0x07
#define DEV_ENTRY_PPR           0x34
#define DEV_ENTRY_IR            0x3d
#define DEV_ENTRY_IW            0x3e
@@ -370,10 +373,16 @@
#define PTE_LEVEL_PAGE_SIZE(level)			\
	(1ULL << (12 + (9 * (level))))

/*
 * The IOPTE dirty bit
 */
#define IOMMU_PTE_HD_BIT (6)

/*
 * Bit value definition for I/O PTE fields
 */
#define IOMMU_PTE_PR	BIT_ULL(0)
#define IOMMU_PTE_HD	BIT_ULL(IOMMU_PTE_HD_BIT)
#define IOMMU_PTE_U	BIT_ULL(59)
#define IOMMU_PTE_FC	BIT_ULL(60)
#define IOMMU_PTE_IR	BIT_ULL(61)
@@ -384,6 +393,7 @@
 */
#define DTE_FLAG_V	BIT_ULL(0)
#define DTE_FLAG_TV	BIT_ULL(1)
/*
 * Both bits of the two-bit field starting at DTE bit 7 (DEV_ENTRY_HAD).
 * set_dte_entry() ORs this into the DTE when the domain has dirty_tracking set.
 */
#define DTE_FLAG_HAD	(3ULL << 7)
#define DTE_FLAG_GIOV	BIT_ULL(54)
#define DTE_FLAG_GV	BIT_ULL(55)
#define DTE_GLX_SHIFT	(56)
@@ -413,6 +423,7 @@

#define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL)
#define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_PR)
#define IOMMU_PTE_DIRTY(pte) ((pte) & IOMMU_PTE_HD)
#define IOMMU_PTE_PAGE(pte) (iommu_phys_to_virt((pte) & IOMMU_PAGE_MASK))
#define IOMMU_PTE_MODE(pte) (((pte) >> 9) & 0x07)

@@ -563,6 +574,7 @@ struct protection_domain {
	int nid;		/* Node ID */
	u64 *gcr3_tbl;		/* Guest CR3 table */
	unsigned long flags;	/* flags to find out type of domain */
	bool dirty_tracking;	/* dirty tracking is enabled in the domain */
	unsigned dev_cnt;	/* devices assigned to this domain */
	unsigned dev_iommu[MAX_IOMMUS]; /* per-IOMMU reference count */
};
+68 −0
Original line number Diff line number Diff line
@@ -486,6 +486,73 @@ static phys_addr_t iommu_v1_iova_to_phys(struct io_pgtable_ops *ops, unsigned lo
	return (__pte & ~offset_mask) | (iova & offset_mask);
}

/*
 * Report whether any of the PTEs backing one mapping of @size bytes has its
 * dirty (HD) bit set, starting at @ptep.
 *
 * With IOMMU_DIRTY_NO_CLEAR in @flags the bits are only inspected and the
 * scan stops at the first dirty PTE. Otherwise the dirty bit is cleared in
 * every replicated PTE.
 *
 * 2.2.3.2 Host Dirty Support
 * When a non-default page size is used, software must OR the Dirty bits in
 * all of the replicated host PTEs used to map the page. The IOMMU does not
 * guarantee the Dirty bits are set in all of the replicated PTEs. Any
 * portion of the page may have been written even if the Dirty bit is set in
 * only one of the replicated PTEs.
 */
static bool pte_test_and_clear_dirty(u64 *ptep, unsigned long size,
				     unsigned long flags)
{
	int nr_ptes = PAGE_SIZE_PTE_COUNT(size);
	bool seen_dirty = false;
	int idx;

	if (flags & IOMMU_DIRTY_NO_CLEAR) {
		/* Read-only query: the first dirty replica settles the answer. */
		for (idx = 0; idx < nr_ptes; idx++) {
			if (test_bit(IOMMU_PTE_HD_BIT,
				     (unsigned long *)&ptep[idx]))
				return true;
		}
		return false;
	}

	/* Clearing query: every replica must be visited so none stays dirty. */
	for (idx = 0; idx < nr_ptes; idx++) {
		if (test_and_clear_bit(IOMMU_PTE_HD_BIT,
				       (unsigned long *)&ptep[idx]))
			seen_dirty = true;
	}

	return seen_dirty;
}

/*
 * Walk the v1 page table over [@iova, @iova + @size) and record every
 * present mapping whose dirty bit is set into @dirty.
 *
 * @flags is passed through to pte_test_and_clear_dirty(), which decides
 * whether the dirty bits are cleared or only read. Always returns 0.
 */
static int iommu_v1_read_and_clear_dirty(struct io_pgtable_ops *ops,
					 unsigned long iova, size_t size,
					 unsigned long flags,
					 struct iommu_dirty_bitmap *dirty)
{
	struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
	unsigned long end = iova + size - 1;

	do {
		unsigned long pgsize = 0;
		u64 *ptep = fetch_pte(pgtable, iova, &pgsize);

		if (ptep && IOMMU_PTE_PRESENT(READ_ONCE(*ptep))) {
			/*
			 * Mark the whole IOVA range as dirty even if only
			 * one of the replicated PTEs were marked dirty.
			 */
			if (pte_test_and_clear_dirty(ptep, pgsize, flags))
				iommu_dirty_bitmap_record(dirty, iova, pgsize);
		} else if (!pgsize) {
			/* No size reported for the hole: step by the smallest page. */
			pgsize = PTE_LEVEL_PAGE_SIZE(0);
		}

		iova += pgsize;
	} while (iova < end);

	return 0;
}

/*
 * ----------------------------------------------------
 */
@@ -527,6 +594,7 @@ static struct io_pgtable *v1_alloc_pgtable(struct io_pgtable_cfg *cfg, void *coo
	pgtable->iop.ops.map_pages    = iommu_v1_map_pages;
	pgtable->iop.ops.unmap_pages  = iommu_v1_unmap_pages;
	pgtable->iop.ops.iova_to_phys = iommu_v1_iova_to_phys;
	pgtable->iop.ops.read_and_clear_dirty = iommu_v1_read_and_clear_dirty;

	return &pgtable->iop;
}
+144 −3
Original line number Diff line number Diff line
@@ -37,6 +37,7 @@
#include <asm/iommu.h>
#include <asm/gart.h>
#include <asm/dma.h>
#include <uapi/linux/iommufd.h>

#include "amd_iommu.h"
#include "../dma-iommu.h"
@@ -65,6 +66,7 @@ LIST_HEAD(hpet_map);
LIST_HEAD(acpihid_map);

const struct iommu_ops amd_iommu_ops;
const struct iommu_dirty_ops amd_dirty_ops;

static ATOMIC_NOTIFIER_HEAD(ppr_notifier);
int amd_iommu_max_glx_val = -1;
@@ -1610,6 +1612,9 @@ static void set_dte_entry(struct amd_iommu *iommu, u16 devid,
			pte_root |= 1ULL << DEV_ENTRY_PPR;
	}

	if (domain->dirty_tracking)
		pte_root |= DTE_FLAG_HAD;

	if (domain->flags & PD_IOMMUV2_MASK) {
		u64 gcr3 = iommu_virt_to_phys(domain->gcr3_tbl);
		u64 glx  = domain->glx;
@@ -2155,28 +2160,79 @@ static inline u64 dma_max_address(void)
	return ((1ULL << PM_LEVEL_SHIFT(amd_iommu_gpt_level)) - 1);
}

static struct iommu_domain *amd_iommu_domain_alloc(unsigned type)
static bool amd_iommu_hd_support(struct amd_iommu *iommu)
{
	return iommu && (iommu->features & FEATURE_HDSUP);
}

/*
 * Common allocation path behind amd_iommu_domain_alloc() and
 * amd_iommu_domain_alloc_user().
 *
 * @type:  IOMMU_DOMAIN_* type handed to protection_domain_alloc()
 * @dev:   device the domain is allocated for, or NULL when the caller has
 *         none; with a device, the domain's type, pgsize_bitmap and ops are
 *         filled in from that device's IOMMU
 * @flags: IOMMU_HWPT_ALLOC_* flags; only IOMMU_HWPT_ALLOC_DIRTY_TRACKING is
 *         examined here
 *
 * Returns the new iommu_domain or an ERR_PTR(), never NULL:
 *   -ENODEV      @dev has no AMD IOMMU
 *   -EINVAL      identity domain requested while SNP is enabled
 *   -EOPNOTSUPP  dirty tracking requested but amd_iommu_hd_support() is
 *                false (which includes @dev == NULL)
 *   -ENOMEM      protection_domain_alloc() failed
 */
static struct iommu_domain *do_iommu_domain_alloc(unsigned int type,
						  struct device *dev, u32 flags)
{
	bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
	struct protection_domain *domain;
	struct amd_iommu *iommu = NULL;

	if (dev) {
		iommu = rlookup_amd_iommu(dev);
		if (!iommu)
			return ERR_PTR(-ENODEV);
	}

	/*
	 * Since DTE[Mode]=0 is prohibited on SNP-enabled system,
	 * default to use IOMMU_DOMAIN_DMA[_FQ].
	 */
	if (amd_iommu_snp_en && (type == IOMMU_DOMAIN_IDENTITY))
		return ERR_PTR(-EINVAL);

	if (dirty_tracking && !amd_iommu_hd_support(iommu))
		return ERR_PTR(-EOPNOTSUPP);

	domain = protection_domain_alloc(type);
	if (!domain)
		return ERR_PTR(-ENOMEM);

	domain->domain.geometry.aperture_start = 0;
	domain->domain.geometry.aperture_end   = dma_max_address();
	domain->domain.geometry.force_aperture = true;

	/* Device-specific setup is only possible when an IOMMU was looked up. */
	if (iommu) {
		domain->domain.type = type;
		domain->domain.pgsize_bitmap = iommu->iommu.ops->pgsize_bitmap;
		domain->domain.ops = iommu->iommu.ops->default_domain_ops;

		if (dirty_tracking)
			domain->domain.dirty_ops = &amd_dirty_ops;
	}

	return &domain->domain;
}

/*
 * iommu_ops::domain_alloc entry point: allocate a domain of @type with no
 * device and no flags. An ERR_PTR() from do_iommu_domain_alloc() is turned
 * into NULL; any other result is returned unchanged.
 */
static struct iommu_domain *amd_iommu_domain_alloc(unsigned int type)
{
	struct iommu_domain *dom = do_iommu_domain_alloc(type, NULL, 0);

	return IS_ERR(dom) ? NULL : dom;
}

/*
 * iommu_ops::domain_alloc_user entry point.
 *
 * Only IOMMU_HWPT_ALLOC_DIRTY_TRACKING is accepted in @flags, and neither a
 * @parent domain nor @user_data may be supplied; anything else yields
 * ERR_PTR(-EOPNOTSUPP). Otherwise an IOMMU_DOMAIN_UNMANAGED domain is
 * allocated for @dev through do_iommu_domain_alloc().
 */
static struct iommu_domain *
amd_iommu_domain_alloc_user(struct device *dev, u32 flags,
			    struct iommu_domain *parent,
			    const struct iommu_user_data *user_data)
{
	const u32 supported_flags = IOMMU_HWPT_ALLOC_DIRTY_TRACKING;

	if (parent || user_data)
		return ERR_PTR(-EOPNOTSUPP);

	if (flags & ~supported_flags)
		return ERR_PTR(-EOPNOTSUPP);

	return do_iommu_domain_alloc(IOMMU_DOMAIN_UNMANAGED, dev, flags);
}

static void amd_iommu_domain_free(struct iommu_domain *dom)
{
	struct protection_domain *domain;
@@ -2214,6 +2270,13 @@ static int amd_iommu_attach_device(struct iommu_domain *dom,

	dev_data->defer_attach = false;

	/*
	 * Restrict to devices with compatible IOMMU hardware support
	 * when enforcement of dirty tracking is enabled.
	 */
	if (dom->dirty_ops && !amd_iommu_hd_support(iommu))
		return -EINVAL;

	if (dev_data->domain)
		detach_device(dev);

@@ -2332,6 +2395,11 @@ static bool amd_iommu_capable(struct device *dev, enum iommu_cap cap)
		return true;
	case IOMMU_CAP_DEFERRED_FLUSH:
		return true;
	case IOMMU_CAP_DIRTY_TRACKING: {
		struct amd_iommu *iommu = rlookup_amd_iommu(dev);

		return amd_iommu_hd_support(iommu);
	}
	default:
		break;
	}
@@ -2339,6 +2407,73 @@ static bool amd_iommu_capable(struct device *dev, enum iommu_cap cap)
	return false;
}

/*
 * iommu_dirty_ops::set_dirty_tracking: switch dirty tracking on or off for
 * every device attached to @domain.
 *
 * Under the domain lock, the HAD bits in data[0] of each attached device's
 * DTE are set or cleared and the DTE is flushed. If any DTE was touched the
 * domain's IOTLB is flushed too, and the new state is then stored in
 * pdomain->dirty_tracking. A request that matches the current state changes
 * nothing. Always returns 0.
 */
static int amd_iommu_set_dirty_tracking(struct iommu_domain *domain,
					bool enable)
{
	struct protection_domain *pdomain = to_pdomain(domain);
	struct iommu_dev_data *dev_data;
	bool touched_dte = false;
	unsigned long flags;

	spin_lock_irqsave(&pdomain->lock, flags);

	/* Already in the requested state: nothing to update. */
	if (pdomain->dirty_tracking == enable)
		goto out_unlock;

	list_for_each_entry(dev_data, &pdomain->dev_list, list) {
		struct amd_iommu *iommu = rlookup_amd_iommu(dev_data->dev);
		struct dev_table_entry *dev_table;
		u64 dte_lo;

		if (!iommu)
			continue;

		dev_table = get_dev_table(iommu);
		dte_lo = dev_table[dev_data->devid].data[0];

		if (enable)
			dte_lo |= DTE_FLAG_HAD;
		else
			dte_lo &= ~DTE_FLAG_HAD;

		/* Flush device DTE */
		dev_table[dev_data->devid].data[0] = dte_lo;
		device_flush_dte(dev_data);
		touched_dte = true;
	}

	/* Flush IOTLB to mark IOPTE dirty on the next translation(s) */
	if (touched_dte) {
		amd_iommu_domain_flush_tlb_pde(pdomain);
		amd_iommu_domain_flush_complete(pdomain);
	}
	pdomain->dirty_tracking = enable;

out_unlock:
	spin_unlock_irqrestore(&pdomain->lock, flags);

	return 0;
}

/*
 * iommu_dirty_ops::read_and_clear_dirty for AMD protection domains.
 *
 * Hands the walk of [@iova, @iova + @size) to the read_and_clear_dirty op of
 * the domain's io-pgtable, passing @flags and @dirty through unchanged.
 *
 * Returns -EOPNOTSUPP when the io-pgtable provides no such op, -EINVAL when
 * a bitmap is supplied while dirty tracking is disabled on the domain, and
 * otherwise whatever the io-pgtable op returns.
 */
static int amd_iommu_read_and_clear_dirty(struct iommu_domain *domain,
					  unsigned long iova, size_t size,
					  unsigned long flags,
					  struct iommu_dirty_bitmap *dirty)
{
	struct protection_domain *pdomain = to_pdomain(domain);
	struct io_pgtable_ops *ops = &pdomain->iop.iop.ops;
	unsigned long lflags;

	/*
	 * ops is the address of a member embedded in the domain, so only the
	 * callback itself can be missing.
	 */
	if (!ops->read_and_clear_dirty)
		return -EOPNOTSUPP;

	/* The lock only covers the check of the tracking state. */
	spin_lock_irqsave(&pdomain->lock, lflags);
	if (!pdomain->dirty_tracking && dirty->bitmap) {
		spin_unlock_irqrestore(&pdomain->lock, lflags);
		return -EINVAL;
	}
	spin_unlock_irqrestore(&pdomain->lock, lflags);

	return ops->read_and_clear_dirty(ops, iova, size, flags, dirty);
}

static void amd_iommu_get_resv_regions(struct device *dev,
				       struct list_head *head)
{
@@ -2461,9 +2596,15 @@ static bool amd_iommu_enforce_cache_coherency(struct iommu_domain *domain)
	return true;
}

/*
 * Dirty tracking callbacks. do_iommu_domain_alloc() installs these as the
 * domain's dirty_ops when a domain is allocated for a device with
 * IOMMU_HWPT_ALLOC_DIRTY_TRACKING set.
 */
const struct iommu_dirty_ops amd_dirty_ops = {
	.set_dirty_tracking = amd_iommu_set_dirty_tracking,
	.read_and_clear_dirty = amd_iommu_read_and_clear_dirty,
};

const struct iommu_ops amd_iommu_ops = {
	.capable = amd_iommu_capable,
	.domain_alloc = amd_iommu_domain_alloc,
	.domain_alloc_user = amd_iommu_domain_alloc_user,
	.probe_device = amd_iommu_probe_device,
	.release_device = amd_iommu_release_device,
	.probe_finalize = amd_iommu_probe_finalize,
Loading