mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net.git/
synced 2026-04-17 22:23:45 -04:00
Add support for exporting PCI device MMIO regions through dma-buf, enabling safe sharing of non-struct page memory with controlled lifetime management. This allows RDMA and other subsystems to import dma-buf FDs and build them into memory regions for PCI P2P operations. The implementation provides a revocable attachment mechanism using dma-buf move operations. MMIO regions are normally pinned as BARs don't change physical addresses, but access is revoked when the VFIO device is closed or a PCI reset is issued. This ensures kernel self-defense against potentially hostile userspace. Currently VFIO can take MMIO regions from the device's BAR and map them into a PFNMAP VMA with special PTEs. This mapping type ensures the memory cannot be used with things like pin_user_pages(), hmm, and so on. In practice only the user process CPU and KVM can safely make use of these VMAs. When VFIO shuts down these VMAs are cleaned by unmap_mapping_range() to prevent any UAF of the MMIO beyond driver unbind. However, VFIO type 1 has an insecure behavior where it uses follow_pfnmap_*() to fish an MMIO PFN out of a VMA and program it back into the IOMMU. This has a long history of enabling P2P DMA inside VMs, but has serious lifetime problems by allowing a UAF of the MMIO after the VFIO driver has been unbound. Introduce DMABUF as a new safe way to export an FD-based handle for the MMIO regions. This can be consumed by existing DMABUF importers like RDMA or DRM without opening a UAF. A following series will add an importer to iommufd to obsolete the type 1 code and allow safe UAF-free MMIO P2P in VM cases. DMABUF has a built-in synchronous invalidation mechanism called move_notify. VFIO keeps track of all drivers importing its MMIO and can invoke a synchronous invalidation callback to tell the importing drivers to DMA unmap and forget about the MMIO pfns. This process is called revoke. This synchronous invalidation fully prevents any lifecycle problems.
VFIO will do this before unbinding its driver ensuring there is no UAF of the MMIO beyond the driver lifecycle. Further, VFIO has additional behavior to block access to the MMIO during things like Function Level Reset. This is because some poor platforms may experience a MCE type crash when touching MMIO of a PCI device that is undergoing a reset. Today this is done by using unmap_mapping_range() on the VMAs. Extend that into the DMABUF world and temporarily revoke the MMIO from the DMABUF importers during FLR as well. This will more robustly prevent an errant P2P from possibly upsetting the platform. A DMABUF FD is a preferred handle for MMIO compared to using something like a pgmap because: - VFIO is supported, including its P2P feature, on archs that don't support pgmap - PCI devices have all sorts of BAR sizes, including ones smaller than a section so a pgmap cannot always be created - It is undesirable to waste a lot of memory for struct pages, especially for a case like a GPU with ~100GB of BAR size - We want a synchronous revoke semantic to support FLR with light hardware requirements Use the P2P subsystem to help generate the DMA mapping. This is a significant upgrade over the abuse of dma_map_resource() that has historically been used by DMABUF exporters. Experience with an OOT version of this patch shows that real systems do need this. This approach deals with all the P2P scenarios: - Non-zero PCI bus_offset - ACS flags routing traffic to the IOMMU - ACS flags that bypass the IOMMU - though vfio noiommu is required to hit this. There will be further work to formalize the revoke semantic in DMABUF. For now this acts like a move_notify dynamic exporter where importer fault handling will get a failure when they attempt to map. This means that only fully restartable fault capable importers can import the VFIO DMABUFs. A future revoke semantic should open this up to more HW as the HW only needs to invalidate, not handle restartable faults. 
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com> Signed-off-by: Vivek Kasireddy <vivek.kasireddy@intel.com> Reviewed-by: Kevin Tian <kevin.tian@intel.com> Signed-off-by: Leon Romanovsky <leonro@nvidia.com> Acked-by: Ankit Agrawal <ankita@nvidia.com> Link: https://lore.kernel.org/r/20251120-dmabuf-vfio-v9-10-d7f71607f371@nvidia.com Signed-off-by: Alex Williamson <alex@shazbot.org>
207 lines
7.0 KiB
C
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
 * Author: Alex Williamson <alex.williamson@redhat.com>
 *
 * Derived from original vfio:
 * Copyright 2010 Cisco Systems, Inc. All rights reserved.
 * Author: Tom Lyon, pugs@cisco.com
 */
#include <linux/mutex.h>
#include <linux/pci.h>
#include <linux/vfio.h>
#include <linux/irqbypass.h>
#include <linux/types.h>
#include <linux/uuid.h>
#include <linux/notifier.h>
/* Multiple-inclusion guard; the matching #endif is at the end of the file. */
#ifndef VFIO_PCI_CORE_H
#define VFIO_PCI_CORE_H
/*
 * VFIO-PCI encodes the region index in the upper bits of the 64-bit file
 * offset used for read/write/mmap: bits [63:40] hold the region index and
 * bits [39:0] hold the byte offset within that region.
 *
 * Fix: the macro argument in VFIO_PCI_OFFSET_TO_INDEX() was not
 * parenthesized, so an argument containing an operator of lower precedence
 * than ">>" (e.g. "a | b") expanded incorrectly.
 */
#define VFIO_PCI_OFFSET_SHIFT	40
#define VFIO_PCI_OFFSET_TO_INDEX(off)	((off) >> VFIO_PCI_OFFSET_SHIFT)
#define VFIO_PCI_INDEX_TO_OFFSET(index)	((u64)(index) << VFIO_PCI_OFFSET_SHIFT)
#define VFIO_PCI_OFFSET_MASK	(((u64)(1) << VFIO_PCI_OFFSET_SHIFT) - 1)
/* Forward declarations: this header only uses pointers to these types. */
struct vfio_pci_core_device;
struct vfio_pci_region;
struct p2pdma_provider;
struct dma_buf_phys_vec;
/**
 * struct vfio_pci_regops - Callbacks backing a device-specific region
 * @rw:		Handle a read (@iswrite == false) or write access of @count
 *		bytes at *@ppos; returns bytes transferred or -errno.
 * @release:	Free per-region resources when the region is torn down.
 * @mmap:	Map the region into the given userspace VMA; optional
 *		behavior depends on the region's flags (assumption —
 *		confirm against vfio_pci_core_mmap() callers).
 * @add_capability:	Append region-describing entries to @caps for
 *		reporting to userspace.
 */
struct vfio_pci_regops {
	ssize_t (*rw)(struct vfio_pci_core_device *vdev, char __user *buf,
		      size_t count, loff_t *ppos, bool iswrite);
	void (*release)(struct vfio_pci_core_device *vdev,
			struct vfio_pci_region *region);
	int (*mmap)(struct vfio_pci_core_device *vdev,
		    struct vfio_pci_region *region,
		    struct vm_area_struct *vma);
	int (*add_capability)(struct vfio_pci_core_device *vdev,
			      struct vfio_pci_region *region,
			      struct vfio_info_cap *caps);
};
/**
 * struct vfio_pci_region - A device-specific region exposed to userspace
 * @type:	Region type (presumably a VFIO_REGION_TYPE_* value — see uAPI)
 * @subtype:	Region subtype, qualified by @type
 * @ops:	Callbacks implementing access to the region
 * @data:	Opaque per-region data passed through to @ops
 * @size:	Region size in bytes
 * @flags:	Region flags (presumably VFIO_REGION_INFO_FLAG_* — see uAPI)
 *
 * Registered via vfio_pci_core_register_dev_region().
 */
struct vfio_pci_region {
	u32 type;
	u32 subtype;
	const struct vfio_pci_regops *ops;
	void *data;
	size_t size;
	u32 flags;
};
/**
 * struct vfio_pci_device_ops - Optional hooks a variant driver may provide
 * @get_dmabuf_phys: Resolve the user-supplied @dma_ranges of region
 *	@region_index into @phys_vec and report the P2PDMA @provider used
 *	for dma-buf export of the device's MMIO.  Variant drivers that do
 *	not override this presumably fall back to
 *	vfio_pci_core_get_dmabuf_phys() — confirm at the call site.
 */
struct vfio_pci_device_ops {
	int (*get_dmabuf_phys)(struct vfio_pci_core_device *vdev,
			       struct p2pdma_provider **provider,
			       unsigned int region_index,
			       struct dma_buf_phys_vec *phys_vec,
			       struct vfio_region_dma_range *dma_ranges,
			       size_t nr_ranges);
};
#if IS_ENABLED(CONFIG_VFIO_PCI_DMABUF)
/*
 * Fill @phys_vec from @dma_ranges for a region whose physical span is
 * [@start, @start + @len); presumably also validates that every range
 * lies within that span — confirm in the implementation.
 */
int vfio_pci_core_fill_phys_vec(struct dma_buf_phys_vec *phys_vec,
				struct vfio_region_dma_range *dma_ranges,
				size_t nr_ranges, phys_addr_t start,
				phys_addr_t len);
/*
 * Default get_dmabuf_phys implementation operating on the device's
 * standard BARs (see struct vfio_pci_device_ops).
 */
int vfio_pci_core_get_dmabuf_phys(struct vfio_pci_core_device *vdev,
				  struct p2pdma_provider **provider,
				  unsigned int region_index,
				  struct dma_buf_phys_vec *phys_vec,
				  struct vfio_region_dma_range *dma_ranges,
				  size_t nr_ranges);
#else
/* !CONFIG_VFIO_PCI_DMABUF: dma-buf export of MMIO is not compiled in. */
static inline int
vfio_pci_core_fill_phys_vec(struct dma_buf_phys_vec *phys_vec,
			    struct vfio_region_dma_range *dma_ranges,
			    size_t nr_ranges, phys_addr_t start,
			    phys_addr_t len)
{
	return -EINVAL;
}
static inline int vfio_pci_core_get_dmabuf_phys(
	struct vfio_pci_core_device *vdev, struct p2pdma_provider **provider,
	unsigned int region_index, struct dma_buf_phys_vec *phys_vec,
	struct vfio_region_dma_range *dma_ranges, size_t nr_ranges)
{
	return -EOPNOTSUPP;
}
#endif
/*
 * Per-device state shared by vfio-pci and its variant drivers.  Variant
 * drivers embed this structure; @vdev is the core VFIO device handed to
 * the vfio_pci_core_* helpers declared below.
 */
struct vfio_pci_core_device {
	struct vfio_device vdev;	/* embedded VFIO core device */
	struct pci_dev *pdev;		/* underlying PCI device */
	const struct vfio_pci_device_ops *pci_ops;	/* optional variant hooks (dma-buf phys) */
	void __iomem *barmap[PCI_STD_NUM_BARS];	/* kernel mappings per BAR; see vfio_pci_core_setup_barmap() */
	bool bar_mmap_supported[PCI_STD_NUM_BARS];	/* which BARs userspace may mmap() */
	u8 *pci_config_map;	/* per-byte config-space map — presumably capability IDs; TODO confirm */
	u8 *vconfig;		/* virtualized config space shown to userspace (assumption) */
	struct perm_bits *msi_perm;	/* access permissions for the MSI capability (assumption) */
	spinlock_t irqlock;	/* guards interrupt state — presumably INTx masking; confirm */
	struct mutex igate;	/* serializes interrupt setup/teardown (assumption) */
	struct xarray ctx;	/* per-vector interrupt contexts (assumption) */
	int irq_type;		/* currently enabled interrupt mode */
	int num_regions;	/* number of entries in @region */
	struct vfio_pci_region *region;	/* device-specific regions; see vfio_pci_core_register_dev_region() */
	u8 msi_qmax;		/* MSI queue size exposed to the guest (assumption) */
	u8 msix_bar;		/* BAR containing the MSI-X table */
	u16 msix_size;		/* MSI-X table size — vector count or bytes; TODO confirm */
	u32 msix_offset;	/* MSI-X table offset within @msix_bar */
	u32 rbar[7];		/* saved BAR values — presumably 6 BARs + expansion ROM; confirm */
	bool has_dyn_msix:1;
	bool pci_2_3:1;		/* device supports PCI 2.3 INTx masking (assumption) */
	bool virq_disabled:1;
	bool reset_works:1;
	bool extended_caps:1;
	bool bardirty:1;
	bool has_vga:1;
	bool needs_reset:1;
	bool nointx:1;
	bool needs_pm_restore:1;
	bool pm_intx_masked:1;
	bool pm_runtime_engaged:1;
	struct pci_saved_state *pci_saved_state;	/* state captured for restore after reset (assumption) */
	struct pci_saved_state *pm_save;	/* state saved across power transitions (assumption) */
	int ioeventfds_nr;	/* number of entries on @ioeventfds_list */
	struct eventfd_ctx *err_trigger;	/* eventfd signaled on device error (assumption) */
	struct eventfd_ctx *req_trigger;	/* eventfd signaled to request device release (assumption) */
	struct eventfd_ctx *pm_wake_eventfd_ctx;
	struct list_head dummy_resources_list;
	struct mutex ioeventfds_lock;	/* protects @ioeventfds_list and @ioeventfds_nr */
	struct list_head ioeventfds_list;
	struct vfio_pci_vf_token *vf_token;	/* SR-IOV VF token state */
	struct list_head sriov_pfs_item;
	struct vfio_pci_core_device *sriov_pf_core_dev;	/* PF's core device when this is a VF */
	struct notifier_block nb;
	struct rw_semaphore memory_lock;	/* gates MMIO access vs. memory enable / reset */
	struct list_head dmabufs;	/* dma-bufs exported from this device's MMIO */
};
/* Will be exported for vfio pci drivers usage */
int vfio_pci_core_register_dev_region(struct vfio_pci_core_device *vdev,
				      unsigned int type, unsigned int subtype,
				      const struct vfio_pci_regops *ops,
				      size_t size, u32 flags, void *data);
/* Module-wide behavior knobs shared by all core users. */
void vfio_pci_core_set_params(bool nointxmask, bool is_disable_vga,
			      bool is_disable_idle_d3);
/* Default vfio_device_ops implementations for variant drivers to reuse. */
void vfio_pci_core_close_device(struct vfio_device *core_vdev);
int vfio_pci_core_init_dev(struct vfio_device *core_vdev);
void vfio_pci_core_release_dev(struct vfio_device *core_vdev);
int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev);
void vfio_pci_core_unregister_device(struct vfio_pci_core_device *vdev);
extern const struct pci_error_handlers vfio_pci_core_err_handlers;
int vfio_pci_core_sriov_configure(struct vfio_pci_core_device *vdev,
				  int nr_virtfn);
long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd,
			 unsigned long arg);
int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags,
				void __user *arg, size_t argsz);
ssize_t vfio_pci_core_read(struct vfio_device *core_vdev, char __user *buf,
			   size_t count, loff_t *ppos);
ssize_t vfio_pci_core_write(struct vfio_device *core_vdev, const char __user *buf,
			    size_t count, loff_t *ppos);
int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma);
void vfio_pci_core_request(struct vfio_device *core_vdev, unsigned int count);
int vfio_pci_core_match(struct vfio_device *core_vdev, char *buf);
int vfio_pci_core_match_token_uuid(struct vfio_device *core_vdev,
				   const uuid_t *uuid);
/* Device enable/disable helpers used around open/close of the device. */
int vfio_pci_core_enable(struct vfio_pci_core_device *vdev);
void vfio_pci_core_disable(struct vfio_pci_core_device *vdev);
void vfio_pci_core_finish_enable(struct vfio_pci_core_device *vdev);
/* Lazily ioremap BAR @bar into vdev->barmap[bar]. */
int vfio_pci_core_setup_barmap(struct vfio_pci_core_device *vdev, int bar);
pci_ers_result_t vfio_pci_core_aer_err_detected(struct pci_dev *pdev,
						pci_channel_state_t state);
/*
 * Copy between @buf and device memory @io; [x_start, x_end) presumably
 * marks an excluded window within the transfer — confirm in vfio_pci_rdwr.c.
 */
ssize_t vfio_pci_core_do_io_rw(struct vfio_pci_core_device *vdev, bool test_mem,
			       void __iomem *io, char __user *buf,
			       loff_t off, size_t count, size_t x_start,
			       size_t x_end, bool iswrite);
/*
 * Intersect the user buffer [buf_start, buf_start + buf_cnt) with the
 * register window [reg_start, reg_start + reg_cnt); returns true when the
 * ranges overlap and reports the overlap via the out parameters.
 */
bool vfio_pci_core_range_intersect_range(loff_t buf_start, size_t buf_cnt,
					 loff_t reg_start, size_t reg_cnt,
					 loff_t *buf_offset,
					 size_t *intersect_count,
					 size_t *register_offset);
/*
 * Declare vfio_pci_core_iowrite{8,16,32,64}() helpers.  @test_mem
 * presumably requests checking that device memory is enabled (under
 * memory_lock) before touching @io — confirm in the implementation.
 * The 64-bit variant exists only on platforms providing iowrite64().
 */
#define VFIO_IOWRITE_DECLARATION(size) \
int vfio_pci_core_iowrite##size(struct vfio_pci_core_device *vdev,	\
			bool test_mem, u##size val, void __iomem *io);

VFIO_IOWRITE_DECLARATION(8)
VFIO_IOWRITE_DECLARATION(16)
VFIO_IOWRITE_DECLARATION(32)
#ifdef iowrite64
VFIO_IOWRITE_DECLARATION(64)
#endif
/*
 * Declare vfio_pci_core_ioread{8,16,32,64}() helpers, the read-side
 * counterparts of the iowrite declarations above; the value is returned
 * through @val.  The 64-bit variant exists only where ioread64() does.
 */
#define VFIO_IOREAD_DECLARATION(size) \
int vfio_pci_core_ioread##size(struct vfio_pci_core_device *vdev,	\
			bool test_mem, u##size *val, void __iomem *io);

VFIO_IOREAD_DECLARATION(8)
VFIO_IOREAD_DECLARATION(16)
VFIO_IOREAD_DECLARATION(32)
#ifdef ioread64
VFIO_IOREAD_DECLARATION(64)
#endif

#endif /* VFIO_PCI_CORE_H */