Commit 3673f5be authored by Linus Torvalds
Browse files

Merge tag 'vfio-v6.14-rc1' of https://github.com/awilliam/linux-vfio

Pull vfio updates from Alex Williamson:

 - Extend vfio-pci 8-byte read/write support to include archs defining
   CONFIG_GENERIC_IOMAP, such as x86, and remove now extraneous #ifdefs
   around 64-bit accessors (Ramesh Thomas)

 - Update vfio-pci shadow ROM handling and allow cached ROM from setup
   data to be exposed as a functional ROM BAR region when available
   (Yunxiang Li)

 - Update nvgrace-gpu vfio-pci variant driver for new Grace Blackwell
   hardware, conditionalizing the uncached BAR workaround for previous
   generation hardware based on the presence of a flag in a new DVSEC
   capability, and include a delay during probe for link training to
   complete, a new requirement for GB devices (Ankit Agrawal)

* tag 'vfio-v6.14-rc1' of https://github.com/awilliam/linux-vfio:
  vfio/nvgrace-gpu: Add GB200 SKU to the devid table
  vfio/nvgrace-gpu: Check the HBM training and C2C link status
  vfio/nvgrace-gpu: Expose the blackwell device PF BAR1 to the VM
  vfio/nvgrace-gpu: Read dvsec register to determine need for uncached resmem
  vfio/platform: check the bounds of read/write syscalls
  vfio/pci: Expose setup ROM at ROM bar when needed
  vfio/pci: Remove shadow ROM specific code paths
  vfio/pci: Remove #ifdef iowrite64 and #ifdef ioread64
  vfio/pci: Enable iowrite64 and ioread64 for vfio pci
parents 2ab002c7 2bb44754
Loading
Loading
Loading
Loading
+147 −22
Original line number Diff line number Diff line
@@ -5,6 +5,8 @@

#include <linux/sizes.h>
#include <linux/vfio_pci_core.h>
#include <linux/delay.h>
#include <linux/jiffies.h>

/*
 * The device memory usable to the workloads running in the VM is cached
@@ -17,12 +19,21 @@
#define RESMEM_REGION_INDEX VFIO_PCI_BAR2_REGION_INDEX
#define USEMEM_REGION_INDEX VFIO_PCI_BAR4_REGION_INDEX

/* Memory size expected as non cached and reserved by the VM driver */
#define RESMEM_SIZE SZ_1G

/* A hardwired and constant ABI value between the GPU FW and VFIO driver. */
#define MEMBLK_SIZE SZ_512M

/*
 * Offset, relative to the start of the DVSEC capability, of the 16-bit
 * word read by nvgrace_gpu_has_mig_hw_bug(), and the bit within that word
 * which indicates MIG works with cached resmem (i.e. no workaround needed).
 */
#define DVSEC_BITMAP_OFFSET 0xA
#define MIG_SUPPORTED_WITH_CACHED_RESMEM BIT(0)

/* DVSEC ID passed to pci_find_dvsec_capability() for the NVIDIA vendor ID. */
#define GPU_CAP_DVSEC_REGISTER 3

/*
 * BAR0 register offsets polled during probe for the C2C link status and
 * the HBM training status, and the value both must read back when ready.
 */
#define C2C_LINK_BAR0_OFFSET 0x1498
#define HBM_TRAINING_BAR0_OFFSET 0x200BC
#define STATUS_READY 0xFF

/* Sleep between readiness polls (1s) and overall polling budget (30s). */
#define POLL_QUANTUM_MS 1000
#define POLL_TIMEOUT_MS (30 * 1000)

/*
 * The state of the two device memory region - resmem and usemem - is
 * saved as struct mem_region.
@@ -46,6 +57,7 @@ struct nvgrace_gpu_pci_core_device {
	struct mem_region resmem;
	/* Lock to control device memory kernel mapping */
	struct mutex remap_lock;
	bool has_mig_hw_bug;
};

static void nvgrace_gpu_init_fake_bar_emu_regs(struct vfio_device *core_vdev)
@@ -66,7 +78,7 @@ nvgrace_gpu_memregion(int index,
	if (index == USEMEM_REGION_INDEX)
		return &nvdev->usemem;

	if (index == RESMEM_REGION_INDEX)
	if (nvdev->resmem.memlength && index == RESMEM_REGION_INDEX)
		return &nvdev->resmem;

	return NULL;
@@ -751,40 +763,67 @@ nvgrace_gpu_init_nvdev_struct(struct pci_dev *pdev,
			      u64 memphys, u64 memlength)
{
	int ret = 0;
	u64 resmem_size = 0;

	/*
	 * The VM GPU device driver needs a non-cacheable region to support
	 * the MIG feature. Since the device memory is mapped as NORMAL cached,
	 * carve out a region from the end with a different NORMAL_NC
	 * property (called as reserved memory and represented as resmem). This
	 * region then is exposed as a 64b BAR (region 2 and 3) to the VM, while
	 * exposing the rest (termed as usable memory and represented using usemem)
	 * as cacheable 64b BAR (region 4 and 5).
	 * On Grace Hopper systems, the VM GPU device driver needs a non-cacheable
	 * region to support the MIG feature owing to a hardware bug. Since the
	 * device memory is mapped as NORMAL cached, carve out a region from the end
	 * with a different NORMAL_NC property (called as reserved memory and
	 * represented as resmem). This region then is exposed as a 64b BAR
	 * (region 2 and 3) to the VM, while exposing the rest (termed as usable
	 * memory and represented using usemem) as cacheable 64b BAR (region 4 and 5).
	 *
	 *               devmem (memlength)
	 * |-------------------------------------------------|
	 * |                                           |
	 * usemem.memphys                              resmem.memphys
	 *
	 * This hardware bug is fixed on the Grace Blackwell platforms and the
	 * presence of the bug can be determined through nvdev->has_mig_hw_bug.
	 * Thus on systems with the hardware fix, there is no need to partition
	 * the GPU device memory and the entire memory is usable and mapped as
	 * NORMAL cached (i.e. resmem size is 0).
	 */
	if (nvdev->has_mig_hw_bug)
		resmem_size = SZ_1G;

	nvdev->usemem.memphys = memphys;

	/*
	 * The device memory exposed to the VM is added to the kernel by the
	 * VM driver module in chunks of memory block size. Only the usable
	 * memory (usemem) is added to the kernel for usage by the VM
	 * workloads. Make the usable memory size memblock aligned.
	 * VM driver module in chunks of memory block size. Note that only the
	 * usable memory (usemem) is added to the kernel for usage by the VM
	 * workloads.
	 */
	if (check_sub_overflow(memlength, RESMEM_SIZE,
	if (check_sub_overflow(memlength, resmem_size,
			       &nvdev->usemem.memlength)) {
		ret = -EOVERFLOW;
		goto done;
	}

	/*
	 * The USEMEM part of the device memory has to be MEMBLK_SIZE
	 * aligned. This is a hardwired ABI value between the GPU FW and
	 * VFIO driver. The VM device driver is also aware of it and make
	 * use of the value for its calculation to determine USEMEM size.
	 * The usemem region is exposed as a 64b BAR composed of region 4 and 5.
	 * Calculate and save the BAR size for the region.
	 */
	nvdev->usemem.bar_size = roundup_pow_of_two(nvdev->usemem.memlength);

	/*
	 * If the hardware has the fix for MIG, there is no requirement
	 * for splitting the device memory to create RESMEM. The entire
	 * device memory is usable and will be USEMEM. Return here for
	 * such case.
	 */
	if (!nvdev->has_mig_hw_bug)
		goto done;

	/*
	 * When the device memory is split to workaround the MIG bug on
	 * Grace Hopper, the USEMEM part of the device memory has to be
	 * MEMBLK_SIZE aligned. This is a hardwired ABI value between the
	 * GPU FW and VFIO driver. The VM device driver is also aware of it
	 * and makes use of the value for its calculation to determine USEMEM
	 * size. Note that the device memory may not be 512M aligned.
	 */
	nvdev->usemem.memlength = round_down(nvdev->usemem.memlength,
					     MEMBLK_SIZE);
@@ -803,15 +842,93 @@ nvgrace_gpu_init_nvdev_struct(struct pci_dev *pdev,
	}

	/*
	 * The memory regions are exposed as BARs. Calculate and save
	 * the BAR size for them.
	 * The resmem region is exposed as a 64b BAR composed of region 2 and 3
	 * for Grace Hopper. Calculate and save the BAR size for the region.
	 */
	nvdev->usemem.bar_size = roundup_pow_of_two(nvdev->usemem.memlength);
	nvdev->resmem.bar_size = roundup_pow_of_two(nvdev->resmem.memlength);
done:
	return ret;
}

/*
 * Report whether the device needs the uncached resmem workaround for MIG.
 *
 * Looks up the NVIDIA DVSEC capability with ID GPU_CAP_DVSEC_REGISTER and
 * reads the 16-bit word at DVSEC_BITMAP_OFFSET inside it. The workaround is
 * only unnecessary when MIG_SUPPORTED_WITH_CACHED_RESMEM is set in that word.
 *
 * Return: false if the capability advertises the hardware fix; true
 * otherwise, including when the capability is absent or cannot be read.
 */
static bool nvgrace_gpu_has_mig_hw_bug(struct pci_dev *pdev)
{
	u16 dvsec_ctrl16 = 0;
	int pcie_dvsec;

	pcie_dvsec = pci_find_dvsec_capability(pdev, PCI_VENDOR_ID_NVIDIA,
					       GPU_CAP_DVSEC_REGISTER);
	if (!pcie_dvsec)
		return true;

	/*
	 * Do not trust the value if the config read failed: an error
	 * response may come back as all-ones, which would look like the
	 * "fixed" bit being set. Fall back to assuming the bug is present
	 * so that the uncached resmem carve-out is kept.
	 */
	if (pci_read_config_word(pdev, pcie_dvsec + DVSEC_BITMAP_OFFSET,
				 &dvsec_ctrl16))
		return true;

	return !(dvsec_ctrl16 & MIG_SUPPORTED_WITH_CACHED_RESMEM);
}

/*
 * To reduce the system bootup time, the HBM training has
 * been moved out of the UEFI on the Grace-Blackwell systems.
 *
 * The onus of checking whether the HBM training has completed
 * thus falls on the module. The HBM training status can be
 * determined from a BAR0 register.
 *
 * Similarly, another BAR0 register exposes the status of the
 * CPU-GPU chip-to-chip (C2C) cache coherent interconnect.
 *
 * Poll these registers every POLL_QUANTUM_MS for up to
 * POLL_TIMEOUT_MS (30s). If the HBM training is not complete or
 * if the C2C link is not ready by then, fail the probe.
 *
 * While the wait is not required on Grace Hopper systems, it
 * is beneficial to make the check to ensure the device is in an
 * expected state.
 *
 * The BAR0 region is enabled, requested and mapped here before the
 * registers are accessed, and everything is released again before
 * returning.
 *
 * Return: 0 once both registers read STATUS_READY, -ETIME if they do
 * not within the timeout, -ENOMEM if BAR0 cannot be mapped, or the error
 * from pci_enable_device() / pci_request_selected_regions().
 */
static int nvgrace_gpu_wait_device_ready(struct pci_dev *pdev)
{
	unsigned long timeout = jiffies + msecs_to_jiffies(POLL_TIMEOUT_MS);
	void __iomem *io;
	int ret;

	ret = pci_enable_device(pdev);
	if (ret)
		return ret;

	ret = pci_request_selected_regions(pdev, 1 << 0, KBUILD_MODNAME);
	if (ret)
		goto request_region_exit;

	io = pci_iomap(pdev, 0, 0);
	if (!io) {
		ret = -ENOMEM;
		goto iomap_exit;
	}

	/*
	 * The setup calls above left ret at 0. Assume a timeout until both
	 * status registers report ready, otherwise an expired poll would be
	 * returned as success.
	 */
	ret = -ETIME;
	do {
		if ((ioread32(io + C2C_LINK_BAR0_OFFSET) == STATUS_READY) &&
		    (ioread32(io + HBM_TRAINING_BAR0_OFFSET) == STATUS_READY)) {
			ret = 0;
			break;
		}
		msleep(POLL_QUANTUM_MS);
	} while (!time_after(jiffies, timeout));

	pci_iounmap(pdev, io);
iomap_exit:
	pci_release_selected_regions(pdev, 1 << 0);
request_region_exit:
	pci_disable_device(pdev);
	return ret;
}

static int nvgrace_gpu_probe(struct pci_dev *pdev,
			     const struct pci_device_id *id)
{
@@ -820,6 +937,10 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev,
	u64 memphys, memlength;
	int ret;

	ret = nvgrace_gpu_wait_device_ready(pdev);
	if (ret)
		return ret;

	ret = nvgrace_gpu_fetch_memory_property(pdev, &memphys, &memlength);
	if (!ret)
		ops = &nvgrace_gpu_pci_ops;
@@ -832,6 +953,8 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev,
	dev_set_drvdata(&pdev->dev, &nvdev->core_device);

	if (ops == &nvgrace_gpu_pci_ops) {
		nvdev->has_mig_hw_bug = nvgrace_gpu_has_mig_hw_bug(pdev);

		/*
		 * Device memory properties are identified in the host ACPI
		 * table. Set the nvgrace_gpu_pci_core_device structure.
@@ -868,6 +991,8 @@ static const struct pci_device_id nvgrace_gpu_vfio_pci_table[] = {
	{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2345) },
	/* GH200 SKU */
	{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2348) },
	/* GB200 SKU */
	{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2941) },
	{}
};

+4 −4
Original line number Diff line number Diff line
@@ -511,13 +511,13 @@ static void vfio_bar_fixup(struct vfio_pci_core_device *vdev)
		mask = ~(pci_resource_len(pdev, PCI_ROM_RESOURCE) - 1);
		mask |= PCI_ROM_ADDRESS_ENABLE;
		*vbar &= cpu_to_le32((u32)mask);
	} else if (pdev->resource[PCI_ROM_RESOURCE].flags &
					IORESOURCE_ROM_SHADOW) {
		mask = ~(0x20000 - 1);
	} else if (pdev->rom && pdev->romlen) {
		mask = ~(roundup_pow_of_two(pdev->romlen) - 1);
		mask |= PCI_ROM_ADDRESS_ENABLE;
		*vbar &= cpu_to_le32((u32)mask);
	} else
	} else {
		*vbar = 0;
	}

	vdev->bardirty = false;
}
+18 −22
Original line number Diff line number Diff line
@@ -1054,31 +1054,27 @@ static int vfio_pci_ioctl_get_region_info(struct vfio_pci_core_device *vdev,

		info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
		info.flags = 0;
		info.size = 0;

		/* Report the BAR size, not the ROM size */
		info.size = pci_resource_len(pdev, info.index);
		if (!info.size) {
			/* Shadow ROMs appear as PCI option ROMs */
			if (pdev->resource[PCI_ROM_RESOURCE].flags &
			    IORESOURCE_ROM_SHADOW)
				info.size = 0x20000;
			else
				break;
		}

		if (pci_resource_start(pdev, PCI_ROM_RESOURCE)) {
			/*
		 * Is it really there?  Enable memory decode for implicit access
		 * in pci_map_rom().
			 * Check ROM content is valid. Need to enable memory
			 * decode for ROM access in pci_map_rom().
			 */
			cmd = vfio_pci_memory_lock_and_enable(vdev);
			io = pci_map_rom(pdev, &size);
			if (io) {
				info.flags = VFIO_REGION_INFO_FLAG_READ;
				/* Report the BAR size, not the ROM size. */
				info.size = pci_resource_len(pdev, PCI_ROM_RESOURCE);
				pci_unmap_rom(pdev, io);
		} else {
			info.size = 0;
			}
			vfio_pci_memory_unlock_and_restore(vdev, cmd);
		} else if (pdev->rom && pdev->romlen) {
			info.flags = VFIO_REGION_INFO_FLAG_READ;
			/* Report BAR size as power of two. */
			info.size = roundup_pow_of_two(pdev->romlen);
		}

		break;
	}
+17 −21
Original line number Diff line number Diff line
@@ -16,6 +16,7 @@
#include <linux/io.h>
#include <linux/vfio.h>
#include <linux/vgaarb.h>
#include <linux/io-64-nonatomic-lo-hi.h>

#include "vfio_pci_priv.h"

@@ -61,9 +62,7 @@ EXPORT_SYMBOL_GPL(vfio_pci_core_iowrite##size);
VFIO_IOWRITE(8)
VFIO_IOWRITE(16)
VFIO_IOWRITE(32)
#ifdef iowrite64
VFIO_IOWRITE(64)
#endif

#define VFIO_IOREAD(size) \
int vfio_pci_core_ioread##size(struct vfio_pci_core_device *vdev,	\
@@ -89,9 +88,7 @@ EXPORT_SYMBOL_GPL(vfio_pci_core_ioread##size);
VFIO_IOREAD(8)
VFIO_IOREAD(16)
VFIO_IOREAD(32)
#ifdef ioread64
VFIO_IOREAD(64)
#endif

#define VFIO_IORDWR(size)						\
static int vfio_pci_iordwr##size(struct vfio_pci_core_device *vdev,\
@@ -127,9 +124,7 @@ static int vfio_pci_iordwr##size(struct vfio_pci_core_device *vdev,\
VFIO_IORDWR(8)
VFIO_IORDWR(16)
VFIO_IORDWR(32)
#if defined(ioread64) && defined(iowrite64)
VFIO_IORDWR(64)
#endif

/*
 * Read or write from an __iomem region (MMIO or I/O port) with an excluded
@@ -155,7 +150,6 @@ ssize_t vfio_pci_core_do_io_rw(struct vfio_pci_core_device *vdev, bool test_mem,
		else
			fillable = 0;

#if defined(ioread64) && defined(iowrite64)
		if (fillable >= 8 && !(off % 8)) {
			ret = vfio_pci_iordwr64(vdev, iswrite, test_mem,
						io, buf, off, &filled);
@@ -163,7 +157,6 @@ ssize_t vfio_pci_core_do_io_rw(struct vfio_pci_core_device *vdev, bool test_mem,
				return ret;

		} else
#endif
		if (fillable >= 4 && !(off % 4)) {
			ret = vfio_pci_iordwr32(vdev, iswrite, test_mem,
						io, buf, off, &filled);
@@ -244,9 +237,8 @@ ssize_t vfio_pci_bar_rw(struct vfio_pci_core_device *vdev, char __user *buf,

	if (pci_resource_start(pdev, bar))
		end = pci_resource_len(pdev, bar);
	else if (bar == PCI_ROM_RESOURCE &&
		 pdev->resource[bar].flags & IORESOURCE_ROM_SHADOW)
		end = 0x20000;
	else if (bar == PCI_ROM_RESOURCE && pdev->rom && pdev->romlen)
		end = roundup_pow_of_two(pdev->romlen);
	else
		return -EINVAL;

@@ -261,11 +253,14 @@ ssize_t vfio_pci_bar_rw(struct vfio_pci_core_device *vdev, char __user *buf,
		 * excluded range at the end of the actual ROM.  This makes
		 * filling large ROM BARs much faster.
		 */
		if (pci_resource_start(pdev, bar)) {
			io = pci_map_rom(pdev, &x_start);
		if (!io) {
			done = -ENOMEM;
			goto out;
		} else {
			io = ioremap(pdev->rom, pdev->romlen);
			x_start = pdev->romlen;
		}
		if (!io)
			return -ENOMEM;
		x_end = end;
	} else {
		int ret = vfio_pci_core_setup_barmap(vdev, bar);
@@ -288,8 +283,13 @@ ssize_t vfio_pci_bar_rw(struct vfio_pci_core_device *vdev, char __user *buf,
	if (done >= 0)
		*ppos += done;

	if (bar == PCI_ROM_RESOURCE)
	if (bar == PCI_ROM_RESOURCE) {
		if (pci_resource_start(pdev, bar))
			pci_unmap_rom(pdev, io);
		else
			iounmap(io);
	}

out:
	return done;
}
@@ -381,12 +381,10 @@ static void vfio_pci_ioeventfd_do_write(struct vfio_pci_ioeventfd *ioeventfd,
		vfio_pci_core_iowrite32(ioeventfd->vdev, test_mem,
					ioeventfd->data, ioeventfd->addr);
		break;
#ifdef iowrite64
	case 8:
		vfio_pci_core_iowrite64(ioeventfd->vdev, test_mem,
					ioeventfd->data, ioeventfd->addr);
		break;
#endif
	}
}

@@ -440,10 +438,8 @@ int vfio_pci_ioeventfd(struct vfio_pci_core_device *vdev, loff_t offset,
	      pos >= vdev->msix_offset + vdev->msix_size))
		return -EINVAL;

#ifndef iowrite64
	if (count == 8)
		return -EINVAL;
#endif

	ret = vfio_pci_core_setup_barmap(vdev, bar);
	if (ret)
+10 −0
Original line number Diff line number Diff line
@@ -388,6 +388,11 @@ static ssize_t vfio_platform_read_mmio(struct vfio_platform_region *reg,
{
	unsigned int done = 0;

	if (off >= reg->size)
		return -EINVAL;

	count = min_t(size_t, count, reg->size - off);

	if (!reg->ioaddr) {
		reg->ioaddr =
			ioremap(reg->addr, reg->size);
@@ -467,6 +472,11 @@ static ssize_t vfio_platform_write_mmio(struct vfio_platform_region *reg,
{
	unsigned int done = 0;

	if (off >= reg->size)
		return -EINVAL;

	count = min_t(size_t, count, reg->size - off);

	if (!reg->ioaddr) {
		reg->ioaddr =
			ioremap(reg->addr, reg->size);