accel/habanalabs: add NVMe Direct I/O (HLDIO) infrastructure (8cbacc9a) · Commits · git / linux-net

drivers/accel/habanalabs/Kconfig

+23 −0

Original line number	Diff line number	Diff line
		@@ -27,3 +27,26 @@ config DRM_ACCEL_HABANALABS

		To compile this driver as a module, choose M here: the
		module will be called habanalabs.

		if DRM_ACCEL_HABANALABS

		config HL_HLDIO
		bool "Habanalabs NVMe Direct I/O (HLDIO)"
		depends on PCI_P2PDMA
		depends on BLOCK
		help
		Enable NVMe peer-to-peer direct I/O support for Habanalabs AI
		accelerators.

		This allows direct data transfers between NVMe storage devices
		and Habanalabs accelerators without involving system memory,
		using PCI peer-to-peer DMA capabilities.

		Requirements:
		- CONFIG_PCI_P2PDMA=y
		- NVMe device and Habanalabs accelerator under same PCI root complex
		- IOMMU disabled or in passthrough mode
		- Hardware supporting PCI P2P DMA

		If unsure, say N.
		endif # DRM_ACCEL_HABANALABS

drivers/accel/habanalabs/common/Makefile

+5 −0

Original line number	Diff line number	Diff line
		@@ -13,3 +13,8 @@ HL_COMMON_FILES := common/habanalabs_drv.o common/device.o common/context.o \
		common/command_submission.o common/firmware_if.o \
		common/security.o common/state_dump.o \
		common/memory_mgr.o common/decoder.o

		# Conditionally add HLDIO support
		ifdef CONFIG_HL_HLDIO
		HL_COMMON_FILES += common/hldio.o
		endif
		No newline at end of file

drivers/accel/habanalabs/common/debugfs.c

+3 −0

Original line number	Diff line number	Diff line
		@@ -1891,6 +1891,7 @@ void hl_debugfs_device_fini(struct hl_device *hdev)
		vfree(entry->state_dump[i]);

		kfree(entry->entry_arr);

		}

		void hl_debugfs_add_device(struct hl_device *hdev)
		@@ -1903,6 +1904,7 @@ void hl_debugfs_add_device(struct hl_device *hdev)

		if (!hdev->asic_prop.fw_security_enabled)
		add_secured_nodes(dev_entry, dev_entry->root);

		}

		void hl_debugfs_add_file(struct hl_fpriv *hpriv)
		@@ -2035,3 +2037,4 @@ void hl_debugfs_set_state_dump(struct hl_device hdev, char data,

		up_write(&dev_entry->state_dump_sem);
		}

drivers/accel/habanalabs/common/habanalabs.h

+10 −2

Original line number	Diff line number	Diff line
		@@ -704,6 +704,7 @@ struct hl_hints_range {
		* @supports_advanced_cpucp_rc: true if new cpucp opcodes are supported.
		* @supports_engine_modes: true if changing engines/engine_cores modes is supported.
		* @support_dynamic_resereved_fw_size: true if we support dynamic reserved size for fw.
		* @supports_nvme: indicates whether the asic supports NVMe P2P DMA.
		*/
		struct asic_fixed_properties {
		struct hw_queue_properties *hw_queues_props;
		@@ -824,6 +825,7 @@ struct asic_fixed_properties {
		u8 supports_advanced_cpucp_rc;
		u8 supports_engine_modes;
		u8 support_dynamic_resereved_fw_size;
		u8 supports_nvme;
		};

		/**
		@@ -2276,6 +2278,9 @@ struct hl_vm {
		u8 init_done;
		};

		#ifdef CONFIG_HL_HLDIO
		#include "hldio.h"
		#endif

		/*
		* DEBUG, PROFILING STRUCTURE
		@@ -2346,7 +2351,6 @@ struct hl_fpriv {
		struct mutex ctx_lock;
		};


		/*
		* DebugFS
		*/
		@@ -2374,6 +2378,7 @@ struct hl_debugfs_entry {
		struct hl_dbg_device_entry *dev_entry;
		};


		/**
		* struct hl_dbg_device_entry - ASIC specific debugfs manager.
		* @root: root dentry.
		@@ -3334,6 +3339,7 @@ struct eq_heartbeat_debug_info {
		* @captured_err_info: holds information about errors.
		* @reset_info: holds current device reset information.
		* @heartbeat_debug_info: counters used to debug heartbeat failures.
		* @hldio: describes habanalabs direct storage interaction interface.
		* @irq_affinity_mask: mask of available CPU cores for user and decoder interrupt handling.
		* @stream_master_qid_arr: pointer to array with QIDs of master streams.
		* @fw_inner_major_ver: the major of current loaded preboot inner version.
		@@ -3527,7 +3533,9 @@ struct hl_device {
		struct hl_reset_info reset_info;

		struct eq_heartbeat_debug_info heartbeat_debug_info;

		#ifdef CONFIG_HL_HLDIO
		struct hl_dio hldio;
		#endif
		cpumask_t irq_affinity_mask;

		u32 *stream_master_qid_arr;

drivers/accel/habanalabs/common/hldio.c

0 → 100644

+437 −0

Original line number	Diff line number	Diff line
		// SPDX-License-Identifier: GPL-2.0

		/*
		* Copyright 2024 HabanaLabs, Ltd.
		* All Rights Reserved.
		*/

		#include "habanalabs.h"
		#include "hldio.h"
		#include <generated/uapi/linux/version.h>
		#include <linux/pci-p2pdma.h>
		#include <linux/blkdev.h>
		#include <linux/vmalloc.h>

		/*
		* NVMe Direct I/O implementation for habanalabs driver
		*
		* ASSUMPTIONS
		* ===========
		* 1. No IOMMU (well, technically it can work with IOMMU, but it is almost useless).
		* 2. Only READ operations (can extend in the future).
		* 3. No sparse files (can overcome this in the future).
		* 4. Kernel version >= 6.9
		* 5. Requiring page alignment is OK (I don't see a solution to this one right
		* now; how do we read partial pages?)
		* 6. Kernel compiled with CONFIG_PCI_P2PDMA. This requires a CUSTOM kernel.
		* Theoretically I have a slight idea on how this could be solvable, but it
		* is probably unacceptable for the upstream. Also may not work in the end.
		* 7. Either make sure our cards and disks are under the same PCI bridge, or
		* compile a custom kernel to hack around this.
		*/

		#define IO_STABILIZE_TIMEOUT 10000000 /* 10 seconds in microseconds */

		/*
		* struct hl_dio_fd - what the driver keeps about the file handle provided by
		* the user for a single IO.
		*
		* Filled in by hl_dio_fd_register(), which resolves the fd with fget() and
		* validates the file; the file reference is dropped by hl_dio_fd_unregister().
		*
		* @TODO: right now it is retrieved on each IO, but can be done once with some
		* dedicated IOCTL, call it for example HL_REGISTER_HANDLE.
		*/
		struct hl_dio_fd {
		/*
		* Context the IO was issued on. Used to take/release the IO path and,
		* in the completion callback, to reach the owning device.
		*/
		struct hl_ctx *ctx;
		/* File reference obtained with fget(); released with fput() */
		struct file *filp;
		};

		/*
		* struct hl_direct_io - descriptor of a single direct IO.
		*
		* Allocated and populated by hl_dio_ssd2hl() and executed by hl_direct_io().
		*/
		struct hl_direct_io {
		/* File the IO operates on, plus the issuing context */
		struct hl_dio_fd f;
		/* Kernel IO control block; initialized as a sync kiocb, ki_pos = off_bytes */
		struct kiocb kio;
		/* Array with one bio_vec per page of the transfer; vzalloc'ed per IO */
		struct bio_vec *bv;
		/* Iterator over @bv handed to the file's read_iter */
		struct iov_iter iter;
		/* Device virtual address of the transfer target; must be page aligned */
		u64 device_va;
		/* Offset in the file, in bytes; must be page aligned */
		u64 off_bytes;
		/* Transfer length, in bytes; must be page aligned */
		u64 len_bytes;
		/* Direction passed to iov_iter_bvec(); set to READ by hl_dio_ssd2hl() */
		u32 type;
		};

		bool hl_device_supports_nvme(struct hl_device *hdev)
		{
		return hdev->asic_prop.supports_nvme;
		}

		static int hl_dio_fd_register(struct hl_ctx ctx, int fd, struct hl_dio_fd f)
		{
		struct hl_device *hdev = ctx->hdev;
		struct block_device *bd;
		struct super_block *sb;
		struct inode *inode;
		struct gendisk *gd;
		struct device *disk_dev;
		int rc;

		f->filp = fget(fd);
		if (!f->filp) {
		rc = -ENOENT;
		goto out;
		}

		if (!(f->filp->f_flags & O_DIRECT)) {
		dev_err(hdev->dev, "file is not in the direct mode\n");
		rc = -EINVAL;
		goto fput;
		}

		if (!f->filp->f_op->read_iter) {
		dev_err(hdev->dev, "read iter is not supported, need to fall back to legacy\n");
		rc = -EINVAL;
		goto fput;
		}

		inode = file_inode(f->filp);
		sb = inode->i_sb;
		bd = sb->s_bdev;
		gd = bd->bd_disk;

		if (inode->i_blocks << sb->s_blocksize_bits < i_size_read(inode)) {
		dev_err(hdev->dev, "sparse files are not currently supported\n");
		rc = -EINVAL;
		goto fput;
		}

		if (!bd \|\| !gd) {
		dev_err(hdev->dev, "invalid block device\n");
		rc = -ENODEV;
		goto fput;
		}
		/* Get the underlying device from the block device */
		disk_dev = disk_to_dev(gd);
		if (!dma_pci_p2pdma_supported(disk_dev)) {
		dev_err(hdev->dev, "device does not support PCI P2P DMA\n");
		rc = -EOPNOTSUPP;
		goto fput;
		}

		/*
		* @TODO: Maybe we need additional checks here
		*/

		f->ctx = ctx;
		rc = 0;

		goto out;
		fput:
		fput(f->filp);
		out:
		return rc;
		}

		/*
		 * hl_dio_fd_unregister() - release the file reference held by @f.
		 * @f: descriptor previously filled in by hl_dio_fd_register().
		 */
		static void hl_dio_fd_unregister(struct hl_dio_fd *f)
		{
			struct file *filp = f->filp;

			fput(filp);
		}

		static long hl_dio_count_io(struct hl_device *hdev)
		{
		s64 sum = 0;
		int i;

		for_each_possible_cpu(i)
		sum += per_cpu(*hdev->hldio.inflight_ios, i);

		return sum;
		}

		static bool hl_dio_get_iopath(struct hl_ctx *ctx)
		{
		struct hl_device *hdev = ctx->hdev;

		if (hdev->hldio.io_enabled) {
		this_cpu_inc(*hdev->hldio.inflight_ios);

		/* Avoid race conditions */
		if (!hdev->hldio.io_enabled) {
		this_cpu_dec(*hdev->hldio.inflight_ios);
		return false;
		}

		hl_ctx_get(ctx);

		return true;
		}

		return false;
		}

		/*
		 * hl_dio_put_iopath() - leave the IO path entered with hl_dio_get_iopath().
		 * @ctx: context that issued the IO.
		 *
		 * Drops the context reference, then decrements the current CPU's in-flight
		 * counter.
		 */
		static void hl_dio_put_iopath(struct hl_ctx *ctx)
		{
			/* Fetch the device first: @ctx is not touched after hl_ctx_put() */
			struct hl_device *owner = ctx->hdev;

			hl_ctx_put(ctx);
			this_cpu_dec(*owner->hldio.inflight_ios);
		}

		/*
		 * hl_dio_set_io_enabled() - open or close the gate for new direct IOs.
		 * @hdev: habanalabs device.
		 * @enabled: new value of the io_enabled flag checked by hl_dio_get_iopath().
		 */
		static void hl_dio_set_io_enabled(struct hl_device *hdev, bool enabled)
		{
			struct hl_dio *hldio = &hdev->hldio;

			hldio->io_enabled = enabled;
		}

		static bool hl_dio_validate_io(struct hl_device hdev, struct hl_direct_io io)
		{
		if ((u64)io->device_va & ~PAGE_MASK) {
		dev_dbg(hdev->dev, "device address must be 4K aligned\n");
		return false;
		}

		if (io->len_bytes & ~PAGE_MASK) {
		dev_dbg(hdev->dev, "IO length must be 4K aligned\n");
		return false;
		}

		if (io->off_bytes & ~PAGE_MASK) {
		dev_dbg(hdev->dev, "IO offset must be 4K aligned\n");
		return false;
		}

		return true;
		}

		/*
		 * hl_dio_va2page() - find the P2P page backing a device virtual address.
		 * @hdev: habanalabs device.
		 * @ctx: context whose MMU mappings are used for the translation.
		 * @device_va: device virtual address to look up.
		 *
		 * Translates @device_va to a device physical address with hl_mmu_va_to_pa()
		 * and searches the registered P2P regions for the one containing it.
		 *
		 * Return: the matching entry of the region's p2ppages array, or NULL if the
		 * translation fails or no region covers the address.
		 */
		static struct page *hl_dio_va2page(struct hl_device *hdev, struct hl_ctx *ctx, u64 device_va)
		{
			struct hl_dio *hldio = &hdev->hldio;
			u64 device_pa;
			int rc, i;

			rc = hl_mmu_va_to_pa(ctx, device_va, &device_pa);
			if (rc) {
				dev_err(hdev->dev, "device virtual address translation error: %#llx (%d)",
					device_va, rc);
				return NULL;
			}

			for (i = 0 ; i < hldio->np2prs ; ++i) {
				struct hl_p2p_region *p2pr = &hldio->p2prs[i];

				if (device_pa < p2pr->device_pa ||
				    device_pa >= p2pr->device_pa + p2pr->size)
					continue;

				/* Index of the page inside the region */
				return p2pr->p2ppages[(device_pa - p2pr->device_pa) >> PAGE_SHIFT];
			}

			return NULL;
		}

		static ssize_t hl_direct_io(struct hl_device hdev, struct hl_direct_io io)
		{
		u64 npages, device_va;
		ssize_t rc;
		int i;

		if (!hl_dio_validate_io(hdev, io))
		return -EINVAL;

		if (!hl_dio_get_iopath(io->f.ctx)) {
		dev_info(hdev->dev, "can't schedule a new IO, IO is disabled\n");
		return -ESHUTDOWN;
		}

		init_sync_kiocb(&io->kio, io->f.filp);
		io->kio.ki_pos = io->off_bytes;

		npages = (io->len_bytes >> PAGE_SHIFT);

		/* @TODO: this can be implemented smarter, vmalloc in iopath is not
		* ideal. Maybe some variation of genpool. Number of pages may differ
		* greatly, so maybe even use pools of different sizes and chose the
		* closest one.
		*/
		io->bv = vzalloc(npages * sizeof(struct bio_vec));
		if (!io->bv)
		return -ENOMEM;

		for (i = 0, device_va = io->device_va; i < npages ; ++i, device_va += PAGE_SIZE) {
		io->bv[i].bv_page = hl_dio_va2page(hdev, io->f.ctx, device_va);
		if (!io->bv[i].bv_page) {
		dev_err(hdev->dev, "error getting page struct for device va %#llx",
		device_va);
		rc = -EFAULT;
		goto cleanup;
		}
		io->bv[i].bv_offset = 0;
		io->bv[i].bv_len = PAGE_SIZE;
		}

		iov_iter_bvec(&io->iter, io->type, io->bv, 1, io->len_bytes);
		if (io->f.filp->f_op && io->f.filp->f_op->read_iter)
		rc = io->f.filp->f_op->read_iter(&io->kio, &io->iter);
		else
		rc = -EINVAL;

		cleanup:
		vfree(io->bv);
		hl_dio_put_iopath(io->f.ctx);

		dev_dbg(hdev->dev, "IO ended with %ld\n", rc);

		return rc;
		}

		/*
		* @TODO: This function can be used as a callback for io completion under
		* kio->ki_complete in order to implement async IO.
		* Note that on more recent kernels there is no ret2.
		*/
		__maybe_unused static void hl_direct_io_complete(struct kiocb *kio, long ret, long ret2)
		{
		struct hl_direct_io *io = container_of(kio, struct hl_direct_io, kio);

		dev_dbg(io->f.ctx->hdev->dev, "IO completed with %ld\n", ret);

		/* Do something to copy result to user / notify completion */

		hl_dio_put_iopath(io->f.ctx);

		hl_dio_fd_unregister(&io->f);
		}

		/*
		 * hl_dio_ssd2hl() - DMA from disk to ASIC and wait for the result.
		 * @hdev: habanalabs device.
		 * @ctx: context issuing the IO.
		 * @fd: user file descriptor to read from.
		 * @device_va: destination device virtual address.
		 * @off_bytes: offset in the file, in bytes.
		 * @len_bytes: number of bytes to read.
		 * @len_read: on success, set to the number of bytes actually read; left
		 *            untouched on failure.
		 *
		 * Must be invoked from the user context.
		 *
		 * Return: 0 on success, negative errno otherwise.
		 */
		int hl_dio_ssd2hl(struct hl_device *hdev, struct hl_ctx *ctx, int fd,
				  u64 device_va, off_t off_bytes, size_t len_bytes,
				  size_t *len_read)
		{
			struct hl_direct_io *io;
			ssize_t rc;

			dev_dbg(hdev->dev, "SSD2HL fd=%d va=%#llx len=%#zx\n", fd, device_va, len_bytes);

			/*
			 * The offset is stored as u64 in the descriptor; a negative value
			 * would silently turn into a huge file position.
			 */
			if (off_bytes < 0) {
				rc = -EINVAL;
				goto out;
			}

			/* kzalloc() leaves every field not assigned below zeroed */
			io = kzalloc(sizeof(*io), GFP_KERNEL);
			if (!io) {
				rc = -ENOMEM;
				goto out;
			}

			io->device_va = device_va;
			io->len_bytes = len_bytes;
			io->off_bytes = off_bytes;
			io->type = READ;

			rc = hl_dio_fd_register(ctx, fd, &io->f);
			if (rc)
				goto kfree_io;

			rc = hl_direct_io(hdev, io);
			if (rc >= 0) {
				*len_read = rc;
				rc = 0;
			}

			/* This shall be called only in the case of a sync IO */
			hl_dio_fd_unregister(&io->f);
		kfree_io:
			kfree(io);
		out:
			return rc;
		}

		static void hl_p2p_region_fini(struct hl_device hdev, struct hl_p2p_region p2pr)
		{
		if (p2pr->p2ppages) {
		vfree(p2pr->p2ppages);
		p2pr->p2ppages = NULL;
		}

		if (p2pr->p2pmem) {
		dev_dbg(hdev->dev, "freeing P2P mem from %p, size=%#llx\n",
		p2pr->p2pmem, p2pr->size);
		pci_free_p2pmem(hdev->pdev, p2pr->p2pmem, p2pr->size);
		p2pr->p2pmem = NULL;
		}
		}

		void hl_p2p_region_fini_all(struct hl_device *hdev)
		{
		int i;

		for (i = 0 ; i < hdev->hldio.np2prs ; ++i)
		hl_p2p_region_fini(hdev, &hdev->hldio.p2prs[i]);

		kvfree(hdev->hldio.p2prs);
		hdev->hldio.p2prs = NULL;
		hdev->hldio.np2prs = 0;
		}

		int hl_p2p_region_init(struct hl_device hdev, struct hl_p2p_region p2pr)
		{
		void *addr;
		int rc, i;

		/* Start by publishing our p2p memory */
		rc = pci_p2pdma_add_resource(hdev->pdev, p2pr->bar, p2pr->size, p2pr->bar_offset);
		if (rc) {
		dev_err(hdev->dev, "error adding p2p resource: %d\n", rc);
		goto err;
		}

		/* Alloc all p2p mem */
		p2pr->p2pmem = pci_alloc_p2pmem(hdev->pdev, p2pr->size);
		if (!p2pr->p2pmem) {
		dev_err(hdev->dev, "error allocating p2p memory\n");
		rc = -ENOMEM;
		goto err;
		}

		p2pr->p2ppages = vmalloc((p2pr->size >> PAGE_SHIFT) * sizeof(struct page *));
		if (!p2pr->p2ppages) {
		rc = -ENOMEM;
		goto err;
		}

		for (i = 0, addr = p2pr->p2pmem ; i < (p2pr->size >> PAGE_SHIFT) ; ++i, addr += PAGE_SIZE) {
		p2pr->p2ppages[i] = virt_to_page(addr);
		if (!p2pr->p2ppages[i]) {
		rc = -EFAULT;
		goto err;
		}
		}

		return 0;
		err:
		hl_p2p_region_fini(hdev, p2pr);
		return rc;
		}

		int hl_dio_start(struct hl_device *hdev)
		{
		dev_dbg(hdev->dev, "initializing HLDIO\n");

		/* Initialize the IO counter and enable IO */
		hdev->hldio.inflight_ios = alloc_percpu(s64);
		if (!hdev->hldio.inflight_ios)
		return -ENOMEM;

		hl_dio_set_io_enabled(hdev, true);

		return 0;
		}

		/*
		 * hl_dio_stop() - shut down the direct IO machinery.
		 * @hdev: habanalabs device.
		 *
		 * Disables new IOs, waits up to IO_STABILIZE_TIMEOUT microseconds for the
		 * in-flight count to reach zero, then frees the per-CPU counter.
		 */
		void hl_dio_stop(struct hl_device *hdev)
		{
			int rc;

			dev_dbg(hdev->dev, "deinitializing HLDIO\n");

			if (hdev->hldio.io_enabled) {
				/* Wait for all the IO to finish */
				hl_dio_set_io_enabled(hdev, false);
				/*
				 * NOTE(review): assumes hl_poll_timeout_condition() evaluates
				 * to 0 when the condition was met and to an error on timeout -
				 * confirm against its definition.
				 */
				rc = hl_poll_timeout_condition(hdev, !hl_dio_count_io(hdev), 1000,
							       IO_STABILIZE_TIMEOUT);
				if (rc)
					dev_err(hdev->dev,
						"timed out waiting for in-flight direct IO to finish\n");
			}

			/* The counter is released even after a timeout, as before */
			if (hdev->hldio.inflight_ios) {
				free_percpu(hdev->hldio.inflight_ios);
				hdev->hldio.inflight_ios = NULL;
			}
		}