Commit b1011b2b authored by Linus Torvalds
Browse files

Merge tag 'io_uring-6.15-20250418' of git://git.kernel.dk/linux

Pull io_uring fixes from Jens Axboe:

 - Correctly cap iov_iter->nr_segs for imports of registered buffers,
   both kbuf and normal ones.

   Three cleanups to make it saner first, then two fixes, one for each
   of the buffer types.

   This fixes a performance regression where partial buffer usage
   doesn't trim the tail number of segments, leading the block layer to
   iterate the IOs to check if it needs splitting.

 - Two patches tweaking the newly introduced zero-copy rx API, mostly to
   keep the API consistent once we add multiple interface queues per
   ring support in the 6.16 release.

 - zc rx unmapping fix for a dead device

* tag 'io_uring-6.15-20250418' of git://git.kernel.dk/linux:
  io_uring/zcrx: fix late dma unmap for a dead dev
  io_uring/rsrc: ensure segments counts are correct on kbuf buffers
  io_uring/rsrc: send exact nr_segs for fixed buffer
  io_uring/rsrc: refactor io_import_fixed
  io_uring/rsrc: separate kbuf offset adjustments
  io_uring/rsrc: don't skip offset calculation
  io_uring/zcrx: add pp to ifq conversion helper
  io_uring/zcrx: return ifq id to the user
parents fc96b232 f12ecf5e
Loading
Loading
Loading
Loading
+3 −1
Original line number Diff line number Diff line
@@ -1010,7 +1010,9 @@ struct io_uring_zcrx_ifq_reg {
	__u64	region_ptr; /* struct io_uring_region_desc * */

	struct io_uring_zcrx_offsets offsets;
	__u64	__resv[4];
	__u32	zcrx_id;
	__u32	__resv2;
	__u64	__resv[3];
};

#ifdef __cplusplus
+47 −45
Original line number Diff line number Diff line
@@ -1032,10 +1032,33 @@ static int validate_fixed_range(u64 buf_addr, size_t len,
	return 0;
}

/*
 * Set up @iter as a bvec iterator over the buffer described by @imu,
 * covering @len bytes that start @offset bytes into the buffer.
 *
 * The iterator is first initialised over all of imu->bvec with a byte
 * count of @offset + @len and then advanced by @offset with
 * iov_iter_advance(). When the request stops short of the end of the
 * buffer, iter->nr_segs is trimmed to the number of bvecs the request
 * actually touches, so consumers do not see unused tail segments.
 *
 * Always returns 0.
 */
static int io_import_kbuf(int ddir, struct iov_iter *iter,
			  struct io_mapped_ubuf *imu, size_t len, size_t offset)
{
	size_t count = len + offset;

	iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, count);
	iov_iter_advance(iter, offset);

	if (count < imu->len) {
		const struct bio_vec *bvec = iter->bvec;

		/*
		 * After the advance the data begins iov_offset bytes into
		 * the first remaining bvec, so that bvec only contributes
		 * bv_len - iov_offset bytes. Fold the intra-segment offset
		 * into the length being walked; otherwise a request that
		 * starts mid-bvec and spills into the next one would be
		 * reported with one segment too few.
		 */
		len += iter->iov_offset;
		while (len > bvec->bv_len) {
			len -= bvec->bv_len;
			bvec++;
		}
		iter->nr_segs = 1 + bvec - iter->bvec;
	}
	return 0;
}

static int io_import_fixed(int ddir, struct iov_iter *iter,
			   struct io_mapped_ubuf *imu,
			   u64 buf_addr, size_t len)
{
	const struct bio_vec *bvec;
	size_t folio_mask;
	unsigned nr_segs;
	size_t offset;
	int ret;

@@ -1047,14 +1070,11 @@ static int io_import_fixed(int ddir, struct iov_iter *iter,
	if (!(imu->dir & (1 << ddir)))
		return -EFAULT;

	/*
	 * Might not be a start of buffer, set size appropriately
	 * and advance us to the beginning.
	 */
	offset = buf_addr - imu->ubuf;
	iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len);

	if (offset) {
	if (imu->is_kbuf)
		return io_import_kbuf(ddir, iter, imu, len, offset);

	/*
	 * Don't use iov_iter_advance() here, as it's really slow for
	 * using the latter parts of a big fixed buffer - it iterates
@@ -1064,39 +1084,21 @@ static int io_import_fixed(int ddir, struct iov_iter *iter,
	 * 1) it's a BVEC iter, we set it up
	 * 2) all bvecs are the same in size, except potentially the
	 *    first and last bvec
		 *
		 * So just find our index, and adjust the iterator afterwards.
		 * If the offset is within the first bvec (or the whole first
		 * bvec, just use iov_iter_advance(). This makes it easier
		 * since we can just skip the first segment, which may not
		 * be folio_size aligned.
	 */
		const struct bio_vec *bvec = imu->bvec;

		/*
		 * Kernel buffer bvecs, on the other hand, don't necessarily
		 * have the size property of user registered ones, so we have
		 * to use the slow iter advance.
		 */
		if (offset < bvec->bv_len) {
			iter->count -= offset;
			iter->iov_offset = offset;
		} else if (imu->is_kbuf) {
			iov_iter_advance(iter, offset);
		} else {
	folio_mask = (1UL << imu->folio_shift) - 1;
	bvec = imu->bvec;
	if (offset >= bvec->bv_len) {
		unsigned long seg_skip;

		/* skip first vec */
		offset -= bvec->bv_len;
		seg_skip = 1 + (offset >> imu->folio_shift);

			iter->bvec += seg_skip;
			iter->nr_segs -= seg_skip;
			iter->count -= bvec->bv_len + offset;
			iter->iov_offset = offset & ((1UL << imu->folio_shift) - 1);
		}
		bvec += seg_skip;
		offset &= folio_mask;
	}

	nr_segs = (offset + len + bvec->bv_offset + folio_mask) >> imu->folio_shift;
	iov_iter_bvec(iter, ddir, bvec, nr_segs, len);
	iter->iov_offset = offset;
	return 0;
}

+28 −9
Original line number Diff line number Diff line
@@ -26,6 +26,11 @@
#include "zcrx.h"
#include "rsrc.h"

/* Return the zcrx interface queue stored in the page pool's mp_priv field. */
static inline struct io_zcrx_ifq *io_pp_to_ifq(struct page_pool *pp)
{
	struct io_zcrx_ifq *ifq = pp->mp_priv;

	return ifq;
}

#define IO_DMA_ATTR (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING)

static void __io_zcrx_unmap_area(struct io_zcrx_ifq *ifq,
@@ -46,14 +51,21 @@ static void __io_zcrx_unmap_area(struct io_zcrx_ifq *ifq,

/*
 * Unmap every niov of @area if the area is currently marked as mapped,
 * and leave it marked as unmapped. The whole operation runs with
 * ifq->dma_lock held; the scoped guard releases it on return.
 */
static void io_zcrx_unmap_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
{
	guard(mutex)(&ifq->dma_lock);

	/* Nothing to undo: the flag is already clear. */
	if (!area->is_mapped)
		return;

	__io_zcrx_unmap_area(ifq, area, area->nia.num_niovs);
	area->is_mapped = false;
}

static int io_zcrx_map_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
{
	int i;

	guard(mutex)(&ifq->dma_lock);
	if (area->is_mapped)
		return 0;

	for (i = 0; i < area->nia.num_niovs; i++) {
		struct net_iov *niov = &area->nia.niovs[i];
		dma_addr_t dma;
@@ -275,6 +287,7 @@ static struct io_zcrx_ifq *io_zcrx_ifq_alloc(struct io_ring_ctx *ctx)
	ifq->ctx = ctx;
	spin_lock_init(&ifq->lock);
	spin_lock_init(&ifq->rq_lock);
	mutex_init(&ifq->dma_lock);
	return ifq;
}

@@ -324,6 +337,7 @@ static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq)
		put_device(ifq->dev);

	io_free_rbuf_ring(ifq);
	mutex_destroy(&ifq->dma_lock);
	kfree(ifq);
}

@@ -354,7 +368,8 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
		return -EFAULT;
	if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd)))
		return -EFAULT;
	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)) ||
	    reg.__resv2 || reg.zcrx_id)
		return -EINVAL;
	if (reg.if_rxq == -1 || !reg.rq_entries || reg.flags)
		return -EINVAL;
@@ -394,10 +409,6 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
		goto err;
	get_device(ifq->dev);

	ret = io_zcrx_map_area(ifq, ifq->area);
	if (ret)
		goto err;

	mp_param.mp_ops = &io_uring_pp_zc_ops;
	mp_param.mp_priv = ifq;
	ret = net_mp_open_rxq(ifq->netdev, reg.if_rxq, &mp_param);
@@ -585,7 +596,7 @@ static void io_zcrx_refill_slow(struct page_pool *pp, struct io_zcrx_ifq *ifq)

static netmem_ref io_pp_zc_alloc_netmems(struct page_pool *pp, gfp_t gfp)
{
	struct io_zcrx_ifq *ifq = pp->mp_priv;
	struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp);

	/* pp should already be ensuring that */
	if (unlikely(pp->alloc.count))
@@ -617,7 +628,8 @@ static bool io_pp_zc_release_netmem(struct page_pool *pp, netmem_ref netmem)

static int io_pp_zc_init(struct page_pool *pp)
{
	struct io_zcrx_ifq *ifq = pp->mp_priv;
	struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp);
	int ret;

	if (WARN_ON_ONCE(!ifq))
		return -EINVAL;
@@ -630,13 +642,17 @@ static int io_pp_zc_init(struct page_pool *pp)
	if (pp->p.dma_dir != DMA_FROM_DEVICE)
		return -EOPNOTSUPP;

	ret = io_zcrx_map_area(ifq, ifq->area);
	if (ret)
		return ret;

	percpu_ref_get(&ifq->ctx->refs);
	return 0;
}

static void io_pp_zc_destroy(struct page_pool *pp)
{
	struct io_zcrx_ifq *ifq = pp->mp_priv;
	struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp);
	struct io_zcrx_area *area = ifq->area;

	if (WARN_ON_ONCE(area->free_count != area->nia.num_niovs))
@@ -665,6 +681,9 @@ static void io_pp_uninstall(void *mp_priv, struct netdev_rx_queue *rxq)
	struct io_zcrx_ifq *ifq = mp_priv;

	io_zcrx_drop_netdev(ifq);
	if (ifq->area)
		io_zcrx_unmap_area(ifq, ifq->area);

	p->mp_ops = NULL;
	p->mp_priv = NULL;
}
@@ -791,7 +810,7 @@ static int io_zcrx_recv_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq,

	niov = netmem_to_net_iov(frag->netmem);
	if (niov->pp->mp_ops != &io_uring_pp_zc_ops ||
	    niov->pp->mp_priv != ifq)
	    io_pp_to_ifq(niov->pp) != ifq)
		return -EFAULT;

	if (!io_zcrx_queue_cqe(req, niov, ifq, off + skb_frag_off(frag), len))
+1 −0
Original line number Diff line number Diff line
@@ -38,6 +38,7 @@ struct io_zcrx_ifq {
	struct net_device		*netdev;
	netdevice_tracker		netdev_tracker;
	spinlock_t			lock;
	struct mutex			dma_lock;
};

#if defined(CONFIG_IO_URING_ZCRX)