Commit 71f0dd5a authored by Jakub Kicinski
Browse files

Merge branch 'io_uring-zero-copy-rx'

David Wei says:

====================
io_uring zero copy rx

This patchset contains net/ patches needed by a new io_uring request
implementing zero copy rx into userspace pages, eliminating a kernel
to user copy.

We configure a page pool that a driver uses to fill a hw rx queue to
hand out user pages instead of kernel pages. Any data that ends up
hitting this hw rx queue will thus be dma'd into userspace memory
directly, without needing to be bounced through kernel memory. 'Reading'
data out of a socket instead becomes a _notification_ mechanism, where
the kernel tells userspace where the data is. The overall approach is
similar to the devmem TCP proposal.

This relies on hw header/data split, flow steering and RSS to ensure
packet headers remain in kernel memory and only desired flows hit a hw
rx queue configured for zero copy. Configuring this is outside of the
scope of this patchset.

We share netdev core infra with devmem TCP. The main difference is that
io_uring is used for the uAPI and the lifetime of all objects is bound
to an io_uring instance. Data is 'read' using a new io_uring request
type. When done, data is returned via a new shared refill queue. A zero
copy page pool refills a hw rx queue from this refill queue directly. Of
course, the lifetime of these data buffers is managed by io_uring
rather than the networking stack, with different refcounting rules.

This patchset is the first step adding basic zero copy support. We will
extend this iteratively with new features e.g. dynamically allocated
zero copy areas, THP support, dmabuf support, improved copy fallback,
general optimisations and more.

In terms of netdev support, we're first targeting Broadcom bnxt. Patches
aren't included since Taehee Yoo has already sent a more comprehensive
patchset adding support in [1]. Google gve should already support this,
and Mellanox mlx5 support is WIP pending driver changes.

===========
Performance
===========

Note: Comparison with epoll + TCP_ZEROCOPY_RECEIVE isn't done yet.

Test setup:
* AMD EPYC 9454
* Broadcom BCM957508 200G
* Kernel v6.11 base [2]
* liburing fork [3]
* kperf fork [4]
* 4K MTU
* Single TCP flow

With application thread + net rx softirq pinned to _different_ cores:

+-------------------------------+
| epoll     | io_uring          |
|-----------|-------------------|
| 82.2 Gbps | 116.2 Gbps (+41%) |
+-------------------------------+

Pinned to _same_ core:

+-------------------------------+
| epoll     | io_uring          |
|-----------|-------------------|
| 62.6 Gbps | 80.9 Gbps (+29%)  |
+-------------------------------+

=====
Links
=====

Broadcom bnxt support:
[1]: https://lore.kernel.org/20241003160620.1521626-8-ap420073@gmail.com

Linux kernel branch including io_uring bits:
[2]: https://github.com/isilence/linux.git zcrx/v13

liburing for testing:
[3]: https://github.com/isilence/liburing.git zcrx/next

kperf for testing:
[4]: https://git.kernel.dk/kperf.git
====================

Link: https://patch.msgid.link/20250204215622.695511-1-dw@davidwei.uk


Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parents ba6ec099 6e18ed92
Loading
Loading
Loading
Loading
+15 −0
Original line number Diff line number Diff line
@@ -114,6 +114,9 @@ attribute-sets:
        doc: Bitmask of enabled AF_XDP features.
        type: u64
        enum: xsk-flags
  -
    name: io-uring-provider-info
    attributes: []
  -
    name: page-pool
    attributes:
@@ -171,6 +174,11 @@ attribute-sets:
        name: dmabuf
        doc: ID of the dmabuf this page-pool is attached to.
        type: u32
      -
        name: io-uring
        doc: io-uring memory provider information.
        type: nest
        nested-attributes: io-uring-provider-info
  -
    name: page-pool-info
    subset-of: page-pool
@@ -296,6 +304,11 @@ attribute-sets:
        name: dmabuf
        doc: ID of the dmabuf attached to this queue, if any.
        type: u32
      -
        name: io-uring
        doc: io_uring memory provider information.
        type: nest
        nested-attributes: io-uring-provider-info

  -
    name: qstats
@@ -572,6 +585,7 @@ operations:
            - inflight-mem
            - detach-time
            - dmabuf
            - io-uring
      dump:
        reply: *pp-reply
      config-cond: page-pool
@@ -637,6 +651,7 @@ operations:
            - napi-id
            - ifindex
            - dmabuf
            - io-uring
      dump:
        request:
          attributes:
+20 −1
Original line number Diff line number Diff line
@@ -24,11 +24,20 @@ struct net_iov {
	unsigned long __unused_padding;
	unsigned long pp_magic;
	struct page_pool *pp;
	struct dmabuf_genpool_chunk_owner *owner;
	struct net_iov_area *owner;
	unsigned long dma_addr;
	atomic_long_t pp_ref_count;
};

/*
 * A group of net_iovs that share one owner. Each struct net_iov points back
 * at its area through ->owner, and net_iov_idx() derives a net_iov's index
 * from its position inside @niovs.
 */
struct net_iov_area {
	/* Array of net_iovs for this area. */
	struct net_iov *niovs;
	/* NOTE(review): presumably the number of entries in @niovs - confirm. */
	size_t num_niovs;

	/*
	 * Offset into the dma-buf where this chunk starts.
	 * NOTE(review): the wording is dma-buf specific while the struct name
	 * is not; confirm what other memory providers store here.
	 */
	unsigned long base_virtual;
};

/* These fields in struct page are used by the page_pool and net stack:
 *
 *        struct {
@@ -54,6 +63,16 @@ NET_IOV_ASSERT_OFFSET(dma_addr, dma_addr);
NET_IOV_ASSERT_OFFSET(pp_ref_count, pp_ref_count);
#undef NET_IOV_ASSERT_OFFSET

/* Return the net_iov_area recorded in @niov's ->owner field. */
static inline struct net_iov_area *net_iov_owner(const struct net_iov *niov)
{
	struct net_iov_area *area = niov->owner;

	return area;
}

static inline unsigned int net_iov_idx(const struct net_iov *niov)
{
	return niov - net_iov_owner(niov)->niovs;
}

/* netmem */

/**
+45 −0
Original line number Diff line number Diff line
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _NET_PAGE_POOL_MEMORY_PROVIDER_H
#define _NET_PAGE_POOL_MEMORY_PROVIDER_H

#include <net/netmem.h>
#include <net/page_pool/types.h>

struct netdev_rx_queue;
struct sk_buff;

/*
 * Callback table implemented by a page pool memory provider. It is referenced
 * through the ->mp_ops pointer of struct pp_memory_provider_params and of
 * struct page_pool.
 *
 * NOTE(review): the per-callback notes below are inferred from the signatures
 * only; confirm them against the page pool core before relying on them.
 */
struct memory_provider_ops {
	/*
	 * Takes the pool and gfp flags and returns a netmem_ref; presumably the
	 * provider's allocation hook. net_mp_netmem_place_in_cache() is
	 * documented as callable only from this path.
	 */
	netmem_ref (*alloc_netmems)(struct page_pool *pool, gfp_t gfp);
	/*
	 * Presumably hands @netmem back to the provider. The meaning of the
	 * bool return value is not visible here - TODO confirm.
	 */
	bool (*release_netmem)(struct page_pool *pool, netmem_ref netmem);
	/* Presumably per-pool provider setup; returns an int status. */
	int (*init)(struct page_pool *pool);
	/* Presumably per-pool provider teardown. */
	void (*destroy)(struct page_pool *pool);
	/*
	 * Receives the provider private data, a response skb and an rx queue;
	 * presumably emits the provider's netlink attributes into @rsp.
	 */
	int (*nl_fill)(void *mp_priv, struct sk_buff *rsp,
		       struct netdev_rx_queue *rxq);
	/* Presumably detaches the provider from @rxq - TODO confirm. */
	void (*uninstall)(void *mp_priv, struct netdev_rx_queue *rxq);
};

/*
 * Helper declarations for memory provider implementations; the definitions
 * are not part of this header.
 * NOTE(review): the notes below are inferred from names and signatures only -
 * confirm against the definitions.
 */

/* Presumably records @addr as @niov's DMA address; bool result unconfirmed. */
bool net_mp_niov_set_dma_addr(struct net_iov *niov, dma_addr_t addr);
/* Presumably associates @niov with @pool. */
void net_mp_niov_set_page_pool(struct page_pool *pool, struct net_iov *niov);
/* Presumably drops @niov's page pool association. */
void net_mp_niov_clear_page_pool(struct net_iov *niov);

/*
 * Presumably install (open) or remove (close) the memory provider described
 * by the pp_memory_provider_params on rx queue @ifq_idx of @dev.
 */
int net_mp_open_rxq(struct net_device *dev, unsigned ifq_idx,
		    struct pp_memory_provider_params *p);
void net_mp_close_rxq(struct net_device *dev, unsigned ifq_idx,
		      struct pp_memory_provider_params *old_p);

/**
 * net_mp_netmem_place_in_cache() - hand a netmem over to a page pool
 * @pool:      the page pool that receives the netmem
 * @netmem:    the netmem being handed over
 *
 * Stores an accounted @netmem in the next free slot of @pool's allocation
 * cache and then advances the cache count. No bounds check is performed
 * here, so the caller must guarantee that a free slot exists. Only for use
 * from the mp_ops->alloc_netmems() path.
 */
static inline void net_mp_netmem_place_in_cache(struct page_pool *pool,
						netmem_ref netmem)
{
	pool->alloc.cache[pool->alloc.count] = netmem;
	pool->alloc.count++;
}

#endif
+4 −0
Original line number Diff line number Diff line
@@ -152,8 +152,11 @@ struct page_pool_stats {
 */
#define PAGE_POOL_FRAG_GROUP_ALIGN	(4 * sizeof(long))

struct memory_provider_ops;

/*
 * Selects a memory provider: a callback table plus the provider's private
 * data. struct page_pool carries fields with the same names and types.
 */
struct pp_memory_provider_params {
	/* Opaque provider-private pointer. */
	void *mp_priv;
	/* Provider callback table; see struct memory_provider_ops. */
	const struct memory_provider_ops *mp_ops;
};

struct page_pool {
@@ -216,6 +219,7 @@ struct page_pool {
	struct ptr_ring ring;

	void *mp_priv;
	const struct memory_provider_ops *mp_ops;

#ifdef CONFIG_PAGE_POOL_STATS
	/* recycle stats are per-cpu to avoid locking */
+7 −0
Original line number Diff line number Diff line
@@ -86,6 +86,11 @@ enum {
	NETDEV_A_DEV_MAX = (__NETDEV_A_DEV_MAX - 1)
};

/*
 * Attribute IDs of the io-uring-provider-info set. The set defines no
 * attributes, so __NETDEV_A_IO_URING_PROVIDER_INFO_MAX is 0 and
 * NETDEV_A_IO_URING_PROVIDER_INFO_MAX evaluates to -1.
 */
enum {
	__NETDEV_A_IO_URING_PROVIDER_INFO_MAX,
	NETDEV_A_IO_URING_PROVIDER_INFO_MAX = (__NETDEV_A_IO_URING_PROVIDER_INFO_MAX - 1)
};

enum {
	NETDEV_A_PAGE_POOL_ID = 1,
	NETDEV_A_PAGE_POOL_IFINDEX,
@@ -94,6 +99,7 @@ enum {
	NETDEV_A_PAGE_POOL_INFLIGHT_MEM,
	NETDEV_A_PAGE_POOL_DETACH_TIME,
	NETDEV_A_PAGE_POOL_DMABUF,
	NETDEV_A_PAGE_POOL_IO_URING,

	__NETDEV_A_PAGE_POOL_MAX,
	NETDEV_A_PAGE_POOL_MAX = (__NETDEV_A_PAGE_POOL_MAX - 1)
@@ -136,6 +142,7 @@ enum {
	NETDEV_A_QUEUE_TYPE,
	NETDEV_A_QUEUE_NAPI_ID,
	NETDEV_A_QUEUE_DMABUF,
	NETDEV_A_QUEUE_IO_URING,

	__NETDEV_A_QUEUE_MAX,
	NETDEV_A_QUEUE_MAX = (__NETDEV_A_QUEUE_MAX - 1)
Loading