Commit bd618489 authored by Mina Almasry, committed by Paolo Abeni
Browse files

net: devmem: Implement TX path



Augment dmabuf binding to be able to handle TX. In addition to all the RX
binding setup, we also create the tx_vec needed for the TX path.

Provide API for sendmsg to be able to send dmabufs bound to this device:

- Provide a new dmabuf_tx_cmsg which includes the dmabuf to send from.
- MSG_ZEROCOPY with SCM_DEVMEM_DMABUF cmsg indicates send from dma-buf.

Devmem is uncopyable, so piggyback off the existing MSG_ZEROCOPY
implementation, while disabling instances where MSG_ZEROCOPY falls back
to copying.

We additionally pipe the binding down to the new
zerocopy_fill_skb_from_devmem which fills a TX skb with net_iov netmems
instead of the traditional page netmems.

We also special case skb_frag_dma_map to return the dma-address of these
dmabuf net_iovs instead of attempting to map pages.

The TX path may release the dmabuf in a context where we cannot wait.
This happens when the user unbinds a TX dmabuf while there are still
references to its netmems in the TX path. In that case, the netmems will
be put_netmem'd from a context where we can't unmap the dmabuf. Resolve
this by making __net_devmem_dmabuf_binding_free schedule_work'd.

Based on work by Stanislav Fomichev <sdf@fomichev.me>. A lot of the meat
of the implementation came from devmem TCP RFC v1[1], which included the
TX path, but Stan did all the rebasing on top of netmem/net_iov.

Cc: Stanislav Fomichev <sdf@fomichev.me>
Signed-off-by: Kaiyuan Zhang <kaiyuanz@google.com>
Signed-off-by: Mina Almasry <almasrymina@google.com>
Acked-by: Stanislav Fomichev <sdf@fomichev.me>
Link: https://patch.msgid.link/20250508004830.4100853-5-almasrymina@google.com


Signed-off-by: Paolo Abeni <pabeni@redhat.com>
parent 8802087d
Loading
Loading
Loading
Loading
+13 −4
Original line number Diff line number Diff line
@@ -1707,13 +1707,16 @@ static inline void skb_set_end_offset(struct sk_buff *skb, unsigned int offset)
extern const struct ubuf_info_ops msg_zerocopy_ubuf_ops;

struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,
				       struct ubuf_info *uarg);
				       struct ubuf_info *uarg, bool devmem);

void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref);

struct net_devmem_dmabuf_binding;

int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk,
			    struct sk_buff *skb, struct iov_iter *from,
			    size_t length);
			    size_t length,
			    struct net_devmem_dmabuf_binding *binding);

int zerocopy_fill_skb_from_iter(struct sk_buff *skb,
				struct iov_iter *from, size_t length);
@@ -1721,12 +1724,14 @@ int zerocopy_fill_skb_from_iter(struct sk_buff *skb,
static inline int skb_zerocopy_iter_dgram(struct sk_buff *skb,
					  struct msghdr *msg, int len)
{
	return __zerocopy_sg_from_iter(msg, skb->sk, skb, &msg->msg_iter, len);
	return __zerocopy_sg_from_iter(msg, skb->sk, skb, &msg->msg_iter, len,
				       NULL);
}

int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
			     struct msghdr *msg, int len,
			     struct ubuf_info *uarg);
			     struct ubuf_info *uarg,
			     struct net_devmem_dmabuf_binding *binding);

/* Internal */
#define skb_shinfo(SKB)	((struct skb_shared_info *)(skb_end_pointer(SKB)))
@@ -3697,6 +3702,10 @@ static inline dma_addr_t __skb_frag_dma_map(struct device *dev,
					    size_t offset, size_t size,
					    enum dma_data_direction dir)
{
	if (skb_frag_is_net_iov(frag)) {
		return netmem_to_net_iov(frag->netmem)->dma_addr + offset +
		       frag->offset;
	}
	return dma_map_page(dev, skb_frag_page(frag),
			    skb_frag_off(frag) + offset, size, dir);
}
+1 −0
Original line number Diff line number Diff line
@@ -1851,6 +1851,7 @@ struct sockcm_cookie {
	u32 tsflags;
	u32 ts_opt_id;
	u32 priority;
	u32 dmabuf_id;
};

static inline void sockcm_init(struct sockcm_cookie *sockc,
+1 −1
Original line number Diff line number Diff line
@@ -810,7 +810,7 @@ static int io_zcrx_recv_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
		return io_zcrx_copy_frag(req, ifq, frag, off, len);

	niov = netmem_to_net_iov(frag->netmem);
	if (niov->pp->mp_ops != &io_uring_pp_zc_ops ||
	if (!niov->pp || niov->pp->mp_ops != &io_uring_pp_zc_ops ||
	    io_pp_to_ifq(niov->pp) != ifq)
		return -EFAULT;

+46 −2
Original line number Diff line number Diff line
@@ -63,6 +63,8 @@
#include <net/busy_poll.h>
#include <crypto/hash.h>

#include "devmem.h"

/*
 *	Is a socket 'connection oriented' ?
 */
@@ -691,9 +693,49 @@ int zerocopy_fill_skb_from_iter(struct sk_buff *skb,
	return 0;
}

/* Append frags backed by dma-buf net_iovs to @skb.
 *
 * @skb:     skb to extend; filling starts at its current nr_frags.
 * @from:    user iterator; must be ITER_IOVEC. Each iov address is treated
 *           as a byte offset into the dma-buf, not as a pointer.
 * @length:  upper bound on the number of bytes to attach.
 * @binding: dma-buf binding used to resolve offsets to net_iovs.
 *
 * Every iteration resolves the current iov address through
 * net_devmem_get_niov_at(), takes a reference on the resulting netmem and
 * adds it as one frag. A frag is clamped to the span reported by the lookup,
 * to the remaining @length and to the remaining bytes of the current iov
 * segment.
 *
 * Returns 0 once @length or the iterator is exhausted, -EFAULT for an
 * unsupported iterator type or an offset the binding cannot resolve, and
 * -EMSGSIZE when the skb already holds MAX_SKB_FRAGS frags. Frags attached
 * before an error is hit are not removed here.
 */
static int
zerocopy_fill_skb_from_devmem(struct sk_buff *skb, struct iov_iter *from,
			      int length,
			      struct net_devmem_dmabuf_binding *binding)
{
	int frag_idx;

	/* Only plain user iovecs carry the "address == dma-buf offset"
	 * meaning this function relies on.
	 */
	if (iov_iter_type(from) != ITER_IOVEC)
		return -EFAULT;

	for (frag_idx = skb_shinfo(skb)->nr_frags;
	     length && iov_iter_count(from);
	     frag_idx++) {
		size_t dmabuf_off, page_off, chunk;
		struct net_iov *niov;

		if (frag_idx == MAX_SKB_FRAGS)
			return -EMSGSIZE;

		dmabuf_off = (size_t)iter_iov_addr(from);
		niov = net_devmem_get_niov_at(binding, dmabuf_off, &page_off,
					      &chunk);
		if (!niov)
			return -EFAULT;

		/* Never attach more than the caller asked for, nor more than
		 * what is left in the current iov segment.
		 */
		chunk = min_t(size_t, chunk, length);
		chunk = min_t(size_t, chunk, iter_iov_len(from));

		/* The frag holds its own reference on the netmem. */
		get_netmem(net_iov_to_netmem(niov));
		skb_add_rx_frag_netmem(skb, frag_idx, net_iov_to_netmem(niov),
				       page_off, chunk, PAGE_SIZE);

		iov_iter_advance(from, chunk);
		length -= chunk;
	}

	return 0;
}

int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk,
			    struct sk_buff *skb, struct iov_iter *from,
			    size_t length)
			    size_t length,
			    struct net_devmem_dmabuf_binding *binding)
{
	unsigned long orig_size = skb->truesize;
	unsigned long truesize;
@@ -701,6 +743,8 @@ int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk,

	if (msg && msg->msg_ubuf && msg->sg_from_iter)
		ret = msg->sg_from_iter(skb, from, length);
	else if (binding)
		ret = zerocopy_fill_skb_from_devmem(skb, from, length, binding);
	else
		ret = zerocopy_fill_skb_from_iter(skb, from, length);

@@ -734,7 +778,7 @@ int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from)
	if (skb_copy_datagram_from_iter(skb, 0, from, copy))
		return -EFAULT;

	return __zerocopy_sg_from_iter(NULL, NULL, skb, from, ~0U);
	return __zerocopy_sg_from_iter(NULL, NULL, skb, from, ~0U, NULL);
}
EXPORT_SYMBOL(zerocopy_sg_from_iter);

+99 −19
Original line number Diff line number Diff line
@@ -16,6 +16,7 @@
#include <net/netdev_rx_queue.h>
#include <net/page_pool/helpers.h>
#include <net/page_pool/memory_provider.h>
#include <net/sock.h>
#include <trace/events/page_pool.h>

#include "devmem.h"
@@ -52,8 +53,10 @@ static dma_addr_t net_devmem_get_dma_addr(const struct net_iov *niov)
	       ((dma_addr_t)net_iov_idx(niov) << PAGE_SHIFT);
}

void __net_devmem_dmabuf_binding_free(struct net_devmem_dmabuf_binding *binding)
void __net_devmem_dmabuf_binding_free(struct work_struct *wq)
{
	struct net_devmem_dmabuf_binding *binding = container_of(wq, typeof(*binding), unbind_w);

	size_t size, avail;

	gen_pool_for_each_chunk(binding->chunk_pool,
@@ -71,8 +74,10 @@ void __net_devmem_dmabuf_binding_free(struct net_devmem_dmabuf_binding *binding)
	dma_buf_detach(binding->dmabuf, binding->attachment);
	dma_buf_put(binding->dmabuf);
	xa_destroy(&binding->bound_rxqs);
	kvfree(binding->tx_vec);
	kfree(binding);
}
EXPORT_SYMBOL(__net_devmem_dmabuf_binding_free);

struct net_iov *
net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding)
@@ -117,6 +122,13 @@ void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding)
	unsigned long xa_idx;
	unsigned int rxq_idx;

	xa_erase(&net_devmem_dmabuf_bindings, binding->id);

	/* Ensure no tx net_devmem_lookup_dmabuf() are in flight after the
	 * erase.
	 */
	synchronize_net();

	if (binding->list.next)
		list_del(&binding->list);

@@ -131,8 +143,6 @@ void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding)
		__net_mp_close_rxq(binding->dev, rxq_idx, &mp_params);
	}

	xa_erase(&net_devmem_dmabuf_bindings, binding->id);

	net_devmem_dmabuf_binding_put(binding);
}

@@ -166,8 +176,9 @@ int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx,
}

struct net_devmem_dmabuf_binding *
net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd,
		       struct netlink_ext_ack *extack)
net_devmem_bind_dmabuf(struct net_device *dev,
		       enum dma_data_direction direction,
		       unsigned int dmabuf_fd, struct netlink_ext_ack *extack)
{
	struct net_devmem_dmabuf_binding *binding;
	static u32 id_alloc_next;
@@ -189,13 +200,6 @@ net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd,
	}

	binding->dev = dev;

	err = xa_alloc_cyclic(&net_devmem_dmabuf_bindings, &binding->id,
			      binding, xa_limit_32b, &id_alloc_next,
			      GFP_KERNEL);
	if (err < 0)
		goto err_free_binding;

	xa_init_flags(&binding->bound_rxqs, XA_FLAGS_ALLOC);

	refcount_set(&binding->ref, 1);
@@ -206,26 +210,36 @@ net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd,
	if (IS_ERR(binding->attachment)) {
		err = PTR_ERR(binding->attachment);
		NL_SET_ERR_MSG(extack, "Failed to bind dmabuf to device");
		goto err_free_id;
		goto err_free_binding;
	}

	binding->sgt = dma_buf_map_attachment_unlocked(binding->attachment,
						       DMA_FROM_DEVICE);
						       direction);
	if (IS_ERR(binding->sgt)) {
		err = PTR_ERR(binding->sgt);
		NL_SET_ERR_MSG(extack, "Failed to map dmabuf attachment");
		goto err_detach;
	}

	if (direction == DMA_TO_DEVICE) {
		binding->tx_vec = kvmalloc_array(dmabuf->size / PAGE_SIZE,
						 sizeof(struct net_iov *),
						 GFP_KERNEL);
		if (!binding->tx_vec) {
			err = -ENOMEM;
			goto err_unmap;
		}
	}

	/* For simplicity we expect to make PAGE_SIZE allocations, but the
	 * binding can be much more flexible than that. We may be able to
	 * allocate MTU sized chunks here. Leave that for future work...
	 */
	binding->chunk_pool =
		gen_pool_create(PAGE_SHIFT, dev_to_node(&dev->dev));
	binding->chunk_pool = gen_pool_create(PAGE_SHIFT,
					      dev_to_node(&dev->dev));
	if (!binding->chunk_pool) {
		err = -ENOMEM;
		goto err_unmap;
		goto err_tx_vec;
	}

	virtual = 0;
@@ -270,24 +284,32 @@ net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd,
			niov->owner = &owner->area;
			page_pool_set_dma_addr_netmem(net_iov_to_netmem(niov),
						      net_devmem_get_dma_addr(niov));
			if (direction == DMA_TO_DEVICE)
				binding->tx_vec[owner->area.base_virtual / PAGE_SIZE + i] = niov;
		}

		virtual += len;
	}

	err = xa_alloc_cyclic(&net_devmem_dmabuf_bindings, &binding->id,
			      binding, xa_limit_32b, &id_alloc_next,
			      GFP_KERNEL);
	if (err < 0)
		goto err_free_chunks;

	return binding;

err_free_chunks:
	gen_pool_for_each_chunk(binding->chunk_pool,
				net_devmem_dmabuf_free_chunk_owner, NULL);
	gen_pool_destroy(binding->chunk_pool);
err_tx_vec:
	kvfree(binding->tx_vec);
err_unmap:
	dma_buf_unmap_attachment_unlocked(binding->attachment, binding->sgt,
					  DMA_FROM_DEVICE);
err_detach:
	dma_buf_detach(dmabuf, binding->attachment);
err_free_id:
	xa_erase(&net_devmem_dmabuf_bindings, binding->id);
err_free_binding:
	kfree(binding);
err_put_dmabuf:
@@ -295,6 +317,21 @@ net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd,
	return ERR_PTR(err);
}

/* Find the dma-buf binding registered under @id and take a reference on it.
 *
 * The lookup in net_devmem_dmabuf_bindings and the reference grab both happen
 * inside one RCU read-side critical section.
 *
 * Returns the binding with a reference held, or NULL when no entry exists
 * for @id or when net_devmem_dmabuf_binding_get() reports failure.
 */
struct net_devmem_dmabuf_binding *net_devmem_lookup_dmabuf(u32 id)
{
	struct net_devmem_dmabuf_binding *found;

	rcu_read_lock();
	found = xa_load(&net_devmem_dmabuf_bindings, id);
	/* An entry whose reference can no longer be taken is treated as
	 * absent.
	 */
	if (found && !net_devmem_dmabuf_binding_get(found))
		found = NULL;
	rcu_read_unlock();

	return found;
}

void net_devmem_get_net_iov(struct net_iov *niov)
{
	net_devmem_dmabuf_binding_get(net_devmem_iov_binding(niov));
@@ -305,6 +342,49 @@ void net_devmem_put_net_iov(struct net_iov *niov)
	net_devmem_dmabuf_binding_put(net_devmem_iov_binding(niov));
}

/* Resolve @dmabuf_id to a TX-capable binding usable by @sk.
 *
 * @sk:        socket that wants to send from the dma-buf.
 * @dmabuf_id: id of the binding to look up.
 *
 * On success the binding is returned with the reference obtained from
 * net_devmem_lookup_dmabuf() still held. On failure that reference (if one
 * was taken) is dropped and an ERR_PTR is returned:
 *   -EINVAL  no binding with this id, or the binding has no tx_vec;
 *   -ENODEV  the socket has no cached dst/device, or the dst device's
 *            ifindex differs from the ifindex of the binding's device.
 */
struct net_devmem_dmabuf_binding *net_devmem_get_binding(struct sock *sk,
							 unsigned int dmabuf_id)
{
	struct dst_entry *route = __sk_dst_get(sk);
	struct net_devmem_dmabuf_binding *binding;
	int err;

	binding = net_devmem_lookup_dmabuf(dmabuf_id);
	if (!binding)
		return ERR_PTR(-EINVAL);

	if (!binding->tx_vec) {
		err = -EINVAL;
	} else if (!route || !route->dev ||
		   route->dev->ifindex != binding->dev->ifindex) {
		/* The dma-addrs in this binding are only reachable to the
		 * corresponding net_device.
		 */
		err = -ENODEV;
	} else {
		return binding;
	}

	/* Drop the reference taken by the lookup before reporting failure. */
	net_devmem_dmabuf_binding_put(binding);

	return ERR_PTR(err);
}

/* Translate a byte offset into the bound dma-buf to its tx_vec entry.
 *
 * @binding:   binding whose tx_vec is indexed.
 * @virt_addr: byte offset into the dma-buf.
 * @off:       out: offset of @virt_addr within its PAGE_SIZE slot.
 * @size:      out: bytes from @virt_addr to the end of that slot.
 *
 * Returns the tx_vec entry for the PAGE_SIZE slot containing @virt_addr, or
 * NULL (leaving @off and @size untouched) when @virt_addr is at or beyond
 * the dma-buf size.
 */
struct net_iov *
net_devmem_get_niov_at(struct net_devmem_dmabuf_binding *binding,
		       size_t virt_addr, size_t *off, size_t *size)
{
	size_t slot, within;

	if (virt_addr >= binding->dmabuf->size)
		return NULL;

	slot = virt_addr / PAGE_SIZE;
	within = virt_addr % PAGE_SIZE;

	*off = within;
	*size = PAGE_SIZE - within;

	return binding->tx_vec[slot];
}

/*** "Dmabuf devmem memory provider" ***/

int mp_dmabuf_devmem_init(struct page_pool *pool)
Loading