Commit 8ab79ed5 authored by Mina Almasry, committed by Jakub Kicinski
Browse files

page_pool: devmem support



Convert netmem to be a union of struct page and struct net_iov. Overload
the LSB of the netmem_ref to indicate that it's a net_iov, otherwise
it's a page.

Currently these entries in struct page are rented by the page_pool and
used exclusively by the net stack:

struct {
	unsigned long pp_magic;
	struct page_pool *pp;
	unsigned long _pp_mapping_pad;
	unsigned long dma_addr;
	atomic_long_t pp_ref_count;
};

Mirror these (and only these) entries into struct net_iov and implement
netmem helpers that can access these common fields regardless of
whether the underlying type is page or net_iov.

Implement checks for net_iov in netmem helpers which delegate to mm
APIs, to ensure net_iov are never passed to the mm stack.

Signed-off-by: Mina Almasry <almasrymina@google.com>
Reviewed-by: Pavel Begunkov <asml.silence@gmail.com>
Acked-by: Jakub Kicinski <kuba@kernel.org>
Link: https://patch.msgid.link/20240910171458.219195-6-almasrymina@google.com


Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parent 28c5c74e
Loading
Loading
Loading
Loading
+117 −7
Original line number Diff line number Diff line
@@ -8,12 +8,52 @@
#ifndef _NET_NETMEM_H
#define _NET_NETMEM_H

#include <linux/mm.h>
#include <net/net_debug.h>

/* net_iov */

DECLARE_STATIC_KEY_FALSE(page_pool_mem_providers);

/*  We overload the LSB of the struct page pointer to indicate whether it's
 *  a page or net_iov.
 */
#define NET_IOV 0x01UL

/* Descriptor for non-page network memory. The page_pool fields below are laid
 * out to sit at the same offsets as their namesakes in struct page; the
 * NET_IOV_ASSERT_OFFSET() checks in this header enforce that for pp_magic, pp,
 * dma_addr and pp_ref_count.
 */
struct net_iov {
	/* Not used; presumably only here to push the following fields to the
	 * offsets they have in struct page.
	 */
	unsigned long __unused_padding;
	/* Mirrors struct page::pp_magic. */
	unsigned long pp_magic;
	/* Mirrors struct page::pp. */
	struct page_pool *pp;
	/* net_iov-specific back-pointer; not one of the offset-checked mirrored
	 * fields.
	 */
	struct dmabuf_genpool_chunk_owner *owner;
	/* Mirrors struct page::dma_addr. */
	unsigned long dma_addr;
	/* Mirrors struct page::pp_ref_count. */
	atomic_long_t pp_ref_count;
};

/* These fields in struct page are used by the page_pool and net stack:
 *
 *        struct {
 *                unsigned long pp_magic;
 *                struct page_pool *pp;
 *                unsigned long _pp_mapping_pad;
 *                unsigned long dma_addr;
 *                atomic_long_t pp_ref_count;
 *        };
 *
 * We mirror the page_pool fields here so the page_pool can access these fields
 * without worrying whether the underlying fields belong to a page or net_iov.
 *
 * The non-net stack fields of struct page are private to the mm stack and must
 * never be mirrored to net_iov.
 */
/* Build-time check that a page_pool field lives at the same byte offset in
 * struct net_iov as its counterpart does in struct page.
 */
#define NET_IOV_ASSERT_OFFSET(page_field, niov_field)           \
	static_assert(offsetof(struct net_iov, niov_field) == \
		      offsetof(struct page, page_field))
NET_IOV_ASSERT_OFFSET(pp_magic, pp_magic);
NET_IOV_ASSERT_OFFSET(pp, pp);
NET_IOV_ASSERT_OFFSET(dma_addr, dma_addr);
NET_IOV_ASSERT_OFFSET(pp_ref_count, pp_ref_count);
#undef NET_IOV_ASSERT_OFFSET

/* netmem */

/**
@@ -27,20 +67,37 @@ struct net_iov {
 */
typedef unsigned long __bitwise netmem_ref;

static inline bool netmem_is_net_iov(const netmem_ref netmem)
{
	return (__force unsigned long)netmem & NET_IOV;
}

/* Convert a netmem_ref to the struct page it refers to.
 *
 * Only page-backed references can be converted: when @netmem is tagged as a
 * net_iov this warns once and returns NULL.
 */
static inline struct page *netmem_to_page(netmem_ref netmem)
{
	return WARN_ON_ONCE(netmem_is_net_iov(netmem)) ?
		NULL : (__force struct page *)netmem;
}

/* Converting from page to netmem is always safe, because a page can always be
 * a netmem.
 */
static inline struct net_iov *netmem_to_net_iov(netmem_ref netmem)
{
	unsigned long raw = (__force unsigned long)netmem;

	/* A reference without the NET_IOV tag is not a net_iov: raise the
	 * DEBUG_NET warning and hand back NULL.
	 */
	if (!netmem_is_net_iov(netmem)) {
		DEBUG_NET_WARN_ON_ONCE(true);
		return NULL;
	}

	/* Strip the tag bit to recover the net_iov pointer. */
	return (struct net_iov *)(raw & ~NET_IOV);
}

static inline netmem_ref net_iov_to_netmem(struct net_iov *niov)
{
	return (__force netmem_ref)((unsigned long)niov | NET_IOV);
}

static inline netmem_ref page_to_netmem(struct page *page)
{
	return (__force netmem_ref)page;
@@ -48,17 +105,70 @@ static inline netmem_ref page_to_netmem(struct page *page)

/* Return the non-page_pool reference count of @netmem. */
static inline int netmem_ref_count(netmem_ref netmem)
{
	if (!netmem_is_net_iov(netmem))
		return page_ref_count(netmem_to_page(netmem));

	/* net_iov only supports pp refcounting, which lives in the
	 * pp_ref_count field; its non-pp refcount is always reported as 1.
	 */
	return 1;
}

static inline unsigned long netmem_to_pfn(netmem_ref netmem)
static inline unsigned long netmem_pfn_trace(netmem_ref netmem)
{
	/* For a net_iov, 0 is reported instead of a pfn; otherwise the pfn of
	 * the backing page is returned.
	 */
	return netmem_is_net_iov(netmem) ?
		0 : page_to_pfn(netmem_to_page(netmem));
}

/* Mask the NET_IOV tag bit off @netmem and return the result typed as a
 * struct net_iov pointer. No check is made on whether @netmem was actually
 * tagged.
 */
static inline struct net_iov *__netmem_clear_lsb(netmem_ref netmem)
{
	unsigned long untagged = (__force unsigned long)netmem & ~NET_IOV;

	return (struct net_iov *)untagged;
}

/* Read the pp field of @netmem through the untagged pointer. */
static inline struct page_pool *netmem_get_pp(netmem_ref netmem)
{
	struct net_iov *niov = __netmem_clear_lsb(netmem);

	return niov->pp;
}

/* Return the address of the pp_ref_count field of @netmem, reached through
 * the untagged pointer.
 */
static inline atomic_long_t *netmem_get_pp_ref_count_ref(netmem_ref netmem)
{
	struct net_iov *niov = __netmem_clear_lsb(netmem);

	return &niov->pp_ref_count;
}

/* Tell whether @netmem satisfies the NUMA node preference @pref_nid. */
static inline bool netmem_is_pref_nid(netmem_ref netmem, int pref_nid)
{
	/* A NUMA node preference is only meaningful when allocating system
	 * memory. net_iovs come from memory providers, which make that
	 * choice for us, so they always count as matching.
	 */
	return netmem_is_net_iov(netmem) ||
	       page_to_nid(netmem_to_page(netmem)) == pref_nid;
}

/* Return the netmem_ref of the compound head for a page-backed @netmem; a
 * net_iov is returned unchanged.
 */
static inline netmem_ref netmem_compound_head(netmem_ref netmem)
{
	struct page *head;

	/* niov are never compounded */
	if (netmem_is_net_iov(netmem))
		return netmem;

	head = compound_head(netmem_to_page(netmem));
	return page_to_netmem(head);
}

/* Return page_address() of the backing page, or NULL when @netmem is a
 * net_iov.
 */
static inline void *netmem_address(netmem_ref netmem)
{
	if (!netmem_is_net_iov(netmem))
		return page_address(netmem_to_page(netmem));

	return NULL;
}

/* Read the raw dma_addr field of @netmem through the untagged pointer. */
static inline unsigned long netmem_get_dma_addr(netmem_ref netmem)
{
	struct net_iov *niov = __netmem_clear_lsb(netmem);

	return niov->dma_addr;
}

#endif /* _NET_NETMEM_H */
+7 −32
Original line number Diff line number Diff line
@@ -216,7 +216,7 @@ page_pool_get_dma_dir(const struct page_pool *pool)

static inline void page_pool_fragment_netmem(netmem_ref netmem, long nr)
{
	atomic_long_set(&netmem_to_page(netmem)->pp_ref_count, nr);
	atomic_long_set(netmem_get_pp_ref_count_ref(netmem), nr);
}

/**
@@ -244,7 +244,7 @@ static inline void page_pool_fragment_page(struct page *page, long nr)

static inline long page_pool_unref_netmem(netmem_ref netmem, long nr)
{
	struct page *page = netmem_to_page(netmem);
	atomic_long_t *pp_ref_count = netmem_get_pp_ref_count_ref(netmem);
	long ret;

	/* If nr == pp_ref_count then we have cleared all remaining
@@ -261,19 +261,19 @@ static inline long page_pool_unref_netmem(netmem_ref netmem, long nr)
	 * initially, and only overwrite it when the page is partitioned into
	 * more than one piece.
	 */
	if (atomic_long_read(&page->pp_ref_count) == nr) {
	if (atomic_long_read(pp_ref_count) == nr) {
		/* As we have ensured nr is always one for constant case using
		 * the BUILD_BUG_ON(), only need to handle the non-constant case
		 * here for pp_ref_count draining, which is a rare case.
		 */
		BUILD_BUG_ON(__builtin_constant_p(nr) && nr != 1);
		if (!__builtin_constant_p(nr))
			atomic_long_set(&page->pp_ref_count, 1);
			atomic_long_set(pp_ref_count, 1);

		return 0;
	}

	ret = atomic_long_sub_return(nr, &page->pp_ref_count);
	ret = atomic_long_sub_return(nr, pp_ref_count);
	WARN_ON(ret < 0);

	/* We are the last user here too, reset pp_ref_count back to 1 to
@@ -282,7 +282,7 @@ static inline long page_pool_unref_netmem(netmem_ref netmem, long nr)
	 * page_pool_unref_page() currently.
	 */
	if (unlikely(!ret))
		atomic_long_set(&page->pp_ref_count, 1);
		atomic_long_set(pp_ref_count, 1);

	return ret;
}
@@ -401,9 +401,7 @@ static inline void page_pool_free_va(struct page_pool *pool, void *va,

static inline dma_addr_t page_pool_get_dma_addr_netmem(netmem_ref netmem)
{
	struct page *page = netmem_to_page(netmem);

	dma_addr_t ret = page->dma_addr;
	dma_addr_t ret = netmem_get_dma_addr(netmem);

	if (PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA)
		ret <<= PAGE_SHIFT;
@@ -423,24 +421,6 @@ static inline dma_addr_t page_pool_get_dma_addr(const struct page *page)
	return page_pool_get_dma_addr_netmem(page_to_netmem((struct page *)page));
}

/* Store @addr in the dma_addr field of the page behind @netmem.
 *
 * Returns true when the address could not be stored losslessly (only possible
 * when PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA is set), false otherwise.
 */
static inline bool page_pool_set_dma_addr_netmem(netmem_ref netmem,
						 dma_addr_t addr)
{
	struct page *page = netmem_to_page(netmem);

	if (!PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA) {
		page->dma_addr = addr;
		return false;
	}

	/* The address is stored shifted right by PAGE_SHIFT, which assumes
	 * page alignment. If shifting it back does not reproduce @addr the
	 * "compression" failed and the caller needs to drop.
	 */
	page->dma_addr = addr >> PAGE_SHIFT;
	return addr != (dma_addr_t)page->dma_addr << PAGE_SHIFT;
}

/**
 * page_pool_dma_sync_for_cpu - sync Rx page for CPU after it's written by HW
 * @pool: &page_pool the @page belongs to
@@ -463,11 +443,6 @@ static inline void page_pool_dma_sync_for_cpu(const struct page_pool *pool,
				      page_pool_get_dma_dir(pool));
}

/* struct page flavour of page_pool_set_dma_addr_netmem(): wraps @page in a
 * netmem_ref and forwards the call, returning its result.
 */
static inline bool page_pool_set_dma_addr(struct page *page, dma_addr_t addr)
{
	netmem_ref netmem = page_to_netmem(page);

	return page_pool_set_dma_addr_netmem(netmem, addr);
}

static inline bool page_pool_put(struct page_pool *pool)
{
	return refcount_dec_and_test(&pool->user_cnt);
+6 −6
Original line number Diff line number Diff line
@@ -57,12 +57,12 @@ TRACE_EVENT(page_pool_state_release,
		__entry->pool		= pool;
		__entry->netmem		= (__force unsigned long)netmem;
		__entry->release	= release;
		__entry->pfn		= netmem_to_pfn(netmem);
		__entry->pfn		= netmem_pfn_trace(netmem);
	),

	TP_printk("page_pool=%p netmem=%p pfn=0x%lx release=%u",
	TP_printk("page_pool=%p netmem=%p is_net_iov=%lu pfn=0x%lx release=%u",
		  __entry->pool, (void *)__entry->netmem,
		  __entry->pfn, __entry->release)
		  __entry->netmem & NET_IOV, __entry->pfn, __entry->release)
);

TRACE_EVENT(page_pool_state_hold,
@@ -83,12 +83,12 @@ TRACE_EVENT(page_pool_state_hold,
		__entry->pool	= pool;
		__entry->netmem	= (__force unsigned long)netmem;
		__entry->hold	= hold;
		__entry->pfn	= netmem_to_pfn(netmem);
		__entry->pfn	= netmem_pfn_trace(netmem);
	),

	TP_printk("page_pool=%p netmem=%p pfn=0x%lx hold=%u",
	TP_printk("page_pool=%p netmem=%p is_net_iov=%lu, pfn=0x%lx hold=%u",
		  __entry->pool, (void *)__entry->netmem,
		  __entry->pfn, __entry->hold)
		  __entry->netmem & NET_IOV, __entry->pfn, __entry->hold)
);

TRACE_EVENT(page_pool_update_nid,
+7 −0
Original line number Diff line number Diff line
@@ -18,6 +18,7 @@
#include <trace/events/page_pool.h>

#include "devmem.h"
#include "page_pool_priv.h"

/* Device memory support */

@@ -82,6 +83,10 @@ net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding)
	index = offset / PAGE_SIZE;
	niov = &owner->niovs[index];

	niov->pp_magic = 0;
	niov->pp = NULL;
	atomic_long_set(&niov->pp_ref_count, 0);

	return niov;
}

@@ -269,6 +274,8 @@ net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd,
		for (i = 0; i < owner->num_niovs; i++) {
			niov = &owner->niovs[i];
			niov->owner = owner;
			page_pool_set_dma_addr_netmem(net_iov_to_netmem(niov),
						      net_devmem_get_dma_addr(niov));
		}

		virtual += len;

net/core/netmem_priv.h

0 → 100644
+31 −0
Original line number Diff line number Diff line
/* SPDX-License-Identifier: GPL-2.0 */

#ifndef __NETMEM_PRIV_H
#define __NETMEM_PRIV_H

/* Read the pp_magic field of @netmem through the untagged pointer. */
static inline unsigned long netmem_get_pp_magic(netmem_ref netmem)
{
	struct net_iov *niov = __netmem_clear_lsb(netmem);

	return niov->pp_magic;
}

/* OR the bits of @pp_magic into the pp_magic field of @netmem. */
static inline void netmem_or_pp_magic(netmem_ref netmem, unsigned long pp_magic)
{
	struct net_iov *niov = __netmem_clear_lsb(netmem);

	niov->pp_magic |= pp_magic;
}

/* Reset the pp_magic field of @netmem to zero. */
static inline void netmem_clear_pp_magic(netmem_ref netmem)
{
	struct net_iov *niov = __netmem_clear_lsb(netmem);

	niov->pp_magic = 0;
}

/* Point the pp field of @netmem at @pool. */
static inline void netmem_set_pp(netmem_ref netmem, struct page_pool *pool)
{
	struct net_iov *niov = __netmem_clear_lsb(netmem);

	niov->pp = pool;
}

/* Store @dma_addr verbatim in the dma_addr field of @netmem. */
static inline void netmem_set_dma_addr(netmem_ref netmem,
				       unsigned long dma_addr)
{
	struct net_iov *niov = __netmem_clear_lsb(netmem);

	niov->dma_addr = dma_addr;
}
#endif
Loading