Commit eb05529a authored by Jakub Kicinski
Browse files

Merge branch 'page_pool-allow-direct-bulk-recycling'

Alexander Lobakin says:

====================
page_pool: allow direct bulk recycling

Previously, there was no reliable way to check whether it's safe to use
direct PP cache. The drivers were passing @allow_direct to the PP
recycling functions and that was it. Bulk recycling is used by
xdp_return_frame_bulk() on .ndo_xdp_xmit() frames completion where
the page origin is unknown, thus the direct recycling has never been
tried.
Now that we have at least 2 ways of checking if we're allowed to perform
direct recycling -- pool->p.napi (Jakub) and pool->cpuid (Lorenzo), we
can use them when doing bulk recycling as well. Just move that logic
from the skb core to the PP core and call it before
__page_pool_put_page() every time @allow_direct is false.
Under high .ndo_xdp_xmit() traffic load, the win is 2-3% Pps assuming
the sending driver uses xdp_return_frame_bulk() on Tx completion.
====================

Link: https://lore.kernel.org/r/20240329165507.3240110-1-aleksander.lobakin@intel.com


Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parents 8db2509f 39806b96
Loading
Loading
Loading
Loading
+6 −6
Original line number Diff line number Diff line
@@ -3510,25 +3510,25 @@ int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb,
		    unsigned int headroom);
int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb,
			 struct bpf_prog *prog);
bool napi_pp_put_page(struct page *page, bool napi_safe);
bool napi_pp_put_page(struct page *page);

/* Release @page on behalf of @skb.
 * When CONFIG_PAGE_POOL is defined and skb->pp_recycle is set, the page is
 * offered to napi_pp_put_page() first; put_page() is reached only if that
 * call returns false or the page_pool branch is not taken at all.
 */
static inline void
skb_page_unref(const struct sk_buff *skb, struct page *page, bool napi_safe)
skb_page_unref(const struct sk_buff *skb, struct page *page)
{
#ifdef CONFIG_PAGE_POOL
	if (skb->pp_recycle && napi_pp_put_page(page, napi_safe))
	if (skb->pp_recycle && napi_pp_put_page(page))
		return;
#endif
	put_page(page);
}

static inline void
napi_frag_unref(skb_frag_t *frag, bool recycle, bool napi_safe)
napi_frag_unref(skb_frag_t *frag, bool recycle)
{
	struct page *page = skb_frag_page(frag);

#ifdef CONFIG_PAGE_POOL
	if (recycle && napi_pp_put_page(page, napi_safe))
	if (recycle && napi_pp_put_page(page))
		return;
#endif
	put_page(page);
@@ -3544,7 +3544,7 @@ napi_frag_unref(skb_frag_t *frag, bool recycle, bool napi_safe)
 */
static inline void __skb_frag_unref(skb_frag_t *frag, bool recycle)
{
	/* Pure forwarder: the release itself is done by napi_frag_unref(),
	 * with @frag and @recycle passed through unchanged.
	 */
	napi_frag_unref(frag, recycle, false);
	napi_frag_unref(frag, recycle);
}

/**
+33 −5
Original line number Diff line number Diff line
@@ -690,8 +690,7 @@ __page_pool_put_page(struct page_pool *pool, struct page *page,
			page_pool_dma_sync_for_device(pool, page,
						      dma_sync_size);

		if (allow_direct && in_softirq() &&
		    page_pool_recycle_in_cache(page, pool))
		if (allow_direct && page_pool_recycle_in_cache(page, pool))
			return NULL;

		/* Page found as candidate for recycling */
@@ -716,9 +715,35 @@ __page_pool_put_page(struct page_pool *pool, struct page *page,
	return NULL;
}

/* Decide whether the caller may use the pool's direct cache.
 *
 * Returns true when there is reason to believe we run in the same context
 * as the pool's consumer, i.e. when the current CPU matches either
 * pool->cpuid or the list_owner of the NAPI instance attached to the pool.
 * Outside of softirq context the answer is always false.
 */
static bool page_pool_napi_local(const struct page_pool *pool)
{
	const struct napi_struct *owner;
	u32 this_cpu;

	/* Never report locality outside of softirq context. */
	if (unlikely(!in_softirq()))
		return false;

	this_cpu = smp_processor_id();

	/* First candidate: the CPU recorded in the pool itself. */
	if (READ_ONCE(pool->cpuid) == this_cpu)
		return true;

	/* Second candidate: the CPU owning the pool's NAPI instance, if any. */
	owner = READ_ONCE(pool->p.napi);
	if (!owner)
		return false;

	return READ_ONCE(owner->list_owner) == this_cpu;
}

void page_pool_put_unrefed_page(struct page_pool *pool, struct page *page,
				unsigned int dma_sync_size, bool allow_direct)
{
	if (!allow_direct)
		allow_direct = page_pool_napi_local(pool);

	page = __page_pool_put_page(pool, page, dma_sync_size, allow_direct);
	if (page && !page_pool_recycle_in_ring(pool, page)) {
		/* Cache full, fallback to free pages */
@@ -747,8 +772,11 @@ void page_pool_put_page_bulk(struct page_pool *pool, void **data,
			     int count)
{
	int i, bulk_len = 0;
	bool allow_direct;
	bool in_softirq;

	allow_direct = page_pool_napi_local(pool);

	for (i = 0; i < count; i++) {
		struct page *page = virt_to_head_page(data[i]);

@@ -756,13 +784,13 @@ void page_pool_put_page_bulk(struct page_pool *pool, void **data,
		if (!page_pool_is_last_ref(page))
			continue;

		page = __page_pool_put_page(pool, page, -1, false);
		page = __page_pool_put_page(pool, page, -1, allow_direct);
		/* Approved for bulk recycling in ptr_ring cache */
		if (page)
			data[bulk_len++] = page;
	}

	if (unlikely(!bulk_len))
	if (!bulk_len)
		return;

	/* Bulk producer into ptr_ring page_pool cache */
@@ -969,7 +997,7 @@ void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *),
static void page_pool_disable_direct_recycling(struct page_pool *pool)
{
	/* Disable direct recycling based on pool->cpuid.
	 * Paired with READ_ONCE() in napi_pp_put_page().
	 * Paired with READ_ONCE() in page_pool_napi_local().
	 */
	WRITE_ONCE(pool->cpuid, -1);

+22 −48
Original line number Diff line number Diff line
@@ -1004,11 +1004,8 @@ int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb,
EXPORT_SYMBOL(skb_cow_data_for_xdp);

#if IS_ENABLED(CONFIG_PAGE_POOL)
bool napi_pp_put_page(struct page *page, bool napi_safe)
bool napi_pp_put_page(struct page *page)
{
	bool allow_direct = false;
	struct page_pool *pp;

	page = compound_head(page);

	/* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation
@@ -1021,39 +1018,18 @@ bool napi_pp_put_page(struct page *page, bool napi_safe)
	if (unlikely(!is_pp_page(page)))
		return false;

	pp = page->pp;

	/* Allow direct recycle if we have reasons to believe that we are
	 * in the same context as the consumer would run, so there's
	 * no possible race.
	 * __page_pool_put_page() makes sure we're not in hardirq context
	 * and interrupts are enabled prior to accessing the cache.
	 */
	if (napi_safe || in_softirq()) {
		const struct napi_struct *napi = READ_ONCE(pp->p.napi);
		unsigned int cpuid = smp_processor_id();

		allow_direct = napi && READ_ONCE(napi->list_owner) == cpuid;
		allow_direct |= READ_ONCE(pp->cpuid) == cpuid;
	}

	/* Driver set this to memory recycling info. Reset it on recycle.
	 * This will *not* work for NIC using a split-page memory model.
	 * The page will be returned to the pool here regardless of the
	 * 'flipped' fragment being in use or not.
	 */
	page_pool_put_full_page(pp, page, allow_direct);
	page_pool_put_full_page(page->pp, page, false);

	return true;
}
EXPORT_SYMBOL(napi_pp_put_page);
#endif

/* Try to return the buffer at @data through the page_pool path.
 * Returns false right away when CONFIG_PAGE_POOL is not enabled or the skb
 * is not marked pp_recycle; otherwise returns whatever napi_pp_put_page()
 * reports for the page obtained from virt_to_page(@data).
 */
static bool skb_pp_recycle(struct sk_buff *skb, void *data, bool napi_safe)
static bool skb_pp_recycle(struct sk_buff *skb, void *data)
{
	if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle)
		return false;
	return napi_pp_put_page(virt_to_page(data), napi_safe);
	return napi_pp_put_page(virt_to_page(data));
}

/**
@@ -1095,12 +1071,12 @@ static void skb_kfree_head(void *head, unsigned int end_offset)
		kfree(head);
}

static void skb_free_head(struct sk_buff *skb, bool napi_safe)
static void skb_free_head(struct sk_buff *skb)
{
	unsigned char *head = skb->head;

	if (skb->head_frag) {
		if (skb_pp_recycle(skb, head, napi_safe))
		if (skb_pp_recycle(skb, head))
			return;
		skb_free_frag(head);
	} else {
@@ -1108,8 +1084,7 @@ static void skb_free_head(struct sk_buff *skb, bool napi_safe)
	}
}

static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason,
			     bool napi_safe)
static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason)
{
	struct skb_shared_info *shinfo = skb_shinfo(skb);
	int i;
@@ -1126,13 +1101,13 @@ static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason,
	}

	for (i = 0; i < shinfo->nr_frags; i++)
		napi_frag_unref(&shinfo->frags[i], skb->pp_recycle, napi_safe);
		napi_frag_unref(&shinfo->frags[i], skb->pp_recycle);

free_head:
	if (shinfo->frag_list)
		kfree_skb_list_reason(shinfo->frag_list, reason);

	skb_free_head(skb, napi_safe);
	skb_free_head(skb);
exit:
	/* When we clone an SKB we copy the recycling bit. The pp_recycle
	 * bit is only set on the head though, so in order to avoid races
@@ -1193,12 +1168,11 @@ void skb_release_head_state(struct sk_buff *skb)
}

/* Free everything but the sk_buff shell.
 * The head state is always released via skb_release_head_state(); the data
 * is released via skb_release_data() only when skb->head is set.
 * @reason is forwarded to skb_release_data().
 */
static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason,
			    bool napi_safe)
static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason)
{
	skb_release_head_state(skb);
	if (likely(skb->head))
		skb_release_data(skb, reason, napi_safe);
		skb_release_data(skb, reason);
}

/**
@@ -1212,7 +1186,7 @@ static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason,

void __kfree_skb(struct sk_buff *skb)
{
	/* Release what the skb holds (no specific drop reason is recorded),
	 * then hand the remaining sk_buff to kfree_skbmem().
	 */
	skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED, false);
	skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED);
	kfree_skbmem(skb);
}
EXPORT_SYMBOL(__kfree_skb);
@@ -1269,7 +1243,7 @@ static void kfree_skb_add_bulk(struct sk_buff *skb,
		return;
	}

	skb_release_all(skb, reason, false);
	skb_release_all(skb, reason);
	sa->skb_array[sa->skb_count++] = skb;

	if (unlikely(sa->skb_count == KFREE_SKB_BULK_SIZE)) {
@@ -1443,7 +1417,7 @@ EXPORT_SYMBOL(consume_skb);
void __consume_stateless_skb(struct sk_buff *skb)
{
	trace_consume_skb(skb, __builtin_return_address(0));
	/* Only the data is released here: skb_release_data() is called
	 * directly rather than skb_release_all(), so
	 * skb_release_head_state() is not run on this path.
	 */
	skb_release_data(skb, SKB_CONSUMED, false);
	skb_release_data(skb, SKB_CONSUMED);
	kfree_skbmem(skb);
}

@@ -1470,7 +1444,7 @@ static void napi_skb_cache_put(struct sk_buff *skb)

void __napi_kfree_skb(struct sk_buff *skb, enum skb_drop_reason reason)
{
	/* Release what the skb holds, recording @reason, then pass the
	 * sk_buff to napi_skb_cache_put() instead of kfree_skbmem().
	 */
	skb_release_all(skb, reason, true);
	skb_release_all(skb, reason);
	napi_skb_cache_put(skb);
}

@@ -1508,7 +1482,7 @@ void napi_consume_skb(struct sk_buff *skb, int budget)
		return;
	}

	skb_release_all(skb, SKB_CONSUMED, !!budget);
	skb_release_all(skb, SKB_CONSUMED);
	napi_skb_cache_put(skb);
}
EXPORT_SYMBOL(napi_consume_skb);
@@ -1639,7 +1613,7 @@ EXPORT_SYMBOL_GPL(alloc_skb_for_msg);
 */
struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
{
	/* Whatever @dst currently holds is released (as SKB_CONSUMED) before
	 * @dst and @src are handed to __skb_clone(), whose result is returned.
	 */
	skb_release_all(dst, SKB_CONSUMED, false);
	skb_release_all(dst, SKB_CONSUMED);
	return __skb_clone(dst, src);
}
EXPORT_SYMBOL_GPL(skb_morph);
@@ -2271,9 +2245,9 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
		if (skb_has_frag_list(skb))
			skb_clone_fraglist(skb);

		skb_release_data(skb, SKB_CONSUMED, false);
		skb_release_data(skb, SKB_CONSUMED);
	} else {
		skb_free_head(skb, false);
		skb_free_head(skb);
	}
	off = (data + nhead) - skb->head;

@@ -6574,12 +6548,12 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off,
			skb_frag_ref(skb, i);
		if (skb_has_frag_list(skb))
			skb_clone_fraglist(skb);
		skb_release_data(skb, SKB_CONSUMED, false);
		skb_release_data(skb, SKB_CONSUMED);
	} else {
		/* we can reuse existing refcount - all we did was
		 * relocate values
		 */
		skb_free_head(skb, false);
		skb_free_head(skb);
	}

	skb->head = data;
@@ -6714,7 +6688,7 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off,
		skb_kfree_head(data, size);
		return -ENOMEM;
	}
	skb_release_data(skb, SKB_CONSUMED, false);
	skb_release_data(skb, SKB_CONSUMED);

	skb->head = data;
	skb->head_frag = 0;
+1 −1
Original line number Diff line number Diff line
@@ -114,7 +114,7 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp, struct sk_buff *skb)
	 */
	if (req->src != req->dst)
		for (sg = sg_next(req->src); sg; sg = sg_next(sg))
			skb_page_unref(skb, sg_page(sg), false);
			skb_page_unref(skb, sg_page(sg));
}

#ifdef CONFIG_INET_ESPINTCP
+1 −1
Original line number Diff line number Diff line
@@ -131,7 +131,7 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp, struct sk_buff *skb)
	 */
	if (req->src != req->dst)
		for (sg = sg_next(req->src); sg; sg = sg_next(sg))
			skb_page_unref(skb, sg_page(sg), false);
			skb_page_unref(skb, sg_page(sg));
}

#ifdef CONFIG_INET6_ESPINTCP