Commit dd3188dd authored by Paolo Abeni's avatar Paolo Abeni
Browse files

Merge branch 'net-remove-the-single-page-frag-cache-for-good'

Paolo Abeni says:

====================
net: remove the single page frag cache for good

This is another attempt at reverting commit dbae2b06 ("net: skb:
introduce and use a single page frag cache"), as it causes regressions
in specific use-cases.

Reverting such commit uncovers an allocation issue for build with
CONFIG_MAX_SKB_FRAGS=45, as reported by Sabrina.

This series handle the latter in patch 1 and brings the revert in patch
2.

Note that there is a little chicken-egg problem, as I included into the
patch 1's changelog the splat that would be visible only applying first
the revert: I think current patch order is better for bisectability,
still the splat is useful for correct attribution.
====================

Link: https://patch.msgid.link/cover.1739899357.git.pabeni@redhat.com


Signed-off-by: default avatarPaolo Abeni <pabeni@redhat.com>
parents 878e7b11 6bc7e4eb
Loading
Loading
Loading
Loading
+0 −1
Original line number Diff line number Diff line
@@ -4117,7 +4117,6 @@ void netif_receive_skb_list(struct list_head *head);
gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb);
void napi_gro_flush(struct napi_struct *napi, bool flush_old);
struct sk_buff *napi_get_frags(struct napi_struct *napi);
void napi_get_frags_check(struct napi_struct *napi);
gro_result_t napi_gro_frags(struct napi_struct *napi);

static inline void napi_free_frags(struct napi_struct *napi)
+3 −0
Original line number Diff line number Diff line
@@ -11,6 +11,9 @@
#include <net/udp.h>
#include <net/hotdata.h>

/* This should be increased if a protocol with a bigger head is added. */
#define GRO_MAX_HEAD (MAX_HEADER + 128)

struct napi_gro_cb {
	union {
		struct {
+17 −0
Original line number Diff line number Diff line
@@ -6991,6 +6991,23 @@ netif_napi_dev_list_add(struct net_device *dev, struct napi_struct *napi)
	list_add_rcu(&napi->dev_list, higher); /* adds after higher */
}

/* Double check that napi_get_frags() allocates skbs with
 * skb->head being backed by slab, not a page fragment.
 * This is to make sure bug fixed in 3226b158e67c
 * ("net: avoid 32 x truesize under-estimation for tiny skbs")
 * does not accidentally come back.
 */
static void napi_get_frags_check(struct napi_struct *napi)
{
	struct sk_buff *skb;

	local_bh_disable();
	skb = napi_get_frags(napi);
	WARN_ON_ONCE(skb && skb->head_frag);
	napi_free_frags(napi);
	local_bh_enable();
}

void netif_napi_add_weight_locked(struct net_device *dev,
				  struct napi_struct *napi,
				  int (*poll)(struct napi_struct *, int),
+0 −3
Original line number Diff line number Diff line
@@ -7,9 +7,6 @@

#define MAX_GRO_SKBS 8

/* This should be increased if a protocol with a bigger head is added. */
#define GRO_MAX_HEAD (MAX_HEADER + 128)

static DEFINE_SPINLOCK(offload_lock);

/**
+10 −100
Original line number Diff line number Diff line
@@ -69,6 +69,7 @@
#include <net/dst.h>
#include <net/sock.h>
#include <net/checksum.h>
#include <net/gro.h>
#include <net/gso.h>
#include <net/hotdata.h>
#include <net/ip6_checksum.h>
@@ -95,7 +96,9 @@
static struct kmem_cache *skbuff_ext_cache __ro_after_init;
#endif

#define SKB_SMALL_HEAD_SIZE SKB_HEAD_ALIGN(MAX_TCP_HEADER)
#define GRO_MAX_HEAD_PAD (GRO_MAX_HEAD + NET_SKB_PAD + NET_IP_ALIGN)
#define SKB_SMALL_HEAD_SIZE SKB_HEAD_ALIGN(max(MAX_TCP_HEADER, \
					       GRO_MAX_HEAD_PAD))

/* We want SKB_SMALL_HEAD_CACHE_SIZE to not be a power of two.
 * This should ensure that SKB_SMALL_HEAD_HEADROOM is a unique
@@ -220,67 +223,9 @@ static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr)
#define NAPI_SKB_CACHE_BULK	16
#define NAPI_SKB_CACHE_HALF	(NAPI_SKB_CACHE_SIZE / 2)

#if PAGE_SIZE == SZ_4K

#define NAPI_HAS_SMALL_PAGE_FRAG	1
#define NAPI_SMALL_PAGE_PFMEMALLOC(nc)	((nc).pfmemalloc)

/* specialized page frag allocator using a single order 0 page
 * and slicing it into 1K sized fragment. Constrained to systems
 * with a very limited amount of 1K fragments fitting a single
 * page - to avoid excessive truesize underestimation
 */

struct page_frag_1k {
	void *va;
	u16 offset;
	bool pfmemalloc;
};

static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp)
{
	struct page *page;
	int offset;

	offset = nc->offset - SZ_1K;
	if (likely(offset >= 0))
		goto use_frag;

	page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
	if (!page)
		return NULL;

	nc->va = page_address(page);
	nc->pfmemalloc = page_is_pfmemalloc(page);
	offset = PAGE_SIZE - SZ_1K;
	page_ref_add(page, offset / SZ_1K);

use_frag:
	nc->offset = offset;
	return nc->va + offset;
}
#else

/* the small page is actually unused in this build; add dummy helpers
 * to please the compiler and avoid later preprocessor's conditionals
 */
#define NAPI_HAS_SMALL_PAGE_FRAG	0
#define NAPI_SMALL_PAGE_PFMEMALLOC(nc)	false

struct page_frag_1k {
};

static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp_mask)
{
	return NULL;
}

#endif

struct napi_alloc_cache {
	local_lock_t bh_lock;
	struct page_frag_cache page;
	struct page_frag_1k page_small;
	unsigned int skb_count;
	void *skb_cache[NAPI_SKB_CACHE_SIZE];
};
@@ -290,23 +235,6 @@ static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache) = {
	.bh_lock = INIT_LOCAL_LOCK(bh_lock),
};

/* Double check that napi_get_frags() allocates skbs with
 * skb->head being backed by slab, not a page fragment.
 * This is to make sure bug fixed in 3226b158e67c
 * ("net: avoid 32 x truesize under-estimation for tiny skbs")
 * does not accidentally come back.
 */
void napi_get_frags_check(struct napi_struct *napi)
{
	struct sk_buff *skb;

	local_bh_disable();
	skb = napi_get_frags(napi);
	WARN_ON_ONCE(!NAPI_HAS_SMALL_PAGE_FRAG && skb && skb->head_frag);
	napi_free_frags(napi);
	local_bh_enable();
}

void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
{
	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
@@ -736,7 +664,7 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
	/* If requested length is either too small or too big,
	 * we use kmalloc() for skb->head allocation.
	 */
	if (len <= SKB_WITH_OVERHEAD(1024) ||
	if (len <= SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE) ||
	    len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
	    (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
		skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE);
@@ -813,10 +741,8 @@ struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len)

	/* If requested length is either too small or too big,
	 * we use kmalloc() for skb->head allocation.
	 * When the small frag allocator is available, prefer it over kmalloc
	 * for small fragments
	 */
	if ((!NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) ||
	if (len <= SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE) ||
	    len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
	    (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
		skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI,
@@ -826,32 +752,16 @@ struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len)
		goto skb_success;
	}

	len = SKB_HEAD_ALIGN(len);

	if (sk_memalloc_socks())
		gfp_mask |= __GFP_MEMALLOC;

	local_lock_nested_bh(&napi_alloc_cache.bh_lock);
	nc = this_cpu_ptr(&napi_alloc_cache);
	if (NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) {
		/* we are artificially inflating the allocation size, but
		 * that is not as bad as it may look like, as:
		 * - 'len' less than GRO_MAX_HEAD makes little sense
		 * - On most systems, larger 'len' values lead to fragment
		 *   size above 512 bytes
		 * - kmalloc would use the kmalloc-1k slab for such values
		 * - Builds with smaller GRO_MAX_HEAD will very likely do
		 *   little networking, as that implies no WiFi and no
		 *   tunnels support, and 32 bits arches.
		 */
		len = SZ_1K;

		data = page_frag_alloc_1k(&nc->page_small, gfp_mask);
		pfmemalloc = NAPI_SMALL_PAGE_PFMEMALLOC(nc->page_small);
	} else {
		len = SKB_HEAD_ALIGN(len);

	data = page_frag_alloc(&nc->page, len, gfp_mask);
	pfmemalloc = page_frag_cache_is_pfmemalloc(&nc->page);
	}
	local_unlock_nested_bh(&napi_alloc_cache.bh_lock);

	if (unlikely(!data))