Commit 88be0922 authored by Jakub Kicinski
Browse files
Tony Nguyen says:

====================
ice: fix Rx data path for heavy 9k MTU traffic

Maciej Fijalkowski says:

This patchset fixes a pretty nasty issue reported by Red Hat folks. It
occurred after ~30 minutes (the value varied; the point is that it was
not observed immediately but only after a considerably longer amount of
time) when the ice driver was tortured with jumbo frames via a mix of
iperf traffic executed simultaneously with wrk/nginx on the client/server
sides (basically HTTP and TCP workloads).

The reported splats spanned all the bad things that can happen to the
state of a page - refcount underflow, use-after-free, etc. One of them
looked as follows:

[ 2084.019891] BUG: Bad page state in process swapper/34  pfn:97fcd0
[ 2084.025990] page:00000000a60ee772 refcount:-1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x97fcd0
[ 2084.035462] flags: 0x17ffffc0000000(node=0|zone=2|lastcpupid=0x1fffff)
[ 2084.041990] raw: 0017ffffc0000000 dead000000000100 dead000000000122 0000000000000000
[ 2084.049730] raw: 0000000000000000 0000000000000000 ffffffffffffffff 0000000000000000
[ 2084.057468] page dumped because: nonzero _refcount
[ 2084.062260] Modules linked in: bonding tls sunrpc intel_rapl_msr intel_rapl_common intel_uncore_frequency intel_uncore_frequency_common i10nm_edac nfit libnvdimm x86_pkg_temp_thermal intel_powerclamp coretemp kvm_intel kvm mgag200 irqd
[ 2084.137829] CPU: 34 PID: 0 Comm: swapper/34 Kdump: loaded Not tainted 5.14.0-427.37.1.el9_4.x86_64 #1
[ 2084.147039] Hardware name: Dell Inc. PowerEdge R750/0216NK, BIOS 1.13.2 12/19/2023
[ 2084.154604] Call Trace:
[ 2084.157058]  <IRQ>
[ 2084.159080]  dump_stack_lvl+0x34/0x48
[ 2084.162752]  bad_page.cold+0x63/0x94
[ 2084.166333]  check_new_pages+0xb3/0xe0
[ 2084.170083]  rmqueue_bulk+0x2d2/0x9e0
[ 2084.173749]  ? ktime_get+0x35/0xa0
[ 2084.177159]  rmqueue_pcplist+0x13b/0x210
[ 2084.181081]  rmqueue+0x7d3/0xd40
[ 2084.184316]  ? xas_load+0x9/0xa0
[ 2084.187547]  ? xas_find+0x183/0x1d0
[ 2084.191041]  ? xa_find_after+0xd0/0x130
[ 2084.194879]  ? intel_iommu_iotlb_sync_map+0x89/0xe0
[ 2084.199759]  get_page_from_freelist+0x11f/0x530
[ 2084.204291]  __alloc_pages+0xf2/0x250
[ 2084.207958]  ice_alloc_rx_bufs+0xcc/0x1c0 [ice]
[ 2084.212543]  ice_clean_rx_irq+0x631/0xa20 [ice]
[ 2084.217111]  ice_napi_poll+0xdf/0x2a0 [ice]
[ 2084.221330]  __napi_poll+0x27/0x170
[ 2084.224824]  net_rx_action+0x233/0x2f0
[ 2084.228575]  __do_softirq+0xc7/0x2ac
[ 2084.232155]  __irq_exit_rcu+0xa1/0xc0
[ 2084.235821]  common_interrupt+0x80/0xa0
[ 2084.239662]  </IRQ>
[ 2084.241768]  <TASK>

The fix is mostly about reverting what was done in commit 1dc1a7e7
("ice: Centrallize Rx buffer recycling"), followed by getting the timing
of page_count() storage right and then removing the ice_rx_buf::act
related logic (which was mostly introduced for the purposes of the cited
commit).

Special thanks to Xu Du for providing reproducer and Jacob Keller for
initial extensive analysis.

* '100GbE' of git://git.kernel.org/pub/scm/linux/kernel/git/tnguy/net-queue:
  ice: stop storing XDP verdict within ice_rx_buf
  ice: gather page_count()'s of each frag right before XDP prog call
  ice: put Rx buffers after being done with current frame
====================

Link: https://patch.msgid.link/20250131185415.3741532-1-anthony.l.nguyen@intel.com


Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parents 235174b2 468a1952
Loading
Loading
Loading
Loading
+103 −47
Original line number Diff line number Diff line
@@ -527,15 +527,14 @@ int ice_setup_rx_ring(struct ice_rx_ring *rx_ring)
 * @xdp: xdp_buff used as input to the XDP program
 * @xdp_prog: XDP program to run
 * @xdp_ring: ring to be used for XDP_TX action
 * @rx_buf: Rx buffer to store the XDP action
 * @eop_desc: Last descriptor in packet to read metadata from
 *
 * Returns any of ICE_XDP_{PASS, CONSUMED, TX, REDIR}
 */
static void
static u32
ice_run_xdp(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp,
	    struct bpf_prog *xdp_prog, struct ice_tx_ring *xdp_ring,
	    struct ice_rx_buf *rx_buf, union ice_32b_rx_flex_desc *eop_desc)
	    union ice_32b_rx_flex_desc *eop_desc)
{
	unsigned int ret = ICE_XDP_PASS;
	u32 act;
@@ -574,7 +573,7 @@ ice_run_xdp(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp,
		ret = ICE_XDP_CONSUMED;
	}
exit:
	ice_set_rx_bufs_act(xdp, rx_ring, ret);
	return ret;
}

/**
@@ -860,10 +859,8 @@ ice_add_xdp_frag(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp,
		xdp_buff_set_frags_flag(xdp);
	}

	if (unlikely(sinfo->nr_frags == MAX_SKB_FRAGS)) {
		ice_set_rx_bufs_act(xdp, rx_ring, ICE_XDP_CONSUMED);
	if (unlikely(sinfo->nr_frags == MAX_SKB_FRAGS))
		return -ENOMEM;
	}

	__skb_fill_page_desc_noacc(sinfo, sinfo->nr_frags++, rx_buf->page,
				   rx_buf->page_offset, size);
@@ -924,7 +921,6 @@ ice_get_rx_buf(struct ice_rx_ring *rx_ring, const unsigned int size,
	struct ice_rx_buf *rx_buf;

	rx_buf = &rx_ring->rx_buf[ntc];
	rx_buf->pgcnt = page_count(rx_buf->page);
	prefetchw(rx_buf->page);

	if (!size)
@@ -940,6 +936,31 @@ ice_get_rx_buf(struct ice_rx_ring *rx_ring, const unsigned int size,
	return rx_buf;
}

/**
 * ice_get_pgcnts - snapshot page_count() of every buffer in current frame
 * @rx_ring: Rx descriptor ring whose buffers get their pgcnt refreshed
 *
 * Starting at rx_ring->first_desc, visit rx_ring->nr_frags + 1 Rx buffers
 * (wrapping at rx_ring->count) and record the current page_count() of each
 * underlying page in ice_rx_buf::pgcnt. Meant to be invoked just before the
 * XDP program runs, since the program may change a page's refcount and the
 * recycling logic needs the value from before that point.
 */
static void ice_get_pgcnts(struct ice_rx_ring *rx_ring)
{
	u32 remaining = rx_ring->nr_frags + 1;
	u32 ring_size = rx_ring->count;
	u32 pos = rx_ring->first_desc;

	while (remaining--) {
		struct ice_rx_buf *buf = &rx_ring->rx_buf[pos];

		buf->pgcnt = page_count(buf->page);

		/* ring index wraps back to the start */
		pos++;
		if (pos == ring_size)
			pos = 0;
	}
}

/**
 * ice_build_skb - Build skb around an existing buffer
 * @rx_ring: Rx descriptor ring to transact packets on
@@ -1051,12 +1072,12 @@ ice_construct_skb(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp)
				rx_buf->page_offset + headlen, size,
				xdp->frame_sz);
	} else {
		/* buffer is unused, change the act that should be taken later
		 * on; data was copied onto skb's linear part so there's no
		/* buffer is unused, restore biased page count in Rx buffer;
		 * data was copied onto skb's linear part so there's no
		 * need for adjusting page offset and we can reuse this buffer
		 * as-is
		 */
		rx_buf->act = ICE_SKB_CONSUMED;
		rx_buf->pagecnt_bias++;
	}

	if (unlikely(xdp_buff_has_frags(xdp))) {
@@ -1103,6 +1124,65 @@ ice_put_rx_buf(struct ice_rx_ring *rx_ring, struct ice_rx_buf *rx_buf)
	rx_buf->page = NULL;
}

/**
 * ice_put_rx_mbuf - ice_put_rx_buf() caller, for all frame frags
 * @rx_ring: Rx ring with all the auxiliary data
 * @xdp: XDP buffer carrying linear + frags part
 * @xdp_xmit: XDP_TX/XDP_REDIRECT verdict storage; only dereferenced when
 *	      @verdict has ICE_XDP_TX or ICE_XDP_REDIR set
 * @ntc: a current next_to_clean value to be stored at rx_ring
 * @verdict: return code from XDP program execution
 *
 * Walk through gathered fragments and satisfy internal page
 * recycle mechanism; we take here an action related to verdict
 * returned by XDP program.
 *
 * The walk starts at rx_ring->first_desc, covers rx_ring->nr_frags + 1
 * buffers and wraps at rx_ring->count. Once all buffers have been put,
 * the per-frame state is reset: xdp->data is cleared, rx_ring->first_desc
 * is set to @ntc and rx_ring->nr_frags is zeroed.
 */
static void ice_put_rx_mbuf(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp,
			    u32 *xdp_xmit, u32 ntc, u32 verdict)
{
	/* number of buffers gathered for this frame: linear part + frags */
	u32 nr_frags = rx_ring->nr_frags + 1;
	u32 idx = rx_ring->first_desc;
	u32 cnt = rx_ring->count;
	/* number of buffers the xdp_buff currently refers to; the linear
	 * part always accounts for one
	 */
	u32 post_xdp_frags = 1;
	struct ice_rx_buf *buf;
	int i;

	if (unlikely(xdp_buff_has_frags(xdp)))
		post_xdp_frags += xdp_get_shared_info_from_buff(xdp)->nr_frags;

	/* buffers still referenced by the xdp_buff: apply the verdict to
	 * each of them before putting it
	 */
	for (i = 0; i < post_xdp_frags; i++) {
		buf = &rx_ring->rx_buf[idx];

		if (verdict & (ICE_XDP_TX | ICE_XDP_REDIR)) {
			ice_rx_buf_adjust_pg_offset(buf, xdp->frame_sz);
			*xdp_xmit |= verdict;
		} else if (verdict & ICE_XDP_CONSUMED) {
			/* restore the biased count for this buffer */
			buf->pagecnt_bias++;
		} else if (verdict == ICE_XDP_PASS) {
			ice_rx_buf_adjust_pg_offset(buf, xdp->frame_sz);
		}

		ice_put_rx_buf(rx_ring, buf);

		if (++idx == cnt)
			idx = 0;
	}
	/* handle buffers that represented frags released by XDP prog;
	 * for these we keep pagecnt_bias as-is; refcount from struct page
	 * has been decremented within XDP prog and we do not have to increase
	 * the biased refcnt
	 */
	for (; i < nr_frags; i++) {
		buf = &rx_ring->rx_buf[idx];
		ice_put_rx_buf(rx_ring, buf);
		if (++idx == cnt)
			idx = 0;
	}

	/* reset per-frame state so the next frame starts at @ntc */
	xdp->data = NULL;
	rx_ring->first_desc = ntc;
	rx_ring->nr_frags = 0;
}

/**
 * ice_clean_rx_irq - Clean completed descriptors from Rx ring - bounce buf
 * @rx_ring: Rx descriptor ring to transact packets on
@@ -1120,15 +1200,13 @@ int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget)
	unsigned int total_rx_bytes = 0, total_rx_pkts = 0;
	unsigned int offset = rx_ring->rx_offset;
	struct xdp_buff *xdp = &rx_ring->xdp;
	u32 cached_ntc = rx_ring->first_desc;
	struct ice_tx_ring *xdp_ring = NULL;
	struct bpf_prog *xdp_prog = NULL;
	u32 ntc = rx_ring->next_to_clean;
	u32 cached_ntu, xdp_verdict;
	u32 cnt = rx_ring->count;
	u32 xdp_xmit = 0;
	u32 cached_ntu;
	bool failure;
	u32 first;

	xdp_prog = READ_ONCE(rx_ring->xdp_prog);
	if (xdp_prog) {
@@ -1190,6 +1268,7 @@ int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget)
			xdp_prepare_buff(xdp, hard_start, offset, size, !!offset);
			xdp_buff_clear_frags_flag(xdp);
		} else if (ice_add_xdp_frag(rx_ring, xdp, rx_buf, size)) {
			ice_put_rx_mbuf(rx_ring, xdp, NULL, ntc, ICE_XDP_CONSUMED);
			break;
		}
		if (++ntc == cnt)
@@ -1199,15 +1278,15 @@ int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget)
		if (ice_is_non_eop(rx_ring, rx_desc))
			continue;

		ice_run_xdp(rx_ring, xdp, xdp_prog, xdp_ring, rx_buf, rx_desc);
		if (rx_buf->act == ICE_XDP_PASS)
		ice_get_pgcnts(rx_ring);
		xdp_verdict = ice_run_xdp(rx_ring, xdp, xdp_prog, xdp_ring, rx_desc);
		if (xdp_verdict == ICE_XDP_PASS)
			goto construct_skb;
		total_rx_bytes += xdp_get_buff_len(xdp);
		total_rx_pkts++;

		xdp->data = NULL;
		rx_ring->first_desc = ntc;
		rx_ring->nr_frags = 0;
		ice_put_rx_mbuf(rx_ring, xdp, &xdp_xmit, ntc, xdp_verdict);

		continue;
construct_skb:
		if (likely(ice_ring_uses_build_skb(rx_ring)))
@@ -1217,18 +1296,12 @@ int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget)
		/* exit if we failed to retrieve a buffer */
		if (!skb) {
			rx_ring->ring_stats->rx_stats.alloc_page_failed++;
			rx_buf->act = ICE_XDP_CONSUMED;
			if (unlikely(xdp_buff_has_frags(xdp)))
				ice_set_rx_bufs_act(xdp, rx_ring,
						    ICE_XDP_CONSUMED);
			xdp->data = NULL;
			rx_ring->first_desc = ntc;
			rx_ring->nr_frags = 0;
			break;
			xdp_verdict = ICE_XDP_CONSUMED;
		}
		xdp->data = NULL;
		rx_ring->first_desc = ntc;
		rx_ring->nr_frags = 0;
		ice_put_rx_mbuf(rx_ring, xdp, &xdp_xmit, ntc, xdp_verdict);

		if (!skb)
			break;

		stat_err_bits = BIT(ICE_RX_FLEX_DESC_STATUS0_RXE_S);
		if (unlikely(ice_test_staterr(rx_desc->wb.status_error0,
@@ -1257,23 +1330,6 @@ int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget)
		total_rx_pkts++;
	}

	first = rx_ring->first_desc;
	while (cached_ntc != first) {
		struct ice_rx_buf *buf = &rx_ring->rx_buf[cached_ntc];

		if (buf->act & (ICE_XDP_TX | ICE_XDP_REDIR)) {
			ice_rx_buf_adjust_pg_offset(buf, xdp->frame_sz);
			xdp_xmit |= buf->act;
		} else if (buf->act & ICE_XDP_CONSUMED) {
			buf->pagecnt_bias++;
		} else if (buf->act == ICE_XDP_PASS) {
			ice_rx_buf_adjust_pg_offset(buf, xdp->frame_sz);
		}

		ice_put_rx_buf(rx_ring, buf);
		if (++cached_ntc >= cnt)
			cached_ntc = 0;
	}
	rx_ring->next_to_clean = ntc;
	/* return up to cleaned_count buffers to hardware */
	failure = ice_alloc_rx_bufs(rx_ring, ICE_RX_DESC_UNUSED(rx_ring));
+0 −1
Original line number Diff line number Diff line
@@ -201,7 +201,6 @@ struct ice_rx_buf {
	struct page *page;
	unsigned int page_offset;
	unsigned int pgcnt;
	unsigned int act;
	unsigned int pagecnt_bias;
};

+0 −43
Original line number Diff line number Diff line
@@ -5,49 +5,6 @@
#define _ICE_TXRX_LIB_H_
#include "ice.h"

/**
 * ice_set_rx_bufs_act - propagate Rx buffer action to frags
 * @xdp: XDP buffer representing frame (linear and frags part)
 * @rx_ring: Rx ring struct
 * @act: action to store onto Rx buffers related to XDP buffer parts
 *
 * Set action that should be taken before putting Rx buffer from first frag
 * to the last. The walk starts at rx_ring->first_desc, covers
 * rx_ring->nr_frags + 1 buffers and wraps at rx_ring->count.
 */
static inline void
ice_set_rx_bufs_act(struct xdp_buff *xdp, const struct ice_rx_ring *rx_ring,
		    const unsigned int act)
{
	/* frag count currently recorded in the xdp_buff's shared info */
	u32 sinfo_frags = xdp_get_shared_info_from_buff(xdp)->nr_frags;
	u32 nr_frags = rx_ring->nr_frags + 1;
	u32 idx = rx_ring->first_desc;
	u32 cnt = rx_ring->count;
	struct ice_rx_buf *buf;

	for (int i = 0; i < nr_frags; i++) {
		buf = &rx_ring->rx_buf[idx];
		buf->act = act;

		if (++idx == cnt)
			idx = 0;
	}

	/* adjust pagecnt_bias on frags freed by XDP prog; idx is one past
	 * the last buffer of the frame here, so step backwards over the
	 * trailing buffers no longer counted in the shared info
	 */
	if (sinfo_frags < rx_ring->nr_frags && act == ICE_XDP_CONSUMED) {
		u32 delta = rx_ring->nr_frags - sinfo_frags;

		while (delta) {
			/* step back one slot, wrapping to the ring's end */
			if (idx == 0)
				idx = cnt - 1;
			else
				idx--;
			buf = &rx_ring->rx_buf[idx];
			buf->pagecnt_bias--;
			delta--;
		}
	}
}

/**
 * ice_test_staterr - tests bits in Rx descriptor status and error fields
 * @status_err_n: Rx descriptor status_error0 or status_error1 bits