Commit 031f1592 authored by Paolo Abeni's avatar Paolo Abeni
Browse files

Merge branch 'net-sched-fix-packet-loops-in-mirred-and-netem'

Jamal Hadi Salim says:

====================
net/sched: Fix packet loops in mirred and netem

This patchset adds a 2-bit per-skb tc_depth counter that travels with
the packet. The existing per-CPU mirred nest tracking loses state
when a packet is deferred through the backlog or moves between CPUs
via XPS/RPS. A per-skb field covers both cases.

Patch 1 adds the tc_depth field in a padding hole in sk_buff.
Patches 2-3 revert the check_netem_in_tree() fix and its tests,
which broke legitimate multi-netem configurations.
Patch 4 uses tc_depth to stop netem duplicate recursion.
Patch 5 uses tc_depth to catch mirred ingress redirect loops.
Patch 6 fixes the infinite loop in the mirred egress blockcast case.
Patch 7 fixes drop stats in early return error scenarios in tcf_mirred_act
for redirect (caught by Sashiko [1]).
Patches 8-9 add mirred and netem test cases.

[1] https://sashiko.dev/#/patchset/20260413082027.2244884-1-hxzene%40gmail.com
====================

Link: https://patch.msgid.link/20260525122556.973584-1-jhs@mojatatu.com


Signed-off-by: default avatarPaolo Abeni <pabeni@redhat.com>
parents 9d5e7a46 0f6e00aa
Loading
Loading
Loading
Loading
+2 −0
Original line number Diff line number Diff line
@@ -821,6 +821,7 @@ enum skb_tstamp_type {
 *	@_sk_redir: socket redirection information for skmsg
 *	@_nfct: Associated connection, if any (with nfctinfo bits)
 *	@skb_iif: ifindex of device we arrived on
 *	@tc_depth: counter for packet duplication
 *	@tc_index: Traffic control index
 *	@hash: the packet hash
 *	@queue_mapping: Queue mapping for multiqueue devices
@@ -1030,6 +1031,7 @@ struct sk_buff {
	__u8			csum_not_inet:1;
#endif
	__u8			unreadable:1;
	__u8			tc_depth:2;
#if defined(CONFIG_NET_SCHED) || defined(CONFIG_NET_XGRESS)
	__u16			tc_index;	/* traffic control index */
#endif
+51 −26
Original line number Diff line number Diff line
@@ -26,6 +26,10 @@
#include <net/tc_act/tc_mirred.h>
#include <net/tc_wrapper.h>

#define MIRRED_DEFER_LIMIT 3
_Static_assert(MIRRED_DEFER_LIMIT <= 3,
	       "MIRRED_DEFER_LIMIT exceeds tc_depth bitfield width");

static LIST_HEAD(mirred_list);
static DEFINE_SPINLOCK(mirred_list_lock);

@@ -234,12 +238,15 @@ tcf_mirred_forward(bool at_ingress, bool want_ingress, struct sk_buff *skb)
{
	int err;

	if (!want_ingress)
	if (!want_ingress) {
		err = tcf_dev_queue_xmit(skb, dev_queue_xmit);
	else if (!at_ingress)
	} else {
		skb->tc_depth++;
		if (!at_ingress)
			err = netif_rx(skb);
		else
			err = netif_receive_skb(skb);
	}

	return err;
}
@@ -365,7 +372,8 @@ static int tcf_blockcast_redir(struct sk_buff *skb, struct tcf_mirred *m,
					 dev_is_mac_header_xmit(dev_prev),
					 m_eaction, retval);

	return retval;
	/* If the packet wasn't redirected, we have to register as a drop */
	return TC_ACT_SHOT;
}

static int tcf_blockcast_mirror(struct sk_buff *skb, struct tcf_mirred *m,
@@ -389,14 +397,12 @@ static int tcf_blockcast_mirror(struct sk_buff *skb, struct tcf_mirred *m,

static int tcf_blockcast(struct sk_buff *skb, struct tcf_mirred *m,
			 const u32 blockid, struct tcf_result *res,
			 int retval)
			 int m_eaction, int retval)
{
	const u32 exception_ifindex = skb->dev->ifindex;
	struct tcf_block *block;
	bool is_redirect;
	int m_eaction;

	m_eaction = READ_ONCE(m->tcfm_eaction);
	is_redirect = tcf_mirred_is_act_redirect(m_eaction);

	/* we are already under rcu protection, so can call block lookup
@@ -405,7 +411,7 @@ static int tcf_blockcast(struct sk_buff *skb, struct tcf_mirred *m,
	block = tcf_block_lookup(dev_net(skb->dev), blockid);
	if (!block || xa_empty(&block->ports)) {
		tcf_action_inc_overlimit_qstats(&m->common);
		return retval;
		return is_redirect ? TC_ACT_SHOT : retval;
	}

	if (is_redirect)
@@ -423,9 +429,10 @@ TC_INDIRECT_SCOPE int tcf_mirred_act(struct sk_buff *skb,
{
	struct tcf_mirred *m = to_mirred(a);
	int retval = READ_ONCE(m->tcf_action);
	bool m_mac_header_xmit, is_redirect;
	struct netdev_xmit *xmit;
	bool m_mac_header_xmit;
	struct net_device *dev;
	bool want_ingress;
	int i, m_eaction;
	u32 blockid;

@@ -434,7 +441,8 @@ TC_INDIRECT_SCOPE int tcf_mirred_act(struct sk_buff *skb,
#else
	xmit = this_cpu_ptr(&softnet_data.xmit);
#endif
	if (unlikely(xmit->sched_mirred_nest >= MIRRED_NEST_LIMIT)) {
	if (unlikely(xmit->sched_mirred_nest >= MIRRED_NEST_LIMIT ||
		     skb->tc_depth >= MIRRED_DEFER_LIMIT)) {
		net_warn_ratelimited("Packet exceeded mirred recursion limit on dev %s\n",
				     netdev_name(skb->dev));
		return TC_ACT_SHOT;
@@ -444,34 +452,51 @@ TC_INDIRECT_SCOPE int tcf_mirred_act(struct sk_buff *skb,
	tcf_action_update_bstats(&m->common, skb);

	blockid = READ_ONCE(m->tcfm_blockid);
	if (blockid)
		return tcf_blockcast(skb, m, blockid, res, retval);
	m_eaction = READ_ONCE(m->tcfm_eaction);
	want_ingress = tcf_mirred_act_wants_ingress(m_eaction);
	if (blockid) {
		if (!want_ingress)
			xmit->sched_mirred_dev[xmit->sched_mirred_nest++] = NULL;
		retval = tcf_blockcast(skb, m, blockid, res, m_eaction, retval);
		if (!want_ingress)
			xmit->sched_mirred_nest--;
		return retval;
	}

	is_redirect = tcf_mirred_is_act_redirect(m_eaction);

	dev = rcu_dereference_bh(m->tcfm_dev);
	if (unlikely(!dev)) {
		pr_notice_once("tc mirred: target device is gone\n");
		tcf_action_inc_overlimit_qstats(&m->common);
		return retval;
		goto err_out;
	}

	if (!want_ingress) {
		for (i = 0; i < xmit->sched_mirred_nest; i++) {
			if (xmit->sched_mirred_dev[i] != dev)
				continue;
			pr_notice_once("tc mirred: loop on device %s\n",
				       netdev_name(dev));
			tcf_action_inc_overlimit_qstats(&m->common);
		return retval;
			goto err_out;
		}

		xmit->sched_mirred_dev[xmit->sched_mirred_nest++] = dev;
	}

	m_mac_header_xmit = READ_ONCE(m->tcfm_mac_header_xmit);
	m_eaction = READ_ONCE(m->tcfm_eaction);

	retval = tcf_mirred_to_dev(skb, m, dev, m_mac_header_xmit, m_eaction,
				   retval);
	if (!want_ingress)
		xmit->sched_mirred_nest--;

	return retval;

err_out:
	if (is_redirect)
		retval = TC_ACT_SHOT;
	return retval;
}

static void tcf_stats_update(struct tc_action *a, u64 bytes, u64 packets,
+3 −44
Original line number Diff line number Diff line
@@ -461,7 +461,8 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
	skb->prev = NULL;

	/* Random duplication */
	if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor, &q->prng))
	if (q->duplicate && skb->tc_depth == 0 &&
	    q->duplicate >= get_crandom(&q->dup_cor, &q->prng))
		++count;

	/* Drop packet? */
@@ -540,11 +541,9 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
	 */
	if (skb2) {
		struct Qdisc *rootq = qdisc_root_bh(sch);
		u32 dupsave = q->duplicate; /* prevent duplicating a dup... */

		q->duplicate = 0;
		skb2->tc_depth++; /* prevent duplicating a dup... */
		rootq->enqueue(skb2, rootq, to_free);
		q->duplicate = dupsave;
		skb2 = NULL;
	}

@@ -1007,41 +1006,6 @@ static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
	return 0;
}

static const struct Qdisc_class_ops netem_class_ops;

static int check_netem_in_tree(struct Qdisc *sch, bool duplicates,
			       struct netlink_ext_ack *extack)
{
	struct Qdisc *root, *q;
	unsigned int i;

	root = qdisc_root_sleeping(sch);

	if (sch != root && root->ops->cl_ops == &netem_class_ops) {
		if (duplicates ||
		    ((struct netem_sched_data *)qdisc_priv(root))->duplicate)
			goto err;
	}

	if (!qdisc_dev(root))
		return 0;

	hash_for_each(qdisc_dev(root)->qdisc_hash, i, q, hash) {
		if (sch != q && q->ops->cl_ops == &netem_class_ops) {
			if (duplicates ||
			    ((struct netem_sched_data *)qdisc_priv(q))->duplicate)
				goto err;
		}
	}

	return 0;

err:
	NL_SET_ERR_MSG(extack,
		       "netem: cannot mix duplicating netems with other netems in tree");
	return -EINVAL;
}

/* Parse netlink message to set options */
static int netem_change(struct Qdisc *sch, struct nlattr *opt,
			struct netlink_ext_ack *extack)
@@ -1118,11 +1082,6 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt,
	q->gap = qopt->gap;
	q->counter = 0;
	q->loss = qopt->loss;

	ret = check_netem_in_tree(sch, qopt->duplicate, extack);
	if (ret)
		goto unlock;

	q->duplicate = qopt->duplicate;

	/* for compatibility with earlier versions.
+615 −1

File changed.

Preview size limit exceeded, changes collapsed.

+3 −2
Original line number Diff line number Diff line
@@ -702,6 +702,7 @@
            "$TC qdisc add dev $DUMMY parent 1:1 handle 2:0 netem duplicate 100%",
            "$TC filter add dev $DUMMY parent 1:0 protocol ip prio 1 u32 match ip dst 10.10.10.1/32 flowid 1:1",
            "$TC class add dev $DUMMY parent 1:0 classid 1:2 hfsc ls m2 10Mbit",
            "$TC qdisc add dev $DUMMY parent 1:2 handle 3:0 netem duplicate 100%",
            "$TC filter add dev $DUMMY parent 1:0 protocol ip prio 2 u32 match ip dst 10.10.10.2/32 flowid 1:2",
            "ping -c 1 10.10.10.1 -I$DUMMY > /dev/null || true",
            "$TC filter del dev $DUMMY parent 1:0 protocol ip prio 1",
@@ -714,8 +715,8 @@
            {
                "kind": "hfsc",
                "handle": "1:",
                "bytes": 294,
                "packets": 3
                "bytes": 392,
                "packets": 4
            }
        ],
        "matchCount": "1",
Loading