Commit 191ff13e authored by Eric Dumazet's avatar Eric Dumazet Committed by Paolo Abeni
Browse files

net_sched: add qdisc_dequeue_drop() helper



Some qdisc like cake, codel, fq_codel might drop packets
in their dequeue() method.

This is currently problematic because dequeue() runs with
the qdisc spinlock held. Freeing skbs can be extremely expensive.

Add qdisc_dequeue_drop() method and a new TCQ_F_DEQUEUE_DROPS
so that these qdiscs can opt-in to defer the skb frees
after the socket spinlock is released.

TCQ_F_DEQUEUE_DROPS is an attempt to not penalize other qdiscs
with an extra cache line miss.

Signed-off-by: default avatarEric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20251121083256.674562-14-edumazet@google.com


Signed-off-by: default avatarPaolo Abeni <pabeni@redhat.com>
parent 0170d7f4
Loading
Loading
Loading
Loading
+3 −2
Original line number Diff line number Diff line
@@ -114,12 +114,13 @@ bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,

void __qdisc_run(struct Qdisc *q);

static inline void qdisc_run(struct Qdisc *q)
static inline struct sk_buff *qdisc_run(struct Qdisc *q)
{
	if (qdisc_run_begin(q)) {
		__qdisc_run(q);
		qdisc_run_end(q);
		return qdisc_run_end(q);
	}
	return NULL;
}

extern const struct nla_policy rtm_tca_policy[TCA_MAX + 1];
+27 −3
Original line number Diff line number Diff line
@@ -88,6 +88,8 @@ struct Qdisc {
#define TCQ_F_INVISIBLE		0x80 /* invisible by default in dump */
#define TCQ_F_NOLOCK		0x100 /* qdisc does not require locking */
#define TCQ_F_OFFLOADED		0x200 /* qdisc is offloaded to HW */
#define TCQ_F_DEQUEUE_DROPS	0x400 /* ->dequeue() can drop packets in q->to_free */

	u32			limit;
	const struct Qdisc_ops	*ops;
	struct qdisc_size_table	__rcu *stab;
@@ -119,6 +121,8 @@ struct Qdisc {

		/* Note : we only change qstats.backlog in fast path. */
		struct gnet_stats_queue	qstats;

		struct sk_buff		*to_free;
	__cacheline_group_end(Qdisc_write);


@@ -218,8 +222,10 @@ static inline bool qdisc_run_begin(struct Qdisc *qdisc)
	return true;
}

static inline void qdisc_run_end(struct Qdisc *qdisc)
static inline struct sk_buff *qdisc_run_end(struct Qdisc *qdisc)
{
	struct sk_buff *to_free = NULL;

	if (qdisc->flags & TCQ_F_NOLOCK) {
		spin_unlock(&qdisc->seqlock);

@@ -232,9 +238,16 @@ static inline void qdisc_run_end(struct Qdisc *qdisc)
		if (unlikely(test_bit(__QDISC_STATE_MISSED,
				      &qdisc->state)))
			__netif_schedule(qdisc);
	} else {
		WRITE_ONCE(qdisc->running, false);
		return NULL;
	}

	if (qdisc->flags & TCQ_F_DEQUEUE_DROPS) {
		to_free = qdisc->to_free;
		if (to_free)
			qdisc->to_free = NULL;
	}
	WRITE_ONCE(qdisc->running, false);
	return to_free;
}

static inline bool qdisc_may_bulk(const struct Qdisc *qdisc)
@@ -1116,6 +1129,17 @@ static inline void tcf_kfree_skb_list(struct sk_buff *skb)
	}
}

static inline void qdisc_dequeue_drop(struct Qdisc *q, struct sk_buff *skb,
				      enum skb_drop_reason reason)
{
	DEBUG_NET_WARN_ON_ONCE(!(q->flags & TCQ_F_DEQUEUE_DROPS));
	DEBUG_NET_WARN_ON_ONCE(q->flags & TCQ_F_NOLOCK);

	tcf_set_drop_reason(skb, reason);
	skb->next = q->to_free;
	q->to_free = skb;
}

/* Instead of calling kfree_skb() while root qdisc lock is held,
 * queue the skb for future freeing at end of __dev_xmit_skb()
 */
+13 −9
Original line number Diff line number Diff line
@@ -4141,7 +4141,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
				 struct net_device *dev,
				 struct netdev_queue *txq)
{
	struct sk_buff *next, *to_free = NULL;
	struct sk_buff *next, *to_free = NULL, *to_free2 = NULL;
	spinlock_t *root_lock = qdisc_lock(q);
	struct llist_node *ll_list, *first_n;
	unsigned long defer_count = 0;
@@ -4160,7 +4160,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
			if (unlikely(!nolock_qdisc_is_empty(q))) {
				rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
				__qdisc_run(q);
				qdisc_run_end(q);
				to_free2 = qdisc_run_end(q);

				goto free_skbs;
			}
@@ -4170,12 +4170,13 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
			    !nolock_qdisc_is_empty(q))
				__qdisc_run(q);

			qdisc_run_end(q);
			return NET_XMIT_SUCCESS;
			to_free2 = qdisc_run_end(q);
			rc = NET_XMIT_SUCCESS;
			goto free_skbs;
		}

		rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
		qdisc_run(q);
		to_free2 = qdisc_run(q);
		goto free_skbs;
	}

@@ -4234,7 +4235,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
		qdisc_bstats_update(q, skb);
		if (sch_direct_xmit(skb, q, dev, txq, root_lock, true))
			__qdisc_run(q);
		qdisc_run_end(q);
		to_free2 = qdisc_run_end(q);
		rc = NET_XMIT_SUCCESS;
	} else {
		int count = 0;
@@ -4246,7 +4247,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
			rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
			count++;
		}
		qdisc_run(q);
		to_free2 = qdisc_run(q);
		if (count != 1)
			rc = NET_XMIT_SUCCESS;
	}
@@ -4255,6 +4256,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,

free_skbs:
	tcf_kfree_skb_list(to_free);
	tcf_kfree_skb_list(to_free2);
	return rc;
}

@@ -5747,8 +5749,9 @@ static __latent_entropy void net_tx_action(void)
		rcu_read_lock();

		while (head) {
			struct Qdisc *q = head;
			spinlock_t *root_lock = NULL;
			struct sk_buff *to_free;
			struct Qdisc *q = head;

			head = head->next_sched;

@@ -5775,9 +5778,10 @@ static __latent_entropy void net_tx_action(void)
			}

			clear_bit(__QDISC_STATE_SCHED, &q->state);
			qdisc_run(q);
			to_free = qdisc_run(q);
			if (root_lock)
				spin_unlock(root_lock);
			tcf_kfree_skb_list(to_free);
		}

		rcu_read_unlock();