Commit 61e62802 authored by Paolo Abeni's avatar Paolo Abeni
Browse files

Merge branch 'net_sched-speedup-qdisc-dequeue'

Eric Dumazet says:

====================
net_sched: speedup qdisc dequeue

Avoid up to two cache line misses in qdisc dequeue() to fetch
skb_shinfo(skb)->gso_segs/gso_size while qdisc spinlock is held.

Idea is to cache gso_segs at enqueue time before spinlock is
acquired, in the first skb cache line, where we already
have qdisc_skb_cb(skb)->pkt_len.

This series gives a 8 % improvement in a TX intensive workload.

(120 Mpps -> 130 Mpps on a Turin host, IDPF with 32 TX queues)

v2: https://lore.kernel.org/netdev/20251111093204.1432437-1-edumazet@google.com/
v1: https://lore.kernel.org/netdev/20251110094505.3335073-1-edumazet@google.com/T/#m8f562ed148f807c02fd02c6cd243604d449615b9
====================

Link: https://patch.msgid.link/20251121083256.674562-1-edumazet@google.com


Signed-off-by: default avatarPaolo Abeni <pabeni@redhat.com>
parents e3daf0e7 a6efc273
Loading
Loading
Loading
Loading
+3 −2
Original line number Diff line number Diff line
@@ -114,12 +114,13 @@ bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,

void __qdisc_run(struct Qdisc *q);

static inline void qdisc_run(struct Qdisc *q)
static inline struct sk_buff *qdisc_run(struct Qdisc *q)
{
	if (qdisc_run_begin(q)) {
		__qdisc_run(q);
		qdisc_run_end(q);
		return qdisc_run_end(q);
	}
	return NULL;
}

extern const struct nla_policy rtm_tca_policy[TCA_MAX + 1];
+75 −26
Original line number Diff line number Diff line
@@ -88,6 +88,8 @@ struct Qdisc {
#define TCQ_F_INVISIBLE		0x80 /* invisible by default in dump */
#define TCQ_F_NOLOCK		0x100 /* qdisc does not require locking */
#define TCQ_F_OFFLOADED		0x200 /* qdisc is offloaded to HW */
#define TCQ_F_DEQUEUE_DROPS	0x400 /* ->dequeue() can drop packets in q->to_free */

	u32			limit;
	const struct Qdisc_ops	*ops;
	struct qdisc_size_table	__rcu *stab;
@@ -103,17 +105,26 @@ struct Qdisc {
	int			pad;
	refcount_t		refcnt;

	/*
	 * For performance sake on SMP, we put highly modified fields at the end
	 */
	struct sk_buff_head	gso_skb ____cacheline_aligned_in_smp;
	/* Cache line potentially dirtied in dequeue() or __netif_reschedule(). */
	__cacheline_group_begin(Qdisc_read_mostly) ____cacheline_aligned;
		struct sk_buff_head	gso_skb;
		struct Qdisc		*next_sched;
		struct sk_buff_head	skb_bad_txq;
	__cacheline_group_end(Qdisc_read_mostly);

	/* Fields dirtied in dequeue() fast path. */
	__cacheline_group_begin(Qdisc_write) ____cacheline_aligned;
		struct qdisc_skb_head	q;
		unsigned long		state;
		struct gnet_stats_basic_sync bstats;
	struct gnet_stats_queue	qstats;
		bool			running; /* must be written under qdisc spinlock */
	unsigned long		state;
	struct Qdisc            *next_sched;
	struct sk_buff_head	skb_bad_txq;

		/* Note : we only change qstats.backlog in fast path. */
		struct gnet_stats_queue	qstats;

		struct sk_buff		*to_free;
	__cacheline_group_end(Qdisc_write);


	atomic_long_t		defer_count ____cacheline_aligned_in_smp;
	struct llist_head	defer_list;
@@ -211,8 +222,10 @@ static inline bool qdisc_run_begin(struct Qdisc *qdisc)
	return true;
}

static inline void qdisc_run_end(struct Qdisc *qdisc)
static inline struct sk_buff *qdisc_run_end(struct Qdisc *qdisc)
{
	struct sk_buff *to_free = NULL;

	if (qdisc->flags & TCQ_F_NOLOCK) {
		spin_unlock(&qdisc->seqlock);

@@ -225,9 +238,16 @@ static inline void qdisc_run_end(struct Qdisc *qdisc)
		if (unlikely(test_bit(__QDISC_STATE_MISSED,
				      &qdisc->state)))
			__netif_schedule(qdisc);
	} else {
		WRITE_ONCE(qdisc->running, false);
		return NULL;
	}

	if (qdisc->flags & TCQ_F_DEQUEUE_DROPS) {
		to_free = qdisc->to_free;
		if (to_free)
			qdisc->to_free = NULL;
	}
	WRITE_ONCE(qdisc->running, false);
	return to_free;
}

static inline bool qdisc_may_bulk(const struct Qdisc *qdisc)
@@ -429,13 +449,16 @@ struct tcf_proto {
};

struct qdisc_skb_cb {
	struct {
	unsigned int		pkt_len;
		u16			slave_dev_queue_mapping;
	u16			pkt_segs;
	u16			tc_classid;
	};
#define QDISC_CB_PRIV_LEN 20
	unsigned char		data[QDISC_CB_PRIV_LEN];

	u16			slave_dev_queue_mapping;
	u8			post_ct:1;
	u8			post_ct_snat:1;
	u8			post_ct_dnat:1;
};

typedef void tcf_chain_head_change_t(struct tcf_proto *tp_head, void *priv);
@@ -826,6 +849,15 @@ static inline unsigned int qdisc_pkt_len(const struct sk_buff *skb)
	return qdisc_skb_cb(skb)->pkt_len;
}

static inline unsigned int qdisc_pkt_segs(const struct sk_buff *skb)
{
	u32 pkt_segs = qdisc_skb_cb(skb)->pkt_segs;

	DEBUG_NET_WARN_ON_ONCE(pkt_segs !=
			(skb_is_gso(skb) ? skb_shinfo(skb)->gso_segs : 1));
	return pkt_segs;
}

/* additional qdisc xmit flags (NET_XMIT_MASK in linux/netdevice.h) */
enum net_xmit_qdisc_t {
	__NET_XMIT_STOLEN = 0x00010000,
@@ -867,9 +899,7 @@ static inline void _bstats_update(struct gnet_stats_basic_sync *bstats,
static inline void bstats_update(struct gnet_stats_basic_sync *bstats,
				 const struct sk_buff *skb)
{
	_bstats_update(bstats,
		       qdisc_pkt_len(skb),
		       skb_is_gso(skb) ? skb_shinfo(skb)->gso_segs : 1);
	_bstats_update(bstats, qdisc_pkt_len(skb), qdisc_pkt_segs(skb));
}

static inline void qdisc_bstats_cpu_update(struct Qdisc *sch,
@@ -1064,11 +1094,8 @@ struct tc_skb_cb {
	struct qdisc_skb_cb qdisc_cb;
	u32 drop_reason;

	u16 zone; /* Only valid if post_ct = true */
	u16 zone; /* Only valid if qdisc_skb_cb(skb)->post_ct = true */
	u16 mru;
	u8 post_ct:1;
	u8 post_ct_snat:1;
	u8 post_ct_dnat:1;
};

static inline struct tc_skb_cb *tc_skb_cb(const struct sk_buff *skb)
@@ -1091,6 +1118,28 @@ static inline void tcf_set_drop_reason(const struct sk_buff *skb,
	tc_skb_cb(skb)->drop_reason = reason;
}

static inline void tcf_kfree_skb_list(struct sk_buff *skb)
{
	while (unlikely(skb)) {
		struct sk_buff *next = skb->next;

		prefetch(next);
		kfree_skb_reason(skb, tcf_get_drop_reason(skb));
		skb = next;
	}
}

static inline void qdisc_dequeue_drop(struct Qdisc *q, struct sk_buff *skb,
				      enum skb_drop_reason reason)
{
	DEBUG_NET_WARN_ON_ONCE(!(q->flags & TCQ_F_DEQUEUE_DROPS));
	DEBUG_NET_WARN_ON_ONCE(q->flags & TCQ_F_NOLOCK);

	tcf_set_drop_reason(skb, reason);
	skb->next = q->to_free;
	q->to_free = skb;
}

/* Instead of calling kfree_skb() while root qdisc lock is held,
 * queue the skb for future freeing at end of __dev_xmit_skb()
 */
+35 −27
Original line number Diff line number Diff line
@@ -4069,17 +4069,23 @@ struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *d
}
EXPORT_SYMBOL_GPL(validate_xmit_skb_list);

static void qdisc_pkt_len_init(struct sk_buff *skb)
static void qdisc_pkt_len_segs_init(struct sk_buff *skb)
{
	const struct skb_shared_info *shinfo = skb_shinfo(skb);
	struct skb_shared_info *shinfo = skb_shinfo(skb);
	u16 gso_segs;

	qdisc_skb_cb(skb)->pkt_len = skb->len;
	if (!shinfo->gso_size) {
		qdisc_skb_cb(skb)->pkt_segs = 1;
		return;
	}

	qdisc_skb_cb(skb)->pkt_segs = gso_segs = shinfo->gso_segs;

	/* To get more precise estimation of bytes sent on wire,
	 * we add to pkt_len the headers size of all segments
	 */
	if (shinfo->gso_size && skb_transport_header_was_set(skb)) {
		u16 gso_segs = shinfo->gso_segs;
	if (skb_transport_header_was_set(skb)) {
		unsigned int hdr_len;

		/* mac layer + network layer */
@@ -4112,6 +4118,8 @@ static void qdisc_pkt_len_init(struct sk_buff *skb)
			if (payload <= 0)
				return;
			gso_segs = DIV_ROUND_UP(payload, shinfo->gso_size);
			shinfo->gso_segs = gso_segs;
			qdisc_skb_cb(skb)->pkt_segs = gso_segs;
		}
		qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
	}
@@ -4133,7 +4141,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
				 struct net_device *dev,
				 struct netdev_queue *txq)
{
	struct sk_buff *next, *to_free = NULL;
	struct sk_buff *next, *to_free = NULL, *to_free2 = NULL;
	spinlock_t *root_lock = qdisc_lock(q);
	struct llist_node *ll_list, *first_n;
	unsigned long defer_count = 0;
@@ -4152,9 +4160,9 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
			if (unlikely(!nolock_qdisc_is_empty(q))) {
				rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
				__qdisc_run(q);
				qdisc_run_end(q);
				to_free2 = qdisc_run_end(q);

				goto no_lock_out;
				goto free_skbs;
			}

			qdisc_bstats_cpu_update(q, skb);
@@ -4162,18 +4170,14 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
			    !nolock_qdisc_is_empty(q))
				__qdisc_run(q);

			qdisc_run_end(q);
			return NET_XMIT_SUCCESS;
			to_free2 = qdisc_run_end(q);
			rc = NET_XMIT_SUCCESS;
			goto free_skbs;
		}

		rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
		qdisc_run(q);

no_lock_out:
		if (unlikely(to_free))
			kfree_skb_list_reason(to_free,
					      tcf_get_drop_reason(to_free));
		return rc;
		to_free2 = qdisc_run(q);
		goto free_skbs;
	}

	/* Open code llist_add(&skb->ll_node, &q->defer_list) + queue limit.
@@ -4186,7 +4190,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
	do {
		if (first_n && !defer_count) {
			defer_count = atomic_long_inc_return(&q->defer_count);
			if (unlikely(defer_count > q->limit)) {
			if (unlikely(defer_count > READ_ONCE(q->limit))) {
				kfree_skb_reason(skb, SKB_DROP_REASON_QDISC_DROP);
				return NET_XMIT_DROP;
			}
@@ -4231,26 +4235,28 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
		qdisc_bstats_update(q, skb);
		if (sch_direct_xmit(skb, q, dev, txq, root_lock, true))
			__qdisc_run(q);
		qdisc_run_end(q);
		to_free2 = qdisc_run_end(q);
		rc = NET_XMIT_SUCCESS;
	} else {
		int count = 0;

		llist_for_each_entry_safe(skb, next, ll_list, ll_node) {
			prefetch(next);
			prefetch(&next->priority);
			skb_mark_not_on_list(skb);
			rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
			count++;
		}
		qdisc_run(q);
		to_free2 = qdisc_run(q);
		if (count != 1)
			rc = NET_XMIT_SUCCESS;
	}
unlock:
	spin_unlock(root_lock);
	if (unlikely(to_free))
		kfree_skb_list_reason(to_free,
				      tcf_get_drop_reason(to_free));

free_skbs:
	tcf_kfree_skb_list(to_free);
	tcf_kfree_skb_list(to_free2);
	return rc;
}

@@ -4355,7 +4361,7 @@ static int tc_run(struct tcx_entry *entry, struct sk_buff *skb,
		return ret;

	tc_skb_cb(skb)->mru = 0;
	tc_skb_cb(skb)->post_ct = false;
	qdisc_skb_cb(skb)->post_ct = false;
	tcf_set_drop_reason(skb, *drop_reason);

	mini_qdisc_bstats_cpu_update(miniq, skb);
@@ -4426,7 +4432,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
		*pt_prev = NULL;
	}

	qdisc_skb_cb(skb)->pkt_len = skb->len;
	qdisc_pkt_len_segs_init(skb);
	tcx_set_ingress(skb, true);

	if (static_branch_unlikely(&tcx_needed_key)) {
@@ -4737,7 +4743,7 @@ int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)

	skb_update_prio(skb);

	qdisc_pkt_len_init(skb);
	qdisc_pkt_len_segs_init(skb);
	tcx_set_ingress(skb, false);
#ifdef CONFIG_NET_EGRESS
	if (static_branch_unlikely(&egress_needed_key)) {
@@ -5743,8 +5749,9 @@ static __latent_entropy void net_tx_action(void)
		rcu_read_lock();

		while (head) {
			struct Qdisc *q = head;
			spinlock_t *root_lock = NULL;
			struct sk_buff *to_free;
			struct Qdisc *q = head;

			head = head->next_sched;

@@ -5771,9 +5778,10 @@ static __latent_entropy void net_tx_action(void)
			}

			clear_bit(__QDISC_STATE_SCHED, &q->state);
			qdisc_run(q);
			to_free = qdisc_run(q);
			if (root_lock)
				spin_unlock(root_lock);
			tcf_kfree_skb_list(to_free);
		}

		rcu_read_unlock();
+4 −4
Original line number Diff line number Diff line
@@ -948,9 +948,9 @@ static int tcf_ct_act_nat(struct sk_buff *skb,
		return err & NF_VERDICT_MASK;

	if (action & BIT(NF_NAT_MANIP_SRC))
		tc_skb_cb(skb)->post_ct_snat = 1;
		qdisc_skb_cb(skb)->post_ct_snat = 1;
	if (action & BIT(NF_NAT_MANIP_DST))
		tc_skb_cb(skb)->post_ct_dnat = 1;
		qdisc_skb_cb(skb)->post_ct_dnat = 1;

	return err;
#else
@@ -986,7 +986,7 @@ TC_INDIRECT_SCOPE int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
	tcf_action_update_bstats(&c->common, skb);

	if (clear) {
		tc_skb_cb(skb)->post_ct = false;
		qdisc_skb_cb(skb)->post_ct = false;
		ct = nf_ct_get(skb, &ctinfo);
		if (ct) {
			nf_ct_put(ct);
@@ -1097,7 +1097,7 @@ TC_INDIRECT_SCOPE int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
out_push:
	skb_push_rcsum(skb, nh_ofs);

	tc_skb_cb(skb)->post_ct = true;
	qdisc_skb_cb(skb)->post_ct = true;
	tc_skb_cb(skb)->zone = p->zone;
out_clear:
	if (defrag)
+3 −3
Original line number Diff line number Diff line
@@ -1872,9 +1872,9 @@ int tcf_classify(struct sk_buff *skb,
			}
			ext->chain = last_executed_chain;
			ext->mru = cb->mru;
			ext->post_ct = cb->post_ct;
			ext->post_ct_snat = cb->post_ct_snat;
			ext->post_ct_dnat = cb->post_ct_dnat;
			ext->post_ct = qdisc_skb_cb(skb)->post_ct;
			ext->post_ct_snat = qdisc_skb_cb(skb)->post_ct_snat;
			ext->post_ct_dnat = qdisc_skb_cb(skb)->post_ct_dnat;
			ext->zone = cb->zone;
		}
	}
Loading