Commit 4ca05145 authored by Jakub Kicinski
Browse files

Merge branch 'net-deal-with-sticky-tx-queues'

Eric Dumazet says:

====================
net: deal with sticky tx queues

Back in 2010, Tom Herbert added skb->ooo_okay to TCP flows.

Extend the feature to connected flows for other protocols like UDP.

skb->ooo_okay might never be set for bulk flows that always
have at least one skb in a qdisc queue or NIC queue,
especially if TX completion is delayed because of a stressed cpu
or aggressive interrupt mitigation.

The so-called "strange attractors" have caused many performance
issues, we need to do better now that TCP reacts better to
potential reorders.

Add new net.core.txq_reselection_ms sysctl to let
flows follow XPS and select a more efficient queue.

After this series, we no longer have to make sure threads
are pinned to cpus, they can migrate without adding
too much [spinlock, qdisc, TX completion] pressure anymore.
====================

Link: https://patch.msgid.link/20251013152234.842065-1-edumazet@google.com


Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parents 1c51450f 4a770844
Loading
Loading
Loading
Loading
+17 −0
Original line number Diff line number Diff line
@@ -406,6 +406,23 @@ to SOCK_TXREHASH_DEFAULT (i. e. not overridden by setsockopt).
If set to 1 (default), hash rethink is performed on listening socket.
If set to 0, hash rethink is not performed.

txq_reselection_ms
------------------

Controls how often (in ms) a busy connected flow can select another tx queue.

A reselection is desirable when/if user thread has migrated and XPS
would select a different queue. Same can occur without XPS
if the flow hash has changed.

But switching txq can introduce reorders, especially if the
old queue is under high pressure. Modern TCP stacks deal
well with reorders if they do not happen too often.

To disable this feature, set the value to 0.

Default : 1000

gro_normal_batch
----------------

+1 −0
Original line number Diff line number Diff line
@@ -13,6 +13,7 @@ struct netns_core {
	struct ctl_table_header	*sysctl_hdr;

	int	sysctl_somaxconn;
	int	sysctl_txq_reselection;
	int	sysctl_optmem_max;
	u8	sysctl_txrehash;
	u8	sysctl_tstamp_allow_data;
+14 −15
Original line number Diff line number Diff line
@@ -313,6 +313,7 @@ struct sk_filter;
  *	@sk_bind_phc: SO_TIMESTAMPING bind PHC index of PTP virtual clock
  *	              for timestamping
  *	@sk_tskey: counter to disambiguate concurrent tstamp requests
  *	@sk_tx_queue_mapping_jiffies: time in jiffies of last @sk_tx_queue_mapping refresh.
  *	@sk_zckey: counter to order MSG_ZEROCOPY notifications
  *	@sk_socket: Identd and reporting IO signals
  *	@sk_user_data: RPC layer private data. Write-protected by @sk_callback_lock.
@@ -485,6 +486,7 @@ struct sock {
	unsigned long		sk_pacing_rate; /* bytes per second */
	atomic_t		sk_zckey;
	atomic_t		sk_tskey;
	unsigned long		sk_tx_queue_mapping_jiffies;
	__cacheline_group_end(sock_write_tx);

	__cacheline_group_begin(sock_read_tx);
@@ -1992,7 +1994,15 @@ static inline void sk_tx_queue_set(struct sock *sk, int tx_queue)
	/* Paired with READ_ONCE() in sk_tx_queue_get() and
	 * other WRITE_ONCE() because socket lock might be not held.
	 */
	if (READ_ONCE(sk->sk_tx_queue_mapping) != tx_queue) {
		WRITE_ONCE(sk->sk_tx_queue_mapping, tx_queue);
		WRITE_ONCE(sk->sk_tx_queue_mapping_jiffies, jiffies);
		return;
	}

	/* Refresh sk_tx_queue_mapping_jiffies if too old. */
	if (time_is_before_jiffies(READ_ONCE(sk->sk_tx_queue_mapping_jiffies) + HZ))
		WRITE_ONCE(sk->sk_tx_queue_mapping_jiffies, jiffies);
}

#define NO_QUEUE_MAPPING	USHRT_MAX
@@ -2005,19 +2015,7 @@ static inline void sk_tx_queue_clear(struct sock *sk)
	WRITE_ONCE(sk->sk_tx_queue_mapping, NO_QUEUE_MAPPING);
}

static inline int sk_tx_queue_get(const struct sock *sk)
{
	if (sk) {
		/* Paired with WRITE_ONCE() in sk_tx_queue_clear()
		 * and sk_tx_queue_set().
		 */
		int val = READ_ONCE(sk->sk_tx_queue_mapping);

		if (val != NO_QUEUE_MAPPING)
			return val;
	}
	return -1;
}
int sk_tx_queue_get(const struct sock *sk);

static inline void __sk_rx_queue_set(struct sock *sk,
				     const struct sk_buff *skb,
@@ -2303,6 +2301,7 @@ static inline int skb_copy_to_page_nocache(struct sock *sk, struct iov_iter *fro
	return 0;
}

#define SK_WMEM_ALLOC_BIAS 1
/**
 * sk_wmem_alloc_get - returns write allocations
 * @sk: socket
@@ -2311,7 +2310,7 @@ static inline int skb_copy_to_page_nocache(struct sock *sk, struct iov_iter *fro
 */
static inline int sk_wmem_alloc_get(const struct sock *sk)
{
	return refcount_read(&sk->sk_wmem_alloc) - 1;
	return refcount_read(&sk->sk_wmem_alloc) - SK_WMEM_ALLOC_BIAS;
}

/**
+1 −1
Original line number Diff line number Diff line
@@ -157,7 +157,7 @@ int vcc_create(struct net *net, struct socket *sock, int protocol, int family, i
	memset(&vcc->local, 0, sizeof(struct sockaddr_atmsvc));
	memset(&vcc->remote, 0, sizeof(struct sockaddr_atmsvc));
	vcc->qos.txtp.max_sdu = 1 << 16; /* for meta VCs */
	refcount_set(&sk->sk_wmem_alloc, 1);
	refcount_set(&sk->sk_wmem_alloc, SK_WMEM_ALLOC_BIAS);
	atomic_set(&sk->sk_rmem_alloc, 0);
	vcc->push = NULL;
	vcc->pop = NULL;
+27 −2
Original line number Diff line number Diff line
@@ -4591,6 +4591,32 @@ u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb,
}
EXPORT_SYMBOL(dev_pick_tx_zero);

/**
 * sk_tx_queue_get - fetch the tx queue index cached in a socket
 * @sk: socket to inspect, may be NULL
 *
 * For full sockets the cached index is treated as stale once more than
 * the per-netns sysctl_txq_reselection interval has elapsed since
 * sk_tx_queue_mapping_jiffies was last written. A setting of 0 turns
 * this expiry off. Sockets that are not full sockets skip the expiry
 * check entirely.
 *
 * NOTE(review): the sysctl value is added directly to a jiffies
 * timestamp, so it is presumably stored in jiffies rather than in
 * milliseconds - confirm against the sysctl handler.
 *
 * Return: the cached queue index, or -1 when @sk is NULL, no mapping is
 * recorded, or the mapping has expired.
 */
int sk_tx_queue_get(const struct sock *sk)
{
	unsigned long deadline;
	int mapping, period;

	if (!sk)
		return -1;

	/* Lockless read: pairs with the WRITE_ONCE() done in
	 * sk_tx_queue_set() and sk_tx_queue_clear().
	 */
	mapping = READ_ONCE(sk->sk_tx_queue_mapping);
	if (mapping == NO_QUEUE_MAPPING)
		return -1;

	if (sk_fullsock(sk)) {
		period = READ_ONCE(sock_net(sk)->core.sysctl_txq_reselection);
		if (period) {
			/* Mapping is stale once jiffies has moved past
			 * (last refresh + reselection period).
			 */
			deadline = READ_ONCE(sk->sk_tx_queue_mapping_jiffies) +
				   period;
			if (time_is_before_jiffies(deadline))
				return -1;
		}
	}

	return mapping;
}
EXPORT_SYMBOL(sk_tx_queue_get);

u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
		     struct net_device *sb_dev)
{
@@ -4606,8 +4632,7 @@ u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
		if (new_index < 0)
			new_index = skb_tx_hash(dev, sb_dev, skb);

		if (queue_index != new_index && sk &&
		    sk_fullsock(sk) &&
		if (sk && sk_fullsock(sk) &&
		    rcu_access_pointer(sk->sk_dst_cache))
			sk_tx_queue_set(sk, new_index);

Loading