Commit 8fc8911b authored by Paolo Abeni
Browse files

Merge branch 'tcp-backlog-processing-optims'

Eric Dumazet says:

====================
tcp: backlog processing optims

First patches are mostly preparing the ground for the last one.

The last patch of the series implements a form of ACK reduction,
only for the cases where a TCP receiver is under high stress,
which happens for high-throughput flows.

This gives us a ~20% increase in single TCP flow throughput (100Gbit -> 120Gbit)
====================

Link: https://lore.kernel.org/r/20230911170531.828100-1-edumazet@google.com


Signed-off-by: Paolo Abeni <pabeni@redhat.com>
parents cd8bae85 133c4c0d
Loading
Loading
Loading
Loading
+7 −0
Original line number Diff line number Diff line
@@ -745,6 +745,13 @@ tcp_comp_sack_nr - INTEGER

	Default : 44

tcp_backlog_ack_defer - BOOLEAN
	If set, user thread processing socket backlog tries sending
	one ACK for the whole queue. This helps to avoid potential
	long latencies at end of a TCP socket syscall.

	Default : true

tcp_slow_start_after_idle - BOOLEAN
	If set, provide RFC2861 behavior and time out the congestion
	window after an idle period.  An idle period is defined at
+8 −6
Original line number Diff line number Diff line
@@ -463,15 +463,17 @@ enum tsq_enum {
	TCP_MTU_REDUCED_DEFERRED,  /* tcp_v{4|6}_err() could not call
				    * tcp_v{4|6}_mtu_reduced()
				    */
	TCP_ACK_DEFERRED,	   /* TX pure ack is deferred */
};

/*
 * Mask values built from the tsq_enum bit numbers: every enumerator is a
 * single bit whose position is the matching tsq_enum constant.
 *
 * NOTE(review): this listing is a diff rendering without +/- markers. The
 * "(1UL << ...)" lines look like the removed spellings and the "BIT(...)"
 * lines like their replacements (plus the new TCPF_ACK_DEFERRED) -- confirm
 * against the real header before treating the repeated names as live code.
 */
enum tsq_flags {
	TSQF_THROTTLED			= (1UL << TSQ_THROTTLED),
	TSQF_QUEUED			= (1UL << TSQ_QUEUED),
	TCPF_TSQ_DEFERRED		= (1UL << TCP_TSQ_DEFERRED),
	TCPF_WRITE_TIMER_DEFERRED	= (1UL << TCP_WRITE_TIMER_DEFERRED),
	TCPF_DELACK_TIMER_DEFERRED	= (1UL << TCP_DELACK_TIMER_DEFERRED),
	TCPF_MTU_REDUCED_DEFERRED	= (1UL << TCP_MTU_REDUCED_DEFERRED),
	TSQF_THROTTLED			= BIT(TSQ_THROTTLED),
	TSQF_QUEUED			= BIT(TSQ_QUEUED),
	TCPF_TSQ_DEFERRED		= BIT(TCP_TSQ_DEFERRED),
	TCPF_WRITE_TIMER_DEFERRED	= BIT(TCP_WRITE_TIMER_DEFERRED),
	TCPF_DELACK_TIMER_DEFERRED	= BIT(TCP_DELACK_TIMER_DEFERRED),
	TCPF_MTU_REDUCED_DEFERRED	= BIT(TCP_MTU_REDUCED_DEFERRED),
	/* Mask for TCP_ACK_DEFERRED ("TX pure ack is deferred"). */
	TCPF_ACK_DEFERRED		= BIT(TCP_ACK_DEFERRED),
};

/*
 * Map a pointer to the embedded inet_conn.icsk_inet.sk member back to the
 * enclosing struct tcp_sock, using container_of_const() (presumably so the
 * constness of @ptr carries over to the result -- confirm against that helper).
 */
#define tcp_sk(ptr) container_of_const(ptr, struct tcp_sock, inet_conn.icsk_inet.sk)
+1 −0
Original line number Diff line number Diff line
@@ -132,6 +132,7 @@ struct netns_ipv4 {
	u8 sysctl_tcp_syncookies;
	u8 sysctl_tcp_migrate_req;
	u8 sysctl_tcp_comp_sack_nr;
	u8 sysctl_tcp_backlog_ack_defer;
	int sysctl_tcp_reordering;
	u8 sysctl_tcp_retries1;
	u8 sysctl_tcp_retries2;
+4 −5
Original line number Diff line number Diff line
@@ -1823,13 +1823,12 @@ static inline bool sock_owned_by_user_nocheck(const struct sock *sk)

/*
 * Drop user ownership of @sk: clear sk->sk_lock.owned and record the release
 * in sk->sk_lock.dep_map via mutex_release().
 *
 * NOTE(review): this listing is a diff rendering without +/- markers. The
 * "if (sock_owned_by_user_nocheck(sk)) {" guard and one closing brace look
 * like the removed lines, with the DEBUG_NET_WARN_ON_ONCE() check as their
 * replacement -- confirm against the real header.
 */
static inline void sock_release_ownership(struct sock *sk)
{
	if (sock_owned_by_user_nocheck(sk)) {
	/* Sanity check: complain (once) if the socket is not currently owned. */
	DEBUG_NET_WARN_ON_ONCE(!sock_owned_by_user_nocheck(sk));
	sk->sk_lock.owned = 0;

	/* The sk_lock has mutex_unlock() semantics: */
	mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
}
}

/* no reclassification while locks are held */
static inline bool sock_allow_reclassification(const struct sock *csk)
+3 −3
Original line number Diff line number Diff line
@@ -3001,6 +3001,9 @@ void __sk_flush_backlog(struct sock *sk)
{
	/*
	 * Everything below runs with sk->sk_lock.slock held (spin_lock_bh):
	 * first __release_sock(), then the protocol's release_cb hook.
	 */
	spin_lock_bh(&sk->sk_lock.slock);
	__release_sock(sk);

	/* release_cb is optional; call it only when the protocol provides one. */
	if (sk->sk_prot->release_cb)
		sk->sk_prot->release_cb(sk);
	spin_unlock_bh(&sk->sk_lock.slock);
}
EXPORT_SYMBOL_GPL(__sk_flush_backlog);
@@ -3519,9 +3522,6 @@ void release_sock(struct sock *sk)
	if (sk->sk_backlog.tail)
		__release_sock(sk);

	/* Warning : release_cb() might need to release sk ownership,
	 * ie call sock_release_ownership(sk) before us.
	 */
	if (sk->sk_prot->release_cb)
		sk->sk_prot->release_cb(sk);

Loading