Commit 2da35e4b authored by Jakub Kicinski
Browse files

Merge branch 'tcp-receive-side-improvements'

Eric Dumazet says:

====================
tcp: receive side improvements

We have set tcp_rmem[2] to 15 MB for about 8 years at Google,
but had some issues for high speed flows on very small RTT.

TCP rx autotuning has a tendency to overestimate the RTT,
thus tp->rcvq_space.space and sk->sk_rcvbuf.

This makes TCP receive queues much bigger than necessary,
to a point cpu caches are evicted before application can
copy the data, on cpus using DDIO.

This series aims to fix this.

- First patch adds tcp_rcvbuf_grow() tracepoint, which was very
  convenient to study the various issues fixed in this series.

- Seven patches fix receiver autotune issues.

- Two patches fix sender side issues.

- Final patch increases tcp_rmem[2] so that TCP speed over WAN
  can meet modern needs.

Tested on a 200Gbit NIC, average max throughput of a single flow:

Before:
 73593 Mbit.

After:
 122514 Mbit.
====================

Link: https://patch.msgid.link/20250513193919.1089692-1-edumazet@google.com


Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parents bebd7b26 572be9bf
Loading
Loading
Loading
Loading
+2 −2
Original line number Diff line number Diff line
@@ -735,7 +735,7 @@ tcp_rmem - vector of 3 INTEGERs: min, default, max
	net.core.rmem_max.  Calling setsockopt() with SO_RCVBUF disables
	automatic tuning of that socket's receive buffer size, in which
	case this value is ignored.
	Default: between 131072 and 6MB, depending on RAM size.
	Default: between 131072 and 32MB, depending on RAM size.

tcp_sack - BOOLEAN
	Enable select acknowledgments (SACKS).
@@ -1099,7 +1099,7 @@ tcp_limit_output_bytes - INTEGER
	limits the number of bytes on qdisc or device to reduce artificial
	RTT/cwnd and reduce bufferbloat.

	Default: 1048576 (16 * 65536)
	Default: 4194304 (4 MB)

tcp_challenge_ack_limit - INTEGER
	Limits number of Challenge ACK sent per second, as recommended
+1 −1
Original line number Diff line number Diff line
@@ -340,7 +340,7 @@ struct tcp_sock {
	} rcv_rtt_est;
/* Receiver queue space */
	struct {
		u32	space;
		int	space;
		u32	seq;
		u64	time;
	} rcvq_space;
+73 −0
Original line number Diff line number Diff line
@@ -213,6 +213,79 @@ DEFINE_EVENT(tcp_event_sk, tcp_rcv_space_adjust,
	TP_ARGS(sk)
);

/*
 * tcp_rcvbuf_grow tracepoint.
 *
 * Takes a snapshot of the receive-side autotuning state of a TCP socket
 * (receiver RTT estimate, bytes copied, bytes queued, out-of-order span,
 * current rcvbuf and scaling ratio) together with the socket's addresses,
 * ports, family, kernel address and cookie.
 *
 * @sk:   socket whose state is recorded.
 * @time: caller-supplied value recorded as-is; its unit is not visible
 *        from this definition.
 */
TRACE_EVENT(tcp_rcvbuf_grow,

	TP_PROTO(struct sock *sk, int time),

	TP_ARGS(sk, time),

	TP_STRUCT__entry(
		__field(int, time)
		__field(__u32, rtt_us)
		__field(__u32, copied)
		__field(__u32, inq)
		__field(__u32, space)
		__field(__u32, ooo_space)
		__field(__u32, rcvbuf)
		__field(__u8, scaling_ratio)
		__field(__u16, sport)
		__field(__u16, dport)
		__field(__u16, family)
		__array(__u8, saddr, 4)
		__array(__u8, daddr, 4)
		__array(__u8, saddr_v6, 16)
		__array(__u8, daddr_v6, 16)
		__field(const void *, skaddr)
		__field(__u64, sock_cookie)
	),

	TP_fast_assign(
		struct inet_sock *inet = inet_sk(sk);
		struct tcp_sock *tp = tcp_sk(sk);
		__be32 *p32;

		__entry->time = time;
		/* The stored estimate is shifted right by 3 before being
		 * recorded (presumably it is kept scaled by 8 - confirm
		 * against the code that updates rcv_rtt_est.rtt_us).
		 */
		__entry->rtt_us = tp->rcv_rtt_est.rtt_us >> 3;
		/* Sequence distance from rcvq_space.seq to copied_seq. */
		__entry->copied = tp->copied_seq - tp->rcvq_space.seq;
		/* Sequence distance from copied_seq to rcv_nxt. */
		__entry->inq = tp->rcv_nxt - tp->copied_seq;
		__entry->space = tp->rcvq_space.space;
		/* 0 when the out-of-order queue is empty, otherwise the
		 * distance from rcv_nxt to the end of the last ooo skb.
		 */
		__entry->ooo_space = RB_EMPTY_ROOT(&tp->out_of_order_queue) ? 0 :
				     TCP_SKB_CB(tp->ooo_last_skb)->end_seq -
				     tp->rcv_nxt;

		__entry->rcvbuf = sk->sk_rcvbuf;
		__entry->scaling_ratio = tp->scaling_ratio;
		/* Ports are converted to host byte order for printing. */
		__entry->sport = ntohs(inet->inet_sport);
		__entry->dport = ntohs(inet->inet_dport);
		__entry->family = sk->sk_family;

		/* IPv4 addresses are written into the 4-byte arrays through
		 * a __be32 pointer, i.e. kept in network byte order.
		 */
		p32 = (__be32 *) __entry->saddr;
		*p32 = inet->inet_saddr;

		p32 = (__be32 *) __entry->daddr;
		*p32 = inet->inet_daddr;

		/* TP_STORE_ADDRS is defined elsewhere; presumably it fills
		 * saddr_v6/daddr_v6 - confirm against its definition.
		 */
		TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr,
			       sk->sk_v6_rcv_saddr, sk->sk_v6_daddr);

		__entry->skaddr = sk;
		__entry->sock_cookie = sock_gen_cookie(sk);
	),

	/* NOTE(review): 'time' is declared int but printed with %u.
	 * Note also that scaling_ratio is printed before rcvbuf, which is
	 * the reverse of their order in the entry layout.
	 */
	TP_printk("time=%u rtt_us=%u copied=%u inq=%u space=%u ooo=%u scaling_ratio=%u rcvbuf=%u "
		  "family=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 "
		  "saddrv6=%pI6c daddrv6=%pI6c skaddr=%p sock_cookie=%llx",
		  __entry->time, __entry->rtt_us, __entry->copied,
		  __entry->inq, __entry->space, __entry->ooo_space,
		  __entry->scaling_ratio, __entry->rcvbuf,
		  show_family_name(__entry->family),
		  __entry->sport, __entry->dport,
		  __entry->saddr, __entry->daddr,
		  __entry->saddr_v6, __entry->daddr_v6,
		  __entry->skaddr,
		  __entry->sock_cookie)
);

TRACE_EVENT(tcp_retransmit_synack,

	TP_PROTO(const struct sock *sk, const struct request_sock *req),
+1 −1
Original line number Diff line number Diff line
@@ -5231,7 +5231,7 @@ void __init tcp_init(void)
	/* Set per-socket limits to no more than 1/128 the pressure threshold */
	limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7);
	max_wshare = min(4UL*1024*1024, limit);
	max_rshare = min(6UL*1024*1024, limit);
	max_rshare = min(32UL*1024*1024, limit);

	init_net.ipv4.sysctl_tcp_wmem[0] = PAGE_SIZE;
	init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024;
+53 −57
Original line number Diff line number Diff line
@@ -664,10 +664,12 @@ EXPORT_IPV6_MOD(tcp_initialize_rcv_mss);
 */
static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
{
	u32 new_sample = tp->rcv_rtt_est.rtt_us;
	long m = sample;
	u32 new_sample, old_sample = tp->rcv_rtt_est.rtt_us;
	long m = sample << 3;

	if (new_sample != 0) {
	if (old_sample == 0 || m < old_sample) {
		new_sample = m;
	} else {
		/* If we sample in larger samples in the non-timestamp
		 * case, we could grossly overestimate the RTT especially
		 * with chatty applications or bulk transfer apps which
@@ -678,17 +680,12 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
		 * else with timestamps disabled convergence takes too
		 * long.
		 */
		if (!win_dep) {
			m -= (new_sample >> 3);
			new_sample += m;
		} else {
			m <<= 3;
			if (m < new_sample)
				new_sample = m;
		}
	} else {
		/* No previous measure. */
		new_sample = m << 3;
		if (win_dep)
			return;
		/* Do not use this sample if receive queue is not empty. */
		if (tp->rcv_nxt != tp->copied_seq)
			return;
		new_sample = old_sample - (old_sample >> 3) + sample;
	}

	tp->rcv_rtt_est.rtt_us = new_sample;
@@ -712,7 +709,7 @@ static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp)
	tp->rcv_rtt_est.time = tp->tcp_mstamp;
}

static s32 tcp_rtt_tsopt_us(const struct tcp_sock *tp)
static s32 tcp_rtt_tsopt_us(const struct tcp_sock *tp, u32 min_delta)
{
	u32 delta, delta_us;

@@ -722,7 +719,7 @@ static s32 tcp_rtt_tsopt_us(const struct tcp_sock *tp)

	if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) {
		if (!delta)
			delta = 1;
			delta = min_delta;
		delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ);
		return delta_us;
	}
@@ -740,13 +737,39 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,

	if (TCP_SKB_CB(skb)->end_seq -
	    TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss) {
		s32 delta = tcp_rtt_tsopt_us(tp);
		s32 delta = tcp_rtt_tsopt_us(tp, 0);

		if (delta >= 0)
		if (delta > 0)
			tcp_rcv_rtt_update(tp, delta, 0);
	}
}

/*
 * Grow sk->sk_rcvbuf (and tp->window_clamp with it) based on the current
 * tp->rcvq_space.space measurement.
 *
 * Nothing is done when the tcp_moderate_rcvbuf sysctl is off or when the
 * socket has SOCK_RCVBUF_LOCK set in sk_userlocks. The buffer is only
 * ever raised, never lowered, and is capped at sysctl_tcp_rmem[2].
 */
static void tcp_rcvbuf_grow(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	const struct net *net = sock_net(sk);
	int win, newbuf, limit;

	/* Autotuning disabled by sysctl. */
	if (!READ_ONCE(net->ipv4.sysctl_tcp_moderate_rcvbuf))
		return;

	/* Receive buffer size is locked on this socket. */
	if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
		return;

	/* Twice the measured space, so that a sender in slow start
	 * can double its rate.
	 */
	win = tp->rcvq_space.space << 1;

	/* Add the span from rcv_nxt to the end of the last
	 * out-of-order skb, when there is one.
	 */
	if (!RB_EMPTY_ROOT(&tp->out_of_order_queue))
		win += TCP_SKB_CB(tp->ooo_last_skb)->end_seq - tp->rcv_nxt;

	limit = READ_ONCE(net->ipv4.sysctl_tcp_rmem[2]);
	newbuf = min_t(u32, tcp_space_from_win(sk, win), limit);

	/* Never shrink the buffer from here. */
	if (newbuf <= sk->sk_rcvbuf)
		return;

	WRITE_ONCE(sk->sk_rcvbuf, newbuf);

	/* Keep the window clamp in step with the new buffer size. */
	WRITE_ONCE(tp->window_clamp, tcp_win_from_space(sk, newbuf));
}
/*
 * This function should be called every time data is copied to user space.
 * It calculates the appropriate TCP receive buffer space.
@@ -754,8 +777,7 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
void tcp_rcv_space_adjust(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u32 copied;
	int time;
	int time, inq, copied;

	trace_tcp_rcv_space_adjust(sk);

@@ -766,45 +788,18 @@ void tcp_rcv_space_adjust(struct sock *sk)

	/* Number of bytes copied to user in last RTT */
	copied = tp->copied_seq - tp->rcvq_space.seq;
	/* Number of bytes in receive queue. */
	inq = tp->rcv_nxt - tp->copied_seq;
	copied -= inq;
	if (copied <= tp->rcvq_space.space)
		goto new_measure;

	/* A bit of theory :
	 * copied = bytes received in previous RTT, our base window
	 * To cope with packet losses, we need a 2x factor
	 * To cope with slow start, and sender growing its cwin by 100 %
	 * every RTT, we need a 4x factor, because the ACK we are sending
	 * now is for the next RTT, not the current one :
	 * <prev RTT . ><current RTT .. ><next RTT .... >
	 */

	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) &&
	    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
		u64 rcvwin, grow;
		int rcvbuf;

		/* minimal window to cope with packet losses, assuming
		 * steady state. Add some cushion because of small variations.
		 */
		rcvwin = ((u64)copied << 1) + 16 * tp->advmss;
	trace_tcp_rcvbuf_grow(sk, time);

		/* Accommodate for sender rate increase (eg. slow start) */
		grow = rcvwin * (copied - tp->rcvq_space.space);
		do_div(grow, tp->rcvq_space.space);
		rcvwin += (grow << 1);

		rcvbuf = min_t(u64, tcp_space_from_win(sk, rcvwin),
			       READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]));
		if (rcvbuf > sk->sk_rcvbuf) {
			WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);

			/* Make the window clamp follow along.  */
			WRITE_ONCE(tp->window_clamp,
				   tcp_win_from_space(sk, rcvbuf));
		}
	}
	tp->rcvq_space.space = copied;

	tcp_rcvbuf_grow(sk);

new_measure:
	tp->rcvq_space.seq = tp->copied_seq;
	tp->rcvq_space.time = tp->tcp_mstamp;
@@ -3226,7 +3221,7 @@ static bool tcp_ack_update_rtt(struct sock *sk, const int flag,
	 */
	if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp &&
	    tp->rx_opt.rcv_tsecr && flag & FLAG_ACKED)
		seq_rtt_us = ca_rtt_us = tcp_rtt_tsopt_us(tp);
		seq_rtt_us = ca_rtt_us = tcp_rtt_tsopt_us(tp, 1);

	rs->rtt_us = ca_rtt_us; /* RTT of last (S)ACKed packet (or -1) */
	if (seq_rtt_us < 0)
@@ -5173,6 +5168,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
		skb_condense(skb);
		skb_set_owner_r(skb, sk);
	}
	tcp_rcvbuf_grow(sk);
}

static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb,
@@ -6873,6 +6869,9 @@ tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
		if (!tp->srtt_us)
			tcp_synack_rtt_meas(sk, req);

		if (tp->rx_opt.tstamp_ok)
			tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;

		if (req) {
			tcp_rcv_synrecv_state_fastopen(sk);
		} else {
@@ -6898,9 +6897,6 @@ tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
		tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale;
		tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);

		if (tp->rx_opt.tstamp_ok)
			tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;

		if (!inet_csk(sk)->icsk_ca_ops->cong_control)
			tcp_update_pacing_rate(sk);

Loading