Commit ca6a6f93, authored by Jakub Sitnicki, committed by Jakub Kicinski
Browse files

tcp: Add sysctl to configure TIME-WAIT reuse delay

Today we have a hardcoded delay of 1 sec before a TIME-WAIT socket can be
reused by reopening a connection. This is a safe choice based on an
assumption that the other TCP timestamp clock frequency, which is unknown
to us, may be as low as 1 Hz (RFC 7323, section 5.4).

However, this means that in the presence of short-lived connections with an
RTT of a couple of milliseconds, the time during which a 4-tuple is blocked
from reuse can be orders of magnitude longer than the connection lifetime.
Combined with a reduced pool of ephemeral ports, when using
IP_LOCAL_PORT_RANGE to share an egress IP address between hosts [1], the
long TIME-WAIT reuse delay can lead to port exhaustion, where all available
4-tuples are tied up in TIME-WAIT state.

Turn the reuse delay into a per-netns setting so that sysadmins can make
more aggressive assumptions about remote TCP timestamp clock frequency and
shorten the delay in order to allow connections to reincarnate faster.

Note that applications can completely bypass the TIME-WAIT delay protection
already today by locking the local port with bind() before connecting. Such
immediate connection reuse may result in PAWS failing to detect old
duplicate segments, leaving us with just the sequence number check as a
safety net.

This new configurable offers a trade-off where the sysadmin can balance
the risk of PAWS detection failing to act against the risk of exhausting
ports by having sockets tied up in TIME-WAIT state for too long.

[1] https://lpc.events/event/16/contributions/1349/



Signed-off-by: Jakub Sitnicki <jakub@cloudflare.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Jason Xing <kerneljasonxing@gmail.com>
Link: https://patch.msgid.link/20241209-jakub-krn-909-poc-msec-tw-tstamp-v2-2-66aca0eed03e@cloudflare.com


Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parent 19ce8cd3
Loading
Loading
Loading
Loading
+14 −0
Original line number Diff line number Diff line
@@ -1000,6 +1000,20 @@ tcp_tw_reuse - INTEGER

	Default: 2

tcp_tw_reuse_delay - UNSIGNED INTEGER
        The delay in milliseconds before a TIME-WAIT socket can be reused by a
        new connection, if TIME-WAIT socket reuse is enabled. The actual reuse
        threshold is within [N, N+1] range, where N is the requested delay in
        milliseconds, to ensure the delay interval is never shorter than the
        configured value.

        This setting contains an assumption about the other TCP's timestamp
        clock tick interval. It should not be set to a value lower than the
        peer's clock tick for the PAWS (Protection Against Wrapped Sequence
        numbers) mechanism to work correctly for the reused connection.

        Default: 1000 (milliseconds)

tcp_window_scaling - BOOLEAN
	Enable window scaling as defined in RFC1323.

+1 −0
Original line number Diff line number Diff line
@@ -79,6 +79,7 @@ u8 sysctl_tcp_retries1
u8                              sysctl_tcp_retries2
u8                              sysctl_tcp_orphan_retries
u8                              sysctl_tcp_tw_reuse                                                                  timewait_sock_ops
unsigned_int                    sysctl_tcp_tw_reuse_delay                                                            timewait_sock_ops
int                             sysctl_tcp_fin_timeout                                                               TCP_LAST_ACK/tcp_rcv_state_process
unsigned_int                    sysctl_tcp_notsent_lowat                     read_mostly                             tcp_notsent_lowat/tcp_stream_memory_free
u8                              sysctl_tcp_sack                                                                      tcp_syn_options
+1 −0
Original line number Diff line number Diff line
@@ -175,6 +175,7 @@ struct netns_ipv4 {
	u8 sysctl_tcp_retries2;
	u8 sysctl_tcp_orphan_retries;
	u8 sysctl_tcp_tw_reuse;
	unsigned int sysctl_tcp_tw_reuse_delay;
	int sysctl_tcp_fin_timeout;
	u8 sysctl_tcp_sack;
	u8 sysctl_tcp_window_scaling;
+10 −0
Original line number Diff line number Diff line
@@ -45,6 +45,7 @@ static unsigned int tcp_child_ehash_entries_max = 16 * 1024 * 1024;
static unsigned int udp_child_hash_entries_max = UDP_HTABLE_SIZE_MAX;
static int tcp_plb_max_rounds = 31;
static int tcp_plb_max_cong_thresh = 256;
static unsigned int tcp_tw_reuse_delay_max = TCP_PAWS_MSL * MSEC_PER_SEC;

/* obsolete */
static int sysctl_tcp_low_latency __read_mostly;
@@ -1065,6 +1066,15 @@ static struct ctl_table ipv4_net_table[] = {
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_TWO,
	},
	{
		.procname	= "tcp_tw_reuse_delay",
		.data		= &init_net.ipv4.sysctl_tcp_tw_reuse_delay,
		.maxlen		= sizeof(unsigned int),
		.mode		= 0644,
		.proc_handler	= proc_douintvec_minmax,
		.extra1		= SYSCTL_ONE,
		.extra2		= &tcp_tw_reuse_delay_max,
	},
	{
		.procname	= "tcp_max_syn_backlog",
		.data		= &init_net.ipv4.sysctl_max_syn_backlog,
+3 −1
Original line number Diff line number Diff line
@@ -163,7 +163,8 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
	   and use initial timestamp retrieved from peer table.
	 */
	ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp);
	reuse_thresh = READ_ONCE(tw->tw_entry_stamp) + MSEC_PER_SEC;
	reuse_thresh = READ_ONCE(tw->tw_entry_stamp) +
		       READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse_delay);
	if (ts_recent_stamp &&
	    (!twp || (reuse && time_after32(tcp_clock_ms(), reuse_thresh)))) {
		/* inet_twsk_hashdance_schedule() sets sk_refcnt after putting twsk
@@ -3458,6 +3459,7 @@ static int __net_init tcp_sk_init(struct net *net)
	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
	net->ipv4.sysctl_tcp_tw_reuse = 2;
	net->ipv4.sysctl_tcp_tw_reuse_delay = 1 * MSEC_PER_SEC;
	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;

	refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);