Commit 154dee7c authored by Jakub Kicinski
Browse files

Merge branch 'make-time-wait-reuse-delay-deterministic-and-configurable'

Jakub Sitnicki says:

====================
Make TIME-WAIT reuse delay deterministic and configurable

This patch set is an effort to enable faster reuse of TIME-WAIT sockets.
We have recently talked about the motivation and the idea at Plumbers [1].

Experiment in production
------------------------

We are restarting our experiment on a small set of production nodes as the
code has slightly changed since v1 [2], and there are still a few weeks of
development window to soak the changes. We will report back if we observe
any regressions.

Packetdrill tests
-----------------

The packetdrill tests for TIME-WAIT reuse [3] did not change since v1.
Although we are not touching PAWS code any more, I would still like to add
tests to cover PAWS reject after TW reuse. This, however, requires patching
packetdrill as I mentioned in the last cover letter [2].

[1] https://lpc.events/event/18/contributions/1962/
[2] https://lore.kernel.org/r/20241113-jakub-krn-909-poc-msec-tw-tstamp-v2-0-b0a335247304@cloudflare.com
[3] https://github.com/google/packetdrill/pull/90

v1: https://lore.kernel.org/20241204-jakub-krn-909-poc-msec-tw-tstamp-v1-0-8b54467a0f34@cloudflare.com
RFCv2: https://lore.kernel.org/20241113-jakub-krn-909-poc-msec-tw-tstamp-v2-0-b0a335247304@cloudflare.com
RFCv1: https://lore.kernel.org/20240819-jakub-krn-909-poc-msec-tw-tstamp-v1-1-6567b5006fbe@cloudflare.com
====================

Link: https://patch.msgid.link/20241209-jakub-krn-909-poc-msec-tw-tstamp-v2-0-66aca0eed03e@cloudflare.com


Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parents 148328b5 ca6a6f93
Loading
Loading
Loading
Loading
+14 −0
Original line number Diff line number Diff line
@@ -1000,6 +1000,20 @@ tcp_tw_reuse - INTEGER

	Default: 2

tcp_tw_reuse_delay - UNSIGNED INTEGER
        The delay in milliseconds before a TIME-WAIT socket can be reused by a
        new connection, if TIME-WAIT socket reuse is enabled. The actual reuse
        threshold is within the [N, N+1] range, where N is the requested delay
        in milliseconds, to ensure the delay interval is never shorter than the
        configured value.

        This setting contains an assumption about the peer's TCP timestamp
        clock tick interval. It should not be set to a value lower than the
        peer's clock tick for the PAWS (Protection Against Wrapped Sequence
        numbers) mechanism to work correctly for the reused connection.

        Default: 1000 (milliseconds)

tcp_window_scaling - BOOLEAN
	Enable window scaling as defined in RFC1323.

+1 −0
Original line number Diff line number Diff line
@@ -79,6 +79,7 @@ u8 sysctl_tcp_retries1
u8                              sysctl_tcp_retries2
u8                              sysctl_tcp_orphan_retries
u8                              sysctl_tcp_tw_reuse                                                                  timewait_sock_ops
unsigned_int                    sysctl_tcp_tw_reuse_delay                                                            timewait_sock_ops
int                             sysctl_tcp_fin_timeout                                                               TCP_LAST_ACK/tcp_rcv_state_process
unsigned_int                    sysctl_tcp_notsent_lowat                     read_mostly                             tcp_notsent_lowat/tcp_stream_memory_free
u8                              sysctl_tcp_sack                                                                      tcp_syn_options
+4 −0
Original line number Diff line number Diff line
@@ -74,6 +74,10 @@ struct inet_timewait_sock {
				tw_tos		: 8;
	u32			tw_txhash;
	u32			tw_priority;
	/**
	 * @tw_entry_stamp: Time of entry into %TCP_TIME_WAIT state in msec.
	 */
	u32			tw_entry_stamp;
	struct timer_list	tw_timer;
	struct inet_bind_bucket	*tw_tb;
	struct inet_bind2_bucket	*tw_tb2;
+1 −0
Original line number Diff line number Diff line
@@ -175,6 +175,7 @@ struct netns_ipv4 {
	u8 sysctl_tcp_retries2;
	u8 sysctl_tcp_orphan_retries;
	u8 sysctl_tcp_tw_reuse;
	unsigned int sysctl_tcp_tw_reuse_delay;
	int sysctl_tcp_fin_timeout;
	u8 sysctl_tcp_sack;
	u8 sysctl_tcp_window_scaling;
+10 −0
Original line number Diff line number Diff line
@@ -45,6 +45,7 @@ static unsigned int tcp_child_ehash_entries_max = 16 * 1024 * 1024;
static unsigned int udp_child_hash_entries_max = UDP_HTABLE_SIZE_MAX;
static int tcp_plb_max_rounds = 31;
static int tcp_plb_max_cong_thresh = 256;
static unsigned int tcp_tw_reuse_delay_max = TCP_PAWS_MSL * MSEC_PER_SEC;

/* obsolete */
static int sysctl_tcp_low_latency __read_mostly;
@@ -1065,6 +1066,15 @@ static struct ctl_table ipv4_net_table[] = {
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_TWO,
	},
	{
		.procname	= "tcp_tw_reuse_delay",
		.data		= &init_net.ipv4.sysctl_tcp_tw_reuse_delay,
		.maxlen		= sizeof(unsigned int),
		.mode		= 0644,
		.proc_handler	= proc_douintvec_minmax,
		.extra1		= SYSCTL_ONE,
		.extra2		= &tcp_tw_reuse_delay_max,
	},
	{
		.procname	= "tcp_max_syn_backlog",
		.data		= &init_net.ipv4.sysctl_max_syn_backlog,
Loading