Commit 54a378f4 authored by Eric Dumazet, committed by Paolo Abeni
Browse files

tcp: add the ability to control max RTO



Currently, the TCP stack uses a constant (120 seconds)
to limit the exponential growth of the RTO value.

Some applications want to set a lower value.

Add TCP_RTO_MAX_MS socket option to set a value (in ms)
between 1 and 120 seconds.

It is discouraged to change the socket rto max on a live
socket, as it might lead to unexpected disconnects.

A following patch adds a netns sysctl to control the
default value at socket creation time.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Jason Xing <kerneljasonxing@gmail.com>
Reviewed-by: Neal Cardwell <ncardwell@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
parent 48b69b4c
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -17,6 +17,7 @@ struct timer_list icsk_retransmit_timer read_mostly
struct timer_list                   icsk_delack_timer      read_mostly                             inet_csk_reset_xmit_timer,tcp_connect
u32                                 icsk_rto               read_write                              tcp_cwnd_validate,tcp_schedule_loss_probe,tcp_connect_init,tcp_connect,tcp_write_xmit,tcp_push_one
u32                                 icsk_rto_min
u32                                 icsk_rto_max           read_mostly                             tcp_reset_xmit_timer
u32                                 icsk_delack_max
u32                                 icsk_pmtu_cookie       read_write                              tcp_sync_mss,tcp_current_mss,tcp_send_syn_data,tcp_connect_init,tcp_connect
struct tcp_congestion_ops           icsk_ca_ops            read_write                              tcp_cwnd_validate,tcp_tso_segs,tcp_ca_dst_init,tcp_connect_init,tcp_connect,tcp_write_xmit
+1 −0
Original line number Diff line number Diff line
@@ -90,6 +90,7 @@ struct inet_connection_sock {
 	struct timer_list	  icsk_delack_timer;
	__u32			  icsk_rto;
	__u32                     icsk_rto_min;
	u32			  icsk_rto_max;
	__u32                     icsk_delack_max;
	__u32			  icsk_pmtu_cookie;
	const struct tcp_congestion_ops *icsk_ca_ops;
+11 −5
Original line number Diff line number Diff line
@@ -143,7 +143,8 @@ static_assert((1 << ATO_BITS) > TCP_DELACK_MAX);
#define TCP_DELACK_MIN	4U
#define TCP_ATO_MIN	4U
#endif
/*
 * Largest RTO the stack allows, in seconds.  The TCP_RTO_MAX_MS socket
 * option uses the same value (converted to ms) as its upper limit.
 */
#define TCP_RTO_MAX_SEC 120
/* Same limit expressed in jiffies; single definition, derived from the above. */
#define TCP_RTO_MAX	((unsigned)(TCP_RTO_MAX_SEC * HZ))
#define TCP_RTO_MIN	((unsigned)(HZ / 5))
#define TCP_TIMEOUT_MIN	(2U) /* Min timeout for TCP timers in jiffies */

@@ -740,10 +741,14 @@ int tcp_mtu_to_mss(struct sock *sk, int pmtu);
int tcp_mss_to_mtu(struct sock *sk, int mss);
void tcp_mtup_init(struct sock *sk);

static inline unsigned int tcp_rto_max(const struct sock *sk)
{
	return READ_ONCE(inet_csk(sk)->icsk_rto_max);
}

static inline void tcp_bound_rto(struct sock *sk)
{
	if (inet_csk(sk)->icsk_rto > TCP_RTO_MAX)
		inet_csk(sk)->icsk_rto = TCP_RTO_MAX;
	inet_csk(sk)->icsk_rto = min(inet_csk(sk)->icsk_rto, tcp_rto_max(sk));
}

static inline u32 __tcp_set_rto(const struct tcp_sock *tp)
@@ -1428,7 +1433,8 @@ static inline void tcp_reset_xmit_timer(struct sock *sk,
{
	if (pace_delay)
		when += tcp_pacing_delay(sk);
	inet_csk_reset_xmit_timer(sk, what, when, TCP_RTO_MAX);
	inet_csk_reset_xmit_timer(sk, what, when,
				  tcp_rto_max(sk));
}

/* Something is really bad, we could not queue an additional packet,
+1 −0
Original line number Diff line number Diff line
@@ -136,6 +136,7 @@ enum {
#define TCP_AO_REPAIR		42	/* Get/Set SNEs and ISNs */

#define TCP_IS_MPTCP		43	/* Is MPTCP being used? */
#define TCP_RTO_MAX_MS		44	/* max rto time in ms; setsockopt accepts 1 to 120 seconds */

#define TCP_REPAIR_ON		1
#define TCP_REPAIR_OFF		0
+12 −0
Original line number Diff line number Diff line
@@ -432,6 +432,10 @@ void tcp_init_sock(struct sock *sk)
	INIT_LIST_HEAD(&tp->tsorted_sent_queue);

	icsk->icsk_rto = TCP_TIMEOUT_INIT;

	/* Use a sysctl ? */
	icsk->icsk_rto_max = TCP_RTO_MAX;

	rto_min_us = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rto_min_us);
	icsk->icsk_rto_min = usecs_to_jiffies(rto_min_us);
	icsk->icsk_delack_max = TCP_DELACK_MAX;
@@ -3807,6 +3811,11 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname,
			   secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ,
					   TCP_RTO_MAX / HZ));
		return 0;
	case TCP_RTO_MAX_MS:
		if (val < MSEC_PER_SEC || val > TCP_RTO_MAX_SEC * MSEC_PER_SEC)
			return -EINVAL;
		WRITE_ONCE(inet_csk(sk)->icsk_rto_max, msecs_to_jiffies(val));
		return 0;
	}

	sockopt_lock_sock(sk);
@@ -4643,6 +4652,9 @@ int do_tcp_getsockopt(struct sock *sk, int level,
	case TCP_IS_MPTCP:
		val = 0;
		break;
	case TCP_RTO_MAX_MS:
		val = jiffies_to_msecs(tcp_rto_max(sk));
		break;
	default:
		return -ENOPROTOOPT;
	}
Loading