Commit bdf24b4b authored by David S. Miller's avatar David S. Miller
Browse files

Merge branch 'tcp-ts-usec-resolution'

Eric Dumazet says:

====================
tcp: add optional usec resolution to TCP TS

As discussed in various public places in 2016, Google adopted
usec resolution in RFC 7323 TS values, at Van Jacobson suggestion.

Goals were :

1) better observability of delays in networking stacks/fabrics.

2) better disambiguation of events based on TSval/ecr values.

3) building block for congestion control modules needing usec resolution.

Back then we implemented a schem based on private SYN options
to safely negotiate the feature.

For upstream submission, we chose to use a much simpler route
attribute because this feature is probably going to be used
in private networks.

ip route add 10/8 ... features tcp_usec_ts

References:

https://www.ietf.org/proceedings/97/slides/slides-97-tcpm-tcp-options-for-low-latency-00.pdf
https://datatracker.ietf.org/doc/draft-wang-tcpm-low-latency-opt/



First two patches are fixing old minor bugs and might be taken
by stable teams (thanks to appropriate Fixes: tags)
====================

Acked-by: default avatarNeal Cardwell <ncardwell@google.com>
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parents 35c1b273 a77a0f5c
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -2259,7 +2259,7 @@ static void chtls_rx_ack(struct sock *sk, struct sk_buff *skb)

		if (tp->snd_una != snd_una) {
			tp->snd_una = snd_una;
			tp->rcv_tstamp = tcp_time_stamp(tp);
			tp->rcv_tstamp = tcp_jiffies32;
			if (tp->snd_una == tp->snd_nxt &&
			    !csk_flag_nochk(csk, CSK_TX_FAILOVER))
				csk_reset_flag(csk, CSK_TX_WAIT_IDLE);
+8 −1
Original line number Diff line number Diff line
@@ -152,6 +152,7 @@ struct tcp_request_sock {
	u64				snt_synack; /* first SYNACK sent time */
	bool				tfo_listener;
	bool				is_mptcp;
	s8				req_usec_ts;
#if IS_ENABLED(CONFIG_MPTCP)
	bool				drop_req;
#endif
@@ -257,7 +258,8 @@ struct tcp_sock {
	u8	compressed_ack;
	u8	dup_ack_counter:2,
		tlp_retrans:1,	/* TLP is a retransmission */
		unused:5;
		tcp_usec_ts:1, /* TSval values in usec */
		unused:4;
	u32	chrono_start;	/* Start time in jiffies of a TCP chrono */
	u32	chrono_stat[3];	/* Time in jiffies for chrono_stat stats */
	u8	chrono_type:2,	/* current chronograph type */
@@ -576,4 +578,9 @@ void tcp_sock_set_quickack(struct sock *sk, int val);
int tcp_sock_set_syncnt(struct sock *sk, int val);
int tcp_sock_set_user_timeout(struct sock *sk, int val);

static inline bool dst_tcp_usec_ts(const struct dst_entry *dst)
{
	return dst_feature(dst, RTAX_FEATURE_TCP_USEC_TS);
}

#endif	/* _LINUX_TCP_H */
+2 −1
Original line number Diff line number Diff line
@@ -67,7 +67,8 @@ struct inet_timewait_sock {
	/* And these are ours. */
	unsigned int		tw_transparent  : 1,
				tw_flowlabel	: 20,
				tw_pad		: 3,	/* 3 bits hole */
				tw_usec_ts	: 1,
				tw_pad		: 2,	/* 2 bits hole */
				tw_tos		: 8;
	u32			tw_txhash;
	u32			tw_priority;
+43 −16
Original line number Diff line number Diff line
@@ -166,7 +166,12 @@ static_assert((1 << ATO_BITS) > TCP_DELACK_MAX);
#define MAX_TCP_KEEPCNT		127
#define MAX_TCP_SYNCNT		127

#define TCP_PAWS_24DAYS	(60 * 60 * 24 * 24)
/* Ensure that TCP PAWS checks are relaxed after ~2147 seconds
 * to avoid overflows. This assumes a clock smaller than 1 Mhz.
 * Default clock is 1 Khz, tcp_usec_ts uses 1 Mhz.
 */
#define TCP_PAWS_WRAP (INT_MAX / USEC_PER_SEC)

#define TCP_PAWS_MSL	60		/* Per-host timestamps are invalidated
					 * after this time. It should be equal
					 * (or greater than) TCP_TIMEWAIT_LEN
@@ -798,22 +803,31 @@ static inline u64 tcp_clock_us(void)
	return div_u64(tcp_clock_ns(), NSEC_PER_USEC);
}

/* This should only be used in contexts where tp->tcp_mstamp is up to date */
static inline u32 tcp_time_stamp(const struct tcp_sock *tp)
static inline u64 tcp_clock_ms(void)
{
	return div_u64(tp->tcp_mstamp, USEC_PER_SEC / TCP_TS_HZ);
	return div_u64(tcp_clock_ns(), NSEC_PER_MSEC);
}

/* Convert a nsec timestamp into TCP TSval timestamp (ms based currently) */
static inline u32 tcp_ns_to_ts(u64 ns)
/* TCP Timestamp included in TS option (RFC 1323) can either use ms
 * or usec resolution. Each socket carries a flag to select one or other
 * resolution, as the route attribute could change anytime.
 * Each flow must stick to initial resolution.
 */
static inline u32 tcp_clock_ts(bool usec_ts)
{
	return div_u64(ns, NSEC_PER_SEC / TCP_TS_HZ);
	return usec_ts ? tcp_clock_us() : tcp_clock_ms();
}

/* Could use tcp_clock_us() / 1000, but this version uses a single divide */
static inline u32 tcp_time_stamp_raw(void)
static inline u32 tcp_time_stamp_ms(const struct tcp_sock *tp)
{
	return tcp_ns_to_ts(tcp_clock_ns());
	return div_u64(tp->tcp_mstamp, USEC_PER_MSEC);
}

static inline u32 tcp_time_stamp_ts(const struct tcp_sock *tp)
{
	if (tp->tcp_usec_ts)
		return tp->tcp_mstamp;
	return tcp_time_stamp_ms(tp);
}

void tcp_mstamp_refresh(struct tcp_sock *tp);
@@ -823,17 +837,30 @@ static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0)
	return max_t(s64, t1 - t0, 0);
}

static inline u32 tcp_skb_timestamp(const struct sk_buff *skb)
{
	return tcp_ns_to_ts(skb->skb_mstamp_ns);
}

/* provide the departure time in us unit */
static inline u64 tcp_skb_timestamp_us(const struct sk_buff *skb)
{
	return div_u64(skb->skb_mstamp_ns, NSEC_PER_USEC);
}

/* Provide skb TSval in usec or ms unit */
static inline u32 tcp_skb_timestamp_ts(bool usec_ts, const struct sk_buff *skb)
{
	if (usec_ts)
		return tcp_skb_timestamp_us(skb);

	return div_u64(skb->skb_mstamp_ns, NSEC_PER_MSEC);
}

static inline u32 tcp_tw_tsval(const struct tcp_timewait_sock *tcptw)
{
	return tcp_clock_ts(tcptw->tw_sk.tw_usec_ts) + tcptw->tw_ts_offset;
}

static inline u32 tcp_rsk_tsval(const struct tcp_request_sock *treq)
{
	return tcp_clock_ts(treq->req_usec_ts) + treq->ts_off;
}

#define tcp_flag_byte(th) (((u_int8_t *)th)[13])

@@ -1599,7 +1626,7 @@ static inline bool tcp_paws_check(const struct tcp_options_received *rx_opt,
	if ((s32)(rx_opt->ts_recent - rx_opt->rcv_tsval) <= paws_win)
		return true;
	if (unlikely(!time_before32(ktime_get_seconds(),
				    rx_opt->ts_recent_stamp + TCP_PAWS_24DAYS)))
				    rx_opt->ts_recent_stamp + TCP_PAWS_WRAP)))
		return true;
	/*
	 * Some OSes send SYN and SYNACK messages with tsval=0 tsecr=0,
+11 −7
Original line number Diff line number Diff line
@@ -503,12 +503,16 @@ enum {
#define RTAX_MAX (__RTAX_MAX - 1)

#define RTAX_FEATURE_ECN		(1 << 0)
#define RTAX_FEATURE_SACK	(1 << 1)
#define RTAX_FEATURE_TIMESTAMP	(1 << 2)
#define RTAX_FEATURE_SACK		(1 << 1) /* unused */
#define RTAX_FEATURE_TIMESTAMP		(1 << 2) /* unused */
#define RTAX_FEATURE_ALLFRAG		(1 << 3)
#define RTAX_FEATURE_TCP_USEC_TS	(1 << 4)

#define RTAX_FEATURE_MASK	(RTAX_FEATURE_ECN | RTAX_FEATURE_SACK | \
				 RTAX_FEATURE_TIMESTAMP | RTAX_FEATURE_ALLFRAG)
#define RTAX_FEATURE_MASK	(RTAX_FEATURE_ECN |		\
				 RTAX_FEATURE_SACK |		\
				 RTAX_FEATURE_TIMESTAMP |	\
				 RTAX_FEATURE_ALLFRAG |		\
				 RTAX_FEATURE_TCP_USEC_TS)

struct rta_session {
	__u8	proto;
Loading