Commit 614e8316 authored by Eric Dumazet, committed by David S. Miller
Browse files

tcp: add support for usec resolution in TCP TS values

Back in 2015, Van Jacobson suggested to use usec resolution in TCP TS values.
This has been implemented in our private kernels.

Goals were :

1) better observability of delays in networking stacks.
2) better disambiguation of events based on TSval/ecr values.
3) building block for congestion control modules needing usec resolution.

Back then we implemented a scheme based on private SYN options
to negotiate the feature.

For upstream submission, we chose to use a route attribute,
because this feature is probably going to be used in private
networks [1] [2].

ip route add 10/8 ... features tcp_usec_ts

Note that RFC 7323 recommends a
  "timestamp clock frequency in the range 1 ms to 1 sec per tick.",
but also mentions
  "the maximum acceptable clock frequency is one tick every 59 ns."

[1] Unfortunately RFC 7323 5.5 (Outdated Timestamps) suggests
to invalidate TS.Recent values after a flow was idle for more
than 24 days. This is the part making usec_ts a problem
for peers following this recommendation for long living
idle flows.

[2] Attempts to standardize usec ts went nowhere:

https://www.ietf.org/proceedings/97/slides/slides-97-tcpm-tcp-options-for-low-latency-00.pdf
https://datatracker.ietf.org/doc/draft-wang-tcpm-low-latency-opt/



Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent af772144
Loading
Loading
Loading
Loading
+3 −1
Original line number Diff line number Diff line
@@ -152,6 +152,7 @@ struct tcp_request_sock {
	u64				snt_synack; /* first SYNACK sent time */
	bool				tfo_listener;
	bool				is_mptcp;
	s8				req_usec_ts;
#if IS_ENABLED(CONFIG_MPTCP)
	bool				drop_req;
#endif
@@ -257,7 +258,8 @@ struct tcp_sock {
	u8	compressed_ack;
	u8	dup_ack_counter:2,
		tlp_retrans:1,	/* TLP is a retransmission */
		unused:5;
		tcp_usec_ts:1, /* TSval values in usec */
		unused:4;
	u32	chrono_start;	/* Start time in jiffies of a TCP chrono */
	u32	chrono_stat[3];	/* Time in jiffies for chrono_stat stats */
	u8	chrono_type:2,	/* current chronograph type */
+2 −1
Original line number Diff line number Diff line
@@ -67,7 +67,8 @@ struct inet_timewait_sock {
	/* And these are ours. */
	unsigned int		tw_transparent  : 1,
				tw_flowlabel	: 20,
				tw_pad		: 3,	/* 3 bits hole */
				tw_usec_ts	: 1,
				tw_pad		: 2,	/* 2 bits hole */
				tw_tos		: 8;
	u32			tw_txhash;
	u32			tw_priority;
+4 −2
Original line number Diff line number Diff line
@@ -825,6 +825,8 @@ static inline u32 tcp_time_stamp_ms(const struct tcp_sock *tp)

/*
 * Timestamp value for @tp in the resolution this socket uses:
 * tp->tcp_mstamp (narrowed to the u32 return type) when tp->tcp_usec_ts
 * is set, otherwise whatever tcp_time_stamp_ms() reports for @tp.
 */
static inline u32 tcp_time_stamp_ts(const struct tcp_sock *tp)
{
	/* NOTE(review): presumably tcp_mstamp is a usec clock sample - confirm. */
	if (tp->tcp_usec_ts)
		return tp->tcp_mstamp;
	return tcp_time_stamp_ms(tp);
}

@@ -852,12 +854,12 @@ static inline u32 tcp_skb_timestamp_ts(bool usec_ts, const struct sk_buff *skb)

/*
 * TSval for a timewait socket: tcp_clock_ts() plus the socket's
 * tw_ts_offset.
 *
 * NOTE(review): two return statements appear back to back below. This
 * looks like a diff rendering in which the first line (fixed `false`
 * argument) is the pre-change version and the second (tw_usec_ts
 * argument) is its replacement - confirm against the real header. Taken
 * literally as C, only the first return would ever execute.
 */
static inline u32 tcp_tw_tsval(const struct tcp_timewait_sock *tcptw)
{
	return tcp_clock_ts(false) + tcptw->tw_ts_offset;
	return tcp_clock_ts(tcptw->tw_sk.tw_usec_ts) + tcptw->tw_ts_offset;
}

/*
 * TSval for a request socket: tcp_clock_ts() plus the request's ts_off.
 *
 * NOTE(review): two return statements appear back to back below. This
 * looks like a diff rendering in which the first line (fixed `false`
 * argument) is the pre-change version and the second (req_usec_ts
 * argument) is its replacement - confirm against the real header. Taken
 * literally as C, only the first return would ever execute.
 */
static inline u32 tcp_rsk_tsval(const struct tcp_request_sock *treq)
{
	return tcp_clock_ts(false) + treq->ts_off;
	return tcp_clock_ts(treq->req_usec_ts) + treq->ts_off;
}

#define tcp_flag_byte(th) (((u_int8_t *)th)[13])
+5 −1
Original line number Diff line number Diff line
@@ -84,7 +84,9 @@ u64 cookie_init_timestamp(struct request_sock *req, u64 now)
	if (ts > ts_now)
		ts -= (1UL << TSBITS);

	return ts * (NSEC_PER_SEC / TCP_TS_HZ);
	if (tcp_rsk(req)->req_usec_ts)
		return ts * NSEC_PER_USEC;
	return ts * NSEC_PER_MSEC;
}


@@ -304,6 +306,8 @@ struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops,
	treq->af_specific = af_ops;

	treq->syn_tos = TCP_SKB_CB(skb)->ip_dsfield;
	treq->req_usec_ts = -1;

#if IS_ENABLED(CONFIG_MPTCP)
	treq->is_mptcp = sk_is_mptcp(sk);
	if (treq->is_mptcp) {
+14 −4
Original line number Diff line number Diff line
@@ -3629,10 +3629,16 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname,
			tp->fastopen_no_cookie = val;
		break;
	case TCP_TIMESTAMP:
		if (!tp->repair)
		if (!tp->repair) {
			err = -EPERM;
		else
			WRITE_ONCE(tp->tsoffset, val - tcp_clock_ts(false));
			break;
		}
		/* val is an opaque field,
		 * and low order bit contains usec_ts enable bit.
		 * Its a best effort, and we do not care if user makes an error.
		 */
		tp->tcp_usec_ts = val & 1;
		WRITE_ONCE(tp->tsoffset, val - tcp_clock_ts(tp->tcp_usec_ts));
		break;
	case TCP_REPAIR_WINDOW:
		err = tcp_repair_set_window(tp, optval, optlen);
@@ -4143,7 +4149,11 @@ int do_tcp_getsockopt(struct sock *sk, int level,
		break;

	case TCP_TIMESTAMP:
		val = tcp_clock_ts(false) + READ_ONCE(tp->tsoffset);
		val = tcp_clock_ts(tp->tcp_usec_ts) + READ_ONCE(tp->tsoffset);
		if (tp->tcp_usec_ts)
			val |= 1;
		else
			val &= ~1;
		break;
	case TCP_NOTSENT_LOWAT:
		val = READ_ONCE(tp->notsent_lowat);
Loading