Commit f6003468 authored by Paolo Abeni
Browse files

Merge branch 'accecn-protocol-patch-series'

Chia-Yu Chang says:

====================
AccECN protocol patch series

Please find the v19 AccECN protocol patch series, which covers the core
functionality of Accurate ECN, AccECN negotiation, AccECN TCP options,
and AccECN failure handling. The Accurate ECN draft can be found in
https://datatracker.ietf.org/doc/html/draft-ietf-tcpm-accurate-ecn-28, and it
will be RFC9768.

This patch series is part of the full AccECN patch series, which is available at
https://github.com/L4STeam/linux-net-next/commits/upstream_l4steam/
---
Chia-Yu Chang (3):
  tcp: accecn: AccECN option send control
  tcp: accecn: AccECN option failure handling
  tcp: accecn: try to fit AccECN option with SACK

Ilpo Järvinen (7):
  tcp: AccECN core
  tcp: accecn: AccECN negotiation
  tcp: accecn: add AccECN rx byte counters
  tcp: accecn: AccECN needs to know delivered bytes
  tcp: sack option handling improvements
  tcp: accecn: AccECN option
  tcp: accecn: AccECN option ceb/cep and ACE field multi-wrap heuristics

 Documentation/networking/ip-sysctl.rst        |  55 +-
 .../networking/net_cachelines/tcp_sock.rst    |  12 +
 include/linux/tcp.h                           |  28 +-
 include/net/netns/ipv4.h                      |   2 +
 include/net/tcp.h                             |  33 ++
 include/net/tcp_ecn.h                         | 554 +++++++++++++++++-
 include/uapi/linux/tcp.h                      |   9 +
 net/ipv4/syncookies.c                         |   4 +
 net/ipv4/sysctl_net_ipv4.c                    |  19 +
 net/ipv4/tcp.c                                |  30 +-
 net/ipv4/tcp_input.c                          | 318 +++++++++-
 net/ipv4/tcp_ipv4.c                           |   8 +-
 net/ipv4/tcp_minisocks.c                      |  40 +-
 net/ipv4/tcp_output.c                         | 239 +++++++-
 net/ipv6/syncookies.c                         |   2 +
 net/ipv6/tcp_ipv6.c                           |   1 +
 16 files changed, 1278 insertions(+), 76 deletions(-)
====================

Link: https://patch.msgid.link/20250916082434.100722-1-chia-yu.chang@nokia-bell-labs.com


Signed-off-by: Paolo Abeni <pabeni@redhat.com>
parents 152ba35c e7e9da85
Loading
Loading
Loading
Loading
+44 −11
Original line number Diff line number Diff line
@@ -443,23 +443,56 @@ tcp_early_retrans - INTEGER

tcp_ecn - INTEGER
	Control use of Explicit Congestion Notification (ECN) by TCP.
	ECN is used only when both ends of the TCP connection indicate
	support for it.  This feature is useful in avoiding losses due
	to congestion by allowing supporting routers to signal
	congestion before having to drop packets.
	ECN is used only when both ends of the TCP connection indicate support
	for it. This feature is useful in avoiding losses due to congestion by
	allowing supporting routers to signal congestion before having to drop
	packets. A host that supports ECN both sends ECN at the IP layer and
	feeds back ECN at the TCP layer. The highest variant of ECN feedback
	that both peers support is chosen by the ECN negotiation (Accurate ECN,
	ECN, or no ECN).

	The highest negotiated variant for incoming connection requests
	and the highest variant requested by outgoing connection
	attempts:

	===== ==================== ====================
	Value Incoming connections Outgoing connections
	===== ==================== ====================
	0     No ECN               No ECN
	1     ECN                  ECN
	2     ECN                  No ECN
	3     AccECN               AccECN
	4     AccECN               ECN
	5     AccECN               No ECN
	===== ==================== ====================

	Default: 2

tcp_ecn_option - INTEGER
	Control Accurate ECN (AccECN) option sending when AccECN has been
	successfully negotiated during handshake. Send logic inhibits
	sending AccECN options regardless of this setting when no AccECN
	option has been seen for the reverse direction.

	Possible values are:

		=  =====================================================
		0  Disable ECN.  Neither initiate nor accept ECN.
		1  Enable ECN when requested by incoming connections and
		   also request ECN on outgoing connection attempts.
		2  Enable ECN when requested by incoming connections
		   but do not request ECN on outgoing connections.
		=  =====================================================
	= ============================================================
	0 Never send AccECN option. This also disables sending AccECN
	  option in SYN/ACK during handshake.
	1 Send AccECN option sparingly according to the minimum option
	  rules outlined in draft-ietf-tcpm-accurate-ecn.
	2 Send AccECN option on every packet whenever it fits into TCP
	  option space.
	= ============================================================

	Default: 2

tcp_ecn_option_beacon - INTEGER
	Control how often the Accurate ECN (AccECN) option is sent per RTT.
	It takes effect only when tcp_ecn_option is set to 2.

	Default: 3 (AccECN will be sent at least 3 times per RTT)

tcp_ecn_fallback - BOOLEAN
	If the kernel detects that ECN connection misbehaves, enable fall
	back to non-ECN. Currently, this knob implements the fallback
+12 −0
Original line number Diff line number Diff line
@@ -101,6 +101,18 @@ u32 prr_delivered
u32                           prr_out                 read_mostly         read_mostly         tcp_rate_skb_sent,tcp_newly_delivered(tx);tcp_ack,tcp_rate_gen,tcp_clean_rtx_queue(rx)
u32                           delivered               read_mostly         read_write          tcp_rate_skb_sent, tcp_newly_delivered(tx);tcp_ack, tcp_rate_gen, tcp_clean_rtx_queue (rx)
u32                           delivered_ce            read_mostly         read_write          tcp_rate_skb_sent(tx);tcp_rate_gen(rx)
u32                           received_ce             read_mostly         read_write
u32[3]                        received_ecn_bytes      read_mostly         read_write
u8:4                          received_ce_pending     read_mostly         read_write
u32[3]                        delivered_ecn_bytes                         read_write
u8:2                          syn_ect_snt             write_mostly        read_write
u8:2                          syn_ect_rcv             read_mostly         read_write
u8:2                          accecn_minlen           write_mostly        read_write
u8:2                          est_ecnfield                                read_write
u8:2                          accecn_opt_demand       read_mostly         read_write
u8:2                          prev_ecnfield                               read_write
u64                           accecn_opt_tstamp       read_write
u8:4                          accecn_fail_mode
u32                           lost                                        read_mostly         tcp_ack
u32                           app_limited             read_write          read_mostly         tcp_rate_check_app_limited,tcp_rate_skb_sent(tx);tcp_rate_gen(rx)
u64                           first_tx_mstamp         read_write                              tcp_rate_skb_sent
+25 −3
Original line number Diff line number Diff line
@@ -122,8 +122,9 @@ struct tcp_options_received {
		smc_ok : 1,	/* SMC seen on SYN packet		*/
		snd_wscale : 4,	/* Window scaling received from sender	*/
		rcv_wscale : 4;	/* Window scaling to send to receiver	*/
	u8	saw_unknown:1,	/* Received unknown option		*/
		unused:7;
	u8	accecn:6,	/* AccECN index in header, 0=no options	*/
		saw_unknown:1,	/* Received unknown option		*/
		unused:1;
	u8	num_sacks;	/* Number of SACK blocks		*/
	u16	user_mss;	/* mss requested by user in ioctl	*/
	u16	mss_clamp;	/* Maximal mss, negotiated at connection setup */
@@ -168,6 +169,11 @@ struct tcp_request_sock {
						  * after data-in-SYN.
						  */
	u8				syn_tos;
	bool				accecn_ok;
	u8				syn_ect_snt: 2,
					syn_ect_rcv: 2,
					accecn_fail_mode:4;
	u8				saw_accecn_opt  :2;
#ifdef CONFIG_TCP_AO
	u8				ao_keyid;
	u8				ao_rcv_next;
@@ -270,6 +276,7 @@ struct tcp_sock {
	u32	mdev_us;	/* medium deviation			*/
	u32	rtt_seq;	/* sequence number to update rttvar	*/
	u64	tcp_wstamp_ns;	/* departure time for next sent data packet */
	u64	accecn_opt_tstamp;	/* Last AccECN option sent timestamp */
	struct list_head tsorted_sent_queue; /* time-sorted sent but un-SACKed skbs */
	struct sk_buff *highest_sack;   /* skb just after the highest
					 * skb with SACKed bit set
@@ -287,6 +294,12 @@ struct tcp_sock {
 */
	u8	nonagle     : 4,/* Disable Nagle algorithm?             */
		rate_app_limited:1;  /* rate_{delivered,interval_us} limited? */
	u8	received_ce_pending:4, /* Not yet transmit cnt of received_ce */
		unused2:4;
	u8	accecn_minlen:2,/* Minimum length of AccECN option sent */
		est_ecnfield:2,/* ECN field for AccECN delivered estimates */
		accecn_opt_demand:2,/* Demand AccECN option for n next ACKs */
		prev_ecnfield:2; /* ECN bits from the previous segment */
	__be32	pred_flags;
	u64	tcp_clock_cache; /* cache last tcp_clock_ns() (see tcp_mstamp_refresh()) */
	u64	tcp_mstamp;	/* most recent packet received/sent */
@@ -299,6 +312,11 @@ struct tcp_sock {
	u32	snd_up;		/* Urgent pointer		*/
	u32	delivered;	/* Total data packets delivered incl. rexmits */
	u32	delivered_ce;	/* Like the above but only ECE marked packets */
	u32	received_ce;	/* Like the above but for rcvd CE marked pkts */
	u32	received_ecn_bytes[3]; /* received byte counters for three ECN
					* types: INET_ECN_ECT_1, INET_ECN_ECT_0,
					* and INET_ECN_CE
					*/
	u32	app_limited;	/* limited until "delivered" reaches this val */
	u32	rcv_wnd;	/* Current receiver window		*/
/*
@@ -326,6 +344,7 @@ struct tcp_sock {
	u32	rate_delivered;    /* saved rate sample: packets delivered */
	u32	rate_interval_us;  /* saved rate sample: time elapsed */
	u32	rcv_rtt_last_tsecr;
	u32	delivered_ecn_bytes[3];
	u64	first_tx_mstamp;  /* start of window send phase */
	u64	delivered_mstamp; /* time we reached "delivered" */
	u64	bytes_acked;	/* RFC4898 tcpEStatsAppHCThruOctetsAcked
@@ -372,7 +391,8 @@ struct tcp_sock {
	u8	compressed_ack;
	u8	dup_ack_counter:2,
		tlp_retrans:1,	/* TLP is a retransmission */
		unused:5;
		syn_ect_snt:2,	/* AccECN ECT memory, only */
		syn_ect_rcv:2;	/* ... needed during 3WHS + first seqno */
	u8	thin_lto    : 1,/* Use linear timeouts for thin streams */
		fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */
		fastopen_no_cookie:1, /* Allow send/recv SYN+data without a cookie */
@@ -388,6 +408,8 @@ struct tcp_sock {
		syn_fastopen_child:1; /* created TFO passive child socket */

	u8	keepalive_probes; /* num of allowed keep alive probes	*/
	u8	accecn_fail_mode:4,	/* AccECN failure handling */
		saw_accecn_opt:2;	/* An AccECN option was seen */
	u32	tcp_tx_delay;	/* delay (in usec) added to TX packets */

/* RTT measurement */
+2 −0
Original line number Diff line number Diff line
@@ -148,6 +148,8 @@ struct netns_ipv4 {
	struct local_ports ip_local_ports;

	u8 sysctl_tcp_ecn;
	u8 sysctl_tcp_ecn_option;
	u8 sysctl_tcp_ecn_option_beacon;
	u8 sysctl_tcp_ecn_fallback;

	u8 sysctl_ip_default_ttl;
+33 −0
Original line number Diff line number Diff line
@@ -100,6 +100,9 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
/* Maximal number of window scale according to RFC1323 */
#define TCP_MAX_WSCALE		14U

/* Default sending frequency of accurate ECN option per RTT */
#define TCP_ACCECN_OPTION_BEACON	3

/* urg_data states */
#define TCP_URG_VALID	0x0100
#define TCP_URG_NOTYET	0x0200
@@ -213,6 +216,8 @@ static_assert((1 << ATO_BITS) > TCP_DELACK_MAX);
#define TCPOPT_AO		29	/* Authentication Option (RFC5925) */
#define TCPOPT_MPTCP		30	/* Multipath TCP (RFC6824) */
#define TCPOPT_FASTOPEN		34	/* Fast open (RFC7413) */
#define TCPOPT_ACCECN0		172	/* 0xAC: Accurate ECN Order 0 */
#define TCPOPT_ACCECN1		174	/* 0xAE: Accurate ECN Order 1 */
#define TCPOPT_EXP		254	/* Experimental */
/* Magic number to be after the option value for sharing TCP
 * experimental options. See draft-ietf-tcpm-experimental-options-00.txt
@@ -230,6 +235,7 @@ static_assert((1 << ATO_BITS) > TCP_DELACK_MAX);
#define TCPOLEN_TIMESTAMP      10
#define TCPOLEN_MD5SIG         18
#define TCPOLEN_FASTOPEN_BASE  2
#define TCPOLEN_ACCECN_BASE    2
#define TCPOLEN_EXP_FASTOPEN_BASE  4
#define TCPOLEN_EXP_SMC_BASE   6

@@ -243,6 +249,14 @@ static_assert((1 << ATO_BITS) > TCP_DELACK_MAX);
#define TCPOLEN_MD5SIG_ALIGNED		20
#define TCPOLEN_MSS_ALIGNED		4
#define TCPOLEN_EXP_SMC_BASE_ALIGNED	8
#define TCPOLEN_ACCECN_PERFIELD		3

/* Maximum number of byte counters in AccECN option + size */
#define TCP_ACCECN_NUMFIELDS		3
#define TCP_ACCECN_MAXSIZE		(TCPOLEN_ACCECN_BASE + \
					 TCPOLEN_ACCECN_PERFIELD * \
					 TCP_ACCECN_NUMFIELDS)
#define TCP_ACCECN_SAFETY_SHIFT		1 /* SAFETY_FACTOR in accecn draft */

/* Flags in tp->nonagle */
#define TCP_NAGLE_OFF		1	/* Nagle's algo is disabled */
@@ -972,6 +986,18 @@ static inline u32 tcp_rsk_tsval(const struct tcp_request_sock *treq)

#define TCPHDR_ACE (TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE)
#define TCPHDR_SYN_ECN	(TCPHDR_SYN | TCPHDR_ECE | TCPHDR_CWR)
#define TCPHDR_SYNACK_ACCECN (TCPHDR_SYN | TCPHDR_ACK | TCPHDR_CWR)

/* Mask keeping the low three bits of the CE packet counter; applied to
 * (delivered_ce + TCP_ACCECN_CEP_INIT_OFFSET) in __tcp_fast_path_on() to
 * form the ACE bits of the header-prediction flags.
 * NOTE(review): TCP_ACCECN_ACE_MAX_DELTA is presumably the largest ACE
 * counter delta treated as unambiguous -- confirm against its users.
 */
#define TCP_ACCECN_CEP_ACE_MASK 0x7
#define TCP_ACCECN_ACE_MAX_DELTA 6

/* To avoid/detect middlebox interference, not all counters start at 0.
 * See draft-ietf-tcpm-accurate-ecn for the latest values.
 */
#define TCP_ACCECN_CEP_INIT_OFFSET 5
#define TCP_ACCECN_E1B_INIT_OFFSET 1
#define TCP_ACCECN_E0B_INIT_OFFSET 1
#define TCP_ACCECN_CEB_INIT_OFFSET 0

/* State flags for sacked in struct tcp_skb_cb */
enum tcp_skb_cb_sacked_flags {
@@ -1782,11 +1808,18 @@ static inline bool tcp_paws_reject(const struct tcp_options_received *rx_opt,

/* Compute tp->pred_flags from the TCP header length, the expected ACE
 * bits, the ACK flag and @snd_wnd. Returns without touching pred_flags
 * when the socket is an MPTCP socket.
 */
static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd)
{
	u32 host_flags;
	u32 ace = 0;

	/* mptcp hooks are only on the slow path */
	if (sk_is_mptcp((struct sock *)tp))
		return;

	/* In AccECN mode the expected ACE value is the offset CE counter
	 * reduced to the ACE mask; in every other mode it stays zero.
	 */
	if (tcp_ecn_mode_accecn(tp))
		ace = (tp->delivered_ce + TCP_ACCECN_CEP_INIT_OFFSET) &
		      TCP_ACCECN_CEP_ACE_MASK;

	/* Assemble in host byte order, then convert once for storage. */
	host_flags = (tp->tcp_header_len << 26) |
		     (ace << 22) |
		     ntohl(TCP_FLAG_ACK) |
		     snd_wnd;

	tp->pred_flags = htonl(host_flags);
}
Loading