Commit b5e74132 authored by Ilpo Järvinen, committed by Paolo Abeni
Browse files

tcp: accecn: AccECN option

The Accurate ECN allows echoing back the sum of bytes for
each IP ECN field value in the received packets using
AccECN option. This change implements AccECN option tx & rx
side processing without option send control related features
that are added by a later change.

Based on specification:
  https://tools.ietf.org/id/draft-ietf-tcpm-accurate-ecn-28.txt


(Some features of the spec will be added in the later changes
rather than in this one).

A full-length AccECN option is always attempted but if it does
not fit, the minimum length is selected based on the counters
that have changed since the last update. The AccECN option
(with 24-bit fields) often ends in odd sizes so the option
write code tries to take advantage of some nop used to pad
the other TCP options.

The delivered_ecn_bytes pairs with received_ecn_bytes similar
to how delivered_ce pairs with received_ce. In contrast to
ACE field, however, the option is not always available to update
delivered_ecn_bytes. For ACK w/o AccECN option, the delivered
bytes calculated based on the cumulative ACK+SACK information
are assigned to one of the counters using an estimation
heuristic to select the most likely ECN byte counter. Any
estimation error is corrected when the next AccECN option
arrives. It may occur that the heuristic gets too confused
when there are enough different byte counter deltas between
ACKs with the AccECN option in which case the heuristic just
gives up on updating the counters for a while.

tcp_ecn_option sysctl can be used to select option sending
mode for AccECN: TCP_ECN_OPTION_DISABLED, TCP_ECN_OPTION_MINIMUM,
and TCP_ECN_OPTION_FULL.

This patch increases the size of tcp_info struct, as there are
no existing holes for new u32 variables. Below are the pahole
outcomes before and after this patch:

[BEFORE THIS PATCH]
struct tcp_info {
    [...]
     __u32                     tcpi_total_rto_time;  /*   244     4 */

    /* size: 248, cachelines: 4, members: 61 */
}

[AFTER THIS PATCH]
struct tcp_info {
    [...]
    __u32                      tcpi_total_rto_time;  /*   244     4 */
    __u32                      tcpi_received_ce;     /*   248     4 */
    __u32                      tcpi_delivered_e1_bytes; /*   252     4 */
    __u32                      tcpi_delivered_e0_bytes; /*   256     4 */
    __u32                      tcpi_delivered_ce_bytes; /*   260     4 */
    __u32                      tcpi_received_e1_bytes; /*   264     4 */
    __u32                      tcpi_received_e0_bytes; /*   268     4 */
    __u32                      tcpi_received_ce_bytes; /*   272     4 */

    /* size: 280, cachelines: 5, members: 68 */
}

This patch uses the existing 1-byte holes in the tcp_sock_write_txrx
group for new u8 members, but adds a 4-byte hole in tcp_sock_write_rx
group after the new u32 delivered_ecn_bytes[3] member. Therefore, the
group size of tcp_sock_write_rx is increased from 96 to 112. Below
are the pahole outcomes before and after this patch:

[BEFORE THIS PATCH]
struct tcp_sock {
    [...]
    u8                         received_ce_pending:4; /*  2522: 0  1 */
    u8                         unused2:4;             /*  2522: 4  1 */
    /* XXX 1 byte hole, try to pack */

    [...]
    u32                        rcv_rtt_last_tsecr;    /*  2668     4 */

    [...]
    __cacheline_group_end__tcp_sock_write_rx[0];      /*  2728     0 */

    [...]
    /* size: 3200, cachelines: 50, members: 167 */
}

[AFTER THIS PATCH]
struct tcp_sock {
    [...]
    u8                         received_ce_pending:4;/*  2522: 0  1 */
    u8                         unused2:4;            /*  2522: 4  1 */
    u8                         accecn_minlen:2;      /*  2523: 0  1 */
    u8                         est_ecnfield:2;       /*  2523: 2  1 */
    u8                         unused3:4;            /*  2523: 4  1 */

    [...]
    u32                        rcv_rtt_last_tsecr;   /*  2668     4 */
    u32                        delivered_ecn_bytes[3];/*  2672    12 */
    /* XXX 4 bytes hole, try to pack */

    [...]
    __cacheline_group_end__tcp_sock_write_rx[0];     /*  2744     0 */

    [...]
    /* size: 3200, cachelines: 50, members: 171 */
}

Signed-off-by: Ilpo Järvinen <ij@kernel.org>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Co-developed-by: Chia-Yu Chang <chia-yu.chang@nokia-bell-labs.com>
Signed-off-by: Chia-Yu Chang <chia-yu.chang@nokia-bell-labs.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20250916082434.100722-7-chia-yu.chang@nokia-bell-labs.com


Signed-off-by: Paolo Abeni <pabeni@redhat.com>
parent 77a4fdf4
Loading
Loading
Loading
Loading
+19 −0
Original line number Diff line number Diff line
@@ -468,6 +468,25 @@ tcp_ecn - INTEGER

	Default: 2

tcp_ecn_option - INTEGER
	Control Accurate ECN (AccECN) option sending when AccECN has been
	successfully negotiated during handshake. Send logic inhibits
	sending AccECN options regardless of this setting when no AccECN
	option has been seen for the reverse direction.

	Possible values are:

	= ============================================================
	0 Never send AccECN option. This also disables sending AccECN
	  option in SYN/ACK during handshake.
	1 Send AccECN option sparingly according to the minimum option
	  rules outlined in draft-ietf-tcpm-accurate-ecn.
	2 Send AccECN option on every packet whenever it fits into TCP
	  option space.
	= ============================================================

	Default: 2

tcp_ecn_fallback - BOOLEAN
	If the kernel detects that ECN connection misbehaves, enable fall
	back to non-ECN. Currently, this knob implements the fallback
+3 −0
Original line number Diff line number Diff line
@@ -104,8 +104,11 @@ u32 delivered_ce read_mostly read_w
u32                           received_ce             read_mostly         read_write
u32[3]                        received_ecn_bytes      read_mostly         read_write
u8:4                          received_ce_pending     read_mostly         read_write
u32[3]                        delivered_ecn_bytes                         read_write
u8:2                          syn_ect_snt             write_mostly        read_write
u8:2                          syn_ect_rcv             read_mostly         read_write
u8:2                          accecn_minlen           write_mostly        read_write
u8:2                          est_ecnfield                                read_write
u8:4                          accecn_fail_mode
u32                           lost                                        read_mostly         tcp_ack
u32                           app_limited             read_write          read_mostly         tcp_rate_check_app_limited,tcp_rate_skb_sent(tx);tcp_rate_gen(rx)
+7 −2
Original line number Diff line number Diff line
@@ -122,8 +122,9 @@ struct tcp_options_received {
		smc_ok : 1,	/* SMC seen on SYN packet		*/
		snd_wscale : 4,	/* Window scaling received from sender	*/
		rcv_wscale : 4;	/* Window scaling to send to receiver	*/
	u8	saw_unknown:1,	/* Received unknown option		*/
		unused:7;
	u8	accecn:6,	/* AccECN index in header, 0=no options	*/
		saw_unknown:1,	/* Received unknown option		*/
		unused:1;
	u8	num_sacks;	/* Number of SACK blocks		*/
	u16	user_mss;	/* mss requested by user in ioctl	*/
	u16	mss_clamp;	/* Maximal mss, negotiated at connection setup */
@@ -293,6 +294,9 @@ struct tcp_sock {
		rate_app_limited:1;  /* rate_{delivered,interval_us} limited? */
	u8	received_ce_pending:4, /* Not yet transmit cnt of received_ce */
		unused2:4;
	u8	accecn_minlen:2,/* Minimum length of AccECN option sent */
		est_ecnfield:2,/* ECN field for AccECN delivered estimates */
		unused3:4;
	__be32	pred_flags;
	u64	tcp_clock_cache; /* cache last tcp_clock_ns() (see tcp_mstamp_refresh()) */
	u64	tcp_mstamp;	/* most recent packet received/sent */
@@ -337,6 +341,7 @@ struct tcp_sock {
	u32	rate_delivered;    /* saved rate sample: packets delivered */
	u32	rate_interval_us;  /* saved rate sample: time elapsed */
	u32	rcv_rtt_last_tsecr;
	u32	delivered_ecn_bytes[3];
	u64	first_tx_mstamp;  /* start of window send phase */
	u64	delivered_mstamp; /* time we reached "delivered" */
	u64	bytes_acked;	/* RFC4898 tcpEStatsAppHCThruOctetsAcked
+1 −0
Original line number Diff line number Diff line
@@ -148,6 +148,7 @@ struct netns_ipv4 {
	struct local_ports ip_local_ports;

	u8 sysctl_tcp_ecn;
	u8 sysctl_tcp_ecn_option;
	u8 sysctl_tcp_ecn_fallback;

	u8 sysctl_ip_default_ttl;
+13 −0
Original line number Diff line number Diff line
@@ -213,6 +213,8 @@ static_assert((1 << ATO_BITS) > TCP_DELACK_MAX);
#define TCPOPT_AO		29	/* Authentication Option (RFC5925) */
#define TCPOPT_MPTCP		30	/* Multipath TCP (RFC6824) */
#define TCPOPT_FASTOPEN		34	/* Fast open (RFC7413) */
#define TCPOPT_ACCECN0		172	/* 0xAC: Accurate ECN Order 0 */
#define TCPOPT_ACCECN1		174	/* 0xAE: Accurate ECN Order 1 */
#define TCPOPT_EXP		254	/* Experimental */
/* Magic number to be after the option value for sharing TCP
 * experimental options. See draft-ietf-tcpm-experimental-options-00.txt
@@ -230,6 +232,7 @@ static_assert((1 << ATO_BITS) > TCP_DELACK_MAX);
#define TCPOLEN_TIMESTAMP      10
#define TCPOLEN_MD5SIG         18
#define TCPOLEN_FASTOPEN_BASE  2
#define TCPOLEN_ACCECN_BASE    2
#define TCPOLEN_EXP_FASTOPEN_BASE  4
#define TCPOLEN_EXP_SMC_BASE   6

@@ -243,6 +246,13 @@ static_assert((1 << ATO_BITS) > TCP_DELACK_MAX);
#define TCPOLEN_MD5SIG_ALIGNED		20
#define TCPOLEN_MSS_ALIGNED		4
#define TCPOLEN_EXP_SMC_BASE_ALIGNED	8
#define TCPOLEN_ACCECN_PERFIELD		3

/* Maximum number of byte counters in AccECN option + size */
#define TCP_ACCECN_NUMFIELDS		3
#define TCP_ACCECN_MAXSIZE		(TCPOLEN_ACCECN_BASE + \
					 TCPOLEN_ACCECN_PERFIELD * \
					 TCP_ACCECN_NUMFIELDS)

/* Flags in tp->nonagle */
#define TCP_NAGLE_OFF		1	/* Nagle's algo is disabled */
@@ -981,6 +991,9 @@ static inline u32 tcp_rsk_tsval(const struct tcp_request_sock *treq)
 * See draft-ietf-tcpm-accurate-ecn for the latest values.
 */
#define TCP_ACCECN_CEP_INIT_OFFSET 5
#define TCP_ACCECN_E1B_INIT_OFFSET 1
#define TCP_ACCECN_E0B_INIT_OFFSET 1
#define TCP_ACCECN_CEB_INIT_OFFSET 0

/* State flags for sacked in struct tcp_skb_cb */
enum tcp_skb_cb_sacked_flags {
Loading