Commit aa55a7dd authored by Chia-Yu Chang, committed by Paolo Abeni
Browse files

tcp: accecn: AccECN option send control



Instead of sending the option in every ACK, limit sending to
those ACKs where the option is necessary:
- Handshake
- "Change-triggered ACK" + the ACK following it. The
  2nd ACK is necessary to unambiguously indicate which
  of the ECN byte counters is increasing. The first
  ACK has two counters increasing due to the ecnfield
  edge.
- ACKs with CE to allow CEP delta validations to take
  advantage of the option.
- Force option to be sent at least once per 2^22
  bytes. The check is done using the bit edges of the
  byte counters (avoids need for extra variables).
- AccECN option beacon to send a few times per RTT even if
  nothing in the ECN state requires that. The default is 3
  times per RTT, and its period can be set via
  sysctl_tcp_ecn_option_beacon.

Below are the pahole outcomes before and after this patch,
in which the group size of tcp_sock_write_tx is increased
from 89 to 97 due to the new u64 accecn_opt_tstamp member:

[BEFORE THIS PATCH]
struct tcp_sock {
    [...]
    u64                        tcp_wstamp_ns;        /*  2488     8 */
    struct list_head           tsorted_sent_queue;   /*  2496    16 */

    [...]
    __cacheline_group_end__tcp_sock_write_tx[0];     /*  2521     0 */
    __cacheline_group_begin__tcp_sock_write_txrx[0]; /*  2521     0 */
    u8                         nonagle:4;            /*  2521: 0  1 */
    u8                         rate_app_limited:1;   /*  2521: 4  1 */
    /* XXX 3 bits hole, try to pack */

    /* Force alignment to the next boundary: */
    u8                         :0;
    u8                         received_ce_pending:4;/*  2522: 0  1 */
    u8                         unused2:4;            /*  2522: 4  1 */
    u8                         accecn_minlen:2;      /*  2523: 0  1 */
    u8                         est_ecnfield:2;       /*  2523: 2  1 */
    u8                         unused3:4;            /*  2523: 4  1 */

    [...]
    __cacheline_group_end__tcp_sock_write_txrx[0];   /*  2628     0 */

    [...]
    /* size: 3200, cachelines: 50, members: 171 */
}

[AFTER THIS PATCH]
struct tcp_sock {
    [...]
    u64                        tcp_wstamp_ns;        /*  2488     8 */
    u64                        accecn_opt_tstamp;    /*  2496     8 */
    struct list_head           tsorted_sent_queue;   /*  2504    16 */

    [...]
    __cacheline_group_end__tcp_sock_write_tx[0];     /*  2529     0 */
    __cacheline_group_begin__tcp_sock_write_txrx[0]; /*  2529     0 */
    u8                         nonagle:4;            /*  2529: 0  1 */
    u8                         rate_app_limited:1;   /*  2529: 4  1 */
    /* XXX 3 bits hole, try to pack */

    /* Force alignment to the next boundary: */
    u8                         :0;
    u8                         received_ce_pending:4;/*  2530: 0  1 */
    u8                         unused2:4;            /*  2530: 4  1 */
    u8                         accecn_minlen:2;      /*  2531: 0  1 */
    u8                         est_ecnfield:2;       /*  2531: 2  1 */
    u8                         accecn_opt_demand:2;  /*  2531: 4  1 */
    u8                         prev_ecnfield:2;      /*  2531: 6  1 */

    [...]
    __cacheline_group_end__tcp_sock_write_txrx[0];   /*  2636     0 */

    [...]
    /* size: 3200, cachelines: 50, members: 173 */
}

Signed-off-by: Chia-Yu Chang <chia-yu.chang@nokia-bell-labs.com>
Co-developed-by: Ilpo Järvinen <ij@kernel.org>
Signed-off-by: Ilpo Järvinen <ij@kernel.org>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20250916082434.100722-8-chia-yu.chang@nokia-bell-labs.com


Signed-off-by: Paolo Abeni <pabeni@redhat.com>
parent b5e74132
Loading
Loading
Loading
Loading
+6 −0
Original line number Diff line number Diff line
@@ -487,6 +487,12 @@ tcp_ecn_option - INTEGER

	Default: 2

tcp_ecn_option_beacon - INTEGER
	Control the Accurate ECN (AccECN) option sending frequency per RTT. It
	takes effect only when tcp_ecn_option is set to 2.

	Default: 3 (the AccECN option will be sent at least 3 times per RTT)

tcp_ecn_fallback - BOOLEAN
	If the kernel detects that ECN connection misbehaves, enable fall
	back to non-ECN. Currently, this knob implements the fallback
+3 −0
Original line number Diff line number Diff line
@@ -109,6 +109,9 @@ u8:2 syn_ect_snt write_mostly read_w
u8:2                          syn_ect_rcv             read_mostly         read_write
u8:2                          accecn_minlen           write_mostly        read_write
u8:2                          est_ecnfield                                read_write
u8:2                          accecn_opt_demand       read_mostly         read_write
u8:2                          prev_ecnfield                               read_write
u64                           accecn_opt_tstamp       read_write
u8:4                          accecn_fail_mode
u32                           lost                                        read_mostly         tcp_ack
u32                           app_limited             read_write          read_mostly         tcp_rate_check_app_limited,tcp_rate_skb_sent(tx);tcp_rate_gen(rx)
+3 −1
Original line number Diff line number Diff line
@@ -275,6 +275,7 @@ struct tcp_sock {
	u32	mdev_us;	/* medium deviation			*/
	u32	rtt_seq;	/* sequence number to update rttvar	*/
	u64	tcp_wstamp_ns;	/* departure time for next sent data packet */
	u64	accecn_opt_tstamp;	/* Last AccECN option sent timestamp */
	struct list_head tsorted_sent_queue; /* time-sorted sent but un-SACKed skbs */
	struct sk_buff *highest_sack;   /* skb just after the highest
					 * skb with SACKed bit set
@@ -296,7 +297,8 @@ struct tcp_sock {
		unused2:4;
	u8	accecn_minlen:2,/* Minimum length of AccECN option sent */
		est_ecnfield:2,/* ECN field for AccECN delivered estimates */
		unused3:4;
		accecn_opt_demand:2,/* Demand AccECN option for n next ACKs */
		prev_ecnfield:2; /* ECN bits from the previous segment */
	__be32	pred_flags;
	u64	tcp_clock_cache; /* cache last tcp_clock_ns() (see tcp_mstamp_refresh()) */
	u64	tcp_mstamp;	/* most recent packet received/sent */
+1 −0
Original line number Diff line number Diff line
@@ -149,6 +149,7 @@ struct netns_ipv4 {

	u8 sysctl_tcp_ecn;
	u8 sysctl_tcp_ecn_option;
	u8 sysctl_tcp_ecn_option_beacon;
	u8 sysctl_tcp_ecn_fallback;

	u8 sysctl_ip_default_ttl;
+3 −0
Original line number Diff line number Diff line
@@ -100,6 +100,9 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
/* Maximal number of window scale according to RFC1323 */
#define TCP_MAX_WSCALE		14U

/* Default sending frequency of accurate ECN option per RTT */
#define TCP_ACCECN_OPTION_BEACON	3

/* urg_data states */
#define TCP_URG_VALID	0x0100
#define TCP_URG_NOTYET	0x0200
Loading