Commit 3cae3427 authored by Ilpo Järvinen's avatar Ilpo Järvinen Committed by Paolo Abeni
Browse files

tcp: accecn: AccECN negotiation

Accurate ECN negotiation parts based on the specification:
  https://tools.ietf.org/id/draft-ietf-tcpm-accurate-ecn-28.txt



Accurate ECN is negotiated using ECE, CWR and AE flags in the
TCP header. TCP falls back into using RFC3168 ECN if one of the
ends supports only RFC3168-style ECN.

The AccECN negotiation includes reflecting the IP ECN field value
seen in SYN and SYNACK back using the same bits as the negotiation,
to allow responding to SYN CE marks and to detect ECN field
mangling. CE marks should not occur currently because SYN=1
segments are sent with Non-ECT in the IP ECN field (but a proposal
exists to remove this restriction).

Reflecting the SYN IP ECN field in the SYNACK is relatively simple.
Reflecting the SYNACK IP ECN field in the final/third ACK of
the handshake is more challenging. Linux TCP code is not well
prepared for using the final/third ACK as a signalling channel,
which makes things somewhat complicated here.

The tcp_ecn sysctl can be used to select the highest ECN variant
(Accurate ECN, ECN, No ECN) that is attempted to be negotiated and
requested for incoming connection and outgoing connection:
TCP_ECN_IN_NOECN_OUT_NOECN, TCP_ECN_IN_ECN_OUT_ECN,
TCP_ECN_IN_ECN_OUT_NOECN, TCP_ECN_IN_ACCECN_OUT_ACCECN,
TCP_ECN_IN_ACCECN_OUT_ECN, and TCP_ECN_IN_ACCECN_OUT_NOECN.

After this patch, the size of tcp_request_sock remains unchanged
and no new holes are added. Below are the pahole outcomes before
and after this patch:

[BEFORE THIS PATCH]
struct tcp_request_sock {
    [...]
    u32                        rcv_nxt;              /*   352     4 */
    u8                         syn_tos;              /*   356     1 */

    /* size: 360, cachelines: 6, members: 16 */
}

[AFTER THIS PATCH]
struct tcp_request_sock {
    [...]
    u32                        rcv_nxt;              /*   352     4 */
    u8                         syn_tos;              /*   356     1 */
    bool                       accecn_ok;            /*   357     1 */
    u8                         syn_ect_snt:2;        /*   358: 0  1 */
    u8                         syn_ect_rcv:2;        /*   358: 2  1 */
    u8                         accecn_fail_mode:4;   /*   358: 4  1 */

    /* size: 360, cachelines: 6, members: 20 */
}

After this patch, the size of tcp_sock remains unchanged and no new
holes are added. Also, 4 bits of the existing 2-byte hole are exploited.
Below are the pahole outcomes before and after this patch:

[BEFORE THIS PATCH]
struct tcp_sock {
    [...]
    u8                         dup_ack_counter:2;    /*  2761: 0  1 */
    u8                         tlp_retrans:1;        /*  2761: 2  1 */
    u8                         unused:5;             /*  2761: 3  1 */
    u8                         thin_lto:1;           /*  2762: 0  1 */
    u8                         fastopen_connect:1;   /*  2762: 1  1 */
    u8                         fastopen_no_cookie:1; /*  2762: 2  1 */
    u8                         fastopen_client_fail:2; /*  2762: 3  1 */
    u8                         frto:1;               /*  2762: 5  1 */
    /* XXX 2 bits hole, try to pack */

    [...]
    u8                         keepalive_probes;     /*  2765     1 */
    /* XXX 2 bytes hole, try to pack */

    [...]
    /* size: 3200, cachelines: 50, members: 164 */
}

[AFTER THIS PATCH]
struct tcp_sock {
    [...]
    u8                         dup_ack_counter:2;    /*  2761: 0  1 */
    u8                         tlp_retrans:1;        /*  2761: 2  1 */
    u8                         syn_ect_snt:2;        /*  2761: 3  1 */
    u8                         syn_ect_rcv:2;        /*  2761: 5  1 */
    u8                         thin_lto:1;           /*  2761: 7  1 */
    u8                         fastopen_connect:1;   /*  2762: 0  1 */
    u8                         fastopen_no_cookie:1; /*  2762: 1  1 */
    u8                         fastopen_client_fail:2; /*  2762: 2  1 */
    u8                         frto:1;               /*  2762: 4  1 */
    /* XXX 3 bits hole, try to pack */

    [...]
    u8                         keepalive_probes;     /*  2765     1 */
    u8                         accecn_fail_mode:4;   /*  2766: 0  1 */
    /* XXX 4 bits hole, try to pack */
    /* XXX 1 byte hole, try to pack */

    [...]
    /* size: 3200, cachelines: 50, members: 166 */
}

Signed-off-by: Ilpo Järvinen <ij@kernel.org>
Co-developed-by: Olivier Tilmans <olivier.tilmans@nokia.com>
Signed-off-by: Olivier Tilmans <olivier.tilmans@nokia.com>
Co-developed-by: Chia-Yu Chang <chia-yu.chang@nokia-bell-labs.com>
Signed-off-by: Chia-Yu Chang <chia-yu.chang@nokia-bell-labs.com>
Acked-by: Paolo Abeni <pabeni@redhat.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20250916082434.100722-3-chia-yu.chang@nokia-bell-labs.com


Signed-off-by: Paolo Abeni <pabeni@redhat.com>
parent 542a495c
Loading
Loading
Loading
Loading
+22 −14
Original line number Diff line number Diff line
@@ -443,20 +443,28 @@ tcp_early_retrans - INTEGER

tcp_ecn - INTEGER
	Control use of Explicit Congestion Notification (ECN) by TCP.
	ECN is used only when both ends of the TCP connection indicate
	support for it.  This feature is useful in avoiding losses due
	to congestion by allowing supporting routers to signal
	congestion before having to drop packets.

	Possible values are:

		=  =====================================================
		0  Disable ECN.  Neither initiate nor accept ECN.
		1  Enable ECN when requested by incoming connections and
		   also request ECN on outgoing connection attempts.
		2  Enable ECN when requested by incoming connections
		   but do not request ECN on outgoing connections.
		=  =====================================================
	ECN is used only when both ends of the TCP connection indicate support
	for it. This feature is useful in avoiding losses due to congestion by
	allowing supporting routers to signal congestion before having to drop
	packets. A host that supports ECN both sends ECN at the IP layer and
	feeds back ECN at the TCP layer. The highest variant of ECN feedback
	that both peers support is chosen by the ECN negotiation (Accurate ECN,
	ECN, or no ECN).

	The highest negotiated variant for incoming connection requests
	and the highest variant requested by outgoing connection
	attempts:

	===== ==================== ====================
	Value Incoming connections Outgoing connections
	===== ==================== ====================
	0     No ECN               No ECN
	1     ECN                  ECN
	2     ECN                  No ECN
	3     AccECN               AccECN
	4     AccECN               ECN
	5     AccECN               No ECN
	===== ==================== ====================

	Default: 2

+3 −0
Original line number Diff line number Diff line
@@ -103,6 +103,9 @@ u32 delivered read_mostly read_w
u32                           delivered_ce            read_mostly         read_write          tcp_rate_skb_sent(tx);tcp_rate_gen(rx)
u32                           received_ce             read_mostly         read_write
u8:4                          received_ce_pending     read_mostly         read_write
u8:2                          syn_ect_snt             write_mostly        read_write
u8:2                          syn_ect_rcv             read_mostly         read_write
u8:4                          accecn_fail_mode
u32                           lost                                        read_mostly         tcp_ack
u32                           app_limited             read_write          read_mostly         tcp_rate_check_app_limited,tcp_rate_skb_sent(tx);tcp_rate_gen(rx)
u64                           first_tx_mstamp         read_write                              tcp_rate_skb_sent
+7 −1
Original line number Diff line number Diff line
@@ -168,6 +168,10 @@ struct tcp_request_sock {
						  * after data-in-SYN.
						  */
	u8				syn_tos;
	bool				accecn_ok;
	u8				syn_ect_snt: 2,
					syn_ect_rcv: 2,
					accecn_fail_mode:4;
#ifdef CONFIG_TCP_AO
	u8				ao_keyid;
	u8				ao_rcv_next;
@@ -375,7 +379,8 @@ struct tcp_sock {
	u8	compressed_ack;
	u8	dup_ack_counter:2,
		tlp_retrans:1,	/* TLP is a retransmission */
		unused:5;
		syn_ect_snt:2,	/* AccECN ECT memory, only */
		syn_ect_rcv:2;	/* ... needed during 3WHS + first seqno */
	u8	thin_lto    : 1,/* Use linear timeouts for thin streams */
		fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */
		fastopen_no_cookie:1, /* Allow send/recv SYN+data without a cookie */
@@ -391,6 +396,7 @@ struct tcp_sock {
		syn_fastopen_child:1; /* created TFO passive child socket */

	u8	keepalive_probes; /* num of allowed keep alive probes	*/
	u8	accecn_fail_mode:4;	/* AccECN failure handling */
	u32	tcp_tx_delay;	/* delay (in usec) added to TX packets */

/* RTT measurement */
+1 −0
Original line number Diff line number Diff line
@@ -972,6 +972,7 @@ static inline u32 tcp_rsk_tsval(const struct tcp_request_sock *treq)

#define TCPHDR_ACE (TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE)
#define TCPHDR_SYN_ECN	(TCPHDR_SYN | TCPHDR_ECE | TCPHDR_CWR)
#define TCPHDR_SYNACK_ACCECN (TCPHDR_SYN | TCPHDR_ACK | TCPHDR_CWR)

#define TCP_ACCECN_CEP_ACE_MASK 0x7
#define TCP_ACCECN_ACE_MAX_DELTA 6
+288 −22
Original line number Diff line number Diff line
@@ -4,12 +4,26 @@

#include <linux/tcp.h>
#include <linux/skbuff.h>
#include <linux/bitfield.h>

#include <net/inet_connection_sock.h>
#include <net/sock.h>
#include <net/tcp.h>
#include <net/inet_ecn.h>

/* The highest ECN variant (Accurate ECN, ECN, or no ECN) that is
 * attempted to be negotiated and requested for incoming connection
 * and outgoing connection, respectively.
 *
 * Each enumerator is named TCP_ECN_IN_<incoming>_OUT_<outgoing>; the
 * numeric values are what tcp_ecn_send_syn() compares the tcp_ecn
 * sysctl against.
 */
enum tcp_ecn_mode {
	TCP_ECN_IN_NOECN_OUT_NOECN = 0,
	TCP_ECN_IN_ECN_OUT_ECN = 1,
	TCP_ECN_IN_ECN_OUT_NOECN = 2,
	TCP_ECN_IN_ACCECN_OUT_ACCECN = 3,
	TCP_ECN_IN_ACCECN_OUT_ECN = 4,
	TCP_ECN_IN_ACCECN_OUT_NOECN = 5,
};

static inline void tcp_ecn_queue_cwr(struct tcp_sock *tp)
{
	/* Do not set CWR if in AccECN mode! */
@@ -39,19 +53,125 @@ static inline void tcp_ecn_withdraw_cwr(struct tcp_sock *tp)
	tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR;
}

/* tp->accecn_fail_mode
 *
 * Independent bit flags for the 4-bit accecn_fail_mode field. They are
 * OR-ed into the field by tcp_accecn_fail_mode_set() and tested one at
 * a time by the tcp_accecn_{ace,opt}_fail_{send,recv}() helpers.
 */
#define TCP_ACCECN_ACE_FAIL_SEND	BIT(0)
#define TCP_ACCECN_ACE_FAIL_RECV	BIT(1)
#define TCP_ACCECN_OPT_FAIL_SEND	BIT(2)
#define TCP_ACCECN_OPT_FAIL_RECV	BIT(3)

/* True when TCP_ACCECN_ACE_FAIL_SEND is set in tp->accecn_fail_mode */
static inline bool tcp_accecn_ace_fail_send(const struct tcp_sock *tp)
{
	return !!(tp->accecn_fail_mode & TCP_ACCECN_ACE_FAIL_SEND);
}

/* True when TCP_ACCECN_ACE_FAIL_RECV is set in tp->accecn_fail_mode */
static inline bool tcp_accecn_ace_fail_recv(const struct tcp_sock *tp)
{
	return !!(tp->accecn_fail_mode & TCP_ACCECN_ACE_FAIL_RECV);
}

/* True when TCP_ACCECN_OPT_FAIL_SEND is set in tp->accecn_fail_mode */
static inline bool tcp_accecn_opt_fail_send(const struct tcp_sock *tp)
{
	return !!(tp->accecn_fail_mode & TCP_ACCECN_OPT_FAIL_SEND);
}

/* True when TCP_ACCECN_OPT_FAIL_RECV is set in tp->accecn_fail_mode */
static inline bool tcp_accecn_opt_fail_recv(const struct tcp_sock *tp)
{
	return !!(tp->accecn_fail_mode & TCP_ACCECN_OPT_FAIL_RECV);
}

/* Add the flag bit(s) in @mode to tp->accecn_fail_mode; bits that are
 * already set stay set.
 */
static inline void tcp_accecn_fail_mode_set(struct tcp_sock *tp, u8 mode)
{
	tp->accecn_fail_mode = tp->accecn_fail_mode | mode;
}

/* Assemble the 3-bit ACE value from a TCP header:
 * bit 2 = AE, bit 1 = CWR, bit 0 = ECE.
 */
static inline u8 tcp_accecn_ace(const struct tcphdr *th)
{
	u8 ace = th->ece;

	ace |= th->cwr << 1;
	ace |= th->ae << 2;
	return ace;
}

static inline void tcp_accecn_init_counters(struct tcp_sock *tp)
/* Infer the ECT value our SYN arrived with from the echoed ACE field */
static inline int tcp_accecn_extract_syn_ect(u8 ace)
{
	tp->received_ce = 0;
	tp->received_ce_pending = 0;
	/* Below is an excerpt from the 1st block of Table 2 of AccECN spec */
	static const int ace_to_ecn[8] = {
		INET_ECN_ECT_0,		/* 0b000 (Undefined) */
		INET_ECN_ECT_1,		/* 0b001 (Undefined) */
		INET_ECN_NOT_ECT,	/* 0b010 (Not-ECT is received) */
		INET_ECN_ECT_1,		/* 0b011 (ECT-1 is received) */
		INET_ECN_ECT_0,		/* 0b100 (ECT-0 is received) */
		INET_ECN_ECT_1,		/* 0b101 (Reserved) */
		INET_ECN_CE,		/* 0b110 (CE is received) */
		INET_ECN_ECT_1		/* 0b111 (Undefined) */
	};

	return ace_to_ecn[ace & 0x7];
}

/* Decide whether the ECN field change @snt -> @rcv is a valid transition.
 *
 * An unchanged field is always valid. A changed field is valid only if
 * neither end of the transition is Not-ECT and the sent value was not CE
 * (i.e. CE must not turn back into ECT(0)/ECT(1)).
 */
static inline bool tcp_ect_transition_valid(u8 snt, u8 rcv)
{
	if (snt == rcv)
		return true;

	return snt != INET_ECN_NOT_ECT &&
	       rcv != INET_ECN_NOT_ECT &&
	       snt != INET_ECN_CE;
}

/* Check the ECT value echoed in @ace against the ECT we sent (@sent_ect).
 *
 * Returns true when the tcp_ecn_fallback sysctl is off (no checking is
 * done) or when the transition is valid. On an invalid transition,
 * TCP_ACCECN_ACE_FAIL_RECV is recorded on the socket and false is
 * returned.
 */
static inline bool tcp_accecn_validate_syn_feedback(struct sock *sk, u8 ace,
						    u8 sent_ect)
{
	if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback))
		return true;

	if (tcp_ect_transition_valid(sent_ect, tcp_accecn_extract_syn_ect(ace)))
		return true;

	tcp_accecn_fail_mode_set(tcp_sk(sk), TCP_ACCECN_ACE_FAIL_RECV);
	return false;
}

/* Validate the 3rd ACK based on the ACE field, see Table 4 of AccECN spec.
 *
 * ACE 0x0 is invalid and marks TCP_ACCECN_ACE_FAIL_RECV. ACE 0x1, 0x5 and
 * 0x7 are legal but unused and are ignored. For the remaining values the
 * feedback is validated against @sent_ect, and a reflected CE bumps
 * delivered_ce if it is still zero.
 */
static inline void tcp_accecn_third_ack(struct sock *sk,
					const struct sk_buff *skb, u8 sent_ect)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u8 ace = tcp_accecn_ace(tcp_hdr(skb));

	if (ace == 0x0) {
		/* Invalid value */
		tcp_accecn_fail_mode_set(tp, TCP_ACCECN_ACE_FAIL_RECV);
		return;
	}

	/* Unused but legal values */
	if (ace == 0x1 || ace == 0x5 || ace == 0x7)
		return;

	/* Validation only applies to first non-data packet */
	if (TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq)
		return;
	if (TCP_SKB_CB(skb)->sacked)
		return;
	if (!tcp_accecn_validate_syn_feedback(sk, ace, sent_ect))
		return;

	if (tcp_accecn_extract_syn_ect(ace) != INET_ECN_CE)
		return;
	if (!tp->delivered_ce)
		tp->delivered_ce++;
}

/* Updates Accurate ECN received counters from the received IP ECN field */
static inline void tcp_ecn_received_counters(struct sock *sk, const struct sk_buff *skb)
static inline void tcp_ecn_received_counters(struct sock *sk,
					     const struct sk_buff *skb)
{
	u8 ecnfield = TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK;
	u8 is_ce = INET_ECN_is_ce(ecnfield);
@@ -74,27 +194,152 @@ static inline void tcp_ecn_received_counters(struct sock *sk, const struct sk_bu
	}
}

static inline void tcp_accecn_set_ace(struct tcphdr *th, struct tcp_sock *tp)
/* AccECN specification, 5.1: [...] a server can determine that it
 * negotiated AccECN as [...] if the ACK contains an ACE field with
 * the value 0b010 to 0b111 (decimal 2 to 7).
 */
static inline bool cookie_accecn_ok(const struct tcphdr *th)
{
	return tcp_accecn_ace(th) > 0x1;
}

/* Used to form the ACE flags for SYN/ACK.
 *
 * Maps the low two bits of @ect (the IP-ECN value received on the SYN) to
 * the ACE value to reflect, and returns it positioned within TCPHDR_ACE.
 */
static inline u16 tcp_accecn_reflector_flags(u8 ect)
{
	u8 ace;

	/* Excerpt from the 1st block of Table 2 of AccECN spec; ACE is
	 * encoded as (AE << 2) | (CWR << 1) | ECE.
	 */
	switch (ect & 0x3) {
	case 0x0:	/* Not-ECT is received */
		ace = 0b010;
		break;
	case 0x1:	/* ECT(1) is received */
		ace = 0b011;
		break;
	case 0x2:	/* ECT(0) is received */
		ace = 0b100;
		break;
	default:	/* 0x3: CE is received */
		ace = 0b110;
		break;
	}

	return FIELD_PREP(TCPHDR_ACE, ace);
}

/* AccECN specification, 3.1.2: If a TCP server that implements AccECN
 * receives a SYN with the three TCP header flags (AE, CWR and ECE) set
 * to any combination other than 000, 011 or 111, it MUST negotiate the
 * use of AccECN as if they had been set to 111.
 */
static inline bool tcp_accecn_syn_requested(const struct tcphdr *th)
{
	u8 ace = tcp_accecn_ace(th);

	return ace && ace != 0x3;
}

/* Reset the received-CE counter and its pending count to zero */
static inline void tcp_accecn_init_counters(struct tcp_sock *tp)
{
	tp->received_ce_pending = 0;
	tp->received_ce = 0;
}

/* Used for make_synack to form the ACE flags: writes AE/CWR/ECE of @th
 * directly from the IP-ECN codepoint @ect that was received on the SYN.
 */
static inline void tcp_accecn_echo_syn_ect(struct tcphdr *th, u8 ect)
{
	/* Excerpt from Table 2 of the AccECN spec:
	 * +====================+====================================+
	 * |  IP-ECN codepoint  |  Respective ACE flags on SYN/ACK   |
	 * |   received on SYN  |       AE       CWR       ECE       |
	 * +====================+====================================+
	 * |      Not-ECT       |       0         1         0        |
	 * |      ECT(1)        |       0         1         1        |
	 * |      ECT(0)        |       1         0         0        |
	 * |        CE          |       1         1         0        |
	 * +====================+====================================+
	 */
	bool is_ect0 = (ect == INET_ECN_ECT_0);
	bool is_ect1 = (ect == INET_ECN_ECT_1);

	th->ece = is_ect1;
	th->cwr = !is_ect0;
	th->ae = (ect & INET_ECN_ECT_0) != 0;
}

/* Fill in the ACE field of an outgoing header.
 *
 * If any ACE bit is already present in the skb's tcp_flags, the header is
 * left alone: the final packet of the 3WHS or anything like it must
 * reflect the SYN/ACK ECT instead of putting CEP into the ACE field, and
 * such cases show up in tcp_flags. Otherwise the low three bits of
 * received_ce + TCP_ACCECN_CEP_INIT_OFFSET are written to ECE (bit 0),
 * CWR (bit 1) and AE (bit 2), and received_ce_pending is cleared.
 */
static inline void tcp_accecn_set_ace(struct tcp_sock *tp, struct sk_buff *skb,
				      struct tcphdr *th)
{
	u32 cep;

	if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACE))
		return;

	cep = tp->received_ce + TCP_ACCECN_CEP_INIT_OFFSET;
	th->ae = (cep >> 2) & 0x1;
	th->cwr = (cep >> 1) & 0x1;
	th->ece = cep & 0x1;
	tp->received_ce_pending = 0;
}

static inline void tcp_ecn_rcv_synack(struct tcp_sock *tp,
				      const struct tcphdr *th)
/* See Table 2 of the AccECN draft */
static inline void tcp_ecn_rcv_synack(struct sock *sk, const struct tcphdr *th,
				      u8 ip_dsfield)
{
	if (tcp_ecn_mode_rfc3168(tp) && (!th->ece || th->cwr))
	struct tcp_sock *tp = tcp_sk(sk);
	u8 ace = tcp_accecn_ace(th);

	switch (ace) {
	case 0x0:
	case 0x7:
		/* +========+========+============+=============+
		 * | A      | B      |  SYN/ACK   |  Feedback   |
		 * |        |        |    B->A    |  Mode of A  |
		 * |        |        | AE CWR ECE |             |
		 * +========+========+============+=============+
		 * | AccECN | No ECN | 0   0   0  |   Not ECN   |
		 * | AccECN | Broken | 1   1   1  |   Not ECN   |
		 * +========+========+============+=============+
		 */
		tcp_ecn_mode_set(tp, TCP_ECN_DISABLED);
		break;
	case 0x1:
	case 0x5:
		/* +========+========+============+=============+
		 * | A      | B      |  SYN/ACK   |  Feedback   |
		 * |        |        |    B->A    |  Mode of A  |
		 * |        |        | AE CWR ECE |             |
		 * +========+========+============+=============+
		 * | AccECN | Nonce  | 1   0   1  | (Reserved)  |
		 * | AccECN | ECN    | 0   0   1  | Classic ECN |
		 * | Nonce  | AccECN | 0   0   1  | Classic ECN |
		 * | ECN    | AccECN | 0   0   1  | Classic ECN |
		 * +========+========+============+=============+
		 */
		if (tcp_ecn_mode_pending(tp))
			/* Downgrade from AccECN, or requested initially */
			tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168);
		break;
	default:
		tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN);
		tp->syn_ect_rcv = ip_dsfield & INET_ECN_MASK;
		if (INET_ECN_is_ce(ip_dsfield) &&
		    tcp_accecn_validate_syn_feedback(sk, ace,
						     tp->syn_ect_snt)) {
			tp->received_ce++;
			tp->received_ce_pending++;
		}
		break;
	}
}

static inline void tcp_ecn_rcv_syn(struct tcp_sock *tp,
				   const struct tcphdr *th)
static inline void tcp_ecn_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th,
				   const struct sk_buff *skb)
{
	if (tcp_ecn_mode_pending(tp)) {
		if (!tcp_accecn_syn_requested(th)) {
			/* Downgrade to classic ECN feedback */
			tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168);
		} else {
			tp->syn_ect_rcv = TCP_SKB_CB(skb)->ip_dsfield &
					  INET_ECN_MASK;
			tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN);
		}
	}
	if (tcp_ecn_mode_rfc3168(tp) && (!th->ece || !th->cwr))
		tcp_ecn_mode_set(tp, TCP_ECN_DISABLED);
}
@@ -110,7 +355,7 @@ static inline bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp,
/* Packet ECN state for a SYN-ACK */
static inline void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR;
	if (tcp_ecn_disabled(tp))
@@ -118,6 +363,13 @@ static inline void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb)
	else if (tcp_ca_needs_ecn(sk) ||
		 tcp_bpf_ca_needs_ecn(sk))
		INET_ECN_xmit(sk);

	if (tp->ecn_flags & TCP_ECN_MODE_ACCECN) {
		TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ACE;
		TCP_SKB_CB(skb)->tcp_flags |=
			tcp_accecn_reflector_flags(tp->syn_ect_rcv);
		tp->syn_ect_snt = inet_sk(sk)->tos & INET_ECN_MASK;
	}
}

/* Packet ECN state for a SYN.  */
@@ -125,8 +377,13 @@ static inline void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk);
	bool use_ecn = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn) == 1 ||
		tcp_ca_needs_ecn(sk) || bpf_needs_ecn;
	bool use_ecn, use_accecn;
	u8 tcp_ecn = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn);

	use_accecn = tcp_ecn == TCP_ECN_IN_ACCECN_OUT_ACCECN;
	use_ecn = tcp_ecn == TCP_ECN_IN_ECN_OUT_ECN ||
		  tcp_ecn == TCP_ECN_IN_ACCECN_OUT_ECN ||
		  tcp_ca_needs_ecn(sk) || bpf_needs_ecn || use_accecn;

	if (!use_ecn) {
		const struct dst_entry *dst = __sk_dst_get(sk);
@@ -142,23 +399,32 @@ static inline void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb)
			INET_ECN_xmit(sk);

		TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR;
		if (use_accecn) {
			TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_AE;
			tcp_ecn_mode_set(tp, TCP_ECN_MODE_PENDING);
			tp->syn_ect_snt = inet_sk(sk)->tos & INET_ECN_MASK;
		} else {
			tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168);
		}
	}
}

static inline void tcp_ecn_clear_syn(struct sock *sk, struct sk_buff *skb)
{
	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback))
	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback)) {
		/* tp->ecn_flags are cleared at a later point in time when
		 * SYN ACK is ultimatively being received.
		 */
		TCP_SKB_CB(skb)->tcp_flags &= ~(TCPHDR_ECE | TCPHDR_CWR);
		TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ACE;
	}
}

static inline void
tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th)
{
	if (inet_rsk(req)->ecn_ok)
	if (tcp_rsk(req)->accecn_ok)
		tcp_accecn_echo_syn_ect(th, tcp_rsk(req)->syn_ect_rcv);
	else if (inet_rsk(req)->ecn_ok)
		th->ece = 1;
}

Loading