Commit b40671b5 authored by Chia-Yu Chang's avatar Chia-Yu Chang Committed by Paolo Abeni
Browse files

tcp: accecn: AccECN option failure handling



The AccECN option may fail in various ways; handle these:
- Attempt to negotiate the use of AccECN on the 1st retransmitted SYN
	- From the 2nd retransmitted SYN, stop AccECN negotiation
- Remove option from SYN/ACK rexmits to handle blackholes
- If no option arrives in SYN/ACK, assume Option is not usable
        - If an option arrives later, re-enable it
- If option is zeroed, disable AccECN option processing

This patch uses existing padding bits in tcp_request_sock and
holes in tcp_sock, so neither structure grows in size.

Signed-off-by: Ilpo Järvinen <ij@kernel.org>
Signed-off-by: Chia-Yu Chang <chia-yu.chang@nokia-bell-labs.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20250916082434.100722-9-chia-yu.chang@nokia-bell-labs.com


Signed-off-by: Paolo Abeni <pabeni@redhat.com>
parent aa55a7dd
Loading
Loading
Loading
Loading
+3 −1
Original line number Diff line number Diff line
@@ -173,6 +173,7 @@ struct tcp_request_sock {
	u8				syn_ect_snt: 2,
					syn_ect_rcv: 2,
					accecn_fail_mode:4;
	u8				saw_accecn_opt  :2;
#ifdef CONFIG_TCP_AO
	u8				ao_keyid;
	u8				ao_rcv_next;
@@ -407,7 +408,8 @@ struct tcp_sock {
		syn_fastopen_child:1; /* created TFO passive child socket */

	u8	keepalive_probes; /* num of allowed keep alive probes	*/
	u8	accecn_fail_mode:4;	/* AccECN failure handling */
	u8	accecn_fail_mode:4,	/* AccECN failure handling */
		saw_accecn_opt:2;	/* An AccECN option was seen */
	u32	tcp_tx_delay;	/* delay (in usec) added to TX packets */

/* RTT measurement */
+48 −3
Original line number Diff line number Diff line
@@ -91,6 +91,11 @@ static inline void tcp_accecn_fail_mode_set(struct tcp_sock *tp, u8 mode)
	tp->accecn_fail_mode |= mode;
}

/* States recorded in the 2-bit saw_accecn_opt field.
 *
 * The numeric order matters: callers compare the stored value with
 * "< TCP_ACCECN_OPT_COUNTER_SEEN" to decide whether the option still has to
 * be (re)classified by tcp_accecn_option_init().
 *
 * NOT_SEEN     - reset value, no AccECN option has been classified yet
 * EMPTY_SEEN   - an option arrived but was too short to hold a counter field
 * COUNTER_SEEN - an option arrived and its checked counter fields were non-zero
 * FAIL_SEEN    - the option was judged unusable (unexpected kind, a checked
 *                counter was zero, or it was never seen before too many bytes
 *                had been sent)
 */
#define TCP_ACCECN_OPT_NOT_SEEN		0x0
#define TCP_ACCECN_OPT_EMPTY_SEEN	0x1
#define TCP_ACCECN_OPT_COUNTER_SEEN	0x2
#define TCP_ACCECN_OPT_FAIL_SEEN	0x3

static inline u8 tcp_accecn_ace(const struct tcphdr *th)
{
	return (th->ae << 2) | (th->cwr << 1) | th->ece;
@@ -146,6 +151,14 @@ static inline bool tcp_accecn_validate_syn_feedback(struct sock *sk, u8 ace,
	return true;
}

/* Record the latest AccECN option classification in @tp and, when that
 * classification is TCP_ACCECN_OPT_FAIL_SEEN, additionally flag
 * TCP_ACCECN_OPT_FAIL_RECV in the connection's AccECN fail mode.
 *
 * @tp:      TCP socket whose saw_accecn_opt state is updated
 * @saw_opt: one of the TCP_ACCECN_OPT_* states
 */
static inline void tcp_accecn_saw_opt_fail_recv(struct tcp_sock *tp,
						u8 saw_opt)
{
	tp->saw_accecn_opt = saw_opt;

	/* Test the stored bit-field rather than @saw_opt so the decision is
	 * made on exactly the value that was recorded.
	 */
	if (tp->saw_accecn_opt != TCP_ACCECN_OPT_FAIL_SEEN)
		return;

	tcp_accecn_fail_mode_set(tp, TCP_ACCECN_OPT_FAIL_RECV);
}

/* Validate the 3rd ACK based on the ACE field, see Table 4 of AccECN spec */
static inline void tcp_accecn_third_ack(struct sock *sk,
					const struct sk_buff *skb, u8 sent_ect)
@@ -428,9 +441,35 @@ static inline void tcp_accecn_set_ace(struct tcp_sock *tp, struct sk_buff *skb,
	}
}

/* Classify the AccECN option found @opt_offset bytes into the transport
 * header of @skb.
 *
 * Returns:
 *   TCP_ACCECN_OPT_FAIL_SEEN    - the option kind is neither TCPOPT_ACCECN0
 *                                 nor TCPOPT_ACCECN1 (also triggers a one-time
 *                                 warning), or a checked counter field is zero
 *   TCP_ACCECN_OPT_EMPTY_SEEN   - the option payload is shorter than one
 *                                 counter field
 *   TCP_ACCECN_OPT_COUNTER_SEEN - every checked counter field is non-zero
 *
 * NOTE(review): the length byte is assumed to be >= 2 here; presumably the
 * option parser that stored the offset already enforced that - confirm.
 */
static inline u8 tcp_accecn_option_init(const struct sk_buff *skb,
					u8 opt_offset)
{
	u8 *ptr = skb_transport_header(skb) + opt_offset;
	/* Payload length: the length byte minus the kind and length bytes. */
	unsigned int payload_len = ptr[1] - 2;
	const u8 *fields = ptr + 2;

	if (WARN_ON_ONCE(ptr[0] != TCPOPT_ACCECN0 && ptr[0] != TCPOPT_ACCECN1))
		return TCP_ACCECN_OPT_FAIL_SEEN;

	/* Not even one counter field present: the option is empty. */
	if (payload_len < TCPOLEN_ACCECN_PERFIELD)
		return TCP_ACCECN_OPT_EMPTY_SEEN;

	/* Detect option zeroing: an AccECN connection "MAY check that the
	 * initial value of the EE0B field or the EE1B field is non-zero"
	 */
	if (!get_unaligned_be24(fields))
		return TCP_ACCECN_OPT_FAIL_SEEN;

	/* The third field is only checked when all three fields are present;
	 * the second field is never checked.
	 */
	if (payload_len >= TCPOLEN_ACCECN_PERFIELD * 3 &&
	    !get_unaligned_be24(fields + TCPOLEN_ACCECN_PERFIELD * 2))
		return TCP_ACCECN_OPT_FAIL_SEEN;

	return TCP_ACCECN_OPT_COUNTER_SEEN;
}

/* See Table 2 of the AccECN draft */
static inline void tcp_ecn_rcv_synack(struct sock *sk, const struct tcphdr *th,
				      u8 ip_dsfield)
static inline void tcp_ecn_rcv_synack(struct sock *sk, const struct sk_buff *skb,
				      const struct tcphdr *th, u8 ip_dsfield)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u8 ace = tcp_accecn_ace(th);
@@ -469,7 +508,13 @@ static inline void tcp_ecn_rcv_synack(struct sock *sk, const struct tcphdr *th,
	default:
		tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN);
		tp->syn_ect_rcv = ip_dsfield & INET_ECN_MASK;
		if (tp->rx_opt.accecn &&
		    tp->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) {
			u8 saw_opt = tcp_accecn_option_init(skb, tp->rx_opt.accecn);

			tcp_accecn_saw_opt_fail_recv(tp, saw_opt);
			tp->accecn_opt_demand = 2;
		}
		if (INET_ECN_is_ce(ip_dsfield) &&
		    tcp_accecn_validate_syn_feedback(sk, ace,
						     tp->syn_ect_snt)) {
+2 −0
Original line number Diff line number Diff line
@@ -323,6 +323,8 @@ struct tcp_info {
	__u32	tcpi_received_e1_bytes;
	__u32	tcpi_received_e0_bytes;
	__u32	tcpi_received_ce_bytes;
	__u16	tcpi_accecn_fail_mode;
	__u16	tcpi_accecn_opt_seen;
};

/* netlink attributes types for SCM_TIMESTAMPING_OPT_STATS */
+3 −0
Original line number Diff line number Diff line
@@ -3409,6 +3409,7 @@ int tcp_disconnect(struct sock *sk, int flags)
	tp->delivered = 0;
	tp->delivered_ce = 0;
	tp->accecn_fail_mode = 0;
	tp->saw_accecn_opt = TCP_ACCECN_OPT_NOT_SEEN;
	tcp_accecn_init_counters(tp);
	tp->prev_ecnfield = 0;
	tp->accecn_opt_tstamp = 0;
@@ -4287,6 +4288,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
	if (tp->rto_stamp)
		info->tcpi_total_rto_time += tcp_clock_ms() - tp->rto_stamp;

	info->tcpi_accecn_fail_mode = tp->accecn_fail_mode;
	info->tcpi_accecn_opt_seen = tp->saw_accecn_opt;
	info->tcpi_received_ce = tp->received_ce;
	info->tcpi_delivered_e1_bytes = tp->delivered_ecn_bytes[ect1_idx];
	info->tcpi_delivered_e0_bytes = tp->delivered_ecn_bytes[ect0_idx];
+33 −2
Original line number Diff line number Diff line
@@ -398,7 +398,22 @@ static bool tcp_accecn_process_option(struct tcp_sock *tp,
	unsigned int i;
	u8 *ptr;

	if (tcp_accecn_opt_fail_recv(tp))
		return false;

	if (!(flag & FLAG_SLOWPATH) || !tp->rx_opt.accecn) {
		if (!tp->saw_accecn_opt) {
			/* Too late to enable after this point due to
			 * potential counter wraps
			 */
			if (tp->bytes_sent >= (1 << 23) - 1) {
				u8 saw_opt = TCP_ACCECN_OPT_FAIL_SEEN;

				tcp_accecn_saw_opt_fail_recv(tp, saw_opt);
			}
			return false;
		}

		if (estimate_ecnfield) {
			u8 ecnfield = estimate_ecnfield - 1;

@@ -415,6 +430,13 @@ static bool tcp_accecn_process_option(struct tcp_sock *tp,
	order1 = (ptr[0] == TCPOPT_ACCECN1);
	ptr += 2;

	if (tp->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) {
		tp->saw_accecn_opt = tcp_accecn_option_init(skb,
							    tp->rx_opt.accecn);
		if (tp->saw_accecn_opt == TCP_ACCECN_OPT_FAIL_SEEN)
			tcp_accecn_fail_mode_set(tp, TCP_ACCECN_OPT_FAIL_RECV);
	}

	res = !!estimate_ecnfield;
	for (i = 0; i < 3; i++) {
		u32 init_offset;
@@ -6123,8 +6145,14 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
	if (th->syn) {
		if (tcp_ecn_mode_accecn(tp)) {
			accecn_reflector = true;
			if (tp->rx_opt.accecn &&
			    tp->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) {
				u8 saw_opt = tcp_accecn_option_init(skb, tp->rx_opt.accecn);

				tcp_accecn_saw_opt_fail_recv(tp, saw_opt);
				tcp_accecn_opt_demand_min(sk, 1);
			}
		}
		if (sk->sk_state == TCP_SYN_RECV && sk->sk_socket && th->ack &&
		    TCP_SKB_CB(skb)->seq + 1 == TCP_SKB_CB(skb)->end_seq &&
		    TCP_SKB_CB(skb)->seq + 1 == tp->rcv_nxt &&
@@ -6606,7 +6634,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
		 */

		if (tcp_ecn_mode_any(tp))
			tcp_ecn_rcv_synack(sk, th, TCP_SKB_CB(skb)->ip_dsfield);
			tcp_ecn_rcv_synack(sk, skb, th,
					   TCP_SKB_CB(skb)->ip_dsfield);

		tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
		tcp_try_undo_spurious_syn(sk);
@@ -7177,6 +7206,8 @@ static void tcp_openreq_init(struct request_sock *req,
	tcp_rsk(req)->snt_tsval_first = 0;
	tcp_rsk(req)->last_oow_ack_time = 0;
	tcp_rsk(req)->accecn_ok = 0;
	tcp_rsk(req)->saw_accecn_opt = TCP_ACCECN_OPT_NOT_SEEN;
	tcp_rsk(req)->accecn_fail_mode = 0;
	tcp_rsk(req)->syn_ect_rcv = 0;
	tcp_rsk(req)->syn_ect_snt = 0;
	req->mss = rx_opt->mss_clamp;
Loading