Commit 0f4b437b authored by Jakub Kicinski
Browse files

Merge branch 'tcp-fix-tcp_poll-races'

Eric Dumazet says:

====================
tcp: fix tcp_poll() races

Flakes in packetdrill tests stressing epoll_wait()
were root-caused to bad ordering in tcp_write_err().

Precisely, we have to call sk_error_report() after
tcp_done().

When fixing this issue, we discovered tcp_abort(),
tcp_v4_err() and tcp_v6_err() had similar issues.

Since tcp_reset() has the correct ordering,
the first patch takes part of it and creates
the tcp_done_with_error() helper.
====================

Link: https://lore.kernel.org/r/20240528125253.1966136-1-edumazet@google.com


Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parents c3390677 fde6f897
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -677,6 +677,7 @@ void tcp_skb_collapse_tstamp(struct sk_buff *skb,
/* tcp_input.c */
void tcp_rearm_rto(struct sock *sk);
void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req);
void tcp_done_with_error(struct sock *sk, int err);
void tcp_reset(struct sock *sk, struct sk_buff *skb);
void tcp_fin(struct sock *sk);
void tcp_check_space(struct sock *sk);
+2 −6
Original line number Diff line number Diff line
@@ -598,7 +598,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
		 */
		mask |= EPOLLOUT | EPOLLWRNORM;
	}
	/* This barrier is coupled with smp_wmb() in tcp_reset() */
	/* This barrier is coupled with smp_wmb() in tcp_done_with_error() */
	smp_rmb();
	if (READ_ONCE(sk->sk_err) ||
	    !skb_queue_empty_lockless(&sk->sk_error_queue))
@@ -4576,14 +4576,10 @@ int tcp_abort(struct sock *sk, int err)
	bh_lock_sock(sk);

	if (!sock_flag(sk, SOCK_DEAD)) {
		WRITE_ONCE(sk->sk_err, err);
		/* This barrier is coupled with smp_rmb() in tcp_poll() */
		smp_wmb();
		sk_error_report(sk);
		if (tcp_need_reset(sk->sk_state))
			tcp_send_active_reset(sk, GFP_ATOMIC,
					      SK_RST_REASON_NOT_SPECIFIED);
		tcp_done(sk);
		tcp_done_with_error(sk, err);
	}

	bh_unlock_sock(sk);
+21 −11
Original line number Diff line number Diff line
@@ -4436,9 +4436,26 @@ static enum skb_drop_reason tcp_sequence(const struct tcp_sock *tp,
	return SKB_NOT_DROPPED_YET;
}


/**
 * tcp_done_with_error - record an error on a socket and finish the connection
 * @sk: socket being terminated
 * @err: error code to store in sk->sk_err
 *
 * Stores @err in sk->sk_err, purges the write queue, calls tcp_done() and
 * finally, unless the socket is flagged SOCK_DEAD, calls sk_error_report().
 *
 * The statement order is deliberate: sk_error_report() has to be called
 * after tcp_done(). NOTE(review): presumably this is so that a poller woken
 * by the report already sees the state left by tcp_done() - confirm against
 * tcp_poll().
 */
void tcp_done_with_error(struct sock *sk, int err)
{
	/*
	 * Store the error first, then order that store ahead of the writes
	 * that follow. This barrier is coupled with smp_rmb() in tcp_poll(),
	 * which is issued right before sk->sk_err is read there.
	 */
	WRITE_ONCE(sk->sk_err, err);
	smp_wmb();

	/* Drop pending output, then finish the connection. */
	tcp_write_queue_purge(sk);
	tcp_done(sk);

	/* Report only after tcp_done(), and only if the socket is not SOCK_DEAD. */
	if (!sock_flag(sk, SOCK_DEAD))
		sk_error_report(sk);
}
EXPORT_SYMBOL(tcp_done_with_error);

/* When we get a reset we do this. */
void tcp_reset(struct sock *sk, struct sk_buff *skb)
{
	int err;

	trace_tcp_receive_reset(sk);

	/* mptcp can't tell us to ignore reset pkts,
@@ -4450,24 +4467,17 @@ void tcp_reset(struct sock *sk, struct sk_buff *skb)
	/* We want the right error as BSD sees it (and indeed as we do). */
	switch (sk->sk_state) {
	case TCP_SYN_SENT:
		WRITE_ONCE(sk->sk_err, ECONNREFUSED);
		err = ECONNREFUSED;
		break;
	case TCP_CLOSE_WAIT:
		WRITE_ONCE(sk->sk_err, EPIPE);
		err = EPIPE;
		break;
	case TCP_CLOSE:
		return;
	default:
		WRITE_ONCE(sk->sk_err, ECONNRESET);
		err = ECONNRESET;
	}
	/* This barrier is coupled with smp_rmb() in tcp_poll() */
	smp_wmb();

	tcp_write_queue_purge(sk);
	tcp_done(sk);

	if (!sock_flag(sk, SOCK_DEAD))
		sk_error_report(sk);
	tcp_done_with_error(sk, err);
}

/*
+3 −8
Original line number Diff line number Diff line
@@ -611,15 +611,10 @@ int tcp_v4_err(struct sk_buff *skb, u32 info)

		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);

		if (!sock_owned_by_user(sk)) {
			WRITE_ONCE(sk->sk_err, err);

			sk_error_report(sk);

			tcp_done(sk);
		} else {
		if (!sock_owned_by_user(sk))
			tcp_done_with_error(sk, err);
		else
			WRITE_ONCE(sk->sk_err_soft, err);
		}
		goto out;
	}

+1 −5
Original line number Diff line number Diff line
@@ -74,11 +74,7 @@ u32 tcp_clamp_probe0_to_user_timeout(const struct sock *sk, u32 when)

static void tcp_write_err(struct sock *sk)
{
	WRITE_ONCE(sk->sk_err, READ_ONCE(sk->sk_err_soft) ? : ETIMEDOUT);
	sk_error_report(sk);

	tcp_write_queue_purge(sk);
	tcp_done(sk);
	tcp_done_with_error(sk, READ_ONCE(sk->sk_err_soft) ? : ETIMEDOUT);
	__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONTIMEOUT);
}

Loading