Commit 07811361 authored by Jakub Kicinski
Browse files

Merge branch 'tcp-fix-listener-wakeup-after-reuseport-migration'

Zhenzhong Wu says:

====================
tcp: fix listener wakeup after reuseport migration

This series fixes a missing wakeup when inet_csk_listen_stop() migrates
an established child socket from a closing listener to another socket
in the same SO_REUSEPORT group after the child has already been queued
for accept.

The target listener receives the migrated accept-queue entry via
inet_csk_reqsk_queue_add(), but its waiters are not notified.
Nonblocking accept() still succeeds because it checks the accept queue
directly, but readiness-based waiters can remain asleep until another
connection generates a wakeup.

Patch 1 notifies the target listener after a successful migration in
inet_csk_listen_stop() and protects the post-queue_add() nsk accesses
with rcu_read_lock()/rcu_read_unlock().

Patch 2 extends the existing migrate_reuseport BPF selftest with epoll
readiness checks inside migrate_dance(), around shutdown() where the
migration happens. The test now verifies that the target listener is
not ready before migration and becomes ready immediately after it, for
both TCP_ESTABLISHED and TCP_SYN_RECV. TCP_NEW_SYN_RECV remains
excluded because it still depends on later handshake completion.

Testing:
- On a local unpatched kernel, the focused migrate_reuseport test
  fails for the listener-migration cases and passes for the
  TCP_NEW_SYN_RECV cases:
    not ok 1 IPv4 TCP_ESTABLISHED  inet_csk_listen_stop
    not ok 2 IPv4 TCP_SYN_RECV     inet_csk_listen_stop
    ok 3 IPv4 TCP_NEW_SYN_RECV reqsk_timer_handler
    ok 4 IPv4 TCP_NEW_SYN_RECV inet_csk_complete_hashdance
    not ok 5 IPv6 TCP_ESTABLISHED  inet_csk_listen_stop
    not ok 6 IPv6 TCP_SYN_RECV     inet_csk_listen_stop
    ok 7 IPv6 TCP_NEW_SYN_RECV reqsk_timer_handler
    ok 8 IPv6 TCP_NEW_SYN_RECV inet_csk_complete_hashdance
- On a patched kernel booted under QEMU, the full migrate_reuseport
  selftest passes:
    ok 1 IPv4 TCP_ESTABLISHED  inet_csk_listen_stop
    ok 2 IPv4 TCP_SYN_RECV     inet_csk_listen_stop
    ok 3 IPv4 TCP_NEW_SYN_RECV reqsk_timer_handler
    ok 4 IPv4 TCP_NEW_SYN_RECV inet_csk_complete_hashdance
    ok 5 IPv6 TCP_ESTABLISHED  inet_csk_listen_stop
    ok 6 IPv6 TCP_SYN_RECV     inet_csk_listen_stop
    ok 7 IPv6 TCP_NEW_SYN_RECV reqsk_timer_handler
    ok 8 IPv6 TCP_NEW_SYN_RECV inet_csk_complete_hashdance
    SELFTEST_RC=0
====================

Link: https://patch.msgid.link/20260422024554.130346-1-jt26wzz@gmail.com


Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parents e08a9fac c01cfc48
Loading
Loading
Loading
Loading
+3 −0
Original line number Diff line number Diff line
@@ -1479,16 +1479,19 @@ void inet_csk_listen_stop(struct sock *sk)
			if (nreq) {
				refcount_set(&nreq->rsk_refcnt, 1);

				rcu_read_lock();
				if (inet_csk_reqsk_queue_add(nsk, nreq, child)) {
					__NET_INC_STATS(sock_net(nsk),
							LINUX_MIB_TCPMIGRATEREQSUCCESS);
					reqsk_migrate_reset(req);
					READ_ONCE(nsk->sk_data_ready)(nsk);
				} else {
					__NET_INC_STATS(sock_net(nsk),
							LINUX_MIB_TCPMIGRATEREQFAILURE);
					reqsk_migrate_reset(nreq);
					__reqsk_free(nreq);
				}
				rcu_read_unlock();

				/* inet_csk_reqsk_queue_add() has already
				 * called inet_child_forget() on failure case.
+42 −7
Original line number Diff line number Diff line
@@ -7,24 +7,29 @@
 *   3. call listen() for 1 server socket. (migration target)
 *   4. update a map to migrate all child sockets
 *        to the last server socket (migrate_map[cookie] = 4)
 *   5. call shutdown() for first 4 server sockets
 *   5. for TCP_ESTABLISHED and TCP_SYN_RECV cases, verify via epoll
 *        that the last server socket is not ready before migration.
 *   6. call shutdown() for first 4 server sockets
 *        and migrate the requests in the accept queue
 *        to the last server socket.
 *   6. call listen() for the second server socket.
 *   7. call shutdown() for the last server
 *   7. for TCP_ESTABLISHED and TCP_SYN_RECV cases, verify via epoll
 *        that the last server socket is ready after migration.
 *   8. call listen() for the second server socket.
 *   9. call shutdown() for the last server
 *        and migrate the requests in the accept queue
 *        to the second server socket.
 *   8. call listen() for the last server.
 *   9. call shutdown() for the second server
 *  10. call listen() for the last server.
 *  11. call shutdown() for the second server
 *        and migrate the requests in the accept queue
 *        to the last server socket.
 *  10. call accept() for the last server socket.
 *  12. call accept() for the last server socket.
 *
 * Author: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
 */

#include <bpf/bpf.h>
#include <bpf/libbpf.h>
#include <sys/epoll.h>

#include "test_progs.h"
#include "test_migrate_reuseport.skel.h"
@@ -350,21 +355,51 @@ static int update_maps(struct migrate_reuseport_test_case *test_case,

static int migrate_dance(struct migrate_reuseport_test_case *test_case)
{
	struct epoll_event ev = {
		.events = EPOLLIN,
	};
	int epoll = -1, nfds;
	int i, err;

	if (test_case->state != BPF_TCP_NEW_SYN_RECV) {
		epoll = epoll_create1(0);
		if (!ASSERT_NEQ(epoll, -1, "epoll_create1"))
			return -1;

		ev.data.fd = test_case->servers[MIGRATED_TO];
		if (!ASSERT_OK(epoll_ctl(epoll, EPOLL_CTL_ADD,
					 test_case->servers[MIGRATED_TO], &ev),
			       "epoll_ctl"))
			goto close_epoll;

		nfds = epoll_wait(epoll, &ev, 1, 0);
		if (!ASSERT_EQ(nfds, 0, "epoll_wait 1"))
			goto close_epoll;
	}

	/* Migrate TCP_ESTABLISHED and TCP_SYN_RECV requests
	 * to the last listener based on eBPF.
	 */
	for (i = 0; i < MIGRATED_TO; i++) {
		err = shutdown(test_case->servers[i], SHUT_RDWR);
		if (!ASSERT_OK(err, "shutdown"))
			return -1;
			goto close_epoll;
	}

	/* No dance for TCP_NEW_SYN_RECV to migrate based on eBPF */
	if (test_case->state == BPF_TCP_NEW_SYN_RECV)
		return 0;

	nfds = epoll_wait(epoll, &ev, 1, 0);
	if (!ASSERT_EQ(nfds, 1, "epoll_wait 2")) {
close_epoll:
		if (epoll >= 0)
			close(epoll);
		return -1;
	}

	close(epoll);

	/* Note that we use the second listener instead of the
	 * first one here.
	 *