Commit 5c70eb5c authored by Eric Dumazet, committed by Jakub Kicinski
Browse files

net: better track kernel sockets lifetime



While kernel sockets are dismantled during pernet_operations->exit(),
their freeing can be delayed by any tx packets still held in qdisc
or device queues, due to prior skb_set_owner_w() calls.

This then triggers the following warning from ref_tracker_dir_exit() [1]

To fix this, make sure that kernel sockets own a reference on net->passive.

Add sk_net_refcnt_upgrade() helper, used whenever a kernel socket
is converted to a refcounted one.

[1]

[  136.263918][   T35] ref_tracker: net notrefcnt@ffff8880638f01e0 has 1/2 users at
[  136.263918][   T35]      sk_alloc+0x2b3/0x370
[  136.263918][   T35]      inet6_create+0x6ce/0x10f0
[  136.263918][   T35]      __sock_create+0x4c0/0xa30
[  136.263918][   T35]      inet_ctl_sock_create+0xc2/0x250
[  136.263918][   T35]      igmp6_net_init+0x39/0x390
[  136.263918][   T35]      ops_init+0x31e/0x590
[  136.263918][   T35]      setup_net+0x287/0x9e0
[  136.263918][   T35]      copy_net_ns+0x33f/0x570
[  136.263918][   T35]      create_new_namespaces+0x425/0x7b0
[  136.263918][   T35]      unshare_nsproxy_namespaces+0x124/0x180
[  136.263918][   T35]      ksys_unshare+0x57d/0xa70
[  136.263918][   T35]      __x64_sys_unshare+0x38/0x40
[  136.263918][   T35]      do_syscall_64+0xf3/0x230
[  136.263918][   T35]      entry_SYSCALL_64_after_hwframe+0x77/0x7f
[  136.263918][   T35]
[  136.343488][   T35] ref_tracker: net notrefcnt@ffff8880638f01e0 has 1/2 users at
[  136.343488][   T35]      sk_alloc+0x2b3/0x370
[  136.343488][   T35]      inet6_create+0x6ce/0x10f0
[  136.343488][   T35]      __sock_create+0x4c0/0xa30
[  136.343488][   T35]      inet_ctl_sock_create+0xc2/0x250
[  136.343488][   T35]      ndisc_net_init+0xa7/0x2b0
[  136.343488][   T35]      ops_init+0x31e/0x590
[  136.343488][   T35]      setup_net+0x287/0x9e0
[  136.343488][   T35]      copy_net_ns+0x33f/0x570
[  136.343488][   T35]      create_new_namespaces+0x425/0x7b0
[  136.343488][   T35]      unshare_nsproxy_namespaces+0x124/0x180
[  136.343488][   T35]      ksys_unshare+0x57d/0xa70
[  136.343488][   T35]      __x64_sys_unshare+0x38/0x40
[  136.343488][   T35]      do_syscall_64+0xf3/0x230
[  136.343488][   T35]      entry_SYSCALL_64_after_hwframe+0x77/0x7f

Fixes: 0cafd77d ("net: add a refcount tracker for kernel sockets")
Reported-by: <syzbot+30a19e01a97420719891@syzkaller.appspotmail.com>
Closes: https://lore.kernel.org/netdev/67b72aeb.050a0220.14d86d.0283.GAE@google.com/T/#u


Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Link: https://patch.msgid.link/20250220131854.4048077-1-edumazet@google.com


Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parent fde9836c
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -1751,6 +1751,7 @@ static inline bool sock_allow_reclassification(const struct sock *csk)
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
		      struct proto *prot, int kern);
void sk_free(struct sock *sk);
void sk_net_refcnt_upgrade(struct sock *sk);
void sk_destruct(struct sock *sk);
struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority);
void sk_free_unlock_clone(struct sock *sk);
+22 −5
Original line number Diff line number Diff line
@@ -2246,6 +2246,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
			get_net_track(net, &sk->ns_tracker, priority);
			sock_inuse_add(net, 1);
		} else {
			net_passive_inc(net);
			__netns_tracker_alloc(net, &sk->ns_tracker,
					      false, priority);
		}
@@ -2270,6 +2271,7 @@ EXPORT_SYMBOL(sk_alloc);
static void __sk_destruct(struct rcu_head *head)
{
	struct sock *sk = container_of(head, struct sock, sk_rcu);
	struct net *net = sock_net(sk);
	struct sk_filter *filter;

	if (sk->sk_destruct)
@@ -2301,14 +2303,28 @@ static void __sk_destruct(struct rcu_head *head)
	put_cred(sk->sk_peer_cred);
	put_pid(sk->sk_peer_pid);

	if (likely(sk->sk_net_refcnt))
		put_net_track(sock_net(sk), &sk->ns_tracker);
	else
		__netns_tracker_free(sock_net(sk), &sk->ns_tracker, false);

	if (likely(sk->sk_net_refcnt)) {
		put_net_track(net, &sk->ns_tracker);
	} else {
		__netns_tracker_free(net, &sk->ns_tracker, false);
		net_passive_dec(net);
	}
	sk_prot_free(sk->sk_prot_creator, sk);
}

/**
 * sk_net_refcnt_upgrade - convert a kernel socket into a netns-refcounted one
 * @sk: socket to convert; its sk_net_refcnt is expected to be clear
 *
 * Releases what a non-refcounted socket holds on its netns (the "notrefcnt"
 * ns_tracker entry and the net->passive reference, as taken in the
 * non-refcounted branch of sk_alloc()), then sets sk_net_refcnt, takes a
 * tracked netns reference and bumps the per-netns socket in-use count, as the
 * refcounted branch of sk_alloc() does.
 *
 * Emits a one-time warning if the socket is already refcounted; the
 * conversion is still performed in that case.
 *
 * NOTE(review): the tracker is allocated with GFP_KERNEL, so callers are
 * presumably expected to be in a sleepable context and to guarantee the netns
 * is still alive across the call -- confirm against callers.
 */
void sk_net_refcnt_upgrade(struct sock *sk)
{
	struct net *net = sock_net(sk);

	/* Upgrading an already refcounted socket is a caller bug. */
	WARN_ON_ONCE(sk->sk_net_refcnt);
	/* Drop the non-refcounted tracker entry and its net->passive ref. */
	__netns_tracker_free(net, &sk->ns_tracker, false);
	net_passive_dec(net);
	/* From here on the socket is destroyed via the put_net_track() path. */
	sk->sk_net_refcnt = 1;
	get_net_track(net, &sk->ns_tracker, GFP_KERNEL);
	sock_inuse_add(net, 1);
}
EXPORT_SYMBOL_GPL(sk_net_refcnt_upgrade);

void sk_destruct(struct sock *sk)
{
	bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
@@ -2405,6 +2421,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
		 * is not properly dismantling its kernel sockets at netns
		 * destroy time.
		 */
		net_passive_inc(sock_net(newsk));
		__netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker,
				      false, priority);
	}
+1 −4
Original line number Diff line number Diff line
@@ -1772,10 +1772,7 @@ int mptcp_subflow_create_socket(struct sock *sk, unsigned short family,
	 * needs it.
	 * Update ns_tracker to current stack trace and refcounted tracker.
	 */
	__netns_tracker_free(net, &sf->sk->ns_tracker, false);
	sf->sk->sk_net_refcnt = 1;
	get_net_track(net, &sf->sk->ns_tracker, GFP_KERNEL);
	sock_inuse_add(net, 1);
	sk_net_refcnt_upgrade(sf->sk);
	err = tcp_set_ulp(sf->sk, "mptcp");
	if (err)
		goto err_free;
+0 −10
Original line number Diff line number Diff line
@@ -795,16 +795,6 @@ static int netlink_release(struct socket *sock)

	sock_prot_inuse_add(sock_net(sk), &netlink_proto, -1);

	/* Because struct net might disappear soon, do not keep a pointer. */
	if (!sk->sk_net_refcnt && sock_net(sk) != &init_net) {
		__netns_tracker_free(sock_net(sk), &sk->ns_tracker, false);
		/* Because of deferred_put_nlk_sk and use of work queue,
		 * it is possible  netns will be freed before this socket.
		 */
		sock_net_set(sk, &init_net);
		__netns_tracker_alloc(&init_net, &sk->ns_tracker,
				      false, GFP_KERNEL);
	}
	call_rcu(&nlk->rcu, deferred_put_nlk_sk);
	return 0;
}
+2 −6
Original line number Diff line number Diff line
@@ -504,12 +504,8 @@ bool rds_tcp_tune(struct socket *sock)
			release_sock(sk);
			return false;
		}
		/* Update ns_tracker to current stack trace and refcounted tracker */
		__netns_tracker_free(net, &sk->ns_tracker, false);

		sk->sk_net_refcnt = 1;
		netns_tracker_alloc(net, &sk->ns_tracker, GFP_KERNEL);
		sock_inuse_add(net, 1);
		sk_net_refcnt_upgrade(sk);
		put_net(net);
	}
	rtn = net_generic(net, rds_tcp_netid);
	if (rtn->sndbuf_size > 0) {
Loading