Commit 2121c43f authored by Paolo Abeni's avatar Paolo Abeni
Browse files

Merge branch 'inet_diag-remove-three-mutexes-in-diag-dumps'

Eric Dumazet says:

====================
inet_diag: remove three mutexes in diag dumps

Surprisingly, inet_diag operations are serialized over a stack
of three mutexes, giving legacy /proc based files an unfair
advantage on modern hosts.

This series removes all of them, making inet_diag operations
(eg iproute2/ss) fully parallel.

1-2) Two first patches are adding data-race annotations
     and can be backported to stable kernels.

3-4) inet_diag_table_mutex can be replaced with RCU protection,
     if we add corresponding protection against module unload.

5-7) sock_diag_table_mutex can be replaced with RCU protection,
     if we add corresponding protection against module unload.

 8)  sock_diag_mutex is removed, as the old bug it was
     working around has been fixed more elegantly.

 9)  inet_diag_dump_icsk() can skip over empty buckets to reduce
     spinlock contention.
====================

Link: https://lore.kernel.org/r/20240122112603.3270097-1-edumazet@google.com


Signed-off-by: default avatarPaolo Abeni <pabeni@redhat.com>
parents 736b5545 622a08e8
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -8,6 +8,7 @@
struct inet_hashinfo;

struct inet_diag_handler {
	struct module	*owner;
	void		(*dump)(struct sk_buff *skb,
				struct netlink_callback *cb,
				const struct inet_diag_req_v2 *r);
+8 −2
Original line number Diff line number Diff line
@@ -13,6 +13,7 @@ struct nlmsghdr;
struct sock;

struct sock_diag_handler {
	struct module *owner;
	__u8 family;
	int (*dump)(struct sk_buff *skb, struct nlmsghdr *nlh);
	int (*get_info)(struct sk_buff *skb, struct sock *sk);
@@ -22,8 +23,13 @@ struct sock_diag_handler {
int sock_diag_register(const struct sock_diag_handler *h);
void sock_diag_unregister(const struct sock_diag_handler *h);

void sock_diag_register_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh));
void sock_diag_unregister_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh));
struct sock_diag_inet_compat {
	struct module *owner;
	int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh);
};

void sock_diag_register_inet_compat(const struct sock_diag_inet_compat *ptr);
void sock_diag_unregister_inet_compat(const struct sock_diag_inet_compat *ptr);

u64 __sock_gen_cookie(struct sock *sk);

+68 −52
Original line number Diff line number Diff line
@@ -16,9 +16,10 @@
#include <linux/inet_diag.h>
#include <linux/sock_diag.h>

static const struct sock_diag_handler *sock_diag_handlers[AF_MAX];
static int (*inet_rcv_compat)(struct sk_buff *skb, struct nlmsghdr *nlh);
static DEFINE_MUTEX(sock_diag_table_mutex);
static const struct sock_diag_handler __rcu *sock_diag_handlers[AF_MAX];

static struct sock_diag_inet_compat __rcu *inet_rcv_compat;

static struct workqueue_struct *broadcast_wq;

DEFINE_COOKIE(sock_cookie);
@@ -122,6 +123,24 @@ static size_t sock_diag_nlmsg_size(void)
	       + nla_total_size_64bit(sizeof(struct tcp_info))); /* INET_DIAG_INFO */
}

static const struct sock_diag_handler *sock_diag_lock_handler(int family)
{
	const struct sock_diag_handler *handler;

	rcu_read_lock();
	handler = rcu_dereference(sock_diag_handlers[family]);
	if (handler && !try_module_get(handler->owner))
		handler = NULL;
	rcu_read_unlock();

	return handler;
}

static void sock_diag_unlock_handler(const struct sock_diag_handler *handler)
{
	module_put(handler->owner);
}

static void sock_diag_broadcast_destroy_work(struct work_struct *work)
{
	struct broadcast_sk *bsk =
@@ -138,12 +157,12 @@ static void sock_diag_broadcast_destroy_work(struct work_struct *work)
	if (!skb)
		goto out;

	mutex_lock(&sock_diag_table_mutex);
	hndl = sock_diag_handlers[sk->sk_family];
	if (hndl && hndl->get_info)
	hndl = sock_diag_lock_handler(sk->sk_family);
	if (hndl) {
		if (hndl->get_info)
			err = hndl->get_info(skb, sk);
	mutex_unlock(&sock_diag_table_mutex);

		sock_diag_unlock_handler(hndl);
	}
	if (!err)
		nlmsg_multicast(sock_net(sk)->diag_nlsk, skb, 0, group,
				GFP_KERNEL);
@@ -166,51 +185,45 @@ void sock_diag_broadcast_destroy(struct sock *sk)
	queue_work(broadcast_wq, &bsk->work);
}

void sock_diag_register_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh))
void sock_diag_register_inet_compat(const struct sock_diag_inet_compat *ptr)
{
	mutex_lock(&sock_diag_table_mutex);
	inet_rcv_compat = fn;
	mutex_unlock(&sock_diag_table_mutex);
	xchg((__force const struct sock_diag_inet_compat **)&inet_rcv_compat,
	     ptr);
}
EXPORT_SYMBOL_GPL(sock_diag_register_inet_compat);

void sock_diag_unregister_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh))
void sock_diag_unregister_inet_compat(const struct sock_diag_inet_compat *ptr)
{
	mutex_lock(&sock_diag_table_mutex);
	inet_rcv_compat = NULL;
	mutex_unlock(&sock_diag_table_mutex);
	const struct sock_diag_inet_compat *old;

	old = xchg((__force const struct sock_diag_inet_compat **)&inet_rcv_compat,
		   NULL);
	WARN_ON_ONCE(old != ptr);
}
EXPORT_SYMBOL_GPL(sock_diag_unregister_inet_compat);

int sock_diag_register(const struct sock_diag_handler *hndl)
{
	int err = 0;
	int family = hndl->family;

	if (hndl->family >= AF_MAX)
	if (family >= AF_MAX)
		return -EINVAL;

	mutex_lock(&sock_diag_table_mutex);
	if (sock_diag_handlers[hndl->family])
		err = -EBUSY;
	else
		sock_diag_handlers[hndl->family] = hndl;
	mutex_unlock(&sock_diag_table_mutex);

	return err;
	return !cmpxchg((const struct sock_diag_handler **)
				&sock_diag_handlers[family],
			NULL, hndl) ? 0 : -EBUSY;
}
EXPORT_SYMBOL_GPL(sock_diag_register);

void sock_diag_unregister(const struct sock_diag_handler *hnld)
void sock_diag_unregister(const struct sock_diag_handler *hndl)
{
	int family = hnld->family;
	int family = hndl->family;

	if (family >= AF_MAX)
		return;

	mutex_lock(&sock_diag_table_mutex);
	BUG_ON(sock_diag_handlers[family] != hnld);
	sock_diag_handlers[family] = NULL;
	mutex_unlock(&sock_diag_table_mutex);
	xchg((const struct sock_diag_handler **)&sock_diag_handlers[family],
	     NULL);
}
EXPORT_SYMBOL_GPL(sock_diag_unregister);

@@ -227,20 +240,20 @@ static int __sock_diag_cmd(struct sk_buff *skb, struct nlmsghdr *nlh)
		return -EINVAL;
	req->sdiag_family = array_index_nospec(req->sdiag_family, AF_MAX);

	if (sock_diag_handlers[req->sdiag_family] == NULL)
	if (!rcu_access_pointer(sock_diag_handlers[req->sdiag_family]))
		sock_load_diag_module(req->sdiag_family, 0);

	mutex_lock(&sock_diag_table_mutex);
	hndl = sock_diag_handlers[req->sdiag_family];
	hndl = sock_diag_lock_handler(req->sdiag_family);
	if (hndl == NULL)
		err = -ENOENT;
	else if (nlh->nlmsg_type == SOCK_DIAG_BY_FAMILY)
		return -ENOENT;

	if (nlh->nlmsg_type == SOCK_DIAG_BY_FAMILY)
		err = hndl->dump(skb, nlh);
	else if (nlh->nlmsg_type == SOCK_DESTROY && hndl->destroy)
		err = hndl->destroy(skb, nlh);
	else
		err = -EOPNOTSUPP;
	mutex_unlock(&sock_diag_table_mutex);
	sock_diag_unlock_handler(hndl);

	return err;
}
@@ -248,20 +261,27 @@ static int __sock_diag_cmd(struct sk_buff *skb, struct nlmsghdr *nlh)
static int sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
			     struct netlink_ext_ack *extack)
{
	const struct sock_diag_inet_compat *ptr;
	int ret;

	switch (nlh->nlmsg_type) {
	case TCPDIAG_GETSOCK:
	case DCCPDIAG_GETSOCK:
		if (inet_rcv_compat == NULL)

		if (!rcu_access_pointer(inet_rcv_compat))
			sock_load_diag_module(AF_INET, 0);

		mutex_lock(&sock_diag_table_mutex);
		if (inet_rcv_compat != NULL)
			ret = inet_rcv_compat(skb, nlh);
		else
		rcu_read_lock();
		ptr = rcu_dereference(inet_rcv_compat);
		if (ptr && !try_module_get(ptr->owner))
			ptr = NULL;
		rcu_read_unlock();

		ret = -EOPNOTSUPP;
		mutex_unlock(&sock_diag_table_mutex);
		if (ptr) {
			ret = ptr->fn(skb, nlh);
			module_put(ptr->owner);
		}

		return ret;
	case SOCK_DIAG_BY_FAMILY:
@@ -272,13 +292,9 @@ static int sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
	}
}

static DEFINE_MUTEX(sock_diag_mutex);

static void sock_diag_rcv(struct sk_buff *skb)
{
	mutex_lock(&sock_diag_mutex);
	netlink_rcv_skb(skb, &sock_diag_rcv_msg);
	mutex_unlock(&sock_diag_mutex);
}

static int sock_diag_bind(struct net *net, int group)
@@ -286,12 +302,12 @@ static int sock_diag_bind(struct net *net, int group)
	switch (group) {
	case SKNLGRP_INET_TCP_DESTROY:
	case SKNLGRP_INET_UDP_DESTROY:
		if (!sock_diag_handlers[AF_INET])
		if (!rcu_access_pointer(sock_diag_handlers[AF_INET]))
			sock_load_diag_module(AF_INET, 0);
		break;
	case SKNLGRP_INET6_TCP_DESTROY:
	case SKNLGRP_INET6_UDP_DESTROY:
		if (!sock_diag_handlers[AF_INET6])
		if (!rcu_access_pointer(sock_diag_handlers[AF_INET6]))
			sock_load_diag_module(AF_INET6, 0);
		break;
	}
+1 −0
Original line number Diff line number Diff line
@@ -58,6 +58,7 @@ static int dccp_diag_dump_one(struct netlink_callback *cb,
}

static const struct inet_diag_handler dccp_diag_handler = {
	.owner		 = THIS_MODULE,
	.dump		 = dccp_diag_dump,
	.dump_one	 = dccp_diag_dump_one,
	.idiag_get_info	 = dccp_diag_get_info,
+58 −43
Original line number Diff line number Diff line
@@ -32,7 +32,7 @@
#include <linux/inet_diag.h>
#include <linux/sock_diag.h>

static const struct inet_diag_handler **inet_diag_table;
static const struct inet_diag_handler __rcu **inet_diag_table;

struct inet_diag_entry {
	const __be32 *saddr;
@@ -48,28 +48,28 @@ struct inet_diag_entry {
#endif
};

static DEFINE_MUTEX(inet_diag_table_mutex);

static const struct inet_diag_handler *inet_diag_lock_handler(int proto)
{
	if (proto < 0 || proto >= IPPROTO_MAX) {
		mutex_lock(&inet_diag_table_mutex);
		return ERR_PTR(-ENOENT);
	}
	const struct inet_diag_handler *handler;

	if (!inet_diag_table[proto])
	if (proto < 0 || proto >= IPPROTO_MAX)
		return NULL;

	if (!READ_ONCE(inet_diag_table[proto]))
		sock_load_diag_module(AF_INET, proto);

	mutex_lock(&inet_diag_table_mutex);
	if (!inet_diag_table[proto])
		return ERR_PTR(-ENOENT);
	rcu_read_lock();
	handler = rcu_dereference(inet_diag_table[proto]);
	if (handler && !try_module_get(handler->owner))
		handler = NULL;
	rcu_read_unlock();

	return inet_diag_table[proto];
	return handler;
}

static void inet_diag_unlock_handler(const struct inet_diag_handler *handler)
{
	mutex_unlock(&inet_diag_table_mutex);
	module_put(handler->owner);
}

void inet_diag_msg_common_fill(struct inet_diag_msg *r, struct sock *sk)
@@ -104,9 +104,12 @@ static size_t inet_sk_attr_size(struct sock *sk,
	const struct inet_diag_handler *handler;
	size_t aux = 0;

	handler = inet_diag_table[req->sdiag_protocol];
	rcu_read_lock();
	handler = rcu_dereference(inet_diag_table[req->sdiag_protocol]);
	DEBUG_NET_WARN_ON_ONCE(!handler);
	if (handler && handler->idiag_get_aux_size)
		aux = handler->idiag_get_aux_size(sk, net_admin);
	rcu_read_unlock();

	return	  nla_total_size(sizeof(struct tcp_info))
		+ nla_total_size(sizeof(struct inet_diag_msg))
@@ -244,10 +247,16 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
	struct nlmsghdr  *nlh;
	struct nlattr *attr;
	void *info = NULL;
	int protocol;

	cb_data = cb->data;
	handler = inet_diag_table[inet_diag_get_protocol(req, cb_data)];
	BUG_ON(!handler);
	protocol = inet_diag_get_protocol(req, cb_data);

	/* inet_diag_lock_handler() made sure inet_diag_table[] is stable. */
	handler = rcu_dereference_protected(inet_diag_table[protocol], 1);
	DEBUG_NET_WARN_ON_ONCE(!handler);
	if (!handler)
		return -ENXIO;

	nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
			cb->nlh->nlmsg_type, sizeof(*r), nlmsg_flags);
@@ -605,9 +614,10 @@ static int inet_diag_cmd_exact(int cmd, struct sk_buff *in_skb,
	protocol = inet_diag_get_protocol(req, &dump_data);

	handler = inet_diag_lock_handler(protocol);
	if (IS_ERR(handler)) {
		err = PTR_ERR(handler);
	} else if (cmd == SOCK_DIAG_BY_FAMILY) {
	if (!handler)
		return -ENOENT;

	if (cmd == SOCK_DIAG_BY_FAMILY) {
		struct netlink_callback cb = {
			.nlh = nlh,
			.skb = in_skb,
@@ -1035,6 +1045,10 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
			num = 0;
			ilb = &hashinfo->lhash2[i];

			if (hlist_nulls_empty(&ilb->nulls_head)) {
				s_num = 0;
				continue;
			}
			spin_lock(&ilb->lock);
			sk_nulls_for_each(sk, node, &ilb->nulls_head) {
				struct inet_sock *inet = inet_sk(sk);
@@ -1099,6 +1113,10 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
			accum = 0;
			ibb = &hashinfo->bhash2[i];

			if (hlist_empty(&ibb->chain)) {
				s_num = 0;
				continue;
			}
			spin_lock_bh(&ibb->lock);
			inet_bind_bucket_for_each(tb2, &ibb->chain) {
				if (!net_eq(ib2_net(tb2), net))
@@ -1259,12 +1277,12 @@ static int __inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
again:
	prev_min_dump_alloc = cb->min_dump_alloc;
	handler = inet_diag_lock_handler(protocol);
	if (!IS_ERR(handler))
	if (handler) {
		handler->dump(skb, cb, r);
	else
		err = PTR_ERR(handler);
		inet_diag_unlock_handler(handler);

	} else {
		err = -ENOENT;
	}
	/* The skb is not large enough to fit one sk info and
	 * inet_sk_diag_fill() has requested for a larger skb.
	 */
@@ -1457,10 +1475,9 @@ int inet_diag_handler_get_info(struct sk_buff *skb, struct sock *sk)
	}

	handler = inet_diag_lock_handler(sk->sk_protocol);
	if (IS_ERR(handler)) {
		inet_diag_unlock_handler(handler);
	if (!handler) {
		nlmsg_cancel(skb, nlh);
		return PTR_ERR(handler);
		return -ENOENT;
	}

	attr = handler->idiag_info_size
@@ -1479,6 +1496,7 @@ int inet_diag_handler_get_info(struct sk_buff *skb, struct sock *sk)
}

static const struct sock_diag_handler inet_diag_handler = {
	.owner = THIS_MODULE,
	.family = AF_INET,
	.dump = inet_diag_handler_cmd,
	.get_info = inet_diag_handler_get_info,
@@ -1486,6 +1504,7 @@ static const struct sock_diag_handler inet_diag_handler = {
};

static const struct sock_diag_handler inet6_diag_handler = {
	.owner = THIS_MODULE,
	.family = AF_INET6,
	.dump = inet_diag_handler_cmd,
	.get_info = inet_diag_handler_get_info,
@@ -1495,20 +1514,12 @@ static const struct sock_diag_handler inet6_diag_handler = {
int inet_diag_register(const struct inet_diag_handler *h)
{
	const __u16 type = h->idiag_type;
	int err = -EINVAL;

	if (type >= IPPROTO_MAX)
		goto out;
		return -EINVAL;

	mutex_lock(&inet_diag_table_mutex);
	err = -EEXIST;
	if (!inet_diag_table[type]) {
		inet_diag_table[type] = h;
		err = 0;
	}
	mutex_unlock(&inet_diag_table_mutex);
out:
	return err;
	return !cmpxchg((const struct inet_diag_handler **)&inet_diag_table[type],
			NULL, h) ? 0 : -EEXIST;
}
EXPORT_SYMBOL_GPL(inet_diag_register);

@@ -1519,12 +1530,16 @@ void inet_diag_unregister(const struct inet_diag_handler *h)
	if (type >= IPPROTO_MAX)
		return;

	mutex_lock(&inet_diag_table_mutex);
	inet_diag_table[type] = NULL;
	mutex_unlock(&inet_diag_table_mutex);
	xchg((const struct inet_diag_handler **)&inet_diag_table[type],
	     NULL);
}
EXPORT_SYMBOL_GPL(inet_diag_unregister);

static const struct sock_diag_inet_compat inet_diag_compat = {
	.owner	= THIS_MODULE,
	.fn	= inet_diag_rcv_msg_compat,
};

static int __init inet_diag_init(void)
{
	const int inet_diag_table_size = (IPPROTO_MAX *
@@ -1543,7 +1558,7 @@ static int __init inet_diag_init(void)
	if (err)
		goto out_free_inet;

	sock_diag_register_inet_compat(inet_diag_rcv_msg_compat);
	sock_diag_register_inet_compat(&inet_diag_compat);
out:
	return err;

@@ -1558,7 +1573,7 @@ static void __exit inet_diag_exit(void)
{
	sock_diag_unregister(&inet6_diag_handler);
	sock_diag_unregister(&inet_diag_handler);
	sock_diag_unregister_inet_compat(inet_diag_rcv_msg_compat);
	sock_diag_unregister_inet_compat(&inet_diag_compat);
	kfree(inet_diag_table);
}

Loading