Commit 3cbab892 authored by Jakub Kicinski
Browse files

Merge branch 'inet-implement-lockless-rtm_getnetconf-ops'

Eric Dumazet says:

====================
inet: implement lockless RTM_GETNETCONF ops

This series removes RTNL use for RTM_GETNETCONF operations on AF_INET.

- Annotate data-races to avoid possible KCSAN splats.

- "ip -4 netconf show dev XXX" can be implemented without RTNL [1]

- "ip -4 netconf" dumps can be implemented using RCU instead of RTNL [1]

[1] This only refers to RTM_GETNETCONF operation, "ip" command
    also uses RTM_GETLINK dumps which are using RTNL at this moment.
====================

Link: https://lore.kernel.org/r/20240227092411.2315725-1-edumazet@google.com


Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parents e83ddcea 16748707
Loading
Loading
Loading
Loading
+8 −6
Original line number Diff line number Diff line
@@ -53,13 +53,15 @@ struct in_device {
};

#define IPV4_DEVCONF(cnf, attr) ((cnf).data[IPV4_DEVCONF_ ## attr - 1])
#define IPV4_DEVCONF_RO(cnf, attr) READ_ONCE(IPV4_DEVCONF(cnf, attr))
#define IPV4_DEVCONF_ALL(net, attr) \
	IPV4_DEVCONF((*(net)->ipv4.devconf_all), attr)
#define IPV4_DEVCONF_ALL_RO(net, attr) READ_ONCE(IPV4_DEVCONF_ALL(net, attr))

static inline int ipv4_devconf_get(struct in_device *in_dev, int index)
static inline int ipv4_devconf_get(const struct in_device *in_dev, int index)
{
	index--;
	return in_dev->cnf.data[index];
	return READ_ONCE(in_dev->cnf.data[index]);
}

static inline void ipv4_devconf_set(struct in_device *in_dev, int index,
@@ -67,7 +69,7 @@ static inline void ipv4_devconf_set(struct in_device *in_dev, int index,
{
	index--;
	set_bit(index, in_dev->cnf.state);
	in_dev->cnf.data[index] = val;
	WRITE_ONCE(in_dev->cnf.data[index], val);
}

static inline void ipv4_devconf_setall(struct in_device *in_dev)
@@ -81,18 +83,18 @@ static inline void ipv4_devconf_setall(struct in_device *in_dev)
	ipv4_devconf_set((in_dev), IPV4_DEVCONF_ ## attr, (val))

#define IN_DEV_ANDCONF(in_dev, attr) \
	(IPV4_DEVCONF_ALL(dev_net(in_dev->dev), attr) && \
	(IPV4_DEVCONF_ALL_RO(dev_net(in_dev->dev), attr) && \
	 IN_DEV_CONF_GET((in_dev), attr))

#define IN_DEV_NET_ORCONF(in_dev, net, attr) \
	(IPV4_DEVCONF_ALL(net, attr) || \
	(IPV4_DEVCONF_ALL_RO(net, attr) || \
	 IN_DEV_CONF_GET((in_dev), attr))

#define IN_DEV_ORCONF(in_dev, attr) \
	IN_DEV_NET_ORCONF(in_dev, dev_net(in_dev->dev), attr)

#define IN_DEV_MAXCONF(in_dev, attr) \
	(max(IPV4_DEVCONF_ALL(dev_net(in_dev->dev), attr), \
	(max(IPV4_DEVCONF_ALL_RO(dev_net(in_dev->dev), attr), \
	     IN_DEV_CONF_GET((in_dev), attr)))

#define IN_DEV_FORWARD(in_dev)		IN_DEV_CONF_GET((in_dev), FORWARDING)
+68 −79
Original line number Diff line number Diff line
@@ -1982,7 +1982,7 @@ static int inet_fill_link_af(struct sk_buff *skb, const struct net_device *dev,
		return -EMSGSIZE;

	for (i = 0; i < IPV4_DEVCONF_MAX; i++)
		((u32 *) nla_data(nla))[i] = in_dev->cnf.data[i];
		((u32 *) nla_data(nla))[i] = READ_ONCE(in_dev->cnf.data[i]);

	return 0;
}
@@ -2068,9 +2068,9 @@ static int inet_netconf_msgsize_devconf(int type)
}

static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex,
				     struct ipv4_devconf *devconf, u32 portid,
				     u32 seq, int event, unsigned int flags,
				     int type)
				     const struct ipv4_devconf *devconf,
				     u32 portid, u32 seq, int event,
				     unsigned int flags, int type)
{
	struct nlmsghdr  *nlh;
	struct netconfmsg *ncm;
@@ -2095,27 +2095,28 @@ static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex,

	if ((all || type == NETCONFA_FORWARDING) &&
	    nla_put_s32(skb, NETCONFA_FORWARDING,
			IPV4_DEVCONF(*devconf, FORWARDING)) < 0)
			IPV4_DEVCONF_RO(*devconf, FORWARDING)) < 0)
		goto nla_put_failure;
	if ((all || type == NETCONFA_RP_FILTER) &&
	    nla_put_s32(skb, NETCONFA_RP_FILTER,
			IPV4_DEVCONF(*devconf, RP_FILTER)) < 0)
			IPV4_DEVCONF_RO(*devconf, RP_FILTER)) < 0)
		goto nla_put_failure;
	if ((all || type == NETCONFA_MC_FORWARDING) &&
	    nla_put_s32(skb, NETCONFA_MC_FORWARDING,
			IPV4_DEVCONF(*devconf, MC_FORWARDING)) < 0)
			IPV4_DEVCONF_RO(*devconf, MC_FORWARDING)) < 0)
		goto nla_put_failure;
	if ((all || type == NETCONFA_BC_FORWARDING) &&
	    nla_put_s32(skb, NETCONFA_BC_FORWARDING,
			IPV4_DEVCONF(*devconf, BC_FORWARDING)) < 0)
			IPV4_DEVCONF_RO(*devconf, BC_FORWARDING)) < 0)
		goto nla_put_failure;
	if ((all || type == NETCONFA_PROXY_NEIGH) &&
	    nla_put_s32(skb, NETCONFA_PROXY_NEIGH,
			IPV4_DEVCONF(*devconf, PROXY_ARP)) < 0)
			IPV4_DEVCONF_RO(*devconf, PROXY_ARP)) < 0)
		goto nla_put_failure;
	if ((all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) &&
	    nla_put_s32(skb, NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN,
			IPV4_DEVCONF(*devconf, IGNORE_ROUTES_WITH_LINKDOWN)) < 0)
			IPV4_DEVCONF_RO(*devconf,
					IGNORE_ROUTES_WITH_LINKDOWN)) < 0)
		goto nla_put_failure;

out:
@@ -2205,20 +2206,19 @@ static int inet_netconf_get_devconf(struct sk_buff *in_skb,
{
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[NETCONFA_MAX + 1];
	const struct ipv4_devconf *devconf;
	struct in_device *in_dev = NULL;
	struct net_device *dev = NULL;
	struct sk_buff *skb;
	struct ipv4_devconf *devconf;
	struct in_device *in_dev;
	struct net_device *dev;
	int ifindex;
	int err;

	err = inet_netconf_valid_get_req(in_skb, nlh, tb, extack);
	if (err)
		goto errout;
		return err;

	err = -EINVAL;
	if (!tb[NETCONFA_IFINDEX])
		goto errout;
		return -EINVAL;

	ifindex = nla_get_s32(tb[NETCONFA_IFINDEX]);
	switch (ifindex) {
@@ -2229,10 +2229,10 @@ static int inet_netconf_get_devconf(struct sk_buff *in_skb,
		devconf = net->ipv4.devconf_dflt;
		break;
	default:
		dev = __dev_get_by_index(net, ifindex);
		if (!dev)
			goto errout;
		in_dev = __in_dev_get_rtnl(dev);
		err = -ENODEV;
		dev = dev_get_by_index(net, ifindex);
		if (dev)
			in_dev = in_dev_get(dev);
		if (!in_dev)
			goto errout;
		devconf = &in_dev->cnf;
@@ -2256,6 +2256,9 @@ static int inet_netconf_get_devconf(struct sk_buff *in_skb,
	}
	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
	if (in_dev)
		in_dev_put(in_dev);
	dev_put(dev);
	return err;
}

@@ -2264,11 +2267,13 @@ static int inet_netconf_dump_devconf(struct sk_buff *skb,
{
	const struct nlmsghdr *nlh = cb->nlh;
	struct net *net = sock_net(skb->sk);
	int h, s_h;
	int idx, s_idx;
	struct {
		unsigned long ifindex;
		unsigned int all_default;
	} *ctx = (void *)cb->ctx;
	const struct in_device *in_dev;
	struct net_device *dev;
	struct in_device *in_dev;
	struct hlist_head *head;
	int err = 0;

	if (cb->strict_check) {
		struct netlink_ext_ack *extack = cb->extack;
@@ -2285,64 +2290,47 @@ static int inet_netconf_dump_devconf(struct sk_buff *skb,
		}
	}

	s_h = cb->args[0];
	s_idx = idx = cb->args[1];

	for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
		idx = 0;
		head = &net->dev_index_head[h];
	rcu_read_lock();
		cb->seq = inet_base_seq(net);
		hlist_for_each_entry_rcu(dev, head, index_hlist) {
			if (idx < s_idx)
				goto cont;
	for_each_netdev_dump(net, dev, ctx->ifindex) {
		in_dev = __in_dev_get_rcu(dev);
		if (!in_dev)
				goto cont;

			if (inet_netconf_fill_devconf(skb, dev->ifindex,
			continue;
		err = inet_netconf_fill_devconf(skb, dev->ifindex,
						&in_dev->cnf,
						NETLINK_CB(cb->skb).portid,
						nlh->nlmsg_seq,
						      RTM_NEWNETCONF,
						      NLM_F_MULTI,
						      NETCONFA_ALL) < 0) {
				rcu_read_unlock();
						RTM_NEWNETCONF, NLM_F_MULTI,
						NETCONFA_ALL);
		if (err < 0)
			goto done;
	}
			nl_dump_check_consistent(cb, nlmsg_hdr(skb));
cont:
			idx++;
		}
		rcu_read_unlock();
	}
	if (h == NETDEV_HASHENTRIES) {
		if (inet_netconf_fill_devconf(skb, NETCONFA_IFINDEX_ALL,
	if (ctx->all_default == 0) {
		err = inet_netconf_fill_devconf(skb, NETCONFA_IFINDEX_ALL,
						net->ipv4.devconf_all,
						NETLINK_CB(cb->skb).portid,
						nlh->nlmsg_seq,
						RTM_NEWNETCONF, NLM_F_MULTI,
					      NETCONFA_ALL) < 0)
						NETCONFA_ALL);
		if (err < 0)
			goto done;
		else
			h++;
		ctx->all_default++;
	}
	if (h == NETDEV_HASHENTRIES + 1) {
		if (inet_netconf_fill_devconf(skb, NETCONFA_IFINDEX_DEFAULT,
	if (ctx->all_default == 1) {
		err = inet_netconf_fill_devconf(skb, NETCONFA_IFINDEX_DEFAULT,
						net->ipv4.devconf_dflt,
						NETLINK_CB(cb->skb).portid,
						nlh->nlmsg_seq,
						RTM_NEWNETCONF, NLM_F_MULTI,
					      NETCONFA_ALL) < 0)
						NETCONFA_ALL);
		if (err < 0)
			goto done;
		else
			h++;
		ctx->all_default++;
	}
done:
	cb->args[0] = h;
	cb->args[1] = idx;

	return skb->len;
	if (err < 0 && likely(skb->len))
		err = skb->len;
	rcu_read_unlock();
	return err;
}

#ifdef CONFIG_SYSCTL
@@ -2825,5 +2813,6 @@ void __init devinet_init(void)
	rtnl_register(PF_INET, RTM_DELADDR, inet_rtm_deladdr, NULL, 0);
	rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr, 0);
	rtnl_register(PF_INET, RTM_GETNETCONF, inet_netconf_get_devconf,
		      inet_netconf_dump_devconf, 0);
		      inet_netconf_dump_devconf,
		      RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED);
}
+2 −2
Original line number Diff line number Diff line
@@ -120,12 +120,12 @@
 */

#define IGMP_V1_SEEN(in_dev) \
	(IPV4_DEVCONF_ALL(dev_net(in_dev->dev), FORCE_IGMP_VERSION) == 1 || \
	(IPV4_DEVCONF_ALL_RO(dev_net(in_dev->dev), FORCE_IGMP_VERSION) == 1 || \
	 IN_DEV_CONF_GET((in_dev), FORCE_IGMP_VERSION) == 1 || \
	 ((in_dev)->mr_v1_seen && \
	  time_before(jiffies, (in_dev)->mr_v1_seen)))
#define IGMP_V2_SEEN(in_dev) \
	(IPV4_DEVCONF_ALL(dev_net(in_dev->dev), FORCE_IGMP_VERSION) == 2 || \
	(IPV4_DEVCONF_ALL_RO(dev_net(in_dev->dev), FORCE_IGMP_VERSION) == 2 || \
	 IN_DEV_CONF_GET((in_dev), FORCE_IGMP_VERSION) == 2 || \
	 ((in_dev)->mr_v2_seen && \
	  time_before(jiffies, (in_dev)->mr_v2_seen)))
+1 −1
Original line number Diff line number Diff line
@@ -395,7 +395,7 @@ static int snmp_seq_show_ipstats(struct seq_file *seq, void *v)
		seq_printf(seq, " %s", snmp4_ipstats_list[i].name);

	seq_printf(seq, "\nIp: %d %d",
		   IPV4_DEVCONF_ALL(net, FORWARDING) ? 1 : 2,
		   IPV4_DEVCONF_ALL_RO(net, FORWARDING) ? 1 : 2,
		   READ_ONCE(net->ipv4.sysctl_ip_default_ttl));

	BUILD_BUG_ON(offsetof(struct ipstats_mib, mibs) != 0);
+2 −2
Original line number Diff line number Diff line
@@ -2313,7 +2313,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
		if (IN_DEV_BFORWARD(in_dev))
			goto make_route;
		/* not do cache if bc_forwarding is enabled */
		if (IPV4_DEVCONF_ALL(net, BC_FORWARDING))
		if (IPV4_DEVCONF_ALL_RO(net, BC_FORWARDING))
			do_cache = false;
		goto brd_input;
	}
@@ -2993,7 +2993,7 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
#ifdef CONFIG_IP_MROUTE
			if (ipv4_is_multicast(dst) &&
			    !ipv4_is_local_multicast(dst) &&
			    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
			    IPV4_DEVCONF_ALL_RO(net, MC_FORWARDING)) {
				int err = ipmr_get_route(net, skb,
							 fl4->saddr, fl4->daddr,
							 r, portid);