Commit 3424291d authored by Jakub Kicinski
Browse files

Merge branch 'ipv4-fib-convert-rtm_newroute-and-rtm_delroute-to-per-netns-rtnl'

Kuniyuki Iwashima says:

====================
ipv4: fib: Convert RTM_NEWROUTE and RTM_DELROUTE to per-netns RTNL.

Patch 1 is misc cleanup.
Patches 2 ~ 8 convert the two fib_info hash tables to per-netns.
Patches 9 ~ 12 convert rtnl_lock() to rtnl_net_lock().

v2: https://lore.kernel.org/20250226192556.21633-1-kuniyu@amazon.com
v1: https://lore.kernel.org/20250225182250.74650-1-kuniyu@amazon.com
====================

Link: https://patch.msgid.link/20250228042328.96624-1-kuniyu@amazon.com


Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parents d110dbf1 1dd2af79
Loading
Loading
Loading
Loading
+2 −0
Original line number Diff line number Diff line
@@ -162,6 +162,8 @@ struct fib_info {
	struct fib_nh		fib_nh[] __counted_by(fib_nhs);
};

int __net_init fib4_semantics_init(struct net *net);
void __net_exit fib4_semantics_exit(struct net *net);

#ifdef CONFIG_IP_MULTIPLE_TABLES
struct fib_rule;
+3 −0
Original line number Diff line number Diff line
@@ -111,6 +111,9 @@ struct netns_ipv4 {
#endif
	struct hlist_head	*fib_table_hash;
	struct sock		*fibnl;
	struct hlist_head	*fib_info_hash;
	unsigned int		fib_info_hash_bits;
	unsigned int		fib_info_cnt;

	struct sock		*mc_autojoin_sk;

+54 −20
Original line number Diff line number Diff line
@@ -553,18 +553,16 @@ static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt,
			const struct in_ifaddr *ifa;
			struct in_device *in_dev;

			in_dev = __in_dev_get_rtnl(dev);
			in_dev = __in_dev_get_rtnl_net(dev);
			if (!in_dev)
				return -ENODEV;

			*colon = ':';

			rcu_read_lock();
			in_dev_for_each_ifa_rcu(ifa, in_dev) {
			in_dev_for_each_ifa_rtnl_net(net, ifa, in_dev) {
				if (strcmp(ifa->ifa_label, devname) == 0)
					break;
			}
			rcu_read_unlock();

			if (!ifa)
				return -ENODEV;
@@ -635,7 +633,7 @@ int ip_rt_ioctl(struct net *net, unsigned int cmd, struct rtentry *rt)
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			return -EPERM;

		rtnl_lock();
		rtnl_net_lock(net);
		err = rtentry_to_fib_config(net, cmd, rt, &cfg);
		if (err == 0) {
			struct fib_table *tb;
@@ -659,7 +657,7 @@ int ip_rt_ioctl(struct net *net, unsigned int cmd, struct rtentry *rt)
			/* allocated by rtentry_to_fib_config() */
			kfree(cfg.fc_mx);
		}
		rtnl_unlock();
		rtnl_net_unlock(net);
		return err;
	}
	return -EINVAL;
@@ -837,19 +835,33 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
		}
	}

	if (cfg->fc_dst_len > 32) {
		NL_SET_ERR_MSG(extack, "Invalid prefix length");
		err = -EINVAL;
		goto errout;
	}

	if (cfg->fc_dst_len < 32 && (ntohl(cfg->fc_dst) << cfg->fc_dst_len)) {
		NL_SET_ERR_MSG(extack, "Invalid prefix for given prefix length");
		err = -EINVAL;
		goto errout;
	}

	if (cfg->fc_nh_id) {
		if (cfg->fc_oif || cfg->fc_gw_family ||
		    cfg->fc_encap || cfg->fc_mp) {
			NL_SET_ERR_MSG(extack,
				       "Nexthop specification and nexthop id are mutually exclusive");
			return -EINVAL;
			err = -EINVAL;
			goto errout;
		}
	}

	if (has_gw && has_via) {
		NL_SET_ERR_MSG(extack,
			       "Nexthop configuration can not contain both GATEWAY and VIA");
		return -EINVAL;
		err = -EINVAL;
		goto errout;
	}

	if (!cfg->fc_table)
@@ -872,20 +884,24 @@ static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
	if (err < 0)
		goto errout;

	rtnl_net_lock(net);

	if (cfg.fc_nh_id && !nexthop_find_by_id(net, cfg.fc_nh_id)) {
		NL_SET_ERR_MSG(extack, "Nexthop id does not exist");
		err = -EINVAL;
		goto errout;
		goto unlock;
	}

	tb = fib_get_table(net, cfg.fc_table);
	if (!tb) {
		NL_SET_ERR_MSG(extack, "FIB table does not exist");
		err = -ESRCH;
		goto errout;
		goto unlock;
	}

	err = fib_table_delete(net, tb, &cfg, extack);
unlock:
	rtnl_net_unlock(net);
errout:
	return err;
}
@@ -902,15 +918,20 @@ static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
	if (err < 0)
		goto errout;

	rtnl_net_lock(net);

	tb = fib_new_table(net, cfg.fc_table);
	if (!tb) {
		err = -ENOBUFS;
		goto errout;
		goto unlock;
	}

	err = fib_table_insert(net, tb, &cfg, extack);
	if (!err && cfg.fc_type == RTN_LOCAL)
		net->ipv4.fib_has_custom_local_routes = true;

unlock:
	rtnl_net_unlock(net);
errout:
	return err;
}
@@ -1450,7 +1471,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,
		fib_sync_up(dev, RTNH_F_DEAD);
#endif
		atomic_inc(&net->ipv4.dev_addr_genid);
		rt_cache_flush(dev_net(dev));
		rt_cache_flush(net);
		break;
	case NETDEV_DOWN:
		fib_del_ifaddr(ifa, NULL);
@@ -1461,7 +1482,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,
			 */
			fib_disable_ip(dev, event, true);
		} else {
			rt_cache_flush(dev_net(dev));
			rt_cache_flush(net);
		}
		break;
	}
@@ -1575,7 +1596,7 @@ static void ip_fib_net_exit(struct net *net)
{
	int i;

	ASSERT_RTNL();
	ASSERT_RTNL_NET(net);
#ifdef CONFIG_IP_MULTIPLE_TABLES
	RCU_INIT_POINTER(net->ipv4.fib_main, NULL);
	RCU_INIT_POINTER(net->ipv4.fib_default, NULL);
@@ -1615,9 +1636,15 @@ static int __net_init fib_net_init(struct net *net)
	error = ip_fib_net_init(net);
	if (error < 0)
		goto out;

	error = fib4_semantics_init(net);
	if (error)
		goto out_semantics;

	error = nl_fib_lookup_init(net);
	if (error < 0)
		goto out_nlfl;

	error = fib_proc_init(net);
	if (error < 0)
		goto out_proc;
@@ -1627,9 +1654,11 @@ static int __net_init fib_net_init(struct net *net)
out_proc:
	nl_fib_lookup_exit(net);
out_nlfl:
	rtnl_lock();
	fib4_semantics_exit(net);
out_semantics:
	rtnl_net_lock(net);
	ip_fib_net_exit(net);
	rtnl_unlock();
	rtnl_net_unlock(net);
	goto out;
}

@@ -1644,10 +1673,15 @@ static void __net_exit fib_net_exit_batch(struct list_head *net_list)
	struct net *net;

	rtnl_lock();
	list_for_each_entry(net, net_list, exit_list)
	list_for_each_entry(net, net_list, exit_list) {
		__rtnl_net_lock(net);
		ip_fib_net_exit(net);

		__rtnl_net_unlock(net);
	}
	rtnl_unlock();

	list_for_each_entry(net, net_list, exit_list)
		fib4_semantics_exit(net);
}

static struct pernet_operations fib_net_ops = {
@@ -1658,9 +1692,9 @@ static struct pernet_operations fib_net_ops = {

static const struct rtnl_msg_handler fib_rtnl_msg_handlers[] __initconst = {
	{.protocol = PF_INET, .msgtype = RTM_NEWROUTE,
	 .doit = inet_rtm_newroute},
	 .doit = inet_rtm_newroute, .flags = RTNL_FLAG_DOIT_PERNET},
	{.protocol = PF_INET, .msgtype = RTM_DELROUTE,
	 .doit = inet_rtm_delroute},
	 .doit = inet_rtm_delroute, .flags = RTNL_FLAG_DOIT_PERNET},
	{.protocol = PF_INET, .msgtype = RTM_GETROUTE, .dumpit = inet_dump_fib,
	 .flags = RTNL_FLAG_DUMP_UNLOCKED | RTNL_FLAG_DUMP_SPLIT_NLM_DONE},
};
+100 −106
Original line number Diff line number Diff line
@@ -50,12 +50,6 @@

#include "fib_lookup.h"

static struct hlist_head *fib_info_hash;
static struct hlist_head *fib_info_laddrhash;
static unsigned int fib_info_hash_size;
static unsigned int fib_info_hash_bits;
static unsigned int fib_info_cnt;

/* for_nexthops and change_nexthops only used when nexthop object
 * is not set in a fib_info. The logic within can reference fib_nh.
 */
@@ -258,8 +252,7 @@ void fib_release_info(struct fib_info *fi)
	ASSERT_RTNL();
	if (fi && refcount_dec_and_test(&fi->fib_treeref)) {
		hlist_del(&fi->fib_hash);

		fib_info_cnt--;
		fi->fib_net->ipv4.fib_info_cnt--;

		if (fi->fib_prefsrc)
			hlist_del(&fi->fib_lhash);
@@ -335,11 +328,12 @@ static unsigned int fib_info_hashfn_1(int init_val, u8 protocol, u8 scope,
static unsigned int fib_info_hashfn_result(const struct net *net,
					   unsigned int val)
{
	return hash_32(val ^ net_hash_mix(net), fib_info_hash_bits);
	return hash_32(val ^ net_hash_mix(net), net->ipv4.fib_info_hash_bits);
}

static inline unsigned int fib_info_hashfn(struct fib_info *fi)
static struct hlist_head *fib_info_hash_bucket(struct fib_info *fi)
{
	struct net *net = fi->fib_net;
	unsigned int val;

	val = fib_info_hashfn_1(fi->fib_nhs, fi->fib_protocol,
@@ -354,7 +348,70 @@ static inline unsigned int fib_info_hashfn(struct fib_info *fi)
		} endfor_nexthops(fi)
	}

	return fib_info_hashfn_result(fi->fib_net, val);
	return &net->ipv4.fib_info_hash[fib_info_hashfn_result(net, val)];
}

/* Pick the prefsrc bucket for address @val in @net's fib_info hash.
 *
 * The per-netns table is a single array; its first (1 << hash_bits)
 * entries are skipped here and the slot is taken from the second half.
 */
static struct hlist_head *fib_info_laddrhash_bucket(const struct net *net,
						    __be32 val)
{
	unsigned int bits = net->ipv4.fib_info_hash_bits;
	u32 mixed = net_hash_mix(net) ^ (__force u32)val;
	struct hlist_head *prefsrc_half;

	prefsrc_half = net->ipv4.fib_info_hash + (1 << bits);

	return prefsrc_half + hash_32(mixed, bits);
}

static struct hlist_head *fib_info_hash_alloc(unsigned int hash_bits)
{
	/* The second half is used for prefsrc */
	return kvcalloc((1 << hash_bits) * 2, sizeof(struct hlist_head *),
			GFP_KERNEL);
}

/* Release a fib_info bucket array; @head is handed to kvfree() as a whole. */
static void fib_info_hash_free(struct hlist_head *head)
{
	kvfree(head);
}

static void fib_info_hash_grow(struct net *net)
{
	unsigned int old_size = 1 << net->ipv4.fib_info_hash_bits;
	struct hlist_head *new_info_hash, *old_info_hash;
	unsigned int i;

	if (net->ipv4.fib_info_cnt < old_size)
		return;

	new_info_hash = fib_info_hash_alloc(net->ipv4.fib_info_hash_bits + 1);
	if (!new_info_hash)
		return;

	old_info_hash = net->ipv4.fib_info_hash;
	net->ipv4.fib_info_hash = new_info_hash;
	net->ipv4.fib_info_hash_bits += 1;

	for (i = 0; i < old_size; i++) {
		struct hlist_head *head = &old_info_hash[i];
		struct hlist_node *n;
		struct fib_info *fi;

		hlist_for_each_entry_safe(fi, n, head, fib_hash)
			hlist_add_head(&fi->fib_hash, fib_info_hash_bucket(fi));
	}

	for (i = 0; i < old_size; i++) {
		struct hlist_head *lhead = &old_info_hash[old_size + i];
		struct hlist_node *n;
		struct fib_info *fi;

		hlist_for_each_entry_safe(fi, n, lhead, fib_lhash)
			hlist_add_head(&fi->fib_lhash,
				       fib_info_laddrhash_bucket(fi->fib_net,
								 fi->fib_prefsrc));
	}

	fib_info_hash_free(old_info_hash);
}

/* no metrics, only nexthop id */
@@ -370,13 +427,12 @@ static struct fib_info *fib_find_info_nh(struct net *net,
				 (__force u32)cfg->fc_prefsrc,
				 cfg->fc_priority);
	hash = fib_info_hashfn_result(net, hash);
	head = &fib_info_hash[hash];
	head = &net->ipv4.fib_info_hash[hash];

	hlist_for_each_entry(fi, head, fib_hash) {
		if (!net_eq(fi->fib_net, net))
			continue;
		if (!fi->nh || fi->nh->id != cfg->fc_nh_id)
			continue;

		if (cfg->fc_protocol == fi->fib_protocol &&
		    cfg->fc_scope == fi->fib_scope &&
		    cfg->fc_prefsrc == fi->fib_prefsrc &&
@@ -392,18 +448,13 @@ static struct fib_info *fib_find_info_nh(struct net *net,

static struct fib_info *fib_find_info(struct fib_info *nfi)
{
	struct hlist_head *head;
	struct hlist_head *head = fib_info_hash_bucket(nfi);
	struct fib_info *fi;
	unsigned int hash;

	hash = fib_info_hashfn(nfi);
	head = &fib_info_hash[hash];

	hlist_for_each_entry(fi, head, fib_hash) {
		if (!net_eq(fi->fib_net, nfi->fib_net))
			continue;
		if (fi->fib_nhs != nfi->fib_nhs)
			continue;

		if (nfi->fib_protocol == fi->fib_protocol &&
		    nfi->fib_scope == fi->fib_scope &&
		    nfi->fib_prefsrc == fi->fib_prefsrc &&
@@ -1239,64 +1290,6 @@ int fib_check_nh(struct net *net, struct fib_nh *nh, u32 table, u8 scope,
	return err;
}

/* Pick the bucket of the global fib_info_laddrhash table for address @val,
 * mixing in the per-netns hash salt of @net.
 */
static struct hlist_head *
fib_info_laddrhash_bucket(const struct net *net, __be32 val)
{
	u32 mixed = net_hash_mix(net) ^ (__force u32)val;

	return fib_info_laddrhash + hash_32(mixed, fib_info_hash_bits);
}

/* Rehash every fib_info from the current global tables into the two
 * caller-supplied tables, make those the global tables, and free the old
 * ones.
 *
 * @new_info_hash: destination table for entries chained through fib_hash
 * @new_laddrhash: destination table for entries chained through fib_lhash
 * @new_size:      number of buckets in each new table; ilog2() of it
 *                 becomes the new hash width
 *
 * Asserts that RTNL is held.
 */
static void fib_info_hash_move(struct hlist_head *new_info_hash,
			       struct hlist_head *new_laddrhash,
			       unsigned int new_size)
{
	struct hlist_head *old_info_hash, *old_laddrhash;
	unsigned int old_size = fib_info_hash_size;
	unsigned int i;

	ASSERT_RTNL();
	old_info_hash = fib_info_hash;
	old_laddrhash = fib_info_laddrhash;
	/* Switch size and width first so hashes computed below are for the
	 * new tables.
	 */
	fib_info_hash_size = new_size;
	fib_info_hash_bits = ilog2(new_size);

	for (i = 0; i < old_size; i++) {
		/* fib_info_hash still points at the old table here. */
		struct hlist_head *head = &fib_info_hash[i];
		struct hlist_node *n;
		struct fib_info *fi;

		hlist_for_each_entry_safe(fi, n, head, fib_hash) {
			struct hlist_head *dest;
			unsigned int new_hash;

			new_hash = fib_info_hashfn(fi);
			dest = &new_info_hash[new_hash];
			hlist_add_head(&fi->fib_hash, dest);
		}
	}
	fib_info_hash = new_info_hash;

	/* Must be switched before the loop below: fib_info_laddrhash_bucket()
	 * indexes the global fib_info_laddrhash, so it has to name the new
	 * table while the old one is walked through old_laddrhash.
	 */
	fib_info_laddrhash = new_laddrhash;
	for (i = 0; i < old_size; i++) {
		struct hlist_head *lhead = &old_laddrhash[i];
		struct hlist_node *n;
		struct fib_info *fi;

		hlist_for_each_entry_safe(fi, n, lhead, fib_lhash) {
			struct hlist_head *ldest;

			ldest = fib_info_laddrhash_bucket(fi->fib_net,
							  fi->fib_prefsrc);
			hlist_add_head(&fi->fib_lhash, ldest);
		}
	}

	kvfree(old_info_hash);
	kvfree(old_laddrhash);
}

__be32 fib_info_update_nhc_saddr(struct net *net, struct fib_nh_common *nhc,
				 unsigned char scope)
{
@@ -1409,32 +1402,14 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
	}
#endif

	err = -ENOBUFS;
	fib_info_hash_grow(net);

	if (fib_info_cnt >= fib_info_hash_size) {
		unsigned int new_size = fib_info_hash_size << 1;
		struct hlist_head *new_info_hash;
		struct hlist_head *new_laddrhash;
		size_t bytes;

		if (!new_size)
			new_size = 16;
		bytes = (size_t)new_size * sizeof(struct hlist_head *);
		new_info_hash = kvzalloc(bytes, GFP_KERNEL);
		new_laddrhash = kvzalloc(bytes, GFP_KERNEL);
		if (!new_info_hash || !new_laddrhash) {
			kvfree(new_info_hash);
			kvfree(new_laddrhash);
		} else {
			fib_info_hash_move(new_info_hash, new_laddrhash, new_size);
		}
		if (!fib_info_hash_size)
	fi = kzalloc(struct_size(fi, fib_nh, nhs), GFP_KERNEL);
	if (!fi) {
		err = -ENOBUFS;
		goto failure;
	}

	fi = kzalloc(struct_size(fi, fib_nh, nhs), GFP_KERNEL);
	if (!fi)
		goto failure;
	fi->fib_metrics = ip_fib_metrics_init(cfg->fc_mx, cfg->fc_mx_len, extack);
	if (IS_ERR(fi->fib_metrics)) {
		err = PTR_ERR(fi->fib_metrics);
@@ -1571,9 +1546,9 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
	refcount_set(&fi->fib_treeref, 1);
	refcount_set(&fi->fib_clntref, 1);

	fib_info_cnt++;
	hlist_add_head(&fi->fib_hash,
		       &fib_info_hash[fib_info_hashfn(fi)]);
	net->ipv4.fib_info_cnt++;
	hlist_add_head(&fi->fib_hash, fib_info_hash_bucket(fi));

	if (fi->fib_prefsrc) {
		struct hlist_head *head;

@@ -1855,7 +1830,7 @@ int fib_sync_down_addr(struct net_device *dev, __be32 local)
	struct fib_info *fi;
	int ret = 0;

	if (!fib_info_laddrhash || local == 0)
	if (!local)
		return 0;

	head = fib_info_laddrhash_bucket(net, local);
@@ -2257,3 +2232,22 @@ void fib_select_path(struct net *net, struct fib_result *res,
			fl4->saddr = inet_select_addr(l3mdev, 0, RT_SCOPE_LINK);
	}
}

/* Per-netns setup of the fib_info hash: allocate the initial bucket array
 * (hash width 4) and reset the entry counter.
 *
 * Returns 0 on success or -ENOMEM if the bucket array cannot be allocated.
 */
int __net_init fib4_semantics_init(struct net *net)
{
	const unsigned int initial_bits = 4;
	struct hlist_head *table = fib_info_hash_alloc(initial_bits);

	/* Stored unconditionally, so the field is NULL after a failure. */
	net->ipv4.fib_info_hash = table;
	if (table == NULL)
		return -ENOMEM;

	net->ipv4.fib_info_cnt = 0;
	net->ipv4.fib_info_hash_bits = initial_bits;

	return 0;
}

/* Per-netns teardown of the fib_info hash: free the bucket array stored in
 * net->ipv4.fib_info_hash.
 */
void __net_exit fib4_semantics_exit(struct net *net)
{
	fib_info_hash_free(net->ipv4.fib_info_hash);
}
+0 −22
Original line number Diff line number Diff line
@@ -1187,22 +1187,6 @@ static int fib_insert_alias(struct trie *t, struct key_vector *tp,
	return 0;
}

/* Check that @plen is a usable prefix length for @key.
 *
 * Fails when @plen exceeds KEYLENGTH, or when @plen is shorter than
 * KEYLENGTH and @key still has bits set once shifted left by @plen.
 * On failure an explanatory message is recorded in @extack.
 *
 * Returns true when the pair is acceptable, false otherwise.
 */
static bool fib_valid_key_len(u32 key, u8 plen, struct netlink_ext_ack *extack)
{
	bool ok = true;

	if (plen > KEYLENGTH) {
		NL_SET_ERR_MSG(extack, "Invalid prefix length");
		ok = false;
	} else if (plen != KEYLENGTH && (key << plen) != 0) {
		NL_SET_ERR_MSG(extack,
			       "Invalid prefix for given prefix length");
		ok = false;
	}

	return ok;
}

static void fib_remove_alias(struct trie *t, struct key_vector *tp,
			     struct key_vector *l, struct fib_alias *old);

@@ -1223,9 +1207,6 @@ int fib_table_insert(struct net *net, struct fib_table *tb,

	key = ntohl(cfg->fc_dst);

	if (!fib_valid_key_len(key, plen, extack))
		return -EINVAL;

	pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);

	fi = fib_create_info(cfg, extack);
@@ -1717,9 +1698,6 @@ int fib_table_delete(struct net *net, struct fib_table *tb,

	key = ntohl(cfg->fc_dst);

	if (!fib_valid_key_len(key, plen, extack))
		return -EINVAL;

	l = fib_find_node(t, &tp, key);
	if (!l)
		return -ESRCH;