Commit b0e8cb1e authored by Paolo Abeni
Browse files

Merge branch 'ipv6-no-rtnl-for-ipv6-routing-table'

Kuniyuki Iwashima says:

====================
ipv6: No RTNL for IPv6 routing table.

IPv6 routing tables are protected by each table's lock and work in
the interrupt context, which means we basically don't need RTNL to
modify an IPv6 routing table itself.

Currently, the control paths require RTNL because we may need to
perform device and nexthop lookups; we must prevent dev/nexthop from
going away from the netns.

This, however, can be achieved by RCU as well.

If we are in the RCU critical section while adding an IPv6 route,
synchronize_net() in __dev_change_net_namespace() and
unregister_netdevice_many_notify() guarantee that the dev will not be
moved to another netns or removed.

Also, nexthop is guaranteed not to be freed during the RCU grace period.

If we take care of the race between nexthop removal and IPv6 route
addition, we can get rid of RTNL from the control paths.

Patch 1 moves a validation for RTA_MULTIPATH earlier.
Patch 2 removes RTNL for SIOCDELRT and RTM_DELROUTE.
Patches 3 ~ 11 move validation and memory allocation earlier.
Patch 12 prevents a race between two requests for the same table.
Patches 13 & 14 prevent the nexthop race mentioned above.
Patch 15 removes RTNL for SIOCADDRT and RTM_NEWROUTE.

Test:

The script [0] lets each CPU-X create 100000 routes on table-X in a
batch.

On c7a.metal-48xl EC2 instance with 192 CPUs,

without this series:

  $ sudo ./route_test.sh
  start adding routes
  added 19200000 routes (100000 routes * 192 tables).
  total routes: 19200006
  Time elapsed: 191577 milliseconds.

with this series:

  $ sudo ./route_test.sh
  start adding routes
  added 19200000 routes (100000 routes * 192 tables).
  total routes: 19200006
  Time elapsed: 62854 milliseconds.

I changed the number of routes (1000 ~ 100000 per CPU/table) and
consistently saw it finish 3x faster with this series.

[0]

# Benchmark: add the same set of IPv6 routes to one routing table per CPU,
# in parallel, inside a throwaway network namespace, and report the
# wall-clock time taken by the insertions.

# -p so that a directory left over from an interrupted run is not an error.
mkdir -p tmp

NS="test"
ip netns add "$NS"
ip -n "$NS" link add veth0 type veth peer veth1
ip -n "$NS" link set veth0 up
ip -n "$NS" link set veth1 up

# One routing table ID per online CPU (1..nproc).
TABLES=()
for i in $(seq $(nproc)); do
    TABLES+=("$i")
done

# 100 * 1000 = 100000 distinct /64 prefixes, reused for every table.
ROUTES=()
for i in {1..100}; do
    for j in {1..1000}; do
        ROUTES+=("2001:$i:$j::/64")
    done
done

# Generate one "ip -batch" input file per table, each in its own background
# subshell.  This happens before the timer starts.
for TABLE in "${TABLES[@]}"; do
    (
        FILE="./tmp/batch-table-$TABLE.txt"
        # Redirect the whole loop so the file is opened (and truncated) once
        # instead of being reopened for every route.
        for ROUTE in "${ROUTES[@]}"; do
            echo "route add $ROUTE dev veth0 table $TABLE"
        done > "$FILE"
    ) &
done

wait

echo "start adding routes"

# Timed section: one concurrent "ip -batch" process per table.
START_TIME=$(date +%s%3N)
for TABLE in "${TABLES[@]}"; do
    ip -n "$NS" -6 -batch "./tmp/batch-table-$TABLE.txt" &
done

wait
END_TIME=$(date +%s%3N)
ELAPSED_TIME=$((END_TIME - START_TIME))

echo "added $((${#ROUTES[@]} * ${#TABLES[@]})) routes (${#ROUTES[@]} routes * ${#TABLES[@]} tables)."
echo "total routes: $(ip -n "$NS" -6 route show table all | wc -l)"  # Just for debug
echo "Time elapsed: ${ELAPSED_TIME} milliseconds."

# Clean up the namespace and the generated batch files.
ip netns del "$NS"
rm -fr ./tmp/

v2: https://lore.kernel.org/netdev/20250409011243.26195-1-kuniyu@amazon.com/
v1: https://lore.kernel.org/netdev/20250321040131.21057-1-kuniyu@amazon.com/
====================

Link: https://patch.msgid.link/20250418000443.43734-1-kuniyu@amazon.com


Signed-off-by: Paolo Abeni <pabeni@redhat.com>
parents abcec3ed 169fd627
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -198,6 +198,7 @@ struct fib6_info {
					fib6_destroying:1,
					unused:4;

	struct list_head		purge_link;
	struct rcu_head			rcu;
	struct nexthop			*nh;
	struct fib6_nh			fib6_nh[];
+1 −0
Original line number Diff line number Diff line
@@ -72,6 +72,7 @@ struct netns_ipv6 {
	struct rt6_statistics   *rt6_stats;
	struct timer_list       ip6_fib_timer;
	struct hlist_head       *fib_table_hash;
	spinlock_t		fib_table_hash_lock;
	struct fib6_table       *fib6_main_tbl;
	struct list_head	fib6_walkers;
	rwlock_t		fib6_walker_lock;
+2 −0
Original line number Diff line number Diff line
@@ -152,6 +152,8 @@ struct nexthop {
	u8			protocol;   /* app managing this nh */
	u8			nh_flags;
	bool			is_group;
	bool			dead;
	spinlock_t		lock;       /* protect dead and f6i_list */

	refcount_t		refcnt;
	struct rcu_head		rcu;
+6 −4
Original line number Diff line number Diff line
@@ -617,10 +617,12 @@ int fib_nh_common_init(struct net *net, struct fib_nh_common *nhc,
{
	int err;

	if (!nhc->nhc_pcpu_rth_output) {
		nhc->nhc_pcpu_rth_output = alloc_percpu_gfp(struct rtable __rcu *,
							    gfp_flags);
		if (!nhc->nhc_pcpu_rth_output)
			return -ENOMEM;
	}

	if (encap) {
		struct lwtunnel_state *lwtstate;
+17 −5
Original line number Diff line number Diff line
@@ -541,6 +541,7 @@ static struct nexthop *nexthop_alloc(void)
		INIT_LIST_HEAD(&nh->f6i_list);
		INIT_LIST_HEAD(&nh->grp_list);
		INIT_LIST_HEAD(&nh->fdb_list);
		spin_lock_init(&nh->lock);
	}
	return nh;
}
@@ -1555,12 +1556,12 @@ int fib6_check_nexthop(struct nexthop *nh, struct fib6_config *cfg,
	if (nh->is_group) {
		struct nh_group *nhg;

		nhg = rtnl_dereference(nh->nh_grp);
		nhg = rcu_dereference_rtnl(nh->nh_grp);
		if (nhg->has_v4)
			goto no_v4_nh;
		is_fdb_nh = nhg->fdb_nh;
	} else {
		nhi = rtnl_dereference(nh->nh_info);
		nhi = rcu_dereference_rtnl(nh->nh_info);
		if (nhi->family == AF_INET)
			goto no_v4_nh;
		is_fdb_nh = nhi->fdb_nh;
@@ -2118,7 +2119,7 @@ static void remove_nexthop_group(struct nexthop *nh, struct nl_info *nlinfo)
/* not called for nexthop replace */
static void __remove_nexthop_fib(struct net *net, struct nexthop *nh)
{
	struct fib6_info *f6i, *tmp;
	struct fib6_info *f6i;
	bool do_flush = false;
	struct fib_info *fi;

@@ -2129,13 +2130,24 @@ static void __remove_nexthop_fib(struct net *net, struct nexthop *nh)
	if (do_flush)
		fib_flush(net);

	/* ip6_del_rt removes the entry from this list hence the _safe */
	list_for_each_entry_safe(f6i, tmp, &nh->f6i_list, nh_list) {
	spin_lock_bh(&nh->lock);

	nh->dead = true;

	while (!list_empty(&nh->f6i_list)) {
		f6i = list_first_entry(&nh->f6i_list, typeof(*f6i), nh_list);

		/* __ip6_del_rt does a release, so do a hold here */
		fib6_info_hold(f6i);

		spin_unlock_bh(&nh->lock);
		ipv6_stub->ip6_del_rt(net, f6i,
				      !READ_ONCE(net->ipv4.sysctl_nexthop_compat_mode));

		spin_lock_bh(&nh->lock);
	}

	spin_unlock_bh(&nh->lock);
}

static void __remove_nexthop(struct net *net, struct nexthop *nh,
Loading