Commit f31b6fbf authored by Paolo Abeni
Browse files

Merge branch 'net-fix-lwtunnel-reentry-loops'

Justin Iurman says:

====================
net: fix lwtunnel reentry loops

When the destination is the same after the transformation, we enter a
lwtunnel loop. This is true for most of lwt users: ioam6, rpl, seg6,
seg6_local, ila_lwt, and lwt_bpf. It can happen in their input() and
output() handlers respectively, where either dst_input() or dst_output()
is called at the end. It can also happen in xmit() handlers.

Here is an example for rpl_input():

dump_stack_lvl+0x60/0x80
rpl_input+0x9d/0x320
lwtunnel_input+0x64/0xa0
lwtunnel_input+0x64/0xa0
lwtunnel_input+0x64/0xa0
lwtunnel_input+0x64/0xa0
lwtunnel_input+0x64/0xa0
[...]
lwtunnel_input+0x64/0xa0
lwtunnel_input+0x64/0xa0
lwtunnel_input+0x64/0xa0
lwtunnel_input+0x64/0xa0
lwtunnel_input+0x64/0xa0
ip6_sublist_rcv_finish+0x85/0x90
ip6_sublist_rcv+0x236/0x2f0

... until rpl_do_srh() fails, which means skb_cow_head() failed.

This series provides a fix at the core level of lwtunnel to catch such
loops when they're not caught by the respective lwtunnel users, and
handle the loop case in ioam6 which is one of the users. This series
also comes with a new selftest to detect some dst cache reference loops
in lwtunnel users.
====================

Link: https://patch.msgid.link/20250314120048.12569-1-justin.iurman@uliege.be


Signed-off-by: Paolo Abeni <pabeni@redhat.com>
parents 47a9b5e5 3ed61b89
Loading
Loading
Loading
Loading
+53 −12
Original line number Diff line number Diff line
@@ -23,6 +23,8 @@
#include <net/ip6_fib.h>
#include <net/rtnh.h>

#include "dev.h"

DEFINE_STATIC_KEY_FALSE(nf_hooks_lwtunnel_enabled);
EXPORT_SYMBOL_GPL(nf_hooks_lwtunnel_enabled);

@@ -325,13 +327,23 @@ EXPORT_SYMBOL_GPL(lwtunnel_cmp_encap);

int lwtunnel_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	const struct lwtunnel_encap_ops *ops;
	struct lwtunnel_state *lwtstate;
	int ret = -EINVAL;
	struct dst_entry *dst;
	int ret;

	if (dev_xmit_recursion()) {
		net_crit_ratelimited("%s(): recursion limit reached on datapath\n",
				     __func__);
		ret = -ENETDOWN;
		goto drop;
	}

	if (!dst)
	dst = skb_dst(skb);
	if (!dst) {
		ret = -EINVAL;
		goto drop;
	}
	lwtstate = dst->lwtstate;

	if (lwtstate->type == LWTUNNEL_ENCAP_NONE ||
@@ -341,8 +353,11 @@ int lwtunnel_output(struct net *net, struct sock *sk, struct sk_buff *skb)
	ret = -EOPNOTSUPP;
	rcu_read_lock();
	ops = rcu_dereference(lwtun_encaps[lwtstate->type]);
	if (likely(ops && ops->output))
	if (likely(ops && ops->output)) {
		dev_xmit_recursion_inc();
		ret = ops->output(net, sk, skb);
		dev_xmit_recursion_dec();
	}
	rcu_read_unlock();

	if (ret == -EOPNOTSUPP)
@@ -359,13 +374,23 @@ EXPORT_SYMBOL_GPL(lwtunnel_output);

int lwtunnel_xmit(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	const struct lwtunnel_encap_ops *ops;
	struct lwtunnel_state *lwtstate;
	int ret = -EINVAL;
	struct dst_entry *dst;
	int ret;

	if (dev_xmit_recursion()) {
		net_crit_ratelimited("%s(): recursion limit reached on datapath\n",
				     __func__);
		ret = -ENETDOWN;
		goto drop;
	}

	if (!dst)
	dst = skb_dst(skb);
	if (!dst) {
		ret = -EINVAL;
		goto drop;
	}

	lwtstate = dst->lwtstate;

@@ -376,8 +401,11 @@ int lwtunnel_xmit(struct sk_buff *skb)
	ret = -EOPNOTSUPP;
	rcu_read_lock();
	ops = rcu_dereference(lwtun_encaps[lwtstate->type]);
	if (likely(ops && ops->xmit))
	if (likely(ops && ops->xmit)) {
		dev_xmit_recursion_inc();
		ret = ops->xmit(skb);
		dev_xmit_recursion_dec();
	}
	rcu_read_unlock();

	if (ret == -EOPNOTSUPP)
@@ -394,13 +422,23 @@ EXPORT_SYMBOL_GPL(lwtunnel_xmit);

int lwtunnel_input(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	const struct lwtunnel_encap_ops *ops;
	struct lwtunnel_state *lwtstate;
	int ret = -EINVAL;
	struct dst_entry *dst;
	int ret;

	if (!dst)
	if (dev_xmit_recursion()) {
		net_crit_ratelimited("%s(): recursion limit reached on datapath\n",
				     __func__);
		ret = -ENETDOWN;
		goto drop;
	}

	dst = skb_dst(skb);
	if (!dst) {
		ret = -EINVAL;
		goto drop;
	}
	lwtstate = dst->lwtstate;

	if (lwtstate->type == LWTUNNEL_ENCAP_NONE ||
@@ -410,8 +448,11 @@ int lwtunnel_input(struct sk_buff *skb)
	ret = -EOPNOTSUPP;
	rcu_read_lock();
	ops = rcu_dereference(lwtun_encaps[lwtstate->type]);
	if (likely(ops && ops->input))
	if (likely(ops && ops->input)) {
		dev_xmit_recursion_inc();
		ret = ops->input(skb);
		dev_xmit_recursion_dec();
	}
	rcu_read_unlock();

	if (ret == -EOPNOTSUPP)
+4 −4
Original line number Diff line number Diff line
@@ -337,7 +337,6 @@ static int ioam6_do_encap(struct net *net, struct sk_buff *skb,
static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb), *cache_dst = NULL;
	struct in6_addr orig_daddr;
	struct ioam6_lwt *ilwt;
	int err = -EINVAL;
	u32 pkt_cnt;
@@ -352,8 +351,6 @@ static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
	if (pkt_cnt % ilwt->freq.n >= ilwt->freq.k)
		goto out;

	orig_daddr = ipv6_hdr(skb)->daddr;

	local_bh_disable();
	cache_dst = dst_cache_get(&ilwt->cache);
	local_bh_enable();
@@ -422,7 +419,10 @@ static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
			goto drop;
	}

	if (!ipv6_addr_equal(&orig_daddr, &ipv6_hdr(skb)->daddr)) {
	/* avoid lwtunnel_output() reentry loop when destination is the same
	 * after transformation (e.g., with the inline mode)
	 */
	if (dst->lwtstate != cache_dst->lwtstate) {
		skb_dst_drop(skb);
		skb_dst_set(skb, cache_dst);
		return dst_output(net, sk, skb);
+1 −0
Original line number Diff line number Diff line
@@ -101,6 +101,7 @@ TEST_PROGS += vlan_bridge_binding.sh
TEST_PROGS += bpf_offload.py
TEST_PROGS += ipv6_route_update_soft_lockup.sh
TEST_PROGS += busy_poll_test.sh
TEST_PROGS += lwt_dst_cache_ref_loop.sh

# YNL files, must be before "include ..lib.mk"
YNL_GEN_FILES := busy_poller netlink-dumps
+2 −0
Original line number Diff line number Diff line
@@ -107,3 +107,5 @@ CONFIG_XFRM_INTERFACE=m
CONFIG_XFRM_USER=m
CONFIG_IP_NF_MATCH_RPFILTER=m
CONFIG_IP6_NF_MATCH_RPFILTER=m
CONFIG_IPV6_ILA=m
CONFIG_IPV6_RPL_LWTUNNEL=y
+246 −0
Original line number Diff line number Diff line
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0+
#
# Author: Justin Iurman <justin.iurman@uliege.be>
#
# WARNING
# -------
# This is just a dummy script that triggers encap cases with possible dst cache
# reference loops in affected lwt users (see list below). Some cases are
# pathological configurations for simplicity, others are valid. Overall, we
# don't want this issue to happen, no matter what. In order to catch any
# reference loops, kmemleak MUST be used. The results alone are always blindly
# successful, don't rely on them. Note that the following tests may crash the
# kernel if the fix to prevent lwtunnel_{input|output|xmit}() reentry loops is
# not present.
#
# Affected lwt users so far (please update accordingly if needed):
#  - ila_lwt (output only)
#  - ioam6_iptunnel (output only)
#  - rpl_iptunnel (both input and output)
#  - seg6_iptunnel (both input and output)

source lib.sh

# Probe which lwtunnel encap types can be configured on this system.
#
# A throwaway netns with a veth pair is created, one route per encap type
# (ila, ioam6, rpl, seg6) is installed, and the routing table is inspected.
#
# Globals set:
#   ila_lsmod   0 if the "ila" module was already listed by lsmod, non-zero
#               otherwise (cleanup() unloads the module only when non-zero)
#   skip_ila, skip_ioam6, skip_rpl, skip_seg6
#               0 if the matching encap route was installed, non-zero if not
#
# Exits with $ksft_skip when the netns or the veth links cannot be set up.
check_compatibility()
{
	if ! setup_ns tmp_node &>/dev/null; then
		echo "SKIP: Cannot create netns."
		exit $ksft_skip
	fi

	ip link add name veth0 netns "$tmp_node" type veth \
		peer name veth1 netns "$tmp_node" &>/dev/null
	local ret=$?

	ip -netns "$tmp_node" link set veth0 up &>/dev/null
	ret=$((ret + $?))

	ip -netns "$tmp_node" link set veth1 up &>/dev/null
	ret=$((ret + $?))

	if [ $ret != 0 ]; then
		echo "SKIP: Cannot configure links."
		cleanup_ns "$tmp_node"
		exit $ksft_skip
	fi

	# Match the module-name column exactly: a bare substring match on "ila"
	# would also hit unrelated modules whose name merely contains "ila" and
	# wrongly suppress the modprobe below.
	lsmod 2>/dev/null | grep -q '^ila[[:space:]]'
	ila_lsmod=$?
	[ $ila_lsmod != 0 ] && modprobe ila &>/dev/null

	ip -netns "$tmp_node" route add 2001:db8:1::/64 \
		encap ila 1:2:3:4 csum-mode no-action ident-type luid \
			hook-type output \
		dev veth0 &>/dev/null

	ip -netns "$tmp_node" route add 2001:db8:2::/64 \
		encap ioam6 trace prealloc type 0x800000 ns 0 size 4 \
		dev veth0 &>/dev/null

	ip -netns "$tmp_node" route add 2001:db8:3::/64 \
		encap rpl segs 2001:db8:3::1 dev veth0 &>/dev/null

	ip -netns "$tmp_node" route add 2001:db8:4::/64 \
		encap seg6 mode inline segs 2001:db8:4::1 dev veth0 &>/dev/null

	# List the routes once and test each encap type against that snapshot.
	local routes
	routes=$(ip -netns "$tmp_node" -6 route 2>/dev/null)

	grep -q "encap ila" <<< "$routes"
	skip_ila=$?

	grep -q "encap ioam6" <<< "$routes"
	skip_ioam6=$?

	grep -q "encap rpl" <<< "$routes"
	skip_rpl=$?

	grep -q "encap seg6" <<< "$routes"
	skip_seg6=$?

	cleanup_ns "$tmp_node"
}

# Build the test topology: three namespaces connected in a line,
#   alpha (2001:db8:1::2) <-> beta (2001:db8:1::1 / 2001:db8:2::1) <-> gamma (2001:db8:2::2)
# with beta forwarding IPv6 between the two /64 prefixes. Connectivity is
# verified with a ping from alpha to gamma; on failure the script prints a
# SKIP message and exits with $ksft_skip.
setup()
{
	setup_ns alpha beta gamma &>/dev/null

	# veth pair alpha <-> beta (left side of beta)
	ip link add name veth-alpha netns $alpha type veth \
		peer name veth-betaL netns $beta &>/dev/null

	# veth pair beta (right side) <-> gamma
	ip link add name veth-betaR netns $beta type veth \
		peer name veth-gamma netns $gamma &>/dev/null

	# Rename the interfaces to veth0/veth1 inside each namespace.
	ip -netns $alpha link set veth-alpha name veth0 &>/dev/null
	ip -netns $beta link set veth-betaL name veth0 &>/dev/null
	ip -netns $beta link set veth-betaR name veth1 &>/dev/null
	ip -netns $gamma link set veth-gamma name veth0 &>/dev/null

	# alpha: address in 2001:db8:1::/64, reach 2001:db8:2::/64 via beta.
	ip -netns $alpha addr add 2001:db8:1::2/64 dev veth0 &>/dev/null
	ip -netns $alpha link set veth0 up &>/dev/null
	ip -netns $alpha link set lo up &>/dev/null
	ip -netns $alpha route add 2001:db8:2::/64 \
		via 2001:db8:1::1 dev veth0 &>/dev/null

	# beta: one address per prefix, acts as the router in the middle.
	ip -netns $beta addr add 2001:db8:1::1/64 dev veth0 &>/dev/null
	ip -netns $beta addr add 2001:db8:2::1/64 dev veth1 &>/dev/null
	ip -netns $beta link set veth0 up &>/dev/null
	ip -netns $beta link set veth1 up &>/dev/null
	ip -netns $beta link set lo up &>/dev/null
	# Replace the 2001:db8:2::/64 route with an explicitly added one on
	# veth1. NOTE(review): presumably so the run_*() helpers can later
	# change/delete exactly this route -- confirm.
	ip -netns $beta route del 2001:db8:2::/64
	ip -netns $beta route add 2001:db8:2::/64 dev veth1
	ip netns exec $beta \
		sysctl -wq net.ipv6.conf.all.forwarding=1 &>/dev/null

	# gamma: address in 2001:db8:2::/64, reach 2001:db8:1::/64 via beta.
	ip -netns $gamma addr add 2001:db8:2::2/64 dev veth0 &>/dev/null
	ip -netns $gamma link set veth0 up &>/dev/null
	ip -netns $gamma link set lo up &>/dev/null
	ip -netns $gamma route add 2001:db8:1::/64 \
		via 2001:db8:2::1 dev veth0 &>/dev/null

	sleep 1

	# End-to-end sanity check (alpha -> gamma) before running any test.
	ip netns exec $alpha ping6 -c 5 -W 1 2001:db8:2::2 &>/dev/null
	if [ $? != 0 ]; then
		echo "SKIP: Setup failed."
		exit $ksft_skip
	fi

	sleep 1
}

# Tear down the test namespaces and unload the "ila" module again if lsmod
# did not list it when check_compatibility() ran (ila_lsmod non-zero).
cleanup()
{
	cleanup_ns $alpha $beta $gamma
	test $ila_lsmod != 0 && modprobe -r ila >/dev/null 2>&1
}

# ila, output path: temporarily replace beta's 2001:db8:2::/64 route with an
# ila-encap host route towards gamma, ping gamma from beta, then restore the
# plain prefix route. Prints a SKIP line and returns if ila is unsupported.
run_ila()
{
	local gamma_addr=2001:db8:2::2

	[ $skip_ila != 0 ] && { echo "SKIP: ila (output)"; return; }

	# Swap the plain prefix route for the ila host route.
	ip -netns $beta route del 2001:db8:2::/64
	ip -netns $beta route add 2001:db8:2:0:0:0:0:2/128 \
		encap ila 2001:db8:2:0 csum-mode no-action ident-type luid \
		hook-type output dev veth1 >/dev/null 2>&1
	sleep 1

	echo "TEST: ila (output)"
	ip netns exec $beta ping6 -c 2 -W 1 $gamma_addr >/dev/null 2>&1
	sleep 1

	# Put the original prefix route back for the following tests.
	ip -netns $beta route del 2001:db8:2:0:0:0:0:2/128
	ip -netns $beta route add 2001:db8:2::/64 dev veth1
	sleep 1
}

# ioam6, output path: turn beta's 2001:db8:2::/64 route into an ioam6
# pre-allocated trace encap route and ping gamma from beta. Prints a SKIP
# line and returns if ioam6 is unsupported.
run_ioam6()
{
	local gamma_addr=2001:db8:2::2

	[ $skip_ioam6 != 0 ] && { echo "SKIP: ioam6 (output)"; return; }

	ip -netns $beta route change 2001:db8:2::/64 \
		encap ioam6 trace prealloc type 0x800000 ns 1 size 4 dev veth1 \
		>/dev/null 2>&1
	sleep 1

	echo "TEST: ioam6 (output)"
	ip netns exec $beta ping6 -c 2 -W 1 $gamma_addr >/dev/null 2>&1
	sleep 1
}

# rpl, input and output paths: turn beta's 2001:db8:2::/64 route into an rpl
# encap route whose only segment is gamma, then ping gamma from alpha
# ("input" case) and from beta ("output" case). Prints both SKIP lines and
# returns if rpl is unsupported.
run_rpl()
{
	local gamma_addr=2001:db8:2::2

	[ $skip_rpl != 0 ] && {
		printf '%s\n' "SKIP: rpl (input)" "SKIP: rpl (output)"
		return
	}

	ip -netns $beta route change 2001:db8:2::/64 \
		encap rpl segs 2001:db8:2::2 dev veth1 >/dev/null 2>&1
	sleep 1

	echo "TEST: rpl (input)"
	ip netns exec $alpha ping6 -c 2 -W 1 $gamma_addr >/dev/null 2>&1
	sleep 1

	echo "TEST: rpl (output)"
	ip netns exec $beta ping6 -c 2 -W 1 $gamma_addr >/dev/null 2>&1
	sleep 1
}

# seg6, input and output paths: turn beta's 2001:db8:2::/64 route into an
# inline-mode seg6 encap route whose only segment is gamma, then ping gamma
# from alpha ("input" case) and from beta ("output" case). Prints both SKIP
# lines and returns if seg6 is unsupported.
run_seg6()
{
	local gamma_addr=2001:db8:2::2

	[ $skip_seg6 != 0 ] && {
		printf '%s\n' "SKIP: seg6 (input)" "SKIP: seg6 (output)"
		return
	}

	ip -netns $beta route change 2001:db8:2::/64 \
		encap seg6 mode inline segs 2001:db8:2::2 dev veth1 >/dev/null 2>&1
	sleep 1

	echo "TEST: seg6 (input)"
	ip netns exec $alpha ping6 -c 2 -W 1 $gamma_addr >/dev/null 2>&1
	sleep 1

	echo "TEST: seg6 (output)"
	ip netns exec $beta ping6 -c 2 -W 1 $gamma_addr >/dev/null 2>&1
	sleep 1
}

# Run every per-encap test case in a fixed order: ila, ioam6, rpl, seg6.
run()
{
	local testcase

	for testcase in run_ila run_ioam6 run_rpl run_seg6; do
		$testcase
	done
}

# Preconditions: root privileges and an executable "ip" tool, else skip.
[ "$(id -u)" -ne 0 ] && {
	echo "SKIP: Need root privileges."
	exit $ksft_skip
}

if ! [ -x "$(command -v ip)" ]; then
	echo "SKIP: Could not run test without ip tool."
	exit $ksft_skip
fi

# Detect supported encap types first; the EXIT trap is installed only
# afterwards, so cleanup() always sees ila_lsmod set.
check_compatibility

trap cleanup EXIT

setup
run

# The result is unconditionally "pass"; see the WARNING in the file header.
exit $ksft_pass