Merge branch 'net-fix-lwtunnel-reentry-loops' (f31b6fbf) · Commits · git / linux-nf

net/core/lwtunnel.c

+53 −12

Original line number	Diff line number	Diff line
		@@ -23,6 +23,8 @@
		#include <net/ip6_fib.h>
		#include <net/rtnh.h>

		#include "dev.h"

		DEFINE_STATIC_KEY_FALSE(nf_hooks_lwtunnel_enabled);
		EXPORT_SYMBOL_GPL(nf_hooks_lwtunnel_enabled);

		@@ -325,13 +327,23 @@ EXPORT_SYMBOL_GPL(lwtunnel_cmp_encap);

		int lwtunnel_output(struct net net, struct sock sk, struct sk_buff *skb)
		{
		struct dst_entry *dst = skb_dst(skb);
		const struct lwtunnel_encap_ops *ops;
		struct lwtunnel_state *lwtstate;
		int ret = -EINVAL;
		struct dst_entry *dst;
		int ret;

		if (dev_xmit_recursion()) {
		net_crit_ratelimited("%s(): recursion limit reached on datapath\n",
		__func__);
		ret = -ENETDOWN;
		goto drop;
		}

		if (!dst)
		dst = skb_dst(skb);
		if (!dst) {
		ret = -EINVAL;
		goto drop;
		}
		lwtstate = dst->lwtstate;

		if (lwtstate->type == LWTUNNEL_ENCAP_NONE \|\|
		@@ -341,8 +353,11 @@ int lwtunnel_output(struct net net, struct sock sk, struct sk_buff *skb)
		ret = -EOPNOTSUPP;
		rcu_read_lock();
		ops = rcu_dereference(lwtun_encaps[lwtstate->type]);
		if (likely(ops && ops->output))
		if (likely(ops && ops->output)) {
		dev_xmit_recursion_inc();
		ret = ops->output(net, sk, skb);
		dev_xmit_recursion_dec();
		}
		rcu_read_unlock();

		if (ret == -EOPNOTSUPP)
		@@ -359,13 +374,23 @@ EXPORT_SYMBOL_GPL(lwtunnel_output);

		int lwtunnel_xmit(struct sk_buff *skb)
		{
		struct dst_entry *dst = skb_dst(skb);
		const struct lwtunnel_encap_ops *ops;
		struct lwtunnel_state *lwtstate;
		int ret = -EINVAL;
		struct dst_entry *dst;
		int ret;

		if (dev_xmit_recursion()) {
		net_crit_ratelimited("%s(): recursion limit reached on datapath\n",
		__func__);
		ret = -ENETDOWN;
		goto drop;
		}

		if (!dst)
		dst = skb_dst(skb);
		if (!dst) {
		ret = -EINVAL;
		goto drop;
		}

		lwtstate = dst->lwtstate;

		@@ -376,8 +401,11 @@ int lwtunnel_xmit(struct sk_buff *skb)
		ret = -EOPNOTSUPP;
		rcu_read_lock();
		ops = rcu_dereference(lwtun_encaps[lwtstate->type]);
		if (likely(ops && ops->xmit))
		if (likely(ops && ops->xmit)) {
		dev_xmit_recursion_inc();
		ret = ops->xmit(skb);
		dev_xmit_recursion_dec();
		}
		rcu_read_unlock();

		if (ret == -EOPNOTSUPP)
		@@ -394,13 +422,23 @@ EXPORT_SYMBOL_GPL(lwtunnel_xmit);

		int lwtunnel_input(struct sk_buff *skb)
		{
		struct dst_entry *dst = skb_dst(skb);
		const struct lwtunnel_encap_ops *ops;
		struct lwtunnel_state *lwtstate;
		int ret = -EINVAL;
		struct dst_entry *dst;
		int ret;

		if (!dst)
		if (dev_xmit_recursion()) {
		net_crit_ratelimited("%s(): recursion limit reached on datapath\n",
		__func__);
		ret = -ENETDOWN;
		goto drop;
		}

		dst = skb_dst(skb);
		if (!dst) {
		ret = -EINVAL;
		goto drop;
		}
		lwtstate = dst->lwtstate;

		if (lwtstate->type == LWTUNNEL_ENCAP_NONE \|\|
		@@ -410,8 +448,11 @@ int lwtunnel_input(struct sk_buff *skb)
		ret = -EOPNOTSUPP;
		rcu_read_lock();
		ops = rcu_dereference(lwtun_encaps[lwtstate->type]);
		if (likely(ops && ops->input))
		if (likely(ops && ops->input)) {
		dev_xmit_recursion_inc();
		ret = ops->input(skb);
		dev_xmit_recursion_dec();
		}
		rcu_read_unlock();

		if (ret == -EOPNOTSUPP)

net/ipv6/ioam6_iptunnel.c

+4 −4

Original line number	Diff line number	Diff line
		@@ -337,7 +337,6 @@ static int ioam6_do_encap(struct net net, struct sk_buff skb,
		static int ioam6_output(struct net net, struct sock sk, struct sk_buff *skb)
		{
		struct dst_entry dst = skb_dst(skb), cache_dst = NULL;
		struct in6_addr orig_daddr;
		struct ioam6_lwt *ilwt;
		int err = -EINVAL;
		u32 pkt_cnt;
		@@ -352,8 +351,6 @@ static int ioam6_output(struct net net, struct sock sk, struct sk_buff *skb)
		if (pkt_cnt % ilwt->freq.n >= ilwt->freq.k)
		goto out;

		orig_daddr = ipv6_hdr(skb)->daddr;

		local_bh_disable();
		cache_dst = dst_cache_get(&ilwt->cache);
		local_bh_enable();
		@@ -422,7 +419,10 @@ static int ioam6_output(struct net net, struct sock sk, struct sk_buff *skb)
		goto drop;
		}

		if (!ipv6_addr_equal(&orig_daddr, &ipv6_hdr(skb)->daddr)) {
		/* avoid lwtunnel_output() reentry loop when destination is the same
		* after transformation (e.g., with the inline mode)
		*/
		if (dst->lwtstate != cache_dst->lwtstate) {
		skb_dst_drop(skb);
		skb_dst_set(skb, cache_dst);
		return dst_output(net, sk, skb);

tools/testing/selftests/net/Makefile

+1 −0

Original line number	Diff line number	Diff line
		@@ -101,6 +101,7 @@ TEST_PROGS += vlan_bridge_binding.sh
		TEST_PROGS += bpf_offload.py
		TEST_PROGS += ipv6_route_update_soft_lockup.sh
		TEST_PROGS += busy_poll_test.sh
		TEST_PROGS += lwt_dst_cache_ref_loop.sh

		# YNL files, must be before "include ..lib.mk"
		YNL_GEN_FILES := busy_poller netlink-dumps

tools/testing/selftests/net/config

+2 −0

Original line number	Diff line number	Diff line
		@@ -107,3 +107,5 @@ CONFIG_XFRM_INTERFACE=m
		CONFIG_XFRM_USER=m
		CONFIG_IP_NF_MATCH_RPFILTER=m
		CONFIG_IP6_NF_MATCH_RPFILTER=m
		CONFIG_IPV6_ILA=m
		CONFIG_IPV6_RPL_LWTUNNEL=y

tools/testing/selftests/net/lwt_dst_cache_ref_loop.sh

0 → 100755

+246 −0

Original line number	Diff line number	Diff line
		#!/bin/bash
		# SPDX-License-Identifier: GPL-2.0+
		#
		# Author: Justin Iurman <justin.iurman@uliege.be>
		#
		# WARNING
		# -------
		# This is just a dummy script that triggers encap cases with possible dst cache
		# reference loops in affected lwt users (see list below). Some cases are
		# pathological configurations for simplicity, others are valid. Overall, we
		# don't want this issue to happen, no matter what. In order to catch any
		# reference loops, kmemleak MUST be used. The results alone are always blindly
		# successful, don't rely on them. Note that the following tests may crash the
		# kernel if the fix to prevent lwtunnel_{input|output|xmit}() reentry loops is
		# not present.
		#
		# Affected lwt users so far (please update accordingly if needed):
		# - ila_lwt (output only)
		# - ioam6_iptunnel (output only)
		# - rpl_iptunnel (both input and output)
		# - seg6_iptunnel (both input and output)

		source lib.sh

		# check_compatibility - probe which lwt encap types can be configured here.
		#
		# Creates a throwaway namespace holding a veth pair, tries to install one
		# route per encap type (ila, ioam6, rpl, seg6) and then looks for each of
		# them in the namespace's IPv6 routing table.
		#
		# Globals written:
		#   ila_lsmod   status of the "is ila already listed by lsmod?" check;
		#               non-zero means modprobe was attempted here (cleanup()
		#               reads this to decide whether to unload the module)
		#   skip_ila, skip_ioam6, skip_rpl, skip_seg6
		#               0 when the matching "encap <type>" route was found,
		#               non-zero otherwise (read by the run_*() functions)
		#
		# Exits with $ksft_skip when the namespace or the veth pair cannot be set
		# up. The temporary namespace is passed to cleanup_ns before returning.
		check_compatibility()
		{
			setup_ns tmp_node &>/dev/null
			if [ $? != 0 ]; then
				echo "SKIP: Cannot create netns."
				exit $ksft_skip
			fi

			# Accumulate the statuses of the three link commands; any failure
			# leaves ret non-zero.
			ip link add name veth0 netns $tmp_node type veth \
				peer name veth1 netns $tmp_node &>/dev/null
			local ret=$?

			ip -netns $tmp_node link set veth0 up &>/dev/null
			ret=$((ret + $?))

			ip -netns $tmp_node link set veth1 up &>/dev/null
			ret=$((ret + $?))

			if [ $ret != 0 ]; then
				echo "SKIP: Cannot configure links."
				cleanup_ns $tmp_node
				exit $ksft_skip
			fi

			# These must be real pipelines: the $? captured on the next line has
			# to be grep's status, not the status of lsmod/ip.
			lsmod 2>/dev/null | grep -q "ila"
			ila_lsmod=$?
			[ $ila_lsmod != 0 ] && modprobe ila &>/dev/null

			# Failures are deliberately ignored here; support is judged below by
			# whether the route actually appears in the table.
			ip -netns $tmp_node route add 2001:db8:1::/64 \
				encap ila 1:2:3:4 csum-mode no-action ident-type luid \
				hook-type output \
				dev veth0 &>/dev/null

			ip -netns $tmp_node route add 2001:db8:2::/64 \
				encap ioam6 trace prealloc type 0x800000 ns 0 size 4 \
				dev veth0 &>/dev/null

			ip -netns $tmp_node route add 2001:db8:3::/64 \
				encap rpl segs 2001:db8:3::1 dev veth0 &>/dev/null

			ip -netns $tmp_node route add 2001:db8:4::/64 \
				encap seg6 mode inline segs 2001:db8:4::1 dev veth0 &>/dev/null

			ip -netns $tmp_node -6 route 2>/dev/null | grep -q "encap ila"
			skip_ila=$?

			ip -netns $tmp_node -6 route 2>/dev/null | grep -q "encap ioam6"
			skip_ioam6=$?

			ip -netns $tmp_node -6 route 2>/dev/null | grep -q "encap rpl"
			skip_rpl=$?

			ip -netns $tmp_node -6 route 2>/dev/null | grep -q "encap seg6"
			skip_seg6=$?

			cleanup_ns $tmp_node
		}

		# setup - build the three-namespace test topology.
		#
		#   alpha (veth0) <-> (veth0) beta (veth1) <-> (veth0) gamma
		#   2001:db8:1::2      ::1:1        ::2:1       2001:db8:2::2
		#
		# beta forwards IPv6 between the two links. A ping from alpha to gamma
		# verifies the topology; on failure the script exits with $ksft_skip.
		# All command output (and most errors) is discarded.
		setup()
		{
		setup_ns alpha beta gamma &>/dev/null

		# alpha <-> beta link
		ip link add name veth-alpha netns $alpha type veth \
		peer name veth-betaL netns $beta &>/dev/null

		# beta <-> gamma link
		ip link add name veth-betaR netns $beta type veth \
		peer name veth-gamma netns $gamma &>/dev/null

		# Rename so each namespace sees veth0 (and veth1 for beta's second link).
		ip -netns $alpha link set veth-alpha name veth0 &>/dev/null
		ip -netns $beta link set veth-betaL name veth0 &>/dev/null
		ip -netns $beta link set veth-betaR name veth1 &>/dev/null
		ip -netns $gamma link set veth-gamma name veth0 &>/dev/null

		# alpha: one address, and a route to gamma's prefix through beta.
		ip -netns $alpha addr add 2001:db8:1::2/64 dev veth0 &>/dev/null
		ip -netns $alpha link set veth0 up &>/dev/null
		ip -netns $alpha link set lo up &>/dev/null
		ip -netns $alpha route add 2001:db8:2::/64 \
		via 2001:db8:1::1 dev veth0 &>/dev/null

		# beta: one address per link, forwarding enabled.
		ip -netns $beta addr add 2001:db8:1::1/64 dev veth0 &>/dev/null
		ip -netns $beta addr add 2001:db8:2::1/64 dev veth1 &>/dev/null
		ip -netns $beta link set veth0 up &>/dev/null
		ip -netns $beta link set veth1 up &>/dev/null
		ip -netns $beta link set lo up &>/dev/null
		# Replace the existing 2001:db8:2::/64 route with an explicitly added one.
		# NOTE(review): presumably so the later "route change"/"route del" calls
		# in run_*() act on a route this script created -- confirm.
		ip -netns $beta route del 2001:db8:2::/64
		ip -netns $beta route add 2001:db8:2::/64 dev veth1
		ip netns exec $beta \
		sysctl -wq net.ipv6.conf.all.forwarding=1 &>/dev/null

		# gamma: one address, and a route back to alpha's prefix through beta.
		ip -netns $gamma addr add 2001:db8:2::2/64 dev veth0 &>/dev/null
		ip -netns $gamma link set veth0 up &>/dev/null
		ip -netns $gamma link set lo up &>/dev/null
		ip -netns $gamma route add 2001:db8:1::/64 \
		via 2001:db8:2::1 dev veth0 &>/dev/null

		# NOTE(review): presumably gives addresses/links time to settle -- confirm.
		sleep 1

		# End-to-end sanity check (alpha -> gamma) before running any test.
		ip netns exec $alpha ping6 -c 5 -W 1 2001:db8:2::2 &>/dev/null
		if [ $? != 0 ]; then
		echo "SKIP: Setup failed."
		exit $ksft_skip
		fi

		sleep 1
		}

		# cleanup - EXIT trap handler.
		#
		# Hands the alpha/beta/gamma namespaces to cleanup_ns and, when
		# check_compatibility() recorded that ila was not listed by lsmod
		# ($ila_lsmod non-zero), tries to unload the ila module again.
		cleanup()
		{
		cleanup_ns $alpha $beta $gamma
		# Only unload ila if it was not already present before this script ran.
		[ $ila_lsmod != 0 ] && modprobe -r ila &>/dev/null
		}

		# run_ila - trigger the ila encap on beta's output path.
		#
		# Temporarily replaces beta's plain 2001:db8:2::/64 route with a /128 ila
		# route, pings 2001:db8:2::2 from beta, then puts the plain route back.
		# Prints a SKIP line and returns early when $skip_ila is non-zero. The
		# ping result is not checked.
		run_ila()
		{
			local plain_prefix="2001:db8:2::/64"
			local ila_prefix="2001:db8:2:0:0:0:0:2/128"

			[ $skip_ila != 0 ] && {
				echo "SKIP: ila (output)"
				return
			}

			# Swap the plain route for the ila one.
			ip -netns $beta route del "$plain_prefix"
			ip -netns $beta route add "$ila_prefix" \
				encap ila 2001:db8:2:0 csum-mode no-action ident-type luid \
				hook-type output \
				dev veth1 &>/dev/null
			sleep 1

			echo "TEST: ila (output)"
			ip netns exec $beta ping6 -c 2 -W 1 2001:db8:2::2 &>/dev/null
			sleep 1

			# Restore the plain route for the tests that follow.
			ip -netns $beta route del "$ila_prefix"
			ip -netns $beta route add "$plain_prefix" dev veth1
			sleep 1
		}

		# run_ioam6 - trigger the ioam6 encap on beta's output path.
		#
		# Changes beta's 2001:db8:2::/64 route to an ioam6 trace encap and pings
		# 2001:db8:2::2 from beta. Prints a SKIP line and returns early when
		# $skip_ioam6 is non-zero. The ping result is not checked.
		run_ioam6()
		{
			local target="2001:db8:2::2"

			[ $skip_ioam6 != 0 ] && {
				echo "SKIP: ioam6 (output)"
				return
			}

			ip -netns $beta route change 2001:db8:2::/64 \
				encap ioam6 trace prealloc type 0x800000 ns 1 size 4 \
				dev veth1 &>/dev/null
			sleep 1

			echo "TEST: ioam6 (output)"
			ip netns exec $beta ping6 -c 2 -W 1 "$target" &>/dev/null
			sleep 1
		}

		# run_rpl - trigger the rpl encap on beta's input and output paths.
		#
		# Changes beta's 2001:db8:2::/64 route to an rpl encap, then pings
		# 2001:db8:2::2 first from alpha (traffic enters beta: input) and then
		# from beta itself (output). Prints both SKIP lines and returns early when
		# $skip_rpl is non-zero. Ping results are not checked.
		run_rpl()
		{
			local target="2001:db8:2::2"

			[ $skip_rpl != 0 ] && {
				echo "SKIP: rpl (input)"
				echo "SKIP: rpl (output)"
				return
			}

			ip -netns $beta route change 2001:db8:2::/64 \
				encap rpl segs 2001:db8:2::2 \
				dev veth1 &>/dev/null
			sleep 1

			echo "TEST: rpl (input)"
			ip netns exec $alpha ping6 -c 2 -W 1 "$target" &>/dev/null
			sleep 1

			echo "TEST: rpl (output)"
			ip netns exec $beta ping6 -c 2 -W 1 "$target" &>/dev/null
			sleep 1
		}

		# run_seg6 - trigger the seg6 (inline mode) encap on beta's input and
		# output paths.
		#
		# Changes beta's 2001:db8:2::/64 route to a seg6 inline encap, then pings
		# 2001:db8:2::2 first from alpha (input) and then from beta (output).
		# Prints both SKIP lines and returns early when $skip_seg6 is non-zero.
		# Ping results are not checked.
		run_seg6()
		{
			local target="2001:db8:2::2"

			[ $skip_seg6 != 0 ] && {
				echo "SKIP: seg6 (input)"
				echo "SKIP: seg6 (output)"
				return
			}

			ip -netns $beta route change 2001:db8:2::/64 \
				encap seg6 mode inline segs 2001:db8:2::2 \
				dev veth1 &>/dev/null
			sleep 1

			echo "TEST: seg6 (input)"
			ip netns exec $alpha ping6 -c 2 -W 1 "$target" &>/dev/null
			sleep 1

			echo "TEST: seg6 (output)"
			ip netns exec $beta ping6 -c 2 -W 1 "$target" &>/dev/null
			sleep 1
		}

		# run - execute every encap test case, in the fixed order
		# ila, ioam6, rpl, seg6.
		run()
		{
			local lwt

			for lwt in ila ioam6 rpl seg6; do
				run_$lwt
			done
		}

		# --- Script entry point -------------------------------------------------
		# Preconditions: must run as root and have an executable "ip" on PATH;
		# otherwise exit with $ksft_skip.
		if [ "$(id -u)" -ne 0 ]; then
		echo "SKIP: Need root privileges."
		exit $ksft_skip
		fi

		if [ ! -x "$(command -v ip)" ]; then
		echo "SKIP: Could not run test without ip tool."
		exit $ksft_skip
		fi

		# Sets skip_* and ila_lsmod; may exit with $ksft_skip on its own.
		check_compatibility

		# Installed only after check_compatibility, which is what sets the
		# ila_lsmod value that cleanup() reads.
		trap cleanup EXIT

		setup
		run

		# Always reports pass: the run_*() helpers do not check their ping
		# results (the header comment says kmemleak must be used to catch leaks).
		exit $ksft_pass