Commit 403f3e8f authored by Alexei Starovoitov
Browse files

Merge branch 'add-bpf_xdp_get_xfrm_state-kfunc'

Daniel Xu says:

====================
Add bpf_xdp_get_xfrm_state() kfunc

This patchset adds two kfunc helpers, bpf_xdp_get_xfrm_state() and
bpf_xdp_xfrm_state_release(), that wrap xfrm_state_lookup() and
xfrm_state_put() respectively. The intent is to support software RSS (via XDP) for
the ongoing/upcoming ipsec pcpu work [0]. Recent experiments performed
on (hopefully) reproducible AWS testbeds indicate that single tunnel
pcpu ipsec can reach line rate on 100G ENA nics.

Note this patchset only tests/shows generic xfrm_state access. The
"secret sauce" (if you can really even call it that) involves accessing
a soon-to-be-upstreamed pcpu_num field in xfrm_state. Early example is
available here [1].

[0]: https://datatracker.ietf.org/doc/draft-ietf-ipsecme-multi-sa-performance/03/
[1]: https://github.com/danobi/xdp-tools/blob/e89a1c617aba3b50d990f779357d6ce2863ecb27/xdp-bench/xdp_redirect_cpumap.bpf.c#L385-L406



Changes from v5:
* Improve kfunc doc comments
* Remove extraneous replay-window setting on selftest reverse path
* Squash two kfunc commits into one
* Rebase to bpf-next to pick up bitfield write patches
* Remove testing of opts.error in selftest prog

Changes from v4:
* Fixup commit message for selftest
* Set opts->error -ENOENT for !x
* Revert single file xfrm + bpf

Changes from v3:
* Place all xfrm bpf integrations in xfrm_bpf.c
* Avoid using nval as a temporary
* Rebase to bpf-next
* Remove extraneous __failure_unpriv annotation for verifier tests

Changes from v2:
* Fix/simplify BPF_CORE_WRITE_BITFIELD() algorithm
* Added verifier tests for bitfield writes
* Fix state leakage across test_tunnel subtests

Changes from v1:
* Move xfrm tunnel tests to test_progs
* Fix writing to opts->error when opts is invalid
* Use __bpf_kfunc_start_defs()
* Remove unused vxlanhdr definition
* Add and use BPF_CORE_WRITE_BITFIELD() macro
* Make series bisect clean

Changes from RFCv2:
* Rebased to ipsec-next
* Fix netns leak

Changes from RFCv1:
* Add Antony's commit tags
* Add KF_ACQUIRE and KF_RELEASE semantics
====================

Reviewed-by: Eyal Birger <eyal.birger@gmail.com>
Link: https://lore.kernel.org/r/cover.1702593901.git.dxu@dxuuu.xyz


Signed-off-by: Alexei Starovoitov <ast@kernel.org>
parents 56925f38 2cd07b0e
Loading
Loading
Loading
Loading
+9 −0
Original line number Diff line number Diff line
@@ -2190,4 +2190,13 @@ static inline int register_xfrm_interface_bpf(void)

#endif

/*
 * Registration hook for the XFRM state BPF kfuncs.
 *
 * With CONFIG_DEBUG_INFO_BTF enabled only the prototype is provided here and
 * the definition must come from another translation unit.  Otherwise the
 * inline stub below returns 0 without doing anything, so callers do not need
 * an #ifdef of their own.
 */
#if IS_ENABLED(CONFIG_DEBUG_INFO_BTF)
int register_xfrm_state_bpf(void);
#else
static inline int register_xfrm_state_bpf(void)
{
	return 0;
}
#endif

#endif	/* _NET_XFRM_H */
+1 −0
Original line number Diff line number Diff line
@@ -21,3 +21,4 @@ obj-$(CONFIG_XFRM_USER_COMPAT) += xfrm_compat.o
obj-$(CONFIG_XFRM_IPCOMP) += xfrm_ipcomp.o
obj-$(CONFIG_XFRM_INTERFACE) += xfrm_interface.o
obj-$(CONFIG_XFRM_ESPINTCP) += espintcp.o
obj-$(CONFIG_DEBUG_INFO_BTF) += xfrm_state_bpf.o
+2 −0
Original line number Diff line number Diff line
@@ -4218,6 +4218,8 @@ void __init xfrm_init(void)
#ifdef CONFIG_XFRM_ESPINTCP
	espintcp_init();
#endif

	register_xfrm_state_bpf();
}

#ifdef CONFIG_AUDITSYSCALL
+134 −0
Original line number Diff line number Diff line
// SPDX-License-Identifier: GPL-2.0-only
/* Unstable XFRM state BPF helpers.
 *
 * Note that it is allowed to break compatibility for these functions since the
 * interface they are exposed through to BPF programs is explicitly unstable.
 */

#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/btf_ids.h>
#include <net/xdp.h>
#include <net/xfrm.h>

/* bpf_xfrm_state_opts - Options for XFRM state lookup helpers
 *
 * Filled in by the BPF program and passed to bpf_xdp_get_xfrm_state().
 * @error is the only member that kfunc writes; all other members are read as
 * lookup inputs.
 *
 * Members:
 * @error	- Out parameter, set for any errors encountered. It is left
 *		  untouched when the lookup succeeds, and also when the
 *		  caller-supplied size is too small to hold this field.
 *		 Values:
 *		   -EINVAL - netns_id is less than -1
 *		   -EINVAL - opts__sz isn't BPF_XFRM_STATE_OPTS_SZ
 *		   -ENONET - No network namespace found for netns_id
 *		   -ENOENT - No xfrm_state found
 * @netns_id	- Specify the network namespace for lookup
 *		 Values:
 *		   BPF_F_CURRENT_NETNS (-1)
 *		     Use namespace associated with ctx
 *		   [0, S32_MAX]
 *		     Network Namespace ID
 * @mark	- XFRM mark to match on
 * @daddr	- Destination address to match on
 * @spi		- Security parameter index to match on
 * @proto	- IP protocol to match on (eg. IPPROTO_ESP)
 * @family	- Protocol family to match on (AF_INET/AF_INET6)
 */
struct bpf_xfrm_state_opts {
	s32 error;
	s32 netns_id;
	u32 mark;
	xfrm_address_t daddr;
	__be32 spi;
	u8 proto;
	u16 family;
};

/* Exact size of struct bpf_xfrm_state_opts.  bpf_xdp_get_xfrm_state() compares
 * its opts__sz argument against this value and fails with -EINVAL on any
 * mismatch.
 */
enum {
	BPF_XFRM_STATE_OPTS_SZ = sizeof(struct bpf_xfrm_state_opts),
};

__bpf_kfunc_start_defs();

/* bpf_xdp_get_xfrm_state - Look up an XFRM state from an XDP program
 *
 * When a `struct xfrm_state *` is returned it must later be released with a
 * matching bpf_xdp_xfrm_state_release(). When NULL is returned the reason is
 * stored in @opts->error, provided @opts is non-NULL and @opts__sz is large
 * enough to cover that field.
 *
 * Parameters:
 * @ctx		- Pointer to ctx (xdp_md) in XDP program
 *		    Cannot be NULL
 * @opts	- Options for lookup (see struct bpf_xfrm_state_opts)
 *		    Cannot be NULL
 * @opts__sz	- Length of the bpf_xfrm_state_opts structure
 *		    Must be BPF_XFRM_STATE_OPTS_SZ
 */
__bpf_kfunc struct xfrm_state *
bpf_xdp_get_xfrm_state(struct xdp_md *ctx, struct bpf_xfrm_state_opts *opts, u32 opts__sz)
{
	struct xdp_buff *buff = (struct xdp_buff *)ctx;
	struct net *lookup_net = dev_net(buff->rxq->dev);
	struct net *held_net = NULL;
	struct xfrm_state *state;

	/* Nothing can be reported unless the error field itself is covered. */
	if (!opts || opts__sz < sizeof(opts->error))
		return NULL;

	/* Wrong structure size, or a netns_id below BPF_F_CURRENT_NETNS. */
	if (opts__sz != BPF_XFRM_STATE_OPTS_SZ ||
	    unlikely(opts->netns_id < BPF_F_CURRENT_NETNS)) {
		opts->error = -EINVAL;
		return NULL;
	}

	/* A non-negative id replaces the receiving device's namespace with
	 * the one resolved by get_net_ns_by_id().
	 */
	if (opts->netns_id >= 0) {
		held_net = get_net_ns_by_id(lookup_net, opts->netns_id);
		if (unlikely(!held_net)) {
			opts->error = -ENONET;
			return NULL;
		}
		lookup_net = held_net;
	}

	state = xfrm_state_lookup(lookup_net, opts->mark, &opts->daddr,
				  opts->spi, opts->proto, opts->family);

	/* Pairs with the get_net_ns_by_id() above. */
	if (held_net)
		put_net(held_net);

	if (!state)
		opts->error = -ENOENT;

	return state;
}

/* bpf_xdp_xfrm_state_release - Release acquired xfrm_state object
 *
 * Thin wrapper that hands @x to xfrm_state_put().
 *
 * This must be invoked for referenced PTR_TO_BTF_ID, and the verifier rejects
 * the program if any references remain in the program in all of the explored
 * states.
 *
 * Parameters:
 * @x		- Pointer to referenced xfrm_state object, obtained using
 *		  bpf_xdp_get_xfrm_state.
 */
__bpf_kfunc void bpf_xdp_xfrm_state_release(struct xfrm_state *x)
{
	xfrm_state_put(x);
}

__bpf_kfunc_end_defs();

/* BTF id set listing the XFRM state kfuncs and their flags: the lookup kfunc
 * is tagged KF_RET_NULL | KF_ACQUIRE, and the release kfunc is tagged
 * KF_RELEASE.
 */
BTF_SET8_START(xfrm_state_kfunc_set)
BTF_ID_FLAGS(func, bpf_xdp_get_xfrm_state, KF_RET_NULL | KF_ACQUIRE)
BTF_ID_FLAGS(func, bpf_xdp_xfrm_state_release, KF_RELEASE)
BTF_SET8_END(xfrm_state_kfunc_set)

/* kfunc id set descriptor handed to register_btf_kfunc_id_set() by
 * register_xfrm_state_bpf(); it points at xfrm_state_kfunc_set.
 */
static const struct btf_kfunc_id_set xfrm_state_xdp_kfunc_set = {
	.owner = THIS_MODULE,
	.set   = &xfrm_state_kfunc_set,
};

/* Register the XFRM state kfunc set for BPF_PROG_TYPE_XDP programs.
 *
 * Returns whatever register_btf_kfunc_id_set() returns.
 */
int __init register_xfrm_state_bpf(void)
{
	return register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP,
					 &xfrm_state_xdp_kfunc_set);
}
+157 −5
Original line number Diff line number Diff line
@@ -50,6 +50,7 @@
 */

#include <arpa/inet.h>
#include <linux/if_link.h>
#include <linux/if_tun.h>
#include <linux/limits.h>
#include <linux/sysctl.h>
@@ -92,6 +93,11 @@
#define IPIP_TUNL_DEV0 "ipip00"
#define IPIP_TUNL_DEV1 "ipip11"

/* Parameters for the XFRM tunnel test: the auth-trunc and enc keys passed to
 * "ip xfrm state add", and one SPI per traffic direction.
 */
#define XFRM_AUTH "0x1111111111111111111111111111111111111111"
#define XFRM_ENC "0x22222222222222222222222222222222"
#define XFRM_SPI_IN_TO_OUT 0x1
#define XFRM_SPI_OUT_TO_IN 0x2

#define PING_ARGS "-i 0.01 -c 3 -w 10 -q"

static int config_device(void)
@@ -264,6 +270,92 @@ static void delete_ipip_tunnel(void)
	SYS_NOFAIL("ip fou del port 5555 2> /dev/null");
}

/* Set up an ESP tunnel-mode XFRM configuration between at_ns0 and the root
 * namespace.
 *
 * Both namespaces get the same two states (SPI XFRM_SPI_IN_TO_OUT with
 * reqid 1 and replay-window 42, SPI XFRM_SPI_OUT_TO_IN with reqid 2) plus the
 * matching in/out policies, followed by a /32 tunnel address and a route to
 * the peer's tunnel address.
 *
 * Returns 0 when every command was issued, -1 when control reaches the fail
 * label that is passed to each SYS() invocation.
 */
static int add_xfrm_tunnel(void)
{
	/* Commands run inside the at_ns0 namespace.
	 * Direction: at_ns0 -> root (state + outbound policy)
	 */
	SYS(fail,
	    "ip netns exec at_ns0 "
		"ip xfrm state add src %s dst %s proto esp "
			"spi %d reqid 1 mode tunnel replay-window 42 "
			"auth-trunc 'hmac(sha1)' %s 96 enc 'cbc(aes)' %s",
	    IP4_ADDR_VETH0, IP4_ADDR1_VETH1, XFRM_SPI_IN_TO_OUT, XFRM_AUTH, XFRM_ENC);
	SYS(fail,
	    "ip netns exec at_ns0 "
		"ip xfrm policy add src %s/32 dst %s/32 dir out "
			"tmpl src %s dst %s proto esp reqid 1 "
			"mode tunnel",
	    IP4_ADDR_TUNL_DEV0, IP4_ADDR_TUNL_DEV1, IP4_ADDR_VETH0, IP4_ADDR1_VETH1);

	/* Direction: root -> at_ns0 (state + inbound policy), still in at_ns0 */
	SYS(fail,
	    "ip netns exec at_ns0 "
		"ip xfrm state add src %s dst %s proto esp "
			"spi %d reqid 2 mode tunnel "
			"auth-trunc 'hmac(sha1)' %s 96 enc 'cbc(aes)' %s",
	    IP4_ADDR1_VETH1, IP4_ADDR_VETH0, XFRM_SPI_OUT_TO_IN, XFRM_AUTH, XFRM_ENC);
	SYS(fail,
	    "ip netns exec at_ns0 "
		"ip xfrm policy add src %s/32 dst %s/32 dir in "
			"tmpl src %s dst %s proto esp reqid 2 "
			"mode tunnel",
	    IP4_ADDR_TUNL_DEV1, IP4_ADDR_TUNL_DEV0, IP4_ADDR1_VETH1, IP4_ADDR_VETH0);

	/* at_ns0: tunnel address on veth0 and route to the peer tunnel address */
	SYS(fail, "ip netns exec at_ns0 ip addr add dev veth0 %s/32",
	    IP4_ADDR_TUNL_DEV0);
	SYS(fail, "ip netns exec at_ns0 ip route add %s dev veth0 via %s src %s",
	    IP4_ADDR_TUNL_DEV1, IP4_ADDR1_VETH1, IP4_ADDR_TUNL_DEV0);

	/* Commands run in the root namespace.
	 * Direction: at_ns0 -> root (state + inbound policy)
	 */
	SYS(fail,
	    "ip xfrm state add src %s dst %s proto esp "
		    "spi %d reqid 1 mode tunnel replay-window 42 "
		    "auth-trunc 'hmac(sha1)' %s 96  enc 'cbc(aes)' %s",
	    IP4_ADDR_VETH0, IP4_ADDR1_VETH1, XFRM_SPI_IN_TO_OUT, XFRM_AUTH, XFRM_ENC);
	SYS(fail,
	    "ip xfrm policy add src %s/32 dst %s/32 dir in "
		    "tmpl src %s dst %s proto esp reqid 1 "
		    "mode tunnel",
	    IP4_ADDR_TUNL_DEV0, IP4_ADDR_TUNL_DEV1, IP4_ADDR_VETH0, IP4_ADDR1_VETH1);

	/* Direction: root -> at_ns0 (state + outbound policy), root namespace */
	SYS(fail,
	    "ip xfrm state add src %s dst %s proto esp "
		    "spi %d reqid 2 mode tunnel "
		    "auth-trunc 'hmac(sha1)' %s 96  enc 'cbc(aes)' %s",
	    IP4_ADDR1_VETH1, IP4_ADDR_VETH0, XFRM_SPI_OUT_TO_IN, XFRM_AUTH, XFRM_ENC);
	SYS(fail,
	    "ip xfrm policy add src %s/32 dst %s/32 dir out "
		    "tmpl src %s dst %s proto esp reqid 2 "
		    "mode tunnel",
	    IP4_ADDR_TUNL_DEV1, IP4_ADDR_TUNL_DEV0, IP4_ADDR1_VETH1, IP4_ADDR_VETH0);

	/* root: tunnel address on veth1 and route to the peer tunnel address */
	SYS(fail, "ip addr add dev veth1 %s/32", IP4_ADDR_TUNL_DEV1);
	SYS(fail, "ip route add %s dev veth1 via %s src %s",
	    IP4_ADDR_TUNL_DEV0, IP4_ADDR_VETH0, IP4_ADDR_TUNL_DEV1);

	return 0;
fail:
	return -1;
}

/* Remove the XFRM policies and states that add_xfrm_tunnel() installed in the
 * root namespace.  Every command goes through SYS_NOFAIL() with stderr
 * discarded.
 *
 * NOTE(review): nothing is deleted inside at_ns0 here; presumably that state
 * goes away when the namespace itself is torn down - confirm against
 * cleanup().
 */
static void delete_xfrm_tunnel(void)
{
	SYS_NOFAIL("ip xfrm policy delete dir out src %s/32 dst %s/32 2> /dev/null",
		   IP4_ADDR_TUNL_DEV1, IP4_ADDR_TUNL_DEV0);
	SYS_NOFAIL("ip xfrm policy delete dir in src %s/32 dst %s/32 2> /dev/null",
		   IP4_ADDR_TUNL_DEV0, IP4_ADDR_TUNL_DEV1);
	SYS_NOFAIL("ip xfrm state delete src %s dst %s proto esp spi %d 2> /dev/null",
		   IP4_ADDR_VETH0, IP4_ADDR1_VETH1, XFRM_SPI_IN_TO_OUT);
	SYS_NOFAIL("ip xfrm state delete src %s dst %s proto esp spi %d 2> /dev/null",
		   IP4_ADDR1_VETH1, IP4_ADDR_VETH0, XFRM_SPI_OUT_TO_IN);
}

static int test_ping(int family, const char *addr)
{
	SYS(fail, "%s %s %s > /dev/null", ping_command(family), PING_ARGS, addr);
@@ -532,25 +624,85 @@ static void test_ipip_tunnel(enum ipip_encap encap)
		test_tunnel_kern__destroy(skel);
}

/* Subtest: XFRM state lookup from BPF over an ESP tunnel.
 *
 * Installs the XFRM tunnel, attaches the xfrm_get_state program at tc ingress
 * and the xfrm_get_state_xdp program via XDP on veth1, pings the root-side
 * tunnel address from at_ns0, and finally checks the values the BPF programs
 * stored in the skeleton's .bss.
 *
 * Neither attachment is undone explicitly here.
 * NOTE(review): presumably both disappear when the caller's cleanup() removes
 * the veth pair - confirm.
 */
static void test_xfrm_tunnel(void)
{
	DECLARE_LIBBPF_OPTS(bpf_tc_hook, tc_hook,
			    .attach_point = BPF_TC_INGRESS);
	LIBBPF_OPTS(bpf_xdp_attach_opts, opts);
	struct test_tunnel_kern *skel = NULL;
	struct nstoken *nstoken;
	int xdp_prog_fd;
	int tc_prog_fd;
	int ifindex;
	int err;

	err = add_xfrm_tunnel();
	if (!ASSERT_OK(err, "add_xfrm_tunnel"))
		return;

	skel = test_tunnel_kern__open_and_load();
	if (!ASSERT_OK_PTR(skel, "test_tunnel_kern__open_and_load"))
		goto done;

	ifindex = if_nametoindex("veth1");
	if (!ASSERT_NEQ(ifindex, 0, "veth1 ifindex"))
		goto done;

	/* attach tc prog to veth1 ingress */
	tc_hook.ifindex = ifindex;
	tc_prog_fd = bpf_program__fd(skel->progs.xfrm_get_state);
	if (!ASSERT_GE(tc_prog_fd, 0, "bpf_program__fd"))
		goto done;
	if (attach_tc_prog(&tc_hook, tc_prog_fd, -1))
		goto done;

	/* attach xdp prog to veth1 */
	xdp_prog_fd = bpf_program__fd(skel->progs.xfrm_get_state_xdp);
	if (!ASSERT_GE(xdp_prog_fd, 0, "bpf_program__fd"))
		goto done;
	err = bpf_xdp_attach(ifindex, xdp_prog_fd, XDP_FLAGS_REPLACE, &opts);
	if (!ASSERT_OK(err, "bpf_xdp_attach"))
		goto done;

	/* ping from at_ns0 namespace test */
	nstoken = open_netns("at_ns0");
	/* Bail out if the namespace switch failed: otherwise the ping would
	 * run in the wrong namespace and a NULL token would be passed to
	 * close_netns().
	 */
	if (!ASSERT_OK_PTR(nstoken, "open_netns"))
		goto done;
	err = test_ping(AF_INET, IP4_ADDR_TUNL_DEV1);
	close_netns(nstoken);
	if (!ASSERT_OK(err, "test_ping"))
		goto done;

	/* Values recorded by the BPF programs for the at_ns0 -> root state. */
	if (!ASSERT_EQ(skel->bss->xfrm_reqid, 1, "req_id"))
		goto done;
	if (!ASSERT_EQ(skel->bss->xfrm_spi, XFRM_SPI_IN_TO_OUT, "spi"))
		goto done;
	/* 0xac100164 is 172.16.1.100 written as a 32-bit value */
	if (!ASSERT_EQ(skel->bss->xfrm_remote_ip, 0xac100164, "remote_ip"))
		goto done;
	if (!ASSERT_EQ(skel->bss->xfrm_replay_window, 42, "replay_window"))
		goto done;

done:
	delete_xfrm_tunnel();
	if (skel)
		test_tunnel_kern__destroy(skel);
}

/* Run test_<name>(args...) as a subtest called "<name>".
 *
 * If test__start_subtest() selects the subtest, config_device() is called
 * before the test function and cleanup() after it, so each subtest gets its
 * own setup and teardown.  The return value of config_device() is not
 * inspected.
 */
#define RUN_TEST(name, ...)						\
	({								\
		if (test__start_subtest(#name)) {			\
			config_device();				\
			test_ ## name(__VA_ARGS__);			\
			cleanup();					\
		}							\
	})

/* Run all tunnel subtests in sequence.
 *
 * RUN_TEST() already wraps every selected subtest in config_device() and
 * cleanup(), so no device setup happens here; an extra config_device() call
 * would configure the devices twice for the first subtest that runs.
 *
 * @arg is unused.  Always returns NULL.
 */
static void *test_tunnel_run_tests(void *arg)
{
	/* NOTE(review): presumably clears leftovers from an earlier,
	 * interrupted run before the first subtest configures devices -
	 * confirm against cleanup().
	 */
	cleanup();

	RUN_TEST(vxlan_tunnel);
	RUN_TEST(ip6vxlan_tunnel);
	RUN_TEST(ipip_tunnel, NONE);
	RUN_TEST(ipip_tunnel, FOU);
	RUN_TEST(ipip_tunnel, GUE);
	RUN_TEST(xfrm_tunnel);

	return NULL;
}
Loading