Commit 2b1fd82c authored by Alexei Starovoitov
Browse files

Merge branch 'bpf-arena-add-kfunc-for-reserving-arena-memory'



Emil Tsalapatis says:

====================
bpf/arena: Add kfunc for reserving arena memory

Add a new kfunc for BPF arenas that reserves a region of the mapping
to prevent it from being mapped. These regions serve as guards against
out-of-bounds accesses and are useful for debugging arena-related code.

From v3 (20250709015712.97099-1-emil@etsalapatis.com)
------------------------------------------------------

- Added Acked-by tags by Yonghong.
- Replace hardcoded error numbers in selftests (Yonghong).
- Fixed selftest for partially freeing a reserved region (Yonghong).

From v2 (20250702003351.197234-1-emil@etsalapatis.com)
------------------------------------------------------

- Removed -EALREADY and replaced with -EINVAL to bring error handling in
  line with the rest of the BPF code (Alexei).

From v1 (20250620031118.245601-1-emil@etsalapatis.com)
------------------------------------------------------

- Removed the additional guard range tree. Adjusted tests accordingly.
  Reserved regions now behave like allocated regions, and can be
  unreserved using bpf_arena_free_pages(). They can also be allocated
  from userspace through minor faults. It is up to the user to prevent
  erroneous frees and/or use the BPF_F_SEGV_ON_FAULT flag to catch
  stray userspace accesses (Alexei).
- Changed terminology from guard pages to reserved pages (Alexei,
  Kartikeya).

Signed-off-by: Emil Tsalapatis <emil@etsalapatis.com>
====================

Link: https://patch.msgid.link/20250709191312.29840-1-emil@etsalapatis.com


Signed-off-by: Alexei Starovoitov <ast@kernel.org>
parents ad97cb2e 9f9559f0
Loading
Loading
Loading
Loading
+43 −0
Original line number Diff line number Diff line
@@ -550,6 +550,34 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt)
	}
}

/*
 * Reserve an arena virtual address range without populating it. This call stops
 * bpf_arena_alloc_pages from adding pages to this range.
 *
 * @arena:    arena to reserve the range in
 * @uaddr:    user address of the first page of the range; must be page aligned
 * @page_cnt: number of pages to reserve
 *
 * Returns 0 on success, -EINVAL if @uaddr is not page aligned or the range
 * extends past the end of the arena, -EBUSY if the range is not entirely
 * free, or the error returned by range_tree_clear().
 */
static int arena_reserve_pages(struct bpf_arena *arena, long uaddr, u32 page_cnt)
{
	long page_cnt_max = (arena->user_vm_end - arena->user_vm_start) >> PAGE_SHIFT;
	long pgoff;
	int ret;

	/*
	 * A misaligned address reserves nothing. Report it as an error rather
	 * than returning 0, which callers would read as a successful reservation.
	 */
	if (uaddr & ~PAGE_MASK)
		return -EINVAL;

	pgoff = compute_pgoff(arena, uaddr);
	if (pgoff + page_cnt > page_cnt_max)
		return -EINVAL;

	/* Scope-based lock: released automatically on every return below. */
	guard(mutex)(&arena->lock);

	/* Cannot reserve pages that are already allocated or reserved. */
	ret = is_range_tree_set(&arena->rt, pgoff, page_cnt);
	if (ret)
		return -EBUSY;

	/* "Allocate" the region to prevent it from being allocated. */
	return range_tree_clear(&arena->rt, pgoff, page_cnt);
}

__bpf_kfunc_start_defs();

__bpf_kfunc void *bpf_arena_alloc_pages(void *p__map, void *addr__ign, u32 page_cnt,
@@ -573,11 +601,26 @@ __bpf_kfunc void bpf_arena_free_pages(void *p__map, void *ptr__ign, u32 page_cnt
		return;
	arena_free_pages(arena, (long)ptr__ign, page_cnt);
}

/*
 * kfunc entry point for reserving arena pages.
 *
 * @p__map:   map pointer; must refer to a BPF_MAP_TYPE_ARENA map
 * @ptr__ign: arena address of the first page to reserve
 * @page_cnt: number of pages to reserve
 *
 * Returns -EINVAL if the map is not an arena, 0 if @page_cnt is zero,
 * otherwise the result of arena_reserve_pages().
 */
__bpf_kfunc int bpf_arena_reserve_pages(void *p__map, void *ptr__ign, u32 page_cnt)
{
	struct bpf_map *bmap = p__map;

	if (bmap->map_type != BPF_MAP_TYPE_ARENA)
		return -EINVAL;

	/* An empty request has nothing to reserve. */
	if (page_cnt == 0)
		return 0;

	return arena_reserve_pages(container_of(bmap, struct bpf_arena, map),
				   (long)ptr__ign, page_cnt);
}
__bpf_kfunc_end_defs();

/*
 * BTF id set listing the arena kfuncs together with their flags.
 * bpf_arena_reserve_pages is registered with the same flags as
 * bpf_arena_free_pages.
 */
BTF_KFUNCS_START(arena_kfuncs)
BTF_ID_FLAGS(func, bpf_arena_alloc_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE | KF_ARENA_RET | KF_ARENA_ARG2)
BTF_ID_FLAGS(func, bpf_arena_free_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE | KF_ARENA_ARG2)
BTF_ID_FLAGS(func, bpf_arena_reserve_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE | KF_ARENA_ARG2)
BTF_KFUNCS_END(arena_kfuncs)

static const struct btf_kfunc_id_set common_kfunc_set = {
+3 −0
Original line number Diff line number Diff line
@@ -46,8 +46,11 @@

/*
 * Allocate page_cnt arena pages. addr may name a specific arena address or be
 * NULL; the result is an arena pointer, or NULL when the request fails.
 */
void __arena* bpf_arena_alloc_pages(void *map, void __arena *addr, __u32 page_cnt,
				    int node_id, __u64 flags) __ksym __weak;
/*
 * Reserve page_cnt pages starting at addr so that they cannot be allocated.
 * Returns 0 on success or a negative error code.
 */
int bpf_arena_reserve_pages(void *map, void __arena *addr, __u32 page_cnt) __ksym __weak;
/* Free page_cnt pages starting at ptr; reserved pages are released the same way. */
void bpf_arena_free_pages(void *map, void __arena *ptr, __u32 page_cnt) __ksym __weak;

/* Arena pointer built from the user_vm_start field of the arena behind map. */
#define arena_base(map) ((void __arena *)((struct bpf_arena *)(map))->user_vm_start)

#else /* when compiled as user space code */

#define __arena
+106 −0
Original line number Diff line number Diff line
@@ -3,6 +3,7 @@

#define BPF_NO_KFUNC_PROTOTYPES
#include <vmlinux.h>
#include <errno.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include "bpf_misc.h"
@@ -114,6 +115,111 @@ int basic_alloc3(void *ctx)
	return 0;
}

/*
 * Allocate one page, reserve the page right after it, and check that the
 * reserved page can be allocated neither explicitly nor implicitly.
 */
SEC("syscall")
__success __retval(0)
int basic_reserve1(void *ctx)
{
#if defined(__BPF_FEATURE_ADDR_SPACE_CAST)
	char __arena *first;
	char __arena *second;
	char __arena *got;

	first = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0);
	if (!first)
		return 1;

	/* The page following the allocation is the one to reserve. */
	second = first + __PAGE_SIZE;
	if (bpf_arena_reserve_pages(&arena, second, 1))
		return 2;

	/* An explicit request for the reserved address must fail. */
	got = bpf_arena_alloc_pages(&arena, second, 1, NUMA_NO_NODE, 0);
	if (got)
		return 3;

	/* So must an implicit one (since there's only 2 pages in the arena). */
	got = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0);
	if (got)
		return 4;
#endif
	return 0;
}

/* Reserve the first arena page, then check that allocating it explicitly fails. */
SEC("syscall")
__success __retval(0)
int basic_reserve2(void *ctx)
{
#if defined(__BPF_FEATURE_ADDR_SPACE_CAST)
	char __arena *start = arena_base(&arena);
	char __arena *got;

	if (bpf_arena_reserve_pages(&arena, start, 1))
		return 1;

	got = bpf_arena_alloc_pages(&arena, start, 1, NUMA_NO_NODE, 0);
	if ((u64)got)
		return 2;
#endif
	return 0;
}

/* A second reservation of an already reserved page must fail with -EBUSY. */
SEC("syscall")
__success __retval(0)
int reserve_twice(void *ctx)
{
#if defined(__BPF_FEATURE_ADDR_SPACE_CAST)
	char __arena *start = arena_base(&arena);

	/* First reservation of the page succeeds. */
	if (bpf_arena_reserve_pages(&arena, start, 1))
		return 1;

	/* Repeating it on the same page is rejected. */
	if (bpf_arena_reserve_pages(&arena, start, 1) != -EBUSY)
		return 2;
#endif
	return 0;
}

/* Every reservation that does not fit inside the arena must return -EINVAL. */
SEC("syscall")
__success __retval(0)
int reserve_invalid_region(void *ctx)
{
#if defined(__BPF_FEATURE_ADDR_SPACE_CAST)
	char __arena *start;

	/* Try a NULL pointer. */
	if (bpf_arena_reserve_pages(&arena, NULL, 3) != -EINVAL)
		return 1;

	start = arena_base(&arena);

	/* Valid start address, but page counts that run past the arena end. */
	if (bpf_arena_reserve_pages(&arena, start, 3) != -EINVAL)
		return 2;

	if (bpf_arena_reserve_pages(&arena, start, 4096) != -EINVAL)
		return 3;

	/* Largest page count representable in the u32 argument. */
	if (bpf_arena_reserve_pages(&arena, start, (1ULL << 32) - 1) != -EINVAL)
		return 4;
#endif
	return 0;
}

SEC("iter.s/bpf_map")
__success __log_level(2)
int iter_maps1(struct bpf_iter__bpf_map *ctx)
+98 −0
Original line number Diff line number Diff line
@@ -67,6 +67,104 @@ int big_alloc1(void *ctx)
	return 0;
}

/*
 * Try to access reserved pages. Behavior should be identical with accessing
 * unallocated pages: the test expects every reserved page to read back as
 * zero after a write was attempted on it.
 *
 * Returns 0 on success, 1 if the reservation fails, 2 + 2 * i if page i holds
 * the written value, or 2 + 2 * i + 1 if page i holds any other nonzero value.
 */
SEC("syscall")
__success __retval(0)
int access_reserved(void *ctx)
{
#if defined(__BPF_FEATURE_ADDR_SPACE_CAST)
	volatile char __arena *page;
	char __arena *base;
	const size_t len = 4;
	int ret, i;

	/* Get a separate region of the arena. */
	page = base = arena_base(&arena) + 16384 * PAGE_SIZE;

	ret = bpf_arena_reserve_pages(&arena, base, len);
	if (ret)
		return 1;

	/*
	 * Try to dirty reserved memory. Advance to each page in turn so that
	 * every reserved page is written, not just the first one.
	 */
	for (i = 0; i < len && can_loop; i++) {
		page = (volatile char __arena *)(base + i * PAGE_SIZE);
		*page = 0x5a;
	}

	for (i = 0; i < len && can_loop; i++) {
		page = (volatile char __arena *)(base + i * PAGE_SIZE);

		/*
		 * Error out in case either the write went through,
		 * or the address has random garbage.
		 */
		if (*page == 0x5a)
			return 2 + 2 * i;

		if (*page)
			return 2 + 2 * i + 1;
	}
#endif
	return 0;
}

/* An allocation whose range overlaps a reservation must fail. */
SEC("syscall")
__success __retval(0)
int request_partially_reserved(void *ctx)
{
#if defined(__BPF_FEATURE_ADDR_SPACE_CAST)
	volatile char __arena *got;
	char __arena *region;

	/* Add an arbitrary page offset. */
	region = arena_base(&arena) + 4096 * __PAGE_SIZE;

	/* Reserve 4 pages starting 3 pages into the region. */
	if (bpf_arena_reserve_pages(&arena, region + 3 * __PAGE_SIZE, 4))
		return 1;

	/* A 5 page request from the region start runs into the reservation. */
	got = bpf_arena_alloc_pages(&arena, region, 5, NUMA_NO_NODE, 0);
	if ((u64)got != 0ULL)
		return 2;
#endif
	return 0;
}

/*
 * Free a range that covers one allocated and one reserved page with a single
 * call, then check that the same range can be allocated again.
 */
SEC("syscall")
__success __retval(0)
int free_reserved(void *ctx)
{
#if defined(__BPF_FEATURE_ADDR_SPACE_CAST)
	char __arena *region;
	char __arena *got;

	/* Add an arbitrary page offset. */
	region = arena_base(&arena) + 32768 * __PAGE_SIZE;

	/* Pages 0 and 1 of the region are allocated... */
	got = bpf_arena_alloc_pages(&arena, region, 2, NUMA_NO_NODE, 0);
	if (!got)
		return 1;

	/* ...and pages 2 and 3 are reserved. */
	if (bpf_arena_reserve_pages(&arena, region + 2 * __PAGE_SIZE, 2))
		return 2;

	/*
	 * bpf_arena_free_pages() should treat reserved and allocated pages
	 * alike. Release page 1 (allocated) and page 2 (reserved) together.
	 */
	bpf_arena_free_pages(&arena, region + __PAGE_SIZE, 2);

	/* If the free above worked, the same two pages can be allocated again. */
	got = bpf_arena_alloc_pages(&arena, region + __PAGE_SIZE, 2, NUMA_NO_NODE, 0);
	if (!got)
		return 3;
#endif
	return 0;
}

#if defined(__BPF_FEATURE_ADDR_SPACE_CAST)
#define PAGE_CNT 100
__u8 __arena * __arena page[PAGE_CNT]; /* occupies the first page */