Commit bfccacdf authored by Andrii Nakryiko
Browse files

Merge branch 'allow-mmap-of-sys-kernel-btf-vmlinux'



Lorenz Bauer says:

====================
Allow mmap of /sys/kernel/btf/vmlinux

I'd like to cut down the memory usage of parsing vmlinux BTF in ebpf-go.
With some upcoming changes the library is sitting at 5MiB for a parse.
Most of that memory is simply copying the BTF blob into user space.
By allowing vmlinux BTF to be mmapped read-only into user space I can
cut memory usage by about 75%.

Signed-off-by: Lorenz Bauer <lmb@isovalent.com>
---
Changes in v5:
- Fix error return of btf_parse_raw_mmap (Andrii)
- Link to v4: https://lore.kernel.org/r/20250510-vmlinux-mmap-v4-0-69e424b2a672@isovalent.com

Changes in v4:
- Go back to remap_pfn_range for aarch64 compat
- Dropped btf_new_no_copy (Andrii)
- Fixed nits in selftests (Andrii)
- Clearer error handling in the mmap handler (Andrii)
- Fixed build on s390
- Link to v3: https://lore.kernel.org/r/20250505-vmlinux-mmap-v3-0-5d53afa060e8@isovalent.com

Changes in v3:
- Remove slightly confusing calculation of trailing (Alexei)
- Use vm_insert_page (Alexei)
- Simplified libbpf code
- Link to v2: https://lore.kernel.org/r/20250502-vmlinux-mmap-v2-0-95c271434519@isovalent.com

Changes in v2:
- Use btf__new in selftest
- Avoid vm_iomap_memory in btf_vmlinux_mmap
- Add VM_DONTDUMP
- Add support to libbpf
- Link to v1: https://lore.kernel.org/r/20250501-vmlinux-mmap-v1-0-aa2724572598@isovalent.com

---
====================

Link: https://patch.msgid.link/20250520-vmlinux-mmap-v5-0-e8c941acc414@isovalent.com


Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
parents 8259eb0e 3c0421c9
Loading
Loading
Loading
Loading
+2 −1
Original line number Diff line number Diff line
@@ -667,10 +667,11 @@ defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG)
 */
#ifdef CONFIG_DEBUG_INFO_BTF
#define BTF								\
	. = ALIGN(PAGE_SIZE);						\
	.BTF : AT(ADDR(.BTF) - LOAD_OFFSET) {				\
		BOUNDED_SECTION_BY(.BTF, _BTF)				\
	}								\
	. = ALIGN(4);							\
	. = ALIGN(PAGE_SIZE);						\
	.BTF_ids : AT(ADDR(.BTF_ids) - LOAD_OFFSET) {			\
		*(.BTF_ids)						\
	}
+32 −0
Original line number Diff line number Diff line
@@ -7,14 +7,46 @@
#include <linux/kobject.h>
#include <linux/init.h>
#include <linux/sysfs.h>
#include <linux/mm.h>
#include <linux/io.h>
#include <linux/btf.h>

/* See scripts/link-vmlinux.sh, gen_btf() func for details */
extern char __start_BTF[];
extern char __stop_BTF[];

/*
 * mmap handler for the vmlinux BTF sysfs attribute.
 *
 * Maps the pages backing __start_BTF into the caller's address space. The
 * request is refused with -EINVAL when the attribute is not the vmlinux blob,
 * the blob is not page aligned, a non-zero file offset is given, the PFN range
 * would wrap, or the mapping is longer than the page-rounded blob. Writable,
 * executable or shared mappings are refused with -EACCES.
 *
 * Returns 0 on success or a negative errno (including whatever
 * remap_pfn_range() reports).
 */
static int btf_sysfs_vmlinux_mmap(struct file *filp, struct kobject *kobj,
				  const struct bin_attribute *attr,
				  struct vm_area_struct *vma)
{
	unsigned long btf_pages = PAGE_ALIGN(attr->size) >> PAGE_SHIFT;
	size_t map_len = vma->vm_end - vma->vm_start;
	phys_addr_t btf_phys = virt_to_phys(__start_BTF);
	unsigned long first_pfn = btf_phys >> PAGE_SHIFT;

	/* Only the vmlinux blob itself may be mapped ... */
	if (attr->private != __start_BTF)
		return -EINVAL;

	/* ... and it must start on a page boundary to be remapped by PFN. */
	if (!PAGE_ALIGNED(btf_phys))
		return -EINVAL;

	/* Mappings always begin at the start of the blob. */
	if (vma->vm_pgoff != 0)
		return -EINVAL;

	if (vma->vm_flags & (VM_WRITE | VM_EXEC | VM_MAYSHARE))
		return -EACCES;

	/* Guard against the PFN range wrapping around. */
	if (first_pfn + btf_pages < first_pfn)
		return -EINVAL;

	/* Never expose pages beyond the page-rounded end of the blob. */
	if (btf_pages < (map_len >> PAGE_SHIFT))
		return -EINVAL;

	/* Set VM_DONTDUMP; clear VM_MAYEXEC and VM_MAYWRITE on the vma. */
	vm_flags_mod(vma, VM_DONTDUMP, VM_MAYEXEC | VM_MAYWRITE);

	return remap_pfn_range(vma, vma->vm_start, first_pfn, map_len,
			       vma->vm_page_prot);
}

/*
 * Binary sysfs attribute named "vmlinux" (mode 0444) that can be both read
 * and mmap()ed. Fields not listed here (e.g. size/private, which the mmap
 * handler consults) are presumably filled in during init -- confirm against
 * the init code, which is not visible in this hunk.
 */
static struct bin_attribute bin_attr_btf_vmlinux __ro_after_init = {
	.attr = {
		.name = "vmlinux",
		.mode = 0444,
	},
	.mmap = btf_sysfs_vmlinux_mmap,
	.read_new = sysfs_bin_attr_simple_read,
};

struct kobject *btf_kobj;
+71 −18
Original line number Diff line number Diff line
@@ -12,6 +12,7 @@
#include <sys/utsname.h>
#include <sys/param.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <linux/kernel.h>
#include <linux/err.h>
#include <linux/btf.h>
@@ -120,6 +121,9 @@ struct btf {
	/* whether base_btf should be freed in btf_free for this instance */
	bool owns_base;

	/* whether raw_data is a (read-only) mmap */
	bool raw_data_is_mmap;

	/* BTF object FD, if loaded into kernel */
	int fd;

@@ -951,6 +955,17 @@ static bool btf_is_modifiable(const struct btf *btf)
	return (void *)btf->hdr != btf->raw_data;
}

/*
 * Release btf->raw_data using the method that matches how it was obtained:
 * munmap() for a mapping (raw_data_is_mmap set), free() otherwise.
 * Afterwards raw_data is NULL and raw_data_is_mmap is false.
 */
static void btf_free_raw_data(struct btf *btf)
{
	void *raw = btf->raw_data;

	btf->raw_data = NULL;

	if (!btf->raw_data_is_mmap) {
		free(raw);
		return;
	}

	btf->raw_data_is_mmap = false;
	munmap(raw, btf->raw_size);
}

void btf__free(struct btf *btf)
{
	if (IS_ERR_OR_NULL(btf))
@@ -970,7 +985,7 @@ void btf__free(struct btf *btf)
		free(btf->types_data);
		strset__free(btf->strs_set);
	}
	free(btf->raw_data);
	btf_free_raw_data(btf);
	free(btf->raw_data_swapped);
	free(btf->type_offs);
	if (btf->owns_base)
@@ -1030,7 +1045,7 @@ struct btf *btf__new_empty_split(struct btf *base_btf)
	return libbpf_ptr(btf_new_empty(base_btf));
}

static struct btf *btf_new(const void *data, __u32 size, struct btf *base_btf)
static struct btf *btf_new(const void *data, __u32 size, struct btf *base_btf, bool is_mmap)
{
	struct btf *btf;
	int err;
@@ -1050,12 +1065,18 @@ static struct btf *btf_new(const void *data, __u32 size, struct btf *base_btf)
		btf->start_str_off = base_btf->hdr->str_len;
	}

	if (is_mmap) {
		btf->raw_data = (void *)data;
		btf->raw_data_is_mmap = true;
	} else {
		btf->raw_data = malloc(size);
		if (!btf->raw_data) {
			err = -ENOMEM;
			goto done;
		}
		memcpy(btf->raw_data, data, size);
	}

	btf->raw_size = size;

	btf->hdr = btf->raw_data;
@@ -1083,12 +1104,12 @@ static struct btf *btf_new(const void *data, __u32 size, struct btf *base_btf)

/*
 * Create a BTF object from @size bytes of raw BTF at @data, with no base BTF.
 *
 * btf_new() is called with is_mmap == false, so the data is copied into a
 * heap buffer owned by the returned object rather than referenced in place.
 * The result is passed through libbpf_ptr() before being returned.
 */
struct btf *btf__new(const void *data, __u32 size)
{
	return libbpf_ptr(btf_new(data, size, NULL, false));
}

/*
 * Create a split BTF object from @size bytes of raw BTF at @data, layered on
 * top of @base_btf.
 *
 * btf_new() is called with is_mmap == false, so the data is copied into a
 * heap buffer owned by the returned object rather than referenced in place.
 * The result is passed through libbpf_ptr() before being returned.
 */
struct btf *btf__new_split(const void *data, __u32 size, struct btf *base_btf)
{
	return libbpf_ptr(btf_new(data, size, base_btf, false));
}

struct btf_elf_secs {
@@ -1209,7 +1230,7 @@ static struct btf *btf_parse_elf(const char *path, struct btf *base_btf,

	if (secs.btf_base_data) {
		dist_base_btf = btf_new(secs.btf_base_data->d_buf, secs.btf_base_data->d_size,
					NULL);
					NULL, false);
		if (IS_ERR(dist_base_btf)) {
			err = PTR_ERR(dist_base_btf);
			dist_base_btf = NULL;
@@ -1218,7 +1239,7 @@ static struct btf *btf_parse_elf(const char *path, struct btf *base_btf,
	}

	btf = btf_new(secs.btf_data->d_buf, secs.btf_data->d_size,
		      dist_base_btf ?: base_btf);
		      dist_base_btf ?: base_btf, false);
	if (IS_ERR(btf)) {
		err = PTR_ERR(btf);
		goto done;
@@ -1335,7 +1356,7 @@ static struct btf *btf_parse_raw(const char *path, struct btf *base_btf)
	}

	/* finally parse BTF data */
	btf = btf_new(data, sz, base_btf);
	btf = btf_new(data, sz, base_btf, false);

err_out:
	free(data);
@@ -1354,6 +1375,37 @@ struct btf *btf__parse_raw_split(const char *path, struct btf *base_btf)
	return libbpf_ptr(btf_parse_raw(path, base_btf));
}

/*
 * Parse raw BTF from @path by mapping the file privately and read-only
 * instead of copying it into a heap buffer.
 *
 * On success the returned object takes over the mapping: btf_new() is called
 * with is_mmap == true, so raw_data is later released with munmap().
 *
 * On failure an ERR_PTR()-encoded negative errno is returned on every path.
 * This matches what btf_new() hands back and what callers test with IS_ERR()
 * (e.g. to fall back to a copying parser). libbpf_err_ptr() must not be used
 * here: it is the public-API error convention and would not yield a pointer
 * that IS_ERR() recognizes.
 */
static struct btf *btf_parse_raw_mmap(const char *path, struct btf *base_btf)
{
	struct stat st;
	void *data;
	struct btf *btf;
	int fd, err;

	fd = open(path, O_RDONLY);
	if (fd < 0)
		return ERR_PTR(-errno);

	if (fstat(fd, &st) < 0) {
		err = -errno;
		close(fd);
		return ERR_PTR(err);
	}

	data = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
	/* Save errno now; close() below may overwrite it. */
	err = -errno;
	/* The mapping stays valid after the descriptor is closed. */
	close(fd);

	if (data == MAP_FAILED)
		return ERR_PTR(err);

	btf = btf_new(data, st.st_size, base_btf, true);
	/* On failure the mapping was not adopted, so drop it here. */
	if (IS_ERR(btf))
		munmap(data, st.st_size);

	return btf;
}

static struct btf *btf_parse(const char *path, struct btf *base_btf, struct btf_ext **btf_ext)
{
	struct btf *btf;
@@ -1618,7 +1670,7 @@ struct btf *btf_get_from_fd(int btf_fd, struct btf *base_btf)
		goto exit_free;
	}

	btf = btf_new(ptr, btf_info.btf_size, base_btf);
	btf = btf_new(ptr, btf_info.btf_size, base_btf, false);

exit_free:
	free(ptr);
@@ -1658,10 +1710,8 @@ struct btf *btf__load_from_kernel_by_id(__u32 id)

static void btf_invalidate_raw_data(struct btf *btf)
{
	if (btf->raw_data) {
		free(btf->raw_data);
		btf->raw_data = NULL;
	}
	if (btf->raw_data)
		btf_free_raw_data(btf);
	if (btf->raw_data_swapped) {
		free(btf->raw_data_swapped);
		btf->raw_data_swapped = NULL;
@@ -5331,7 +5381,10 @@ struct btf *btf__load_vmlinux_btf(void)
		pr_warn("kernel BTF is missing at '%s', was CONFIG_DEBUG_INFO_BTF enabled?\n",
			sysfs_btf_path);
	} else {
		btf = btf_parse_raw_mmap(sysfs_btf_path, NULL);
		if (IS_ERR(btf))
			btf = btf__parse(sysfs_btf_path, NULL);

		if (!btf) {
			err = -errno;
			pr_warn("failed to read kernel BTF from '%s': %s\n",
+81 −0
Original line number Diff line number Diff line
// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
/* Copyright (c) 2025 Isovalent */

#include <test_progs.h>
#include <bpf/btf.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>

static void test_btf_mmap_sysfs(const char *path, struct btf *base)
{
	struct stat st;
	__u64 btf_size, end;
	void *raw_data = NULL;
	int fd = -1;
	long page_size;
	struct btf *btf = NULL;

	page_size = sysconf(_SC_PAGESIZE);
	if (!ASSERT_GE(page_size, 0, "get_page_size"))
		goto cleanup;

	if (!ASSERT_OK(stat(path, &st), "stat_btf"))
		goto cleanup;

	btf_size = st.st_size;
	end = (btf_size + page_size - 1) / page_size * page_size;

	fd = open(path, O_RDONLY);
	if (!ASSERT_GE(fd, 0, "open_btf"))
		goto cleanup;

	raw_data = mmap(NULL, btf_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
	if (!ASSERT_EQ(raw_data, MAP_FAILED, "mmap_btf_writable"))
		goto cleanup;

	raw_data = mmap(NULL, btf_size, PROT_READ, MAP_SHARED, fd, 0);
	if (!ASSERT_EQ(raw_data, MAP_FAILED, "mmap_btf_shared"))
		goto cleanup;

	raw_data = mmap(NULL, end + 1, PROT_READ, MAP_PRIVATE, fd, 0);
	if (!ASSERT_EQ(raw_data, MAP_FAILED, "mmap_btf_invalid_size"))
		goto cleanup;

	raw_data = mmap(NULL, end, PROT_READ, MAP_PRIVATE, fd, 0);
	if (!ASSERT_OK_PTR(raw_data, "mmap_btf"))
		goto cleanup;

	if (!ASSERT_EQ(mprotect(raw_data, btf_size, PROT_READ | PROT_WRITE), -1,
	    "mprotect_writable"))
		goto cleanup;

	if (!ASSERT_EQ(mprotect(raw_data, btf_size, PROT_READ | PROT_EXEC), -1,
	    "mprotect_executable"))
		goto cleanup;

	/* Check padding is zeroed */
	for (int i = btf_size; i < end; i++) {
		if (((__u8 *)raw_data)[i] != 0) {
			PRINT_FAIL("tail of BTF is not zero at page offset %d\n", i);
			goto cleanup;
		}
	}

	btf = btf__new_split(raw_data, btf_size, base);
	if (!ASSERT_OK_PTR(btf, "parse_btf"))
		goto cleanup;

cleanup:
	btf__free(btf);
	if (raw_data && raw_data != MAP_FAILED)
		munmap(raw_data, btf_size);
	if (fd >= 0)
		close(fd);
}

/* Run the sysfs BTF mmap checks against the vmlinux BTF file, with no base BTF. */
void test_btf_sysfs(void)
{
	const char *vmlinux_btf = "/sys/kernel/btf/vmlinux";

	test_btf_mmap_sysfs(vmlinux_btf, NULL);
}