Commit c336b0b3, authored by Puranjay Mohan, committed by Alexei Starovoitov
Browse files

bpf: arena: populate vm_area without allocating memory



vm_area_map_pages() may allocate memory while inserting pages into bpf
arena's vm_area. In order to make the bpf_arena_alloc_pages() kfunc
non-sleepable, change bpf arena to populate pages without
allocating memory:
- at arena creation time populate all page table levels except
  the last level
- when new pages need to be inserted call apply_to_page_range() again
  with apply_range_set_cb() which will only set_pte_at() those pages and
  will not allocate memory.
- when freeing pages call apply_to_existing_page_range() with
  apply_range_clear_cb() to clear the pte for the page to be removed. This
  doesn't free intermediate page table levels.

Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
Link: https://lore.kernel.org/r/20251222195022.431211-2-puranjay@kernel.org


Signed-off-by: Alexei Starovoitov <ast@kernel.org>
parent ac1c5bc7
Loading
Loading
Loading
Loading
+90 −10
Original line number Diff line number Diff line
@@ -2,11 +2,13 @@
/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/cacheflush.h>
#include <linux/err.h>
#include "linux/filter.h"
#include <linux/btf_ids.h>
#include <linux/vmalloc.h>
#include <linux/pagemap.h>
#include <asm/tlbflush.h>
#include "range_tree.h"

/*
@@ -92,6 +94,68 @@ static long compute_pgoff(struct bpf_arena *arena, long uaddr)
	return (u32)(uaddr - (u32)arena->user_vm_start) >> PAGE_SHIFT;
}

/*
 * Cursor handed to apply_range_set_cb() through the opaque 'data' pointer
 * of apply_to_page_range().
 */
struct apply_range_data {
	/* Array of pages to install, one per visited pte. */
	struct page **pages;
	/*
	 * Index of the next entry of 'pages' to install. The callback bumps
	 * it after each successful set_pte_at(), so once the walk returns it
	 * equals the number of pages that were actually mapped.
	 */
	int i;
};

/*
 * Per-pte callback for apply_to_page_range(): install the next page from
 * the caller's apply_range_data cursor at @addr.
 *
 * @pte:  pte slot being visited
 * @addr: kernel virtual address that @pte maps
 * @data: struct apply_range_data *, or NULL
 *
 * When @data is NULL the callback leaves the pte untouched and reports
 * success; populate_pgtable_except_pte() uses this mode so that only the
 * walk itself happens and no page is installed.
 *
 * Returns 0 on success, -EBUSY if the slot already holds an entry, or
 * -EINVAL if the page to install has an invalid pfn.
 */
static int apply_range_set_cb(pte_t *pte, unsigned long addr, void *data)
{
	struct apply_range_data *cursor = data;
	struct page *pg;
	pte_t cur;

	/* No payload: nothing to install at this slot. */
	if (cursor == NULL)
		return 0;

	/* The slot must be empty; never overwrite a live mapping. */
	cur = ptep_get(pte);
	if (unlikely(!pte_none(cur)))
		return -EBUSY;

	pg = cursor->pages[cursor->i];

	/* Same paranoia check as vmap_pages_pte_range(). */
	if (WARN_ON_ONCE(!pfn_valid(page_to_pfn(pg))))
		return -EINVAL;

	set_pte_at(&init_mm, addr, pte, mk_pte(pg, PAGE_KERNEL));

	/* Advance only after the pte is set so 'i' counts mapped pages. */
	cursor->i += 1;
	return 0;
}

/*
 * Convenience wrapper around flush_cache_vmap() that takes a start address
 * and a byte length instead of a [start, end) pair.
 *
 * @start: first kernel virtual address of the range
 * @size:  length of the range in bytes
 */
static void flush_vmap_cache(unsigned long start, unsigned long size)
{
	flush_cache_vmap(start, start + size);
}

/*
 * Per-pte callback for apply_to_existing_page_range(): tear down the
 * mapping at @addr and release the page that backed it.
 *
 * @pte:  pte slot being visited
 * @addr: kernel virtual address that @pte maps
 * @data: unused
 *
 * Returns 0 when the slot was empty/non-present or was cleared
 * successfully, -EINVAL if a present pte yields no page.
 */
static int apply_range_clear_cb(pte_t *pte, unsigned long addr, void *data)
{
	pte_t entry = ptep_get(pte);
	struct page *victim;

	/* An empty or non-present slot maps nothing, so there is nothing to undo. */
	if (pte_none(entry))
		return 0;
	if (!pte_present(entry))
		return 0;

	/* Look up the backing page before the pte is wiped. */
	victim = pte_page(entry);
	if (WARN_ON_ONCE(!victim))
		return -EINVAL;

	pte_clear(&init_mm, addr, pte);

	/* Drop any stale translation before the page is handed back. */
	flush_tlb_kernel_range(addr, addr + PAGE_SIZE);

	__free_page(victim);
	return 0;
}

/*
 * Walk the arena's kernel range (KERN_VM_SZ - GUARD_SZ bytes starting at
 * bpf_arena_get_kern_vm_start()) with apply_range_set_cb() and a NULL
 * payload. With no payload the callback installs nothing, so the ptes
 * themselves are left empty.
 *
 * Returns whatever apply_to_page_range() returns.
 */
static int populate_pgtable_except_pte(struct bpf_arena *arena)
{
	unsigned long start = bpf_arena_get_kern_vm_start(arena);
	unsigned long size = KERN_VM_SZ - GUARD_SZ;

	return apply_to_page_range(&init_mm, start, size,
				   apply_range_set_cb, NULL);
}

static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
{
	struct vm_struct *kern_vm;
@@ -144,6 +208,12 @@ static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
		goto err;
	}
	mutex_init(&arena->lock);
	err = populate_pgtable_except_pte(arena);
	if (err) {
		range_tree_destroy(&arena->rt);
		bpf_map_area_free(arena);
		goto err;
	}

	return &arena->map;
err:
@@ -286,6 +356,7 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
	if (ret)
		return VM_FAULT_SIGSEGV;

	struct apply_range_data data = { .pages = &page, .i = 0 };
	/* Account into memcg of the process that created bpf_arena */
	ret = bpf_map_alloc_pages(map, NUMA_NO_NODE, 1, &page);
	if (ret) {
@@ -293,12 +364,13 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
		return VM_FAULT_SIGSEGV;
	}

	ret = vm_area_map_pages(arena->kern_vm, kaddr, kaddr + PAGE_SIZE, &page);
	ret = apply_to_page_range(&init_mm, kaddr, PAGE_SIZE, apply_range_set_cb, &data);
	if (ret) {
		range_tree_set(&arena->rt, vmf->pgoff, 1);
		__free_page(page);
		return VM_FAULT_SIGSEGV;
	}
	flush_vmap_cache(kaddr, PAGE_SIZE);
out:
	page_ref_add(page, 1);
	vmf->page = page;
@@ -428,7 +500,8 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt
	/* user_vm_end/start are fixed before bpf prog runs */
	long page_cnt_max = (arena->user_vm_end - arena->user_vm_start) >> PAGE_SHIFT;
	u64 kern_vm_start = bpf_arena_get_kern_vm_start(arena);
	struct page **pages;
	struct page **pages = NULL;
	long mapped = 0;
	long pgoff = 0;
	u32 uaddr32;
	int ret, i;
@@ -450,7 +523,7 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt
	if (!pages)
		return 0;

	guard(mutex)(&arena->lock);
	mutex_lock(&arena->lock);

	if (uaddr) {
		ret = is_range_tree_set(&arena->rt, pgoff, page_cnt);
@@ -465,6 +538,7 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt
	if (ret)
		goto out_free_pages;

	struct apply_range_data data = { .pages = pages, .i = 0 };
	ret = bpf_map_alloc_pages(&arena->map, node_id, page_cnt, pages);
	if (ret)
		goto out;
@@ -477,18 +551,24 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt
	 * kern_vm_start + uaddr32 + page_cnt * PAGE_SIZE - 1 can overflow
	 * lower 32-bit and it's ok.
	 */
	ret = vm_area_map_pages(arena->kern_vm, kern_vm_start + uaddr32,
				kern_vm_start + uaddr32 + page_cnt * PAGE_SIZE, pages);
	if (ret) {
		for (i = 0; i < page_cnt; i++)
	apply_to_page_range(&init_mm, kern_vm_start + uaddr32,
			    page_cnt << PAGE_SHIFT, apply_range_set_cb, &data);
	mapped = data.i;
	flush_vmap_cache(kern_vm_start + uaddr32, mapped << PAGE_SHIFT);
	if (mapped < page_cnt) {
		for (i = mapped; i < page_cnt; i++)
			__free_page(pages[i]);
		goto out;
	}
	mutex_unlock(&arena->lock);
	kvfree(pages);
	return clear_lo32(arena->user_vm_start) + uaddr32;
out:
	range_tree_set(&arena->rt, pgoff, page_cnt);
	range_tree_set(&arena->rt, pgoff + mapped, page_cnt - mapped);
out_free_pages:
	mutex_unlock(&arena->lock);
	if (mapped)
		arena_free_pages(arena, uaddr32, mapped);
	kvfree(pages);
	return 0;
}
@@ -545,8 +625,8 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt)
			 * page_cnt is big it's faster to do the batched zap.
			 */
			zap_pages(arena, full_uaddr, 1);
		vm_area_unmap_pages(arena->kern_vm, kaddr, kaddr + PAGE_SIZE);
		__free_page(page);
		apply_to_existing_page_range(&init_mm, kaddr, PAGE_SIZE, apply_range_clear_cb,
					     NULL);
	}
}