Commit 08701e30 authored by Andrii Nakryiko
Browse files

Merge branch 'bpf-introduce-bpf-arena'

Alexei Starovoitov says:

====================
bpf: Introduce BPF arena.

From: Alexei Starovoitov <ast@kernel.org>

v2->v3:
- contains bpf bits only, but cc-ing past audience for continuity
- since prerequisite patches landed, this series focuses on the main
  functionality of bpf_arena.
- adopted Andrii's approach to support arena in libbpf.
- simplified LLVM support. Instead of two instructions it's now only one.
- switched to cond_break (instead of open coded iters) in selftests
- implemented several follow-ups that will be sent after this set
  . remember first IP and bpf insn that faulted in arena.
    report to user space via bpftool
  . copy paste and tweak glob_match() aka mini-regex as a selftests/bpf test
- see patch 1 for detailed description of bpf_arena

v1->v2:
- Improved commit log with reasons for using vmap_pages_range() in arena.
  Thanks to Johannes
- Added support for __arena global variables in bpf programs
- Fixed race conditions spotted by Barret
- Fixed wrap32 issue spotted by Barret
- Fixed bpf_map_mmap_sz() the way Andrii suggested

The work on bpf_arena was inspired by Barret's work:
https://github.com/google/ghost-userspace/blob/main/lib/queue.bpf.h
that implements queues, lists and AVL trees completely as bpf programs
using giant bpf array map and integer indices instead of pointers.
bpf_arena is a sparse array that allows using normal C pointers to
build such data structures. The last few patches implement a page_frag
allocator, a linked list and a hash table as bpf programs.

v1:
bpf programs have multiple options to communicate with user space:
- Various ring buffers (perf, ftrace, bpf): The data is streamed
  unidirectionally from bpf to user space.
- Hash map: The bpf program populates elements, and user space consumes
  them via bpf syscall.
- mmap()-ed array map: Libbpf creates an array map that is directly
  accessed by the bpf program and mmap-ed to user space. It's the fastest
  way. Its disadvantage is that memory for the whole array is reserved at
  the start.
====================

Link: https://lore.kernel.org/r/20240308010812.89848-1-alexei.starovoitov@gmail.com


Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
parents 365c2b32 8df839ae
Loading
Loading
Loading
Loading
+230 −1
Original line number Diff line number Diff line
@@ -113,6 +113,7 @@ static int bpf_size_to_x86_bytes(int bpf_size)
/*
 * Pick registers outside of BPF range for JIT internal work.
 *
 * These are indices into reg2hex[], which maps AUX_REG to R11, X86_REG_R9
 * to R9 and X86_REG_R12 to R12.  When the program has an arena with a
 * non-zero kernel vm_start, do_jit() loads that address into R12 in the
 * prologue, and the *_r12() emitters use R12 as the index register.
 */
#define AUX_REG (MAX_BPF_JIT_REG + 1)
#define X86_REG_R9 (MAX_BPF_JIT_REG + 2)
#define X86_REG_R12 (MAX_BPF_JIT_REG + 3)

/*
 * The following table maps BPF registers to x86-64 registers.
@@ -139,6 +140,7 @@ static const int reg2hex[] = {
	[BPF_REG_AX] = 2, /* R10 temp register */
	[AUX_REG] = 3,    /* R11 temp register */
	[X86_REG_R9] = 1, /* R9 register, 6th function argument */
	[X86_REG_R12] = 4, /* R12 callee saved */
};

static const int reg2pt_regs[] = {
@@ -167,6 +169,7 @@ static bool is_ereg(u32 reg)
			     BIT(BPF_REG_8) |
			     BIT(BPF_REG_9) |
			     BIT(X86_REG_R9) |
			     BIT(X86_REG_R12) |
			     BIT(BPF_REG_AX));
}

@@ -205,6 +208,17 @@ static u8 add_2mod(u8 byte, u32 r1, u32 r2)
	return byte;
}

/*
 * Fold the register-extension bits for a [base + index] memory operand
 * into the prefix 'byte' (callers pass 0x40 or 0x48, i.e. a REX prefix):
 *
 *   bit 0 (REX.B) - set when 'r1', the base register, is extended
 *   bit 1 (REX.X) - set when 'index', the SIB index register, is extended
 *   bit 2 (REX.R) - set when 'r2', the value register, is extended
 *
 * Returns the updated prefix byte.
 */
static u8 add_3mod(u8 byte, u32 r1, u32 r2, u32 index)
{
	byte |= is_ereg(r1) ? 1 : 0;
	byte |= is_ereg(index) ? 2 : 0;
	byte |= is_ereg(r2) ? 4 : 0;

	return byte;
}

/* Encode 'dst_reg' register into x86-64 opcode 'byte' */
static u8 add_1reg(u8 byte, u32 dst_reg)
{
@@ -645,6 +659,8 @@ static void emit_bpf_tail_call_indirect(struct bpf_prog *bpf_prog,
		pop_r12(&prog);
	} else {
		pop_callee_regs(&prog, callee_regs_used);
		if (bpf_arena_get_kern_vm_start(bpf_prog->aux->arena))
			pop_r12(&prog);
	}

	EMIT1(0x58);                              /* pop rax */
@@ -704,6 +720,8 @@ static void emit_bpf_tail_call_direct(struct bpf_prog *bpf_prog,
		pop_r12(&prog);
	} else {
		pop_callee_regs(&prog, callee_regs_used);
		if (bpf_arena_get_kern_vm_start(bpf_prog->aux->arena))
			pop_r12(&prog);
	}

	EMIT1(0x58);                                  /* pop rax */
@@ -887,6 +905,18 @@ static void emit_insn_suffix(u8 **pprog, u32 ptr_reg, u32 val_reg, int off)
	*pprog = prog;
}

/*
 * Emit the operand bytes that follow the opcode of an instruction which
 * addresses memory as [ptr_reg + index_reg + off]: a ModRM byte, a SIB
 * byte and the displacement.
 *
 * @pprog:     in/out cursor into the JIT output buffer
 * @ptr_reg:   base register of the memory operand
 * @val_reg:   register encoded into the ModRM byte
 * @index_reg: index register of the memory operand
 * @off:       displacement; selects the short or the long encoding
 *
 * The ModRM bases 0x44 and 0x84 both have rm == 100b, which announces a
 * SIB byte; 0x44 selects mod == 01b (8-bit displacement) and 0x84 selects
 * mod == 10b (32-bit displacement).  The SIB byte starts from 0, so no
 * scale bits are set by this function.
 *
 * NOTE(review): add_2reg(), is_imm8(), EMIT3() and EMIT2_off32() are
 * defined outside this hunk; the EMIT*() macros advance the local 'prog'.
 */
static void emit_insn_suffix_SIB(u8 **pprog, u32 ptr_reg, u32 val_reg, u32 index_reg, int off)
{
	u8 *prog = *pprog;

	if (is_imm8(off)) {
		/* short form: ModRM, SIB, then 'off' as the last emitted byte */
		EMIT3(add_2reg(0x44, BPF_REG_0, val_reg), add_2reg(0, ptr_reg, index_reg) /* SIB */, off);
	} else {
		/* long form: ModRM, SIB, then 'off' as a 32-bit displacement */
		EMIT2_off32(add_2reg(0x84, BPF_REG_0, val_reg), add_2reg(0, ptr_reg, index_reg) /* SIB */, off);
	}
	/* hand the advanced cursor back to the caller */
	*pprog = prog;
}

/*
 * Emit a REX byte if it will be necessary to address these registers
 */
@@ -968,6 +998,37 @@ static void emit_ldsx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off)
	*pprog = prog;
}

/*
 * LDX: dst_reg = *(size *)(src_reg + index_reg + off)
 *
 * Emits the REX prefix and opcode matching the BPF access size, then lets
 * emit_insn_suffix_SIB() append the ModRM/SIB/displacement bytes.  The
 * rax/r12 names in the per-case comments are only examples; the actual
 * registers come from dst_reg, src_reg and index_reg.
 *
 * @pprog:     in/out cursor into the JIT output buffer
 * @size:      BPF_B, BPF_H, BPF_W or BPF_DW
 * @dst_reg:   register receiving the loaded value
 * @src_reg:   base register of the address
 * @index_reg: index register of the address
 * @off:       displacement
 *
 * NOTE(review): there is no default case, so any other 'size' emits only
 * the suffix bytes without an opcode.
 */
static void emit_ldx_index(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, u32 index_reg, int off)
{
	u8 *prog = *pprog;

	switch (size) {
	case BPF_B:
		/* movzx rax, byte ptr [rax + r12 + off] */
		EMIT3(add_3mod(0x40, src_reg, dst_reg, index_reg), 0x0F, 0xB6);
		break;
	case BPF_H:
		/* movzx rax, word ptr [rax + r12 + off] */
		EMIT3(add_3mod(0x40, src_reg, dst_reg, index_reg), 0x0F, 0xB7);
		break;
	case BPF_W:
		/* mov eax, dword ptr [rax + r12 + off] */
		EMIT2(add_3mod(0x40, src_reg, dst_reg, index_reg), 0x8B);
		break;
	case BPF_DW:
		/* mov rax, qword ptr [rax + r12 + off] */
		/* prefix base 0x48 (instead of 0x40) sets REX.W: 64-bit operand */
		EMIT2(add_3mod(0x48, src_reg, dst_reg, index_reg), 0x8B);
		break;
	}
	/* src_reg is the base pointer, dst_reg goes into the ModRM byte */
	emit_insn_suffix_SIB(&prog, src_reg, dst_reg, index_reg, off);
	*pprog = prog;
}

/*
 * LDX: dst_reg = *(size *)(src_reg + r12 + off)
 *
 * Thin wrapper around emit_ldx_index() with R12 as the index register.
 */
static void emit_ldx_r12(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off)
{
	emit_ldx_index(pprog, size, dst_reg, src_reg, X86_REG_R12, off);
}

/* STX: *(u8*)(dst_reg + off) = src_reg */
static void emit_stx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off)
{
@@ -1002,6 +1063,71 @@ static void emit_stx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off)
	*pprog = prog;
}

/*
 * STX: *(size *)(dst_reg + index_reg + off) = src_reg
 *
 * Emits the prefix(es) and opcode matching the BPF access size, then lets
 * emit_insn_suffix_SIB() append the ModRM/SIB/displacement bytes.  The
 * rax/r12 names in the per-case comments are only examples; the actual
 * registers come from dst_reg, src_reg and index_reg.
 *
 * @pprog:     in/out cursor into the JIT output buffer
 * @size:      BPF_B, BPF_H, BPF_W or BPF_DW
 * @dst_reg:   base register of the address
 * @src_reg:   register holding the value to store
 * @index_reg: index register of the address
 * @off:       displacement
 *
 * A REX prefix (at least 0x40) is emitted for every size.
 *
 * NOTE(review): there is no default case, so any other 'size' emits only
 * the suffix bytes without an opcode.
 */
static void emit_stx_index(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, u32 index_reg, int off)
{
	u8 *prog = *pprog;

	switch (size) {
	case BPF_B:
		/* mov byte ptr [rax + r12 + off], al */
		EMIT2(add_3mod(0x40, dst_reg, src_reg, index_reg), 0x88);
		break;
	case BPF_H:
		/* mov word ptr [rax + r12 + off], ax */
		/* 0x66 is the operand-size prefix selecting the 16-bit form */
		EMIT3(0x66, add_3mod(0x40, dst_reg, src_reg, index_reg), 0x89);
		break;
	case BPF_W:
		/* mov dword ptr [rax + r12 + off], eax */
		EMIT2(add_3mod(0x40, dst_reg, src_reg, index_reg), 0x89);
		break;
	case BPF_DW:
		/* mov qword ptr [rax + r12 + off], rax */
		/* prefix base 0x48 (instead of 0x40) sets REX.W: 64-bit operand */
		EMIT2(add_3mod(0x48, dst_reg, src_reg, index_reg), 0x89);
		break;
	}
	/* dst_reg is the base pointer, src_reg goes into the ModRM byte */
	emit_insn_suffix_SIB(&prog, dst_reg, src_reg, index_reg, off);
	*pprog = prog;
}

/*
 * STX: *(size *)(dst_reg + r12 + off) = src_reg
 *
 * Thin wrapper around emit_stx_index() with R12 as the index register.
 */
static void emit_stx_r12(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off)
{
	emit_stx_index(pprog, size, dst_reg, src_reg, X86_REG_R12, off);
}

/*
 * ST: *(size *)(dst_reg + index_reg + off) = imm
 *
 * Emits the prefix(es) and opcode matching the BPF access size, the
 * ModRM/SIB/displacement bytes via emit_insn_suffix_SIB(), and finally the
 * immediate.  The rax/r12 names in the per-case comments are only examples.
 *
 * @pprog:     in/out cursor into the JIT output buffer
 * @size:      BPF_B, BPF_H, BPF_W or BPF_DW
 * @dst_reg:   base register of the address
 * @index_reg: index register of the address
 * @off:       displacement
 * @imm:       immediate value to store
 *
 * No value register is involved, so 0 is passed in the register slot of
 * both add_3mod() and emit_insn_suffix_SIB().
 * NOTE(review): this relies on register 0 contributing no bits to the REX
 * and ModRM bytes; the start of reg2hex[] is not visible here - confirm.
 *
 * NOTE(review): there is no default case, so any other 'size' emits the
 * suffix and immediate without an opcode.
 */
static void emit_st_index(u8 **pprog, u32 size, u32 dst_reg, u32 index_reg, int off, int imm)
{
	u8 *prog = *pprog;

	switch (size) {
	case BPF_B:
		/* mov byte ptr [rax + r12 + off], imm8 */
		EMIT2(add_3mod(0x40, dst_reg, 0, index_reg), 0xC6);
		break;
	case BPF_H:
		/* mov word ptr [rax + r12 + off], imm16 */
		/* 0x66 is the operand-size prefix selecting the 16-bit form */
		EMIT3(0x66, add_3mod(0x40, dst_reg, 0, index_reg), 0xC7);
		break;
	case BPF_W:
		/* mov dword ptr [rax + r12 + off], imm32 */
		EMIT2(add_3mod(0x40, dst_reg, 0, index_reg), 0xC7);
		break;
	case BPF_DW:
		/* mov qword ptr [rax + r12 + off], imm32 */
		/* prefix base 0x48 (instead of 0x40) sets REX.W: 64-bit operand */
		EMIT2(add_3mod(0x48, dst_reg, 0, index_reg), 0xC7);
		break;
	}
	emit_insn_suffix_SIB(&prog, dst_reg, 0, index_reg, off);
	/* immediate width is whatever bpf_size_to_x86_bytes() reports for 'size' */
	EMIT(imm, bpf_size_to_x86_bytes(size));
	*pprog = prog;
}

/*
 * ST: *(size *)(dst_reg + r12 + off) = imm
 *
 * Thin wrapper around emit_st_index() with R12 as the index register.
 */
static void emit_st_r12(u8 **pprog, u32 size, u32 dst_reg, int off, int imm)
{
	emit_st_index(pprog, size, dst_reg, X86_REG_R12, off, imm);
}

static int emit_atomic(u8 **pprog, u8 atomic_op,
		       u32 dst_reg, u32 src_reg, s16 off, u8 bpf_size)
{
@@ -1043,11 +1169,14 @@ static int emit_atomic(u8 **pprog, u8 atomic_op,
	return 0;
}

#define DONT_CLEAR 1

bool ex_handler_bpf(const struct exception_table_entry *x, struct pt_regs *regs)
{
	u32 reg = x->fixup >> 8;

	/* jump over faulting load and clear dest register */
	if (reg != DONT_CLEAR)
		*(unsigned long *)((void *)regs + reg) = 0;
	regs->ip += x->fixup & 0xff;
	return true;
@@ -1147,11 +1276,15 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
	bool tail_call_seen = false;
	bool seen_exit = false;
	u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY];
	u64 arena_vm_start, user_vm_start;
	int i, excnt = 0;
	int ilen, proglen = 0;
	u8 *prog = temp;
	int err;

	arena_vm_start = bpf_arena_get_kern_vm_start(bpf_prog->aux->arena);
	user_vm_start = bpf_arena_get_user_vm_start(bpf_prog->aux->arena);

	detect_reg_usage(insn, insn_cnt, callee_regs_used,
			 &tail_call_seen);

@@ -1172,8 +1305,13 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
		push_r12(&prog);
		push_callee_regs(&prog, all_callee_regs_used);
	} else {
		if (arena_vm_start)
			push_r12(&prog);
		push_callee_regs(&prog, callee_regs_used);
	}
	if (arena_vm_start)
		emit_mov_imm64(&prog, X86_REG_R12,
			       arena_vm_start >> 32, (u32) arena_vm_start);

	ilen = prog - temp;
	if (rw_image)
@@ -1213,6 +1351,40 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
			break;

		case BPF_ALU64 | BPF_MOV | BPF_X:
			if (insn->off == BPF_ADDR_SPACE_CAST &&
			    insn->imm == 1U << 16) {
				if (dst_reg != src_reg)
					/* 32-bit mov */
					emit_mov_reg(&prog, false, dst_reg, src_reg);
				/* shl dst_reg, 32 */
				maybe_emit_1mod(&prog, dst_reg, true);
				EMIT3(0xC1, add_1reg(0xE0, dst_reg), 32);

				/* or dst_reg, user_vm_start */
				maybe_emit_1mod(&prog, dst_reg, true);
				if (is_axreg(dst_reg))
					EMIT1_off32(0x0D,  user_vm_start >> 32);
				else
					EMIT2_off32(0x81, add_1reg(0xC8, dst_reg),  user_vm_start >> 32);

				/* rol dst_reg, 32 */
				maybe_emit_1mod(&prog, dst_reg, true);
				EMIT3(0xC1, add_1reg(0xC0, dst_reg), 32);

				/* xor r11, r11 */
				EMIT3(0x4D, 0x31, 0xDB);

				/* test dst_reg32, dst_reg32; check if lower 32-bit are zero */
				maybe_emit_mod(&prog, dst_reg, dst_reg, false);
				EMIT2(0x85, add_2reg(0xC0, dst_reg, dst_reg));

				/* cmove r11, dst_reg; if so, set dst_reg to zero */
				/* WARNING: Intel swapped src/dst register encoding in CMOVcc !!! */
				maybe_emit_mod(&prog, AUX_REG, dst_reg, true);
				EMIT3(0x0F, 0x44, add_2reg(0xC0, AUX_REG, dst_reg));
				break;
			}
			fallthrough;
		case BPF_ALU | BPF_MOV | BPF_X:
			if (insn->off == 0)
				emit_mov_reg(&prog,
@@ -1564,6 +1736,56 @@ st: if (is_imm8(insn->off))
			emit_stx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off);
			break;

		case BPF_ST | BPF_PROBE_MEM32 | BPF_B:
		case BPF_ST | BPF_PROBE_MEM32 | BPF_H:
		case BPF_ST | BPF_PROBE_MEM32 | BPF_W:
		case BPF_ST | BPF_PROBE_MEM32 | BPF_DW:
			start_of_ldx = prog;
			emit_st_r12(&prog, BPF_SIZE(insn->code), dst_reg, insn->off, insn->imm);
			goto populate_extable;

			/* LDX: dst_reg = *(u8*)(src_reg + r12 + off) */
		case BPF_LDX | BPF_PROBE_MEM32 | BPF_B:
		case BPF_LDX | BPF_PROBE_MEM32 | BPF_H:
		case BPF_LDX | BPF_PROBE_MEM32 | BPF_W:
		case BPF_LDX | BPF_PROBE_MEM32 | BPF_DW:
		case BPF_STX | BPF_PROBE_MEM32 | BPF_B:
		case BPF_STX | BPF_PROBE_MEM32 | BPF_H:
		case BPF_STX | BPF_PROBE_MEM32 | BPF_W:
		case BPF_STX | BPF_PROBE_MEM32 | BPF_DW:
			start_of_ldx = prog;
			if (BPF_CLASS(insn->code) == BPF_LDX)
				emit_ldx_r12(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off);
			else
				emit_stx_r12(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off);
populate_extable:
			{
				struct exception_table_entry *ex;
				u8 *_insn = image + proglen + (start_of_ldx - temp);
				s64 delta;

				if (!bpf_prog->aux->extable)
					break;

				if (excnt >= bpf_prog->aux->num_exentries) {
					pr_err("mem32 extable bug\n");
					return -EFAULT;
				}
				ex = &bpf_prog->aux->extable[excnt++];

				delta = _insn - (u8 *)&ex->insn;
				/* switch ex to rw buffer for writes */
				ex = (void *)rw_image + ((void *)ex - (void *)image);

				ex->insn = delta;

				ex->data = EX_TYPE_BPF;

				ex->fixup = (prog - start_of_ldx) |
					((BPF_CLASS(insn->code) == BPF_LDX ? reg2pt_regs[dst_reg] : DONT_CLEAR) << 8);
			}
			break;

			/* LDX: dst_reg = *(u8*)(src_reg + off) */
		case BPF_LDX | BPF_MEM | BPF_B:
		case BPF_LDX | BPF_PROBE_MEM | BPF_B:
@@ -2036,6 +2258,8 @@ st: if (is_imm8(insn->off))
				pop_r12(&prog);
			} else {
				pop_callee_regs(&prog, callee_regs_used);
				if (arena_vm_start)
					pop_r12(&prog);
			}
			EMIT1(0xC9);         /* leave */
			emit_return(&prog, image + addrs[i - 1] + (prog - temp));
@@ -3243,6 +3467,11 @@ void bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke,
	}
}

/* Reports arena support for this JIT; unconditionally true. */
bool bpf_jit_supports_arena(void)
{
	return true;
}

bool bpf_jit_supports_ptr_xchg(void)
{
	return true;
+8 −2
Original line number Diff line number Diff line
@@ -37,6 +37,7 @@ struct perf_event;
struct bpf_prog;
struct bpf_prog_aux;
struct bpf_map;
struct bpf_arena;
struct sock;
struct seq_file;
struct btf;
@@ -528,8 +529,8 @@ void bpf_list_head_free(const struct btf_field *field, void *list_head,
			struct bpf_spin_lock *spin_lock);
void bpf_rb_root_free(const struct btf_field *field, void *rb_root,
		      struct bpf_spin_lock *spin_lock);


u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena);
u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena);
int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size);

struct bpf_offload_dev;
@@ -711,6 +712,7 @@ enum bpf_arg_type {
	 * on eBPF program stack
	 */
	ARG_PTR_TO_MEM,		/* pointer to valid memory (stack, packet, map value) */
	ARG_PTR_TO_ARENA,

	ARG_CONST_SIZE,		/* number of bytes accessed from memory */
	ARG_CONST_SIZE_OR_ZERO,	/* number of bytes accessed from memory or 0 */
@@ -882,6 +884,7 @@ enum bpf_reg_type {
	 * an explicit null check is required for this struct.
	 */
	PTR_TO_MEM,		 /* reg points to valid memory region */
	PTR_TO_ARENA,
	PTR_TO_BUF,		 /* reg points to a read/write buffer */
	PTR_TO_FUNC,		 /* reg points to a bpf program function */
	CONST_PTR_TO_DYNPTR,	 /* reg points to a const struct bpf_dynptr */
@@ -1457,6 +1460,7 @@ struct bpf_prog_aux {
	bool xdp_has_frags;
	bool exception_cb;
	bool exception_boundary;
	struct bpf_arena *arena;
	/* BTF_KIND_FUNC_PROTO for valid attach_btf_id */
	const struct btf_type *attach_func_proto;
	/* function name for valid attach_btf_id */
@@ -2215,6 +2219,8 @@ int generic_map_delete_batch(struct bpf_map *map,
struct bpf_map *bpf_map_get_curr_or_next(u32 *id);
struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id);

int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid,
			unsigned long nr_pages, struct page **page_array);
#ifdef CONFIG_MEMCG_KMEM
void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags,
			   int node);
+1 −0
Original line number Diff line number Diff line
@@ -132,6 +132,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_STRUCT_OPS, bpf_struct_ops_map_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_RINGBUF, ringbuf_map_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_BLOOM_FILTER, bloom_filter_map_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_USER_RINGBUF, user_ringbuf_map_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_ARENA, arena_map_ops)

BPF_LINK_TYPE(BPF_LINK_TYPE_RAW_TRACEPOINT, raw_tracepoint)
BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING, tracing)
+1 −0
Original line number Diff line number Diff line
@@ -548,6 +548,7 @@ struct bpf_insn_aux_data {
	u32 seen; /* this insn was processed by the verifier at env->pass_cnt */
	bool sanitize_stack_spill; /* subject to Spectre v4 sanitation */
	bool zext_dst; /* this insn zero extends dst reg */
	bool needs_zext; /* alu op needs to clear upper bits */
	bool storage_get_func_atomic; /* bpf_*_storage_get() with atomic memory alloc */
	bool is_iter_next; /* bpf_iter_<type>_next() kfunc call */
	bool call_with_percpu_alloc_ptr; /* {this,per}_cpu_ptr() with prog percpu alloc */
+4 −0
Original line number Diff line number Diff line
@@ -72,6 +72,9 @@ struct ctl_table_header;
/* unused opcode to mark special ldsx instruction. Same as BPF_IND */
#define BPF_PROBE_MEMSX	0x40

/*
 * unused opcode to mark special load and store instructions (the x86 JIT
 * handles it for the BPF_LDX, BPF_STX and BPF_ST classes). Same as BPF_MSH
 */
#define BPF_PROBE_MEM32	0xa0

/* unused opcode to mark call to interpreter with arguments */
#define BPF_CALL_ARGS	0xe0

@@ -959,6 +962,7 @@ bool bpf_jit_supports_kfunc_call(void);
bool bpf_jit_supports_far_kfunc_call(void);
bool bpf_jit_supports_exceptions(void);
bool bpf_jit_supports_ptr_xchg(void);
bool bpf_jit_supports_arena(void);
void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie);
bool bpf_helper_changes_pkt_data(void *func);

Loading