Merge tag 'bpf-next-6.18' of git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next

Pull bpf updates from Alexei Starovoitov:

 - Support pulling non-linear xdp data with bpf_xdp_pull_data() kfunc
   (Amery Hung)

   Applied as a stable branch in bpf-next and net-next trees.

 - Support reading skb metadata via bpf_dynptr (Jakub Sitnicki)

   Also a stable branch in bpf-next and net-next trees.

 - Enforce expected_attach_type for tailcall compatibility (Daniel
   Borkmann)

 - Replace path-sensitive with path-insensitive live stack analysis in
   the verifier (Eduard Zingerman)

   This is a significant change in the verification logic. More details,
   motivation, long term plans are in the cover letter/merge commit.

 - Support signed BPF programs (KP Singh)

   This is another major feature that took years to materialize.

   Algorithm details are in the cover letter/merge commit.

 - Add support for may_goto instruction to s390 JIT (Ilya Leoshkevich)

 - Add support for may_goto instruction to arm64 JIT (Puranjay Mohan)

 - Fix USDT SIB argument handling in libbpf (Jiawei Zhao)

 - Allow uprobe-bpf program to change context registers (Jiri Olsa)

 - Support signed loads from BPF arena (Kumar Kartikeya Dwivedi and
   Puranjay Mohan)

 - Allow access to union arguments in tracing programs (Leon Hwang)

 - Optimize rcu_read_lock() + migrate_disable() combination where it's
   used in BPF subsystem (Menglong Dong)

 - Introduce bpf_task_work_schedule*() kfuncs to schedule deferred
   execution of BPF callback in the context of a specific task using the
   kernel’s task_work infrastructure (Mykyta Yatsenko)

 - Enforce RCU protection for KF_RCU_PROTECTED kfuncs (Kumar Kartikeya
   Dwivedi)

 - Add stress test for rqspinlock in NMI (Kumar Kartikeya Dwivedi)

 - Improve the precision of tnum multiplier verifier operation
   (Nandakumar Edamana)

 - Use tnums to improve is_branch_taken() logic (Paul Chaignon)

 - Add support for atomic operations in arena in riscv JIT (Pu Lehui)

 - Report arena faults to BPF error stream (Puranjay Mohan)

 - Search for tracefs at /sys/kernel/tracing first in bpftool (Quentin
   Monnet)

 - Add bpf_strcasecmp() kfunc (Rong Tao)

 - Support lookup_and_delete_elem command in BPF_MAP_STACK_TRACE (Tao
   Chen)

* tag 'bpf-next-6.18' of git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next: (197 commits)
  libbpf: Replace AF_ALG with open coded SHA-256
  selftests/bpf: Add stress test for rqspinlock in NMI
  selftests/bpf: Add test case for different expected_attach_type
  bpf: Enforce expected_attach_type for tailcall compatibility
  bpftool: Remove duplicate string.h header
  bpf: Remove duplicate crypto/sha2.h header
  libbpf: Fix error when st-prefix_ops and ops from differ btf
  selftests/bpf: Test changing packet data from kfunc
  selftests/bpf: Add stacktrace map lookup_and_delete_elem test case
  selftests/bpf: Refactor stacktrace_map case with skeleton
  bpf: Add lookup_and_delete_elem for BPF_MAP_STACK_TRACE
  selftests/bpf: Fix flaky bpf_cookie selftest
  selftests/bpf: Test changing packet data from global functions with a kfunc
  bpf: Emit struct bpf_xdp_sock type in vmlinux BTF
  selftests/bpf: Task_work selftest cleanup fixes
  MAINTAINERS: Delete inactive maintainers from AF_XDP
  bpf: Mark kfuncs as __noclone
  selftests/bpf: Add kprobe multi write ctx attach test
  selftests/bpf: Add kprobe write ctx attach test
  selftests/bpf: Add uprobe context ip register change test
  ...
This commit is contained in:
Linus Torvalds
2025-09-30 17:58:11 -07:00
254 changed files with 11853 additions and 2817 deletions

View File

@@ -3,7 +3,7 @@
# BPF interpreter that, for example, classic socket filters depend on.
config BPF
bool
select CRYPTO_LIB_SHA1
select CRYPTO_LIB_SHA256
# Used by archs to tell that they support BPF JIT compiler plus which
# flavour. Only one of the two can be selected for a specific arch since

View File

@@ -6,7 +6,7 @@ cflags-nogcse-$(CONFIG_X86)$(CONFIG_CC_IS_GCC) := -fno-gcse
endif
CFLAGS_core.o += -Wno-override-init $(cflags-nogcse-yy)
obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o token.o
obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o token.o liveness.o
obj-$(CONFIG_BPF_SYSCALL) += bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o
obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o
obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o

View File

@@ -633,3 +633,33 @@ static int __init kfunc_init(void)
return register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &common_kfunc_set);
}
late_initcall(kfunc_init);
/*
 * Report an arena access fault to the owning program's BPF_STDERR stream.
 * @write:    true for a faulting WRITE access, false for a READ
 * @addr:     faulting kernel-side arena offset (low 32 bits of the address)
 * @fault_ip: instruction pointer of the faulting (JITed) BPF instruction,
 *            used to look up the program via its ksym range
 *
 * No-op if @fault_ip does not resolve to a BPF program.
 */
void bpf_prog_report_arena_violation(bool write, unsigned long addr, unsigned long fault_ip)
{
struct bpf_stream_stage ss;
struct bpf_prog *prog;
u64 user_vm_start;
/*
 * The RCU read lock is held to safely traverse the latch tree, but we
 * don't need its protection when accessing the prog, since it will not
 * disappear while we are handling the fault.
 */
rcu_read_lock();
prog = bpf_prog_ksym_find(fault_ip);
rcu_read_unlock();
if (!prog)
return;
/* Use main prog for stream access */
prog = prog->aux->main_prog_aux->prog;
/* Reconstruct the user-visible address: high 32 bits come from the
 * arena's user VM start, low 32 bits from the faulting offset.
 */
user_vm_start = bpf_arena_get_user_vm_start(prog->aux->arena);
addr += clear_lo32(user_vm_start);
bpf_stream_stage(ss, prog, BPF_STDERR, ({
bpf_stream_printk(ss, "ERROR: Arena %s access at unmapped address 0x%lx\n",
write ? "WRITE" : "READ", addr);
bpf_stream_dump_stack(ss);
}));
}

View File

@@ -12,6 +12,7 @@
#include <uapi/linux/btf.h>
#include <linux/rcupdate_trace.h>
#include <linux/btf_ids.h>
#include <crypto/sha2.h>
#include "map_in_map.h"
@@ -174,6 +175,17 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key)
return array->value + (u64)array->elem_size * (index & array->index_mask);
}
/*
 * Compute the SHA-256 of the whole array map's value area and cache it.
 * @map:           array map to hash
 * @hash_buf_size: size of @hash_buf provided by the caller
 *                 (NOTE(review): not validated here — presumably checked by
 *                 the map_get_hash caller; confirm)
 * @hash_buf:      output buffer receiving the raw digest
 *
 * The digest covers elem_size * max_entries bytes starting at array->value
 * and is also copied into map->sha for later reuse. Always returns 0.
 */
static int array_map_get_hash(struct bpf_map *map, u32 hash_buf_size,
void *hash_buf)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
sha256(array->value, (u64)array->elem_size * array->map.max_entries,
hash_buf);
memcpy(array->map.sha, hash_buf, sizeof(array->map.sha));
return 0;
}
static int array_map_direct_value_addr(const struct bpf_map *map, u64 *imm,
u32 off)
{
@@ -431,7 +443,7 @@ static void *array_map_vmalloc_addr(struct bpf_array *array)
return (void *)round_down((unsigned long)array, PAGE_SIZE);
}
static void array_map_free_timers_wq(struct bpf_map *map)
static void array_map_free_internal_structs(struct bpf_map *map)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
int i;
@@ -439,12 +451,14 @@ static void array_map_free_timers_wq(struct bpf_map *map)
/* We don't reset or free fields other than timer and workqueue
* on uref dropping to zero.
*/
if (btf_record_has_field(map->record, BPF_TIMER | BPF_WORKQUEUE)) {
if (btf_record_has_field(map->record, BPF_TIMER | BPF_WORKQUEUE | BPF_TASK_WORK)) {
for (i = 0; i < array->map.max_entries; i++) {
if (btf_record_has_field(map->record, BPF_TIMER))
bpf_obj_free_timer(map->record, array_map_elem_ptr(array, i));
if (btf_record_has_field(map->record, BPF_WORKQUEUE))
bpf_obj_free_workqueue(map->record, array_map_elem_ptr(array, i));
if (btf_record_has_field(map->record, BPF_TASK_WORK))
bpf_obj_free_task_work(map->record, array_map_elem_ptr(array, i));
}
}
}
@@ -783,7 +797,7 @@ const struct bpf_map_ops array_map_ops = {
.map_alloc = array_map_alloc,
.map_free = array_map_free,
.map_get_next_key = array_map_get_next_key,
.map_release_uref = array_map_free_timers_wq,
.map_release_uref = array_map_free_internal_structs,
.map_lookup_elem = array_map_lookup_elem,
.map_update_elem = array_map_update_elem,
.map_delete_elem = array_map_delete_elem,
@@ -800,6 +814,7 @@ const struct bpf_map_ops array_map_ops = {
.map_mem_usage = array_map_mem_usage,
.map_btf_id = &array_map_btf_ids[0],
.iter_seq_info = &iter_seq_info,
.map_get_hash = &array_map_get_hash,
};
const struct bpf_map_ops percpu_array_map_ops = {

View File

@@ -45,8 +45,7 @@ void bpf_cgrp_storage_free(struct cgroup *cgroup)
{
struct bpf_local_storage *local_storage;
migrate_disable();
rcu_read_lock();
rcu_read_lock_dont_migrate();
local_storage = rcu_dereference(cgroup->bpf_cgrp_storage);
if (!local_storage)
goto out;
@@ -55,8 +54,7 @@ void bpf_cgrp_storage_free(struct cgroup *cgroup)
bpf_local_storage_destroy(local_storage);
bpf_cgrp_storage_unlock();
out:
rcu_read_unlock();
migrate_enable();
rcu_read_unlock_migrate();
}
static struct bpf_local_storage_data *

View File

@@ -62,8 +62,7 @@ void bpf_inode_storage_free(struct inode *inode)
if (!bsb)
return;
migrate_disable();
rcu_read_lock();
rcu_read_lock_dont_migrate();
local_storage = rcu_dereference(bsb->storage);
if (!local_storage)
@@ -71,8 +70,7 @@ void bpf_inode_storage_free(struct inode *inode)
bpf_local_storage_destroy(local_storage);
out:
rcu_read_unlock();
migrate_enable();
rcu_read_unlock_migrate();
}
static void *bpf_fd_inode_storage_lookup_elem(struct bpf_map *map, void *key)

View File

@@ -705,13 +705,11 @@ int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx)
migrate_enable();
rcu_read_unlock_trace();
} else {
rcu_read_lock();
migrate_disable();
rcu_read_lock_dont_migrate();
old_run_ctx = bpf_set_run_ctx(&run_ctx);
ret = bpf_prog_run(prog, ctx);
bpf_reset_run_ctx(old_run_ctx);
migrate_enable();
rcu_read_unlock();
rcu_read_unlock_migrate();
}
/* bpf program can only return 0 or 1:

View File

@@ -19,14 +19,6 @@
#define LOCAL_PENDING_LIST_IDX LOCAL_LIST_IDX(BPF_LRU_LOCAL_LIST_T_PENDING)
#define IS_LOCAL_LIST_TYPE(t) ((t) >= BPF_LOCAL_LIST_T_OFFSET)
/*
 * Return the next possible CPU after @cpu, wrapping around to the first
 * possible CPU when the end of cpu_possible_mask is reached.
 * NOTE(review): per the surrounding diff this helper is being removed in
 * favor of the generic cpumask_next_wrap().
 */
static int get_next_cpu(int cpu)
{
cpu = cpumask_next(cpu, cpu_possible_mask);
if (cpu >= nr_cpu_ids)
cpu = cpumask_first(cpu_possible_mask);
return cpu;
}
/* Local list helpers */
static struct list_head *local_free_list(struct bpf_lru_locallist *loc_l)
{
@@ -482,7 +474,7 @@ static struct bpf_lru_node *bpf_common_lru_pop_free(struct bpf_lru *lru,
raw_spin_unlock_irqrestore(&steal_loc_l->lock, flags);
steal = get_next_cpu(steal);
steal = cpumask_next_wrap(steal, cpu_possible_mask);
} while (!node && steal != first_steal);
loc_l->next_steal = steal;

View File

@@ -1174,6 +1174,18 @@ void bpf_struct_ops_put(const void *kdata)
bpf_map_put(&st_map->map);
}
/*
 * Return the BPF map ID backing a registered struct_ops object.
 * @kdata: pointer to the kernel data area of a struct_ops value
 *         (the same pointer handed out at registration time)
 *
 * Walks back from the data pointer to the enclosing bpf_struct_ops_value
 * and then to its bpf_struct_ops_map to read map.id.
 */
u32 bpf_struct_ops_id(const void *kdata)
{
struct bpf_struct_ops_value *kvalue;
struct bpf_struct_ops_map *st_map;
kvalue = container_of(kdata, struct bpf_struct_ops_value, data);
st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue);
return st_map->map.id;
}
EXPORT_SYMBOL_GPL(bpf_struct_ops_id);
static bool bpf_struct_ops_valid_to_reg(struct bpf_map *map)
{
struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;

View File

@@ -70,8 +70,7 @@ void bpf_task_storage_free(struct task_struct *task)
{
struct bpf_local_storage *local_storage;
migrate_disable();
rcu_read_lock();
rcu_read_lock_dont_migrate();
local_storage = rcu_dereference(task->bpf_storage);
if (!local_storage)
@@ -81,8 +80,7 @@ void bpf_task_storage_free(struct task_struct *task)
bpf_local_storage_destroy(local_storage);
bpf_task_storage_unlock();
out:
rcu_read_unlock();
migrate_enable();
rcu_read_unlock_migrate();
}
static void *bpf_pid_task_storage_lookup_elem(struct bpf_map *map, void *key)

View File

@@ -3478,60 +3478,45 @@ btf_find_graph_root(const struct btf *btf, const struct btf_type *pt,
return BTF_FIELD_FOUND;
}
#define field_mask_test_name(field_type, field_type_str) \
if (field_mask & field_type && !strcmp(name, field_type_str)) { \
type = field_type; \
goto end; \
}
static int btf_get_field_type(const struct btf *btf, const struct btf_type *var_type,
u32 field_mask, u32 *seen_mask,
int *align, int *sz)
u32 field_mask, u32 *seen_mask, int *align, int *sz)
{
int type = 0;
const struct {
enum btf_field_type type;
const char *const name;
const bool is_unique;
} field_types[] = {
{ BPF_SPIN_LOCK, "bpf_spin_lock", true },
{ BPF_RES_SPIN_LOCK, "bpf_res_spin_lock", true },
{ BPF_TIMER, "bpf_timer", true },
{ BPF_WORKQUEUE, "bpf_wq", true },
{ BPF_TASK_WORK, "bpf_task_work", true },
{ BPF_LIST_HEAD, "bpf_list_head", false },
{ BPF_LIST_NODE, "bpf_list_node", false },
{ BPF_RB_ROOT, "bpf_rb_root", false },
{ BPF_RB_NODE, "bpf_rb_node", false },
{ BPF_REFCOUNT, "bpf_refcount", false },
};
int type = 0, i;
const char *name = __btf_name_by_offset(btf, var_type->name_off);
const char *field_type_name;
enum btf_field_type field_type;
bool is_unique;
if (field_mask & BPF_SPIN_LOCK) {
if (!strcmp(name, "bpf_spin_lock")) {
if (*seen_mask & BPF_SPIN_LOCK)
for (i = 0; i < ARRAY_SIZE(field_types); ++i) {
field_type = field_types[i].type;
field_type_name = field_types[i].name;
is_unique = field_types[i].is_unique;
if (!(field_mask & field_type) || strcmp(name, field_type_name))
continue;
if (is_unique) {
if (*seen_mask & field_type)
return -E2BIG;
*seen_mask |= BPF_SPIN_LOCK;
type = BPF_SPIN_LOCK;
goto end;
*seen_mask |= field_type;
}
type = field_type;
goto end;
}
if (field_mask & BPF_RES_SPIN_LOCK) {
if (!strcmp(name, "bpf_res_spin_lock")) {
if (*seen_mask & BPF_RES_SPIN_LOCK)
return -E2BIG;
*seen_mask |= BPF_RES_SPIN_LOCK;
type = BPF_RES_SPIN_LOCK;
goto end;
}
}
if (field_mask & BPF_TIMER) {
if (!strcmp(name, "bpf_timer")) {
if (*seen_mask & BPF_TIMER)
return -E2BIG;
*seen_mask |= BPF_TIMER;
type = BPF_TIMER;
goto end;
}
}
if (field_mask & BPF_WORKQUEUE) {
if (!strcmp(name, "bpf_wq")) {
if (*seen_mask & BPF_WORKQUEUE)
return -E2BIG;
*seen_mask |= BPF_WORKQUEUE;
type = BPF_WORKQUEUE;
goto end;
}
}
field_mask_test_name(BPF_LIST_HEAD, "bpf_list_head");
field_mask_test_name(BPF_LIST_NODE, "bpf_list_node");
field_mask_test_name(BPF_RB_ROOT, "bpf_rb_root");
field_mask_test_name(BPF_RB_NODE, "bpf_rb_node");
field_mask_test_name(BPF_REFCOUNT, "bpf_refcount");
/* Only return BPF_KPTR when all other types with matchable names fail */
if (field_mask & (BPF_KPTR | BPF_UPTR) && !__btf_type_is_struct(var_type)) {
@@ -3545,8 +3530,6 @@ end:
return type;
}
#undef field_mask_test_name
/* Repeat a number of fields for a specified number of times.
*
* Copy the fields starting from the first field and repeat them for
@@ -3693,6 +3676,7 @@ static int btf_find_field_one(const struct btf *btf,
case BPF_LIST_NODE:
case BPF_RB_NODE:
case BPF_REFCOUNT:
case BPF_TASK_WORK:
ret = btf_find_struct(btf, var_type, off, sz, field_type,
info_cnt ? &info[0] : &tmp);
if (ret < 0)
@@ -3985,6 +3969,7 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type
rec->timer_off = -EINVAL;
rec->wq_off = -EINVAL;
rec->refcount_off = -EINVAL;
rec->task_work_off = -EINVAL;
for (i = 0; i < cnt; i++) {
field_type_size = btf_field_type_size(info_arr[i].type);
if (info_arr[i].off + field_type_size > value_size) {
@@ -4024,6 +4009,10 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type
/* Cache offset for faster lookup at runtime */
rec->wq_off = rec->fields[i].offset;
break;
case BPF_TASK_WORK:
WARN_ON_ONCE(rec->task_work_off >= 0);
rec->task_work_off = rec->fields[i].offset;
break;
case BPF_REFCOUNT:
WARN_ON_ONCE(rec->refcount_off >= 0);
/* Cache offset for faster lookup at runtime */
@@ -6762,7 +6751,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
/* skip modifiers */
while (btf_type_is_modifier(t))
t = btf_type_by_id(btf, t->type);
if (btf_type_is_small_int(t) || btf_is_any_enum(t) || __btf_type_is_struct(t))
if (btf_type_is_small_int(t) || btf_is_any_enum(t) || btf_type_is_struct(t))
/* accessing a scalar */
return true;
if (!btf_type_is_ptr(t)) {
@@ -7334,7 +7323,7 @@ static int __get_type_size(struct btf *btf, u32 btf_id,
if (btf_type_is_ptr(t))
/* kernel size of pointer. Not BPF's size of pointer*/
return sizeof(void *);
if (btf_type_is_int(t) || btf_is_any_enum(t) || __btf_type_is_struct(t))
if (btf_type_is_int(t) || btf_is_any_enum(t) || btf_type_is_struct(t))
return t->size;
return -EINVAL;
}
@@ -7343,7 +7332,7 @@ static u8 __get_type_fmodel_flags(const struct btf_type *t)
{
u8 flags = 0;
if (__btf_type_is_struct(t))
if (btf_type_is_struct(t))
flags |= BTF_FMODEL_STRUCT_ARG;
if (btf_type_is_signed_int(t))
flags |= BTF_FMODEL_SIGNED_ARG;
@@ -7384,7 +7373,7 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
return -EINVAL;
}
ret = __get_type_size(btf, func->type, &t);
if (ret < 0 || __btf_type_is_struct(t)) {
if (ret < 0 || btf_type_is_struct(t)) {
bpf_log(log,
"The function %s return type %s is unsupported.\n",
tname, btf_type_str(t));

View File

@@ -27,14 +27,15 @@ EXPORT_SYMBOL(cgroup_bpf_enabled_key);
/*
* cgroup bpf destruction makes heavy use of work items and there can be a lot
* of concurrent destructions. Use a separate workqueue so that cgroup bpf
* destruction work items don't end up filling up max_active of system_wq
* destruction work items don't end up filling up max_active of system_percpu_wq
* which may lead to deadlock.
*/
static struct workqueue_struct *cgroup_bpf_destroy_wq;
static int __init cgroup_bpf_wq_init(void)
{
cgroup_bpf_destroy_wq = alloc_workqueue("cgroup_bpf_destroy", 0, 1);
cgroup_bpf_destroy_wq = alloc_workqueue("cgroup_bpf_destroy",
WQ_PERCPU, 1);
if (!cgroup_bpf_destroy_wq)
panic("Failed to alloc workqueue for cgroup bpf destroy.\n");
return 0;
@@ -71,8 +72,7 @@ bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp,
u32 func_ret;
run_ctx.retval = retval;
migrate_disable();
rcu_read_lock();
rcu_read_lock_dont_migrate();
array = rcu_dereference(cgrp->effective[atype]);
item = &array->items[0];
old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
@@ -88,8 +88,7 @@ bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp,
item++;
}
bpf_reset_run_ctx(old_run_ctx);
rcu_read_unlock();
migrate_enable();
rcu_read_unlock_migrate();
return run_ctx.retval;
}

View File

@@ -18,6 +18,7 @@
*/
#include <uapi/linux/btf.h>
#include <crypto/sha1.h>
#include <linux/filter.h>
#include <linux/skbuff.h>
#include <linux/vmalloc.h>
@@ -38,6 +39,7 @@
#include <linux/bpf_mem_alloc.h>
#include <linux/memcontrol.h>
#include <linux/execmem.h>
#include <crypto/sha2.h>
#include <asm/barrier.h>
#include <linux/unaligned.h>
@@ -119,6 +121,7 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag
fp->pages = size / PAGE_SIZE;
fp->aux = aux;
fp->aux->main_prog_aux = aux;
fp->aux->prog = fp;
fp->jit_requested = ebpf_jit_enabled();
fp->blinding_requested = bpf_jit_blinding_enabled(fp);
@@ -293,28 +296,18 @@ void __bpf_prog_free(struct bpf_prog *fp)
int bpf_prog_calc_tag(struct bpf_prog *fp)
{
const u32 bits_offset = SHA1_BLOCK_SIZE - sizeof(__be64);
u32 raw_size = bpf_prog_tag_scratch_size(fp);
u32 digest[SHA1_DIGEST_WORDS];
u32 ws[SHA1_WORKSPACE_WORDS];
u32 i, bsize, psize, blocks;
size_t size = bpf_prog_insn_size(fp);
struct bpf_insn *dst;
bool was_ld_map;
u8 *raw, *todo;
__be32 *result;
__be64 *bits;
u32 i;
raw = vmalloc(raw_size);
if (!raw)
dst = vmalloc(size);
if (!dst)
return -ENOMEM;
sha1_init_raw(digest);
memset(ws, 0, sizeof(ws));
/* We need to take out the map fd for the digest calculation
* since they are unstable from user space side.
*/
dst = (void *)raw;
for (i = 0, was_ld_map = false; i < fp->len; i++) {
dst[i] = fp->insnsi[i];
if (!was_ld_map &&
@@ -334,33 +327,8 @@ int bpf_prog_calc_tag(struct bpf_prog *fp)
was_ld_map = false;
}
}
psize = bpf_prog_insn_size(fp);
memset(&raw[psize], 0, raw_size - psize);
raw[psize++] = 0x80;
bsize = round_up(psize, SHA1_BLOCK_SIZE);
blocks = bsize / SHA1_BLOCK_SIZE;
todo = raw;
if (bsize - psize >= sizeof(__be64)) {
bits = (__be64 *)(todo + bsize - sizeof(__be64));
} else {
bits = (__be64 *)(todo + bsize + bits_offset);
blocks++;
}
*bits = cpu_to_be64((psize - 1) << 3);
while (blocks--) {
sha1_transform(digest, todo, ws);
todo += SHA1_BLOCK_SIZE;
}
result = (__force __be32 *)digest;
for (i = 0; i < SHA1_DIGEST_WORDS; i++)
result[i] = cpu_to_be32(digest[i]);
memcpy(fp->tag, result, sizeof(fp->tag));
vfree(raw);
sha256((u8 *)dst, size, fp->digest);
vfree(dst);
return 0;
}
@@ -2393,6 +2361,7 @@ static bool __bpf_prog_map_compatible(struct bpf_map *map,
map->owner->type = prog_type;
map->owner->jited = fp->jited;
map->owner->xdp_has_frags = aux->xdp_has_frags;
map->owner->expected_attach_type = fp->expected_attach_type;
map->owner->attach_func_proto = aux->attach_func_proto;
for_each_cgroup_storage_type(i) {
map->owner->storage_cookie[i] =
@@ -2404,6 +2373,10 @@ static bool __bpf_prog_map_compatible(struct bpf_map *map,
ret = map->owner->type == prog_type &&
map->owner->jited == fp->jited &&
map->owner->xdp_has_frags == aux->xdp_has_frags;
if (ret &&
map->map_type == BPF_MAP_TYPE_PROG_ARRAY &&
map->owner->expected_attach_type != fp->expected_attach_type)
ret = false;
for_each_cgroup_storage_type(i) {
if (!ret)
break;
@@ -3329,9 +3302,8 @@ static bool find_from_stack_cb(void *cookie, u64 ip, u64 sp, u64 bp)
rcu_read_unlock();
if (!prog)
return true;
if (bpf_is_subprog(prog))
return true;
ctxp->prog = prog;
/* Make sure we return the main prog if we found a subprog */
ctxp->prog = prog->aux->main_prog_aux->prog;
return false;
}

View File

@@ -550,7 +550,7 @@ static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap,
old_rcpu = unrcu_pointer(xchg(&cmap->cpu_map[key_cpu], RCU_INITIALIZER(rcpu)));
if (old_rcpu) {
INIT_RCU_WORK(&old_rcpu->free_work, __cpu_map_entry_free);
queue_rcu_work(system_wq, &old_rcpu->free_work);
queue_rcu_work(system_percpu_wq, &old_rcpu->free_work);
}
}

View File

@@ -865,7 +865,7 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net,
struct bpf_dtab_netdev *dev;
dev = bpf_map_kmalloc_node(&dtab->map, sizeof(*dev),
GFP_NOWAIT | __GFP_NOWARN,
GFP_NOWAIT,
dtab->map.numa_node);
if (!dev)
return ERR_PTR(-ENOMEM);

View File

@@ -215,7 +215,20 @@ static bool htab_has_extra_elems(struct bpf_htab *htab)
return !htab_is_percpu(htab) && !htab_is_lru(htab) && !is_fd_htab(htab);
}
static void htab_free_prealloced_timers_and_wq(struct bpf_htab *htab)
/*
 * Free the per-element internal kernel structures (timer, workqueue,
 * task_work) embedded in a single hashtab element's value, for each field
 * type actually present in the map's BTF record.
 */
static void htab_free_internal_structs(struct bpf_htab *htab, struct htab_elem *elem)
{
if (btf_record_has_field(htab->map.record, BPF_TIMER))
bpf_obj_free_timer(htab->map.record,
htab_elem_value(elem, htab->map.key_size));
if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE))
bpf_obj_free_workqueue(htab->map.record,
htab_elem_value(elem, htab->map.key_size))
if (btf_record_has_field(htab->map.record, BPF_TASK_WORK))
bpf_obj_free_task_work(htab->map.record,
htab_elem_value(elem, htab->map.key_size));
}
static void htab_free_prealloced_internal_structs(struct bpf_htab *htab)
{
u32 num_entries = htab->map.max_entries;
int i;
@@ -227,12 +240,7 @@ static void htab_free_prealloced_timers_and_wq(struct bpf_htab *htab)
struct htab_elem *elem;
elem = get_htab_elem(htab, i);
if (btf_record_has_field(htab->map.record, BPF_TIMER))
bpf_obj_free_timer(htab->map.record,
htab_elem_value(elem, htab->map.key_size));
if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE))
bpf_obj_free_workqueue(htab->map.record,
htab_elem_value(elem, htab->map.key_size));
htab_free_internal_structs(htab, elem);
cond_resched();
}
}
@@ -1490,7 +1498,7 @@ static void delete_all_elements(struct bpf_htab *htab)
}
}
static void htab_free_malloced_timers_and_wq(struct bpf_htab *htab)
static void htab_free_malloced_internal_structs(struct bpf_htab *htab)
{
int i;
@@ -1502,28 +1510,23 @@ static void htab_free_malloced_timers_and_wq(struct bpf_htab *htab)
hlist_nulls_for_each_entry(l, n, head, hash_node) {
/* We only free timer on uref dropping to zero */
if (btf_record_has_field(htab->map.record, BPF_TIMER))
bpf_obj_free_timer(htab->map.record,
htab_elem_value(l, htab->map.key_size));
if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE))
bpf_obj_free_workqueue(htab->map.record,
htab_elem_value(l, htab->map.key_size));
htab_free_internal_structs(htab, l);
}
cond_resched_rcu();
}
rcu_read_unlock();
}
static void htab_map_free_timers_and_wq(struct bpf_map *map)
static void htab_map_free_internal_structs(struct bpf_map *map)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
/* We only free timer and workqueue on uref dropping to zero */
if (btf_record_has_field(htab->map.record, BPF_TIMER | BPF_WORKQUEUE)) {
if (btf_record_has_field(htab->map.record, BPF_TIMER | BPF_WORKQUEUE | BPF_TASK_WORK)) {
if (!htab_is_prealloc(htab))
htab_free_malloced_timers_and_wq(htab);
htab_free_malloced_internal_structs(htab);
else
htab_free_prealloced_timers_and_wq(htab);
htab_free_prealloced_internal_structs(htab);
}
}
@@ -2255,7 +2258,7 @@ const struct bpf_map_ops htab_map_ops = {
.map_alloc = htab_map_alloc,
.map_free = htab_map_free,
.map_get_next_key = htab_map_get_next_key,
.map_release_uref = htab_map_free_timers_and_wq,
.map_release_uref = htab_map_free_internal_structs,
.map_lookup_elem = htab_map_lookup_elem,
.map_lookup_and_delete_elem = htab_map_lookup_and_delete_elem,
.map_update_elem = htab_map_update_elem,
@@ -2276,7 +2279,7 @@ const struct bpf_map_ops htab_lru_map_ops = {
.map_alloc = htab_map_alloc,
.map_free = htab_map_free,
.map_get_next_key = htab_map_get_next_key,
.map_release_uref = htab_map_free_timers_and_wq,
.map_release_uref = htab_map_free_internal_structs,
.map_lookup_elem = htab_lru_map_lookup_elem,
.map_lookup_and_delete_elem = htab_lru_map_lookup_and_delete_elem,
.map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys,

View File

@@ -25,6 +25,9 @@
#include <linux/kasan.h>
#include <linux/bpf_verifier.h>
#include <linux/uaccess.h>
#include <linux/verification.h>
#include <linux/task_work.h>
#include <linux/irq_work.h>
#include "../../lib/kstrtox.h"
@@ -774,11 +777,9 @@ int bpf_try_get_buffers(struct bpf_bprintf_buffers **bufs)
{
int nest_level;
preempt_disable();
nest_level = this_cpu_inc_return(bpf_bprintf_nest_level);
if (WARN_ON_ONCE(nest_level > MAX_BPRINTF_NEST_LEVEL)) {
this_cpu_dec(bpf_bprintf_nest_level);
preempt_enable();
return -EBUSY;
}
*bufs = this_cpu_ptr(&bpf_bprintf_bufs[nest_level - 1]);
@@ -791,7 +792,6 @@ void bpf_put_buffers(void)
if (WARN_ON_ONCE(this_cpu_read(bpf_bprintf_nest_level) == 0))
return;
this_cpu_dec(bpf_bprintf_nest_level);
preempt_enable();
}
void bpf_bprintf_cleanup(struct bpf_bprintf_data *data)
@@ -1084,6 +1084,17 @@ const struct bpf_func_proto bpf_snprintf_proto = {
.arg5_type = ARG_CONST_SIZE_OR_ZERO,
};
/*
 * Recover a pointer to the map key given a pointer to an element's value.
 * @map:     the map the value belongs to
 * @value:   pointer to the element value
 * @arr_idx: caller-provided scratch; for array maps it receives the computed
 *           index and its address is returned as the key
 *
 * For array maps the key is the element index, derived from the value's
 * offset within the array. For other map types (hash/LRU) the key is stored
 * immediately before the value, rounded up to 8-byte alignment.
 */
static void *map_key_from_value(struct bpf_map *map, void *value, u32 *arr_idx)
{
if (map->map_type == BPF_MAP_TYPE_ARRAY) {
struct bpf_array *array = container_of(map, struct bpf_array, map);
*arr_idx = ((char *)value - array->value) / array->elem_size;
return arr_idx;
}
return (void *)value - round_up(map->key_size, 8);
}
struct bpf_async_cb {
struct bpf_map *map;
struct bpf_prog *prog;
@@ -1166,15 +1177,8 @@ static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer)
* bpf_map_delete_elem() on the same timer.
*/
this_cpu_write(hrtimer_running, t);
if (map->map_type == BPF_MAP_TYPE_ARRAY) {
struct bpf_array *array = container_of(map, struct bpf_array, map);
/* compute the key */
idx = ((char *)value - array->value) / array->elem_size;
key = &idx;
} else { /* hash or lru */
key = value - round_up(map->key_size, 8);
}
key = map_key_from_value(map, value, &idx);
callback_fn((u64)(long)map, (u64)(long)key, (u64)(long)value, 0, 0);
/* The verifier checked that return value is zero. */
@@ -1200,15 +1204,7 @@ static void bpf_wq_work(struct work_struct *work)
if (!callback_fn)
return;
if (map->map_type == BPF_MAP_TYPE_ARRAY) {
struct bpf_array *array = container_of(map, struct bpf_array, map);
/* compute the key */
idx = ((char *)value - array->value) / array->elem_size;
key = &idx;
} else { /* hash or lru */
key = value - round_up(map->key_size, 8);
}
key = map_key_from_value(map, value, &idx);
rcu_read_lock_trace();
migrate_disable();
@@ -1600,7 +1596,7 @@ void bpf_timer_cancel_and_free(void *val)
* timer callback.
*/
if (this_cpu_read(hrtimer_running)) {
queue_work(system_unbound_wq, &t->cb.delete_work);
queue_work(system_dfl_wq, &t->cb.delete_work);
return;
}
@@ -1613,7 +1609,7 @@ void bpf_timer_cancel_and_free(void *val)
if (hrtimer_try_to_cancel(&t->timer) >= 0)
kfree_rcu(t, cb.rcu);
else
queue_work(system_unbound_wq, &t->cb.delete_work);
queue_work(system_dfl_wq, &t->cb.delete_work);
} else {
bpf_timer_delete_work(&t->cb.delete_work);
}
@@ -1783,6 +1779,9 @@ static int __bpf_dynptr_read(void *dst, u32 len, const struct bpf_dynptr_kern *s
return __bpf_skb_load_bytes(src->data, src->offset + offset, dst, len);
case BPF_DYNPTR_TYPE_XDP:
return __bpf_xdp_load_bytes(src->data, src->offset + offset, dst, len);
case BPF_DYNPTR_TYPE_SKB_META:
memmove(dst, bpf_skb_meta_pointer(src->data, src->offset + offset), len);
return 0;
default:
WARN_ONCE(true, "bpf_dynptr_read: unknown dynptr type %d\n", type);
return -EFAULT;
@@ -1839,6 +1838,11 @@ int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u32 offset, void *src,
if (flags)
return -EINVAL;
return __bpf_xdp_store_bytes(dst->data, dst->offset + offset, src, len);
case BPF_DYNPTR_TYPE_SKB_META:
if (flags)
return -EINVAL;
memmove(bpf_skb_meta_pointer(dst->data, dst->offset + offset), src, len);
return 0;
default:
WARN_ONCE(true, "bpf_dynptr_write: unknown dynptr type %d\n", type);
return -EFAULT;
@@ -1885,6 +1889,7 @@ BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u32, offset, u3
return (unsigned long)(ptr->data + ptr->offset + offset);
case BPF_DYNPTR_TYPE_SKB:
case BPF_DYNPTR_TYPE_XDP:
case BPF_DYNPTR_TYPE_SKB_META:
/* skb and xdp dynptrs should use bpf_dynptr_slice / bpf_dynptr_slice_rdwr */
return 0;
default:
@@ -2540,7 +2545,7 @@ __bpf_kfunc struct cgroup *bpf_cgroup_from_id(u64 cgid)
{
struct cgroup *cgrp;
cgrp = cgroup_get_from_id(cgid);
cgrp = __cgroup_get_from_id(cgid);
if (IS_ERR(cgrp))
return NULL;
return cgrp;
@@ -2713,6 +2718,8 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u32 offset,
bpf_xdp_copy_buf(ptr->data, ptr->offset + offset, buffer__opt, len, false);
return buffer__opt;
}
case BPF_DYNPTR_TYPE_SKB_META:
return bpf_skb_meta_pointer(ptr->data, ptr->offset + offset);
default:
WARN_ONCE(true, "unknown dynptr type %d\n", type);
return NULL;
@@ -3344,6 +3351,36 @@ __bpf_kfunc void __bpf_trap(void)
* __get_kernel_nofault instead of plain dereference to make them safe.
*/
/*
 * Common strcmp/strcasecmp implementation over possibly-faulting kernel
 * strings.
 * @s1, @s2:     strings to compare (read with non-faulting accessors)
 * @ignore_case: fold both characters through tolower() before comparing
 *
 * Returns 0 if equal, -1/1 ordering, -ERANGE if either pointer is outside
 * the readable kernel address space, -EFAULT on a faulting read, and
 * -E2BIG if no difference or terminator is found within XATTR_SIZE_MAX
 * bytes.
 */
static int __bpf_strcasecmp(const char *s1, const char *s2, bool ignore_case)
{
char c1, c2;
int i;
/* Reject pointers non-faulting reads are not allowed to touch. */
if (!copy_from_kernel_nofault_allowed(s1, 1) ||
!copy_from_kernel_nofault_allowed(s2, 1)) {
return -ERANGE;
}
/* Disable page faults for the scoped non-faulting byte loads below. */
guard(pagefault)();
for (i = 0; i < XATTR_SIZE_MAX; i++) {
__get_kernel_nofault(&c1, s1, char, err_out);
__get_kernel_nofault(&c2, s2, char, err_out);
if (ignore_case) {
c1 = tolower(c1);
c2 = tolower(c2);
}
if (c1 != c2)
return c1 < c2 ? -1 : 1;
if (c1 == '\0')
return 0;
s1++;
s2++;
}
return -E2BIG;
err_out:
return -EFAULT;
}
/**
* bpf_strcmp - Compare two strings
* @s1__ign: One string
@@ -3359,28 +3396,25 @@ __bpf_kfunc void __bpf_trap(void)
*/
/* Case-sensitive string comparison kfunc; delegates to __bpf_strcasecmp(). */
__bpf_kfunc int bpf_strcmp(const char *s1__ign, const char *s2__ign)
{
/* NOTE(review): c1/c2/i below look like leftover removed lines from a
 * diff whose -/+ markers were stripped; the upstream function body is
 * just the delegation below — confirm against the original patch.
 */
char c1, c2;
int i;
return __bpf_strcasecmp(s1__ign, s2__ign, false);
}
if (!copy_from_kernel_nofault_allowed(s1__ign, 1) ||
!copy_from_kernel_nofault_allowed(s2__ign, 1)) {
return -ERANGE;
}
guard(pagefault)();
for (i = 0; i < XATTR_SIZE_MAX; i++) {
__get_kernel_nofault(&c1, s1__ign, char, err_out);
__get_kernel_nofault(&c2, s2__ign, char, err_out);
if (c1 != c2)
return c1 < c2 ? -1 : 1;
if (c1 == '\0')
return 0;
s1__ign++;
s2__ign++;
}
return -E2BIG;
err_out:
return -EFAULT;
/**
* bpf_strcasecmp - Compare two strings, ignoring the case of the characters
* @s1__ign: One string
* @s2__ign: Another string
*
* Return:
* * %0 - Strings are equal
* * %-1 - @s1__ign is smaller
* * %1 - @s2__ign is smaller
* * %-EFAULT - Cannot read one of the strings
* * %-E2BIG - One of strings is too large
* * %-ERANGE - One of strings is outside of kernel address space
*/
/* Case-insensitive string comparison kfunc; delegates to __bpf_strcasecmp(). */
__bpf_kfunc int bpf_strcasecmp(const char *s1__ign, const char *s2__ign)
{
return __bpf_strcasecmp(s1__ign, s2__ign, true);
}
/**
@@ -3712,9 +3746,490 @@ __bpf_kfunc int bpf_strstr(const char *s1__ign, const char *s2__ign)
{
return bpf_strnstr(s1__ign, s2__ign, XATTR_SIZE_MAX);
}
#ifdef CONFIG_KEYS
/**
* bpf_lookup_user_key - lookup a key by its serial
* @serial: key handle serial number
* @flags: lookup-specific flags
*
* Search a key with a given *serial* and the provided *flags*.
* If found, increment the reference count of the key by one, and
* return it in the bpf_key structure.
*
* The bpf_key structure must be passed to bpf_key_put() when done
* with it, so that the key reference count is decremented and the
* bpf_key structure is freed.
*
* Permission checks are deferred to the time the key is used by
* one of the available key-specific kfuncs.
*
* Set *flags* with KEY_LOOKUP_CREATE, to attempt creating a requested
* special keyring (e.g. session keyring), if it doesn't yet exist.
* Set *flags* with KEY_LOOKUP_PARTIAL, to lookup a key without waiting
* for the key construction, and to retrieve uninstantiated keys (keys
* without data attached to them).
*
* Return: a bpf_key pointer with a valid key pointer if the key is found, a
* NULL pointer otherwise.
*/
__bpf_kfunc struct bpf_key *bpf_lookup_user_key(s32 serial, u64 flags)
{
	struct bpf_key *bkey;
	struct key *key;
	key_ref_t key_ref;

	if (flags & ~KEY_LOOKUP_ALL)
		return NULL;

	/*
	 * The permission check is deferred until the key is actually used,
	 * since the caller's intent is unknown at lookup time.
	 */
	key_ref = lookup_user_key(serial, flags, KEY_DEFER_PERM_CHECK);
	if (IS_ERR(key_ref))
		return NULL;

	key = key_ref_to_ptr(key_ref);
	bkey = kmalloc(sizeof(*bkey), GFP_KERNEL);
	if (!bkey) {
		/* allocation failed: release the reference taken by the lookup */
		key_put(key);
		return NULL;
	}

	bkey->key = key;
	bkey->has_ref = true;
	return bkey;
}
/**
* bpf_lookup_system_key - lookup a key by a system-defined ID
* @id: key ID
*
* Obtain a bpf_key structure with a key pointer set to the passed key ID.
* The key pointer is marked as invalid, to prevent bpf_key_put() from
* attempting to decrement the key reference count on that pointer. The key
* pointer set in such way is currently understood only by
* verify_pkcs7_signature().
*
* Set *id* to one of the values defined in include/linux/verification.h:
* 0 for the primary keyring (immutable keyring of system keys);
* VERIFY_USE_SECONDARY_KEYRING for both the primary and secondary keyring
* (where keys can be added only if they are vouched for by existing keys
* in those keyrings); VERIFY_USE_PLATFORM_KEYRING for the platform
* keyring (primarily used by the integrity subsystem to verify a kexec'ed
 * kernel image and, possibly, the initramfs signature).
*
* Return: a bpf_key pointer with an invalid key pointer set from the
* pre-determined ID on success, a NULL pointer otherwise
*/
__bpf_kfunc struct bpf_key *bpf_lookup_system_key(u64 id)
{
	struct bpf_key *bkey = NULL;

	/* Only hand out wrappers for recognized system keyring IDs */
	if (system_keyring_id_check(id) >= 0) {
		bkey = kmalloc(sizeof(*bkey), GFP_ATOMIC);
		if (bkey) {
			/* fake pointer: stores the ID itself, not a real struct key */
			bkey->key = (struct key *)(unsigned long)id;
			bkey->has_ref = false;
		}
	}
	return bkey;
}
/**
* bpf_key_put - decrement key reference count if key is valid and free bpf_key
* @bkey: bpf_key structure
*
* Decrement the reference count of the key inside *bkey*, if the pointer
* is valid, and free *bkey*.
*/
__bpf_kfunc void bpf_key_put(struct bpf_key *bkey)
{
	/*
	 * Keys from bpf_lookup_system_key() carry a fake pointer
	 * (has_ref == false), so only drop the reference for real keys
	 * before freeing the wrapper itself.
	 */
	if (bkey->has_ref)
		key_put(bkey->key);
	kfree(bkey);
}
/**
* bpf_verify_pkcs7_signature - verify a PKCS#7 signature
* @data_p: data to verify
* @sig_p: signature of the data
* @trusted_keyring: keyring with keys trusted for signature verification
*
* Verify the PKCS#7 signature *sig_ptr* against the supplied *data_ptr*
* with keys in a keyring referenced by *trusted_keyring*.
*
* Return: 0 on success, a negative value on error.
*/
__bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_p,
					   struct bpf_dynptr *sig_p,
					   struct bpf_key *trusted_keyring)
{
#ifdef CONFIG_SYSTEM_DATA_VERIFICATION
	struct bpf_dynptr_kern *data_ptr = (struct bpf_dynptr_kern *)data_p;
	struct bpf_dynptr_kern *sig_ptr = (struct bpf_dynptr_kern *)sig_p;
	const void *data, *sig;
	u32 data_len, sig_len;
	int ret;

	if (trusted_keyring->has_ref) {
		/*
		 * Do the permission check deferred in bpf_lookup_user_key().
		 * See bpf_lookup_user_key() for more details.
		 *
		 * A call to key_task_permission() here would be redundant, as
		 * it is already done by keyring_search() called by
		 * find_asymmetric_key().
		 */
		ret = key_validate(trusted_keyring->key);
		if (ret < 0)
			return ret;
	}

	/* Resolve both dynptrs to contiguous buffers before verification */
	data_len = __bpf_dynptr_size(data_ptr);
	data = __bpf_dynptr_data(data_ptr, data_len);
	sig_len = __bpf_dynptr_size(sig_ptr);
	sig = __bpf_dynptr_data(sig_ptr, sig_len);

	return verify_pkcs7_signature(data, data_len, sig, sig_len,
				      trusted_keyring->key,
				      VERIFYING_BPF_SIGNATURE, NULL,
				      NULL);
#else
	return -EOPNOTSUPP;
#endif /* CONFIG_SYSTEM_DATA_VERIFICATION */
}
#endif /* CONFIG_KEYS */
/* Signature of the BPF subprogram invoked through the task_work machinery */
typedef int (*bpf_task_work_callback_t)(struct bpf_map *map, void *key, void *value);

/*
 * State machine for a single bpf_task_work instance.  Transitions are
 * done with cmpxchg(); BPF_TW_FREED can be entered from any state via
 * bpf_task_work_cancel_and_free() when the map value is deleted.
 */
enum bpf_task_work_state {
	/* bpf_task_work is ready to be used */
	BPF_TW_STANDBY = 0,
	/* irq work scheduling in progress */
	BPF_TW_PENDING,
	/* task work scheduling in progress */
	BPF_TW_SCHEDULING,
	/* task work is scheduled successfully */
	BPF_TW_SCHEDULED,
	/* callback is running */
	BPF_TW_RUNNING,
	/* associated BPF map value is deleted */
	BPF_TW_FREED,
};

struct bpf_task_work_ctx {
	enum bpf_task_work_state state;
	refcount_t refcnt;
	struct callback_head work;
	struct irq_work irq_work;
	/* bpf_prog that schedules task work */
	struct bpf_prog *prog;
	/* task for which callback is scheduled */
	struct task_struct *task;
	/* the map and map value associated with this context */
	struct bpf_map *map;
	void *map_val;
	/* notify mode forwarded to task_work_add() */
	enum task_work_notify_mode mode;
	bpf_task_work_callback_t callback_fn;
	struct rcu_head rcu;
} __aligned(8);

/* Actual type for struct bpf_task_work */
struct bpf_task_work_kern {
	struct bpf_task_work_ctx *ctx;
};
/* Drop the prog and task references held by @ctx, if any */
static void bpf_task_work_ctx_reset(struct bpf_task_work_ctx *ctx)
{
	struct bpf_prog *prog = ctx->prog;
	struct task_struct *task = ctx->task;

	if (prog) {
		bpf_prog_put(prog);
		ctx->prog = NULL;
	}
	if (task) {
		bpf_task_release(task);
		ctx->task = NULL;
	}
}
/* Take a ctx reference; fails if the last reference is already gone */
static bool bpf_task_work_ctx_tryget(struct bpf_task_work_ctx *ctx)
{
	return refcount_inc_not_zero(&ctx->refcnt);
}
/* Drop a ctx reference; the last one releases held refs and frees the ctx */
static void bpf_task_work_ctx_put(struct bpf_task_work_ctx *ctx)
{
	if (refcount_dec_and_test(&ctx->refcnt)) {
		bpf_task_work_ctx_reset(ctx);
		/* bpf_mem_free() expects migration to be disabled */
		migrate_disable();
		bpf_mem_free(&bpf_global_ma, ctx);
		migrate_enable();
	}
}
/* Try to cancel a task_work that may be pending on @ctx->task */
static void bpf_task_work_cancel(struct bpf_task_work_ctx *ctx)
{
	/*
	 * Scheduled task_work callback holds ctx ref, so if we successfully
	 * cancelled, we put that ref on callback's behalf. If we couldn't
	 * cancel, callback will inevitably run or has already completed
	 * running, and it would have taken care of its ctx ref itself.
	 */
	if (task_work_cancel(ctx->task, &ctx->work))
		bpf_task_work_ctx_put(ctx);
}
/*
 * task_work callback: runs the BPF-provided subprogram in the target
 * task's context and returns the ctx to STANDBY for reuse.
 */
static void bpf_task_work_callback(struct callback_head *cb)
{
	struct bpf_task_work_ctx *ctx = container_of(cb, struct bpf_task_work_ctx, work);
	enum bpf_task_work_state state;
	u32 idx;
	void *key;

	/* Read lock is needed to protect ctx and map key/value access */
	guard(rcu_tasks_trace)();

	/*
	 * This callback may start running before bpf_task_work_irq() switched to
	 * SCHEDULED state, so handle both transition variants SCHEDULING|SCHEDULED -> RUNNING.
	 */
	state = cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_RUNNING);
	if (state == BPF_TW_SCHEDULED)
		state = cmpxchg(&ctx->state, BPF_TW_SCHEDULED, BPF_TW_RUNNING);
	if (state == BPF_TW_FREED) {
		/* map value was deleted meanwhile; just drop the callback's ref */
		bpf_task_work_ctx_put(ctx);
		return;
	}

	key = (void *)map_key_from_value(ctx->map, ctx->map_val, &idx);
	/* invoke the BPF subprogram with migration disabled */
	migrate_disable();
	ctx->callback_fn(ctx->map, key, ctx->map_val);
	migrate_enable();

	bpf_task_work_ctx_reset(ctx);
	/* may race with a concurrent switch into FREED; that is fine */
	(void)cmpxchg(&ctx->state, BPF_TW_RUNNING, BPF_TW_STANDBY);
	bpf_task_work_ctx_put(ctx);
}
/*
 * irq_work handler: moves ctx from PENDING to SCHEDULING and performs the
 * actual task_work_add() on the target task (deferred out of the kfunc
 * call site).
 */
static void bpf_task_work_irq(struct irq_work *irq_work)
{
	struct bpf_task_work_ctx *ctx = container_of(irq_work, struct bpf_task_work_ctx, irq_work);
	enum bpf_task_work_state state;
	int err;

	guard(rcu_tasks_trace)();

	if (cmpxchg(&ctx->state, BPF_TW_PENDING, BPF_TW_SCHEDULING) != BPF_TW_PENDING) {
		/* state changed under us (e.g. to FREED); drop the scheduling ref */
		bpf_task_work_ctx_put(ctx);
		return;
	}

	err = task_work_add(ctx->task, &ctx->work, ctx->mode);
	if (err) {
		bpf_task_work_ctx_reset(ctx);
		/*
		 * try to switch back to STANDBY for another task_work reuse, but we might have
		 * gone to FREED already, which is fine as we already cleaned up after ourselves
		 */
		(void)cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_STANDBY);
		bpf_task_work_ctx_put(ctx);
		return;
	}

	/*
	 * It's technically possible for just scheduled task_work callback to
	 * complete running by now, going SCHEDULING -> RUNNING and then
	 * dropping its ctx refcount. Instead of capturing extra ref just to
	 * protect the below ctx->state access, we rely on RCU protection to
	 * perform below SCHEDULING -> SCHEDULED attempt.
	 */
	state = cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_SCHEDULED);
	if (state == BPF_TW_FREED)
		bpf_task_work_cancel(ctx); /* clean up if we switched into FREED state */
}
/*
 * Return the ctx associated with @tw, allocating and installing a fresh
 * one on first use.  Installation is done with cmpxchg() so concurrent
 * BPF programs converge on a single ctx instance.
 */
static struct bpf_task_work_ctx *bpf_task_work_fetch_ctx(struct bpf_task_work *tw,
							 struct bpf_map *map)
{
	struct bpf_task_work_kern *twk = (void *)tw;
	struct bpf_task_work_ctx *ctx, *old_ctx;

	ctx = READ_ONCE(twk->ctx);
	if (ctx)
		return ctx;

	ctx = bpf_mem_alloc(&bpf_global_ma, sizeof(struct bpf_task_work_ctx));
	if (!ctx)
		return ERR_PTR(-ENOMEM);

	memset(ctx, 0, sizeof(*ctx));
	refcount_set(&ctx->refcnt, 1); /* map's own ref */
	ctx->state = BPF_TW_STANDBY;

	old_ctx = cmpxchg(&twk->ctx, NULL, ctx);
	if (old_ctx) {
		/*
		 * tw->ctx is set by concurrent BPF program, release allocated
		 * memory and try to reuse already set context.
		 */
		bpf_mem_free(&bpf_global_ma, ctx);
		return old_ctx;
	}
	return ctx; /* Success */
}
/*
 * Fetch the ctx for @tw and transition it STANDBY -> PENDING, taking a
 * reference for the future task_work callback to hold.  Returns
 * ERR_PTR(-EBUSY) if the ctx is busy in another state or the map's user
 * refcount already dropped to zero.
 */
static struct bpf_task_work_ctx *bpf_task_work_acquire_ctx(struct bpf_task_work *tw,
							   struct bpf_map *map)
{
	struct bpf_task_work_ctx *ctx;

	ctx = bpf_task_work_fetch_ctx(tw, map);
	if (IS_ERR(ctx))
		return ctx;

	/* try to get ref for task_work callback to hold */
	if (!bpf_task_work_ctx_tryget(ctx))
		return ERR_PTR(-EBUSY);

	if (cmpxchg(&ctx->state, BPF_TW_STANDBY, BPF_TW_PENDING) != BPF_TW_STANDBY) {
		/* lost acquiring race or map_release_uref() stole it from us, put ref and bail */
		bpf_task_work_ctx_put(ctx);
		return ERR_PTR(-EBUSY);
	}

	/*
	 * If no process or bpffs is holding a reference to the map, no new callbacks should be
	 * scheduled. This does not address any race or correctness issue, but rather is a policy
	 * choice: dropping user references should stop everything.
	 */
	if (!atomic64_read(&map->usercnt)) {
		/* drop ref we just got for task_work callback itself */
		bpf_task_work_ctx_put(ctx);
		/* transfer map's ref into cancel_and_free() */
		bpf_task_work_cancel_and_free(tw);
		return ERR_PTR(-EBUSY);
	}

	return ctx;
}
/*
 * Common implementation for the bpf_task_work_schedule_*() kfuncs:
 * pins @task and the scheduling program, acquires the per-map-value ctx
 * and defers the actual task_work_add() to irq_work context.
 */
static int bpf_task_work_schedule(struct task_struct *task, struct bpf_task_work *tw,
				  struct bpf_map *map, bpf_task_work_callback_t callback_fn,
				  struct bpf_prog_aux *aux, enum task_work_notify_mode mode)
{
	struct bpf_prog *prog;
	struct bpf_task_work_ctx *ctx;
	int err;

	BTF_TYPE_EMIT(struct bpf_task_work);

	/* keep the scheduling prog alive while the callback can still run */
	prog = bpf_prog_inc_not_zero(aux->prog);
	if (IS_ERR(prog))
		return -EBADF;
	task = bpf_task_acquire(task);
	if (!task) {
		err = -EBADF;
		goto release_prog;
	}

	ctx = bpf_task_work_acquire_ctx(tw, map);
	if (IS_ERR(ctx)) {
		err = PTR_ERR(ctx);
		goto release_all;
	}

	/* ownership of the @task and @prog references moves into @ctx here */
	ctx->task = task;
	ctx->callback_fn = callback_fn;
	ctx->prog = prog;
	ctx->mode = mode;
	ctx->map = map;
	ctx->map_val = (void *)tw - map->record->task_work_off;
	init_task_work(&ctx->work, bpf_task_work_callback);
	init_irq_work(&ctx->irq_work, bpf_task_work_irq);

	irq_work_queue(&ctx->irq_work);
	return 0;

release_all:
	bpf_task_release(task);
release_prog:
	bpf_prog_put(prog);
	return err;
}
/**
 * bpf_task_work_schedule_signal - Schedule BPF callback using task_work_add with TWA_SIGNAL mode
 * @task: Task struct for which callback should be scheduled
 * @tw: Pointer to struct bpf_task_work in BPF map value for internal bookkeeping
 * @map__map: bpf_map that embeds struct bpf_task_work in the values
 * @callback: pointer to BPF subprogram to call
 * @aux__prog: user should pass NULL
 *
 * Uses the TWA_SIGNAL notification mode of task_work_add().
 *
 * Return: 0 if task work has been scheduled successfully, negative error code otherwise
 */
__bpf_kfunc int bpf_task_work_schedule_signal(struct task_struct *task, struct bpf_task_work *tw,
					      void *map__map, bpf_task_work_callback_t callback,
					      void *aux__prog)
{
	return bpf_task_work_schedule(task, tw, map__map, callback, aux__prog, TWA_SIGNAL);
}
/**
 * bpf_task_work_schedule_resume - Schedule BPF callback using task_work_add with TWA_RESUME mode
 * @task: Task struct for which callback should be scheduled
 * @tw: Pointer to struct bpf_task_work in BPF map value for internal bookkeeping
 * @map__map: bpf_map that embeds struct bpf_task_work in the values
 * @callback: pointer to BPF subprogram to call
 * @aux__prog: user should pass NULL
 *
 * Uses the TWA_RESUME notification mode of task_work_add().
 *
 * Return: 0 if task work has been scheduled successfully, negative error code otherwise
 */
__bpf_kfunc int bpf_task_work_schedule_resume(struct task_struct *task, struct bpf_task_work *tw,
					      void *map__map, bpf_task_work_callback_t callback,
					      void *aux__prog)
{
	return bpf_task_work_schedule(task, tw, map__map, callback, aux__prog, TWA_RESUME);
}
__bpf_kfunc_end_defs();
/* irq_work handler used by bpf_task_work_cancel_and_free() for SCHEDULED ctxs */
static void bpf_task_work_cancel_scheduled(struct irq_work *irq_work)
{
	struct bpf_task_work_ctx *ctx = container_of(irq_work, struct bpf_task_work_ctx, irq_work);

	bpf_task_work_cancel(ctx); /* this might put task_work callback's ref */
	bpf_task_work_ctx_put(ctx); /* and here we put map's own ref that was transferred to us */
}
/*
 * Called when the map value embedding @val goes away: detach the ctx,
 * force it into FREED state and drop the map's reference.  A task_work
 * that is already scheduled is cancelled from irq_work context.
 */
void bpf_task_work_cancel_and_free(void *val)
{
	struct bpf_task_work_kern *twk = val;
	struct bpf_task_work_ctx *ctx;
	enum bpf_task_work_state state;

	ctx = xchg(&twk->ctx, NULL);
	if (!ctx)
		return;

	state = xchg(&ctx->state, BPF_TW_FREED);
	if (state == BPF_TW_SCHEDULED) {
		/* run in irq_work to avoid locks in NMI */
		init_irq_work(&ctx->irq_work, bpf_task_work_cancel_scheduled);
		irq_work_queue(&ctx->irq_work);
		return;
	}
	bpf_task_work_ctx_put(ctx); /* put bpf map's ref */
}
BTF_KFUNCS_START(generic_btf_ids)
#ifdef CONFIG_CRASH_DUMP
BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE)
@@ -3753,6 +4268,14 @@ BTF_ID_FLAGS(func, bpf_throw)
#ifdef CONFIG_BPF_EVENTS
BTF_ID_FLAGS(func, bpf_send_signal_task, KF_TRUSTED_ARGS)
#endif
#ifdef CONFIG_KEYS
BTF_ID_FLAGS(func, bpf_lookup_user_key, KF_ACQUIRE | KF_RET_NULL | KF_SLEEPABLE)
BTF_ID_FLAGS(func, bpf_lookup_system_key, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_key_put, KF_RELEASE)
#ifdef CONFIG_SYSTEM_DATA_VERIFICATION
BTF_ID_FLAGS(func, bpf_verify_pkcs7_signature, KF_SLEEPABLE)
#endif
#endif
BTF_KFUNCS_END(generic_btf_ids)
static const struct btf_kfunc_id_set generic_kfunc_set = {
@@ -3834,6 +4357,7 @@ BTF_ID_FLAGS(func, bpf_iter_dmabuf_destroy, KF_ITER_DESTROY | KF_SLEEPABLE)
#endif
BTF_ID_FLAGS(func, __bpf_trap)
BTF_ID_FLAGS(func, bpf_strcmp);
BTF_ID_FLAGS(func, bpf_strcasecmp);
BTF_ID_FLAGS(func, bpf_strchr);
BTF_ID_FLAGS(func, bpf_strchrnul);
BTF_ID_FLAGS(func, bpf_strnchr);
@@ -3848,6 +4372,8 @@ BTF_ID_FLAGS(func, bpf_strnstr);
BTF_ID_FLAGS(func, bpf_cgroup_read_xattr, KF_RCU)
#endif
BTF_ID_FLAGS(func, bpf_stream_vprintk, KF_TRUSTED_ARGS)
BTF_ID_FLAGS(func, bpf_task_work_schedule_signal, KF_TRUSTED_ARGS)
BTF_ID_FLAGS(func, bpf_task_work_schedule_resume, KF_TRUSTED_ARGS)
BTF_KFUNCS_END(common_btf_ids)
static const struct btf_kfunc_id_set common_kfunc_set = {

733
kernel/bpf/liveness.c Normal file
View File

@@ -0,0 +1,733 @@
// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
#include <linux/bpf_verifier.h>
#include <linux/hashtable.h>
#include <linux/jhash.h>
#include <linux/slab.h>
/*
* This file implements live stack slots analysis. After accumulating
* stack usage data, the analysis answers queries about whether a
 * particular stack slot may be read by an instruction or any of its
* successors. This data is consumed by the verifier states caching
* mechanism to decide which stack slots are important when looking for a
* visited state corresponding to the current state.
*
* The analysis is call chain sensitive, meaning that data is collected
* and queried for tuples (call chain, subprogram instruction index).
* Such sensitivity allows identifying if some subprogram call always
* leads to writes in the caller's stack.
*
* The basic idea is as follows:
* - As the verifier accumulates a set of visited states, the analysis instance
* accumulates a conservative estimate of stack slots that can be read
* or must be written for each visited tuple (call chain, instruction index).
* - If several states happen to visit the same instruction with the same
* call chain, stack usage information for the corresponding tuple is joined:
* - "may_read" set represents a union of all possibly read slots
* (any slot in "may_read" set might be read at or after the instruction);
* - "must_write" set represents an intersection of all possibly written slots
* (any slot in "must_write" set is guaranteed to be written by the instruction).
* - The analysis is split into two phases:
* - read and write marks accumulation;
* - read and write marks propagation.
* - The propagation phase is a textbook live variable data flow analysis:
*
* state[cc, i].live_after = U [state[cc, s].live_before for s in insn_successors(i)]
* state[cc, i].live_before =
* (state[cc, i].live_after / state[cc, i].must_write) U state[i].may_read
*
* Where:
* - `U` stands for set union
* - `/` stands for set difference;
* - `cc` stands for a call chain;
* - `i` and `s` are instruction indexes;
*
* The above equations are computed for each call chain and instruction
* index until state stops changing.
* - Additionally, in order to transfer "must_write" information from a
* subprogram to call instructions invoking this subprogram,
* the "must_write_acc" set is tracked for each (cc, i) tuple.
* A set of stack slots that are guaranteed to be written by this
* instruction or any of its successors (within the subprogram).
* The equation for "must_write_acc" propagation looks as follows:
*
* state[cc, i].must_write_acc =
* [state[cc, s].must_write_acc for s in insn_successors(i)]
* U state[cc, i].must_write
*
* (An intersection of all "must_write_acc" for instruction successors
* plus all "must_write" slots for the instruction itself).
* - After the propagation phase completes for a subprogram, information from
* (cc, 0) tuple (subprogram entry) is transferred to the caller's call chain:
* - "must_write_acc" set is intersected with the call site's "must_write" set;
* - "may_read" set is added to the call site's "may_read" set.
* - Any live stack queries must be taken after the propagation phase.
* - Accumulation and propagation phases can be entered multiple times,
* at any point in time:
* - "may_read" set only grows;
* - "must_write" set only shrinks;
* - for each visited verifier state with zero branches, all relevant
* read and write marks are already recorded by the analysis instance.
*
* Technically, the analysis is facilitated by the following data structures:
* - Call chain: for given verifier state, the call chain is a tuple of call
* instruction indexes leading to the current subprogram plus the subprogram
* entry point index.
* - Function instance: for a given call chain, for each instruction in
* the current subprogram, a mapping between instruction index and a
* set of "may_read", "must_write" and other marks accumulated for this
* instruction.
* - A hash table mapping call chains to function instances.
*/
struct callchain {
	u32 callsites[MAX_CALL_FRAMES]; /* instruction pointer for each frame */
	/* cached subprog_info[*].start for functions owning the frames:
	 * - sp_starts[curframe] used to get insn relative index within current function;
	 * - sp_starts[0..current-1] used for fast callchain_frame_up().
	 */
	u32 sp_starts[MAX_CALL_FRAMES];
	u32 curframe; /* depth of callsites and sp_starts arrays */
};

/* Per-instruction liveness masks; one bit per stack slot */
struct per_frame_masks {
	u64 may_read; /* stack slots that may be read by this instruction */
	u64 must_write; /* stack slots written by this instruction */
	u64 must_write_acc; /* stack slots written by this instruction and its successors */
	u64 live_before; /* stack slots that may be read by this insn and its successors */
};

/*
 * A function instance created for a specific callchain.
 * Encapsulates read and write marks for each instruction in the function.
 * Marks are tracked for each frame in the callchain.
 */
struct func_instance {
	struct hlist_node hl_node; /* membership in bpf_liveness->func_instances */
	struct callchain callchain;
	u32 insn_cnt; /* cached number of insns in the function */
	bool updated; /* accumulated marks changed since last propagation */
	bool must_write_dropped; /* a "must_write" bit was ever cleared */
	/* Per frame, per instruction masks, frames allocated lazily. */
	struct per_frame_masks *frames[MAX_CALL_FRAMES];
	/* For each instruction a flag telling if "must_write" had been initialized for it. */
	bool *must_write_set;
};

/* Resolved (callchain, insn) tuple cached between live stack queries */
struct live_stack_query {
	struct func_instance *instances[MAX_CALL_FRAMES]; /* valid in range [0..curframe] */
	u32 curframe;
	u32 insn_idx; /* instruction the query refers to */
};

struct bpf_liveness {
	DECLARE_HASHTABLE(func_instances, 8); /* maps callchain to func_instance */
	struct live_stack_query live_stack_query; /* cache to avoid repetitive ht lookups */
	/* Cached instance corresponding to env->cur_state, avoids per-instruction ht lookup */
	struct func_instance *cur_instance;
	/*
	 * Below fields are used to accumulate stack write marks for instruction at
	 * @write_insn_idx before submitting the marks to @cur_instance.
	 */
	u64 write_masks_acc[MAX_CALL_FRAMES];
	u32 write_insn_idx;
};
/* Compute callchain corresponding to state @st at depth @frameno */
static void compute_callchain(struct bpf_verifier_env *env, struct bpf_verifier_state *st,
			      struct callchain *callchain, u32 frameno)
{
	struct bpf_subprog_info *subprog_info = env->subprog_info;
	u32 i;

	memset(callchain, 0, sizeof(*callchain));
	for (i = 0; i <= frameno; i++) {
		callchain->sp_starts[i] = subprog_info[st->frame[i]->subprogno].start;
		if (i < st->curframe)
			callchain->callsites[i] = st->frame[i + 1]->callsite;
	}
	callchain->curframe = frameno;
	/* the top frame is identified by its subprogram entry, not by a callsite */
	callchain->callsites[callchain->curframe] = callchain->sp_starts[callchain->curframe];
}
/* Hash key for the func_instances table; exact match is done by same_callsites() */
static u32 hash_callchain(struct callchain *callchain)
{
	return jhash2(callchain->callsites, callchain->curframe, 0);
}
static bool same_callsites(struct callchain *a, struct callchain *b)
{
int i;
if (a->curframe != b->curframe)
return false;
for (i = a->curframe; i >= 0; i--)
if (a->callsites[i] != b->callsites[i])
return false;
return true;
}
/*
 * Find existing or allocate new function instance corresponding to @callchain.
 * Instances are accumulated in env->liveness->func_instances and persist
 * until the end of the verification process.
 */
static struct func_instance *__lookup_instance(struct bpf_verifier_env *env,
					       struct callchain *callchain)
{
	struct bpf_liveness *liveness = env->liveness;
	struct bpf_subprog_info *subprog;
	struct func_instance *result;
	u32 subprog_sz, size, key;

	key = hash_callchain(callchain);
	hash_for_each_possible(liveness->func_instances, result, hl_node, key)
		if (same_callsites(&result->callchain, callchain))
			return result;

	subprog = bpf_find_containing_subprog(env, callchain->sp_starts[callchain->curframe]);
	subprog_sz = (subprog + 1)->start - subprog->start;
	size = sizeof(struct func_instance);
	result = kvzalloc(size, GFP_KERNEL_ACCOUNT);
	if (!result)
		return ERR_PTR(-ENOMEM);
	result->must_write_set = kvcalloc(subprog_sz, sizeof(*result->must_write_set),
					  GFP_KERNEL_ACCOUNT);
	if (!result->must_write_set) {
		/*
		 * @result is not in the hash table yet, so
		 * bpf_stack_liveness_free() would never see it: free it here
		 * to avoid leaking it on this error path.
		 */
		kvfree(result);
		return ERR_PTR(-ENOMEM);
	}
	memcpy(&result->callchain, callchain, sizeof(*callchain));
	result->insn_cnt = subprog_sz;
	hash_add(liveness->func_instances, &result->hl_node, key);
	return result;
}
/* Like __lookup_instance(), but computes the callchain from @st at @frameno first */
static struct func_instance *lookup_instance(struct bpf_verifier_env *env,
					     struct bpf_verifier_state *st,
					     u32 frameno)
{
	struct callchain callchain;

	compute_callchain(env, st, &callchain, frameno);
	return __lookup_instance(env, &callchain);
}
int bpf_stack_liveness_init(struct bpf_verifier_env *env)
{
env->liveness = kvzalloc(sizeof(*env->liveness), GFP_KERNEL_ACCOUNT);
if (!env->liveness)
return -ENOMEM;
hash_init(env->liveness->func_instances);
return 0;
}
/* Release every accumulated function instance and the liveness state itself */
void bpf_stack_liveness_free(struct bpf_verifier_env *env)
{
	struct func_instance *instance;
	struct hlist_node *tmp;
	int bkt, i;

	if (!env->liveness)
		return;
	hash_for_each_safe(env->liveness->func_instances, bkt, tmp, instance, hl_node) {
		/* per-frame mask arrays are lazily allocated; kvfree(NULL) is a no-op */
		for (i = 0; i <= instance->callchain.curframe; i++)
			kvfree(instance->frames[i]);
		kvfree(instance->must_write_set);
		kvfree(instance);
	}
	kvfree(env->liveness);
}
/*
 * Convert absolute instruction index @insn_idx to an index relative
 * to start of the function corresponding to @instance.
 */
static int relative_idx(struct func_instance *instance, u32 insn_idx)
{
	/* sp_starts[curframe] is the entry insn of the owning subprogram */
	return insn_idx - instance->callchain.sp_starts[instance->callchain.curframe];
}
/* Masks for (@frame, @insn_idx), or NULL if the frame array was never allocated */
static struct per_frame_masks *get_frame_masks(struct func_instance *instance,
					       u32 frame, u32 insn_idx)
{
	if (!instance->frames[frame])
		return NULL;
	return &instance->frames[frame][relative_idx(instance, insn_idx)];
}
/* Like get_frame_masks(), but lazily allocates the per-frame array on first use */
static struct per_frame_masks *alloc_frame_masks(struct bpf_verifier_env *env,
						 struct func_instance *instance,
						 u32 frame, u32 insn_idx)
{
	struct per_frame_masks *arr;

	if (!instance->frames[frame]) {
		arr = kvcalloc(instance->insn_cnt, sizeof(*arr), GFP_KERNEL_ACCOUNT);
		instance->frames[frame] = arr;
		if (!arr)
			return ERR_PTR(-ENOMEM);
	}
	return get_frame_masks(instance, frame, insn_idx);
}
/* Forget the cached instance; the next use re-resolves it from env->cur_state */
void bpf_reset_live_stack_callchain(struct bpf_verifier_env *env)
{
	env->liveness->cur_instance = NULL;
}
/* If @env->liveness->cur_instance is null, set it to instance corresponding to @env->cur_state. */
static int ensure_cur_instance(struct bpf_verifier_env *env)
{
	struct func_instance *instance;

	if (env->liveness->cur_instance)
		return 0;

	instance = lookup_instance(env, env->cur_state, env->cur_state->curframe);
	if (IS_ERR(instance))
		return PTR_ERR(instance);

	env->liveness->cur_instance = instance;
	return 0;
}
/* Accumulate may_read masks for @frame at @insn_idx */
static int mark_stack_read(struct bpf_verifier_env *env,
			   struct func_instance *instance, u32 frame, u32 insn_idx, u64 mask)
{
	struct per_frame_masks *masks;
	u64 new_may_read;

	masks = alloc_frame_masks(env, instance, frame, insn_idx);
	if (IS_ERR(masks))
		return PTR_ERR(masks);
	new_may_read = masks->may_read | mask;
	/*
	 * Flag the instance as updated only when the new read bits are not
	 * already covered by "live_before"; otherwise the fixed point is
	 * unaffected and re-propagation is unnecessary.
	 */
	if (new_may_read != masks->may_read &&
	    ((new_may_read | masks->live_before) != masks->live_before))
		instance->updated = true;
	masks->may_read |= mask;
	return 0;
}
/* Public entry: record read marks against the instance for env->cur_state */
int bpf_mark_stack_read(struct bpf_verifier_env *env, u32 frame, u32 insn_idx, u64 mask)
{
	int err = ensure_cur_instance(env);

	if (err)
		return err;
	return mark_stack_read(env, env->liveness->cur_instance, frame, insn_idx, mask);
}
/* Begin accumulating write marks for @insn_idx: clear all per-frame accumulators */
static void reset_stack_write_marks(struct bpf_verifier_env *env,
				    struct func_instance *instance, u32 insn_idx)
{
	struct bpf_liveness *liveness = env->liveness;
	u32 frame;

	liveness->write_insn_idx = insn_idx;
	for (frame = 0; frame <= instance->callchain.curframe; frame++)
		liveness->write_masks_acc[frame] = 0;
}
/* Public entry: start write-mark accumulation for @insn_idx in the current instance */
int bpf_reset_stack_write_marks(struct bpf_verifier_env *env, u32 insn_idx)
{
	struct bpf_liveness *liveness = env->liveness;
	int err;

	err = ensure_cur_instance(env);
	if (err)
		return err;

	reset_stack_write_marks(env, liveness->cur_instance, insn_idx);
	return 0;
}
/* Accumulate a write mark for @frame; folded in later by bpf_commit_stack_write_marks() */
void bpf_mark_stack_write(struct bpf_verifier_env *env, u32 frame, u64 mask)
{
	env->liveness->write_masks_acc[frame] |= mask;
}
/*
 * Fold the write masks accumulated in liveness->write_masks_acc for
 * instruction liveness->write_insn_idx into @instance.  "must_write" is
 * an intersection across visits, so a repeat visit can only shrink it.
 *
 * Fix: @old_must_write was declared u32 while masks->must_write is u64
 * (one bit per stack slot, up to 64 slots); the assignment truncated the
 * upper 32 bits, corrupting both the intersection and change detection
 * for slots 32..63.
 */
static int commit_stack_write_marks(struct bpf_verifier_env *env,
				    struct func_instance *instance)
{
	struct bpf_liveness *liveness = env->liveness;
	u32 idx, frame, curframe;
	struct per_frame_masks *masks;
	u64 old_must_write, mask;

	if (!instance)
		return 0;

	curframe = instance->callchain.curframe;
	idx = relative_idx(instance, liveness->write_insn_idx);
	for (frame = 0; frame <= curframe; frame++) {
		mask = liveness->write_masks_acc[frame];
		/* avoid allocating frames for zero masks */
		if (mask == 0 && !instance->must_write_set[idx])
			continue;
		masks = alloc_frame_masks(env, instance, frame, liveness->write_insn_idx);
		if (IS_ERR(masks))
			return PTR_ERR(masks);
		old_must_write = masks->must_write;
		/*
		 * If instruction at this callchain is seen for a first time, set must_write equal
		 * to @mask. Otherwise take intersection with the previous value.
		 */
		if (instance->must_write_set[idx])
			mask &= old_must_write;
		if (old_must_write != mask) {
			masks->must_write = mask;
			instance->updated = true;
		}
		/* shrinking must_write can invalidate previously cached answers */
		if (old_must_write & ~mask)
			instance->must_write_dropped = true;
	}
	instance->must_write_set[idx] = true;
	liveness->write_insn_idx = 0;
	return 0;
}
/*
 * Merge stack writes marks in @env->liveness->write_masks_acc
 * with information already in @env->liveness->cur_instance.
 * Returns 0 on success or -ENOMEM if mask allocation fails.
 */
int bpf_commit_stack_write_marks(struct bpf_verifier_env *env)
{
	return commit_stack_write_marks(env, env->liveness->cur_instance);
}
/*
 * Render @callchain as "(cs0,cs1,...)" into env->tmp_str_buf and return it.
 * NOTE(review): the "buf += snprintf()" pattern assumes tmp_str_buf is
 * always large enough; snprintf() returns the would-be length, so on
 * truncation @buf could advance past @buf_end — relies on the buffer
 * being generously sized. Confirm against TMP_STR_BUF_LEN.
 */
static char *fmt_callchain(struct bpf_verifier_env *env, struct callchain *callchain)
{
	char *buf_end = env->tmp_str_buf + sizeof(env->tmp_str_buf);
	char *buf = env->tmp_str_buf;
	int i;

	buf += snprintf(buf, buf_end - buf, "(");
	for (i = 0; i <= callchain->curframe; i++)
		buf += snprintf(buf, buf_end - buf, "%s%d", i ? "," : "", callchain->callsites[i]);
	snprintf(buf, buf_end - buf, ")");
	return env->tmp_str_buf;
}
/* Log which stack-slot bits were set (+pfx) and cleared (-pfx) by a mask update */
static void log_mask_change(struct bpf_verifier_env *env, struct callchain *callchain,
			    char *pfx, u32 frame, u32 insn_idx, u64 old, u64 new)
{
	u64 diff = old ^ new;
	u64 became_set = new & diff;
	u64 became_clear = ~new & diff;

	if (!diff)
		return;

	bpf_log(&env->log, "%s frame %d insn %d ", fmt_callchain(env, callchain), frame, insn_idx);
	if (became_set) {
		bpf_fmt_stack_mask(env->tmp_str_buf, sizeof(env->tmp_str_buf), became_set);
		bpf_log(&env->log, "+%s %s ", pfx, env->tmp_str_buf);
	}
	if (became_clear) {
		bpf_fmt_stack_mask(env->tmp_str_buf, sizeof(env->tmp_str_buf), became_clear);
		bpf_log(&env->log, "-%s %s", pfx, env->tmp_str_buf);
	}
	bpf_log(&env->log, "\n");
}
/* Jump displacement of @insn; BPF_JMP32|BPF_JA carries a long offset in imm */
int bpf_jmp_offset(struct bpf_insn *insn)
{
	return insn->code == (BPF_JMP32 | BPF_JA) ? insn->imm : insn->off;
}
__diag_push();
__diag_ignore_all("-Woverride-init", "Allow field initialization overrides for opcode_info_tbl");

/*
 * Compute the instructions that may execute after @idx, writing up to two
 * successor indexes into @succ.  Returns the successor count: 0 for
 * BPF_EXIT, 1 for plain/fallthrough or unconditional jump instructions,
 * 2 for conditional branches.
 */
inline int bpf_insn_successors(struct bpf_prog *prog, u32 idx, u32 succ[2])
{
	static const struct opcode_info {
		bool can_jump;
		bool can_fallthrough;
	} opcode_info_tbl[256] = {
		/* default: non-jump instructions simply fall through */
		[0 ... 255] = {.can_jump = false, .can_fallthrough = true},
	#define _J(code, ...) \
		[BPF_JMP | code] = __VA_ARGS__, \
		[BPF_JMP32 | code] = __VA_ARGS__

		_J(BPF_EXIT, {.can_jump = false, .can_fallthrough = false}),
		_J(BPF_JA, {.can_jump = true, .can_fallthrough = false}),
		_J(BPF_JEQ, {.can_jump = true, .can_fallthrough = true}),
		_J(BPF_JNE, {.can_jump = true, .can_fallthrough = true}),
		_J(BPF_JLT, {.can_jump = true, .can_fallthrough = true}),
		_J(BPF_JLE, {.can_jump = true, .can_fallthrough = true}),
		_J(BPF_JGT, {.can_jump = true, .can_fallthrough = true}),
		_J(BPF_JGE, {.can_jump = true, .can_fallthrough = true}),
		_J(BPF_JSGT, {.can_jump = true, .can_fallthrough = true}),
		_J(BPF_JSGE, {.can_jump = true, .can_fallthrough = true}),
		_J(BPF_JSLT, {.can_jump = true, .can_fallthrough = true}),
		_J(BPF_JSLE, {.can_jump = true, .can_fallthrough = true}),
		_J(BPF_JCOND, {.can_jump = true, .can_fallthrough = true}),
		_J(BPF_JSET, {.can_jump = true, .can_fallthrough = true}),
	#undef _J
	};
	struct bpf_insn *insn = &prog->insnsi[idx];
	const struct opcode_info *opcode_info;
	int i = 0, insn_sz;

	opcode_info = &opcode_info_tbl[BPF_CLASS(insn->code) | BPF_OP(insn->code)];
	/* ldimm64 occupies two insn slots, so fallthrough skips both */
	insn_sz = bpf_is_ldimm64(insn) ? 2 : 1;
	if (opcode_info->can_fallthrough)
		succ[i++] = idx + insn_sz;

	if (opcode_info->can_jump)
		succ[i++] = idx + bpf_jmp_offset(insn) + 1;

	return i;
}

__diag_pop();
/* Return the function instance for the callchain one frame up from @instance */
static struct func_instance *get_outer_instance(struct bpf_verifier_env *env,
						struct func_instance *instance)
{
	struct callchain callchain = instance->callchain;

	/* Adjust @callchain to represent callchain one frame up */
	callchain.callsites[callchain.curframe] = 0;
	callchain.sp_starts[callchain.curframe] = 0;
	callchain.curframe--;
	/* top frame of the outer chain is keyed by its subprogram entry */
	callchain.callsites[callchain.curframe] = callchain.sp_starts[callchain.curframe];
	return __lookup_instance(env, &callchain);
}
/* Index of the first instruction of the current frame's subprogram */
static u32 callchain_subprog_start(struct callchain *callchain)
{
	u32 frame = callchain->curframe;

	return callchain->sp_starts[frame];
}
/*
 * Transfer @may_read and @must_write_acc marks from the first instruction of @instance,
 * to the call instruction in function instance calling @instance.
 */
static int propagate_to_outer_instance(struct bpf_verifier_env *env,
				       struct func_instance *instance)
{
	struct callchain *callchain = &instance->callchain;
	u32 this_subprog_start, callsite, frame;
	struct func_instance *outer_instance;
	struct per_frame_masks *insn;
	int err;

	this_subprog_start = callchain_subprog_start(callchain);
	outer_instance = get_outer_instance(env, instance);
	/*
	 * get_outer_instance() returns __lookup_instance()'s result, which
	 * may be an ERR_PTR (cf. IS_ERR() checks on lookup_instance()
	 * elsewhere in this file); do not dereference it unchecked.
	 */
	if (IS_ERR(outer_instance))
		return PTR_ERR(outer_instance);
	/* Call instruction in the caller that enters @instance */
	callsite = callchain->callsites[callchain->curframe - 1];

	reset_stack_write_marks(env, outer_instance, callsite);
	for (frame = 0; frame < callchain->curframe; frame++) {
		/* Masks accumulated at the subprogram's entry instruction */
		insn = get_frame_masks(instance, frame, this_subprog_start);
		if (!insn)
			continue;
		bpf_mark_stack_write(env, frame, insn->must_write_acc);
		err = mark_stack_read(env, outer_instance, frame, callsite, insn->live_before);
		if (err)
			return err;
	}
	commit_stack_write_marks(env, outer_instance);
	return 0;
}
/*
 * One transfer-function step of the data flow analysis for @insn_idx:
 * recompute the "live_before" and "must_write_acc" masks from the marks
 * of the instruction's successors. Returns true if either mask changed,
 * so the caller knows the fixed point has not been reached yet.
 */
static inline bool update_insn(struct bpf_verifier_env *env,
			       struct func_instance *instance, u32 frame, u32 insn_idx)
{
	struct bpf_insn_aux_data *aux = env->insn_aux_data;
	u64 new_before, new_after, must_write_acc;
	struct per_frame_masks *insn, *succ_insn;
	u32 succ_num, s, succ[2];
	bool changed;

	succ_num = bpf_insn_successors(env->prog, insn_idx, succ);
	/* BPF_EXIT has no successors, nothing to propagate */
	if (unlikely(succ_num == 0))
		return false;

	changed = false;
	insn = get_frame_masks(instance, frame, insn_idx);
	new_before = 0;
	new_after = 0;
	/*
	 * New "must_write_acc" is an intersection of all "must_write_acc"
	 * of successors plus all "must_write" slots of instruction itself.
	 */
	must_write_acc = U64_MAX;
	for (s = 0; s < succ_num; ++s) {
		succ_insn = get_frame_masks(instance, frame, succ[s]);
		new_after |= succ_insn->live_before;
		must_write_acc &= succ_insn->must_write_acc;
	}
	must_write_acc |= insn->must_write;
	/*
	 * New "live_before" is a union of all "live_before" of successors
	 * minus slots written by instruction plus slots read by instruction.
	 */
	new_before = (new_after & ~insn->must_write) | insn->may_read;
	changed |= new_before != insn->live_before;
	changed |= must_write_acc != insn->must_write_acc;
	/* Trace mask changes at "interesting" instructions when verbose */
	if (unlikely(env->log.level & BPF_LOG_LEVEL2) &&
	    (insn->may_read || insn->must_write ||
	     insn_idx == callchain_subprog_start(&instance->callchain) ||
	     aux[insn_idx].prune_point)) {
		log_mask_change(env, &instance->callchain, "live",
				frame, insn_idx, insn->live_before, new_before);
		log_mask_change(env, &instance->callchain, "written",
				frame, insn_idx, insn->must_write_acc, must_write_acc);
	}
	insn->live_before = new_before;
	insn->must_write_acc = must_write_acc;
	return changed;
}
/* Fixed-point computation of @live_before and @must_write_acc marks */
static int update_instance(struct bpf_verifier_env *env, struct func_instance *instance)
{
	u32 i, frame, po_start, po_end, cnt, this_subprog_start;
	struct callchain *callchain = &instance->callchain;
	int *insn_postorder = env->cfg.insn_postorder;
	struct bpf_subprog_info *subprog;
	struct per_frame_masks *insn;
	bool changed;
	int err;

	this_subprog_start = callchain_subprog_start(callchain);
	/*
	 * If must_write marks were updated must_write_acc needs to be reset
	 * (to account for the case when new must_write sets became smaller).
	 */
	if (instance->must_write_dropped) {
		for (frame = 0; frame <= callchain->curframe; frame++) {
			if (!instance->frames[frame])
				continue;
			for (i = 0; i < instance->insn_cnt; i++) {
				insn = get_frame_masks(instance, frame, this_subprog_start + i);
				insn->must_write_acc = 0;
			}
		}
	}

	/* Visit this subprogram's instructions in CFG post-order */
	subprog = bpf_find_containing_subprog(env, this_subprog_start);
	po_start = subprog->postorder_start;
	po_end = (subprog + 1)->postorder_start;
	cnt = 0;
	/* repeat until fixed point is reached */
	do {
		cnt++;
		changed = false;
		for (frame = 0; frame <= instance->callchain.curframe; frame++) {
			/* skip frames for which no stack access was recorded */
			if (!instance->frames[frame])
				continue;
			for (i = po_start; i < po_end; i++)
				changed |= update_insn(env, instance, frame, insn_postorder[i]);
		}
	} while (changed);

	if (env->log.level & BPF_LOG_LEVEL2)
		bpf_log(&env->log, "%s live stack update done in %d iterations\n",
			fmt_callchain(env, callchain), cnt);

	/* transfer marks accumulated for outer frames to outer func instance (caller) */
	if (callchain->curframe > 0) {
		err = propagate_to_outer_instance(env, instance);
		if (err)
			return err;
	}
	return 0;
}
/*
 * Prepare all callchains within @env->cur_state for querying.
 * This function should be called after each verifier.c:pop_stack()
 * and whenever verifier.c:do_check_insn() processes subprogram exit.
 * This would guarantee that visited verifier states with zero branches
 * have their bpf_mark_stack_{read,write}() effects propagated in
 * @env->liveness.
 */
int bpf_update_live_stack(struct bpf_verifier_env *env)
{
	struct func_instance *instance;
	int frame, err;

	bpf_reset_live_stack_callchain(env);
	/* Walk frames innermost to outermost */
	for (frame = env->cur_state->curframe; frame >= 0; frame--) {
		instance = lookup_instance(env, env->cur_state, frame);
		if (IS_ERR(instance))
			return PTR_ERR(instance);

		/* Nothing to recompute unless new marks were recorded */
		if (!instance->updated)
			continue;

		err = update_instance(env, instance);
		if (err)
			return err;
		instance->updated = false;
		instance->must_write_dropped = false;
	}
	return 0;
}
/*
 * True if stack slot @spi of frame @frameno is in the "live_before"
 * mask of instruction @insn_idx; false when no masks were recorded.
 */
static bool is_live_before(struct func_instance *instance, u32 insn_idx, u32 frameno, u32 spi)
{
	struct per_frame_masks *masks = get_frame_masks(instance, frameno, insn_idx);

	if (!masks)
		return false;
	return (masks->live_before & BIT(spi)) != 0;
}
/*
 * Cache the func_instance for every frame of @st in
 * @env->liveness->live_stack_query, so that subsequent
 * bpf_stack_slot_alive() calls can query liveness cheaply.
 * Returns 0 on success or a negative errno from instance lookup.
 */
int bpf_live_stack_query_init(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
{
	struct live_stack_query *q = &env->liveness->live_stack_query;
	struct func_instance *instance;
	u32 i;

	memset(q, 0, sizeof(*q));
	for (i = 0; i <= st->curframe; i++) {
		instance = lookup_instance(env, st, i);
		if (IS_ERR(instance))
			return PTR_ERR(instance);
		q->instances[i] = instance;
	}
	q->curframe = st->curframe;
	q->insn_idx = st->insn_idx;
	return 0;
}
/*
 * Slot is alive if it is read before q->st->insn_idx in current func instance,
 * or if for some outer func instance:
 * - alive before callsite if callsite calls callback, otherwise
 * - alive after callsite
 */
bool bpf_stack_slot_alive(struct bpf_verifier_env *env, u32 frameno, u32 spi)
{
	struct live_stack_query *q = &env->liveness->live_stack_query;
	struct func_instance *cur, *outer;
	u32 frame, callsite;

	cur = q->instances[q->curframe];
	if (is_live_before(cur, q->insn_idx, frameno, spi))
		return true;

	for (frame = frameno; frame < q->curframe; frame++) {
		callsite = cur->callchain.callsites[frame];
		outer = q->instances[frame];
		if (bpf_calls_callback(env, callsite)) {
			/* callback may run: liveness at the call itself */
			if (is_live_before(outer, callsite, frameno, spi))
				return true;
		} else if (is_live_before(outer, callsite + 1, frameno, spi)) {
			/* liveness after return from the call */
			return true;
		}
	}
	return false;
}

View File

@@ -165,7 +165,7 @@ static long cgroup_storage_update_elem(struct bpf_map *map, void *key,
}
new = bpf_map_kmalloc_node(map, struct_size(new, data, map->value_size),
__GFP_ZERO | GFP_NOWAIT | __GFP_NOWARN,
__GFP_ZERO | GFP_NOWAIT,
map->numa_node);
if (!new)
return -ENOMEM;

View File

@@ -498,6 +498,8 @@ const char *dynptr_type_str(enum bpf_dynptr_type type)
return "skb";
case BPF_DYNPTR_TYPE_XDP:
return "xdp";
case BPF_DYNPTR_TYPE_SKB_META:
return "skb_meta";
case BPF_DYNPTR_TYPE_INVALID:
return "<invalid>";
default:
@@ -540,19 +542,6 @@ static char slot_type_char[] = {
[STACK_IRQ_FLAG] = 'f'
};
static void print_liveness(struct bpf_verifier_env *env,
enum bpf_reg_liveness live)
{
if (live & (REG_LIVE_READ | REG_LIVE_WRITTEN | REG_LIVE_DONE))
verbose(env, "_");
if (live & REG_LIVE_READ)
verbose(env, "r");
if (live & REG_LIVE_WRITTEN)
verbose(env, "w");
if (live & REG_LIVE_DONE)
verbose(env, "D");
}
#define UNUM_MAX_DECIMAL U16_MAX
#define SNUM_MAX_DECIMAL S16_MAX
#define SNUM_MIN_DECIMAL S16_MIN
@@ -770,7 +759,6 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_verifie
if (!print_all && !reg_scratched(env, i))
continue;
verbose(env, " R%d", i);
print_liveness(env, reg->live);
verbose(env, "=");
print_reg_state(env, state, reg);
}
@@ -803,9 +791,7 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_verifie
break;
types_buf[j] = '\0';
verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
print_liveness(env, reg->live);
verbose(env, "=%s", types_buf);
verbose(env, " fp%d=%s", (-i - 1) * BPF_REG_SIZE, types_buf);
print_reg_state(env, state, reg);
break;
case STACK_DYNPTR:
@@ -814,7 +800,6 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_verifie
reg = &state->stack[i].spilled_ptr;
verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
print_liveness(env, reg->live);
verbose(env, "=dynptr_%s(", dynptr_type_str(reg->dynptr.type));
if (reg->id)
verbose_a("id=%d", reg->id);
@@ -829,9 +814,8 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_verifie
if (!reg->ref_obj_id)
continue;
verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
print_liveness(env, reg->live);
verbose(env, "=iter_%s(ref_id=%d,state=%s,depth=%u)",
verbose(env, " fp%d=iter_%s(ref_id=%d,state=%s,depth=%u)",
(-i - 1) * BPF_REG_SIZE,
iter_type_str(reg->iter.btf, reg->iter.btf_id),
reg->ref_obj_id, iter_state_str(reg->iter.state),
reg->iter.depth);
@@ -839,9 +823,7 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_verifie
case STACK_MISC:
case STACK_ZERO:
default:
verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
print_liveness(env, reg->live);
verbose(env, "=%s", types_buf);
verbose(env, " fp%d=%s", (-i - 1) * BPF_REG_SIZE, types_buf);
break;
}
}

View File

@@ -736,7 +736,7 @@ static void destroy_mem_alloc(struct bpf_mem_alloc *ma, int rcu_in_progress)
/* Defer barriers into worker to let the rest of map memory to be freed */
memset(ma, 0, sizeof(*ma));
INIT_WORK(&copy->work, free_mem_alloc_deferred);
queue_work(system_unbound_wq, &copy->work);
queue_work(system_dfl_wq, &copy->work);
}
void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)

View File

@@ -646,7 +646,15 @@ static void *stack_map_lookup_elem(struct bpf_map *map, void *key)
}
/* Called from syscall */
int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
static int stack_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
void *value, u64 flags)
{
return bpf_stackmap_extract(map, key, value, true);
}
/* Called from syscall */
int bpf_stackmap_extract(struct bpf_map *map, void *key, void *value,
bool delete)
{
struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
struct stack_map_bucket *bucket, *old_bucket;
@@ -663,7 +671,10 @@ int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
memcpy(value, bucket->data, trace_len);
memset(value + trace_len, 0, map->value_size - trace_len);
old_bucket = xchg(&smap->buckets[id], bucket);
if (delete)
old_bucket = bucket;
else
old_bucket = xchg(&smap->buckets[id], bucket);
if (old_bucket)
pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
return 0;
@@ -754,6 +765,7 @@ const struct bpf_map_ops stack_trace_map_ops = {
.map_free = stack_map_free,
.map_get_next_key = stack_map_get_next_key,
.map_lookup_elem = stack_map_lookup_elem,
.map_lookup_and_delete_elem = stack_map_lookup_and_delete_elem,
.map_update_elem = stack_map_update_elem,
.map_delete_elem = stack_map_delete_elem,
.map_check_btf = map_check_no_btf,

View File

@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
*/
#include <crypto/sha2.h>
#include <linux/bpf.h>
#include <linux/bpf-cgroup.h>
#include <linux/bpf_trace.h>
@@ -38,6 +39,7 @@
#include <linux/tracepoint.h>
#include <linux/overflow.h>
#include <linux/cookie.h>
#include <linux/verification.h>
#include <net/netfilter/nf_bpf_link.h>
#include <net/netkit.h>
@@ -318,7 +320,7 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value,
} else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
err = bpf_percpu_cgroup_storage_copy(map, key, value);
} else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
err = bpf_stackmap_copy(map, key, value);
err = bpf_stackmap_extract(map, key, value, false);
} else if (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map)) {
err = bpf_fd_array_map_lookup_elem(map, key, value);
} else if (IS_FD_HASH(map)) {
@@ -672,6 +674,7 @@ void btf_record_free(struct btf_record *rec)
case BPF_TIMER:
case BPF_REFCOUNT:
case BPF_WORKQUEUE:
case BPF_TASK_WORK:
/* Nothing to release */
break;
default:
@@ -725,6 +728,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec)
case BPF_TIMER:
case BPF_REFCOUNT:
case BPF_WORKQUEUE:
case BPF_TASK_WORK:
/* Nothing to acquire */
break;
default:
@@ -783,6 +787,13 @@ void bpf_obj_free_workqueue(const struct btf_record *rec, void *obj)
bpf_wq_cancel_and_free(obj + rec->wq_off);
}
void bpf_obj_free_task_work(const struct btf_record *rec, void *obj)
{
if (WARN_ON_ONCE(!btf_record_has_field(rec, BPF_TASK_WORK)))
return;
bpf_task_work_cancel_and_free(obj + rec->task_work_off);
}
void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
{
const struct btf_field *fields;
@@ -807,6 +818,9 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
case BPF_WORKQUEUE:
bpf_wq_cancel_and_free(field_ptr);
break;
case BPF_TASK_WORK:
bpf_task_work_cancel_and_free(field_ptr);
break;
case BPF_KPTR_UNREF:
WRITE_ONCE(*(u64 *)field_ptr, 0);
break;
@@ -860,6 +874,7 @@ static void bpf_map_free(struct bpf_map *map)
* the free of values or special fields allocated from bpf memory
* allocator.
*/
kfree(map->excl_prog_sha);
migrate_disable();
map->ops->map_free(map);
migrate_enable();
@@ -905,7 +920,7 @@ static void bpf_map_free_in_work(struct bpf_map *map)
/* Avoid spawning kworkers, since they all might contend
* for the same mutex like slab_mutex.
*/
queue_work(system_unbound_wq, &map->work);
queue_work(system_dfl_wq, &map->work);
}
static void bpf_map_free_rcu_gp(struct rcu_head *rcu)
@@ -1237,7 +1252,8 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token,
map->record = btf_parse_fields(btf, value_type,
BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD |
BPF_RB_ROOT | BPF_REFCOUNT | BPF_WORKQUEUE | BPF_UPTR,
BPF_RB_ROOT | BPF_REFCOUNT | BPF_WORKQUEUE | BPF_UPTR |
BPF_TASK_WORK,
map->value_size);
if (!IS_ERR_OR_NULL(map->record)) {
int i;
@@ -1269,6 +1285,7 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token,
break;
case BPF_TIMER:
case BPF_WORKQUEUE:
case BPF_TASK_WORK:
if (map->map_type != BPF_MAP_TYPE_HASH &&
map->map_type != BPF_MAP_TYPE_LRU_HASH &&
map->map_type != BPF_MAP_TYPE_ARRAY) {
@@ -1338,9 +1355,9 @@ static bool bpf_net_capable(void)
return capable(CAP_NET_ADMIN) || capable(CAP_SYS_ADMIN);
}
#define BPF_MAP_CREATE_LAST_FIELD map_token_fd
#define BPF_MAP_CREATE_LAST_FIELD excl_prog_hash_size
/* called via syscall */
static int map_create(union bpf_attr *attr, bool kernel)
static int map_create(union bpf_attr *attr, bpfptr_t uattr)
{
const struct bpf_map_ops *ops;
struct bpf_token *token = NULL;
@@ -1534,7 +1551,29 @@ static int map_create(union bpf_attr *attr, bool kernel)
attr->btf_vmlinux_value_type_id;
}
err = security_bpf_map_create(map, attr, token, kernel);
if (attr->excl_prog_hash) {
bpfptr_t uprog_hash = make_bpfptr(attr->excl_prog_hash, uattr.is_kernel);
if (attr->excl_prog_hash_size != SHA256_DIGEST_SIZE) {
err = -EINVAL;
goto free_map;
}
map->excl_prog_sha = kzalloc(SHA256_DIGEST_SIZE, GFP_KERNEL);
if (!map->excl_prog_sha) {
err = -ENOMEM;
goto free_map;
}
if (copy_from_bpfptr(map->excl_prog_sha, uprog_hash, SHA256_DIGEST_SIZE)) {
err = -EFAULT;
goto free_map;
}
} else if (attr->excl_prog_hash_size) {
return -EINVAL;
}
err = security_bpf_map_create(map, attr, token, uattr.is_kernel);
if (err)
goto free_map_sec;
@@ -1627,7 +1666,8 @@ struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map)
}
EXPORT_SYMBOL_GPL(bpf_map_inc_not_zero);
int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
int __weak bpf_stackmap_extract(struct bpf_map *map, void *key, void *value,
bool delete)
{
return -ENOTSUPP;
}
@@ -2158,7 +2198,8 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
} else if (map->map_type == BPF_MAP_TYPE_HASH ||
map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
map->map_type == BPF_MAP_TYPE_LRU_HASH ||
map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
if (!bpf_map_is_offloaded(map)) {
bpf_disable_instrumentation();
rcu_read_lock();
@@ -2761,8 +2802,44 @@ static bool is_perfmon_prog_type(enum bpf_prog_type prog_type)
}
}
static int bpf_prog_verify_signature(struct bpf_prog *prog, union bpf_attr *attr,
bool is_kernel)
{
bpfptr_t usig = make_bpfptr(attr->signature, is_kernel);
struct bpf_dynptr_kern sig_ptr, insns_ptr;
struct bpf_key *key = NULL;
void *sig;
int err = 0;
if (system_keyring_id_check(attr->keyring_id) == 0)
key = bpf_lookup_system_key(attr->keyring_id);
else
key = bpf_lookup_user_key(attr->keyring_id, 0);
if (!key)
return -EINVAL;
sig = kvmemdup_bpfptr(usig, attr->signature_size);
if (IS_ERR(sig)) {
bpf_key_put(key);
return -ENOMEM;
}
bpf_dynptr_init(&sig_ptr, sig, BPF_DYNPTR_TYPE_LOCAL, 0,
attr->signature_size);
bpf_dynptr_init(&insns_ptr, prog->insnsi, BPF_DYNPTR_TYPE_LOCAL, 0,
prog->len * sizeof(struct bpf_insn));
err = bpf_verify_pkcs7_signature((struct bpf_dynptr *)&insns_ptr,
(struct bpf_dynptr *)&sig_ptr, key);
bpf_key_put(key);
kvfree(sig);
return err;
}
/* last field in 'union bpf_attr' used by this command */
#define BPF_PROG_LOAD_LAST_FIELD fd_array_cnt
#define BPF_PROG_LOAD_LAST_FIELD keyring_id
static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
{
@@ -2926,6 +3003,12 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
/* eBPF programs must be GPL compatible to use GPL-ed functions */
prog->gpl_compatible = license_is_gpl_compatible(license) ? 1 : 0;
if (attr->signature) {
err = bpf_prog_verify_signature(prog, attr, uattr.is_kernel);
if (err)
goto free_prog;
}
prog->orig_prog = NULL;
prog->jited = 0;
@@ -5161,6 +5244,9 @@ static int bpf_map_get_info_by_fd(struct file *file,
info_len = min_t(u32, sizeof(info), info_len);
memset(&info, 0, sizeof(info));
if (copy_from_user(&info, uinfo, info_len))
return -EFAULT;
info.type = map->map_type;
info.id = map->id;
info.key_size = map->key_size;
@@ -5185,6 +5271,25 @@ static int bpf_map_get_info_by_fd(struct file *file,
return err;
}
if (info.hash) {
char __user *uhash = u64_to_user_ptr(info.hash);
if (!map->ops->map_get_hash)
return -EINVAL;
if (info.hash_size != SHA256_DIGEST_SIZE)
return -EINVAL;
err = map->ops->map_get_hash(map, SHA256_DIGEST_SIZE, map->sha);
if (err != 0)
return err;
if (copy_to_user(uhash, map->sha, SHA256_DIGEST_SIZE) != 0)
return -EFAULT;
} else if (info.hash_size) {
return -EINVAL;
}
if (copy_to_user(uinfo, &info, info_len) ||
put_user(info_len, &uattr->info.info_len))
return -EFAULT;
@@ -6008,7 +6113,7 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size)
switch (cmd) {
case BPF_MAP_CREATE:
err = map_create(&attr, uattr.is_kernel);
err = map_create(&attr, uattr);
break;
case BPF_MAP_LOOKUP_ELEM:
err = map_lookup_elem(&attr);

View File

@@ -116,31 +116,55 @@ struct tnum tnum_xor(struct tnum a, struct tnum b)
return TNUM(v & ~mu, mu);
}
/* Generate partial products by multiplying each bit in the multiplier (tnum a)
* with the multiplicand (tnum b), and add the partial products after
* appropriately bit-shifting them. Instead of directly performing tnum addition
* on the generated partial products, equivalenty, decompose each partial
* product into two tnums, consisting of the value-sum (acc_v) and the
* mask-sum (acc_m) and then perform tnum addition on them. The following paper
* explains the algorithm in more detail: https://arxiv.org/abs/2105.05398.
/* Perform long multiplication, iterating through the bits in a using rshift:
* - if LSB(a) is a known 0, keep current accumulator
* - if LSB(a) is a known 1, add b to current accumulator
* - if LSB(a) is unknown, take a union of the above cases.
*
* For example:
*
* acc_0: acc_1:
*
* 11 * -> 11 * -> 11 * -> union(0011, 1001) == x0x1
* x1 01 11
* ------ ------ ------
* 11 11 11
* xx 00 11
* ------ ------ ------
* ???? 0011 1001
*/
struct tnum tnum_mul(struct tnum a, struct tnum b)
{
u64 acc_v = a.value * b.value;
struct tnum acc_m = TNUM(0, 0);
struct tnum acc = TNUM(0, 0);
while (a.value || a.mask) {
/* LSB of tnum a is a certain 1 */
if (a.value & 1)
acc_m = tnum_add(acc_m, TNUM(0, b.mask));
acc = tnum_add(acc, b);
/* LSB of tnum a is uncertain */
else if (a.mask & 1)
acc_m = tnum_add(acc_m, TNUM(0, b.value | b.mask));
else if (a.mask & 1) {
/* acc = tnum_union(acc_0, acc_1), where acc_0 and
* acc_1 are partial accumulators for cases
* LSB(a) = certain 0 and LSB(a) = certain 1.
* acc_0 = acc + 0 * b = acc.
* acc_1 = acc + 1 * b = tnum_add(acc, b).
*/
acc = tnum_union(acc, tnum_add(acc, b));
}
/* Note: no case for LSB is certain 0 */
a = tnum_rshift(a, 1);
b = tnum_lshift(b, 1);
}
return tnum_add(TNUM(acc_v, 0), acc_m);
return acc;
}
bool tnum_overlap(struct tnum a, struct tnum b)
{
u64 mu;
mu = ~a.mask & ~b.mask;
return (a.value & mu) == (b.value & mu);
}
/* Note that if a and b disagree - i.e. one has a 'known 1' where the other has
@@ -155,6 +179,19 @@ struct tnum tnum_intersect(struct tnum a, struct tnum b)
return TNUM(v & ~mu, mu);
}
/* Returns a tnum with the uncertainty from both a and b, and in addition, new
* uncertainty at any position that a and b disagree. This represents a
* superset of the union of the concrete sets of both a and b. Despite the
* overapproximation, it is optimal.
*/
struct tnum tnum_union(struct tnum a, struct tnum b)
{
u64 v = a.value & b.value;
u64 mu = (a.value ^ b.value) | a.mask | b.mask;
return TNUM(v & ~mu, mu);
}
struct tnum tnum_cast(struct tnum a, u8 size)
{
a.value &= (1ULL << (size * 8)) - 1;

View File

@@ -899,8 +899,7 @@ static __always_inline u64 notrace bpf_prog_start_time(void)
static u64 notrace __bpf_prog_enter_recur(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx)
__acquires(RCU)
{
rcu_read_lock();
migrate_disable();
rcu_read_lock_dont_migrate();
run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
@@ -949,8 +948,7 @@ static void notrace __bpf_prog_exit_recur(struct bpf_prog *prog, u64 start,
update_prog_stats(prog, start);
this_cpu_dec(*(prog->active));
migrate_enable();
rcu_read_unlock();
rcu_read_unlock_migrate();
}
static u64 notrace __bpf_prog_enter_lsm_cgroup(struct bpf_prog *prog,
@@ -960,8 +958,7 @@ static u64 notrace __bpf_prog_enter_lsm_cgroup(struct bpf_prog *prog,
/* Runtime stats are exported via actual BPF_LSM_CGROUP
* programs, not the shims.
*/
rcu_read_lock();
migrate_disable();
rcu_read_lock_dont_migrate();
run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
@@ -974,8 +971,7 @@ static void notrace __bpf_prog_exit_lsm_cgroup(struct bpf_prog *prog, u64 start,
{
bpf_reset_run_ctx(run_ctx->saved_run_ctx);
migrate_enable();
rcu_read_unlock();
rcu_read_unlock_migrate();
}
u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog,
@@ -1033,8 +1029,7 @@ static u64 notrace __bpf_prog_enter(struct bpf_prog *prog,
struct bpf_tramp_run_ctx *run_ctx)
__acquires(RCU)
{
rcu_read_lock();
migrate_disable();
rcu_read_lock_dont_migrate();
run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
@@ -1048,8 +1043,7 @@ static void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start,
bpf_reset_run_ctx(run_ctx->saved_run_ctx);
update_prog_stats(prog, start);
migrate_enable();
rcu_read_unlock();
rcu_read_unlock_migrate();
}
void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr)

File diff suppressed because it is too large Load Diff

View File

@@ -6481,15 +6481,15 @@ void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen)
}
/*
* cgroup_get_from_id : get the cgroup associated with cgroup id
* __cgroup_get_from_id : get the cgroup associated with cgroup id
* @id: cgroup id
* On success return the cgrp or ERR_PTR on failure
* Only cgroups within current task's cgroup NS are valid.
* There are no cgroup NS restrictions.
*/
struct cgroup *cgroup_get_from_id(u64 id)
struct cgroup *__cgroup_get_from_id(u64 id)
{
struct kernfs_node *kn;
struct cgroup *cgrp, *root_cgrp;
struct cgroup *cgrp;
kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id);
if (!kn)
@@ -6511,6 +6511,22 @@ struct cgroup *cgroup_get_from_id(u64 id)
if (!cgrp)
return ERR_PTR(-ENOENT);
return cgrp;
}
/*
* cgroup_get_from_id : get the cgroup associated with cgroup id
* @id: cgroup id
* On success return the cgrp or ERR_PTR on failure
* Only cgroups within current task's cgroup NS are valid.
*/
struct cgroup *cgroup_get_from_id(u64 id)
{
struct cgroup *cgrp, *root_cgrp;
cgrp = __cgroup_get_from_id(id);
if (IS_ERR(cgrp))
return cgrp;
root_cgrp = current_cgns_cgroup_dfl();
if (!cgroup_is_descendant(cgrp, root_cgrp)) {

View File

@@ -11245,6 +11245,10 @@ static int __perf_event_set_bpf_prog(struct perf_event *event,
if (prog->kprobe_override && !is_kprobe)
return -EINVAL;
/* Writing to context allowed only for uprobes. */
if (prog->aux->kprobe_write_ctx && !is_uprobe)
return -EINVAL;
if (is_tracepoint || is_syscall_tp) {
int off = trace_event_get_offsets(event->tp_event);

View File

@@ -2765,6 +2765,13 @@ static void handle_swbp(struct pt_regs *regs)
handler_chain(uprobe, regs);
/*
* If user decided to take execution elsewhere, it makes little sense
* to execute the original instruction, so let's skip it.
*/
if (instruction_pointer(regs) != bp_vaddr)
goto out;
/* Try to optimize after first hit. */
arch_uprobe_optimize(&uprobe->arch, bp_vaddr);

View File

@@ -22,7 +22,6 @@
#include <linux/bsearch.h>
#include <linux/sort.h>
#include <linux/key.h>
#include <linux/verification.h>
#include <linux/namei.h>
#include <net/bpf_sk_storage.h>
@@ -1241,188 +1240,6 @@ static const struct bpf_func_proto bpf_get_func_arg_cnt_proto = {
.arg1_type = ARG_PTR_TO_CTX,
};
#ifdef CONFIG_KEYS
__bpf_kfunc_start_defs();
/**
* bpf_lookup_user_key - lookup a key by its serial
* @serial: key handle serial number
* @flags: lookup-specific flags
*
* Search a key with a given *serial* and the provided *flags*.
* If found, increment the reference count of the key by one, and
* return it in the bpf_key structure.
*
* The bpf_key structure must be passed to bpf_key_put() when done
* with it, so that the key reference count is decremented and the
* bpf_key structure is freed.
*
* Permission checks are deferred to the time the key is used by
* one of the available key-specific kfuncs.
*
* Set *flags* with KEY_LOOKUP_CREATE, to attempt creating a requested
* special keyring (e.g. session keyring), if it doesn't yet exist.
* Set *flags* with KEY_LOOKUP_PARTIAL, to lookup a key without waiting
* for the key construction, and to retrieve uninstantiated keys (keys
* without data attached to them).
*
* Return: a bpf_key pointer with a valid key pointer if the key is found, a
* NULL pointer otherwise.
*/
__bpf_kfunc struct bpf_key *bpf_lookup_user_key(s32 serial, u64 flags)
{
key_ref_t key_ref;
struct bpf_key *bkey;
if (flags & ~KEY_LOOKUP_ALL)
return NULL;
/*
* Permission check is deferred until the key is used, as the
* intent of the caller is unknown here.
*/
key_ref = lookup_user_key(serial, flags, KEY_DEFER_PERM_CHECK);
if (IS_ERR(key_ref))
return NULL;
bkey = kmalloc(sizeof(*bkey), GFP_KERNEL);
if (!bkey) {
key_put(key_ref_to_ptr(key_ref));
return NULL;
}
bkey->key = key_ref_to_ptr(key_ref);
bkey->has_ref = true;
return bkey;
}
/**
* bpf_lookup_system_key - lookup a key by a system-defined ID
* @id: key ID
*
* Obtain a bpf_key structure with a key pointer set to the passed key ID.
* The key pointer is marked as invalid, to prevent bpf_key_put() from
* attempting to decrement the key reference count on that pointer. The key
* pointer set in such way is currently understood only by
* verify_pkcs7_signature().
*
* Set *id* to one of the values defined in include/linux/verification.h:
* 0 for the primary keyring (immutable keyring of system keys);
* VERIFY_USE_SECONDARY_KEYRING for both the primary and secondary keyring
* (where keys can be added only if they are vouched for by existing keys
* in those keyrings); VERIFY_USE_PLATFORM_KEYRING for the platform
* keyring (primarily used by the integrity subsystem to verify a kexec'ed
 * kernel image and, possibly, the initramfs signature).
*
* Return: a bpf_key pointer with an invalid key pointer set from the
* pre-determined ID on success, a NULL pointer otherwise
*/
__bpf_kfunc struct bpf_key *bpf_lookup_system_key(u64 id)
{
struct bpf_key *bkey;
if (system_keyring_id_check(id) < 0)
return NULL;
bkey = kmalloc(sizeof(*bkey), GFP_ATOMIC);
if (!bkey)
return NULL;
bkey->key = (struct key *)(unsigned long)id;
bkey->has_ref = false;
return bkey;
}
/**
* bpf_key_put - decrement key reference count if key is valid and free bpf_key
* @bkey: bpf_key structure
*
* Decrement the reference count of the key inside *bkey*, if the pointer
* is valid, and free *bkey*.
*/
__bpf_kfunc void bpf_key_put(struct bpf_key *bkey)
{
if (bkey->has_ref)
key_put(bkey->key);
kfree(bkey);
}
#ifdef CONFIG_SYSTEM_DATA_VERIFICATION
/**
* bpf_verify_pkcs7_signature - verify a PKCS#7 signature
* @data_p: data to verify
* @sig_p: signature of the data
* @trusted_keyring: keyring with keys trusted for signature verification
*
* Verify the PKCS#7 signature *sig_ptr* against the supplied *data_ptr*
* with keys in a keyring referenced by *trusted_keyring*.
*
* Return: 0 on success, a negative value on error.
*/
__bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_p,
struct bpf_dynptr *sig_p,
struct bpf_key *trusted_keyring)
{
struct bpf_dynptr_kern *data_ptr = (struct bpf_dynptr_kern *)data_p;
struct bpf_dynptr_kern *sig_ptr = (struct bpf_dynptr_kern *)sig_p;
const void *data, *sig;
u32 data_len, sig_len;
int ret;
if (trusted_keyring->has_ref) {
/*
* Do the permission check deferred in bpf_lookup_user_key().
* See bpf_lookup_user_key() for more details.
*
* A call to key_task_permission() here would be redundant, as
* it is already done by keyring_search() called by
* find_asymmetric_key().
*/
ret = key_validate(trusted_keyring->key);
if (ret < 0)
return ret;
}
data_len = __bpf_dynptr_size(data_ptr);
data = __bpf_dynptr_data(data_ptr, data_len);
sig_len = __bpf_dynptr_size(sig_ptr);
sig = __bpf_dynptr_data(sig_ptr, sig_len);
return verify_pkcs7_signature(data, data_len, sig, sig_len,
trusted_keyring->key,
VERIFYING_UNSPECIFIED_SIGNATURE, NULL,
NULL);
}
#endif /* CONFIG_SYSTEM_DATA_VERIFICATION */
__bpf_kfunc_end_defs();
BTF_KFUNCS_START(key_sig_kfunc_set)
BTF_ID_FLAGS(func, bpf_lookup_user_key, KF_ACQUIRE | KF_RET_NULL | KF_SLEEPABLE)
BTF_ID_FLAGS(func, bpf_lookup_system_key, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_key_put, KF_RELEASE)
#ifdef CONFIG_SYSTEM_DATA_VERIFICATION
BTF_ID_FLAGS(func, bpf_verify_pkcs7_signature, KF_SLEEPABLE)
#endif
BTF_KFUNCS_END(key_sig_kfunc_set)
static const struct btf_kfunc_id_set bpf_key_sig_kfunc_set = {
.owner = THIS_MODULE,
.set = &key_sig_kfunc_set,
};
static int __init bpf_key_sig_kfuncs_init(void)
{
return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING,
&bpf_key_sig_kfunc_set);
}
late_initcall(bpf_key_sig_kfuncs_init);
#endif /* CONFIG_KEYS */
static const struct bpf_func_proto *
bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
@@ -1521,8 +1338,6 @@ static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type
{
if (off < 0 || off >= sizeof(struct pt_regs))
return false;
if (type != BPF_READ)
return false;
if (off % size != 0)
return false;
/*
@@ -1532,6 +1347,9 @@ static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type
if (off + size > sizeof(struct pt_regs))
return false;
if (type == BPF_WRITE)
prog->aux->kprobe_write_ctx = true;
return true;
}
@@ -2728,20 +2546,25 @@ kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link,
struct pt_regs *regs;
int err;
/*
* graph tracer framework ensures we won't migrate, so there is no need
* to use migrate_disable for bpf_prog_run again. The check here just for
* __this_cpu_inc_return.
*/
cant_sleep();
if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) {
bpf_prog_inc_misses_counter(link->link.prog);
err = 1;
goto out;
}
migrate_disable();
rcu_read_lock();
regs = ftrace_partial_regs(fregs, bpf_kprobe_multi_pt_regs_ptr());
old_run_ctx = bpf_set_run_ctx(&run_ctx.session_ctx.run_ctx);
err = bpf_prog_run(link->link.prog, regs);
bpf_reset_run_ctx(old_run_ctx);
rcu_read_unlock();
migrate_enable();
out:
__this_cpu_dec(bpf_prog_active);
@@ -2913,6 +2736,10 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
if (!is_kprobe_multi(prog))
return -EINVAL;
/* Writing to context is not allowed for kprobes. */
if (prog->aux->kprobe_write_ctx)
return -EINVAL;
flags = attr->link_create.kprobe_multi.flags;
if (flags & ~BPF_F_KPROBE_MULTI_RETURN)
return -EINVAL;