Commit 3f6d04d7 authored by Alexei Starovoitov's avatar Alexei Starovoitov
Browse files

Merge branch 'allow-bpf_refcount_acquire-of-mapval-obtained-via-direct-ld'

Dave Marchevsky says:

====================
Allow bpf_refcount_acquire of mapval obtained via direct LD

Consider this BPF program:

  struct cgv_node {
    int d;
    struct bpf_refcount r;
    struct bpf_rb_node rb;
  };

  struct val_stash {
    struct cgv_node __kptr *v;
  };

  struct {
    __uint(type, BPF_MAP_TYPE_ARRAY);
    __type(key, int);
    __type(value, struct val_stash);
    __uint(max_entries, 10);
  } array_map SEC(".maps");

  long bpf_program(void *ctx)
  {
    struct val_stash *mapval;
    struct cgv_node *p;
    int idx = 0;

    mapval = bpf_map_lookup_elem(&array_map, &idx);
    if (!mapval || !mapval->v) { /* omitted */ }

    p = bpf_refcount_acquire(mapval->v); /* Verification FAILs here */

    /* Add p to some tree */
    return 0;
  }

Verification fails on the refcount_acquire:

  160: (79) r1 = *(u64 *)(r8 +8)        ; R1_w=untrusted_ptr_or_null_cgv_node(id=11,off=0,imm=0) R8_w=map_value(id=10,off=0,ks=8,vs=16,imm=0) refs=6
  161: (b7) r2 = 0                      ; R2_w=0 refs=6
  162: (85) call bpf_refcount_acquire_impl#117824
  arg#0 is neither owning or non-owning ref

The above verifier dump is actually from sched_ext's scx_flatcg [0],
which is the motivating usecase for this series' changes. Specifically,
scx_flatcg stashes a rb_node type w/ cgroup-specific info (struct
cgv_node) in a map when the cgroup is created, then later puts that
cgroup's node in a rbtree in .enqueue . Making struct cgv_node
refcounted would simplify the code a bit by virtue of allowing us to
remove the kptr_xchg's, but "later puts that cgroups node in a rbtree"
is not possible without a refcount_acquire, which suffers from the above
verification failure.

If we get rid of PTR_UNTRUSTED flag, and add MEM_ALLOC | NON_OWN_REF,
mapval->v would be a non-owning ref and verification would succeed. Due
to the most recent set of refcount changes [1], which modified
bpf_obj_drop behavior to not reuse refcounted graph node's underlying
memory until after RCU grace period, this is safe to do. Once mapval->v
has the correct flags it _is_ a non-owning reference and verification of
the motivating example will succeed.

  [0]: https://github.com/sched-ext/sched_ext/blob/52911e1040a0f94b9c426dddcc00be5364a7ae9f/tools/sched_ext/scx_flatcg.bpf.c#L275
  [1]: https://lore.kernel.org/bpf/20230821193311.3290257-1-davemarchevsky@fb.com/

Summary of patches:
  * Patch 1 fixes an issue with bpf_refcount_acquire verification
    letting MAYBE_NULL ptrs through
    * Patch 2 tests Patch 1's fix
  * Patch 3 broadens the use of "free only after RCU GP" to all
    user-allocated types
  * Patch 4 is a small nonfunctional refactoring
  * Patch 5 changes verifier to mark direct LD of stashed graph node
    kptr as non-owning ref
    * Patch 6 tests Patch 5's verifier changes

Changelog:

v1 -> v2: https://lore.kernel.org/bpf/20231025214007.2920506-1-davemarchevsky@fb.com/

Series title changed to "Allow bpf_refcount_acquire of mapval obtained via
direct LD". V1's title was mistakenly truncated.

  * Patch 5 ("bpf: Mark direct ld of stashed bpf_{rb,list}_node as non-owning ref")
    * Direct LD of percpu kptr should not have MEM_ALLOC flag (Yonghong)
  * Patch 6 ("selftests/bpf: Test bpf_refcount_acquire of node obtained via direct ld")
    * Test read from stashed value as well
====================

Link: https://lore.kernel.org/r/20231107085639.3016113-1-davemarchevsky@fb.com


Signed-off-by: default avatarAlexei Starovoitov <ast@kernel.org>
parents 82ce364c e9ed8df7
Loading
Loading
Loading
Loading
+2 −2
Original line number Diff line number Diff line
@@ -186,8 +186,8 @@ enum btf_field_type {
	BPF_LIST_NODE  = (1 << 6),
	BPF_RB_ROOT    = (1 << 7),
	BPF_RB_NODE    = (1 << 8),
	BPF_GRAPH_NODE_OR_ROOT = BPF_LIST_NODE | BPF_LIST_HEAD |
				 BPF_RB_NODE | BPF_RB_ROOT,
	BPF_GRAPH_NODE = BPF_RB_NODE | BPF_LIST_NODE,
	BPF_GRAPH_ROOT = BPF_RB_ROOT | BPF_LIST_HEAD,
	BPF_REFCOUNT   = (1 << 9),
};

+4 −7
Original line number Diff line number Diff line
@@ -3840,9 +3840,6 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type
	return ERR_PTR(ret);
}

#define GRAPH_ROOT_MASK (BPF_LIST_HEAD | BPF_RB_ROOT)
#define GRAPH_NODE_MASK (BPF_LIST_NODE | BPF_RB_NODE)

int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec)
{
	int i;
@@ -3855,13 +3852,13 @@ int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec)
	 * Hence we only need to ensure that bpf_{list_head,rb_root} ownership
	 * does not form cycles.
	 */
	if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & GRAPH_ROOT_MASK))
	if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & BPF_GRAPH_ROOT))
		return 0;
	for (i = 0; i < rec->cnt; i++) {
		struct btf_struct_meta *meta;
		u32 btf_id;

		if (!(rec->fields[i].type & GRAPH_ROOT_MASK))
		if (!(rec->fields[i].type & BPF_GRAPH_ROOT))
			continue;
		btf_id = rec->fields[i].graph_root.value_btf_id;
		meta = btf_find_struct_meta(btf, btf_id);
@@ -3873,7 +3870,7 @@ int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec)
		 * to check ownership cycle for a type unless it's also a
		 * node type.
		 */
		if (!(rec->field_mask & GRAPH_NODE_MASK))
		if (!(rec->field_mask & BPF_GRAPH_NODE))
			continue;

		/* We need to ensure ownership acyclicity among all types. The
@@ -3909,7 +3906,7 @@ int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec)
		 * - A is both an root and node.
		 * - B is only an node.
		 */
		if (meta->record->field_mask & GRAPH_ROOT_MASK)
		if (meta->record->field_mask & BPF_GRAPH_ROOT)
			return -ELOOP;
	}
	return 0;
+2 −5
Original line number Diff line number Diff line
@@ -1937,10 +1937,7 @@ void __bpf_obj_drop_impl(void *p, const struct btf_record *rec, bool percpu)
		ma = &bpf_global_percpu_ma;
	else
		ma = &bpf_global_ma;
	if (rec && rec->refcount_off >= 0)
	bpf_mem_free_rcu(ma, p);
	else
		bpf_mem_free(ma, p);
}

__bpf_kfunc void bpf_obj_drop_impl(void *p__alloc, void *meta__ign)
@@ -2520,7 +2517,7 @@ BTF_ID_FLAGS(func, bpf_obj_new_impl, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_percpu_obj_new_impl, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_obj_drop_impl, KF_RELEASE)
BTF_ID_FLAGS(func, bpf_percpu_obj_drop_impl, KF_RELEASE)
BTF_ID_FLAGS(func, bpf_refcount_acquire_impl, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_refcount_acquire_impl, KF_ACQUIRE | KF_RET_NULL | KF_RCU)
BTF_ID_FLAGS(func, bpf_list_push_front_impl)
BTF_ID_FLAGS(func, bpf_list_push_back_impl)
BTF_ID_FLAGS(func, bpf_list_pop_front, KF_ACQUIRE | KF_RET_NULL)
+31 −5
Original line number Diff line number Diff line
@@ -5557,10 +5557,23 @@ BTF_SET_END(rcu_protected_types)
static bool rcu_protected_object(const struct btf *btf, u32 btf_id)
{
	if (!btf_is_kernel(btf))
		return false;
		return true;
	return btf_id_set_contains(&rcu_protected_types, btf_id);
}
static struct btf_record *kptr_pointee_btf_record(struct btf_field *kptr_field)
{
	struct btf_struct_meta *meta;
	if (btf_is_kernel(kptr_field->kptr.btf))
		return NULL;
	meta = btf_find_struct_meta(kptr_field->kptr.btf,
				    kptr_field->kptr.btf_id);
	return meta ? meta->record : NULL;
}
static bool rcu_safe_kptr(const struct btf_field *field)
{
	const struct btf_field_kptr *kptr = &field->kptr;
@@ -5571,12 +5584,25 @@ static bool rcu_safe_kptr(const struct btf_field *field)
static u32 btf_ld_kptr_type(struct bpf_verifier_env *env, struct btf_field *kptr_field)
{
	struct btf_record *rec;
	u32 ret;
	ret = PTR_MAYBE_NULL;
	if (rcu_safe_kptr(kptr_field) && in_rcu_cs(env)) {
		if (kptr_field->type != BPF_KPTR_PERCPU)
			return PTR_MAYBE_NULL | MEM_RCU;
		return PTR_MAYBE_NULL | MEM_RCU | MEM_PERCPU;
		ret |= MEM_RCU;
		if (kptr_field->type == BPF_KPTR_PERCPU)
			ret |= MEM_PERCPU;
		else if (!btf_is_kernel(kptr_field->kptr.btf))
			ret |= MEM_ALLOC;
		rec = kptr_pointee_btf_record(kptr_field);
		if (rec && btf_record_has_field(rec, BPF_GRAPH_NODE))
			ret |= NON_OWN_REF;
	} else {
		ret |= PTR_UNTRUSTED;
	}
	return PTR_MAYBE_NULL | PTR_UNTRUSTED;
	return ret;
}
static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,
+33 −0
Original line number Diff line number Diff line
@@ -73,6 +73,37 @@ static void test_local_kptr_stash_unstash(void)
	local_kptr_stash__destroy(skel);
}

static void test_refcount_acquire_without_unstash(void)
{
	LIBBPF_OPTS(bpf_test_run_opts, opts,
		    .data_in = &pkt_v4,
		    .data_size_in = sizeof(pkt_v4),
		    .repeat = 1,
	);
	struct local_kptr_stash *skel;
	int ret;

	skel = local_kptr_stash__open_and_load();
	if (!ASSERT_OK_PTR(skel, "local_kptr_stash__open_and_load"))
		return;

	ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.refcount_acquire_without_unstash),
				     &opts);
	ASSERT_OK(ret, "refcount_acquire_without_unstash run");
	ASSERT_EQ(opts.retval, 2, "refcount_acquire_without_unstash retval");

	ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.stash_refcounted_node), &opts);
	ASSERT_OK(ret, "stash_refcounted_node run");
	ASSERT_OK(opts.retval, "stash_refcounted_node retval");

	ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.refcount_acquire_without_unstash),
				     &opts);
	ASSERT_OK(ret, "refcount_acquire_without_unstash (2) run");
	ASSERT_EQ(opts.retval, 42, "refcount_acquire_without_unstash (2) retval");

	local_kptr_stash__destroy(skel);
}

static void test_local_kptr_stash_fail(void)
{
	RUN_TESTS(local_kptr_stash_fail);
@@ -86,6 +117,8 @@ void test_local_kptr_stash(void)
		test_local_kptr_stash_plain();
	if (test__start_subtest("local_kptr_stash_unstash"))
		test_local_kptr_stash_unstash();
	if (test__start_subtest("refcount_acquire_without_unstash"))
		test_refcount_acquire_without_unstash();
	if (test__start_subtest("local_kptr_stash_fail"))
		test_local_kptr_stash_fail();
}
Loading