Commit b53b63db authored by Alexei Starovoitov's avatar Alexei Starovoitov
Browse files

Merge branch 'support-eliding-map-lookup-nullness'

Daniel Xu says:

====================
Support eliding map lookup nullness

This patch allows progs to elide a null check on statically known map
lookup keys. In other words, if the verifier can statically prove that
the lookup will be in-bounds, allow the prog to drop the null check.

This is useful for two reasons:

1. Large numbers of nullness checks (especially when they cannot fail)
   unnecessarily pushes prog towards BPF_COMPLEXITY_LIMIT_JMP_SEQ.
2. It forms a tighter contract between programmer and verifier.

For (1), bpftrace is starting to make heavier use of percpu scratch
maps. As a result, for user scripts with large number of unrolled loops,
we are starting to hit jump complexity verification errors.  These
percpu lookups cannot fail anyways, as we only use static key values.
Eliding nullness probably results in less work for verifier as well.

For (2), percpu scratch maps are often used as a larger stack, as the
currrent stack is limited to 512 bytes. In these situations, it is
desirable for the programmer to express: "this lookup should never fail,
and if it does, it means I messed up the code". By omitting the null
check, the programmer can "ask" the verifier to double check the logic.

=== Changelog ===

Changes in v7:
* Use more accurate frame number when marking precise
* Add test for non-stack key
* Test for marking stack slot precise

Changes in v6:
* Use is_spilled_scalar_reg() helper and remove unnecessary comment
* Add back deleted selftest with different helper to dirty dst buffer
* Check size of spill is exactly key_size and update selftests
* Read slot_type from correct offset into the spi
* Rewrite selftests in C where possible
* Mark constant map keys as precise

Changes in v5:
* Dropped all acks
* Use s64 instead of long for const_map_key
* Ensure stack slot contains spilled reg before accessing spilled_ptr
* Ensure spilled reg is a scalar before accessing tnum const value
* Fix verifier selftest for 32-bit write to write at 8 byte alignment
  to ensure spill is tracked
* Introduce more precise tracking of helper stack accesses
* Do constant map key extraction as part of helper argument processing
  and then remove duplicated stack checks
* Use ret_flag instead of regs[BPF_REG_0].type
* Handle STACK_ZERO
* Fix bug in bpf_load_hdr_opt() arg annotation

Changes in v4:
* Only allow for CAP_BPF
* Add test for stack growing upwards
* Improve comment about stack growing upwards

Changes in v3:
* Check if stack is (erroneously) growing upwards
* Mention in commit message why existing tests needed change

Changes in v2:
* Added a check for when R2 is not a ptr to stack
* Added a check for when stack is uninitialized (no stack slot yet)
* Updated existing tests to account for null elision
* Added test case for when R2 can be both const and non-const
====================

Link: https://patch.msgid.link/cover.1736886479.git.dxu@dxuuu.xyz


Signed-off-by: default avatarAlexei Starovoitov <ast@kernel.org>
parents 556a3994 f932a8e4
Loading
Loading
Loading
Loading
+106 −33
Original line number Diff line number Diff line
@@ -287,6 +287,7 @@ struct bpf_call_arg_meta {
	u32 ret_btf_id;
	u32 subprogno;
	struct btf_field *kptr_field;
	s64 const_map_key;
};
struct bpf_kfunc_call_arg_meta {
@@ -5303,7 +5304,7 @@ enum bpf_access_src {
static int check_stack_range_initialized(struct bpf_verifier_env *env,
					 int regno, int off, int access_size,
					 bool zero_size_allowed,
					 enum bpf_access_src type,
					 enum bpf_access_type type,
					 struct bpf_call_arg_meta *meta);
static struct bpf_reg_state *reg_state(struct bpf_verifier_env *env, int regno)
@@ -5336,7 +5337,7 @@ static int check_stack_read_var_off(struct bpf_verifier_env *env,
	/* Note that we pass a NULL meta, so raw access will not be permitted.
	 */
	err = check_stack_range_initialized(env, ptr_regno, off, size,
					    false, ACCESS_DIRECT, NULL);
					    false, BPF_READ, NULL);
	if (err)
		return err;
@@ -7190,7 +7191,7 @@ static int check_stack_slot_within_bounds(struct bpf_verifier_env *env,
static int check_stack_access_within_bounds(
		struct bpf_verifier_env *env,
		int regno, int off, int access_size,
		enum bpf_access_src src, enum bpf_access_type type)
		enum bpf_access_type type)
{
	struct bpf_reg_state *regs = cur_regs(env);
	struct bpf_reg_state *reg = regs + regno;
@@ -7199,10 +7200,7 @@ static int check_stack_access_within_bounds(
	int err;
	char *err_extra;
	if (src == ACCESS_HELPER)
		/* We don't know if helpers are reading or writing (or both). */
		err_extra = " indirect access to";
	else if (type == BPF_READ)
	if (type == BPF_READ)
		err_extra = " read from";
	else
		err_extra = " write to";
@@ -7420,7 +7418,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
	} else if (reg->type == PTR_TO_STACK) {
		/* Basic bounds checks. */
		err = check_stack_access_within_bounds(env, regno, off, size, ACCESS_DIRECT, t);
		err = check_stack_access_within_bounds(env, regno, off, size, t);
		if (err)
			return err;
@@ -7640,13 +7638,11 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i
static int check_stack_range_initialized(
		struct bpf_verifier_env *env, int regno, int off,
		int access_size, bool zero_size_allowed,
		enum bpf_access_src type, struct bpf_call_arg_meta *meta)
		enum bpf_access_type type, struct bpf_call_arg_meta *meta)
{
	struct bpf_reg_state *reg = reg_state(env, regno);
	struct bpf_func_state *state = func(env, reg);
	int err, min_off, max_off, i, j, slot, spi;
	char *err_extra = type == ACCESS_HELPER ? " indirect" : "";
	enum bpf_access_type bounds_check_type;
	/* Some accesses can write anything into the stack, others are
	 * read-only.
	 */
@@ -7657,18 +7653,10 @@ static int check_stack_range_initialized(
		return -EACCES;
	}
	if (type == ACCESS_HELPER) {
		/* The bounds checks for writes are more permissive than for
		 * reads. However, if raw_mode is not set, we'll do extra
		 * checks below.
		 */
		bounds_check_type = BPF_WRITE;
	if (type == BPF_WRITE)
		clobber = true;
	} else {
		bounds_check_type = BPF_READ;
	}
	err = check_stack_access_within_bounds(env, regno, off, access_size,
					       type, bounds_check_type);
	err = check_stack_access_within_bounds(env, regno, off, access_size, type);
	if (err)
		return err;
@@ -7685,8 +7673,8 @@ static int check_stack_range_initialized(
			char tn_buf[48];
			tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
			verbose(env, "R%d%s variable offset stack access prohibited for !root, var_off=%s\n",
				regno, err_extra, tn_buf);
			verbose(env, "R%d variable offset stack access prohibited for !root, var_off=%s\n",
				regno, tn_buf);
			return -EACCES;
		}
		/* Only initialized buffer on stack is allowed to be accessed
@@ -7739,7 +7727,7 @@ static int check_stack_range_initialized(
		slot = -i - 1;
		spi = slot / BPF_REG_SIZE;
		if (state->allocated_stack <= slot) {
			verbose(env, "verifier bug: allocated_stack too small");
			verbose(env, "verifier bug: allocated_stack too small\n");
			return -EFAULT;
		}
@@ -7767,14 +7755,14 @@ static int check_stack_range_initialized(
		}
		if (tnum_is_const(reg->var_off)) {
			verbose(env, "invalid%s read from stack R%d off %d+%d size %d\n",
				err_extra, regno, min_off, i - min_off, access_size);
			verbose(env, "invalid read from stack R%d off %d+%d size %d\n",
				regno, min_off, i - min_off, access_size);
		} else {
			char tn_buf[48];
			tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
			verbose(env, "invalid%s read from stack R%d var_off %s+%d size %d\n",
				err_extra, regno, tn_buf, i - min_off, access_size);
			verbose(env, "invalid read from stack R%d var_off %s+%d size %d\n",
				regno, tn_buf, i - min_off, access_size);
		}
		return -EACCES;
mark:
@@ -7849,7 +7837,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
		return check_stack_range_initialized(
				env,
				regno, reg->off, access_size,
				zero_size_allowed, ACCESS_HELPER, meta);
				zero_size_allowed, access_type, meta);
	case PTR_TO_BTF_ID:
		return check_ptr_to_btf_access(env, regs, regno, reg->off,
					       access_size, BPF_READ, -1);
@@ -9161,6 +9149,63 @@ static int check_reg_const_str(struct bpf_verifier_env *env,
	return 0;
}
/* Returns constant key value if possible, else negative error */
static s64 get_constant_map_key(struct bpf_verifier_env *env,
				struct bpf_reg_state *key,
				u32 key_size)
{
	struct bpf_func_state *state = func(env, key);
	struct bpf_reg_state *reg;
	int slot, spi, off;
	int spill_size = 0;
	int zero_size = 0;
	int stack_off;
	int i, err;
	u8 *stype;
	if (!env->bpf_capable)
		return -EOPNOTSUPP;
	if (key->type != PTR_TO_STACK)
		return -EOPNOTSUPP;
	if (!tnum_is_const(key->var_off))
		return -EOPNOTSUPP;
	stack_off = key->off + key->var_off.value;
	slot = -stack_off - 1;
	spi = slot / BPF_REG_SIZE;
	off = slot % BPF_REG_SIZE;
	stype = state->stack[spi].slot_type;
	/* First handle precisely tracked STACK_ZERO */
	for (i = off; i >= 0 && stype[i] == STACK_ZERO; i--)
		zero_size++;
	if (zero_size >= key_size)
		return 0;
	/* Check that stack contains a scalar spill of expected size */
	if (!is_spilled_scalar_reg(&state->stack[spi]))
		return -EOPNOTSUPP;
	for (i = off; i >= 0 && stype[i] == STACK_SPILL; i--)
		spill_size++;
	if (spill_size != key_size)
		return -EOPNOTSUPP;
	reg = &state->stack[spi].spilled_ptr;
	if (!tnum_is_const(reg->var_off))
		/* Stack value not statically known */
		return -EOPNOTSUPP;
	/* We are relying on a constant value. So mark as precise
	 * to prevent pruning on it.
	 */
	bt_set_frame_slot(&env->bt, key->frameno, spi);
	err = mark_chain_precision_batch(env);
	if (err < 0)
		return err;
	return reg->var_off.value;
}
static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
			  struct bpf_call_arg_meta *meta,
			  const struct bpf_func_proto *fn,
@@ -9171,6 +9216,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
	enum bpf_arg_type arg_type = fn->arg_type[arg];
	enum bpf_reg_type type = reg->type;
	u32 *arg_btf_id = NULL;
	u32 key_size;
	int err = 0;
	if (arg_type == ARG_DONTCARE)
@@ -9304,8 +9350,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
			verbose(env, "invalid map_ptr to access map->key\n");
			return -EACCES;
		}
		err = check_helper_mem_access(env, regno, meta->map_ptr->key_size,
					      BPF_READ, false, NULL);
		key_size = meta->map_ptr->key_size;
		err = check_helper_mem_access(env, regno, key_size, BPF_READ, false, NULL);
		if (err)
			return err;
		meta->const_map_key = get_constant_map_key(env, reg, key_size);
		if (meta->const_map_key < 0 && meta->const_map_key != -EOPNOTSUPP)
			return meta->const_map_key;
		break;
	case ARG_PTR_TO_MAP_VALUE:
		if (type_may_be_null(arg_type) && register_is_null(reg))
@@ -10829,6 +10880,21 @@ static void update_loop_inline_state(struct bpf_verifier_env *env, u32 subprogno
				 state->callback_subprogno == subprogno);
}
/* Returns whether or not the given map type can potentially elide
 * lookup return value nullness check. This is possible if the key
 * is statically known.
 */
static bool can_elide_value_nullness(enum bpf_map_type type)
{
	switch (type) {
	case BPF_MAP_TYPE_ARRAY:
	case BPF_MAP_TYPE_PERCPU_ARRAY:
		return true;
	default:
		return false;
	}
}
static int get_helper_proto(struct bpf_verifier_env *env, int func_id,
			    const struct bpf_func_proto **ptr)
{
@@ -11195,10 +11261,17 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
				"kernel subsystem misconfigured verifier\n");
			return -EINVAL;
		}
		if (func_id == BPF_FUNC_map_lookup_elem &&
		    can_elide_value_nullness(meta.map_ptr->map_type) &&
		    meta.const_map_key >= 0 &&
		    meta.const_map_key < meta.map_ptr->max_entries)
			ret_flag &= ~PTR_MAYBE_NULL;
		regs[BPF_REG_0].map_ptr = meta.map_ptr;
		regs[BPF_REG_0].map_uid = meta.map_uid;
		regs[BPF_REG_0].type = PTR_TO_MAP_VALUE | ret_flag;
		if (!type_may_be_null(ret_type) &&
		if (!type_may_be_null(ret_flag) &&
		    btf_record_has_field(meta.map_ptr->record, BPF_SPIN_LOCK)) {
			regs[BPF_REG_0].id = ++env->id_gen;
		}
+1 −1
Original line number Diff line number Diff line
@@ -7643,7 +7643,7 @@ static const struct bpf_func_proto bpf_sock_ops_load_hdr_opt_proto = {
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_PTR_TO_MEM,
	.arg2_type	= ARG_PTR_TO_MEM | MEM_WRITE,
	.arg3_type	= ARG_CONST_SIZE,
	.arg4_type	= ARG_ANYTHING,
};
+3 −3
Original line number Diff line number Diff line
@@ -192,7 +192,7 @@ int ringbuf_invalid_api(void *ctx)

/* Can't add a dynptr to a map */
SEC("?raw_tp")
__failure __msg("invalid indirect read from stack")
__failure __msg("invalid read from stack")
int add_dynptr_to_map1(void *ctx)
{
	struct bpf_dynptr ptr;
@@ -210,7 +210,7 @@ int add_dynptr_to_map1(void *ctx)

/* Can't add a struct with an embedded dynptr to a map */
SEC("?raw_tp")
__failure __msg("invalid indirect read from stack")
__failure __msg("invalid read from stack")
int add_dynptr_to_map2(void *ctx)
{
	struct test_info x;
@@ -398,7 +398,7 @@ int data_slice_missing_null_check2(void *ctx)
 * dynptr argument
 */
SEC("?raw_tp")
__failure __msg("invalid indirect read from stack")
__failure __msg("invalid read from stack")
int invalid_helper1(void *ctx)
{
	struct bpf_dynptr ptr;
+7 −7
Original line number Diff line number Diff line
@@ -524,11 +524,11 @@ int iter_subprog_iters(const void *ctx)
}

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(type, BPF_MAP_TYPE_HASH);
	__type(key, int);
	__type(value, int);
	__uint(max_entries, 1000);
} arr_map SEC(".maps");
} hash_map SEC(".maps");

SEC("?raw_tp")
__failure __msg("invalid mem access 'scalar'")
@@ -539,7 +539,7 @@ int iter_err_too_permissive1(const void *ctx)

	MY_PID_GUARD();

	map_val = bpf_map_lookup_elem(&arr_map, &key);
	map_val = bpf_map_lookup_elem(&hash_map, &key);
	if (!map_val)
		return 0;

@@ -561,12 +561,12 @@ int iter_err_too_permissive2(const void *ctx)

	MY_PID_GUARD();

	map_val = bpf_map_lookup_elem(&arr_map, &key);
	map_val = bpf_map_lookup_elem(&hash_map, &key);
	if (!map_val)
		return 0;

	bpf_repeat(1000000) {
		map_val = bpf_map_lookup_elem(&arr_map, &key);
		map_val = bpf_map_lookup_elem(&hash_map, &key);
	}

	*map_val = 123;
@@ -585,7 +585,7 @@ int iter_err_too_permissive3(const void *ctx)
	MY_PID_GUARD();

	bpf_repeat(1000000) {
		map_val = bpf_map_lookup_elem(&arr_map, &key);
		map_val = bpf_map_lookup_elem(&hash_map, &key);
		found = true;
	}

@@ -606,7 +606,7 @@ int iter_tricky_but_fine(const void *ctx)
	MY_PID_GUARD();

	bpf_repeat(1000000) {
		map_val = bpf_map_lookup_elem(&arr_map, &key);
		map_val = bpf_map_lookup_elem(&hash_map, &key);
		if (map_val) {
			found = true;
			break;
+1 −1
Original line number Diff line number Diff line
@@ -345,7 +345,7 @@ int reject_indirect_global_func_access(struct __sk_buff *ctx)
}

SEC("?tc")
__failure __msg("Unreleased reference id=5 alloc_insn=")
__failure __msg("Unreleased reference id=4 alloc_insn=")
int kptr_xchg_ref_state(struct __sk_buff *ctx)
{
	struct prog_test_ref_kfunc *p;
Loading