Commit b5f21708 authored by Linus Torvalds's avatar Linus Torvalds
Browse files
Pull bpf fixes from Daniel Borkmann::

 - Fix several issues for BPF LPM trie map which were found by syzbot
   and during addition of new test cases (Hou Tao)

 - Fix a missing process_iter_arg register type check in the BPF
   verifier (Kumar Kartikeya Dwivedi, Tao Lyu)

 - Fix several correctness gaps in the BPF verifier when interacting
   with the BPF stack without CAP_PERFMON (Kumar Kartikeya Dwivedi,
   Eduard Zingerman, Tao Lyu)

 - Fix OOB BPF map writes when deleting elements for the case of xsk map
   as well as devmap (Maciej Fijalkowski)

 - Fix xsk sockets to always clear DMA mapping information when
   unmapping the pool (Larysa Zaremba)

 - Fix sk_mem_uncharge logic in tcp_bpf_sendmsg to only uncharge after
   sent bytes have been finalized (Zijian Zhang)

 - Fix BPF sockmap with vsocks which was missing a queue check in poll
   and sockmap cleanup on close (Michal Luczaj)

 - Fix tools infra to override makefile ARCH variable if defined but
   empty, which addresses cross-building tools. (Björn Töpel)

 - Fix two resolve_btfids build warnings on unresolved bpf_lsm symbols
   (Thomas Weißschuh)

 - Fix a NULL pointer dereference in bpftool (Amir Mohammadi)

 - Fix BPF selftests to check for CONFIG_PREEMPTION instead of
   CONFIG_PREEMPT (Sebastian Andrzej Siewior)

* tag 'bpf-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf: (31 commits)
  selftests/bpf: Add more test cases for LPM trie
  selftests/bpf: Move test_lpm_map.c to map_tests
  bpf: Use raw_spinlock_t for LPM trie
  bpf: Switch to bpf mem allocator for LPM trie
  bpf: Fix exact match conditions in trie_get_next_key()
  bpf: Handle in-place update for full LPM trie correctly
  bpf: Handle BPF_EXIST and BPF_NOEXIST for LPM trie
  bpf: Remove unnecessary kfree(im_node) in lpm_trie_update_elem
  bpf: Remove unnecessary check when updating LPM trie
  selftests/bpf: Add test for narrow spill into 64-bit spilled scalar
  selftests/bpf: Add test for reading from STACK_INVALID slots
  selftests/bpf: Introduce __caps_unpriv annotation for tests
  bpf: Fix narrow scalar spill onto 64-bit spilled scalar slots
  bpf: Don't mark STACK_INVALID as STACK_MISC in mark_stack_slot_misc
  samples/bpf: Remove unnecessary -I flags from libbpf EXTRA_CFLAGS
  bpf: Zero index arg error string for dynptr and iter
  selftests/bpf: Add tests for iter arg check
  bpf: Ensure reg is PTR_TO_STACK in process_iter_arg
  tools: Override makefile ARCH variable if defined, but empty
  selftests/bpf: Add apply_bytes test to test_txmsg_redir_wait_sndmem in test_sockmap
  ...
parents f3ddc438 509df676
Loading
Loading
Loading
Loading
+0 −2
Original line number Diff line number Diff line
@@ -375,8 +375,6 @@ BTF_ID(func, bpf_lsm_socket_socketpair)

BTF_ID(func, bpf_lsm_syslog)
BTF_ID(func, bpf_lsm_task_alloc)
BTF_ID(func, bpf_lsm_current_getsecid_subj)
BTF_ID(func, bpf_lsm_task_getsecid_obj)
BTF_ID(func, bpf_lsm_task_prctl)
BTF_ID(func, bpf_lsm_task_setscheduler)
BTF_ID(func, bpf_lsm_task_to_inode)
+3 −3
Original line number Diff line number Diff line
@@ -184,7 +184,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
static void dev_map_free(struct bpf_map *map)
{
	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
	int i;
	u32 i;

	/* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
	 * so the programs (can be more than one that used this map) were
@@ -821,7 +821,7 @@ static long dev_map_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
	struct bpf_dtab_netdev *old_dev;
	int k = *(u32 *)key;
	u32 k = *(u32 *)key;

	if (k >= map->max_entries)
		return -EINVAL;
@@ -838,7 +838,7 @@ static long dev_map_hash_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
	struct bpf_dtab_netdev *old_dev;
	int k = *(u32 *)key;
	u32 k = *(u32 *)key;
	unsigned long flags;
	int ret = -ENOENT;

+85 −48
Original line number Diff line number Diff line
@@ -15,6 +15,7 @@
#include <net/ipv6.h>
#include <uapi/linux/btf.h>
#include <linux/btf_ids.h>
#include <linux/bpf_mem_alloc.h>

/* Intermediate node */
#define LPM_TREE_NODE_FLAG_IM BIT(0)
@@ -22,7 +23,6 @@
struct lpm_trie_node;

struct lpm_trie_node {
	struct rcu_head rcu;
	struct lpm_trie_node __rcu	*child[2];
	u32				prefixlen;
	u32				flags;
@@ -32,10 +32,11 @@ struct lpm_trie_node {
struct lpm_trie {
	struct bpf_map			map;
	struct lpm_trie_node __rcu	*root;
	struct bpf_mem_alloc		ma;
	size_t				n_entries;
	size_t				max_prefixlen;
	size_t				data_size;
	spinlock_t			lock;
	raw_spinlock_t			lock;
};

/* This trie implements a longest prefix match algorithm that can be used to
@@ -287,17 +288,18 @@ static void *trie_lookup_elem(struct bpf_map *map, void *_key)
	return found->data + trie->data_size;
}

static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie,
						 const void *value)
static struct lpm_trie_node *lpm_trie_node_alloc(struct lpm_trie *trie,
						 const void *value,
						 bool disable_migration)
{
	struct lpm_trie_node *node;
	size_t size = sizeof(struct lpm_trie_node) + trie->data_size;

	if (value)
		size += trie->map.value_size;
	if (disable_migration)
		migrate_disable();
	node = bpf_mem_cache_alloc(&trie->ma);
	if (disable_migration)
		migrate_enable();

	node = bpf_map_kmalloc_node(&trie->map, size, GFP_NOWAIT | __GFP_NOWARN,
				    trie->map.numa_node);
	if (!node)
		return NULL;

@@ -310,12 +312,22 @@ static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie,
	return node;
}

static int trie_check_add_elem(struct lpm_trie *trie, u64 flags)
{
	if (flags == BPF_EXIST)
		return -ENOENT;
	if (trie->n_entries == trie->map.max_entries)
		return -ENOSPC;
	trie->n_entries++;
	return 0;
}

/* Called from syscall or from eBPF program */
static long trie_update_elem(struct bpf_map *map,
			     void *_key, void *value, u64 flags)
{
	struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
	struct lpm_trie_node *node, *im_node = NULL, *new_node = NULL;
	struct lpm_trie_node *node, *im_node, *new_node;
	struct lpm_trie_node *free_node = NULL;
	struct lpm_trie_node __rcu **slot;
	struct bpf_lpm_trie_key_u8 *key = _key;
@@ -330,22 +342,14 @@ static long trie_update_elem(struct bpf_map *map,
	if (key->prefixlen > trie->max_prefixlen)
		return -EINVAL;

	spin_lock_irqsave(&trie->lock, irq_flags);

	/* Allocate and fill a new node */

	if (trie->n_entries == trie->map.max_entries) {
		ret = -ENOSPC;
		goto out;
	}

	new_node = lpm_trie_node_alloc(trie, value);
	if (!new_node) {
		ret = -ENOMEM;
		goto out;
	}
	/* Allocate and fill a new node. Need to disable migration before
	 * invoking bpf_mem_cache_alloc().
	 */
	new_node = lpm_trie_node_alloc(trie, value, true);
	if (!new_node)
		return -ENOMEM;

	trie->n_entries++;
	raw_spin_lock_irqsave(&trie->lock, irq_flags);

	new_node->prefixlen = key->prefixlen;
	RCU_INIT_POINTER(new_node->child[0], NULL);
@@ -364,8 +368,7 @@ static long trie_update_elem(struct bpf_map *map,
		matchlen = longest_prefix_match(trie, node, key);

		if (node->prefixlen != matchlen ||
		    node->prefixlen == key->prefixlen ||
		    node->prefixlen == trie->max_prefixlen)
		    node->prefixlen == key->prefixlen)
			break;

		next_bit = extract_bit(key->data, node->prefixlen);
@@ -376,6 +379,10 @@ static long trie_update_elem(struct bpf_map *map,
	 * simply assign the @new_node to that slot and be done.
	 */
	if (!node) {
		ret = trie_check_add_elem(trie, flags);
		if (ret)
			goto out;

		rcu_assign_pointer(*slot, new_node);
		goto out;
	}
@@ -384,18 +391,30 @@ static long trie_update_elem(struct bpf_map *map,
	 * which already has the correct data array set.
	 */
	if (node->prefixlen == matchlen) {
		if (!(node->flags & LPM_TREE_NODE_FLAG_IM)) {
			if (flags == BPF_NOEXIST) {
				ret = -EEXIST;
				goto out;
			}
		} else {
			ret = trie_check_add_elem(trie, flags);
			if (ret)
				goto out;
		}

		new_node->child[0] = node->child[0];
		new_node->child[1] = node->child[1];

		if (!(node->flags & LPM_TREE_NODE_FLAG_IM))
			trie->n_entries--;

		rcu_assign_pointer(*slot, new_node);
		free_node = node;

		goto out;
	}

	ret = trie_check_add_elem(trie, flags);
	if (ret)
		goto out;

	/* If the new node matches the prefix completely, it must be inserted
	 * as an ancestor. Simply insert it between @node and *@slot.
	 */
@@ -406,8 +425,10 @@ static long trie_update_elem(struct bpf_map *map,
		goto out;
	}

	im_node = lpm_trie_node_alloc(trie, NULL);
	/* migration is disabled within the locked scope */
	im_node = lpm_trie_node_alloc(trie, NULL, false);
	if (!im_node) {
		trie->n_entries--;
		ret = -ENOMEM;
		goto out;
	}
@@ -429,16 +450,13 @@ static long trie_update_elem(struct bpf_map *map,
	rcu_assign_pointer(*slot, im_node);

out:
	if (ret) {
		if (new_node)
			trie->n_entries--;
	raw_spin_unlock_irqrestore(&trie->lock, irq_flags);

		kfree(new_node);
		kfree(im_node);
	}

	spin_unlock_irqrestore(&trie->lock, irq_flags);
	kfree_rcu(free_node, rcu);
	migrate_disable();
	if (ret)
		bpf_mem_cache_free(&trie->ma, new_node);
	bpf_mem_cache_free_rcu(&trie->ma, free_node);
	migrate_enable();

	return ret;
}
@@ -459,7 +477,7 @@ static long trie_delete_elem(struct bpf_map *map, void *_key)
	if (key->prefixlen > trie->max_prefixlen)
		return -EINVAL;

	spin_lock_irqsave(&trie->lock, irq_flags);
	raw_spin_lock_irqsave(&trie->lock, irq_flags);

	/* Walk the tree looking for an exact key/length match and keeping
	 * track of the path we traverse.  We will need to know the node
@@ -535,9 +553,12 @@ static long trie_delete_elem(struct bpf_map *map, void *_key)
	free_node = node;

out:
	spin_unlock_irqrestore(&trie->lock, irq_flags);
	kfree_rcu(free_parent, rcu);
	kfree_rcu(free_node, rcu);
	raw_spin_unlock_irqrestore(&trie->lock, irq_flags);

	migrate_disable();
	bpf_mem_cache_free_rcu(&trie->ma, free_parent);
	bpf_mem_cache_free_rcu(&trie->ma, free_node);
	migrate_enable();

	return ret;
}
@@ -559,6 +580,8 @@ static long trie_delete_elem(struct bpf_map *map, void *_key)
static struct bpf_map *trie_alloc(union bpf_attr *attr)
{
	struct lpm_trie *trie;
	size_t leaf_size;
	int err;

	/* check sanity of attributes */
	if (attr->max_entries == 0 ||
@@ -581,9 +604,19 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
			  offsetof(struct bpf_lpm_trie_key_u8, data);
	trie->max_prefixlen = trie->data_size * 8;

	spin_lock_init(&trie->lock);
	raw_spin_lock_init(&trie->lock);

	/* Allocate intermediate and leaf nodes from the same allocator */
	leaf_size = sizeof(struct lpm_trie_node) + trie->data_size +
		    trie->map.value_size;
	err = bpf_mem_alloc_init(&trie->ma, leaf_size, false);
	if (err)
		goto free_out;
	return &trie->map;

free_out:
	bpf_map_area_free(trie);
	return ERR_PTR(err);
}

static void trie_free(struct bpf_map *map)
@@ -615,13 +648,17 @@ static void trie_free(struct bpf_map *map)
				continue;
			}

			kfree(node);
			/* No bpf program may access the map, so freeing the
			 * node without waiting for the extra RCU GP.
			 */
			bpf_mem_cache_raw_free(node);
			RCU_INIT_POINTER(*slot, NULL);
			break;
		}
	}

out:
	bpf_mem_alloc_destroy(&trie->ma);
	bpf_map_area_free(trie);
}

@@ -633,7 +670,7 @@ static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key)
	struct lpm_trie_node **node_stack = NULL;
	int err = 0, stack_ptr = -1;
	unsigned int next_bit;
	size_t matchlen;
	size_t matchlen = 0;

	/* The get_next_key follows postorder. For the 4 node example in
	 * the top of this file, the trie_get_next_key() returns the following
@@ -672,7 +709,7 @@ static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key)
		next_bit = extract_bit(key->data, node->prefixlen);
		node = rcu_dereference(node->child[next_bit]);
	}
	if (!node || node->prefixlen != key->prefixlen ||
	if (!node || node->prefixlen != matchlen ||
	    (node->flags & LPM_TREE_NODE_FLAG_IM))
		goto find_leftmost;

+18 −9
Original line number Diff line number Diff line
@@ -1202,14 +1202,17 @@ static bool is_spilled_scalar_reg64(const struct bpf_stack_state *stack)
/* Mark stack slot as STACK_MISC, unless it is already STACK_INVALID, in which
 * case they are equivalent, or it's STACK_ZERO, in which case we preserve
 * more precise STACK_ZERO.
 * Note, in uprivileged mode leaving STACK_INVALID is wrong, so we take
 * env->allow_ptr_leaks into account and force STACK_MISC, if necessary.
 * Regardless of allow_ptr_leaks setting (i.e., privileged or unprivileged
 * mode), we won't promote STACK_INVALID to STACK_MISC. In privileged case it is
 * unnecessary as both are considered equivalent when loading data and pruning,
 * in case of unprivileged mode it will be incorrect to allow reads of invalid
 * slots.
 */
static void mark_stack_slot_misc(struct bpf_verifier_env *env, u8 *stype)
{
	if (*stype == STACK_ZERO)
		return;
	if (env->allow_ptr_leaks && *stype == STACK_INVALID)
	if (*stype == STACK_INVALID)
		return;
	*stype = STACK_MISC;
}
@@ -4700,6 +4703,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
	 */
	if (!env->allow_ptr_leaks &&
	    is_spilled_reg(&state->stack[spi]) &&
	    !is_spilled_scalar_reg(&state->stack[spi]) &&
	    size != BPF_REG_SIZE) {
		verbose(env, "attempt to corrupt spilled pointer on stack\n");
		return -EACCES;
@@ -8071,7 +8075,7 @@ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn
	if (reg->type != PTR_TO_STACK && reg->type != CONST_PTR_TO_DYNPTR) {
		verbose(env,
			"arg#%d expected pointer to stack or const struct bpf_dynptr\n",
			regno);
			regno - 1);
		return -EINVAL;
	}
@@ -8125,7 +8129,7 @@ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn
		if (!is_dynptr_reg_valid_init(env, reg)) {
			verbose(env,
				"Expected an initialized dynptr as arg #%d\n",
				regno);
				regno - 1);
			return -EINVAL;
		}
@@ -8133,7 +8137,7 @@ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn
		if (!is_dynptr_type_expected(env, reg, arg_type & ~MEM_RDONLY)) {
			verbose(env,
				"Expected a dynptr of type %s as arg #%d\n",
				dynptr_type_str(arg_to_dynptr_type(arg_type)), regno);
				dynptr_type_str(arg_to_dynptr_type(arg_type)), regno - 1);
			return -EINVAL;
		}
@@ -8189,6 +8193,11 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id
	const struct btf_type *t;
	int spi, err, i, nr_slots, btf_id;
	if (reg->type != PTR_TO_STACK) {
		verbose(env, "arg#%d expected pointer to an iterator on stack\n", regno - 1);
		return -EINVAL;
	}
	/* For iter_{new,next,destroy} functions, btf_check_iter_kfuncs()
	 * ensures struct convention, so we wouldn't need to do any BTF
	 * validation here. But given iter state can be passed as a parameter
@@ -8197,7 +8206,7 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id
	 */
	btf_id = btf_check_iter_arg(meta->btf, meta->func_proto, regno - 1);
	if (btf_id < 0) {
		verbose(env, "expected valid iter pointer as arg #%d\n", regno);
		verbose(env, "expected valid iter pointer as arg #%d\n", regno - 1);
		return -EINVAL;
	}
	t = btf_type_by_id(meta->btf, btf_id);
@@ -8207,7 +8216,7 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id
		/* bpf_iter_<type>_new() expects pointer to uninit iter state */
		if (!is_iter_reg_valid_uninit(env, reg, nr_slots)) {
			verbose(env, "expected uninitialized iter_%s as arg #%d\n",
				iter_type_str(meta->btf, btf_id), regno);
				iter_type_str(meta->btf, btf_id), regno - 1);
			return -EINVAL;
		}
@@ -8231,7 +8240,7 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id
			break;
		case -EINVAL:
			verbose(env, "expected an initialized iter_%s as arg #%d\n",
				iter_type_str(meta->btf, btf_id), regno);
				iter_type_str(meta->btf, btf_id), regno - 1);
			return err;
		case -EPROTO:
			verbose(env, "expected an RCU CS when using %s\n", meta->func_name);
+4 −7
Original line number Diff line number Diff line
@@ -441,7 +441,6 @@ static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock,
			cork = true;
			psock->cork = NULL;
		}
		sk_msg_return(sk, msg, tosend);
		release_sock(sk);

		origsize = msg->sg.size;
@@ -453,8 +452,9 @@ static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock,
			sock_put(sk_redir);

		lock_sock(sk);
		sk_mem_uncharge(sk, sent);
		if (unlikely(ret < 0)) {
			int free = sk_msg_free_nocharge(sk, msg);
			int free = sk_msg_free(sk, msg);

			if (!cork)
				*copied -= free;
@@ -468,7 +468,7 @@ static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock,
		break;
	case __SK_DROP:
	default:
		sk_msg_free_partial(sk, msg, tosend);
		sk_msg_free(sk, msg);
		sk_msg_apply_bytes(psock, tosend);
		*copied -= (tosend + delta);
		return -EACCES;
@@ -484,12 +484,9 @@ static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock,
		}
		if (msg &&
		    msg->sg.data[msg->sg.start].page_link &&
		    msg->sg.data[msg->sg.start].length) {
			if (eval == __SK_REDIRECT)
				sk_mem_charge(sk, tosend - sent);
		    msg->sg.data[msg->sg.start].length)
			goto more_data;
	}
	}
	return ret;
}

Loading