Commit 509df676 authored by Alexei Starovoitov
Browse files

Merge branch 'fixes-for-lpm-trie'

Hou Tao says:

====================
This patch set fixes several issues for LPM trie. These issues were
found during adding new test cases or were reported by syzbot.

The patch set is structured as follows:

Patch #1~#2 are clean-ups for lpm_trie_update_elem().
Patch #3 handles BPF_EXIST and BPF_NOEXIST correctly for LPM trie.
Patch #4 fixes the accounting of n_entries when doing in-place update.
Patch #5 fixes the exact match condition in trie_get_next_key(); without
the fix, it may skip keys when the passed key is not found in the map.
Patch #6~#7 switch from kmalloc() to bpf memory allocator for LPM trie
to fix several lock order warnings reported by syzbot. It also enables
raw_spinlock_t for LPM trie again. After these changes, the LPM trie will
be closer to being usable in any context (though the reentrance check of
trie->lock is still missing, but it is on my todo list).
Patch #8: move test_lpm_map to map_tests to make it run regularly.
Patch #9: add test cases for the issues fixed by patch #3~#5.

Please see individual patches for more details. Comments are always
welcome.

Change Log:
v3:
  * patch #2: remove the unnecessary NULL-init for im_node
  * patch #6: allocate the leaf node before disabling IRQs to lower
    the possibility of -ENOMEM when leaf_size is large; free
    these nodes outside the trie lock (suggested by Alexei)
  * collect review and ack tags (Thanks to Toke & Daniel)

v2: https://lore.kernel.org/bpf/20241127004641.1118269-1-houtao@huaweicloud.com/
  * collect review tags (Thanks to Toke)
  * drop "Add bpf_mem_cache_is_mergeable() helper" patch
  * patch #3~#4: add fix tag
  * patch #4: rename the helper to trie_check_add_elem() and increase
    n_entries in it.
  * patch #6: use one bpf mem allocator and update commit message to
    clarify that using bpf mem allocator is more appropriate.
  * patch #7: update commit message to add the possible max running time
    for update operation.
  * patch #9: update commit message to specify the purpose of these test
    cases.

v1: https://lore.kernel.org/bpf/20241118010808.2243555-1-houtao@huaweicloud.com/
====================

Link: https://lore.kernel.org/all/20241206110622.1161752-1-houtao@huaweicloud.com/


Signed-off-by: Alexei Starovoitov <ast@kernel.org>
parents e2cf9133 04d4ce91
Loading
Loading
Loading
Loading
+85 −48
Original line number Diff line number Diff line
@@ -15,6 +15,7 @@
#include <net/ipv6.h>
#include <uapi/linux/btf.h>
#include <linux/btf_ids.h>
#include <linux/bpf_mem_alloc.h>

/* Intermediate node */
#define LPM_TREE_NODE_FLAG_IM BIT(0)
@@ -22,7 +23,6 @@
struct lpm_trie_node;

struct lpm_trie_node {
	struct rcu_head rcu;
	struct lpm_trie_node __rcu	*child[2];
	u32				prefixlen;
	u32				flags;
@@ -32,10 +32,11 @@ struct lpm_trie_node {
struct lpm_trie {
	struct bpf_map			map;
	struct lpm_trie_node __rcu	*root;
	struct bpf_mem_alloc		ma;
	size_t				n_entries;
	size_t				max_prefixlen;
	size_t				data_size;
	spinlock_t			lock;
	raw_spinlock_t			lock;
};

/* This trie implements a longest prefix match algorithm that can be used to
@@ -287,17 +288,18 @@ static void *trie_lookup_elem(struct bpf_map *map, void *_key)
	return found->data + trie->data_size;
}

static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie,
						 const void *value)
static struct lpm_trie_node *lpm_trie_node_alloc(struct lpm_trie *trie,
						 const void *value,
						 bool disable_migration)
{
	struct lpm_trie_node *node;
	size_t size = sizeof(struct lpm_trie_node) + trie->data_size;

	if (value)
		size += trie->map.value_size;
	if (disable_migration)
		migrate_disable();
	node = bpf_mem_cache_alloc(&trie->ma);
	if (disable_migration)
		migrate_enable();

	node = bpf_map_kmalloc_node(&trie->map, size, GFP_NOWAIT | __GFP_NOWARN,
				    trie->map.numa_node);
	if (!node)
		return NULL;

@@ -310,12 +312,22 @@ static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie,
	return node;
}

/* Account for one element that is about to be added to @trie.
 *
 * The callers in this file invoke it, with trie->lock held, on the paths of
 * trie_update_elem() that insert a new element rather than replace an
 * existing one.
 *
 * Returns 0 after incrementing trie->n_entries, -ENOENT when @flags is
 * BPF_EXIST (there is no element to overwrite), or -ENOSPC when n_entries
 * already equals map.max_entries.
 */
static int trie_check_add_elem(struct lpm_trie *trie, u64 flags)
{
	int err = 0;

	if (flags == BPF_EXIST)
		err = -ENOENT;
	else if (trie->n_entries == trie->map.max_entries)
		err = -ENOSPC;
	else
		trie->n_entries++;

	return err;
}

/* Called from syscall or from eBPF program */
static long trie_update_elem(struct bpf_map *map,
			     void *_key, void *value, u64 flags)
{
	struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
	struct lpm_trie_node *node, *im_node = NULL, *new_node = NULL;
	struct lpm_trie_node *node, *im_node, *new_node;
	struct lpm_trie_node *free_node = NULL;
	struct lpm_trie_node __rcu **slot;
	struct bpf_lpm_trie_key_u8 *key = _key;
@@ -330,22 +342,14 @@ static long trie_update_elem(struct bpf_map *map,
	if (key->prefixlen > trie->max_prefixlen)
		return -EINVAL;

	spin_lock_irqsave(&trie->lock, irq_flags);

	/* Allocate and fill a new node */

	if (trie->n_entries == trie->map.max_entries) {
		ret = -ENOSPC;
		goto out;
	}

	new_node = lpm_trie_node_alloc(trie, value);
	if (!new_node) {
		ret = -ENOMEM;
		goto out;
	}
	/* Allocate and fill a new node. Need to disable migration before
	 * invoking bpf_mem_cache_alloc().
	 */
	new_node = lpm_trie_node_alloc(trie, value, true);
	if (!new_node)
		return -ENOMEM;

	trie->n_entries++;
	raw_spin_lock_irqsave(&trie->lock, irq_flags);

	new_node->prefixlen = key->prefixlen;
	RCU_INIT_POINTER(new_node->child[0], NULL);
@@ -364,8 +368,7 @@ static long trie_update_elem(struct bpf_map *map,
		matchlen = longest_prefix_match(trie, node, key);

		if (node->prefixlen != matchlen ||
		    node->prefixlen == key->prefixlen ||
		    node->prefixlen == trie->max_prefixlen)
		    node->prefixlen == key->prefixlen)
			break;

		next_bit = extract_bit(key->data, node->prefixlen);
@@ -376,6 +379,10 @@ static long trie_update_elem(struct bpf_map *map,
	 * simply assign the @new_node to that slot and be done.
	 */
	if (!node) {
		ret = trie_check_add_elem(trie, flags);
		if (ret)
			goto out;

		rcu_assign_pointer(*slot, new_node);
		goto out;
	}
@@ -384,18 +391,30 @@ static long trie_update_elem(struct bpf_map *map,
	 * which already has the correct data array set.
	 */
	if (node->prefixlen == matchlen) {
		if (!(node->flags & LPM_TREE_NODE_FLAG_IM)) {
			if (flags == BPF_NOEXIST) {
				ret = -EEXIST;
				goto out;
			}
		} else {
			ret = trie_check_add_elem(trie, flags);
			if (ret)
				goto out;
		}

		new_node->child[0] = node->child[0];
		new_node->child[1] = node->child[1];

		if (!(node->flags & LPM_TREE_NODE_FLAG_IM))
			trie->n_entries--;

		rcu_assign_pointer(*slot, new_node);
		free_node = node;

		goto out;
	}

	ret = trie_check_add_elem(trie, flags);
	if (ret)
		goto out;

	/* If the new node matches the prefix completely, it must be inserted
	 * as an ancestor. Simply insert it between @node and *@slot.
	 */
@@ -406,8 +425,10 @@ static long trie_update_elem(struct bpf_map *map,
		goto out;
	}

	im_node = lpm_trie_node_alloc(trie, NULL);
	/* migration is disabled within the locked scope */
	im_node = lpm_trie_node_alloc(trie, NULL, false);
	if (!im_node) {
		trie->n_entries--;
		ret = -ENOMEM;
		goto out;
	}
@@ -429,16 +450,13 @@ static long trie_update_elem(struct bpf_map *map,
	rcu_assign_pointer(*slot, im_node);

out:
	if (ret) {
		if (new_node)
			trie->n_entries--;
	raw_spin_unlock_irqrestore(&trie->lock, irq_flags);

		kfree(new_node);
		kfree(im_node);
	}

	spin_unlock_irqrestore(&trie->lock, irq_flags);
	kfree_rcu(free_node, rcu);
	migrate_disable();
	if (ret)
		bpf_mem_cache_free(&trie->ma, new_node);
	bpf_mem_cache_free_rcu(&trie->ma, free_node);
	migrate_enable();

	return ret;
}
@@ -459,7 +477,7 @@ static long trie_delete_elem(struct bpf_map *map, void *_key)
	if (key->prefixlen > trie->max_prefixlen)
		return -EINVAL;

	spin_lock_irqsave(&trie->lock, irq_flags);
	raw_spin_lock_irqsave(&trie->lock, irq_flags);

	/* Walk the tree looking for an exact key/length match and keeping
	 * track of the path we traverse.  We will need to know the node
@@ -535,9 +553,12 @@ static long trie_delete_elem(struct bpf_map *map, void *_key)
	free_node = node;

out:
	spin_unlock_irqrestore(&trie->lock, irq_flags);
	kfree_rcu(free_parent, rcu);
	kfree_rcu(free_node, rcu);
	raw_spin_unlock_irqrestore(&trie->lock, irq_flags);

	migrate_disable();
	bpf_mem_cache_free_rcu(&trie->ma, free_parent);
	bpf_mem_cache_free_rcu(&trie->ma, free_node);
	migrate_enable();

	return ret;
}
@@ -559,6 +580,8 @@ static long trie_delete_elem(struct bpf_map *map, void *_key)
static struct bpf_map *trie_alloc(union bpf_attr *attr)
{
	struct lpm_trie *trie;
	size_t leaf_size;
	int err;

	/* check sanity of attributes */
	if (attr->max_entries == 0 ||
@@ -581,9 +604,19 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
			  offsetof(struct bpf_lpm_trie_key_u8, data);
	trie->max_prefixlen = trie->data_size * 8;

	spin_lock_init(&trie->lock);
	raw_spin_lock_init(&trie->lock);

	/* Allocate intermediate and leaf nodes from the same allocator */
	leaf_size = sizeof(struct lpm_trie_node) + trie->data_size +
		    trie->map.value_size;
	err = bpf_mem_alloc_init(&trie->ma, leaf_size, false);
	if (err)
		goto free_out;
	return &trie->map;

free_out:
	bpf_map_area_free(trie);
	return ERR_PTR(err);
}

static void trie_free(struct bpf_map *map)
@@ -615,13 +648,17 @@ static void trie_free(struct bpf_map *map)
				continue;
			}

			kfree(node);
			/* No bpf program may access the map, so freeing the
			 * node without waiting for the extra RCU GP.
			 */
			bpf_mem_cache_raw_free(node);
			RCU_INIT_POINTER(*slot, NULL);
			break;
		}
	}

out:
	bpf_mem_alloc_destroy(&trie->ma);
	bpf_map_area_free(trie);
}

@@ -633,7 +670,7 @@ static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key)
	struct lpm_trie_node **node_stack = NULL;
	int err = 0, stack_ptr = -1;
	unsigned int next_bit;
	size_t matchlen;
	size_t matchlen = 0;

	/* The get_next_key follows postorder. For the 4 node example in
	 * the top of this file, the trie_get_next_key() returns the following
@@ -672,7 +709,7 @@ static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key)
		next_bit = extract_bit(key->data, node->prefixlen);
		node = rcu_dereference(node->child[next_bit]);
	}
	if (!node || node->prefixlen != key->prefixlen ||
	if (!node || node->prefixlen != matchlen ||
	    (node->flags & LPM_TREE_NODE_FLAG_IM))
		goto find_leftmost;

+0 −1
Original line number Diff line number Diff line
@@ -5,7 +5,6 @@ bpf-syscall*
test_verifier
test_maps
test_lru_map
test_lpm_map
test_tag
FEATURE-DUMP.libbpf
FEATURE-DUMP.selftests
+1 −1
Original line number Diff line number Diff line
@@ -83,7 +83,7 @@ CLANG_CPUV4 := 1
endif

# Order correspond to 'make run_tests' order
TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test_progs \
TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_progs \
	test_sockmap \
	test_tcpnotify_user test_sysctl \
	test_progs-no_alu32
+398 −7
Original line number Diff line number Diff line
@@ -20,10 +20,12 @@
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <endian.h>
#include <arpa/inet.h>
#include <sys/time.h>

#include <bpf/bpf.h>
#include <test_maps.h>

#include "bpf_util.h"

@@ -33,6 +35,22 @@ struct tlpm_node {
	uint8_t key[];
};

/* LPM trie key whose data part is 8 raw bytes; used by the string iteration
 * test. The anonymous union overlays struct bpf_lpm_trie_key_hdr with a plain
 * __u32 so that the tests can read and write @prefixlen directly.
 */
struct lpm_trie_bytes_key {
	union {
		struct bpf_lpm_trie_key_hdr hdr;
		__u32 prefixlen;
	};
	/* key bytes; @prefixlen counts how many leading bits are significant */
	unsigned char data[8];
};

/* LPM trie key whose data part is a single unsigned int. The anonymous union
 * overlays struct bpf_lpm_trie_key_hdr with a plain __u32 so that the tests
 * can read and write @prefixlen directly.
 */
struct lpm_trie_int_key {
	union {
		struct bpf_lpm_trie_key_hdr hdr;
		__u32 prefixlen;
	};
	/* the tests store this value in big-endian byte order (htobe32()) */
	unsigned int data;
};

static struct tlpm_node *tlpm_match(struct tlpm_node *list,
				    const uint8_t *key,
				    size_t n_bits);
@@ -223,7 +241,7 @@ static void test_lpm_map(int keysize)
	n_matches = 0;
	n_matches_after_delete = 0;
	n_nodes = 1 << 8;
	n_lookups = 1 << 16;
	n_lookups = 1 << 9;

	data = alloca(keysize);
	memset(data, 0, keysize);
@@ -770,16 +788,385 @@ static void test_lpm_multi_thread(void)
	close(map_fd);
}

int main(void)
static int lpm_trie_create(unsigned int key_size, unsigned int value_size, unsigned int max_entries)
{
	LIBBPF_OPTS(bpf_map_create_opts, opts);
	int fd;

	opts.map_flags = BPF_F_NO_PREALLOC;
	fd = bpf_map_create(BPF_MAP_TYPE_LPM_TRIE, "lpm_trie", key_size, value_size, max_entries,
			    &opts);
	CHECK(fd < 0, "bpf_map_create", "error %d\n", errno);

	return fd;
}

/* Verify how the update flags are handled by an LPM trie created with room
 * for three elements:
 *   - unsupported flag values are rejected with -EINVAL;
 *   - BPF_EXIST on a key that is not stored (including a key that only
 *     matches an intermediate node) fails with -ENOENT;
 *   - BPF_NOEXIST on a key that is already stored fails with -EEXIST;
 *   - BPF_ANY, and BPF_EXIST on a stored key, overwrite the value.
 * Every successful update is followed by a lookup that checks the value.
 */
static void test_lpm_trie_update_flags(void)
{
	struct lpm_trie_int_key key;
	unsigned int value, got;
	int fd, err;

	fd = lpm_trie_create(sizeof(key), sizeof(value), 3);

	/* invalid flags (Error) */
	key.prefixlen = 32;
	key.data = 0;
	value = 0;
	err = bpf_map_update_elem(fd, &key, &value, BPF_F_LOCK);
	CHECK(err != -EINVAL, "invalid update flag", "error %d\n", err);

	/* invalid flags (Error) */
	key.prefixlen = 32;
	key.data = 0;
	value = 0;
	err = bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST | BPF_EXIST);
	CHECK(err != -EINVAL, "invalid update flag", "error %d\n", err);

	/* overwrite an empty LPM trie (Error) */
	key.prefixlen = 32;
	key.data = 0;
	value = 2;
	err = bpf_map_update_elem(fd, &key, &value, BPF_EXIST);
	CHECK(err != -ENOENT, "overwrite empty LPM trie", "error %d\n", err);

	/* add a new node */
	key.prefixlen = 16;
	key.data = 0;
	value = 1;
	err = bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST);
	CHECK(err, "add new elem", "error %d\n", err);
	got = 0;
	err = bpf_map_lookup_elem(fd, &key, &got);
	CHECK(err, "lookup elem", "error %d\n", err);
	CHECK(got != value, "check value", "got %u exp %u\n", got, value);

	/* add the same node as new node (Error) */
	err = bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST);
	CHECK(err != -EEXIST, "add new elem again", "error %d\n", err);

	/* overwrite the existing node */
	value = 4;
	err = bpf_map_update_elem(fd, &key, &value, BPF_EXIST);
	CHECK(err, "overwrite elem", "error %d\n", err);
	got = 0;
	err = bpf_map_lookup_elem(fd, &key, &got);
	CHECK(err, "lookup elem", "error %d\n", err);
	CHECK(got != value, "check value", "got %u exp %u\n", got, value);

	/* overwrite the node */
	value = 1;
	err = bpf_map_update_elem(fd, &key, &value, BPF_ANY);
	CHECK(err, "update elem", "error %d\n", err);
	got = 0;
	err = bpf_map_lookup_elem(fd, &key, &got);
	CHECK(err, "lookup elem", "error %d\n", err);
	CHECK(got != value, "check value", "got %u exp %u\n", got, value);

	/* overwrite a non-existent node which is the prefix of the first
	 * node (Error).
	 */
	key.prefixlen = 8;
	key.data = 0;
	value = 2;
	err = bpf_map_update_elem(fd, &key, &value, BPF_EXIST);
	CHECK(err != -ENOENT, "overwrite nonexistent elem", "error %d\n", err);

	/* add a new node which is the prefix of the first node */
	err = bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST);
	CHECK(err, "add new elem", "error %d\n", err);
	got = 0;
	err = bpf_map_lookup_elem(fd, &key, &got);
	CHECK(err, "lookup key", "error %d\n", err);
	CHECK(got != value, "check value", "got %u exp %u\n", got, value);

	/* add another new node which will be the sibling of the first node:
	 * its 9th bit is set while the first node's 9th bit is clear.
	 */
	key.prefixlen = 9;
	key.data = htobe32(1 << 23);
	value = 5;
	err = bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST);
	CHECK(err, "add new elem", "error %d\n", err);
	got = 0;
	err = bpf_map_lookup_elem(fd, &key, &got);
	CHECK(err, "lookup key", "error %d\n", err);
	CHECK(got != value, "check value", "got %u exp %u\n", got, value);

	/* overwrite the third node */
	value = 3;
	err = bpf_map_update_elem(fd, &key, &value, BPF_ANY);
	CHECK(err, "overwrite elem", "error %d\n", err);
	got = 0;
	err = bpf_map_lookup_elem(fd, &key, &got);
	CHECK(err, "lookup key", "error %d\n", err);
	CHECK(got != value, "check value", "got %u exp %u\n", got, value);

	/* delete the second node to make it an intermediate node */
	key.prefixlen = 8;
	key.data = 0;
	err = bpf_map_delete_elem(fd, &key);
	CHECK(err, "del elem", "error %d\n", err);

	/* overwrite the intermediate node (Error) */
	value = 2;
	err = bpf_map_update_elem(fd, &key, &value, BPF_EXIST);
	CHECK(err != -ENOENT, "overwrite nonexistent elem", "error %d\n", err);

	close(fd);
}

/* Fill an LPM trie created with max_entries == 3, then check that adding a
 * fourth key fails with -ENOSPC while overwriting keys that are already
 * stored (with BPF_EXIST and with BPF_ANY) still succeeds on the full map.
 * Every successful update is followed by a lookup that checks the value.
 */
static void test_lpm_trie_update_full_map(void)
{
	struct lpm_trie_int_key key;
	int value, got;
	int fd, err;

	fd = lpm_trie_create(sizeof(key), sizeof(value), 3);

	/* add the first node: a 16-bit all-zero prefix */
	key.prefixlen = 16;
	key.data = 0;
	value = 0;
	err = bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST);
	CHECK(err, "add new elem", "error %d\n", err);
	got = 0;
	err = bpf_map_lookup_elem(fd, &key, &got);
	CHECK(err, "lookup elem", "error %d\n", err);
	CHECK(got != value, "check value", "got %d exp %d\n", got, value);

	/* add the second node: an 8-bit prefix of the first node */
	key.prefixlen = 8;
	key.data = 0;
	value = 1;
	err = bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST);
	CHECK(err, "add new elem", "error %d\n", err);
	got = 0;
	err = bpf_map_lookup_elem(fd, &key, &got);
	CHECK(err, "lookup elem", "error %d\n", err);
	CHECK(got != value, "check value", "got %d exp %d\n", got, value);

	/* add the third node: a 9-bit prefix whose 9th bit is set; the map
	 * now holds max_entries elements
	 */
	key.prefixlen = 9;
	key.data = htobe32(1 << 23);
	value = 2;
	err = bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST);
	CHECK(err, "add new elem", "error %d\n", err);
	got = 0;
	err = bpf_map_lookup_elem(fd, &key, &got);
	CHECK(err, "lookup elem", "error %d\n", err);
	CHECK(got != value, "check value", "got %d exp %d\n", got, value);

	/* try to add one more node to the full map (Error) */
	key.prefixlen = 32;
	key.data = 0;
	value = 3;
	err = bpf_map_update_elem(fd, &key, &value, BPF_ANY);
	CHECK(err != -ENOSPC, "add to full trie", "error %d\n", err);

	/* update the value of an existing node with BPF_EXIST */
	key.prefixlen = 16;
	key.data = 0;
	value = 4;
	err = bpf_map_update_elem(fd, &key, &value, BPF_EXIST);
	CHECK(err, "overwrite elem", "error %d\n", err);
	got = 0;
	err = bpf_map_lookup_elem(fd, &key, &got);
	CHECK(err, "lookup elem", "error %d\n", err);
	CHECK(got != value, "check value", "got %d exp %d\n", got, value);

	/* update the value of an existing node with BPF_ANY */
	key.prefixlen = 9;
	key.data = htobe32(1 << 23);
	value = 5;
	err = bpf_map_update_elem(fd, &key, &value, BPF_ANY);
	CHECK(err, "overwrite elem", "error %d\n", err);
	got = 0;
	err = bpf_map_lookup_elem(fd, &key, &got);
	CHECK(err, "lookup elem", "error %d\n", err);
	CHECK(got != value, "check value", "got %d exp %d\n", got, value);

	close(fd);
}

/* qsort() comparator for an array of C strings: @a and @b each point at a
 * `const char *` element, and the referenced strings are ordered by strcmp().
 */
static int cmp_str(const void *a, const void *b)
{
	const char *const *lhs = a;
	const char *const *rhs = b;

	return strcmp(*lhs, *rhs);
}

/* Save strings in LPM trie. The trailing '\0' for each string will be
 * accounted in the prefixlen. The strings returned during the iteration
 * should be sorted as expected (strcmp() order).
 *
 * The test first inserts every string, checking BPF_NOEXIST/BPF_EXIST
 * handling and that all previously inserted strings remain reachable, then
 * verifies that one more insertion into the full map fails with -ENOSPC,
 * and finally iterates the map twice: once without modification and once
 * deleting each key right after it has been returned.
 */
static void test_lpm_trie_iterate_strs(void)
{
	static const char * const keys[] = {
		"ab", "abO", "abc", "abo", "abS", "abcd",
	};
	const char *sorted_keys[ARRAY_SIZE(keys)];
	struct lpm_trie_bytes_key key, next_key;
	unsigned int value, got, i, j, len;
	struct lpm_trie_bytes_key *cur;
	int fd, err;

	fd = lpm_trie_create(sizeof(key), sizeof(value), ARRAY_SIZE(keys));

	for (i = 0; i < ARRAY_SIZE(keys); i++) {
		unsigned int flags;

		/* add i-th element */
		flags = i % 2 ? BPF_NOEXIST : 0;
		len = strlen(keys[i]);
		/* include the trailing '\0' */
		key.prefixlen = (len + 1) * 8;
		memset(key.data, 0, sizeof(key.data));
		memcpy(key.data, keys[i], len);
		value = i + 100;
		err = bpf_map_update_elem(fd, &key, &value, flags);
		CHECK(err, "add elem", "#%u error %d\n", i, err);

		err = bpf_map_lookup_elem(fd, &key, &got);
		CHECK(err, "lookup elem", "#%u error %d\n", i, err);
		CHECK(got != value, "lookup elem", "#%u expect %u got %u\n", i, value, got);

		/* re-add i-th element (Error) */
		err = bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST);
		CHECK(err != -EEXIST, "re-add elem", "#%u error %d\n", i, err);

		/* Overwrite i-th element */
		flags = i % 2 ? 0 : BPF_EXIST;
		value = i;
		err = bpf_map_update_elem(fd, &key, &value, flags);
		CHECK(err, "update elem", "error %d\n", err);

		/* Lookup #[0~i] elements; the j-th element holds the value j */
		for (j = 0; j <= i; j++) {
			len = strlen(keys[j]);
			key.prefixlen = (len + 1) * 8;
			memset(key.data, 0, sizeof(key.data));
			memcpy(key.data, keys[j], len);
			err = bpf_map_lookup_elem(fd, &key, &got);
			CHECK(err, "lookup elem", "#%u/%u error %d\n", i, j, err);
			/* report j, the value actually compared against */
			CHECK(got != j, "lookup elem", "#%u/%u expect %u got %u\n",
			      i, j, j, got);
		}
	}

	/* Add element to a full LPM trie (Error) */
	key.prefixlen = sizeof(key.data) * 8;
	memset(key.data, 0, sizeof(key.data));
	value = 0;
	err = bpf_map_update_elem(fd, &key, &value, 0);
	CHECK(err != -ENOSPC, "add to full LPM trie", "error %d\n", err);

	/* Iterate sorted elements: no deletion */
	memcpy(sorted_keys, keys, sizeof(keys));
	qsort(sorted_keys, ARRAY_SIZE(sorted_keys), sizeof(sorted_keys[0]), cmp_str);
	cur = NULL;
	for (i = 0; i < ARRAY_SIZE(sorted_keys); i++) {
		len = strlen(sorted_keys[i]);
		err = bpf_map_get_next_key(fd, cur, &next_key);
		CHECK(err, "iterate", "#%u error %d\n", i, err);
		CHECK(next_key.prefixlen != (len + 1) * 8, "iterate",
		      "#%u invalid len %u expect %u\n",
		      i, next_key.prefixlen, (len + 1) * 8);
		/* the '*' precision of "%.*s" takes an int argument */
		CHECK(memcmp(sorted_keys[i], next_key.data, len + 1), "iterate",
		      "#%u got %.*s exp %.*s\n", i, (int)len, next_key.data,
		      (int)len, sorted_keys[i]);

		cur = &next_key;
	}
	err = bpf_map_get_next_key(fd, cur, &next_key);
	CHECK(err != -ENOENT, "more element", "error %d\n", err);

	/* Iterate sorted elements: delete the found key after each iteration */
	cur = NULL;
	for (i = 0; i < ARRAY_SIZE(sorted_keys); i++) {
		len = strlen(sorted_keys[i]);
		err = bpf_map_get_next_key(fd, cur, &next_key);
		CHECK(err, "iterate", "#%u error %d\n", i, err);
		CHECK(next_key.prefixlen != (len + 1) * 8, "iterate",
		      "#%u invalid len %u expect %u\n",
		      i, next_key.prefixlen, (len + 1) * 8);
		CHECK(memcmp(sorted_keys[i], next_key.data, len + 1), "iterate",
		      "#%u got %.*s exp %.*s\n", i, (int)len, next_key.data,
		      (int)len, sorted_keys[i]);

		cur = &next_key;

		err = bpf_map_delete_elem(fd, cur);
		CHECK(err, "delete", "#%u error %d\n", i, err);
	}
	err = bpf_map_get_next_key(fd, cur, &next_key);
	CHECK(err != -ENOENT, "non-empty LPM trie", "error %d\n", err);

	close(fd);
}

/* Use the fixed prefixlen (32) and save integers in LPM trie. The iteration of
 * LPM trie will return these integers in big-endian order, therefore, convert
 * these integers to big-endian before update. After each iteration, delete the
 * found key (the smallest integer) and expect the next iteration will return
 * the second smallest number. Once every key has been deleted, iteration from
 * both the last key and from NULL must report -ENOENT.
 */
static void test_lpm_trie_iterate_ints(void)
{
	struct lpm_trie_int_key key, next_key;
	unsigned int i, max_entries;
	struct lpm_trie_int_key *cur;
	unsigned int *data_set;
	int fd, err;
	bool value;

	max_entries = 4096;
	data_set = calloc(max_entries, sizeof(*data_set));
	CHECK(!data_set, "calloc", "no mem\n");
	/* data_set[] is already in ascending order: 0, 1, ..., max_entries - 1 */
	for (i = 0; i < max_entries; i++)
		data_set[i] = i;

	fd = lpm_trie_create(sizeof(key), sizeof(value), max_entries);
	value = true;
	for (i = 0; i < max_entries; i++) {
		key.prefixlen = 32;
		key.data = htobe32(data_set[i]);

		err = bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST);
		CHECK(err, "add elem", "#%u error %d\n", i, err);
	}

	cur = NULL;
	for (i = 0; i < max_entries; i++) {
		err = bpf_map_get_next_key(fd, cur, &next_key);
		CHECK(err, "iterate", "#%u error %d\n", i, err);
		CHECK(next_key.prefixlen != 32, "iterate", "#%u invalid len %u\n",
		      i, next_key.prefixlen);
		CHECK(be32toh(next_key.data) != data_set[i], "iterate", "#%u got 0x%x exp 0x%x\n",
		      i, be32toh(next_key.data), data_set[i]);
		cur = &next_key;

		/*
		 * Delete the minimal key, the next call of
		 * bpf_map_get_next_key() will return the second minimal key.
		 */
		err = bpf_map_delete_elem(fd, &next_key);
		CHECK(err, "del elem", "#%u elem error %d\n", i, err);
	}
	err = bpf_map_get_next_key(fd, cur, &next_key);
	CHECK(err != -ENOENT, "more element", "error %d\n", err);

	err = bpf_map_get_next_key(fd, NULL, &next_key);
	CHECK(err != -ENOENT, "non-empty LPM trie", "error %d\n", err);

	free(data_set);

	close(fd);
}

void test_lpm_trie_map_basic_ops(void)
{
	int i;

	/* we want predictable, pseudo random tests */
	srand(0xf00ba1);

	/* Use libbpf 1.0 API mode */
	libbpf_set_strict_mode(LIBBPF_STRICT_ALL);

	test_lpm_basic();
	test_lpm_order();

@@ -792,6 +1179,10 @@ int main(void)
	test_lpm_get_next_key();
	test_lpm_multi_thread();

	printf("test_lpm: OK\n");
	return 0;
	test_lpm_trie_update_flags();
	test_lpm_trie_update_full_map();
	test_lpm_trie_iterate_strs();
	test_lpm_trie_iterate_ints();

	printf("%s: PASS\n", __func__);
}