Merge branch 'bpf-reduce-memory-usage-for-bpf_global_percpu_ma' (f8506c57) · Commits · git / linux-nf

include/linux/bpf_mem_alloc.h

+8 −0

Original line number	Diff line number	Diff line
		@@ -11,6 +11,7 @@ struct bpf_mem_caches;
		/* State of one BPF memory allocator instance. */
		struct bpf_mem_alloc {
		/* Per-cpu sets of caches; bpf_mem_alloc_percpu_unit_init() indexes
		 * ->cache[] of each cpu's set by size class.
		 */
		struct bpf_mem_caches __percpu *caches;
		/* Per-cpu single cache. */
		struct bpf_mem_cache __percpu *cache;
		/* obj_cgroup recorded by the init functions; bpf_mem_alloc_destroy()
		 * drops it with obj_cgroup_put() when it is non-NULL.
		 */
		struct obj_cgroup *objcg;
		/* Set to true by bpf_mem_alloc_percpu_init(). When set, bpf_mem_alloc()
		 * does not add LLIST_NODE_SZ to the requested size.
		 */
		bool percpu;
		/* NOTE(review): presumably used for deferred teardown - confirm. */
		struct work_struct work;
		};
		@@ -21,8 +22,15 @@ struct bpf_mem_alloc {
		* 'size = 0' is for bpf_mem_alloc which manages many fixed-size objects.
		* Alloc and free are done with bpf_mem_{alloc,free}() and the size of
		* the returned object is given by the size argument of bpf_mem_alloc().
		* If percpu is true, -EINVAL is returned in order to avoid large
		* memory consumption; use bpf_mem_alloc_percpu_unit_init() below
		* to do on-demand per-cpu allocation for each size.
		*/
		int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu);
		/* Initialize a non-fixed-size percpu memory allocator.
		 * Both arguments are pointers: @ma is filled in by the call and @objcg
		 * (may be NULL) is stored in it.
		 */
		int bpf_mem_alloc_percpu_init(struct bpf_mem_alloc *ma, struct obj_cgroup *objcg);
		/* The percpu allocation with a specific unit size. */
		int bpf_mem_alloc_percpu_unit_init(struct bpf_mem_alloc *ma, int size);
		/* Tear down an allocator set up by one of the init functions above. */
		void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma);

		/* kmalloc/kfree equivalent: */

kernel/bpf/memalloc.c

+81 −12

Original line number	Diff line number	Diff line
		@@ -121,6 +121,8 @@ struct bpf_mem_caches {
		struct bpf_mem_cache cache[NUM_CACHES];
		};

		/* Unit size of each of the NUM_CACHES caches, indexed by the cache index
		 * that bpf_mem_cache_idx() returns for a requested size.
		 */
		static const u16 sizes[NUM_CACHES] = {96, 192, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096};

		static struct llist_node notrace __llist_del_first(struct llist_head head)
		{
		struct llist_node entry, next;
		@@ -462,11 +464,17 @@ static void notrace irq_work_raise(struct bpf_mem_cache *c)
		* consume ~ 11 Kbyte per cpu.
		* Typical case will be between 11K and 116K closer to 11K.
		* bpf progs can and should share bpf_mem_cache when possible.
		*
		* Percpu allocation is typically rare. To avoid potentially unnecessary large
		* memory consumption, set low_watermark = 1 and high_watermark = 3, resulting in c->batch = 1.
		*/
		static void init_refill_work(struct bpf_mem_cache *c)
		{
		init_irq_work(&c->refill_work, bpf_mem_refill);
		if (c->unit_size <= 256) {
		if (c->percpu_size) {
		c->low_watermark = 1;
		c->high_watermark = 3;
		} else if (c->unit_size <= 256) {
		c->low_watermark = 32;
		c->high_watermark = 96;
		} else {
		@@ -483,11 +491,16 @@ static void init_refill_work(struct bpf_mem_cache *c)

		/* Pre-fill cache @c with its initial objects.
		 *
		 * @c:   cache to fill; ->percpu_size and ->unit_size select the count
		 * @cpu: cpu the cache belongs to; cpu_to_node(@cpu) is passed to alloc_bulk()
		 */
		static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
		{
			int cnt = 1;

			/* To avoid consuming memory, for non-percpu allocation, assume that
			 * 1st run of bpf prog won't be doing more than 4 map_update_elem from
			 * irq disabled region if unit size is less than or equal to 256.
			 * For all other cases, let us just do one allocation.
			 */
			if (!c->percpu_size && c->unit_size <= 256)
				cnt = 4;
			alloc_bulk(c, cnt, cpu_to_node(cpu), false);
		}

		/* When size != 0 bpf_mem_cache for each cpu.
		@@ -499,12 +512,14 @@ static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
		*/
		int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
		{
		static u16 sizes[NUM_CACHES] = {96, 192, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096};
		struct bpf_mem_caches cc, __percpu pcc;
		struct bpf_mem_cache c, __percpu pc;
		struct obj_cgroup *objcg = NULL;
		int cpu, i, unit_size, percpu_size = 0;

		if (percpu && size == 0)
		return -EINVAL;

		/* room for llist_node and per-cpu pointer */
		if (percpu)
		percpu_size = LLIST_NODE_SZ + sizeof(void *);
		@@ -523,6 +538,8 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
		if (memcg_bpf_enabled())
		objcg = get_obj_cgroup_from_current();
		#endif
		ma->objcg = objcg;

		for_each_possible_cpu(cpu) {
		c = per_cpu_ptr(pc, cpu);
		c->unit_size = unit_size;
		@@ -542,6 +559,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
		#ifdef CONFIG_MEMCG_KMEM
		objcg = get_obj_cgroup_from_current();
		#endif
		ma->objcg = objcg;
		for_each_possible_cpu(cpu) {
		cc = per_cpu_ptr(pcc, cpu);
		for (i = 0; i < NUM_CACHES; i++) {
		@@ -560,6 +578,56 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
		return 0;
		}

		int bpf_mem_alloc_percpu_init(struct bpf_mem_alloc ma, struct obj_cgroup objcg)
		{
		struct bpf_mem_caches __percpu *pcc;

		pcc = __alloc_percpu_gfp(sizeof(struct bpf_mem_caches), 8, GFP_KERNEL);
		if (!pcc)
		return -ENOMEM;

		ma->caches = pcc;
		ma->objcg = objcg;
		ma->percpu = true;
		return 0;
		}

		int bpf_mem_alloc_percpu_unit_init(struct bpf_mem_alloc *ma, int size)
		{
		struct bpf_mem_caches cc, __percpu pcc;
		int cpu, i, unit_size, percpu_size;
		struct obj_cgroup *objcg;
		struct bpf_mem_cache *c;

		i = bpf_mem_cache_idx(size);
		if (i < 0)
		return -EINVAL;

		/* room for llist_node and per-cpu pointer */
		percpu_size = LLIST_NODE_SZ + sizeof(void *);

		unit_size = sizes[i];
		objcg = ma->objcg;
		pcc = ma->caches;

		for_each_possible_cpu(cpu) {
		cc = per_cpu_ptr(pcc, cpu);
		c = &cc->cache[i];
		if (cpu == 0 && c->unit_size)
		break;

		c->unit_size = unit_size;
		c->objcg = objcg;
		c->percpu_size = percpu_size;
		c->tgt = c;

		init_refill_work(c);
		prefill_mem_cache(c, cpu);
		}

		return 0;
		}

		static void drain_mem_cache(struct bpf_mem_cache *c)
		{
		bool percpu = !!c->percpu_size;
		@@ -691,9 +759,8 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
		rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress);
		rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
		}
		/* objcg is the same across cpus */
		if (c->objcg)
		obj_cgroup_put(c->objcg);
		if (ma->objcg)
		obj_cgroup_put(ma->objcg);
		destroy_mem_alloc(ma, rcu_in_progress);
		}
		if (ma->caches) {
		@@ -709,8 +776,8 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
		rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
		}
		}
		if (c->objcg)
		obj_cgroup_put(c->objcg);
		if (ma->objcg)
		obj_cgroup_put(ma->objcg);
		destroy_mem_alloc(ma, rcu_in_progress);
		}
		}
		@@ -833,7 +900,9 @@ void notrace bpf_mem_alloc(struct bpf_mem_alloc ma, size_t size)
		if (!size)
		return NULL;

		idx = bpf_mem_cache_idx(size + LLIST_NODE_SZ);
		if (!ma->percpu)
		size += LLIST_NODE_SZ;
		idx = bpf_mem_cache_idx(size);
		if (idx < 0)
		return NULL;

kernel/bpf/verifier.c

+31 −14

Original line number	Diff line number	Diff line
		@@ -195,6 +195,8 @@ struct bpf_verifier_stack_elem {
		POISON_POINTER_DELTA))
		#define BPF_MAP_PTR(X) ((struct bpf_map *)((X) & ~BPF_MAP_PTR_UNPRIV))

		/* Largest type size accepted for bpf_percpu_obj_new(); check_kfunc_call()
		 * rejects larger types with -EINVAL.
		 */
		#define BPF_GLOBAL_PERCPU_MA_MAX_SIZE 512

		static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx);
		static int release_reference(struct bpf_verifier_env *env, int ref_obj_id);
		static void invalidate_non_owning_refs(struct bpf_verifier_env *env);
		@@ -12139,20 +12141,6 @@ static int check_kfunc_call(struct bpf_verifier_env env, struct bpf_insn insn,
		if (meta.func_id == special_kfunc_list[KF_bpf_obj_new_impl] && !bpf_global_ma_set)
		return -ENOMEM;

		if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
		if (!bpf_global_percpu_ma_set) {
		mutex_lock(&bpf_percpu_ma_lock);
		if (!bpf_global_percpu_ma_set) {
		err = bpf_mem_alloc_init(&bpf_global_percpu_ma, 0, true);
		if (!err)
		bpf_global_percpu_ma_set = true;
		}
		mutex_unlock(&bpf_percpu_ma_lock);
		if (err)
		return err;
		}
		}

		if (((u64)(u32)meta.arg_constant.value) != meta.arg_constant.value) {
		verbose(env, "local type ID argument must be in range [0, U32_MAX]\n");
		return -EINVAL;
		@@ -12173,6 +12161,35 @@ static int check_kfunc_call(struct bpf_verifier_env env, struct bpf_insn insn,
		return -EINVAL;
		}

		if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
		if (ret_t->size > BPF_GLOBAL_PERCPU_MA_MAX_SIZE) {
		verbose(env, "bpf_percpu_obj_new type size (%d) is greater than %d\n",
		ret_t->size, BPF_GLOBAL_PERCPU_MA_MAX_SIZE);
		return -EINVAL;
		}

		if (!bpf_global_percpu_ma_set) {
		mutex_lock(&bpf_percpu_ma_lock);
		if (!bpf_global_percpu_ma_set) {
		/* Charge memory allocated with bpf_global_percpu_ma to
		* root memcg. The obj_cgroup for root memcg is NULL.
		*/
		err = bpf_mem_alloc_percpu_init(&bpf_global_percpu_ma, NULL);
		if (!err)
		bpf_global_percpu_ma_set = true;
		}
		mutex_unlock(&bpf_percpu_ma_lock);
		if (err)
		return err;
		}

		mutex_lock(&bpf_percpu_ma_lock);
		err = bpf_mem_alloc_percpu_unit_init(&bpf_global_percpu_ma, ret_t->size);
		mutex_unlock(&bpf_percpu_ma_lock);
		if (err)
		return err;
		}

		struct_meta = btf_find_struct_meta(ret_btf, ret_btf_id);
		if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
		if (!__btf_type_is_scalar_struct(env, ret_btf, ret_t, 0)) {

tools/testing/selftests/bpf/prog_tests/test_bpf_ma.c

+13 −7

Original line number	Diff line number	Diff line
		@@ -14,7 +14,8 @@ static void do_bpf_ma_test(const char *name)
		struct test_bpf_ma *skel;
		struct bpf_program *prog;
		struct btf *btf;
		int i, err;
		int i, err, id;
		char tname[32];

		skel = test_bpf_ma__open();
		if (!ASSERT_OK_PTR(skel, "open"))
		@@ -25,16 +26,21 @@ static void do_bpf_ma_test(const char *name)
		goto out;

		for (i = 0; i < ARRAY_SIZE(skel->rodata->data_sizes); i++) {
		char name[32];
		int id;

		snprintf(name, sizeof(name), "bin_data_%u", skel->rodata->data_sizes[i]);
		id = btf__find_by_name_kind(btf, name, BTF_KIND_STRUCT);
		if (!ASSERT_GT(id, 0, "bin_data"))
		snprintf(tname, sizeof(tname), "bin_data_%u", skel->rodata->data_sizes[i]);
		id = btf__find_by_name_kind(btf, tname, BTF_KIND_STRUCT);
		if (!ASSERT_GT(id, 0, tname))
		goto out;
		skel->rodata->data_btf_ids[i] = id;
		}

		for (i = 0; i < ARRAY_SIZE(skel->rodata->percpu_data_sizes); i++) {
		snprintf(tname, sizeof(tname), "percpu_bin_data_%u", skel->rodata->percpu_data_sizes[i]);
		id = btf__find_by_name_kind(btf, tname, BTF_KIND_STRUCT);
		if (!ASSERT_GT(id, 0, tname))
		goto out;
		skel->rodata->percpu_data_btf_ids[i] = id;
		}

		prog = bpf_object__find_program_by_name(skel->obj, name);
		if (!ASSERT_OK_PTR(prog, "invalid prog name"))
		goto out;

tools/testing/selftests/bpf/progs/percpu_alloc_fail.c

+18 −0

Original line number	Diff line number	Diff line
		@@ -17,6 +17,10 @@ struct val_with_rb_root_t {
		struct bpf_spin_lock lock;
		};

		/* 600-byte value type used by test_array_map_8; its size is above the
		 * 512 limit named in that test's expected verifier message.
		 */
		struct val_600b_t {
		char b[600];
		};

		struct elem {
		long sum;
		struct val_t __percpu_kptr *pc;
		@@ -161,4 +165,18 @@ int BPF_PROG(test_array_map_7)
		return 0;
		}

		/* Negative test: allocating a 600-byte per-cpu object must be refused at
		 * load time with the message given in __msg() below.
		 */
		SEC("?fentry.s/bpf_fentry_test1")
		__failure __msg("bpf_percpu_obj_new type size (600) is greater than 512")
		int BPF_PROG(test_array_map_8)
		{
			struct val_600b_t __percpu_kptr *obj = bpf_percpu_obj_new(struct val_600b_t);

			/* Release the object on the success path so no reference is leaked. */
			if (obj)
				bpf_percpu_obj_drop(obj);

			return 0;
		}

		char _license[] SEC("license") = "GPL";