Commit 4d2815a1 authored by Andrii Nakryiko's avatar Andrii Nakryiko
Browse files

Merge branch 'bpf-implement-mprog-api-on-top-of-existing-cgroup-progs'

Yonghong Song says:

====================
bpf: Implement mprog API on top of existing cgroup progs

Current cgroup prog ordering is appending at attachment time. This is not
ideal. In some cases, users want specific ordering at a particular cgroup
level. For example, in Meta, we have a case where three different
applications all have cgroup/setsockopt progs and they require specific
ordering. Current approach is to use a bpfchainer where one bpf prog
contains multiple global functions and each global function can be
freplaced by a prog for a specific application. The ordering of global
functions decides the ordering of those application specific bpf progs.
Using bpfchainer is a centralized approach and is not desirable as
one of applications acts as a daemon. The decentralized attachment
approach is more favorable for those applications.

To address this, the existing mprog API ([2]) seems an ideal solution with
supporting BPF_F_BEFORE and BPF_F_AFTER flags on top of existing cgroup
bpf implementation. More specifically, the support is added for prog/link
attachment with BPF_F_BEFORE and BPF_F_AFTER. The kernel mprog
interface ([2]) is not used and the implementation is directly done in
cgroup bpf code base. The mprog 'revision' is also implemented in
attach/detach/replace, so users can query revision number to check the
change of cgroup prog list.

The patch set contains 5 patches. Patch 1 adds revision support for
cgroup bpf progs. Patch 2 implements mprog API implementation for
prog/link attach and revision update. Patch 3 adds a new libbpf
API to do cgroup link attach with flags like BPF_F_BEFORE/BPF_F_AFTER.
Patches 4 and 5 add two tests to validate the implementation.

  [1] https://lore.kernel.org/r/20250224230116.283071-1-yonghong.song@linux.dev
  [2] https://lore.kernel.org/r/20230719140858.13224-2-daniel@iogearbox.net

Changelogs:
  v4 -> v5:
    - v4: https://lore.kernel.org/bpf/20250530173812.1823479-1-yonghong.song@linux.dev/
    - Remove early prog/link checking based flags and id_or_fd as later code
      will do checking as well.
    - Do proper cgroup flag checking for bpf_prog_attach().
  v3 -> v4:
    - v3: https://lore.kernel.org/bpf/20250517162720.4077882-1-yonghong.song@linux.dev/
    - Refactor some to make BPF_F_BEFORE/BPF_F_AFTER handling easier to understand.
    - Perviously, I degraded 'link' to 'prog' for later mprog handling. This is
      not correct. Similar to mprog.c, we should be check 'link' instead link->prog
      since it is possible two different links may have the same underlying prog and
      we do not want to miss supporting such use case.
  v2 -> v3:
    - v2: https://lore.kernel.org/bpf/20250508223524.487875-1-yonghong.song@linux.dev/
    - Big change to replace get_anchor_prog() to get_prog_list() so the
      'struct bpf_prog_list *' is returned directly.
    - Support 'BPF_F_BEFORE | BPF_F_AFTER' attachment if the prog list is empty
      and flags do not have 'BPF_F_LINK | BPF_F_ID' and id_or_fd is 0.
    - Add BPF_F_LINK support.
    - Patch 4 is added to reuse id_from_prog_fd() and id_from_link_fd().
  v1 -> v2:
    - v1: https://lore.kernel.org/bpf/20250411011523.1838771-1-yonghong.song@linux.dev/
    - Change cgroup_bpf.revisions from atomic64_t to u64.
    - Added missing bpf_prog_put in various places.
    - Rename get_cmp_prog() to get_anchor_prog(). The implementation tries to
      find the anchor prog regardless of whether id_or_fd is non-NULL or not.
    - Rename bpf_cgroup_prog_attached() to is_cgroup_prog_type() and handle
      BPF_PROG_TYPE_LSM properly (with BPF_LSM_CGROUP attach type).
    - I kept 'id || id_or_fd' condition as the condition 'id' is also used
      in mprog.c so I assume it is okay in cgroup.c as well.
====================

Link: https://patch.msgid.link/20250606163131.2428225-1-yonghong.song@linux.dev


Signed-off-by: default avatarAndrii Nakryiko <andrii@kernel.org>
parents e41079f5 e422d5f1
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -63,6 +63,7 @@ struct cgroup_bpf {
	 */
	struct hlist_head progs[MAX_CGROUP_BPF_ATTACH_TYPE];
	u8 flags[MAX_CGROUP_BPF_ATTACH_TYPE];
	u64 revisions[MAX_CGROUP_BPF_ATTACH_TYPE];

	/* list of cgroup shared storages */
	struct list_head storages;
+7 −0
Original line number Diff line number Diff line
@@ -1794,6 +1794,13 @@ union bpf_attr {
				};
				__u64		expected_revision;
			} netkit;
			struct {
				union {
					__u32	relative_fd;
					__u32	relative_id;
				};
				__u64		expected_revision;
			} cgroup;
		};
	} link_create;

+160 −22
Original line number Diff line number Diff line
@@ -658,6 +658,116 @@ static struct bpf_prog_list *find_attach_entry(struct hlist_head *progs,
	return NULL;
}

static struct bpf_link *bpf_get_anchor_link(u32 flags, u32 id_or_fd)
{
	struct bpf_link *link = ERR_PTR(-EINVAL);

	if (flags & BPF_F_ID)
		link = bpf_link_by_id(id_or_fd);
	else if (id_or_fd)
		link = bpf_link_get_from_fd(id_or_fd);
	return link;
}

static struct bpf_prog *bpf_get_anchor_prog(u32 flags, u32 id_or_fd)
{
	struct bpf_prog *prog = ERR_PTR(-EINVAL);

	if (flags & BPF_F_ID)
		prog = bpf_prog_by_id(id_or_fd);
	else if (id_or_fd)
		prog = bpf_prog_get(id_or_fd);
	return prog;
}

static struct bpf_prog_list *get_prog_list(struct hlist_head *progs, struct bpf_prog *prog,
					   struct bpf_cgroup_link *link, u32 flags, u32 id_or_fd)
{
	bool is_link = flags & BPF_F_LINK, is_id = flags & BPF_F_ID;
	struct bpf_prog_list *pltmp, *pl = ERR_PTR(-EINVAL);
	bool preorder = flags & BPF_F_PREORDER;
	struct bpf_link *anchor_link = NULL;
	struct bpf_prog *anchor_prog = NULL;
	bool is_before, is_after;

	is_before = flags & BPF_F_BEFORE;
	is_after = flags & BPF_F_AFTER;
	if (is_link || is_id || id_or_fd) {
		/* flags must have either BPF_F_BEFORE or BPF_F_AFTER */
		if (is_before == is_after)
			return ERR_PTR(-EINVAL);
		if ((is_link && !link) || (!is_link && !prog))
			return ERR_PTR(-EINVAL);
	} else if (!hlist_empty(progs)) {
		/* flags cannot have both BPF_F_BEFORE and BPF_F_AFTER */
		if (is_before && is_after)
			return ERR_PTR(-EINVAL);
	}

	if (is_link) {
		anchor_link = bpf_get_anchor_link(flags, id_or_fd);
		if (IS_ERR(anchor_link))
			return ERR_PTR(PTR_ERR(anchor_link));
	} else if (is_id || id_or_fd) {
		anchor_prog = bpf_get_anchor_prog(flags, id_or_fd);
		if (IS_ERR(anchor_prog))
			return ERR_PTR(PTR_ERR(anchor_prog));
	}

	if (!anchor_prog && !anchor_link) {
		/* if there is no anchor_prog/anchor_link, then BPF_F_PREORDER
		 * doesn't matter since either prepend or append to a combined
		 * list of progs will end up with correct result.
		 */
		hlist_for_each_entry(pltmp, progs, node) {
			if (is_before)
				return pltmp;
			if (pltmp->node.next)
				continue;
			return pltmp;
		}
		return NULL;
	}

	hlist_for_each_entry(pltmp, progs, node) {
		if ((anchor_prog && anchor_prog == pltmp->prog) ||
		    (anchor_link && anchor_link == &pltmp->link->link)) {
			if (!!(pltmp->flags & BPF_F_PREORDER) != preorder)
				goto out;
			pl = pltmp;
			goto out;
		}
	}

	pl = ERR_PTR(-ENOENT);
out:
	if (anchor_link)
		bpf_link_put(anchor_link);
	else
		bpf_prog_put(anchor_prog);
	return pl;
}

static int insert_pl_to_hlist(struct bpf_prog_list *pl, struct hlist_head *progs,
			      struct bpf_prog *prog, struct bpf_cgroup_link *link,
			      u32 flags, u32 id_or_fd)
{
	struct bpf_prog_list *pltmp;

	pltmp = get_prog_list(progs, prog, link, flags, id_or_fd);
	if (IS_ERR(pltmp))
		return PTR_ERR(pltmp);

	if (!pltmp)
		hlist_add_head(&pl->node, progs);
	else if (flags & BPF_F_BEFORE)
		hlist_add_before(&pl->node, &pltmp->node);
	else
		hlist_add_behind(&pl->node, &pltmp->node);

	return 0;
}

/**
 * __cgroup_bpf_attach() - Attach the program or the link to a cgroup, and
 *                         propagate the change to descendants
@@ -667,6 +777,8 @@ static struct bpf_prog_list *find_attach_entry(struct hlist_head *progs,
 * @replace_prog: Previously attached program to replace if BPF_F_REPLACE is set
 * @type: Type of attach operation
 * @flags: Option flags
 * @id_or_fd: Relative prog id or fd
 * @revision: bpf_prog_list revision
 *
 * Exactly one of @prog or @link can be non-null.
 * Must be called with cgroup_mutex held.
@@ -674,7 +786,8 @@ static struct bpf_prog_list *find_attach_entry(struct hlist_head *progs,
static int __cgroup_bpf_attach(struct cgroup *cgrp,
			       struct bpf_prog *prog, struct bpf_prog *replace_prog,
			       struct bpf_cgroup_link *link,
			       enum bpf_attach_type type, u32 flags)
			       enum bpf_attach_type type, u32 flags, u32 id_or_fd,
			       u64 revision)
{
	u32 saved_flags = (flags & (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI));
	struct bpf_prog *old_prog = NULL;
@@ -690,6 +803,9 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp,
	    ((flags & BPF_F_REPLACE) && !(flags & BPF_F_ALLOW_MULTI)))
		/* invalid combination */
		return -EINVAL;
	if ((flags & BPF_F_REPLACE) && (flags & (BPF_F_BEFORE | BPF_F_AFTER)))
		/* only either replace or insertion with before/after */
		return -EINVAL;
	if (link && (prog || replace_prog))
		/* only either link or prog/replace_prog can be specified */
		return -EINVAL;
@@ -700,6 +816,8 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp,
	atype = bpf_cgroup_atype_find(type, new_prog->aux->attach_btf_id);
	if (atype < 0)
		return -EINVAL;
	if (revision && revision != cgrp->bpf.revisions[atype])
		return -ESTALE;

	progs = &cgrp->bpf.progs[atype];

@@ -728,21 +846,17 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp,
	if (pl) {
		old_prog = pl->prog;
	} else {
		struct hlist_node *last = NULL;

		pl = kmalloc(sizeof(*pl), GFP_KERNEL);
		if (!pl) {
			bpf_cgroup_storages_free(new_storage);
			return -ENOMEM;
		}
		if (hlist_empty(progs))
			hlist_add_head(&pl->node, progs);
		else
			hlist_for_each(last, progs) {
				if (last->next)
					continue;
				hlist_add_behind(&pl->node, last);
				break;

		err = insert_pl_to_hlist(pl, progs, prog, link, flags, id_or_fd);
		if (err) {
			kfree(pl);
			bpf_cgroup_storages_free(new_storage);
			return err;
		}
	}

@@ -762,6 +876,7 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp,
	if (err)
		goto cleanup_trampoline;

	cgrp->bpf.revisions[atype] += 1;
	if (old_prog) {
		if (type == BPF_LSM_CGROUP)
			bpf_trampoline_unlink_cgroup_shim(old_prog);
@@ -793,12 +908,13 @@ static int cgroup_bpf_attach(struct cgroup *cgrp,
			     struct bpf_prog *prog, struct bpf_prog *replace_prog,
			     struct bpf_cgroup_link *link,
			     enum bpf_attach_type type,
			     u32 flags)
			     u32 flags, u32 id_or_fd, u64 revision)
{
	int ret;

	cgroup_lock();
	ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags);
	ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags,
				  id_or_fd, revision);
	cgroup_unlock();
	return ret;
}
@@ -886,6 +1002,7 @@ static int __cgroup_bpf_replace(struct cgroup *cgrp,
	if (!found)
		return -ENOENT;

	cgrp->bpf.revisions[atype] += 1;
	old_prog = xchg(&link->link.prog, new_prog);
	replace_effective_prog(cgrp, atype, link);
	bpf_prog_put(old_prog);
@@ -1011,12 +1128,14 @@ static void purge_effective_progs(struct cgroup *cgrp, struct bpf_prog *prog,
 * @prog: A program to detach or NULL
 * @link: A link to detach or NULL
 * @type: Type of detach operation
 * @revision: bpf_prog_list revision
 *
 * At most one of @prog or @link can be non-NULL.
 * Must be called with cgroup_mutex held.
 */
static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
			       struct bpf_cgroup_link *link, enum bpf_attach_type type)
			       struct bpf_cgroup_link *link, enum bpf_attach_type type,
			       u64 revision)
{
	enum cgroup_bpf_attach_type atype;
	struct bpf_prog *old_prog;
@@ -1034,6 +1153,9 @@ static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
	if (atype < 0)
		return -EINVAL;

	if (revision && revision != cgrp->bpf.revisions[atype])
		return -ESTALE;

	progs = &cgrp->bpf.progs[atype];
	flags = cgrp->bpf.flags[atype];

@@ -1059,6 +1181,7 @@ static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,

	/* now can actually delete it from this cgroup list */
	hlist_del(&pl->node);
	cgrp->bpf.revisions[atype] += 1;

	kfree(pl);
	if (hlist_empty(progs))
@@ -1074,12 +1197,12 @@ static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
}

static int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
			     enum bpf_attach_type type)
			     enum bpf_attach_type type, u64 revision)
{
	int ret;

	cgroup_lock();
	ret = __cgroup_bpf_detach(cgrp, prog, NULL, type);
	ret = __cgroup_bpf_detach(cgrp, prog, NULL, type, revision);
	cgroup_unlock();
	return ret;
}
@@ -1097,6 +1220,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
	struct bpf_prog_array *effective;
	int cnt, ret = 0, i;
	int total_cnt = 0;
	u64 revision = 0;
	u32 flags;

	if (effective_query && prog_attach_flags)
@@ -1134,6 +1258,10 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
		return -EFAULT;
	if (copy_to_user(&uattr->query.prog_cnt, &total_cnt, sizeof(total_cnt)))
		return -EFAULT;
	if (!effective_query && from_atype == to_atype)
		revision = cgrp->bpf.revisions[from_atype];
	if (copy_to_user(&uattr->query.revision, &revision, sizeof(revision)))
		return -EFAULT;
	if (attr->query.prog_cnt == 0 || !prog_ids || !total_cnt)
		/* return early if user requested only program count + flags */
		return 0;
@@ -1216,7 +1344,8 @@ int cgroup_bpf_prog_attach(const union bpf_attr *attr,
	}

	ret = cgroup_bpf_attach(cgrp, prog, replace_prog, NULL,
				attr->attach_type, attr->attach_flags);
				attr->attach_type, attr->attach_flags,
				attr->relative_fd, attr->expected_revision);

	if (replace_prog)
		bpf_prog_put(replace_prog);
@@ -1238,7 +1367,7 @@ int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype)
	if (IS_ERR(prog))
		prog = NULL;

	ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type);
	ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, attr->expected_revision);
	if (prog)
		bpf_prog_put(prog);

@@ -1267,7 +1396,7 @@ static void bpf_cgroup_link_release(struct bpf_link *link)
	}

	WARN_ON(__cgroup_bpf_detach(cg_link->cgroup, NULL, cg_link,
				    cg_link->type));
				    cg_link->type, 0));
	if (cg_link->type == BPF_LSM_CGROUP)
		bpf_trampoline_unlink_cgroup_shim(cg_link->link.prog);

@@ -1339,6 +1468,13 @@ static const struct bpf_link_ops bpf_cgroup_link_lops = {
	.fill_link_info = bpf_cgroup_link_fill_link_info,
};

#define BPF_F_LINK_ATTACH_MASK	\
	(BPF_F_ID |		\
	 BPF_F_BEFORE |		\
	 BPF_F_AFTER |		\
	 BPF_F_PREORDER |	\
	 BPF_F_LINK)

int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
{
	struct bpf_link_primer link_primer;
@@ -1346,7 +1482,7 @@ int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
	struct cgroup *cgrp;
	int err;

	if (attr->link_create.flags)
	if (attr->link_create.flags & (~BPF_F_LINK_ATTACH_MASK))
		return -EINVAL;

	cgrp = cgroup_get_from_fd(attr->link_create.target_fd);
@@ -1370,7 +1506,9 @@ int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
	}

	err = cgroup_bpf_attach(cgrp, NULL, NULL, link,
				link->type, BPF_F_ALLOW_MULTI);
				link->type, BPF_F_ALLOW_MULTI | attr->link_create.flags,
				attr->link_create.cgroup.relative_fd,
				attr->link_create.cgroup.expected_revision);
	if (err) {
		bpf_link_cleanup(&link_primer);
		goto out_put_cgroup;
+31 −15
Original line number Diff line number Diff line
@@ -4186,6 +4186,25 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
	}
}

static bool is_cgroup_prog_type(enum bpf_prog_type ptype, enum bpf_attach_type atype,
				bool check_atype)
{
	switch (ptype) {
	case BPF_PROG_TYPE_CGROUP_DEVICE:
	case BPF_PROG_TYPE_CGROUP_SKB:
	case BPF_PROG_TYPE_CGROUP_SOCK:
	case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
	case BPF_PROG_TYPE_CGROUP_SOCKOPT:
	case BPF_PROG_TYPE_CGROUP_SYSCTL:
	case BPF_PROG_TYPE_SOCK_OPS:
		return true;
	case BPF_PROG_TYPE_LSM:
		return check_atype ? atype == BPF_LSM_CGROUP : true;
	default:
		return false;
	}
}

#define BPF_PROG_ATTACH_LAST_FIELD expected_revision

#define BPF_F_ATTACH_MASK_BASE	\
@@ -4216,6 +4235,9 @@ static int bpf_prog_attach(const union bpf_attr *attr)
	if (bpf_mprog_supported(ptype)) {
		if (attr->attach_flags & ~BPF_F_ATTACH_MASK_MPROG)
			return -EINVAL;
	} else if (is_cgroup_prog_type(ptype, 0, false)) {
		if (attr->attach_flags & ~(BPF_F_ATTACH_MASK_BASE | BPF_F_ATTACH_MASK_MPROG))
			return -EINVAL;
	} else {
		if (attr->attach_flags & ~BPF_F_ATTACH_MASK_BASE)
			return -EINVAL;
@@ -4233,6 +4255,11 @@ static int bpf_prog_attach(const union bpf_attr *attr)
		return -EINVAL;
	}

	if (is_cgroup_prog_type(ptype, prog->expected_attach_type, true)) {
		ret = cgroup_bpf_prog_attach(attr, ptype, prog);
		goto out;
	}

	switch (ptype) {
	case BPF_PROG_TYPE_SK_SKB:
	case BPF_PROG_TYPE_SK_MSG:
@@ -4244,20 +4271,6 @@ static int bpf_prog_attach(const union bpf_attr *attr)
	case BPF_PROG_TYPE_FLOW_DISSECTOR:
		ret = netns_bpf_prog_attach(attr, prog);
		break;
	case BPF_PROG_TYPE_CGROUP_DEVICE:
	case BPF_PROG_TYPE_CGROUP_SKB:
	case BPF_PROG_TYPE_CGROUP_SOCK:
	case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
	case BPF_PROG_TYPE_CGROUP_SOCKOPT:
	case BPF_PROG_TYPE_CGROUP_SYSCTL:
	case BPF_PROG_TYPE_SOCK_OPS:
	case BPF_PROG_TYPE_LSM:
		if (ptype == BPF_PROG_TYPE_LSM &&
		    prog->expected_attach_type != BPF_LSM_CGROUP)
			ret = -EINVAL;
		else
			ret = cgroup_bpf_prog_attach(attr, ptype, prog);
		break;
	case BPF_PROG_TYPE_SCHED_CLS:
		if (attr->attach_type == BPF_TCX_INGRESS ||
		    attr->attach_type == BPF_TCX_EGRESS)
@@ -4268,7 +4281,7 @@ static int bpf_prog_attach(const union bpf_attr *attr)
	default:
		ret = -EINVAL;
	}

out:
	if (ret)
		bpf_prog_put(prog);
	return ret;
@@ -4296,6 +4309,9 @@ static int bpf_prog_detach(const union bpf_attr *attr)
			if (IS_ERR(prog))
				return PTR_ERR(prog);
		}
	} else if (is_cgroup_prog_type(ptype, 0, false)) {
		if (attr->attach_flags || attr->relative_fd)
			return -EINVAL;
	} else if (attr->attach_flags ||
		   attr->relative_fd ||
		   attr->expected_revision) {
+5 −0
Original line number Diff line number Diff line
@@ -2074,6 +2074,11 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
	for_each_subsys(ss, ssid)
		INIT_LIST_HEAD(&cgrp->e_csets[ssid]);

#ifdef CONFIG_CGROUP_BPF
	for (int i = 0; i < ARRAY_SIZE(cgrp->bpf.revisions); i++)
		cgrp->bpf.revisions[i] = 1;
#endif

	init_waitqueue_head(&cgrp->offline_waitq);
	INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent);
}
Loading