Commit 591beb0e authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge tag 'io_uring-bpf-restrictions.4-20260206' of...

Merge tag 'io_uring-bpf-restrictions.4-20260206' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux

Pull io_uring bpf filters from Jens Axboe:
 "This adds support for both cBPF filters for io_uring, as well as task
  inherited restrictions and filters.

  seccomp and io_uring don't play nicely together, as most of the
  interesting data to filter on resides somewhat out-of-band, in the
  submission queue ring.

  As a result, things like containers and systemd that apply seccomp
  filters, can't filter io_uring operations.

  That leaves them with just one choice if filtering is critical -
  filter the actual io_uring_setup(2) system call to simply disallow
  io_uring. That's rather unfortunate, and has limited us because of it.

  io_uring already has some filtering support. It requires the ring to
  be setup in a disabled state, and then a filter set can be applied.
  This filter set is completely bi-modal - an opcode is either enabled
  or it's not. Once a filter set is registered, the ring can be enabled.
  This is very restrictive, and it's not useful at all to systemd or
  containers which really want both broader and more specific control.

  This first adds support for cBPF filters for opcodes, which enables
  tighter control over what exactly a specific opcode may do. As
  examples, specific support is added for IORING_OP_OPENAT/OPENAT2,
  allowing filtering on resolve flags. And another example is added for
  IORING_OP_SOCKET, allowing filtering on domain/type/protocol. These
  are both common use cases. cBPF was chosen rather than eBPF, because
  the latter is often restricted in containers as well.

  These filters are run after the init phase of the request, which allows
  filters to even dip into data that is being passed in a struct in user
  memory, as the init side of requests makes that data stable by bringing
  it into the kernel. This allows filtering without needing to copy this
  data twice, or have filters etc know about the exact layout of the
  user data. The filters get the already copied and sanitized data
  passed.

  On top of that, support is added for per-task filters, meaning that any
  ring created by a task that has a per-task filter will get those
  filters applied when it's created. These filters are inherited across
  fork as well. Once a filter has been registered, any further added
  filters may only further restrict what operations are permitted.

  Filters cannot change the return value of an operation, they can only
  permit or deny it based on the contents"

* tag 'io_uring-bpf-restrictions.4-20260206' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux:
  io_uring: allow registration of per-task restrictions
  io_uring: add task fork hook
  io_uring/bpf_filter: add ref counts to struct io_bpf_filter
  io_uring/bpf_filter: cache lookup table in ctx->bpf_filters
  io_uring/bpf_filter: allow filtering on contents of struct open_how
  io_uring/net: allow filtering on IORING_OP_SOCKET data
  io_uring: add support for BPF filtering for opcode restrictions
parents f5d4feed ed82f35b
Loading
Loading
Loading
Loading
+13 −1
Original line number Diff line number Diff line
@@ -12,6 +12,7 @@ void __io_uring_free(struct task_struct *tsk);
void io_uring_unreg_ringfd(void);
const char *io_uring_get_opcode(u8 opcode);
bool io_is_uring_fops(struct file *file);
int __io_uring_fork(struct task_struct *tsk);

static inline void io_uring_files_cancel(void)
{
@@ -25,9 +26,16 @@ static inline void io_uring_task_cancel(void)
}
/*
 * Call __io_uring_free() for @tsk when it has either ->io_uring or
 * ->io_uring_restrict set.
 *
 * Both pointers are tested in a single condition so that a task which only
 * has ->io_uring_restrict set still reaches __io_uring_free(); nesting this
 * check under a plain ->io_uring test would skip that case.
 */
static inline void io_uring_free(struct task_struct *tsk)
{
	if (tsk->io_uring || tsk->io_uring_restrict)
		__io_uring_free(tsk);
}
/*
 * Fork hook for io_uring.
 *
 * Returns 0 right away when @tsk has no ->io_uring_restrict set; otherwise
 * hands off to __io_uring_fork() and returns its result.
 */
static inline int io_uring_fork(struct task_struct *tsk)
{
	if (!tsk->io_uring_restrict)
		return 0;

	return __io_uring_fork(tsk);
}
#else
static inline void io_uring_task_cancel(void)
{
@@ -46,6 +54,10 @@ static inline bool io_is_uring_fops(struct file *file)
{
	return false;
}
/*
 * Stub variant in the #else branch: ignores @tsk and always returns 0.
 * NOTE(review): presumably selected when io_uring support is compiled out -
 * the controlling #if is not visible here, confirm.
 */
static inline int io_uring_fork(struct task_struct *tsk)
{
	return 0;
}
#endif

#endif
+13 −0
Original line number Diff line number Diff line
@@ -219,9 +219,20 @@ struct io_rings {
	struct io_uring_cqe	cqes[] ____cacheline_aligned_in_smp;
};

struct io_bpf_filter;
/*
 * Reference-counted, spinlock-protected holder for a table of BPF filters.
 * NOTE(review): the table is presumably indexed by opcode, and ->rcu_head is
 * presumably used to defer freeing past an RCU grace period - confirm against
 * the implementation.
 */
struct io_bpf_filters {
	refcount_t refs;	/* ref for ->bpf_filters */
	spinlock_t lock;	/* protects ->bpf_filters modifications */
	/* RCU-annotated pointer to an array of filter pointers */
	struct io_bpf_filter __rcu **filters;
	struct rcu_head rcu_head;
};

struct io_restriction {
	DECLARE_BITMAP(register_op, IORING_REGISTER_LAST);
	DECLARE_BITMAP(sqe_op, IORING_OP_LAST);
	struct io_bpf_filters *bpf_filters;
	/* ->bpf_filters needs COW on modification */
	bool bpf_filters_cow;
	u8 sqe_flags_allowed;
	u8 sqe_flags_required;
	/* IORING_OP_* restrictions exist */
@@ -278,6 +289,8 @@ struct io_ring_ctx {

		struct task_struct	*submitter_task;
		struct io_rings		*rings;
		/* cache of ->restrictions.bpf_filters->filters */
		struct io_bpf_filter __rcu	**bpf_filters;
		struct percpu_ref	refs;

		clockid_t		clockid;
+1 −0
Original line number Diff line number Diff line
@@ -1186,6 +1186,7 @@ struct task_struct {

#ifdef CONFIG_IO_URING
	struct io_uring_task		*io_uring;
	struct io_restriction		*io_uring_restrict;
#endif

	/* Namespaces: */
+10 −0
Original line number Diff line number Diff line
@@ -712,6 +712,9 @@ enum io_uring_register_op {
	/* auxiliary zcrx configuration, see enum zcrx_ctrl_op */
	IORING_REGISTER_ZCRX_CTRL		= 36,

	/* register bpf filtering programs */
	IORING_REGISTER_BPF_FILTER		= 37,

	/* this goes last */
	IORING_REGISTER_LAST,

@@ -817,6 +820,13 @@ struct io_uring_restriction {
	__u32 resv2[3];
};

/*
 * Fixed 16-byte header followed by a flexible array of
 * struct io_uring_restriction entries.
 * NOTE(review): presumably the argument used to register per-task
 * restrictions, with @nr_res giving the number of entries in @restrictions -
 * confirm against the register path.
 */
struct io_uring_task_restriction {
	__u16 flags;
	__u16 nr_res;
	/* reserved; presumably must be zero - confirm */
	__u32 resv[3];
	__DECLARE_FLEX_ARRAY(struct io_uring_restriction, restrictions);
};

struct io_uring_clock_register {
	__u32	clockid;
	__u32	__resv[3];
+62 −0
Original line number Diff line number Diff line
/* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) OR MIT */
/*
 * Header file for the io_uring BPF filters.
 */
#ifndef LINUX_IO_URING_BPF_FILTER_H
#define LINUX_IO_URING_BPF_FILTER_H

#include <linux/types.h>

/*
 * Struct passed to filters.
 *
 * The fixed fields take up 16 bytes (8 + 1 + 1 + 1, plus 5 bytes of padding),
 * so the union that follows starts on an 8-byte boundary. The union overlays
 * two layouts, one for socket parameters and one for open parameters.
 */
struct io_uring_bpf_ctx {
	__u64	user_data;
	__u8	opcode;
	__u8	sqe_flags;
	__u8	pdu_size;	/* size of aux data for filter */
	__u8	pad[5];		/* pads the fixed fields out to 16 bytes */
	union {
		/* NOTE(review): presumably filled for IORING_OP_SOCKET - confirm */
		struct {
			__u32	family;
			__u32	type;
			__u32	protocol;
		} socket;
		/*
		 * NOTE(review): presumably filled for IORING_OP_OPENAT/OPENAT2,
		 * mirroring the fields of struct open_how - confirm
		 */
		struct {
			__u64	flags;
			__u64	mode;
			__u64	resolve;
		} open;
	};
};

/*
 * NOTE(review): presumably flag values for the 'flags' member of
 * struct io_uring_bpf_filter - confirm.
 */
enum {
	/*
	 * If set, any currently unset opcode will have a deny filter attached
	 */
	IO_URING_BPF_FILTER_DENY_REST	= 1,
};

/*
 * Describes one BPF filter: the opcode it applies to, plus the location and
 * length of the BPF program. The struct is 64 bytes in total (four __u32
 * fields, one __u64 pointer and five reserved __u64 words).
 */
struct io_uring_bpf_filter {
	__u32	opcode;		/* io_uring opcode to filter */
	__u32	flags;		/* presumably IO_URING_BPF_FILTER_* - confirm */
	__u32	filter_len;	/* number of BPF instructions */
	__u32	resv;		/* reserved */
	__u64	filter_ptr;	/* pointer to BPF filter */
	__u64	resv2[5];	/* reserved */
};

/*
 * NOTE(review): presumably the values accepted in the 'cmd_type' member of
 * struct io_uring_bpf - confirm.
 */
enum {
	IO_URING_BPF_CMD_FILTER	= 1,
};

/*
 * Command wrapper: an 8-byte header (type, flags, reserved word) followed by
 * a union of per-command payloads. Only the 'filter' payload exists so far.
 * NOTE(review): @cmd_type presumably selects which union member is valid -
 * confirm.
 */
struct io_uring_bpf {
	__u16	cmd_type;	/* IO_URING_BPF_* values */
	__u16	cmd_flags;	/* none so far */
	__u32	resv;		/* reserved */
	union {
		struct io_uring_bpf_filter	filter;
	};
};

#endif
Loading