Commit ed82f35b authored by Jens Axboe's avatar Jens Axboe
Browse files

io_uring: allow registration of per-task restrictions



Currently io_uring supports restricting operations on a per-ring basis.
To use those, the ring must be setup in a disabled state by setting
IORING_SETUP_R_DISABLED. Then restrictions can be set for the ring, and
the ring can then be enabled.

This commit adds support for IORING_REGISTER_RESTRICTIONS with ring_fd
== -1, like the other "blind" register opcodes which work on the task
rather than a specific ring. This allows registration of the same kind
of restrictions as can been done on a specific ring, but with the task
itself. Once done, any ring created will inherit these restrictions.

If a restriction filter is registered with a task, then it's inherited
on fork for its children. Children may only further restrict operations,
not extend them.

Inheriting restrictions include both the classic
IORING_REGISTER_RESTRICTIONS based restrictions, as well as the BPF
filters that have been registered with the task via
IORING_REGISTER_BPF_FILTER.

Signed-off-by: default avatarJens Axboe <axboe@kernel.dk>
parent 9fd99788
Loading
Loading
Loading
Loading
+2 −0
Original line number Diff line number Diff line
@@ -231,6 +231,8 @@ struct io_restriction {
	DECLARE_BITMAP(register_op, IORING_REGISTER_LAST);
	DECLARE_BITMAP(sqe_op, IORING_OP_LAST);
	struct io_bpf_filters *bpf_filters;
	/* ->bpf_filters needs COW on modification */
	bool bpf_filters_cow;
	u8 sqe_flags_allowed;
	u8 sqe_flags_required;
	/* IORING_OP_* restrictions exist */
+7 −0
Original line number Diff line number Diff line
@@ -808,6 +808,13 @@ struct io_uring_restriction {
	__u32 resv2[3];
};

struct io_uring_task_restriction {
	__u16 flags;
	__u16 nr_res;
	__u32 resv[3];
	__DECLARE_FLEX_ARRAY(struct io_uring_restriction, restrictions);
};

struct io_uring_clock_register {
	__u32	clockid;
	__u32	__resv[3];
+85 −1
Original line number Diff line number Diff line
@@ -249,13 +249,77 @@ static int io_uring_check_cbpf_filter(struct sock_filter *filter,
	return 0;
}

void io_bpf_filter_clone(struct io_restriction *dst, struct io_restriction *src)
{
	if (!src->bpf_filters)
		return;

	rcu_read_lock();
	/*
	 * If the src filter is going away, just ignore it.
	 */
	if (refcount_inc_not_zero(&src->bpf_filters->refs)) {
		dst->bpf_filters = src->bpf_filters;
		dst->bpf_filters_cow = true;
	}
	rcu_read_unlock();
}

/*
 * Allocate a new struct io_bpf_filters. Used when a filter is cloned and
 * modifications need to be made.
 */
static struct io_bpf_filters *io_bpf_filter_cow(struct io_restriction *src)
{
	struct io_bpf_filters *filters;
	struct io_bpf_filter *srcf;
	int i;

	filters = io_new_bpf_filters();
	if (IS_ERR(filters))
		return filters;

	/*
	 * Iterate filters from src and assign in destination. Grabbing
	 * a reference is enough, we don't need to duplicate the memory.
	 * This is safe because filters are only ever appended to the
	 * front of the list, hence the only memory ever touched inside
	 * a filter is the refcount.
	 */
	rcu_read_lock();
	for (i = 0; i < IORING_OP_LAST; i++) {
		srcf = rcu_dereference(src->bpf_filters->filters[i]);
		if (!srcf) {
			continue;
		} else if (srcf == &dummy_filter) {
			rcu_assign_pointer(filters->filters[i], &dummy_filter);
			continue;
		}

		/*
		 * Getting a ref on the first node is enough, putting the
		 * filter and iterating nodes to free will stop on the first
		 * one that doesn't hit zero when dropping.
		 */
		if (!refcount_inc_not_zero(&srcf->refs))
			goto err;
		rcu_assign_pointer(filters->filters[i], srcf);
	}
	rcu_read_unlock();
	return filters;
err:
	rcu_read_unlock();
	__io_put_bpf_filters(filters);
	return ERR_PTR(-EBUSY);
}

#define IO_URING_BPF_FILTER_FLAGS	IO_URING_BPF_FILTER_DENY_REST

int io_register_bpf_filter(struct io_restriction *res,
			   struct io_uring_bpf __user *arg)
{
	struct io_bpf_filters *filters, *old_filters = NULL;
	struct io_bpf_filter *filter, *old_filter;
	struct io_bpf_filters *filters;
	struct io_uring_bpf reg;
	struct bpf_prog *prog;
	struct sock_fprog fprog;
@@ -297,6 +361,17 @@ int io_register_bpf_filter(struct io_restriction *res,
			ret = PTR_ERR(filters);
			goto err_prog;
		}
	} else if (res->bpf_filters_cow) {
		filters = io_bpf_filter_cow(res);
		if (IS_ERR(filters)) {
			ret = PTR_ERR(filters);
			goto err_prog;
		}
		/*
		 * Stash old filters, we'll put them once we know we'll
		 * succeed. Until then, res->bpf_filters is left untouched.
		 */
		old_filters = res->bpf_filters;
	}

	filter = kzalloc(sizeof(*filter), GFP_KERNEL_ACCOUNT);
@@ -306,6 +381,15 @@ int io_register_bpf_filter(struct io_restriction *res,
	}
	refcount_set(&filter->refs, 1);
	filter->prog = prog;

	/*
	 * Success - install the new filter set now. If we did COW, put
	 * the old filters as we're replacing them.
	 */
	if (old_filters) {
		__io_put_bpf_filters(old_filters);
		res->bpf_filters_cow = false;
	}
	res->bpf_filters = filters;

	/*
+6 −0
Original line number Diff line number Diff line
@@ -13,6 +13,8 @@ int io_register_bpf_filter(struct io_restriction *res,

void io_put_bpf_filters(struct io_restriction *res);

void io_bpf_filter_clone(struct io_restriction *dst, struct io_restriction *src);

static inline int io_uring_run_bpf_filters(struct io_bpf_filter __rcu **filters,
					   struct io_kiocb *req)
{
@@ -37,6 +39,10 @@ static inline int io_uring_run_bpf_filters(struct io_bpf_filter __rcu **filters,
static inline void io_put_bpf_filters(struct io_restriction *res)
{
}
static inline void io_bpf_filter_clone(struct io_restriction *dst,
				       struct io_restriction *src)
{
}
#endif /* CONFIG_IO_URING_BPF */

#endif
+33 −0
Original line number Diff line number Diff line
@@ -2880,6 +2880,32 @@ int io_prepare_config(struct io_ctx_config *config)
	return 0;
}

void io_restriction_clone(struct io_restriction *dst, struct io_restriction *src)
{
	memcpy(&dst->register_op, &src->register_op, sizeof(dst->register_op));
	memcpy(&dst->sqe_op, &src->sqe_op, sizeof(dst->sqe_op));
	dst->sqe_flags_allowed = src->sqe_flags_allowed;
	dst->sqe_flags_required = src->sqe_flags_required;
	dst->op_registered = src->op_registered;
	dst->reg_registered = src->reg_registered;

	io_bpf_filter_clone(dst, src);
}

static void io_ctx_restriction_clone(struct io_ring_ctx *ctx,
				     struct io_restriction *src)
{
	struct io_restriction *dst = &ctx->restrictions;

	io_restriction_clone(dst, src);
	if (dst->bpf_filters)
		WRITE_ONCE(ctx->bpf_filters, dst->bpf_filters->filters);
	if (dst->op_registered)
		ctx->op_restricted = 1;
	if (dst->reg_registered)
		ctx->reg_restricted = 1;
}

static __cold int io_uring_create(struct io_ctx_config *config)
{
	struct io_uring_params *p = &config->p;
@@ -2940,6 +2966,13 @@ static __cold int io_uring_create(struct io_ctx_config *config)
	else
		ctx->notify_method = TWA_SIGNAL;

	/*
	 * If the current task has restrictions enabled, then copy them to
	 * our newly created ring and mark it as registered.
	 */
	if (current->io_uring_restrict)
		io_ctx_restriction_clone(ctx, current->io_uring_restrict);

	/*
	 * This is just grabbed for accounting purposes. When a process exits,
	 * the mm is exited and dropped before the files, hence we need to hang
Loading