Commit 4f72ed49 authored by Linus Torvalds
Browse files

Merge tag 'io_uring-6.9-20240405' of git://git.kernel.dk/linux

Pull io_uring fixes from Jens Axboe:

 - Backport of some fixes that came up during development of the 6.10
   io_uring patches. This includes some kbuf cleanups and reference
   fixes.

 - Disable multishot read if we don't have NOWAIT support on the target

 - Fix for a dependency issue with workqueue flushing

* tag 'io_uring-6.9-20240405' of git://git.kernel.dk/linux:
  io_uring/kbuf: hold io_buffer_list reference over mmap
  io_uring/kbuf: protect io_buffer_list teardown with a reference
  io_uring/kbuf: get rid of bl->is_ready
  io_uring/kbuf: get rid of lower BGID lists
  io_uring: use private workqueue for exit work
  io_uring: disable io-wq execution of multishot NOWAIT requests
  io_uring/rw: don't allow multishot reads without NOWAIT support
parents 4de2ff26 561e4f94
Loading
Loading
Loading
Loading
+0 −1
Original line number Diff line number Diff line
@@ -294,7 +294,6 @@ struct io_ring_ctx {

		struct io_submit_state	submit_state;

		struct io_buffer_list	*io_bl;
		struct xarray		io_bl_xa;

		struct io_hash_table	cancel_table_locked;
+19 −12
Original line number Diff line number Diff line
@@ -147,6 +147,7 @@ static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
static void io_queue_sqe(struct io_kiocb *req);

struct kmem_cache *req_cachep;
/*
 * Workqueue that ctx->exit_work is queued on from
 * io_ring_ctx_wait_and_kill(). It is set up in io_uring_init() via
 * alloc_workqueue("iou_exit", WQ_UNBOUND, 64).
 */
static struct workqueue_struct *iou_wq __ro_after_init;

static int __read_mostly sysctl_io_uring_disabled;
static int __read_mostly sysctl_io_uring_group = -1;
@@ -350,7 +351,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
err:
	kfree(ctx->cancel_table.hbs);
	kfree(ctx->cancel_table_locked.hbs);
	kfree(ctx->io_bl);
	xa_destroy(&ctx->io_bl_xa);
	kfree(ctx);
	return NULL;
@@ -1982,10 +1982,15 @@ void io_wq_submit_work(struct io_wq_work *work)
		err = -EBADFD;
		if (!io_file_can_poll(req))
			goto fail;
		if (req->file->f_flags & O_NONBLOCK ||
		    req->file->f_mode & FMODE_NOWAIT) {
			err = -ECANCELED;
			if (io_arm_poll_handler(req, issue_flags) != IO_APOLL_OK)
				goto fail;
			return;
		} else {
			req->flags &= ~REQ_F_APOLL_MULTISHOT;
		}
	}

	if (req->flags & REQ_F_FORCE_ASYNC) {
@@ -2926,7 +2931,6 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
	io_napi_free(ctx);
	kfree(ctx->cancel_table.hbs);
	kfree(ctx->cancel_table_locked.hbs);
	kfree(ctx->io_bl);
	xa_destroy(&ctx->io_bl_xa);
	kfree(ctx);
}
@@ -3161,7 +3165,7 @@ static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
	 * noise and overhead, there's no discernable change in runtime
	 * over using system_wq.
	 */
	queue_work(system_unbound_wq, &ctx->exit_work);
	queue_work(iou_wq, &ctx->exit_work);
}

static int io_uring_release(struct inode *inode, struct file *file)
@@ -3443,14 +3447,15 @@ static void *io_uring_validate_mmap_request(struct file *file,
		ptr = ctx->sq_sqes;
		break;
	case IORING_OFF_PBUF_RING: {
		struct io_buffer_list *bl;
		unsigned int bgid;

		bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
		rcu_read_lock();
		ptr = io_pbuf_get_address(ctx, bgid);
		rcu_read_unlock();
		if (!ptr)
			return ERR_PTR(-EINVAL);
		bl = io_pbuf_get_bl(ctx, bgid);
		if (IS_ERR(bl))
			return bl;
		ptr = bl->buf_ring;
		io_put_bl(ctx, bl);
		break;
		}
	default:
@@ -4185,6 +4190,8 @@ static int __init io_uring_init(void)
	io_buf_cachep = KMEM_CACHE(io_buffer,
					  SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT);

	iou_wq = alloc_workqueue("iou_exit", WQ_UNBOUND, 64);

#ifdef CONFIG_SYSCTL
	register_sysctl_init("kernel", kernel_io_uring_disabled_table);
#endif
+41 −77
Original line number Diff line number Diff line
@@ -17,8 +17,6 @@

#define IO_BUFFER_LIST_BUF_PER_PAGE (PAGE_SIZE / sizeof(struct io_uring_buf))

#define BGID_ARRAY	64

/* BIDs are addressed by a 16-bit field in a CQE */
#define MAX_BIDS_PER_BGID (1 << 16)

@@ -40,13 +38,9 @@ struct io_buf_free {
	int				inuse;
};

/*
 * Look up the io_buffer_list registered for @bgid.
 *
 * NOTE(review): this span looks like a diff rendered without +/- markers,
 * so two versions of the signature and body appear interleaved - confirm
 * against the real tree. One version takes an extra @bl array and returns
 * &bl[bgid] when @bl is set and @bgid < BGID_ARRAY; both versions end in
 * the xarray lookup xa_load(&ctx->io_bl_xa, bgid).
 */
static struct io_buffer_list *__io_buffer_get_list(struct io_ring_ctx *ctx,
						   struct io_buffer_list *bl,
static inline struct io_buffer_list *__io_buffer_get_list(struct io_ring_ctx *ctx,
							  unsigned int bgid)
{
	if (bl && bgid < BGID_ARRAY)
		return &bl[bgid];

	return xa_load(&ctx->io_bl_xa, bgid);
}

@@ -55,7 +49,7 @@ static inline struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx,
{
	lockdep_assert_held(&ctx->uring_lock);

	return __io_buffer_get_list(ctx, ctx->io_bl, bgid);
	return __io_buffer_get_list(ctx, bgid);
}

static int io_buffer_add_list(struct io_ring_ctx *ctx,
@@ -67,11 +61,7 @@ static int io_buffer_add_list(struct io_ring_ctx *ctx,
	 * always under the ->uring_lock, but the RCU lookup from mmap does.
	 */
	bl->bgid = bgid;
	smp_store_release(&bl->is_ready, 1);

	if (bgid < BGID_ARRAY)
		return 0;

	atomic_set(&bl->refs, 1);
	return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL));
}

@@ -208,24 +198,6 @@ void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
	return ret;
}

/*
 * Allocate the array of BGID_ARRAY buffer lists and publish it in
 * ctx->io_bl.
 *
 * Each entry gets its ->buf_list initialised and ->bgid set to its array
 * index before the array pointer is stored with smp_store_release().
 *
 * NOTE(review): this function appears to be on the removed side of the
 * diff (the commit list mentions "get rid of lower BGID lists") - confirm.
 *
 * Returns 0 on success, or -ENOMEM if kcalloc() fails.
 */
static __cold int io_init_bl_list(struct io_ring_ctx *ctx)
{
	struct io_buffer_list *bl;
	int i;

	/* kcalloc() hands back zeroed memory for all BGID_ARRAY entries */
	bl = kcalloc(BGID_ARRAY, sizeof(struct io_buffer_list), GFP_KERNEL);
	if (!bl)
		return -ENOMEM;

	for (i = 0; i < BGID_ARRAY; i++) {
		INIT_LIST_HEAD(&bl[i].buf_list);
		bl[i].bgid = i;
	}

	/* publish only after every entry has been initialised */
	smp_store_release(&ctx->io_bl, bl);
	return 0;
}

/*
 * Mark the given mapped range as free for reuse
 */
@@ -294,24 +266,24 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx,
	return i;
}

/*
 * Drop one reference on @bl.
 *
 * When the reference count reaches zero, the list's buffers are removed
 * with __io_remove_buffers(ctx, bl, -1U) (presumably "remove all" - confirm
 * against __io_remove_buffers()) and @bl itself is freed with kfree_rcu().
 * If other references remain, nothing else happens.
 */
void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl)
{
	if (atomic_dec_and_test(&bl->refs)) {
		__io_remove_buffers(ctx, bl, -1U);
		/* freed via RCU; io_pbuf_get_bl() looks lists up under rcu_read_lock() */
		kfree_rcu(bl, rcu);
	}
}

void io_destroy_buffers(struct io_ring_ctx *ctx)
{
	struct io_buffer_list *bl;
	struct list_head *item, *tmp;
	struct io_buffer *buf;
	unsigned long index;
	int i;

	for (i = 0; i < BGID_ARRAY; i++) {
		if (!ctx->io_bl)
			break;
		__io_remove_buffers(ctx, &ctx->io_bl[i], -1U);
	}

	xa_for_each(&ctx->io_bl_xa, index, bl) {
		xa_erase(&ctx->io_bl_xa, bl->bgid);
		__io_remove_buffers(ctx, bl, -1U);
		kfree_rcu(bl, rcu);
		io_put_bl(ctx, bl);
	}

	/*
@@ -489,12 +461,6 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)

	io_ring_submit_lock(ctx, issue_flags);

	if (unlikely(p->bgid < BGID_ARRAY && !ctx->io_bl)) {
		ret = io_init_bl_list(ctx);
		if (ret)
			goto err;
	}

	bl = io_buffer_get_list(ctx, p->bgid);
	if (unlikely(!bl)) {
		bl = kzalloc(sizeof(*bl), GFP_KERNEL_ACCOUNT);
@@ -507,14 +473,9 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
		if (ret) {
			/*
			 * Doesn't need rcu free as it was never visible, but
			 * let's keep it consistent throughout. Also can't
			 * be a lower indexed array group, as adding one
			 * where lookup failed cannot happen.
			 * let's keep it consistent throughout.
			 */
			if (p->bgid >= BGID_ARRAY)
			kfree_rcu(bl, rcu);
			else
				WARN_ON_ONCE(1);
			goto err;
		}
	}
@@ -679,12 +640,6 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
	if (reg.ring_entries >= 65536)
		return -EINVAL;

	if (unlikely(reg.bgid < BGID_ARRAY && !ctx->io_bl)) {
		int ret = io_init_bl_list(ctx);
		if (ret)
			return ret;
	}

	bl = io_buffer_get_list(ctx, reg.bgid);
	if (bl) {
		/* if mapped buffer ring OR classic exists, don't allow */
@@ -733,11 +688,8 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
	if (!bl->is_buf_ring)
		return -EINVAL;

	__io_remove_buffers(ctx, bl, -1U);
	if (bl->bgid >= BGID_ARRAY) {
	xa_erase(&ctx->io_bl_xa, bl->bgid);
		kfree_rcu(bl, rcu);
	}
	io_put_bl(ctx, bl);
	return 0;
}

@@ -767,23 +719,35 @@ int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg)
	return 0;
}

/*
 * Look up the buffer list for @bgid on behalf of mmap.
 *
 * NOTE(review): this span looks like a diff rendered without +/- markers,
 * so the bodies of io_pbuf_get_address() and io_pbuf_get_bl() appear
 * interleaved - confirm against the real tree.
 *
 * As far as the visible lines show, io_pbuf_get_bl() loads the list from
 * ctx->io_bl_xa under rcu_read_lock() and, if it exists and has ->is_mmap
 * set, tries to take a reference with atomic_inc_not_zero(). On success the
 * list is returned and the caller drops the reference with io_put_bl();
 * otherwise ERR_PTR(-EINVAL) is returned. The io_pbuf_get_address() lines
 * return bl->buf_ring, or NULL when the list is missing, not ->is_mmap, or
 * not yet ->is_ready.
 */
void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid)
struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx,
				      unsigned long bgid)
{
	struct io_buffer_list *bl;
	bool ret;

	bl = __io_buffer_get_list(ctx, smp_load_acquire(&ctx->io_bl), bgid);

	if (!bl || !bl->is_mmap)
		return NULL;
	/*
	 * Ensure the list is fully setup. Only strictly needed for RCU lookup
	 * via mmap, and in that case only for the array indexed groups. For
	 * the xarray lookups, it's either visible and ready, or not at all.
	 * We have to be a bit careful here - we're inside mmap and cannot grab
	 * the uring_lock. This means the buffer_list could be simultaneously
	 * going away, if someone is trying to be sneaky. Look it up under rcu
	 * so we know it's not going away, and attempt to grab a reference to
	 * it. If the ref is already zero, then fail the mapping. If successful,
	 * the caller will call io_put_bl() to drop the reference at the
	 * end. This may then safely free the buffer_list (and drop the pages)
	 * at that point, vm_insert_pages() would've already grabbed the
	 * necessary vma references.
	 */
	if (!smp_load_acquire(&bl->is_ready))
		return NULL;
	rcu_read_lock();
	bl = xa_load(&ctx->io_bl_xa, bgid);
	/* must be a mmap'able buffer ring and have pages */
	ret = false;
	if (bl && bl->is_mmap)
		ret = atomic_inc_not_zero(&bl->refs);
	rcu_read_unlock();

	if (ret)
		return bl;

	return bl->buf_ring;
	return ERR_PTR(-EINVAL);
}

/*
+5 −3
Original line number Diff line number Diff line
@@ -25,12 +25,12 @@ struct io_buffer_list {
	__u16 head;
	__u16 mask;

	atomic_t refs;

	/* ring mapped provided buffers */
	__u8 is_buf_ring;
	/* ring mapped provided buffers, but mmap'ed by application */
	__u8 is_mmap;
	/* bl is visible from an RCU point of view for lookup */
	__u8 is_ready;
};

struct io_buffer {
@@ -61,7 +61,9 @@ void __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags);

bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags);

void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid);
void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl);
struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx,
				      unsigned long bgid);

static inline bool io_kbuf_recycle_ring(struct io_kiocb *req)
{
+8 −1
Original line number Diff line number Diff line
@@ -936,6 +936,13 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags)

	ret = __io_read(req, issue_flags);

	/*
	 * If the file doesn't support proper NOWAIT, then disable multishot
	 * and stay in single shot mode.
	 */
	if (!io_file_supports_nowait(req))
		req->flags &= ~REQ_F_APOLL_MULTISHOT;

	/*
	 * If we get -EAGAIN, recycle our buffer and just let normal poll
	 * handling arm it.
@@ -955,7 +962,7 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags)
	/*
	 * Any successful return value will keep the multishot read armed.
	 */
	if (ret > 0) {
	if (ret > 0 && req->flags & REQ_F_APOLL_MULTISHOT) {
		/*
		 * Put our buffer and post a CQE. If we fail to post a CQE, then
		 * jump to the termination path. This request is then done.