Commit c9a925b7 authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge tag 'io_uring-6.7-2023-11-30' of git://git.kernel.dk/linux

Pull io_uring fixes from Jens Axboe:

 - Fix an issue with discontig page checking for IORING_SETUP_NO_MMAP

 - Fix an issue where setting IORING_SETUP_NO_MMAP also disallowed
   mmap'ed buffer rings

 - Fix an issue with deferred release of memory mapped pages

 - Fix a lockdep issue with IORING_SETUP_NO_MMAP

 - Use fget/fput consistently, even from our sync system calls. No real
   issue here, but if we were ever to allow closing io_uring descriptors
   it would be required. Let's play it safe and just use the full ref
   counted versions upfront. Most uses of io_uring are threaded anyway,
   and hence already doing the full version underneath.

* tag 'io_uring-6.7-2023-11-30' of git://git.kernel.dk/linux:
  io_uring: use fget/fput consistently
  io_uring: free io_buffer_list entries via RCU
  io_uring/kbuf: prune deferred locked cache when tearing down
  io_uring/kbuf: recycle freed mapped buffer ring entries
  io_uring/kbuf: defer release of mapped buffer rings
  io_uring: enable io_mem_alloc/free to be used in other parts
  io_uring: don't guard IORING_OFF_PBUF_RING with SETUP_NO_MMAP
  io_uring: don't allow discontig pages for IORING_SETUP_NO_MMAP
parents ee0c8a9b 73363c26
Loading
Loading
Loading
Loading
+3 −0
Original line number Diff line number Diff line
@@ -340,6 +340,9 @@ struct io_ring_ctx {

	struct list_head	io_buffers_cache;

	/* deferred free list, protected by ->uring_lock */
	struct hlist_head	io_buf_list;

	/* Keep this last, we don't need it for the fast path */
	struct wait_queue_head		poll_wq;
	struct io_restriction		restrictions;
+6 −5
Original line number Diff line number Diff line
@@ -273,7 +273,7 @@ int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg)
	};
	ktime_t timeout = KTIME_MAX;
	struct io_uring_sync_cancel_reg sc;
	struct fd f = { };
	struct file *file = NULL;
	DEFINE_WAIT(wait);
	int ret, i;

@@ -295,10 +295,10 @@ int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg)
	/* we can grab a normal file descriptor upfront */
	if ((cd.flags & IORING_ASYNC_CANCEL_FD) &&
	   !(cd.flags & IORING_ASYNC_CANCEL_FD_FIXED)) {
		f = fdget(sc.fd);
		if (!f.file)
		file = fget(sc.fd);
		if (!file)
			return -EBADF;
		cd.file = f.file;
		cd.file = file;
	}

	ret = __io_sync_cancel(current->io_uring, &cd, sc.fd);
@@ -348,6 +348,7 @@ int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg)
	if (ret == -ENOENT || ret > 0)
		ret = 0;
out:
	fdput(f);
	if (file)
		fput(file);
	return ret;
}
+51 −44
Original line number Diff line number Diff line
@@ -325,6 +325,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
	INIT_LIST_HEAD(&ctx->sqd_list);
	INIT_LIST_HEAD(&ctx->cq_overflow_list);
	INIT_LIST_HEAD(&ctx->io_buffers_cache);
	INIT_HLIST_HEAD(&ctx->io_buf_list);
	io_alloc_cache_init(&ctx->rsrc_node_cache, IO_NODE_ALLOC_CACHE_MAX,
			    sizeof(struct io_rsrc_node));
	io_alloc_cache_init(&ctx->apoll_cache, IO_ALLOC_CACHE_MAX,
@@ -2666,7 +2667,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
	return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
}

static void io_mem_free(void *ptr)
void io_mem_free(void *ptr)
{
	if (!ptr)
		return;
@@ -2697,6 +2698,7 @@ static void *__io_uaddr_map(struct page ***pages, unsigned short *npages,
{
	struct page **page_array;
	unsigned int nr_pages;
	void *page_addr;
	int ret, i;

	*npages = 0;
@@ -2718,27 +2720,29 @@ static void *__io_uaddr_map(struct page ***pages, unsigned short *npages,
		io_pages_free(&page_array, ret > 0 ? ret : 0);
		return ret < 0 ? ERR_PTR(ret) : ERR_PTR(-EFAULT);
	}

	page_addr = page_address(page_array[0]);
	for (i = 0; i < nr_pages; i++) {
		ret = -EINVAL;

		/*
	 * Should be a single page. If the ring is small enough that we can
	 * use a normal page, that is fine. If we need multiple pages, then
	 * userspace should use a huge page. That's the only way to guarantee
	 * that we get contiguous memory, outside of just being lucky or
	 * (currently) having low memory fragmentation.
		 * Can't support mapping user allocated ring memory on 32-bit
		 * archs where it could potentially reside in highmem. Just
		 * fail those with -EINVAL, just like we did on kernels that
		 * didn't support this feature.
		 */
	if (page_array[0] != page_array[ret - 1])
		if (PageHighMem(page_array[i]))
			goto err;

		/*
	 * Can't support mapping user allocated ring memory on 32-bit archs
	 * where it could potentially reside in highmem. Just fail those with
	 * -EINVAL, just like we did on kernels that didn't support this
	 * feature.
		 * No support for discontig pages for now, should either be a
		 * single normal page, or a huge page. Later on we can add
		 * support for remapping discontig pages, for now we will
		 * just fail them with EINVAL.
		 */
	for (i = 0; i < nr_pages; i++) {
		if (PageHighMem(page_array[i])) {
			ret = -EINVAL;
		if (page_address(page_array[i]) != page_addr)
			goto err;
		}
		page_addr += PAGE_SIZE;
	}

	*pages = page_array;
@@ -2775,7 +2779,7 @@ static void io_rings_free(struct io_ring_ctx *ctx)
	}
}

static void *io_mem_alloc(size_t size)
void *io_mem_alloc(size_t size)
{
	gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
	void *ret;
@@ -2947,6 +2951,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
		ctx->mm_account = NULL;
	}
	io_rings_free(ctx);
	io_kbuf_mmap_list_free(ctx);

	percpu_ref_exit(&ctx->refs);
	free_uid(ctx->user);
@@ -3475,25 +3480,27 @@ static void *io_uring_validate_mmap_request(struct file *file,
	struct page *page;
	void *ptr;

	/* Don't allow mmap if the ring was setup without it */
	if (ctx->flags & IORING_SETUP_NO_MMAP)
		return ERR_PTR(-EINVAL);

	switch (offset & IORING_OFF_MMAP_MASK) {
	case IORING_OFF_SQ_RING:
	case IORING_OFF_CQ_RING:
		/* Don't allow mmap if the ring was setup without it */
		if (ctx->flags & IORING_SETUP_NO_MMAP)
			return ERR_PTR(-EINVAL);
		ptr = ctx->rings;
		break;
	case IORING_OFF_SQES:
		/* Don't allow mmap if the ring was setup without it */
		if (ctx->flags & IORING_SETUP_NO_MMAP)
			return ERR_PTR(-EINVAL);
		ptr = ctx->sq_sqes;
		break;
	case IORING_OFF_PBUF_RING: {
		unsigned int bgid;

		bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
		mutex_lock(&ctx->uring_lock);
		rcu_read_lock();
		ptr = io_pbuf_get_address(ctx, bgid);
		mutex_unlock(&ctx->uring_lock);
		rcu_read_unlock();
		if (!ptr)
			return ERR_PTR(-EINVAL);
		break;
@@ -3645,7 +3652,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
		size_t, argsz)
{
	struct io_ring_ctx *ctx;
	struct fd f;
	struct file *file;
	long ret;

	if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
@@ -3663,20 +3670,19 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
		if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
			return -EINVAL;
		fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
		f.file = tctx->registered_rings[fd];
		f.flags = 0;
		if (unlikely(!f.file))
		file = tctx->registered_rings[fd];
		if (unlikely(!file))
			return -EBADF;
	} else {
		f = fdget(fd);
		if (unlikely(!f.file))
		file = fget(fd);
		if (unlikely(!file))
			return -EBADF;
		ret = -EOPNOTSUPP;
		if (unlikely(!io_is_uring_fops(f.file)))
		if (unlikely(!io_is_uring_fops(file)))
			goto out;
	}

	ctx = f.file->private_data;
	ctx = file->private_data;
	ret = -EBADFD;
	if (unlikely(ctx->flags & IORING_SETUP_R_DISABLED))
		goto out;
@@ -3770,7 +3776,8 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
		}
	}
out:
	fdput(f);
	if (!(flags & IORING_ENTER_REGISTERED_RING))
		fput(file);
	return ret;
}

@@ -4611,7 +4618,7 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
{
	struct io_ring_ctx *ctx;
	long ret = -EBADF;
	struct fd f;
	struct file *file;
	bool use_registered_ring;

	use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
@@ -4630,27 +4637,27 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
		if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
			return -EINVAL;
		fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
		f.file = tctx->registered_rings[fd];
		f.flags = 0;
		if (unlikely(!f.file))
		file = tctx->registered_rings[fd];
		if (unlikely(!file))
			return -EBADF;
	} else {
		f = fdget(fd);
		if (unlikely(!f.file))
		file = fget(fd);
		if (unlikely(!file))
			return -EBADF;
		ret = -EOPNOTSUPP;
		if (!io_is_uring_fops(f.file))
		if (!io_is_uring_fops(file))
			goto out_fput;
	}

	ctx = f.file->private_data;
	ctx = file->private_data;

	mutex_lock(&ctx->uring_lock);
	ret = __io_uring_register(ctx, opcode, arg, nr_args);
	mutex_unlock(&ctx->uring_lock);
	trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret);
out_fput:
	fdput(f);
	if (!use_registered_ring)
		fput(file);
	return ret;
}

+3 −0
Original line number Diff line number Diff line
@@ -86,6 +86,9 @@ bool __io_alloc_req_refill(struct io_ring_ctx *ctx);
bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task,
			bool cancel_all);

/*
 * Ring memory helpers, declared here so that other io_uring source files
 * (such as the provided buffer ring code) can call them. io_mem_free()
 * returns immediately when passed a NULL pointer.
 */
void *io_mem_alloc(size_t size);
void io_mem_free(void *ptr);

#if defined(CONFIG_PROVE_LOCKING)
static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx)
{
+156 −21
Original line number Diff line number Diff line
@@ -33,19 +33,42 @@ struct io_provide_buf {
	__u16				bid;
};

static inline struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx,
/*
 * Tracks one memory region backing an mmap'ed provided buffer ring. Entries
 * are linked on ctx->io_buf_list and remain there, whether in use or not,
 * until io_kbuf_mmap_list_free() releases them.
 */
struct io_buf_free {
	/* link in ctx->io_buf_list */
	struct hlist_node		list;
	/* region obtained from io_mem_alloc() */
	void				*mem;
	/* ring size in bytes that ->mem was allocated for */
	size_t				size;
	/* 1 while a buffer list uses ->mem, 0 once it may be reused */
	int				inuse;
};

/*
 * Look up the buffer list for @bgid. Group IDs below BGID_ARRAY index
 * straight into @bl, the array pointer the caller has already loaded (read
 * under ->uring_lock, or with acquire semantics for the lockless mmap
 * lookup). All other group IDs are looked up in the ->io_bl_xa xarray.
 *
 * Returns the matching list, or whatever xa_load() yields for an xarray
 * lookup.
 */
static struct io_buffer_list *__io_buffer_get_list(struct io_ring_ctx *ctx,
						   struct io_buffer_list *bl,
						   unsigned int bgid)
{
	/* use the caller's snapshot only; never re-read ctx->io_bl here */
	if (bl && bgid < BGID_ARRAY)
		return &bl[bgid];

	return xa_load(&ctx->io_bl_xa, bgid);
}

/*
 * Locked lookup of the buffer list for @bgid. The caller must hold
 * ->uring_lock, which lockdep verifies here.
 */
static inline struct io_buffer_list *
io_buffer_get_list(struct io_ring_ctx *ctx, unsigned int bgid)
{
	struct io_buffer_list *bl_array;

	lockdep_assert_held(&ctx->uring_lock);

	bl_array = ctx->io_bl;
	return __io_buffer_get_list(ctx, bl_array, bgid);
}

static int io_buffer_add_list(struct io_ring_ctx *ctx,
			      struct io_buffer_list *bl, unsigned int bgid)
{
	/*
	 * Store buffer group ID and finally mark the list as visible.
	 * The normal lookup doesn't care about the visibility as we're
	 * always under the ->uring_lock, but the RCU lookup from mmap does.
	 */
	bl->bgid = bgid;
	smp_store_release(&bl->is_ready, 1);

	if (bgid < BGID_ARRAY)
		return 0;

@@ -196,21 +219,40 @@ void __user *io_buffer_select(struct io_kiocb *req, size_t *len,

/*
 * Allocate and initialise the array holding the low-numbered buffer groups
 * (bgid < BGID_ARRAY). The array is fully set up through a local pointer and
 * only then published to ctx->io_bl with a release store, so a lockless
 * reader that loads ctx->io_bl with acquire semantics never observes a
 * partially initialised array.
 *
 * Returns 0 on success, -ENOMEM if the allocation fails.
 */
static __cold int io_init_bl_list(struct io_ring_ctx *ctx)
{
	struct io_buffer_list *bl;
	int i;

	bl = kcalloc(BGID_ARRAY, sizeof(struct io_buffer_list), GFP_KERNEL);
	if (!bl)
		return -ENOMEM;

	for (i = 0; i < BGID_ARRAY; i++) {
		INIT_LIST_HEAD(&bl[i].buf_list);
		bl[i].bgid = i;
	}

	/* publish only after every entry has been initialised */
	smp_store_release(&ctx->io_bl, bl);
	return 0;
}

/*
 * Flag the deferred-free entry whose memory backs @bl's ring as no longer
 * in use, so that the region becomes available for reuse.
 */
static void io_kbuf_mark_free(struct io_ring_ctx *ctx, struct io_buffer_list *bl)
{
	struct io_buf_free *entry;

	hlist_for_each_entry(entry, &ctx->io_buf_list, list) {
		if (entry->mem != bl->buf_ring)
			continue;

		entry->inuse = 0;
		return;
	}

	/* no entry matched the ring memory, which is not expected */
	WARN_ON_ONCE(1);
}

static int __io_remove_buffers(struct io_ring_ctx *ctx,
			       struct io_buffer_list *bl, unsigned nbufs)
{
@@ -223,7 +265,11 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx,
	if (bl->is_mapped) {
		i = bl->buf_ring->tail - bl->head;
		if (bl->is_mmap) {
			folio_put(virt_to_folio(bl->buf_ring));
			/*
			 * io_kbuf_list_free() will free the page(s) at
			 * ->release() time.
			 */
			io_kbuf_mark_free(ctx, bl);
			bl->buf_ring = NULL;
			bl->is_mmap = 0;
		} else if (bl->buf_nr_pages) {
@@ -274,9 +320,17 @@ void io_destroy_buffers(struct io_ring_ctx *ctx)
	xa_for_each(&ctx->io_bl_xa, index, bl) {
		xa_erase(&ctx->io_bl_xa, bl->bgid);
		__io_remove_buffers(ctx, bl, -1U);
		kfree(bl);
		kfree_rcu(bl, rcu);
	}

	/*
	 * Move deferred locked entries to cache before pruning
	 */
	spin_lock(&ctx->completion_lock);
	if (!list_empty(&ctx->io_buffers_comp))
		list_splice_init(&ctx->io_buffers_comp, &ctx->io_buffers_cache);
	spin_unlock(&ctx->completion_lock);

	list_for_each_safe(item, tmp, &ctx->io_buffers_cache) {
		buf = list_entry(item, struct io_buffer, list);
		kmem_cache_free(io_buf_cachep, buf);
@@ -460,7 +514,16 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
		INIT_LIST_HEAD(&bl->buf_list);
		ret = io_buffer_add_list(ctx, bl, p->bgid);
		if (ret) {
			kfree(bl);
			/*
			 * Doesn't need rcu free as it was never visible, but
			 * let's keep it consistent throughout. Also can't
			 * be a lower indexed array group, as adding one
			 * where lookup failed cannot happen.
			 */
			if (p->bgid >= BGID_ARRAY)
				kfree_rcu(bl, rcu);
			else
				WARN_ON_ONCE(1);
			goto err;
		}
	}
@@ -531,19 +594,63 @@ static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg,
	return -EINVAL;
}

static int io_alloc_pbuf_ring(struct io_uring_buf_reg *reg,
/*
 * Search the deferred free list for an unused region large enough to hold
 * @ring_size bytes, preferring the one with the least excess space. The
 * chosen entry is not unlinked: reused or not, it still has to be freed
 * when the ring is released. Returns NULL when nothing suitable exists.
 */
static struct io_buf_free *io_lookup_buf_free_entry(struct io_ring_ctx *ctx,
						    size_t ring_size)
{
	struct io_buf_free *cur, *pick = NULL;
	size_t pick_slack;

	hlist_for_each_entry(cur, &ctx->io_buf_list, list) {
		size_t slack;

		if (cur->inuse)
			continue;
		if (cur->size < ring_size)
			continue;

		slack = cur->size - ring_size;
		/* pick_slack is only consulted once a candidate exists */
		if (pick && slack >= pick_slack)
			continue;

		pick = cur;
		/* exact fit, no point in looking further */
		if (slack == 0)
			break;
		pick_slack = slack;
	}

	return pick;
}

static int io_alloc_pbuf_ring(struct io_ring_ctx *ctx,
			      struct io_uring_buf_reg *reg,
			      struct io_buffer_list *bl)
{
	gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
	struct io_buf_free *ibf;
	size_t ring_size;
	void *ptr;

	ring_size = reg->ring_entries * sizeof(struct io_uring_buf_ring);
	ptr = (void *) __get_free_pages(gfp, get_order(ring_size));

	/* Reuse existing entry, if we can */
	ibf = io_lookup_buf_free_entry(ctx, ring_size);
	if (!ibf) {
		ptr = io_mem_alloc(ring_size);
		if (!ptr)
			return -ENOMEM;

	bl->buf_ring = ptr;
		/* Allocate and store deferred free entry */
		ibf = kmalloc(sizeof(*ibf), GFP_KERNEL_ACCOUNT);
		if (!ibf) {
			io_mem_free(ptr);
			return -ENOMEM;
		}
		ibf->mem = ptr;
		ibf->size = ring_size;
		hlist_add_head(&ibf->list, &ctx->io_buf_list);
	}
	ibf->inuse = 1;
	bl->buf_ring = ibf->mem;
	bl->is_mapped = 1;
	bl->is_mmap = 1;
	return 0;
@@ -555,6 +662,8 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
	struct io_buffer_list *bl, *free_bl = NULL;
	int ret;

	lockdep_assert_held(&ctx->uring_lock);

	if (copy_from_user(&reg, arg, sizeof(reg)))
		return -EFAULT;

@@ -599,7 +708,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
	if (!(reg.flags & IOU_PBUF_RING_MMAP))
		ret = io_pin_pbuf_ring(&reg, bl);
	else
		ret = io_alloc_pbuf_ring(&reg, bl);
		ret = io_alloc_pbuf_ring(ctx, &reg, bl);

	if (!ret) {
		bl->nr_entries = reg.ring_entries;
@@ -609,7 +718,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
		return 0;
	}

	kfree(free_bl);
	kfree_rcu(free_bl, rcu);
	return ret;
}

@@ -618,6 +727,8 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
	struct io_uring_buf_reg reg;
	struct io_buffer_list *bl;

	lockdep_assert_held(&ctx->uring_lock);

	if (copy_from_user(&reg, arg, sizeof(reg)))
		return -EFAULT;
	if (reg.resv[0] || reg.resv[1] || reg.resv[2])
@@ -634,7 +745,7 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
	__io_remove_buffers(ctx, bl, -1U);
	if (bl->bgid >= BGID_ARRAY) {
		xa_erase(&ctx->io_bl_xa, bl->bgid);
		kfree(bl);
		kfree_rcu(bl, rcu);
	}
	return 0;
}
@@ -643,9 +754,33 @@ void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid)
{
	struct io_buffer_list *bl;

	/*
	 * Lockless lookup: the mmap path calls this under rcu_read_lock()
	 * rather than ->uring_lock, so load ctx->io_bl with acquire
	 * semantics to pair with the release store that publishes it.
	 */
	bl = __io_buffer_get_list(ctx, smp_load_acquire(&ctx->io_bl), bgid);

	/*
	 * The lookup can fail, so this check must come before anything
	 * dereferences bl. Lists that are not kernel-allocated mmap rings
	 * have no address to hand out either.
	 */
	if (!bl || !bl->is_mmap)
		return NULL;

	/*
	 * Ensure the list is fully setup. Only strictly needed for RCU lookup
	 * via mmap, and in that case only for the array indexed groups. For
	 * the xarray lookups, it's either visible and ready, or not at all.
	 */
	if (!smp_load_acquire(&bl->is_ready))
		return NULL;

	return bl->buf_ring;
}

/*
 * Tear down every entry on the deferred free list: unlink it, release the
 * ring memory it tracks, then free the tracking entry itself. Meant to run
 * at or after ->release() for memory mapped provided buffer rings.
 */
void io_kbuf_mmap_list_free(struct io_ring_ctx *ctx)
{
	struct hlist_node *next;
	struct io_buf_free *entry;

	hlist_for_each_entry_safe(entry, next, &ctx->io_buf_list, list) {
		void *region = entry->mem;

		hlist_del(&entry->list);
		io_mem_free(region);
		kfree(entry);
	}
}
Loading