Merge tag 'io_uring-6.7-2023-11-30' of git://git.kernel.dk/linux (c9a925b7) · Commits · git / linux-nf

include/linux/io_uring_types.h

+3 −0

Original line number	Diff line number	Diff line
		@@ -340,6 +340,9 @@ struct io_ring_ctx {

		struct list_head io_buffers_cache;

		/* deferred free list, protected by ->uring_lock */
		struct hlist_head io_buf_list;

		/* Keep this last, we don't need it for the fast path */
		struct wait_queue_head poll_wq;
		struct io_restriction restrictions;

io_uring/cancel.c

+6 −5

Original line number	Diff line number	Diff line
		@@ -273,7 +273,7 @@ int io_sync_cancel(struct io_ring_ctx ctx, void __user arg)
		};
		ktime_t timeout = KTIME_MAX;
		struct io_uring_sync_cancel_reg sc;
		struct fd f = { };
		struct file *file = NULL;
		DEFINE_WAIT(wait);
		int ret, i;

		@@ -295,10 +295,10 @@ int io_sync_cancel(struct io_ring_ctx ctx, void __user arg)
		/* we can grab a normal file descriptor upfront */
		if ((cd.flags & IORING_ASYNC_CANCEL_FD) &&
		!(cd.flags & IORING_ASYNC_CANCEL_FD_FIXED)) {
		f = fdget(sc.fd);
		if (!f.file)
		file = fget(sc.fd);
		if (!file)
		return -EBADF;
		cd.file = f.file;
		cd.file = file;
		}

		ret = __io_sync_cancel(current->io_uring, &cd, sc.fd);
		@@ -348,6 +348,7 @@ int io_sync_cancel(struct io_ring_ctx ctx, void __user arg)
		if (ret == -ENOENT \|\| ret > 0)
		ret = 0;
		out:
		fdput(f);
		if (file)
		fput(file);
		return ret;
		}

io_uring/io_uring.c

+51 −44

Original line number	Diff line number	Diff line
		@@ -325,6 +325,7 @@ static __cold struct io_ring_ctx io_ring_ctx_alloc(struct io_uring_params p)
		INIT_LIST_HEAD(&ctx->sqd_list);
		INIT_LIST_HEAD(&ctx->cq_overflow_list);
		INIT_LIST_HEAD(&ctx->io_buffers_cache);
		INIT_HLIST_HEAD(&ctx->io_buf_list);
		io_alloc_cache_init(&ctx->rsrc_node_cache, IO_NODE_ALLOC_CACHE_MAX,
		sizeof(struct io_rsrc_node));
		io_alloc_cache_init(&ctx->apoll_cache, IO_ALLOC_CACHE_MAX,
		@@ -2666,7 +2667,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
		return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
		}

		static void io_mem_free(void *ptr)
		void io_mem_free(void *ptr)
		{
		if (!ptr)
		return;
		@@ -2697,6 +2698,7 @@ static void __io_uaddr_map(struct page *pages, unsigned short npages,
		{
		struct page **page_array;
		unsigned int nr_pages;
		void *page_addr;
		int ret, i;

		*npages = 0;
		@@ -2718,27 +2720,29 @@ static void __io_uaddr_map(struct page *pages, unsigned short npages,
		io_pages_free(&page_array, ret > 0 ? ret : 0);
		return ret < 0 ? ERR_PTR(ret) : ERR_PTR(-EFAULT);
		}

		page_addr = page_address(page_array[0]);
		for (i = 0; i < nr_pages; i++) {
		ret = -EINVAL;

		/*
		* Should be a single page. If the ring is small enough that we can
		* use a normal page, that is fine. If we need multiple pages, then
		* userspace should use a huge page. That's the only way to guarantee
		* that we get contiguous memory, outside of just being lucky or
		* (currently) having low memory fragmentation.
		* Can't support mapping user allocated ring memory on 32-bit
		* archs where it could potentially reside in highmem. Just
		* fail those with -EINVAL, just like we did on kernels that
		* didn't support this feature.
		*/
		if (page_array[0] != page_array[ret - 1])
		if (PageHighMem(page_array[i]))
		goto err;

		/*
		* Can't support mapping user allocated ring memory on 32-bit archs
		* where it could potentially reside in highmem. Just fail those with
		* -EINVAL, just like we did on kernels that didn't support this
		* feature.
		* No support for discontig pages for now, should either be a
		* single normal page, or a huge page. Later on we can add
		* support for remapping discontig pages, for now we will
		* just fail them with EINVAL.
		*/
		for (i = 0; i < nr_pages; i++) {
		if (PageHighMem(page_array[i])) {
		ret = -EINVAL;
		if (page_address(page_array[i]) != page_addr)
		goto err;
		}
		page_addr += PAGE_SIZE;
		}

		*pages = page_array;
		@@ -2775,7 +2779,7 @@ static void io_rings_free(struct io_ring_ctx *ctx)
		}
		}

		static void *io_mem_alloc(size_t size)
		void *io_mem_alloc(size_t size)
		{
		gfp_t gfp = GFP_KERNEL_ACCOUNT \| __GFP_ZERO \| __GFP_NOWARN \| __GFP_COMP;
		void *ret;
		@@ -2947,6 +2951,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
		ctx->mm_account = NULL;
		}
		io_rings_free(ctx);
		io_kbuf_mmap_list_free(ctx);

		percpu_ref_exit(&ctx->refs);
		free_uid(ctx->user);
		@@ -3475,25 +3480,27 @@ static void io_uring_validate_mmap_request(struct file file,
		struct page *page;
		void *ptr;

		/* Don't allow mmap if the ring was setup without it */
		if (ctx->flags & IORING_SETUP_NO_MMAP)
		return ERR_PTR(-EINVAL);

		switch (offset & IORING_OFF_MMAP_MASK) {
		case IORING_OFF_SQ_RING:
		case IORING_OFF_CQ_RING:
		/* Don't allow mmap if the ring was setup without it */
		if (ctx->flags & IORING_SETUP_NO_MMAP)
		return ERR_PTR(-EINVAL);
		ptr = ctx->rings;
		break;
		case IORING_OFF_SQES:
		/* Don't allow mmap if the ring was setup without it */
		if (ctx->flags & IORING_SETUP_NO_MMAP)
		return ERR_PTR(-EINVAL);
		ptr = ctx->sq_sqes;
		break;
		case IORING_OFF_PBUF_RING: {
		unsigned int bgid;

		bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
		mutex_lock(&ctx->uring_lock);
		rcu_read_lock();
		ptr = io_pbuf_get_address(ctx, bgid);
		mutex_unlock(&ctx->uring_lock);
		rcu_read_unlock();
		if (!ptr)
		return ERR_PTR(-EINVAL);
		break;
		@@ -3645,7 +3652,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
		size_t, argsz)
		{
		struct io_ring_ctx *ctx;
		struct fd f;
		struct file *file;
		long ret;

		if (unlikely(flags & ~(IORING_ENTER_GETEVENTS \| IORING_ENTER_SQ_WAKEUP \|
		@@ -3663,20 +3670,19 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
		if (unlikely(!tctx \|\| fd >= IO_RINGFD_REG_MAX))
		return -EINVAL;
		fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
		f.file = tctx->registered_rings[fd];
		f.flags = 0;
		if (unlikely(!f.file))
		file = tctx->registered_rings[fd];
		if (unlikely(!file))
		return -EBADF;
		} else {
		f = fdget(fd);
		if (unlikely(!f.file))
		file = fget(fd);
		if (unlikely(!file))
		return -EBADF;
		ret = -EOPNOTSUPP;
		if (unlikely(!io_is_uring_fops(f.file)))
		if (unlikely(!io_is_uring_fops(file)))
		goto out;
		}

		ctx = f.file->private_data;
		ctx = file->private_data;
		ret = -EBADFD;
		if (unlikely(ctx->flags & IORING_SETUP_R_DISABLED))
		goto out;
		@@ -3770,7 +3776,8 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
		}
		}
		out:
		fdput(f);
		if (!(flags & IORING_ENTER_REGISTERED_RING))
		fput(file);
		return ret;
		}

		@@ -4611,7 +4618,7 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
		{
		struct io_ring_ctx *ctx;
		long ret = -EBADF;
		struct fd f;
		struct file *file;
		bool use_registered_ring;

		use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
		@@ -4630,27 +4637,27 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
		if (unlikely(!tctx \|\| fd >= IO_RINGFD_REG_MAX))
		return -EINVAL;
		fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
		f.file = tctx->registered_rings[fd];
		f.flags = 0;
		if (unlikely(!f.file))
		file = tctx->registered_rings[fd];
		if (unlikely(!file))
		return -EBADF;
		} else {
		f = fdget(fd);
		if (unlikely(!f.file))
		file = fget(fd);
		if (unlikely(!file))
		return -EBADF;
		ret = -EOPNOTSUPP;
		if (!io_is_uring_fops(f.file))
		if (!io_is_uring_fops(file))
		goto out_fput;
		}

		ctx = f.file->private_data;
		ctx = file->private_data;

		mutex_lock(&ctx->uring_lock);
		ret = __io_uring_register(ctx, opcode, arg, nr_args);
		mutex_unlock(&ctx->uring_lock);
		trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret);
		out_fput:
		fdput(f);
		if (!use_registered_ring)
		fput(file);
		return ret;
		}

io_uring/io_uring.h

+3 −0

Original line number	Diff line number	Diff line
		@@ -86,6 +86,9 @@ bool __io_alloc_req_refill(struct io_ring_ctx *ctx);
		bool io_match_task_safe(struct io_kiocb head, struct task_struct task,
		bool cancel_all);

		/*
		 * Memory helpers defined in io_uring.c. They are declared here so that
		 * other io_uring source files (kbuf.c calls both) can use them.
		 * io_mem_free() returns immediately when passed a NULL pointer.
		 */
		void *io_mem_alloc(size_t size);
		void io_mem_free(void *ptr);

		#if defined(CONFIG_PROVE_LOCKING)
		static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx)
		{

io_uring/kbuf.c

+156 −21

Original line number	Diff line number	Diff line
		@@ -33,19 +33,42 @@ struct io_provide_buf {
		__u16 bid;
		};

		static inline struct io_buffer_list io_buffer_get_list(struct io_ring_ctx ctx,
		/*
		 * Tracks one memory region allocated for an mmap'ed provided buffer
		 * ring. Entries are linked on ctx->io_buf_list and are only released by
		 * io_kbuf_mmap_list_free(); until then, a region that is no longer in
		 * use may be handed out again by io_lookup_buf_free_entry().
		 */
		struct io_buf_free {
		/* linkage on ctx->io_buf_list */
		struct hlist_node list;
		/* start of the region, as returned by io_mem_alloc() */
		void *mem;
		/* ring size in bytes that was requested when the region was allocated */
		size_t size;
		/* set to 1 while a buffer list uses the region, cleared by io_kbuf_mark_free() */
		int inuse;
		};

		static struct io_buffer_list __io_buffer_get_list(struct io_ring_ctx ctx,
		struct io_buffer_list *bl,
		unsigned int bgid)
		{
		if (ctx->io_bl && bgid < BGID_ARRAY)
		return &ctx->io_bl[bgid];
		if (bl && bgid < BGID_ARRAY)
		return &bl[bgid];

		return xa_load(&ctx->io_bl_xa, bgid);
		}

		static inline struct io_buffer_list io_buffer_get_list(struct io_ring_ctx ctx,
		unsigned int bgid)
		{
		lockdep_assert_held(&ctx->uring_lock);

		return __io_buffer_get_list(ctx, ctx->io_bl, bgid);
		}

		static int io_buffer_add_list(struct io_ring_ctx *ctx,
		struct io_buffer_list *bl, unsigned int bgid)
		{
		/*
		* Store buffer group ID and finally mark the list as visible.
		* The normal lookup doesn't care about the visibility as we're
		* always under the ->uring_lock, but the RCU lookup from mmap does.
		*/
		bl->bgid = bgid;
		smp_store_release(&bl->is_ready, 1);

		if (bgid < BGID_ARRAY)
		return 0;

		@@ -196,21 +219,40 @@ void __user io_buffer_select(struct io_kiocb req, size_t *len,

		/*
		 * Allocate and initialise the array of BGID_ARRAY buffer lists used for
		 * the low-numbered buffer groups.
		 *
		 * Returns 0 on success or -ENOMEM if the array cannot be allocated.
		 */
		static __cold int io_init_bl_list(struct io_ring_ctx *ctx)
		{
			struct io_buffer_list *bl;
			int i;

			bl = kcalloc(BGID_ARRAY, sizeof(struct io_buffer_list), GFP_KERNEL);
			if (!bl)
				return -ENOMEM;

			for (i = 0; i < BGID_ARRAY; i++) {
				INIT_LIST_HEAD(&bl[i].buf_list);
				bl[i].bgid = i;
			}

			/*
			 * Publish the array only after every entry is initialised, so the
			 * smp_load_acquire() of ctx->io_bl in io_pbuf_get_address() never
			 * observes a half set up array.
			 */
			smp_store_release(&ctx->io_bl, bl);
			return 0;
		}

		/*
		* Mark the given mapped range as free for reuse
		*/
		static void io_kbuf_mark_free(struct io_ring_ctx ctx, struct io_buffer_list bl)
		{
		struct io_buf_free *ibf;

		hlist_for_each_entry(ibf, &ctx->io_buf_list, list) {
		if (bl->buf_ring == ibf->mem) {
		ibf->inuse = 0;
		return;
		}
		}

		/* can't happen... */
		WARN_ON_ONCE(1);
		}

		static int __io_remove_buffers(struct io_ring_ctx *ctx,
		struct io_buffer_list *bl, unsigned nbufs)
		{
		@@ -223,7 +265,11 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx,
		if (bl->is_mapped) {
		i = bl->buf_ring->tail - bl->head;
		if (bl->is_mmap) {
		folio_put(virt_to_folio(bl->buf_ring));
		/*
		* io_kbuf_mmap_list_free() will free the page(s) at
		* ->release() time.
		*/
		io_kbuf_mark_free(ctx, bl);
		bl->buf_ring = NULL;
		bl->is_mmap = 0;
		} else if (bl->buf_nr_pages) {
		@@ -274,9 +320,17 @@ void io_destroy_buffers(struct io_ring_ctx *ctx)
		xa_for_each(&ctx->io_bl_xa, index, bl) {
		xa_erase(&ctx->io_bl_xa, bl->bgid);
		__io_remove_buffers(ctx, bl, -1U);
		kfree(bl);
		kfree_rcu(bl, rcu);
		}

		/*
		* Move deferred locked entries to cache before pruning
		*/
		spin_lock(&ctx->completion_lock);
		if (!list_empty(&ctx->io_buffers_comp))
		list_splice_init(&ctx->io_buffers_comp, &ctx->io_buffers_cache);
		spin_unlock(&ctx->completion_lock);

		list_for_each_safe(item, tmp, &ctx->io_buffers_cache) {
		buf = list_entry(item, struct io_buffer, list);
		kmem_cache_free(io_buf_cachep, buf);
		@@ -460,7 +514,16 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
		INIT_LIST_HEAD(&bl->buf_list);
		ret = io_buffer_add_list(ctx, bl, p->bgid);
		if (ret) {
		kfree(bl);
		/*
		* Doesn't need rcu free as it was never visible, but
		* let's keep it consistent throughout. Also can't
		* be a lower indexed array group, as adding one
		* where lookup failed cannot happen.
		*/
		if (p->bgid >= BGID_ARRAY)
		kfree_rcu(bl, rcu);
		else
		WARN_ON_ONCE(1);
		goto err;
		}
		}
		@@ -531,19 +594,63 @@ static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg,
		return -EINVAL;
		}

		static int io_alloc_pbuf_ring(struct io_uring_buf_reg *reg,
		/*
		 * See if we have a suitable region that we can reuse, rather than allocate
		 * both a new io_buf_free and mem region again. We leave it on the list as
		 * even a reused entry will need freeing at ring release.
		 *
		 * Returns the entry on ctx->io_buf_list that is not in use and whose size
		 * is the closest fit at or above @ring_size, or NULL if there is none.
		 */
		static struct io_buf_free *io_lookup_buf_free_entry(struct io_ring_ctx *ctx,
								    size_t ring_size)
		{
			struct io_buf_free *ibf, *best = NULL;
			size_t best_dist;

			hlist_for_each_entry(ibf, &ctx->io_buf_list, list) {
				size_t dist;

				if (ibf->inuse || ibf->size < ring_size)
					continue;
				dist = ibf->size - ring_size;
				/* best_dist is only read once best has been set */
				if (!best || dist < best_dist) {
					best = ibf;
					/* exact fit, no point in looking further */
					if (!dist)
						break;
					best_dist = dist;
				}
			}

			return best;
		}

		static int io_alloc_pbuf_ring(struct io_ring_ctx *ctx,
		struct io_uring_buf_reg *reg,
		struct io_buffer_list *bl)
		{
		gfp_t gfp = GFP_KERNEL_ACCOUNT \| __GFP_ZERO \| __GFP_NOWARN \| __GFP_COMP;
		struct io_buf_free *ibf;
		size_t ring_size;
		void *ptr;

		ring_size = reg->ring_entries * sizeof(struct io_uring_buf_ring);
		ptr = (void *) __get_free_pages(gfp, get_order(ring_size));

		/* Reuse existing entry, if we can */
		ibf = io_lookup_buf_free_entry(ctx, ring_size);
		if (!ibf) {
		ptr = io_mem_alloc(ring_size);
		if (!ptr)
		return -ENOMEM;

		bl->buf_ring = ptr;
		/* Allocate and store deferred free entry */
		ibf = kmalloc(sizeof(*ibf), GFP_KERNEL_ACCOUNT);
		if (!ibf) {
		io_mem_free(ptr);
		return -ENOMEM;
		}
		ibf->mem = ptr;
		ibf->size = ring_size;
		hlist_add_head(&ibf->list, &ctx->io_buf_list);
		}
		ibf->inuse = 1;
		bl->buf_ring = ibf->mem;
		bl->is_mapped = 1;
		bl->is_mmap = 1;
		return 0;
		@@ -555,6 +662,8 @@ int io_register_pbuf_ring(struct io_ring_ctx ctx, void __user arg)
		struct io_buffer_list bl, free_bl = NULL;
		int ret;

		lockdep_assert_held(&ctx->uring_lock);

		if (copy_from_user(&reg, arg, sizeof(reg)))
		return -EFAULT;

		@@ -599,7 +708,7 @@ int io_register_pbuf_ring(struct io_ring_ctx ctx, void __user arg)
		if (!(reg.flags & IOU_PBUF_RING_MMAP))
		ret = io_pin_pbuf_ring(&reg, bl);
		else
		ret = io_alloc_pbuf_ring(&reg, bl);
		ret = io_alloc_pbuf_ring(ctx, &reg, bl);

		if (!ret) {
		bl->nr_entries = reg.ring_entries;
		@@ -609,7 +718,7 @@ int io_register_pbuf_ring(struct io_ring_ctx ctx, void __user arg)
		return 0;
		}

		kfree(free_bl);
		kfree_rcu(free_bl, rcu);
		return ret;
		}

		@@ -618,6 +727,8 @@ int io_unregister_pbuf_ring(struct io_ring_ctx ctx, void __user arg)
		struct io_uring_buf_reg reg;
		struct io_buffer_list *bl;

		lockdep_assert_held(&ctx->uring_lock);

		if (copy_from_user(&reg, arg, sizeof(reg)))
		return -EFAULT;
		if (reg.resv[0] \|\| reg.resv[1] \|\| reg.resv[2])
		@@ -634,7 +745,7 @@ int io_unregister_pbuf_ring(struct io_ring_ctx ctx, void __user arg)
		__io_remove_buffers(ctx, bl, -1U);
		if (bl->bgid >= BGID_ARRAY) {
		xa_erase(&ctx->io_bl_xa, bl->bgid);
		kfree(bl);
		kfree_rcu(bl, rcu);
		}
		return 0;
		}
		@@ -643,9 +754,33 @@ void io_pbuf_get_address(struct io_ring_ctx ctx, unsigned long bgid)
		{
			struct io_buffer_list *bl;

			/*
			 * This lookup is also used by the RCU lookup from mmap, where
			 * ->uring_lock is not held. Load the array pointer with acquire
			 * semantics rather than going through io_buffer_get_list(), which
			 * asserts that the lock is held.
			 */
			bl = __io_buffer_get_list(ctx, smp_load_acquire(&ctx->io_bl), bgid);

			/*
			 * The lookup can come back empty, so validate the list before
			 * reading any of its fields.
			 */
			if (!bl || !bl->is_mmap)
				return NULL;

			/*
			 * Ensure the list is fully setup. Only strictly needed for RCU lookup
			 * via mmap, and in that case only for the array indexed groups. For
			 * the xarray lookups, it's either visible and ready, or not at all.
			 */
			if (!smp_load_acquire(&bl->is_ready))
				return NULL;

			return bl->buf_ring;
		}

		/*
		 * Release every region that was allocated for memory mapped provided
		 * buffer rings. Meant to run at or after ->release(): each entry is
		 * unlinked from ctx->io_buf_list, its memory is handed to io_mem_free()
		 * and the tracking entry itself is freed.
		 */
		void io_kbuf_mmap_list_free(struct io_ring_ctx *ctx)
		{
			struct hlist_node *next;
			struct io_buf_free *entry;

			hlist_for_each_entry_safe(entry, next, &ctx->io_buf_list, list) {
				hlist_del(&entry->list);
				io_mem_free(entry->mem);
				kfree(entry);
			}
		}