Commit 0a47e02d authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge tag 'io_uring-6.16-20250626' of git://git.kernel.dk/linux

Pull io_uring fixes from Jens Axboe:

 - Two tweaks for a recent fix: fixing a memory leak if multiple iovecs
   were initially mapped but only the first was used and hence turned
   into a UBUF rather than an IOVEC iterator, and catching a case where
   a retry would be done even if the previous segment wasn't full

 - Small series fixing an issue making the vm unhappy if debugging is
   turned on, hitting a VM_BUG_ON_PAGE()

 - Fix a resource leak in io_import_dmabuf() in the error handling case,
   which is a regression in this merge window

 - Mark fallocate as needing to be write serialized, as is already done
   for truncate and buffered writes

* tag 'io_uring-6.16-20250626' of git://git.kernel.dk/linux:
  io_uring/kbuf: flag partial buffer mappings
  io_uring/net: mark iov as dynamically allocated even for single segments
  io_uring: fix resource leak in io_import_dmabuf()
  io_uring: don't assume uaddr alignment in io_vec_fill_bvec
  io_uring/rsrc: don't rely on user vaddr alignment
  io_uring/rsrc: fix folio unpinning
  io_uring: make fallocate be hashed work
parents 9c7331f1 178b8ff6
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -271,6 +271,7 @@ static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg,
		if (len > arg->max_len) {
			len = arg->max_len;
			if (!(bl->flags & IOBL_INC)) {
				arg->partial_map = 1;
				if (iov != arg->iovs)
					break;
				buf->len = len;
+2 −1
Original line number Diff line number Diff line
@@ -58,7 +58,8 @@ struct buf_sel_arg {
	size_t max_len;
	unsigned short nr_iovs;
	unsigned short mode;
	unsigned buf_group;
	unsigned short buf_group;
	unsigned short partial_map;
};

void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
+21 −13
Original line number Diff line number Diff line
@@ -75,12 +75,17 @@ struct io_sr_msg {
	u16				flags;
	/* initialised and used only by !msg send variants */
	u16				buf_group;
	bool				retry;
	unsigned short			retry_flags;
	void __user			*msg_control;
	/* used only for send zerocopy */
	struct io_kiocb 		*notif;
};

/*
 * Bit flags kept in io_sr_msg->retry_flags. They are distinct bits so they
 * can be OR'ed together; the receive retry check tests the whole field, so
 * having any of them set suppresses a further retry.
 */
enum sr_retry_flags {
	/* a receive retry has already been armed for this request */
	IO_SR_MSG_RETRY		= 1 << 0,
	/* buffer selection reported a partial mapping (arg.partial_map) */
	IO_SR_MSG_PARTIAL_MAP	= 1 << 1,
};

/*
 * Number of times we'll try and do receives if there's more data. If we
 * exceed this limit, then add us to the back of the queue and retry from
@@ -187,7 +192,7 @@ static inline void io_mshot_prep_retry(struct io_kiocb *req,

	req->flags &= ~REQ_F_BL_EMPTY;
	sr->done_io = 0;
	sr->retry = false;
	sr->retry_flags = 0;
	sr->len = 0; /* get from the provided buffer */
}

@@ -397,7 +402,7 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);

	sr->done_io = 0;
	sr->retry = false;
	sr->retry_flags = 0;
	sr->len = READ_ONCE(sqe->len);
	sr->flags = READ_ONCE(sqe->ioprio);
	if (sr->flags & ~SENDMSG_FLAGS)
@@ -751,7 +756,7 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);

	sr->done_io = 0;
	sr->retry = false;
	sr->retry_flags = 0;

	if (unlikely(sqe->file_index || sqe->addr2))
		return -EINVAL;
@@ -823,7 +828,7 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret,

		cflags |= io_put_kbufs(req, this_ret, io_bundle_nbufs(kmsg, this_ret),
				      issue_flags);
		if (sr->retry)
		if (sr->retry_flags & IO_SR_MSG_RETRY)
			cflags = req->cqe.flags | (cflags & CQE_F_MASK);
		/* bundle with no more immediate buffers, we're done */
		if (req->flags & REQ_F_BL_EMPTY)
@@ -832,12 +837,12 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
		 * If more is available AND it was a full transfer, retry and
		 * append to this one
		 */
		if (!sr->retry && kmsg->msg.msg_inq > 1 && this_ret > 0 &&
		if (!sr->retry_flags && kmsg->msg.msg_inq > 1 && this_ret > 0 &&
		    !iov_iter_count(&kmsg->msg.msg_iter)) {
			req->cqe.flags = cflags & ~CQE_F_MASK;
			sr->len = kmsg->msg.msg_inq;
			sr->done_io += this_ret;
			sr->retry = true;
			sr->retry_flags |= IO_SR_MSG_RETRY;
			return false;
		}
	} else {
@@ -1077,6 +1082,14 @@ static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg
		if (unlikely(ret < 0))
			return ret;

		if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->vec.iovec) {
			kmsg->vec.nr = ret;
			kmsg->vec.iovec = arg.iovs;
			req->flags |= REQ_F_NEED_CLEANUP;
		}
		if (arg.partial_map)
			sr->retry_flags |= IO_SR_MSG_PARTIAL_MAP;

		/* special case 1 vec, can be a fast path */
		if (ret == 1) {
			sr->buf = arg.iovs[0].iov_base;
@@ -1085,11 +1098,6 @@ static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg
		}
		iov_iter_init(&kmsg->msg.msg_iter, ITER_DEST, arg.iovs, ret,
				arg.out_len);
		if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->vec.iovec) {
			kmsg->vec.nr = ret;
			kmsg->vec.iovec = arg.iovs;
			req->flags |= REQ_F_NEED_CLEANUP;
		}
	} else {
		void __user *buf;

@@ -1275,7 +1283,7 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
	int ret;

	zc->done_io = 0;
	zc->retry = false;
	zc->retry_flags = 0;

	if (unlikely(READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3)))
		return -EINVAL;
+1 −0
Original line number Diff line number Diff line
@@ -216,6 +216,7 @@ const struct io_issue_def io_issue_defs[] = {
	},
	[IORING_OP_FALLOCATE] = {
		.needs_file		= 1,
		.hash_reg_file          = 1,
		.prep			= io_fallocate_prep,
		.issue			= io_fallocate,
	},
+22 −8
Original line number Diff line number Diff line
@@ -112,8 +112,11 @@ static void io_release_ubuf(void *priv)
	struct io_mapped_ubuf *imu = priv;
	unsigned int i;

	for (i = 0; i < imu->nr_bvecs; i++)
		unpin_user_page(imu->bvec[i].bv_page);
	for (i = 0; i < imu->nr_bvecs; i++) {
		struct folio *folio = page_folio(imu->bvec[i].bv_page);

		unpin_user_folio(folio, 1);
	}
}

static struct io_mapped_ubuf *io_alloc_imu(struct io_ring_ctx *ctx,
@@ -731,6 +734,7 @@ bool io_check_coalesce_buffer(struct page **page_array, int nr_pages,

	data->nr_pages_mid = folio_nr_pages(folio);
	data->folio_shift = folio_shift(folio);
	data->first_folio_page_idx = folio_page_idx(folio, page_array[0]);

	/*
	 * Check if pages are contiguous inside a folio, and all folios have
@@ -824,7 +828,11 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
	if (coalesced)
		imu->folio_shift = data.folio_shift;
	refcount_set(&imu->refs, 1);
	off = (unsigned long) iov->iov_base & ((1UL << imu->folio_shift) - 1);

	off = (unsigned long)iov->iov_base & ~PAGE_MASK;
	if (coalesced)
		off += data.first_folio_page_idx << PAGE_SHIFT;

	node->buf = imu;
	ret = 0;

@@ -840,8 +848,10 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
	if (ret) {
		if (imu)
			io_free_imu(ctx, imu);
		if (pages)
			unpin_user_pages(pages, nr_pages);
		if (pages) {
			for (i = 0; i < nr_pages; i++)
				unpin_user_folio(page_folio(pages[i]), 1);
		}
		io_cache_free(&ctx->node_cache, node);
		node = ERR_PTR(ret);
	}
@@ -1329,7 +1339,6 @@ static int io_vec_fill_bvec(int ddir, struct iov_iter *iter,
{
	unsigned long folio_size = 1 << imu->folio_shift;
	unsigned long folio_mask = folio_size - 1;
	u64 folio_addr = imu->ubuf & ~folio_mask;
	struct bio_vec *res_bvec = vec->bvec;
	size_t total_len = 0;
	unsigned bvec_idx = 0;
@@ -1351,8 +1360,13 @@ static int io_vec_fill_bvec(int ddir, struct iov_iter *iter,
		if (unlikely(check_add_overflow(total_len, iov_len, &total_len)))
			return -EOVERFLOW;

		/* by using folio address it also accounts for bvec offset */
		offset = buf_addr - folio_addr;
		offset = buf_addr - imu->ubuf;
		/*
		 * Only the first bvec can have non zero bv_offset, account it
		 * here and work with full folios below.
		 */
		offset += imu->bvec[0].bv_offset;

		src_bvec = imu->bvec + (offset >> imu->folio_shift);
		offset &= folio_mask;

Loading