Commit f713ffa3 authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge tag 'block-6.16-20250614' of git://git.kernel.dk/linux

Pull block fixes from Jens Axboe:

 - Fix for a deadlock on queue freeze with zoned writes

 - Fix for zoned append emulation

 - Two bio folio fixes, for sparsemem and for very large folios

 - Fix for a performance regression introduced in 6.13 when plug
   insertion was changed

 - Fix for NVMe passthrough handling for polled IO

 - Document the ublk auto registration feature

 - loop lockdep warning fix

* tag 'block-6.16-20250614' of git://git.kernel.dk/linux:
  nvme: always punt polled uring_cmd end_io work to task_work
  Documentation: ublk: Separate UBLK_F_AUTO_BUF_REG fallback behavior sublists
  block: Fix bvec_set_folio() for very large folios
  bio: Fix bio_first_folio() for SPARSEMEM without VMEMMAP
  block: use plug request list tail for one-shot backmerge attempt
  block: don't use submit_bio_noacct_nocheck in blk_zone_wplug_bio_work
  block: Clear BIO_EMULATES_ZONE_APPEND flag on BIO completion
  ublk: document auto buffer registration(UBLK_F_AUTO_BUF_REG)
  loop: move lo_set_size() out of queue freeze
parents 6d13760e 9ce6c987
Loading
Loading
Loading
Loading
+77 −0
Original line number Diff line number Diff line
@@ -352,6 +352,83 @@ For reaching best IO performance, ublk server should align its segment
parameter of `struct ublk_param_segment` with backend for avoiding
unnecessary IO split, which usually hurts io_uring performance.

Auto Buffer Registration
------------------------

The ``UBLK_F_AUTO_BUF_REG`` feature automatically handles buffer registration
and unregistration for I/O requests, which simplifies the buffer management
process and reduces overhead in the ublk server implementation.

This is another feature flag for using zero copy, and it is compatible with
``UBLK_F_SUPPORT_ZERO_COPY``.

Feature Overview
~~~~~~~~~~~~~~~~

This feature automatically registers request buffers to the io_uring context
before delivering I/O commands to the ublk server and unregisters them when
completing I/O commands. This eliminates the need for manual buffer
registration/unregistration via the ``UBLK_IO_REGISTER_IO_BUF`` and
``UBLK_IO_UNREGISTER_IO_BUF`` commands, so IO handling in the ublk server
no longer depends on these two uring_cmd operations.

IOs can't be issued concurrently to io_uring if there is any dependency
among these IOs. So this approach not only simplifies the ublk server
implementation, but also makes concurrent IO handling possible by removing
the dependency on buffer registration & unregistration commands.

Usage Requirements
~~~~~~~~~~~~~~~~~~

1. The ublk server must create a sparse buffer table on the same ``io_ring_ctx``
   used for ``UBLK_IO_FETCH_REQ`` and ``UBLK_IO_COMMIT_AND_FETCH_REQ``. If
   uring_cmd is issued on a different ``io_ring_ctx``, manual buffer
   unregistration is required.

2. Buffer registration data must be passed via uring_cmd's ``sqe->addr`` with the
   following structure::

    struct ublk_auto_buf_reg {
        __u16 index;      /* Buffer index for registration */
        __u8 flags;       /* Registration flags */
        __u8 reserved0;   /* Reserved for future use */
        __u32 reserved1;  /* Reserved for future use */
    };

   ublk_auto_buf_reg_to_sqe_addr() is for converting the above structure into
   ``sqe->addr``.

3. All reserved fields in ``ublk_auto_buf_reg`` must be zeroed.

4. Optional flags can be passed via ``ublk_auto_buf_reg.flags``.

Fallback Behavior
~~~~~~~~~~~~~~~~~

If auto buffer registration fails:

1. When ``UBLK_AUTO_BUF_REG_FALLBACK`` is enabled:

   - The uring_cmd is completed
   - ``UBLK_IO_F_NEED_REG_BUF`` is set in ``ublksrv_io_desc.op_flags``
   - The ublk server must manually deal with the failure, for example by
     registering the buffer manually, or by using the user copy feature to
     retrieve the data for handling the ublk IO

2. If fallback is not enabled:

   - The ublk I/O request fails silently
   - The uring_cmd won't be completed

Limitations
~~~~~~~~~~~

- Requires same ``io_ring_ctx`` for all operations
- May require manual buffer management in fallback cases
- The ``io_ring_ctx`` buffer table has a max size of 16K, which may not be
  enough when too many ublk devices are handled by a single ``io_ring_ctx``
  and each one has a very large queue depth

References
==========

+13 −13
Original line number Diff line number Diff line
@@ -998,21 +998,21 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
	if (!plug || rq_list_empty(&plug->mq_list))
		return false;

	rq = plug->mq_list.tail;
	if (rq->q == q)
		return blk_attempt_bio_merge(q, rq, bio, nr_segs, false) ==
			BIO_MERGE_OK;
	else if (!plug->multiple_queues)
		return false;

	rq_list_for_each(&plug->mq_list, rq) {
		if (rq->q == q) {
		if (rq->q != q)
			continue;
		if (blk_attempt_bio_merge(q, rq, bio, nr_segs, false) ==
		    BIO_MERGE_OK)
			return true;
		break;
	}

		/*
		 * Only keep iterating plug list for merges if we have multiple
		 * queues
		 */
		if (!plug->multiple_queues)
			break;
	}
	return false;
}

+6 −2
Original line number Diff line number Diff line
@@ -1225,6 +1225,7 @@ void blk_zone_write_plug_bio_endio(struct bio *bio)
	if (bio_flagged(bio, BIO_EMULATES_ZONE_APPEND)) {
		bio->bi_opf &= ~REQ_OP_MASK;
		bio->bi_opf |= REQ_OP_ZONE_APPEND;
		bio_clear_flag(bio, BIO_EMULATES_ZONE_APPEND);
	}

	/*
@@ -1306,7 +1307,6 @@ static void blk_zone_wplug_bio_work(struct work_struct *work)
	spin_unlock_irqrestore(&zwplug->lock, flags);

	bdev = bio->bi_bdev;
	submit_bio_noacct_nocheck(bio);

	/*
	 * blk-mq devices will reuse the extra reference on the request queue
@@ -1314,8 +1314,12 @@ static void blk_zone_wplug_bio_work(struct work_struct *work)
	 * path for BIO-based devices will not do that. So drop this extra
	 * reference here.
	 */
	if (bdev_test_flag(bdev, BD_HAS_SUBMIT_BIO))
	if (bdev_test_flag(bdev, BD_HAS_SUBMIT_BIO)) {
		bdev->bd_disk->fops->submit_bio(bio);
		blk_queue_exit(bdev->bd_disk->queue);
	} else {
		blk_mq_submit_bio(bio);
	}

put_zwplug:
	/* Drop the reference we took in disk_zone_wplug_schedule_bio_work(). */
+5 −6
Original line number Diff line number Diff line
@@ -1248,12 +1248,6 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
	lo->lo_flags &= ~LOOP_SET_STATUS_CLEARABLE_FLAGS;
	lo->lo_flags |= (info->lo_flags & LOOP_SET_STATUS_SETTABLE_FLAGS);

	if (size_changed) {
		loff_t new_size = get_size(lo->lo_offset, lo->lo_sizelimit,
					   lo->lo_backing_file);
		loop_set_size(lo, new_size);
	}

	/* update the direct I/O flag if lo_offset changed */
	loop_update_dio(lo);

@@ -1261,6 +1255,11 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
	blk_mq_unfreeze_queue(lo->lo_queue, memflags);
	if (partscan)
		clear_bit(GD_SUPPRESS_PART_SCAN, &lo->lo_disk->state);
	if (!err && size_changed) {
		loff_t new_size = get_size(lo->lo_offset, lo->lo_sizelimit,
					   lo->lo_backing_file);
		loop_set_size(lo, new_size);
	}
out_unlock:
	mutex_unlock(&lo->lo_mutex);
	if (partscan)
+7 −14
Original line number Diff line number Diff line
@@ -429,21 +429,14 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req,
	pdu->result = le64_to_cpu(nvme_req(req)->result.u64);

	/*
	 * For iopoll, complete it directly. Note that using the uring_cmd
	 * helper for this is safe only because we check blk_rq_is_poll().
	 * As that returns false if we're NOT on a polled queue, then it's
	 * safe to use the polled completion helper.
	 *
	 * Otherwise, move the completion to task work.
	 * IOPOLL could potentially complete this request directly, but
	 * if multiple rings are polling on the same queue, then it's possible
	 * for one ring to find completions for another ring. Punting the
	 * completion via task_work will always direct it to the right
	 * location, rather than potentially complete requests for ringA
	 * under iopoll invocations from ringB.
	 */
	if (blk_rq_is_poll(req)) {
		if (pdu->bio)
			blk_rq_unmap_user(pdu->bio);
		io_uring_cmd_iopoll_done(ioucmd, pdu->result, pdu->status);
	} else {
	io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_cb);
	}

	return RQ_END_IO_FREE;
}

Loading