Commit 4adc13ed authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge tag 'for-7.0/block-stable-pages-20260206' of...

Merge tag 'for-7.0/block-stable-pages-20260206' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux

Pull bounce buffer dio for stable pages from Jens Axboe:
 "This adds support for bounce buffering of dio for stable pages. This
  was all done by Christoph. In his words:

  This series tries to address the problem that under I/O pages can be
  modified during direct I/O, even when the device or file system
  require stable pages during I/O to calculate checksums, parity or data
  operations. It does so by adding block layer helpers to bounce buffer
  an iov_iter into a bio, then wires that up in iomap and ultimately
  XFS.

  The reason that the file system even needs to know about it, is
  because reads need a user context to copy the data back, and the
  infrastructure to defer ioends to a workqueue currently sits in XFS.
  I'm going to look into moving that into ioend and enabling it for
  other file systems. Additionally btrfs already has its own
  infrastructure for this, and actually an urgent need to bounce buffer,
  so this should be useful there and could be wired up easily. In fact
  the idea comes from patches by Qu that did this in btrfs.

  This patch fixes all but one of the xfstests failures on T10 PI capable
  devices (generic/095 seems to have issues with a mix of mmap and
  splice still, I'm looking into that separately), and makes qemu VMs
  running Windows, or Linux with swap enabled, work fine on an XFS file
  on a device using PI.

  Performance numbers on my (not exactly state of the art) NVMe PI test
  setup:

      Sequential reads using io_uring, QD=16.
      Bandwidth and CPU usage (usr/sys):

      | size |        zero copy         |          bounce          |
      +------+--------------------------+--------------------------+
      |   4k | 1316MiB/s (12.65/55.40%) | 1081MiB/s (11.76/49.78%) |
      |  64K | 3370MiB/s ( 5.46/18.20%) | 3365MiB/s ( 4.47/15.68%) |
      |   1M | 3401MiB/s ( 0.76/23.05%) | 3400MiB/s ( 0.80/09.06%) |
      +------+--------------------------+--------------------------+

      Sequential writes using io_uring, QD=16.
      Bandwidth and CPU usage (usr/sys):

      | size |        zero copy         |          bounce          |
      +------+--------------------------+--------------------------+
      |   4k |  882MiB/s (11.83/33.88%) |  750MiB/s (10.53/34.08%) |
      |  64K | 2009MiB/s ( 7.33/15.80%) | 2007MiB/s ( 7.47/24.71%) |
      |   1M | 1992MiB/s ( 7.26/ 9.13%) | 1992MiB/s ( 9.21/19.11%) |
      +------+--------------------------+--------------------------+

  Note that the 64k read numbers look really odd to me for the baseline
  zero copy case, but are reproducible over many repeated runs.

  The bounce read numbers should further improve when moving the PI
  validation to the file system and removing the double context switch,
  which I have patches for that will be sent out soon"

* tag 'for-7.0/block-stable-pages-20260206' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux:
  xfs: use bounce buffering direct I/O when the device requires stable pages
  iomap: add a flag to bounce buffer direct I/O
  iomap: support ioends for direct reads
  iomap: rename IOMAP_DIO_DIRTY to IOMAP_DIO_USER_BACKED
  iomap: free the bio before completing the dio
  iomap: share code between iomap_dio_bio_end_io and iomap_finish_ioend_direct
  iomap: split out the per-bio logic from iomap_dio_bio_iter
  iomap: simplify iomap_dio_bio_iter
  iomap: fix submission side handling of completion side errors
  block: add helpers to bounce buffer an iov_iter into bios
  block: remove bio_release_page
  iov_iter: extract a iov_iter_extract_bvecs helper from bio code
  block: open code bio_add_page and fix handling of mismatching P2P ranges
  block: refactor get_contig_folio_len
  block: add a BIO_MAX_SIZE constant and use it
parents 0c00ed30 3373503d
Loading
Loading
Loading
Loading
+205 −127
Original line number Diff line number Diff line
@@ -958,7 +958,7 @@ static inline bool bio_full(struct bio *bio, unsigned len)
{
	if (bio->bi_vcnt >= bio->bi_max_vecs)
		return true;
	if (bio->bi_iter.bi_size > UINT_MAX - len)
	if (bio->bi_iter.bi_size > BIO_MAX_SIZE - len)
		return true;
	return false;
}
@@ -1064,7 +1064,7 @@ int bio_add_page(struct bio *bio, struct page *page,
{
	if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
		return 0;
	if (bio->bi_iter.bi_size > UINT_MAX - len)
	if (bio->bi_iter.bi_size > BIO_MAX_SIZE - len)
		return 0;

	if (bio->bi_vcnt > 0) {
@@ -1091,7 +1091,7 @@ void bio_add_folio_nofail(struct bio *bio, struct folio *folio, size_t len,
{
	unsigned long nr = off / PAGE_SIZE;

	WARN_ON_ONCE(len > UINT_MAX);
	WARN_ON_ONCE(len > BIO_MAX_SIZE);
	__bio_add_page(bio, folio_page(folio, nr), len, off % PAGE_SIZE);
}
EXPORT_SYMBOL_GPL(bio_add_folio_nofail);
@@ -1115,7 +1115,7 @@ bool bio_add_folio(struct bio *bio, struct folio *folio, size_t len,
{
	unsigned long nr = off / PAGE_SIZE;

	if (len > UINT_MAX)
	if (len > BIO_MAX_SIZE)
		return false;
	return bio_add_page(bio, folio_page(folio, nr), len, off % PAGE_SIZE) > 0;
}
@@ -1206,122 +1206,6 @@ void bio_iov_bvec_set(struct bio *bio, const struct iov_iter *iter)
	bio_set_flag(bio, BIO_CLONED);
}

/*
 * Compute how many bytes, starting at pages[i] + offset, are physically
 * contiguous and belong to @folio, looking at no more than *num_pages pages
 * and no more than @left bytes.
 *
 * On return *num_pages holds the number of pages (starting at index @i) that
 * were found to be contiguous; the byte length of that run is returned.
 */
static unsigned int get_contig_folio_len(unsigned int *num_pages,
					 struct page **pages, unsigned int i,
					 struct folio *folio, size_t left,
					 size_t offset)
{
	const unsigned int end = i + *num_pages;
	/* The first page only contributes what lies behind @offset. */
	size_t run_len = min_t(size_t, PAGE_SIZE - offset, left);
	size_t remaining = left - run_len;
	unsigned int idx = i + 1;

	/*
	 * A single page in the middle of a large folio may have been COWed,
	 * so every following page must be verified to still be part of the
	 * same folio and to directly follow its predecessor.
	 */
	while (idx < end) {
		size_t chunk;

		if (page_folio(pages[idx]) != folio ||
		    pages[idx] != pages[idx - 1] + 1)
			break;

		chunk = min_t(size_t, PAGE_SIZE, remaining);
		run_len += chunk;
		remaining -= chunk;
		idx++;
	}

	*num_pages = idx - i;
	return run_len;
}

#define PAGE_PTRS_PER_BVEC     (sizeof(struct bio_vec) / sizeof(struct page *))

/**
 * __bio_iov_iter_get_pages - pin user or kernel pages and add them to a bio
 * @bio: bio to add pages to
 * @iter: iov iterator describing the region to be mapped
 *
 * Extracts pages from *iter and appends them to @bio's bvec array.  The pages
 * will have to be cleaned up in the way indicated by the BIO_PAGE_PINNED flag.
 * For a multi-segment *iter, this function only adds pages from the next
 * non-empty segment of the iov iterator.
 *
 * Return: 0 on success, the negative value returned by
 * iov_iter_extract_pages() if extraction fails, -EFAULT if nothing could be
 * extracted, or -EINVAL if a folio could not be added to @bio.
 */
static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
{
	iov_iter_extraction_t extraction_flags = 0;
	/* Both counters start out as the number of still unused bvec slots. */
	unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt;
	unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
	struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
	/* The unused bvec slots double as scratch space for the page array. */
	struct page **pages = (struct page **)bv;
	ssize_t size;
	unsigned int num_pages, i = 0;
	size_t offset, folio_offset, left, len;
	int ret = 0;

	/*
	 * Move page array up in the allocated memory for the bio vecs as far as
	 * possible so that we can start filling biovecs from the beginning
	 * without overwriting the temporary page array.
	 */
	BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2);
	pages += entries_left * (PAGE_PTRS_PER_BVEC - 1);

	if (bio->bi_bdev && blk_queue_pci_p2pdma(bio->bi_bdev->bd_disk->queue))
		extraction_flags |= ITER_ALLOW_P2PDMA;

	/* Never request more bytes than still fit below UINT_MAX in bi_size. */
	size = iov_iter_extract_pages(iter, &pages,
				      UINT_MAX - bio->bi_iter.bi_size,
				      nr_pages, extraction_flags, &offset);
	/* A zero-length extraction is reported to the caller as -EFAULT. */
	if (unlikely(size <= 0))
		return size ? size : -EFAULT;

	/* From here on nr_pages is the number of pages covering the range. */
	nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE);
	for (left = size, i = 0; left > 0; left -= len, i += num_pages) {
		struct page *page = pages[i];
		struct folio *folio = page_folio(page);
		unsigned int old_vcnt = bio->bi_vcnt;

		folio_offset = ((size_t)folio_page_idx(folio, page) <<
			       PAGE_SHIFT) + offset;

		len = min(folio_size(folio) - folio_offset, left);

		num_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE);

		/* Multi-page candidates must be checked for real contiguity. */
		if (num_pages > 1)
			len = get_contig_folio_len(&num_pages, pages, i,
						   folio, left, offset);

		if (!bio_add_folio(bio, folio, len, folio_offset)) {
			WARN_ON_ONCE(1);
			ret = -EINVAL;
			goto out;
		}

		if (bio_flagged(bio, BIO_PAGE_PINNED)) {
			/*
			 * We're adding another fragment of a page that already
			 * was part of the last segment.  Undo our pin as the
			 * page was pinned when an earlier fragment of it was
			 * added to the bio and __bio_release_pages expects a
			 * single pin per page.
			 */
			if (offset && bio->bi_vcnt == old_vcnt)
				unpin_user_folio(folio, 1);
		}
		/* Only the first iteration uses the extraction offset. */
		offset = 0;
	}

	iov_iter_revert(iter, left);
out:
	/* Release every extracted page from index i on that was not consumed. */
	while (i < nr_pages)
		bio_release_page(bio, pages[i++]);

	return ret;
}

/*
 * Aligns the bio size to the len_align_mask, releasing excessive bio vecs that
 * __bio_iov_iter_get_pages may have inserted, and reverts the trimmed length
@@ -1345,7 +1229,9 @@ static int bio_iov_iter_align_down(struct bio *bio, struct iov_iter *iter,
			break;
		}

		bio_release_page(bio, bv->bv_page);
		if (bio_flagged(bio, BIO_PAGE_PINNED))
			unpin_user_page(bv->bv_page);

		bio->bi_vcnt--;
		nbytes -= bv->bv_len;
	} while (nbytes);
@@ -1379,7 +1265,7 @@ static int bio_iov_iter_align_down(struct bio *bio, struct iov_iter *iter,
int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter,
			   unsigned len_align_mask)
{
	int ret = 0;
	iov_iter_extraction_t flags = 0;

	if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
		return -EIO;
@@ -1392,13 +1278,205 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter,

	if (iov_iter_extract_will_pin(iter))
		bio_set_flag(bio, BIO_PAGE_PINNED);
	if (bio->bi_bdev && blk_queue_pci_p2pdma(bio->bi_bdev->bd_disk->queue))
		flags |= ITER_ALLOW_P2PDMA;

	do {
		ret = __bio_iov_iter_get_pages(bio, iter);
	} while (!ret && iov_iter_count(iter) && !bio_full(bio, 0));
		ssize_t ret;

		ret = iov_iter_extract_bvecs(iter, bio->bi_io_vec,
				BIO_MAX_SIZE - bio->bi_iter.bi_size,
				&bio->bi_vcnt, bio->bi_max_vecs, flags);
		if (ret <= 0) {
			if (!bio->bi_vcnt)
				return ret;
			break;
		}
		bio->bi_iter.bi_size += ret;
	} while (iov_iter_count(iter) && !bio_full(bio, 0));

	if (bio->bi_vcnt)
	if (is_pci_p2pdma_page(bio->bi_io_vec->bv_page))
		bio->bi_opf |= REQ_NOMERGE;
	return bio_iov_iter_align_down(bio, iter, len_align_mask);
}

/*
 * Allocate a folio of up to *size bytes, preferring the largest possible one.
 *
 * Sizes above PAGE_SIZE are attempted with __GFP_NORETRY; on failure the
 * request is reduced to the next lower power of two and retried.  Once the
 * request is PAGE_SIZE or less, a final attempt with the unmodified @gfp is
 * made.  *size is updated to the size that was last requested.
 *
 * Returns the folio, or NULL if even the final attempt failed.
 */
static struct folio *folio_alloc_greedy(gfp_t gfp, size_t *size)
{
	size_t want = *size;
	struct folio *folio;

	for (; want > PAGE_SIZE; want = rounddown_pow_of_two(want - 1)) {
		folio = folio_alloc(gfp | __GFP_NORETRY, get_order(want));
		if (folio)
			goto done;
	}

	folio = folio_alloc(gfp, get_order(want));
done:
	*size = want;
	return folio;
}

/*
 * Drop one reference on the folio behind every bvec of @bio, leaving the
 * zero folio alone.
 */
static void bio_free_folios(struct bio *bio)
{
	struct bio_vec *bvec;
	int idx;

	bio_for_each_bvec_all(bvec, bio, idx) {
		struct folio *bounce = page_folio(bvec->bv_page);

		if (is_zero_folio(bounce))
			continue;
		folio_put(bounce);
	}
}

/*
 * Fill @bio with freshly allocated folios and copy the data described by
 * @iter into them.
 *
 * Folios are allocated in chunks of at most SZ_1M until the iter is drained,
 * the bio runs out of bvec slots, BIO_MAX_SIZE would be exceeded, or an
 * allocation fails.
 *
 * Returns 0 if at least one chunk was added, -EINVAL if @bio is not in a
 * usable state, -EFAULT if copying from @iter came up short (all folios added
 * so far are released), or -ENOMEM if nothing could be added.
 */
static int bio_iov_iter_bounce_write(struct bio *bio, struct iov_iter *iter)
{
	size_t remaining = iov_iter_count(iter);

	if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
		return -EINVAL;
	if (WARN_ON_ONCE(bio->bi_iter.bi_size))
		return -EINVAL;
	if (WARN_ON_ONCE(bio->bi_vcnt >= bio->bi_max_vecs))
		return -EINVAL;

	for (;;) {
		size_t chunk = min(remaining, SZ_1M);
		struct folio *bounce;
		size_t copied;

		/* Larger requests are rounded down to a power of two. */
		if (chunk > PAGE_SIZE * 2)
			chunk = rounddown_pow_of_two(chunk);

		if (bio->bi_iter.bi_size > BIO_MAX_SIZE - chunk)
			break;

		/* The allocator may shrink chunk if memory is tight. */
		bounce = folio_alloc_greedy(GFP_KERNEL, &chunk);
		if (!bounce)
			break;
		bio_add_folio_nofail(bio, bounce, chunk, 0);

		copied = copy_from_iter(folio_address(bounce), chunk, iter);
		if (copied != chunk) {
			bio_free_folios(bio);
			return -EFAULT;
		}

		remaining -= chunk;
		if (!remaining || bio->bi_vcnt >= bio->bi_max_vecs)
			break;
	}

	return bio->bi_iter.bi_size ? 0 : -ENOMEM;
}

/*
 * Set up @bio for a bounce buffered read.
 *
 * A single bounce folio of at most SZ_1M is allocated and becomes
 * bi_io_vec[0], i.e. the only bvec the I/O path gets to see.  The user
 * buffers described by @iter are extracted into bi_io_vec[1..] so that the
 * data can be copied back to them on completion; bi_vcnt counts those user
 * bvecs and not the bounce folio.
 *
 * Returns 0 on success, -ENOMEM if the bounce folio could not be allocated,
 * or the value returned by iov_iter_extract_bvecs() if no user buffer could
 * be extracted at all.
 */
static int bio_iov_iter_bounce_read(struct bio *bio, struct iov_iter *iter)
{
	size_t len = min(iov_iter_count(iter), SZ_1M);
	struct folio *folio;

	folio = folio_alloc_greedy(GFP_KERNEL, &len);
	if (!folio)
		return -ENOMEM;

	do {
		ssize_t ret;

		ret = iov_iter_extract_bvecs(iter, bio->bi_io_vec + 1, len,
				&bio->bi_vcnt, bio->bi_max_vecs - 1, 0);
		if (ret <= 0) {
			if (!bio->bi_vcnt) {
				/*
				 * Nothing was extracted, so the bounce folio
				 * never gets attached to the bio.  Drop it
				 * here, as nobody else would free it.
				 */
				folio_put(folio);
				return ret;
			}
			break;
		}
		len -= ret;
		bio->bi_iter.bi_size += ret;
	} while (len && bio->bi_vcnt < bio->bi_max_vecs - 1);

	/*
	 * Set the folio directly here.  The above loop has already calculated
	 * the correct bi_size, and we use bi_vcnt for the user buffers.  That
	 * is safe as bi_vcnt is only used by the submitter and not the actual
	 * I/O path.
	 */
	bvec_set_folio(&bio->bi_io_vec[0], folio, bio->bi_iter.bi_size, 0);
	if (iov_iter_extract_will_pin(iter))
		bio_set_flag(bio, BIO_PAGE_PINNED);
	return 0;
}

/**
 * bio_iov_iter_bounce - bounce buffer data from an iter into a bio
 * @bio:	bio to send
 * @iter:	iter to read from / write into
 *
 * For direct I/O implementations that must not operate on the caller's memory
 * directly, e.g. because the data is checksummed or otherwise needs to stay
 * consistent while the I/O is in flight.  Folios backing the bounce buffer are
 * allocated here; for writes the data is copied into them right away.  Every
 * successful call needs a matching bio_iov_iter_unbounce() on completion.
 *
 * Return: 0 on success or a negative error code from the write or read setup
 * helper.
 */
int bio_iov_iter_bounce(struct bio *bio, struct iov_iter *iter)
{
	return op_is_write(bio_op(bio)) ?
		bio_iov_iter_bounce_write(bio, iter) :
		bio_iov_iter_bounce_read(bio, iter);
}

/*
 * Release the pins held on the folio behind @bv, one per page that the bvec
 * touches, optionally marking the folio dirty first.
 */
static void bvec_unpin(struct bio_vec *bv, bool mark_dirty)
{
	struct folio *folio = page_folio(bv->bv_page);
	/* Index of the first and the last page covered by the bvec. */
	size_t first_page = bv->bv_offset / PAGE_SIZE;
	size_t last_page = (bv->bv_offset + bv->bv_len - 1) / PAGE_SIZE;
	size_t nr_pages = last_page - first_page + 1;

	if (mark_dirty)
		folio_mark_dirty_lock(folio);
	unpin_user_folio(folio, nr_pages);
}

/*
 * Complete a bounce buffered read: copy the data from the bounce folio in
 * bi_io_vec[0] to the user bvecs stored behind it (unless the I/O failed),
 * unpin the user buffers if they were pinned, and free the bounce folio.
 */
static void bio_iov_iter_unbounce_read(struct bio *bio, bool is_error,
		bool mark_dirty)
{
	struct bio_vec *bounce = &bio->bi_io_vec[0];
	struct bio_vec *user = &bio->bi_io_vec[1];
	unsigned int len = bounce->bv_len;

	if (unlikely(is_error)) {
		/* Nothing was copied, so there is nothing to mark dirty. */
		mark_dirty = false;
	} else {
		void *buf = bvec_virt(bounce);
		struct iov_iter to;

		iov_iter_bvec(&to, ITER_DEST, user, bio->bi_vcnt, len);
		/* A copy to pinned pages is not expected to come up short. */
		WARN_ON_ONCE(copy_to_iter(buf, len, &to) != len);
	}

	if (bio_flagged(bio, BIO_PAGE_PINNED)) {
		int i;

		for (i = 0; i < bio->bi_vcnt; i++)
			bvec_unpin(&user[i], mark_dirty);
	}

	folio_put(page_folio(bounce->bv_page));
}

/**
 * bio_iov_iter_unbounce - finish a bounce buffer operation
 * @bio:	completed bio
 * @is_error:	%true if an I/O error occurred and data should not be copied
 * @mark_dirty:	If %true, folios will be marked dirty.
 *
 * Counterpart to bio_iov_iter_bounce(), to be called once the bio has
 * completed.  For writes the bounce folios are simply freed.  For reads the
 * data is copied back to the original buffers unless @is_error is set, the
 * original folios are marked dirty if requested, and the bounce buffer is
 * freed.
 */
void bio_iov_iter_unbounce(struct bio *bio, bool is_error, bool mark_dirty)
{
	if (!op_is_write(bio_op(bio))) {
		bio_iov_iter_unbounce_read(bio, is_error, mark_dirty);
		return;
	}

	bio_free_folios(bio);
}

static void submit_bio_wait_endio(struct bio *bio)
+4 −5
Original line number Diff line number Diff line
@@ -32,7 +32,7 @@ static sector_t bio_discard_limit(struct block_device *bdev, sector_t sector)
	 * Align the bio size to the discard granularity to make splitting the bio
	 * at discard granularity boundaries easier in the driver if needed.
	 */
	return round_down(UINT_MAX, discard_granularity) >> SECTOR_SHIFT;
	return round_down(BIO_MAX_SIZE, discard_granularity) >> SECTOR_SHIFT;
}

struct bio *blk_alloc_discard_bio(struct block_device *bdev,
@@ -107,8 +107,7 @@ static sector_t bio_write_zeroes_limit(struct block_device *bdev)
{
	sector_t bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1;

	return min(bdev_write_zeroes_sectors(bdev),
		(UINT_MAX >> SECTOR_SHIFT) & ~bs_mask);
	return min(bdev_write_zeroes_sectors(bdev), BIO_MAX_SECTORS & ~bs_mask);
}

/*
@@ -337,8 +336,8 @@ int blkdev_issue_secure_erase(struct block_device *bdev, sector_t sector,
	int ret = 0;

	/* make sure that "len << SECTOR_SHIFT" doesn't overflow */
	if (max_sectors > UINT_MAX >> SECTOR_SHIFT)
		max_sectors = UINT_MAX >> SECTOR_SHIFT;
	if (max_sectors > BIO_MAX_SECTORS)
		max_sectors = BIO_MAX_SECTORS;
	max_sectors &= ~bs_mask;

	if (max_sectors == 0)
+4 −4
Original line number Diff line number Diff line
@@ -95,13 +95,13 @@ static inline bool req_gap_front_merge(struct request *req, struct bio *bio)
}

/*
 * The max size one bio can handle is UINT_MAX becasue bvec_iter.bi_size
 * is defined as 'unsigned int', meantime it has to be aligned to with the
 * The maximum size that a bio can fit has to be aligned down to the
 * logical block size, which is the minimum accepted unit by hardware.
 */
static unsigned int bio_allowed_max_sectors(const struct queue_limits *lim)
{
	return round_down(UINT_MAX, lim->logical_block_size) >> SECTOR_SHIFT;
	return round_down(BIO_MAX_SIZE, lim->logical_block_size) >>
			SECTOR_SHIFT;
}

/*
@@ -515,7 +515,7 @@ unsigned int blk_recalc_rq_segments(struct request *rq)

	rq_for_each_bvec(bv, rq, iter)
		bvec_split_segs(&rq->q->limits, &bv, &nr_phys_segs, &bytes,
				UINT_MAX, UINT_MAX);
				UINT_MAX, BIO_MAX_SIZE);
	return nr_phys_segs;
}

+0 −11
Original line number Diff line number Diff line
@@ -599,17 +599,6 @@ void bdev_set_nr_sectors(struct block_device *bdev, sector_t sectors);

struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
		struct lock_class_key *lkclass);

/*
 * Undo the pin on @page if the pages of @bio were pinned (BIO_PAGE_PINNED is
 * set); otherwise nothing is done.
 */
static inline void bio_release_page(struct bio *bio, struct page *page)
{
	if (!bio_flagged(bio, BIO_PAGE_PINNED))
		return;
	unpin_user_page(page);
}

struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id);

int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode);
+104 −87
Original line number Diff line number Diff line
@@ -23,7 +23,7 @@
#define IOMAP_DIO_WRITE_THROUGH	(1U << 28)
#define IOMAP_DIO_NEED_SYNC	(1U << 29)
#define IOMAP_DIO_WRITE		(1U << 30)
#define IOMAP_DIO_DIRTY		(1U << 31)
#define IOMAP_DIO_USER_BACKED	(1U << 31)

struct iomap_dio {
	struct kiocb		*iocb;
@@ -223,51 +223,52 @@ static void iomap_dio_done(struct iomap_dio *dio)
	iomap_dio_complete_work(&dio->aio.work);
}

void iomap_dio_bio_end_io(struct bio *bio)
static void __iomap_dio_bio_end_io(struct bio *bio, bool inline_completion)
{
	struct iomap_dio *dio = bio->bi_private;
	bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);

	if (bio->bi_status)
		iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status));

	if (atomic_dec_and_test(&dio->ref))
		iomap_dio_done(dio);

	if (should_dirty) {
	if (dio->flags & IOMAP_DIO_BOUNCE) {
		bio_iov_iter_unbounce(bio, !!dio->error,
				dio->flags & IOMAP_DIO_USER_BACKED);
		bio_put(bio);
	} else if (dio->flags & IOMAP_DIO_USER_BACKED) {
		bio_check_pages_dirty(bio);
	} else {
		bio_release_pages(bio, false);
		bio_put(bio);
	}
}
EXPORT_SYMBOL_GPL(iomap_dio_bio_end_io);

u32 iomap_finish_ioend_direct(struct iomap_ioend *ioend)
{
	struct iomap_dio *dio = ioend->io_bio.bi_private;
	bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
	u32 vec_count = ioend->io_bio.bi_vcnt;

	if (ioend->io_error)
		iomap_dio_set_error(dio, ioend->io_error);
	/* Do not touch bio below, we just gave up our reference. */

	if (atomic_dec_and_test(&dio->ref)) {
		/*
		 * Try to avoid another context switch for the completion given
		 * that we are already called from the ioend completion
		 * workqueue.
		 * Avoid another context switch for the completion when already
		 * called from the ioend completion workqueue.
		 */
		if (inline_completion)
			dio->flags &= ~IOMAP_DIO_COMP_WORK;
		iomap_dio_done(dio);
	}
}

	if (should_dirty) {
		bio_check_pages_dirty(&ioend->io_bio);
	} else {
		bio_release_pages(&ioend->io_bio, false);
		bio_put(&ioend->io_bio);
void iomap_dio_bio_end_io(struct bio *bio)
{
	struct iomap_dio *dio = bio->bi_private;

	if (bio->bi_status)
		iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status));
	__iomap_dio_bio_end_io(bio, false);
}
EXPORT_SYMBOL_GPL(iomap_dio_bio_end_io);

u32 iomap_finish_ioend_direct(struct iomap_ioend *ioend)
{
	struct iomap_dio *dio = ioend->io_bio.bi_private;
	u32 vec_count = ioend->io_bio.bi_vcnt;

	if (ioend->io_error)
		iomap_dio_set_error(dio, ioend->io_error);
	__iomap_dio_bio_end_io(&ioend->io_bio, true);

	/*
	 * Return the number of bvecs completed as even direct I/O completions
@@ -314,6 +315,65 @@ static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
	return 0;
}

/*
 * Build and submit a single bio for the direct I/O described by @dio.
 *
 * The bio is allocated, initialised for position @pos, and then filled from
 * dio->submit.iter, either through bounce buffering (IOMAP_DIO_BOUNCE) or by
 * adding the iter's pages directly with @alignment - 1 as the length
 * alignment mask.
 *
 * Returns the number of bytes placed in the submitted bio.  If filling the bio
 * fails, the non-zero value from the fill helper is returned; -EINVAL is
 * returned if a REQ_ATOMIC bio does not cover the whole mapping length.  In
 * both failure cases the bio is put and nothing is submitted.
 */
static ssize_t iomap_dio_bio_iter_one(struct iomap_iter *iter,
		struct iomap_dio *dio, loff_t pos, unsigned int alignment,
		blk_opf_t op)
{
	unsigned int nr_vecs;
	struct bio *bio;
	ssize_t ret;

	/* Bounce buffered I/O sizes its bvec array with its own helper. */
	if (dio->flags & IOMAP_DIO_BOUNCE)
		nr_vecs = bio_iov_bounce_nr_vecs(dio->submit.iter, op);
	else
		nr_vecs = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS);

	bio = iomap_dio_alloc_bio(iter, dio, nr_vecs, op);
	fscrypt_set_bio_crypt_ctx(bio, iter->inode,
			pos >> iter->inode->i_blkbits, GFP_KERNEL);
	bio->bi_iter.bi_sector = iomap_sector(&iter->iomap, pos);
	bio->bi_write_hint = iter->inode->i_write_hint;
	bio->bi_ioprio = dio->iocb->ki_ioprio;
	bio->bi_private = dio;
	bio->bi_end_io = iomap_dio_bio_end_io;

	if (dio->flags & IOMAP_DIO_BOUNCE)
		ret = bio_iov_iter_bounce(bio, dio->submit.iter);
	else
		ret = bio_iov_iter_get_pages(bio, dio->submit.iter,
					     alignment - 1);
	if (unlikely(ret))
		goto out_put_bio;
	/* From here on ret is the number of bytes carried by this bio. */
	ret = bio->bi_iter.bi_size;

	/*
	 * An atomic write bio must cover the complete length.  If it doesn't,
	 * error out.
	 */
	if ((op & REQ_ATOMIC) && WARN_ON_ONCE(ret != iomap_length(iter))) {
		ret = -EINVAL;
		goto out_put_bio;
	}

	/*
	 * Writes are accounted to the task.  Reads into user backed memory
	 * get their pages marked dirty here, except for bounce buffered reads.
	 */
	if (dio->flags & IOMAP_DIO_WRITE)
		task_io_account_write(ret);
	else if ((dio->flags & IOMAP_DIO_USER_BACKED) &&
		 !(dio->flags & IOMAP_DIO_BOUNCE))
		bio_set_pages_dirty(bio);

	/*
	 * We can only poll for single bio I/Os.
	 */
	if (iov_iter_count(dio->submit.iter))
		dio->iocb->ki_flags &= ~IOCB_HIPRI;
	iomap_dio_submit_bio(iter, dio, bio, pos);
	return ret;

out_put_bio:
	bio_put(bio);
	return ret;
}

static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
{
	const struct iomap *iomap = &iter->iomap;
@@ -322,12 +382,11 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
	const loff_t length = iomap_length(iter);
	loff_t pos = iter->pos;
	blk_opf_t bio_opf = REQ_SYNC | REQ_IDLE;
	struct bio *bio;
	bool need_zeroout = false;
	int nr_pages, ret = 0;
	u64 copied = 0;
	size_t orig_count;
	unsigned int alignment;
	ssize_t ret = 0;

	/*
	 * File systems that write out of place and always allocate new blocks
@@ -452,67 +511,29 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
			goto out;
	}

	nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS);
	do {
		size_t n;
		if (dio->error) {
			iov_iter_revert(dio->submit.iter, copied);
			copied = ret = 0;
		/*
		 * If completions already occurred and reported errors, give up now and
		 * don't bother submitting more bios.
		 */
		if (unlikely(data_race(dio->error)))
			goto out;
		}

		bio = iomap_dio_alloc_bio(iter, dio, nr_pages, bio_opf);
		fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits,
					  GFP_KERNEL);
		bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
		bio->bi_write_hint = inode->i_write_hint;
		bio->bi_ioprio = dio->iocb->ki_ioprio;
		bio->bi_private = dio;
		bio->bi_end_io = iomap_dio_bio_end_io;

		ret = bio_iov_iter_get_pages(bio, dio->submit.iter,
					     alignment - 1);
		if (unlikely(ret)) {
		ret = iomap_dio_bio_iter_one(iter, dio, pos, alignment, bio_opf);
		if (unlikely(ret < 0)) {
			/*
			 * We have to stop part way through an IO. We must fall
			 * through to the sub-block tail zeroing here, otherwise
			 * this short IO may expose stale data in the tail of
			 * the block we haven't written data to.
			 */
			bio_put(bio);
			goto zero_tail;
		}

		n = bio->bi_iter.bi_size;
		if (WARN_ON_ONCE((bio_opf & REQ_ATOMIC) && n != length)) {
			/*
			 * An atomic write bio must cover the complete length,
			 * which it doesn't, so error. We may need to zero out
			 * the tail (complete FS block), similar to when
			 * bio_iov_iter_get_pages() returns an error, above.
			 */
			ret = -EINVAL;
			bio_put(bio);
			goto zero_tail;
			break;
		}
		if (dio->flags & IOMAP_DIO_WRITE)
			task_io_account_write(n);
		else if (dio->flags & IOMAP_DIO_DIRTY)
			bio_set_pages_dirty(bio);

		dio->size += n;
		copied += n;

		nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter,
						 BIO_MAX_VECS);
		/*
		 * We can only poll for single bio I/Os.
		 */
		if (nr_pages)
			dio->iocb->ki_flags &= ~IOCB_HIPRI;
		iomap_dio_submit_bio(iter, dio, bio, pos);
		pos += n;
	} while (nr_pages);
		dio->size += ret;
		copied += ret;
		pos += ret;
		ret = 0;
	} while (iov_iter_count(dio->submit.iter));

	/*
	 * We need to zeroout the tail of a sub-block write if the extent type
@@ -520,7 +541,6 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
	 * the block tail in the latter case, we can expose stale data via mmap
	 * reads of the EOF block.
	 */
zero_tail:
	if (need_zeroout ||
	    ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode))) {
		/* zero out from the end of the write to the end of the block */
@@ -667,7 +687,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
	dio->i_size = i_size_read(inode);
	dio->dops = dops;
	dio->error = 0;
	dio->flags = 0;
	dio->flags = dio_flags & (IOMAP_DIO_FSBLOCK_ALIGNED | IOMAP_DIO_BOUNCE);
	dio->done_before = done_before;

	dio->submit.iter = iter;
@@ -676,15 +696,12 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
	if (iocb->ki_flags & IOCB_NOWAIT)
		iomi.flags |= IOMAP_NOWAIT;

	if (dio_flags & IOMAP_DIO_FSBLOCK_ALIGNED)
		dio->flags |= IOMAP_DIO_FSBLOCK_ALIGNED;

	if (iov_iter_rw(iter) == READ) {
		if (iomi.pos >= dio->i_size)
			goto out_free_dio;

		if (user_backed_iter(iter))
			dio->flags |= IOMAP_DIO_DIRTY;
			dio->flags |= IOMAP_DIO_USER_BACKED;

		ret = kiocb_write_and_wait(iocb, iomi.len);
		if (ret)
Loading