Unverified Commit f8789733 authored by Christian Brauner
Browse files

Merge patch series "iomap: allow the file system to submit the writeback bios"

Christoph Hellwig <hch@lst.de> says:

This series contains the iomap prep work to support zoned XFS.

The biggest changes are:

 - an option to reuse the ioend code for direct writes in addition to the
   current use for buffered writeback, which allows the file system to
   track completions on a per-bio basis instead of the current end_io
   callback which operates on the entire I/O.
   Note that it might make sense to split the ioend code from
   buffered-io.c into its own file with this.  Let me know what you think
   of that and I can include it in the next version.
 - change of the writeback_ops so that the submit_bio call can be done by
   the file system.  Note that btrfs will also need this eventually when
   it starts using iomap
 - helpers to split ioends according to the zone append queue_limits,
   which plug into the previous item.
 - a new ANON_WRITE flag for writes that don't have a block number
   assigned to them at the iomap level, leaving the file system to do
   that work in the submission handler.  Note that btrfs wants something
   similar also for compressed I/O, which should be able to reuse this,
   maybe with minor tweaks.
 - passing private data to a few more helpers

The XFS changes to use this will be posted to the xfs list only to not
spam fsdevel too much.

* patches from https://lore.kernel.org/r/20250206064035.2323428-2-hch@lst.de:
  iomap: pass private data to iomap_truncate_page
  iomap: pass private data to iomap_zero_range
  iomap: pass private data to iomap_page_mkwrite
  iomap: add a io_private field to struct iomap_ioend
  iomap: optionally use ioends for direct I/O
  iomap: factor out a iomap_dio_done helper
  iomap: move common ioend code to ioend.c
  iomap: split bios to zone append limits in the submission handlers
  iomap: add a IOMAP_F_ANON_WRITE flag
  iomap: simplify io_flags and io_type in struct iomap_ioend
  iomap: allow the file system to submit the writeback bios

Link: https://lore.kernel.org/r/20250206064035.2323428-2-hch@lst.de


Signed-off-by: Christian Brauner <brauner@kernel.org>
parents 2014c95a ddd402bb
Loading
Loading
Loading
Loading
+4 −0
Original line number Diff line number Diff line
@@ -246,6 +246,10 @@ The fields are as follows:
   * **IOMAP_F_PRIVATE**: Starting with this value, the upper bits can
     be set by the filesystem for its own purposes.

   * **IOMAP_F_ANON_WRITE**: Indicates that (write) I/O does not have a target
     block assigned to it yet and the file system will do that in the bio
     submission handler, splitting the I/O as needed.

   These flags can be set by iomap itself during file operations.
   The filesystem should supply an ``->iomap_end`` function if it needs
   to observe these flags:
+5 −6
Original line number Diff line number Diff line
@@ -283,7 +283,7 @@ The ``ops`` structure must be specified and is as follows:
 struct iomap_writeback_ops {
     int (*map_blocks)(struct iomap_writepage_ctx *wpc, struct inode *inode,
                       loff_t offset, unsigned len);
     int (*prepare_ioend)(struct iomap_ioend *ioend, int status);
     int (*submit_ioend)(struct iomap_writepage_ctx *wpc, int status);
     void (*discard_folio)(struct folio *folio, loff_t pos);
 };

@@ -306,13 +306,12 @@ The fields are as follows:
    purpose.
    This function must be supplied by the filesystem.

  - ``prepare_ioend``: Enables filesystems to transform the writeback
    ioend or perform any other preparatory work before the writeback I/O
    is submitted.
  - ``submit_ioend``: Allows the file system to hook into writeback bio
    submission.
    This might include pre-write space accounting updates, or installing
    a custom ``->bi_end_io`` function for internal purposes, such as
    deferring the ioend completion to a workqueue to run metadata update
    transactions from process context.
    transactions from process context before submitting the bio.
    This function is optional.

  - ``discard_folio``: iomap calls this function after ``->map_blocks``
@@ -341,7 +340,7 @@ This can happen in interrupt or process context, depending on the
storage device.

Filesystems that need to update internal bookkeeping (e.g. unwritten
extent conversions) should provide a ``->prepare_ioend`` function to
extent conversions) should provide a ``->submit_ioend`` function to
set ``struct iomap_ioend::io_bio::bi_end_io`` to its own function.
This function should call ``iomap_finish_ioends`` after finishing its
own work (e.g. unwritten extent conversion).
+2 −1
Original line number Diff line number Diff line
@@ -1300,7 +1300,8 @@ static int gfs2_block_zero_range(struct inode *inode, loff_t from,
				 unsigned int length)
{
	BUG_ON(current->journal_info);
	return iomap_zero_range(inode, from, length, NULL, &gfs2_iomap_ops);
	return iomap_zero_range(inode, from, length, NULL, &gfs2_iomap_ops,
			NULL);
}

#define GFS2_JTRUNC_REVOKES 8192
+1 −0
Original line number Diff line number Diff line
@@ -12,6 +12,7 @@ iomap-y += trace.o \
				   iter.o
iomap-$(CONFIG_BLOCK)		+= buffered-io.o \
				   direct-io.o \
				   ioend.o \
				   fiemap.o \
				   seek.o
iomap-$(CONFIG_SWAP)		+= swapfile.o
+47 −155
Original line number Diff line number Diff line
@@ -12,17 +12,15 @@
#include <linux/buffer_head.h>
#include <linux/dax.h>
#include <linux/writeback.h>
#include <linux/list_sort.h>
#include <linux/swap.h>
#include <linux/bio.h>
#include <linux/sched/signal.h>
#include <linux/migrate.h>
#include "internal.h"
#include "trace.h"

#include "../internal.h"

#define IOEND_BATCH_SIZE	4096

/*
 * Structure allocated for each folio to track per-block uptodate, dirty state
 * and I/O completions.
@@ -40,8 +38,6 @@ struct iomap_folio_state {
	unsigned long		state[];
};

static struct bio_set iomap_ioend_bioset;

static inline bool ifs_is_fully_uptodate(struct folio *folio,
		struct iomap_folio_state *ifs)
{
@@ -1395,13 +1391,14 @@ static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)

int
iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
		const struct iomap_ops *ops)
		const struct iomap_ops *ops, void *private)
{
	struct iomap_iter iter = {
		.inode		= inode,
		.pos		= pos,
		.len		= len,
		.flags		= IOMAP_ZERO,
		.private	= private,
	};
	struct address_space *mapping = inode->i_mapping;
	unsigned int blocksize = i_blocksize(inode);
@@ -1461,7 +1458,7 @@ EXPORT_SYMBOL_GPL(iomap_zero_range);

int
iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
		const struct iomap_ops *ops)
		const struct iomap_ops *ops, void *private)
{
	unsigned int blocksize = i_blocksize(inode);
	unsigned int off = pos & (blocksize - 1);
@@ -1469,7 +1466,8 @@ iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
	/* Block boundary? Nothing to do */
	if (!off)
		return 0;
	return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops);
	return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops,
			private);
}
EXPORT_SYMBOL_GPL(iomap_truncate_page);

@@ -1493,11 +1491,13 @@ static loff_t iomap_folio_mkwrite_iter(struct iomap_iter *iter,
	return length;
}

vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops)
vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops,
		void *private)
{
	struct iomap_iter iter = {
		.inode		= file_inode(vmf->vma->vm_file),
		.flags		= IOMAP_WRITE | IOMAP_FAULT,
		.private	= private,
	};
	struct folio *folio = page_folio(vmf->page);
	ssize_t ret;
@@ -1538,16 +1538,15 @@ static void iomap_finish_folio_write(struct inode *inode, struct folio *folio,
 * state, release holds on bios, and finally free up memory.  Do not use the
 * ioend after this.
 */
static u32
iomap_finish_ioend(struct iomap_ioend *ioend, int error)
u32 iomap_finish_ioend_buffered(struct iomap_ioend *ioend)
{
	struct inode *inode = ioend->io_inode;
	struct bio *bio = &ioend->io_bio;
	struct folio_iter fi;
	u32 folio_count = 0;

	if (error) {
		mapping_set_error(inode->i_mapping, error);
	if (ioend->io_error) {
		mapping_set_error(inode->i_mapping, ioend->io_error);
		if (!bio_flagged(bio, BIO_QUIET)) {
			pr_err_ratelimited(
"%s: writeback error on inode %lu, offset %lld, sector %llu",
@@ -1566,116 +1565,16 @@ iomap_finish_ioend(struct iomap_ioend *ioend, int error)
	return folio_count;
}

/*
 * Ioend completion routine for merged bios. This can only be called from task
 * contexts as merged ioends can be of unbound length. Hence we have to break up
 * the writeback completions into manageable chunks to avoid long scheduler
 * holdoffs. We aim to keep scheduler holdoffs down below 10ms so that we get
 * good batch processing throughput without creating adverse scheduler latency
 * conditions.
 */
void
iomap_finish_ioends(struct iomap_ioend *ioend, int error)
{
	struct list_head tmp;
	u32 completions;

	might_sleep();

	list_replace_init(&ioend->io_list, &tmp);
	completions = iomap_finish_ioend(ioend, error);

	while (!list_empty(&tmp)) {
		if (completions > IOEND_BATCH_SIZE * 8) {
			cond_resched();
			completions = 0;
		}
		ioend = list_first_entry(&tmp, struct iomap_ioend, io_list);
		list_del_init(&ioend->io_list);
		completions += iomap_finish_ioend(ioend, error);
	}
}
EXPORT_SYMBOL_GPL(iomap_finish_ioends);

/*
 * Decide whether @next may be merged into @ioend.  Two ioends can only be
 * merged when they are adjacent and have the same set of work to do.
 *
 * Returns true if the merge is allowed, false otherwise.
 */
static bool
iomap_ioend_can_merge(struct iomap_ioend *ioend, struct iomap_ioend *next)
{
	bool cur_shared = ioend->io_flags & IOMAP_F_SHARED;
	bool next_shared = next->io_flags & IOMAP_F_SHARED;
	bool cur_unwritten = ioend->io_type == IOMAP_UNWRITTEN;
	bool next_unwritten = next->io_type == IOMAP_UNWRITTEN;

	/* Completion status, boundary, shared and unwritten state must agree. */
	if (ioend->io_bio.bi_status != next->io_bio.bi_status ||
	    (next->io_flags & IOMAP_F_BOUNDARY) ||
	    cur_shared != next_shared ||
	    cur_unwritten != next_unwritten)
		return false;

	/* @next has to start exactly where @ioend ends in the file. */
	if (ioend->io_offset + ioend->io_size != next->io_offset)
		return false;

	/*
	 * Physically discontiguous ioends are never merged.  The filesystem
	 * completion functions would have to iterate the physical
	 * discontiguities even after a merge at the logical level, so nothing
	 * is gained by merging across them.
	 *
	 * bio->bi_iter.bi_sector cannot be used for this check: it is modified
	 * during submission and no longer points to the start sector of the
	 * bio at completion.
	 */
	return ioend->io_sector + (ioend->io_size >> 9) == next->io_sector;
}

/*
 * Pull ioends off the front of @more_ioends and chain them onto @ioend for as
 * long as iomap_ioend_can_merge() accepts the next candidate.  The size of
 * each merged ioend is added to @ioend->io_size.  Merging stops at the first
 * candidate that cannot be merged or when @more_ioends is empty.
 */
void
iomap_ioend_try_merge(struct iomap_ioend *ioend, struct list_head *more_ioends)
{
	struct iomap_ioend *candidate;

	INIT_LIST_HEAD(&ioend->io_list);

	for (;;) {
		candidate = list_first_entry_or_null(more_ioends,
				struct iomap_ioend, io_list);
		if (!candidate || !iomap_ioend_can_merge(ioend, candidate))
			break;

		list_move_tail(&candidate->io_list, &ioend->io_list);
		ioend->io_size += candidate->io_size;
	}
}
EXPORT_SYMBOL_GPL(iomap_ioend_try_merge);

static int
iomap_ioend_compare(void *priv, const struct list_head *a,
		const struct list_head *b)
{
	struct iomap_ioend *ia = container_of(a, struct iomap_ioend, io_list);
	struct iomap_ioend *ib = container_of(b, struct iomap_ioend, io_list);

	if (ia->io_offset < ib->io_offset)
		return -1;
	if (ia->io_offset > ib->io_offset)
		return 1;
	return 0;
}

/*
 * Sort @ioend_list in place with list_sort(), using iomap_ioend_compare()
 * (which compares io_offset) as the ordering function.
 */
void iomap_sort_ioends(struct list_head *ioend_list)
{
	list_sort(NULL, ioend_list, iomap_ioend_compare);
}
EXPORT_SYMBOL_GPL(iomap_sort_ioends);

static void iomap_writepage_end_bio(struct bio *bio)
{
	iomap_finish_ioend(iomap_ioend_from_bio(bio),
			blk_status_to_errno(bio->bi_status));
	struct iomap_ioend *ioend = iomap_ioend_from_bio(bio);

	ioend->io_error = blk_status_to_errno(bio->bi_status);
	iomap_finish_ioend_buffered(ioend);
}

/*
 * Submit the final bio for an ioend.
 * Submit an ioend.
 *
 * If @error is non-zero, it means that we have a situation where some part of
 * the submission process has failed after we've marked pages for writeback.
@@ -1694,14 +1593,18 @@ static int iomap_submit_ioend(struct iomap_writepage_ctx *wpc, int error)
	 * failure happened so that the file system end I/O handler gets called
	 * to clean up.
	 */
	if (wpc->ops->prepare_ioend)
		error = wpc->ops->prepare_ioend(wpc->ioend, error);
	if (wpc->ops->submit_ioend) {
		error = wpc->ops->submit_ioend(wpc, error);
	} else {
		if (WARN_ON_ONCE(wpc->iomap.flags & IOMAP_F_ANON_WRITE))
			error = -EIO;
		if (!error)
			submit_bio(&wpc->ioend->io_bio);
	}

	if (error) {
		wpc->ioend->io_bio.bi_status = errno_to_blk_status(error);
		bio_endio(&wpc->ioend->io_bio);
	} else {
		submit_bio(&wpc->ioend->io_bio);
	}

	wpc->ioend = NULL;
@@ -1709,9 +1612,9 @@ static int iomap_submit_ioend(struct iomap_writepage_ctx *wpc, int error)
}

static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc,
		struct writeback_control *wbc, struct inode *inode, loff_t pos)
		struct writeback_control *wbc, struct inode *inode, loff_t pos,
		u16 ioend_flags)
{
	struct iomap_ioend *ioend;
	struct bio *bio;

	bio = bio_alloc_bioset(wpc->iomap.bdev, BIO_MAX_VECS,
@@ -1719,36 +1622,24 @@ static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc,
			       GFP_NOFS, &iomap_ioend_bioset);
	bio->bi_iter.bi_sector = iomap_sector(&wpc->iomap, pos);
	bio->bi_end_io = iomap_writepage_end_bio;
	wbc_init_bio(wbc, bio);
	bio->bi_write_hint = inode->i_write_hint;

	ioend = iomap_ioend_from_bio(bio);
	INIT_LIST_HEAD(&ioend->io_list);
	ioend->io_type = wpc->iomap.type;
	ioend->io_flags = wpc->iomap.flags;
	if (pos > wpc->iomap.offset)
		wpc->iomap.flags &= ~IOMAP_F_BOUNDARY;
	ioend->io_inode = inode;
	ioend->io_size = 0;
	ioend->io_offset = pos;
	ioend->io_sector = bio->bi_iter.bi_sector;

	wbc_init_bio(wbc, bio);
	wpc->nr_folios = 0;
	return ioend;
	return iomap_init_ioend(inode, bio, pos, ioend_flags);
}

static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos)
static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos,
		u16 ioend_flags)
{
	if (wpc->iomap.offset == pos && (wpc->iomap.flags & IOMAP_F_BOUNDARY))
		return false;
	if ((wpc->iomap.flags & IOMAP_F_SHARED) !=
	    (wpc->ioend->io_flags & IOMAP_F_SHARED))
	if (ioend_flags & IOMAP_IOEND_BOUNDARY)
		return false;
	if (wpc->iomap.type != wpc->ioend->io_type)
	if ((ioend_flags & IOMAP_IOEND_NOMERGE_FLAGS) !=
	    (wpc->ioend->io_flags & IOMAP_IOEND_NOMERGE_FLAGS))
		return false;
	if (pos != wpc->ioend->io_offset + wpc->ioend->io_size)
		return false;
	if (iomap_sector(&wpc->iomap, pos) !=
	if (!(wpc->iomap.flags & IOMAP_F_ANON_WRITE) &&
	    iomap_sector(&wpc->iomap, pos) !=
	    bio_end_sector(&wpc->ioend->io_bio))
		return false;
	/*
@@ -1779,14 +1670,23 @@ static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc,
{
	struct iomap_folio_state *ifs = folio->private;
	size_t poff = offset_in_folio(folio, pos);
	unsigned int ioend_flags = 0;
	int error;

	if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos)) {
	if (wpc->iomap.type == IOMAP_UNWRITTEN)
		ioend_flags |= IOMAP_IOEND_UNWRITTEN;
	if (wpc->iomap.flags & IOMAP_F_SHARED)
		ioend_flags |= IOMAP_IOEND_SHARED;
	if (pos == wpc->iomap.offset && (wpc->iomap.flags & IOMAP_F_BOUNDARY))
		ioend_flags |= IOMAP_IOEND_BOUNDARY;

	if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos, ioend_flags)) {
new_ioend:
		error = iomap_submit_ioend(wpc, 0);
		if (error)
			return error;
		wpc->ioend = iomap_alloc_ioend(wpc, wbc, inode, pos);
		wpc->ioend = iomap_alloc_ioend(wpc, wbc, inode, pos,
				ioend_flags);
	}

	if (!bio_add_folio(&wpc->ioend->io_bio, folio, len, poff))
@@ -2062,11 +1962,3 @@ iomap_writepages(struct address_space *mapping, struct writeback_control *wbc,
	return iomap_submit_ioend(wpc, error);
}
EXPORT_SYMBOL_GPL(iomap_writepages);

/*
 * Initialise iomap_ioend_bioset at fs_initcall time.  The offset of io_bio
 * within struct iomap_ioend is passed as the front padding argument of
 * bioset_init().  Returns the result of bioset_init().
 */
static int __init iomap_buffered_init(void)
{
	const unsigned int pool_size = 4 * (PAGE_SIZE / SECTOR_SIZE);
	const unsigned int front_pad = offsetof(struct iomap_ioend, io_bio);

	return bioset_init(&iomap_ioend_bioset, pool_size, front_pad,
			   BIOSET_NEED_BVECS);
}
fs_initcall(iomap_buffered_init);
Loading