Unverified Commit 2f368b5f authored by Christian Brauner's avatar Christian Brauner
Browse files

Merge patch series "refactor the iomap writeback code v5"

Christoph Hellwig <hch@lst.de> says:

This is an alternative approach to the writeback part of the
"fuse: use iomap for buffered writes + writeback" series from Joanne.

The big difference compared to Joanne's version is that I hope the
split between the generic and ioend/bio based writeback code is a bit
cleaner here.  We have two methods that define the split between the
generic writeback code, and the implementation of it, and all knowledge
of ioends and bios now sits below that layer.

This version passes testing on xfs, and gets as far as mainline for
gfs2 (crashes in generic/361).

* patches from https://lore.kernel.org/20250710133343.399917-1-hch@lst.de:
  iomap: build the writeback code without CONFIG_BLOCK
  iomap: add read_folio_range() handler for buffered writes
  iomap: improve argument passing to iomap_read_folio_sync
  iomap: replace iomap_folio_ops with iomap_write_ops
  iomap: export iomap_writeback_folio
  iomap: move folio_unlock out of iomap_writeback_folio
  iomap: rename iomap_writepage_map to iomap_writeback_folio
  iomap: move all ioend handling to ioend.c
  iomap: add public helpers for uptodate state manipulation
  iomap: hide ioends from the generic writeback code
  iomap: refactor the writeback interface
  iomap: cleanup the pending writeback tracking in iomap_writepage_map_blocks
  iomap: pass more arguments using the iomap writeback context
  iomap: header diet

Link: https://lore.kernel.org/20250710133343.399917-1-hch@lst.de


Signed-off-by: Christian Brauner <brauner@kernel.org>
parents 19272b37 5699b7e2
Loading
Loading
Loading
Loading
+0 −3
Original line number Diff line number Diff line
@@ -167,7 +167,6 @@ structure below:
     struct dax_device   *dax_dev;
     void                *inline_data;
     void                *private;
     const struct iomap_folio_ops *folio_ops;
     u64                 validity_cookie;
 };

@@ -292,8 +291,6 @@ The fields are as follows:
   <https://lore.kernel.org/all/20180619164137.13720-7-hch@lst.de/>`_.
   This value will be passed unchanged to ``->iomap_end``.

 * ``folio_ops`` will be covered in the section on pagecache operations.

 * ``validity_cookie`` is a magic freshness value set by the filesystem
   that should be used to detect stale mappings.
   For pagecache operations this is critical for correct operation
+28 −29
Original line number Diff line number Diff line
@@ -57,21 +57,19 @@ The following address space operations can be wrapped easily:
 * ``bmap``
 * ``swap_activate``

``struct iomap_folio_ops``
``struct iomap_write_ops``
--------------------------

The ``->iomap_begin`` function for pagecache operations may set the
``struct iomap::folio_ops`` field to an ops structure to override
default behaviors of iomap:

.. code-block:: c

 struct iomap_folio_ops {
 struct iomap_write_ops {
     struct folio *(*get_folio)(struct iomap_iter *iter, loff_t pos,
                                unsigned len);
     void (*put_folio)(struct inode *inode, loff_t pos, unsigned copied,
                       struct folio *folio);
     bool (*iomap_valid)(struct inode *inode, const struct iomap *iomap);
     int (*read_folio_range)(const struct iomap_iter *iter,
     			struct folio *folio, loff_t pos, size_t len);
 };

iomap calls these functions:
@@ -127,6 +125,10 @@ iomap calls these functions:
    ``->iomap_valid``, then the iomap should be considered stale and the
    validation failed.

  - ``read_folio_range``: Called to synchronously read in the range that will
    be written to. If this function is not provided, iomap will default to
    submitting a bio read request.

These ``struct kiocb`` flags are significant for buffered I/O with iomap:

 * ``IOCB_NOWAIT``: Turns on ``IOMAP_NOWAIT``.
@@ -271,7 +273,7 @@ writeback.
It does not lock ``i_rwsem`` or ``invalidate_lock``.

The dirty bit will be cleared for all folios run through the
``->map_blocks`` machinery described below even if the writeback fails.
``->writeback_range`` machinery described below even if the writeback fails.
This is to prevent dirty folio clots when storage devices fail; an
``-EIO`` is recorded for userspace to collect via ``fsync``.

@@ -283,15 +285,14 @@ The ``ops`` structure must be specified and is as follows:
.. code-block:: c

 struct iomap_writeback_ops {
     int (*map_blocks)(struct iomap_writepage_ctx *wpc, struct inode *inode,
                       loff_t offset, unsigned len);
     int (*submit_ioend)(struct iomap_writepage_ctx *wpc, int status);
     void (*discard_folio)(struct folio *folio, loff_t pos);
    int (*writeback_range)(struct iomap_writepage_ctx *wpc,
        struct folio *folio, u64 pos, unsigned int len, u64 end_pos);
    int (*writeback_submit)(struct iomap_writepage_ctx *wpc, int error);
 };

The fields are as follows:

  - ``map_blocks``: Sets ``wpc->iomap`` to the space mapping of the file
  - ``writeback_range``: Sets ``wpc->iomap`` to the space mapping of the file
    range (in bytes) given by ``pos`` and ``len``.
    iomap calls this function for each dirty fs block in each dirty folio,
    though it will `reuse mappings
@@ -306,27 +307,26 @@ The fields are as follows:
    This revalidation must be open-coded by the filesystem; it is
    unclear if ``iomap::validity_cookie`` can be reused for this
    purpose.
    This function must be supplied by the filesystem.

  - ``submit_ioend``: Allows the file systems to hook into writeback bio
    submission.
    This might include pre-write space accounting updates, or installing
    a custom ``->bi_end_io`` function for internal purposes, such as
    deferring the ioend completion to a workqueue to run metadata update
    transactions from process context before submitting the bio.
    This function is optional.

  - ``discard_folio``: iomap calls this function after ``->map_blocks``
    fails to schedule I/O for any part of a dirty folio.
    The function should throw away any reservations that may have been
    made for the write.
    If this method fails to schedule I/O for any part of a dirty folio, it
    should throw away any reservations that may have been made for the write.
    The folio will be marked clean and an ``-EIO`` recorded in the
    pagecache.
    Filesystems can use this callback to `remove
    <https://lore.kernel.org/all/20201029163313.1766967-1-bfoster@redhat.com/>`_
    delalloc reservations to avoid having delalloc reservations for
    clean pagecache.
    This function is optional.
    This function must be supplied by the filesystem.

  - ``writeback_submit``: Submit the previously built writeback context.
    Block based file systems should use the ``iomap_ioend_writeback_submit``
    helper, other file systems can implement their own.
    File systems can optionally hook into writeback bio submission.
    This might include pre-write space accounting updates, or installing
    a custom ``->bi_end_io`` function for internal purposes, such as
    deferring the ioend completion to a workqueue to run metadata update
    transactions from process context before submitting the bio.
    This function must be supplied by the filesystem.

Pagecache Writeback Completion
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -340,10 +340,9 @@ If the write failed, it will also set the error bits on the folios and
the address space.
This can happen in interrupt or process context, depending on the
storage device.

Filesystems that need to update internal bookkeeping (e.g. unwritten
extent conversions) should provide a ``->submit_ioend`` function to
set ``struct iomap_end::bio::bi_end_io`` to its own function.
extent conversions) should set their own ``bi_end_io`` on the bios
submitted by ``->writeback_submit``.
This function should call ``iomap_finish_ioends`` after finishing its
own work (e.g. unwritten extent conversion).

+25 −12
Original line number Diff line number Diff line
@@ -537,30 +537,42 @@ static void blkdev_readahead(struct readahead_control *rac)
	iomap_readahead(rac, &blkdev_iomap_ops);
}

static int blkdev_map_blocks(struct iomap_writepage_ctx *wpc,
		struct inode *inode, loff_t offset, unsigned int len)
static ssize_t blkdev_writeback_range(struct iomap_writepage_ctx *wpc,
		struct folio *folio, u64 offset, unsigned int len, u64 end_pos)
{
	loff_t isize = i_size_read(inode);
	loff_t isize = i_size_read(wpc->inode);

	if (WARN_ON_ONCE(offset >= isize))
		return -EIO;
	if (offset >= wpc->iomap.offset &&
	    offset < wpc->iomap.offset + wpc->iomap.length)
		return 0;
	return blkdev_iomap_begin(inode, offset, isize - offset,

	if (offset < wpc->iomap.offset ||
	    offset >= wpc->iomap.offset + wpc->iomap.length) {
		int error;

		error = blkdev_iomap_begin(wpc->inode, offset, isize - offset,
				IOMAP_WRITE, &wpc->iomap, NULL);
		if (error)
			return error;
	}

	return iomap_add_to_ioend(wpc, folio, offset, end_pos, len);
}

static const struct iomap_writeback_ops blkdev_writeback_ops = {
	.map_blocks		= blkdev_map_blocks,
	.writeback_range	= blkdev_writeback_range,
	.writeback_submit	= iomap_ioend_writeback_submit,
};

static int blkdev_writepages(struct address_space *mapping,
		struct writeback_control *wbc)
{
	struct iomap_writepage_ctx wpc = { };
	struct iomap_writepage_ctx wpc = {
		.inode		= mapping->host,
		.wbc		= wbc,
		.ops		= &blkdev_writeback_ops
	};

	return iomap_writepages(mapping, wbc, &wpc, &blkdev_writeback_ops);
	return iomap_writepages(&wpc);
}

const struct address_space_operations def_blk_aops = {
@@ -711,7 +723,8 @@ blkdev_direct_write(struct kiocb *iocb, struct iov_iter *from)

static ssize_t blkdev_buffered_write(struct kiocb *iocb, struct iov_iter *from)
{
	return iomap_file_buffered_write(iocb, from, &blkdev_iomap_ops, NULL);
	return iomap_file_buffered_write(iocb, from, &blkdev_iomap_ops, NULL,
			NULL);
}

/*
+6 −2
Original line number Diff line number Diff line
@@ -159,7 +159,11 @@ static int gfs2_writepages(struct address_space *mapping,
			   struct writeback_control *wbc)
{
	struct gfs2_sbd *sdp = gfs2_mapping2sbd(mapping);
	struct iomap_writepage_ctx wpc = { };
	struct iomap_writepage_ctx wpc = {
		.inode		= mapping->host,
		.wbc		= wbc,
		.ops		= &gfs2_writeback_ops,
	};
	int ret;

	/*
@@ -168,7 +172,7 @@ static int gfs2_writepages(struct address_space *mapping,
	 * want balance_dirty_pages() to loop indefinitely trying to write out
	 * pages held in the ail that it can't find.
	 */
	ret = iomap_writepages(mapping, wbc, &wpc, &gfs2_writeback_ops);
	ret = iomap_writepages(&wpc);
	if (ret == 0 && wbc->nr_to_write > 0)
		set_bit(SDF_FORCE_AIL_FLUSH, &sdp->sd_flags);
	return ret;
+27 −21
Original line number Diff line number Diff line
@@ -963,12 +963,16 @@ static struct folio *
gfs2_iomap_get_folio(struct iomap_iter *iter, loff_t pos, unsigned len)
{
	struct inode *inode = iter->inode;
	struct gfs2_inode *ip = GFS2_I(inode);
	unsigned int blockmask = i_blocksize(inode) - 1;
	struct gfs2_sbd *sdp = GFS2_SB(inode);
	unsigned int blocks;
	struct folio *folio;
	int status;

	if (!gfs2_is_jdata(ip) && !gfs2_is_stuffed(ip))
		return iomap_get_folio(iter, pos, len);

	blocks = ((pos & blockmask) + len + blockmask) >> inode->i_blkbits;
	status = gfs2_trans_begin(sdp, RES_DINODE + blocks, 0);
	if (status)
@@ -987,7 +991,7 @@ static void gfs2_iomap_put_folio(struct inode *inode, loff_t pos,
	struct gfs2_inode *ip = GFS2_I(inode);
	struct gfs2_sbd *sdp = GFS2_SB(inode);

	if (!gfs2_is_stuffed(ip))
	if (gfs2_is_jdata(ip) && !gfs2_is_stuffed(ip))
		gfs2_trans_add_databufs(ip->i_gl, folio,
					offset_in_folio(folio, pos),
					copied);
@@ -995,13 +999,14 @@ static void gfs2_iomap_put_folio(struct inode *inode, loff_t pos,
	folio_unlock(folio);
	folio_put(folio);

	if (gfs2_is_jdata(ip) || gfs2_is_stuffed(ip)) {
		if (tr->tr_num_buf_new)
			__mark_inode_dirty(inode, I_DIRTY_DATASYNC);

		gfs2_trans_end(sdp);
	}
}

static const struct iomap_folio_ops gfs2_iomap_folio_ops = {
const struct iomap_write_ops gfs2_iomap_write_ops = {
	.get_folio = gfs2_iomap_get_folio,
	.put_folio = gfs2_iomap_put_folio,
};
@@ -1078,8 +1083,6 @@ static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos,
		gfs2_trans_end(sdp);
	}

	if (gfs2_is_stuffed(ip) || gfs2_is_jdata(ip))
		iomap->folio_ops = &gfs2_iomap_folio_ops;
	return 0;

out_trans_end:
@@ -1304,7 +1307,7 @@ static int gfs2_block_zero_range(struct inode *inode, loff_t from, loff_t length
		return 0;
	length = min(length, inode->i_size - from);
	return iomap_zero_range(inode, from, length, NULL, &gfs2_iomap_ops,
			NULL);
			&gfs2_iomap_write_ops, NULL);
}

#define GFS2_JTRUNC_REVOKES 8192
@@ -2469,23 +2472,26 @@ int __gfs2_punch_hole(struct file *file, loff_t offset, loff_t length)
	return error;
}

static int gfs2_map_blocks(struct iomap_writepage_ctx *wpc, struct inode *inode,
		loff_t offset, unsigned int len)
static ssize_t gfs2_writeback_range(struct iomap_writepage_ctx *wpc,
		struct folio *folio, u64 offset, unsigned int len, u64 end_pos)
{
	int ret;

	if (WARN_ON_ONCE(gfs2_is_stuffed(GFS2_I(inode))))
	if (WARN_ON_ONCE(gfs2_is_stuffed(GFS2_I(wpc->inode))))
		return -EIO;

	if (offset >= wpc->iomap.offset &&
	    offset < wpc->iomap.offset + wpc->iomap.length)
		return 0;
	if (offset < wpc->iomap.offset ||
	    offset >= wpc->iomap.offset + wpc->iomap.length) {
		int ret;

		memset(&wpc->iomap, 0, sizeof(wpc->iomap));
	ret = gfs2_iomap_get(inode, offset, INT_MAX, &wpc->iomap);
		ret = gfs2_iomap_get(wpc->inode, offset, INT_MAX, &wpc->iomap);
		if (ret)
			return ret;
	}

	return iomap_add_to_ioend(wpc, folio, offset, end_pos, len);
}

const struct iomap_writeback_ops gfs2_writeback_ops = {
	.map_blocks		= gfs2_map_blocks,
	.writeback_range	= gfs2_writeback_range,
	.writeback_submit	= iomap_ioend_writeback_submit,
};
Loading