Unverified Commit 560507cb authored by Christian Brauner
Browse files

Merge patch series "iomap: zero range folio batch support"

Brian Foster <bfoster@redhat.com> says:

This adds folio batch support for iomap. This initially only targets
zero range, the use case being zeroing of dirty folios over unwritten
mappings. There is potential to support other operations in the future:
iomap seek data/hole has similar raciness issues as zero range, the
prospect of using this for buffered write has been raised for granular
locking purposes, etc.

The one major caveat with this zero range implementation is that it
doesn't look at iomap_folio_state to determine whether to zero a
sub-folio portion of the folio. Instead it just relies on whether the
folio was dirty or not. This means that spurious zeroing of unwritten
ranges is possible if a folio is dirty but the target range includes a
subrange that is not.

The reasoning is that this is essentially a complexity tradeoff. The
current use cases for iomap_zero_range() are limited mostly to partial
block zeroing scenarios. It's relatively harmless to zero an unwritten
block (i.e. not a correctness issue), and this is something that
filesystems have done in the past without much notice or issue. The
advantage is less code and this makes it a little easier to use a
filemap lookup function for the batch rather than open coding more logic
in iomap. That said, this can probably be enhanced to look at ifs in the
future if the use case expands and/or other operations justify it.

WRT testing, I've tested with and without a local hack to redirect
fallocate zero range calls to iomap_zero_range() in XFS. This helps test
beyond the partial block/folio use case, i.e. to cover boundary
conditions like full folio batch handling, etc. I recently added patch 7
in the spirit of that, which turns this logic into an XFS errortag. Further
comments on that are inline with patch 7.

* patches from https://lore.kernel.org/20251003134642.604736-1-bfoster@redhat.com:
  xfs: error tag to force zeroing on debug kernels
  iomap: remove old partial eof zeroing optimization
  xfs: fill dirty folios on zero range of unwritten mappings
  xfs: always trim mapping to requested range for zero range
  iomap: optional zero range dirty folio processing
  iomap: remove pos+len BUG_ON() to after folio lookup
  filemap: add helper to look up dirty folios in a range

Signed-off-by: Christian Brauner <brauner@kernel.org>
parents 4966b466 66d78a11
Loading
Loading
Loading
Loading
+85 −33
Original line number Diff line number Diff line
@@ -772,6 +772,28 @@ static struct folio *__iomap_get_folio(struct iomap_iter *iter,
	if (!mapping_large_folio_support(iter->inode->i_mapping))
		len = min_t(size_t, len, PAGE_SIZE - offset_in_page(pos));

	if (iter->fbatch) {
		struct folio *folio = folio_batch_next(iter->fbatch);

		if (!folio)
			return NULL;

		/*
		 * The folio mapping generally shouldn't have changed based on
		 * fs locks, but be consistent with filemap lookup and retry
		 * the iter if it does.
		 */
		folio_lock(folio);
		if (unlikely(folio->mapping != iter->inode->i_mapping)) {
			iter->iomap.flags |= IOMAP_F_STALE;
			folio_unlock(folio);
			return NULL;
		}

		folio_get(folio);
		return folio;
	}

	if (write_ops && write_ops->get_folio)
		return write_ops->get_folio(iter, pos, len);
	return iomap_get_folio(iter, pos, len);
@@ -826,15 +848,14 @@ static int iomap_write_begin(struct iomap_iter *iter,
		size_t *poffset, u64 *plen)
{
	const struct iomap *srcmap = iomap_iter_srcmap(iter);
	loff_t pos = iter->pos;
	loff_t pos;
	u64 len = min_t(u64, SIZE_MAX, iomap_length(iter));
	struct folio *folio;
	int status = 0;

	len = min_not_zero(len, *plen);
	BUG_ON(pos + len > iter->iomap.offset + iter->iomap.length);
	if (srcmap != &iter->iomap)
		BUG_ON(pos + len > srcmap->offset + srcmap->length);
	*foliop = NULL;
	*plen = 0;

	if (fatal_signal_pending(current))
		return -EINTR;
@@ -843,6 +864,15 @@ static int iomap_write_begin(struct iomap_iter *iter,
	if (IS_ERR(folio))
		return PTR_ERR(folio);

	/*
	 * No folio means we're done with a batch. We still have range to
	 * process so return and let the caller iterate and refill the batch.
	 */
	if (!folio) {
		WARN_ON_ONCE(!iter->fbatch);
		return 0;
	}

	/*
	 * Now we have a locked folio, before we do anything with it we need to
	 * check that the iomap we have cached is not stale. The inode extent
@@ -863,6 +893,22 @@ static int iomap_write_begin(struct iomap_iter *iter,
		}
	}

	/*
	 * The folios in a batch may not be contiguous. If we've skipped
	 * forward, advance the iter to the pos of the current folio. If the
	 * folio starts beyond the end of the mapping, it may have been trimmed
	 * since the lookup for whatever reason. Return a NULL folio to
	 * terminate the op.
	 */
	if (folio_pos(folio) > iter->pos) {
		len = min_t(u64, folio_pos(folio) - iter->pos,
				 iomap_length(iter));
		status = iomap_iter_advance(iter, len);
		len = iomap_length(iter);
		if (status || !len)
			goto out_unlock;
	}

	pos = iomap_trim_folio_range(iter, folio, poffset, &len);

	if (srcmap->type == IOMAP_INLINE)
@@ -1409,6 +1455,12 @@ static int iomap_zero_iter(struct iomap_iter *iter, bool *did_zero,
		if (iter->iomap.flags & IOMAP_F_STALE)
			break;

		/* a NULL folio means we're done with a folio batch */
		if (!folio) {
			status = iomap_iter_advance_full(iter);
			break;
		}

		/* warn about zeroing folios beyond eof that won't write back */
		WARN_ON_ONCE(folio_pos(folio) > iter->inode->i_size);

@@ -1433,6 +1485,26 @@ static int iomap_zero_iter(struct iomap_iter *iter, bool *did_zero,
	return status;
}

/*
 * Allocate a folio batch for @iter and populate it with dirty pagecache
 * folios looked up over the byte range [@offset, @offset + @length).
 *
 * @iter:	iterator that takes ownership of the batch via iter->fbatch
 * @offset:	starting byte offset of the lookup
 * @length:	length of the lookup range in bytes
 *
 * The batch is heap allocated here; iomap_iter_reset_iomap() releases the
 * folios and frees the batch when the iterator's mapping is reset.
 *
 * Returns the byte position corresponding to the page index cursor left
 * behind by filemap_get_folios_dirty(), which callers use to trim the mapping
 * to the range the batch actually covers. If the batch cannot be allocated,
 * iter->fbatch is left NULL and @offset + @length is returned so the caller
 * sees the whole range as covered and no batch is attached.
 */
loff_t
iomap_fill_dirty_folios(
	struct iomap_iter	*iter,
	loff_t			offset,
	loff_t			length)
{
	struct address_space	*mapping = iter->inode->i_mapping;
	pgoff_t			start = offset >> PAGE_SHIFT;
	pgoff_t			end = (offset + length - 1) >> PAGE_SHIFT;

	iter->fbatch = kmalloc(sizeof(*iter->fbatch), GFP_KERNEL);
	if (!iter->fbatch)
		return offset + length;
	folio_batch_init(iter->fbatch);

	filemap_get_folios_dirty(mapping, &start, end, iter->fbatch);

	/*
	 * pgoff_t is an unsigned long, which is narrower than loff_t on 32-bit
	 * builds. Widen before shifting so the byte position is not truncated
	 * for page indexes at or beyond 4GiB >> PAGE_SHIFT.
	 */
	return (loff_t)start << PAGE_SHIFT;
}
EXPORT_SYMBOL_GPL(iomap_fill_dirty_folios);

int
iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
		const struct iomap_ops *ops,
@@ -1446,46 +1518,26 @@ iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
		.private	= private,
	};
	struct address_space *mapping = inode->i_mapping;
	unsigned int blocksize = i_blocksize(inode);
	unsigned int off = pos & (blocksize - 1);
	loff_t plen = min_t(loff_t, len, blocksize - off);
	int ret;
	bool range_dirty;

	/*
	 * Zero range can skip mappings that are zero on disk so long as
	 * pagecache is clean. If pagecache was dirty prior to zero range, the
	 * mapping converts on writeback completion and so must be zeroed.
	 *
	 * The simplest way to deal with this across a range is to flush
	 * pagecache and process the updated mappings. To avoid excessive
	 * flushing on partial eof zeroing, special case it to zero the
	 * unaligned start portion if already dirty in pagecache.
	 */
	if (off &&
	    filemap_range_needs_writeback(mapping, pos, pos + plen - 1)) {
		iter.len = plen;
		while ((ret = iomap_iter(&iter, ops)) > 0)
			iter.status = iomap_zero_iter(&iter, did_zero,
					write_ops);

		iter.len = len - (iter.pos - pos);
		if (ret || !iter.len)
			return ret;
	}

	/*
	 * To avoid an unconditional flush, check pagecache state and only flush
	 * if dirty and the fs returns a mapping that might convert on
	 * writeback.
	 */
	range_dirty = filemap_range_needs_writeback(inode->i_mapping,
					iter.pos, iter.pos + iter.len - 1);
	range_dirty = filemap_range_needs_writeback(mapping, iter.pos,
					iter.pos + iter.len - 1);
	while ((ret = iomap_iter(&iter, ops)) > 0) {
		const struct iomap *srcmap = iomap_iter_srcmap(&iter);

		if (srcmap->type == IOMAP_HOLE ||
		    srcmap->type == IOMAP_UNWRITTEN) {
		if (WARN_ON_ONCE(iter.fbatch &&
				 srcmap->type != IOMAP_UNWRITTEN))
			return -EIO;

		if (!iter.fbatch &&
		    (srcmap->type == IOMAP_HOLE ||
		     srcmap->type == IOMAP_UNWRITTEN)) {
			s64 status;

			if (range_dirty) {
+6 −0
Original line number Diff line number Diff line
@@ -8,6 +8,12 @@

static inline void iomap_iter_reset_iomap(struct iomap_iter *iter)
{
	if (iter->fbatch) {
		folio_batch_release(iter->fbatch);
		kfree(iter->fbatch);
		iter->fbatch = NULL;
	}

	iter->status = 0;
	memset(&iter->iomap, 0, sizeof(iter->iomap));
	memset(&iter->srcmap, 0, sizeof(iter->srcmap));
+4 −2
Original line number Diff line number Diff line
@@ -73,7 +73,8 @@
#define XFS_ERRTAG_WRITE_DELAY_MS			43
#define XFS_ERRTAG_EXCHMAPS_FINISH_ONE			44
#define XFS_ERRTAG_METAFILE_RESV_CRITICAL		45
#define XFS_ERRTAG_MAX					46
#define XFS_ERRTAG_FORCE_ZERO_RANGE			46
#define XFS_ERRTAG_MAX					47

/*
 * Random factors for above tags, 1 means always, 2 means 1/2 time, etc.
@@ -133,7 +134,8 @@ XFS_ERRTAG(ATTR_LEAF_TO_NODE, attr_leaf_to_node, 1) \
XFS_ERRTAG(WB_DELAY_MS,		wb_delay_ms,		3000) \
XFS_ERRTAG(WRITE_DELAY_MS,	write_delay_ms,		3000) \
XFS_ERRTAG(EXCHMAPS_FINISH_ONE,	exchmaps_finish_one,	1) \
XFS_ERRTAG(METAFILE_RESV_CRITICAL, metafile_resv_crit,	4)
XFS_ERRTAG(METAFILE_RESV_CRITICAL, metafile_resv_crit,	4) \
XFS_ERRTAG(FORCE_ZERO_RANGE,	force_zero_range,	4)
#endif /* XFS_ERRTAG */

#endif /* __XFS_ERRORTAG_H_ */
+22 −7
Original line number Diff line number Diff line
@@ -27,6 +27,8 @@
#include "xfs_file.h"
#include "xfs_aops.h"
#include "xfs_zone_alloc.h"
#include "xfs_error.h"
#include "xfs_errortag.h"

#include <linux/dax.h>
#include <linux/falloc.h>
@@ -1254,23 +1256,36 @@ xfs_falloc_zero_range(
	struct xfs_zone_alloc_ctx *ac)
{
	struct inode		*inode = file_inode(file);
	struct xfs_inode	*ip = XFS_I(inode);
	unsigned int		blksize = i_blocksize(inode);
	loff_t			new_size = 0;
	int			error;

	trace_xfs_zero_file_space(XFS_I(inode));
	trace_xfs_zero_file_space(ip);

	error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
	if (error)
		return error;

	error = xfs_free_file_space(XFS_I(inode), offset, len, ac);
	/*
	 * Zero range implements a full zeroing mechanism but is only used in
	 * limited situations. It is more efficient to allocate unwritten
	 * extents than to perform zeroing here, so use an errortag to randomly
	 * force zeroing on DEBUG kernels for added test coverage.
	 */
	if (XFS_TEST_ERROR(ip->i_mount,
			   XFS_ERRTAG_FORCE_ZERO_RANGE)) {
		error = xfs_zero_range(ip, offset, len, ac, NULL);
	} else {
		error = xfs_free_file_space(ip, offset, len, ac);
		if (error)
			return error;

	len = round_up(offset + len, blksize) - round_down(offset, blksize);
		len = round_up(offset + len, blksize) -
			round_down(offset, blksize);
		offset = round_down(offset, blksize);
	error = xfs_alloc_file_space(XFS_I(inode), offset, len);
		error = xfs_alloc_file_space(ip, offset, len);
	}
	if (error)
		return error;
	return xfs_falloc_setsize(file, new_size);
+30 −8
Original line number Diff line number Diff line
@@ -1702,6 +1702,8 @@ xfs_buffered_write_iomap_begin(
	struct iomap		*iomap,
	struct iomap		*srcmap)
{
	struct iomap_iter	*iter = container_of(iomap, struct iomap_iter,
						     iomap);
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
@@ -1767,21 +1769,41 @@ xfs_buffered_write_iomap_begin(
	}

	/*
	 * For zeroing, trim a delalloc extent that extends beyond the EOF
	 * block.  If it starts beyond the EOF block, convert it to an
	 * For zeroing, trim extents that extend beyond the EOF block. If a
	 * delalloc extent starts beyond the EOF block, convert it to an
	 * unwritten extent.
	 */
	if ((flags & IOMAP_ZERO) && imap.br_startoff <= offset_fsb &&
	    isnullstartblock(imap.br_startblock)) {
	if (flags & IOMAP_ZERO) {
		xfs_fileoff_t eof_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip));
		u64 end;

		if (offset_fsb >= eof_fsb)
		if (isnullstartblock(imap.br_startblock) &&
		    offset_fsb >= eof_fsb)
			goto convert_delay;
		if (end_fsb > eof_fsb) {
		if (offset_fsb < eof_fsb && end_fsb > eof_fsb)
			end_fsb = eof_fsb;
			xfs_trim_extent(&imap, offset_fsb,
					end_fsb - offset_fsb);

		/*
		 * Look up dirty folios for unwritten mappings within EOF.
		 * Providing this bypasses the flush iomap uses to trigger
		 * extent conversion when unwritten mappings have dirty
		 * pagecache in need of zeroing.
		 *
		 * Trim the mapping to the end pos of the lookup, which in turn
		 * was trimmed to the end of the batch if it became full before
		 * the end of the mapping.
		 */
		if (imap.br_state == XFS_EXT_UNWRITTEN &&
		    offset_fsb < eof_fsb) {
			loff_t len = min(count,
					 XFS_FSB_TO_B(mp, imap.br_blockcount));

			end = iomap_fill_dirty_folios(iter, offset, len);
			end_fsb = min_t(xfs_fileoff_t, end_fsb,
					XFS_B_TO_FSB(mp, end));
		}

		xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb);
	}

	/*
Loading