Commit 0ec0d4ec authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge tag 'vfs-6.15-rc1.iomap' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs

Pull vfs iomap updates from Christian Brauner:

 - Allow the filesystem to submit the writeback bios.

    - Allow the filesystem to track completions on a per-bio basis
      instead of for the entire I/O.

    - Change writeback_ops so that ->submit_bio can be done by the
      filesystem.

    - A new ANON_WRITE flag for writes that don't have a block number
      assigned to them at the iomap level, leaving the filesystem to do
      that work in the submission handler.

 - Incremental iterator advance

   The folio_batch support for zero range, where the filesystem provides
   a batch of folios to process that might not be logically contiguous,
   requires more flexibility than the current offset-based iteration
   offers.

   Update all iomap operations to advance the iterator within the
   operation and thus remove the need to advance from the core iomap
   iterator.

 - Make buffered writes work with RWF_DONTCACHE

   If RWF_DONTCACHE is set for a write, mark the folios being written as
   uncached. On writeback completion the pages will be dropped.

 - Introduce infrastructure for large atomic writes

   This will eventually be used by xfs and ext4.

* tag 'vfs-6.15-rc1.iomap' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs: (42 commits)
  iomap: rework IOMAP atomic flags
  iomap: comment on atomic write checks in iomap_dio_bio_iter()
  iomap: inline iomap_dio_bio_opflags()
  iomap: fix inline data on buffered read
  iomap: Lift blocksize restriction on atomic writes
  iomap: Support SW-based atomic writes
  iomap: Rename IOMAP_ATOMIC -> IOMAP_ATOMIC_HW
  xfs: flag as supporting FOP_DONTCACHE
  iomap: make buffered writes work with RWF_DONTCACHE
  iomap: introduce a full map advance helper
  iomap: rename iomap_iter processed field to status
  iomap: remove unnecessary advance from iomap_iter()
  dax: advance the iomap_iter on pte and pmd faults
  dax: advance the iomap_iter on dedupe range
  dax: advance the iomap_iter on unshare range
  dax: advance the iomap_iter on zero range
  dax: push advance down into dax_iomap_iter() for read and write
  dax: advance the iomap_iter in the read/write path
  iomap: convert misc simple ops to incremental advance
  iomap: advance the iter on direct I/O
  ...
parents df00ded2 c84042b3
Loading
Loading
Loading
Loading
+9 −0
Original line number Diff line number Diff line
@@ -246,6 +246,10 @@ The fields are as follows:
   * **IOMAP_F_PRIVATE**: Starting with this value, the upper bits can
     be set by the filesystem for its own purposes.

   * **IOMAP_F_ANON_WRITE**: Indicates that (write) I/O does not have a target
     block assigned to it yet and the file system will do that in the bio
     submission handler, splitting the I/O as needed.

   These flags can be set by iomap itself during file operations.
   The filesystem should supply an ``->iomap_end`` function if it needs
   to observe these flags:
@@ -352,6 +356,11 @@ operations:
   ``IOMAP_NOWAIT`` is often set on behalf of ``IOCB_NOWAIT`` or
   ``RWF_NOWAIT``.

 * ``IOMAP_DONTCACHE`` is set when the caller wishes to perform a
   buffered file I/O and would like the kernel to drop the pagecache
   after the I/O completes, if it isn't already being used by another
   thread.

If it is necessary to read existing file contents from a `different
<https://lore.kernel.org/all/20191008071527.29304-9-hch@lst.de/>`_
device or address range on a device, the filesystem should return that
+29 −13
Original line number Diff line number Diff line
@@ -131,6 +131,8 @@ These ``struct kiocb`` flags are significant for buffered I/O with iomap:

 * ``IOCB_NOWAIT``: Turns on ``IOMAP_NOWAIT``.

 * ``IOCB_DONTCACHE``: Turns on ``IOMAP_DONTCACHE``.

Internal per-Folio State
------------------------

@@ -283,7 +285,7 @@ The ``ops`` structure must be specified and is as follows:
 struct iomap_writeback_ops {
     int (*map_blocks)(struct iomap_writepage_ctx *wpc, struct inode *inode,
                       loff_t offset, unsigned len);
     int (*prepare_ioend)(struct iomap_ioend *ioend, int status);
     int (*submit_ioend)(struct iomap_writepage_ctx *wpc, int status);
     void (*discard_folio)(struct folio *folio, loff_t pos);
 };

@@ -306,13 +308,12 @@ The fields are as follows:
    purpose.
    This function must be supplied by the filesystem.

  - ``prepare_ioend``: Enables filesystems to transform the writeback
    ioend or perform any other preparatory work before the writeback I/O
    is submitted.
  - ``submit_ioend``: Allows the file system to hook into writeback bio
    submission.
    This might include pre-write space accounting updates, or installing
    a custom ``->bi_end_io`` function for internal purposes, such as
    deferring the ioend completion to a workqueue to run metadata update
    transactions from process context.
    transactions from process context before submitting the bio.
    This function is optional.

  - ``discard_folio``: iomap calls this function after ``->map_blocks``
@@ -341,7 +342,7 @@ This can happen in interrupt or process context, depending on the
storage device.

Filesystems that need to update internal bookkeeping (e.g. unwritten
extent conversions) should provide a ``->prepare_ioend`` function to
extent conversions) should provide a ``->submit_ioend`` function to
set ``struct iomap_ioend::bio::bi_end_io`` to its own function.
This function should call ``iomap_finish_ioends`` after finishing its
own work (e.g. unwritten extent conversion).
@@ -515,18 +516,33 @@ IOMAP_WRITE`` with any combination of the following enhancements:

 * ``IOMAP_ATOMIC``: This write is being issued with torn-write
   protection.
   Only a single bio can be created for the write, and the write must
   not be split into multiple I/O requests, i.e. flag REQ_ATOMIC must be
   set.
   Torn-write protection may be provided based on HW-offload or by a
   software mechanism provided by the filesystem.

   For HW-offload based support, only a single bio can be created for the
   write, and the write must not be split into multiple I/O requests, i.e.
   flag REQ_ATOMIC must be set.
   The file range to write must be aligned to satisfy the requirements
   of both the filesystem and the underlying block device's atomic
   commit capabilities.
   If filesystem metadata updates are required (e.g. unwritten extent
   conversion or copy on write), all updates for the entire file range
   conversion or copy-on-write), all updates for the entire file range
   must be committed atomically as well.
   Only one space mapping is allowed per untorn write.
   Untorn writes must be aligned to, and must not be longer than, a
   single file block.
   Untorn-writes may be longer than a single file block. In all cases,
   the mapping start disk block must have at least the same alignment as
   the write offset.
   The filesystem must set IOMAP_F_ATOMIC_BIO to inform the iomap core of
   an untorn-write based on HW-offload.

   For untorn-writes based on a software mechanism provided by the
   filesystem, all the disk block alignment and single bio restrictions
   which apply for HW-offload based untorn-writes do not apply.
   The mechanism would typically be used as a fallback for when
   HW-offload based untorn-writes may not be issued, e.g. the range of the
   write covers multiple extents, meaning that it is not possible to issue
   a single bio.
   All filesystem metadata updates for the entire file range must be
   committed atomically as well.

Callers commonly hold ``i_rwsem`` in shared or exclusive mode before
calling this function.
+61 −50
Original line number Diff line number Diff line
@@ -1258,7 +1258,7 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
}
#endif /* CONFIG_FS_DAX_PMD */

static s64 dax_unshare_iter(struct iomap_iter *iter)
static int dax_unshare_iter(struct iomap_iter *iter)
{
	struct iomap *iomap = &iter->iomap;
	const struct iomap *srcmap = iomap_iter_srcmap(iter);
@@ -1266,11 +1266,11 @@ static s64 dax_unshare_iter(struct iomap_iter *iter)
	u64 copy_len = iomap_length(iter);
	u32 mod;
	int id = 0;
	s64 ret = 0;
	s64 ret;
	void *daddr = NULL, *saddr = NULL;

	if (!iomap_want_unshare_iter(iter))
		return iomap_length(iter);
		return iomap_iter_advance_full(iter);

	/*
	 * Extend the file range to be aligned to fsblock/pagesize, because
@@ -1300,14 +1300,14 @@ static s64 dax_unshare_iter(struct iomap_iter *iter)
	if (ret < 0)
		goto out_unlock;

	if (copy_mc_to_kernel(daddr, saddr, copy_len) == 0)
		ret = iomap_length(iter);
	else
	if (copy_mc_to_kernel(daddr, saddr, copy_len) != 0)
		ret = -EIO;

out_unlock:
	dax_read_unlock(id);
	if (ret < 0)
		return dax_mem2blk_err(ret);
	return iomap_iter_advance_full(iter);
}

int dax_file_unshare(struct inode *inode, loff_t pos, loff_t len,
@@ -1326,7 +1326,7 @@ int dax_file_unshare(struct inode *inode, loff_t pos, loff_t len,

	iter.len = min(len, size - pos);
	while ((ret = iomap_iter(&iter, ops)) > 0)
		iter.processed = dax_unshare_iter(&iter);
		iter.status = dax_unshare_iter(&iter);
	return ret;
}
EXPORT_SYMBOL_GPL(dax_file_unshare);
@@ -1354,17 +1354,16 @@ static int dax_memzero(struct iomap_iter *iter, loff_t pos, size_t size)
	return ret;
}

static s64 dax_zero_iter(struct iomap_iter *iter, bool *did_zero)
static int dax_zero_iter(struct iomap_iter *iter, bool *did_zero)
{
	const struct iomap *iomap = &iter->iomap;
	const struct iomap *srcmap = iomap_iter_srcmap(iter);
	loff_t pos = iter->pos;
	u64 length = iomap_length(iter);
	s64 written = 0;
	int ret;

	/* already zeroed?  we're done. */
	if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
		return length;
		return iomap_iter_advance(iter, &length);

	/*
	 * invalidate the pages whose sharing state is to be changed
@@ -1372,33 +1371,35 @@ static s64 dax_zero_iter(struct iomap_iter *iter, bool *did_zero)
	 */
	if (iomap->flags & IOMAP_F_SHARED)
		invalidate_inode_pages2_range(iter->inode->i_mapping,
					      pos >> PAGE_SHIFT,
					      (pos + length - 1) >> PAGE_SHIFT);
				iter->pos >> PAGE_SHIFT,
				(iter->pos + length - 1) >> PAGE_SHIFT);

	do {
		loff_t pos = iter->pos;
		unsigned offset = offset_in_page(pos);
		unsigned size = min_t(u64, PAGE_SIZE - offset, length);
		pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
		long rc;
		int id;

		length = min_t(u64, PAGE_SIZE - offset, length);

		id = dax_read_lock();
		if (IS_ALIGNED(pos, PAGE_SIZE) && size == PAGE_SIZE)
			rc = dax_zero_page_range(iomap->dax_dev, pgoff, 1);
		if (IS_ALIGNED(pos, PAGE_SIZE) && length == PAGE_SIZE)
			ret = dax_zero_page_range(iomap->dax_dev, pgoff, 1);
		else
			rc = dax_memzero(iter, pos, size);
			ret = dax_memzero(iter, pos, length);
		dax_read_unlock(id);

		if (rc < 0)
			return rc;
		pos += size;
		length -= size;
		written += size;
		if (ret < 0)
			return ret;

		ret = iomap_iter_advance(iter, &length);
		if (ret)
			return ret;
	} while (length > 0);

	if (did_zero)
		*did_zero = true;
	return written;
	return ret;
}

int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
@@ -1413,7 +1414,7 @@ int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
	int ret;

	while ((ret = iomap_iter(&iter, ops)) > 0)
		iter.processed = dax_zero_iter(&iter, did_zero);
		iter.status = dax_zero_iter(&iter, did_zero);
	return ret;
}
EXPORT_SYMBOL_GPL(dax_zero_range);
@@ -1431,8 +1432,7 @@ int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
}
EXPORT_SYMBOL_GPL(dax_truncate_page);

static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
		struct iov_iter *iter)
static int dax_iomap_iter(struct iomap_iter *iomi, struct iov_iter *iter)
{
	const struct iomap *iomap = &iomi->iomap;
	const struct iomap *srcmap = iomap_iter_srcmap(iomi);
@@ -1451,8 +1451,10 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
		if (pos >= end)
			return 0;

		if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
			return iov_iter_zero(min(length, end - pos), iter);
		if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN) {
			done = iov_iter_zero(min(length, end - pos), iter);
			return iomap_iter_advance(iomi, &done);
		}
	}

	/*
@@ -1485,7 +1487,7 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
	}

	id = dax_read_lock();
	while (pos < end) {
	while ((pos = iomi->pos) < end) {
		unsigned offset = pos & (PAGE_SIZE - 1);
		const size_t size = ALIGN(length + offset, PAGE_SIZE);
		pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
@@ -1535,18 +1537,16 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
			xfer = dax_copy_to_iter(dax_dev, pgoff, kaddr,
					map_len, iter);

		pos += xfer;
		length -= xfer;
		done += xfer;

		if (xfer == 0)
		length = xfer;
		ret = iomap_iter_advance(iomi, &length);
		if (!ret && xfer == 0)
			ret = -EFAULT;
		if (xfer < map_len)
			break;
	}
	dax_read_unlock(id);

	return done ? done : ret;
	return ret;
}

/**
@@ -1586,7 +1586,7 @@ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
		iomi.flags |= IOMAP_NOWAIT;

	while ((ret = iomap_iter(&iomi, ops)) > 0)
		iomi.processed = dax_iomap_iter(&iomi, iter);
		iomi.status = dax_iomap_iter(&iomi, iter);

	done = iomi.pos - iocb->ki_pos;
	iocb->ki_pos = iomi.pos;
@@ -1757,7 +1757,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,

	while ((error = iomap_iter(&iter, ops)) > 0) {
		if (WARN_ON_ONCE(iomap_length(&iter) < PAGE_SIZE)) {
			iter.processed = -EIO;	/* fs corruption? */
			iter.status = -EIO;	/* fs corruption? */
			continue;
		}

@@ -1769,8 +1769,10 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
			ret |= VM_FAULT_MAJOR;
		}

		if (!(ret & VM_FAULT_ERROR))
			iter.processed = PAGE_SIZE;
		if (!(ret & VM_FAULT_ERROR)) {
			u64 length = PAGE_SIZE;
			iter.status = iomap_iter_advance(&iter, &length);
		}
	}

	if (iomap_errp)
@@ -1883,8 +1885,10 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
			continue; /* actually breaks out of the loop */

		ret = dax_fault_iter(vmf, &iter, pfnp, &xas, &entry, true);
		if (ret != VM_FAULT_FALLBACK)
			iter.processed = PMD_SIZE;
		if (ret != VM_FAULT_FALLBACK) {
			u64 length = PMD_SIZE;
			iter.status = iomap_iter_advance(&iter, &length);
		}
	}

unlock_entry:
@@ -1999,12 +2003,13 @@ vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, unsigned int order,
}
EXPORT_SYMBOL_GPL(dax_finish_sync_fault);

static loff_t dax_range_compare_iter(struct iomap_iter *it_src,
static int dax_range_compare_iter(struct iomap_iter *it_src,
		struct iomap_iter *it_dest, u64 len, bool *same)
{
	const struct iomap *smap = &it_src->iomap;
	const struct iomap *dmap = &it_dest->iomap;
	loff_t pos1 = it_src->pos, pos2 = it_dest->pos;
	u64 dest_len;
	void *saddr, *daddr;
	int id, ret;

@@ -2012,7 +2017,7 @@ static loff_t dax_range_compare_iter(struct iomap_iter *it_src,

	if (smap->type == IOMAP_HOLE && dmap->type == IOMAP_HOLE) {
		*same = true;
		return len;
		goto advance;
	}

	if (smap->type == IOMAP_HOLE || dmap->type == IOMAP_HOLE) {
@@ -2035,7 +2040,13 @@ static loff_t dax_range_compare_iter(struct iomap_iter *it_src,
	if (!*same)
		len = 0;
	dax_read_unlock(id);
	return len;

advance:
	dest_len = len;
	ret = iomap_iter_advance(it_src, &len);
	if (!ret)
		ret = iomap_iter_advance(it_dest, &dest_len);
	return ret;

out_unlock:
	dax_read_unlock(id);
@@ -2058,15 +2069,15 @@ int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
		.len		= len,
		.flags		= IOMAP_DAX,
	};
	int ret, compared = 0;
	int ret, status;

	while ((ret = iomap_iter(&src_iter, ops)) > 0 &&
	       (ret = iomap_iter(&dst_iter, ops)) > 0) {
		compared = dax_range_compare_iter(&src_iter, &dst_iter,
		status = dax_range_compare_iter(&src_iter, &dst_iter,
				min(src_iter.len, dst_iter.len), same);
		if (compared < 0)
		if (status < 0)
			return ret;
		src_iter.processed = dst_iter.processed = compared;
		src_iter.status = dst_iter.status = status;
	}
	return ret;
}
+4 −0
Original line number Diff line number Diff line
@@ -3290,6 +3290,10 @@ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
	if (map->m_flags & EXT4_MAP_NEW)
		iomap->flags |= IOMAP_F_NEW;

	/* HW-offload atomics are always used */
	if (flags & IOMAP_ATOMIC)
		iomap->flags |= IOMAP_F_ATOMIC_BIO;

	if (flags & IOMAP_DAX)
		iomap->dax_dev = EXT4_SB(inode->i_sb)->s_daxdev;
	else
+2 −1
Original line number Diff line number Diff line
@@ -1300,7 +1300,8 @@ static int gfs2_block_zero_range(struct inode *inode, loff_t from,
				 unsigned int length)
{
	BUG_ON(current->journal_info);
	return iomap_zero_range(inode, from, length, NULL, &gfs2_iomap_ops);
	return iomap_zero_range(inode, from, length, NULL, &gfs2_iomap_ops,
			NULL);
}

#define GFS2_JTRUNC_REVOKES 8192
Loading