Commit 568570fd authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge tag 'xfs-6.12-fixes-4' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux

Pull xfs fixes from Carlos Maiolino:

 - Fix integer overflow in xrep_bmap

 - Fix stale dealloc punching for COW IO

* tag 'xfs-6.12-fixes-4' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux:
  xfs: punch delalloc extents from the COW fork for COW writes
  xfs: set IOMAP_F_SHARED for all COW fork allocations
  xfs: share more code in xfs_buffered_write_iomap_begin
  xfs: support the COW fork in xfs_bmap_punch_delalloc_range
  xfs: IOMAP_ZERO and IOMAP_UNSHARE already hold invalidate_lock
  xfs: take XFS_MMAPLOCK_EXCL xfs_file_write_zero_eof
  xfs: factor out a xfs_file_write_zero_eof helper
  iomap: move locking out of iomap_write_delalloc_release
  iomap: remove iomap_file_buffered_write_punch_delalloc
  iomap: factor out a iomap_last_written_block helper
  xfs: fix integer overflow in xrep_bmap
parents 5e9ab267 f6f91d29
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -208,7 +208,7 @@ The filesystem must arrange to `cancel
such `reservations
<https://lore.kernel.org/linux-xfs/20220817093627.GZ3600936@dread.disaster.area/>`_
because writeback will not consume the reservation.
The ``iomap_file_buffered_write_punch_delalloc`` can be called from a
The ``iomap_write_delalloc_release`` can be called from a
``->iomap_end`` function to find all the clean areas of the folios
caching a fresh (``IOMAP_F_NEW``) delalloc mapping.
It takes the ``invalidate_lock``.
+36 −75
Original line number Diff line number Diff line
@@ -1145,10 +1145,36 @@ static void iomap_write_delalloc_scan(struct inode *inode,
}

/*
 * When a short write occurs, the filesystem might need to use ->iomap_end
 * to remove space reservations created in ->iomap_begin.
 *
 * For filesystems that use delayed allocation, there can be dirty pages over
 * the delalloc extent outside the range of a short write but still within the
 * delalloc extent allocated for this iomap if the write raced with page
 * faults.
 *
 * Punch out all the delalloc blocks in the range given except for those that
 * have dirty data still pending in the page cache - those are going to be
 * written and so must still retain the delalloc backing for writeback.
 *
 * The punch() callback *must* only punch delalloc extents in the range passed
 * to it. It must skip over all other types of extents in the range and leave
 * them completely unchanged. It must do this punch atomically with respect to
 * other extent modifications.
 *
 * The punch() callback may be called with a folio locked to prevent writeback
 * extent allocation racing at the edge of the range we are currently punching.
 * The locked folio may or may not cover the range being punched, so it is not
 * safe for the punch() callback to lock folios itself.
 *
 * Lock order is:
 *
 * inode->i_rwsem (shared or exclusive)
 *   inode->i_mapping->invalidate_lock (exclusive)
 *     folio_lock()
 *       ->punch
 *         internal filesystem allocation lock
 *
 * As we are scanning the page cache for data, we don't need to reimplement the
 * wheel - mapping_seek_hole_data() does exactly what we need to identify the
 * start and end of data ranges correctly even for sub-folio block sizes. This
@@ -1177,7 +1203,7 @@ static void iomap_write_delalloc_scan(struct inode *inode,
 * require sprinkling this code with magic "+ 1" and "- 1" arithmetic and expose
 * the code to subtle off-by-one bugs....
 */
static void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte,
void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte,
		loff_t end_byte, unsigned flags, struct iomap *iomap,
		iomap_punch_t punch)
{
@@ -1185,12 +1211,13 @@ static void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte,
	loff_t scan_end_byte = min(i_size_read(inode), end_byte);

	/*
	 * Lock the mapping to avoid races with page faults re-instantiating
	 * folios and dirtying them via ->page_mkwrite whilst we walk the
	 * cache and perform delalloc extent removal. Failing to do this can
	 * leave dirty pages with no space reservation in the cache.
	 * The caller must hold invalidate_lock to avoid races with page faults
	 * re-instantiating folios and dirtying them via ->page_mkwrite whilst
	 * we walk the cache and perform delalloc extent removal.  Failing to do
	 * this can leave dirty pages with no space reservation in the cache.
	 */
	filemap_invalidate_lock(inode->i_mapping);
	lockdep_assert_held_write(&inode->i_mapping->invalidate_lock);

	while (start_byte < scan_end_byte) {
		loff_t		data_end;

@@ -1207,7 +1234,7 @@ static void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte,
		if (start_byte == -ENXIO || start_byte == scan_end_byte)
			break;
		if (WARN_ON_ONCE(start_byte < 0))
			goto out_unlock;
			return;
		WARN_ON_ONCE(start_byte < punch_start_byte);
		WARN_ON_ONCE(start_byte > scan_end_byte);

@@ -1218,7 +1245,7 @@ static void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte,
		data_end = mapping_seek_hole_data(inode->i_mapping, start_byte,
				scan_end_byte, SEEK_HOLE);
		if (WARN_ON_ONCE(data_end < 0))
			goto out_unlock;
			return;

		/*
		 * If we race with post-direct I/O invalidation of the page cache,
@@ -1240,74 +1267,8 @@ static void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte,
	if (punch_start_byte < end_byte)
		punch(inode, punch_start_byte, end_byte - punch_start_byte,
				iomap);
out_unlock:
	filemap_invalidate_unlock(inode->i_mapping);
}

/*
 * When a short write occurs, the filesystem may need to remove reserved space
 * that was allocated in ->iomap_begin from it's ->iomap_end method. For
 * filesystems that use delayed allocation, we need to punch out delalloc
 * extents from the range that are not dirty in the page cache. As the write can
 * race with page faults, there can be dirty pages over the delalloc extent
 * outside the range of a short write but still within the delalloc extent
 * allocated for this iomap.
 *
 * This function uses [start_byte, end_byte) intervals (i.e. open ended) to
 * simplify range iterations.
 *
 * The punch() callback *must* only punch delalloc extents in the range passed
 * to it. It must skip over all other types of extents in the range and leave
 * them completely unchanged. It must do this punch atomically with respect to
 * other extent modifications.
 *
 * The punch() callback may be called with a folio locked to prevent writeback
 * extent allocation racing at the edge of the range we are currently punching.
 * The locked folio may or may not cover the range being punched, so it is not
 * safe for the punch() callback to lock folios itself.
 *
 * Lock order is:
 *
 * inode->i_rwsem (shared or exclusive)
 *   inode->i_mapping->invalidate_lock (exclusive)
 *     folio_lock()
 *       ->punch
 *         internal filesystem allocation lock
 */
void iomap_file_buffered_write_punch_delalloc(struct inode *inode,
		loff_t pos, loff_t length, ssize_t written, unsigned flags,
		struct iomap *iomap, iomap_punch_t punch)
{
	loff_t			start_byte;
	loff_t			end_byte;
	unsigned int		blocksize = i_blocksize(inode);

	if (iomap->type != IOMAP_DELALLOC)
		return;

	/* If we didn't reserve the blocks, we're not allowed to punch them. */
	if (!(iomap->flags & IOMAP_F_NEW))
		return;

	/*
	 * start_byte refers to the first unused block after a short write. If
	 * nothing was written, round offset down to point at the first block in
	 * the range.
	 */
	if (unlikely(!written))
		start_byte = round_down(pos, blocksize);
	else
		start_byte = round_up(pos + written, blocksize);
	end_byte = round_up(pos + length, blocksize);

	/* Nothing to do if we've written the entire delalloc extent */
	if (start_byte >= end_byte)
		return;

	iomap_write_delalloc_release(inode, start_byte, end_byte, flags, iomap,
			punch);
}
EXPORT_SYMBOL_GPL(iomap_file_buffered_write_punch_delalloc);
EXPORT_SYMBOL_GPL(iomap_write_delalloc_release);

static loff_t iomap_unshare_iter(struct iomap_iter *iter)
{
+1 −1
Original line number Diff line number Diff line
@@ -801,7 +801,7 @@ xrep_bmap(
{
	struct xrep_bmap	*rb;
	char			*descr;
	unsigned int		max_bmbt_recs;
	xfs_extnum_t		max_bmbt_recs;
	bool			large_extcount;
	int			error = 0;

+2 −2
Original line number Diff line number Diff line
@@ -116,7 +116,7 @@ xfs_end_ioend(
	if (unlikely(error)) {
		if (ioend->io_flags & IOMAP_F_SHARED) {
			xfs_reflink_cancel_cow_range(ip, offset, size, true);
			xfs_bmap_punch_delalloc_range(ip, offset,
			xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, offset,
					offset + size);
		}
		goto done;
@@ -456,7 +456,7 @@ xfs_discard_folio(
	 * byte of the next folio. Hence the end offset is only dependent on the
	 * folio itself and not the start offset that is passed in.
	 */
	xfs_bmap_punch_delalloc_range(ip, pos,
	xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, pos,
				folio_pos(folio) + folio_size(folio));
}

+7 −3
Original line number Diff line number Diff line
@@ -442,11 +442,12 @@ xfs_getbmap(
void
xfs_bmap_punch_delalloc_range(
	struct xfs_inode	*ip,
	int			whichfork,
	xfs_off_t		start_byte,
	xfs_off_t		end_byte)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_ifork	*ifp = &ip->i_df;
	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
	xfs_fileoff_t		start_fsb = XFS_B_TO_FSBT(mp, start_byte);
	xfs_fileoff_t		end_fsb = XFS_B_TO_FSB(mp, end_byte);
	struct xfs_bmbt_irec	got, del;
@@ -474,11 +475,14 @@ xfs_bmap_punch_delalloc_range(
			continue;
		}

		xfs_bmap_del_extent_delay(ip, XFS_DATA_FORK, &icur, &got, &del);
		xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got, &del);
		if (!xfs_iext_get_extent(ifp, &icur, &got))
			break;
	}

	if (whichfork == XFS_COW_FORK && !ifp->if_bytes)
		xfs_inode_clear_cowblocks_tag(ip);

out_unlock:
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
}
@@ -580,7 +584,7 @@ xfs_free_eofblocks(
	 */
	if (ip->i_diflags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) {
		if (ip->i_delayed_blks) {
			xfs_bmap_punch_delalloc_range(ip,
			xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK,
				round_up(XFS_ISIZE(ip), mp->m_sb.sb_blocksize),
				LLONG_MAX);
		}
Loading