Commit 19ca0d8a authored by Linus Torvalds
Browse files
Pull btrfs fix from David Sterba:
 "A fix for fast fsync that needs to handle errors during writes after
  some COW failure so it does not lead to an inconsistent state"

* tag 'for-6.10-rc2-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
  btrfs: ensure fast fsync waits for ordered extents after a write failure
parents e20b269d f13e01b8
Loading
Loading
Loading
Loading
+10 −0
Original line number Diff line number Diff line
@@ -89,6 +89,16 @@ enum {
	BTRFS_INODE_FREE_SPACE_INODE,
	/* Set when there are no capabilities in XATTRs for the inode. */
	BTRFS_INODE_NO_CAP_XATTR,
	/*
	 * Set if an error happened when doing a COW write before submitting a
	 * bio or during writeback. Used for both buffered writes and direct IO
	 * writes. This is to signal a fast fsync that it has to wait for
	 * ordered extents to complete and therefore not log extent maps that
	 * point to unwritten extents (when an ordered extent completes and it
	 * has the BTRFS_ORDERED_IOERR flag set, it drops extent maps in its
	 * range).
	 */
	BTRFS_INODE_COW_WRITE_ERROR,
};

/* in memory btrfs inode */
+16 −0
Original line number Diff line number Diff line
@@ -1885,6 +1885,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
	 */
	if (full_sync || btrfs_is_zoned(fs_info)) {
		ret = btrfs_wait_ordered_range(inode, start, len);
		clear_bit(BTRFS_INODE_COW_WRITE_ERROR, &BTRFS_I(inode)->runtime_flags);
	} else {
		/*
		 * Get our ordered extents as soon as possible to avoid doing
@@ -1894,6 +1895,21 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
		btrfs_get_ordered_extents_for_logging(BTRFS_I(inode),
						      &ctx.ordered_extents);
		ret = filemap_fdatawait_range(inode->i_mapping, start, end);
		if (ret)
			goto out_release_extents;

		/*
		 * Check and clear the BTRFS_INODE_COW_WRITE_ERROR now after
		 * starting and waiting for writeback, because for buffered IO
		 * it may have been set during the end IO callback
		 * (end_bbio_data_write() -> btrfs_finish_ordered_extent()) in
		 * case an error happened and we need to wait for ordered
		 * extents to complete so that any extent maps that point to
		 * unwritten locations are dropped and we don't log them.
		 */
		if (test_and_clear_bit(BTRFS_INODE_COW_WRITE_ERROR,
				       &BTRFS_I(inode)->runtime_flags))
			ret = btrfs_wait_ordered_range(inode, start, len);
	}

	if (ret)
+31 −0
Original line number Diff line number Diff line
@@ -388,6 +388,37 @@ bool btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
	ret = can_finish_ordered_extent(ordered, page, file_offset, len, uptodate);
	spin_unlock_irqrestore(&inode->ordered_tree_lock, flags);

	/*
	 * If this is a COW write it means we created new extent maps for the
	 * range and they point to unwritten locations if we got an error either
	 * before submitting a bio or during IO.
	 *
	 * We have marked the ordered extent with BTRFS_ORDERED_IOERR, and we
	 * are queuing its completion below. During completion, at
	 * btrfs_finish_one_ordered(), we will drop the extent maps for the
	 * unwritten extents.
	 *
	 * However because completion runs in a work queue we can end up having
	 * a fast fsync running before that. In the case of direct IO, once we
	 * unlock the inode the fsync might start, and we queue the completion
	 * before unlocking the inode. In the case of buffered IO when writeback
	 * finishes (end_bbio_data_write()) we queue the completion, so if the
	 * writeback was triggered by a fast fsync, the fsync might start
	 * logging before ordered extent completion runs in the work queue.
	 *
	 * The fast fsync will log file extent items based on the extent maps it
	 * finds, so if by the time it collects extent maps the ordered extent
	 * completion didn't happen yet, it will log file extent items that
	 * point to unwritten extents, resulting in a corruption if a crash
	 * happens and the log tree is replayed. Note that a fast fsync does not
	 * wait for completion of ordered extents in order to reduce latency.
	 *
	 * Set a flag in the inode so that the next fast fsync will wait for
	 * ordered extents to complete before starting to log.
	 */
	if (!uptodate && !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
		set_bit(BTRFS_INODE_COW_WRITE_ERROR, &inode->runtime_flags);

	if (ret)
		btrfs_queue_ordered_fn(ordered);
	return ret;