Commit 5d6f0e98 authored by Filipe Manana's avatar Filipe Manana Committed by David Sterba
Browse files

btrfs: stop locking the source extent range during reflink



Nowadays before starting a reflink operation we do this:

1) Take the VFS lock of the inodes in exclusive mode (a rw semaphore);

2) Take the  mmap lock of the inodes (struct btrfs_inode::i_mmap_lock);

3) Flush all delalloc in the source and target ranges;

4) Wait for all ordered extents in the source and target ranges to
   complete;

5) Lock the source and destination ranges in the inodes' io trees.

In step 5 we lock the source range because:

1) We needed to serialize against mmap writes, but that is not needed
   anymore because nowadays we do that through the inode's i_mmap_lock
   (step 2). This happens since commit 8c99516a ("btrfs: exclude mmaps
   while doing remap");

2) To serialize against a concurrent relocation and avoid generating
   a delayed ref for an extent that was just dropped by relocation, see
   commit d8b55242 ("Btrfs: fix race between reflink/dedupe and
   relocation").

Locking the source range however blocks any concurrent reads for that
range and makes test case generic/733 fail.

So instead of locking the source range during reflinks, make relocation
read lock the inode's i_mmap_lock, so that it serializes with a concurrent
reflink while still able to run concurrently with mmap writes and allow
concurrent reads too.

Reviewed-by: default avatarBoris Burkov <boris@bur.io>
Signed-off-by: default avatarFilipe Manana <fdmanana@suse.com>
Signed-off-by: default avatarDavid Sterba <dsterba@suse.com>
parent 4a43d735
Loading
Loading
Loading
Loading
+17 −37
Original line number Diff line number Diff line
@@ -616,35 +616,6 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
	return ret;
}

static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1,
				       struct inode *inode2, u64 loff2, u64 len)
{
	unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1, NULL);
	unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1, NULL);
}

static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
				     struct inode *inode2, u64 loff2, u64 len)
{
	u64 range1_end = loff1 + len - 1;
	u64 range2_end = loff2 + len - 1;

	if (inode1 < inode2) {
		swap(inode1, inode2);
		swap(loff1, loff2);
		swap(range1_end, range2_end);
	} else if (inode1 == inode2 && loff2 < loff1) {
		swap(loff1, loff2);
		swap(range1_end, range2_end);
	}

	lock_extent(&BTRFS_I(inode1)->io_tree, loff1, range1_end, NULL);
	lock_extent(&BTRFS_I(inode2)->io_tree, loff2, range2_end, NULL);

	btrfs_assert_inode_range_clean(BTRFS_I(inode1), loff1, range1_end);
	btrfs_assert_inode_range_clean(BTRFS_I(inode2), loff2, range2_end);
}

static void btrfs_double_mmap_lock(struct inode *inode1, struct inode *inode2)
{
	if (inode1 < inode2)
@@ -662,17 +633,21 @@ static void btrfs_double_mmap_unlock(struct inode *inode1, struct inode *inode2)
static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len,
				   struct inode *dst, u64 dst_loff)
{
	const u64 end = dst_loff + len - 1;
	struct extent_state *cached_state = NULL;
	struct btrfs_fs_info *fs_info = BTRFS_I(src)->root->fs_info;
	const u64 bs = fs_info->sectorsize;
	int ret;

	/*
	 * Lock destination range to serialize with concurrent readahead() and
	 * source range to serialize with relocation.
	 * Lock destination range to serialize with concurrent readahead(), and
	 * we are safe from concurrency with relocation of source extents
	 * because we have already locked the inode's i_mmap_lock in exclusive
	 * mode.
	 */
	btrfs_double_extent_lock(src, loff, dst, dst_loff, len);
	lock_extent(&BTRFS_I(dst)->io_tree, dst_loff, end, &cached_state);
	ret = btrfs_clone(src, dst, loff, len, ALIGN(len, bs), dst_loff, 1);
	btrfs_double_extent_unlock(src, loff, dst, dst_loff, len);
	unlock_extent(&BTRFS_I(dst)->io_tree, dst_loff, end, &cached_state);

	btrfs_btree_balance_dirty(fs_info);

@@ -724,6 +699,7 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
					u64 off, u64 olen, u64 destoff)
{
	struct extent_state *cached_state = NULL;
	struct inode *inode = file_inode(file);
	struct inode *src = file_inode(file_src);
	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
@@ -731,6 +707,7 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
	int wb_ret;
	u64 len = olen;
	u64 bs = fs_info->sectorsize;
	u64 end;

	/*
	 * VFS's generic_remap_file_range_prep() protects us from cloning the
@@ -763,12 +740,15 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
	}

	/*
	 * Lock destination range to serialize with concurrent readahead() and
	 * source range to serialize with relocation.
	 * Lock destination range to serialize with concurrent readahead(), and
	 * we are safe from concurrency with relocation of source extents
	 * because we have already locked the inode's i_mmap_lock in exclusive
	 * mode.
	 */
	btrfs_double_extent_lock(src, off, inode, destoff, len);
	end = destoff + len - 1;
	lock_extent(&BTRFS_I(inode)->io_tree, destoff, end, &cached_state);
	ret = btrfs_clone(src, inode, off, olen, len, destoff, 0);
	btrfs_double_extent_unlock(src, off, inode, destoff, len);
	unlock_extent(&BTRFS_I(inode)->io_tree, destoff, end, &cached_state);

	/*
	 * We may have copied an inline extent into a page of the destination
+7 −1
Original line number Diff line number Diff line
@@ -1127,16 +1127,22 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
						    fs_info->sectorsize));
				WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize));
				end--;
				/* Take mmap lock to serialize with reflinks. */
				if (!down_read_trylock(&BTRFS_I(inode)->i_mmap_lock))
					continue;
				ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
						      key.offset, end,
						      &cached_state);
				if (!ret)
				if (!ret) {
					up_read(&BTRFS_I(inode)->i_mmap_lock);
					continue;
				}

				btrfs_drop_extent_map_range(BTRFS_I(inode),
							    key.offset, end, true);
				unlock_extent(&BTRFS_I(inode)->io_tree,
					      key.offset, end, &cached_state);
				up_read(&BTRFS_I(inode)->i_mmap_lock);
			}
		}