Commit cc8a0934 authored by Linus Torvalds
Browse files
Pull btrfs fixes from David Sterba:

 - extent map shrinker fixes:
     - fix potential use-after-free when accessing an inode to reach
       fs_info, since the shrinker could do iput() in the meantime
     - skip unnecessary scanning of inodes without extent maps
     - do direct iput(), no need for indirection via workqueue

 - in block < page mode, fix race when extending i_size in buffered mode

 - fix minor memory leak in selftests

 - print descriptive error message when seeding device is not found

* tag 'for-6.14-rc4-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
  btrfs: fix data overwriting bug during buffered write when block size < page size
  btrfs: output an error message if btrfs failed to find the seed fsid
  btrfs: do regular iput instead of delayed iput during extent map shrinking
  btrfs: skip inodes without loaded extent maps when shrinking extent maps
  btrfs: fix use-after-free on inode when scanning root during em shrinking
  btrfs: selftests: fix btrfs_test_delayed_refs() leak of transaction
parents 3d85d6c8 efa11fd2
Loading
Loading
Loading
Loading
+59 −24
Original line number Diff line number Diff line
@@ -1128,6 +1128,8 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c
	long nr_dropped = 0;
	struct rb_node *node;

	lockdep_assert_held_write(&tree->lock);

	/*
	 * Take the mmap lock so that we serialize with the inode logging phase
	 * of fsync because we may need to set the full sync flag on the inode,
@@ -1139,28 +1141,12 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c
	 * to find new extents, which may not be there yet because ordered
	 * extents haven't completed yet.
	 *
	 * We also do a try lock because otherwise we could deadlock. This is
	 * because the shrinker for this filesystem may be invoked while we are
	 * in a path that is holding the mmap lock in write mode. For example in
	 * a reflink operation while COWing an extent buffer, when allocating
	 * pages for a new extent buffer and under memory pressure, the shrinker
	 * may be invoked, and therefore we would deadlock by attempting to read
	 * lock the mmap lock while we are holding already a write lock on it.
	 * We also do a try lock because we don't want to block for too long and
	 * we are holding the extent map tree's lock in write mode.
	 */
	if (!down_read_trylock(&inode->i_mmap_lock))
		return 0;

	/*
	 * We want to be fast so if the lock is busy we don't want to spend time
	 * waiting for it - either some task is about to do IO for the inode or
	 * we may have another task shrinking extent maps, here in this code, so
	 * skip this inode.
	 */
	if (!write_trylock(&tree->lock)) {
		up_read(&inode->i_mmap_lock);
		return 0;
	}

	node = rb_first(&tree->root);
	while (node) {
		struct rb_node *next = rb_next(node);
@@ -1201,12 +1187,61 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c
			break;
		node = next;
	}
	write_unlock(&tree->lock);
	up_read(&inode->i_mmap_lock);

	return nr_dropped;
}

/*
 * Find the first inode of @root, with an index of at least @min_ino in the
 * root's inodes xarray, that is worth scanning for extent maps to drop.
 *
 * An inode is picked only if its extent map tree lock could be taken with a
 * write trylock, the tree has at least one extent map loaded and igrab()
 * succeeded for it. Inodes that fail any of these checks are skipped and the
 * search continues at the next inode number.
 *
 * Returns NULL if no such inode was found. Otherwise returns the inode with
 * its extent map tree lock (inode->extent_tree.lock) still held in write mode
 * and with the reference obtained through igrab(), so the caller is the one
 * that has to unlock the tree and drop the reference.
 */
static struct btrfs_inode *find_first_inode_to_shrink(struct btrfs_root *root,
						      u64 min_ino)
{
	struct btrfs_inode *inode;
	unsigned long from = min_ino;

	/* The whole search runs under the xarray's lock. */
	xa_lock(&root->inodes);
	while (true) {
		struct extent_map_tree *tree;

		/* xa_find() takes the start index by pointer (&from). */
		inode = xa_find(&root->inodes, &from, ULONG_MAX, XA_PRESENT);
		if (!inode)
			break;

		tree = &inode->extent_tree;

		/*
		 * We want to be fast so if the lock is busy we don't want to
		 * spend time waiting for it (some task is about to do IO for
		 * the inode).
		 */
		if (!write_trylock(&tree->lock))
			goto next;

		/*
		 * Skip inode if it doesn't have loaded extent maps, so we avoid
		 * getting a reference and doing an iput later. This includes
		 * cases like files that were opened for things like stat(2), or
		 * files with all extent maps previously released through the
		 * release folio callback (btrfs_release_folio()) or released in
		 * a previous run, or directories which never have extent maps.
		 */
		if (RB_EMPTY_ROOT(&tree->root)) {
			write_unlock(&tree->lock);
			goto next;
		}

		/*
		 * Got a reference: leave the loop with the tree lock still
		 * held, which is the state this function returns the inode in.
		 */
		if (igrab(&inode->vfs_inode))
			break;

		/* igrab() failed, so release the tree lock and try the next inode. */
		write_unlock(&tree->lock);
next:
		/* Resume the search right after the inode we just skipped. */
		from = btrfs_ino(inode) + 1;
		/*
		 * Rescheduling point between candidates; it is handed the
		 * xarray lock we are holding.
		 */
		cond_resched_lock(&root->inodes.xa_lock);
	}
	xa_unlock(&root->inodes);

	return inode;
}

static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx *ctx)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
@@ -1214,21 +1249,21 @@ static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx
	long nr_dropped = 0;
	u64 min_ino = fs_info->em_shrinker_last_ino + 1;

	inode = btrfs_find_first_inode(root, min_ino);
	inode = find_first_inode_to_shrink(root, min_ino);
	while (inode) {
		nr_dropped += btrfs_scan_inode(inode, ctx);
		write_unlock(&inode->extent_tree.lock);

		min_ino = btrfs_ino(inode) + 1;
		fs_info->em_shrinker_last_ino = btrfs_ino(inode);
		btrfs_add_delayed_iput(inode);
		iput(&inode->vfs_inode);

		if (ctx->scanned >= ctx->nr_to_scan ||
		    btrfs_fs_closing(inode->root->fs_info))
		if (ctx->scanned >= ctx->nr_to_scan || btrfs_fs_closing(fs_info))
			break;

		cond_resched();

		inode = btrfs_find_first_inode(root, min_ino);
		inode = find_first_inode_to_shrink(root, min_ino);
	}

	if (inode) {
+8 −1
Original line number Diff line number Diff line
@@ -1090,7 +1090,7 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i)
	u64 lockend;
	size_t num_written = 0;
	ssize_t ret;
	loff_t old_isize = i_size_read(inode);
	loff_t old_isize;
	unsigned int ilock_flags = 0;
	const bool nowait = (iocb->ki_flags & IOCB_NOWAIT);
	unsigned int bdp_flags = (nowait ? BDP_ASYNC : 0);
@@ -1103,6 +1103,13 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i)
	if (ret < 0)
		return ret;

	/*
	 * We can only trust the isize with inode lock held, or it can race with
	 * other buffered writes and cause incorrect call of
	 * pagecache_isize_extended() to overwrite existing data.
	 */
	old_isize = i_size_read(inode);

	ret = generic_write_checks(iocb, i);
	if (ret <= 0)
		goto out;
+1 −0
Original line number Diff line number Diff line
@@ -1009,6 +1009,7 @@ int btrfs_test_delayed_refs(u32 sectorsize, u32 nodesize)
	if (!ret)
		ret = select_delayed_refs_test(&trans);

	kfree(transaction);
out_free_fs_info:
	btrfs_free_dummy_fs_info(fs_info);
	return ret;
+5 −1
Original line number Diff line number Diff line
@@ -7200,8 +7200,12 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,

	fs_devices = find_fsid(fsid, NULL);
	if (!fs_devices) {
		if (!btrfs_test_opt(fs_info, DEGRADED))
		if (!btrfs_test_opt(fs_info, DEGRADED)) {
			btrfs_err(fs_info,
		"failed to find fsid %pU when attempting to open seed devices",
				  fsid);
			return ERR_PTR(-ENOENT);
		}

		fs_devices = alloc_fs_devices(fsid);
		if (IS_ERR(fs_devices))