Commit 4c06e63b authored by Linus Torvalds
Browse files
Pull btrfs fixes from David Sterba:

 - tree-log fixes:
    - fixes of log tracking of directories and subvolumes
    - fix iteration and error handling of inode references
      during log replay

 - fix free space tree rebuild (reported by syzbot)

* tag 'for-6.16-rc4-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
  btrfs: use btrfs_record_snapshot_destroy() during rmdir
  btrfs: propagate last_unlink_trans earlier when doing a rmdir
  btrfs: record new subvolume in parent dir earlier to avoid dir logging races
  btrfs: fix inode lookup error handling during log replay
  btrfs: fix iteration of extrefs during log replay
  btrfs: fix missing error handling when searching for inode refs during log replay
  btrfs: fix failure to rebuild free space tree using multiple transactions
parents 025c1970 157501b0
Loading
Loading
Loading
Loading
+2 −0
Original line number Diff line number Diff line
@@ -83,6 +83,8 @@ enum btrfs_block_group_flags {
	BLOCK_GROUP_FLAG_ZONED_DATA_RELOC,
	/* Does the block group need to be added to the free space tree? */
	BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE,
	/* Set after we add a new block group to the free space tree. */
	BLOCK_GROUP_FLAG_FREE_SPACE_ADDED,
	/* Indicate that the block group is placed on a sequential zone */
	BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE,
	/*
+40 −0
Original line number Diff line number Diff line
@@ -1241,6 +1241,7 @@ static int clear_free_space_tree(struct btrfs_trans_handle *trans,
{
	BTRFS_PATH_AUTO_FREE(path);
	struct btrfs_key key;
	struct rb_node *node;
	int nr;
	int ret;

@@ -1269,6 +1270,16 @@ static int clear_free_space_tree(struct btrfs_trans_handle *trans,
		btrfs_release_path(path);
	}

	node = rb_first_cached(&trans->fs_info->block_group_cache_tree);
	while (node) {
		struct btrfs_block_group *bg;

		bg = rb_entry(node, struct btrfs_block_group, cache_node);
		clear_bit(BLOCK_GROUP_FLAG_FREE_SPACE_ADDED, &bg->runtime_flags);
		node = rb_next(node);
		cond_resched();
	}

	return 0;
}

@@ -1358,12 +1369,18 @@ int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info)

		block_group = rb_entry(node, struct btrfs_block_group,
				       cache_node);

		if (test_bit(BLOCK_GROUP_FLAG_FREE_SPACE_ADDED,
			     &block_group->runtime_flags))
			goto next;

		ret = populate_free_space_tree(trans, block_group);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			btrfs_end_transaction(trans);
			return ret;
		}
next:
		if (btrfs_should_end_transaction(trans)) {
			btrfs_end_transaction(trans);
			trans = btrfs_start_transaction(free_space_root, 1);
@@ -1390,6 +1407,29 @@ static int __add_block_group_free_space(struct btrfs_trans_handle *trans,

	clear_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags);

	/*
	 * While rebuilding the free space tree we may allocate new metadata
	 * block groups while modifying the free space tree.
	 *
	 * Because during the rebuild (at btrfs_rebuild_free_space_tree()) we
	 * can use multiple transactions, every time btrfs_end_transaction() is
	 * called at btrfs_rebuild_free_space_tree() we finish the creation of
	 * new block groups by calling btrfs_create_pending_block_groups(), and
	 * that in turn calls us, through add_block_group_free_space(), to add
	 * a free space info item and a free space extent item for the block
	 * group.
	 *
	 * Then later btrfs_rebuild_free_space_tree() may find such new block
	 * groups and process them with populate_free_space_tree(), which can
	 * fail with EEXIST since there are already items for the block group in
	 * the free space tree. Notice that we say "may find" because a new
	 * block group may be added to the block groups rbtree in a node before
	 * or after the block group currently being processed by the rebuild
	 * process. So signal the rebuild process to skip such new block groups
	 * if it finds them.
	 */
	set_bit(BLOCK_GROUP_FLAG_FREE_SPACE_ADDED, &block_group->runtime_flags);

	ret = add_new_free_space_info(trans, block_group, path);
	if (ret)
		return ret;
+18 −18
Original line number Diff line number Diff line
@@ -4710,7 +4710,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
	int ret = 0;
	struct btrfs_trans_handle *trans;
	u64 last_unlink_trans;
	struct fscrypt_name fname;

	if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
@@ -4736,6 +4735,23 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
		goto out_notrans;
	}

	/*
	 * Propagate the last_unlink_trans value of the deleted dir to its
	 * parent directory. This is to prevent an unrecoverable log tree in the
	 * case we do something like this:
	 * 1) create dir foo
	 * 2) create snapshot under dir foo
	 * 3) delete the snapshot
	 * 4) rmdir foo
	 * 5) mkdir foo
	 * 6) fsync foo or some file inside foo
	 *
	 * This is because we can't unlink other roots when replaying the dir
	 * deletes for directory foo.
	 */
	if (BTRFS_I(inode)->last_unlink_trans >= trans->transid)
		btrfs_record_snapshot_destroy(trans, BTRFS_I(dir));

	if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
		ret = btrfs_unlink_subvol(trans, BTRFS_I(dir), dentry);
		goto out;
@@ -4745,27 +4761,11 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
	if (ret)
		goto out;

	last_unlink_trans = BTRFS_I(inode)->last_unlink_trans;

	/* now the directory is empty */
	ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
				 &fname.disk_name);
	if (!ret) {
	if (!ret)
		btrfs_i_size_write(BTRFS_I(inode), 0);
		/*
		 * Propagate the last_unlink_trans value of the deleted dir to
		 * its parent directory. This is to prevent an unrecoverable
		 * log tree in the case we do something like this:
		 * 1) create dir foo
		 * 2) create snapshot under dir foo
		 * 3) delete the snapshot
		 * 4) rmdir foo
		 * 5) mkdir foo
		 * 6) fsync foo or some file inside foo
		 */
		if (last_unlink_trans >= trans->transid)
			BTRFS_I(dir)->last_unlink_trans = last_unlink_trans;
	}
out:
	btrfs_end_transaction(trans);
out_notrans:
+2 −2
Original line number Diff line number Diff line
@@ -666,14 +666,14 @@ static noinline int create_subvol(struct mnt_idmap *idmap,
		goto out;
	}

	btrfs_record_new_subvolume(trans, BTRFS_I(dir));

	ret = btrfs_create_new_inode(trans, &new_inode_args);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	btrfs_record_new_subvolume(trans, BTRFS_I(dir));

	d_instantiate_new(dentry, new_inode_args.inode);
	new_inode_args.inode = NULL;

+69 −68
Original line number Diff line number Diff line
@@ -143,6 +143,9 @@ static struct btrfs_inode *btrfs_iget_logging(u64 objectid, struct btrfs_root *r
	unsigned int nofs_flag;
	struct btrfs_inode *inode;

	/* Only meant to be called for subvolume roots and not for log roots. */
	ASSERT(is_fstree(btrfs_root_id(root)));

	/*
	 * We're holding a transaction handle whether we are logging or
	 * replaying a log tree, so we must make sure NOFS semantics apply
@@ -604,21 +607,6 @@ static int read_alloc_one_name(struct extent_buffer *eb, void *start, int len,
	return 0;
}

/*
 * Look up the inode with the given objectid in a given root by delegating to
 * btrfs_iget_logging().
 *
 * Returns the inode on success. Any error pointer coming back from the lookup
 * is collapsed to NULL, so callers cannot tell the individual failure reasons
 * apart.
 *
 * This can only be called for subvolume roots and not for the log.
 */
static noinline struct btrfs_inode *read_one_inode(struct btrfs_root *root,
						   u64 objectid)
{
	struct btrfs_inode *found = btrfs_iget_logging(objectid, root);

	/* Hide the specific errno from the caller, report plain failure. */
	return IS_ERR(found) ? NULL : found;
}

/* replays a single extent in 'eb' at 'slot' with 'key' into the
 * subvolume 'root'.  path is released on entry and should be released
 * on exit.
@@ -674,9 +662,9 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
		return -EUCLEAN;
	}

	inode = read_one_inode(root, key->objectid);
	if (!inode)
		return -EIO;
	inode = btrfs_iget_logging(key->objectid, root);
	if (IS_ERR(inode))
		return PTR_ERR(inode);

	/*
	 * first check to see if we already have this extent in the
@@ -948,9 +936,10 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,

	btrfs_release_path(path);

	inode = read_one_inode(root, location.objectid);
	if (!inode) {
		ret = -EIO;
	inode = btrfs_iget_logging(location.objectid, root);
	if (IS_ERR(inode)) {
		ret = PTR_ERR(inode);
		inode = NULL;
		goto out;
	}

@@ -1073,7 +1062,9 @@ static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
	search_key.type = BTRFS_INODE_REF_KEY;
	search_key.offset = parent_objectid;
	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
	if (ret == 0) {
	if (ret < 0) {
		return ret;
	} else if (ret == 0) {
		struct btrfs_inode_ref *victim_ref;
		unsigned long ptr;
		unsigned long ptr_end;
@@ -1146,13 +1137,13 @@ static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
			struct fscrypt_str victim_name;

			extref = (struct btrfs_inode_extref *)(base + cur_offset);
			victim_name.len = btrfs_inode_extref_name_len(leaf, extref);

			if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
				goto next;

			ret = read_alloc_one_name(leaf, &extref->name,
				 btrfs_inode_extref_name_len(leaf, extref),
				 &victim_name);
						  victim_name.len, &victim_name);
			if (ret)
				return ret;

@@ -1167,10 +1158,10 @@ static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
				kfree(victim_name.name);
				return ret;
			} else if (!ret) {
				ret = -ENOENT;
				victim_parent = read_one_inode(root,
						parent_objectid);
				if (victim_parent) {
				victim_parent = btrfs_iget_logging(parent_objectid, root);
				if (IS_ERR(victim_parent)) {
					ret = PTR_ERR(victim_parent);
				} else {
					inc_nlink(&inode->vfs_inode);
					btrfs_release_path(path);

@@ -1315,9 +1306,9 @@ static int unlink_old_inode_refs(struct btrfs_trans_handle *trans,
			struct btrfs_inode *dir;

			btrfs_release_path(path);
			dir = read_one_inode(root, parent_id);
			if (!dir) {
				ret = -ENOENT;
			dir = btrfs_iget_logging(parent_id, root);
			if (IS_ERR(dir)) {
				ret = PTR_ERR(dir);
				kfree(name.name);
				goto out;
			}
@@ -1389,15 +1380,17 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
	 * copy the back ref in.  The link count fixup code will take
	 * care of the rest
	 */
	dir = read_one_inode(root, parent_objectid);
	if (!dir) {
		ret = -ENOENT;
	dir = btrfs_iget_logging(parent_objectid, root);
	if (IS_ERR(dir)) {
		ret = PTR_ERR(dir);
		dir = NULL;
		goto out;
	}

	inode = read_one_inode(root, inode_objectid);
	if (!inode) {
		ret = -EIO;
	inode = btrfs_iget_logging(inode_objectid, root);
	if (IS_ERR(inode)) {
		ret = PTR_ERR(inode);
		inode = NULL;
		goto out;
	}

@@ -1409,12 +1402,14 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
			 * parent object can change from one array
			 * item to another.
			 */
			if (!dir)
				dir = read_one_inode(root, parent_objectid);
			if (!dir) {
				ret = -ENOENT;
				dir = btrfs_iget_logging(parent_objectid, root);
				if (IS_ERR(dir)) {
					ret = PTR_ERR(dir);
					dir = NULL;
					goto out;
				}
			}
		} else {
			ret = ref_get_fields(eb, ref_ptr, &name, &ref_index);
		}
@@ -1682,9 +1677,9 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
			break;

		btrfs_release_path(path);
		inode = read_one_inode(root, key.offset);
		if (!inode) {
			ret = -EIO;
		inode = btrfs_iget_logging(key.offset, root);
		if (IS_ERR(inode)) {
			ret = PTR_ERR(inode);
			break;
		}

@@ -1720,9 +1715,9 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
	struct btrfs_inode *inode;
	struct inode *vfs_inode;

	inode = read_one_inode(root, objectid);
	if (!inode)
		return -EIO;
	inode = btrfs_iget_logging(objectid, root);
	if (IS_ERR(inode))
		return PTR_ERR(inode);

	vfs_inode = &inode->vfs_inode;
	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
@@ -1761,14 +1756,14 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans,
	struct btrfs_inode *dir;
	int ret;

	inode = read_one_inode(root, location->objectid);
	if (!inode)
		return -ENOENT;
	inode = btrfs_iget_logging(location->objectid, root);
	if (IS_ERR(inode))
		return PTR_ERR(inode);

	dir = read_one_inode(root, dirid);
	if (!dir) {
	dir = btrfs_iget_logging(dirid, root);
	if (IS_ERR(dir)) {
		iput(&inode->vfs_inode);
		return -EIO;
		return PTR_ERR(dir);
	}

	ret = btrfs_add_link(trans, dir, inode, name, 1, index);
@@ -1845,9 +1840,9 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
	bool update_size = true;
	bool name_added = false;

	dir = read_one_inode(root, key->objectid);
	if (!dir)
		return -EIO;
	dir = btrfs_iget_logging(key->objectid, root);
	if (IS_ERR(dir))
		return PTR_ERR(dir);

	ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name);
	if (ret)
@@ -2147,9 +2142,10 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
	btrfs_dir_item_key_to_cpu(eb, di, &location);
	btrfs_release_path(path);
	btrfs_release_path(log_path);
	inode = read_one_inode(root, location.objectid);
	if (!inode) {
		ret = -EIO;
	inode = btrfs_iget_logging(location.objectid, root);
	if (IS_ERR(inode)) {
		ret = PTR_ERR(inode);
		inode = NULL;
		goto out;
	}

@@ -2301,14 +2297,17 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
	if (!log_path)
		return -ENOMEM;

	dir = read_one_inode(root, dirid);
	/* it isn't an error if the inode isn't there, that can happen
	 * because we replay the deletes before we copy in the inode item
	 * from the log
	dir = btrfs_iget_logging(dirid, root);
	/*
	 * It isn't an error if the inode isn't there, that can happen because
	 * we replay the deletes before we copy in the inode item from the log.
	 */
	if (!dir) {
	if (IS_ERR(dir)) {
		btrfs_free_path(log_path);
		return 0;
		ret = PTR_ERR(dir);
		if (ret == -ENOENT)
			ret = 0;
		return ret;
	}

	range_start = 0;
@@ -2467,9 +2466,9 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
				struct btrfs_inode *inode;
				u64 from;

				inode = read_one_inode(root, key.objectid);
				if (!inode) {
					ret = -EIO;
				inode = btrfs_iget_logging(key.objectid, root);
				if (IS_ERR(inode)) {
					ret = PTR_ERR(inode);
					break;
				}
				from = ALIGN(i_size_read(&inode->vfs_inode),
@@ -7448,6 +7447,8 @@ void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
 * full log sync.
 * Also we don't need to worry about renames, since btrfs_rename() marks the
 * log for full commit when renaming a subvolume.
 *
 * Must be called before creating the subvolume entry in its parent directory.
 */
void btrfs_record_new_subvolume(const struct btrfs_trans_handle *trans,
				struct btrfs_inode *dir)