Commit 5ca7fe21 authored by Linus Torvalds
Browse files
Pull btrfs fixes from David Sterba:
 "Fixes:

   - fix invalid inode pointer dereferences during log replay

   - fix a race between renames and directory logging

   - fix shutting down delayed iput worker

   - fix device byte accounting when dropping chunk

   - in zoned mode, fix offset calculations for DUP profile when
     conventional and sequential zones are used together

  Regression fixes:

   - fix possible double unlock of extent buffer tree (xarray
     conversion)

   - in zoned mode, fix extent buffer refcount when writing out extents
     (xarray conversion)

  Error handling fixes and updates:

   - handle unexpected extent type when replaying log

   - check and warn if there are remaining delayed inodes when putting a
     root

   - fix assertion when building free space tree

   - handle csum tree error with mount option 'rescue=ibadroots'

  Other:

   - error message updates: add prefix to all scrub related messages,
     include other information in messages"

* tag 'for-6.16-rc3-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
  btrfs: zoned: fix alloc_offset calculation for partly conventional block groups
  btrfs: handle csum tree error with rescue=ibadroots correctly
  btrfs: fix race between async reclaim worker and close_ctree()
  btrfs: fix assertion when building free space tree
  btrfs: don't silently ignore unexpected extent type when replaying log
  btrfs: fix invalid inode pointer dereferences during log replay
  btrfs: fix double unlock of buffer_tree xarray when releasing subpage eb
  btrfs: update superblock's device bytes_used when dropping chunk
  btrfs: fix a race between renames and directory logging
  btrfs: scrub: add prefix for the error messages
  btrfs: warn if leaking delayed_nodes in btrfs_put_root()
  btrfs: fix delayed ref refcount leak in debug assertion
  btrfs: include root in error message when unlinking inode
  btrfs: don't drop a reference if btrfs_check_write_meta_pointer() fails
parents c0694456 c0d90a79
Loading
Loading
Loading
Loading
+4 −1
Original line number Diff line number Diff line
@@ -1377,7 +1377,10 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root,

/*
 * Debug check: warn if the delayed root of @fs_info still yields a delayed
 * node, i.e. it is expected to be empty when this is called.
 *
 * NOTE(review): this listing is a rendered diff hunk (+4 -1) whose change
 * markers were stripped, so the single removed line and the four added lines
 * both appear in the body below. Only one of the two forms exists in the
 * tree at any time.
 */
void btrfs_assert_delayed_root_empty(struct btrfs_fs_info *fs_info)
{
	/* Removed line: warns on a leftover node but discards the returned pointer. */
	WARN_ON(btrfs_first_delayed_node(fs_info->delayed_root));
	/* Added lines: keep the returned node so it can be released after warning. */
	struct btrfs_delayed_node *node = btrfs_first_delayed_node(fs_info->delayed_root);

	/*
	 * Presumably btrfs_first_delayed_node() returns the node with a
	 * reference held (the shortlog calls this a refcount leak fix), so the
	 * reference is dropped here after warning - confirm against the helper.
	 */
	if (WARN_ON(node))
		refcount_dec(&node->refs);
}

static bool could_end_wait(struct btrfs_delayed_root *delayed_root, int seq)
+21 −6
Original line number Diff line number Diff line
@@ -1835,6 +1835,8 @@ void btrfs_put_root(struct btrfs_root *root)
	if (refcount_dec_and_test(&root->refs)) {
		if (WARN_ON(!xa_empty(&root->inodes)))
			xa_destroy(&root->inodes);
		if (WARN_ON(!xa_empty(&root->delayed_nodes)))
			xa_destroy(&root->delayed_nodes);
		WARN_ON(test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state));
		if (root->anon_dev)
			free_anon_bdev(root->anon_dev);
@@ -2156,7 +2158,6 @@ static int load_global_roots_objectid(struct btrfs_root *tree_root,
		found = true;
		root = read_tree_root_path(tree_root, path, &key);
		if (IS_ERR(root)) {
			if (!btrfs_test_opt(fs_info, IGNOREBADROOTS))
			ret = PTR_ERR(root);
			break;
		}
@@ -4310,8 +4311,8 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
	 *
	 * So wait for all ongoing ordered extents to complete and then run
	 * delayed iputs. This works because once we reach this point no one
	 * can either create new ordered extents nor create delayed iputs
	 * through some other means.
	 * can create new ordered extents, but delayed iputs can still be added
	 * by a reclaim worker (see comments further below).
	 *
	 * Also note that btrfs_wait_ordered_roots() is not safe here, because
	 * it waits for BTRFS_ORDERED_COMPLETE to be set on an ordered extent,
@@ -4322,15 +4323,29 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
	btrfs_flush_workqueue(fs_info->endio_write_workers);
	/* Ordered extents for free space inodes. */
	btrfs_flush_workqueue(fs_info->endio_freespace_worker);
	/*
	 * Run delayed iputs in case an async reclaim worker is waiting for them
	 * to be run as mentioned above.
	 */
	btrfs_run_delayed_iputs(fs_info);
	/* There should be no more workload to generate new delayed iputs. */
	set_bit(BTRFS_FS_STATE_NO_DELAYED_IPUT, &fs_info->fs_state);

	cancel_work_sync(&fs_info->async_reclaim_work);
	cancel_work_sync(&fs_info->async_data_reclaim_work);
	cancel_work_sync(&fs_info->preempt_reclaim_work);
	cancel_work_sync(&fs_info->em_shrinker_work);

	/*
	 * Run delayed iputs again because an async reclaim worker may have
	 * added new ones if it was flushing delalloc:
	 *
	 * shrink_delalloc() -> btrfs_start_delalloc_roots() ->
	 *    start_delalloc_inodes() -> btrfs_add_delayed_iput()
	 */
	btrfs_run_delayed_iputs(fs_info);

	/* There should be no more workload to generate new delayed iputs. */
	set_bit(BTRFS_FS_STATE_NO_DELAYED_IPUT, &fs_info->fs_state);

	/* Cancel or finish ongoing discard work */
	btrfs_discard_cleanup(fs_info);

+1 −1
Original line number Diff line number Diff line
@@ -4312,7 +4312,6 @@ static int try_release_subpage_extent_buffer(struct folio *folio)
			spin_unlock(&eb->refs_lock);
			continue;
		}
		xa_unlock_irq(&fs_info->buffer_tree);

		/*
		 * If tree ref isn't set then we know the ref on this eb is a
@@ -4329,6 +4328,7 @@ static int try_release_subpage_extent_buffer(struct folio *folio)
		 * check the folio private at the end.  And
		 * release_extent_buffer() will release the refs_lock.
		 */
		xa_unlock_irq(&fs_info->buffer_tree);
		release_extent_buffer(eb);
		xa_lock_irq(&fs_info->buffer_tree);
	}
+12 −4
Original line number Diff line number Diff line
@@ -1115,11 +1115,21 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans,
	ret = btrfs_search_slot_for_read(extent_root, &key, path, 1, 0);
	if (ret < 0)
		goto out_locked;
	/*
	 * If ret is 1 (no key found), it means this is an empty block group,
	 * without any extents allocated from it and there's no block group
	 * item (key BTRFS_BLOCK_GROUP_ITEM_KEY) located in the extent tree
	 * because we are using the block group tree feature, so block group
	 * items are stored in the block group tree. It also means there are no
	 * extents allocated for block groups with a start offset beyond this
	 * block group's end offset (this is the last, highest, block group).
	 */
	if (!btrfs_fs_compat_ro(trans->fs_info, BLOCK_GROUP_TREE))
		ASSERT(ret == 0);

	start = block_group->start;
	end = block_group->start + block_group->length;
	while (1) {
	while (ret == 0) {
		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);

		if (key.type == BTRFS_EXTENT_ITEM_KEY ||
@@ -1149,8 +1159,6 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans,
		ret = btrfs_next_item(extent_root, path);
		if (ret < 0)
			goto out_locked;
		if (ret)
			break;
	}
	if (start < end) {
		ret = __add_to_free_space_tree(trans, block_group, path2,
+67 −20
Original line number Diff line number Diff line
@@ -4250,9 +4250,9 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,

	ret = btrfs_del_inode_ref(trans, root, name, ino, dir_ino, &index);
	if (ret) {
		btrfs_info(fs_info,
			"failed to delete reference to %.*s, inode %llu parent %llu",
			name->len, name->name, ino, dir_ino);
		btrfs_crit(fs_info,
	   "failed to delete reference to %.*s, root %llu inode %llu parent %llu",
			   name->len, name->name, btrfs_root_id(root), ino, dir_ino);
		btrfs_abort_transaction(trans, ret);
		goto err;
	}
@@ -8059,6 +8059,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
	int ret;
	int ret2;
	bool need_abort = false;
	bool logs_pinned = false;
	struct fscrypt_name old_fname, new_fname;
	struct fscrypt_str *old_name, *new_name;

@@ -8182,6 +8183,31 @@ static int btrfs_rename_exchange(struct inode *old_dir,
	inode_inc_iversion(new_inode);
	simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);

	if (old_ino != BTRFS_FIRST_FREE_OBJECTID &&
	    new_ino != BTRFS_FIRST_FREE_OBJECTID) {
		/*
		 * If we are renaming in the same directory (and it's not for
		 * root entries) pin the log early to prevent any concurrent
		 * task from logging the directory after we removed the old
		 * entries and before we add the new entries, otherwise that
		 * task can sync a log without any entry for the inodes we are
		 * renaming and therefore replaying that log, if a power failure
		 * happens after syncing the log, would result in deleting the
		 * inodes.
		 *
		 * If the rename affects two different directories, we want to
		 * make sure that there's no log commit that contains
		 * updates for only one of the directories but not for the
		 * other.
		 *
		 * If we are renaming an entry for a root, we don't care about
		 * log updates since we called btrfs_set_log_full_commit().
		 */
		btrfs_pin_log_trans(root);
		btrfs_pin_log_trans(dest);
		logs_pinned = true;
	}

	if (old_dentry->d_parent != new_dentry->d_parent) {
		btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
					BTRFS_I(old_inode), true);
@@ -8253,30 +8279,23 @@ static int btrfs_rename_exchange(struct inode *old_dir,
		BTRFS_I(new_inode)->dir_index = new_idx;

	/*
	 * Now pin the logs of the roots. We do it to ensure that no other task
	 * can sync the logs while we are in progress with the rename, because
	 * that could result in an inconsistency in case any of the inodes that
	 * are part of this rename operation were logged before.
	 * Do the log updates for all inodes.
	 *
	 * If either entry is for a root we don't need to update the logs since
	 * we've called btrfs_set_log_full_commit() before.
	 */
	if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
		btrfs_pin_log_trans(root);
	if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
		btrfs_pin_log_trans(dest);

	/* Do the log updates for all inodes. */
	if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
	if (logs_pinned) {
		btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
				   old_rename_ctx.index, new_dentry->d_parent);
	if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
		btrfs_log_new_name(trans, new_dentry, BTRFS_I(new_dir),
				   new_rename_ctx.index, old_dentry->d_parent);
	}

	/* Now unpin the logs. */
	if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
out_fail:
	if (logs_pinned) {
		btrfs_end_log_trans(root);
	if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
		btrfs_end_log_trans(dest);
out_fail:
	}
	ret2 = btrfs_end_transaction(trans);
	ret = ret ? ret : ret2;
out_notrans:
@@ -8326,6 +8345,7 @@ static int btrfs_rename(struct mnt_idmap *idmap,
	int ret2;
	u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
	struct fscrypt_name old_fname, new_fname;
	bool logs_pinned = false;

	if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
		return -EPERM;
@@ -8460,6 +8480,29 @@ static int btrfs_rename(struct mnt_idmap *idmap,
	inode_inc_iversion(old_inode);
	simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);

	if (old_ino != BTRFS_FIRST_FREE_OBJECTID) {
		/*
		 * If we are renaming in the same directory (and it's not a
		 * root entry) pin the log to prevent any concurrent task from
		 * logging the directory after we removed the old entry and
		 * before we add the new entry, otherwise that task can sync
		 * a log without any entry for the inode we are renaming and
		 * therefore replaying that log, if a power failure happens
		 * after syncing the log, would result in deleting the inode.
		 *
		 * If the rename affects two different directories, we want to
		 * make sure that there's no log commit that contains
		 * updates for only one of the directories but not for the
		 * other.
		 *
		 * If we are renaming an entry for a root, we don't care about
		 * log updates since we called btrfs_set_log_full_commit().
		 */
		btrfs_pin_log_trans(root);
		btrfs_pin_log_trans(dest);
		logs_pinned = true;
	}

	if (old_dentry->d_parent != new_dentry->d_parent)
		btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
					BTRFS_I(old_inode), true);
@@ -8524,7 +8567,7 @@ static int btrfs_rename(struct mnt_idmap *idmap,
	if (old_inode->i_nlink == 1)
		BTRFS_I(old_inode)->dir_index = index;

	if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
	if (logs_pinned)
		btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
				   rename_ctx.index, new_dentry->d_parent);

@@ -8540,6 +8583,10 @@ static int btrfs_rename(struct mnt_idmap *idmap,
		}
	}
out_fail:
	if (logs_pinned) {
		btrfs_end_log_trans(root);
		btrfs_end_log_trans(dest);
	}
	ret2 = btrfs_end_transaction(trans);
	ret = ret ? ret : ret2;
out_notrans:
Loading