Commit c92b4d3d authored by Linus Torvalds
Browse files
Pull btrfs updates from David Sterba:
 "User visible changes:

   - move shutdown ioctl support out of experimental features; the ioctl
     forces a stop of filesystem operation until the next unmount.
     Additionally, there's a super block operation to forcibly remove a
     device from under the filesystem, which leads to a shutdown unless
     the redundancy allows the filesystem to keep running

   - report filesystem shutdown using fserror mechanism

   - tree-checker updates:
      - verify free space info, extent and bitmap items
      - verify remap-tree items and related data in block group items

  Performance improvements:

   - speed up clearing first extent in the tracked range (+10%
     throughput on sample workload)

   - reduce COW rewrites of extent buffers during the same transaction

   - avoid taking big device lock to update device stats during
     transaction commit

   - fix unnecessary flush on close when truncating empty files
     (observed in practice on a backup application)

   - prevent direct reclaim during compressed readahead to avoid stalls
     under memory pressure

  Notable fixes:

   - fix chunk allocation strategy on RAID1-like block groups with
     disproportionate device sizes; the previous strategy could lead to
     ENOSPC due to skewed reservation estimates

   - adjust metadata reservation overcommit ratio to be less aggressive
     and also try to flush if possible, this avoids ENOSPC and potential
     transaction aborts in some edge cases (that are otherwise hard to
     reproduce)

   - fix silent IO error in encoded writes and ordered extent split in
     zoned mode, the error was not correctly propagated to the address
     space and could lead to zeroed ranges

   - don't mark inline files NOCOMPRESS unexpectedly, the intent was to
     do that for single block writes of regular files

   - fix deadlock between reflink and transaction commit when using
     flushoncommit

   - fix overly strict item check of a running dev-replace operation

  Core:

   - zoned mode space reservation fixes:
      - cap delayed refs metadata reservation to avoid overcommit
      - update logic to reclaim partially unusable zones
      - add another state to flush and reclaim partially used zone
      - limit number of zones reclaimed in one go to avoid blocking
        other operations

   - don't let log trees consume global reserve on overcommit and fall
     back to transaction commit

   - revalidate extent buffer when checking its up-to-date status

   - add self tests for zoned mode block group specifics

   - reduce atomic allocations in some qgroup paths

   - avoid unnecessary root node COW during snapshotting

   - start new transaction in block group relocation conditionally

   - faster check of NOCOW files on currently snapshotted root

   - change how compressed bio size is tracked from bio and reduce the
     structure size

   - new tracepoint for search slot restart tracking

   - checksum list manipulation improvements

   - type, parameter cleanups, refactoring

   - error handling improvements, transaction abort call adjustments"

* tag 'for-7.1-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (116 commits)
  btrfs: btrfs_log_dev_io_error() on all bio errors
  btrfs: fix silent IO error loss in encoded writes and zoned split
  btrfs: skip clearing EXTENT_DEFRAG for NOCOW ordered extents
  btrfs: use BTRFS_FS_UPDATE_UUID_TREE_GEN flag for UUID tree rescan check
  btrfs: remove duplicate journal_info reset on failure to commit transaction
  btrfs: tag as unlikely if statements that check for fs in error state
  btrfs: fix double free in create_space_info() error path
  btrfs: fix double free in create_space_info_sub_group() error path
  btrfs: do not reject a valid running dev-replace
  btrfs: only invalidate btree inode pages after all ebs are released
  btrfs: prevent direct reclaim during compressed readahead
  btrfs: replace BUG_ON() with error return in cache_save_setup()
  btrfs: zstd: don't cache sectorsize in a local variable
  btrfs: zlib: don't cache sectorsize in a local variable
  btrfs: zlib: drop redundant folio address variable
  btrfs: lzo: inline read/write length helpers
  btrfs: use common eb range validation in read_extent_buffer_to_user_nofault()
  btrfs: read eb folio index right before loops
  btrfs: rename local variable for offset in folio
  btrfs: unify types for binary search variables
  ...
parents 23acda7c fc3d5328
Loading
Loading
Loading
Loading
+0 −2
Original line number Diff line number Diff line
@@ -112,8 +112,6 @@ config BTRFS_EXPERIMENTAL

	  - large folio and block size (> page size) support

	  - shutdown ioctl and auto-degradation support

	  - asynchronous checksum generation for data writes

	  - remap-tree - logical address remapping tree
+4 −0
Original line number Diff line number Diff line
@@ -45,3 +45,7 @@ btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \
	tests/free-space-tree-tests.o tests/extent-map-tests.o \
	tests/raid-stripe-tree-tests.o tests/delayed-refs-tests.o \
	tests/chunk-allocation-tests.o

ifeq ($(CONFIG_BLK_DEV_ZONED),y)
btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/zoned-tests.o
endif
+0 −10
Original line number Diff line number Diff line
@@ -858,11 +858,6 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info,
			free_pref(ref);
			return PTR_ERR(eb);
		}
		if (unlikely(!extent_buffer_uptodate(eb))) {
			free_pref(ref);
			free_extent_buffer(eb);
			return -EIO;
		}

		if (lock)
			btrfs_tree_read_lock(eb);
@@ -1620,11 +1615,6 @@ static int find_parent_nodes(struct btrfs_backref_walk_ctx *ctx,
					ret = PTR_ERR(eb);
					goto out;
				}
				if (unlikely(!extent_buffer_uptodate(eb))) {
					free_extent_buffer(eb);
					ret = -EIO;
					goto out;
				}

				if (!path->skip_locking)
					btrfs_tree_read_lock(eb);
+10 −2
Original line number Diff line number Diff line
@@ -4,6 +4,7 @@
 * Copyright (C) 2022 Christoph Hellwig.
 */

#include <linux/blk_types.h>
#include <linux/bio.h>
#include "bio.h"
#include "ctree.h"
@@ -350,11 +351,18 @@ static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *de

static void btrfs_log_dev_io_error(const struct bio *bio, struct btrfs_device *dev)
{
	blk_status_t sts = bio->bi_status;

	if (!dev || !dev->bdev)
		return;
	if (bio->bi_status != BLK_STS_IOERR && bio->bi_status != BLK_STS_TARGET)
	if (unlikely(sts == BLK_STS_OK))
		return;

	if (unlikely(sts != BLK_STS_IOERR && sts != BLK_STS_TARGET &&
		     sts != BLK_STS_MEDIUM && sts != BLK_STS_PROTECTION)) {
		btrfs_warn_rl(dev->fs_info, "bdev %s unexpected block io error: %d",
			      btrfs_dev_name(dev), sts);
		return;
	}
	if (btrfs_op(bio) == BTRFS_MAP_WRITE)
		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
	else if (!(bio->bi_opf & REQ_RAHEAD))
+188 −147
Original line number Diff line number Diff line
@@ -728,7 +728,7 @@ static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
	struct extent_buffer *leaf;
	struct btrfs_key key;
	u64 total_found = 0;
	u64 last = 0;
	u64 last = block_group->start;
	u32 nritems;
	int ret;
	bool wakeup = true;
@@ -737,7 +737,6 @@ static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
	if (!path)
		return -ENOMEM;

	last = max_t(u64, block_group->start, BTRFS_SUPER_INFO_OFFSET);
	extent_root = btrfs_extent_root(fs_info, last);
	if (unlikely(!extent_root)) {
		btrfs_err(fs_info,
@@ -1613,6 +1612,24 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)

		spin_lock(&space_info->lock);
		spin_lock(&block_group->lock);

		if (btrfs_is_zoned(fs_info) && btrfs_is_block_group_used(block_group) &&
		    block_group->zone_unusable >= div_u64(block_group->length, 2)) {
			/*
			 * If the block group has data left, but at least half
			 * of the block group is zone_unusable, mark it as
			 * reclaimable before continuing with the next block group.
			 */

			spin_unlock(&block_group->lock);
			spin_unlock(&space_info->lock);
			up_write(&space_info->groups_sem);

			btrfs_mark_bg_to_reclaim(block_group);

			goto next;
		}

		if (btrfs_is_block_group_used(block_group) ||
		    (block_group->ro && !(block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED)) ||
		    list_is_singular(&block_group->list) ||
@@ -1679,7 +1696,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
		spin_unlock(&space_info->lock);

		/* We don't want to force the issue, only flip if it's ok. */
		ret = inc_block_group_ro(block_group, 0);
		ret = inc_block_group_ro(block_group, false);
		up_write(&space_info->groups_sem);
		if (ret < 0) {
			ret = 0;
@@ -1892,52 +1909,15 @@ static bool should_reclaim_block_group(const struct btrfs_block_group *bg, u64 b
	return true;
}

void btrfs_reclaim_bgs_work(struct work_struct *work)
static int btrfs_reclaim_block_group(struct btrfs_block_group *bg, int *reclaimed)
{
	struct btrfs_fs_info *fs_info =
		container_of(work, struct btrfs_fs_info, reclaim_bgs_work);
	struct btrfs_block_group *bg;
	struct btrfs_space_info *space_info;
	LIST_HEAD(retry_list);

	if (!btrfs_should_reclaim(fs_info))
		return;

	guard(super_write)(fs_info->sb);

	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE))
		return;

	/*
	 * Long running balances can keep us blocked here for eternity, so
	 * simply skip reclaim if we're unable to get the mutex.
	 */
	if (!mutex_trylock(&fs_info->reclaim_bgs_lock)) {
		btrfs_exclop_finish(fs_info);
		return;
	}

	spin_lock(&fs_info->unused_bgs_lock);
	/*
	 * Sort happens under lock because we can't simply splice it and sort.
	 * The block groups might still be in use and reachable via bg_list,
	 * and their presence in the reclaim_bgs list must be preserved.
	 */
	list_sort(NULL, &fs_info->reclaim_bgs, reclaim_bgs_cmp);
	while (!list_empty(&fs_info->reclaim_bgs)) {
	struct btrfs_fs_info *fs_info = bg->fs_info;
	struct btrfs_space_info *space_info = bg->space_info;
	u64 used;
	u64 reserved;
	u64 old_total;
	int ret = 0;

		bg = list_first_entry(&fs_info->reclaim_bgs,
				      struct btrfs_block_group,
				      bg_list);
		list_del_init(&bg->bg_list);

		space_info = bg->space_info;
		spin_unlock(&fs_info->unused_bgs_lock);

	/* Don't race with allocators so take the groups_sem */
	down_write(&space_info->groups_sem);

@@ -1953,8 +1933,9 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
		spin_unlock(&bg->lock);
		spin_unlock(&space_info->lock);
		up_write(&space_info->groups_sem);
			goto next;
		return 0;
	}

	if (bg->used == 0) {
		/*
		 * It is possible that we trigger relocation on a block
@@ -1972,9 +1953,9 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
		spin_unlock(&bg->lock);
		spin_unlock(&space_info->lock);
		up_write(&space_info->groups_sem);
			goto next;

		return 0;
	}

	/*
	 * The block group might no longer meet the reclaim condition by
	 * the time we get around to reclaiming it, so to avoid
@@ -1989,7 +1970,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
		spin_unlock(&bg->lock);
		spin_unlock(&space_info->lock);
		up_write(&space_info->groups_sem);
			goto next;
		return 0;
	}

	spin_unlock(&bg->lock);
@@ -2006,13 +1987,13 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
	 */
	if (btrfs_need_cleaner_sleep(fs_info)) {
		up_write(&space_info->groups_sem);
			goto next;
		return 0;
	}

		ret = inc_block_group_ro(bg, 0);
	ret = inc_block_group_ro(bg, false);
	up_write(&space_info->groups_sem);
	if (ret < 0)
			goto next;
		return ret;

	/*
	 * The amount of bytes reclaimed corresponds to the sum of the
@@ -2055,8 +2036,55 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
	if (space_info->total_bytes < old_total)
		btrfs_set_periodic_reclaim_ready(space_info, true);
	spin_unlock(&space_info->lock);
	if (!ret)
		(*reclaimed)++;

	return ret;
}

void btrfs_reclaim_block_groups(struct btrfs_fs_info *fs_info, unsigned int limit)
{
	struct btrfs_block_group *bg;
	struct btrfs_space_info *space_info;
	LIST_HEAD(retry_list);
	int reclaimed = 0;

	if (!btrfs_should_reclaim(fs_info))
		return;

	guard(super_write)(fs_info->sb);

	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE))
		return;

	/*
	 * Long running balances can keep us blocked here for eternity, so
	 * simply skip reclaim if we're unable to get the mutex.
	 */
	if (!mutex_trylock(&fs_info->reclaim_bgs_lock)) {
		btrfs_exclop_finish(fs_info);
		return;
	}

	spin_lock(&fs_info->unused_bgs_lock);
	/*
	 * Sort happens under lock because we can't simply splice it and sort.
	 * The block groups might still be in use and reachable via bg_list,
	 * and their presence in the reclaim_bgs list must be preserved.
	 */
	list_sort(NULL, &fs_info->reclaim_bgs, reclaim_bgs_cmp);
	while (!list_empty(&fs_info->reclaim_bgs)) {
		int ret;

		bg = list_first_entry(&fs_info->reclaim_bgs,
				      struct btrfs_block_group,
				      bg_list);
		list_del_init(&bg->bg_list);

		space_info = bg->space_info;
		spin_unlock(&fs_info->unused_bgs_lock);
		ret = btrfs_reclaim_block_group(bg, &reclaimed);

next:
		if (ret && !READ_ONCE(space_info->periodic_reclaim))
			btrfs_link_bg_list(bg, &retry_list);
		btrfs_put_block_group(bg);
@@ -2074,6 +2102,8 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
		if (!mutex_trylock(&fs_info->reclaim_bgs_lock))
			goto end;
		spin_lock(&fs_info->unused_bgs_lock);
		if (reclaimed >= limit)
			break;
	}
	spin_unlock(&fs_info->unused_bgs_lock);
	mutex_unlock(&fs_info->reclaim_bgs_lock);
@@ -2084,6 +2114,14 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
	btrfs_exclop_finish(fs_info);
}

/*
 * Entry point that takes the work_struct embedded in struct btrfs_fs_info as
 * the reclaim_bgs_work member.
 *
 * Recovers the owning fs_info from @work via container_of() and hands it to
 * btrfs_reclaim_block_groups().
 */
void btrfs_reclaim_bgs_work(struct work_struct *work)
{
	struct btrfs_fs_info *fs_info =
		container_of(work, struct btrfs_fs_info, reclaim_bgs_work);

	/*
	 * The limit parameter of btrfs_reclaim_block_groups() is an unsigned
	 * int, so -1 converts to UINT_MAX and its "reclaimed >= limit" check
	 * effectively never ends the loop early, i.e. no limit is applied.
	 */
	btrfs_reclaim_block_groups(fs_info, -1);
}

void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info)
{
	btrfs_reclaim_sweep(fs_info);
@@ -2222,7 +2260,7 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
		io_stripe_size = btrfs_stripe_nr_to_offset(nr_data_stripes(map));

	buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS);
	buf = kzalloc_objs(u64, map->num_stripes, GFP_NOFS);
	if (!buf) {
		ret = -ENOMEM;
		goto out;
@@ -2538,7 +2576,7 @@ static int read_one_block_group(struct btrfs_fs_info *info,
				btrfs_mark_bg_unused(cache);
		}
	} else {
		inc_block_group_ro(cache, 1);
		inc_block_group_ro(cache, true);
	}

	return 0;
@@ -2694,11 +2732,11 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
		list_for_each_entry(cache,
				&space_info->block_groups[BTRFS_RAID_RAID0],
				list)
			inc_block_group_ro(cache, 1);
			inc_block_group_ro(cache, true);
		list_for_each_entry(cache,
				&space_info->block_groups[BTRFS_RAID_SINGLE],
				list)
			inc_block_group_ro(cache, 1);
			inc_block_group_ro(cache, true);
	}

	btrfs_init_global_block_rsv(info);
@@ -3087,7 +3125,7 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
	 */
	if (sb_rdonly(fs_info->sb)) {
		mutex_lock(&fs_info->ro_block_group_mutex);
		ret = inc_block_group_ro(cache, 0);
		ret = inc_block_group_ro(cache, false);
		mutex_unlock(&fs_info->ro_block_group_mutex);
		return ret;
	}
@@ -3138,7 +3176,7 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
		}
	}

	ret = inc_block_group_ro(cache, 0);
	ret = inc_block_group_ro(cache, false);
	if (!ret)
		goto out;
	if (ret == -ETXTBSY)
@@ -3165,7 +3203,7 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
	if (ret < 0)
		goto out;

	ret = inc_block_group_ro(cache, 0);
	ret = inc_block_group_ro(cache, false);
	if (ret == -ETXTBSY)
		goto unlock_out;
out:
@@ -3305,7 +3343,7 @@ static int update_block_group_item(struct btrfs_trans_handle *trans,

}

static int cache_save_setup(struct btrfs_block_group *block_group,
static void cache_save_setup(struct btrfs_block_group *block_group,
			     struct btrfs_trans_handle *trans,
			     struct btrfs_path *path)
{
@@ -3319,7 +3357,7 @@ static int cache_save_setup(struct btrfs_block_group *block_group,
	int ret = 0;

	if (!btrfs_test_opt(fs_info, SPACE_CACHE))
		return 0;
		return;

	/*
	 * If this block group is smaller than 100 megs don't bother caching the
@@ -3329,11 +3367,11 @@ static int cache_save_setup(struct btrfs_block_group *block_group,
		spin_lock(&block_group->lock);
		block_group->disk_cache_state = BTRFS_DC_WRITTEN;
		spin_unlock(&block_group->lock);
		return 0;
		return;
	}

	if (TRANS_ABORTED(trans))
		return 0;
		return;
again:
	inode = lookup_free_space_inode(block_group, path);
	if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
@@ -3343,7 +3381,13 @@ static int cache_save_setup(struct btrfs_block_group *block_group,
	}

	if (IS_ERR(inode)) {
		BUG_ON(retries);
		if (retries) {
			ret = PTR_ERR(inode);
			btrfs_err(fs_info,
				  "failed to lookup free space inode after creation for block group %llu: %d",
				  block_group->start, ret);
			goto out_free;
		}
		retries++;

		if (block_group->ro)
@@ -3414,10 +3458,8 @@ static int cache_save_setup(struct btrfs_block_group *block_group,
	 * We hit an ENOSPC when setting up the cache in this transaction, just
	 * skip doing the setup, we've already cleared the cache so we're safe.
	 */
	if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
		ret = -ENOSPC;
	if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags))
		goto out_put;
	}

	/*
	 * Try to preallocate enough space based on how big the block group is.
@@ -3465,7 +3507,6 @@ static int cache_save_setup(struct btrfs_block_group *block_group,
	spin_unlock(&block_group->lock);

	extent_changeset_free(data_reserved);
	return ret;
}

int btrfs_setup_space_cache(struct btrfs_trans_handle *trans)
Loading