Commit a1b547f0 authored by Linus Torvalds
Browse files
Pull btrfs updates from David Sterba:
 "The highlights are new logic behind background block group reclaim,
  automatic removal of qgroup after removing a subvolume and new
  'rescue=' mount options.

  The rest is optimizations, cleanups and refactoring.

  User visible features:

   - dynamic block group reclaim:
      - tunable framework to avoid situations where eager data
        allocations prevent creating new metadata chunks due to lack of
        unallocated space
      - reuse sysfs knob bg_reclaim_threshold (otherwise used only in
        zoned mode) for a fixed value threshold
      - new on/off sysfs knob "dynamic_reclaim" calculating the value
        based on heuristics, aiming to keep spare working space for
        relocating chunks but not to needlessly relocate partially
        utilized block groups or reclaim newly allocated ones
      - stats are exported in sysfs per block group type, files
        "reclaim_*"
      - this may increase IO load at unexpected times but the corner
        case of no allocatable block groups is known to be worse

   - automatically remove qgroup of deleted subvolumes:
      - adjust qgroup removal conditions: make sure all related
        subvolume data are already removed, or return EBUSY; also take
        into account the setting of sysfs drop_subtree_threshold
      - also works in squota mode

   - mount option updates: new modes of 'rescue=' that allow mounting
     images (read-only) that could have been partially converted by user
     space tools
      - ignoremetacsums  - invalid metadata checksums are ignored
      - ignoresuperflags - super block flags that track conversion in
                           progress (like UUID or checksums) are ignored

  Core:

   - size of struct btrfs_inode is now below 1024 (on a release config),
     improved memory packing and other secondary effects

   - switch tracking of open inodes from rb-tree to xarray, minor
     performance improvement

   - reduce number of empty transaction commits when there are no dirty
     data/metadata

   - memory allocation optimizations (reduced numbers, reordering out of
     critical sections)

   - extent map structure optimizations and refactoring, more sanity
     checks

   - more preparations or fixes for subpage support in zoned mode

   - general snapshot code cleanups, improvements and documentation

   - tree-checker updates: more file extent ram_bytes fixes, continued

   - raid-stripe-tree update (not backward compatible):
      - remove extent encoding field from the structure, can be inferred
        from other information
      - requires btrfs-progs 6.9.1 or newer

   - cleanups and refactoring
      - error message updates
      - error handling improvements
      - return type and parameter cleanups and improvements"

* tag 'for-6.11-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (152 commits)
  btrfs: fix extent map use-after-free when adding pages to compressed bio
  btrfs: fix bitmap leak when loading free space cache on duplicate entry
  btrfs: remove the BUG_ON() inside extent_range_clear_dirty_for_io()
  btrfs: move extent_range_clear_dirty_for_io() into inode.c
  btrfs: enhance compression error messages
  btrfs: fix data race when accessing the last_trans field of a root
  btrfs: rename the extra_gfp parameter of btrfs_alloc_page_array()
  btrfs: remove the extra_gfp parameter from btrfs_alloc_folio_array()
  btrfs: introduce new "rescue=ignoresuperflags" mount option
  btrfs: introduce new "rescue=ignoremetacsums" mount option
  btrfs: output the unrecognized super block flags as hex
  btrfs: remove unused Opt enums
  btrfs: tree-checker: add extra ram_bytes and disk_num_bytes check
  btrfs: fix the ram_bytes assignment for truncated ordered extents
  btrfs: make validate_extent_map() catch ram_bytes mismatch
  btrfs: ignore incorrect btrfs_file_extent_item::ram_bytes
  btrfs: cleanup the bytenr usage inside btrfs_extent_item_to_extent_map()
  btrfs: fix typo in error message in btrfs_validate_super()
  btrfs: move the direct IO code into its own file
  btrfs: pass a btrfs_inode to btrfs_set_prop()
  ...
parents 6706415b 8e786054
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -33,7 +33,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
	   uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \
	   block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \
	   subpage.o tree-mod-log.o extent-io-tree.o fs.o messages.o bio.o \
	   lru_cache.o raid-stripe-tree.o
	   lru_cache.o raid-stripe-tree.o fiemap.o direct-io.o

btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
btrfs-$(CONFIG_BTRFS_FS_REF_VERIFY) += ref-verify.o
+6 −9
Original line number Diff line number Diff line
@@ -34,7 +34,7 @@ void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *e

static inline u8 get_unaligned_le8(const void *p)
{
       return *(u8 *)p;
       return *(const u8 *)p;
}

static inline void put_unaligned_le8(u8 val, void *p)
@@ -48,8 +48,8 @@ static inline void put_unaligned_le8(u8 val, void *p)
			    offsetof(type, member),			\
			    sizeof_field(type, member)))

#define write_eb_member(eb, ptr, type, member, result) (\
	write_extent_buffer(eb, (char *)(result),			\
#define write_eb_member(eb, ptr, type, member, source) (		\
	write_extent_buffer(eb, (const char *)(source),			\
			   ((unsigned long)(ptr)) +			\
			    offsetof(type, member),			\
			    sizeof_field(type, member)))
@@ -315,11 +315,8 @@ BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32);
BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, sec, 64);
BTRFS_SETGET_STACK_FUNCS(stack_timespec_nsec, struct btrfs_timespec, nsec, 32);

BTRFS_SETGET_FUNCS(stripe_extent_encoding, struct btrfs_stripe_extent, encoding, 8);
BTRFS_SETGET_FUNCS(raid_stride_devid, struct btrfs_raid_stride, devid, 64);
BTRFS_SETGET_FUNCS(raid_stride_physical, struct btrfs_raid_stride, physical, 64);
BTRFS_SETGET_STACK_FUNCS(stack_stripe_extent_encoding,
			 struct btrfs_stripe_extent, encoding, 8);
BTRFS_SETGET_STACK_FUNCS(stack_raid_stride_devid, struct btrfs_raid_stride, devid, 64);
BTRFS_SETGET_STACK_FUNCS(stack_raid_stride_physical, struct btrfs_raid_stride, physical, 64);

@@ -353,7 +350,7 @@ static inline void btrfs_tree_block_key(const struct extent_buffer *eb,

static inline void btrfs_set_tree_block_key(const struct extent_buffer *eb,
					    struct btrfs_tree_block_info *item,
					    struct btrfs_disk_key *key)
					    const struct btrfs_disk_key *key)
{
	write_eb_member(eb, item, struct btrfs_tree_block_info, key, key);
}
@@ -446,7 +443,7 @@ void btrfs_node_key(const struct extent_buffer *eb,
		    struct btrfs_disk_key *disk_key, int nr);

static inline void btrfs_set_node_key(const struct extent_buffer *eb,
				      struct btrfs_disk_key *disk_key, int nr)
				      const struct btrfs_disk_key *disk_key, int nr)
{
	unsigned long ptr;

@@ -512,7 +509,7 @@ static inline void btrfs_item_key(const struct extent_buffer *eb,
}

static inline void btrfs_set_item_key(struct extent_buffer *eb,
				      struct btrfs_disk_key *disk_key, int nr)
				      const struct btrfs_disk_key *disk_key, int nr)
{
	struct btrfs_item *item = btrfs_item_nr(eb, nr);

+2 −2
Original line number Diff line number Diff line
@@ -29,7 +29,7 @@ struct btrfs_failed_bio {
/* Is this a data path I/O that needs storage layer checksum and repair? */
static inline bool is_data_bbio(struct btrfs_bio *bbio)
{
	return bbio->inode && is_data_inode(&bbio->inode->vfs_inode);
	return bbio->inode && is_data_inode(bbio->inode);
}

static bool bbio_has_ordered_extent(struct btrfs_bio *bbio)
@@ -732,7 +732,7 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
		 * point, so they are handled as part of the no-checksum case.
		 */
		if (inode && !(inode->flags & BTRFS_INODE_NODATASUM) &&
		    !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state) &&
		    !test_bit(BTRFS_FS_STATE_NO_DATA_CSUMS, &fs_info->fs_state) &&
		    !btrfs_is_data_reloc_root(inode->root)) {
			if (should_async_write(bbio) &&
			    btrfs_wq_submit_bio(bbio, bioc, &smap, mirror_num))
+40 −13
Original line number Diff line number Diff line
@@ -1022,6 +1022,13 @@ static void clear_incompat_bg_bits(struct btrfs_fs_info *fs_info, u64 flags)
	}
}

/*
 * Select the block group root for @fs_info.
 *
 * If the BLOCK_GROUP_TREE compat_ro feature is set on the filesystem, return
 * fs_info->block_group_root; otherwise return the result of
 * btrfs_extent_root(fs_info, 0).
 */
static struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info)
{
	if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE))
		return fs_info->block_group_root;
	return btrfs_extent_root(fs_info, 0);
}

static int remove_block_group_item(struct btrfs_trans_handle *trans,
				   struct btrfs_path *path,
				   struct btrfs_block_group *block_group)
@@ -1757,24 +1764,21 @@ static inline bool btrfs_should_reclaim(struct btrfs_fs_info *fs_info)

static bool should_reclaim_block_group(struct btrfs_block_group *bg, u64 bytes_freed)
{
	const struct btrfs_space_info *space_info = bg->space_info;
	const int reclaim_thresh = READ_ONCE(space_info->bg_reclaim_threshold);
	const int thresh_pct = btrfs_calc_reclaim_threshold(bg->space_info);
	u64 thresh_bytes = mult_perc(bg->length, thresh_pct);
	const u64 new_val = bg->used;
	const u64 old_val = new_val + bytes_freed;
	u64 thresh;

	if (reclaim_thresh == 0)
	if (thresh_bytes == 0)
		return false;

	thresh = mult_perc(bg->length, reclaim_thresh);

	/*
	 * If we were below the threshold before don't reclaim, we are likely a
	 * brand new block group and we don't want to relocate new block groups.
	 */
	if (old_val < thresh)
	if (old_val < thresh_bytes)
		return false;
	if (new_val >= thresh)
	if (new_val >= thresh_bytes)
		return false;
	return true;
}
@@ -1822,6 +1826,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
	list_sort(NULL, &fs_info->reclaim_bgs, reclaim_bgs_cmp);
	while (!list_empty(&fs_info->reclaim_bgs)) {
		u64 zone_unusable;
		u64 reclaimed;
		int ret = 0;

		bg = list_first_entry(&fs_info->reclaim_bgs,
@@ -1835,6 +1840,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
		/* Don't race with allocators so take the groups_sem */
		down_write(&space_info->groups_sem);

		spin_lock(&space_info->lock);
		spin_lock(&bg->lock);
		if (bg->reserved || bg->pinned || bg->ro) {
			/*
@@ -1844,6 +1850,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
			 * this block group.
			 */
			spin_unlock(&bg->lock);
			spin_unlock(&space_info->lock);
			up_write(&space_info->groups_sem);
			goto next;
		}
@@ -1862,6 +1869,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
			if (!btrfs_test_opt(fs_info, DISCARD_ASYNC))
				btrfs_mark_bg_unused(bg);
			spin_unlock(&bg->lock);
			spin_unlock(&space_info->lock);
			up_write(&space_info->groups_sem);
			goto next;

@@ -1878,10 +1886,12 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
		 */
		if (!should_reclaim_block_group(bg, bg->length)) {
			spin_unlock(&bg->lock);
			spin_unlock(&space_info->lock);
			up_write(&space_info->groups_sem);
			goto next;
		}
		spin_unlock(&bg->lock);
		spin_unlock(&space_info->lock);

		/*
		 * Get out fast, in case we're read-only or unmounting the
@@ -1914,15 +1924,26 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
				div64_u64(bg->used * 100, bg->length),
				div64_u64(zone_unusable * 100, bg->length));
		trace_btrfs_reclaim_block_group(bg);
		reclaimed = bg->used;
		ret = btrfs_relocate_chunk(fs_info, bg->start);
		if (ret) {
			btrfs_dec_block_group_ro(bg);
			btrfs_err(fs_info, "error relocating chunk %llu",
				  bg->start);
			reclaimed = 0;
			spin_lock(&space_info->lock);
			space_info->reclaim_errors++;
			if (READ_ONCE(space_info->periodic_reclaim))
				space_info->periodic_reclaim_ready = false;
			spin_unlock(&space_info->lock);
		}
		spin_lock(&space_info->lock);
		space_info->reclaim_count++;
		space_info->reclaim_bytes += reclaimed;
		spin_unlock(&space_info->lock);

next:
		if (ret) {
		if (ret && !READ_ONCE(space_info->periodic_reclaim)) {
			/* Refcount held by the reclaim_bgs list after splice. */
			spin_lock(&fs_info->unused_bgs_lock);
			/*
@@ -1964,6 +1985,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)

void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info)
{
	btrfs_reclaim_sweep(fs_info);
	spin_lock(&fs_info->unused_bgs_lock);
	if (!list_empty(&fs_info->reclaim_bgs))
		queue_work(system_unbound_wq, &fs_info->reclaim_bgs_work);
@@ -3662,9 +3684,12 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
		old_val += num_bytes;
		cache->used = old_val;
		cache->reserved -= num_bytes;
		cache->reclaim_mark = 0;
		space_info->bytes_reserved -= num_bytes;
		space_info->bytes_used += num_bytes;
		space_info->disk_used += num_bytes * factor;
		if (READ_ONCE(space_info->periodic_reclaim))
			btrfs_space_info_update_reclaimable(space_info, -num_bytes);
		spin_unlock(&cache->lock);
		spin_unlock(&space_info->lock);
	} else {
@@ -3674,7 +3699,9 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
		btrfs_space_info_update_bytes_pinned(info, space_info, num_bytes);
		space_info->bytes_used -= num_bytes;
		space_info->disk_used -= num_bytes * factor;

		if (READ_ONCE(space_info->periodic_reclaim))
			btrfs_space_info_update_reclaimable(space_info, num_bytes);
		else
			reclaim = should_reclaim_block_group(cache, num_bytes);

		spin_unlock(&cache->lock);
@@ -4329,13 +4356,13 @@ void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
		spin_lock(&block_group->lock);
		if (test_and_clear_bit(BLOCK_GROUP_FLAG_IREF,
				       &block_group->runtime_flags)) {
			struct inode *inode = block_group->inode;
			struct btrfs_inode *inode = block_group->inode;

			block_group->inode = NULL;
			spin_unlock(&block_group->lock);

			ASSERT(block_group->io_ctl.inode == NULL);
			iput(inode);
			iput(&inode->vfs_inode);
		} else {
			spin_unlock(&block_group->lock);
		}
+2 −1
Original line number Diff line number Diff line
@@ -115,7 +115,7 @@ struct btrfs_caching_control {

struct btrfs_block_group {
	struct btrfs_fs_info *fs_info;
	struct inode *inode;
	struct btrfs_inode *inode;
	spinlock_t lock;
	u64 start;
	u64 length;
@@ -263,6 +263,7 @@ struct btrfs_block_group {
	struct work_struct zone_finish_work;
	struct extent_buffer *last_eb;
	enum btrfs_block_group_size_class size_class;
	u64 reclaim_mark;
};

static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group)
Loading