Commit 0eb4aaa2 authored by Linus Torvalds's avatar Linus Torvalds
Browse files
Pull btrfs updates from David Sterba:
 "User visible changes, features:

   - rebuilding of the free space tree at mount time is done in more
     transactions, fix potential hangs when the transaction thread is
     blocked due to large amount of block groups

   - more read IO balancing strategies (experimental config), add two
     new ways how to select a device for read if the profiles allow that
     (all RAID1*), the current default selects the device by pid which
     is good on average but less performant for single reader workloads

       - select preferred device for all reads (namely for testing)
       - round-robin, balance reads across devices relevant for the
         requested IO range

   - add encoded write ioctl support to io_uring (read was added in
     6.12), basis for writing send stream using that instead of
     syscalls, non-blocking mode is not yet implemented

   - support FS_IOC_READ_VERITY_METADATA, applications can use the
     metadata to do their own verification

   - pass inode's i_write_hint to bios, for parity with other
     filesystems, ioctls F_GET_RW_HINT/F_SET_RW_HINT

  Core:

   - in zoned mode: allow to directly reclaim a block group by simply
     resetting it, then it can be reused and another block group does
     not need to be allocated

   - super block validation now also does more comprehensive sys array
     validation, adding it to the points where superblock is validated
     (post-read, pre-write)

   - subpage mode fixes:
      - fix double accounting of blocks due to some races
      - improved or fixed error handling in a few cases (compression,
        delalloc)

   - raid stripe tree:
      - fix various cases with extent range splitting or deleting
      - implement hole punching to extent range
      - reduce number of stripe tree lookups during bio submission
      - more self-tests

   - updated self-tests (delayed refs)

   - error handling improvements

   - cleanups, refactoring
      - remove rest of backref caching infrastructure from relocation,
        not needed anymore
      - error message updates
      - remove unnecessary calls when extent buffer was marked dirty
      - unused parameter removal
      - code moved to new files

  Other code changes: add rb_find_add_cached() to the rb-tree API"

* tag 'for-6.14-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (127 commits)
  btrfs: selftests: add a selftest for deleting two out of three extents
  btrfs: selftests: add test for punching a hole into 3 RAID stripe-extents
  btrfs: selftests: add selftest for punching holes into the RAID stripe extents
  btrfs: selftests: test RAID stripe-tree deletion spanning two items
  btrfs: selftests: don't split RAID extents in half
  btrfs: selftests: check for correct return value of failed lookup
  btrfs: don't use btrfs_set_item_key_safe on RAID stripe-extents
  btrfs: implement hole punching for RAID stripe extents
  btrfs: fix deletion of a range spanning parts two RAID stripe extents
  btrfs: fix tail delete of RAID stripe-extents
  btrfs: fix front delete range calculation for RAID stripe extents
  btrfs: assert RAID stripe-extent length is always greater than 0
  btrfs: don't try to delete RAID stripe-extents if we don't need to
  btrfs: selftests: correct RAID stripe-tree feature flag setting
  btrfs: add io_uring interface for encoded writes
  btrfs: remove the unused locked_folio parameter from btrfs_cleanup_ordered_extents()
  btrfs: add extra error messages for delalloc range related errors
  btrfs: subpage: dump the involved bitmap when ASSERT() failed
  btrfs: subpage: fix the bitmap dump of the locked flags
  btrfs: do proper folio cleanup when run_delalloc_nocow() failed
  ...
parents 1851bccf 9d0c23db
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -44,4 +44,4 @@ btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \
	tests/extent-buffer-tests.o tests/btrfs-tests.o \
	tests/extent-io-tests.o tests/inode-tests.o tests/qgroup-tests.o \
	tests/free-space-tree-tests.o tests/extent-map-tests.o \
	tests/raid-stripe-tree-tests.o
	tests/raid-stripe-tree-tests.o tests/delayed-refs-tests.o
+3 −3
Original line number Diff line number Diff line
@@ -18,7 +18,7 @@ enum {
};

#define NO_THRESHOLD (-1)
#define DFT_THRESHOLD (32)
#define DEFAULT_THRESHOLD (32)

struct btrfs_workqueue {
	struct workqueue_struct *normal_wq;
@@ -94,9 +94,9 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info,

	ret->limit_active = limit_active;
	if (thresh == 0)
		thresh = DFT_THRESHOLD;
		thresh = DEFAULT_THRESHOLD;
	/* For low threshold, disabling threshold is a better choice */
	if (thresh < DFT_THRESHOLD) {
	if (thresh < DEFAULT_THRESHOLD) {
		ret->current_active = limit_active;
		ret->thresh = NO_THRESHOLD;
	} else {
+63 −109
Original line number Diff line number Diff line
@@ -250,6 +250,21 @@ static int prelim_ref_compare(const struct prelim_ref *ref1,
	return 0;
}

static int prelim_ref_rb_add_cmp(const struct rb_node *new,
				 const struct rb_node *exist)
{
	const struct prelim_ref *ref_new =
		rb_entry(new, struct prelim_ref, rbnode);
	const struct prelim_ref *ref_exist =
		rb_entry(exist, struct prelim_ref, rbnode);

	/*
	 * prelim_ref_compare() expects the first parameter as the existing one,
	 * different from the rb_find_add_cached() order.
	 */
	return prelim_ref_compare(ref_exist, ref_new);
}

static void update_share_count(struct share_check *sc, int oldcount,
			       int newcount, const struct prelim_ref *newref)
{
@@ -278,25 +293,12 @@ static void prelim_ref_insert(const struct btrfs_fs_info *fs_info,
			      struct share_check *sc)
{
	struct rb_root_cached *root;
	struct rb_node **p;
	struct rb_node *parent = NULL;
	struct prelim_ref *ref;
	int result;
	bool leftmost = true;
	struct rb_node *exist;

	root = &preftree->root;
	p = &root->rb_root.rb_node;

	while (*p) {
		parent = *p;
		ref = rb_entry(parent, struct prelim_ref, rbnode);
		result = prelim_ref_compare(ref, newref);
		if (result < 0) {
			p = &(*p)->rb_left;
		} else if (result > 0) {
			p = &(*p)->rb_right;
			leftmost = false;
		} else {
	exist = rb_find_add_cached(&newref->rbnode, root, prelim_ref_rb_add_cmp);
	if (exist) {
		struct prelim_ref *ref = rb_entry(exist, struct prelim_ref, rbnode);
		/* Identical refs, merge them and free @newref */
		struct extent_inode_elem *eie = ref->inode_list;

@@ -320,13 +322,10 @@ static void prelim_ref_insert(const struct btrfs_fs_info *fs_info,
		free_pref(newref);
		return;
	}
	}

	update_share_count(sc, 0, newref->count, newref);
	preftree->count++;
	trace_btrfs_prelim_ref_insert(fs_info, newref, NULL, preftree->count);
	rb_link_node(&newref->rbnode, parent, p);
	rb_insert_color_cached(&newref->rbnode, root, leftmost);
}

/*
@@ -3022,9 +3021,6 @@ void btrfs_backref_init_cache(struct btrfs_fs_info *fs_info,
	cache->rb_root = RB_ROOT;
	for (i = 0; i < BTRFS_MAX_LEVEL; i++)
		INIT_LIST_HEAD(&cache->pending[i]);
	INIT_LIST_HEAD(&cache->changed);
	INIT_LIST_HEAD(&cache->detached);
	INIT_LIST_HEAD(&cache->leaves);
	INIT_LIST_HEAD(&cache->pending_edge);
	INIT_LIST_HEAD(&cache->useless_node);
	cache->fs_info = fs_info;
@@ -3132,29 +3128,17 @@ void btrfs_backref_drop_node(struct btrfs_backref_cache *tree,
void btrfs_backref_cleanup_node(struct btrfs_backref_cache *cache,
				struct btrfs_backref_node *node)
{
	struct btrfs_backref_node *upper;
	struct btrfs_backref_edge *edge;

	if (!node)
		return;

	BUG_ON(!node->lowest && !node->detached);
	while (!list_empty(&node->upper)) {
		edge = list_entry(node->upper.next, struct btrfs_backref_edge,
				  list[LOWER]);
		upper = edge->node[UPPER];
		list_del(&edge->list[LOWER]);
		list_del(&edge->list[UPPER]);
		btrfs_backref_free_edge(cache, edge);

		/*
		 * Add the node to leaf node list if no other child block
		 * cached.
		 */
		if (list_empty(&upper->lower)) {
			list_add_tail(&upper->lower, &cache->leaves);
			upper->lowest = 1;
		}
	}

	btrfs_backref_drop_node(cache, node);
@@ -3166,33 +3150,13 @@ void btrfs_backref_cleanup_node(struct btrfs_backref_cache *cache,
void btrfs_backref_release_cache(struct btrfs_backref_cache *cache)
{
	struct btrfs_backref_node *node;
	int i;

	while (!list_empty(&cache->detached)) {
		node = list_entry(cache->detached.next,
				  struct btrfs_backref_node, list);
	while ((node = rb_entry_safe(rb_first(&cache->rb_root),
				     struct btrfs_backref_node, rb_node)))
		btrfs_backref_cleanup_node(cache, node);
	}

	while (!list_empty(&cache->leaves)) {
		node = list_entry(cache->leaves.next,
				  struct btrfs_backref_node, lower);
		btrfs_backref_cleanup_node(cache, node);
	}

	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		while (!list_empty(&cache->pending[i])) {
			node = list_first_entry(&cache->pending[i],
						struct btrfs_backref_node,
						list);
			btrfs_backref_cleanup_node(cache, node);
		}
	}
	ASSERT(list_empty(&cache->pending_edge));
	ASSERT(list_empty(&cache->useless_node));
	ASSERT(list_empty(&cache->changed));
	ASSERT(list_empty(&cache->detached));
	ASSERT(RB_EMPTY_ROOT(&cache->rb_root));
	ASSERT(!cache->nr_nodes);
	ASSERT(!cache->nr_edges);
}
@@ -3316,8 +3280,12 @@ static int handle_indirect_tree_backref(struct btrfs_trans_handle *trans,
	root = btrfs_get_fs_root(fs_info, ref_key->offset, false);
	if (IS_ERR(root))
		return PTR_ERR(root);
	if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
		cur->cowonly = 1;

	/* We shouldn't be using backref cache for non-shareable roots. */
	if (unlikely(!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))) {
		btrfs_put_root(root);
		return -EUCLEAN;
	}

	if (btrfs_root_level(&root->root_item) == cur->level) {
		/* Tree root */
@@ -3403,8 +3371,15 @@ static int handle_indirect_tree_backref(struct btrfs_trans_handle *trans,
				goto out;
			}
			upper->owner = btrfs_header_owner(eb);
			if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
				upper->cowonly = 1;

			/* We shouldn't be using backref cache for non shareable roots. */
			if (unlikely(!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))) {
				btrfs_put_root(root);
				btrfs_backref_free_edge(cache, edge);
				btrfs_backref_free_node(cache, upper);
				ret = -EUCLEAN;
				goto out;
			}

			/*
			 * If we know the block isn't shared we can avoid
@@ -3595,15 +3570,9 @@ int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache,

	ASSERT(start->checked);

	/* Insert this node to cache if it's not COW-only */
	if (!start->cowonly) {
		rb_node = rb_simple_insert(&cache->rb_root, start->bytenr,
					   &start->rb_node);
	rb_node = rb_simple_insert(&cache->rb_root, start->bytenr, &start->rb_node);
	if (rb_node)
			btrfs_backref_panic(cache->fs_info, start->bytenr,
					    -EEXIST);
		list_add_tail(&start->lower, &cache->leaves);
	}
		btrfs_backref_panic(cache->fs_info, start->bytenr, -EEXIST);

	/*
	 * Use breadth first search to iterate all related edges.
@@ -3642,11 +3611,6 @@ int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache,
		 * parents have already been linked.
		 */
		if (!RB_EMPTY_NODE(&upper->rb_node)) {
			if (upper->lowest) {
				list_del_init(&upper->lower);
				upper->lowest = 0;
			}

			list_add_tail(&edge->list[UPPER], &upper->lower);
			continue;
		}
@@ -3657,22 +3621,12 @@ int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache,
			return -EUCLEAN;
		}

		/* Sanity check, COW-only node has non-COW-only parent */
		if (start->cowonly != upper->cowonly) {
			ASSERT(0);
			return -EUCLEAN;
		}

		/* Only cache non-COW-only (subvolume trees) tree blocks */
		if (!upper->cowonly) {
		rb_node = rb_simple_insert(&cache->rb_root, upper->bytenr,
					   &upper->rb_node);
			if (rb_node) {
				btrfs_backref_panic(cache->fs_info,
						upper->bytenr, -EEXIST);
		if (unlikely(rb_node)) {
			btrfs_backref_panic(cache->fs_info, upper->bytenr, -EEXIST);
			return -EUCLEAN;
		}
		}

		list_add_tail(&edge->list[UPPER], &upper->lower);

+6 −10
Original line number Diff line number Diff line
@@ -318,6 +318,12 @@ struct btrfs_backref_node {
		u64 bytenr;
	}; /* Use rb_simple_node for search/insert */

	/*
	 * This is a sanity check, whenever we COW a block we will update
	 * new_bytenr with it's current location, and we will check this in
	 * various places to validate that the cache makes sense, it shouldn't
	 * be used for anything else.
	 */
	u64 new_bytenr;
	/* Objectid of tree block owner, can be not uptodate */
	u64 owner;
@@ -335,10 +341,6 @@ struct btrfs_backref_node {
	struct extent_buffer *eb;
	/* Level of the tree block */
	unsigned int level:8;
	/* Is the block in a non-shareable tree */
	unsigned int cowonly:1;
	/* 1 if no child node is in the cache */
	unsigned int lowest:1;
	/* Is the extent buffer locked */
	unsigned int locked:1;
	/* Has the block been processed */
@@ -391,12 +393,6 @@ struct btrfs_backref_cache {
	 * level blocks may not reflect the new location
	 */
	struct list_head pending[BTRFS_MAX_LEVEL];
	/* List of backref nodes with no child node */
	struct list_head leaves;
	/* List of blocks that have been COWed in current transaction */
	struct list_head changed;
	/* List of detached backref node. */
	struct list_head detached;

	u64 last_trans;

+9 −2
Original line number Diff line number Diff line
@@ -453,6 +453,14 @@ static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio)
		(unsigned long)dev->bdev->bd_dev, btrfs_dev_name(dev),
		dev->devid, bio->bi_iter.bi_size);

	/*
	 * Track reads if tracking is enabled; ignore I/O operations before the
	 * filesystem is fully initialized.
	 */
	if (dev->fs_devices->collect_fs_stats && bio_op(bio) == REQ_OP_READ && dev->fs_info)
		percpu_counter_add(&dev->fs_info->stats_read_blocks,
				   bio->bi_iter.bi_size >> dev->fs_info->sectorsize_bits);

	if (bio->bi_opf & REQ_BTRFS_CGROUP_PUNT)
		blkcg_punt_bio_submit(bio);
	else
@@ -725,8 +733,7 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
			bio->bi_opf |= REQ_OP_ZONE_APPEND;
		}

		if (is_data_bbio(bbio) && bioc &&
		    btrfs_need_stripe_tree_update(bioc->fs_info, bioc->map_type)) {
		if (is_data_bbio(bbio) && bioc && bioc->use_rst) {
			/*
			 * No locking for the list update, as we only add to
			 * the list in the I/O submission path, and list
Loading