Commit 19d7f65f authored by Josef Bacik's avatar Josef Bacik Committed by David Sterba
Browse files

btrfs: convert the buffer_radix to an xarray



In order to fully utilize xarray tagging to improve writeback we need to
convert the buffer_radix to a proper xarray.  This conversion is
relatively straightforward as the radix code uses the xarray underneath.
Using xarray directly allows for quite a lot less code.

Reviewed-by: default avatarFilipe Manana <fdmanana@suse.com>
Signed-off-by: default avatarJosef Bacik <josef@toxicpanda.com>
Reviewed-by: default avatarDavid Sterba <dsterba@suse.com>
Signed-off-by: default avatarDavid Sterba <dsterba@suse.com>
parent 656e9f51
Loading
Loading
Loading
Loading
+12 −2
Original line number Diff line number Diff line
@@ -2761,10 +2761,21 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
	return ret;
}

/*
 * Lockdep gets confused between our buffer_tree which requires IRQ locking because
 * we modify marks in the IRQ context, and our delayed inode xarray which doesn't
 * have these requirements. Use a class key so lockdep doesn't get them mixed up.
 */
static struct lock_class_key buffer_xa_class;

void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
{
	INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
	INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);

	/* Use the same flags as mapping->i_pages. */
	xa_init_flags(&fs_info->buffer_tree, XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT);
	lockdep_set_class(&fs_info->buffer_tree.xa_lock, &buffer_xa_class);

	INIT_LIST_HEAD(&fs_info->trans_list);
	INIT_LIST_HEAD(&fs_info->dead_roots);
	INIT_LIST_HEAD(&fs_info->delayed_iputs);
@@ -2776,7 +2787,6 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
	spin_lock_init(&fs_info->delayed_iput_lock);
	spin_lock_init(&fs_info->defrag_inodes_lock);
	spin_lock_init(&fs_info->super_lock);
	spin_lock_init(&fs_info->buffer_lock);
	spin_lock_init(&fs_info->unused_bgs_lock);
	spin_lock_init(&fs_info->treelog_bg_lock);
	spin_lock_init(&fs_info->zone_active_bgs_lock);
+86 −126
Original line number Diff line number Diff line
@@ -1866,20 +1866,18 @@ static void set_btree_ioerr(struct extent_buffer *eb)
 * context.
 */
static struct extent_buffer *find_extent_buffer_nolock(
		const struct btrfs_fs_info *fs_info, u64 start)
		struct btrfs_fs_info *fs_info, u64 start)
{
	struct extent_buffer *eb;
	unsigned long index = (start >> fs_info->sectorsize_bits);

	rcu_read_lock();
	eb = radix_tree_lookup(&fs_info->buffer_radix,
			       start >> fs_info->sectorsize_bits);
	if (eb && atomic_inc_not_zero(&eb->refs)) {
	eb = xa_load(&fs_info->buffer_tree, index);
	if (eb && !atomic_inc_not_zero(&eb->refs))
		eb = NULL;
	rcu_read_unlock();
	return eb;
}
	rcu_read_unlock();
	return NULL;
}

static void end_bbio_meta_write(struct btrfs_bio *bbio)
{
@@ -2742,11 +2740,10 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo

	if (!btrfs_meta_is_subpage(fs_info)) {
		/*
		 * We do this since we'll remove the pages after we've
		 * removed the eb from the radix tree, so we could race
		 * and have this page now attached to the new eb.  So
		 * only clear folio if it's still connected to
		 * this eb.
		 * We do this since we'll remove the pages after we've removed
		 * the eb from the xarray, so we could race and have this page
		 * now attached to the new eb.  So only clear folio if it's
		 * still connected to this eb.
		 */
		if (folio_test_private(folio) && folio_get_private(folio) == eb) {
			BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
@@ -2911,9 +2908,9 @@ static void check_buffer_tree_ref(struct extent_buffer *eb)
{
	int refs;
	/*
	 * The TREE_REF bit is first set when the extent_buffer is added
	 * to the radix tree. It is also reset, if unset, when a new reference
	 * is created by find_extent_buffer.
	 * The TREE_REF bit is first set when the extent_buffer is added to the
	 * xarray. It is also reset, if unset, when a new reference is created
	 * by find_extent_buffer.
	 *
	 * It is only cleared in two cases: freeing the last non-tree
	 * reference to the extent_buffer when its STALE bit is set or
@@ -2925,13 +2922,12 @@ static void check_buffer_tree_ref(struct extent_buffer *eb)
	 * conditions between the calls to check_buffer_tree_ref in those
	 * codepaths and clearing TREE_REF in try_release_extent_buffer.
	 *
	 * The actual lifetime of the extent_buffer in the radix tree is
	 * adequately protected by the refcount, but the TREE_REF bit and
	 * its corresponding reference are not. To protect against this
	 * class of races, we call check_buffer_tree_ref from the codepaths
	 * which trigger io. Note that once io is initiated, TREE_REF can no
	 * longer be cleared, so that is the moment at which any such race is
	 * best fixed.
	 * The actual lifetime of the extent_buffer in the xarray is adequately
	 * protected by the refcount, but the TREE_REF bit and its corresponding
	 * reference are not. To protect against this class of races, we call
	 * check_buffer_tree_ref() from the code paths which trigger io. Note that
	 * once io is initiated, TREE_REF can no longer be cleared, so that is
	 * the moment at which any such race is best fixed.
	 */
	refs = atomic_read(&eb->refs);
	if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
@@ -2995,23 +2991,25 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
		return ERR_PTR(-ENOMEM);
	eb->fs_info = fs_info;
again:
	ret = radix_tree_preload(GFP_NOFS);
	if (ret) {
		exists = ERR_PTR(ret);
		goto free_eb;
	xa_lock_irq(&fs_info->buffer_tree);
	exists = __xa_cmpxchg(&fs_info->buffer_tree, start >> fs_info->sectorsize_bits,
			      NULL, eb, GFP_NOFS);
	if (xa_is_err(exists)) {
		ret = xa_err(exists);
		xa_unlock_irq(&fs_info->buffer_tree);
		btrfs_release_extent_buffer(eb);
		return ERR_PTR(ret);
	}
	spin_lock(&fs_info->buffer_lock);
	ret = radix_tree_insert(&fs_info->buffer_radix,
				start >> fs_info->sectorsize_bits, eb);
	spin_unlock(&fs_info->buffer_lock);
	radix_tree_preload_end();
	if (ret == -EEXIST) {
		exists = find_extent_buffer(fs_info, start);
		if (exists)
			goto free_eb;
		else
	if (exists) {
		if (!atomic_inc_not_zero(&exists->refs)) {
			/* The extent buffer is being freed, retry. */
			xa_unlock_irq(&fs_info->buffer_tree);
			goto again;
		}
		xa_unlock_irq(&fs_info->buffer_tree);
		goto free_eb;
	}
	xa_unlock_irq(&fs_info->buffer_tree);
	check_buffer_tree_ref(eb);

	return eb;
@@ -3032,9 +3030,9 @@ static struct extent_buffer *grab_extent_buffer(struct btrfs_fs_info *fs_info,
	lockdep_assert_held(&folio->mapping->i_private_lock);

	/*
	 * For subpage case, we completely rely on radix tree to ensure we
	 * don't try to insert two ebs for the same bytenr.  So here we always
	 * return NULL and just continue.
	 * For subpage case, we completely rely on xarray to ensure we don't try
	 * to insert two ebs for the same bytenr.  So here we always return NULL
	 * and just continue.
	 */
	if (btrfs_meta_is_subpage(fs_info))
		return NULL;
@@ -3165,7 +3163,7 @@ static int attach_eb_folio_to_filemap(struct extent_buffer *eb, int i,
	/*
	 * To inform we have an extra eb under allocation, so that
	 * detach_extent_buffer_page() won't release the folio private when the
	 * eb hasn't been inserted into radix tree yet.
	 * eb hasn't been inserted into the xarray yet.
	 *
	 * The ref will be decreased when the eb releases the page, in
	 * detach_extent_buffer_page().  Thus needs no special handling in the
@@ -3299,10 +3297,9 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,

		/*
		 * We can't unlock the pages just yet since the extent buffer
		 * hasn't been properly inserted in the radix tree, this
		 * opens a race with btree_release_folio which can free a page
		 * while we are still filling in all pages for the buffer and
		 * we could crash.
		 * hasn't been properly inserted into the xarray, this opens a
		 * race with btree_release_folio() which can free a page while we
		 * are still filling in all pages for the buffer and we could crash.
		 */
	}
	if (uptodate)
@@ -3311,23 +3308,25 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
	if (page_contig)
		eb->addr = folio_address(eb->folios[0]) + offset_in_page(eb->start);
again:
	ret = radix_tree_preload(GFP_NOFS);
	if (ret)
	xa_lock_irq(&fs_info->buffer_tree);
	existing_eb = __xa_cmpxchg(&fs_info->buffer_tree,
				   start >> fs_info->sectorsize_bits, NULL, eb,
				   GFP_NOFS);
	if (xa_is_err(existing_eb)) {
		ret = xa_err(existing_eb);
		xa_unlock_irq(&fs_info->buffer_tree);
		goto out;

	spin_lock(&fs_info->buffer_lock);
	ret = radix_tree_insert(&fs_info->buffer_radix,
				start >> fs_info->sectorsize_bits, eb);
	spin_unlock(&fs_info->buffer_lock);
	radix_tree_preload_end();
	if (ret == -EEXIST) {
		ret = 0;
		existing_eb = find_extent_buffer(fs_info, start);
		if (existing_eb)
			goto out;
		else
	}
	if (existing_eb) {
		if (!atomic_inc_not_zero(&existing_eb->refs)) {
			xa_unlock_irq(&fs_info->buffer_tree);
			goto again;
		}
		xa_unlock_irq(&fs_info->buffer_tree);
		goto out;
	}
	xa_unlock_irq(&fs_info->buffer_tree);

	/* add one reference for the tree */
	check_buffer_tree_ref(eb);

@@ -3397,10 +3396,23 @@ static int release_extent_buffer(struct extent_buffer *eb)

		spin_unlock(&eb->refs_lock);

		spin_lock(&fs_info->buffer_lock);
		radix_tree_delete_item(&fs_info->buffer_radix,
				       eb->start >> fs_info->sectorsize_bits, eb);
		spin_unlock(&fs_info->buffer_lock);
		/*
		 * We're erasing, theoretically there will be no allocations, so
		 * just use GFP_ATOMIC.
		 *
		 * We use cmpxchg instead of erase because we do not know if
		 * this eb is actually in the tree or not, we could be cleaning
		 * up an eb that we allocated but never inserted into the tree.
		 * Thus use cmpxchg to remove it from the tree if it is there,
		 * or leave the other entry if this isn't in the tree.
		 *
		 * The documentation says that putting a NULL value is the same
		 * as erase as long as XA_FLAGS_ALLOC is not set, which it isn't
		 * in this case.
		 */
		xa_cmpxchg_irq(&fs_info->buffer_tree,
			       eb->start >> fs_info->sectorsize_bits, eb, NULL,
			       GFP_ATOMIC);

		btrfs_leak_debug_del_eb(eb);
		/* Should be safe to release folios at this point. */
@@ -4231,71 +4243,17 @@ void memmove_extent_buffer(const struct extent_buffer *dst,
	}
}

#define GANG_LOOKUP_SIZE	16
static struct extent_buffer *get_next_extent_buffer(
		const struct btrfs_fs_info *fs_info, struct folio *folio, u64 bytenr)
{
	struct extent_buffer *gang[GANG_LOOKUP_SIZE];
	struct extent_buffer *found = NULL;
	u64 folio_start = folio_pos(folio);
	u64 cur = folio_start;

	ASSERT(in_range(bytenr, folio_start, PAGE_SIZE));
	lockdep_assert_held(&fs_info->buffer_lock);

	while (cur < folio_start + PAGE_SIZE) {
		int ret;
		int i;

		ret = radix_tree_gang_lookup(&fs_info->buffer_radix,
				(void **)gang, cur >> fs_info->sectorsize_bits,
				min_t(unsigned int, GANG_LOOKUP_SIZE,
				      PAGE_SIZE / fs_info->nodesize));
		if (ret == 0)
			goto out;
		for (i = 0; i < ret; i++) {
			/* Already beyond page end */
			if (gang[i]->start >= folio_start + PAGE_SIZE)
				goto out;
			/* Found one */
			if (gang[i]->start >= bytenr) {
				found = gang[i];
				goto out;
			}
		}
		cur = gang[ret - 1]->start + gang[ret - 1]->len;
	}
out:
	return found;
}

static int try_release_subpage_extent_buffer(struct folio *folio)
{
	struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
	u64 cur = folio_pos(folio);
	const u64 end = cur + PAGE_SIZE;
	struct extent_buffer *eb;
	unsigned long start = (folio_pos(folio) >> fs_info->sectorsize_bits);
	unsigned long index = start;
	unsigned long end = index + (PAGE_SIZE >> fs_info->sectorsize_bits) - 1;
	int ret;

	while (cur < end) {
		struct extent_buffer *eb = NULL;

		/*
		 * Unlike try_release_extent_buffer() which uses folio private
		 * to grab buffer, for subpage case we rely on radix tree, thus
		 * we need to ensure radix tree consistency.
		 *
		 * We also want an atomic snapshot of the radix tree, thus go
		 * with spinlock rather than RCU.
		 */
		spin_lock(&fs_info->buffer_lock);
		eb = get_next_extent_buffer(fs_info, folio, cur);
		if (!eb) {
			/* No more eb in the page range after or at cur */
			spin_unlock(&fs_info->buffer_lock);
			break;
		}
		cur = eb->start + eb->len;

	xa_lock_irq(&fs_info->buffer_tree);
	xa_for_each_range(&fs_info->buffer_tree, index, eb, start, end) {
		/*
		 * The same as try_release_extent_buffer(), to ensure the eb
		 * won't disappear out from under us.
@@ -4303,10 +4261,9 @@ static int try_release_subpage_extent_buffer(struct folio *folio)
		spin_lock(&eb->refs_lock);
		if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
			spin_unlock(&eb->refs_lock);
			spin_unlock(&fs_info->buffer_lock);
			break;
			continue;
		}
		spin_unlock(&fs_info->buffer_lock);
		xa_unlock_irq(&fs_info->buffer_tree);

		/*
		 * If tree ref isn't set then we know the ref on this eb is a
@@ -4324,7 +4281,10 @@ static int try_release_subpage_extent_buffer(struct folio *folio)
		 * release_extent_buffer() will release the refs_lock.
		 */
		release_extent_buffer(eb);
		xa_lock_irq(&fs_info->buffer_tree);
	}
	xa_unlock_irq(&fs_info->buffer_tree);

	/*
	 * Finally to check if we have cleared folio private, as if we have
	 * released all ebs in the page, the folio private should be cleared now.
+1 −3
Original line number Diff line number Diff line
@@ -777,10 +777,8 @@ struct btrfs_fs_info {

	struct btrfs_delayed_root *delayed_root;

	/* Extent buffer radix tree */
	spinlock_t buffer_lock;
	/* Entries are eb->start / sectorsize */
	struct radix_tree_root buffer_radix;
	struct xarray buffer_tree;

	/* Next backup root to be overwritten */
	int backup_root_index;
+8 −20
Original line number Diff line number Diff line
@@ -157,9 +157,9 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize)

void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
{
	struct radix_tree_iter iter;
	void **slot;
	struct btrfs_device *dev, *tmp;
	struct extent_buffer *eb;
	unsigned long index;

	if (!fs_info)
		return;
@@ -169,25 +169,13 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)

	test_mnt->mnt_sb->s_fs_info = NULL;

	spin_lock(&fs_info->buffer_lock);
	radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, 0) {
		struct extent_buffer *eb;

		eb = radix_tree_deref_slot_protected(slot, &fs_info->buffer_lock);
		if (!eb)
			continue;
		/* Shouldn't happen but that kind of thinking creates CVE's */
		if (radix_tree_exception(eb)) {
			if (radix_tree_deref_retry(eb))
				slot = radix_tree_iter_retry(&iter);
			continue;
		}
		slot = radix_tree_iter_resume(slot, &iter);
		spin_unlock(&fs_info->buffer_lock);
		free_extent_buffer_stale(eb);
		spin_lock(&fs_info->buffer_lock);
	xa_lock_irq(&fs_info->buffer_tree);
	xa_for_each(&fs_info->buffer_tree, index, eb) {
		xa_unlock_irq(&fs_info->buffer_tree);
		free_extent_buffer(eb);
		xa_lock_irq(&fs_info->buffer_tree);
	}
	spin_unlock(&fs_info->buffer_lock);
	xa_unlock_irq(&fs_info->buffer_tree);

	btrfs_mapping_tree_free(fs_info);
	list_for_each_entry_safe(dev, tmp, &fs_info->fs_devices->devices,
+2 −14
Original line number Diff line number Diff line
@@ -2171,27 +2171,15 @@ static void wait_eb_writebacks(struct btrfs_block_group *block_group)
{
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	const u64 end = block_group->start + block_group->length;
	struct radix_tree_iter iter;
	struct extent_buffer *eb;
	void __rcu **slot;
	unsigned long index, start = (block_group->start >> fs_info->sectorsize_bits);

	rcu_read_lock();
	radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter,
				 block_group->start >> fs_info->sectorsize_bits) {
		eb = radix_tree_deref_slot(slot);
		if (!eb)
			continue;
		if (radix_tree_deref_retry(eb)) {
			slot = radix_tree_iter_retry(&iter);
			continue;
		}

	xa_for_each_start(&fs_info->buffer_tree, index, eb, start) {
		if (eb->start < block_group->start)
			continue;
		if (eb->start >= end)
			break;

		slot = radix_tree_iter_resume(slot, &iter);
		rcu_read_unlock();
		wait_on_extent_buffer_writeback(eb);
		rcu_read_lock();