Commit 07978330 authored by Linus Torvalds's avatar Linus Torvalds
Browse files
Pull btrfs fixes from David Sterba:

 - fix handling of folio private changes.

   The private value holds pointer to our extent buffer structure
   representing a metadata range. Release and create of the range was
   not properly synchronized when updating the private bit which ended
   up in double folio_put, leading to all sorts of breakage

 - fix a crash, reported as duplicate key in metadata, but caused by a
   race of fsync and size extending write. Requires prealloc target
   range + fsync and other conditions (log tree state, timing)

 - fix leak of qgroup extent records after transaction abort

* tag 'for-6.10-rc2-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
  btrfs: protect folio::private when attaching extent buffer folios
  btrfs: fix leak of qgroup extent records after transaction abort
  btrfs: fix crash on racing fsync and size-extending write into prealloc
parents eecba7c0 f3a5367c
Loading
Loading
Loading
Loading
+1 −9
Original line number Diff line number Diff line
@@ -4538,18 +4538,10 @@ static void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
				       struct btrfs_fs_info *fs_info)
{
	struct rb_node *node;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_delayed_ref_root *delayed_refs = &trans->delayed_refs;
	struct btrfs_delayed_ref_node *ref;

	delayed_refs = &trans->delayed_refs;

	spin_lock(&delayed_refs->lock);
	if (atomic_read(&delayed_refs->num_entries) == 0) {
		spin_unlock(&delayed_refs->lock);
		btrfs_debug(fs_info, "delayed_refs has NO entry");
		return;
	}

	while ((node = rb_first_cached(&delayed_refs->href_root)) != NULL) {
		struct btrfs_delayed_ref_head *head;
		struct rb_node *n;
+31 −29
Original line number Diff line number Diff line
@@ -3689,6 +3689,8 @@ static struct extent_buffer *grab_extent_buffer(
	struct folio *folio = page_folio(page);
	struct extent_buffer *exists;

	lockdep_assert_held(&page->mapping->i_private_lock);

	/*
	 * For subpage case, we completely rely on radix tree to ensure we
	 * don't try to insert two ebs for the same bytenr.  So here we always
@@ -3756,13 +3758,14 @@ static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start)
 * The caller needs to free the existing folios and retry using the same order.
 */
static int attach_eb_folio_to_filemap(struct extent_buffer *eb, int i,
				      struct btrfs_subpage *prealloc,
				      struct extent_buffer **found_eb_ret)
{

	struct btrfs_fs_info *fs_info = eb->fs_info;
	struct address_space *mapping = fs_info->btree_inode->i_mapping;
	const unsigned long index = eb->start >> PAGE_SHIFT;
	struct folio *existing_folio;
	struct folio *existing_folio = NULL;
	int ret;

	ASSERT(found_eb_ret);
@@ -3774,12 +3777,14 @@ static int attach_eb_folio_to_filemap(struct extent_buffer *eb, int i,
	ret = filemap_add_folio(mapping, eb->folios[i], index + i,
				GFP_NOFS | __GFP_NOFAIL);
	if (!ret)
		return 0;
		goto finish;

	existing_folio = filemap_lock_folio(mapping, index + i);
	/* The page cache only exists for a very short time, just retry. */
	if (IS_ERR(existing_folio))
	if (IS_ERR(existing_folio)) {
		existing_folio = NULL;
		goto retry;
	}

	/* For now, we should only have single-page folios for btree inode. */
	ASSERT(folio_nr_pages(existing_folio) == 1);
@@ -3790,14 +3795,13 @@ static int attach_eb_folio_to_filemap(struct extent_buffer *eb, int i,
		return -EAGAIN;
	}

	if (fs_info->nodesize < PAGE_SIZE) {
		/*
		 * We're going to reuse the existing page, can drop our page
		 * and subpage structure now.
		 */
finish:
	spin_lock(&mapping->i_private_lock);
	if (existing_folio && fs_info->nodesize < PAGE_SIZE) {
		/* We're going to reuse the existing page, can drop our folio now. */
		__free_page(folio_page(eb->folios[i], 0));
		eb->folios[i] = existing_folio;
	} else {
	} else if (existing_folio) {
		struct extent_buffer *existing_eb;

		existing_eb = grab_extent_buffer(fs_info,
@@ -3805,6 +3809,7 @@ static int attach_eb_folio_to_filemap(struct extent_buffer *eb, int i,
		if (existing_eb) {
			/* The extent buffer still exists, we can use it directly. */
			*found_eb_ret = existing_eb;
			spin_unlock(&mapping->i_private_lock);
			folio_unlock(existing_folio);
			folio_put(existing_folio);
			return 1;
@@ -3813,6 +3818,22 @@ static int attach_eb_folio_to_filemap(struct extent_buffer *eb, int i,
		__free_page(folio_page(eb->folios[i], 0));
		eb->folios[i] = existing_folio;
	}
	eb->folio_size = folio_size(eb->folios[i]);
	eb->folio_shift = folio_shift(eb->folios[i]);
	/* Should not fail, as we have preallocated the memory. */
	ret = attach_extent_buffer_folio(eb, eb->folios[i], prealloc);
	ASSERT(!ret);
	/*
	 * To inform we have an extra eb under allocation, so that
	 * detach_extent_buffer_page() won't release the folio private when the
	 * eb hasn't been inserted into radix tree yet.
	 *
	 * The ref will be decreased when the eb releases the page, in
	 * detach_extent_buffer_page().  Thus needs no special handling in the
	 * error path.
	 */
	btrfs_folio_inc_eb_refs(fs_info, eb->folios[i]);
	spin_unlock(&mapping->i_private_lock);
	return 0;
}

@@ -3824,7 +3845,6 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
	int attached = 0;
	struct extent_buffer *eb;
	struct extent_buffer *existing_eb = NULL;
	struct address_space *mapping = fs_info->btree_inode->i_mapping;
	struct btrfs_subpage *prealloc = NULL;
	u64 lockdep_owner = owner_root;
	bool page_contig = true;
@@ -3890,7 +3910,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
	for (int i = 0; i < num_folios; i++) {
		struct folio *folio;

		ret = attach_eb_folio_to_filemap(eb, i, &existing_eb);
		ret = attach_eb_folio_to_filemap(eb, i, prealloc, &existing_eb);
		if (ret > 0) {
			ASSERT(existing_eb);
			goto out;
@@ -3927,24 +3947,6 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
		 * and free the allocated page.
		 */
		folio = eb->folios[i];
		eb->folio_size = folio_size(folio);
		eb->folio_shift = folio_shift(folio);
		spin_lock(&mapping->i_private_lock);
		/* Should not fail, as we have preallocated the memory */
		ret = attach_extent_buffer_folio(eb, folio, prealloc);
		ASSERT(!ret);
		/*
		 * To inform we have extra eb under allocation, so that
		 * detach_extent_buffer_page() won't release the folio private
		 * when the eb hasn't yet been inserted into radix tree.
		 *
		 * The ref will be decreased when the eb released the page, in
		 * detach_extent_buffer_page().
		 * Thus needs no special handling in error path.
		 */
		btrfs_folio_inc_eb_refs(fs_info, folio);
		spin_unlock(&mapping->i_private_lock);

		WARN_ON(btrfs_folio_test_dirty(fs_info, folio, eb->start, eb->len));

		/*
+11 −6
Original line number Diff line number Diff line
@@ -4860,18 +4860,23 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
			path->slots[0]++;
			continue;
		}
		if (!dropped_extents) {
		/*
			 * Avoid logging extent items logged in past fsync calls
			 * and leading to duplicate keys in the log tree.
		 * Avoid overlapping items in the log tree. The first time we
		 * get here, get rid of everything from a past fsync. After
		 * that, if the current extent starts before the end of the last
		 * extent we copied, truncate the last one. This can happen if
		 * an ordered extent completion modifies the subvolume tree
		 * while btrfs_next_leaf() has the tree unlocked.
		 */
		if (!dropped_extents || key.offset < truncate_offset) {
			ret = truncate_inode_items(trans, root->log_root, inode,
						   truncate_offset,
						   min(key.offset, truncate_offset),
						   BTRFS_EXTENT_DATA_KEY);
			if (ret)
				goto out;
			dropped_extents = true;
		}
		truncate_offset = btrfs_file_extent_end(path);
		if (ins_nr == 0)
			start_slot = slot;
		ins_nr++;