Commit 1f3a3e2a authored by Linus Torvalds
Browse files
Pull btrfs fixes from David Sterba:
 "A few regular fixes and one fix for space reservation regression since
  6.7 that users have been reporting:

   - fix over-reservation of metadata chunks due to not keeping proper
     balance between global block reserve and delayed refs reserve; in
     practice this leaves behind empty metadata block groups, the
     workaround is to reclaim them by using the '-musage=1' balance
     filter

   - other space reservation fixes:
      - do not delete unused block group if it may be used soon
      - do not reserve space for checksums for NOCOW files

   - fix extent map assertion failure when writing out free space inode

   - reject encoded write if inode has nodatasum flag set

   - fix chunk map leak when loading block group zone info"

* tag 'for-6.8-rc4-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
  btrfs: don't refill whole delayed refs block reserve when starting transaction
  btrfs: zoned: fix chunk map leak when loading block group zone info
  btrfs: reject encoded write if inode has nodatasum flag set
  btrfs: don't reserve space for checksums when writing to nocow files
  btrfs: add new unused block groups to the list of unused block groups
  btrfs: do not delete unused block group if it may be used soon
  btrfs: add and use helper to check if block group is used
  btrfs: don't drop extent_map for free space inode on write error
parents 91f842ff 2f6397e4
Loading
Loading
Loading
Loading
+78 −2
Original line number Diff line number Diff line
@@ -1455,6 +1455,7 @@ static bool clean_pinned_extents(struct btrfs_trans_handle *trans,
 */
void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
{
	LIST_HEAD(retry_list);
	struct btrfs_block_group *block_group;
	struct btrfs_space_info *space_info;
	struct btrfs_trans_handle *trans;
@@ -1476,6 +1477,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)

	spin_lock(&fs_info->unused_bgs_lock);
	while (!list_empty(&fs_info->unused_bgs)) {
		u64 used;
		int trimming;

		block_group = list_first_entry(&fs_info->unused_bgs,
@@ -1511,9 +1513,9 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
			goto next;
		}

		spin_lock(&space_info->lock);
		spin_lock(&block_group->lock);
		if (block_group->reserved || block_group->pinned ||
		    block_group->used || block_group->ro ||
		if (btrfs_is_block_group_used(block_group) || block_group->ro ||
		    list_is_singular(&block_group->list)) {
			/*
			 * We want to bail if we made new allocations or have
@@ -1523,10 +1525,49 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
			 */
			trace_btrfs_skip_unused_block_group(block_group);
			spin_unlock(&block_group->lock);
			spin_unlock(&space_info->lock);
			up_write(&space_info->groups_sem);
			goto next;
		}

		/*
		 * The block group may be unused but there may be space reserved
		 * accounting with the existence of that block group, that is,
		 * space_info->bytes_may_use was incremented by a task but no
		 * space was yet allocated from the block group by the task.
		 * That space may or may not be allocated, as we are generally
		 * pessimistic about space reservation for metadata as well as
		 * for data when using compression (as we reserve space based on
		 * the worst case, when data can't be compressed, and before
		 * actually attempting compression, before starting writeback).
		 *
		 * So check if the total space of the space_info minus the size
		 * of this block group is less than the used space of the
		 * space_info - if that's the case, then it means we have tasks
		 * that might be relying on the block group in order to allocate
		 * extents, and add back the block group to the unused list when
		 * we finish, so that we retry later in case no tasks ended up
		 * needing to allocate extents from the block group.
		 */
		used = btrfs_space_info_used(space_info, true);
		if (space_info->total_bytes - block_group->length < used) {
			/*
			 * Add a reference for the list, compensate for the ref
			 * drop under the "next" label for the
			 * fs_info->unused_bgs list.
			 */
			btrfs_get_block_group(block_group);
			list_add_tail(&block_group->bg_list, &retry_list);

			trace_btrfs_skip_unused_block_group(block_group);
			spin_unlock(&block_group->lock);
			spin_unlock(&space_info->lock);
			up_write(&space_info->groups_sem);
			goto next;
		}

		spin_unlock(&block_group->lock);
		spin_unlock(&space_info->lock);

		/* We don't want to force the issue, only flip if it's ok. */
		ret = inc_block_group_ro(block_group, 0);
@@ -1650,12 +1691,16 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
		btrfs_put_block_group(block_group);
		spin_lock(&fs_info->unused_bgs_lock);
	}
	list_splice_tail(&retry_list, &fs_info->unused_bgs);
	spin_unlock(&fs_info->unused_bgs_lock);
	mutex_unlock(&fs_info->reclaim_bgs_lock);
	return;

flip_async:
	btrfs_end_transaction(trans);
	spin_lock(&fs_info->unused_bgs_lock);
	list_splice_tail(&retry_list, &fs_info->unused_bgs);
	spin_unlock(&fs_info->unused_bgs_lock);
	mutex_unlock(&fs_info->reclaim_bgs_lock);
	btrfs_put_block_group(block_group);
	btrfs_discard_punt_unused_bgs_list(fs_info);
@@ -2684,6 +2729,37 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
		btrfs_dec_delayed_refs_rsv_bg_inserts(fs_info);
		list_del_init(&block_group->bg_list);
		clear_bit(BLOCK_GROUP_FLAG_NEW, &block_group->runtime_flags);

		/*
		 * If the block group is still unused, add it to the list of
		 * unused block groups. The block group may have been created in
		 * order to satisfy a space reservation, in which case the
		 * extent allocation only happens later. But often we don't
		 * actually need to allocate space that we previously reserved,
		 * so the block group may become unused for a long time. For
		 * example for metadata we generally reserve space for a worst
		 * possible scenario, but then don't end up allocating all that
		 * space or none at all (due to no need to COW, extent buffers
		 * were already COWed in the current transaction and still
		 * unwritten, tree heights lower than the maximum possible
		 * height, etc). For data we generally reserve the exact amount
		 * of space we are going to allocate later, the exception is
		 * when using compression, as we must reserve space based on the
		 * uncompressed data size, because the compression is only done
		 * when writeback is triggered and we don't know how much space
		 * we are actually going to need, so we reserve the uncompressed
		 * size because the data may be uncompressible in the worst case.
		 */
		if (ret == 0) {
			bool used;

			spin_lock(&block_group->lock);
			used = btrfs_is_block_group_used(block_group);
			spin_unlock(&block_group->lock);

			if (!used)
				btrfs_mark_bg_unused(block_group);
		}
	}
	btrfs_trans_release_chunk_metadata(trans);
}
+7 −0
Original line number Diff line number Diff line
@@ -257,6 +257,13 @@ static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group)
	return (block_group->start + block_group->length);
}

/*
 * Tell whether a block group currently has any bytes accounted to it.
 *
 * Returns true if at least one of the block group's used, reserved or pinned
 * counters is greater than zero, false when all three are zero.
 *
 * The caller must hold bg->lock; this is checked with lockdep.
 */
static inline bool btrfs_is_block_group_used(const struct btrfs_block_group *bg)
{
	lockdep_assert_held(&bg->lock);

	if (bg->used > 0)
		return true;
	if (bg->reserved > 0)
		return true;

	return bg->pinned > 0;
}

static inline bool btrfs_is_block_group_data_only(
					struct btrfs_block_group *block_group)
{
+19 −10
Original line number Diff line number Diff line
@@ -245,7 +245,6 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
	struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
	u64 reserve_size = 0;
	u64 qgroup_rsv_size = 0;
	u64 csum_leaves;
	unsigned outstanding_extents;

	lockdep_assert_held(&inode->lock);
@@ -260,10 +259,12 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
						outstanding_extents);
		reserve_size += btrfs_calc_metadata_size(fs_info, 1);
	}
	csum_leaves = btrfs_csum_bytes_to_leaves(fs_info,
						 inode->csum_bytes);
	reserve_size += btrfs_calc_insert_metadata_size(fs_info,
							csum_leaves);
	if (!(inode->flags & BTRFS_INODE_NODATASUM)) {
		u64 csum_leaves;

		csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, inode->csum_bytes);
		reserve_size += btrfs_calc_insert_metadata_size(fs_info, csum_leaves);
	}
	/*
	 * For qgroup rsv, the calculation is very simple:
	 * account one nodesize for each outstanding extent
@@ -278,14 +279,20 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
	spin_unlock(&block_rsv->lock);
}

static void calc_inode_reservations(struct btrfs_fs_info *fs_info,
static void calc_inode_reservations(struct btrfs_inode *inode,
				    u64 num_bytes, u64 disk_num_bytes,
				    u64 *meta_reserve, u64 *qgroup_reserve)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	u64 nr_extents = count_max_extents(fs_info, num_bytes);
	u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, disk_num_bytes);
	u64 csum_leaves;
	u64 inode_update = btrfs_calc_metadata_size(fs_info, 1);

	if (inode->flags & BTRFS_INODE_NODATASUM)
		csum_leaves = 0;
	else
		csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, disk_num_bytes);

	*meta_reserve = btrfs_calc_insert_metadata_size(fs_info,
						nr_extents + csum_leaves);

@@ -337,7 +344,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
	 * everything out and try again, which is bad.  This way we just
	 * over-reserve slightly, and clean up the mess when we are done.
	 */
	calc_inode_reservations(fs_info, num_bytes, disk_num_bytes,
	calc_inode_reservations(inode, num_bytes, disk_num_bytes,
				&meta_reserve, &qgroup_reserve);
	ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true,
						 noflush);
@@ -359,6 +366,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
	nr_extents = count_max_extents(fs_info, num_bytes);
	spin_lock(&inode->lock);
	btrfs_mod_outstanding_extents(inode, nr_extents);
	if (!(inode->flags & BTRFS_INODE_NODATASUM))
		inode->csum_bytes += disk_num_bytes;
	btrfs_calculate_inode_block_rsv_size(fs_info, inode);
	spin_unlock(&inode->lock);
@@ -393,6 +401,7 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,

	num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
	spin_lock(&inode->lock);
	if (!(inode->flags & BTRFS_INODE_NODATASUM))
		inode->csum_bytes -= num_bytes;
	btrfs_calculate_inode_block_rsv_size(fs_info, inode);
	spin_unlock(&inode->lock);
+24 −2
Original line number Diff line number Diff line
@@ -3184,8 +3184,23 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
			unwritten_start += logical_len;
		clear_extent_uptodate(io_tree, unwritten_start, end, NULL);

		/* Drop extent maps for the part of the extent we didn't write. */
		btrfs_drop_extent_map_range(inode, unwritten_start, end, false);
		/*
		 * Drop extent maps for the part of the extent we didn't write.
		 *
		 * We have an exception here for the free_space_inode, this is
		 * because when we do btrfs_get_extent() on the free space inode
		 * we will search the commit root.  If this is a new block group
		 * we won't find anything, and we will trip over the assert in
		 * writepage where we do ASSERT(em->block_start !=
		 * EXTENT_MAP_HOLE).
		 *
		 * Theoretically we could also skip this for any NOCOW extent as
		 * we don't mess with the extent map tree in the NOCOW case, but
		 * for now simply skip this if we are the free space inode.
		 */
		if (!btrfs_is_free_space_inode(inode))
			btrfs_drop_extent_map_range(inode, unwritten_start,
						    end, false);

		/*
		 * If the ordered extent had an IOERR or something else went
@@ -10273,6 +10288,13 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
	if (encoded->encryption != BTRFS_ENCODED_IO_ENCRYPTION_NONE)
		return -EINVAL;

	/*
	 * Compressed extents should always have checksums, so error out if we
	 * have a NOCOW file or inode was created while mounted with NODATASUM.
	 */
	if (inode->flags & BTRFS_INODE_NODATASUM)
		return -EINVAL;

	orig_count = iov_iter_count(from);

	/* The extent size must be sane. */
+2 −36
Original line number Diff line number Diff line
@@ -564,56 +564,22 @@ static int btrfs_reserve_trans_metadata(struct btrfs_fs_info *fs_info,
					u64 num_bytes,
					u64 *delayed_refs_bytes)
{
	struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
	struct btrfs_space_info *si = fs_info->trans_block_rsv.space_info;
	u64 extra_delayed_refs_bytes = 0;
	u64 bytes;
	u64 bytes = num_bytes + *delayed_refs_bytes;
	int ret;

	/*
	 * If there's a gap between the size of the delayed refs reserve and
	 * its reserved space, then some tasks have added delayed refs or bumped
	 * its size otherwise (due to block group creation or removal, or block
	 * group item update). Also try to allocate that gap in order to prevent
	 * using (and possibly abusing) the global reserve when committing the
	 * transaction.
	 */
	if (flush == BTRFS_RESERVE_FLUSH_ALL &&
	    !btrfs_block_rsv_full(delayed_refs_rsv)) {
		spin_lock(&delayed_refs_rsv->lock);
		if (delayed_refs_rsv->size > delayed_refs_rsv->reserved)
			extra_delayed_refs_bytes = delayed_refs_rsv->size -
				delayed_refs_rsv->reserved;
		spin_unlock(&delayed_refs_rsv->lock);
	}

	bytes = num_bytes + *delayed_refs_bytes + extra_delayed_refs_bytes;

	/*
	 * We want to reserve all the bytes we may need all at once, so we only
	 * do 1 enospc flushing cycle per transaction start.
	 */
	ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush);
	if (ret == 0) {
		if (extra_delayed_refs_bytes > 0)
			btrfs_migrate_to_delayed_refs_rsv(fs_info,
							  extra_delayed_refs_bytes);
		return 0;
	}

	if (extra_delayed_refs_bytes > 0) {
		bytes -= extra_delayed_refs_bytes;
		ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush);
		if (ret == 0)
			return 0;
	}

	/*
	 * If we are an emergency flush, which can steal from the global block
	 * reserve, then attempt to not reserve space for the delayed refs, as
	 * we will consume space for them from the global block reserve.
	 */
	if (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL) {
	if (ret && flush == BTRFS_RESERVE_FLUSH_ALL_STEAL) {
		bytes -= *delayed_refs_bytes;
		*delayed_refs_bytes = 0;
		ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush);
Loading