Merge tag 'for-6.8-rc4-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux (1f3a3e2a) · Commits · git / linux-net

fs/btrfs/block-group.c

+78 −2

Original line number	Diff line number	Diff line
		@@ -1455,6 +1455,7 @@ static bool clean_pinned_extents(struct btrfs_trans_handle *trans,
		*/
		void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
		{
		LIST_HEAD(retry_list);
		struct btrfs_block_group *block_group;
		struct btrfs_space_info *space_info;
		struct btrfs_trans_handle *trans;
		@@ -1476,6 +1477,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)

		spin_lock(&fs_info->unused_bgs_lock);
		while (!list_empty(&fs_info->unused_bgs)) {
		u64 used;
		int trimming;

		block_group = list_first_entry(&fs_info->unused_bgs,
		@@ -1511,9 +1513,9 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
		goto next;
		}

		spin_lock(&space_info->lock);
		spin_lock(&block_group->lock);
		if (block_group->reserved \|\| block_group->pinned \|\|
		block_group->used \|\| block_group->ro \|\|
		if (btrfs_is_block_group_used(block_group) \|\| block_group->ro \|\|
		list_is_singular(&block_group->list)) {
		/*
		* We want to bail if we made new allocations or have
		@@ -1523,10 +1525,49 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
		*/
		trace_btrfs_skip_unused_block_group(block_group);
		spin_unlock(&block_group->lock);
		spin_unlock(&space_info->lock);
		up_write(&space_info->groups_sem);
		goto next;
		}

		/*
		* The block group may be unused but there may be space reserved
		* counting on the existence of that block group, that is,
		* space_info->bytes_may_use was incremented by a task but no
		* space was yet allocated from the block group by the task.
		* That space may or may not be allocated, as we are generally
		* pessimistic about space reservation for metadata as well as
		* for data when using compression (as we reserve space based on
		* the worst case, when data can't be compressed, and before
		* actually attempting compression, before starting writeback).
		*
		* So check if the total space of the space_info minus the size
		* of this block group is less than the used space of the
		* space_info - if that's the case, then it means we have tasks
		* that might be relying on the block group in order to allocate
		* extents, and add back the block group to the unused list when
		* we finish, so that we retry later in case no tasks ended up
		* needing to allocate extents from the block group.
		*/
		used = btrfs_space_info_used(space_info, true);
		if (space_info->total_bytes - block_group->length < used) {
		/*
		* Add a reference for the list, compensate for the ref
		* drop under the "next" label for the
		* fs_info->unused_bgs list.
		*/
		btrfs_get_block_group(block_group);
		list_add_tail(&block_group->bg_list, &retry_list);

		trace_btrfs_skip_unused_block_group(block_group);
		spin_unlock(&block_group->lock);
		spin_unlock(&space_info->lock);
		up_write(&space_info->groups_sem);
		goto next;
		}

		spin_unlock(&block_group->lock);
		spin_unlock(&space_info->lock);

		/* We don't want to force the issue, only flip if it's ok. */
		ret = inc_block_group_ro(block_group, 0);
		@@ -1650,12 +1691,16 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
		btrfs_put_block_group(block_group);
		spin_lock(&fs_info->unused_bgs_lock);
		}
		list_splice_tail(&retry_list, &fs_info->unused_bgs);
		spin_unlock(&fs_info->unused_bgs_lock);
		mutex_unlock(&fs_info->reclaim_bgs_lock);
		return;

		flip_async:
		btrfs_end_transaction(trans);
		spin_lock(&fs_info->unused_bgs_lock);
		list_splice_tail(&retry_list, &fs_info->unused_bgs);
		spin_unlock(&fs_info->unused_bgs_lock);
		mutex_unlock(&fs_info->reclaim_bgs_lock);
		btrfs_put_block_group(block_group);
		btrfs_discard_punt_unused_bgs_list(fs_info);
		@@ -2684,6 +2729,37 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
		btrfs_dec_delayed_refs_rsv_bg_inserts(fs_info);
		list_del_init(&block_group->bg_list);
		clear_bit(BLOCK_GROUP_FLAG_NEW, &block_group->runtime_flags);

		/*
		* If the block group is still unused, add it to the list of
		* unused block groups. The block group may have been created in
		* order to satisfy a space reservation, in which case the
		* extent allocation only happens later. But often we don't
		* actually need to allocate space that we previously reserved,
		* so the block group may become unused for a long time. For
		* example for metadata we generally reserve space for a worst
		* possible scenario, but then don't end up allocating all that
		* space or none at all (due to no need to COW, extent buffers
		* were already COWed in the current transaction and still
		* unwritten, tree heights lower than the maximum possible
		height, etc). For data we generally reserve the exact amount
		* of space we are going to allocate later, the exception is
		* when using compression, as we must reserve space based on the
		* uncompressed data size, because the compression is only done
		* when writeback triggered and we don't know how much space we
		* are actually going to need, so we reserve the uncompressed
		* size because the data may be uncompressible in the worst case.
		*/
		if (ret == 0) {
		bool used;

		spin_lock(&block_group->lock);
		used = btrfs_is_block_group_used(block_group);
		spin_unlock(&block_group->lock);

		if (!used)
		btrfs_mark_bg_unused(block_group);
		}
		}
		btrfs_trans_release_chunk_metadata(trans);
		}

fs/btrfs/block-group.h

+7 −0

Original line number	Diff line number	Diff line
		@@ -257,6 +257,13 @@ static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group)
		return (block_group->start + block_group->length);
		}

		/*
		 * Tell whether a block group still has any space accounted to it.
		 *
		 * @bg: block group to inspect. The caller must hold bg->lock; this is
		 *      asserted via lockdep_assert_held().
		 *
		 * Return: true if any of the used, reserved or pinned counters is
		 * greater than zero, false otherwise.
		 */
		static inline bool btrfs_is_block_group_used(const struct btrfs_block_group *bg)
		{
			lockdep_assert_held(&bg->lock);

			return (bg->used > 0 || bg->reserved > 0 || bg->pinned > 0);
		}

		static inline bool btrfs_is_block_group_data_only(
		struct btrfs_block_group *block_group)
		{

fs/btrfs/delalloc-space.c

+19 −10

Original line number	Diff line number	Diff line
		@@ -245,7 +245,6 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
		struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
		u64 reserve_size = 0;
		u64 qgroup_rsv_size = 0;
		u64 csum_leaves;
		unsigned outstanding_extents;

		lockdep_assert_held(&inode->lock);
		@@ -260,10 +259,12 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
		outstanding_extents);
		reserve_size += btrfs_calc_metadata_size(fs_info, 1);
		}
		csum_leaves = btrfs_csum_bytes_to_leaves(fs_info,
		inode->csum_bytes);
		reserve_size += btrfs_calc_insert_metadata_size(fs_info,
		csum_leaves);
		if (!(inode->flags & BTRFS_INODE_NODATASUM)) {
		u64 csum_leaves;

		csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, inode->csum_bytes);
		reserve_size += btrfs_calc_insert_metadata_size(fs_info, csum_leaves);
		}
		/*
		* For qgroup rsv, the calculation is very simple:
		* account one nodesize for each outstanding extent
		@@ -278,14 +279,20 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
		spin_unlock(&block_rsv->lock);
		}

		static void calc_inode_reservations(struct btrfs_fs_info *fs_info,
		static void calc_inode_reservations(struct btrfs_inode *inode,
		u64 num_bytes, u64 disk_num_bytes,
		u64 meta_reserve, u64 qgroup_reserve)
		{
		struct btrfs_fs_info *fs_info = inode->root->fs_info;
		u64 nr_extents = count_max_extents(fs_info, num_bytes);
		u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, disk_num_bytes);
		u64 csum_leaves;
		u64 inode_update = btrfs_calc_metadata_size(fs_info, 1);

		if (inode->flags & BTRFS_INODE_NODATASUM)
		csum_leaves = 0;
		else
		csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, disk_num_bytes);

		*meta_reserve = btrfs_calc_insert_metadata_size(fs_info,
		nr_extents + csum_leaves);

		@@ -337,7 +344,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
		* everything out and try again, which is bad. This way we just
		* over-reserve slightly, and clean up the mess when we are done.
		*/
		calc_inode_reservations(fs_info, num_bytes, disk_num_bytes,
		calc_inode_reservations(inode, num_bytes, disk_num_bytes,
		&meta_reserve, &qgroup_reserve);
		ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true,
		noflush);
		@@ -359,6 +366,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
		nr_extents = count_max_extents(fs_info, num_bytes);
		spin_lock(&inode->lock);
		btrfs_mod_outstanding_extents(inode, nr_extents);
		if (!(inode->flags & BTRFS_INODE_NODATASUM))
		inode->csum_bytes += disk_num_bytes;
		btrfs_calculate_inode_block_rsv_size(fs_info, inode);
		spin_unlock(&inode->lock);
		@@ -393,6 +401,7 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,

		num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
		spin_lock(&inode->lock);
		if (!(inode->flags & BTRFS_INODE_NODATASUM))
		inode->csum_bytes -= num_bytes;
		btrfs_calculate_inode_block_rsv_size(fs_info, inode);
		spin_unlock(&inode->lock);

fs/btrfs/inode.c

+24 −2

Original line number	Diff line number	Diff line
		@@ -3184,8 +3184,23 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
		unwritten_start += logical_len;
		clear_extent_uptodate(io_tree, unwritten_start, end, NULL);

		/* Drop extent maps for the part of the extent we didn't write. */
		btrfs_drop_extent_map_range(inode, unwritten_start, end, false);
		/*
		* Drop extent maps for the part of the extent we didn't write.
		*
		* We have an exception here for the free_space_inode, this is
		* because when we do btrfs_get_extent() on the free space inode
		* we will search the commit root. If this is a new block group
		* we won't find anything, and we will trip over the assert in
		* writepage where we do ASSERT(em->block_start !=
		* EXTENT_MAP_HOLE).
		*
		* Theoretically we could also skip this for any NOCOW extent as
		* we don't mess with the extent map tree in the NOCOW case, but
		* for now simply skip this if we are the free space inode.
		*/
		if (!btrfs_is_free_space_inode(inode))
		btrfs_drop_extent_map_range(inode, unwritten_start,
		end, false);

		/*
		* If the ordered extent had an IOERR or something else went
		@@ -10273,6 +10288,13 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
		if (encoded->encryption != BTRFS_ENCODED_IO_ENCRYPTION_NONE)
		return -EINVAL;

		/*
		* Compressed extents should always have checksums, so error out if we
		* have a NOCOW file or the inode was created while mounted with NODATASUM.
		*/
		if (inode->flags & BTRFS_INODE_NODATASUM)
		return -EINVAL;

		orig_count = iov_iter_count(from);

		/* The extent size must be sane. */

fs/btrfs/transaction.c

+2 −36

Original line number	Diff line number	Diff line
		@@ -564,56 +564,22 @@ static int btrfs_reserve_trans_metadata(struct btrfs_fs_info *fs_info,
		u64 num_bytes,
		u64 *delayed_refs_bytes)
		{
		struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
		struct btrfs_space_info *si = fs_info->trans_block_rsv.space_info;
		u64 extra_delayed_refs_bytes = 0;
		u64 bytes;
		u64 bytes = num_bytes + *delayed_refs_bytes;
		int ret;

		/*
		* If there's a gap between the size of the delayed refs reserve and
		* its reserved space, then some tasks have added delayed refs or bumped
		* its size otherwise (due to block group creation or removal, or block
		* group item update). Also try to allocate that gap in order to prevent
		* using (and possibly abusing) the global reserve when committing the
		* transaction.
		*/
		if (flush == BTRFS_RESERVE_FLUSH_ALL &&
		!btrfs_block_rsv_full(delayed_refs_rsv)) {
		spin_lock(&delayed_refs_rsv->lock);
		if (delayed_refs_rsv->size > delayed_refs_rsv->reserved)
		extra_delayed_refs_bytes = delayed_refs_rsv->size -
		delayed_refs_rsv->reserved;
		spin_unlock(&delayed_refs_rsv->lock);
		}

		bytes = num_bytes + *delayed_refs_bytes + extra_delayed_refs_bytes;

		/*
		* We want to reserve all the bytes we may need all at once, so we only
		* do 1 enospc flushing cycle per transaction start.
		*/
		ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush);
		if (ret == 0) {
		if (extra_delayed_refs_bytes > 0)
		btrfs_migrate_to_delayed_refs_rsv(fs_info,
		extra_delayed_refs_bytes);
		return 0;
		}

		if (extra_delayed_refs_bytes > 0) {
		bytes -= extra_delayed_refs_bytes;
		ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush);
		if (ret == 0)
		return 0;
		}

		/*
		* If we are an emergency flush, which can steal from the global block
		* reserve, then attempt to not reserve space for the delayed refs, as
		* we will consume space for them from the global block reserve.
		*/
		if (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL) {
		if (ret && flush == BTRFS_RESERVE_FLUSH_ALL_STEAL) {
		bytes -= *delayed_refs_bytes;
		*delayed_refs_bytes = 0;
		ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush);