Commit be48bcf0 authored by Linus Torvalds
Browse files
Pull btrfs fixes from David Sterba:
 "Several zoned mode fixes, mount option printing fixups, folio state
  handling fixes and one log replay fix.

   - zoned mode:
       - zone activation and finish fixes
       - block group reservation fixes

   - mount option fixes:
       - bring back printing of mount options with key=value that got
         accidentally dropped during mount option parsing in 6.8
       - fix inverse logic or typos when printing nodatasum/nodatacow

   - folio status fixes:
       - writeback fixes in zoned mode
       - properly reset dirty/writeback if submission fails
       - properly handle TOWRITE xarray mark/tag

   - do not set mtime/ctime to current time when unlinking for log
     replay"

* tag 'for-6.17-rc2-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
  btrfs: fix printing of mount info messages for NODATACOW/NODATASUM
  btrfs: restore mount option info messages during mount
  btrfs: fix incorrect log message for nobarrier mount option
  btrfs: fix buffer index in wait_eb_writebacks()
  btrfs: subpage: keep TOWRITE tag until folio is cleaned
  btrfs: clear TAG_TOWRITE from buffer tree when submitting a tree block
  btrfs: do not set mtime/ctime to current time when unlinking for log replay
  btrfs: clear block dirty if btrfs_writepage_cow_fixup() failed
  btrfs: clear block dirty if submit_one_sector() failed
  btrfs: zoned: limit active zones to max_open_zones
  btrfs: zoned: fix write time activation failure for metadata block group
  btrfs: zoned: fix data relocation block group reservation
  btrfs: zoned: skip ZONE FINISH of conventional zones
parents 074e461d 74857fdc
Loading
Loading
Loading
Loading
+19 −5
Original line number Diff line number Diff line
@@ -1512,7 +1512,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,

/*
 * Return 0 if we have submitted or queued the sector for submission.
 * Return <0 for critical errors.
 * Return <0 for critical errors, and the sector will have its dirty flag cleared.
 *
 * Caller should make sure filepos < i_size and handle filepos >= i_size case.
 */
@@ -1535,8 +1535,17 @@ static int submit_one_sector(struct btrfs_inode *inode,
	ASSERT(filepos < i_size);

	em = btrfs_get_extent(inode, NULL, filepos, sectorsize);
	if (IS_ERR(em))
	if (IS_ERR(em)) {
		/*
		 * When submission failed, we should still clear the folio dirty.
		 * Or the folio will be written back again but without any
		 * ordered extent.
		 */
		btrfs_folio_clear_dirty(fs_info, folio, filepos, sectorsize);
		btrfs_folio_set_writeback(fs_info, folio, filepos, sectorsize);
		btrfs_folio_clear_writeback(fs_info, folio, filepos, sectorsize);
		return PTR_ERR(em);
	}

	extent_offset = filepos - em->start;
	em_end = btrfs_extent_map_end(em);
@@ -1609,8 +1618,12 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode,
		folio_unlock(folio);
		return 1;
	}
	if (ret < 0)
	if (ret < 0) {
		btrfs_folio_clear_dirty(fs_info, folio, start, len);
		btrfs_folio_set_writeback(fs_info, folio, start, len);
		btrfs_folio_clear_writeback(fs_info, folio, start, len);
		return ret;
	}

	for (cur = start; cur < start + len; cur += fs_info->sectorsize)
		set_bit((cur - folio_start) >> fs_info->sectorsize_bits, &range_bitmap);
@@ -1666,8 +1679,8 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode,
	 * Here we set writeback and clear for the range. If the full folio
	 * is no longer dirty then we clear the PAGECACHE_TAG_DIRTY tag.
	 *
	 * If we hit any error, the corresponding sector will still be dirty
	 * thus no need to clear PAGECACHE_TAG_DIRTY.
	 * If we hit any error, the corresponding sector will have its dirty
	 * flag cleared and writeback finished, thus no need to handle the error case.
	 */
	if (!submitted_io && !error) {
		btrfs_folio_set_writeback(fs_info, folio, start, len);
@@ -1813,6 +1826,7 @@ static noinline_for_stack bool lock_extent_buffer_for_io(struct extent_buffer *e
		xas_load(&xas);
		xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK);
		xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY);
		xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE);
		xas_unlock_irqrestore(&xas, flags);

		btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
+19 −10
Original line number Diff line number Diff line
@@ -4189,6 +4189,23 @@ int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
	return ret;
}

/*
 * Stamp the directory's ctime and mtime with the current time after a link
 * or unlink operation on it.
 *
 * While a log tree is being replayed the timestamps are left untouched: the
 * log replay procedure is responsible for setting them to their correct
 * values (the ones the directory had when the fsync was done).
 */
static void update_time_after_link_or_unlink(struct btrfs_inode *dir)
{
	if (!test_bit(BTRFS_FS_LOG_RECOVERING, &dir->root->fs_info->flags))
		inode_set_mtime_to_ts(&dir->vfs_inode,
				      inode_set_ctime_current(&dir->vfs_inode));
}

/*
 * unlink helper that gets used here in inode.c and in the tree logging
 * recovery code.  It removes a link in a directory with a given name, and
@@ -4289,7 +4306,7 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
	inode_inc_iversion(&inode->vfs_inode);
	inode_set_ctime_current(&inode->vfs_inode);
	inode_inc_iversion(&dir->vfs_inode);
 	inode_set_mtime_to_ts(&dir->vfs_inode, inode_set_ctime_current(&dir->vfs_inode));
	update_time_after_link_or_unlink(dir);

	return btrfs_update_inode(trans, dir);
}
@@ -6683,15 +6700,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
	btrfs_i_size_write(parent_inode, parent_inode->vfs_inode.i_size +
			   name->len * 2);
	inode_inc_iversion(&parent_inode->vfs_inode);
	/*
	 * If we are replaying a log tree, we do not want to update the mtime
	 * and ctime of the parent directory with the current time, since the
	 * log replay procedure is responsible for setting them to their correct
	 * values (the ones it had when the fsync was done).
	 */
	if (!test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags))
		inode_set_mtime_to_ts(&parent_inode->vfs_inode,
				      inode_set_ctime_current(&parent_inode->vfs_inode));
	update_time_after_link_or_unlink(parent_inode);

	ret = btrfs_update_inode(trans, parent_inode);
	if (ret)
+18 −1
Original line number Diff line number Diff line
@@ -448,8 +448,25 @@ void btrfs_subpage_set_writeback(const struct btrfs_fs_info *fs_info,

	spin_lock_irqsave(&bfs->lock, flags);
	bitmap_set(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits);

	/*
	 * Don't clear the TOWRITE tag when starting writeback on a still-dirty
	 * folio. Doing so can cause WB_SYNC_ALL writepages() to overlook it,
	 * assume writeback is complete, and exit too early — violating sync
	 * ordering guarantees.
	 */
	if (!folio_test_writeback(folio))
		folio_start_writeback(folio);
		__folio_start_writeback(folio, true);
	if (!folio_test_dirty(folio)) {
		struct address_space *mapping = folio_mapping(folio);
		XA_STATE(xas, &mapping->i_pages, folio->index);
		unsigned long flags;

		xas_lock_irqsave(&xas, flags);
		xas_load(&xas);
		xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE);
		xas_unlock_irqrestore(&xas, flags);
	}
	spin_unlock_irqrestore(&bfs->lock, flags);
}

+8 −5
Original line number Diff line number Diff line
@@ -88,6 +88,9 @@ struct btrfs_fs_context {
	refcount_t refs;
};

static void btrfs_emit_options(struct btrfs_fs_info *info,
			       struct btrfs_fs_context *old);

enum {
	Opt_acl,
	Opt_clear_cache,
@@ -698,12 +701,9 @@ bool btrfs_check_options(const struct btrfs_fs_info *info,

	if (!test_bit(BTRFS_FS_STATE_REMOUNTING, &info->fs_state)) {
		if (btrfs_raw_test_opt(*mount_opt, SPACE_CACHE)) {
			btrfs_info(info, "disk space caching is enabled");
			btrfs_warn(info,
"space cache v1 is being deprecated and will be removed in a future release, please use -o space_cache=v2");
		}
		if (btrfs_raw_test_opt(*mount_opt, FREE_SPACE_TREE))
			btrfs_info(info, "using free-space-tree");
	}

	return ret;
@@ -980,6 +980,8 @@ static int btrfs_fill_super(struct super_block *sb,
		return ret;
	}

	btrfs_emit_options(fs_info, NULL);

	inode = btrfs_iget(BTRFS_FIRST_FREE_OBJECTID, fs_info->fs_root);
	if (IS_ERR(inode)) {
		ret = PTR_ERR(inode);
@@ -1437,7 +1439,7 @@ static void btrfs_emit_options(struct btrfs_fs_info *info,
{
	btrfs_info_if_set(info, old, NODATASUM, "setting nodatasum");
	btrfs_info_if_set(info, old, DEGRADED, "allowing degraded mounts");
	btrfs_info_if_set(info, old, NODATASUM, "setting nodatasum");
	btrfs_info_if_set(info, old, NODATACOW, "setting nodatacow");
	btrfs_info_if_set(info, old, SSD, "enabling ssd optimizations");
	btrfs_info_if_set(info, old, SSD_SPREAD, "using spread ssd allocation scheme");
	btrfs_info_if_set(info, old, NOBARRIER, "turning off barriers");
@@ -1459,10 +1461,11 @@ static void btrfs_emit_options(struct btrfs_fs_info *info,
	btrfs_info_if_set(info, old, IGNOREMETACSUMS, "ignoring meta csums");
	btrfs_info_if_set(info, old, IGNORESUPERFLAGS, "ignoring unknown super block flags");

	btrfs_info_if_unset(info, old, NODATASUM, "setting datasum");
	btrfs_info_if_unset(info, old, NODATACOW, "setting datacow");
	btrfs_info_if_unset(info, old, SSD, "not using ssd optimizations");
	btrfs_info_if_unset(info, old, SSD_SPREAD, "not using spread ssd allocation scheme");
	btrfs_info_if_unset(info, old, NOBARRIER, "turning off barriers");
	btrfs_info_if_unset(info, old, NOBARRIER, "turning on barriers");
	btrfs_info_if_unset(info, old, NOTREELOG, "enabling tree log");
	btrfs_info_if_unset(info, old, SPACE_CACHE, "disabling disk space caching");
	btrfs_info_if_unset(info, old, FREE_SPACE_TREE, "disabling free space tree");
+99 −34
Original line number Diff line number Diff line
@@ -17,6 +17,7 @@
#include "accessors.h"
#include "bio.h"
#include "transaction.h"
#include "sysfs.h"

/* Maximum number of zones to report per blkdev_report_zones() call */
#define BTRFS_REPORT_NR_ZONES   4096
@@ -42,6 +43,9 @@
/* Number of superblock log zones */
#define BTRFS_NR_SB_LOG_ZONES 2

/* Default number of max active zones when the device has no limits. */
#define BTRFS_DEFAULT_MAX_ACTIVE_ZONES	128

/*
 * Minimum of active zones we need:
 *
@@ -416,7 +420,10 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
	if (!IS_ALIGNED(nr_sectors, zone_sectors))
		zone_info->nr_zones++;

	max_active_zones = bdev_max_active_zones(bdev);
	max_active_zones = min_not_zero(bdev_max_active_zones(bdev),
					bdev_max_open_zones(bdev));
	if (!max_active_zones && zone_info->nr_zones > BTRFS_DEFAULT_MAX_ACTIVE_ZONES)
		max_active_zones = BTRFS_DEFAULT_MAX_ACTIVE_ZONES;
	if (max_active_zones && max_active_zones < BTRFS_MIN_ACTIVE_ZONES) {
		btrfs_err(fs_info,
"zoned: %s: max active zones %u is too small, need at least %u active zones",
@@ -2168,11 +2175,16 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
		goto out_unlock;
	}

	/* No space left */
	if (btrfs_zoned_bg_is_full(block_group)) {
	if (block_group->flags & BTRFS_BLOCK_GROUP_DATA) {
		/* The caller should check if the block group is full. */
		if (WARN_ON_ONCE(btrfs_zoned_bg_is_full(block_group))) {
			ret = false;
			goto out_unlock;
		}
	} else {
		/* Since it is already written, it should have been active. */
		WARN_ON_ONCE(block_group->meta_write_pointer != block_group->start);
	}

	for (i = 0; i < map->num_stripes; i++) {
		struct btrfs_zoned_device_info *zinfo;
@@ -2230,7 +2242,7 @@ static void wait_eb_writebacks(struct btrfs_block_group *block_group)
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	const u64 end = block_group->start + block_group->length;
	struct extent_buffer *eb;
	unsigned long index, start = (block_group->start >> fs_info->sectorsize_bits);
	unsigned long index, start = (block_group->start >> fs_info->nodesize_bits);

	rcu_read_lock();
	xa_for_each_start(&fs_info->buffer_tree, index, eb, start) {
@@ -2245,6 +2257,40 @@ static void wait_eb_writebacks(struct btrfs_block_group *block_group)
	rcu_read_unlock();
}

/*
 * Finish the device zone backing one stripe of a block group and drop it from
 * the device's active zone tracking.
 *
 * @block_group: block group the stripe belongs to; only its flags are read
 *               here, to decide whether a reserved active zone is given back.
 * @stripe:      stripe describing the device and the physical offset of the
 *               zone.
 *
 * Return 0 on success or when there is nothing to do for this stripe,
 * otherwise the error returned by blkdev_zone_mgmt().
 */
static int call_zone_finish(struct btrfs_block_group *block_group,
			    struct btrfs_io_stripe *stripe)
{
	struct btrfs_device *device = stripe->dev;
	const u64 physical = stripe->physical;
	struct btrfs_zoned_device_info *zinfo = device->zone_info;
	int ret;

	/* No block device behind this stripe (presumably a missing device). */
	if (!device->bdev)
		return 0;

	/*
	 * With max_active_zones == 0 nothing is done at all: neither the
	 * ZONE FINISH command nor the accounting below.
	 * NOTE(review): 0 presumably means the device has no active zone
	 * limit - confirm against btrfs_get_dev_zone_info().
	 */
	if (zinfo->max_active_zones == 0)
		return 0;

	/*
	 * Only zones reported as sequential get a ZONE FINISH command; for
	 * any other zone just the accounting below is updated.
	 */
	if (btrfs_dev_is_sequential(device, physical)) {
		unsigned int nofs_flags;

		/* Run the zone management call inside a NOFS allocation scope. */
		nofs_flags = memalloc_nofs_save();
		ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
				       physical >> SECTOR_SHIFT,
				       zinfo->zone_size >> SECTOR_SHIFT);
		memalloc_nofs_restore(nofs_flags);

		/* On failure the active zone accounting is left untouched. */
		if (ret)
			return ret;
	}

	/*
	 * Non-data block groups bump the device's reserved_active_zones
	 * counter (presumably handing back a reservation taken at activation
	 * time - verify against btrfs_zone_activate()).
	 */
	if (!(block_group->flags & BTRFS_BLOCK_GROUP_DATA))
		zinfo->reserved_active_zones++;
	btrfs_dev_clear_active_zone(device, physical);

	return 0;
}

static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written)
{
	struct btrfs_fs_info *fs_info = block_group->fs_info;
@@ -2329,31 +2375,12 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ
	down_read(&dev_replace->rwsem);
	map = block_group->physical_map;
	for (i = 0; i < map->num_stripes; i++) {
		struct btrfs_device *device = map->stripes[i].dev;
		const u64 physical = map->stripes[i].physical;
		struct btrfs_zoned_device_info *zinfo = device->zone_info;
		unsigned int nofs_flags;

		if (!device->bdev)
			continue;

		if (zinfo->max_active_zones == 0)
			continue;

		nofs_flags = memalloc_nofs_save();
		ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
				       physical >> SECTOR_SHIFT,
				       zinfo->zone_size >> SECTOR_SHIFT);
		memalloc_nofs_restore(nofs_flags);

		ret = call_zone_finish(block_group, &map->stripes[i]);
		if (ret) {
			up_read(&dev_replace->rwsem);
			return ret;
		}

		if (!(block_group->flags & BTRFS_BLOCK_GROUP_DATA))
			zinfo->reserved_active_zones++;
		btrfs_dev_clear_active_zone(device, physical);
	}
	up_read(&dev_replace->rwsem);

@@ -2504,12 +2531,12 @@ void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg)
void btrfs_zoned_reserve_data_reloc_bg(struct btrfs_fs_info *fs_info)
{
	struct btrfs_space_info *data_sinfo = fs_info->data_sinfo;
	struct btrfs_space_info *space_info = data_sinfo->sub_group[0];
	struct btrfs_space_info *space_info = data_sinfo;
	struct btrfs_trans_handle *trans;
	struct btrfs_block_group *bg;
	struct list_head *bg_list;
	u64 alloc_flags;
	bool initial = false;
	bool first = true;
	bool did_chunk_alloc = false;
	int index;
	int ret;
@@ -2523,21 +2550,52 @@ void btrfs_zoned_reserve_data_reloc_bg(struct btrfs_fs_info *fs_info)
	if (sb_rdonly(fs_info->sb))
		return;

	ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_DATA_RELOC);
	alloc_flags = btrfs_get_alloc_profile(fs_info, space_info->flags);
	index = btrfs_bg_flags_to_raid_index(alloc_flags);

	bg_list = &data_sinfo->block_groups[index];
	/* Scan the data space_info to find empty block groups. Take the second one. */
again:
	bg_list = &space_info->block_groups[index];
	list_for_each_entry(bg, bg_list, list) {
		if (bg->used > 0)
		if (bg->alloc_offset != 0)
			continue;

		if (!initial) {
			initial = true;
		if (first) {
			first = false;
			continue;
		}

		if (space_info == data_sinfo) {
			/* Migrate the block group to the data relocation space_info. */
			struct btrfs_space_info *reloc_sinfo = data_sinfo->sub_group[0];
			int factor;

			ASSERT(reloc_sinfo->subgroup_id == BTRFS_SUB_GROUP_DATA_RELOC);
			factor = btrfs_bg_type_to_factor(bg->flags);

			down_write(&space_info->groups_sem);
			list_del_init(&bg->list);
			/* We can assume this as we choose the second empty one. */
			ASSERT(!list_empty(&space_info->block_groups[index]));
			up_write(&space_info->groups_sem);

			spin_lock(&space_info->lock);
			space_info->total_bytes -= bg->length;
			space_info->disk_total -= bg->length * factor;
			/* There is no allocation ever happened. */
			ASSERT(bg->used == 0);
			ASSERT(bg->zone_unusable == 0);
			/* No super block in a block group on the zoned setup. */
			ASSERT(bg->bytes_super == 0);
			spin_unlock(&space_info->lock);

			bg->space_info = reloc_sinfo;
			if (reloc_sinfo->block_group_kobjs[index] == NULL)
				btrfs_sysfs_add_block_group_type(bg);

			btrfs_add_bg_to_space_info(fs_info, bg);
		}

		fs_info->data_reloc_bg = bg->start;
		set_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &bg->runtime_flags);
		btrfs_zone_activate(bg);
@@ -2552,11 +2610,18 @@ void btrfs_zoned_reserve_data_reloc_bg(struct btrfs_fs_info *fs_info)
	if (IS_ERR(trans))
		return;

	/* Allocate new BG in the data relocation space_info. */
	space_info = data_sinfo->sub_group[0];
	ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_DATA_RELOC);
	ret = btrfs_chunk_alloc(trans, space_info, alloc_flags, CHUNK_ALLOC_FORCE);
	btrfs_end_transaction(trans);
	if (ret == 1) {
		/*
		 * We allocated a new block group in the data relocation space_info. We
		 * can take that one.
		 */
		first = false;
		did_chunk_alloc = true;
		bg_list = &space_info->block_groups[index];
		goto again;
	}
}