Commit 0d19d9e1 authored by Linus Torvalds
Browse files

Merge tag 'ext4_for_linus-6.8-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4

Pull ext4 updates from Ted Ts'o:
 "Various ext4 bug fixes and cleanups. The fixes are mostly in the
  fstrim and mballoc code paths.

  Also enable dioread_nolock in the case where the block size is less
  than the page size (dioread_nolock has been default in the bs == ps
  case for quite some time)"

* tag 'ext4_for_linus-6.8-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4:
  ext4: fix inconsistent between segment fstrim and full fstrim
  ext4: fallback to complex scan if aligned scan doesn't work
  ext4: convert ext4_da_do_write_end() to take a folio
  ext4: allow for the last group to be marked as trimmed
  ext4: move ext4_check_bdev_write_error() into nojournal mode
  jbd2: abort journal when detecting metadata writeback error of fs dev
  jbd2: remove unused 'JBD2_CHECKPOINT_IO_ERROR' and 'j_atomic_flags'
  jbd2: replace journal state flag by checking errseq
  jbd2: add errseq to detect client fs's bdev writeback error
  ext4: improving calculation of 'fe_{len|start}' in mb_find_extent()
  ext4: clarify handling of unwritten bh in __ext4_block_zero_page_range()
  ext4: treat end of range as exclusive in ext4_zero_range()
  ext4: enable dioread_nolock as default for bs < ps case
  ext4: delete redundant calculations in ext4_mb_get_buddy_page_lock()
  ext4: reduce unnecessary memory allocation in alloc_flex_gd()
  ext4: avoid online resizing failures due to oversized flex bg
  ext4: remove unnecessary check from alloc_flex_gd()
  ext4: unify the type of flexbg_size to unsigned int
parents 6bd593bc 68da4c44
Loading
Loading
Loading
Loading
+2 −3
Original line number Diff line number Diff line
@@ -235,8 +235,6 @@ int __ext4_journal_get_write_access(const char *where, unsigned int line,

	might_sleep();

	ext4_check_bdev_write_error(sb);

	if (ext4_handle_valid(handle)) {
		err = jbd2_journal_get_write_access(handle, bh);
		if (err) {
@@ -244,7 +242,8 @@ int __ext4_journal_get_write_access(const char *where, unsigned int line,
						  handle, err);
			return err;
		}
	}
	} else
		ext4_check_bdev_write_error(sb);
	if (trigger_type == EXT4_JTR_NONE || !ext4_has_metadata_csum(sb))
		return 0;
	BUG_ON(trigger_type >= EXT4_JOURNAL_TRIGGER_COUNT);
+4 −2
Original line number Diff line number Diff line
@@ -4523,7 +4523,8 @@ static long ext4_zero_range(struct file *file, loff_t offset,
	 * Round up offset. This is not fallocate, we need to zero out
	 * blocks, so convert interior block aligned part of the range to
	 * unwritten and possibly manually zero out unaligned parts of the
	 * range.
	 * range. Here, start and partial_begin are inclusive, end and
	 * partial_end are exclusive.
	 */
	start = round_up(offset, 1 << blkbits);
	end = round_down((offset + len), 1 << blkbits);
@@ -4609,7 +4610,8 @@ static long ext4_zero_range(struct file *file, loff_t offset,
		 * disk in case of crash before zeroing trans is committed.
		 */
		if (ext4_should_journal_data(inode)) {
			ret = filemap_write_and_wait_range(mapping, start, end);
			ret = filemap_write_and_wait_range(mapping, start,
							   end - 1);
			if (ret) {
				filemap_invalidate_unlock(mapping);
				goto out_mutex;
+16 −9
Original line number Diff line number Diff line
@@ -2947,7 +2947,7 @@ static int ext4_da_should_update_i_disksize(struct folio *folio,

static int ext4_da_do_write_end(struct address_space *mapping,
			loff_t pos, unsigned len, unsigned copied,
			struct page *page)
			struct folio *folio)
{
	struct inode *inode = mapping->host;
	loff_t old_size = inode->i_size;
@@ -2958,12 +2958,13 @@ static int ext4_da_do_write_end(struct address_space *mapping,
	 * block_write_end() will mark the inode as dirty with I_DIRTY_PAGES
	 * flag, which is all that's needed to trigger page writeback.
	 */
	copied = block_write_end(NULL, mapping, pos, len, copied, page, NULL);
	copied = block_write_end(NULL, mapping, pos, len, copied,
			&folio->page, NULL);
	new_i_size = pos + copied;

	/*
	 * It's important to update i_size while still holding page lock,
	 * because page writeout could otherwise come in and zero beyond
	 * It's important to update i_size while still holding folio lock,
	 * because folio writeout could otherwise come in and zero beyond
	 * i_size.
	 *
	 * Since we are holding inode lock, we are sure i_disksize <=
@@ -2981,14 +2982,14 @@ static int ext4_da_do_write_end(struct address_space *mapping,

		i_size_write(inode, new_i_size);
		end = (new_i_size - 1) & (PAGE_SIZE - 1);
		if (copied && ext4_da_should_update_i_disksize(page_folio(page), end)) {
		if (copied && ext4_da_should_update_i_disksize(folio, end)) {
			ext4_update_i_disksize(inode, new_i_size);
			disksize_changed = true;
		}
	}

	unlock_page(page);
	put_page(page);
	folio_unlock(folio);
	folio_put(folio);

	if (old_size < pos)
		pagecache_isize_extended(inode, old_size, pos);
@@ -3027,10 +3028,10 @@ static int ext4_da_write_end(struct file *file,
		return ext4_write_inline_data_end(inode, pos, len, copied,
						  folio);

	if (unlikely(copied < len) && !PageUptodate(page))
	if (unlikely(copied < len) && !folio_test_uptodate(folio))
		copied = 0;

	return ext4_da_do_write_end(mapping, pos, len, copied, &folio->page);
	return ext4_da_do_write_end(mapping, pos, len, copied, folio);
}

/*
@@ -3630,6 +3631,12 @@ void ext4_set_aops(struct inode *inode)
		inode->i_mapping->a_ops = &ext4_aops;
}

/*
 * Here we can't skip an unwritten buffer even though it usually reads zero
 * because it might have data in pagecache (eg, if called from ext4_zero_range,
 * ext4_punch_hole, etc) which needs to be properly zeroed out. Otherwise a
 * racing writeback can come later and flush the stale pagecache to disk.
 */
static int __ext4_block_zero_page_range(handle_t *handle,
		struct address_space *mapping, loff_t from, loff_t length)
{
+37 −28
Original line number Diff line number Diff line
@@ -1456,9 +1456,8 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
		return 0;
	}

	block++;
	pnum = block / blocks_per_page;
	page = find_or_create_page(inode->i_mapping, pnum, gfp);
	/* blocks_per_page == 1, hence we need another page for the buddy */
	page = find_or_create_page(inode->i_mapping, block + 1, gfp);
	if (!page)
		return -ENOMEM;
	BUG_ON(page->mapping != inode->i_mapping);
@@ -1958,8 +1957,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
static int mb_find_extent(struct ext4_buddy *e4b, int block,
				int needed, struct ext4_free_extent *ex)
{
	int next = block;
	int max, order;
	int max, order, next;
	void *buddy;

	assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
@@ -1977,16 +1975,12 @@ static int mb_find_extent(struct ext4_buddy *e4b, int block,

	/* find actual order */
	order = mb_find_order_for_block(e4b, block);
	block = block >> order;

	ex->fe_len = 1 << order;
	ex->fe_start = block << order;
	ex->fe_len = (1 << order) - (block & ((1 << order) - 1));
	ex->fe_start = block;
	ex->fe_group = e4b->bd_group;

	/* calc difference from given start */
	next = next - ex->fe_start;
	ex->fe_len -= next;
	ex->fe_start += next;
	block = block >> order;

	while (needed > ex->fe_len &&
	       mb_find_buddy(e4b, order, &max)) {
@@ -2895,14 +2889,19 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
			ac->ac_groups_scanned++;
			if (cr == CR_POWER2_ALIGNED)
				ext4_mb_simple_scan_group(ac, &e4b);
			else if ((cr == CR_GOAL_LEN_FAST ||
				 cr == CR_BEST_AVAIL_LEN) &&
				 sbi->s_stripe &&
			else {
				bool is_stripe_aligned = sbi->s_stripe &&
					!(ac->ac_g_ex.fe_len %
				 EXT4_B2C(sbi, sbi->s_stripe)))
					  EXT4_B2C(sbi, sbi->s_stripe));

				if ((cr == CR_GOAL_LEN_FAST ||
				     cr == CR_BEST_AVAIL_LEN) &&
				    is_stripe_aligned)
					ext4_mb_scan_aligned(ac, &e4b);
			else

				if (ac->ac_status == AC_STATUS_CONTINUE)
					ext4_mb_complex_scan_group(ac, &e4b);
			}

			ext4_unlock_group(sb, group);
			ext4_mb_unload_buddy(&e4b);
@@ -6735,11 +6734,16 @@ __acquires(bitlock)
static ext4_grpblk_t ext4_last_grp_cluster(struct super_block *sb,
					   ext4_group_t grp)
{
	if (grp < ext4_get_groups_count(sb))
		return EXT4_CLUSTERS_PER_GROUP(sb) - 1;
	return (ext4_blocks_count(EXT4_SB(sb)->s_es) -
		ext4_group_first_block_no(sb, grp) - 1) >>
					EXT4_CLUSTER_BITS(sb);
	unsigned long nr_clusters_in_group;

	if (grp < (ext4_get_groups_count(sb) - 1))
		nr_clusters_in_group = EXT4_CLUSTERS_PER_GROUP(sb);
	else
		nr_clusters_in_group = (ext4_blocks_count(EXT4_SB(sb)->s_es) -
					ext4_group_first_block_no(sb, grp))
				       >> EXT4_CLUSTER_BITS(sb);

	return nr_clusters_in_group - 1;
}

static bool ext4_trim_interrupted(void)
@@ -6753,13 +6757,15 @@ static int ext4_try_to_trim_range(struct super_block *sb,
__acquires(ext4_group_lock_ptr(sb, e4b->bd_group))
__releases(ext4_group_lock_ptr(sb, e4b->bd_group))
{
	ext4_grpblk_t next, count, free_count;
	ext4_grpblk_t next, count, free_count, last, origin_start;
	bool set_trimmed = false;
	void *bitmap;

	last = ext4_last_grp_cluster(sb, e4b->bd_group);
	bitmap = e4b->bd_bitmap;
	if (start == 0 && max >= ext4_last_grp_cluster(sb, e4b->bd_group))
	if (start == 0 && max >= last)
		set_trimmed = true;
	origin_start = start;
	start = max(e4b->bd_info->bb_first_free, start);
	count = 0;
	free_count = 0;
@@ -6768,7 +6774,10 @@ __releases(ext4_group_lock_ptr(sb, e4b->bd_group))
		start = mb_find_next_zero_bit(bitmap, max + 1, start);
		if (start > max)
			break;
		next = mb_find_next_bit(bitmap, max + 1, start);

		next = mb_find_next_bit(bitmap, last + 1, start);
		if (origin_start == 0 && next >= last)
			set_trimmed = true;

		if ((next - start) >= minblocks) {
			int ret = ext4_trim_extent(sb, start, next - start, e4b);
+33 −16
Original line number Diff line number Diff line
@@ -218,35 +218,53 @@ struct ext4_new_flex_group_data {
						   in the flex group */
	__u16 *bg_flags;			/* block group flags of groups
						   in @groups */
	ext4_group_t resize_bg;			/* number of allocated
						   new_group_data */
	ext4_group_t count;			/* number of groups in @groups
						 */
};

/*
 * Avoid memory allocation failures caused by adding too many groups at a time.
 */
#define MAX_RESIZE_BG				16384

/*
 * alloc_flex_gd() allocates an ext4_new_flex_group_data with size of
 * @flexbg_size.
 *
 * Returns NULL on failure otherwise address of the allocated structure.
 */
static struct ext4_new_flex_group_data *alloc_flex_gd(unsigned long flexbg_size)
static struct ext4_new_flex_group_data *alloc_flex_gd(unsigned int flexbg_size,
				ext4_group_t o_group, ext4_group_t n_group)
{
	ext4_group_t last_group;
	struct ext4_new_flex_group_data *flex_gd;

	flex_gd = kmalloc(sizeof(*flex_gd), GFP_NOFS);
	if (flex_gd == NULL)
		goto out3;

	if (flexbg_size >= UINT_MAX / sizeof(struct ext4_new_group_data))
		goto out2;
	flex_gd->count = flexbg_size;
	if (unlikely(flexbg_size > MAX_RESIZE_BG))
		flex_gd->resize_bg = MAX_RESIZE_BG;
	else
		flex_gd->resize_bg = flexbg_size;

	flex_gd->groups = kmalloc_array(flexbg_size,
	/* Avoid allocating large 'groups' array if not needed */
	last_group = o_group | (flex_gd->resize_bg - 1);
	if (n_group <= last_group)
		flex_gd->resize_bg = 1 << fls(n_group - o_group + 1);
	else if (n_group - last_group < flex_gd->resize_bg)
		flex_gd->resize_bg = 1 << max(fls(last_group - o_group + 1),
					      fls(n_group - last_group));

	flex_gd->groups = kmalloc_array(flex_gd->resize_bg,
					sizeof(struct ext4_new_group_data),
					GFP_NOFS);
	if (flex_gd->groups == NULL)
		goto out2;

	flex_gd->bg_flags = kmalloc_array(flexbg_size, sizeof(__u16),
	flex_gd->bg_flags = kmalloc_array(flex_gd->resize_bg, sizeof(__u16),
					  GFP_NOFS);
	if (flex_gd->bg_flags == NULL)
		goto out1;
@@ -283,7 +301,7 @@ static void free_flex_gd(struct ext4_new_flex_group_data *flex_gd)
 */
static int ext4_alloc_group_tables(struct super_block *sb,
				struct ext4_new_flex_group_data *flex_gd,
				int flexbg_size)
				unsigned int flexbg_size)
{
	struct ext4_new_group_data *group_data = flex_gd->groups;
	ext4_fsblk_t start_blk;
@@ -384,12 +402,12 @@ static int ext4_alloc_group_tables(struct super_block *sb,
		group = group_data[0].group;

		printk(KERN_DEBUG "EXT4-fs: adding a flex group with "
		       "%d groups, flexbg size is %d:\n", flex_gd->count,
		       "%u groups, flexbg size is %u:\n", flex_gd->count,
		       flexbg_size);

		for (i = 0; i < flex_gd->count; i++) {
			ext4_debug(
			       "adding %s group %u: %u blocks (%d free, %d mdata blocks)\n",
			       "adding %s group %u: %u blocks (%u free, %u mdata blocks)\n",
			       ext4_bg_has_super(sb, group + i) ? "normal" :
			       "no-super", group + i,
			       group_data[i].blocks_count,
@@ -1605,8 +1623,7 @@ static int ext4_flex_group_add(struct super_block *sb,

static int ext4_setup_next_flex_gd(struct super_block *sb,
				    struct ext4_new_flex_group_data *flex_gd,
				    ext4_fsblk_t n_blocks_count,
				    unsigned long flexbg_size)
				    ext4_fsblk_t n_blocks_count)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_super_block *es = sbi->s_es;
@@ -1630,7 +1647,7 @@ static int ext4_setup_next_flex_gd(struct super_block *sb,
	BUG_ON(last);
	ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &last);

	last_group = group | (flexbg_size - 1);
	last_group = group | (flex_gd->resize_bg - 1);
	if (last_group > n_group)
		last_group = n_group;

@@ -1990,8 +2007,9 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count)
	ext4_fsblk_t o_blocks_count;
	ext4_fsblk_t n_blocks_count_retry = 0;
	unsigned long last_update_time = 0;
	int err = 0, flexbg_size = 1 << sbi->s_log_groups_per_flex;
	int err = 0;
	int meta_bg;
	unsigned int flexbg_size = ext4_flex_bg_size(sbi);

	/* See if the device is actually as big as what was requested */
	bh = ext4_sb_bread(sb, n_blocks_count - 1, 0);
@@ -2123,7 +2141,7 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count)
	if (err)
		goto out;

	flex_gd = alloc_flex_gd(flexbg_size);
	flex_gd = alloc_flex_gd(flexbg_size, o_group, n_group);
	if (flex_gd == NULL) {
		err = -ENOMEM;
		goto out;
@@ -2132,8 +2150,7 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count)
	/* Add flex groups. Note that a regular group is a
	 * flex group with 1 group.
	 */
	while (ext4_setup_next_flex_gd(sb, flex_gd, n_blocks_count,
					      flexbg_size)) {
	while (ext4_setup_next_flex_gd(sb, flex_gd, n_blocks_count)) {
		if (time_is_before_jiffies(last_update_time + HZ * 10)) {
			if (last_update_time)
				ext4_msg(sb, KERN_INFO,
Loading