Commit a6ecc2a4 authored by Linus Torvalds's avatar Linus Torvalds
Browse files
Pull ext4 updates from Ted Ts'o:
 "In addition to bug fixes and cleanups, there are two new features for
  ext4 in 5.14:

   - Allow applications to poll on changes to
     /sys/fs/ext4/*/errors_count

   - Add the ioctl EXT4_IOC_CHECKPOINT which allows the journal to be
     checkpointed, truncated and discarded or zero'ed"

* tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (32 commits)
  jbd2: export jbd2_journal_[un]register_shrinker()
  ext4: notify sysfs on errors_count value change
  fs: remove bdev_try_to_free_page callback
  ext4: remove bdev_try_to_free_page() callback
  jbd2: simplify journal_clean_one_cp_list()
  jbd2,ext4: add a shrinker to release checkpointed buffers
  jbd2: remove redundant buffer io error checks
  jbd2: don't abort the journal when freeing buffers
  jbd2: ensure abort the journal if detect IO error when writing original buffer back
  jbd2: remove the out label in __jbd2_journal_remove_checkpoint()
  ext4: no need to verify new add extent block
  jbd2: clean up misleading comments for jbd2_fc_release_bufs
  ext4: add check to prevent attempting to resize an fs with sparse_super2
  ext4: consolidate checks for resize of bigalloc into ext4_resize_begin
  ext4: remove duplicate definition of ext4_xattr_ibody_inline_set()
  ext4: fsmap: fix the block/inode bitmap comment
  ext4: fix comment for s_hash_unsigned
  ext4: use local variable ei instead of EXT4_I() macro
  ext4: fix avefreec in find_group_orlov
  ext4: correct the cache_nr in tracepoint ext4_es_shrink_exit
  ...
parents 2cfa582b 16aa4c9a
Loading
Loading
Loading
Loading
+31 −8
Original line number Diff line number Diff line
@@ -4,14 +4,14 @@ Journal (jbd2)
--------------

Introduced in ext3, the ext4 filesystem employs a journal to protect the
filesystem against corruption in the case of a system crash. A small
continuous region of disk (default 128MiB) is reserved inside the
filesystem as a place to land “important” data writes on-disk as quickly
as possible. Once the important data transaction is fully written to the
disk and flushed from the disk write cache, a record of the data being
committed is also written to the journal. At some later point in time,
the journal code writes the transactions to their final locations on
disk (this could involve a lot of seeking or a lot of small
filesystem against metadata inconsistencies in the case of a system crash. Up
to 10,240,000 file system blocks (see man mke2fs(8) for more details on journal
size limits) can be reserved inside the filesystem as a place to land
important data writes on-disk as quickly as possible. Once the important
data transaction is fully written to the disk and flushed from the disk write
cache, a record of the data being committed is also written to the journal. At
some later point in time, the journal code writes the transactions to their
final locations on disk (this could involve a lot of seeking or a lot of small
read-write-erases) before erasing the commit record. Should the system
crash during the second slow write, the journal can be replayed all the
way to the latest commit record, guaranteeing the atomicity of whatever
@@ -731,3 +731,26 @@ point, the refcount for inode 11 is not reliable, but that gets fixed by the
replay of last inode 11 tag. Thus, by converting a non-idempotent procedure
into a series of idempotent outcomes, fast commits ensured idempotence during
the replay.

Journal Checkpoint
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Checkpointing the journal ensures all transactions and their associated buffers
are submitted to the disk. In-progress transactions are waited upon and included
in the checkpoint. Checkpointing is used internally during critical updates to
the filesystem including journal recovery, filesystem resizing, and freeing of
the journal_t structure.

A journal checkpoint can be triggered from userspace via the ioctl
EXT4_IOC_CHECKPOINT. This ioctl takes a single, u64 argument for flags.
Currently, three flags are supported. First, EXT4_IOC_CHECKPOINT_FLAG_DRY_RUN
can be used to verify input to the ioctl. It returns error if there is any
invalid input, otherwise it returns success without performing
any checkpointing. This can be used to check whether the ioctl exists on a
system and to verify there are no issues with arguments or flags. The
other two flags are EXT4_IOC_CHECKPOINT_FLAG_DISCARD and
EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT. These flags cause the journal blocks to be
discarded or zero-filled, respectively, after the journal checkpoint is
complete. EXT4_IOC_CHECKPOINT_FLAG_DISCARD and EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT
cannot both be set. The ioctl may be useful when snapshotting a system or for
complying with content deletion SLOs.
+0 −15
Original line number Diff line number Diff line
@@ -1673,20 +1673,6 @@ ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
}
EXPORT_SYMBOL_GPL(blkdev_read_iter);

/*
 * Try to release a page associated with block device when the system
 * is under memory pressure.
 */
static int blkdev_releasepage(struct page *page, gfp_t wait)
{
	struct super_block *super = BDEV_I(page->mapping->host)->bdev.bd_super;

	if (super && super->s_op->bdev_try_to_free_page)
		return super->s_op->bdev_try_to_free_page(super, page, wait);

	return try_to_free_buffers(page);
}

static int blkdev_writepages(struct address_space *mapping,
			     struct writeback_control *wbc)
{
@@ -1701,7 +1687,6 @@ static const struct address_space_operations def_blk_aops = {
	.write_begin	= blkdev_write_begin,
	.write_end	= blkdev_write_end,
	.writepages	= blkdev_writepages,
	.releasepage	= blkdev_releasepage,
	.direct_IO	= blkdev_direct_IO,
	.migratepage	= buffer_migrate_page_norefs,
	.is_dirty_writeback = buffer_check_dirty_writeback,
+16 −2
Original line number Diff line number Diff line
@@ -720,6 +720,7 @@ enum {
#define EXT4_IOC_CLEAR_ES_CACHE		_IO('f', 40)
#define EXT4_IOC_GETSTATE		_IOW('f', 41, __u32)
#define EXT4_IOC_GET_ES_CACHE		_IOWR('f', 42, struct fiemap)
#define EXT4_IOC_CHECKPOINT		_IOW('f', 43, __u32)

#define EXT4_IOC_SHUTDOWN _IOR ('X', 125, __u32)

@@ -741,6 +742,14 @@ enum {
#define EXT4_STATE_FLAG_NEWENTRY	0x00000004
#define EXT4_STATE_FLAG_DA_ALLOC_CLOSE	0x00000008

/* flags for ioctl EXT4_IOC_CHECKPOINT */
#define EXT4_IOC_CHECKPOINT_FLAG_DISCARD	0x1
#define EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT	0x2
#define EXT4_IOC_CHECKPOINT_FLAG_DRY_RUN	0x4
#define EXT4_IOC_CHECKPOINT_FLAG_VALID		(EXT4_IOC_CHECKPOINT_FLAG_DISCARD | \
						EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT | \
						EXT4_IOC_CHECKPOINT_FLAG_DRY_RUN)

#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
/*
 * ioctl commands in 32 bit emulation
@@ -1477,7 +1486,7 @@ struct ext4_sb_info {
	unsigned int s_inode_goal;
	u32 s_hash_seed[4];
	int s_def_hash_version;
	int s_hash_unsigned;	/* 3 if hash should be signed, 0 if not */
	int s_hash_unsigned;	/* 3 if hash should be unsigned, 0 if not */
	struct percpu_counter s_freeclusters_counter;
	struct percpu_counter s_freeinodes_counter;
	struct percpu_counter s_dirs_counter;
@@ -1488,6 +1497,7 @@ struct ext4_sb_info {
	struct kobject s_kobj;
	struct completion s_kobj_unregister;
	struct super_block *s_sb;
	struct buffer_head *s_mmp_bh;

	/* Journaling */
	struct journal_s *s_journal;
@@ -3614,6 +3624,7 @@ extern const struct inode_operations ext4_symlink_inode_operations;
extern const struct inode_operations ext4_fast_symlink_inode_operations;

/* sysfs.c */
extern void ext4_notify_error_sysfs(struct ext4_sb_info *sbi);
extern int ext4_register_sysfs(struct super_block *sb);
extern void ext4_unregister_sysfs(struct super_block *sb);
extern int __init ext4_init_sysfs(void);
@@ -3720,6 +3731,9 @@ extern struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end);
/* mmp.c */
extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t);

/* mmp.c */
extern void ext4_stop_mmpd(struct ext4_sb_info *sbi);

/* verity.c */
extern const struct fsverity_operations ext4_verityops;

@@ -3784,7 +3798,7 @@ static inline int ext4_buffer_uptodate(struct buffer_head *bh)
	 * have to read the block because we may read the old data
	 * successfully.
	 */
	if (!buffer_uptodate(bh) && buffer_write_io_error(bh))
	if (buffer_write_io_error(bh))
		set_buffer_uptodate(bh);
	return buffer_uptodate(bh);
}
+4 −0
Original line number Diff line number Diff line
@@ -825,6 +825,7 @@ void ext4_ext_tree_init(handle_t *handle, struct inode *inode)
	eh->eh_entries = 0;
	eh->eh_magic = EXT4_EXT_MAGIC;
	eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0));
	eh->eh_generation = 0;
	ext4_mark_inode_dirty(handle, inode);
}

@@ -1090,6 +1091,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
	neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
	neh->eh_magic = EXT4_EXT_MAGIC;
	neh->eh_depth = 0;
	neh->eh_generation = 0;

	/* move remainder of path[depth] to the new leaf */
	if (unlikely(path[depth].p_hdr->eh_entries !=
@@ -1167,6 +1169,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
		neh->eh_magic = EXT4_EXT_MAGIC;
		neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
		neh->eh_depth = cpu_to_le16(depth - i);
		neh->eh_generation = 0;
		fidx = EXT_FIRST_INDEX(neh);
		fidx->ei_block = border;
		ext4_idx_store_pblock(fidx, oldblock);
@@ -1306,6 +1309,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
	neh->eh_magic = EXT4_EXT_MAGIC;
	ext4_extent_block_csum_set(inode, neh);
	set_buffer_uptodate(bh);
	set_buffer_verified(bh);
	unlock_buffer(bh);

	err = ext4_handle_dirty_metadata(handle, inode, bh);
+1 −3
Original line number Diff line number Diff line
@@ -1574,11 +1574,9 @@ static unsigned long ext4_es_scan(struct shrinker *shrink,
	ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt);
	trace_ext4_es_shrink_scan_enter(sbi->s_sb, nr_to_scan, ret);

	if (!nr_to_scan)
		return ret;

	nr_shrunk = __es_shrink(sbi, nr_to_scan, NULL);

	ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt);
	trace_ext4_es_shrink_scan_exit(sbi->s_sb, nr_shrunk, ret);
	return nr_shrunk;
}
Loading