Merge tag 'ext4_for_linus-7.0-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4

Pull ext4 updates from Ted Ts'o:
 "New features and improvements for the ext4 file system

   - Avoid unnecessary cache invalidation in the extent status cache
     (es_cache) when adding extents to be cached in the es_cache and we
     are not changing the extent tree

   - Add a sysfs parameter, err_report_sec, to control how frequently to
     log a warning message that file system inconsistency has been
     detected (Previously we logged the warning message every 24 hours)

   - Avoid unnecessary forced ordered writes when appending to a file
     when delayed allocation is enabled

   - Defer splitting unwritten extents to I/O completion to improve
     write performance of concurrent direct I/O writes to multiple files

   - Refactor and add kunit tests to the extent splitting and conversion
     code paths

  Various Bug Fixes:

   - Fix a panic when the debugging DOUBLE_CHECK macro is defined

   - Avoid using fast commit for rare and complex file system operations
     to make fast commit easier to reason about. This can also avoid
     some corner cases that could result in file system inconsistency if
     there is a crash between a fast commit and a subsequent full
     commit

   - Fix memory leaks in error paths

   - Fix false positive reports when running stress tests using mixed
     huge-page workloads, caused by a race between page migration and
     bitmap updates

   - Fix a potential recursion into file system reclaim when evicting an
     inode when fast commit is enabled

   - Fix a warning caused by a potential double decrement to the dirty
     clusters counter when executing FS_IOC_SHUTDOWN when running a
     stress test

   - Enable mballoc optimized scanning regardless of whether the inode
     is using indirect blocks or extent trees to map blocks"

* tag 'ext4_for_linus-7.0-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (45 commits)
  ext4: allow zeroout when doing written to unwritten split
  ext4: refactor split and convert extents
  ext4: refactor zeroout path and handle all cases
  ext4: propagate flags to ext4_convert_unwritten_extents_endio()
  ext4: propagate flags to convert_initialized_extent()
  ext4: add extent status cache support to kunit tests
  ext4: kunit tests for higher level extent manipulation functions
  ext4: kunit tests for extent splitting and conversion
  ext4: use optimized mballoc scanning regardless of inode format
  ext4: always allocate blocks only from groups inode can use
  ext4: fix dirtyclusters double decrement on fs shutdown
  ext4: fast commit: make s_fc_lock reclaim-safe
  ext4: fix e4b bitmap inconsistency reports
  ext4: remove redundant NULL check after __GFP_NOFAIL
  ext4: remove EXT4_GET_BLOCKS_IO_CREATE_EXT
  ext4: simplify the mapping query logic in ext4_iomap_begin()
  ext4: remove unused unwritten parameter in ext4_dio_write_iter()
  ext4: remove useless ext4_iomap_overwrite_ops
  ext4: avoid starting handle when dio writing an unwritten extent
  ext4: don't split extent before submitting I/O
  ...
This commit is contained in:
Linus Torvalds
2026-02-12 10:19:58 -08:00
17 changed files with 1680 additions and 464 deletions

View File

@@ -707,15 +707,6 @@ enum {
* found an unwritten extent, we need to split it.
*/
#define EXT4_GET_BLOCKS_SPLIT_NOMERGE 0x0008
/*
* Caller is from the dio or dioread_nolock buffered IO, reqest to
* create an unwritten extent if it does not exist or split the
* found unwritten extent. Also do not merge the newly created
* unwritten extent, io end will convert unwritten to written,
* and try to merge the written extent.
*/
#define EXT4_GET_BLOCKS_IO_CREATE_EXT (EXT4_GET_BLOCKS_SPLIT_NOMERGE|\
EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT)
/* Convert unwritten extent to initialized. */
#define EXT4_GET_BLOCKS_CONVERT 0x0010
/* Eventual metadata allocation (due to growing extent tree)
@@ -1692,6 +1683,8 @@ struct ext4_sb_info {
/* timer for periodic error stats printing */
struct timer_list s_err_report;
/* timeout in seconds for s_err_report; 0 disables the timer. */
unsigned long s_err_report_sec;
/* Lazy inode table initialization info */
struct ext4_li_request *s_li_request;
@@ -1795,6 +1788,10 @@ struct ext4_sb_info {
* Main fast commit lock. This lock protects accesses to the
* following fields:
* ei->i_fc_list, s_fc_dentry_q, s_fc_q, s_fc_bytes, s_fc_bh.
*
* s_fc_lock can be taken from reclaim context (inode eviction) and is
* thus reclaim unsafe. Use ext4_fc_lock()/ext4_fc_unlock() helpers
* when acquiring / releasing the lock.
*/
struct mutex s_fc_lock;
struct buffer_head *s_fc_bh;
@@ -1839,6 +1836,18 @@ static inline void ext4_writepages_up_write(struct super_block *sb, int ctx)
percpu_up_write(&EXT4_SB(sb)->s_writepages_rwsem);
}
static inline int ext4_fc_lock(struct super_block *sb)
{
mutex_lock(&EXT4_SB(sb)->s_fc_lock);
return memalloc_nofs_save();
}
static inline void ext4_fc_unlock(struct super_block *sb, int ctx)
{
memalloc_nofs_restore(ctx);
mutex_unlock(&EXT4_SB(sb)->s_fc_lock);
}
static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
{
return ino == EXT4_ROOT_INO ||
@@ -2373,7 +2382,6 @@ static inline int ext4_emergency_state(struct super_block *sb)
#define EXT4_DEF_SB_UPDATE_INTERVAL_SEC (3600) /* seconds (1 hour) */
#define EXT4_DEF_SB_UPDATE_INTERVAL_KB (16384) /* kilobytes (16MB) */
/*
* Minimum number of groups in a flexgroup before we separate out
* directories into the first block group of a flexgroup
@@ -3199,6 +3207,7 @@ extern void ext4_mark_group_bitmap_corrupted(struct super_block *sb,
unsigned int flags);
extern unsigned int ext4_num_base_meta_blocks(struct super_block *sb,
ext4_group_t block_group);
extern void print_daily_error_info(struct timer_list *t);
extern __printf(7, 8)
void __ext4_error(struct super_block *, const char *, unsigned int, bool,
@@ -3795,6 +3804,10 @@ extern int ext4_convert_unwritten_io_end_vec(handle_t *handle,
ext4_io_end_t *io_end);
extern int ext4_map_blocks(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map, int flags);
extern int ext4_map_query_blocks(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map, int flags);
extern int ext4_map_create_blocks(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map, int flags);
extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
int num,
struct ext4_ext_path *path);
@@ -3909,7 +3922,6 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
}
extern const struct iomap_ops ext4_iomap_ops;
extern const struct iomap_ops ext4_iomap_overwrite_ops;
extern const struct iomap_ops ext4_iomap_report_ops;
static inline int ext4_buffer_uptodate(struct buffer_head *bh)

1027
fs/ext4/extents-test.c Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -32,6 +32,7 @@
#include "ext4_jbd2.h"
#include "ext4_extents.h"
#include "xattr.h"
#include <kunit/static_stub.h>
#include <trace/events/ext4.h>
@@ -40,11 +41,9 @@
*/
#define EXT4_EXT_MAY_ZEROOUT 0x1 /* safe to zeroout if split fails \
due to ENOSPC */
#define EXT4_EXT_MARK_UNWRIT1 0x2 /* mark first half unwritten */
#define EXT4_EXT_MARK_UNWRIT2 0x4 /* mark second half unwritten */
#define EXT4_EXT_DATA_VALID1 0x8 /* first half contains valid data */
#define EXT4_EXT_DATA_VALID2 0x10 /* second half contains valid data */
static struct ext4_ext_path *ext4_split_convert_extents(
handle_t *handle, struct inode *inode, struct ext4_map_blocks *map,
struct ext4_ext_path *path, int flags, unsigned int *allocated);
static __le32 ext4_extent_block_csum(struct inode *inode,
struct ext4_extent_header *eh)
@@ -86,8 +85,7 @@ static void ext4_extent_block_csum_set(struct inode *inode,
static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle,
struct inode *inode,
struct ext4_ext_path *path,
ext4_lblk_t split,
int split_flag, int flags);
ext4_lblk_t split, int flags);
static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped)
{
@@ -192,6 +190,9 @@ static int __ext4_ext_dirty(const char *where, unsigned int line,
{
int err;
KUNIT_STATIC_STUB_REDIRECT(__ext4_ext_dirty, where, line, handle, inode,
path);
WARN_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem));
if (path->p_bh) {
ext4_extent_block_csum_set(inode, ext_block_hdr(path->p_bh));
@@ -332,15 +333,12 @@ ext4_force_split_extent_at(handle_t *handle, struct inode *inode,
struct ext4_ext_path *path, ext4_lblk_t lblk,
int nofail)
{
int unwritten = ext4_ext_is_unwritten(path[path->p_depth].p_ext);
int flags = EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_SPLIT_NOMERGE;
if (nofail)
flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL | EXT4_EX_NOFAIL;
return ext4_split_extent_at(handle, inode, path, lblk, unwritten ?
EXT4_EXT_MARK_UNWRIT1|EXT4_EXT_MARK_UNWRIT2 : 0,
flags);
return ext4_split_extent_at(handle, inode, path, lblk, flags);
}
static int
@@ -530,6 +528,8 @@ static void ext4_cache_extents(struct inode *inode,
ext4_lblk_t prev = 0;
int i;
KUNIT_STATIC_STUB_REDIRECT(ext4_cache_extents, inode, eh);
for (i = le16_to_cpu(eh->eh_entries); i > 0; i--, ex++) {
unsigned int status = EXTENT_STATUS_WRITTEN;
ext4_lblk_t lblk = le32_to_cpu(ex->ee_block);
@@ -893,6 +893,8 @@ ext4_find_extent(struct inode *inode, ext4_lblk_t block,
int ret;
gfp_t gfp_flags = GFP_NOFS;
KUNIT_STATIC_STUB_REDIRECT(ext4_find_extent, inode, block, path, flags);
if (flags & EXT4_EX_NOFAIL)
gfp_flags |= __GFP_NOFAIL;
@@ -1985,6 +1987,9 @@ ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
ext4_lblk_t next;
int mb_flags = 0, unwritten;
KUNIT_STATIC_STUB_REDIRECT(ext4_ext_insert_extent, handle, inode, path,
newext, gb_flags);
if (gb_flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
mb_flags |= EXT4_MB_DELALLOC_RESERVED;
if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
@@ -2944,10 +2949,6 @@ again:
} else {
path = kcalloc(depth + 1, sizeof(struct ext4_ext_path),
GFP_NOFS | __GFP_NOFAIL);
if (path == NULL) {
ext4_journal_stop(handle);
return -ENOMEM;
}
path[0].p_maxdepth = path[0].p_depth = depth;
path[0].p_hdr = ext_inode_hdr(inode);
i = 0;
@@ -3133,8 +3134,8 @@ static void ext4_zeroout_es(struct inode *inode, struct ext4_extent *ex)
ext4_fsblk_t ee_pblock;
unsigned int ee_len;
ee_block = le32_to_cpu(ex->ee_block);
ee_len = ext4_ext_get_actual_len(ex);
ee_block = le32_to_cpu(ex->ee_block);
ee_len = ext4_ext_get_actual_len(ex);
ee_pblock = ext4_ext_pblock(ex);
if (ee_len == 0)
@@ -3150,6 +3151,8 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
ext4_fsblk_t ee_pblock;
unsigned int ee_len;
KUNIT_STATIC_STUB_REDIRECT(ext4_ext_zeroout, inode, ex);
ee_len = ext4_ext_get_actual_len(ex);
ee_pblock = ext4_ext_pblock(ex);
return ext4_issue_zeroout(inode, le32_to_cpu(ex->ee_block), ee_pblock,
@@ -3163,35 +3166,30 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
* @inode: the file inode
* @path: the path to the extent
* @split: the logical block where the extent is splitted.
* @split_flags: indicates if the extent could be zeroout if split fails, and
* the states(init or unwritten) of new extents.
* @flags: flags used to insert new extent to extent tree.
*
*
* Splits extent [a, b] into two extents [a, @split) and [@split, b], states
* of which are determined by split_flag.
* of which are same as the original extent. No conversion is performed.
*
* There are two cases:
* a> the extent are splitted into two extent.
* b> split is not needed, and just mark the extent.
*
* Return an extent path pointer on success, or an error pointer on failure.
* Return an extent path pointer on success, or an error pointer on failure. On
* failure, the extent is restored to original state.
*/
static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle,
struct inode *inode,
struct ext4_ext_path *path,
ext4_lblk_t split,
int split_flag, int flags)
int flags)
{
ext4_fsblk_t newblock;
ext4_lblk_t ee_block;
struct ext4_extent *ex, newex, orig_ex, zero_ex;
struct ext4_extent *ex, newex, orig_ex;
struct ext4_extent *ex2 = NULL;
unsigned int ee_len, depth;
int err = 0;
int err = 0, insert_err = 0, is_unwrit = 0;
BUG_ON((split_flag & (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2)) ==
(EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2));
/* Do not cache extents that are in the process of being modified. */
flags |= EXT4_EX_NOCACHE;
ext_debug(inode, "logical block %llu\n", (unsigned long long)split);
@@ -3202,39 +3200,24 @@ static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle,
ee_block = le32_to_cpu(ex->ee_block);
ee_len = ext4_ext_get_actual_len(ex);
newblock = split - ee_block + ext4_ext_pblock(ex);
is_unwrit = ext4_ext_is_unwritten(ex);
BUG_ON(split < ee_block || split >= (ee_block + ee_len));
BUG_ON(!ext4_ext_is_unwritten(ex) &&
split_flag & (EXT4_EXT_MAY_ZEROOUT |
EXT4_EXT_MARK_UNWRIT1 |
EXT4_EXT_MARK_UNWRIT2));
/*
* No split needed
*/
if (split == ee_block)
goto out;
err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
goto out;
if (split == ee_block) {
/*
* case b: block @split is the block that the extent begins with
* then we just change the state of the extent, and splitting
* is not needed.
*/
if (split_flag & EXT4_EXT_MARK_UNWRIT2)
ext4_ext_mark_unwritten(ex);
else
ext4_ext_mark_initialized(ex);
if (!(flags & EXT4_GET_BLOCKS_SPLIT_NOMERGE))
ext4_ext_try_to_merge(handle, inode, path, ex);
err = ext4_ext_dirty(handle, inode, path + path->p_depth);
goto out;
}
/* case a */
memcpy(&orig_ex, ex, sizeof(orig_ex));
ex->ee_len = cpu_to_le16(split - ee_block);
if (split_flag & EXT4_EXT_MARK_UNWRIT1)
if (is_unwrit)
ext4_ext_mark_unwritten(ex);
/*
@@ -3249,17 +3232,16 @@ static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle,
ex2->ee_block = cpu_to_le32(split);
ex2->ee_len = cpu_to_le16(ee_len - (split - ee_block));
ext4_ext_store_pblock(ex2, newblock);
if (split_flag & EXT4_EXT_MARK_UNWRIT2)
if (is_unwrit)
ext4_ext_mark_unwritten(ex2);
path = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
if (!IS_ERR(path))
goto out;
err = PTR_ERR(path);
if (err != -ENOSPC && err != -EDQUOT && err != -ENOMEM)
return path;
insert_err = PTR_ERR(path);
err = 0;
/*
* Get a new path to try to zeroout or fix the extent length.
* Using EXT4_EX_NOFAIL guarantees that ext4_find_extent()
@@ -3272,70 +3254,124 @@ static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle,
if (IS_ERR(path)) {
EXT4_ERROR_INODE(inode, "Failed split extent on %u, err %ld",
split, PTR_ERR(path));
return path;
goto out_path;
}
err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
goto out;
depth = ext_depth(inode);
ex = path[depth].p_ext;
if (EXT4_EXT_MAY_ZEROOUT & split_flag) {
if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) {
if (split_flag & EXT4_EXT_DATA_VALID1) {
err = ext4_ext_zeroout(inode, ex2);
zero_ex.ee_block = ex2->ee_block;
zero_ex.ee_len = cpu_to_le16(
ext4_ext_get_actual_len(ex2));
ext4_ext_store_pblock(&zero_ex,
ext4_ext_pblock(ex2));
} else {
err = ext4_ext_zeroout(inode, ex);
zero_ex.ee_block = ex->ee_block;
zero_ex.ee_len = cpu_to_le16(
ext4_ext_get_actual_len(ex));
ext4_ext_store_pblock(&zero_ex,
ext4_ext_pblock(ex));
}
} else {
err = ext4_ext_zeroout(inode, &orig_ex);
zero_ex.ee_block = orig_ex.ee_block;
zero_ex.ee_len = cpu_to_le16(
ext4_ext_get_actual_len(&orig_ex));
ext4_ext_store_pblock(&zero_ex,
ext4_ext_pblock(&orig_ex));
}
if (!err) {
/* update the extent length and mark as initialized */
ex->ee_len = cpu_to_le16(ee_len);
ext4_ext_try_to_merge(handle, inode, path, ex);
err = ext4_ext_dirty(handle, inode, path + path->p_depth);
if (!err)
/* update extent status tree */
ext4_zeroout_es(inode, &zero_ex);
/* If we failed at this point, we don't know in which
* state the extent tree exactly is so don't try to fix
* length of the original extent as it may do even more
* damage.
*/
goto out;
}
}
fix_extent_len:
ex->ee_len = orig_ex.ee_len;
/*
* Ignore ext4_ext_dirty return value since we are already in error path
* and err is a non-zero error code.
*/
ext4_ext_dirty(handle, inode, path + path->p_depth);
err = ext4_ext_dirty(handle, inode, path + path->p_depth);
out:
if (err) {
if (err || insert_err) {
ext4_free_ext_path(path);
path = ERR_PTR(err);
path = err ? ERR_PTR(err) : ERR_PTR(insert_err);
}
out_path:
if (IS_ERR(path))
/* Remove all remaining potentially stale extents. */
ext4_es_remove_extent(inode, ee_block, ee_len);
ext4_ext_show_leaf(inode, path);
return path;
}
/*
 * ext4_split_extent_zeroout - convert an extent via zeroout instead of split.
 *
 * Last-resort fallback used when splitting the extent found at @path failed
 * (e.g. due to ENOSPC): rather than splitting, zero out the relevant blocks
 * on disk and flip the whole extent's written/unwritten state so the range
 * described by @map ends up in the requested state.
 *
 * @handle:	journal handle for the running transaction
 * @inode:	file inode owning the extent tree
 * @path:	extent path; path[depth].p_ext is the extent to convert
 * @map:	logical range the caller wants converted
 * @flags:	EXT4_GET_BLOCKS_CONVERT to initialize @map (zeroing everything
 *		outside it), or EXT4_GET_BLOCKS_CONVERT_UNWRITTEN to mark @map
 *		unwritten (zeroing the @map range itself)
 *
 * Returns 0 on success or a negative error code.
 */
static int ext4_split_extent_zeroout(handle_t *handle, struct inode *inode,
				     struct ext4_ext_path *path,
				     struct ext4_map_blocks *map, int flags)
{
	struct ext4_extent *ex;
	unsigned int ee_len, depth;
	ext4_lblk_t ee_block;
	uint64_t lblk, pblk, len;
	int is_unwrit;
	int err = 0;

	depth = ext_depth(inode);
	ex = path[depth].p_ext;
	ee_block = le32_to_cpu(ex->ee_block);
	ee_len = ext4_ext_get_actual_len(ex);
	is_unwrit = ext4_ext_is_unwritten(ex);

	if (flags & EXT4_GET_BLOCKS_CONVERT) {
		/*
		 * EXT4_GET_BLOCKS_CONVERT: Caller wants the range specified by
		 * map to be initialized. Zeroout everything except the map
		 * range.
		 */
		loff_t map_end = (loff_t) map->m_lblk + map->m_len;
		loff_t ex_end = (loff_t) ee_block + ee_len;

		if (!is_unwrit)
			/* Shouldn't happen. Just exit */
			return -EINVAL;

		/* zeroout left */
		if (map->m_lblk > ee_block) {
			lblk = ee_block;
			len = map->m_lblk - ee_block;
			pblk = ext4_ext_pblock(ex);
			err = ext4_issue_zeroout(inode, lblk, pblk, len);
			if (err)
				/* ZEROOUT failed, just return original error */
				return err;
		}

		/* zeroout right */
		if (map_end < ex_end) {
			lblk = map_end;
			len = ex_end - map_end;
			pblk = ext4_ext_pblock(ex) + (map_end - ee_block);
			err = ext4_issue_zeroout(inode, lblk, pblk, len);
			if (err)
				/* ZEROOUT failed, just return original error */
				return err;
		}
	} else if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) {
		/*
		 * EXT4_GET_BLOCKS_CONVERT_UNWRITTEN: Caller wants the
		 * range specified by map to be marked unwritten.
		 * Zeroout the map range leaving rest as it is.
		 */
		if (is_unwrit)
			/* Shouldn't happen. Just exit */
			return -EINVAL;

		lblk = map->m_lblk;
		len = map->m_len;
		pblk = ext4_ext_pblock(ex) + (map->m_lblk - ee_block);
		err = ext4_issue_zeroout(inode, lblk, pblk, len);
		if (err)
			/* ZEROOUT failed, just return original error */
			return err;
	} else {
		/*
		 * We no longer perform unwritten to unwritten splits in IO paths.
		 * Hence this should not happen.
		 */
		WARN_ON_ONCE(true);
		return -EINVAL;
	}

	err = ext4_ext_get_access(handle, inode, path + depth);
	if (err)
		return err;

	ext4_ext_mark_initialized(ex);
	/*
	 * Capture ext4_ext_dirty()'s return value; the original code dropped
	 * it and then tested a stale (zero) err, silently ignoring failures
	 * to journal the modified extent block.
	 */
	err = ext4_ext_dirty(handle, inode, path + depth);
	if (err)
		return err;

	return 0;
}
/*
* ext4_split_extent() splits an extent and mark extent which is covered
* by @map as split_flags indicates
@@ -3352,13 +3388,13 @@ static struct ext4_ext_path *ext4_split_extent(handle_t *handle,
struct ext4_ext_path *path,
struct ext4_map_blocks *map,
int split_flag, int flags,
unsigned int *allocated)
unsigned int *allocated, bool *did_zeroout)
{
ext4_lblk_t ee_block;
ext4_lblk_t ee_block, orig_ee_block;
struct ext4_extent *ex;
unsigned int ee_len, depth;
int unwritten;
int split_flag1, flags1;
unsigned int ee_len, orig_ee_len, depth;
int unwritten, orig_unwritten;
int orig_err = 0;
depth = ext_depth(inode);
ex = path[depth].p_ext;
@@ -3366,25 +3402,27 @@ static struct ext4_ext_path *ext4_split_extent(handle_t *handle,
ee_len = ext4_ext_get_actual_len(ex);
unwritten = ext4_ext_is_unwritten(ex);
orig_ee_block = ee_block;
orig_ee_len = ee_len;
orig_unwritten = unwritten;
/* Do not cache extents that are in the process of being modified. */
flags |= EXT4_EX_NOCACHE;
if (map->m_lblk + map->m_len < ee_block + ee_len) {
split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT;
flags1 = flags | EXT4_GET_BLOCKS_SPLIT_NOMERGE;
if (unwritten)
split_flag1 |= EXT4_EXT_MARK_UNWRIT1 |
EXT4_EXT_MARK_UNWRIT2;
if (split_flag & EXT4_EXT_DATA_VALID2)
split_flag1 |= EXT4_EXT_DATA_VALID1;
path = ext4_split_extent_at(handle, inode, path,
map->m_lblk + map->m_len, split_flag1, flags1);
map->m_lblk + map->m_len, flags);
if (IS_ERR(path))
return path;
goto try_zeroout;
/*
* Update path is required because previous ext4_split_extent_at
* may result in split of original leaf or extent zeroout.
*/
path = ext4_find_extent(inode, map->m_lblk, path, flags);
if (IS_ERR(path))
return path;
goto try_zeroout;
depth = ext_depth(inode);
ex = path[depth].p_ext;
if (!ex) {
@@ -3393,22 +3431,69 @@ static struct ext4_ext_path *ext4_split_extent(handle_t *handle,
ext4_free_ext_path(path);
return ERR_PTR(-EFSCORRUPTED);
}
unwritten = ext4_ext_is_unwritten(ex);
/* extent would have changed so update original values */
orig_ee_block = le32_to_cpu(ex->ee_block);
orig_ee_len = ext4_ext_get_actual_len(ex);
orig_unwritten = ext4_ext_is_unwritten(ex);
}
if (map->m_lblk >= ee_block) {
split_flag1 = split_flag & EXT4_EXT_DATA_VALID2;
if (unwritten) {
split_flag1 |= EXT4_EXT_MARK_UNWRIT1;
split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT |
EXT4_EXT_MARK_UNWRIT2);
}
path = ext4_split_extent_at(handle, inode, path,
map->m_lblk, split_flag1, flags);
path = ext4_split_extent_at(handle, inode, path, map->m_lblk,
flags);
if (IS_ERR(path))
return path;
goto try_zeroout;
}
goto success;
try_zeroout:
/*
* There was an error in splitting the extent. So instead, just zeroout
* unwritten portions and convert it to initialized as a last resort. If
* there is any failure here we just return the original error
*/
orig_err = PTR_ERR(path);
if (orig_err != -ENOSPC && orig_err != -EDQUOT && orig_err != -ENOMEM)
goto out_orig_err;
/* we can't zeroout? just return the original err */
if (!(split_flag & EXT4_EXT_MAY_ZEROOUT))
goto out_orig_err;
if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) {
int max_zeroout_blks =
EXT4_SB(inode->i_sb)->s_extent_max_zeroout_kb >>
(inode->i_sb->s_blocksize_bits - 10);
if (map->m_len > max_zeroout_blks)
goto out_orig_err;
}
path = ext4_find_extent(inode, map->m_lblk, NULL, flags);
if (IS_ERR(path))
goto out_orig_err;
depth = ext_depth(inode);
ex = path[depth].p_ext;
ee_block = le32_to_cpu(ex->ee_block);
ee_len = ext4_ext_get_actual_len(ex);
unwritten = ext4_ext_is_unwritten(ex);
/* extent to zeroout should have been unchanged but its not */
if (WARN_ON(ee_block != orig_ee_block || ee_len != orig_ee_len ||
unwritten != orig_unwritten))
goto out_free_path;
if (ext4_split_extent_zeroout(handle, inode, path, map, flags))
goto out_free_path;
/* zeroout succeeded */
if (did_zeroout)
*did_zeroout = true;
success:
if (allocated) {
if (map->m_lblk + map->m_len > ee_block + ee_len)
*allocated = ee_len - (map->m_lblk - ee_block);
@@ -3417,6 +3502,12 @@ static struct ext4_ext_path *ext4_split_extent(handle_t *handle,
}
ext4_ext_show_leaf(inode, path);
return path;
out_free_path:
ext4_free_ext_path(path);
out_orig_err:
return ERR_PTR(orig_err);
}
/*
@@ -3452,7 +3543,6 @@ ext4_ext_convert_to_initialized(handle_t *handle, struct inode *inode,
ext4_lblk_t ee_block, eof_block;
unsigned int ee_len, depth, map_len = map->m_len;
int err = 0;
int split_flag = EXT4_EXT_DATA_VALID2;
unsigned int max_zeroout = 0;
ext_debug(inode, "logical block %llu, max_blocks %u\n",
@@ -3604,9 +3694,7 @@ ext4_ext_convert_to_initialized(handle_t *handle, struct inode *inode,
* It is safe to convert extent to initialized via explicit
* zeroout only if extent is fully inside i_size or new_size.
*/
split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
if (EXT4_EXT_MAY_ZEROOUT & split_flag)
if (ee_block + ee_len <= eof_block)
max_zeroout = sbi->s_extent_max_zeroout_kb >>
(inode->i_sb->s_blocksize_bits - 10);
@@ -3661,8 +3749,8 @@ ext4_ext_convert_to_initialized(handle_t *handle, struct inode *inode,
}
fallback:
path = ext4_split_extent(handle, inode, path, &split_map, split_flag,
flags, NULL);
path = ext4_split_convert_extents(handle, inode, &split_map, path,
flags | EXT4_GET_BLOCKS_CONVERT, NULL);
if (IS_ERR(path))
return path;
out:
@@ -3712,7 +3800,8 @@ static struct ext4_ext_path *ext4_split_convert_extents(handle_t *handle,
ext4_lblk_t ee_block;
struct ext4_extent *ex;
unsigned int ee_len;
int split_flag = 0, depth;
int split_flag = 0, depth, err = 0;
bool did_zeroout = false;
ext_debug(inode, "logical block %llu, max_blocks %u\n",
(unsigned long long)map->m_lblk, map->m_len);
@@ -3726,34 +3815,87 @@ static struct ext4_ext_path *ext4_split_convert_extents(handle_t *handle,
ee_block = le32_to_cpu(ex->ee_block);
ee_len = ext4_ext_get_actual_len(ex);
/* Convert to unwritten */
if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) {
split_flag |= EXT4_EXT_DATA_VALID1;
/* Convert to initialized */
} else if (flags & EXT4_GET_BLOCKS_CONVERT) {
/*
* It is safe to convert extent to initialized via explicit
* zeroout only if extent is fully inside i_size or new_size.
*/
split_flag |= ee_block + ee_len <= eof_block ?
EXT4_EXT_MAY_ZEROOUT : 0;
split_flag |= (EXT4_EXT_MARK_UNWRIT2 | EXT4_EXT_DATA_VALID2);
/* No split needed */
if (ee_block == map->m_lblk && ee_len == map->m_len)
goto convert;
/*
* It is only safe to convert extent to initialized via explicit
* zeroout only if extent is fully inside i_size or new_size.
*/
split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
/*
* pass SPLIT_NOMERGE explicitly so we don't end up merging extents we
* just split.
*/
path = ext4_split_extent(handle, inode, path, map, split_flag,
flags | EXT4_GET_BLOCKS_SPLIT_NOMERGE,
allocated, &did_zeroout);
if (IS_ERR(path))
return path;
convert:
path = ext4_find_extent(inode, map->m_lblk, path, flags);
if (IS_ERR(path))
return path;
depth = ext_depth(inode);
ex = path[depth].p_ext;
/*
* Conversion is already handled in case of zeroout
*/
if (!did_zeroout) {
err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
goto err;
if (flags & EXT4_GET_BLOCKS_CONVERT)
ext4_ext_mark_initialized(ex);
else if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)
ext4_ext_mark_unwritten(ex);
if (!(flags & EXT4_GET_BLOCKS_SPLIT_NOMERGE))
/*
* note: ext4_ext_correct_indexes() isn't needed here because
* borders are not changed
*/
ext4_ext_try_to_merge(handle, inode, path, ex);
err = ext4_ext_dirty(handle, inode, path + depth);
if (err)
goto err;
}
flags |= EXT4_GET_BLOCKS_SPLIT_NOMERGE;
return ext4_split_extent(handle, inode, path, map, split_flag, flags,
allocated);
/* Lets update the extent status tree after conversion */
if (!(flags & EXT4_EX_NOCACHE))
ext4_es_insert_extent(inode, le32_to_cpu(ex->ee_block),
ext4_ext_get_actual_len(ex),
ext4_ext_pblock(ex),
ext4_ext_is_unwritten(ex) ?
EXTENT_STATUS_UNWRITTEN :
EXTENT_STATUS_WRITTEN,
false);
err:
if (err) {
ext4_free_ext_path(path);
return ERR_PTR(err);
}
return path;
}
static struct ext4_ext_path *
ext4_convert_unwritten_extents_endio(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map,
struct ext4_ext_path *path)
struct ext4_ext_path *path, int flags)
{
struct ext4_extent *ex;
ext4_lblk_t ee_block;
unsigned int ee_len;
int depth;
int err = 0;
depth = ext_depth(inode);
ex = path[depth].p_ext;
@@ -3763,66 +3905,21 @@ ext4_convert_unwritten_extents_endio(handle_t *handle, struct inode *inode,
ext_debug(inode, "logical block %llu, max_blocks %u\n",
(unsigned long long)ee_block, ee_len);
/* If extent is larger than requested it is a clear sign that we still
* have some extent state machine issues left. So extent_split is still
* required.
* TODO: Once all related issues will be fixed this situation should be
* illegal.
*/
if (ee_block != map->m_lblk || ee_len > map->m_len) {
#ifdef CONFIG_EXT4_DEBUG
ext4_warning(inode->i_sb, "Inode (%ld) finished: extent logical block %llu,"
" len %u; IO logical block %llu, len %u",
inode->i_ino, (unsigned long long)ee_block, ee_len,
(unsigned long long)map->m_lblk, map->m_len);
#endif
path = ext4_split_convert_extents(handle, inode, map, path,
EXT4_GET_BLOCKS_CONVERT, NULL);
if (IS_ERR(path))
return path;
path = ext4_find_extent(inode, map->m_lblk, path, 0);
if (IS_ERR(path))
return path;
depth = ext_depth(inode);
ex = path[depth].p_ext;
}
err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
goto errout;
/* first mark the extent as initialized */
ext4_ext_mark_initialized(ex);
/* note: ext4_ext_correct_indexes() isn't needed here because
* borders are not changed
*/
ext4_ext_try_to_merge(handle, inode, path, ex);
/* Mark modified extent as dirty */
err = ext4_ext_dirty(handle, inode, path + path->p_depth);
if (err)
goto errout;
ext4_ext_show_leaf(inode, path);
return path;
errout:
ext4_free_ext_path(path);
return ERR_PTR(err);
return ext4_split_convert_extents(handle, inode, map, path, flags,
NULL);
}
static struct ext4_ext_path *
convert_initialized_extent(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map,
struct ext4_ext_path *path,
int flags,
unsigned int *allocated)
{
struct ext4_extent *ex;
ext4_lblk_t ee_block;
unsigned int ee_len;
int depth;
int err = 0;
/*
* Make sure that the extent is no bigger than we support with
@@ -3839,53 +3936,33 @@ convert_initialized_extent(handle_t *handle, struct inode *inode,
ext_debug(inode, "logical block %llu, max_blocks %u\n",
(unsigned long long)ee_block, ee_len);
if (ee_block != map->m_lblk || ee_len > map->m_len) {
path = ext4_split_convert_extents(handle, inode, map, path,
EXT4_GET_BLOCKS_CONVERT_UNWRITTEN, NULL);
if (IS_ERR(path))
return path;
path = ext4_split_convert_extents(handle, inode, map, path, flags,
NULL);
if (IS_ERR(path))
return path;
path = ext4_find_extent(inode, map->m_lblk, path, 0);
if (IS_ERR(path))
return path;
depth = ext_depth(inode);
ex = path[depth].p_ext;
if (!ex) {
EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
(unsigned long) map->m_lblk);
err = -EFSCORRUPTED;
goto errout;
}
}
err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
goto errout;
/* first mark the extent as unwritten */
ext4_ext_mark_unwritten(ex);
/* note: ext4_ext_correct_indexes() isn't needed here because
* borders are not changed
*/
ext4_ext_try_to_merge(handle, inode, path, ex);
/* Mark modified extent as dirty */
err = ext4_ext_dirty(handle, inode, path + path->p_depth);
if (err)
goto errout;
ext4_ext_show_leaf(inode, path);
ext4_update_inode_fsync_trans(handle, inode, 1);
map->m_flags |= EXT4_MAP_UNWRITTEN;
/*
* The extent might be initialized in case of zeroout.
*/
path = ext4_find_extent(inode, map->m_lblk, path, flags);
if (IS_ERR(path))
return path;
depth = ext_depth(inode);
ex = path[depth].p_ext;
if (ext4_ext_is_unwritten(ex))
map->m_flags |= EXT4_MAP_UNWRITTEN;
else
map->m_flags |= EXT4_MAP_MAPPED;
if (*allocated > map->m_len)
*allocated = map->m_len;
map->m_len = *allocated;
return path;
errout:
ext4_free_ext_path(path);
return ERR_PTR(err);
}
static struct ext4_ext_path *
@@ -3910,30 +3987,10 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
trace_ext4_ext_handle_unwritten_extents(inode, map, flags,
*allocated, newblock);
/* get_block() before submitting IO, split the extent */
if (flags & EXT4_GET_BLOCKS_SPLIT_NOMERGE) {
path = ext4_split_convert_extents(handle, inode, map, path,
flags | EXT4_GET_BLOCKS_CONVERT, allocated);
if (IS_ERR(path))
return path;
/*
* shouldn't get a 0 allocated when splitting an extent unless
* m_len is 0 (bug) or extent has been corrupted
*/
if (unlikely(*allocated == 0)) {
EXT4_ERROR_INODE(inode,
"unexpected allocated == 0, m_len = %u",
map->m_len);
err = -EFSCORRUPTED;
goto errout;
}
map->m_flags |= EXT4_MAP_UNWRITTEN;
goto out;
}
/* IO end_io complete, convert the filled extent to written */
if (flags & EXT4_GET_BLOCKS_CONVERT) {
path = ext4_convert_unwritten_extents_endio(handle, inode,
map, path);
map, path, flags);
if (IS_ERR(path))
return path;
ext4_update_inode_fsync_trans(handle, inode, 1);
@@ -3983,7 +4040,6 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
goto errout;
}
out:
map->m_flags |= EXT4_MAP_NEW;
map_out:
map->m_flags |= EXT4_MAP_MAPPED;
@@ -4160,8 +4216,7 @@ again:
insert_hole:
/* Put just found gap into cache to speed up subsequent requests */
ext_debug(inode, " -> %u:%u\n", hole_start, len);
ext4_es_insert_extent(inode, hole_start, len, ~0,
EXTENT_STATUS_HOLE, false);
ext4_es_cache_extent(inode, hole_start, len, ~0, EXTENT_STATUS_HOLE);
/* Update hole_len to reflect hole size after lblk */
if (hole_start != lblk)
@@ -4257,7 +4312,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
if ((!ext4_ext_is_unwritten(ex)) &&
(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) {
path = convert_initialized_extent(handle,
inode, map, path, &allocated);
inode, map, path, flags, &allocated);
if (IS_ERR(path))
err = PTR_ERR(path);
goto out;
@@ -5375,7 +5430,8 @@ again:
if (!extent) {
EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
(unsigned long) *iterator);
return -EFSCORRUPTED;
ret = -EFSCORRUPTED;
goto out;
}
if (SHIFT == SHIFT_LEFT && *iterator >
le32_to_cpu(extent->ee_block)) {
@@ -5541,7 +5597,7 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
struct ext4_extent *extent;
ext4_lblk_t start_lblk, len_lblk, ee_start_lblk = 0;
unsigned int credits, ee_len;
int ret, depth, split_flag = 0;
int ret, depth;
loff_t start;
trace_ext4_insert_range(inode, offset, len);
@@ -5612,12 +5668,8 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
*/
if ((start_lblk > ee_start_lblk) &&
(start_lblk < (ee_start_lblk + ee_len))) {
if (ext4_ext_is_unwritten(extent))
split_flag = EXT4_EXT_MARK_UNWRIT1 |
EXT4_EXT_MARK_UNWRIT2;
path = ext4_split_extent_at(handle, inode, path,
start_lblk, split_flag,
EXT4_EX_NOCACHE |
start_lblk, EXT4_EX_NOCACHE |
EXT4_GET_BLOCKS_SPLIT_NOMERGE |
EXT4_GET_BLOCKS_METADATA_NOFAIL);
}
@@ -6187,3 +6239,7 @@ out:
ext4_free_ext_path(path);
return 0;
}
#ifdef CONFIG_EXT4_KUNIT_TESTS
#include "extents-test.c"
#endif

View File

@@ -16,6 +16,7 @@
#include "ext4.h"
#include <trace/events/ext4.h>
#include <kunit/static_stub.h>
/*
* According to previous discussion in Ext4 Developer Workshop, we
@@ -178,7 +179,8 @@ static struct kmem_cache *ext4_pending_cachep;
static int __es_insert_extent(struct inode *inode, struct extent_status *newes,
struct extent_status *prealloc);
static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t end, int *reserved,
ext4_lblk_t end, unsigned int status,
int *reserved, struct extent_status *res,
struct extent_status *prealloc);
static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan);
static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
@@ -242,6 +244,21 @@ static inline void ext4_es_inc_seq(struct inode *inode)
WRITE_ONCE(ei->i_es_seq, ei->i_es_seq + 1);
}
/*
 * __es_check_extent_status - verify that @es is one of the types in @status.
 *
 * Returns 0 when the extent's type is covered by the @status mask.
 * Otherwise returns -EINVAL and, when @res is non-NULL, copies the
 * conflicting extent's logical block, length and physical block into
 * @res so the caller can report the conflict.
 */
static inline int __es_check_extent_status(struct extent_status *es,
					   unsigned int status,
					   struct extent_status *res)
{
	if (!(ext4_es_type(es) & status)) {
		/* Type mismatch: hand the offending extent back. */
		if (res) {
			res->es_lblk = es->es_lblk;
			res->es_len = es->es_len;
			res->es_pblk = es->es_pblk;
		}
		return -EINVAL;
	}

	return 0;
}
/*
* search through the tree for an delayed extent with a given offset. If
* it can't be found, try to find next extent.
@@ -882,7 +899,8 @@ out:
/*
* ext4_es_insert_extent() adds information to an inode's extent
* status tree.
* status tree. This interface is used for modifying extents. To cache
* on-disk extents, use ext4_es_cache_extent() instead.
*/
void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len, ext4_fsblk_t pblk,
@@ -929,7 +947,7 @@ retry:
pr = __alloc_pending(true);
write_lock(&EXT4_I(inode)->i_es_lock);
err1 = __es_remove_extent(inode, lblk, end, &resv_used, es1);
err1 = __es_remove_extent(inode, lblk, end, 0, &resv_used, NULL, es1);
if (err1 != 0)
goto error;
/* Free preallocated extent if it didn't get used. */
@@ -961,10 +979,6 @@ retry:
}
pending = err3;
}
/*
* TODO: For cache on-disk extents, there is no need to increment
* the sequence counter, this requires future optimization.
*/
ext4_es_inc_seq(inode);
error:
write_unlock(&EXT4_I(inode)->i_es_lock);
@@ -998,17 +1012,24 @@ error:
}
/*
* ext4_es_cache_extent() inserts information into the extent status
* tree if and only if there isn't information about the range in
* question already.
* ext4_es_cache_extent() inserts information into the extent status tree
* only if there is no existing information about the specified range or
* if the existing extents have the same status.
*
* Note that this interface is only used for caching on-disk extent
* information and cannot be used to convert existing extents in the extent
* status tree. To convert existing extents, use ext4_es_insert_extent()
* instead.
*/
void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len, ext4_fsblk_t pblk,
unsigned int status)
{
struct extent_status *es;
struct extent_status newes;
struct extent_status chkes, newes;
ext4_lblk_t end = lblk + len - 1;
bool conflict = false;
int err;
if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
return;
@@ -1016,7 +1037,6 @@ void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
newes.es_lblk = lblk;
newes.es_len = len;
ext4_es_store_pblock_status(&newes, pblk, status);
trace_ext4_es_cache_extent(inode, &newes);
if (!len)
return;
@@ -1024,11 +1044,42 @@ void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
BUG_ON(end < lblk);
write_lock(&EXT4_I(inode)->i_es_lock);
es = __es_tree_search(&EXT4_I(inode)->i_es_tree.root, lblk);
if (!es || es->es_lblk > end)
__es_insert_extent(inode, &newes, NULL);
if (es && es->es_lblk <= end) {
/* Found an extent that covers the entire range. */
if (es->es_lblk <= lblk && es->es_lblk + es->es_len > end) {
if (__es_check_extent_status(es, status, &chkes))
conflict = true;
goto unlock;
}
/* Check and remove all extents in range. */
err = __es_remove_extent(inode, lblk, end, status, NULL,
&chkes, NULL);
if (err) {
if (err == -EINVAL)
conflict = true;
goto unlock;
}
}
__es_insert_extent(inode, &newes, NULL);
trace_ext4_es_cache_extent(inode, &newes);
ext4_es_print_tree(inode);
unlock:
write_unlock(&EXT4_I(inode)->i_es_lock);
if (!conflict)
return;
/*
* A hole in the on-disk extent but a delayed extent in the extent
* status tree, is allowed.
*/
if (status == EXTENT_STATUS_HOLE &&
ext4_es_type(&chkes) == EXTENT_STATUS_DELAYED)
return;
ext4_warning_inode(inode,
"ES cache extent failed: add [%d,%d,%llu,0x%x] conflict with existing [%d,%d,%llu,0x%x]\n",
lblk, len, pblk, status, chkes.es_lblk, chkes.es_len,
ext4_es_pblock(&chkes), ext4_es_status(&chkes));
}
/*
@@ -1409,23 +1460,27 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
return rc->ndelayed;
}
/*
* __es_remove_extent - removes block range from extent status tree
*
* @inode - file containing range
* @lblk - first block in range
* @end - last block in range
* @status - the extent status to be checked
* @reserved - number of cluster reservations released
* @res - return the extent if the status is not match
* @prealloc - pre-allocated es to avoid memory allocation failures
*
* If @reserved is not NULL and delayed allocation is enabled, counts
* block/cluster reservations freed by removing range and if bigalloc
* enabled cancels pending reservations as needed. Returns 0 on success,
* error code on failure.
* enabled cancels pending reservations as needed. If @status is not
* zero, check extent status type while removing extent, return -EINVAL
* and pass out the extent through @res if not match. Returns 0 on
* success, error code on failure.
*/
static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t end, int *reserved,
ext4_lblk_t end, unsigned int status,
int *reserved, struct extent_status *res,
struct extent_status *prealloc)
{
struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
@@ -1434,18 +1489,24 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
struct extent_status orig_es;
ext4_lblk_t len1, len2;
ext4_fsblk_t block;
int err = 0;
int err;
bool count_reserved = true;
struct rsvd_count rc;
if (reserved == NULL || !test_opt(inode->i_sb, DELALLOC))
count_reserved = false;
if (status == 0)
status = ES_TYPE_MASK;
es = __es_tree_search(&tree->root, lblk);
if (!es)
goto out;
return 0;
if (es->es_lblk > end)
goto out;
return 0;
err = __es_check_extent_status(es, status, res);
if (err)
return err;
/* Simply invalidate cache_es. */
tree->cache_es = NULL;
@@ -1480,7 +1541,7 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
es->es_lblk = orig_es.es_lblk;
es->es_len = orig_es.es_len;
goto out;
return err;
}
} else {
es->es_lblk = end + 1;
@@ -1494,7 +1555,7 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
if (count_reserved)
count_rsvd(inode, orig_es.es_lblk + len1,
orig_es.es_len - len1 - len2, &orig_es, &rc);
goto out_get_reserved;
goto out;
}
if (len1 > 0) {
@@ -1509,6 +1570,9 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
}
while (es && ext4_es_end(es) <= end) {
err = __es_check_extent_status(es, status, res);
if (err)
return err;
if (count_reserved)
count_rsvd(inode, es->es_lblk, es->es_len, es, &rc);
node = rb_next(&es->rb_node);
@@ -1524,6 +1588,10 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
if (es && es->es_lblk < end + 1) {
ext4_lblk_t orig_len = es->es_len;
err = __es_check_extent_status(es, status, res);
if (err)
return err;
len1 = ext4_es_end(es) - end;
if (count_reserved)
count_rsvd(inode, es->es_lblk, orig_len - len1,
@@ -1536,11 +1604,10 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
}
}
out_get_reserved:
out:
if (count_reserved)
*reserved = get_rsvd(inode, end, es, &rc);
out:
return err;
return 0;
}
/*
@@ -1582,7 +1649,7 @@ retry:
* is reclaimed.
*/
write_lock(&EXT4_I(inode)->i_es_lock);
err = __es_remove_extent(inode, lblk, end, &reserved, es);
err = __es_remove_extent(inode, lblk, end, 0, &reserved, NULL, es);
if (err)
goto error;
/* Free preallocated extent if it didn't get used. */
@@ -2174,7 +2241,7 @@ retry:
}
write_lock(&EXT4_I(inode)->i_es_lock);
err1 = __es_remove_extent(inode, lblk, end, NULL, es1);
err1 = __es_remove_extent(inode, lblk, end, 0, NULL, NULL, es1);
if (err1 != 0)
goto error;
/* Free preallocated extent if it didn't get used. */

View File

@@ -231,16 +231,16 @@ static bool ext4_fc_disabled(struct super_block *sb)
void ext4_fc_del(struct inode *inode)
{
struct ext4_inode_info *ei = EXT4_I(inode);
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct ext4_fc_dentry_update *fc_dentry;
wait_queue_head_t *wq;
int alloc_ctx;
if (ext4_fc_disabled(inode->i_sb))
return;
mutex_lock(&sbi->s_fc_lock);
alloc_ctx = ext4_fc_lock(inode->i_sb);
if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) {
mutex_unlock(&sbi->s_fc_lock);
ext4_fc_unlock(inode->i_sb, alloc_ctx);
return;
}
@@ -275,9 +275,9 @@ void ext4_fc_del(struct inode *inode)
#endif
prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
if (ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA)) {
mutex_unlock(&sbi->s_fc_lock);
ext4_fc_unlock(inode->i_sb, alloc_ctx);
schedule();
mutex_lock(&sbi->s_fc_lock);
alloc_ctx = ext4_fc_lock(inode->i_sb);
}
finish_wait(wq, &wait.wq_entry);
}
@@ -288,7 +288,7 @@ void ext4_fc_del(struct inode *inode)
* dentry create references, since it is not needed to log it anyways.
*/
if (list_empty(&ei->i_fc_dilist)) {
mutex_unlock(&sbi->s_fc_lock);
ext4_fc_unlock(inode->i_sb, alloc_ctx);
return;
}
@@ -298,7 +298,7 @@ void ext4_fc_del(struct inode *inode)
list_del_init(&fc_dentry->fcd_dilist);
WARN_ON(!list_empty(&ei->i_fc_dilist));
mutex_unlock(&sbi->s_fc_lock);
ext4_fc_unlock(inode->i_sb, alloc_ctx);
release_dentry_name_snapshot(&fc_dentry->fcd_name);
kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
@@ -315,6 +315,7 @@ void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handl
tid_t tid;
bool has_transaction = true;
bool is_ineligible;
int alloc_ctx;
if (ext4_fc_disabled(sb))
return;
@@ -329,12 +330,12 @@ void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handl
has_transaction = false;
read_unlock(&sbi->s_journal->j_state_lock);
}
mutex_lock(&sbi->s_fc_lock);
alloc_ctx = ext4_fc_lock(sb);
is_ineligible = ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
if (has_transaction && (!is_ineligible || tid_gt(tid, sbi->s_fc_ineligible_tid)))
sbi->s_fc_ineligible_tid = tid;
ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
mutex_unlock(&sbi->s_fc_lock);
ext4_fc_unlock(sb, alloc_ctx);
WARN_ON(reason >= EXT4_FC_REASON_MAX);
sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
}
@@ -358,6 +359,7 @@ static int ext4_fc_track_template(
struct ext4_inode_info *ei = EXT4_I(inode);
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
tid_t tid = 0;
int alloc_ctx;
int ret;
tid = handle->h_transaction->t_tid;
@@ -373,14 +375,14 @@ static int ext4_fc_track_template(
if (!enqueue)
return ret;
mutex_lock(&sbi->s_fc_lock);
alloc_ctx = ext4_fc_lock(inode->i_sb);
if (list_empty(&EXT4_I(inode)->i_fc_list))
list_add_tail(&EXT4_I(inode)->i_fc_list,
(sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) ?
&sbi->s_fc_q[FC_Q_STAGING] :
&sbi->s_fc_q[FC_Q_MAIN]);
mutex_unlock(&sbi->s_fc_lock);
ext4_fc_unlock(inode->i_sb, alloc_ctx);
return ret;
}
@@ -402,6 +404,7 @@ static int __track_dentry_update(handle_t *handle, struct inode *inode,
struct inode *dir = dentry->d_parent->d_inode;
struct super_block *sb = inode->i_sb;
struct ext4_sb_info *sbi = EXT4_SB(sb);
int alloc_ctx;
spin_unlock(&ei->i_fc_lock);
@@ -425,7 +428,7 @@ static int __track_dentry_update(handle_t *handle, struct inode *inode,
take_dentry_name_snapshot(&node->fcd_name, dentry);
INIT_LIST_HEAD(&node->fcd_dilist);
INIT_LIST_HEAD(&node->fcd_list);
mutex_lock(&sbi->s_fc_lock);
alloc_ctx = ext4_fc_lock(sb);
if (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING)
list_add_tail(&node->fcd_list,
@@ -446,7 +449,7 @@ static int __track_dentry_update(handle_t *handle, struct inode *inode,
WARN_ON(!list_empty(&ei->i_fc_dilist));
list_add_tail(&node->fcd_dilist, &ei->i_fc_dilist);
}
mutex_unlock(&sbi->s_fc_lock);
ext4_fc_unlock(sb, alloc_ctx);
spin_lock(&ei->i_fc_lock);
return 0;
@@ -1046,18 +1049,19 @@ static int ext4_fc_perform_commit(journal_t *journal)
struct blk_plug plug;
int ret = 0;
u32 crc = 0;
int alloc_ctx;
/*
* Step 1: Mark all inodes on s_fc_q[MAIN] with
* EXT4_STATE_FC_FLUSHING_DATA. This prevents these inodes from being
* freed until the data flush is over.
*/
mutex_lock(&sbi->s_fc_lock);
alloc_ctx = ext4_fc_lock(sb);
list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
ext4_set_inode_state(&iter->vfs_inode,
EXT4_STATE_FC_FLUSHING_DATA);
}
mutex_unlock(&sbi->s_fc_lock);
ext4_fc_unlock(sb, alloc_ctx);
/* Step 2: Flush data for all the eligible inodes. */
ret = ext4_fc_flush_data(journal);
@@ -1067,7 +1071,7 @@ static int ext4_fc_perform_commit(journal_t *journal)
* any error from step 2. This ensures that waiters waiting on
* EXT4_STATE_FC_FLUSHING_DATA can resume.
*/
mutex_lock(&sbi->s_fc_lock);
alloc_ctx = ext4_fc_lock(sb);
list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
ext4_clear_inode_state(&iter->vfs_inode,
EXT4_STATE_FC_FLUSHING_DATA);
@@ -1084,7 +1088,7 @@ static int ext4_fc_perform_commit(journal_t *journal)
* prepare_to_wait() in ext4_fc_del().
*/
smp_mb();
mutex_unlock(&sbi->s_fc_lock);
ext4_fc_unlock(sb, alloc_ctx);
/*
* If we encountered error in Step 2, return it now after clearing
@@ -1101,12 +1105,12 @@ static int ext4_fc_perform_commit(journal_t *journal)
* previous handles are now drained. We now mark the inodes on the
* commit queue as being committed.
*/
mutex_lock(&sbi->s_fc_lock);
alloc_ctx = ext4_fc_lock(sb);
list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
ext4_set_inode_state(&iter->vfs_inode,
EXT4_STATE_FC_COMMITTING);
}
mutex_unlock(&sbi->s_fc_lock);
ext4_fc_unlock(sb, alloc_ctx);
jbd2_journal_unlock_updates(journal);
/*
@@ -1117,6 +1121,7 @@ static int ext4_fc_perform_commit(journal_t *journal)
blkdev_issue_flush(journal->j_fs_dev);
blk_start_plug(&plug);
alloc_ctx = ext4_fc_lock(sb);
/* Step 6: Write fast commit blocks to disk. */
if (sbi->s_fc_bytes == 0) {
/*
@@ -1134,7 +1139,6 @@ static int ext4_fc_perform_commit(journal_t *journal)
}
/* Step 6.2: Now write all the dentry updates. */
mutex_lock(&sbi->s_fc_lock);
ret = ext4_fc_commit_dentry_updates(journal, &crc);
if (ret)
goto out;
@@ -1156,7 +1160,7 @@ static int ext4_fc_perform_commit(journal_t *journal)
ret = ext4_fc_write_tail(sb, crc);
out:
mutex_unlock(&sbi->s_fc_lock);
ext4_fc_unlock(sb, alloc_ctx);
blk_finish_plug(&plug);
return ret;
}
@@ -1290,6 +1294,7 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_inode_info *ei;
struct ext4_fc_dentry_update *fc_dentry;
int alloc_ctx;
if (full && sbi->s_fc_bh)
sbi->s_fc_bh = NULL;
@@ -1297,7 +1302,7 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
trace_ext4_fc_cleanup(journal, full, tid);
jbd2_fc_release_bufs(journal);
mutex_lock(&sbi->s_fc_lock);
alloc_ctx = ext4_fc_lock(sb);
while (!list_empty(&sbi->s_fc_q[FC_Q_MAIN])) {
ei = list_first_entry(&sbi->s_fc_q[FC_Q_MAIN],
struct ext4_inode_info,
@@ -1356,7 +1361,7 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
if (full)
sbi->s_fc_bytes = 0;
mutex_unlock(&sbi->s_fc_lock);
ext4_fc_unlock(sb, alloc_ctx);
trace_ext4_fc_stats(sb);
}
@@ -2302,6 +2307,9 @@ static const char * const fc_ineligible_reasons[] = {
[EXT4_FC_REASON_FALLOC_RANGE] = "Falloc range op",
[EXT4_FC_REASON_INODE_JOURNAL_DATA] = "Data journalling",
[EXT4_FC_REASON_ENCRYPTED_FILENAME] = "Encrypted filename",
[EXT4_FC_REASON_MIGRATE] = "Inode format migration",
[EXT4_FC_REASON_VERITY] = "fs-verity enable",
[EXT4_FC_REASON_MOVE_EXT] = "Move extents",
};
int ext4_fc_info_show(struct seq_file *seq, void *v)

View File

@@ -97,6 +97,9 @@ enum {
EXT4_FC_REASON_FALLOC_RANGE,
EXT4_FC_REASON_INODE_JOURNAL_DATA,
EXT4_FC_REASON_ENCRYPTED_FILENAME,
EXT4_FC_REASON_MIGRATE,
EXT4_FC_REASON_VERITY,
EXT4_FC_REASON_MOVE_EXT,
EXT4_FC_REASON_MAX
};

View File

@@ -419,22 +419,20 @@ static const struct iomap_dio_ops ext4_dio_write_ops = {
* updating inode i_disksize and/or orphan handling with exclusive lock.
*
* - shared locking will only be true mostly with overwrites, including
* initialized blocks and unwritten blocks. For overwrite unwritten blocks
* we protect splitting extents by i_data_sem in ext4_inode_info, so we can
* also release exclusive i_rwsem lock.
* initialized blocks and unwritten blocks.
*
* - Otherwise we will switch to exclusive i_rwsem lock.
*/
static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from,
bool *ilock_shared, bool *extend,
bool *unwritten, int *dio_flags)
int *dio_flags)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
loff_t offset;
size_t count;
ssize_t ret;
bool overwrite, unaligned_io;
bool overwrite, unaligned_io, unwritten;
restart:
ret = ext4_generic_write_checks(iocb, from);
@@ -446,7 +444,7 @@ restart:
unaligned_io = ext4_unaligned_io(inode, from, offset);
*extend = ext4_extending_io(inode, offset, count);
overwrite = ext4_overwrite_io(inode, offset, count, unwritten);
overwrite = ext4_overwrite_io(inode, offset, count, &unwritten);
/*
* Determine whether we need to upgrade to an exclusive lock. This is
@@ -461,7 +459,7 @@ restart:
*/
if (*ilock_shared &&
((!IS_NOSEC(inode) || *extend || !overwrite ||
(unaligned_io && *unwritten)))) {
(unaligned_io && unwritten)))) {
if (iocb->ki_flags & IOCB_NOWAIT) {
ret = -EAGAIN;
goto out;
@@ -484,7 +482,7 @@ restart:
ret = -EAGAIN;
goto out;
}
if (unaligned_io && (!overwrite || *unwritten))
if (unaligned_io && (!overwrite || unwritten))
inode_dio_wait(inode);
*dio_flags = IOMAP_DIO_FORCE_WAIT;
}
@@ -509,8 +507,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
struct inode *inode = file_inode(iocb->ki_filp);
loff_t offset = iocb->ki_pos;
size_t count = iov_iter_count(from);
const struct iomap_ops *iomap_ops = &ext4_iomap_ops;
bool extend = false, unwritten = false;
bool extend = false;
bool ilock_shared = true;
int dio_flags = 0;
@@ -556,7 +553,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
ret = ext4_dio_write_checks(iocb, from, &ilock_shared, &extend,
&unwritten, &dio_flags);
&dio_flags);
if (ret <= 0)
return ret;
@@ -576,9 +573,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
goto out;
}
if (ilock_shared && !unwritten)
iomap_ops = &ext4_iomap_overwrite_ops;
ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops,
ret = iomap_dio_rw(iocb, from, &ext4_iomap_ops, &ext4_dio_write_ops,
dio_flags, NULL, 0);
if (ret == -ENOTBLK)
ret = 0;
@@ -859,7 +854,6 @@ static int ext4_sample_last_mounted(struct super_block *sb,
* when trying to sort through large numbers of block
* devices or filesystem images.
*/
memset(buf, 0, sizeof(buf));
path.mnt = mnt;
path.dentry = mnt->mnt_root;
cp = d_path(&path, buf, sizeof(buf));

View File

@@ -48,6 +48,8 @@
#include "acl.h"
#include "truncate.h"
#include <kunit/static_stub.h>
#include <trace/events/ext4.h>
static void ext4_journalled_zero_new_buffers(handle_t *handle,
@@ -400,6 +402,8 @@ int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk,
{
int ret;
KUNIT_STATIC_STUB_REDIRECT(ext4_issue_zeroout, inode, lblk, pblk, len);
if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode))
return fscrypt_zeroout_range(inode, lblk, pblk, len);
@@ -503,8 +507,8 @@ static int ext4_map_query_blocks_next_in_leaf(handle_t *handle,
retval = ext4_ext_map_blocks(handle, inode, &map2, 0);
if (retval <= 0) {
ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
map->m_pblk, status, false);
ext4_es_cache_extent(inode, map->m_lblk, map->m_len,
map->m_pblk, status);
return map->m_len;
}
@@ -525,20 +529,20 @@ static int ext4_map_query_blocks_next_in_leaf(handle_t *handle,
*/
if (map->m_pblk + map->m_len == map2.m_pblk &&
status == status2) {
ext4_es_insert_extent(inode, map->m_lblk,
map->m_len + map2.m_len, map->m_pblk,
status, false);
ext4_es_cache_extent(inode, map->m_lblk,
map->m_len + map2.m_len, map->m_pblk,
status);
map->m_len += map2.m_len;
} else {
ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
map->m_pblk, status, false);
ext4_es_cache_extent(inode, map->m_lblk, map->m_len,
map->m_pblk, status);
}
return map->m_len;
}
static int ext4_map_query_blocks(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map, int flags)
int ext4_map_query_blocks(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map, int flags)
{
unsigned int status;
int retval;
@@ -573,8 +577,8 @@ static int ext4_map_query_blocks(handle_t *handle, struct inode *inode,
map->m_len == orig_mlen) {
status = map->m_flags & EXT4_MAP_UNWRITTEN ?
EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
map->m_pblk, status, false);
ext4_es_cache_extent(inode, map->m_lblk, map->m_len,
map->m_pblk, status);
} else {
retval = ext4_map_query_blocks_next_in_leaf(handle, inode, map,
orig_mlen);
@@ -584,10 +588,9 @@ out:
return retval;
}
static int ext4_map_create_blocks(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map, int flags)
int ext4_map_create_blocks(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map, int flags)
{
struct extent_status es;
unsigned int status;
int err, retval = 0;
@@ -648,16 +651,6 @@ static int ext4_map_create_blocks(handle_t *handle, struct inode *inode,
return err;
}
/*
* If the extent has been zeroed out, we don't need to update
* extent status tree.
*/
if (flags & EXT4_GET_BLOCKS_SPLIT_NOMERGE &&
ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es, &map->m_seq)) {
if (ext4_es_is_written(&es))
return retval;
}
status = map->m_flags & EXT4_MAP_UNWRITTEN ?
EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
ext4_es_insert_extent(inode, map->m_lblk, map->m_len, map->m_pblk,
@@ -2375,7 +2368,7 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
dioread_nolock = ext4_should_dioread_nolock(inode);
if (dioread_nolock)
get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
get_blocks_flags |= EXT4_GET_BLOCKS_UNWRIT_EXT;
err = ext4_map_blocks(handle, inode, map, get_blocks_flags);
if (err < 0)
@@ -3740,7 +3733,7 @@ retry:
else if (EXT4_LBLK_TO_B(inode, map->m_lblk) >= i_size_read(inode))
m_flags = EXT4_GET_BLOCKS_CREATE;
else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
m_flags = EXT4_GET_BLOCKS_IO_CREATE_EXT;
m_flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;
if (flags & IOMAP_ATOMIC)
ret = ext4_map_blocks_atomic_write(handle, inode, map, m_flags,
@@ -3812,22 +3805,25 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
if (offset + length <= i_size_read(inode)) {
ret = ext4_map_blocks(NULL, inode, &map, 0);
/*
* For atomic writes the entire requested length should
* be mapped.
* For DAX we convert extents to initialized ones before
* copying the data, otherwise we do it after I/O so
* there's no need to call into ext4_iomap_alloc().
*/
if (map.m_flags & EXT4_MAP_MAPPED) {
if ((!(flags & IOMAP_ATOMIC) && ret > 0) ||
(flags & IOMAP_ATOMIC && ret >= orig_mlen))
if ((map.m_flags & EXT4_MAP_MAPPED) ||
(!(flags & IOMAP_DAX) &&
(map.m_flags & EXT4_MAP_UNWRITTEN))) {
/*
* For atomic writes the entire requested
* length should be mapped.
*/
if (ret == orig_mlen ||
(!(flags & IOMAP_ATOMIC) && ret > 0))
goto out;
}
map.m_len = orig_mlen;
}
ret = ext4_iomap_alloc(inode, &map, flags);
} else {
/*
* This can be called for overwrites path from
* ext4_iomap_overwrite_begin().
*/
ret = ext4_map_blocks(NULL, inode, &map, 0);
}
@@ -3856,30 +3852,10 @@ out:
return 0;
}
static int ext4_iomap_overwrite_begin(struct inode *inode, loff_t offset,
loff_t length, unsigned flags, struct iomap *iomap,
struct iomap *srcmap)
{
int ret;
/*
* Even for writes we don't need to allocate blocks, so just pretend
* we are reading to save overhead of starting a transaction.
*/
flags &= ~IOMAP_WRITE;
ret = ext4_iomap_begin(inode, offset, length, flags, iomap, srcmap);
WARN_ON_ONCE(!ret && iomap->type != IOMAP_MAPPED);
return ret;
}
const struct iomap_ops ext4_iomap_ops = {
.iomap_begin = ext4_iomap_begin,
};
const struct iomap_ops ext4_iomap_overwrite_ops = {
.iomap_begin = ext4_iomap_overwrite_begin,
};
static int ext4_iomap_begin_report(struct inode *inode, loff_t offset,
loff_t length, unsigned int flags,
struct iomap *iomap, struct iomap *srcmap)
@@ -4133,9 +4109,13 @@ static int __ext4_block_zero_page_range(handle_t *handle,
if (ext4_should_journal_data(inode)) {
err = ext4_dirty_journalled_data(handle, bh);
} else {
err = 0;
mark_buffer_dirty(bh);
if (ext4_should_order_data(inode))
/*
* Only the written block requires ordered data to prevent
* exposing stale data.
*/
if (!buffer_unwritten(bh) && !buffer_delay(bh) &&
ext4_should_order_data(inode))
err = ext4_jbd2_inode_add_write(handle, inode, from,
length);
}

View File

@@ -968,6 +968,7 @@ static long ext4_ioctl_group_add(struct file *file,
err = ext4_group_add(sb, input);
if (EXT4_SB(sb)->s_journal) {
ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_RESIZE, NULL);
jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0);
jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
@@ -1613,6 +1614,8 @@ setversion_out:
err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count);
if (EXT4_SB(sb)->s_journal) {
ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_RESIZE,
NULL);
jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0);
jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);

View File

@@ -567,7 +567,7 @@ test_mark_diskspace_used_range(struct kunit *test,
bitmap = mbt_ctx_bitmap(sb, TEST_GOAL_GROUP);
memset(bitmap, 0, sb->s_blocksize);
ret = ext4_mb_mark_diskspace_used(ac, NULL, 0);
ret = ext4_mb_mark_diskspace_used(ac, NULL);
KUNIT_ASSERT_EQ(test, ret, 0);
max = EXT4_CLUSTERS_PER_GROUP(sb);

View File

@@ -892,6 +892,21 @@ mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp)
}
}
/*
 * ext4_get_allocation_groups_count - number of block groups eligible for
 * this allocation context.
 *
 * Non-extent (indirect block mapped) files can only address low block
 * numbers, so their group search is capped at s_blockfile_groups;
 * extent-mapped files may scan every group.  The smp_rmb() pairs with
 * the smp_wmb() in ext4_update_super() (per the in-code comment below;
 * NOTE(review): confirm the pairing against ext4_update_super()) so a
 * freshly grown groups count is read consistently during online resize.
 */
static ext4_group_t ext4_get_allocation_groups_count(
					struct ext4_allocation_context *ac)
{
	ext4_group_t ngroups = ext4_get_groups_count(ac->ac_sb);

	/* non-extent files are limited to low blocks/groups */
	if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)))
		ngroups = EXT4_SB(ac->ac_sb)->s_blockfile_groups;

	/* Pairs with smp_wmb() in ext4_update_super() */
	smp_rmb();
	return ngroups;
}
static int ext4_mb_scan_groups_xa_range(struct ext4_allocation_context *ac,
struct xarray *xa,
ext4_group_t start, ext4_group_t end)
@@ -899,7 +914,7 @@ static int ext4_mb_scan_groups_xa_range(struct ext4_allocation_context *ac,
struct super_block *sb = ac->ac_sb;
struct ext4_sb_info *sbi = EXT4_SB(sb);
enum criteria cr = ac->ac_criteria;
ext4_group_t ngroups = ext4_get_groups_count(sb);
ext4_group_t ngroups = ext4_get_allocation_groups_count(ac);
unsigned long group = start;
struct ext4_group_info *grp;
@@ -951,7 +966,7 @@ static int ext4_mb_scan_groups_p2_aligned(struct ext4_allocation_context *ac,
ext4_group_t start, end;
start = group;
end = ext4_get_groups_count(ac->ac_sb);
end = ext4_get_allocation_groups_count(ac);
wrap_around:
for (i = ac->ac_2order; i < MB_NUM_ORDERS(ac->ac_sb); i++) {
ret = ext4_mb_scan_groups_largest_free_order_range(ac, i,
@@ -1001,7 +1016,7 @@ static int ext4_mb_scan_groups_goal_fast(struct ext4_allocation_context *ac,
ext4_group_t start, end;
start = group;
end = ext4_get_groups_count(ac->ac_sb);
end = ext4_get_allocation_groups_count(ac);
wrap_around:
i = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len);
for (; i < MB_NUM_ORDERS(ac->ac_sb); i++) {
@@ -1083,7 +1098,7 @@ static int ext4_mb_scan_groups_best_avail(struct ext4_allocation_context *ac,
min_order = fls(ac->ac_o_ex.fe_len);
start = group;
end = ext4_get_groups_count(ac->ac_sb);
end = ext4_get_allocation_groups_count(ac);
wrap_around:
for (i = order; i >= min_order; i--) {
int frag_order;
@@ -1133,8 +1148,6 @@ static inline int should_optimize_scan(struct ext4_allocation_context *ac)
return 0;
if (ac->ac_criteria >= CR_GOAL_LEN_SLOW)
return 0;
if (!ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS))
return 0;
return 1;
}
@@ -1182,11 +1195,7 @@ static int ext4_mb_scan_groups(struct ext4_allocation_context *ac)
int ret = 0;
ext4_group_t start;
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
ext4_group_t ngroups = ext4_get_groups_count(ac->ac_sb);
/* non-extent files are limited to low blocks/groups */
if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)))
ngroups = sbi->s_blockfile_groups;
ext4_group_t ngroups = ext4_get_allocation_groups_count(ac);
/* searching for the right group start from the goal value specified */
start = ac->ac_g_ex.fe_group;
@@ -1706,16 +1715,17 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group,
/* Avoid locking the folio in the fast path ... */
folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_ACCESSED, 0);
if (IS_ERR(folio) || !folio_test_uptodate(folio)) {
if (IS_ERR(folio) || !folio_test_uptodate(folio) || folio_test_locked(folio)) {
/*
* folio_test_locked is employed to detect ongoing folio
* migrations, since concurrent migrations can lead to
* bitmap inconsistency. And if we are not uptodate that
* implies somebody just created the folio but is yet to
* initialize it. We can drop the folio reference and
* try to get the folio with lock in both cases to avoid
* concurrency.
*/
if (!IS_ERR(folio))
/*
* drop the folio reference and try
* to get the folio with lock. If we
* are not uptodate that implies
* somebody just created the folio but
* is yet to initialize it. So
* wait for it to initialize.
*/
folio_put(folio);
folio = __filemap_get_folio(inode->i_mapping, pnum,
FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp);
@@ -1764,7 +1774,7 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group,
/* we need another folio for the buddy */
folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_ACCESSED, 0);
if (IS_ERR(folio) || !folio_test_uptodate(folio)) {
if (IS_ERR(folio) || !folio_test_uptodate(folio) || folio_test_locked(folio)) {
if (!IS_ERR(folio))
folio_put(folio);
folio = __filemap_get_folio(inode->i_mapping, pnum,
@@ -4185,8 +4195,7 @@ out_err:
* Returns 0 if success or error code
*/
static noinline_for_stack int
ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
handle_t *handle, unsigned int reserv_clstrs)
ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, handle_t *handle)
{
struct ext4_group_desc *gdp;
struct ext4_sb_info *sbi;
@@ -4241,13 +4250,6 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
BUG_ON(changed != ac->ac_b_ex.fe_len);
#endif
percpu_counter_sub(&sbi->s_freeclusters_counter, ac->ac_b_ex.fe_len);
/*
* Now reduce the dirty block count also. Should not go negative
*/
if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
/* release all the reserved blocks if non delalloc */
percpu_counter_sub(&sbi->s_dirtyclusters_counter,
reserv_clstrs);
return err;
}
@@ -6331,7 +6333,7 @@ repeat:
ext4_mb_pa_put_free(ac);
}
if (likely(ac->ac_status == AC_STATUS_FOUND)) {
*errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs);
*errp = ext4_mb_mark_diskspace_used(ac, handle);
if (*errp) {
ext4_discard_allocated_blocks(ac);
goto errout;
@@ -6362,12 +6364,9 @@ errout:
out:
if (inquota && ar->len < inquota)
dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len));
if (!ar->len) {
if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0)
/* release all the reserved blocks if non delalloc */
percpu_counter_sub(&sbi->s_dirtyclusters_counter,
reserv_clstrs);
}
/* release any reserved blocks */
if (reserv_clstrs)
percpu_counter_sub(&sbi->s_dirtyclusters_counter, reserv_clstrs);
trace_ext4_allocate_blocks(ar, (unsigned long long)block);

View File

@@ -449,6 +449,12 @@ int ext4_ext_migrate(struct inode *inode)
retval = PTR_ERR(handle);
goto out_unlock;
}
/*
* This operation rewrites the inode's block mapping layout
* (indirect to extents) and is not tracked in the fast commit
* log, so disable fast commits for this transaction.
*/
ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_MIGRATE, handle);
goal = (((inode->i_ino - 1) / EXT4_INODES_PER_GROUP(inode->i_sb)) *
EXT4_INODES_PER_GROUP(inode->i_sb)) + 1;
owner[0] = i_uid_read(inode);
@@ -630,6 +636,12 @@ int ext4_ind_migrate(struct inode *inode)
ret = PTR_ERR(handle);
goto out_unlock;
}
/*
* This operation rewrites the inode's block mapping layout
* (extents to indirect blocks) and is not tracked in the fast
* commit log, so disable fast commits for this transaction.
*/
ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_MIGRATE, handle);
down_write(&EXT4_I(inode)->i_data_sem);
ret = ext4_ext_check_inode(inode);

View File

@@ -321,6 +321,8 @@ static int mext_move_extent(struct mext_data *mext, u64 *m_len)
ret = PTR_ERR(handle);
goto out;
}
ext4_fc_mark_ineligible(orig_inode->i_sb, EXT4_FC_REASON_MOVE_EXT,
handle);
ret = mext_move_begin(mext, folio, &move_type);
if (ret)

View File

@@ -3650,10 +3650,12 @@ int ext4_feature_set_ok(struct super_block *sb, int readonly)
}
/*
* This function is called once a day if we have errors logged
* on the file system
* This function is called once a day by default if we have errors logged
* on the file system.
* Use the err_report_sec sysfs attribute to disable or adjust its call
* frequency.
*/
static void print_daily_error_info(struct timer_list *t)
void print_daily_error_info(struct timer_list *t)
{
struct ext4_sb_info *sbi = timer_container_of(sbi, t, s_err_report);
struct super_block *sb = sbi->s_sb;
@@ -3693,7 +3695,9 @@ static void print_daily_error_info(struct timer_list *t)
le64_to_cpu(es->s_last_error_block));
printk(KERN_CONT "\n");
}
mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ); /* Once a day */
if (sbi->s_err_report_sec)
mod_timer(&sbi->s_err_report, jiffies + secs_to_jiffies(sbi->s_err_report_sec));
}
/* Find next suitable group and run ext4_init_inode_table */
@@ -5616,6 +5620,10 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
clear_opt2(sb, MB_OPTIMIZE_SCAN);
}
err = ext4_percpu_param_init(sbi);
if (err)
goto failed_mount5;
err = ext4_mb_init(sb);
if (err) {
ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)",
@@ -5631,10 +5639,6 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
sbi->s_journal->j_commit_callback =
ext4_journal_commit_callback;
err = ext4_percpu_param_init(sbi);
if (err)
goto failed_mount6;
if (ext4_has_feature_flex_bg(sb))
if (!ext4_fill_flex_info(sb)) {
ext4_msg(sb, KERN_ERR,
@@ -5690,8 +5694,12 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
clear_opt(sb, DISCARD);
}
if (es->s_error_count)
mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */
if (es->s_error_count) {
sbi->s_err_report_sec = 5*60; /* first time 5 minutes */
mod_timer(&sbi->s_err_report,
jiffies + secs_to_jiffies(sbi->s_err_report_sec));
}
sbi->s_err_report_sec = 24*60*60; /* Once a day */
/* Enable message ratelimiting. Default is 10 messages per 5 secs. */
ratelimit_state_init(&sbi->s_err_ratelimit_state, 5 * HZ, 10);
@@ -5716,8 +5724,8 @@ failed_mount7:
failed_mount6:
ext4_mb_release(sb);
ext4_flex_groups_free(sbi);
ext4_percpu_param_destroy(sbi);
failed_mount5:
ext4_percpu_param_destroy(sbi);
ext4_ext_release(sb);
ext4_release_system_zone(sb);
failed_mount4a:
@@ -6237,10 +6245,11 @@ static void ext4_update_super(struct super_block *sb)
ext4_errno_to_code(sbi->s_last_error_code);
/*
* Start the daily error reporting function if it hasn't been
* started already
* started already and sbi->s_err_report_sec is not zero.
* NOTE(review): the condition below tests !sbi->s_err_report_sec, which
* contradicts this comment — it would arm the timer with a zero delay
* exactly when reporting is disabled. It should likely test
* sbi->s_err_report_sec (non-zero) instead; confirm against the intended
* err_report_sec semantics.
*/
if (!es->s_error_count)
mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ);
if (!es->s_error_count && !sbi->s_err_report_sec)
mod_timer(&sbi->s_err_report,
jiffies + secs_to_jiffies(sbi->s_err_report_sec));
le32_add_cpu(&es->s_error_count, sbi->s_add_error_count);
sbi->s_add_error_count = 0;
}

View File

@@ -40,6 +40,7 @@ typedef enum {
attr_pointer_string,
attr_pointer_atomic,
attr_journal_task,
attr_err_report_sec,
} attr_id_t;
typedef enum {
@@ -130,6 +131,36 @@ static ssize_t trigger_test_error(struct ext4_sb_info *sbi,
return count;
}
/*
 * Store handler for the err_report_sec sysfs attribute.
 *
 * Sets the interval, in seconds, between log messages warning that the
 * file system has errors.  Writing 0 disables the reporting timer;
 * writing a non-zero value (re)arms it.  Returns the number of bytes
 * consumed on success, or a negative errno on bad or out-of-range input.
 */
static ssize_t err_report_sec_store(struct ext4_sb_info *sbi,
				    const char *buf, size_t count)
{
	unsigned long t;
	int ret;

	ret = kstrtoul(skip_spaces(buf), 0, &t);
	if (ret)
		return ret;

	/* The maximum reporting interval must not exceed one year. */
	if (t > (365*24*60*60))
		return -EINVAL;

	if (sbi->s_err_report_sec == t)		/* nothing to do */
		goto out;

	if (!sbi->s_err_report_sec && t) {
		/* Reporting was disabled: (re)initialize before arming below. */
		timer_setup(&sbi->s_err_report, print_daily_error_info, 0);
	} else if (sbi->s_err_report_sec && !t) {
		/*
		 * Disabling: stop the timer and record the zero interval so
		 * that later reads and re-enable writes see the true state.
		 * (Leaving the stale non-zero value here would make a later
		 * re-enable skip timer_setup() and let ext4_update_super()
		 * believe the timer is still armed.)
		 */
		timer_delete_sync(&sbi->s_err_report);
		sbi->s_err_report_sec = 0;
		goto out;
	}

	sbi->s_err_report_sec = t;
	mod_timer(&sbi->s_err_report,
		  jiffies + secs_to_jiffies(sbi->s_err_report_sec));
out:
	return count;
}
static ssize_t journal_task_show(struct ext4_sb_info *sbi, char *buf)
{
if (!sbi->s_journal)
@@ -217,6 +248,7 @@ EXT4_ATTR_OFFSET(mb_group_prealloc, 0644, clusters_in_group,
ext4_sb_info, s_mb_group_prealloc);
EXT4_ATTR_OFFSET(mb_best_avail_max_trim_order, 0644, mb_order,
ext4_sb_info, s_mb_best_avail_max_trim_order);
EXT4_ATTR_OFFSET(err_report_sec, 0644, err_report_sec, ext4_sb_info, s_err_report_sec);
EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal);
EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats);
EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan);
@@ -309,6 +341,7 @@ static struct attribute *ext4_attrs[] = {
ATTR_LIST(last_trim_minblks),
ATTR_LIST(sb_update_sec),
ATTR_LIST(sb_update_kb),
ATTR_LIST(err_report_sec),
NULL,
};
ATTRIBUTE_GROUPS(ext4);
@@ -402,6 +435,7 @@ static ssize_t ext4_generic_attr_show(struct ext4_attr *a,
return sysfs_emit(buf, "%u\n", le32_to_cpup(ptr));
return sysfs_emit(buf, "%u\n", *((unsigned int *) ptr));
case attr_pointer_ul:
case attr_err_report_sec:
return sysfs_emit(buf, "%lu\n", *((unsigned long *) ptr));
case attr_pointer_u8:
return sysfs_emit(buf, "%u\n", *((unsigned char *) ptr));
@@ -525,6 +559,8 @@ static ssize_t ext4_attr_store(struct kobject *kobj,
return inode_readahead_blks_store(sbi, buf, len);
case attr_trigger_test_error:
return trigger_test_error(sbi, buf, len);
case attr_err_report_sec:
return err_report_sec_store(sbi, buf, len);
default:
return ext4_generic_attr_store(a, sbi, buf, len);
}

View File

@@ -231,6 +231,8 @@ static int ext4_end_enable_verity(struct file *filp, const void *desc,
goto cleanup;
}
ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_VERITY, handle);
err = ext4_orphan_del(handle, inode);
if (err)
goto stop_and_cleanup;

View File

@@ -102,6 +102,9 @@ TRACE_DEFINE_ENUM(EXT4_FC_REASON_RENAME_DIR);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_FALLOC_RANGE);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_INODE_JOURNAL_DATA);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_ENCRYPTED_FILENAME);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_MIGRATE);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_VERITY);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_MOVE_EXT);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_MAX);
#define show_fc_reason(reason) \
@@ -115,7 +118,10 @@ TRACE_DEFINE_ENUM(EXT4_FC_REASON_MAX);
{ EXT4_FC_REASON_RENAME_DIR, "RENAME_DIR"}, \
{ EXT4_FC_REASON_FALLOC_RANGE, "FALLOC_RANGE"}, \
{ EXT4_FC_REASON_INODE_JOURNAL_DATA, "INODE_JOURNAL_DATA"}, \
{ EXT4_FC_REASON_ENCRYPTED_FILENAME, "ENCRYPTED_FILENAME"})
{ EXT4_FC_REASON_ENCRYPTED_FILENAME, "ENCRYPTED_FILENAME"}, \
{ EXT4_FC_REASON_MIGRATE, "MIGRATE"}, \
{ EXT4_FC_REASON_VERITY, "VERITY"}, \
{ EXT4_FC_REASON_MOVE_EXT, "MOVE_EXT"})
TRACE_DEFINE_ENUM(CR_POWER2_ALIGNED);
TRACE_DEFINE_ENUM(CR_GOAL_LEN_FAST);