Commit 857d32f2 authored by Harshad Shirwadkar's avatar Harshad Shirwadkar Committed by Theodore Ts'o
Browse files

ext4: rework fast commit commit path



This patch reworks fast commit's commit path to remove locking the
journal for the entire duration of a fast commit. Instead, we only lock
the journal while marking all the eligible inodes as "committing". This
allows handles to make progress in parallel with the fast commit.

Signed-off-by: default avatarHarshad Shirwadkar <harshadshirwadkar@gmail.com>
Link: https://patch.msgid.link/20250508175908.1004880-5-harshadshirwadkar@gmail.com


Signed-off-by: default avatarTheodore Ts'o <tytso@mit.edu>
parent 0b64fd74
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -1916,6 +1916,7 @@ enum {
	EXT4_STATE_LUSTRE_EA_INODE,	/* Lustre-style ea_inode */
	EXT4_STATE_VERITY_IN_PROGRESS,	/* building fs-verity Merkle tree */
	EXT4_STATE_FC_COMMITTING,	/* Fast commit ongoing */
	EXT4_STATE_FC_FLUSHING_DATA,	/* Fast commit flushing data */
	EXT4_STATE_ORPHAN_FILE,		/* Inode orphaned in orphan file */
};

+125 −74
Original line number Diff line number Diff line
@@ -287,23 +287,54 @@ void ext4_fc_del(struct inode *inode)
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	struct ext4_fc_dentry_update *fc_dentry;
	wait_queue_head_t *wq;

	if (ext4_fc_disabled(inode->i_sb))
		return;

restart:
	spin_lock(&sbi->s_fc_lock);
	if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) {
		spin_unlock(&sbi->s_fc_lock);
		return;
	}

	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
		ext4_fc_wait_committing_inode(inode);
		goto restart;
	/*
	 * Since ext4_fc_del is called from ext4_evict_inode while having a
	 * handle open, there is no need for us to wait here even if a fast
	 * commit is going on. That is because, if this inode is being
	 * committed, ext4_mark_inode_dirty would have waited for inode commit
	 * operation to finish before we come here. So, by the time we come
	 * here, inode's EXT4_STATE_FC_COMMITTING would have been cleared. So,
	 * we shouldn't see EXT4_STATE_FC_COMMITTING to be set on this inode
	 * here.
	 *
	 * We may come here without any handles open in the "no_delete" case of
	 * ext4_evict_inode as well. However, if that happens, we first mark the
	 * file system as fast commit ineligible anyway. So, even in that case,
	 * it is okay to remove the inode from the fc list.
	 */
	WARN_ON(ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)
		&& !ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE));
	while (ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA)) {
#if (BITS_PER_LONG < 64)
		DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
				EXT4_STATE_FC_FLUSHING_DATA);
		wq = bit_waitqueue(&ei->i_state_flags,
				   EXT4_STATE_FC_FLUSHING_DATA);
#else
		DEFINE_WAIT_BIT(wait, &ei->i_flags,
				EXT4_STATE_FC_FLUSHING_DATA);
		wq = bit_waitqueue(&ei->i_flags,
				   EXT4_STATE_FC_FLUSHING_DATA);
#endif
		prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
		if (ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA)) {
			spin_unlock(&sbi->s_fc_lock);
			schedule();
			spin_lock(&sbi->s_fc_lock);
		}
		finish_wait(wq, &wait.wq_entry);
	}

	if (!list_empty(&ei->i_fc_list))
	list_del_init(&ei->i_fc_list);

	/*
@@ -325,8 +356,6 @@ void ext4_fc_del(struct inode *inode)

	release_dentry_name_snapshot(&fc_dentry->fcd_name);
	kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);

	return;
}

/*
@@ -590,9 +619,6 @@ void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
		return;

	if (!list_empty(&ei->i_fc_list))
		return;

	/*
	 * If we come here, we may sleep while waiting for the inode to
	 * commit. We shouldn't be holding i_data_sem when we go to sleep since
@@ -988,61 +1014,25 @@ static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
}


/* Submit data for all the fast commit inodes */
static int ext4_fc_submit_inode_data_all(journal_t *journal)
/* Flushes data of all the inodes in the commit queue. */
static int ext4_fc_flush_data(journal_t *journal)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *ei;
	int ret = 0;

	spin_lock(&sbi->s_fc_lock);
	list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
		while (atomic_read(&ei->i_fc_updates)) {
			DEFINE_WAIT(wait);

			prepare_to_wait(&ei->i_fc_wait, &wait,
						TASK_UNINTERRUPTIBLE);
			if (atomic_read(&ei->i_fc_updates)) {
				spin_unlock(&sbi->s_fc_lock);
				schedule();
				spin_lock(&sbi->s_fc_lock);
			}
			finish_wait(&ei->i_fc_wait, &wait);
		}
		spin_unlock(&sbi->s_fc_lock);
		ret = jbd2_submit_inode_data(journal, ei->jinode);
		if (ret)
			return ret;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	return ret;
}

/* Wait for completion of data for all the fast commit inodes */
static int ext4_fc_wait_inode_data_all(journal_t *journal)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *pos, *n;
	int ret = 0;

	spin_lock(&sbi->s_fc_lock);
	list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		if (!ext4_test_inode_state(&pos->vfs_inode,
					   EXT4_STATE_FC_COMMITTING))
			continue;
		spin_unlock(&sbi->s_fc_lock);

		ret = jbd2_wait_inode_data(journal, pos->jinode);
	list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		ret = jbd2_wait_inode_data(journal, ei->jinode);
		if (ret)
			return ret;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	return 0;
}
@@ -1123,26 +1113,81 @@ static int ext4_fc_perform_commit(journal_t *journal)
	int ret = 0;
	u32 crc = 0;

	ret = ext4_fc_submit_inode_data_all(journal);
	if (ret)
		return ret;
	/*
	 * Step 1: Mark all inodes on s_fc_q[MAIN] with
	 * EXT4_STATE_FC_FLUSHING_DATA. This prevents these inodes from being
	 * freed until the data flush is over.
	 */
	spin_lock(&sbi->s_fc_lock);
	list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		ext4_set_inode_state(&iter->vfs_inode,
				     EXT4_STATE_FC_FLUSHING_DATA);
	}
	spin_unlock(&sbi->s_fc_lock);

	ret = ext4_fc_wait_inode_data_all(journal);
	/* Step 2: Flush data for all the eligible inodes. */
	ret = ext4_fc_flush_data(journal);

	/*
	 * Step 3: Clear EXT4_STATE_FC_FLUSHING_DATA flag, before returning
	 * any error from step 2. This ensures that waiters waiting on
	 * EXT4_STATE_FC_FLUSHING_DATA can resume.
	 */
	spin_lock(&sbi->s_fc_lock);
	list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		ext4_clear_inode_state(&iter->vfs_inode,
				       EXT4_STATE_FC_FLUSHING_DATA);
#if (BITS_PER_LONG < 64)
		wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_FLUSHING_DATA);
#else
		wake_up_bit(&iter->i_flags, EXT4_STATE_FC_FLUSHING_DATA);
#endif
	}

	/*
	 * Make sure clearing of EXT4_STATE_FC_FLUSHING_DATA is visible before
	 * the waiter checks the bit. Pairs with implicit barrier in
	 * prepare_to_wait() in ext4_fc_del().
	 */
	smp_mb();
	spin_unlock(&sbi->s_fc_lock);

	/*
	 * If we encountered error in Step 2, return it now after clearing
	 * EXT4_STATE_FC_FLUSHING_DATA bit.
	 */
	if (ret)
		return ret;


	/* Step 4: Mark all inodes as being committed. */
	jbd2_journal_lock_updates(journal);
	/*
	 * If file system device is different from journal device, issue a cache
	 * flush before we start writing fast commit blocks.
	 * The journal is now locked. No more handles can start and all the
	 * previous handles are now drained. We now mark the inodes on the
	 * commit queue as being committed.
	 */
	spin_lock(&sbi->s_fc_lock);
	list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		ext4_set_inode_state(&iter->vfs_inode,
				     EXT4_STATE_FC_COMMITTING);
	}
	spin_unlock(&sbi->s_fc_lock);
	jbd2_journal_unlock_updates(journal);

	/*
	 * Step 5: If file system device is different from journal device,
	 * issue a cache flush before we start writing fast commit blocks.
	 */
	if (journal->j_fs_dev != journal->j_dev)
		blkdev_issue_flush(journal->j_fs_dev);

	blk_start_plug(&plug);
	/* Step 6: Write fast commit blocks to disk. */
	if (sbi->s_fc_bytes == 0) {
		/*
		 * Add a head tag only if this is the first fast commit
		 * in this TID.
		 * Step 6.1: Add a head tag only if this is the first fast
		 * commit in this TID.
		 */
		head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
		head.fc_tid = cpu_to_le32(
@@ -1154,6 +1199,7 @@ static int ext4_fc_perform_commit(journal_t *journal)
		}
	}

	/* Step 6.2: Now write all the dentry updates. */
	spin_lock(&sbi->s_fc_lock);
	ret = ext4_fc_commit_dentry_updates(journal, &crc);
	if (ret) {
@@ -1161,6 +1207,7 @@ static int ext4_fc_perform_commit(journal_t *journal)
		goto out;
	}

	/* Step 6.3: Now write all the changed inodes to disk. */
	list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		inode = &iter->vfs_inode;
		if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
@@ -1173,10 +1220,8 @@ static int ext4_fc_perform_commit(journal_t *journal)
		ret = ext4_fc_write_inode(inode, &crc);
		if (ret)
			goto out;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	/* Step 6.4: Finally write tail tag to conclude this fast commit. */
	ret = ext4_fc_write_tail(sb, crc);

out:
@@ -1298,7 +1343,7 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *iter, *iter_n;
	struct ext4_inode_info *ei;
	struct ext4_fc_dentry_update *fc_dentry;

	if (full && sbi->s_fc_bh)
@@ -1308,13 +1353,15 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
	jbd2_fc_release_bufs(journal);

	spin_lock(&sbi->s_fc_lock);
	list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN],
				 i_fc_list) {
		list_del_init(&iter->i_fc_list);
		ext4_clear_inode_state(&iter->vfs_inode,
	while (!list_empty(&sbi->s_fc_q[FC_Q_MAIN])) {
		ei = list_first_entry(&sbi->s_fc_q[FC_Q_MAIN],
					struct ext4_inode_info,
					i_fc_list);
		list_del_init(&ei->i_fc_list);
		ext4_clear_inode_state(&ei->vfs_inode,
				       EXT4_STATE_FC_COMMITTING);
		if (tid_geq(tid, iter->i_sync_tid)) {
			ext4_fc_reset_inode(&iter->vfs_inode);
		if (tid_geq(tid, ei->i_sync_tid)) {
			ext4_fc_reset_inode(&ei->vfs_inode);
		} else if (full) {
			/*
			 * We are called after a full commit, inode has been
@@ -1325,15 +1372,19 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
			 * time in that case (and tid doesn't increase so
			 * tid check above isn't reliable).
			 */
			list_add_tail(&EXT4_I(&iter->vfs_inode)->i_fc_list,
			list_add_tail(&ei->i_fc_list,
				      &sbi->s_fc_q[FC_Q_STAGING]);
		}
		/* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
		/*
		 * Make sure clearing of EXT4_STATE_FC_COMMITTING is
		 * visible before we send the wakeup. Pairs with implicit
		 * barrier in prepare_to_wait() in ext4_fc_track_inode().
		 */
		smp_mb();
#if (BITS_PER_LONG < 64)
		wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
		wake_up_bit(&ei->i_state_flags, EXT4_STATE_FC_COMMITTING);
#else
		wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
		wake_up_bit(&ei->i_flags, EXT4_STATE_FC_COMMITTING);
#endif
	}

+0 −2
Original line number Diff line number Diff line
@@ -728,7 +728,6 @@ int jbd2_fc_begin_commit(journal_t *journal, tid_t tid)
	}
	journal->j_flags |= JBD2_FAST_COMMIT_ONGOING;
	write_unlock(&journal->j_state_lock);
	jbd2_journal_lock_updates(journal);

	return 0;
}
@@ -742,7 +741,6 @@ static int __jbd2_fc_end_commit(journal_t *journal, tid_t tid, bool fallback)
{
	if (journal->j_fc_cleanup_callback)
		journal->j_fc_cleanup_callback(journal, 0, tid);
	jbd2_journal_unlock_updates(journal);
	write_lock(&journal->j_state_lock);
	journal->j_flags &= ~JBD2_FAST_COMMIT_ONGOING;
	if (fallback)