Unverified Commit 9426414f authored by Christian Brauner's avatar Christian Brauner
Browse files

Merge patch series "writeback: Avoid lockups when switching inodes"

Jan Kara <jack@suse.cz> says:

This patch series addresses lockups reported by users when systemd unit reading
lots of files from a filesystem mounted with lazytime mount option exits. See
patch 3 for more details about the reproducer.

There are two main issues why switching many inodes between wbs:

1) Multiple workers will be spawned to do the switching but they all contend
on the same wb->list_lock making all the parallelism pointless and just
wasting time.

2) Sorting of wb->b_dirty list by dirtied_time_when is inherently slow.

Patches 1-3 address these problems, patch 4 adds a tracepoint for better
observability of inode writeback switching.

* patches from https://lore.kernel.org/20250912103522.2935-1-jack@suse.cz

:
  writeback: Add tracepoint to track pending inode switches
  writeback: Avoid excessively long inode switching times
  writeback: Avoid softlockup when switching many inodes
  writeback: Avoid contention on wb->list_lock when switching inodes

Signed-off-by: default avatarChristian Brauner <brauner@kernel.org>
parents 8f5ae30d 0cee64c5
Loading
Loading
Loading
Loading
+86 −47
Original line number Diff line number Diff line
@@ -368,7 +368,8 @@ static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
}

struct inode_switch_wbs_context {
	struct rcu_work		work;
	/* List of queued switching contexts for the wb */
	struct llist_node	list;

	/*
	 * Multiple inodes can be switched at once.  The switching procedure
@@ -378,7 +379,6 @@ struct inode_switch_wbs_context {
	 * array embedded into struct inode_switch_wbs_context.  Otherwise
	 * an inode could be left in a non-consistent state.
	 */
	struct bdi_writeback	*new_wb;
	struct inode		*inodes[];
};

@@ -445,22 +445,23 @@ static bool inode_do_switch_wbs(struct inode *inode,
	 * Transfer to @new_wb's IO list if necessary.  If the @inode is dirty,
	 * the specific list @inode was on is ignored and the @inode is put on
	 * ->b_dirty which is always correct including from ->b_dirty_time.
	 * The transfer preserves @inode->dirtied_when ordering.  If the @inode
	 * was clean, it means it was on the b_attached list, so move it onto
	 * the b_attached list of @new_wb.
	 * If the @inode was clean, it means it was on the b_attached list, so
	 * move it onto the b_attached list of @new_wb.
	 */
	if (!list_empty(&inode->i_io_list)) {
		inode->i_wb = new_wb;

		if (inode->i_state & I_DIRTY_ALL) {
			struct inode *pos;

			list_for_each_entry(pos, &new_wb->b_dirty, i_io_list)
				if (time_after_eq(inode->dirtied_when,
						  pos->dirtied_when))
					break;
			/*
			 * We need to keep b_dirty list sorted by
			 * dirtied_time_when. However properly sorting the
			 * inode in the list gets too expensive when switching
			 * many inodes. So just attach inode at the end of the
			 * dirty list and clobber the dirtied_time_when.
			 */
			inode->dirtied_time_when = jiffies;
			inode_io_list_move_locked(inode, new_wb,
						  pos->i_io_list.prev);
						  &new_wb->b_dirty);
		} else {
			inode_cgwb_move_to_attached(inode, new_wb);
		}
@@ -486,13 +487,11 @@ static bool inode_do_switch_wbs(struct inode *inode,
	return switched;
}

static void inode_switch_wbs_work_fn(struct work_struct *work)
static void process_inode_switch_wbs(struct bdi_writeback *new_wb,
				     struct inode_switch_wbs_context *isw)
{
	struct inode_switch_wbs_context *isw =
		container_of(to_rcu_work(work), struct inode_switch_wbs_context, work);
	struct backing_dev_info *bdi = inode_to_bdi(isw->inodes[0]);
	struct bdi_writeback *old_wb = isw->inodes[0]->i_wb;
	struct bdi_writeback *new_wb = isw->new_wb;
	unsigned long nr_switched = 0;
	struct inode **inodep;

@@ -502,6 +501,7 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
	 */
	down_read(&bdi->wb_switch_rwsem);

	inodep = isw->inodes;
	/*
	 * By the time control reaches here, RCU grace period has passed
	 * since I_WB_SWITCH assertion and all wb stat update transactions
@@ -512,6 +512,7 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
	 * gives us exclusion against all wb related operations on @inode
	 * including IO list manipulations and stat updates.
	 */
relock:
	if (old_wb < new_wb) {
		spin_lock(&old_wb->list_lock);
		spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING);
@@ -520,10 +521,17 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
		spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING);
	}

	for (inodep = isw->inodes; *inodep; inodep++) {
	while (*inodep) {
		WARN_ON_ONCE((*inodep)->i_wb != old_wb);
		if (inode_do_switch_wbs(*inodep, old_wb, new_wb))
			nr_switched++;
		inodep++;
		if (*inodep && need_resched()) {
			spin_unlock(&new_wb->list_lock);
			spin_unlock(&old_wb->list_lock);
			cond_resched();
			goto relock;
		}
	}

	spin_unlock(&new_wb->list_lock);
@@ -543,6 +551,38 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
	atomic_dec(&isw_nr_in_flight);
}

void inode_switch_wbs_work_fn(struct work_struct *work)
{
	struct bdi_writeback *new_wb = container_of(work, struct bdi_writeback,
						    switch_work);
	struct inode_switch_wbs_context *isw, *next_isw;
	struct llist_node *list;

	/*
	 * Grab out reference to wb so that it cannot get freed under us
	 * after we process all the isw items.
	 */
	wb_get(new_wb);
	while (1) {
		list = llist_del_all(&new_wb->switch_wbs_ctxs);
		/* Nothing to do? */
		if (!list)
			break;
		/*
		 * In addition to synchronizing among switchers, I_WB_SWITCH
		 * tells the RCU protected stat update paths to grab the i_page
		 * lock so that stat transfer can synchronize against them.
		 * Let's continue after I_WB_SWITCH is guaranteed to be
		 * visible.
		 */
		synchronize_rcu();

		llist_for_each_entry_safe(isw, next_isw, list, list)
			process_inode_switch_wbs(new_wb, isw);
	}
	wb_put(new_wb);
}

static bool inode_prepare_wbs_switch(struct inode *inode,
				     struct bdi_writeback *new_wb)
{
@@ -572,6 +612,13 @@ static bool inode_prepare_wbs_switch(struct inode *inode,
	return true;
}

static void wb_queue_isw(struct bdi_writeback *wb,
			 struct inode_switch_wbs_context *isw)
{
	if (llist_add(&isw->list, &wb->switch_wbs_ctxs))
		queue_work(isw_wq, &wb->switch_work);
}

/**
 * inode_switch_wbs - change the wb association of an inode
 * @inode: target inode
@@ -585,6 +632,7 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
	struct backing_dev_info *bdi = inode_to_bdi(inode);
	struct cgroup_subsys_state *memcg_css;
	struct inode_switch_wbs_context *isw;
	struct bdi_writeback *new_wb = NULL;

	/* noop if seems to be already in progress */
	if (inode->i_state & I_WB_SWITCH)
@@ -609,40 +657,35 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
	if (!memcg_css)
		goto out_free;

	isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
	new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
	css_put(memcg_css);
	if (!isw->new_wb)
	if (!new_wb)
		goto out_free;

	if (!inode_prepare_wbs_switch(inode, isw->new_wb))
	if (!inode_prepare_wbs_switch(inode, new_wb))
		goto out_free;

	isw->inodes[0] = inode;

	/*
	 * In addition to synchronizing among switchers, I_WB_SWITCH tells
	 * the RCU protected stat update paths to grab the i_page
	 * lock so that stat transfer can synchronize against them.
	 * Let's continue after I_WB_SWITCH is guaranteed to be visible.
	 */
	INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn);
	queue_rcu_work(isw_wq, &isw->work);
	trace_inode_switch_wbs_queue(inode->i_wb, new_wb, 1);
	wb_queue_isw(new_wb, isw);
	return;

out_free:
	atomic_dec(&isw_nr_in_flight);
	if (isw->new_wb)
		wb_put(isw->new_wb);
	if (new_wb)
		wb_put(new_wb);
	kfree(isw);
}

static bool isw_prepare_wbs_switch(struct inode_switch_wbs_context *isw,
static bool isw_prepare_wbs_switch(struct bdi_writeback *new_wb,
				   struct inode_switch_wbs_context *isw,
				   struct list_head *list, int *nr)
{
	struct inode *inode;

	list_for_each_entry(inode, list, i_io_list) {
		if (!inode_prepare_wbs_switch(inode, isw->new_wb))
		if (!inode_prepare_wbs_switch(inode, new_wb))
			continue;

		isw->inodes[*nr] = inode;
@@ -666,6 +709,7 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb)
{
	struct cgroup_subsys_state *memcg_css;
	struct inode_switch_wbs_context *isw;
	struct bdi_writeback *new_wb;
	int nr;
	bool restart = false;

@@ -678,12 +722,12 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb)

	for (memcg_css = wb->memcg_css->parent; memcg_css;
	     memcg_css = memcg_css->parent) {
		isw->new_wb = wb_get_create(wb->bdi, memcg_css, GFP_KERNEL);
		if (isw->new_wb)
		new_wb = wb_get_create(wb->bdi, memcg_css, GFP_KERNEL);
		if (new_wb)
			break;
	}
	if (unlikely(!isw->new_wb))
		isw->new_wb = &wb->bdi->wb; /* wb_get() is noop for bdi's wb */
	if (unlikely(!new_wb))
		new_wb = &wb->bdi->wb; /* wb_get() is noop for bdi's wb */

	nr = 0;
	spin_lock(&wb->list_lock);
@@ -695,27 +739,22 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb)
	 * bandwidth restrictions, as writeback of inode metadata is not
	 * accounted for.
	 */
	restart = isw_prepare_wbs_switch(isw, &wb->b_attached, &nr);
	restart = isw_prepare_wbs_switch(new_wb, isw, &wb->b_attached, &nr);
	if (!restart)
		restart = isw_prepare_wbs_switch(isw, &wb->b_dirty_time, &nr);
		restart = isw_prepare_wbs_switch(new_wb, isw, &wb->b_dirty_time,
						 &nr);
	spin_unlock(&wb->list_lock);

	/* no attached inodes? bail out */
	if (nr == 0) {
		atomic_dec(&isw_nr_in_flight);
		wb_put(isw->new_wb);
		wb_put(new_wb);
		kfree(isw);
		return restart;
	}

	/*
	 * In addition to synchronizing among switchers, I_WB_SWITCH tells
	 * the RCU protected stat update paths to grab the i_page
	 * lock so that stat transfer can synchronize against them.
	 * Let's continue after I_WB_SWITCH is guaranteed to be visible.
	 */
	INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn);
	queue_rcu_work(isw_wq, &isw->work);
	trace_inode_switch_wbs_queue(wb, new_wb, nr);
	wb_queue_isw(new_wb, isw);

	return restart;
}
+4 −0
Original line number Diff line number Diff line
@@ -152,6 +152,10 @@ struct bdi_writeback {
	struct list_head blkcg_node;	/* anchored at blkcg->cgwb_list */
	struct list_head b_attached;	/* attached inodes, protected by list_lock */
	struct list_head offline_node;	/* anchored at offline_cgwbs */
	struct work_struct switch_work;	/* work used to perform inode switching
					 * to this wb */
	struct llist_head switch_wbs_ctxs;	/* queued contexts for
						 * writeback switching */

	union {
		struct work_struct release_work;
+2 −0
Original line number Diff line number Diff line
@@ -265,6 +265,8 @@ static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio)
		bio_associate_blkg_from_css(bio, wbc->wb->blkcg_css);
}

void inode_switch_wbs_work_fn(struct work_struct *work);

#else	/* CONFIG_CGROUP_WRITEBACK */

static inline void inode_attach_wb(struct inode *inode, struct folio *folio)
+29 −0
Original line number Diff line number Diff line
@@ -213,6 +213,35 @@ TRACE_EVENT(inode_foreign_history,
	)
);

TRACE_EVENT(inode_switch_wbs_queue,

	TP_PROTO(struct bdi_writeback *old_wb, struct bdi_writeback *new_wb,
		 unsigned int count),

	TP_ARGS(old_wb, new_wb, count),

	TP_STRUCT__entry(
		__array(char,		name, 32)
		__field(ino_t,		old_cgroup_ino)
		__field(ino_t,		new_cgroup_ino)
		__field(unsigned int,	count)
	),

	TP_fast_assign(
		strscpy_pad(__entry->name, bdi_dev_name(old_wb->bdi), 32);
		__entry->old_cgroup_ino	= __trace_wb_assign_cgroup(old_wb);
		__entry->new_cgroup_ino	= __trace_wb_assign_cgroup(new_wb);
		__entry->count		= count;
	),

	TP_printk("bdi %s: old_cgroup_ino=%lu new_cgroup_ino=%lu count=%u",
		__entry->name,
		(unsigned long)__entry->old_cgroup_ino,
		(unsigned long)__entry->new_cgroup_ino,
		__entry->count
	)
);

TRACE_EVENT(inode_switch_wbs,

	TP_PROTO(struct inode *inode, struct bdi_writeback *old_wb,
+5 −0
Original line number Diff line number Diff line
@@ -633,6 +633,7 @@ static void cgwb_release_workfn(struct work_struct *work)
	wb_exit(wb);
	bdi_put(bdi);
	WARN_ON_ONCE(!list_empty(&wb->b_attached));
	WARN_ON_ONCE(work_pending(&wb->switch_work));
	call_rcu(&wb->rcu, cgwb_free_rcu);
}

@@ -709,6 +710,8 @@ static int cgwb_create(struct backing_dev_info *bdi,
	wb->memcg_css = memcg_css;
	wb->blkcg_css = blkcg_css;
	INIT_LIST_HEAD(&wb->b_attached);
	INIT_WORK(&wb->switch_work, inode_switch_wbs_work_fn);
	init_llist_head(&wb->switch_wbs_ctxs);
	INIT_WORK(&wb->release_work, cgwb_release_workfn);
	set_bit(WB_registered, &wb->state);
	bdi_get(bdi);
@@ -839,6 +842,8 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi)
	if (!ret) {
		bdi->wb.memcg_css = &root_mem_cgroup->css;
		bdi->wb.blkcg_css = blkcg_root_css;
		INIT_WORK(&bdi->wb.switch_work, inode_switch_wbs_work_fn);
		init_llist_head(&bdi->wb.switch_wbs_ctxs);
	}
	return ret;
}