Unverified Commit e1b849cf authored by Jan Kara's avatar Jan Kara Committed by Christian Brauner
Browse files

writeback: Avoid contention on wb->list_lock when switching inodes



There can be multiple inode switch works that are trying to switch
inodes to / from the same wb. This can happen in particular if some
cgroup exits which owns many (thousands) inodes and we need to switch
them all. In this case several inode_switch_wbs_work_fn() instances will
be just spinning on the same wb->list_lock while only one of them makes
forward progress. This wastes CPU cycles and quickly leads to softlockup
reports and unusable system.

Instead of running several inode_switch_wbs_work_fn() instances in
parallel switching to the same wb and contending on wb->list_lock, run
just one work item per wb and manage a queue of isw items switching to
this wb.

Acked-by: default avatarTejun Heo <tj@kernel.org>
Signed-off-by: default avatarJan Kara <jack@suse.cz>
parent 8f5ae30d
Loading
Loading
Loading
Loading
+63 −36
Original line number Diff line number Diff line
@@ -368,7 +368,8 @@ static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
}

struct inode_switch_wbs_context {
	struct rcu_work		work;
	/* List of queued switching contexts for the wb */
	struct llist_node	list;

	/*
	 * Multiple inodes can be switched at once.  The switching procedure
@@ -378,7 +379,6 @@ struct inode_switch_wbs_context {
	 * array embedded into struct inode_switch_wbs_context.  Otherwise
	 * an inode could be left in a non-consistent state.
	 */
	struct bdi_writeback	*new_wb;
	struct inode		*inodes[];
};

@@ -486,13 +486,11 @@ static bool inode_do_switch_wbs(struct inode *inode,
	return switched;
}

static void inode_switch_wbs_work_fn(struct work_struct *work)
static void process_inode_switch_wbs(struct bdi_writeback *new_wb,
				     struct inode_switch_wbs_context *isw)
{
	struct inode_switch_wbs_context *isw =
		container_of(to_rcu_work(work), struct inode_switch_wbs_context, work);
	struct backing_dev_info *bdi = inode_to_bdi(isw->inodes[0]);
	struct bdi_writeback *old_wb = isw->inodes[0]->i_wb;
	struct bdi_writeback *new_wb = isw->new_wb;
	unsigned long nr_switched = 0;
	struct inode **inodep;

@@ -543,6 +541,38 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
	atomic_dec(&isw_nr_in_flight);
}

void inode_switch_wbs_work_fn(struct work_struct *work)
{
	struct bdi_writeback *new_wb = container_of(work, struct bdi_writeback,
						    switch_work);
	struct inode_switch_wbs_context *isw, *next_isw;
	struct llist_node *list;

	/*
	 * Grab out reference to wb so that it cannot get freed under us
	 * after we process all the isw items.
	 */
	wb_get(new_wb);
	while (1) {
		list = llist_del_all(&new_wb->switch_wbs_ctxs);
		/* Nothing to do? */
		if (!list)
			break;
		/*
		 * In addition to synchronizing among switchers, I_WB_SWITCH
		 * tells the RCU protected stat update paths to grab the i_page
		 * lock so that stat transfer can synchronize against them.
		 * Let's continue after I_WB_SWITCH is guaranteed to be
		 * visible.
		 */
		synchronize_rcu();

		llist_for_each_entry_safe(isw, next_isw, list, list)
			process_inode_switch_wbs(new_wb, isw);
	}
	wb_put(new_wb);
}

static bool inode_prepare_wbs_switch(struct inode *inode,
				     struct bdi_writeback *new_wb)
{
@@ -572,6 +602,13 @@ static bool inode_prepare_wbs_switch(struct inode *inode,
	return true;
}

static void wb_queue_isw(struct bdi_writeback *wb,
			 struct inode_switch_wbs_context *isw)
{
	if (llist_add(&isw->list, &wb->switch_wbs_ctxs))
		queue_work(isw_wq, &wb->switch_work);
}

/**
 * inode_switch_wbs - change the wb association of an inode
 * @inode: target inode
@@ -585,6 +622,7 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
	struct backing_dev_info *bdi = inode_to_bdi(inode);
	struct cgroup_subsys_state *memcg_css;
	struct inode_switch_wbs_context *isw;
	struct bdi_writeback *new_wb = NULL;

	/* noop if seems to be already in progress */
	if (inode->i_state & I_WB_SWITCH)
@@ -609,40 +647,34 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
	if (!memcg_css)
		goto out_free;

	isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
	new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
	css_put(memcg_css);
	if (!isw->new_wb)
	if (!new_wb)
		goto out_free;

	if (!inode_prepare_wbs_switch(inode, isw->new_wb))
	if (!inode_prepare_wbs_switch(inode, new_wb))
		goto out_free;

	isw->inodes[0] = inode;

	/*
	 * In addition to synchronizing among switchers, I_WB_SWITCH tells
	 * the RCU protected stat update paths to grab the i_page
	 * lock so that stat transfer can synchronize against them.
	 * Let's continue after I_WB_SWITCH is guaranteed to be visible.
	 */
	INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn);
	queue_rcu_work(isw_wq, &isw->work);
	wb_queue_isw(new_wb, isw);
	return;

out_free:
	atomic_dec(&isw_nr_in_flight);
	if (isw->new_wb)
		wb_put(isw->new_wb);
	if (new_wb)
		wb_put(new_wb);
	kfree(isw);
}

static bool isw_prepare_wbs_switch(struct inode_switch_wbs_context *isw,
static bool isw_prepare_wbs_switch(struct bdi_writeback *new_wb,
				   struct inode_switch_wbs_context *isw,
				   struct list_head *list, int *nr)
{
	struct inode *inode;

	list_for_each_entry(inode, list, i_io_list) {
		if (!inode_prepare_wbs_switch(inode, isw->new_wb))
		if (!inode_prepare_wbs_switch(inode, new_wb))
			continue;

		isw->inodes[*nr] = inode;
@@ -666,6 +698,7 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb)
{
	struct cgroup_subsys_state *memcg_css;
	struct inode_switch_wbs_context *isw;
	struct bdi_writeback *new_wb;
	int nr;
	bool restart = false;

@@ -678,12 +711,12 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb)

	for (memcg_css = wb->memcg_css->parent; memcg_css;
	     memcg_css = memcg_css->parent) {
		isw->new_wb = wb_get_create(wb->bdi, memcg_css, GFP_KERNEL);
		if (isw->new_wb)
		new_wb = wb_get_create(wb->bdi, memcg_css, GFP_KERNEL);
		if (new_wb)
			break;
	}
	if (unlikely(!isw->new_wb))
		isw->new_wb = &wb->bdi->wb; /* wb_get() is noop for bdi's wb */
	if (unlikely(!new_wb))
		new_wb = &wb->bdi->wb; /* wb_get() is noop for bdi's wb */

	nr = 0;
	spin_lock(&wb->list_lock);
@@ -695,27 +728,21 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb)
	 * bandwidth restrictions, as writeback of inode metadata is not
	 * accounted for.
	 */
	restart = isw_prepare_wbs_switch(isw, &wb->b_attached, &nr);
	restart = isw_prepare_wbs_switch(new_wb, isw, &wb->b_attached, &nr);
	if (!restart)
		restart = isw_prepare_wbs_switch(isw, &wb->b_dirty_time, &nr);
		restart = isw_prepare_wbs_switch(new_wb, isw, &wb->b_dirty_time,
						 &nr);
	spin_unlock(&wb->list_lock);

	/* no attached inodes? bail out */
	if (nr == 0) {
		atomic_dec(&isw_nr_in_flight);
		wb_put(isw->new_wb);
		wb_put(new_wb);
		kfree(isw);
		return restart;
	}

	/*
	 * In addition to synchronizing among switchers, I_WB_SWITCH tells
	 * the RCU protected stat update paths to grab the i_page
	 * lock so that stat transfer can synchronize against them.
	 * Let's continue after I_WB_SWITCH is guaranteed to be visible.
	 */
	INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn);
	queue_rcu_work(isw_wq, &isw->work);
	wb_queue_isw(new_wb, isw);

	return restart;
}
+4 −0
Original line number Diff line number Diff line
@@ -152,6 +152,10 @@ struct bdi_writeback {
	struct list_head blkcg_node;	/* anchored at blkcg->cgwb_list */
	struct list_head b_attached;	/* attached inodes, protected by list_lock */
	struct list_head offline_node;	/* anchored at offline_cgwbs */
	struct work_struct switch_work;	/* work used to perform inode switching
					 * to this wb */
	struct llist_head switch_wbs_ctxs;	/* queued contexts for
						 * writeback switching */

	union {
		struct work_struct release_work;
+2 −0
Original line number Diff line number Diff line
@@ -265,6 +265,8 @@ static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio)
		bio_associate_blkg_from_css(bio, wbc->wb->blkcg_css);
}

void inode_switch_wbs_work_fn(struct work_struct *work);

#else	/* CONFIG_CGROUP_WRITEBACK */

static inline void inode_attach_wb(struct inode *inode, struct folio *folio)
+5 −0
Original line number Diff line number Diff line
@@ -633,6 +633,7 @@ static void cgwb_release_workfn(struct work_struct *work)
	wb_exit(wb);
	bdi_put(bdi);
	WARN_ON_ONCE(!list_empty(&wb->b_attached));
	WARN_ON_ONCE(work_pending(&wb->switch_work));
	call_rcu(&wb->rcu, cgwb_free_rcu);
}

@@ -709,6 +710,8 @@ static int cgwb_create(struct backing_dev_info *bdi,
	wb->memcg_css = memcg_css;
	wb->blkcg_css = blkcg_css;
	INIT_LIST_HEAD(&wb->b_attached);
	INIT_WORK(&wb->switch_work, inode_switch_wbs_work_fn);
	init_llist_head(&wb->switch_wbs_ctxs);
	INIT_WORK(&wb->release_work, cgwb_release_workfn);
	set_bit(WB_registered, &wb->state);
	bdi_get(bdi);
@@ -839,6 +842,8 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi)
	if (!ret) {
		bdi->wb.memcg_css = &root_mem_cgroup->css;
		bdi->wb.blkcg_css = blkcg_root_css;
		INIT_WORK(&bdi->wb.switch_work, inode_switch_wbs_work_fn);
		init_llist_head(&bdi->wb.switch_wbs_ctxs);
	}
	return ret;
}