Commit 04225d13 authored by Nilay Shroff's avatar Nilay Shroff Committed by Jens Axboe
Browse files

block: fix potential deadlock while running nr_hw_queue update

Move scheduler tags (sched_tags) allocation and deallocation outside
both the ->elevator_lock and ->freeze_lock when updating nr_hw_queues.
This change breaks the dependency chain from the percpu allocator lock
to the elevator lock, helping to prevent potential deadlocks, as
observed in the reported lockdep splat[1].

This commit introduces batch allocation and deallocation helpers for
sched_tags, which are now used from within __blk_mq_update_nr_hw_queues
routine while iterating through the tagset.

With this change, all sched_tags memory management is handled entirely
outside the ->elevator_lock and the ->freeze_lock context, thereby
eliminating the lock dependency that could otherwise manifest during
nr_hw_queues updates.

[1] https://lore.kernel.org/all/0659ea8d-a463-47c8-9180-43c719e106eb@linux.ibm.com/



Reported-by: default avatarStefan Haberland <sth@linux.ibm.com>
Closes: https://lore.kernel.org/all/0659ea8d-a463-47c8-9180-43c719e106eb@linux.ibm.com/


Reviewed-by: default avatarMing Lei <ming.lei@redhat.com>
Reviewed-by: default avatarChristoph Hellwig <hch@lst.de>
Reviewed-by: default avatarHannes Reinecke <hare@suse.de>
Signed-off-by: default avatarNilay Shroff <nilay@linux.ibm.com>
Link: https://lore.kernel.org/r/20250730074614.2537382-4-nilay@linux.ibm.com


Signed-off-by: default avatarJens Axboe <axboe@kernel.dk>
parent f5a6604f
Loading
Loading
Loading
Loading
+65 −0
Original line number Diff line number Diff line
@@ -427,6 +427,32 @@ void blk_mq_free_sched_tags(struct elevator_tags *et,
	kfree(et);
}

void blk_mq_free_sched_tags_batch(struct xarray *et_table,
		struct blk_mq_tag_set *set)
{
	struct request_queue *q;
	struct elevator_tags *et;

	lockdep_assert_held_write(&set->update_nr_hwq_lock);

	list_for_each_entry(q, &set->tag_list, tag_set_list) {
		/*
		 * Accessing q->elevator without holding q->elevator_lock is
		 * safe because we're holding here set->update_nr_hwq_lock in
		 * the writer context. So, scheduler update/switch code (which
		 * acquires the same lock but in the reader context) can't run
		 * concurrently.
		 */
		if (q->elevator) {
			et = xa_load(et_table, q->id);
			if (unlikely(!et))
				WARN_ON_ONCE(1);
			else
				blk_mq_free_sched_tags(et, set);
		}
	}
}

struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set,
		unsigned int nr_hw_queues)
{
@@ -477,6 +503,45 @@ struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set,
	return NULL;
}

int blk_mq_alloc_sched_tags_batch(struct xarray *et_table,
		struct blk_mq_tag_set *set, unsigned int nr_hw_queues)
{
	struct request_queue *q;
	struct elevator_tags *et;
	gfp_t gfp = GFP_NOIO | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;

	lockdep_assert_held_write(&set->update_nr_hwq_lock);

	list_for_each_entry(q, &set->tag_list, tag_set_list) {
		/*
		 * Accessing q->elevator without holding q->elevator_lock is
		 * safe because we're holding here set->update_nr_hwq_lock in
		 * the writer context. So, scheduler update/switch code (which
		 * acquires the same lock but in the reader context) can't run
		 * concurrently.
		 */
		if (q->elevator) {
			et = blk_mq_alloc_sched_tags(set, nr_hw_queues);
			if (!et)
				goto out_unwind;
			if (xa_insert(et_table, q->id, et, gfp))
				goto out_free_tags;
		}
	}
	return 0;
out_free_tags:
	blk_mq_free_sched_tags(et, set);
out_unwind:
	list_for_each_entry_continue_reverse(q, &set->tag_list, tag_set_list) {
		if (q->elevator) {
			et = xa_load(et_table, q->id);
			if (et)
				blk_mq_free_sched_tags(et, set);
		}
	}
	return -ENOMEM;
}

/* caller must have a reference to @e, will grab another one if successful */
int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e,
		struct elevator_tags *et)
+4 −0
Original line number Diff line number Diff line
@@ -25,8 +25,12 @@ void blk_mq_sched_free_rqs(struct request_queue *q);

struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set,
		unsigned int nr_hw_queues);
int blk_mq_alloc_sched_tags_batch(struct xarray *et_table,
		struct blk_mq_tag_set *set, unsigned int nr_hw_queues);
void blk_mq_free_sched_tags(struct elevator_tags *et,
		struct blk_mq_tag_set *set);
void blk_mq_free_sched_tags_batch(struct xarray *et_table,
		struct blk_mq_tag_set *set);

static inline void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
{
+11 −5
Original line number Diff line number Diff line
@@ -4974,12 +4974,13 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
 * Switch back to the elevator type stored in the xarray.
 */
static void blk_mq_elv_switch_back(struct request_queue *q,
		struct xarray *elv_tbl)
		struct xarray *elv_tbl, struct xarray *et_tbl)
{
	struct elevator_type *e = xa_load(elv_tbl, q->id);
	struct elevator_tags *t = xa_load(et_tbl, q->id);

	/* The elv_update_nr_hw_queues unfreezes the queue. */
	elv_update_nr_hw_queues(q, e);
	elv_update_nr_hw_queues(q, e, t);

	/* Drop the reference acquired in blk_mq_elv_switch_none. */
	if (e)
@@ -5031,7 +5032,7 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
	int prev_nr_hw_queues = set->nr_hw_queues;
	unsigned int memflags;
	int i;
	struct xarray elv_tbl;
	struct xarray elv_tbl, et_tbl;

	lockdep_assert_held(&set->tag_list_lock);

@@ -5044,6 +5045,10 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,

	memflags = memalloc_noio_save();

	xa_init(&et_tbl);
	if (blk_mq_alloc_sched_tags_batch(&et_tbl, set, nr_hw_queues) < 0)
		goto out_memalloc_restore;

	xa_init(&elv_tbl);

	list_for_each_entry(q, &set->tag_list, tag_set_list) {
@@ -5087,7 +5092,7 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
switch_back:
	/* The blk_mq_elv_switch_back unfreezes queue for us. */
	list_for_each_entry(q, &set->tag_list, tag_set_list)
		blk_mq_elv_switch_back(q, &elv_tbl);
		blk_mq_elv_switch_back(q, &elv_tbl, &et_tbl);

	list_for_each_entry(q, &set->tag_list, tag_set_list) {
		blk_mq_sysfs_register_hctxs(q);
@@ -5098,7 +5103,8 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
	}

	xa_destroy(&elv_tbl);

	xa_destroy(&et_tbl);
out_memalloc_restore:
	memalloc_noio_restore(memflags);

	/* Free the excess tags when nr_hw_queues shrink. */
+3 −1
Original line number Diff line number Diff line
@@ -12,6 +12,7 @@
#include "blk-crypto-internal.h"

struct elevator_type;
struct elevator_tags;

/*
 * Default upper limit for the software max_sectors limit used for regular I/Os.
@@ -330,7 +331,8 @@ bool blk_bio_list_merge(struct request_queue *q, struct list_head *list,

bool blk_insert_flush(struct request *rq);

void elv_update_nr_hw_queues(struct request_queue *q, struct elevator_type *e);
void elv_update_nr_hw_queues(struct request_queue *q, struct elevator_type *e,
		struct elevator_tags *t);
void elevator_set_default(struct request_queue *q);
void elevator_set_none(struct request_queue *q);

+6 −9
Original line number Diff line number Diff line
@@ -705,7 +705,8 @@ static int elevator_change(struct request_queue *q, struct elv_change_ctx *ctx)
 * The I/O scheduler depends on the number of hardware queues, this forces a
 * reattachment when nr_hw_queues changes.
 */
void elv_update_nr_hw_queues(struct request_queue *q, struct elevator_type *e)
void elv_update_nr_hw_queues(struct request_queue *q, struct elevator_type *e,
		struct elevator_tags *t)
{
	struct blk_mq_tag_set *set = q->tag_set;
	struct elv_change_ctx ctx = {};
@@ -715,25 +716,21 @@ void elv_update_nr_hw_queues(struct request_queue *q, struct elevator_type *e)

	if (e && !blk_queue_dying(q) && blk_queue_registered(q)) {
		ctx.name = e->elevator_name;
		ctx.et = blk_mq_alloc_sched_tags(set, set->nr_hw_queues);
		if (!ctx.et) {
			WARN_ON_ONCE(1);
			goto unfreeze;
		}
		ctx.et = t;

		mutex_lock(&q->elevator_lock);
		/* force to reattach elevator after nr_hw_queue is updated */
		ret = elevator_switch(q, &ctx);
		mutex_unlock(&q->elevator_lock);
	}
unfreeze:
	blk_mq_unfreeze_queue_nomemrestore(q);
	if (!ret)
		WARN_ON_ONCE(elevator_change_done(q, &ctx));
	/*
	 * Free sched tags if it's allocated but we couldn't switch elevator.
	 */
	if (ctx.et && !ctx.new)
		blk_mq_free_sched_tags(ctx.et, set);
	if (t && !ctx.new)
		blk_mq_free_sched_tags(t, set);
}

/*