Commit f5a6604f authored by Nilay Shroff, committed by Jens Axboe
Browse files

block: fix lockdep warning caused by lock dependency in elv_iosched_store

Recent lockdep reports [1] have revealed a potential deadlock caused by a
lock dependency between the percpu allocator lock and the elevator lock.
This issue can be avoided by ensuring that the allocation and release of
scheduler tags (sched_tags) are performed outside the elevator lock.
Furthermore, the queue does not need to remain frozen during these
operations.

To address this, move all sched_tags allocations and deallocations outside
of both the ->elevator_lock and the ->freeze_lock. Since the lifetime of
the elevator queue and its associated sched_tags is closely tied, the
allocated sched_tags are now stored in the elevator queue structure. Then,
during the actual elevator switch (which runs under ->freeze_lock and
->elevator_lock), the pre-allocated sched_tags are assigned to the
appropriate q->hctx. Once the elevator switch is complete and the locks
are released, the old elevator queue and its associated sched_tags are
freed.

This commit specifically addresses the allocation/deallocation of
sched_tags during elevator switching. Note that sched_tags may also be allocated
in other contexts, such as during nr_hw_queues updates. Supporting that
use case will require batch allocation/deallocation, which will be handled
in a follow-up patch.

This restructuring ensures that sched_tags memory management occurs
entirely outside of the ->elevator_lock and ->freeze_lock context,
eliminating the lock dependency problem seen during scheduler updates.

[1] https://lore.kernel.org/all/0659ea8d-a463-47c8-9180-43c719e106eb@linux.ibm.com/



Reported-by: Stefan Haberland <sth@linux.ibm.com>
Closes: https://lore.kernel.org/all/0659ea8d-a463-47c8-9180-43c719e106eb@linux.ibm.com/


Reviewed-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Nilay Shroff <nilay@linux.ibm.com>
Link: https://lore.kernel.org/r/20250730074614.2537382-3-nilay@linux.ibm.com


Signed-off-by: Jens Axboe <axboe@kernel.dk>
parent 49811586
Loading
Loading
Loading
Loading
+83 −72
Original line number Diff line number Diff line
@@ -374,64 +374,17 @@ bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq,
}
EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge);

static int blk_mq_sched_alloc_map_and_rqs(struct request_queue *q,
					  struct blk_mq_hw_ctx *hctx,
					  unsigned int hctx_idx)
{
	if (blk_mq_is_shared_tags(q->tag_set->flags)) {
		hctx->sched_tags = q->sched_shared_tags;
		return 0;
	}

	hctx->sched_tags = blk_mq_alloc_map_and_rqs(q->tag_set, hctx_idx,
						    q->nr_requests);

	if (!hctx->sched_tags)
		return -ENOMEM;
	return 0;
}

static void blk_mq_exit_sched_shared_tags(struct request_queue *queue)
{
	blk_mq_free_rq_map(queue->sched_shared_tags);
	queue->sched_shared_tags = NULL;
}

/* called in queue's release handler, tagset has gone away */
static void blk_mq_sched_tags_teardown(struct request_queue *q, unsigned int flags)
{
	struct blk_mq_hw_ctx *hctx;
	unsigned long i;

	queue_for_each_hw_ctx(q, hctx, i) {
		if (hctx->sched_tags) {
			if (!blk_mq_is_shared_tags(flags))
				blk_mq_free_rq_map(hctx->sched_tags);
	queue_for_each_hw_ctx(q, hctx, i)
		hctx->sched_tags = NULL;
		}
	}

	if (blk_mq_is_shared_tags(flags))
		blk_mq_exit_sched_shared_tags(q);
}

static int blk_mq_init_sched_shared_tags(struct request_queue *queue)
{
	struct blk_mq_tag_set *set = queue->tag_set;

	/*
	 * Set initial depth at max so that we don't need to reallocate for
	 * updating nr_requests.
	 */
	queue->sched_shared_tags = blk_mq_alloc_map_and_rqs(set,
						BLK_MQ_NO_HCTX_IDX,
						MAX_SCHED_RQ);
	if (!queue->sched_shared_tags)
		return -ENOMEM;

	blk_mq_tag_update_sched_shared_tags(queue);

	return 0;
		q->sched_shared_tags = NULL;
}

void blk_mq_sched_reg_debugfs(struct request_queue *q)
@@ -458,8 +411,75 @@ void blk_mq_sched_unreg_debugfs(struct request_queue *q)
	mutex_unlock(&q->debugfs_mutex);
}

/*
 * Release every scheduler tag map held in @et, then free @et itself.
 *
 * @et:  tag container, as built by blk_mq_alloc_sched_tags()
 * @set: tag set the maps were allocated against; its flags decide whether
 *       @et holds one shared map or one map per hardware queue
 */
void blk_mq_free_sched_tags(struct elevator_tags *et,
		struct blk_mq_tag_set *set)
{
	unsigned long idx;

	if (!blk_mq_is_shared_tags(set->flags)) {
		/* One private map per hardware queue. */
		for (idx = 0; idx < et->nr_hw_queues; idx++)
			blk_mq_free_map_and_rqs(set, et->tags[idx], idx);
	} else {
		/* The single shared map lives in slot 0 of @tags. */
		blk_mq_free_map_and_rqs(set, et->tags[0], BLK_MQ_NO_HCTX_IDX);
	}

	kfree(et);
}

/*
 * Allocate an elevator_tags container together with the scheduler tag maps
 * it describes.
 *
 * @set:          tag set to allocate the maps against
 * @nr_hw_queues: number of hardware queues to provide maps for
 *
 * With shared tags a single map of depth MAX_SCHED_RQ is stored in slot 0;
 * otherwise one map of depth ->nr_requests is allocated per hardware queue.
 *
 * Returns the new container, or NULL if any allocation failed (in which
 * case everything allocated so far has been released again).
 */
struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set,
		unsigned int nr_hw_queues)
{
	gfp_t gfp = GFP_NOIO | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
	struct elevator_tags *et;
	unsigned int nr_tags;
	int i;

	/* Shared tags need exactly one slot, regardless of queue count. */
	nr_tags = blk_mq_is_shared_tags(set->flags) ? 1 : nr_hw_queues;

	et = kmalloc(sizeof(*et) + nr_tags * sizeof(et->tags[0]), gfp);
	if (!et)
		return NULL;

	/*
	 * Per-hw-queue depth: twice the smaller of the hardware queue depth
	 * and 128, because requests are no longer split into sync/async
	 * pools as the old code did.
	 */
	et->nr_requests = 2 * min_t(unsigned int, set->queue_depth,
			BLKDEV_DEFAULT_RQ);
	et->nr_hw_queues = nr_hw_queues;

	if (blk_mq_is_shared_tags(set->flags)) {
		/* Shared tags are stored at index 0 in @tags. */
		et->tags[0] = blk_mq_alloc_map_and_rqs(set, BLK_MQ_NO_HCTX_IDX,
					MAX_SCHED_RQ);
		if (!et->tags[0])
			goto free_et;
		return et;
	}

	for (i = 0; i < et->nr_hw_queues; i++) {
		et->tags[i] = blk_mq_alloc_map_and_rqs(set, i,
				et->nr_requests);
		if (!et->tags[i])
			goto unwind_maps;
	}

	return et;

unwind_maps:
	/* Slot i failed; release slots i-1 .. 0 in reverse order. */
	while (--i >= 0)
		blk_mq_free_map_and_rqs(set, et->tags[i], i);
free_et:
	kfree(et);
	return NULL;
}

/* caller must have a reference to @e, will grab another one if successful */
int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e,
		struct elevator_tags *et)
{
	unsigned int flags = q->tag_set->flags;
	struct blk_mq_hw_ctx *hctx;
@@ -467,40 +487,33 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
	unsigned long i;
	int ret;

	/*
	 * Default to double of smaller one between hw queue_depth and 128,
	 * since we don't split into sync/async like the old code did.
	 * Additionally, this is a per-hw queue depth.
	 */
	q->nr_requests = 2 * min_t(unsigned int, q->tag_set->queue_depth,
				   BLKDEV_DEFAULT_RQ);

	eq = elevator_alloc(q, e);
	eq = elevator_alloc(q, e, et);
	if (!eq)
		return -ENOMEM;

	q->nr_requests = et->nr_requests;

	if (blk_mq_is_shared_tags(flags)) {
		ret = blk_mq_init_sched_shared_tags(q);
		if (ret)
			goto err_put_elevator;
		/* Shared tags are stored at index 0 in @et->tags. */
		q->sched_shared_tags = et->tags[0];
		blk_mq_tag_update_sched_shared_tags(q);
	}

	queue_for_each_hw_ctx(q, hctx, i) {
		ret = blk_mq_sched_alloc_map_and_rqs(q, hctx, i);
		if (ret)
			goto err_free_map_and_rqs;
		if (blk_mq_is_shared_tags(flags))
			hctx->sched_tags = q->sched_shared_tags;
		else
			hctx->sched_tags = et->tags[i];
	}

	ret = e->ops.init_sched(q, eq);
	if (ret)
		goto err_free_map_and_rqs;
		goto out;

	queue_for_each_hw_ctx(q, hctx, i) {
		if (e->ops.init_hctx) {
			ret = e->ops.init_hctx(hctx, i);
			if (ret) {
				eq = q->elevator;
				blk_mq_sched_free_rqs(q);
				blk_mq_exit_sched(q, eq);
				kobject_put(&eq->kobj);
				return ret;
@@ -509,10 +522,8 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
	}
	return 0;

err_free_map_and_rqs:
	blk_mq_sched_free_rqs(q);
out:
	blk_mq_sched_tags_teardown(q, flags);
err_put_elevator:
	kobject_put(&eq->kobj);
	q->elevator = NULL;
	return ret;
+7 −1
Original line number Diff line number Diff line
@@ -18,10 +18,16 @@ void __blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx);

void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx);

int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e);
int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e,
		struct elevator_tags *et);
void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e);
void blk_mq_sched_free_rqs(struct request_queue *q);

struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set,
		unsigned int nr_hw_queues);
void blk_mq_free_sched_tags(struct elevator_tags *et,
		struct blk_mq_tag_set *set);

static inline void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
{
	if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
+34 −6
Original line number Diff line number Diff line
@@ -54,6 +54,8 @@ struct elv_change_ctx {
	struct elevator_queue *old;
	/* for registering new elevator */
	struct elevator_queue *new;
	/* holds sched tags data */
	struct elevator_tags *et;
};

static DEFINE_SPINLOCK(elv_list_lock);
@@ -132,7 +134,7 @@ static struct elevator_type *elevator_find_get(const char *name)
static const struct kobj_type elv_ktype;

struct elevator_queue *elevator_alloc(struct request_queue *q,
				  struct elevator_type *e)
		struct elevator_type *e, struct elevator_tags *et)
{
	struct elevator_queue *eq;

@@ -145,6 +147,7 @@ struct elevator_queue *elevator_alloc(struct request_queue *q,
	kobject_init(&eq->kobj, &elv_ktype);
	mutex_init(&eq->sysfs_lock);
	hash_init(eq->hash);
	eq->et = et;

	return eq;
}
@@ -165,7 +168,6 @@ static void elevator_exit(struct request_queue *q)
	lockdep_assert_held(&q->elevator_lock);

	ioc_clear_queue(q);
	blk_mq_sched_free_rqs(q);

	mutex_lock(&e->sysfs_lock);
	blk_mq_exit_sched(q, e);
@@ -591,7 +593,7 @@ static int elevator_switch(struct request_queue *q, struct elv_change_ctx *ctx)
	}

	if (new_e) {
		ret = blk_mq_init_sched(q, new_e);
		ret = blk_mq_init_sched(q, new_e, ctx->et);
		if (ret)
			goto out_unfreeze;
		ctx->new = q->elevator;
@@ -626,9 +628,11 @@ static void elv_exit_and_release(struct request_queue *q)
	elevator_exit(q);
	mutex_unlock(&q->elevator_lock);
	blk_mq_unfreeze_queue(q, memflags);
	if (e)
	if (e) {
		blk_mq_free_sched_tags(e->et, q->tag_set);
		kobject_put(&e->kobj);
	}
}

static int elevator_change_done(struct request_queue *q,
				struct elv_change_ctx *ctx)
@@ -640,6 +644,7 @@ static int elevator_change_done(struct request_queue *q,
				&ctx->old->flags);

		elv_unregister_queue(q, ctx->old);
		blk_mq_free_sched_tags(ctx->old->et, q->tag_set);
		kobject_put(&ctx->old->kobj);
		if (enable_wbt)
			wbt_enable_default(q->disk);
@@ -658,9 +663,16 @@ static int elevator_change_done(struct request_queue *q,
static int elevator_change(struct request_queue *q, struct elv_change_ctx *ctx)
{
	unsigned int memflags;
	struct blk_mq_tag_set *set = q->tag_set;
	int ret = 0;

	lockdep_assert_held(&q->tag_set->update_nr_hwq_lock);
	lockdep_assert_held(&set->update_nr_hwq_lock);

	if (strncmp(ctx->name, "none", 4)) {
		ctx->et = blk_mq_alloc_sched_tags(set, set->nr_hw_queues);
		if (!ctx->et)
			return -ENOMEM;
	}

	memflags = blk_mq_freeze_queue(q);
	/*
@@ -680,6 +692,11 @@ static int elevator_change(struct request_queue *q, struct elv_change_ctx *ctx)
	blk_mq_unfreeze_queue(q, memflags);
	if (!ret)
		ret = elevator_change_done(q, ctx);
	/*
	 * Free sched tags if it's allocated but we couldn't switch elevator.
	 */
	if (ctx->et && !ctx->new)
		blk_mq_free_sched_tags(ctx->et, set);

	return ret;
}
@@ -690,6 +707,7 @@ static int elevator_change(struct request_queue *q, struct elv_change_ctx *ctx)
 */
void elv_update_nr_hw_queues(struct request_queue *q, struct elevator_type *e)
{
	struct blk_mq_tag_set *set = q->tag_set;
	struct elv_change_ctx ctx = {};
	int ret = -ENODEV;

@@ -697,15 +715,25 @@ void elv_update_nr_hw_queues(struct request_queue *q, struct elevator_type *e)

	if (e && !blk_queue_dying(q) && blk_queue_registered(q)) {
		ctx.name = e->elevator_name;

		ctx.et = blk_mq_alloc_sched_tags(set, set->nr_hw_queues);
		if (!ctx.et) {
			WARN_ON_ONCE(1);
			goto unfreeze;
		}
		mutex_lock(&q->elevator_lock);
		/* force to reattach elevator after nr_hw_queue is updated */
		ret = elevator_switch(q, &ctx);
		mutex_unlock(&q->elevator_lock);
	}
unfreeze:
	blk_mq_unfreeze_queue_nomemrestore(q);
	if (!ret)
		WARN_ON_ONCE(elevator_change_done(q, &ctx));
	/*
	 * Free sched tags if it's allocated but we couldn't switch elevator.
	 */
	if (ctx.et && !ctx.new)
		blk_mq_free_sched_tags(ctx.et, set);
}

/*
+12 −2
Original line number Diff line number Diff line
@@ -23,6 +23,15 @@ enum elv_merge {
struct blk_mq_alloc_data;
struct blk_mq_hw_ctx;

/*
 * Scheduler tag maps allocated up front by blk_mq_alloc_sched_tags() and
 * attached to an elevator_queue through its ->et member.
 */
struct elevator_tags {
	/* num. of hardware queues for which tags are allocated */
	unsigned int nr_hw_queues;
	/*
	 * depth used while allocating the per-hw-queue tags (a shared tag
	 * map is allocated at MAX_SCHED_RQ instead); blk_mq_init_sched()
	 * also copies this value into q->nr_requests
	 */
	unsigned int nr_requests;
	/*
	 * one entry per hardware queue; with shared tags only index 0 is
	 * used and holds the shared tag map
	 */
	struct blk_mq_tags *tags[];
};

struct elevator_mq_ops {
	int (*init_sched)(struct request_queue *, struct elevator_queue *);
	void (*exit_sched)(struct elevator_queue *);
@@ -113,6 +122,7 @@ struct request *elv_rqhash_find(struct request_queue *q, sector_t offset);
struct elevator_queue
{
	struct elevator_type *type;
	struct elevator_tags *et;
	void *elevator_data;
	struct kobject kobj;
	struct mutex sysfs_lock;
@@ -152,8 +162,8 @@ ssize_t elv_iosched_show(struct gendisk *disk, char *page);
ssize_t elv_iosched_store(struct gendisk *disk, const char *page, size_t count);

extern bool elv_bio_merge_ok(struct request *, struct bio *);
extern struct elevator_queue *elevator_alloc(struct request_queue *,
					struct elevator_type *);
struct elevator_queue *elevator_alloc(struct request_queue *,
		struct elevator_type *, struct elevator_tags *);

/*
 * Helper functions.