Commit 1e44bedb authored by Ming Lei's avatar Ming Lei Committed by Jens Axboe
Browse files

block: unifying elevator change



Elevator change is one well-defined behavior:

- tear down current elevator if it exists

- setup new elevator

It is supposed to cover any case for changing elevator by a single
internal API, typically the following cases:

- setup default elevator in add_disk()

- switch to none in del_disk()

- reset elevator in blk_mq_update_nr_hw_queues()

- switch elevator in sysfs `store` elevator attribute

This patch uses elevator_change() to cover all above cases:

- every elevator switch is serialized with each other: add_disk/del_disk/
store elevator is serialized already, blk_mq_update_nr_hw_queues() uses
srcu for syncing with the other three cases

- for both add_disk()/del_disk(), queue freeze works in atomic mode
or the queue has already been frozen, so the freeze in elevator_change()
won't add extra delay

- `struct elv_change_ctx` instance holds any info for changing elevator

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Nilay Shroff <nilay@linux.ibm.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Link: https://lore.kernel.org/r/20250505141805.2751237-17-ming.lei@redhat.com


Signed-off-by: Jens Axboe <axboe@kernel.dk>
parent 1e9db5c4
Loading
Loading
Loading
Loading
+7 −12
Original line number Diff line number Diff line
@@ -869,14 +869,9 @@ int blk_register_queue(struct gendisk *disk)
	if (ret)
		goto out_unregister_ia_ranges;

	if (queue_is_mq(q))
		elevator_set_default(q);
	mutex_lock(&q->elevator_lock);
	if (q->elevator) {
		ret = elv_register_queue(q, false);
		if (ret) {
			mutex_unlock(&q->elevator_lock);
			goto out_crypto_sysfs_unregister;
		}
	}
	wbt_enable_default(disk);
	mutex_unlock(&q->elevator_lock);

@@ -902,8 +897,6 @@ int blk_register_queue(struct gendisk *disk)

	return ret;

out_crypto_sysfs_unregister:
	blk_crypto_sysfs_unregister(disk);
out_unregister_ia_ranges:
	disk_unregister_independent_access_ranges(disk);
out_debugfs_remove:
@@ -951,9 +944,11 @@ void blk_unregister_queue(struct gendisk *disk)
		blk_mq_sysfs_unregister(disk);
	blk_crypto_sysfs_unregister(disk);

	mutex_lock(&q->elevator_lock);
	elv_unregister_queue(q);
	mutex_unlock(&q->elevator_lock);
	if (queue_is_mq(q)) {
		blk_mq_quiesce_queue(q);
		elevator_set_none(q);
		blk_mq_unquiesce_queue(q);
	}

	mutex_lock(&q->sysfs_lock);
	disk_unregister_independent_access_ranges(disk);
+2 −3
Original line number Diff line number Diff line
@@ -323,9 +323,8 @@ bool blk_bio_list_merge(struct request_queue *q, struct list_head *list,
bool blk_insert_flush(struct request *rq);

void elv_update_nr_hw_queues(struct request_queue *q);
void elevator_exit(struct request_queue *q);
int elv_register_queue(struct request_queue *q, bool uevent);
void elv_unregister_queue(struct request_queue *q);
void elevator_set_default(struct request_queue *q);
void elevator_set_none(struct request_queue *q);

ssize_t part_size_show(struct device *dev, struct device_attribute *attr,
		char *buf);
+53 −63
Original line number Diff line number Diff line
@@ -154,7 +154,7 @@ static void elevator_release(struct kobject *kobj)
	kfree(e);
}

void elevator_exit(struct request_queue *q)
static void elevator_exit(struct request_queue *q)
{
	struct elevator_queue *e = q->elevator;

@@ -458,7 +458,7 @@ static const struct kobj_type elv_ktype = {
	.release	= elevator_release,
};

int elv_register_queue(struct request_queue *q, bool uevent)
static int elv_register_queue(struct request_queue *q, bool uevent)
{
	struct elevator_queue *e = q->elevator;
	int error;
@@ -488,7 +488,7 @@ int elv_register_queue(struct request_queue *q, bool uevent)
	return error;
}

void elv_unregister_queue(struct request_queue *q)
static void elv_unregister_queue(struct request_queue *q)
{
	struct elevator_queue *e = q->elevator;

@@ -561,66 +561,6 @@ void elv_unregister(struct elevator_type *e)
}
EXPORT_SYMBOL_GPL(elv_unregister);

/*
 * Pick the default elevator type for @q.
 *
 * Returns NULL (meaning "none") when the tag set carries
 * BLK_MQ_F_NO_SCHED_BY_DEFAULT, or when the queue has more than one hardware
 * queue and its tags are not shared. Otherwise returns whatever
 * elevator_find_get("mq-deadline") yields, which may itself be NULL.
 */
static struct elevator_type *elevator_get_default(struct request_queue *q)
{
	if (q->tag_set->flags & BLK_MQ_F_NO_SCHED_BY_DEFAULT)
		return NULL;

	/* Single hw queue, or shared tags: mq-deadline is the default. */
	if (q->nr_hw_queues == 1 || blk_mq_is_shared_tags(q->tag_set->flags))
		return elevator_find_get("mq-deadline");

	return NULL;
}

/*
 * Use the default elevator settings. If the chosen elevator initialization
 * fails, fall back to the "none" elevator (no elevator).
 */
void elevator_init_mq(struct request_queue *q)
{
	struct elevator_type *e;
	unsigned int memflags;
	int err;

	WARN_ON_ONCE(blk_queue_registered(q));

	if (unlikely(q->elevator))
		return;

	e = elevator_get_default(q);
	if (!e)
		return;

	/*
	 * We are called before adding disk, when there isn't any FS I/O,
	 * so freezing queue plus canceling dispatch work is enough to
	 * drain any dispatch activities originated from passthrough
	 * requests, then no need to quiesce queue which may add long boot
	 * latency, especially when lots of disks are involved.
	 *
	 * Disk isn't added yet, so verifying queue lock only manually.
	 */
	memflags = blk_mq_freeze_queue(q);

	blk_mq_cancel_work_sync(q);

	err = blk_mq_init_sched(q, e);

	blk_mq_unfreeze_queue(q, memflags);

	if (err) {
		pr_warn("\"%s\" elevator initialization failed, "
			"falling back to \"none\"\n", e->elevator_name);
	}

	elevator_put(e);
}

/*
 * Switch to new_e io scheduler.
 *
@@ -688,6 +628,16 @@ static int elevator_change(struct request_queue *q, struct elv_change_ctx *ctx)
	lockdep_assert_held(&q->tag_set->update_nr_hwq_lock);

	memflags = blk_mq_freeze_queue(q);
	/*
	 * May be called before adding disk, when there isn't any FS I/O,
	 * so freezing queue plus canceling dispatch work is enough to
	 * drain any dispatch activities originated from passthrough
	 * requests, then no need to quiesce queue which may add long boot
	 * latency, especially when lots of disks are involved.
	 *
	 * Disk isn't added yet, so verifying queue lock only manually.
	 */
	blk_mq_cancel_work_sync(q);
	mutex_lock(&q->elevator_lock);
	if (!(q->elevator && elevator_match(q->elevator->type, ctx->name)))
		ret = elevator_switch(q, ctx);
@@ -716,6 +666,46 @@ void elv_update_nr_hw_queues(struct request_queue *q)
	mutex_unlock(&q->elevator_lock);
}

/*
 * Use the default elevator settings. If the chosen elevator initialization
 * fails, fall back to the "none" elevator (no elevator).
 *
 * Nothing is changed when the tag set carries BLK_MQ_F_NO_SCHED_BY_DEFAULT
 * or when the "mq-deadline" elevator type cannot be found.
 */
void elevator_set_default(struct request_queue *q)
{
	struct elv_change_ctx ctx = {
		.name = "mq-deadline",
		.no_uevent = true,
	};
	struct elevator_type *e;
	int err = 0;

	if (q->tag_set->flags & BLK_MQ_F_NO_SCHED_BY_DEFAULT)
		return;

	/*
	 * For single queue devices, default to using mq-deadline. If we
	 * have multiple queues or mq-deadline is not available, default
	 * to "none".
	 *
	 * The lookup is only an availability probe, but it hands back a
	 * referenced elevator type; keep the pointer so the reference can
	 * be released below instead of being leaked.
	 */
	e = elevator_find_get(ctx.name);
	if (!e)
		return;

	if (q->nr_hw_queues == 1 || blk_mq_is_shared_tags(q->tag_set->flags))
		err = elevator_change(q, &ctx);
	if (err < 0)
		pr_warn("\"%s\" elevator initialization, failed %d, "
			"falling back to \"none\"\n", ctx.name, err);

	elevator_put(e);
}

void elevator_set_none(struct request_queue *q)
{
	struct elv_change_ctx ctx = {
		.name	= "none",
	};
	int err;

	err = elevator_change(q, &ctx);
	if (err < 0)
		pr_warn("%s: set none elevator failed %d\n", __func__, err);
}

static void elv_iosched_load_module(const char *elevator_name)
{
	struct elevator_type *found;
+5 −23
Original line number Diff line number Diff line
@@ -432,12 +432,6 @@ static int __add_disk(struct device *parent, struct gendisk *disk,
		 */
		if (disk->fops->submit_bio || disk->fops->poll_bio)
			return -EINVAL;

		/*
		 * Initialize the I/O scheduler code and pick a default one if
		 * needed.
		 */
		elevator_init_mq(disk->queue);
	} else {
		if (!disk->fops->submit_bio)
			return -EINVAL;
@@ -454,7 +448,7 @@ static int __add_disk(struct device *parent, struct gendisk *disk,
	ret = -EINVAL;
	if (disk->major) {
		if (WARN_ON(!disk->minors))
			goto out_exit_elevator;
			goto out;

		if (disk->minors > DISK_MAX_PARTS) {
			pr_err("block: can't allocate more than %d partitions\n",
@@ -464,14 +458,14 @@ static int __add_disk(struct device *parent, struct gendisk *disk,
		if (disk->first_minor > MINORMASK ||
		    disk->minors > MINORMASK + 1 ||
		    disk->first_minor + disk->minors > MINORMASK + 1)
			goto out_exit_elevator;
			goto out;
	} else {
		if (WARN_ON(disk->minors))
			goto out_exit_elevator;
			goto out;

		ret = blk_alloc_ext_minor();
		if (ret < 0)
			goto out_exit_elevator;
			goto out;
		disk->major = BLOCK_EXT_MAJOR;
		disk->first_minor = ret;
	}
@@ -561,12 +555,7 @@ static int __add_disk(struct device *parent, struct gendisk *disk,
out_free_ext_minor:
	if (disk->major == BLOCK_EXT_MAJOR)
		blk_free_ext_minor(disk->first_minor);
out_exit_elevator:
	if (disk->queue->elevator) {
		mutex_lock(&disk->queue->elevator_lock);
		elevator_exit(disk->queue);
		mutex_unlock(&disk->queue->elevator_lock);
	}
out:
	return ret;
}

@@ -760,14 +749,7 @@ static void __del_gendisk(struct gendisk *disk)
	if (queue_is_mq(q))
		blk_mq_cancel_work_sync(q);

	blk_mq_quiesce_queue(q);
	if (q->elevator) {
		mutex_lock(&q->elevator_lock);
		elevator_exit(q);
		mutex_unlock(&q->elevator_lock);
	}
	rq_qos_exit(q);
	blk_mq_unquiesce_queue(q);

	/*
	 * If the disk does not own the queue, allow using passthrough requests