Merge tag 'block-6.17-20250808' of git://git.kernel.dk/linux (2988dfed) · Commits · git / linux-net

block/bfq-iosched.c

+21 −45

Original line number	Diff line number	Diff line
		@@ -454,17 +454,10 @@ static struct bfq_io_cq *icq_to_bic(struct io_cq *icq)
		*/
		static struct bfq_io_cq *bfq_bic_lookup(struct request_queue *q)
		{
		struct bfq_io_cq *icq;
		unsigned long flags;

		if (!current->io_context)
		return NULL;

		spin_lock_irqsave(&q->queue_lock, flags);
		icq = icq_to_bic(ioc_lookup_icq(q));
		spin_unlock_irqrestore(&q->queue_lock, flags);

		return icq;
		return icq_to_bic(ioc_lookup_icq(q));
		}

		/*
		@@ -701,17 +694,13 @@ static void bfq_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data)
		{
		struct bfq_data *bfqd = data->q->elevator->elevator_data;
		struct bfq_io_cq *bic = bfq_bic_lookup(data->q);
		int depth;
		unsigned limit = data->q->nr_requests;
		unsigned int act_idx;
		unsigned int limit, act_idx;

		/* Sync reads have full depth available */
		if (op_is_sync(opf) && !op_is_write(opf)) {
		depth = 0;
		} else {
		depth = bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(opf)];
		limit = (limit * depth) >> bfqd->full_depth_shift;
		}
		if (op_is_sync(opf) && !op_is_write(opf))
		limit = data->q->nr_requests;
		else
		limit = bfqd->async_depths[!!bfqd->wr_busy_queues][op_is_sync(opf)];

		for (act_idx = 0; bic && act_idx < bfqd->num_actuators; act_idx++) {
		/* Fast path to check if bfqq is already allocated. */
		@@ -725,14 +714,16 @@ static void bfq_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data)
		* available requests and thus starve other entities.
		*/
		if (bfqq_request_over_limit(bfqd, bic, opf, act_idx, limit)) {
		depth = 1;
		limit = 1;
		break;
		}
		}

		bfq_log(bfqd, "[%s] wr_busy %d sync %d depth %u",
		__func__, bfqd->wr_busy_queues, op_is_sync(opf), depth);
		if (depth)
		data->shallow_depth = depth;
		__func__, bfqd->wr_busy_queues, op_is_sync(opf), limit);

		if (limit < data->q->nr_requests)
		data->shallow_depth = limit;
		}

		static struct bfq_queue *
		@@ -2457,15 +2448,8 @@ static bool bfq_bio_merge(struct request_queue *q, struct bio *bio,
		unsigned int nr_segs)
		{
		struct bfq_data *bfqd = q->elevator->elevator_data;
		struct request *free = NULL;
		/*
		* bfq_bic_lookup grabs the queue_lock: invoke it now and
		* store its return value for later use, to avoid nesting
		* queue_lock inside the bfqd->lock. We assume that the bic
		* returned by bfq_bic_lookup does not go away before
		* bfqd->lock is taken.
		*/
		struct bfq_io_cq *bic = bfq_bic_lookup(q);
		struct request *free = NULL;
		bool ret;

		spin_lock_irq(&bfqd->lock);
		@@ -7128,9 +7112,8 @@ void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
		*/
		static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt)
		{
		unsigned int depth = 1U << bt->sb.shift;
		unsigned int nr_requests = bfqd->queue->nr_requests;

		bfqd->full_depth_shift = bt->sb.shift;
		/*
		* In-word depths if no bfq_queue is being weight-raised:
		* leaving 25% of tags only for sync reads.
		@@ -7142,13 +7125,13 @@ static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt)
		* limit 'something'.
		*/
		/* no more than 50% of tags for async I/O */
		bfqd->word_depths[0][0] = max(depth >> 1, 1U);
		bfqd->async_depths[0][0] = max(nr_requests >> 1, 1U);
		/*
		* no more than 75% of tags for sync writes (25% extra tags
		* w.r.t. async I/O, to prevent async I/O from starving sync
		* writes)
		*/
		bfqd->word_depths[0][1] = max((depth * 3) >> 2, 1U);
		bfqd->async_depths[0][1] = max((nr_requests * 3) >> 2, 1U);

		/*
		* In-word depths in case some bfq_queue is being weight-
		@@ -7158,9 +7141,9 @@ static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt)
		* shortage.
		*/
		/* no more than ~18% of tags for async I/O */
		bfqd->word_depths[1][0] = max((depth * 3) >> 4, 1U);
		bfqd->async_depths[1][0] = max((nr_requests * 3) >> 4, 1U);
		/* no more than ~37% of tags for sync writes (~20% extra tags) */
		bfqd->word_depths[1][1] = max((depth * 6) >> 4, 1U);
		bfqd->async_depths[1][1] = max((nr_requests * 6) >> 4, 1U);
		}

		static void bfq_depth_updated(struct blk_mq_hw_ctx *hctx)
		@@ -7232,22 +7215,16 @@ static void bfq_init_root_group(struct bfq_group *root_group,
		root_group->sched_data.bfq_class_idle_last_service = jiffies;
		}

		static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
		static int bfq_init_queue(struct request_queue *q, struct elevator_queue *eq)
		{
		struct bfq_data *bfqd;
		struct elevator_queue *eq;
		unsigned int i;
		struct blk_independent_access_ranges *ia_ranges = q->disk->ia_ranges;

		eq = elevator_alloc(q, e);
		if (!eq)
		return -ENOMEM;

		bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node);
		if (!bfqd) {
		kobject_put(&eq->kobj);
		if (!bfqd)
		return -ENOMEM;
		}

		eq->elevator_data = bfqd;

		spin_lock_irq(&q->queue_lock);
		@@ -7405,7 +7382,6 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)

		out_free:
		kfree(bfqd);
		kobject_put(&eq->kobj);
		return -ENOMEM;
		}

block/bfq-iosched.h

+6 −7

Original line number	Diff line number	Diff line
		@@ -427,9 +427,6 @@ struct bfq_iocq_bfqq_data {
		*/
		bool saved_IO_bound;

		u64 saved_io_start_time;
		u64 saved_tot_idle_time;

		/*
		* Same purpose as the previous fields for the values of the
		* field keeping the queue's belonging to a large burst
		@@ -450,6 +447,9 @@ struct bfq_iocq_bfqq_data {
		*/
		unsigned int saved_weight;

		u64 saved_io_start_time;
		u64 saved_tot_idle_time;

		/*
		* Similar to previous fields: save wr information.
		*/
		@@ -457,13 +457,13 @@ struct bfq_iocq_bfqq_data {
		unsigned long saved_last_wr_start_finish;
		unsigned long saved_service_from_wr;
		unsigned long saved_wr_start_at_switch_to_srt;
		unsigned int saved_wr_cur_max_time;
		struct bfq_ttime saved_ttime;
		unsigned int saved_wr_cur_max_time;

		/* Save also injection state */
		u64 saved_last_serv_time_ns;
		unsigned int saved_inject_limit;
		unsigned long saved_decrease_time_jif;
		u64 saved_last_serv_time_ns;

		/* candidate queue for a stable merge (due to close creation time) */
		struct bfq_queue *stable_merge_bfqq;
		@@ -813,8 +813,7 @@ struct bfq_data {
		* Depth limits used in bfq_limit_depth (see comments on the
		* function)
		*/
		unsigned int word_depths[2][2];
		unsigned int full_depth_shift;
		unsigned int async_depths[2][2];

		/*
		* Number of independent actuators. This is equal to 1 in

block/blk-ioc.c

+6 −10

Original line number	Diff line number	Diff line
		@@ -308,24 +308,23 @@ int __copy_io(unsigned long clone_flags, struct task_struct *tsk)

		#ifdef CONFIG_BLK_ICQ
		/**
		* ioc_lookup_icq - lookup io_cq from ioc
		* ioc_lookup_icq - lookup io_cq from ioc in io issue path
		* @q: the associated request_queue
		*
		* Look up io_cq associated with @ioc - @q pair from @ioc. Must be called
		* with @q->queue_lock held.
		* from io issue path, either return NULL if current issue io to @q for the
		* first time, or return a valid icq.
		*/
		struct io_cq *ioc_lookup_icq(struct request_queue *q)
		{
		struct io_context *ioc = current->io_context;
		struct io_cq *icq;

		lockdep_assert_held(&q->queue_lock);

		/*
		* icq's are indexed from @ioc using radix tree and hint pointer,
		* both of which are protected with RCU. All removals are done
		* holding both q and ioc locks, and we're holding q lock - if we
		* find a icq which points to us, it's guaranteed to be valid.
		* both of which are protected with RCU, io issue path ensures that
		* both request_queue and current task are valid, the found icq
		* is guaranteed to be valid until the io is done.
		*/
		rcu_read_lock();
		icq = rcu_dereference(ioc->icq_hint);
		@@ -419,10 +418,7 @@ struct io_cq *ioc_find_get_icq(struct request_queue *q)
		task_unlock(current);
		} else {
		get_io_context(ioc);

		spin_lock_irq(&q->queue_lock);
		icq = ioc_lookup_icq(q);
		spin_unlock_irq(&q->queue_lock);
		}

		if (!icq) {

block/blk-mq-sched.c

+152 −71

Original line number	Diff line number	Diff line
		@@ -374,64 +374,17 @@ bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq,
		}
		EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge);

		static int blk_mq_sched_alloc_map_and_rqs(struct request_queue *q,
		struct blk_mq_hw_ctx *hctx,
		unsigned int hctx_idx)
		{
		if (blk_mq_is_shared_tags(q->tag_set->flags)) {
		hctx->sched_tags = q->sched_shared_tags;
		return 0;
		}

		hctx->sched_tags = blk_mq_alloc_map_and_rqs(q->tag_set, hctx_idx,
		q->nr_requests);

		if (!hctx->sched_tags)
		return -ENOMEM;
		return 0;
		}

		static void blk_mq_exit_sched_shared_tags(struct request_queue *queue)
		{
		blk_mq_free_rq_map(queue->sched_shared_tags);
		queue->sched_shared_tags = NULL;
		}

		/* called in queue's release handler, tagset has gone away */
		static void blk_mq_sched_tags_teardown(struct request_queue *q, unsigned int flags)
		{
		struct blk_mq_hw_ctx *hctx;
		unsigned long i;

		queue_for_each_hw_ctx(q, hctx, i) {
		if (hctx->sched_tags) {
		if (!blk_mq_is_shared_tags(flags))
		blk_mq_free_rq_map(hctx->sched_tags);
		queue_for_each_hw_ctx(q, hctx, i)
		hctx->sched_tags = NULL;
		}
		}

		if (blk_mq_is_shared_tags(flags))
		blk_mq_exit_sched_shared_tags(q);
		}

		static int blk_mq_init_sched_shared_tags(struct request_queue *queue)
		{
		struct blk_mq_tag_set *set = queue->tag_set;

		/*
		* Set initial depth at max so that we don't need to reallocate for
		* updating nr_requests.
		*/
		queue->sched_shared_tags = blk_mq_alloc_map_and_rqs(set,
		BLK_MQ_NO_HCTX_IDX,
		MAX_SCHED_RQ);
		if (!queue->sched_shared_tags)
		return -ENOMEM;

		blk_mq_tag_update_sched_shared_tags(queue);

		return 0;
		q->sched_shared_tags = NULL;
		}

		void blk_mq_sched_reg_debugfs(struct request_queue *q)
		@@ -458,8 +411,140 @@ void blk_mq_sched_unreg_debugfs(struct request_queue *q)
		mutex_unlock(&q->debugfs_mutex);
		}

		/*
		 * Release every scheduler tag map referenced by @et, then free @et
		 * itself. With shared tags only slot 0 of @et->tags is used; otherwise
		 * one slot per hardware queue (@et->nr_hw_queues of them) is released.
		 */
		void blk_mq_free_sched_tags(struct elevator_tags *et,
				struct blk_mq_tag_set *set)
		{
			unsigned long hctx_idx;

			if (!blk_mq_is_shared_tags(set->flags)) {
				/* One tag map per hardware queue. */
				for (hctx_idx = 0; hctx_idx < et->nr_hw_queues; hctx_idx++)
					blk_mq_free_map_and_rqs(set, et->tags[hctx_idx],
							hctx_idx);
			} else {
				/* Shared tags are stored at index 0 in @tags. */
				blk_mq_free_map_and_rqs(set, et->tags[0],
						BLK_MQ_NO_HCTX_IDX);
			}

			kfree(et);
		}

		/*
		 * For every queue on @set's tag list that has an elevator attached,
		 * look up its elevator_tags in @et_table (keyed by q->id) and free them.
		 * The caller must hold set->update_nr_hwq_lock for writing.
		 */
		void blk_mq_free_sched_tags_batch(struct xarray *et_table,
				struct blk_mq_tag_set *set)
		{
			struct request_queue *q;

			lockdep_assert_held_write(&set->update_nr_hwq_lock);

			list_for_each_entry(q, &set->tag_list, tag_set_list) {
				struct elevator_tags *et;

				/*
				 * Reading q->elevator without q->elevator_lock is safe:
				 * set->update_nr_hwq_lock is held here as a writer, and the
				 * scheduler update/switch code takes that same lock as a
				 * reader, so it cannot run concurrently.
				 */
				if (!q->elevator)
					continue;

				et = xa_load(et_table, q->id);
				/* A queue with an elevator but no table entry is unexpected. */
				if (WARN_ON_ONCE(!et))
					continue;

				blk_mq_free_sched_tags(et, set);
			}
		}

		struct elevator_tags blk_mq_alloc_sched_tags(struct blk_mq_tag_set set,
		unsigned int nr_hw_queues)
		{
		unsigned int nr_tags;
		int i;
		struct elevator_tags *et;
		gfp_t gfp = GFP_NOIO \| __GFP_ZERO \| __GFP_NOWARN \| __GFP_NORETRY;

		if (blk_mq_is_shared_tags(set->flags))
		nr_tags = 1;
		else
		nr_tags = nr_hw_queues;

		et = kmalloc(sizeof(struct elevator_tags) +
		nr_tags * sizeof(struct blk_mq_tags *), gfp);
		if (!et)
		return NULL;
		/*
		* Default to double of smaller one between hw queue_depth and
		* 128, since we don't split into sync/async like the old code
		* did. Additionally, this is a per-hw queue depth.
		*/
		et->nr_requests = 2 * min_t(unsigned int, set->queue_depth,
		BLKDEV_DEFAULT_RQ);
		et->nr_hw_queues = nr_hw_queues;

		if (blk_mq_is_shared_tags(set->flags)) {
		/* Shared tags are stored at index 0 in @tags. */
		et->tags[0] = blk_mq_alloc_map_and_rqs(set, BLK_MQ_NO_HCTX_IDX,
		MAX_SCHED_RQ);
		if (!et->tags[0])
		goto out;
		} else {
		for (i = 0; i < et->nr_hw_queues; i++) {
		et->tags[i] = blk_mq_alloc_map_and_rqs(set, i,
		et->nr_requests);
		if (!et->tags[i])
		goto out_unwind;
		}
		}

		return et;
		out_unwind:
		while (--i >= 0)
		blk_mq_free_map_and_rqs(set, et->tags[i], i);
		out:
		kfree(et);
		return NULL;
		}

		/*
		 * For every queue on @set's tag list that has an elevator attached,
		 * allocate scheduler tags for @nr_hw_queues hardware queues and store
		 * them in @et_table keyed by q->id. The caller must hold
		 * set->update_nr_hwq_lock for writing.
		 *
		 * Returns 0 on success. On any failure the tags allocated for earlier
		 * queues are freed and -ENOMEM is returned.
		 * NOTE(review): the unwind path frees the tags but does not remove
		 * their entries from @et_table; presumably the caller discards the
		 * table on error - confirm against the caller.
		 */
		int blk_mq_alloc_sched_tags_batch(struct xarray *et_table,
				struct blk_mq_tag_set *set, unsigned int nr_hw_queues)
		{
			struct request_queue *q;
			struct elevator_tags *et;
			gfp_t gfp = GFP_NOIO | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;

			lockdep_assert_held_write(&set->update_nr_hwq_lock);

			list_for_each_entry(q, &set->tag_list, tag_set_list) {
				/*
				 * Accessing q->elevator without holding q->elevator_lock is
				 * safe because we're holding here set->update_nr_hwq_lock in
				 * the writer context. So, scheduler update/switch code (which
				 * acquires the same lock but in the reader context) can't run
				 * concurrently.
				 */
				if (q->elevator) {
					et = blk_mq_alloc_sched_tags(set, nr_hw_queues);
					if (!et)
						goto out_unwind;
					if (xa_insert(et_table, q->id, et, gfp))
						goto out_free_tags;
				}
			}
			return 0;
		out_free_tags:
			/* The tags for the current queue never made it into the table. */
			blk_mq_free_sched_tags(et, set);
		out_unwind:
			/* Walk back over the queues handled before the failing one. */
			list_for_each_entry_continue_reverse(q, &set->tag_list, tag_set_list) {
				if (q->elevator) {
					et = xa_load(et_table, q->id);
					if (et)
						blk_mq_free_sched_tags(et, set);
				}
			}
			return -ENOMEM;
		}

		/* caller must have a reference to @e, will grab another one if successful */
		int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
		int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e,
		struct elevator_tags *et)
		{
		unsigned int flags = q->tag_set->flags;
		struct blk_mq_hw_ctx *hctx;
		@@ -467,36 +552,33 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
		unsigned long i;
		int ret;

		/*
		* Default to double of smaller one between hw queue_depth and 128,
		* since we don't split into sync/async like the old code did.
		* Additionally, this is a per-hw queue depth.
		*/
		q->nr_requests = 2 * min_t(unsigned int, q->tag_set->queue_depth,
		BLKDEV_DEFAULT_RQ);
		eq = elevator_alloc(q, e, et);
		if (!eq)
		return -ENOMEM;

		q->nr_requests = et->nr_requests;

		if (blk_mq_is_shared_tags(flags)) {
		ret = blk_mq_init_sched_shared_tags(q);
		if (ret)
		return ret;
		/* Shared tags are stored at index 0 in @et->tags. */
		q->sched_shared_tags = et->tags[0];
		blk_mq_tag_update_sched_shared_tags(q);
		}

		queue_for_each_hw_ctx(q, hctx, i) {
		ret = blk_mq_sched_alloc_map_and_rqs(q, hctx, i);
		if (ret)
		goto err_free_map_and_rqs;
		if (blk_mq_is_shared_tags(flags))
		hctx->sched_tags = q->sched_shared_tags;
		else
		hctx->sched_tags = et->tags[i];
		}

		ret = e->ops.init_sched(q, e);
		ret = e->ops.init_sched(q, eq);
		if (ret)
		goto err_free_map_and_rqs;
		goto out;

		queue_for_each_hw_ctx(q, hctx, i) {
		if (e->ops.init_hctx) {
		ret = e->ops.init_hctx(hctx, i);
		if (ret) {
		eq = q->elevator;
		blk_mq_sched_free_rqs(q);
		blk_mq_exit_sched(q, eq);
		kobject_put(&eq->kobj);
		return ret;
		@@ -505,10 +587,9 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
		}
		return 0;

		err_free_map_and_rqs:
		blk_mq_sched_free_rqs(q);
		out:
		blk_mq_sched_tags_teardown(q, flags);

		kobject_put(&eq->kobj);
		q->elevator = NULL;
		return ret;
		}

block/blk-mq-sched.h

+11 −1

Original line number	Diff line number	Diff line
		@@ -18,10 +18,20 @@ void __blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx);

		void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx);

		int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e);
		int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e,
		struct elevator_tags *et);
		void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e);
		void blk_mq_sched_free_rqs(struct request_queue *q);

		struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set,
		unsigned int nr_hw_queues);
		int blk_mq_alloc_sched_tags_batch(struct xarray *et_table,
		struct blk_mq_tag_set *set, unsigned int nr_hw_queues);
		void blk_mq_free_sched_tags(struct elevator_tags *et,
		struct blk_mq_tag_set *set);
		void blk_mq_free_sched_tags_batch(struct xarray *et_table,
		struct blk_mq_tag_set *set);

		static inline void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
		{
		if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))