Commit 54bdd67d authored by Keith Busch's avatar Keith Busch Committed by Jens Axboe
Browse files

blk-mq: remove hybrid polling



io_uring provides the only way user space can poll completions, and that
always sets BLK_POLL_NOSLEEP. This effectively makes hybrid polling dead
code, so remove it and everything supporting it.

Hybrid polling was effectively killed off with 9650b453, "block:
ignore RWF_HIPRI hint for sync dio", but still potentially reachable
through io_uring until d729cf9a, "io_uring: don't sleep when
polling for I/O", but hybrid polling probably should not have been
reachable through that async interface from the beginning.

Fixes: 9650b453 ("block: ignore RWF_HIPRI hint for sync dio")
Fixes: d729cf9a ("io_uring: don't sleep when polling for I/O")
Signed-off-by: default avatarKeith Busch <kbusch@kernel.org>
Link: https://lore.kernel.org/r/20230320194926.3353144-1-kbusch@meta.com


Signed-off-by: default avatarJens Axboe <axboe@kernel.dk>
parent 4cf2c3ab
Loading
Loading
Loading
Loading
+4 −11
Original line number Diff line number Diff line
@@ -336,18 +336,11 @@ What: /sys/block/<disk>/queue/io_poll_delay
Date:		November 2016
Contact:	linux-block@vger.kernel.org
Description:
		[RW] If polling is enabled, this controls what kind of polling
		will be performed. It defaults to -1, which is classic polling.
		[RW] This was used to control what kind of polling will be
		performed.  It is now fixed to -1, which is classic polling.
		In this mode, the CPU will repeatedly ask for completions
		without giving up any time.  If set to 0, a hybrid polling mode
		is used, where the kernel will attempt to make an educated guess
		at when the IO will complete. Based on this guess, the kernel
		will put the process issuing IO to sleep for an amount of time,
		before entering a classic poll loop. This mode might be a little
		slower than pure classic polling, but it will be more efficient.
		If set to a value larger than 0, the kernel will put the process
		issuing IO to sleep for this amount of microseconds before
		entering classic polling.
		without giving up any time.
		<deprecated>


What:		/sys/block/<disk>/queue/io_timeout
+0 −6
Original line number Diff line number Diff line
@@ -263,13 +263,7 @@ static void blk_free_queue_rcu(struct rcu_head *rcu_head)

static void blk_free_queue(struct request_queue *q)
{
	if (q->poll_stat)
		blk_stat_remove_callback(q, q->poll_cb);
	blk_stat_free_callback(q->poll_cb);

	blk_free_queue_stats(q->stats);
	kfree(q->poll_stat);

	if (queue_is_mq(q))
		blk_mq_release(q);

+0 −26
Original line number Diff line number Diff line
@@ -15,33 +15,8 @@
#include "blk-mq-tag.h"
#include "blk-rq-qos.h"

static void print_stat(struct seq_file *m, struct blk_rq_stat *stat)
{
	if (stat->nr_samples) {
		seq_printf(m, "samples=%d, mean=%llu, min=%llu, max=%llu",
			   stat->nr_samples, stat->mean, stat->min, stat->max);
	} else {
		seq_puts(m, "samples=0");
	}
}

static int queue_poll_stat_show(void *data, struct seq_file *m)
{
	struct request_queue *q = data;
	int bucket;

	if (!q->poll_stat)
		return 0;

	for (bucket = 0; bucket < (BLK_MQ_POLL_STATS_BKTS / 2); bucket++) {
		seq_printf(m, "read  (%d Bytes): ", 1 << (9 + bucket));
		print_stat(m, &q->poll_stat[2 * bucket]);
		seq_puts(m, "\n");

		seq_printf(m, "write (%d Bytes): ",  1 << (9 + bucket));
		print_stat(m, &q->poll_stat[2 * bucket + 1]);
		seq_puts(m, "\n");
	}
	return 0;
}

@@ -282,7 +257,6 @@ static const char *const rqf_name[] = {
	RQF_NAME(STATS),
	RQF_NAME(SPECIAL_PAYLOAD),
	RQF_NAME(ZONE_WRITE_LOCKED),
	RQF_NAME(MQ_POLL_SLEPT),
	RQF_NAME(TIMED_OUT),
	RQF_NAME(ELV),
	RQF_NAME(RESV),
+6 −199
Original line number Diff line number Diff line
@@ -46,51 +46,15 @@

static DEFINE_PER_CPU(struct llist_head, blk_cpu_done);

static void blk_mq_poll_stats_start(struct request_queue *q);
static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);

static int blk_mq_poll_stats_bkt(const struct request *rq)
{
	int ddir, sectors, bucket;

	ddir = rq_data_dir(rq);
	sectors = blk_rq_stats_sectors(rq);

	bucket = ddir + 2 * ilog2(sectors);

	if (bucket < 0)
		return -1;
	else if (bucket >= BLK_MQ_POLL_STATS_BKTS)
		return ddir + BLK_MQ_POLL_STATS_BKTS - 2;

	return bucket;
}

#define BLK_QC_T_SHIFT		16
#define BLK_QC_T_INTERNAL	(1U << 31)

static inline struct blk_mq_hw_ctx *blk_qc_to_hctx(struct request_queue *q,
		blk_qc_t qc)
{
	return xa_load(&q->hctx_table,
			(qc & ~BLK_QC_T_INTERNAL) >> BLK_QC_T_SHIFT);
}

static inline struct request *blk_qc_to_rq(struct blk_mq_hw_ctx *hctx,
		blk_qc_t qc)
{
	unsigned int tag = qc & ((1U << BLK_QC_T_SHIFT) - 1);

	if (qc & BLK_QC_T_INTERNAL)
		return blk_mq_tag_to_rq(hctx->sched_tags, tag);
	return blk_mq_tag_to_rq(hctx->tags, tag);
	return xa_load(&q->hctx_table, qc);
}

static inline blk_qc_t blk_rq_to_qc(struct request *rq)
{
	return (rq->mq_hctx->queue_num << BLK_QC_T_SHIFT) |
		(rq->tag != -1 ?
		 rq->tag : (rq->internal_tag | BLK_QC_T_INTERNAL));
	return rq->mq_hctx->queue_num;
}

/*
@@ -1038,10 +1002,8 @@ static inline void blk_account_io_start(struct request *req)

static inline void __blk_mq_end_request_acct(struct request *rq, u64 now)
{
	if (rq->rq_flags & RQF_STATS) {
		blk_mq_poll_stats_start(rq->q);
	if (rq->rq_flags & RQF_STATS)
		blk_stat_add(rq, now);
	}

	blk_mq_sched_completed_request(rq, now);
	blk_account_io_done(rq, now);
@@ -4222,14 +4184,8 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
	/* mark the queue as mq asap */
	q->mq_ops = set->ops;

	q->poll_cb = blk_stat_alloc_callback(blk_mq_poll_stats_fn,
					     blk_mq_poll_stats_bkt,
					     BLK_MQ_POLL_STATS_BKTS, q);
	if (!q->poll_cb)
		goto err_exit;

	if (blk_mq_alloc_ctxs(q))
		goto err_poll;
		goto err_exit;

	/* init q->mq_kobj and sw queues' kobjects */
	blk_mq_sysfs_init(q);
@@ -4257,11 +4213,6 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,

	q->nr_requests = set->queue_depth;

	/*
	 * Default to classic polling
	 */
	q->poll_nsec = BLK_MQ_POLL_CLASSIC;

	blk_mq_init_cpu_queues(q, set->nr_hw_queues);
	blk_mq_add_queue_tag_set(set, q);
	blk_mq_map_swqueue(q);
@@ -4269,9 +4220,6 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,

err_hctxs:
	blk_mq_release(q);
err_poll:
	blk_stat_free_callback(q->poll_cb);
	q->poll_cb = NULL;
err_exit:
	q->mq_ops = NULL;
	return -ENOMEM;
@@ -4768,138 +4716,8 @@ void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
}
EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues);

/* Enable polling stats and return whether they were already enabled. */
static bool blk_poll_stats_enable(struct request_queue *q)
{
	if (q->poll_stat)
		return true;

	return blk_stats_alloc_enable(q);
}

static void blk_mq_poll_stats_start(struct request_queue *q)
{
	/*
	 * We don't arm the callback if polling stats are not enabled or the
	 * callback is already active.
	 */
	if (!q->poll_stat || blk_stat_is_active(q->poll_cb))
		return;

	blk_stat_activate_msecs(q->poll_cb, 100);
}

static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb)
{
	struct request_queue *q = cb->data;
	int bucket;

	for (bucket = 0; bucket < BLK_MQ_POLL_STATS_BKTS; bucket++) {
		if (cb->stat[bucket].nr_samples)
			q->poll_stat[bucket] = cb->stat[bucket];
	}
}

static unsigned long blk_mq_poll_nsecs(struct request_queue *q,
				       struct request *rq)
{
	unsigned long ret = 0;
	int bucket;

	/*
	 * If stats collection isn't on, don't sleep but turn it on for
	 * future users
	 */
	if (!blk_poll_stats_enable(q))
		return 0;

	/*
	 * As an optimistic guess, use half of the mean service time
	 * for this type of request. We can (and should) make this smarter.
	 * For instance, if the completion latencies are tight, we can
	 * get closer than just half the mean. This is especially
	 * important on devices where the completion latencies are longer
	 * than ~10 usec. We do use the stats for the relevant IO size
	 * if available which does lead to better estimates.
	 */
	bucket = blk_mq_poll_stats_bkt(rq);
	if (bucket < 0)
		return ret;

	if (q->poll_stat[bucket].nr_samples)
		ret = (q->poll_stat[bucket].mean + 1) / 2;

	return ret;
}

static bool blk_mq_poll_hybrid(struct request_queue *q, blk_qc_t qc)
{
	struct blk_mq_hw_ctx *hctx = blk_qc_to_hctx(q, qc);
	struct request *rq = blk_qc_to_rq(hctx, qc);
	struct hrtimer_sleeper hs;
	enum hrtimer_mode mode;
	unsigned int nsecs;
	ktime_t kt;

	/*
	 * If a request has completed on queue that uses an I/O scheduler, we
	 * won't get back a request from blk_qc_to_rq.
	 */
	if (!rq || (rq->rq_flags & RQF_MQ_POLL_SLEPT))
		return false;

	/*
	 * If we get here, hybrid polling is enabled. Hence poll_nsec can be:
	 *
	 *  0:	use half of prev avg
	 * >0:	use this specific value
	 */
	if (q->poll_nsec > 0)
		nsecs = q->poll_nsec;
	else
		nsecs = blk_mq_poll_nsecs(q, rq);

	if (!nsecs)
		return false;

	rq->rq_flags |= RQF_MQ_POLL_SLEPT;

	/*
	 * This will be replaced with the stats tracking code, using
	 * 'avg_completion_time / 2' as the pre-sleep target.
	 */
	kt = nsecs;

	mode = HRTIMER_MODE_REL;
	hrtimer_init_sleeper_on_stack(&hs, CLOCK_MONOTONIC, mode);
	hrtimer_set_expires(&hs.timer, kt);

	do {
		if (blk_mq_rq_state(rq) == MQ_RQ_COMPLETE)
			break;
		set_current_state(TASK_UNINTERRUPTIBLE);
		hrtimer_sleeper_start_expires(&hs, mode);
		if (hs.task)
			io_schedule();
		hrtimer_cancel(&hs.timer);
		mode = HRTIMER_MODE_ABS;
	} while (hs.task && !signal_pending(current));

	__set_current_state(TASK_RUNNING);
	destroy_hrtimer_on_stack(&hs.timer);

	/*
	 * If we sleep, have the caller restart the poll loop to reset the
	 * state.  Like for the other success return cases, the caller is
	 * responsible for checking if the IO completed.  If the IO isn't
	 * complete, we'll get called again and will go straight to the busy
	 * poll loop.
	 */
	return true;
}

static int blk_mq_poll_classic(struct request_queue *q, blk_qc_t cookie,
			       struct io_comp_batch *iob, unsigned int flags)
int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob,
		unsigned int flags)
{
	struct blk_mq_hw_ctx *hctx = blk_qc_to_hctx(q, cookie);
	long state = get_current_state();
@@ -4926,17 +4744,6 @@ static int blk_mq_poll_classic(struct request_queue *q, blk_qc_t cookie,
	return 0;
}

int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob,
		unsigned int flags)
{
	if (!(flags & BLK_POLL_NOSLEEP) &&
	    q->poll_nsec != BLK_MQ_POLL_CLASSIC) {
		if (blk_mq_poll_hybrid(q, cookie))
			return 1;
	}
	return blk_mq_poll_classic(q, cookie, iob, flags);
}

unsigned int blk_mq_rq_cpu(struct request *rq)
{
	return rq->mq_ctx->cpu;
+0 −18
Original line number Diff line number Diff line
@@ -231,21 +231,3 @@ void blk_free_queue_stats(struct blk_queue_stats *stats)

	kfree(stats);
}

bool blk_stats_alloc_enable(struct request_queue *q)
{
	struct blk_rq_stat *poll_stat;

	poll_stat = kcalloc(BLK_MQ_POLL_STATS_BKTS, sizeof(*poll_stat),
				GFP_ATOMIC);
	if (!poll_stat)
		return false;

	if (cmpxchg(&q->poll_stat, NULL, poll_stat) != NULL) {
		kfree(poll_stat);
		return true;
	}

	blk_stat_add_callback(q, q->poll_cb);
	return false;
}
Loading