Commit 2988dfed authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge tag 'block-6.17-20250808' of git://git.kernel.dk/linux

Pull more block updates from Jens Axboe:

 - MD pull request via Yu:
      - mddev null-ptr-dereference fix, by Erkun
      - md-cluster fail to remove the faulty disk regression fix, by
        Heming
      - minor cleanup, by Li Nan and Jinchao
      - mdadm lifetime regression fix reported by syzkaller, by Yu Kuai

 - NVMe pull request via Christoph:
      - add support for getting the FDP feature in fabrics passthru path
        (Nitesh Shetty)
      - add capability to connect to an administrative controller
        (Kamaljit Singh)
      - fix a leak on sgl setup error (Keith Busch)
      - initialize discovery subsys after debugfs is initialized
        (Mohamed Khalfella)
      - fix various comment typos (Bjorn Helgaas)
      - remove unneeded semicolons (Jiapeng Chong)

 - nvmet debugfs ordering issue fix

 - Fix UAF in the tag_set in zloop

 - Ensure sbitmap shallow depth covers entire set

 - Reduce lock roundtrips in io context lookup

 - Move scheduler tags alloc/free out of elevator and freeze lock, to
   fix some lockdep found issues

 - Improve robustness of queue limits checking

 - Fix a regression with IO priorities, if no io context exists

* tag 'block-6.17-20250808' of git://git.kernel.dk/linux: (26 commits)
  lib/sbitmap: make sbitmap_get_shallow() internal
  lib/sbitmap: convert shallow_depth from one word to the whole sbitmap
  nvmet: exit debugfs after discovery subsystem exits
  block, bfq: Reorder struct bfq_iocq_bfqq_data
  md: make rdev_addable usable for rcu mode
  md/raid1: remove struct pool_info and related code
  md/raid1: change r1conf->r1bio_pool to a pointer type
  block: ensure discard_granularity is zero when discard is not supported
  zloop: fix KASAN use-after-free of tag set
  block: Fix default IO priority if there is no IO context
  nvme: fix various comment typos
  nvme-auth: remove unneeded semicolon
  nvme-pci: fix leak on sgl setup error
  nvmet: initialize discovery subsys after debugfs is initialized
  nvme: add capability to connect to an administrative controller
  nvmet: add support for FDP in fabrics passthru path
  md: rename recovery_cp to resync_offset
  md/md-cluster: handle REMOVE message earlier
  md: fix create on open mddev lifetime regression
  block: fix potential deadlock while running nr_hw_queue update
  ...
parents 24bbfb89 45fa9f97
Loading
Loading
Loading
Loading
+21 −45
Original line number Diff line number Diff line
@@ -454,17 +454,10 @@ static struct bfq_io_cq *icq_to_bic(struct io_cq *icq)
 */
static struct bfq_io_cq *bfq_bic_lookup(struct request_queue *q)
{
	struct bfq_io_cq *icq;
	unsigned long flags;

	if (!current->io_context)
		return NULL;

	spin_lock_irqsave(&q->queue_lock, flags);
	icq = icq_to_bic(ioc_lookup_icq(q));
	spin_unlock_irqrestore(&q->queue_lock, flags);

	return icq;
	return icq_to_bic(ioc_lookup_icq(q));
}

/*
@@ -701,17 +694,13 @@ static void bfq_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data)
{
	struct bfq_data *bfqd = data->q->elevator->elevator_data;
	struct bfq_io_cq *bic = bfq_bic_lookup(data->q);
	int depth;
	unsigned limit = data->q->nr_requests;
	unsigned int act_idx;
	unsigned int limit, act_idx;

	/* Sync reads have full depth available */
	if (op_is_sync(opf) && !op_is_write(opf)) {
		depth = 0;
	} else {
		depth = bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(opf)];
		limit = (limit * depth) >> bfqd->full_depth_shift;
	}
	if (op_is_sync(opf) && !op_is_write(opf))
		limit = data->q->nr_requests;
	else
		limit = bfqd->async_depths[!!bfqd->wr_busy_queues][op_is_sync(opf)];

	for (act_idx = 0; bic && act_idx < bfqd->num_actuators; act_idx++) {
		/* Fast path to check if bfqq is already allocated. */
@@ -725,14 +714,16 @@ static void bfq_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data)
		 * available requests and thus starve other entities.
		 */
		if (bfqq_request_over_limit(bfqd, bic, opf, act_idx, limit)) {
			depth = 1;
			limit = 1;
			break;
		}
	}

	bfq_log(bfqd, "[%s] wr_busy %d sync %d depth %u",
		__func__, bfqd->wr_busy_queues, op_is_sync(opf), depth);
	if (depth)
		data->shallow_depth = depth;
		__func__, bfqd->wr_busy_queues, op_is_sync(opf), limit);

	if (limit < data->q->nr_requests)
		data->shallow_depth = limit;
}

static struct bfq_queue *
@@ -2457,15 +2448,8 @@ static bool bfq_bio_merge(struct request_queue *q, struct bio *bio,
		unsigned int nr_segs)
{
	struct bfq_data *bfqd = q->elevator->elevator_data;
	struct request *free = NULL;
	/*
	 * bfq_bic_lookup grabs the queue_lock: invoke it now and
	 * store its return value for later use, to avoid nesting
	 * queue_lock inside the bfqd->lock. We assume that the bic
	 * returned by bfq_bic_lookup does not go away before
	 * bfqd->lock is taken.
	 */
	struct bfq_io_cq *bic = bfq_bic_lookup(q);
	struct request *free = NULL;
	bool ret;

	spin_lock_irq(&bfqd->lock);
@@ -7128,9 +7112,8 @@ void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
 */
static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt)
{
	unsigned int depth = 1U << bt->sb.shift;
	unsigned int nr_requests = bfqd->queue->nr_requests;

	bfqd->full_depth_shift = bt->sb.shift;
	/*
	 * In-word depths if no bfq_queue is being weight-raised:
	 * leaving 25% of tags only for sync reads.
@@ -7142,13 +7125,13 @@ static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt)
	 * limit 'something'.
	 */
	/* no more than 50% of tags for async I/O */
	bfqd->word_depths[0][0] = max(depth >> 1, 1U);
	bfqd->async_depths[0][0] = max(nr_requests >> 1, 1U);
	/*
	 * no more than 75% of tags for sync writes (25% extra tags
	 * w.r.t. async I/O, to prevent async I/O from starving sync
	 * writes)
	 */
	bfqd->word_depths[0][1] = max((depth * 3) >> 2, 1U);
	bfqd->async_depths[0][1] = max((nr_requests * 3) >> 2, 1U);

	/*
	 * In-word depths in case some bfq_queue is being weight-
@@ -7158,9 +7141,9 @@ static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt)
	 * shortage.
	 */
	/* no more than ~18% of tags for async I/O */
	bfqd->word_depths[1][0] = max((depth * 3) >> 4, 1U);
	bfqd->async_depths[1][0] = max((nr_requests * 3) >> 4, 1U);
	/* no more than ~37% of tags for sync writes (~20% extra tags) */
	bfqd->word_depths[1][1] = max((depth * 6) >> 4, 1U);
	bfqd->async_depths[1][1] = max((nr_requests * 6) >> 4, 1U);
}

static void bfq_depth_updated(struct blk_mq_hw_ctx *hctx)
@@ -7232,22 +7215,16 @@ static void bfq_init_root_group(struct bfq_group *root_group,
	root_group->sched_data.bfq_class_idle_last_service = jiffies;
}

static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
static int bfq_init_queue(struct request_queue *q, struct elevator_queue *eq)
{
	struct bfq_data *bfqd;
	struct elevator_queue *eq;
	unsigned int i;
	struct blk_independent_access_ranges *ia_ranges = q->disk->ia_ranges;

	eq = elevator_alloc(q, e);
	if (!eq)
		return -ENOMEM;

	bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node);
	if (!bfqd) {
		kobject_put(&eq->kobj);
	if (!bfqd)
		return -ENOMEM;
	}

	eq->elevator_data = bfqd;

	spin_lock_irq(&q->queue_lock);
@@ -7405,7 +7382,6 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)

out_free:
	kfree(bfqd);
	kobject_put(&eq->kobj);
	return -ENOMEM;
}

+6 −7
Original line number Diff line number Diff line
@@ -427,9 +427,6 @@ struct bfq_iocq_bfqq_data {
	 */
	bool saved_IO_bound;

	u64 saved_io_start_time;
	u64 saved_tot_idle_time;

	/*
	 * Same purpose as the previous fields for the values of the
	 * field keeping the queue's belonging to a large burst
@@ -450,6 +447,9 @@ struct bfq_iocq_bfqq_data {
	 */
	unsigned int saved_weight;

	u64 saved_io_start_time;
	u64 saved_tot_idle_time;

	/*
	 * Similar to previous fields: save wr information.
	 */
@@ -457,13 +457,13 @@ struct bfq_iocq_bfqq_data {
	unsigned long saved_last_wr_start_finish;
	unsigned long saved_service_from_wr;
	unsigned long saved_wr_start_at_switch_to_srt;
	unsigned int saved_wr_cur_max_time;
	struct bfq_ttime saved_ttime;
	unsigned int saved_wr_cur_max_time;

	/* Save also injection state */
	u64 saved_last_serv_time_ns;
	unsigned int saved_inject_limit;
	unsigned long saved_decrease_time_jif;
	u64 saved_last_serv_time_ns;

	/* candidate queue for a stable merge (due to close creation time) */
	struct bfq_queue *stable_merge_bfqq;
@@ -813,8 +813,7 @@ struct bfq_data {
	 * Depth limits used in bfq_limit_depth (see comments on the
	 * function)
	 */
	unsigned int word_depths[2][2];
	unsigned int full_depth_shift;
	unsigned int async_depths[2][2];

	/*
	 * Number of independent actuators. This is equal to 1 in
+6 −10
Original line number Diff line number Diff line
@@ -308,24 +308,23 @@ int __copy_io(unsigned long clone_flags, struct task_struct *tsk)

#ifdef CONFIG_BLK_ICQ
/**
 * ioc_lookup_icq - lookup io_cq from ioc
 * ioc_lookup_icq - lookup io_cq from ioc in io issue path
 * @q: the associated request_queue
 *
 * Look up io_cq associated with @ioc - @q pair from @ioc.  Must be called
 * with @q->queue_lock held.
 * from io issue path, either return NULL if current issue io to @q for the
 * first time, or return a valid icq.
 */
struct io_cq *ioc_lookup_icq(struct request_queue *q)
{
	struct io_context *ioc = current->io_context;
	struct io_cq *icq;

	lockdep_assert_held(&q->queue_lock);

	/*
	 * icq's are indexed from @ioc using radix tree and hint pointer,
	 * both of which are protected with RCU.  All removals are done
	 * holding both q and ioc locks, and we're holding q lock - if we
	 * find a icq which points to us, it's guaranteed to be valid.
	 * both of which are protected with RCU, io issue path ensures that
	 * both request_queue and current task are valid, the found icq
	 * is guaranteed to be valid until the io is done.
	 */
	rcu_read_lock();
	icq = rcu_dereference(ioc->icq_hint);
@@ -419,10 +418,7 @@ struct io_cq *ioc_find_get_icq(struct request_queue *q)
		task_unlock(current);
	} else {
		get_io_context(ioc);

		spin_lock_irq(&q->queue_lock);
		icq = ioc_lookup_icq(q);
		spin_unlock_irq(&q->queue_lock);
	}

	if (!icq) {
+152 −71
Original line number Diff line number Diff line
@@ -374,64 +374,17 @@ bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq,
}
EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge);

static int blk_mq_sched_alloc_map_and_rqs(struct request_queue *q,
					  struct blk_mq_hw_ctx *hctx,
					  unsigned int hctx_idx)
{
	if (blk_mq_is_shared_tags(q->tag_set->flags)) {
		hctx->sched_tags = q->sched_shared_tags;
		return 0;
	}

	hctx->sched_tags = blk_mq_alloc_map_and_rqs(q->tag_set, hctx_idx,
						    q->nr_requests);

	if (!hctx->sched_tags)
		return -ENOMEM;
	return 0;
}

static void blk_mq_exit_sched_shared_tags(struct request_queue *queue)
{
	blk_mq_free_rq_map(queue->sched_shared_tags);
	queue->sched_shared_tags = NULL;
}

/* called in queue's release handler, tagset has gone away */
static void blk_mq_sched_tags_teardown(struct request_queue *q, unsigned int flags)
{
	struct blk_mq_hw_ctx *hctx;
	unsigned long i;

	queue_for_each_hw_ctx(q, hctx, i) {
		if (hctx->sched_tags) {
			if (!blk_mq_is_shared_tags(flags))
				blk_mq_free_rq_map(hctx->sched_tags);
	queue_for_each_hw_ctx(q, hctx, i)
		hctx->sched_tags = NULL;
		}
	}

	if (blk_mq_is_shared_tags(flags))
		blk_mq_exit_sched_shared_tags(q);
}

static int blk_mq_init_sched_shared_tags(struct request_queue *queue)
{
	struct blk_mq_tag_set *set = queue->tag_set;

	/*
	 * Set initial depth at max so that we don't need to reallocate for
	 * updating nr_requests.
	 */
	queue->sched_shared_tags = blk_mq_alloc_map_and_rqs(set,
						BLK_MQ_NO_HCTX_IDX,
						MAX_SCHED_RQ);
	if (!queue->sched_shared_tags)
		return -ENOMEM;

	blk_mq_tag_update_sched_shared_tags(queue);

	return 0;
		q->sched_shared_tags = NULL;
}

void blk_mq_sched_reg_debugfs(struct request_queue *q)
@@ -458,8 +411,140 @@ void blk_mq_sched_unreg_debugfs(struct request_queue *q)
	mutex_unlock(&q->debugfs_mutex);
}

/*
 * Free the scheduler tag maps referenced by @et and then @et itself.
 *
 * @et:  tag container to release; it is kfree()d before returning, so the
 *       caller must not touch it afterwards.
 * @set: tag set the maps are freed against; its flags decide whether @et
 *       carries one shared map or one map per hardware queue.
 */
void blk_mq_free_sched_tags(struct elevator_tags *et,
		struct blk_mq_tag_set *set)
{
	unsigned long idx = 0;

	if (blk_mq_is_shared_tags(set->flags)) {
		/* A shared tag set keeps its single map in slot 0 of @tags. */
		blk_mq_free_map_and_rqs(set, et->tags[0], BLK_MQ_NO_HCTX_IDX);
	} else {
		/* Otherwise slot N holds the map of hardware queue N. */
		while (idx < et->nr_hw_queues) {
			blk_mq_free_map_and_rqs(set, et->tags[idx], idx);
			idx++;
		}
	}

	kfree(et);
}

/*
 * For every queue of @set that has an elevator attached, look up its
 * elevator_tags in @et_table (keyed by q->id) and free them.
 *
 * The caller must hold set->update_nr_hwq_lock for writing. A queue with an
 * elevator but no entry in @et_table triggers a one-time warning and is
 * skipped.
 */
void blk_mq_free_sched_tags_batch(struct xarray *et_table,
		struct blk_mq_tag_set *set)
{
	struct elevator_tags *et;
	struct request_queue *q;

	lockdep_assert_held_write(&set->update_nr_hwq_lock);

	list_for_each_entry(q, &set->tag_list, tag_set_list) {
		/*
		 * Reading q->elevator without q->elevator_lock is fine here:
		 * set->update_nr_hwq_lock is held in the writer context, and
		 * the scheduler update/switch code takes that same lock as a
		 * reader, so it cannot run concurrently.
		 */
		if (!q->elevator)
			continue;

		et = xa_load(et_table, q->id);
		if (unlikely(!et)) {
			WARN_ON_ONCE(1);
			continue;
		}

		blk_mq_free_sched_tags(et, set);
	}
}

/*
 * Allocate an elevator_tags container together with its scheduler tag maps.
 *
 * @set:          tag set the maps are allocated from.
 * @nr_hw_queues: number of hardware queues to provision maps for.
 *
 * With shared tags a single map is allocated and stored in slot 0;
 * otherwise one map per hardware queue is allocated. Returns the new
 * container, or NULL if any allocation fails, in which case everything
 * allocated so far has been released again.
 */
struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set,
		unsigned int nr_hw_queues)
{
	gfp_t gfp = GFP_NOIO | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
	struct elevator_tags *et;
	unsigned int nr_tags;
	int i;

	/* One slot for a shared map, else one slot per hardware queue. */
	nr_tags = blk_mq_is_shared_tags(set->flags) ? 1 : nr_hw_queues;

	et = kmalloc(sizeof(struct elevator_tags) +
			nr_tags * sizeof(struct blk_mq_tags *), gfp);
	if (!et)
		return NULL;

	/*
	 * The default depth is twice the smaller of the hw queue_depth and
	 * BLKDEV_DEFAULT_RQ, because requests are not split into sync/async
	 * like the old code did. Note that this is a per-hw queue depth.
	 */
	et->nr_requests = 2 * min_t(unsigned int, set->queue_depth,
			BLKDEV_DEFAULT_RQ);
	et->nr_hw_queues = nr_hw_queues;

	if (blk_mq_is_shared_tags(set->flags)) {
		/* The shared map lives in slot 0 of @tags. */
		et->tags[0] = blk_mq_alloc_map_and_rqs(set, BLK_MQ_NO_HCTX_IDX,
					MAX_SCHED_RQ);
		if (!et->tags[0])
			goto out;
		return et;
	}

	for (i = 0; i < et->nr_hw_queues; i++) {
		et->tags[i] = blk_mq_alloc_map_and_rqs(set, i,
				et->nr_requests);
		if (!et->tags[i])
			goto out_unwind;
	}

	return et;

out_unwind:
	/* Slot i failed: release slots i-1 down to 0. */
	while (--i >= 0)
		blk_mq_free_map_and_rqs(set, et->tags[i], i);
out:
	kfree(et);
	return NULL;
}

/*
 * Allocate elevator_tags for every queue of @set that has an elevator
 * attached and store each one in @et_table under the queue's id.
 *
 * The caller must hold set->update_nr_hwq_lock for writing.
 *
 * Returns 0 on success. If an allocation or an xarray insertion fails,
 * the tags allocated for the queues visited before the failing one are
 * freed again and -ENOMEM is returned.
 */
int blk_mq_alloc_sched_tags_batch(struct xarray *et_table,
		struct blk_mq_tag_set *set, unsigned int nr_hw_queues)
{
	gfp_t gfp = GFP_NOIO | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
	struct elevator_tags *et;
	struct request_queue *q;

	lockdep_assert_held_write(&set->update_nr_hwq_lock);

	list_for_each_entry(q, &set->tag_list, tag_set_list) {
		/*
		 * Reading q->elevator without q->elevator_lock is fine here:
		 * set->update_nr_hwq_lock is held in the writer context, and
		 * the scheduler update/switch code takes that same lock as a
		 * reader, so it cannot run concurrently.
		 */
		if (!q->elevator)
			continue;

		et = blk_mq_alloc_sched_tags(set, nr_hw_queues);
		if (!et)
			goto out_unwind;
		if (xa_insert(et_table, q->id, et, gfp))
			goto out_free_tags;
	}

	return 0;

out_free_tags:
	/* This queue's tags never made it into the table; free them here. */
	blk_mq_free_sched_tags(et, set);
out_unwind:
	/* Walk back over the queues handled before the failing one. */
	list_for_each_entry_continue_reverse(q, &set->tag_list, tag_set_list) {
		if (!q->elevator)
			continue;

		et = xa_load(et_table, q->id);
		if (et)
			blk_mq_free_sched_tags(et, set);
	}
	return -ENOMEM;
}

/* caller must have a reference to @e, will grab another one if successful */
int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e,
		struct elevator_tags *et)
{
	unsigned int flags = q->tag_set->flags;
	struct blk_mq_hw_ctx *hctx;
@@ -467,36 +552,33 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
	unsigned long i;
	int ret;

	/*
	 * Default to double of smaller one between hw queue_depth and 128,
	 * since we don't split into sync/async like the old code did.
	 * Additionally, this is a per-hw queue depth.
	 */
	q->nr_requests = 2 * min_t(unsigned int, q->tag_set->queue_depth,
				   BLKDEV_DEFAULT_RQ);
	eq = elevator_alloc(q, e, et);
	if (!eq)
		return -ENOMEM;

	q->nr_requests = et->nr_requests;

	if (blk_mq_is_shared_tags(flags)) {
		ret = blk_mq_init_sched_shared_tags(q);
		if (ret)
			return ret;
		/* Shared tags are stored at index 0 in @et->tags. */
		q->sched_shared_tags = et->tags[0];
		blk_mq_tag_update_sched_shared_tags(q);
	}

	queue_for_each_hw_ctx(q, hctx, i) {
		ret = blk_mq_sched_alloc_map_and_rqs(q, hctx, i);
		if (ret)
			goto err_free_map_and_rqs;
		if (blk_mq_is_shared_tags(flags))
			hctx->sched_tags = q->sched_shared_tags;
		else
			hctx->sched_tags = et->tags[i];
	}

	ret = e->ops.init_sched(q, e);
	ret = e->ops.init_sched(q, eq);
	if (ret)
		goto err_free_map_and_rqs;
		goto out;

	queue_for_each_hw_ctx(q, hctx, i) {
		if (e->ops.init_hctx) {
			ret = e->ops.init_hctx(hctx, i);
			if (ret) {
				eq = q->elevator;
				blk_mq_sched_free_rqs(q);
				blk_mq_exit_sched(q, eq);
				kobject_put(&eq->kobj);
				return ret;
@@ -505,10 +587,9 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
	}
	return 0;

err_free_map_and_rqs:
	blk_mq_sched_free_rqs(q);
out:
	blk_mq_sched_tags_teardown(q, flags);

	kobject_put(&eq->kobj);
	q->elevator = NULL;
	return ret;
}
+11 −1
Original line number Diff line number Diff line
@@ -18,10 +18,20 @@ void __blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx);

void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx);

int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e);
int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e,
		struct elevator_tags *et);
void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e);
void blk_mq_sched_free_rqs(struct request_queue *q);

struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set,
		unsigned int nr_hw_queues);
int blk_mq_alloc_sched_tags_batch(struct xarray *et_table,
		struct blk_mq_tag_set *set, unsigned int nr_hw_queues);
void blk_mq_free_sched_tags(struct elevator_tags *et,
		struct blk_mq_tag_set *set);
void blk_mq_free_sched_tags_batch(struct xarray *et_table,
		struct blk_mq_tag_set *set);

static inline void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
{
	if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
Loading