From bb642e2d300ee27dcede65cda7ffc47a7047bd69 Mon Sep 17 00:00:00 2001
From: Amit Chaudhary
Date: Fri, 26 Sep 2025 12:08:22 -0700
Subject: [PATCH 1/5] nvme-multipath: Skip nr_active increments in RETRY disposition

For the queue-depth I/O policy, this patch fixes unbalanced I/Os across
nvme multipaths.

Issue description:
The RETRY disposition incorrectly increments the ns->ctrl->nr_active
counter and reinitializes the iostat start-time. In such cases the
nr_active counter never goes back to zero until that path disconnects
and reconnects. If multiple RETRY cases on a given path cause its
queue-depth counter to become artificially higher than that of other
paths, the path is no longer chosen for new I/Os. This leads to
unbalanced I/Os across paths.

The patch skips incrementing nr_active if NVME_MPATH_CNT_ACTIVE is
already set, and skips restarting io stats if NVME_MPATH_IO_STATS is
already set.

base-commit: e989a3da2d371a4b6597ee8dee5c72e407b4db7a
Fixes: d4d957b53d91eeb ("nvme-multipath: support io stats on the mpath device")
Signed-off-by: Amit Chaudhary
Reviewed-by: Randy Jennings
Signed-off-by: Keith Busch
---
 drivers/nvme/host/multipath.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 3da980dc60d9..543e17aead12 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -182,12 +182,14 @@ void nvme_mpath_start_request(struct request *rq)
 	struct nvme_ns *ns = rq->q->queuedata;
 	struct gendisk *disk = ns->head->disk;
 
-	if (READ_ONCE(ns->head->subsys->iopolicy) == NVME_IOPOLICY_QD) {
+	if ((READ_ONCE(ns->head->subsys->iopolicy) == NVME_IOPOLICY_QD) &&
+	    !(nvme_req(rq)->flags & NVME_MPATH_CNT_ACTIVE)) {
 		atomic_inc(&ns->ctrl->nr_active);
 		nvme_req(rq)->flags |= NVME_MPATH_CNT_ACTIVE;
 	}
 
-	if (!blk_queue_io_stat(disk->queue) || blk_rq_is_passthrough(rq))
+	if (!blk_queue_io_stat(disk->queue) || blk_rq_is_passthrough(rq) ||
+	    (nvme_req(rq)->flags & NVME_MPATH_IO_STATS))
 		return;
 
 	nvme_req(rq)->flags |= NVME_MPATH_IO_STATS;

From 7e091add9c433bab6912228799bf508e2414acc3 Mon Sep 17 00:00:00 2001
From: Martin George
Date: Mon, 8 Sep 2025 22:54:57 +0530
Subject: [PATCH 2/5] nvme-auth: update sc_c in host response

The sc_c field is currently not updated in the host response to the
controller challenge, leading to failures while attempting secure
channel concatenation. Fix this by adding a new sc_c variable to the
dhchap queue context structure, which is set appropriately during
negotiation and then used in the host response.
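For illustration only (not part of the patch): a minimal user-space sketch of
where the single SC_C byte sits in the hashed host-response transcript. The
field order is simplified from nvme_auth_dhchap_setup_host_response() as seen
in the hunks below; the transcript_tail() helper, the 0x01 value, and the
example NQN are made up, and the real code feeds each field to
crypto_shash_update() (with the transaction ID serialized little-endian)
instead of building a flat buffer.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/*
 * Toy model: build the part of the response transcript around the SC_C
 * byte.  Before this fix the driver hashed a hard-coded zero byte here,
 * so a controller that negotiated secure channel concatenation (non-zero
 * SC_C) computed a different response and authentication failed.
 */
static size_t transcript_tail(uint8_t *out, uint16_t transaction,
			      uint8_t sc_c, const char *hostnqn)
{
	size_t off = 0;

	memcpy(out + off, &transaction, sizeof(transaction));	/* transaction id */
	off += sizeof(transaction);
	out[off++] = sc_c;			/* was memset() to 0 before this patch */
	memcpy(out + off, hostnqn, strlen(hostnqn));		/* host NQN */
	off += strlen(hostnqn);
	out[off++] = 0;				/* separator byte, still zero */
	return off;
}

int main(void)
{
	uint8_t buf[256];
	size_t len = transcript_tail(buf, 0x1234, 0x01,
				     "nqn.2014-08.org.nvmexpress:uuid:host");

	printf("hashed %zu bytes, sc_c byte = 0x%02x\n", len, buf[2]);
	return 0;
}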
Fixes: e88a7595b57f ("nvme-tcp: request secure channel concatenation")
Signed-off-by: Martin George
Signed-off-by: Prashanth Adurthi
Reviewed-by: Hannes Reinecke
Signed-off-by: Keith Busch
---
 drivers/nvme/host/auth.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c
index 012fcfc79a73..a01178caf15b 100644
--- a/drivers/nvme/host/auth.c
+++ b/drivers/nvme/host/auth.c
@@ -36,6 +36,7 @@ struct nvme_dhchap_queue_context {
 	u8 status;
 	u8 dhgroup_id;
 	u8 hash_id;
+	u8 sc_c;
 	size_t hash_len;
 	u8 c1[64];
 	u8 c2[64];
@@ -154,6 +155,8 @@ static int nvme_auth_set_dhchap_negotiate_data(struct nvme_ctrl *ctrl,
 	data->auth_protocol[0].dhchap.idlist[34] = NVME_AUTH_DHGROUP_6144;
 	data->auth_protocol[0].dhchap.idlist[35] = NVME_AUTH_DHGROUP_8192;
 
+	chap->sc_c = data->sc_c;
+
 	return size;
 }
 
@@ -489,7 +492,7 @@ static int nvme_auth_dhchap_setup_host_response(struct nvme_ctrl *ctrl,
 	ret = crypto_shash_update(shash, buf, 2);
 	if (ret)
 		goto out;
-	memset(buf, 0, sizeof(buf));
+	*buf = chap->sc_c;
 	ret = crypto_shash_update(shash, buf, 1);
 	if (ret)
 		goto out;
@@ -500,6 +503,7 @@ static int nvme_auth_dhchap_setup_host_response(struct nvme_ctrl *ctrl,
 		strlen(ctrl->opts->host->nqn));
 	if (ret)
 		goto out;
+	memset(buf, 0, sizeof(buf));
 	ret = crypto_shash_update(shash, buf, 1);
 	if (ret)
 		goto out;

From dc96cefef0d3032c69e46a21b345c60e56b18934 Mon Sep 17 00:00:00 2001
From: Yu Kuai
Date: Wed, 15 Oct 2025 09:48:27 +0800
Subject: [PATCH 3/5] blk-mq: fix stale tag depth for shared sched tags in blk_mq_update_nr_requests()

Commit 7f2799c546db ("blk-mq: cleanup shared tags case in
blk_mq_update_nr_requests()") moves blk_mq_tag_update_sched_shared_tags()
before q->nr_requests is updated; however, it still uses the old
q->nr_requests to resize the tag depth. Fix this by passing in the
expected new tag depth.
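For illustration only (not part of the patch): a tiny user-space model of the
ordering problem fixed by the hunks that follow, with made-up names. When the
resize helper reads q->nr_requests itself but runs before nr_requests is
updated, it resizes to the stale depth; passing the new depth explicitly, as
this patch does, removes the ordering dependency.

#include <stdio.h>

struct toy_queue {
	unsigned int nr_requests;
	unsigned int sched_tag_depth;	/* stands in for the sbitmap depth */
};

/* old-style helper: silently depends on nr_requests being updated first */
static void resize_from_queue(struct toy_queue *q)
{
	q->sched_tag_depth = q->nr_requests;
}

/* fixed helper: the caller states which depth it wants */
static void resize_to(struct toy_queue *q, unsigned int nr)
{
	q->sched_tag_depth = nr;
}

int main(void)
{
	struct toy_queue q = { .nr_requests = 64 };

	/* buggy ordering: resize first, update nr_requests afterwards */
	resize_from_queue(&q);
	q.nr_requests = 128;
	printf("implicit depth: %u (wanted 128)\n", q.sched_tag_depth);

	/* fixed: the new depth is passed in, so ordering no longer matters */
	resize_to(&q, 128);
	printf("explicit depth: %u\n", q.sched_tag_depth);
	return 0;
}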
Fixes: 7f2799c546db ("blk-mq: cleanup shared tags case in blk_mq_update_nr_requests()")
Signed-off-by: Yu Kuai
Reviewed-by: Ming Lei
Reviewed-by: Nilay Shroff
Reported-by: Chris Mason
Link: https://lore.kernel.org/linux-block/20251014130507.4187235-2-clm@meta.com/
Signed-off-by: Jens Axboe
---
 block/blk-mq-sched.c | 2 +-
 block/blk-mq-tag.c   | 5 +++--
 block/blk-mq.c       | 2 +-
 block/blk-mq.h       | 3 ++-
 4 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index d06bb137a743..e0bed16485c3 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -557,7 +557,7 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e,
 	if (blk_mq_is_shared_tags(flags)) {
 		/* Shared tags are stored at index 0 in @et->tags. */
 		q->sched_shared_tags = et->tags[0];
-		blk_mq_tag_update_sched_shared_tags(q);
+		blk_mq_tag_update_sched_shared_tags(q, et->nr_requests);
 	}
 
 	queue_for_each_hw_ctx(q, hctx, i) {
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index c7a4d4b9cc87..5b664dbdf655 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -622,10 +622,11 @@ void blk_mq_tag_resize_shared_tags(struct blk_mq_tag_set *set, unsigned int size
 	sbitmap_queue_resize(&tags->bitmap_tags, size - set->reserved_tags);
 }
 
-void blk_mq_tag_update_sched_shared_tags(struct request_queue *q)
+void blk_mq_tag_update_sched_shared_tags(struct request_queue *q,
+					 unsigned int nr)
 {
 	sbitmap_queue_resize(&q->sched_shared_tags->bitmap_tags,
-			     q->nr_requests - q->tag_set->reserved_tags);
+			     nr - q->tag_set->reserved_tags);
 }
 
 /**
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 09f579414161..d626d32f6e57 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -4941,7 +4941,7 @@ struct elevator_tags *blk_mq_update_nr_requests(struct request_queue *q,
 		 * tags can't grow, see blk_mq_alloc_sched_tags().
 		 */
 		if (q->elevator)
-			blk_mq_tag_update_sched_shared_tags(q);
+			blk_mq_tag_update_sched_shared_tags(q, nr);
 		else
 			blk_mq_tag_resize_shared_tags(set, nr);
 	} else if (!q->elevator) {
diff --git a/block/blk-mq.h b/block/blk-mq.h
index af42dc018808..c4fccdeb5441 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -186,7 +186,8 @@ void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx,
 void blk_mq_put_tags(struct blk_mq_tags *tags, int *tag_array, int nr_tags);
 void blk_mq_tag_resize_shared_tags(struct blk_mq_tag_set *set, unsigned int size);
-void blk_mq_tag_update_sched_shared_tags(struct request_queue *q);
+void blk_mq_tag_update_sched_shared_tags(struct request_queue *q,
+		unsigned int nr);
 void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool);
 void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn,

From 08823e89e3e269bf4c4a20b4c24a8119920cc7a4 Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Wed, 15 Oct 2025 18:30:39 +0800
Subject: [PATCH 4/5] block: Remove elevator_lock usage from blkg_conf frozen operations

Remove the acquisition and release of q->elevator_lock in the
blkg_conf_open_bdev_frozen() and blkg_conf_exit_frozen() functions.

The elevator lock is no longer needed in these code paths since commit
78c271344b6f ("block: move wbt_enable_default() out of queue freezing
from sched ->exit()"), which introduced disk->rqos_state_mutex for
protecting wbt state changes; there is no need to abuse elevator_lock
for this purpose.

This change helps to resolve the lockdep warning reported by Yu Kuai [1].

Passes blktests/throtl with lockdep enabled.
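For illustration only (not part of the patch): a rough user-space sketch of
the ordering that remains after elevator_lock is dropped from these paths,
loosely mirroring the hunks below. Pthread mutexes stand in for the queue
freeze and for rq_qos_mutex; all names are illustrative and the error
handling of the real functions is omitted.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t freeze = PTHREAD_MUTEX_INITIALIZER;	/* stands in for blk_mq_freeze_queue() */
static pthread_mutex_t rq_qos_mutex = PTHREAD_MUTEX_INITIALIZER;

/* loosely mirrors blkg_conf_open_bdev_frozen() after this change */
static void open_bdev_frozen(void)
{
	pthread_mutex_lock(&rq_qos_mutex);	/* taken by the plain open path */
	/* ... bdev lookup would happen here ... */
	pthread_mutex_unlock(&rq_qos_mutex);	/* drop it before freezing */

	pthread_mutex_lock(&freeze);		/* freeze the queue first */
	pthread_mutex_lock(&rq_qos_mutex);	/* re-take rq_qos_mutex under the freeze */
}

/* loosely mirrors blkg_conf_exit_frozen(): release in reverse order, unfreeze last */
static void exit_frozen(void)
{
	pthread_mutex_unlock(&rq_qos_mutex);
	pthread_mutex_unlock(&freeze);
}

int main(void)
{
	open_bdev_frozen();
	puts("queue frozen, rq_qos_mutex held: apply the configuration here");
	exit_frozen();
	return 0;
}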
Link: https://lore.kernel.org/linux-block/e5e7ac3f-2063-473a-aafb-4d8d43e5576e@yukuai.org.cn/ [1]
Fixes: 78c271344b6f ("block: move wbt_enable_default() out of queue freezing from sched ->exit()")
Signed-off-by: Ming Lei
Reviewed-by: Nilay Shroff
Signed-off-by: Jens Axboe
---
 block/blk-cgroup.c | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index f93de34fe87d..3cffb68ba5d8 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -812,8 +812,7 @@ int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx)
 }
 
 /*
  * Similar to blkg_conf_open_bdev, but additionally freezes the queue,
- * acquires q->elevator_lock, and ensures the correct locking order
- * between q->elevator_lock and q->rq_qos_mutex.
+ * ensures the correct locking order between freeze queue and q->rq_qos_mutex.
  *
  * This function returns negative error on failure. On success it returns
  * memflags which must be saved and later passed to blkg_conf_exit_frozen
@@ -834,13 +833,11 @@ unsigned long __must_check blkg_conf_open_bdev_frozen(struct blkg_conf_ctx *ctx)
 	 * At this point, we haven't started protecting anything related to QoS,
 	 * so we release q->rq_qos_mutex here, which was first acquired in blkg_
 	 * conf_open_bdev. Later, we re-acquire q->rq_qos_mutex after freezing
-	 * the queue and acquiring q->elevator_lock to maintain the correct
-	 * locking order.
+	 * the queue to maintain the correct locking order.
 	 */
 	mutex_unlock(&ctx->bdev->bd_queue->rq_qos_mutex);
 
 	memflags = blk_mq_freeze_queue(ctx->bdev->bd_queue);
-	mutex_lock(&ctx->bdev->bd_queue->elevator_lock);
 	mutex_lock(&ctx->bdev->bd_queue->rq_qos_mutex);
 
 	return memflags;
@@ -995,9 +992,8 @@ void blkg_conf_exit(struct blkg_conf_ctx *ctx)
 EXPORT_SYMBOL_GPL(blkg_conf_exit);
 
 /*
- * Similar to blkg_conf_exit, but also unfreezes the queue and releases
- * q->elevator_lock. Should be used when blkg_conf_open_bdev_frozen
- * is used to open the bdev.
+ * Similar to blkg_conf_exit, but also unfreezes the queue. Should be used
+ * when blkg_conf_open_bdev_frozen is used to open the bdev.
  */
 void blkg_conf_exit_frozen(struct blkg_conf_ctx *ctx, unsigned long memflags)
 {
@@ -1005,7 +1001,6 @@ void blkg_conf_exit_frozen(struct blkg_conf_ctx *ctx, unsigned long memflags)
 		struct request_queue *q = ctx->bdev->bd_queue;
 
 		blkg_conf_exit(ctx);
-		mutex_unlock(&q->elevator_lock);
 		blk_mq_unfreeze_queue(q, memflags);
 	}
 }

From 5a869d017793399fd1d2609ff27e900534173eb3 Mon Sep 17 00:00:00 2001
From: Wilfred Mallawa
Date: Fri, 10 Oct 2025 17:19:42 +1000
Subject: [PATCH 5/5] nvme/tcp: handle tls partially sent records in write_space()

With TLS enabled, records that are encrypted and appended to the TLS TX
list can fail to see a retry if the underlying TCP socket is busy, for
example, when hitting an EAGAIN from tcp_sendmsg_locked(). This is not
known to the NVMe TCP driver, as the TLS layer has successfully
generated a record. Typically, the TLS write_space() callback would
ensure such records are retried, but in the NVMe TCP host driver,
write_space() invokes nvme_tcp_write_space(). This causes a partially
sent record in the TLS TX list to time out after not being retried.

Fix this by calling queue->write_space(), which calls into the TLS
layer to retry any pending records.

Fixes: be8e82caa685 ("nvme-tcp: enable TLS handshake upcall")
Signed-off-by: Wilfred Mallawa
Reviewed-by: Hannes Reinecke
Signed-off-by: Keith Busch
---
 drivers/nvme/host/tcp.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 1413788ca7d5..9a96df1a511c 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -1081,6 +1081,9 @@ static void nvme_tcp_write_space(struct sock *sk)
 	queue = sk->sk_user_data;
 	if (likely(queue && sk_stream_is_writeable(sk))) {
 		clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+		/* Ensure pending TLS partial records are retried */
+		if (nvme_tcp_queue_tls(queue))
+			queue->write_space(sk);
 		queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
 	}
 	read_unlock_bh(&sk->sk_callback_lock);
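For illustration only (not part of the patch): a user-space sketch of the
callback chaining the diff above relies on. The driver keeps a pointer to the
socket's original write_space callback when it installs its own; with TLS that
saved callback belongs to the TLS layer, so invoking it lets a partially sent
record be pushed before the driver queues more work. All names below are made
up for the illustration.

#include <stdio.h>

struct toy_sock {
	void (*sk_write_space)(struct toy_sock *sk);
};

struct toy_queue {
	void (*saved_write_space)(struct toy_sock *sk);	/* original (TLS) callback */
	int tls_enabled;
};

static struct toy_queue *g_queue;

static void tls_write_space(struct toy_sock *sk)
{
	puts("tls: retrying partially sent record");
}

static void driver_write_space(struct toy_sock *sk)
{
	struct toy_queue *queue = g_queue;

	if (queue->tls_enabled)
		queue->saved_write_space(sk);	/* let TLS flush pending records first */
	puts("driver: scheduling io_work");
}

int main(void)
{
	struct toy_sock sock = { .sk_write_space = tls_write_space };
	struct toy_queue queue = { .tls_enabled = 1 };

	/* install the driver callback, keeping the original for chaining */
	queue.saved_write_space = sock.sk_write_space;
	sock.sk_write_space = driver_write_space;
	g_queue = &queue;

	sock.sk_write_space(&sock);	/* socket became writable again */
	return 0;
}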