Commit 39eb8101 authored by Jens Axboe
Browse files

Merge tag 'nvme-6.16-2025-05-20' of git://git.infradead.org/nvme into for-6.16/block

Pull NVMe updates from Christoph:

"nvme updates for Linux 6.16

 - add per-node DMA pools and use them for PRP/SGL allocations
   (Caleb Sander Mateos, Keith Busch)
 - nvme-fcloop refcounting fixes (Daniel Wagner)
 - support delayed removal of the multipath node and optionally support
   the multipath node for private namespaces (Nilay Shroff)
 - support shared CQs in the PCI endpoint target code (Wilfred Mallawa)
 - support admin-queue only authentication (Hannes Reinecke)
 - use the crc32c library instead of the crypto API (Eric Biggers)
 - misc cleanups (Christoph Hellwig, Marcelo Moreira, Hannes Reinecke,
   Leon Romanovsky, Gustavo A. R. Silva)"

* tag 'nvme-6.16-2025-05-20' of git://git.infradead.org/nvme: (42 commits)
  nvme: rename nvme_mpath_shutdown_disk to nvme_mpath_remove_disk
  nvme: introduce multipath_always_on module param
  nvme-multipath: introduce delayed removal of the multipath head node
  nvme-pci: derive and better document max segments limits
  nvme-pci: use struct_size for allocation struct nvme_dev
  nvme-pci: add a symbolic name for the small pool size
  nvme-pci: use a better encoding for small prp pool allocations
  nvme-pci: rename the descriptor pools
  nvme-pci: remove struct nvme_descriptor
  nvme-pci: store aborted state in flags variable
  nvme-pci: don't try to use SGLs for metadata on the admin queue
  nvme-pci: make PRP list DMA pools per-NUMA-node
  nvme-pci: factor out a nvme_init_hctx_common() helper
  dmapool: add NUMA affinity support
  nvme-fc: do not reference lsrsp after failure
  nvmet-fcloop: don't wait for lport cleanup
  nvmet-fcloop: add missing fcloop_callback_host_done
  nvmet-fc: take tgtport refs for portentry
  nvmet-fc: free pending reqs on tgtport unregister
  nvmet-fcloop: drop response if targetport is gone
  ...
parents 496a3bc5 9e221d8c
Loading
Loading
Loading
Loading
+2 −13
Original line number Diff line number Diff line
@@ -242,7 +242,7 @@ struct nvme_dhchap_key *nvme_auth_transform_key(
{
	const char *hmac_name;
	struct crypto_shash *key_tfm;
	struct shash_desc *shash;
	SHASH_DESC_ON_STACK(shash, key_tfm);
	struct nvme_dhchap_key *transformed_key;
	int ret, key_len;

@@ -267,19 +267,11 @@ struct nvme_dhchap_key *nvme_auth_transform_key(
	if (IS_ERR(key_tfm))
		return ERR_CAST(key_tfm);

	shash = kmalloc(sizeof(struct shash_desc) +
			crypto_shash_descsize(key_tfm),
			GFP_KERNEL);
	if (!shash) {
		ret = -ENOMEM;
		goto out_free_key;
	}

	key_len = crypto_shash_digestsize(key_tfm);
	transformed_key = nvme_auth_alloc_key(key_len, key->hash);
	if (!transformed_key) {
		ret = -ENOMEM;
		goto out_free_shash;
		goto out_free_key;
	}

	shash->tfm = key_tfm;
@@ -299,15 +291,12 @@ struct nvme_dhchap_key *nvme_auth_transform_key(
	if (ret < 0)
		goto out_free_transformed_key;

	kfree(shash);
	crypto_free_shash(key_tfm);

	return transformed_key;

out_free_transformed_key:
	nvme_auth_free_key(transformed_key);
out_free_shash:
	kfree(shash);
out_free_key:
	crypto_free_shash(key_tfm);

+22 −8
Original line number Diff line number Diff line
@@ -31,6 +31,7 @@ struct nvme_dhchap_queue_context {
	u32 s1;
	u32 s2;
	bool bi_directional;
	bool authenticated;
	u16 transaction;
	u8 status;
	u8 dhgroup_id;
@@ -682,6 +683,7 @@ static void nvme_auth_reset_dhchap(struct nvme_dhchap_queue_context *chap)
static void nvme_auth_free_dhchap(struct nvme_dhchap_queue_context *chap)
{
	nvme_auth_reset_dhchap(chap);
	chap->authenticated = false;
	if (chap->shash_tfm)
		crypto_free_shash(chap->shash_tfm);
	if (chap->dh_tfm)
@@ -930,12 +932,14 @@ static void nvme_queue_auth_work(struct work_struct *work)
	}
	if (!ret) {
		chap->error = 0;
		chap->authenticated = true;
		if (ctrl->opts->concat &&
		    (ret = nvme_auth_secure_concat(ctrl, chap))) {
			dev_warn(ctrl->device,
				 "%s: qid %d failed to enable secure concatenation\n",
				 __func__, chap->qid);
			chap->error = ret;
			chap->authenticated = false;
		}
		return;
	}
@@ -1023,13 +1027,16 @@ static void nvme_ctrl_auth_work(struct work_struct *work)
		return;

	for (q = 1; q < ctrl->queue_count; q++) {
		ret = nvme_auth_negotiate(ctrl, q);
		if (ret) {
			dev_warn(ctrl->device,
				 "qid %d: error %d setting up authentication\n",
				 q, ret);
			break;
		}
		struct nvme_dhchap_queue_context *chap =
			&ctrl->dhchap_ctxs[q];
		/*
		 * Skip re-authentication if the queue had
		 * not been authenticated initially.
		 */
		if (!chap->authenticated)
			continue;
		cancel_work_sync(&chap->auth_work);
		queue_work(nvme_auth_wq, &chap->auth_work);
	}

	/*
@@ -1037,7 +1044,13 @@ static void nvme_ctrl_auth_work(struct work_struct *work)
	 * the controller terminates the connection.
	 */
	for (q = 1; q < ctrl->queue_count; q++) {
		ret = nvme_auth_wait(ctrl, q);
		struct nvme_dhchap_queue_context *chap =
			&ctrl->dhchap_ctxs[q];
		if (!chap->authenticated)
			continue;
		flush_work(&chap->auth_work);
		ret = chap->error;
		nvme_auth_reset_dhchap(chap);
		if (ret)
			dev_warn(ctrl->device,
				 "qid %d: authentication failed\n", q);
@@ -1076,6 +1089,7 @@ int nvme_auth_init_ctrl(struct nvme_ctrl *ctrl)
		chap = &ctrl->dhchap_ctxs[i];
		chap->qid = i;
		chap->ctrl = ctrl;
		chap->authenticated = false;
		INIT_WORK(&chap->auth_work, nvme_queue_auth_work);
	}

+7 −5
Original line number Diff line number Diff line
@@ -668,7 +668,7 @@ static void nvme_free_ns_head(struct kref *ref)
	struct nvme_ns_head *head =
		container_of(ref, struct nvme_ns_head, ref);

	nvme_mpath_remove_disk(head);
	nvme_mpath_put_disk(head);
	ida_free(&head->subsys->ns_ida, head->instance);
	cleanup_srcu_struct(&head->srcu);
	nvme_put_subsystem(head->subsys);
@@ -3743,7 +3743,7 @@ static struct nvme_ns_head *nvme_find_ns_head(struct nvme_ctrl *ctrl,
		 */
		if (h->ns_id != nsid || !nvme_is_unique_nsid(ctrl, h))
			continue;
		if (!list_empty(&h->list) && nvme_tryget_ns_head(h))
		if (nvme_tryget_ns_head(h))
			return h;
	}

@@ -3987,7 +3987,8 @@ static int nvme_init_ns_head(struct nvme_ns *ns, struct nvme_ns_info *info)
		}
	} else {
		ret = -EINVAL;
		if (!info->is_shared || !head->shared) {
		if ((!info->is_shared || !head->shared) &&
		    !list_empty(&head->list)) {
			dev_err(ctrl->device,
				"Duplicate unshared namespace %d\n",
				info->nsid);
@@ -4191,6 +4192,7 @@ static void nvme_ns_remove(struct nvme_ns *ns)
	mutex_lock(&ns->ctrl->subsys->lock);
	list_del_rcu(&ns->siblings);
	if (list_empty(&ns->head->list)) {
		if (!nvme_mpath_queue_if_no_path(ns->head))
			list_del_init(&ns->head->entry);
		last_path = true;
	}
@@ -4212,7 +4214,7 @@ static void nvme_ns_remove(struct nvme_ns *ns)
	synchronize_srcu(&ns->ctrl->srcu);

	if (last_path)
		nvme_mpath_shutdown_disk(ns->head);
		nvme_mpath_remove_disk(ns->head);
	nvme_put_ns(ns);
}

+10 −3
Original line number Diff line number Diff line
@@ -1410,9 +1410,8 @@ nvme_fc_xmt_disconnect_assoc(struct nvme_fc_ctrl *ctrl)
}

static void
nvme_fc_xmt_ls_rsp_done(struct nvmefc_ls_rsp *lsrsp)
nvme_fc_xmt_ls_rsp_free(struct nvmefc_ls_rcv_op *lsop)
{
	struct nvmefc_ls_rcv_op *lsop = lsrsp->nvme_fc_private;
	struct nvme_fc_rport *rport = lsop->rport;
	struct nvme_fc_lport *lport = rport->lport;
	unsigned long flags;
@@ -1433,6 +1432,14 @@ nvme_fc_xmt_ls_rsp_done(struct nvmefc_ls_rsp *lsrsp)
	nvme_fc_rport_put(rport);
}

/*
 * LS response "done" handler: the receive op is stashed in the response's
 * nvme_fc_private pointer, so recover it and hand it to
 * nvme_fc_xmt_ls_rsp_free() for teardown.
 */
static void
nvme_fc_xmt_ls_rsp_done(struct nvmefc_ls_rsp *lsrsp)
{
	nvme_fc_xmt_ls_rsp_free(lsrsp->nvme_fc_private);
}

static void
nvme_fc_xmt_ls_rsp(struct nvmefc_ls_rcv_op *lsop)
{
@@ -1450,7 +1457,7 @@ nvme_fc_xmt_ls_rsp(struct nvmefc_ls_rcv_op *lsop)
		dev_warn(lport->dev,
			"LLDD rejected LS RSP xmt: LS %d status %d\n",
			w0->ls_cmd, ret);
		nvme_fc_xmt_ls_rsp_done(lsop->lsrsp);
		nvme_fc_xmt_ls_rsp_free(lsop);
		return;
	}
}
+188 −18
Original line number Diff line number Diff line
@@ -10,10 +10,61 @@
#include "nvme.h"

bool multipath = true;
module_param(multipath, bool, 0444);
static bool multipath_always_on;

/*
 * Setter for the "multipath" module parameter.
 *
 * Parses @val as a bool into the storage behind @kp. A request to turn
 * multipath off is rejected while multipath_always_on is set: the stored
 * value is forced back to true and -EINVAL is returned.
 *
 * Returns 0 on success, the param_set_bool() error on a parse failure, or
 * -EINVAL when the disable request is refused.
 */
static int multipath_param_set(const char *val, const struct kernel_param *kp)
{
	bool *enabled = kp->arg;
	int err = param_set_bool(val, kp);

	if (err)
		return err;

	/* Nothing to veto unless multipath was just disabled under always-on. */
	if (*enabled || !multipath_always_on)
		return 0;

	pr_err("Can't disable multipath when multipath_always_on is configured.\n");
	*enabled = true;
	return -EINVAL;
}

/*
 * Parameter ops for "multipath": writes are routed through
 * multipath_param_set(), reads use the stock param_get_bool().
 */
static const struct kernel_param_ops multipath_param_ops = {
	.set = multipath_param_set,
	.get = param_get_bool,
};

/* Register "multipath" with the custom ops above; permissions are 0444. */
module_param_cb(multipath, &multipath_param_ops, &multipath, 0444);
MODULE_PARM_DESC(multipath,
	"turn on native support for multiple controllers per subsystem");

/*
 * Setter for the "multipath_always_on" module parameter.
 *
 * Parses @val as a bool into the storage behind @kp. Enabling always-on
 * also switches the global "multipath" flag on.
 *
 * Returns 0 on success or the negative error from param_set_bool().
 */
static int multipath_always_on_set(const char *val,
		const struct kernel_param *kp)
{
	bool *always_on = kp->arg;
	int err = param_set_bool(val, kp);

	if (err < 0)
		return err;

	/* always-on implies native multipath support */
	if (*always_on)
		multipath = true;
	return 0;
}

/*
 * Parameter ops for "multipath_always_on": writes are routed through
 * multipath_always_on_set(), reads use the stock param_get_bool().
 */
static const struct kernel_param_ops multipath_always_on_ops = {
	.set = multipath_always_on_set,
	.get = param_get_bool,
};

/* Register "multipath_always_on" with the custom ops; permissions are 0444. */
module_param_cb(multipath_always_on, &multipath_always_on_ops,
		&multipath_always_on, 0444);
MODULE_PARM_DESC(multipath_always_on,
	"create multipath node always except for private namespace with non-unique nsid; note that this also implicitly enables native multipath support");

static const char *nvme_iopolicy_names[] = {
	[NVME_IOPOLICY_NUMA]	= "numa",
	[NVME_IOPOLICY_RR]	= "round-robin",
@@ -442,7 +493,17 @@ static bool nvme_available_path(struct nvme_ns_head *head)
			break;
		}
	}
	return false;

	/*
	 * If "head->delayed_removal_secs" is configured (i.e., non-zero), do
	 * not immediately fail I/O. Instead, requeue the I/O for the configured
	 * duration, anticipating that if there's a transient link failure then
	 * it may recover within this time window. This parameter is exported to
	 * userspace via sysfs, and its default value is zero. It is internally
	 * mapped to NVME_NSHEAD_QUEUE_IF_NO_PATH. When delayed_removal_secs is
	 * non-zero, this flag is set to true. When zero, the flag is cleared.
	 */
	return nvme_mpath_queue_if_no_path(head);
}

static void nvme_ns_head_submit_bio(struct bio *bio)
@@ -617,6 +678,40 @@ static void nvme_requeue_work(struct work_struct *work)
	}
}

/*
 * Tear down the multipath head's char device and gendisk.
 *
 * Acts only for the caller that actually clears NVME_NSHEAD_DISK_LIVE;
 * if the bit was already clear this is a no-op.
 */
static void nvme_remove_head(struct nvme_ns_head *head)
{
	if (!test_and_clear_bit(NVME_NSHEAD_DISK_LIVE, &head->flags))
		return;

	/*
	 * Kick the requeue work only now that NVME_NSHEAD_DISK_LIVE is
	 * cleared, so that multipath can fail all I/O.
	 */
	kblockd_schedule_work(&head->requeue_work);

	nvme_cdev_del(&head->cdev, &head->cdev_device);
	synchronize_srcu(&head->srcu);
	del_gendisk(head->disk);
	/*
	 * NOTE(review): presumably drops the reference taken at the end of
	 * nvme_mpath_alloc_disk() - confirm.
	 */
	nvme_put_ns_head(head);
}

/*
 * Delayed-work handler for head->remove_work.
 *
 * Under the subsystem lock, re-checks whether the head still has no paths;
 * if so, unlinks the head from its list entry and removes it. The module
 * reference is dropped on every invocation, whether or not the head was
 * removed.
 */
static void nvme_remove_head_work(struct work_struct *work)
{
	struct delayed_work *dwork = to_delayed_work(work);
	struct nvme_ns_head *head =
		container_of(dwork, struct nvme_ns_head, remove_work);
	bool no_paths_left;

	mutex_lock(&head->subsys->lock);
	no_paths_left = list_empty(&head->list);
	if (no_paths_left)
		list_del_init(&head->entry);
	mutex_unlock(&head->subsys->lock);

	/* Actual teardown happens outside the subsystem lock. */
	if (no_paths_left)
		nvme_remove_head(head);

	module_put(THIS_MODULE);
}

int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
{
	struct queue_limits lim;
@@ -626,14 +721,25 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
	spin_lock_init(&head->requeue_lock);
	INIT_WORK(&head->requeue_work, nvme_requeue_work);
	INIT_WORK(&head->partition_scan_work, nvme_partition_scan_work);
	INIT_DELAYED_WORK(&head->remove_work, nvme_remove_head_work);
	head->delayed_removal_secs = 0;

	/*
	 * Add a multipath node if the subsystems supports multiple controllers.
	 * We also do this for private namespaces as the namespace sharing flag
	 * could change after a rescan.
	 * If "multipath_always_on" is enabled, a multipath node is added
	 * regardless of whether the disk is single/multi ported, and whether
	 * the namespace is shared or private. If "multipath_always_on" is not
	 * enabled, a multipath node is added only if the subsystem supports
	 * multiple controllers and the "multipath" option is configured. In
	 * either case, for private namespaces, we ensure that the NSID is
	 * unique.
	 */
	if (!multipath_always_on) {
		if (!(ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) ||
	    !nvme_is_unique_nsid(ctrl, head) || !multipath)
				!multipath)
			return 0;
	}

	if (!nvme_is_unique_nsid(ctrl, head))
		return 0;

	blk_set_stacking_limits(&lim);
@@ -659,6 +765,7 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
	set_bit(GD_SUPPRESS_PART_SCAN, &head->disk->state);
	sprintf(head->disk->disk_name, "nvme%dn%d",
			ctrl->subsys->instance, head->instance);
	nvme_tryget_ns_head(head);
	return 0;
}

@@ -1015,6 +1122,49 @@ static ssize_t numa_nodes_show(struct device *dev, struct device_attribute *attr
}
DEVICE_ATTR_RO(numa_nodes);

static ssize_t delayed_removal_secs_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct gendisk *disk = dev_to_disk(dev);
	struct nvme_ns_head *head = disk->private_data;
	int ret;

	mutex_lock(&head->subsys->lock);
	ret = sysfs_emit(buf, "%u\n", head->delayed_removal_secs);
	mutex_unlock(&head->subsys->lock);
	return ret;
}

/*
 * sysfs store handler for delayed_removal_secs.
 *
 * Parses @buf as an unsigned integer (base auto-detected), records it on
 * the head, and keeps NVME_NSHEAD_QUEUE_IF_NO_PATH in step with it: set
 * for a non-zero value, cleared for zero.
 *
 * Returns @count on success or the negative error from kstrtouint().
 */
static ssize_t delayed_removal_secs_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct nvme_ns_head *head = dev_to_disk(dev)->private_data;
	unsigned int secs;
	int err;

	err = kstrtouint(buf, 0, &secs);
	if (err < 0)
		return err;

	mutex_lock(&head->subsys->lock);
	head->delayed_removal_secs = secs;
	if (!secs)
		clear_bit(NVME_NSHEAD_QUEUE_IF_NO_PATH, &head->flags);
	else
		set_bit(NVME_NSHEAD_QUEUE_IF_NO_PATH, &head->flags);
	mutex_unlock(&head->subsys->lock);

	/*
	 * Make sure readers of NVME_NSHEAD_QUEUE_IF_NO_PATH observe the
	 * updated flag before we return.
	 */
	synchronize_srcu(&head->srcu);

	return count;
}

DEVICE_ATTR_RW(delayed_removal_secs);

static int nvme_lookup_ana_group_desc(struct nvme_ctrl *ctrl,
		struct nvme_ana_group_desc *desc, void *data)
{
@@ -1136,23 +1286,43 @@ void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid)
#endif
}

void nvme_mpath_shutdown_disk(struct nvme_ns_head *head)
void nvme_mpath_remove_disk(struct nvme_ns_head *head)
{
	if (!head->disk)
		return;
	if (test_and_clear_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
		nvme_cdev_del(&head->cdev, &head->cdev_device);
	bool remove = false;

	mutex_lock(&head->subsys->lock);
	/*
		 * requeue I/O after NVME_NSHEAD_DISK_LIVE has been cleared
		 * to allow multipath to fail all I/O.
	 * We are called when all paths have been removed, and at that point
	 * head->list is expected to be empty. However, nvme_ns_remove() and
	 * nvme_init_ns_head() can run concurrently and so if head->delayed_
	 * removal_secs is configured, it is possible that by the time we reach
	 * this point, head->list may no longer be empty. Therefore, we recheck
	 * head->list here. If it is no longer empty then we skip enqueuing the
	 * delayed head removal work.
	 */
		synchronize_srcu(&head->srcu);
		kblockd_schedule_work(&head->requeue_work);
		del_gendisk(head->disk);
	if (!list_empty(&head->list))
		goto out;

	if (head->delayed_removal_secs) {
		/*
		 * Ensure that no one could remove this module while the head
		 * remove work is pending.
		 */
		if (!try_module_get(THIS_MODULE))
			goto out;
		queue_delayed_work(nvme_wq, &head->remove_work,
				head->delayed_removal_secs * HZ);
	} else {
		list_del_init(&head->entry);
		remove = true;
	}
out:
	mutex_unlock(&head->subsys->lock);
	if (remove)
		nvme_remove_head(head);
}

void nvme_mpath_remove_disk(struct nvme_ns_head *head)
void nvme_mpath_put_disk(struct nvme_ns_head *head)
{
	if (!head->disk)
		return;
Loading