Commit 4dbd2b2e authored by Nilay Shroff's avatar Nilay Shroff Committed by Keith Busch
Browse files

nvme-multipath: Add visibility for round-robin io-policy



This patch adds nvme native multipath visibility for the round-robin
io-policy. It creates a "multipath" sysfs directory under the head gendisk
device node directory and, inside that "multipath" directory, adds a link
to each namespace path device the head node refers to.

For instance, if we have a shared namespace accessible from two different
controllers/paths then we create a soft link to each path device from head
disk node as shown below:

$ ls -l /sys/block/nvme1n1/multipath/
nvme1c1n1 -> ../../../../../pci052e:78/052e:78:00.0/nvme/nvme1/nvme1c1n1
nvme1c3n1 -> ../../../../../pci058e:78/058e:78:00.0/nvme/nvme3/nvme1c3n1

In the above example, nvme1n1 is head gendisk node created for a shared
namespace and the namespace is accessible from nvme1c1n1 and nvme1c3n1
paths.

For round-robin I/O policy, we could easily infer from the above output
that I/O workload targeted to nvme1n1 would toggle across paths nvme1c1n1
and nvme1c3n1.

Reviewed-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Nilay Shroff <nilay@linux.ibm.com>
Signed-off-by: Keith Busch <kbusch@kernel.org>
parent 316dabe6
Loading
Loading
Loading
Loading
+3 −0
Original line number Diff line number Diff line
@@ -4020,6 +4020,9 @@ static void nvme_ns_remove(struct nvme_ns *ns)

	if (!nvme_ns_head_multipath(ns->head))
		nvme_cdev_del(&ns->cdev, &ns->cdev_device);

	nvme_mpath_remove_sysfs_link(ns);

	del_gendisk(ns->disk);

	mutex_lock(&ns->ctrl->namespaces_lock);
+99 −0
Original line number Diff line number Diff line
@@ -686,6 +686,8 @@ static void nvme_mpath_set_live(struct nvme_ns *ns)
		kblockd_schedule_work(&head->partition_scan_work);
	}

	nvme_mpath_add_sysfs_link(ns->head);

	mutex_lock(&head->lock);
	if (nvme_path_is_optimized(ns)) {
		int node, srcu_idx;
@@ -768,6 +770,25 @@ static void nvme_update_ns_ana_state(struct nvme_ana_group_desc *desc,
	if (nvme_state_is_live(ns->ana_state) &&
	    nvme_ctrl_state(ns->ctrl) == NVME_CTRL_LIVE)
		nvme_mpath_set_live(ns);
	else {
		/*
		 * Add sysfs link from multipath head gendisk node to path
		 * device gendisk node.
		 * If path's ana state is live (i.e. state is either optimized
		 * or non-optimized) while we alloc the ns then sysfs link would
		 * be created from nvme_mpath_set_live(). In that case we would
		 * not fallthrough this code path. However for the path's ana
		 * state other than live, we call nvme_mpath_set_live() only
		 * after ana state transitioned to the live state. But we still
		 * want to create the sysfs link from head node to a path device
		 * irrespective of the path's ana state.
		 * If we reach through here then it means that path's ana state
		 * is not live but still create the sysfs link to this path from
		 * head node if head node of the path has already come alive.
		 */
		if (test_bit(NVME_NSHEAD_DISK_LIVE, &ns->head->flags))
			nvme_mpath_add_sysfs_link(ns->head);
	}
}

static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
@@ -967,6 +988,84 @@ static int nvme_lookup_ana_group_desc(struct nvme_ctrl *ctrl,
	return -ENXIO; /* just break out of the loop */
}

/*
 * nvme_mpath_add_sysfs_link - create the per-path links for a multipath head
 * @head: namespace head whose paths should be linked
 *
 * Walks every namespace path chained on @head->list and, for each path that
 * does not have one yet, adds a symlink to the path's gendisk device inside
 * the "multipath" attribute group of the head gendisk device.
 *
 * Nothing is done if the head disk has not been added yet. A path whose own
 * disk has not been added yet is skipped without being marked, so that a
 * later call can still create its link.
 */
void nvme_mpath_add_sysfs_link(struct nvme_ns_head *head)
{
	struct device *target;
	int rc, srcu_idx;
	struct nvme_ns *ns;
	struct kobject *kobj;

	/*
	 * Ensure head disk node is already added otherwise we may get invalid
	 * kobj for head disk node
	 */
	if (!test_bit(GD_ADDED, &head->disk->state))
		return;

	kobj = &disk_to_dev(head->disk)->kobj;

	/*
	 * loop through each ns chained through the head->list and create the
	 * sysfs link from head node to the ns path node
	 */
	srcu_idx = srcu_read_lock(&head->srcu);

	list_for_each_entry_rcu(ns, &head->list, siblings) {
		/*
		 * Ensure that ns path disk node is already added otherwise we
		 * may get invalid kobj name for target.
		 * This must be checked before NVME_NS_SYSFS_ATTR_LINK is set:
		 * setting the flag first and then skipping the path would
		 * leave the flag set without a link, so no later call would
		 * ever create the link and nvme_mpath_remove_sysfs_link()
		 * would try to remove a link that was never added.
		 */
		if (!test_bit(GD_ADDED, &ns->disk->state))
			continue;

		/*
		 * Avoid creating link if it already exists for the given path.
		 * When path ana state transitions from optimized to non-
		 * optimized or vice-versa, the nvme_mpath_set_live() is
		 * invoked which in turn calls this function. Now if the sysfs
		 * link already exists for the given path and we attempt to re-
		 * create the link then sysfs code would warn about it loudly.
		 * So we evaluate NVME_NS_SYSFS_ATTR_LINK flag here to ensure
		 * that we're not creating duplicate link.
		 * The test_and_set_bit() is used because it is protecting
		 * against multiple nvme paths being simultaneously added.
		 */
		if (test_and_set_bit(NVME_NS_SYSFS_ATTR_LINK, &ns->flags))
			continue;

		target = disk_to_dev(ns->disk);
		/*
		 * Create sysfs link from head gendisk kobject @kobj to the
		 * ns path gendisk kobject @target->kobj.
		 */
		rc = sysfs_add_link_to_group(kobj, nvme_ns_mpath_attr_group.name,
				&target->kobj, dev_name(target));
		if (unlikely(rc)) {
			dev_err(disk_to_dev(ns->head->disk),
					"failed to create link to %s\n",
					dev_name(target));
			/* Undo the marking so a later call can retry. */
			clear_bit(NVME_NS_SYSFS_ATTR_LINK, &ns->flags);
		}
	}

	srcu_read_unlock(&head->srcu, srcu_idx);
}

/*
 * nvme_mpath_remove_sysfs_link - drop the link to one path from its head
 * @ns: namespace path whose link should be removed
 *
 * Only acts when NVME_NS_SYSFS_ATTR_LINK is set in @ns->flags. In that case
 * the link named after the path's gendisk device is removed from the
 * "multipath" group of the head gendisk device, and the flag is cleared
 * afterwards.
 */
void nvme_mpath_remove_sysfs_link(struct nvme_ns *ns)
{
	struct kobject *head_kobj;
	struct device *path_dev;

	if (test_bit(NVME_NS_SYSFS_ATTR_LINK, &ns->flags)) {
		path_dev = disk_to_dev(ns->disk);
		head_kobj = &disk_to_dev(ns->head->disk)->kobj;

		sysfs_remove_link_from_group(head_kobj,
				nvme_ns_mpath_attr_group.name,
				dev_name(path_dev));

		clear_bit(NVME_NS_SYSFS_ATTR_LINK, &ns->flags);
	}
}

void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid)
{
	if (nvme_ctrl_use_ana(ns->ctrl)) {
+14 −4
Original line number Diff line number Diff line
@@ -538,6 +538,7 @@ struct nvme_ns {
#define NVME_NS_ANA_PENDING		2
#define NVME_NS_FORCE_RO		3
#define NVME_NS_READY			4
#define NVME_NS_SYSFS_ATTR_LINK	5

	struct cdev		cdev;
	struct device		cdev_device;
@@ -933,6 +934,7 @@ int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo);
int nvme_dev_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags);

extern const struct attribute_group *nvme_ns_attr_groups[];
extern const struct attribute_group nvme_ns_mpath_attr_group;
extern const struct pr_ops nvme_pr_ops;
extern const struct block_device_operations nvme_ns_head_ops;
extern const struct attribute_group nvme_dev_attrs_group;
@@ -955,6 +957,8 @@ void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys);
void nvme_failover_req(struct request *req);
void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl);
int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head);
/* Operates on the namespace head (all of its paths), not on a single path. */
void nvme_mpath_add_sysfs_link(struct nvme_ns_head *head);
void nvme_mpath_remove_sysfs_link(struct nvme_ns *ns);
void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid);
void nvme_mpath_remove_disk(struct nvme_ns_head *head);
int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id);
@@ -1009,6 +1013,12 @@ static inline void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid)
/*
 * No-op variant of nvme_mpath_remove_disk().
 * NOTE(review): presumably the stub used when CONFIG_NVME_MULTIPATH is
 * disabled; the guarding preprocessor conditional is not visible here.
 */
static inline void nvme_mpath_remove_disk(struct nvme_ns_head *head)
{
}
/*
 * No-op variant of nvme_mpath_add_sysfs_link(). It takes the namespace head,
 * the same argument type as the real implementation, so that both variants
 * share one signature.
 * NOTE(review): presumably the stub used when CONFIG_NVME_MULTIPATH is
 * disabled; the guarding preprocessor conditional is not visible here.
 */
static inline void nvme_mpath_add_sysfs_link(struct nvme_ns_head *head)
{
}
/*
 * No-op variant of nvme_mpath_remove_sysfs_link().
 * NOTE(review): presumably the stub used when CONFIG_NVME_MULTIPATH is
 * disabled; the guarding preprocessor conditional is not visible here.
 */
static inline void nvme_mpath_remove_sysfs_link(struct nvme_ns *ns)
{
}
static inline bool nvme_mpath_clear_current_path(struct nvme_ns *ns)
{
	return false;
+14 −0
Original line number Diff line number Diff line
@@ -299,8 +299,22 @@ static const struct attribute_group nvme_ns_attr_group = {
	.is_visible	= nvme_ns_attrs_are_visible,
};

#ifdef CONFIG_NVME_MULTIPATH
/*
 * The "multipath" group carries no attributes of its own (the list holds
 * only the NULL terminator). Its name is what nvme_mpath_add_sysfs_link()
 * and nvme_mpath_remove_sysfs_link() pass to the sysfs group-link helpers
 * to add and remove the per-path links.
 */
static struct attribute *nvme_ns_mpath_attrs[] = {
	NULL,
};

/* Non-static: referenced by name from the multipath code. */
const struct attribute_group nvme_ns_mpath_attr_group = {
	.name           = "multipath",
	.attrs		= nvme_ns_mpath_attrs,
};
#endif

/*
 * NULL-terminated list of namespace attribute groups. The "multipath" group
 * is part of the list only when CONFIG_NVME_MULTIPATH is enabled.
 */
const struct attribute_group *nvme_ns_attr_groups[] = {
	&nvme_ns_attr_group,
#ifdef CONFIG_NVME_MULTIPATH
	&nvme_ns_mpath_attr_group,
#endif
	NULL,
};