Commit 1521dc24 authored by Jens Axboe
Browse files

Merge tag 'nvme-6.10-2024-05-29' of git://git.infradead.org/nvme into block-6.10

Pull NVMe fixes from Keith:

"nvme fixes for Linux 6.10

 - Removing unused fields (Kanchan)
 - Large folio offsets support (Kundan)
 - Multipath NUMA node initialization fix (Nilay)
 - Multipath IO stats accounting fixes (Keith)
 - Circular lockdep fix (Keith)
 - Target race condition fix (Sagi)
 - Target memory leak fix (Sagi)"

* tag 'nvme-6.10-2024-05-29' of git://git.infradead.org/nvme:
  nvmet: fix a possible leak when destroy a ctrl during qp establishment
  nvme: use srcu for iterating namespace list
  nvme: adjust multiples of NVME_CTRL_PAGE_SIZE in offset
  nvme: remove sgs and sws
  nvmet: fix ns enable/disable possible hang
  nvme-multipath: fix io accounting on failover
  nvme: fix multipath batched completion accounting
  nvme-multipath: find NUMA path only for online numa-node
parents 74d4ce92 c758b77d
Loading
Loading
Loading
Loading
+70 −46
Original line number Diff line number Diff line
@@ -414,7 +414,15 @@ static inline void nvme_end_req_zoned(struct request *req)
	}
}

static inline void nvme_end_req(struct request *req)
static inline void __nvme_end_req(struct request *req)
{
	nvme_end_req_zoned(req);
	nvme_trace_bio_complete(req);
	if (req->cmd_flags & REQ_NVME_MPATH)
		nvme_mpath_end_request(req);
}

void nvme_end_req(struct request *req)
{
	blk_status_t status = nvme_error_status(nvme_req(req)->status);

@@ -424,10 +432,7 @@ static inline void nvme_end_req(struct request *req)
		else
			nvme_log_error(req);
	}
	nvme_end_req_zoned(req);
	nvme_trace_bio_complete(req);
	if (req->cmd_flags & REQ_NVME_MPATH)
		nvme_mpath_end_request(req);
	__nvme_end_req(req);
	blk_mq_end_request(req, status);
}

@@ -476,7 +481,7 @@ void nvme_complete_batch_req(struct request *req)
{
	trace_nvme_complete_rq(req);
	nvme_cleanup_cmd(req);
	nvme_end_req_zoned(req);
	__nvme_end_req(req);
}
EXPORT_SYMBOL_GPL(nvme_complete_batch_req);

@@ -673,7 +678,7 @@ static void nvme_free_ns(struct kref *kref)
	kfree(ns);
}

static inline bool nvme_get_ns(struct nvme_ns *ns)
bool nvme_get_ns(struct nvme_ns *ns)
{
	return kref_get_unless_zero(&ns->kref);
}
@@ -3679,9 +3684,10 @@ static int nvme_init_ns_head(struct nvme_ns *ns, struct nvme_ns_info *info)
struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
{
	struct nvme_ns *ns, *ret = NULL;
	int srcu_idx;

	down_read(&ctrl->namespaces_rwsem);
	list_for_each_entry(ns, &ctrl->namespaces, list) {
	srcu_idx = srcu_read_lock(&ctrl->srcu);
	list_for_each_entry_rcu(ns, &ctrl->namespaces, list) {
		if (ns->head->ns_id == nsid) {
			if (!nvme_get_ns(ns))
				continue;
@@ -3691,7 +3697,7 @@ struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
		if (ns->head->ns_id > nsid)
			break;
	}
	up_read(&ctrl->namespaces_rwsem);
	srcu_read_unlock(&ctrl->srcu, srcu_idx);
	return ret;
}
EXPORT_SYMBOL_NS_GPL(nvme_find_get_ns, NVME_TARGET_PASSTHRU);
@@ -3705,7 +3711,7 @@ static void nvme_ns_add_to_ctrl_list(struct nvme_ns *ns)

	list_for_each_entry_reverse(tmp, &ns->ctrl->namespaces, list) {
		if (tmp->head->ns_id < ns->head->ns_id) {
			list_add(&ns->list, &tmp->list);
			list_add_rcu(&ns->list, &tmp->list);
			return;
		}
	}
@@ -3771,17 +3777,18 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info)
	if (nvme_update_ns_info(ns, info))
		goto out_unlink_ns;

	down_write(&ctrl->namespaces_rwsem);
	mutex_lock(&ctrl->namespaces_lock);
	/*
	 * Ensure that no namespaces are added to the ctrl list after the queues
	 * are frozen, thereby avoiding a deadlock between scan and reset.
	 */
	if (test_bit(NVME_CTRL_FROZEN, &ctrl->flags)) {
		up_write(&ctrl->namespaces_rwsem);
		mutex_unlock(&ctrl->namespaces_lock);
		goto out_unlink_ns;
	}
	nvme_ns_add_to_ctrl_list(ns);
	up_write(&ctrl->namespaces_rwsem);
	mutex_unlock(&ctrl->namespaces_lock);
	synchronize_srcu(&ctrl->srcu);
	nvme_get_ctrl(ctrl);

	if (device_add_disk(ctrl->device, ns->disk, nvme_ns_attr_groups))
@@ -3804,9 +3811,10 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info)

 out_cleanup_ns_from_list:
	nvme_put_ctrl(ctrl);
	down_write(&ctrl->namespaces_rwsem);
	list_del_init(&ns->list);
	up_write(&ctrl->namespaces_rwsem);
	mutex_lock(&ctrl->namespaces_lock);
	list_del_rcu(&ns->list);
	mutex_unlock(&ctrl->namespaces_lock);
	synchronize_srcu(&ctrl->srcu);
 out_unlink_ns:
	mutex_lock(&ctrl->subsys->lock);
	list_del_rcu(&ns->siblings);
@@ -3856,9 +3864,10 @@ static void nvme_ns_remove(struct nvme_ns *ns)
		nvme_cdev_del(&ns->cdev, &ns->cdev_device);
	del_gendisk(ns->disk);

	down_write(&ns->ctrl->namespaces_rwsem);
	list_del_init(&ns->list);
	up_write(&ns->ctrl->namespaces_rwsem);
	mutex_lock(&ns->ctrl->namespaces_lock);
	list_del_rcu(&ns->list);
	mutex_unlock(&ns->ctrl->namespaces_lock);
	synchronize_srcu(&ns->ctrl->srcu);

	if (last_path)
		nvme_mpath_shutdown_disk(ns->head);
@@ -3948,16 +3957,17 @@ static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
	struct nvme_ns *ns, *next;
	LIST_HEAD(rm_list);

	down_write(&ctrl->namespaces_rwsem);
	mutex_lock(&ctrl->namespaces_lock);
	list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) {
		if (ns->head->ns_id > nsid)
			list_move_tail(&ns->list, &rm_list);
			list_splice_init_rcu(&ns->list, &rm_list,
					     synchronize_rcu);
	}
	up_write(&ctrl->namespaces_rwsem);
	mutex_unlock(&ctrl->namespaces_lock);
	synchronize_srcu(&ctrl->srcu);

	list_for_each_entry_safe(ns, next, &rm_list, list)
		nvme_ns_remove(ns);

}

static int nvme_scan_ns_list(struct nvme_ctrl *ctrl)
@@ -4127,9 +4137,10 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
	/* this is a no-op when called from the controller reset handler */
	nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING_NOIO);

	down_write(&ctrl->namespaces_rwsem);
	list_splice_init(&ctrl->namespaces, &ns_list);
	up_write(&ctrl->namespaces_rwsem);
	mutex_lock(&ctrl->namespaces_lock);
	list_splice_init_rcu(&ctrl->namespaces, &ns_list, synchronize_rcu);
	mutex_unlock(&ctrl->namespaces_lock);
	synchronize_srcu(&ctrl->srcu);

	list_for_each_entry_safe(ns, next, &ns_list, list)
		nvme_ns_remove(ns);
@@ -4577,6 +4588,7 @@ static void nvme_free_ctrl(struct device *dev)
	key_put(ctrl->tls_key);
	nvme_free_cels(ctrl);
	nvme_mpath_uninit(ctrl);
	cleanup_srcu_struct(&ctrl->srcu);
	nvme_auth_stop(ctrl);
	nvme_auth_free(ctrl);
	__free_page(ctrl->discard_page);
@@ -4609,10 +4621,15 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
	ctrl->passthru_err_log_enabled = false;
	clear_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
	spin_lock_init(&ctrl->lock);
	mutex_init(&ctrl->namespaces_lock);

	ret = init_srcu_struct(&ctrl->srcu);
	if (ret)
		return ret;

	mutex_init(&ctrl->scan_lock);
	INIT_LIST_HEAD(&ctrl->namespaces);
	xa_init(&ctrl->cels);
	init_rwsem(&ctrl->namespaces_rwsem);
	ctrl->dev = dev;
	ctrl->ops = ops;
	ctrl->quirks = quirks;
@@ -4692,6 +4709,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
out:
	if (ctrl->discard_page)
		__free_page(ctrl->discard_page);
	cleanup_srcu_struct(&ctrl->srcu);
	return ret;
}
EXPORT_SYMBOL_GPL(nvme_init_ctrl);
@@ -4700,22 +4718,24 @@ EXPORT_SYMBOL_GPL(nvme_init_ctrl);
void nvme_mark_namespaces_dead(struct nvme_ctrl *ctrl)
{
	struct nvme_ns *ns;
	int srcu_idx;

	down_read(&ctrl->namespaces_rwsem);
	list_for_each_entry(ns, &ctrl->namespaces, list)
	srcu_idx = srcu_read_lock(&ctrl->srcu);
	list_for_each_entry_rcu(ns, &ctrl->namespaces, list)
		blk_mark_disk_dead(ns->disk);
	up_read(&ctrl->namespaces_rwsem);
	srcu_read_unlock(&ctrl->srcu, srcu_idx);
}
EXPORT_SYMBOL_GPL(nvme_mark_namespaces_dead);

void nvme_unfreeze(struct nvme_ctrl *ctrl)
{
	struct nvme_ns *ns;
	int srcu_idx;

	down_read(&ctrl->namespaces_rwsem);
	list_for_each_entry(ns, &ctrl->namespaces, list)
	srcu_idx = srcu_read_lock(&ctrl->srcu);
	list_for_each_entry_rcu(ns, &ctrl->namespaces, list)
		blk_mq_unfreeze_queue(ns->queue);
	up_read(&ctrl->namespaces_rwsem);
	srcu_read_unlock(&ctrl->srcu, srcu_idx);
	clear_bit(NVME_CTRL_FROZEN, &ctrl->flags);
}
EXPORT_SYMBOL_GPL(nvme_unfreeze);
@@ -4723,14 +4743,15 @@ EXPORT_SYMBOL_GPL(nvme_unfreeze);
int nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout)
{
	struct nvme_ns *ns;
	int srcu_idx;

	down_read(&ctrl->namespaces_rwsem);
	list_for_each_entry(ns, &ctrl->namespaces, list) {
	srcu_idx = srcu_read_lock(&ctrl->srcu);
	list_for_each_entry_rcu(ns, &ctrl->namespaces, list) {
		timeout = blk_mq_freeze_queue_wait_timeout(ns->queue, timeout);
		if (timeout <= 0)
			break;
	}
	up_read(&ctrl->namespaces_rwsem);
	srcu_read_unlock(&ctrl->srcu, srcu_idx);
	return timeout;
}
EXPORT_SYMBOL_GPL(nvme_wait_freeze_timeout);
@@ -4738,23 +4759,25 @@ EXPORT_SYMBOL_GPL(nvme_wait_freeze_timeout);
void nvme_wait_freeze(struct nvme_ctrl *ctrl)
{
	struct nvme_ns *ns;
	int srcu_idx;

	down_read(&ctrl->namespaces_rwsem);
	list_for_each_entry(ns, &ctrl->namespaces, list)
	srcu_idx = srcu_read_lock(&ctrl->srcu);
	list_for_each_entry_rcu(ns, &ctrl->namespaces, list)
		blk_mq_freeze_queue_wait(ns->queue);
	up_read(&ctrl->namespaces_rwsem);
	srcu_read_unlock(&ctrl->srcu, srcu_idx);
}
EXPORT_SYMBOL_GPL(nvme_wait_freeze);

void nvme_start_freeze(struct nvme_ctrl *ctrl)
{
	struct nvme_ns *ns;
	int srcu_idx;

	set_bit(NVME_CTRL_FROZEN, &ctrl->flags);
	down_read(&ctrl->namespaces_rwsem);
	list_for_each_entry(ns, &ctrl->namespaces, list)
	srcu_idx = srcu_read_lock(&ctrl->srcu);
	list_for_each_entry_rcu(ns, &ctrl->namespaces, list)
		blk_freeze_queue_start(ns->queue);
	up_read(&ctrl->namespaces_rwsem);
	srcu_read_unlock(&ctrl->srcu, srcu_idx);
}
EXPORT_SYMBOL_GPL(nvme_start_freeze);

@@ -4797,11 +4820,12 @@ EXPORT_SYMBOL_GPL(nvme_unquiesce_admin_queue);
void nvme_sync_io_queues(struct nvme_ctrl *ctrl)
{
	struct nvme_ns *ns;
	int srcu_idx;

	down_read(&ctrl->namespaces_rwsem);
	list_for_each_entry(ns, &ctrl->namespaces, list)
	srcu_idx = srcu_read_lock(&ctrl->srcu);
	list_for_each_entry_rcu(ns, &ctrl->namespaces, list)
		blk_sync_queue(ns->queue);
	up_read(&ctrl->namespaces_rwsem);
	srcu_read_unlock(&ctrl->srcu, srcu_idx);
}
EXPORT_SYMBOL_GPL(nvme_sync_io_queues);

+9 −6
Original line number Diff line number Diff line
@@ -789,15 +789,15 @@ static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp,
		bool open_for_write)
{
	struct nvme_ns *ns;
	int ret;
	int ret, srcu_idx;

	down_read(&ctrl->namespaces_rwsem);
	srcu_idx = srcu_read_lock(&ctrl->srcu);
	if (list_empty(&ctrl->namespaces)) {
		ret = -ENOTTY;
		goto out_unlock;
	}

	ns = list_first_entry(&ctrl->namespaces, struct nvme_ns, list);
	ns = list_first_or_null_rcu(&ctrl->namespaces, struct nvme_ns, list);
	if (ns != list_last_entry(&ctrl->namespaces, struct nvme_ns, list)) {
		dev_warn(ctrl->device,
			"NVME_IOCTL_IO_CMD not supported when multiple namespaces present!\n");
@@ -807,15 +807,18 @@ static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp,

	dev_warn(ctrl->device,
		"using deprecated NVME_IOCTL_IO_CMD ioctl on the char device!\n");
	kref_get(&ns->kref);
	up_read(&ctrl->namespaces_rwsem);
	if (!nvme_get_ns(ns)) {
		ret = -ENXIO;
		goto out_unlock;
	}
	srcu_read_unlock(&ctrl->srcu, srcu_idx);

	ret = nvme_user_cmd(ctrl, ns, argp, 0, open_for_write);
	nvme_put_ns(ns);
	return ret;

out_unlock:
	up_read(&ctrl->namespaces_rwsem);
	srcu_read_unlock(&ctrl->srcu, srcu_idx);
	return ret;
}

+15 −11
Original line number Diff line number Diff line
@@ -118,7 +118,8 @@ void nvme_failover_req(struct request *req)
	blk_steal_bios(&ns->head->requeue_list, req);
	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);

	blk_mq_end_request(req, 0);
	nvme_req(req)->status = 0;
	nvme_end_req(req);
	kblockd_schedule_work(&ns->head->requeue_work);
}

@@ -150,16 +151,17 @@ void nvme_mpath_end_request(struct request *rq)
void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
{
	struct nvme_ns *ns;
	int srcu_idx;

	down_read(&ctrl->namespaces_rwsem);
	list_for_each_entry(ns, &ctrl->namespaces, list) {
	srcu_idx = srcu_read_lock(&ctrl->srcu);
	list_for_each_entry_rcu(ns, &ctrl->namespaces, list) {
		if (!ns->head->disk)
			continue;
		kblockd_schedule_work(&ns->head->requeue_work);
		if (nvme_ctrl_state(ns->ctrl) == NVME_CTRL_LIVE)
			disk_uevent(ns->head->disk, KOBJ_CHANGE);
	}
	up_read(&ctrl->namespaces_rwsem);
	srcu_read_unlock(&ctrl->srcu, srcu_idx);
}

static const char *nvme_ana_state_names[] = {
@@ -193,13 +195,14 @@ bool nvme_mpath_clear_current_path(struct nvme_ns *ns)
void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl)
{
	struct nvme_ns *ns;
	int srcu_idx;

	down_read(&ctrl->namespaces_rwsem);
	list_for_each_entry(ns, &ctrl->namespaces, list) {
	srcu_idx = srcu_read_lock(&ctrl->srcu);
	list_for_each_entry_rcu(ns, &ctrl->namespaces, list) {
		nvme_mpath_clear_current_path(ns);
		kblockd_schedule_work(&ns->head->requeue_work);
	}
	up_read(&ctrl->namespaces_rwsem);
	srcu_read_unlock(&ctrl->srcu, srcu_idx);
}

void nvme_mpath_revalidate_paths(struct nvme_ns *ns)
@@ -595,7 +598,7 @@ static void nvme_mpath_set_live(struct nvme_ns *ns)
		int node, srcu_idx;

		srcu_idx = srcu_read_lock(&head->srcu);
		for_each_node(node)
		for_each_online_node(node)
			__nvme_find_path(head, node);
		srcu_read_unlock(&head->srcu, srcu_idx);
	}
@@ -680,6 +683,7 @@ static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
	u32 nr_nsids = le32_to_cpu(desc->nnsids), n = 0;
	unsigned *nr_change_groups = data;
	struct nvme_ns *ns;
	int srcu_idx;

	dev_dbg(ctrl->device, "ANA group %d: %s.\n",
			le32_to_cpu(desc->grpid),
@@ -691,8 +695,8 @@ static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
	if (!nr_nsids)
		return 0;

	down_read(&ctrl->namespaces_rwsem);
	list_for_each_entry(ns, &ctrl->namespaces, list) {
	srcu_idx = srcu_read_lock(&ctrl->srcu);
	list_for_each_entry_rcu(ns, &ctrl->namespaces, list) {
		unsigned nsid;
again:
		nsid = le32_to_cpu(desc->nsids[n]);
@@ -705,7 +709,7 @@ static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
		if (ns->head->ns_id > nsid)
			goto again;
	}
	up_read(&ctrl->namespaces_rwsem);
	srcu_read_unlock(&ctrl->srcu, srcu_idx);
	return 0;
}

+4 −3
Original line number Diff line number Diff line
@@ -282,7 +282,8 @@ struct nvme_ctrl {
	struct blk_mq_tag_set *tagset;
	struct blk_mq_tag_set *admin_tagset;
	struct list_head namespaces;
	struct rw_semaphore namespaces_rwsem;
	struct mutex namespaces_lock;
	struct srcu_struct srcu;
	struct device ctrl_device;
	struct device *device;	/* char device */
#ifdef CONFIG_NVME_HWMON
@@ -471,8 +472,6 @@ struct nvme_ns_head {
	u8			pi_type;
	u8			pi_offset;
	u8			guard_type;
	u16			sgs;
	u32			sws;
#ifdef CONFIG_BLK_DEV_ZONED
	u64			zsze;
#endif
@@ -767,6 +766,7 @@ static inline bool nvme_state_terminal(struct nvme_ctrl *ctrl)
	}
}

void nvme_end_req(struct request *req);
void nvme_complete_rq(struct request *req);
void nvme_complete_batch_req(struct request *req);

@@ -1161,6 +1161,7 @@ void nvme_passthru_end(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u32 effects,
		       struct nvme_command *cmd, int status);
struct nvme_ctrl *nvme_ctrl_from_file(struct file *file);
struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid);
bool nvme_get_ns(struct nvme_ns *ns);
void nvme_put_ns(struct nvme_ns *ns);

static inline bool nvme_multi_css(struct nvme_ctrl *ctrl)
+2 −1
Original line number Diff line number Diff line
@@ -778,7 +778,8 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
		struct bio_vec bv = req_bvec(req);

		if (!is_pci_p2pdma_page(bv.bv_page)) {
			if (bv.bv_offset + bv.bv_len <= NVME_CTRL_PAGE_SIZE * 2)
			if ((bv.bv_offset & (NVME_CTRL_PAGE_SIZE - 1)) +
			     bv.bv_len <= NVME_CTRL_PAGE_SIZE * 2)
				return nvme_setup_prp_simple(dev, req,
							     &cmnd->rw, &bv);

Loading