Commit e6e7f7ac authored by Keith Busch's avatar Keith Busch
Browse files

nvme: ensure reset state check ordering



A different CPU may be setting the ctrl->state value, so ensure proper
barriers to prevent optimizing to a stale state. Normally it isn't a
problem to observe the wrong state as it is merely advisory to take a
quicker path during initialization and error recovery, but seeing an old
state can report unexpected ENETRESET errors when a reset request was in
fact successful.

Reported-by: default avatarMinh Hoang <mh2022@meta.com>
Reviewed-by: default avatarSagi Grimberg <sagi@grimberg.me>
Signed-off-by: default avatarKeith Busch <kbusch@kernel.org>
Signed-off-by: default avatarHannes Reinecke <hare@suse.de>
parent 5c687c28
Loading
Loading
Loading
Loading
+22 −20
Original line number Diff line number Diff line
@@ -131,7 +131,7 @@ void nvme_queue_scan(struct nvme_ctrl *ctrl)
	/*
	 * Only new queue scan work when admin and IO queues are both alive
	 */
	if (ctrl->state == NVME_CTRL_LIVE && ctrl->tagset)
	if (nvme_ctrl_state(ctrl) == NVME_CTRL_LIVE && ctrl->tagset)
		queue_work(nvme_wq, &ctrl->scan_work);
}

@@ -143,7 +143,7 @@ void nvme_queue_scan(struct nvme_ctrl *ctrl)
 */
int nvme_try_sched_reset(struct nvme_ctrl *ctrl)
{
	if (ctrl->state != NVME_CTRL_RESETTING)
	if (nvme_ctrl_state(ctrl) != NVME_CTRL_RESETTING)
		return -EBUSY;
	if (!queue_work(nvme_reset_wq, &ctrl->reset_work))
		return -EBUSY;
@@ -156,7 +156,7 @@ static void nvme_failfast_work(struct work_struct *work)
	struct nvme_ctrl *ctrl = container_of(to_delayed_work(work),
			struct nvme_ctrl, failfast_work);

	if (ctrl->state != NVME_CTRL_CONNECTING)
	if (nvme_ctrl_state(ctrl) != NVME_CTRL_CONNECTING)
		return;

	set_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
@@ -200,7 +200,7 @@ int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl)
	ret = nvme_reset_ctrl(ctrl);
	if (!ret) {
		flush_work(&ctrl->reset_work);
		if (ctrl->state != NVME_CTRL_LIVE)
		if (nvme_ctrl_state(ctrl) != NVME_CTRL_LIVE)
			ret = -ENETRESET;
	}

@@ -499,7 +499,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,

	spin_lock_irqsave(&ctrl->lock, flags);

	old_state = ctrl->state;
	old_state = nvme_ctrl_state(ctrl);
	switch (new_state) {
	case NVME_CTRL_LIVE:
		switch (old_state) {
@@ -567,7 +567,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
	}

	if (changed) {
		ctrl->state = new_state;
		WRITE_ONCE(ctrl->state, new_state);
		wake_up_all(&ctrl->state_wq);
	}

@@ -575,11 +575,11 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
	if (!changed)
		return false;

	if (ctrl->state == NVME_CTRL_LIVE) {
	if (new_state == NVME_CTRL_LIVE) {
		if (old_state == NVME_CTRL_CONNECTING)
			nvme_stop_failfast_work(ctrl);
		nvme_kick_requeue_lists(ctrl);
	} else if (ctrl->state == NVME_CTRL_CONNECTING &&
	} else if (new_state == NVME_CTRL_CONNECTING &&
		old_state == NVME_CTRL_RESETTING) {
		nvme_start_failfast_work(ctrl);
	}
@@ -592,7 +592,7 @@ EXPORT_SYMBOL_GPL(nvme_change_ctrl_state);
 */
static bool nvme_state_terminal(struct nvme_ctrl *ctrl)
{
	switch (ctrl->state) {
	switch (nvme_ctrl_state(ctrl)) {
	case NVME_CTRL_NEW:
	case NVME_CTRL_LIVE:
	case NVME_CTRL_RESETTING:
@@ -617,7 +617,7 @@ bool nvme_wait_reset(struct nvme_ctrl *ctrl)
	wait_event(ctrl->state_wq,
		   nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING) ||
		   nvme_state_terminal(ctrl));
	return ctrl->state == NVME_CTRL_RESETTING;
	return nvme_ctrl_state(ctrl) == NVME_CTRL_RESETTING;
}
EXPORT_SYMBOL_GPL(nvme_wait_reset);

@@ -704,9 +704,11 @@ EXPORT_SYMBOL_GPL(nvme_init_request);
blk_status_t nvme_fail_nonready_command(struct nvme_ctrl *ctrl,
		struct request *rq)
{
	if (ctrl->state != NVME_CTRL_DELETING_NOIO &&
	    ctrl->state != NVME_CTRL_DELETING &&
	    ctrl->state != NVME_CTRL_DEAD &&
	enum nvme_ctrl_state state = nvme_ctrl_state(ctrl);

	if (state != NVME_CTRL_DELETING_NOIO &&
	    state != NVME_CTRL_DELETING &&
	    state != NVME_CTRL_DEAD &&
	    !test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags) &&
	    !blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH))
		return BLK_STS_RESOURCE;
@@ -736,7 +738,7 @@ bool __nvme_check_ready(struct nvme_ctrl *ctrl, struct request *rq,
		 * command, which is require to set the queue live in the
		 * appropinquate states.
		 */
		switch (ctrl->state) {
		switch (nvme_ctrl_state(ctrl)) {
		case NVME_CTRL_CONNECTING:
			if (blk_rq_is_passthrough(rq) && nvme_is_fabrics(req->cmd) &&
			    (req->cmd->fabrics.fctype == nvme_fabrics_type_connect ||
@@ -2550,7 +2552,7 @@ static void nvme_set_latency_tolerance(struct device *dev, s32 val)

	if (ctrl->ps_max_latency_us != latency) {
		ctrl->ps_max_latency_us = latency;
		if (ctrl->state == NVME_CTRL_LIVE)
		if (nvme_ctrl_state(ctrl) == NVME_CTRL_LIVE)
			nvme_configure_apst(ctrl);
	}
}
@@ -3238,7 +3240,7 @@ static int nvme_dev_open(struct inode *inode, struct file *file)
	struct nvme_ctrl *ctrl =
		container_of(inode->i_cdev, struct nvme_ctrl, cdev);

	switch (ctrl->state) {
	switch (nvme_ctrl_state(ctrl)) {
	case NVME_CTRL_LIVE:
		break;
	default:
@@ -3924,7 +3926,7 @@ static void nvme_scan_work(struct work_struct *work)
	int ret;

	/* No tagset on a live ctrl means IO queues could not created */
	if (ctrl->state != NVME_CTRL_LIVE || !ctrl->tagset)
	if (nvme_ctrl_state(ctrl) != NVME_CTRL_LIVE || !ctrl->tagset)
		return;

	/*
@@ -3994,7 +3996,7 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
	 * removing the namespaces' disks; fail all the queues now to avoid
	 * potentially having to clean up the failed sync later.
	 */
	if (ctrl->state == NVME_CTRL_DEAD)
	if (nvme_ctrl_state(ctrl) == NVME_CTRL_DEAD)
		nvme_mark_namespaces_dead(ctrl);

	/* this is a no-op when called from the controller reset handler */
@@ -4076,7 +4078,7 @@ static void nvme_async_event_work(struct work_struct *work)
	 * flushing ctrl async_event_work after changing the controller state
	 * from LIVE and before freeing the admin queue.
	*/
	if (ctrl->state == NVME_CTRL_LIVE)
	if (nvme_ctrl_state(ctrl) == NVME_CTRL_LIVE)
		ctrl->ops->submit_async_event(ctrl);
}

@@ -4471,7 +4473,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
{
	int ret;

	ctrl->state = NVME_CTRL_NEW;
	WRITE_ONCE(ctrl->state, NVME_CTRL_NEW);
	clear_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
	spin_lock_init(&ctrl->lock);
	mutex_init(&ctrl->scan_lock);
+3 −3
Original line number Diff line number Diff line
@@ -557,7 +557,7 @@ nvme_fc_rport_get(struct nvme_fc_rport *rport)
static void
nvme_fc_resume_controller(struct nvme_fc_ctrl *ctrl)
{
	switch (ctrl->ctrl.state) {
	switch (nvme_ctrl_state(&ctrl->ctrl)) {
	case NVME_CTRL_NEW:
	case NVME_CTRL_CONNECTING:
		/*
@@ -793,7 +793,7 @@ nvme_fc_ctrl_connectivity_loss(struct nvme_fc_ctrl *ctrl)
		"NVME-FC{%d}: controller connectivity lost. Awaiting "
		"Reconnect", ctrl->cnum);

	switch (ctrl->ctrl.state) {
	switch (nvme_ctrl_state(&ctrl->ctrl)) {
	case NVME_CTRL_NEW:
	case NVME_CTRL_LIVE:
		/*
@@ -3319,7 +3319,7 @@ nvme_fc_reconnect_or_delete(struct nvme_fc_ctrl *ctrl, int status)
	unsigned long recon_delay = ctrl->ctrl.opts->reconnect_delay * HZ;
	bool recon = true;

	if (ctrl->ctrl.state != NVME_CTRL_CONNECTING)
	if (nvme_ctrl_state(&ctrl->ctrl) != NVME_CTRL_CONNECTING)
		return;

	if (portptr->port_state == FC_OBJSTATE_ONLINE) {
+7 −7
Original line number Diff line number Diff line
@@ -1233,7 +1233,7 @@ static bool nvme_should_reset(struct nvme_dev *dev, u32 csts)
	bool nssro = dev->subsystem && (csts & NVME_CSTS_NSSRO);

	/* If there is a reset/reinit ongoing, we shouldn't reset again. */
	switch (dev->ctrl.state) {
	switch (nvme_ctrl_state(&dev->ctrl)) {
	case NVME_CTRL_RESETTING:
	case NVME_CTRL_CONNECTING:
		return false;
@@ -1321,7 +1321,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req)
	 * cancellation error. All outstanding requests are completed on
	 * shutdown, so we return BLK_EH_DONE.
	 */
	switch (dev->ctrl.state) {
	switch (nvme_ctrl_state(&dev->ctrl)) {
	case NVME_CTRL_CONNECTING:
		nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
		fallthrough;
@@ -1593,7 +1593,7 @@ static int nvme_setup_io_queues_trylock(struct nvme_dev *dev)
	/*
	 * Controller is in wrong state, fail early.
	 */
	if (dev->ctrl.state != NVME_CTRL_CONNECTING) {
	if (nvme_ctrl_state(&dev->ctrl) != NVME_CTRL_CONNECTING) {
		mutex_unlock(&dev->shutdown_lock);
		return -ENODEV;
	}
@@ -2573,13 +2573,13 @@ static bool nvme_pci_ctrl_is_dead(struct nvme_dev *dev)

static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
{
	enum nvme_ctrl_state state = nvme_ctrl_state(&dev->ctrl);
	struct pci_dev *pdev = to_pci_dev(dev->dev);
	bool dead;

	mutex_lock(&dev->shutdown_lock);
	dead = nvme_pci_ctrl_is_dead(dev);
	if (dev->ctrl.state == NVME_CTRL_LIVE ||
	    dev->ctrl.state == NVME_CTRL_RESETTING) {
	if (state == NVME_CTRL_LIVE || state == NVME_CTRL_RESETTING) {
		if (pci_is_enabled(pdev))
			nvme_start_freeze(&dev->ctrl);
		/*
@@ -2690,7 +2690,7 @@ static void nvme_reset_work(struct work_struct *work)
	bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL);
	int result;

	if (dev->ctrl.state != NVME_CTRL_RESETTING) {
	if (nvme_ctrl_state(&dev->ctrl) != NVME_CTRL_RESETTING) {
		dev_warn(dev->ctrl.device, "ctrl state %d is not RESETTING\n",
			 dev->ctrl.state);
		result = -ENODEV;
@@ -3192,7 +3192,7 @@ static int nvme_suspend(struct device *dev)
	nvme_wait_freeze(ctrl);
	nvme_sync_queues(ctrl);

	if (ctrl->state != NVME_CTRL_LIVE)
	if (nvme_ctrl_state(ctrl) != NVME_CTRL_LIVE)
		goto unfreeze;

	/*
+14 −9
Original line number Diff line number Diff line
@@ -984,10 +984,11 @@ static void nvme_rdma_free_ctrl(struct nvme_ctrl *nctrl)

static void nvme_rdma_reconnect_or_remove(struct nvme_rdma_ctrl *ctrl)
{
	enum nvme_ctrl_state state = nvme_ctrl_state(&ctrl->ctrl);

	/* If we are resetting/deleting then do nothing */
	if (ctrl->ctrl.state != NVME_CTRL_CONNECTING) {
		WARN_ON_ONCE(ctrl->ctrl.state == NVME_CTRL_NEW ||
			ctrl->ctrl.state == NVME_CTRL_LIVE);
	if (state != NVME_CTRL_CONNECTING) {
		WARN_ON_ONCE(state == NVME_CTRL_NEW || state == NVME_CTRL_LIVE);
		return;
	}

@@ -1059,8 +1060,10 @@ static int nvme_rdma_setup_ctrl(struct nvme_rdma_ctrl *ctrl, bool new)
		 * unless we're during creation of a new controller to
		 * avoid races with teardown flow.
		 */
		WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING &&
			     ctrl->ctrl.state != NVME_CTRL_DELETING_NOIO);
		enum nvme_ctrl_state state = nvme_ctrl_state(&ctrl->ctrl);

		WARN_ON_ONCE(state != NVME_CTRL_DELETING &&
			     state != NVME_CTRL_DELETING_NOIO);
		WARN_ON_ONCE(new);
		ret = -EINVAL;
		goto destroy_io;
@@ -1129,8 +1132,10 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work)

	if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
		/* state change failure is ok if we started ctrl delete */
		WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING &&
			     ctrl->ctrl.state != NVME_CTRL_DELETING_NOIO);
		enum nvme_ctrl_state state = nvme_ctrl_state(&ctrl->ctrl);

		WARN_ON_ONCE(state != NVME_CTRL_DELETING &&
			     state != NVME_CTRL_DELETING_NOIO);
		return;
	}

@@ -1162,7 +1167,7 @@ static void nvme_rdma_wr_error(struct ib_cq *cq, struct ib_wc *wc,
	struct nvme_rdma_queue *queue = wc->qp->qp_context;
	struct nvme_rdma_ctrl *ctrl = queue->ctrl;

	if (ctrl->ctrl.state == NVME_CTRL_LIVE)
	if (nvme_ctrl_state(&ctrl->ctrl) == NVME_CTRL_LIVE)
		dev_info(ctrl->ctrl.device,
			     "%s for CQE 0x%p failed with status %s (%d)\n",
			     op, wc->wr_cqe,
@@ -1945,7 +1950,7 @@ static enum blk_eh_timer_return nvme_rdma_timeout(struct request *rq)
	dev_warn(ctrl->ctrl.device, "I/O %d QID %d timeout\n",
		 rq->tag, nvme_rdma_queue_idx(queue));

	if (ctrl->ctrl.state != NVME_CTRL_LIVE) {
	if (nvme_ctrl_state(&ctrl->ctrl) != NVME_CTRL_LIVE) {
		/*
		 * If we are resetting, connecting or deleting we should
		 * complete immediately because we may block controller
+17 −10
Original line number Diff line number Diff line
@@ -2152,10 +2152,11 @@ static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl,

static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl)
{
	enum nvme_ctrl_state state = nvme_ctrl_state(ctrl);

	/* If we are resetting/deleting then do nothing */
	if (ctrl->state != NVME_CTRL_CONNECTING) {
		WARN_ON_ONCE(ctrl->state == NVME_CTRL_NEW ||
			ctrl->state == NVME_CTRL_LIVE);
	if (state != NVME_CTRL_CONNECTING) {
		WARN_ON_ONCE(state == NVME_CTRL_NEW || state == NVME_CTRL_LIVE);
		return;
	}

@@ -2215,8 +2216,10 @@ static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new)
		 * unless we're during creation of a new controller to
		 * avoid races with teardown flow.
		 */
		WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING &&
			     ctrl->state != NVME_CTRL_DELETING_NOIO);
		enum nvme_ctrl_state state = nvme_ctrl_state(ctrl);

		WARN_ON_ONCE(state != NVME_CTRL_DELETING &&
			     state != NVME_CTRL_DELETING_NOIO);
		WARN_ON_ONCE(new);
		ret = -EINVAL;
		goto destroy_io;
@@ -2280,8 +2283,10 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work)

	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
		/* state change failure is ok if we started ctrl delete */
		WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING &&
			     ctrl->state != NVME_CTRL_DELETING_NOIO);
		enum nvme_ctrl_state state = nvme_ctrl_state(ctrl);

		WARN_ON_ONCE(state != NVME_CTRL_DELETING &&
			     state != NVME_CTRL_DELETING_NOIO);
		return;
	}

@@ -2311,8 +2316,10 @@ static void nvme_reset_ctrl_work(struct work_struct *work)

	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
		/* state change failure is ok if we started ctrl delete */
		WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING &&
			     ctrl->state != NVME_CTRL_DELETING_NOIO);
		enum nvme_ctrl_state state = nvme_ctrl_state(ctrl);

		WARN_ON_ONCE(state != NVME_CTRL_DELETING &&
			     state != NVME_CTRL_DELETING_NOIO);
		return;
	}

@@ -2430,7 +2437,7 @@ static enum blk_eh_timer_return nvme_tcp_timeout(struct request *rq)
		nvme_tcp_queue_id(req->queue), nvme_cid(rq), pdu->hdr.type,
		opc, nvme_opcode_str(qid, opc, fctype));

	if (ctrl->state != NVME_CTRL_LIVE) {
	if (nvme_ctrl_state(ctrl) != NVME_CTRL_LIVE) {
		/*
		 * If we are resetting, connecting or deleting we should
		 * complete immediately because we may block controller