Commit 158e71bb authored by Aharon Landau's avatar Aharon Landau Committed by Leon Romanovsky
Browse files

RDMA/mlx5: Add a umr recovery flow

When a UMR fails, the UMR QP state changes to an error state. Therefore,
all the further UMR operations will fail too.

Add a recovery flow to the UMR QP, and repost the flushed WQEs.

Link: https://lore.kernel.org/r/6cc24816cca049bd8541317f5e41d3ac659445d3.1652588303.git.leonro@nvidia.com


Signed-off-by: default avatarAharon Landau <aharonl@nvidia.com>
Reviewed-by: default avatarMichael Guralnik <michaelgur@nvidia.com>
Signed-off-by: default avatarLeon Romanovsky <leon@kernel.org>
parent 650126a8
Loading
Loading
Loading
Loading
+4 −0
Original line number Diff line number Diff line
@@ -523,6 +523,10 @@ static int mlx5_poll_one(struct mlx5_ib_cq *cq,
			    "Requestor" : "Responder", cq->mcq.cqn);
		mlx5_ib_dbg(dev, "syndrome 0x%x, vendor syndrome 0x%x\n",
			    err_cqe->syndrome, err_cqe->vendor_err_synd);
		if (wc->status != IB_WC_WR_FLUSH_ERR &&
		    (*cur_qp)->type == MLX5_IB_QPT_REG_UMR)
			dev->umrc.state = MLX5_UMR_STATE_RECOVER;

		if (opcode == MLX5_CQE_REQ_ERR) {
			wq = &(*cur_qp)->sq;
			wqe_ctr = be16_to_cpu(cqe64->wqe_counter);
+11 −1
Original line number Diff line number Diff line
@@ -717,13 +717,23 @@ struct mlx5_ib_umr_context {
	struct completion	done;
};

enum {
	MLX5_UMR_STATE_ACTIVE,
	MLX5_UMR_STATE_RECOVER,
	MLX5_UMR_STATE_ERR,
};

struct umr_common {
	struct ib_pd	*pd;
	struct ib_cq	*cq;
	struct ib_qp	*qp;
	/* control access to UMR QP
	/* Protects from UMR QP overflow
	 */
	struct semaphore	sem;
	/* Protects from using UMR while the UMR is not active
	 */
	struct mutex lock;
	unsigned int state;
};

struct mlx5_cache_ent {
+68 −10
Original line number Diff line number Diff line
@@ -176,6 +176,7 @@ int mlx5r_umr_resource_init(struct mlx5_ib_dev *dev)
	dev->umrc.pd = pd;

	sema_init(&dev->umrc.sem, MAX_UMR_WR);
	mutex_init(&dev->umrc.lock);

	return 0;

@@ -195,6 +196,31 @@ void mlx5r_umr_resource_cleanup(struct mlx5_ib_dev *dev)
	ib_dealloc_pd(dev->umrc.pd);
}

static int mlx5r_umr_recover(struct mlx5_ib_dev *dev)
{
	struct umr_common *umrc = &dev->umrc;
	struct ib_qp_attr attr;
	int err;

	attr.qp_state = IB_QPS_RESET;
	err = ib_modify_qp(umrc->qp, &attr, IB_QP_STATE);
	if (err) {
		mlx5_ib_dbg(dev, "Couldn't modify UMR QP\n");
		goto err;
	}

	err = mlx5r_umr_qp_rst2rts(dev, umrc->qp);
	if (err)
		goto err;

	umrc->state = MLX5_UMR_STATE_ACTIVE;
	return 0;

err:
	umrc->state = MLX5_UMR_STATE_ERR;
	return err;
}

static int mlx5r_umr_post_send(struct ib_qp *ibqp, u32 mkey, struct ib_cqe *cqe,
			       struct mlx5r_umr_wqe *wqe, bool with_data)
{
@@ -231,7 +257,7 @@ static int mlx5r_umr_post_send(struct ib_qp *ibqp, u32 mkey, struct ib_cqe *cqe,

	id.ib_cqe = cqe;
	mlx5r_finish_wqe(qp, ctrl, seg, size, cur_edge, idx, id.wr_id, 0,
			 MLX5_FENCE_MODE_NONE, MLX5_OPCODE_UMR);
			 MLX5_FENCE_MODE_INITIATOR_SMALL, MLX5_OPCODE_UMR);

	mlx5r_ring_db(qp, 1, ctrl);

@@ -270,17 +296,49 @@ static int mlx5r_umr_post_send_wait(struct mlx5_ib_dev *dev, u32 mkey,
	mlx5r_umr_init_context(&umr_context);

	down(&umrc->sem);
	while (true) {
		mutex_lock(&umrc->lock);
		if (umrc->state == MLX5_UMR_STATE_ERR) {
			mutex_unlock(&umrc->lock);
			err = -EFAULT;
			break;
		}

		if (umrc->state == MLX5_UMR_STATE_RECOVER) {
			mutex_unlock(&umrc->lock);
			usleep_range(3000, 5000);
			continue;
		}

		err = mlx5r_umr_post_send(umrc->qp, mkey, &umr_context.cqe, wqe,
					  with_data);
	if (err)
		mlx5_ib_warn(dev, "UMR post send failed, err %d\n", err);
	else {
		mutex_unlock(&umrc->lock);
		if (err) {
			mlx5_ib_warn(dev, "UMR post send failed, err %d\n",
				     err);
			break;
		}

		wait_for_completion(&umr_context.done);
		if (umr_context.status != IB_WC_SUCCESS) {
			mlx5_ib_warn(dev, "reg umr failed (%u)\n",

		if (umr_context.status == IB_WC_SUCCESS)
			break;

		if (umr_context.status == IB_WC_WR_FLUSH_ERR)
			continue;

		WARN_ON_ONCE(1);
		mlx5_ib_warn(dev,
			"reg umr failed (%u). Trying to recover and resubmit the flushed WQEs\n",
			umr_context.status);
		mutex_lock(&umrc->lock);
		err = mlx5r_umr_recover(dev);
		mutex_unlock(&umrc->lock);
		if (err)
			mlx5_ib_warn(dev, "couldn't recover UMR, err %d\n",
				     err);
		err = -EFAULT;
		}
		break;
	}
	up(&umrc->sem);
	return err;