Commit 323275ac authored by wenglianfa, committed by Leon Romanovsky
Browse files

RDMA/hns: Fix cpu stuck caused by printings during reset



During reset, commands to destroy resources such as QPs, CQs, and MRs may
fail, and an error log is printed for each failure. When a large number of
resources are destroyed, the resulting flood of log messages may cause a
CPU to get stuck.

Delete some unnecessary prints and replace the remaining print calls
in these paths with their ratelimited versions.

Fixes: 9a443537 ("IB/hns: Add driver files for hns RoCE driver")
Fixes: c7bcb134 ("RDMA/hns: Add SRQ support for hip08 kernel mode")
Fixes: 70f92521 ("RDMA/hns: Use the reserved loopback QPs to free MR before destroying MPT")
Fixes: 926a01dc ("RDMA/hns: Add QP operations support for hip08 SoC")
Signed-off-by: wenglianfa <wenglianfa@huawei.com>
Signed-off-by: Junxian Huang <huangjunxian6@hisilicon.com>
Link: https://patch.msgid.link/20241024124000.2931869-6-huangjunxian6@hisilicon.com


Signed-off-by: Leon Romanovsky <leon@kernel.org>
parent d81fb651
Loading
Loading
Loading
Loading
+2 −2
Original line number Diff line number Diff line
@@ -179,8 +179,8 @@ static void free_cqc(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq)
	ret = hns_roce_destroy_hw_ctx(hr_dev, HNS_ROCE_CMD_DESTROY_CQC,
				      hr_cq->cqn);
	if (ret)
		dev_err(dev, "DESTROY_CQ failed (%d) for CQN %06lx\n", ret,
			hr_cq->cqn);
		dev_err_ratelimited(dev, "DESTROY_CQ failed (%d) for CQN %06lx\n",
				    ret, hr_cq->cqn);

	xa_erase_irq(&cq_table->array, hr_cq->cqn);

+2 −2
Original line number Diff line number Diff line
@@ -672,7 +672,7 @@ void hns_roce_table_put(struct hns_roce_dev *hr_dev,

	ret = hr_dev->hw->clear_hem(hr_dev, table, obj, HEM_HOP_STEP_DIRECT);
	if (ret)
		dev_warn(dev, "failed to clear HEM base address, ret = %d.\n",
		dev_warn_ratelimited(dev, "failed to clear HEM base address, ret = %d.\n",
				     ret);

	hns_roce_free_hem(hr_dev, table->hem[i]);
+33 −40
Original line number Diff line number Diff line
@@ -373,19 +373,12 @@ static int set_rwqe_data_seg(struct ib_qp *ibqp, const struct ib_send_wr *wr,
static int check_send_valid(struct hns_roce_dev *hr_dev,
			    struct hns_roce_qp *hr_qp)
{
	struct ib_device *ibdev = &hr_dev->ib_dev;

	if (unlikely(hr_qp->state == IB_QPS_RESET ||
		     hr_qp->state == IB_QPS_INIT ||
		     hr_qp->state == IB_QPS_RTR)) {
		ibdev_err(ibdev, "failed to post WQE, QP state %u!\n",
			  hr_qp->state);
		     hr_qp->state == IB_QPS_RTR))
		return -EINVAL;
	} else if (unlikely(hr_dev->state >= HNS_ROCE_DEVICE_STATE_RST_DOWN)) {
		ibdev_err(ibdev, "failed to post WQE, dev state %d!\n",
			  hr_dev->state);
	else if (unlikely(hr_dev->state >= HNS_ROCE_DEVICE_STATE_RST_DOWN))
		return -EIO;
	}

	return 0;
}
@@ -2775,7 +2768,7 @@ static int free_mr_modify_rsv_qp(struct hns_roce_dev *hr_dev,
	ret = hr_dev->hw->modify_qp(&hr_qp->ibqp, attr, mask, IB_QPS_INIT,
				    IB_QPS_INIT, NULL);
	if (ret) {
		ibdev_err(ibdev, "failed to modify qp to init, ret = %d.\n",
		ibdev_err_ratelimited(ibdev, "failed to modify qp to init, ret = %d.\n",
				      ret);
		return ret;
	}
@@ -3421,7 +3414,7 @@ static int free_mr_post_send_lp_wqe(struct hns_roce_qp *hr_qp)

	ret = hns_roce_v2_post_send(&hr_qp->ibqp, send_wr, &bad_wr);
	if (ret) {
		ibdev_err(ibdev, "failed to post wqe for free mr, ret = %d.\n",
		ibdev_err_ratelimited(ibdev, "failed to post wqe for free mr, ret = %d.\n",
				      ret);
		return ret;
	}
@@ -3461,7 +3454,7 @@ static void free_mr_send_cmd_to_hw(struct hns_roce_dev *hr_dev)

		ret = free_mr_post_send_lp_wqe(hr_qp);
		if (ret) {
			ibdev_err(ibdev,
			ibdev_err_ratelimited(ibdev,
					      "failed to send wqe (qp:0x%lx) for free mr, ret = %d.\n",
					      hr_qp->qpn, ret);
			break;
@@ -3474,14 +3467,14 @@ static void free_mr_send_cmd_to_hw(struct hns_roce_dev *hr_dev)
	while (cqe_cnt) {
		npolled = hns_roce_v2_poll_cq(&free_mr->rsv_cq->ib_cq, cqe_cnt, wc);
		if (npolled < 0) {
			ibdev_err(ibdev,
			ibdev_err_ratelimited(ibdev,
					      "failed to poll cqe for free mr, remain %d cqe.\n",
					      cqe_cnt);
			goto out;
		}

		if (time_after(jiffies, end)) {
			ibdev_err(ibdev,
			ibdev_err_ratelimited(ibdev,
					      "failed to poll cqe for free mr and timeout, remain %d cqe.\n",
					      cqe_cnt);
			goto out;
@@ -5061,10 +5054,8 @@ static int hns_roce_v2_set_abs_fields(struct ib_qp *ibqp,
	struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device);
	int ret = 0;

	if (!check_qp_state(cur_state, new_state)) {
		ibdev_err(&hr_dev->ib_dev, "Illegal state for QP!\n");
	if (!check_qp_state(cur_state, new_state))
		return -EINVAL;
	}

	if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) {
		memset(qpc_mask, 0, hr_dev->caps.qpc_sz);
@@ -5325,7 +5316,7 @@ static int hns_roce_v2_modify_qp(struct ib_qp *ibqp,
	/* SW pass context to HW */
	ret = hns_roce_v2_qp_modify(hr_dev, context, qpc_mask, hr_qp);
	if (ret) {
		ibdev_err(ibdev, "failed to modify QP, ret = %d.\n", ret);
		ibdev_err_ratelimited(ibdev, "failed to modify QP, ret = %d.\n", ret);
		goto out;
	}

@@ -5463,7 +5454,9 @@ static int hns_roce_v2_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr,

	ret = hns_roce_v2_query_qpc(hr_dev, hr_qp->qpn, &context);
	if (ret) {
		ibdev_err(ibdev, "failed to query QPC, ret = %d.\n", ret);
		ibdev_err_ratelimited(ibdev,
				      "failed to query QPC, ret = %d.\n",
				      ret);
		ret = -EINVAL;
		goto out;
	}
@@ -5471,7 +5464,7 @@ static int hns_roce_v2_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr,
	state = hr_reg_read(&context, QPC_QP_ST);
	tmp_qp_state = to_ib_qp_st((enum hns_roce_v2_qp_state)state);
	if (tmp_qp_state == -1) {
		ibdev_err(ibdev, "Illegal ib_qp_state\n");
		ibdev_err_ratelimited(ibdev, "Illegal ib_qp_state\n");
		ret = -EINVAL;
		goto out;
	}
@@ -5564,7 +5557,7 @@ static int hns_roce_v2_destroy_qp_common(struct hns_roce_dev *hr_dev,
		ret = hns_roce_v2_modify_qp(&hr_qp->ibqp, NULL, 0,
					    hr_qp->state, IB_QPS_RESET, udata);
		if (ret)
			ibdev_err(ibdev,
			ibdev_err_ratelimited(ibdev,
					      "failed to modify QP to RST, ret = %d.\n",
					      ret);
	}
@@ -5609,7 +5602,7 @@ int hns_roce_v2_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata)

	ret = hns_roce_v2_destroy_qp_common(hr_dev, hr_qp, udata);
	if (ret)
		ibdev_err(&hr_dev->ib_dev,
		ibdev_err_ratelimited(&hr_dev->ib_dev,
				      "failed to destroy QP, QPN = 0x%06lx, ret = %d.\n",
				      hr_qp->qpn, ret);

@@ -5905,7 +5898,7 @@ static int hns_roce_v2_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period)
				HNS_ROCE_CMD_MODIFY_CQC, hr_cq->cqn);
	hns_roce_free_cmd_mailbox(hr_dev, mailbox);
	if (ret)
		ibdev_err(&hr_dev->ib_dev,
		ibdev_err_ratelimited(&hr_dev->ib_dev,
				      "failed to process cmd when modifying CQ, ret = %d.\n",
				      ret);

@@ -5931,7 +5924,7 @@ static int hns_roce_v2_query_cqc(struct hns_roce_dev *hr_dev, u32 cqn,
	ret = hns_roce_cmd_mbox(hr_dev, 0, mailbox->dma,
				HNS_ROCE_CMD_QUERY_CQC, cqn);
	if (ret) {
		ibdev_err(&hr_dev->ib_dev,
		ibdev_err_ratelimited(&hr_dev->ib_dev,
				      "failed to process cmd when querying CQ, ret = %d.\n",
				      ret);
		goto err_mailbox;
+2 −2
Original line number Diff line number Diff line
@@ -138,7 +138,7 @@ static void hns_roce_mr_free(struct hns_roce_dev *hr_dev, struct hns_roce_mr *mr
					      key_to_hw_index(mr->key) &
					      (hr_dev->caps.num_mtpts - 1));
		if (ret)
			ibdev_warn(ibdev, "failed to destroy mpt, ret = %d.\n",
			ibdev_warn_ratelimited(ibdev, "failed to destroy mpt, ret = %d.\n",
					       ret);
	}

+2 −2
Original line number Diff line number Diff line
@@ -151,7 +151,7 @@ static void free_srqc(struct hns_roce_dev *hr_dev, struct hns_roce_srq *srq)
	ret = hns_roce_destroy_hw_ctx(hr_dev, HNS_ROCE_CMD_DESTROY_SRQ,
				      srq->srqn);
	if (ret)
		dev_err(hr_dev->dev, "DESTROY_SRQ failed (%d) for SRQN %06lx\n",
		dev_err_ratelimited(hr_dev->dev, "DESTROY_SRQ failed (%d) for SRQN %06lx\n",
				    ret, srq->srqn);

	xa_erase_irq(&srq_table->xa, srq->srqn);