Commit ae6f6dd5 authored by Leon Romanovsky
Browse files

Delay mlx5_ib internal resources allocations



From: Leon Romanovsky <leonro@nvidia.com>

Internal mlx5_ib resources are created during mlx5_ib module load. This
behavior is not optimal because it consumes resources that are not
needed when SFs are created. This patch series delays the creation of
mlx5_ib internal resources to the stage when they are actually used.

Signed-off-by: Leon Romanovsky <leon@kernel.org>
parents ef551352 d98995b4
Loading
Loading
Loading
Loading
+3 −16
Original line number Diff line number Diff line
@@ -1810,7 +1810,7 @@ static int set_ucontext_resp(struct ib_ucontext *uctx,
	}

	resp->qp_tab_size = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp);
	if (dev->wc_support)
	if (mlx5_wc_support_get(dev->mdev))
		resp->bf_reg_size = 1 << MLX5_CAP_GEN(dev->mdev,
						      log_bf_reg_size);
	resp->cache_line_size = cache_line_size();
@@ -2337,7 +2337,7 @@ static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vm
	switch (command) {
	case MLX5_IB_MMAP_WC_PAGE:
	case MLX5_IB_MMAP_ALLOC_WC:
		if (!dev->wc_support)
		if (!mlx5_wc_support_get(dev->mdev))
			return -EPERM;
		fallthrough;
	case MLX5_IB_MMAP_NC_PAGE:
@@ -3612,7 +3612,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_UAR_OBJ_ALLOC)(
	    alloc_type != MLX5_IB_UAPI_UAR_ALLOC_TYPE_NC)
		return -EOPNOTSUPP;

	if (!to_mdev(c->ibucontext.device)->wc_support &&
	if (!mlx5_wc_support_get(to_mdev(c->ibucontext.device)->mdev) &&
	    alloc_type == MLX5_IB_UAPI_UAR_ALLOC_TYPE_BF)
		return -EOPNOTSUPP;

@@ -3766,18 +3766,6 @@ static int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev)
	return err;
}

/*
 * enable_driver callback: run the write-combining probe on the device and
 * emit a debug line stating whether wc_support ended up set.
 *
 * Return: the value returned by mlx5_ib_test_wc().
 */
static int mlx5_ib_enable_driver(struct ib_device *dev)
{
	struct mlx5_ib_dev *ibdev = to_mdev(dev);
	const char *verdict;
	int err;

	err = mlx5_ib_test_wc(ibdev);

	/* wc_support must be sampled only after the probe has run */
	if (ibdev->wc_support)
		verdict = "supported";
	else
		verdict = "not supported";

	mlx5_ib_dbg(ibdev, "Write-Combining %s", verdict);

	return err;
}

static const struct ib_device_ops mlx5_ib_dev_ops = {
	.owner = THIS_MODULE,
	.driver_id = RDMA_DRIVER_MLX5,
@@ -3808,7 +3796,6 @@ static const struct ib_device_ops mlx5_ib_dev_ops = {
	.drain_rq = mlx5_ib_drain_rq,
	.drain_sq = mlx5_ib_drain_sq,
	.device_group = &mlx5_attr_group,
	.enable_driver = mlx5_ib_enable_driver,
	.get_dev_fw_str = get_dev_fw_str,
	.get_dma_mr = mlx5_ib_get_dma_mr,
	.get_link_layer = mlx5_ib_port_link_layer,
+0 −198
Original line number Diff line number Diff line
@@ -30,10 +30,8 @@
 * SOFTWARE.
 */

#include <linux/io.h>
#include <rdma/ib_umem_odp.h>
#include "mlx5_ib.h"
#include <linux/jiffies.h>

/*
 * Fill in a physical address list. ib_umem_num_dma_blocks() entries will be
@@ -95,199 +93,3 @@ unsigned long __mlx5_umem_find_best_quantized_pgoff(
		return 0;
	return page_size;
}

/* wr_id attached to every NOP posted in the test_wc_do_send() loop */
#define WR_ID_BF 0xBF
/* wr_id attached to the final, signaled NOP posted by test_wc_do_send() */
#define WR_ID_END 0xBAD
/* Number of WR_ID_BF NOPs posted before the closing WR_ID_END one */
#define TEST_WC_NUM_WQES 255
/* Upper bound (100 ms, in jiffies) on CQ polling in test_wc_poll_cq_result() */
#define TEST_WC_POLLING_MAX_TIME_JIFFIES msecs_to_jiffies(100)
/*
 * Post a single NOP work request on @ibqp's send queue and ring the doorbell
 * by copying the WQE to the QP's bfreg mapping.
 *
 * @dev:      device; its core state is checked before anything is posted
 * @ibqp:     QP whose send queue receives the NOP
 * @wr_id:    value stored in sq.wrid[] for this WQE
 * @signaled: when true, the in-memory WQE has MLX5_WQE_CTRL_CQ_UPDATE set
 *
 * The copy pushed through MMIO always has MLX5_WQE_CTRL_CQ_UPDATE set,
 * regardless of @signaled.
 * NOTE(review): presumably this difference between the MMIO copy and the
 * in-memory WQE is what lets the caller detect which one the device
 * consumed - confirm.
 *
 * Return: 0 on success, -EIO if the device is in internal error state.
 */
static int post_send_nop(struct mlx5_ib_dev *dev, struct ib_qp *ibqp, u64 wr_id,
			 bool signaled)
{
	struct mlx5_ib_qp *qp = to_mqp(ibqp);
	struct mlx5_wqe_ctrl_seg *ctrl;
	struct mlx5_bf *bf = &qp->bf;
	/* 16 * 4 bytes = 64-byte staging buffer for the MMIO copy */
	__be32 mmio_wqe[16] = {};
	unsigned long flags;
	unsigned int idx;

	if (unlikely(dev->mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR))
		return -EIO;

	spin_lock_irqsave(&qp->sq.lock, flags);

	/* Slot for this WQE; masking assumes wqe_cnt is a power of two - TODO confirm */
	idx = qp->sq.cur_post & (qp->sq.wqe_cnt - 1);
	ctrl = mlx5_frag_buf_get_wqe(&qp->sq.fbc, idx);

	/* The WQE consists of the control segment only */
	memset(ctrl, 0, sizeof(struct mlx5_wqe_ctrl_seg));
	ctrl->fm_ce_se = signaled ? MLX5_WQE_CTRL_CQ_UPDATE : 0;
	ctrl->opmod_idx_opcode =
		cpu_to_be32(((u32)(qp->sq.cur_post) << 8) | MLX5_OPCODE_NOP);
	ctrl->qpn_ds = cpu_to_be32((sizeof(struct mlx5_wqe_ctrl_seg) / 16) |
				   (qp->trans_qp.base.mqp.qpn << 8));

	/* Software bookkeeping for this send queue entry */
	qp->sq.wrid[idx] = wr_id;
	qp->sq.w_list[idx].opcode = MLX5_OPCODE_NOP;
	qp->sq.wqe_head[idx] = qp->sq.head + 1;
	qp->sq.cur_post += DIV_ROUND_UP(sizeof(struct mlx5_wqe_ctrl_seg),
					MLX5_SEND_WQE_BB);
	qp->sq.w_list[idx].next = qp->sq.cur_post;
	qp->sq.head++;

	/* Stage the MMIO copy and force CQ_UPDATE on it (the in-memory WQE is untouched) */
	memcpy(mmio_wqe, ctrl, sizeof(*ctrl));
	((struct mlx5_wqe_ctrl_seg *)&mmio_wqe)->fm_ce_se |=
		MLX5_WQE_CTRL_CQ_UPDATE;

	/* Make sure that descriptors are written before
	 * updating doorbell record and ringing the doorbell
	 */
	wmb();

	qp->db.db[MLX5_SND_DBR] = cpu_to_be32(qp->sq.cur_post);

	/* Make sure doorbell record is visible to the HCA before
	 * we hit doorbell
	 */
	wmb();
	/* Copy the whole staging buffer as 64-bit words to the bfreg mapping */
	__iowrite64_copy(bf->bfreg->map + bf->offset, mmio_wqe,
			 sizeof(mmio_wqe) / 8);

	/* Toggle the bfreg offset by buf_size for the next post */
	bf->offset ^= bf->buf_size;

	spin_unlock_irqrestore(&qp->sq.lock, flags);

	return 0;
}

static int test_wc_poll_cq_result(struct mlx5_ib_dev *dev, struct ib_cq *cq)
{
	int ret;
	struct ib_wc wc = {};
	unsigned long end = jiffies + TEST_WC_POLLING_MAX_TIME_JIFFIES;

	do {
		ret = ib_poll_cq(cq, 1, &wc);
		if (ret < 0 || wc.status)
			return ret < 0 ? ret : -EINVAL;
		if (ret)
			break;
	} while (!time_after(jiffies, end));

	if (!ret)
		return -ETIMEDOUT;

	if (wc.wr_id != WR_ID_BF)
		ret = 0;

	return ret;
}

/*
 * Queue the probe sequence on @qp: TEST_WC_NUM_WQES unsignaled NOPs tagged
 * WR_ID_BF, followed by one signaled NOP tagged WR_ID_END.
 *
 * Return: 0 on success, or the first error reported by post_send_nop().
 */
static int test_wc_do_send(struct mlx5_ib_dev *dev, struct ib_qp *qp)
{
	int remaining = TEST_WC_NUM_WQES;
	int rc;

	while (remaining--) {
		rc = post_send_nop(dev, qp, WR_ID_BF, false);
		if (rc)
			return rc;
	}

	/* Closing WQE is the only one signaled in memory */
	return post_send_nop(dev, qp, WR_ID_END, true);
}

/*
 * Probe whether write-combining works on this device and record the result
 * in dev->wc_support.
 *
 * Nothing is done when the device lacks the bf capability. For an Ethernet
 * port type with RoCE disabled no probe is run: on a PF wc_support is taken
 * from arch_can_pci_mmap_wc(), otherwise it is left untouched.
 *
 * In all other cases a dedicated bfreg (dev->wc_bfreg), a PD, a CQ and a UD
 * QP flagged MLX5_IB_QP_CREATE_WC_TEST are created, the QP is moved to RTS,
 * the NOP sequence is posted and the CQ is polled. A positive poll result
 * sets wc_support. Every resource acquired here is released before
 * returning.
 *
 * Return: 0 on success (whether or not WC turned out to be supported),
 * negative errno on failure (also logged via mlx5_ib_err()).
 */
int mlx5_ib_test_wc(struct mlx5_ib_dev *dev)
{
	/* CQ sized for every posted WQE: the loop NOPs plus the closing one */
	struct ib_cq_init_attr cq_attr = { .cqe = TEST_WC_NUM_WQES + 1 };
	int port_type_cap = MLX5_CAP_GEN(dev->mdev, port_type);
	struct ib_qp_init_attr qp_init_attr = {
		.cap = { .max_send_wr = TEST_WC_NUM_WQES },
		.qp_type = IB_QPT_UD,
		.sq_sig_type = IB_SIGNAL_REQ_WR,
		.create_flags = MLX5_IB_QP_CREATE_WC_TEST,
	};
	struct ib_qp_attr qp_attr = { .port_num = 1 };
	struct ib_device *ibdev = &dev->ib_dev;
	struct ib_qp *qp;
	struct ib_cq *cq;
	struct ib_pd *pd;
	int ret;

	/* No bf capability: nothing to test, wc_support stays as it was */
	if (!MLX5_CAP_GEN(dev->mdev, bf))
		return 0;

	/* Ethernet port without RoCE: skip the QP-based probe entirely */
	if (!dev->mdev->roce.roce_en &&
	    port_type_cap == MLX5_CAP_PORT_TYPE_ETH) {
		if (mlx5_core_is_pf(dev->mdev))
			dev->wc_support = arch_can_pci_mmap_wc();
		return 0;
	}

	ret = mlx5_alloc_bfreg(dev->mdev, &dev->wc_bfreg, true, false);
	if (ret)
		goto print_err;

	/* bfreg's wc flag is clear: skip the probe, ret is still 0 here */
	if (!dev->wc_bfreg.wc)
		goto out1;

	pd = ib_alloc_pd(ibdev, 0);
	if (IS_ERR(pd)) {
		ret = PTR_ERR(pd);
		goto out1;
	}

	cq = ib_create_cq(ibdev, NULL, NULL, NULL, &cq_attr);
	if (IS_ERR(cq)) {
		ret = PTR_ERR(cq);
		goto out2;
	}

	/* One CQ serves both the send and receive side of the test QP */
	qp_init_attr.recv_cq = cq;
	qp_init_attr.send_cq = cq;
	qp = ib_create_qp(pd, &qp_init_attr);
	if (IS_ERR(qp)) {
		ret = PTR_ERR(qp);
		goto out3;
	}

	/* Walk the QP through INIT -> RTR -> RTS so it can post sends */
	qp_attr.qp_state = IB_QPS_INIT;
	ret = ib_modify_qp(qp, &qp_attr,
			   IB_QP_STATE | IB_QP_PORT | IB_QP_PKEY_INDEX |
				   IB_QP_QKEY);
	if (ret)
		goto out4;

	qp_attr.qp_state = IB_QPS_RTR;
	ret = ib_modify_qp(qp, &qp_attr, IB_QP_STATE);
	if (ret)
		goto out4;

	qp_attr.qp_state = IB_QPS_RTS;
	ret = ib_modify_qp(qp, &qp_attr, IB_QP_STATE | IB_QP_SQ_PSN);
	if (ret)
		goto out4;

	ret = test_wc_do_send(dev, qp);
	if (ret < 0)
		goto out4;

	/*
	 * test_wc_poll_cq_result() returns > 0 only when the polled
	 * completion carries WR_ID_BF; 0 means the probe ran but WC was not
	 * detected, which is not an error.
	 */
	ret = test_wc_poll_cq_result(dev, cq);
	if (ret > 0) {
		dev->wc_support = true;
		ret = 0;
	}

	/* Unwind in reverse order of acquisition */
out4:
	ib_destroy_qp(qp);
out3:
	ib_destroy_cq(cq);
out2:
	ib_dealloc_pd(pd);
out1:
	mlx5_free_bfreg(dev->mdev, &dev->wc_bfreg);
print_err:
	if (ret)
		mlx5_ib_err(
			dev,
			"Error %d while trying to test write-combining support\n",
			ret);
	return ret;
}
+0 −3
Original line number Diff line number Diff line
@@ -341,7 +341,6 @@ struct mlx5_ib_flow_db {
 * rely on the range reserved for that use in the ib_qp_create_flags enum.
 */
#define MLX5_IB_QP_CREATE_SQPN_QP1	IB_QP_CREATE_RESERVED_START
#define MLX5_IB_QP_CREATE_WC_TEST	(IB_QP_CREATE_RESERVED_START << 1)

struct wr_list {
	u16	opcode;
@@ -1123,7 +1122,6 @@ struct mlx5_ib_dev {
	u8				ib_active:1;
	u8				is_rep:1;
	u8				lag_active:1;
	u8				wc_support:1;
	u8				fill_delay;
	struct umr_common		umrc;
	/* sync used page count stats
@@ -1149,7 +1147,6 @@ struct mlx5_ib_dev {
	/* Array with num_ports elements */
	struct mlx5_ib_port	*port;
	struct mlx5_sq_bfreg	bfreg;
	struct mlx5_sq_bfreg	wc_bfreg;
	struct mlx5_sq_bfreg	fp_bfreg;
	struct mlx5_ib_delay_drop	delay_drop;
	const struct mlx5_ib_profile	*profile;
+0 −16
Original line number Diff line number Diff line
@@ -1107,8 +1107,6 @@ static int _create_kernel_qp(struct mlx5_ib_dev *dev,

	if (init_attr->qp_type == MLX5_IB_QPT_REG_UMR)
		qp->bf.bfreg = &dev->fp_bfreg;
	else if (qp->flags & MLX5_IB_QP_CREATE_WC_TEST)
		qp->bf.bfreg = &dev->wc_bfreg;
	else
		qp->bf.bfreg = &dev->bfreg;

@@ -2959,14 +2957,6 @@ static void process_create_flag(struct mlx5_ib_dev *dev, int *flags, int flag,
		return;
	}

	if (flag == MLX5_IB_QP_CREATE_WC_TEST) {
		/*
		 * Special case, if condition didn't meet, it won't be error,
		 * just different in-kernel flow.
		 */
		*flags &= ~MLX5_IB_QP_CREATE_WC_TEST;
		return;
	}
	mlx5_ib_dbg(dev, "Verbs create QP flag 0x%X is not supported\n", flag);
}

@@ -3027,8 +3017,6 @@ static int process_create_flags(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
			    IB_QP_CREATE_PCI_WRITE_END_PADDING,
			    MLX5_CAP_GEN(mdev, end_pad), qp);

	process_create_flag(dev, &create_flags, MLX5_IB_QP_CREATE_WC_TEST,
			    qp_type != MLX5_IB_QPT_REG_UMR, qp);
	process_create_flag(dev, &create_flags, MLX5_IB_QP_CREATE_SQPN_QP1,
			    true, qp);

@@ -4609,10 +4597,6 @@ static bool mlx5_ib_modify_qp_allowed(struct mlx5_ib_dev *dev,
	if (qp->type == IB_QPT_RAW_PACKET || qp->type == MLX5_IB_QPT_REG_UMR)
		return true;

	/* Internal QP used for wc testing, with NOPs in wq */
	if (qp->flags & MLX5_IB_QP_CREATE_WC_TEST)
		return true;

	return false;
}

+1 −1
Original line number Diff line number Diff line
@@ -17,7 +17,7 @@ mlx5_core-y := main.o cmd.o debugfs.o fw.o eq.o uar.o pagealloc.o \
		fs_counters.o fs_ft_pool.o rl.o lag/debugfs.o lag/lag.o dev.o events.o wq.o lib/gid.o \
		lib/devcom.o lib/pci_vsc.o lib/dm.o lib/fs_ttc.o diag/fs_tracepoint.o \
		diag/fw_tracer.o diag/crdump.o devlink.o diag/rsc_dump.o diag/reporter_vnic.o \
		fw_reset.o qos.o lib/tout.o lib/aso.o
		fw_reset.o qos.o lib/tout.o lib/aso.o wc.o

#
# Netdev basic
Loading