Commit 98cf1d1a authored by Leon Romanovsky
Browse files

Add support and infrastructure for RDMA TRANSPORT

---------------------------------------------------------------------

Hi,

This is a preparation series targeted for mlx5-next, which will be used
later in RDMA.

This series adds RDMA transport steering logic which would allow the
vport group manager to catch control packets from VFs and forward them
to control SW to help with congestion control.

In addition, RDMA will provide a new set of APIs to better control exposed
FW capabilities, and this series is needed to make sure the mlx5 command
interface will ensure that privileged commands can always proceed.

Thanks

Link: https://lore.kernel.org/all/cover.1740574103.git.leon@kernel.org


Signed-off-by: Leon Romanovsky <leon@kernel.org>

* mlx5-next:
  net/mlx5: fs, add RDMA TRANSPORT steering domain support
  net/mlx5: Query ADV_RDMA capabilities
  net/mlx5: Limit non-privileged commands
  net/mlx5: Allow the throttle mechanism to be more dynamic
  net/mlx5: Add RDMA_CTRL HW capabilities
parents 83437689 15b103df
Loading
Loading
Loading
Loading
+104 −16
Original line number Diff line number Diff line
@@ -94,6 +94,11 @@ static u16 in_to_opcode(void *in)
	return MLX5_GET(mbox_in, in, opcode);
}

/* Read the uid field of the mbox_in layout from the command input buffer. */
static u16 in_to_uid(void *in)
{
	u16 uid = MLX5_GET(mbox_in, in, uid);

	return uid;
}

/* Returns true for opcodes that might be triggered very frequently and throttle
 * the command interface. Limit their command slots usage.
 */
@@ -823,7 +828,7 @@ static void cmd_status_print(struct mlx5_core_dev *dev, void *in, void *out)

	opcode = in_to_opcode(in);
	op_mod = MLX5_GET(mbox_in, in, op_mod);
	uid    = MLX5_GET(mbox_in, in, uid);
	uid    = in_to_uid(in);
	status = MLX5_GET(mbox_out, out, status);

	if (!uid && opcode != MLX5_CMD_OP_DESTROY_MKEY &&
@@ -1871,6 +1876,17 @@ static int is_manage_pages(void *in)
	return in_to_opcode(in) == MLX5_CMD_OP_MANAGE_PAGES;
}

/* Report whether the device's privileged_uids xarray holds any entry. */
static bool mlx5_has_privileged_uid(struct mlx5_core_dev *dev)
{
	if (xa_empty(&dev->cmd.vars.privileged_uids))
		return false;

	return true;
}

/* Report whether @uid has an entry in the device's privileged_uids xarray. */
static bool mlx5_cmd_is_privileged_uid(struct mlx5_core_dev *dev,
				       u16 uid)
{
	void *entry = xa_load(&dev->cmd.vars.privileged_uids, uid);

	return entry != NULL;
}

/*  Notes:
 *    1. Callback functions may not sleep
 *    2. Page queue commands do not support asynchronous completion
@@ -1881,7 +1897,9 @@ static int cmd_exec(struct mlx5_core_dev *dev, void *in, int in_size, void *out,
{
	struct mlx5_cmd_msg *inb, *outb;
	u16 opcode = in_to_opcode(in);
	bool throttle_op;
	bool throttle_locked = false;
	bool unpriv_locked = false;
	u16 uid = in_to_uid(in);
	int pages_queue;
	gfp_t gfp;
	u8 token;
@@ -1890,12 +1908,17 @@ static int cmd_exec(struct mlx5_core_dev *dev, void *in, int in_size, void *out,
	if (mlx5_cmd_is_down(dev) || !opcode_allowed(&dev->cmd, opcode))
		return -ENXIO;

	throttle_op = mlx5_cmd_is_throttle_opcode(opcode);
	if (throttle_op) {
		if (callback) {
			if (down_trylock(&dev->cmd.vars.throttle_sem))
				return -EBUSY;
		} else {
	if (!callback) {
		/* The semaphore is already held for callback commands. It was
		 * acquired in mlx5_cmd_exec_cb()
		 */
		if (uid && mlx5_has_privileged_uid(dev)) {
			if (!mlx5_cmd_is_privileged_uid(dev, uid)) {
				unpriv_locked = true;
				down(&dev->cmd.vars.unprivileged_sem);
			}
		} else if (mlx5_cmd_is_throttle_opcode(opcode)) {
			throttle_locked = true;
			down(&dev->cmd.vars.throttle_sem);
		}
	}
@@ -1941,8 +1964,11 @@ static int cmd_exec(struct mlx5_core_dev *dev, void *in, int in_size, void *out,
out_in:
	free_msg(dev, inb);
out_up:
	if (throttle_op)
	if (throttle_locked)
		up(&dev->cmd.vars.throttle_sem);
	if (unpriv_locked)
		up(&dev->cmd.vars.unprivileged_sem);

	return err;
}

@@ -2104,18 +2130,22 @@ static void mlx5_cmd_exec_cb_handler(int status, void *_work)
	struct mlx5_async_work *work = _work;
	struct mlx5_async_ctx *ctx;
	struct mlx5_core_dev *dev;
	u16 opcode;
	bool throttle_locked;
	bool unpriv_locked;

	ctx = work->ctx;
	dev = ctx->dev;
	opcode = work->opcode;
	throttle_locked = work->throttle_locked;
	unpriv_locked = work->unpriv_locked;
	status = cmd_status_err(dev, status, work->opcode, work->op_mod, work->out);
	work->user_callback(status, work);
	/* Can't access "work" from this point on. It could have been freed in
	 * the callback.
	 */
	if (mlx5_cmd_is_throttle_opcode(opcode))
	if (throttle_locked)
		up(&dev->cmd.vars.throttle_sem);
	if (unpriv_locked)
		up(&dev->cmd.vars.unprivileged_sem);
	if (atomic_dec_and_test(&ctx->num_inflight))
		complete(&ctx->inflight_done);
}
@@ -2124,6 +2154,8 @@ int mlx5_cmd_exec_cb(struct mlx5_async_ctx *ctx, void *in, int in_size,
		     void *out, int out_size, mlx5_async_cbk_t callback,
		     struct mlx5_async_work *work)
{
	struct mlx5_core_dev *dev = ctx->dev;
	u16 uid;
	int ret;

	work->ctx = ctx;
@@ -2131,11 +2163,43 @@ int mlx5_cmd_exec_cb(struct mlx5_async_ctx *ctx, void *in, int in_size,
	work->opcode = in_to_opcode(in);
	work->op_mod = MLX5_GET(mbox_in, in, op_mod);
	work->out = out;
	work->throttle_locked = false;
	work->unpriv_locked = false;
	uid = in_to_uid(in);

	if (WARN_ON(!atomic_inc_not_zero(&ctx->num_inflight)))
		return -EIO;
	ret = cmd_exec(ctx->dev, in, in_size, out, out_size,

	if (uid && mlx5_has_privileged_uid(dev)) {
		if (!mlx5_cmd_is_privileged_uid(dev, uid)) {
			if (down_trylock(&dev->cmd.vars.unprivileged_sem)) {
				ret = -EBUSY;
				goto dec_num_inflight;
			}
			work->unpriv_locked = true;
		}
	} else if (mlx5_cmd_is_throttle_opcode(in_to_opcode(in))) {
		if (down_trylock(&dev->cmd.vars.throttle_sem)) {
			ret = -EBUSY;
			goto dec_num_inflight;
		}
		work->throttle_locked = true;
	}

	ret = cmd_exec(dev, in, in_size, out, out_size,
		       mlx5_cmd_exec_cb_handler, work, false);
	if (ret && atomic_dec_and_test(&ctx->num_inflight))
	if (ret)
		goto sem_up;

	return 0;

sem_up:
	if (work->throttle_locked)
		up(&dev->cmd.vars.throttle_sem);
	if (work->unpriv_locked)
		up(&dev->cmd.vars.unprivileged_sem);
dec_num_inflight:
	if (atomic_dec_and_test(&ctx->num_inflight))
		complete(&ctx->inflight_done);

	return ret;
@@ -2371,10 +2435,16 @@ int mlx5_cmd_enable(struct mlx5_core_dev *dev)
	sema_init(&cmd->vars.sem, cmd->vars.max_reg_cmds);
	sema_init(&cmd->vars.pages_sem, 1);
	sema_init(&cmd->vars.throttle_sem, DIV_ROUND_UP(cmd->vars.max_reg_cmds, 2));
	sema_init(&cmd->vars.unprivileged_sem,
		  DIV_ROUND_UP(cmd->vars.max_reg_cmds, 2));

	xa_init(&cmd->vars.privileged_uids);

	cmd->pool = dma_pool_create("mlx5_cmd", mlx5_core_dma_dev(dev), size, align, 0);
	if (!cmd->pool)
		return -ENOMEM;
	if (!cmd->pool) {
		err = -ENOMEM;
		goto err_destroy_xa;
	}

	err = alloc_cmd_page(dev, cmd);
	if (err)
@@ -2408,6 +2478,8 @@ int mlx5_cmd_enable(struct mlx5_core_dev *dev)
	free_cmd_page(dev, cmd);
err_free_pool:
	dma_pool_destroy(cmd->pool);
err_destroy_xa:
	xa_destroy(&dev->cmd.vars.privileged_uids);
	return err;
}

@@ -2420,6 +2492,7 @@ void mlx5_cmd_disable(struct mlx5_core_dev *dev)
	destroy_msg_cache(dev);
	free_cmd_page(dev, cmd);
	dma_pool_destroy(cmd->pool);
	xa_destroy(&dev->cmd.vars.privileged_uids);
}

void mlx5_cmd_set_state(struct mlx5_core_dev *dev,
@@ -2427,3 +2500,18 @@ void mlx5_cmd_set_state(struct mlx5_core_dev *dev,
{
	dev->cmd.state = cmdif_state;
}

/* Register @uid in the device's privileged_uids xarray.
 *
 * The entry is stored at index @uid as an xarray value entry built from
 * @uid itself. Allocation uses GFP_KERNEL.
 *
 * Return: the result of xa_insert() - 0 on success, a negative errno
 * otherwise (per the XArray API, -EBUSY if the index is already occupied).
 */
int mlx5_cmd_add_privileged_uid(struct mlx5_core_dev *dev, u16 uid)
{
	struct xarray *uids = &dev->cmd.vars.privileged_uids;
	void *entry = xa_mk_value(uid);

	return xa_insert(uids, uid, entry, GFP_KERNEL);
}
EXPORT_SYMBOL(mlx5_cmd_add_privileged_uid);

/* Drop @uid from the device's privileged_uids xarray.
 *
 * Emits a WARN if no entry was stored at index @uid.
 */
void mlx5_cmd_remove_privileged_uid(struct mlx5_core_dev *dev, u16 uid)
{
	struct xarray *uids = &dev->cmd.vars.privileged_uids;
	bool existed = xa_erase(uids, uid) != NULL;

	WARN(!existed, "Privileged UID %u does not exist\n", uid);
}
EXPORT_SYMBOL(mlx5_cmd_remove_privileged_uid);
+1 −1
Original line number Diff line number Diff line
@@ -27,7 +27,7 @@ esw_acl_table_create(struct mlx5_eswitch *esw, struct mlx5_vport *vport, int ns,
	esw_debug(dev, "Create vport[%d] %s ACL table\n", vport_num,
		  ns == MLX5_FLOW_NAMESPACE_ESW_INGRESS ? "ingress" : "egress");

	root_ns = mlx5_get_flow_vport_acl_namespace(dev, ns, vport->index);
	root_ns = mlx5_get_flow_vport_namespace(dev, ns, vport->index);
	if (!root_ns) {
		esw_warn(dev, "Failed to get E-Switch root namespace for vport (%d)\n",
			 vport_num);
+3 −3
Original line number Diff line number Diff line
@@ -2828,7 +2828,7 @@ static int esw_set_master_egress_rule(struct mlx5_core_dev *master,
	if (IS_ERR(vport))
		return PTR_ERR(vport);

	egress_ns = mlx5_get_flow_vport_acl_namespace(master,
	egress_ns = mlx5_get_flow_vport_namespace(master,
						  MLX5_FLOW_NAMESPACE_ESW_EGRESS,
						  vport->index);
	if (!egress_ns)
+2 −0
Original line number Diff line number Diff line
@@ -1142,6 +1142,8 @@ const struct mlx5_flow_cmds *mlx5_fs_cmd_get_default(enum fs_flow_table_type typ
	case FS_FT_RDMA_RX:
	case FS_FT_RDMA_TX:
	case FS_FT_PORT_SEL:
	case FS_FT_RDMA_TRANSPORT_RX:
	case FS_FT_RDMA_TRANSPORT_TX:
		return mlx5_fs_cmd_get_fw_cmds();
	default:
		return mlx5_fs_cmd_get_stub_cmds();
+166 −12
Original line number Diff line number Diff line
@@ -1456,7 +1456,7 @@ mlx5_create_auto_grouped_flow_table(struct mlx5_flow_namespace *ns,
	struct mlx5_flow_table *ft;
	int autogroups_max_fte;

	ft = mlx5_create_flow_table(ns, ft_attr);
	ft = mlx5_create_vport_flow_table(ns, ft_attr, ft_attr->vport);
	if (IS_ERR(ft))
		return ft;

@@ -2764,9 +2764,9 @@ struct mlx5_flow_namespace *mlx5_get_flow_namespace(struct mlx5_core_dev *dev,
}
EXPORT_SYMBOL(mlx5_get_flow_namespace);

struct mlx5_flow_namespace *mlx5_get_flow_vport_acl_namespace(struct mlx5_core_dev *dev,
							      enum mlx5_flow_namespace_type type,
							      int vport)
struct mlx5_flow_namespace *
mlx5_get_flow_vport_namespace(struct mlx5_core_dev *dev,
			      enum mlx5_flow_namespace_type type, int vport_idx)
{
	struct mlx5_flow_steering *steering = dev->priv.steering;

@@ -2775,25 +2775,43 @@ struct mlx5_flow_namespace *mlx5_get_flow_vport_acl_namespace(struct mlx5_core_d

	switch (type) {
	case MLX5_FLOW_NAMESPACE_ESW_EGRESS:
		if (vport >= steering->esw_egress_acl_vports)
		if (vport_idx >= steering->esw_egress_acl_vports)
			return NULL;
		if (steering->esw_egress_root_ns &&
		    steering->esw_egress_root_ns[vport])
			return &steering->esw_egress_root_ns[vport]->ns;
		    steering->esw_egress_root_ns[vport_idx])
			return &steering->esw_egress_root_ns[vport_idx]->ns;
		else
			return NULL;
	case MLX5_FLOW_NAMESPACE_ESW_INGRESS:
		if (vport >= steering->esw_ingress_acl_vports)
		if (vport_idx >= steering->esw_ingress_acl_vports)
			return NULL;
		if (steering->esw_ingress_root_ns &&
		    steering->esw_ingress_root_ns[vport])
			return &steering->esw_ingress_root_ns[vport]->ns;
		    steering->esw_ingress_root_ns[vport_idx])
			return &steering->esw_ingress_root_ns[vport_idx]->ns;
		else
			return NULL;
	case MLX5_FLOW_NAMESPACE_RDMA_TRANSPORT_RX:
		if (vport_idx >= steering->rdma_transport_rx_vports)
			return NULL;
		if (steering->rdma_transport_rx_root_ns &&
		    steering->rdma_transport_rx_root_ns[vport_idx])
			return &steering->rdma_transport_rx_root_ns[vport_idx]->ns;
		else
			return NULL;
	case MLX5_FLOW_NAMESPACE_RDMA_TRANSPORT_TX:
		if (vport_idx >= steering->rdma_transport_tx_vports)
			return NULL;

		if (steering->rdma_transport_tx_root_ns &&
		    steering->rdma_transport_tx_root_ns[vport_idx])
			return &steering->rdma_transport_tx_root_ns[vport_idx]->ns;
		else
			return NULL;
	default:
		return NULL;
	}
}
EXPORT_SYMBOL(mlx5_get_flow_vport_namespace);

static struct fs_prio *_fs_create_prio(struct mlx5_flow_namespace *ns,
				       unsigned int prio,
@@ -3199,6 +3217,127 @@ static int init_rdma_tx_root_ns(struct mlx5_flow_steering *steering)
	return err;
}

static int
init_rdma_transport_rx_root_ns_one(struct mlx5_flow_steering *steering,
				   int vport_idx)
{
	struct fs_prio *prio;

	steering->rdma_transport_rx_root_ns[vport_idx] =
		create_root_ns(steering, FS_FT_RDMA_TRANSPORT_RX);
	if (!steering->rdma_transport_rx_root_ns[vport_idx])
		return -ENOMEM;

	/* create 1 prio*/
	prio = fs_create_prio(&steering->rdma_transport_rx_root_ns[vport_idx]->ns,
			      MLX5_RDMA_TRANSPORT_BYPASS_PRIO, 1);
	return PTR_ERR_OR_ZERO(prio);
}

static int
init_rdma_transport_tx_root_ns_one(struct mlx5_flow_steering *steering,
				   int vport_idx)
{
	struct fs_prio *prio;

	steering->rdma_transport_tx_root_ns[vport_idx] =
		create_root_ns(steering, FS_FT_RDMA_TRANSPORT_TX);
	if (!steering->rdma_transport_tx_root_ns[vport_idx])
		return -ENOMEM;

	/* create 1 prio*/
	prio = fs_create_prio(&steering->rdma_transport_tx_root_ns[vport_idx]->ns,
			      MLX5_RDMA_TRANSPORT_BYPASS_PRIO, 1);
	return PTR_ERR_OR_ZERO(prio);
}

/* Allocate the per-vport array of RDMA TRANSPORT RX root namespaces and
 * initialize every slot.
 *
 * rdma_transport_rx_vports is only set once all slots were initialized.
 * On failure the slots below the failing index are cleaned up, the array is
 * freed and the array pointer is reset to NULL.
 *
 * Return: 0 on success, -ENOMEM if the array cannot be allocated, or the
 * error returned by init_rdma_transport_rx_root_ns_one().
 */
static int init_rdma_transport_rx_root_ns(struct mlx5_flow_steering *steering)
{
	struct mlx5_core_dev *dev = steering->dev;
	int nvports;
	int idx;
	int err;

	/* In case eswitch not supported and working in legacy mode: a
	 * reported count of 0 still gets a single namespace.
	 */
	nvports = mlx5_eswitch_get_total_vports(dev);
	if (!nvports)
		nvports = 1;

	steering->rdma_transport_rx_root_ns =
		kcalloc(nvports, sizeof(*steering->rdma_transport_rx_root_ns),
			GFP_KERNEL);
	if (!steering->rdma_transport_rx_root_ns)
		return -ENOMEM;

	for (idx = 0; idx < nvports; idx++) {
		err = init_rdma_transport_rx_root_ns_one(steering, idx);
		if (err)
			goto err_unwind;
	}

	steering->rdma_transport_rx_vports = nvports;
	return 0;

err_unwind:
	/* Walk back over the slots preceding the one that failed. */
	for (idx--; idx >= 0; idx--)
		cleanup_root_ns(steering->rdma_transport_rx_root_ns[idx]);
	kfree(steering->rdma_transport_rx_root_ns);
	steering->rdma_transport_rx_root_ns = NULL;
	return err;
}

/* Allocate the per-vport array of RDMA TRANSPORT TX root namespaces and
 * initialize every slot.
 *
 * rdma_transport_tx_vports is only set once all slots were initialized.
 * On failure the slots below the failing index are cleaned up, the array is
 * freed and the array pointer is reset to NULL.
 *
 * Return: 0 on success, -ENOMEM if the array cannot be allocated, or the
 * error returned by init_rdma_transport_tx_root_ns_one().
 */
static int init_rdma_transport_tx_root_ns(struct mlx5_flow_steering *steering)
{
	struct mlx5_core_dev *dev = steering->dev;
	int nvports;
	int idx;
	int err;

	/* In case eswitch not supported and working in legacy mode: a
	 * reported count of 0 still gets a single namespace.
	 */
	nvports = mlx5_eswitch_get_total_vports(dev);
	if (!nvports)
		nvports = 1;

	steering->rdma_transport_tx_root_ns =
		kcalloc(nvports, sizeof(*steering->rdma_transport_tx_root_ns),
			GFP_KERNEL);
	if (!steering->rdma_transport_tx_root_ns)
		return -ENOMEM;

	for (idx = 0; idx < nvports; idx++) {
		err = init_rdma_transport_tx_root_ns_one(steering, idx);
		if (err)
			goto err_unwind;
	}

	steering->rdma_transport_tx_vports = nvports;
	return 0;

err_unwind:
	/* Walk back over the slots preceding the one that failed. */
	for (idx--; idx >= 0; idx--)
		cleanup_root_ns(steering->rdma_transport_tx_root_ns[idx]);
	kfree(steering->rdma_transport_tx_root_ns);
	steering->rdma_transport_tx_root_ns = NULL;
	return err;
}

/* Tear down the RDMA TRANSPORT RX and TX per-vport root namespace arrays.
 *
 * For each array that was allocated, every slot up to the recorded vport
 * count is passed to cleanup_root_ns(), then the array is freed and its
 * pointer reset to NULL. An array that is NULL is skipped.
 */
static void cleanup_rdma_transport_roots_ns(struct mlx5_flow_steering *steering)
{
	int idx;

	if (steering->rdma_transport_rx_root_ns) {
		idx = 0;
		while (idx < steering->rdma_transport_rx_vports) {
			cleanup_root_ns(steering->rdma_transport_rx_root_ns[idx]);
			idx++;
		}

		kfree(steering->rdma_transport_rx_root_ns);
		steering->rdma_transport_rx_root_ns = NULL;
	}

	if (steering->rdma_transport_tx_root_ns) {
		idx = 0;
		while (idx < steering->rdma_transport_tx_vports) {
			cleanup_root_ns(steering->rdma_transport_tx_root_ns[idx]);
			idx++;
		}

		kfree(steering->rdma_transport_tx_root_ns);
		steering->rdma_transport_tx_root_ns = NULL;
	}
}

/* FT and tc chains are stored in the same array so we can re-use the
 * mlx5_get_fdb_sub_ns() and tc api for FT chains.
 * When creating a new ns for each chain store it in the first available slot.
@@ -3631,6 +3770,7 @@ void mlx5_fs_core_cleanup(struct mlx5_core_dev *dev)
	cleanup_root_ns(steering->rdma_rx_root_ns);
	cleanup_root_ns(steering->rdma_tx_root_ns);
	cleanup_root_ns(steering->egress_root_ns);
	cleanup_rdma_transport_roots_ns(steering);

	devl_params_unregister(priv_to_devlink(dev), mlx5_fs_params,
			       ARRAY_SIZE(mlx5_fs_params));
@@ -3700,6 +3840,18 @@ int mlx5_fs_core_init(struct mlx5_core_dev *dev)
			goto err;
	}

	if (MLX5_CAP_FLOWTABLE_RDMA_TRANSPORT_RX(dev, ft_support)) {
		err = init_rdma_transport_rx_root_ns(steering);
		if (err)
			goto err;
	}

	if (MLX5_CAP_FLOWTABLE_RDMA_TRANSPORT_TX(dev, ft_support)) {
		err = init_rdma_transport_tx_root_ns(steering);
		if (err)
			goto err;
	}

	return 0;

err:
@@ -3850,8 +4002,10 @@ mlx5_get_root_namespace(struct mlx5_core_dev *dev, enum mlx5_flow_namespace_type
	struct mlx5_flow_namespace *ns;

	if (ns_type == MLX5_FLOW_NAMESPACE_ESW_EGRESS ||
	    ns_type == MLX5_FLOW_NAMESPACE_ESW_INGRESS)
		ns = mlx5_get_flow_vport_acl_namespace(dev, ns_type, 0);
	    ns_type == MLX5_FLOW_NAMESPACE_ESW_INGRESS ||
	    ns_type == MLX5_FLOW_NAMESPACE_RDMA_TRANSPORT_TX ||
	    ns_type == MLX5_FLOW_NAMESPACE_RDMA_TRANSPORT_RX)
		ns = mlx5_get_flow_vport_namespace(dev, ns_type, 0);
	else
		ns = mlx5_get_flow_namespace(dev, ns_type);
	if (!ns)
Loading