Commit cf7e7377 authored by Carolina Jubran, committed by Jakub Kicinski
Browse files

net/mlx5: Manage TC arbiter nodes and implement full support for tc-bw



Introduce support for managing Traffic Class (TC) arbiter nodes and
associated vports TC nodes within the E-Switch QoS hierarchy. This
patch adds support for the new scheduling node type,
`SCHED_NODE_TYPE_VPORTS_TC_TSAR`, and implements full support for
setting tc-bw on both vports and nodes.

Key changes include:

- Introduced the new scheduling node type,
  `SCHED_NODE_TYPE_VPORTS_TC_TSAR`, for managing vports within the TC
  arbiter node.

- New helper functions for creating and destroying vports TC nodes
  under the TC arbiter.

- Updated the minimum rate normalization function to skip nodes of type
  `SCHED_NODE_TYPE_VPORTS_TC_TSAR`. Vports TC TSARs have bandwidth
  shares configured on them but not minimum rates, so their `min_rate`
  cannot be normalized.

- Implementation of `esw_qos_tc_arbiter_scheduling_setup()` and
  `esw_qos_tc_arbiter_scheduling_teardown()` for initializing and
  cleaning up TC arbiter scheduling elements. These functions now fully
  support tc-bw configuration on TC arbiter nodes.

- Introduced a new helper `esw_qos_calculate_tc_bw_divider()` to
  compute the total TC bandwidth share, which is used as a divider for
  normalizing each TC's share.

- Added `esw_qos_tc_arbiter_get_bw_shares()` and
  `esw_qos_set_tc_arbiter_bw_shares()` to handle the settings of
  bandwidth shares for vports traffic class TSARs.

- `esw_qos_set_tc_arbiter_bw_shares()` normalizes each TC share based
  on the total and the firmware's maximum allowed TSAR bandwidth share.

- Refactored `mlx5_esw_devlink_rate_node_tc_bw_set()` and
  `mlx5_esw_devlink_rate_leaf_tc_bw_set()` to fully support configuring
  tc-bw on devlink rate nodes and vports, respectively.

- Refactored `mlx5_esw_qos_node_update_parent()` to ensure that tc-bw
  configuration remains compatible with setting a parent on a rate
  node, preserving level hierarchy functionality.

- Refactored `esw_qos_calc_bw_share()` to generalize its input so it
  can be used for both minimum rate and bandwidth share calculations.

Signed-off-by: Carolina Jubran <cjubran@nvidia.com>
Reviewed-by: Cosmin Ratiu <cratiu@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Link: https://patch.msgid.link/20250629142138.361537-8-mbloch@nvidia.com


Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parent 97733d1e
Loading
Loading
Loading
Loading
+285 −9
Original line number Diff line number Diff line
@@ -67,6 +67,7 @@ enum sched_node_type {
	SCHED_NODE_TYPE_TC_ARBITER_TSAR,
	SCHED_NODE_TYPE_RATE_LIMITER,
	SCHED_NODE_TYPE_VPORT_TC,
	SCHED_NODE_TYPE_VPORTS_TC_TSAR,
};

static const char * const sched_node_type_str[] = {
@@ -75,6 +76,7 @@ static const char * const sched_node_type_str[] = {
	[SCHED_NODE_TYPE_TC_ARBITER_TSAR] = "TC Arbiter TSAR",
	[SCHED_NODE_TYPE_RATE_LIMITER] = "Rate Limiter",
	[SCHED_NODE_TYPE_VPORT_TC] = "vport TC",
	[SCHED_NODE_TYPE_VPORTS_TC_TSAR] = "vports TC TSAR",
};

struct mlx5_esw_sched_node {
@@ -187,6 +189,11 @@ mlx5_esw_qos_vport_get_parent(const struct mlx5_vport *vport)
static void esw_qos_sched_elem_warn(struct mlx5_esw_sched_node *node, int err, const char *op)
{
	switch (node->type) {
	case SCHED_NODE_TYPE_VPORTS_TC_TSAR:
		esw_warn(node->esw->dev,
			 "E-Switch %s %s scheduling element failed (tc=%d,err=%d)\n",
			 op, sched_node_type_str[node->type], node->tc, err);
		break;
	case SCHED_NODE_TYPE_VPORT_TC:
		esw_warn(node->esw->dev,
			 "E-Switch %s %s scheduling element failed (vport=%d,tc=%d,err=%d)\n",
@@ -345,11 +352,13 @@ static u32 esw_qos_calculate_min_rate_divider(struct mlx5_eswitch *esw,
	return 0;
}

static u32 esw_qos_calc_bw_share(u32 min_rate, u32 divider, u32 fw_max)
static u32 esw_qos_calc_bw_share(u32 value, u32 divider, u32 fw_max)
{
	if (!divider)
		return 0;
	return min_t(u32, max_t(u32, DIV_ROUND_UP(min_rate, divider), MLX5_MIN_BW_SHARE), fw_max);
	return min_t(u32, fw_max,
		     max_t(u32,
			   DIV_ROUND_UP(value, divider), MLX5_MIN_BW_SHARE));
}

static void esw_qos_update_sched_node_bw_share(struct mlx5_esw_sched_node *node,
@@ -376,7 +385,13 @@ static void esw_qos_normalize_min_rate(struct mlx5_eswitch *esw,
		if (node->esw != esw || node->ix == esw->qos.root_tsar_ix)
			continue;

		esw_qos_update_sched_node_bw_share(node, divider, extack);
		/* Vports TC TSARs don't have a minimum rate configured,
		 * so there's no need to update the bw_share on them.
		 */
		if (node->type != SCHED_NODE_TYPE_VPORTS_TC_TSAR) {
			esw_qos_update_sched_node_bw_share(node, divider,
							   extack);
		}

		if (list_empty(&node->children))
			continue;
@@ -385,6 +400,20 @@ static void esw_qos_normalize_min_rate(struct mlx5_eswitch *esw,
	}
}

/* Sum the DEVLINK_RATE_TCS_MAX per-TC bandwidth shares in @tc_bw. The result
 * is used as the divider when normalizing each TC's share.
 *
 * The sum is accumulated in 64 bits and saturated to U32_MAX. A 32-bit
 * accumulator could wrap for large share values, producing a bogus divider
 * or even zero, which would trip the WARN below although tc-bw is configured.
 * NOTE(review): this assumes callers do not bound the individual entries of
 * @tc_bw - confirm against the devlink tc-bw attribute policy.
 *
 * Return: the (saturated) sum of all shares, or 1 if every share is zero.
 */
static u32 esw_qos_calculate_tc_bw_divider(u32 *tc_bw)
{
	u64 total = 0;
	int i;

	for (i = 0; i < DEVLINK_RATE_TCS_MAX; i++)
		total += tc_bw[i];

	/* If total is zero, tc-bw config is disabled and we shouldn't reach
	 * here.
	 */
	if (WARN_ON(!total))
		return 1;

	return min_t(u64, total, U32_MAX);
}

static int esw_qos_set_node_min_rate(struct mlx5_esw_sched_node *node,
				     u32 min_rate, struct netlink_ext_ack *extack)
{
@@ -527,6 +556,149 @@ static void esw_qos_destroy_node(struct mlx5_esw_sched_node *node, struct netlin
	__esw_qos_free_node(node);
}

/* Create one vports TC TSAR for traffic class @tc under the TC arbiter
 * @parent.
 *
 * A node of type SCHED_NODE_TYPE_VPORTS_TC_TSAR is allocated with @parent as
 * its parent, and a DWRR TSAR scheduling element bound to @tc is created
 * below @parent's scheduling element.
 *
 * Return: 0 on success, -EOPNOTSUPP if the device lacks TSAR/DWRR support in
 * the E-Switch hierarchy, -ENOMEM if the node cannot be allocated, or the
 * error returned by esw_qos_node_create_sched_element().
 */
static int esw_qos_create_vports_tc_node(struct mlx5_esw_sched_node *parent,
					 u8 tc, struct netlink_ext_ack *extack)
{
	u32 tsar_ctx[MLX5_ST_SZ_DW(scheduling_context)] = {};
	struct mlx5_core_dev *dev = parent->esw->dev;
	struct mlx5_esw_sched_node *vports_tc_node;
	void *attr;
	int err;

	if (!mlx5_qos_element_type_supported(
		dev,
		SCHEDULING_CONTEXT_ELEMENT_TYPE_TSAR,
		SCHEDULING_HIERARCHY_E_SWITCH) ||
	    !mlx5_qos_tsar_type_supported(dev,
					  TSAR_ELEMENT_TSAR_TYPE_DWRR,
					  SCHEDULING_HIERARCHY_E_SWITCH)) {
		/* Report the reason to user space, as the TC arbiter element
		 * creation does for its own capability check.
		 */
		NL_SET_ERR_MSG_MOD(extack,
				   "E-Switch vports TC scheduling element is not supported");
		return -EOPNOTSUPP;
	}

	vports_tc_node = __esw_qos_alloc_node(parent->esw, 0,
					      SCHED_NODE_TYPE_VPORTS_TC_TSAR,
					      parent);
	if (!vports_tc_node) {
		NL_SET_ERR_MSG_MOD(extack, "E-Switch alloc node failed");
		esw_warn(dev, "Failed to alloc vports TC node (tc=%d)\n", tc);
		return -ENOMEM;
	}

	/* Record the traffic class before creating the scheduling element:
	 * esw_qos_sched_elem_warn() prints node->tc for this node type, so
	 * a warning raised for a failed creation (presumably emitted by
	 * esw_qos_node_create_sched_element() - TODO confirm) should report
	 * the TC being created rather than a not-yet-assigned value.
	 */
	vports_tc_node->tc = tc;

	attr = MLX5_ADDR_OF(scheduling_context, tsar_ctx, element_attributes);
	MLX5_SET(tsar_element, attr, tsar_type, TSAR_ELEMENT_TSAR_TYPE_DWRR);
	MLX5_SET(tsar_element, attr, traffic_class, tc);
	MLX5_SET(scheduling_context, tsar_ctx, parent_element_id, parent->ix);
	MLX5_SET(scheduling_context, tsar_ctx, element_type,
		 SCHEDULING_CONTEXT_ELEMENT_TYPE_TSAR);

	err = esw_qos_node_create_sched_element(vports_tc_node, tsar_ctx,
						extack);
	if (err)
		goto err_create_sched_element;

	return 0;

err_create_sched_element:
	__esw_qos_free_node(vports_tc_node);
	return err;
}

/* Snapshot the bandwidth shares currently stored on the children of
 * @tc_arbiter_node: for each child, its bw_share is written to @tc_bw at the
 * index given by the child's tc. Entries for which no child exists are left
 * untouched, so callers are expected to pre-initialize @tc_bw.
 */
static void
esw_qos_tc_arbiter_get_bw_shares(struct mlx5_esw_sched_node *tc_arbiter_node,
				 u32 *tc_bw)
{
	struct mlx5_esw_sched_node *child;

	list_for_each_entry(child, &tc_arbiter_node->children, entry) {
		tc_bw[child->tc] = child->bw_share;
	}
}

/* Program the bandwidth share of every vports TC TSAR under
 * @tc_arbiter_node from the per-TC shares in @tc_bw.
 *
 * Each TC's share is scaled to the firmware's max_tsar_bw_share and divided
 * (rounding up) by the sum of all shares, then clamped by
 * esw_qos_calc_bw_share() before being written with
 * esw_qos_sched_elem_config().
 *
 * The scaling product is computed in 64 bits: as a 32-bit multiplication,
 * tc_bw[tc] * fw_max_bw_share could wrap before the division and yield an
 * arbitrary share.
 * NOTE(review): this assumes the entries of @tc_bw are not bounded by the
 * caller - confirm against the devlink tc-bw attribute policy.
 */
static void
esw_qos_set_tc_arbiter_bw_shares(struct mlx5_esw_sched_node *tc_arbiter_node,
				 u32 *tc_bw, struct netlink_ext_ack *extack)
{
	struct mlx5_eswitch *esw = tc_arbiter_node->esw;
	struct mlx5_esw_sched_node *vports_tc_node;
	u32 divider, fw_max_bw_share;

	fw_max_bw_share = MLX5_CAP_QOS(esw->dev, max_tsar_bw_share);
	/* Never zero: the helper falls back to 1 when all shares are zero. */
	divider = esw_qos_calculate_tc_bw_divider(tc_bw);
	list_for_each_entry(vports_tc_node, &tc_arbiter_node->children, entry) {
		u8 tc = vports_tc_node->tc;
		u32 bw_share;
		u64 scaled;

		scaled = (u64)tc_bw[tc] * fw_max_bw_share;
		scaled = DIV_ROUND_UP_ULL(scaled, divider);
		/* The division is already done above, so pass a divider of 1
		 * and let the helper apply only the min/max clamping.
		 */
		bw_share = esw_qos_calc_bw_share(min_t(u64, scaled, U32_MAX),
						 1, fw_max_bw_share);
		esw_qos_sched_elem_config(vports_tc_node, 0, bw_share, extack);
	}
}

static void
esw_qos_destroy_vports_tc_nodes(struct mlx5_esw_sched_node *tc_arbiter_node,
				struct netlink_ext_ack *extack)
{
	struct mlx5_esw_sched_node *vports_tc_node, *tmp;

	list_for_each_entry_safe(vports_tc_node, tmp,
				 &tc_arbiter_node->children, entry)
		esw_qos_destroy_node(vports_tc_node, extack);
}

/* Create one vports TC node under @tc_arbiter_node for each traffic class
 * index in [0, esw_qos_num_tcs()).
 *
 * If any creation fails, all children of @tc_arbiter_node are destroyed
 * again (without reporting through extack) and the failing error code is
 * returned.
 *
 * Return: 0 on success, otherwise the error from
 * esw_qos_create_vports_tc_node().
 */
static int
esw_qos_create_vports_tc_nodes(struct mlx5_esw_sched_node *tc_arbiter_node,
			       struct netlink_ext_ack *extack)
{
	int tc_count = esw_qos_num_tcs(tc_arbiter_node->esw->dev);
	int tc;

	for (tc = 0; tc < tc_count; tc++) {
		int ret = esw_qos_create_vports_tc_node(tc_arbiter_node, tc,
							extack);

		if (!ret)
			continue;

		/* Roll back the nodes created so far. */
		esw_qos_destroy_vports_tc_nodes(tc_arbiter_node, NULL);
		return ret;
	}

	return 0;
}

static int esw_qos_create_tc_arbiter_sched_elem(
		struct mlx5_esw_sched_node *tc_arbiter_node,
		struct netlink_ext_ack *extack)
{
	u32 tsar_ctx[MLX5_ST_SZ_DW(scheduling_context)] = {};
	u32 tsar_parent_ix;
	void *attr;

	if (!mlx5_qos_tsar_type_supported(tc_arbiter_node->esw->dev,
					  TSAR_ELEMENT_TSAR_TYPE_TC_ARB,
					  SCHEDULING_HIERARCHY_E_SWITCH)) {
		NL_SET_ERR_MSG_MOD(extack,
				   "E-Switch TC Arbiter scheduling element is not supported");
		return -EOPNOTSUPP;
	}

	attr = MLX5_ADDR_OF(scheduling_context, tsar_ctx, element_attributes);
	MLX5_SET(tsar_element, attr, tsar_type, TSAR_ELEMENT_TSAR_TYPE_TC_ARB);
	tsar_parent_ix = tc_arbiter_node->parent ? tc_arbiter_node->parent->ix :
			 tc_arbiter_node->esw->qos.root_tsar_ix;
	MLX5_SET(scheduling_context, tsar_ctx, parent_element_id,
		 tsar_parent_ix);
	MLX5_SET(scheduling_context, tsar_ctx, element_type,
		 SCHEDULING_CONTEXT_ELEMENT_TYPE_TSAR);
	MLX5_SET(scheduling_context, tsar_ctx, max_average_bw,
		 tc_arbiter_node->max_rate);
	MLX5_SET(scheduling_context, tsar_ctx, bw_share,
		 tc_arbiter_node->bw_share);

	return esw_qos_node_create_sched_element(tc_arbiter_node, tsar_ctx,
						 extack);
}

static struct mlx5_esw_sched_node *
__esw_qos_create_vports_sched_node(struct mlx5_eswitch *esw, struct mlx5_esw_sched_node *parent,
				   struct netlink_ext_ack *extack)
@@ -591,6 +763,9 @@ static void __esw_qos_destroy_node(struct mlx5_esw_sched_node *node, struct netl
{
	struct mlx5_eswitch *esw = node->esw;

	if (node->type == SCHED_NODE_TYPE_TC_ARBITER_TSAR)
		esw_qos_destroy_vports_tc_nodes(node, extack);

	trace_mlx5_esw_node_qos_destroy(esw->dev, node, node->ix);
	esw_qos_destroy_node(node, extack);
	esw_qos_normalize_min_rate(esw, NULL, extack);
@@ -685,13 +860,38 @@ static void esw_qos_put(struct mlx5_eswitch *esw)
static void
esw_qos_tc_arbiter_scheduling_teardown(struct mlx5_esw_sched_node *node,
				       struct netlink_ext_ack *extack)
{}
{
	/* Clean up all Vports TC nodes within the TC arbiter node. */
	esw_qos_destroy_vports_tc_nodes(node, extack);
	/* Destroy the scheduling element for the TC arbiter node itself. */
	esw_qos_node_destroy_sched_element(node, extack);
}

static int esw_qos_tc_arbiter_scheduling_setup(struct mlx5_esw_sched_node *node,
					       struct netlink_ext_ack *extack)
{
	NL_SET_ERR_MSG_MOD(extack, "TC arbiter elements are not supported.");
	return -EOPNOTSUPP;
	u32 curr_ix = node->ix;
	int err;

	err = esw_qos_create_tc_arbiter_sched_elem(node, extack);
	if (err)
		return err;
	/* Initialize the vports TC nodes within created TC arbiter TSAR. */
	err = esw_qos_create_vports_tc_nodes(node, extack);
	if (err)
		goto err_vports_tc_nodes;

	node->type = SCHED_NODE_TYPE_TC_ARBITER_TSAR;

	return 0;

err_vports_tc_nodes:
	/* If initialization fails, clean up the scheduling element
	 * for the TC arbiter node.
	 */
	esw_qos_node_destroy_sched_element(node, NULL);
	node->ix = curr_ix;
	return err;
}

static int
@@ -1064,6 +1264,7 @@ static int esw_qos_vport_update(struct mlx5_vport *vport,
{
	struct mlx5_esw_sched_node *curr_parent = vport->qos.sched_node->parent;
	enum sched_node_type curr_type = vport->qos.sched_node->type;
	u32 curr_tc_bw[DEVLINK_RATE_TCS_MAX] = {0};
	int err;

	esw_assert_qos_lock_held(vport->dev->priv.eswitch);
@@ -1075,11 +1276,23 @@ static int esw_qos_vport_update(struct mlx5_vport *vport,
	if (err)
		return err;

	if (curr_type == SCHED_NODE_TYPE_TC_ARBITER_TSAR && curr_type == type) {
		esw_qos_tc_arbiter_get_bw_shares(vport->qos.sched_node,
						 curr_tc_bw);
	}

	esw_qos_vport_disable(vport, extack);

	err = esw_qos_vport_enable(vport, type, parent, extack);
	if (err)
	if (err) {
		esw_qos_vport_enable(vport, curr_type, curr_parent, NULL);
		extack = NULL;
	}

	if (curr_type == SCHED_NODE_TYPE_TC_ARBITER_TSAR && curr_type == type) {
		esw_qos_set_tc_arbiter_bw_shares(vport->qos.sched_node,
						 curr_tc_bw, extack);
	}

	return err;
}
@@ -1563,6 +1776,8 @@ int mlx5_esw_devlink_rate_leaf_tc_bw_set(struct devlink_rate *rate_leaf,
					   SCHED_NODE_TYPE_TC_ARBITER_TSAR,
					   NULL, extack);
	}
	if (!err)
		esw_qos_set_tc_arbiter_bw_shares(vport_node, tc_bw, extack);
unlock:
	esw_qos_unlock(esw);
	return err;
@@ -1592,6 +1807,8 @@ int mlx5_esw_devlink_rate_node_tc_bw_set(struct devlink_rate *rate_node,
	}

	err = esw_qos_node_enable_tc_arbitration(node, extack);
	if (!err)
		esw_qos_set_tc_arbiter_bw_shares(node, tc_bw, extack);
unlock:
	esw_qos_unlock(esw);
	return err;
@@ -1716,6 +1933,20 @@ int mlx5_esw_devlink_rate_leaf_parent_set(struct devlink_rate *devlink_rate,
	return mlx5_esw_qos_vport_update_parent(vport, node, extack);
}

/* Tell whether @node is empty.
 *
 * A node without children is empty. A TC arbiter node that has children is
 * judged by descending into its first child and applying the same test
 * there; any other node type with children is not empty.
 */
static bool esw_qos_is_node_empty(struct mlx5_esw_sched_node *node)
{
	for (;;) {
		if (list_empty(&node->children))
			return true;

		if (node->type != SCHED_NODE_TYPE_TC_ARBITER_TSAR)
			return false;

		/* Only the first child of a TC arbiter is inspected. */
		node = list_first_entry(&node->children,
					struct mlx5_esw_sched_node, entry);
	}
}

static int
mlx5_esw_qos_node_validate_set_parent(struct mlx5_esw_sched_node *node,
				      struct mlx5_esw_sched_node *parent,
@@ -1729,13 +1960,26 @@ mlx5_esw_qos_node_validate_set_parent(struct mlx5_esw_sched_node *node,
		return -EOPNOTSUPP;
	}

	if (!list_empty(&node->children)) {
	if (!esw_qos_is_node_empty(node)) {
		NL_SET_ERR_MSG_MOD(extack,
				   "Cannot reassign a node that contains rate objects");
		return -EOPNOTSUPP;
	}

	if (parent && parent->type == SCHED_NODE_TYPE_TC_ARBITER_TSAR) {
		NL_SET_ERR_MSG_MOD(extack,
				   "Cannot attach a node to a parent with TC bandwidth configured");
		return -EOPNOTSUPP;
	}

	new_level = parent ? parent->level + 1 : 2;
	if (node->type == SCHED_NODE_TYPE_TC_ARBITER_TSAR) {
		/* Increase by one to account for the vports TC scheduling
		 * element.
		 */
		new_level += 1;
	}

	max_level = 1 << MLX5_CAP_QOS(node->esw->dev, log_esw_max_sched_depth);
	if (new_level > max_level) {
		NL_SET_ERR_MSG_MOD(extack,
@@ -1746,6 +1990,32 @@ mlx5_esw_qos_node_validate_set_parent(struct mlx5_esw_sched_node *node,
	return 0;
}

static int
esw_qos_tc_arbiter_node_update_parent(struct mlx5_esw_sched_node *node,
				      struct mlx5_esw_sched_node *parent,
				      struct netlink_ext_ack *extack)
{
	struct mlx5_esw_sched_node *curr_parent = node->parent;
	u32 curr_tc_bw[DEVLINK_RATE_TCS_MAX] = {0};
	struct mlx5_eswitch *esw = node->esw;
	int err;

	esw_qos_tc_arbiter_get_bw_shares(node, curr_tc_bw);
	esw_qos_tc_arbiter_scheduling_teardown(node, extack);
	esw_qos_node_set_parent(node, parent);
	err = esw_qos_tc_arbiter_scheduling_setup(node, extack);
	if (err) {
		esw_qos_node_set_parent(node, curr_parent);
		if (esw_qos_tc_arbiter_scheduling_setup(node, extack)) {
			esw_warn(esw->dev, "Node restore QoS failed\n");
			return err;
		}
	}
	esw_qos_set_tc_arbiter_bw_shares(node, curr_tc_bw, extack);

	return err;
}

static int esw_qos_vports_node_update_parent(struct mlx5_esw_sched_node *node,
					     struct mlx5_esw_sched_node *parent,
					     struct netlink_ext_ack *extack)
@@ -1791,7 +2061,13 @@ static int mlx5_esw_qos_node_update_parent(struct mlx5_esw_sched_node *node,

	esw_qos_lock(esw);
	curr_parent = node->parent;
	if (node->type == SCHED_NODE_TYPE_TC_ARBITER_TSAR) {
		err = esw_qos_tc_arbiter_node_update_parent(node, parent,
							    extack);
	} else {
		err = esw_qos_vports_node_update_parent(node, parent, extack);
	}

	if (err)
		goto out;