Commit 50f1d188, authored by Or Har-Toov, committed by Leon Romanovsky
Browse files

net/mlx5: Propagate LAG effective max_tx_speed to vports



Currently, vports report only their parent's uplink speed, which in LAG
setups does not reflect the true aggregated bandwidth. This makes it
hard for upper-layer software to optimize load balancing decisions
based on accurate bandwidth information.

Fix the issue by calculating the possible maximum speed of a LAG as
the sum of speeds of all active uplinks that are part of the LAG.
Propagate this effective max speed to vports associated with the LAG
whenever a relevant event occurs, such as physical port link state
changes or LAG creation/modification.

With this change, upper-layer components receive accurate bandwidth
information corresponding to the active members of the LAG and can
make better load balancing decisions.

Signed-off-by: Or Har-Toov <ohartoov@nvidia.com>
Reviewed-by: Maher Sanalla <msanalla@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Signed-off-by: Edward Srouji <edwards@nvidia.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
parent 3df5dd46
Loading
Loading
Loading
Loading
+158 −0
Original line number Diff line number Diff line
@@ -996,6 +996,126 @@ static bool mlx5_lag_should_disable_lag(struct mlx5_lag *ldev, bool do_bond)
	       ldev->mode != MLX5_LAG_MODE_MPESW;
}

#ifdef CONFIG_MLX5_ESWITCH
/*
 * Add up, into @sum_speed, the speed that @get_speed reports for every PF
 * device present in @ldev.
 *
 * @sum_speed is zeroed before the walk. Returns 0 once all devices have been
 * visited. If @get_speed fails for a device, the walk stops, that error is
 * returned and @sum_speed only holds the speeds collected so far.
 */
static int
mlx5_lag_sum_devices_speed(struct mlx5_lag *ldev, u32 *sum_speed,
			   int (*get_speed)(struct mlx5_core_dev *, u32 *))
{
	struct mlx5_core_dev *pf_dev;
	u32 dev_speed;
	int err;
	int i;

	*sum_speed = 0;
	mlx5_ldev_for_each(i, 0, ldev) {
		pf_dev = ldev->pf[i].dev;
		if (!pf_dev)
			continue;

		err = get_speed(pf_dev, &dev_speed);
		if (!err) {
			*sum_speed += dev_speed;
			continue;
		}

		/* One unreadable device makes the whole sum unusable. */
		mlx5_core_dbg(pf_dev,
			      "Failed to get device speed using %ps. Device %s speed is not available (err=%d)\n",
			      get_speed, dev_name(pf_dev->device),
			      err);
		return err;
	}

	return 0;
}

/*
 * Sum, over every PF device in @ldev, the speed reported by
 * mlx5_port_max_linkspeed() and store the total in @max_speed.
 *
 * Returns 0 on success, or the error of the first device whose speed could
 * not be read.
 */
static int mlx5_lag_sum_devices_max_speed(struct mlx5_lag *ldev, u32 *max_speed)
{
	return mlx5_lag_sum_devices_speed(ldev, max_speed,
					  mlx5_port_max_linkspeed);
}

/*
 * Set max_tx_speed to @speed on every eswitch vport of @mdev except the
 * uplink.
 *
 * @speed is handed to mlx5_modify_vport_max_tx_speed() as is, so callers
 * must already have scaled it to the units that command expects.
 *
 * Nothing is done when @mdev has no eswitch or when the
 * esw_vport_state_max_tx_speed capability is not set. A failure on one vport
 * is only logged; the remaining vports are still updated.
 */
static void mlx5_lag_modify_device_vports_speed(struct mlx5_core_dev *mdev,
						u32 speed)
{
	u16 op_mod = MLX5_VPORT_STATE_OP_MOD_ESW_VPORT;
	struct mlx5_eswitch *esw = mdev->priv.eswitch;
	struct mlx5_vport *vport;
	unsigned long i;
	int ret;

	if (!esw)
		return;

	if (!MLX5_CAP_ESW(mdev, esw_vport_state_max_tx_speed))
		return;

	/*
	 * mlx5_modify_vport_max_tx_speed() takes a u16. Saturate here rather
	 * than let the implicit conversion wrap a large aggregate speed into
	 * an arbitrarily small one.
	 */
	if (speed > U16_MAX)
		speed = U16_MAX;

	mlx5_esw_for_each_vport(esw, i, vport) {
		if (!vport)
			continue;

		if (vport->vport == MLX5_VPORT_UPLINK)
			continue;

		ret = mlx5_modify_vport_max_tx_speed(mdev, op_mod,
						     vport->vport, true, speed);
		if (ret)
			mlx5_core_dbg(mdev,
				      "Failed to set vport %d speed %u, err=%d\n",
				      vport->vport, speed, ret);
	}
}

/*
 * Propagate the aggregate LAG speed to the vports of every device in @ldev.
 *
 * The speed comes from the tracker's bond_speed_mbps:
 *  - SPEED_UNKNOWN: nothing is changed;
 *  - 0: the sum of the max link speeds of all PFs is used instead, and
 *    nothing is changed if that sum cannot be computed;
 *  - anything else: used as is.
 * The value is divided by MLX5_MAX_TX_SPEED_UNIT before being programmed.
 */
void mlx5_lag_set_vports_agg_speed(struct mlx5_lag *ldev)
{
	u32 agg_speed = ldev->tracker.bond_speed_mbps;
	struct mlx5_core_dev *pf_dev;
	int i;

	if (agg_speed == SPEED_UNKNOWN)
		return;

	/* If speed is not set, use the sum of max speeds of all PFs */
	if (!agg_speed) {
		if (mlx5_lag_sum_devices_max_speed(ldev, &agg_speed))
			return;
	}

	agg_speed /= MLX5_MAX_TX_SPEED_UNIT;

	mlx5_ldev_for_each(i, 0, ldev) {
		pf_dev = ldev->pf[i].dev;
		if (pf_dev)
			mlx5_lag_modify_device_vports_speed(pf_dev, agg_speed);
	}
}

/*
 * For each device in @ldev, set its vports' speed to that device's own
 * operational port speed (divided by MLX5_MAX_TX_SPEED_UNIT).
 *
 * A device whose operational speed cannot be read is logged and skipped; the
 * other devices are still processed.
 */
void mlx5_lag_reset_vports_speed(struct mlx5_lag *ldev)
{
	struct mlx5_core_dev *pf_dev;
	u32 oper_speed;
	int err;
	int i;

	mlx5_ldev_for_each(i, 0, ldev) {
		pf_dev = ldev->pf[i].dev;
		if (!pf_dev)
			continue;

		err = mlx5_port_oper_linkspeed(pf_dev, &oper_speed);
		if (!err) {
			oper_speed /= MLX5_MAX_TX_SPEED_UNIT;
			mlx5_lag_modify_device_vports_speed(pf_dev, oper_speed);
			continue;
		}

		mlx5_core_dbg(pf_dev,
			      "Failed to reset vports speed for device %s. Oper speed is not available (err=%d)\n",
			      dev_name(pf_dev->device), err);
	}
}
#endif

static void mlx5_do_bond(struct mlx5_lag *ldev)
{
	int idx = mlx5_lag_get_dev_index_by_seq(ldev, MLX5_LAG_P1);
@@ -1083,9 +1203,12 @@ static void mlx5_do_bond(struct mlx5_lag *ldev)
						     ndev);
			dev_put(ndev);
		}
		mlx5_lag_set_vports_agg_speed(ldev);
	} else if (mlx5_lag_should_modify_lag(ldev, do_bond)) {
		mlx5_modify_lag(ldev, &tracker);
		mlx5_lag_set_vports_agg_speed(ldev);
	} else if (mlx5_lag_should_disable_lag(ldev, do_bond)) {
		mlx5_lag_reset_vports_speed(ldev);
		mlx5_disable_lag(ldev);
	}
}
@@ -1286,6 +1409,38 @@ static int mlx5_handle_changeinfodata_event(struct mlx5_lag *ldev,
	return 1;
}

static void mlx5_lag_update_tracker_speed(struct lag_tracker *tracker,
					  struct net_device *ndev)
{
	struct ethtool_link_ksettings lksettings;
	struct net_device *bond_dev;
	int err;

	if (netif_is_lag_master(ndev))
		bond_dev = ndev;
	else
		bond_dev = netdev_master_upper_dev_get(ndev);

	if (!bond_dev) {
		tracker->bond_speed_mbps = SPEED_UNKNOWN;
		return;
	}

	err = __ethtool_get_link_ksettings(bond_dev, &lksettings);
	if (err) {
		netdev_dbg(bond_dev,
			   "Failed to get speed for bond dev %s, err=%d\n",
			   bond_dev->name, err);
		tracker->bond_speed_mbps = SPEED_UNKNOWN;
		return;
	}

	if (lksettings.base.speed == SPEED_UNKNOWN)
		tracker->bond_speed_mbps = 0;
	else
		tracker->bond_speed_mbps = lksettings.base.speed;
}

/* this handler is always registered to netdev events */
static int mlx5_lag_netdev_event(struct notifier_block *this,
				 unsigned long event, void *ptr)
@@ -1317,6 +1472,9 @@ static int mlx5_lag_netdev_event(struct notifier_block *this,
		break;
	}

	if (changed)
		mlx5_lag_update_tracker_speed(&tracker, ndev);

	ldev->tracker = tracker;

	if (changed)
+9 −0
Original line number Diff line number Diff line
@@ -48,6 +48,7 @@ struct lag_tracker {
	unsigned int is_bonded:1;
	unsigned int has_inactive:1;
	enum netdev_lag_hash hash_type;
	u32 bond_speed_mbps;
};

/* LAG data of a ConnectX card.
@@ -116,6 +117,14 @@ int mlx5_deactivate_lag(struct mlx5_lag *ldev);
void mlx5_lag_add_devices(struct mlx5_lag *ldev);
struct mlx5_devcom_comp_dev *mlx5_lag_get_devcom_comp(struct mlx5_lag *ldev);

#ifdef CONFIG_MLX5_ESWITCH
/* Program the LAG's aggregate speed on the vports of every device in @ldev. */
void mlx5_lag_set_vports_agg_speed(struct mlx5_lag *ldev);
/* Set each device's vports back to that device's own operational port speed. */
void mlx5_lag_reset_vports_speed(struct mlx5_lag *ldev);
#else
/* Without CONFIG_MLX5_ESWITCH both helpers are empty stubs. */
static inline void mlx5_lag_set_vports_agg_speed(struct mlx5_lag *ldev) {}
static inline void mlx5_lag_reset_vports_speed(struct mlx5_lag *ldev) {}
#endif

static inline bool mlx5_lag_is_supported(struct mlx5_core_dev *dev)
{
	if (!MLX5_CAP_GEN(dev, vport_group_manager) ||
+1 −0
Original line number Diff line number Diff line
@@ -381,6 +381,7 @@ const struct mlx5_link_info *mlx5_port_ptys2info(struct mlx5_core_dev *mdev,
u32 mlx5_port_info2linkmodes(struct mlx5_core_dev *mdev,
			     struct mlx5_link_info *info,
			     bool force_legacy);
/*
 * Store in @speed the highest speed among the link modes currently flagged
 * operational on the port (0 if none is). Returns 0 on success or the error
 * from the eth proto query.
 */
int mlx5_port_oper_linkspeed(struct mlx5_core_dev *mdev, u32 *speed);
int mlx5_port_max_linkspeed(struct mlx5_core_dev *mdev, u32 *speed);

#define MLX5_PPS_CAP(mdev) (MLX5_CAP_GEN((mdev), pps) &&		\
+24 −0
Original line number Diff line number Diff line
@@ -1200,6 +1200,30 @@ u32 mlx5_port_info2linkmodes(struct mlx5_core_dev *mdev,
	return link_modes;
}

/*
 * Report the operational link speed of @mdev's port.
 *
 * Queries the eth proto state, then scans the link mode table and keeps the
 * largest speed among the modes whose bit is set in the operational mask.
 * @speed is set to 0 when no table entry matches.
 *
 * Returns 0 on success, or the error from mlx5_port_query_eth_proto(), in
 * which case @speed is left untouched.
 */
int mlx5_port_oper_linkspeed(struct mlx5_core_dev *mdev, u32 *speed)
{
	const struct mlx5_link_info *modes;
	struct mlx5_port_eth_proto proto;
	u32 num_modes;
	u32 best = 0;
	int idx;
	int ret;

	ret = mlx5_port_query_eth_proto(mdev, 1, mlx5_ptys_ext_supported(mdev),
					&proto);
	if (ret)
		return ret;

	mlx5e_port_get_link_mode_info_arr(mdev, &modes, &num_modes, false);
	for (idx = 0; idx < num_modes; ++idx) {
		if (!(proto.oper & MLX5E_PROT_MASK(idx)))
			continue;

		if (modes[idx].speed > best)
			best = modes[idx].speed;
	}

	*speed = best;
	return 0;
}

int mlx5_port_max_linkspeed(struct mlx5_core_dev *mdev, u32 *speed)
{
	const struct mlx5_link_info *table;
+45 −0
Original line number Diff line number Diff line
@@ -62,6 +62,28 @@ u8 mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport)
	return MLX5_GET(query_vport_state_out, out, state);
}

/*
 * Issue QUERY_VPORT_STATE for @vport and return the admin_state field of the
 * reply through @admin_state.
 *
 * Returns 0 on success, or the command execution error, in which case
 * @admin_state is not written.
 */
static int mlx5_query_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod,
					u16 vport, u8 other_vport,
					u8 *admin_state)
{
	u32 in[MLX5_ST_SZ_DW(query_vport_state_in)] = {};
	u32 out[MLX5_ST_SZ_DW(query_vport_state_out)] = {};
	int ret;

	MLX5_SET(query_vport_state_in, in, opcode,
		 MLX5_CMD_OP_QUERY_VPORT_STATE);
	MLX5_SET(query_vport_state_in, in, op_mod, opmod);
	MLX5_SET(query_vport_state_in, in, other_vport, other_vport);
	MLX5_SET(query_vport_state_in, in, vport_number, vport);

	ret = mlx5_cmd_exec_inout(mdev, query_vport_state, in, out);
	if (!ret)
		*admin_state = MLX5_GET(query_vport_state_out, out,
					admin_state);

	return ret;
}

int mlx5_modify_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod,
				  u16 vport, u8 other_vport, u8 state)
{
@@ -77,6 +99,29 @@ int mlx5_modify_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod,
	return mlx5_cmd_exec_in(mdev, modify_vport_state, in);
}

/*
 * Set @max_tx_speed on @vport with a MODIFY_VPORT_STATE command.
 *
 * The vport's current admin state is queried first and written back in the
 * same command.
 * NOTE(review): presumably this keeps the command from changing the admin
 * state as a side effect - confirm against the command definition.
 *
 * Returns 0 on success, or the error from either the query or the modify
 * command.
 */
int mlx5_modify_vport_max_tx_speed(struct mlx5_core_dev *mdev, u8 opmod,
				   u16 vport, u8 other_vport, u16 max_tx_speed)
{
	u32 in[MLX5_ST_SZ_DW(modify_vport_state_in)] = {};
	u8 cur_state;
	int ret;

	ret = mlx5_query_vport_admin_state(mdev, opmod, vport, other_vport,
					   &cur_state);
	if (ret)
		return ret;

	MLX5_SET(modify_vport_state_in, in, opcode,
		 MLX5_CMD_OP_MODIFY_VPORT_STATE);
	MLX5_SET(modify_vport_state_in, in, op_mod, opmod);
	MLX5_SET(modify_vport_state_in, in, other_vport, other_vport);
	MLX5_SET(modify_vport_state_in, in, vport_number, vport);
	MLX5_SET(modify_vport_state_in, in, max_tx_speed, max_tx_speed);
	MLX5_SET(modify_vport_state_in, in, admin_state, cur_state);

	return mlx5_cmd_exec_in(mdev, modify_vport_state, in);
}

static int mlx5_query_nic_vport_context(struct mlx5_core_dev *mdev, u16 vport,
					bool other_vport, u32 *out)
{
Loading