Commit cd8a4cfa authored by Paolo Abeni
Browse files

Merge branch 'e-switch-vport-sharing-delegation'

Saeed Mahameed says:

====================
E-Switch vport sharing & delegation

An mlx5 E-Switch FDB table can manage vports belonging to other sibling
physical functions, such as ECPF (ARM embedded cores) and Host PF (x86).
This enables a single source of truth for SDN software to manage network
pipelines from one host. While such functionality already exists in mlx5,
it is currently limited by static vport allocation,
meaning the number of vports shared between multi-host functions
must be known pre-boot.

This patchset enables delegated/external vports to be discovered
dynamically when switchdev mode is enabled, leveraging new firmware
capabilities for dynamic vport creation.

Adjacent functions that delegate their SR-IOV VFs to a sibling PF can be
dynamically discovered when that sibling PF enables switchdev mode,
after SR-IOV was enabled on the originating PF. This allows for more
flexible and scalable management in multi-host and ECPF-to-host
scenarios.

The patchset consists of the following changes:

- Refactoring of ACL root namespace handling: The storage of vport ACL root
  namespaces is converted from a linear array to an xarray, allowing dynamic
  creation of ACLs per individual vport.
- Improvements for vhca_id to vport mapping.
- Dynamic querying and creation of delegated functions/vports.
====================

Link: https://patch.msgid.link/20250829223722.900629-1-saeed@kernel.org


Signed-off-by: Paolo Abeni <pabeni@redhat.com>
parents 48195dd1 0c2a02f3
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -69,7 +69,7 @@ mlx5_core-$(CONFIG_MLX5_TC_SAMPLE) += en/tc/sample.o
# Core extra
#
mlx5_core-$(CONFIG_MLX5_ESWITCH)   += eswitch.o eswitch_offloads.o eswitch_offloads_termtbl.o \
				      ecpf.o rdma.o esw/legacy.o \
				      ecpf.o rdma.o esw/legacy.o esw/adj_vport.o \
				      esw/devlink_port.o esw/vporttbl.o esw/qos.o esw/ipsec.o

mlx5_core-$(CONFIG_MLX5_ESWITCH)   += esw/acl/helper.o \
+209 −0
Original line number Diff line number Diff line
// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
// Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.

#include "fs_core.h"
#include "eswitch.h"

/* Values passed as the 'connect' argument of mlx5_esw_adj_vport_modify(),
 * which writes the chosen value into both the ingress_connect and the
 * egress_connect fields of the MODIFY_VPORT_STATE command.
 */
enum {
	MLX5_ADJ_VPORT_DISCONNECT = 0x0,
	MLX5_ADJ_VPORT_CONNECT = 0x1,
};

/* Send a MODIFY_VPORT_STATE command (op_mod ESW_VPORT, other_vport set)
 * for @vport. Both the ingress and the egress connect fields are flagged
 * as valid and are set to @connect.
 *
 * Return: the result of executing the command.
 */
static int mlx5_esw_adj_vport_modify(struct mlx5_core_dev *dev,
				     u16 vport, bool connect)
{
	u32 in[MLX5_ST_SZ_DW(modify_vport_state_in)] = {};

	/* Which vport the command targets. */
	MLX5_SET(modify_vport_state_in, in, vport_number, vport);
	MLX5_SET(modify_vport_state_in, in, other_vport, 1);

	/* Ingress side: mark the field valid and apply the requested value. */
	MLX5_SET(modify_vport_state_in, in, ingress_connect_valid, 1);
	MLX5_SET(modify_vport_state_in, in, ingress_connect, connect);

	/* Egress side: same treatment. */
	MLX5_SET(modify_vport_state_in, in, egress_connect_valid, 1);
	MLX5_SET(modify_vport_state_in, in, egress_connect, connect);

	/* Command header. */
	MLX5_SET(modify_vport_state_in, in, op_mod,
		 MLX5_VPORT_STATE_OP_MOD_ESW_VPORT);
	MLX5_SET(modify_vport_state_in, in, opcode,
		 MLX5_CMD_OP_MODIFY_VPORT_STATE);

	return mlx5_cmd_exec_in(dev, modify_vport_state, in);
}

/* Send a DESTROY_ESW_VPORT command for vport number @vport.
 *
 * The function returns void: the status of the command execution is not
 * propagated to the caller.
 */
static void mlx5_esw_destroy_esw_vport(struct mlx5_core_dev *dev, u16 vport)
{
	u32 in[MLX5_ST_SZ_DW(destroy_esw_vport_in)] = {};

	MLX5_SET(destroy_esw_vport_in, in, vport_num, vport);
	MLX5_SET(destroy_esw_vport_in, in, opcode,
		 MLX5_CMD_OPCODE_DESTROY_ESW_VPORT);

	mlx5_cmd_exec_in(dev, destroy_esw_vport, in);
}

/* Send a CREATE_ESW_VPORT command with managed_vhca_id set to @vhca_id.
 *
 * On success the vport_num field of the command output is stored in
 * @vport_num. On failure @vport_num is left untouched.
 *
 * Return: 0 on success, otherwise the error from mlx5_cmd_exec().
 */
static int mlx5_esw_create_esw_vport(struct mlx5_core_dev *dev, u16 vhca_id,
				     u16 *vport_num)
{
	u32 out[MLX5_ST_SZ_DW(create_esw_vport_out)] = {};
	u32 in[MLX5_ST_SZ_DW(create_esw_vport_in)] = {};
	int err;

	MLX5_SET(create_esw_vport_in, in, managed_vhca_id, vhca_id);
	MLX5_SET(create_esw_vport_in, in, opcode,
		 MLX5_CMD_OPCODE_CREATE_ESW_VPORT);

	err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
	if (err)
		return err;

	*vport_num = MLX5_GET(create_esw_vport_out, out, vport_num);
	return 0;
}

/* Create an E-Switch vport for the delegated function @vhca_id and register
 * it with @esw as an adjacent VF vport.
 *
 * @esw: E-Switch that will manage the new vport.
 * @vhca_id: vhca_id passed to CREATE_ESW_VPORT as managed_vhca_id.
 * @rid_info_reg: pointer to a function_vhca_rid_info_reg layout, from which
 *                parent_pci_device_function and function_id are read.
 *
 * Steps: create the firmware vport, allocate the driver vport at index
 * esw->last_vport_idx, mark it as a VF in the vports xarray, record the
 * adjacent-function info, add the egress/ingress ACL namespaces, add the
 * offloads representor and finally connect the vport.
 *
 * esw->last_vport_idx is advanced only after the driver vport was
 * allocated, and is rolled back if a later step fails, mirroring what
 * mlx5_esw_adj_vport_destroy() does. Otherwise every failed creation would
 * permanently consume one vport index.
 *
 * Return: 0 on success, or the error of the step that failed. Everything
 * done before the failing step is undone.
 */
static int mlx5_esw_adj_vport_create(struct mlx5_eswitch *esw, u16 vhca_id,
				     const void *rid_info_reg)
{
	struct mlx5_vport *vport;
	u16 vport_num;
	int err;

	err = mlx5_esw_create_esw_vport(esw->dev, vhca_id, &vport_num);
	if (err) {
		esw_warn(esw->dev,
			 "Failed to create adjacent vport for vhca_id %d, err %d\n",
			 vhca_id, err);
		return err;
	}

	esw_debug(esw->dev, "Created adjacent vport[%d] %d for vhca_id 0x%x\n",
		  esw->last_vport_idx, vport_num, vhca_id);

	err = mlx5_esw_vport_alloc(esw, esw->last_vport_idx, vport_num);
	if (err)
		goto destroy_esw_vport;
	/* The index is now in use by the freshly allocated vport. */
	esw->last_vport_idx++;

	xa_set_mark(&esw->vports, vport_num, MLX5_ESW_VPT_VF);
	vport = mlx5_eswitch_get_vport(esw, vport_num);
	vport->adjacent = true;
	vport->vhca_id = vhca_id;

	vport->adj_info.parent_pci_devfn =
		MLX5_GET(function_vhca_rid_info_reg, rid_info_reg,
			 parent_pci_device_function);
	vport->adj_info.function_id =
		MLX5_GET(function_vhca_rid_info_reg, rid_info_reg, function_id);

	/* NOTE(review): the return values of the two ACL namespace additions
	 * and of the final connect command are not checked here - confirm
	 * this is intentional.
	 */
	mlx5_fs_vport_egress_acl_ns_add(esw->dev->priv.steering, vport->index);
	mlx5_fs_vport_ingress_acl_ns_add(esw->dev->priv.steering, vport->index);
	err = mlx5_esw_offloads_rep_add(esw, vport);
	if (err)
		goto acl_ns_remove;

	mlx5_esw_adj_vport_modify(esw->dev, vport_num, MLX5_ADJ_VPORT_CONNECT);
	return 0;

acl_ns_remove:
	mlx5_fs_vport_ingress_acl_ns_remove(esw->dev->priv.steering,
					    vport->index);
	mlx5_fs_vport_egress_acl_ns_remove(esw->dev->priv.steering,
					   vport->index);
	mlx5_esw_vport_free(esw, vport);
	/* Give the index back so the next adjacent vport can reuse it. */
	esw->last_vport_idx--;
destroy_esw_vport:
	mlx5_esw_destroy_esw_vport(esw->dev, vport_num);
	return err;
}

static void mlx5_esw_adj_vport_destroy(struct mlx5_eswitch *esw,
				       struct mlx5_vport *vport)
{
	u16 vport_num = vport->vport;

	esw_debug(esw->dev, "Destroying adjacent vport %d for vhca_id 0x%x\n",
		  vport_num, vport->vhca_id);
	mlx5_esw_adj_vport_modify(esw->dev, vport_num,
				  MLX5_ADJ_VPORT_DISCONNECT);
	mlx5_esw_offloads_rep_remove(esw, vport);
	mlx5_fs_vport_egress_acl_ns_remove(esw->dev->priv.steering,
					   vport->index);
	mlx5_fs_vport_ingress_acl_ns_remove(esw->dev->priv.steering,
					    vport->index);
	mlx5_esw_vport_free(esw, vport);
	/* Reset the vport index back so new adj vports can use this index.
	 * When vport count can incrementally change, this needs to be modified.
	 */
	esw->last_vport_idx--;
	mlx5_esw_destroy_esw_vport(esw->dev, vport_num);
}

/* Destroy every vport of @esw that is flagged as adjacent.
 *
 * Does nothing when the delegated_vhca_max general capability is zero.
 */
void mlx5_esw_adjacent_vhcas_cleanup(struct mlx5_eswitch *esw)
{
	struct mlx5_vport *vport;
	unsigned long i;

	if (MLX5_CAP_GEN_2(esw->dev, delegated_vhca_max) == 0)
		return;

	mlx5_esw_for_each_vf_vport(esw, i, vport, U16_MAX) {
		if (vport->adjacent)
			mlx5_esw_adj_vport_destroy(esw, vport);
	}
}

/* Query the delegated vhcas and create an adjacent vport for each of them.
 *
 * Does nothing when the delegated_vhca_max general capability is zero.
 * Otherwise a QUERY_DELEGATED_VHCA command is issued with an output buffer
 * that has room for delegated_vhca_max entries, and
 * mlx5_esw_adj_vport_create() is called for every reported function.
 *
 * The reported functions_count is clamped to delegated_vhca_max, because
 * the output buffer only holds that many entries and a larger count would
 * make the loop read past the end of the allocation.
 *
 * Failures are logged and not returned: an allocation failure returns
 * silently, a failed query is warned about, and the first vport creation
 * failure stops the loop, leaving already created vports in place.
 */
void mlx5_esw_adjacent_vhcas_setup(struct mlx5_eswitch *esw)
{
	u32 delegated_vhca_max = MLX5_CAP_GEN_2(esw->dev, delegated_vhca_max);
	u32 in[MLX5_ST_SZ_DW(query_delegated_vhca_in)] = {};
	int outlen, err;
	u32 count, i;
	u8 *out;

	if (!delegated_vhca_max)
		return;

	outlen = MLX5_ST_SZ_BYTES(query_delegated_vhca_out) +
		 delegated_vhca_max *
		 MLX5_ST_SZ_BYTES(delegated_function_vhca_rid_info);

	esw_debug(esw->dev, "delegated_vhca_max=%d\n", delegated_vhca_max);

	out = kvzalloc(outlen, GFP_KERNEL);
	if (!out)
		return;

	MLX5_SET(query_delegated_vhca_in, in, opcode,
		 MLX5_CMD_OPCODE_QUERY_DELEGATED_VHCA);

	err = mlx5_cmd_exec(esw->dev, in, sizeof(in), out, outlen);
	if (err) {
		kvfree(out);
		esw_warn(esw->dev, "Failed to query delegated vhca, err %d\n",
			 err);
		return;
	}

	count = MLX5_GET(query_delegated_vhca_out, out, functions_count);
	esw_debug(esw->dev, "Delegated vhca functions count %d\n", count);

	/* Never walk beyond the entries the output buffer was sized for. */
	if (count > delegated_vhca_max) {
		esw_warn(esw->dev,
			 "Delegated vhca functions count %u exceeds max %u, truncating\n",
			 count, delegated_vhca_max);
		count = delegated_vhca_max;
	}

	for (i = 0; i < count; i++) {
		const void *rid_info, *rid_info_reg;
		u16 vhca_id;

		rid_info = MLX5_ADDR_OF(query_delegated_vhca_out, out,
					delegated_function_vhca_rid_info[i]);

		rid_info_reg = MLX5_ADDR_OF(delegated_function_vhca_rid_info,
					    rid_info, function_vhca_rid_info);

		vhca_id = MLX5_GET(function_vhca_rid_info_reg, rid_info_reg,
				   vhca_id);
		esw_debug(esw->dev, "Delegating vhca_id 0x%x\n", vhca_id);

		err = mlx5_esw_adj_vport_create(esw, vhca_id, rid_info_reg);
		if (err) {
			esw_warn(esw->dev,
				 "Failed to init adjacent vhca 0x%x, err %d\n",
				 vhca_id, err);
			break;
		}
	}

	kvfree(out);
}
+10 −1
Original line number Diff line number Diff line
@@ -27,6 +27,7 @@ static void mlx5_esw_offloads_pf_vf_devlink_port_attrs_set(struct mlx5_eswitch *
{
	struct mlx5_core_dev *dev = esw->dev;
	struct netdev_phys_item_id ppid = {};
	struct mlx5_vport *vport;
	u32 controller_num = 0;
	bool external;
	u16 pfnum;
@@ -42,10 +43,18 @@ static void mlx5_esw_offloads_pf_vf_devlink_port_attrs_set(struct mlx5_eswitch *
		dl_port->attrs.switch_id.id_len = ppid.id_len;
		devlink_port_attrs_pci_pf_set(dl_port, controller_num, pfnum, external);
	} else if (mlx5_eswitch_is_vf_vport(esw, vport_num)) {
		u16 func_id = vport_num - 1;

		vport = mlx5_eswitch_get_vport(esw, vport_num);
		memcpy(dl_port->attrs.switch_id.id, ppid.id, ppid.id_len);
		dl_port->attrs.switch_id.id_len = ppid.id_len;
		if (vport->adjacent) {
			func_id = vport->adj_info.function_id;
			pfnum = vport->adj_info.parent_pci_devfn;
		}

		devlink_port_attrs_pci_vf_set(dl_port, controller_num, pfnum,
					      vport_num - 1, external);
					      func_id, external);
	}  else if (mlx5_core_is_ec_vf_vport(esw->dev, vport_num)) {
		u16 base_vport = mlx5_core_ec_vf_vport_base(dev);

+119 −12
Original line number Diff line number Diff line
@@ -1217,7 +1217,8 @@ void mlx5_eswitch_unload_vf_vports(struct mlx5_eswitch *esw, u16 num_vfs)
	unsigned long i;

	mlx5_esw_for_each_vf_vport(esw, i, vport, num_vfs) {
		if (!vport->enabled)
		/* Adjacent VFs are unloaded separately */
		if (!vport->enabled || vport->adjacent)
			continue;
		mlx5_eswitch_unload_pf_vf_vport(esw, vport->vport);
	}
@@ -1236,6 +1237,42 @@ static void mlx5_eswitch_unload_ec_vf_vports(struct mlx5_eswitch *esw,
	}
}

/* Unload every VF vport of @esw that is both enabled and adjacent. */
static void mlx5_eswitch_unload_adj_vf_vports(struct mlx5_eswitch *esw)
{
	struct mlx5_vport *vport;
	unsigned long i;

	mlx5_esw_for_each_vf_vport(esw, i, vport, U16_MAX) {
		if (vport->enabled && vport->adjacent)
			mlx5_eswitch_unload_pf_vf_vport(esw, vport->vport);
	}
}

/* Load every adjacent VF vport of @esw with @enabled_events.
 *
 * If loading one of them fails, the adjacent vports that are enabled are
 * unloaded again and the error is returned.
 *
 * Return: 0 on success, otherwise the error from
 * mlx5_eswitch_load_pf_vf_vport().
 */
static int
mlx5_eswitch_load_adj_vf_vports(struct mlx5_eswitch *esw,
				enum mlx5_eswitch_vport_event enabled_events)
{
	struct mlx5_vport *vport;
	unsigned long i;
	int err;

	mlx5_esw_for_each_vf_vport(esw, i, vport, U16_MAX) {
		if (!vport->adjacent)
			continue;

		err = mlx5_eswitch_load_pf_vf_vport(esw, vport->vport,
						    enabled_events);
		if (err) {
			/* Roll back whatever was loaded so far. */
			mlx5_eswitch_unload_adj_vf_vports(esw);
			return err;
		}
	}

	return 0;
}

int mlx5_eswitch_load_vf_vports(struct mlx5_eswitch *esw, u16 num_vfs,
				enum mlx5_eswitch_vport_event enabled_events)
{
@@ -1345,8 +1382,16 @@ mlx5_eswitch_enable_pf_vf_vports(struct mlx5_eswitch *esw,
					  enabled_events);
	if (ret)
		goto vf_err;

	/* Enable adjacent VF vports */
	ret = mlx5_eswitch_load_adj_vf_vports(esw, enabled_events);
	if (ret)
		goto unload_vf_vports;

	return 0;

unload_vf_vports:
	mlx5_eswitch_unload_vf_vports(esw, esw->esw_funcs.num_vfs);
vf_err:
	if (mlx5_core_ec_sriov_enabled(esw->dev))
		mlx5_eswitch_unload_ec_vf_vports(esw, esw->esw_funcs.num_ec_vfs);
@@ -1367,6 +1412,8 @@ mlx5_eswitch_enable_pf_vf_vports(struct mlx5_eswitch *esw,
 */
void mlx5_eswitch_disable_pf_vf_vports(struct mlx5_eswitch *esw)
{
	mlx5_eswitch_unload_adj_vf_vports(esw);

	mlx5_eswitch_unload_vf_vports(esw, esw->esw_funcs.num_vfs);

	if (mlx5_core_ec_sriov_enabled(esw->dev))
@@ -1439,19 +1486,76 @@ static void mlx5_esw_mode_change_notify(struct mlx5_eswitch *esw, u16 mode)
	blocking_notifier_call_chain(&esw->n_head, 0, &info);
}

/* Add an egress ACL namespace for every vport index in
 * [0, mlx5_eswitch_get_total_vports(dev)).
 *
 * Return: 0 on success. On failure the namespaces added so far are removed
 * in reverse order and the error is returned.
 */
static int mlx5_esw_egress_acls_init(struct mlx5_core_dev *dev)
{
	struct mlx5_flow_steering *steering = dev->priv.steering;
	int total_vports = mlx5_eswitch_get_total_vports(dev);
	int vport_idx;
	int err = 0;

	for (vport_idx = 0; vport_idx < total_vports; vport_idx++) {
		err = mlx5_fs_vport_egress_acl_ns_add(steering, vport_idx);
		if (err)
			break;
	}
	if (!err)
		return 0;

	/* vport_idx is the index that failed; undo the ones below it. */
	for (; vport_idx > 0; vport_idx--)
		mlx5_fs_vport_egress_acl_ns_remove(steering, vport_idx - 1);
	return err;
}

/* Remove the egress ACL namespace of every vport index in
 * [0, mlx5_eswitch_get_total_vports(dev)), highest index first.
 */
static void mlx5_esw_egress_acls_cleanup(struct mlx5_core_dev *dev)
{
	struct mlx5_flow_steering *steering = dev->priv.steering;
	int total_vports = mlx5_eswitch_get_total_vports(dev);
	int remaining;

	for (remaining = total_vports; remaining > 0; remaining--)
		mlx5_fs_vport_egress_acl_ns_remove(steering, remaining - 1);
}

/* Add an ingress ACL namespace for every vport index in
 * [0, mlx5_eswitch_get_total_vports(dev)).
 *
 * Return: 0 on success. On failure the namespaces added so far are removed
 * in reverse order and the error is returned.
 */
static int mlx5_esw_ingress_acls_init(struct mlx5_core_dev *dev)
{
	struct mlx5_flow_steering *steering = dev->priv.steering;
	int total_vports = mlx5_eswitch_get_total_vports(dev);
	int vport_idx;
	int err = 0;

	for (vport_idx = 0; vport_idx < total_vports; vport_idx++) {
		err = mlx5_fs_vport_ingress_acl_ns_add(steering, vport_idx);
		if (err)
			break;
	}
	if (!err)
		return 0;

	/* vport_idx is the index that failed; undo the ones below it. */
	for (; vport_idx > 0; vport_idx--)
		mlx5_fs_vport_ingress_acl_ns_remove(steering, vport_idx - 1);
	return err;
}

/* Remove the ingress ACL namespace of every vport index in
 * [0, mlx5_eswitch_get_total_vports(dev)), highest index first.
 */
static void mlx5_esw_ingress_acls_cleanup(struct mlx5_core_dev *dev)
{
	struct mlx5_flow_steering *steering = dev->priv.steering;
	int total_vports = mlx5_eswitch_get_total_vports(dev);
	int remaining;

	for (remaining = total_vports; remaining > 0; remaining--)
		mlx5_fs_vport_ingress_acl_ns_remove(steering, remaining - 1);
}

static int mlx5_esw_acls_ns_init(struct mlx5_eswitch *esw)
{
	struct mlx5_core_dev *dev = esw->dev;
	int total_vports;
	int err;

	if (esw->flags & MLX5_ESWITCH_VPORT_ACL_NS_CREATED)
		return 0;

	total_vports = mlx5_eswitch_get_total_vports(dev);

	if (MLX5_CAP_ESW_EGRESS_ACL(dev, ft_support)) {
		err = mlx5_fs_egress_acls_init(dev, total_vports);
		err = mlx5_esw_egress_acls_init(dev);
		if (err)
			return err;
	} else {
@@ -1459,7 +1563,7 @@ static int mlx5_esw_acls_ns_init(struct mlx5_eswitch *esw)
	}

	if (MLX5_CAP_ESW_INGRESS_ACL(dev, ft_support)) {
		err = mlx5_fs_ingress_acls_init(dev, total_vports);
		err = mlx5_esw_ingress_acls_init(dev);
		if (err)
			goto err;
	} else {
@@ -1470,7 +1574,7 @@ static int mlx5_esw_acls_ns_init(struct mlx5_eswitch *esw)

err:
	if (MLX5_CAP_ESW_EGRESS_ACL(dev, ft_support))
		mlx5_fs_egress_acls_cleanup(dev);
		mlx5_esw_egress_acls_cleanup(dev);
	return err;
}

@@ -1480,9 +1584,9 @@ static void mlx5_esw_acls_ns_cleanup(struct mlx5_eswitch *esw)

	esw->flags &= ~MLX5_ESWITCH_VPORT_ACL_NS_CREATED;
	if (MLX5_CAP_ESW_INGRESS_ACL(dev, ft_support))
		mlx5_fs_ingress_acls_cleanup(dev);
		mlx5_esw_ingress_acls_cleanup(dev);
	if (MLX5_CAP_ESW_EGRESS_ACL(dev, ft_support))
		mlx5_fs_egress_acls_cleanup(dev);
		mlx5_esw_egress_acls_cleanup(dev);
}

/**
@@ -1734,8 +1838,7 @@ int mlx5_esw_sf_max_hpf_functions(struct mlx5_core_dev *dev, u16 *max_sfs, u16 *
	return err;
}

static int mlx5_esw_vport_alloc(struct mlx5_eswitch *esw,
				int index, u16 vport_num)
int mlx5_esw_vport_alloc(struct mlx5_eswitch *esw, int index, u16 vport_num)
{
	struct mlx5_vport *vport;
	int err;
@@ -1762,8 +1865,9 @@ static int mlx5_esw_vport_alloc(struct mlx5_eswitch *esw,
	return err;
}

static void mlx5_esw_vport_free(struct mlx5_eswitch *esw, struct mlx5_vport *vport)
void mlx5_esw_vport_free(struct mlx5_eswitch *esw, struct mlx5_vport *vport)
{
	esw->total_vports--;
	xa_erase(&esw->vports, vport->vport);
	kfree(vport);
}
@@ -1847,6 +1951,9 @@ static int mlx5_esw_vports_init(struct mlx5_eswitch *esw)
	err = mlx5_esw_vport_alloc(esw, idx, MLX5_VPORT_UPLINK);
	if (err)
		goto err;

	/* Adjacent vports or other dynamically created vports will use this */
	esw->last_vport_idx = ++idx;
	return 0;

err:
+17 −0
Original line number Diff line number Diff line
@@ -216,6 +216,12 @@ struct mlx5_vport {
	u32                     metadata;
	int                     vhca_id;

	bool adjacent; /* delegated vhca from adjacent function */
	struct {
		u16 parent_pci_devfn; /* Adjacent parent PCI device function */
		u16 function_id; /* Function ID of the delegated VPort */
	} adj_info;

	struct mlx5_vport_info  info;

	/* Protected with the E-Switch qos domain lock. The Vport QoS can
@@ -384,6 +390,7 @@ struct mlx5_eswitch {

	struct mlx5_esw_bridge_offloads *br_offloads;
	struct mlx5_esw_offload offloads;
	u32 last_vport_idx;
	int                     mode;
	u16                     manager_vport;
	u16                     first_host_vport;
@@ -417,6 +424,8 @@ int mlx5_esw_qos_modify_vport_rate(struct mlx5_eswitch *esw, u16 vport_num, u32
/* E-Switch API */
int mlx5_eswitch_init(struct mlx5_core_dev *dev);
void mlx5_eswitch_cleanup(struct mlx5_eswitch *esw);
int mlx5_esw_vport_alloc(struct mlx5_eswitch *esw, int index, u16 vport_num);
void mlx5_esw_vport_free(struct mlx5_eswitch *esw, struct mlx5_vport *vport);

#define MLX5_ESWITCH_IGNORE_NUM_VFS (-1)
int mlx5_eswitch_enable_locked(struct mlx5_eswitch *esw, int num_vfs);
@@ -622,6 +631,9 @@ bool mlx5_esw_multipath_prereq(struct mlx5_core_dev *dev0,

const u32 *mlx5_esw_query_functions(struct mlx5_core_dev *dev);

void mlx5_esw_adjacent_vhcas_setup(struct mlx5_eswitch *esw);
void mlx5_esw_adjacent_vhcas_cleanup(struct mlx5_eswitch *esw);

#define MLX5_DEBUG_ESWITCH_MASK BIT(3)

#define esw_info(__dev, format, ...)			\
@@ -831,6 +843,11 @@ void mlx5_esw_vport_vhca_id_unmap(struct mlx5_eswitch *esw,
int mlx5_eswitch_vhca_id_to_vport(struct mlx5_eswitch *esw, u16 vhca_id, u16 *vport_num);
bool mlx5_esw_vport_vhca_id(struct mlx5_eswitch *esw, u16 vportn, u16 *vhca_id);

void mlx5_esw_offloads_rep_remove(struct mlx5_eswitch *esw,
				  const struct mlx5_vport *vport);
int mlx5_esw_offloads_rep_add(struct mlx5_eswitch *esw,
			      const struct mlx5_vport *vport);

/**
 * struct mlx5_esw_event_info - Indicates eswitch mode changed/changing.
 *
Loading