Commit 26db4dbb authored by Jakub Kicinski's avatar Jakub Kicinski
Browse files
Tony Nguyen says:

====================
ice: managing MSI-X in driver

Michal Swiatkowski says:

It is another try to allow user to manage amount of MSI-X used for each
feature in ice. First was via devlink resources API, it wasn't accepted
in upstream. Also static MSI-X allocation using devlink resources isn't
really user friendly.

This try is using more dynamic way. "Dynamic" across whole kernel when
platform supports it and "dynamic" across the driver when not.

To achieve that reuse global devlink parameter pf_msix_max and
pf_msix_min. It fits how ice hardware counts MSI-X. In case of ice amount
of MSI-X reported on PCI is a whole MSI-X for the card (with MSI-X for
VFs also). Having pf_msix_max allow user to statically set how many
MSI-X he wants on PF and how many should be reserved for VFs.

pf_msix_min is used to set minimum number of MSI-X with which ice driver
should probe correctly.

Meaning of this field in case of dynamic vs static allocation:
- on system with dynamic MSI-X allocation support
 * alloc pf_msix_min as static, rest will be allocated dynamically
- on system without dynamic MSI-X allocation support
 * try alloc pf_msix_max as static, minimum acceptable result is
 pf_msix_min

As Jesse and Piotr suggested pf_msix_max and pf_msix_min can (an
probably should) be stored in NVM. This patchset isn't implementing
that.

Dynamic (kernel or driver) way means that splitting MSI-X across the
RDMA and eth in case there is a MSI-X shortage isn't correct. Can work
when dynamic is only on driver site, but can't when dynamic is on kernel
site.

Let's remove this code and move to MSI-X allocation feature by feature.
If there is no more MSI-X for a feature, a feature is working with less
MSI-X or it is turned off.

There is a regression here. With MSI-X splitting user can run RDMA and
eth even on system with not enough MSI-X. Now only eth will work. RDMA
can be turned on by changing number of PF queues (lowering) and reprobe
RDMA driver.

Example:
72 CPU number, eth, RDMA and flow director (1 MSI-X), 1 MSI-X for OICR
on PF, and 1 more for RDMA. Card is using 1 + 72 + 1 + 72 + 1 = 147.

We set pf_msix_min = 2, pf_msix_max = 128

OICR: 1
eth: 72
flow director: 1
RDMA: 128 - 74 = 54

We can change number of queues on pf to 36 and do devlink reinit

OICR: 1
eth: 36
RDMA: 73
flow director: 1

We can also (implemented in "ice: enable_rdma devlink param") turned
RDMA off.

OICR: 1
eth: 72
RDMA: 0 (turned off)
flow director: 1

After this changes we have a static base vector for SRIOV (SIOV probably
in the feature). Last patch from this series is simplifying managing VF
MSI-X code based on static vector.

Now changing queues using ethtool is also changing MSI-X. If there is
enough MSI-X it is always one to one. When there is not enough there
will be more queues than MSI-X.

* 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/tnguy/next-queue:
  ice: init flow director before RDMA
  ice: simplify VF MSI-X managing
  ice: enable_rdma devlink param
  ice: treat dyn_allowed only as suggestion
  ice, irdma: move interrupts code to irdma
  ice: get rid of num_lan_msix field
  ice: remove splitting MSI-X between features
  ice: devlink PF MSI-X max and min parameter
  ice: count combined queues using Rx/Tx count
====================

Link: https://patch.msgid.link/20250205185512.895887-1-anthony.l.nguyen@intel.com


Signed-off-by: default avatarJakub Kicinski <kuba@kernel.org>
parents ec730952 d67627e7
Loading
Loading
Loading
Loading
+11 −0
Original line number Diff line number Diff line
@@ -69,6 +69,17 @@ Parameters

       To verify that value has been set:
       $ devlink dev param show pci/0000:16:00.0 name tx_scheduling_layers
   * - ``msix_vec_per_pf_max``
     - driverinit
     - Set the max MSI-X that can be used by the PF, rest can be utilized for
       SRIOV. The range is from min value set in msix_vec_per_pf_min to
       2k/number of ports.
   * - ``msix_vec_per_pf_min``
     - driverinit
     - Set the min MSI-X that will be used by the PF. This value inform how many
       MSI-X will be allocated statically. The range is from 2 to value set
       in msix_vec_per_pf_max.

.. list-table:: Driver specific parameters implemented
    :widths: 5 5 90

+0 −2
Original line number Diff line number Diff line
@@ -498,8 +498,6 @@ static int irdma_save_msix_info(struct irdma_pci_f *rf)
	iw_qvlist->num_vectors = rf->msix_count;
	if (rf->msix_count <= num_online_cpus())
		rf->msix_shared = true;
	else if (rf->msix_count > num_online_cpus() + 1)
		rf->msix_count = num_online_cpus() + 1;

	pmsix = rf->msix_entries;
	for (i = 0, ceq_idx = 0; i < rf->msix_count; i++, iw_qvinfo++) {
+44 −2
Original line number Diff line number Diff line
@@ -206,6 +206,43 @@ static void irdma_lan_unregister_qset(struct irdma_sc_vsi *vsi,
		ibdev_dbg(&iwdev->ibdev, "WS: LAN free_res for rdma qset failed.\n");
}

static int irdma_init_interrupts(struct irdma_pci_f *rf, struct ice_pf *pf)
{
	int i;

	rf->msix_count = num_online_cpus() + IRDMA_NUM_AEQ_MSIX;
	rf->msix_entries = kcalloc(rf->msix_count, sizeof(*rf->msix_entries),
				   GFP_KERNEL);
	if (!rf->msix_entries)
		return -ENOMEM;

	for (i = 0; i < rf->msix_count; i++)
		if (ice_alloc_rdma_qvector(pf, &rf->msix_entries[i]))
			break;

	if (i < IRDMA_MIN_MSIX) {
		for (; i > 0; i--)
			ice_free_rdma_qvector(pf, &rf->msix_entries[i]);

		kfree(rf->msix_entries);
		return -ENOMEM;
	}

	rf->msix_count = i;

	return 0;
}

static void irdma_deinit_interrupts(struct irdma_pci_f *rf, struct ice_pf *pf)
{
	int i;

	for (i = 0; i < rf->msix_count; i++)
		ice_free_rdma_qvector(pf, &rf->msix_entries[i]);

	kfree(rf->msix_entries);
}

static void irdma_remove(struct auxiliary_device *aux_dev)
{
	struct iidc_auxiliary_dev *iidc_adev = container_of(aux_dev,
@@ -216,6 +253,7 @@ static void irdma_remove(struct auxiliary_device *aux_dev)

	irdma_ib_unregister_device(iwdev);
	ice_rdma_update_vsi_filter(pf, iwdev->vsi_num, false);
	irdma_deinit_interrupts(iwdev->rf, pf);

	pr_debug("INIT: Gen2 PF[%d] device remove success\n", PCI_FUNC(pf->pdev->devfn));
}
@@ -230,9 +268,7 @@ static void irdma_fill_device_info(struct irdma_device *iwdev, struct ice_pf *pf
	rf->gen_ops.unregister_qset = irdma_lan_unregister_qset;
	rf->hw.hw_addr = pf->hw.hw_addr;
	rf->pcidev = pf->pdev;
	rf->msix_count =  pf->num_rdma_msix;
	rf->pf_id = pf->hw.pf_id;
	rf->msix_entries = &pf->msix_entries[pf->rdma_base_vector];
	rf->default_vsi.vsi_idx = vsi->vsi_num;
	rf->protocol_used = pf->rdma_mode & IIDC_RDMA_PROTOCOL_ROCEV2 ?
			    IRDMA_ROCE_PROTOCOL_ONLY : IRDMA_IWARP_PROTOCOL_ONLY;
@@ -281,6 +317,10 @@ static int irdma_probe(struct auxiliary_device *aux_dev, const struct auxiliary_
	irdma_fill_device_info(iwdev, pf, vsi);
	rf = iwdev->rf;

	err = irdma_init_interrupts(rf, pf);
	if (err)
		goto err_init_interrupts;

	err = irdma_ctrl_init_hw(rf);
	if (err)
		goto err_ctrl_init;
@@ -311,6 +351,8 @@ static int irdma_probe(struct auxiliary_device *aux_dev, const struct auxiliary_
err_rt_init:
	irdma_ctrl_deinit_hw(rf);
err_ctrl_init:
	irdma_deinit_interrupts(rf, pf);
err_init_interrupts:
	kfree(iwdev->rf);
	ib_dealloc_device(&iwdev->ibdev);

+3 −0
Original line number Diff line number Diff line
@@ -117,6 +117,9 @@ extern struct auxiliary_driver i40iw_auxiliary_drv;

#define IRDMA_IRQ_NAME_STR_LEN (64)

#define IRDMA_NUM_AEQ_MSIX	1
#define IRDMA_MIN_MSIX		2

enum init_completion_state {
	INVALID_STATE = 0,
	INITIAL_STATE,
+102 −0
Original line number Diff line number Diff line
@@ -1205,6 +1205,25 @@ static int ice_devlink_set_parent(struct devlink_rate *devlink_rate,
	return status;
}

static void ice_set_min_max_msix(struct ice_pf *pf)
{
	struct devlink *devlink = priv_to_devlink(pf);
	union devlink_param_value val;
	int err;

	err = devl_param_driverinit_value_get(devlink,
					      DEVLINK_PARAM_GENERIC_ID_MSIX_VEC_PER_PF_MIN,
					      &val);
	if (!err)
		pf->msix.min = val.vu32;

	err = devl_param_driverinit_value_get(devlink,
					      DEVLINK_PARAM_GENERIC_ID_MSIX_VEC_PER_PF_MAX,
					      &val);
	if (!err)
		pf->msix.max = val.vu32;
}

/**
 * ice_devlink_reinit_up - do reinit of the given PF
 * @pf: pointer to the PF struct
@@ -1220,6 +1239,9 @@ static int ice_devlink_reinit_up(struct ice_pf *pf)
		return err;
	}

	/* load MSI-X values */
	ice_set_min_max_msix(pf);

	err = ice_init_dev(pf);
	if (err)
		goto unroll_hw_init;
@@ -1533,6 +1555,43 @@ static int ice_devlink_local_fwd_validate(struct devlink *devlink, u32 id,
	return 0;
}

static int
ice_devlink_msix_max_pf_validate(struct devlink *devlink, u32 id,
				 union devlink_param_value val,
				 struct netlink_ext_ack *extack)
{
	struct ice_pf *pf = devlink_priv(devlink);

	if (val.vu32 > pf->hw.func_caps.common_cap.num_msix_vectors)
		return -EINVAL;

	return 0;
}

static int
ice_devlink_msix_min_pf_validate(struct devlink *devlink, u32 id,
				 union devlink_param_value val,
				 struct netlink_ext_ack *extack)
{
	if (val.vu32 < ICE_MIN_MSIX)
		return -EINVAL;

	return 0;
}

static int ice_devlink_enable_rdma_validate(struct devlink *devlink, u32 id,
					    union devlink_param_value val,
					    struct netlink_ext_ack *extack)
{
	struct ice_pf *pf = devlink_priv(devlink);
	bool new_state = val.vbool;

	if (new_state && !test_bit(ICE_FLAG_RDMA_ENA, pf->flags))
		return -EOPNOTSUPP;

	return 0;
}

enum ice_param_id {
	ICE_DEVLINK_PARAM_ID_BASE = DEVLINK_PARAM_GENERIC_ID_MAX,
	ICE_DEVLINK_PARAM_ID_TX_SCHED_LAYERS,
@@ -1548,6 +1607,17 @@ static const struct devlink_param ice_dvl_rdma_params[] = {
			      ice_devlink_enable_iw_get,
			      ice_devlink_enable_iw_set,
			      ice_devlink_enable_iw_validate),
	DEVLINK_PARAM_GENERIC(ENABLE_RDMA, BIT(DEVLINK_PARAM_CMODE_DRIVERINIT),
			      NULL, NULL, ice_devlink_enable_rdma_validate),
};

static const struct devlink_param ice_dvl_msix_params[] = {
	DEVLINK_PARAM_GENERIC(MSIX_VEC_PER_PF_MAX,
			      BIT(DEVLINK_PARAM_CMODE_DRIVERINIT),
			      NULL, NULL, ice_devlink_msix_max_pf_validate),
	DEVLINK_PARAM_GENERIC(MSIX_VEC_PER_PF_MIN,
			      BIT(DEVLINK_PARAM_CMODE_DRIVERINIT),
			      NULL, NULL, ice_devlink_msix_min_pf_validate),
};

static const struct devlink_param ice_dvl_sched_params[] = {
@@ -1651,6 +1721,7 @@ void ice_devlink_unregister(struct ice_pf *pf)
int ice_devlink_register_params(struct ice_pf *pf)
{
	struct devlink *devlink = priv_to_devlink(pf);
	union devlink_param_value value;
	struct ice_hw *hw = &pf->hw;
	int status;

@@ -1659,10 +1730,39 @@ int ice_devlink_register_params(struct ice_pf *pf)
	if (status)
		return status;

	status = devl_params_register(devlink, ice_dvl_msix_params,
				      ARRAY_SIZE(ice_dvl_msix_params));
	if (status)
		goto unregister_rdma_params;

	if (hw->func_caps.common_cap.tx_sched_topo_comp_mode_en)
		status = devl_params_register(devlink, ice_dvl_sched_params,
					      ARRAY_SIZE(ice_dvl_sched_params));
	if (status)
		goto unregister_msix_params;

	value.vu32 = pf->msix.max;
	devl_param_driverinit_value_set(devlink,
					DEVLINK_PARAM_GENERIC_ID_MSIX_VEC_PER_PF_MAX,
					value);
	value.vu32 = pf->msix.min;
	devl_param_driverinit_value_set(devlink,
					DEVLINK_PARAM_GENERIC_ID_MSIX_VEC_PER_PF_MIN,
					value);

	value.vbool = test_bit(ICE_FLAG_RDMA_ENA, pf->flags);
	devl_param_driverinit_value_set(devlink,
					DEVLINK_PARAM_GENERIC_ID_ENABLE_RDMA,
					value);

	return 0;

unregister_msix_params:
	devl_params_unregister(devlink, ice_dvl_msix_params,
			       ARRAY_SIZE(ice_dvl_msix_params));
unregister_rdma_params:
	devl_params_unregister(devlink, ice_dvl_rdma_params,
			       ARRAY_SIZE(ice_dvl_rdma_params));
	return status;
}

@@ -1673,6 +1773,8 @@ void ice_devlink_unregister_params(struct ice_pf *pf)

	devl_params_unregister(devlink, ice_dvl_rdma_params,
			       ARRAY_SIZE(ice_dvl_rdma_params));
	devl_params_unregister(devlink, ice_dvl_msix_params,
			       ARRAY_SIZE(ice_dvl_msix_params));

	if (hw->func_caps.common_cap.tx_sched_topo_comp_mode_en)
		devl_params_unregister(devlink, ice_dvl_sched_params,
Loading