Commit 408da3a0 authored by Jakub Kicinski
Browse files
Tony Nguyen says:

====================
Intel Wired LAN Driver Updates 2025-05-30 (ice, idpf)

For ice:
Michal resolves XDP issues related to Tx scheduler configuration with
a large number of Tx queues.

Additional information:
https://lore.kernel.org/intel-wired-lan/20250513105529.241745-1-michal.kubiak@intel.com/

For idpf:
Brian Vazquez updates netif_subqueue_maybe_stop() condition check to
prevent possible races.

Emil shuts down the virtchannel mailbox during reset to reduce timeout
delays, as the mailbox is unavailable during that time.

* '100GbE' of git://git.kernel.org/pub/scm/linux/kernel/git/tnguy/net-queue:
  idpf: avoid mailbox timeout delays during reset
  idpf: fix a race in txq wakeup
  ice: fix rebuilding the Tx scheduler tree for large queue counts
  ice: create new Tx scheduler nodes for new queues only
  ice: fix Tx scheduler error handling in XDP callback
====================

Link: https://patch.msgid.link/20250530211221.2170484-1-anthony.l.nguyen@intel.com


Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parents 3382a1ed 9dc63d8f
Loading
Loading
Loading
Loading
+33 −14
Original line number Diff line number Diff line
@@ -2740,6 +2740,27 @@ void ice_map_xdp_rings(struct ice_vsi *vsi)
	}
}

/**
 * ice_unmap_xdp_rings - Unmap XDP rings from interrupt vectors
 * @vsi: the VSI with XDP rings being unmapped
 *
 * For each queue vector of @vsi, iterate over the vector's Tx rings and stop
 * at the first ring that has no Tx buffers or is not an XDP ring. Whatever
 * the iteration stopped on becomes the vector's Tx ring pointer again.
 */
static void ice_unmap_xdp_rings(struct ice_vsi *vsi)
{
	int i;

	ice_for_each_q_vector(vsi, i) {
		struct ice_q_vector *qv = vsi->q_vectors[i];
		struct ice_tx_ring *pos;

		ice_for_each_tx_ring(pos, qv->tx) {
			/* a ring without Tx buffers ends the search */
			if (!pos->tx_buf)
				break;
			/* so does the first ring that is not an XDP ring */
			if (!ice_ring_is_xdp(pos))
				break;
		}

		/* restore the value of last node prior to XDP setup */
		qv->tx.tx_ring = pos;
	}
}

/**
 * ice_prepare_xdp_rings - Allocate, configure and setup Tx rings for XDP
 * @vsi: VSI to bring up Tx rings used by XDP
@@ -2803,7 +2824,7 @@ int ice_prepare_xdp_rings(struct ice_vsi *vsi, struct bpf_prog *prog,
	if (status) {
		dev_err(dev, "Failed VSI LAN queue config for XDP, error: %d\n",
			status);
		goto clear_xdp_rings;
		goto unmap_xdp_rings;
	}

	/* assign the prog only when it's not already present on VSI;
@@ -2819,6 +2840,8 @@ int ice_prepare_xdp_rings(struct ice_vsi *vsi, struct bpf_prog *prog,
		ice_vsi_assign_bpf_prog(vsi, prog);

	return 0;
unmap_xdp_rings:
	ice_unmap_xdp_rings(vsi);
clear_xdp_rings:
	ice_for_each_xdp_txq(vsi, i)
		if (vsi->xdp_rings[i]) {
@@ -2835,6 +2858,8 @@ int ice_prepare_xdp_rings(struct ice_vsi *vsi, struct bpf_prog *prog,
	mutex_unlock(&pf->avail_q_mutex);

	devm_kfree(dev, vsi->xdp_rings);
	vsi->xdp_rings = NULL;

	return -ENOMEM;
}

@@ -2850,7 +2875,7 @@ int ice_destroy_xdp_rings(struct ice_vsi *vsi, enum ice_xdp_cfg cfg_type)
{
	u16 max_txqs[ICE_MAX_TRAFFIC_CLASS] = { 0 };
	struct ice_pf *pf = vsi->back;
	int i, v_idx;
	int i;

	/* q_vectors are freed in reset path so there's no point in detaching
	 * rings
@@ -2858,17 +2883,7 @@ int ice_destroy_xdp_rings(struct ice_vsi *vsi, enum ice_xdp_cfg cfg_type)
	if (cfg_type == ICE_XDP_CFG_PART)
		goto free_qmap;

	ice_for_each_q_vector(vsi, v_idx) {
		struct ice_q_vector *q_vector = vsi->q_vectors[v_idx];
		struct ice_tx_ring *ring;

		ice_for_each_tx_ring(ring, q_vector->tx)
			if (!ring->tx_buf || !ice_ring_is_xdp(ring))
				break;

		/* restore the value of last node prior to XDP setup */
		q_vector->tx.tx_ring = ring;
	}
	ice_unmap_xdp_rings(vsi);

free_qmap:
	mutex_lock(&pf->avail_q_mutex);
@@ -3013,11 +3028,14 @@ ice_xdp_setup_prog(struct ice_vsi *vsi, struct bpf_prog *prog,
		xdp_ring_err = ice_vsi_determine_xdp_res(vsi);
		if (xdp_ring_err) {
			NL_SET_ERR_MSG_MOD(extack, "Not enough Tx resources for XDP");
			goto resume_if;
		} else {
			xdp_ring_err = ice_prepare_xdp_rings(vsi, prog,
							     ICE_XDP_CFG_FULL);
			if (xdp_ring_err)
			if (xdp_ring_err) {
				NL_SET_ERR_MSG_MOD(extack, "Setting up XDP Tx resources failed");
				goto resume_if;
			}
		}
		xdp_features_set_redirect_target(vsi->netdev, true);
		/* reallocate Rx queues that are used for zero-copy */
@@ -3035,6 +3053,7 @@ ice_xdp_setup_prog(struct ice_vsi *vsi, struct bpf_prog *prog,
			NL_SET_ERR_MSG_MOD(extack, "Freeing XDP Rx resources failed");
	}

resume_if:
	if (if_running)
		ret = ice_up(vsi);

+148 −33
Original line number Diff line number Diff line
@@ -84,6 +84,27 @@ ice_sched_find_node_by_teid(struct ice_sched_node *start_node, u32 teid)
	return NULL;
}

/**
 * ice_sched_find_next_vsi_node - find the next node for a given VSI
 * @vsi_node: VSI support node to start search with
 *
 * Follow the sibling links starting after @vsi_node and stop at the first
 * sibling that carries the same VSI handle as @vsi_node.
 *
 * Return: Next VSI support node with a matching VSI handle, or NULL if
 * there is no such node.
 */
static struct ice_sched_node *
ice_sched_find_next_vsi_node(struct ice_sched_node *vsi_node)
{
	unsigned int handle = vsi_node->vsi_handle;
	struct ice_sched_node *node;

	for (node = vsi_node->sibling; node; node = node->sibling) {
		if (node->vsi_handle == handle)
			return node;
	}

	return NULL;
}

/**
 * ice_aqc_send_sched_elem_cmd - send scheduling elements cmd
 * @hw: pointer to the HW struct
@@ -1084,8 +1105,10 @@ ice_sched_add_nodes_to_layer(struct ice_port_info *pi,
		if (parent->num_children < max_child_nodes) {
			new_num_nodes = max_child_nodes - parent->num_children;
		} else {
			/* This parent is full, try the next sibling */
			parent = parent->sibling;
			/* This parent is full,
			 * try the next available sibling.
			 */
			parent = ice_sched_find_next_vsi_node(parent);
			/* Don't modify the first node TEID memory if the
			 * first node was added already in the above call.
			 * Instead send some temp memory for all other
@@ -1528,12 +1551,23 @@ ice_sched_get_free_qparent(struct ice_port_info *pi, u16 vsi_handle, u8 tc,
	/* get the first queue group node from VSI sub-tree */
	qgrp_node = ice_sched_get_first_node(pi, vsi_node, qgrp_layer);
	while (qgrp_node) {
		struct ice_sched_node *next_vsi_node;

		/* make sure the qgroup node is part of the VSI subtree */
		if (ice_sched_find_node_in_subtree(pi->hw, vsi_node, qgrp_node))
			if (qgrp_node->num_children < max_children &&
			    qgrp_node->owner == owner)
				break;
		qgrp_node = qgrp_node->sibling;
		if (qgrp_node)
			continue;

		next_vsi_node = ice_sched_find_next_vsi_node(vsi_node);
		if (!next_vsi_node)
			break;

		vsi_node = next_vsi_node;
		qgrp_node = ice_sched_get_first_node(pi, vsi_node, qgrp_layer);
	}

	/* Select the best queue group */
@@ -1604,16 +1638,16 @@ ice_sched_get_agg_node(struct ice_port_info *pi, struct ice_sched_node *tc_node,
/**
 * ice_sched_calc_vsi_child_nodes - calculate number of VSI child nodes
 * @hw: pointer to the HW struct
 * @num_qs: number of queues
 * @num_new_qs: number of new queues that will be added to the tree
 * @num_nodes: num nodes array
 *
 * This function calculates the number of VSI child nodes based on the
 * number of queues.
 */
static void
ice_sched_calc_vsi_child_nodes(struct ice_hw *hw, u16 num_qs, u16 *num_nodes)
ice_sched_calc_vsi_child_nodes(struct ice_hw *hw, u16 num_new_qs, u16 *num_nodes)
{
	u16 num = num_qs;
	u16 num = num_new_qs;
	u8 i, qgl, vsil;

	qgl = ice_sched_get_qgrp_layer(hw);
@@ -1779,7 +1813,11 @@ ice_sched_add_vsi_support_nodes(struct ice_port_info *pi, u16 vsi_handle,
		if (!parent)
			return -EIO;

		if (i == vsil)
		/* Do not modify the VSI handle for already existing VSI nodes,
		 * (if no new VSI node was added to the tree).
		 * Assign the VSI handle only to newly added VSI nodes.
		 */
		if (i == vsil && num_added)
			parent->vsi_handle = vsi_handle;
	}

@@ -1812,6 +1850,41 @@ ice_sched_add_vsi_to_topo(struct ice_port_info *pi, u16 vsi_handle, u8 tc)
					       num_nodes);
}

/**
 * ice_sched_recalc_vsi_support_nodes - recalculate VSI support nodes count
 * @hw: pointer to the HW struct
 * @vsi_node: pointer to the leftmost VSI node that needs to be extended
 * @new_numqs: new number of queues that has to be handled by the VSI
 * @new_num_nodes: pointer to nodes count table to modify the VSI layer entry
 *
 * Work out how many queues a single VSI node can hold (the product of
 * hw->max_children[] from the VSI layer through the queue group layer) and
 * how many VSI nodes the VSI already owns. If @new_numqs exceeds the combined
 * capacity, store the number of missing VSI nodes in the VSI layer entry of
 * @new_num_nodes; otherwise leave @new_num_nodes untouched.
 */
static void
ice_sched_recalc_vsi_support_nodes(struct ice_hw *hw,
				   struct ice_sched_node *vsi_node,
				   unsigned int new_numqs, u16 *new_num_nodes)
{
	u32 qgl = ice_sched_get_qgrp_layer(hw);
	u32 vsil = ice_sched_get_vsi_layer(hw);
	u32 qs_per_vsi_node = 1;
	u32 cur_vsi_nodes = 1;
	u32 layer;

	/* queue capacity of the subtree below one VSI node */
	for (layer = vsil; layer <= qgl; layer++)
		qs_per_vsi_node *= hw->max_children[layer];

	/* count the VSI nodes already assigned to this VSI */
	for (vsi_node = ice_sched_find_next_vsi_node(vsi_node); vsi_node;
	     vsi_node = ice_sched_find_next_vsi_node(vsi_node))
		cur_vsi_nodes++;

	/* existing VSI nodes can already hold the requested queues */
	if (new_numqs <= qs_per_vsi_node * cur_vsi_nodes)
		return;

	new_num_nodes[vsil] = DIV_ROUND_UP(new_numqs, qs_per_vsi_node) -
			      cur_vsi_nodes;
}

/**
 * ice_sched_update_vsi_child_nodes - update VSI child nodes
 * @pi: port information structure
@@ -1863,15 +1936,25 @@ ice_sched_update_vsi_child_nodes(struct ice_port_info *pi, u16 vsi_handle,
			return status;
	}

	if (new_numqs)
		ice_sched_calc_vsi_child_nodes(hw, new_numqs, new_num_nodes);
	/* Keep the max number of queue configuration all the time. Update the
	 * tree only if number of queues > previous number of queues. This may
	ice_sched_recalc_vsi_support_nodes(hw, vsi_node,
					   new_numqs, new_num_nodes);
	ice_sched_calc_vsi_child_nodes(hw, new_numqs - prev_numqs,
				       new_num_nodes);

	/* Never decrease the number of queues in the tree. Update the tree
	 * only if number of queues > previous number of queues. This may
	 * leave some extra nodes in the tree if number of queues < previous
	 * number but that wouldn't harm anything. Removing those extra nodes
	 * may complicate the code if those nodes are part of SRL or
	 * individually rate limited.
	 * Also, add the required VSI support nodes if the existing ones cannot
	 * handle the requested new number of queues.
	 */
	status = ice_sched_add_vsi_support_nodes(pi, vsi_handle, tc_node,
						 new_num_nodes);
	if (status)
		return status;

	status = ice_sched_add_vsi_child_nodes(pi, vsi_handle, tc_node,
					       new_num_nodes, owner);
	if (status)
@@ -2012,6 +2095,58 @@ static bool ice_sched_is_leaf_node_present(struct ice_sched_node *node)
	return (node->info.data.elem_type == ICE_AQC_ELEM_TYPE_LEAF);
}

/**
 * ice_sched_rm_vsi_subtree - remove all nodes assigned to a given VSI
 * @pi: port information structure
 * @vsi_node: pointer to the leftmost node of the VSI to be removed
 * @owner: LAN or RDMA
 * @tc: TC number
 *
 * Return: Zero in case of success, or -EBUSY if the VSI has leaf nodes in TC.
 *
 * This function removes all the VSI support nodes associated with a given VSI
 * and its LAN or RDMA children nodes from the scheduler tree.
 *
 * Every VSI node of the VSI is visited in turn. For each one, the children
 * belonging to @owner are freed; the VSI node itself is freed only when no
 * children (of any owner) remain. The aggregator related VSI info is cleaned
 * up only if every visited VSI node ended up being freed.
 */
static int
ice_sched_rm_vsi_subtree(struct ice_port_info *pi,
			 struct ice_sched_node *vsi_node, u8 owner, u8 tc)
{
	u16 vsi_handle = vsi_node->vsi_handle;
	bool all_vsi_nodes_removed = true;

	while (vsi_node) {
		struct ice_sched_node *next_vsi_node;
		/* The child index must restart at zero for every VSI node.
		 * Carrying it over from the previous VSI node would skip the
		 * first children of this node whenever the previous node kept
		 * children of a different owner.
		 */
		int j = 0;

		if (ice_sched_is_leaf_node_present(vsi_node)) {
			ice_debug(pi->hw, ICE_DBG_SCHED, "VSI has leaf nodes in TC %d\n", tc);
			return -EBUSY;
		}

		/* Do not advance the index after freeing a child: num_children
		 * changes on removal, so the same index is examined again.
		 */
		while (j < vsi_node->num_children) {
			if (vsi_node->children[j]->owner == owner)
				ice_free_sched_node(pi, vsi_node->children[j]);
			else
				j++;
		}

		/* look up the successor before this node is possibly freed */
		next_vsi_node = ice_sched_find_next_vsi_node(vsi_node);

		/* remove the VSI if it has no children */
		if (!vsi_node->num_children)
			ice_free_sched_node(pi, vsi_node);
		else
			all_vsi_nodes_removed = false;

		vsi_node = next_vsi_node;
	}

	/* clean up aggregator related VSI info if any */
	if (all_vsi_nodes_removed)
		ice_sched_rm_agg_vsi_info(pi, vsi_handle);

	return 0;
}

/**
 * ice_sched_rm_vsi_cfg - remove the VSI and its children nodes
 * @pi: port information structure
@@ -2038,7 +2173,6 @@ ice_sched_rm_vsi_cfg(struct ice_port_info *pi, u16 vsi_handle, u8 owner)

	ice_for_each_traffic_class(i) {
		struct ice_sched_node *vsi_node, *tc_node;
		u8 j = 0;

		tc_node = ice_sched_get_tc_node(pi, i);
		if (!tc_node)
@@ -2048,31 +2182,12 @@ ice_sched_rm_vsi_cfg(struct ice_port_info *pi, u16 vsi_handle, u8 owner)
		if (!vsi_node)
			continue;

		if (ice_sched_is_leaf_node_present(vsi_node)) {
			ice_debug(pi->hw, ICE_DBG_SCHED, "VSI has leaf nodes in TC %d\n", i);
			status = -EBUSY;
		status = ice_sched_rm_vsi_subtree(pi, vsi_node, owner, i);
		if (status)
			goto exit_sched_rm_vsi_cfg;
		}
		while (j < vsi_node->num_children) {
			if (vsi_node->children[j]->owner == owner) {
				ice_free_sched_node(pi, vsi_node->children[j]);

				/* reset the counter again since the num
				 * children will be updated after node removal
				 */
				j = 0;
			} else {
				j++;
			}
		}
		/* remove the VSI if it has no children */
		if (!vsi_node->num_children) {
			ice_free_sched_node(pi, vsi_node);
		vsi_ctx->sched.vsi_node[i] = NULL;

			/* clean up aggregator related VSI info if any */
			ice_sched_rm_agg_vsi_info(pi, vsi_handle);
		}
		if (owner == ICE_SCHED_NODE_OWNER_LAN)
			vsi_ctx->sched.max_lanq[i] = 0;
		else
+13 −5
Original line number Diff line number Diff line
@@ -1801,12 +1801,20 @@ void idpf_vc_event_task(struct work_struct *work)
	if (test_bit(IDPF_REMOVE_IN_PROG, adapter->flags))
		return;

	if (test_bit(IDPF_HR_FUNC_RESET, adapter->flags) ||
	    test_bit(IDPF_HR_DRV_LOAD, adapter->flags)) {
	if (test_bit(IDPF_HR_FUNC_RESET, adapter->flags))
		goto func_reset;

	if (test_bit(IDPF_HR_DRV_LOAD, adapter->flags))
		goto drv_load;

	return;

func_reset:
	idpf_vc_xn_shutdown(adapter->vcxn_mngr);
drv_load:
	set_bit(IDPF_HR_RESET_IN_PROG, adapter->flags);
	idpf_init_hard_reset(adapter);
}
}

/**
 * idpf_initiate_soft_reset - Initiate a software reset
+5 −4
Original line number Diff line number Diff line
@@ -362,17 +362,18 @@ netdev_tx_t idpf_tx_singleq_frame(struct sk_buff *skb,
{
	struct idpf_tx_offload_params offload = { };
	struct idpf_tx_buf *first;
	int csum, tso, needed;
	unsigned int count;
	__be16 protocol;
	int csum, tso;

	count = idpf_tx_desc_count_required(tx_q, skb);
	if (unlikely(!count))
		return idpf_tx_drop_skb(tx_q, skb);

	if (idpf_tx_maybe_stop_common(tx_q,
				      count + IDPF_TX_DESCS_PER_CACHE_LINE +
				      IDPF_TX_DESCS_FOR_CTX)) {
	needed = count + IDPF_TX_DESCS_PER_CACHE_LINE + IDPF_TX_DESCS_FOR_CTX;
	if (!netif_subqueue_maybe_stop(tx_q->netdev, tx_q->idx,
				       IDPF_DESC_UNUSED(tx_q),
				       needed, needed)) {
		idpf_tx_buf_hw_update(tx_q, tx_q->next_to_use, false);

		u64_stats_update_begin(&tx_q->stats_sync);
+17 −28
Original line number Diff line number Diff line
@@ -2184,6 +2184,19 @@ void idpf_tx_splitq_build_flow_desc(union idpf_tx_flex_desc *desc,
	desc->flow.qw1.compl_tag = cpu_to_le16(params->compl_tag);
}

/**
 * idpf_txq_has_room - check whether a Tx queue can accept more descriptors
 * @tx_q: the queue to be checked
 * @size: number of descriptors the caller wants to use
 *
 * Global conditions to tell whether the txq (and related resources)
 * has room to allow the use of "size" descriptors.
 *
 * Return: 1 if the queue has room, 0 if there are fewer than @size unused
 * descriptors, too many pending completions, or the buffer reserve is low.
 */
static int idpf_txq_has_room(struct idpf_tx_queue *tx_q, u32 size)
{
	/* not enough free descriptors on the ring */
	if (IDPF_DESC_UNUSED(tx_q) < size)
		return 0;

	/* pending completions exceed the completion queue threshold */
	if (IDPF_TX_COMPLQ_PENDING(tx_q->txq_grp) >
	    IDPF_TX_COMPLQ_OVERFLOW_THRESH(tx_q->txq_grp->complq))
		return 0;

	/* buffer reserve is running low */
	if (IDPF_TX_BUF_RSV_LOW(tx_q))
		return 0;

	return 1;
}

/**
 * idpf_tx_maybe_stop_splitq - 1st level check for Tx splitq stop conditions
 * @tx_q: the queue to be checked
@@ -2194,29 +2207,11 @@ void idpf_tx_splitq_build_flow_desc(union idpf_tx_flex_desc *desc,
static int idpf_tx_maybe_stop_splitq(struct idpf_tx_queue *tx_q,
				     unsigned int descs_needed)
{
	if (idpf_tx_maybe_stop_common(tx_q, descs_needed))
		goto out;

	/* If there are too many outstanding completions expected on the
	 * completion queue, stop the TX queue to give the device some time to
	 * catch up
	 */
	if (unlikely(IDPF_TX_COMPLQ_PENDING(tx_q->txq_grp) >
		     IDPF_TX_COMPLQ_OVERFLOW_THRESH(tx_q->txq_grp->complq)))
		goto splitq_stop;

	/* Also check for available book keeping buffers; if we are low, stop
	 * the queue to wait for more completions
	 */
	if (unlikely(IDPF_TX_BUF_RSV_LOW(tx_q)))
		goto splitq_stop;

	if (netif_subqueue_maybe_stop(tx_q->netdev, tx_q->idx,
				      idpf_txq_has_room(tx_q, descs_needed),
				      1, 1))
		return 0;

splitq_stop:
	netif_stop_subqueue(tx_q->netdev, tx_q->idx);

out:
	u64_stats_update_begin(&tx_q->stats_sync);
	u64_stats_inc(&tx_q->q_stats.q_busy);
	u64_stats_update_end(&tx_q->stats_sync);
@@ -2242,12 +2237,6 @@ void idpf_tx_buf_hw_update(struct idpf_tx_queue *tx_q, u32 val,
	nq = netdev_get_tx_queue(tx_q->netdev, tx_q->idx);
	tx_q->next_to_use = val;

	if (idpf_tx_maybe_stop_common(tx_q, IDPF_TX_DESC_NEEDED)) {
		u64_stats_update_begin(&tx_q->stats_sync);
		u64_stats_inc(&tx_q->q_stats.q_busy);
		u64_stats_update_end(&tx_q->stats_sync);
	}

	/* Force memory writes to complete before letting h/w
	 * know there are new descriptors to fetch.  (Only
	 * applicable for weak-ordered memory model archs,
Loading