Commit 7da375e2 authored by Jakub Kicinski
Browse files

Merge branch 'net-mlx5e-shampo-enable-hw-gro-once-more'

Tariq Toukan says:

====================
net/mlx5e: SHAMPO, Enable HW GRO once more

This series enables hardware GRO for ConnectX-7 and newer NICs.
SHAMPO stands for Split Header And Merge Payload Offload.

The first part of the series contains important fixes and improvements.

The second part reworks the HW GRO counters.

Lastly, HW GRO is performance-optimized and enabled.

Here are the bandwidth numbers for a simple iperf3 test over a single rq
where the application and irq are pinned to the same CPU:

+---------+--------+--------+-----------+-------------+
| streams | SW GRO | HW GRO | Unit      | Improvement |
+---------+--------+--------+-----------+-------------+
| 1       | 36     | 57     | Gbits/sec |    1.6 x    |
| 4       | 34     | 50     | Gbits/sec |    1.5 x    |
| 8       | 31     | 43     | Gbits/sec |    1.4 x    |
+---------+--------+--------+-----------+-------------+

Benchmark details:
VM based setup
CPU: Intel(R) Xeon(R) Platinum 8380 CPU, 24 cores
NIC: ConnectX-7 100GbE
iperf3 and irq running on same CPU over a single receive queue
====================

Link: https://lore.kernel.org/r/20240603212219.1037656-1-tariqt@nvidia.com


Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parents ed20142e 14ae2fd1
Loading
Loading
Loading
Loading
+15 −9
Original line number Diff line number Diff line
@@ -189,22 +189,19 @@ the software port.

   * - `rx[i]_gro_packets`
     - Number of received packets processed using hardware-accelerated GRO. The
       number of hardware GRO offloaded packets received on ring i.
       number of hardware GRO offloaded packets received on ring i. Only true GRO
       packets are counted: only packets that are in an SKB with a GRO count > 1.
     - Acceleration

   * - `rx[i]_gro_bytes`
     - Number of received bytes processed using hardware-accelerated GRO. The
       number of hardware GRO offloaded bytes received on ring i.
       number of hardware GRO offloaded bytes received on ring i. Only true GRO
       packets are counted: only packets that are in an SKB with a GRO count > 1.
     - Acceleration

   * - `rx[i]_gro_skbs`
     - The number of receive SKBs constructed while performing
       hardware-accelerated GRO.
     - Informative

   * - `rx[i]_gro_match_packets`
     - Number of received packets processed using hardware-accelerated GRO that
       met the flow table match criteria.
     - The number of GRO SKBs constructed from hardware-accelerated GRO. Only SKBs
       with a GRO count > 1 are counted.
     - Informative

   * - `rx[i]_gro_large_hds`
@@ -212,6 +209,15 @@ the software port.
       headers that require additional memory to be allocated.
     - Informative

   * - `rx[i]_hds_nodata_packets`
     - Number of header only packets in header/data split mode [#accel]_.
     - Informative

   * - `rx[i]_hds_nodata_bytes`
     - Number of bytes for header only packets in header/data split mode
       [#accel]_.
     - Informative

   * - `rx[i]_lro_packets`
     - The number of LRO packets received on ring i [#accel]_.
     - Acceleration
+2 −20
Original line number Diff line number Diff line
@@ -80,6 +80,7 @@ struct page_pool;
				 SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))

#define MLX5E_RX_MAX_HEAD (256)
#define MLX5E_SHAMPO_LOG_HEADER_ENTRY_SIZE (8)
#define MLX5E_SHAMPO_LOG_MAX_HEADER_ENTRY_SIZE (9)
#define MLX5E_SHAMPO_WQ_HEADER_PER_PAGE (PAGE_SIZE >> MLX5E_SHAMPO_LOG_MAX_HEADER_ENTRY_SIZE)
#define MLX5E_SHAMPO_WQ_BASE_HEAD_ENTRY_SIZE (64)
@@ -146,25 +147,6 @@ struct page_pool;
#define MLX5E_TX_XSK_POLL_BUDGET       64
#define MLX5E_SQ_RECOVER_MIN_INTERVAL  500 /* msecs */

#define MLX5E_KLM_UMR_WQE_SZ(sgl_len)\
	(sizeof(struct mlx5e_umr_wqe) +\
	(sizeof(struct mlx5_klm) * (sgl_len)))

#define MLX5E_KLM_UMR_WQEBBS(klm_entries) \
	(DIV_ROUND_UP(MLX5E_KLM_UMR_WQE_SZ(klm_entries), MLX5_SEND_WQE_BB))

#define MLX5E_KLM_UMR_DS_CNT(klm_entries)\
	(DIV_ROUND_UP(MLX5E_KLM_UMR_WQE_SZ(klm_entries), MLX5_SEND_WQE_DS))

#define MLX5E_KLM_MAX_ENTRIES_PER_WQE(wqe_size)\
	(((wqe_size) - sizeof(struct mlx5e_umr_wqe)) / sizeof(struct mlx5_klm))

#define MLX5E_KLM_ENTRIES_PER_WQE(wqe_size)\
	ALIGN_DOWN(MLX5E_KLM_MAX_ENTRIES_PER_WQE(wqe_size), MLX5_UMR_KLM_NUM_ENTRIES_ALIGNMENT)

#define MLX5E_MAX_KLM_PER_WQE(mdev) \
	MLX5E_KLM_ENTRIES_PER_WQE(MLX5_SEND_WQE_BB * mlx5e_get_max_sq_aligned_wqebbs(mdev))

#define mlx5e_state_dereference(priv, p) \
	rcu_dereference_protected((p), lockdep_is_held(&(priv)->state_lock))

@@ -1014,7 +996,7 @@ void mlx5e_build_ptys2ethtool_map(void);
bool mlx5e_check_fragmented_striding_rq_cap(struct mlx5_core_dev *mdev, u8 page_shift,
					    enum mlx5e_mpwrq_umr_mode umr_mode);

void mlx5e_shampo_dealloc_hd(struct mlx5e_rq *rq, u16 len, u16 start, bool close);
void mlx5e_shampo_dealloc_hd(struct mlx5e_rq *rq);
void mlx5e_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats);
void mlx5e_fold_sw_stats64(struct mlx5e_priv *priv, struct rtnl_link_stats64 *s);

+6 −6
Original line number Diff line number Diff line
@@ -1071,18 +1071,18 @@ static u32 mlx5e_shampo_icosq_sz(struct mlx5_core_dev *mdev,
				 struct mlx5e_params *params,
				 struct mlx5e_rq_param *rq_param)
{
	int max_num_of_umr_per_wqe, max_hd_per_wqe, max_klm_per_umr, rest;
	int max_num_of_umr_per_wqe, max_hd_per_wqe, max_ksm_per_umr, rest;
	void *wqc = MLX5_ADDR_OF(rqc, rq_param->rqc, wq);
	int wq_size = BIT(MLX5_GET(wq, wqc, log_wq_sz));
	u32 wqebbs;

	max_klm_per_umr = MLX5E_MAX_KLM_PER_WQE(mdev);
	max_ksm_per_umr = MLX5E_MAX_KSM_PER_WQE(mdev);
	max_hd_per_wqe = mlx5e_shampo_hd_per_wqe(mdev, params, rq_param);
	max_num_of_umr_per_wqe = max_hd_per_wqe / max_klm_per_umr;
	rest = max_hd_per_wqe % max_klm_per_umr;
	wqebbs = MLX5E_KLM_UMR_WQEBBS(max_klm_per_umr) * max_num_of_umr_per_wqe;
	max_num_of_umr_per_wqe = max_hd_per_wqe / max_ksm_per_umr;
	rest = max_hd_per_wqe % max_ksm_per_umr;
	wqebbs = MLX5E_KSM_UMR_WQEBBS(max_ksm_per_umr) * max_num_of_umr_per_wqe;
	if (rest)
		wqebbs += MLX5E_KLM_UMR_WQEBBS(rest);
		wqebbs += MLX5E_KSM_UMR_WQEBBS(rest);
	wqebbs *= wq_size;
	return wqebbs;
}
+19 −0
Original line number Diff line number Diff line
@@ -34,6 +34,25 @@

#define MLX5E_RX_ERR_CQE(cqe) (get_cqe_opcode(cqe) != MLX5_CQE_RESP_SEND)

/* Size in bytes of a UMR WQE that carries sgl_len KSM entries:
 * sizeof(struct mlx5e_umr_wqe) plus sgl_len * sizeof(struct mlx5_ksm).
 */
#define MLX5E_KSM_UMR_WQE_SZ(sgl_len)\
	(sizeof(struct mlx5e_umr_wqe) +\
	(sizeof(struct mlx5_ksm) * (sgl_len)))

/* The above WQE size expressed in MLX5_SEND_WQE_BB units, rounded up. */
#define MLX5E_KSM_UMR_WQEBBS(ksm_entries) \
	(DIV_ROUND_UP(MLX5E_KSM_UMR_WQE_SZ(ksm_entries), MLX5_SEND_WQE_BB))

/* The same WQE size expressed in MLX5_SEND_WQE_DS units, rounded up. */
#define MLX5E_KSM_UMR_DS_CNT(ksm_entries)\
	(DIV_ROUND_UP(MLX5E_KSM_UMR_WQE_SZ(ksm_entries), MLX5_SEND_WQE_DS))

/* Number of whole struct mlx5_ksm entries that fit in wqe_size bytes once
 * sizeof(struct mlx5e_umr_wqe) has been subtracted. The subtraction is done
 * on size_t operands, so wqe_size is expected to be at least that large.
 */
#define MLX5E_KSM_MAX_ENTRIES_PER_WQE(wqe_size)\
	(((wqe_size) - sizeof(struct mlx5e_umr_wqe)) / sizeof(struct mlx5_ksm))

/* The entry count above, rounded down to a multiple of
 * MLX5_UMR_KSM_NUM_ENTRIES_ALIGNMENT.
 */
#define MLX5E_KSM_ENTRIES_PER_WQE(wqe_size)\
	ALIGN_DOWN(MLX5E_KSM_MAX_ENTRIES_PER_WQE(wqe_size), MLX5_UMR_KSM_NUM_ENTRIES_ALIGNMENT)

/* Aligned KSM entry count for a WQE of
 * MLX5_SEND_WQE_BB * mlx5e_get_max_sq_aligned_wqebbs(mdev) bytes.
 */
#define MLX5E_MAX_KSM_PER_WQE(mdev) \
	MLX5E_KSM_ENTRIES_PER_WQE(MLX5_SEND_WQE_BB * mlx5e_get_max_sq_aligned_wqebbs(mdev))

static inline
ktime_t mlx5e_cqe_ts_to_ns(cqe_ts_to_ns func, struct mlx5_clock *clock, u64 cqe_ts)
{
+48 −23
Original line number Diff line number Diff line
@@ -74,6 +74,27 @@
#include "lib/devcom.h"
#include "lib/sd.h"

/* Tell whether @mdev advertises every capability this driver's HW-GRO
 * path checks for. Returns true only when all of the checks below pass.
 */
static bool mlx5e_hw_gro_supported(struct mlx5_core_dev *mdev)
{
	/* Beyond the shampo capability itself, our HW-GRO implementation
	 * relies on "KSM Mkey" for SHAMPO headers buffer mapping: the
	 * fixed-buffer capability must be present, its minimum log entity
	 * size must be reported as valid, and that minimum must not exceed
	 * the header entry size used by the driver.
	 */
	return MLX5_CAP_GEN(mdev, shampo) &&
	       MLX5_CAP_GEN(mdev, fixed_buffer_size) &&
	       MLX5_CAP_GEN_2(mdev, min_mkey_log_entity_size_fixed_buffer_valid) &&
	       MLX5_CAP_GEN_2(mdev, min_mkey_log_entity_size_fixed_buffer) <=
			MLX5E_SHAMPO_LOG_HEADER_ENTRY_SIZE;
}

bool mlx5e_check_fragmented_striding_rq_cap(struct mlx5_core_dev *mdev, u8 page_shift,
					    enum mlx5e_mpwrq_umr_mode umr_mode)
{
@@ -504,8 +525,8 @@ static int mlx5e_create_umr_mkey(struct mlx5_core_dev *mdev,
	return err;
}

static int mlx5e_create_umr_klm_mkey(struct mlx5_core_dev *mdev,
				     u64 nentries,
static int mlx5e_create_umr_ksm_mkey(struct mlx5_core_dev *mdev,
				     u64 nentries, u8 log_entry_size,
				     u32 *umr_mkey)
{
	int inlen;
@@ -525,12 +546,13 @@ static int mlx5e_create_umr_klm_mkey(struct mlx5_core_dev *mdev,
	MLX5_SET(mkc, mkc, umr_en, 1);
	MLX5_SET(mkc, mkc, lw, 1);
	MLX5_SET(mkc, mkc, lr, 1);
	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_KLMS);
	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_KSM);
	mlx5e_mkey_set_relaxed_ordering(mdev, mkc);
	MLX5_SET(mkc, mkc, qpn, 0xffffff);
	MLX5_SET(mkc, mkc, pd, mdev->mlx5e_res.hw_objs.pdn);
	MLX5_SET(mkc, mkc, translations_octword_size, nentries);
	MLX5_SET(mkc, mkc, length64, 1);
	MLX5_SET(mkc, mkc, log_page_size, log_entry_size);
	MLX5_SET64(mkc, mkc, len, nentries << log_entry_size);
	err = mlx5_core_create_mkey(mdev, umr_mkey, in, inlen);

	kvfree(in);
@@ -565,14 +587,16 @@ static int mlx5e_create_rq_umr_mkey(struct mlx5_core_dev *mdev, struct mlx5e_rq
static int mlx5e_create_rq_hd_umr_mkey(struct mlx5_core_dev *mdev,
				       struct mlx5e_rq *rq)
{
	u32 max_klm_size = BIT(MLX5_CAP_GEN(mdev, log_max_klm_list_size));
	u32 max_ksm_size = BIT(MLX5_CAP_GEN(mdev, log_max_klm_list_size));

	if (max_klm_size < rq->mpwqe.shampo->hd_per_wq) {
		mlx5_core_err(mdev, "max klm list size 0x%x is smaller than shampo header buffer list size 0x%x\n",
			      max_klm_size, rq->mpwqe.shampo->hd_per_wq);
	if (max_ksm_size < rq->mpwqe.shampo->hd_per_wq) {
		mlx5_core_err(mdev, "max ksm list size 0x%x is smaller than shampo header buffer list size 0x%x\n",
			      max_ksm_size, rq->mpwqe.shampo->hd_per_wq);
		return -EINVAL;
	}
	return mlx5e_create_umr_klm_mkey(mdev, rq->mpwqe.shampo->hd_per_wq,

	return mlx5e_create_umr_ksm_mkey(mdev, rq->mpwqe.shampo->hd_per_wq,
					 MLX5E_SHAMPO_LOG_HEADER_ENTRY_SIZE,
					 &rq->mpwqe.shampo->mkey);
}

@@ -1208,15 +1232,6 @@ void mlx5e_free_rx_missing_descs(struct mlx5e_rq *rq)
		head = mlx5_wq_ll_get_wqe_next_ix(wq, head);
	}

	if (test_bit(MLX5E_RQ_STATE_SHAMPO, &rq->state)) {
		u16 len;

		len = (rq->mpwqe.shampo->pi - rq->mpwqe.shampo->ci) &
		      (rq->mpwqe.shampo->hd_per_wq - 1);
		mlx5e_shampo_dealloc_hd(rq, len, rq->mpwqe.shampo->ci, false);
		rq->mpwqe.shampo->pi = rq->mpwqe.shampo->ci;
	}

	rq->mpwqe.actual_wq_head = wq->head;
	rq->mpwqe.umr_in_progress = 0;
	rq->mpwqe.umr_completed = 0;
@@ -1244,8 +1259,7 @@ void mlx5e_free_rx_descs(struct mlx5e_rq *rq)
		}

		if (test_bit(MLX5E_RQ_STATE_SHAMPO, &rq->state))
			mlx5e_shampo_dealloc_hd(rq, rq->mpwqe.shampo->hd_per_wq,
						0, true);
			mlx5e_shampo_dealloc_hd(rq);
	} else {
		struct mlx5_wq_cyc *wq = &rq->wqe.wq;
		u16 missing = mlx5_wq_cyc_missing(wq);
@@ -4259,13 +4273,19 @@ int mlx5e_set_features(struct net_device *netdev, netdev_features_t features)
#define MLX5E_HANDLE_FEATURE(feature, handler) \
	mlx5e_handle_feature(netdev, &oper_features, feature, handler)

	if (features & (NETIF_F_GRO_HW | NETIF_F_LRO)) {
		err |= MLX5E_HANDLE_FEATURE(NETIF_F_RXFCS, set_feature_rx_fcs);
		err |= MLX5E_HANDLE_FEATURE(NETIF_F_LRO, set_feature_lro);
		err |= MLX5E_HANDLE_FEATURE(NETIF_F_GRO_HW, set_feature_hw_gro);
	} else {
		err |= MLX5E_HANDLE_FEATURE(NETIF_F_LRO, set_feature_lro);
		err |= MLX5E_HANDLE_FEATURE(NETIF_F_GRO_HW, set_feature_hw_gro);
		err |= MLX5E_HANDLE_FEATURE(NETIF_F_RXFCS, set_feature_rx_fcs);
	}
	err |= MLX5E_HANDLE_FEATURE(NETIF_F_HW_VLAN_CTAG_FILTER,
				    set_feature_cvlan_filter);
	err |= MLX5E_HANDLE_FEATURE(NETIF_F_HW_TC, set_feature_hw_tc);
	err |= MLX5E_HANDLE_FEATURE(NETIF_F_RXALL, set_feature_rx_all);
	err |= MLX5E_HANDLE_FEATURE(NETIF_F_RXFCS, set_feature_rx_fcs);
	err |= MLX5E_HANDLE_FEATURE(NETIF_F_HW_VLAN_CTAG_RX, set_feature_rx_vlan);
#ifdef CONFIG_MLX5_EN_ARFS
	err |= MLX5E_HANDLE_FEATURE(NETIF_F_NTUPLE, set_feature_arfs);
@@ -5332,6 +5352,11 @@ static void mlx5e_build_nic_netdev(struct net_device *netdev)
	netdev->hw_features      |= NETIF_F_HW_VLAN_CTAG_FILTER;
	netdev->hw_features      |= NETIF_F_HW_VLAN_STAG_TX;

	if (mlx5e_hw_gro_supported(mdev) &&
	    mlx5e_check_fragmented_striding_rq_cap(mdev, PAGE_SHIFT,
						   MLX5E_MPWRQ_UMR_MODE_ALIGNED))
		netdev->hw_features    |= NETIF_F_GRO_HW;

	if (mlx5e_tunnel_any_tx_proto_supported(mdev)) {
		netdev->hw_enc_features |= NETIF_F_HW_CSUM;
		netdev->hw_enc_features |= NETIF_F_TSO;
Loading