Commit 24cf78c7 authored by Dragos Tatulea, committed by Jakub Kicinski
Browse files

net/mlx5e: SHAMPO, Switch to header memcpy

Previously the HW-GRO code was using a separate page_pool for the header
buffer. The pages of the header buffer were replenished via UMR. This
mechanism has some drawbacks:
- Reference counting on the page_pool page frags is not cheap.
- UMRs have HW overhead for updating and also for access, especially for
  the KLM type which was previously used.
- UMR code for headers is complex.

This patch switches to using a static memory area (static MTT MKEY) for
the header buffer and does a header memcpy. This happens only once per
GRO session. The SKB is allocated from the per-cpu NAPI SKB cache.

Performance numbers for x86:
+---------------------------------------------------------+
| Test                | Baseline   | Header Copy | Change |
|---------------------+------------+-------------+--------|
| iperf3 oncpu        |  59.5 Gbps |  64.00 Gbps |   7 %  |
| iperf3 offcpu       | 102.5 Gbps | 104.20 Gbps |   2 %  |
| kperf oncpu         | 115.0 Gbps | 130.00 Gbps |  12 %  |
| XDP_DROP (skb mode) |   3.9 Mpps |   3.9 Mpps  |   0 %  |
+---------------------------------------------------------+

Notes on test:
- System: Intel(R) Xeon(R) Platinum 8380 CPU @ 2.30GHz
- oncpu: NAPI and application running on same CPU
- offcpu: NAPI and application running on different CPUs
- MTU: 1500
- iperf3 tests are single stream, 60s with IPv6 (for slightly larger
  headers)
- kperf version [1]

[1] git://git.kernel.dk/kperf.git



Suggested-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Dragos Tatulea <dtatulea@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Link: https://patch.msgid.link/20260204200345.1724098-1-tariqt@nvidia.com


Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parent 215b5309
Loading
Loading
Loading
Loading
+8 −12
Original line number Diff line number Diff line
@@ -82,9 +82,10 @@ struct page_pool;

#define MLX5E_RX_MAX_HEAD (256)
#define MLX5E_SHAMPO_LOG_HEADER_ENTRY_SIZE (8)
#define MLX5E_SHAMPO_LOG_MAX_HEADER_ENTRY_SIZE (9)
#define MLX5E_SHAMPO_WQ_HEADER_PER_PAGE (PAGE_SIZE >> MLX5E_SHAMPO_LOG_MAX_HEADER_ENTRY_SIZE)
#define MLX5E_SHAMPO_LOG_WQ_HEADER_PER_PAGE (PAGE_SHIFT - MLX5E_SHAMPO_LOG_MAX_HEADER_ENTRY_SIZE)
#define MLX5E_SHAMPO_WQ_HEADER_PER_PAGE \
	(PAGE_SIZE >> MLX5E_SHAMPO_LOG_HEADER_ENTRY_SIZE)
#define MLX5E_SHAMPO_LOG_WQ_HEADER_PER_PAGE \
	(PAGE_SHIFT - MLX5E_SHAMPO_LOG_HEADER_ENTRY_SIZE)
#define MLX5E_SHAMPO_WQ_BASE_HEAD_ENTRY_SIZE_SHIFT (6)
#define MLX5E_SHAMPO_WQ_RESRV_SIZE_BASE_SHIFT (12)
#define MLX5E_SHAMPO_WQ_LOG_RESRV_SIZE (16)
@@ -638,16 +639,11 @@ struct mlx5e_dma_info {
};

struct mlx5e_shampo_hd {
	struct mlx5e_frag_page *pages;
	u32 hd_per_wq;
	u32 hd_per_page;
	u16 hd_per_wqe;
	u8 log_hd_per_page;
	u8 log_hd_entry_size;
	unsigned long *bitmap;
	u16 pi;
	u16 ci;
	__be32 mkey_be;
	u32 hd_buf_size;
	u32 mkey;
	u32 nentries;
	DECLARE_FLEX_ARRAY(struct mlx5e_dma_info, hd_buf_pages);
};

struct mlx5e_hw_gro_data {
+0 −23
Original line number Diff line number Diff line
@@ -1068,26 +1068,6 @@ u32 mlx5e_shampo_hd_per_wq(struct mlx5_core_dev *mdev,
	return hd_per_wq;
}

/*
 * Compute the number of WQEBBs that the SHAMPO header UMRs contribute to the
 * ICOSQ size for one RQ.
 *
 * The header entries of a single WQE (mlx5e_shampo_hd_per_wqe()) are divided
 * into UMRs of at most MLX5E_MAX_KSM_PER_WQE(mdev) entries each, plus one
 * smaller UMR for any remainder. The per-WQE total is then multiplied by the
 * number of WQEs in the work queue (2^log_wq_sz, read from @rq_param).
 *
 * Return: the resulting WQEBB count.
 */
static u32 mlx5e_shampo_icosq_sz(struct mlx5_core_dev *mdev,
				 struct mlx5e_params *params,
				 struct mlx5e_rq_param *rq_param)
{
	int max_num_of_umr_per_wqe, max_hd_per_wqe, max_ksm_per_umr, rest;
	void *wqc = MLX5_ADDR_OF(rqc, rq_param->rqc, wq);
	int wq_size = BIT(MLX5_GET(wq, wqc, log_wq_sz));
	u32 wqebbs;

	max_ksm_per_umr = MLX5E_MAX_KSM_PER_WQE(mdev);
	max_hd_per_wqe = mlx5e_shampo_hd_per_wqe(mdev, params, rq_param);
	/* Number of maximally-filled UMRs per WQE and the leftover entries. */
	max_num_of_umr_per_wqe = max_hd_per_wqe / max_ksm_per_umr;
	rest = max_hd_per_wqe % max_ksm_per_umr;
	/* NOTE(review): MLX5E_KSM_UMR_WQEBBS(n) is presumably the WQEBB cost
	 * of one UMR carrying n KSM entries - confirm against its definition.
	 */
	wqebbs = MLX5E_KSM_UMR_WQEBBS(max_ksm_per_umr) * max_num_of_umr_per_wqe;
	if (rest)
		wqebbs += MLX5E_KSM_UMR_WQEBBS(rest);
	/* Scale the per-WQE cost to every WQE in the work queue. */
	wqebbs *= wq_size;
	return wqebbs;
}

#define MLX5E_LRO_TIMEOUT_ARR_SIZE                      4

u32 mlx5e_choose_lro_timeout(struct mlx5_core_dev *mdev, u32 wanted_timeout)
@@ -1173,9 +1153,6 @@ static u8 mlx5e_build_icosq_log_wq_sz(struct mlx5_core_dev *mdev,
		wqebbs += max_xsk_wqebbs;
	}

	if (params->packet_merge.type == MLX5E_PACKET_MERGE_SHAMPO)
		wqebbs += mlx5e_shampo_icosq_sz(mdev, params, rqp);

	/* UMR WQEs don't cross the page boundary, they are padded with NOPs.
	 * This padding is always smaller than the max WQE size. That gives us
	 * at least (PAGE_SIZE - (max WQE size - MLX5_SEND_WQE_BB)) useful bytes
+0 −1
Original line number Diff line number Diff line
@@ -65,7 +65,6 @@ ktime_t mlx5e_cqe_ts_to_ns(cqe_ts_to_ns func, struct mlx5_clock *clock, u64 cqe_
enum mlx5e_icosq_wqe_type {
	MLX5E_ICOSQ_WQE_NOP,
	MLX5E_ICOSQ_WQE_UMR_RX,
	MLX5E_ICOSQ_WQE_SHAMPO_HD_UMR,
#ifdef CONFIG_MLX5_EN_TLS
	MLX5E_ICOSQ_WQE_UMR_TLS,
	MLX5E_ICOSQ_WQE_SET_PSV_TLS,
+121 −166
Original line number Diff line number Diff line
@@ -492,40 +492,6 @@ static int mlx5e_create_umr_mkey(struct mlx5_core_dev *mdev,
	return err;
}

/*
 * Create a UMR-enabled mkey in KSM access mode.
 *
 * @mdev:           device the mkey is created on; its PD number is used.
 * @nentries:       number of translation entries; written to
 *                  translations_octword_size.
 * @log_entry_size: log2 of the entry size; written to log_page_size.
 * @umr_mkey:       output, filled in by mlx5_core_create_mkey().
 *
 * The mkey length is set to nentries << log_entry_size. No translation
 * entries are passed with the command: the input buffer is exactly
 * MLX5_ST_SZ_BYTES(create_mkey_in) and is zero-initialized.
 *
 * Return: -ENOMEM if the command buffer cannot be allocated, otherwise the
 * result of mlx5_core_create_mkey().
 */
static int mlx5e_create_umr_ksm_mkey(struct mlx5_core_dev *mdev,
				     u64 nentries, u8 log_entry_size,
				     u32 *umr_mkey)
{
	int inlen;
	void *mkc;
	u32 *in;
	int err;

	inlen = MLX5_ST_SZ_BYTES(create_mkey_in);

	in = kvzalloc(inlen, GFP_KERNEL);
	if (!in)
		return -ENOMEM;

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);

	/* NOTE(review): free=1 together with umr_en=1 presumably means the
	 * mkey starts without valid translations and is populated later via
	 * UMR - confirm against the device specification.
	 */
	MLX5_SET(mkc, mkc, free, 1);
	MLX5_SET(mkc, mkc, umr_en, 1);
	/* Local write and local read access. */
	MLX5_SET(mkc, mkc, lw, 1);
	MLX5_SET(mkc, mkc, lr, 1);
	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_KSM);
	mlx5e_mkey_set_relaxed_ordering(mdev, mkc);
	MLX5_SET(mkc, mkc, qpn, 0xffffff);
	MLX5_SET(mkc, mkc, pd, mdev->mlx5e_res.hw_objs.pdn);
	MLX5_SET(mkc, mkc, translations_octword_size, nentries);
	MLX5_SET(mkc, mkc, log_page_size, log_entry_size);
	MLX5_SET64(mkc, mkc, len, nentries << log_entry_size);
	err = mlx5_core_create_mkey(mdev, umr_mkey, in, inlen);

	/* The command buffer is only needed for the duration of the call. */
	kvfree(in);
	return err;
}

static int mlx5e_create_rq_umr_mkey(struct mlx5_core_dev *mdev, struct mlx5e_rq *rq)
{
	u32 xsk_chunk_size = rq->xsk_pool ? rq->xsk_pool->chunk_size : 0;
@@ -551,29 +517,6 @@ static int mlx5e_create_rq_umr_mkey(struct mlx5_core_dev *mdev, struct mlx5e_rq
	return err;
}

/*
 * Create the KSM UMR mkey used for the SHAMPO header buffer of an RQ.
 *
 * @mdev:      device the mkey is created on.
 * @hd_per_wq: number of header entries in the work queue; used as the number
 *             of KSM entries of the mkey.
 * @umr_mkey:  output, receives the new mkey converted to big-endian. Written
 *             only on success.
 *
 * Return: 0 on success, -EINVAL if @hd_per_wq exceeds the list size derived
 * from the log_max_klm_list_size capability, or the error returned by
 * mlx5e_create_umr_ksm_mkey().
 */
static int mlx5e_create_rq_hd_umr_mkey(struct mlx5_core_dev *mdev,
				       u16 hd_per_wq, __be32 *umr_mkey)
{
	u32 max_ksm_size = BIT(MLX5_CAP_GEN(mdev, log_max_klm_list_size));
	u32 mkey;
	int err;

	/* Reject header buffers that need more entries than the device cap. */
	if (max_ksm_size < hd_per_wq) {
		mlx5_core_err(mdev, "max ksm list size 0x%x is smaller than shampo header buffer list size 0x%x\n",
			      max_ksm_size, hd_per_wq);
		return -EINVAL;
	}

	err = mlx5e_create_umr_ksm_mkey(mdev, hd_per_wq,
					MLX5E_SHAMPO_LOG_HEADER_ENTRY_SIZE,
					&mkey);
	if (err)
		return err;

	/* Callers keep the key in big-endian form. */
	*umr_mkey = cpu_to_be32(mkey);
	return 0;
}

static void mlx5e_init_frags_partition(struct mlx5e_rq *rq)
{
	struct mlx5e_wqe_frag_info next_frag = {};
@@ -754,145 +697,169 @@ static int mlx5e_init_rxq_rq(struct mlx5e_channel *c, struct mlx5e_params *param
				  xdp_frag_size);
}

static int mlx5e_rq_shampo_hd_info_alloc(struct mlx5e_rq *rq, u16 hd_per_wq,
					 int node)
static void mlx5e_release_rq_hd_pages(struct mlx5e_rq *rq,
				      struct mlx5e_shampo_hd *shampo)

{
	struct mlx5e_shampo_hd *shampo = rq->mpwqe.shampo;
	for (int i = 0; i < shampo->nentries; i++) {
		struct mlx5e_dma_info *info = &shampo->hd_buf_pages[i];

	shampo->hd_per_wq = hd_per_wq;
		if (!info->page)
			continue;

		dma_unmap_page(rq->pdev, info->addr, PAGE_SIZE,
			       rq->buff.map_dir);
		__free_page(info->page);
	}
}

static int mlx5e_alloc_rq_hd_pages(struct mlx5e_rq *rq, int node,
				   struct mlx5e_shampo_hd *shampo)
{
	int err, i;

	for (i = 0; i < shampo->nentries; i++) {
		struct page *page = alloc_pages_node(node, GFP_KERNEL, 0);
		dma_addr_t addr;

		if (!page) {
			err = -ENOMEM;
			goto err_free_pages;
		}

		addr = dma_map_page(rq->pdev, page, 0, PAGE_SIZE,
				    rq->buff.map_dir);
		err = dma_mapping_error(rq->pdev, addr);
		if (err) {
			__free_page(page);
			goto err_free_pages;
		}

	shampo->bitmap = bitmap_zalloc_node(hd_per_wq, GFP_KERNEL, node);
	shampo->pages = kvzalloc_node(array_size(hd_per_wq,
						 sizeof(*shampo->pages)),
				      GFP_KERNEL, node);
	if (!shampo->bitmap || !shampo->pages)
		goto err_nomem;
		shampo->hd_buf_pages[i].page = page;
		shampo->hd_buf_pages[i].addr = addr;
	}

	return 0;

err_nomem:
	kvfree(shampo->pages);
	bitmap_free(shampo->bitmap);
err_free_pages:
	mlx5e_release_rq_hd_pages(rq, shampo);

	return -ENOMEM;
	return err;
}

static void mlx5e_rq_shampo_hd_info_free(struct mlx5e_rq *rq)
static int mlx5e_create_rq_hd_mkey(struct mlx5_core_dev *mdev,
				   struct mlx5e_shampo_hd *shampo)
{
	kvfree(rq->mpwqe.shampo->pages);
	bitmap_free(rq->mpwqe.shampo->bitmap);
	enum mlx5e_mpwrq_umr_mode umr_mode = MLX5E_MPWRQ_UMR_MODE_ALIGNED;
	struct mlx5_mtt *mtt;
	void *mkc, *in;
	int inlen, err;
	u32 octwords;

	octwords = mlx5e_mpwrq_umr_octowords(shampo->nentries, umr_mode);
	inlen = MLX5_FLEXIBLE_INLEN(mdev, MLX5_ST_SZ_BYTES(create_mkey_in),
				    MLX5_OCTWORD, octwords);
	if (inlen < 0)
		return inlen;

	in = kvzalloc(inlen, GFP_KERNEL);
	if (!in)
		return -ENOMEM;

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);

	MLX5_SET(mkc, mkc, lw, 1);
	MLX5_SET(mkc, mkc, lr, 1);
	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT);
	mlx5e_mkey_set_relaxed_ordering(mdev, mkc);
	MLX5_SET(mkc, mkc, qpn, 0xffffff);
	MLX5_SET(mkc, mkc, pd, mdev->mlx5e_res.hw_objs.pdn);
	MLX5_SET64(mkc, mkc, len, shampo->hd_buf_size);
	MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT);
	MLX5_SET(mkc, mkc, translations_octword_size, octwords);
	MLX5_SET(create_mkey_in, in, translations_octword_actual_size,
		 octwords);

	mtt = MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
	for (int i = 0; i < shampo->nentries; i++)
		mtt[i].ptag = cpu_to_be64(shampo->hd_buf_pages[i].addr);

	err = mlx5_core_create_mkey(mdev, &shampo->mkey, in, inlen);

	kvfree(in);
	return err;
}

static int mlx5_rq_shampo_alloc(struct mlx5_core_dev *mdev,
				struct mlx5e_params *params,
				struct mlx5e_rq_param *rqp,
				struct mlx5e_rq *rq,
				u32 *pool_size,
				int node)
{
	void *wqc = MLX5_ADDR_OF(rqc, rqp->rqc, wq);
	u8 log_hd_per_page, log_hd_entry_size;
	u16 hd_per_wq, hd_per_wqe;
	u32 hd_pool_size;
	int wq_size;
	int err;
	struct mlx5e_shampo_hd *shampo;
	int nentries, err, shampo_sz;
	u32 hd_per_wq, hd_buf_size;

	if (!test_bit(MLX5E_RQ_STATE_SHAMPO, &rq->state))
		return 0;

	rq->mpwqe.shampo = kvzalloc_node(sizeof(*rq->mpwqe.shampo),
					 GFP_KERNEL, node);
	if (!rq->mpwqe.shampo)
		return -ENOMEM;

	/* split headers data structures */
	hd_per_wq = mlx5e_shampo_hd_per_wq(mdev, params, rqp);
	err = mlx5e_rq_shampo_hd_info_alloc(rq, hd_per_wq, node);
	if (err)
		goto err_shampo_hd_info_alloc;

	err = mlx5e_create_rq_hd_umr_mkey(mdev, hd_per_wq,
					  &rq->mpwqe.shampo->mkey_be);
	if (err)
		goto err_umr_mkey;

	hd_per_wqe = mlx5e_shampo_hd_per_wqe(mdev, params, rqp);
	wq_size = BIT(MLX5_GET(wq, wqc, log_wq_sz));

	BUILD_BUG_ON(MLX5E_SHAMPO_LOG_MAX_HEADER_ENTRY_SIZE > PAGE_SHIFT);
	if (hd_per_wqe >= MLX5E_SHAMPO_WQ_HEADER_PER_PAGE) {
		log_hd_per_page = MLX5E_SHAMPO_LOG_WQ_HEADER_PER_PAGE;
		log_hd_entry_size = MLX5E_SHAMPO_LOG_MAX_HEADER_ENTRY_SIZE;
	} else {
		log_hd_per_page = order_base_2(hd_per_wqe);
		log_hd_entry_size = order_base_2(PAGE_SIZE / hd_per_wqe);
	hd_buf_size = hd_per_wq * BIT(MLX5E_SHAMPO_LOG_HEADER_ENTRY_SIZE);
	nentries = hd_buf_size / PAGE_SIZE;
	if (!nentries) {
		mlx5_core_err(mdev, "SHAMPO header buffer size %u < %lu\n",
			      hd_buf_size, PAGE_SIZE);
		return -EINVAL;
	}

	rq->mpwqe.shampo->hd_per_wqe = hd_per_wqe;
	rq->mpwqe.shampo->hd_per_page = BIT(log_hd_per_page);
	rq->mpwqe.shampo->log_hd_per_page = log_hd_per_page;
	rq->mpwqe.shampo->log_hd_entry_size = log_hd_entry_size;

	hd_pool_size = (hd_per_wqe * wq_size) >> log_hd_per_page;

	if (netif_rxq_has_unreadable_mp(rq->netdev, rq->ix)) {
		/* Separate page pool for shampo headers */
		struct page_pool_params pp_params = { };
	shampo_sz = struct_size(shampo, hd_buf_pages, nentries);
	shampo = kvzalloc_node(shampo_sz, GFP_KERNEL, node);
	if (!shampo)
		return -ENOMEM;

		pp_params.order     = 0;
		pp_params.flags     = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV;
		pp_params.pool_size = hd_pool_size;
		pp_params.nid       = node;
		pp_params.dev       = rq->pdev;
		pp_params.napi      = rq->cq.napi;
		pp_params.netdev    = rq->netdev;
		pp_params.dma_dir   = rq->buff.map_dir;
		pp_params.max_len   = PAGE_SIZE;
	shampo->hd_per_wq = hd_per_wq;
	shampo->hd_buf_size = hd_buf_size;
	shampo->nentries = nentries;
	err = mlx5e_alloc_rq_hd_pages(rq, node, shampo);
	if (err)
		goto err_free;

		rq->hd_page_pool = page_pool_create(&pp_params);
		if (IS_ERR(rq->hd_page_pool)) {
			err = PTR_ERR(rq->hd_page_pool);
			rq->hd_page_pool = NULL;
			goto err_hds_page_pool;
		}
	} else {
		/* Common page pool, reserve space for headers. */
		*pool_size += hd_pool_size;
		rq->hd_page_pool = NULL;
	}
	err = mlx5e_create_rq_hd_mkey(mdev, shampo);
	if (err)
		goto err_release_pages;

	/* gro only data structures */
	rq->hw_gro_data = kvzalloc_node(sizeof(*rq->hw_gro_data), GFP_KERNEL, node);
	if (!rq->hw_gro_data) {
		err = -ENOMEM;
		goto err_hw_gro_data;
		goto err_destroy_mkey;
	}

	rq->mpwqe.shampo = shampo;

	return 0;

err_hw_gro_data:
	page_pool_destroy(rq->hd_page_pool);
err_hds_page_pool:
	mlx5_core_destroy_mkey(mdev, be32_to_cpu(rq->mpwqe.shampo->mkey_be));
err_umr_mkey:
	mlx5e_rq_shampo_hd_info_free(rq);
err_shampo_hd_info_alloc:
	kvfree(rq->mpwqe.shampo);
err_destroy_mkey:
	mlx5_core_destroy_mkey(mdev, shampo->mkey);
err_release_pages:
	mlx5e_release_rq_hd_pages(rq, shampo);
err_free:
	kvfree(shampo);

	return err;
}

static void mlx5e_rq_free_shampo(struct mlx5e_rq *rq)
{
	if (!test_bit(MLX5E_RQ_STATE_SHAMPO, &rq->state))
	struct mlx5e_shampo_hd *shampo = rq->mpwqe.shampo;

	if (!shampo)
		return;

	kvfree(rq->hw_gro_data);
	if (rq->hd_page_pool != rq->page_pool)
		page_pool_destroy(rq->hd_page_pool);
	mlx5e_rq_shampo_hd_info_free(rq);
	mlx5_core_destroy_mkey(rq->mdev,
			       be32_to_cpu(rq->mpwqe.shampo->mkey_be));
	kvfree(rq->mpwqe.shampo);
	mlx5_core_destroy_mkey(rq->mdev, shampo->mkey);
	mlx5e_release_rq_hd_pages(rq, shampo);
	kvfree(shampo);
}

static int mlx5e_alloc_rq(struct mlx5e_params *params,
@@ -970,7 +937,7 @@ static int mlx5e_alloc_rq(struct mlx5e_params *params,
		if (err)
			goto err_rq_mkey;

		err = mlx5_rq_shampo_alloc(mdev, params, rqp, rq, &pool_size, node);
		err = mlx5_rq_shampo_alloc(mdev, params, rqp, rq, node);
		if (err)
			goto err_free_mpwqe_info;

@@ -1165,8 +1132,7 @@ int mlx5e_create_rq(struct mlx5e_rq *rq, struct mlx5e_rq_param *param, u16 q_cou
	if (test_bit(MLX5E_RQ_STATE_SHAMPO, &rq->state)) {
		MLX5_SET(wq, wq, log_headers_buffer_entry_num,
			 order_base_2(rq->mpwqe.shampo->hd_per_wq));
		MLX5_SET(wq, wq, headers_mkey,
			 be32_to_cpu(rq->mpwqe.shampo->mkey_be));
		MLX5_SET(wq, wq, headers_mkey, rq->mpwqe.shampo->mkey);
	}

	mlx5_fill_page_frag_array(&rq->wq_ctrl.buf,
@@ -1326,14 +1292,6 @@ void mlx5e_free_rx_missing_descs(struct mlx5e_rq *rq)
	rq->mpwqe.actual_wq_head = wq->head;
	rq->mpwqe.umr_in_progress = 0;
	rq->mpwqe.umr_completed = 0;

	if (test_bit(MLX5E_RQ_STATE_SHAMPO, &rq->state)) {
		struct mlx5e_shampo_hd *shampo = rq->mpwqe.shampo;
		u16 len;

		len = (shampo->pi - shampo->ci) & shampo->hd_per_wq;
		mlx5e_shampo_fill_umr(rq, len);
	}
}

void mlx5e_free_rx_descs(struct mlx5e_rq *rq)
@@ -1356,9 +1314,6 @@ void mlx5e_free_rx_descs(struct mlx5e_rq *rq)
			mlx5_wq_ll_pop(wq, wqe_ix_be,
				       &wqe->next.next_wqe_index);
		}

		if (test_bit(MLX5E_RQ_STATE_SHAMPO, &rq->state))
			mlx5e_shampo_dealloc_hd(rq);
	} else {
		struct mlx5_wq_cyc *wq = &rq->wqe.wq;
		u16 missing = mlx5_wq_cyc_missing(wq);
+59 −282

File changed.

Preview size limit exceeded, changes collapsed.