Commit 7aba6664 authored by Jakub Kicinski
Browse files

Merge branch 'xsk-the-lost-bits-from-chapter-iii'

Alexander Lobakin says:

====================
xsk: the lost bits from Chapter III

Before introducing libeth_xdp, we need to add a couple more generic
helpers. Notably:

* 01: add generic loop unrolling hint helpers;
* 04: add helper to get both xdp_desc's DMA address and metadata
  pointer in one go, saving several cycles and hotpath object
  code size in drivers (especially when unrolling).

Bonus:

* 02, 03: convert two drivers which were using custom macros to
  generic unrolled_count() (trivial, no object code changes).
====================

Link: https://patch.msgid.link/20250206182630.3914318-1-aleksander.lobakin@intel.com


Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parents 5b281fe7 23d9324a
Loading
Loading
Loading
Loading
+3 −1
Original line number Diff line number Diff line
@@ -2,6 +2,7 @@
/* Copyright(c) 2018 Intel Corporation. */

#include <linux/bpf_trace.h>
#include <linux/unroll.h>
#include <net/xdp_sock_drv.h>
#include "i40e_txrx_common.h"
#include "i40e_xsk.h"
@@ -529,7 +530,8 @@ static void i40e_xmit_pkt_batch(struct i40e_ring *xdp_ring, struct xdp_desc *des
	dma_addr_t dma;
	u32 i;

	loop_unrolled_for(i = 0; i < PKTS_PER_BATCH; i++) {
	unrolled_count(PKTS_PER_BATCH)
	for (i = 0; i < PKTS_PER_BATCH; i++) {
		u32 cmd = I40E_TX_DESC_CMD_ICRC | xsk_is_eop_desc(&desc[i]);

		dma = xsk_buff_raw_get_dma(xdp_ring->xsk_pool, desc[i].addr);
+1 −9
Original line number Diff line number Diff line
@@ -6,7 +6,7 @@

#include <linux/types.h>

/* This value should match the pragma in the loop_unrolled_for
/* This value should match the pragma in the unrolled_count()
 * macro. Why 4? It is strictly empirical. It seems to be a good
 * compromise between the advantage of having simultaneous outstanding
 * reads to the DMA array that can hide each others latency and the
@@ -14,14 +14,6 @@
 */
#define PKTS_PER_BATCH 4

/*
 * loop_unrolled_for - a "for" keyword preceded by an unroll-by-4 hint.
 *
 * Clang and GCC (version 8 and newer) each get their own pragma spelling;
 * every other compiler sees a plain "for" with no hint attached.
 */
#if defined(__clang__)
#define loop_unrolled_for \
	_Pragma("clang loop unroll_count(4)") for
#elif defined(__GNUC__) && __GNUC__ >= 8
#define loop_unrolled_for \
	_Pragma("GCC unroll 4") for
#else
#define loop_unrolled_for \
	for
#endif

struct i40e_ring;
struct i40e_vsi;
struct net_device;
+3 −1
Original line number Diff line number Diff line
@@ -2,6 +2,7 @@
/* Copyright (c) 2019, Intel Corporation. */

#include <linux/bpf_trace.h>
#include <linux/unroll.h>
#include <net/xdp_sock_drv.h>
#include <net/xdp.h>
#include "ice.h"
@@ -989,7 +990,8 @@ static void ice_xmit_pkt_batch(struct ice_tx_ring *xdp_ring,
	struct ice_tx_desc *tx_desc;
	u32 i;

	loop_unrolled_for(i = 0; i < PKTS_PER_BATCH; i++) {
	unrolled_count(PKTS_PER_BATCH)
	for (i = 0; i < PKTS_PER_BATCH; i++) {
		dma_addr_t dma;

		dma = xsk_buff_raw_get_dma(xsk_pool, descs[i].addr);
+0 −8
Original line number Diff line number Diff line
@@ -7,14 +7,6 @@

#define PKTS_PER_BATCH 8

/*
 * loop_unrolled_for - a "for" keyword preceded by an unroll-by-8 hint.
 *
 * Clang and GCC (version 8 and newer) each get their own pragma spelling;
 * every other compiler sees a plain "for" with no hint attached.
 */
#if defined(__clang__)
#define loop_unrolled_for \
	_Pragma("clang loop unroll_count(8)") for
#elif defined(__GNUC__) && __GNUC__ >= 8
#define loop_unrolled_for \
	_Pragma("GCC unroll 8") for
#else
#define loop_unrolled_for \
	for
#endif

struct ice_vsi;

#ifdef CONFIG_XDP_SOCKETS
+44 −0
Original line number Diff line number Diff line
@@ -9,6 +9,50 @@

#include <linux/args.h>

/*
 * __pick_unrolled() takes two pragma bodies, the Clang spelling first and
 * the GCC spelling second, and turns the one matching the compiler in use
 * into a _Pragma(). Both parameters are direct operands of '#', so the
 * text is stringified exactly as written at the call site.
 */
#if defined(CONFIG_CC_IS_CLANG)
#define __pick_unrolled(clang_hint, gcc_hint)	_Pragma(#clang_hint)
#elif CONFIG_GCC_VERSION >= 80000
#define __pick_unrolled(clang_hint, gcc_hint)	_Pragma(#gcc_hint)
#else
/* Neither Clang nor GCC >= 8: the hint expands to nothing */
#define __pick_unrolled(clang_hint, gcc_hint)
#endif

/**
 * unrolled - loop attributes to ask the compiler to unroll it
 *
 * Usage:
 *
 * #define BATCH 8
 *
 *	unrolled_count(BATCH)
 *	for (u32 i = 0; i < BATCH; i++)
 *		// loop body without cross-iteration dependencies
 *
 * This is only a hint and the compiler is free to disable unrolling if it
 * thinks the count is suboptimal and may hurt performance and/or hugely
 * increase object code size.
 * Not having any cross-iteration dependencies (i.e. when iter x + 1 depends
 * on what iter x will do with variables) is not a strict requirement, but
 * provides best performance and object code size.
 * Available only on Clang and GCC 8.x onwards.
 */

/* Ask the compiler to pick an optimal unroll count, Clang only */
#define unrolled							\
	__pick_unrolled(clang loop unroll(enable), /* nothing */)

/* Unroll each @n iterations of the loop */
#define unrolled_count(n)						\
	__pick_unrolled(clang loop unroll_count(n), GCC unroll n)

/* Unroll the whole loop */
#define unrolled_full							\
	__pick_unrolled(clang loop unroll(full), GCC unroll 65534)

/* Never unroll the loop */
#define unrolled_none							\
	__pick_unrolled(clang loop unroll(disable), GCC unroll 1)

#define UNROLL(N, MACRO, args...) CONCATENATE(__UNROLL_, N)(MACRO, args)

#define __UNROLL_0(MACRO, args...)
Loading