Commit 24b8c193 authored by Jakub Kicinski
Browse files
Tony Nguyen says:

====================
idpf: XDP chapter II: convert Tx completion to libeth

Alexander Lobakin says:

XDP for idpf is currently 5 chapters:
* convert Rx to libeth;
* convert Tx completion to libeth (this);
* generic XDP and XSk code changes;
* actual XDP for idpf via libeth_xdp;
* XSk for idpf (^).

Part II does the following:
* adds generic libeth Tx completion routines;
* converts idpf to use generic libeth Tx comp routines;
* fixes Tx queue timeouts and robustifies Tx completion in general;
* fixes Tx event/descriptor flushes (writebacks).

Most idpf patches again remove more lines than they add.
Generic Tx completion helpers and structs are needed as libeth_xdp
(Ch. III) makes use of them. WB_ON_ITR is needed since XDPSQs don't
want to work without it at all. Tx queue timeouts fixes are needed
since without them, it's way easier to catch a Tx timeout event when
WB_ON_ITR is enabled.

* '200GbE' of git://git.kernel.org/pub/scm/linux/kernel/git/tnguy/next-queue:
  idpf: enable WB_ON_ITR
  idpf: fix netdev Tx queue stop/wake
  idpf: refactor Tx completion routines
  netdevice: add netdev_tx_reset_subqueue() shorthand
  idpf: convert to libeth Tx buffer completion
  libeth: add Tx buffer completion helpers
====================

Link: https://patch.msgid.link/20240909205323.3110312-1-anthony.l.nguyen@intel.com


Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parents b2c8a506 9c4a27da
Loading
Loading
Loading
Loading
+2 −0
Original line number Diff line number Diff line
@@ -97,8 +97,10 @@ static int idpf_intr_reg_init(struct idpf_vport *vport)
		intr->dyn_ctl = idpf_get_reg_addr(adapter,
						  reg_vals[vec_id].dyn_ctl_reg);
		intr->dyn_ctl_intena_m = PF_GLINT_DYN_CTL_INTENA_M;
		intr->dyn_ctl_intena_msk_m = PF_GLINT_DYN_CTL_INTENA_MSK_M;
		intr->dyn_ctl_itridx_s = PF_GLINT_DYN_CTL_ITR_INDX_S;
		intr->dyn_ctl_intrvl_s = PF_GLINT_DYN_CTL_INTERVAL_S;
		intr->dyn_ctl_wb_on_itr_m = PF_GLINT_DYN_CTL_WB_ON_ITR_M;

		spacing = IDPF_ITR_IDX_SPACING(reg_vals[vec_id].itrn_index_spacing,
					       IDPF_PF_ITR_IDX_SPACING);
+50 −60
Original line number Diff line number Diff line
@@ -2,6 +2,7 @@
/* Copyright (C) 2023 Intel Corporation */

#include <net/libeth/rx.h>
#include <net/libeth/tx.h>

#include "idpf.h"

@@ -224,6 +225,7 @@ static void idpf_tx_singleq_map(struct idpf_tx_queue *tx_q,
		/* record length, and DMA address */
		dma_unmap_len_set(tx_buf, len, size);
		dma_unmap_addr_set(tx_buf, dma, dma);
		tx_buf->type = LIBETH_SQE_FRAG;

		/* align size to end of page */
		max_data += -dma & (IDPF_TX_MAX_READ_REQ_SIZE - 1);
@@ -237,14 +239,17 @@ static void idpf_tx_singleq_map(struct idpf_tx_queue *tx_q,
								  offsets,
								  max_data,
								  td_tag);
			tx_desc++;
			i++;

			if (i == tx_q->desc_count) {
			if (unlikely(++i == tx_q->desc_count)) {
				tx_buf = &tx_q->tx_buf[0];
				tx_desc = &tx_q->base_tx[0];
				i = 0;
			} else {
				tx_buf++;
				tx_desc++;
			}

			tx_buf->type = LIBETH_SQE_EMPTY;

			dma += max_data;
			size -= max_data;

@@ -257,12 +262,14 @@ static void idpf_tx_singleq_map(struct idpf_tx_queue *tx_q,

		tx_desc->qw1 = idpf_tx_singleq_build_ctob(td_cmd, offsets,
							  size, td_tag);
		tx_desc++;
		i++;

		if (i == tx_q->desc_count) {
		if (unlikely(++i == tx_q->desc_count)) {
			tx_buf = &tx_q->tx_buf[0];
			tx_desc = &tx_q->base_tx[0];
			i = 0;
		} else {
			tx_buf++;
			tx_desc++;
		}

		size = skb_frag_size(frag);
@@ -270,8 +277,6 @@ static void idpf_tx_singleq_map(struct idpf_tx_queue *tx_q,

		dma = skb_frag_dma_map(tx_q->dev, frag, 0, size,
				       DMA_TO_DEVICE);

		tx_buf = &tx_q->tx_buf[i];
	}

	skb_tx_timestamp(first->skb);
@@ -282,13 +287,13 @@ static void idpf_tx_singleq_map(struct idpf_tx_queue *tx_q,
	tx_desc->qw1 = idpf_tx_singleq_build_ctob(td_cmd, offsets,
						  size, td_tag);

	IDPF_SINGLEQ_BUMP_RING_IDX(tx_q, i);
	first->type = LIBETH_SQE_SKB;
	first->rs_idx = i;

	/* set next_to_watch value indicating a packet is present */
	first->next_to_watch = tx_desc;
	IDPF_SINGLEQ_BUMP_RING_IDX(tx_q, i);

	nq = netdev_get_tx_queue(tx_q->netdev, tx_q->idx);
	netdev_tx_sent_queue(nq, first->bytecount);
	netdev_tx_sent_queue(nq, first->bytes);

	idpf_tx_buf_hw_update(tx_q, i, netdev_xmit_more());
}
@@ -306,8 +311,7 @@ idpf_tx_singleq_get_ctx_desc(struct idpf_tx_queue *txq)
	struct idpf_base_tx_ctx_desc *ctx_desc;
	int ntu = txq->next_to_use;

	memset(&txq->tx_buf[ntu], 0, sizeof(struct idpf_tx_buf));
	txq->tx_buf[ntu].ctx_entry = true;
	txq->tx_buf[ntu].type = LIBETH_SQE_CTX;

	ctx_desc = &txq->base_ctx[ntu];

@@ -371,6 +375,10 @@ netdev_tx_t idpf_tx_singleq_frame(struct sk_buff *skb,
				      IDPF_TX_DESCS_FOR_CTX)) {
		idpf_tx_buf_hw_update(tx_q, tx_q->next_to_use, false);

		u64_stats_update_begin(&tx_q->stats_sync);
		u64_stats_inc(&tx_q->q_stats.q_busy);
		u64_stats_update_end(&tx_q->stats_sync);

		return NETDEV_TX_BUSY;
	}

@@ -396,11 +404,11 @@ netdev_tx_t idpf_tx_singleq_frame(struct sk_buff *skb,
	first->skb = skb;

	if (tso) {
		first->gso_segs = offload.tso_segs;
		first->bytecount = skb->len + ((first->gso_segs - 1) * offload.tso_hdr_len);
		first->packets = offload.tso_segs;
		first->bytes = skb->len + ((first->packets - 1) * offload.tso_hdr_len);
	} else {
		first->bytecount = max_t(unsigned int, skb->len, ETH_ZLEN);
		first->gso_segs = 1;
		first->bytes = max_t(unsigned int, skb->len, ETH_ZLEN);
		first->packets = 1;
	}
	idpf_tx_singleq_map(tx_q, first, &offload);

@@ -420,10 +428,15 @@ netdev_tx_t idpf_tx_singleq_frame(struct sk_buff *skb,
static bool idpf_tx_singleq_clean(struct idpf_tx_queue *tx_q, int napi_budget,
				  int *cleaned)
{
	unsigned int total_bytes = 0, total_pkts = 0;
	struct libeth_sq_napi_stats ss = { };
	struct idpf_base_tx_desc *tx_desc;
	u32 budget = tx_q->clean_budget;
	s16 ntc = tx_q->next_to_clean;
	struct libeth_cq_pp cp = {
		.dev	= tx_q->dev,
		.ss	= &ss,
		.napi	= napi_budget,
	};
	struct idpf_netdev_priv *np;
	struct idpf_tx_buf *tx_buf;
	struct netdev_queue *nq;
@@ -441,47 +454,26 @@ static bool idpf_tx_singleq_clean(struct idpf_tx_queue *tx_q, int napi_budget,
		 * such. We can skip this descriptor since there is no buffer
		 * to clean.
		 */
		if (tx_buf->ctx_entry) {
			/* Clear this flag here to avoid stale flag values when
			 * this buffer is used for actual data in the future.
			 * There are cases where the tx_buf struct / the flags
			 * field will not be cleared before being reused.
			 */
			tx_buf->ctx_entry = false;
		if (unlikely(tx_buf->type <= LIBETH_SQE_CTX)) {
			tx_buf->type = LIBETH_SQE_EMPTY;
			goto fetch_next_txq_desc;
		}

		/* if next_to_watch is not set then no work pending */
		eop_desc = (struct idpf_base_tx_desc *)tx_buf->next_to_watch;
		if (!eop_desc)
		if (unlikely(tx_buf->type != LIBETH_SQE_SKB))
			break;

		/* prevent any other reads prior to eop_desc */
		/* prevent any other reads prior to type */
		smp_rmb();

		eop_desc = &tx_q->base_tx[tx_buf->rs_idx];

		/* if the descriptor isn't done, no work yet to do */
		if (!(eop_desc->qw1 &
		      cpu_to_le64(IDPF_TX_DESC_DTYPE_DESC_DONE)))
			break;

		/* clear next_to_watch to prevent false hangs */
		tx_buf->next_to_watch = NULL;

		/* update the statistics for this packet */
		total_bytes += tx_buf->bytecount;
		total_pkts += tx_buf->gso_segs;

		napi_consume_skb(tx_buf->skb, napi_budget);

		/* unmap skb header data */
		dma_unmap_single(tx_q->dev,
				 dma_unmap_addr(tx_buf, dma),
				 dma_unmap_len(tx_buf, len),
				 DMA_TO_DEVICE);

		/* clear tx_buf data */
		tx_buf->skb = NULL;
		dma_unmap_len_set(tx_buf, len, 0);
		libeth_tx_complete(tx_buf, &cp);

		/* unmap remaining buffers */
		while (tx_desc != eop_desc) {
@@ -495,13 +487,7 @@ static bool idpf_tx_singleq_clean(struct idpf_tx_queue *tx_q, int napi_budget,
			}

			/* unmap any remaining paged data */
			if (dma_unmap_len(tx_buf, len)) {
				dma_unmap_page(tx_q->dev,
					       dma_unmap_addr(tx_buf, dma),
					       dma_unmap_len(tx_buf, len),
					       DMA_TO_DEVICE);
				dma_unmap_len_set(tx_buf, len, 0);
			}
			libeth_tx_complete(tx_buf, &cp);
		}

		/* update budget only if we did something */
@@ -521,11 +507,11 @@ static bool idpf_tx_singleq_clean(struct idpf_tx_queue *tx_q, int napi_budget,
	ntc += tx_q->desc_count;
	tx_q->next_to_clean = ntc;

	*cleaned += total_pkts;
	*cleaned += ss.packets;

	u64_stats_update_begin(&tx_q->stats_sync);
	u64_stats_add(&tx_q->q_stats.packets, total_pkts);
	u64_stats_add(&tx_q->q_stats.bytes, total_bytes);
	u64_stats_add(&tx_q->q_stats.packets, ss.packets);
	u64_stats_add(&tx_q->q_stats.bytes, ss.bytes);
	u64_stats_update_end(&tx_q->stats_sync);

	np = netdev_priv(tx_q->netdev);
@@ -533,7 +519,7 @@ static bool idpf_tx_singleq_clean(struct idpf_tx_queue *tx_q, int napi_budget,

	dont_wake = np->state != __IDPF_VPORT_UP ||
		    !netif_carrier_ok(tx_q->netdev);
	__netif_txq_completed_wake(nq, total_pkts, total_bytes,
	__netif_txq_completed_wake(nq, ss.packets, ss.bytes,
				   IDPF_DESC_UNUSED(tx_q), IDPF_TX_WAKE_THRESH,
				   dont_wake);

@@ -1134,8 +1120,10 @@ int idpf_vport_singleq_napi_poll(struct napi_struct *napi, int budget)
						    &work_done);

	/* If work not completed, return budget and polling will return */
	if (!clean_complete)
	if (!clean_complete) {
		idpf_vport_intr_set_wb_on_itr(q_vector);
		return budget;
	}

	work_done = min_t(int, work_done, budget - 1);

@@ -1144,6 +1132,8 @@ int idpf_vport_singleq_napi_poll(struct napi_struct *napi, int budget)
	 */
	if (likely(napi_complete_done(napi, work_done)))
		idpf_vport_intr_update_itr_ena_irq(q_vector);
	else
		idpf_vport_intr_set_wb_on_itr(q_vector);

	return work_done;
}
+183 −212

File changed.

Preview size limit exceeded, changes collapsed.

+39 −53
Original line number Diff line number Diff line
@@ -127,11 +127,10 @@ do { \
 */
#define IDPF_TX_COMPLQ_PENDING(txq)	\
	(((txq)->num_completions_pending >= (txq)->complq->num_completions ? \
	0 : U64_MAX) + \
	0 : U32_MAX) + \
	(txq)->num_completions_pending - (txq)->complq->num_completions)

#define IDPF_TX_SPLITQ_COMPL_TAG_WIDTH	16
#define IDPF_SPLITQ_TX_INVAL_COMPL_TAG	-1
/* Adjust the generation for the completion tag and wrap if necessary */
#define IDPF_TX_ADJ_COMPL_TAG_GEN(txq) \
	((++(txq)->compl_tag_cur_gen) >= (txq)->compl_tag_gen_max ? \
@@ -149,47 +148,7 @@ union idpf_tx_flex_desc {
	struct idpf_flex_tx_sched_desc flow; /* flow based scheduling */
};

/**
 * struct idpf_tx_buf - software state kept alongside a Tx descriptor ring entry
 * @next_to_watch: Next descriptor to clean. Set to the packet's last
 *		   descriptor when a packet is queued; a NULL value means no
 *		   work is pending for this entry.
 * @skb: Pointer to the skb
 * @dma: DMA address
 * @len: DMA length
 * @bytecount: Number of bytes accounted for the packet. For TSO this is the
 *	       skb length plus one extra header length per additional
 *	       segment; otherwise the skb length, but at least ETH_ZLEN.
 * @gso_segs: Number of GSO segments (1 when the packet is not TSO)
 * @compl_tag: Splitq only, unique identifier for a buffer. Used to compare
 *	       with completion tag returned in buffer completion event.
 *	       Because the completion tag is expected to be the same in all
 *	       data descriptors for a given packet, and a single packet can
 *	       span multiple buffers, we need this field to track all
 *	       buffers associated with this completion tag independently of
 *	       the buf_id. The tag consists of a N bit buf_id and M upper
 *	       order "generation bits". See compl_tag_bufid_m and
 *	       compl_tag_gen_s in struct idpf_queue. We'll use a value of -1
 *	       to indicate the tag is not valid.
 * @ctx_entry: Singleq only. Used to indicate the corresponding entry
 *	       in the descriptor ring was used for a context descriptor and
 *	       this buffer entry should be skipped.
 *
 * @compl_tag and @ctx_entry live in an anonymous union and therefore share
 * storage; only the member matching the queue model is meaningful.
 */
struct idpf_tx_buf {
	void *next_to_watch;
	struct sk_buff *skb;
	DEFINE_DMA_UNMAP_ADDR(dma);
	DEFINE_DMA_UNMAP_LEN(len);
	unsigned int bytecount;
	unsigned short gso_segs;

	/* Queue-model specific state, overlapping in memory */
	union {
		/* splitq: tag matched against buffer completion events */
		int compl_tag;

		/* singleq: entry backs a context descriptor, skip on clean */
		bool ctx_entry;
	};
};

/**
 * struct idpf_tx_stash - Tx buffer wrapped so it can sit on a hash list
 * @hlist: Hash list node linking this entry
 * @buf: Embedded Tx buffer state carried by the entry
 *
 * NOTE(review): presumably holds buffers set aside while their completions
 * are outstanding - confirm against the users of this struct.
 */
struct idpf_tx_stash {
	struct hlist_node hlist;
	struct idpf_tx_buf buf;
};
/* Preprocessor alias: every later use of the identifier idpf_tx_buf
 * (including "struct idpf_tx_buf") expands to libeth_sqe.
 */
#define idpf_tx_buf libeth_sqe

/**
 * struct idpf_buf_lifo - LIFO for managing OOO completions
@@ -390,9 +349,11 @@ struct idpf_vec_regs {
 * struct idpf_intr_reg
 * @dyn_ctl: Dynamic control interrupt register
 * @dyn_ctl_intena_m: Mask for dyn_ctl interrupt enable
 * @dyn_ctl_intena_msk_m: Mask for dyn_ctl interrupt enable mask
 * @dyn_ctl_itridx_s: Register bit offset for ITR index
 * @dyn_ctl_itridx_m: Mask for ITR index
 * @dyn_ctl_intrvl_s: Register bit offset for ITR interval
 * @dyn_ctl_wb_on_itr_m: Mask for WB on ITR feature
 * @rx_itr: RX ITR register
 * @tx_itr: TX ITR register
 * @icr_ena: Interrupt cause register offset
@@ -401,9 +362,11 @@ struct idpf_vec_regs {
struct idpf_intr_reg {
	void __iomem *dyn_ctl;
	u32 dyn_ctl_intena_m;
	u32 dyn_ctl_intena_msk_m;
	u32 dyn_ctl_itridx_s;
	u32 dyn_ctl_itridx_m;
	u32 dyn_ctl_intrvl_s;
	u32 dyn_ctl_wb_on_itr_m;
	void __iomem *rx_itr;
	void __iomem *tx_itr;
	void __iomem *icr_ena;
@@ -424,6 +387,7 @@ struct idpf_intr_reg {
 * @intr_reg: See struct idpf_intr_reg
 * @napi: napi handler
 * @total_events: Number of interrupts processed
 * @wb_on_itr: whether WB on ITR is enabled
 * @tx_dim: Data for TX net_dim algorithm
 * @tx_itr_value: TX interrupt throttling rate
 * @tx_intr_mode: Dynamic ITR or not
@@ -454,6 +418,7 @@ struct idpf_q_vector {
	__cacheline_group_begin_aligned(read_write);
	struct napi_struct napi;
	u16 total_events;
	bool wb_on_itr;

	struct dim tx_dim;
	u16 tx_itr_value;
@@ -472,7 +437,7 @@ struct idpf_q_vector {
	cpumask_var_t affinity_mask;
	__cacheline_group_end_aligned(cold);
};
libeth_cacheline_set_assert(struct idpf_q_vector, 104,
libeth_cacheline_set_assert(struct idpf_q_vector, 112,
			    424 + 2 * sizeof(struct dim),
			    8 + sizeof(cpumask_var_t));

@@ -496,11 +461,6 @@ struct idpf_tx_queue_stats {
	u64_stats_t dma_map_errs;
};

/**
 * struct idpf_cleaned_stats - pair of packet/byte counters
 * @packets: Number of packets
 * @bytes: Number of bytes
 *
 * NOTE(review): judging by the name, these are presumably totals gathered
 * while cleaning a queue - confirm against the users of this struct.
 */
struct idpf_cleaned_stats {
	u32 packets;
	u32 bytes;
};

#define IDPF_ITR_DYNAMIC	1
#define IDPF_ITR_MAX		0x1FE0
#define IDPF_ITR_20K		0x0032
@@ -688,7 +648,7 @@ struct idpf_tx_queue {

		void *desc_ring;
	};
	struct idpf_tx_buf *tx_buf;
	struct libeth_sqe *tx_buf;
	struct idpf_txq_group *txq_grp;
	struct device *dev;
	void __iomem *tail;
@@ -831,7 +791,7 @@ struct idpf_compl_queue {
	u32 next_to_use;
	u32 next_to_clean;

	u32 num_completions;
	aligned_u64 num_completions;
	__cacheline_group_end_aligned(read_write);

	__cacheline_group_begin_aligned(cold);
@@ -963,7 +923,7 @@ struct idpf_txq_group {

	struct idpf_compl_queue *complq;

	u32 num_completions_pending;
	aligned_u64 num_completions_pending;
};

static inline int idpf_q_vector_to_mem(const struct idpf_q_vector *q_vector)
@@ -1033,6 +993,25 @@ static inline void idpf_tx_splitq_build_desc(union idpf_tx_flex_desc *desc,
		idpf_tx_splitq_build_flow_desc(desc, params, td_cmd, size);
}

/**
 * idpf_vport_intr_set_wb_on_itr - enable descriptor writeback on disabled interrupts
 * @q_vector: queue vector whose dynamic control register is programmed
 *
 * A no-op when @q_vector->wb_on_itr is already set. Otherwise the flag is
 * set and one value is written to the vector's dyn_ctl register: the
 * WB_ON_ITR mask, the interrupt-enable mask bit, and IDPF_NO_ITR_UPDATE_IDX
 * shifted to the ITR index offset.
 */
static inline void idpf_vport_intr_set_wb_on_itr(struct idpf_q_vector *q_vector)
{
	struct idpf_intr_reg *regs = &q_vector->intr_reg;

	if (!q_vector->wb_on_itr) {
		/* Record the state first so repeated calls skip the write */
		q_vector->wb_on_itr = true;

		writel((IDPF_NO_ITR_UPDATE_IDX << regs->dyn_ctl_itridx_s) |
		       regs->dyn_ctl_intena_msk_m |
		       regs->dyn_ctl_wb_on_itr_m,
		       regs->dyn_ctl);
	}
}

int idpf_vport_singleq_napi_poll(struct napi_struct *napi, int budget);
void idpf_vport_init_num_qs(struct idpf_vport *vport,
			    struct virtchnl2_create_vport *vport_msg);
@@ -1064,7 +1043,6 @@ void idpf_tx_dma_map_error(struct idpf_tx_queue *txq, struct sk_buff *skb,
			   struct idpf_tx_buf *first, u16 ring_idx);
unsigned int idpf_tx_desc_count_required(struct idpf_tx_queue *txq,
					 struct sk_buff *skb);
int idpf_tx_maybe_stop_common(struct idpf_tx_queue *tx_q, unsigned int size);
void idpf_tx_timeout(struct net_device *netdev, unsigned int txqueue);
netdev_tx_t idpf_tx_singleq_frame(struct sk_buff *skb,
				  struct idpf_tx_queue *tx_q);
@@ -1073,4 +1051,12 @@ bool idpf_rx_singleq_buf_hw_alloc_all(struct idpf_rx_queue *rxq,
				      u16 cleaned_count);
int idpf_tso(struct sk_buff *skb, struct idpf_tx_offload_params *off);

/**
 * idpf_tx_maybe_stop_common - run the netdev "maybe stop" check for a Tx queue
 * @tx_q: Tx queue to check; its netdev, index and unused descriptor count
 *	  are handed to netif_subqueue_maybe_stop()
 * @needed: descriptor threshold, passed as both trailing threshold arguments
 *
 * Return: true when netif_subqueue_maybe_stop() evaluates to zero, false for
 * any other result.
 * NOTE(review): per that helper's kernel-doc, zero should mean the queue was
 * stopped - confirm.
 */
static inline bool idpf_tx_maybe_stop_common(struct idpf_tx_queue *tx_q,
					     u32 needed)
{
	int ret;

	ret = netif_subqueue_maybe_stop(tx_q->netdev, tx_q->idx,
					IDPF_DESC_UNUSED(tx_q),
					needed, needed);

	return ret == 0;
}

#endif /* !_IDPF_TXRX_H_ */
+2 −0
Original line number Diff line number Diff line
@@ -97,7 +97,9 @@ static int idpf_vf_intr_reg_init(struct idpf_vport *vport)
		intr->dyn_ctl = idpf_get_reg_addr(adapter,
						  reg_vals[vec_id].dyn_ctl_reg);
		intr->dyn_ctl_intena_m = VF_INT_DYN_CTLN_INTENA_M;
		intr->dyn_ctl_intena_msk_m = VF_INT_DYN_CTLN_INTENA_MSK_M;
		intr->dyn_ctl_itridx_s = VF_INT_DYN_CTLN_ITR_INDX_S;
		intr->dyn_ctl_wb_on_itr_m = VF_INT_DYN_CTLN_WB_ON_ITR_M;

		spacing = IDPF_ITR_IDX_SPACING(reg_vals[vec_id].itrn_index_spacing,
					       IDPF_VF_ITR_IDX_SPACING);
Loading