Commit e6c91556 authored by Alexander Lobakin's avatar Alexander Lobakin Committed by Tony Nguyen
Browse files

libeth: add Rx buffer management



Add a couple intuitive helpers to hide Rx buffer implementation details
in the library and not multiplicate it between drivers. The settings are
sorta optimized for 100G+ NICs, but nothing really HW-specific here.
Use the new page_pool_dev_alloc() to dynamically switch between
split-page and full-page modes depending on MTU, page size, required
headroom etc. For example, on x86_64 with the default driver settings
each page is shared between 2 buffers. Turning on XDP (not in this
series) -> increasing headroom requirement pushes truesize out of 2048
boundary, leading to that each buffer starts getting a full page.
The "ceiling" limit is %PAGE_SIZE, as only order-0 pages are used to
avoid compound overhead. For the above architecture, this means maximum
linear frame size of 3712 w/o XDP.
Not that &libeth_buf_queue is not a complete queue/ring structure for
now, rather a shim, but eventually the libeth-enabled drivers will move
to it, with iavf being the first one.

Signed-off-by: default avatarAlexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: default avatarTony Nguyen <anthony.l.nguyen@intel.com>
parent ce230f4f
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -3,6 +3,7 @@

config LIBETH
	tristate
	select PAGE_POOL
	help
	  libeth is a common library containing routines shared between several
	  drivers, but not yet promoted to the generic kernel API.
+98 −0
Original line number Diff line number Diff line
@@ -3,6 +3,104 @@

#include <net/libeth/rx.h>

/* Rx buffer management */

/**
 * libeth_rx_hw_len - get the actual buffer size to be passed to HW
 * @pp: &page_pool_params of the netdev to calculate the size for
 * @max_len: maximum buffer size for a single descriptor
 *
 * Return: HW-writeable length per one buffer to pass it to the HW accounting:
 * MTU the @dev has, HW required alignment, minimum and maximum allowed values,
 * and system's page size.
 */
static u32 libeth_rx_hw_len(const struct page_pool_params *pp, u32 max_len)
{
	u32 len;

	len = READ_ONCE(pp->netdev->mtu) + LIBETH_RX_LL_LEN;
	len = ALIGN(len, LIBETH_RX_BUF_STRIDE);
	len = min3(len, ALIGN_DOWN(max_len ? : U32_MAX, LIBETH_RX_BUF_STRIDE),
		   pp->max_len);

	return len;
}

/**
 * libeth_rx_fq_create - create a PP with the default libeth settings
 * @fq: buffer queue struct to fill
 * @napi: &napi_struct covering this PP (no usage outside its poll loops)
 *
 * Return: %0 on success, -%errno on failure.
 */
int libeth_rx_fq_create(struct libeth_fq *fq, struct napi_struct *napi)
{
	struct page_pool_params pp = {
		.flags		= PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV,
		.order		= LIBETH_RX_PAGE_ORDER,
		.pool_size	= fq->count,
		.nid		= fq->nid,
		.dev		= napi->dev->dev.parent,
		.netdev		= napi->dev,
		.napi		= napi,
		.dma_dir	= DMA_FROM_DEVICE,
		.offset		= LIBETH_SKB_HEADROOM,
	};
	struct libeth_fqe *fqes;
	struct page_pool *pool;

	/* HW-writeable / syncable length per one page */
	pp.max_len = LIBETH_RX_PAGE_LEN(pp.offset);

	/* HW-writeable length per buffer */
	fq->buf_len = libeth_rx_hw_len(&pp, fq->buf_len);
	/* Buffer size to allocate */
	fq->truesize = roundup_pow_of_two(SKB_HEAD_ALIGN(pp.offset +
							 fq->buf_len));

	pool = page_pool_create(&pp);
	if (IS_ERR(pool))
		return PTR_ERR(pool);

	fqes = kvcalloc_node(fq->count, sizeof(*fqes), GFP_KERNEL, fq->nid);
	if (!fqes)
		goto err_buf;

	fq->fqes = fqes;
	fq->pp = pool;

	return 0;

err_buf:
	page_pool_destroy(pool);

	return -ENOMEM;
}
EXPORT_SYMBOL_NS_GPL(libeth_rx_fq_create, LIBETH);

/**
 * libeth_rx_fq_destroy - destroy a &page_pool created by libeth
 * @fq: buffer queue to process
 */
void libeth_rx_fq_destroy(struct libeth_fq *fq)
{
	kvfree(fq->fqes);
	page_pool_destroy(fq->pp);
}
EXPORT_SYMBOL_NS_GPL(libeth_rx_fq_destroy, LIBETH);

/**
 * libeth_rx_recycle_slow - recycle a libeth page from the NAPI context
 * @page: page to recycle
 *
 * To be used on exceptions or rare cases not requiring fast inline recycling.
 */
void libeth_rx_recycle_slow(struct page *page)
{
	page_pool_recycle_direct(page->pp, page);
}
EXPORT_SYMBOL_NS_GPL(libeth_rx_recycle_slow, LIBETH);

/* Converting abstract packet type numbers into a software structure with
 * the packet parameters to do O(1) lookup on Rx.
 */
+117 −0
Original line number Diff line number Diff line
@@ -4,8 +4,125 @@
#ifndef __LIBETH_RX_H
#define __LIBETH_RX_H

#include <linux/if_vlan.h>

#include <net/page_pool/helpers.h>
#include <net/xdp.h>

/* Rx buffer management */

/* Space reserved in front of each frame */
#define LIBETH_SKB_HEADROOM	(NET_SKB_PAD + NET_IP_ALIGN)
/* Maximum headroom for worst-case calculations */
#define LIBETH_MAX_HEADROOM	LIBETH_SKB_HEADROOM
/* Link layer / L2 overhead: Ethernet, 2 VLAN tags (C + S), FCS */
#define LIBETH_RX_LL_LEN	(ETH_HLEN + 2 * VLAN_HLEN + ETH_FCS_LEN)

/* Always use order-0 pages */
#define LIBETH_RX_PAGE_ORDER	0
/* Pick a sane buffer stride and align to a cacheline boundary */
#define LIBETH_RX_BUF_STRIDE	SKB_DATA_ALIGN(128)
/* HW-writeable space in one buffer: truesize - headroom/tailroom, aligned */
#define LIBETH_RX_PAGE_LEN(hr)						  \
	ALIGN_DOWN(SKB_MAX_ORDER(hr, LIBETH_RX_PAGE_ORDER),		  \
		   LIBETH_RX_BUF_STRIDE)

/**
 * struct libeth_fqe - structure representing an Rx buffer (fill queue element)
 * @page: page holding the buffer
 * @offset: offset from the page start (to the headroom)
 * @truesize: total space occupied by the buffer (w/ headroom and tailroom)
 *
 * Depending on the MTU, API switches between one-page-per-frame and shared
 * page model (to conserve memory on bigger-page platforms). In case of the
 * former, @offset is always 0 and @truesize is always ```PAGE_SIZE```.
 */
struct libeth_fqe {
	struct page		*page;
	u32			offset;
	u32			truesize;
} __aligned_largest;

/**
 * struct libeth_fq - structure representing a buffer (fill) queue
 * @fp: hotpath part of the structure
 * @pp: &page_pool for buffer management
 * @fqes: array of Rx buffers
 * @truesize: size to allocate per buffer, w/overhead
 * @count: number of descriptors/buffers the queue has
 * @buf_len: HW-writeable length per each buffer
 * @nid: ID of the closest NUMA node with memory
 */
struct libeth_fq {
	struct_group_tagged(libeth_fq_fp, fp,
		struct page_pool	*pp;
		struct libeth_fqe	*fqes;

		u32			truesize;
		u32			count;
	);

	/* Cold fields */
	u32			buf_len;
	int			nid;
};

int libeth_rx_fq_create(struct libeth_fq *fq, struct napi_struct *napi);
void libeth_rx_fq_destroy(struct libeth_fq *fq);

/**
 * libeth_rx_alloc - allocate a new Rx buffer
 * @fq: fill queue to allocate for
 * @i: index of the buffer within the queue
 *
 * Return: DMA address to be passed to HW for Rx on successful allocation,
 * ```DMA_MAPPING_ERROR``` otherwise.
 */
static inline dma_addr_t libeth_rx_alloc(const struct libeth_fq_fp *fq, u32 i)
{
	struct libeth_fqe *buf = &fq->fqes[i];

	buf->truesize = fq->truesize;
	buf->page = page_pool_dev_alloc(fq->pp, &buf->offset, &buf->truesize);
	if (unlikely(!buf->page))
		return DMA_MAPPING_ERROR;

	return page_pool_get_dma_addr(buf->page) + buf->offset +
	       fq->pp->p.offset;
}

void libeth_rx_recycle_slow(struct page *page);

/**
 * libeth_rx_sync_for_cpu - synchronize or recycle buffer post DMA
 * @fqe: buffer to process
 * @len: frame length from the descriptor
 *
 * Process the buffer after it's written by HW. The regular path is to
 * synchronize DMA for CPU, but in case of no data it will be immediately
 * recycled back to its PP.
 *
 * Return: true when there's data to process, false otherwise.
 */
static inline bool libeth_rx_sync_for_cpu(const struct libeth_fqe *fqe,
					  u32 len)
{
	struct page *page = fqe->page;

	/* Very rare, but possible case. The most common reason:
	 * the last fragment contained FCS only, which was then
	 * stripped by the HW.
	 */
	if (unlikely(!len)) {
		libeth_rx_recycle_slow(page);
		return false;
	}

	page_pool_dma_sync_for_cpu(page->pp, page, fqe->offset, len);

	return true;
}

/* Converting abstract packet type numbers into a software structure with
 * the packet parameters to do O(1) lookup on Rx.
 */