Commit 78d90189 authored by Théo Lebrun's avatar Théo Lebrun Committed by Jakub Kicinski
Browse files

net: macb: single dma_alloc_coherent() for DMA descriptors



Move from 2*NUM_QUEUES dma_alloc_coherent() for DMA descriptor rings to
2 calls overall.

Issue is with how all queues share the same register for configuring the
upper 32-bits of Tx/Rx descriptor rings. Taking Tx, notice how TBQPH
does *not* depend on the queue index:

	#define GEM_TBQP(hw_q)		(0x0440 + ((hw_q) << 2))
	#define GEM_TBQPH(hw_q)		(0x04C8)

	queue_writel(queue, TBQP, lower_32_bits(queue->tx_ring_dma));
	#ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT
	if (bp->hw_dma_cap & HW_DMA_CAP_64B)
		queue_writel(queue, TBQPH, upper_32_bits(queue->tx_ring_dma));
	#endif

To maximise our chances of getting valid DMA addresses, we do a single
dma_alloc_coherent() across queues. This improves the odds because
alloc_pages() guarantees natural alignment. Other codepaths (IOMMU or
dev/arch dma_map_ops) don't give high enough guarantees
(even page-aligned isn't enough).

Two consideration:

 - dma_alloc_coherent() gives us page alignment. Here we remove this
   constraint meaning each queue's ring won't be page-aligned anymore.

 - This can save some tiny amounts of memory. Fewer allocations means
   (1) less overhead (constant cost per alloc) and (2) less wasted bytes
   due to alignment constraints.

   Example for (2): 4 queues, default ring size (512), 64-bit DMA
   descriptors, 16K pages:
    - Before: 8 allocs of 8K, each rounded to 16K => 64K wasted.
    - After:  2 allocs of 32K => 0K wasted.

Fixes: 02c958dd ("net/macb: add TX multiqueue support for gem")
Reviewed-by: default avatarSean Anderson <sean.anderson@linux.dev>
Acked-by: default avatarNicolas Ferre <nicolas.ferre@microchip.com>
Tested-by: Nicolas Ferre <nicolas.ferre@microchip.com> # on sam9x75
Signed-off-by: default avatarThéo Lebrun <theo.lebrun@bootlin.com>
Reviewed-by: default avatarSimon Horman <horms@kernel.org>
Link: https://patch.msgid.link/20250923-macb-fixes-v6-4-772d655cdeb6@bootlin.com


Signed-off-by: default avatarJakub Kicinski <kuba@kernel.org>
parent 92d4256f
Loading
Loading
Loading
Loading
+41 −39
Original line number Diff line number Diff line
@@ -2478,34 +2478,32 @@ static unsigned int macb_rx_ring_size_per_queue(struct macb *bp)

static void macb_free_consistent(struct macb *bp)
{
	struct device *dev = &bp->pdev->dev;
	struct macb_queue *queue;
	unsigned int q;
	size_t size;

	if (bp->rx_ring_tieoff) {
		dma_free_coherent(&bp->pdev->dev, macb_dma_desc_get_size(bp),
		dma_free_coherent(dev, macb_dma_desc_get_size(bp),
				  bp->rx_ring_tieoff, bp->rx_ring_tieoff_dma);
		bp->rx_ring_tieoff = NULL;
	}

	bp->macbgem_ops.mog_free_rx_buffers(bp);

	size = bp->num_queues * macb_tx_ring_size_per_queue(bp);
	dma_free_coherent(dev, size, bp->queues[0].tx_ring, bp->queues[0].tx_ring_dma);

	size = bp->num_queues * macb_rx_ring_size_per_queue(bp);
	dma_free_coherent(dev, size, bp->queues[0].rx_ring, bp->queues[0].rx_ring_dma);

	for (q = 0, queue = bp->queues; q < bp->num_queues; ++q, ++queue) {
		kfree(queue->tx_skb);
		queue->tx_skb = NULL;
		if (queue->tx_ring) {
			dma_free_coherent(&bp->pdev->dev,
					  macb_tx_ring_size_per_queue(bp),
					  queue->tx_ring, queue->tx_ring_dma);
		queue->tx_ring = NULL;
		}
		if (queue->rx_ring) {
			dma_free_coherent(&bp->pdev->dev,
					  macb_rx_ring_size_per_queue(bp),
					  queue->rx_ring, queue->rx_ring_dma);
		queue->rx_ring = NULL;
	}
}
}

static int gem_alloc_rx_buffers(struct macb *bp)
{
@@ -2545,41 +2543,45 @@ static int macb_alloc_rx_buffers(struct macb *bp)

static int macb_alloc_consistent(struct macb *bp)
{
	struct device *dev = &bp->pdev->dev;
	dma_addr_t tx_dma, rx_dma;
	struct macb_queue *queue;
	unsigned int q;
	u32 upper;
	int size;
	void *tx, *rx;
	size_t size;

	for (q = 0, queue = bp->queues; q < bp->num_queues; ++q, ++queue) {
		size = macb_tx_ring_size_per_queue(bp);
		queue->tx_ring = dma_alloc_coherent(&bp->pdev->dev, size,
						    &queue->tx_ring_dma,
						    GFP_KERNEL);
		upper = upper_32_bits(queue->tx_ring_dma);
		if (!queue->tx_ring ||
		    upper != upper_32_bits(bp->queues[0].tx_ring_dma))
	/*
	 * Upper 32-bits of Tx/Rx DMA descriptor for each queues much match!
	 * We cannot enforce this guarantee, the best we can do is do a single
	 * allocation and hope it will land into alloc_pages() that guarantees
	 * natural alignment of physical addresses.
	 */

	size = bp->num_queues * macb_tx_ring_size_per_queue(bp);
	tx = dma_alloc_coherent(dev, size, &tx_dma, GFP_KERNEL);
	if (!tx || upper_32_bits(tx_dma) != upper_32_bits(tx_dma + size - 1))
		goto out_err;
		netdev_dbg(bp->dev,
			   "Allocated TX ring for queue %u of %d bytes at %08lx (mapped %p)\n",
			   q, size, (unsigned long)queue->tx_ring_dma,
			   queue->tx_ring);
	netdev_dbg(bp->dev, "Allocated %zu bytes for %u TX rings at %08lx (mapped %p)\n",
		   size, bp->num_queues, (unsigned long)tx_dma, tx);

	size = bp->num_queues * macb_rx_ring_size_per_queue(bp);
	rx = dma_alloc_coherent(dev, size, &rx_dma, GFP_KERNEL);
	if (!rx || upper_32_bits(rx_dma) != upper_32_bits(rx_dma + size - 1))
		goto out_err;
	netdev_dbg(bp->dev, "Allocated %zu bytes for %u RX rings at %08lx (mapped %p)\n",
		   size, bp->num_queues, (unsigned long)rx_dma, rx);

	for (q = 0, queue = bp->queues; q < bp->num_queues; ++q, ++queue) {
		queue->tx_ring = tx + macb_tx_ring_size_per_queue(bp) * q;
		queue->tx_ring_dma = tx_dma + macb_tx_ring_size_per_queue(bp) * q;

		queue->rx_ring = rx + macb_rx_ring_size_per_queue(bp) * q;
		queue->rx_ring_dma = rx_dma + macb_rx_ring_size_per_queue(bp) * q;

		size = bp->tx_ring_size * sizeof(struct macb_tx_skb);
		queue->tx_skb = kmalloc(size, GFP_KERNEL);
		if (!queue->tx_skb)
			goto out_err;

		size = macb_rx_ring_size_per_queue(bp);
		queue->rx_ring = dma_alloc_coherent(&bp->pdev->dev, size,
						    &queue->rx_ring_dma,
						    GFP_KERNEL);
		upper = upper_32_bits(queue->rx_ring_dma);
		if (!queue->rx_ring ||
		    upper != upper_32_bits(bp->queues[0].rx_ring_dma))
			goto out_err;
		netdev_dbg(bp->dev,
			   "Allocated RX ring of %d bytes at %08lx (mapped %p)\n",
			   size, (unsigned long)queue->rx_ring_dma, queue->rx_ring);
	}
	if (bp->macbgem_ops.mog_alloc_rx_buffers(bp))
		goto out_err;