Commit d991666b authored by Bailey Forrest, committed by Jakub Kicinski
Browse files

gve: make IRQ handlers and page allocation NUMA aware



All memory in GVE is currently allocated without regard for the NUMA
node of the device. Because access to NUMA-local memory is
significantly cheaper than access to a remote node, this change attempts
to ensure that page frags used in the RX path, including page pool
frags, are allocated on the NUMA node local to the gVNIC device. Note
that this attempt is best-effort. If necessary, the driver will still
allocate non-local memory, as __GFP_THISNODE is not passed. Descriptor
ring allocations are not updated, as dma_alloc_coherent handles that.

This change also modifies the IRQ affinity setting to only select CPUs
from the node local to the device, preserving the behavior that TX and
RX queues of the same index share CPU affinity.

Signed-off-by: Bailey Forrest <bcf@google.com>
Signed-off-by: Joshua Washington <joshwash@google.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Harshitha Ramamurthy <hramamurthy@google.com>
Signed-off-by: Jeroen de Borst <jeroendb@google.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/20250707210107.2742029-1-jeroendb@google.com


Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parent 11b5d56d
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -804,6 +804,7 @@ struct gve_priv {
	struct gve_tx_queue_config tx_cfg;
	struct gve_rx_queue_config rx_cfg;
	u32 num_ntfy_blks; /* split between TX and RX so must be even */
	int numa_node;

	struct gve_registers __iomem *reg_bar0; /* see gve_register.h */
	__be32 __iomem *db_bar2; /* "array" of doorbells */
+1 −0
Original line number Diff line number Diff line
@@ -246,6 +246,7 @@ struct page_pool *gve_rx_create_page_pool(struct gve_priv *priv,
		.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV,
		.order = 0,
		.pool_size = GVE_PAGE_POOL_SIZE_MULTIPLIER * priv->rx_desc_cnt,
		.nid = priv->numa_node,
		.dev = &priv->pdev->dev,
		.netdev = priv->dev,
		.napi = &priv->ntfy_blocks[ntfy_id].napi,
+24 −6
Original line number Diff line number Diff line
@@ -461,10 +461,19 @@ int gve_napi_poll_dqo(struct napi_struct *napi, int budget)
	return work_done;
}

/* Pick the set of CPUs that notification-block IRQs may be spread across.
 *
 * Returns the CPU mask of the device's NUMA node. Falls back to cpu_all_mask
 * when the device has no node (NUMA_NO_NODE) or when the node's mask is
 * empty, so that a cpumask_first() on the result never yields a value
 * >= nr_cpu_ids that would then be used as a CPU index.
 */
static const struct cpumask *gve_get_node_mask(struct gve_priv *priv)
{
	const struct cpumask *node_mask;

	if (priv->numa_node == NUMA_NO_NODE)
		return cpu_all_mask;

	node_mask = cpumask_of_node(priv->numa_node);
	/* A node without CPUs gives an empty mask; do not hand that out. */
	if (cpumask_empty(node_mask))
		return cpu_all_mask;

	return node_mask;
}

static int gve_alloc_notify_blocks(struct gve_priv *priv)
{
	int num_vecs_requested = priv->num_ntfy_blks + 1;
	unsigned int active_cpus;
	const struct cpumask *node_mask;
	unsigned int cur_cpu;
	int vecs_enabled;
	int i, j;
	int err;
@@ -503,8 +512,6 @@ static int gve_alloc_notify_blocks(struct gve_priv *priv)
		if (priv->rx_cfg.num_queues > priv->rx_cfg.max_queues)
			priv->rx_cfg.num_queues = priv->rx_cfg.max_queues;
	}
	/* Half the notification blocks go to TX and half to RX */
	active_cpus = min_t(int, priv->num_ntfy_blks / 2, num_online_cpus());

	/* Setup Management Vector  - the last vector */
	snprintf(priv->mgmt_msix_name, sizeof(priv->mgmt_msix_name), "gve-mgmnt@pci:%s",
@@ -533,6 +540,8 @@ static int gve_alloc_notify_blocks(struct gve_priv *priv)
	}

	/* Setup the other blocks - the first n-1 vectors */
	node_mask = gve_get_node_mask(priv);
	cur_cpu = cpumask_first(node_mask);
	for (i = 0; i < priv->num_ntfy_blks; i++) {
		struct gve_notify_block *block = &priv->ntfy_blocks[i];
		int msix_idx = i;
@@ -549,9 +558,17 @@ static int gve_alloc_notify_blocks(struct gve_priv *priv)
			goto abort_with_some_ntfy_blocks;
		}
		block->irq = priv->msix_vectors[msix_idx].vector;
		irq_set_affinity_hint(priv->msix_vectors[msix_idx].vector,
				      get_cpu_mask(i % active_cpus));
		irq_set_affinity_and_hint(block->irq,
					  cpumask_of(cur_cpu));
		block->irq_db_index = &priv->irq_db_indices[i].index;

		cur_cpu = cpumask_next(cur_cpu, node_mask);
		/* Wrap once CPUs in the node have been exhausted, or when
		 * starting RX queue affinities. TX and RX queues of the same
		 * index share affinity.
		 */
		if (cur_cpu >= nr_cpu_ids || (i + 1) == priv->tx_cfg.max_queues)
			cur_cpu = cpumask_first(node_mask);
	}
	return 0;
abort_with_some_ntfy_blocks:
@@ -1040,7 +1057,7 @@ int gve_alloc_page(struct gve_priv *priv, struct device *dev,
		   struct page **page, dma_addr_t *dma,
		   enum dma_data_direction dir, gfp_t gfp_flags)
{
	*page = alloc_page(gfp_flags);
	*page = alloc_pages_node(priv->numa_node, gfp_flags, 0);
	if (!*page) {
		priv->page_alloc_fail++;
		return -ENOMEM;
@@ -2322,6 +2339,7 @@ static int gve_init_priv(struct gve_priv *priv, bool skip_describe_device)
	 */
	priv->num_ntfy_blks = (num_ntfy - 1) & ~0x1;
	priv->mgmt_msix_idx = priv->num_ntfy_blks;
	priv->numa_node = dev_to_node(&priv->pdev->dev);

	priv->tx_cfg.max_queues =
		min_t(int, priv->tx_cfg.max_queues, priv->num_ntfy_blks / 2);
+7 −7
Original line number Diff line number Diff line
@@ -192,8 +192,8 @@ static int gve_rx_prefill_pages(struct gve_rx_ring *rx,
	 */
	slots = rx->mask + 1;

	rx->data.page_info = kvzalloc(slots *
				      sizeof(*rx->data.page_info), GFP_KERNEL);
	rx->data.page_info = kvcalloc_node(slots, sizeof(*rx->data.page_info),
					   GFP_KERNEL, priv->numa_node);
	if (!rx->data.page_info)
		return -ENOMEM;

@@ -216,7 +216,8 @@ static int gve_rx_prefill_pages(struct gve_rx_ring *rx,

	if (!rx->data.raw_addressing) {
		for (j = 0; j < rx->qpl_copy_pool_mask + 1; j++) {
			struct page *page = alloc_page(GFP_KERNEL);
			struct page *page = alloc_pages_node(priv->numa_node,
							     GFP_KERNEL, 0);

			if (!page) {
				err = -ENOMEM;
@@ -303,10 +304,9 @@ int gve_rx_alloc_ring_gqi(struct gve_priv *priv,

	rx->qpl_copy_pool_mask = min_t(u32, U32_MAX, slots * 2) - 1;
	rx->qpl_copy_pool_head = 0;
	rx->qpl_copy_pool = kvcalloc(rx->qpl_copy_pool_mask + 1,
	rx->qpl_copy_pool = kvcalloc_node(rx->qpl_copy_pool_mask + 1,
					  sizeof(rx->qpl_copy_pool[0]),
				     GFP_KERNEL);

					  GFP_KERNEL, priv->numa_node);
	if (!rx->qpl_copy_pool) {
		err = -ENOMEM;
		goto abort_with_slots;
+4 −4
Original line number Diff line number Diff line
@@ -237,9 +237,9 @@ int gve_rx_alloc_ring_dqo(struct gve_priv *priv,

	rx->dqo.num_buf_states = cfg->raw_addressing ? buffer_queue_slots :
		gve_get_rx_pages_per_qpl_dqo(cfg->ring_size);
	rx->dqo.buf_states = kvcalloc(rx->dqo.num_buf_states,
	rx->dqo.buf_states = kvcalloc_node(rx->dqo.num_buf_states,
					   sizeof(rx->dqo.buf_states[0]),
				      GFP_KERNEL);
					   GFP_KERNEL, priv->numa_node);
	if (!rx->dqo.buf_states)
		return -ENOMEM;

@@ -488,7 +488,7 @@ static int gve_rx_copy_ondemand(struct gve_rx_ring *rx,
				struct gve_rx_buf_state_dqo *buf_state,
				u16 buf_len)
{
	struct page *page = alloc_page(GFP_ATOMIC);
	struct page *page = alloc_pages_node(rx->gve->numa_node, GFP_ATOMIC, 0);
	int num_frags;

	if (!page)