Commit db739ff2 authored by Jakub Kicinski
Browse files

Merge branch 'rfs-use-high-order-allocations-for-hash-tables'

Eric Dumazet says:

====================
rfs: use high-order allocations for hash tables

This series adds rps_tag_ptr, which encodes both the pointer to a
power-of-two-sized hash table and that table's size in a single long word.

RFS hash tables (global and per rx-queue) are converted to rps_tag_ptr.

This removes a cache line miss, and allows high-order allocations.

The global hash table can benefit from huge pages.
====================

Link: https://patch.msgid.link/20260302181432.1836150-1-edumazet@google.com


Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parents c26b8c4e a435163d
Loading
Loading
Loading
Loading
+9 −4
Original line number Diff line number Diff line
@@ -403,16 +403,21 @@ Both of these need to be set before RFS is enabled for a receive queue.
Values for both are rounded up to the nearest power of two. The
suggested flow count depends on the expected number of active connections
at any given time, which may be significantly less than the number of open
connections. We have found that a value of 32768 for rps_sock_flow_entries
works fairly well on a moderately loaded server.
connections. We have found that a value of 65536 for rps_sock_flow_entries
works fairly well on a moderately loaded server. Big servers might
need 1048576 or even higher values.

On a NUMA host it is advisable to interleave the rps_sock_flow_entries table across all nodes, for example:

numactl --interleave=all bash -c "echo 1048576 >/proc/sys/net/core/rps_sock_flow_entries"

For a single queue device, the rps_flow_cnt value for the single queue
would normally be configured to the same value as rps_sock_flow_entries.
For a multi-queue device, the rps_flow_cnt for each queue might be
configured as rps_sock_flow_entries / N, where N is the number of
queues. So for instance, if rps_sock_flow_entries is set to 32768 and there
queues. So for instance, if rps_sock_flow_entries is set to 131072 and there
are 16 configured receive queues, rps_flow_cnt for each queue might be
configured as 2048.
configured as 8192.


Accelerated RFS
+4 −1
Original line number Diff line number Diff line
@@ -6,6 +6,9 @@
#include <linux/types.h>
#include <linux/netdevice.h>
#include <net/protocol.h>
#ifdef CONFIG_RPS
#include <net/rps-types.h>
#endif

struct skb_defer_node {
	struct llist_head	defer_list;
@@ -33,7 +36,7 @@ struct net_hotdata {
	struct kmem_cache	*skbuff_fclone_cache;
	struct kmem_cache	*skb_small_head_cache;
#ifdef CONFIG_RPS
	struct rps_sock_flow_table __rcu *rps_sock_flow_table;
	rps_tag_ptr		rps_sock_flow_table;
	u32			rps_cpu_mask;
#endif
	struct skb_defer_node __percpu *skb_defer_nodes;
+2 −1
Original line number Diff line number Diff line
@@ -8,13 +8,14 @@
#include <net/xdp.h>
#include <net/page_pool/types.h>
#include <net/netdev_queues.h>
#include <net/rps-types.h>

/* This structure contains an instance of an RX queue. */
struct netdev_rx_queue {
	struct xdp_rxq_info		xdp_rxq;
#ifdef CONFIG_RPS
	struct rps_map __rcu		*rps_map;
	struct rps_dev_flow_table __rcu	*rps_flow_table;
	rps_tag_ptr			rps_flow_table;
#endif
	struct kobject			kobj;
	const struct attribute_group	**groups;
+24 −0
Original line number Diff line number Diff line
/* SPDX-License-Identifier: GPL-2.0-or-later */
#ifndef _NET_RPS_TYPES_H
#define _NET_RPS_TYPES_H

/* Define a rps_tag_ptr:
 * Low order 5 bits are used to store the ilog2(size) of an RPS table.
 * The remaining bits hold the table address: rps_tag_to_table() clears the
 * low 5 bits and returns the rest as a pointer, so a table address stored
 * in a tag must itself have its low 5 bits clear.
 * Decode with rps_tag_to_log(), rps_tag_to_mask() and rps_tag_to_table().
 */
typedef unsigned long rps_tag_ptr;

/* Return the ilog2(size) field kept in the low-order 5 bits of @tag_ptr. */
static inline u8 rps_tag_to_log(rps_tag_ptr tag_ptr)
{
	unsigned long log_field = tag_ptr & 0x1fUL;

	return log_field;
}

/* Return (table size - 1) for the table described by @tag_ptr, i.e. a value
 * with the low rps_tag_to_log(@tag_ptr) bits set, usable to reduce a hash
 * to a table index.
 */
static inline u32 rps_tag_to_mask(rps_tag_ptr tag_ptr)
{
	unsigned int nr_entries = 1U << rps_tag_to_log(tag_ptr);

	return nr_entries - 1;
}

/* Return the table address carried by @tag_ptr: the tag with its low-order
 * 5 bits (the log field) cleared.
 */
static inline void *rps_tag_to_table(rps_tag_ptr tag_ptr)
{
	unsigned long addr = tag_ptr & ~0x1fUL;

	return (void *)addr;
}
#endif /* _NET_RPS_TYPES_H */
+19 −30
Original line number Diff line number Diff line
@@ -8,6 +8,7 @@
#include <net/hotdata.h>

#ifdef CONFIG_RPS
#include <net/rps-types.h>

extern struct static_key_false rps_needed;
extern struct static_key_false rfs_needed;
@@ -38,17 +39,6 @@ struct rps_dev_flow {
};
#define RPS_NO_FILTER 0xffff

/*
 * The rps_dev_flow_table structure contains a table of flow mappings.
 */
struct rps_dev_flow_table {
	u8			log;
	struct rcu_head		rcu;
	struct rps_dev_flow	flows[];
};
#define RPS_DEV_FLOW_TABLE_SIZE(_num) (sizeof(struct rps_dev_flow_table) + \
    ((_num) * sizeof(struct rps_dev_flow)))

/*
 * The rps_sock_flow_table contains mappings of flows to the last CPU
 * on which they were processed by the application (set in recvmsg).
@@ -60,41 +50,38 @@ struct rps_dev_flow_table {
 * meaning we use 32-6=26 bits for the hash.
 */
struct rps_sock_flow_table {
	struct rcu_head	rcu;
	u32		mask;

	u32		ents[] ____cacheline_aligned_in_smp;
	u32	ent;
};
#define	RPS_SOCK_FLOW_TABLE_SIZE(_num) (offsetof(struct rps_sock_flow_table, ents[_num]))

#define RPS_NO_CPU 0xffff

static inline void rps_record_sock_flow(struct rps_sock_flow_table *table,
					u32 hash)
static inline void rps_record_sock_flow(rps_tag_ptr tag_ptr, u32 hash)
{
	unsigned int index = hash & table->mask;
	unsigned int index = hash & rps_tag_to_mask(tag_ptr);
	u32 val = hash & ~net_hotdata.rps_cpu_mask;
	struct rps_sock_flow_table *table;

	/* We only give a hint, preemption can change CPU under us */
	val |= raw_smp_processor_id();

	table = rps_tag_to_table(tag_ptr);
	/* The following WRITE_ONCE() is paired with the READ_ONCE()
	 * here, and another one in get_rps_cpu().
	 */
	if (READ_ONCE(table->ents[index]) != val)
		WRITE_ONCE(table->ents[index], val);
	if (READ_ONCE(table[index].ent) != val)
		WRITE_ONCE(table[index].ent, val);
}

static inline void _sock_rps_record_flow_hash(__u32 hash)
{
	struct rps_sock_flow_table *sock_flow_table;
	rps_tag_ptr tag_ptr;

	if (!hash)
		return;
	rcu_read_lock();
	sock_flow_table = rcu_dereference(net_hotdata.rps_sock_flow_table);
	if (sock_flow_table)
		rps_record_sock_flow(sock_flow_table, hash);
	tag_ptr = READ_ONCE(net_hotdata.rps_sock_flow_table);
	if (tag_ptr)
		rps_record_sock_flow(tag_ptr, hash);
	rcu_read_unlock();
}

@@ -121,6 +108,7 @@ static inline void _sock_rps_record_flow(const struct sock *sk)
static inline void _sock_rps_delete_flow(const struct sock *sk)
{
	struct rps_sock_flow_table *table;
	rps_tag_ptr tag_ptr;
	u32 hash, index;

	hash = READ_ONCE(sk->sk_rxhash);
@@ -128,11 +116,12 @@ static inline void _sock_rps_delete_flow(const struct sock *sk)
		return;

	rcu_read_lock();
	table = rcu_dereference(net_hotdata.rps_sock_flow_table);
	if (table) {
		index = hash & table->mask;
		if (READ_ONCE(table->ents[index]) != RPS_NO_CPU)
			WRITE_ONCE(table->ents[index], RPS_NO_CPU);
	tag_ptr = READ_ONCE(net_hotdata.rps_sock_flow_table);
	if (tag_ptr) {
		index = hash & rps_tag_to_mask(tag_ptr);
		table = rps_tag_to_table(tag_ptr);
		if (READ_ONCE(table[index].ent) != RPS_NO_CPU)
			WRITE_ONCE(table[index].ent, RPS_NO_CPU);
	}
	rcu_read_unlock();
}
Loading