Commit 86b26768 authored by Jakub Kicinski's avatar Jakub Kicinski
Browse files

Merge branch 'net-prevent-rps-table-overwrite-of-active-flows'

Krishna Kumar says:

====================
net: Prevent RPS table overwrite of active flows

This series splits the original RPS patch [1] into two patches for
net-next. It also addresses a kernel test robot warning by defining
rps_flow_is_active() only when aRFS is enabled. I tested v3 with
four builds and reboots: two for [PATCH 1/2] with aRFS enabled &
disabled, and two for [PATCH 2/2]. There are no code changes in v4
and v5, only documentation. Patch v6 has one line change to keep
'hash' field under #ifdef, and was test built with aRFS=on and
aRFS=off. The same two builds were done for v7, along with 15m load
testing with aRFS=on to ensure the new changes are correct.

The first patch prevents RPS table overwrite for active flows thereby
improving aRFS stability.

The second patch caches hash & flow_id in get_rps_cpu() to avoid
recalculating it in set_rps_cpu().

[1] lore.kernel.org/netdev/20250708081516.53048-1-krikku@gmail.com/
[2] lore.kernel.org/netdev/20250729104109.1687418-1-krikku@gmail.com/
====================

Link: https://patch.msgid.link/20250825031005.3674864-1-krikku@gmail.com


Signed-off-by: default avatarJakub Kicinski <kuba@kernel.org>
parents f0c88a0d 48aa3044
Loading
Loading
Loading
Loading
+5 −2
Original line number Diff line number Diff line
@@ -25,13 +25,16 @@ struct rps_map {

/*
 * The rps_dev_flow structure contains the mapping of a flow to a CPU, the
 * tail pointer for that CPU's input queue at the time of last enqueue, and
 * a hardware filter index.
 * tail pointer for that CPU's input queue at the time of last enqueue, a
 * hardware filter index, and the hash of the flow if aRFS is enabled.
 */
struct rps_dev_flow {
	u16		cpu;
	u16		filter;
	unsigned int	last_qtail;
#ifdef CONFIG_RFS_ACCEL
	u32		hash;
#endif
};
#define RPS_NO_FILTER 0xffff

+60 −11
Original line number Diff line number Diff line
@@ -4849,9 +4849,40 @@ static u32 rfs_slot(u32 hash, const struct rps_dev_flow_table *flow_table)
	return hash_32(hash, flow_table->log);
}

#ifdef CONFIG_RFS_ACCEL
/**
 * rps_flow_is_active - check whether the flow is recently active.
 * @rflow: Specific flow to check activity.
 * @flow_table: per-queue flowtable that @rflow belongs to.
 * @cpu: CPU saved in @rflow.
 *
 * If the CPU has processed many packets since the flow's last activity
 * (beyond 10 times the table size), the flow is considered stale.
 *
 * Return: true if flow was recently active.
 */
static bool rps_flow_is_active(struct rps_dev_flow *rflow,
			       struct rps_dev_flow_table *flow_table,
			       unsigned int cpu)
{
	unsigned int flow_last_active;
	unsigned int sd_input_head;

	if (cpu >= nr_cpu_ids)
		return false;

	sd_input_head = READ_ONCE(per_cpu(softnet_data, cpu).input_queue_head);
	flow_last_active = READ_ONCE(rflow->last_qtail);

	return (int)(sd_input_head - flow_last_active) <
		(int)(10 << flow_table->log);
}
#endif

static struct rps_dev_flow *
set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
	    struct rps_dev_flow *rflow, u16 next_cpu)
	    struct rps_dev_flow *rflow, u16 next_cpu, u32 hash,
	    u32 flow_id)
{
	if (next_cpu < nr_cpu_ids) {
		u32 head;
@@ -4859,8 +4890,9 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
		struct netdev_rx_queue *rxqueue;
		struct rps_dev_flow_table *flow_table;
		struct rps_dev_flow *old_rflow;
		struct rps_dev_flow *tmp_rflow;
		unsigned int tmp_cpu;
		u16 rxq_index;
		u32 flow_id;
		int rc;

		/* Should we steer this flow to a different hardware queue? */
@@ -4875,14 +4907,29 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
		flow_table = rcu_dereference(rxqueue->rps_flow_table);
		if (!flow_table)
			goto out;
		flow_id = rfs_slot(skb_get_hash(skb), flow_table);

		tmp_rflow = &flow_table->flows[flow_id];
		tmp_cpu = READ_ONCE(tmp_rflow->cpu);

		if (READ_ONCE(tmp_rflow->filter) != RPS_NO_FILTER) {
			if (rps_flow_is_active(tmp_rflow, flow_table,
					       tmp_cpu)) {
				if (hash != READ_ONCE(tmp_rflow->hash) ||
				    next_cpu == tmp_cpu)
					goto out;
			}
		}

		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
							rxq_index, flow_id);
		if (rc < 0)
			goto out;

		old_rflow = rflow;
		rflow = &flow_table->flows[flow_id];
		rflow = tmp_rflow;
		WRITE_ONCE(rflow->filter, rc);
		WRITE_ONCE(rflow->hash, hash);

		if (old_rflow->filter == rc)
			WRITE_ONCE(old_rflow->filter, RPS_NO_FILTER);
	out:
@@ -4908,6 +4955,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
	struct rps_dev_flow_table *flow_table;
	struct rps_map *map;
	int cpu = -1;
	u32 flow_id;
	u32 tcpu;
	u32 hash;

@@ -4954,7 +5002,8 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
		/* OK, now we know there is a match,
		 * we can look at the local (per receive queue) flow table
		 */
		rflow = &flow_table->flows[rfs_slot(hash, flow_table)];
		flow_id = rfs_slot(hash, flow_table);
		rflow = &flow_table->flows[flow_id];
		tcpu = rflow->cpu;

		/*
@@ -4973,7 +5022,8 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
		     ((int)(READ_ONCE(per_cpu(softnet_data, tcpu).input_queue_head) -
		      rflow->last_qtail)) >= 0)) {
			tcpu = next_cpu;
			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
			rflow = set_rps_cpu(dev, skb, rflow, next_cpu, hash,
					    flow_id);
		}

		if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
@@ -5017,17 +5067,16 @@ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
	struct rps_dev_flow_table *flow_table;
	struct rps_dev_flow *rflow;
	bool expire = true;
	unsigned int cpu;

	rcu_read_lock();
	flow_table = rcu_dereference(rxqueue->rps_flow_table);
	if (flow_table && flow_id < (1UL << flow_table->log)) {
		unsigned int cpu;

		rflow = &flow_table->flows[flow_id];
		cpu = READ_ONCE(rflow->cpu);
		if (READ_ONCE(rflow->filter) == filter_id && cpu < nr_cpu_ids &&
		    ((int)(READ_ONCE(per_cpu(softnet_data, cpu).input_queue_head) -
			   READ_ONCE(rflow->last_qtail)) <
		     (int)(10 << flow_table->log)))
		if (READ_ONCE(rflow->filter) == filter_id &&
		    rps_flow_is_active(rflow, flow_table, cpu))
			expire = false;
	}
	rcu_read_unlock();
+3 −1
Original line number Diff line number Diff line
@@ -1120,8 +1120,10 @@ static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
			return -ENOMEM;

		table->log = ilog2(mask) + 1;
		for (count = 0; count <= mask; count++)
		for (count = 0; count <= mask; count++) {
			table->flows[count].cpu = RPS_NO_CPU;
			table->flows[count].filter = RPS_NO_FILTER;
		}
	} else {
		table = NULL;
	}