Commit bf2468f9 authored by Jakub Kicinski
Browse files

Merge branch 'locking-introduce-nested-bh-locking'

Sebastian Andrzej Siewior says:

====================
locking: Introduce nested-BH locking.

Disabling bottom halves acts as a per-CPU BKL. On PREEMPT_RT, code within
a local_bh_disable() section remains preemptible. As a result, high-prio
tasks (or threaded interrupts) will be blocked by long-running lower-prio
tasks (or threaded interrupts), which includes softirq
sections.

The proposed way out is to introduce explicit per-CPU locks for
resources which are protected by local_bh_disable() and use those only
on PREEMPT_RT so there is no additional overhead for !PREEMPT_RT builds.

The series introduces the infrastructure and converts large parts of
networking, which is the largest stakeholder here. Once this is done, the
per-CPU lock from local_bh_disable() on PREEMPT_RT can be lifted.

Performance testing. Baseline is net-next as of commit 93bda330
("Merge branch 'net-constify-ctl_table-arguments-of-utility-functions'")
plus v6.10-rc1. A 10GbE link is used between two hosts. The command
   xdp-bench redirect-cpu --cpu 3 --remote-action drop eth1 -e

was invoked on the receiving side with an ixgbe. The sending side uses
pktgen_sample03_burst_single_flow.sh on i40e.

Baseline:
| eth1->?                 9,018,604 rx/s                  0 err,drop/s
|   receive total         9,018,604 pkt/s                 0 drop/s                0 error/s
|     cpu:7               9,018,604 pkt/s                 0 drop/s                0 error/s
|   enqueue to cpu 3      9,018,602 pkt/s                 0 drop/s             7.00 bulk-avg
|     cpu:7->3            9,018,602 pkt/s                 0 drop/s             7.00 bulk-avg
|   kthread total         9,018,606 pkt/s                 0 drop/s          214,698 sched
|     cpu:3               9,018,606 pkt/s                 0 drop/s          214,698 sched
|     xdp_stats                   0 pass/s        9,018,606 drop/s                0 redir/s
|       cpu:3                     0 pass/s        9,018,606 drop/s                0 redir/s
|   redirect_err                  0 error/s
|   xdp_exception                 0 hit/s

perf top --sort cpu,symbol --no-children:
|   18.14%  007  [k] bpf_prog_4f0ffbb35139c187_cpumap_l4_hash
|   13.29%  007  [k] ixgbe_poll
|   12.66%  003  [k] cpu_map_kthread_run
|    7.23%  003  [k] page_frag_free
|    6.76%  007  [k] xdp_do_redirect
|    3.76%  007  [k] cpu_map_redirect
|    3.13%  007  [k] bq_flush_to_queue
|    2.51%  003  [k] xdp_return_frame
|    1.93%  007  [k] try_to_wake_up
|    1.78%  007  [k] _raw_spin_lock
|    1.74%  007  [k] cpu_map_enqueue
|    1.56%  003  [k] bpf_prog_57cd311f2e27366b_cpumap_drop

With this series applied:
| eth1->?                10,329,340 rx/s                  0 err,drop/s
|   receive total        10,329,340 pkt/s                 0 drop/s                0 error/s
|     cpu:6              10,329,340 pkt/s                 0 drop/s                0 error/s
|   enqueue to cpu 3     10,329,338 pkt/s                 0 drop/s             8.00 bulk-avg
|     cpu:6->3           10,329,338 pkt/s                 0 drop/s             8.00 bulk-avg
|   kthread total        10,329,321 pkt/s                 0 drop/s           96,297 sched
|     cpu:3              10,329,321 pkt/s                 0 drop/s           96,297 sched
|     xdp_stats                   0 pass/s       10,329,321 drop/s                0 redir/s
|       cpu:3                     0 pass/s       10,329,321 drop/s                0 redir/s
|   redirect_err                  0 error/s
|   xdp_exception                 0 hit/s

perf top --sort cpu,symbol --no-children:
|   20.90%  006  [k] bpf_prog_4f0ffbb35139c187_cpumap_l4_hash
|   12.62%  006  [k] ixgbe_poll
|    9.82%  003  [k] page_frag_free
|    8.73%  003  [k] cpu_map_bpf_prog_run_xdp
|    6.63%  006  [k] xdp_do_redirect
|    4.94%  003  [k] cpu_map_kthread_run
|    4.28%  006  [k] cpu_map_redirect
|    4.03%  006  [k] bq_flush_to_queue
|    3.01%  003  [k] xdp_return_frame
|    1.95%  006  [k] _raw_spin_lock
|    1.94%  003  [k] bpf_prog_57cd311f2e27366b_cpumap_drop

This diff appears to be noise.

v8: https://lore.kernel.org/all/20240619072253.504963-1-bigeasy@linutronix.de
v7: https://lore.kernel.org/all/20240618072526.379909-1-bigeasy@linutronix.de
v6: https://lore.kernel.org/all/20240612170303.3896084-1-bigeasy@linutronix.de
v5: https://lore.kernel.org/all/20240607070427.1379327-1-bigeasy@linutronix.de
v4: https://lore.kernel.org/all/20240604154425.878636-1-bigeasy@linutronix.de
v3: https://lore.kernel.org/all/20240529162927.403425-1-bigeasy@linutronix.de
v2: https://lore.kernel.org/all/20240503182957.1042122-1-bigeasy@linutronix.de
v1: https://lore.kernel.org/all/20231215171020.687342-1-bigeasy@linutronix.de
====================

Link: https://patch.msgid.link/20240620132727.660738-1-bigeasy@linutronix.de


Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parents 568ebdab 3f9fe37d
Loading
Loading
Loading
Loading
+88 −10
Original line number Diff line number Diff line
@@ -733,21 +733,101 @@ struct bpf_nh_params {
	};
};

/* flags for bpf_redirect_info kern_flags */
#define BPF_RI_F_RF_NO_DIRECT	BIT(0)	/* no napi_direct on return_frame */
/* The *_INIT bits track lazy initialisation of the bpf_net_context reached
 * through current->bpf_net_context: each bpf_net_ctx_get_*() accessor sets
 * its bit after preparing the member it returns, so the preparation is not
 * repeated until bpf_net_ctx_set() resets kern_flags to 0.
 */
#define BPF_RI_F_RI_INIT	BIT(1)	/* ri zeroed up to ->nh */
#define BPF_RI_F_CPU_MAP_INIT	BIT(2)	/* cpu_map_flush_list initialised */
#define BPF_RI_F_DEV_MAP_INIT	BIT(3)	/* dev_map_flush_list initialised */
#define BPF_RI_F_XSK_MAP_INIT	BIT(4)	/* xskmap_map_flush_list initialised */

struct bpf_redirect_info {
	u64 tgt_index;
	void *tgt_value;
	struct bpf_map *map;
	u32 flags;
	u32 kern_flags;
	u32 map_id;
	enum bpf_map_type map_type;
	struct bpf_nh_params nh;
	u32 kern_flags;
};

DECLARE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info);
/* Networking BPF state reached through current->bpf_net_context.
 *
 * The bpf_net_ctx_get_*() helpers hand out pointers to the individual
 * members and prepare each one on first use; which members are already
 * prepared is recorded in the BPF_RI_F_*_INIT bits of ri.kern_flags.
 */
struct bpf_net_context {
	/* redirect state; its kern_flags also carries the init bits */
	struct bpf_redirect_info ri;
	/* returned by bpf_net_ctx_get_cpu_map_flush_list() */
	struct list_head cpu_map_flush_list;
	/* returned by bpf_net_ctx_get_dev_flush_list() */
	struct list_head dev_map_flush_list;
	/* returned by bpf_net_ctx_get_xskmap_flush_list() */
	struct list_head xskmap_map_flush_list;
};

/* flags for bpf_redirect_info kern_flags */
#define BPF_RI_F_RF_NO_DIRECT	BIT(0)	/* no napi_direct on return_frame */
/* Install @bpf_net_ctx as the BPF net context of the current task.
 *
 * If the task already has a context, nothing is changed and NULL is
 * returned. Otherwise ri.kern_flags is reset to 0, the pointer is stored
 * in current->bpf_net_context and @bpf_net_ctx is returned.
 *
 * The return value is suitable for bpf_net_ctx_clear(), which only acts
 * on a non-NULL argument.
 */
static inline struct bpf_net_context *bpf_net_ctx_set(struct bpf_net_context *bpf_net_ctx)
{
	struct task_struct *task = current;

	/* An outer caller already owns the slot; leave it alone. */
	if (task->bpf_net_context)
		return NULL;

	bpf_net_ctx->ri.kern_flags = 0;
	task->bpf_net_context = bpf_net_ctx;

	return bpf_net_ctx;
}

/* Detach the BPF net context from the current task.
 *
 * @bpf_net_ctx only acts as a switch: when it is NULL nothing happens,
 * otherwise current->bpf_net_context is set to NULL.
 */
static inline void bpf_net_ctx_clear(struct bpf_net_context *bpf_net_ctx)
{
	if (!bpf_net_ctx)
		return;

	current->bpf_net_context = NULL;
}

/* Return the BPF net context stored in the current task (may be NULL if
 * none is installed; no check is done here).
 */
static inline struct bpf_net_context *bpf_net_ctx_get(void)
{
	struct bpf_net_context *ctx = current->bpf_net_context;

	return ctx;
}

static inline struct bpf_redirect_info *bpf_net_ctx_get_ri(void)
{
	struct bpf_net_context *bpf_net_ctx = bpf_net_ctx_get();

	if (!(bpf_net_ctx->ri.kern_flags & BPF_RI_F_RI_INIT)) {
		memset(&bpf_net_ctx->ri, 0, offsetof(struct bpf_net_context, ri.nh));
		bpf_net_ctx->ri.kern_flags |= BPF_RI_F_RI_INIT;
	}

	return &bpf_net_ctx->ri;
}

/* Return the cpu_map flush list of the current task's BPF net context.
 *
 * The list head is initialised on first use; BPF_RI_F_CPU_MAP_INIT in
 * ri.kern_flags records that this has happened. The context pointer is
 * dereferenced without a NULL check.
 */
static inline struct list_head *bpf_net_ctx_get_cpu_map_flush_list(void)
{
	struct bpf_net_context *ctx = bpf_net_ctx_get();
	struct list_head *flush_list = &ctx->cpu_map_flush_list;

	if (ctx->ri.kern_flags & BPF_RI_F_CPU_MAP_INIT)
		return flush_list;

	INIT_LIST_HEAD(flush_list);
	ctx->ri.kern_flags |= BPF_RI_F_CPU_MAP_INIT;

	return flush_list;
}

/* Return the dev_map flush list of the current task's BPF net context.
 *
 * The list head is initialised on first use; BPF_RI_F_DEV_MAP_INIT in
 * ri.kern_flags records that this has happened. The context pointer is
 * dereferenced without a NULL check.
 */
static inline struct list_head *bpf_net_ctx_get_dev_flush_list(void)
{
	struct bpf_net_context *ctx = bpf_net_ctx_get();
	struct list_head *flush_list = &ctx->dev_map_flush_list;

	if (ctx->ri.kern_flags & BPF_RI_F_DEV_MAP_INIT)
		return flush_list;

	INIT_LIST_HEAD(flush_list);
	ctx->ri.kern_flags |= BPF_RI_F_DEV_MAP_INIT;

	return flush_list;
}

/* Return the xskmap flush list of the current task's BPF net context.
 *
 * The list head is initialised on first use; BPF_RI_F_XSK_MAP_INIT in
 * ri.kern_flags records that this has happened. The context pointer is
 * dereferenced without a NULL check.
 */
static inline struct list_head *bpf_net_ctx_get_xskmap_flush_list(void)
{
	struct bpf_net_context *ctx = bpf_net_ctx_get();
	struct list_head *flush_list = &ctx->xskmap_map_flush_list;

	if (ctx->ri.kern_flags & BPF_RI_F_XSK_MAP_INIT)
		return flush_list;

	INIT_LIST_HEAD(flush_list);
	ctx->ri.kern_flags |= BPF_RI_F_XSK_MAP_INIT;

	return flush_list;
}

/* Compute the linear packet data range [data, data_end) which
 * will be accessed by various program types (cls_bpf, act_bpf,
@@ -1018,25 +1098,23 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
				       const struct bpf_insn *patch, u32 len);
int bpf_remove_insns(struct bpf_prog *prog, u32 off, u32 cnt);

void bpf_clear_redirect_map(struct bpf_map *map);

static inline bool xdp_return_frame_no_direct(void)
{
	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
	struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();

	return ri->kern_flags & BPF_RI_F_RF_NO_DIRECT;
}

static inline void xdp_set_return_frame_no_direct(void)
{
	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
	struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();

	ri->kern_flags |= BPF_RI_F_RF_NO_DIRECT;
}

static inline void xdp_clear_return_frame_no_direct(void)
{
	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
	struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();

	ri->kern_flags &= ~BPF_RI_F_RF_NO_DIRECT;
}
@@ -1592,7 +1670,7 @@ static __always_inline long __bpf_xdp_redirect_map(struct bpf_map *map, u64 inde
						   u64 flags, const u64 flag_mask,
						   void *lookup_elem(struct bpf_map *map, u32 key))
{
	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
	struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
	const u64 action_mask = XDP_ABORTED | XDP_DROP | XDP_PASS | XDP_TX;

	/* Lower bits of the flags are used as return code on lookup failure */
+21 −0
Original line number Diff line number Diff line
@@ -51,4 +51,25 @@
#define local_unlock_irqrestore(lock, flags)			\
	__local_unlock_irqrestore(lock, flags)

/* Guard definitions that pair each local_lock acquire variant with its
 * matching release. _T is the per-CPU lock pointer handed to the guard;
 * the irqsave flavour additionally carries an 'unsigned long flags' member
 * that is passed to both the lock and the unlock expression.
 * NOTE(review): DEFINE_GUARD / DEFINE_LOCK_GUARD_1 are presumably the
 * scope-based helpers from <linux/cleanup.h> - confirm.
 */
DEFINE_GUARD(local_lock, local_lock_t __percpu*,
	     local_lock(_T),
	     local_unlock(_T))
DEFINE_GUARD(local_lock_irq, local_lock_t __percpu*,
	     local_lock_irq(_T),
	     local_unlock_irq(_T))
DEFINE_LOCK_GUARD_1(local_lock_irqsave, local_lock_t __percpu,
		    local_lock_irqsave(_T->lock, _T->flags),
		    local_unlock_irqrestore(_T->lock, _T->flags),
		    unsigned long flags)

/* local_lock_nested_bh() - acquire a per-CPU local lock from BH context.
 * Thin wrapper that forwards @_lock to __local_lock_nested_bh().
 * NOTE(review): per the series description this is meant for per-CPU data
 * that is otherwise protected by local_bh_disable() - confirm against the
 * locking documentation.
 */
#define local_lock_nested_bh(_lock)				\
	__local_lock_nested_bh(_lock)

/* local_unlock_nested_bh() - release a lock taken with
 * local_lock_nested_bh(); forwards @_lock to __local_unlock_nested_bh().
 */
#define local_unlock_nested_bh(_lock)				\
	__local_unlock_nested_bh(_lock)

/* Guard pairing local_lock_nested_bh() with local_unlock_nested_bh(). */
DEFINE_GUARD(local_lock_nested_bh, local_lock_t __percpu*,
	     local_lock_nested_bh(_T),
	     local_unlock_nested_bh(_T))

#endif
+31 −0
Original line number Diff line number Diff line
@@ -62,6 +62,17 @@ do { \
	local_lock_debug_init(lock);				\
} while (0)

/* Initialise the debug/lockdep state of a nested-BH lock.
 *
 * Each expansion gets its own static lock_class_key. The lock's dep_map is
 * registered under the stringified @lock expression with subclass 0, wait
 * types LD_WAIT_CONFIG (inner) / LD_WAIT_INV (outer) and lock type
 * LD_LOCK_NORMAL; debug_check_no_locks_freed() is run over the lock's
 * memory first and local_lock_debug_init() last.
 * @lock is evaluated several times, so it must be free of side effects.
 */
#define __spinlock_nested_bh_init(lock)				\
do {								\
	static struct lock_class_key __key;			\
								\
	debug_check_no_locks_freed((void *)lock, sizeof(*lock));\
	lockdep_init_map_type(&(lock)->dep_map, #lock, &__key,  \
			      0, LD_WAIT_CONFIG, LD_WAIT_INV,	\
			      LD_LOCK_NORMAL);			\
	local_lock_debug_init(lock);				\
} while (0)

#define __local_lock(lock)					\
	do {							\
		preempt_disable();				\
@@ -98,6 +109,15 @@ do { \
		local_irq_restore(flags);			\
	} while (0)

/* Nested-BH lock, variant placed before the #else of the CONFIG_PREEMPT_RT
 * conditional (NOTE(review): presumably the !PREEMPT_RT build - confirm).
 * It only asserts softirq context through lockdep and records the
 * acquisition on this CPU's lock instance via local_lock_acquire(); unlike
 * __local_lock() it does not disable preemption itself.
 */
#define __local_lock_nested_bh(lock)				\
	do {							\
		lockdep_assert_in_softirq();			\
		local_lock_acquire(this_cpu_ptr(lock));	\
	} while (0)

/* Counterpart of __local_lock_nested_bh(): records the release on this
 * CPU's lock instance.
 */
#define __local_unlock_nested_bh(lock)				\
	local_lock_release(this_cpu_ptr(lock))

#else /* !CONFIG_PREEMPT_RT */

/*
@@ -138,4 +158,15 @@ typedef spinlock_t local_lock_t;

#define __local_unlock_irqrestore(lock, flags)	__local_unlock(lock)

/* Nested-BH lock, variant for the section in which local_lock_t is a
 * spinlock_t: after lockdep_assert_in_softirq_func() it takes the spin lock
 * belonging to this CPU's instance of @lock.
 */
#define __local_lock_nested_bh(lock)				\
do {								\
	lockdep_assert_in_softirq_func();			\
	spin_lock(this_cpu_ptr(lock));				\
} while (0)

/* Counterpart of __local_lock_nested_bh(): drops the spin lock of this
 * CPU's instance of @lock.
 */
#define __local_unlock_nested_bh(lock)				\
do {								\
	spin_unlock(this_cpu_ptr((lock)));			\
} while (0)

#endif /* CONFIG_PREEMPT_RT */
+3 −0
Original line number Diff line number Diff line
@@ -600,6 +600,8 @@ do { \
		     (!in_softirq() || in_irq() || in_nmi()));		\
} while (0)

extern void lockdep_assert_in_softirq_func(void);

#else
# define might_lock(lock) do { } while (0)
# define might_lock_read(lock) do { } while (0)
@@ -613,6 +615,7 @@ do { \
# define lockdep_assert_preemption_enabled() do { } while (0)
# define lockdep_assert_preemption_disabled() do { } while (0)
# define lockdep_assert_in_softirq() do { } while (0)
# define lockdep_assert_in_softirq_func() do { } while (0)
#endif

#ifdef CONFIG_PROVE_RAW_LOCK_NESTING
+32 −11
Original line number Diff line number Diff line
@@ -43,6 +43,7 @@

#include <linux/netdev_features.h>
#include <linux/neighbour.h>
#include <linux/netdevice_xmit.h>
#include <uapi/linux/netdevice.h>
#include <uapi/linux/if_bonding.h>
#include <uapi/linux/pkt_cls.h>
@@ -3201,6 +3202,7 @@ static inline bool dev_has_header(const struct net_device *dev)
struct softnet_data {
	struct list_head	poll_list;
	struct sk_buff_head	process_queue;
	local_lock_t		process_queue_bh_lock;

	/* stats */
	unsigned int		processed;
@@ -3223,13 +3225,7 @@ struct softnet_data {
	struct sk_buff_head	xfrm_backlog;
#endif
	/* written and read only by owning cpu: */
	struct {
		u16 recursion;
		u8  more;
#ifdef CONFIG_NET_EGRESS
		u8  skip_txqueue;
#endif
	} xmit;
	struct netdev_xmit xmit;
#ifdef CONFIG_RPS
	/* input_queue_head should be written by cpu owning this struct,
	 * and only read by other cpus. Worth using a cache line.
@@ -3257,10 +3253,18 @@ struct softnet_data {

DECLARE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);

/* Return the current transmit recursion depth.
 *
 * Without CONFIG_PREEMPT_RT the counter is read from this CPU's
 * softnet_data.xmit; with CONFIG_PREEMPT_RT it is read from the current
 * task's net_xmit instead.
 */
static inline int dev_recursion_level(void)
{
#ifdef CONFIG_PREEMPT_RT
	return current->net_xmit.recursion;
#else
	return this_cpu_read(softnet_data.xmit.recursion);
#endif
}

void __netif_schedule(struct Qdisc *q);
void netif_schedule_queue(struct netdev_queue *txq);
@@ -4872,18 +4876,35 @@ static inline ktime_t netdev_get_tstamp(struct net_device *dev,
	return hwtstamps->hwtstamp;
}

static inline netdev_tx_t __netdev_start_xmit(const struct net_device_ops *ops,
					      struct sk_buff *skb, struct net_device *dev,
					      bool more)
#ifndef CONFIG_PREEMPT_RT
static inline void netdev_xmit_set_more(bool more)
{
	__this_cpu_write(softnet_data.xmit.more, more);
	return ops->ndo_start_xmit(skb, dev);
}

static inline bool netdev_xmit_more(void)
{
	return __this_cpu_read(softnet_data.xmit.more);
}
#else
static inline void netdev_xmit_set_more(bool more)
{
	current->net_xmit.more = more;
}

static inline bool netdev_xmit_more(void)
{
	return current->net_xmit.more;
}
#endif

/* Hand @skb to the driver's ndo_start_xmit() callback.
 *
 * @more is published through netdev_xmit_set_more() before the callback
 * runs, so the driver can query it with netdev_xmit_more(). The callback's
 * return value is passed through unchanged.
 */
static inline netdev_tx_t __netdev_start_xmit(const struct net_device_ops *ops,
					      struct sk_buff *skb, struct net_device *dev,
					      bool more)
{
	netdev_tx_t rc;

	netdev_xmit_set_more(more);
	rc = ops->ndo_start_xmit(skb, dev);

	return rc;
}

static inline netdev_tx_t netdev_start_xmit(struct sk_buff *skb, struct net_device *dev,
					    struct netdev_queue *txq, bool more)
Loading