selftests/bpf: Introduce arena spin lock (88d706ba) · Commits · git / linux-net

tools/testing/selftests/bpf/bpf_arena_spin_lock.h

0 → 100644

+512 −0

Original line number	Diff line number	Diff line
		// SPDX-License-Identifier: GPL-2.0
		/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
		#ifndef BPF_ARENA_SPIN_LOCK_H
		#define BPF_ARENA_SPIN_LOCK_H

		#include <vmlinux.h>
		#include <bpf/bpf_helpers.h>
		#include "bpf_atomic.h"

		/*
		 * MCS hand-off helpers used by the slowpath.
		 *
		 * arch_mcs_spin_lock_contended_label() spins with acquire semantics until
		 * the word at @l reads non-zero (the condition is simply VAL); @label is
		 * forwarded to the underlying wait loop as its early-exit target.
		 * arch_mcs_spin_unlock_contended() store-releases 1 into @l.
		 */
		#define arch_mcs_spin_lock_contended_label(l, label) smp_cond_load_acquire_label(l, VAL, label)
		#define arch_mcs_spin_unlock_contended(l) smp_store_release((l), 1)

		#if defined(ENABLE_ATOMICS_TESTS) && defined(__BPF_FEATURE_ADDR_SPACE_CAST)

		/* Errno values returned (negated) by the lock functions in this header. */
		#define EBUSY 16
		#define EOPNOTSUPP 95
		#define ETIMEDOUT 110

		/* Address space 1 qualifier; only defined here when nothing else provided it. */
		#ifndef __arena
		#define __arena __attribute__((address_space(1)))
		#endif

		/* Resolved through __kconfig; arena_spin_lock() compares it with the supported CPU limit. */
		extern unsigned long CONFIG_NR_CPUS __kconfig;

		/* The lock type is an alias for struct qspinlock (presumably supplied by vmlinux.h). */
		#define arena_spinlock_t struct qspinlock
		/* FIXME: Using typedef causes CO-RE relocation error */
		/* typedef struct qspinlock arena_spinlock_t; */

		/* One MCS wait-queue node; instances live in qnodes[][] inside struct arena_qnode. */
		struct arena_mcs_spinlock {
		/* Successor in the wait queue; reset to NULL by the slowpath and written by the next waiter. */
		struct arena_mcs_spinlock __arena *next;
		/* Reset to 0 by the slowpath; a queued waiter spins until it becomes non-zero. */
		int locked;
		/* Nesting counter; the slowpath only increments/decrements it on a CPU's node 0. */
		int count;
		};

		/* Element type of qnodes[][]; currently just wraps the MCS node. */
		struct arena_qnode {
		struct arena_mcs_spinlock mcs;
		};

		/* Queue nodes per CPU (second dimension of qnodes[]); deeper nesting makes the slowpath return -EBUSY. */
		#define _Q_MAX_NODES 4
		/* Extra spins allowed in the slowpath while waiting for a pending->locked hand-over. */
		#define _Q_PENDING_LOOPS 1

		/*
		 * Bitfields in the atomic value:
		 *
		 *  0- 7: locked byte
		 *     8: pending
		 *  9-15: not used
		 * 16-17: tail index
		 * 18-31: tail cpu (+1)
		 *
		 * The pending field is declared 8 bits wide, so _Q_PENDING_MASK spans
		 * bits 8-15 even though only bit 8 (_Q_PENDING_VAL) is ever set.
		 */

		/* Number of per-CPU rows in qnodes[]; larger systems are rejected. */
		#define _Q_MAX_CPUS 1024

		/* Mask of field @type, built from _Q_<type>_BITS and _Q_<type>_OFFSET. */
		#define _Q_SET_MASK(type) (((1U << _Q_ ## type ## _BITS) - 1)\
						<< _Q_ ## type ## _OFFSET)
		#define _Q_LOCKED_OFFSET 0
		#define _Q_LOCKED_BITS 8
		#define _Q_LOCKED_MASK _Q_SET_MASK(LOCKED)

		#define _Q_PENDING_OFFSET (_Q_LOCKED_OFFSET + _Q_LOCKED_BITS)
		#define _Q_PENDING_BITS 8
		#define _Q_PENDING_MASK _Q_SET_MASK(PENDING)

		#define _Q_TAIL_IDX_OFFSET (_Q_PENDING_OFFSET + _Q_PENDING_BITS)
		#define _Q_TAIL_IDX_BITS 2
		#define _Q_TAIL_IDX_MASK _Q_SET_MASK(TAIL_IDX)

		#define _Q_TAIL_CPU_OFFSET (_Q_TAIL_IDX_OFFSET + _Q_TAIL_IDX_BITS)
		#define _Q_TAIL_CPU_BITS (32 - _Q_TAIL_CPU_OFFSET)
		#define _Q_TAIL_CPU_MASK _Q_SET_MASK(TAIL_CPU)

		/* The tail is the index and cpu fields taken together (bits 16-31). */
		#define _Q_TAIL_OFFSET _Q_TAIL_IDX_OFFSET
		#define _Q_TAIL_MASK (_Q_TAIL_IDX_MASK | _Q_TAIL_CPU_MASK)

		#define _Q_LOCKED_VAL (1U << _Q_LOCKED_OFFSET)
		#define _Q_PENDING_VAL (1U << _Q_PENDING_OFFSET)

		/* Branch-prediction hints; both normalise @x to 0/1 before handing it to __builtin_expect(). */
		#define likely(x) __builtin_expect(!!(x), 1)
		#define unlikely(x) __builtin_expect(!!(x), 0)

		/*
		 * MCS queue nodes, placed in the __arena address space: one row per CPU
		 * (indexed by bpf_get_smp_processor_id() in the slowpath), _Q_MAX_NODES
		 * nodes per row.
		 * NOTE(review): this is a definition in a header, so every including
		 * object presumably gets its own copy - confirm that is intended.
		 */
		struct arena_qnode __arena qnodes[_Q_MAX_CPUS][_Q_MAX_NODES];

		static inline u32 encode_tail(int cpu, int idx)
		{
		u32 tail;

		tail = (cpu + 1) << _Q_TAIL_CPU_OFFSET;
		tail \|= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */

		return tail;
		}

		/**
		 * decode_tail - Map a tail code word back to its MCS queue node
		 * @tail: Tail code word as produced by encode_tail()
		 * Return: Pointer to the mcs member of the matching qnodes[][] entry
		 *
		 * No range checking is done: the caller must pass a tail whose cpu field
		 * is non-zero (it is stored as cpu + 1).
		 */
		static inline struct arena_mcs_spinlock __arena *decode_tail(u32 tail)
		{
			u32 node_idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET;
			u32 cpu_id = (tail >> _Q_TAIL_CPU_OFFSET) - 1;

			return &qnodes[cpu_id][node_idx].mcs;
		}

		static inline
		struct arena_mcs_spinlock __arena grab_mcs_node(struct arena_mcs_spinlock __arena base, int idx)
		{
		return &((struct arena_qnode __arena *)base + idx)->mcs;
		}

		/* Locked byte and pending field together, i.e. everything in the lock word except the tail. */
		#define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK)

		/**
		 * xchg_tail - Put in the new queue tail code word & retrieve previous one
		 * @lock : Pointer to queued spinlock structure
		 * @tail : The new queue tail code word
		 * Return: The previous queue tail code word
		 *
		 * xchg(lock, tail)
		 *
		 * p,*,* -> n,*,* ; prev = xchg(lock, node)
		 *
		 * Implemented as a cmpxchg loop that keeps the locked/pending bits and
		 * replaces only the tail. If the loop is left through the cond_break
		 * label, an error is printed and the last observed lock word is returned
		 * without the tail having been installed.
		 */
		static __always_inline u32 xchg_tail(arena_spinlock_t __arena *lock, u32 tail)
		{
			u32 old, new;

			old = atomic_read(&lock->val);
			do {
				new = (old & _Q_LOCKED_PENDING_MASK) | tail;
				/*
				 * We can use relaxed semantics since the caller ensures that
				 * the MCS node is properly initialized before updating the
				 * tail.
				 */
				/* These loops are not expected to stall, but we still need to
				 * prove to the verifier they will terminate eventually.
				 */
				cond_break_label(out);
			} while (!atomic_try_cmpxchg_relaxed(&lock->val, &old, new));

			return old;
		out:
			bpf_printk("RUNTIME ERROR: %s unexpected cond_break exit!!!", __func__);
			return old;
		}

		/**
		 * clear_pending - clear the pending bit.
		 * @lock: Pointer to queued spinlock structure
		 *
		 * *,1,* -> *,0,*
		 *
		 * Plain WRITE_ONCE() of 0 to the lock's pending field; no barrier is
		 * implied.
		 */
		static __always_inline void clear_pending(arena_spinlock_t __arena *lock)
		{
		WRITE_ONCE(lock->pending, 0);
		}

		/**
		 * clear_pending_set_locked - take ownership and clear the pending bit.
		 * @lock: Pointer to queued spinlock structure
		 *
		 * *,1,0 -> *,0,1
		 *
		 * Lock stealing is not allowed if this function is used.
		 *
		 * Done as a single WRITE_ONCE() of _Q_LOCKED_VAL to lock->locked_pending
		 * (presumably the field overlaying the locked and pending bytes - see
		 * struct qspinlock).
		 */
		static __always_inline void clear_pending_set_locked(arena_spinlock_t __arena *lock)
		{
		WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL);
		}

		/**
		 * set_locked - Set the lock bit and own the lock
		 * @lock: Pointer to queued spinlock structure
		 *
		 * *,*,0 -> *,0,1
		 *
		 * Plain WRITE_ONCE() of _Q_LOCKED_VAL to the locked byte; no barrier is
		 * implied.
		 */
		static __always_inline void set_locked(arena_spinlock_t __arena *lock)
		{
		WRITE_ONCE(lock->locked, _Q_LOCKED_VAL);
		}

		/**
		 * arena_fetch_set_pending_acquire - set the pending bit, return old value
		 * @lock: Pointer to queued spinlock structure
		 * Return: The lock word as it was before the pending bit was set
		 *
		 * Implemented as an acquire cmpxchg loop. If the loop is left through the
		 * cond_break label, an error is printed and the last observed lock word
		 * is returned without the pending bit having been set by this call.
		 */
		static __always_inline
		u32 arena_fetch_set_pending_acquire(arena_spinlock_t __arena *lock)
		{
			u32 old, new;

			old = atomic_read(&lock->val);
			do {
				new = old | _Q_PENDING_VAL;
				/*
				 * These loops are not expected to stall, but we still need to
				 * prove to the verifier they will terminate eventually.
				 */
				cond_break_label(out);
			} while (!atomic_try_cmpxchg_acquire(&lock->val, &old, new));

			return old;
		out:
			bpf_printk("RUNTIME ERROR: %s unexpected cond_break exit!!!", __func__);
			return old;
		}

		/**
		 * arena_spin_trylock - make a single attempt at taking the lock
		 * @lock : Pointer to queued spinlock structure
		 * Return: 1 if lock acquired, 0 if failed
		 *
		 * The attempt is only made when the whole lock word reads as zero; it is
		 * then claimed with one acquire cmpxchg from 0 to _Q_LOCKED_VAL.
		 */
		static __always_inline int arena_spin_trylock(arena_spinlock_t __arena *lock)
		{
			int cur = atomic_read(&lock->val);

			if (likely(!cur))
				return likely(atomic_try_cmpxchg_acquire(&lock->val, &cur, _Q_LOCKED_VAL));

			return 0;
		}

		/**
		 * arena_spin_lock_slowpath - acquire the lock under contention
		 * @lock: Pointer to queued spinlock structure
		 * @val: Lock word observed by the failed fast-path cmpxchg
		 * Return: 0 once the lock is held; -ETIMEDOUT if one of the wait loops is
		 *	   left through its error label; -EBUSY if this CPU already has
		 *	   _Q_MAX_NODES queue nodes in use
		 *
		 * Lock word states are written below as (tail, pending, locked), with '*'
		 * meaning "any value".
		 *
		 * The function first tries to become the single pending waiter; if there
		 * is any other contention it queues on a per-CPU MCS node from qnodes[].
		 * Error exits taken after a node was claimed go through release_node_err
		 * so that node0->count is dropped again; earlier exits use release_err.
		 */
		__noinline
		int arena_spin_lock_slowpath(arena_spinlock_t __arena __arg_arena *lock, u32 val)
		{
			struct arena_mcs_spinlock __arena *prev, *next, *node0, *node;
			int ret = -ETIMEDOUT;
			u32 old, tail;
			int idx;

			/*
			 * Wait for in-progress pending->locked hand-overs with a bounded
			 * number of spins so that we guarantee forward progress.
			 *
			 * 0,1,0 -> 0,0,1
			 */
			if (val == _Q_PENDING_VAL) {
				int cnt = _Q_PENDING_LOOPS;
				val = atomic_cond_read_relaxed_label(&lock->val,
								     (VAL != _Q_PENDING_VAL) || !cnt--,
								     release_err);
			}

			/*
			 * If we observe any contention; queue.
			 */
			if (val & ~_Q_LOCKED_MASK)
				goto queue;

			/*
			 * trylock || pending
			 *
			 * 0,0,* -> 0,1,* -> 0,0,1 pending, trylock
			 */
			val = arena_fetch_set_pending_acquire(lock);

			/*
			 * If we observe contention, there is a concurrent locker.
			 *
			 * Undo and queue; our setting of PENDING might have made the
			 * n,0,0 -> 0,0,0 transition fail and it will now be waiting
			 * on @next to become !NULL.
			 */
			if (unlikely(val & ~_Q_LOCKED_MASK)) {

				/* Undo PENDING if we set it. */
				if (!(val & _Q_PENDING_MASK))
					clear_pending(lock);

				goto queue;
			}

			/*
			 * We're pending, wait for the owner to go away.
			 *
			 * 0,1,1 -> *,1,0
			 *
			 * this wait loop must be a load-acquire such that we match the
			 * store-release that clears the locked bit and create lock
			 * sequentiality; this is because not all
			 * clear_pending_set_locked() implementations imply full
			 * barriers.
			 */
			if (val & _Q_LOCKED_MASK)
				smp_cond_load_acquire_label(&lock->locked, !VAL, release_err);

			/*
			 * take ownership and clear the pending bit.
			 *
			 * 0,1,0 -> 0,0,1
			 */
			clear_pending_set_locked(lock);
			return 0;

			/*
			 * End of pending bit optimistic spinning and beginning of MCS
			 * queuing.
			 */
		queue:
			node0 = &(qnodes[bpf_get_smp_processor_id()])[0].mcs;
			idx = node0->count++;
			tail = encode_tail(bpf_get_smp_processor_id(), idx);

			/*
			 * 4 nodes are allocated based on the assumption that there will not be
			 * nested NMIs taking spinlocks. That may not be true in some
			 * architectures even though the chance of needing more than 4 nodes
			 * will still be extremely unlikely. When that happens, we simply return
			 * an error. Original qspinlock has a trylock fallback in this case.
			 */
			if (unlikely(idx >= _Q_MAX_NODES)) {
				ret = -EBUSY;
				goto release_node_err;
			}

			node = grab_mcs_node(node0, idx);

			/*
			 * Ensure that we increment the head node->count before initialising
			 * the actual node. If the compiler is kind enough to reorder these
			 * stores, then an IRQ could overwrite our assignments.
			 */
			barrier();

			node->locked = 0;
			node->next = NULL;

			/*
			 * We touched a (possibly) cold cacheline in the per-cpu queue node;
			 * attempt the trylock once more in the hope someone let go while we
			 * weren't watching.
			 */
			if (arena_spin_trylock(lock))
				goto release;

			/*
			 * Ensure that the initialisation of @node is complete before we
			 * publish the updated tail via xchg_tail() and potentially link
			 * @node into the waitqueue via WRITE_ONCE(prev->next, node) below.
			 */
			smp_wmb();

			/*
			 * Publish the updated tail.
			 * We have already touched the queueing cacheline; don't bother with
			 * pending stuff.
			 *
			 * p,*,* -> n,*,*
			 */
			old = xchg_tail(lock, tail);
			next = NULL;

			/*
			 * if there was a previous node; link it and wait until reaching the
			 * head of the waitqueue.
			 */
			if (old & _Q_TAIL_MASK) {
				prev = decode_tail(old);

				/* Link @node into the waitqueue. */
				WRITE_ONCE(prev->next, node);

				arch_mcs_spin_lock_contended_label(&node->locked, release_node_err);

				/*
				 * While waiting for the MCS lock, the next pointer may have
				 * been set by another lock waiter. We cannot prefetch here
				 * due to lack of equivalent instruction in BPF ISA.
				 */
				next = READ_ONCE(node->next);
			}

			/*
			 * we're at the head of the waitqueue, wait for the owner & pending to
			 * go away.
			 *
			 * *,x,y -> *,0,0
			 *
			 * this wait loop must use a load-acquire such that we match the
			 * store-release that clears the locked bit and create lock
			 * sequentiality; this is because the set_locked() function below
			 * does not imply a full barrier.
			 */
			val = atomic_cond_read_acquire_label(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK),
							     release_node_err);

			/*
			 * claim the lock:
			 *
			 * n,0,0 -> 0,0,1 : lock, uncontended
			 * *,*,0 -> *,*,1 : lock, contended
			 *
			 * If the queue head is the only one in the queue (lock value == tail)
			 * and nobody is pending, clear the tail code and grab the lock.
			 * Otherwise, we only need to grab the lock.
			 */

			/*
			 * In the PV case we might already have _Q_LOCKED_VAL set, because
			 * of lock stealing; therefore we must also allow:
			 *
			 * n,0,1 -> 0,0,1
			 *
			 * Note: at this point: (val & _Q_PENDING_MASK) == 0, because of the
			 *       above wait condition, therefore any concurrent setting of
			 *       PENDING will make the uncontended transition fail.
			 */
			if ((val & _Q_TAIL_MASK) == tail) {
				if (atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL))
					goto release; /* No contention */
			}

			/*
			 * Either somebody is queued behind us or _Q_PENDING_VAL got set
			 * which will then detect the remaining tail and queue behind us
			 * ensuring we'll see a @next.
			 */
			set_locked(lock);

			/*
			 * contended path; wait for next if not observed yet, release.
			 */
			if (!next)
				next = smp_cond_load_relaxed_label(&node->next, (VAL), release_node_err);

			arch_mcs_spin_unlock_contended(&next->locked);

		release:;
			/*
			 * release the node
			 *
			 * Doing a normal dec vs this_cpu_dec is fine. An upper context always
			 * decrements count it incremented before returning, thus we're fine.
			 * For contexts interrupting us, they either observe our dec or not.
			 * Just ensure the compiler doesn't reorder this statement, as a
			 * this_cpu_dec implicitly implied that.
			 */
			barrier();
			node0->count--;
			return 0;
		release_node_err:
			/* Error after a queue node was claimed: give the node back first. */
			barrier();
			node0->count--;
			goto release_err;
		release_err:
			return ret;
		}

		/**
		 * arena_spin_lock - acquire a queued spinlock
		 * @lock: Pointer to queued spinlock structure
		 *
		 * On error, returned value will be negative.
		 * On success, zero is returned.
		 *
		 * -EOPNOTSUPP is returned, before anything else is touched, when the
		 * kernel was built for more CPUs than qnodes[] has rows (_Q_MAX_CPUS).
		 * Otherwise preemption is disabled, the uncontended 0 -> _Q_LOCKED_VAL
		 * cmpxchg is tried, and on failure arena_spin_lock_slowpath() takes over.
		 * Preemption stays disabled on success (arena_spin_unlock() re-enables
		 * it) and is re-enabled here when the slowpath fails.
		 *
		 * The return value _must_ be tested against zero for success,
		 * instead of checking it against negative, for passing the
		 * BPF verifier.
		 *
		 * The user should do:
		 *	if (arena_spin_lock(...) != 0) // failure
		 *		or
		 *	if (arena_spin_lock(...) == 0) // success
		 *		or
		 *	if (arena_spin_lock(...)) // failure
		 *		or
		 *	if (!arena_spin_lock(...)) // success
		 * instead of:
		 *	if (arena_spin_lock(...) < 0) // failure
		 *
		 * The return value can still be inspected later.
		 */
		static __always_inline int arena_spin_lock(arena_spinlock_t __arena *lock)
		{
			int val = 0;

			/* Same limit that sizes qnodes[], so the two cannot drift apart. */
			if (CONFIG_NR_CPUS > _Q_MAX_CPUS)
				return -EOPNOTSUPP;

			bpf_preempt_disable();
			if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL)))
				return 0;

			val = arena_spin_lock_slowpath(lock, val);
			/* FIXME: bpf_assert_range(-MAX_ERRNO, 0) once we have it working for all cases. */
			if (val)
				bpf_preempt_enable();
			return val;
		}

		/**
		 * arena_spin_unlock - release a queued spinlock
		 * @lock : Pointer to queued spinlock structure
		 *
		 * Clears the locked byte with a store-release and then re-enables
		 * preemption, pairing with the bpf_preempt_disable() in arena_spin_lock().
		 */
		static __always_inline void arena_spin_unlock(arena_spinlock_t __arena *lock)
		{
		/*
		 * unlock() needs release semantics:
		 */
		smp_store_release(&lock->locked, 0);
		bpf_preempt_enable();
		}

		/*
		 * arena_spin_lock_irqsave - arena_spin_lock() bracketed by an IRQ save
		 * @lock: Pointer to queued spinlock structure
		 * @flags: lvalue handed (by address) to bpf_local_irq_save()/_restore()
		 *
		 * Evaluates to arena_spin_lock()'s return value. When that is non-zero
		 * (lock not taken) the saved IRQ state is restored before the macro
		 * yields its result.
		 */
		#define arena_spin_lock_irqsave(lock, flags) \
		({ \
		int __ret; \
		bpf_local_irq_save(&(flags)); \
		__ret = arena_spin_lock((lock)); \
		if (__ret) \
		bpf_local_irq_restore(&(flags)); \
		(__ret); \
		})

		/*
		 * arena_spin_unlock_irqrestore - counterpart of arena_spin_lock_irqsave()
		 * @lock: Pointer to queued spinlock structure
		 * @flags: lvalue previously filled in by arena_spin_lock_irqsave()
		 *
		 * Releases the lock first, then restores the saved IRQ state.
		 */
		#define arena_spin_unlock_irqrestore(lock, flags) \
		({ \
		arena_spin_unlock((lock)); \
		bpf_local_irq_restore(&(flags)); \
		})

		#endif

		#endif /* BPF_ARENA_SPIN_LOCK_H */

tools/testing/selftests/bpf/bpf_atomic.h

0 → 100644

+140 −0

Original line number	Diff line number	Diff line
		// SPDX-License-Identifier: GPL-2.0
		/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
		#ifndef BPF_ATOMIC_H
		#define BPF_ATOMIC_H

		#include <vmlinux.h>
		#include <bpf/bpf_helpers.h>
		#include "bpf_experimental.h"

		/*
		 * Weak kconfig symbol tested by the barrier macros below: when it is set
		 * they use a compiler barrier(), otherwise they fall back to smp_mb().
		 * NOTE(review): being __weak, it presumably reads as false when the
		 * option is absent from the kernel config - confirm against libbpf.
		 */
		extern bool CONFIG_X86_64 __kconfig __weak;

		/*
		 * __unqual_typeof(x) - Declare an unqualified scalar type, leaving
		 *			non-scalar types unchanged,
		 *
		 * Prefer C11 _Generic for better compile-times and simpler code. Note: 'char'
		 * is not type-compatible with 'signed char', and we define a separate case.
		 *
		 * This is copied verbatim from kernel's include/linux/compiler_types.h, but
		 * with default expression (for pointers) changed from (x) to (typeof(x)0).
		 *
		 * This is because LLVM has a bug where for lvalue (x), it does not get rid of
		 * an extra address_space qualifier, but does in case of rvalue (typeof(x)0).
		 * Hence, for pointers, we need to create an rvalue expression to get the
		 * desired type. See https://github.com/llvm/llvm-project/issues/53400.
		 */
		/* Expands to the two _Generic associations for 'unsigned type' and 'signed type'. */
		#define __scalar_type_to_expr_cases(type) \
		unsigned type : (unsigned type)0, signed type : (signed type)0

		/* Each association yields a zero rvalue of the plain type; typeof() of it drops the qualifiers. */
		#define __unqual_typeof(x) \
		typeof(_Generic((x), \
		char: (char)0, \
		__scalar_type_to_expr_cases(char), \
		__scalar_type_to_expr_cases(short), \
		__scalar_type_to_expr_cases(int), \
		__scalar_type_to_expr_cases(long), \
		__scalar_type_to_expr_cases(long long), \
		default: (typeof(x))0))

		/* No-op for BPF: expands to an empty statement expression; the wait loops below still call it. */
		#define cpu_relax() ({})

		/*
		 * READ_ONCE()/WRITE_ONCE() - access @x through a volatile-qualified
		 * lvalue of its own type, so the compiler emits exactly the access that
		 * was written. @x must be an lvalue because its address is taken. Only
		 * the compiler is constrained; no CPU memory ordering is implied.
		 *
		 * __typeof__ is used instead of typeof so that the macros also work when
		 * the GNU keyword is disabled (strict ISO C modes before C23).
		 */
		#define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x))

		#define WRITE_ONCE(x, val) ((*(volatile __typeof__(x) *)&(x)) = (val))

		/* Compare-and-swap on *@p; evaluates to the value found in *@p before the operation. */
		#define cmpxchg(p, old, new) __sync_val_compare_and_swap((p), old, new)

		/*
		 * try_cmpxchg - compare-and-swap that reports success and refreshes *@pold
		 * @p: Pointer to the word to update
		 * @pold: Pointer to the expected value; overwritten with the value
		 *	  actually found when the exchange fails
		 * @new: Value to store when *@p equals *@pold
		 *
		 * Evaluates to non-zero when the exchange happened, 0 otherwise.
		 */
		#define try_cmpxchg(p, pold, new) \
			({ \
				__unqual_typeof(*(pold)) __o = *(pold); \
				__unqual_typeof(*(p)) __r = cmpxchg(p, __o, new); \
				if (__r != __o) \
					*(pold) = __r; \
				__r == __o; \
			})

		/* The relaxed and acquire variants are plain aliases of try_cmpxchg(). */
		#define try_cmpxchg_relaxed(p, pold, new) try_cmpxchg(p, pold, new)

		#define try_cmpxchg_acquire(p, pold, new) try_cmpxchg(p, pold, new)

		/*
		 * Full barrier, expressed as an atomic fetch-and-add of 0 on a local
		 * dummy variable whose result is discarded.
		 * NOTE(review): __val is never initialised before the atomic operation
		 * reads it - confirm this is acceptable to the compiler and verifier.
		 */
		#define smp_mb() \
		({ \
		unsigned long __val; \
		__sync_fetch_and_add(&__val, 0); \
		})

		/* Read barrier: compiler barrier() when CONFIG_X86_64 is set, smp_mb() otherwise. */
		#define smp_rmb() \
		({ \
		if (!CONFIG_X86_64) \
		smp_mb(); \
		else \
		barrier(); \
		})

		/* Write barrier: compiler barrier() when CONFIG_X86_64 is set, smp_mb() otherwise. */
		#define smp_wmb() \
		({ \
		if (!CONFIG_X86_64) \
		smp_mb(); \
		else \
		barrier(); \
		})

		/* Control dependency provides LOAD->STORE, provide LOAD->LOAD */
		#define smp_acquire__after_ctrl_dep() ({ smp_rmb(); })

		/*
		 * smp_load_acquire - load *@p with acquire ordering
		 * @p: Pointer to the value to load
		 *
		 * Reads *@p once, then issues smp_mb() unless CONFIG_X86_64 is set,
		 * followed in every case by a compiler barrier(). Evaluates to the
		 * loaded value, with its type stripped of qualifiers.
		 */
		#define smp_load_acquire(p) \
			({ \
				__unqual_typeof(*(p)) __v = READ_ONCE(*(p)); \
				if (!CONFIG_X86_64) \
					smp_mb(); \
				barrier(); \
				__v; \
			})

		/*
		 * smp_store_release - store @val to *@p with release ordering
		 *
		 * Issues smp_mb() unless CONFIG_X86_64 is set, then a compiler barrier(),
		 * and only afterwards performs the WRITE_ONCE() store.
		 */
		#define smp_store_release(p, val) \
		({ \
		if (!CONFIG_X86_64) \
		smp_mb(); \
		barrier(); \
		WRITE_ONCE(*(p), val); \
		})

		/*
		 * smp_cond_load_relaxed_label - spin until a condition on *@p holds
		 * @p: Pointer to the value to poll
		 * @cond_expr: Expression tested after every load; it may refer to the
		 *	       freshly loaded value as VAL
		 * @label: Label handed to cond_break_label() on every iteration whose
		 *	   condition was false (presumably jumped to when the runtime
		 *	   decides the loop must stop - see bpf_experimental.h)
		 *
		 * Evaluates to the last value loaded, i.e. the one that satisfied
		 * @cond_expr. No ordering beyond READ_ONCE() is provided. @p is
		 * evaluated once for the pointer itself (and in unevaluated type
		 * contexts).
		 */
		#define smp_cond_load_relaxed_label(p, cond_expr, label) \
			({ \
				typeof(p) __ptr = (p); \
				__unqual_typeof(*(p)) VAL; \
				for (;;) { \
					VAL = (__unqual_typeof(*(p)))READ_ONCE(*__ptr); \
					if (cond_expr) \
						break; \
					cond_break_label(label); \
					cpu_relax(); \
				} \
				(typeof(*(p)))VAL; \
			})

		/*
		 * smp_cond_load_acquire_label - smp_cond_load_relaxed_label() plus acquire
		 *
		 * Runs the relaxed wait loop with the same arguments and then issues
		 * smp_acquire__after_ctrl_dep() before yielding the loaded value.
		 */
		#define smp_cond_load_acquire_label(p, cond_expr, label) \
		({ \
		__unqual_typeof(*p) __val = \
		smp_cond_load_relaxed_label(p, cond_expr, label); \
		smp_acquire__after_ctrl_dep(); \
		(typeof(*(p)))__val; \
		})

		/*
		 * atomic_t-style wrappers: each one applies the generic helper of the
		 * same name to the ->counter member of the object @p points to.
		 */
		#define atomic_read(p) READ_ONCE((p)->counter)

		#define atomic_cond_read_relaxed_label(p, cond_expr, label) \
		smp_cond_load_relaxed_label(&(p)->counter, cond_expr, label)

		#define atomic_cond_read_acquire_label(p, cond_expr, label) \
		smp_cond_load_acquire_label(&(p)->counter, cond_expr, label)

		#define atomic_try_cmpxchg_relaxed(p, pold, new) \
		try_cmpxchg_relaxed(&(p)->counter, pold, new)

		#define atomic_try_cmpxchg_acquire(p, pold, new) \
		try_cmpxchg_acquire(&(p)->counter, pold, new)

		#endif /* BPF_ATOMIC_H */