Commit 3a6fa573 authored by Alexei Starovoitov
Browse files

Merge branch 'timed-may_goto'

Kumar Kartikeya Dwivedi says:

====================
Timed may_goto

This series replaces the current implementation of cond_break, which
uses the may_goto instruction, and counts 8 million iterations per stack
frame, with an implementation based on sampling time locally on the CPU.

This is done to permit a longer time for a given loop per program
invocation. The accounting is still done per stack frame, but the count
is instead used to amortize the cost of the logic that samples and checks
the time spent since the start.

This is needed for expressing more complicated algorithms (spin locks,
waiting loops, etc.) in BPF programs without false positive expiration
of the loop. For instance, the plan is to make use of this for
implementing spin locks for BPF arena [0].

For the loop as follows:

for (int i = 0;; i++) {}

Testing on a bare-metal Sapphire Rapids Intel server yields the following
table (taking an average of 25 runs).

+-----------------------------+--------------+--------------+------------------+
| Loop type		      |	Iterations   |	Time (ms)   |	Time/iter (ns) |
+-----------------------------+--------------+--------------+------------------+
| may_goto		      |	8388608	     |	3	    |	0.36	       |
| timed_may_goto (count=65535)|	589674932    |	250	    |	0.42	       |
| bpf_for		      |	8388608	     |	10	    |	1.19	       |
+-----------------------------+--------------+--------------+------------------+

Here, count is used to amortize the time sampling and checking logic.

Obviously, this is the limit for an empty loop. Depending on the
complexity of the loop body, the time spent in the loop can be longer.
Cancellations will address the task of imposing an upper bound on program
runtime.

For now, the implementation only supports x86.

  [0]: https://lore.kernel.org/bpf/20250118162238.2621311-1-memxor@gmail.com

Changelog:
----------
v1 -> v2
v1: https://lore.kernel.org/bpf/20250302201348.940234-1-memxor@gmail.com

 * Address comments from Alexei
   * Use kernel comment style for new code.
   * Remove p->count == 0 check in bpf_check_timed_may_goto.
   * Add comments on AX as argument/retval calling convention.
   * Add comments describing how the counting logic works.
   * Use BPF_EMIT_CALL instead of open-coding instruction encoding.
   * Change if ax != 1 goto pc+X condition to if ax != 0 goto pc+X.
====================

Link: https://patch.msgid.link/20250304003239.2390751-1-memxor@gmail.com


Signed-off-by: Alexei Starovoitov <ast@kernel.org>
parents 2941e215 2fb76182
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -6,5 +6,5 @@
# Select the JIT objects by target: with CONFIG_X86_32=y only bpf_jit_comp32.o
# is added; otherwise bpf_jit_comp.o and bpf_timed_may_goto.o are added.
# Every object is gated on CONFIG_BPF_JIT.
ifeq ($(CONFIG_X86_32),y)
        obj-$(CONFIG_BPF_JIT) += bpf_jit_comp32.o
else
        obj-$(CONFIG_BPF_JIT) += bpf_jit_comp.o
        obj-$(CONFIG_BPF_JIT) += bpf_jit_comp.o bpf_timed_may_goto.o
endif
+5 −0
Original line number Diff line number Diff line
@@ -3791,3 +3791,8 @@ u64 bpf_arch_uaddress_limit(void)
{
	return 0;
}

/*
 * Report whether this JIT supports the timed may_goto implementation.
 * This variant unconditionally returns true.
 */
bool bpf_jit_supports_timed_may_goto(void)
{
	return true;
}
+44 −0
Original line number Diff line number Diff line
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */

#include <linux/export.h>
#include <linux/linkage.h>
#include <asm/nospec-branch.h>

	.code64
	.section .text, "ax"

/*
 * arch_bpf_timed_may_goto - assembly wrapper around bpf_check_timed_may_goto()
 *
 * Uses a custom calling convention instead of the C ABI: the input arrives in
 * r10 (BPF_REG_AX) and the result is handed back in r10 as well.
 *
 * On entry r10 holds an offset that is added to rbp to form the pointer passed
 * as the first argument of bpf_check_timed_may_goto(). The value returned by
 * that call is moved into r10 before returning.
 *
 * rax, rdi, rsi, rdx, rcx and r8 are pushed before the call and popped in
 * reverse order afterwards, so the caller sees them unchanged.
 */
SYM_FUNC_START(arch_bpf_timed_may_goto)
	ANNOTATE_NOENDBR

	/* Save BPF r0-r5, held in rax, rdi, rsi, rdx, rcx and r8. */
	pushq %rax
	pushq %rdi
	pushq %rsi
	pushq %rdx
	pushq %rcx
	pushq %r8

	/*
	 * r10 passes us stack depth, load the pointer to count and timestamp as
	 * first argument to the call below (rdi = rbp + r10).
	 */
	leaq (%rbp, %r10, 1), %rdi

	/* Emit call depth accounting for call below. */
	CALL_DEPTH_ACCOUNT
	call bpf_check_timed_may_goto

	/*
	 * BPF_REG_AX=r10 will be stored into count, so move return value to it.
	 * This must happen before rax is restored below.
	 */
	movq %rax, %r10

	/* Restore r5-r0, in the reverse order of the pushes above. */
	popq %r8
	popq %rcx
	popq %rdx
	popq %rsi
	popq %rdi
	popq %rax

	RET
SYM_FUNC_END(arch_bpf_timed_may_goto)
+1 −0
Original line number Diff line number Diff line
@@ -1987,6 +1987,7 @@ struct bpf_array {
 */
enum {
	/* 8388608 (8M) iterations. */
	BPF_MAX_LOOPS = 8 * 1024 * 1024,
	/*
	 * 65535. NOTE(review): presumably the per-frame count consumed between
	 * time samples by the timed may_goto logic - confirm against its users.
	 */
	BPF_MAX_TIMED_LOOPS = 0xffff,
};

#define BPF_F_ACCESS_MASK	(BPF_F_RDONLY |		\
+8 −0
Original line number Diff line number Diff line
@@ -669,6 +669,11 @@ struct bpf_prog_stats {
	struct u64_stats_sync syncp;
} __aligned(2 * sizeof(u64));

/*
 * State consumed by bpf_check_timed_may_goto(), which takes a pointer to
 * this structure as its only argument.
 */
struct bpf_timed_may_goto {
	/*
	 * Loop counter. NOTE(review): presumably counts down the iterations
	 * left before the time is sampled again - confirm in
	 * bpf_check_timed_may_goto().
	 */
	u64 count;
	/*
	 * NOTE(review): presumably the time reference against which elapsed
	 * loop time is measured - confirm in bpf_check_timed_may_goto().
	 */
	u64 timestamp;
};

struct sk_filter {
	refcount_t	refcnt;
	struct rcu_head	rcu;
@@ -1130,8 +1135,11 @@ bool bpf_jit_supports_ptr_xchg(void);
bool bpf_jit_supports_arena(void);
bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena);
bool bpf_jit_supports_private_stack(void);
bool bpf_jit_supports_timed_may_goto(void);
u64 bpf_arch_uaddress_limit(void);
void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie);
/*
 * Arch-provided entry point for the timed may_goto check.
 * NOTE(review): the x86 implementation takes its input and returns its result
 * in BPF_REG_AX rather than through the C ABI, so this prototype presumably
 * exists to take the symbol's address, not to call it from C - confirm.
 */
u64 arch_bpf_timed_may_goto(void);
/*
 * Timed may_goto check operating on the count/timestamp pair at @p.
 * Returns a u64 that the x86 wrapper hands back in BPF_REG_AX, to be stored
 * into count.
 */
u64 bpf_check_timed_may_goto(struct bpf_timed_may_goto *p);
bool bpf_helper_changes_pkt_data(enum bpf_func_id func_id);

static inline bool bpf_dump_raw_ok(const struct cred *cred)
Loading