Commit 378b7708 authored by Menglong Dong, committed by Peter Zijlstra
Browse files

sched: Make migrate_{en,dis}able() inline



Currently, migrate_enable and migrate_disable are global functions, which
makes them hotspots in some cases. Take BPF for example: the function calls
to migrate_enable and migrate_disable in the BPF trampoline can introduce
significant overhead. The following is the 'perf top' output of the FENTRY
benchmark (./tools/testing/selftests/bpf/bench trig-fentry):

  54.63% bpf_prog_2dcccf652aac1793_bench_trigger_fentry [k]
                 bpf_prog_2dcccf652aac1793_bench_trigger_fentry
  10.43% [kernel] [k] migrate_enable
  10.07% bpf_trampoline_6442517037 [k] bpf_trampoline_6442517037
  8.06% [kernel] [k] __bpf_prog_exit_recur
  4.11% libc.so.6 [.] syscall
  2.15% [kernel] [k] entry_SYSCALL_64
  1.48% [kernel] [k] memchr_inv
  1.32% [kernel] [k] fput
  1.16% [kernel] [k] _copy_to_user
  0.73% [kernel] [k] bpf_prog_test_run_raw_tp

So in this commit, we make migrate_enable/migrate_disable inline to obtain
better performance. The struct rq is defined internally in
kernel/sched/sched.h, and the field "nr_pinned" is accessed in
migrate_enable/migrate_disable, which makes it hard to make them inline.

Alexei Starovoitov suggested generating the offset of "nr_pinned" in [1],
so that we can define migrate_enable/migrate_disable in
include/linux/sched.h and access "this_rq()->nr_pinned" via
"(void *)this_rq() + RQ_nr_pinned".

The offset of "nr_pinned" is generated in include/generated/rq-offsets.h
by kernel/sched/rq-offsets.c.

Generally speaking, we move the definition of migrate_enable and
migrate_disable from kernel/sched/core.c to include/linux/sched.h. The
call to __set_cpus_allowed_ptr() is left in ___migrate_enable().

The "struct rq" is not available in include/linux/sched.h, so we can't
access the "runqueues" with this_cpu_ptr(), as the compilation will fail
in this_cpu_ptr() -> raw_cpu_ptr() -> __verify_pcpu_ptr():
  typeof((ptr) + 0)

So we introduce the this_rq_raw() and access the runqueues with
arch_raw_cpu_ptr/PERCPU_PTR directly.

The variable "runqueues" is not visible to kernel modules, and exporting
it is not a good idea. As Peter Zijlstra advised in [2], we also define and
export migrate_enable/migrate_disable in kernel/sched/core.c, and use
them for the modules.

Before this patch, the performance of BPF FENTRY is:

  fentry         :  113.030 ± 0.149M/s
  fentry         :  112.501 ± 0.187M/s
  fentry         :  112.828 ± 0.267M/s
  fentry         :  115.287 ± 0.241M/s

After this patch, the performance of BPF FENTRY increases to:

  fentry         :  143.644 ± 0.670M/s
  fentry         :  149.764 ± 0.362M/s
  fentry         :  149.642 ± 0.156M/s
  fentry         :  145.263 ± 0.221M/s

Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/bpf/CAADnVQ+5sEDKHdsJY5ZsfGDO_1SEhhQWHrt2SMBG5SYyQ+jt7w@mail.gmail.com/ [1]
Link: https://lore.kernel.org/all/20250819123214.GH4067720@noisy.programming.kicks-ass.net/ [2]
parent 88a90315
Loading
Loading
Loading
Loading
+12 −1
Original line number Diff line number Diff line
@@ -34,13 +34,24 @@ arch/$(SRCARCH)/kernel/asm-offsets.s: $(timeconst-file) $(bounds-file)
$(offsets-file): arch/$(SRCARCH)/kernel/asm-offsets.s FORCE
	$(call filechk,offsets,__ASM_OFFSETS_H__)

# Generate rq-offsets.h

rq-offsets-file := include/generated/rq-offsets.h

targets += kernel/sched/rq-offsets.s

kernel/sched/rq-offsets.s: $(offsets-file)

$(rq-offsets-file): kernel/sched/rq-offsets.s FORCE
	$(call filechk,offsets,__RQ_OFFSETS_H__)

# Check for missing system calls

quiet_cmd_syscalls = CALL    $<
      cmd_syscalls = $(CONFIG_SHELL) $< $(CC) $(c_flags) $(missing_syscalls_flags)

PHONY += missing-syscalls
missing-syscalls: scripts/checksyscalls.sh $(offsets-file)
missing-syscalls: scripts/checksyscalls.sh $(rq-offsets-file)
	$(call cmd,syscalls)

# Check the manual modification of atomic headers
+0 −3
Original line number Diff line number Diff line
@@ -424,8 +424,6 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier,
 *       work-conserving schedulers.
 *
 */
extern void migrate_disable(void);
extern void migrate_enable(void);

/**
 * preempt_disable_nested - Disable preemption inside a normally preempt disabled section
@@ -471,7 +469,6 @@ static __always_inline void preempt_enable_nested(void)

DEFINE_LOCK_GUARD_0(preempt, preempt_disable(), preempt_enable())
DEFINE_LOCK_GUARD_0(preempt_notrace, preempt_disable_notrace(), preempt_enable_notrace())
DEFINE_LOCK_GUARD_0(migrate, migrate_disable(), migrate_enable())

#ifdef CONFIG_PREEMPT_DYNAMIC

+113 −0
Original line number Diff line number Diff line
@@ -49,6 +49,9 @@
#include <linux/tracepoint-defs.h>
#include <linux/unwind_deferred_types.h>
#include <asm/kmap_size.h>
#ifndef COMPILE_OFFSETS
#include <generated/rq-offsets.h>
#endif

/* task_struct member predeclarations (sorted alphabetically): */
struct audit_context;
@@ -2317,4 +2320,114 @@ static __always_inline void alloc_tag_restore(struct alloc_tag *tag, struct allo
#define alloc_tag_restore(_tag, _old)		do {} while (0)
#endif

#ifndef MODULE
#ifndef COMPILE_OFFSETS

extern void ___migrate_enable(void);

struct rq;
DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);

/*
 * The "struct rq" is not available here, so we can't access the
 * "runqueues" with this_cpu_ptr(), as the compilation will fail in
 * this_cpu_ptr() -> raw_cpu_ptr() -> __verify_pcpu_ptr():
 *   typeof((ptr) + 0)
 *
 * So use arch_raw_cpu_ptr()/PERCPU_PTR() directly here.
 */
#ifdef CONFIG_SMP
#define this_rq_raw() arch_raw_cpu_ptr(&runqueues)
#else
#define this_rq_raw() PERCPU_PTR(&runqueues)
#endif
/*
 * Lvalue for an unsigned int located RQ_nr_pinned bytes into the rq
 * returned by this_rq_raw(). "struct rq" is only forward-declared in this
 * header, so the member cannot be named directly; the byte offset is used
 * instead of a "->nr_pinned" access.
 */
#define this_rq_pinned() (*(unsigned int *)((void *)this_rq_raw() + RQ_nr_pinned))

/*
 * Inline body of migrate_enable(): undo one level of migrate_disable()
 * nesting for the current task.
 *
 * A nested call only decrements current->migration_disabled. The outermost
 * call runs with preemption disabled: it calls ___migrate_enable() if
 * cpus_ptr does not point at the task's own cpus_mask, then clears
 * migration_disabled and decrements the counter behind this_rq_pinned().
 */
static inline void __migrate_enable(void)
{
	struct task_struct *p = current;

#ifdef CONFIG_DEBUG_PREEMPT
	/*
	 * Check both overflow from migrate_disable() and superfluous
	 * migrate_enable().
	 */
	if (WARN_ON_ONCE((s16)p->migration_disabled <= 0))
		return;
#endif

	/* Still nested: just drop one level and keep migration disabled. */
	if (p->migration_disabled > 1) {
		p->migration_disabled--;
		return;
	}

	/*
	 * Ensure stop_task runs either before or after this, and that
	 * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule().
	 */
	guard(preempt)();
	/* Out-of-line slow path, taken only when cpus_ptr was redirected. */
	if (unlikely(p->cpus_ptr != &p->cpus_mask))
		___migrate_enable();
	/*
	 * Mustn't clear migration_disabled() until cpus_ptr points back at the
	 * regular cpus_mask, otherwise things that race (eg.
	 * select_fallback_rq) get confused.
	 */
	barrier();
	p->migration_disabled = 0;
	this_rq_pinned()--;
}

/*
 * Inline body of migrate_disable(): add one level of migrate-disable
 * nesting for the current task.
 *
 * If migration is already disabled, only current->migration_disabled is
 * incremented. On the first (outermost) call, with preemption disabled,
 * the counter behind this_rq_pinned() is incremented and
 * migration_disabled is set to 1.
 */
static inline void __migrate_disable(void)
{
	struct task_struct *p = current;

	if (p->migration_disabled) {
#ifdef CONFIG_DEBUG_PREEMPT
		/*
		 * Warn about overflow half-way through the range.
		 */
		WARN_ON_ONCE((s16)p->migration_disabled < 0);
#endif
		p->migration_disabled++;
		return;
	}

	/*
	 * Preemption stays disabled until the end of the function, covering
	 * both the rq counter update and the task flag update.
	 */
	guard(preempt)();
	this_rq_pinned()++;
	p->migration_disabled = 1;
}
#else /* !COMPILE_OFFSETS */
/*
 * Empty stubs for COMPILE_OFFSETS builds: <generated/rq-offsets.h> is not
 * included in that case, so RQ_nr_pinned is not available and the real
 * implementations cannot be compiled.
 */
static inline void __migrate_disable(void) { }
static inline void __migrate_enable(void) { }
#endif /* !COMPILE_OFFSETS */

/*
 * So that it is possible to not export the runqueues variable, define and
 * export migrate_enable/migrate_disable in kernel/sched/core.c too, and use
 * them for the modules. The macro "INSTANTIATE_EXPORTED_MIGRATE_DISABLE" will
 * be defined in kernel/sched/core.c.
 */
#ifndef INSTANTIATE_EXPORTED_MIGRATE_DISABLE
/*
 * Inline migrate_disable() for built-in (non-MODULE) code; forwards to
 * __migrate_disable(). Not emitted when
 * INSTANTIATE_EXPORTED_MIGRATE_DISABLE is defined, in which case only an
 * extern declaration is provided.
 */
static inline void migrate_disable(void)
{
	__migrate_disable();
}

/*
 * Inline migrate_enable() for built-in (non-MODULE) code; forwards to
 * __migrate_enable(). Not emitted when
 * INSTANTIATE_EXPORTED_MIGRATE_DISABLE is defined, in which case only an
 * extern declaration is provided.
 */
static inline void migrate_enable(void)
{
	__migrate_enable();
}
#else /* INSTANTIATE_EXPORTED_MIGRATE_DISABLE */
extern void migrate_disable(void);
extern void migrate_enable(void);
#endif /* INSTANTIATE_EXPORTED_MIGRATE_DISABLE */

#else /* MODULE */
extern void migrate_disable(void);
extern void migrate_enable(void);
#endif /* MODULE */

DEFINE_LOCK_GUARD_0(migrate, migrate_disable(), migrate_enable())

#endif
+1 −0
Original line number Diff line number Diff line
@@ -23855,6 +23855,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
BTF_SET_START(btf_id_deny)
BTF_ID_UNUSED
#ifdef CONFIG_SMP
BTF_ID(func, ___migrate_enable)
BTF_ID(func, migrate_disable)
BTF_ID(func, migrate_enable)
#endif
+14 −49
Original line number Diff line number Diff line
@@ -7,6 +7,8 @@
 *  Copyright (C) 1991-2002  Linus Torvalds
 *  Copyright (C) 1998-2024  Ingo Molnar, Red Hat
 */
#define INSTANTIATE_EXPORTED_MIGRATE_DISABLE
#include <linux/sched.h>
#include <linux/highmem.h>
#include <linux/hrtimer_api.h>
#include <linux/ktime_api.h>
@@ -2381,28 +2383,7 @@ static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
	__do_set_cpus_allowed(p, &ac);
}

void migrate_disable(void)
{
	struct task_struct *p = current;

	if (p->migration_disabled) {
#ifdef CONFIG_DEBUG_PREEMPT
		/*
		 *Warn about overflow half-way through the range.
		 */
		WARN_ON_ONCE((s16)p->migration_disabled < 0);
#endif
		p->migration_disabled++;
		return;
	}

	guard(preempt)();
	this_rq()->nr_pinned++;
	p->migration_disabled = 1;
}
EXPORT_SYMBOL_GPL(migrate_disable);

void migrate_enable(void)
void ___migrate_enable(void)
{
	struct task_struct *p = current;
	struct affinity_context ac = {
@@ -2410,35 +2391,19 @@ void migrate_enable(void)
		.flags     = SCA_MIGRATE_ENABLE,
	};

#ifdef CONFIG_DEBUG_PREEMPT
	/*
	 * Check both overflow from migrate_disable() and superfluous
	 * migrate_enable().
	 */
	if (WARN_ON_ONCE((s16)p->migration_disabled <= 0))
		return;
#endif
	__set_cpus_allowed_ptr(p, &ac);
}
EXPORT_SYMBOL_GPL(___migrate_enable);

	if (p->migration_disabled > 1) {
		p->migration_disabled--;
		return;
void migrate_disable(void)
{
	__migrate_disable();
}
EXPORT_SYMBOL_GPL(migrate_disable);

	/*
	 * Ensure stop_task runs either before or after this, and that
	 * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule().
	 */
	guard(preempt)();
	if (p->cpus_ptr != &p->cpus_mask)
		__set_cpus_allowed_ptr(p, &ac);
	/*
	 * Mustn't clear migration_disabled() until cpus_ptr points back at the
	 * regular cpus_mask, otherwise things that race (eg.
	 * select_fallback_rq) get confused.
	 */
	barrier();
	p->migration_disabled = 0;
	this_rq()->nr_pinned--;
void migrate_enable(void)
{
	__migrate_enable();
}
EXPORT_SYMBOL_GPL(migrate_enable);

Loading