Commit ed2f752e authored by Uros Bizjak, committed by Ingo Molnar
Browse files

x86/percpu: Introduce const-qualified const_pcpu_hot to micro-optimize code generation



Some variables in pcpu_hot, currently current_task and top_of_stack
are actually per-thread variables implemented as per-CPU variables
and thus stable for the duration of the respective task.  There is
already an attempt to eliminate redundant reads from these variables
using this_cpu_read_stable() asm macro, which hides the dependency
on the read memory address. However, the compiler has limited ability
to eliminate asm common subexpressions, so this approach has had only
limited success.

The solution is to allow more aggressive elimination by aliasing
pcpu_hot into a const-qualified const_pcpu_hot, and to read stable
per-CPU variables from this constant copy.

The current per-CPU infrastructure does not support reads from
const-qualified variables. However, when the compiler supports segment
qualifiers, it is possible to declare the const-aliased variable in
the relevant named address space. The compiler considers access to the
variable, declared in this way, as a read from a constant location,
and will optimize reads from the variable accordingly.

By implementing constant-qualified const_pcpu_hot, the compiler can
eliminate redundant reads from the constant variables, reducing the
number of loads from current_task from 3766 to 3217 on a test build,
a 14.6% reduction.

The reduction of loads translates to the following code savings:

        text           data     bss      dec            hex filename
  25,477,353        4389456  808452 30675261        1d4113d vmlinux-old.o
  25,476,074        4389440  808452 30673966        1d40c2e vmlinux-new.o

representing a code size reduction of 1279 bytes.

[ mingo: Updated the changelog, EXPORT(const_pcpu_hot). ]

Co-developed-by: Nadav Amit <namit@vmware.com>
Signed-off-by: Nadav Amit <namit@vmware.com>
Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20231020162004.135244-1-ubizjak@gmail.com
parent 59bec00a
Loading
Loading
Loading
Loading
+7 −0
Original line number Diff line number Diff line
@@ -36,8 +36,15 @@ static_assert(sizeof(struct pcpu_hot) == 64);

DECLARE_PER_CPU_ALIGNED(struct pcpu_hot, pcpu_hot);

/*
 * const-qualified alias to pcpu_hot, aliased by linker.
 *
 * Unlike the pcpu_hot declaration, this one places the const-qualified
 * type in the per-CPU named address space via __percpu_seg_override, so
 * members can be read with a plain C member access (as get_current()
 * does) and the compiler may treat the location as constant and merge
 * repeated reads.
 *
 * NOTE(review): only fields that stay stable for the duration of the
 * running task (currently current_task and top_of_stack) should be read
 * through this alias; other pcpu_hot fields can change underneath a
 * cached value.
 */
DECLARE_PER_CPU_ALIGNED(const struct pcpu_hot __percpu_seg_override,
			const_pcpu_hot);

/*
 * Return the task_struct pointer recorded in this CPU's pcpu_hot.
 *
 * Without compiler segment-qualifier support the value is fetched with
 * this_cpu_read_stable(); otherwise it is read directly from the
 * const-qualified alias const_pcpu_hot.
 */
static __always_inline struct task_struct *get_current(void)
{
	/* Fallback path: no named address space support in the compiler. */
	if (!IS_ENABLED(CONFIG_USE_X86_SEG_SUPPORT))
		return this_cpu_read_stable(pcpu_hot.current_task);

	return const_pcpu_hot.current_task;
}

+3 −3
Original line number Diff line number Diff line
@@ -413,9 +413,9 @@ do { \
 * accessed while this_cpu_read_stable() allows the value to be cached.
 * this_cpu_read_stable() is more efficient and can be used if its value
 * is guaranteed to be valid across cpus.  The current users include
 * get_current() and get_thread_info() both of which are actually
 * per-thread variables implemented as per-cpu variables and thus
 * stable for the duration of the respective task.
 * pcpu_hot.current_task and pcpu_hot.top_of_stack, both of which are
 * actually per-thread variables implemented as per-CPU variables and
 * thus stable for the duration of the respective task.
 */
#define this_cpu_read_stable_1(pcp)	percpu_stable_op(1, "mov", pcp)
#define this_cpu_read_stable_2(pcp)	percpu_stable_op(2, "mov", pcp)
+3 −0
Original line number Diff line number Diff line
@@ -518,6 +518,9 @@ static __always_inline unsigned long current_top_of_stack(void)
	 *  and around vm86 mode and sp0 on x86_64 is special because of the
	 *  entry trampoline.
	 */
	/*
	 * With compiler segment-qualifier support, read the value through
	 * the const-qualified alias, exactly as get_current() does for
	 * current_task.  pcpu_hot itself is declared without
	 * __percpu_seg_override, so a plain pcpu_hot.top_of_stack member
	 * access would not go through the per-CPU segment.
	 */
	if (IS_ENABLED(CONFIG_USE_X86_SEG_SUPPORT))
		return const_pcpu_hot.top_of_stack;

	/* Fallback: asm-based stable read of the per-CPU variable. */
	return this_cpu_read_stable(pcpu_hot.top_of_stack);
}

+1 −0
Original line number Diff line number Diff line
@@ -2051,6 +2051,7 @@ DEFINE_PER_CPU_ALIGNED(struct pcpu_hot, pcpu_hot) = {
	.top_of_stack	= TOP_OF_INIT_STACK,
};
EXPORT_PER_CPU_SYMBOL(pcpu_hot);
/*
 * Export the const-qualified alias too: the always-inline get_current()
 * reads const_pcpu_hot directly when CONFIG_USE_X86_SEG_SUPPORT is
 * enabled, so presumably any module that inlines it references this symbol.
 */
EXPORT_PER_CPU_SYMBOL(const_pcpu_hot);

#ifdef CONFIG_X86_64
DEFINE_PER_CPU_FIRST(struct fixed_percpu_data,
+1 −0
Original line number Diff line number Diff line
@@ -46,6 +46,7 @@ ENTRY(phys_startup_64)
#endif

jiffies = jiffies_64;
/* Give const_pcpu_hot the same address as pcpu_hot (const-qualified alias). */
const_pcpu_hot = pcpu_hot;

#if defined(CONFIG_X86_64)
/*
Loading