Commit ed2f752e authored by Uros Bizjak, committed by Ingo Molnar

x86/percpu: Introduce const-qualified const_pcpu_hot to micro-optimize code generation

Some variables in pcpu_hot, currently current_task and top_of_stack,
are actually per-thread variables implemented as per-CPU variables
and are thus stable for the duration of the respective task.  There
is already an attempt to eliminate redundant reads of these variables
via the this_cpu_read_stable() asm macro, which hides the dependency
on the read memory address.  However, the compiler has only a limited
ability to eliminate common subexpressions involving asm statements,
so this approach meets with limited success.
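
The limitation can be illustrated with a sketch of the stable-read
trick (illustrative names, not the kernel's exact percpu_stable_op
machinery): the asm template performs a %gs-relative load, but the
"p" constraint passes only the variable's address, so the asm
statement carries no memory operand the compiler could reason about:

  /* my_read_stable() is a made-up name for this sketch. */
  #define my_read_stable(var)					\
  ({								\
  	unsigned long val__;					\
  	asm("movq %%gs:%P[v], %[r]"				\
  	    : [r] "=r" (val__)					\
  	    : [v] "p" (&(var)));				\
  	val__;							\
  })

Since the compiler merges identical asm statements only
opportunistically, two such reads of the same variable can still
emit two loads.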

The solution is to allow more aggressive elimination by aliasing
pcpu_hot into a const-qualified const_pcpu_hot, and to read stable
per-CPU variables from this constant copy.

The current per-CPU infrastructure does not support reads from
const-qualified variables.  However, when the compiler supports
segment qualifiers, the const-qualified alias can be declared in the
relevant named address space.  The compiler then treats accesses to a
variable declared this way as reads from a constant location, and
optimizes repeated reads accordingly.
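
For illustration, a minimal user-space-flavored sketch of the
technique, assuming a compiler with x86 named-address-space support
(plain __seg_gs here; the kernel wraps it in __percpu_seg_override,
and the linker script makes the two symbols coincide):

  #include <stddef.h>

  struct task_struct;			/* opaque in this sketch */

  struct hot {
  	struct task_struct *current_task;
  	unsigned long top_of_stack;
  };

  /* Writable view and a const-qualified alias of the same storage. */
  extern struct hot __seg_gs pcpu_hot;
  extern const struct hot __seg_gs const_pcpu_hot;

  struct task_struct *sample(void)	/* made-up function */
  {
  	/*
  	 * Both reads go through the const alias, so the compiler
  	 * may load %gs:const_pcpu_hot.current_task once and reuse
  	 * the register.
  	 */
  	if (!const_pcpu_hot.current_task)
  		return NULL;
  	return const_pcpu_hot.current_task;
  }

Reading through the plain pcpu_hot declaration, both loads would have
to be emitted, since an intervening store could change the value;
through the const alias the compiler may assume the value never
changes.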

With the const-qualified const_pcpu_hot in place, the compiler can
eliminate redundant reads of the stable variables, reducing the
number of loads of current_task from 3766 to 3217 on a test build,
a -14.6% reduction.

The reduction of loads translates to the following code savings:

      text     data     bss       dec      hex  filename
  25477353  4389456  808452  30675261  1d4113d  vmlinux-old.o
  25476074  4389440  808452  30673966  1d40c2e  vmlinux-new.o

representing a code size reduction of -1279 bytes.

[ mingo: Updated the changelog, EXPORT(const_pcpu_hot). ]
Co-developed-by: Nadav Amit <namit@vmware.com>
Signed-off-by: Nadav Amit <namit@vmware.com>
Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20231020162004.135244-1-ubizjak@gmail.com
parent 59bec00a
--- a/arch/x86/include/asm/current.h
+++ b/arch/x86/include/asm/current.h
@@ -36,8 +36,15 @@ static_assert(sizeof(struct pcpu_hot) == 64);
 
 DECLARE_PER_CPU_ALIGNED(struct pcpu_hot, pcpu_hot);
 
+/* const-qualified alias to pcpu_hot, aliased by linker. */
+DECLARE_PER_CPU_ALIGNED(const struct pcpu_hot __percpu_seg_override,
+			const_pcpu_hot);
+
 static __always_inline struct task_struct *get_current(void)
 {
+	if (IS_ENABLED(CONFIG_USE_X86_SEG_SUPPORT))
+		return const_pcpu_hot.current_task;
+
 	return this_cpu_read_stable(pcpu_hot.current_task);
 }
 
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -413,9 +413,9 @@ do { \
  * accessed while this_cpu_read_stable() allows the value to be cached.
  * this_cpu_read_stable() is more efficient and can be used if its value
  * is guaranteed to be valid across cpus. The current users include
- * get_current() and get_thread_info() both of which are actually
- * per-thread variables implemented as per-cpu variables and thus
- * stable for the duration of the respective task.
+ * pcpu_hot.current_task and pcpu_hot.top_of_stack, both of which are
+ * actually per-thread variables implemented as per-CPU variables and
+ * thus stable for the duration of the respective task.
  */
 #define this_cpu_read_stable_1(pcp)	percpu_stable_op(1, "mov", pcp)
 #define this_cpu_read_stable_2(pcp)	percpu_stable_op(2, "mov", pcp)
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -518,6 +518,9 @@ static __always_inline unsigned long current_top_of_stack(void)
 	 * and around vm86 mode and sp0 on x86_64 is special because of the
 	 * entry trampoline.
 	 */
+	if (IS_ENABLED(CONFIG_USE_X86_SEG_SUPPORT))
+		return pcpu_hot.top_of_stack;
+
 	return this_cpu_read_stable(pcpu_hot.top_of_stack);
 }
 
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -2051,6 +2051,7 @@ DEFINE_PER_CPU_ALIGNED(struct pcpu_hot, pcpu_hot) = {
 	.top_of_stack = TOP_OF_INIT_STACK,
 };
 EXPORT_PER_CPU_SYMBOL(pcpu_hot);
+EXPORT_PER_CPU_SYMBOL(const_pcpu_hot);
 
 #ifdef CONFIG_X86_64
 DEFINE_PER_CPU_FIRST(struct fixed_percpu_data,
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -46,6 +46,7 @@ ENTRY(phys_startup_64)
 #endif
 
 jiffies = jiffies_64;
+const_pcpu_hot = pcpu_hot;
 
 #if defined(CONFIG_X86_64)
 /*
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -212,7 +212,7 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val,
  */
 #define ___ADDRESSABLE(sym, __attrs) \
 	static void * __used __attrs \
-	__UNIQUE_ID(__PASTE(__addressable_,sym)) = (void *)&sym;
+	__UNIQUE_ID(__PASTE(__addressable_,sym)) = (void *)(uintptr_t)&sym;
 #define __ADDRESSABLE(sym) \
 	___ADDRESSABLE(sym, __section(".discard.addressable"))
 