Commit 8c9b6a88 authored by Linus Torvalds

x86: improve on the non-rep 'clear_user' function

The old version was oddly written to have the repeat count in multiple
registers.  So instead of taking advantage of %rax being zero, it had
some sub-counts in it.  All just for a "single word clearing" loop,
which isn't even efficient to begin with.

So get rid of those games, and just keep all the state in the same
registers we got it in (and that we should return things in).  That not
only makes this act much more like 'rep stos' (which this function is
replacing), but makes it much easier to actually do the obvious loop
unrolling.
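As a rough illustration of that structure, here is a standalone C model of the flow the new assembly implements (illustrative only: the function name is made up and memset() stands in for the explicit stores; the real code returns the uncleared byte count in %rcx when a store faults):

    #include <stddef.h>
    #include <string.h>

    /* C model of rep_stos_alternative's three phases (hypothetical helper). */
    static size_t clear_bytes_model(unsigned char *dst, size_t count)
    {
    	/* Unrolled path: 64 bytes per iteration (eight 8-byte stores). */
    	while (count >= 64) {
    		memset(dst, 0, 64);
    		dst += 64;
    		count -= 64;
    	}
    	/* Word path: one 8-byte store per iteration. */
    	while (count >= 8) {
    		memset(dst, 0, 8);
    		dst += 8;
    		count -= 8;
    	}
    	/* Byte tail: whatever is left, one byte at a time. */
    	while (count) {
    		*dst++ = 0;
    		count--;
    	}
    	return count;	/* always 0 here; the asm returns bytes left uncleared on a fault */
    }

    int main(void)
    {
    	unsigned char buf[157];
    	return (int)clear_bytes_model(buf, sizeof(buf));
    }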

Also rename the function from the now nonsensical 'clear_user_original'
to what it now clearly is: 'rep_stos_alternative'.

End result: if we don't have a fast 'rep stosb', at least we can have a
fast fallback for it.
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent 577e6a7f
arch/x86/include/asm/uaccess_64.h

@@ -83,7 +83,7 @@ __copy_from_user_flushcache(void *dst, const void __user *src, unsigned size)
  */
 __must_check unsigned long
-clear_user_original(void __user *addr, unsigned long len);
+rep_stos_alternative(void __user *addr, unsigned long len);
 
 static __always_inline __must_check unsigned long __clear_user(void __user *addr, unsigned long size)
 {
@@ -97,7 +97,7 @@ static __always_inline __must_check unsigned long __clear_user(void __user *addr
 	asm volatile(
 		"1:\n\t"
 		ALTERNATIVE("rep stosb",
-			    "call clear_user_original", ALT_NOT(X86_FEATURE_FSRS))
+			    "call rep_stos_alternative", ALT_NOT(X86_FEATURE_FSRS))
 		"2:\n"
 		_ASM_EXTABLE_UA(1b, 2b)
 		: "+c" (size), "+D" (addr), ASM_CALL_CONSTRAINT
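The hunk above wires the fallback in via the alternatives mechanism: when the CPU advertises X86_FEATURE_FSRS (fast short 'rep stosb'), the call site is patched to a bare 'rep stosb'; otherwise rep_stos_alternative is called with the destination in %rdi, the count in %rcx and zero in %rax, and the uncleared byte count comes back in %rcx. As a loose user-space analogy only (this is not how the kernel's ALTERNATIVE() patching works internally, and every name below is made up):

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>
    #include <string.h>

    /* Hypothetical analogy: pick the clearing routine once, up front, based on
     * a CPU feature flag.  The kernel instead patches the call site in place. */
    typedef size_t (*clear_fn)(void *dst, size_t len);

    static size_t clear_rep_stosb_like(void *dst, size_t len)
    {
    	memset(dst, 0, len);		/* stands in for 'rep stosb' */
    	return 0;			/* 0 bytes left uncleared */
    }

    static size_t clear_fallback_like(void *dst, size_t len)
    {
    	memset(dst, 0, len);		/* stands in for rep_stos_alternative */
    	return 0;
    }

    static clear_fn clear_impl;

    static void init_clear_impl(bool cpu_has_fsrs)
    {
    	clear_impl = cpu_has_fsrs ? clear_rep_stosb_like : clear_fallback_like;
    }

    int main(void)
    {
    	char buf[32];

    	init_clear_impl(true);		/* pretend the CPU advertises FSRS */
    	printf("uncleared: %zu\n", clear_impl(buf, sizeof(buf)));
    	return 0;
    }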
arch/x86/lib/clear_page_64.S

@@ -57,59 +57,85 @@ EXPORT_SYMBOL_GPL(clear_page_erms)
  * Input:
  * rdi destination
  * rcx count
+ * rax is zero
  *
  * Output:
  * rcx: uncleared bytes or 0 if successful.
  */
-SYM_FUNC_START(clear_user_original)
-	/*
-	 * Copy only the lower 32 bits of size as that is enough to handle the rest bytes,
-	 * i.e., no need for a 'q' suffix and thus a REX prefix.
-	 */
-	mov %ecx,%eax
-	shr $3,%rcx
-	jz .Lrest_bytes
-
-	# do the qwords first
-	.p2align 4
-.Lqwords:
-	movq $0,(%rdi)
-	lea 8(%rdi),%rdi
-	dec %rcx
-	jnz .Lqwords
-
-.Lrest_bytes:
-	and $7, %eax
-	jz .Lexit
-
-	# now do the rest bytes
-.Lbytes:
-	movb $0,(%rdi)
-	inc %rdi
-	dec %eax
-	jnz .Lbytes
-.Lexit:
-	/*
-	 * %rax still needs to be cleared in the exception case because this function is called
-	 * from inline asm and the compiler expects %rax to be zero when exiting the inline asm,
-	 * in case it might reuse it somewhere.
-	 */
-	xor %eax,%eax
-	RET
-
-.Lqwords_exception:
-	# convert remaining qwords back into bytes to return to caller
-	shl $3, %rcx
-	and $7, %eax
-	add %rax,%rcx
-	jmp .Lexit
-
-.Lbytes_exception:
-	mov %eax,%ecx
-	jmp .Lexit
-
-	_ASM_EXTABLE_UA(.Lqwords, .Lqwords_exception)
-	_ASM_EXTABLE_UA(.Lbytes, .Lbytes_exception)
-SYM_FUNC_END(clear_user_original)
-EXPORT_SYMBOL(clear_user_original)
+SYM_FUNC_START(rep_stos_alternative)
+	cmpq $64,%rcx
+	jae .Lunrolled
+
+	cmp $8,%ecx
+	jae .Lword
+
+	testl %ecx,%ecx
+	je .Lexit
+
+.Lclear_user_tail:
+0:	movb %al,(%rdi)
+	inc %rdi
+	dec %rcx
+	jnz .Lclear_user_tail
+.Lexit:
+	RET
+
+	_ASM_EXTABLE_UA( 0b, .Lexit)
+
+.Lword:
+1:	movq %rax,(%rdi)
+	addq $8,%rdi
+	sub $8,%ecx
+	je .Lexit
+	cmp $8,%ecx
+	jae .Lword
+	jmp .Lclear_user_tail
+
+	.p2align 4
+.Lunrolled:
+10:	movq %rax,(%rdi)
+11:	movq %rax,8(%rdi)
+12:	movq %rax,16(%rdi)
+13:	movq %rax,24(%rdi)
+14:	movq %rax,32(%rdi)
+15:	movq %rax,40(%rdi)
+16:	movq %rax,48(%rdi)
+17:	movq %rax,56(%rdi)
+	addq $64,%rdi
+	subq $64,%rcx
+	cmpq $64,%rcx
+	jae .Lunrolled
+	cmpl $8,%ecx
+	jae .Lword
+	testl %ecx,%ecx
+	jne .Lclear_user_tail
+	RET
+
+	/*
+	 * If we take an exception on any of the
+	 * word stores, we know that %rcx isn't zero,
+	 * so we can just go to the tail clearing to
+	 * get the exact count.
+	 *
+	 * The unrolled case might end up clearing
+	 * some bytes twice. Don't care.
+	 *
+	 * We could use the value in %rdi to avoid
+	 * a second fault on the exact count case,
+	 * but do we really care? No.
+	 *
+	 * Finally, we could try to align %rdi at the
+	 * top of the unrolling. But unaligned stores
+	 * just aren't that common or expensive.
+	 */
+	_ASM_EXTABLE_UA( 1b, .Lclear_user_tail)
+	_ASM_EXTABLE_UA(10b, .Lclear_user_tail)
+	_ASM_EXTABLE_UA(11b, .Lclear_user_tail)
+	_ASM_EXTABLE_UA(12b, .Lclear_user_tail)
+	_ASM_EXTABLE_UA(13b, .Lclear_user_tail)
+	_ASM_EXTABLE_UA(14b, .Lclear_user_tail)
+	_ASM_EXTABLE_UA(15b, .Lclear_user_tail)
+	_ASM_EXTABLE_UA(16b, .Lclear_user_tail)
+	_ASM_EXTABLE_UA(17b, .Lclear_user_tail)
+SYM_FUNC_END(rep_stos_alternative)
+EXPORT_SYMBOL(rep_stos_alternative)
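To make the shape of the new loop concrete, a small standalone example (illustrative only, not part of the commit) of how a 157-byte clear decomposes: two passes through the unrolled 64-byte loop, three 8-byte stores in .Lword, and a 5-byte tail:

    #include <stdio.h>

    int main(void)
    {
    	unsigned long len = 157;
    	unsigned long unrolled = len / 64;	/* 2 iterations of the 64-byte loop */
    	unsigned long rest = len % 64;		/* 29 bytes remaining */
    	unsigned long words = rest / 8;		/* 3 eight-byte stores in .Lword */
    	unsigned long tail = rest % 8;		/* 5 single-byte stores in the tail */

    	printf("%lu = %lu*64 + %lu*8 + %lu\n", len, unrolled, words, tail);
    	return 0;
    }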
tools/objtool/check.c

@@ -1284,7 +1284,7 @@ static const char *uaccess_safe_builtin[] = {
 	"copy_mc_fragile_handle_tail",
 	"copy_mc_enhanced_fast_string",
 	"ftrace_likely_update", /* CONFIG_TRACE_BRANCH_PROFILING */
-	"clear_user_original",
+	"rep_stos_alternative",
 	"copy_user_generic_unrolled",
 	"__copy_user_nocache",
 	NULL