Commit 8c9b6a88 authored by Linus Torvalds's avatar Linus Torvalds
Browse files

x86: improve on the non-rep 'clear_user' function



The old version was oddly written to have the repeat count in multiple
registers.  So instead of taking advantage of %rax being zero, it had
some sub-counts in it.  All just for a "single word clearing" loop,
which isn't even efficient to begin with.

So get rid of those games, and just keep all the state in the same
registers we got it in (and that we should return things in).  That not
only makes this act much more like 'rep stos' (which this function is
replacing), but makes it much easier to actually do the obvious loop
unrolling.

Also rename the function from the now nonsensical 'clear_user_original'
to what it now clearly is: 'rep_stos_alternative'.

End result: if we don't have a fast 'rep stosb', at least we can have a
fast fallback for it.

Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
parent 577e6a7f
Loading
Loading
Loading
Loading
+2 −2
Original line number Diff line number Diff line
@@ -83,7 +83,7 @@ __copy_from_user_flushcache(void *dst, const void __user *src, unsigned size)
 */

__must_check unsigned long
clear_user_original(void __user *addr, unsigned long len);
rep_stos_alternative(void __user *addr, unsigned long len);

static __always_inline __must_check unsigned long __clear_user(void __user *addr, unsigned long size)
{
@@ -97,7 +97,7 @@ static __always_inline __must_check unsigned long __clear_user(void __user *addr
	asm volatile(
		"1:\n\t"
		ALTERNATIVE("rep stosb",
			    "call clear_user_original", ALT_NOT(X86_FEATURE_FSRS))
			    "call rep_stos_alternative", ALT_NOT(X86_FEATURE_FSRS))
		"2:\n"
	       _ASM_EXTABLE_UA(1b, 2b)
	       : "+c" (size), "+D" (addr), ASM_CALL_CONSTRAINT
+70 −44
Original line number Diff line number Diff line
@@ -57,59 +57,85 @@ EXPORT_SYMBOL_GPL(clear_page_erms)
 * Input:
 * rdi destination
 * rcx count
 * rax is zero
 *
 * Output:
 * rcx: uncleared bytes or 0 if successful.
 */
SYM_FUNC_START(clear_user_original)
	/*
	 * Copy only the lower 32 bits of size as that is enough to handle the rest bytes,
	 * i.e., no need for a 'q' suffix and thus a REX prefix.
	 */
	mov %ecx,%eax
	shr $3,%rcx
	jz .Lrest_bytes
SYM_FUNC_START(rep_stos_alternative)
	cmpq $64,%rcx
	jae .Lunrolled

	# do the qwords first
	.p2align 4
.Lqwords:
	movq $0,(%rdi)
	lea 8(%rdi),%rdi
	dec %rcx
	jnz .Lqwords
	cmp $8,%ecx
	jae .Lword

.Lrest_bytes:
	and $7,  %eax
	jz .Lexit
	testl %ecx,%ecx
	je .Lexit

	# now do the rest bytes
.Lbytes:
	movb $0,(%rdi)
.Lclear_user_tail:
0:	movb %al,(%rdi)
	inc %rdi
	dec %eax
	jnz .Lbytes

	dec %rcx
	jnz .Lclear_user_tail
.Lexit:
	/*
	 * %rax still needs to be cleared in the exception case because this function is called
	 * from inline asm and the compiler expects %rax to be zero when exiting the inline asm,
	 * in case it might reuse it somewhere.
	 */
        xor %eax,%eax
	RET

.Lqwords_exception:
        # convert remaining qwords back into bytes to return to caller
        shl $3, %rcx
        and $7, %eax
        add %rax,%rcx
        jmp .Lexit
	_ASM_EXTABLE_UA( 0b, .Lexit)

.Lbytes_exception:
        mov %eax,%ecx
        jmp .Lexit
.Lword:
1:	movq %rax,(%rdi)
	addq $8,%rdi
	sub $8,%ecx
	je .Lexit
	cmp $8,%ecx
	jae .Lword
	jmp .Lclear_user_tail

        _ASM_EXTABLE_UA(.Lqwords, .Lqwords_exception)
        _ASM_EXTABLE_UA(.Lbytes, .Lbytes_exception)
SYM_FUNC_END(clear_user_original)
EXPORT_SYMBOL(clear_user_original)
	.p2align 4
.Lunrolled:
10:	movq %rax,(%rdi)
11:	movq %rax,8(%rdi)
12:	movq %rax,16(%rdi)
13:	movq %rax,24(%rdi)
14:	movq %rax,32(%rdi)
15:	movq %rax,40(%rdi)
16:	movq %rax,48(%rdi)
17:	movq %rax,56(%rdi)
	addq $64,%rdi
	subq $64,%rcx
	cmpq $64,%rcx
	jae .Lunrolled
	cmpl $8,%ecx
	jae .Lword
	testl %ecx,%ecx
	jne .Lclear_user_tail
	RET

	/*
	 * If we take an exception on any of the
	 * word stores, we know that %rcx isn't zero,
	 * so we can just go to the tail clearing to
	 * get the exact count.
	 *
	 * The unrolled case might end up clearing
	 * some bytes twice. Don't care.
	 *
	 * We could use the value in %rdi to avoid
	 * a second fault on the exact count case,
	 * but do we really care? No.
	 *
	 * Finally, we could try to align %rdi at the
	 * top of the unrolling. But unaligned stores
	 * just aren't that common or expensive.
	 */
	_ASM_EXTABLE_UA( 1b, .Lclear_user_tail)
	_ASM_EXTABLE_UA(10b, .Lclear_user_tail)
	_ASM_EXTABLE_UA(11b, .Lclear_user_tail)
	_ASM_EXTABLE_UA(12b, .Lclear_user_tail)
	_ASM_EXTABLE_UA(13b, .Lclear_user_tail)
	_ASM_EXTABLE_UA(14b, .Lclear_user_tail)
	_ASM_EXTABLE_UA(15b, .Lclear_user_tail)
	_ASM_EXTABLE_UA(16b, .Lclear_user_tail)
	_ASM_EXTABLE_UA(17b, .Lclear_user_tail)
SYM_FUNC_END(rep_stos_alternative)
EXPORT_SYMBOL(rep_stos_alternative)
+1 −1
Original line number Diff line number Diff line
@@ -1284,7 +1284,7 @@ static const char *uaccess_safe_builtin[] = {
	"copy_mc_fragile_handle_tail",
	"copy_mc_enhanced_fast_string",
	"ftrace_likely_update", /* CONFIG_TRACE_BRANCH_PROFILING */
	"clear_user_original",
	"rep_stos_alternative",
	"copy_user_generic_unrolled",
	"__copy_user_nocache",
	NULL