Commit a5624566 authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge branch 'x86-rep-insns': x86 user copy clarifications

Merge my x86 user copy updates branch.

This cleans up a lot of our x86 memory copy code, particularly for user
accesses.  I've been pushing for microarchitectural support for good
memory copying and clearing for a long while, and it's been visible in
how the kernel has aggressively used 'rep movs' and 'rep stos' whenever
possible.

And that micro-architectural support has been improving over the years,
to the point where on modern CPU's the best option for a memory copy
that would become a function call (as opposed to being something that
can just be turned into individual 'mov' instructions) is now to inline
the string instruction sequence instead.

However, that only makes sense when we have the modern markers for this:
the x86 FSRM and FSRS capabilities ("Fast Short REP MOVS/STOS").

So this cleans up a lot of our historical code, gets rid of the legacy
marker use ("REP_GOOD" and "ERMS") from the memcpy/memset cases, and
replaces it with that modern reality.  Note that REP_GOOD and ERMS end
up still being used by the known large cases (ie page copyin gand
clearing).

The reason much of this ends up being about user memory accesses is that
the normal in-kernel cases are done by the compiler (__builtin_memcpy()
and __builtin_memset()) and getting to the point where we can use our
instruction rewriting to inline those to be string instructions will
need some compiler support.

In contrast, the user accessor functions are all entirely controlled by
the kernel code, so we can change those arbitrarily.

Thanks to Borislav Petkov for feedback on the series, and Jens testing
some of this on micro-architectures I didn't personally have access to.

* x86-rep-insns:
  x86: rewrite '__copy_user_nocache' function
  x86: remove 'zerorest' argument from __copy_user_nocache()
  x86: set FSRS automatically on AMD CPUs that have FSRM
  x86: improve on the non-rep 'copy_user' function
  x86: improve on the non-rep 'clear_user' function
  x86: inline the 'rep movs' in user copies for the FSRM case
  x86: move stac/clac from user copy routines into callers
  x86: don't use REP_GOOD or ERMS for user memory clearing
  x86: don't use REP_GOOD or ERMS for user memory copies
  x86: don't use REP_GOOD or ERMS for small memory clearing
  x86: don't use REP_GOOD or ERMS for small memory copies
parents 487c20b0 034ff37d
Loading
Loading
Loading
Loading
+25 −37
Original line number Diff line number Diff line
@@ -18,32 +18,26 @@

/* Handles exceptions in both to and from, but doesn't do access_ok */
__must_check unsigned long
copy_user_enhanced_fast_string(void *to, const void *from, unsigned len);
__must_check unsigned long
copy_user_generic_string(void *to, const void *from, unsigned len);
__must_check unsigned long
copy_user_generic_unrolled(void *to, const void *from, unsigned len);
rep_movs_alternative(void *to, const void *from, unsigned len);

static __always_inline __must_check unsigned long
copy_user_generic(void *to, const void *from, unsigned len)
copy_user_generic(void *to, const void *from, unsigned long len)
{
	unsigned ret;

	stac();
	/*
	 * If CPU has ERMS feature, use copy_user_enhanced_fast_string.
	 * Otherwise, if CPU has rep_good feature, use copy_user_generic_string.
	 * Otherwise, use copy_user_generic_unrolled.
	 * If CPU has FSRM feature, use 'rep movs'.
	 * Otherwise, use rep_movs_alternative.
	 */
	alternative_call_2(copy_user_generic_unrolled,
			 copy_user_generic_string,
			 X86_FEATURE_REP_GOOD,
			 copy_user_enhanced_fast_string,
			 X86_FEATURE_ERMS,
			 ASM_OUTPUT2("=a" (ret), "=D" (to), "=S" (from),
				     "=d" (len)),
			 "1" (to), "2" (from), "3" (len)
			 : "memory", "rcx", "r8", "r9", "r10", "r11");
	return ret;
	asm volatile(
		"1:\n\t"
		ALTERNATIVE("rep movsb",
			    "call rep_movs_alternative", ALT_NOT(X86_FEATURE_FSRM))
		"2:\n"
		_ASM_EXTABLE_UA(1b, 2b)
		:"+c" (len), "+D" (to), "+S" (from), ASM_CALL_CONSTRAINT
		: : "memory", "rax", "r8", "r9", "r10", "r11");
	clac();
	return len;
}

static __always_inline __must_check unsigned long
@@ -58,9 +52,7 @@ raw_copy_to_user(void __user *dst, const void *src, unsigned long size)
	return copy_user_generic((__force void *)dst, src, size);
}

extern long __copy_user_nocache(void *dst, const void __user *src,
				unsigned size, int zerorest);

extern long __copy_user_nocache(void *dst, const void __user *src, unsigned size);
extern long __copy_user_flushcache(void *dst, const void __user *src, unsigned size);
extern void memcpy_page_flushcache(char *to, struct page *page, size_t offset,
			   size_t len);
@@ -69,8 +61,12 @@ static inline int
__copy_from_user_inatomic_nocache(void *dst, const void __user *src,
				  unsigned size)
{
	long ret;
	kasan_check_write(dst, size);
	return __copy_user_nocache(dst, src, size, 0);
	stac();
	ret = __copy_user_nocache(dst, src, size);
	clac();
	return ret;
}

static inline int
@@ -85,11 +81,7 @@ __copy_from_user_flushcache(void *dst, const void __user *src, unsigned size)
 */

__must_check unsigned long
clear_user_original(void __user *addr, unsigned long len);
__must_check unsigned long
clear_user_rep_good(void __user *addr, unsigned long len);
__must_check unsigned long
clear_user_erms(void __user *addr, unsigned long len);
rep_stos_alternative(void __user *addr, unsigned long len);

static __always_inline __must_check unsigned long __clear_user(void __user *addr, unsigned long size)
{
@@ -102,16 +94,12 @@ static __always_inline __must_check unsigned long __clear_user(void __user *addr
	 */
	asm volatile(
		"1:\n\t"
		ALTERNATIVE_3("rep stosb",
			      "call clear_user_erms",	  ALT_NOT(X86_FEATURE_FSRM),
			      "call clear_user_rep_good", ALT_NOT(X86_FEATURE_ERMS),
			      "call clear_user_original", ALT_NOT(X86_FEATURE_REP_GOOD))
		ALTERNATIVE("rep stosb",
			    "call rep_stos_alternative", ALT_NOT(X86_FEATURE_FSRS))
		"2:\n"
	       _ASM_EXTABLE_UA(1b, 2b)
	       : "+c" (size), "+D" (addr), ASM_CALL_CONSTRAINT
	       : "a" (0)
		/* rep_good clobbers %rdx */
	       : "rdx");
	       : "a" (0));

	clac();

+4 −0
Original line number Diff line number Diff line
@@ -929,6 +929,10 @@ static void init_amd(struct cpuinfo_x86 *c)
	if (c->x86 >= 0x10)
		set_cpu_cap(c, X86_FEATURE_REP_GOOD);

	/* AMD FSRM also implies FSRS */
	if (cpu_has(c, X86_FEATURE_FSRM))
		set_cpu_cap(c, X86_FEATURE_FSRS);

	/* get apicid instead of initial apic id from cpuid */
	c->apicid = hard_smp_processor_id();

+1 −1
Original line number Diff line number Diff line
@@ -71,6 +71,6 @@ ifneq ($(CONFIG_GENERIC_CSUM),y)
endif
        lib-y += clear_page_64.o copy_page_64.o
        lib-y += memmove_64.o memset_64.o
        lib-y += copy_user_64.o
        lib-y += copy_user_64.o copy_user_uncached_64.o
	lib-y += cmpxchg16b_emu.o
endif
+67 −116
Original line number Diff line number Diff line
@@ -57,134 +57,85 @@ EXPORT_SYMBOL_GPL(clear_page_erms)
 * Input:
 * rdi destination
 * rcx count
 * rax is zero
 *
 * Output:
 * rcx: uncleared bytes or 0 if successful.
 */
SYM_FUNC_START(clear_user_original)
	/*
	 * Copy only the lower 32 bits of size as that is enough to handle the rest bytes,
	 * i.e., no need for a 'q' suffix and thus a REX prefix.
	 */
	mov %ecx,%eax
	shr $3,%rcx
	jz .Lrest_bytes
SYM_FUNC_START(rep_stos_alternative)
	cmpq $64,%rcx
	jae .Lunrolled

	# do the qwords first
	.p2align 4
.Lqwords:
	movq $0,(%rdi)
	lea 8(%rdi),%rdi
	dec %rcx
	jnz .Lqwords
	cmp $8,%ecx
	jae .Lword

.Lrest_bytes:
	and $7,  %eax
	jz .Lexit
	testl %ecx,%ecx
	je .Lexit

	# now do the rest bytes
.Lbytes:
	movb $0,(%rdi)
.Lclear_user_tail:
0:	movb %al,(%rdi)
	inc %rdi
	dec %eax
	jnz .Lbytes

	dec %rcx
	jnz .Lclear_user_tail
.Lexit:
	/*
	 * %rax still needs to be cleared in the exception case because this function is called
	 * from inline asm and the compiler expects %rax to be zero when exiting the inline asm,
	 * in case it might reuse it somewhere.
	 */
        xor %eax,%eax
	RET

.Lqwords_exception:
        # convert remaining qwords back into bytes to return to caller
        shl $3, %rcx
        and $7, %eax
        add %rax,%rcx
        jmp .Lexit

.Lbytes_exception:
        mov %eax,%ecx
        jmp .Lexit

        _ASM_EXTABLE_UA(.Lqwords, .Lqwords_exception)
        _ASM_EXTABLE_UA(.Lbytes, .Lbytes_exception)
SYM_FUNC_END(clear_user_original)
EXPORT_SYMBOL(clear_user_original)
	_ASM_EXTABLE_UA( 0b, .Lexit)

/*
 * Alternative clear user-space when CPU feature X86_FEATURE_REP_GOOD is
 * present.
 * Input:
 * rdi destination
 * rcx count
 *
 * Output:
 * rcx: uncleared bytes or 0 if successful.
 */
SYM_FUNC_START(clear_user_rep_good)
	# call the original thing for less than a cacheline
	cmp $64, %rcx
	jb clear_user_original
.Lword:
1:	movq %rax,(%rdi)
	addq $8,%rdi
	sub $8,%ecx
	je .Lexit
	cmp $8,%ecx
	jae .Lword
	jmp .Lclear_user_tail

.Lprep:
	# copy lower 32-bits for rest bytes
	mov %ecx, %edx
	shr $3, %rcx
	jz .Lrep_good_rest_bytes

.Lrep_good_qwords:
	rep stosq

.Lrep_good_rest_bytes:
	and $7, %edx
	jz .Lrep_good_exit

.Lrep_good_bytes:
	mov %edx, %ecx
	rep stosb

.Lrep_good_exit:
	# see .Lexit comment above
	xor %eax, %eax
	.p2align 4
.Lunrolled:
10:	movq %rax,(%rdi)
11:	movq %rax,8(%rdi)
12:	movq %rax,16(%rdi)
13:	movq %rax,24(%rdi)
14:	movq %rax,32(%rdi)
15:	movq %rax,40(%rdi)
16:	movq %rax,48(%rdi)
17:	movq %rax,56(%rdi)
	addq $64,%rdi
	subq $64,%rcx
	cmpq $64,%rcx
	jae .Lunrolled
	cmpl $8,%ecx
	jae .Lword
	testl %ecx,%ecx
	jne .Lclear_user_tail
	RET

.Lrep_good_qwords_exception:
	# convert remaining qwords back into bytes to return to caller
	shl $3, %rcx
	and $7, %edx
	add %rdx, %rcx
	jmp .Lrep_good_exit

	_ASM_EXTABLE_UA(.Lrep_good_qwords, .Lrep_good_qwords_exception)
	_ASM_EXTABLE_UA(.Lrep_good_bytes, .Lrep_good_exit)
SYM_FUNC_END(clear_user_rep_good)
EXPORT_SYMBOL(clear_user_rep_good)

	/*
 * Alternative clear user-space when CPU feature X86_FEATURE_ERMS is present.
 * Input:
 * rdi destination
 * rcx count
	 * If we take an exception on any of the
	 * word stores, we know that %rcx isn't zero,
	 * so we can just go to the tail clearing to
	 * get the exact count.
	 *
 * Output:
 * rcx: uncleared bytes or 0 if successful.
	 * The unrolled case might end up clearing
	 * some bytes twice. Don't care.
	 *
	 * We could use the value in %rdi to avoid
	 * a second fault on the exact count case,
	 * but do we really care? No.
	 *
	 * Finally, we could try to align %rdi at the
	 * top of the unrolling. But unaligned stores
	 * just aren't that common or expensive.
	 */
SYM_FUNC_START(clear_user_erms)
	# call the original thing for less than a cacheline
	cmp $64, %rcx
	jb clear_user_original

.Lerms_bytes:
	rep stosb

.Lerms_exit:
	xorl %eax,%eax
	RET

	_ASM_EXTABLE_UA(.Lerms_bytes, .Lerms_exit)
SYM_FUNC_END(clear_user_erms)
EXPORT_SYMBOL(clear_user_erms)
	_ASM_EXTABLE_UA( 1b, .Lclear_user_tail)
	_ASM_EXTABLE_UA(10b, .Lclear_user_tail)
	_ASM_EXTABLE_UA(11b, .Lclear_user_tail)
	_ASM_EXTABLE_UA(12b, .Lclear_user_tail)
	_ASM_EXTABLE_UA(13b, .Lclear_user_tail)
	_ASM_EXTABLE_UA(14b, .Lclear_user_tail)
	_ASM_EXTABLE_UA(15b, .Lclear_user_tail)
	_ASM_EXTABLE_UA(16b, .Lclear_user_tail)
	_ASM_EXTABLE_UA(17b, .Lclear_user_tail)
SYM_FUNC_END(rep_stos_alternative)
EXPORT_SYMBOL(rep_stos_alternative)
+89 −385
Original line number Diff line number Diff line
@@ -7,404 +7,108 @@
 */

#include <linux/linkage.h>
#include <asm/current.h>
#include <asm/asm-offsets.h>
#include <asm/thread_info.h>
#include <asm/cpufeatures.h>
#include <asm/alternative.h>
#include <asm/asm.h>
#include <asm/smap.h>
#include <asm/export.h>
#include <asm/trapnr.h>

.macro ALIGN_DESTINATION
	/* check for bad alignment of destination */
	movl %edi,%ecx
	andl $7,%ecx
	jz 102f				/* already aligned */
	subl $8,%ecx
	negl %ecx
	subl %ecx,%edx
100:	movb (%rsi),%al
101:	movb %al,(%rdi)
	incq %rsi
	incq %rdi
	decl %ecx
	jnz 100b
102:

	_ASM_EXTABLE_CPY(100b, .Lcopy_user_handle_align)
	_ASM_EXTABLE_CPY(101b, .Lcopy_user_handle_align)
.endm

/*
 * copy_user_generic_unrolled - memory copy with exception handling.
 * This version is for CPUs like P4 that don't have efficient micro
 * code for rep movsq
 *
 * Input:
 * rdi destination
 * rsi source
 * rdx count
 *
 * Output:
 * eax uncopied bytes or 0 if successful.
 */
SYM_FUNC_START(copy_user_generic_unrolled)
	ASM_STAC
	cmpl $8,%edx
	jb .Lcopy_user_short_string_bytes
	ALIGN_DESTINATION
	movl %edx,%ecx
	andl $63,%edx
	shrl $6,%ecx
	jz copy_user_short_string
1:	movq (%rsi),%r8
2:	movq 1*8(%rsi),%r9
3:	movq 2*8(%rsi),%r10
4:	movq 3*8(%rsi),%r11
5:	movq %r8,(%rdi)
6:	movq %r9,1*8(%rdi)
7:	movq %r10,2*8(%rdi)
8:	movq %r11,3*8(%rdi)
9:	movq 4*8(%rsi),%r8
10:	movq 5*8(%rsi),%r9
11:	movq 6*8(%rsi),%r10
12:	movq 7*8(%rsi),%r11
13:	movq %r8,4*8(%rdi)
14:	movq %r9,5*8(%rdi)
15:	movq %r10,6*8(%rdi)
16:	movq %r11,7*8(%rdi)
	leaq 64(%rsi),%rsi
	leaq 64(%rdi),%rdi
	decl %ecx
	jnz 1b
	jmp copy_user_short_string

30:	shll $6,%ecx
	addl %ecx,%edx
	jmp .Lcopy_user_handle_tail

	_ASM_EXTABLE_CPY(1b, 30b)
	_ASM_EXTABLE_CPY(2b, 30b)
	_ASM_EXTABLE_CPY(3b, 30b)
	_ASM_EXTABLE_CPY(4b, 30b)
	_ASM_EXTABLE_CPY(5b, 30b)
	_ASM_EXTABLE_CPY(6b, 30b)
	_ASM_EXTABLE_CPY(7b, 30b)
	_ASM_EXTABLE_CPY(8b, 30b)
	_ASM_EXTABLE_CPY(9b, 30b)
	_ASM_EXTABLE_CPY(10b, 30b)
	_ASM_EXTABLE_CPY(11b, 30b)
	_ASM_EXTABLE_CPY(12b, 30b)
	_ASM_EXTABLE_CPY(13b, 30b)
	_ASM_EXTABLE_CPY(14b, 30b)
	_ASM_EXTABLE_CPY(15b, 30b)
	_ASM_EXTABLE_CPY(16b, 30b)
SYM_FUNC_END(copy_user_generic_unrolled)
EXPORT_SYMBOL(copy_user_generic_unrolled)

/* Some CPUs run faster using the string copy instructions.
 * This is also a lot simpler. Use them when possible.
 *
 * Only 4GB of copy is supported. This shouldn't be a problem
 * because the kernel normally only writes from/to page sized chunks
 * even if user space passed a longer buffer.
 * And more would be dangerous because both Intel and AMD have
 * errata with rep movsq > 4GB. If someone feels the need to fix
 * this please consider this.
 * rep_movs_alternative - memory copy with exception handling.
 * This version is for CPUs that don't have FSRM (Fast Short Rep Movs)
 *
 * Input:
 * rdi destination
 * rsi source
 * rdx count
 * rcx count
 *
 * Output:
 * eax uncopied bytes or 0 if successful.
 */
SYM_FUNC_START(copy_user_generic_string)
	ASM_STAC
	cmpl $8,%edx
	jb 2f		/* less than 8 bytes, go to byte copy loop */
	ALIGN_DESTINATION
	movl %edx,%ecx
	shrl $3,%ecx
	andl $7,%edx
1:	rep movsq
2:	movl %edx,%ecx
3:	rep movsb
	xorl %eax,%eax
	ASM_CLAC
	RET

11:	leal (%rdx,%rcx,8),%ecx
12:	movl %ecx,%edx		/* ecx is zerorest also */
	jmp .Lcopy_user_handle_tail

	_ASM_EXTABLE_CPY(1b, 11b)
	_ASM_EXTABLE_CPY(3b, 12b)
SYM_FUNC_END(copy_user_generic_string)
EXPORT_SYMBOL(copy_user_generic_string)

/*
 * Some CPUs are adding enhanced REP MOVSB/STOSB instructions.
 * It's recommended to use enhanced REP MOVSB/STOSB if it's enabled.
 *
 * Input:
 * rdi destination
 * rsi source
 * rdx count
 * rcx uncopied bytes or 0 if successful.
 *
 * Output:
 * eax uncopied bytes or 0 if successful.
 * NOTE! The calling convention is very intentionally the same as
 * for 'rep movs', so that we can rewrite the function call with
 * just a plain 'rep movs' on machines that have FSRM.  But to make
 * it simpler for us, we can clobber rsi/rdi and rax/r8-r11 freely.
 */
SYM_FUNC_START(copy_user_enhanced_fast_string)
	ASM_STAC
	/* CPUs without FSRM should avoid rep movsb for short copies */
	ALTERNATIVE "cmpl $64, %edx; jb copy_user_short_string", "", X86_FEATURE_FSRM
	movl %edx,%ecx
1:	rep movsb
	xorl %eax,%eax
	ASM_CLAC
SYM_FUNC_START(rep_movs_alternative)
	cmpq $64,%rcx
	jae .Lunrolled

	cmp $8,%ecx
	jae .Lword

	testl %ecx,%ecx
	je .Lexit

.Lcopy_user_tail:
0:	movb (%rsi),%al
1:	movb %al,(%rdi)
	inc %rdi
	inc %rsi
	dec %rcx
	jne .Lcopy_user_tail
.Lexit:
	RET

12:	movl %ecx,%edx		/* ecx is zerorest also */
	jmp .Lcopy_user_handle_tail

	_ASM_EXTABLE_CPY(1b, 12b)
SYM_FUNC_END(copy_user_enhanced_fast_string)
EXPORT_SYMBOL(copy_user_enhanced_fast_string)

/*
 * Try to copy last bytes and clear the rest if needed.
 * Since protection fault in copy_from/to_user is not a normal situation,
 * it is not necessary to optimize tail handling.
 * Don't try to copy the tail if machine check happened
 *
 * Input:
 * eax trap number written by ex_handler_copy()
 * rdi destination
 * rsi source
 * rdx count
 *
 * Output:
 * eax uncopied bytes or 0 if successful.
 */
SYM_CODE_START_LOCAL(.Lcopy_user_handle_tail)
	cmp $X86_TRAP_MC,%eax
	je 3f

	movl %edx,%ecx
1:	rep movsb
2:	mov %ecx,%eax
	ASM_CLAC
	_ASM_EXTABLE_UA( 0b, .Lexit)
	_ASM_EXTABLE_UA( 1b, .Lexit)

	.p2align 4
.Lword:
2:	movq (%rsi),%rax
3:	movq %rax,(%rdi)
	addq $8,%rsi
	addq $8,%rdi
	sub $8,%ecx
	je .Lexit
	cmp $8,%ecx
	jae .Lword
	jmp .Lcopy_user_tail

	_ASM_EXTABLE_UA( 2b, .Lcopy_user_tail)
	_ASM_EXTABLE_UA( 3b, .Lcopy_user_tail)

	.p2align 4
.Lunrolled:
10:	movq (%rsi),%r8
11:	movq 8(%rsi),%r9
12:	movq 16(%rsi),%r10
13:	movq 24(%rsi),%r11
14:	movq %r8,(%rdi)
15:	movq %r9,8(%rdi)
16:	movq %r10,16(%rdi)
17:	movq %r11,24(%rdi)
20:	movq 32(%rsi),%r8
21:	movq 40(%rsi),%r9
22:	movq 48(%rsi),%r10
23:	movq 56(%rsi),%r11
24:	movq %r8,32(%rdi)
25:	movq %r9,40(%rdi)
26:	movq %r10,48(%rdi)
27:	movq %r11,56(%rdi)
	addq $64,%rsi
	addq $64,%rdi
	subq $64,%rcx
	cmpq $64,%rcx
	jae .Lunrolled
	cmpl $8,%ecx
	jae .Lword
	testl %ecx,%ecx
	jne .Lcopy_user_tail
	RET

3:
	movl %edx,%eax
	ASM_CLAC
	RET

	_ASM_EXTABLE_CPY(1b, 2b)

.Lcopy_user_handle_align:
	addl %ecx,%edx			/* ecx is zerorest also */
	jmp .Lcopy_user_handle_tail

SYM_CODE_END(.Lcopy_user_handle_tail)

/*
 * Finish memcpy of less than 64 bytes.  #AC should already be set.
 *
 * Input:
 * rdi destination
 * rsi source
 * rdx count (< 64)
 *
 * Output:
 * eax uncopied bytes or 0 if successful.
 */
SYM_CODE_START_LOCAL(copy_user_short_string)
	movl %edx,%ecx
	andl $7,%edx
	shrl $3,%ecx
	jz .Lcopy_user_short_string_bytes
18:	movq (%rsi),%r8
19:	movq %r8,(%rdi)
	leaq 8(%rsi),%rsi
	leaq 8(%rdi),%rdi
	decl %ecx
	jnz 18b
.Lcopy_user_short_string_bytes:
	andl %edx,%edx
	jz 23f
	movl %edx,%ecx
21:	movb (%rsi),%al
22:	movb %al,(%rdi)
	incq %rsi
	incq %rdi
	decl %ecx
	jnz 21b
23:	xor %eax,%eax
	ASM_CLAC
	RET

40:	leal (%rdx,%rcx,8),%edx
	jmp 60f
50:	movl %ecx,%edx		/* ecx is zerorest also */
60:	jmp .Lcopy_user_handle_tail

	_ASM_EXTABLE_CPY(18b, 40b)
	_ASM_EXTABLE_CPY(19b, 40b)
	_ASM_EXTABLE_CPY(21b, 50b)
	_ASM_EXTABLE_CPY(22b, 50b)
SYM_CODE_END(copy_user_short_string)

/*
 * copy_user_nocache - Uncached memory copy with exception handling
 * This will force destination out of cache for more performance.
 *
 * Note: Cached memory copy is used when destination or size is not
 * naturally aligned. That is:
 *  - Require 8-byte alignment when size is 8 bytes or larger.
 *  - Require 4-byte alignment when size is 4 bytes.
 */
SYM_FUNC_START(__copy_user_nocache)
	ASM_STAC

	/* If size is less than 8 bytes, go to 4-byte copy */
	cmpl $8,%edx
	jb .L_4b_nocache_copy_entry

	/* If destination is not 8-byte aligned, "cache" copy to align it */
	ALIGN_DESTINATION

	/* Set 4x8-byte copy count and remainder */
	movl %edx,%ecx
	andl $63,%edx
	shrl $6,%ecx
	jz .L_8b_nocache_copy_entry	/* jump if count is 0 */

	/* Perform 4x8-byte nocache loop-copy */
.L_4x8b_nocache_copy_loop:
1:	movq (%rsi),%r8
2:	movq 1*8(%rsi),%r9
3:	movq 2*8(%rsi),%r10
4:	movq 3*8(%rsi),%r11
5:	movnti %r8,(%rdi)
6:	movnti %r9,1*8(%rdi)
7:	movnti %r10,2*8(%rdi)
8:	movnti %r11,3*8(%rdi)
9:	movq 4*8(%rsi),%r8
10:	movq 5*8(%rsi),%r9
11:	movq 6*8(%rsi),%r10
12:	movq 7*8(%rsi),%r11
13:	movnti %r8,4*8(%rdi)
14:	movnti %r9,5*8(%rdi)
15:	movnti %r10,6*8(%rdi)
16:	movnti %r11,7*8(%rdi)
	leaq 64(%rsi),%rsi
	leaq 64(%rdi),%rdi
	decl %ecx
	jnz .L_4x8b_nocache_copy_loop

	/* Set 8-byte copy count and remainder */
.L_8b_nocache_copy_entry:
	movl %edx,%ecx
	andl $7,%edx
	shrl $3,%ecx
	jz .L_4b_nocache_copy_entry	/* jump if count is 0 */

	/* Perform 8-byte nocache loop-copy */
.L_8b_nocache_copy_loop:
20:	movq (%rsi),%r8
21:	movnti %r8,(%rdi)
	leaq 8(%rsi),%rsi
	leaq 8(%rdi),%rdi
	decl %ecx
	jnz .L_8b_nocache_copy_loop

	/* If no byte left, we're done */
.L_4b_nocache_copy_entry:
	andl %edx,%edx
	jz .L_finish_copy

	/* If destination is not 4-byte aligned, go to byte copy: */
	movl %edi,%ecx
	andl $3,%ecx
	jnz .L_1b_cache_copy_entry

	/* Set 4-byte copy count (1 or 0) and remainder */
	movl %edx,%ecx
	andl $3,%edx
	shrl $2,%ecx
	jz .L_1b_cache_copy_entry	/* jump if count is 0 */

	/* Perform 4-byte nocache copy: */
30:	movl (%rsi),%r8d
31:	movnti %r8d,(%rdi)
	leaq 4(%rsi),%rsi
	leaq 4(%rdi),%rdi

	/* If no bytes left, we're done: */
	andl %edx,%edx
	jz .L_finish_copy

	/* Perform byte "cache" loop-copy for the remainder */
.L_1b_cache_copy_entry:
	movl %edx,%ecx
.L_1b_cache_copy_loop:
40:	movb (%rsi),%al
41:	movb %al,(%rdi)
	incq %rsi
	incq %rdi
	decl %ecx
	jnz .L_1b_cache_copy_loop

	/* Finished copying; fence the prior stores */
.L_finish_copy:
	xorl %eax,%eax
	ASM_CLAC
	sfence
	RET

.L_fixup_4x8b_copy:
	shll $6,%ecx
	addl %ecx,%edx
	jmp .L_fixup_handle_tail
.L_fixup_8b_copy:
	lea (%rdx,%rcx,8),%rdx
	jmp .L_fixup_handle_tail
.L_fixup_4b_copy:
	lea (%rdx,%rcx,4),%rdx
	jmp .L_fixup_handle_tail
.L_fixup_1b_copy:
	movl %ecx,%edx
.L_fixup_handle_tail:
	sfence
	jmp .Lcopy_user_handle_tail

	_ASM_EXTABLE_CPY(1b, .L_fixup_4x8b_copy)
	_ASM_EXTABLE_CPY(2b, .L_fixup_4x8b_copy)
	_ASM_EXTABLE_CPY(3b, .L_fixup_4x8b_copy)
	_ASM_EXTABLE_CPY(4b, .L_fixup_4x8b_copy)
	_ASM_EXTABLE_CPY(5b, .L_fixup_4x8b_copy)
	_ASM_EXTABLE_CPY(6b, .L_fixup_4x8b_copy)
	_ASM_EXTABLE_CPY(7b, .L_fixup_4x8b_copy)
	_ASM_EXTABLE_CPY(8b, .L_fixup_4x8b_copy)
	_ASM_EXTABLE_CPY(9b, .L_fixup_4x8b_copy)
	_ASM_EXTABLE_CPY(10b, .L_fixup_4x8b_copy)
	_ASM_EXTABLE_CPY(11b, .L_fixup_4x8b_copy)
	_ASM_EXTABLE_CPY(12b, .L_fixup_4x8b_copy)
	_ASM_EXTABLE_CPY(13b, .L_fixup_4x8b_copy)
	_ASM_EXTABLE_CPY(14b, .L_fixup_4x8b_copy)
	_ASM_EXTABLE_CPY(15b, .L_fixup_4x8b_copy)
	_ASM_EXTABLE_CPY(16b, .L_fixup_4x8b_copy)
	_ASM_EXTABLE_CPY(20b, .L_fixup_8b_copy)
	_ASM_EXTABLE_CPY(21b, .L_fixup_8b_copy)
	_ASM_EXTABLE_CPY(30b, .L_fixup_4b_copy)
	_ASM_EXTABLE_CPY(31b, .L_fixup_4b_copy)
	_ASM_EXTABLE_CPY(40b, .L_fixup_1b_copy)
	_ASM_EXTABLE_CPY(41b, .L_fixup_1b_copy)
SYM_FUNC_END(__copy_user_nocache)
EXPORT_SYMBOL(__copy_user_nocache)
	_ASM_EXTABLE_UA(10b, .Lcopy_user_tail)
	_ASM_EXTABLE_UA(11b, .Lcopy_user_tail)
	_ASM_EXTABLE_UA(12b, .Lcopy_user_tail)
	_ASM_EXTABLE_UA(13b, .Lcopy_user_tail)
	_ASM_EXTABLE_UA(14b, .Lcopy_user_tail)
	_ASM_EXTABLE_UA(15b, .Lcopy_user_tail)
	_ASM_EXTABLE_UA(16b, .Lcopy_user_tail)
	_ASM_EXTABLE_UA(17b, .Lcopy_user_tail)
	_ASM_EXTABLE_UA(20b, .Lcopy_user_tail)
	_ASM_EXTABLE_UA(21b, .Lcopy_user_tail)
	_ASM_EXTABLE_UA(22b, .Lcopy_user_tail)
	_ASM_EXTABLE_UA(23b, .Lcopy_user_tail)
	_ASM_EXTABLE_UA(24b, .Lcopy_user_tail)
	_ASM_EXTABLE_UA(25b, .Lcopy_user_tail)
	_ASM_EXTABLE_UA(26b, .Lcopy_user_tail)
	_ASM_EXTABLE_UA(27b, .Lcopy_user_tail)
SYM_FUNC_END(rep_movs_alternative)
EXPORT_SYMBOL(rep_movs_alternative)
Loading