Files
linux-net/arch/x86/lib/copy_user_64.S
Linus Torvalds 427fda2c8a x86: improve on the non-rep 'copy_user' function
The old 'copy_user_generic_unrolled' function was oddly implemented,
mostly for historical reasons: it had been largely based on the uncached
copy case, which has some other concerns.

For example, the __copy_user_nocache() function uses 'movnti' for the
destination stores, and those want the destination to be aligned.  In
contrast, the regular copy function doesn't really care, and trying to
align things only complicates matters.

Also, like the clear_user function, the copy function had some odd
handling of the repeat counts, complicating the exception handling for
no really good reason.  So as with clear_user, just write it to keep all
the byte counts in the %rcx register, exactly like the 'rep movs'
functionality that this replaces.

Unlike a real 'rep movs', we do allow for this to trash a few temporary
registers to not have to unnecessarily save/restore registers on the
stack.

And like the clearing case, rename this to what it now clearly is:
'rep_movs_alternative', and make it one coherent function, so that it
shows up as such in profiles (instead of the odd split between
"copy_user_generic_unrolled" and "copy_user_short_string", the latter of
which was not about strings at all, and which was shared with the
uncached case).

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2023-04-18 17:05:28 -07:00

328 lines
7.5 KiB
ArmAsm

/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright 2008 Vitaly Mayatskikh <vmayatsk@redhat.com>
* Copyright 2002 Andi Kleen, SuSE Labs.
*
* Functions to copy from and to user space.
*/
#include <linux/linkage.h>
#include <asm/current.h>
#include <asm/asm-offsets.h>
#include <asm/thread_info.h>
#include <asm/cpufeatures.h>
#include <asm/alternative.h>
#include <asm/asm.h>
#include <asm/smap.h>
#include <asm/export.h>
#include <asm/trapnr.h>
/*
 * rep_movs_alternative - memory copy with exception handling.
 * This version is for CPUs that don't have FSRM (Fast Short Rep Movs)
 *
 * Input:
 * rdi destination
 * rsi source
 * rcx count
 *
 * Output:
 * rcx uncopied bytes or 0 if successful.
 *
 * NOTE! The calling convention is very intentionally the same as
 * for 'rep movs', so that we can rewrite the function call with
 * just a plain 'rep movs' on machines that have FSRM. But to make
 * it simpler for us, we can clobber rsi/rdi and rax/r8-r11 freely.
 *
 * Layout of the copy:
 *  - count >= 64:     .Lunrolled moves 64 bytes per iteration via r8-r11
 *  - 8 <= count < 64: .Lword moves 8 bytes per iteration via rax
 *  - count < 8:       .Lcopy_user_tail moves one byte per iteration via al
 *
 * rsi, rdi and rcx are only updated after all loads and stores of an
 * iteration have been issued, so whenever one of the labelled accesses
 * faults, rcx still counts every byte of the current iteration as
 * uncopied. The _ASM_EXTABLE_UA() entries (macro defined outside this
 * file) pair each labelled access with the label to continue at after
 * a fault.
 */
SYM_FUNC_START(rep_movs_alternative)
/* 64 bytes or more: take the unrolled loop */
cmpq $64,%rcx
jae .Lunrolled
/* rcx < 64 from here on, so the 32-bit view in ecx holds the full count */
cmp $8,%ecx
jae .Lword
/* fewer than 8 bytes: nothing to do for a zero count */
testl %ecx,%ecx
je .Lexit
/*
 * Byte loop. Entered with rcx != 0; also the fallback that the word and
 * unrolled loops resume in after a fault.
 */
.Lcopy_user_tail:
0: movb (%rsi),%al
1: movb %al,(%rdi)
inc %rdi
inc %rsi
/* dec sets ZF for the jne below; rcx ends at 0 on normal completion */
dec %rcx
jne .Lcopy_user_tail
.Lexit:
RET
/* A fault on a single byte returns immediately with rcx = bytes left */
_ASM_EXTABLE_UA( 0b, .Lexit)
_ASM_EXTABLE_UA( 1b, .Lexit)
.p2align 4
/*
 * 8-byte loop. Every path into this label has rcx < 64 (and >= 8), so
 * the 32-bit subtract and compare on ecx operate on the whole count.
 */
.Lword:
2: movq (%rsi),%rax
3: movq %rax,(%rdi)
addq $8,%rsi
addq $8,%rdi
/* ZF from this subtract decides the je: exactly done when ecx hits 0 */
sub $8,%ecx
je .Lexit
cmp $8,%ecx
jae .Lword
/* 1..7 bytes left: finish them in the byte loop */
jmp .Lcopy_user_tail
/*
 * A faulting 8-byte access is retried one byte at a time; pointers and
 * count have not been advanced for it yet.
 */
_ASM_EXTABLE_UA( 2b, .Lcopy_user_tail)
_ASM_EXTABLE_UA( 3b, .Lcopy_user_tail)
.p2align 4
/*
 * 64-byte loop, done as two halves of four 8-byte loads followed by four
 * 8-byte stores, reusing r8-r11 for both halves.
 */
.Lunrolled:
10: movq (%rsi),%r8
11: movq 8(%rsi),%r9
12: movq 16(%rsi),%r10
13: movq 24(%rsi),%r11
14: movq %r8,(%rdi)
15: movq %r9,8(%rdi)
16: movq %r10,16(%rdi)
17: movq %r11,24(%rdi)
20: movq 32(%rsi),%r8
21: movq 40(%rsi),%r9
22: movq 48(%rsi),%r10
23: movq 56(%rsi),%r11
24: movq %r8,32(%rdi)
25: movq %r9,40(%rdi)
26: movq %r10,48(%rdi)
27: movq %r11,56(%rdi)
/* advance only after the whole 64-byte chunk went through */
addq $64,%rsi
addq $64,%rdi
subq $64,%rcx
cmpq $64,%rcx
jae .Lunrolled
/* rcx < 64 now: hand the remainder to the word loop or the byte loop */
cmpl $8,%ecx
jae .Lword
testl %ecx,%ecx
jne .Lcopy_user_tail
RET
/*
 * Any fault inside a 64-byte chunk restarts that chunk in the byte loop
 * with rsi/rdi/rcx still describing the start of the chunk.
 */
_ASM_EXTABLE_UA(10b, .Lcopy_user_tail)
_ASM_EXTABLE_UA(11b, .Lcopy_user_tail)
_ASM_EXTABLE_UA(12b, .Lcopy_user_tail)
_ASM_EXTABLE_UA(13b, .Lcopy_user_tail)
_ASM_EXTABLE_UA(14b, .Lcopy_user_tail)
_ASM_EXTABLE_UA(15b, .Lcopy_user_tail)
_ASM_EXTABLE_UA(16b, .Lcopy_user_tail)
_ASM_EXTABLE_UA(17b, .Lcopy_user_tail)
_ASM_EXTABLE_UA(20b, .Lcopy_user_tail)
_ASM_EXTABLE_UA(21b, .Lcopy_user_tail)
_ASM_EXTABLE_UA(22b, .Lcopy_user_tail)
_ASM_EXTABLE_UA(23b, .Lcopy_user_tail)
_ASM_EXTABLE_UA(24b, .Lcopy_user_tail)
_ASM_EXTABLE_UA(25b, .Lcopy_user_tail)
_ASM_EXTABLE_UA(26b, .Lcopy_user_tail)
_ASM_EXTABLE_UA(27b, .Lcopy_user_tail)
SYM_FUNC_END(rep_movs_alternative)
EXPORT_SYMBOL(rep_movs_alternative)
/*
 * The uncached copy needs to align the destination for
 * movnti and friends.
 *
 * ALIGN_DESTINATION - byte-copy until rdi is 8-byte aligned.
 *
 * Input:
 * rdi destination
 * rsi source
 * edx count
 *
 * Output:
 * rdi/rsi advanced by the 0-7 bytes copied, edx reduced by the same amount
 * ecx zero
 * al clobbered if any byte was copied
 *
 * The macro does not compare edx against the number of alignment bytes;
 * the user visible in this file (__copy_user_nocache) expands it only
 * after checking that edx >= 8.
 *
 * A fault on either byte access is routed to .Lcopy_user_handle_align,
 * which the code expanding this macro has to provide. At that point ecx
 * holds the alignment bytes not yet copied and edx the count that was
 * set aside for after the alignment.
 */
.macro ALIGN_DESTINATION
/* check for bad alignment of destination */
movl %edi,%ecx
andl $7,%ecx
jz 102f /* already aligned */
/* ecx = 8 - (edi & 7): bytes needed to reach the next 8-byte boundary */
subl $8,%ecx
negl %ecx
/* take those bytes out of the main count up front */
subl %ecx,%edx
/* copy ecx bytes one at a time through al */
100: movb (%rsi),%al
101: movb %al,(%rdi)
incq %rsi
incq %rdi
decl %ecx
jnz 100b
102:
_ASM_EXTABLE_CPY(100b, .Lcopy_user_handle_align)
_ASM_EXTABLE_CPY(101b, .Lcopy_user_handle_align)
.endm
/*
 * __copy_user_nocache - Uncached memory copy with exception handling
 * This will force destination out of cache for more performance.
 *
 * Note: Cached memory copy is used when destination or size is not
 * naturally aligned. That is:
 * - Require 8-byte alignment when size is 8 bytes or larger.
 * - Require 4-byte alignment when size is 4 bytes.
 *
 * Input:
 * rdi destination
 * rsi source
 * edx count
 *
 * Output:
 * eax uncopied bytes or 0 if successful.
 *
 * rcx, rdx, rsi, rdi and r8-r11 are modified along the way.
 *
 * Stages, in order: byte copy to align rdi (ALIGN_DESTINATION), 64-byte
 * movnti loop, 8-byte movnti loop, at most one 4-byte movnti, byte loop.
 * In each loop ecx is the iteration counter and edx the byte remainder
 * left for the later stages. Every exit path that may follow a movnti
 * executes sfence first.
 */
SYM_FUNC_START(__copy_user_nocache)
/* If size is less than 8 bytes, go to 4-byte copy */
cmpl $8,%edx
jb .L_4b_nocache_copy_entry
/* If destination is not 8-byte aligned, "cache" copy to align it */
ALIGN_DESTINATION
/* Set 4x8-byte copy count and remainder */
/* ecx = edx / 64 (loop iterations), edx = edx % 64 */
movl %edx,%ecx
andl $63,%edx
shrl $6,%ecx
jz .L_8b_nocache_copy_entry /* jump if count is 0 */
/* Perform 4x8-byte nocache loop-copy */
/* Two halves per iteration: four loads into r8-r11, then four movnti */
.L_4x8b_nocache_copy_loop:
1: movq (%rsi),%r8
2: movq 1*8(%rsi),%r9
3: movq 2*8(%rsi),%r10
4: movq 3*8(%rsi),%r11
5: movnti %r8,(%rdi)
6: movnti %r9,1*8(%rdi)
7: movnti %r10,2*8(%rdi)
8: movnti %r11,3*8(%rdi)
9: movq 4*8(%rsi),%r8
10: movq 5*8(%rsi),%r9
11: movq 6*8(%rsi),%r10
12: movq 7*8(%rsi),%r11
13: movnti %r8,4*8(%rdi)
14: movnti %r9,5*8(%rdi)
15: movnti %r10,6*8(%rdi)
16: movnti %r11,7*8(%rdi)
/* pointers and counter move only after the full 64 bytes were issued */
leaq 64(%rsi),%rsi
leaq 64(%rdi),%rdi
decl %ecx
jnz .L_4x8b_nocache_copy_loop
/* Set 8-byte copy count and remainder */
/* edx < 64 here: ecx = edx / 8, edx = edx % 8 */
.L_8b_nocache_copy_entry:
movl %edx,%ecx
andl $7,%edx
shrl $3,%ecx
jz .L_4b_nocache_copy_entry /* jump if count is 0 */
/* Perform 8-byte nocache loop-copy */
.L_8b_nocache_copy_loop:
20: movq (%rsi),%r8
21: movnti %r8,(%rdi)
leaq 8(%rsi),%rsi
leaq 8(%rdi),%rdi
decl %ecx
jnz .L_8b_nocache_copy_loop
/* If no byte left, we're done */
/* edx < 8 on every path into this label */
.L_4b_nocache_copy_entry:
andl %edx,%edx
jz .L_finish_copy
/* If destination is not 4-byte aligned, go to byte copy: */
movl %edi,%ecx
andl $3,%ecx
jnz .L_1b_cache_copy_entry
/* Set 4-byte copy count (1 or 0) and remainder */
movl %edx,%ecx
andl $3,%edx
shrl $2,%ecx
jz .L_1b_cache_copy_entry /* jump if count is 0 */
/* Perform 4-byte nocache copy: */
/* ecx is 1 here and is not decremented; the 4-byte fixup relies on it */
30: movl (%rsi),%r8d
31: movnti %r8d,(%rdi)
leaq 4(%rsi),%rsi
leaq 4(%rdi),%rdi
/* If no bytes left, we're done: */
andl %edx,%edx
jz .L_finish_copy
/* Perform byte "cache" loop-copy for the remainder */
/* edx is non-zero on every path into this label, so the loop runs */
.L_1b_cache_copy_entry:
movl %edx,%ecx
.L_1b_cache_copy_loop:
40: movb (%rsi),%al
41: movb %al,(%rdi)
incq %rsi
incq %rdi
decl %ecx
jnz .L_1b_cache_copy_loop
/* Finished copying; fence the prior stores */
.L_finish_copy:
/* success: return 0 in eax */
xorl %eax,%eax
sfence
RET
/*
 * Fault fixups. Each one rebuilds the total uncopied byte count in edx
 * from the loop counter still in ecx and the remainder in edx, then
 * joins the common tail handler.
 */
/* 64-byte loop: ecx iterations left, the faulting one included */
.L_fixup_4x8b_copy:
shll $6,%ecx
addl %ecx,%edx
jmp .L_fixup_handle_tail
/* 8-byte loop: edx += 8 * ecx */
.L_fixup_8b_copy:
lea (%rdx,%rcx,8),%rdx
jmp .L_fixup_handle_tail
/* single 4-byte copy: ecx is 1, so this adds 4 to edx */
.L_fixup_4b_copy:
lea (%rdx,%rcx,4),%rdx
jmp .L_fixup_handle_tail
/* byte loop: ecx alone is the number of bytes left */
.L_fixup_1b_copy:
movl %ecx,%edx
.L_fixup_handle_tail:
/* fence any movnti stores issued before the fault */
sfence
jmp .Lcopy_user_handle_tail
/*
 * Numeric labels resolve to the nearest earlier definition, so 1b-16b,
 * 20b/21b, 30b/31b and 40b/41b below name the accesses of this function,
 * not same-numbered labels used earlier in the file.
 */
_ASM_EXTABLE_CPY(1b, .L_fixup_4x8b_copy)
_ASM_EXTABLE_CPY(2b, .L_fixup_4x8b_copy)
_ASM_EXTABLE_CPY(3b, .L_fixup_4x8b_copy)
_ASM_EXTABLE_CPY(4b, .L_fixup_4x8b_copy)
_ASM_EXTABLE_CPY(5b, .L_fixup_4x8b_copy)
_ASM_EXTABLE_CPY(6b, .L_fixup_4x8b_copy)
_ASM_EXTABLE_CPY(7b, .L_fixup_4x8b_copy)
_ASM_EXTABLE_CPY(8b, .L_fixup_4x8b_copy)
_ASM_EXTABLE_CPY(9b, .L_fixup_4x8b_copy)
_ASM_EXTABLE_CPY(10b, .L_fixup_4x8b_copy)
_ASM_EXTABLE_CPY(11b, .L_fixup_4x8b_copy)
_ASM_EXTABLE_CPY(12b, .L_fixup_4x8b_copy)
_ASM_EXTABLE_CPY(13b, .L_fixup_4x8b_copy)
_ASM_EXTABLE_CPY(14b, .L_fixup_4x8b_copy)
_ASM_EXTABLE_CPY(15b, .L_fixup_4x8b_copy)
_ASM_EXTABLE_CPY(16b, .L_fixup_4x8b_copy)
_ASM_EXTABLE_CPY(20b, .L_fixup_8b_copy)
_ASM_EXTABLE_CPY(21b, .L_fixup_8b_copy)
_ASM_EXTABLE_CPY(30b, .L_fixup_4b_copy)
_ASM_EXTABLE_CPY(31b, .L_fixup_4b_copy)
_ASM_EXTABLE_CPY(40b, .L_fixup_1b_copy)
_ASM_EXTABLE_CPY(41b, .L_fixup_1b_copy)
/*
 * Try to copy the last bytes with a plain 'rep movsb'.
 * Since protection fault in copy_from/to_user is not a normal situation,
 * it is not necessary to optimize tail handling.
 * Don't try to copy the tail if machine check happened
 *
 * Nothing is zeroed here: bytes that cannot be copied are only counted.
 *
 * Input:
 * eax trap number written by ex_handler_copy()
 * rdi destination
 * rsi source
 * rdx count
 *
 * Output:
 * eax uncopied bytes or 0 if successful.
 */
.Lcopy_user_handle_tail:
/* machine check: report everything in edx as uncopied, touch nothing */
cmp $X86_TRAP_MC,%eax
je 3f
movl %edx,%ecx
1: rep movsb
/* ecx is 0 after a complete copy, or the bytes left if 1: faulted */
2: mov %ecx,%eax
RET
3:
movl %edx,%eax
RET
/* a fault inside the rep movsb resumes at 2: with ecx = bytes left */
_ASM_EXTABLE_CPY(1b, 2b)
/*
 * Fault target of ALIGN_DESTINATION: ecx holds the alignment bytes not
 * yet copied and edx the count left for after alignment, so their sum is
 * the total uncopied count expected by the tail handler.
 */
.Lcopy_user_handle_align:
addl %ecx,%edx
jmp .Lcopy_user_handle_tail
SYM_FUNC_END(__copy_user_nocache)
EXPORT_SYMBOL(__copy_user_nocache)