x86: rewrite '__copy_user_nocache' function (034ff37d) · Commits · git / linux-net

arch/x86/lib/Makefile

+1 −1

Original line number	Diff line number	Diff line
		@@ -71,6 +71,6 @@ ifneq ($(CONFIG_GENERIC_CSUM),y)
		endif
		lib-y += clear_page_64.o copy_page_64.o
		lib-y += memmove_64.o memset_64.o
		lib-y += copy_user_64.o
		lib-y += copy_user_64.o copy_user_uncached_64.o
		lib-y += cmpxchg16b_emu.o
		endif

arch/x86/lib/copy_user_64.S

+0 −213

Original line number	Diff line number	Diff line
		@@ -7,15 +7,8 @@
		*/

		#include <linux/linkage.h>
		#include <asm/current.h>
		#include <asm/asm-offsets.h>
		#include <asm/thread_info.h>
		#include <asm/cpufeatures.h>
		#include <asm/alternative.h>
		#include <asm/asm.h>
		#include <asm/smap.h>
		#include <asm/export.h>
		#include <asm/trapnr.h>

		/*
		* rep_movs_alternative - memory copy with exception handling.
		@@ -119,209 +112,3 @@ SYM_FUNC_START(rep_movs_alternative)
		_ASM_EXTABLE_UA(27b, .Lcopy_user_tail)
		SYM_FUNC_END(rep_movs_alternative)
		EXPORT_SYMBOL(rep_movs_alternative)

		/*
		* The uncached copy needs to align the destination for
		* movnti and friends.
		*
		* ALIGN_DESTINATION copies single bytes with regular (cached) stores
		* until the destination is 8-byte aligned.
		*
		* In/out: rdi destination, rsi source, edx count. rdi and rsi are
		* advanced and edx is reduced by the number of bytes copied here.
		* Clobbers: ecx, al, flags.
		*
		* The count is reduced up front without a bounds check; the user of
		* this macro in this file only expands it after checking that edx
		* is at least 8.
		*/
		.macro ALIGN_DESTINATION
		/* check for bad alignment of destination */
		movl %edi,%ecx
		andl $7,%ecx
		jz 102f /* already aligned */
		/* ecx = 8 - (dst & 7): bytes needed to reach 8-byte alignment */
		subl $8,%ecx
		negl %ecx
		/* take the head bytes out of the remaining count right away */
		subl %ecx,%edx
		100: movb (%rsi),%al
		101: movb %al,(%rdi)
		incq %rsi
		incq %rdi
		decl %ecx
		jnz 100b
		102:

		/*
		* A fault on either access lands in .Lcopy_user_handle_align, which
		* adds the not-yet-copied head bytes (ecx) back to edx.
		*/
		_ASM_EXTABLE_CPY(100b, .Lcopy_user_handle_align)
		_ASM_EXTABLE_CPY(101b, .Lcopy_user_handle_align)
		.endm


		/*
		* __copy_user_nocache - Uncached memory copy with exception handling
		* This will force destination out of cache for more performance.
		*
		* Note: Cached memory copy is used when destination or size is not
		* naturally aligned. That is:
		* - Require 8-byte alignment when size is 8 bytes or larger.
		* - Require 4-byte alignment when size is 4 bytes.
		*
		* Input:
		* rdi destination
		* rsi source
		* edx count
		*
		* Output:
		* eax uncopied bytes or 0 if successful.
		*
		* Scratch registers used: ecx, r8-r11, al.
		*/
		SYM_FUNC_START(__copy_user_nocache)
		/* If size is less than 8 bytes, go to 4-byte copy */
		cmpl $8,%edx
		jb .L_4b_nocache_copy_entry

		/* If destination is not 8-byte aligned, "cache" copy to align it */
		ALIGN_DESTINATION

		/*
		* Set 64-byte block count (ecx) and remainder (edx). Despite the
		* "4x8b" label names, every loop iteration moves 64 bytes.
		*/
		movl %edx,%ecx
		andl $63,%edx
		shrl $6,%ecx
		jz .L_8b_nocache_copy_entry /* jump if count is 0 */

		/* Perform nocache loop-copy: two 4x8-byte groups per iteration */
		.L_4x8b_nocache_copy_loop:
		1: movq (%rsi),%r8
		2: movq 1*8(%rsi),%r9
		3: movq 2*8(%rsi),%r10
		4: movq 3*8(%rsi),%r11
		5: movnti %r8,(%rdi)
		6: movnti %r9,1*8(%rdi)
		7: movnti %r10,2*8(%rdi)
		8: movnti %r11,3*8(%rdi)
		9: movq 4*8(%rsi),%r8
		10: movq 5*8(%rsi),%r9
		11: movq 6*8(%rsi),%r10
		12: movq 7*8(%rsi),%r11
		13: movnti %r8,4*8(%rdi)
		14: movnti %r9,5*8(%rdi)
		15: movnti %r10,6*8(%rdi)
		16: movnti %r11,7*8(%rdi)
		leaq 64(%rsi),%rsi
		leaq 64(%rdi),%rdi
		decl %ecx
		jnz .L_4x8b_nocache_copy_loop

		/* Set 8-byte copy count (ecx) and remainder (edx) */
		.L_8b_nocache_copy_entry:
		movl %edx,%ecx
		andl $7,%edx
		shrl $3,%ecx
		jz .L_4b_nocache_copy_entry /* jump if count is 0 */

		/* Perform 8-byte nocache loop-copy */
		.L_8b_nocache_copy_loop:
		20: movq (%rsi),%r8
		21: movnti %r8,(%rdi)
		leaq 8(%rsi),%rsi
		leaq 8(%rdi),%rdi
		decl %ecx
		jnz .L_8b_nocache_copy_loop

		/* If no byte left, we're done */
		.L_4b_nocache_copy_entry:
		andl %edx,%edx
		jz .L_finish_copy

		/* If destination is not 4-byte aligned, go to byte copy: */
		movl %edi,%ecx
		andl $3,%ecx
		jnz .L_1b_cache_copy_entry

		/* Set 4-byte copy count (1 or 0) and remainder */
		movl %edx,%ecx
		andl $3,%edx
		shrl $2,%ecx
		jz .L_1b_cache_copy_entry /* jump if count is 0 */

		/* Perform 4-byte nocache copy: */
		30: movl (%rsi),%r8d
		31: movnti %r8d,(%rdi)
		leaq 4(%rsi),%rsi
		leaq 4(%rdi),%rdi

		/* If no bytes left, we're done: */
		andl %edx,%edx
		jz .L_finish_copy

		/*
		* Perform byte "cache" loop-copy for the remainder.
		* Every path into this loop arrives with edx non-zero, so the
		* decl/jnz loop below runs exactly edx times.
		*/
		.L_1b_cache_copy_entry:
		movl %edx,%ecx
		.L_1b_cache_copy_loop:
		40: movb (%rsi),%al
		41: movb %al,(%rdi)
		incq %rsi
		incq %rdi
		decl %ecx
		jnz .L_1b_cache_copy_loop

		/* Finished copying; fence the prior stores */
		.L_finish_copy:
		xorl %eax,%eax
		sfence
		RET

		/*
		* Fault fixups. Each one rebuilds in edx the byte count that was
		* still outstanding when its loop was entered for the current
		* iteration, then hands over to the common tail handler.
		*/

		/* edx = remaining 64-byte blocks * 64 + remainder */
		.L_fixup_4x8b_copy:
		shll $6,%ecx
		addl %ecx,%edx
		jmp .L_fixup_handle_tail
		/* rdx = remaining quadwords * 8 + remainder */
		.L_fixup_8b_copy:
		lea (%rdx,%rcx,8),%rdx
		jmp .L_fixup_handle_tail
		/* rdx = remaining 4-byte words * 4 + remainder */
		.L_fixup_4b_copy:
		lea (%rdx,%rcx,4),%rdx
		jmp .L_fixup_handle_tail
		/* in the byte loop ecx is the remaining byte count itself */
		.L_fixup_1b_copy:
		movl %ecx,%edx
		.L_fixup_handle_tail:
		/* fence the stores issued before the fault, then do the tail */
		sfence
		jmp .Lcopy_user_handle_tail

		_ASM_EXTABLE_CPY(1b, .L_fixup_4x8b_copy)
		_ASM_EXTABLE_CPY(2b, .L_fixup_4x8b_copy)
		_ASM_EXTABLE_CPY(3b, .L_fixup_4x8b_copy)
		_ASM_EXTABLE_CPY(4b, .L_fixup_4x8b_copy)
		_ASM_EXTABLE_CPY(5b, .L_fixup_4x8b_copy)
		_ASM_EXTABLE_CPY(6b, .L_fixup_4x8b_copy)
		_ASM_EXTABLE_CPY(7b, .L_fixup_4x8b_copy)
		_ASM_EXTABLE_CPY(8b, .L_fixup_4x8b_copy)
		_ASM_EXTABLE_CPY(9b, .L_fixup_4x8b_copy)
		_ASM_EXTABLE_CPY(10b, .L_fixup_4x8b_copy)
		_ASM_EXTABLE_CPY(11b, .L_fixup_4x8b_copy)
		_ASM_EXTABLE_CPY(12b, .L_fixup_4x8b_copy)
		_ASM_EXTABLE_CPY(13b, .L_fixup_4x8b_copy)
		_ASM_EXTABLE_CPY(14b, .L_fixup_4x8b_copy)
		_ASM_EXTABLE_CPY(15b, .L_fixup_4x8b_copy)
		_ASM_EXTABLE_CPY(16b, .L_fixup_4x8b_copy)
		_ASM_EXTABLE_CPY(20b, .L_fixup_8b_copy)
		_ASM_EXTABLE_CPY(21b, .L_fixup_8b_copy)
		_ASM_EXTABLE_CPY(30b, .L_fixup_4b_copy)
		_ASM_EXTABLE_CPY(31b, .L_fixup_4b_copy)
		_ASM_EXTABLE_CPY(40b, .L_fixup_1b_copy)
		_ASM_EXTABLE_CPY(41b, .L_fixup_1b_copy)

		/*
		* Try to copy last bytes.
		* Since protection fault in copy_from/to_user is not a normal situation,
		* it is not necessary to optimize tail handling.
		* Don't try to copy the tail if machine check happened
		*
		* Input:
		* eax trap number written by ex_handler_copy()
		* rdi destination
		* rsi source
		* rdx count
		*
		* Output:
		* eax uncopied bytes or 0 if successful.
		*/
		.Lcopy_user_handle_tail:
		cmp $X86_TRAP_MC,%eax
		je 3f

		/*
		* Retry byte by byte. If 'rep movsb' faults, the exception table
		* entry below resumes at 2 with ecx holding the bytes not copied.
		*/
		movl %edx,%ecx
		1: rep movsb
		2: mov %ecx,%eax
		RET

		/* Machine check: report the whole remaining count as uncopied */
		3:
		movl %edx,%eax
		RET

		_ASM_EXTABLE_CPY(1b, 2b)

		/*
		* Fault inside ALIGN_DESTINATION: the macro already subtracted the
		* head bytes from edx, so add back the ones still pending in ecx.
		*/
		.Lcopy_user_handle_align:
		addl %ecx,%edx
		jmp .Lcopy_user_handle_tail

		SYM_FUNC_END(__copy_user_nocache)
		EXPORT_SYMBOL(__copy_user_nocache)

arch/x86/lib/copy_user_uncached_64.S

0 → 100644

+242 −0

Original line number	Diff line number	Diff line
		/* SPDX-License-Identifier: GPL-2.0-only */
		/*
		* Copyright 2023 Linus Torvalds <torvalds@linux-foundation.org>
		*/

		#include <linux/linkage.h>
		#include <asm/asm.h>
		#include <asm/export.h>

		/*
		* __copy_user_nocache - Uncached memory copy with exception handling
		*
		* This copies from user space into kernel space, but the kernel
		* space accesses can take a machine check exception, so they too
		* need exception handling.
		*
		* Note: only 32-bit and 64-bit stores have non-temporal versions,
		* and we only use aligned versions. Any unaligned parts at the
		* start or end of the copy will be done using normal cached stores.
		*
		* Throughout the function %edx holds the number of bytes still
		* left to copy; every exit path returns it in %eax.
		*
		* Input:
		* rdi destination
		* rsi source
		* edx count
		*
		* Output:
		* rax uncopied bytes or 0 if successful.
		*
		* Scratch registers used: rax, r8-r11.
		*/
		SYM_FUNC_START(__copy_user_nocache)
		/* If destination is not 8-byte aligned, we'll have to align it */
		testb $7,%dil
		jne .Lalign

		.Lis_aligned:
		/* Fewer than 64 bytes left: skip the unrolled loop */
		cmp $64,%edx
		jb .Lquadwords

		.p2align 4,0x90
		/* 64 bytes per iteration: load 32, store 32, load 32, store 32 */
		.Lunrolled:
		10: movq (%rsi),%r8
		11: movq 8(%rsi),%r9
		12: movq 16(%rsi),%r10
		13: movq 24(%rsi),%r11
		20: movnti %r8,(%rdi)
		21: movnti %r9,8(%rdi)
		22: movnti %r10,16(%rdi)
		23: movnti %r11,24(%rdi)
		30: movq 32(%rsi),%r8
		31: movq 40(%rsi),%r9
		32: movq 48(%rsi),%r10
		33: movq 56(%rsi),%r11
		40: movnti %r8,32(%rdi)
		41: movnti %r9,40(%rdi)
		42: movnti %r10,48(%rdi)
		43: movnti %r11,56(%rdi)

		addq $64,%rsi
		addq $64,%rdi
		sub $64,%edx
		cmp $64,%edx
		jae .Lunrolled

		/*
		* First set of user mode loads have been done
		* without any stores, so if they fail, we can
		* just try the non-unrolled loop.
		*/
		_ASM_EXTABLE_UA(10b, .Lquadwords)
		_ASM_EXTABLE_UA(11b, .Lquadwords)
		_ASM_EXTABLE_UA(12b, .Lquadwords)
		_ASM_EXTABLE_UA(13b, .Lquadwords)

		/*
		* The second set of user mode loads have been
		* done with 32 bytes stored to the destination,
		* so we need to take that into account before
		* falling back to the non-unrolled loop.
		*/
		_ASM_EXTABLE_UA(30b, .Lfixup32)
		_ASM_EXTABLE_UA(31b, .Lfixup32)
		_ASM_EXTABLE_UA(32b, .Lfixup32)
		_ASM_EXTABLE_UA(33b, .Lfixup32)

		/*
		* An exception on a write means that we're
		* done, but we need to update the count
		* depending on where in the unrolled loop
		* we were.
		*/
		_ASM_EXTABLE_UA(20b, .Ldone0)
		_ASM_EXTABLE_UA(21b, .Ldone8)
		_ASM_EXTABLE_UA(22b, .Ldone16)
		_ASM_EXTABLE_UA(23b, .Ldone24)
		_ASM_EXTABLE_UA(40b, .Ldone32)
		_ASM_EXTABLE_UA(41b, .Ldone40)
		_ASM_EXTABLE_UA(42b, .Ldone48)
		_ASM_EXTABLE_UA(43b, .Ldone56)

		/* Copy 8 bytes at a time for as long as at least 8 bytes remain */
		.Lquadwords:
		cmp $8,%edx
		jb .Llong
		50: movq (%rsi),%rax
		51: movnti %rax,(%rdi)
		addq $8,%rsi
		addq $8,%rdi
		sub $8,%edx
		jmp .Lquadwords

		/*
		* If we fail on the last full quadword, we will
		* not try to do any byte-wise cached accesses.
		* We will try to do one more 4-byte uncached
		* one, though.
		*/
		_ASM_EXTABLE_UA(50b, .Llast4)
		_ASM_EXTABLE_UA(51b, .Ldone0)

		/*
		* Fewer than 8 bytes remain. Bits 2, 1 and 0 of the count select
		* one 4-byte, one 2-byte and one 1-byte copy respectively.
		*/
		.Llong:
		test $4,%dl
		je .Lword
		60: movl (%rsi),%eax
		61: movnti %eax,(%rdi)
		addq $4,%rsi
		addq $4,%rdi
		sub $4,%edx
		.Lword:
		/*
		* No non-temporal stores are issued after this point (the 2-
		* and 1-byte stores below are regular ones), so fence here.
		*/
		sfence
		test $2,%dl
		je .Lbyte
		70: movw (%rsi),%ax
		71: movw %ax,(%rdi)
		addq $2,%rsi
		addq $2,%rdi
		sub $2,%edx
		.Lbyte:
		test $1,%dl
		je .Ldone
		80: movb (%rsi),%al
		81: movb %al,(%rdi)
		dec %edx
		/* Common exit: return the remaining count (0 when all was copied) */
		.Ldone:
		mov %edx,%eax
		RET

		/*
		* If we fail on the last four bytes, we won't
		* bother with any fixups. It's dead, Jim. Note
		* that there's no need for 'sfence' for any
		* of this, since the exception will have been
		* serializing.
		*/
		_ASM_EXTABLE_UA(60b, .Ldone)
		_ASM_EXTABLE_UA(61b, .Ldone)
		_ASM_EXTABLE_UA(70b, .Ldone)
		_ASM_EXTABLE_UA(71b, .Ldone)
		_ASM_EXTABLE_UA(80b, .Ldone)
		_ASM_EXTABLE_UA(81b, .Ldone)

		/*
		* This is the "head needs aligning" case when
		* the destination isn't 8-byte aligned. The
		* 4-byte case can be done uncached, but any
		* smaller alignment is done with regular stores.
		*/
		.Lalign:
		/* Odd destination: copy one byte, unless the count is zero */
		test $1,%dil
		je .Lalign_word
		test %edx,%edx
		je .Ldone
		90: movb (%rsi),%al
		91: movb %al,(%rdi)
		inc %rsi
		inc %rdi
		dec %edx
		.Lalign_word:
		/*
		* Destination not 4-byte aligned: copy one 16-bit word, or
		* finish in the byte tail if fewer than 2 bytes remain.
		*/
		test $2,%dil
		je .Lalign_long
		cmp $2,%edx
		jb .Lbyte
		92: movw (%rsi),%ax
		93: movw %ax,(%rdi)
		addq $2,%rsi
		addq $2,%rdi
		sub $2,%edx
		.Lalign_long:
		/*
		* Destination not 8-byte aligned: do one 4-byte uncached copy,
		* or finish in the cached tail if fewer than 4 bytes remain.
		*/
		test $4,%dil
		je .Lis_aligned
		cmp $4,%edx
		jb .Lword
		94: movl (%rsi),%eax
		95: movnti %eax,(%rdi)
		addq $4,%rsi
		addq $4,%rdi
		sub $4,%edx
		jmp .Lis_aligned

		/*
		* If we fail on the initial alignment accesses,
		* we're all done. Again, no point in trying to
		* do byte-by-byte probing if the 4-byte load
		* fails - we're not doing any uncached accesses
		* any more.
		*/
		_ASM_EXTABLE_UA(90b, .Ldone)
		_ASM_EXTABLE_UA(91b, .Ldone)
		_ASM_EXTABLE_UA(92b, .Ldone)
		_ASM_EXTABLE_UA(93b, .Ldone)
		_ASM_EXTABLE_UA(94b, .Ldone)
		_ASM_EXTABLE_UA(95b, .Ldone)

		/*
		* Exception table fixups for faults in the middle
		*
		* A store in the unrolled loop faulted. Each entry point falls
		* through all the ones below it, so .LdoneNN subtracts NN bytes:
		* the amount already stored in this iteration before the fault.
		*/
		.Ldone56: sub $8,%edx
		.Ldone48: sub $8,%edx
		.Ldone40: sub $8,%edx
		.Ldone32: sub $8,%edx
		.Ldone24: sub $8,%edx
		.Ldone16: sub $8,%edx
		.Ldone8: sub $8,%edx
		.Ldone0:
		mov %edx,%eax
		RET

		/*
		* A load in the second half of the unrolled loop faulted: the
		* first 32 bytes of this iteration are already stored, so step
		* past them and retry the rest 8 bytes at a time.
		*/
		.Lfixup32:
		addq $32,%rsi
		addq $32,%rdi
		sub $32,%edx
		jmp .Lquadwords

		/*
		* The 8-byte load in .Lquadwords faulted: try one final 4-byte
		* uncached copy and return. A fault here returns the count as is.
		*/
		.Llast4:
		52: movl (%rsi),%eax
		53: movnti %eax,(%rdi)
		sfence
		sub $4,%edx
		mov %edx,%eax
		RET
		_ASM_EXTABLE_UA(52b, .Ldone0)
		_ASM_EXTABLE_UA(53b, .Ldone0)

		SYM_FUNC_END(__copy_user_nocache)
		EXPORT_SYMBOL(__copy_user_nocache)