Commit 836ed3c4, authored by Kristina Martsenko, committed by Catalin Marinas
Browse files

arm64: lib: Use MOPS for memcpy() routines

Make memcpy(), memmove() and memset() use the Armv8.8 FEAT_MOPS
instructions when implemented on the CPU.

The CPY*/SET* instructions copy or set a block of memory of arbitrary
size and alignment. They can be interrupted by the CPU and the copying
resumed later. Their performance is expected to be close to the best
generic copy/set sequence of loads/stores for a given CPU. Using them in
the kernel's copy/set routines therefore avoids the need to periodically
rewrite the routines to optimize for new microarchitectures. It could
also lead to a performance improvement for some CPUs and systems.

With this change the kernel will always use the instructions if they are
implemented on the CPU (and have not been disabled by the arm64.nomops
command line parameter). When not implemented the usual routines will be
used (patched via alternatives). Note, we need to patch B/NOP instead of
the whole sequence to avoid executing a partially patched sequence in
case the compiler generates a mem*() call inside the alternatives
patching code.

Note that MOPS instructions have relaxed behavior on Device memory, but
it is expected that these routines are not generally used on MMIO.

Note: For memcpy(), this uses the CPY* instructions instead of CPYF*, as
CPY* allows overlaps between the source and destination buffers, and
despite contradicting the C standard, compilers require that memcpy()
work on exactly overlapping source and destination:
  https://gcc.gnu.org/onlinedocs/gcc/Standards.html#C-Language
  https://reviews.llvm.org/D86993



Signed-off-by: Kristina Martsenko <kristina.martsenko@arm.com>
Link: https://lore.kernel.org/r/20240930161051.3777828-5-kristina.martsenko@arm.com


Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
parent b616058c
Loading
Loading
Loading
Loading
+3 −0
Original line number Diff line number Diff line
@@ -2155,6 +2155,9 @@ config ARM64_EPAN
	  if the cpu does not implement the feature.
endmenu # "ARMv8.7 architectural features"

# Assembler capability probe: true when the assembler accepts the
# ".arch_extension mops" directive. No prompt, so it is not user-selectable.
config AS_HAS_MOPS
	def_bool $(as-instr,.arch_extension mops)

menu "ARMv8.9 architectural features"

config ARM64_POE
+18 −1
Original line number Diff line number Diff line
@@ -57,7 +57,7 @@
   The loop tail is handled by always copying 64 bytes from the end.
*/

SYM_FUNC_START(__pi_memcpy)
SYM_FUNC_START_LOCAL(__pi_memcpy_generic)
	add	srcend, src, count
	add	dstend, dstin, count
	cmp	count, 128
@@ -238,7 +238,24 @@ L(copy64_from_start):
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstin]
	ret
SYM_FUNC_END(__pi_memcpy_generic)

/*
 * __pi_memcpy entry point.
 *
 * When the assembler supports the mops extension (CONFIG_AS_HAS_MOPS),
 * __pi_memcpy is a small wrapper that uses the FEAT_MOPS CPY* instructions
 * on CPUs that implement them and branches to __pi_memcpy_generic otherwise.
 * Without assembler support, __pi_memcpy is just an alias of the generic
 * routine.
 */
#ifdef CONFIG_AS_HAS_MOPS
	.arch_extension mops
SYM_FUNC_START(__pi_memcpy)
/*
 * Only this single instruction slot is subject to alternatives patching:
 * it stays a branch to the generic routine when ARM64_HAS_MOPS is not set
 * and becomes a NOP when it is, letting execution fall through to the
 * CPY* sequence. Patching one B/NOP rather than the whole sequence avoids
 * running a partially patched sequence if a mem*() call is made from
 * inside the alternatives patching code.
 */
alternative_if_not ARM64_HAS_MOPS
	b	__pi_memcpy_generic
alternative_else_nop_endif

	/*
	 * Copy through dst so that dstin itself is not written by the
	 * '!' (written-back) operands; presumably dstin is the value
	 * returned to the caller -- confirm against the register aliases.
	 */
	mov	dst, dstin
	/*
	 * CPYP/CPYM/CPYE are issued back to back as one copy sequence.
	 * The CPY* forms are used rather than CPYF* because CPY* allows
	 * the source and destination buffers to overlap, and compilers
	 * require memcpy() to work on exactly overlapping buffers.
	 */
	cpyp	[dst]!, [src]!, count!
	cpym	[dst]!, [src]!, count!
	cpye	[dst]!, [src]!, count!
	ret
SYM_FUNC_END(__pi_memcpy)
#else
/* Assembler cannot emit MOPS instructions: use the generic routine directly. */
SYM_FUNC_ALIAS(__pi_memcpy, __pi_memcpy_generic)
#endif

SYM_FUNC_ALIAS(__memcpy, __pi_memcpy)
EXPORT_SYMBOL(__memcpy)
+19 −1
Original line number Diff line number Diff line
@@ -26,6 +26,7 @@
 */

dstin		.req	x0
val_x		.req	x1
val		.req	w1
count		.req	x2
tmp1		.req	x3
@@ -42,7 +43,7 @@ dst .req x8
tmp3w		.req	w9
tmp3		.req	x9

SYM_FUNC_START(__pi_memset)
SYM_FUNC_START_LOCAL(__pi_memset_generic)
	mov	dst, dstin	/* Preserve return value.  */
	and	A_lw, val, #255
	orr	A_lw, A_lw, A_lw, lsl #8
@@ -201,7 +202,24 @@ SYM_FUNC_START(__pi_memset)
	ands	count, count, zva_bits_x
	b.ne	.Ltail_maybe_long
	ret
SYM_FUNC_END(__pi_memset_generic)

/*
 * __pi_memset entry point.
 *
 * When the assembler supports the mops extension (CONFIG_AS_HAS_MOPS),
 * __pi_memset is a small wrapper that uses the FEAT_MOPS SET* instructions
 * on CPUs that implement them and branches to __pi_memset_generic otherwise.
 * Without assembler support, __pi_memset is just an alias of the generic
 * routine.
 */
#ifdef CONFIG_AS_HAS_MOPS
	.arch_extension mops
SYM_FUNC_START(__pi_memset)
/*
 * Only this single instruction slot is subject to alternatives patching:
 * it stays a branch to the generic routine when ARM64_HAS_MOPS is not set
 * and becomes a NOP when it is, letting execution fall through to the
 * SET* sequence. Patching one B/NOP rather than the whole sequence avoids
 * running a partially patched sequence if a mem*() call is made from
 * inside the alternatives patching code.
 */
alternative_if_not ARM64_HAS_MOPS
	b	__pi_memset_generic
alternative_else_nop_endif

	/*
	 * Work on a copy of the destination pointer so that dstin is not
	 * written by the '!' (written-back) operands, preserving it as the
	 * return value just as the generic routine does.
	 */
	mov	dst, dstin
	/*
	 * SETP/SETM/SETE are issued back to back as one set sequence.
	 * The fill value is passed via val_x, the x-register alias of val.
	 */
	setp	[dst]!, count!, val_x
	setm	[dst]!, count!, val_x
	sete	[dst]!, count!, val_x
	ret
SYM_FUNC_END(__pi_memset)
#else
/* Assembler cannot emit MOPS instructions: use the generic routine directly. */
SYM_FUNC_ALIAS(__pi_memset, __pi_memset_generic)
#endif

SYM_FUNC_ALIAS(__memset, __pi_memset)
EXPORT_SYMBOL(__memset)