Commit 80dcf0a7 authored by Christoph Hellwig, committed by Andrew Morton
Browse files

xor: pass the entire operation to the low-level ops

Currently the high-level xor code chunks up all operations into small
units for only up to 1 + 4 vectors, and passes them to four different
methods.  This means the FPU/vector context is entered and left a lot for
wide stripes, and a lot of expensive indirect calls are performed.
Switch to passing the entire gen_xor request to the low-level ops, and
provide a macro to dispatch it to the existing helper.

This reduces the number of indirect calls and FPU/vector context switches
by a factor approaching nr_stripes / 4, and also reduces source and binary
code size.

Link: https://lkml.kernel.org/r/20260327061704.3707577-27-hch@lst.de


Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Eric Biggers <ebiggers@kernel.org>
Tested-by: Eric Biggers <ebiggers@kernel.org>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Alexandre Ghiti <alex@ghiti.fr>
Cc: Andreas Larsson <andreas@gaisler.com>
Cc: Anton Ivanov <anton.ivanov@cambridgegreys.com>
Cc: Ard Biesheuvel <ardb@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: "Borislav Petkov (AMD)" <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Chris Mason <clm@fb.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: David Sterba <dsterba@suse.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Huacai Chen <chenhuacai@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jason A. Donenfeld <jason@zx2c4.com>
Cc: Johannes Berg <johannes@sipsolutions.net>
Cc: Li Nan <linan122@huawei.com>
Cc: Madhavan Srinivasan <maddy@linux.ibm.com>
Cc: Magnus Lindholm <linmag7@gmail.com>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Richard Henderson <richard.henderson@linaro.org>
Cc: Richard Weinberger <richard@nod.at>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Song Liu <song@kernel.org>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Ted Ts'o <tytso@mit.edu>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: WANG Xuerui <kernel@xen0n.name>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
parent 0f629e72
Loading
Loading
Loading
Loading
+0 −5
Original line number Diff line number Diff line
@@ -2,11 +2,6 @@
#ifndef _XOR_H
#define _XOR_H

#define MAX_XOR_BLOCKS 4

extern void xor_blocks(unsigned int count, unsigned int bytes,
	void *dest, void **srcs);

void xor_gen(void *dest, void **srcs, unsigned int src_cnt, unsigned int bytes);

#endif /* _XOR_H */
+9 −10
Original line number Diff line number Diff line
@@ -832,18 +832,17 @@ xor_alpha_prefetch_5: \n\
	.end xor_alpha_prefetch_5				\n\
");

DO_XOR_BLOCKS(alpha, xor_alpha_2, xor_alpha_3, xor_alpha_4, xor_alpha_5);

struct xor_block_template xor_block_alpha = {
	.name		= "alpha",
	.do_2	= xor_alpha_2,
	.do_3	= xor_alpha_3,
	.do_4	= xor_alpha_4,
	.do_5	= xor_alpha_5,
	.xor_gen	= xor_gen_alpha,
};

DO_XOR_BLOCKS(alpha_prefetch, xor_alpha_prefetch_2, xor_alpha_prefetch_3,
		xor_alpha_prefetch_4, xor_alpha_prefetch_5);

struct xor_block_template xor_block_alpha_prefetch = {
	.name		= "alpha prefetch",
	.do_2	= xor_alpha_prefetch_2,
	.do_3	= xor_alpha_prefetch_3,
	.do_4	= xor_alpha_prefetch_4,
	.do_5	= xor_alpha_prefetch_5,
	.xor_gen	= xor_gen_alpha_prefetch,
};
+5 −44
Original line number Diff line number Diff line
@@ -5,54 +5,15 @@
#include "xor_impl.h"
#include "xor_arch.h"

extern struct xor_block_template const xor_block_neon_inner;

static void
xor_neon_2(unsigned long bytes, unsigned long * __restrict p1,
	   const unsigned long * __restrict p2)
{
	kernel_neon_begin();
	xor_block_neon_inner.do_2(bytes, p1, p2);
	kernel_neon_end();
}

static void
xor_neon_3(unsigned long bytes, unsigned long * __restrict p1,
	   const unsigned long * __restrict p2,
	   const unsigned long * __restrict p3)
{
	kernel_neon_begin();
	xor_block_neon_inner.do_3(bytes, p1, p2, p3);
	kernel_neon_end();
}

static void
xor_neon_4(unsigned long bytes, unsigned long * __restrict p1,
	   const unsigned long * __restrict p2,
	   const unsigned long * __restrict p3,
	   const unsigned long * __restrict p4)
{
	kernel_neon_begin();
	xor_block_neon_inner.do_4(bytes, p1, p2, p3, p4);
	kernel_neon_end();
}

static void
xor_neon_5(unsigned long bytes, unsigned long * __restrict p1,
	   const unsigned long * __restrict p2,
	   const unsigned long * __restrict p3,
	   const unsigned long * __restrict p4,
	   const unsigned long * __restrict p5)
static void xor_gen_neon(void *dest, void **srcs, unsigned int src_cnt,
		unsigned int bytes)
{
	kernel_neon_begin();
	xor_block_neon_inner.do_5(bytes, p1, p2, p3, p4, p5);
	xor_gen_neon_inner(dest, srcs, src_cnt, bytes);
	kernel_neon_end();
}

struct xor_block_template xor_block_neon = {
	.name		= "neon",
	.do_2	= xor_neon_2,
	.do_3	= xor_neon_3,
	.do_4	= xor_neon_4,
	.do_5	= xor_neon_5
	.xor_gen	= xor_gen_neon,
};
+2 −7
Original line number Diff line number Diff line
@@ -4,6 +4,7 @@
 */

#include "xor_impl.h"
#include "xor_arch.h"

#ifndef __ARM_NEON__
#error You should compile this file with '-march=armv7-a -mfloat-abi=softfp -mfpu=neon'
@@ -22,10 +23,4 @@
#define NO_TEMPLATE
#include "../xor-8regs.c"

struct xor_block_template const xor_block_neon_inner = {
	.name	= "__inner_neon__",
	.do_2	= xor_8regs_2,
	.do_3	= xor_8regs_3,
	.do_4	= xor_8regs_4,
	.do_5	= xor_8regs_5,
};
__DO_XOR_BLOCKS(neon_inner, xor_8regs_2, xor_8regs_3, xor_8regs_4, xor_8regs_5);
+5 −5
Original line number Diff line number Diff line
@@ -127,10 +127,10 @@ xor_arm4regs_5(unsigned long bytes, unsigned long * __restrict p1,
	} while (--lines);
}

DO_XOR_BLOCKS(arm4regs, xor_arm4regs_2, xor_arm4regs_3, xor_arm4regs_4,
		xor_arm4regs_5);

struct xor_block_template xor_block_arm4regs = {
	.name		= "arm4regs",
	.do_2	= xor_arm4regs_2,
	.do_3	= xor_arm4regs_3,
	.do_4	= xor_arm4regs_4,
	.do_5	= xor_arm4regs_5,
	.xor_gen	= xor_gen_arm4regs,
};
Loading