Commit dcd3e1de authored by Heiko Carstens's avatar Heiko Carstens
Browse files

s390/checksum: provide csum_partial_copy_nocheck()



With csum_partial(), which reads all bytes into registers, it is easy to
also implement csum_partial_copy_nocheck(), which copies the buffer while
calculating its checksum.

For a 512 byte buffer this reduces the runtime by 19%. Compared to the old
generic variant (memcpy() + cksm instruction) runtime is reduced by 42%.

Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
parent cb2a1dd5
Loading
Loading
Loading
Loading
+3 −0
Original line number Diff line number Diff line
@@ -32,6 +32,9 @@ static inline __wsum cksm(const void *buff, int len, __wsum sum)

__wsum csum_partial(const void *buff, int len, __wsum sum);

#define _HAVE_ARCH_CSUM_AND_COPY
__wsum csum_partial_copy_nocheck(const void *src, void *dst, int len);

/*
 * Fold a partial checksum without adding pseudo headers.
 */
+10 −0
Original line number Diff line number Diff line
@@ -531,6 +531,16 @@
	MRXBOPC 0, 0x37, v1
.endm

/*
 * VECTOR STORE WITH LENGTH
 *
 * Hand-assembles the instruction from its operands:
 *   v    - vector register operand (converted to a number by VX_NUM)
 *   gr   - general register operand placed in the r3 field; presumably
 *          holds the index of the last byte to store - confirm against
 *          the instruction definition
 *   disp - displacement, placed in the low bits of the second halfword
 *   base - base register, placed in the top four bits of the second
 *          halfword
 *
 * The first halfword combines 0xE700 with the low four bits of the vector
 * register number and the general register number. MRXBOPC is invoked with
 * 0x3f and the vector register number; presumably it emits the remaining
 * halfword (RXB bits and opcode byte) - see its definition.
 */
.macro VSTL	v, gr, disp, base
	VX_NUM	v1, \v
	GR_NUM	b2, \base
	GR_NUM	r3, \gr
	.word	0xE700 | ((v1&15) << 4) | r3
	.word	(b2 << 12) | (\disp)
	MRXBOPC 0, 0x3f, v1
.endm

/* Vector integer instructions */

/* VECTOR AND */
+58 −0
Original line number Diff line number Diff line
@@ -241,6 +241,64 @@ static __always_inline void fpu_vlvgf(u8 v, u32 val, u16 index)

#ifdef CONFIG_CC_IS_CLANG

/*
 * Store vector register v1 to the memory at vxr (clang variant).
 *
 * v1:  vector register number; passed with the "I" constraint, so it must
 *      be a compile-time constant.
 * vxr: destination; sizeof(__vector128) bytes are reported as written to
 *      the instrumentation layer.
 *
 * The address is first loaded into general register 1 with "la", and VST
 * then uses that register as base with a zero displacement. Register 1 is
 * therefore listed as clobbered.
 *
 * NOTE(review): vxr is const-qualified although the pointed-to memory is
 * an output operand of the asm.
 */
static __always_inline void fpu_vst(u8 v1, const void *vxr)
{
	instrument_write(vxr, sizeof(__vector128));
	asm volatile("\n"
		"	la	1,%[vxr]\n"
		"	VST	%[v1],0,,1\n"
		: [vxr] "=R" (*(__vector128 *)vxr)
		: [v1] "I" (v1)
		: "memory", "1");
}

#else /* CONFIG_CC_IS_CLANG */

/*
 * Store vector register v1 to the memory at vxr (non-clang variant).
 *
 * v1:  vector register number; passed with the "I" constraint, so it must
 *      be a compile-time constant.
 * vxr: destination; sizeof(__vector128) bytes are reported as written to
 *      the instrumentation layer.
 *
 * The memory operand is handed to the VST macro in two parts via the %O
 * and %R operand modifiers (presumably displacement and base register -
 * see the compiler's s390 operand modifier documentation), so no scratch
 * register is needed here.
 *
 * NOTE(review): vxr is const-qualified although the pointed-to memory is
 * an output operand of the asm.
 */
static __always_inline void fpu_vst(u8 v1, const void *vxr)
{
	instrument_write(vxr, sizeof(__vector128));
	asm volatile("VST	%[v1],%O[vxr],,%R[vxr]\n"
		     : [vxr] "=Q" (*(__vector128 *)vxr)
		     : [v1] "I" (v1)
		     : "memory");
}

#endif /* CONFIG_CC_IS_CLANG */

#ifdef CONFIG_CC_IS_CLANG

/*
 * Store the leading bytes of vector register v1 to the memory at vxr using
 * VECTOR STORE WITH LENGTH (clang variant).
 *
 * v1:    vector register number; passed with the "I" constraint, so it
 *        must be a compile-time constant.
 * index: passed to VSTL in a general register; treated here as the
 *        zero-based index of the last byte to store.
 * vxr:   destination address.
 *
 * The address is first loaded into general register 1 with "la", and VSTL
 * then uses that register as base with a zero displacement. Register 1 is
 * therefore listed as clobbered.
 *
 * NOTE(review): vxr is const-qualified although the pointed-to memory is
 * an output operand of the asm.
 */
static __always_inline void fpu_vstl(u8 v1, u32 index, const void *vxr)
{
	unsigned int size;

	/* index + 1 bytes, capped at one full vector, are reported as written */
	size = min(index + 1, sizeof(__vector128));
	instrument_write(vxr, size);
	asm volatile("\n"
		"	la	1,%[vxr]\n"
		"	VSTL	%[v1],%[index],0,1\n"
		: [vxr] "=R" (*(u8 *)vxr)
		: [index] "d" (index), [v1] "I" (v1)
		: "memory", "1");
}

#else /* CONFIG_CC_IS_CLANG */

/*
 * Store the leading bytes of vector register v1 to the memory at vxr using
 * VECTOR STORE WITH LENGTH (non-clang variant).
 *
 * v1:    vector register number; passed with the "I" constraint, so it
 *        must be a compile-time constant.
 * index: passed to VSTL in a general register; treated here as the
 *        zero-based index of the last byte to store.
 * vxr:   destination address.
 *
 * The memory operand is handed to the VSTL macro in two parts via the %O
 * and %R operand modifiers (presumably displacement and base register -
 * see the compiler's s390 operand modifier documentation).
 *
 * NOTE(review): vxr is const-qualified although the pointed-to memory is
 * an output operand of the asm.
 */
static __always_inline void fpu_vstl(u8 v1, u32 index, const void *vxr)
{
	unsigned int size;

	/* index + 1 bytes, capped at one full vector, are reported as written */
	size = min(index + 1, sizeof(__vector128));
	instrument_write(vxr, size);
	asm volatile("VSTL	%[v1],%[index],%O[vxr],%R[vxr]\n"
		     : [vxr] "=Q" (*(u8 *)vxr)
		     : [index] "d" (index), [v1] "I" (v1)
		     : "memory");
}

#endif /* CONFIG_CC_IS_CLANG */

#ifdef CONFIG_CC_IS_CLANG

#define fpu_vstm(_v1, _v3, _vxrs)					\
({									\
	unsigned int size = ((_v3) - (_v1) + 1) * sizeof(__vector128);	\
+41 −13
Original line number Diff line number Diff line
@@ -5,8 +5,8 @@
#include <asm/fpu.h>

/*
 * Computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit).
 * Computes the checksum of a memory block at src, length len,
 * and adds in "sum" (32-bit). If copy is true copies to dst.
 *
 * Returns a 32-bit number suitable for feeding into itself
 * or csum_tcpudp_magic.
@@ -14,43 +14,60 @@
 * This function must be called with even lengths, except
 * for the last fragment, which may be odd.
 *
 * It's best to have buff aligned on a 64-bit boundary.
 * It's best to have src and dst aligned on a 64-bit boundary.
 */
__wsum csum_partial(const void *buff, int len, __wsum sum)
static __always_inline __wsum csum_copy(void *dst, const void *src, int len, __wsum sum, bool copy)
{
	DECLARE_KERNEL_FPU_ONSTACK8(vxstate);

	if (!cpu_has_vx())
		return cksm(buff, len, sum);
	if (!cpu_has_vx()) {
		/*
		 * Fallback without the vector facility: optionally copy, then
		 * checksum with cksm().
		 *
		 * When copying, checksum the destination so the result matches
		 * the bytes that were actually stored. When not copying, dst is
		 * not a valid buffer (csum_partial() passes NULL), so the
		 * source must be checksummed instead.
		 */
		if (copy)
			memcpy(dst, src, len);
		return cksm(copy ? dst : src, len, sum);
	}
	kernel_fpu_begin(&vxstate, KERNEL_VXR_V16V23);
	fpu_vlvgf(16, (__force u32)sum, 1);
	fpu_vzero(17);
	fpu_vzero(18);
	fpu_vzero(19);
	while (len >= 64) {
		fpu_vlm(20, 23, buff);
		fpu_vlm(20, 23, src);
		if (copy) {
			fpu_vstm(20, 23, dst);
			dst += 64;
		}
		fpu_vcksm(16, 20, 16);
		fpu_vcksm(17, 21, 17);
		fpu_vcksm(18, 22, 18);
		fpu_vcksm(19, 23, 19);
		buff += 64;
		src += 64;
		len -= 64;
	}
	while (len >= 32) {
		fpu_vlm(20, 21, buff);
		fpu_vlm(20, 21, src);
		if (copy) {
			fpu_vstm(20, 21, dst);
			dst += 32;
		}
		fpu_vcksm(16, 20, 16);
		fpu_vcksm(17, 21, 17);
		buff += 32;
		src += 32;
		len -= 32;
	}
	while (len >= 16) {
		fpu_vl(20, buff);
		fpu_vl(20, src);
		if (copy) {
			fpu_vst(20, dst);
			dst += 16;
		}
		fpu_vcksm(16, 20, 16);
		buff += 16;
		src += 16;
		len -= 16;
	}
	if (len) {
		fpu_vll(20, len - 1, buff);
		fpu_vll(20, len - 1, src);
		if (copy)
			fpu_vstl(20, len - 1, dst);
		fpu_vcksm(16, 20, 16);
	}
	fpu_vcksm(18, 19, 18);
@@ -60,4 +77,15 @@ __wsum csum_partial(const void *buff, int len, __wsum sum)
	kernel_fpu_end(&vxstate, KERNEL_VXR_V16V23);
	return sum;
}

/*
 * Checksum-only entry point: folds len bytes at buff into the partial
 * checksum sum by running csum_copy() with copying disabled and no
 * destination buffer.
 */
__wsum csum_partial(const void *buff, int len, __wsum sum)
{
	const bool copy = false;

	return csum_copy(NULL, buff, len, sum, copy);
}
EXPORT_SYMBOL(csum_partial);

/*
 * Copy len bytes from src to dst and return the checksum of the data,
 * starting from an initial sum of zero. Implemented by csum_copy() with
 * copying enabled.
 */
__wsum csum_partial_copy_nocheck(const void *src, void *dst, int len)
{
	const __wsum initial_sum = 0;

	return csum_copy(dst, src, len, initial_sum, true);
}
EXPORT_SYMBOL(csum_partial_copy_nocheck);