Commit 066c4091 authored by Heiko Carstens
Browse files

s390/fpu: decrease stack usage for some cases



The kernel_fpu structure has a quite large size of 520 bytes. In order to
reduce the stack footprint, introduce several kernel fpu structures with
different and also smaller sizes. This way every kernel fpu user must use
the correct variant. A compile time check verifies that the correct variant
is used.

There are several users which use only 16 instead of all 32 vector
registers. For those users the new kernel_fpu_16 structure with a size of
only 264 bytes can be used.

Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
parent cad8c3ab
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -22,7 +22,7 @@ static void chacha20_crypt_s390(u32 *state, u8 *dst, const u8 *src,
				unsigned int nbytes, const u32 *key,
				u32 *counter)
{
	DECLARE_KERNEL_FPU_ONSTACK(vxstate);
	DECLARE_KERNEL_FPU_ONSTACK32(vxstate);

	kernel_fpu_begin(&vxstate, KERNEL_VXR);
	chacha20_vx(dst, src, nbytes, key, counter);
+1 −1
Original line number Diff line number Diff line
@@ -50,7 +50,7 @@ u32 crc32c_le_vgfm_16(u32 crc, unsigned char const *buf, size_t size);
				unsigned char const *data, size_t datalen)  \
	{								    \
		unsigned long prealign, aligned, remaining;		    \
		DECLARE_KERNEL_FPU_ONSTACK(vxstate);			    \
		DECLARE_KERNEL_FPU_ONSTACK16(vxstate);			    \
									    \
		if (datalen < VX_MIN_LEN + VX_ALIGN_MASK)		    \
			return ___crc32_sw(crc, data, datalen);		    \
+24 −6
Original line number Diff line number Diff line
@@ -16,14 +16,32 @@ struct fpu {
	__vector128 vxrs[__NUM_VXRS] __aligned(8);
};

/* In-kernel FPU state structure */
struct kernel_fpu {
struct kernel_fpu_hdr {
	int	mask;
	u32	fpc;
	__vector128 vxrs[__NUM_VXRS] __aligned(8);
};

#define DECLARE_KERNEL_FPU_ONSTACK(name)	\
	struct kernel_fpu name __uninitialized
/*
 * Generic view of an in-kernel FPU save area: the common header followed
 * by a flexible array of vector register slots. The kernel_fpu_begin() and
 * kernel_fpu_end() macros cast pointers to the sized struct kernel_fpu_<N>
 * variants to this type before calling the helper functions.
 */
struct kernel_fpu {
	struct kernel_fpu_hdr hdr;
	__vector128 vxrs[] __aligned(8);
};

/*
 * Define struct kernel_fpu_<vxr_size>: a save area consisting of the common
 * header followed by exactly @vxr_size vector register slots. The member
 * order matches struct kernel_fpu (hdr first, then vxrs).
 */
#define KERNEL_FPU_STRUCT(vxr_size)				\
struct kernel_fpu_##vxr_size {					\
	struct kernel_fpu_hdr hdr;				\
	__vector128 vxrs[vxr_size] __aligned(8);		\
}

/* Save areas with room for 16 and for 32 vector registers. */
KERNEL_FPU_STRUCT(16);
KERNEL_FPU_STRUCT(32);

/*
 * Declare a variable @name of type struct kernel_fpu_<vxr_size>, tagged
 * __uninitialized. @vxr_size must be one of the sizes instantiated with
 * KERNEL_FPU_STRUCT().
 */
#define DECLARE_KERNEL_FPU_ONSTACK(vxr_size, name)		\
	struct kernel_fpu_##vxr_size name __uninitialized

/* Declare a save area @name with 16 vector register slots. */
#define DECLARE_KERNEL_FPU_ONSTACK16(name)			\
	DECLARE_KERNEL_FPU_ONSTACK(16, name)

/* Declare a save area @name with 32 vector register slots. */
#define DECLARE_KERNEL_FPU_ONSTACK32(name)			\
	DECLARE_KERNEL_FPU_ONSTACK(32, name)

#endif /* _ASM_S390_FPU_TYPES_H */
+42 −6
Original line number Diff line number Diff line
@@ -162,28 +162,64 @@ static __always_inline void load_fp_regs_vx(__vector128 *vxrs)
	__load_fp_regs(fprs, sizeof(__vector128) / sizeof(freg_t));
}

static inline void kernel_fpu_begin(struct kernel_fpu *state, int flags)
static inline void _kernel_fpu_begin(struct kernel_fpu *state, int flags)
{
	state->mask = READ_ONCE(current->thread.kfpu_flags);
	state->hdr.mask = READ_ONCE(current->thread.kfpu_flags);
	if (!test_thread_flag(TIF_FPU)) {
		/* Save user space FPU state and register contents */
		save_user_fpu_regs();
	} else if (state->mask & flags) {
	} else if (state->hdr.mask & flags) {
		/* Save FPU/vector register in-use by the kernel */
		__kernel_fpu_begin(state, flags);
	}
	__atomic_or(flags, &current->thread.kfpu_flags);
}

static inline void kernel_fpu_end(struct kernel_fpu *state, int flags)
static inline void _kernel_fpu_end(struct kernel_fpu *state, int flags)
{
	WRITE_ONCE(current->thread.kfpu_flags, state->mask);
	if (state->mask & flags) {
	WRITE_ONCE(current->thread.kfpu_flags, state->hdr.mask);
	if (state->hdr.mask & flags) {
		/* Restore FPU/vector register in-use by the kernel */
		__kernel_fpu_end(state, flags);
	}
}

void __kernel_fpu_invalid_size(void);

/*
 * Check that a save area with @size vector register slots matches the
 * register ranges requested in @flags. Every KERNEL_VXR_* range bit that is
 * set accounts for eight registers; if the total differs from @size,
 * __kernel_fpu_invalid_size() is called.
 */
static __always_inline void kernel_fpu_check_size(int flags, unsigned int size)
{
	unsigned int expected;

	expected  = (flags & KERNEL_VXR_V0V7)   ? 8 : 0;
	expected += (flags & KERNEL_VXR_V8V15)  ? 8 : 0;
	expected += (flags & KERNEL_VXR_V16V23) ? 8 : 0;
	expected += (flags & KERNEL_VXR_V24V31) ? 8 : 0;
	if (expected == size)
		return;
	__kernel_fpu_invalid_size();
}

/*
 * kernel_fpu_begin - start a section that uses FPU/vector registers
 * @state: pointer to a sized save area (struct kernel_fpu_<N>)
 * @flags: mask of KERNEL_* register ranges the caller is going to use
 *
 * The number of vector register slots in @state is checked against @flags
 * with kernel_fpu_check_size() before _kernel_fpu_begin() is called with
 * @state cast to the generic struct kernel_fpu.
 *
 * Both arguments are evaluated exactly once. The body is wrapped in
 * do { } while (0) so the macro behaves as a single statement (e.g. in an
 * unbraced if/else), and the temporaries use reserved-style names so they
 * cannot capture identifiers that appear in the caller's arguments.
 */
#define kernel_fpu_begin(state, flags)					\
do {									\
	typeof(state) __state = (state);				\
	int __flags = (flags);						\
									\
	kernel_fpu_check_size(__flags, ARRAY_SIZE(__state->vxrs));	\
	_kernel_fpu_begin((struct kernel_fpu *)__state, __flags);	\
} while (0)

/*
 * kernel_fpu_end - finish a section that used FPU/vector registers
 * @state: pointer to a sized save area (struct kernel_fpu_<N>)
 * @flags: mask of KERNEL_* register ranges the caller has used
 *
 * The number of vector register slots in @state is checked against @flags
 * with kernel_fpu_check_size() before _kernel_fpu_end() is called with
 * @state cast to the generic struct kernel_fpu.
 *
 * Both arguments are evaluated exactly once. The body is wrapped in
 * do { } while (0) so the macro behaves as a single statement (e.g. in an
 * unbraced if/else), and the temporaries use reserved-style names so they
 * cannot capture identifiers that appear in the caller's arguments.
 */
#define kernel_fpu_end(state, flags)					\
do {									\
	typeof(state) __state = (state);				\
	int __flags = (flags);						\
									\
	kernel_fpu_check_size(__flags, ARRAY_SIZE(__state->vxrs));	\
	_kernel_fpu_end((struct kernel_fpu *)__state, __flags);	\
} while (0)

static inline void save_kernel_fpu_regs(struct thread_struct *thread)
{
	struct fpu *state = &thread->kfpu;
+24 −24
Original line number Diff line number Diff line
@@ -19,41 +19,41 @@ void __kernel_fpu_begin(struct kernel_fpu *state, int flags)
	 * Limit the save to the FPU/vector registers already
	 * in use by the previous context.
	 */
	flags &= state->mask;
	flags &= state->hdr.mask;
	if (flags & KERNEL_FPC)
		fpu_stfpc(&state->fpc);
		fpu_stfpc(&state->hdr.fpc);
	if (!cpu_has_vx()) {
		if (flags & KERNEL_VXR_LOW)
			save_fp_regs_vx(state->vxrs);
			save_fp_regs_vx(vxrs);
		return;
	}
	mask = flags & KERNEL_VXR;
	if (mask == KERNEL_VXR) {
		fpu_vstm(0, 15, &vxrs[0]);
		fpu_vstm(16, 31, &vxrs[16]);
		vxrs += fpu_vstm(0, 15, vxrs);
		vxrs += fpu_vstm(16, 31, vxrs);
		return;
	}
	if (mask == KERNEL_VXR_MID) {
		fpu_vstm(8, 23, &vxrs[8]);
		vxrs += fpu_vstm(8, 23, vxrs);
		return;
	}
	mask = flags & KERNEL_VXR_LOW;
	if (mask) {
		if (mask == KERNEL_VXR_LOW)
			fpu_vstm(0, 15, &vxrs[0]);
			vxrs += fpu_vstm(0, 15, vxrs);
		else if (mask == KERNEL_VXR_V0V7)
			fpu_vstm(0, 7, &vxrs[0]);
			vxrs += fpu_vstm(0, 7, vxrs);
		else
			fpu_vstm(8, 15, &vxrs[8]);
			vxrs += fpu_vstm(8, 15, vxrs);
	}
	mask = flags & KERNEL_VXR_HIGH;
	if (mask) {
		if (mask == KERNEL_VXR_HIGH)
			fpu_vstm(16, 31, &vxrs[16]);
			vxrs += fpu_vstm(16, 31, vxrs);
		else if (mask == KERNEL_VXR_V16V23)
			fpu_vstm(16, 23, &vxrs[16]);
			vxrs += fpu_vstm(16, 23, vxrs);
		else
			fpu_vstm(24, 31, &vxrs[24]);
			vxrs += fpu_vstm(24, 31, vxrs);
	}
}
EXPORT_SYMBOL(__kernel_fpu_begin);
@@ -68,41 +68,41 @@ void __kernel_fpu_end(struct kernel_fpu *state, int flags)
	 * previous context that have been overwritten by the
	 * current context.
	 */
	flags &= state->mask;
	flags &= state->hdr.mask;
	if (flags & KERNEL_FPC)
		fpu_lfpc(&state->fpc);
		fpu_lfpc(&state->hdr.fpc);
	if (!cpu_has_vx()) {
		if (flags & KERNEL_VXR_LOW)
			load_fp_regs_vx(state->vxrs);
			load_fp_regs_vx(vxrs);
		return;
	}
	mask = flags & KERNEL_VXR;
	if (mask == KERNEL_VXR) {
		fpu_vlm(0, 15, &vxrs[0]);
		fpu_vlm(16, 31, &vxrs[16]);
		vxrs += fpu_vlm(0, 15, vxrs);
		vxrs += fpu_vlm(16, 31, vxrs);
		return;
	}
	if (mask == KERNEL_VXR_MID) {
		fpu_vlm(8, 23, &vxrs[8]);
		vxrs += fpu_vlm(8, 23, vxrs);
		return;
	}
	mask = flags & KERNEL_VXR_LOW;
	if (mask) {
		if (mask == KERNEL_VXR_LOW)
			fpu_vlm(0, 15, &vxrs[0]);
			vxrs += fpu_vlm(0, 15, vxrs);
		else if (mask == KERNEL_VXR_V0V7)
			fpu_vlm(0, 7, &vxrs[0]);
			vxrs += fpu_vlm(0, 7, vxrs);
		else
			fpu_vlm(8, 15, &vxrs[8]);
			vxrs += fpu_vlm(8, 15, vxrs);
	}
	mask = flags & KERNEL_VXR_HIGH;
	if (mask) {
		if (mask == KERNEL_VXR_HIGH)
			fpu_vlm(16, 31, &vxrs[16]);
			vxrs += fpu_vlm(16, 31, vxrs);
		else if (mask == KERNEL_VXR_V16V23)
			fpu_vlm(16, 23, &vxrs[16]);
			vxrs += fpu_vlm(16, 23, vxrs);
		else
			fpu_vlm(24, 31, &vxrs[24]);
			vxrs += fpu_vlm(24, 31, vxrs);
	}
}
EXPORT_SYMBOL(__kernel_fpu_end);
Loading