s390/fpu: decrease stack usage for some cases (066c4091) · Commits · git / linux-net

arch/s390/crypto/chacha-glue.c

+1 −1

Original line number	Diff line number	Diff line
		@@ -22,7 +22,7 @@ static void chacha20_crypt_s390(u32 *state, u8 *dst, const u8 *src,
		unsigned int nbytes, const u32 *key,
		u32 *counter)
		{
		DECLARE_KERNEL_FPU_ONSTACK(vxstate);
		DECLARE_KERNEL_FPU_ONSTACK32(vxstate);

		kernel_fpu_begin(&vxstate, KERNEL_VXR);
		chacha20_vx(dst, src, nbytes, key, counter);

arch/s390/crypto/crc32-vx.c

+1 −1

Original line number	Diff line number	Diff line
		@@ -50,7 +50,7 @@ u32 crc32c_le_vgfm_16(u32 crc, unsigned char const *buf, size_t size);
		unsigned char const *data, size_t datalen) \
		{ \
		unsigned long prealign, aligned, remaining; \
		DECLARE_KERNEL_FPU_ONSTACK(vxstate); \
		DECLARE_KERNEL_FPU_ONSTACK16(vxstate); \
		\
		if (datalen < VX_MIN_LEN + VX_ALIGN_MASK) \
		return ___crc32_sw(crc, data, datalen); \

arch/s390/include/asm/fpu-types.h

+24 −6

Original line number	Diff line number	Diff line
		@@ -16,14 +16,32 @@ struct fpu {
		__vector128 vxrs[__NUM_VXRS] __aligned(8);
		};

		/* In-kernel FPU state structure */
		struct kernel_fpu {
		struct kernel_fpu_hdr {
		int mask;
		u32 fpc;
		__vector128 vxrs[__NUM_VXRS] __aligned(8);
		};

		#define DECLARE_KERNEL_FPU_ONSTACK(name) \
		struct kernel_fpu name __uninitialized
		/*
		 * Size-agnostic view of an in-kernel FPU save area: the common
		 * header followed by a flexible array of vector register slots.
		 * The sized variants generated by KERNEL_FPU_STRUCT() share this
		 * layout and are cast to this type by kernel_fpu_begin() and
		 * kernel_fpu_end() before being handed to the save/restore code.
		 */
		struct kernel_fpu {
		struct kernel_fpu_hdr hdr;
		__vector128 vxrs[] __aligned(8);
		};

		/*
		 * KERNEL_FPU_STRUCT(vxr_size) defines struct kernel_fpu_<vxr_size>:
		 * the common kernel_fpu_hdr followed by storage for exactly
		 * vxr_size vector registers. Because vxrs is a true array here,
		 * ARRAY_SIZE() can be applied to it to recover vxr_size.
		 */
		#define KERNEL_FPU_STRUCT(vxr_size) \
		struct kernel_fpu_##vxr_size { \
		struct kernel_fpu_hdr hdr; \
		__vector128 vxrs[vxr_size] __aligned(8); \
		}

		/* The two save-area sizes that are instantiated: 16 and 32 registers. */
		KERNEL_FPU_STRUCT(16);
		KERNEL_FPU_STRUCT(32);

		/*
		 * Declare a variable @name of type struct kernel_fpu_<vxr_size>.
		 * vxr_size must be one of the sizes instantiated with
		 * KERNEL_FPU_STRUCT() (16 or 32), otherwise the struct type does
		 * not exist.
		 * NOTE(review): __uninitialized is defined outside this view;
		 * presumably it opts the variable out of automatic stack
		 * initialization - confirm.
		 */
		#define DECLARE_KERNEL_FPU_ONSTACK(vxr_size, name) \
		struct kernel_fpu_##vxr_size name __uninitialized

		/* Save area with room for 16 vector registers. */
		#define DECLARE_KERNEL_FPU_ONSTACK16(name) \
		DECLARE_KERNEL_FPU_ONSTACK(16, name)

		/* Save area with room for all 32 vector registers. */
		#define DECLARE_KERNEL_FPU_ONSTACK32(name) \
		DECLARE_KERNEL_FPU_ONSTACK(32, name)

		#endif /* _ASM_S390_FPU_TYPES_H */

arch/s390/include/asm/fpu.h

+42 −6

Original line number	Diff line number	Diff line
		@@ -162,28 +162,64 @@ static __always_inline void load_fp_regs_vx(__vector128 *vxrs)
		__load_fp_regs(fprs, sizeof(__vector128) / sizeof(freg_t));
		}

		static inline void kernel_fpu_begin(struct kernel_fpu *state, int flags)
		static inline void _kernel_fpu_begin(struct kernel_fpu *state, int flags)
		{
		state->mask = READ_ONCE(current->thread.kfpu_flags);
		state->hdr.mask = READ_ONCE(current->thread.kfpu_flags);
		if (!test_thread_flag(TIF_FPU)) {
		/* Save user space FPU state and register contents */
		save_user_fpu_regs();
		} else if (state->mask & flags) {
		} else if (state->hdr.mask & flags) {
		/* Save FPU/vector register in-use by the kernel */
		__kernel_fpu_begin(state, flags);
		}
		__atomic_or(flags, &current->thread.kfpu_flags);
		}

		static inline void kernel_fpu_end(struct kernel_fpu *state, int flags)
		static inline void _kernel_fpu_end(struct kernel_fpu *state, int flags)
		{
		WRITE_ONCE(current->thread.kfpu_flags, state->mask);
		if (state->mask & flags) {
		WRITE_ONCE(current->thread.kfpu_flags, state->hdr.mask);
		if (state->hdr.mask & flags) {
		/* Restore FPU/vector register in-use by the kernel */
		__kernel_fpu_end(state, flags);
		}
		}

		void __kernel_fpu_invalid_size(void);

		/*
		 * Check that a save area with @size vector register slots matches the
		 * register ranges requested in @flags. Each of the four KERNEL_VXR_*
		 * range bits accounts for eight registers; if the total differs from
		 * @size, __kernel_fpu_invalid_size() is called.
		 * NOTE(review): __kernel_fpu_invalid_size() is only declared in this
		 * view; presumably it is left undefined so that a mismatch which the
		 * compiler cannot fold away shows up as a build failure - confirm.
		 */
		static __always_inline void kernel_fpu_check_size(int flags, unsigned int size)
		{
			unsigned int needed;

			needed  = (flags & KERNEL_VXR_V0V7) ? 8 : 0;
			needed += (flags & KERNEL_VXR_V8V15) ? 8 : 0;
			needed += (flags & KERNEL_VXR_V16V23) ? 8 : 0;
			needed += (flags & KERNEL_VXR_V24V31) ? 8 : 0;
			if (needed == size)
				return;
			__kernel_fpu_invalid_size();
		}

		/*
		 * kernel_fpu_begin - size-checked wrapper around _kernel_fpu_begin()
		 * @state: pointer to a sized save area (a struct whose vxrs member is
		 *         a true array, e.g. one declared with
		 *         DECLARE_KERNEL_FPU_ONSTACK16/32)
		 * @flags: KERNEL_* register mask the caller is about to use
		 *
		 * Both arguments are evaluated exactly once. The element count of
		 * @state->vxrs is checked against @flags by kernel_fpu_check_size()
		 * before the save area is passed on as a generic struct kernel_fpu.
		 *
		 * The body is wrapped in do { } while (0) so the macro behaves as a
		 * single statement (safe in unbraced if/else), and the locals carry a
		 * __kfpu_ prefix so they cannot capture a caller variable that
		 * appears in the @state or @flags expressions.
		 */
		#define kernel_fpu_begin(state, flags) \
		do { \
			typeof(state) __kfpu_state = (state); \
			int __kfpu_flags = (flags); \
			\
			kernel_fpu_check_size(__kfpu_flags, ARRAY_SIZE(__kfpu_state->vxrs)); \
			_kernel_fpu_begin((struct kernel_fpu *)__kfpu_state, __kfpu_flags); \
		} while (0)

		/*
		 * kernel_fpu_end - size-checked wrapper around _kernel_fpu_end()
		 * @state: pointer to a sized save area (a struct whose vxrs member is
		 *         a true array, e.g. one declared with
		 *         DECLARE_KERNEL_FPU_ONSTACK16/32)
		 * @flags: KERNEL_* register mask the caller has been using
		 *
		 * Both arguments are evaluated exactly once. The element count of
		 * @state->vxrs is checked against @flags by kernel_fpu_check_size()
		 * before the save area is passed on as a generic struct kernel_fpu.
		 *
		 * The body is wrapped in do { } while (0) so the macro behaves as a
		 * single statement (safe in unbraced if/else), and the locals carry a
		 * __kfpu_ prefix so they cannot capture a caller variable that
		 * appears in the @state or @flags expressions.
		 */
		#define kernel_fpu_end(state, flags) \
		do { \
			typeof(state) __kfpu_state = (state); \
			int __kfpu_flags = (flags); \
			\
			kernel_fpu_check_size(__kfpu_flags, ARRAY_SIZE(__kfpu_state->vxrs)); \
			_kernel_fpu_end((struct kernel_fpu *)__kfpu_state, __kfpu_flags); \
		} while (0)

		static inline void save_kernel_fpu_regs(struct thread_struct *thread)
		{
		struct fpu *state = &thread->kfpu;

arch/s390/kernel/fpu.c

+24 −24

Original line number	Diff line number	Diff line
		@@ -19,41 +19,41 @@ void __kernel_fpu_begin(struct kernel_fpu *state, int flags)
		* Limit the save to the FPU/vector registers already
		* in use by the previous context.
		*/
		flags &= state->mask;
		flags &= state->hdr.mask;
		if (flags & KERNEL_FPC)
		fpu_stfpc(&state->fpc);
		fpu_stfpc(&state->hdr.fpc);
		if (!cpu_has_vx()) {
		if (flags & KERNEL_VXR_LOW)
		save_fp_regs_vx(state->vxrs);
		save_fp_regs_vx(vxrs);
		return;
		}
		mask = flags & KERNEL_VXR;
		if (mask == KERNEL_VXR) {
		fpu_vstm(0, 15, &vxrs[0]);
		fpu_vstm(16, 31, &vxrs[16]);
		vxrs += fpu_vstm(0, 15, vxrs);
		vxrs += fpu_vstm(16, 31, vxrs);
		return;
		}
		if (mask == KERNEL_VXR_MID) {
		fpu_vstm(8, 23, &vxrs[8]);
		vxrs += fpu_vstm(8, 23, vxrs);
		return;
		}
		mask = flags & KERNEL_VXR_LOW;
		if (mask) {
		if (mask == KERNEL_VXR_LOW)
		fpu_vstm(0, 15, &vxrs[0]);
		vxrs += fpu_vstm(0, 15, vxrs);
		else if (mask == KERNEL_VXR_V0V7)
		fpu_vstm(0, 7, &vxrs[0]);
		vxrs += fpu_vstm(0, 7, vxrs);
		else
		fpu_vstm(8, 15, &vxrs[8]);
		vxrs += fpu_vstm(8, 15, vxrs);
		}
		mask = flags & KERNEL_VXR_HIGH;
		if (mask) {
		if (mask == KERNEL_VXR_HIGH)
		fpu_vstm(16, 31, &vxrs[16]);
		vxrs += fpu_vstm(16, 31, vxrs);
		else if (mask == KERNEL_VXR_V16V23)
		fpu_vstm(16, 23, &vxrs[16]);
		vxrs += fpu_vstm(16, 23, vxrs);
		else
		fpu_vstm(24, 31, &vxrs[24]);
		vxrs += fpu_vstm(24, 31, vxrs);
		}
		}
		EXPORT_SYMBOL(__kernel_fpu_begin);
		@@ -68,41 +68,41 @@ void __kernel_fpu_end(struct kernel_fpu *state, int flags)
		* previous context that have been overwritten by the
		* current context.
		*/
		flags &= state->mask;
		flags &= state->hdr.mask;
		if (flags & KERNEL_FPC)
		fpu_lfpc(&state->fpc);
		fpu_lfpc(&state->hdr.fpc);
		if (!cpu_has_vx()) {
		if (flags & KERNEL_VXR_LOW)
		load_fp_regs_vx(state->vxrs);
		load_fp_regs_vx(vxrs);
		return;
		}
		mask = flags & KERNEL_VXR;
		if (mask == KERNEL_VXR) {
		fpu_vlm(0, 15, &vxrs[0]);
		fpu_vlm(16, 31, &vxrs[16]);
		vxrs += fpu_vlm(0, 15, vxrs);
		vxrs += fpu_vlm(16, 31, vxrs);
		return;
		}
		if (mask == KERNEL_VXR_MID) {
		fpu_vlm(8, 23, &vxrs[8]);
		vxrs += fpu_vlm(8, 23, vxrs);
		return;
		}
		mask = flags & KERNEL_VXR_LOW;
		if (mask) {
		if (mask == KERNEL_VXR_LOW)
		fpu_vlm(0, 15, &vxrs[0]);
		vxrs += fpu_vlm(0, 15, vxrs);
		else if (mask == KERNEL_VXR_V0V7)
		fpu_vlm(0, 7, &vxrs[0]);
		vxrs += fpu_vlm(0, 7, vxrs);
		else
		fpu_vlm(8, 15, &vxrs[8]);
		vxrs += fpu_vlm(8, 15, vxrs);
		}
		mask = flags & KERNEL_VXR_HIGH;
		if (mask) {
		if (mask == KERNEL_VXR_HIGH)
		fpu_vlm(16, 31, &vxrs[16]);
		vxrs += fpu_vlm(16, 31, vxrs);
		else if (mask == KERNEL_VXR_V16V23)
		fpu_vlm(16, 23, &vxrs[16]);
		vxrs += fpu_vlm(16, 23, vxrs);
		else
		fpu_vlm(24, 31, &vxrs[24]);
		vxrs += fpu_vlm(24, 31, vxrs);
		}
		}
		EXPORT_SYMBOL(__kernel_fpu_end);