Add combiner support for bfloat16.

2025-10-06  Michael Meissner  <meissner@linux.ibm.com>

gcc/

	* config.gcc (powerpc*-*-*): Add float16.o.
	* config/rs6000/float16.cc: New file.
	* config/rs6000/float16.md (various): Update comments.
	(<fp16_binary_name>bf3): Move code to bfloat16_expand_binary_op.
	(<fp16_binary_name>bf3_internal1): New combiner insns.
	(<fp16_binary_name>bf3_internal2): Likewise.
	(<fp16_binary_name>bf3_internal3): Likewise.
	(<fp16_binary_name>bf3_internal4): Likewise.
	* config/rs6000/rs6000-proto.h (bfloat16_expand_binary_op): New
	declaration.
	* config/rs6000/t-rs6000 (float16.o): New build rule.
This commit is contained in:
Michael Meissner 2025-10-06 23:12:47 -04:00
parent 9f5fe9d47f
commit d60ac67a7c
5 changed files with 297 additions and 59 deletions

View File

@ -524,6 +524,7 @@ powerpc*-*-*)
extra_objs="rs6000-string.o rs6000-p8swap.o rs6000-logue.o"
extra_objs="${extra_objs} rs6000-call.o rs6000-pcrel-opt.o"
extra_objs="${extra_objs} rs6000-builtins.o rs6000-builtin.o"
extra_objs="${extra_objs} float16.o"
extra_headers="ppc-asm.h altivec.h htmintrin.h htmxlintrin.h"
extra_headers="${extra_headers} bmi2intrin.h bmiintrin.h"
extra_headers="${extra_headers} xmmintrin.h mm_malloc.h emmintrin.h"

View File

@ -0,0 +1,167 @@
/* Subroutines for 16-bit floating point support on the PowerPC architecture.
Copyright (C) 2002-2025 Free Software Foundation, Inc.
Contributed by Zack Weinberg <zack@codesourcery.com>
and Paolo Bonzini <bonzini@gnu.org>
This file is part of GCC.
GCC is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published
by the Free Software Foundation; either version 3, or (at your
option) any later version.
GCC is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
License for more details.
You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3. If not see
<http://www.gnu.org/licenses/>. */
/* 16-bit floating point support. */
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "rtl.h"
#include "tree.h"
#include "memmodel.h"
#include "tm_p.h"
#include "stringpool.h"
#include "expmed.h"
#include "optabs.h"
#include "regs.h"
#include "insn-attr.h"
#include "flags.h"
#include "attribs.h"
#include "explow.h"
#include "expr.h"
#include "common/common-target.h"
#include "rs6000-internal.h"
/* Expand a bfloat16 floating point binary operation:
ICODE: Operation to perform.
OP0: Result (BFmode or SFmode).
OP1: First input argument (BFmode or SFmode).
OP2: Second input argument (BFmode or SFmode).
TMP0: Temporary for result (V4SFmode).
TMP1: Temporary for first input argument (V4SFmode).
TMP2: Temporary for second input argument (V4SFmode).
The operation is done as a V4SFmode vector operation. This is because
converting BFmode from a scalar BFmode to SFmode to do the operation and
back again takes quite a bit of time. GCC will only generate the native
operation if -Ofast is used. The float16.md code that calls this function
adds various combine operations to do the operation in V4SFmode instead of
SFmode. */
void
bfloat16_expand_binary_op (enum rtx_code icode,
rtx op0,
rtx op1,
rtx op2,
rtx tmp0,
rtx tmp1,
rtx tmp2)
{
if (GET_CODE (tmp0) == SCRATCH)
tmp0 = gen_reg_rtx (V4SFmode);
if (GET_CODE (tmp1) == SCRATCH)
tmp1 = gen_reg_rtx (V4SFmode);
if (GET_CODE (tmp2) == SCRATCH)
tmp2 = gen_reg_rtx (V4SFmode);
/* Convert operand1 and operand2 to V4SFmode format. We use SPLAT for
registers to get the value into the upper 32-bits. We can use XXSPLTW
to splat words instead of VSPLTIH since the XVCVBF16SPN instruction
ignores the odd half-words, and XXSPLTW can operate on all VSX registers
instead of just the Altivec registers. Using SPLAT instead of a shift
also insure that other bits are not a signalling NaN. If we are using
XXSPLTIW or XXSPLTIB to load the constant the other bits are duplicated. */
/* Operand1. */
if (GET_MODE (op1) == BFmode)
{
emit_insn (gen_xxspltw_bf (tmp1, op1));
emit_insn (gen_xvcvbf16spn_bf (tmp1, tmp1));
}
else if (GET_MODE (op1) == SFmode)
emit_insn (gen_vsx_splat_v4sf (tmp1,
force_reg (SFmode, op1)));
else
gcc_unreachable ();
/* Operand2. */
if (GET_MODE (op2) == BFmode)
{
if (REG_P (op2) || SUBREG_P (op2))
emit_insn (gen_xxspltw_bf (tmp2, op2));
else if (op2 == CONST0_RTX (BFmode))
emit_move_insn (tmp2, CONST0_RTX (V4SFmode));
else if (fp16_xxspltiw_constant (op2, BFmode))
{
rtx op2_bf = gen_lowpart (BFmode, tmp2);
emit_move_insn (op2_bf, op2);
}
else
gcc_unreachable ();
emit_insn (gen_xvcvbf16spn_bf (tmp2, tmp2));
}
else if (GET_MODE (op2) == SFmode)
{
if (REG_P (op2) || SUBREG_P (op2))
emit_insn (gen_vsx_splat_v4sf (tmp2, op2));
else if (op2 == CONST0_RTX (SFmode))
emit_move_insn (tmp2, CONST0_RTX (V4SFmode));
else if (GET_CODE (op2) == CONST_DOUBLE)
{
rtvec v = rtvec_alloc (4);
RTVEC_ELT (v, 0) = op2;
RTVEC_ELT (v, 1) = op2;
RTVEC_ELT (v, 2) = op2;
RTVEC_ELT (v, 3) = op2;
emit_insn (gen_rtx_SET (tmp2,
gen_rtx_CONST_VECTOR (V4SFmode, v)));
}
else
emit_insn (gen_vsx_splat_v4sf (tmp2,
force_reg (SFmode, op2)));
}
else
gcc_unreachable ();
/* Do the operation in V4SFmode. */
emit_insn (gen_rtx_SET (tmp0,
gen_rtx_fmt_ee (icode, V4SFmode, tmp1, tmp2)));
/* Convert V4SF result back to scalar mode. */
if (GET_MODE (op0) == BFmode)
emit_insn (gen_xvcvspbf16_bf (op0, tmp0));
else if (GET_MODE (op0) == SFmode)
{
rtx element = GEN_INT (WORDS_BIG_ENDIAN ? 2 : 3);
emit_insn (gen_vsx_extract_v4sf (op0, tmp0, element));
}
else
gcc_unreachable ();
}

View File

@ -244,8 +244,12 @@
}
[(set_attr "type" "fpsimple")])
;; Use DFmode to convert to/from 16-bit floating point types for
;; scalar floating point types other than SF/DFmode.
;; Convert between HFmode/BFmode and 128-bit binary floating point and
;; decimal floating point types. We use convert_move since some of the
;; types might not have valid RTX expanders. We use DFmode as the
;; intermediate conversion destination.
(define_expand "extend<FP16_HW:mode><fp16_float_convert:mode>2"
[(set (match_operand:fp16_float_convert 0 "vsx_register_operand")
(float_extend:fp16_float_convert
@ -254,10 +258,6 @@
{
rtx df_tmp = gen_reg_rtx (DFmode);
emit_insn (gen_extend<FP16_HW:mode>df2 (df_tmp, operands[1]));
/* convert_move handles things like conversion to Decimal types that
we don't have extenddfdd2 insns, so a call is made to do the
conversion. */
convert_move (operands[0], df_tmp, 0);
DONE;
})
@ -270,11 +270,7 @@
{
rtx df_tmp = gen_reg_rtx (DFmode);
/* convert_move handles things like conversion from Decimal types
that we don't have truncdddf2 insns, so a call is made for
the conversion. */
convert_move (df_tmp, operands[1], 0);
emit_insn (gen_truncdf<FP16_HW:mode>2 (operands[0], df_tmp));
DONE;
})
@ -329,8 +325,10 @@
DONE;
})
;; Convert the even elements of a vector 16-bit floating point to
;; V4SFmode.
;; V4SFmode. Deal with little endian vs. big endian element ordering
;; in identifying which elements are converted.
(define_expand "cvt_fp16_to_v4sf_<mode>"
[(set (match_operand:V4SF 0 "vsx_register_operand")
@ -422,57 +420,121 @@
"&& 1"
[(pc)]
{
rtx op0 = operands[0];
rtx op1 = operands[1];
rtx op2 = operands[2];
rtx tmp0 = operands[3];
rtx tmp1 = operands[4];
rtx tmp2 = operands[5];
bfloat16_expand_binary_op (<CODE>,
operands[0],
operands[1],
operands[2],
operands[3],
operands[4],
operands[5]);
DONE;
}
[(set_attr "type" "vecperm")
(set_attr "length" "24,24,32")])
if (GET_CODE (tmp0) == SCRATCH)
tmp0 = gen_reg_rtx (V4SFmode);
;; Combiner pattern: an SFmode binary operation whose two inputs are both
;; BFmode registers extended to SFmode.  Operands 3, 4 and 5 are V4SFmode
;; scratch registers.  The insn is never output directly ("#"); it is
;; always split ("&& 1") by calling bfloat16_expand_binary_op, which
;; receives the result, the two inputs and the three scratch operands.
(define_insn_and_split "*<fp16_binary_name>bf3_internal1"
[(set (match_operand:SF 0 "vsx_register_operand" "=wa")
(fp16_binary_op:SF
(float_extend:SF
(match_operand:BF 1 "vsx_register_operand" "wa"))
(float_extend:SF
(match_operand:BF 2 "vsx_register_operand" "wa"))))
(clobber (match_scratch:V4SF 3 "=&wa"))
(clobber (match_scratch:V4SF 4 "=&wa"))
(clobber (match_scratch:V4SF 5 "=&wa"))]
"TARGET_BFLOAT16_HW"
"#"
"&& 1"
[(pc)]
{
bfloat16_expand_binary_op (<CODE>,
operands[0],
operands[1],
operands[2],
operands[3],
operands[4],
operands[5]);
DONE;
}
[(set_attr "type" "vecperm")
(set_attr "length" "24")])
if (GET_CODE (tmp1) == SCRATCH)
tmp1 = gen_reg_rtx (V4SFmode);
;; Combiner pattern: a BFmode result formed by truncating an SFmode binary
;; operation whose two inputs are both BFmode registers extended to SFmode.
;; Operands 3, 4 and 5 are V4SFmode scratch registers.  The insn is never
;; output directly ("#"); it is always split ("&& 1") by calling
;; bfloat16_expand_binary_op with the BFmode destination, the two BFmode
;; inputs and the three scratch operands.
(define_insn_and_split "*<fp16_binary_name>bf3_internal2"
[(set (match_operand:BF 0 "vsx_register_operand" "=wa")
(float_truncate:BF
(fp16_binary_op:SF
(float_extend:SF
(match_operand:BF 1 "vsx_register_operand" "wa"))
(float_extend:SF
(match_operand:BF 2 "vsx_register_operand" "wa")))))
(clobber (match_scratch:V4SF 3 "=&wa"))
(clobber (match_scratch:V4SF 4 "=&wa"))
(clobber (match_scratch:V4SF 5 "=&wa"))]
"TARGET_BFLOAT16_HW"
"#"
"&& 1"
[(pc)]
{
bfloat16_expand_binary_op (<CODE>,
operands[0],
operands[1],
operands[2],
operands[3],
operands[4],
operands[5]);
DONE;
}
[(set_attr "type" "vecperm")
(set_attr "length" "24")])
if (GET_CODE (tmp2) == SCRATCH)
tmp2 = gen_reg_rtx (V4SFmode);
;; Combiner pattern: an SFmode binary operation whose first input is a
;; BFmode register extended to SFmode and whose second input is already
;; SFmode.  The second input has three constraint alternatives
;; ("wa", "j", "eP"); the "length" attribute is 24, 24 and 32 for them
;; respectively.  Operands 3, 4 and 5 are V4SFmode scratch registers.
;; The insn is never output directly ("#"); it is always split ("&& 1")
;; by calling bfloat16_expand_binary_op.
(define_insn_and_split "*<fp16_binary_name>bf3_internal3"
[(set (match_operand:SF 0 "vsx_register_operand" "=wa,wa,wa")
(fp16_binary_op:SF
(float_extend:SF
(match_operand:BF 1 "vsx_register_operand" "wa,wa,wa"))
(match_operand:SF 2 "input_operand" "wa,j,eP")))
(clobber (match_scratch:V4SF 3 "=&wa,&wa,&wa"))
(clobber (match_scratch:V4SF 4 "=&wa,&wa,&wa"))
(clobber (match_scratch:V4SF 5 "=&wa,&wa,&wa"))]
"TARGET_BFLOAT16_HW"
"#"
"&& 1"
[(pc)]
{
bfloat16_expand_binary_op (<CODE>,
operands[0],
operands[1],
operands[2],
operands[3],
operands[4],
operands[5]);
DONE;
}
[(set_attr "type" "vecperm")
(set_attr "length" "24,24,32")])
/* Convert operand1 and operand2 to V4SFmode format. We use SPLAT for
registers to get the value into the upper 32-bits. We can use XXSPLTW
to splat words instead of VSPLTIH since the XVCVBF16SPN instruction
ignores the odd half-words, and XXSPLTW can operate on all VSX registers
instead of just the Altivec registers. Using SPLAT instead of a shift
also insure that other bits are not a signalling NaN. If we are using
XXSPLTIW or XXSPLTIB to load the constant the other bits are duplicated. */
/* Operand1. */
emit_insn (gen_xxspltw_bf (tmp1, op1));
emit_insn (gen_xvcvbf16spn_bf (tmp1, tmp1));
/* Operand2. */
if (REG_P (op2) || SUBREG_P (op2))
emit_insn (gen_xxspltw_bf (tmp2, op2));
else if (op2 == CONST0_RTX (BFmode))
emit_move_insn (tmp2, CONST0_RTX (V4SFmode));
else if (fp16_xxspltiw_constant (op2, BFmode))
{
rtx op2_bf = gen_lowpart (BFmode, tmp2);
emit_move_insn (op2_bf, op2);
}
else
gcc_unreachable ();
emit_insn (gen_xvcvbf16spn_bf (tmp2, tmp2));
/* Do the operation in V4SFmode. */
emit_insn (gen_<fp16_binary_name>v4sf3 (tmp0, tmp1, tmp2));
/* Convert V4SF result back to scalar mode. */
emit_insn (gen_xvcvspbf16_bf (op0, tmp0));
(define_insn_and_split "*<fp16_binary_name>bf3_internal4"
[(set (match_operand:BF 0 "vsx_register_operand" "=wa,wa,wa")
(float_truncate:BF
(fp16_binary_op:SF
(float_extend:SF
(match_operand:BF 1 "vsx_register_operand" "wa,wa,wa"))
(match_operand:SF 2 "input_operand" "wa,j,eP"))))
(clobber (match_scratch:V4SF 3 "=&wa,&wa,&wa"))
(clobber (match_scratch:V4SF 4 "=&wa,&wa,&wa"))
(clobber (match_scratch:V4SF 5 "=&wa,&wa,&wa"))]
"TARGET_BFLOAT16_HW"
"#"
"&& 1"
[(pc)]
{
bfloat16_expand_binary_op (<CODE>,
operands[0],
operands[1],
operands[2],
operands[3],
operands[4],
operands[5]);
DONE;
}
[(set_attr "type" "vecperm")
@ -503,7 +565,7 @@
"xvcvbf16spn %x0,%x1"
[(set_attr "type" "vecperm")])
;; Convert a V4SFmode vector back to 16-bit floating point scalar. We
;; Convert a V4SFmode vector to a 16-bit floating point scalar. We
;; only care about the 2nd V4SFmode element, which is the element we
;; converted the 16-bit scalar (4th element) to V4SFmode to do the
;; operation, and converted it back.

View File

@ -258,6 +258,10 @@ extern bool vec_const_128bit_to_bytes (rtx, machine_mode,
extern unsigned constant_generates_lxvkq (vec_const_128bit_type *);
extern unsigned constant_generates_xxspltiw (vec_const_128bit_type *);
extern unsigned constant_generates_xxspltidp (vec_const_128bit_type *);
/* From float16.cc. */
extern void bfloat16_expand_binary_op (enum rtx_code, rtx, rtx, rtx,
rtx, rtx, rtx);
#endif /* RTX_CODE */
#ifdef TREE_CODE

View File

@ -87,6 +87,10 @@ rs6000-c.o: $(srcdir)/config/rs6000/rs6000-c.cc rs6000-builtins.h
$(COMPILE) $<
$(POSTCOMPILE)
float16.o: $(srcdir)/config/rs6000/float16.cc
$(COMPILE) $<
$(POSTCOMPILE)
#$(srcdir)/config/rs6000/fusion.md: $(srcdir)/config/rs6000/genfusion.pl
# $(srcdir)/config/rs6000/genfusion.pl > $(srcdir)/config/rs6000/fusion.md