mirror of git://gcc.gnu.org/git/gcc.git
Add combiner support for bfloat16.
2025-10-06 Michael Meissner <meissner@linux.ibm.com> gcc/ * config.gcc (powerpc*-*-*): Add float16.o. * config/rs6000/float16.cc: New file. * config/rs6000/float16.md (various): Update comments. (<fp16_binary_name>bf3): Move code to bfloat16_expand_binary_op. (<fp16_binary_name>bf3_internal1): New combiner insns. (<fp16_binary_name>bf3_internal2): Likewise. (<fp16_binary_name>bf3_internal3): Likewise. (<fp16_binary_name>bf3_internal4): Likewise. * config/rs6000/rs6000-proto.h (bfloat16_expand_binary_op): New declaration. * config/rs6000/t-rs6000 (float16.o): New build rule.
This commit is contained in:
parent
9f5fe9d47f
commit
d60ac67a7c
|
@ -524,6 +524,7 @@ powerpc*-*-*)
|
|||
extra_objs="rs6000-string.o rs6000-p8swap.o rs6000-logue.o"
|
||||
extra_objs="${extra_objs} rs6000-call.o rs6000-pcrel-opt.o"
|
||||
extra_objs="${extra_objs} rs6000-builtins.o rs6000-builtin.o"
|
||||
extra_objs="${extra_objs} float16.o"
|
||||
extra_headers="ppc-asm.h altivec.h htmintrin.h htmxlintrin.h"
|
||||
extra_headers="${extra_headers} bmi2intrin.h bmiintrin.h"
|
||||
extra_headers="${extra_headers} xmmintrin.h mm_malloc.h emmintrin.h"
|
||||
|
|
|
@ -0,0 +1,167 @@
|
|||
/* 16-bit floating point support for the PowerPC architecture.
|
||||
Copyright (C) 2002-2025 Free Software Foundation, Inc.
|
||||
|
||||
Contributed by Zack Weinberg <zack@codesourcery.com>
|
||||
and Paolo Bonzini <bonzini@gnu.org>
|
||||
|
||||
This file is part of GCC.
|
||||
|
||||
GCC is free software; you can redistribute it and/or modify it
|
||||
under the terms of the GNU General Public License as published
|
||||
by the Free Software Foundation; either version 3, or (at your
|
||||
option) any later version.
|
||||
|
||||
GCC is distributed in the hope that it will be useful, but WITHOUT
|
||||
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
||||
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
|
||||
License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with GCC; see the file COPYING3. If not see
|
||||
<http://www.gnu.org/licenses/>. */
|
||||
|
||||
/* 16-bit floating point support. */
|
||||
|
||||
#include "config.h"
|
||||
#include "system.h"
|
||||
#include "coretypes.h"
|
||||
#include "backend.h"
|
||||
#include "rtl.h"
|
||||
#include "tree.h"
|
||||
#include "memmodel.h"
|
||||
#include "tm_p.h"
|
||||
#include "stringpool.h"
|
||||
#include "expmed.h"
|
||||
#include "optabs.h"
|
||||
#include "regs.h"
|
||||
#include "insn-attr.h"
|
||||
#include "flags.h"
|
||||
#include "attribs.h"
|
||||
#include "explow.h"
|
||||
#include "expr.h"
|
||||
#include "common/common-target.h"
|
||||
#include "rs6000-internal.h"
|
||||
|
||||
/* Expand a bfloat16 floating point binary operation:
|
||||
|
||||
ICODE: Operation to perform.
|
||||
OP0: Result (BFmode or SFmode).
|
||||
OP1: First input argument (BFmode or SFmode).
|
||||
OP2: Second input argument (BFmode or SFmode).
|
||||
TMP0: Temporary for result (V4SFmode).
|
||||
TMP1: Temporary for first input argument (V4SFmode).
|
||||
TMP2: Temporary for second input argument (V4SFmode).
|
||||
|
||||
The operation is done as a V4SFmode vector operation. This is because
|
||||
converting BFmode from a scalar BFmode to SFmode to do the operation and
|
||||
back again takes quite a bit of time. GCC will only generate the native
|
||||
operation if -Ofast is used. The float16.md code that calls this function
|
||||
adds various combine operations to do the operation in V4SFmode instead of
|
||||
SFmode. */
|
||||
|
||||
void
|
||||
bfloat16_expand_binary_op (enum rtx_code icode,
|
||||
rtx op0,
|
||||
rtx op1,
|
||||
rtx op2,
|
||||
rtx tmp0,
|
||||
rtx tmp1,
|
||||
rtx tmp2)
|
||||
{
|
||||
if (GET_CODE (tmp0) == SCRATCH)
|
||||
tmp0 = gen_reg_rtx (V4SFmode);
|
||||
|
||||
if (GET_CODE (tmp1) == SCRATCH)
|
||||
tmp1 = gen_reg_rtx (V4SFmode);
|
||||
|
||||
if (GET_CODE (tmp2) == SCRATCH)
|
||||
tmp2 = gen_reg_rtx (V4SFmode);
|
||||
|
||||
/* Convert operand1 and operand2 to V4SFmode format. We use SPLAT for
|
||||
registers to get the value into the upper 32-bits. We can use XXSPLTW
|
||||
to splat words instead of VSPLTIH since the XVCVBF16SPN instruction
|
||||
ignores the odd half-words, and XXSPLTW can operate on all VSX registers
|
||||
instead of just the Altivec registers. Using SPLAT instead of a shift
|
||||
also insure that other bits are not a signalling NaN. If we are using
|
||||
XXSPLTIW or XXSPLTIB to load the constant the other bits are duplicated. */
|
||||
|
||||
/* Operand1. */
|
||||
if (GET_MODE (op1) == BFmode)
|
||||
{
|
||||
emit_insn (gen_xxspltw_bf (tmp1, op1));
|
||||
emit_insn (gen_xvcvbf16spn_bf (tmp1, tmp1));
|
||||
}
|
||||
|
||||
else if (GET_MODE (op1) == SFmode)
|
||||
emit_insn (gen_vsx_splat_v4sf (tmp1,
|
||||
force_reg (SFmode, op1)));
|
||||
|
||||
else
|
||||
gcc_unreachable ();
|
||||
|
||||
/* Operand2. */
|
||||
if (GET_MODE (op2) == BFmode)
|
||||
{
|
||||
if (REG_P (op2) || SUBREG_P (op2))
|
||||
emit_insn (gen_xxspltw_bf (tmp2, op2));
|
||||
|
||||
else if (op2 == CONST0_RTX (BFmode))
|
||||
emit_move_insn (tmp2, CONST0_RTX (V4SFmode));
|
||||
|
||||
else if (fp16_xxspltiw_constant (op2, BFmode))
|
||||
{
|
||||
rtx op2_bf = gen_lowpart (BFmode, tmp2);
|
||||
emit_move_insn (op2_bf, op2);
|
||||
}
|
||||
|
||||
else
|
||||
gcc_unreachable ();
|
||||
|
||||
emit_insn (gen_xvcvbf16spn_bf (tmp2, tmp2));
|
||||
}
|
||||
|
||||
else if (GET_MODE (op2) == SFmode)
|
||||
{
|
||||
if (REG_P (op2) || SUBREG_P (op2))
|
||||
emit_insn (gen_vsx_splat_v4sf (tmp2, op2));
|
||||
|
||||
else if (op2 == CONST0_RTX (SFmode))
|
||||
emit_move_insn (tmp2, CONST0_RTX (V4SFmode));
|
||||
|
||||
else if (GET_CODE (op2) == CONST_DOUBLE)
|
||||
{
|
||||
rtvec v = rtvec_alloc (4);
|
||||
RTVEC_ELT (v, 0) = op2;
|
||||
RTVEC_ELT (v, 1) = op2;
|
||||
RTVEC_ELT (v, 2) = op2;
|
||||
RTVEC_ELT (v, 3) = op2;
|
||||
emit_insn (gen_rtx_SET (tmp2,
|
||||
gen_rtx_CONST_VECTOR (V4SFmode, v)));
|
||||
}
|
||||
|
||||
else
|
||||
emit_insn (gen_vsx_splat_v4sf (tmp2,
|
||||
force_reg (SFmode, op2)));
|
||||
}
|
||||
|
||||
else
|
||||
gcc_unreachable ();
|
||||
|
||||
/* Do the operation in V4SFmode. */
|
||||
emit_insn (gen_rtx_SET (tmp0,
|
||||
gen_rtx_fmt_ee (icode, V4SFmode, tmp1, tmp2)));
|
||||
|
||||
/* Convert V4SF result back to scalar mode. */
|
||||
if (GET_MODE (op0) == BFmode)
|
||||
emit_insn (gen_xvcvspbf16_bf (op0, tmp0));
|
||||
|
||||
else if (GET_MODE (op0) == SFmode)
|
||||
{
|
||||
rtx element = GEN_INT (WORDS_BIG_ENDIAN ? 2 : 3);
|
||||
emit_insn (gen_vsx_extract_v4sf (op0, tmp0, element));
|
||||
}
|
||||
|
||||
else
|
||||
gcc_unreachable ();
|
||||
}
|
||||
|
|
@ -244,8 +244,12 @@
|
|||
}
|
||||
[(set_attr "type" "fpsimple")])
|
||||
|
||||
;; Use DFmode to convert to/from 16-bit floating point types for
|
||||
;; scalar floating point types other than SF/DFmode.
|
||||
|
||||
;; Convert between HFmode/BFmode and 128-bit binary floating point and
|
||||
;; decimal floating point types. We use convert_move since some of the
|
||||
;; types might not have valid RTX expanders. We use DFmode as the
|
||||
;; intermediate conversion destination.
|
||||
|
||||
(define_expand "extend<FP16_HW:mode><fp16_float_convert:mode>2"
|
||||
[(set (match_operand:fp16_float_convert 0 "vsx_register_operand")
|
||||
(float_extend:fp16_float_convert
|
||||
|
@ -254,10 +258,6 @@
|
|||
{
|
||||
rtx df_tmp = gen_reg_rtx (DFmode);
|
||||
emit_insn (gen_extend<FP16_HW:mode>df2 (df_tmp, operands[1]));
|
||||
|
||||
/* convert_move handles things like conversion to Decimal types for which
|
||||
we don't have extenddfdd2 insns, so a call is made to do the
|
||||
conversion. */
|
||||
convert_move (operands[0], df_tmp, 0);
|
||||
DONE;
|
||||
})
|
||||
|
@ -270,11 +270,7 @@
|
|||
{
|
||||
rtx df_tmp = gen_reg_rtx (DFmode);
|
||||
|
||||
/* convert_move handles things like conversion from Decimal types
|
||||
for which we don't have truncdddf2 insns, so a call is made for
|
||||
the conversion. */
|
||||
convert_move (df_tmp, operands[1], 0);
|
||||
|
||||
emit_insn (gen_truncdf<FP16_HW:mode>2 (operands[0], df_tmp));
|
||||
DONE;
|
||||
})
|
||||
|
@ -329,8 +325,10 @@
|
|||
DONE;
|
||||
})
|
||||
|
||||
|
||||
;; Convert the even elements of a vector 16-bit floating point to
|
||||
;; V4SFmode.
|
||||
;; V4SFmode. Deal with little endian vs. big endian element ordering
|
||||
;; in identifying which elements are converted.
|
||||
|
||||
(define_expand "cvt_fp16_to_v4sf_<mode>"
|
||||
[(set (match_operand:V4SF 0 "vsx_register_operand")
|
||||
|
@ -422,57 +420,121 @@
|
|||
"&& 1"
|
||||
[(pc)]
|
||||
{
|
||||
rtx op0 = operands[0];
|
||||
rtx op1 = operands[1];
|
||||
rtx op2 = operands[2];
|
||||
rtx tmp0 = operands[3];
|
||||
rtx tmp1 = operands[4];
|
||||
rtx tmp2 = operands[5];
|
||||
bfloat16_expand_binary_op (<CODE>,
|
||||
operands[0],
|
||||
operands[1],
|
||||
operands[2],
|
||||
operands[3],
|
||||
operands[4],
|
||||
operands[5]);
|
||||
DONE;
|
||||
}
|
||||
[(set_attr "type" "vecperm")
|
||||
(set_attr "length" "24,24,32")])
|
||||
|
||||
if (GET_CODE (tmp0) == SCRATCH)
|
||||
tmp0 = gen_reg_rtx (V4SFmode);
|
||||
;; Combiner pattern: a binary operation whose two inputs are BFmode
;; registers extended to SFmode and whose result is kept in SFmode.
;; The insn is output as "#" and is split whenever the insn condition
;; holds ("&& 1"); the split hands the operands and the three V4SFmode
;; scratch operands to bfloat16_expand_binary_op.
;; NOTE(review): the "length" of 24 presumably reflects the size of the
;; split sequence -- confirm against the insns the expander emits.
(define_insn_and_split "*<fp16_binary_name>bf3_internal1"
|
||||
[(set (match_operand:SF 0 "vsx_register_operand" "=wa")
|
||||
(fp16_binary_op:SF
|
||||
(float_extend:SF
|
||||
(match_operand:BF 1 "vsx_register_operand" "wa"))
|
||||
(float_extend:SF
|
||||
(match_operand:BF 2 "vsx_register_operand" "wa"))))
|
||||
(clobber (match_scratch:V4SF 3 "=&wa"))
|
||||
(clobber (match_scratch:V4SF 4 "=&wa"))
|
||||
(clobber (match_scratch:V4SF 5 "=&wa"))]
|
||||
"TARGET_BFLOAT16_HW"
|
||||
"#"
|
||||
"&& 1"
|
||||
[(pc)]
|
||||
{
|
||||
/* Operand 0 is SFmode; operands 1 and 2 are BFmode; 3-5 are scratches.  */
bfloat16_expand_binary_op (<CODE>,
|
||||
operands[0],
|
||||
operands[1],
|
||||
operands[2],
|
||||
operands[3],
|
||||
operands[4],
|
||||
operands[5]);
|
||||
DONE;
|
||||
}
|
||||
[(set_attr "type" "vecperm")
|
||||
(set_attr "length" "24")])
|
||||
|
||||
if (GET_CODE (tmp1) == SCRATCH)
|
||||
tmp1 = gen_reg_rtx (V4SFmode);
|
||||
;; Combiner pattern: a binary operation whose two inputs are BFmode
;; registers extended to SFmode and whose SFmode result is truncated
;; back to BFmode.  The insn is output as "#" and is split whenever the
;; insn condition holds ("&& 1"); the split hands the operands and the
;; three V4SFmode scratch operands to bfloat16_expand_binary_op.
;; NOTE(review): the "length" of 24 presumably reflects the size of the
;; split sequence -- confirm against the insns the expander emits.
(define_insn_and_split "*<fp16_binary_name>bf3_internal2"
|
||||
[(set (match_operand:BF 0 "vsx_register_operand" "=wa")
|
||||
(float_truncate:BF
|
||||
(fp16_binary_op:SF
|
||||
(float_extend:SF
|
||||
(match_operand:BF 1 "vsx_register_operand" "wa"))
|
||||
(float_extend:SF
|
||||
(match_operand:BF 2 "vsx_register_operand" "wa")))))
|
||||
(clobber (match_scratch:V4SF 3 "=&wa"))
|
||||
(clobber (match_scratch:V4SF 4 "=&wa"))
|
||||
(clobber (match_scratch:V4SF 5 "=&wa"))]
|
||||
"TARGET_BFLOAT16_HW"
|
||||
"#"
|
||||
"&& 1"
|
||||
[(pc)]
|
||||
{
|
||||
/* Operands 0, 1 and 2 are all BFmode; 3-5 are the V4SFmode scratches.  */
bfloat16_expand_binary_op (<CODE>,
|
||||
operands[0],
|
||||
operands[1],
|
||||
operands[2],
|
||||
operands[3],
|
||||
operands[4],
|
||||
operands[5]);
|
||||
DONE;
|
||||
}
|
||||
[(set_attr "type" "vecperm")
|
||||
(set_attr "length" "24")])
|
||||
|
||||
if (GET_CODE (tmp2) == SCRATCH)
|
||||
tmp2 = gen_reg_rtx (V4SFmode);
|
||||
;; Combiner pattern: a binary operation whose first input is a BFmode
;; register extended to SFmode, whose second input is already SFmode,
;; and whose result is kept in SFmode.  Operand 2 has three
;; alternatives: a VSX register ("wa") and the "j" and "eP" constraints.
;; NOTE(review): "j" and "eP" are presumably the zero constant and a
;; constant loadable with a splat-immediate instruction -- confirm
;; against constraints.md.  The larger length of the third alternative
;; (32 vs. 24) would be consistent with that, but verify.
;; The insn is output as "#" and is split whenever the insn condition
;; holds ("&& 1") by calling bfloat16_expand_binary_op.
(define_insn_and_split "*<fp16_binary_name>bf3_internal3"
|
||||
[(set (match_operand:SF 0 "vsx_register_operand" "=wa,wa,wa")
|
||||
(fp16_binary_op:SF
|
||||
(float_extend:SF
|
||||
(match_operand:BF 1 "vsx_register_operand" "wa,wa,wa"))
|
||||
(match_operand:SF 2 "input_operand" "wa,j,eP")))
|
||||
(clobber (match_scratch:V4SF 3 "=&wa,&wa,&wa"))
|
||||
(clobber (match_scratch:V4SF 4 "=&wa,&wa,&wa"))
|
||||
(clobber (match_scratch:V4SF 5 "=&wa,&wa,&wa"))]
|
||||
"TARGET_BFLOAT16_HW"
|
||||
"#"
|
||||
"&& 1"
|
||||
[(pc)]
|
||||
{
|
||||
/* Operand 0 is SFmode, operand 1 BFmode, operand 2 SFmode; 3-5 are the
   V4SFmode scratches.  */
bfloat16_expand_binary_op (<CODE>,
|
||||
operands[0],
|
||||
operands[1],
|
||||
operands[2],
|
||||
operands[3],
|
||||
operands[4],
|
||||
operands[5]);
|
||||
DONE;
|
||||
}
|
||||
[(set_attr "type" "vecperm")
|
||||
(set_attr "length" "24,24,32")])
|
||||
|
||||
/* Convert operand1 and operand2 to V4SFmode format. We use SPLAT for
|
||||
registers to get the value into the upper 32-bits. We can use XXSPLTW
|
||||
to splat words instead of VSPLTIH since the XVCVBF16SPN instruction
|
||||
ignores the odd half-words, and XXSPLTW can operate on all VSX registers
|
||||
instead of just the Altivec registers. Using SPLAT instead of a shift
|
||||
also ensures that other bits are not a signalling NaN. If we are using
|
||||
XXSPLTIW or XXSPLTIB to load the constant the other bits are duplicated. */
|
||||
|
||||
/* Operand1. */
|
||||
emit_insn (gen_xxspltw_bf (tmp1, op1));
|
||||
emit_insn (gen_xvcvbf16spn_bf (tmp1, tmp1));
|
||||
|
||||
/* Operand2. */
|
||||
if (REG_P (op2) || SUBREG_P (op2))
|
||||
emit_insn (gen_xxspltw_bf (tmp2, op2));
|
||||
|
||||
else if (op2 == CONST0_RTX (BFmode))
|
||||
emit_move_insn (tmp2, CONST0_RTX (V4SFmode));
|
||||
|
||||
else if (fp16_xxspltiw_constant (op2, BFmode))
|
||||
{
|
||||
rtx op2_bf = gen_lowpart (BFmode, tmp2);
|
||||
emit_move_insn (op2_bf, op2);
|
||||
}
|
||||
|
||||
else
|
||||
gcc_unreachable ();
|
||||
|
||||
emit_insn (gen_xvcvbf16spn_bf (tmp2, tmp2));
|
||||
|
||||
/* Do the operation in V4SFmode. */
|
||||
emit_insn (gen_<fp16_binary_name>v4sf3 (tmp0, tmp1, tmp2));
|
||||
|
||||
/* Convert V4SF result back to scalar mode. */
|
||||
emit_insn (gen_xvcvspbf16_bf (op0, tmp0));
|
||||
(define_insn_and_split "*<fp16_binary_name>bf3_internal4"
|
||||
[(set (match_operand:BF 0 "vsx_register_operand" "=wa,wa,wa")
|
||||
(float_truncate:BF
|
||||
(fp16_binary_op:SF
|
||||
(float_extend:SF
|
||||
(match_operand:BF 1 "vsx_register_operand" "wa,wa,wa"))
|
||||
(match_operand:SF 2 "input_operand" "wa,j,eP"))))
|
||||
(clobber (match_scratch:V4SF 3 "=&wa,&wa,&wa"))
|
||||
(clobber (match_scratch:V4SF 4 "=&wa,&wa,&wa"))
|
||||
(clobber (match_scratch:V4SF 5 "=&wa,&wa,&wa"))]
|
||||
"TARGET_BFLOAT16_HW"
|
||||
"#"
|
||||
"&& 1"
|
||||
[(pc)]
|
||||
{
|
||||
bfloat16_expand_binary_op (<CODE>,
|
||||
operands[0],
|
||||
operands[1],
|
||||
operands[2],
|
||||
operands[3],
|
||||
operands[4],
|
||||
operands[5]);
|
||||
DONE;
|
||||
}
|
||||
[(set_attr "type" "vecperm")
|
||||
|
@ -503,7 +565,7 @@
|
|||
"xvcvbf16spn %x0,%x1"
|
||||
[(set_attr "type" "vecperm")])
|
||||
|
||||
;; Convert a V4SFmode vector back to 16-bit floating point scalar. We
|
||||
;; Convert a V4SFmode vector to a 16-bit floating point scalar. We
|
||||
;; only care about the 2nd V4SFmode element, which is the element we
|
||||
;; converted the 16-bit scalar (4th element) to V4SFmode to do the
|
||||
;; operation, and converted it back.
|
||||
|
|
|
@ -258,6 +258,10 @@ extern bool vec_const_128bit_to_bytes (rtx, machine_mode,
|
|||
extern unsigned constant_generates_lxvkq (vec_const_128bit_type *);
|
||||
extern unsigned constant_generates_xxspltiw (vec_const_128bit_type *);
|
||||
extern unsigned constant_generates_xxspltidp (vec_const_128bit_type *);
|
||||
|
||||
/* From float16.cc. */
|
||||
extern void bfloat16_expand_binary_op (enum rtx_code, rtx, rtx, rtx,
|
||||
rtx, rtx, rtx);
|
||||
#endif /* RTX_CODE */
|
||||
|
||||
#ifdef TREE_CODE
|
||||
|
|
|
@ -87,6 +87,10 @@ rs6000-c.o: $(srcdir)/config/rs6000/rs6000-c.cc rs6000-builtins.h
|
|||
$(COMPILE) $<
|
||||
$(POSTCOMPILE)
|
||||
|
||||
float16.o: $(srcdir)/config/rs6000/float16.cc
|
||||
$(COMPILE) $<
|
||||
$(POSTCOMPILE)
|
||||
|
||||
#$(srcdir)/config/rs6000/fusion.md: $(srcdir)/config/rs6000/genfusion.pl
|
||||
# $(srcdir)/config/rs6000/genfusion.pl > $(srcdir)/config/rs6000/fusion.md
|
||||
|
||||
|
|
Loading…
Reference in New Issue