Add combiner support for bfloat16.

2025-10-06 Michael Meissner <meissner@linux.ibm.com> gcc/ * config.gcc (powerpc*-*-*): Add float16.o. * config/rs6000/float16.cc: New file. * config/rs6000/float16.md (various): Update comments. (<fp16_binary_name>bf3): Move code to bfloat16_expand_binary_op. (<fp16_binary_name>bf3_internal1): New combiner insns. (<fp16_binary_name>bf3_internal2): Likewise. (<fp16_binary_name>bf3_internal3): Likewise. (<fp16_binary_name>bf3_internal4): Likewise. * config/rs6000/rs6000-protos.h (bfloat16_expand_binary_op): New declaration. * config/rs6000/t-rs6000 (float16.o): New build rule.
2025-10-06 23:12:47 -04:00 · 2025-10-06 23:12:47 -04:00 · d60ac67a7c
parent 9f5fe9d47f
commit d60ac67a7c
5 changed files with 297 additions and 59 deletions
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@ -524,6 +524,7 @@ powerpc*-*-*)
 	extra_objs="rs6000-string.o rs6000-p8swap.o rs6000-logue.o"
 	extra_objs="${extra_objs} rs6000-call.o rs6000-pcrel-opt.o"
 	extra_objs="${extra_objs} rs6000-builtins.o rs6000-builtin.o"
+	extra_objs="${extra_objs} float16.o"
 	extra_headers="ppc-asm.h altivec.h htmintrin.h htmxlintrin.h"
 	extra_headers="${extra_headers} bmi2intrin.h bmiintrin.h"
 	extra_headers="${extra_headers} xmmintrin.h mm_malloc.h emmintrin.h"
--- a/gcc/config/rs6000/float16.cc
+++ b/gcc/config/rs6000/float16.cc
@ -0,0 +1,167 @@
+/* 16-bit floating point support for the PowerPC architecture.
+   Copyright (C) 2002-2025 Free Software Foundation, Inc.
+
+   Contributed by Zack Weinberg <zack@codesourcery.com>
+   and Paolo Bonzini <bonzini@gnu.org>
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published
+   by the Free Software Foundation; either version 3, or (at your
+   option) any later version.
+
+   GCC is distributed in the hope that it will be useful, but WITHOUT
+   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+   License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+/* 16-bit floating point support.  */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "backend.h"
+#include "rtl.h"
+#include "tree.h"
+#include "memmodel.h"
+#include "tm_p.h"
+#include "stringpool.h"
+#include "expmed.h"
+#include "optabs.h"
+#include "regs.h"
+#include "insn-attr.h"
+#include "flags.h"
+#include "attribs.h"
+#include "explow.h"
+#include "expr.h"
+#include "common/common-target.h"
+#include "rs6000-internal.h"
+
+/* Expand a bfloat16 floating point binary operation:
+
+   ICODE: RTX code of the operation to perform.
+   OP0:   Result (BFmode or SFmode).
+   OP1:   First input argument (BFmode or SFmode).
+   OP2:   Second input argument (BFmode or SFmode).
+   TMP0:  Temporary for result (V4SFmode).
+   TMP1:  Temporary for first input argument (V4SFmode).
+   TMP2:  Temporary for second input argument (V4SFmode).
+
+   The operation is done as a V4SFmode vector operation, because converting
+   a scalar BFmode value to SFmode and back again takes quite a bit of time.
+   GCC will only generate the native operation if -Ofast is used.  The
+   float16.md code that calls this function adds various combine patterns to
+   do the operation in V4SFmode instead of SFmode.  A temporary passed as a
+   SCRATCH is replaced by a new V4SFmode pseudo register.  */
+	
+void
+bfloat16_expand_binary_op (enum rtx_code icode,
+			   rtx op0,
+			   rtx op1,
+			   rtx op2,
+			   rtx tmp0,
+			   rtx tmp1,
+			   rtx tmp2)
+{
+  if (GET_CODE (tmp0) == SCRATCH)
+    tmp0 = gen_reg_rtx (V4SFmode);
+
+  if (GET_CODE (tmp1) == SCRATCH)
+    tmp1 = gen_reg_rtx (V4SFmode);
+
+  if (GET_CODE (tmp2) == SCRATCH)
+    tmp2 = gen_reg_rtx (V4SFmode);
+
+  /* Convert operand1 and operand2 to V4SFmode format.  We use SPLAT for
+     registers to get the value into the upper 32-bits.  We can use XXSPLTW
+     to splat words instead of VSPLTIH since the XVCVBF16SPN instruction
+     ignores the odd half-words, and XXSPLTW can operate on all VSX registers
+     instead of just the Altivec registers.  Using SPLAT instead of a shift
+     also ensures that other bits are not a signalling NaN.  If we are using
+     XXSPLTIW or XXSPLTIB to load the constant the other bits are duplicated.  */
+
+  /* Operand1: BFmode is splatted then converted; SFmode is only splatted.  */
+  if (GET_MODE (op1) == BFmode)
+    {
+      emit_insn (gen_xxspltw_bf (tmp1, op1));
+      emit_insn (gen_xvcvbf16spn_bf (tmp1, tmp1));
+    }
+
+  else if (GET_MODE (op1) == SFmode)
+    emit_insn (gen_vsx_splat_v4sf (tmp1,
+				   force_reg (SFmode, op1)));
+
+  else
+    gcc_unreachable ();
+
+  /* Operand2: like operand1, but constants are also accepted.  */
+  if (GET_MODE (op2) == BFmode)
+    {
+      if (REG_P (op2) || SUBREG_P (op2))
+	emit_insn (gen_xxspltw_bf (tmp2, op2));
+
+      else if (op2 == CONST0_RTX (BFmode))
+	emit_move_insn (tmp2, CONST0_RTX (V4SFmode));
+
+      else if (fp16_xxspltiw_constant (op2, BFmode))
+	{
+	  rtx op2_bf = gen_lowpart (BFmode, tmp2);
+	  emit_move_insn (op2_bf, op2);
+	}
+
+      else
+	gcc_unreachable ();
+
+      emit_insn (gen_xvcvbf16spn_bf (tmp2, tmp2));
+    }
+
+  else if (GET_MODE (op2) == SFmode)
+    {
+      if (REG_P (op2) || SUBREG_P (op2))
+	emit_insn (gen_vsx_splat_v4sf (tmp2, op2));
+
+      else if (op2 == CONST0_RTX (SFmode))
+	emit_move_insn (tmp2, CONST0_RTX (V4SFmode));
+
+      else if (GET_CODE (op2) == CONST_DOUBLE)
+	{
+	  rtvec v = rtvec_alloc (4);
+	  RTVEC_ELT (v, 0) = op2;
+	  RTVEC_ELT (v, 1) = op2;
+	  RTVEC_ELT (v, 2) = op2;
+	  RTVEC_ELT (v, 3) = op2;
+	  emit_insn (gen_rtx_SET (tmp2,
+				  gen_rtx_CONST_VECTOR (V4SFmode, v)));
+	}
+
+      else
+	emit_insn (gen_vsx_splat_v4sf (tmp2,
+				       force_reg (SFmode, op2)));
+    }
+
+  else
+    gcc_unreachable ();
+
+  /* Do the operation in V4SFmode, building the RTL directly from ICODE.  */
+  emit_insn (gen_rtx_SET (tmp0,
+			  gen_rtx_fmt_ee (icode, V4SFmode, tmp1, tmp2)));
+
+  /* Convert V4SF result back to the scalar mode of OP0.  */
+  if (GET_MODE (op0) == BFmode)
+    emit_insn (gen_xvcvspbf16_bf (op0, tmp0));
+
+  else if (GET_MODE (op0) == SFmode)
+    {
+      rtx element = GEN_INT (WORDS_BIG_ENDIAN ? 2 : 3);
+      emit_insn (gen_vsx_extract_v4sf (op0, tmp0, element));
+    }
+
+  else
+    gcc_unreachable ();
+}
+
--- a/gcc/config/rs6000/float16.md
+++ b/gcc/config/rs6000/float16.md
@ -244,8 +244,12 @@
 }
  [(set_attr "type" "fpsimple")])

-;; Use DFmode to convert to/from 16-bit floating point types for
-;; scalar floating point types other than SF/DFmode.
+
+;; Convert between HFmode/BFmode and 128-bit binary floating point and
+;; decimal floating point types.  We use convert_move since some of the
+;; types might not have valid RTX expanders.  We use DFmode as the
+;; intermediate conversion destination.
+
 (define_expand "extend<FP16_HW:mode><fp16_float_convert:mode>2"
  [(set (match_operand:fp16_float_convert 0 "vsx_register_operand")
 	(float_extend:fp16_float_convert
@ -254,10 +258,6 @@
 {
  rtx df_tmp = gen_reg_rtx (DFmode);
  emit_insn (gen_extend<FP16_HW:mode>df2 (df_tmp, operands[1]));
-
-  /* convert_move handles things like conversion to Decimal types that
-     we don't have extenddfdd2 insns, so a call is made to do the
-     conversion.  */
  convert_move (operands[0], df_tmp, 0);
  DONE;
 })
@ -270,11 +270,7 @@
 {
  rtx df_tmp = gen_reg_rtx (DFmode);

-  /* convert_move handles things like conversion from Decimal types
-     that we don't have truncdddf2 insns, so a call is made for
-     the conversion.  */
  convert_move (df_tmp, operands[1], 0);
-
  emit_insn (gen_truncdf<FP16_HW:mode>2 (operands[0], df_tmp));
  DONE;
 })
@ -329,8 +325,10 @@
  DONE;
 })

+
 ;; Convert the even elements of a vector 16-bit floating point to
-;; V4SFmode.
+;; V4SFmode.  Deal with little endian vs. big endian element ordering
+;; in identifying which elements are converted.

 (define_expand "cvt_fp16_to_v4sf_<mode>"
  [(set (match_operand:V4SF 0 "vsx_register_operand")
@ -422,57 +420,121 @@
  "&& 1"
  [(pc)]
 {
-  rtx op0 = operands[0];
-  rtx op1 = operands[1];
-  rtx op2 = operands[2];
-  rtx tmp0 = operands[3];
-  rtx tmp1 = operands[4];
-  rtx tmp2 = operands[5];
+  bfloat16_expand_binary_op (<CODE>,
+			     operands[0],
+			     operands[1],
+			     operands[2],
+			     operands[3],
+			     operands[4],
+			     operands[5]);
+  DONE;
+}
+  [(set_attr "type" "vecperm")
+   (set_attr "length" "24,24,32")])

-  if (GET_CODE (tmp0) == SCRATCH)
-    tmp0 = gen_reg_rtx (V4SFmode);
+(define_insn_and_split "*<fp16_binary_name>bf3_internal1"	; SF = (SF) BF op (SF) BF.
+  [(set (match_operand:SF 0 "vsx_register_operand" "=wa")
+	(fp16_binary_op:SF
+	 (float_extend:SF
+	  (match_operand:BF 1 "vsx_register_operand" "wa"))
+	 (float_extend:SF
+	  (match_operand:BF 2 "vsx_register_operand" "wa"))))
+   (clobber (match_scratch:V4SF 3 "=&wa"))	; Temporary for the result.
+   (clobber (match_scratch:V4SF 4 "=&wa"))	; Temporary for operand 1.
+   (clobber (match_scratch:V4SF 5 "=&wa"))]	; Temporary for operand 2.
+  "TARGET_BFLOAT16_HW"
+  "#"
+  "&& 1"
+  [(pc)]
+{
+  bfloat16_expand_binary_op (<CODE>,
+			     operands[0],
+			     operands[1],
+			     operands[2],
+			     operands[3],
+			     operands[4],
+			     operands[5]);
+  DONE;
+}
+  [(set_attr "type" "vecperm")
+   (set_attr "length" "24")])

-  if (GET_CODE (tmp1) == SCRATCH)
-    tmp1 = gen_reg_rtx (V4SFmode);
+(define_insn_and_split "*<fp16_binary_name>bf3_internal2"	; BF = (BF) ((SF) BF op (SF) BF).
+  [(set (match_operand:BF 0 "vsx_register_operand" "=wa")
+	(float_truncate:BF
+	 (fp16_binary_op:SF
+	  (float_extend:SF
+	   (match_operand:BF 1 "vsx_register_operand" "wa"))
+	  (float_extend:SF
+	   (match_operand:BF 2 "vsx_register_operand" "wa")))))
+   (clobber (match_scratch:V4SF 3 "=&wa"))	; Temporary for the result.
+   (clobber (match_scratch:V4SF 4 "=&wa"))	; Temporary for operand 1.
+   (clobber (match_scratch:V4SF 5 "=&wa"))]	; Temporary for operand 2.
+  "TARGET_BFLOAT16_HW"
+  "#"
+  "&& 1"
+  [(pc)]
+{
+  bfloat16_expand_binary_op (<CODE>,
+			     operands[0],
+			     operands[1],
+			     operands[2],
+			     operands[3],
+			     operands[4],
+			     operands[5]);
+  DONE;
+}
+  [(set_attr "type" "vecperm")
+   (set_attr "length" "24")])

-  if (GET_CODE (tmp2) == SCRATCH)
-    tmp2 = gen_reg_rtx (V4SFmode);
+(define_insn_and_split "*<fp16_binary_name>bf3_internal3"	; SF = (SF) BF op SF.
+  [(set (match_operand:SF 0 "vsx_register_operand" "=wa,wa,wa")
+	(fp16_binary_op:SF
+	 (float_extend:SF
+	  (match_operand:BF 1 "vsx_register_operand" "wa,wa,wa"))
+	 (match_operand:SF 2 "input_operand" "wa,j,eP")))	; Already SFmode; not extended.
+   (clobber (match_scratch:V4SF 3 "=&wa,&wa,&wa"))	; Temporary for the result.
+   (clobber (match_scratch:V4SF 4 "=&wa,&wa,&wa"))	; Temporary for operand 1.
+   (clobber (match_scratch:V4SF 5 "=&wa,&wa,&wa"))]	; Temporary for operand 2.
+  "TARGET_BFLOAT16_HW"
+  "#"
+  "&& 1"
+  [(pc)]
+{
+  bfloat16_expand_binary_op (<CODE>,
+			     operands[0],
+			     operands[1],
+			     operands[2],
+			     operands[3],
+			     operands[4],
+			     operands[5]);
+  DONE;
+}
+  [(set_attr "type" "vecperm")
+   (set_attr "length" "24,24,32")])

-  /* Convert operand1 and operand2 to V4SFmode format.  We use SPLAT for
-     registers to get the value into the upper 32-bits.  We can use XXSPLTW
-     to splat words instead of VSPLTIH since the XVCVBF16SPN instruction
-     ignores the odd half-words, and XXSPLTW can operate on all VSX registers
-     instead of just the Altivec registers.  Using SPLAT instead of a shift
-     also insure that other bits are not a signalling NaN.  If we are using
-     XXSPLTIW or XXSPLTIB to load the constant the other bits are duplicated.  */
-
-  /* Operand1.  */
-  emit_insn (gen_xxspltw_bf (tmp1, op1));
-  emit_insn (gen_xvcvbf16spn_bf (tmp1, tmp1));
-
-  /* Operand2.  */
-  if (REG_P (op2) || SUBREG_P (op2))
-    emit_insn (gen_xxspltw_bf (tmp2, op2));
-
-  else if (op2 == CONST0_RTX (BFmode))
-    emit_move_insn (tmp2, CONST0_RTX (V4SFmode));
-
-  else if (fp16_xxspltiw_constant (op2, BFmode))
-    {
-      rtx op2_bf = gen_lowpart (BFmode, tmp2);
-      emit_move_insn (op2_bf, op2);
-    }
-
-  else
-    gcc_unreachable ();
-
-  emit_insn (gen_xvcvbf16spn_bf (tmp2, tmp2));
-
-  /* Do the operation in V4SFmode.  */
-  emit_insn (gen_<fp16_binary_name>v4sf3 (tmp0, tmp1, tmp2));
-
-  /* Convert V4SF result back to scalar mode.  */
-  emit_insn (gen_xvcvspbf16_bf (op0, tmp0));
+(define_insn_and_split "*<fp16_binary_name>bf3_internal4"
+  [(set (match_operand:BF 0 "vsx_register_operand" "=wa,wa,wa")
+	(float_truncate:BF
+	 (fp16_binary_op:SF
+	  (float_extend:SF
+	   (match_operand:BF 1 "vsx_register_operand" "wa,wa,wa"))
+	  (match_operand:SF 2 "input_operand" "wa,j,eP"))))
+   (clobber (match_scratch:V4SF 3 "=&wa,&wa,&wa"))
+   (clobber (match_scratch:V4SF 4 "=&wa,&wa,&wa"))
+   (clobber (match_scratch:V4SF 5 "=&wa,&wa,&wa"))]
+  "TARGET_BFLOAT16_HW"
+  "#"
+  "&& 1"
+  [(pc)]
+{
+  bfloat16_expand_binary_op (<CODE>,
+			     operands[0],
+			     operands[1],
+			     operands[2],
+			     operands[3],
+			     operands[4],
+			     operands[5]);
  DONE;
 }
  [(set_attr "type" "vecperm")
@ -503,7 +565,7 @@
  "xvcvbf16spn %x0,%x1"
  [(set_attr "type" "vecperm")])

-;; Convert a V4SFmode vector back to 16-bit floating point scalar.  We
+;; Convert a V4SFmode vector to a 16-bit floating point scalar.  We
 ;; only care about the 2nd V4SFmode element, which is the element we
 ;; converted the 16-bit scalar (4th element) to V4SFmode to do the
 ;; operation, and converted it back.
--- a/gcc/config/rs6000/rs6000-protos.h
+++ b/gcc/config/rs6000/rs6000-protos.h
@ -258,6 +258,10 @@ extern bool vec_const_128bit_to_bytes (rtx, machine_mode,
 extern unsigned constant_generates_lxvkq (vec_const_128bit_type *);
 extern unsigned constant_generates_xxspltiw (vec_const_128bit_type *);
 extern unsigned constant_generates_xxspltidp (vec_const_128bit_type *);
+
+/* From float16.cc.  */
+extern void bfloat16_expand_binary_op (enum rtx_code, rtx, rtx, rtx,
+				       rtx, rtx, rtx);
 #endif /* RTX_CODE */

 #ifdef TREE_CODE
--- a/gcc/config/rs6000/t-rs6000
+++ b/gcc/config/rs6000/t-rs6000
@ -87,6 +87,10 @@ rs6000-c.o: $(srcdir)/config/rs6000/rs6000-c.cc rs6000-builtins.h
 	$(COMPILE) $<
 	$(POSTCOMPILE)

+float16.o: $(srcdir)/config/rs6000/float16.cc
+	$(COMPILE) $<
+	$(POSTCOMPILE)
+
 #$(srcdir)/config/rs6000/fusion.md: $(srcdir)/config/rs6000/genfusion.pl
 #	$(srcdir)/config/rs6000/genfusion.pl > $(srcdir)/config/rs6000/fusion.md