From d60ac67a7c52714031e5a9ccce5840afb924fd58 Mon Sep 17 00:00:00 2001 From: Michael Meissner Date: Mon, 6 Oct 2025 23:12:47 -0400 Subject: [PATCH] Add combiner support for bfloat16. 2025-10-06 Michael Meissner gcc/ * config.gcc (powerpc*-*-*): Add float16.o. * config/rs6000/float16.cc: New file. * config/rs6000/float16.md (various): Update comments. (bf3): Move code to bfloat16_expand_binary_op. (bf3_internal1): New combiner insns. (bf3_internal2): Likewise. (bf3_internal3): Likewise. (bf3_internal4): Likewise. * config/rs6000/rs6000-protos.h (bfloat16_expand_binary_op): New declaration. * config/rs6000/t-rs6000 (float16.o): New build rule. --- gcc/config.gcc | 1 + gcc/config/rs6000/float16.cc | 167 +++++++++++++++++++++++++++ gcc/config/rs6000/float16.md | 180 ++++++++++++++++++++---------- gcc/config/rs6000/rs6000-protos.h | 4 + gcc/config/rs6000/t-rs6000 | 4 + 5 files changed, 297 insertions(+), 59 deletions(-) create mode 100644 gcc/config/rs6000/float16.cc diff --git a/gcc/config.gcc b/gcc/config.gcc index 5fee641d5aef..007baf56a40d 100644 --- a/gcc/config.gcc +++ b/gcc/config.gcc @@ -524,6 +524,7 @@ powerpc*-*-*) extra_objs="rs6000-string.o rs6000-p8swap.o rs6000-logue.o" extra_objs="${extra_objs} rs6000-call.o rs6000-pcrel-opt.o" extra_objs="${extra_objs} rs6000-builtins.o rs6000-builtin.o" + extra_objs="${extra_objs} float16.o" extra_headers="ppc-asm.h altivec.h htmintrin.h htmxlintrin.h" extra_headers="${extra_headers} bmi2intrin.h bmiintrin.h" extra_headers="${extra_headers} xmmintrin.h mm_malloc.h emmintrin.h" diff --git a/gcc/config/rs6000/float16.cc b/gcc/config/rs6000/float16.cc new file mode 100644 index 000000000000..fa196486a635 --- /dev/null +++ b/gcc/config/rs6000/float16.cc @@ -0,0 +1,167 @@ +/* Subroutines for the C front end on the PowerPC architecture. + Copyright (C) 2002-2025 Free Software Foundation, Inc. + + Contributed by Zack Weinberg + and Paolo Bonzini + + This file is part of GCC. 
+ + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3, or (at your + option) any later version. + + GCC is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public + License for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + <http://www.gnu.org/licenses/>. */ + +/* 16-bit floating point support. */ + +#include "config.h" +#include "system.h" +#include "coretypes.h" +#include "backend.h" +#include "rtl.h" +#include "tree.h" +#include "memmodel.h" +#include "tm_p.h" +#include "stringpool.h" +#include "expmed.h" +#include "optabs.h" +#include "regs.h" +#include "insn-attr.h" +#include "flags.h" +#include "attribs.h" +#include "explow.h" +#include "expr.h" +#include "common/common-target.h" +#include "rs6000-internal.h" + +/* Expand a bfloat16 floating point binary operation: + + ICODE: Operation to perform. + OP0: Result (BFmode or SFmode). + OP1: First input argument (BFmode or SFmode). + OP2: Second input argument (BFmode or SFmode). + TMP0: Temporary for result (V4SFmode). + TMP1: Temporary for first input argument (V4SFmode). + TMP2: Temporary for second input argument (V4SFmode). + + The operation is done as a V4SFmode vector operation. This is because + converting BFmode from a scalar BFmode to SFmode to do the operation and + back again takes quite a bit of time. GCC will only generate the native + operation if -Ofast is used. The float16.md code that calls this function + adds various combine operations to do the operation in V4SFmode instead of + SFmode. 
*/ + +void +bfloat16_expand_binary_op (enum rtx_code icode, + rtx op0, + rtx op1, + rtx op2, + rtx tmp0, + rtx tmp1, + rtx tmp2) +{ + if (GET_CODE (tmp0) == SCRATCH) + tmp0 = gen_reg_rtx (V4SFmode); + + if (GET_CODE (tmp1) == SCRATCH) + tmp1 = gen_reg_rtx (V4SFmode); + + if (GET_CODE (tmp2) == SCRATCH) + tmp2 = gen_reg_rtx (V4SFmode); + + /* Convert operand1 and operand2 to V4SFmode format. We use SPLAT for + registers to get the value into the upper 32-bits. We can use XXSPLTW + to splat words instead of VSPLTIH since the XVCVBF16SPN instruction + ignores the odd half-words, and XXSPLTW can operate on all VSX registers + instead of just the Altivec registers. Using SPLAT instead of a shift + also ensures that the other bits are not a signalling NaN. If we are using + XXSPLTIW or XXSPLTIB to load the constant the other bits are duplicated. */ + + /* Operand1. */ + if (GET_MODE (op1) == BFmode) + { + emit_insn (gen_xxspltw_bf (tmp1, op1)); + emit_insn (gen_xvcvbf16spn_bf (tmp1, tmp1)); + } + + else if (GET_MODE (op1) == SFmode) + emit_insn (gen_vsx_splat_v4sf (tmp1, + force_reg (SFmode, op1))); + + else + gcc_unreachable (); + + /* Operand2. 
*/ + if (GET_MODE (op2) == BFmode) + { + if (REG_P (op2) || SUBREG_P (op2)) + emit_insn (gen_xxspltw_bf (tmp2, op2)); + + else if (op2 == CONST0_RTX (BFmode)) + emit_move_insn (tmp2, CONST0_RTX (V4SFmode)); + + else if (fp16_xxspltiw_constant (op2, BFmode)) + { + rtx op2_bf = gen_lowpart (BFmode, tmp2); + emit_move_insn (op2_bf, op2); + } + + else + gcc_unreachable (); + + emit_insn (gen_xvcvbf16spn_bf (tmp2, tmp2)); + } + + else if (GET_MODE (op2) == SFmode) + { + if (REG_P (op2) || SUBREG_P (op2)) + emit_insn (gen_vsx_splat_v4sf (tmp2, op2)); + + else if (op2 == CONST0_RTX (SFmode)) + emit_move_insn (tmp2, CONST0_RTX (V4SFmode)); + + else if (GET_CODE (op2) == CONST_DOUBLE) + { + rtvec v = rtvec_alloc (4); + RTVEC_ELT (v, 0) = op2; + RTVEC_ELT (v, 1) = op2; + RTVEC_ELT (v, 2) = op2; + RTVEC_ELT (v, 3) = op2; + emit_insn (gen_rtx_SET (tmp2, + gen_rtx_CONST_VECTOR (V4SFmode, v))); + } + + else + emit_insn (gen_vsx_splat_v4sf (tmp2, + force_reg (SFmode, op2))); + } + + else + gcc_unreachable (); + + /* Do the operation in V4SFmode. */ + emit_insn (gen_rtx_SET (tmp0, + gen_rtx_fmt_ee (icode, V4SFmode, tmp1, tmp2))); + + /* Convert V4SF result back to scalar mode. */ + if (GET_MODE (op0) == BFmode) + emit_insn (gen_xvcvspbf16_bf (op0, tmp0)); + + else if (GET_MODE (op0) == SFmode) + { + rtx element = GEN_INT (WORDS_BIG_ENDIAN ? 2 : 3); + emit_insn (gen_vsx_extract_v4sf (op0, tmp0, element)); + } + + else + gcc_unreachable (); +} + diff --git a/gcc/config/rs6000/float16.md b/gcc/config/rs6000/float16.md index fc5fd4b19ee4..db757888dbf5 100644 --- a/gcc/config/rs6000/float16.md +++ b/gcc/config/rs6000/float16.md @@ -244,8 +244,12 @@ } [(set_attr "type" "fpsimple")]) -;; Use DFmode to convert to/from 16-bit floating point types for -;; scalar floating point types other than SF/DFmode. + +;; Convert between HFmode/BFmode and 128-bit binary floating point and +;; decimal floating point types. 
We use convert_move since some of the +;; types might not have valid RTX expanders. We use DFmode as the +;; intermediate conversion destination. + (define_expand "extend2" [(set (match_operand:fp16_float_convert 0 "vsx_register_operand") (float_extend:fp16_float_convert @@ -254,10 +258,6 @@ { rtx df_tmp = gen_reg_rtx (DFmode); emit_insn (gen_extenddf2 (df_tmp, operands[1])); - - /* convert_move handles things like conversion to Decimal types that - we don't have extenddfdd2 insns, so a call is made to do the - conversion. */ convert_move (operands[0], df_tmp, 0); DONE; }) @@ -270,11 +270,7 @@ { rtx df_tmp = gen_reg_rtx (DFmode); - /* convert_move handles things like conversion from Decimal types - that we don't have truncdddf2 insns, so a call is made for - the conversion. */ convert_move (df_tmp, operands[1], 0); - emit_insn (gen_truncdf2 (operands[0], df_tmp)); DONE; }) @@ -329,8 +325,10 @@ DONE; }) + ;; Convert the even elements of a vector 16-bit floating point to -;; V4SFmode. +;; V4SFmode. Deal with little endian vs. big endian element ordering +;; in identifying which elements are converted. 
(define_expand "cvt_fp16_to_v4sf_" [(set (match_operand:V4SF 0 "vsx_register_operand") @@ -422,57 +420,121 @@ "&& 1" [(pc)] { - rtx op0 = operands[0]; - rtx op1 = operands[1]; - rtx op2 = operands[2]; - rtx tmp0 = operands[3]; - rtx tmp1 = operands[4]; - rtx tmp2 = operands[5]; + bfloat16_expand_binary_op (, + operands[0], + operands[1], + operands[2], + operands[3], + operands[4], + operands[5]); + DONE; +} + [(set_attr "type" "vecperm") + (set_attr "length" "24,24,32")]) - if (GET_CODE (tmp0) == SCRATCH) - tmp0 = gen_reg_rtx (V4SFmode); +(define_insn_and_split "*bf3_internal1" + [(set (match_operand:SF 0 "vsx_register_operand" "=wa") + (fp16_binary_op:SF + (float_extend:SF + (match_operand:BF 1 "vsx_register_operand" "wa")) + (float_extend:SF + (match_operand:BF 2 "vsx_register_operand" "wa")))) + (clobber (match_scratch:V4SF 3 "=&wa")) + (clobber (match_scratch:V4SF 4 "=&wa")) + (clobber (match_scratch:V4SF 5 "=&wa"))] + "TARGET_BFLOAT16_HW" + "#" + "&& 1" + [(pc)] +{ + bfloat16_expand_binary_op (, + operands[0], + operands[1], + operands[2], + operands[3], + operands[4], + operands[5]); + DONE; +} + [(set_attr "type" "vecperm") + (set_attr "length" "24")]) - if (GET_CODE (tmp1) == SCRATCH) - tmp1 = gen_reg_rtx (V4SFmode); +(define_insn_and_split "*bf3_internal2" + [(set (match_operand:BF 0 "vsx_register_operand" "=wa") + (float_truncate:BF + (fp16_binary_op:SF + (float_extend:SF + (match_operand:BF 1 "vsx_register_operand" "wa")) + (float_extend:SF + (match_operand:BF 2 "vsx_register_operand" "wa"))))) + (clobber (match_scratch:V4SF 3 "=&wa")) + (clobber (match_scratch:V4SF 4 "=&wa")) + (clobber (match_scratch:V4SF 5 "=&wa"))] + "TARGET_BFLOAT16_HW" + "#" + "&& 1" + [(pc)] +{ + bfloat16_expand_binary_op (, + operands[0], + operands[1], + operands[2], + operands[3], + operands[4], + operands[5]); + DONE; +} + [(set_attr "type" "vecperm") + (set_attr "length" "24")]) - if (GET_CODE (tmp2) == SCRATCH) - tmp2 = gen_reg_rtx (V4SFmode); +(define_insn_and_split 
"*bf3_internal3" + [(set (match_operand:SF 0 "vsx_register_operand" "=wa,wa,wa") + (fp16_binary_op:SF + (float_extend:SF + (match_operand:BF 1 "vsx_register_operand" "wa,wa,wa")) + (match_operand:SF 2 "input_operand" "wa,j,eP"))) + (clobber (match_scratch:V4SF 3 "=&wa,&wa,&wa")) + (clobber (match_scratch:V4SF 4 "=&wa,&wa,&wa")) + (clobber (match_scratch:V4SF 5 "=&wa,&wa,&wa"))] + "TARGET_BFLOAT16_HW" + "#" + "&& 1" + [(pc)] +{ + bfloat16_expand_binary_op (, + operands[0], + operands[1], + operands[2], + operands[3], + operands[4], + operands[5]); + DONE; +} + [(set_attr "type" "vecperm") + (set_attr "length" "24,24,32")]) - /* Convert operand1 and operand2 to V4SFmode format. We use SPLAT for - registers to get the value into the upper 32-bits. We can use XXSPLTW - to splat words instead of VSPLTIH since the XVCVBF16SPN instruction - ignores the odd half-words, and XXSPLTW can operate on all VSX registers - instead of just the Altivec registers. Using SPLAT instead of a shift - also insure that other bits are not a signalling NaN. If we are using - XXSPLTIW or XXSPLTIB to load the constant the other bits are duplicated. */ - - /* Operand1. */ - emit_insn (gen_xxspltw_bf (tmp1, op1)); - emit_insn (gen_xvcvbf16spn_bf (tmp1, tmp1)); - - /* Operand2. */ - if (REG_P (op2) || SUBREG_P (op2)) - emit_insn (gen_xxspltw_bf (tmp2, op2)); - - else if (op2 == CONST0_RTX (BFmode)) - emit_move_insn (tmp2, CONST0_RTX (V4SFmode)); - - else if (fp16_xxspltiw_constant (op2, BFmode)) - { - rtx op2_bf = gen_lowpart (BFmode, tmp2); - emit_move_insn (op2_bf, op2); - } - - else - gcc_unreachable (); - - emit_insn (gen_xvcvbf16spn_bf (tmp2, tmp2)); - - /* Do the operation in V4SFmode. */ - emit_insn (gen_v4sf3 (tmp0, tmp1, tmp2)); - - /* Convert V4SF result back to scalar mode. 
*/ - emit_insn (gen_xvcvspbf16_bf (op0, tmp0)); +(define_insn_and_split "*bf3_internal4" + [(set (match_operand:BF 0 "vsx_register_operand" "=wa,wa,wa") + (float_truncate:BF + (fp16_binary_op:SF + (float_extend:SF + (match_operand:BF 1 "vsx_register_operand" "wa,wa,wa")) + (match_operand:SF 2 "input_operand" "wa,j,eP")))) + (clobber (match_scratch:V4SF 3 "=&wa,&wa,&wa")) + (clobber (match_scratch:V4SF 4 "=&wa,&wa,&wa")) + (clobber (match_scratch:V4SF 5 "=&wa,&wa,&wa"))] + "TARGET_BFLOAT16_HW" + "#" + "&& 1" + [(pc)] +{ + bfloat16_expand_binary_op (, + operands[0], + operands[1], + operands[2], + operands[3], + operands[4], + operands[5]); DONE; } [(set_attr "type" "vecperm") @@ -503,7 +565,7 @@ "xvcvbf16spn %x0,%x1" [(set_attr "type" "vecperm")]) -;; Convert a V4SFmode vector back to 16-bit floating point scalar. We +;; Convert a V4SFmode vector to a 16-bit floating point scalar. We ;; only care about the 2nd V4SFmode element, which is the element we ;; converted the 16-bit scalar (4th element) to V4SFmode to do the ;; operation, and converted it back. diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h index 9bf971370d41..23b7f29cbece 100644 --- a/gcc/config/rs6000/rs6000-protos.h +++ b/gcc/config/rs6000/rs6000-protos.h @@ -258,6 +258,10 @@ extern bool vec_const_128bit_to_bytes (rtx, machine_mode, extern unsigned constant_generates_lxvkq (vec_const_128bit_type *); extern unsigned constant_generates_xxspltiw (vec_const_128bit_type *); extern unsigned constant_generates_xxspltidp (vec_const_128bit_type *); + +/* From float16.cc. 
*/ +extern void bfloat16_expand_binary_op (enum rtx_code, rtx, rtx, rtx, + rtx, rtx, rtx); #endif /* RTX_CODE */ #ifdef TREE_CODE diff --git a/gcc/config/rs6000/t-rs6000 b/gcc/config/rs6000/t-rs6000 index a5d1c27424f3..c8f19865311c 100644 --- a/gcc/config/rs6000/t-rs6000 +++ b/gcc/config/rs6000/t-rs6000 @@ -87,6 +87,10 @@ rs6000-c.o: $(srcdir)/config/rs6000/rs6000-c.cc rs6000-builtins.h $(COMPILE) $< $(POSTCOMPILE) +float16.o: $(srcdir)/config/rs6000/float16.cc + $(COMPILE) $< + $(POSTCOMPILE) + #$(srcdir)/config/rs6000/fusion.md: $(srcdir)/config/rs6000/genfusion.pl # $(srcdir)/config/rs6000/genfusion.pl > $(srcdir)/config/rs6000/fusion.md