gcc/gcc/config/rs6000/float16.cc

364 lines
9.0 KiB
C++
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/* 16-bit floating point support subroutines for the PowerPC architecture.
Copyright (C) 2002-2025 Free Software Foundation, Inc.
Contributed by Zack Weinberg <zack@codesourcery.com>
and Paolo Bonzini <bonzini@gnu.org>
This file is part of GCC.
GCC is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published
by the Free Software Foundation; either version 3, or (at your
option) any later version.
GCC is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
License for more details.
You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3. If not see
<http://www.gnu.org/licenses/>. */
/* 16-bit floating point support. */
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "rtl.h"
#include "tree.h"
#include "memmodel.h"
#include "tm_p.h"
#include "stringpool.h"
#include "expmed.h"
#include "optabs.h"
#include "regs.h"
#include "insn-attr.h"
#include "flags.h"
#include "attribs.h"
#include "explow.h"
#include "expr.h"
#include "common/common-target.h"
#include "rs6000-internal.h"
/* Expand a bfloat16 scalar floating point operation:

   ICODE:   Operation to perform.
   RESULT:  Result of the operation.
   OP1:     Input operand1.
   OP2:     Input operand2.
   OP3:     Input operand3 or NULL_RTX.
   SUBTYPE: Describe the operation.

   The operation is done as a V4SFmode vector operation.  This is because
   converting a scalar from BFmode to SFmode to do the operation and then
   converting the result back again takes quite a bit of time.  GCC will
   only generate the native operation if -Ofast is used.  The float16.md
   code that calls this function adds various combine operations to do the
   operation in V4SFmode instead of SFmode.  */
void
bfloat16_operation_as_v4sf (enum rtx_code icode,
rtx result,
rtx op1,
rtx op2,
rtx op3,
enum fp16_operation subtype)
{
gcc_assert (can_create_pseudo_p ());
rtx result_v4sf = gen_reg_rtx (V4SFmode);
rtx ops_orig[3] = { op1, op2, op3 };
rtx ops_v4sf[3];
size_t n_opts;
switch (subtype)
{
case FP16_BINARY:
n_opts = 2;
gcc_assert (op3 == NULL_RTX);
break;
case FP16_FMA:
case FP16_FMS:
case FP16_NFMA:
case FP16_NFMS:
gcc_assert (icode == FMA);
n_opts = 3;
break;
case FP16_ABS_BINARY:
case FP16_NEG_BINARY:
default:
gcc_unreachable ();
}
for (size_t i = 0; i < n_opts; i++)
{
rtx op = ops_orig[i];
rtx tmp = ops_v4sf[i] = gen_reg_rtx (V4SFmode);
gcc_assert (op != NULL_RTX);
/* Remove truncation/extend added. */
if (GET_CODE (op) == FLOAT_EXTEND || GET_CODE (op) == FLOAT_TRUNCATE)
op = XEXP (op, 0);
/* Convert operands to V4SFmode format. We use SPLAT for registers to
get the value into the upper 32-bits. We can use XXSPLTW to splat
words instead of VSPLTIH since the XVCVBF16SPN instruction ignores the
odd half-words, and XXSPLTW can operate on all VSX registers instead
of just the Altivec registers. Using SPLAT instead of a shift also
insure that other bits are not a signalling NaN. If we are using
XXSPLTIW or XXSPLTIB to load the constant the other bits are
duplicated. */
if (op == CONST0_RTX (SFmode) || op == CONST0_RTX (BFmode))
emit_move_insn (tmp, CONST0_RTX (V4SFmode));
else if (GET_MODE (op) == BFmode)
{
emit_insn (gen_xxspltw_bf (tmp, force_reg (BFmode, op)));
emit_insn (gen_xvcvbf16spn_bf (tmp, tmp));
}
else if (GET_MODE (op) == SFmode)
{
if (GET_CODE (op) == CONST_DOUBLE)
{
rtvec v = rtvec_alloc (4);
for (size_t i = 0; i < 4; i++)
RTVEC_ELT (v, i) = op;
emit_insn (gen_rtx_SET (tmp,
gen_rtx_CONST_VECTOR (V4SFmode, v)));
}
else
emit_insn (gen_vsx_splat_v4sf (tmp,
force_reg (SFmode, op)));
}
else
gcc_unreachable ();
}
/* Do the operation in V4SFmode. */
switch (subtype)
{
case FP16_BINARY:
emit_insn (gen_rtx_SET (result_v4sf,
gen_rtx_fmt_ee (icode, V4SFmode,
ops_v4sf[0],
ops_v4sf[1])));
break;
case FP16_FMA:
case FP16_FMS:
case FP16_NFMA:
case FP16_NFMS:
{
rtx op1 = ops_v4sf[0];
rtx op2 = ops_v4sf[1];
rtx op3 = ops_v4sf[2];
if (subtype == FP16_FMS || subtype == FP16_NFMS)
op3 = gen_rtx_NEG (V4SFmode, op3);
rtx op_fma = gen_rtx_FMA (V4SFmode, op1, op2, op3);
if (subtype == FP16_NFMA || subtype == FP16_NFMS)
op_fma = gen_rtx_NEG (V4SFmode, op_fma);
emit_insn (gen_rtx_SET (result_v4sf, op_fma));
}
break;
case FP16_ABS_BINARY:
case FP16_NEG_BINARY:
default:
gcc_unreachable ();
}
/* Convert V4SF result back to scalar mode. */
if (GET_MODE (result) == BFmode)
emit_insn (gen_xvcvspbf16_bf (result, result_v4sf));
else if (GET_MODE (result) == SFmode)
{
rtx element = GEN_INT (WORDS_BIG_ENDIAN ? 2 : 3);
emit_insn (gen_vsx_extract_v4sf (result, result_v4sf, element));
}
else
gcc_unreachable ();
}
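/* Expand a 16-bit vector operation:

   ICODE:   Operation to perform.
   RESULT:  Result of the operation.
   OP1:     Input operand1.
   OP2:     Input operand2.
   OP3:     Input operand3 or NULL_RTX.
   SUBTYPE: Describe the operation.

   Each V8HFmode/V8BFmode input is widened into two V4SFmode vectors, the
   operation is done on each pair of V4SFmode vectors, and the two V4SFmode
   results are packed back into a single V8HFmode/V8BFmode result.  */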
void
fp16_vectorization (enum rtx_code icode,
rtx result,
rtx op1,
rtx op2,
rtx op3,
enum fp16_operation subtype)
{
gcc_assert (can_create_pseudo_p ());
enum rtx_code unary_op = UNKNOWN;
machine_mode result_mode = GET_MODE (result);
rtx op_orig[3] = { op1, op2, op3 };
rtx op_hi[3];
rtx op_lo[3];
rtx result_hi;
rtx result_lo;
size_t n_opts;
switch (subtype)
{
case FP16_BINARY:
n_opts = 2;
break;
case FP16_NEG_BINARY:
n_opts = 2;
unary_op = NEG;
break;
case FP16_ABS_BINARY:
n_opts = 2;
unary_op = ABS;
break;
case FP16_FMA:
case FP16_FMS:
case FP16_NFMA:
case FP16_NFMS:
n_opts = 3;
break;
default:
gcc_unreachable ();
}
/* Allocate 2 temporaries for the results and the input operands. */
result_hi = gen_reg_rtx (V4SFmode);
result_lo = gen_reg_rtx (V4SFmode);
for (size_t i = 0; i < n_opts; i++)
{
gcc_assert (op_orig[i] != NULL_RTX);
op_hi[i] = gen_reg_rtx (V4SFmode); /* high register. */
op_lo[i] = gen_reg_rtx (V4SFmode); /* low register. */
rtx interleave_hi = gen_reg_rtx (result_mode);
rtx interleave_lo = gen_reg_rtx (result_mode);
rtx orig = op_orig[i];
rs6000_expand_interleave (interleave_hi, orig, orig, !BYTES_BIG_ENDIAN);
rs6000_expand_interleave (interleave_lo, orig, orig, BYTES_BIG_ENDIAN);
if (result_mode == V8HFmode)
{
emit_insn (gen_xvcvhpsp_v8hf (op_hi[i], interleave_hi));
emit_insn (gen_xvcvhpsp_v8hf (op_lo[i], interleave_lo));
}
else if (result_mode == V8BFmode)
{
emit_insn (gen_xvcvbf16spn_v8bf (op_hi[i], interleave_hi));
emit_insn (gen_xvcvbf16spn_v8bf (op_lo[i], interleave_lo));
}
else
gcc_unreachable ();
}
/* Do 2 sets of V4SFmode operations. */
switch (subtype)
{
case FP16_BINARY:
case FP16_NEG_BINARY:
case FP16_ABS_BINARY:
emit_insn (gen_rtx_SET (result_hi,
gen_rtx_fmt_ee (icode, V4SFmode,
op_hi[0],
op_hi[1])));
emit_insn (gen_rtx_SET (result_lo,
gen_rtx_fmt_ee (icode, V4SFmode,
op_lo[0],
op_lo[1])));
break;
case FP16_FMA:
case FP16_FMS:
case FP16_NFMA:
case FP16_NFMS:
{
rtx op1_hi = op_hi[0];
rtx op2_hi = op_hi[1];
rtx op3_hi = op_hi[2];
rtx op1_lo = op_lo[0];
rtx op2_lo = op_lo[1];
rtx op3_lo = op_lo[2];
if (subtype == FP16_FMS || subtype == FP16_NFMS)
{
op3_hi = gen_rtx_NEG (V4SFmode, op3_hi);
op3_lo = gen_rtx_NEG (V4SFmode, op3_lo);
}
rtx op_fma_hi = gen_rtx_FMA (V4SFmode, op1_hi, op2_hi, op3_hi);
rtx op_fma_lo = gen_rtx_FMA (V4SFmode, op1_lo, op2_lo, op3_lo);
if (subtype == FP16_NFMA || subtype == FP16_NFMS)
{
op_fma_hi = gen_rtx_NEG (V4SFmode, op_fma_hi);
op_fma_lo = gen_rtx_NEG (V4SFmode, op_fma_lo);
}
emit_insn (gen_rtx_SET (result_hi, op_fma_hi));
emit_insn (gen_rtx_SET (result_lo, op_fma_lo));
}
break;
default:
gcc_unreachable ();
}
/* Add any unary operator modifications. */
if (unary_op != UNKNOWN)
{
emit_insn (gen_rtx_SET (result_hi,
gen_rtx_fmt_e (unary_op, V4SFmode, result_hi)));
emit_insn (gen_rtx_SET (result_lo,
gen_rtx_fmt_e (unary_op, V4SFmode, result_lo)));
}
/* Combine the 2 V4SFmode operations into one V8HFmode/V8BFmode vector. */
if (result_mode == V8HFmode)
emit_insn (gen_vec_pack_trunc_v4sf_v8hf (result, result_hi, result_lo));
else if (result_mode == V8BFmode)
emit_insn (gen_vec_pack_trunc_v4sf (result, result_hi, result_lo));
else
gcc_unreachable ();
return;
}