gcc/float16.md at 16967ce96a096dee45545a88bef39a7c44253d8f

39 KiB

Raw Blame History

;; This file is part of GCC.

;; GCC is free software; you can redistribute it and/or modify it ;; under the terms of the GNU General Public License as published ;; by the Free Software Foundation; either version 3, or (at your ;; option) any later version.

;; GCC is distributed in the hope that it will be useful, but WITHOUT ;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ;; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public ;; License for more details.

;; You should have received a copy of the GNU General Public License ;; along with GCC; see the file COPYING3. If not see ;; http://www.gnu.org/licenses/.

;; Support for _Float16 (HFmode) and __bfloat16 (BFmode)

;; Mode iterator for the 16-bit floating point scalar modes.  Each mode
;; is enabled by its own target condition.
(define_mode_iterator FP16 [(BF "TARGET_BFLOAT16")
                            (HF "TARGET_FLOAT16")])

;; Mode iterator for the 16-bit floating point scalar modes on machines
;; with hardware support.
(define_mode_iterator FP16_HW [(BF "TARGET_BFLOAT16_HW")
                               (HF "TARGET_FLOAT16_HW")])

;; Mode iterator for the 16-bit floating point vector modes on machines
;; with hardware support.
(define_mode_iterator VFP16_HW
  [(V8BF "TARGET_BFLOAT16_HW")
   (V8HF "TARGET_FLOAT16_HW")])

;; Mode iterator for floating point modes other than SF/DFmode that we
;; convert to/from the 16-bit floating point modes via DFmode.
(define_mode_iterator fp16_float_convert [TF KF IF SD DD TD])

;; Mode attribute giving the instruction to convert the even
;; V8HFmode or V8BFmode elements to V4SFmode.  The scalar modes map to
;; the same instruction as their vector counterparts.
(define_mode_attr cvt_fp16_to_v4sf_insn [(BF   "xvcvbf16spn")
                                         (HF   "xvcvhpsp")
                                         (V8BF "xvcvbf16spn")
                                         (V8HF "xvcvhpsp")])

;; Mode attribute giving, in upper case, the vector mode for a 16-bit
;; floating point scalar.
(define_mode_attr FP16_VECTOR8 [(BF "V8BF")
                                (HF "V8HF")])

;; Lower case spelling of the vector mode for a 16-bit floating point
;; scalar.
(define_mode_attr fp16_vector8
  [(BF "v8bf")
   (HF "v8hf")])

;; Mode attribute giving the vector mode with 4 16-bit floating point
;; elements given a scalar or 8 element vector.
(define_mode_attr FP16_VECTOR4 [(BF   "V4BF")
                                (HF   "V4HF")
                                (V8BF "V4BF")
                                (V8HF "V4HF")])

;; Binary operators for bfloat16/float16 vectorization.
(define_code_iterator FP16_BINARY_OP [plus minus mult smax smin])

;; Standard names for the unary/binary/ternary operators
(define_code_attr fp16_names [(abs   "abs")
                              (fma   "fma")
                              (plus  "add")
                              (minus "sub")
                              (mult  "mul")
                              (neg   "neg")
                              (smax  "smax")
                              (smin  "smin")])

;; UNSPEC constants
(define_c_enum "unspec"
  [UNSPEC_FP16_SHIFT_LEFT_32BIT
   UNSPEC_CVT_FP16_TO_V4SF
   UNSPEC_XXSPLTW_FP16
   UNSPEC_XVCVSPBF16_BF
   UNSPEC_XVCVSPHP_V8HF
   UNSPEC_XVCVSPBF16_V8BF])

;; _Float16 and __bfloat16 moves
(define_expand "mov<mode>"
  [(set (match_operand:FP16 0 "nonimmediate_operand")
        (match_operand:FP16 1 "any_operand"))]
  ""
{
  /* When storing to memory, anything that is not already a register is
     first forced into one.  */
  if (MEM_P (operands[0]) && !REG_P (operands[1]))
    operands[1] = force_reg (<MODE>mode, operands[1]);
})

;; On power10, we can load up HFmode and BFmode constants with xxspltiw
;; or pli.  Zero is special cased to the non-prefixed xxspltib/li forms.
(define_insn "*mov<mode>_xxspltiw"
  [(set (match_operand:FP16 0 "gpc_reg_operand" "=wa,wa,?r,?r")
        (match_operand:FP16 1 "fp16_xxspltiw_constant" "j,eP,j,eP"))]
  "TARGET_POWER10 && TARGET_PREFIXED"
{
  rtx op1 = operands[1];
  const REAL_VALUE_TYPE *rtype = CONST_DOUBLE_REAL_VALUE (op1);
  long real_words[1];

  if (op1 == CONST0_RTX (<MODE>mode))
    return (vsx_register_operand (operands[0], <MODE>mode)
            ? "xxspltib %x0,0"
            : "li %0,0");

  /* Pass the target bit image of the constant as an extra operand so it
     can be printed as the immediate.  */
  real_to_target (real_words, rtype, <MODE>mode);
  operands[2] = GEN_INT (real_words[0]);
  return (vsx_register_operand (operands[0], <MODE>mode)
          ? "xxspltiw %x0,%2"
          : "pli %0,%2");
}
  [(set_attr "type" "vecsimple,vecsimple,*,*")
   (set_attr "prefixed" "no,yes,no,yes")])

;; General 16-bit floating point move: VSX register, GPR and memory
;; combinations plus loading zero.  At least one side must be a register.
(define_insn "*mov<mode>_internal"
  [(set (match_operand:FP16 0 "nonimmediate_operand"
                "=wa,       wa,       Z,         r,          r,
                  m,        r,        wa,        wa,         r")

        (match_operand:FP16 1 "any_operand"
                "wa,        Z,        wa,        r,          m,
                 r,         wa,       r,         j,          j"))]
  "gpc_reg_operand (operands[0], <MODE>mode)
   || gpc_reg_operand (operands[1], <MODE>mode)"
  "@
   xxlor %x0,%x1,%x1
   lxsihzx %x0,%y1
   stxsihx %x1,%y0
   mr %0,%1
   lhz%U1%X1 %0,%1
   sth%U0%X0 %1,%0
   mfvsrwz %0,%x1
   mtvsrwz %x0,%1
   xxspltib %x0,0
   li %0,0"
  ;; The 7th alternative is mfvsrwz and the 8th is mtvsrwz, so the types
  ;; are listed in that order.
  [(set_attr "type" "vecsimple, fpload, fpstore, *,     load,
                     store,     mfvsr,  mtvsr,   vecsimple, *")
   (set_attr "isa"  "*,         p9v,    p9v,     *,     *,
                     *,         p8v,    p8v,     p9v,   *")])

;; Vector duplicate
(define_insn "*vecdup"
  [(set (match_operand:<FP16_VECTOR8> 0 "altivec_register_operand" "=v")
        (vec_duplicate:<FP16_VECTOR8>
         (match_operand:FP16 1 "altivec_register_operand" "v")))]
  ""
  "vsplth %0,%1,3"
  [(set_attr "type" "vecperm")])

;; Convert IEEE 16-bit floating point to/from other floating point modes.

;; Extend HFmode to SFmode/DFmode with a single xscvhpdp.
(define_insn "extendhf<mode>2"
  [(set (match_operand:SFDF 0 "vsx_register_operand" "=wa")
        (float_extend:SFDF
         (match_operand:HF 1 "vsx_register_operand" "wa")))]
  "TARGET_FLOAT16_HW"
  "xscvhpdp %x0,%x1"
  [(set_attr "type" "fpsimple")])

;; Truncate SFmode/DFmode to HFmode with a single xscvdphp.
(define_insn "trunc<mode>hf2"
  [(set (match_operand:HF 0 "vsx_register_operand" "=wa")
        (float_truncate:HF
         (match_operand:SFDF 1 "vsx_register_operand" "wa")))]
  "TARGET_FLOAT16_HW"
  "xscvdphp %x0,%x1"
  [(set_attr "type" "fpsimple")])

;; Convert BFmode to SFmode/DFmode.
;; 3 instructions are generated:
;;      XXSLDWI     -- shift the BFmode element into the upper 32 bits
;;      XVCVBF16SPN -- convert even BFmode elements to SFmode
;;      XSCVSPDPN   -- convert memory format of SFmode to DFmode.
(define_insn_and_split "extendbf<mode>2"
  [(set (match_operand:SFDF 0 "vsx_register_operand" "=wa")
        (float_extend:SFDF
         (match_operand:BF 1 "vsx_register_operand" "v")))
   (clobber (match_scratch:V8BF 2 "=v"))]
  "TARGET_BFLOAT16_HW"
  "#"
  "&& 1"
  [(pc)]
{
  rtx op0 = operands[0];
  rtx op1 = operands[1];
  rtx op2_v8bf = operands[2];

  if (GET_CODE (op2_v8bf) == SCRATCH)
    op2_v8bf = gen_reg_rtx (V8BFmode);

  /* The same temporary is viewed as V4SFmode once converted.  */
  rtx op2_v4sf = gen_lowpart (V4SFmode, op2_v8bf);

  /* XXSLDWI -- shift BFmode element into the upper 32 bits.  */
  emit_insn (gen_v8bf_shift_left_32bit (op2_v8bf, op1));

  /* XVCVBF16SPN -- convert even V8BFmode elements to V4SFmode.  */
  emit_insn (gen_cvt_fp16_to_v4sf_v8bf (op2_v4sf, op2_v8bf));

  /* XSCVSPDPN -- convert single V4SFmode element to SFmode/DFmode.  */
  emit_insn (GET_MODE (op0) == SFmode
             ? gen_vsx_xscvspdpn_sf (op0, op2_v4sf)
             : gen_vsx_xscvspdpn (op0, op2_v4sf));

  DONE;
}
  [(set_attr "type" "fpsimple")
   (set_attr "length" "12")])

;; Emit xscvdpsp on an SFmode scalar, giving a V4SFmode result.
(define_insn "vsx_xscvdpsp_sf"
  [(set (match_operand:V4SF 0 "vsx_register_operand" "=f,?wa")
        (unspec:V4SF
         [(match_operand:SF 1 "vsx_register_operand" "f,wa")]
         UNSPEC_VSX_CVSPDP))]
  "VECTOR_UNIT_VSX_P (DFmode)"
  "xscvdpsp %x0,%x1"
  [(set_attr "type" "fp")])

;; Vector shift left by 32 bits to get the 16-bit floating point value
;; into the upper 32 bits for the conversion.
(define_insn "<fp16_vector8>_shift_left_32bit"
  [(set (match_operand:<FP16_VECTOR8> 0 "vsx_register_operand" "=wa")
        (unspec:<FP16_VECTOR8>
         [(match_operand:FP16_HW 1 "vsx_register_operand" "wa")]
         UNSPEC_FP16_SHIFT_LEFT_32BIT))]
  ""
  "xxsldwi %x0,%x1,%x1,1"
  [(set_attr "type" "vecperm")])

;; Convert SFmode/DFmode to BFmode.
;; 2 instructions are generated:
;;      XSCVDPSPN  -- convert SFmode/DFmode scalar to V4SFmode
;;      XVCVSPBF16 -- convert V4SFmode to even V8BFmode

(define_insn_and_split "trunc<mode>bf2"
  [(set (match_operand:BF 0 "vsx_register_operand" "=wa")
        (float_truncate:BF
         (match_operand:SFDF 1 "vsx_register_operand" "wa")))
   (clobber (match_scratch:V4SF 2 "=wa"))]
  "TARGET_BFLOAT16_HW"
  "#"
  "&& 1"
  [(pc)]
{
  rtx op0 = operands[0];
  rtx op1 = operands[1];
  rtx op2 = operands[2];

  if (GET_CODE (op2) == SCRATCH)
    op2 = gen_reg_rtx (V4SFmode);

  emit_insn (GET_MODE (op1) == SFmode
             ? gen_vsx_xscvdpspn_sf (op2, op1)
             : gen_vsx_xscvdpspn (op2, op1));

  emit_insn (gen_xvcvspbf16_bf (op0, op2));
  DONE;
}
  ;; Two 4-byte instructions are emitted by the split.
  [(set_attr "type" "fpsimple")
   (set_attr "length" "8")])

;; Emit xscvdpspn on an SFmode scalar, giving a V4SFmode result.
(define_insn "vsx_xscvdpspn_sf"
  [(set (match_operand:V4SF 0 "vsx_register_operand" "=wa")
        (unspec:V4SF
         [(match_operand:SF 1 "vsx_register_operand" "wa")]
         UNSPEC_VSX_CVDPSPN))]
  "TARGET_XSCVDPSPN"
  "xscvdpspn %x0,%x1"
  [(set_attr "type" "fp")])

;; Convert between HFmode/BFmode and 128-bit binary floating point and
;; decimal floating point types.  We use convert_move since some of the
;; types might not have valid RTX expanders.  We use DFmode as the
;; intermediate conversion destination.

;; Extend a 16-bit floating point value to one of the fp16_float_convert
;; modes: first extend to DFmode, then let convert_move finish the job.
(define_expand "extend<FP16_HW:mode><fp16_float_convert:mode>2"
  [(set (match_operand:fp16_float_convert 0 "vsx_register_operand")
        (float_extend:fp16_float_convert
         (match_operand:FP16_HW 1 "vsx_register_operand")))]
  ""
{
  rtx via_df = gen_reg_rtx (DFmode);

  emit_insn (gen_extend<FP16_HW:mode>df2 (via_df, operands[1]));
  convert_move (operands[0], via_df, 0);
  DONE;
})

;; Truncate one of the fp16_float_convert modes to a 16-bit floating
;; point value: convert_move to DFmode first, then truncate from DFmode.
(define_expand "trunc<fp16_float_convert:mode><FP16_HW:mode>2"
  [(set (match_operand:FP16_HW 0 "vsx_register_operand")
        (float_truncate:FP16_HW
         (match_operand:fp16_float_convert 1 "vsx_register_operand")))]
  ""
{
  rtx via_df = gen_reg_rtx (DFmode);

  convert_move (via_df, operands[1], 0);
  emit_insn (gen_truncdf<FP16_HW:mode>2 (operands[0], via_df));
  DONE;
})

;; Convert integers to 16-bit floating point modes.  The integer is
;; converted to DFmode first and the DFmode value is then truncated.
(define_expand "float<GPR:mode><FP16_HW:mode>2"
  [(set (match_operand:FP16_HW 0 "vsx_register_operand")
        (float:FP16_HW
         (match_operand:GPR 1 "nonimmediate_operand")))]
  ""
{
  rtx df_tmp = gen_reg_rtx (DFmode);
  emit_insn (gen_float<GPR:mode>df2 (df_tmp, operands[1]));
  emit_insn (gen_truncdf<FP16_HW:mode>2 (operands[0], df_tmp));
  DONE;
})

;; Convert unsigned integers to 16-bit floating point modes, going
;; through DFmode.
(define_expand "floatuns<GPR:mode><FP16_HW:mode>2"
  [(set (match_operand:FP16_HW 0 "vsx_register_operand")
        (unsigned_float:FP16_HW
         (match_operand:GPR 1 "nonimmediate_operand")))]
  ""
{
  rtx df_tmp = gen_reg_rtx (DFmode);
  emit_insn (gen_floatuns<GPR:mode>df2 (df_tmp, operands[1]));
  emit_insn (gen_truncdf<FP16_HW:mode>2 (operands[0], df_tmp));
  DONE;
})

;; Convert 16-bit floating point modes to integers.  The value is
;; extended to DFmode first and the DFmode value is then converted.
(define_expand "fix_trunc<FP16_HW:mode><GPR:mode>2"
  [(set (match_operand:GPR 0 "vsx_register_operand")
        (fix:GPR
         (match_operand:FP16_HW 1 "vsx_register_operand")))]
  ""
{
  rtx df_tmp = gen_reg_rtx (DFmode);
  emit_insn (gen_extend<FP16_HW:mode>df2 (df_tmp, operands[1]));
  emit_insn (gen_fix_truncdf<GPR:mode>2 (operands[0], df_tmp));
  DONE;
})

;; Convert 16-bit floating point modes to unsigned integers, going
;; through DFmode.
(define_expand "fixuns_trunc<FP16_HW:mode><GPR:mode>2"
  [(set (match_operand:GPR 0 "vsx_register_operand")
        (unsigned_fix:GPR
         (match_operand:FP16_HW 1 "vsx_register_operand")))]
  ""
{
  rtx df_tmp = gen_reg_rtx (DFmode);
  emit_insn (gen_extend<FP16_HW:mode>df2 (df_tmp, operands[1]));
  emit_insn (gen_fixuns_truncdf<GPR:mode>2 (operands[0], df_tmp));
  DONE;
})

;; Convert the even elements of a vector 16-bit floating point to ;; V4SFmode. Deal with little endian vs. big endian element ordering ;; in identifying which elements are converted.

;; Expander that fills in the element numbers being converted: 0/2/4/6
;; when WORDS_BIG_ENDIAN, otherwise 1/3/5/7.
(define_expand "cvt_fp16_to_v4sf_<mode>"
  [(set (match_operand:V4SF 0 "vsx_register_operand")
        (float_extend:V4SF
         (vec_select:<FP16_VECTOR4>
          (match_operand:VFP16_HW 1 "vsx_register_operand")
          (parallel [(match_dup 2)
                     (match_dup 3)
                     (match_dup 4)
                     (match_dup 5)]))))]
  ""
{
  int endian_adjust = WORDS_BIG_ENDIAN ? 0 : 1;
  operands[2] = GEN_INT (0 + endian_adjust);
  operands[3] = GEN_INT (2 + endian_adjust);
  operands[4] = GEN_INT (4 + endian_adjust);
  operands[5] = GEN_INT (6 + endian_adjust);
})

;; Little endian form: elements 1/3/5/7 are selected.  The template
;; prints both operands with %x, so they are constrained to VSX
;; registers.
(define_insn "*cvt_fp16_to_v4sf_<mode>_le"
  [(set (match_operand:V4SF 0 "vsx_register_operand" "=wa")
        (float_extend:V4SF
         (vec_select:<FP16_VECTOR4>
          (match_operand:VFP16_HW 1 "vsx_register_operand" "wa")
          (parallel [(const_int 1)
                     (const_int 3)
                     (const_int 5)
                     (const_int 7)]))))]
  "!WORDS_BIG_ENDIAN"
  "<cvt_fp16_to_v4sf_insn> %x0,%x1"
  [(set_attr "type" "vecfloat")])

;; Big endian form: elements 0/2/4/6 are selected.  The template prints
;; both operands with %x, so they are constrained to VSX registers.
(define_insn "*cvt_fp16_to_v4sf_<mode>_be"
  [(set (match_operand:V4SF 0 "vsx_register_operand" "=wa")
        (float_extend:V4SF
         (vec_select:<FP16_VECTOR4>
          (match_operand:VFP16_HW 1 "vsx_register_operand" "wa")
          (parallel [(const_int 0)
                     (const_int 2)
                     (const_int 4)
                     (const_int 6)]))))]
  "WORDS_BIG_ENDIAN"
  "<cvt_fp16_to_v4sf_insn> %x0,%x1"
  [(set_attr "type" "vecfloat")])

;; Duplicate and convert a 16-bit floating point scalar to V4SFmode.

;; Split into a word splat of the scalar followed by the vector
;; conversion to V4SFmode.  The destination register is reused as the
;; 16-bit vector input of the conversion.
(define_insn_and_split "*dup_<mode>_to_v4sf"
  [(set (match_operand:V4SF 0 "vsx_register_operand" "=wa")
        (vec_duplicate:V4SF
         (float_extend:SF
          (match_operand:FP16_HW 1 "vsx_register_operand" "wa"))))]
  ""
  "#"
  "&& 1"
  [(pc)]
{
  rtx op0 = operands[0];
  rtx op1 = operands[1];
  rtx op0_vfp16 = gen_lowpart (<FP16_VECTOR8>mode, op0);

  emit_insn (gen_xxspltw_<mode> (op0, op1));
  emit_insn (gen_cvt_fp16_to_v4sf_<fp16_vector8> (op0, op0_vfp16));
  DONE;
}
  [(set_attr "length" "8")
   (set_attr "type" "vecperm")])

;; Optimize __bfloat16 binary operations.  Unlike _Float16 where we
;; have instructions to convert between HFmode and SFmode as scalar
;; values, with BFmode, we only have vector conversions.  Thus to do:
;;
;;      __bfloat16 a, b, c;
;;      a = b + c;
;;
;; the GCC compiler would normally generate:
;;
;;      lxsihzx 0,4,2           // load __bfloat16 value b
;;      lxsihzx 12,5,2          // load __bfloat16 value c
;;      xxsldwi 0,0,0,1         // shift b into bits 16..31
;;      xxsldwi 12,12,12,1      // shift c into bits 16..31
;;      xvcvbf16spn 0,0         // vector convert b into V4SFmode
;;      xvcvbf16spn 12,12       // vector convert c into V4SFmode
;;      xscvspdpn 0,0           // convert b into SFmode scalar
;;      xscvspdpn 12,12         // convert c into SFmode scalar
;;      fadds 0,0,12            // add b+c
;;      xscvdpspn 0,0           // convert b+c into SFmode memory format
;;      xvcvspbf16 0,0          // convert b+c into BFmode memory format
;;      stxsihx 0,3,2           // store b+c
;;
;; Using the following combiner patterns, the code generated would now
;; be:
;;
;;      lxsihzx 12,4,2          // load __bfloat16 value b
;;      lxsihzx 0,5,2           // load __bfloat16 value c
;;      xxspltw 12,12,1         // shift b into bits 16..31
;;      xxspltw 0,0,1           // shift c into bits 16..31
;;      xvcvbf16spn 12,12       // vector convert b into V4SFmode
;;      xvcvbf16spn 0,0         // vector convert c into V4SFmode
;;      xvaddsp 0,0,12          // vector b+c in V4SFmode
;;      xvcvspbf16 0,0          // convert b+c into BFmode memory format
;;      stxsihx 0,3,2           // store b+c
;;
;; We cannot just define insns like 'addbf3' to keep the operation as
;; BFmode because GCC will not generate these patterns unless the user
;; uses -Ofast.  Without -Ofast, it will always convert BFmode into
;; SFmode.

;; SFmode binary operation where at least one input satisfies
;; bfloat16_bf_operand.  Always split; bfloat16_operation_as_v4sf emits
;; the replacement code.
(define_insn_and_split "*bfloat16_binary_op_internal1"
  [(set (match_operand:SF 0 "vsx_register_operand")
        (match_operator:SF 1 "fp16_binary_operator"
          [(match_operand:SF 2 "bfloat16_v4sf_operand")
           (match_operand:SF 3 "bfloat16_v4sf_operand")]))]
  "TARGET_BFLOAT16_HW && can_create_pseudo_p ()
   && (bfloat16_bf_operand (operands[2], SFmode)
       || bfloat16_bf_operand (operands[3], SFmode))"
  "#"
  "&& 1"
  [(pc)]
{
  bfloat16_operation_as_v4sf (GET_CODE (operands[1]), operands[0],
                              operands[2], operands[3], NULL_RTX,
                              FP16_BINARY);
  DONE;
})

;; Same binary operation, but with the SFmode result truncated to
;; BFmode.
(define_insn_and_split "*bfloat16_binary_op_internal2"
  [(set (match_operand:BF 0 "vsx_register_operand")
        (float_truncate:BF
         (match_operator:SF 1 "fp16_binary_operator"
           [(match_operand:SF 2 "bfloat16_v4sf_operand")
            (match_operand:SF 3 "bfloat16_v4sf_operand")])))]
  "TARGET_BFLOAT16_HW && can_create_pseudo_p ()
   && (bfloat16_bf_operand (operands[2], SFmode)
       || bfloat16_bf_operand (operands[3], SFmode))"
  "#"
  "&& 1"
  [(pc)]
{
  bfloat16_operation_as_v4sf (GET_CODE (operands[1]), operands[0],
                              operands[2], operands[3], NULL_RTX,
                              FP16_BINARY);
  DONE;
})

;; SFmode fused multiply-add where at least two of the three inputs
;; satisfy bfloat16_bf_operand.
(define_insn_and_split "*bfloat16_fma_internal1"
  [(set (match_operand:SF 0 "vsx_register_operand")
        (fma:SF
         (match_operand:SF 1 "bfloat16_v4sf_operand")
         (match_operand:SF 2 "bfloat16_v4sf_operand")
         (match_operand:SF 3 "bfloat16_v4sf_operand")))]
  "TARGET_BFLOAT16_HW && can_create_pseudo_p ()
   && (bfloat16_bf_operand (operands[1], SFmode)
       + bfloat16_bf_operand (operands[2], SFmode)
       + bfloat16_bf_operand (operands[3], SFmode) >= 2)"
  "#"
  "&& 1"
  [(pc)]
{
  bfloat16_operation_as_v4sf (FMA, operands[0], operands[1],
                              operands[2], operands[3], FP16_FMA);
  DONE;
})

;; Fused multiply-add whose SFmode result is truncated to BFmode.
(define_insn_and_split "*bfloat16_fma_internal2"
  [(set (match_operand:BF 0 "vsx_register_operand" "=wa")
        (float_truncate:BF
         (fma:SF
          (match_operand:SF 1 "bfloat16_v4sf_operand")
          (match_operand:SF 2 "bfloat16_v4sf_operand")
          (match_operand:SF 3 "bfloat16_v4sf_operand"))))]
  "TARGET_BFLOAT16_HW && can_create_pseudo_p ()
   && (bfloat16_bf_operand (operands[1], SFmode)
       + bfloat16_bf_operand (operands[2], SFmode)
       + bfloat16_bf_operand (operands[3], SFmode) >= 2)"
  "#"
  "&& 1"
  [(pc)]
{
  bfloat16_operation_as_v4sf (FMA, operands[0], operands[1],
                              operands[2], operands[3], FP16_FMA);
  DONE;
})

;; SFmode fused multiply-subtract (the addend is negated) where at least
;; two of the three inputs satisfy bfloat16_bf_operand.
(define_insn_and_split "*bfloat16_fms_internal1"
  [(set (match_operand:SF 0 "vsx_register_operand")
        (fma:SF
         (match_operand:SF 1 "bfloat16_v4sf_operand")
         (match_operand:SF 2 "bfloat16_v4sf_operand")
         (neg:SF
          (match_operand:SF 3 "bfloat16_v4sf_operand"))))]
  "TARGET_BFLOAT16_HW && can_create_pseudo_p ()
   && (bfloat16_bf_operand (operands[1], SFmode)
       + bfloat16_bf_operand (operands[2], SFmode)
       + bfloat16_bf_operand (operands[3], SFmode) >= 2)"
  "#"
  "&& 1"
  [(pc)]
{
  bfloat16_operation_as_v4sf (FMA, operands[0], operands[1],
                              operands[2], operands[3], FP16_FMS);
  DONE;
})

;; Fused multiply-subtract whose SFmode result is truncated to BFmode.
(define_insn_and_split "*bfloat16_fms_internal2"
  [(set (match_operand:BF 0 "vsx_register_operand")
        (float_truncate:BF
         (fma:SF
          (match_operand:SF 1 "bfloat16_v4sf_operand")
          (match_operand:SF 2 "bfloat16_v4sf_operand")
          (neg:SF
           (match_operand:SF 3 "bfloat16_v4sf_operand")))))]
  "TARGET_BFLOAT16_HW && can_create_pseudo_p ()
   && (bfloat16_bf_operand (operands[1], SFmode)
       + bfloat16_bf_operand (operands[2], SFmode)
       + bfloat16_bf_operand (operands[3], SFmode) >= 2)"
  "#"
  "&& 1"
  [(pc)]
{
  bfloat16_operation_as_v4sf (FMA, operands[0], operands[1],
                              operands[2], operands[3], FP16_FMS);
  DONE;
})

;; Negated SFmode fused multiply-add where at least two of the three
;; inputs satisfy bfloat16_bf_operand.
(define_insn_and_split "*bfloat16_nfma_internal1"
  [(set (match_operand:SF 0 "vsx_register_operand")
        (neg:SF
         (fma:SF
          (match_operand:SF 1 "bfloat16_v4sf_operand")
          (match_operand:SF 2 "bfloat16_v4sf_operand")
          (match_operand:SF 3 "bfloat16_v4sf_operand"))))]
  "TARGET_BFLOAT16_HW && can_create_pseudo_p ()
   && (bfloat16_bf_operand (operands[1], SFmode)
       + bfloat16_bf_operand (operands[2], SFmode)
       + bfloat16_bf_operand (operands[3], SFmode) >= 2)"
  "#"
  "&& 1"
  [(pc)]
{
  bfloat16_operation_as_v4sf (FMA, operands[0], operands[1],
                              operands[2], operands[3], FP16_NFMA);
  DONE;
})

;; Negated fused multiply-add, negated in SFmode and then truncated to
;; BFmode.
(define_insn_and_split "*bfloat16_nfma_internal2"
  [(set (match_operand:BF 0 "vsx_register_operand" "=wa")
        (float_truncate:BF
         (neg:SF
          (fma:SF
           (match_operand:SF 1 "bfloat16_v4sf_operand")
           (match_operand:SF 2 "bfloat16_v4sf_operand")
           (match_operand:SF 3 "bfloat16_v4sf_operand")))))]
  "TARGET_BFLOAT16_HW && can_create_pseudo_p ()
   && (bfloat16_bf_operand (operands[1], SFmode)
       + bfloat16_bf_operand (operands[2], SFmode)
       + bfloat16_bf_operand (operands[3], SFmode) >= 2)"
  "#"
  "&& 1"
  [(pc)]
{
  bfloat16_operation_as_v4sf (FMA, operands[0], operands[1],
                              operands[2], operands[3], FP16_NFMA);
  DONE;
})

;; Negated fused multiply-add, truncated to BFmode and then negated in
;; BFmode.
(define_insn_and_split "*bfloat16_nfma_internal3"
  [(set (match_operand:BF 0 "vsx_register_operand" "=wa")
        (neg:BF
         (float_truncate:BF
          (fma:SF
           (match_operand:SF 1 "bfloat16_v4sf_operand")
           (match_operand:SF 2 "bfloat16_v4sf_operand")
           (match_operand:SF 3 "bfloat16_v4sf_operand")))))]
  "TARGET_BFLOAT16_HW && can_create_pseudo_p ()
   && (bfloat16_bf_operand (operands[1], SFmode)
       + bfloat16_bf_operand (operands[2], SFmode)
       + bfloat16_bf_operand (operands[3], SFmode) >= 2)"
  "#"
  "&& 1"
  [(pc)]
{
  bfloat16_operation_as_v4sf (FMA, operands[0], operands[1],
                              operands[2], operands[3], FP16_NFMA);
  DONE;
})

;; Negated SFmode fused multiply-subtract where at least two of the
;; three inputs satisfy bfloat16_bf_operand.
(define_insn_and_split "*bfloat16_nfms_internal1"
  [(set (match_operand:SF 0 "vsx_register_operand")
        (neg:SF
         (fma:SF
          (match_operand:SF 1 "bfloat16_v4sf_operand")
          (match_operand:SF 2 "bfloat16_v4sf_operand")
          (neg:SF
           (match_operand:SF 3 "bfloat16_v4sf_operand")))))]
  "TARGET_BFLOAT16_HW && can_create_pseudo_p ()
   && (bfloat16_bf_operand (operands[1], SFmode)
       + bfloat16_bf_operand (operands[2], SFmode)
       + bfloat16_bf_operand (operands[3], SFmode) >= 2)"
  "#"
  "&& 1"
  [(pc)]
{
  bfloat16_operation_as_v4sf (FMA, operands[0], operands[1],
                              operands[2], operands[3], FP16_NFMS);
  DONE;
})

;; Negated fused multiply-subtract, negated in SFmode and then truncated
;; to BFmode.
(define_insn_and_split "*bfloat16_nfms_internal2"
  [(set (match_operand:BF 0 "vsx_register_operand")
        (float_truncate:BF
         (neg:SF
          (fma:SF
           (match_operand:SF 1 "bfloat16_v4sf_operand")
           (match_operand:SF 2 "bfloat16_v4sf_operand")
           (neg:SF
            (match_operand:SF 3 "bfloat16_v4sf_operand"))))))]
  "TARGET_BFLOAT16_HW && can_create_pseudo_p ()
   && (bfloat16_bf_operand (operands[1], SFmode)
       + bfloat16_bf_operand (operands[2], SFmode)
       + bfloat16_bf_operand (operands[3], SFmode) >= 2)"
  "#"
  "&& 1"
  [(pc)]
{
  bfloat16_operation_as_v4sf (FMA, operands[0], operands[1],
                              operands[2], operands[3], FP16_NFMS);
  DONE;
})

;; Negated fused multiply-subtract, truncated to BFmode and then negated
;; in BFmode.
(define_insn_and_split "*bfloat16_nfms_internal3"
  [(set (match_operand:BF 0 "vsx_register_operand")
        (neg:BF
         (float_truncate:BF
          (fma:SF
           (match_operand:SF 1 "bfloat16_v4sf_operand")
           (match_operand:SF 2 "bfloat16_v4sf_operand")
           (neg:SF
            (match_operand:SF 3 "bfloat16_v4sf_operand"))))))]
  "TARGET_BFLOAT16_HW && can_create_pseudo_p ()
   && (bfloat16_bf_operand (operands[1], SFmode)
       + bfloat16_bf_operand (operands[2], SFmode)
       + bfloat16_bf_operand (operands[3], SFmode) >= 2)"
  "#"
  "&& 1"
  [(pc)]
{
  bfloat16_operation_as_v4sf (FMA, operands[0], operands[1],
                              operands[2], operands[3], FP16_NFMS);
  DONE;
})

;; Add vectorization support for 16-bit floating point.

;; Negate vector bfloat16/float16.  Split into a load of a vector whose
;; 8 elements are all -0.0, followed by an XOR with that vector.
(define_insn_and_split "neg<mode>2"
  [(set (match_operand:VFP16_HW 0 "vsx_register_operand" "=wa")
        (neg:VFP16_HW
         (match_operand:VFP16_HW 1 "vsx_register_operand" "wa")))
   (clobber (match_scratch:VFP16_HW 2 "=&wa"))]
  ""
  "#"
  "&& 1"
  [(set (match_dup 2)
        (match_dup 3))
   (set (match_dup 0)
        (xor:VFP16_HW (match_dup 1)
                      (match_dup 2)))]
{
  if (GET_CODE (operands[2]) == SCRATCH)
    operands[2] = gen_reg_rtx (<MODE>mode);

  REAL_VALUE_TYPE dconst;

  /* Do the conversion outside of gcc_assert; the assertion's argument is
     not evaluated in every build configuration.  */
  int status = real_from_string (&dconst, "-0.0");
  gcc_assert (status == 0);

  rtx neg0 = const_double_from_real_value (dconst, <VEC_base>mode);
  rtvec v = rtvec_alloc (8);

  for (size_t i = 0; i < 8; i++)
    RTVEC_ELT (v, i) = neg0;

  rtx vneg0 = force_const_mem (<MODE>mode,
                               gen_rtx_CONST_VECTOR (<MODE>mode, v));

  operands[3] = vneg0;
}
  [(set_attr "type" "veclogical")
   (set_attr "length" "16")])

;; XOR used to negate a 16-bit floating point type

;; Vector XOR on the 16-bit floating point vector modes.
(define_insn "*xor3"
  [(set (match_operand:VFP16_HW 0 "vsx_register_operand" "=wa")
        (xor:VFP16_HW
         (match_operand:VFP16_HW 1 "vsx_register_operand" "wa")
         (match_operand:VFP16_HW 2 "vsx_register_operand" "wa")))]
  ""
  "xxlxor %x0,%x1,%x2"
  [(set_attr "type" "veclogical")])

;; 16-bit floating point vector absolute value

;; Split into a load of a vector whose 8 elements are all -0.0, followed
;; by an AND with the complement of that vector.
(define_insn_and_split "abs<mode>2"
  [(set (match_operand:VFP16_HW 0 "vsx_register_operand" "=wa")
        (abs:VFP16_HW
         (match_operand:VFP16_HW 1 "vsx_register_operand" "wa")))
   (clobber (match_scratch:VFP16_HW 2 "=&wa"))]
  ""
  "#"
  "&& 1"
  [(set (match_dup 2)
        (match_dup 3))
   (set (match_dup 0)
        (and:VFP16_HW (match_dup 1)
                      (not:VFP16_HW (match_dup 2))))]
{
  if (GET_CODE (operands[2]) == SCRATCH)
    operands[2] = gen_reg_rtx (<MODE>mode);

  REAL_VALUE_TYPE dconst;

  /* Do the conversion outside of gcc_assert; the assertion's argument is
     not evaluated in every build configuration.  */
  int status = real_from_string (&dconst, "-0.0");
  gcc_assert (status == 0);

  rtx neg0 = const_double_from_real_value (dconst, <VEC_base>mode);
  rtvec v = rtvec_alloc (8);

  for (size_t i = 0; i < 8; i++)
    RTVEC_ELT (v, i) = neg0;

  rtx vneg0 = force_const_mem (<MODE>mode,
                               gen_rtx_CONST_VECTOR (<MODE>mode, v));

  operands[3] = vneg0;
}
  [(set_attr "type" "veclogical")
   (set_attr "length" "16")])

;; ANDC used to clear the sign bit of a 16-bit floating point type ;; for absolute value.

;; Vector AND with complement on the 16-bit floating point vector modes.
(define_insn "*andc3"
  [(set (match_operand:VFP16_HW 0 "gpc_reg_operand" "=wa")
        (and:VFP16_HW
         (match_operand:VFP16_HW 1 "gpc_reg_operand" "wa")
         (not:VFP16_HW
          (match_operand:VFP16_HW 2 "gpc_reg_operand" "wa"))))]
  ""
  "xxlandc %x0,%x1,%x2"
  [(set_attr "type" "veclogical")])

;; Binary operators being vectorized.  Always split before register
;; allocation; fp16_vectorization emits the replacement code.
(define_insn_and_split "<fp16_names><mode>3"
  [(set (match_operand:VFP16_HW 0 "vsx_register_operand")
        (FP16_BINARY_OP:VFP16_HW
         (match_operand:VFP16_HW 1 "vsx_register_operand")
         (match_operand:VFP16_HW 2 "vsx_register_operand")))]
  "can_create_pseudo_p ()"
  "#"
  "&& 1"
  [(pc)]
{
  fp16_vectorization (<CODE>, operands[0], operands[1], operands[2],
                      NULL_RTX, FP16_BINARY);
  DONE;
})


;; Negative of binary operators being vectorized.
(define_insn_and_split "*neg_<fp16_names><mode>3"
  [(set (match_operand:VFP16_HW 0 "vsx_register_operand")
        (neg:VFP16_HW
         (FP16_BINARY_OP:VFP16_HW
          (match_operand:VFP16_HW 1 "vsx_register_operand")
          (match_operand:VFP16_HW 2 "vsx_register_operand"))))]
  "can_create_pseudo_p ()"
  "#"
  "&& 1"
  [(pc)]
{
  fp16_vectorization (<CODE>, operands[0], operands[1], operands[2],
                      NULL_RTX, FP16_NEG_BINARY);
  DONE;
})

;; Absolute value of binary operators being vectorized.
(define_insn_and_split "*abs_<fp16_names><mode>3"
  [(set (match_operand:VFP16_HW 0 "vsx_register_operand")
        (abs:VFP16_HW
         (FP16_BINARY_OP:VFP16_HW
          (match_operand:VFP16_HW 1 "vsx_register_operand")
          (match_operand:VFP16_HW 2 "vsx_register_operand"))))]
  "can_create_pseudo_p ()"
  "#"
  "&& 1"
  [(pc)]
{
  fp16_vectorization (<CODE>, operands[0], operands[1], operands[2],
                      NULL_RTX, FP16_ABS_BINARY);
  DONE;
})

;; FMA operations being vectorized.
(define_insn_and_split "fma<mode>4"
  [(set (match_operand:VFP16_HW 0 "vsx_register_operand")
        (fma:VFP16_HW
         (match_operand:VFP16_HW 1 "vsx_register_operand")
         (match_operand:VFP16_HW 2 "vsx_register_operand")
         (match_operand:VFP16_HW 3 "vsx_register_operand")))]
  "can_create_pseudo_p ()"
  "#"
  "&& 1"
  [(pc)]
{
  fp16_vectorization (FMA, operands[0], operands[1], operands[2],
                      operands[3], FP16_FMA);
  DONE;
})
;; Vector fused multiply-subtract: the addend is negated.
(define_insn_and_split "*fms4"
  [(set (match_operand:VFP16_HW 0 "vsx_register_operand")
        (fma:VFP16_HW
         (match_operand:VFP16_HW 1 "vsx_register_operand")
         (match_operand:VFP16_HW 2 "vsx_register_operand")
         (neg:VFP16_HW
          (match_operand:VFP16_HW 3 "vsx_register_operand"))))]
  "can_create_pseudo_p ()"
  "#"
  "&& 1"
  [(pc)]
{
  fp16_vectorization (FMA, operands[0], operands[1],
                      operands[2], operands[3], FP16_FMS);
  DONE;
})
;; Vector negated fused multiply-add.
(define_insn_and_split "*nfma4"
  [(set (match_operand:VFP16_HW 0 "vsx_register_operand")
        (neg:VFP16_HW
         (fma:VFP16_HW
          (match_operand:VFP16_HW 1 "vsx_register_operand")
          (match_operand:VFP16_HW 2 "vsx_register_operand")
          (match_operand:VFP16_HW 3 "vsx_register_operand"))))]
  "can_create_pseudo_p ()"
  "#"
  "&& 1"
  [(pc)]
{
  fp16_vectorization (FMA, operands[0], operands[1],
                      operands[2], operands[3], FP16_NFMA);
  DONE;
})
;; Vector negated fused multiply-subtract.
(define_insn_and_split "*nfms4"
  [(set (match_operand:VFP16_HW 0 "vsx_register_operand")
        (neg:VFP16_HW
         (fma:VFP16_HW
          (match_operand:VFP16_HW 1 "vsx_register_operand")
          (match_operand:VFP16_HW 2 "vsx_register_operand")
          (neg:VFP16_HW
           (match_operand:VFP16_HW 3 "vsx_register_operand")))))]
  "can_create_pseudo_p ()"
  "#"
  "&& 1"
  [(pc)]
{
  fp16_vectorization (FMA, operands[0], operands[1],
                      operands[2], operands[3], FP16_NFMS);
  DONE;
})

;; If we do multiple __bfloat16 operations, between the first and
;; second operation, GCC will want to convert the first operation from
;; V4SFmode to SFmode and then reconvert it back to V4SFmode.  On the
;; PowerPC, this is complicated because internally in the vector
;; register, SFmode values are stored as DFmode values.
;;
;; For example, if we have:
;;
;;	__bfloat16 a, b, c, d;
;;	a = b + c + d;
;;
;; We would generate:
;;
;;      lxsihzx 0,4,2           // load b as BFmode
;;      lxsihzx 11,5,2          // load c as BFmode
;;      lxsihzx 12,6,2          // load d as BFmode
;;      xxspltw 0,0,1           // shift b into bits 16..31
;;      xxspltw 11,11,1         // shift c into bits 16..31
;;      xxspltw 12,12,1         // shift d into bits 16..31
;;      xvcvbf16spn 0,0         // convert b into V4SFmode
;;      xvcvbf16spn 11,11       // convert c into V4SFmode
;;      xvcvbf16spn 12,12       // convert d into V4SFmode
;;      xvaddsp 0,0,11          // calculate b+c as V4SFmode
;;      xscvspdp 0,0            // convert b+c into DFmode memory format
;;      xscvdpspn 0,0           // convert b+c into SFmode memory format
;;      xxspltw 0,0,0           // convert b+c into V4SFmode
;;      xvaddsp 12,12,0         // calculate b+c+d as V4SFmode
;;      xvcvspbf16 12,12        // convert b+c+d into BFmode memory format
;;      stxsihx 12,3,2          // store b+c+d
;;
;; With this peephole2, we can eliminate the xscvspdp and xscvdpspn
;; instructions.
;;
;; We keep the xxspltw between the two xvaddsp's in case the user
;; explicitly did a SFmode extract of element 0 and did a splat
;; operation.
;; Replace an UNSPEC_VSX_CVSPDP conversion that is immediately undone by
;; UNSPEC_VSX_CVDPSPN with a plain vector copy.  The replacement no
;; longer sets the SFmode intermediate (operand 0), so that register
;; must either be the one operand 2 overwrites or be dead after the
;; second instruction.
(define_peephole2
  [(set (match_operand:SF 0 "vsx_register_operand")
        (unspec:SF
         [(match_operand:V4SF 1 "vsx_register_operand")]
         UNSPEC_VSX_CVSPDP))
   (set (match_operand:V4SF 2 "vsx_register_operand")
        (unspec:V4SF [(match_dup 0)] UNSPEC_VSX_CVDPSPN))]
  "(REGNO (operands[1]) == REGNO (operands[2])
    || peep2_reg_dead_p (1, operands[1]))
   && (REGNO (operands[0]) == REGNO (operands[2])
       || peep2_reg_dead_p (2, operands[0]))"
  [(set (match_dup 2) (match_dup 1))])

;; Duplicate a HF/BF value so it can be used for xvcvhpsp/xvcvbf16spn.
;; Because xvcvhpsp/xvcvbf16spn only uses the even elements, we can
;; use xxspltw instead of vsplth.  This has the advantage that the
;; register allocator can use any of the 64 VSX registers instead of
;; being limited to the 32 Altivec registers that VSPLTH would require.
;; Word splat of a 16-bit floating point scalar, giving a V4SFmode
;; result.
(define_insn "xxspltw_<mode>"
  [(set (match_operand:V4SF 0 "vsx_register_operand" "=wa")
        (unspec:V4SF
         [(match_operand:FP16_HW 1 "vsx_register_operand" "wa")]
         UNSPEC_XXSPLTW_FP16))]
  ""
  "xxspltw %x0,%x1,1"
  [(set_attr "type" "vecperm")])
;; Convert a bfloat16 floating point scalar that has been splatted to
;; V4SFmode.  The type is "vecfloat", matching the other patterns in this
;; file that emit a 16-bit floating point conversion.
(define_insn "xvcvbf16spn_bf"
  [(set (match_operand:V4SF 0 "vsx_register_operand" "=wa")
        (unspec:V4SF
         [(match_operand:V4SF 1 "vsx_register_operand" "wa")]
         UNSPEC_CVT_FP16_TO_V4SF))]
  "TARGET_BFLOAT16_HW"
  "xvcvbf16spn %x0,%x1"
  [(set_attr "type" "vecfloat")])
;; Convert a V4SFmode vector to a 16-bit floating point scalar.  We
;; only care about the 2nd V4SFmode element, which is the element we
;; converted the 16-bit scalar (4th element) to V4SFmode to do the
;; operation, and converted it back.
(define_insn "xvcvspbf16_bf"
  [(set (match_operand:BF 0 "vsx_register_operand" "=wa")
        (unspec:BF
         [(match_operand:V4SF 1 "vsx_register_operand" "wa")]
         UNSPEC_XVCVSPBF16_BF))]
  "TARGET_BFLOAT16_HW"
  "xvcvspbf16 %x0,%x1"
  [(set_attr "type" "vecfloat")])

;; Negate 16-bit floating point by XOR with -0.0.  The -0.0 constant is
;; loaded into the scratch register first; without TARGET_PREFIXED it is
;; forced into the constant pool.
(define_insn_and_split "neg<mode>2"
  [(set (match_operand:FP16 0 "gpc_reg_operand" "=wa,?wr")
        (neg:FP16 (match_operand:FP16 1 "gpc_reg_operand" "wa,wr")))
   (clobber (match_scratch:FP16 2 "=&wa,&r"))]
  ""
  "#"
  "&& 1"
  [(set (match_dup 2)
        (match_dup 3))
   (set (match_dup 0)
        (xor:FP16 (match_dup 1)
                  (match_dup 2)))]
{
  if (GET_CODE (operands[2]) == SCRATCH)
    operands[2] = gen_reg_rtx (<MODE>mode);

  REAL_VALUE_TYPE dconst;

  /* Do the conversion outside of gcc_assert; the assertion's argument is
     not evaluated in every build configuration.  */
  int status = real_from_string (&dconst, "-0.0");
  gcc_assert (status == 0);

  rtx rc = const_double_from_real_value (dconst, <MODE>mode);
  if (!TARGET_PREFIXED)
    rc = force_const_mem (<MODE>mode, rc);

  operands[3] = rc;
}
  [(set_attr "type" "veclogical,integer")
   (set_attr "length" "16")])
;; XOR used to negate a 16-bit floating point type.  The first
;; alternative uses xxlxor, the second uses xor.
(define_insn "*xor3"
  [(set (match_operand:FP16 0 "gpc_reg_operand" "=wa,?wr")
        (xor:FP16
         (match_operand:FP16 1 "gpc_reg_operand" "wa,wr")
         (match_operand:FP16 2 "gpc_reg_operand" "wa,wr")))]
  ""
  "@
   xxlxor %x0,%x1,%x2
   xor %0,%1,%2"
  [(set_attr "type" "veclogical,integer")])
;; 16-bit floating point absolute value.  Split into a load of -0.0 into
;; the scratch register followed by an AND with its complement.
(define_insn_and_split "abs<mode>2"
  [(set (match_operand:FP16 0 "gpc_reg_operand" "=wa,?wr")
        (abs:FP16
         (match_operand:FP16 1 "gpc_reg_operand" "wa,wr")))
   (clobber (match_scratch:FP16 2 "=&wa,&r"))]
  ""
  "#"
  "&& 1"
  [(set (match_dup 2)
        (match_dup 3))
   (set (match_dup 0)
        (and:FP16 (match_dup 1)
                  (not:FP16 (match_dup 2))))]
{
  if (GET_CODE (operands[2]) == SCRATCH)
    operands[2] = gen_reg_rtx (<MODE>mode);

  REAL_VALUE_TYPE dconst;

  /* Do the conversion outside of gcc_assert; the assertion's argument is
     not evaluated in every build configuration.  */
  int status = real_from_string (&dconst, "-0.0");
  gcc_assert (status == 0);

  rtx rc = const_double_from_real_value (dconst, <MODE>mode);
  if (!TARGET_PREFIXED)
    rc = force_const_mem (<MODE>mode, rc);

  operands[3] = rc;
}
  [(set_attr "type" "veclogical,integer")
   (set_attr "length" "16")])
;; ANDC used to clear the sign bit of a 16-bit floating point type
;; for absolute value.  The first alternative uses xxlandc, the second
;; uses andc.
(define_insn "*andc3"
  [(set (match_operand:FP16 0 "gpc_reg_operand" "=wa,?wr")
        (and:FP16
         (match_operand:FP16 1 "gpc_reg_operand" "wa,wr")
         (not:FP16
          (match_operand:FP16 2 "gpc_reg_operand" "wa,wr"))))]
  ""
  "@
   xxlandc %x0,%x1,%x2
   andc %0,%1,%2"
  [(set_attr "type" "veclogical,integer")])
;; 16-bit negative floating point absolute value.  Split into a load of
;; -0.0 into the scratch register followed by an OR with it.
(define_insn_and_split "*nabs<mode>2"
  [(set (match_operand:FP16 0 "gpc_reg_operand" "=wa,?wr")
        (neg:FP16
         (abs:FP16
          (match_operand:FP16 1 "gpc_reg_operand" "wa,wr"))))
   (clobber (match_scratch:FP16 2 "=&wa,&r"))]
  ""
  "#"
  "&& 1"
  [(set (match_dup 2)
        (match_dup 3))
   (set (match_dup 0)
        (ior:FP16 (match_dup 1)
                  (match_dup 2)))]
{
  if (GET_CODE (operands[2]) == SCRATCH)
    operands[2] = gen_reg_rtx (<MODE>mode);

  REAL_VALUE_TYPE dconst;

  /* Do the conversion outside of gcc_assert; the assertion's argument is
     not evaluated in every build configuration.  */
  int status = real_from_string (&dconst, "-0.0");
  gcc_assert (status == 0);

  rtx rc = const_double_from_real_value (dconst, <MODE>mode);
  if (!TARGET_PREFIXED)
    rc = force_const_mem (<MODE>mode, rc);

  operands[3] = rc;
}
  [(set_attr "type" "veclogical,integer")
   (set_attr "length" "16")])
;; Bitwise inclusive OR of two 16-bit floating point registers; this is
;; the instruction used to set the sign bit when computing a negative
;; absolute value.
;; Alternative 0 emits the VSX form (xxlor) for "wa" registers.
;; Alternative 1 emits the integer form (or) for "wr" registers and is
;; slightly disparaged by the '?' modifier.
(define_insn "*ior3"
  [(set (match_operand:FP16 0 "gpc_reg_operand" "=wa,?wr")
        (ior:FP16
         (match_operand:FP16 1 "gpc_reg_operand" "wa,wr")
         (match_operand:FP16 2 "gpc_reg_operand" "wa,wr")))]
  ""
  "@
   xxlor %x0,%x1,%x2
   or %0,%1,%2"
  [(set_attr "type" "veclogical,integer")])

;; Vector pack support.
;;
;; Unfortunately the machine independent code assumes that there is only
;; one 16-bit floating point type, so we have to choose whether packing
;; produces _Float16 or __bfloat16.
;;
;; This expander packs two V4SF vectors (operands 1 and 2) into one V8HF
;; vector (operand 0): each input is converted with xvcvsphp into a V8HF
;; temporary, and rs6000_expand_extract_even then combines the two
;; temporaries into the result.
(define_expand "vec_pack_trunc_v4sf_v8hf"
  [(match_operand:V8HF 0 "vfloat_operand")
   (match_operand:V4SF 1 "vfloat_operand")
   (match_operand:V4SF 2 "vfloat_operand")]
  "TARGET_FLOAT16_HW"
{
  rtx converted[2];

  /* Convert operands[1] and then operands[2], each into its own new
     V8HF pseudo.  */
  for (int i = 0; i < 2; i++)
    {
      converted[i] = gen_reg_rtx (V8HFmode);
      emit_insn (gen_xvcvsphp_v8hf (converted[i], operands[i + 1]));
    }

  rs6000_expand_extract_even (operands[0], converted[0], converted[1]);
  DONE;
})
;; Pack two V4SF vectors (operands 1 and 2) into one V8BF vector
;; (operand 0): each input is converted with xvcvspbf16 into a V8BF
;; temporary, and rs6000_expand_extract_even then combines the two
;; temporaries into the result.
;; NOTE(review): this expander has no _v8bf suffix, so presumably it is
;; the pack pattern the machine independent code picks up -- confirm.
(define_expand "vec_pack_trunc_v4sf"
  [(match_operand:V8BF 0 "vfloat_operand")
   (match_operand:V4SF 1 "vfloat_operand")
   (match_operand:V4SF 2 "vfloat_operand")]
  "TARGET_BFLOAT16_HW"
{
  rtx converted[2];

  /* Convert operands[1] and then operands[2], each into its own new
     V8BF pseudo.  */
  for (int i = 0; i < 2; i++)
    {
      converted[i] = gen_reg_rtx (V8BFmode);
      emit_insn (gen_xvcvspbf16_v8bf (converted[i], operands[i + 1]));
    }

  rs6000_expand_extract_even (operands[0], converted[0], converted[1]);
  DONE;
})
;; Used for vector conversion to _Float16.
;; Emits xvcvsphp with a V4SF source and a V8HF destination, both in VSX
;; registers.  The conversion is represented as an unspec.
(define_insn "xvcvsphp_v8hf"
  [(set (match_operand:V8HF 0 "vsx_register_operand" "=wa")
        (unspec:V8HF
         [(match_operand:V4SF 1 "vsx_register_operand" "wa")]
         UNSPEC_XVCVSPHP_V8HF))]
  "TARGET_P9_VECTOR"
  "xvcvsphp %x0,%x1"
  [(set_attr "type" "vecfloat")])
;; Used for vector conversion to __bfloat16.
;; Emits xvcvspbf16 with a V4SF source and a V8BF destination, both in VSX
;; registers.  The conversion is represented as an unspec.
(define_insn "xvcvspbf16_v8bf"
  [(set (match_operand:V8BF 0 "vsx_register_operand" "=wa")
        (unspec:V8BF
         [(match_operand:V4SF 1 "vsx_register_operand" "wa")]
         UNSPEC_XVCVSPBF16_V8BF))]
  "TARGET_BFLOAT16_HW"
  "xvcvspbf16 %x0,%x1"
  [(set_attr "type" "vecfloat")])
;; Vector unpack support.  Because the pattern name encodes the type
;; being unpacked, unpacking can be provided for both __bfloat16 and
;; _Float16.

;; Unpack vector _Float16 (high part): operand 1 is interleaved with
;; itself into a new V8HF pseudo, which xvcvhpsp then converts to the V4SF
;; result in operand 0.
(define_expand "vec_unpacks_hi_v8hf"
  [(match_operand:V4SF 0 "vfloat_operand")
   (match_operand:V8HF 1 "vfloat_operand")]
  "TARGET_FLOAT16_HW"
{
  rtx src = operands[1];
  rtx interleaved = gen_reg_rtx (V8HFmode);

  /* The last argument depends on endianness; the "lo" expander passes
     the opposite value.  */
  rs6000_expand_interleave (interleaved, src, src, BYTES_BIG_ENDIAN);
  emit_insn (gen_xvcvhpsp_v8hf (operands[0], interleaved));
  DONE;
})
;; Unpack vector _Float16 (low part): operand 1 is interleaved with itself
;; into a new V8HF pseudo, which xvcvhpsp then converts to the V4SF result
;; in operand 0.
(define_expand "vec_unpacks_lo_v8hf"
  [(match_operand:V4SF 0 "vfloat_operand")
   (match_operand:V8HF 1 "vfloat_operand")]
  "TARGET_FLOAT16_HW"
{
  rtx src = operands[1];
  rtx interleaved = gen_reg_rtx (V8HFmode);

  /* The last argument depends on endianness; the "hi" expander passes
     the opposite value.  */
  rs6000_expand_interleave (interleaved, src, src, !BYTES_BIG_ENDIAN);
  emit_insn (gen_xvcvhpsp_v8hf (operands[0], interleaved));
  DONE;
})
;; Used for vector conversion from _Float16.
;; Emits xvcvhpsp with a V8HF source and a V4SF destination, both in VSX
;; registers.  The conversion is represented as an unspec.
;;
;; This insn is generated by the vec_unpacks_hi_v8hf and
;; vec_unpacks_lo_v8hf expanders, which are enabled by TARGET_FLOAT16_HW,
;; so it must be enabled by the same condition.  Gating it on the
;; __bfloat16 hardware test instead would leave those expanders emitting
;; an insn that cannot be recognized when _Float16 hardware support is
;; present but __bfloat16 hardware support is not.
(define_insn "xvcvhpsp_v8hf"
  [(set (match_operand:V4SF 0 "vsx_register_operand" "=wa")
        (unspec:V4SF
         [(match_operand:V8HF 1 "vsx_register_operand" "wa")]
         UNSPEC_CVT_FP16_TO_V4SF))]
  "TARGET_FLOAT16_HW"
  "xvcvhpsp %x0,%x1"
  [(set_attr "type" "vecperm")])
;; Unpack vector __bfloat16 (high part): operand 1 is interleaved with
;; itself into a new V8BF pseudo, which xvcvbf16spn then converts to the
;; V4SF result in operand 0.
(define_expand "vec_unpacks_hi_v8bf"
  [(match_operand:V4SF 0 "vfloat_operand")
   (match_operand:V8BF 1 "vfloat_operand")]
  "TARGET_BFLOAT16_HW"
{
  rtx src = operands[1];
  rtx interleaved = gen_reg_rtx (V8BFmode);

  /* The last argument depends on endianness; the "lo" expander passes
     the opposite value.  */
  rs6000_expand_interleave (interleaved, src, src, BYTES_BIG_ENDIAN);
  emit_insn (gen_xvcvbf16spn_v8bf (operands[0], interleaved));
  DONE;
})
;; Unpack vector __bfloat16 (low part): operand 1 is interleaved with
;; itself into a new V8BF pseudo, which xvcvbf16spn then converts to the
;; V4SF result in operand 0.
(define_expand "vec_unpacks_lo_v8bf"
  [(match_operand:V4SF 0 "vfloat_operand")
   (match_operand:V8BF 1 "vfloat_operand")]
  "TARGET_BFLOAT16_HW"
{
  rtx src = operands[1];
  rtx interleaved = gen_reg_rtx (V8BFmode);

  /* The last argument depends on endianness; the "hi" expander passes
     the opposite value.  */
  rs6000_expand_interleave (interleaved, src, src, !BYTES_BIG_ENDIAN);
  emit_insn (gen_xvcvbf16spn_v8bf (operands[0], interleaved));
  DONE;
})
;; Used for vector conversion from __bfloat16.
;; Emits xvcvbf16spn with a V8BF source and a V4SF destination, both in
;; VSX registers.  The conversion is represented as an unspec.
(define_insn "xvcvbf16spn_v8bf"
  [(set (match_operand:V4SF 0 "vsx_register_operand" "=wa")
        (unspec:V4SF
         [(match_operand:V8BF 1 "vsx_register_operand" "wa")]
         UNSPEC_CVT_FP16_TO_V4SF))]
  "TARGET_BFLOAT16_HW"
  "xvcvbf16spn %x0,%x1"
  [(set_attr "type" "vecperm")])

39 KiB Raw Blame History Unescape Escape

39 KiB

Raw Blame History