mirror of git://gcc.gnu.org/git/gcc.git
Add unary, fma 16-bit floating point vector optimization.
2025-10-17 Michael Meissner <meissner@linux.ibm.com> gcc/ * config/rs6000/float16.cc (bfloat16_operation_as_v4sf): Add support for vectorizing unary and fma 16-bit floating point. (fp16_vectorization): Likewise. * config/rs6000/float16.md (neg<mode>2, VFP16_HW iterator): Likewise. (*xor<mode>3, VFP16_HW iterator): Likewise. (abs<mode>2, VFP16_HW iterator): Likewise. (*andc<mode>3, VFP16_HW iterator): Likewise. (*neg_<fp16_names><mode>3): Likewise. (*abs_<fp16_names><mode>3): Likewise. (fma<mode>4): Likewise. (*fms<mode>4): Likewise. (*nfma<mode>4): Likewise. (*nfms<mode>4): Likewise. * config/rs6000/rs6000-protos.h (FP16_ABS_BINARY): Likewise. (FP16_NEG_BINARY): Likewise.
This commit is contained in:
parent
c6f7231367
commit
5fc9771c32
|
@ -88,6 +88,8 @@ bfloat16_operation_as_v4sf (enum rtx_code icode,
|
|||
n_opts = 3;
|
||||
break;
|
||||
|
||||
case FP16_ABS_BINARY:
|
||||
case FP16_NEG_BINARY:
|
||||
default:
|
||||
gcc_unreachable ();
|
||||
}
|
||||
|
@ -174,6 +176,8 @@ bfloat16_operation_as_v4sf (enum rtx_code icode,
|
|||
}
|
||||
break;
|
||||
|
||||
case FP16_ABS_BINARY:
|
||||
case FP16_NEG_BINARY:
|
||||
default:
|
||||
gcc_unreachable ();
|
||||
}
|
||||
|
@ -212,6 +216,7 @@ fp16_vectorization (enum rtx_code icode,
|
|||
{
|
||||
gcc_assert (can_create_pseudo_p ());
|
||||
|
||||
enum rtx_code unary_op = UNKNOWN;
|
||||
machine_mode result_mode = GET_MODE (result);
|
||||
rtx op_orig[3] = { op1, op2, op3 };
|
||||
rtx op_hi[3];
|
||||
|
@ -226,6 +231,16 @@ fp16_vectorization (enum rtx_code icode,
|
|||
n_opts = 2;
|
||||
break;
|
||||
|
||||
case FP16_NEG_BINARY:
|
||||
n_opts = 2;
|
||||
unary_op = NEG;
|
||||
break;
|
||||
|
||||
case FP16_ABS_BINARY:
|
||||
n_opts = 2;
|
||||
unary_op = ABS;
|
||||
break;
|
||||
|
||||
case FP16_FMA:
|
||||
case FP16_FMS:
|
||||
case FP16_NFMA:
|
||||
|
@ -274,6 +289,8 @@ fp16_vectorization (enum rtx_code icode,
|
|||
switch (subtype)
|
||||
{
|
||||
case FP16_BINARY:
|
||||
case FP16_NEG_BINARY:
|
||||
case FP16_ABS_BINARY:
|
||||
emit_insn (gen_rtx_SET (result_hi,
|
||||
gen_rtx_fmt_ee (icode, V4SFmode,
|
||||
op_hi[0],
|
||||
|
@ -322,6 +339,16 @@ fp16_vectorization (enum rtx_code icode,
|
|||
gcc_unreachable ();
|
||||
}
|
||||
|
||||
/* Add any unary operator modifications. */
|
||||
if (unary_op != UNKNOWN)
|
||||
{
|
||||
emit_insn (gen_rtx_SET (result_hi,
|
||||
gen_rtx_fmt_e (unary_op, V4SFmode, result_hi)));
|
||||
|
||||
emit_insn (gen_rtx_SET (result_lo,
|
||||
gen_rtx_fmt_e (unary_op, V4SFmode, result_lo)));
|
||||
}
|
||||
|
||||
/* Combine the 2 V4SFmode operations into one V8HFmode/V8BFmode vector. */
|
||||
if (result_mode == V8HFmode)
|
||||
emit_insn (gen_vec_pack_trunc_v4sf_v8hf (result, result_hi, result_lo));
|
||||
|
|
|
@ -706,6 +706,104 @@
|
|||
})
|
||||
|
||||
;; Add vectorization support for 16-bit floating point.
|
||||
|
||||
;; Negate a vector of bfloat16/float16 values.
;;
;; The insn is always split ("&& 1") into two steps:
;;   1. load a vector whose elements are all -0.0 into the scratch
;;      register (operand 2);
;;   2. XOR the input with that vector, which flips the sign bit of
;;      every element.
;; Operand 3 is created by the split code below and holds the -0.0
;; vector constant (or its constant-pool memory reference).
(define_insn_and_split "neg<mode>2"
  [(set (match_operand:VFP16_HW 0 "vsx_register_operand" "=wa")
        (neg:VFP16_HW
         (match_operand:VFP16_HW 1 "vsx_register_operand" "wa")))
   (clobber (match_scratch:VFP16_HW 2 "=&wa"))]
  ""
  "#"
  "&& 1"
  [(set (match_dup 2)
        (match_dup 3))
   (set (match_dup 0)
        (xor:VFP16_HW (match_dup 1)
                      (match_dup 2)))]
{
  /* Allocate a real register if the scratch was never assigned one.  */
  if (GET_CODE (operands[2]) == SCRATCH)
    operands[2] = gen_reg_rtx (<MODE>mode);

  /* Keep the call that initializes DCONST outside of gcc_assert: the
     assertion macro may not evaluate its argument when assertion
     checking is disabled, which would leave DCONST uninitialized.  */
  REAL_VALUE_TYPE dconst;
  int parse_status = real_from_string (&dconst, "-0.0");
  gcc_assert (parse_status == 0);

  /* Build a vector with every element set to -0.0.
     NOTE(review): the element count is hard-coded to 8; this presumes
     every VFP16_HW mode has 8 elements -- confirm against the iterator.  */
  rtx neg0 = const_double_from_real_value (dconst, <VEC_base>mode);
  rtvec v = rtvec_alloc (8);

  for (size_t i = 0; i < 8; i++)
    RTVEC_ELT (v, i) = neg0;

  /* Without prefixed instructions the constant is placed in the
     constant pool and loaded from memory.  */
  rtx vneg0 = gen_rtx_CONST_VECTOR (<MODE>mode, v);
  if (!TARGET_PREFIXED)
    vneg0 = force_const_mem (<MODE>mode, vneg0);

  operands[3] = vneg0;
}
  [(set_attr "type" "veclogical")
   (set_attr "length" "16")])
|
||||
|
||||
;; Bitwise XOR of two 16-bit floating point vectors.  The neg<mode>2
;; split emits this form to flip the sign bit of each element.
(define_insn "*xor<mode>3"
  [(set (match_operand:VFP16_HW 0 "vsx_register_operand" "=wa")
        (xor:VFP16_HW
         (match_operand:VFP16_HW 1 "vsx_register_operand" "wa")
         (match_operand:VFP16_HW 2 "vsx_register_operand" "wa")))]
  ""
  "xxlxor %x0,%x1,%x2"
  [(set_attr "type" "veclogical")])
|
||||
|
||||
;; Absolute value of a vector of bfloat16/float16 values.
;;
;; The insn is always split ("&& 1") into two steps:
;;   1. load a vector whose elements are all -0.0 into the scratch
;;      register (operand 2);
;;   2. AND the input with the complement of that vector, which clears
;;      the sign bit of every element.
;; Operand 3 is created by the split code below and holds the -0.0
;; vector constant (or its constant-pool memory reference).
(define_insn_and_split "abs<mode>2"
  [(set (match_operand:VFP16_HW 0 "vsx_register_operand" "=wa")
        (abs:VFP16_HW
         (match_operand:VFP16_HW 1 "vsx_register_operand" "wa")))
   (clobber (match_scratch:VFP16_HW 2 "=&wa"))]
  ""
  "#"
  "&& 1"
  [(set (match_dup 2)
        (match_dup 3))
   (set (match_dup 0)
        (and:VFP16_HW (match_dup 1)
                      (not:VFP16_HW (match_dup 2))))]
{
  /* Allocate a real register if the scratch was never assigned one.  */
  if (GET_CODE (operands[2]) == SCRATCH)
    operands[2] = gen_reg_rtx (<MODE>mode);

  /* Keep the call that initializes DCONST outside of gcc_assert: the
     assertion macro may not evaluate its argument when assertion
     checking is disabled, which would leave DCONST uninitialized.  */
  REAL_VALUE_TYPE dconst;
  int parse_status = real_from_string (&dconst, "-0.0");
  gcc_assert (parse_status == 0);

  /* Build a vector with every element set to -0.0.
     NOTE(review): the element count is hard-coded to 8; this presumes
     every VFP16_HW mode has 8 elements -- confirm against the iterator.  */
  rtx neg0 = const_double_from_real_value (dconst, <VEC_base>mode);
  rtvec v = rtvec_alloc (8);

  for (size_t i = 0; i < 8; i++)
    RTVEC_ELT (v, i) = neg0;

  /* Without prefixed instructions the constant is placed in the
     constant pool and loaded from memory.  */
  rtx vneg0 = gen_rtx_CONST_VECTOR (<MODE>mode, v);
  if (!TARGET_PREFIXED)
    vneg0 = force_const_mem (<MODE>mode, vneg0);

  operands[3] = vneg0;
}
  [(set_attr "type" "veclogical")
   (set_attr "length" "16")])
|
||||
|
||||
;; AND with complement on 16-bit floating point vectors: operand 1 is
;; ANDed with the bitwise NOT of operand 2.  The abs<mode>2 split emits
;; this form to clear the sign bit of each element.
(define_insn "*andc<mode>3"
  [(set (match_operand:VFP16_HW 0 "gpc_reg_operand" "=wa")
        (and:VFP16_HW
         (match_operand:VFP16_HW 1 "gpc_reg_operand" "wa")
         (not:VFP16_HW
          (match_operand:VFP16_HW 2 "gpc_reg_operand" "wa"))))]
  ""
  "xxlandc %x0,%x1,%x2"
  [(set_attr "type" "veclogical")])
|
||||
|
||||
;; Binary operators being vectorized.
|
||||
(define_insn_and_split "<fp16_names><mode>3"
|
||||
[(set (match_operand:VFP16_HW 0 "vsx_register_operand")
|
||||
|
@ -722,6 +820,110 @@
|
|||
DONE;
|
||||
})
|
||||
|
||||
;; Negated result of a vectorized 16-bit floating point binary operator.
;; The insn is only valid while pseudos can still be created and is always
;; split; the expansion is delegated to fp16_vectorization with the
;; FP16_NEG_BINARY subtype.
(define_insn_and_split "*neg_<fp16_names><mode>3"
  [(set (match_operand:VFP16_HW 0 "vsx_register_operand")
        (neg:VFP16_HW
         (FP16_BINARY_OP:VFP16_HW
          (match_operand:VFP16_HW 1 "vsx_register_operand")
          (match_operand:VFP16_HW 2 "vsx_register_operand"))))]
  "can_create_pseudo_p ()"
  "#"
  "&& 1"
  [(pc)]
{
  rtx dest = operands[0];
  rtx lhs = operands[1];
  rtx rhs = operands[2];

  /* A binary operation has no third input.  */
  fp16_vectorization (<CODE>, dest, lhs, rhs, NULL_RTX, FP16_NEG_BINARY);
  DONE;
})
|
||||
|
||||
;; Absolute value of the result of a vectorized 16-bit floating point
;; binary operator.  The insn is only valid while pseudos can still be
;; created and is always split; the expansion is delegated to
;; fp16_vectorization with the FP16_ABS_BINARY subtype.
(define_insn_and_split "*abs_<fp16_names><mode>3"
  [(set (match_operand:VFP16_HW 0 "vsx_register_operand")
        (abs:VFP16_HW
         (FP16_BINARY_OP:VFP16_HW
          (match_operand:VFP16_HW 1 "vsx_register_operand")
          (match_operand:VFP16_HW 2 "vsx_register_operand"))))]
  "can_create_pseudo_p ()"
  "#"
  "&& 1"
  [(pc)]
{
  rtx dest = operands[0];
  rtx lhs = operands[1];
  rtx rhs = operands[2];

  /* A binary operation has no third input.  */
  fp16_vectorization (<CODE>, dest, lhs, rhs, NULL_RTX, FP16_ABS_BINARY);
  DONE;
})
|
||||
|
||||
;; Vectorized 16-bit floating point fused multiply-add: (op1 * op2) + op3.
;; The insn is only valid while pseudos can still be created and is always
;; split; the expansion is delegated to fp16_vectorization with the
;; FP16_FMA subtype.
(define_insn_and_split "fma<mode>4"
  [(set (match_operand:VFP16_HW 0 "vsx_register_operand")
        (fma:VFP16_HW
         (match_operand:VFP16_HW 1 "vsx_register_operand")
         (match_operand:VFP16_HW 2 "vsx_register_operand")
         (match_operand:VFP16_HW 3 "vsx_register_operand")))]
  "can_create_pseudo_p ()"
  "#"
  "&& 1"
  [(pc)]
{
  rtx dest = operands[0];
  rtx mul_a = operands[1];
  rtx mul_b = operands[2];
  rtx addend = operands[3];

  fp16_vectorization (FMA, dest, mul_a, mul_b, addend, FP16_FMA);
  DONE;
})
|
||||
|
||||
;; Vectorized 16-bit floating point fused multiply-subtract:
;; (op1 * op2) - op3, written in RTL as an fma whose third input is
;; negated.  Always split; the expansion is delegated to
;; fp16_vectorization with the FP16_FMS subtype.
(define_insn_and_split "*fms<mode>4"
  [(set (match_operand:VFP16_HW 0 "vsx_register_operand")
        (fma:VFP16_HW
         (match_operand:VFP16_HW 1 "vsx_register_operand")
         (match_operand:VFP16_HW 2 "vsx_register_operand")
         (neg:VFP16_HW
          (match_operand:VFP16_HW 3 "vsx_register_operand"))))]
  "can_create_pseudo_p ()"
  "#"
  "&& 1"
  [(pc)]
{
  rtx dest = operands[0];
  rtx mul_a = operands[1];
  rtx mul_b = operands[2];
  rtx addend = operands[3];

  fp16_vectorization (FMA, dest, mul_a, mul_b, addend, FP16_FMS);
  DONE;
})
|
||||
|
||||
;; Vectorized 16-bit floating point negated fused multiply-add:
;; - ((op1 * op2) + op3).  Always split; the expansion is delegated to
;; fp16_vectorization with the FP16_NFMA subtype.
(define_insn_and_split "*nfma<mode>4"
  [(set (match_operand:VFP16_HW 0 "vsx_register_operand")
        (neg:VFP16_HW
         (fma:VFP16_HW
          (match_operand:VFP16_HW 1 "vsx_register_operand")
          (match_operand:VFP16_HW 2 "vsx_register_operand")
          (match_operand:VFP16_HW 3 "vsx_register_operand"))))]
  "can_create_pseudo_p ()"
  "#"
  "&& 1"
  [(pc)]
{
  rtx dest = operands[0];
  rtx mul_a = operands[1];
  rtx mul_b = operands[2];
  rtx addend = operands[3];

  fp16_vectorization (FMA, dest, mul_a, mul_b, addend, FP16_NFMA);
  DONE;
})
|
||||
|
||||
;; Vectorized 16-bit floating point negated fused multiply-subtract:
;; - ((op1 * op2) - op3), written in RTL as the negation of an fma whose
;; third input is negated.  Always split; the expansion is delegated to
;; fp16_vectorization with the FP16_NFMS subtype.
(define_insn_and_split "*nfms<mode>4"
  [(set (match_operand:VFP16_HW 0 "vsx_register_operand")
        (neg:VFP16_HW
         (fma:VFP16_HW
          (match_operand:VFP16_HW 1 "vsx_register_operand")
          (match_operand:VFP16_HW 2 "vsx_register_operand")
          (neg:VFP16_HW
           (match_operand:VFP16_HW 3 "vsx_register_operand")))))]
  "can_create_pseudo_p ()"
  "#"
  "&& 1"
  [(pc)]
{
  rtx dest = operands[0];
  rtx mul_a = operands[1];
  rtx mul_b = operands[2];
  rtx addend = operands[3];

  fp16_vectorization (FMA, dest, mul_a, mul_b, addend, FP16_NFMS);
  DONE;
})
|
||||
|
||||
|
||||
|
||||
;; If we do multiple __bfloat16 operations, between the first and
|
||||
;; second operation, GCC will want to convert the first operation from
|
||||
|
|
|
@ -263,6 +263,8 @@ extern unsigned constant_generates_xxspltidp (vec_const_128bit_type *);
|
|||
/* Optimize bfloat16 and float16 operations. */
|
||||
enum fp16_operation {
|
||||
FP16_BINARY, /* Bfloat16/float16 binary op. */
|
||||
FP16_ABS_BINARY, /* abs (binary op). */
|
||||
FP16_NEG_BINARY, /* - (binary op). */
|
||||
FP16_FMA, /* (a * b) + c. */
|
||||
FP16_FMS, /* (a * b) - c. */
|
||||
FP16_NFMA, /* - ((a * b) + c). */
|
||||
|
|
Loading…
Reference in New Issue