Add unary, fma 16-bit floating point vector optimization.

2025-10-17  Michael Meissner  <meissner@linux.ibm.com>

gcc/

	* config/rs6000/float16.cc (bfloat16_operation_as_v4sf): Add support for
	vectorizing unary and fma 16-bit floating point.
	(fp16_vectorization): Likewise.
	* config/rs6000/float16.md (neg<mode>2, VFP16_HW iterator): Likewise.
	(*xor<mode>3, VFP16_HW iterator): Likewise.
	(abs<mode>2, VFP16_HW iterator): Likewise.
	(*andc<mode>3, VFP16_HW iterator): Likewise.
	(*neg_<fp16_names><mode>3): Likewise.
	(*abs_<fp16_names><mode>3): Likewise.
	(fma<mode>4): Likewise.
	(*fms<mode>4): Likewise.
	(*nfma<mode>4): Likewise.
	(*nfms<mode>4): Likewise.
	* config/rs6000/rs6000-protos.h (FP16_ABS_BINARY): Likewise.
	(FP16_NEG_BINARY): Likewise.
This commit is contained in:
Michael Meissner 2025-10-17 13:04:46 -04:00
parent c6f7231367
commit 5fc9771c32
3 changed files with 231 additions and 0 deletions

View File

@ -88,6 +88,8 @@ bfloat16_operation_as_v4sf (enum rtx_code icode,
n_opts = 3;
break;
case FP16_ABS_BINARY:
case FP16_NEG_BINARY:
default:
gcc_unreachable ();
}
@ -174,6 +176,8 @@ bfloat16_operation_as_v4sf (enum rtx_code icode,
}
break;
case FP16_ABS_BINARY:
case FP16_NEG_BINARY:
default:
gcc_unreachable ();
}
@ -212,6 +216,7 @@ fp16_vectorization (enum rtx_code icode,
{
gcc_assert (can_create_pseudo_p ());
enum rtx_code unary_op = UNKNOWN;
machine_mode result_mode = GET_MODE (result);
rtx op_orig[3] = { op1, op2, op3 };
rtx op_hi[3];
@ -226,6 +231,16 @@ fp16_vectorization (enum rtx_code icode,
n_opts = 2;
break;
case FP16_NEG_BINARY:
n_opts = 2;
unary_op = NEG;
break;
case FP16_ABS_BINARY:
n_opts = 2;
unary_op = ABS;
break;
case FP16_FMA:
case FP16_FMS:
case FP16_NFMA:
@ -274,6 +289,8 @@ fp16_vectorization (enum rtx_code icode,
switch (subtype)
{
case FP16_BINARY:
case FP16_NEG_BINARY:
case FP16_ABS_BINARY:
emit_insn (gen_rtx_SET (result_hi,
gen_rtx_fmt_ee (icode, V4SFmode,
op_hi[0],
@ -322,6 +339,16 @@ fp16_vectorization (enum rtx_code icode,
gcc_unreachable ();
}
/* Add any unary operator modifications. */
if (unary_op != UNKNOWN)
{
emit_insn (gen_rtx_SET (result_hi,
gen_rtx_fmt_e (unary_op, V4SFmode, result_hi)));
emit_insn (gen_rtx_SET (result_lo,
gen_rtx_fmt_e (unary_op, V4SFmode, result_lo)));
}
/* Combine the 2 V4SFmode operations into one V8HFmode/V8BFmode vector. */
if (result_mode == V8HFmode)
emit_insn (gen_vec_pack_trunc_v4sf_v8hf (result, result_hi, result_lo));

View File

@ -706,6 +706,104 @@
})
;; Add vectorization support for 16-bit floating point.
;; Negate a vector of bfloat16/float16 values.
;;
;; The insn is always split ("&& 1") into two insns:
;;   1. load a vector whose 8 elements are all -0.0 into scratch operand 2;
;;   2. XOR operand 1 with that vector and store the result in operand 0.
;; When !TARGET_PREFIXED the -0.0 vector is placed in the constant pool
;; with force_const_mem; otherwise the CONST_VECTOR itself is used as the
;; source of the first set.
(define_insn_and_split "neg<mode>2"
  [(set (match_operand:VFP16_HW 0 "vsx_register_operand" "=wa")
        (neg:VFP16_HW
         (match_operand:VFP16_HW 1 "vsx_register_operand" "wa")))
   (clobber (match_scratch:VFP16_HW 2 "=&wa"))]
  ""
  "#"
  "&& 1"
  [(set (match_dup 2)
        (match_dup 3))
   (set (match_dup 0)
        (xor:VFP16_HW (match_dup 1)
                      (match_dup 2)))]
{
  /* If the scratch operand is still a SCRATCH, give it a new pseudo.  */
  if (GET_CODE (operands[2]) == SCRATCH)
    operands[2] = gen_reg_rtx (<MODE>mode);

  /* Convert the string outside of gcc_assert.  Some definitions of
     gcc_assert do not evaluate their argument, and DCONST must always be
     initialized.  */
  REAL_VALUE_TYPE dconst;
  int status = real_from_string (&dconst, "-0.0");
  gcc_assert (status == 0);

  /* Build a vector constant with -0.0 in each of the 8 elements.  */
  rtx neg0 = const_double_from_real_value (dconst, <VEC_base>mode);
  rtvec v = rtvec_alloc (8);
  for (size_t i = 0; i < 8; i++)
    RTVEC_ELT (v, i) = neg0;

  rtx vneg0 = gen_rtx_CONST_VECTOR (<MODE>mode, v);
  if (!TARGET_PREFIXED)
    vneg0 = force_const_mem (<MODE>mode, vneg0);

  operands[3] = vneg0;
}
  [(set_attr "type" "veclogical")
   (set_attr "length" "16")])
;; XOR of two 16-bit floating point vectors, emitted as a single xxlxor.
;; The split of neg<mode>2 generates this form, XORing its input with a
;; vector of -0.0 values in order to negate each element.
(define_insn "*xor<mode>3"
[(set (match_operand:VFP16_HW 0 "vsx_register_operand" "=wa")
(xor:VFP16_HW (match_operand:VFP16_HW 1 "vsx_register_operand" "wa")
(match_operand:VFP16_HW 2 "vsx_register_operand" "wa")))]
""
"xxlxor %x0,%x1,%x2"
[(set_attr "type" "veclogical")])
;; Absolute value of a vector of bfloat16/float16 values.
;;
;; The insn is always split ("&& 1") into two insns:
;;   1. load a vector whose 8 elements are all -0.0 into scratch operand 2;
;;   2. AND operand 1 with the complement of that vector and store the
;;      result in operand 0.
;; When !TARGET_PREFIXED the -0.0 vector is placed in the constant pool
;; with force_const_mem; otherwise the CONST_VECTOR itself is used as the
;; source of the first set.
(define_insn_and_split "abs<mode>2"
  [(set (match_operand:VFP16_HW 0 "vsx_register_operand" "=wa")
        (abs:VFP16_HW
         (match_operand:VFP16_HW 1 "vsx_register_operand" "wa")))
   (clobber (match_scratch:VFP16_HW 2 "=&wa"))]
  ""
  "#"
  "&& 1"
  [(set (match_dup 2)
        (match_dup 3))
   (set (match_dup 0)
        (and:VFP16_HW (match_dup 1)
                      (not:VFP16_HW (match_dup 2))))]
{
  /* If the scratch operand is still a SCRATCH, give it a new pseudo.  */
  if (GET_CODE (operands[2]) == SCRATCH)
    operands[2] = gen_reg_rtx (<MODE>mode);

  /* Convert the string outside of gcc_assert.  Some definitions of
     gcc_assert do not evaluate their argument, and DCONST must always be
     initialized.  */
  REAL_VALUE_TYPE dconst;
  int status = real_from_string (&dconst, "-0.0");
  gcc_assert (status == 0);

  /* Build a vector constant with -0.0 in each of the 8 elements.  */
  rtx neg0 = const_double_from_real_value (dconst, <VEC_base>mode);
  rtvec v = rtvec_alloc (8);
  for (size_t i = 0; i < 8; i++)
    RTVEC_ELT (v, i) = neg0;

  rtx vneg0 = gen_rtx_CONST_VECTOR (<MODE>mode, v);
  if (!TARGET_PREFIXED)
    vneg0 = force_const_mem (<MODE>mode, vneg0);

  operands[3] = vneg0;
}
  [(set_attr "type" "veclogical")
   (set_attr "length" "16")])
;; AND of operand 1 with the complement of operand 2, emitted as a single
;; xxlandc.  The split of abs<mode>2 generates this form, with operand 2
;; holding a vector of -0.0 values, to clear the sign bit of each element.
;; The (and X (not Y)) operand order here mirrors what that split emits.
;; NOTE(review): this pattern uses gpc_reg_operand while the neighbouring
;; neg/xor/abs patterns use vsx_register_operand -- confirm this is intended.
(define_insn "*andc<mode>3"
[(set (match_operand:VFP16_HW 0 "gpc_reg_operand" "=wa")
(and:VFP16_HW (match_operand:VFP16_HW 1 "gpc_reg_operand" "wa")
(not:VFP16_HW
(match_operand:VFP16_HW 2 "gpc_reg_operand" "wa"))))]
""
"xxlandc %x0,%x1,%x2"
[(set_attr "type" "veclogical")])
;; Binary operators being vectorized.
(define_insn_and_split "<fp16_names><mode>3"
[(set (match_operand:VFP16_HW 0 "vsx_register_operand")
@ -722,6 +820,110 @@
DONE;
})
;; Negative of binary operators being vectorized.
;;
;; Matches (neg (op a b)) for each FP16_BINARY_OP on 16-bit floating point
;; vectors.  The insn is only recognized while new pseudos can be created
;; and is always split ("&& 1").  The split calls fp16_vectorization with
;; the FP16_NEG_BINARY subtype and the operator's rtx code (<CODE>); no third
;; input is passed (NULL_RTX).  The "[(pc)]" replacement template is never
;; used because the preparation code ends with DONE.
(define_insn_and_split "*neg_<fp16_names><mode>3"
[(set (match_operand:VFP16_HW 0 "vsx_register_operand")
(neg:VFP16_HW
(FP16_BINARY_OP:VFP16_HW
(match_operand:VFP16_HW 1 "vsx_register_operand")
(match_operand:VFP16_HW 2 "vsx_register_operand"))))]
"can_create_pseudo_p ()"
"#"
"&& 1"
[(pc)]
{
fp16_vectorization (<CODE>, operands[0], operands[1], operands[2],
NULL_RTX, FP16_NEG_BINARY);
DONE;
})
;; Absolute value of binary operators being vectorized.
;;
;; Matches (abs (op a b)) for each FP16_BINARY_OP on 16-bit floating point
;; vectors.  The insn is only recognized while new pseudos can be created
;; and is always split ("&& 1").  The split calls fp16_vectorization with
;; the FP16_ABS_BINARY subtype and the operator's rtx code (<CODE>); no third
;; input is passed (NULL_RTX).  The "[(pc)]" replacement template is never
;; used because the preparation code ends with DONE.
(define_insn_and_split "*abs_<fp16_names><mode>3"
[(set (match_operand:VFP16_HW 0 "vsx_register_operand")
(abs:VFP16_HW
(FP16_BINARY_OP:VFP16_HW
(match_operand:VFP16_HW 1 "vsx_register_operand")
(match_operand:VFP16_HW 2 "vsx_register_operand"))))]
"can_create_pseudo_p ()"
"#"
"&& 1"
[(pc)]
{
fp16_vectorization (<CODE>, operands[0], operands[1], operands[2],
NULL_RTX, FP16_ABS_BINARY);
DONE;
})
;; FMA operations being vectorized.
;;
;; Fused multiply-add on 16-bit floating point vectors:
;; operand 0 = (operand 1 * operand 2) + operand 3.
;; The insn is only recognized while new pseudos can be created and is
;; always split ("&& 1") by calling fp16_vectorization with the FP16_FMA
;; subtype.  The "[(pc)]" replacement template is never used because the
;; preparation code ends with DONE.
(define_insn_and_split "fma<mode>4"
[(set (match_operand:VFP16_HW 0 "vsx_register_operand")
(fma:VFP16_HW
(match_operand:VFP16_HW 1 "vsx_register_operand")
(match_operand:VFP16_HW 2 "vsx_register_operand")
(match_operand:VFP16_HW 3 "vsx_register_operand")))]
"can_create_pseudo_p ()"
"#"
"&& 1"
[(pc)]
{
fp16_vectorization (FMA, operands[0], operands[1], operands[2],
operands[3], FP16_FMA);
DONE;
})
;; Fused multiply-subtract on 16-bit floating point vectors:
;; operand 0 = (operand 1 * operand 2) - operand 3, matched as an fma whose
;; addend is (neg operand 3).  The split passes the un-negated operand 3 to
;; fp16_vectorization together with the FP16_FMS subtype, so the negation is
;; conveyed by the subtype rather than by the operand.  The "[(pc)]"
;; replacement template is never used because the preparation code ends
;; with DONE.
(define_insn_and_split "*fms<mode>4"
[(set (match_operand:VFP16_HW 0 "vsx_register_operand")
(fma:VFP16_HW
(match_operand:VFP16_HW 1 "vsx_register_operand")
(match_operand:VFP16_HW 2 "vsx_register_operand")
(neg:VFP16_HW
(match_operand:VFP16_HW 3 "vsx_register_operand"))))]
"can_create_pseudo_p ()"
"#"
"&& 1"
[(pc)]
{
fp16_vectorization (FMA, operands[0], operands[1], operands[2],
operands[3], FP16_FMS);
DONE;
})
;; Negated fused multiply-add on 16-bit floating point vectors:
;; operand 0 = - ((operand 1 * operand 2) + operand 3), matched as the neg
;; of an fma.  The split passes the three inputs to fp16_vectorization with
;; the FP16_NFMA subtype, so the outer negation is conveyed by the subtype.
;; The "[(pc)]" replacement template is never used because the preparation
;; code ends with DONE.
(define_insn_and_split "*nfma<mode>4"
[(set (match_operand:VFP16_HW 0 "vsx_register_operand")
(neg:VFP16_HW
(fma:VFP16_HW
(match_operand:VFP16_HW 1 "vsx_register_operand")
(match_operand:VFP16_HW 2 "vsx_register_operand")
(match_operand:VFP16_HW 3 "vsx_register_operand"))))]
"can_create_pseudo_p ()"
"#"
"&& 1"
[(pc)]
{
fp16_vectorization (FMA, operands[0], operands[1], operands[2],
operands[3], FP16_NFMA);
DONE;
})
;; Negated fused multiply-subtract on 16-bit floating point vectors:
;; operand 0 = - ((operand 1 * operand 2) - operand 3), matched as the neg
;; of an fma whose addend is (neg operand 3).  The split passes the
;; un-negated operand 3 to fp16_vectorization with the FP16_NFMS subtype, so
;; both negations are conveyed by the subtype.  The "[(pc)]" replacement
;; template is never used because the preparation code ends with DONE.
(define_insn_and_split "*nfms<mode>4"
[(set (match_operand:VFP16_HW 0 "vsx_register_operand")
(neg:VFP16_HW
(fma:VFP16_HW
(match_operand:VFP16_HW 1 "vsx_register_operand")
(match_operand:VFP16_HW 2 "vsx_register_operand")
(neg:VFP16_HW
(match_operand:VFP16_HW 3 "vsx_register_operand")))))]
"can_create_pseudo_p ()"
"#"
"&& 1"
[(pc)]
{
fp16_vectorization (FMA, operands[0], operands[1], operands[2],
operands[3], FP16_NFMS);
DONE;
})
;; If we do multiple __bfloat16 operations, between the first and
;; second operation, GCC will want to convert the first operation from

View File

@ -263,6 +263,8 @@ extern unsigned constant_generates_xxspltidp (vec_const_128bit_type *);
/* Optimize bfloat16 and float16 operations. */
enum fp16_operation {
FP16_BINARY, /* Bfloat16/float16 binary op. */
FP16_ABS_BINARY, /* abs (binary op). */
FP16_NEG_BINARY, /* - (binary op). */
FP16_FMA, /* (a * b) + c. */
FP16_FMS, /* (a * b) - c. */
FP16_NFMA, /* - ((a * b) + c). */