Add unary, fma 16-bit floating point vector optimization.

2025-10-17 Michael Meissner <meissner@linux.ibm.com> gcc/ * config/rs6000/float16.cc (bfloat16_operation_as_v4sf): Add support for vectorizing unary and fma 16-bit floating point. (fp16_vectorization): Likewise. * config/rs6000/float16.md (neg<mode>2, VFP16_HW iterator): Likewise. (*xor<mode>3, VFP16_HW iterator): Likewise. (abs<mode>2, VFP16_HW iterator): Likewise. (*andc<mode>3, VFP16_HW iterator): Likewise. (*neg_<fp16_names><mode>3): Likewise. (*abs_<fp16_names><mode>3): Likewise. (fma<mode>4): Likewise. (*fms<mode>4): Likewise. (*nfma<mode>4): Likewise. (*nfms<mode>4): Likewise. * config/rs6000/rs6000-protos.h (FP16_ABS_BINARY): Likewise. (FP16_NEG_BINARY): Likewise.
2025-10-17 13:04:46 -04:00 · 2025-10-17 13:04:46 -04:00 · 5fc9771c32
parent c6f7231367
commit 5fc9771c32
3 changed files with 231 additions and 0 deletions
--- a/gcc/config/rs6000/float16.cc
+++ b/gcc/config/rs6000/float16.cc
@ -88,6 +88,8 @@ bfloat16_operation_as_v4sf (enum rtx_code icode,
      n_opts = 3;
      break;

+    case FP16_ABS_BINARY:
+    case FP16_NEG_BINARY:
    default:
      gcc_unreachable ();
    }
@ -174,6 +176,8 @@ bfloat16_operation_as_v4sf (enum rtx_code icode,
      }
      break;

+    case FP16_ABS_BINARY:
+    case FP16_NEG_BINARY:
    default:
      gcc_unreachable ();
    }
@ -212,6 +216,7 @@ fp16_vectorization (enum rtx_code icode,
 {
  gcc_assert (can_create_pseudo_p ());

+  enum rtx_code unary_op = UNKNOWN;
  machine_mode result_mode = GET_MODE (result);
  rtx op_orig[3] = { op1, op2, op3 };
  rtx op_hi[3];
@ -226,6 +231,16 @@ fp16_vectorization (enum rtx_code icode,
      n_opts = 2;
      break;

+    case FP16_NEG_BINARY:
+      n_opts = 2;
+      unary_op = NEG;
+      break;
+
+    case FP16_ABS_BINARY:
+      n_opts = 2;
+      unary_op = ABS;
+      break;
+
    case FP16_FMA:
    case FP16_FMS:
    case FP16_NFMA:
@ -274,6 +289,8 @@ fp16_vectorization (enum rtx_code icode,
  switch (subtype)
    {
    case FP16_BINARY:
+    case FP16_NEG_BINARY:
+    case FP16_ABS_BINARY:
      emit_insn (gen_rtx_SET (result_hi,
 			      gen_rtx_fmt_ee (icode, V4SFmode,
 					      op_hi[0],
@ -322,6 +339,16 @@ fp16_vectorization (enum rtx_code icode,
      gcc_unreachable ();
    }

+  /* Add any unary operator modifications.  */
+  if (unary_op != UNKNOWN)
+    {
+      emit_insn (gen_rtx_SET (result_hi,
+			      gen_rtx_fmt_e (unary_op, V4SFmode, result_hi)));
+
+      emit_insn (gen_rtx_SET (result_lo,
+			      gen_rtx_fmt_e (unary_op, V4SFmode, result_lo)));
+    }
+
  /* Combine the 2 V4SFmode operations into one V8HFmode/V8BFmode vector.  */
  if (result_mode == V8HFmode)
    emit_insn (gen_vec_pack_trunc_v4sf_v8hf (result, result_hi, result_lo));
--- a/gcc/config/rs6000/float16.md
+++ b/gcc/config/rs6000/float16.md
@ -706,6 +706,104 @@
 })

 ;; Add vectorization support for 16-bit floating point.
+
+;; Negate vector bfloat16/float16.  The insn is always split into two
+;; insns: load a vector whose 8 elements are all -0.0 into the scratch
+;; register, then XOR the input with that vector.
+(define_insn_and_split "neg<mode>2"
+  [(set (match_operand:VFP16_HW 0 "vsx_register_operand" "=wa")
+	(neg:VFP16_HW
+	 (match_operand:VFP16_HW 1 "vsx_register_operand" "wa")))
+   (clobber (match_scratch:VFP16_HW 2 "=&wa"))]
+  ""
+  "#"
+  "&& 1"
+  [(set (match_dup 2)
+	(match_dup 3))
+   (set (match_dup 0)
+	(xor:VFP16_HW (match_dup 1)
+		      (match_dup 2)))]
+{
+  /* If the scratch has not been allocated yet, use a new pseudo.  */
+  if (GET_CODE (operands[2]) == SCRATCH)
+    operands[2] = gen_reg_rtx (<MODE>mode);
+
+  /* Do the conversion outside of gcc_assert.  The argument of gcc_assert
+     is not evaluated when assertion checking is disabled, which would
+     leave DCONST uninitialized.  */
+  REAL_VALUE_TYPE dconst;
+  int status = real_from_string (&dconst, "-0.0");
+  gcc_assert (status == 0);
+
+  /* Build a vector constant with -0.0 in each of the 8 elements.  */
+  rtx neg0 = const_double_from_real_value (dconst, <VEC_base>mode);
+  rtvec v = rtvec_alloc (8);
+
+  for (size_t i = 0; i < 8; i++)
+    RTVEC_ELT (v, i) = neg0;
+
+  /* Without prefixed instructions, load the constant from memory.  */
+  rtx vneg0 = gen_rtx_CONST_VECTOR (<MODE>mode, v);
+  if (!TARGET_PREFIXED)
+    vneg0 = force_const_mem (<MODE>mode, vneg0);
+
+  operands[3] = vneg0;
+}
+  [(set_attr "type" "veclogical")
+   (set_attr "length" "16")])
+
+;; XOR used to negate a 16-bit floating point type.  This matches the
+;; second insn produced by the neg<mode>2 splitter and emits a single
+;; xxlxor of the two VSX register inputs.
+
+(define_insn "*xor<mode>3"
+  [(set (match_operand:VFP16_HW 0 "vsx_register_operand" "=wa")
+	(xor:VFP16_HW (match_operand:VFP16_HW 1 "vsx_register_operand" "wa")
+		      (match_operand:VFP16_HW 2 "vsx_register_operand" "wa")))]
+  ""
+  "xxlxor %x0,%x1,%x2"
+  [(set_attr "type" "veclogical")])
+
+;; 16-bit floating point vector absolute value.  The insn is always split
+;; into two insns: load a vector whose 8 elements are all -0.0 into the
+;; scratch register, then AND the input with the complement of that vector.
+
+(define_insn_and_split "abs<mode>2"
+  [(set (match_operand:VFP16_HW 0 "vsx_register_operand" "=wa")
+	(abs:VFP16_HW
+	 (match_operand:VFP16_HW 1 "vsx_register_operand" "wa")))
+   (clobber (match_scratch:VFP16_HW 2 "=&wa"))]
+  ""
+  "#"
+  "&& 1"
+  [(set (match_dup 2)
+	(match_dup 3))
+   (set (match_dup 0)
+	(and:VFP16_HW (match_dup 1)
+		      (not:VFP16_HW (match_dup 2))))]
+{
+  /* If the scratch has not been allocated yet, use a new pseudo.  */
+  if (GET_CODE (operands[2]) == SCRATCH)
+    operands[2] = gen_reg_rtx (<MODE>mode);
+
+  /* Do the conversion outside of gcc_assert.  The argument of gcc_assert
+     is not evaluated when assertion checking is disabled, which would
+     leave DCONST uninitialized.  */
+  REAL_VALUE_TYPE dconst;
+  int status = real_from_string (&dconst, "-0.0");
+  gcc_assert (status == 0);
+
+  /* Build a vector constant with -0.0 in each of the 8 elements.  */
+  rtx neg0 = const_double_from_real_value (dconst, <VEC_base>mode);
+  rtvec v = rtvec_alloc (8);
+
+  for (size_t i = 0; i < 8; i++)
+    RTVEC_ELT (v, i) = neg0;
+
+  /* Without prefixed instructions, load the constant from memory.  */
+  rtx vneg0 = gen_rtx_CONST_VECTOR (<MODE>mode, v);
+  if (!TARGET_PREFIXED)
+    vneg0 = force_const_mem (<MODE>mode, vneg0);
+
+  operands[3] = vneg0;
+}
+  [(set_attr "type" "veclogical")
+   (set_attr "length" "16")])
+
+;; ANDC used to clear the sign bit of a 16-bit floating point type
+;; for absolute value.  This matches the second insn produced by the
+;; abs<mode>2 splitter.  The operands use vsx_register_operand, like the
+;; xor pattern used for negation, since xxlandc operates on VSX registers.
+
+(define_insn "*andc<mode>3"
+  [(set (match_operand:VFP16_HW 0 "vsx_register_operand" "=wa")
+	(and:VFP16_HW (match_operand:VFP16_HW 1 "vsx_register_operand" "wa")
+		      (not:VFP16_HW
+		       (match_operand:VFP16_HW 2 "vsx_register_operand" "wa"))))]
+  ""
+  "xxlandc %x0,%x1,%x2"
+  [(set_attr "type" "veclogical")])
+
 ;; Binary operators being vectorized.
 (define_insn_and_split "<fp16_names><mode>3"
  [(set (match_operand:VFP16_HW 0 "vsx_register_operand")
@ -722,6 +820,110 @@
  DONE;
 })

+;; Negative of binary operators being vectorized.  The insn is only valid
+;; while pseudo registers can still be created, and it is always split by
+;; calling fp16_vectorization with the binary operator's rtx code and
+;; FP16_NEG_BINARY, so the negation is emitted together with the
+;; binary operation.
+(define_insn_and_split "*neg_<fp16_names><mode>3"
+  [(set (match_operand:VFP16_HW 0 "vsx_register_operand")
+	(neg:VFP16_HW
+	 (FP16_BINARY_OP:VFP16_HW
+	  (match_operand:VFP16_HW 1 "vsx_register_operand")
+	  (match_operand:VFP16_HW 2 "vsx_register_operand"))))]
+  "can_create_pseudo_p ()"
+  "#"
+  "&& 1"
+  [(pc)]
+{
+  /* There is no third input operand for a binary operation.  */
+  fp16_vectorization (<CODE>, operands[0], operands[1], operands[2],
+		      NULL_RTX, FP16_NEG_BINARY);
+  DONE;
+})
+
+;; Absolute value of binary operators being vectorized.  The insn is only
+;; valid while pseudo registers can still be created, and it is always
+;; split by calling fp16_vectorization with the binary operator's rtx code
+;; and FP16_ABS_BINARY, so the absolute value is emitted together with the
+;; binary operation.
+(define_insn_and_split "*abs_<fp16_names><mode>3"
+  [(set (match_operand:VFP16_HW 0 "vsx_register_operand")
+	(abs:VFP16_HW
+	 (FP16_BINARY_OP:VFP16_HW
+	  (match_operand:VFP16_HW 1 "vsx_register_operand")
+	  (match_operand:VFP16_HW 2 "vsx_register_operand"))))]
+  "can_create_pseudo_p ()"
+  "#"
+  "&& 1"
+  [(pc)]
+{
+  /* There is no third input operand for a binary operation.  */
+  fp16_vectorization (<CODE>, operands[0], operands[1], operands[2],
+		      NULL_RTX, FP16_ABS_BINARY);
+  DONE;
+})
+
+;; FMA operations being vectorized.  This pattern is the fused
+;; multiply-add, (a * b) + c.  The insn is only valid while pseudo
+;; registers can still be created, and it is always split by calling
+;; fp16_vectorization with FP16_FMA.
+(define_insn_and_split "fma<mode>4"
+  [(set (match_operand:VFP16_HW 0 "vsx_register_operand")
+	(fma:VFP16_HW
+	 (match_operand:VFP16_HW 1 "vsx_register_operand")
+	 (match_operand:VFP16_HW 2 "vsx_register_operand")
+	 (match_operand:VFP16_HW 3 "vsx_register_operand")))]
+  "can_create_pseudo_p ()"
+  "#"
+  "&& 1"
+  [(pc)]
+{
+  fp16_vectorization (FMA, operands[0], operands[1], operands[2],
+		      operands[3], FP16_FMA);
+  DONE;
+})
+
+;; Fused multiply-subtract, (a * b) - c, being vectorized.  The
+;; subtraction is represented as an fma whose third operand is negated.
+;; The insn is only valid while pseudo registers can still be created, and
+;; it is always split by calling fp16_vectorization with FP16_FMS.
+(define_insn_and_split "*fms<mode>4"
+  [(set (match_operand:VFP16_HW 0 "vsx_register_operand")
+	(fma:VFP16_HW
+	 (match_operand:VFP16_HW 1 "vsx_register_operand")
+	 (match_operand:VFP16_HW 2 "vsx_register_operand")
+	 (neg:VFP16_HW
+	  (match_operand:VFP16_HW 3 "vsx_register_operand"))))]
+  "can_create_pseudo_p ()"
+  "#"
+  "&& 1"
+  [(pc)]
+{
+  /* Operand 3 is passed without the NEG; FP16_FMS identifies the form.  */
+  fp16_vectorization (FMA, operands[0], operands[1], operands[2],
+		      operands[3], FP16_FMS);
+  DONE;
+})
+
+;; Negated fused multiply-add, - ((a * b) + c), being vectorized.  The
+;; insn is only valid while pseudo registers can still be created, and it
+;; is always split by calling fp16_vectorization with FP16_NFMA.
+(define_insn_and_split "*nfma<mode>4"
+  [(set (match_operand:VFP16_HW 0 "vsx_register_operand")
+	(neg:VFP16_HW
+	 (fma:VFP16_HW
+	  (match_operand:VFP16_HW 1 "vsx_register_operand")
+	  (match_operand:VFP16_HW 2 "vsx_register_operand")
+	  (match_operand:VFP16_HW 3 "vsx_register_operand"))))]
+  "can_create_pseudo_p ()"
+  "#"
+  "&& 1"
+  [(pc)]
+{
+  /* The outer NEG is not passed; FP16_NFMA identifies the form.  */
+  fp16_vectorization (FMA, operands[0], operands[1], operands[2],
+		      operands[3], FP16_NFMA);
+  DONE;
+})
+
+;; Negated fused multiply-subtract, - ((a * b) - c), being vectorized.
+;; The pattern is a NEG of an fma whose third operand is negated.  The
+;; insn is only valid while pseudo registers can still be created, and it
+;; is always split by calling fp16_vectorization with FP16_NFMS.
+(define_insn_and_split "*nfms<mode>4"
+  [(set (match_operand:VFP16_HW 0 "vsx_register_operand")
+	(neg:VFP16_HW
+	 (fma:VFP16_HW
+	  (match_operand:VFP16_HW 1 "vsx_register_operand")
+	  (match_operand:VFP16_HW 2 "vsx_register_operand")
+	  (neg:VFP16_HW
+	   (match_operand:VFP16_HW 3 "vsx_register_operand")))))]
+  "can_create_pseudo_p ()"
+  "#"
+  "&& 1"
+  [(pc)]
+{
+  /* Neither NEG is passed; FP16_NFMS identifies the form.  */
+  fp16_vectorization (FMA, operands[0], operands[1], operands[2],
+		      operands[3], FP16_NFMS);
+  DONE;
+})
+
+

 ;; If we do multiple __bfloat16 operations, between the first and
 ;; second operation, GCC will want to convert the first operation from
--- a/gcc/config/rs6000/rs6000-protos.h
+++ b/gcc/config/rs6000/rs6000-protos.h
@ -263,6 +263,8 @@ extern unsigned constant_generates_xxspltidp (vec_const_128bit_type *);
 /* Optimize bfloat16 and float16 operations.  */
 enum fp16_operation {
  FP16_BINARY,				/* Bfloat16/float16 binary op.  */
+  FP16_ABS_BINARY,			/* abs (binary op).  */
+  FP16_NEG_BINARY,			/* - (binary op).  */
  FP16_FMA,				/* (a * b) + c.  */
  FP16_FMS,				/* (a * b) - c.  */
  FP16_NFMA,				/* - ((a * b) + c).  */