AArch64: add double widen_sum optab using dotprod for Adv.SIMD [PR122069]

This patch implements support for using dot product to do sum reductions by
changing += a into += (a * 1), i.e. we seed the multiplication with 1.

Given the example

int foo_int(unsigned char *x, unsigned char * restrict y) {
  int sum = 0;
  for (int i = 0; i < 8000; i++)
    sum += char_abs(x[i] - y[i]);
  return sum;
}

we used to generate

.L2:
        ldr     q0, [x0, x2]
        ldr     q28, [x1, x2]
        sub     v28.16b, v0.16b, v28.16b
        zip1    v29.16b, v28.16b, v31.16b
        zip2    v28.16b, v28.16b, v31.16b
        uaddw   v30.4s, v30.4s, v29.4h
        uaddw2  v30.4s, v30.4s, v29.8h
        uaddw   v30.4s, v30.4s, v28.4h
        uaddw2  v30.4s, v30.4s, v28.8h
        add     x2, x2, 16
        cmp     x2, x3
        bne     .L2
        addv    s31, v30.4s

but now generates with +dotprod

.L2:
        ldr     q29, [x0, x2]
        ldr     q28, [x1, x2]
        sub     v28.16b, v29.16b, v28.16b
        udot    v31.4s, v28.16b, v30.16b
        add     x2, x2, 16
        cmp     x2, x3
        bne     .L2
        addv    s31, v31.4s

gcc/ChangeLog:

        PR middle-end/122069
        * config/aarch64/aarch64-simd.md (widen_ssum<mode><vsi2qi>3): New.
        (widen_usum<mode><vsi2qi>3): New.

gcc/testsuite/ChangeLog:

        PR middle-end/122069
        * gcc.target/aarch64/pr122069_3.c: New test.
        * gcc.target/aarch64/pr122069_4.c: New test.
2025-10-18 08:20:07 +01:00 · 2025-10-18 08:20:07 +01:00 · c8dc5d5070
parent b394181afd
commit c8dc5d5070
3 changed files with 154 additions and 0 deletions
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@ -4703,6 +4703,38 @@
  DONE;
 })

+;; Signed double widening sum.  Use dot product to perform the reduction by
+;; changing += a into += (a * 1), i.e. we seed the multiplication with 1:
+;; operand 1 is multiplied by a vector of ones via the signed dot product
+;; expander, with operand 2 as the accumulator and operand 0 as the result.
+;; Only available when TARGET_DOTPROD holds.
+(define_expand "widen_ssum<mode><vsi2qi>3"
+  [(set (match_operand:VS 0 "register_operand")
+	(plus:VS (sign_extend:VS
+		   (match_operand:<VSI2QI> 1 "register_operand"))
+		 (match_operand:VS 2 "register_operand")))]
+  "TARGET_DOTPROD"
+  {
+    rtx ones = force_reg (<VSI2QI>mode, CONST1_RTX (<VSI2QI>mode));
+    emit_insn (gen_sdot_prod<mode><vsi2qi> (operands[0], operands[1], ones,
+					    operands[2]));
+    DONE;
+  }
+)
+
+;; Unsigned double widening sum.  Use dot product to perform the reduction by
+;; changing += a into += (a * 1), i.e. we seed the multiplication with 1:
+;; operand 1 is multiplied by a vector of ones via the unsigned dot product
+;; expander, with operand 2 as the accumulator and operand 0 as the result.
+;; Only available when TARGET_DOTPROD holds.
+(define_expand "widen_usum<mode><vsi2qi>3"
+  [(set (match_operand:VS 0 "register_operand")
+	(plus:VS (zero_extend:VS
+		        (match_operand:<VSI2QI> 1 "register_operand"))
+		      (match_operand:VS 2 "register_operand")))]
+  "TARGET_DOTPROD"
+  {
+    rtx ones = force_reg (<VSI2QI>mode, CONST1_RTX (<VSI2QI>mode));
+    emit_insn (gen_udot_prod<mode><vsi2qi> (operands[0], operands[1], ones,
+					    operands[2]));
+    DONE;
+  }
+)
+
 (define_insn "aarch64_<ANY_EXTEND:su>subw<mode>"
  [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
 	(minus:<VWIDE> (match_operand:<VWIDE> 1 "register_operand" "w")
--- a/gcc/testsuite/gcc.target/aarch64/pr122069_3.c
+++ b/gcc/testsuite/gcc.target/aarch64/pr122069_3.c
@ -0,0 +1,41 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=armv8.2-a+dotprod -mautovec-preference=asimd-only --param vect-epilogues-nomask=0 -fno-schedule-insns -fno-reorder-blocks -fno-schedule-insns2 -fdump-tree-vect-details" }*/
+/* { dg-final { check-function-bodies "**" "" } } */
+
+/* Return I when it is non-negative and -I otherwise, converted back to char.
+   NOTE(review): plain char is presumably unsigned on this target, which
+   would make the negative branch dead -- confirm.  */
+inline char char_abs(char i) {
+  return (i < 0 ? -i : i);
+}
+
+/*
+** foo_int:
+** 	...
+** 	sub	v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
+** 	udot	v[0-9]+.4s, v[0-9]+.16b, v[0-9]+.16b
+** 	...
+*/
+/* Sum char_abs (x[i] - y[i]) over 8000 byte elements into an int.  The
+   check-function-bodies pattern above expects the vector byte subtract to
+   be followed by a UDOT accumulating into 32-bit lanes.  */
+int foo_int(unsigned char *x, unsigned char * restrict y) {
+  int sum = 0;
+  for (int i = 0; i < 8000; i++)
+     sum += char_abs(x[i] - y[i]);
+  return sum;
+}
+
+/*
+** foo2_int:
+** 	...
+** 	add	v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h
+** 	uaddw	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4h
+** 	uaddw2	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.8h
+** 	...
+*/
+/* Store x[i] + y[i] back into x[i] and sum the stored values into an int
+   over 8000 unsigned short elements.  The check-function-bodies pattern
+   above expects a halfword ADD followed by UADDW/UADDW2 for this loop.  */
+int foo2_int(unsigned short *x, unsigned short * restrict y) {
+  int sum = 0;
+  for (int i = 0; i < 8000; i++)
+    {
+      x[i] = x[i] + y[i];
+      sum += x[i];
+    }
+  return sum;
+}
+
+/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 2 "vect" } } */
--- a/gcc/testsuite/gcc.target/aarch64/pr122069_4.c
+++ b/gcc/testsuite/gcc.target/aarch64/pr122069_4.c
@ -0,0 +1,81 @@
+/* { dg-do run } */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw }*/
+/* { dg-options "-O3 -march=armv8.2-a+dotprod -mautovec-preference=asimd-only -fdump-tree-vect-details" }*/
+
+/* Return I when it is non-negative and -I otherwise, converted back to char.
+   NOTE(review): plain char is presumably unsigned on this target, which
+   would make the negative branch dead -- confirm.  */
+inline char char_abs(char i) {
+  return (i < 0 ? -i : i);
+}
+
+/* Sum char_abs (x[i] - y[i]) over 100 byte elements into an int.  The loop
+   carries no novector pragma; foo_int2 is the same computation with
+   vectorization disabled and serves as the reference in main.  */
+__attribute__((noipa))
+int foo_int(unsigned char *x, unsigned char * restrict y) {
+  int sum = 0;
+  for (int i = 0; i < 100; i++)
+     sum += char_abs(x[i] - y[i]);
+  return sum;
+}
+
+/* Write x[i] + y[i] to z[i] and sum the stored values into an int over 100
+   unsigned short elements.  The loop carries no novector pragma; foo2_int2
+   is the same computation with vectorization disabled.  */
+__attribute__((noipa))
+int foo2_int(unsigned short *x, unsigned short * restrict y,
+	     unsigned short * restrict z) {
+  int sum = 0;
+  for (int i = 0; i < 100; i++)
+    {
+      z[i] = x[i] + y[i];
+      sum += z[i];
+    }
+  return sum;
+}
+
+/* Reference version of foo_int: the same loop, but marked with
+   #pragma GCC novector.  main compares its result against foo_int.  */
+__attribute__((noipa))
+int foo_int2(unsigned char *x, unsigned char * restrict y) {
+  int sum = 0;
+#pragma GCC novector
+  for (int i = 0; i < 100; i++)
+     sum += char_abs(x[i] - y[i]);
+  return sum;
+}
+
+/* Reference version of foo2_int: the same loop, but marked with
+   #pragma GCC novector.  main compares both its return value and the
+   contents it writes to Z against foo2_int.  */
+__attribute__((noipa))
+int foo2_int2(unsigned short *x, unsigned short * restrict y,
+	      unsigned short * restrict z) {
+  int sum = 0;
+#pragma GCC novector
+  for (int i = 0; i < 100; i++)
+    {
+      z[i] = x[i] + y[i];
+      sum += z[i];
+    }
+  return sum;
+}
+
+/* Fill the inputs, then check that each function returns the same sum as
+   its novector counterpart and that both foo2 variants wrote identical
+   output arrays.  Abort on any mismatch.  */
+int main ()
+{
+  unsigned short a[100];
+  unsigned short b[100];
+  unsigned short r1[100];
+  unsigned short r2[100];
+  unsigned char c[100];
+  unsigned char d[100];
+  /* The setup and checking loops in main are marked novector as well.  */
+#pragma GCC novector
+  for (int i = 0; i < 100; i++)
+    {
+      a[i] = c[i] = i;
+      b[i] = d[i] = 100 - i;
+    }
+
+  /* Byte inputs: compare foo_int against its novector twin.  */
+  if (foo_int (c, d) != foo_int2 (c, d))
+    __builtin_abort();
+
+
+  /* Halfword inputs: compare the returned sums first ...  */
+  if (foo2_int (a, b, r1) != foo2_int2 (a, b, r2))
+    __builtin_abort();
+
+  /* ... then the arrays each variant stored through Z.  */
+#pragma GCC novector
+  for (int i = 0; i < 100; i++)
+    if (r1[i] != r2[i])
+      __builtin_abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 2 "vect" } } */