AArch64: expand widen_ssum/widen_usum through the dot-product patterns.

Adds two expanders, both guarded by TARGET_DOTPROD, that rewrite a widening
sum "+= a" as "+= (a * 1)": the narrow input (operand 1) is multiplied by a
register holding CONST1_RTX and accumulated onto operand 2 via
gen_sdot_prod / gen_udot_prod.  Also adds a compile-only test that checks the
generated instructions (pr122069_3.c) and a run-time test that compares the
vectorized loops against "#pragma GCC novector" copies (pr122069_4.c).

NOTE(review): several operand modes and pattern-name suffixes below look as
if their angle-bracket iterator/attribute references were lost in transit
(e.g. "(match_operand: 1 ...)", "widen_ssum3", "force_reg (mode, ...)").
They are reproduced exactly as received; restore them from the original
submission before applying.  Indentation was rebuilt by hand, so the context
lines may need their whitespace re-synced with the target file, and the
"index" hashes are carried over unchanged and no longer describe the
post-image.

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 6488119a1402..eaa8d57cc413 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -4703,6 +4703,38 @@
   DONE;
 })
 
+;; Use dot product to perform double widening sum reductions by
+;; changing += a into += (a * 1).  i.e. we seed the multiplication with 1.
+(define_expand "widen_ssum3"
+  [(set (match_operand:VS 0 "register_operand")
+        (plus:VS (sign_extend:VS
+                   (match_operand: 1 "register_operand"))
+                 (match_operand:VS 2 "register_operand")))]
+  "TARGET_DOTPROD"
+  {
+    rtx ones = force_reg (mode, CONST1_RTX (mode));
+    emit_insn (gen_sdot_prod (operands[0], operands[1], ones,
+                              operands[2]));
+    DONE;
+  }
+)
+
+;; Use dot product to perform double widening sum reductions by
+;; changing += a into += (a * 1).  i.e. we seed the multiplication with 1.
+(define_expand "widen_usum3"
+  [(set (match_operand:VS 0 "register_operand")
+        (plus:VS (zero_extend:VS
+                   (match_operand: 1 "register_operand"))
+                 (match_operand:VS 2 "register_operand")))]
+  "TARGET_DOTPROD"
+  {
+    rtx ones = force_reg (mode, CONST1_RTX (mode));
+    emit_insn (gen_udot_prod (operands[0], operands[1], ones,
+                              operands[2]));
+    DONE;
+  }
+)
+
 (define_insn "aarch64_subw"
   [(set (match_operand: 0 "register_operand" "=w")
         (minus: (match_operand: 1 "register_operand" "w")
diff --git a/gcc/testsuite/gcc.target/aarch64/pr122069_3.c b/gcc/testsuite/gcc.target/aarch64/pr122069_3.c
new file mode 100644
index 000000000000..0e832c43032a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr122069_3.c
@@ -0,0 +1,41 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=armv8.2-a+dotprod -mautovec-preference=asimd-only --param vect-epilogues-nomask=0 -fno-schedule-insns -fno-reorder-blocks -fno-schedule-insns2 -fdump-tree-vect-details" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+static inline char char_abs(char i) {
+  return (i < 0 ? -i : i);
+}
+
+/*
+** foo_int:
+**	...
+**	sub	v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
+**	udot	v[0-9]+.4s, v[0-9]+.16b, v[0-9]+.16b
+**	...
+*/
+int foo_int(unsigned char *x, unsigned char * restrict y) {
+  int sum = 0;
+  for (int i = 0; i < 8000; i++)
+     sum += char_abs(x[i] - y[i]);
+  return sum;
+}
+
+/*
+** foo2_int:
+**	...
+**	add	v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h
+**	uaddw	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4h
+**	uaddw2	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.8h
+**	...
+*/
+int foo2_int(unsigned short *x, unsigned short * restrict y) {
+  int sum = 0;
+  for (int i = 0; i < 8000; i++)
+    {
+      x[i] = x[i] + y[i];
+      sum += x[i];
+    }
+  return sum;
+}
+
+/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 2 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/pr122069_4.c b/gcc/testsuite/gcc.target/aarch64/pr122069_4.c
new file mode 100644
index 000000000000..22d5f631de21
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr122069_4.c
@@ -0,0 +1,81 @@
+/* { dg-do run } */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw } */
+/* { dg-options "-O3 -march=armv8.2-a+dotprod -mautovec-preference=asimd-only -fdump-tree-vect-details" } */
+
+static inline char char_abs(char i) {
+  return (i < 0 ? -i : i);
+}
+
+__attribute__((noipa))
+int foo_int(unsigned char *x, unsigned char * restrict y) {
+  int sum = 0;
+  for (int i = 0; i < 100; i++)
+     sum += char_abs(x[i] - y[i]);
+  return sum;
+}
+
+__attribute__((noipa))
+int foo2_int(unsigned short *x, unsigned short * restrict y,
+             unsigned short * restrict z) {
+  int sum = 0;
+  for (int i = 0; i < 100; i++)
+    {
+      z[i] = x[i] + y[i];
+      sum += z[i];
+    }
+  return sum;
+}
+
+__attribute__((noipa))
+int foo_int2(unsigned char *x, unsigned char * restrict y) {
+  int sum = 0;
+#pragma GCC novector
+  for (int i = 0; i < 100; i++)
+     sum += char_abs(x[i] - y[i]);
+  return sum;
+}
+
+__attribute__((noipa))
+int foo2_int2(unsigned short *x, unsigned short * restrict y,
+              unsigned short * restrict z) {
+  int sum = 0;
+#pragma GCC novector
+  for (int i = 0; i < 100; i++)
+    {
+      z[i] = x[i] + y[i];
+      sum += z[i];
+    }
+  return sum;
+}
+
+int main ()
+{
+  unsigned short a[100];
+  unsigned short b[100];
+  unsigned short r1[100];
+  unsigned short r2[100];
+  unsigned char c[100];
+  unsigned char d[100];
+#pragma GCC novector
+  for (int i = 0; i < 100; i++)
+    {
+      a[i] = c[i] = i;
+      b[i] = d[i] = 100 - i;
+    }
+
+  if (foo_int (c, d) != foo_int2 (c, d))
+    __builtin_abort();
+
+
+  if (foo2_int (a, b, r1) != foo2_int2 (a, b, r2))
+    __builtin_abort();
+
+#pragma GCC novector
+  for (int i = 0; i < 100; i++)
+    if (r1[i] != r2[i])
+      __builtin_abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 2 "vect" } } */