mirror of git://gcc.gnu.org/git/gcc.git
AArch64: Implement widen_[us]sum using 2-way [US]DOT for SVE2p1 [PR122069]
SVE2p1 adds a 2-way dot product which we can use when we have to do a single-step widening addition. This is useful, for instance, when the value to be widened does not come from a load. For example, for

int foo2_int(unsigned short *x, unsigned short * restrict y) {
  int sum = 0;
  for (int i = 0; i < 8000; i++)
    {
      x[i] = x[i] + y[i];
      sum += x[i];
    }
  return sum;
}

we used to generate

.L12:
        ld1h    z30.h, p7/z, [x0, x2, lsl 1]
        ld1h    z29.h, p7/z, [x1, x2, lsl 1]
        add     z30.h, z30.h, z29.h
        uaddwb  z31.s, z31.s, z30.h
        uaddwt  z31.s, z31.s, z30.h
        st1h    z30.h, p7, [x0, x2, lsl 1]
        mov     x3, x2
        inch    x2
        cmp     w2, w4
        bls     .L12
        inch    x3
        uaddv   d31, p7, z31.s

but with +sve2p1

.L12:
        ld1h    z31.h, p7/z, [x0, x2, lsl 1]
        ld1h    z29.h, p7/z, [x1, x2, lsl 1]
        add     z31.h, z31.h, z29.h
        udot    z30.s, z31.h, z28.h
        st1h    z31.h, p7, [x0, x2, lsl 1]
        mov     x3, x2
        inch    x2
        cmp     w2, w4
        bls     .L12
        inch    x3
        uaddv   d30, p7, z30.s

gcc/ChangeLog:

        PR middle-end/122069
        * config/aarch64/aarch64-sve2.md (widen_ssum<mode><Vnarrow>3): Update.
        (widen_usum<mode><Vnarrow>3): Update.

gcc/testsuite/ChangeLog:

        PR middle-end/122069
        * gcc.target/aarch64/sve2/pr122069_3.c: New test.
        * gcc.target/aarch64/sve2/pr122069_4.c: New test.
This commit is contained in:
parent
25c8a8d431
commit
75fb400d29
|
@ -2390,7 +2390,19 @@
|
||||||
(match_dup 1)]
|
(match_dup 1)]
|
||||||
UNSPEC_SADDWT))]
|
UNSPEC_SADDWT))]
|
||||||
"TARGET_SVE2"
|
"TARGET_SVE2"
|
||||||
)
|
{
|
||||||
|
/* Use the 2-way dot product to perform single-step widening sum reductions by
|
||||||
|
changing += a into += (a * 1). i.e. we seed the multiplication with 1. */
|
||||||
|
if (TARGET_SVE2p1_OR_SME2
|
||||||
|
&& <VNARROW>mode == VNx8HImode
|
||||||
|
&& <MODE>mode == VNx4SImode)
|
||||||
|
{
|
||||||
|
rtx ones = force_reg (VNx8HImode, CONST1_RTX (VNx8HImode));
|
||||||
|
emit_insn (gen_sdot_prodvnx4sivnx8hi (operands[0], operands[1],
|
||||||
|
ones, operands[2]));
|
||||||
|
DONE;
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
;; Define single step widening for widen_usum using UADDWB and UADDWT
|
;; Define single step widening for widen_usum using UADDWB and UADDWT
|
||||||
(define_expand "widen_usum<mode><Vnarrow>3"
|
(define_expand "widen_usum<mode><Vnarrow>3"
|
||||||
|
@ -2405,7 +2417,17 @@
|
||||||
(match_dup 1)]
|
(match_dup 1)]
|
||||||
UNSPEC_UADDWT))]
|
UNSPEC_UADDWT))]
|
||||||
"TARGET_SVE2"
|
"TARGET_SVE2"
|
||||||
)
|
{
|
||||||
|
if (TARGET_SVE2p1_OR_SME2
|
||||||
|
&& <VNARROW>mode == VNx8HImode
|
||||||
|
&& <MODE>mode == VNx4SImode)
|
||||||
|
{
|
||||||
|
rtx ones = force_reg (VNx8HImode, CONST1_RTX (VNx8HImode));
|
||||||
|
emit_insn (gen_udot_prodvnx4sivnx8hi (operands[0], operands[1],
|
||||||
|
ones, operands[2]));
|
||||||
|
DONE;
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
;; -------------------------------------------------------------------------
|
;; -------------------------------------------------------------------------
|
||||||
;; ---- [INT] Long binary arithmetic
|
;; ---- [INT] Long binary arithmetic
|
||||||
|
|
|
@ -0,0 +1,39 @@
|
||||||
|
/* { dg-do compile } */
|
||||||
|
/* { dg-options "-O3 -march=armv8-a+sve2p1 -mautovec-preference=sve-only --param vect-epilogues-nomask=0 -fno-schedule-insns -fno-reorder-blocks -fno-schedule-insns2 -fdump-tree-vect-details" }*/
|
||||||
|
/* { dg-final { check-function-bodies "**" "" } } */
|
||||||
|
|
||||||
|
/* Return I unchanged when it is non-negative, otherwise its negation.  The
   negation is computed after integer promotion and converted back to char
   on return.  Whether plain char is signed is implementation-defined; where
   it is unsigned the I < 0 arm is never taken and this is the identity.  */
inline char char_abs(char i) {
|
||||||
|
return (i < 0 ? -i : i);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
** foo_int:
|
||||||
|
** ...
|
||||||
|
** sub z[0-9]+.b, z[0-9]+.b, z[0-9]+.b
|
||||||
|
** udot z[0-9]+.s, z[0-9]+.b, z[0-9]+.b
|
||||||
|
** ...
|
||||||
|
*/
|
||||||
|
/* Reduce 8000 byte elements into an int: each iteration passes the
   difference x[i] - y[i] through char_abs (which narrows it to char) and
   adds the result to SUM.  Returns the accumulated total.  The source form
   of this loop is what the expected-assembly comment for foo_int is
   matched against, so keep it as written.  */
int foo_int(unsigned char *x, unsigned char * restrict y) {
|
||||||
|
int sum = 0;
|
||||||
|
for (int i = 0; i < 8000; i++)
|
||||||
|
sum += char_abs(x[i] - y[i]);
|
||||||
|
return sum;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
** foo2_int:
|
||||||
|
** ...
|
||||||
|
** udot z[0-9]+.s, z[0-9]+.h, z[0-9]+.h
|
||||||
|
** ...
|
||||||
|
*/
|
||||||
|
/* For each of 8000 elements, store x[i] + y[i] back into x[i] (the
   assignment narrows the sum to unsigned short) and add the stored value to
   the int accumulator SUM.  Returns the accumulated total.  The value being
   widened here is the result of the addition rather than a freshly loaded
   element.  */
int foo2_int(unsigned short *x, unsigned short * restrict y) {
|
||||||
|
int sum = 0;
|
||||||
|
for (int i = 0; i < 8000; i++)
|
||||||
|
{
|
||||||
|
x[i] = x[i] + y[i];
|
||||||
|
sum += x[i];
|
||||||
|
}
|
||||||
|
return sum;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 2 "vect" } } */
|
|
@ -0,0 +1,81 @@
|
||||||
|
/* { dg-do run } */
|
||||||
|
/* { dg-require-effective-target aarch64_sve2p1_hw } */
|
||||||
|
/* { dg-options "-O3 -march=armv8-a+sve2p1 -mautovec-preference=sve-only -fdump-tree-vect-details" }*/
|
||||||
|
|
||||||
|
/* Return I unchanged when it is non-negative, otherwise its negation.  The
   negation is computed after integer promotion and converted back to char
   on return.  Whether plain char is signed is implementation-defined; where
   it is unsigned the I < 0 arm is never taken and this is the identity.

   Declared static: under C99 and later a plain `inline' definition does not
   provide an external definition, so any call the compiler chooses not to
   inline would be an undefined reference when this run test is linked.  */
static inline char char_abs(char i) {
  return (i < 0 ? -i : i);
}
|
||||||
|
|
||||||
|
/* Version of the byte reduction that the vectorizer is allowed to
   transform: sums char_abs (x[i] - y[i]) over 100 elements into an int and
   returns the total.  noipa keeps interprocedural optimizations from
   crossing the call boundary in main.  */
__attribute__((noipa))
|
||||||
|
int foo_int(unsigned char *x, unsigned char * restrict y) {
|
||||||
|
int sum = 0;
|
||||||
|
for (int i = 0; i < 100; i++)
|
||||||
|
sum += char_abs(x[i] - y[i]);
|
||||||
|
return sum;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Version of the halfword reduction that the vectorizer is allowed to
   transform: for each of 100 elements store x[i] + y[i] into z[i] (narrowed
   to unsigned short by the assignment) and add the stored value to an int
   accumulator.  Returns the total; Z receives the element-wise sums.  */
__attribute__((noipa))
|
||||||
|
int foo2_int(unsigned short *x, unsigned short * restrict y,
|
||||||
|
unsigned short * restrict z) {
|
||||||
|
int sum = 0;
|
||||||
|
for (int i = 0; i < 100; i++)
|
||||||
|
{
|
||||||
|
z[i] = x[i] + y[i];
|
||||||
|
sum += z[i];
|
||||||
|
}
|
||||||
|
return sum;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Reference copy of foo_int: the same loop body and trip count, but the
   loop is marked with #pragma GCC novector so it is not vectorized.  main
   compares its result against foo_int.  */
__attribute__((noipa))
|
||||||
|
int foo_int2(unsigned char *x, unsigned char * restrict y) {
|
||||||
|
int sum = 0;
|
||||||
|
#pragma GCC novector
|
||||||
|
for (int i = 0; i < 100; i++)
|
||||||
|
sum += char_abs(x[i] - y[i]);
|
||||||
|
return sum;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Reference copy of foo2_int: the same loop body and trip count, but the
   loop is marked with #pragma GCC novector so it is not vectorized.  main
   compares both its return value and the contents it writes to Z against
   foo2_int.  */
__attribute__((noipa))
|
||||||
|
int foo2_int2(unsigned short *x, unsigned short * restrict y,
|
||||||
|
unsigned short * restrict z) {
|
||||||
|
int sum = 0;
|
||||||
|
#pragma GCC novector
|
||||||
|
for (int i = 0; i < 100; i++)
|
||||||
|
{
|
||||||
|
z[i] = x[i] + y[i];
|
||||||
|
sum += z[i];
|
||||||
|
}
|
||||||
|
return sum;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Test driver: fill the inputs, run each reduction in both its
   vectorizable and its novector form, and abort on any difference in the
   returned sums or in the arrays written by the halfword variants.
   Returns 0 when everything matches.  */
int main ()
|
||||||
|
{
|
||||||
|
unsigned short a[100];
|
||||||
|
unsigned short b[100];
|
||||||
|
unsigned short r1[100];
|
||||||
|
unsigned short r2[100];
|
||||||
|
unsigned char c[100];
|
||||||
|
unsigned char d[100];
|
||||||
|
/* Initialise the inputs in a loop that is itself kept scalar: a and c hold
   i, b and d hold 100 - i.  */
#pragma GCC novector
|
||||||
|
for (int i = 0; i < 100; i++)
|
||||||
|
{
|
||||||
|
a[i] = c[i] = i;
|
||||||
|
b[i] = d[i] = 100 - i;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Byte reduction: both forms must return the same sum.  */
if (foo_int (c, d) != foo_int2 (c, d))
|
||||||
|
__builtin_abort();
|
||||||
|
|
||||||
|
|
||||||
|
/* Halfword reduction: both forms must return the same sum; r1 and r2
   receive the element-wise results for the check below.  */
if (foo2_int (a, b, r1) != foo2_int2 (a, b, r2))
|
||||||
|
__builtin_abort();
|
||||||
|
|
||||||
|
/* The stored element-wise sums must also agree.  */
#pragma GCC novector
|
||||||
|
for (int i = 0; i < 100; i++)
|
||||||
|
if (r1[i] != r2[i])
|
||||||
|
__builtin_abort ();
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 2 "vect" } } */
|
Loading…
Reference in New Issue