AArch64: Implement widen_[us]sum using [US]ADDW[TB] for SVE2 [PR122069]

SVE2 adds [US]ADDW[TB], which we can use when we need to do a single-step
widening addition.  This is useful, for instance, when the value to be widened
does not come from a load.  For example, for

int foo2_int(unsigned short *x, unsigned short * restrict y) {
  int sum = 0;
  for (int i = 0; i < 8000; i++)
    {
      x[i] = x[i] + y[i];
      sum += x[i];
    }
  return sum;
}

we used to generate

.L6:
        ld1h    z1.h, p7/z, [x0, x2, lsl 1]
        ld1h    z29.h, p7/z, [x1, x2, lsl 1]
        add     z29.h, z29.h, z1.h
        punpklo p6.h, p7.b
        uunpklo z0.s, z29.h
        add     z31.s, p6/m, z31.s, z0.s
        punpkhi p6.h, p7.b
        uunpkhi z30.s, z29.h
        add     z31.s, p6/m, z31.s, z30.s
        st1h    z29.h, p7, [x0, x2, lsl 1]
        add     x2, x2, x4
        whilelo p7.h, w2, w3
        b.any   .L6
        ptrue   p7.b, all
        uaddv   d31, p7, z31.s

but with +sve2 we now generate

.L12:
        ld1h    z30.h, p7/z, [x0, x2, lsl 1]
        ld1h    z29.h, p7/z, [x1, x2, lsl 1]
        add     z30.h, z30.h, z29.h
        uaddwb  z31.s, z31.s, z30.h
        uaddwt  z31.s, z31.s, z30.h
        st1h    z30.h, p7, [x0, x2, lsl 1]
        mov     x3, x2
        inch    x2
        cmp     w2, w4
        bls     .L12
        inch    x3
        uaddv   d31, p7, z31.s
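
For reference, UADDWB adds the even-numbered ("bottom") 16-bit lanes of the
narrow vector, widened, into the 32-bit accumulator, and UADDWT does the same
for the odd-numbered ("top") lanes, so the pair accumulates every narrow
element in two unpredicated instructions.  A rough scalar model of one such
step (the four-lane width and the function names are illustrative only; real
SVE vectors are scalable):

/* Illustrative scalar model of UADDWB/UADDWT; not part of the patch.  */
#define WLANES 4	/* hypothetical number of 32-bit lanes per vector */

void uaddwb_model (unsigned int acc[WLANES],
		   const unsigned short narrow[2 * WLANES])
{
  for (int i = 0; i < WLANES; i++)
    acc[i] += narrow[2 * i];		/* bottom (even) narrow lanes */
}

void uaddwt_model (unsigned int acc[WLANES],
		   const unsigned short narrow[2 * WLANES])
{
  for (int i = 0; i < WLANES; i++)
    acc[i] += narrow[2 * i + 1];	/* top (odd) narrow lanes */
}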

gcc/ChangeLog:

	PR middle-end/122069
	* config/aarch64/aarch64-sve2.md (widen_ssum<mode><Vnarrow>3): New.
	(widen_usum<mode><Vnarrow>3): New.
	* config/aarch64/iterators.md (Vnarrow): New, to match VNARROW.

gcc/testsuite/ChangeLog:

	PR middle-end/122069
	* gcc.target/aarch64/sve2/pr122069_1.c: New test.
	* gcc.target/aarch64/sve2/pr122069_2.c: New test.
Author: Tamar Christina
Date:   2025-10-18 08:22:18 +01:00
Commit: 25c8a8d431 (parent 2f719014bf)
4 changed files, 157 insertions(+), 0 deletions(-)

gcc/config/aarch64/aarch64-sve2.md
@@ -2377,6 +2377,36 @@
  [(set_attr "sve_type" "sve_int_general")]
)

;; Define single step widening for widen_ssum using SADDWB and SADDWT
(define_expand "widen_ssum<mode><Vnarrow>3"
  [(set (match_operand:SVE_FULL_HSDI 0 "register_operand")
	(unspec:SVE_FULL_HSDI
	  [(match_operand:SVE_FULL_HSDI 2 "register_operand")
	   (match_operand:<VNARROW> 1 "register_operand")]
	  UNSPEC_SADDWB))
   (set (match_dup 0)
	(unspec:SVE_FULL_HSDI
	  [(match_dup 0)
	   (match_dup 1)]
	  UNSPEC_SADDWT))]
  "TARGET_SVE2"
)

;; Define single step widening for widen_usum using UADDWB and UADDWT
(define_expand "widen_usum<mode><Vnarrow>3"
  [(set (match_operand:SVE_FULL_HSDI 0 "register_operand" "=w")
	(unspec:SVE_FULL_HSDI
	  [(match_operand:SVE_FULL_HSDI 2 "register_operand" "w")
	   (match_operand:<VNARROW> 1 "register_operand" "w")]
	  UNSPEC_UADDWB))
   (set (match_dup 0)
	(unspec:SVE_FULL_HSDI
	  [(match_dup 0)
	   (match_dup 1)]
	  UNSPEC_UADDWT))]
  "TARGET_SVE2"
)

;; -------------------------------------------------------------------------
;; ---- [INT] Long binary arithmetic
;; -------------------------------------------------------------------------
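
For reference, widen_ssum<mode><Vnarrow>3 / widen_usum<mode><Vnarrow>3 use the
documented widen_[us]sum optab operand order: operand 1 is the narrow input
vector, while operands 2 and 0 are the wide accumulator and the wide result.
The two sets above fold the bottom lanes and then the top lanes of operand 1
into the same destination register, which is why the second set takes
(match_dup 0) as its input.  A minimal scalar sketch of that contract (the
lane count and names are illustrative, not part of the patch):

/* Hypothetical scalar model of what the two-instruction expansion computes.  */
#define NLANES 8	/* illustrative number of narrow (16-bit) lanes */

void widen_usum_model (unsigned int res[NLANES / 2],
		       const unsigned short narrow[NLANES],
		       const unsigned int acc[NLANES / 2])
{
  for (int i = 0; i < NLANES / 2; i++)
    /* UADDWB contributes narrow[2*i], UADDWT contributes narrow[2*i + 1].  */
    res[i] = acc[i] + narrow[2 * i] + narrow[2 * i + 1];
}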

gcc/config/aarch64/iterators.md
@@ -1935,6 +1935,11 @@
(VNx2DI "VNx4SI") (VNx2DF "VNx4SF")
(VNx8SI "VNx8HI") (VNx16SI "VNx16QI")
(VNx8DI "VNx8HI")])
(define_mode_attr Vnarrow [(VNx8HI "vnx16qi")
(VNx4SI "vnx8hi") (VNx4SF "vnx8hf")
(VNx2DI "vnx4si") (VNx2DF "vnx4sf")
(VNx8SI "vnx8hi") (VNx16SI "vnx16qi")
(VNx8DI "vnx8hi")])
;; Suffix mapping Advanced SIMD modes to be expanded as SVE instructions.
(define_mode_attr sve_di_suf [(VNx16QI "") (VNx8HI "") (VNx4SI "") (VNx2DI "")

gcc/testsuite/gcc.target/aarch64/sve2/pr122069_1.c (new file)
@@ -0,0 +1,41 @@
/* { dg-do compile } */
/* { dg-options "-O3 -march=armv8-a+sve2 -mautovec-preference=sve-only --param vect-epilogues-nomask=0 -fno-schedule-insns -fno-reorder-blocks -fno-schedule-insns2 -fdump-tree-vect-details" } */
/* { dg-final { check-function-bodies "**" "" } } */

inline char char_abs(char i) {
  return (i < 0 ? -i : i);
}

/*
** foo_int:
** ...
** sub z[0-9]+.b, z[0-9]+.b, z[0-9]+.b
** udot z[0-9]+.s, z[0-9]+.b, z[0-9]+.b
** ...
*/
int foo_int(unsigned char *x, unsigned char * restrict y) {
  int sum = 0;
  for (int i = 0; i < 8000; i++)
    sum += char_abs(x[i] - y[i]);
  return sum;
}

/*
** foo2_int:
** ...
** add z[0-9]+.h, z[0-9]+.h, z[0-9]+.h
** uaddwb z[0-9]+.s, z[0-9]+.s, z[0-9]+.h
** uaddwt z[0-9]+.s, z[0-9]+.s, z[0-9]+.h
** ...
*/
int foo2_int(unsigned short *x, unsigned short * restrict y) {
  int sum = 0;
  for (int i = 0; i < 8000; i++)
    {
      x[i] = x[i] + y[i];
      sum += x[i];
    }
  return sum;
}

/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 2 "vect" } } */

gcc/testsuite/gcc.target/aarch64/sve2/pr122069_2.c (new file)
@@ -0,0 +1,81 @@
/* { dg-do run } */
/* { dg-require-effective-target aarch64_sve2_hw } */
/* { dg-options "-O3 -march=armv8-a+sve2 -mautovec-preference=sve-only -fdump-tree-vect-details" } */

inline char char_abs(char i) {
  return (i < 0 ? -i : i);
}

__attribute__((noipa))
int foo_int(unsigned char *x, unsigned char * restrict y) {
  int sum = 0;
  for (int i = 0; i < 100; i++)
    sum += char_abs(x[i] - y[i]);
  return sum;
}

__attribute__((noipa))
int foo2_int(unsigned short *x, unsigned short * restrict y,
	     unsigned short * restrict z) {
  int sum = 0;
  for (int i = 0; i < 100; i++)
    {
      z[i] = x[i] + y[i];
      sum += z[i];
    }
  return sum;
}

__attribute__((noipa))
int foo_int2(unsigned char *x, unsigned char * restrict y) {
  int sum = 0;
#pragma GCC novector
  for (int i = 0; i < 100; i++)
    sum += char_abs(x[i] - y[i]);
  return sum;
}

__attribute__((noipa))
int foo2_int2(unsigned short *x, unsigned short * restrict y,
	      unsigned short * restrict z) {
  int sum = 0;
#pragma GCC novector
  for (int i = 0; i < 100; i++)
    {
      z[i] = x[i] + y[i];
      sum += z[i];
    }
  return sum;
}

int main ()
{
  unsigned short a[100];
  unsigned short b[100];
  unsigned short r1[100];
  unsigned short r2[100];
  unsigned char c[100];
  unsigned char d[100];

#pragma GCC novector
  for (int i = 0; i < 100; i++)
    {
      a[i] = c[i] = i;
      b[i] = d[i] = 100 - i;
    }

  if (foo_int (c, d) != foo_int2 (c, d))
    __builtin_abort();

  if (foo2_int (a, b, r1) != foo2_int2 (a, b, r2))
    __builtin_abort();

#pragma GCC novector
  for (int i = 0; i < 100; i++)
    if (r1[i] != r2[i])
      __builtin_abort ();

  return 0;
}

/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 2 "vect" } } */