[aarch64] Improve code-gen for vector initialization with single constant element.

gcc/ChangeLog:
	* config/aarch64/aarch64.cc (aarch64_expand_vector_init_fallback): Tweak
	condition if (n_var == n_elts && n_elts <= 16) to allow a single
	constant, and if maxv == 1, use the constant element for duplicating
	into the register.

gcc/testsuite/ChangeLog:
	* gcc.target/aarch64/vec-init-single-const.c: New test.
	* gcc.target/aarch64/vec-init-single-const-be.c: Likewise.
	* gcc.target/aarch64/vec-init-single-const-2.c: Likewise.
This commit is contained in:
Prathamesh Kulkarni 2023-06-12 23:14:40 +05:30
parent 38944ec2a6
commit 9eb757d117
4 changed files with 176 additions and 8 deletions

View File

@ -22254,7 +22254,7 @@ aarch64_expand_vector_init_fallback (rtx target, rtx vals)
and matches[X][1] with the count of duplicate elements (if X is the and matches[X][1] with the count of duplicate elements (if X is the
earliest element which has duplicates). */ earliest element which has duplicates). */
if (n_var == n_elts && n_elts <= 16) if (n_var >= n_elts - 1 && n_elts <= 16)
{ {
int matches[16][2] = {0}; int matches[16][2] = {0};
for (int i = 0; i < n_elts; i++) for (int i = 0; i < n_elts; i++)
@ -22271,12 +22271,23 @@ aarch64_expand_vector_init_fallback (rtx target, rtx vals)
} }
int maxelement = 0; int maxelement = 0;
int maxv = 0; int maxv = 0;
rtx const_elem = NULL_RTX;
int const_elem_pos = 0;
for (int i = 0; i < n_elts; i++) for (int i = 0; i < n_elts; i++)
if (matches[i][1] > maxv) {
{ if (matches[i][1] > maxv)
maxelement = i; {
maxv = matches[i][1]; maxelement = i;
} maxv = matches[i][1];
}
if (CONST_INT_P (XVECEXP (vals, 0, i))
|| CONST_DOUBLE_P (XVECEXP (vals, 0, i)))
{
const_elem_pos = i;
const_elem = XVECEXP (vals, 0, i);
}
}
/* Create a duplicate of the most common element, unless all elements /* Create a duplicate of the most common element, unless all elements
are equally useless to us, in which case just immediately set the are equally useless to us, in which case just immediately set the
@ -22314,8 +22325,19 @@ aarch64_expand_vector_init_fallback (rtx target, rtx vals)
vector register. For big-endian we want that position to hold vector register. For big-endian we want that position to hold
the last element of VALS. */ the last element of VALS. */
maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0; maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
rtx x = force_reg (inner_mode, XVECEXP (vals, 0, maxelement));
aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode)); /* If we have a single constant element, use that for duplicating
instead. */
if (const_elem)
{
maxelement = const_elem_pos;
aarch64_emit_move (target, gen_vec_duplicate (mode, const_elem));
}
else
{
rtx x = force_reg (inner_mode, XVECEXP (vals, 0, maxelement));
aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
}
} }
else else
{ {

View File

@ -0,0 +1,30 @@
/* { dg-do compile } */
/* { dg-options "-O2" } */
#include <arm_neon.h>
/* In the case where there are no duplicate elements in the vector
   initializer, check that the constant is used for duplication.  */
/* Sixteen-element initializer: the first fifteen elements each come from a
   different argument and the last element is the literal 1, so no element
   value is repeated.  The scan-assembler directives at the end of this file
   look for a movi of 0x1 into an 8b register.  */
int8x16_t f_s8(int8_t a0, int8_t a1, int8_t a2, int8_t a3, int8_t a4,
int8_t a5, int8_t a6, int8_t a7, int8_t a8, int8_t a9,
int8_t a10, int8_t a11, int8_t a12, int8_t a13, int8_t a14)
{
return (int8x16_t) { a0, a1, a2, a3, a4, a5, a6, a7,
a8, a9, a10, a11, a12, a13, a14, 1 };
}
/* Eight-element initializer: seven distinct arguments followed by the
   literal 1.  The scan-assembler directives at the end of this file look
   for a movi of 0x1 into a 4h register.  */
int16x8_t f_s16(int16_t a0, int16_t a1, int16_t a2, int16_t a3, int16_t a4,
int16_t a5, int16_t a6)
{
return (int16x8_t) { a0, a1, a2, a3, a4, a5, a6, 1 };
}
/* Four-element initializer: three distinct arguments followed by the
   literal 1.  The scan-assembler directives at the end of this file look
   for a movi of 0x1 into a 2s register.  */
int32x4_t f_s32(int32_t a0, int32_t a1, int32_t a2)
{
return (int32x4_t) { a0, a1, a2, 1 };
}
/* { dg-final { scan-assembler {\tmovi\tv[0-9]+\.8b, 0x1} } } */
/* { dg-final { scan-assembler {\tmovi\tv[0-9]+\.4h, 0x1} } } */
/* { dg-final { scan-assembler {\tmovi\tv[0-9]+\.2s, 0x1} } } */

View File

@ -0,0 +1,58 @@
/* { dg-do compile } */
/* { dg-options "-O2 -fno-schedule-insns -fno-schedule-insns2" } */
/* { dg-final { check-function-bodies "**" "" "" { target { be } } } } */
#include <arm_neon.h>
/*
** f_s8:
** dup v0.16b, w0
** movi (v[0-9]+)\.8b, 0x1
** ins v0.b\[0\], \1\.b\[0\]
** ret
*/
/* Fifteen copies of X followed by the literal 1.  The expected body above
   (checked on big-endian targets only) is a dup of w0 into all lanes, a
   movi of 0x1, and an ins of that constant into b[0].  */
int8x16_t f_s8(int8_t x)
{
return (int8x16_t) { x, x, x, x, x, x, x, x,
x, x, x, x, x, x, x, 1 };
}
/*
** f_s16:
** dup v0.8h, w0
** movi (v[0-9]+)\.4h, 0x1
** ins v0.h\[0\], \1\.h\[0\]
** ret
*/
/* Seven copies of X followed by the literal 1.  The expected body above
   (checked on big-endian targets only) is a dup of w0 into all lanes, a
   movi of 0x1, and an ins of that constant into h[0].  */
int16x8_t f_s16(int16_t x)
{
return (int16x8_t) { x, x, x, x, x, x, x, 1 };
}
/*
** f_s32:
** dup v0.4s, w0
** movi (v[0-9]+)\.2s, 0x1
** ins v0.s\[0\], \1\.s\[0\]
** ret
*/
/* Three copies of X followed by the literal 1.  The expected body above
   (checked on big-endian targets only) is a dup of w0 into all lanes, a
   movi of 0x1, and an ins of that constant into s[0].  */
int32x4_t f_s32(int32_t x)
{
return (int32x4_t) { x, x, x, 1 };
}
/*
** f_s64:
** adrp x[0-9]+, .LC[0-9]+
** ldr q0, \[x[0-9]+, #:lo12:.LC[0-9]+\]
** ins v0\.d\[1\], x0
** ret
*/
/* Two-element initializer { x, 1 }.  The expected body above (checked on
   big-endian targets only) loads q0 through adrp/ldr from a .LC label
   (presumably a constant-pool entry holding the vector) and then inserts
   x0 into d[1].  */
int64x2_t f_s64(int64_t x)
{
return (int64x2_t) { x, 1 };
}

View File

@ -0,0 +1,58 @@
/* { dg-do compile } */
/* { dg-options "-O2 -fno-schedule-insns -fno-schedule-insns2" } */
/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
#include <arm_neon.h>
/*
** f_s8:
** dup v0.16b, w0
** movi (v[0-9]+)\.8b, 0x1
** ins v0.b\[15\], \1\.b\[0\]
** ret
*/
/* Fifteen copies of X followed by the literal 1.  The expected body above
   (checked on little-endian targets only) is a dup of w0 into all lanes, a
   movi of 0x1, and an ins of that constant into b[15].  */
int8x16_t f_s8(int8_t x)
{
return (int8x16_t) { x, x, x, x, x, x, x, x,
x, x, x, x, x, x, x, 1 };
}
/*
** f_s16:
** dup v0.8h, w0
** movi (v[0-9]+)\.4h, 0x1
** ins v0.h\[7\], \1\.h\[0\]
** ret
*/
/* Seven copies of X followed by the literal 1.  The expected body above
   (checked on little-endian targets only) is a dup of w0 into all lanes, a
   movi of 0x1, and an ins of that constant into h[7].  */
int16x8_t f_s16(int16_t x)
{
return (int16x8_t) { x, x, x, x, x, x, x, 1 };
}
/*
** f_s32:
** dup v0.4s, w0
** movi (v[0-9]+)\.2s, 0x1
** ins v0.s\[3\], \1\.s\[0\]
** ret
*/
/* Three copies of X followed by the literal 1.  The expected body above
   (checked on little-endian targets only) is a dup of w0 into all lanes, a
   movi of 0x1, and an ins of that constant into s[3].  */
int32x4_t f_s32(int32_t x)
{
return (int32x4_t) { x, x, x, 1 };
}
/*
** f_s64:
** adrp x[0-9]+, .LC[0-9]+
** ldr q0, \[x[0-9]+, #:lo12:.LC[0-9]+\]
** ins v0\.d\[0\], x0
** ret
*/
/* Two-element initializer { x, 1 }.  The expected body above (checked on
   little-endian targets only) loads q0 through adrp/ldr from a .LC label
   (presumably a constant-pool entry holding the vector) and then inserts
   x0 into d[0].  */
int64x2_t f_s64(int64_t x)
{
return (int64x2_t) { x, 1 };
}