tree-optimization/121256 - properly support SLP in vectorizable recurrence

We failed to build the correct initialization vector.  For VLA
vectors and a non-uniform initialization vector this rejects
vectorization for now.

	PR tree-optimization/121256
	* tree-vect-loop.cc (vectorizable_recurr): Build a correct
	initialization vector for SLP_TREE_LANES > 1.

	* gcc.dg/vect/vect-recurr-pr121256.c: New testcase.
	* gcc.dg/vect/vect-recurr-pr121256-2.c: Likewise.

(cherry picked from commit 26dc9aa285)
This commit is contained in:
Richard Biener 2025-07-27 18:42:25 +02:00
parent 234f122a9b
commit 97ec33fed5
3 changed files with 158 additions and 8 deletions

View File

@ -0,0 +1,49 @@
/* { dg-additional-options "-mavx2" { target avx2_runtime } } */
#include "tree-vect.h"
#define B 0
#define G 1
#define R 2
int red = 153;
int green = 66;
int blue = 187;
static void __attribute__((noipa))
sub_left_prediction_bgr32(int *restrict dst, int *restrict src)
{
for (int i = 0; i < 8; i++) {
int rt = src[i * 3 + R];
int gt = src[i * 3 + G];
int bt = src[i * 3 + B];
dst[i * 3 + R] = rt - red;
dst[i * 3 + G] = gt - green;
dst[i * 3 + B] = bt - blue;
red = rt;
green = gt;
blue = bt;
}
}
int main()
{
int dst[8*3];
int src[8*3] = { 160, 73, 194, 17, 33, 99, 0, 12, 283, 87, 73, 11,
9, 7, 1, 23, 19, 13, 77, 233, 97, 78, 2, 5 };
int dst2[8*3] = {-27, 7, 41, -143, -40, -95, -17, -21, 184, 87, 61,
-272, -78, -66, -10, 14, 12, 12, 54, 214, 84, 1, -231, -92};
check_vect ();
sub_left_prediction_bgr32(dst, src);
#pragma GCC novector
for (int i = 0; i < 8*3; ++i)
if (dst[i] != dst2[i])
__builtin_abort();
return 0;
}

View File

@ -0,0 +1,54 @@
/* { dg-additional-options "-mavx2" { target avx2_runtime } } */
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include "tree-vect.h"
#define B 0
#define G 1
#define R 2
#define A 3
int red = 153;
int green = 66;
int blue = 187;
int alpha = 255;
static void __attribute__((noipa))
sub_left_prediction_bgr32(uint8_t *restrict dst, uint8_t *restrict src, int w)
{
for (int i = 0; i < 8; i++) {
int rt = src[i * 4 + R];
int gt = src[i * 4 + G];
int bt = src[i * 4 + B];
int at = src[i * 4 + A];
dst[i * 4 + R] = rt - red;
dst[i * 4 + G] = gt - green;
dst[i * 4 + B] = bt - blue;
dst[i * 4 + A] = at - alpha;
red = rt;
green = gt;
blue = bt;
alpha = at;
}
}
int main()
{
check_vect ();
uint8_t *dst = calloc(36, sizeof(uint8_t));
uint8_t *src = calloc(36, sizeof(uint8_t));
src[R] = 160;
src[G] = 73;
src[B] = 194;
src[A] = 255;
sub_left_prediction_bgr32(dst, src, 33);
if (dst[R] != 7 || dst[B] != 7 || dst[A] != 0)
__builtin_abort();
}

View File

@ -9206,6 +9206,36 @@ vectorizable_recurr (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
return false;
}
/* We need to be able to build a { ..., a, b } init vector with
dist number of distinct trailing values. Always possible
when dist == 1 or when nunits is constant or when the initializations
are uniform. */
tree uniform_initval = NULL_TREE;
edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
if (slp_node)
for (stmt_vec_info s : SLP_TREE_SCALAR_STMTS (slp_node))
{
gphi *phi = as_a <gphi *> (s->stmt);
if (! uniform_initval)
uniform_initval = PHI_ARG_DEF_FROM_EDGE (phi, pe);
else if (! operand_equal_p (uniform_initval,
PHI_ARG_DEF_FROM_EDGE (phi, pe)))
{
uniform_initval = NULL_TREE;
break;
}
}
else
uniform_initval = PHI_ARG_DEF_FROM_EDGE (phi, pe);
if (!uniform_initval && !nunits.is_constant ())
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
"cannot build initialization vector for "
"first order recurrence\n");
return false;
}
/* First-order recurrence autovectorization needs to handle permutation
with indices = [nunits-1, nunits, nunits+1, ...]. */
vec_perm_builder sel (nunits, 1, 3);
@ -9274,21 +9304,38 @@ vectorizable_recurr (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
return true;
}
edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
basic_block bb = gimple_bb (phi);
tree preheader = PHI_ARG_DEF_FROM_EDGE (phi, pe);
if (!useless_type_conversion_p (TREE_TYPE (vectype), TREE_TYPE (preheader)))
tree vec_init;
if (! uniform_initval)
{
gimple_seq stmts = NULL;
preheader = gimple_convert (&stmts, TREE_TYPE (vectype), preheader);
gsi_insert_seq_on_edge_immediate (pe, stmts);
vec<constructor_elt, va_gc> *v = NULL;
vec_alloc (v, nunits.to_constant ());
for (unsigned i = 0; i < nunits.to_constant () - dist; ++i)
CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
build_zero_cst (TREE_TYPE (vectype)));
for (stmt_vec_info s : SLP_TREE_SCALAR_STMTS (slp_node))
{
gphi *phi = as_a <gphi *> (s->stmt);
tree preheader = PHI_ARG_DEF_FROM_EDGE (phi, pe);
if (!useless_type_conversion_p (TREE_TYPE (vectype),
TREE_TYPE (preheader)))
{
gimple_seq stmts = NULL;
preheader = gimple_convert (&stmts,
TREE_TYPE (vectype), preheader);
gsi_insert_seq_on_edge_immediate (pe, stmts);
}
CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, preheader);
}
vec_init = build_constructor (vectype, v);
}
tree vec_init = build_vector_from_val (vectype, preheader);
else
vec_init = uniform_initval;
vec_init = vect_init_vector (loop_vinfo, stmt_info, vec_init, vectype, NULL);
/* Create the vectorized first-order PHI node. */
tree vec_dest = vect_get_new_vect_var (vectype,
vect_simple_var, "vec_recur_");
basic_block bb = gimple_bb (phi);
gphi *new_phi = create_phi_node (vec_dest, bb);
add_phi_arg (new_phi, vec_init, pe, UNKNOWN_LOCATION);