vect: Move slp_perm checking into get_load_store_type.

This moves the setting of slp_perm, the checking of basic-block SLP gaps,
as well as the final perm_ok check into get_load_store_type.
Also, slp_perm itself moves into ls_data (struct vect_load_store_data).

gcc/ChangeLog:

	* tree-vect-stmts.cc (get_load_store_type): Add load-permutation
	checks and setting of slp_perm.
	(vectorizable_store): Remove perm_ok argument.
	(vectorizable_load): Ditto and replace slp_perm by ls.slp_perm.
	* tree-vectorizer.h (struct vect_load_store_data): Add slp_perm.
Author: Robin Dapp
Date:   2025-10-10 18:39:01 +02:00
Commit: aefb2dbb64 (parent: b6e802fd55)

2 files changed, 92 insertions(+), 85 deletions(-)
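In caller terms (a condensed sketch of the hunks below, not verbatim
source), analysis now computes and caches the permutation data once and
the transform/costing phase only reads it back:

	vect_load_store_data _ls_data{};
	vect_load_store_data &ls = slp_node->get_data (_ls_data);
	/* Analysis: get_load_store_type performs the perm_ok and BB-gap
	   checks itself and fills in ls.slp_perm and ls.n_perms.  */
	if (cost_vec
	    && !get_load_store_type (vinfo, stmt_info, vectype, slp_node,
				     mask_node, VLS_LOAD, &ls))
	  return false;
	...
	/* Costing: read the cached values instead of recomputing them.  */
	if (ls.slp_perm)
	  inside_cost += record_stmt_cost (cost_vec, ls.n_perms, vec_perm,
					   slp_node, 0, vect_body);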

diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -2062,16 +2062,13 @@ vector_vector_composition_type (tree vtype, poly_uint64 nelts, tree *ptype,
    VECTYPE is the vector type that the vectorized statements will use.
    If ELSVALS is nonzero the supported else values will be stored in the
-   vector ELSVALS points to.
-   For loads PERM_OK indicates whether we can code generate a
-   SLP_TREE_LOAD_PERMUTATION on the node.  */
+   vector ELSVALS points to.  */
 
 static bool
 get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
 		     tree vectype, slp_tree slp_node,
 		     bool masked_p, vec_load_store_type vls_type,
-		     bool perm_ok, vect_load_store_data *ls)
+		     vect_load_store_data *ls)
 {
   vect_memory_access_type *memory_access_type = &ls->memory_access_type;
   poly_int64 *poffset = &ls->poffset;
@@ -2081,6 +2078,8 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
   internal_fn *lanes_ifn = &ls->lanes_ifn;
   vec<int> *elsvals = &ls->elsvals;
   tree *ls_type = &ls->ls_type;
+  bool *slp_perm = &ls->slp_perm;
+  unsigned *n_perms = &ls->n_perms;
   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
   class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
@@ -2093,6 +2092,15 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
   *misalignment = DR_MISALIGNMENT_UNKNOWN;
   *poffset = 0;
   *ls_type = NULL_TREE;
+  *slp_perm = false;
+  *n_perms = -1U;
+
+  bool perm_ok = true;
+  poly_uint64 vf = loop_vinfo ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) : 1;
+  if (SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
+    perm_ok = vect_transform_slp_perm_load (vinfo, slp_node, vNULL, NULL,
+					    vf, true, n_perms);
+
   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
     {
@@ -2534,7 +2542,7 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
       poly_uint64 read_amount
 	= vf * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
       if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
-	read_amount *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
+	read_amount *= group_size;
 
       auto target_alignment
 	= DR_TARGET_ALIGNMENT (STMT_VINFO_DR_INFO (stmt_info));
@@ -2627,6 +2635,60 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
   if (!loop_vinfo && *memory_access_type == VMAT_ELEMENTWISE)
     return false;
 
+  /* Some loads need to explicitly permute the loaded data if there
+     is a load permutation.  Among those are:
+      - VMAT_ELEMENTWISE.
+      - VMAT_STRIDED_SLP.
+      - VMAT_GATHER_SCATTER:
+	- Strided gather (fallback for VMAT_STRIDED_SLP if #lanes == 1).
+	- Grouped strided gather (ditto but for #lanes > 1).
+     For VMAT_ELEMENTWISE we can fold the load permutation into the
+     individual indices we access directly, eliding the permutation.
+     Strided gather only allows load permutations for the
+     single-element case.  */
+  if (SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()
+      && !(*memory_access_type == VMAT_ELEMENTWISE
+	   || (mat_gather_scatter_p (*memory_access_type)
+	       && SLP_TREE_LANES (slp_node) == 1
+	       && single_element_p)))
+    {
+      if (!loop_vinfo)
+	{
+	  /* In BB vectorization we may not actually use a loaded vector
+	     accessing elements in excess of DR_GROUP_SIZE.  */
+	  stmt_vec_info group_info = SLP_TREE_SCALAR_STMTS (slp_node)[0];
+	  group_info = DR_GROUP_FIRST_ELEMENT (group_info);
+	  unsigned HOST_WIDE_INT nunits;
+	  unsigned j, k, maxk = 0;
+	  FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (slp_node), j, k)
+	    if (k > maxk)
+	      maxk = k;
+	  tree vectype = SLP_TREE_VECTYPE (slp_node);
+	  if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
+	      || maxk >= (DR_GROUP_SIZE (group_info) & ~(nunits - 1)))
+	    {
+	      if (dump_enabled_p ())
+		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+				 "BB vectorization with gaps at the end of "
+				 "a load is not supported\n");
+	      return false;
+	    }
+	}
+
+      if (!perm_ok)
+	{
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_MISSED_OPTIMIZATION,
+			     vect_location,
+			     "unsupported load permutation\n");
+	  return false;
+	}
+
+      *slp_perm = true;
+    }
+
   return true;
 }
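(A worked example for the BB gap check above, with made-up numbers: for
DR_GROUP_SIZE == 7 and nunits == 4 the boundary is 7 & ~3 == 4, i.e. the
group size rounded down to a whole number of vectors.  Any permutation
index k >= 4 would require the second vector, which covers lanes 4..7 and
hence element 7 beyond the 7-element group, so analysis fails.)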
@@ -8009,7 +8071,7 @@ vectorizable_store (vec_info *vinfo,
   vect_load_store_data &ls = slp_node->get_data (_ls_data);
   if (cost_vec
       && !get_load_store_type (vinfo, stmt_info, vectype, slp_node, mask_node,
-			       vls_type, false, &_ls_data))
+			       vls_type, &_ls_data))
     return false;
 
   /* Temporary aliases to analysis data, should not be modified through
      these.  */
@@ -9454,7 +9516,6 @@ vectorizable_load (vec_info *vinfo,
   bool compute_in_loop = false;
   class loop *at_loop;
   int vec_num;
-  bool slp_perm = false;
   bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
   poly_uint64 vf;
   tree aggr_type;
@@ -9592,17 +9653,11 @@ vectorizable_load (vec_info *vinfo,
   else
     group_size = 1;
 
-  bool perm_ok = true;
-  unsigned n_perms = -1U;
-  if (cost_vec && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
-    perm_ok = vect_transform_slp_perm_load (vinfo, slp_node, vNULL, NULL, vf,
-					    true, &n_perms);
-
   vect_load_store_data _ls_data{};
   vect_load_store_data &ls = slp_node->get_data (_ls_data);
   if (cost_vec
       && !get_load_store_type (vinfo, stmt_info, vectype, slp_node, mask_node,
-			       VLS_LOAD, perm_ok, &ls))
+			       VLS_LOAD, &ls))
     return false;
 
   /* Temporary aliases to analysis data, should not be modified through
      these.  */
@@ -9623,56 +9678,6 @@ vectorizable_load (vec_info *vinfo,
   bool type_mode_padding_p
     = TYPE_PRECISION (scalar_type) < GET_MODE_PRECISION (GET_MODE_INNER (mode));
 
-  /* ??? The following checks should really be part of
-     get_load_store_type.  */
-  if (SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()
-      && !(memory_access_type == VMAT_ELEMENTWISE
-	   || (mat_gather_scatter_p (memory_access_type)
-	       && SLP_TREE_LANES (slp_node) == 1
-	       && (!grouped_load
-		   || !DR_GROUP_NEXT_ELEMENT (first_stmt_info)))))
-    {
-      slp_perm = true;
-
-      if (!loop_vinfo && cost_vec)
-	{
-	  /* In BB vectorization we may not actually use a loaded vector
-	     accessing elements in excess of DR_GROUP_SIZE.  */
-	  stmt_vec_info group_info = SLP_TREE_SCALAR_STMTS (slp_node)[0];
-	  group_info = DR_GROUP_FIRST_ELEMENT (group_info);
-	  unsigned HOST_WIDE_INT nunits;
-	  unsigned j, k, maxk = 0;
-	  FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (slp_node), j, k)
-	    if (k > maxk)
-	      maxk = k;
-	  tree vectype = SLP_TREE_VECTYPE (slp_node);
-	  if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
-	      || maxk >= (DR_GROUP_SIZE (group_info) & ~(nunits - 1)))
-	    {
-	      if (dump_enabled_p ())
-		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-				 "BB vectorization with gaps at the end of "
-				 "a load is not supported\n");
-	      return false;
-	    }
-	}
-
-      if (cost_vec)
-	{
-	  if (!perm_ok)
-	    {
-	      if (dump_enabled_p ())
-		dump_printf_loc (MSG_MISSED_OPTIMIZATION,
-				 vect_location,
-				 "unsupported load permutation\n");
-	      return false;
-	    }
-	  ls.n_perms = n_perms;
-	}
-      else
-	n_perms = ls.n_perms;
-    }
-
   if (slp_node->ldst_lanes
       && memory_access_type != VMAT_LOAD_STORE_LANES)
     {
@@ -10027,7 +10032,7 @@ vectorizable_load (vec_info *vinfo,
 	 not only the number of vector stmts the permutation result
 	 fits in.  */
       int ncopies;
-      if (slp_perm)
+      if (ls.slp_perm)
 	{
 	  gcc_assert (memory_access_type != VMAT_ELEMENTWISE);
 	  /* We don't yet generate SLP_TREE_LOAD_PERMUTATIONs for
@@ -10135,18 +10140,18 @@ vectorizable_load (vec_info *vinfo,
 	  if (!costing_p)
 	    {
-	      if (slp_perm)
+	      if (ls.slp_perm)
 		dr_chain.quick_push (gimple_assign_lhs (new_stmt));
 	      else
 		slp_node->push_vec_def (new_stmt);
 	    }
 	}
 
-      if (slp_perm)
+      if (ls.slp_perm)
 	{
 	  if (costing_p)
 	    {
-	      gcc_assert (n_perms != -1U);
-	      inside_cost += record_stmt_cost (cost_vec, n_perms, vec_perm,
+	      gcc_assert (ls.n_perms != -1U);
+	      inside_cost += record_stmt_cost (cost_vec, ls.n_perms, vec_perm,
					       slp_node, 0, vect_body);
 	    }
 	  else
@@ -10154,7 +10159,7 @@ vectorizable_load (vec_info *vinfo,
 	      unsigned n_perms2;
 	      vect_transform_slp_perm_load (vinfo, slp_node, dr_chain, gsi, vf,
 					    false, &n_perms2);
-	      gcc_assert (n_perms == n_perms2);
+	      gcc_assert (ls.n_perms == n_perms2);
 	    }
 	}
 
@@ -10219,7 +10224,7 @@ vectorizable_load (vec_info *vinfo,
 	     instead the access is contiguous but it might be
 	     permuted.  No gap adjustment is needed though.  */
 	  ;
-	  else if (slp_perm
+	  else if (ls.slp_perm
 		   && (group_size != scalar_lanes
 		       || !multiple_p (nunits, group_size)))
 	    {
@@ -10568,7 +10573,7 @@ vectorizable_load (vec_info *vinfo,
 
   if (mat_gather_scatter_p (memory_access_type))
     {
-      gcc_assert ((!grouped_load && !slp_perm) || ls.ls_type);
+      gcc_assert ((!grouped_load && !ls.slp_perm) || ls.ls_type);
 
       /* If we pun the original vectype the loads as well as costing, length,
	  etc. is performed with the new type.  After loading we VIEW_CONVERT
@@ -10930,14 +10935,14 @@ vectorizable_load (vec_info *vinfo,
 	  /* Store vector loads in the corresponding SLP_NODE.  */
 	  if (!costing_p)
 	    {
-	      if (slp_perm)
+	      if (ls.slp_perm)
 		dr_chain.quick_push (gimple_assign_lhs (new_stmt));
 	      else
 		slp_node->push_vec_def (new_stmt);
 	    }
 	}
 
-      if (slp_perm)
+      if (ls.slp_perm)
 	{
 	  if (costing_p)
 	    {
@@ -11034,7 +11039,7 @@ vectorizable_load (vec_info *vinfo,
					 stmt_info, bump);
 	}
 
-      if (grouped_load || slp_perm)
+      if (grouped_load || ls.slp_perm)
 	dr_chain.create (vec_num);
 
       gimple *new_stmt = NULL;
@@ -11531,11 +11536,11 @@ vectorizable_load (vec_info *vinfo,
 
 	  /* Collect vector loads and later create their permutation in
 	     vect_transform_slp_perm_load.  */
-	  if (!costing_p && (grouped_load || slp_perm))
+	  if (!costing_p && (grouped_load || ls.slp_perm))
 	    dr_chain.quick_push (new_temp);
 
 	  /* Store vector loads in the corresponding SLP_NODE.  */
-	  if (!costing_p && !slp_perm)
+	  if (!costing_p && !ls.slp_perm)
 	    slp_node->push_vec_def (new_stmt);
 
 	  /* With SLP permutation we load the gaps as well, without
@@ -11544,7 +11549,7 @@ vectorizable_load (vec_info *vinfo,
 	  group_elt += nunits;
 	  if (!costing_p
 	      && maybe_ne (group_gap_adj, 0U)
-	      && !slp_perm
+	      && !ls.slp_perm
 	      && known_eq (group_elt, group_size - group_gap_adj))
 	    {
 	      poly_wide_int bump_val
@@ -11561,7 +11566,7 @@ vectorizable_load (vec_info *vinfo,
	 elements loaded for a permuted SLP load.  */
       if (!costing_p
	  && maybe_ne (group_gap_adj, 0U)
-	  && slp_perm)
+	  && ls.slp_perm)
 	{
 	  poly_wide_int bump_val
 	    = (wi::to_wide (TYPE_SIZE_UNIT (elem_type)) * group_gap_adj);
@@ -11572,7 +11577,7 @@ vectorizable_load (vec_info *vinfo,
				     stmt_info, bump);
 	}
 
-      if (slp_perm)
+      if (ls.slp_perm)
 	{
 	  /* For SLP we know we've seen all possible uses of dr_chain so
 	     direct vect_transform_slp_perm_load to DCE the unused parts.
@@ -11580,9 +11585,9 @@ vectorizable_load (vec_info *vinfo,
 	     in PR101120 and friends.  */
 	  if (costing_p)
 	    {
-	      gcc_assert (n_perms != -1U);
-	      if (n_perms != 0)
-		inside_cost = record_stmt_cost (cost_vec, n_perms, vec_perm,
+	      gcc_assert (ls.n_perms != -1U);
+	      if (ls.n_perms != 0)
+		inside_cost = record_stmt_cost (cost_vec, ls.n_perms, vec_perm,
						slp_node, 0, vect_body);
 	    }
 	  else
@@ -11591,7 +11596,7 @@ vectorizable_load (vec_info *vinfo,
 	      bool ok = vect_transform_slp_perm_load (vinfo, slp_node, dr_chain,
						      gsi, vf, false, &n_perms2,
						      nullptr, true);
-	      gcc_assert (ok && n_perms == n_perms2);
+	      gcc_assert (ok && ls.n_perms == n_perms2);
 	    }
 
 	  dr_chain.release ();
 	}

diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -290,6 +290,8 @@ struct vect_load_store_data : vect_data {
   tree strided_offset_vectype; // VMAT_GATHER_SCATTER_IFN, originally strided
   tree ls_type; // VMAT_GATHER_SCATTER_IFN
   auto_vec<int> elsvals;
+  /* True if the load requires a load permutation.  */
+  bool slp_perm; // SLP_TREE_LOAD_PERMUTATION
   unsigned n_perms; // SLP_TREE_LOAD_PERMUTATION
 };
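As a reading aid (hand-written illustration, not part of the patch): the
-1U sentinel that get_load_store_type stores into n_perms ties the two new
fields together, so the transform-time asserts can detect a skipped or
failed analysis:

	/* Whenever a permutation must be code generated, analysis has
	   already computed how many vec_perm stmts it costs.  */
	if (ls.slp_perm)
	  gcc_assert (ls.n_perms != -1U);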