Avoid unnecessary peeling for gaps with LD3
vectorizable_load forces peeling for gaps if the vectorisation factor
is not a multiple of the group size, since in that case we'd normally load
beyond the original scalar accesses but drop the excess elements as part
of a following permute:
      if (loop_vinfo
          && ! STMT_VINFO_STRIDED_P (stmt_info)
          && (GROUP_GAP (vinfo_for_stmt (first_stmt)) != 0
              || (!slp && vf % GROUP_SIZE (vinfo_for_stmt (first_stmt)) != 0)))
This isn't necessary for LOAD_LANES though, since it loads only the
data needed and does the permute itself.
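For illustration, here is roughly how a group of size 3 maps onto LD3 on AArch64, written with ACLE NEON intrinsics.  This sketch is not part of the patch, and the function name f_ld3 is invented; it mirrors the loop in the new test added below.  One vld3q_s32 issues a single LD3, loading exactly twelve ints as three de-interleaved vectors of four lanes, so nothing is read past the final group and no separate permute is needed:

/* Illustrative sketch only, not part of the patch.  */
#include <arm_neon.h>

void
f_ld3 (int *__restrict a, int *__restrict b)
{
  for (int i = 0; i < 96; i += 4)
    {
      /* One LD3: loads b[i*3] .. b[i*3 + 11], de-interleaved.  */
      int32x4x3_t v = vld3q_s32 (b + i * 3);
      int32x4_t sum = vaddq_s32 (vaddq_s32 (v.val[0], v.val[1]),
                                 v.val[2]);
      vst1q_s32 (a + i, sum);
    }
}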
Tested on aarch64-linux-gnu and x86_64-linux-gnu.
gcc/
	* tree-vect-stmts.c (vectorizable_load): Reorder checks so that
	load_lanes/grouped_load classification comes first.  Don't check
	whether the vectorization factor is a multiple of the group size
	for load_lanes.

gcc/testsuite/
	* gcc.dg/vect/vect-load-lanes-peeling-1.c: New test.
From-SVN: r236632
commit d3465d72ba
parent 836dbb1a27
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,10 @@
+2016-05-24  Richard Sandiford  <richard.sandiford@arm.com>
+
+	* tree-vect-stmts.c (vectorizable_load): Reorder checks so that
+	load_lanes/grouped_load classification comes first.  Don't check
+	whether the vectorization factor is a multiple of the group size
+	for load_lanes.
+
 2016-05-24  Richard Sandiford  <richard.sandiford@arm.com>
 
 	* tree-vect-data-refs.c (vect_analyze_group_access_1): Set
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,7 @@
+2016-05-24  Richard Sandiford  <richard.sandiford@arm.com>
+
+	* gcc.dg/vect/vect-load-lanes-peeling-1.c: New test.
+
 2016-05-24  Richard Biener  <rguenther@suse.de>
 
 	PR middle-end/70434
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-load-lanes-peeling-1.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target vect_load_lanes } */
+
+void
+f (int *__restrict a, int *__restrict b)
+{
+  for (int i = 0; i < 96; ++i)
+    a[i] = b[i * 3] + b[i * 3 + 1] + b[i * 3 + 2];
+}
+
+/* { dg-final { scan-tree-dump-not "Data access with gaps" "vect" } } */
+/* { dg-final { scan-tree-dump-not "epilog loop required" "vect" } } */
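For reference, on a 128-bit vector target such as AArch64 (assuming the usual 32-bit int, so four ints per vector and a vectorisation factor of 4): the group size here is 3 and 4 % 3 != 0, so before this patch the loop was forced to peel for gaps and the two messages that the scan-tree-dump-not directives look for appeared in the vect dump.  With LD3 classified first, neither should appear.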
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -6303,6 +6303,17 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
       gcc_assert (!nested_in_vect_loop && !STMT_VINFO_GATHER_SCATTER_P (stmt_info));
 
       first_stmt = GROUP_FIRST_ELEMENT (stmt_info);
+      group_size = GROUP_SIZE (vinfo_for_stmt (first_stmt));
+
+      if (!slp
+          && !PURE_SLP_STMT (stmt_info)
+          && !STMT_VINFO_STRIDED_P (stmt_info))
+        {
+          if (vect_load_lanes_supported (vectype, group_size))
+            load_lanes_p = true;
+          else if (!vect_grouped_load_supported (vectype, group_size))
+            return false;
+        }
 
       /* If this is single-element interleaving with an element distance
          that leaves unused vector loads around punt - we at least create
@@ -6330,7 +6341,7 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
       if (loop_vinfo
           && ! STMT_VINFO_STRIDED_P (stmt_info)
           && (GROUP_GAP (vinfo_for_stmt (first_stmt)) != 0
-              || (!slp && vf % GROUP_SIZE (vinfo_for_stmt (first_stmt)) != 0)))
+              || (!slp && !load_lanes_p && vf % group_size != 0)))
         {
           if (dump_enabled_p ())
             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
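Restated outside the vectoriser, the reworked condition looks like the minimal sketch below; needs_peeling_for_gaps and its flattened parameter list are hypothetical, not GCC API, and the enclosing loop_vinfo check is elided:

#include <stdbool.h>

/* Hypothetical helper mirroring the condition above: peel for gaps
   if the group has a gap, or if an ordinary grouped load (full-width
   vector loads followed by permutes) would read past the original
   scalar accesses because vf isn't a multiple of the group size.
   A load-lanes access is exempt: it loads only the data it needs.  */
static bool
needs_peeling_for_gaps (bool strided_p, unsigned int group_gap,
                        bool slp, bool load_lanes_p,
                        unsigned int vf, unsigned int group_size)
{
  return !strided_p
         && (group_gap != 0
             || (!slp && !load_lanes_p && vf % group_size != 0));
}

For the new test, vf = 4 and group_size = 3: the old condition had no load_lanes_p operand and so was true (4 % 3 != 0), forcing the epilogue; with classification done first, load_lanes_p is true and no peeling is required.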
@@ -6350,8 +6361,6 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
   if (slp && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
     slp_perm = true;
 
-  group_size = GROUP_SIZE (vinfo_for_stmt (first_stmt));
-
   /* ??? The following is overly pessimistic (as well as the loop
      case above) in the case we can statically determine the excess
      elements loaded are within the bounds of a decl that is accessed.
@@ -6364,16 +6373,6 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
       return false;
     }
 
-  if (!slp
-      && !PURE_SLP_STMT (stmt_info)
-      && !STMT_VINFO_STRIDED_P (stmt_info))
-    {
-      if (vect_load_lanes_supported (vectype, group_size))
-        load_lanes_p = true;
-      else if (!vect_grouped_load_supported (vectype, group_size))
-        return false;
-    }
-
   /* Invalidate assumptions made by dependence analysis when vectorization
      on the unrolled body effectively re-orders stmts.  */
   if (!PURE_SLP_STMT (stmt_info)