Avoid unnecessary peeling for gaps with LD3

vectorizable_load forces peeling for gaps if the vectorisation factor
is not a multiple of the group size, since in that case we'd normally load
beyond the original scalar accesses but drop the excess elements as part
of a following permute:

          if (loop_vinfo
              && ! STMT_VINFO_STRIDED_P (stmt_info)
              && (GROUP_GAP (vinfo_for_stmt (first_stmt)) != 0
                  || (!slp && vf % GROUP_SIZE (vinfo_for_stmt (first_stmt)) != 0)))

This isn't necessary for LOAD_LANES though, since it loads only the
data needed and does the permute itself.
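
For example, in a loop like the one in the new test below, the group size
is 3, so with (say) 128-bit vectors of int the vectorisation factor is 4
and vf % group_size != 0; an LD3-capable target can now vectorise it
without peeling for gaps:

    /* Each iteration reads a group of three consecutive ints.  A normal
       grouped load would read past the scalar accesses and drop the
       excess elements in a following permute, but LOAD_LANES (e.g.
       AArch64 LD3) loads exactly the data needed and permutes it itself.  */
    void
    f (int *__restrict a, int *__restrict b)
    {
      for (int i = 0; i < 96; ++i)
        a[i] = b[i * 3] + b[i * 3 + 1] + b[i * 3 + 2];
    }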

Tested on aarch64-linux-gnu and x86_64-linux-gnu.

gcc/
	* tree-vect-stmts.c (vectorizable_load): Reorder checks so that
	load_lanes/grouped_load classification comes first.  Don't check
	whether the vectorization factor is a multiple of the group size
	for load_lanes.

gcc/testsuite/
	* gcc.dg/vect/vect-load-lanes-peeling-1.c: New test.

From-SVN: r236632

--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,10 @@
+2016-05-24  Richard Sandiford  <richard.sandiford@arm.com>
+
+	* tree-vect-stmts.c (vectorizable_load): Reorder checks so that
+	load_lanes/grouped_load classification comes first.  Don't check
+	whether the vectorization factor is a multiple of the group size
+	for load_lanes.
+
 2016-05-24  Richard Sandiford  <richard.sandiford@arm.com>
 
 	* tree-vect-data-refs.c (vect_analyze_group_access_1): Set

--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,7 @@
+2016-05-24  Richard Sandiford  <richard.sandiford@arm.com>
+
+	* gcc.dg/vect/vect-load-lanes-peeling-1.c: New test.
+
 2016-05-24  Richard Biener  <rguenther@suse.de>
 
 	PR middle-end/70434

--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-load-lanes-peeling-1.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target vect_load_lanes } */
+
+void
+f (int *__restrict a, int *__restrict b)
+{
+  for (int i = 0; i < 96; ++i)
+    a[i] = b[i * 3] + b[i * 3 + 1] + b[i * 3 + 2];
+}
+
+/* { dg-final { scan-tree-dump-not "Data access with gaps" "vect" } } */
+/* { dg-final { scan-tree-dump-not "epilog loop required" "vect" } } */

--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -6303,6 +6303,17 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
 
       gcc_assert (!nested_in_vect_loop && !STMT_VINFO_GATHER_SCATTER_P (stmt_info));
       first_stmt = GROUP_FIRST_ELEMENT (stmt_info);
+      group_size = GROUP_SIZE (vinfo_for_stmt (first_stmt));
+
+      if (!slp
+	  && !PURE_SLP_STMT (stmt_info)
+	  && !STMT_VINFO_STRIDED_P (stmt_info))
+	{
+	  if (vect_load_lanes_supported (vectype, group_size))
+	    load_lanes_p = true;
+	  else if (!vect_grouped_load_supported (vectype, group_size))
+	    return false;
+	}
 
       /* If this is single-element interleaving with an element distance
	 that leaves unused vector loads around punt - we at least create
@@ -6330,7 +6341,7 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
       if (loop_vinfo
	  && ! STMT_VINFO_STRIDED_P (stmt_info)
	  && (GROUP_GAP (vinfo_for_stmt (first_stmt)) != 0
-	      || (!slp && vf % GROUP_SIZE (vinfo_for_stmt (first_stmt)) != 0)))
+	      || (!slp && !load_lanes_p && vf % group_size != 0)))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -6350,8 +6361,6 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
       if (slp && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
	slp_perm = true;
 
-      group_size = GROUP_SIZE (vinfo_for_stmt (first_stmt));
-
       /* ??? The following is overly pessimistic (as well as the loop
	 case above) in the case we can statically determine the excess
	 elements loaded are within the bounds of a decl that is accessed.
@@ -6364,16 +6373,6 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
	  return false;
	}
 
-  if (!slp
-      && !PURE_SLP_STMT (stmt_info)
-      && !STMT_VINFO_STRIDED_P (stmt_info))
-    {
-      if (vect_load_lanes_supported (vectype, group_size))
-	load_lanes_p = true;
-      else if (!vect_grouped_load_supported (vectype, group_size))
-	return false;
-    }
-
   /* Invalidate assumptions made by dependence analysis when vectorization
      on the unrolled body effectively re-orders stmts.  */
   if (!PURE_SLP_STMT (stmt_info)