Avoid unnecessary peeling for gaps with LD3
vectorizable_load forces peeling for gaps if the vectorisation factor
is not a multiple of the group size, since in that case we'd normally load
beyond the original scalar accesses but drop the excess elements as part
of a following permute:
      if (loop_vinfo
          && ! STMT_VINFO_STRIDED_P (stmt_info)
          && (GROUP_GAP (vinfo_for_stmt (first_stmt)) != 0
              || (!slp && vf % GROUP_SIZE (vinfo_for_stmt (first_stmt)) != 0)))
This isn't necessary for LOAD_LANES though, since it loads only the
data needed and does the permute itself.
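For illustration, here is roughly how a group of size 3 maps onto LD3 on AArch64, written with ACLE NEON intrinsics.  This sketch is not part of the patch, and the function name f_ld3 is invented; it mirrors the loop in the new test added below.  One vld3q_s32 issues a single LD3, loading exactly twelve ints as three de-interleaved vectors of four lanes, so nothing is read past the final group and no separate permute is needed:

/* Illustrative sketch only, not part of the patch.  */
#include <arm_neon.h>

void
f_ld3 (int *__restrict a, int *__restrict b)
{
  for (int i = 0; i < 96; i += 4)
    {
      /* One LD3: loads b[i*3] .. b[i*3 + 11], de-interleaved.  */
      int32x4x3_t v = vld3q_s32 (b + i * 3);
      int32x4_t sum = vaddq_s32 (vaddq_s32 (v.val[0], v.val[1]),
                                 v.val[2]);
      vst1q_s32 (a + i, sum);
    }
}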
Tested on aarch64-linux-gnu and x86_64-linux-gnu.
gcc/
	* tree-vect-stmts.c (vectorizable_load): Reorder checks so that
	load_lanes/grouped_load classification comes first.  Don't check
	whether the vectorization factor is a multiple of the group size
	for load_lanes.

gcc/testsuite/
	* gcc.dg/vect/vect-load-lanes-peeling-1.c: New test.
From-SVN: r236632
commit d3465d72ba
parent 836dbb1a27
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,10 @@
+2016-05-24  Richard Sandiford  <richard.sandiford@arm.com>
+
+	* tree-vect-stmts.c (vectorizable_load): Reorder checks so that
+	load_lanes/grouped_load classification comes first.  Don't check
+	whether the vectorization factor is a multiple of the group size
+	for load_lanes.
+
 2016-05-24  Richard Sandiford  <richard.sandiford@arm.com>
 
 	* tree-vect-data-refs.c (vect_analyze_group_access_1): Set
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,7 @@
+2016-05-24  Richard Sandiford  <richard.sandiford@arm.com>
+
+	* gcc.dg/vect/vect-load-lanes-peeling-1.c: New test.
+
 2016-05-24  Richard Biener  <rguenther@suse.de>
 
 	PR middle-end/70434
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-load-lanes-peeling-1.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target vect_load_lanes } */
+
+void
+f (int *__restrict a, int *__restrict b)
+{
+  for (int i = 0; i < 96; ++i)
+    a[i] = b[i * 3] + b[i * 3 + 1] + b[i * 3 + 2];
+}
+
+/* { dg-final { scan-tree-dump-not "Data access with gaps" "vect" } } */
+/* { dg-final { scan-tree-dump-not "epilog loop required" "vect" } } */
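For reference, on a 128-bit vector target such as AArch64 (assuming the usual 32-bit int, so four ints per vector and a vectorisation factor of 4): the group size here is 3 and 4 % 3 != 0, so before this patch the loop was forced to peel for gaps and the two messages that the scan-tree-dump-not directives look for appeared in the vect dump.  With LD3 classified first, neither should appear.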
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -6303,6 +6303,17 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
       gcc_assert (!nested_in_vect_loop && !STMT_VINFO_GATHER_SCATTER_P (stmt_info));
 
       first_stmt = GROUP_FIRST_ELEMENT (stmt_info);
+      group_size = GROUP_SIZE (vinfo_for_stmt (first_stmt));
+
+      if (!slp
+          && !PURE_SLP_STMT (stmt_info)
+          && !STMT_VINFO_STRIDED_P (stmt_info))
+        {
+          if (vect_load_lanes_supported (vectype, group_size))
+            load_lanes_p = true;
+          else if (!vect_grouped_load_supported (vectype, group_size))
+            return false;
+        }
 
       /* If this is single-element interleaving with an element distance
          that leaves unused vector loads around punt - we at least create
@@ -6330,7 +6341,7 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
       if (loop_vinfo
           && ! STMT_VINFO_STRIDED_P (stmt_info)
           && (GROUP_GAP (vinfo_for_stmt (first_stmt)) != 0
-              || (!slp && vf % GROUP_SIZE (vinfo_for_stmt (first_stmt)) != 0)))
+              || (!slp && !load_lanes_p && vf % group_size != 0)))
         {
           if (dump_enabled_p ())
             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
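Restated outside the vectoriser, the reworked condition looks like the minimal sketch below; needs_peeling_for_gaps and its flattened parameter list are hypothetical, not GCC API, and the enclosing loop_vinfo check is elided:

#include <stdbool.h>

/* Hypothetical helper mirroring the condition above: peel for gaps
   if the group has a gap, or if an ordinary grouped load (full-width
   vector loads followed by permutes) would read past the original
   scalar accesses because vf isn't a multiple of the group size.
   A load-lanes access is exempt: it loads only the data it needs.  */
static bool
needs_peeling_for_gaps (bool strided_p, unsigned int group_gap,
                        bool slp, bool load_lanes_p,
                        unsigned int vf, unsigned int group_size)
{
  return !strided_p
         && (group_gap != 0
             || (!slp && !load_lanes_p && vf % group_size != 0));
}

For the new test, vf = 4 and group_size = 3: the old condition had no load_lanes_p operand and so was true (4 % 3 != 0), forcing the epilogue; with classification done first, load_lanes_p is true and no peeling is required.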
@@ -6350,8 +6361,6 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
   if (slp && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
     slp_perm = true;
 
-  group_size = GROUP_SIZE (vinfo_for_stmt (first_stmt));
-
   /* ??? The following is overly pessimistic (as well as the loop
      case above) in the case we can statically determine the excess
      elements loaded are within the bounds of a decl that is accessed.
@@ -6364,16 +6373,6 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
       return false;
     }
 
-  if (!slp
-      && !PURE_SLP_STMT (stmt_info)
-      && !STMT_VINFO_STRIDED_P (stmt_info))
-    {
-      if (vect_load_lanes_supported (vectype, group_size))
-        load_lanes_p = true;
-      else if (!vect_grouped_load_supported (vectype, group_size))
-        return false;
-    }
-
   /* Invalidate assumptions made by dependence analysis when vectorization
      on the unrolled body effectively re-orders stmts.  */
   if (!PURE_SLP_STMT (stmt_info)