vect: Move slp_perm checking into get_load_store_type.

This moves the setting of slp_perm, the checking of basic-block SLP gaps,
as well as the final perm_ok check into get_load_store_type.
Also, slp_perm itself moves into ls_data (struct vect_load_store_data).

gcc/ChangeLog:

	* tree-vect-stmts.cc (get_load_store_type): Add load-permutation
	checks and setting of slp_perm.
	(vectorizable_store): Remove perm_ok argument.
	(vectorizable_load): Ditto and replace slp_perm by ls.slp_perm.
	* tree-vectorizer.h (struct vect_load_store_data): Add slp_perm.
Author: Robin Dapp
Date:   2025-10-10 18:39:01 +02:00
Commit: aefb2dbb64 (parent: b6e802fd55)

2 files changed, 92 insertions(+), 85 deletions(-)
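In caller terms (a condensed sketch of the hunks below, not verbatim
source), analysis now computes and caches the permutation data once and
the transform/costing phase only reads it back:

	vect_load_store_data _ls_data{};
	vect_load_store_data &ls = slp_node->get_data (_ls_data);
	/* Analysis: get_load_store_type performs the perm_ok and BB-gap
	   checks itself and fills in ls.slp_perm and ls.n_perms.  */
	if (cost_vec
	    && !get_load_store_type (vinfo, stmt_info, vectype, slp_node,
				     mask_node, VLS_LOAD, &ls))
	  return false;
	...
	/* Costing: read the cached values instead of recomputing them.  */
	if (ls.slp_perm)
	  inside_cost += record_stmt_cost (cost_vec, ls.n_perms, vec_perm,
					   slp_node, 0, vect_body);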

diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -2062,16 +2062,13 @@ vector_vector_composition_type (tree vtype, poly_uint64 nelts, tree *ptype,
    VECTYPE is the vector type that the vectorized statements will use.
    If ELSVALS is nonzero the supported else values will be stored in the
-   vector ELSVALS points to.
-   For loads PERM_OK indicates whether we can code generate a
-   SLP_TREE_LOAD_PERMUTATION on the node.  */
+   vector ELSVALS points to.  */
 
 static bool
 get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
 		     tree vectype, slp_tree slp_node,
 		     bool masked_p, vec_load_store_type vls_type,
-		     bool perm_ok, vect_load_store_data *ls)
+		     vect_load_store_data *ls)
 {
   vect_memory_access_type *memory_access_type = &ls->memory_access_type;
   poly_int64 *poffset = &ls->poffset;
@@ -2081,6 +2078,8 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
   internal_fn *lanes_ifn = &ls->lanes_ifn;
   vec<int> *elsvals = &ls->elsvals;
   tree *ls_type = &ls->ls_type;
+  bool *slp_perm = &ls->slp_perm;
+  unsigned *n_perms = &ls->n_perms;
   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
   class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
@@ -2093,6 +2092,15 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
   *misalignment = DR_MISALIGNMENT_UNKNOWN;
   *poffset = 0;
   *ls_type = NULL_TREE;
+  *slp_perm = false;
+  *n_perms = -1U;
+
+  bool perm_ok = true;
+  poly_uint64 vf = loop_vinfo ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) : 1;
+  if (SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
+    perm_ok = vect_transform_slp_perm_load (vinfo, slp_node, vNULL, NULL,
+					    vf, true, n_perms);
+
   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
     {
@@ -2534,7 +2542,7 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
       poly_uint64 read_amount
 	= vf * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
       if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
-	read_amount *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
+	read_amount *= group_size;
 
       auto target_alignment
 	= DR_TARGET_ALIGNMENT (STMT_VINFO_DR_INFO (stmt_info));
@@ -2627,6 +2635,60 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
   if (!loop_vinfo && *memory_access_type == VMAT_ELEMENTWISE)
     return false;
 
+  /* Some loads need to explicitly permute the loaded data if there
+     is a load permutation.  Among those are:
+      - VMAT_ELEMENTWISE.
+      - VMAT_STRIDED_SLP.
+      - VMAT_GATHER_SCATTER:
+	- Strided gather (fallback for VMAT_STRIDED_SLP if #lanes == 1).
+	- Grouped strided gather (ditto but for #lanes > 1).
+     For VMAT_ELEMENTWISE we can fold the load permutation into the
+     individual indices we access directly, eliding the permutation.
+     Strided gather only allows load permutations for the
+     single-element case.  */
+  if (SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()
+      && !(*memory_access_type == VMAT_ELEMENTWISE
+	   || (mat_gather_scatter_p (*memory_access_type)
+	       && SLP_TREE_LANES (slp_node) == 1
+	       && single_element_p)))
+    {
+      if (!loop_vinfo)
+	{
+	  /* In BB vectorization we may not actually use a loaded vector
+	     accessing elements in excess of DR_GROUP_SIZE.  */
+	  stmt_vec_info group_info = SLP_TREE_SCALAR_STMTS (slp_node)[0];
+	  group_info = DR_GROUP_FIRST_ELEMENT (group_info);
+	  unsigned HOST_WIDE_INT nunits;
+	  unsigned j, k, maxk = 0;
+	  FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (slp_node), j, k)
+	    if (k > maxk)
+	      maxk = k;
+	  tree vectype = SLP_TREE_VECTYPE (slp_node);
+	  if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
+	      || maxk >= (DR_GROUP_SIZE (group_info) & ~(nunits - 1)))
+	    {
+	      if (dump_enabled_p ())
+		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+				 "BB vectorization with gaps at the end of "
+				 "a load is not supported\n");
+	      return false;
+	    }
+	}
+
+      if (!perm_ok)
+	{
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_MISSED_OPTIMIZATION,
+			     vect_location,
+			     "unsupported load permutation\n");
+	  return false;
+	}
+
+      *slp_perm = true;
+    }
+
   return true;
 }
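(A worked example for the BB gap check above, with made-up numbers: for
DR_GROUP_SIZE == 7 and nunits == 4 the boundary is 7 & ~3 == 4, i.e. the
group size rounded down to a whole number of vectors.  Any permutation
index k >= 4 would require the second vector, which covers lanes 4..7 and
hence element 7 beyond the 7-element group, so analysis fails.)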
@@ -8009,7 +8071,7 @@ vectorizable_store (vec_info *vinfo,
   vect_load_store_data &ls = slp_node->get_data (_ls_data);
   if (cost_vec
       && !get_load_store_type (vinfo, stmt_info, vectype, slp_node, mask_node,
-			       vls_type, false, &_ls_data))
+			       vls_type, &_ls_data))
     return false;
 
   /* Temporary aliases to analysis data, should not be modified through
      these.  */
@@ -9454,7 +9516,6 @@ vectorizable_load (vec_info *vinfo,
   bool compute_in_loop = false;
   class loop *at_loop;
   int vec_num;
-  bool slp_perm = false;
   bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
   poly_uint64 vf;
   tree aggr_type;
@@ -9592,17 +9653,11 @@ vectorizable_load (vec_info *vinfo,
   else
     group_size = 1;
 
-  bool perm_ok = true;
-  unsigned n_perms = -1U;
-  if (cost_vec && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
-    perm_ok = vect_transform_slp_perm_load (vinfo, slp_node, vNULL, NULL, vf,
-					    true, &n_perms);
-
   vect_load_store_data _ls_data{};
   vect_load_store_data &ls = slp_node->get_data (_ls_data);
   if (cost_vec
       && !get_load_store_type (vinfo, stmt_info, vectype, slp_node, mask_node,
-			       VLS_LOAD, perm_ok, &ls))
+			       VLS_LOAD, &ls))
     return false;
 
   /* Temporary aliases to analysis data, should not be modified through
      these.  */
@@ -9623,56 +9678,6 @@ vectorizable_load (vec_info *vinfo,
   bool type_mode_padding_p
     = TYPE_PRECISION (scalar_type) < GET_MODE_PRECISION (GET_MODE_INNER (mode));
 
-  /* ??? The following checks should really be part of
-     get_load_store_type.  */
-  if (SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()
-      && !(memory_access_type == VMAT_ELEMENTWISE
-	   || (mat_gather_scatter_p (memory_access_type)
-	       && SLP_TREE_LANES (slp_node) == 1
-	       && (!grouped_load
-		   || !DR_GROUP_NEXT_ELEMENT (first_stmt_info)))))
-    {
-      slp_perm = true;
-
-      if (!loop_vinfo && cost_vec)
-	{
-	  /* In BB vectorization we may not actually use a loaded vector
-	     accessing elements in excess of DR_GROUP_SIZE.  */
-	  stmt_vec_info group_info = SLP_TREE_SCALAR_STMTS (slp_node)[0];
-	  group_info = DR_GROUP_FIRST_ELEMENT (group_info);
-	  unsigned HOST_WIDE_INT nunits;
-	  unsigned j, k, maxk = 0;
-	  FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (slp_node), j, k)
-	    if (k > maxk)
-	      maxk = k;
-	  tree vectype = SLP_TREE_VECTYPE (slp_node);
-	  if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
-	      || maxk >= (DR_GROUP_SIZE (group_info) & ~(nunits - 1)))
-	    {
-	      if (dump_enabled_p ())
-		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-				 "BB vectorization with gaps at the end of "
-				 "a load is not supported\n");
-	      return false;
-	    }
-	}
-
-      if (cost_vec)
-	{
-	  if (!perm_ok)
-	    {
-	      if (dump_enabled_p ())
-		dump_printf_loc (MSG_MISSED_OPTIMIZATION,
-				 vect_location,
-				 "unsupported load permutation\n");
-	      return false;
-	    }
-	  ls.n_perms = n_perms;
-	}
-      else
-	n_perms = ls.n_perms;
-    }
-
   if (slp_node->ldst_lanes
       && memory_access_type != VMAT_LOAD_STORE_LANES)
     {
@@ -10027,7 +10032,7 @@ vectorizable_load (vec_info *vinfo,
 	 not only the number of vector stmts the permutation result
 	 fits in.  */
       int ncopies;
-      if (slp_perm)
+      if (ls.slp_perm)
 	{
 	  gcc_assert (memory_access_type != VMAT_ELEMENTWISE);
 	  /* We don't yet generate SLP_TREE_LOAD_PERMUTATIONs for
@@ -10135,18 +10140,18 @@ vectorizable_load (vec_info *vinfo,
 	  if (!costing_p)
 	    {
-	      if (slp_perm)
+	      if (ls.slp_perm)
 		dr_chain.quick_push (gimple_assign_lhs (new_stmt));
 	      else
 		slp_node->push_vec_def (new_stmt);
 	    }
 	}
 
-      if (slp_perm)
+      if (ls.slp_perm)
 	{
 	  if (costing_p)
 	    {
-	      gcc_assert (n_perms != -1U);
-	      inside_cost += record_stmt_cost (cost_vec, n_perms, vec_perm,
+	      gcc_assert (ls.n_perms != -1U);
+	      inside_cost += record_stmt_cost (cost_vec, ls.n_perms, vec_perm,
					       slp_node, 0, vect_body);
 	    }
 	  else
@@ -10154,7 +10159,7 @@ vectorizable_load (vec_info *vinfo,
 	      unsigned n_perms2;
 	      vect_transform_slp_perm_load (vinfo, slp_node, dr_chain, gsi, vf,
 					    false, &n_perms2);
-	      gcc_assert (n_perms == n_perms2);
+	      gcc_assert (ls.n_perms == n_perms2);
 	    }
 	}
 
@@ -10219,7 +10224,7 @@ vectorizable_load (vec_info *vinfo,
 	     instead the access is contiguous but it might be
 	     permuted.  No gap adjustment is needed though.  */
 	  ;
-	  else if (slp_perm
+	  else if (ls.slp_perm
 		   && (group_size != scalar_lanes
 		       || !multiple_p (nunits, group_size)))
 	    {
@@ -10568,7 +10573,7 @@ vectorizable_load (vec_info *vinfo,
 
   if (mat_gather_scatter_p (memory_access_type))
     {
-      gcc_assert ((!grouped_load && !slp_perm) || ls.ls_type);
+      gcc_assert ((!grouped_load && !ls.slp_perm) || ls.ls_type);
 
       /* If we pun the original vectype the loads as well as costing, length,
	  etc. is performed with the new type.  After loading we VIEW_CONVERT
@@ -10930,14 +10935,14 @@ vectorizable_load (vec_info *vinfo,
 	  /* Store vector loads in the corresponding SLP_NODE.  */
 	  if (!costing_p)
 	    {
-	      if (slp_perm)
+	      if (ls.slp_perm)
 		dr_chain.quick_push (gimple_assign_lhs (new_stmt));
 	      else
 		slp_node->push_vec_def (new_stmt);
 	    }
 	}
 
-      if (slp_perm)
+      if (ls.slp_perm)
 	{
 	  if (costing_p)
 	    {
@@ -11034,7 +11039,7 @@ vectorizable_load (vec_info *vinfo,
					 stmt_info, bump);
 	}
 
-      if (grouped_load || slp_perm)
+      if (grouped_load || ls.slp_perm)
 	dr_chain.create (vec_num);
 
       gimple *new_stmt = NULL;
@@ -11531,11 +11536,11 @@ vectorizable_load (vec_info *vinfo,
 
 	  /* Collect vector loads and later create their permutation in
 	     vect_transform_slp_perm_load.  */
-	  if (!costing_p && (grouped_load || slp_perm))
+	  if (!costing_p && (grouped_load || ls.slp_perm))
 	    dr_chain.quick_push (new_temp);
 
 	  /* Store vector loads in the corresponding SLP_NODE.  */
-	  if (!costing_p && !slp_perm)
+	  if (!costing_p && !ls.slp_perm)
 	    slp_node->push_vec_def (new_stmt);
 
 	  /* With SLP permutation we load the gaps as well, without
@@ -11544,7 +11549,7 @@ vectorizable_load (vec_info *vinfo,
 	  group_elt += nunits;
 	  if (!costing_p
 	      && maybe_ne (group_gap_adj, 0U)
-	      && !slp_perm
+	      && !ls.slp_perm
 	      && known_eq (group_elt, group_size - group_gap_adj))
 	    {
 	      poly_wide_int bump_val
@@ -11561,7 +11566,7 @@ vectorizable_load (vec_info *vinfo,
	 elements loaded for a permuted SLP load.  */
       if (!costing_p
	  && maybe_ne (group_gap_adj, 0U)
-	  && slp_perm)
+	  && ls.slp_perm)
 	{
 	  poly_wide_int bump_val
 	    = (wi::to_wide (TYPE_SIZE_UNIT (elem_type)) * group_gap_adj);
@@ -11572,7 +11577,7 @@ vectorizable_load (vec_info *vinfo,
				     stmt_info, bump);
 	}
 
-      if (slp_perm)
+      if (ls.slp_perm)
 	{
 	  /* For SLP we know we've seen all possible uses of dr_chain so
 	     direct vect_transform_slp_perm_load to DCE the unused parts.
@@ -11580,9 +11585,9 @@ vectorizable_load (vec_info *vinfo,
 	     in PR101120 and friends.  */
 	  if (costing_p)
 	    {
-	      gcc_assert (n_perms != -1U);
-	      if (n_perms != 0)
-		inside_cost = record_stmt_cost (cost_vec, n_perms, vec_perm,
+	      gcc_assert (ls.n_perms != -1U);
+	      if (ls.n_perms != 0)
+		inside_cost = record_stmt_cost (cost_vec, ls.n_perms, vec_perm,
						slp_node, 0, vect_body);
 	    }
 	  else
@@ -11591,7 +11596,7 @@ vectorizable_load (vec_info *vinfo,
 	      bool ok = vect_transform_slp_perm_load (vinfo, slp_node, dr_chain,
						      gsi, vf, false, &n_perms2,
						      nullptr, true);
-	      gcc_assert (ok && n_perms == n_perms2);
+	      gcc_assert (ok && ls.n_perms == n_perms2);
 	    }
 
 	  dr_chain.release ();
 	}

diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -290,6 +290,8 @@ struct vect_load_store_data : vect_data {
   tree strided_offset_vectype; // VMAT_GATHER_SCATTER_IFN, originally strided
   tree ls_type; // VMAT_GATHER_SCATTER_IFN
   auto_vec<int> elsvals;
+  /* True if the load requires a load permutation.  */
+  bool slp_perm; // SLP_TREE_LOAD_PERMUTATION
   unsigned n_perms; // SLP_TREE_LOAD_PERMUTATION
 };
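As a reading aid (hand-written illustration, not part of the patch): the
-1U sentinel that get_load_store_type stores into n_perms ties the two new
fields together, so the transform-time asserts can detect a skipped or
failed analysis:

	/* Whenever a permutation must be code generated, analysis has
	   already computed how many vec_perm stmts it costs.  */
	if (ls.slp_perm)
	  gcc_assert (ls.n_perms != -1U);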