mirror of git://gcc.gnu.org/git/gcc.git
tree-parloops: Enable runtime thread detection with -ftree-parallelize-loops
This patch adds runtime thread count detection to auto-parallelization. The -ftree-parallelize-loops option generates parallelized loops without specifying a fixed thread count, deferring this decision to program execution time where it is controlled by the OMP_NUM_THREADS environment variable. Bootstrap and regression tested on aarch64-linux. Compiled SPEC HPC pot3d https://www.spec.org/hpc2021/docs/benchmarks/628.pot3d_s.html with -ftree-parallelize-loops and tested without having OMP_NUM_THREADS set in the environment and with OMP_NUM_THREADS set to different values. gcc/ChangeLog: * doc/invoke.texi (ftree-parallelize-loops): Update. * common.opt (ftree-parallelize-loops): Add alias that maps to special value INT_MAX for runtime thread detection. * tree-parloops.cc (create_parallel_loop): Use INT_MAX for runtime detection. Call gimple_build_omp_parallel without building an OMP_CLAUSE_NUM_THREADS clause. (gen_parallel_loop): For auto-detection, use a conservative estimate of 2 threads. (parallelize_loops): Same. gcc/testsuite/ChangeLog: * gcc.dg/autopar/runtime-auto.c: New test. Signed-off-by: Sebastian Pop <spop@nvidia.com>
This commit is contained in:
parent
0272058797
commit
f708b83d19
|
@ -3303,6 +3303,10 @@ ftree-parallelize-loops=
|
||||||
Common Joined RejectNegative UInteger Var(flag_tree_parallelize_loops) Init(1) Optimization
|
Common Joined RejectNegative UInteger Var(flag_tree_parallelize_loops) Init(1) Optimization
|
||||||
-ftree-parallelize-loops=<number> Enable automatic parallelization of loops.
|
-ftree-parallelize-loops=<number> Enable automatic parallelization of loops.
|
||||||
|
|
||||||
|
ftree-parallelize-loops
|
||||||
|
Common Alias(ftree-parallelize-loops=,2147483647,1)
|
||||||
|
Enable automatic parallelization of loops.
|
||||||
|
|
||||||
ftree-phiprop
|
ftree-phiprop
|
||||||
Common Var(flag_tree_phiprop) Init(1) Optimization
|
Common Var(flag_tree_phiprop) Init(1) Optimization
|
||||||
Enable hoisting loads from conditional pointers.
|
Enable hoisting loads from conditional pointers.
|
||||||
|
|
|
@ -659,7 +659,7 @@ Objective-C and Objective-C++ Dialects}.
|
||||||
-ftree-phiprop -ftree-loop-distribution -ftree-loop-distribute-patterns
|
-ftree-phiprop -ftree-loop-distribution -ftree-loop-distribute-patterns
|
||||||
-ftree-loop-ivcanon -ftree-loop-linear -ftree-loop-optimize
|
-ftree-loop-ivcanon -ftree-loop-linear -ftree-loop-optimize
|
||||||
-ftree-loop-vectorize
|
-ftree-loop-vectorize
|
||||||
-ftree-parallelize-loops=@var{n} -ftree-pre -ftree-partial-pre -ftree-pta
|
-ftree-parallelize-loops[=@var{n}] -ftree-pre -ftree-partial-pre -ftree-pta
|
||||||
-ftree-reassoc -ftree-scev-cprop -ftree-sink -ftree-slsr -ftree-sra
|
-ftree-reassoc -ftree-scev-cprop -ftree-sink -ftree-slsr -ftree-sra
|
||||||
-ftree-switch-conversion -ftree-tail-merge
|
-ftree-switch-conversion -ftree-tail-merge
|
||||||
-ftree-ter -ftree-vectorize -ftree-vrp -ftrivial-auto-var-init
|
-ftree-ter -ftree-vectorize -ftree-vrp -ftrivial-auto-var-init
|
||||||
|
@ -14691,8 +14691,9 @@ variable merging and induction variable elimination) on trees.
|
||||||
Enabled by default at @option{-O1} and higher.
|
Enabled by default at @option{-O1} and higher.
|
||||||
|
|
||||||
@opindex ftree-parallelize-loops
|
@opindex ftree-parallelize-loops
|
||||||
@item -ftree-parallelize-loops=n
|
@item -ftree-parallelize-loops
|
||||||
Parallelize loops, i.e., split their iteration space to run in n threads.
|
@itemx -ftree-parallelize-loops=@var{n}
|
||||||
|
Parallelize loops, i.e., split their iteration space to run in multiple threads.
|
||||||
This is only possible for loops whose iterations are independent
|
This is only possible for loops whose iterations are independent
|
||||||
and can be arbitrarily reordered. The optimization is only
|
and can be arbitrarily reordered. The optimization is only
|
||||||
profitable on multiprocessor machines, for loops that are CPU-intensive,
|
profitable on multiprocessor machines, for loops that are CPU-intensive,
|
||||||
|
@ -14700,6 +14701,17 @@ rather than constrained e.g.@: by memory bandwidth. This option
|
||||||
implies @option{-pthread}, and thus is only supported on targets
|
implies @option{-pthread}, and thus is only supported on targets
|
||||||
that have support for @option{-pthread}.
|
that have support for @option{-pthread}.
|
||||||
|
|
||||||
|
When a positive value @var{n} is specified, the number of threads is fixed
|
||||||
|
at compile time and cannot be changed after compilation. The compiler
|
||||||
|
generates ``#pragma omp parallel num_threads(@var{n})''.
|
||||||
|
|
||||||
|
When used without @code{=@var{n}} (i.e., @option{-ftree-parallelize-loops}),
|
||||||
|
the number of threads is determined at program execution time via the
|
||||||
|
@env{OMP_NUM_THREADS} environment variable. If @env{OMP_NUM_THREADS} is not
|
||||||
|
set, the OpenMP runtime automatically detects the number of available
|
||||||
|
processors and uses that value. This enables creating binaries that
|
||||||
|
adapt to different hardware configurations without recompilation.
|
||||||
|
|
||||||
@opindex ftree-pta
|
@opindex ftree-pta
|
||||||
@item -ftree-pta
|
@item -ftree-pta
|
||||||
Perform function-local points-to analysis on trees. This flag is
|
Perform function-local points-to analysis on trees. This flag is
|
||||||
|
|
|
@ -0,0 +1,53 @@
|
||||||
|
/* { dg-do compile } */
|
||||||
|
/* { dg-options "-O2 -ftree-parallelize-loops -fdump-tree-parloops2-details" } */
|
||||||
|
|
||||||
|
/* Declared by hand so the testcase does not depend on <stdlib.h>.  */
void abort (void);

/* Number of elements in each array; large enough that the parloops
   profitability check (iterations >= threads * MIN_PER_THREAD) passes.  */
#define N 1000

/* File-scope arrays so the loop bodies cannot be optimized away and the
   iterations are trivially independent.  */
int a[N], b[N], c[N];

/* Candidate for auto-parallelization.  Each iteration writes a distinct
   a[i] from b[i] and c[i], so iterations are independent and can be
   arbitrarily reordered.  */
void
test_parallel_loop (void)
{
  int i;

  /* This loop should be auto-parallelized when -ftree-parallelize-loops
     (without =number) is used for runtime thread detection via
     OMP_NUM_THREADS.  */
  for (i = 0; i < N; i++)
    a[i] = b[i] + c[i];
}
|
||||||
|
|
||||||
|
int
|
||||||
|
main (void)
|
||||||
|
{
|
||||||
|
int i;
|
||||||
|
|
||||||
|
for (i = 0; i < N; i++)
|
||||||
|
{
|
||||||
|
b[i] = i;
|
||||||
|
c[i] = i * 2;
|
||||||
|
}
|
||||||
|
|
||||||
|
test_parallel_loop ();
|
||||||
|
|
||||||
|
for (i = 0; i < N; i++)
|
||||||
|
{
|
||||||
|
if (a[i] != b[i] + c[i])
|
||||||
|
abort ();
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Check that the loop is parallelized with runtime thread detection. */
|
||||||
|
/* { dg-final { scan-tree-dump "parallelizing" "parloops2" } } */
|
||||||
|
|
||||||
|
/* Check that "#pragma omp parallel" is generated. */
|
||||||
|
/* { dg-final { scan-tree-dump "pragma omp parallel" "parloops2" } } */
|
||||||
|
|
||||||
|
/* Check that instead of generating a num_threads(x) clause, the compiler calls
|
||||||
|
"__builtin_omp_get_num_threads" that will set the number of threads at
|
||||||
|
program execution time. */
|
||||||
|
/* { dg-final { scan-tree-dump "__builtin_omp_get_num_threads" "parloops2" } } */
|
||||||
|
|
|
@ -2601,10 +2601,19 @@ create_parallel_loop (class loop *loop, tree loop_fn, tree data,
|
||||||
gsi = gsi_last_bb (paral_bb);
|
gsi = gsi_last_bb (paral_bb);
|
||||||
|
|
||||||
gcc_checking_assert (n_threads != 0);
|
gcc_checking_assert (n_threads != 0);
|
||||||
|
if (n_threads == INT_MAX)
|
||||||
|
/* No hardcoded thread count, let OpenMP runtime decide. */
|
||||||
|
omp_par_stmt = gimple_build_omp_parallel (NULL, NULL_TREE, loop_fn,
|
||||||
|
data);
|
||||||
|
else
|
||||||
|
{
|
||||||
|
/* Build the OMP_CLAUSE_NUM_THREADS clause only if we have a fixed
|
||||||
|
thread count. */
|
||||||
t = build_omp_clause (loc, OMP_CLAUSE_NUM_THREADS);
|
t = build_omp_clause (loc, OMP_CLAUSE_NUM_THREADS);
|
||||||
OMP_CLAUSE_NUM_THREADS_EXPR (t)
|
OMP_CLAUSE_NUM_THREADS_EXPR (t)
|
||||||
= build_int_cst (integer_type_node, n_threads);
|
= build_int_cst (integer_type_node, n_threads);
|
||||||
omp_par_stmt = gimple_build_omp_parallel (NULL, t, loop_fn, data);
|
omp_par_stmt = gimple_build_omp_parallel (NULL, t, loop_fn, data);
|
||||||
|
}
|
||||||
gimple_set_location (omp_par_stmt, loc);
|
gimple_set_location (omp_par_stmt, loc);
|
||||||
|
|
||||||
gsi_insert_after (&gsi, omp_par_stmt, GSI_NEW_STMT);
|
gsi_insert_after (&gsi, omp_par_stmt, GSI_NEW_STMT);
|
||||||
|
@ -2812,7 +2821,6 @@ gen_parallel_loop (class loop *loop,
|
||||||
struct clsn_data clsn_data;
|
struct clsn_data clsn_data;
|
||||||
location_t loc;
|
location_t loc;
|
||||||
gimple *cond_stmt;
|
gimple *cond_stmt;
|
||||||
unsigned int m_p_thread=2;
|
|
||||||
|
|
||||||
/* From
|
/* From
|
||||||
|
|
||||||
|
@ -2885,15 +2893,14 @@ gen_parallel_loop (class loop *loop,
|
||||||
|
|
||||||
if (!oacc_kernels_p)
|
if (!oacc_kernels_p)
|
||||||
{
|
{
|
||||||
if (loop->inner)
|
|
||||||
m_p_thread=2;
|
|
||||||
else
|
|
||||||
m_p_thread=MIN_PER_THREAD;
|
|
||||||
|
|
||||||
gcc_checking_assert (n_threads != 0);
|
gcc_checking_assert (n_threads != 0);
|
||||||
|
/* For runtime thread detection, use a conservative estimate of 2 threads
|
||||||
|
for the many iterations condition check. */
|
||||||
|
unsigned threads = (n_threads == INT_MAX) ? 2 : n_threads;
|
||||||
|
unsigned m_p_thread = loop->inner ? 2 : MIN_PER_THREAD;
|
||||||
many_iterations_cond =
|
many_iterations_cond =
|
||||||
fold_build2 (GE_EXPR, boolean_type_node,
|
fold_build2 (GE_EXPR, boolean_type_node,
|
||||||
nit, build_int_cst (type, m_p_thread * n_threads - 1));
|
nit, build_int_cst (type, m_p_thread * threads - 1));
|
||||||
|
|
||||||
many_iterations_cond
|
many_iterations_cond
|
||||||
= fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
|
= fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
|
||||||
|
@ -3905,14 +3912,15 @@ parallelize_loops (bool oacc_kernels_p)
|
||||||
estimated = estimated_loop_iterations_int (loop);
|
estimated = estimated_loop_iterations_int (loop);
|
||||||
if (estimated == -1)
|
if (estimated == -1)
|
||||||
estimated = get_likely_max_loop_iterations_int (loop);
|
estimated = get_likely_max_loop_iterations_int (loop);
|
||||||
|
/* For runtime thread detection, use an estimate of 2 threads. */
|
||||||
|
unsigned threads = (n_threads == INT_MAX) ? 2 : n_threads;
|
||||||
|
unsigned m_p_thread = loop->inner ? 2 : MIN_PER_THREAD;
|
||||||
/* FIXME: Bypass this check as graphite doesn't update the
|
/* FIXME: Bypass this check as graphite doesn't update the
|
||||||
count and frequency correctly now. */
|
count and frequency correctly now. */
|
||||||
if (!flag_loop_parallelize_all
|
if (!flag_loop_parallelize_all
|
||||||
&& !oacc_kernels_p
|
&& !oacc_kernels_p
|
||||||
&& ((estimated != -1
|
&& ((estimated != -1
|
||||||
&& (estimated
|
&& (estimated < ((HOST_WIDE_INT) threads * m_p_thread - 1)))
|
||||||
< ((HOST_WIDE_INT) n_threads
|
|
||||||
* (loop->inner ? 2 : MIN_PER_THREAD) - 1)))
|
|
||||||
/* Do not bother with loops in cold areas. */
|
/* Do not bother with loops in cold areas. */
|
||||||
|| optimize_loop_nest_for_size_p (loop)))
|
|| optimize_loop_nest_for_size_p (loop)))
|
||||||
continue;
|
continue;
|
||||||
|
|
Loading…
Reference in New Issue