tree-parloops: Enable runtime thread detection with -ftree-parallelize-loops

This patch adds runtime thread count detection to auto-parallelization.
When used without the =n argument, the -ftree-parallelize-loops option
generates parallelized loops without a fixed thread count, deferring the
decision to program execution time, where it is controlled by the
OMP_NUM_THREADS environment variable.
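
For context, the runtime side of this decision can be pictured as follows.
This is an illustrative C sketch of the precedence the OpenMP runtime applies
when a parallel region carries no num_threads clause (an explicit
OMP_NUM_THREADS wins, otherwise the detected processor count); it is not
libgomp's actual implementation:

    #include <stdlib.h>
    #include <unistd.h>

    /* Illustration only: thread-count resolution at program startup.  */
    static unsigned
    resolve_thread_count (void)
    {
      const char *env = getenv ("OMP_NUM_THREADS");
      if (env != NULL && atoi (env) > 0)
        return (unsigned) atoi (env);           /* user request wins */
      long n = sysconf (_SC_NPROCESSORS_ONLN);  /* detected processors */
      return n > 0 ? (unsigned) n : 1;
    }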

Bootstrapped and regression tested on aarch64-linux.  Also compiled SPEC HPC
pot3d (https://www.spec.org/hpc2021/docs/benchmarks/628.pot3d_s.html) with
-ftree-parallelize-loops and ran it both without OMP_NUM_THREADS set in the
environment and with OMP_NUM_THREADS set to different values.

gcc/ChangeLog:

	* doc/invoke.texi (ftree-parallelize-loops): Update.
	* common.opt (ftree-parallelize-loops): Add alias that maps to
	special value INT_MAX for runtime thread detection.
	* tree-parloops.cc (create_parallel_loop): Use INT_MAX for runtime
	detection.  Call gimple_build_omp_parallel without building an
	OMP_CLAUSE_NUM_THREADS clause.
	(gen_parallel_loop): For auto-detection, use a conservative
	estimate of 2 threads.
	(parallelize_loops): Same.

gcc/testsuite/ChangeLog:

	* gcc.dg/autopar/runtime-auto.c: New test.

Signed-off-by: Sebastian Pop <spop@nvidia.com>
Author: Sebastian Pop <spop@nvidia.com>
Date: 2025-07-25 17:55:03 +02:00
Committed-by: Richard Biener
Commit: f708b83d19 (parent 0272058797)
4 changed files with 94 additions and 17 deletions

gcc/common.opt

@@ -3303,6 +3303,10 @@ ftree-parallelize-loops=
 Common Joined RejectNegative UInteger Var(flag_tree_parallelize_loops) Init(1) Optimization
 -ftree-parallelize-loops=<number>	Enable automatic parallelization of loops.
 
+ftree-parallelize-loops
+Common Alias(ftree-parallelize-loops=,2147483647,1)
+Enable automatic parallelization of loops.
+
 ftree-phiprop
 Common Var(flag_tree_phiprop) Init(1) Optimization
 Enable hoisting loads from conditional pointers.
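
For readers unfamiliar with the .opt Alias syntax: as I read the option
machinery, Alias(opt,posarg,negarg) rewrites the bare flag into opt with
argument posarg, and the negated flag into opt with argument negarg, before
the rest of the compiler sees it.  Here that means:

    -ftree-parallelize-loops     ->  -ftree-parallelize-loops=2147483647
                                     (INT_MAX sentinel: runtime detection)
    -fno-tree-parallelize-loops  ->  -ftree-parallelize-loops=1
                                     (one thread: no parallelization)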

gcc/doc/invoke.texi

@@ -659,7 +659,7 @@ Objective-C and Objective-C++ Dialects}.
 -ftree-phiprop -ftree-loop-distribution -ftree-loop-distribute-patterns
 -ftree-loop-ivcanon -ftree-loop-linear -ftree-loop-optimize
 -ftree-loop-vectorize
--ftree-parallelize-loops=@var{n} -ftree-pre -ftree-partial-pre -ftree-pta
+-ftree-parallelize-loops[=@var{n}] -ftree-pre -ftree-partial-pre -ftree-pta
 -ftree-reassoc -ftree-scev-cprop -ftree-sink -ftree-slsr -ftree-sra
 -ftree-switch-conversion -ftree-tail-merge
 -ftree-ter -ftree-vectorize -ftree-vrp -ftrivial-auto-var-init
@@ -14691,8 +14691,9 @@ variable merging and induction variable elimination) on trees.
 Enabled by default at @option{-O1} and higher.
 
 @opindex ftree-parallelize-loops
-@item -ftree-parallelize-loops=n
-Parallelize loops, i.e., split their iteration space to run in n threads.
+@item -ftree-parallelize-loops
+@itemx -ftree-parallelize-loops=@var{n}
+Parallelize loops, i.e., split their iteration space to run in multiple threads.
 This is only possible for loops whose iterations are independent
 and can be arbitrarily reordered.  The optimization is only
 profitable on multiprocessor machines, for loops that are CPU-intensive,
@@ -14700,6 +14701,17 @@ rather than constrained e.g.@: by memory bandwidth.  This option
 implies @option{-pthread}, and thus is only supported on targets
 that have support for @option{-pthread}.
 
+When a positive value @var{n} is specified, the number of threads is fixed
+at compile time and cannot be changed after compilation.  The compiler
+generates ``#pragma omp parallel num_threads(@var{n})''.
+
+When used without @code{=@var{n}} (i.e., @option{-ftree-parallelize-loops}),
+the number of threads is determined at program execution time via the
+@env{OMP_NUM_THREADS} environment variable.  If @env{OMP_NUM_THREADS} is not
+set, the OpenMP runtime automatically detects the number of available
+processors and uses that value.  This enables creating binaries that
+adapt to different hardware configurations without recompilation.
+
 @opindex ftree-pta
 @item -ftree-pta
 Perform function-local points-to analysis on trees.  This flag is
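
A usage sketch of the documented behavior; the file name and loop below are
placeholders, and the OpenMP-annotated form is only a rough source-level
equivalent of the GIMPLE the pass emits:

    /* loops.c: independent iterations, a candidate for
       auto-parallelization.  */
    #define N 100000
    double a[N], b[N], c[N];

    void
    add (void)
    {
      /* -ftree-parallelize-loops=4 behaves roughly like wrapping the
         outlined loop body in "#pragma omp parallel num_threads(4)";
         the bare -ftree-parallelize-loops omits the clause and defers
         the choice to the runtime.  */
      for (int i = 0; i < N; i++)
        a[i] = b[i] + c[i];
    }

Compiled once with "gcc -O2 -ftree-parallelize-loops loops.c", the same
binary can then be run with OMP_NUM_THREADS=8 to use eight threads, or with
the variable unset to use all detected processors.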

gcc/testsuite/gcc.dg/autopar/runtime-auto.c

@@ -0,0 +1,53 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-parallelize-loops -fdump-tree-parloops2-details" } */
+
+void abort (void);
+
+#define N 1000
+
+int a[N], b[N], c[N];
+
+void
+test_parallel_loop (void)
+{
+  int i;
+
+  /* This loop should be auto-parallelized when -ftree-parallelize-loops
+     (without =number) is used for runtime thread detection via
+     OMP_NUM_THREADS.  */
+  for (i = 0; i < N; i++)
+    a[i] = b[i] + c[i];
+}
+
+int
+main (void)
+{
+  int i;
+
+  for (i = 0; i < N; i++)
+    {
+      b[i] = i;
+      c[i] = i * 2;
+    }
+
+  test_parallel_loop ();
+
+  for (i = 0; i < N; i++)
+    {
+      if (a[i] != b[i] + c[i])
+	abort ();
+    }
+
+  return 0;
+}
+
+/* Check that the loop is parallelized with runtime thread detection.  */
+/* { dg-final { scan-tree-dump "parallelizing" "parloops2" } } */
+
+/* Check that "#pragma omp parallel" is generated.  */
+/* { dg-final { scan-tree-dump "pragma omp parallel" "parloops2" } } */
+
+/* Check that instead of generating a num_threads(x) clause, the compiler
+   calls "__builtin_omp_get_num_threads" that will set the number of threads
+   at program execution time.  */
+/* { dg-final { scan-tree-dump "__builtin_omp_get_num_threads" "parloops2" } } */

gcc/tree-parloops.cc

@@ -2601,10 +2601,19 @@ create_parallel_loop (class loop *loop, tree loop_fn, tree data,
   gsi = gsi_last_bb (paral_bb);
 
   gcc_checking_assert (n_threads != 0);
-  t = build_omp_clause (loc, OMP_CLAUSE_NUM_THREADS);
-  OMP_CLAUSE_NUM_THREADS_EXPR (t)
-    = build_int_cst (integer_type_node, n_threads);
-  omp_par_stmt = gimple_build_omp_parallel (NULL, t, loop_fn, data);
+  if (n_threads == INT_MAX)
+    /* No hardcoded thread count, let OpenMP runtime decide.  */
+    omp_par_stmt = gimple_build_omp_parallel (NULL, NULL_TREE, loop_fn,
+					      data);
+  else
+    {
+      /* Build the OMP_CLAUSE_NUM_THREADS clause only if we have a fixed
+	 thread count.  */
+      t = build_omp_clause (loc, OMP_CLAUSE_NUM_THREADS);
+      OMP_CLAUSE_NUM_THREADS_EXPR (t)
+	= build_int_cst (integer_type_node, n_threads);
+      omp_par_stmt = gimple_build_omp_parallel (NULL, t, loop_fn, data);
+    }
 
   gimple_set_location (omp_par_stmt, loc);
   gsi_insert_after (&gsi, omp_par_stmt, GSI_NEW_STMT);
@@ -2812,7 +2821,6 @@ gen_parallel_loop (class loop *loop,
   struct clsn_data clsn_data;
   location_t loc;
   gimple *cond_stmt;
-  unsigned int m_p_thread=2;
 
   /* From
@@ -2885,15 +2893,14 @@ gen_parallel_loop (class loop *loop,
   if (!oacc_kernels_p)
     {
-      if (loop->inner)
-	m_p_thread=2;
-      else
-	m_p_thread=MIN_PER_THREAD;
-
       gcc_checking_assert (n_threads != 0);
+      /* For runtime thread detection, use a conservative estimate of 2 threads
+	 for the many iterations condition check.  */
+      unsigned threads = (n_threads == INT_MAX) ? 2 : n_threads;
+      unsigned m_p_thread = loop->inner ? 2 : MIN_PER_THREAD;
       many_iterations_cond =
 	fold_build2 (GE_EXPR, boolean_type_node,
-		     nit, build_int_cst (type, m_p_thread * n_threads - 1));
+		     nit, build_int_cst (type, m_p_thread * threads - 1));
 
       many_iterations_cond
 	= fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
@@ -3905,14 +3912,15 @@ parallelize_loops (bool oacc_kernels_p)
       estimated = estimated_loop_iterations_int (loop);
       if (estimated == -1)
 	estimated = get_likely_max_loop_iterations_int (loop);
+      /* For runtime thread detection, use an estimate of 2 threads.  */
+      unsigned threads = (n_threads == INT_MAX) ? 2 : n_threads;
+      unsigned m_p_thread = loop->inner ? 2 : MIN_PER_THREAD;
       /* FIXME: Bypass this check as graphite doesn't update the
 	 count and frequency correctly now.  */
       if (!flag_loop_parallelize_all
 	  && !oacc_kernels_p
 	  && ((estimated != -1
-	       && (estimated
-		   < ((HOST_WIDE_INT) n_threads
-		      * (loop->inner ? 2 : MIN_PER_THREAD) - 1)))
+	       && (estimated < ((HOST_WIDE_INT) threads * m_p_thread - 1)))
 	      /* Do not bother with loops in cold areas.  */
 	      || optimize_loop_nest_for_size_p (loop)))
 	continue;
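
Concretely, the guard added in gen_parallel_loop (and mirrored in
parallelize_loops) only parallelizes when the iteration count nit satisfies
nit >= m_p_thread * threads - 1.  With runtime detection the thread estimate
is pinned at 2, so a non-nested loop qualifies once it has at least
2 * MIN_PER_THREAD - 1 iterations; assuming the default of 100 for the
parloops-min-per-thread parameter, that is 199 iterations, whereas a fixed
-ftree-parallelize-loops=8 would require 8 * 100 - 1 = 799.  The conservative
estimate thus keeps more loops eligible than any larger assumed thread count
would.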