mirror of git://gcc.gnu.org/git/gcc.git
tree-parloops: Enable runtime thread detection with -ftree-parallelize-loops
This patch adds runtime thread count detection to auto-parallelization. The -ftree-parallelize-loops option (without =<number>) generates parallelized loops without specifying a fixed thread count, deferring this decision to program execution time where it is controlled by the OMP_NUM_THREADS environment variable. Bootstrap and regression tested on aarch64-linux. Compiled SPEC HPC pot3d https://www.spec.org/hpc2021/docs/benchmarks/628.pot3d_s.html with -ftree-parallelize-loops and tested without having OMP_NUM_THREADS set in the environment and with OMP_NUM_THREADS set to different values. gcc/ChangeLog: * doc/invoke.texi (ftree-parallelize-loops): Update. * common.opt (ftree-parallelize-loops): Add alias that maps to special value INT_MAX for runtime thread detection. * tree-parloops.cc (create_parallel_loop): Use INT_MAX for runtime detection. Call gimple_build_omp_parallel without building an OMP_CLAUSE_NUM_THREADS clause. (gen_parallel_loop): For auto-detection, use a conservative estimate of 2 threads. (parallelize_loops): Same. gcc/testsuite/ChangeLog: * gcc.dg/autopar/runtime-auto.c: New test. Signed-off-by: Sebastian Pop <spop@nvidia.com>
This commit is contained in:
parent
0272058797
commit
f708b83d19
|
@ -3303,6 +3303,10 @@ ftree-parallelize-loops=
|
|||
Common Joined RejectNegative UInteger Var(flag_tree_parallelize_loops) Init(1) Optimization
|
||||
-ftree-parallelize-loops=<number> Enable automatic parallelization of loops.
|
||||
|
||||
ftree-parallelize-loops
|
||||
Common Alias(ftree-parallelize-loops=,2147483647,1)
|
||||
Enable automatic parallelization of loops.
|
||||
|
||||
ftree-phiprop
|
||||
Common Var(flag_tree_phiprop) Init(1) Optimization
|
||||
Enable hoisting loads from conditional pointers.
|
||||
|
|
|
@ -659,7 +659,7 @@ Objective-C and Objective-C++ Dialects}.
|
|||
-ftree-phiprop -ftree-loop-distribution -ftree-loop-distribute-patterns
|
||||
-ftree-loop-ivcanon -ftree-loop-linear -ftree-loop-optimize
|
||||
-ftree-loop-vectorize
|
||||
-ftree-parallelize-loops=@var{n} -ftree-pre -ftree-partial-pre -ftree-pta
|
||||
-ftree-parallelize-loops[=@var{n}] -ftree-pre -ftree-partial-pre -ftree-pta
|
||||
-ftree-reassoc -ftree-scev-cprop -ftree-sink -ftree-slsr -ftree-sra
|
||||
-ftree-switch-conversion -ftree-tail-merge
|
||||
-ftree-ter -ftree-vectorize -ftree-vrp -ftrivial-auto-var-init
|
||||
|
@ -14691,8 +14691,9 @@ variable merging and induction variable elimination) on trees.
|
|||
Enabled by default at @option{-O1} and higher.
|
||||
|
||||
@opindex ftree-parallelize-loops
|
||||
@item -ftree-parallelize-loops=n
|
||||
Parallelize loops, i.e., split their iteration space to run in n threads.
|
||||
@item -ftree-parallelize-loops
|
||||
@itemx -ftree-parallelize-loops=@var{n}
|
||||
Parallelize loops, i.e., split their iteration space to run in multiple threads.
|
||||
This is only possible for loops whose iterations are independent
|
||||
and can be arbitrarily reordered. The optimization is only
|
||||
profitable on multiprocessor machines, for loops that are CPU-intensive,
|
||||
|
@ -14700,6 +14701,17 @@ rather than constrained e.g.@: by memory bandwidth. This option
|
|||
implies @option{-pthread}, and thus is only supported on targets
|
||||
that have support for @option{-pthread}.
|
||||
|
||||
When a positive value @var{n} is specified, the number of threads is fixed
|
||||
at compile time and cannot be changed after compilation. The compiler
|
||||
generates ``#pragma omp parallel num_threads(@var{n})''.
|
||||
|
||||
When used without @code{=@var{n}} (i.e., @option{-ftree-parallelize-loops}),
|
||||
the number of threads is determined at program execution time via the
|
||||
@env{OMP_NUM_THREADS} environment variable. If @env{OMP_NUM_THREADS} is not
|
||||
set, the OpenMP runtime automatically detects the number of available
|
||||
processors and uses that value. This enables creating binaries that
|
||||
adapt to different hardware configurations without recompilation.
|
||||
|
||||
@opindex ftree-pta
|
||||
@item -ftree-pta
|
||||
Perform function-local points-to analysis on trees. This flag is
|
||||
|
|
|
@ -0,0 +1,53 @@
|
|||
/* { dg-do compile } */
|
||||
/* { dg-options "-O2 -ftree-parallelize-loops -fdump-tree-parloops2-details" } */
|
||||
|
||||
void abort (void);
|
||||
|
||||
#define N 1000
|
||||
|
||||
int a[N], b[N], c[N];
|
||||
|
||||
void
|
||||
test_parallel_loop (void)
|
||||
{
|
||||
int i;
|
||||
|
||||
/* This loop should be auto-parallelized when -ftree-parallelize-loops
|
||||
(without =number) is used for runtime thread detection via OMP_NUM_THREADS. */
|
||||
for (i = 0; i < N; i++)
|
||||
a[i] = b[i] + c[i];
|
||||
}
|
||||
|
||||
int
|
||||
main (void)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < N; i++)
|
||||
{
|
||||
b[i] = i;
|
||||
c[i] = i * 2;
|
||||
}
|
||||
|
||||
test_parallel_loop ();
|
||||
|
||||
for (i = 0; i < N; i++)
|
||||
{
|
||||
if (a[i] != b[i] + c[i])
|
||||
abort ();
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Check that the loop is parallelized with runtime thread detection. */
|
||||
/* { dg-final { scan-tree-dump "parallelizing" "parloops2" } } */
|
||||
|
||||
/* Check that "#pragma omp parallel" is generated. */
|
||||
/* { dg-final { scan-tree-dump "pragma omp parallel" "parloops2" } } */
|
||||
|
||||
/* Check that instead of generating a num_threads(x) clause, the compiler calls
|
||||
"__builtin_omp_get_num_threads" to query the number of threads chosen at
|
||||
program execution time. */
|
||||
/* { dg-final { scan-tree-dump "__builtin_omp_get_num_threads" "parloops2" } } */
|
||||
|
|
@ -2601,10 +2601,19 @@ create_parallel_loop (class loop *loop, tree loop_fn, tree data,
|
|||
gsi = gsi_last_bb (paral_bb);
|
||||
|
||||
gcc_checking_assert (n_threads != 0);
|
||||
t = build_omp_clause (loc, OMP_CLAUSE_NUM_THREADS);
|
||||
OMP_CLAUSE_NUM_THREADS_EXPR (t)
|
||||
= build_int_cst (integer_type_node, n_threads);
|
||||
omp_par_stmt = gimple_build_omp_parallel (NULL, t, loop_fn, data);
|
||||
if (n_threads == INT_MAX)
|
||||
/* No hardcoded thread count, let OpenMP runtime decide. */
|
||||
omp_par_stmt = gimple_build_omp_parallel (NULL, NULL_TREE, loop_fn,
|
||||
data);
|
||||
else
|
||||
{
|
||||
/* Build the OMP_CLAUSE_NUM_THREADS clause only if we have a fixed
|
||||
thread count. */
|
||||
t = build_omp_clause (loc, OMP_CLAUSE_NUM_THREADS);
|
||||
OMP_CLAUSE_NUM_THREADS_EXPR (t)
|
||||
= build_int_cst (integer_type_node, n_threads);
|
||||
omp_par_stmt = gimple_build_omp_parallel (NULL, t, loop_fn, data);
|
||||
}
|
||||
gimple_set_location (omp_par_stmt, loc);
|
||||
|
||||
gsi_insert_after (&gsi, omp_par_stmt, GSI_NEW_STMT);
|
||||
|
@ -2812,7 +2821,6 @@ gen_parallel_loop (class loop *loop,
|
|||
struct clsn_data clsn_data;
|
||||
location_t loc;
|
||||
gimple *cond_stmt;
|
||||
unsigned int m_p_thread=2;
|
||||
|
||||
/* From
|
||||
|
||||
|
@ -2885,15 +2893,14 @@ gen_parallel_loop (class loop *loop,
|
|||
|
||||
if (!oacc_kernels_p)
|
||||
{
|
||||
if (loop->inner)
|
||||
m_p_thread=2;
|
||||
else
|
||||
m_p_thread=MIN_PER_THREAD;
|
||||
|
||||
gcc_checking_assert (n_threads != 0);
|
||||
/* For runtime thread detection, use a conservative estimate of 2 threads
|
||||
for the many iterations condition check. */
|
||||
unsigned threads = (n_threads == INT_MAX) ? 2 : n_threads;
|
||||
unsigned m_p_thread = loop->inner ? 2 : MIN_PER_THREAD;
|
||||
many_iterations_cond =
|
||||
fold_build2 (GE_EXPR, boolean_type_node,
|
||||
nit, build_int_cst (type, m_p_thread * n_threads - 1));
|
||||
nit, build_int_cst (type, m_p_thread * threads - 1));
|
||||
|
||||
many_iterations_cond
|
||||
= fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
|
||||
|
@ -3905,14 +3912,15 @@ parallelize_loops (bool oacc_kernels_p)
|
|||
estimated = estimated_loop_iterations_int (loop);
|
||||
if (estimated == -1)
|
||||
estimated = get_likely_max_loop_iterations_int (loop);
|
||||
/* For runtime thread detection, use an estimate of 2 threads. */
|
||||
unsigned threads = (n_threads == INT_MAX) ? 2 : n_threads;
|
||||
unsigned m_p_thread = loop->inner ? 2 : MIN_PER_THREAD;
|
||||
/* FIXME: Bypass this check as graphite doesn't update the
|
||||
count and frequency correctly now. */
|
||||
if (!flag_loop_parallelize_all
|
||||
&& !oacc_kernels_p
|
||||
&& ((estimated != -1
|
||||
&& (estimated
|
||||
< ((HOST_WIDE_INT) n_threads
|
||||
* (loop->inner ? 2 : MIN_PER_THREAD) - 1)))
|
||||
&& (estimated < ((HOST_WIDE_INT) threads * m_p_thread - 1)))
|
||||
/* Do not bother with loops in cold areas. */
|
||||
|| optimize_loop_nest_for_size_p (loop)))
|
||||
continue;
|
||||
|
|
Loading…
Reference in New Issue