mirror of git://gcc.gnu.org/git/gcc.git
tree-parloops: Enable runtime thread detection with -ftree-parallelize-loops
This patch adds runtime thread count detection to auto-parallelization. The -ftree-parallelize-loops option generates parallelized loops without specifying a fixed thread count, deferring this decision to program execution time where it is controlled by the OMP_NUM_THREADS environment variable. Bootstrap and regression tested on aarch64-linux. Compiled SPEC HPC pot3d https://www.spec.org/hpc2021/docs/benchmarks/628.pot3d_s.html with -ftree-parallelize-loops and tested without having OMP_NUM_THREADS set in the environment and with OMP_NUM_THREADS set to different values. gcc/ChangeLog: * doc/invoke.texi (ftree-parallelize-loops): Update. * common.opt (ftree-parallelize-loops): Add alias that maps to special value INT_MAX for runtime thread detection. * tree-parloops.cc (create_parallel_loop): Use INT_MAX for runtime detection. Call gimple_build_omp_parallel without building an OMP_CLAUSE_NUM_THREADS clause. (gen_parallel_loop): For auto-detection, use a conservative estimate of 2 threads. (parallelize_loops): Same. gcc/testsuite/ChangeLog: * gcc.dg/autopar/runtime-auto.c: New test. Signed-off-by: Sebastian Pop <spop@nvidia.com>
This commit is contained in:
parent
0272058797
commit
f708b83d19
|
@ -3303,6 +3303,10 @@ ftree-parallelize-loops=
|
||||||
Common Joined RejectNegative UInteger Var(flag_tree_parallelize_loops) Init(1) Optimization
|
Common Joined RejectNegative UInteger Var(flag_tree_parallelize_loops) Init(1) Optimization
|
||||||
-ftree-parallelize-loops=<number> Enable automatic parallelization of loops.
|
-ftree-parallelize-loops=<number> Enable automatic parallelization of loops.
|
||||||
|
|
||||||
|
ftree-parallelize-loops
|
||||||
|
Common Alias(ftree-parallelize-loops=,2147483647,1)
|
||||||
|
Enable automatic parallelization of loops.
|
||||||
|
|
||||||
ftree-phiprop
|
ftree-phiprop
|
||||||
Common Var(flag_tree_phiprop) Init(1) Optimization
|
Common Var(flag_tree_phiprop) Init(1) Optimization
|
||||||
Enable hoisting loads from conditional pointers.
|
Enable hoisting loads from conditional pointers.
|
||||||
|
|
|
@ -659,7 +659,7 @@ Objective-C and Objective-C++ Dialects}.
|
||||||
-ftree-phiprop -ftree-loop-distribution -ftree-loop-distribute-patterns
|
-ftree-phiprop -ftree-loop-distribution -ftree-loop-distribute-patterns
|
||||||
-ftree-loop-ivcanon -ftree-loop-linear -ftree-loop-optimize
|
-ftree-loop-ivcanon -ftree-loop-linear -ftree-loop-optimize
|
||||||
-ftree-loop-vectorize
|
-ftree-loop-vectorize
|
||||||
-ftree-parallelize-loops=@var{n} -ftree-pre -ftree-partial-pre -ftree-pta
|
-ftree-parallelize-loops[=@var{n}] -ftree-pre -ftree-partial-pre -ftree-pta
|
||||||
-ftree-reassoc -ftree-scev-cprop -ftree-sink -ftree-slsr -ftree-sra
|
-ftree-reassoc -ftree-scev-cprop -ftree-sink -ftree-slsr -ftree-sra
|
||||||
-ftree-switch-conversion -ftree-tail-merge
|
-ftree-switch-conversion -ftree-tail-merge
|
||||||
-ftree-ter -ftree-vectorize -ftree-vrp -ftrivial-auto-var-init
|
-ftree-ter -ftree-vectorize -ftree-vrp -ftrivial-auto-var-init
|
||||||
|
@ -14691,8 +14691,9 @@ variable merging and induction variable elimination) on trees.
|
||||||
Enabled by default at @option{-O1} and higher.
|
Enabled by default at @option{-O1} and higher.
|
||||||
|
|
||||||
@opindex ftree-parallelize-loops
|
@opindex ftree-parallelize-loops
|
||||||
@item -ftree-parallelize-loops=n
|
@item -ftree-parallelize-loops
|
||||||
Parallelize loops, i.e., split their iteration space to run in n threads.
|
@itemx -ftree-parallelize-loops=@var{n}
|
||||||
|
Parallelize loops, i.e., split their iteration space to run in multiple threads.
|
||||||
This is only possible for loops whose iterations are independent
|
This is only possible for loops whose iterations are independent
|
||||||
and can be arbitrarily reordered. The optimization is only
|
and can be arbitrarily reordered. The optimization is only
|
||||||
profitable on multiprocessor machines, for loops that are CPU-intensive,
|
profitable on multiprocessor machines, for loops that are CPU-intensive,
|
||||||
|
@ -14700,6 +14701,17 @@ rather than constrained e.g.@: by memory bandwidth. This option
|
||||||
implies @option{-pthread}, and thus is only supported on targets
|
implies @option{-pthread}, and thus is only supported on targets
|
||||||
that have support for @option{-pthread}.
|
that have support for @option{-pthread}.
|
||||||
|
|
||||||
|
When a positive value @var{n} is specified, the number of threads is fixed
|
||||||
|
at compile time and cannot be changed after compilation. The compiler
|
||||||
|
generates ``#pragma omp parallel num_threads(@var{n})''.
|
||||||
|
|
||||||
|
When used without @code{=@var{n}} (i.e., @option{-ftree-parallelize-loops}),
|
||||||
|
the number of threads is determined at program execution time via the
|
||||||
|
@env{OMP_NUM_THREADS} environment variable. If @env{OMP_NUM_THREADS} is not
|
||||||
|
set, the OpenMP runtime automatically detects the number of available
|
||||||
|
processors and uses that value. This enables creating binaries that
|
||||||
|
adapt to different hardware configurations without recompilation.
|
||||||
|
|
||||||
@opindex ftree-pta
|
@opindex ftree-pta
|
||||||
@item -ftree-pta
|
@item -ftree-pta
|
||||||
Perform function-local points-to analysis on trees. This flag is
|
Perform function-local points-to analysis on trees. This flag is
|
||||||
|
|
|
@ -0,0 +1,53 @@
|
||||||
|
/* { dg-do compile } */
|
||||||
|
/* { dg-options "-O2 -ftree-parallelize-loops -fdump-tree-parloops2-details" } */
|
||||||
|
|
||||||
|
/* Declared by hand so the testcase does not depend on <stdlib.h>.  */
void abort (void);

/* Number of elements in each array; large enough that the parloops
   profitability check (iterations >= threads * MIN_PER_THREAD) passes.  */
#define N 1000

/* File-scope arrays so the loop bodies cannot be optimized away and the
   iterations are trivially independent.  */
int a[N], b[N], c[N];

/* Candidate for auto-parallelization.  Each iteration writes a distinct
   a[i] from b[i] and c[i], so iterations are independent and can be
   arbitrarily reordered.  */
void
test_parallel_loop (void)
{
  int i;

  /* This loop should be auto-parallelized when -ftree-parallelize-loops
     (without =number) is used for runtime thread detection via
     OMP_NUM_THREADS.  */
  for (i = 0; i < N; i++)
    a[i] = b[i] + c[i];
}
|
||||||
|
|
||||||
|
int
|
||||||
|
main (void)
|
||||||
|
{
|
||||||
|
int i;
|
||||||
|
|
||||||
|
for (i = 0; i < N; i++)
|
||||||
|
{
|
||||||
|
b[i] = i;
|
||||||
|
c[i] = i * 2;
|
||||||
|
}
|
||||||
|
|
||||||
|
test_parallel_loop ();
|
||||||
|
|
||||||
|
for (i = 0; i < N; i++)
|
||||||
|
{
|
||||||
|
if (a[i] != b[i] + c[i])
|
||||||
|
abort ();
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Check that the loop is parallelized with runtime thread detection. */
|
||||||
|
/* { dg-final { scan-tree-dump "parallelizing" "parloops2" } } */
|
||||||
|
|
||||||
|
/* Check that "#pragma omp parallel" is generated. */
|
||||||
|
/* { dg-final { scan-tree-dump "pragma omp parallel" "parloops2" } } */
|
||||||
|
|
||||||
|
/* Check that instead of generating a num_threads(x) clause, the compiler calls
|
||||||
|
"__builtin_omp_get_num_threads" that will set the number of threads at
|
||||||
|
program execution time. */
|
||||||
|
/* { dg-final { scan-tree-dump "__builtin_omp_get_num_threads" "parloops2" } } */
|
||||||
|
|
|
@ -2601,10 +2601,19 @@ create_parallel_loop (class loop *loop, tree loop_fn, tree data,
|
||||||
gsi = gsi_last_bb (paral_bb);
|
gsi = gsi_last_bb (paral_bb);
|
||||||
|
|
||||||
gcc_checking_assert (n_threads != 0);
|
gcc_checking_assert (n_threads != 0);
|
||||||
|
if (n_threads == INT_MAX)
|
||||||
|
/* No hardcoded thread count, let OpenMP runtime decide. */
|
||||||
|
omp_par_stmt = gimple_build_omp_parallel (NULL, NULL_TREE, loop_fn,
|
||||||
|
data);
|
||||||
|
else
|
||||||
|
{
|
||||||
|
/* Build the OMP_CLAUSE_NUM_THREADS clause only if we have a fixed
|
||||||
|
thread count. */
|
||||||
t = build_omp_clause (loc, OMP_CLAUSE_NUM_THREADS);
|
t = build_omp_clause (loc, OMP_CLAUSE_NUM_THREADS);
|
||||||
OMP_CLAUSE_NUM_THREADS_EXPR (t)
|
OMP_CLAUSE_NUM_THREADS_EXPR (t)
|
||||||
= build_int_cst (integer_type_node, n_threads);
|
= build_int_cst (integer_type_node, n_threads);
|
||||||
omp_par_stmt = gimple_build_omp_parallel (NULL, t, loop_fn, data);
|
omp_par_stmt = gimple_build_omp_parallel (NULL, t, loop_fn, data);
|
||||||
|
}
|
||||||
gimple_set_location (omp_par_stmt, loc);
|
gimple_set_location (omp_par_stmt, loc);
|
||||||
|
|
||||||
gsi_insert_after (&gsi, omp_par_stmt, GSI_NEW_STMT);
|
gsi_insert_after (&gsi, omp_par_stmt, GSI_NEW_STMT);
|
||||||
|
@ -2812,7 +2821,6 @@ gen_parallel_loop (class loop *loop,
|
||||||
struct clsn_data clsn_data;
|
struct clsn_data clsn_data;
|
||||||
location_t loc;
|
location_t loc;
|
||||||
gimple *cond_stmt;
|
gimple *cond_stmt;
|
||||||
unsigned int m_p_thread=2;
|
|
||||||
|
|
||||||
/* From
|
/* From
|
||||||
|
|
||||||
|
@ -2885,15 +2893,14 @@ gen_parallel_loop (class loop *loop,
|
||||||
|
|
||||||
if (!oacc_kernels_p)
|
if (!oacc_kernels_p)
|
||||||
{
|
{
|
||||||
if (loop->inner)
|
|
||||||
m_p_thread=2;
|
|
||||||
else
|
|
||||||
m_p_thread=MIN_PER_THREAD;
|
|
||||||
|
|
||||||
gcc_checking_assert (n_threads != 0);
|
gcc_checking_assert (n_threads != 0);
|
||||||
|
/* For runtime thread detection, use a conservative estimate of 2 threads
|
||||||
|
for the many iterations condition check. */
|
||||||
|
unsigned threads = (n_threads == INT_MAX) ? 2 : n_threads;
|
||||||
|
unsigned m_p_thread = loop->inner ? 2 : MIN_PER_THREAD;
|
||||||
many_iterations_cond =
|
many_iterations_cond =
|
||||||
fold_build2 (GE_EXPR, boolean_type_node,
|
fold_build2 (GE_EXPR, boolean_type_node,
|
||||||
nit, build_int_cst (type, m_p_thread * n_threads - 1));
|
nit, build_int_cst (type, m_p_thread * threads - 1));
|
||||||
|
|
||||||
many_iterations_cond
|
many_iterations_cond
|
||||||
= fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
|
= fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
|
||||||
|
@ -3905,14 +3912,15 @@ parallelize_loops (bool oacc_kernels_p)
|
||||||
estimated = estimated_loop_iterations_int (loop);
|
estimated = estimated_loop_iterations_int (loop);
|
||||||
if (estimated == -1)
|
if (estimated == -1)
|
||||||
estimated = get_likely_max_loop_iterations_int (loop);
|
estimated = get_likely_max_loop_iterations_int (loop);
|
||||||
|
/* For runtime thread detection, use an estimate of 2 threads. */
|
||||||
|
unsigned threads = (n_threads == INT_MAX) ? 2 : n_threads;
|
||||||
|
unsigned m_p_thread = loop->inner ? 2 : MIN_PER_THREAD;
|
||||||
/* FIXME: Bypass this check as graphite doesn't update the
|
/* FIXME: Bypass this check as graphite doesn't update the
|
||||||
count and frequency correctly now. */
|
count and frequency correctly now. */
|
||||||
if (!flag_loop_parallelize_all
|
if (!flag_loop_parallelize_all
|
||||||
&& !oacc_kernels_p
|
&& !oacc_kernels_p
|
||||||
&& ((estimated != -1
|
&& ((estimated != -1
|
||||||
&& (estimated
|
&& (estimated < ((HOST_WIDE_INT) threads * m_p_thread - 1)))
|
||||||
< ((HOST_WIDE_INT) n_threads
|
|
||||||
* (loop->inner ? 2 : MIN_PER_THREAD) - 1)))
|
|
||||||
/* Do not bother with loops in cold areas. */
|
/* Do not bother with loops in cold areas. */
|
||||||
|| optimize_loop_nest_for_size_p (loop)))
|
|| optimize_loop_nest_for_size_p (loop)))
|
||||||
continue;
|
continue;
|
||||||
|
|
Loading…
Reference in New Issue