re PR libgomp/49490 (suboptimal load balancing in loops)

PR libgomp/49490
	* omp-low.c (expand_omp_for_static_nochunk): Use the
	ceil (n / nthreads) size only for the first
	n % nthreads threads in the team, instead of for
	all threads except the last few ones, which
	got less work or none at all.

	* iter.c (gomp_iter_static_next): For chunk size 0
	use the ceil (n / nthreads) size only for the first
	n % nthreads threads in the team, instead of for
	all threads except the last few ones, which
	got less work or none at all.
	* iter_ull.c (gomp_iter_ull_static_next): Likewise.
	* env.c (parse_schedule): If OMP_SCHEDULE doesn't have
	a chunk argument, set run_sched_modifier to 0 for static
	and to 1 for other kinds.  If the chunk argument is 0
	and the schedule is not static, set value to 1.

From-SVN: r175315
This commit is contained in:
Jakub Jelinek 2011-06-22 22:39:25 +02:00 committed by Jakub Jelinek
parent 4fb489e796
commit fb79f500af
6 changed files with 96 additions and 29 deletions

View File

@ -1,5 +1,12 @@
2011-06-22 Jakub Jelinek <jakub@redhat.com> 2011-06-22 Jakub Jelinek <jakub@redhat.com>
PR libgomp/49490
* omp-low.c (expand_omp_for_static_nochunk): Only
use n ceil/ nthreads size for the first
n % nthreads threads in the team instead of
all threads except for the last few ones which
get less work or none at all.
PR debug/49496 PR debug/49496
* tree-vect-patterns.c (vect_recog_widen_mult_pattern): Ignore debug * tree-vect-patterns.c (vect_recog_widen_mult_pattern): Ignore debug
uses. uses.

View File

@ -3,7 +3,7 @@
marshalling to implement data sharing and copying clauses. marshalling to implement data sharing and copying clauses.
Contributed by Diego Novillo <dnovillo@redhat.com> Contributed by Diego Novillo <dnovillo@redhat.com>
Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010 Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010, 2011
Free Software Foundation, Inc. Free Software Foundation, Inc.
This file is part of GCC. This file is part of GCC.
@ -4108,9 +4108,14 @@ expand_omp_for_generic (struct omp_region *region,
else else
n = (adj + N2 - N1) / STEP; n = (adj + N2 - N1) / STEP;
q = n / nthreads; q = n / nthreads;
q += (q * nthreads != n); tt = n % nthreads;
s0 = q * threadid; if (threadid < tt) goto L3; else goto L4;
e0 = min(s0 + q, n); L3:
tt = 0;
q = q + 1;
L4:
s0 = q * threadid + tt;
e0 = s0 + q;
V = s0 * STEP + N1; V = s0 * STEP + N1;
if (s0 >= e0) goto L2; else goto L0; if (s0 >= e0) goto L2; else goto L0;
L0: L0:
@ -4126,12 +4131,14 @@ static void
expand_omp_for_static_nochunk (struct omp_region *region, expand_omp_for_static_nochunk (struct omp_region *region,
struct omp_for_data *fd) struct omp_for_data *fd)
{ {
tree n, q, s0, e0, e, t, nthreads, threadid; tree n, q, s0, e0, e, t, tt, nthreads, threadid;
tree type, itype, vmain, vback; tree type, itype, vmain, vback;
basic_block entry_bb, exit_bb, seq_start_bb, body_bb, cont_bb; basic_block entry_bb, second_bb, third_bb, exit_bb, seq_start_bb;
basic_block body_bb, cont_bb;
basic_block fin_bb; basic_block fin_bb;
gimple_stmt_iterator gsi; gimple_stmt_iterator gsi;
gimple stmt; gimple stmt;
edge ep;
itype = type = TREE_TYPE (fd->loop.v); itype = type = TREE_TYPE (fd->loop.v);
if (POINTER_TYPE_P (type)) if (POINTER_TYPE_P (type))
@ -4185,19 +4192,39 @@ expand_omp_for_static_nochunk (struct omp_region *region,
t = fold_convert (itype, t); t = fold_convert (itype, t);
n = force_gimple_operand_gsi (&gsi, t, true, NULL_TREE, true, GSI_SAME_STMT); n = force_gimple_operand_gsi (&gsi, t, true, NULL_TREE, true, GSI_SAME_STMT);
q = create_tmp_var (itype, "q");
t = fold_build2 (TRUNC_DIV_EXPR, itype, n, nthreads); t = fold_build2 (TRUNC_DIV_EXPR, itype, n, nthreads);
q = force_gimple_operand_gsi (&gsi, t, true, NULL_TREE, true, GSI_SAME_STMT); t = force_gimple_operand_gsi (&gsi, t, false, NULL_TREE, true, GSI_SAME_STMT);
gsi_insert_before (&gsi, gimple_build_assign (q, t), GSI_SAME_STMT);
t = fold_build2 (MULT_EXPR, itype, q, nthreads); tt = create_tmp_var (itype, "tt");
t = fold_build2 (NE_EXPR, itype, t, n); t = fold_build2 (TRUNC_MOD_EXPR, itype, n, nthreads);
t = fold_build2 (PLUS_EXPR, itype, q, t); t = force_gimple_operand_gsi (&gsi, t, false, NULL_TREE, true, GSI_SAME_STMT);
q = force_gimple_operand_gsi (&gsi, t, true, NULL_TREE, true, GSI_SAME_STMT); gsi_insert_before (&gsi, gimple_build_assign (tt, t), GSI_SAME_STMT);
t = build2 (LT_EXPR, boolean_type_node, threadid, tt);
stmt = gimple_build_cond_empty (t);
gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
second_bb = split_block (entry_bb, stmt)->dest;
gsi = gsi_last_bb (second_bb);
gcc_assert (gimple_code (gsi_stmt (gsi)) == GIMPLE_OMP_FOR);
gsi_insert_before (&gsi, gimple_build_assign (tt, build_int_cst (itype, 0)),
GSI_SAME_STMT);
stmt = gimple_build_assign_with_ops (PLUS_EXPR, q, q,
build_int_cst (itype, 1));
gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
third_bb = split_block (second_bb, stmt)->dest;
gsi = gsi_last_bb (third_bb);
gcc_assert (gimple_code (gsi_stmt (gsi)) == GIMPLE_OMP_FOR);
t = build2 (MULT_EXPR, itype, q, threadid); t = build2 (MULT_EXPR, itype, q, threadid);
t = build2 (PLUS_EXPR, itype, t, tt);
s0 = force_gimple_operand_gsi (&gsi, t, true, NULL_TREE, true, GSI_SAME_STMT); s0 = force_gimple_operand_gsi (&gsi, t, true, NULL_TREE, true, GSI_SAME_STMT);
t = fold_build2 (PLUS_EXPR, itype, s0, q); t = fold_build2 (PLUS_EXPR, itype, s0, q);
t = fold_build2 (MIN_EXPR, itype, t, n);
e0 = force_gimple_operand_gsi (&gsi, t, true, NULL_TREE, true, GSI_SAME_STMT); e0 = force_gimple_operand_gsi (&gsi, t, true, NULL_TREE, true, GSI_SAME_STMT);
t = build2 (GE_EXPR, boolean_type_node, s0, e0); t = build2 (GE_EXPR, boolean_type_node, s0, e0);
@ -4263,13 +4290,20 @@ expand_omp_for_static_nochunk (struct omp_region *region,
gsi_remove (&gsi, true); gsi_remove (&gsi, true);
/* Connect all the blocks. */ /* Connect all the blocks. */
find_edge (entry_bb, seq_start_bb)->flags = EDGE_FALSE_VALUE; ep = make_edge (entry_bb, third_bb, EDGE_FALSE_VALUE);
find_edge (entry_bb, fin_bb)->flags = EDGE_TRUE_VALUE; ep->probability = REG_BR_PROB_BASE / 4 * 3;
ep = find_edge (entry_bb, second_bb);
ep->flags = EDGE_TRUE_VALUE;
ep->probability = REG_BR_PROB_BASE / 4;
find_edge (third_bb, seq_start_bb)->flags = EDGE_FALSE_VALUE;
find_edge (third_bb, fin_bb)->flags = EDGE_TRUE_VALUE;
find_edge (cont_bb, body_bb)->flags = EDGE_TRUE_VALUE; find_edge (cont_bb, body_bb)->flags = EDGE_TRUE_VALUE;
find_edge (cont_bb, fin_bb)->flags = EDGE_FALSE_VALUE; find_edge (cont_bb, fin_bb)->flags = EDGE_FALSE_VALUE;
set_immediate_dominator (CDI_DOMINATORS, seq_start_bb, entry_bb); set_immediate_dominator (CDI_DOMINATORS, second_bb, entry_bb);
set_immediate_dominator (CDI_DOMINATORS, third_bb, entry_bb);
set_immediate_dominator (CDI_DOMINATORS, seq_start_bb, third_bb);
set_immediate_dominator (CDI_DOMINATORS, body_bb, set_immediate_dominator (CDI_DOMINATORS, body_bb,
recompute_dominator (CDI_DOMINATORS, body_bb)); recompute_dominator (CDI_DOMINATORS, body_bb));
set_immediate_dominator (CDI_DOMINATORS, fin_bb, set_immediate_dominator (CDI_DOMINATORS, fin_bb,

View File

@ -1,3 +1,17 @@
2011-06-22 Jakub Jelinek <jakub@redhat.com>
PR libgomp/49490
* iter.c (gomp_iter_static_next): For chunk size 0
only use n ceil/ nthreads size for the first
n % nthreads threads in the team instead of
all threads except for the last few ones which
get less work or none at all.
* iter_ull.c (gomp_iter_ull_static_next): Likewise.
* env.c (parse_schedule): If OMP_SCHEDULE doesn't have
chunk argument, set run_sched_modifier to 0 for static
resp. 1 for other kinds. If chunk argument is 0
and not static, set value to 1.
2011-05-19 Jakub Jelinek <jakub@redhat.com> 2011-05-19 Jakub Jelinek <jakub@redhat.com>
PR c++/49043 PR c++/49043

View File

@ -1,4 +1,4 @@
/* Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010 /* Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010, 2011
Free Software Foundation, Inc. Free Software Foundation, Inc.
Contributed by Richard Henderson <rth@redhat.com>. Contributed by Richard Henderson <rth@redhat.com>.
@ -108,7 +108,11 @@ parse_schedule (void)
while (isspace ((unsigned char) *env)) while (isspace ((unsigned char) *env))
++env; ++env;
if (*env == '\0') if (*env == '\0')
return; {
gomp_global_icv.run_sched_modifier
= gomp_global_icv.run_sched_var != GFS_STATIC;
return;
}
if (*env++ != ',') if (*env++ != ',')
goto unknown; goto unknown;
while (isspace ((unsigned char) *env)) while (isspace ((unsigned char) *env))
@ -129,6 +133,8 @@ parse_schedule (void)
if ((int)value != value) if ((int)value != value)
goto invalid; goto invalid;
if (value == 0 && gomp_global_icv.run_sched_var != GFS_STATIC)
value = 1;
gomp_global_icv.run_sched_modifier = value; gomp_global_icv.run_sched_modifier = value;
return; return;

View File

@ -1,4 +1,4 @@
/* Copyright (C) 2005, 2008, 2009 Free Software Foundation, Inc. /* Copyright (C) 2005, 2008, 2009, 2011 Free Software Foundation, Inc.
Contributed by Richard Henderson <rth@redhat.com>. Contributed by Richard Henderson <rth@redhat.com>.
This file is part of the GNU OpenMP Library (libgomp). This file is part of the GNU OpenMP Library (libgomp).
@ -59,7 +59,7 @@ gomp_iter_static_next (long *pstart, long *pend)
trip through the outer loop. */ trip through the outer loop. */
if (ws->chunk_size == 0) if (ws->chunk_size == 0)
{ {
unsigned long n, q, i; unsigned long n, q, i, t;
unsigned long s0, e0; unsigned long s0, e0;
long s, e; long s, e;
@ -74,11 +74,14 @@ gomp_iter_static_next (long *pstart, long *pend)
/* Compute the "zero-based" start and end points. That is, as /* Compute the "zero-based" start and end points. That is, as
if the loop began at zero and incremented by one. */ if the loop began at zero and incremented by one. */
q = n / nthreads; q = n / nthreads;
q += (q * nthreads != n); t = n % nthreads;
s0 = q * i; if (i < t)
{
t = 0;
q++;
}
s0 = q * i + t;
e0 = s0 + q; e0 = s0 + q;
if (e0 > n)
e0 = n;
/* Notice when no iterations allocated for this thread. */ /* Notice when no iterations allocated for this thread. */
if (s0 >= e0) if (s0 >= e0)

View File

@ -1,4 +1,4 @@
/* Copyright (C) 2005, 2008, 2009 Free Software Foundation, Inc. /* Copyright (C) 2005, 2008, 2009, 2011 Free Software Foundation, Inc.
Contributed by Richard Henderson <rth@redhat.com>. Contributed by Richard Henderson <rth@redhat.com>.
This file is part of the GNU OpenMP Library (libgomp). This file is part of the GNU OpenMP Library (libgomp).
@ -60,7 +60,7 @@ gomp_iter_ull_static_next (gomp_ull *pstart, gomp_ull *pend)
trip through the outer loop. */ trip through the outer loop. */
if (ws->chunk_size_ull == 0) if (ws->chunk_size_ull == 0)
{ {
gomp_ull n, q, i, s0, e0, s, e; gomp_ull n, q, i, t, s0, e0, s, e;
if (thr->ts.static_trip > 0) if (thr->ts.static_trip > 0)
return 1; return 1;
@ -75,11 +75,14 @@ gomp_iter_ull_static_next (gomp_ull *pstart, gomp_ull *pend)
/* Compute the "zero-based" start and end points. That is, as /* Compute the "zero-based" start and end points. That is, as
if the loop began at zero and incremented by one. */ if the loop began at zero and incremented by one. */
q = n / nthreads; q = n / nthreads;
q += (q * nthreads != n); t = n % nthreads;
s0 = q * i; if (i < t)
{
t = 0;
q++;
}
s0 = q * i + t;
e0 = s0 + q; e0 = s0 + q;
if (e0 > n)
e0 = n;
/* Notice when no iterations allocated for this thread. */ /* Notice when no iterations allocated for this thread. */
if (s0 >= e0) if (s0 >= e0)