mirror of git://gcc.gnu.org/git/gcc.git
nvptx.c (global_lock_var): New.
gcc/ * config/nvptx/nvptx.c (global_lock_var): New. (nvptx_global_lock_addr): New. (nvptx_lockless_update): Recomment and adjust for clarity. (nvptx_lockfull_update): New. (nvptx_reduction_update): New. (nvptx_goacc_reduction_fini): Call it. libgcc/ * config/nvptx/reduction.c: New. * config/nvptx/t-nvptx (LIB2ADD): Add it. libgomp/ * testsuite/libgomp.oacc-c-c++-common/reduction-cplx-flt.c: Add worker & gang cases. * testsuite/libgomp.oacc-c-c++-common/reduction-cplx-dbl.c: Likewise. From-SVN: r230545
This commit is contained in:
parent
d085c46817
commit
33f47f4279
|
|
@ -1,3 +1,12 @@
|
||||||
|
2015-11-18 Nathan Sidwell <nathan@codesourcery.com>
|
||||||
|
|
||||||
|
* config/nvptx/nvptx.c (global_lock_var): New.
|
||||||
|
(nvptx_global_lock_addr): New.
|
||||||
|
(nvptx_lockless_update): Recomment and adjust for clarity.
|
||||||
|
(nvptx_lockfull_update): New.
|
||||||
|
(nvptx_reduction_update): New.
|
||||||
|
(nvptx_goacc_reduction_fini): Call it.
|
||||||
|
|
||||||
2015-11-18 Bernd Schmidt <bschmidt@redhat.com>
|
2015-11-18 Bernd Schmidt <bschmidt@redhat.com>
|
||||||
|
|
||||||
* regrename.h (struct du_head): Add target_data_1 and target_data_2
|
* regrename.h (struct du_head): Add target_data_1 and target_data_2
|
||||||
|
|
|
||||||
|
|
@ -114,6 +114,9 @@ static unsigned worker_red_align;
|
||||||
#define worker_red_name "__worker_red"
|
#define worker_red_name "__worker_red"
|
||||||
static GTY(()) rtx worker_red_sym;
|
static GTY(()) rtx worker_red_sym;
|
||||||
|
|
||||||
|
/* Global lock variable, needed for 128bit worker & gang reductions. */
|
||||||
|
static GTY(()) tree global_lock_var;
|
||||||
|
|
||||||
/* Allocate a new, cleared machine_function structure. */
|
/* Allocate a new, cleared machine_function structure. */
|
||||||
|
|
||||||
static struct machine_function *
|
static struct machine_function *
|
||||||
|
|
@ -3681,8 +3684,45 @@ nvptx_generate_vector_shuffle (location_t loc,
|
||||||
gimplify_assign (dest_var, expr, seq);
|
gimplify_assign (dest_var, expr, seq);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Lazily generate the global lock var decl and return its address. */
|
||||||
|
|
||||||
|
static tree
|
||||||
|
nvptx_global_lock_addr ()
|
||||||
|
{
|
||||||
|
tree v = global_lock_var;
|
||||||
|
|
||||||
|
if (!v)
|
||||||
|
{
|
||||||
|
tree name = get_identifier ("__reduction_lock");
|
||||||
|
tree type = build_qualified_type (unsigned_type_node,
|
||||||
|
TYPE_QUAL_VOLATILE);
|
||||||
|
v = build_decl (BUILTINS_LOCATION, VAR_DECL, name, type);
|
||||||
|
global_lock_var = v;
|
||||||
|
DECL_ARTIFICIAL (v) = 1;
|
||||||
|
DECL_EXTERNAL (v) = 1;
|
||||||
|
TREE_STATIC (v) = 1;
|
||||||
|
TREE_PUBLIC (v) = 1;
|
||||||
|
TREE_USED (v) = 1;
|
||||||
|
mark_addressable (v);
|
||||||
|
mark_decl_referenced (v);
|
||||||
|
}
|
||||||
|
|
||||||
|
return build_fold_addr_expr (v);
|
||||||
|
}
|
||||||
|
|
||||||
/* Insert code to locklessly update *PTR with *PTR OP VAR just before
|
/* Insert code to locklessly update *PTR with *PTR OP VAR just before
|
||||||
GSI. */
|
GSI. We use a lockless scheme for nearly all case, which looks
|
||||||
|
like:
|
||||||
|
actual = initval(OP);
|
||||||
|
do {
|
||||||
|
guess = actual;
|
||||||
|
write = guess OP myval;
|
||||||
|
actual = cmp&swap (ptr, guess, write)
|
||||||
|
} while (actual bit-different-to guess);
|
||||||
|
return write;
|
||||||
|
|
||||||
|
This relies on a cmp&swap instruction, which is available for 32-
|
||||||
|
and 64-bit types. Larger types must use a locking scheme. */
|
||||||
|
|
||||||
static tree
|
static tree
|
||||||
nvptx_lockless_update (location_t loc, gimple_stmt_iterator *gsi,
|
nvptx_lockless_update (location_t loc, gimple_stmt_iterator *gsi,
|
||||||
|
|
@ -3690,46 +3730,30 @@ nvptx_lockless_update (location_t loc, gimple_stmt_iterator *gsi,
|
||||||
{
|
{
|
||||||
unsigned fn = NVPTX_BUILTIN_CMP_SWAP;
|
unsigned fn = NVPTX_BUILTIN_CMP_SWAP;
|
||||||
tree_code code = NOP_EXPR;
|
tree_code code = NOP_EXPR;
|
||||||
tree type = unsigned_type_node;
|
tree arg_type = unsigned_type_node;
|
||||||
|
tree var_type = TREE_TYPE (var);
|
||||||
|
|
||||||
enum machine_mode mode = TYPE_MODE (TREE_TYPE (var));
|
if (TREE_CODE (var_type) == COMPLEX_TYPE
|
||||||
|
|| TREE_CODE (var_type) == REAL_TYPE)
|
||||||
if (!INTEGRAL_MODE_P (mode))
|
|
||||||
code = VIEW_CONVERT_EXPR;
|
code = VIEW_CONVERT_EXPR;
|
||||||
if (GET_MODE_SIZE (mode) == GET_MODE_SIZE (DImode))
|
|
||||||
|
if (TYPE_SIZE (var_type) == TYPE_SIZE (long_long_unsigned_type_node))
|
||||||
{
|
{
|
||||||
|
arg_type = long_long_unsigned_type_node;
|
||||||
fn = NVPTX_BUILTIN_CMP_SWAPLL;
|
fn = NVPTX_BUILTIN_CMP_SWAPLL;
|
||||||
type = long_long_unsigned_type_node;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
tree swap_fn = nvptx_builtin_decl (fn, true);
|
||||||
|
|
||||||
gimple_seq init_seq = NULL;
|
gimple_seq init_seq = NULL;
|
||||||
tree init_var = make_ssa_name (type);
|
tree init_var = make_ssa_name (arg_type);
|
||||||
tree init_expr = omp_reduction_init_op (loc, op, TREE_TYPE (var));
|
tree init_expr = omp_reduction_init_op (loc, op, var_type);
|
||||||
init_expr = fold_build1 (code, type, init_expr);
|
init_expr = fold_build1 (code, arg_type, init_expr);
|
||||||
gimplify_assign (init_var, init_expr, &init_seq);
|
gimplify_assign (init_var, init_expr, &init_seq);
|
||||||
gimple *init_end = gimple_seq_last (init_seq);
|
gimple *init_end = gimple_seq_last (init_seq);
|
||||||
|
|
||||||
gsi_insert_seq_before (gsi, init_seq, GSI_SAME_STMT);
|
gsi_insert_seq_before (gsi, init_seq, GSI_SAME_STMT);
|
||||||
|
|
||||||
gimple_seq loop_seq = NULL;
|
|
||||||
tree expect_var = make_ssa_name (type);
|
|
||||||
tree actual_var = make_ssa_name (type);
|
|
||||||
tree write_var = make_ssa_name (type);
|
|
||||||
|
|
||||||
tree write_expr = fold_build1 (code, TREE_TYPE (var), expect_var);
|
|
||||||
write_expr = fold_build2 (op, TREE_TYPE (var), write_expr, var);
|
|
||||||
write_expr = fold_build1 (code, type, write_expr);
|
|
||||||
gimplify_assign (write_var, write_expr, &loop_seq);
|
|
||||||
|
|
||||||
tree swap_expr = nvptx_builtin_decl (fn, true);
|
|
||||||
swap_expr = build_call_expr_loc (loc, swap_expr, 3,
|
|
||||||
ptr, expect_var, write_var);
|
|
||||||
gimplify_assign (actual_var, swap_expr, &loop_seq);
|
|
||||||
|
|
||||||
gcond *cond = gimple_build_cond (EQ_EXPR, actual_var, expect_var,
|
|
||||||
NULL_TREE, NULL_TREE);
|
|
||||||
gimple_seq_add_stmt (&loop_seq, cond);
|
|
||||||
|
|
||||||
/* Split the block just after the init stmts. */
|
/* Split the block just after the init stmts. */
|
||||||
basic_block pre_bb = gsi_bb (*gsi);
|
basic_block pre_bb = gsi_bb (*gsi);
|
||||||
edge pre_edge = split_block (pre_bb, init_end);
|
edge pre_edge = split_block (pre_bb, init_end);
|
||||||
|
|
@ -3738,12 +3762,34 @@ nvptx_lockless_update (location_t loc, gimple_stmt_iterator *gsi,
|
||||||
/* Reset the iterator. */
|
/* Reset the iterator. */
|
||||||
*gsi = gsi_for_stmt (gsi_stmt (*gsi));
|
*gsi = gsi_for_stmt (gsi_stmt (*gsi));
|
||||||
|
|
||||||
/* Insert the loop statements. */
|
tree expect_var = make_ssa_name (arg_type);
|
||||||
gimple *loop_end = gimple_seq_last (loop_seq);
|
tree actual_var = make_ssa_name (arg_type);
|
||||||
gsi_insert_seq_before (gsi, loop_seq, GSI_SAME_STMT);
|
tree write_var = make_ssa_name (arg_type);
|
||||||
|
|
||||||
/* Split the block just after the loop stmts. */
|
/* Build and insert the reduction calculation. */
|
||||||
edge post_edge = split_block (loop_bb, loop_end);
|
gimple_seq red_seq = NULL;
|
||||||
|
tree write_expr = fold_build1 (code, var_type, expect_var);
|
||||||
|
write_expr = fold_build2 (op, var_type, write_expr, var);
|
||||||
|
write_expr = fold_build1 (code, arg_type, write_expr);
|
||||||
|
gimplify_assign (write_var, write_expr, &red_seq);
|
||||||
|
|
||||||
|
gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);
|
||||||
|
|
||||||
|
/* Build & insert the cmp&swap sequence. */
|
||||||
|
gimple_seq latch_seq = NULL;
|
||||||
|
tree swap_expr = build_call_expr_loc (loc, swap_fn, 3,
|
||||||
|
ptr, expect_var, write_var);
|
||||||
|
gimplify_assign (actual_var, swap_expr, &latch_seq);
|
||||||
|
|
||||||
|
gcond *cond = gimple_build_cond (EQ_EXPR, actual_var, expect_var,
|
||||||
|
NULL_TREE, NULL_TREE);
|
||||||
|
gimple_seq_add_stmt (&latch_seq, cond);
|
||||||
|
|
||||||
|
gimple *latch_end = gimple_seq_last (latch_seq);
|
||||||
|
gsi_insert_seq_before (gsi, latch_seq, GSI_SAME_STMT);
|
||||||
|
|
||||||
|
/* Split the block just after the latch stmts. */
|
||||||
|
edge post_edge = split_block (loop_bb, latch_end);
|
||||||
basic_block post_bb = post_edge->dest;
|
basic_block post_bb = post_edge->dest;
|
||||||
loop_bb = post_edge->src;
|
loop_bb = post_edge->src;
|
||||||
*gsi = gsi_for_stmt (gsi_stmt (*gsi));
|
*gsi = gsi_for_stmt (gsi_stmt (*gsi));
|
||||||
|
|
@ -3762,7 +3808,123 @@ nvptx_lockless_update (location_t loc, gimple_stmt_iterator *gsi,
|
||||||
loop->latch = loop_bb;
|
loop->latch = loop_bb;
|
||||||
add_loop (loop, loop_bb->loop_father);
|
add_loop (loop, loop_bb->loop_father);
|
||||||
|
|
||||||
return fold_build1 (code, TREE_TYPE (var), write_var);
|
return fold_build1 (code, var_type, write_var);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Insert code to lockfully update *PTR with *PTR OP VAR just before
|
||||||
|
GSI. This is necessary for types larger than 64 bits, where there
|
||||||
|
is no cmp&swap instruction to implement a lockless scheme. We use
|
||||||
|
a lock variable in global memory.
|
||||||
|
|
||||||
|
while (cmp&swap (&lock_var, 0, 1))
|
||||||
|
continue;
|
||||||
|
T accum = *ptr;
|
||||||
|
accum = accum OP var;
|
||||||
|
*ptr = accum;
|
||||||
|
cmp&swap (&lock_var, 1, 0);
|
||||||
|
return accum;
|
||||||
|
|
||||||
|
A lock in global memory is necessary to force execution engine
|
||||||
|
descheduling and avoid resource starvation that can occur if the
|
||||||
|
lock is in .shared memory. */
|
||||||
|
|
||||||
|
static tree
|
||||||
|
nvptx_lockfull_update (location_t loc, gimple_stmt_iterator *gsi,
|
||||||
|
tree ptr, tree var, tree_code op)
|
||||||
|
{
|
||||||
|
tree var_type = TREE_TYPE (var);
|
||||||
|
tree swap_fn = nvptx_builtin_decl (NVPTX_BUILTIN_CMP_SWAP, true);
|
||||||
|
tree uns_unlocked = build_int_cst (unsigned_type_node, 0);
|
||||||
|
tree uns_locked = build_int_cst (unsigned_type_node, 1);
|
||||||
|
|
||||||
|
/* Split the block just before the gsi. Insert a gimple nop to make
|
||||||
|
this easier. */
|
||||||
|
gimple *nop = gimple_build_nop ();
|
||||||
|
gsi_insert_before (gsi, nop, GSI_SAME_STMT);
|
||||||
|
basic_block entry_bb = gsi_bb (*gsi);
|
||||||
|
edge entry_edge = split_block (entry_bb, nop);
|
||||||
|
basic_block lock_bb = entry_edge->dest;
|
||||||
|
/* Reset the iterator. */
|
||||||
|
*gsi = gsi_for_stmt (gsi_stmt (*gsi));
|
||||||
|
|
||||||
|
/* Build and insert the locking sequence. */
|
||||||
|
gimple_seq lock_seq = NULL;
|
||||||
|
tree lock_var = make_ssa_name (unsigned_type_node);
|
||||||
|
tree lock_expr = nvptx_global_lock_addr ();
|
||||||
|
lock_expr = build_call_expr_loc (loc, swap_fn, 3, lock_expr,
|
||||||
|
uns_unlocked, uns_locked);
|
||||||
|
gimplify_assign (lock_var, lock_expr, &lock_seq);
|
||||||
|
gcond *cond = gimple_build_cond (EQ_EXPR, lock_var, uns_unlocked,
|
||||||
|
NULL_TREE, NULL_TREE);
|
||||||
|
gimple_seq_add_stmt (&lock_seq, cond);
|
||||||
|
gimple *lock_end = gimple_seq_last (lock_seq);
|
||||||
|
gsi_insert_seq_before (gsi, lock_seq, GSI_SAME_STMT);
|
||||||
|
|
||||||
|
/* Split the block just after the lock sequence. */
|
||||||
|
edge locked_edge = split_block (lock_bb, lock_end);
|
||||||
|
basic_block update_bb = locked_edge->dest;
|
||||||
|
lock_bb = locked_edge->src;
|
||||||
|
*gsi = gsi_for_stmt (gsi_stmt (*gsi));
|
||||||
|
|
||||||
|
/* Create the lock loop ... */
|
||||||
|
locked_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
|
||||||
|
make_edge (lock_bb, lock_bb, EDGE_FALSE_VALUE);
|
||||||
|
set_immediate_dominator (CDI_DOMINATORS, lock_bb, entry_bb);
|
||||||
|
set_immediate_dominator (CDI_DOMINATORS, update_bb, lock_bb);
|
||||||
|
|
||||||
|
/* ... and the loop structure. */
|
||||||
|
loop *lock_loop = alloc_loop ();
|
||||||
|
lock_loop->header = lock_bb;
|
||||||
|
lock_loop->latch = lock_bb;
|
||||||
|
lock_loop->nb_iterations_estimate = 1;
|
||||||
|
lock_loop->any_estimate = true;
|
||||||
|
add_loop (lock_loop, entry_bb->loop_father);
|
||||||
|
|
||||||
|
/* Build and insert the reduction calculation. */
|
||||||
|
gimple_seq red_seq = NULL;
|
||||||
|
tree acc_in = make_ssa_name (var_type);
|
||||||
|
tree ref_in = build_simple_mem_ref (ptr);
|
||||||
|
TREE_THIS_VOLATILE (ref_in) = 1;
|
||||||
|
gimplify_assign (acc_in, ref_in, &red_seq);
|
||||||
|
|
||||||
|
tree acc_out = make_ssa_name (var_type);
|
||||||
|
tree update_expr = fold_build2 (op, var_type, ref_in, var);
|
||||||
|
gimplify_assign (acc_out, update_expr, &red_seq);
|
||||||
|
|
||||||
|
tree ref_out = build_simple_mem_ref (ptr);
|
||||||
|
TREE_THIS_VOLATILE (ref_out) = 1;
|
||||||
|
gimplify_assign (ref_out, acc_out, &red_seq);
|
||||||
|
|
||||||
|
gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);
|
||||||
|
|
||||||
|
/* Build & insert the unlock sequence. */
|
||||||
|
gimple_seq unlock_seq = NULL;
|
||||||
|
tree unlock_expr = nvptx_global_lock_addr ();
|
||||||
|
unlock_expr = build_call_expr_loc (loc, swap_fn, 3, unlock_expr,
|
||||||
|
uns_locked, uns_unlocked);
|
||||||
|
gimplify_and_add (unlock_expr, &unlock_seq);
|
||||||
|
gsi_insert_seq_before (gsi, unlock_seq, GSI_SAME_STMT);
|
||||||
|
|
||||||
|
return acc_out;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Emit a sequence to update a reduction accumlator at *PTR with the
|
||||||
|
value held in VAR using operator OP. Return the updated value.
|
||||||
|
|
||||||
|
TODO: optimize for atomic ops and indepedent complex ops. */
|
||||||
|
|
||||||
|
static tree
|
||||||
|
nvptx_reduction_update (location_t loc, gimple_stmt_iterator *gsi,
|
||||||
|
tree ptr, tree var, tree_code op)
|
||||||
|
{
|
||||||
|
tree type = TREE_TYPE (var);
|
||||||
|
tree size = TYPE_SIZE (type);
|
||||||
|
|
||||||
|
if (size == TYPE_SIZE (unsigned_type_node)
|
||||||
|
|| size == TYPE_SIZE (long_long_unsigned_type_node))
|
||||||
|
return nvptx_lockless_update (loc, gsi, ptr, var, op);
|
||||||
|
else
|
||||||
|
return nvptx_lockfull_update (loc, gsi, ptr, var, op);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* NVPTX implementation of GOACC_REDUCTION_SETUP. */
|
/* NVPTX implementation of GOACC_REDUCTION_SETUP. */
|
||||||
|
|
@ -3944,10 +4106,10 @@ nvptx_goacc_reduction_fini (gcall *call)
|
||||||
|
|
||||||
if (accum)
|
if (accum)
|
||||||
{
|
{
|
||||||
/* Locklessly update the accumulator. */
|
/* UPDATE the accumulator. */
|
||||||
gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
|
gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
|
||||||
seq = NULL;
|
seq = NULL;
|
||||||
r = nvptx_lockless_update (gimple_location (call), &gsi,
|
r = nvptx_reduction_update (gimple_location (call), &gsi,
|
||||||
accum, var, op);
|
accum, var, op);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,3 +1,8 @@
|
||||||
|
2015-11-18 Nathan Sidwell <nathan@codesourcery.com>
|
||||||
|
|
||||||
|
* config/nvptx/reduction.c: New.
|
||||||
|
* config/nvptx/t-nvptx (LIB2ADD): Add it.
|
||||||
|
|
||||||
2015-11-15 David Edelsohn <dje.gcc@gmail.com>
|
2015-11-15 David Edelsohn <dje.gcc@gmail.com>
|
||||||
|
|
||||||
* config/rs6000/on_exit.c: New file.
|
* config/rs6000/on_exit.c: New file.
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,31 @@
|
||||||
|
/* Oversized reductions lock variable
|
||||||
|
Copyright (C) 2015 Free Software Foundation, Inc.
|
||||||
|
Contributed by Mentor Graphics.
|
||||||
|
|
||||||
|
This file is part of GCC.
|
||||||
|
|
||||||
|
GCC is free software; you can redistribute it and/or modify it under
|
||||||
|
the terms of the GNU General Public License as published by the Free
|
||||||
|
Software Foundation; either version 3, or (at your option) any later
|
||||||
|
version.
|
||||||
|
|
||||||
|
GCC is distributed in the hope that it will be useful, but WITHOUT ANY
|
||||||
|
WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||||
|
for more details.
|
||||||
|
|
||||||
|
Under Section 7 of GPL version 3, you are granted additional
|
||||||
|
permissions described in the GCC Runtime Library Exception, version
|
||||||
|
3.1, as published by the Free Software Foundation.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License and
|
||||||
|
a copy of the GCC Runtime Library Exception along with this program;
|
||||||
|
see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
|
||||||
|
<http://www.gnu.org/licenses/>. */
|
||||||
|
|
||||||
|
|
||||||
|
/* We use a global lock variable for reductions on objects larger than
|
||||||
|
64 bits. Until and unless proven that lock contention for
|
||||||
|
different reduction is a problem, a single lock will suffice. */
|
||||||
|
|
||||||
|
unsigned volatile __reduction_lock = 0;
|
||||||
|
|
@ -1,6 +1,7 @@
|
||||||
LIB2ADD=$(srcdir)/config/nvptx/malloc.asm \
|
LIB2ADD=$(srcdir)/config/nvptx/malloc.asm \
|
||||||
$(srcdir)/config/nvptx/free.asm \
|
$(srcdir)/config/nvptx/free.asm \
|
||||||
$(srcdir)/config/nvptx/realloc.c
|
$(srcdir)/config/nvptx/realloc.c \
|
||||||
|
$(srcdir)/config/nvptx/reduction.c
|
||||||
|
|
||||||
LIB2ADDEH=
|
LIB2ADDEH=
|
||||||
LIB2FUNCS_EXCLUDE=__main
|
LIB2FUNCS_EXCLUDE=__main
|
||||||
|
|
|
||||||
|
|
@ -1,3 +1,9 @@
|
||||||
|
2015-11-18 Nathan Sidwell <nathan@codesourcery.com>
|
||||||
|
|
||||||
|
* testsuite/libgomp.oacc-c-c++-common/reduction-cplx-flt.c: Add
|
||||||
|
worker & gang cases.
|
||||||
|
* testsuite/libgomp.oacc-c-c++-common/reduction-cplx-dbl.c: Likewise.
|
||||||
|
|
||||||
2015-11-17 Cesar Philippidis <cesar@codesourcery.com>
|
2015-11-17 Cesar Philippidis <cesar@codesourcery.com>
|
||||||
|
|
||||||
* config/nvptx/priority_queue.c: New file.
|
* config/nvptx/priority_queue.c: New file.
|
||||||
|
|
|
||||||
|
|
@ -14,28 +14,17 @@ int close_enough (double _Complex a, double _Complex b)
|
||||||
return mag2_diff / mag2_a < (FRAC * FRAC);
|
return mag2_diff / mag2_a < (FRAC * FRAC);
|
||||||
}
|
}
|
||||||
|
|
||||||
int main (void)
|
|
||||||
{
|
|
||||||
#define N 100
|
#define N 100
|
||||||
double _Complex ary[N], sum, prod, tsum, tprod;
|
|
||||||
int ix;
|
|
||||||
|
|
||||||
sum = tsum = 0;
|
static int __attribute__ ((noinline))
|
||||||
prod = tprod = 1;
|
vector (double _Complex ary[N], double _Complex sum, double _Complex prod)
|
||||||
|
{
|
||||||
|
double _Complex tsum = 0, tprod = 1;
|
||||||
|
|
||||||
for (ix = 0; ix < N; ix++)
|
#pragma acc parallel vector_length(32) copyin(ary[0:N]) copy (tsum, tprod)
|
||||||
{
|
|
||||||
double frac = ix * (1.0 / 1024) + 1.0;
|
|
||||||
|
|
||||||
ary[ix] = frac + frac * 2.0i - 1.0i;
|
|
||||||
sum += ary[ix];
|
|
||||||
prod *= ary[ix];
|
|
||||||
}
|
|
||||||
|
|
||||||
#pragma acc parallel vector_length(32) copyin(ary) copy (tsum, tprod)
|
|
||||||
{
|
{
|
||||||
#pragma acc loop vector reduction(+:tsum) reduction (*:tprod)
|
#pragma acc loop vector reduction(+:tsum) reduction (*:tprod)
|
||||||
for (ix = 0; ix < N; ix++)
|
for (int ix = 0; ix < N; ix++)
|
||||||
{
|
{
|
||||||
tsum += ary[ix];
|
tsum += ary[ix];
|
||||||
tprod *= ary[ix];
|
tprod *= ary[ix];
|
||||||
|
|
@ -50,3 +39,76 @@ int main (void)
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static int __attribute__ ((noinline))
|
||||||
|
worker (double _Complex ary[N], double _Complex sum, double _Complex prod)
|
||||||
|
{
|
||||||
|
double _Complex tsum = 0, tprod = 1;
|
||||||
|
|
||||||
|
#pragma acc parallel num_workers(32) copyin(ary[0:N]) copy (tsum, tprod)
|
||||||
|
{
|
||||||
|
#pragma acc loop worker reduction(+:tsum) reduction (*:tprod)
|
||||||
|
for (int ix = 0; ix < N; ix++)
|
||||||
|
{
|
||||||
|
tsum += ary[ix];
|
||||||
|
tprod *= ary[ix];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!close_enough (sum, tsum))
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
if (!close_enough (prod, tprod))
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int __attribute__ ((noinline))
|
||||||
|
gang (double _Complex ary[N], double _Complex sum, double _Complex prod)
|
||||||
|
{
|
||||||
|
double _Complex tsum = 0, tprod = 1;
|
||||||
|
|
||||||
|
#pragma acc parallel num_gangs (32) copyin(ary[0:N]) copy (tsum, tprod)
|
||||||
|
{
|
||||||
|
#pragma acc loop gang reduction(+:tsum) reduction (*:tprod)
|
||||||
|
for (int ix = 0; ix < N; ix++)
|
||||||
|
{
|
||||||
|
tsum += ary[ix];
|
||||||
|
tprod *= ary[ix];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!close_enough (sum, tsum))
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
if (!close_enough (prod, tprod))
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
int main (void)
|
||||||
|
{
|
||||||
|
double _Complex ary[N], sum = 0, prod = 1;
|
||||||
|
|
||||||
|
for (int ix = 0; ix < N; ix++)
|
||||||
|
{
|
||||||
|
double frac = ix * (1.0 / 1024) + 1.0;
|
||||||
|
|
||||||
|
ary[ix] = frac + frac * 2.0i - 1.0i;
|
||||||
|
sum += ary[ix];
|
||||||
|
prod *= ary[ix];
|
||||||
|
}
|
||||||
|
|
||||||
|
if (vector (ary, sum, prod))
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
if (worker (ary, sum, prod))
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
if (gang (ary, sum, prod))
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
|
||||||
|
|
@ -14,28 +14,17 @@ int close_enough (float _Complex a, float _Complex b)
|
||||||
return mag2_diff / mag2_a < (FRAC * FRAC);
|
return mag2_diff / mag2_a < (FRAC * FRAC);
|
||||||
}
|
}
|
||||||
|
|
||||||
int main (void)
|
|
||||||
{
|
|
||||||
#define N 100
|
#define N 100
|
||||||
float _Complex ary[N], sum, prod, tsum, tprod;
|
|
||||||
int ix;
|
|
||||||
|
|
||||||
sum = tsum = 0;
|
static int __attribute__ ((noinline))
|
||||||
prod = tprod = 1;
|
vector (float _Complex ary[N], float _Complex sum, float _Complex prod)
|
||||||
|
{
|
||||||
|
float _Complex tsum = 0, tprod = 1;
|
||||||
|
|
||||||
for (ix = 0; ix < N; ix++)
|
#pragma acc parallel vector_length(32) copyin(ary[0:N]) copy (tsum, tprod)
|
||||||
{
|
|
||||||
float frac = ix * (1.0f / 1024) + 1.0f;
|
|
||||||
|
|
||||||
ary[ix] = frac + frac * 2.0i - 1.0i;
|
|
||||||
sum += ary[ix];
|
|
||||||
prod *= ary[ix];
|
|
||||||
}
|
|
||||||
|
|
||||||
#pragma acc parallel vector_length(32) copyin(ary) copy (tsum, tprod)
|
|
||||||
{
|
{
|
||||||
#pragma acc loop vector reduction(+:tsum) reduction (*:tprod)
|
#pragma acc loop vector reduction(+:tsum) reduction (*:tprod)
|
||||||
for (ix = 0; ix < N; ix++)
|
for (int ix = 0; ix < N; ix++)
|
||||||
{
|
{
|
||||||
tsum += ary[ix];
|
tsum += ary[ix];
|
||||||
tprod *= ary[ix];
|
tprod *= ary[ix];
|
||||||
|
|
@ -50,3 +39,76 @@ int main (void)
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static int __attribute__ ((noinline))
|
||||||
|
worker (float _Complex ary[N], float _Complex sum, float _Complex prod)
|
||||||
|
{
|
||||||
|
float _Complex tsum = 0, tprod = 1;
|
||||||
|
|
||||||
|
#pragma acc parallel num_workers(32) copyin(ary[0:N]) copy (tsum, tprod)
|
||||||
|
{
|
||||||
|
#pragma acc loop worker reduction(+:tsum) reduction (*:tprod)
|
||||||
|
for (int ix = 0; ix < N; ix++)
|
||||||
|
{
|
||||||
|
tsum += ary[ix];
|
||||||
|
tprod *= ary[ix];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!close_enough (sum, tsum))
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
if (!close_enough (prod, tprod))
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int __attribute__ ((noinline))
|
||||||
|
gang (float _Complex ary[N], float _Complex sum, float _Complex prod)
|
||||||
|
{
|
||||||
|
float _Complex tsum = 0, tprod = 1;
|
||||||
|
|
||||||
|
#pragma acc parallel num_gangs (32) copyin(ary[0:N]) copy (tsum, tprod)
|
||||||
|
{
|
||||||
|
#pragma acc loop gang reduction(+:tsum) reduction (*:tprod)
|
||||||
|
for (int ix = 0; ix < N; ix++)
|
||||||
|
{
|
||||||
|
tsum += ary[ix];
|
||||||
|
tprod *= ary[ix];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!close_enough (sum, tsum))
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
if (!close_enough (prod, tprod))
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
int main (void)
|
||||||
|
{
|
||||||
|
float _Complex ary[N], sum = 0, prod = 1;
|
||||||
|
|
||||||
|
for (int ix = 0; ix < N; ix++)
|
||||||
|
{
|
||||||
|
float frac = ix * (1.0f / 1024) + 1.0f;
|
||||||
|
|
||||||
|
ary[ix] = frac + frac * 2.0i - 1.0i;
|
||||||
|
sum += ary[ix];
|
||||||
|
prod *= ary[ix];
|
||||||
|
}
|
||||||
|
|
||||||
|
if (vector (ary, sum, prod))
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
if (worker (ary, sum, prod))
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
if (gang (ary, sum, prod))
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue