mirror of git://gcc.gnu.org/git/gcc.git
				
				
				
			nvptx.c (global_lock_var): New.
gcc/ * config/nvptx/nvptx.c (global_lock_var): New. (nvptx_global_lock_addr): New. (nvptx_lockless_update): Recomment and adjust for clarity. (nvptx_lockfull_update): New. (nvptx_reduction_update): New. (nvptx_goacc_reduction_fini): Call it. libgcc/ * config/nvptx/reduction.c: New. * config/nvptx/t-nvptx (LIB2ADD): Add it. libgomp/ * testsuite/libgomp.oacc-c-c++-common/reduction-cplx-flt.c: Add worker & gang cases. * testsuite/libgomp.oacc-c-c++-common/reduction-cplx-dbl.c: Likewise. From-SVN: r230545
This commit is contained in:
		
							parent
							
								
									d085c46817
								
							
						
					
					
						commit
						33f47f4279
					
				|  | @ -1,3 +1,12 @@ | |||
| 2015-11-18  Nathan Sidwell  <nathan@codesourcery.com> | ||||
| 
 | ||||
| 	* config/nvptx/nvptx.c (global_lock_var): New. | ||||
| 	(nvptx_global_lock_addr): New. | ||||
| 	(nvptx_lockless_update): Recomment and adjust for clarity. | ||||
| 	(nvptx_lockfull_update): New. | ||||
| 	(nvptx_reduction_update): New. | ||||
| 	(nvptx_goacc_reduction_fini): Call it. | ||||
| 
 | ||||
| 2015-11-18  Bernd Schmidt  <bschmidt@redhat.com> | ||||
| 
 | ||||
| 	* regrename.h (struct du_head): Add target_data_1 and target_data_2 | ||||
|  |  | |||
|  | @ -114,6 +114,9 @@ static unsigned worker_red_align; | |||
| #define worker_red_name "__worker_red" | ||||
| static GTY(()) rtx worker_red_sym; | ||||
| 
 | ||||
| /* Global lock variable, needed for 128bit worker & gang reductions.  */ | ||||
| static GTY(()) tree global_lock_var; | ||||
| 
 | ||||
| /* Allocate a new, cleared machine_function structure.  */ | ||||
| 
 | ||||
| static struct machine_function * | ||||
|  | @ -3681,8 +3684,45 @@ nvptx_generate_vector_shuffle (location_t loc, | |||
|   gimplify_assign (dest_var, expr, seq); | ||||
| } | ||||
| 
 | ||||
| /* Insert code to locklessly update  *PTR with *PTR OP VAR just before
 | ||||
|    GSI.  */ | ||||
| /* Lazily generate the global lock var decl and return its address.  */ | ||||
| 
 | ||||
| static tree | ||||
| nvptx_global_lock_addr () | ||||
| { | ||||
|   tree v = global_lock_var; | ||||
|    | ||||
|   if (!v) | ||||
|     { | ||||
|       tree name = get_identifier ("__reduction_lock"); | ||||
|       tree type = build_qualified_type (unsigned_type_node, | ||||
| 					TYPE_QUAL_VOLATILE); | ||||
|       v = build_decl (BUILTINS_LOCATION, VAR_DECL, name, type); | ||||
|       global_lock_var = v; | ||||
|       DECL_ARTIFICIAL (v) = 1; | ||||
|       DECL_EXTERNAL (v) = 1; | ||||
|       TREE_STATIC (v) = 1; | ||||
|       TREE_PUBLIC (v) = 1; | ||||
|       TREE_USED (v) = 1; | ||||
|       mark_addressable (v); | ||||
|       mark_decl_referenced (v); | ||||
|     } | ||||
| 
 | ||||
|   return build_fold_addr_expr (v); | ||||
| } | ||||
| 
 | ||||
| /* Insert code to locklessly update *PTR with *PTR OP VAR just before
 | ||||
|    GSI.  We use a lockless scheme for nearly all case, which looks | ||||
|    like: | ||||
|      actual = initval(OP); | ||||
|      do { | ||||
|        guess = actual; | ||||
|        write = guess OP myval; | ||||
|        actual = cmp&swap (ptr, guess, write) | ||||
|      } while (actual bit-different-to guess); | ||||
|    return write; | ||||
| 
 | ||||
|    This relies on a cmp&swap instruction, which is available for 32- | ||||
|    and 64-bit types.  Larger types must use a locking scheme.  */ | ||||
| 
 | ||||
| static tree | ||||
| nvptx_lockless_update (location_t loc, gimple_stmt_iterator *gsi, | ||||
|  | @ -3690,46 +3730,30 @@ nvptx_lockless_update (location_t loc, gimple_stmt_iterator *gsi, | |||
| { | ||||
|   unsigned fn = NVPTX_BUILTIN_CMP_SWAP; | ||||
|   tree_code code = NOP_EXPR; | ||||
|   tree type = unsigned_type_node; | ||||
|   tree arg_type = unsigned_type_node; | ||||
|   tree var_type = TREE_TYPE (var); | ||||
| 
 | ||||
|   enum machine_mode mode = TYPE_MODE (TREE_TYPE (var)); | ||||
| 
 | ||||
|   if (!INTEGRAL_MODE_P (mode)) | ||||
|   if (TREE_CODE (var_type) == COMPLEX_TYPE | ||||
|       || TREE_CODE (var_type) == REAL_TYPE) | ||||
|     code = VIEW_CONVERT_EXPR; | ||||
|   if (GET_MODE_SIZE (mode) == GET_MODE_SIZE (DImode)) | ||||
| 
 | ||||
|   if (TYPE_SIZE (var_type) == TYPE_SIZE (long_long_unsigned_type_node)) | ||||
|     { | ||||
|       arg_type = long_long_unsigned_type_node; | ||||
|       fn = NVPTX_BUILTIN_CMP_SWAPLL; | ||||
|       type = long_long_unsigned_type_node; | ||||
|     } | ||||
| 
 | ||||
|   tree swap_fn = nvptx_builtin_decl (fn, true); | ||||
| 
 | ||||
|   gimple_seq init_seq = NULL; | ||||
|   tree init_var = make_ssa_name (type); | ||||
|   tree init_expr = omp_reduction_init_op (loc, op, TREE_TYPE (var)); | ||||
|   init_expr = fold_build1 (code, type, init_expr); | ||||
|   tree init_var = make_ssa_name (arg_type); | ||||
|   tree init_expr = omp_reduction_init_op (loc, op, var_type); | ||||
|   init_expr = fold_build1 (code, arg_type, init_expr); | ||||
|   gimplify_assign (init_var, init_expr, &init_seq); | ||||
|   gimple *init_end = gimple_seq_last (init_seq); | ||||
| 
 | ||||
|   gsi_insert_seq_before (gsi, init_seq, GSI_SAME_STMT); | ||||
|    | ||||
|   gimple_seq loop_seq = NULL; | ||||
|   tree expect_var = make_ssa_name (type); | ||||
|   tree actual_var = make_ssa_name (type); | ||||
|   tree write_var = make_ssa_name (type); | ||||
|    | ||||
|   tree write_expr = fold_build1 (code, TREE_TYPE (var), expect_var); | ||||
|   write_expr = fold_build2 (op, TREE_TYPE (var), write_expr, var); | ||||
|   write_expr = fold_build1 (code, type, write_expr); | ||||
|   gimplify_assign (write_var, write_expr, &loop_seq); | ||||
| 
 | ||||
|   tree swap_expr = nvptx_builtin_decl (fn, true); | ||||
|   swap_expr = build_call_expr_loc (loc, swap_expr, 3, | ||||
| 				   ptr, expect_var, write_var); | ||||
|   gimplify_assign (actual_var, swap_expr, &loop_seq); | ||||
| 
 | ||||
|   gcond *cond = gimple_build_cond (EQ_EXPR, actual_var, expect_var, | ||||
| 				   NULL_TREE, NULL_TREE); | ||||
|   gimple_seq_add_stmt (&loop_seq, cond); | ||||
| 
 | ||||
|   /* Split the block just after the init stmts.  */ | ||||
|   basic_block pre_bb = gsi_bb (*gsi); | ||||
|   edge pre_edge = split_block (pre_bb, init_end); | ||||
|  | @ -3738,12 +3762,34 @@ nvptx_lockless_update (location_t loc, gimple_stmt_iterator *gsi, | |||
|   /* Reset the iterator.  */ | ||||
|   *gsi = gsi_for_stmt (gsi_stmt (*gsi)); | ||||
| 
 | ||||
|   /* Insert the loop statements.  */ | ||||
|   gimple *loop_end = gimple_seq_last (loop_seq); | ||||
|   gsi_insert_seq_before (gsi, loop_seq, GSI_SAME_STMT); | ||||
|   tree expect_var = make_ssa_name (arg_type); | ||||
|   tree actual_var = make_ssa_name (arg_type); | ||||
|   tree write_var = make_ssa_name (arg_type); | ||||
|    | ||||
|   /* Split the block just after the loop stmts.  */ | ||||
|   edge post_edge = split_block (loop_bb, loop_end); | ||||
|   /* Build and insert the reduction calculation.  */ | ||||
|   gimple_seq red_seq = NULL; | ||||
|   tree write_expr = fold_build1 (code, var_type, expect_var); | ||||
|   write_expr = fold_build2 (op, var_type, write_expr, var); | ||||
|   write_expr = fold_build1 (code, arg_type, write_expr); | ||||
|   gimplify_assign (write_var, write_expr, &red_seq); | ||||
| 
 | ||||
|   gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT); | ||||
| 
 | ||||
|   /* Build & insert the cmp&swap sequence.  */ | ||||
|   gimple_seq latch_seq = NULL; | ||||
|   tree swap_expr = build_call_expr_loc (loc, swap_fn, 3, | ||||
| 					ptr, expect_var, write_var); | ||||
|   gimplify_assign (actual_var, swap_expr, &latch_seq); | ||||
| 
 | ||||
|   gcond *cond = gimple_build_cond (EQ_EXPR, actual_var, expect_var, | ||||
| 				   NULL_TREE, NULL_TREE); | ||||
|   gimple_seq_add_stmt (&latch_seq, cond); | ||||
| 
 | ||||
|   gimple *latch_end = gimple_seq_last (latch_seq); | ||||
|   gsi_insert_seq_before (gsi, latch_seq, GSI_SAME_STMT); | ||||
| 
 | ||||
|   /* Split the block just after the latch stmts.  */ | ||||
|   edge post_edge = split_block (loop_bb, latch_end); | ||||
|   basic_block post_bb = post_edge->dest; | ||||
|   loop_bb = post_edge->src; | ||||
|   *gsi = gsi_for_stmt (gsi_stmt (*gsi)); | ||||
|  | @ -3762,7 +3808,123 @@ nvptx_lockless_update (location_t loc, gimple_stmt_iterator *gsi, | |||
|   loop->latch = loop_bb; | ||||
|   add_loop (loop, loop_bb->loop_father); | ||||
| 
 | ||||
|   return fold_build1 (code, TREE_TYPE (var), write_var); | ||||
|   return fold_build1 (code, var_type, write_var); | ||||
| } | ||||
| 
 | ||||
| /* Insert code to lockfully update *PTR with *PTR OP VAR just before
 | ||||
|    GSI.  This is necessary for types larger than 64 bits, where there | ||||
|    is no cmp&swap instruction to implement a lockless scheme.  We use | ||||
|    a lock variable in global memory. | ||||
| 
 | ||||
|    while (cmp&swap (&lock_var, 0, 1)) | ||||
|      continue; | ||||
|    T accum = *ptr; | ||||
|    accum = accum OP var; | ||||
|    *ptr = accum; | ||||
|    cmp&swap (&lock_var, 1, 0); | ||||
|    return accum; | ||||
| 
 | ||||
|    A lock in global memory is necessary to force execution engine | ||||
|    descheduling and avoid resource starvation that can occur if the | ||||
|    lock is in .shared memory.  */ | ||||
| 
 | ||||
| static tree | ||||
| nvptx_lockfull_update (location_t loc, gimple_stmt_iterator *gsi, | ||||
| 		       tree ptr, tree var, tree_code op) | ||||
| { | ||||
|   tree var_type = TREE_TYPE (var); | ||||
|   tree swap_fn = nvptx_builtin_decl (NVPTX_BUILTIN_CMP_SWAP, true); | ||||
|   tree uns_unlocked = build_int_cst (unsigned_type_node, 0); | ||||
|   tree uns_locked = build_int_cst (unsigned_type_node, 1); | ||||
| 
 | ||||
|   /* Split the block just before the gsi.  Insert a gimple nop to make
 | ||||
|      this easier.  */ | ||||
|   gimple *nop = gimple_build_nop (); | ||||
|   gsi_insert_before (gsi, nop, GSI_SAME_STMT); | ||||
|   basic_block entry_bb = gsi_bb (*gsi); | ||||
|   edge entry_edge = split_block (entry_bb, nop); | ||||
|   basic_block lock_bb = entry_edge->dest; | ||||
|   /* Reset the iterator.  */ | ||||
|   *gsi = gsi_for_stmt (gsi_stmt (*gsi)); | ||||
| 
 | ||||
|   /* Build and insert the locking sequence.  */ | ||||
|   gimple_seq lock_seq = NULL; | ||||
|   tree lock_var = make_ssa_name (unsigned_type_node); | ||||
|   tree lock_expr = nvptx_global_lock_addr (); | ||||
|   lock_expr = build_call_expr_loc (loc, swap_fn, 3, lock_expr, | ||||
| 				   uns_unlocked, uns_locked); | ||||
|   gimplify_assign (lock_var, lock_expr, &lock_seq); | ||||
|   gcond *cond = gimple_build_cond (EQ_EXPR, lock_var, uns_unlocked, | ||||
| 				   NULL_TREE, NULL_TREE); | ||||
|   gimple_seq_add_stmt (&lock_seq, cond); | ||||
|   gimple *lock_end = gimple_seq_last (lock_seq); | ||||
|   gsi_insert_seq_before (gsi, lock_seq, GSI_SAME_STMT); | ||||
| 
 | ||||
|   /* Split the block just after the lock sequence.  */ | ||||
|   edge locked_edge = split_block (lock_bb, lock_end); | ||||
|   basic_block update_bb = locked_edge->dest; | ||||
|   lock_bb = locked_edge->src; | ||||
|   *gsi = gsi_for_stmt (gsi_stmt (*gsi)); | ||||
|    | ||||
|   /* Create the lock loop ... */ | ||||
|   locked_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU; | ||||
|   make_edge (lock_bb, lock_bb, EDGE_FALSE_VALUE); | ||||
|   set_immediate_dominator (CDI_DOMINATORS, lock_bb, entry_bb); | ||||
|   set_immediate_dominator (CDI_DOMINATORS, update_bb, lock_bb); | ||||
| 
 | ||||
|   /* ... and the loop structure.  */ | ||||
|   loop *lock_loop = alloc_loop (); | ||||
|   lock_loop->header = lock_bb; | ||||
|   lock_loop->latch = lock_bb; | ||||
|   lock_loop->nb_iterations_estimate = 1; | ||||
|   lock_loop->any_estimate = true; | ||||
|   add_loop (lock_loop, entry_bb->loop_father); | ||||
| 
 | ||||
|   /* Build and insert the reduction calculation.  */ | ||||
|   gimple_seq red_seq = NULL; | ||||
|   tree acc_in = make_ssa_name (var_type); | ||||
|   tree ref_in = build_simple_mem_ref (ptr); | ||||
|   TREE_THIS_VOLATILE (ref_in) = 1; | ||||
|   gimplify_assign (acc_in, ref_in, &red_seq); | ||||
|    | ||||
|   tree acc_out = make_ssa_name (var_type); | ||||
|   tree update_expr = fold_build2 (op, var_type, ref_in, var); | ||||
|   gimplify_assign (acc_out, update_expr, &red_seq); | ||||
|    | ||||
|   tree ref_out = build_simple_mem_ref (ptr); | ||||
|   TREE_THIS_VOLATILE (ref_out) = 1; | ||||
|   gimplify_assign (ref_out, acc_out, &red_seq); | ||||
| 
 | ||||
|   gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT); | ||||
| 
 | ||||
|   /* Build & insert the unlock sequence.  */ | ||||
|   gimple_seq unlock_seq = NULL; | ||||
|   tree unlock_expr = nvptx_global_lock_addr (); | ||||
|   unlock_expr = build_call_expr_loc (loc, swap_fn, 3, unlock_expr, | ||||
| 				     uns_locked, uns_unlocked); | ||||
|   gimplify_and_add (unlock_expr, &unlock_seq); | ||||
|   gsi_insert_seq_before (gsi, unlock_seq, GSI_SAME_STMT); | ||||
| 
 | ||||
|   return acc_out; | ||||
| } | ||||
| 
 | ||||
| /* Emit a sequence to update a reduction accumlator at *PTR with the
 | ||||
|    value held in VAR using operator OP.  Return the updated value. | ||||
| 
 | ||||
|    TODO: optimize for atomic ops and indepedent complex ops.  */ | ||||
| 
 | ||||
| static tree | ||||
| nvptx_reduction_update (location_t loc, gimple_stmt_iterator *gsi, | ||||
| 			tree ptr, tree var, tree_code op) | ||||
| { | ||||
|   tree type = TREE_TYPE (var); | ||||
|   tree size = TYPE_SIZE (type); | ||||
| 
 | ||||
|   if (size == TYPE_SIZE (unsigned_type_node) | ||||
|       || size == TYPE_SIZE (long_long_unsigned_type_node)) | ||||
|     return nvptx_lockless_update (loc, gsi, ptr, var, op); | ||||
|   else | ||||
|     return nvptx_lockfull_update (loc, gsi, ptr, var, op); | ||||
| } | ||||
| 
 | ||||
| /* NVPTX implementation of GOACC_REDUCTION_SETUP.  */ | ||||
|  | @ -3944,11 +4106,11 @@ nvptx_goacc_reduction_fini (gcall *call) | |||
| 
 | ||||
|       if (accum) | ||||
| 	{ | ||||
| 	  /* Locklessly update the accumulator.  */ | ||||
| 	  /* UPDATE the accumulator.  */ | ||||
| 	  gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT); | ||||
| 	  seq = NULL; | ||||
| 	  r = nvptx_lockless_update (gimple_location (call), &gsi, | ||||
| 				     accum, var, op); | ||||
| 	  r = nvptx_reduction_update (gimple_location (call), &gsi, | ||||
| 				      accum, var, op); | ||||
| 	} | ||||
|     } | ||||
| 
 | ||||
|  |  | |||
|  | @ -1,3 +1,8 @@ | |||
| 2015-11-18  Nathan Sidwell  <nathan@codesourcery.com> | ||||
| 
 | ||||
| 	* config/nvptx/reduction.c: New. | ||||
| 	* config/nvptx/t-nvptx (LIB2ADD): Add it. | ||||
| 
 | ||||
| 2015-11-15  David Edelsohn  <dje.gcc@gmail.com> | ||||
| 
 | ||||
| 	* config/rs6000/on_exit.c: New file. | ||||
|  |  | |||
|  | @ -0,0 +1,31 @@ | |||
| /* Oversized reductions lock  variable
 | ||||
|    Copyright (C) 2015 Free Software Foundation, Inc. | ||||
|    Contributed by Mentor Graphics. | ||||
| 
 | ||||
| This file is part of GCC. | ||||
| 
 | ||||
| GCC is free software; you can redistribute it and/or modify it under | ||||
| the terms of the GNU General Public License as published by the Free | ||||
| Software Foundation; either version 3, or (at your option) any later | ||||
| version. | ||||
| 
 | ||||
| GCC is distributed in the hope that it will be useful, but WITHOUT ANY | ||||
| WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||||
| FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License | ||||
| for more details. | ||||
| 
 | ||||
| Under Section 7 of GPL version 3, you are granted additional | ||||
| permissions described in the GCC Runtime Library Exception, version | ||||
| 3.1, as published by the Free Software Foundation. | ||||
| 
 | ||||
| You should have received a copy of the GNU General Public License and | ||||
| a copy of the GCC Runtime Library Exception along with this program; | ||||
| see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see | ||||
| <http://www.gnu.org/licenses/>.  */
 | ||||
| 
 | ||||
| 
 | ||||
| /* We use a global lock variable for reductions on objects larger than
 | ||||
|    64 bits.  Until and unless proven that lock contention for | ||||
|    different reduction is a problem, a single lock will suffice.  */ | ||||
| 
 | ||||
| unsigned volatile __reduction_lock = 0; | ||||
|  | @ -1,6 +1,7 @@ | |||
| LIB2ADD=$(srcdir)/config/nvptx/malloc.asm \ | ||||
| 	$(srcdir)/config/nvptx/free.asm \ | ||||
| 	$(srcdir)/config/nvptx/realloc.c | ||||
| 	$(srcdir)/config/nvptx/realloc.c \ | ||||
| 	$(srcdir)/config/nvptx/reduction.c | ||||
| 
 | ||||
| LIB2ADDEH= | ||||
| LIB2FUNCS_EXCLUDE=__main | ||||
|  |  | |||
|  | @ -1,3 +1,9 @@ | |||
| 2015-11-18  Nathan Sidwell  <nathan@codesourcery.com> | ||||
| 
 | ||||
| 	* testsuite/libgomp.oacc-c-c++-common/reduction-cplx-flt.c: Add | ||||
| 	worker & gang cases. | ||||
| 	* testsuite/libgomp.oacc-c-c++-common/reduction-cplx-dbl.c: Likewise. | ||||
| 
 | ||||
| 2015-11-17  Cesar Philippidis  <cesar@codesourcery.com> | ||||
| 
 | ||||
| 	* config/nvptx/priority_queue.c: New file. | ||||
|  |  | |||
|  | @ -14,28 +14,17 @@ int close_enough (double _Complex a, double _Complex b) | |||
|   return mag2_diff / mag2_a < (FRAC * FRAC); | ||||
| } | ||||
| 
 | ||||
| int main (void) | ||||
| { | ||||
| #define N 100 | ||||
|   double _Complex ary[N], sum, prod, tsum, tprod; | ||||
|   int ix; | ||||
| 
 | ||||
|   sum = tsum = 0; | ||||
|   prod = tprod = 1; | ||||
| static int __attribute__ ((noinline)) | ||||
| vector (double _Complex ary[N], double _Complex sum, double _Complex prod) | ||||
| { | ||||
|   double _Complex tsum = 0, tprod = 1; | ||||
| 
 | ||||
|   for (ix = 0; ix < N;  ix++) | ||||
|     { | ||||
|       double frac = ix * (1.0 / 1024) + 1.0; | ||||
|        | ||||
|       ary[ix] = frac + frac * 2.0i - 1.0i; | ||||
|       sum += ary[ix]; | ||||
|       prod *= ary[ix]; | ||||
|     } | ||||
| 
 | ||||
| #pragma acc parallel vector_length(32) copyin(ary) copy (tsum, tprod) | ||||
| #pragma acc parallel vector_length(32) copyin(ary[0:N]) copy (tsum, tprod) | ||||
|   { | ||||
| #pragma acc loop vector reduction(+:tsum) reduction (*:tprod) | ||||
|     for (ix = 0; ix < N; ix++) | ||||
|     for (int ix = 0; ix < N; ix++) | ||||
|       { | ||||
| 	tsum += ary[ix]; | ||||
| 	tprod *= ary[ix]; | ||||
|  | @ -50,3 +39,76 @@ int main (void) | |||
| 
 | ||||
|   return 0; | ||||
| } | ||||
| 
 | ||||
| static int __attribute__ ((noinline)) | ||||
| worker (double _Complex ary[N], double _Complex sum, double _Complex prod) | ||||
| { | ||||
|   double _Complex tsum = 0, tprod = 1; | ||||
| 
 | ||||
| #pragma acc parallel num_workers(32) copyin(ary[0:N]) copy (tsum, tprod) | ||||
|   { | ||||
| #pragma acc loop worker reduction(+:tsum) reduction (*:tprod) | ||||
|     for (int ix = 0; ix < N; ix++) | ||||
|       { | ||||
| 	tsum += ary[ix]; | ||||
| 	tprod *= ary[ix]; | ||||
|       } | ||||
|   } | ||||
| 
 | ||||
|   if (!close_enough (sum, tsum)) | ||||
|     return 1; | ||||
| 
 | ||||
|   if (!close_enough (prod, tprod)) | ||||
|     return 1; | ||||
| 
 | ||||
|   return 0; | ||||
| } | ||||
| 
 | ||||
| static int __attribute__ ((noinline)) | ||||
| gang (double _Complex ary[N], double _Complex sum, double _Complex prod) | ||||
| { | ||||
|   double _Complex tsum = 0, tprod = 1; | ||||
| 
 | ||||
| #pragma acc parallel num_gangs (32) copyin(ary[0:N]) copy (tsum, tprod) | ||||
|   { | ||||
| #pragma acc loop gang reduction(+:tsum) reduction (*:tprod) | ||||
|     for (int ix = 0; ix < N; ix++) | ||||
|       { | ||||
| 	tsum += ary[ix]; | ||||
| 	tprod *= ary[ix]; | ||||
|       } | ||||
|   } | ||||
| 
 | ||||
|   if (!close_enough (sum, tsum)) | ||||
|     return 1; | ||||
| 
 | ||||
|   if (!close_enough (prod, tprod)) | ||||
|     return 1; | ||||
| 
 | ||||
|   return 0; | ||||
| } | ||||
| 
 | ||||
| int main (void) | ||||
| { | ||||
|   double _Complex ary[N], sum = 0, prod = 1; | ||||
| 
 | ||||
|   for (int ix = 0; ix < N;  ix++) | ||||
|     { | ||||
|       double frac = ix * (1.0 / 1024) + 1.0; | ||||
|        | ||||
|       ary[ix] = frac + frac * 2.0i - 1.0i; | ||||
|       sum += ary[ix]; | ||||
|       prod *= ary[ix]; | ||||
|     } | ||||
| 
 | ||||
|   if (vector (ary, sum, prod)) | ||||
|     return 1; | ||||
|    | ||||
|   if (worker (ary, sum, prod)) | ||||
|     return 1; | ||||
| 
 | ||||
|   if (gang (ary, sum, prod)) | ||||
|     return 1; | ||||
| 
 | ||||
|   return 0; | ||||
| } | ||||
|  |  | |||
|  | @ -14,28 +14,17 @@ int close_enough (float _Complex a, float _Complex b) | |||
|   return mag2_diff / mag2_a < (FRAC * FRAC); | ||||
| } | ||||
| 
 | ||||
| int main (void) | ||||
| { | ||||
| #define N 100 | ||||
|   float _Complex ary[N], sum, prod, tsum, tprod; | ||||
|   int ix; | ||||
| 
 | ||||
|   sum = tsum = 0; | ||||
|   prod = tprod = 1; | ||||
| static int __attribute__ ((noinline)) | ||||
| vector (float _Complex ary[N], float _Complex sum, float _Complex prod) | ||||
| { | ||||
|   float _Complex tsum = 0, tprod = 1; | ||||
| 
 | ||||
|   for (ix = 0; ix < N;  ix++) | ||||
|     { | ||||
|       float frac = ix * (1.0f / 1024) + 1.0f; | ||||
|        | ||||
|       ary[ix] = frac + frac * 2.0i - 1.0i; | ||||
|       sum += ary[ix]; | ||||
|       prod *= ary[ix]; | ||||
|     } | ||||
| 
 | ||||
| #pragma acc parallel vector_length(32) copyin(ary) copy (tsum, tprod) | ||||
| #pragma acc parallel vector_length(32) copyin(ary[0:N]) copy (tsum, tprod) | ||||
|   { | ||||
| #pragma acc loop vector reduction(+:tsum) reduction (*:tprod) | ||||
|     for (ix = 0; ix < N; ix++) | ||||
|     for (int ix = 0; ix < N; ix++) | ||||
|       { | ||||
| 	tsum += ary[ix]; | ||||
| 	tprod *= ary[ix]; | ||||
|  | @ -50,3 +39,76 @@ int main (void) | |||
| 
 | ||||
|   return 0; | ||||
| } | ||||
| 
 | ||||
| static int __attribute__ ((noinline)) | ||||
| worker (float _Complex ary[N], float _Complex sum, float _Complex prod) | ||||
| { | ||||
|   float _Complex tsum = 0, tprod = 1; | ||||
| 
 | ||||
| #pragma acc parallel num_workers(32) copyin(ary[0:N]) copy (tsum, tprod) | ||||
|   { | ||||
| #pragma acc loop worker reduction(+:tsum) reduction (*:tprod) | ||||
|     for (int ix = 0; ix < N; ix++) | ||||
|       { | ||||
| 	tsum += ary[ix]; | ||||
| 	tprod *= ary[ix]; | ||||
|       } | ||||
|   } | ||||
| 
 | ||||
|   if (!close_enough (sum, tsum)) | ||||
|     return 1; | ||||
| 
 | ||||
|   if (!close_enough (prod, tprod)) | ||||
|     return 1; | ||||
| 
 | ||||
|   return 0; | ||||
| } | ||||
| 
 | ||||
| static int __attribute__ ((noinline)) | ||||
| gang (float _Complex ary[N], float _Complex sum, float _Complex prod) | ||||
| { | ||||
|   float _Complex tsum = 0, tprod = 1; | ||||
| 
 | ||||
| #pragma acc parallel num_gangs (32) copyin(ary[0:N]) copy (tsum, tprod) | ||||
|   { | ||||
| #pragma acc loop gang reduction(+:tsum) reduction (*:tprod) | ||||
|     for (int ix = 0; ix < N; ix++) | ||||
|       { | ||||
| 	tsum += ary[ix]; | ||||
| 	tprod *= ary[ix]; | ||||
|       } | ||||
|   } | ||||
| 
 | ||||
|   if (!close_enough (sum, tsum)) | ||||
|     return 1; | ||||
| 
 | ||||
|   if (!close_enough (prod, tprod)) | ||||
|     return 1; | ||||
| 
 | ||||
|   return 0; | ||||
| } | ||||
| 
 | ||||
| int main (void) | ||||
| { | ||||
|   float _Complex ary[N], sum = 0, prod = 1; | ||||
| 
 | ||||
|   for (int ix = 0; ix < N;  ix++) | ||||
|     { | ||||
|       float frac = ix * (1.0f / 1024) + 1.0f; | ||||
|        | ||||
|       ary[ix] = frac + frac * 2.0i - 1.0i; | ||||
|       sum += ary[ix]; | ||||
|       prod *= ary[ix]; | ||||
|     } | ||||
| 
 | ||||
|   if (vector (ary, sum, prod)) | ||||
|     return 1; | ||||
|    | ||||
|   if (worker (ary, sum, prod)) | ||||
|     return 1; | ||||
| 
 | ||||
|   if (gang (ary, sum, prod)) | ||||
|     return 1; | ||||
| 
 | ||||
|   return 0; | ||||
| } | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	 Nathan Sidwell
						Nathan Sidwell