mirror of git://gcc.gnu.org/git/gcc.git
libgomp: Simplify OpenMP reverse offload host <-> device memory copy implementation
... by using the existing 'goacc_asyncqueue' instead of re-coding parts of it. Follow-up to commit 131d18e928 "libgomp/nvptx: Prepare for reverse-offload callback handling", and commit ea4b23d9c8 "libgomp: Handle OpenMP's reverse offloads". libgomp/ * target.c (gomp_target_rev): Instead of 'dev_to_host_cpy', 'host_to_dev_cpy', 'token', take a single 'goacc_asyncqueue'. * libgomp.h (gomp_target_rev): Adjust. * libgomp-plugin.c (GOMP_PLUGIN_target_rev): Adjust. * libgomp-plugin.h (GOMP_PLUGIN_target_rev): Adjust. * plugin/plugin-gcn.c (process_reverse_offload): Adjust. * plugin/plugin-nvptx.c (rev_off_dev_to_host_cpy) (rev_off_host_to_dev_cpy): Remove. (GOMP_OFFLOAD_run): Adjust.
This commit is contained in:
parent
bd6dbdb196
commit
130c2f3c3a
|
|
@ -82,11 +82,8 @@ GOMP_PLUGIN_fatal (const char *msg, ...)
|
|||
void
|
||||
GOMP_PLUGIN_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
|
||||
uint64_t sizes_ptr, uint64_t kinds_ptr, int dev_num,
|
||||
void (*dev_to_host_cpy) (void *, const void *, size_t,
|
||||
void *),
|
||||
void (*host_to_dev_cpy) (void *, const void *, size_t,
|
||||
void *), void *token)
|
||||
struct goacc_asyncqueue *aq)
|
||||
{
|
||||
gomp_target_rev (fn_ptr, mapnum, devaddrs_ptr, sizes_ptr, kinds_ptr, dev_num,
|
||||
dev_to_host_cpy, host_to_dev_cpy, token);
|
||||
aq);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -121,11 +121,7 @@ extern void GOMP_PLUGIN_fatal (const char *, ...)
|
|||
__attribute__ ((noreturn, format (printf, 1, 2)));
|
||||
|
||||
extern void GOMP_PLUGIN_target_rev (uint64_t, uint64_t, uint64_t, uint64_t,
|
||||
uint64_t, int,
|
||||
void (*) (void *, const void *, size_t,
|
||||
void *),
|
||||
void (*) (void *, const void *, size_t,
|
||||
void *), void *);
|
||||
uint64_t, int, struct goacc_asyncqueue *);
|
||||
|
||||
/* Prototypes for functions implemented by libgomp plugins. */
|
||||
extern const char *GOMP_OFFLOAD_get_name (void);
|
||||
|
|
|
|||
|
|
@ -1130,10 +1130,7 @@ extern void gomp_init_targets_once (void);
|
|||
extern int gomp_get_num_devices (void);
|
||||
extern bool gomp_target_task_fn (void *);
|
||||
extern void gomp_target_rev (uint64_t, uint64_t, uint64_t, uint64_t, uint64_t,
|
||||
int,
|
||||
void (*) (void *, const void *, size_t, void *),
|
||||
void (*) (void *, const void *, size_t, void *),
|
||||
void *);
|
||||
int, struct goacc_asyncqueue *);
|
||||
|
||||
/* Splay tree definitions. */
|
||||
typedef struct splay_tree_node_s *splay_tree_node;
|
||||
|
|
|
|||
|
|
@ -1949,7 +1949,7 @@ process_reverse_offload (uint64_t fn, uint64_t mapnum, uint64_t hostaddrs,
|
|||
{
|
||||
int dev_num = dev_num64;
|
||||
GOMP_PLUGIN_target_rev (fn, mapnum, hostaddrs, sizes, kinds, dev_num,
|
||||
NULL, NULL, NULL);
|
||||
NULL);
|
||||
}
|
||||
|
||||
/* Output any data written to console output from the kernel. It is expected
|
||||
|
|
|
|||
|
|
@ -56,6 +56,7 @@
|
|||
#include <unistd.h>
|
||||
#include <assert.h>
|
||||
#include <errno.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
/* An arbitrary fixed limit (128MB) for the size of the OpenMP soft stacks
|
||||
block to cache between kernel invocations. For soft-stacks blocks bigger
|
||||
|
|
@ -1625,11 +1626,11 @@ GOMP_OFFLOAD_openacc_cuda_set_stream (struct goacc_asyncqueue *aq, void *stream)
|
|||
return 1;
|
||||
}
|
||||
|
||||
struct goacc_asyncqueue *
|
||||
GOMP_OFFLOAD_openacc_async_construct (int device __attribute__((unused)))
|
||||
static struct goacc_asyncqueue *
|
||||
nvptx_goacc_asyncqueue_construct (unsigned int flags)
|
||||
{
|
||||
CUstream stream = NULL;
|
||||
CUDA_CALL_ERET (NULL, cuStreamCreate, &stream, CU_STREAM_DEFAULT);
|
||||
CUDA_CALL_ERET (NULL, cuStreamCreate, &stream, flags);
|
||||
|
||||
struct goacc_asyncqueue *aq
|
||||
= GOMP_PLUGIN_malloc (sizeof (struct goacc_asyncqueue));
|
||||
|
|
@ -1637,14 +1638,26 @@ GOMP_OFFLOAD_openacc_async_construct (int device __attribute__((unused)))
|
|||
return aq;
|
||||
}
|
||||
|
||||
bool
|
||||
GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
|
||||
struct goacc_asyncqueue *
|
||||
GOMP_OFFLOAD_openacc_async_construct (int device __attribute__((unused)))
|
||||
{
|
||||
return nvptx_goacc_asyncqueue_construct (CU_STREAM_DEFAULT);
|
||||
}
|
||||
|
||||
static bool
|
||||
nvptx_goacc_asyncqueue_destruct (struct goacc_asyncqueue *aq)
|
||||
{
|
||||
CUDA_CALL_ERET (false, cuStreamDestroy, aq->cuda_stream);
|
||||
free (aq);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool
|
||||
GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
|
||||
{
|
||||
return nvptx_goacc_asyncqueue_destruct (aq);
|
||||
}
|
||||
|
||||
int
|
||||
GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
|
||||
{
|
||||
|
|
@ -1658,13 +1671,19 @@ GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
|
|||
return -1;
|
||||
}
|
||||
|
||||
bool
|
||||
GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
|
||||
static bool
|
||||
nvptx_goacc_asyncqueue_synchronize (struct goacc_asyncqueue *aq)
|
||||
{
|
||||
CUDA_CALL_ERET (false, cuStreamSynchronize, aq->cuda_stream);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool
|
||||
GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
|
||||
{
|
||||
return nvptx_goacc_asyncqueue_synchronize (aq);
|
||||
}
|
||||
|
||||
bool
|
||||
GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
|
||||
struct goacc_asyncqueue *aq2)
|
||||
|
|
@ -1924,22 +1943,6 @@ nvptx_stacks_acquire (struct ptx_device *ptx_dev, size_t size, int num)
|
|||
}
|
||||
|
||||
|
||||
void
|
||||
rev_off_dev_to_host_cpy (void *dest, const void *src, size_t size,
|
||||
CUstream stream)
|
||||
{
|
||||
CUDA_CALL_ASSERT (cuMemcpyDtoHAsync, dest, (CUdeviceptr) src, size, stream);
|
||||
CUDA_CALL_ASSERT (cuStreamSynchronize, stream);
|
||||
}
|
||||
|
||||
void
|
||||
rev_off_host_to_dev_cpy (void *dest, const void *src, size_t size,
|
||||
CUstream stream)
|
||||
{
|
||||
CUDA_CALL_ASSERT (cuMemcpyHtoDAsync, (CUdeviceptr) dest, src, size, stream);
|
||||
CUDA_CALL_ASSERT (cuStreamSynchronize, stream);
|
||||
}
|
||||
|
||||
void
|
||||
GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
|
||||
{
|
||||
|
|
@ -1973,9 +1976,17 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
|
|||
}
|
||||
nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads);
|
||||
|
||||
size_t stack_size = nvptx_stacks_size ();
|
||||
bool reverse_offload = ptx_dev->rev_data != NULL;
|
||||
CUstream copy_stream = NULL;
|
||||
struct goacc_asyncqueue *reverse_offload_aq = NULL;
|
||||
if (reverse_offload)
|
||||
{
|
||||
reverse_offload_aq
|
||||
= nvptx_goacc_asyncqueue_construct (CU_STREAM_NON_BLOCKING);
|
||||
if (!reverse_offload_aq)
|
||||
exit (EXIT_FAILURE);
|
||||
}
|
||||
|
||||
size_t stack_size = nvptx_stacks_size ();
|
||||
|
||||
pthread_mutex_lock (&ptx_dev->omp_stacks.lock);
|
||||
void *stacks = nvptx_stacks_acquire (ptx_dev, stack_size, teams * threads);
|
||||
|
|
@ -1989,8 +2000,6 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
|
|||
GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
|
||||
" [(teams: %u), 1, 1] [(lanes: 32), (threads: %u), 1]\n",
|
||||
__FUNCTION__, fn_name, teams, threads);
|
||||
if (reverse_offload)
|
||||
CUDA_CALL_ASSERT (cuStreamCreate, &copy_stream, CU_STREAM_NON_BLOCKING);
|
||||
r = CUDA_CALL_NOCHECK (cuLaunchKernel, function, teams, 1, 1,
|
||||
32, threads, 1, 0, NULL, NULL, config);
|
||||
if (r != CUDA_SUCCESS)
|
||||
|
|
@ -2013,17 +2022,15 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
|
|||
GOMP_PLUGIN_target_rev (rev_data->fn, rev_data->mapnum,
|
||||
rev_data->addrs, rev_data->sizes,
|
||||
rev_data->kinds, rev_data->dev_num,
|
||||
rev_off_dev_to_host_cpy,
|
||||
rev_off_host_to_dev_cpy, copy_stream);
|
||||
CUDA_CALL_ASSERT (cuStreamSynchronize, copy_stream);
|
||||
reverse_offload_aq);
|
||||
if (!nvptx_goacc_asyncqueue_synchronize (reverse_offload_aq))
|
||||
exit (EXIT_FAILURE);
|
||||
__atomic_store_n (&rev_data->fn, 0, __ATOMIC_RELEASE);
|
||||
}
|
||||
usleep (1);
|
||||
}
|
||||
else
|
||||
r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
|
||||
if (reverse_offload)
|
||||
CUDA_CALL_ASSERT (cuStreamDestroy, copy_stream);
|
||||
if (r == CUDA_ERROR_LAUNCH_FAILED)
|
||||
GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
|
||||
maybe_abort_msg);
|
||||
|
|
@ -2031,6 +2038,12 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
|
|||
GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
|
||||
|
||||
pthread_mutex_unlock (&ptx_dev->omp_stacks.lock);
|
||||
|
||||
if (reverse_offload)
|
||||
{
|
||||
if (!nvptx_goacc_asyncqueue_destruct (reverse_offload_aq))
|
||||
exit (EXIT_FAILURE);
|
||||
}
|
||||
}
|
||||
|
||||
/* TODO: Implement GOMP_OFFLOAD_async_run. */
|
||||
|
|
|
|||
102
libgomp/target.c
102
libgomp/target.c
|
|
@ -3299,9 +3299,7 @@ gomp_map_cdata_lookup (struct cpy_data *d, uint64_t *devaddrs,
|
|||
void
|
||||
gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
|
||||
uint64_t sizes_ptr, uint64_t kinds_ptr, int dev_num,
|
||||
void (*dev_to_host_cpy) (void *, const void *, size_t, void*),
|
||||
void (*host_to_dev_cpy) (void *, const void *, size_t, void*),
|
||||
void *token)
|
||||
struct goacc_asyncqueue *aq)
|
||||
{
|
||||
/* Return early if there is no offload code. */
|
||||
if (sizeof (OFFLOAD_PLUGINS) == sizeof (""))
|
||||
|
|
@ -3343,26 +3341,17 @@ gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
|
|||
devaddrs = (uint64_t *) gomp_malloc (mapnum * sizeof (uint64_t));
|
||||
sizes = (uint64_t *) gomp_malloc (mapnum * sizeof (uint64_t));
|
||||
kinds = (unsigned short *) gomp_malloc (mapnum * sizeof (unsigned short));
|
||||
if (dev_to_host_cpy)
|
||||
{
|
||||
dev_to_host_cpy (devaddrs, (const void *) (uintptr_t) devaddrs_ptr,
|
||||
mapnum * sizeof (uint64_t), token);
|
||||
dev_to_host_cpy (sizes, (const void *) (uintptr_t) sizes_ptr,
|
||||
mapnum * sizeof (uint64_t), token);
|
||||
dev_to_host_cpy (kinds, (const void *) (uintptr_t) kinds_ptr,
|
||||
mapnum * sizeof (unsigned short), token);
|
||||
}
|
||||
else
|
||||
{
|
||||
gomp_copy_dev2host (devicep, NULL, devaddrs,
|
||||
(const void *) (uintptr_t) devaddrs_ptr,
|
||||
mapnum * sizeof (uint64_t));
|
||||
gomp_copy_dev2host (devicep, NULL, sizes,
|
||||
(const void *) (uintptr_t) sizes_ptr,
|
||||
mapnum * sizeof (uint64_t));
|
||||
gomp_copy_dev2host (devicep, NULL, kinds, (const void *) (uintptr_t) kinds_ptr,
|
||||
mapnum * sizeof (unsigned short));
|
||||
}
|
||||
gomp_copy_dev2host (devicep, aq, devaddrs,
|
||||
(const void *) (uintptr_t) devaddrs_ptr,
|
||||
mapnum * sizeof (uint64_t));
|
||||
gomp_copy_dev2host (devicep, aq, sizes,
|
||||
(const void *) (uintptr_t) sizes_ptr,
|
||||
mapnum * sizeof (uint64_t));
|
||||
gomp_copy_dev2host (devicep, aq, kinds,
|
||||
(const void *) (uintptr_t) kinds_ptr,
|
||||
mapnum * sizeof (unsigned short));
|
||||
if (aq && !devicep->openacc.async.synchronize_func (aq))
|
||||
exit (EXIT_FAILURE);
|
||||
}
|
||||
|
||||
size_t tgt_align = 0, tgt_size = 0;
|
||||
|
|
@ -3389,13 +3378,14 @@ gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
|
|||
if (devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
|
||||
memcpy (tgt + tgt_size, (void *) (uintptr_t) devaddrs[i],
|
||||
(size_t) sizes[i]);
|
||||
else if (dev_to_host_cpy)
|
||||
dev_to_host_cpy (tgt + tgt_size, (void *) (uintptr_t) devaddrs[i],
|
||||
(size_t) sizes[i], token);
|
||||
else
|
||||
gomp_copy_dev2host (devicep, NULL, tgt + tgt_size,
|
||||
(void *) (uintptr_t) devaddrs[i],
|
||||
(size_t) sizes[i]);
|
||||
{
|
||||
gomp_copy_dev2host (devicep, aq, tgt + tgt_size,
|
||||
(void *) (uintptr_t) devaddrs[i],
|
||||
(size_t) sizes[i]);
|
||||
if (aq && !devicep->openacc.async.synchronize_func (aq))
|
||||
exit (EXIT_FAILURE);
|
||||
}
|
||||
devaddrs[i] = (uint64_t) (uintptr_t) tgt + tgt_size;
|
||||
tgt_size = tgt_size + sizes[i];
|
||||
if ((devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
|
||||
|
|
@ -3485,15 +3475,15 @@ gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
|
|||
|| kind == GOMP_MAP_ALWAYS_TO
|
||||
|| kind == GOMP_MAP_ALWAYS_TOFROM)
|
||||
{
|
||||
if (dev_to_host_cpy)
|
||||
dev_to_host_cpy ((void *) (uintptr_t) devaddrs[i],
|
||||
(void *) (uintptr_t) cdata[i].devaddr,
|
||||
sizes[i], token);
|
||||
else
|
||||
gomp_copy_dev2host (devicep, NULL,
|
||||
(void *) (uintptr_t) devaddrs[i],
|
||||
(void *) (uintptr_t) cdata[i].devaddr,
|
||||
sizes[i]);
|
||||
gomp_copy_dev2host (devicep, aq,
|
||||
(void *) (uintptr_t) devaddrs[i],
|
||||
(void *) (uintptr_t) cdata[i].devaddr,
|
||||
sizes[i]);
|
||||
if (aq && !devicep->openacc.async.synchronize_func (aq))
|
||||
{
|
||||
gomp_mutex_unlock (&devicep->lock);
|
||||
exit (EXIT_FAILURE);
|
||||
}
|
||||
}
|
||||
if (struct_cpy)
|
||||
struct_cpy--;
|
||||
|
|
@ -3560,15 +3550,15 @@ gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
|
|||
devaddrs[i]
|
||||
= (uint64_t) (uintptr_t) gomp_aligned_alloc (align,
|
||||
sizes[i]);
|
||||
if (dev_to_host_cpy)
|
||||
dev_to_host_cpy ((void *) (uintptr_t) devaddrs[i],
|
||||
(void *) (uintptr_t) cdata[i].devaddr,
|
||||
sizes[i], token);
|
||||
else
|
||||
gomp_copy_dev2host (devicep, NULL,
|
||||
(void *) (uintptr_t) devaddrs[i],
|
||||
(void *) (uintptr_t) cdata[i].devaddr,
|
||||
sizes[i]);
|
||||
gomp_copy_dev2host (devicep, aq,
|
||||
(void *) (uintptr_t) devaddrs[i],
|
||||
(void *) (uintptr_t) cdata[i].devaddr,
|
||||
sizes[i]);
|
||||
if (aq && !devicep->openacc.async.synchronize_func (aq))
|
||||
{
|
||||
gomp_mutex_unlock (&devicep->lock);
|
||||
exit (EXIT_FAILURE);
|
||||
}
|
||||
}
|
||||
for (j = i + 1; j < mapnum; j++)
|
||||
{
|
||||
|
|
@ -3672,15 +3662,15 @@ gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
|
|||
/* FALLTHRU */
|
||||
case GOMP_MAP_FROM:
|
||||
case GOMP_MAP_TOFROM:
|
||||
if (copy && host_to_dev_cpy)
|
||||
host_to_dev_cpy ((void *) (uintptr_t) cdata[i].devaddr,
|
||||
(void *) (uintptr_t) devaddrs[i],
|
||||
sizes[i], token);
|
||||
else if (copy)
|
||||
gomp_copy_host2dev (devicep, NULL,
|
||||
(void *) (uintptr_t) cdata[i].devaddr,
|
||||
(void *) (uintptr_t) devaddrs[i],
|
||||
sizes[i], false, NULL);
|
||||
if (copy)
|
||||
{
|
||||
gomp_copy_host2dev (devicep, aq,
|
||||
(void *) (uintptr_t) cdata[i].devaddr,
|
||||
(void *) (uintptr_t) devaddrs[i],
|
||||
sizes[i], false, NULL);
|
||||
if (aq && !devicep->openacc.async.synchronize_func (aq))
|
||||
exit (EXIT_FAILURE);
|
||||
}
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue