mirror of git://gcc.gnu.org/git/gcc.git
[nvptx] Remove use of CUDA unified memory in libgomp
libgomp/ * plugin/plugin-nvptx.c (struct cuda_map): New. (struct ptx_stream): Replace d, h, h_begin, h_end, h_next, h_prev, h_tail with (cuda_map *) map. (cuda_map_create): New function. (cuda_map_destroy): New function. (map_init): Update to use a linked list of cuda_map objects. (map_fini): Likewise. (map_pop): Likewise. (map_push): Likewise. Return CUdeviceptr instead of void. (init_streams_for_device): Remove stale references to ptx_stream members. (select_stream_for_async): Likewise. (nvptx_exec): Update call to map_init. From-SVN: r264397
This commit is contained in:
parent
5e594075c8
commit
2049befdd0
|
@ -1,3 +1,19 @@
|
||||||
|
2018-09-18 Cesar Philippidis <cesar@codesourcery.com>
|
||||||
|
|
||||||
|
* plugin/plugin-nvptx.c (struct cuda_map): New.
|
||||||
|
(struct ptx_stream): Replace d, h, h_begin, h_end, h_next, h_prev,
|
||||||
|
h_tail with (cuda_map *) map.
|
||||||
|
(cuda_map_create): New function.
|
||||||
|
(cuda_map_destroy): New function.
|
||||||
|
(map_init): Update to use a linked list of cuda_map objects.
|
||||||
|
(map_fini): Likewise.
|
||||||
|
(map_pop): Likewise.
|
||||||
|
(map_push): Likewise. Return CUdeviceptr instead of void.
|
||||||
|
(init_streams_for_device): Remove stale references to ptx_stream
|
||||||
|
members.
|
||||||
|
(select_stream_for_async): Likewise.
|
||||||
|
(nvptx_exec): Update call to map_init.
|
||||||
|
|
||||||
2018-09-09 Cesar Philippidis <cesar@codesourcery.com>
|
2018-09-09 Cesar Philippidis <cesar@codesourcery.com>
|
||||||
Julian Brown <julian@codesourcery.com>
|
Julian Brown <julian@codesourcery.com>
|
||||||
|
|
||||||
|
|
|
@ -192,20 +192,20 @@ cuda_error (CUresult r)
|
||||||
static unsigned int instantiated_devices = 0;
|
static unsigned int instantiated_devices = 0;
|
||||||
static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
|
static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
|
||||||
|
|
||||||
|
struct cuda_map
|
||||||
|
{
|
||||||
|
CUdeviceptr d;
|
||||||
|
size_t size;
|
||||||
|
bool active;
|
||||||
|
struct cuda_map *next;
|
||||||
|
};
|
||||||
|
|
||||||
struct ptx_stream
|
struct ptx_stream
|
||||||
{
|
{
|
||||||
CUstream stream;
|
CUstream stream;
|
||||||
pthread_t host_thread;
|
pthread_t host_thread;
|
||||||
bool multithreaded;
|
bool multithreaded;
|
||||||
|
struct cuda_map *map;
|
||||||
CUdeviceptr d;
|
|
||||||
void *h;
|
|
||||||
void *h_begin;
|
|
||||||
void *h_end;
|
|
||||||
void *h_next;
|
|
||||||
void *h_prev;
|
|
||||||
void *h_tail;
|
|
||||||
|
|
||||||
struct ptx_stream *next;
|
struct ptx_stream *next;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -217,103 +217,116 @@ struct nvptx_thread
|
||||||
struct ptx_device *ptx_dev;
|
struct ptx_device *ptx_dev;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
static struct cuda_map *
|
||||||
|
cuda_map_create (size_t size)
|
||||||
|
{
|
||||||
|
struct cuda_map *map = GOMP_PLUGIN_malloc (sizeof (struct cuda_map));
|
||||||
|
|
||||||
|
assert (map);
|
||||||
|
|
||||||
|
map->next = NULL;
|
||||||
|
map->size = size;
|
||||||
|
map->active = false;
|
||||||
|
|
||||||
|
CUDA_CALL_ERET (NULL, cuMemAlloc, &map->d, size);
|
||||||
|
assert (map->d);
|
||||||
|
|
||||||
|
return map;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
cuda_map_destroy (struct cuda_map *map)
|
||||||
|
{
|
||||||
|
CUDA_CALL_ASSERT (cuMemFree, map->d);
|
||||||
|
free (map);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* The following map_* routines manage the CUDA device memory that
|
||||||
|
contains the data mapping arguments for cuLaunchKernel. Each
|
||||||
|
asynchronous PTX stream may have multiple pending kernel
|
||||||
|
invocations, which are launched in a FIFO order. As such, the map
|
||||||
|
routines maintain a queue of cuLaunchKernel arguments.
|
||||||
|
|
||||||
|
Calls to map_push and map_pop must be guarded by ptx_event_lock.
|
||||||
|
Likewise, calls to map_init and map_fini are guarded by
|
||||||
|
ptx_dev_lock inside GOMP_OFFLOAD_init_device and
|
||||||
|
GOMP_OFFLOAD_fini_device, respectively. */
|
||||||
|
|
||||||
static bool
|
static bool
|
||||||
map_init (struct ptx_stream *s)
|
map_init (struct ptx_stream *s)
|
||||||
{
|
{
|
||||||
int size = getpagesize ();
|
int size = getpagesize ();
|
||||||
|
|
||||||
assert (s);
|
assert (s);
|
||||||
assert (!s->d);
|
|
||||||
assert (!s->h);
|
|
||||||
|
|
||||||
CUDA_CALL (cuMemAllocHost, &s->h, size);
|
s->map = cuda_map_create (size);
|
||||||
CUDA_CALL (cuMemHostGetDevicePointer, &s->d, s->h, 0);
|
|
||||||
|
|
||||||
assert (s->h);
|
|
||||||
|
|
||||||
s->h_begin = s->h;
|
|
||||||
s->h_end = s->h_begin + size;
|
|
||||||
s->h_next = s->h_prev = s->h_tail = s->h_begin;
|
|
||||||
|
|
||||||
assert (s->h_next);
|
|
||||||
assert (s->h_end);
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool
|
static bool
|
||||||
map_fini (struct ptx_stream *s)
|
map_fini (struct ptx_stream *s)
|
||||||
{
|
{
|
||||||
CUDA_CALL (cuMemFreeHost, s->h);
|
assert (s->map->next == NULL);
|
||||||
|
assert (!s->map->active);
|
||||||
|
|
||||||
|
cuda_map_destroy (s->map);
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
map_pop (struct ptx_stream *s)
|
map_pop (struct ptx_stream *s)
|
||||||
{
|
{
|
||||||
assert (s != NULL);
|
struct cuda_map *next;
|
||||||
assert (s->h_next);
|
|
||||||
assert (s->h_prev);
|
|
||||||
assert (s->h_tail);
|
|
||||||
|
|
||||||
s->h_tail = s->h_next;
|
|
||||||
|
|
||||||
if (s->h_tail >= s->h_end)
|
|
||||||
s->h_tail = s->h_begin + (int) (s->h_tail - s->h_end);
|
|
||||||
|
|
||||||
if (s->h_next == s->h_tail)
|
|
||||||
s->h_prev = s->h_next;
|
|
||||||
|
|
||||||
assert (s->h_next >= s->h_begin);
|
|
||||||
assert (s->h_tail >= s->h_begin);
|
|
||||||
assert (s->h_prev >= s->h_begin);
|
|
||||||
|
|
||||||
assert (s->h_next <= s->h_end);
|
|
||||||
assert (s->h_tail <= s->h_end);
|
|
||||||
assert (s->h_prev <= s->h_end);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void
|
|
||||||
map_push (struct ptx_stream *s, size_t size, void **h, void **d)
|
|
||||||
{
|
|
||||||
int left;
|
|
||||||
int offset;
|
|
||||||
|
|
||||||
assert (s != NULL);
|
assert (s != NULL);
|
||||||
|
|
||||||
left = s->h_end - s->h_next;
|
if (s->map->next == NULL)
|
||||||
|
|
||||||
assert (s->h_prev);
|
|
||||||
assert (s->h_next);
|
|
||||||
|
|
||||||
if (size >= left)
|
|
||||||
{
|
{
|
||||||
assert (s->h_next == s->h_prev);
|
s->map->active = false;
|
||||||
s->h_next = s->h_prev = s->h_tail = s->h_begin;
|
|
||||||
}
|
|
||||||
|
|
||||||
assert (s->h_next);
|
|
||||||
|
|
||||||
offset = s->h_next - s->h;
|
|
||||||
|
|
||||||
*d = (void *)(s->d + offset);
|
|
||||||
*h = (void *)(s->h + offset);
|
|
||||||
|
|
||||||
s->h_prev = s->h_next;
|
|
||||||
s->h_next += size;
|
|
||||||
|
|
||||||
assert (s->h_prev);
|
|
||||||
assert (s->h_next);
|
|
||||||
|
|
||||||
assert (s->h_next >= s->h_begin);
|
|
||||||
assert (s->h_tail >= s->h_begin);
|
|
||||||
assert (s->h_prev >= s->h_begin);
|
|
||||||
assert (s->h_next <= s->h_end);
|
|
||||||
assert (s->h_tail <= s->h_end);
|
|
||||||
assert (s->h_prev <= s->h_end);
|
|
||||||
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
next = s->map->next;
|
||||||
|
cuda_map_destroy (s->map);
|
||||||
|
s->map = next;
|
||||||
|
}
|
||||||
|
|
||||||
|
static CUdeviceptr
|
||||||
|
map_push (struct ptx_stream *s, size_t size)
|
||||||
|
{
|
||||||
|
struct cuda_map *map = NULL, *t = NULL;
|
||||||
|
|
||||||
|
assert (s);
|
||||||
|
assert (s->map);
|
||||||
|
|
||||||
|
/* Each PTX stream requires a separate data region to store the
|
||||||
|
launch arguments for cuLaunchKernel. Allocate a new
|
||||||
|
cuda_map and push it to the end of the list. */
|
||||||
|
if (s->map->active)
|
||||||
|
{
|
||||||
|
map = cuda_map_create (size);
|
||||||
|
|
||||||
|
for (t = s->map; t->next != NULL; t = t->next)
|
||||||
|
;
|
||||||
|
|
||||||
|
t->next = map;
|
||||||
|
}
|
||||||
|
else if (s->map->size < size)
|
||||||
|
{
|
||||||
|
cuda_map_destroy (s->map);
|
||||||
|
map = cuda_map_create (size);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
map = s->map;
|
||||||
|
|
||||||
|
s->map = map;
|
||||||
|
s->map->active = true;
|
||||||
|
|
||||||
|
return s->map->d;
|
||||||
|
}
|
||||||
|
|
||||||
/* Target data function launch information. */
|
/* Target data function launch information. */
|
||||||
|
|
||||||
struct targ_fn_launch
|
struct targ_fn_launch
|
||||||
|
@ -442,8 +455,6 @@ init_streams_for_device (struct ptx_device *ptx_dev, int concurrency)
|
||||||
null_stream->stream = NULL;
|
null_stream->stream = NULL;
|
||||||
null_stream->host_thread = pthread_self ();
|
null_stream->host_thread = pthread_self ();
|
||||||
null_stream->multithreaded = true;
|
null_stream->multithreaded = true;
|
||||||
null_stream->d = (CUdeviceptr) NULL;
|
|
||||||
null_stream->h = NULL;
|
|
||||||
if (!map_init (null_stream))
|
if (!map_init (null_stream))
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
|
@ -578,8 +589,6 @@ select_stream_for_async (int async, pthread_t thread, bool create,
|
||||||
s->host_thread = thread;
|
s->host_thread = thread;
|
||||||
s->multithreaded = false;
|
s->multithreaded = false;
|
||||||
|
|
||||||
s->d = (CUdeviceptr) NULL;
|
|
||||||
s->h = NULL;
|
|
||||||
if (!map_init (s))
|
if (!map_init (s))
|
||||||
{
|
{
|
||||||
pthread_mutex_unlock (&ptx_dev->stream_lock);
|
pthread_mutex_unlock (&ptx_dev->stream_lock);
|
||||||
|
@ -1120,7 +1129,8 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
|
||||||
int i;
|
int i;
|
||||||
struct ptx_stream *dev_str;
|
struct ptx_stream *dev_str;
|
||||||
void *kargs[1];
|
void *kargs[1];
|
||||||
void *hp, *dp;
|
void *hp;
|
||||||
|
CUdeviceptr dp;
|
||||||
struct nvptx_thread *nvthd = nvptx_thread ();
|
struct nvptx_thread *nvthd = nvptx_thread ();
|
||||||
int warp_size = nvthd->ptx_dev->warp_size;
|
int warp_size = nvthd->ptx_dev->warp_size;
|
||||||
const char *maybe_abort_msg = "(perhaps abort was called)";
|
const char *maybe_abort_msg = "(perhaps abort was called)";
|
||||||
|
@ -1295,17 +1305,19 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
|
||||||
/* This reserves a chunk of a pre-allocated page of memory mapped on both
|
/* This reserves a chunk of a pre-allocated page of memory mapped on both
|
||||||
the host and the device. HP is a host pointer to the new chunk, and DP is
|
the host and the device. HP is a host pointer to the new chunk, and DP is
|
||||||
the corresponding device pointer. */
|
the corresponding device pointer. */
|
||||||
map_push (dev_str, mapnum * sizeof (void *), &hp, &dp);
|
pthread_mutex_lock (&ptx_event_lock);
|
||||||
|
dp = map_push (dev_str, mapnum * sizeof (void *));
|
||||||
|
pthread_mutex_unlock (&ptx_event_lock);
|
||||||
|
|
||||||
GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__);
|
GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__);
|
||||||
|
|
||||||
/* Copy the array of arguments to the mapped page. */
|
/* Copy the array of arguments to the mapped page. */
|
||||||
|
hp = alloca(sizeof(void *) * mapnum);
|
||||||
for (i = 0; i < mapnum; i++)
|
for (i = 0; i < mapnum; i++)
|
||||||
((void **) hp)[i] = devaddrs[i];
|
((void **) hp)[i] = devaddrs[i];
|
||||||
|
|
||||||
/* Copy the (device) pointers to arguments to the device (dp and hp might in
|
/* Copy the (device) pointers to arguments to the device */
|
||||||
fact have the same value on a unified-memory system). */
|
CUDA_CALL_ASSERT (cuMemcpyHtoD, dp, hp,
|
||||||
CUDA_CALL_ASSERT (cuMemcpy, (CUdeviceptr) dp, (CUdeviceptr) hp,
|
|
||||||
mapnum * sizeof (void *));
|
mapnum * sizeof (void *));
|
||||||
GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
|
GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
|
||||||
" gangs=%u, workers=%u, vectors=%u\n",
|
" gangs=%u, workers=%u, vectors=%u\n",
|
||||||
|
|
Loading…
Reference in New Issue