mirror of git://gcc.gnu.org/git/gcc.git
Allow libgomp 'cbuf' buffering with OpenACC 'async' for 'ephemeral' data
This does *allow*, but under no circumstances is this currently going to be used: all potentially applicable data is non-'ephemeral', and thus not considered for 'gomp_coalesce_buf_add' for OpenACC 'async'. (But a use will emerge later.) Follow-up to commit r12-2530-gd88a6951586c7229b25708f4486eaaf4bf4b5bbe "Don't use libgomp 'cbuf' buffering with OpenACC 'async'", addressing this TODO comment: TODO ... but we could allow CBUF usage for EPHEMERAL data? (Open question: is it more performant to use libgomp CBUF buffering or individual device asynchronous copying?) Ephemeral data is small, and therefore individual device asynchronous copying does seem dubious -- in particular given that for all those, we'd individually have to allocate and queue for deallocation a temporary buffer to capture the ephemeral data. Instead, just let the 'cbuf' *be* the temporary buffer. libgomp/ * target.c (gomp_copy_host2dev, gomp_map_vars_internal): Allow libgomp 'cbuf' buffering with OpenACC 'async' for 'ephemeral' data.
This commit is contained in:
parent
199867d07b
commit
2b2340e236
|
@ -310,10 +310,8 @@ struct gomp_coalesce_buf
|
||||||
|
|
||||||
This must not be used for asynchronous copies, because the host data might
|
This must not be used for asynchronous copies, because the host data might
|
||||||
not be computed yet (by an earlier asynchronous compute region, for
|
not be computed yet (by an earlier asynchronous compute region, for
|
||||||
example).
|
example). The exception is for EPHEMERAL data, that we know is available
|
||||||
TODO ... but we could allow CBUF usage for EPHEMERAL data? (Open question:
|
already "by construction". */
|
||||||
is it more performant to use libgomp CBUF buffering or individual device
|
|
||||||
asyncronous copying?) */
|
|
||||||
|
|
||||||
static inline void
|
static inline void
|
||||||
gomp_coalesce_buf_add (struct gomp_coalesce_buf *cbuf, size_t start, size_t len)
|
gomp_coalesce_buf_add (struct gomp_coalesce_buf *cbuf, size_t start, size_t len)
|
||||||
|
@ -377,30 +375,6 @@ gomp_copy_host2dev (struct gomp_device_descr *devicep,
|
||||||
void *d, const void *h, size_t sz,
|
void *d, const void *h, size_t sz,
|
||||||
bool ephemeral, struct gomp_coalesce_buf *cbuf)
|
bool ephemeral, struct gomp_coalesce_buf *cbuf)
|
||||||
{
|
{
|
||||||
if (__builtin_expect (aq != NULL, 0))
|
|
||||||
{
|
|
||||||
/* See 'gomp_coalesce_buf_add'. */
|
|
||||||
assert (!cbuf);
|
|
||||||
|
|
||||||
void *h_buf = (void *) h;
|
|
||||||
if (ephemeral)
|
|
||||||
{
|
|
||||||
/* We're queueing up an asynchronous copy from data that may
|
|
||||||
disappear before the transfer takes place (i.e. because it is a
|
|
||||||
stack local in a function that is no longer executing). Make a
|
|
||||||
copy of the data into a temporary buffer in those cases. */
|
|
||||||
h_buf = gomp_malloc (sz);
|
|
||||||
memcpy (h_buf, h, sz);
|
|
||||||
}
|
|
||||||
goacc_device_copy_async (devicep, devicep->openacc.async.host2dev_func,
|
|
||||||
"dev", d, "host", h_buf, h, sz, aq);
|
|
||||||
if (ephemeral)
|
|
||||||
/* Free temporary buffer once the transfer has completed. */
|
|
||||||
devicep->openacc.async.queue_callback_func (aq, free, h_buf);
|
|
||||||
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (cbuf)
|
if (cbuf)
|
||||||
{
|
{
|
||||||
uintptr_t doff = (uintptr_t) d - cbuf->tgt->tgt_start;
|
uintptr_t doff = (uintptr_t) d - cbuf->tgt->tgt_start;
|
||||||
|
@ -420,6 +394,12 @@ gomp_copy_host2dev (struct gomp_device_descr *devicep,
|
||||||
gomp_mutex_unlock (&devicep->lock);
|
gomp_mutex_unlock (&devicep->lock);
|
||||||
gomp_fatal ("internal libgomp cbuf error");
|
gomp_fatal ("internal libgomp cbuf error");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* In an asynchronous context, verify that CBUF isn't used
|
||||||
|
with non-EPHEMERAL data; see 'gomp_coalesce_buf_add'. */
|
||||||
|
if (__builtin_expect (aq != NULL, 0))
|
||||||
|
assert (ephemeral);
|
||||||
|
|
||||||
memcpy ((char *) cbuf->buf + (doff - cbuf->chunks[0].start),
|
memcpy ((char *) cbuf->buf + (doff - cbuf->chunks[0].start),
|
||||||
h, sz);
|
h, sz);
|
||||||
return;
|
return;
|
||||||
|
@ -430,7 +410,28 @@ gomp_copy_host2dev (struct gomp_device_descr *devicep,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
gomp_device_copy (devicep, devicep->host2dev_func, "dev", d, "host", h, sz);
|
if (__builtin_expect (aq != NULL, 0))
|
||||||
|
{
|
||||||
|
void *h_buf = (void *) h;
|
||||||
|
if (ephemeral)
|
||||||
|
{
|
||||||
|
/* We're queueing up an asynchronous copy from data that may
|
||||||
|
disappear before the transfer takes place (i.e. because it is a
|
||||||
|
stack local in a function that is no longer executing). As we've
|
||||||
|
not been able to use CBUF, make a copy of the data into a
|
||||||
|
temporary buffer. */
|
||||||
|
h_buf = gomp_malloc (sz);
|
||||||
|
memcpy (h_buf, h, sz);
|
||||||
|
}
|
||||||
|
goacc_device_copy_async (devicep, devicep->openacc.async.host2dev_func,
|
||||||
|
"dev", d, "host", h_buf, h, sz, aq);
|
||||||
|
if (ephemeral)
|
||||||
|
/* Free once the transfer has completed. */
|
||||||
|
devicep->openacc.async.queue_callback_func (aq, free, h_buf);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
gomp_device_copy (devicep, devicep->host2dev_func,
|
||||||
|
"dev", d, "host", h, sz);
|
||||||
}
|
}
|
||||||
|
|
||||||
attribute_hidden void
|
attribute_hidden void
|
||||||
|
@ -1751,9 +1752,6 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep,
|
||||||
|
|
||||||
if (cbufp)
|
if (cbufp)
|
||||||
{
|
{
|
||||||
/* See 'gomp_coalesce_buf_add'. */
|
|
||||||
assert (!aq);
|
|
||||||
|
|
||||||
long c = 0;
|
long c = 0;
|
||||||
for (c = 0; c < cbuf.chunk_cnt; ++c)
|
for (c = 0; c < cbuf.chunk_cnt; ++c)
|
||||||
gomp_copy_host2dev (devicep, aq,
|
gomp_copy_host2dev (devicep, aq,
|
||||||
|
@ -1761,8 +1759,12 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep,
|
||||||
(char *) cbuf.buf + (cbuf.chunks[c].start
|
(char *) cbuf.buf + (cbuf.chunks[c].start
|
||||||
- cbuf.chunks[0].start),
|
- cbuf.chunks[0].start),
|
||||||
cbuf.chunks[c].end - cbuf.chunks[c].start,
|
cbuf.chunks[c].end - cbuf.chunks[c].start,
|
||||||
true, NULL);
|
false, NULL);
|
||||||
free (cbuf.buf);
|
if (aq)
|
||||||
|
/* Free once the transfer has completed. */
|
||||||
|
devicep->openacc.async.queue_callback_func (aq, free, cbuf.buf);
|
||||||
|
else
|
||||||
|
free (cbuf.buf);
|
||||||
cbuf.buf = NULL;
|
cbuf.buf = NULL;
|
||||||
cbufp = NULL;
|
cbufp = NULL;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue