Allow libgomp 'cbuf' buffering with OpenACC 'async' for 'ephemeral' data

This does *allow* such buffering, but under no circumstances is it currently
going to be used: all potentially applicable data is non-'ephemeral', and thus
not considered for 'gomp_coalesce_buf_add' for OpenACC 'async'.  (But a use will
emerge later.)

Follow-up to commit r12-2530-gd88a6951586c7229b25708f4486eaaf4bf4b5bbe
"Don't use libgomp 'cbuf' buffering with OpenACC 'async'", addressing this
TODO comment:

    TODO ... but we could allow CBUF usage for EPHEMERAL data?  (Open question:
    is it more performant to use libgomp CBUF buffering or individual device
    asyncronous copying?)

Ephemeral data is small, and therefore individual device asynchronous copying
does seem dubious -- in particular given that for all those, we'd individually
have to allocate and queue for deallocation a temporary buffer to capture the
ephemeral data.  Instead, just let the 'cbuf' *be* the temporary buffer.

	libgomp/
	* target.c (gomp_copy_host2dev, gomp_map_vars_internal): Allow
	libgomp 'cbuf' buffering with OpenACC 'async' for 'ephemeral'
	data.
This commit is contained in:
Thomas Schwinge 2023-02-27 16:41:17 +01:00
parent 199867d07b
commit 2b2340e236
1 changed file with 36 additions and 34 deletions

View File

@ -310,10 +310,8 @@ struct gomp_coalesce_buf
This must not be used for asynchronous copies, because the host data might
not be computed yet (by an earlier asynchronous compute region, for
example).
TODO ... but we could allow CBUF usage for EPHEMERAL data? (Open question:
is it more performant to use libgomp CBUF buffering or individual device
asyncronous copying?) */
example). The exception is for EPHEMERAL data, that we know is available
already "by construction". */
static inline void
gomp_coalesce_buf_add (struct gomp_coalesce_buf *cbuf, size_t start, size_t len)
@ -377,30 +375,6 @@ gomp_copy_host2dev (struct gomp_device_descr *devicep,
void *d, const void *h, size_t sz,
bool ephemeral, struct gomp_coalesce_buf *cbuf)
{
if (__builtin_expect (aq != NULL, 0))
{
/* See 'gomp_coalesce_buf_add'. */
assert (!cbuf);
void *h_buf = (void *) h;
if (ephemeral)
{
/* We're queueing up an asynchronous copy from data that may
disappear before the transfer takes place (i.e. because it is a
stack local in a function that is no longer executing). Make a
copy of the data into a temporary buffer in those cases. */
h_buf = gomp_malloc (sz);
memcpy (h_buf, h, sz);
}
goacc_device_copy_async (devicep, devicep->openacc.async.host2dev_func,
"dev", d, "host", h_buf, h, sz, aq);
if (ephemeral)
/* Free temporary buffer once the transfer has completed. */
devicep->openacc.async.queue_callback_func (aq, free, h_buf);
return;
}
if (cbuf)
{
uintptr_t doff = (uintptr_t) d - cbuf->tgt->tgt_start;
@ -420,6 +394,12 @@ gomp_copy_host2dev (struct gomp_device_descr *devicep,
gomp_mutex_unlock (&devicep->lock);
gomp_fatal ("internal libgomp cbuf error");
}
/* In an asynchronous context, verify that CBUF isn't used
with non-EPHEMERAL data; see 'gomp_coalesce_buf_add'. */
if (__builtin_expect (aq != NULL, 0))
assert (ephemeral);
memcpy ((char *) cbuf->buf + (doff - cbuf->chunks[0].start),
h, sz);
return;
@ -430,7 +410,28 @@ gomp_copy_host2dev (struct gomp_device_descr *devicep,
}
}
gomp_device_copy (devicep, devicep->host2dev_func, "dev", d, "host", h, sz);
if (__builtin_expect (aq != NULL, 0))
{
void *h_buf = (void *) h;
if (ephemeral)
{
/* We're queueing up an asynchronous copy from data that may
disappear before the transfer takes place (i.e. because it is a
stack local in a function that is no longer executing). As we've
not been able to use CBUF, make a copy of the data into a
temporary buffer. */
h_buf = gomp_malloc (sz);
memcpy (h_buf, h, sz);
}
goacc_device_copy_async (devicep, devicep->openacc.async.host2dev_func,
"dev", d, "host", h_buf, h, sz, aq);
if (ephemeral)
/* Free once the transfer has completed. */
devicep->openacc.async.queue_callback_func (aq, free, h_buf);
}
else
gomp_device_copy (devicep, devicep->host2dev_func,
"dev", d, "host", h, sz);
}
attribute_hidden void
@ -1751,9 +1752,6 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep,
if (cbufp)
{
/* See 'gomp_coalesce_buf_add'. */
assert (!aq);
long c = 0;
for (c = 0; c < cbuf.chunk_cnt; ++c)
gomp_copy_host2dev (devicep, aq,
@ -1761,8 +1759,12 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep,
(char *) cbuf.buf + (cbuf.chunks[c].start
- cbuf.chunks[0].start),
cbuf.chunks[c].end - cbuf.chunks[c].start,
true, NULL);
free (cbuf.buf);
false, NULL);
if (aq)
/* Free once the transfer has completed. */
devicep->openacc.async.queue_callback_func (aq, free, cbuf.buf);
else
free (cbuf.buf);
cbuf.buf = NULL;
cbufp = NULL;
}