libgomp: Add OpenACC's acc_memcpy_device{,_async} routines [PR93226]

libgomp/ChangeLog:

	PR libgomp/93226
	* libgomp-plugin.h (GOMP_OFFLOAD_openacc_async_dev2dev): New
	prototype.
	* libgomp.h (struct acc_dispatch_t): Add dev2dev_func.
	(gomp_copy_dev2dev): New prototype.
	* libgomp.map (OACC_2.6.1): New; add acc_memcpy_device{,_async}.
	* libgomp.texi (acc_memcpy_device): New.
	* oacc-mem.c (memcpy_tofrom_device): Change to take from/to
	device boolean; use memcpy not memmove; add early return if
	size == 0 or same device + same ptr.
	(acc_memcpy_to_device, acc_memcpy_to_device_async,
	acc_memcpy_from_device, acc_memcpy_from_device_async): Update.
	(acc_memcpy_device, acc_memcpy_device_async): New.
	* openacc.f90 (acc_memcpy_device, acc_memcpy_device_async):
	Add interface.
	* openacc_lib.h (acc_memcpy_device, acc_memcpy_device_async):
	Likewise.
	* openacc.h (acc_memcpy_device, acc_memcpy_device_async): Add
	prototype.
	* plugin/plugin-gcn.c (GOMP_OFFLOAD_openacc_async_host2dev):
	Update comment.
	(GOMP_OFFLOAD_openacc_async_dev2host): Update call.
	(GOMP_OFFLOAD_openacc_async_dev2dev): New.
	* plugin/plugin-nvptx.c (cuda_memcpy_dev_sanity_check): New.
	(GOMP_OFFLOAD_dev2dev): Call it.
	(GOMP_OFFLOAD_openacc_async_dev2dev): New.
	* target.c (gomp_copy_dev2dev): New.
	(gomp_load_plugin_for_device): Load dev2dev and async_dev2dev.
	* testsuite/libgomp.oacc-c-c++-common/acc_memcpy_device-1.c: New test.
	* testsuite/libgomp.oacc-fortran/acc_memcpy_device-1.f90: New test.
This commit is contained in:
Tobias Burnus 2025-05-29 22:47:06 +02:00
parent 2047aa4ce2
commit f4aa6b5a8d
13 changed files with 409 additions and 19 deletions

View File

@ -200,6 +200,8 @@ extern bool GOMP_OFFLOAD_openacc_async_dev2host (int, void *, const void *, size
struct goacc_asyncqueue *);
extern bool GOMP_OFFLOAD_openacc_async_host2dev (int, void *, const void *, size_t,
struct goacc_asyncqueue *);
extern bool GOMP_OFFLOAD_openacc_async_dev2dev (int, void *, const void *, size_t,
struct goacc_asyncqueue *);
extern void *GOMP_OFFLOAD_openacc_cuda_get_current_device (void);
extern void *GOMP_OFFLOAD_openacc_cuda_get_current_context (void);
extern void *GOMP_OFFLOAD_openacc_cuda_get_stream (struct goacc_asyncqueue *);

View File

@ -1360,6 +1360,7 @@ typedef struct acc_dispatch_t
__typeof (GOMP_OFFLOAD_openacc_async_exec) *exec_func;
__typeof (GOMP_OFFLOAD_openacc_async_dev2host) *dev2host_func;
__typeof (GOMP_OFFLOAD_openacc_async_host2dev) *host2dev_func;
__typeof (GOMP_OFFLOAD_openacc_async_dev2dev) *dev2dev_func;
} async;
__typeof (GOMP_OFFLOAD_openacc_get_property) *get_property_func;
@ -1467,6 +1468,9 @@ extern void gomp_copy_host2dev (struct gomp_device_descr *,
extern void gomp_copy_dev2host (struct gomp_device_descr *,
struct goacc_asyncqueue *, void *, const void *,
size_t);
extern void gomp_copy_dev2dev (struct gomp_device_descr *,
struct goacc_asyncqueue *, void *, const void *,
size_t);
extern uintptr_t gomp_map_val (struct target_mem_desc *, void **, size_t);
extern bool gomp_attach_pointer (struct gomp_device_descr *,
struct goacc_asyncqueue *, splay_tree,

View File

@ -609,6 +609,12 @@ OACC_2.6 {
acc_get_property_string_h_;
} OACC_2.5.1;
OACC_2.6.1 {
global:
acc_memcpy_device;
acc_memcpy_device_async;
} OACC_2.6;
GOACC_2.0 {
global:
GOACC_data_end;

View File

@ -4763,6 +4763,7 @@ acceleration device.
present on device.
* acc_memcpy_to_device:: Copy host memory to device memory.
* acc_memcpy_from_device:: Copy device memory to host memory.
* acc_memcpy_device:: Copy memory within a device.
* acc_attach:: Let device pointer point to device-pointer target.
* acc_detach:: Let device pointer point to host-pointer target.
@ -5837,6 +5838,44 @@ This function copies device memory specified by device address of
@node acc_memcpy_device
@section @code{acc_memcpy_device} -- Copy memory within a device.
@table @asis
@item @emph{Description}
This function copies device memory from one memory location to another
on the current device. It copies @var{bytes} bytes of data from the device
address, specified by @var{data_dev_src}, to the device address
@var{data_dev_dest}. The @code{_async} version performs the transfer
asnychronously using the queue associated with @var{async_arg}.
@item @emph{C/C++}:
@multitable @columnfractions .20 .80
@item @emph{Prototype}: @tab @code{void acc_memcpy_device(d_void* data_dev_dest,}
@item @tab @code{d_void* data_dev_src, size_t bytes);}
@item @emph{Prototype}: @tab @code{void acc_memcpy_device_async(d_void* data_dev_dest,}
@item @tab @code{d_void* data_dev_src, size_t bytes, int async_arg);}
@end multitable
@item @emph{Fortran}:
@multitable @columnfractions .20 .80
@item @emph{Interface}: @tab @code{subroutine acc_memcpy_device(data_dev_dest, &}
@item @tab @code{data_dev_src, bytes)}
@item @emph{Interface}: @tab @code{subroutine acc_memcpy_device_async(data_dev_dest, &}
@item @tab @code{data_dev_src, bytes, async_arg)}
@item @tab @code{type(c_ptr), value :: data_dev_dest}
@item @tab @code{type(c_ptr), value :: data_dev_src}
@item @tab @code{integer(c_size_t), value :: bytes}
@item @tab @code{integer(acc_handle_kind), value :: async_arg}
@end multitable
@item @emph{Reference}:
@uref{https://www.openacc.org, OpenACC specification v2.6}, section
3.2.33. @uref{https://www.openacc.org, OpenACC specification v3.3}, section
3.2.28.
@end table
@node acc_attach
@section @code{acc_attach} -- Let device pointer point to device-pointer target.
@table @asis

View File

@ -171,21 +171,22 @@ acc_free (void *d)
}
static void
memcpy_tofrom_device (bool from, void *d, void *h, size_t s, int async,
const char *libfnname)
memcpy_tofrom_device (bool dev_to, bool dev_from, void *dst, void *src,
size_t s, int async, const char *libfnname)
{
/* No need to call lazy open here, as the device pointer must have
been obtained from a routine that did that. */
struct goacc_thread *thr = goacc_thread ();
assert (thr && thr->dev);
if (s == 0)
return;
if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
{
if (from)
memmove (h, d, s);
else
memmove (d, h, s);
if (src == dst)
return;
memcpy (dst, src, s);
return;
}
@ -199,10 +200,15 @@ memcpy_tofrom_device (bool from, void *d, void *h, size_t s, int async,
}
goacc_aq aq = get_goacc_asyncqueue (async);
if (from)
gomp_copy_dev2host (thr->dev, aq, h, d, s);
if (dev_to && dev_from)
{
if (dst != src)
gomp_copy_dev2dev (thr->dev, aq, dst, src, s);
}
else if (dev_from)
gomp_copy_dev2host (thr->dev, aq, dst, src, s);
else
gomp_copy_host2dev (thr->dev, aq, d, h, s, false, /* TODO: cbuf? */ NULL);
gomp_copy_host2dev (thr->dev, aq, dst, src, s, false, /* TODO: cbuf? */ NULL);
if (profiling_p)
{
@ -214,25 +220,37 @@ memcpy_tofrom_device (bool from, void *d, void *h, size_t s, int async,
void
acc_memcpy_to_device (void *d, void *h, size_t s)
{
memcpy_tofrom_device (false, d, h, s, acc_async_sync, __FUNCTION__);
memcpy_tofrom_device (true, false, d, h, s, acc_async_sync, __FUNCTION__);
}
void
acc_memcpy_to_device_async (void *d, void *h, size_t s, int async)
{
memcpy_tofrom_device (false, d, h, s, async, __FUNCTION__);
memcpy_tofrom_device (true, false, d, h, s, async, __FUNCTION__);
}
void
acc_memcpy_from_device (void *h, void *d, size_t s)
{
memcpy_tofrom_device (true, d, h, s, acc_async_sync, __FUNCTION__);
memcpy_tofrom_device (false, true, h, d, s, acc_async_sync, __FUNCTION__);
}
void
acc_memcpy_from_device_async (void *h, void *d, size_t s, int async)
{
memcpy_tofrom_device (true, d, h, s, async, __FUNCTION__);
memcpy_tofrom_device (false, true, h, d, s, async, __FUNCTION__);
}
void
acc_memcpy_device (void *dst, void *src, size_t s)
{
memcpy_tofrom_device (true, true, dst, src, s, acc_async_sync, __FUNCTION__);
}
void
acc_memcpy_device_async (void *dst, void *src, size_t s, int async)
{
memcpy_tofrom_device (true, true, dst, src, s, async, __FUNCTION__);
}
/* Return the device pointer that corresponds to host data H. Or NULL

View File

@ -797,6 +797,7 @@ module openacc
public :: acc_copyout_finalize, acc_delete_finalize
public :: acc_memcpy_to_device, acc_memcpy_to_device_async
public :: acc_memcpy_from_device, acc_memcpy_from_device_async
public :: acc_memcpy_device, acc_memcpy_device_async
integer, parameter :: openacc_version = 201711
@ -1046,6 +1047,27 @@ module openacc
end subroutine
end interface
interface
subroutine acc_memcpy_device (data_dev_dest, data_dev_src, bytes) bind(C)
use iso_c_binding, only: c_ptr, c_size_t
type(c_ptr), value :: data_dev_dest
type(c_ptr), value :: data_dev_src
integer(c_size_t), value :: bytes
end subroutine
end interface
interface
subroutine acc_memcpy_device_async (data_dev_dest, data_dev_src, &
bytes, async_arg) bind(C)
use iso_c_binding, only: c_ptr, c_size_t
import :: acc_handle_kind
type(c_ptr), value :: data_dev_dest
type(c_ptr), value :: data_dev_src
integer(c_size_t), value :: bytes
integer(acc_handle_kind), value :: async_arg
end subroutine
end interface
interface acc_copyin_async
procedure :: acc_copyin_async_32_h
procedure :: acc_copyin_async_64_h

View File

@ -123,6 +123,7 @@ void *acc_hostptr (void *) __GOACC_NOTHROW;
int acc_is_present (void *, size_t) __GOACC_NOTHROW;
void acc_memcpy_to_device (void *, void *, size_t) __GOACC_NOTHROW;
void acc_memcpy_from_device (void *, void *, size_t) __GOACC_NOTHROW;
void acc_memcpy_device (void *, void *, size_t) __GOACC_NOTHROW;
void acc_attach (void **) __GOACC_NOTHROW;
void acc_attach_async (void **, int) __GOACC_NOTHROW;
void acc_detach (void **) __GOACC_NOTHROW;
@ -136,7 +137,7 @@ void acc_delete_finalize_async (void *, size_t, int) __GOACC_NOTHROW;
void acc_detach_finalize (void **) __GOACC_NOTHROW;
void acc_detach_finalize_async (void **, int) __GOACC_NOTHROW;
/* Async functions, specified in OpenACC 2.5. */
/* Async functions, specified in OpenACC 2.5, acc_memcpy_device in 2.6. */
void acc_copyin_async (void *, size_t, int) __GOACC_NOTHROW;
void acc_create_async (void *, size_t, int) __GOACC_NOTHROW;
void acc_copyout_async (void *, size_t, int) __GOACC_NOTHROW;
@ -145,6 +146,7 @@ void acc_update_device_async (void *, size_t, int) __GOACC_NOTHROW;
void acc_update_self_async (void *, size_t, int) __GOACC_NOTHROW;
void acc_memcpy_to_device_async (void *, void *, size_t, int) __GOACC_NOTHROW;
void acc_memcpy_from_device_async (void *, void *, size_t, int) __GOACC_NOTHROW;
void acc_memcpy_device_async (void *, void *, size_t, int) __GOACC_NOTHROW;
/* CUDA-specific routines. */
void *acc_get_current_cuda_device (void) __GOACC_NOTHROW;

View File

@ -528,6 +528,30 @@
end subroutine
end interface
interface
subroutine acc_memcpy_device(data_dev_dest, data_dev_src, &
& bytes) bind(C)
use iso_c_binding, only: c_ptr, c_size_t
type(c_ptr), value :: data_dev_dest
type(c_ptr), value :: data_dev_src
integer(c_size_t), value :: bytes
end subroutine
end interface
interface
subroutine acc_memcpy_device_async(data_dev_dest, &
& data_dev_src, bytes, &
& async_arg) bind(C)
use iso_c_binding, only: c_ptr, c_size_t
import :: acc_handle_kind
type(c_ptr), value :: data_dev_dest
type(c_ptr), value :: data_dev_src
integer(c_size_t), value :: bytes
integer(acc_handle_kind), value :: async_arg
end subroutine
end interface
interface acc_copyin_async
subroutine acc_copyin_async_32_h (a, len, async)
use iso_c_binding, only: c_int32_t

View File

@ -5079,7 +5079,8 @@ GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
queue_push_callback (aq, fn, data);
}
/* Queue up an asynchronous data copy from host to DEVICE. */
/* Queue up an asynchronous data copy from host to DEVICE.
(Also handles dev2host and dev2dev.) */
bool
GOMP_OFFLOAD_openacc_async_host2dev (int device, void *dst, const void *src,
@ -5097,10 +5098,16 @@ bool
GOMP_OFFLOAD_openacc_async_dev2host (int device, void *dst, const void *src,
size_t n, struct goacc_asyncqueue *aq)
{
struct agent_info *agent = get_agent_info (device);
assert (agent == aq->agent);
queue_push_copy (aq, dst, src, n);
return true;
return GOMP_OFFLOAD_openacc_async_host2dev (device, dst, src, n, aq);
}
/* Queue up an asynchronous data copy from DEVICE to DEVICE. */
bool
GOMP_OFFLOAD_openacc_async_dev2dev (int device, void *dst, const void *src,
size_t n, struct goacc_asyncqueue *aq)
{
return GOMP_OFFLOAD_openacc_async_host2dev (device, dst, src, n, aq);
}
union goacc_property_value

View File

@ -2018,6 +2018,34 @@ GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
cuda_callback_wrapper, (void *) b, 0);
}
static bool
cuda_memcpy_dev_sanity_check (const void *d1, const void *d2, size_t s)
{
CUdeviceptr pb1, pb2;
size_t ps1, ps2;
if (!s)
return true;
if (!d1 || !d2)
{
GOMP_PLUGIN_error ("invalid device address");
return false;
}
CUDA_CALL (cuMemGetAddressRange, &pb1, &ps1, (CUdeviceptr) d1);
CUDA_CALL (cuMemGetAddressRange, &pb2, &ps2, (CUdeviceptr) d2);
if (!pb1 || !pb2)
{
GOMP_PLUGIN_error ("invalid device address");
return false;
}
if ((void *)(d1 + s) > (void *)(pb1 + ps1)
|| (void *)(d2 + s) > (void *)(pb2 + ps2))
{
GOMP_PLUGIN_error ("invalid size");
return false;
}
return true;
}
static bool
cuda_memcpy_sanity_check (const void *h, const void *d, size_t s)
{
@ -2077,6 +2105,9 @@ GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
bool
GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n)
{
if (!nvptx_attach_host_thread_to_device (ord)
|| !cuda_memcpy_dev_sanity_check (dst, src, n))
return false;
CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n, NULL);
return true;
}
@ -2288,6 +2319,18 @@ GOMP_OFFLOAD_openacc_async_dev2host (int ord, void *dst, const void *src,
return true;
}
bool
GOMP_OFFLOAD_openacc_async_dev2dev (int ord, void *dst, const void *src,
size_t n, struct goacc_asyncqueue *aq)
{
if (!nvptx_attach_host_thread_to_device (ord)
|| !cuda_memcpy_dev_sanity_check (dst, src, n))
return false;
CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n,
aq->cuda_stream);
return true;
}
union goacc_property_value
GOMP_OFFLOAD_openacc_get_property (int n, enum goacc_property prop)
{

View File

@ -461,6 +461,19 @@ gomp_copy_dev2host (struct gomp_device_descr *devicep,
gomp_device_copy (devicep, devicep->dev2host_func, "host", h, "dev", d, sz);
}
attribute_hidden void
gomp_copy_dev2dev (struct gomp_device_descr *devicep,
struct goacc_asyncqueue *aq,
void *dst, const void *src, size_t sz)
{
if (__builtin_expect (aq != NULL, 0))
goacc_device_copy_async (devicep, devicep->openacc.async.dev2dev_func,
"dev", dst, "dev", src, NULL, sz, aq);
else
gomp_device_copy (devicep, devicep->dev2dev_func, "dev", dst,
"dev", src, sz);
}
static void
gomp_free_device_memory (struct gomp_device_descr *devicep, void *devptr)
{
@ -5573,6 +5586,7 @@ gomp_load_plugin_for_device (struct gomp_device_descr *device,
|| !DLSYM_OPT (openacc.async.exec, openacc_async_exec)
|| !DLSYM_OPT (openacc.async.dev2host, openacc_async_dev2host)
|| !DLSYM_OPT (openacc.async.host2dev, openacc_async_host2dev)
|| !DLSYM_OPT (openacc.async.dev2dev, openacc_async_dev2dev)
|| !DLSYM_OPT (openacc.get_property, openacc_get_property))
{
/* Require all the OpenACC handlers if we have

View File

@ -0,0 +1,96 @@
/* { dg-prune-output "using .vector_length \\(32\\)" } */
/* PR libgomp/93226 */
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <openacc.h>
enum { N = 1024 };
static int D[N];
#pragma acc declare device_resident(D)
#pragma acc routine
intptr_t init_d()
{
for (int i = 0; i < N; i++)
D[i] = 27*i;
return (intptr_t) &D[0];
}
int
main ()
{
int *a, *b, *e;
void *d_a, *d_b, *d_c, *d_d, *d_e, *d_f;
intptr_t intptr;
bool fail = false;
a = (int *) malloc (N*sizeof (int));
b = (int *) malloc (N*sizeof (int));
e = (int *) malloc (N*sizeof (int));
d_c = acc_malloc (N*sizeof (int));
d_f = acc_malloc (N*sizeof (int));
memset (e, 0xff, N*sizeof (int));
d_e = acc_copyin (e, N*sizeof (int));
#pragma acc serial copyout(intptr)
intptr = init_d ();
d_d = (void*) intptr;
acc_memcpy_device (d_c, d_d, N*sizeof (int));
#pragma acc serial copy(fail) deviceptr(d_c) firstprivate(intptr)
{
int *cc = (int *) d_c;
int *dd = (int *) intptr;
for (int i = 0; i < N; i++)
if (dd[i] != 27*i || cc[i] != 27*i)
{
fail = true;
__builtin_abort ();
}
}
if (fail) __builtin_abort ();
for (int i = 0; i < N; i++)
a[i] = 11*i;
for (int i = 0; i < N; i++)
b[i] = 31*i;
d_a = acc_copyin (a, N*sizeof (int));
acc_copyin_async (b, N*sizeof (int), acc_async_noval);
#pragma acc parallel deviceptr(d_c) async
{
int *cc = (int *) d_c;
#pragma acc loop
for (int i = 0; i < N; i++)
cc[i] = -17*i;
}
acc_memcpy_device_async (d_d, d_a, N*sizeof (int), acc_async_noval);
acc_memcpy_device_async (d_f, d_c, N*sizeof (int), acc_async_noval);
acc_wait (acc_async_noval);
d_b = acc_deviceptr (b);
acc_memcpy_device_async (d_e, d_b, N*sizeof (int), acc_async_noval);
acc_wait (acc_async_noval);
#pragma acc serial deviceptr(d_d, d_e, d_f) copy(fail)
{
int *dd = (int *) d_d;
int *ee = (int *) d_e;
int *ff = (int *) d_f;
for (int i = 0; i < N; i++)
if (dd[i] != 11*i
|| ee[i] != 31*i
|| ff[i] != -17*i)
{
fail = true;
__builtin_abort ();
}
}
if (fail) __builtin_abort ();
}

View File

@ -0,0 +1,113 @@
! { dg-prune-output "using .vector_length \\(32\\)" }
! PR libgomp/93226 */
module m
use iso_c_binding
use openacc
implicit none (external, type)
integer, parameter :: N = 1024
integer :: D(N)
!$acc declare device_resident(D)
contains
integer(c_intptr_t) function init_d()
!$acc routine
integer :: i
do i = 1, N
D(i) = 27*i
end do
init_d = loc(D)
end
end module
program main
use m
implicit none (external, type)
integer, allocatable, target :: a(:), b(:), e(:)
type(c_ptr) :: d_a, d_b, d_c, d_d, d_e, d_f
integer(c_intptr_t) intptr
integer :: i
logical fail
fail = .false.
allocate(a(N), b(N), e(N))
d_c = acc_malloc (N*c_sizeof (i))
d_f = acc_malloc (N*c_sizeof (i))
e = huge(e)
call acc_copyin (e, N*c_sizeof (i));
d_e = acc_deviceptr (e);
!$acc serial copyout(intptr)
intptr = init_d ()
!$acc end serial
d_d = transfer(intptr, d_d)
call acc_memcpy_device (d_c, d_d, N*c_sizeof (i))
!$acc serial copy(fail) copy(a) deviceptr(d_c, d_d) firstprivate(intptr)
block
integer, pointer :: cc(:), dd(:)
call c_f_pointer (d_c, cc, [N])
call c_f_pointer (d_d, dd, [N])
a = cc
do i = 1, N
if (dd(i) /= 27*i .or. cc(i) /= 27*i) then
fail = .true.
stop 1
end if
end do
end block
!$acc end serial
if (fail) error stop 1
do i = 1, N
a(i) = 11*i
b(i) = 31*i
end do
call acc_copyin (a, N*c_sizeof (i))
d_a = acc_deviceptr (a)
call acc_copyin_async (b, N*c_sizeof (i), acc_async_noval)
!$acc parallel deviceptr(d_c) private(i) async
block
integer, pointer :: cc(:)
call c_f_pointer (d_c, cc, [N])
!$acc loop
do i = 1, N
cc(i) = -17*i
end do
end block
!$acc end parallel
call acc_memcpy_device_async (d_d, d_a, N*c_sizeof (i), acc_async_noval)
call acc_memcpy_device_async (d_f, d_c, N*c_sizeof (i), acc_async_noval)
call acc_wait (acc_async_noval)
d_b = acc_deviceptr (b)
call acc_memcpy_device_async (d_e, d_b, N*c_sizeof (i), acc_async_noval)
call acc_wait (acc_async_noval)
!$acc serial deviceptr(d_d, d_e, d_f) private(i) copy(fail)
block
integer, pointer :: dd(:), ee(:), ff(:)
call c_f_pointer (d_d, dd, [N])
call c_f_pointer (d_e, ee, [N])
call c_f_pointer (d_f, ff, [N])
do i = 1, N
if (dd(i) /= 11*i &
.or. ee(i) /= 31*i &
.or. ff(i) /= -17*i) then
fail = .true.
stop 2
end if
end do
end block
!$acc end serial
if (fail) error stop 2
end