mirror of git://gcc.gnu.org/git/gcc.git
libgomp: Add OpenMP's omp_target_memset/omp_target_memset_async
PR libgomp/120444 include/ChangeLog: * cuda/cuda.h (cuMemsetD8, cuMemsetD8Async): Declare. libgomp/ChangeLog: * libgomp-plugin.h (GOMP_OFFLOAD_memset): Declare. * libgomp.h (struct gomp_device_descr): Add memset_func. * libgomp.map (GOMP_6.0.1): Add omp_target_memset{,_async}. * libgomp.texi (Device Memory Routines): Document them. * omp.h.in (omp_target_memset, omp_target_memset_async): Declare. * omp_lib.f90.in (omp_target_memset, omp_target_memset_async): Add interfaces. * omp_lib.h.in (omp_target_memset, omp_target_memset_async): Likewise. * plugin/cuda-lib.def: Add cuMemsetD8. * plugin/plugin-gcn.c (struct hsa_runtime_fn_info): Add hsa_amd_memory_fill_fn. (init_hsa_runtime_functions): DLSYM_OPT_FN load it. (GOMP_OFFLOAD_memset): New. * plugin/plugin-nvptx.c (GOMP_OFFLOAD_memset): New. * target.c (omp_target_memset_int, omp_target_memset, omp_target_memset_async_helper, omp_target_memset_async): New. (gomp_load_plugin_for_device): Add DLSYM (memset). * testsuite/libgomp.c-c++-common/omp_target_memset.c: New test. * testsuite/libgomp.c-c++-common/omp_target_memset-2.c: New test. * testsuite/libgomp.c-c++-common/omp_target_memset-3.c: New test. * testsuite/libgomp.fortran/omp_target_memset.f90: New test. * testsuite/libgomp.fortran/omp_target_memset-2.f90: New test.
This commit is contained in:
parent
d7f33a35bf
commit
4e47e2f833
|
@ -279,6 +279,9 @@ CUresult cuMemcpy3D (const CUDA_MEMCPY3D *);
|
|||
CUresult cuMemcpy3DAsync (const CUDA_MEMCPY3D *, CUstream);
|
||||
CUresult cuMemcpy3DPeer (const CUDA_MEMCPY3D_PEER *);
|
||||
CUresult cuMemcpy3DPeerAsync (const CUDA_MEMCPY3D_PEER *, CUstream);
|
||||
#define cuMemsetD8 cuMemsetD8_v2
|
||||
CUresult cuMemsetD8 (CUdeviceptr, unsigned char, size_t);
|
||||
CUresult cuMemsetD8Async (CUdeviceptr, unsigned char, size_t, CUstream);
|
||||
#define cuMemFree cuMemFree_v2
|
||||
CUresult cuMemFree (CUdeviceptr);
|
||||
CUresult cuMemFreeHost (void *);
|
||||
|
|
|
@ -177,6 +177,7 @@ extern int GOMP_OFFLOAD_memcpy3d (int, int, size_t, size_t, size_t, void *,
|
|||
size_t, size_t, size_t, size_t, size_t,
|
||||
const void *, size_t, size_t, size_t, size_t,
|
||||
size_t);
|
||||
extern bool GOMP_OFFLOAD_memset (int, void *, int, size_t);
|
||||
extern bool GOMP_OFFLOAD_can_run (void *);
|
||||
extern void GOMP_OFFLOAD_run (int, void *, void *, void **);
|
||||
extern void GOMP_OFFLOAD_async_run (int, void *, void *, void **, void *);
|
||||
|
|
|
@ -1421,9 +1421,10 @@ struct gomp_device_descr
|
|||
__typeof (GOMP_OFFLOAD_free) *free_func;
|
||||
__typeof (GOMP_OFFLOAD_dev2host) *dev2host_func;
|
||||
__typeof (GOMP_OFFLOAD_host2dev) *host2dev_func;
|
||||
__typeof (GOMP_OFFLOAD_dev2dev) *dev2dev_func;
|
||||
__typeof (GOMP_OFFLOAD_memcpy2d) *memcpy2d_func;
|
||||
__typeof (GOMP_OFFLOAD_memcpy3d) *memcpy3d_func;
|
||||
__typeof (GOMP_OFFLOAD_dev2dev) *dev2dev_func;
|
||||
__typeof (GOMP_OFFLOAD_memset) *memset_func;
|
||||
__typeof (GOMP_OFFLOAD_can_run) *can_run_func;
|
||||
__typeof (GOMP_OFFLOAD_run) *run_func;
|
||||
__typeof (GOMP_OFFLOAD_async_run) *async_run_func;
|
||||
|
|
|
@ -453,6 +453,12 @@ GOMP_6.0 {
|
|||
omp_get_uid_from_device_8_;
|
||||
} GOMP_5.1.3;
|
||||
|
||||
GOMP_6.0.1 {
|
||||
global:
|
||||
omp_target_memset;
|
||||
omp_target_memset_async;
|
||||
} GOMP_6.0;
|
||||
|
||||
OACC_2.0 {
|
||||
global:
|
||||
acc_get_num_devices;
|
||||
|
|
|
@ -603,7 +603,7 @@ to address of matching mapped list item per 5.1, Sect. 2.21.7.2 @tab N @tab
|
|||
@code{omp_get_device_teams_thread_limit}, and
|
||||
@code{omp_set_device_teams_thread_limit} routines @tab N @tab
|
||||
@item @code{omp_target_memset} and @code{omp_target_memset_async} routines
|
||||
@tab N @tab
|
||||
@tab Y @tab
|
||||
@item Fortran version of the interop runtime routines @tab Y @tab
|
||||
@item Routines for obtaining memory spaces/allocators for shared/device memory
|
||||
@tab N @tab
|
||||
|
@ -1984,8 +1984,8 @@ pointers on devices. They have C linkage and do not throw exceptions.
|
|||
* omp_target_memcpy_async:: Copy data between devices asynchronously
|
||||
* omp_target_memcpy_rect:: Copy a subvolume of data between devices
|
||||
* omp_target_memcpy_rect_async:: Copy a subvolume of data between devices asynchronously
|
||||
@c * omp_target_memset:: <fixme>/TR12
|
||||
@c * omp_target_memset_async:: <fixme>/TR12
|
||||
* omp_target_memset:: Set bytes in device memory
|
||||
* omp_target_memset_async:: Set bytes in device memory asynchronously
|
||||
* omp_target_associate_ptr:: Associate a device pointer with a host pointer
|
||||
* omp_target_disassociate_ptr:: Remove device--host pointer association
|
||||
* omp_get_mapped_ptr:: Return device pointer to a host pointer
|
||||
|
@ -2398,6 +2398,98 @@ the initial device.
|
|||
@end table
|
||||
|
||||
|
||||
@node omp_target_memset
|
||||
@subsection @code{omp_target_memset} -- Set bytes in device memory
|
||||
@table @asis
|
||||
@item @emph{Description}:
|
||||
This routine fills memory on the device identified by device number
|
||||
@var{device_num}. Starting from the device address @var{ptr}, the first
|
||||
@var{count} bytes are set to the value @var{val}, converted to
|
||||
@code{unsigned char}. If @var{count} is zero, the routine has no effect;
|
||||
if @var{ptr} is @code{NULL}, the behavior is unspecified. The function
|
||||
returns @var{ptr}.
|
||||
|
||||
The @var{device_num} must be a conforming device number and @var{ptr} must be
|
||||
a valid device pointer for that device. Running this routine in a
|
||||
@code{target} region except on the initial device is not supported.
|
||||
|
||||
@item @emph{C/C++}
|
||||
@multitable @columnfractions .20 .80
|
||||
@item @emph{Prototype}: @tab @code{void *omp_target_memcpy(void *ptr,}
|
||||
@item @tab @code{ int val,}
|
||||
@item @tab @code{ size_t count,}
|
||||
@item @tab @code{ int device_num)}
|
||||
@end multitable
|
||||
|
||||
@item @emph{Fortran}:
|
||||
@multitable @columnfractions .20 .80
|
||||
@item @emph{Interface}: @tab @code{type(c_ptr) function omp_target_memset( &}
|
||||
@item @tab @code{ ptr, val, count, device_num) bind(C)}
|
||||
@item @tab @code{use, intrinsic :: iso_c_binding, only: c_ptr, c_size_t, c_int}
|
||||
@item @tab @code{type(c_ptr), value :: ptr}
|
||||
@item @tab @code{integer(c_size_t), value :: count}
|
||||
@item @tab @code{integer(c_int), value :: val, device_num}
|
||||
@end multitable
|
||||
|
||||
@item @emph{See also}:
|
||||
@ref{omp_target_memset_async}
|
||||
|
||||
@item @emph{Reference}:
|
||||
@uref{https://www.openmp.org, OpenMP specification v6.0}, Section 25.8.1
|
||||
@end table
|
||||
|
||||
|
||||
|
||||
@node omp_target_memset_async
|
||||
@subsection @code{omp_target_memset} -- Set bytes in device memory asynchronously
|
||||
@table @asis
|
||||
@item @emph{Description}:
|
||||
This routine fills memory on the device identified by device number
|
||||
@var{device_num}. Starting from the device address @var{ptr}, the first
|
||||
@var{count} bytes are set to the value @var{val}, converted to
|
||||
@code{unsigned char}. If @var{count} is zero, the routine has no effect;
|
||||
if @var{ptr} is @code{NULL}, the behavior is unspecified. Task dependence
|
||||
is expressed by passing an array of depend objects to @var{depobj_list}, where
|
||||
the number of array elements is passed as @var{depobj_count}; if the count is
|
||||
zero, the @var{depobj_list} argument is ignored. In C++ and Fortran, the
|
||||
@var{depobj_list} argument can also be omitted in that case. The function
|
||||
returns @var{ptr}.
|
||||
|
||||
The @var{device_num} must be a conforming device number and @var{ptr} must be
|
||||
a valid device pointer for that device. Running this routine in a
|
||||
@code{target} region except on the initial device is not supported.
|
||||
|
||||
@item @emph{C/C++}
|
||||
@multitable @columnfractions .20 .80
|
||||
@item @emph{Prototype}: @tab @code{void *omp_target_memcpy_async(void *ptr,}
|
||||
@item @tab @code{ int val,}
|
||||
@item @tab @code{ size_t count,}
|
||||
@item @tab @code{ int device_num,}
|
||||
@item @tab @code{ int depobj_count,}
|
||||
@item @tab @code{ omp_depend_t *depobj_list)}
|
||||
@end multitable
|
||||
|
||||
@item @emph{Fortran}:
|
||||
@multitable @columnfractions .20 .80
|
||||
@item @emph{Interface}: @tab @code{type(c_ptr) function omp_target_memset_async( &}
|
||||
@item @tab @code{ ptr, val, count, device_num, &}
|
||||
@item @tab @code{ depobj_count, depobj_list) bind(C)}
|
||||
@item @tab @code{use, intrinsic :: iso_c_binding, only: c_ptr, c_size_t, c_int}
|
||||
@item @tab @code{type(c_ptr), value :: ptr}
|
||||
@item @tab @code{integer(c_size_t), value :: count}
|
||||
@item @tab @code{integer(c_int), value :: val, device_num, depobj_count}
|
||||
@item @tab @code{integer(omp_depend_kind), optional :: depobj_list(*)}
|
||||
@end multitable
|
||||
|
||||
|
||||
@item @emph{See also}:
|
||||
@ref{omp_target_memset}
|
||||
|
||||
@item @emph{Reference}:
|
||||
@uref{https://www.openmp.org, OpenMP specification v6.0}, Section 25.8.2
|
||||
@end table
|
||||
|
||||
|
||||
|
||||
@node omp_target_associate_ptr
|
||||
@subsection @code{omp_target_associate_ptr} -- Associate a device pointer with a host pointer
|
||||
|
|
|
@ -347,6 +347,10 @@ extern int omp_target_memcpy_rect_async (void *, const void *, __SIZE_TYPE__,
|
|||
const __SIZE_TYPE__ *, int, int, int,
|
||||
omp_depend_t * __GOMP_DEFAULT_NULL)
|
||||
__GOMP_NOTHROW;
|
||||
extern void *omp_target_memset (void *, int, __SIZE_TYPE__, int) __GOMP_NOTHROW;
|
||||
extern void *omp_target_memset_async (void *, int, __SIZE_TYPE__, int,
|
||||
int, omp_depend_t * __GOMP_DEFAULT_NULL)
|
||||
__GOMP_NOTHROW;
|
||||
extern int omp_target_associate_ptr (const void *, const void *, __SIZE_TYPE__,
|
||||
__SIZE_TYPE__, int) __GOMP_NOTHROW;
|
||||
extern int omp_target_disassociate_ptr (const void *, int) __GOMP_NOTHROW;
|
||||
|
|
|
@ -903,6 +903,29 @@
|
|||
end function omp_target_memcpy_rect_async
|
||||
end interface
|
||||
|
||||
interface
|
||||
function omp_target_memset (ptr, val, count, device_num) bind(c)
|
||||
use, intrinsic :: iso_c_binding, only : c_ptr, c_int, c_size_t
|
||||
type(c_ptr) :: omp_target_memset
|
||||
type(c_ptr), value :: ptr
|
||||
integer(c_size_t), value :: count
|
||||
integer(c_int), value :: val, device_num
|
||||
end function omp_target_memset
|
||||
end interface
|
||||
|
||||
interface
|
||||
function omp_target_memset_async (ptr, val, count, device_num, &
|
||||
depobj_count, depobj_list) bind(c)
|
||||
use, intrinsic :: iso_c_binding, only : c_ptr, c_int, c_size_t
|
||||
import :: omp_depend_kind
|
||||
type(c_ptr) :: omp_target_memset_async
|
||||
type(c_ptr), value :: ptr
|
||||
integer(c_size_t), value :: count
|
||||
integer(c_int), value :: val, device_num, depobj_count
|
||||
integer(omp_depend_kind), optional :: depobj_list(*)
|
||||
end function omp_target_memset_async
|
||||
end interface
|
||||
|
||||
interface
|
||||
function omp_target_associate_ptr (host_ptr, device_ptr, size, &
|
||||
device_offset, device_num) bind(c)
|
||||
|
|
|
@ -504,6 +504,31 @@
|
|||
end function omp_target_memcpy_rect_async
|
||||
end interface
|
||||
|
||||
interface
|
||||
function omp_target_memset (ptr, val, count, device_num) bind(c)
|
||||
use, intrinsic :: iso_c_binding, only : c_ptr, c_int, c_size_t
|
||||
type(c_ptr) omp_target_memset
|
||||
type(c_ptr), value :: ptr
|
||||
integer(c_size_t), value :: count
|
||||
integer(c_int), value :: val, device_num
|
||||
end function omp_target_memset
|
||||
end interface
|
||||
|
||||
interface
|
||||
function omp_target_memset_async (ptr, val, count, device_num, &
|
||||
& depobj_count, depobj_list) &
|
||||
& bind(c)
|
||||
use, intrinsic :: iso_c_binding, only : c_ptr, c_int, c_size_t
|
||||
import :: omp_depend_kind
|
||||
type(c_ptr) :: omp_target_memset_async
|
||||
type(c_ptr), value :: ptr
|
||||
integer(c_size_t), value :: count
|
||||
integer(c_int), value :: val, device_num, depobj_count
|
||||
integer(omp_depend_kind), optional :: depobj_list(*)
|
||||
end function omp_target_memset_async
|
||||
end interface
|
||||
|
||||
|
||||
interface
|
||||
function omp_target_associate_ptr (host_ptr, device_ptr, size, &
|
||||
& device_offset, device_num) &
|
||||
|
|
|
@ -42,6 +42,7 @@ CUDA_ONE_CALL (cuMemcpyHtoDAsync)
|
|||
CUDA_ONE_CALL (cuMemcpy2D)
|
||||
CUDA_ONE_CALL (cuMemcpy2DUnaligned)
|
||||
CUDA_ONE_CALL (cuMemcpy3D)
|
||||
CUDA_ONE_CALL (cuMemsetD8)
|
||||
CUDA_ONE_CALL (cuMemFree)
|
||||
CUDA_ONE_CALL (cuMemFreeHost)
|
||||
CUDA_ONE_CALL (cuMemGetAddressRange)
|
||||
|
|
|
@ -208,6 +208,8 @@ struct hsa_runtime_fn_info
|
|||
hsa_status_t (*hsa_code_object_deserialize_fn)
|
||||
(void *serialized_code_object, size_t serialized_code_object_size,
|
||||
const char *options, hsa_code_object_t *code_object);
|
||||
hsa_status_t (*hsa_amd_memory_fill_fn)(void *ptr, uint32_t value,
|
||||
size_t count);
|
||||
hsa_status_t (*hsa_amd_memory_lock_fn)
|
||||
(void *host_ptr, size_t size, hsa_agent_t *agents, int num_agent,
|
||||
void **agent_ptr);
|
||||
|
@ -1456,6 +1458,7 @@ init_hsa_runtime_functions (void)
|
|||
DLSYM_FN (hsa_signal_load_acquire)
|
||||
DLSYM_FN (hsa_queue_destroy)
|
||||
DLSYM_FN (hsa_code_object_deserialize)
|
||||
DLSYM_OPT_FN (hsa_amd_memory_fill)
|
||||
DLSYM_OPT_FN (hsa_amd_memory_lock)
|
||||
DLSYM_OPT_FN (hsa_amd_memory_unlock)
|
||||
DLSYM_OPT_FN (hsa_amd_memory_async_copy_rect)
|
||||
|
@ -4435,6 +4438,83 @@ init_hip_runtime_functions (void)
|
|||
return true;
|
||||
}
|
||||
|
||||
bool
|
||||
GOMP_OFFLOAD_memset (int ord, void *ptr, int val, size_t count)
|
||||
{
|
||||
hsa_status_t status = HSA_STATUS_SUCCESS;
|
||||
|
||||
/* A memset feature is only provided via hsa_amd_memory_fill; while it
|
||||
is fast, it is an HSA extension and it has two requirements: The memory
|
||||
must be aligned to multiples of 4 bytes - and, by construction, only
|
||||
multiples of 4 bytes can be filled (uint32_t value argument).
|
||||
|
||||
This means: Either not using that function or up to three function calls:
|
||||
- copy 1 to 3 bytes to get alignment (hsa_memory_copy), if unaligned
|
||||
- call hsa_amd_memory_fill
|
||||
- copy remaining 1 to 3 bytes (hsa_memory_copy), if after alignment
|
||||
count is not a multiple of 4 bytes.
|
||||
|
||||
Having more than one function call is only profitable if there is
|
||||
enough data to process; see below for the used heuristic values. */
|
||||
|
||||
uint8_t v8 = (uint8_t) val;
|
||||
size_t before = (4 - (uintptr_t) ptr % 4) % 4; /* 0 to 3 bytes. */
|
||||
size_t tail = (count - before) % 4; /* 0 to 3 bytes. */
|
||||
|
||||
/* Heuristic */
|
||||
enum {
|
||||
/* Prefer alloca to malloc up to ... */
|
||||
alloca_size = 256, /* bytes */
|
||||
/* Call hsa_amd_memory_fill also when two copy calls are required. */
|
||||
always_use_fill = 256*1024, /* bytes */
|
||||
/* Call hsa_amd_memory_fill also when on copy call is required. */
|
||||
use_fill_one_copy = (128+64)*1024 /* bytes */
|
||||
};
|
||||
|
||||
/* Do not call hsa_amd_memory_fill when any of the following conditions
|
||||
is true. Note that it is always preferred if available and
|
||||
before == tail == 0. */
|
||||
if (__builtin_expect (!hsa_fns.hsa_amd_memory_fill_fn, 0)
|
||||
|| (before && tail && count < always_use_fill)
|
||||
|| ((before || tail) && count < use_fill_one_copy))
|
||||
before = count;
|
||||
|
||||
/* Copy call for alignment - or all data, if condition above is true. */
|
||||
if (before)
|
||||
{
|
||||
void *data;
|
||||
if (before > alloca_size)
|
||||
data = malloc (before * sizeof (uint8_t));
|
||||
else
|
||||
data = alloca (before * sizeof (uint8_t));
|
||||
memset (data, val, before);
|
||||
status = hsa_fns.hsa_memory_copy_fn (ptr, data, before);
|
||||
if (before > alloca_size)
|
||||
free (data);
|
||||
if (data == 0 || status != HSA_STATUS_SUCCESS)
|
||||
goto fail;
|
||||
count -= before;
|
||||
}
|
||||
|
||||
if (count == 0)
|
||||
return true;
|
||||
|
||||
ptr += before;
|
||||
|
||||
uint32_t values = v8 | (v8 << 8) | (v8 << 16) | (v8 << 24);
|
||||
status = hsa_fns.hsa_amd_memory_fill_fn (ptr, values, count / 4);
|
||||
if (tail && status == HSA_STATUS_SUCCESS)
|
||||
{
|
||||
ptr += count - tail;
|
||||
status = hsa_fns.hsa_memory_copy_fn (ptr, &values, tail);
|
||||
}
|
||||
if (status == HSA_STATUS_SUCCESS)
|
||||
return true;
|
||||
|
||||
fail:
|
||||
GOMP_PLUGIN_error ("memory set failed");
|
||||
return false;
|
||||
}
|
||||
|
||||
void
|
||||
GOMP_OFFLOAD_interop (struct interop_obj_t *obj, int ord,
|
||||
|
|
|
@ -2297,6 +2297,15 @@ GOMP_OFFLOAD_memcpy3d (int dst_ord, int src_ord, size_t dim2_size,
|
|||
return true;
|
||||
}
|
||||
|
||||
bool
|
||||
GOMP_OFFLOAD_memset (int ord, void *ptr, int val, size_t count)
|
||||
{
|
||||
if (!nvptx_attach_host_thread_to_device (ord))
|
||||
return false;
|
||||
CUDA_CALL (cuMemsetD8, (CUdeviceptr) ptr, (unsigned char) val, count);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool
|
||||
GOMP_OFFLOAD_openacc_async_host2dev (int ord, void *dst, const void *src,
|
||||
size_t n, struct goacc_asyncqueue *aq)
|
||||
|
|
|
@ -5003,6 +5003,88 @@ omp_target_memcpy_rect_async (void *dst, const void *src, size_t element_size,
|
|||
return 0;
|
||||
}
|
||||
|
||||
static void
|
||||
omp_target_memset_int (void *ptr, int val, size_t count,
|
||||
struct gomp_device_descr *devicep)
|
||||
{
|
||||
if (__builtin_expect (count == 0, 0))
|
||||
return;
|
||||
if (devicep == NULL)
|
||||
{
|
||||
memset (ptr, val, count);
|
||||
return;
|
||||
}
|
||||
|
||||
gomp_mutex_lock (&devicep->lock);
|
||||
int ret = devicep->memset_func (devicep->target_id, ptr, val, count);
|
||||
gomp_mutex_unlock (&devicep->lock);
|
||||
if (!ret)
|
||||
gomp_fatal ("omp_target_memset failed");
|
||||
}
|
||||
|
||||
void*
|
||||
omp_target_memset (void *ptr, int val, size_t count, int device_num)
|
||||
{
|
||||
struct gomp_device_descr *devicep;
|
||||
if (device_num == omp_initial_device
|
||||
|| device_num == gomp_get_num_devices ()
|
||||
|| (devicep = resolve_device (device_num, false)) == NULL
|
||||
|| !(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400)
|
||||
|| devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
|
||||
devicep = NULL;
|
||||
|
||||
omp_target_memset_int (ptr, val, count, devicep);
|
||||
return ptr;
|
||||
}
|
||||
|
||||
typedef struct
|
||||
{
|
||||
void *ptr;
|
||||
size_t count;
|
||||
struct gomp_device_descr *devicep;
|
||||
int val;
|
||||
} omp_target_memset_data;
|
||||
|
||||
static void
|
||||
omp_target_memset_async_helper (void *args)
|
||||
{
|
||||
omp_target_memset_data *a = args;
|
||||
omp_target_memset_int (a->ptr, a->val, a->count, a->devicep);
|
||||
}
|
||||
|
||||
void*
|
||||
omp_target_memset_async (void *ptr, int val, size_t count, int device_num,
|
||||
int depobj_count, omp_depend_t *depobj_list)
|
||||
{
|
||||
void *depend[depobj_count + 5];
|
||||
struct gomp_device_descr *devicep;
|
||||
unsigned flags = 0;
|
||||
int i;
|
||||
|
||||
if (device_num == omp_initial_device
|
||||
|| device_num == gomp_get_num_devices ()
|
||||
|| (devicep = resolve_device (device_num, false)) == NULL
|
||||
|| !(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400)
|
||||
|| devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
|
||||
devicep = NULL;
|
||||
|
||||
omp_target_memset_data s = {.ptr = ptr, .val = val, .count = count,
|
||||
.devicep = devicep};
|
||||
if (depobj_count > 0 && depobj_list != NULL)
|
||||
{
|
||||
flags |= GOMP_TASK_FLAG_DEPEND;
|
||||
depend[0] = 0;
|
||||
depend[1] = (void *) (uintptr_t) depobj_count;
|
||||
depend[2] = depend[3] = depend[4] = 0;
|
||||
for (i = 0; i < depobj_count; ++i)
|
||||
depend[i + 5] = &depobj_list[i];
|
||||
}
|
||||
|
||||
GOMP_task (omp_target_memset_async_helper, &s, NULL, sizeof (s),
|
||||
__alignof__ (s), true, flags, depend, 0, NULL);
|
||||
return ptr;
|
||||
}
|
||||
|
||||
int
|
||||
omp_target_associate_ptr (const void *host_ptr, const void *device_ptr,
|
||||
size_t size, size_t device_offset, int device_num)
|
||||
|
@ -5568,6 +5650,7 @@ gomp_load_plugin_for_device (struct gomp_device_descr *device,
|
|||
DLSYM_OPT (async_run, async_run);
|
||||
DLSYM_OPT (can_run, can_run);
|
||||
DLSYM (dev2dev);
|
||||
DLSYM (memset);
|
||||
}
|
||||
if (device->capabilities & GOMP_OFFLOAD_CAP_OPENACC_200)
|
||||
{
|
||||
|
|
|
@ -0,0 +1,62 @@
|
|||
// PR libgomp/120444
|
||||
// Async version
|
||||
|
||||
#include <omp.h>
|
||||
|
||||
int main()
|
||||
{
|
||||
#pragma omp parallel for
|
||||
for (int dev = omp_initial_device; dev <= omp_get_num_devices (); dev++)
|
||||
{
|
||||
char *ptr = (char *) omp_target_alloc (sizeof(int) * 1024, dev);
|
||||
|
||||
omp_depend_t dep;
|
||||
#pragma omp depobj(dep) depend(inout: ptr)
|
||||
|
||||
/* Play also around with the alignment - as hsa_amd_memory_fill operates
|
||||
on multiples of 4 bytes (uint32_t). */
|
||||
|
||||
for (int start = 0; start < 32; start++)
|
||||
for (int tail = 0; tail < 32; tail++)
|
||||
{
|
||||
unsigned char val = '0' + start + tail;
|
||||
#if __cplusplus
|
||||
void *ptr2 = omp_target_memset_async (ptr + start, val,
|
||||
1024 - start - tail, dev, 0);
|
||||
#else
|
||||
void *ptr2 = omp_target_memset_async (ptr + start, val,
|
||||
1024 - start - tail, dev, 0, nullptr);
|
||||
#endif
|
||||
if (ptr + start != ptr2)
|
||||
__builtin_abort ();
|
||||
|
||||
#pragma omp taskwait
|
||||
|
||||
#pragma omp target device(dev) is_device_ptr(ptr) depend(depobj: dep) nowait
|
||||
for (int i = start; i < 1024 - start - tail; i++)
|
||||
{
|
||||
if (ptr[i] != val)
|
||||
__builtin_abort ();
|
||||
ptr[i] += 2;
|
||||
}
|
||||
|
||||
omp_target_memset_async (ptr + start, val + 3,
|
||||
1024 - start - tail, dev, 1, &dep);
|
||||
|
||||
#pragma omp target device(dev) is_device_ptr(ptr) depend(depobj: dep) nowait
|
||||
for (int i = start; i < 1024 - start - tail; i++)
|
||||
{
|
||||
if (ptr[i] != val + 3)
|
||||
__builtin_abort ();
|
||||
ptr[i] += 1;
|
||||
}
|
||||
|
||||
omp_target_memset_async (ptr + start, val - 3,
|
||||
1024 - start - tail, dev, 1, &dep);
|
||||
|
||||
#pragma omp taskwait depend (depobj: dep)
|
||||
}
|
||||
#pragma omp depobj(dep) destroy
|
||||
omp_target_free (ptr, dev);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,80 @@
|
|||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <omp.h>
|
||||
|
||||
#define MIN(x,y) ((x) < (y) ? x : y)
|
||||
|
||||
enum { N = 524288 + 8 };
|
||||
|
||||
static void
|
||||
init_val (int8_t *ptr, int val, size_t count)
|
||||
{
|
||||
#pragma omp target is_device_ptr(ptr) firstprivate(val, count)
|
||||
__builtin_memset (ptr, val, count);
|
||||
}
|
||||
|
||||
static void
|
||||
check_val (int8_t *ptr, int val, size_t count)
|
||||
{
|
||||
if (count == 0)
|
||||
return;
|
||||
#pragma omp target is_device_ptr(ptr) firstprivate(val, count)
|
||||
for (size_t i = 0; i < count; i++)
|
||||
if (ptr[i] != val) __builtin_abort ();
|
||||
}
|
||||
|
||||
static void
|
||||
test_it (void *ptr, int lshift, size_t count)
|
||||
{
|
||||
if (N < count + lshift) __builtin_abort ();
|
||||
if (lshift >= 4) __builtin_abort ();
|
||||
ptr += lshift;
|
||||
|
||||
init_val (ptr, 'z', MIN (count + 32, N - lshift));
|
||||
|
||||
omp_target_memset (ptr, '1', count, omp_get_default_device());
|
||||
|
||||
check_val (ptr, '1', count);
|
||||
check_val (ptr + count, 'z', MIN (32, N - lshift - count));
|
||||
}
|
||||
|
||||
|
||||
int main()
|
||||
{
|
||||
size_t size;
|
||||
void *ptr = omp_target_alloc (N + 3, omp_get_default_device());
|
||||
ptr += (4 - (uintptr_t) ptr % 4) % 4;
|
||||
if ((uintptr_t) ptr % 4 != 0) __builtin_abort ();
|
||||
|
||||
test_it (ptr, 0, 1);
|
||||
test_it (ptr, 3, 1);
|
||||
test_it (ptr, 0, 4);
|
||||
test_it (ptr, 3, 4);
|
||||
test_it (ptr, 0, 5);
|
||||
test_it (ptr, 3, 5);
|
||||
test_it (ptr, 0, 6);
|
||||
test_it (ptr, 3, 6);
|
||||
|
||||
for (int i = 1; i <= 9; i++)
|
||||
{
|
||||
switch (i)
|
||||
{
|
||||
case 1: size = 16; break; // = 2^4 bytes
|
||||
case 2: size = 32; break; // = 2^5 bytes
|
||||
case 3: size = 64; break; // = 2^7 bytes
|
||||
case 4: size = 128; break; // = 2^7 bytes
|
||||
case 5: size = 256; break; // = 2^8 bytes
|
||||
case 6: size = 512; break; // = 2^9 bytes
|
||||
case 7: size = 65536; break; // = 2^16 bytes
|
||||
case 8: size = 262144; break; // = 2^18 bytes
|
||||
case 9: size = 524288; break; // = 2^20 bytes
|
||||
default: __builtin_abort ();
|
||||
}
|
||||
test_it (ptr, 0, size);
|
||||
test_it (ptr, 3, size);
|
||||
test_it (ptr, 0, size + 1);
|
||||
test_it (ptr, 3, size + 1);
|
||||
test_it (ptr, 3, size + 2);
|
||||
}
|
||||
omp_target_free (ptr, omp_get_default_device());
|
||||
}
|
|
@ -0,0 +1,62 @@
|
|||
// PR libgomp/120444
|
||||
|
||||
#include <omp.h>
|
||||
|
||||
int main()
|
||||
{
|
||||
for (int dev = omp_initial_device; dev < omp_get_num_devices (); dev++)
|
||||
{
|
||||
char *ptr = (char *) omp_target_alloc (sizeof(int) * 1024, dev);
|
||||
|
||||
/* Play also around with the alignment - as hsa_amd_memory_fill operates
|
||||
on multiples of 4 bytes (uint32_t). */
|
||||
|
||||
for (int start = 0; start < 32; start++)
|
||||
for (int tail = 0; tail < 32; tail++)
|
||||
{
|
||||
unsigned char val = '0' + start + tail;
|
||||
void *ptr2 = omp_target_memset (ptr + start, val,
|
||||
1024 - start - tail, dev);
|
||||
if (ptr + start != ptr2)
|
||||
__builtin_abort ();
|
||||
|
||||
#pragma omp target device(dev) is_device_ptr(ptr)
|
||||
for (int i = start; i < 1024 - start - tail; i++)
|
||||
if (ptr[i] != val)
|
||||
__builtin_abort ();
|
||||
|
||||
}
|
||||
|
||||
/* Check 'small' values for correctness. */
|
||||
|
||||
for (int start = 0; start < 32; start++)
|
||||
for (int size = 0; size <= 64 + 32; size++)
|
||||
{
|
||||
omp_target_memset (ptr, 'a' - 2, 1024, dev);
|
||||
|
||||
unsigned char val = '0' + start + size % 32;
|
||||
void *ptr2 = omp_target_memset (ptr + start, val, size, dev);
|
||||
|
||||
if (ptr + start != ptr2)
|
||||
__builtin_abort ();
|
||||
|
||||
if (size == 0)
|
||||
continue;
|
||||
|
||||
#pragma omp target device(dev) is_device_ptr(ptr)
|
||||
{
|
||||
for (int i = 0; i < start; i++)
|
||||
if (ptr[i] != 'a' - 2)
|
||||
__builtin_abort ();
|
||||
for (int i = start; i < start + size; i++)
|
||||
if (ptr[i] != val)
|
||||
__builtin_abort ();
|
||||
for (int i = start + size + 1; i < 1024; i++)
|
||||
if (ptr[i] != 'a' - 2)
|
||||
__builtin_abort ();
|
||||
}
|
||||
}
|
||||
|
||||
omp_target_free (ptr, dev);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,67 @@
|
|||
! PR libgomp/120444
|
||||
! Async version
|
||||
|
||||
use omp_lib
|
||||
use iso_c_binding
|
||||
implicit none (type, external)
|
||||
integer(c_int) :: dev
|
||||
|
||||
!$omp parallel do
|
||||
do dev = omp_initial_device, omp_get_num_devices ()
|
||||
block
|
||||
integer(c_int) :: i, val, start, tail
|
||||
type(c_ptr) :: ptr, ptr2, tmpptr
|
||||
integer(c_int8_t), pointer, contiguous :: fptr(:)
|
||||
integer(c_intptr_t) :: intptr
|
||||
integer(c_size_t), parameter :: count = 1024
|
||||
integer(omp_depend_kind) :: dep(1)
|
||||
|
||||
ptr = omp_target_alloc (count, dev)
|
||||
|
||||
!$omp depobj(dep(1)) depend(inout: ptr)
|
||||
|
||||
! Play also around with the alignment - as hsa_amd_memory_fill operates
|
||||
! on multiples of 4 bytes (c_int32_t)
|
||||
|
||||
do start = 0, 31
|
||||
do tail = 0, 31
|
||||
val = iachar('0') + start + tail
|
||||
|
||||
tmpptr = transfer (transfer (ptr, intptr) + start, tmpptr)
|
||||
ptr2 = omp_target_memset_async (tmpptr, val, count - start - tail, dev, 0)
|
||||
|
||||
if (.not. c_associated (tmpptr, ptr2)) stop 1
|
||||
|
||||
!$omp taskwait
|
||||
|
||||
!$omp target device(dev) is_device_ptr(ptr) depend(depobj: dep(1)) nowait
|
||||
do i = 1 + start, int(count, c_int) - start - tail
|
||||
call c_f_pointer (ptr, fptr, [count])
|
||||
if (fptr(i) /= int (val, c_int8_t)) stop 2
|
||||
fptr(i) = fptr(i) + 2_c_int8_t
|
||||
end do
|
||||
!$omp end target
|
||||
|
||||
ptr2 = omp_target_memset_async (tmpptr, val + 3, &
|
||||
count - start - tail, dev, 1, dep)
|
||||
|
||||
!$omp target device(dev) is_device_ptr(ptr) depend(depobj: dep(1)) nowait
|
||||
do i = 1 + start, int(count, c_int) - start - tail
|
||||
call c_f_pointer (ptr, fptr, [count])
|
||||
if (fptr(i) /= int (val + 3, c_int8_t)) stop 3
|
||||
fptr(i) = fptr(i) - 1_c_int8_t
|
||||
end do
|
||||
!$omp end target
|
||||
|
||||
ptr2 = omp_target_memset_async (tmpptr, val - 3, &
|
||||
count - start - tail, dev, 1, dep)
|
||||
|
||||
!$omp taskwait depend (depobj: dep(1))
|
||||
end do
|
||||
end do
|
||||
|
||||
!$omp depobj(dep(1)) destroy
|
||||
call omp_target_free (ptr, dev);
|
||||
end block
|
||||
end do
|
||||
end
|
|
@ -0,0 +1,39 @@
|
|||
! PR libgomp/120444
|
||||
|
||||
use omp_lib
|
||||
use iso_c_binding
|
||||
implicit none (type, external)
|
||||
|
||||
integer(c_int) :: dev, i, val, start, tail
|
||||
type(c_ptr) :: ptr, ptr2, tmpptr
|
||||
integer(c_int8_t), pointer, contiguous :: fptr(:)
|
||||
integer(c_intptr_t) :: intptr
|
||||
integer(c_size_t), parameter :: count = 1024
|
||||
|
||||
do dev = omp_initial_device, omp_get_num_devices ()
|
||||
ptr = omp_target_alloc (count, dev)
|
||||
|
||||
! Play also around with the alignment - as hsa_amd_memory_fill operates
|
||||
! on multiples of 4 bytes (c_int32_t)
|
||||
|
||||
do start = 0, 31
|
||||
do tail = 0, 31
|
||||
val = iachar('0') + start + tail
|
||||
|
||||
tmpptr = transfer (transfer (ptr, intptr) + start, tmpptr)
|
||||
ptr2 = omp_target_memset (tmpptr, val, count - start - tail, dev)
|
||||
|
||||
if (.not. c_associated (tmpptr, ptr2)) stop 1
|
||||
|
||||
!$omp target device(dev) is_device_ptr(ptr)
|
||||
do i = 1 + start, int(count, c_int) - start - tail
|
||||
call c_f_pointer (ptr, fptr, [count])
|
||||
if (fptr(i) /= int (val, c_int8_t)) stop 2
|
||||
end do
|
||||
!$omp end target
|
||||
end do
|
||||
end do
|
||||
|
||||
call omp_target_free (ptr, dev);
|
||||
end do
|
||||
end
|
Loading…
Reference in New Issue