diff --git a/gcc/fortran/openmp.cc b/gcc/fortran/openmp.cc index 9c9d0c2c5340..81d624b7b54a 100644 --- a/gcc/fortran/openmp.cc +++ b/gcc/fortran/openmp.cc @@ -8359,9 +8359,9 @@ resolve_omp_udr_clause (gfc_omp_namelist *n, gfc_namespace *ns, } /* Assume that a constant expression in the range 1 (omp_default_mem_alloc) - to 8 (omp_thread_mem_alloc) range, or 200 (ompx_gnu_pinned_mem_alloc) is - fine. The original symbol name is already lost during matching via - gfc_match_expr. */ + to GOMP_OMP_PREDEF_ALLOC_MAX, or GOMP_OMPX_PREDEF_ALLOC_MIN to + GOMP_OMPX_PREDEF_ALLOC_MAX is fine. The original symbol name is already + lost during matching via gfc_match_expr. */ static bool is_predefined_allocator (gfc_expr *expr) { diff --git a/include/cuda/cuda.h b/include/cuda/cuda.h index 6be1ac0ab438..28510a3150cb 100644 --- a/include/cuda/cuda.h +++ b/include/cuda/cuda.h @@ -87,6 +87,10 @@ typedef enum { CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS = 88 } CUdevice_attribute; +typedef enum { + CU_MEM_ATTACH_GLOBAL = 0x1 +} CUmemAttach_flags; + enum { CU_EVENT_DEFAULT = 0, CU_EVENT_DISABLE_TIMING = 2 @@ -254,6 +258,7 @@ CUresult cuMemAlloc (CUdeviceptr *, size_t); #define cuMemAllocHost cuMemAllocHost_v2 CUresult cuMemAllocHost (void **, size_t); CUresult cuMemHostAlloc (void **, size_t, unsigned int); +CUresult cuMemAllocManaged(CUdeviceptr *, size_t, unsigned int); CUresult cuMemcpy (CUdeviceptr, CUdeviceptr, size_t); CUresult cuMemcpyPeer (CUdeviceptr, CUcontext, CUdeviceptr, CUcontext, size_t); CUresult cuMemcpyPeerAsync (CUdeviceptr, CUcontext, CUdeviceptr, CUcontext, size_t, CUstream); diff --git a/include/gomp-constants.h b/include/gomp-constants.h index 963436aee560..db55b2600531 100644 --- a/include/gomp-constants.h +++ b/include/gomp-constants.h @@ -395,7 +395,12 @@ enum gomp_map_kind /* Predefined allocator value ranges. */ #define GOMP_OMP_PREDEF_ALLOC_MAX 8 #define GOMP_OMPX_PREDEF_ALLOC_MIN 200 -#define GOMP_OMPX_PREDEF_ALLOC_MAX 200 +#define GOMP_OMPX_PREDEF_ALLOC_MAX 201 + +/* Predefined memspace value ranges. */ +#define GOMP_OMP_PREDEF_MEMSPACE_MAX 4 +#define GOMP_OMPX_PREDEF_MEMSPACE_MIN 200 +#define GOMP_OMPX_PREDEF_MEMSPACE_MAX 200 /* Predefined allocator with access == thread. */ #define GOMP_OMP_PREDEF_ALLOC_THREADS 8 diff --git a/libgomp/allocator.c b/libgomp/allocator.c index 4a683d90bba3..44c41cadd0bb 100644 --- a/libgomp/allocator.c +++ b/libgomp/allocator.c @@ -100,34 +100,57 @@ GOMP_is_alloc (void *ptr) #define omp_max_predefined_alloc omp_thread_mem_alloc #define ompx_gnu_min_predefined_alloc ompx_gnu_pinned_mem_alloc -#define ompx_gnu_max_predefined_alloc ompx_gnu_pinned_mem_alloc +#define ompx_gnu_max_predefined_alloc ompx_gnu_managed_mem_alloc _Static_assert (GOMP_OMP_PREDEF_ALLOC_MAX == omp_thread_mem_alloc, "GOMP_OMP_PREDEF_ALLOC_MAX == omp_thread_mem_alloc"); _Static_assert (GOMP_OMPX_PREDEF_ALLOC_MIN == ompx_gnu_min_predefined_alloc, - "GOMP_OMP_PREDEF_ALLOC_MAX == omp_thread_mem_alloc"); + "GOMP_OMPX_PREDEF_ALLOC_MIN == ompx_gnu_min_predefined_alloc"); _Static_assert (GOMP_OMPX_PREDEF_ALLOC_MAX == ompx_gnu_max_predefined_alloc, - "GOMP_OMP_PREDEF_ALLOC_MAX == omp_thread_mem_alloc"); + "GOMP_OMPX_PREDEF_ALLOC_MAX == ompx_gnu_max_predefined_alloc"); _Static_assert (GOMP_OMP_PREDEF_ALLOC_THREADS == omp_thread_mem_alloc, "GOMP_OMP_PREDEF_ALLOC_THREADS == omp_thread_mem_alloc"); +#define omp_max_predefined_mem_space omp_low_lat_mem_space +#define ompx_gnu_min_predefined_mem_space ompx_gnu_managed_mem_space +#define ompx_gnu_max_predefined_mem_space ompx_gnu_managed_mem_space + +_Static_assert (GOMP_OMP_PREDEF_MEMSPACE_MAX == omp_max_predefined_mem_space, + "GOMP_OMP_PREDEF_MEMSPACE_MAX == omp_max_predefined_mem_space"); +_Static_assert (GOMP_OMPX_PREDEF_MEMSPACE_MIN == ompx_gnu_min_predefined_mem_space, + "GOMP_OMPX_PREDEF_MEMSPACE_MIN == ompx_gnu_min_predefined_mem_space"); +_Static_assert (GOMP_OMPX_PREDEF_MEMSPACE_MAX == ompx_gnu_max_predefined_mem_space, + "GOMP_OMPX_PREDEF_MEMSPACE_MAX == ompx_gnu_max_predefined_mem_space"); + +#if 0 /* For testing the fall-back macros compile, only. */ +#undef MEMSPACE_ALLOC +#undef MEMSPACE_CALLOC +#undef MEMSPACE_REALLOC +#undef MEMSPACE_FREE +#undef MEMSPACE_VALIDATE +#endif + /* These macros may be overridden in config//allocator.c. The defaults (no override) are to return NULL for pinned memory requests - and pass through to the regular OS calls otherwise. + or non-standard memory spaces (these need a deliberate implementation), and + pass through to the regular OS calls otherwise. The following definitions (ab)use comma operators to avoid unused variable errors. */ #ifndef MEMSPACE_ALLOC #define MEMSPACE_ALLOC(MEMSPACE, SIZE, PIN) \ - (PIN ? NULL : malloc (((void)(MEMSPACE), (SIZE)))) + ((PIN) || (MEMSPACE) > GOMP_OMP_PREDEF_MEMSPACE_MAX \ + ? NULL : malloc (((void)(MEMSPACE), (SIZE)))) #endif #ifndef MEMSPACE_CALLOC #define MEMSPACE_CALLOC(MEMSPACE, SIZE, PIN) \ - (PIN ? NULL : calloc (1, (((void)(MEMSPACE), (SIZE))))) + ((PIN) || (MEMSPACE) > GOMP_OMP_PREDEF_MEMSPACE_MAX \ + ? NULL : calloc (1, (((void)(MEMSPACE), (SIZE))))) #endif #ifndef MEMSPACE_REALLOC #define MEMSPACE_REALLOC(MEMSPACE, ADDR, OLDSIZE, SIZE, OLDPIN, PIN) \ - ((PIN) || (OLDPIN) ? NULL \ - : realloc (ADDR, (((void)(MEMSPACE), (void)(OLDSIZE), (SIZE))))) + ((PIN) || (OLDPIN) || (MEMSPACE) > GOMP_OMP_PREDEF_MEMSPACE_MAX \ + ? NULL \ + : realloc (ADDR, (((void)(MEMSPACE), (void)(OLDSIZE), (SIZE))))) #endif #ifndef MEMSPACE_FREE #define MEMSPACE_FREE(MEMSPACE, ADDR, SIZE, PIN) \ @@ -135,7 +158,8 @@ _Static_assert (GOMP_OMP_PREDEF_ALLOC_THREADS == omp_thread_mem_alloc, #endif #ifndef MEMSPACE_VALIDATE #define MEMSPACE_VALIDATE(MEMSPACE, ACCESS, PIN) \ - (PIN ? 0 : ((void)(MEMSPACE), (void)(ACCESS), 1)) + ((PIN) || (MEMSPACE) > GOMP_OMP_PREDEF_MEMSPACE_MAX \ + ? 0 : ((void)(MEMSPACE), (void)(ACCESS), 1)) #endif /* Map the predefined allocators to the correct memory space. @@ -155,6 +179,7 @@ static const omp_memspace_handle_t predefined_omp_alloc_mapping[] = { }; static const omp_memspace_handle_t predefined_ompx_gnu_alloc_mapping[] = { omp_default_mem_space, /* ompx_gnu_pinned_mem_alloc. */ + ompx_gnu_managed_mem_space, /* ompx_gnu_managed_mem_alloc. */ }; #define ARRAY_SIZE(A) (sizeof (A) / sizeof ((A)[0])) @@ -389,7 +414,9 @@ omp_init_allocator (omp_memspace_handle_t memspace, int ntraits, struct omp_allocator_data *ret; int i; - if (memspace > omp_low_lat_mem_space) + if (memspace > omp_max_predefined_mem_space + && (memspace < ompx_gnu_min_predefined_mem_space + || memspace > ompx_gnu_max_predefined_mem_space)) return omp_null_allocator; for (i = 0; i < ntraits; i++) switch (traits[i].key) diff --git a/libgomp/config/gcn/allocator.c b/libgomp/config/gcn/allocator.c index 92aa2db2cc6a..969cfa9ccd9e 100644 --- a/libgomp/config/gcn/allocator.c +++ b/libgomp/config/gcn/allocator.c @@ -56,8 +56,12 @@ gcn_memspace_alloc (omp_memspace_handle_t memspace, size_t size) return __gcn_lowlat_alloc (shared_pool, size); } + else if (memspace > GOMP_OMP_PREDEF_MEMSPACE_MAX) + /* No non-standard memspaces are implemented for device-side amdgcn. */ + return NULL; else return malloc (size); + } static void * @@ -69,6 +73,9 @@ gcn_memspace_calloc (omp_memspace_handle_t memspace, size_t size) return __gcn_lowlat_calloc (shared_pool, size); } + else if (memspace > GOMP_OMP_PREDEF_MEMSPACE_MAX) + /* No non-standard memspaces are implemented for device-side amdgcn. */ + return NULL; else return calloc (1, size); } @@ -96,6 +103,9 @@ gcn_memspace_realloc (omp_memspace_handle_t memspace, void *addr, return __gcn_lowlat_realloc (shared_pool, addr, oldsize, size); } + else if (memspace > GOMP_OMP_PREDEF_MEMSPACE_MAX) + /* No non-standard memspaces are implemented for device-side amdgcn. */ + return NULL; else return realloc (addr, size); } @@ -105,8 +115,14 @@ gcn_memspace_validate (omp_memspace_handle_t memspace, unsigned access) { /* Disallow use of low-latency memory when it must be accessible by all threads. */ - return (memspace != omp_low_lat_mem_space - || access != omp_atv_all); + if (memspace == omp_low_lat_mem_space + && access == omp_atv_all) + return false; + + /* Otherwise, standard memspaces are accepted, even when we don't have + anything special to do with them, and non-standard memspaces are assumed + to need explicit support. */ + return (memspace <= GOMP_OMP_PREDEF_MEMSPACE_MAX); } #define MEMSPACE_ALLOC(MEMSPACE, SIZE, PIN) \ diff --git a/libgomp/config/linux/allocator.c b/libgomp/config/linux/allocator.c index f957bb3421ac..c144c5972835 100644 --- a/libgomp/config/linux/allocator.c +++ b/libgomp/config/linux/allocator.c @@ -80,7 +80,9 @@ linux_memspace_alloc (omp_memspace_handle_t memspace, size_t size, int pin, { void *addr = NULL; - if (pin) + if (memspace == ompx_gnu_managed_mem_space) + addr = gomp_managed_alloc (size); + else if (pin) { int using_device = __atomic_load_n (&using_device_for_page_locked, MEMMODEL_RELAXED); @@ -155,7 +157,15 @@ linux_memspace_alloc (omp_memspace_handle_t memspace, size_t size, int pin, static void * linux_memspace_calloc (omp_memspace_handle_t memspace, size_t size, int pin) { - if (pin) + if (memspace == ompx_gnu_managed_mem_space) + { + void *ret = gomp_managed_alloc (size); + if (!ret) + return NULL; + memset (ret, 0, size); + return ret; + } + else if (pin) return linux_memspace_alloc (memspace, size, pin, true); else return calloc (1, size); @@ -165,7 +175,9 @@ static void linux_memspace_free (omp_memspace_handle_t memspace, void *addr, size_t size, int pin) { - if (pin) + if (memspace == ompx_gnu_managed_mem_space) + gomp_managed_free (addr); + else if (pin) { int using_device = __atomic_load_n (&using_device_for_page_locked, @@ -186,7 +198,10 @@ static void * linux_memspace_realloc (omp_memspace_handle_t memspace, void *addr, size_t oldsize, size_t size, int oldpin, int pin) { - if (oldpin && pin) + if (memspace == ompx_gnu_managed_mem_space) + /* Realloc is not implemented for device Managed Memory. */ + ; + else if (oldpin && pin) { int using_device = __atomic_load_n (&using_device_for_page_locked, @@ -221,7 +236,8 @@ linux_memspace_realloc (omp_memspace_handle_t memspace, void *addr, static int linux_memspace_validate (omp_memspace_handle_t, unsigned, int) { - /* Everything should be accepted on Linux, including pinning. */ + /* Everything should be accepted on Linux, including pinning and + non-standard memspaces. */ return 1; } diff --git a/libgomp/config/nvptx/allocator.c b/libgomp/config/nvptx/allocator.c index 7e9e343d2a94..8bbc14a49dbc 100644 --- a/libgomp/config/nvptx/allocator.c +++ b/libgomp/config/nvptx/allocator.c @@ -61,6 +61,9 @@ nvptx_memspace_alloc (omp_memspace_handle_t memspace, size_t size) return __nvptx_lowlat_alloc (shared_pool, size); } + else if (memspace > GOMP_OMP_PREDEF_MEMSPACE_MAX) + /* No non-standard memspaces are implemented for device-side nvptx. */ + return NULL; else return malloc (size); } @@ -75,6 +78,9 @@ nvptx_memspace_calloc (omp_memspace_handle_t memspace, size_t size) return __nvptx_lowlat_calloc (shared_pool, size); } + else if (memspace > GOMP_OMP_PREDEF_MEMSPACE_MAX) + /* No non-standard memspaces are implemented for device-side nvptx. */ + return NULL; else return calloc (1, size); } @@ -104,6 +110,9 @@ nvptx_memspace_realloc (omp_memspace_handle_t memspace, void *addr, return __nvptx_lowlat_realloc (shared_pool, addr, oldsize, size); } + else if (memspace > GOMP_OMP_PREDEF_MEMSPACE_MAX) + /* No non-standard memspaces are implemented for device-side nvptx. */ + return NULL; else return realloc (addr, size); } @@ -115,12 +124,19 @@ nvptx_memspace_validate (omp_memspace_handle_t memspace, unsigned access) || (__PTX_ISA_VERSION_MAJOR__ == 4 && __PTX_ISA_VERSION_MINOR >= 1) /* Disallow use of low-latency memory when it must be accessible by all threads. */ - return (memspace != omp_low_lat_mem_space - || access != omp_atv_all); + if (memspace == omp_low_lat_mem_space + && access == omp_atv_all) + return false; #else /* Low-latency memory is not available before PTX 4.1. */ - return (memspace != omp_low_lat_mem_space); + if (memspace == omp_low_lat_mem_space) + return false; #endif + + /* Otherwise, standard memspaces are accepted, even when we don't have + anything special to do with them, and non-standard memspaces are assumed + to need explicit support. */ + return (memspace <= GOMP_OMP_PREDEF_MEMSPACE_MAX); } #define MEMSPACE_ALLOC(MEMSPACE, SIZE, PIN) \ diff --git a/libgomp/env.c b/libgomp/env.c index f63a36afdd23..48bb7890e7b4 100644 --- a/libgomp/env.c +++ b/libgomp/env.c @@ -1231,6 +1231,12 @@ parse_affinity (bool ignore) return false; } +/* These are reminders to add new allocators to parse_allocator. */ +_Static_assert (GOMP_OMP_PREDEF_ALLOC_MAX == omp_thread_mem_alloc); +_Static_assert (GOMP_OMPX_PREDEF_ALLOC_MAX == ompx_gnu_managed_mem_alloc); +_Static_assert (GOMP_OMP_PREDEF_MEMSPACE_MAX == omp_low_lat_mem_space); +_Static_assert (GOMP_OMPX_PREDEF_MEMSPACE_MAX == ompx_gnu_managed_mem_space); + /* Parse the OMP_ALLOCATOR environment variable and return the value. */ static bool parse_allocator (const char *env, const char *val, void *const params[]) @@ -1249,12 +1255,12 @@ parse_allocator (const char *env, const char *val, void *const params[]) ++val; if (0) ; -#define C(v, m) \ +#define C(v, is_memspace) \ else if (strncasecmp (val, #v, sizeof (#v) - 1) == 0) \ { \ *ret = v; \ val += sizeof (#v) - 1; \ - memspace = m; \ + memspace = is_memspace; \ } C (omp_default_mem_alloc, false) C (omp_large_cap_mem_alloc, false) @@ -1265,11 +1271,13 @@ parse_allocator (const char *env, const char *val, void *const params[]) C (omp_pteam_mem_alloc, false) C (omp_thread_mem_alloc, false) C (ompx_gnu_pinned_mem_alloc, false) + C (ompx_gnu_managed_mem_alloc, false) C (omp_default_mem_space, true) C (omp_large_cap_mem_space, true) C (omp_const_mem_space, true) C (omp_high_bw_mem_space, true) C (omp_low_lat_mem_space, true) + C (ompx_gnu_managed_mem_space, true) #undef C else goto invalid; diff --git a/libgomp/libgomp-plugin.h b/libgomp/libgomp-plugin.h index f2baed9bad92..5b4704484dd0 100644 --- a/libgomp/libgomp-plugin.h +++ b/libgomp/libgomp-plugin.h @@ -171,6 +171,8 @@ extern int GOMP_OFFLOAD_load_image (int, unsigned, const void *, extern bool GOMP_OFFLOAD_unload_image (int, unsigned, const void *); extern void *GOMP_OFFLOAD_alloc (int, size_t); extern bool GOMP_OFFLOAD_free (int, void *); +extern void *GOMP_OFFLOAD_managed_alloc (int, size_t); +extern bool GOMP_OFFLOAD_managed_free (int, void *); extern bool GOMP_OFFLOAD_page_locked_host_alloc (void **, size_t); extern bool GOMP_OFFLOAD_page_locked_host_free (void *); extern bool GOMP_OFFLOAD_dev2host (int, void *, const void *, size_t); diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h index 3d406be175e4..ff445d1e90c6 100644 --- a/libgomp/libgomp.h +++ b/libgomp/libgomp.h @@ -1135,6 +1135,8 @@ extern int gomp_get_num_devices (void); extern bool gomp_target_task_fn (void *); extern void gomp_target_rev (uint64_t, uint64_t, uint64_t, uint64_t, uint64_t, int, struct goacc_asyncqueue *); +extern void *gomp_managed_alloc (size_t size); +extern void gomp_managed_free (void *device_ptr); extern bool gomp_page_locked_host_alloc (void **, size_t); extern void gomp_page_locked_host_free (void *); @@ -1421,6 +1423,8 @@ struct gomp_device_descr __typeof (GOMP_OFFLOAD_unload_image) *unload_image_func; __typeof (GOMP_OFFLOAD_alloc) *alloc_func; __typeof (GOMP_OFFLOAD_free) *free_func; + __typeof (GOMP_OFFLOAD_managed_alloc) *managed_alloc_func; + __typeof (GOMP_OFFLOAD_managed_free) *managed_free_func; __typeof (GOMP_OFFLOAD_page_locked_host_alloc) *page_locked_host_alloc_func; __typeof (GOMP_OFFLOAD_page_locked_host_free) *page_locked_host_free_func; __typeof (GOMP_OFFLOAD_dev2host) *dev2host_func; diff --git a/libgomp/libgomp.texi b/libgomp/libgomp.texi index c73cd8f87446..e08df90bc547 100644 --- a/libgomp/libgomp.texi +++ b/libgomp/libgomp.texi @@ -6924,6 +6924,7 @@ GCC supports the following predefined allocators and predefined memory spaces: @item omp_pteam_mem_alloc @tab omp_low_lat_mem_space (implementation defined) @item omp_thread_mem_alloc @tab omp_low_lat_mem_space (implementation defined) @item ompx_gnu_pinned_mem_alloc @tab omp_default_mem_space (GNU extension) +@item ompx_gnu_managed_mem_alloc @tab ompx_gnu_managed_mem_space (GNU extension) @end multitable Each predefined allocator, including @code{omp_null_allocator}, has a corresponding @@ -6951,6 +6952,7 @@ The following allocator templates are supported: @item omp_pteam_mem_alloc @tab omp::allocator::pteam_mem @item omp_thread_mem_alloc @tab omp::allocator::thread_mem @item ompx_gnu_pinned_mem_alloc @tab ompx::allocator::gnu_pinned_mem +@item ompx_gnu_managed_mem_alloc @tab ompx::allocator::gnu_managed_mem @end multitable The following traits are available when constructing a new allocator; @@ -7010,6 +7012,19 @@ For the memory spaces, the following applies: unless the memkind library is available @item @code{omp_high_bw_mem_space} maps to @code{omp_default_mem_space}, unless the memkind library is available +@item @code{ompx_gnu_managed_mem_space} is a GNU extension that provides + managed memory accessible by both host and devices. The memory space is + available if the offload target associated with the + @var{default-device-var} ICV supports managed memory (see + @ref{Offload-Target Specifics}). This memory is accessible by both the + host and the device at the same address, so it need not be mapped with + @code{map} clauses. Instead, use the @code{is_device_ptr} clause or + @code{has_device_addr} clause to indicate that the pointer is already + accessible on the device. If managed memory is not supported by the + default device, as configured at the moment the allocator is called, then + the allocator will use the fall-back setting. If the default device is + configured differently when the memory is freed, via @code{omp_free} or + @code{omp_realloc}, the result may be undefined. @end itemize On Linux systems, where the @uref{https://github.com/memkind/memkind, memkind @@ -7169,6 +7184,11 @@ The implementation remark: a performance boost for NVPTX offload code and also allows unlimited use of pinned memory regardless of the OS @code{ulimit}/@code{rlimit} settings. +@item Managed memory allocated with the OpenMP + @code{ompx_gnu_managed_mem_alloc} allocator or in the + @code{ompx_gnu_managed_mem_space} is not currently supported for AMD GPU + devices; attempting to use it in an allocator will trigger the fall-back + trait. @item The OpenMP routines @code{omp_target_memcpy_rect} and @code{omp_target_memcpy_rect_async} and the @code{target update} directive for non-contiguous list items use the 3D memory-copy function @@ -7331,6 +7351,20 @@ The implementation remark: @code{omp_thread_mem_alloc}, all use low-latency memory as first preference, and fall back to main graphics memory when the low-latency pool is exhausted. +@item Managed memory allocated on the host with the + @code{ompx_gnu_managed_mem_alloc} allocator or in the + @code{ompx_gnu_managed_mem_space} (both GNU extensions) allocate memory + in the CUDA Managed Memory space using @code{cuMemAllocManaged}. This + memory is accessible by both the host and the device at the same address, + so it need not be mapped with @code{map} clauses. Instead, use the + @code{is_device_ptr} clause or @code{has_device_addr} clause to indicate + that the pointer is already accessible on the device. The CUDA runtime + will automatically handle data migration between host and device as + needed. If managed memory is not supported by the default device, as + configured at the moment the allocator is called, then the allocator will + use the fall-back setting. If the default device is configured + differently when the memory is freed, via @code{omp_free} or + @code{omp_realloc}, the result may be undefined. @item The OpenMP routines @code{omp_target_memcpy_rect} and @code{omp_target_memcpy_rect_async} and the @code{target update} directive for non-contiguous list items use the 2D and 3D memory-copy diff --git a/libgomp/omp.h.in b/libgomp/omp.h.in index 566a3c28b942..329d8dc35e07 100644 --- a/libgomp/omp.h.in +++ b/libgomp/omp.h.in @@ -121,6 +121,7 @@ typedef enum omp_memspace_handle_t __GOMP_UINTPTR_T_ENUM omp_const_mem_space = 2, omp_high_bw_mem_space = 3, omp_low_lat_mem_space = 4, + ompx_gnu_managed_mem_space = 200, __omp_memspace_handle_t_max__ = __UINTPTR_MAX__ } omp_memspace_handle_t; @@ -136,6 +137,7 @@ typedef enum omp_allocator_handle_t __GOMP_UINTPTR_T_ENUM omp_pteam_mem_alloc = 7, omp_thread_mem_alloc = 8, ompx_gnu_pinned_mem_alloc = 200, + ompx_gnu_managed_mem_alloc = 201, __omp_allocator_handle_t_max__ = __UINTPTR_MAX__ } omp_allocator_handle_t; @@ -563,6 +565,10 @@ template struct gnu_pinned_mem : omp::allocator::__detail::__allocator_templ <__T, ompx_gnu_pinned_mem_alloc> {}; +template +struct gnu_managed_mem + : omp::allocator::__detail::__allocator_templ <__T, + ompx_gnu_managed_mem_alloc> {}; } /* namespace allocator */ diff --git a/libgomp/omp_lib.f90.in b/libgomp/omp_lib.f90.in index 74e0bfea344c..1b1a16320884 100644 --- a/libgomp/omp_lib.f90.in +++ b/libgomp/omp_lib.f90.in @@ -164,6 +164,8 @@ parameter :: omp_thread_mem_alloc = 8 integer (kind=omp_allocator_handle_kind), & parameter :: ompx_gnu_pinned_mem_alloc = 200 + integer (kind=omp_allocator_handle_kind), & + parameter :: ompx_gnu_managed_mem_alloc = 201 integer (omp_memspace_handle_kind), & parameter :: omp_default_mem_space = 0 integer (omp_memspace_handle_kind), & @@ -174,6 +176,8 @@ parameter :: omp_high_bw_mem_space = 3 integer (omp_memspace_handle_kind), & parameter :: omp_low_lat_mem_space = 4 + integer (omp_memspace_handle_kind), & + parameter :: ompx_gnu_managed_mem_space = 200 integer, parameter :: omp_initial_device = -1 integer, parameter :: omp_invalid_device = -4 integer, parameter :: omp_default_device = -5 diff --git a/libgomp/omp_lib.h.in b/libgomp/omp_lib.h.in index 9422515dc379..93305834545a 100644 --- a/libgomp/omp_lib.h.in +++ b/libgomp/omp_lib.h.in @@ -162,6 +162,7 @@ integer (omp_allocator_handle_kind) omp_pteam_mem_alloc integer (omp_allocator_handle_kind) omp_thread_mem_alloc integer (omp_allocator_handle_kind) ompx_gnu_pinned_mem_alloc + integer (omp_allocator_handle_kind) ompx_gnu_managed_mem_alloc parameter (omp_null_allocator = 0) parameter (omp_default_mem_alloc = 1) parameter (omp_large_cap_mem_alloc = 2) @@ -172,16 +173,19 @@ parameter (omp_pteam_mem_alloc = 7) parameter (omp_thread_mem_alloc = 8) parameter (ompx_gnu_pinned_mem_alloc = 200) + parameter (ompx_gnu_managed_mem_alloc = 201) integer (omp_memspace_handle_kind) omp_default_mem_space integer (omp_memspace_handle_kind) omp_large_cap_mem_space integer (omp_memspace_handle_kind) omp_const_mem_space integer (omp_memspace_handle_kind) omp_high_bw_mem_space integer (omp_memspace_handle_kind) omp_low_lat_mem_space + integer (omp_memspace_handle_kind) ompx_gnu_managed_mem_space parameter (omp_default_mem_space = 0) parameter (omp_large_cap_mem_space = 1) parameter (omp_const_mem_space = 2) parameter (omp_high_bw_mem_space = 3) parameter (omp_low_lat_mem_space = 4) + parameter (ompx_gnu_managed_mem_space = 200) integer omp_initial_device, omp_invalid_device, omp_default_device parameter (omp_initial_device = -1) parameter (omp_invalid_device = -4) diff --git a/libgomp/plugin/cuda-lib.def b/libgomp/plugin/cuda-lib.def index 7f4ddcc6bd1a..67c783d8566d 100644 --- a/libgomp/plugin/cuda-lib.def +++ b/libgomp/plugin/cuda-lib.def @@ -33,6 +33,7 @@ CUDA_ONE_CALL (cuLinkDestroy) CUDA_ONE_CALL (cuMemAlloc) CUDA_ONE_CALL (cuMemAllocHost) CUDA_ONE_CALL (cuMemHostAlloc) +CUDA_ONE_CALL (cuMemAllocManaged) CUDA_ONE_CALL (cuMemcpy) CUDA_ONE_CALL (cuMemcpyDtoDAsync) CUDA_ONE_CALL (cuMemcpyDtoH) diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c index 5ad66688e7ef..dd8bcf9c5070 100644 --- a/libgomp/plugin/plugin-nvptx.c +++ b/libgomp/plugin/plugin-nvptx.c @@ -1133,11 +1133,13 @@ nvptx_stacks_free (struct ptx_device *ptx_dev, bool force) } static void * -nvptx_alloc (size_t s, bool suppress_errors) +nvptx_alloc (size_t s, bool suppress_errors, bool managed) { CUdeviceptr d; - CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &d, s); + CUresult r = (managed ? CUDA_CALL_NOCHECK (cuMemAllocManaged, &d, s, + CU_MEM_ATTACH_GLOBAL) + : CUDA_CALL_NOCHECK (cuMemAlloc, &d, s)); if (suppress_errors && r == CUDA_ERROR_OUT_OF_MEMORY) return NULL; else if (r != CUDA_SUCCESS) @@ -1843,8 +1845,8 @@ GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data) return ret; } -void * -GOMP_OFFLOAD_alloc (int ord, size_t size) +static void * +cleanup_and_alloc (int ord, size_t size, bool managed) { if (!nvptx_attach_host_thread_to_device (ord)) return NULL; @@ -1867,7 +1869,7 @@ GOMP_OFFLOAD_alloc (int ord, size_t size) blocks = tmp; } - void *d = nvptx_alloc (size, true); + void *d = nvptx_alloc (size, true, managed); if (d) return d; else @@ -1875,10 +1877,22 @@ GOMP_OFFLOAD_alloc (int ord, size_t size) /* Memory allocation failed. Try freeing the stacks block, and retrying. */ nvptx_stacks_free (ptx_dev, true); - return nvptx_alloc (size, false); + return nvptx_alloc (size, false, managed); } } +void * +GOMP_OFFLOAD_alloc (int ord, size_t size) +{ + return cleanup_and_alloc (ord, size, false); +} + +void * +GOMP_OFFLOAD_managed_alloc (int ord, size_t size) +{ + return cleanup_and_alloc (ord, size, true); +} + bool GOMP_OFFLOAD_free (int ord, void *ptr) { @@ -1886,6 +1900,12 @@ GOMP_OFFLOAD_free (int ord, void *ptr) && nvptx_free (ptr, ptx_devices[ord])); } +bool +GOMP_OFFLOAD_managed_free (int ord, void *ptr) +{ + return GOMP_OFFLOAD_free (ord, ptr); +} + bool GOMP_OFFLOAD_page_locked_host_alloc (void **ptr, size_t size) { diff --git a/libgomp/target.c b/libgomp/target.c index 002a144b4abb..49d4218b1f7f 100644 --- a/libgomp/target.c +++ b/libgomp/target.c @@ -4707,6 +4707,46 @@ omp_target_free (void *device_ptr, int device_num) gomp_mutex_unlock (&devicep->lock); } +void * +gomp_managed_alloc (size_t size) +{ + struct gomp_task_icv *icv = gomp_icv (false); + struct gomp_device_descr *devicep = resolve_device (icv->default_device_var, + false); + if (devicep == NULL) + return NULL; + + void *ret = NULL; + gomp_mutex_lock (&devicep->lock); + if (devicep->managed_alloc_func) + ret = devicep->managed_alloc_func (devicep->target_id, size); + gomp_mutex_unlock (&devicep->lock); + return ret; +} + +void +gomp_managed_free (void *device_ptr) +{ + if (device_ptr == NULL) + return; + + struct gomp_task_icv *icv = gomp_icv (false); + struct gomp_device_descr *devicep = resolve_device (icv->default_device_var, + false); + if (devicep == NULL) + gomp_fatal ("attempted to free managed memory at %p, but the default " + "device is set to the host device", device_ptr); + + gomp_mutex_lock (&devicep->lock); + if (!devicep->managed_free_func + || !devicep->managed_free_func (devicep->target_id, device_ptr)) + { + gomp_mutex_unlock (&devicep->lock); + gomp_fatal ("error in freeing managed memory block at %p", device_ptr); + } + gomp_mutex_unlock (&devicep->lock); +} + /* Device (really: libgomp plugin) to use for paged-locked memory. We assume there is either none or exactly one such device for the lifetime of the process. */ @@ -5967,6 +6007,8 @@ gomp_load_plugin_for_device (struct gomp_device_descr *device, DLSYM (unload_image); DLSYM (alloc); DLSYM (free); + DLSYM_OPT (managed_alloc, managed_alloc); + DLSYM_OPT (managed_free, managed_free); DLSYM_OPT (page_locked_host_alloc, page_locked_host_alloc); DLSYM_OPT (page_locked_host_free, page_locked_host_free); DLSYM (dev2host); diff --git a/libgomp/testsuite/lib/libgomp.exp b/libgomp/testsuite/lib/libgomp.exp index fd475ac3fe63..ba55cd39e2ba 100644 --- a/libgomp/testsuite/lib/libgomp.exp +++ b/libgomp/testsuite/lib/libgomp.exp @@ -722,3 +722,12 @@ int main() { return 0; } } "-lhipblas" ] } + +# return 1 if OpenMP Device Managed Memory is supported + +proc check_effective_target_omp_managedmem { } { + if { [check_effective_target_offload_device_nvptx] } { + return 1 + } + return 0 +} diff --git a/libgomp/testsuite/libgomp.c++/alloc-managed-1.C b/libgomp/testsuite/libgomp.c++/alloc-managed-1.C new file mode 100644 index 000000000000..afd7fd648c67 --- /dev/null +++ b/libgomp/testsuite/libgomp.c++/alloc-managed-1.C @@ -0,0 +1,35 @@ +// { dg-do run } +// { dg-require-effective-target omp_managedmem } + +// Check that the ompx::allocator::gnu_managed_mem allocator can allocate +// Managed Memory, and that host and target can see the data, at the same +// address, without a mapping. + +#include +#include +#include + +int +main () +{ + using Allocator = ompx::allocator::gnu_managed_mem; + using Traits = std::allocator_traits; + + Allocator alloc; + int *a = Traits::allocate (alloc, 1); + if (!a) + __builtin_abort (); + + Traits::construct (alloc, a, 42); + std::uintptr_t a_p = reinterpret_cast(a); + + #pragma omp target is_device_ptr(a) + { + if (*a != 42 || a_p != reinterpret_cast(a)) + __builtin_abort (); + } + + Traits::destroy (alloc, a); + Traits::deallocate (alloc, a, 1); + return 0; +} diff --git a/libgomp/testsuite/libgomp.c/alloc-managed-1.c b/libgomp/testsuite/libgomp.c/alloc-managed-1.c new file mode 100644 index 000000000000..31b252fc0ae6 --- /dev/null +++ b/libgomp/testsuite/libgomp.c/alloc-managed-1.c @@ -0,0 +1,28 @@ +/* { dg-do run } */ +/* { dg-require-effective-target omp_managedmem } */ + +/* Check that omp_alloc can allocate Managed Memory, and that host and target + can see the data, at the same address, without a mapping. */ + +#include +#include + +int +main () +{ + int *a = (int *) omp_alloc(sizeof(int), ompx_gnu_managed_mem_alloc); + if (!a) + __builtin_abort (); + + *a = 42; + uintptr_t a_p = (uintptr_t)a; + + #pragma omp target is_device_ptr(a) + { + if (*a != 42 || a_p != (uintptr_t)a) + __builtin_abort (); + } + + omp_free(a, ompx_gnu_managed_mem_alloc); + return 0; +} diff --git a/libgomp/testsuite/libgomp.c/alloc-managed-2.c b/libgomp/testsuite/libgomp.c/alloc-managed-2.c new file mode 100644 index 000000000000..f7fd30a4f679 --- /dev/null +++ b/libgomp/testsuite/libgomp.c/alloc-managed-2.c @@ -0,0 +1,38 @@ +/* { dg-do run } */ +/* { dg-require-effective-target omp_managedmem } */ + +/* Check that omp_calloc can allocate Managed Memory, and that host and target + can see the data, at the same address, without a mapping. */ + +#include +#include + +int +main () +{ + int *a = (int *) omp_calloc(5, sizeof(int), ompx_gnu_managed_mem_alloc); + if (!a) + __builtin_abort (); + + /* Check that memory is zero-initialized */ + for (int i = 0; i < 5; i++) + if (a[i] != 0) + __builtin_abort (); + + a[0] = 42; + a[4] = 99; + uintptr_t a_p = (uintptr_t)a; + + #pragma omp target is_device_ptr(a) + { + if (a[0] != 42 || a[4] != 99 || a_p != (uintptr_t)a) + __builtin_abort (); + /* Check zero-initialization on device side */ + for (int i = 1; i < 4; i++) + if (a[i] != 0) + __builtin_abort (); + } + + omp_free(a, ompx_gnu_managed_mem_alloc); + return 0; +} diff --git a/libgomp/testsuite/libgomp.c/alloc-managed-3.c b/libgomp/testsuite/libgomp.c/alloc-managed-3.c new file mode 100644 index 000000000000..17828b76962e --- /dev/null +++ b/libgomp/testsuite/libgomp.c/alloc-managed-3.c @@ -0,0 +1,44 @@ +/* { dg-do run } */ +/* { dg-require-effective-target omp_managedmem } */ + +/* Check that omp_realloc can allocate Managed Memory, and that host and target + can see the data, at the same address, without a mapping. */ + +#include +#include + +int +main () +{ + int *a = (int *) omp_alloc(2 * sizeof(int), ompx_gnu_managed_mem_alloc); + if (!a) + __builtin_abort (); + + a[0] = 42; + a[1] = 43; + + /* Reallocate to larger size */ + int *b = (int *) omp_realloc(a, 5 * sizeof(int), ompx_gnu_managed_mem_alloc, + ompx_gnu_managed_mem_alloc); + if (!b) + __builtin_abort (); + + /* Check that original data is preserved */ + if (b[0] != 42 || b[1] != 43) + __builtin_abort (); + + b[2] = 44; + b[3] = 45; + b[4] = 46; + uintptr_t b_p = (uintptr_t)b; + + #pragma omp target is_device_ptr(b) + { + if (b[0] != 42 || b[1] != 43 || b[2] != 44 || b[3] != 45 || b[4] != 46 + || b_p != (uintptr_t)b) + __builtin_abort (); + } + + omp_free(b, ompx_gnu_managed_mem_alloc); + return 0; +} diff --git a/libgomp/testsuite/libgomp.c/alloc-managed-4.c b/libgomp/testsuite/libgomp.c/alloc-managed-4.c new file mode 100644 index 000000000000..4eaf8259b6fc --- /dev/null +++ b/libgomp/testsuite/libgomp.c/alloc-managed-4.c @@ -0,0 +1,22 @@ +/* { dg-do run } */ +/* { dg-require-effective-target omp_managedmem } */ +/* { dg-shouldfail "" } */ +/* { dg-output "libgomp: attempted to free managed memory at 0x\[0-9a-f\]+, but the default device is set to the host device" } */ + +/* Check that omp_free emits an error if the default device has been changed + to the host device. */ + +#include +#include + +int +main () +{ + int *a = (int *) omp_alloc(2 * sizeof(int), ompx_gnu_managed_mem_alloc); + if (!a) + __builtin_abort (); + + omp_set_default_device (omp_initial_device); + omp_free(a, ompx_gnu_managed_mem_alloc); + return 0; +} diff --git a/libgomp/testsuite/libgomp.fortran/alloc-managed-1.f90 b/libgomp/testsuite/libgomp.fortran/alloc-managed-1.f90 new file mode 100644 index 000000000000..685aeef7dae2 --- /dev/null +++ b/libgomp/testsuite/libgomp.fortran/alloc-managed-1.f90 @@ -0,0 +1,29 @@ +! { dg-do run } +! { dg-require-effective-target omp_managedmem } + +! Check that omp_alloc can allocate Managed Memory, and that host and target +! can see the data, at the same address, without a mapping. + +program main + use omp_lib + use iso_c_binding + implicit none + + type(c_ptr) :: cptr + integer, pointer :: a + integer(c_intptr_t) :: a_p, a_p2 + + cptr = omp_alloc(c_sizeof(a), ompx_gnu_managed_mem_alloc) + if (.not. c_associated(cptr)) stop 1 + + call c_f_pointer(cptr, a) + a = 42 + a_p = transfer(c_loc(a), a_p) + + !$omp target is_device_ptr(a) + a_p2 = transfer(c_loc(a), a_p2) + if (a /= 42 .or. a_p /= a_p2) stop 2 + !$omp end target + + call omp_free(cptr, ompx_gnu_managed_mem_alloc) +end program main