diff --git a/libgomp/allocator.c b/libgomp/allocator.c index fa398128368..a8a80f8028d 100644 --- a/libgomp/allocator.c +++ b/libgomp/allocator.c @@ -56,6 +56,10 @@ #define MEMSPACE_FREE(MEMSPACE, ADDR, SIZE) \ free (((void)(MEMSPACE), (void)(SIZE), (ADDR))) #endif +#ifndef MEMSPACE_VALIDATE +#define MEMSPACE_VALIDATE(MEMSPACE, ACCESS) \ + (((void)(MEMSPACE), (void)(ACCESS), 1)) +#endif /* Map the predefined allocators to the correct memory space. The index to this table is the omp_allocator_handle_t enum value. @@ -439,6 +443,10 @@ omp_init_allocator (omp_memspace_handle_t memspace, int ntraits, if (data.pinned) return omp_null_allocator; + /* Reject unsupported memory spaces. */ + if (!MEMSPACE_VALIDATE (data.memspace, data.access)) + return omp_null_allocator; + ret = gomp_malloc (sizeof (struct omp_allocator_data)); *ret = data; #ifndef HAVE_SYNC_BUILTINS @@ -522,6 +530,10 @@ retry: new_size += new_alignment - sizeof (void *); if (__builtin_add_overflow (size, new_size, &new_size)) goto fail; +#ifdef OMP_LOW_LAT_MEM_ALLOC_INVALID + if (allocator == omp_low_lat_mem_alloc) + goto fail; +#endif if (__builtin_expect (allocator_data && allocator_data->pool_size < ~(uintptr_t) 0, 0)) @@ -820,6 +832,10 @@ retry: goto fail; if (__builtin_add_overflow (size_temp, new_size, &new_size)) goto fail; +#ifdef OMP_LOW_LAT_MEM_ALLOC_INVALID + if (allocator == omp_low_lat_mem_alloc) + goto fail; +#endif if (__builtin_expect (allocator_data && allocator_data->pool_size < ~(uintptr_t) 0, 0)) @@ -1054,6 +1070,10 @@ retry: if (__builtin_add_overflow (size, new_size, &new_size)) goto fail; old_size = data->size; +#ifdef OMP_LOW_LAT_MEM_ALLOC_INVALID + if (allocator == omp_low_lat_mem_alloc) + goto fail; +#endif if (__builtin_expect (allocator_data && allocator_data->pool_size < ~(uintptr_t) 0, 0)) diff --git a/libgomp/config/nvptx/allocator.c b/libgomp/config/nvptx/allocator.c index 6014fba177f..a3302411bcb 100644 --- a/libgomp/config/nvptx/allocator.c +++ b/libgomp/config/nvptx/allocator.c @@ -108,6 +108,21 @@ nvptx_memspace_realloc (omp_memspace_handle_t memspace, void *addr, return realloc (addr, size); } +static inline int +nvptx_memspace_validate (omp_memspace_handle_t memspace, unsigned access) +{ +#if __PTX_ISA_VERSION_MAJOR__ > 4 \ + || (__PTX_ISA_VERSION_MAJOR__ == 4 && __PTX_ISA_VERSION_MINOR >= 1) + /* Disallow use of low-latency memory when it must be accessible by + all threads. */ + return (memspace != omp_low_lat_mem_space + || access != omp_atv_all); +#else + /* Low-latency memory is not available before PTX 4.1. */ + return (memspace != omp_low_lat_mem_space); +#endif +} + #define MEMSPACE_ALLOC(MEMSPACE, SIZE) \ nvptx_memspace_alloc (MEMSPACE, SIZE) #define MEMSPACE_CALLOC(MEMSPACE, SIZE) \ @@ -116,5 +131,11 @@ nvptx_memspace_realloc (omp_memspace_handle_t memspace, void *addr, nvptx_memspace_realloc (MEMSPACE, ADDR, OLDSIZE, SIZE) #define MEMSPACE_FREE(MEMSPACE, ADDR, SIZE) \ nvptx_memspace_free (MEMSPACE, ADDR, SIZE) +#define MEMSPACE_VALIDATE(MEMSPACE, ACCESS) \ + nvptx_memspace_validate (MEMSPACE, ACCESS) + +/* The default low-latency memspace implies omp_atv_all, which is incompatible + with the .shared memory space. */ +#define OMP_LOW_LAT_MEM_ALLOC_INVALID 1 #include "../../allocator.c" diff --git a/libgomp/libgomp.texi b/libgomp/libgomp.texi index 54c4bc26584..8d57c17c450 100644 --- a/libgomp/libgomp.texi +++ b/libgomp/libgomp.texi @@ -5767,6 +5767,9 @@ Additional notes regarding the traits: @item The @code{sync_hint} trait has no effect. @end itemize +See also: +@ref{Offload-Target Specifics} + @c --------------------------------------------------------------------- @c Offload-Target Specifics @c --------------------------------------------------------------------- @@ -5900,6 +5903,21 @@ The implementation remark: directive for non-contiguous list items will use the 2D and 3D memory-copy functions of the CUDA library. Higher dimensions will call those functions in a loop and are therefore supported. +@item Low-latency memory (@code{omp_low_lat_mem_space}) is supported when the + the @code{access} trait is set to @code{cgroup}, the ISA is at least + @code{sm_53}, and the PTX version is at least 4.1. The default pool size + is 8 kiB per team, but may be adjusted at runtime by setting environment + variable @code{GOMP_NVPTX_LOWLAT_POOL=@var{bytes}}. The maximum value is + limited by the available hardware, and care should be taken that the + selected pool size does not unduly limit the number of teams that can + run simultaneously. +@item @code{omp_low_lat_mem_alloc} cannot be used with true low-latency memory + because the definition implies the @code{omp_atv_all} trait; main + graphics memory is used instead. +@item @code{omp_cgroup_mem_alloc}, @code{omp_pteam_mem_alloc}, and + @code{omp_thread_mem_alloc}, all use low-latency memory as first + preference, and fall back to main graphics memory when the low-latency + pool is exhausted. @end itemize diff --git a/libgomp/testsuite/libgomp.c/omp_alloc-1.c b/libgomp/testsuite/libgomp.c/omp_alloc-1.c index f4e594f1e98..7f7f440c12c 100644 --- a/libgomp/testsuite/libgomp.c/omp_alloc-1.c +++ b/libgomp/testsuite/libgomp.c/omp_alloc-1.c @@ -32,12 +32,21 @@ test (int n, omp_allocator_handle_t allocator) int main () { + /* omp_low_lat_mem_alloc doesn't actually get low-latency memory on GPU. */ + omp_allocator_handle_t gpu_lowlat = 0; + #pragma omp target map(from:gpu_lowlat) + { + omp_alloctrait_t traits[1] = { { omp_atk_access, omp_atv_cgroup } }; + gpu_lowlat = omp_init_allocator (omp_low_lat_mem_space, 1, traits); + } + // Smaller than low-latency memory limit test (10, omp_default_mem_alloc); test (10, omp_large_cap_mem_alloc); test (10, omp_const_mem_alloc); test (10, omp_high_bw_mem_alloc); test (10, omp_low_lat_mem_alloc); + test (10, gpu_lowlat); test (10, omp_cgroup_mem_alloc); test (10, omp_pteam_mem_alloc); test (10, omp_thread_mem_alloc); @@ -48,6 +57,7 @@ main () test (100000, omp_const_mem_alloc); test (100000, omp_high_bw_mem_alloc); test (100000, omp_low_lat_mem_alloc); + test (100000, gpu_lowlat); test (100000, omp_cgroup_mem_alloc); test (100000, omp_pteam_mem_alloc); test (100000, omp_thread_mem_alloc); diff --git a/libgomp/testsuite/libgomp.c/omp_alloc-2.c b/libgomp/testsuite/libgomp.c/omp_alloc-2.c index e9fd1602946..54523f1061e 100644 --- a/libgomp/testsuite/libgomp.c/omp_alloc-2.c +++ b/libgomp/testsuite/libgomp.c/omp_alloc-2.c @@ -40,12 +40,19 @@ test (int n, omp_allocator_handle_t allocator) int main () { + /* omp_low_lat_mem_alloc doesn't actually get low-latency memory on GPU. */ + omp_alloctrait_t traits[1] = { { omp_atk_access, omp_atv_cgroup } }; + omp_allocator_handle_t gpu_lowlat; + #pragma omp target map(from:gpu_lowlat) + gpu_lowlat = omp_init_allocator (omp_low_lat_mem_space, 1, traits); + // Smaller than low-latency memory limit test (10, omp_default_mem_alloc); test (10, omp_large_cap_mem_alloc); test (10, omp_const_mem_alloc); test (10, omp_high_bw_mem_alloc); test (10, omp_low_lat_mem_alloc); + test (10, gpu_lowlat); test (10, omp_cgroup_mem_alloc); test (10, omp_pteam_mem_alloc); test (10, omp_thread_mem_alloc); @@ -56,6 +63,7 @@ main () test (1000, omp_const_mem_alloc); test (1000, omp_high_bw_mem_alloc); test (1000, omp_low_lat_mem_alloc); + test (1000, gpu_lowlat); test (1000, omp_cgroup_mem_alloc); test (1000, omp_pteam_mem_alloc); test (1000, omp_thread_mem_alloc); diff --git a/libgomp/testsuite/libgomp.c/omp_alloc-3.c b/libgomp/testsuite/libgomp.c/omp_alloc-3.c index 792e2200f30..682d149d379 100644 --- a/libgomp/testsuite/libgomp.c/omp_alloc-3.c +++ b/libgomp/testsuite/libgomp.c/omp_alloc-3.c @@ -28,12 +28,19 @@ test (omp_allocator_handle_t allocator) int main () { + /* omp_low_lat_mem_alloc doesn't actually get low-latency memory on GPU. */ + omp_alloctrait_t traits[1] = { { omp_atk_access, omp_atv_cgroup } }; + omp_allocator_handle_t gpu_lowlat; + #pragma omp target map(from:gpu_lowlat) + gpu_lowlat = omp_init_allocator (omp_low_lat_mem_space, 1, traits); + // Smaller than low-latency memory limit test (omp_default_mem_alloc); test (omp_large_cap_mem_alloc); test (omp_const_mem_alloc); test (omp_high_bw_mem_alloc); test (omp_low_lat_mem_alloc); + test (gpu_lowlat); test (omp_cgroup_mem_alloc); test (omp_pteam_mem_alloc); test (omp_thread_mem_alloc); diff --git a/libgomp/testsuite/libgomp.c/omp_alloc-4.c b/libgomp/testsuite/libgomp.c/omp_alloc-4.c index c7d0c46c6b3..03841404daa 100644 --- a/libgomp/testsuite/libgomp.c/omp_alloc-4.c +++ b/libgomp/testsuite/libgomp.c/omp_alloc-4.c @@ -26,10 +26,11 @@ main () #pragma omp target { /* Ensure that the memory we get *is* low-latency with a null-fallback. */ - omp_alloctrait_t traits[1] - = { { omp_atk_fallback, omp_atv_null_fb } }; + omp_alloctrait_t traits[2] + = { { omp_atk_fallback, omp_atv_null_fb }, + { omp_atk_access, omp_atv_cgroup } }; omp_allocator_handle_t lowlat = omp_init_allocator (omp_low_lat_mem_space, - 1, traits); + 2, traits); int size = 4; diff --git a/libgomp/testsuite/libgomp.c/omp_alloc-5.c b/libgomp/testsuite/libgomp.c/omp_alloc-5.c index 10805ded6d0..26bf38c1ca6 100644 --- a/libgomp/testsuite/libgomp.c/omp_alloc-5.c +++ b/libgomp/testsuite/libgomp.c/omp_alloc-5.c @@ -39,12 +39,19 @@ test (int n, omp_allocator_handle_t allocator) int main () { + /* omp_low_lat_mem_alloc doesn't actually get low-latency memory on GPU. */ + omp_alloctrait_t traits[1] = { { omp_atk_access, omp_atv_cgroup } }; + omp_allocator_handle_t gpu_lowlat; + #pragma omp target map(from:gpu_lowlat) + gpu_lowlat = omp_init_allocator (omp_low_lat_mem_space, 1, traits); + // Smaller than low-latency memory limit test (10, omp_default_mem_alloc); test (10, omp_large_cap_mem_alloc); test (10, omp_const_mem_alloc); test (10, omp_high_bw_mem_alloc); test (10, omp_low_lat_mem_alloc); + test (10, gpu_lowlat); test (10, omp_cgroup_mem_alloc); test (10, omp_pteam_mem_alloc); test (10, omp_thread_mem_alloc); @@ -55,6 +62,7 @@ main () test (100000, omp_const_mem_alloc); test (100000, omp_high_bw_mem_alloc); test (100000, omp_low_lat_mem_alloc); + test (100000, gpu_lowlat); test (100000, omp_cgroup_mem_alloc); test (100000, omp_pteam_mem_alloc); test (100000, omp_thread_mem_alloc); diff --git a/libgomp/testsuite/libgomp.c/omp_alloc-6.c b/libgomp/testsuite/libgomp.c/omp_alloc-6.c index b326cad9233..13e8747dc3b 100644 --- a/libgomp/testsuite/libgomp.c/omp_alloc-6.c +++ b/libgomp/testsuite/libgomp.c/omp_alloc-6.c @@ -26,10 +26,11 @@ main () #pragma omp target { /* Ensure that the memory we get *is* low-latency with a null-fallback. */ - omp_alloctrait_t traits[1] - = { { omp_atk_fallback, omp_atv_null_fb } }; + omp_alloctrait_t traits[2] + = { { omp_atk_fallback, omp_atv_null_fb }, + { omp_atk_access, omp_atv_cgroup } }; omp_allocator_handle_t lowlat = omp_init_allocator (omp_low_lat_mem_space, - 1, traits); + 2, traits); int size = 16; diff --git a/libgomp/testsuite/libgomp.c/omp_alloc-traits.c b/libgomp/testsuite/libgomp.c/omp_alloc-traits.c new file mode 100644 index 00000000000..4ff0fca4986 --- /dev/null +++ b/libgomp/testsuite/libgomp.c/omp_alloc-traits.c @@ -0,0 +1,66 @@ +/* { dg-do run } */ + +/* { dg-require-effective-target offload_device } */ +/* { dg-xfail-if "not implemented" { ! offload_target_nvptx } } */ + +/* Test that GPU low-latency allocation is limited to team access. */ + +#include +#include + +#pragma omp requires dynamic_allocators + +int +main () +{ + #pragma omp target + { + /* Ensure that the memory we get *is* low-latency with a null-fallback. */ + omp_alloctrait_t traits[2] + = { { omp_atk_fallback, omp_atv_null_fb }, + { omp_atk_access, omp_atv_cgroup } }; + omp_allocator_handle_t lowlat = omp_init_allocator (omp_low_lat_mem_space, + 2, traits); // good + + omp_alloctrait_t traits_all[2] + = { { omp_atk_fallback, omp_atv_null_fb }, + { omp_atk_access, omp_atv_all } }; + omp_allocator_handle_t lowlat_all + = omp_init_allocator (omp_low_lat_mem_space, 2, traits_all); // bad + + omp_alloctrait_t traits_default[1] + = { { omp_atk_fallback, omp_atv_null_fb } }; + omp_allocator_handle_t lowlat_default + = omp_init_allocator (omp_low_lat_mem_space, 1, traits_default); // bad + + if (lowlat_all != omp_null_allocator + || lowlat_default != omp_null_allocator) + __builtin_abort (); + + void *a = omp_alloc (1, lowlat); // good + + if (!a) + __builtin_abort (); + + omp_free (a, lowlat); + + + a = omp_calloc (1, 1, lowlat); // good + + if (!a) + __builtin_abort (); + + omp_free (a, lowlat); + + + a = omp_realloc (NULL, 1, lowlat, lowlat); // good + + if (!a) + __builtin_abort (); + + omp_free (a, lowlat); + } + + return 0; +} +