openmp, nvptx: low-lat memory access traits

The NVPTX low latency memory is not accessible outside the team that allocates
it, and therefore should be unavailable for allocators with the access trait
"all".  This change means that the omp_low_lat_mem_alloc predefined
allocator no longer works (but omp_cgroup_mem_alloc still does).

libgomp/ChangeLog:

	* allocator.c (MEMSPACE_VALIDATE): New macro.
	(omp_init_allocator): Use MEMSPACE_VALIDATE.
	(omp_aligned_alloc): Use OMP_LOW_LAT_MEM_ALLOC_INVALID.
	(omp_aligned_calloc): Likewise.
	(omp_realloc): Likewise.
	* config/nvptx/allocator.c (nvptx_memspace_validate): New function.
	(MEMSPACE_VALIDATE): New macro.
	(OMP_LOW_LAT_MEM_ALLOC_INVALID): New define.
	* libgomp.texi: Document low-latency implementation details.
	* testsuite/libgomp.c/omp_alloc-1.c (main): Add gnu_lowlat.
	* testsuite/libgomp.c/omp_alloc-2.c (main): Add gnu_lowlat.
	* testsuite/libgomp.c/omp_alloc-3.c (main): Add gnu_lowlat.
	* testsuite/libgomp.c/omp_alloc-4.c (main): Add access trait.
	* testsuite/libgomp.c/omp_alloc-5.c (main): Add gnu_lowlat.
	* testsuite/libgomp.c/omp_alloc-6.c (main): Add access trait.
	* testsuite/libgomp.c/omp_alloc-traits.c: New test.
This commit is contained in:
Andrew Stubbs 2022-01-27 13:48:50 +00:00
parent 30486fab71
commit e9a19ead49
10 changed files with 166 additions and 6 deletions

View file

@ -56,6 +56,10 @@
#define MEMSPACE_FREE(MEMSPACE, ADDR, SIZE) \
free (((void)(MEMSPACE), (void)(SIZE), (ADDR)))
#endif
#ifndef MEMSPACE_VALIDATE
#define MEMSPACE_VALIDATE(MEMSPACE, ACCESS) \
(((void)(MEMSPACE), (void)(ACCESS), 1))
#endif
/* Map the predefined allocators to the correct memory space.
The index to this table is the omp_allocator_handle_t enum value.
@ -439,6 +443,10 @@ omp_init_allocator (omp_memspace_handle_t memspace, int ntraits,
if (data.pinned)
return omp_null_allocator;
/* Reject unsupported memory spaces. */
if (!MEMSPACE_VALIDATE (data.memspace, data.access))
return omp_null_allocator;
ret = gomp_malloc (sizeof (struct omp_allocator_data));
*ret = data;
#ifndef HAVE_SYNC_BUILTINS
@ -522,6 +530,10 @@ retry:
new_size += new_alignment - sizeof (void *);
if (__builtin_add_overflow (size, new_size, &new_size))
goto fail;
#ifdef OMP_LOW_LAT_MEM_ALLOC_INVALID
if (allocator == omp_low_lat_mem_alloc)
goto fail;
#endif
if (__builtin_expect (allocator_data
&& allocator_data->pool_size < ~(uintptr_t) 0, 0))
@ -820,6 +832,10 @@ retry:
goto fail;
if (__builtin_add_overflow (size_temp, new_size, &new_size))
goto fail;
#ifdef OMP_LOW_LAT_MEM_ALLOC_INVALID
if (allocator == omp_low_lat_mem_alloc)
goto fail;
#endif
if (__builtin_expect (allocator_data
&& allocator_data->pool_size < ~(uintptr_t) 0, 0))
@ -1054,6 +1070,10 @@ retry:
if (__builtin_add_overflow (size, new_size, &new_size))
goto fail;
old_size = data->size;
#ifdef OMP_LOW_LAT_MEM_ALLOC_INVALID
if (allocator == omp_low_lat_mem_alloc)
goto fail;
#endif
if (__builtin_expect (allocator_data
&& allocator_data->pool_size < ~(uintptr_t) 0, 0))

View file

@ -108,6 +108,21 @@ nvptx_memspace_realloc (omp_memspace_handle_t memspace, void *addr,
return realloc (addr, size);
}
static inline int
nvptx_memspace_validate (omp_memspace_handle_t memspace, unsigned access)
{
#if __PTX_ISA_VERSION_MAJOR__ > 4 \
|| (__PTX_ISA_VERSION_MAJOR__ == 4 && __PTX_ISA_VERSION_MINOR >= 1)
/* Disallow use of low-latency memory when it must be accessible by
all threads. */
return (memspace != omp_low_lat_mem_space
|| access != omp_atv_all);
#else
/* Low-latency memory is not available before PTX 4.1. */
return (memspace != omp_low_lat_mem_space);
#endif
}
#define MEMSPACE_ALLOC(MEMSPACE, SIZE) \
nvptx_memspace_alloc (MEMSPACE, SIZE)
#define MEMSPACE_CALLOC(MEMSPACE, SIZE) \
@ -116,5 +131,11 @@ nvptx_memspace_realloc (omp_memspace_handle_t memspace, void *addr,
nvptx_memspace_realloc (MEMSPACE, ADDR, OLDSIZE, SIZE)
#define MEMSPACE_FREE(MEMSPACE, ADDR, SIZE) \
nvptx_memspace_free (MEMSPACE, ADDR, SIZE)
#define MEMSPACE_VALIDATE(MEMSPACE, ACCESS) \
nvptx_memspace_validate (MEMSPACE, ACCESS)
/* The default low-latency memspace implies omp_atv_all, which is incompatible
with the .shared memory space. */
#define OMP_LOW_LAT_MEM_ALLOC_INVALID 1
#include "../../allocator.c"

View file

@ -5767,6 +5767,9 @@ Additional notes regarding the traits:
@item The @code{sync_hint} trait has no effect.
@end itemize
See also:
@ref{Offload-Target Specifics}
@c ---------------------------------------------------------------------
@c Offload-Target Specifics
@c ---------------------------------------------------------------------
@ -5900,6 +5903,21 @@ The implementation remark:
directive for non-contiguous list items will use the 2D and 3D
memory-copy functions of the CUDA library. Higher dimensions will
call those functions in a loop and are therefore supported.
@item Low-latency memory (@code{omp_low_lat_mem_space}) is supported when the
the @code{access} trait is set to @code{cgroup}, the ISA is at least
@code{sm_53}, and the PTX version is at least 4.1. The default pool size
is 8 kiB per team, but may be adjusted at runtime by setting environment
variable @code{GOMP_NVPTX_LOWLAT_POOL=@var{bytes}}. The maximum value is
limited by the available hardware, and care should be taken that the
selected pool size does not unduly limit the number of teams that can
run simultaneously.
@item @code{omp_low_lat_mem_alloc} cannot be used with true low-latency memory
because the definition implies the @code{omp_atv_all} trait; main
graphics memory is used instead.
@item @code{omp_cgroup_mem_alloc}, @code{omp_pteam_mem_alloc}, and
@code{omp_thread_mem_alloc}, all use low-latency memory as first
preference, and fall back to main graphics memory when the low-latency
pool is exhausted.
@end itemize

View file

@ -32,12 +32,21 @@ test (int n, omp_allocator_handle_t allocator)
int
main ()
{
/* omp_low_lat_mem_alloc doesn't actually get low-latency memory on GPU. */
omp_allocator_handle_t gpu_lowlat = 0;
#pragma omp target map(from:gpu_lowlat)
{
omp_alloctrait_t traits[1] = { { omp_atk_access, omp_atv_cgroup } };
gpu_lowlat = omp_init_allocator (omp_low_lat_mem_space, 1, traits);
}
// Smaller than low-latency memory limit
test (10, omp_default_mem_alloc);
test (10, omp_large_cap_mem_alloc);
test (10, omp_const_mem_alloc);
test (10, omp_high_bw_mem_alloc);
test (10, omp_low_lat_mem_alloc);
test (10, gpu_lowlat);
test (10, omp_cgroup_mem_alloc);
test (10, omp_pteam_mem_alloc);
test (10, omp_thread_mem_alloc);
@ -48,6 +57,7 @@ main ()
test (100000, omp_const_mem_alloc);
test (100000, omp_high_bw_mem_alloc);
test (100000, omp_low_lat_mem_alloc);
test (100000, gpu_lowlat);
test (100000, omp_cgroup_mem_alloc);
test (100000, omp_pteam_mem_alloc);
test (100000, omp_thread_mem_alloc);

View file

@ -40,12 +40,19 @@ test (int n, omp_allocator_handle_t allocator)
int
main ()
{
/* omp_low_lat_mem_alloc doesn't actually get low-latency memory on GPU. */
omp_alloctrait_t traits[1] = { { omp_atk_access, omp_atv_cgroup } };
omp_allocator_handle_t gpu_lowlat;
#pragma omp target map(from:gpu_lowlat)
gpu_lowlat = omp_init_allocator (omp_low_lat_mem_space, 1, traits);
// Smaller than low-latency memory limit
test (10, omp_default_mem_alloc);
test (10, omp_large_cap_mem_alloc);
test (10, omp_const_mem_alloc);
test (10, omp_high_bw_mem_alloc);
test (10, omp_low_lat_mem_alloc);
test (10, gpu_lowlat);
test (10, omp_cgroup_mem_alloc);
test (10, omp_pteam_mem_alloc);
test (10, omp_thread_mem_alloc);
@ -56,6 +63,7 @@ main ()
test (1000, omp_const_mem_alloc);
test (1000, omp_high_bw_mem_alloc);
test (1000, omp_low_lat_mem_alloc);
test (1000, gpu_lowlat);
test (1000, omp_cgroup_mem_alloc);
test (1000, omp_pteam_mem_alloc);
test (1000, omp_thread_mem_alloc);

View file

@ -28,12 +28,19 @@ test (omp_allocator_handle_t allocator)
int
main ()
{
/* omp_low_lat_mem_alloc doesn't actually get low-latency memory on GPU. */
omp_alloctrait_t traits[1] = { { omp_atk_access, omp_atv_cgroup } };
omp_allocator_handle_t gpu_lowlat;
#pragma omp target map(from:gpu_lowlat)
gpu_lowlat = omp_init_allocator (omp_low_lat_mem_space, 1, traits);
// Smaller than low-latency memory limit
test (omp_default_mem_alloc);
test (omp_large_cap_mem_alloc);
test (omp_const_mem_alloc);
test (omp_high_bw_mem_alloc);
test (omp_low_lat_mem_alloc);
test (gpu_lowlat);
test (omp_cgroup_mem_alloc);
test (omp_pteam_mem_alloc);
test (omp_thread_mem_alloc);

View file

@ -26,10 +26,11 @@ main ()
#pragma omp target
{
/* Ensure that the memory we get *is* low-latency with a null-fallback. */
omp_alloctrait_t traits[1]
= { { omp_atk_fallback, omp_atv_null_fb } };
omp_alloctrait_t traits[2]
= { { omp_atk_fallback, omp_atv_null_fb },
{ omp_atk_access, omp_atv_cgroup } };
omp_allocator_handle_t lowlat = omp_init_allocator (omp_low_lat_mem_space,
1, traits);
2, traits);
int size = 4;

View file

@ -39,12 +39,19 @@ test (int n, omp_allocator_handle_t allocator)
int
main ()
{
/* omp_low_lat_mem_alloc doesn't actually get low-latency memory on GPU. */
omp_alloctrait_t traits[1] = { { omp_atk_access, omp_atv_cgroup } };
omp_allocator_handle_t gpu_lowlat;
#pragma omp target map(from:gpu_lowlat)
gpu_lowlat = omp_init_allocator (omp_low_lat_mem_space, 1, traits);
// Smaller than low-latency memory limit
test (10, omp_default_mem_alloc);
test (10, omp_large_cap_mem_alloc);
test (10, omp_const_mem_alloc);
test (10, omp_high_bw_mem_alloc);
test (10, omp_low_lat_mem_alloc);
test (10, gpu_lowlat);
test (10, omp_cgroup_mem_alloc);
test (10, omp_pteam_mem_alloc);
test (10, omp_thread_mem_alloc);
@ -55,6 +62,7 @@ main ()
test (100000, omp_const_mem_alloc);
test (100000, omp_high_bw_mem_alloc);
test (100000, omp_low_lat_mem_alloc);
test (100000, gpu_lowlat);
test (100000, omp_cgroup_mem_alloc);
test (100000, omp_pteam_mem_alloc);
test (100000, omp_thread_mem_alloc);

View file

@ -26,10 +26,11 @@ main ()
#pragma omp target
{
/* Ensure that the memory we get *is* low-latency with a null-fallback. */
omp_alloctrait_t traits[1]
= { { omp_atk_fallback, omp_atv_null_fb } };
omp_alloctrait_t traits[2]
= { { omp_atk_fallback, omp_atv_null_fb },
{ omp_atk_access, omp_atv_cgroup } };
omp_allocator_handle_t lowlat = omp_init_allocator (omp_low_lat_mem_space,
1, traits);
2, traits);
int size = 16;

View file

@ -0,0 +1,66 @@
/* { dg-do run } */
/* { dg-require-effective-target offload_device } */
/* { dg-xfail-if "not implemented" { ! offload_target_nvptx } } */
/* Test that GPU low-latency allocation is limited to team access. */
#include <stddef.h>
#include <omp.h>
#pragma omp requires dynamic_allocators
int
main ()
{
#pragma omp target
{
/* Ensure that the memory we get *is* low-latency with a null-fallback. */
omp_alloctrait_t traits[2]
= { { omp_atk_fallback, omp_atv_null_fb },
{ omp_atk_access, omp_atv_cgroup } };
omp_allocator_handle_t lowlat = omp_init_allocator (omp_low_lat_mem_space,
2, traits); // good
omp_alloctrait_t traits_all[2]
= { { omp_atk_fallback, omp_atv_null_fb },
{ omp_atk_access, omp_atv_all } };
omp_allocator_handle_t lowlat_all
= omp_init_allocator (omp_low_lat_mem_space, 2, traits_all); // bad
omp_alloctrait_t traits_default[1]
= { { omp_atk_fallback, omp_atv_null_fb } };
omp_allocator_handle_t lowlat_default
= omp_init_allocator (omp_low_lat_mem_space, 1, traits_default); // bad
if (lowlat_all != omp_null_allocator
|| lowlat_default != omp_null_allocator)
__builtin_abort ();
void *a = omp_alloc (1, lowlat); // good
if (!a)
__builtin_abort ();
omp_free (a, lowlat);
a = omp_calloc (1, 1, lowlat); // good
if (!a)
__builtin_abort ();
omp_free (a, lowlat);
a = omp_realloc (NULL, 1, lowlat, lowlat); // good
if (!a)
__builtin_abort ();
omp_free (a, lowlat);
}
return 0;
}