libgomp: Enable USM for some nvptx devices
A few high-end nvptx devices support the attribute CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS; for those, unified shared memory is supported in hardware. This patch enables support for those - if all installed nvptx devices have this feature (as the capabilities are per device type). This exposes a bug in gomp_copy_back_icvs, as it previously used omp_get_mapped_ptr to find mapped variables, but that returns the unchanged pointer in case of shared memory. But in this case, we have a few actually mapped pointers - like the ICV variables. Additionally, there was a mismatch with regards to '-1' for the device number as gomp_copy_back_icvs and omp_get_mapped_ptr count differently. Hence, do the lookup manually. include/ChangeLog: * cuda/cuda.h (CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS): Add. libgomp/ChangeLog: * libgomp.texi (nvptx): Update USM description. * plugin/plugin-nvptx.c (GOMP_OFFLOAD_get_num_devices): Claim support when requesting USM and all devices support CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. * target.c (gomp_copy_back_icvs): Fix device ptr lookup. (gomp_target_init): Set GOMP_OFFLOAD_CAP_SHARED_MEM if the devices support USM.
This commit is contained in:
parent
19c491d184
commit
4ccb3366ad
4 changed files with 45 additions and 4 deletions
|
@ -83,7 +83,8 @@ typedef enum {
|
|||
CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39,
|
||||
CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = 40,
|
||||
CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING = 41,
|
||||
CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82
|
||||
CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82,
|
||||
CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS = 88
|
||||
} CUdevice_attribute;
|
||||
|
||||
enum {
|
||||
|
|
|
@ -6435,8 +6435,11 @@ The implementation remark:
|
|||
the next reverse offload region is only executed after the previous
|
||||
one returned.
|
||||
@item OpenMP code that has a @code{requires} directive with
|
||||
@code{unified_shared_memory} will remove any nvptx device from the
|
||||
list of available devices (``host fallback'').
|
||||
@code{unified_shared_memory} runs on nvptx devices if and only if
|
||||
all of those support the @code{pageableMemoryAccess} property;@footnote{
|
||||
@uref{https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#um-requirements}}
|
||||
otherwise, all nvptx devices are removed from the list of available
|
||||
devices (``host fallback'').
|
||||
@item The default per-warp stack size is 128 kiB; see also @code{-msoft-stack}
|
||||
in the GCC manual.
|
||||
@item The OpenMP routines @code{omp_target_memcpy_rect} and
|
||||
|
|
|
@ -1201,8 +1201,23 @@ GOMP_OFFLOAD_get_num_devices (unsigned int omp_requires_mask)
|
|||
if (num_devices > 0
|
||||
&& ((omp_requires_mask
|
||||
& ~(GOMP_REQUIRES_UNIFIED_ADDRESS
|
||||
| GOMP_REQUIRES_UNIFIED_SHARED_MEMORY
|
||||
| GOMP_REQUIRES_REVERSE_OFFLOAD)) != 0))
|
||||
return -1;
|
||||
/* Check whether host page access (direct or via migration) is supported;
|
||||
if so, enable USM. Currently, capabilities are per device type, hence,
|
||||
check all devices. */
|
||||
if (num_devices > 0
|
||||
&& (omp_requires_mask & GOMP_REQUIRES_UNIFIED_SHARED_MEMORY))
|
||||
for (int dev = 0; dev < num_devices; dev++)
|
||||
{
|
||||
int pi;
|
||||
CUresult r;
|
||||
r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi,
|
||||
CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS, dev);
|
||||
if (r != CUDA_SUCCESS || pi == 0)
|
||||
return -1;
|
||||
}
|
||||
return num_devices;
|
||||
}
|
||||
|
||||
|
|
|
@ -2969,8 +2969,25 @@ gomp_copy_back_icvs (struct gomp_device_descr *devicep, int device)
|
|||
if (item == NULL)
|
||||
return;
|
||||
|
||||
gomp_mutex_lock (&devicep->lock);
|
||||
|
||||
struct splay_tree_s *mem_map = &devicep->mem_map;
|
||||
struct splay_tree_key_s cur_node;
|
||||
void *dev_ptr = NULL;
|
||||
|
||||
void *host_ptr = &item->icvs;
|
||||
void *dev_ptr = omp_get_mapped_ptr (host_ptr, device);
|
||||
cur_node.host_start = (uintptr_t) host_ptr;
|
||||
cur_node.host_end = cur_node.host_start;
|
||||
splay_tree_key n = gomp_map_0len_lookup (mem_map, &cur_node);
|
||||
|
||||
if (n)
|
||||
{
|
||||
uintptr_t offset = cur_node.host_start - n->host_start;
|
||||
dev_ptr = (void *) (n->tgt->tgt_start + n->tgt_offset + offset);
|
||||
}
|
||||
|
||||
gomp_mutex_unlock (&devicep->lock);
|
||||
|
||||
if (dev_ptr != NULL)
|
||||
gomp_copy_dev2host (devicep, NULL, host_ptr, dev_ptr,
|
||||
sizeof (struct gomp_offload_icvs));
|
||||
|
@ -5303,6 +5320,11 @@ gomp_target_init (void)
|
|||
{
|
||||
/* Augment DEVICES and NUM_DEVICES. */
|
||||
|
||||
/* If USM has been requested and is supported by all devices
|
||||
of this type, set the capability accordingly. */
|
||||
if (omp_requires_mask & GOMP_REQUIRES_UNIFIED_SHARED_MEMORY)
|
||||
current_device.capabilities |= GOMP_OFFLOAD_CAP_SHARED_MEM;
|
||||
|
||||
devs = realloc (devs, (num_devs + new_num_devs)
|
||||
* sizeof (struct gomp_device_descr));
|
||||
if (!devs)
|
||||
|
|
Loading…
Add table
Reference in a new issue