libgomp: Add no-target-region rev offload test + fix plugin-nvptx
OpenMP permits that a 'target device(ancestor:1)' is called without being enclosed in a target region - using the current device (i.e. the host) in that case. This commit adds a testcase for this. In case of nvptx, the missing on-device 'GOMP_target_ext' call causes that it and also the associated on-device GOMP_REV_OFFLOAD_VAR variable are not linked in from nvptx's libgomp.a. Thus, handle the failing cuModuleGetGlobal gracefully by disabling reverse offload and assuming that the failure is fine. libgomp/ChangeLog: * plugin/plugin-nvptx.c (GOMP_OFFLOAD_load_image): Use unsigned int for 'i' to match 'fn_entries'; regard absent GOMP_REV_OFFLOAD_VAR as valid and the code having no reverse-offload code. * testsuite/libgomp.c-c++-common/reverse-offload-2.c: New test.
This commit is contained in:
parent
c16e85d726
commit
9f9d128f45
2 changed files with 73 additions and 12 deletions
|
@ -1390,7 +1390,8 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
|
|||
else if (rev_fn_table)
|
||||
{
|
||||
CUdeviceptr var;
|
||||
size_t bytes, i;
|
||||
size_t bytes;
|
||||
unsigned int i;
|
||||
r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &var, &bytes, module,
|
||||
"$offload_func_table");
|
||||
if (r != CUDA_SUCCESS)
|
||||
|
@ -1413,12 +1414,11 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
|
|||
|
||||
if (rev_fn_table && *rev_fn_table && dev->rev_data == NULL)
|
||||
{
|
||||
/* cuMemHostAlloc memory is accessible on the device, if unified-shared
|
||||
address is supported; this is assumed - see comment in
|
||||
nvptx_open_device for CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING. */
|
||||
CUDA_CALL_ASSERT (cuMemHostAlloc, (void **) &dev->rev_data,
|
||||
sizeof (*dev->rev_data), CU_MEMHOSTALLOC_DEVICEMAP);
|
||||
CUdeviceptr dp = (CUdeviceptr) dev->rev_data;
|
||||
/* Get the on-device GOMP_REV_OFFLOAD_VAR variable. It should be
|
||||
available but it might be not. One reason could be: if the user code
|
||||
has 'omp target device(ancestor:1)' in pure hostcode, GOMP_target_ext
|
||||
is not called on the device and, hence, it and GOMP_REV_OFFLOAD_VAR
|
||||
are not linked in. */
|
||||
CUdeviceptr device_rev_offload_var;
|
||||
size_t device_rev_offload_size;
|
||||
CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal,
|
||||
|
@ -1426,11 +1426,23 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
|
|||
&device_rev_offload_size, module,
|
||||
XSTRING (GOMP_REV_OFFLOAD_VAR));
|
||||
if (r != CUDA_SUCCESS)
|
||||
GOMP_PLUGIN_fatal ("cuModuleGetGlobal error - GOMP_REV_OFFLOAD_VAR: %s", cuda_error (r));
|
||||
r = CUDA_CALL_NOCHECK (cuMemcpyHtoD, device_rev_offload_var, &dp,
|
||||
sizeof (dp));
|
||||
if (r != CUDA_SUCCESS)
|
||||
GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
|
||||
{
|
||||
free (*rev_fn_table);
|
||||
*rev_fn_table = NULL;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* cuMemHostAlloc memory is accessible on the device, if
|
||||
unified-shared address is supported; this is assumed - see comment
|
||||
in nvptx_open_device for CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING. */
|
||||
CUDA_CALL_ASSERT (cuMemHostAlloc, (void **) &dev->rev_data,
|
||||
sizeof (*dev->rev_data), CU_MEMHOSTALLOC_DEVICEMAP);
|
||||
CUdeviceptr dp = (CUdeviceptr) dev->rev_data;
|
||||
r = CUDA_CALL_NOCHECK (cuMemcpyHtoD, device_rev_offload_var, &dp,
|
||||
sizeof (dp));
|
||||
if (r != CUDA_SUCCESS)
|
||||
GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
|
||||
}
|
||||
}
|
||||
|
||||
nvptx_set_clocktick (module, dev);
|
||||
|
|
49
libgomp/testsuite/libgomp.c-c++-common/reverse-offload-2.c
Normal file
49
libgomp/testsuite/libgomp.c-c++-common/reverse-offload-2.c
Normal file
|
@ -0,0 +1,49 @@
|
|||
/* { dg-do run } */
|
||||
/* { dg-additional-options "-foffload-options=nvptx-none=-misa=sm_35" { target { offload_target_nvptx } } } */
|
||||
|
||||
#pragma omp requires reverse_offload
|
||||
|
||||
int
|
||||
main ()
|
||||
{
|
||||
int A[10];
|
||||
int y;
|
||||
|
||||
for (int i = 0; i < 10; i++)
|
||||
A[i] = 2*i;
|
||||
|
||||
y = 42;
|
||||
|
||||
/* Pointlessly copy to the default device. */
|
||||
#pragma omp target data map(to: A)
|
||||
{
|
||||
/* Not enclosed in a target region (= i.e. running on the host); the
|
||||
following is valid - it runs on the current device (= host). */
|
||||
#pragma omp target device ( ancestor:1 ) firstprivate(y) map(to: A)
|
||||
{
|
||||
if (y != 42)
|
||||
__builtin_abort ();
|
||||
for (int i = 0; i < 10; i++)
|
||||
if (A[i] != 2*i)
|
||||
__builtin_abort ();
|
||||
for (int i = 0; i < 10; i++)
|
||||
if (A[i] != 2*i)
|
||||
A[i] = 4*i;
|
||||
y = 31;
|
||||
}
|
||||
|
||||
if (y != 42)
|
||||
__builtin_abort ();
|
||||
for (int i = 0; i < 10; i++)
|
||||
if (A[i] != 2*i)
|
||||
__builtin_abort ();
|
||||
}
|
||||
|
||||
if (y != 42)
|
||||
__builtin_abort ();
|
||||
for (int i = 0; i < 10; i++)
|
||||
if (A[i] != 2*i)
|
||||
__builtin_abort ();
|
||||
|
||||
return 0;
|
||||
}
|
Loading…
Add table
Reference in a new issue