OpenMP: Call cuMemcpy2D/cuMemcpy3D for nvptx for omp_target_memcpy_rect
When copying a 2D or 3D rectangular memmory block, the performance is better when using CUDA's cuMemcpy2D/cuMemcpy3D instead of copying the data one by one. That's what this commit does. Additionally, it permits device-to-device copies, if neccessary using a temporary variable on the host. include/ChangeLog: * cuda/cuda.h (CUlimit): Add CUDA_ERROR_NOT_INITIALIZED, CUDA_ERROR_DEINITIALIZED, CUDA_ERROR_INVALID_HANDLE. (CUarray, CUmemorytype, CUDA_MEMCPY2D, CUDA_MEMCPY3D, CUDA_MEMCPY3D_PEER): New typdefs. (cuMemcpy2D, cuMemcpy2DAsync, cuMemcpy2DUnaligned, cuMemcpy3D, cuMemcpy3DAsync, cuMemcpy3DPeer, cuMemcpy3DPeerAsync): New prototypes. libgomp/ChangeLog: * libgomp-plugin.h (GOMP_OFFLOAD_memcpy2d, GOMP_OFFLOAD_memcpy3d): New prototypes. * libgomp.h (struct gomp_device_descr): Add memcpy2d_func and memcpy3d_func. * libgomp.texi (nvtpx): Document when cuMemcpy2D/cuMemcpy3D is used. * oacc-host.c (memcpy2d_func, .memcpy3d_func): Init with NULL. * plugin/cuda-lib.def (cuMemcpy2D, cuMemcpy2DUnaligned, cuMemcpy3D): Invoke via CUDA_ONE_CALL. * plugin/plugin-nvptx.c (GOMP_OFFLOAD_memcpy2d, GOMP_OFFLOAD_memcpy3d): New. * target.c (omp_target_memcpy_rect_worker): (omp_target_memcpy_rect_check, omp_target_memcpy_rect_copy): Permit all device-to-device copyies; invoke new plugins for 2D and 3D copying when available. (gomp_load_plugin_for_device): DLSYM the new plugin functions. * testsuite/libgomp.c/target-12.c: Fix dimension bug. * testsuite/libgomp.fortran/target-12.f90: Likewise. * testsuite/libgomp.fortran/target-memcpy-rect-1.f90: New test.
This commit is contained in:
parent
c194a41336
commit
25072a477a
11 changed files with 885 additions and 32 deletions
|
@ -47,6 +47,7 @@ typedef void *CUevent;
|
|||
typedef void *CUfunction;
|
||||
typedef void *CUlinkState;
|
||||
typedef void *CUmodule;
|
||||
typedef void *CUarray;
|
||||
typedef size_t (*CUoccupancyB2DSize)(int);
|
||||
typedef void *CUstream;
|
||||
|
||||
|
@ -54,7 +55,10 @@ typedef enum {
|
|||
CUDA_SUCCESS = 0,
|
||||
CUDA_ERROR_INVALID_VALUE = 1,
|
||||
CUDA_ERROR_OUT_OF_MEMORY = 2,
|
||||
CUDA_ERROR_NOT_INITIALIZED = 3,
|
||||
CUDA_ERROR_DEINITIALIZED = 4,
|
||||
CUDA_ERROR_INVALID_CONTEXT = 201,
|
||||
CUDA_ERROR_INVALID_HANDLE = 400,
|
||||
CUDA_ERROR_NOT_FOUND = 500,
|
||||
CUDA_ERROR_NOT_READY = 600,
|
||||
CUDA_ERROR_LAUNCH_FAILED = 719,
|
||||
|
@ -126,6 +130,75 @@ typedef enum {
|
|||
CU_LIMIT_MALLOC_HEAP_SIZE = 0x02,
|
||||
} CUlimit;
|
||||
|
||||
typedef enum {
|
||||
CU_MEMORYTYPE_HOST = 0x01,
|
||||
CU_MEMORYTYPE_DEVICE = 0x02,
|
||||
CU_MEMORYTYPE_ARRAY = 0x03,
|
||||
CU_MEMORYTYPE_UNIFIED = 0x04
|
||||
} CUmemorytype;
|
||||
|
||||
typedef struct {
|
||||
size_t srcXInBytes, srcY;
|
||||
CUmemorytype srcMemoryType;
|
||||
const void *srcHost;
|
||||
CUdeviceptr srcDevice;
|
||||
CUarray srcArray;
|
||||
size_t srcPitch;
|
||||
|
||||
size_t dstXInBytes, dstY;
|
||||
CUmemorytype dstMemoryType;
|
||||
const void *dstHost;
|
||||
CUdeviceptr dstDevice;
|
||||
CUarray dstArray;
|
||||
size_t dstPitch;
|
||||
|
||||
size_t WidthInBytes, Height;
|
||||
} CUDA_MEMCPY2D;
|
||||
|
||||
typedef struct {
|
||||
size_t srcXInBytes, srcY, srcZ;
|
||||
size_t srcLOD;
|
||||
CUmemorytype srcMemoryType;
|
||||
const void *srcHost;
|
||||
CUdeviceptr srcDevice;
|
||||
CUarray srcArray;
|
||||
void *dummy;
|
||||
size_t srcPitch, srcHeight;
|
||||
|
||||
size_t dstXInBytes, dstY, dstZ;
|
||||
size_t dstLOD;
|
||||
CUmemorytype dstMemoryType;
|
||||
const void *dstHost;
|
||||
CUdeviceptr dstDevice;
|
||||
CUarray dstArray;
|
||||
void *dummy2;
|
||||
size_t dstPitch, dstHeight;
|
||||
|
||||
size_t WidthInBytes, Height, Depth;
|
||||
} CUDA_MEMCPY3D;
|
||||
|
||||
typedef struct {
|
||||
size_t srcXInBytes, srcY, srcZ;
|
||||
size_t srcLOD;
|
||||
CUmemorytype srcMemoryType;
|
||||
const void *srcHost;
|
||||
CUdeviceptr srcDevice;
|
||||
CUarray srcArray;
|
||||
CUcontext srcContext;
|
||||
size_t srcPitch, srcHeight;
|
||||
|
||||
size_t dstXInBytes, dstY, dstZ;
|
||||
size_t dstLOD;
|
||||
CUmemorytype dstMemoryType;
|
||||
const void *dstHost;
|
||||
CUdeviceptr dstDevice;
|
||||
CUarray dstArray;
|
||||
CUcontext dstContext;
|
||||
size_t dstPitch, dstHeight;
|
||||
|
||||
size_t WidthInBytes, Height, Depth;
|
||||
} CUDA_MEMCPY3D_PEER;
|
||||
|
||||
#define cuCtxCreate cuCtxCreate_v2
|
||||
CUresult cuCtxCreate (CUcontext *, unsigned, CUdevice);
|
||||
#define cuCtxDestroy cuCtxDestroy_v2
|
||||
|
@ -183,6 +256,18 @@ CUresult cuMemcpyDtoHAsync (void *, CUdeviceptr, size_t, CUstream);
|
|||
CUresult cuMemcpyHtoD (CUdeviceptr, const void *, size_t);
|
||||
#define cuMemcpyHtoDAsync cuMemcpyHtoDAsync_v2
|
||||
CUresult cuMemcpyHtoDAsync (CUdeviceptr, const void *, size_t, CUstream);
|
||||
#define cuMemcpy2D cuMemcpy2D_v2
|
||||
CUresult cuMemcpy2D (const CUDA_MEMCPY2D *);
|
||||
#define cuMemcpy2DAsync cuMemcpy2DAsync_v2
|
||||
CUresult cuMemcpy2DAsync (const CUDA_MEMCPY2D *, CUstream);
|
||||
#define cuMemcpy2DUnaligned cuMemcpy2DUnaligned_v2
|
||||
CUresult cuMemcpy2DUnaligned (const CUDA_MEMCPY2D *);
|
||||
#define cuMemcpy3D cuMemcpy3D_v2
|
||||
CUresult cuMemcpy3D (const CUDA_MEMCPY3D *);
|
||||
#define cuMemcpy3DAsync cuMemcpy3DAsync_v2
|
||||
CUresult cuMemcpy3DAsync (const CUDA_MEMCPY3D *, CUstream);
|
||||
CUresult cuMemcpy3DPeer (const CUDA_MEMCPY3D_PEER *);
|
||||
CUresult cuMemcpy3DPeerAsync (const CUDA_MEMCPY3D_PEER *, CUstream);
|
||||
#define cuMemFree cuMemFree_v2
|
||||
CUresult cuMemFree (CUdeviceptr);
|
||||
CUresult cuMemFreeHost (void *);
|
||||
|
|
|
@ -139,6 +139,13 @@ extern bool GOMP_OFFLOAD_free (int, void *);
|
|||
extern bool GOMP_OFFLOAD_dev2host (int, void *, const void *, size_t);
|
||||
extern bool GOMP_OFFLOAD_host2dev (int, void *, const void *, size_t);
|
||||
extern bool GOMP_OFFLOAD_dev2dev (int, void *, const void *, size_t);
|
||||
extern int GOMP_OFFLOAD_memcpy2d (int, int, size_t, size_t,
|
||||
void*, size_t, size_t, size_t,
|
||||
const void*, size_t, size_t, size_t);
|
||||
extern int GOMP_OFFLOAD_memcpy3d (int, int, size_t, size_t, size_t, void *,
|
||||
size_t, size_t, size_t, size_t, size_t,
|
||||
const void *, size_t, size_t, size_t, size_t,
|
||||
size_t);
|
||||
extern bool GOMP_OFFLOAD_can_run (void *);
|
||||
extern void GOMP_OFFLOAD_run (int, void *, void *, void **);
|
||||
extern void GOMP_OFFLOAD_async_run (int, void *, void *, void **, void *);
|
||||
|
|
|
@ -1388,6 +1388,8 @@ struct gomp_device_descr
|
|||
__typeof (GOMP_OFFLOAD_free) *free_func;
|
||||
__typeof (GOMP_OFFLOAD_dev2host) *dev2host_func;
|
||||
__typeof (GOMP_OFFLOAD_host2dev) *host2dev_func;
|
||||
__typeof (GOMP_OFFLOAD_memcpy2d) *memcpy2d_func;
|
||||
__typeof (GOMP_OFFLOAD_memcpy3d) *memcpy3d_func;
|
||||
__typeof (GOMP_OFFLOAD_dev2dev) *dev2dev_func;
|
||||
__typeof (GOMP_OFFLOAD_can_run) *can_run_func;
|
||||
__typeof (GOMP_OFFLOAD_run) *run_func;
|
||||
|
|
|
@ -5060,6 +5060,11 @@ The implementation remark:
|
|||
list of available devices (``host fallback'').
|
||||
@item The default per-warp stack size is 128 kiB; see also @code{-msoft-stack}
|
||||
in the GCC manual.
|
||||
@item The OpenMP routines @code{omp_target_memcpy_rect} and
|
||||
@code{omp_target_memcpy_rect_async} and the @code{target update}
|
||||
directive for non-contiguous list items will use the 2D and 3D
|
||||
memory-copy functions of the CUDA library. Higher dimensions will
|
||||
call those functions in a loop and are therefore supported.
|
||||
@end itemize
|
||||
|
||||
|
||||
|
|
|
@ -281,6 +281,8 @@ static struct gomp_device_descr host_dispatch =
|
|||
.free_func = host_free,
|
||||
.dev2host_func = host_dev2host,
|
||||
.host2dev_func = host_host2dev,
|
||||
.memcpy2d_func = NULL,
|
||||
.memcpy3d_func = NULL,
|
||||
.run_func = host_run,
|
||||
|
||||
.mem_map = { NULL },
|
||||
|
|
|
@ -36,6 +36,9 @@ CUDA_ONE_CALL (cuMemcpyDtoH)
|
|||
CUDA_ONE_CALL (cuMemcpyDtoHAsync)
|
||||
CUDA_ONE_CALL (cuMemcpyHtoD)
|
||||
CUDA_ONE_CALL (cuMemcpyHtoDAsync)
|
||||
CUDA_ONE_CALL (cuMemcpy2D)
|
||||
CUDA_ONE_CALL (cuMemcpy2DUnaligned)
|
||||
CUDA_ONE_CALL (cuMemcpy3D)
|
||||
CUDA_ONE_CALL (cuMemFree)
|
||||
CUDA_ONE_CALL (cuMemFreeHost)
|
||||
CUDA_ONE_CALL (cuMemGetAddressRange)
|
||||
|
|
|
@ -1781,6 +1781,122 @@ GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n)
|
|||
return true;
|
||||
}
|
||||
|
||||
int
|
||||
GOMP_OFFLOAD_memcpy2d (int dst_ord, int src_ord, size_t dim1_size,
|
||||
size_t dim0_len, void *dst, size_t dst_offset1_size,
|
||||
size_t dst_offset0_len, size_t dst_dim1_size,
|
||||
const void *src, size_t src_offset1_size,
|
||||
size_t src_offset0_len, size_t src_dim1_size)
|
||||
{
|
||||
if (!nvptx_attach_host_thread_to_device (src_ord != -1 ? src_ord : dst_ord))
|
||||
return false;
|
||||
|
||||
/* TODO: Consider using CU_MEMORYTYPE_UNIFIED if supported. */
|
||||
|
||||
CUDA_MEMCPY2D data;
|
||||
data.WidthInBytes = dim1_size;
|
||||
data.Height = dim0_len;
|
||||
|
||||
if (dst_ord == -1)
|
||||
{
|
||||
data.dstMemoryType = CU_MEMORYTYPE_HOST;
|
||||
data.dstHost = dst;
|
||||
}
|
||||
else
|
||||
{
|
||||
data.dstMemoryType = CU_MEMORYTYPE_DEVICE;
|
||||
data.dstDevice = (CUdeviceptr) dst;
|
||||
}
|
||||
data.dstPitch = dst_dim1_size;
|
||||
data.dstXInBytes = dst_offset1_size;
|
||||
data.dstY = dst_offset0_len;
|
||||
|
||||
if (src_ord == -1)
|
||||
{
|
||||
data.srcMemoryType = CU_MEMORYTYPE_HOST;
|
||||
data.srcHost = src;
|
||||
}
|
||||
else
|
||||
{
|
||||
data.srcMemoryType = CU_MEMORYTYPE_DEVICE;
|
||||
data.srcDevice = (CUdeviceptr) src;
|
||||
}
|
||||
data.srcPitch = src_dim1_size;
|
||||
data.srcXInBytes = src_offset1_size;
|
||||
data.srcY = src_offset0_len;
|
||||
|
||||
CUresult res = CUDA_CALL_NOCHECK (cuMemcpy2D, &data);
|
||||
if (res == CUDA_ERROR_INVALID_VALUE)
|
||||
/* If pitch > CU_DEVICE_ATTRIBUTE_MAX_PITCH or for device-to-device
|
||||
for (some) memory not allocated by cuMemAllocPitch, cuMemcpy2D fails
|
||||
with an error; try the slower cuMemcpy2DUnaligned now. */
|
||||
CUDA_CALL (cuMemcpy2DUnaligned, &data);
|
||||
else if (res != CUDA_SUCCESS)
|
||||
{
|
||||
GOMP_PLUGIN_error ("cuMemcpy2D error: %s", cuda_error (res));
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
int
|
||||
GOMP_OFFLOAD_memcpy3d (int dst_ord, int src_ord, size_t dim2_size,
|
||||
size_t dim1_len, size_t dim0_len, void *dst,
|
||||
size_t dst_offset2_size, size_t dst_offset1_len,
|
||||
size_t dst_offset0_len, size_t dst_dim2_size,
|
||||
size_t dst_dim1_len, const void *src,
|
||||
size_t src_offset2_size, size_t src_offset1_len,
|
||||
size_t src_offset0_len, size_t src_dim2_size,
|
||||
size_t src_dim1_len)
|
||||
{
|
||||
if (!nvptx_attach_host_thread_to_device (src_ord != -1 ? src_ord : dst_ord))
|
||||
return false;
|
||||
|
||||
/* TODO: Consider using CU_MEMORYTYPE_UNIFIED if supported. */
|
||||
|
||||
CUDA_MEMCPY3D data;
|
||||
data.WidthInBytes = dim2_size;
|
||||
data.Height = dim1_len;
|
||||
data.Depth = dim0_len;
|
||||
|
||||
if (dst_ord == -1)
|
||||
{
|
||||
data.dstMemoryType = CU_MEMORYTYPE_HOST;
|
||||
data.dstHost = dst;
|
||||
}
|
||||
else
|
||||
{
|
||||
data.dstMemoryType = CU_MEMORYTYPE_DEVICE;
|
||||
data.dstDevice = (CUdeviceptr) dst;
|
||||
}
|
||||
data.dstPitch = dst_dim2_size;
|
||||
data.dstHeight = dst_dim1_len;
|
||||
data.dstXInBytes = dst_offset2_size;
|
||||
data.dstY = dst_offset1_len;
|
||||
data.dstZ = dst_offset0_len;
|
||||
data.dstLOD = 0;
|
||||
|
||||
if (src_ord == -1)
|
||||
{
|
||||
data.srcMemoryType = CU_MEMORYTYPE_HOST;
|
||||
data.srcHost = src;
|
||||
}
|
||||
else
|
||||
{
|
||||
data.srcMemoryType = CU_MEMORYTYPE_DEVICE;
|
||||
data.srcDevice = (CUdeviceptr) src;
|
||||
}
|
||||
data.srcPitch = src_dim2_size;
|
||||
data.srcHeight = src_dim1_len;
|
||||
data.srcXInBytes = src_offset2_size;
|
||||
data.srcY = src_offset1_len;
|
||||
data.srcZ = src_offset0_len;
|
||||
data.srcLOD = 0;
|
||||
|
||||
CUDA_CALL (cuMemcpy3D, &data);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool
|
||||
GOMP_OFFLOAD_openacc_async_host2dev (int ord, void *dst, const void *src,
|
||||
size_t n, struct goacc_asyncqueue *aq)
|
||||
|
|
154
libgomp/target.c
154
libgomp/target.c
|
@ -4526,7 +4526,8 @@ omp_target_memcpy_rect_worker (void *dst, const void *src, size_t element_size,
|
|||
const size_t *dst_dimensions,
|
||||
const size_t *src_dimensions,
|
||||
struct gomp_device_descr *dst_devicep,
|
||||
struct gomp_device_descr *src_devicep)
|
||||
struct gomp_device_descr *src_devicep,
|
||||
size_t *tmp_size, void **tmp)
|
||||
{
|
||||
size_t dst_slice = element_size;
|
||||
size_t src_slice = element_size;
|
||||
|
@ -4539,36 +4540,120 @@ omp_target_memcpy_rect_worker (void *dst, const void *src, size_t element_size,
|
|||
|| __builtin_mul_overflow (element_size, dst_offsets[0], &dst_off)
|
||||
|| __builtin_mul_overflow (element_size, src_offsets[0], &src_off))
|
||||
return EINVAL;
|
||||
if (dst_devicep == NULL && src_devicep == NULL)
|
||||
if (src_devicep != NULL && src_devicep == dst_devicep)
|
||||
ret = src_devicep->dev2dev_func (src_devicep->target_id,
|
||||
(char *) dst + dst_off,
|
||||
(const char *) src + src_off,
|
||||
length);
|
||||
else if (src_devicep != NULL
|
||||
&& (dst_devicep == NULL
|
||||
|| (dst_devicep->capabilities
|
||||
& GOMP_OFFLOAD_CAP_SHARED_MEM)))
|
||||
ret = src_devicep->dev2host_func (src_devicep->target_id,
|
||||
(char *) dst + dst_off,
|
||||
(const char *) src + src_off,
|
||||
length);
|
||||
else if (dst_devicep != NULL
|
||||
&& (src_devicep == NULL
|
||||
|| (src_devicep->capabilities
|
||||
& GOMP_OFFLOAD_CAP_SHARED_MEM)))
|
||||
ret = dst_devicep->host2dev_func (dst_devicep->target_id,
|
||||
(char *) dst + dst_off,
|
||||
(const char *) src + src_off,
|
||||
length);
|
||||
else if (dst_devicep == NULL && src_devicep == NULL)
|
||||
{
|
||||
memcpy ((char *) dst + dst_off, (const char *) src + src_off,
|
||||
length);
|
||||
ret = 1;
|
||||
}
|
||||
else if (src_devicep == NULL)
|
||||
ret = dst_devicep->host2dev_func (dst_devicep->target_id,
|
||||
(char *) dst + dst_off,
|
||||
(const char *) src + src_off,
|
||||
length);
|
||||
else if (dst_devicep == NULL)
|
||||
ret = src_devicep->dev2host_func (src_devicep->target_id,
|
||||
(char *) dst + dst_off,
|
||||
(const char *) src + src_off,
|
||||
length);
|
||||
else if (src_devicep == dst_devicep)
|
||||
ret = src_devicep->dev2dev_func (src_devicep->target_id,
|
||||
(char *) dst + dst_off,
|
||||
(const char *) src + src_off,
|
||||
length);
|
||||
else
|
||||
ret = 0;
|
||||
{
|
||||
if (*tmp_size == 0)
|
||||
{
|
||||
*tmp_size = length;
|
||||
*tmp = malloc (length);
|
||||
if (*tmp == NULL)
|
||||
return ENOMEM;
|
||||
}
|
||||
else if (*tmp_size < length)
|
||||
{
|
||||
*tmp_size = length;
|
||||
*tmp = realloc (*tmp, length);
|
||||
if (*tmp == NULL)
|
||||
return ENOMEM;
|
||||
}
|
||||
ret = src_devicep->dev2host_func (src_devicep->target_id, *tmp,
|
||||
(const char *) src + src_off,
|
||||
length);
|
||||
if (ret == 1)
|
||||
ret = dst_devicep->host2dev_func (dst_devicep->target_id,
|
||||
(char *) dst + dst_off, *tmp,
|
||||
length);
|
||||
}
|
||||
return ret ? 0 : EINVAL;
|
||||
}
|
||||
|
||||
/* FIXME: it would be nice to have some plugin function to handle
|
||||
num_dims == 2 and num_dims == 3 more efficiently. Larger ones can
|
||||
be handled in the generic recursion below, and for host-host it
|
||||
should be used even for any num_dims >= 2. */
|
||||
/* host->device, device->host and same-device device->device. */
|
||||
if (num_dims == 2
|
||||
&& ((src_devicep
|
||||
&& src_devicep == dst_devicep
|
||||
&& src_devicep->memcpy2d_func)
|
||||
|| (!src_devicep != !dst_devicep
|
||||
&& ((src_devicep && src_devicep->memcpy2d_func)
|
||||
|| (dst_devicep && dst_devicep->memcpy2d_func)))))
|
||||
{
|
||||
size_t vol_sz1, dst_sz1, src_sz1, dst_off_sz1, src_off_sz1;
|
||||
int dst_id = dst_devicep ? dst_devicep->target_id : -1;
|
||||
int src_id = src_devicep ? src_devicep->target_id : -1;
|
||||
struct gomp_device_descr *devp = dst_devicep ? dst_devicep : src_devicep;
|
||||
|
||||
if (__builtin_mul_overflow (volume[1], element_size, &vol_sz1)
|
||||
|| __builtin_mul_overflow (dst_dimensions[1], element_size, &dst_sz1)
|
||||
|| __builtin_mul_overflow (src_dimensions[1], element_size, &src_sz1)
|
||||
|| __builtin_mul_overflow (dst_offsets[1], element_size, &dst_off_sz1)
|
||||
|| __builtin_mul_overflow (src_offsets[1], element_size,
|
||||
&src_off_sz1))
|
||||
return EINVAL;
|
||||
ret = devp->memcpy2d_func (dst_id, src_id, vol_sz1, volume[0],
|
||||
dst, dst_off_sz1, dst_offsets[0], dst_sz1,
|
||||
src, src_off_sz1, src_offsets[0], src_sz1);
|
||||
if (ret != -1)
|
||||
return ret ? 0 : EINVAL;
|
||||
}
|
||||
else if (num_dims == 3
|
||||
&& ((src_devicep
|
||||
&& src_devicep == dst_devicep
|
||||
&& src_devicep->memcpy3d_func)
|
||||
|| (!src_devicep != !dst_devicep
|
||||
&& ((src_devicep && src_devicep->memcpy3d_func)
|
||||
|| (dst_devicep && dst_devicep->memcpy3d_func)))))
|
||||
{
|
||||
size_t vol_sz2, dst_sz2, src_sz2, dst_off_sz2, src_off_sz2;
|
||||
int dst_id = dst_devicep ? dst_devicep->target_id : -1;
|
||||
int src_id = src_devicep ? src_devicep->target_id : -1;
|
||||
struct gomp_device_descr *devp = dst_devicep ? dst_devicep : src_devicep;
|
||||
|
||||
if (__builtin_mul_overflow (volume[2], element_size, &vol_sz2)
|
||||
|| __builtin_mul_overflow (dst_dimensions[2], element_size, &dst_sz2)
|
||||
|| __builtin_mul_overflow (src_dimensions[2], element_size, &src_sz2)
|
||||
|| __builtin_mul_overflow (dst_offsets[2], element_size, &dst_off_sz2)
|
||||
|| __builtin_mul_overflow (src_offsets[2], element_size,
|
||||
&src_off_sz2))
|
||||
return EINVAL;
|
||||
ret = devp->memcpy3d_func (dst_id, src_id, vol_sz2, volume[1], volume[0],
|
||||
dst, dst_off_sz2, dst_offsets[1],
|
||||
dst_offsets[0], dst_sz2, dst_dimensions[1],
|
||||
src, src_off_sz2, src_offsets[1],
|
||||
src_offsets[0], src_sz2, src_dimensions[1]);
|
||||
if (ret != -1)
|
||||
return ret ? 0 : EINVAL;
|
||||
}
|
||||
|
||||
for (i = 1; i < num_dims; i++)
|
||||
if (__builtin_mul_overflow (dst_slice, dst_dimensions[i], &dst_slice)
|
||||
|
@ -4585,7 +4670,7 @@ omp_target_memcpy_rect_worker (void *dst, const void *src, size_t element_size,
|
|||
volume + 1, dst_offsets + 1,
|
||||
src_offsets + 1, dst_dimensions + 1,
|
||||
src_dimensions + 1, dst_devicep,
|
||||
src_devicep);
|
||||
src_devicep, tmp_size, tmp);
|
||||
if (ret)
|
||||
return ret;
|
||||
dst_off += dst_slice;
|
||||
|
@ -4608,9 +4693,6 @@ omp_target_memcpy_rect_check (void *dst, const void *src, int dst_device_num,
|
|||
if (ret)
|
||||
return ret;
|
||||
|
||||
if (*src_devicep != NULL && *dst_devicep != NULL && *src_devicep != *dst_devicep)
|
||||
return EINVAL;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -4624,18 +4706,36 @@ omp_target_memcpy_rect_copy (void *dst, const void *src,
|
|||
struct gomp_device_descr *dst_devicep,
|
||||
struct gomp_device_descr *src_devicep)
|
||||
{
|
||||
if (src_devicep)
|
||||
size_t tmp_size = 0;
|
||||
void *tmp = NULL;
|
||||
bool lock_src;
|
||||
bool lock_dst;
|
||||
|
||||
lock_src = (src_devicep
|
||||
&& (!dst_devicep
|
||||
|| src_devicep == dst_devicep
|
||||
|| !(src_devicep->capabilities
|
||||
& GOMP_OFFLOAD_CAP_SHARED_MEM)));
|
||||
lock_dst = (dst_devicep
|
||||
&& (!lock_src
|
||||
|| (src_devicep != dst_devicep
|
||||
&& !(dst_devicep->capabilities
|
||||
& GOMP_OFFLOAD_CAP_SHARED_MEM))));
|
||||
if (lock_src)
|
||||
gomp_mutex_lock (&src_devicep->lock);
|
||||
else if (dst_devicep)
|
||||
if (lock_dst)
|
||||
gomp_mutex_lock (&dst_devicep->lock);
|
||||
int ret = omp_target_memcpy_rect_worker (dst, src, element_size, num_dims,
|
||||
volume, dst_offsets, src_offsets,
|
||||
dst_dimensions, src_dimensions,
|
||||
dst_devicep, src_devicep);
|
||||
if (src_devicep)
|
||||
dst_devicep, src_devicep,
|
||||
&tmp_size, &tmp);
|
||||
if (lock_src)
|
||||
gomp_mutex_unlock (&src_devicep->lock);
|
||||
else if (dst_devicep)
|
||||
if (lock_dst)
|
||||
gomp_mutex_unlock (&dst_devicep->lock);
|
||||
if (tmp)
|
||||
free (tmp);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
@ -4976,6 +5076,8 @@ gomp_load_plugin_for_device (struct gomp_device_descr *device,
|
|||
DLSYM (free);
|
||||
DLSYM (dev2host);
|
||||
DLSYM (host2dev);
|
||||
DLSYM (memcpy2d);
|
||||
DLSYM (memcpy3d);
|
||||
device->capabilities = device->get_caps_func ();
|
||||
if (device->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400)
|
||||
{
|
||||
|
|
|
@ -80,12 +80,12 @@ main ()
|
|||
src_offsets[2] = 1;
|
||||
src_offsets[1] = 0;
|
||||
src_offsets[0] = 3;
|
||||
dst_dimensions[2] = 2;
|
||||
dst_dimensions[2] = 3;
|
||||
dst_dimensions[1] = 3;
|
||||
dst_dimensions[0] = 6;
|
||||
src_dimensions[2] = 3;
|
||||
src_dimensions[1] = 4;
|
||||
src_dimensions[0] = 6;
|
||||
src_dimensions[0] = 9;
|
||||
if (omp_target_memcpy_rect (p, q, sizeof (int), 3, volume,
|
||||
dst_offsets, src_offsets, dst_dimensions,
|
||||
src_dimensions, d, id) != 0)
|
||||
|
@ -98,7 +98,7 @@ main ()
|
|||
for (j = 0; j < 6; j++)
|
||||
for (k = 0; k < 3; k++)
|
||||
for (l = 0; l < 2; l++)
|
||||
if (q[j * 6 + k * 2 + l] != 3 * 12 + 4 + 1 + l + k * 3 + j * 12)
|
||||
if (q[j * 9 + k * 3 + l] != 3 * 12 + 4 + 1 + l + k * 3 + j * 12)
|
||||
err = 1;
|
||||
}
|
||||
|
||||
|
|
|
@ -93,12 +93,12 @@ program main
|
|||
src_offsets(2) = 1
|
||||
src_offsets(1) = 0
|
||||
src_offsets(0) = 3
|
||||
dst_dimensions(2) = 2
|
||||
dst_dimensions(2) = 3
|
||||
dst_dimensions(1) = 3
|
||||
dst_dimensions(0) = 6
|
||||
src_dimensions(2) = 3
|
||||
src_dimensions(1) = 4
|
||||
src_dimensions(0) = 6
|
||||
src_dimensions(0) = 9
|
||||
|
||||
if (omp_target_memcpy_rect (p, c_loc (q), sizeof (q(0)), 3, volume, &
|
||||
dst_offsets, src_offsets, dst_dimensions, &
|
||||
|
@ -112,7 +112,7 @@ program main
|
|||
do j = 0, 5
|
||||
do k = 0, 2
|
||||
do l = 0, 1
|
||||
if (q(j * 6 + k * 2 + l) /= 3 * 12 + 4 + 1 + l + k * 3 + j * 12) &
|
||||
if (q(j * 9 + k * 3 + l) /= 3 * 12 + 4 + 1 + l + k * 3 + j * 12) &
|
||||
err = .true.
|
||||
end do
|
||||
end do
|
||||
|
|
531
libgomp/testsuite/libgomp.fortran/target-memcpy-rect-1.f90
Normal file
531
libgomp/testsuite/libgomp.fortran/target-memcpy-rect-1.f90
Normal file
|
@ -0,0 +1,531 @@
|
|||
program main
|
||||
use iso_c_binding
|
||||
use omp_lib
|
||||
implicit none (type, external)
|
||||
|
||||
integer(c_size_t), parameter :: sizeof_int = 4
|
||||
integer, parameter :: sk = c_size_t
|
||||
logical, allocatable :: isshared(:)
|
||||
integer, allocatable :: maxdim(:,:)
|
||||
integer :: ndev
|
||||
|
||||
ndev = omp_get_num_devices()
|
||||
call init_isshared
|
||||
call init_maxdim
|
||||
|
||||
call one
|
||||
call two
|
||||
call three
|
||||
call four
|
||||
|
||||
deallocate(isshared, maxdim)
|
||||
contains
|
||||
|
||||
subroutine init_maxdim
|
||||
integer :: dev, dev2, r
|
||||
integer(c_size_t), parameter :: nl = 0
|
||||
|
||||
allocate(maxdim(0:ndev,0:ndev))
|
||||
do dev = 0, ndev
|
||||
do dev2 = 0, ndev
|
||||
r = omp_target_memcpy_rect (c_null_ptr, c_null_ptr, nl, &
|
||||
num_dims=1_c_int, volume=[nl], &
|
||||
dst_offsets=[nl], src_offsets=[nl], &
|
||||
dst_dimensions=[nl], src_dimensions=[nl], &
|
||||
dst_device_num=dev, src_device_num=omp_initial_device)
|
||||
if (r < 3) stop 1 ! OpenMP requirement
|
||||
if (r < huge(0_c_int)) stop 2 ! GCC implementation
|
||||
maxdim(dev2,dev) = r
|
||||
end do
|
||||
end do
|
||||
end subroutine
|
||||
|
||||
subroutine init_isshared
|
||||
integer :: dev
|
||||
logical :: dev_isshared
|
||||
|
||||
allocate(isshared(0:ndev))
|
||||
do dev = 0, ndev
|
||||
dev_isshared = .false.
|
||||
!$omp target device(dev) map(to: dev_isshared)
|
||||
dev_isshared = .true.
|
||||
!$omp end target
|
||||
isshared(dev) = dev_isshared
|
||||
end do
|
||||
end subroutine
|
||||
|
||||
|
||||
subroutine one
|
||||
integer(c_size_t), parameter :: N1 = 30
|
||||
integer, target :: host_data(N1)
|
||||
type(c_ptr) :: dev_cptr(0:ndev), cptr, tmp_cptr
|
||||
integer :: dev, dev2, i, r
|
||||
|
||||
do dev = 0, ndev
|
||||
dev_cptr(dev) = omp_target_alloc (N1*sizeof_int, dev)
|
||||
if (.not. c_associated (dev_cptr(dev))) stop 11
|
||||
end do
|
||||
|
||||
do i = 1, N1
|
||||
host_data(i) = i
|
||||
end do
|
||||
|
||||
! copy full array host -> all devices + check value + set per-device value
|
||||
do dev = 0, ndev
|
||||
r = omp_target_memcpy_rect (dev_cptr(dev), c_loc(host_data), sizeof_int, &
|
||||
num_dims=1_c_int, volume=[N1], &
|
||||
dst_offsets=[0_sk], src_offsets=[0_sk], &
|
||||
dst_dimensions=[N1], src_dimensions=[N1], &
|
||||
dst_device_num=dev, src_device_num=omp_initial_device)
|
||||
if (r /= 0) stop 12
|
||||
cptr = dev_cptr(dev)
|
||||
!$omp target device(dev) is_device_ptr(cptr)
|
||||
block
|
||||
integer, pointer, contiguous :: fptr(:)
|
||||
call c_f_pointer(cptr, fptr, [N1])
|
||||
do i = 1, N1
|
||||
if (fptr(i) /= i) stop 13
|
||||
fptr(i) = i*100 + 10000 * (dev+3)
|
||||
end do
|
||||
end block
|
||||
end do
|
||||
|
||||
! Test strided data - forth and back - same array sizes
|
||||
do dev = 0, ndev
|
||||
do dev2 = 0, ndev
|
||||
tmp_cptr = omp_target_alloc (N1*sizeof_int, dev)
|
||||
if (.not. c_associated (tmp_cptr)) stop 14
|
||||
|
||||
!$omp target device(dev) is_device_ptr(tmp_cptr)
|
||||
block
|
||||
integer, pointer, contiguous :: fptr(:)
|
||||
call c_f_pointer(tmp_cptr, fptr, [N1])
|
||||
do i = 1, N1
|
||||
fptr(i) = i*100 + 10000*(dev+1)
|
||||
end do
|
||||
end block
|
||||
|
||||
if (N1-17 > N1 - max(12,13)) stop 18
|
||||
r = omp_target_memcpy_rect (dev_cptr(dev2), tmp_cptr, sizeof_int, &
|
||||
num_dims=1_c_int, volume=[N1-17], &
|
||||
dst_offsets=[12_sk], src_offsets=[13_sk], &
|
||||
dst_dimensions=[N1], src_dimensions=[N1], &
|
||||
dst_device_num=dev2, src_device_num=dev)
|
||||
if (r /= 0) stop 15
|
||||
|
||||
cptr = dev_cptr(dev2)
|
||||
!$omp target device(dev2) is_device_ptr(cptr)
|
||||
block
|
||||
logical :: checked(N1)
|
||||
integer, pointer, contiguous :: fptr(:)
|
||||
call c_f_pointer(cptr, fptr, [N1])
|
||||
checked = .false.
|
||||
do i = 1, N1-17
|
||||
if (fptr(i+12) /= (i+13)*100 + 10000 * (dev+1)) stop 16
|
||||
checked(i+12) = .true.
|
||||
end do
|
||||
! original device value
|
||||
do i = 1, N1
|
||||
if (.not. checked(i)) then
|
||||
if (fptr(i) /= i*100 + 10000 * (dev2+3)) stop 17
|
||||
end if
|
||||
end do
|
||||
end block
|
||||
call omp_target_free (tmp_cptr, dev)
|
||||
end do
|
||||
|
||||
! reset to original value
|
||||
do dev2 = 0, ndev
|
||||
cptr = dev_cptr(dev2)
|
||||
!$omp target device(dev2) is_device_ptr(cptr)
|
||||
block
|
||||
integer, pointer, contiguous :: fptr(:)
|
||||
call c_f_pointer(cptr, fptr, [N1])
|
||||
do i = 1, N1
|
||||
fptr(i) = i*100 + 10000 * (dev2+3)
|
||||
end do
|
||||
end block
|
||||
end do
|
||||
end do
|
||||
|
||||
do dev = 0, ndev
|
||||
call omp_target_free (dev_cptr(dev), dev)
|
||||
end do
|
||||
end subroutine
|
||||
|
||||
|
||||
subroutine two
|
||||
integer(c_size_t), parameter :: N = 10, M = 30
|
||||
integer, target :: host_data(N,M)
|
||||
type(c_ptr) :: dev_cptr(0:ndev), cptr, tmp_cptr
|
||||
integer :: dev, dev2, i, j, r
|
||||
|
||||
do dev = 0, ndev
|
||||
dev_cptr(dev) = omp_target_alloc (N*M*sizeof_int, dev)
|
||||
if (.not. c_associated (dev_cptr(dev))) stop 21
|
||||
end do
|
||||
|
||||
do i = 1, M
|
||||
do j = 1, N
|
||||
host_data(j,i) = i*100 + j
|
||||
end do
|
||||
end do
|
||||
|
||||
! copy full array host -> all devices + check value + set per-device value
|
||||
do dev = 0, ndev
|
||||
r = omp_target_memcpy_rect (dev_cptr(dev), c_loc(host_data), sizeof_int, &
|
||||
num_dims=2_c_int, volume=[M, N], &
|
||||
dst_offsets=[0_sk, 0_sk], src_offsets=[0_sk, 0_sk], &
|
||||
dst_dimensions=[M, N], src_dimensions=[M,N], &
|
||||
dst_device_num=dev, src_device_num=omp_initial_device)
|
||||
if (r /= 0) stop 22
|
||||
cptr = dev_cptr(dev)
|
||||
!$omp target device(dev) is_device_ptr(cptr)
|
||||
block
|
||||
integer, pointer, contiguous :: fptr(:,:)
|
||||
call c_f_pointer(cptr, fptr, [N,M])
|
||||
do i = 1, M
|
||||
do j = 1, N
|
||||
if (fptr(j,i) /= i*100 + j) stop 23
|
||||
fptr(j,i) = i*100 + j + 1000 * dev
|
||||
end do
|
||||
end do
|
||||
end block
|
||||
end do
|
||||
|
||||
! Test strided data - forth and back - same array sizes
|
||||
do dev = 0, ndev
|
||||
do dev2 = 0, ndev
|
||||
tmp_cptr = omp_target_alloc (N*M*sizeof_int, dev)
|
||||
if (.not. c_associated (tmp_cptr)) stop 24
|
||||
|
||||
!$omp target device(dev) is_device_ptr(tmp_cptr)
|
||||
block
|
||||
integer, pointer, contiguous :: fptr(:,:)
|
||||
call c_f_pointer(tmp_cptr, fptr, [N,M])
|
||||
do i = 1, M
|
||||
do j = 1, N
|
||||
fptr(j,i) = i*100 + j + 100000 * (dev+1)
|
||||
end do
|
||||
end do
|
||||
end block
|
||||
|
||||
if (M-14 > M - max(5,2) &
|
||||
.or. N-3 > N - max(2,1)) stop 28
|
||||
r = omp_target_memcpy_rect (dev_cptr(dev2), tmp_cptr, sizeof_int, &
|
||||
num_dims=2_c_int, volume=[M-14, N-3], &
|
||||
dst_offsets=[5_sk, 3_sk], src_offsets=[2_sk, 1_sk], &
|
||||
dst_dimensions=[M, N], src_dimensions=[M,N], &
|
||||
dst_device_num=dev2, src_device_num=dev)
|
||||
if (r /= 0) stop 25
|
||||
|
||||
cptr = dev_cptr(dev2)
|
||||
!$omp target device(dev2) is_device_ptr(cptr)
|
||||
block
|
||||
logical :: checked(N,M)
|
||||
integer, pointer, contiguous :: fptr(:,:)
|
||||
call c_f_pointer(cptr, fptr, [N,M])
|
||||
checked = .false.
|
||||
do i = 1, M-14
|
||||
do j = 1, N-3
|
||||
if (fptr(j+3, i+5) /= (i+2)*100 + (j+1) + 100000 * (dev+1)) stop 26
|
||||
checked(j+3, i+5) = .true.
|
||||
end do
|
||||
end do
|
||||
! original device value
|
||||
do i = 1, M
|
||||
do j = 1, N
|
||||
if (.not. checked(j,i)) then
|
||||
if (fptr(j,i) /= i*100 + j + 1000 * dev2) stop 27
|
||||
end if
|
||||
end do
|
||||
end do
|
||||
end block
|
||||
call omp_target_free (tmp_cptr, dev)
|
||||
end do
|
||||
|
||||
! reset to original value
|
||||
do dev2 = 0, ndev
|
||||
cptr = dev_cptr(dev2)
|
||||
!$omp target device(dev2) is_device_ptr(cptr)
|
||||
block
|
||||
integer, pointer, contiguous :: fptr(:,:)
|
||||
call c_f_pointer(cptr, fptr, [N,M])
|
||||
do i = 1, M
|
||||
do j = 1, N
|
||||
fptr(j,i) = i*100 + j + 1000 * dev2
|
||||
end do
|
||||
end do
|
||||
end block
|
||||
end do
|
||||
end do
|
||||
|
||||
do dev = 0, ndev
|
||||
call omp_target_free (dev_cptr(dev), dev)
|
||||
end do
|
||||
end subroutine
|
||||
|
||||
|
||||
subroutine three
|
||||
integer(c_size_t), parameter :: N1 = 10, N2 = 30, N3 = 15
|
||||
integer, target :: host_data(N3,N2,N1)
|
||||
type(c_ptr) :: dev_cptr(0:ndev), cptr, tmp_cptr
|
||||
integer :: dev, dev2, i, j, k, r
|
||||
|
||||
do dev = 0, ndev
|
||||
dev_cptr(dev) = omp_target_alloc (N1*N2*N3*sizeof_int, dev)
|
||||
if (.not. c_associated (dev_cptr(dev))) stop 31
|
||||
end do
|
||||
|
||||
do i = 1, N1
|
||||
do j = 1, N2
|
||||
do k = 1, N3
|
||||
host_data(k, j,i) = i*1000 + 100*j + k
|
||||
end do
|
||||
end do
|
||||
end do
|
||||
|
||||
! copy full array host -> all devices + check value + set per-device value
|
||||
do dev = 0, ndev
|
||||
r = omp_target_memcpy_rect (dev_cptr(dev), c_loc(host_data), sizeof_int, &
|
||||
num_dims=3_c_int, volume=[N1, N2, N3], &
|
||||
dst_offsets=[0_sk, 0_sk, 0_sk], src_offsets=[0_sk, 0_sk, 0_sk], &
|
||||
dst_dimensions=[N1, N2, N3], src_dimensions=[N1, N2, N3], &
|
||||
dst_device_num=dev, src_device_num=omp_initial_device)
|
||||
if (r /= 0) stop 32
|
||||
cptr = dev_cptr(dev)
|
||||
!$omp target device(dev) is_device_ptr(cptr)
|
||||
block
|
||||
integer, pointer, contiguous :: fptr(:,:,:)
|
||||
call c_f_pointer(cptr, fptr, [N3,N2,N1])
|
||||
do i = 1, N1
|
||||
do j = 1, N2
|
||||
do k = 1, N3
|
||||
if (fptr(k, j,i) /= i*1000 + 100*j + k) stop 33
|
||||
fptr(k,j,i) = i*1000 + 100*j + k + 1000 * dev
|
||||
end do
|
||||
end do
|
||||
end do
|
||||
end block
|
||||
end do
|
||||
|
||||
! Test strided data - forth and back - same array sizes
|
||||
do dev = 0, ndev
|
||||
do dev2 = 0, ndev
|
||||
tmp_cptr = omp_target_alloc (N1*N2*N3*sizeof_int, dev)
|
||||
if (.not. c_associated (tmp_cptr)) stop 34
|
||||
|
||||
!$omp target device(dev) is_device_ptr(tmp_cptr)
|
||||
block
|
||||
integer, pointer, contiguous :: fptr(:,:,:)
|
||||
call c_f_pointer(tmp_cptr, fptr, [N3,N2,N1])
|
||||
do i = 1, N1
|
||||
do j = 1, N2
|
||||
do k = 1, N3
|
||||
fptr(k,j,i) = i*1000 + 100*j + k + 100000 * (dev+1)
|
||||
end do
|
||||
end do
|
||||
end do
|
||||
end block
|
||||
|
||||
if (N1-5 > N1 - max(5,2) &
|
||||
.or. N2-13 > N2 - max(3,1) &
|
||||
.or. N3-5 > N3 - max(2,4)) stop 38
|
||||
r = omp_target_memcpy_rect (dev_cptr(dev2), tmp_cptr, sizeof_int, &
|
||||
num_dims=3_c_int, volume=[N1-5, N2-13,N3-5], &
|
||||
dst_offsets=[5_sk, 3_sk,2_sk], src_offsets=[2_sk, 1_sk,4_sk], &
|
||||
dst_dimensions=[N1,N2,N3], src_dimensions=[N1,N2,N3], &
|
||||
dst_device_num=dev2, src_device_num=dev)
|
||||
if (r /= 0) stop 35
|
||||
|
||||
cptr = dev_cptr(dev2)
|
||||
!$omp target device(dev2) is_device_ptr(cptr)
|
||||
block
|
||||
logical :: checked(N3,N2,N1)
|
||||
integer, pointer, contiguous :: fptr(:,:,:)
|
||||
call c_f_pointer(cptr, fptr, [N3,N2,N1])
|
||||
checked = .false.
|
||||
do i = 1, N1-5
|
||||
do j = 1, N2-13
|
||||
do k = 1, N3-5
|
||||
if (fptr(k+2, j+3, i+5) /= (i+2)*1000 + 100*(j+1) + (k+4) + 100000 * (dev+1)) stop 36
|
||||
checked(k+2, j+3, i+5) = .true.
|
||||
end do
|
||||
end do
|
||||
end do
|
||||
! original device value
|
||||
do i = 1, N1
|
||||
do j = 1, N2
|
||||
do k = 1, N3
|
||||
if (.not. checked(k,j,i)) then
|
||||
if (fptr(k,j,i) /= i*1000 + 100*j + k + 1000 * dev2) stop 37
|
||||
end if
|
||||
end do
|
||||
end do
|
||||
end do
|
||||
end block
|
||||
call omp_target_free (tmp_cptr, dev)
|
||||
end do
|
||||
|
||||
! reset to original value
|
||||
do dev2 = 0, ndev
|
||||
cptr = dev_cptr(dev2)
|
||||
!$omp target device(dev2) is_device_ptr(cptr)
|
||||
block
|
||||
integer, pointer, contiguous :: fptr(:,:,:)
|
||||
call c_f_pointer(cptr, fptr, [N3,N2,N1])
|
||||
do i = 1, N1
|
||||
do j = 1, N2
|
||||
do k = 1, N3
|
||||
fptr(k,j,i) = i*1000 + 100*j + k + 1000 * dev2
|
||||
end do
|
||||
end do
|
||||
end do
|
||||
end block
|
||||
end do
|
||||
end do
|
||||
|
||||
do dev = 0, ndev
|
||||
call omp_target_free (dev_cptr(dev), dev)
|
||||
end do
|
||||
end subroutine
|
||||
|
||||
|
||||
subroutine four
|
||||
integer(c_size_t), parameter :: N1 = 10, N2 = 30, N3 = 15, N4 = 25
|
||||
integer, target :: host_data(N4, N3,N2,N1)
|
||||
type(c_ptr) :: dev_cptr(0:ndev), cptr, tmp_cptr
|
||||
integer :: dev, dev2, i, j, k, ll, r
|
||||
|
||||
do dev = 0, ndev
|
||||
dev_cptr(dev) = omp_target_alloc (N1*N2*N3*N4*sizeof_int, dev)
|
||||
if (.not. c_associated (dev_cptr(dev))) stop 41
|
||||
end do
|
||||
|
||||
do i = 1, N1
|
||||
do j = 1, N2
|
||||
do k = 1, N3
|
||||
do ll = 1, N4
|
||||
host_data(ll, k, j,i) = i*1000 + 100*j + k*10 + ll
|
||||
end do
|
||||
end do
|
||||
end do
|
||||
end do
|
||||
|
||||
! copy full array host -> all devices + check value + set per-device value
|
||||
do dev = 0, ndev
|
||||
r = omp_target_memcpy_rect (dev_cptr(dev), c_loc(host_data), sizeof_int, &
|
||||
num_dims=4_c_int, volume=[N1, N2, N3, N4], &
|
||||
dst_offsets=[0_sk, 0_sk, 0_sk, 0_sk], src_offsets=[0_sk, 0_sk, 0_sk, 0_sk], &
|
||||
dst_dimensions=[N1, N2, N3, N4], src_dimensions=[N1, N2, N3, N4], &
|
||||
dst_device_num=dev, src_device_num=omp_initial_device)
|
||||
if (r /= 0) stop 42
|
||||
cptr = dev_cptr(dev)
|
||||
!$omp target device(dev) is_device_ptr(cptr)
|
||||
block
|
||||
integer, pointer, contiguous :: fptr(:,:,:,:)
|
||||
call c_f_pointer(cptr, fptr, [N4,N3,N2,N1])
|
||||
do i = 1, N1
|
||||
do j = 1, N2
|
||||
do k = 1, N3
|
||||
do ll = 1, N4
|
||||
if (fptr(ll, k, j,i) /= i*1000 + 100*j + k*10 + ll) stop 43
|
||||
fptr(ll,k,j,i) = i*1000 + 100*j + k*10 + ll + 1000 * dev
|
||||
end do
|
||||
end do
|
||||
end do
|
||||
end do
|
||||
end block
|
||||
end do
|
||||
|
||||
! Test strided data - forth and back - same array sizes
|
||||
do dev = 0, ndev
|
||||
do dev2 = 0, ndev
|
||||
tmp_cptr = omp_target_alloc (N1*N2*N3*N4*sizeof_int, dev)
|
||||
if (.not. c_associated (tmp_cptr)) stop 44
|
||||
|
||||
!$omp target device(dev) is_device_ptr(tmp_cptr)
|
||||
block
|
||||
integer, pointer, contiguous :: fptr(:,:,:,:)
|
||||
call c_f_pointer(tmp_cptr, fptr, [N4,N3,N2,N1])
|
||||
do i = 1, N1
|
||||
do j = 1, N2
|
||||
do k = 1, N3
|
||||
do ll = 1, N4
|
||||
fptr(ll,k,j,i) = i*1000 + 100*j + k*10 + ll + 100000 * (dev+1)
|
||||
end do
|
||||
end do
|
||||
end do
|
||||
end do
|
||||
end block
|
||||
|
||||
if (N1-5 > N1 - max(5,2) &
|
||||
.or. N2-13 > N2 - max(3,1) &
|
||||
.or. N3-5 > N3 - max(2,4) &
|
||||
.or. N4-11 > N4 - max(7,5)) stop 48
|
||||
r = omp_target_memcpy_rect (dev_cptr(dev2), tmp_cptr, sizeof_int, &
|
||||
num_dims=4_c_int, volume=[N1-5, N2-13,N3-5,N4-11], &
|
||||
dst_offsets=[5_sk, 3_sk,2_sk,7_sk], src_offsets=[2_sk, 1_sk,4_sk,5_sk], &
|
||||
dst_dimensions=[N1,N2,N3,N4], src_dimensions=[N1,N2,N3,N4], &
|
||||
dst_device_num=dev2, src_device_num=dev)
|
||||
if (r /= 0) stop 45
|
||||
|
||||
cptr = dev_cptr(dev2)
|
||||
!$omp target device(dev2) is_device_ptr(cptr)
|
||||
block
|
||||
logical, allocatable :: checked(:,:,:,:) ! allocatble to reduce stack size
|
||||
integer, pointer, contiguous :: fptr(:,:,:,:)
|
||||
call c_f_pointer(cptr, fptr, [N4,N3,N2,N1])
|
||||
allocate (checked(N4,N3,N2,N1), source=.false.)
|
||||
do i = 1, N1-5
|
||||
do j = 1, N2-13
|
||||
do k = 1, N3-5
|
||||
do ll = 1, N4-11
|
||||
if (fptr(ll+7, k+2, j+3, i+5) /= (i+2)*1000 + 100*(j+1) + (k+4)*10 + ll+5 + 100000 * (dev+1)) stop 46
|
||||
checked(ll+7, k+2, j+3, i+5) = .true.
|
||||
end do
|
||||
end do
|
||||
end do
|
||||
end do
|
||||
! original device value
|
||||
do i = 1, N1
|
||||
do j = 1, N2
|
||||
do k = 1, N3
|
||||
do ll = 1, N4
|
||||
if (.not. checked(ll,k,j,i)) then
|
||||
if (fptr(ll,k,j,i) /= i*1000 + 100*j + k*10 + ll + 1000 * dev2) stop 47
|
||||
end if
|
||||
end do
|
||||
end do
|
||||
end do
|
||||
end do
|
||||
deallocate (checked)
|
||||
end block
|
||||
call omp_target_free (tmp_cptr, dev)
|
||||
end do
|
||||
|
||||
! reset to original value
|
||||
do dev2 = 0, ndev
|
||||
cptr = dev_cptr(dev2)
|
||||
!$omp target device(dev2) is_device_ptr(cptr)
|
||||
block
|
||||
integer, pointer, contiguous :: fptr(:,:,:,:)
|
||||
call c_f_pointer(cptr, fptr, [N4,N3,N2,N1])
|
||||
do i = 1, N1
|
||||
do j = 1, N2
|
||||
do k = 1, N3
|
||||
do ll = 1, N4
|
||||
fptr(ll,k,j,i) = i*1000 + 100*j + k*10 + ll + 1000 * dev2
|
||||
end do
|
||||
end do
|
||||
end do
|
||||
end do
|
||||
end block
|
||||
end do
|
||||
end do
|
||||
|
||||
do dev = 0, ndev
|
||||
call omp_target_free (dev_cptr(dev), dev)
|
||||
end do
|
||||
end subroutine
|
||||
end program
|
Loading…
Add table
Reference in a new issue