From 25072a477a56a727b369bf9b20f4d18198ff5894 Mon Sep 17 00:00:00 2001 From: Tobias Burnus Date: Wed, 26 Jul 2023 16:22:35 +0200 Subject: [PATCH] OpenMP: Call cuMemcpy2D/cuMemcpy3D for nvptx for omp_target_memcpy_rect When copying a 2D or 3D rectangular memmory block, the performance is better when using CUDA's cuMemcpy2D/cuMemcpy3D instead of copying the data one by one. That's what this commit does. Additionally, it permits device-to-device copies, if neccessary using a temporary variable on the host. include/ChangeLog: * cuda/cuda.h (CUlimit): Add CUDA_ERROR_NOT_INITIALIZED, CUDA_ERROR_DEINITIALIZED, CUDA_ERROR_INVALID_HANDLE. (CUarray, CUmemorytype, CUDA_MEMCPY2D, CUDA_MEMCPY3D, CUDA_MEMCPY3D_PEER): New typdefs. (cuMemcpy2D, cuMemcpy2DAsync, cuMemcpy2DUnaligned, cuMemcpy3D, cuMemcpy3DAsync, cuMemcpy3DPeer, cuMemcpy3DPeerAsync): New prototypes. libgomp/ChangeLog: * libgomp-plugin.h (GOMP_OFFLOAD_memcpy2d, GOMP_OFFLOAD_memcpy3d): New prototypes. * libgomp.h (struct gomp_device_descr): Add memcpy2d_func and memcpy3d_func. * libgomp.texi (nvtpx): Document when cuMemcpy2D/cuMemcpy3D is used. * oacc-host.c (memcpy2d_func, .memcpy3d_func): Init with NULL. * plugin/cuda-lib.def (cuMemcpy2D, cuMemcpy2DUnaligned, cuMemcpy3D): Invoke via CUDA_ONE_CALL. * plugin/plugin-nvptx.c (GOMP_OFFLOAD_memcpy2d, GOMP_OFFLOAD_memcpy3d): New. * target.c (omp_target_memcpy_rect_worker): (omp_target_memcpy_rect_check, omp_target_memcpy_rect_copy): Permit all device-to-device copyies; invoke new plugins for 2D and 3D copying when available. (gomp_load_plugin_for_device): DLSYM the new plugin functions. * testsuite/libgomp.c/target-12.c: Fix dimension bug. * testsuite/libgomp.fortran/target-12.f90: Likewise. * testsuite/libgomp.fortran/target-memcpy-rect-1.f90: New test. --- include/cuda/cuda.h | 85 +++ libgomp/libgomp-plugin.h | 7 + libgomp/libgomp.h | 2 + libgomp/libgomp.texi | 5 + libgomp/oacc-host.c | 2 + libgomp/plugin/cuda-lib.def | 3 + libgomp/plugin/plugin-nvptx.c | 116 ++++ libgomp/target.c | 154 ++++- libgomp/testsuite/libgomp.c/target-12.c | 6 +- .../testsuite/libgomp.fortran/target-12.f90 | 6 +- .../libgomp.fortran/target-memcpy-rect-1.f90 | 531 ++++++++++++++++++ 11 files changed, 885 insertions(+), 32 deletions(-) create mode 100644 libgomp/testsuite/libgomp.fortran/target-memcpy-rect-1.f90 diff --git a/include/cuda/cuda.h b/include/cuda/cuda.h index 338626fb6dc..09c3c2b8dbe 100644 --- a/include/cuda/cuda.h +++ b/include/cuda/cuda.h @@ -47,6 +47,7 @@ typedef void *CUevent; typedef void *CUfunction; typedef void *CUlinkState; typedef void *CUmodule; +typedef void *CUarray; typedef size_t (*CUoccupancyB2DSize)(int); typedef void *CUstream; @@ -54,7 +55,10 @@ typedef enum { CUDA_SUCCESS = 0, CUDA_ERROR_INVALID_VALUE = 1, CUDA_ERROR_OUT_OF_MEMORY = 2, + CUDA_ERROR_NOT_INITIALIZED = 3, + CUDA_ERROR_DEINITIALIZED = 4, CUDA_ERROR_INVALID_CONTEXT = 201, + CUDA_ERROR_INVALID_HANDLE = 400, CUDA_ERROR_NOT_FOUND = 500, CUDA_ERROR_NOT_READY = 600, CUDA_ERROR_LAUNCH_FAILED = 719, @@ -126,6 +130,75 @@ typedef enum { CU_LIMIT_MALLOC_HEAP_SIZE = 0x02, } CUlimit; +typedef enum { + CU_MEMORYTYPE_HOST = 0x01, + CU_MEMORYTYPE_DEVICE = 0x02, + CU_MEMORYTYPE_ARRAY = 0x03, + CU_MEMORYTYPE_UNIFIED = 0x04 +} CUmemorytype; + +typedef struct { + size_t srcXInBytes, srcY; + CUmemorytype srcMemoryType; + const void *srcHost; + CUdeviceptr srcDevice; + CUarray srcArray; + size_t srcPitch; + + size_t dstXInBytes, dstY; + CUmemorytype dstMemoryType; + const void *dstHost; + CUdeviceptr dstDevice; + CUarray dstArray; + size_t dstPitch; + + size_t WidthInBytes, Height; +} CUDA_MEMCPY2D; + +typedef struct { + size_t srcXInBytes, srcY, srcZ; + size_t srcLOD; + CUmemorytype srcMemoryType; + const void *srcHost; + CUdeviceptr srcDevice; + CUarray srcArray; + void *dummy; + size_t srcPitch, srcHeight; + + size_t dstXInBytes, dstY, dstZ; + size_t dstLOD; + CUmemorytype dstMemoryType; + const void *dstHost; + CUdeviceptr dstDevice; + CUarray dstArray; + void *dummy2; + size_t dstPitch, dstHeight; + + size_t WidthInBytes, Height, Depth; +} CUDA_MEMCPY3D; + +typedef struct { + size_t srcXInBytes, srcY, srcZ; + size_t srcLOD; + CUmemorytype srcMemoryType; + const void *srcHost; + CUdeviceptr srcDevice; + CUarray srcArray; + CUcontext srcContext; + size_t srcPitch, srcHeight; + + size_t dstXInBytes, dstY, dstZ; + size_t dstLOD; + CUmemorytype dstMemoryType; + const void *dstHost; + CUdeviceptr dstDevice; + CUarray dstArray; + CUcontext dstContext; + size_t dstPitch, dstHeight; + + size_t WidthInBytes, Height, Depth; +} CUDA_MEMCPY3D_PEER; + #define cuCtxCreate cuCtxCreate_v2 CUresult cuCtxCreate (CUcontext *, unsigned, CUdevice); #define cuCtxDestroy cuCtxDestroy_v2 @@ -183,6 +256,18 @@ CUresult cuMemcpyDtoHAsync (void *, CUdeviceptr, size_t, CUstream); CUresult cuMemcpyHtoD (CUdeviceptr, const void *, size_t); #define cuMemcpyHtoDAsync cuMemcpyHtoDAsync_v2 CUresult cuMemcpyHtoDAsync (CUdeviceptr, const void *, size_t, CUstream); +#define cuMemcpy2D cuMemcpy2D_v2 +CUresult cuMemcpy2D (const CUDA_MEMCPY2D *); +#define cuMemcpy2DAsync cuMemcpy2DAsync_v2 +CUresult cuMemcpy2DAsync (const CUDA_MEMCPY2D *, CUstream); +#define cuMemcpy2DUnaligned cuMemcpy2DUnaligned_v2 +CUresult cuMemcpy2DUnaligned (const CUDA_MEMCPY2D *); +#define cuMemcpy3D cuMemcpy3D_v2 +CUresult cuMemcpy3D (const CUDA_MEMCPY3D *); +#define cuMemcpy3DAsync cuMemcpy3DAsync_v2 +CUresult cuMemcpy3DAsync (const CUDA_MEMCPY3D *, CUstream); +CUresult cuMemcpy3DPeer (const CUDA_MEMCPY3D_PEER *); +CUresult cuMemcpy3DPeerAsync (const CUDA_MEMCPY3D_PEER *, CUstream); #define cuMemFree cuMemFree_v2 CUresult cuMemFree (CUdeviceptr); CUresult cuMemFreeHost (void *); diff --git a/libgomp/libgomp-plugin.h b/libgomp/libgomp-plugin.h index 42ee3d6c7f9..dc993882c3b 100644 --- a/libgomp/libgomp-plugin.h +++ b/libgomp/libgomp-plugin.h @@ -139,6 +139,13 @@ extern bool GOMP_OFFLOAD_free (int, void *); extern bool GOMP_OFFLOAD_dev2host (int, void *, const void *, size_t); extern bool GOMP_OFFLOAD_host2dev (int, void *, const void *, size_t); extern bool GOMP_OFFLOAD_dev2dev (int, void *, const void *, size_t); +extern int GOMP_OFFLOAD_memcpy2d (int, int, size_t, size_t, + void*, size_t, size_t, size_t, + const void*, size_t, size_t, size_t); +extern int GOMP_OFFLOAD_memcpy3d (int, int, size_t, size_t, size_t, void *, + size_t, size_t, size_t, size_t, size_t, + const void *, size_t, size_t, size_t, size_t, + size_t); extern bool GOMP_OFFLOAD_can_run (void *); extern void GOMP_OFFLOAD_run (int, void *, void *, void **); extern void GOMP_OFFLOAD_async_run (int, void *, void *, void **, void *); diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h index 4d2bfab4b71..68f20651fbf 100644 --- a/libgomp/libgomp.h +++ b/libgomp/libgomp.h @@ -1388,6 +1388,8 @@ struct gomp_device_descr __typeof (GOMP_OFFLOAD_free) *free_func; __typeof (GOMP_OFFLOAD_dev2host) *dev2host_func; __typeof (GOMP_OFFLOAD_host2dev) *host2dev_func; + __typeof (GOMP_OFFLOAD_memcpy2d) *memcpy2d_func; + __typeof (GOMP_OFFLOAD_memcpy3d) *memcpy3d_func; __typeof (GOMP_OFFLOAD_dev2dev) *dev2dev_func; __typeof (GOMP_OFFLOAD_can_run) *can_run_func; __typeof (GOMP_OFFLOAD_run) *run_func; diff --git a/libgomp/libgomp.texi b/libgomp/libgomp.texi index 4ac01e977ec..073c181f627 100644 --- a/libgomp/libgomp.texi +++ b/libgomp/libgomp.texi @@ -5060,6 +5060,11 @@ The implementation remark: list of available devices (``host fallback''). @item The default per-warp stack size is 128 kiB; see also @code{-msoft-stack} in the GCC manual. +@item The OpenMP routines @code{omp_target_memcpy_rect} and + @code{omp_target_memcpy_rect_async} and the @code{target update} + directive for non-contiguous list items will use the 2D and 3D + memory-copy functions of the CUDA library. Higher dimensions will + call those functions in a loop and are therefore supported. @end itemize diff --git a/libgomp/oacc-host.c b/libgomp/oacc-host.c index b75e8bead39..5980d510838 100644 --- a/libgomp/oacc-host.c +++ b/libgomp/oacc-host.c @@ -281,6 +281,8 @@ static struct gomp_device_descr host_dispatch = .free_func = host_free, .dev2host_func = host_dev2host, .host2dev_func = host_host2dev, + .memcpy2d_func = NULL, + .memcpy3d_func = NULL, .run_func = host_run, .mem_map = { NULL }, diff --git a/libgomp/plugin/cuda-lib.def b/libgomp/plugin/cuda-lib.def index dff42d6a9ac..007c6e0f4df 100644 --- a/libgomp/plugin/cuda-lib.def +++ b/libgomp/plugin/cuda-lib.def @@ -36,6 +36,9 @@ CUDA_ONE_CALL (cuMemcpyDtoH) CUDA_ONE_CALL (cuMemcpyDtoHAsync) CUDA_ONE_CALL (cuMemcpyHtoD) CUDA_ONE_CALL (cuMemcpyHtoDAsync) +CUDA_ONE_CALL (cuMemcpy2D) +CUDA_ONE_CALL (cuMemcpy2DUnaligned) +CUDA_ONE_CALL (cuMemcpy3D) CUDA_ONE_CALL (cuMemFree) CUDA_ONE_CALL (cuMemFreeHost) CUDA_ONE_CALL (cuMemGetAddressRange) diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c index ffc8e2d79d1..9cdc55cac6b 100644 --- a/libgomp/plugin/plugin-nvptx.c +++ b/libgomp/plugin/plugin-nvptx.c @@ -1781,6 +1781,122 @@ GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n) return true; } +int +GOMP_OFFLOAD_memcpy2d (int dst_ord, int src_ord, size_t dim1_size, + size_t dim0_len, void *dst, size_t dst_offset1_size, + size_t dst_offset0_len, size_t dst_dim1_size, + const void *src, size_t src_offset1_size, + size_t src_offset0_len, size_t src_dim1_size) +{ + if (!nvptx_attach_host_thread_to_device (src_ord != -1 ? src_ord : dst_ord)) + return false; + + /* TODO: Consider using CU_MEMORYTYPE_UNIFIED if supported. */ + + CUDA_MEMCPY2D data; + data.WidthInBytes = dim1_size; + data.Height = dim0_len; + + if (dst_ord == -1) + { + data.dstMemoryType = CU_MEMORYTYPE_HOST; + data.dstHost = dst; + } + else + { + data.dstMemoryType = CU_MEMORYTYPE_DEVICE; + data.dstDevice = (CUdeviceptr) dst; + } + data.dstPitch = dst_dim1_size; + data.dstXInBytes = dst_offset1_size; + data.dstY = dst_offset0_len; + + if (src_ord == -1) + { + data.srcMemoryType = CU_MEMORYTYPE_HOST; + data.srcHost = src; + } + else + { + data.srcMemoryType = CU_MEMORYTYPE_DEVICE; + data.srcDevice = (CUdeviceptr) src; + } + data.srcPitch = src_dim1_size; + data.srcXInBytes = src_offset1_size; + data.srcY = src_offset0_len; + + CUresult res = CUDA_CALL_NOCHECK (cuMemcpy2D, &data); + if (res == CUDA_ERROR_INVALID_VALUE) + /* If pitch > CU_DEVICE_ATTRIBUTE_MAX_PITCH or for device-to-device + for (some) memory not allocated by cuMemAllocPitch, cuMemcpy2D fails + with an error; try the slower cuMemcpy2DUnaligned now. */ + CUDA_CALL (cuMemcpy2DUnaligned, &data); + else if (res != CUDA_SUCCESS) + { + GOMP_PLUGIN_error ("cuMemcpy2D error: %s", cuda_error (res)); + return false; + } + return true; +} + +int +GOMP_OFFLOAD_memcpy3d (int dst_ord, int src_ord, size_t dim2_size, + size_t dim1_len, size_t dim0_len, void *dst, + size_t dst_offset2_size, size_t dst_offset1_len, + size_t dst_offset0_len, size_t dst_dim2_size, + size_t dst_dim1_len, const void *src, + size_t src_offset2_size, size_t src_offset1_len, + size_t src_offset0_len, size_t src_dim2_size, + size_t src_dim1_len) +{ + if (!nvptx_attach_host_thread_to_device (src_ord != -1 ? src_ord : dst_ord)) + return false; + + /* TODO: Consider using CU_MEMORYTYPE_UNIFIED if supported. */ + + CUDA_MEMCPY3D data; + data.WidthInBytes = dim2_size; + data.Height = dim1_len; + data.Depth = dim0_len; + + if (dst_ord == -1) + { + data.dstMemoryType = CU_MEMORYTYPE_HOST; + data.dstHost = dst; + } + else + { + data.dstMemoryType = CU_MEMORYTYPE_DEVICE; + data.dstDevice = (CUdeviceptr) dst; + } + data.dstPitch = dst_dim2_size; + data.dstHeight = dst_dim1_len; + data.dstXInBytes = dst_offset2_size; + data.dstY = dst_offset1_len; + data.dstZ = dst_offset0_len; + data.dstLOD = 0; + + if (src_ord == -1) + { + data.srcMemoryType = CU_MEMORYTYPE_HOST; + data.srcHost = src; + } + else + { + data.srcMemoryType = CU_MEMORYTYPE_DEVICE; + data.srcDevice = (CUdeviceptr) src; + } + data.srcPitch = src_dim2_size; + data.srcHeight = src_dim1_len; + data.srcXInBytes = src_offset2_size; + data.srcY = src_offset1_len; + data.srcZ = src_offset0_len; + data.srcLOD = 0; + + CUDA_CALL (cuMemcpy3D, &data); + return true; +} + bool GOMP_OFFLOAD_openacc_async_host2dev (int ord, void *dst, const void *src, size_t n, struct goacc_asyncqueue *aq) diff --git a/libgomp/target.c b/libgomp/target.c index 80c25a16f1e..5cf2e8dce37 100644 --- a/libgomp/target.c +++ b/libgomp/target.c @@ -4526,7 +4526,8 @@ omp_target_memcpy_rect_worker (void *dst, const void *src, size_t element_size, const size_t *dst_dimensions, const size_t *src_dimensions, struct gomp_device_descr *dst_devicep, - struct gomp_device_descr *src_devicep) + struct gomp_device_descr *src_devicep, + size_t *tmp_size, void **tmp) { size_t dst_slice = element_size; size_t src_slice = element_size; @@ -4539,36 +4540,120 @@ omp_target_memcpy_rect_worker (void *dst, const void *src, size_t element_size, || __builtin_mul_overflow (element_size, dst_offsets[0], &dst_off) || __builtin_mul_overflow (element_size, src_offsets[0], &src_off)) return EINVAL; - if (dst_devicep == NULL && src_devicep == NULL) + if (src_devicep != NULL && src_devicep == dst_devicep) + ret = src_devicep->dev2dev_func (src_devicep->target_id, + (char *) dst + dst_off, + (const char *) src + src_off, + length); + else if (src_devicep != NULL + && (dst_devicep == NULL + || (dst_devicep->capabilities + & GOMP_OFFLOAD_CAP_SHARED_MEM))) + ret = src_devicep->dev2host_func (src_devicep->target_id, + (char *) dst + dst_off, + (const char *) src + src_off, + length); + else if (dst_devicep != NULL + && (src_devicep == NULL + || (src_devicep->capabilities + & GOMP_OFFLOAD_CAP_SHARED_MEM))) + ret = dst_devicep->host2dev_func (dst_devicep->target_id, + (char *) dst + dst_off, + (const char *) src + src_off, + length); + else if (dst_devicep == NULL && src_devicep == NULL) { memcpy ((char *) dst + dst_off, (const char *) src + src_off, length); ret = 1; } - else if (src_devicep == NULL) - ret = dst_devicep->host2dev_func (dst_devicep->target_id, - (char *) dst + dst_off, - (const char *) src + src_off, - length); - else if (dst_devicep == NULL) - ret = src_devicep->dev2host_func (src_devicep->target_id, - (char *) dst + dst_off, - (const char *) src + src_off, - length); else if (src_devicep == dst_devicep) ret = src_devicep->dev2dev_func (src_devicep->target_id, (char *) dst + dst_off, (const char *) src + src_off, length); else - ret = 0; + { + if (*tmp_size == 0) + { + *tmp_size = length; + *tmp = malloc (length); + if (*tmp == NULL) + return ENOMEM; + } + else if (*tmp_size < length) + { + *tmp_size = length; + *tmp = realloc (*tmp, length); + if (*tmp == NULL) + return ENOMEM; + } + ret = src_devicep->dev2host_func (src_devicep->target_id, *tmp, + (const char *) src + src_off, + length); + if (ret == 1) + ret = dst_devicep->host2dev_func (dst_devicep->target_id, + (char *) dst + dst_off, *tmp, + length); + } return ret ? 0 : EINVAL; } - /* FIXME: it would be nice to have some plugin function to handle - num_dims == 2 and num_dims == 3 more efficiently. Larger ones can - be handled in the generic recursion below, and for host-host it - should be used even for any num_dims >= 2. */ + /* host->device, device->host and same-device device->device. */ + if (num_dims == 2 + && ((src_devicep + && src_devicep == dst_devicep + && src_devicep->memcpy2d_func) + || (!src_devicep != !dst_devicep + && ((src_devicep && src_devicep->memcpy2d_func) + || (dst_devicep && dst_devicep->memcpy2d_func))))) + { + size_t vol_sz1, dst_sz1, src_sz1, dst_off_sz1, src_off_sz1; + int dst_id = dst_devicep ? dst_devicep->target_id : -1; + int src_id = src_devicep ? src_devicep->target_id : -1; + struct gomp_device_descr *devp = dst_devicep ? dst_devicep : src_devicep; + + if (__builtin_mul_overflow (volume[1], element_size, &vol_sz1) + || __builtin_mul_overflow (dst_dimensions[1], element_size, &dst_sz1) + || __builtin_mul_overflow (src_dimensions[1], element_size, &src_sz1) + || __builtin_mul_overflow (dst_offsets[1], element_size, &dst_off_sz1) + || __builtin_mul_overflow (src_offsets[1], element_size, + &src_off_sz1)) + return EINVAL; + ret = devp->memcpy2d_func (dst_id, src_id, vol_sz1, volume[0], + dst, dst_off_sz1, dst_offsets[0], dst_sz1, + src, src_off_sz1, src_offsets[0], src_sz1); + if (ret != -1) + return ret ? 0 : EINVAL; + } + else if (num_dims == 3 + && ((src_devicep + && src_devicep == dst_devicep + && src_devicep->memcpy3d_func) + || (!src_devicep != !dst_devicep + && ((src_devicep && src_devicep->memcpy3d_func) + || (dst_devicep && dst_devicep->memcpy3d_func))))) + { + size_t vol_sz2, dst_sz2, src_sz2, dst_off_sz2, src_off_sz2; + int dst_id = dst_devicep ? dst_devicep->target_id : -1; + int src_id = src_devicep ? src_devicep->target_id : -1; + struct gomp_device_descr *devp = dst_devicep ? dst_devicep : src_devicep; + + if (__builtin_mul_overflow (volume[2], element_size, &vol_sz2) + || __builtin_mul_overflow (dst_dimensions[2], element_size, &dst_sz2) + || __builtin_mul_overflow (src_dimensions[2], element_size, &src_sz2) + || __builtin_mul_overflow (dst_offsets[2], element_size, &dst_off_sz2) + || __builtin_mul_overflow (src_offsets[2], element_size, + &src_off_sz2)) + return EINVAL; + ret = devp->memcpy3d_func (dst_id, src_id, vol_sz2, volume[1], volume[0], + dst, dst_off_sz2, dst_offsets[1], + dst_offsets[0], dst_sz2, dst_dimensions[1], + src, src_off_sz2, src_offsets[1], + src_offsets[0], src_sz2, src_dimensions[1]); + if (ret != -1) + return ret ? 0 : EINVAL; + } for (i = 1; i < num_dims; i++) if (__builtin_mul_overflow (dst_slice, dst_dimensions[i], &dst_slice) @@ -4585,7 +4670,7 @@ omp_target_memcpy_rect_worker (void *dst, const void *src, size_t element_size, volume + 1, dst_offsets + 1, src_offsets + 1, dst_dimensions + 1, src_dimensions + 1, dst_devicep, - src_devicep); + src_devicep, tmp_size, tmp); if (ret) return ret; dst_off += dst_slice; @@ -4608,9 +4693,6 @@ omp_target_memcpy_rect_check (void *dst, const void *src, int dst_device_num, if (ret) return ret; - if (*src_devicep != NULL && *dst_devicep != NULL && *src_devicep != *dst_devicep) - return EINVAL; - return 0; } @@ -4624,18 +4706,36 @@ omp_target_memcpy_rect_copy (void *dst, const void *src, struct gomp_device_descr *dst_devicep, struct gomp_device_descr *src_devicep) { - if (src_devicep) + size_t tmp_size = 0; + void *tmp = NULL; + bool lock_src; + bool lock_dst; + + lock_src = (src_devicep + && (!dst_devicep + || src_devicep == dst_devicep + || !(src_devicep->capabilities + & GOMP_OFFLOAD_CAP_SHARED_MEM))); + lock_dst = (dst_devicep + && (!lock_src + || (src_devicep != dst_devicep + && !(dst_devicep->capabilities + & GOMP_OFFLOAD_CAP_SHARED_MEM)))); + if (lock_src) gomp_mutex_lock (&src_devicep->lock); - else if (dst_devicep) + if (lock_dst) gomp_mutex_lock (&dst_devicep->lock); int ret = omp_target_memcpy_rect_worker (dst, src, element_size, num_dims, volume, dst_offsets, src_offsets, dst_dimensions, src_dimensions, - dst_devicep, src_devicep); - if (src_devicep) + dst_devicep, src_devicep, + &tmp_size, &tmp); + if (lock_src) gomp_mutex_unlock (&src_devicep->lock); - else if (dst_devicep) + if (lock_dst) gomp_mutex_unlock (&dst_devicep->lock); + if (tmp) + free (tmp); return ret; } @@ -4976,6 +5076,8 @@ gomp_load_plugin_for_device (struct gomp_device_descr *device, DLSYM (free); DLSYM (dev2host); DLSYM (host2dev); + DLSYM (memcpy2d); + DLSYM (memcpy3d); device->capabilities = device->get_caps_func (); if (device->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400) { diff --git a/libgomp/testsuite/libgomp.c/target-12.c b/libgomp/testsuite/libgomp.c/target-12.c index b439e56577c..4fbfae19eda 100644 --- a/libgomp/testsuite/libgomp.c/target-12.c +++ b/libgomp/testsuite/libgomp.c/target-12.c @@ -80,12 +80,12 @@ main () src_offsets[2] = 1; src_offsets[1] = 0; src_offsets[0] = 3; - dst_dimensions[2] = 2; + dst_dimensions[2] = 3; dst_dimensions[1] = 3; dst_dimensions[0] = 6; src_dimensions[2] = 3; src_dimensions[1] = 4; - src_dimensions[0] = 6; + src_dimensions[0] = 9; if (omp_target_memcpy_rect (p, q, sizeof (int), 3, volume, dst_offsets, src_offsets, dst_dimensions, src_dimensions, d, id) != 0) @@ -98,7 +98,7 @@ main () for (j = 0; j < 6; j++) for (k = 0; k < 3; k++) for (l = 0; l < 2; l++) - if (q[j * 6 + k * 2 + l] != 3 * 12 + 4 + 1 + l + k * 3 + j * 12) + if (q[j * 9 + k * 3 + l] != 3 * 12 + 4 + 1 + l + k * 3 + j * 12) err = 1; } diff --git a/libgomp/testsuite/libgomp.fortran/target-12.f90 b/libgomp/testsuite/libgomp.fortran/target-12.f90 index 17c78f18f9b..eba4b62a18e 100644 --- a/libgomp/testsuite/libgomp.fortran/target-12.f90 +++ b/libgomp/testsuite/libgomp.fortran/target-12.f90 @@ -93,12 +93,12 @@ program main src_offsets(2) = 1 src_offsets(1) = 0 src_offsets(0) = 3 - dst_dimensions(2) = 2 + dst_dimensions(2) = 3 dst_dimensions(1) = 3 dst_dimensions(0) = 6 src_dimensions(2) = 3 src_dimensions(1) = 4 - src_dimensions(0) = 6 + src_dimensions(0) = 9 if (omp_target_memcpy_rect (p, c_loc (q), sizeof (q(0)), 3, volume, & dst_offsets, src_offsets, dst_dimensions, & @@ -112,7 +112,7 @@ program main do j = 0, 5 do k = 0, 2 do l = 0, 1 - if (q(j * 6 + k * 2 + l) /= 3 * 12 + 4 + 1 + l + k * 3 + j * 12) & + if (q(j * 9 + k * 3 + l) /= 3 * 12 + 4 + 1 + l + k * 3 + j * 12) & err = .true. end do end do diff --git a/libgomp/testsuite/libgomp.fortran/target-memcpy-rect-1.f90 b/libgomp/testsuite/libgomp.fortran/target-memcpy-rect-1.f90 new file mode 100644 index 00000000000..406c23b8408 --- /dev/null +++ b/libgomp/testsuite/libgomp.fortran/target-memcpy-rect-1.f90 @@ -0,0 +1,531 @@ +program main + use iso_c_binding + use omp_lib + implicit none (type, external) + + integer(c_size_t), parameter :: sizeof_int = 4 + integer, parameter :: sk = c_size_t + logical, allocatable :: isshared(:) + integer, allocatable :: maxdim(:,:) + integer :: ndev + + ndev = omp_get_num_devices() + call init_isshared + call init_maxdim + + call one + call two + call three + call four + + deallocate(isshared, maxdim) +contains + + subroutine init_maxdim + integer :: dev, dev2, r + integer(c_size_t), parameter :: nl = 0 + + allocate(maxdim(0:ndev,0:ndev)) + do dev = 0, ndev + do dev2 = 0, ndev + r = omp_target_memcpy_rect (c_null_ptr, c_null_ptr, nl, & + num_dims=1_c_int, volume=[nl], & + dst_offsets=[nl], src_offsets=[nl], & + dst_dimensions=[nl], src_dimensions=[nl], & + dst_device_num=dev, src_device_num=omp_initial_device) + if (r < 3) stop 1 ! OpenMP requirement + if (r < huge(0_c_int)) stop 2 ! GCC implementation + maxdim(dev2,dev) = r + end do + end do + end subroutine + + subroutine init_isshared + integer :: dev + logical :: dev_isshared + + allocate(isshared(0:ndev)) + do dev = 0, ndev + dev_isshared = .false. + !$omp target device(dev) map(to: dev_isshared) + dev_isshared = .true. + !$omp end target + isshared(dev) = dev_isshared + end do + end subroutine + + + subroutine one + integer(c_size_t), parameter :: N1 = 30 + integer, target :: host_data(N1) + type(c_ptr) :: dev_cptr(0:ndev), cptr, tmp_cptr + integer :: dev, dev2, i, r + + do dev = 0, ndev + dev_cptr(dev) = omp_target_alloc (N1*sizeof_int, dev) + if (.not. c_associated (dev_cptr(dev))) stop 11 + end do + + do i = 1, N1 + host_data(i) = i + end do + + ! copy full array host -> all devices + check value + set per-device value + do dev = 0, ndev + r = omp_target_memcpy_rect (dev_cptr(dev), c_loc(host_data), sizeof_int, & + num_dims=1_c_int, volume=[N1], & + dst_offsets=[0_sk], src_offsets=[0_sk], & + dst_dimensions=[N1], src_dimensions=[N1], & + dst_device_num=dev, src_device_num=omp_initial_device) + if (r /= 0) stop 12 + cptr = dev_cptr(dev) + !$omp target device(dev) is_device_ptr(cptr) + block + integer, pointer, contiguous :: fptr(:) + call c_f_pointer(cptr, fptr, [N1]) + do i = 1, N1 + if (fptr(i) /= i) stop 13 + fptr(i) = i*100 + 10000 * (dev+3) + end do + end block + end do + + ! Test strided data - forth and back - same array sizes + do dev = 0, ndev + do dev2 = 0, ndev + tmp_cptr = omp_target_alloc (N1*sizeof_int, dev) + if (.not. c_associated (tmp_cptr)) stop 14 + + !$omp target device(dev) is_device_ptr(tmp_cptr) + block + integer, pointer, contiguous :: fptr(:) + call c_f_pointer(tmp_cptr, fptr, [N1]) + do i = 1, N1 + fptr(i) = i*100 + 10000*(dev+1) + end do + end block + + if (N1-17 > N1 - max(12,13)) stop 18 + r = omp_target_memcpy_rect (dev_cptr(dev2), tmp_cptr, sizeof_int, & + num_dims=1_c_int, volume=[N1-17], & + dst_offsets=[12_sk], src_offsets=[13_sk], & + dst_dimensions=[N1], src_dimensions=[N1], & + dst_device_num=dev2, src_device_num=dev) + if (r /= 0) stop 15 + + cptr = dev_cptr(dev2) + !$omp target device(dev2) is_device_ptr(cptr) + block + logical :: checked(N1) + integer, pointer, contiguous :: fptr(:) + call c_f_pointer(cptr, fptr, [N1]) + checked = .false. + do i = 1, N1-17 + if (fptr(i+12) /= (i+13)*100 + 10000 * (dev+1)) stop 16 + checked(i+12) = .true. + end do + ! original device value + do i = 1, N1 + if (.not. checked(i)) then + if (fptr(i) /= i*100 + 10000 * (dev2+3)) stop 17 + end if + end do + end block + call omp_target_free (tmp_cptr, dev) + end do + + ! reset to original value + do dev2 = 0, ndev + cptr = dev_cptr(dev2) + !$omp target device(dev2) is_device_ptr(cptr) + block + integer, pointer, contiguous :: fptr(:) + call c_f_pointer(cptr, fptr, [N1]) + do i = 1, N1 + fptr(i) = i*100 + 10000 * (dev2+3) + end do + end block + end do + end do + + do dev = 0, ndev + call omp_target_free (dev_cptr(dev), dev) + end do + end subroutine + + + subroutine two + integer(c_size_t), parameter :: N = 10, M = 30 + integer, target :: host_data(N,M) + type(c_ptr) :: dev_cptr(0:ndev), cptr, tmp_cptr + integer :: dev, dev2, i, j, r + + do dev = 0, ndev + dev_cptr(dev) = omp_target_alloc (N*M*sizeof_int, dev) + if (.not. c_associated (dev_cptr(dev))) stop 21 + end do + + do i = 1, M + do j = 1, N + host_data(j,i) = i*100 + j + end do + end do + + ! copy full array host -> all devices + check value + set per-device value + do dev = 0, ndev + r = omp_target_memcpy_rect (dev_cptr(dev), c_loc(host_data), sizeof_int, & + num_dims=2_c_int, volume=[M, N], & + dst_offsets=[0_sk, 0_sk], src_offsets=[0_sk, 0_sk], & + dst_dimensions=[M, N], src_dimensions=[M,N], & + dst_device_num=dev, src_device_num=omp_initial_device) + if (r /= 0) stop 22 + cptr = dev_cptr(dev) + !$omp target device(dev) is_device_ptr(cptr) + block + integer, pointer, contiguous :: fptr(:,:) + call c_f_pointer(cptr, fptr, [N,M]) + do i = 1, M + do j = 1, N + if (fptr(j,i) /= i*100 + j) stop 23 + fptr(j,i) = i*100 + j + 1000 * dev + end do + end do + end block + end do + + ! Test strided data - forth and back - same array sizes + do dev = 0, ndev + do dev2 = 0, ndev + tmp_cptr = omp_target_alloc (N*M*sizeof_int, dev) + if (.not. c_associated (tmp_cptr)) stop 24 + + !$omp target device(dev) is_device_ptr(tmp_cptr) + block + integer, pointer, contiguous :: fptr(:,:) + call c_f_pointer(tmp_cptr, fptr, [N,M]) + do i = 1, M + do j = 1, N + fptr(j,i) = i*100 + j + 100000 * (dev+1) + end do + end do + end block + + if (M-14 > M - max(5,2) & + .or. N-3 > N - max(2,1)) stop 28 + r = omp_target_memcpy_rect (dev_cptr(dev2), tmp_cptr, sizeof_int, & + num_dims=2_c_int, volume=[M-14, N-3], & + dst_offsets=[5_sk, 3_sk], src_offsets=[2_sk, 1_sk], & + dst_dimensions=[M, N], src_dimensions=[M,N], & + dst_device_num=dev2, src_device_num=dev) + if (r /= 0) stop 25 + + cptr = dev_cptr(dev2) + !$omp target device(dev2) is_device_ptr(cptr) + block + logical :: checked(N,M) + integer, pointer, contiguous :: fptr(:,:) + call c_f_pointer(cptr, fptr, [N,M]) + checked = .false. + do i = 1, M-14 + do j = 1, N-3 + if (fptr(j+3, i+5) /= (i+2)*100 + (j+1) + 100000 * (dev+1)) stop 26 + checked(j+3, i+5) = .true. + end do + end do + ! original device value + do i = 1, M + do j = 1, N + if (.not. checked(j,i)) then + if (fptr(j,i) /= i*100 + j + 1000 * dev2) stop 27 + end if + end do + end do + end block + call omp_target_free (tmp_cptr, dev) + end do + + ! reset to original value + do dev2 = 0, ndev + cptr = dev_cptr(dev2) + !$omp target device(dev2) is_device_ptr(cptr) + block + integer, pointer, contiguous :: fptr(:,:) + call c_f_pointer(cptr, fptr, [N,M]) + do i = 1, M + do j = 1, N + fptr(j,i) = i*100 + j + 1000 * dev2 + end do + end do + end block + end do + end do + + do dev = 0, ndev + call omp_target_free (dev_cptr(dev), dev) + end do + end subroutine + + + subroutine three + integer(c_size_t), parameter :: N1 = 10, N2 = 30, N3 = 15 + integer, target :: host_data(N3,N2,N1) + type(c_ptr) :: dev_cptr(0:ndev), cptr, tmp_cptr + integer :: dev, dev2, i, j, k, r + + do dev = 0, ndev + dev_cptr(dev) = omp_target_alloc (N1*N2*N3*sizeof_int, dev) + if (.not. c_associated (dev_cptr(dev))) stop 31 + end do + + do i = 1, N1 + do j = 1, N2 + do k = 1, N3 + host_data(k, j,i) = i*1000 + 100*j + k + end do + end do + end do + + ! copy full array host -> all devices + check value + set per-device value + do dev = 0, ndev + r = omp_target_memcpy_rect (dev_cptr(dev), c_loc(host_data), sizeof_int, & + num_dims=3_c_int, volume=[N1, N2, N3], & + dst_offsets=[0_sk, 0_sk, 0_sk], src_offsets=[0_sk, 0_sk, 0_sk], & + dst_dimensions=[N1, N2, N3], src_dimensions=[N1, N2, N3], & + dst_device_num=dev, src_device_num=omp_initial_device) + if (r /= 0) stop 32 + cptr = dev_cptr(dev) + !$omp target device(dev) is_device_ptr(cptr) + block + integer, pointer, contiguous :: fptr(:,:,:) + call c_f_pointer(cptr, fptr, [N3,N2,N1]) + do i = 1, N1 + do j = 1, N2 + do k = 1, N3 + if (fptr(k, j,i) /= i*1000 + 100*j + k) stop 33 + fptr(k,j,i) = i*1000 + 100*j + k + 1000 * dev + end do + end do + end do + end block + end do + + ! Test strided data - forth and back - same array sizes + do dev = 0, ndev + do dev2 = 0, ndev + tmp_cptr = omp_target_alloc (N1*N2*N3*sizeof_int, dev) + if (.not. c_associated (tmp_cptr)) stop 34 + + !$omp target device(dev) is_device_ptr(tmp_cptr) + block + integer, pointer, contiguous :: fptr(:,:,:) + call c_f_pointer(tmp_cptr, fptr, [N3,N2,N1]) + do i = 1, N1 + do j = 1, N2 + do k = 1, N3 + fptr(k,j,i) = i*1000 + 100*j + k + 100000 * (dev+1) + end do + end do + end do + end block + + if (N1-5 > N1 - max(5,2) & + .or. N2-13 > N2 - max(3,1) & + .or. N3-5 > N3 - max(2,4)) stop 38 + r = omp_target_memcpy_rect (dev_cptr(dev2), tmp_cptr, sizeof_int, & + num_dims=3_c_int, volume=[N1-5, N2-13,N3-5], & + dst_offsets=[5_sk, 3_sk,2_sk], src_offsets=[2_sk, 1_sk,4_sk], & + dst_dimensions=[N1,N2,N3], src_dimensions=[N1,N2,N3], & + dst_device_num=dev2, src_device_num=dev) + if (r /= 0) stop 35 + + cptr = dev_cptr(dev2) + !$omp target device(dev2) is_device_ptr(cptr) + block + logical :: checked(N3,N2,N1) + integer, pointer, contiguous :: fptr(:,:,:) + call c_f_pointer(cptr, fptr, [N3,N2,N1]) + checked = .false. + do i = 1, N1-5 + do j = 1, N2-13 + do k = 1, N3-5 + if (fptr(k+2, j+3, i+5) /= (i+2)*1000 + 100*(j+1) + (k+4) + 100000 * (dev+1)) stop 36 + checked(k+2, j+3, i+5) = .true. + end do + end do + end do + ! original device value + do i = 1, N1 + do j = 1, N2 + do k = 1, N3 + if (.not. checked(k,j,i)) then + if (fptr(k,j,i) /= i*1000 + 100*j + k + 1000 * dev2) stop 37 + end if + end do + end do + end do + end block + call omp_target_free (tmp_cptr, dev) + end do + + ! reset to original value + do dev2 = 0, ndev + cptr = dev_cptr(dev2) + !$omp target device(dev2) is_device_ptr(cptr) + block + integer, pointer, contiguous :: fptr(:,:,:) + call c_f_pointer(cptr, fptr, [N3,N2,N1]) + do i = 1, N1 + do j = 1, N2 + do k = 1, N3 + fptr(k,j,i) = i*1000 + 100*j + k + 1000 * dev2 + end do + end do + end do + end block + end do + end do + + do dev = 0, ndev + call omp_target_free (dev_cptr(dev), dev) + end do + end subroutine + + + subroutine four + integer(c_size_t), parameter :: N1 = 10, N2 = 30, N3 = 15, N4 = 25 + integer, target :: host_data(N4, N3,N2,N1) + type(c_ptr) :: dev_cptr(0:ndev), cptr, tmp_cptr + integer :: dev, dev2, i, j, k, ll, r + + do dev = 0, ndev + dev_cptr(dev) = omp_target_alloc (N1*N2*N3*N4*sizeof_int, dev) + if (.not. c_associated (dev_cptr(dev))) stop 41 + end do + + do i = 1, N1 + do j = 1, N2 + do k = 1, N3 + do ll = 1, N4 + host_data(ll, k, j,i) = i*1000 + 100*j + k*10 + ll + end do + end do + end do + end do + + ! copy full array host -> all devices + check value + set per-device value + do dev = 0, ndev + r = omp_target_memcpy_rect (dev_cptr(dev), c_loc(host_data), sizeof_int, & + num_dims=4_c_int, volume=[N1, N2, N3, N4], & + dst_offsets=[0_sk, 0_sk, 0_sk, 0_sk], src_offsets=[0_sk, 0_sk, 0_sk, 0_sk], & + dst_dimensions=[N1, N2, N3, N4], src_dimensions=[N1, N2, N3, N4], & + dst_device_num=dev, src_device_num=omp_initial_device) + if (r /= 0) stop 42 + cptr = dev_cptr(dev) + !$omp target device(dev) is_device_ptr(cptr) + block + integer, pointer, contiguous :: fptr(:,:,:,:) + call c_f_pointer(cptr, fptr, [N4,N3,N2,N1]) + do i = 1, N1 + do j = 1, N2 + do k = 1, N3 + do ll = 1, N4 + if (fptr(ll, k, j,i) /= i*1000 + 100*j + k*10 + ll) stop 43 + fptr(ll,k,j,i) = i*1000 + 100*j + k*10 + ll + 1000 * dev + end do + end do + end do + end do + end block + end do + + ! Test strided data - forth and back - same array sizes + do dev = 0, ndev + do dev2 = 0, ndev + tmp_cptr = omp_target_alloc (N1*N2*N3*N4*sizeof_int, dev) + if (.not. c_associated (tmp_cptr)) stop 44 + + !$omp target device(dev) is_device_ptr(tmp_cptr) + block + integer, pointer, contiguous :: fptr(:,:,:,:) + call c_f_pointer(tmp_cptr, fptr, [N4,N3,N2,N1]) + do i = 1, N1 + do j = 1, N2 + do k = 1, N3 + do ll = 1, N4 + fptr(ll,k,j,i) = i*1000 + 100*j + k*10 + ll + 100000 * (dev+1) + end do + end do + end do + end do + end block + + if (N1-5 > N1 - max(5,2) & + .or. N2-13 > N2 - max(3,1) & + .or. N3-5 > N3 - max(2,4) & + .or. N4-11 > N4 - max(7,5)) stop 48 + r = omp_target_memcpy_rect (dev_cptr(dev2), tmp_cptr, sizeof_int, & + num_dims=4_c_int, volume=[N1-5, N2-13,N3-5,N4-11], & + dst_offsets=[5_sk, 3_sk,2_sk,7_sk], src_offsets=[2_sk, 1_sk,4_sk,5_sk], & + dst_dimensions=[N1,N2,N3,N4], src_dimensions=[N1,N2,N3,N4], & + dst_device_num=dev2, src_device_num=dev) + if (r /= 0) stop 45 + + cptr = dev_cptr(dev2) + !$omp target device(dev2) is_device_ptr(cptr) + block + logical, allocatable :: checked(:,:,:,:) ! allocatble to reduce stack size + integer, pointer, contiguous :: fptr(:,:,:,:) + call c_f_pointer(cptr, fptr, [N4,N3,N2,N1]) + allocate (checked(N4,N3,N2,N1), source=.false.) + do i = 1, N1-5 + do j = 1, N2-13 + do k = 1, N3-5 + do ll = 1, N4-11 + if (fptr(ll+7, k+2, j+3, i+5) /= (i+2)*1000 + 100*(j+1) + (k+4)*10 + ll+5 + 100000 * (dev+1)) stop 46 + checked(ll+7, k+2, j+3, i+5) = .true. + end do + end do + end do + end do + ! original device value + do i = 1, N1 + do j = 1, N2 + do k = 1, N3 + do ll = 1, N4 + if (.not. checked(ll,k,j,i)) then + if (fptr(ll,k,j,i) /= i*1000 + 100*j + k*10 + ll + 1000 * dev2) stop 47 + end if + end do + end do + end do + end do + deallocate (checked) + end block + call omp_target_free (tmp_cptr, dev) + end do + + ! reset to original value + do dev2 = 0, ndev + cptr = dev_cptr(dev2) + !$omp target device(dev2) is_device_ptr(cptr) + block + integer, pointer, contiguous :: fptr(:,:,:,:) + call c_f_pointer(cptr, fptr, [N4,N3,N2,N1]) + do i = 1, N1 + do j = 1, N2 + do k = 1, N3 + do ll = 1, N4 + fptr(ll,k,j,i) = i*1000 + 100*j + k*10 + ll + 1000 * dev2 + end do + end do + end do + end do + end block + end do + end do + + do dev = 0, ndev + call omp_target_free (dev_cptr(dev), dev) + end do + end subroutine +end program