diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h index 5803683afc4..8fc9379d1b3 100644 --- a/libgomp/libgomp.h +++ b/libgomp/libgomp.h @@ -1157,29 +1157,7 @@ struct target_var_desc { uintptr_t length; }; -struct target_mem_desc { - /* Reference count. */ - uintptr_t refcount; - /* All the splay nodes allocated together. */ - splay_tree_node array; - /* Start of the target region. */ - uintptr_t tgt_start; - /* End of the targer region. */ - uintptr_t tgt_end; - /* Handle to free. */ - void *to_free; - /* Previous target_mem_desc. */ - struct target_mem_desc *prev; - /* Number of items in following list. */ - size_t list_count; - - /* Corresponding target device descriptor. */ - struct gomp_device_descr *device_descr; - - /* List of target items to remove (or decrease refcount) - at the end of region. */ - struct target_var_desc list[]; -}; +struct target_mem_desc; /* Special value for refcount - mask to indicate existence of special values. Right now we allocate 3 bits. */ @@ -1273,6 +1251,58 @@ splay_compare (splay_tree_key x, splay_tree_key y) #include "splay-tree.h" +/* Reverse offload splay-tree handling (functions only). */ + +struct reverse_splay_tree_key_s { + /* Address of the device object. */ + uint64_t dev; + splay_tree_key k; +}; + +typedef struct reverse_splay_tree_node_s *reverse_splay_tree_node; +typedef struct reverse_splay_tree_s *reverse_splay_tree; +typedef struct reverse_splay_tree_key_s *reverse_splay_tree_key; + +static inline int +reverse_splay_compare (reverse_splay_tree_key x, reverse_splay_tree_key y) +{ + if (x->dev < y->dev) + return -1; + if (x->dev > y->dev) + return 1; + return 0; +} + +#define splay_tree_prefix reverse +#include "splay-tree.h" + +struct target_mem_desc { + /* Reference count. */ + uintptr_t refcount; + /* All the splay nodes allocated together. */ + splay_tree_node array; + /* Likewise for the reverse lookup device->host for reverse offload. */ + reverse_splay_tree_node rev_array; + /* Start of the target region. 
*/ + uintptr_t tgt_start; + /* End of the targer region. */ + uintptr_t tgt_end; + /* Handle to free. */ + void *to_free; + /* Previous target_mem_desc. */ + struct target_mem_desc *prev; + /* Number of items in following list. */ + size_t list_count; + + /* Corresponding target device descriptor. */ + struct gomp_device_descr *device_descr; + + /* List of target items to remove (or decrease refcount) + at the end of region. */ + struct target_var_desc list[]; +}; + + typedef struct acc_dispatch_t { /* Execute. */ @@ -1367,6 +1397,7 @@ struct gomp_device_descr /* Splay tree containing information about mapped memory regions. */ struct splay_tree_s mem_map; + struct reverse_splay_tree_s mem_map_rev; /* Mutex for the mutable data. */ gomp_mutex_t lock; diff --git a/libgomp/oacc-host.c b/libgomp/oacc-host.c index 4e3971ae1a9..77a24279e2f 100644 --- a/libgomp/oacc-host.c +++ b/libgomp/oacc-host.c @@ -284,6 +284,7 @@ static struct gomp_device_descr host_dispatch = .run_func = host_run, .mem_map = { NULL }, + .mem_map_rev = { NULL }, /* .lock initialized in goacc_host_init. */ .state = GOMP_DEVICE_UNINITIALIZED, diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c index e803f083591..4f4c25a90ba 100644 --- a/libgomp/plugin/plugin-nvptx.c +++ b/libgomp/plugin/plugin-nvptx.c @@ -1190,7 +1190,9 @@ GOMP_OFFLOAD_get_num_devices (unsigned int omp_requires_mask) devices were present. Unified-shared address: see comment in nvptx_open_device for CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING. 
*/ if (num_devices > 0 - && (omp_requires_mask & ~GOMP_REQUIRES_UNIFIED_ADDRESS) != 0) + && ((omp_requires_mask + & ~(GOMP_REQUIRES_UNIFIED_ADDRESS + | GOMP_REQUIRES_REVERSE_OFFLOAD)) != 0)) return -1; return num_devices; } diff --git a/libgomp/splay-tree.c b/libgomp/splay-tree.c index 2d8742fd19a..3809e90b68d 100644 --- a/libgomp/splay-tree.c +++ b/libgomp/splay-tree.c @@ -236,3 +236,25 @@ splay_tree_foreach (splay_tree sp, splay_tree_callback func, void *data) { splay_tree_foreach_internal (sp->root, func, data); } + +/* Like above, except when func returns != 0, stop early. */ + +static int +splay_tree_foreach_internal_lazy (splay_tree_node node, + splay_tree_callback_stop func, void *data) +{ + if (!node) + return 0; + if (func (&node->key, data)) + return 1; + if (splay_tree_foreach_internal_lazy (node->left, func, data)) + return 1; + /* Yeah, whatever. GCC can fix my tail recursion. */ + return splay_tree_foreach_internal_lazy (node->right, func, data); +} + +attribute_hidden void +splay_tree_foreach_lazy (splay_tree sp, splay_tree_callback_stop func, void *data) +{ + splay_tree_foreach_internal_lazy (sp->root, func, data); +} diff --git a/libgomp/splay-tree.h b/libgomp/splay-tree.h index 992381c4000..7dea0cc87f7 100644 --- a/libgomp/splay-tree.h +++ b/libgomp/splay-tree.h @@ -78,8 +78,12 @@ typedef struct splay_tree_key_s *splay_tree_key; splay_tree_name (splay_tree_prefix, splay_tree_remove) # define splay_tree_foreach \ splay_tree_name (splay_tree_prefix, splay_tree_foreach) +# define splay_tree_foreach_lazy \ + splay_tree_name (splay_tree_prefix, splay_tree_foreach_lazy) # define splay_tree_callback \ splay_tree_name (splay_tree_prefix, splay_tree_callback) +# define splay_tree_callback_stop \ + splay_tree_name (splay_tree_prefix, splay_tree_callback_stop) #endif #ifndef splay_tree_c @@ -99,11 +103,13 @@ struct splay_tree_s { }; typedef void (*splay_tree_callback) (splay_tree_key, void *); +typedef int (*splay_tree_callback_stop) (splay_tree_key, void 
*); extern splay_tree_key splay_tree_lookup (splay_tree, splay_tree_key); extern void splay_tree_insert (splay_tree, splay_tree_node); extern void splay_tree_remove (splay_tree, splay_tree_key); extern void splay_tree_foreach (splay_tree, splay_tree_callback, void *); +extern void splay_tree_foreach_lazy (splay_tree, splay_tree_callback_stop, void *); #else /* splay_tree_c */ # ifdef splay_tree_prefix # include "splay-tree.c" @@ -125,6 +131,8 @@ extern void splay_tree_foreach (splay_tree, splay_tree_callback, void *); # undef splay_tree_insert # undef splay_tree_remove # undef splay_tree_foreach +# undef splay_tree_foreach_lazy # undef splay_tree_callback +# undef splay_tree_callback_stop # undef splay_tree_prefix #endif diff --git a/libgomp/target.c b/libgomp/target.c index 06809dbc710..e38cc3b6f1c 100644 --- a/libgomp/target.c +++ b/libgomp/target.c @@ -45,6 +45,12 @@ #include "plugin-suffix.h" #endif +/* Define another splay tree instantiation - for reverse offload. */ +#define splay_tree_prefix reverse +#define splay_tree_c +#include "splay-tree.h" + + typedef uintptr_t *hash_entry_type; static inline void * htab_alloc (size_t size) { return gomp_malloc (size); } static inline void htab_free (void *ptr) { free (ptr); } @@ -200,6 +206,12 @@ gomp_map_lookup (splay_tree mem_map, splay_tree_key key) return splay_tree_lookup (mem_map, key); } +static inline reverse_splay_tree_key +gomp_map_lookup_rev (reverse_splay_tree mem_map_rev, reverse_splay_tree_key key) +{ + return reverse_splay_tree_lookup (mem_map_rev, key); +} + static inline splay_tree_key gomp_map_0len_lookup (splay_tree mem_map, splay_tree_key key) { @@ -1674,7 +1686,7 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep, - k->host_start), sizes[j], cbufp, false); } - } + } i = j - 1; break; case GOMP_MAP_FORCE_PRESENT: @@ -2205,11 +2217,16 @@ gomp_load_image_to_device (struct gomp_device_descr *devicep, unsigned version, /* Load image to device and get target addresses for the image. 
*/ struct addr_pair *target_table = NULL; + uint64_t *rev_target_fn_table = NULL; int i, num_target_entries; + /* With reverse offload, insert also target-host addresses. */ + bool rev_lookup = omp_requires_mask & GOMP_REQUIRES_REVERSE_OFFLOAD; + num_target_entries = devicep->load_image_func (devicep->target_id, version, - target_data, &target_table, NULL); + target_data, &target_table, + rev_lookup ? &rev_target_fn_table : NULL); if (num_target_entries != num_funcs + num_vars /* "+1" due to the additional ICV struct. */ @@ -2228,6 +2245,10 @@ gomp_load_image_to_device (struct gomp_device_descr *devicep, unsigned version, /* "+1" due to the additional ICV struct. */ tgt->array = gomp_malloc ((num_funcs + num_vars + 1) * sizeof (*tgt->array)); + if (rev_target_fn_table) + tgt->rev_array = gomp_malloc (num_funcs * sizeof (*tgt->rev_array)); + else + tgt->rev_array = NULL; tgt->refcount = REFCOUNT_INFINITY; tgt->tgt_start = 0; tgt->tgt_end = 0; @@ -2236,6 +2257,7 @@ gomp_load_image_to_device (struct gomp_device_descr *devicep, unsigned version, tgt->list_count = 0; tgt->device_descr = devicep; splay_tree_node array = tgt->array; + reverse_splay_tree_node rev_array = tgt->rev_array; for (i = 0; i < num_funcs; i++) { @@ -2250,6 +2272,17 @@ gomp_load_image_to_device (struct gomp_device_descr *devicep, unsigned version, array->left = NULL; array->right = NULL; splay_tree_insert (&devicep->mem_map, array); + if (rev_target_fn_table) + { + reverse_splay_tree_key k2 = &rev_array->key; + k2->dev = rev_target_fn_table[i]; + k2->k = k; + rev_array->left = NULL; + rev_array->right = NULL; + if (k2->dev != 0) + reverse_splay_tree_insert (&devicep->mem_map_rev, rev_array); + rev_array++; + } array++; } @@ -2369,6 +2402,15 @@ gomp_unload_image_from_device (struct gomp_device_descr *devicep, gomp_mutex_unlock (&devicep->lock); gomp_fatal ("image unload fail"); } + if (devicep->mem_map_rev.root) + { + /* Free reverse offload splay tree + data; 'tgt->rev_array' is the only + real 
allocation. */ + assert (node && node->tgt && node->tgt->rev_array); + assert (devicep->mem_map_rev.root->key.k->tgt == node->tgt); + free (node->tgt->rev_array); + devicep->mem_map_rev.root = NULL; + } /* Remove mappings from splay tree. */ int i; @@ -3114,23 +3156,549 @@ GOMP_target_ext (int device, void (*fn) (void *), size_t mapnum, } + +/* Reverse lookup (device addr -> host addr) for reverse offload. We avoid + keeping track of all variable handling - assuming that reverse offload occurs + ony very rarely. Downside is that the reverse search is slow. */ + +struct gomp_splay_tree_rev_lookup_data { + uintptr_t tgt_start; + uintptr_t tgt_end; + splay_tree_key key; +}; + +static int +gomp_splay_tree_rev_lookup (splay_tree_key key, void *d) +{ + struct gomp_splay_tree_rev_lookup_data *data; + data = (struct gomp_splay_tree_rev_lookup_data *)d; + uintptr_t tgt_start = key->tgt->tgt_start + key->tgt_offset; + + if (tgt_start > data->tgt_start || key->tgt->list_count == 0) + return 0; + + size_t j; + for (j = 0; j < key->tgt->list_count; j++) + if (key->tgt->list[j].key == key) + break; + assert (j < key->tgt->list_count); + uintptr_t tgt_end = tgt_start + key->tgt->list[j].length; + + if ((tgt_start == data->tgt_start && tgt_end == data->tgt_end) + || (tgt_end > data->tgt_start && tgt_start < data->tgt_end)) + { + data->key = key; + return 1; + } + return 0; +} + +static inline splay_tree_key +gomp_map_rev_lookup (splay_tree mem_map, uint64_t tgt_start, uint64_t tgt_end, + bool zero_len) +{ + struct gomp_splay_tree_rev_lookup_data data; + data.key = NULL; + data.tgt_start = tgt_start; + data.tgt_end = tgt_end; + + if (tgt_start != tgt_end) + { + splay_tree_foreach_lazy (mem_map, gomp_splay_tree_rev_lookup, &data); + return data.key; + } + + data.tgt_end++; + splay_tree_foreach_lazy (mem_map, gomp_splay_tree_rev_lookup, &data); + if (data.key != NULL || zero_len) + return data.key; + data.tgt_end--; + + data.tgt_start--; + splay_tree_foreach_lazy (mem_map, 
gomp_splay_tree_rev_lookup, &data); + return data.key; +} + +struct cpy_data +{ + uint64_t devaddr; + bool present, aligned; +}; + + +/* Search just mapped reverse-offload data; returns index if found, + otherwise >= n. */ + +static inline int +gomp_map_cdata_lookup_int (struct cpy_data *d, uint64_t *devaddrs, + unsigned short *kinds, uint64_t *sizes, size_t n, + uint64_t tgt_start, uint64_t tgt_end) +{ + const bool short_mapkind = true; + const int typemask = short_mapkind ? 0xff : 0x7; + size_t i; + for (i = 0; i < n; i++) + { + bool is_struct = ((get_kind (short_mapkind, kinds, i) & typemask) + == GOMP_MAP_STRUCT); + uint64_t dev_end; + if (!is_struct) + dev_end = d[i].devaddr + sizes[i]; + else + { + if (i + sizes[i] < n) + dev_end = d[i + sizes[i]].devaddr + sizes[i + sizes[i]]; + else + dev_end = devaddrs[i + sizes[i]] + sizes[i + sizes[i]]; + } + if ((d[i].devaddr == tgt_start && dev_end == tgt_end) + || (dev_end > tgt_start && d[i].devaddr < tgt_end)) + break; + if (is_struct) + i += sizes[i]; + } + return i; +} + +static inline int +gomp_map_cdata_lookup (struct cpy_data *d, uint64_t *devaddrs, + unsigned short *kinds, uint64_t *sizes, + size_t n, uint64_t tgt_start, uint64_t tgt_end, + bool zero_len) +{ + size_t i; + if (tgt_start != tgt_end) + return gomp_map_cdata_lookup_int (d, devaddrs, kinds, sizes, n, + tgt_start, tgt_end); + tgt_end++; + i = gomp_map_cdata_lookup_int (d, devaddrs, kinds, sizes, n, + tgt_start, tgt_end); + if (i < n || zero_len) + return i; + tgt_end--; + + tgt_start--; + return gomp_map_cdata_lookup_int (d, devaddrs, kinds, sizes, n, + tgt_start, tgt_end); +} + /* Handle reverse offload. This is called by the device plugins for a - reverse offload; it is not called if the outer target runs on the host. */ + reverse offload; it is not called if the outer target runs on the host. 
+ The mapping is simplified device-affecting constructs (except for target + with device(ancestor:1)) must not be encountered; in particular not + target (enter/exit) data. */ void -gomp_target_rev (uint64_t fn_ptr __attribute__ ((unused)), - uint64_t mapnum __attribute__ ((unused)), - uint64_t devaddrs_ptr __attribute__ ((unused)), - uint64_t sizes_ptr __attribute__ ((unused)), - uint64_t kinds_ptr __attribute__ ((unused)), - int dev_num __attribute__ ((unused)), - void (*dev_to_host_cpy) (void *, const void *, size_t, - void *) __attribute__ ((unused)), - void (*host_to_dev_cpy) (void *, const void *, size_t, - void *) __attribute__ ((unused)), - void *token __attribute__ ((unused))) +gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr, + uint64_t sizes_ptr, uint64_t kinds_ptr, int dev_num, + void (*dev_to_host_cpy) (void *, const void *, size_t, void*), + void (*host_to_dev_cpy) (void *, const void *, size_t, void*), + void *token) { - __builtin_unreachable (); + /* Return early if there is no offload code. */ + if (sizeof (OFFLOAD_PLUGINS) == sizeof ("")) + return; + /* Currently, this fails because of calculate_firstprivate_requirements + below; it could be fixed but additional code needs to be updated to + handle 32bit hosts - thus, it is not worthwhile. */ + if (sizeof (void *) != sizeof (uint64_t)) + gomp_fatal ("Reverse offload of 32bit hosts not supported."); + + struct cpy_data *cdata = NULL; + uint64_t *devaddrs; + uint64_t *sizes; + unsigned short *kinds; + const bool short_mapkind = true; + const int typemask = short_mapkind ? 
0xff : 0x7; + struct gomp_device_descr *devicep = resolve_device (dev_num, false); + + reverse_splay_tree_key n; + struct reverse_splay_tree_key_s k; + k.dev = fn_ptr; + + gomp_mutex_lock (&devicep->lock); + n = gomp_map_lookup_rev (&devicep->mem_map_rev, &k); + gomp_mutex_unlock (&devicep->lock); + + if (n == NULL) + gomp_fatal ("Cannot find reverse-offload function"); + void (*host_fn)() = (void (*)()) n->k->host_start; + + if (devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM) + { + devaddrs = (uint64_t *) (uintptr_t) devaddrs_ptr; + sizes = (uint64_t *) (uintptr_t) sizes_ptr; + kinds = (unsigned short *) (uintptr_t) kinds_ptr; + } + else + { + devaddrs = (uint64_t *) gomp_malloc (mapnum * sizeof (uint64_t)); + sizes = (uint64_t *) gomp_malloc (mapnum * sizeof (uint64_t)); + kinds = (unsigned short *) gomp_malloc (mapnum * sizeof (unsigned short)); + if (dev_to_host_cpy) + { + dev_to_host_cpy (devaddrs, (const void *) (uintptr_t) devaddrs_ptr, + mapnum * sizeof (uint64_t), token); + dev_to_host_cpy (sizes, (const void *) (uintptr_t) sizes_ptr, + mapnum * sizeof (uint64_t), token); + dev_to_host_cpy (kinds, (const void *) (uintptr_t) kinds_ptr, + mapnum * sizeof (unsigned short), token); + } + else + { + gomp_copy_dev2host (devicep, NULL, devaddrs, + (const void *) (uintptr_t) devaddrs_ptr, + mapnum * sizeof (uint64_t)); + gomp_copy_dev2host (devicep, NULL, sizes, + (const void *) (uintptr_t) sizes_ptr, + mapnum * sizeof (uint64_t)); + gomp_copy_dev2host (devicep, NULL, kinds, (const void *) (uintptr_t) kinds_ptr, + mapnum * sizeof (unsigned short)); + } + } + + size_t tgt_align = 0, tgt_size = 0; + + /* If actually executed on 32bit systems, the casts lead to wrong code; + but 32bit with offloading is not supported; see top of this function. 
*/ + calculate_firstprivate_requirements (mapnum, (void *) (uintptr_t) sizes, + (void *) (uintptr_t) kinds, + &tgt_align, &tgt_size); + + if (tgt_align) + { + char *tgt = gomp_alloca (tgt_size + tgt_align - 1); + uintptr_t al = (uintptr_t) tgt & (tgt_align - 1); + if (al) + tgt += tgt_align - al; + tgt_size = 0; + for (uint64_t i = 0; i < mapnum; i++) + if (get_kind (short_mapkind, kinds, i) == GOMP_MAP_FIRSTPRIVATE + && devaddrs[i] != 0) + { + size_t align = (size_t) 1 << (kinds[i] >> 8); + tgt_size = (tgt_size + align - 1) & ~(align - 1); + if (devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM) + memcpy (tgt + tgt_size, (void *) (uintptr_t) devaddrs[i], + (size_t) sizes[i]); + else if (dev_to_host_cpy) + dev_to_host_cpy (tgt + tgt_size, (void *) (uintptr_t) devaddrs[i], + (size_t) sizes[i], token); + else + gomp_copy_dev2host (devicep, NULL, tgt + tgt_size, + (void *) (uintptr_t) devaddrs[i], + (size_t) sizes[i]); + devaddrs[i] = (uint64_t) (uintptr_t) tgt + tgt_size; + tgt_size = tgt_size + sizes[i]; + if ((devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM) + && i + 1 < mapnum + && ((get_kind (short_mapkind, kinds, i) & typemask) + == GOMP_MAP_ATTACH)) + { + *(uint64_t*) (uintptr_t) (devaddrs[i+1] + sizes[i+1]) + = (uint64_t) devaddrs[i]; + ++i; + } + } + } + + if (!(devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)) + { + size_t j, struct_cpy = 0; + splay_tree_key n2; + cdata = gomp_alloca (sizeof (*cdata) * mapnum); + memset (cdata, '\0', sizeof (*cdata) * mapnum); + gomp_mutex_lock (&devicep->lock); + for (uint64_t i = 0; i < mapnum; i++) + { + if (devaddrs[i] == 0) + continue; + n = NULL; + int kind = get_kind (short_mapkind, kinds, i) & typemask; + switch (kind) + { + case GOMP_MAP_FIRSTPRIVATE: + case GOMP_MAP_FIRSTPRIVATE_INT: + continue; + + case GOMP_MAP_DELETE: + case GOMP_MAP_RELEASE: + case GOMP_MAP_DELETE_ZERO_LEN_ARRAY_SECTION: + /* Assume it is present; look it up - but ignore otherwise. 
*/ + case GOMP_MAP_ALLOC: + case GOMP_MAP_FROM: + case GOMP_MAP_FORCE_ALLOC: + case GOMP_MAP_FORCE_FROM: + case GOMP_MAP_ALWAYS_FROM: + case GOMP_MAP_TO: + case GOMP_MAP_TOFROM: + case GOMP_MAP_FORCE_TO: + case GOMP_MAP_FORCE_TOFROM: + case GOMP_MAP_ALWAYS_TO: + case GOMP_MAP_ALWAYS_TOFROM: + case GOMP_MAP_ZERO_LEN_ARRAY_SECTION: + cdata[i].devaddr = devaddrs[i]; + bool zero_len = (kind == GOMP_MAP_DELETE_ZERO_LEN_ARRAY_SECTION + || kind == GOMP_MAP_ZERO_LEN_ARRAY_SECTION); + j = gomp_map_cdata_lookup (cdata, devaddrs, kinds, sizes, i, + devaddrs[i], + devaddrs[i] + sizes[i], zero_len); + if (j < i) + { + n2 = NULL; + cdata[i].present = true; + devaddrs[i] = devaddrs[j] + devaddrs[i] - cdata[j].devaddr; + } + else + { + n2 = gomp_map_rev_lookup (&devicep->mem_map, + devaddrs[i], + devaddrs[i] + sizes[i], zero_len); + cdata[i].present = n2 != NULL; + } + if (!cdata[i].present + && kind != GOMP_MAP_DELETE + && kind != GOMP_MAP_RELEASE + && kind != GOMP_MAP_DELETE_ZERO_LEN_ARRAY_SECTION) + { + cdata[i].aligned = true; + size_t align = (size_t) 1 << (kinds[i] >> 8); + devaddrs[i] + = (uint64_t) (uintptr_t) gomp_aligned_alloc (align, + sizes[i]); + } + else if (n2 != NULL) + devaddrs[i] = (n2->host_start + cdata[i].devaddr + - (n2->tgt->tgt_start + n2->tgt_offset)); + if (((!cdata[i].present || struct_cpy) + && (kind == GOMP_MAP_TO || kind == GOMP_MAP_TOFROM)) + || kind == GOMP_MAP_FORCE_TO + || kind == GOMP_MAP_FORCE_TOFROM + || kind == GOMP_MAP_ALWAYS_TO + || kind == GOMP_MAP_ALWAYS_TOFROM) + { + if (dev_to_host_cpy) + dev_to_host_cpy ((void *) (uintptr_t) devaddrs[i], + (void *) (uintptr_t) cdata[i].devaddr, + sizes[i], token); + else + gomp_copy_dev2host (devicep, NULL, + (void *) (uintptr_t) devaddrs[i], + (void *) (uintptr_t) cdata[i].devaddr, + sizes[i]); + } + if (struct_cpy) + struct_cpy--; + break; + case GOMP_MAP_ATTACH: + case GOMP_MAP_POINTER: + case GOMP_MAP_ALWAYS_POINTER: + n2 = gomp_map_rev_lookup (&devicep->mem_map, + devaddrs[i] + sizes[i], + 
devaddrs[i] + sizes[i] + + sizeof (void*), false); + cdata[i].present = n2 != NULL; + cdata[i].devaddr = devaddrs[i]; + if (n2) + devaddrs[i] = (n2->host_start + cdata[i].devaddr + - (n2->tgt->tgt_start + n2->tgt_offset)); + else + { + j = gomp_map_cdata_lookup (cdata, devaddrs, kinds, sizes, i, + devaddrs[i] + sizes[i], + devaddrs[i] + sizes[i] + + sizeof (void*), false); + if (j < i) + { + cdata[i].present = true; + devaddrs[i] = (devaddrs[j] + devaddrs[i] + - cdata[j].devaddr); + } + } + if (!cdata[i].present) + devaddrs[i] = (uintptr_t) gomp_malloc (sizeof (void*)); + /* Assume that when present, the pointer is already correct. */ + if (!n2) + *(uint64_t *) (uintptr_t) (devaddrs[i] + sizes[i]) + = devaddrs[i-1]; + break; + case GOMP_MAP_TO_PSET: + /* Assume that when present, the pointers are fine and no 'to:' + is required. */ + n2 = gomp_map_rev_lookup (&devicep->mem_map, + devaddrs[i], devaddrs[i] + sizes[i], + false); + cdata[i].present = n2 != NULL; + cdata[i].devaddr = devaddrs[i]; + if (n2) + devaddrs[i] = (n2->host_start + cdata[i].devaddr + - (n2->tgt->tgt_start + n2->tgt_offset)); + else + { + j = gomp_map_cdata_lookup (cdata, devaddrs, kinds, sizes, i, + devaddrs[i], + devaddrs[i] + sizes[i], false); + if (j < i) + { + cdata[i].present = true; + devaddrs[i] = (devaddrs[j] + devaddrs[i] + - cdata[j].devaddr); + } + } + if (!cdata[i].present) + { + cdata[i].aligned = true; + size_t align = (size_t) 1 << (kinds[i] >> 8); + devaddrs[i] + = (uint64_t) (uintptr_t) gomp_aligned_alloc (align, + sizes[i]); + if (dev_to_host_cpy) + dev_to_host_cpy ((void *) (uintptr_t) devaddrs[i], + (void *) (uintptr_t) cdata[i].devaddr, + sizes[i], token); + else + gomp_copy_dev2host (devicep, NULL, + (void *) (uintptr_t) devaddrs[i], + (void *) (uintptr_t) cdata[i].devaddr, + sizes[i]); + } + for (j = i + 1; j < mapnum; j++) + { + kind = get_kind (short_mapkind, kinds, j) & typemask; + if (!GOMP_MAP_ALWAYS_POINTER_P (kind) + && !GOMP_MAP_POINTER_P (kind)) + break; + if 
(devaddrs[j] < devaddrs[i]) + break; + if (cdata[i].present) + continue; + if (devaddrs[j] == 0) + { + *(uint64_t *) (uintptr_t) (devaddrs[i] + sizes[j]) = 0; + continue; + } + int k; + n2 = NULL; + cdata[i].present = true; + cdata[j].devaddr = devaddrs[j]; + k = gomp_map_cdata_lookup (cdata, devaddrs, kinds, sizes, j, + devaddrs[j], + devaddrs[j] + sizeof (void*), + false); + if (k < j) + devaddrs[j] = (devaddrs[k] + devaddrs[j] + - cdata[k].devaddr); + else + { + n2 = gomp_map_rev_lookup (&devicep->mem_map, + devaddrs[j], + devaddrs[j] + sizeof (void*), + false); + if (n2 == NULL) + { + gomp_mutex_unlock (&devicep->lock); + gomp_fatal ("Pointer target wasn't mapped"); + } + devaddrs[j] = (n2->host_start + cdata[j].devaddr + - (n2->tgt->tgt_start + n2->tgt_offset)); + } + *(void **) (uintptr_t) (devaddrs[i] + sizes[j]) + = (void *) (uintptr_t) devaddrs[j]; + } + i = j -1; + break; + case GOMP_MAP_STRUCT: + n2 = gomp_map_rev_lookup (&devicep->mem_map, devaddrs[i+1], + devaddrs[i + sizes[i]] + + sizes[i + sizes[i]], false); + cdata[i].present = n2 != NULL; + cdata[i].devaddr = devaddrs[i]; + struct_cpy = cdata[i].present ? 
0 : sizes[i]; + if (!n2) + { + size_t sz = (size_t) (devaddrs[i + sizes[i]] + - devaddrs[i+1] + + sizes[i + sizes[i]]); + size_t align = (size_t) 1 << (kinds[i] >> 8); + cdata[i].aligned = true; + devaddrs[i] = (uintptr_t) gomp_aligned_alloc (align, sz); + devaddrs[i] -= devaddrs[i+1] - cdata[i].devaddr; + } + else + devaddrs[i] = (n2->host_start + cdata[i].devaddr + - (n2->tgt->tgt_start + n2->tgt_offset)); + break; + default: + gomp_mutex_unlock (&devicep->lock); + gomp_fatal ("gomp_target_rev unhandled kind 0x%.4x", kinds[i]); + } + } + gomp_mutex_unlock (&devicep->lock); + } + + host_fn (devaddrs); + + if (!(devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)) + { + uint64_t struct_cpy = 0; + bool clean_struct = false; + for (uint64_t i = 0; i < mapnum; i++) + { + if (cdata[i].devaddr == 0) + continue; + int kind = get_kind (short_mapkind, kinds, i) & typemask; + bool copy = !cdata[i].present || struct_cpy; + switch (kind) + { + case GOMP_MAP_FORCE_FROM: + case GOMP_MAP_FORCE_TOFROM: + case GOMP_MAP_ALWAYS_FROM: + case GOMP_MAP_ALWAYS_TOFROM: + copy = true; + /* FALLTHRU */ + case GOMP_MAP_FROM: + case GOMP_MAP_TOFROM: + if (copy && host_to_dev_cpy) + host_to_dev_cpy ((void *) (uintptr_t) cdata[i].devaddr, + (void *) (uintptr_t) devaddrs[i], + sizes[i], token); + else if (copy) + gomp_copy_host2dev (devicep, NULL, + (void *) (uintptr_t) cdata[i].devaddr, + (void *) (uintptr_t) devaddrs[i], + sizes[i], false, NULL); + default: + break; + } + if (struct_cpy) + { + struct_cpy--; + continue; + } + if (kind == GOMP_MAP_STRUCT && !cdata[i].present) + { + clean_struct = true; + struct_cpy = sizes[i]; + } + else if (cdata[i].aligned) + gomp_aligned_free ((void *) (uintptr_t) devaddrs[i]); + else if (!cdata[i].present) + free ((void *) (uintptr_t) devaddrs[i]); + } + if (clean_struct) + for (uint64_t i = 0; i < mapnum; i++) + if (!cdata[i].present + && ((get_kind (short_mapkind, kinds, i) & typemask) + == GOMP_MAP_STRUCT)) + { + devaddrs[i] += cdata[i+1].devaddr - 
cdata[i].devaddr; + gomp_aligned_free ((void *) (uintptr_t) devaddrs[i]); + } + + free (devaddrs); + free (sizes); + free (kinds); + } } /* Host fallback for GOMP_target_data{,_ext} routines. */ @@ -4510,6 +5078,7 @@ gomp_target_init (void) /* current_device.capabilities has already been set. */ current_device.type = current_device.get_type_func (); current_device.mem_map.root = NULL; + current_device.mem_map_rev.root = NULL; current_device.state = GOMP_DEVICE_UNINITIALIZED; for (i = 0; i < new_num_devs; i++) { diff --git a/libgomp/testsuite/libgomp.fortran/reverse-offload-2.f90 b/libgomp/testsuite/libgomp.fortran/reverse-offload-2.f90 new file mode 100644 index 00000000000..067639bc67e --- /dev/null +++ b/libgomp/testsuite/libgomp.fortran/reverse-offload-2.f90 @@ -0,0 +1,72 @@ +! { dg-additional-options "-foffload-options=nvptx-none=-misa=sm_35" { target { offload_target_nvptx } } } + +implicit none +!$omp requires reverse_offload +integer :: A(50), A2(50) +integer :: i, error +logical :: shared_mem + +error = 0 +shared_mem = .false. +A = [(i, i=1,50)] +A2 = [(i, i=1,50)] + +!$omp target map(to: shared_mem) + shared_mem = .true. +!$omp end target + +!$omp target map(to: A(20:40), A2(20:40)) map(from: error) +block + integer :: B(10), C(10) + B = 99 + C = 88 + A(20:40) = -A(20:40) + A2(20:40) = -A2(20:40) + + !$omp target device (ancestor:1) & + !$omp& map(to: A(25:35)) map(always, to: A2(25:35)) & + !$omp& map(from:B(4:8)) map(tofrom:C(4:8)) + if (shared_mem) then + if (any (A(25:35) /= [(-i,i=25,35)])) stop 20 + else + if (any (A(25:35) /= [( i,i=25,35)])) stop 21 + end if + if (any (A2(25:35) /= [(-i,i=25,35)])) stop 22 + if (any (C(4:8) /= 88)) stop 23 + + A(25:35) = -A(25:35)*10 + A2(25:35) = -A2(25:35)*10 + B(4:8) = [4,5,6,7,8] + C(4:8) = [-4,-5,-6,-7,-8] + !$omp end target + + if (any (B(1:3) /= 99) .or. any (B(9:10) /= 99)) then + error = 30 + elseif (any (B(4:8) /= [4,5,6,7,8])) then + error = 31 + elseif (any (C(1:3) /= 88) .or. 
any (C(9:10) /= 88)) then + error = 32 + elseif (any (C(4:8) /= [-4,-5,-6,-7,-8])) then + error = 33 + else + error = 0 + endif +end block + +if (error /= 0) stop error + +if (shared_mem) then + if (any (A(1:19) /= [( i, i=1,19)])) stop 1 + if (any (A(20:24) /= [(-i, i=20,24)])) stop 2 + if (any (A(36:40) /= [(-i, i=36,40)])) stop 3 + if (any (A(41:50) /= [( i, i=41,50)])) stop 4 + + if (any (A(25:35) /= [( 10*i, i=25,35)])) stop 5 +else + if (any (A(1:24) /= [( i, i=1,24)])) stop 6 + if (any (A(36:50) /= [( i, i=36,50)])) stop 7 + + if (any (A(25:35) /= [(-10*i, i=25,35)])) stop 8 +end if +if (any (A2(25:35) /= [( 10*i, i=25,35)])) stop 9 +end diff --git a/libgomp/testsuite/libgomp.fortran/reverse-offload-3.f90 b/libgomp/testsuite/libgomp.fortran/reverse-offload-3.f90 new file mode 100644 index 00000000000..2fd2f5b6589 --- /dev/null +++ b/libgomp/testsuite/libgomp.fortran/reverse-offload-3.f90 @@ -0,0 +1,68 @@ +! { dg-additional-options "-foffload-options=nvptx-none=-misa=sm_35" { target { offload_target_nvptx } } } + +implicit none +!$omp requires reverse_offload +integer :: A(50), A2(50), A3(50) +integer :: i +logical :: shared_mem + +shared_mem = .false. +A = [(3*i, i=1,50)] +A2 = [(7*i, i=1,50)] +A3 = [(11*i, i=1,50)] + +!$omp target map(to: shared_mem) + shared_mem = .true. 
+!$omp end target + +!$omp target map(to: A(20:40), A2(20:40), A3(20:40)) +block + integer :: C(10) + C = 88 + A(20:40) = -2*A(20:40) + A2(20:40) = -9*A2(20:40) + A3(20:40) = -13*A3(20:40) + + !$omp target device (ancestor:1) & + !$omp& map(from: A(25:35)) map(always, from: A2(25:35)) & + !$omp& map(alloc: A3(25:35)) map(alloc:C(4:8)) + if (shared_mem) then + if (any (A(25:35) /= [(-2*3*i, i=25,35)])) stop 1 + if (any (A2(25:35) /= [(-9*7*i, i=25,35)])) stop 2 + if (any (A3(25:35) /= [(-13*11*i, i=25,35)])) stop 3 + else + if (any (A(25:35) /= [(3*i, i=25,35)])) stop 4 + if (any (A2(25:35) /= [(7*i, i=25,35)])) stop 5 + if (any (A3(25:35) /= [(11*i, i=25,35)])) stop 6 + end if + + A(25:35) = A(25:35)*5 + A2(25:35) = A2(25:35)*8 + A3(25:35) = A3(25:35)*18 + C(4:8) = [4,5,6,7,8] + !$omp end target + + if (shared_mem) then + if (any (A(25:35) /= [(-2*3*5*i, i=25,35)])) stop 7 + if (any (A2(25:35) /= [(-9*7*8*i, i=25,35)])) stop 8 + if (any (A3(25:35) /= [(-13*11*18*i, i=25,35)])) stop 9 + if (any (C(4:8) /= [4,5,6,7,8])) stop 10 + else + if (any (A(25:35) /= [(-2*3*i, i=25,35)])) stop 11 + if (any (A2(25:35) /= [(7*8*i, i=25,35)])) stop 12 + if (any (A3(25:35) /= [(-13*11*i, i=25,35)])) stop 13 + if (any (C(4:8) /= 88)) stop 14 + end if +end block + +if (shared_mem) then + if (any (A(25:35) /= [(-2*3*5*i, i=25,35)])) stop + if (any (A2(25:35) /= [(-9*7**8*i, i=25,35)])) stop + if (any (A3(25:35) /= [(-13*11*18*i, i=25,35)])) stop +else + if (any (A(25:35) /= [(3*5*i, i=25,35)])) stop + if (any (A2(25:35) /= [(7*8*i, i=25,35)])) stop + if (any (A3(25:35) /= [(11*18*i, i=25,35)])) stop +end if + +end diff --git a/libgomp/testsuite/libgomp.fortran/reverse-offload-4.f90 b/libgomp/testsuite/libgomp.fortran/reverse-offload-4.f90 new file mode 100644 index 00000000000..fb27aa73eba --- /dev/null +++ b/libgomp/testsuite/libgomp.fortran/reverse-offload-4.f90 @@ -0,0 +1,129 @@ +! 
{ dg-additional-options "-foffload-options=nvptx-none=-misa=sm_35" { target { offload_target_nvptx } } }
+
+implicit none
+!$omp requires reverse_offload
+
+type t2
+  integer :: a, b, c
+end type t2
+
+type t
+  integer :: A(5), B(5), C(5)
+  integer, pointer :: P(:), P2 !Just some padding
+  type(t2) :: tt !Just some padding
+end type t
+
+type(t) :: S1, S2
+logical :: shared_mem
+
+shared_mem = .false.
+
+!$omp target map(to: shared_mem)
+  shared_mem = .true.
+!$omp end target
+
+s1%A = [1,2,3,4,5]
+s1%B = [10,20,30,40,50]
+s1%C = [11,22,33,44,55]
+s2%A = 2*s1%A
+s2%B = 2*s1%B
+s2%C = 2*s1%C
+
+!$omp target &
+!$omp&   map(to: s1%b, s1%c) &
+!$omp&   map(to: s2%b, s2%c)
+block
+  type(t) :: si1, si2, si3, si4
+
+  s1%B = -10 * s1%B
+  s1%C = -10 * s1%C
+  s2%B = -15 * s2%B
+  s2%C = -15 * s2%C
+
+  si1%A = -1 * [1,2,3,4,5]
+  si1%B = -1 * [10,20,30,40,50]
+  si1%C = -1 * [11,22,33,44,55]
+  si2%A = -23 * [1,2,3,4,5]
+  si2%B = -23 * [10,20,30,40,50]
+  si2%C = -23 * [11,22,33,44,55]
+
+  !$omp target device (ancestor:1) &
+  !$omp&   map(to: si1%C, si1%B) &
+  !$omp&   map(tofrom: si2%C, si2%B) &
+  !$omp&   map(always, to: s1%B) &
+  !$omp&   map( to: s2%B)
+    if (any (s1%A /= [1,2,3,4,5])) stop 1
+    if (any (s1%B /= -10 * [10,20,30,40,50])) stop 2
+    if (shared_mem) then
+      if (any (s1%C /= -10 * [11,22,33,44,55])) stop 4
+    else
+      if (any (s1%C /= [11,22,33,44,55])) stop 3
+    endif
+    if (any (s2%A /= 2 * [1,2,3,4,5])) stop 12
+    if (shared_mem) then
+      if (any (s2%B /= -15 * 2 * [10,20,30,40,50])) stop 5
+      if (any (s2%C /= -15 * 2 * [11,22,33,44,55])) stop 6
+    else
+      if (any (s2%B /= 2 * [10,20,30,40,50])) stop 7
+      if (any (s2%C /= 2 * [11,22,33,44,55])) stop 8
+    endif
+    if (any (si1%B /= -1 * [10,20,30,40,50])) stop 9
+    if (any (si1%C /= -1 * [11,22,33,44,55])) stop 10
+    if (any (si2%B /= -23 * [10,20,30,40,50])) stop 13
+    if (any (si2%C /= -23 * [11,22,33,44,55])) stop 11
+
+    s1%A = 5 * s1%A
+    s1%B = 7 * s1%B
+    s1%C = 13 * s1%C
+    s2%A = 9 * s2%A
+    s2%B = 21 * s2%B
+    s2%C = 31 * s2%C
+    si1%B = -11 * si1%B
+    si1%C = -13 * si1%C
+    si2%B = -27 * si2%B
+    si2%C = -29 * si2%C
+  !$omp end target
+
+  if (shared_mem) then
+    if (any (s1%B /= -10 * 7 * [10,20,30,40,50])) stop 20
+    if (any (s1%C /= -10 * 13 * [11,22,33,44,55])) stop 21
+  else
+    if (any (s1%B /= -10 * [10,20,30,40,50])) stop 22
+    if (any (s1%C /= -10 * [11,22,33,44,55])) stop 23
+  endif
+  if (shared_mem) then
+    if (any (s2%B /= -15 * 2 * 21 * [10,20,30,40,50])) stop 24
+    if (any (s2%C /= -15 * 2 * 31 * [11,22,33,44,55])) stop 25
+  else
+    if (any (s2%B /= -15 * 2 * [10,20,30,40,50])) stop 26
+    if (any (s2%C /= -15 * 2 * [11,22,33,44,55])) stop 27
+  endif
+  if (any (si1%A /= -1 * [1,2,3,4,5])) stop 28
+  if (shared_mem) then
+    if (any (si1%B /= -1 * (-11) * [10,20,30,40,50])) stop 29
+    if (any (si1%C /= -1 * (-13) * [11,22,33,44,55])) stop 30
+  else
+    if (any (si1%B /= -1 * [10,20,30,40,50])) stop 31
+    if (any (si1%C /= -1 * [11,22,33,44,55])) stop 32
+  endif
+  if (any (si2%A /= -23 * [1,2,3,4,5])) stop 33
+  if (any (si2%B /= -23 * (-27) * [10,20,30,40,50])) stop 34
+  if (any (si2%C /= -23 * (-29) * [11,22,33,44,55])) stop 35
+end block
+
+if (any (s1%A /= 5 * [1,2,3,4,5])) stop 40
+if (any (s1%B /= -10 * 7 * [10,20,30,40,50])) stop 41
+if (shared_mem) then
+  if (any (s1%C /= -10 * 13 * [11,22,33,44,55])) stop 42
+else
+  if (any (s1%C /= 13 * [11,22,33,44,55])) stop 43
+endif
+if (any (s2%A /= 2 * 9 * [1,2,3,4,5])) stop 44
+if (shared_mem) then
+  if (any (s2%B /= -15 * 2 * 21 * [10,20,30,40,50])) stop 45
+  if (any (s2%C /= -15 * 2 * 31 * [11,22,33,44,55])) stop 46
+else
+  if (any (s2%B /= 2 * 21 * [10,20,30,40,50])) stop 47
+  if (any (s2%C /= 2 * 31 * [11,22,33,44,55])) stop 48
+endif
+end
diff --git a/libgomp/testsuite/libgomp.fortran/reverse-offload-5.f90 b/libgomp/testsuite/libgomp.fortran/reverse-offload-5.f90
new file mode 100644
index 00000000000..ef7eb7bdd52
--- /dev/null
+++ b/libgomp/testsuite/libgomp.fortran/reverse-offload-5.f90
@@ -0,0 +1,100 @@
+! 
{ dg-additional-options "-foffload-options=nvptx-none=-misa=sm_35" { target { offload_target_nvptx } } }
+! { dg-xfail-run-if "Copying on-device allocated memory fails with cuMemcpyDtoHAsync error: invalid argument" { offload_device_nvptx } }
+
+! Because of the nvptx failure above, a variant that avoids device-side
+! allocation has been created: reverse-offload-5a.f90
+
+implicit none
+!$omp requires reverse_offload
+
+integer, allocatable :: A(:), A2(:), s1, s2
+integer :: i
+logical :: shared_mem
+
+shared_mem = .false.
+
+a = [1,2,3,4]
+a2 = [8,7,6,5]
+s1 = 532
+s2 = 55
+
+!$omp target map(to: shared_mem)
+  shared_mem = .true.
+!$omp end target
+
+!$omp target map(to: A, A2, s1, s2)
+block
+  integer, allocatable :: ai(:), ai2(:), si1, si2
+
+  a = a * 2
+  a2 = a2 * 3
+  s1 = s1 * 4
+  s2 = s2 * 5
+
+  ai = [23,35,86,43]
+  ai2 = [8,4,7,1]
+  si1 = 64
+  si2 = 765
+
+  !$omp target device (ancestor:1) &
+  !$omp&  map(to: A, s1, ai, si1) map(always, to: a2, s2) &
+  !$omp&  map(tofrom: ai2, si2)
+    if (shared_mem) then
+      if (any (a /= 2 * [1,2,3,4])) stop 1
+      if (s1 /= 4 * 532) stop 2
+    else
+      if (any (a /= [1,2,3,4])) stop 3
+      if (s1 /= 532) stop 4
+    endif
+    if (any (a2 /= 3 * [8,7,6,5])) stop 5
+    if (s2 /= 5 * 55) stop 6
+    if (any (ai /= [23,35,86,43])) stop 7
+    if (any (ai2 /= [8,4,7,1])) stop 8
+    if (si1 /= 64) stop 9
+    if (si2 /= 765) stop 10
+
+    a = a*3
+    a2 = a2*7
+    s1 = s1*11
+    s2 = s2*5
+    ai = ai*13
+    ai2 = ai2*21
+    si1 = si1*27
+    si2 = si2*31
+  !$omp end target
+
+  if (shared_mem) then
+    if (any (a /= 3 * 2 * [1,2,3,4])) stop 11
+    if (any (a2 /= 7 * 3 * [8,7,6,5])) stop 12
+    if (s1 /= 11 * 4 * 532) stop 13
+    if (s2 /= 5 * 5 * 55) stop 14
+    if (any (ai /= 13 * [23,35,86,43])) stop 15
+    if (si1 /= 27 * 64) stop 16
+  else
+    if (any (a /= 2 * [1,2,3,4])) stop 17
+    if (any (a2 /= 3 * [8,7,6,5])) stop 18
+    if (s1 /= 4 * 532) stop 19
+    if (s2 /= 5 * 55) stop 20
+    if (any (ai /= [23,35,86,43])) stop 22
+    if (si1 /= 64) stop 23
+  endif
+  if (any (ai2 /= 21 * [8,4,7,1])) stop 24
+  if (si2 /= 31 * 765) stop 25
+
+  deallocate (ai, ai2, si1, si2)
+end block
+
+if (shared_mem) then
+  if (any (a /= 3 * 2 * [1,2,3,4])) stop 30
+  if (any (a2 /= 7 * 3 * [8,7,6,5])) stop 31
+  if (s1 /= 11 * 4 * 532) stop 32
+  if (s2 /= 5 * 5 * 55) stop 33
+else
+  if (any (a /= 3 * [1,2,3,4])) stop 34
+  if (any (a2 /= 3 * 7 * [8,7,6,5])) stop 35
+  if (s1 /= 11 * 532) stop 36
+  if (s2 /= 5 * 5 * 55) stop 37
+endif
+
+deallocate (a, a2, s1, s2)
+end
diff --git a/libgomp/testsuite/libgomp.fortran/reverse-offload-5a.f90 b/libgomp/testsuite/libgomp.fortran/reverse-offload-5a.f90
new file mode 100644
index 00000000000..914d10d8144
--- /dev/null
+++ b/libgomp/testsuite/libgomp.fortran/reverse-offload-5a.f90
@@ -0,0 +1,98 @@
+! { dg-additional-options "-foffload-options=nvptx-none=-misa=sm_35" { target { offload_target_nvptx } } }
+
+! Copying on-device allocated memory fails with cuMemcpyDtoHAsync error: invalid argument
+! Hence, reverse-offload-5.f90 has been copied to *this* file,
+! which uses host-allocated variables; the device side only uses stack variables
+
+implicit none
+!$omp requires reverse_offload
+
+integer, allocatable :: A(:), A2(:), s1, s2
+integer :: i,D(4)
+logical :: shared_mem
+
+shared_mem = .false.
+
+a = [1,2,3,4]
+a2 = [8,7,6,5]
+s1 = 532
+s2 = 55
+
+!$omp target map(to: shared_mem)
+  shared_mem = .true.
+!$omp end target
+
+!$omp target map(to: A, A2, s1, s2)
+block
+  ! stack variables only - no device-side allocation:
+  integer :: ai(4), ai2(4), si1, si2
+
+  a = a * 2
+  a2 = a2 * 3
+  s1 = s1 * 4
+  s2 = s2 * 5
+
+  ai = [23,35,86,43]
+  ai2 = [8,4,7,1]
+  si1 = 64
+  si2 = 765
+
+  !$omp target device (ancestor:1) &
+  !$omp&  map(to: A, s1, ai, si1) map(always, to: a2, s2) &
+  !$omp&  map(tofrom: ai2, si2)
+    if (shared_mem) then
+      if (any (a /= 2 * [1,2,3,4])) stop 1
+      if (s1 /= 4 * 532) stop 2
+    else
+      if (any (a /= [1,2,3,4])) stop 3
+      if (s1 /= 532) stop 4
+    endif
+    if (any (a2 /= 3 * [8,7,6,5])) stop 5
+    if (s2 /= 5 * 55) stop 6
+    if (any (ai /= [23,35,86,43])) stop 7
+    if (any (ai2 /= [8,4,7,1])) stop 8
+    if (si1 /= 64) stop 9
+    if (si2 /= 765) stop 10
+
+    a = a*3
+    a2 = a2*7
+    s1 = s1*11
+    s2 = s2*5
+    ai = ai*13
+    ai2 = ai2*21
+    si1 = si1*27
+    si2 = si2*31
+  !$omp end target
+
+  if (shared_mem) then
+    if (any (a /= 3 * 2 * [1,2,3,4])) stop 11
+    if (any (a2 /= 7 * 3 * [8,7,6,5])) stop 12
+    if (s1 /= 11 * 4 * 532) stop 13
+    if (s2 /= 5 * 5 * 55) stop 14
+    if (any (ai /= 13 * [23,35,86,43])) stop 15
+    if (si1 /= 27 * 64) stop 16
+  else
+    if (any (a /= 2 * [1,2,3,4])) stop 17
+    if (any (a2 /= 3 * [8,7,6,5])) stop 18
+    if (s1 /= 4 * 532) stop 19
+    if (s2 /= 5 * 55) stop 20
+    if (any (ai /= [23,35,86,43])) stop 22
+    if (si1 /= 64) stop 23
+  endif
+  if (any (ai2 /= 21 * [8,4,7,1])) stop 24
+  if (si2 /= 31 * 765) stop 25
+end block
+if (shared_mem) then
+  if (any (a /= 3 * 2 * [1,2,3,4])) stop 30
+  if (any (a2 /= 7 * 3 * [8,7,6,5])) stop 31
+  if (s1 /= 11 * 4 * 532) stop 32
+  if (s2 /= 5 * 5 * 55) stop 33
+else
+  if (any (a /= 3 * [1,2,3,4])) stop 34
+  if (any (a2 /= 3 * 7 * [8,7,6,5])) stop 35
+  if (s1 /= 11 * 532) stop 36
+  if (s2 /= 5 * 5 * 55) stop 37
+endif
+
+deallocate (a, a2, s1, s2)
+end