libgomp: Handle OpenMP's reverse offloads

This commit enabled reverse offload for nvptx such that gomp_target_rev
actually gets called.  And it fills the latter function to do all of
the following: finding the host function to the device func ptr and
copying the arguments to the host, processing the mapping/firstprivate,
calling the host function, copying back the data and freeing as needed.

The data handling is made easier by assuming that all host variables
either existed before (and are in the mapping) or that those are
device variables not yet available on the host. Thus, the reverse
mapping can do without refcounts etc. Note that the spec disallows
inside a target region device-affecting constructs other than target
plus ancestor device-modifier and it also limits the clauses permitted
on this construct.

For the function addresses, an additional splay tree is used; for
the lookup of mapped variables, the existing splay-tree is used.
Unfortunately, its data structure requires a full walk of the tree;
Additionally, the just mapped variables are recorded in a separate
data structure, requiring an extra lookup. While the lookup is slow, assuming
that only few variables get mapped in each reverse offload construct
and that reverse offload is the exception and not performance critical,
this seems to be acceptable.

libgomp/ChangeLog:

	* libgomp.h (struct target_mem_desc): Predeclare; move
	below after 'reverse_splay_tree_node' and add rev_array
	member.
	(struct reverse_splay_tree_key_s, reverse_splay_compare): New.
	(reverse_splay_tree_node, reverse_splay_tree,
	reverse_splay_tree_key): New typedef.
	(struct gomp_device_descr): Add mem_map_rev member.
	* oacc-host.c (host_dispatch): NULL init .mem_map_rev.
	* plugin/plugin-nvptx.c (GOMP_OFFLOAD_get_num_devices): Claim
	support for GOMP_REQUIRES_REVERSE_OFFLOAD.
	* splay-tree.h (splay_tree_callback_stop): New typedef; like
	splay_tree_callback but returning int not void.
	(splay_tree_foreach_lazy): Define; like splay_tree_foreach but
	taking splay_tree_callback_stop as argument.
	* splay-tree.c (splay_tree_foreach_internal_lazy,
	splay_tree_foreach_lazy): New; but early exit if callback returns
	nonzero.
	* target.c: Instantiate splay_tree_c with splay_tree_prefix 'reverse'.
	(gomp_map_lookup_rev): New.
	(gomp_load_image_to_device): Handle reverse-offload function
	lookup table.
	(gomp_unload_image_from_device): Free devicep->mem_map_rev.
	(struct gomp_splay_tree_rev_lookup_data, gomp_splay_tree_rev_lookup,
	gomp_map_rev_lookup, struct cpy_data, gomp_map_cdata_lookup_int,
	gomp_map_cdata_lookup): New auxiliary structs and functions for
	gomp_target_rev.
	(gomp_target_rev): Implement reverse offloading and its mapping.
	(gomp_target_init): Init current_device.mem_map_rev.root.
	* testsuite/libgomp.fortran/reverse-offload-2.f90: New test.
	* testsuite/libgomp.fortran/reverse-offload-3.f90: New test.
	* testsuite/libgomp.fortran/reverse-offload-4.f90: New test.
	* testsuite/libgomp.fortran/reverse-offload-5.f90: New test.
	* testsuite/libgomp.fortran/reverse-offload-5a.f90: New test without
	mapping of on-device allocated variables.
This commit is contained in:
Tobias Burnus 2022-12-10 13:42:08 +01:00
parent 68ee8a64ac
commit ea4b23d9c8
11 changed files with 1139 additions and 39 deletions

View file

@ -1157,29 +1157,7 @@ struct target_var_desc {
uintptr_t length;
};
struct target_mem_desc {
/* Reference count. */
uintptr_t refcount;
/* All the splay nodes allocated together. */
splay_tree_node array;
/* Start of the target region. */
uintptr_t tgt_start;
/* End of the targer region. */
uintptr_t tgt_end;
/* Handle to free. */
void *to_free;
/* Previous target_mem_desc. */
struct target_mem_desc *prev;
/* Number of items in following list. */
size_t list_count;
/* Corresponding target device descriptor. */
struct gomp_device_descr *device_descr;
/* List of target items to remove (or decrease refcount)
at the end of region. */
struct target_var_desc list[];
};
struct target_mem_desc;
/* Special value for refcount - mask to indicate existence of special
values. Right now we allocate 3 bits. */
@ -1273,6 +1251,58 @@ splay_compare (splay_tree_key x, splay_tree_key y)
#include "splay-tree.h"
/* Reverse offload splay-tree handling (functions only). */
struct reverse_splay_tree_key_s {
  /* Address of the device object. */
  uint64_t dev;
  /* Host-side splay-tree key for the same object; for reverse-offload
     functions its host_start is the host function address.  */
  splay_tree_key k;
};

typedef struct reverse_splay_tree_node_s *reverse_splay_tree_node;
typedef struct reverse_splay_tree_s *reverse_splay_tree;
typedef struct reverse_splay_tree_key_s *reverse_splay_tree_key;

/* Total order for the reverse splay tree: compare by device address only.  */
static inline int
reverse_splay_compare (reverse_splay_tree_key x, reverse_splay_tree_key y)
{
  if (x->dev < y->dev)
    return -1;
  if (x->dev > y->dev)
    return 1;
  return 0;
}
#define splay_tree_prefix reverse
#include "splay-tree.h"
struct target_mem_desc {
  /* Reference count. */
  uintptr_t refcount;
  /* All the splay nodes allocated together. */
  splay_tree_node array;
  /* Likewise for the reverse lookup device->host for reverse offload. */
  reverse_splay_tree_node rev_array;
  /* Start of the target region. */
  uintptr_t tgt_start;
  /* End of the target region. */
  uintptr_t tgt_end;
  /* Handle to free. */
  void *to_free;
  /* Previous target_mem_desc. */
  struct target_mem_desc *prev;
  /* Number of items in following list. */
  size_t list_count;
  /* Corresponding target device descriptor. */
  struct gomp_device_descr *device_descr;
  /* List of target items to remove (or decrease refcount)
     at the end of region. */
  struct target_var_desc list[];
};
typedef struct acc_dispatch_t
{
/* Execute. */
@ -1367,6 +1397,7 @@ struct gomp_device_descr
/* Splay tree containing information about mapped memory regions. */
struct splay_tree_s mem_map;
struct reverse_splay_tree_s mem_map_rev;
/* Mutex for the mutable data. */
gomp_mutex_t lock;

View file

@ -284,6 +284,7 @@ static struct gomp_device_descr host_dispatch =
.run_func = host_run,
.mem_map = { NULL },
.mem_map_rev = { NULL },
/* .lock initialized in goacc_host_init. */
.state = GOMP_DEVICE_UNINITIALIZED,

View file

@ -1190,7 +1190,9 @@ GOMP_OFFLOAD_get_num_devices (unsigned int omp_requires_mask)
devices were present. Unified-shared address: see comment in
nvptx_open_device for CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING. */
if (num_devices > 0
&& (omp_requires_mask & ~GOMP_REQUIRES_UNIFIED_ADDRESS) != 0)
&& ((omp_requires_mask
& ~(GOMP_REQUIRES_UNIFIED_ADDRESS
| GOMP_REQUIRES_REVERSE_OFFLOAD)) != 0))
return -1;
return num_devices;
}

View file

@ -236,3 +236,25 @@ splay_tree_foreach (splay_tree sp, splay_tree_callback func, void *data)
{
splay_tree_foreach_internal (sp->root, func, data);
}
/* Like above, except when func returns != 0, stop early. */
/* In-order-ish walk over NODE's subtree, invoking FUNC on every key with
   DATA passed through.  Returns 1 as soon as FUNC returns nonzero (the
   walk stops early), otherwise 0 after visiting the whole subtree.  */
static int
splay_tree_foreach_internal_lazy (splay_tree_node node,
				  splay_tree_callback_stop func, void *data)
{
  /* Iterate down the right spine instead of tail-recursing; recurse
     only for the left subtrees.  */
  while (node != NULL)
    {
      if (func (&node->key, data) != 0)
	return 1;
      if (splay_tree_foreach_internal_lazy (node->left, func, data) != 0)
	return 1;
      node = node->right;
    }
  return 0;
}
/* Walk all nodes of SP, calling FUNC with each key and DATA; stop the
   walk early once FUNC returns nonzero.  */
attribute_hidden void
splay_tree_foreach_lazy (splay_tree sp, splay_tree_callback_stop func, void *data)
{
  splay_tree_foreach_internal_lazy (sp->root, func, data);
}

View file

@ -78,8 +78,12 @@ typedef struct splay_tree_key_s *splay_tree_key;
splay_tree_name (splay_tree_prefix, splay_tree_remove)
# define splay_tree_foreach \
splay_tree_name (splay_tree_prefix, splay_tree_foreach)
# define splay_tree_foreach_lazy \
splay_tree_name (splay_tree_prefix, splay_tree_foreach_lazy)
# define splay_tree_callback \
splay_tree_name (splay_tree_prefix, splay_tree_callback)
# define splay_tree_callback_stop \
splay_tree_name (splay_tree_prefix, splay_tree_callback_stop)
#endif
#ifndef splay_tree_c
@ -99,11 +103,13 @@ struct splay_tree_s {
};
typedef void (*splay_tree_callback) (splay_tree_key, void *);
typedef int (*splay_tree_callback_stop) (splay_tree_key, void *);
extern splay_tree_key splay_tree_lookup (splay_tree, splay_tree_key);
extern void splay_tree_insert (splay_tree, splay_tree_node);
extern void splay_tree_remove (splay_tree, splay_tree_key);
extern void splay_tree_foreach (splay_tree, splay_tree_callback, void *);
extern void splay_tree_foreach_lazy (splay_tree, splay_tree_callback_stop, void *);
#else /* splay_tree_c */
# ifdef splay_tree_prefix
# include "splay-tree.c"
@ -125,6 +131,8 @@ extern void splay_tree_foreach (splay_tree, splay_tree_callback, void *);
# undef splay_tree_insert
# undef splay_tree_remove
# undef splay_tree_foreach
# undef splay_tree_foreach_lazy
# undef splay_tree_callback
# undef splay_tree_callback_stop
# undef splay_tree_prefix
#endif

View file

@ -45,6 +45,12 @@
#include "plugin-suffix.h"
#endif
/* Define another splay tree instantiation - for reverse offload. */
#define splay_tree_prefix reverse
#define splay_tree_c
#include "splay-tree.h"
typedef uintptr_t *hash_entry_type;
/* Allocation hooks for the hash-table code: wrap gomp_malloc/free.  */
static inline void * htab_alloc (size_t size) { return gomp_malloc (size); }
static inline void htab_free (void *ptr) { free (ptr); }
@ -200,6 +206,12 @@ gomp_map_lookup (splay_tree mem_map, splay_tree_key key)
return splay_tree_lookup (mem_map, key);
}
/* Look up KEY (a device function address) in the device->host
   reverse-offload map MEM_MAP_REV; returns NULL if not found.  */
static inline reverse_splay_tree_key
gomp_map_lookup_rev (reverse_splay_tree mem_map_rev, reverse_splay_tree_key key)
{
  return reverse_splay_tree_lookup (mem_map_rev, key);
}
static inline splay_tree_key
gomp_map_0len_lookup (splay_tree mem_map, splay_tree_key key)
{
@ -1674,7 +1686,7 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep,
- k->host_start),
sizes[j], cbufp, false);
}
}
}
i = j - 1;
break;
case GOMP_MAP_FORCE_PRESENT:
@ -2205,11 +2217,16 @@ gomp_load_image_to_device (struct gomp_device_descr *devicep, unsigned version,
/* Load image to device and get target addresses for the image. */
struct addr_pair *target_table = NULL;
uint64_t *rev_target_fn_table = NULL;
int i, num_target_entries;
/* With reverse offload, insert also target-host addresses. */
bool rev_lookup = omp_requires_mask & GOMP_REQUIRES_REVERSE_OFFLOAD;
num_target_entries
= devicep->load_image_func (devicep->target_id, version,
target_data, &target_table, NULL);
target_data, &target_table,
rev_lookup ? &rev_target_fn_table : NULL);
if (num_target_entries != num_funcs + num_vars
/* "+1" due to the additional ICV struct. */
@ -2228,6 +2245,10 @@ gomp_load_image_to_device (struct gomp_device_descr *devicep, unsigned version,
/* "+1" due to the additional ICV struct. */
tgt->array = gomp_malloc ((num_funcs + num_vars + 1)
* sizeof (*tgt->array));
if (rev_target_fn_table)
tgt->rev_array = gomp_malloc (num_funcs * sizeof (*tgt->rev_array));
else
tgt->rev_array = NULL;
tgt->refcount = REFCOUNT_INFINITY;
tgt->tgt_start = 0;
tgt->tgt_end = 0;
@ -2236,6 +2257,7 @@ gomp_load_image_to_device (struct gomp_device_descr *devicep, unsigned version,
tgt->list_count = 0;
tgt->device_descr = devicep;
splay_tree_node array = tgt->array;
reverse_splay_tree_node rev_array = tgt->rev_array;
for (i = 0; i < num_funcs; i++)
{
@ -2250,6 +2272,17 @@ gomp_load_image_to_device (struct gomp_device_descr *devicep, unsigned version,
array->left = NULL;
array->right = NULL;
splay_tree_insert (&devicep->mem_map, array);
if (rev_target_fn_table)
{
reverse_splay_tree_key k2 = &rev_array->key;
k2->dev = rev_target_fn_table[i];
k2->k = k;
rev_array->left = NULL;
rev_array->right = NULL;
if (k2->dev != 0)
reverse_splay_tree_insert (&devicep->mem_map_rev, rev_array);
rev_array++;
}
array++;
}
@ -2369,6 +2402,15 @@ gomp_unload_image_from_device (struct gomp_device_descr *devicep,
gomp_mutex_unlock (&devicep->lock);
gomp_fatal ("image unload fail");
}
if (devicep->mem_map_rev.root)
{
/* Free reverse offload splay tree + data; 'tgt->rev_array' is the only
real allocation. */
assert (node && node->tgt && node->tgt->rev_array);
assert (devicep->mem_map_rev.root->key.k->tgt == node->tgt);
free (node->tgt->rev_array);
devicep->mem_map_rev.root = NULL;
}
/* Remove mappings from splay tree. */
int i;
@ -3114,23 +3156,549 @@ GOMP_target_ext (int device, void (*fn) (void *), size_t mapnum,
}
/* Reverse lookup (device addr -> host addr) for reverse offload. We avoid
keeping track of all variable handling - assuming that reverse offload occurs
only very rarely. Downside is that the reverse search is slow. */
/* Search state for the full-tree reverse lookup below.  */
struct gomp_splay_tree_rev_lookup_data {
  /* Device-address range being searched for.  */
  uintptr_t tgt_start;
  uintptr_t tgt_end;
  /* Result: the matching/overlapping mapping entry, or NULL.  */
  splay_tree_key key;
};

/* Callback for splay_tree_foreach_lazy: record KEY in D and return 1
   (stopping the walk) when KEY's device range equals or overlaps the
   searched range; return 0 to continue otherwise.  */
static int
gomp_splay_tree_rev_lookup (splay_tree_key key, void *d)
{
  struct gomp_splay_tree_rev_lookup_data *data;
  data = (struct gomp_splay_tree_rev_lookup_data *)d;
  uintptr_t tgt_start = key->tgt->tgt_start + key->tgt_offset;

  /* Entries starting beyond the searched start cannot match; likewise
     skip entries whose target_mem_desc has no var list.  */
  if (tgt_start > data->tgt_start || key->tgt->list_count == 0)
    return 0;

  /* Find this key in its tgt's var list to obtain the entry's length.  */
  size_t j;
  for (j = 0; j < key->tgt->list_count; j++)
    if (key->tgt->list[j].key == key)
      break;
  assert (j < key->tgt->list_count);
  uintptr_t tgt_end = tgt_start + key->tgt->list[j].length;

  /* Accept an exact range match or any overlap.  */
  if ((tgt_start == data->tgt_start && tgt_end == data->tgt_end)
      || (tgt_end > data->tgt_start && tgt_start < data->tgt_end))
    {
      data->key = key;
      return 1;
    }
  return 0;
}
/* Reverse lookup: find the host mapping entry whose device range matches
   or overlaps [TGT_START, TGT_END) by walking all of MEM_MAP; returns
   NULL if none.  A zero-sized range is retried as the one byte at
   TGT_START and - unless ZERO_LEN - as the one byte just before it.  */
static inline splay_tree_key
gomp_map_rev_lookup (splay_tree mem_map, uint64_t tgt_start, uint64_t tgt_end,
		     bool zero_len)
{
  struct gomp_splay_tree_rev_lookup_data data;
  data.key = NULL;
  data.tgt_start = tgt_start;
  data.tgt_end = tgt_end;

  if (tgt_start != tgt_end)
    {
      splay_tree_foreach_lazy (mem_map, gomp_splay_tree_rev_lookup, &data);
      return data.key;
    }

  /* Zero-sized range: first try the byte at tgt_start ...  */
  data.tgt_end++;
  splay_tree_foreach_lazy (mem_map, gomp_splay_tree_rev_lookup, &data);
  if (data.key != NULL || zero_len)
    return data.key;
  /* ... then the byte just before it.  */
  data.tgt_end--;
  data.tgt_start--;
  splay_tree_foreach_lazy (mem_map, gomp_splay_tree_rev_lookup, &data);
  return data.key;
}
/* Per-map-entry bookkeeping used by gomp_target_rev.  */
struct cpy_data
{
  /* Original device address of the mapping entry.  */
  uint64_t devaddr;
  /* present: host data already exists (previously mapped or mapped
     earlier in this construct).  aligned: the host buffer was obtained
     via gomp_aligned_alloc and must be released with gomp_aligned_free.  */
  bool present, aligned;
};
/* Search just mapped reverse-offload data; returns index if found,
otherwise >= n. */
/* Search the first N just-mapped reverse-offload entries in D for one
   whose device range equals or overlaps [TGT_START, TGT_END); returns
   its index, otherwise a value >= N.  */
static inline int
gomp_map_cdata_lookup_int (struct cpy_data *d, uint64_t *devaddrs,
			   unsigned short *kinds, uint64_t *sizes, size_t n,
			   uint64_t tgt_start, uint64_t tgt_end)
{
  const bool short_mapkind = true;
  const int typemask = short_mapkind ? 0xff : 0x7;
  size_t i;
  for (i = 0; i < n; i++)
    {
      bool is_struct = ((get_kind (short_mapkind, kinds, i) & typemask)
			== GOMP_MAP_STRUCT);
      uint64_t dev_end;
      if (!is_struct)
	dev_end = d[i].devaddr + sizes[i];
      else
	{
	  /* For GOMP_MAP_STRUCT, sizes[i] is the number of following
	     field entries; the struct ends with its last field.  */
	  if (i + sizes[i] < n)
	    dev_end = d[i + sizes[i]].devaddr + sizes[i + sizes[i]];
	  else
	    dev_end = devaddrs[i + sizes[i]] + sizes[i + sizes[i]];
	}
      /* Accept an exact range match or any overlap.  */
      if ((d[i].devaddr == tgt_start && dev_end == tgt_end)
	  || (dev_end > tgt_start && d[i].devaddr < tgt_end))
	break;
      /* Skip over a struct's field entries as one unit.  */
      if (is_struct)
	i += sizes[i];
    }
  return i;
}
/* Like gomp_map_cdata_lookup_int, but with the same zero-sized-range
   handling as gomp_map_rev_lookup: for TGT_START == TGT_END, first try
   the one byte at the start and - unless found or ZERO_LEN - the one
   byte just before it.  Returns the matching index, otherwise >= N.  */
static inline int
gomp_map_cdata_lookup (struct cpy_data *d, uint64_t *devaddrs,
		       unsigned short *kinds, uint64_t *sizes,
		       size_t n, uint64_t tgt_start, uint64_t tgt_end,
		       bool zero_len)
{
  size_t i;
  if (tgt_start != tgt_end)
    return gomp_map_cdata_lookup_int (d, devaddrs, kinds, sizes, n,
				      tgt_start, tgt_end);
  tgt_end++;
  i = gomp_map_cdata_lookup_int (d, devaddrs, kinds, sizes, n,
				 tgt_start, tgt_end);
  if (i < n || zero_len)
    return i;
  tgt_end--;
  tgt_start--;
  return gomp_map_cdata_lookup_int (d, devaddrs, kinds, sizes, n,
				    tgt_start, tgt_end);
}
/* Handle reverse offload. This is called by the device plugins for a
reverse offload; it is not called if the outer target runs on the host. */
reverse offload; it is not called if the outer target runs on the host.
   The mapping is simplified as device-affecting constructs (except for target
with device(ancestor:1)) must not be encountered; in particular not
target (enter/exit) data. */
void
gomp_target_rev (uint64_t fn_ptr __attribute__ ((unused)),
uint64_t mapnum __attribute__ ((unused)),
uint64_t devaddrs_ptr __attribute__ ((unused)),
uint64_t sizes_ptr __attribute__ ((unused)),
uint64_t kinds_ptr __attribute__ ((unused)),
int dev_num __attribute__ ((unused)),
void (*dev_to_host_cpy) (void *, const void *, size_t,
void *) __attribute__ ((unused)),
void (*host_to_dev_cpy) (void *, const void *, size_t,
void *) __attribute__ ((unused)),
void *token __attribute__ ((unused)))
gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
uint64_t sizes_ptr, uint64_t kinds_ptr, int dev_num,
void (*dev_to_host_cpy) (void *, const void *, size_t, void*),
void (*host_to_dev_cpy) (void *, const void *, size_t, void*),
void *token)
{
__builtin_unreachable ();
/* Return early if there is no offload code. */
if (sizeof (OFFLOAD_PLUGINS) == sizeof (""))
return;
/* Currently, this fails because of calculate_firstprivate_requirements
below; it could be fixed but additional code needs to be updated to
handle 32bit hosts - thus, it is not worthwhile. */
if (sizeof (void *) != sizeof (uint64_t))
gomp_fatal ("Reverse offload of 32bit hosts not supported.");
struct cpy_data *cdata = NULL;
uint64_t *devaddrs;
uint64_t *sizes;
unsigned short *kinds;
const bool short_mapkind = true;
const int typemask = short_mapkind ? 0xff : 0x7;
struct gomp_device_descr *devicep = resolve_device (dev_num, false);
reverse_splay_tree_key n;
struct reverse_splay_tree_key_s k;
k.dev = fn_ptr;
gomp_mutex_lock (&devicep->lock);
n = gomp_map_lookup_rev (&devicep->mem_map_rev, &k);
gomp_mutex_unlock (&devicep->lock);
if (n == NULL)
gomp_fatal ("Cannot find reverse-offload function");
void (*host_fn)() = (void (*)()) n->k->host_start;
if (devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
{
devaddrs = (uint64_t *) (uintptr_t) devaddrs_ptr;
sizes = (uint64_t *) (uintptr_t) sizes_ptr;
kinds = (unsigned short *) (uintptr_t) kinds_ptr;
}
else
{
devaddrs = (uint64_t *) gomp_malloc (mapnum * sizeof (uint64_t));
sizes = (uint64_t *) gomp_malloc (mapnum * sizeof (uint64_t));
kinds = (unsigned short *) gomp_malloc (mapnum * sizeof (unsigned short));
if (dev_to_host_cpy)
{
dev_to_host_cpy (devaddrs, (const void *) (uintptr_t) devaddrs_ptr,
mapnum * sizeof (uint64_t), token);
dev_to_host_cpy (sizes, (const void *) (uintptr_t) sizes_ptr,
mapnum * sizeof (uint64_t), token);
dev_to_host_cpy (kinds, (const void *) (uintptr_t) kinds_ptr,
mapnum * sizeof (unsigned short), token);
}
else
{
gomp_copy_dev2host (devicep, NULL, devaddrs,
(const void *) (uintptr_t) devaddrs_ptr,
mapnum * sizeof (uint64_t));
gomp_copy_dev2host (devicep, NULL, sizes,
(const void *) (uintptr_t) sizes_ptr,
mapnum * sizeof (uint64_t));
gomp_copy_dev2host (devicep, NULL, kinds, (const void *) (uintptr_t) kinds_ptr,
mapnum * sizeof (unsigned short));
}
}
size_t tgt_align = 0, tgt_size = 0;
/* If actually executed on 32bit systems, the casts lead to wrong code;
but 32bit with offloading is not supported; see top of this function. */
calculate_firstprivate_requirements (mapnum, (void *) (uintptr_t) sizes,
(void *) (uintptr_t) kinds,
&tgt_align, &tgt_size);
if (tgt_align)
{
char *tgt = gomp_alloca (tgt_size + tgt_align - 1);
uintptr_t al = (uintptr_t) tgt & (tgt_align - 1);
if (al)
tgt += tgt_align - al;
tgt_size = 0;
for (uint64_t i = 0; i < mapnum; i++)
if (get_kind (short_mapkind, kinds, i) == GOMP_MAP_FIRSTPRIVATE
&& devaddrs[i] != 0)
{
size_t align = (size_t) 1 << (kinds[i] >> 8);
tgt_size = (tgt_size + align - 1) & ~(align - 1);
if (devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
memcpy (tgt + tgt_size, (void *) (uintptr_t) devaddrs[i],
(size_t) sizes[i]);
else if (dev_to_host_cpy)
dev_to_host_cpy (tgt + tgt_size, (void *) (uintptr_t) devaddrs[i],
(size_t) sizes[i], token);
else
gomp_copy_dev2host (devicep, NULL, tgt + tgt_size,
(void *) (uintptr_t) devaddrs[i],
(size_t) sizes[i]);
devaddrs[i] = (uint64_t) (uintptr_t) tgt + tgt_size;
tgt_size = tgt_size + sizes[i];
if ((devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
&& i + 1 < mapnum
&& ((get_kind (short_mapkind, kinds, i) & typemask)
== GOMP_MAP_ATTACH))
{
*(uint64_t*) (uintptr_t) (devaddrs[i+1] + sizes[i+1])
= (uint64_t) devaddrs[i];
++i;
}
}
}
if (!(devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM))
{
size_t j, struct_cpy = 0;
splay_tree_key n2;
cdata = gomp_alloca (sizeof (*cdata) * mapnum);
memset (cdata, '\0', sizeof (*cdata) * mapnum);
gomp_mutex_lock (&devicep->lock);
for (uint64_t i = 0; i < mapnum; i++)
{
if (devaddrs[i] == 0)
continue;
n = NULL;
int kind = get_kind (short_mapkind, kinds, i) & typemask;
switch (kind)
{
case GOMP_MAP_FIRSTPRIVATE:
case GOMP_MAP_FIRSTPRIVATE_INT:
continue;
case GOMP_MAP_DELETE:
case GOMP_MAP_RELEASE:
case GOMP_MAP_DELETE_ZERO_LEN_ARRAY_SECTION:
/* Assume it is present; look it up - but ignore otherwise. */
case GOMP_MAP_ALLOC:
case GOMP_MAP_FROM:
case GOMP_MAP_FORCE_ALLOC:
case GOMP_MAP_FORCE_FROM:
case GOMP_MAP_ALWAYS_FROM:
case GOMP_MAP_TO:
case GOMP_MAP_TOFROM:
case GOMP_MAP_FORCE_TO:
case GOMP_MAP_FORCE_TOFROM:
case GOMP_MAP_ALWAYS_TO:
case GOMP_MAP_ALWAYS_TOFROM:
case GOMP_MAP_ZERO_LEN_ARRAY_SECTION:
cdata[i].devaddr = devaddrs[i];
bool zero_len = (kind == GOMP_MAP_DELETE_ZERO_LEN_ARRAY_SECTION
|| kind == GOMP_MAP_ZERO_LEN_ARRAY_SECTION);
j = gomp_map_cdata_lookup (cdata, devaddrs, kinds, sizes, i,
devaddrs[i],
devaddrs[i] + sizes[i], zero_len);
if (j < i)
{
n2 = NULL;
cdata[i].present = true;
devaddrs[i] = devaddrs[j] + devaddrs[i] - cdata[j].devaddr;
}
else
{
n2 = gomp_map_rev_lookup (&devicep->mem_map,
devaddrs[i],
devaddrs[i] + sizes[i], zero_len);
cdata[i].present = n2 != NULL;
}
if (!cdata[i].present
&& kind != GOMP_MAP_DELETE
&& kind != GOMP_MAP_RELEASE
&& kind != GOMP_MAP_DELETE_ZERO_LEN_ARRAY_SECTION)
{
cdata[i].aligned = true;
size_t align = (size_t) 1 << (kinds[i] >> 8);
devaddrs[i]
= (uint64_t) (uintptr_t) gomp_aligned_alloc (align,
sizes[i]);
}
else if (n2 != NULL)
devaddrs[i] = (n2->host_start + cdata[i].devaddr
- (n2->tgt->tgt_start + n2->tgt_offset));
if (((!cdata[i].present || struct_cpy)
&& (kind == GOMP_MAP_TO || kind == GOMP_MAP_TOFROM))
|| kind == GOMP_MAP_FORCE_TO
|| kind == GOMP_MAP_FORCE_TOFROM
|| kind == GOMP_MAP_ALWAYS_TO
|| kind == GOMP_MAP_ALWAYS_TOFROM)
{
if (dev_to_host_cpy)
dev_to_host_cpy ((void *) (uintptr_t) devaddrs[i],
(void *) (uintptr_t) cdata[i].devaddr,
sizes[i], token);
else
gomp_copy_dev2host (devicep, NULL,
(void *) (uintptr_t) devaddrs[i],
(void *) (uintptr_t) cdata[i].devaddr,
sizes[i]);
}
if (struct_cpy)
struct_cpy--;
break;
case GOMP_MAP_ATTACH:
case GOMP_MAP_POINTER:
case GOMP_MAP_ALWAYS_POINTER:
n2 = gomp_map_rev_lookup (&devicep->mem_map,
devaddrs[i] + sizes[i],
devaddrs[i] + sizes[i]
+ sizeof (void*), false);
cdata[i].present = n2 != NULL;
cdata[i].devaddr = devaddrs[i];
if (n2)
devaddrs[i] = (n2->host_start + cdata[i].devaddr
- (n2->tgt->tgt_start + n2->tgt_offset));
else
{
j = gomp_map_cdata_lookup (cdata, devaddrs, kinds, sizes, i,
devaddrs[i] + sizes[i],
devaddrs[i] + sizes[i]
+ sizeof (void*), false);
if (j < i)
{
cdata[i].present = true;
devaddrs[i] = (devaddrs[j] + devaddrs[i]
- cdata[j].devaddr);
}
}
if (!cdata[i].present)
devaddrs[i] = (uintptr_t) gomp_malloc (sizeof (void*));
/* Assume that when present, the pointer is already correct. */
if (!n2)
*(uint64_t *) (uintptr_t) (devaddrs[i] + sizes[i])
= devaddrs[i-1];
break;
case GOMP_MAP_TO_PSET:
/* Assume that when present, the pointers are fine and no 'to:'
is required. */
n2 = gomp_map_rev_lookup (&devicep->mem_map,
devaddrs[i], devaddrs[i] + sizes[i],
false);
cdata[i].present = n2 != NULL;
cdata[i].devaddr = devaddrs[i];
if (n2)
devaddrs[i] = (n2->host_start + cdata[i].devaddr
- (n2->tgt->tgt_start + n2->tgt_offset));
else
{
j = gomp_map_cdata_lookup (cdata, devaddrs, kinds, sizes, i,
devaddrs[i],
devaddrs[i] + sizes[i], false);
if (j < i)
{
cdata[i].present = true;
devaddrs[i] = (devaddrs[j] + devaddrs[i]
- cdata[j].devaddr);
}
}
if (!cdata[i].present)
{
cdata[i].aligned = true;
size_t align = (size_t) 1 << (kinds[i] >> 8);
devaddrs[i]
= (uint64_t) (uintptr_t) gomp_aligned_alloc (align,
sizes[i]);
if (dev_to_host_cpy)
dev_to_host_cpy ((void *) (uintptr_t) devaddrs[i],
(void *) (uintptr_t) cdata[i].devaddr,
sizes[i], token);
else
gomp_copy_dev2host (devicep, NULL,
(void *) (uintptr_t) devaddrs[i],
(void *) (uintptr_t) cdata[i].devaddr,
sizes[i]);
}
for (j = i + 1; j < mapnum; j++)
{
kind = get_kind (short_mapkind, kinds, j) & typemask;
if (!GOMP_MAP_ALWAYS_POINTER_P (kind)
&& !GOMP_MAP_POINTER_P (kind))
break;
if (devaddrs[j] < devaddrs[i])
break;
if (cdata[i].present)
continue;
if (devaddrs[j] == 0)
{
*(uint64_t *) (uintptr_t) (devaddrs[i] + sizes[j]) = 0;
continue;
}
int k;
n2 = NULL;
cdata[i].present = true;
cdata[j].devaddr = devaddrs[j];
k = gomp_map_cdata_lookup (cdata, devaddrs, kinds, sizes, j,
devaddrs[j],
devaddrs[j] + sizeof (void*),
false);
if (k < j)
devaddrs[j] = (devaddrs[k] + devaddrs[j]
- cdata[k].devaddr);
else
{
n2 = gomp_map_rev_lookup (&devicep->mem_map,
devaddrs[j],
devaddrs[j] + sizeof (void*),
false);
if (n2 == NULL)
{
gomp_mutex_unlock (&devicep->lock);
gomp_fatal ("Pointer target wasn't mapped");
}
devaddrs[j] = (n2->host_start + cdata[j].devaddr
- (n2->tgt->tgt_start + n2->tgt_offset));
}
*(void **) (uintptr_t) (devaddrs[i] + sizes[j])
= (void *) (uintptr_t) devaddrs[j];
}
i = j -1;
break;
case GOMP_MAP_STRUCT:
n2 = gomp_map_rev_lookup (&devicep->mem_map, devaddrs[i+1],
devaddrs[i + sizes[i]]
+ sizes[i + sizes[i]], false);
cdata[i].present = n2 != NULL;
cdata[i].devaddr = devaddrs[i];
struct_cpy = cdata[i].present ? 0 : sizes[i];
if (!n2)
{
size_t sz = (size_t) (devaddrs[i + sizes[i]]
- devaddrs[i+1]
+ sizes[i + sizes[i]]);
size_t align = (size_t) 1 << (kinds[i] >> 8);
cdata[i].aligned = true;
devaddrs[i] = (uintptr_t) gomp_aligned_alloc (align, sz);
devaddrs[i] -= devaddrs[i+1] - cdata[i].devaddr;
}
else
devaddrs[i] = (n2->host_start + cdata[i].devaddr
- (n2->tgt->tgt_start + n2->tgt_offset));
break;
default:
gomp_mutex_unlock (&devicep->lock);
gomp_fatal ("gomp_target_rev unhandled kind 0x%.4x", kinds[i]);
}
}
gomp_mutex_unlock (&devicep->lock);
}
host_fn (devaddrs);
if (!(devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM))
{
uint64_t struct_cpy = 0;
bool clean_struct = false;
for (uint64_t i = 0; i < mapnum; i++)
{
if (cdata[i].devaddr == 0)
continue;
int kind = get_kind (short_mapkind, kinds, i) & typemask;
bool copy = !cdata[i].present || struct_cpy;
switch (kind)
{
case GOMP_MAP_FORCE_FROM:
case GOMP_MAP_FORCE_TOFROM:
case GOMP_MAP_ALWAYS_FROM:
case GOMP_MAP_ALWAYS_TOFROM:
copy = true;
/* FALLTHRU */
case GOMP_MAP_FROM:
case GOMP_MAP_TOFROM:
if (copy && host_to_dev_cpy)
host_to_dev_cpy ((void *) (uintptr_t) cdata[i].devaddr,
(void *) (uintptr_t) devaddrs[i],
sizes[i], token);
else if (copy)
gomp_copy_host2dev (devicep, NULL,
(void *) (uintptr_t) cdata[i].devaddr,
(void *) (uintptr_t) devaddrs[i],
sizes[i], false, NULL);
default:
break;
}
if (struct_cpy)
{
struct_cpy--;
continue;
}
if (kind == GOMP_MAP_STRUCT && !cdata[i].present)
{
clean_struct = true;
struct_cpy = sizes[i];
}
else if (cdata[i].aligned)
gomp_aligned_free ((void *) (uintptr_t) devaddrs[i]);
else if (!cdata[i].present)
free ((void *) (uintptr_t) devaddrs[i]);
}
if (clean_struct)
for (uint64_t i = 0; i < mapnum; i++)
if (!cdata[i].present
&& ((get_kind (short_mapkind, kinds, i) & typemask)
== GOMP_MAP_STRUCT))
{
devaddrs[i] += cdata[i+1].devaddr - cdata[i].devaddr;
gomp_aligned_free ((void *) (uintptr_t) devaddrs[i]);
}
free (devaddrs);
free (sizes);
free (kinds);
}
}
/* Host fallback for GOMP_target_data{,_ext} routines. */
@ -4510,6 +5078,7 @@ gomp_target_init (void)
/* current_device.capabilities has already been set. */
current_device.type = current_device.get_type_func ();
current_device.mem_map.root = NULL;
current_device.mem_map_rev.root = NULL;
current_device.state = GOMP_DEVICE_UNINITIALIZED;
for (i = 0; i < new_num_devs; i++)
{

View file

@ -0,0 +1,72 @@
! { dg-additional-options "-foffload-options=nvptx-none=-misa=sm_35" { target { offload_target_nvptx } } }
!
! Test reverse offload ('target device (ancestor:1)') with 'to',
! 'always, to', 'from' and 'tofrom' maps of array sections, checking
! results for both shared-memory and separate-memory devices.
!
implicit none
!$omp requires reverse_offload
integer :: A(50), A2(50)
integer :: i, error
logical :: shared_mem
error = 0
shared_mem = .false.
A = [(i, i=1,50)]
A2 = [(i, i=1,50)]
! Detect whether host and device share memory.
!$omp target map(to: shared_mem)
shared_mem = .true.
!$omp end target
!$omp target map(to: A(20:40), A2(20:40)) map(from: error)
block
integer :: B(10), C(10)
B = 99
C = 88
A(20:40) = -A(20:40)
A2(20:40) = -A2(20:40)
! Reverse offload back to the host.
!$omp target device (ancestor:1) &
!$omp& map(to: A(25:35)) map(always, to: A2(25:35)) &
!$omp& map(from:B(4:8)) map(tofrom:C(4:8))
if (shared_mem) then
if (any (A(25:35) /= [(-i,i=25,35)])) stop 20
else
if (any (A(25:35) /= [( i,i=25,35)])) stop 21
end if
if (any (A2(25:35) /= [(-i,i=25,35)])) stop 22
if (any (C(4:8) /= 88)) stop 23
A(25:35) = -A(25:35)*10
A2(25:35) = -A2(25:35)*10
B(4:8) = [4,5,6,7,8]
C(4:8) = [-4,-5,-6,-7,-8]
!$omp end target
if (any (B(1:3) /= 99) .or. any (B(9:10) /= 99)) then
error = 30
elseif (any (B(4:8) /= [4,5,6,7,8])) then
error = 31
elseif (any (C(1:3) /= 88) .or. any (C(9:10) /= 88)) then
error = 32
elseif (any (C(4:8) /= [-4,-5,-6,-7,-8])) then
error = 33
else
error = 0
endif
end block
if (error /= 0) stop error
if (shared_mem) then
if (any (A(1:19) /= [( i, i=1,19)])) stop 1
if (any (A(20:24) /= [(-i, i=20,24)])) stop 2
if (any (A(36:40) /= [(-i, i=36,40)])) stop 3
if (any (A(41:50) /= [( i, i=41,50)])) stop 4
if (any (A(25:35) /= [( 10*i, i=25,35)])) stop 5
else
if (any (A(1:24) /= [( i, i=1,24)])) stop 6
if (any (A(36:50) /= [( i, i=36,50)])) stop 7
if (any (A(25:35) /= [(-10*i, i=25,35)])) stop 8
end if
if (any (A2(25:35) /= [( 10*i, i=25,35)])) stop 9
end

View file

@ -0,0 +1,68 @@
! { dg-additional-options "-foffload-options=nvptx-none=-misa=sm_35" { target { offload_target_nvptx } } }
!
! Test reverse offload ('target device (ancestor:1)') with 'from',
! 'always, from' and 'alloc' maps of array sections, checking results
! for both shared-memory and separate-memory devices.
!
implicit none
!$omp requires reverse_offload
integer :: A(50), A2(50), A3(50)
integer :: i
logical :: shared_mem
shared_mem = .false.
A = [(3*i, i=1,50)]
A2 = [(7*i, i=1,50)]
A3 = [(11*i, i=1,50)]
! Detect whether host and device share memory.
!$omp target map(to: shared_mem)
shared_mem = .true.
!$omp end target
!$omp target map(to: A(20:40), A2(20:40), A3(20:40))
block
integer :: C(10)
C = 88
A(20:40) = -2*A(20:40)
A2(20:40) = -9*A2(20:40)
A3(20:40) = -13*A3(20:40)
! Reverse offload back to the host.
!$omp target device (ancestor:1) &
!$omp& map(from: A(25:35)) map(always, from: A2(25:35)) &
!$omp& map(alloc: A3(25:35)) map(alloc:C(4:8))
if (shared_mem) then
if (any (A(25:35) /= [(-2*3*i, i=25,35)])) stop 1
if (any (A2(25:35) /= [(-9*7*i, i=25,35)])) stop 2
if (any (A3(25:35) /= [(-13*11*i, i=25,35)])) stop 3
else
if (any (A(25:35) /= [(3*i, i=25,35)])) stop 4
if (any (A2(25:35) /= [(7*i, i=25,35)])) stop 5
if (any (A3(25:35) /= [(11*i, i=25,35)])) stop 6
end if
A(25:35) = A(25:35)*5
A2(25:35) = A2(25:35)*8
A3(25:35) = A3(25:35)*18
C(4:8) = [4,5,6,7,8]
!$omp end target
if (shared_mem) then
if (any (A(25:35) /= [(-2*3*5*i, i=25,35)])) stop 7
if (any (A2(25:35) /= [(-9*7*8*i, i=25,35)])) stop 8
if (any (A3(25:35) /= [(-13*11*18*i, i=25,35)])) stop 9
if (any (C(4:8) /= [4,5,6,7,8])) stop 10
else
if (any (A(25:35) /= [(-2*3*i, i=25,35)])) stop 11
if (any (A2(25:35) /= [(7*8*i, i=25,35)])) stop 12
if (any (A3(25:35) /= [(-13*11*i, i=25,35)])) stop 13
if (any (C(4:8) /= 88)) stop 14
end if
end block
if (shared_mem) then
if (any (A(25:35) /= [(-2*3*5*i, i=25,35)])) stop
! Was '7**8' - inconsistent with the identical check inside the block
! (stop 8) and with the non-shared check below; no further modification
! of A2 happens between those checks, so '7*8' is the intended value.
if (any (A2(25:35) /= [(-9*7*8*i, i=25,35)])) stop
if (any (A3(25:35) /= [(-13*11*18*i, i=25,35)])) stop
else
if (any (A(25:35) /= [(3*5*i, i=25,35)])) stop
if (any (A2(25:35) /= [(7*8*i, i=25,35)])) stop
if (any (A3(25:35) /= [(11*18*i, i=25,35)])) stop
end if
end

View file

@ -0,0 +1,129 @@
! { dg-additional-options "-foffload-options=nvptx-none=-misa=sm_35" { target { offload_target_nvptx } } }
! Reverse offload with derived-type components: maps both variables that
! already exist on the host (s1, s2) and device-only block-local variables
! (si1, si2) that must be mapped to the host on the fly.
implicit none
!$omp requires reverse_offload

type t2
  integer :: a, b, c
end type t2
type t
  integer :: A(5), B(5), C(5)
  integer, pointer :: P(:), P2  ! just some padding
  type(t2) :: tt                ! just some padding
end type t

type(t) :: S1, S2
logical :: shared_mem

! With map(to:), the device-side write is only visible on the host when
! host and device share memory.
shared_mem = .false.
!$omp target map(to: shared_mem)
  shared_mem = .true.
!$omp end target

s1%A = [1,2,3,4,5]
s1%B = [10,20,30,40,50]
s1%C = [11,22,33,44,55]
s2%A = 2*s1%A
s2%B = 2*s1%B
s2%C = 2*s1%C

!$omp target &
!$omp&  map(to: s1%b, s1%c) &
!$omp&  map(to: s2%b, s2%c)
block
  type(t) :: si1, si2, si3, si4
  s1%B = -10 * s1%B
  s1%C = -10 * s1%C
  s2%B = -15 * s2%B
  s2%C = -15 * s2%C
  si1%A = -1 * [1,2,3,4,5]
  si1%B = -1 * [10,20,30,40,50]
  si1%C = -1 * [11,22,33,44,55]
  si2%A = -23 * [1,2,3,4,5]
  si2%B = -23 * [10,20,30,40,50]
  si2%C = -23 * [11,22,33,44,55]

  ! Reverse offload: this region executes on the host (ancestor device 1).
  !$omp target device (ancestor:1) &
  !$omp&  map(to: si1%C, si1%B) &
  !$omp&  map(tofrom: si2%C, si2%B) &
  !$omp&  map(always, to: s1%B) &
  !$omp&  map(to: s2%B)
    if (any (s1%A /= [1,2,3,4,5])) stop 1
    if (any (s1%B /= -10 * [10,20,30,40,50])) stop 2
    if (shared_mem) then
      if (any (s1%C /= -10 * [11,22,33,44,55])) stop 4
    else
      if (any (s1%C /= [11,22,33,44,55])) stop 3
    endif
    if (any (s2%A /= 2 * [1,2,3,4,5])) stop 12
    if (shared_mem) then
      if (any (s2%B /= -15 * 2 * [10,20,30,40,50])) stop 5
      if (any (s2%C /= -15 * 2 * [11,22,33,44,55])) stop 6
    else
      if (any (s2%B /= 2 * [10,20,30,40,50])) stop 7
      if (any (s2%C /= 2 * [11,22,33,44,55])) stop 8
    endif
    if (any (si1%B /= -1 * [10,20,30,40,50])) stop 9
    if (any (si1%C /= -1 * [11,22,33,44,55])) stop 10
    if (any (si2%B /= -23 * [10,20,30,40,50])) stop 13
    if (any (si2%C /= -23 * [11,22,33,44,55])) stop 11
    s1%A = 5 * s1%A
    s1%B = 7 * s1%B
    s1%C = 13 * s1%C
    s2%A = 9 * s2%A
    s2%B = 21 * s2%B
    s2%C = 31 * s2%C
    si1%B = -11 * si1%B
    si1%C = -13 * si1%C
    si2%B = -27 * si2%B
    si2%C = -29 * si2%C
  !$omp end target

  ! Back on the device: only 'always, to' (s1%B) and 'tofrom' (si2) show
  ! the host updates; plain 'to' maps keep the device values unless
  ! memory is shared.
  if (shared_mem) then
    if (any (s1%B /= -10 * 7 * [10,20,30,40,50])) stop 20
    if (any (s1%C /= -10 * 13 * [11,22,33,44,55])) stop 21
  else
    if (any (s1%B /= -10 * [10,20,30,40,50])) stop 22
    if (any (s1%C /= -10 * [11,22,33,44,55])) stop 23
  endif
  if (shared_mem) then
    if (any (s2%B /= -15 * 2 * 21 * [10,20,30,40,50])) stop 24
    if (any (s2%C /= -15 * 2 * 31 * [11,22,33,44,55])) stop 25
  else
    if (any (s2%B /= -15 * 2 * [10,20,30,40,50])) stop 26
    if (any (s2%C /= -15 * 2 * [11,22,33,44,55])) stop 27
  endif
  if (any (si1%A /= -1 * [1,2,3,4,5])) stop 28
  if (shared_mem) then
    if (any (si1%B /= -1 * (-11) * [10,20,30,40,50])) stop 29
    if (any (si1%C /= -1 * (-13) * [11,22,33,44,55])) stop 30
  else
    if (any (si1%B /= -1 * [10,20,30,40,50])) stop 31
    if (any (si1%C /= -1 * [11,22,33,44,55])) stop 32
  endif
  if (any (si2%A /= -23 * [1,2,3,4,5])) stop 33
  if (any (si2%B /= -23 * (-27) * [10,20,30,40,50])) stop 34
  if (any (si2%C /= -23 * (-29) * [11,22,33,44,55])) stop 35
end block
!$omp end target

! Host-side checks after the enclosing target region.
if (any (s1%A /= 5 * [1,2,3,4,5])) stop 40
if (any (s1%B /= -10 * 7 * [10,20,30,40,50])) stop 41
if (shared_mem) then
  if (any (s1%C /= -10 * 13 * [11,22,33,44,55])) stop 42
else
  if (any (s1%C /= 13 * [11,22,33,44,55])) stop 43
endif
if (any (s2%A /= 2 * 9 * [1,2,3,4,5])) stop 44
if (shared_mem) then
  if (any (s2%B /= -15 * 2 * 21 * [10,20,30,40,50])) stop 45
  if (any (s2%C /= -15 * 2 * 31 * [11,22,33,44,55])) stop 46
else
  if (any (s2%B /= 2 * 21 * [10,20,30,40,50])) stop 47
  if (any (s2%C /= 2 * 31 * [11,22,33,44,55])) stop 48
endif
end

! ---------------------------------------------------------------------------
! Next test file (scraper artifacts "View file" / diff-hunk header removed)
! ---------------------------------------------------------------------------
! { dg-additional-options "-foffload-options=nvptx-none=-misa=sm_35" { target { offload_target_nvptx } } }
! { dg-xfail-run-if "Copying on-device allocated memory fails with cuMemcpyDtoHAsync error: invalid argument" { offload_device_nvptx } }
! Because of the nvptx fail, a non-device alloc version has been
! created: reverse-offload-5a.f90
! Reverse offload with allocatables, including device-side allocated
! variables (ai, ai2, si1, si2) that get mapped to the host on the fly.
implicit none
!$omp requires reverse_offload
integer, allocatable :: A(:), A2(:), s1, s2
integer :: i
logical :: shared_mem

shared_mem = .false.
a = [1,2,3,4]
a2 = [8,7,6,5]
s1 = 532
s2 = 55

! With map(to:), the device-side write is only visible on the host when
! host and device share memory.
!$omp target map(to: shared_mem)
  shared_mem = .true.
!$omp end target

!$omp target map(to: A, A2, s1, s2)
block
  ! Device-side allocated variables.
  integer, allocatable :: ai(:), ai2(:), si1, si2
  a = a * 2
  a2 = a2 * 3
  s1 = s1 * 4
  s2 = s2 * 5
  ai = [23,35,86,43]
  ai2 = [8,4,7,1]
  si1 = 64
  si2 = 765

  ! Reverse offload: this region executes on the host (ancestor device 1).
  !$omp target device (ancestor:1) &
  !$omp&  map(to: A, s1, ai, si1) map(always, to: a2, s2) &
  !$omp&  map(tofrom: ai2, si2)
    if (shared_mem) then
      if (any (a /= 2 * [1,2,3,4])) stop 1
      if (s1 /= 4 * 532) stop 2
    else
      if (any (a /= [1,2,3,4])) stop 3
      if (s1 /= 532) stop 4
    endif
    if (any (a2 /= 3 * [8,7,6,5])) stop 5
    if (s2 /= 5 * 55) stop 6
    if (any (ai /= [23,35,86,43])) stop 7
    if (any (ai2 /= [8,4,7,1])) stop 8
    if (si1 /= 64) stop 9
    if (si2 /= 765) stop 10
    a = a*3
    a2 = a2*7
    s1 = s1*11
    s2 = s2*5
    ai = ai*13
    ai2 = ai2*21
    si1 = si1*27
    si2 = si2*31
  !$omp end target

  ! Back on the device: 'always, to' and 'tofrom' show the host updates;
  ! plain 'to' keeps the device values unless memory is shared.
  if (shared_mem) then
    if (any (a /= 3 * 2 * [1,2,3,4])) stop 11
    if (any (a2 /= 7 * 3 * [8,7,6,5])) stop 12
    if (s1 /= 11 * 4 * 532) stop 13
    if (s2 /= 5 * 5 * 55) stop 14
    if (any (ai /= 13 * [23,35,86,43])) stop 15
    if (si1 /= 27 * 64) stop 16
  else
    if (any (a /= 2 * [1,2,3,4])) stop 17
    if (any (a2 /= 3 * [8,7,6,5])) stop 18
    if (s1 /= 4 * 532) stop 19
    if (s2 /= 5 * 55) stop 20
    if (any (ai /= [23,35,86,43])) stop 22
    if (si1 /= 64) stop 23
  endif
  if (any (ai2 /= 21 * [8,4,7,1])) stop 24
  if (si2 /= 31 * 765) stop 25
  deallocate (ai, ai2, si1, si2)
end block
!$omp end target

! Host-side checks after the enclosing target region.
if (shared_mem) then
  if (any (a /= 3 * 2 * [1,2,3,4])) stop 30
  if (any (a2 /= 7 * 3 * [8,7,6,5])) stop 31
  if (s1 /= 11 * 4 * 532) stop 32
  if (s2 /= 5 * 5 * 55) stop 33
else
  if (any (a /= 3 * [1,2,3,4])) stop 34
  if (any (a2 /= 3 * 7 * [8,7,6,5])) stop 35
  if (s1 /= 11 * 532) stop 36
  if (s2 /= 5 * 5 * 55) stop 37
endif
deallocate (a, a2, s1, s2)
end

! ---------------------------------------------------------------------------
! Next test file (scraper artifacts "View file" / diff-hunk header removed)
! ---------------------------------------------------------------------------
! { dg-additional-options "-foffload-options=nvptx-none=-misa=sm_35" { target { offload_target_nvptx } } }
! Copying on-device allocated memory fails with cuMemcpyDtoHAsync error: invalid argument
! Hence, reverse-offload-5.f90 has been copied to *this* file,
! which uses on-host allocated vars - but only on the device side stack variables
implicit none
!$omp requires reverse_offload
integer, allocatable :: A(:), A2(:), s1, s2
integer :: i,D(4)
logical :: shared_mem

shared_mem = .false.
a = [1,2,3,4]
a2 = [8,7,6,5]
s1 = 532
s2 = 55

! With map(to:), the device-side write is only visible on the host when
! host and device share memory.
!$omp target map(to: shared_mem)
  shared_mem = .true.
!$omp end target

!$omp target map(to: A, A2, s1, s2)
block
  ! Stack variables (instead of device-side allocatables, cf. file header):
  integer :: ai(4), ai2(4), si1, si2
  a = a * 2
  a2 = a2 * 3
  s1 = s1 * 4
  s2 = s2 * 5
  ai = [23,35,86,43]
  ai2 = [8,4,7,1]
  si1 = 64
  si2 = 765

  ! Reverse offload: this region executes on the host (ancestor device 1).
  !$omp target device (ancestor:1) &
  !$omp&  map(to: A, s1, ai, si1) map(always, to: a2, s2) &
  !$omp&  map(tofrom: ai2, si2)
    if (shared_mem) then
      if (any (a /= 2 * [1,2,3,4])) stop 1
      if (s1 /= 4 * 532) stop 2
    else
      if (any (a /= [1,2,3,4])) stop 3
      if (s1 /= 532) stop 4
    endif
    if (any (a2 /= 3 * [8,7,6,5])) stop 5
    if (s2 /= 5 * 55) stop 6
    if (any (ai /= [23,35,86,43])) stop 7
    if (any (ai2 /= [8,4,7,1])) stop 8
    if (si1 /= 64) stop 9
    if (si2 /= 765) stop 10
    a = a*3
    a2 = a2*7
    s1 = s1*11
    s2 = s2*5
    ai = ai*13
    ai2 = ai2*21
    si1 = si1*27
    si2 = si2*31
  !$omp end target

  ! Back on the device: 'always, to' and 'tofrom' show the host updates;
  ! plain 'to' keeps the device values unless memory is shared.
  if (shared_mem) then
    if (any (a /= 3 * 2 * [1,2,3,4])) stop 11
    if (any (a2 /= 7 * 3 * [8,7,6,5])) stop 12
    if (s1 /= 11 * 4 * 532) stop 13
    if (s2 /= 5 * 5 * 55) stop 14
    if (any (ai /= 13 * [23,35,86,43])) stop 15
    if (si1 /= 27 * 64) stop 16
  else
    if (any (a /= 2 * [1,2,3,4])) stop 17
    if (any (a2 /= 3 * [8,7,6,5])) stop 18
    if (s1 /= 4 * 532) stop 19
    if (s2 /= 5 * 55) stop 20
    if (any (ai /= [23,35,86,43])) stop 22
    if (si1 /= 64) stop 23
  endif
  if (any (ai2 /= 21 * [8,4,7,1])) stop 24
  if (si2 /= 31 * 765) stop 25
end block
!$omp end target

! Host-side checks after the enclosing target region.
if (shared_mem) then
  if (any (a /= 3 * 2 * [1,2,3,4])) stop 30
  if (any (a2 /= 7 * 3 * [8,7,6,5])) stop 31
  if (s1 /= 11 * 4 * 532) stop 32
  if (s2 /= 5 * 5 * 55) stop 33
else
  if (any (a /= 3 * [1,2,3,4])) stop 34
  if (any (a2 /= 3 * 7 * [8,7,6,5])) stop 35
  if (s1 /= 11 * 532) stop 36
  if (s2 /= 5 * 5 * 55) stop 37
endif
deallocate (a, a2, s1, s2)
end