libgomp, nvptx: low-latency memory allocator
This patch adds support for allocating low-latency ".shared" memory on an NVPTX GPU device, via omp_low_lat_mem_space and omp_alloc. The memory can be allocated, reallocated, and freed using a basic but fast algorithm, is thread-safe, and the size of the low-latency heap can be configured using the GOMP_NVPTX_LOWLAT_POOL environment variable. The use of the PTX dynamic_smem_size feature means that the low-latency allocator will not work with the PTX 3.1 multilib. For now, the omp_low_lat_mem_alloc allocator also works, but that will change when I implement the access traits. libgomp/ChangeLog: * allocator.c (MEMSPACE_ALLOC): New macro. (MEMSPACE_CALLOC): New macro. (MEMSPACE_REALLOC): New macro. (MEMSPACE_FREE): New macro. (predefined_alloc_mapping): New array. Add _Static_assert to match. (ARRAY_SIZE): New macro. (omp_aligned_alloc): Use MEMSPACE_ALLOC. Implement fall-backs for predefined allocators. Simplify existing fall-backs. (omp_free): Use MEMSPACE_FREE. (omp_calloc): Use MEMSPACE_CALLOC. Implement fall-backs for predefined allocators. Simplify existing fall-backs. (omp_realloc): Use MEMSPACE_REALLOC, MEMSPACE_ALLOC, and MEMSPACE_FREE. Implement fall-backs for predefined allocators. Simplify existing fall-backs. * config/nvptx/team.c (__nvptx_lowlat_pool): New asm variable. (__nvptx_lowlat_init): New prototype. (gomp_nvptx_main): Call __nvptx_lowlat_init. * libgomp.texi: Update memory space table. * plugin/plugin-nvptx.c (lowlat_pool_size): New variable. (GOMP_OFFLOAD_init_device): Read the GOMP_NVPTX_LOWLAT_POOL envvar. (GOMP_OFFLOAD_run): Apply lowlat_pool_size. * basic-allocator.c: New file. * config/nvptx/allocator.c: New file. * testsuite/libgomp.c/omp_alloc-1.c: New test. * testsuite/libgomp.c/omp_alloc-2.c: New test. * testsuite/libgomp.c/omp_alloc-3.c: New test. * testsuite/libgomp.c/omp_alloc-4.c: New test. * testsuite/libgomp.c/omp_alloc-5.c: New test. * testsuite/libgomp.c/omp_alloc-6.c: New test. 
Co-authored-by: Kwok Cheung Yeung <kcy@codesourcery.com> Co-Authored-By: Thomas Schwinge <thomas@codesourcery.com>
This commit is contained in:
parent
458e7c9379
commit
30486fab71
12 changed files with 1239 additions and 105 deletions
|
@ -37,6 +37,47 @@
|
|||
|
||||
#define omp_max_predefined_alloc omp_thread_mem_alloc
|
||||
|
||||
/* These macros may be overridden in config/<target>/allocator.c.
|
||||
The following definitions (ab)use comma operators to avoid unused
|
||||
variable errors. */
|
||||
#ifndef MEMSPACE_ALLOC
|
||||
#define MEMSPACE_ALLOC(MEMSPACE, SIZE) \
|
||||
malloc (((void)(MEMSPACE), (SIZE)))
|
||||
#endif
|
||||
#ifndef MEMSPACE_CALLOC
|
||||
#define MEMSPACE_CALLOC(MEMSPACE, SIZE) \
|
||||
calloc (1, (((void)(MEMSPACE), (SIZE))))
|
||||
#endif
|
||||
#ifndef MEMSPACE_REALLOC
|
||||
#define MEMSPACE_REALLOC(MEMSPACE, ADDR, OLDSIZE, SIZE) \
|
||||
realloc (ADDR, (((void)(MEMSPACE), (void)(OLDSIZE), (SIZE))))
|
||||
#endif
|
||||
#ifndef MEMSPACE_FREE
|
||||
#define MEMSPACE_FREE(MEMSPACE, ADDR, SIZE) \
|
||||
free (((void)(MEMSPACE), (void)(SIZE), (ADDR)))
|
||||
#endif
|
||||
|
||||
/* Map the predefined allocators to the correct memory space.
|
||||
The index to this table is the omp_allocator_handle_t enum value.
|
||||
When the user calls omp_alloc with a predefined allocator this
|
||||
table determines what memory they get. */
|
||||
static const omp_memspace_handle_t predefined_alloc_mapping[] = {
|
||||
omp_default_mem_space, /* omp_null_allocator doesn't actually use this. */
|
||||
omp_default_mem_space, /* omp_default_mem_alloc. */
|
||||
omp_large_cap_mem_space, /* omp_large_cap_mem_alloc. */
|
||||
omp_const_mem_space, /* omp_const_mem_alloc. */
|
||||
omp_high_bw_mem_space, /* omp_high_bw_mem_alloc. */
|
||||
omp_low_lat_mem_space, /* omp_low_lat_mem_alloc. */
|
||||
omp_low_lat_mem_space, /* omp_cgroup_mem_alloc (implementation defined). */
|
||||
omp_low_lat_mem_space, /* omp_pteam_mem_alloc (implementation defined). */
|
||||
omp_low_lat_mem_space, /* omp_thread_mem_alloc (implementation defined). */
|
||||
};
|
||||
|
||||
#define ARRAY_SIZE(A) (sizeof (A) / sizeof ((A)[0]))
|
||||
_Static_assert (ARRAY_SIZE (predefined_alloc_mapping)
|
||||
== omp_max_predefined_alloc + 1,
|
||||
"predefined_alloc_mapping must match omp_memspace_handle_t");
|
||||
|
||||
enum gomp_numa_memkind_kind
|
||||
{
|
||||
GOMP_MEMKIND_NONE = 0,
|
||||
|
@ -533,7 +574,7 @@ retry:
|
|||
}
|
||||
else
|
||||
#endif
|
||||
ptr = malloc (new_size);
|
||||
ptr = MEMSPACE_ALLOC (allocator_data->memspace, new_size);
|
||||
if (ptr == NULL)
|
||||
{
|
||||
#ifdef HAVE_SYNC_BUILTINS
|
||||
|
@ -565,7 +606,13 @@ retry:
|
|||
}
|
||||
else
|
||||
#endif
|
||||
ptr = malloc (new_size);
|
||||
{
|
||||
omp_memspace_handle_t memspace;
|
||||
memspace = (allocator_data
|
||||
? allocator_data->memspace
|
||||
: predefined_alloc_mapping[allocator]);
|
||||
ptr = MEMSPACE_ALLOC (memspace, new_size);
|
||||
}
|
||||
if (ptr == NULL)
|
||||
goto fail;
|
||||
}
|
||||
|
@ -582,36 +629,26 @@ retry:
|
|||
((struct omp_mem_header *) ret)[-1].allocator = allocator;
|
||||
return ret;
|
||||
|
||||
fail:
|
||||
if (allocator_data)
|
||||
fail:;
|
||||
int fallback = (allocator_data
|
||||
? allocator_data->fallback
|
||||
: allocator == omp_default_mem_alloc
|
||||
? omp_atv_null_fb
|
||||
: omp_atv_default_mem_fb);
|
||||
switch (fallback)
|
||||
{
|
||||
switch (allocator_data->fallback)
|
||||
{
|
||||
case omp_atv_default_mem_fb:
|
||||
if ((new_alignment > sizeof (void *) && new_alignment > alignment)
|
||||
#if defined(LIBGOMP_USE_MEMKIND) || defined(LIBGOMP_USE_LIBNUMA)
|
||||
|| memkind
|
||||
#endif
|
||||
|| (allocator_data
|
||||
&& allocator_data->pool_size < ~(uintptr_t) 0))
|
||||
{
|
||||
allocator = omp_default_mem_alloc;
|
||||
goto retry;
|
||||
}
|
||||
/* Otherwise, we've already performed default mem allocation
|
||||
and if that failed, it won't succeed again (unless it was
|
||||
intermittent. Return NULL then, as that is the fallback. */
|
||||
break;
|
||||
case omp_atv_null_fb:
|
||||
break;
|
||||
default:
|
||||
case omp_atv_abort_fb:
|
||||
gomp_fatal ("Out of memory allocating %lu bytes",
|
||||
(unsigned long) size);
|
||||
case omp_atv_allocator_fb:
|
||||
allocator = allocator_data->fb_data;
|
||||
goto retry;
|
||||
}
|
||||
case omp_atv_default_mem_fb:
|
||||
allocator = omp_default_mem_alloc;
|
||||
goto retry;
|
||||
case omp_atv_null_fb:
|
||||
break;
|
||||
default:
|
||||
case omp_atv_abort_fb:
|
||||
gomp_fatal ("Out of memory allocating %lu bytes",
|
||||
(unsigned long) size);
|
||||
case omp_atv_allocator_fb:
|
||||
allocator = allocator_data->fb_data;
|
||||
goto retry;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
@ -644,6 +681,7 @@ void
|
|||
omp_free (void *ptr, omp_allocator_handle_t allocator)
|
||||
{
|
||||
struct omp_mem_header *data;
|
||||
omp_memspace_handle_t memspace = omp_default_mem_space;
|
||||
|
||||
if (ptr == NULL)
|
||||
return;
|
||||
|
@ -683,10 +721,12 @@ omp_free (void *ptr, omp_allocator_handle_t allocator)
|
|||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
memspace = allocator_data->memspace;
|
||||
}
|
||||
#ifdef LIBGOMP_USE_MEMKIND
|
||||
else
|
||||
{
|
||||
#ifdef LIBGOMP_USE_MEMKIND
|
||||
enum gomp_numa_memkind_kind memkind = GOMP_MEMKIND_NONE;
|
||||
if (data->allocator == omp_high_bw_mem_alloc)
|
||||
memkind = GOMP_MEMKIND_HBW_PREFERRED;
|
||||
|
@ -702,9 +742,12 @@ omp_free (void *ptr, omp_allocator_handle_t allocator)
|
|||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
free (data->ptr);
|
||||
|
||||
memspace = predefined_alloc_mapping[data->allocator];
|
||||
}
|
||||
|
||||
MEMSPACE_FREE (memspace, data->ptr, data->size);
|
||||
}
|
||||
|
||||
ialias (omp_free)
|
||||
|
@ -831,7 +874,7 @@ retry:
|
|||
}
|
||||
else
|
||||
#endif
|
||||
ptr = calloc (1, new_size);
|
||||
ptr = MEMSPACE_CALLOC (allocator_data->memspace, new_size);
|
||||
if (ptr == NULL)
|
||||
{
|
||||
#ifdef HAVE_SYNC_BUILTINS
|
||||
|
@ -865,7 +908,13 @@ retry:
|
|||
}
|
||||
else
|
||||
#endif
|
||||
ptr = calloc (1, new_size);
|
||||
{
|
||||
omp_memspace_handle_t memspace;
|
||||
memspace = (allocator_data
|
||||
? allocator_data->memspace
|
||||
: predefined_alloc_mapping[allocator]);
|
||||
ptr = MEMSPACE_CALLOC (memspace, new_size);
|
||||
}
|
||||
if (ptr == NULL)
|
||||
goto fail;
|
||||
}
|
||||
|
@ -882,36 +931,26 @@ retry:
|
|||
((struct omp_mem_header *) ret)[-1].allocator = allocator;
|
||||
return ret;
|
||||
|
||||
fail:
|
||||
if (allocator_data)
|
||||
fail:;
|
||||
int fallback = (allocator_data
|
||||
? allocator_data->fallback
|
||||
: allocator == omp_default_mem_alloc
|
||||
? omp_atv_null_fb
|
||||
: omp_atv_default_mem_fb);
|
||||
switch (fallback)
|
||||
{
|
||||
switch (allocator_data->fallback)
|
||||
{
|
||||
case omp_atv_default_mem_fb:
|
||||
if ((new_alignment > sizeof (void *) && new_alignment > alignment)
|
||||
#if defined(LIBGOMP_USE_MEMKIND) || defined(LIBGOMP_USE_LIBNUMA)
|
||||
|| memkind
|
||||
#endif
|
||||
|| (allocator_data
|
||||
&& allocator_data->pool_size < ~(uintptr_t) 0))
|
||||
{
|
||||
allocator = omp_default_mem_alloc;
|
||||
goto retry;
|
||||
}
|
||||
/* Otherwise, we've already performed default mem allocation
|
||||
and if that failed, it won't succeed again (unless it was
|
||||
intermittent. Return NULL then, as that is the fallback. */
|
||||
break;
|
||||
case omp_atv_null_fb:
|
||||
break;
|
||||
default:
|
||||
case omp_atv_abort_fb:
|
||||
gomp_fatal ("Out of memory allocating %lu bytes",
|
||||
(unsigned long) (size * nmemb));
|
||||
case omp_atv_allocator_fb:
|
||||
allocator = allocator_data->fb_data;
|
||||
goto retry;
|
||||
}
|
||||
case omp_atv_default_mem_fb:
|
||||
allocator = omp_default_mem_alloc;
|
||||
goto retry;
|
||||
case omp_atv_null_fb:
|
||||
break;
|
||||
default:
|
||||
case omp_atv_abort_fb:
|
||||
gomp_fatal ("Out of memory allocating %lu bytes",
|
||||
(unsigned long) (size * nmemb));
|
||||
case omp_atv_allocator_fb:
|
||||
allocator = allocator_data->fb_data;
|
||||
goto retry;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
@ -1101,9 +1140,10 @@ retry:
|
|||
else
|
||||
#endif
|
||||
if (prev_size)
|
||||
new_ptr = realloc (data->ptr, new_size);
|
||||
new_ptr = MEMSPACE_REALLOC (allocator_data->memspace, data->ptr,
|
||||
data->size, new_size);
|
||||
else
|
||||
new_ptr = malloc (new_size);
|
||||
new_ptr = MEMSPACE_ALLOC (allocator_data->memspace, new_size);
|
||||
if (new_ptr == NULL)
|
||||
{
|
||||
#ifdef HAVE_SYNC_BUILTINS
|
||||
|
@ -1151,7 +1191,13 @@ retry:
|
|||
}
|
||||
else
|
||||
#endif
|
||||
new_ptr = realloc (data->ptr, new_size);
|
||||
{
|
||||
omp_memspace_handle_t memspace;
|
||||
memspace = (allocator_data
|
||||
? allocator_data->memspace
|
||||
: predefined_alloc_mapping[allocator]);
|
||||
new_ptr = MEMSPACE_REALLOC (memspace, data->ptr, data->size, new_size);
|
||||
}
|
||||
if (new_ptr == NULL)
|
||||
goto fail;
|
||||
ret = (char *) new_ptr + sizeof (struct omp_mem_header);
|
||||
|
@ -1178,7 +1224,13 @@ retry:
|
|||
}
|
||||
else
|
||||
#endif
|
||||
new_ptr = malloc (new_size);
|
||||
{
|
||||
omp_memspace_handle_t memspace;
|
||||
memspace = (allocator_data
|
||||
? allocator_data->memspace
|
||||
: predefined_alloc_mapping[allocator]);
|
||||
new_ptr = MEMSPACE_ALLOC (memspace, new_size);
|
||||
}
|
||||
if (new_ptr == NULL)
|
||||
goto fail;
|
||||
}
|
||||
|
@ -1227,39 +1279,35 @@ retry:
|
|||
return ret;
|
||||
}
|
||||
#endif
|
||||
free (data->ptr);
|
||||
{
|
||||
omp_memspace_handle_t was_memspace;
|
||||
was_memspace = (free_allocator_data
|
||||
? free_allocator_data->memspace
|
||||
: predefined_alloc_mapping[free_allocator]);
|
||||
MEMSPACE_FREE (was_memspace, data->ptr, data->size);
|
||||
}
|
||||
return ret;
|
||||
|
||||
fail:
|
||||
if (allocator_data)
|
||||
fail:;
|
||||
int fallback = (allocator_data
|
||||
? allocator_data->fallback
|
||||
: allocator == omp_default_mem_alloc
|
||||
? omp_atv_null_fb
|
||||
: omp_atv_default_mem_fb);
|
||||
switch (fallback)
|
||||
{
|
||||
switch (allocator_data->fallback)
|
||||
{
|
||||
case omp_atv_default_mem_fb:
|
||||
if (new_alignment > sizeof (void *)
|
||||
#if defined(LIBGOMP_USE_MEMKIND) || defined(LIBGOMP_USE_LIBNUMA)
|
||||
|| memkind
|
||||
#endif
|
||||
|| (allocator_data
|
||||
&& allocator_data->pool_size < ~(uintptr_t) 0))
|
||||
{
|
||||
allocator = omp_default_mem_alloc;
|
||||
goto retry;
|
||||
}
|
||||
/* Otherwise, we've already performed default mem allocation
|
||||
and if that failed, it won't succeed again (unless it was
|
||||
intermittent. Return NULL then, as that is the fallback. */
|
||||
break;
|
||||
case omp_atv_null_fb:
|
||||
break;
|
||||
default:
|
||||
case omp_atv_abort_fb:
|
||||
gomp_fatal ("Out of memory allocating %lu bytes",
|
||||
(unsigned long) size);
|
||||
case omp_atv_allocator_fb:
|
||||
allocator = allocator_data->fb_data;
|
||||
goto retry;
|
||||
}
|
||||
case omp_atv_default_mem_fb:
|
||||
allocator = omp_default_mem_alloc;
|
||||
goto retry;
|
||||
case omp_atv_null_fb:
|
||||
break;
|
||||
default:
|
||||
case omp_atv_abort_fb:
|
||||
gomp_fatal ("Out of memory allocating %lu bytes",
|
||||
(unsigned long) size);
|
||||
case omp_atv_allocator_fb:
|
||||
allocator = allocator_data->fb_data;
|
||||
goto retry;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
|
382
libgomp/basic-allocator.c
Normal file
382
libgomp/basic-allocator.c
Normal file
|
@ -0,0 +1,382 @@
|
|||
/* Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
|
||||
This file is part of the GNU Offloading and Multi Processing Library
|
||||
(libgomp).
|
||||
|
||||
Libgomp is free software; you can redistribute it and/or modify it
|
||||
under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 3, or (at your option)
|
||||
any later version.
|
||||
|
||||
Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
|
||||
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
||||
FOR A PARTICULAR PURPOSE. See the GNU General Public License for
|
||||
more details.
|
||||
|
||||
Under Section 7 of GPL version 3, you are granted additional
|
||||
permissions described in the GCC Runtime Library Exception, version
|
||||
3.1, as published by the Free Software Foundation.
|
||||
|
||||
You should have received a copy of the GNU General Public License and
|
||||
a copy of the GCC Runtime Library Exception along with this program;
|
||||
see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
|
||||
<http://www.gnu.org/licenses/>. */
|
||||
|
||||
/* This is a basic "malloc" implementation intended for use with small,
|
||||
low-latency memories.
|
||||
|
||||
To use this template, define BASIC_ALLOC_PREFIX, and then #include the
|
||||
source file. The other configuration macros are optional.
|
||||
|
||||
The root heap descriptor is stored in the first bytes of the heap, and each
|
||||
free chunk contains a similar descriptor for the next free chunk in the
|
||||
chain.
|
||||
|
||||
The descriptor is two values: offset and size, which describe the
|
||||
location of a chunk of memory available for allocation. The offset is
|
||||
relative to the base of the heap. The special offset value 0xffffffff
|
||||
indicates that the heap (free chain) is locked. The offset and size are
|
||||
32-bit values so the base alignment can be 8-bytes.
|
||||
|
||||
Memory is allocated to the first free chunk that fits. The free chain
|
||||
is always stored in order of the offset to assist coalescing adjacent
|
||||
chunks. */
|
||||
|
||||
#include "libgomp.h"
|
||||
|
||||
#ifndef BASIC_ALLOC_PREFIX
|
||||
#error "BASIC_ALLOC_PREFIX not defined."
|
||||
#endif
|
||||
|
||||
#ifndef BASIC_ALLOC_YIELD
|
||||
#define BASIC_ALLOC_YIELD
|
||||
#endif
|
||||
|
||||
#define ALIGN(VAR) (((VAR) + 7) & ~7) /* 8-byte granularity. */
|
||||
|
||||
#define fn1(prefix, name) prefix ## _ ## name
|
||||
#define fn(prefix, name) fn1 (prefix, name)
|
||||
#define basic_alloc_init fn(BASIC_ALLOC_PREFIX,init)
|
||||
#define basic_alloc_alloc fn(BASIC_ALLOC_PREFIX,alloc)
|
||||
#define basic_alloc_calloc fn(BASIC_ALLOC_PREFIX,calloc)
|
||||
#define basic_alloc_free fn(BASIC_ALLOC_PREFIX,free)
|
||||
#define basic_alloc_realloc fn(BASIC_ALLOC_PREFIX,realloc)
|
||||
|
||||
typedef struct {
|
||||
uint32_t offset;
|
||||
uint32_t size;
|
||||
} heapdesc;
|
||||
|
||||
void
|
||||
basic_alloc_init (char *heap, size_t limit)
|
||||
{
|
||||
if (heap == NULL)
|
||||
return;
|
||||
|
||||
/* Initialize the head of the free chain. */
|
||||
heapdesc *root = (heapdesc *) heap;
|
||||
root->offset = ALIGN(1);
|
||||
root->size = limit - root->offset;
|
||||
|
||||
/* And terminate the chain. */
|
||||
heapdesc *next = (heapdesc *) (heap + root->offset);
|
||||
next->offset = 0;
|
||||
next->size = 0;
|
||||
}
|
||||
|
||||
/* Allocate SIZE bytes from the free chain rooted at HEAP, using a
   first-fit walk of the chain.  Returns a pointer to the chunk, or
   NULL if HEAP is NULL or no free chunk is large enough.  The whole
   operation happens under the heap lock (root offset 0xffffffff).  */
static void *
basic_alloc_alloc (char *heap, size_t size)
{
  if (heap == NULL)
    return NULL;

  /* Memory is allocated in N-byte granularity.  */
  size = ALIGN (size);

  /* Acquire a lock on the low-latency heap: atomically swap the magic
     "locked" offset into the root and spin until we observe the real
     offset (i.e. nobody else held the lock).  */
  heapdesc root, *root_ptr = (heapdesc *) heap;
  do
    {
      root.offset = __atomic_exchange_n (&root_ptr->offset, 0xffffffff,
					 MEMMODEL_ACQUIRE);
      if (root.offset != 0xffffffff)
	{
	  /* Lock taken; root.size is only valid once we hold the lock.  */
	  root.size = root_ptr->size;
	  break;
	}
      /* Spin.  */
      BASIC_ALLOC_YIELD;
    }
  while (1);

  /* Walk the free chain until a chunk of at least SIZE bytes is found;
     a chunk of size 0 is the chain terminator.  */
  heapdesc chunk = root;
  heapdesc *prev_chunkptr = NULL;
  heapdesc *chunkptr = (heapdesc *) (heap + chunk.offset);
  heapdesc onward_chain = *chunkptr;
  while (chunk.size != 0 && (uint32_t) size > chunk.size)
    {
      chunk = onward_chain;
      prev_chunkptr = chunkptr;
      chunkptr = (heapdesc *) (heap + chunk.offset);
      onward_chain = *chunkptr;
    }

  void *result = NULL;
  if (chunk.size != 0)
    {
      /* Allocation successful.  */
      result = chunkptr;

      /* Update the free chain: the tail of the chosen chunk (possibly
	 empty) remains free.  */
      heapdesc stillfree = chunk;
      stillfree.offset += size;
      stillfree.size -= size;
      heapdesc *stillfreeptr = (heapdesc *) (heap + stillfree.offset);

      if (stillfree.size == 0)
	/* The whole chunk was used.  */
	stillfree = onward_chain;
      else
	/* The chunk was split, so restore the onward chain.  */
	*stillfreeptr = onward_chain;

      /* The previous free slot or root now points to stillfree.  */
      if (prev_chunkptr)
	*prev_chunkptr = stillfree;
      else
	root = stillfree;
    }

  /* Update the free chain root and release the lock (storing a real
     offset value unlocks the heap).  */
  root_ptr->size = root.size;
  __atomic_store_n (&root_ptr->offset, root.offset, MEMMODEL_RELEASE);

  return result;
}
|
||||
|
||||
/* Allocate SIZE bytes from HEAP and zero them.  Returns NULL when the
   underlying allocation fails.  */
static void *
basic_alloc_calloc (char *heap, size_t size)
{
  /* Round up to the allocation granularity first, so the region we
     clear matches exactly what basic_alloc_alloc hands back.  */
  size = ALIGN (size);

  uint64_t *block = basic_alloc_alloc (heap, size);
  if (block != NULL)
    {
      /* SIZE is now a multiple of 8; zero it one 64-bit word at a
	 time instead of calling memset.  */
      unsigned nwords = (unsigned) size / 8;
      for (unsigned w = 0; w < nwords; w++)
	block[w] = 0;
    }

  return block;
}
|
||||
|
||||
/* Return the SIZE-byte chunk at ADDR to the free chain rooted at HEAP.
   The chain is kept sorted by offset, and the freed chunk is coalesced
   with the adjacent free chunk before and/or after it, when present.
   SIZE must match the (aligned) size passed to basic_alloc_alloc.  */
static void
basic_alloc_free (char *heap, void *addr, size_t size)
{
  /* Memory is allocated in N-byte granularity.  */
  size = ALIGN (size);

  /* Acquire a lock on the low-latency heap (offset 0xffffffff means
     locked; spin until we swap in the lock and read a real offset).  */
  heapdesc root, *root_ptr = (heapdesc *) heap;
  do
    {
      root.offset = __atomic_exchange_n (&root_ptr->offset, 0xffffffff,
					 MEMMODEL_ACQUIRE);
      if (root.offset != 0xffffffff)
	{
	  root.size = root_ptr->size;
	  break;
	}
      /* Spin.  */
      BASIC_ALLOC_YIELD;
    }
  while (1);

  /* Walk the free chain to find where to insert a new entry.  We track
     two predecessors so that coalescing with the preceding chunk can
     re-link from the chunk before it.  */
  heapdesc chunk = root, prev_chunk = {0};
  heapdesc *prev_chunkptr = NULL, *prevprev_chunkptr = NULL;
  heapdesc *chunkptr = (heapdesc *) (heap + chunk.offset);
  heapdesc onward_chain = *chunkptr;
  while (chunk.size != 0 && addr > (void *) chunkptr)
    {
      prev_chunk = chunk;
      chunk = onward_chain;
      prevprev_chunkptr = prev_chunkptr;
      prev_chunkptr = chunkptr;
      chunkptr = (heapdesc *) (heap + chunk.offset);
      onward_chain = *chunkptr;
    }

  /* Create the new chunk descriptor.  */
  heapdesc newfreechunk;
  newfreechunk.offset = (uint32_t) ((uintptr_t) addr - (uintptr_t) heap);
  newfreechunk.size = (uint32_t) size;

  /* Coalesce adjacent free chunks.  */
  if (newfreechunk.offset + size == chunk.offset)
    {
      /* Free chunk follows: absorb it and skip to its successor.  */
      newfreechunk.size += chunk.size;
      chunk = onward_chain;
    }
  if (prev_chunkptr)
    {
      if (prev_chunk.offset + prev_chunk.size
	  == newfreechunk.offset)
	{
	  /* Free chunk precedes: grow it to cover the freed region and
	     write the merged descriptor at its start.  */
	  newfreechunk.offset = prev_chunk.offset;
	  newfreechunk.size += prev_chunk.size;
	  addr = heap + prev_chunk.offset;
	  prev_chunkptr = prevprev_chunkptr;
	}
    }

  /* Update the free chain in the new and previous chunks.  */
  *(heapdesc *) addr = chunk;
  if (prev_chunkptr)
    *prev_chunkptr = newfreechunk;
  else
    root = newfreechunk;

  /* Update the free chain root and release the lock.  */
  root_ptr->size = root.size;
  __atomic_store_n (&root_ptr->offset, root.offset, MEMMODEL_RELEASE);

}
|
||||
|
||||
static void *
|
||||
basic_alloc_realloc (char *heap, void *addr, size_t oldsize,
|
||||
size_t size)
|
||||
{
|
||||
/* Memory is allocated in N-byte granularity. */
|
||||
oldsize = ALIGN (oldsize);
|
||||
size = ALIGN (size);
|
||||
|
||||
if (oldsize == size)
|
||||
return addr;
|
||||
|
||||
/* Acquire a lock on the low-latency heap. */
|
||||
heapdesc root, *root_ptr = (heapdesc *) heap;
|
||||
do
|
||||
{
|
||||
root.offset = __atomic_exchange_n (&root_ptr->offset, 0xffffffff,
|
||||
MEMMODEL_ACQUIRE);
|
||||
if (root.offset != 0xffffffff)
|
||||
{
|
||||
root.size = root_ptr->size;
|
||||
break;
|
||||
}
|
||||
/* Spin. */
|
||||
BASIC_ALLOC_YIELD;
|
||||
}
|
||||
while (1);
|
||||
|
||||
/* Walk the free chain. */
|
||||
heapdesc chunk = root;
|
||||
heapdesc *prev_chunkptr = NULL;
|
||||
heapdesc *chunkptr = (heapdesc *) (heap + chunk.offset);
|
||||
heapdesc onward_chain = *chunkptr;
|
||||
while (chunk.size != 0 && (void *) chunkptr < addr)
|
||||
{
|
||||
chunk = onward_chain;
|
||||
prev_chunkptr = chunkptr;
|
||||
chunkptr = (heapdesc *) (heap + chunk.offset);
|
||||
onward_chain = *chunkptr;
|
||||
}
|
||||
|
||||
void *result = NULL;
|
||||
if (size < oldsize)
|
||||
{
|
||||
/* The new allocation is smaller than the old; we can always
|
||||
shrink an allocation in place. */
|
||||
result = addr;
|
||||
|
||||
heapdesc *nowfreeptr = (heapdesc *) (addr + size);
|
||||
|
||||
/* Update the free chain. */
|
||||
heapdesc nowfree;
|
||||
nowfree.offset = (char *) nowfreeptr - heap;
|
||||
nowfree.size = oldsize - size;
|
||||
|
||||
if (nowfree.offset + size == chunk.offset)
|
||||
{
|
||||
/* Coalesce following free chunk. */
|
||||
nowfree.size += chunk.size;
|
||||
*nowfreeptr = onward_chain;
|
||||
}
|
||||
else
|
||||
*nowfreeptr = chunk;
|
||||
|
||||
/* The previous free slot or root now points to nowfree. */
|
||||
if (prev_chunkptr)
|
||||
*prev_chunkptr = nowfree;
|
||||
else
|
||||
root = nowfree;
|
||||
}
|
||||
else if (chunk.size != 0
|
||||
&& (char *) addr + oldsize == (char *) chunkptr
|
||||
&& chunk.size >= size-oldsize)
|
||||
{
|
||||
/* The new allocation is larger than the old, and we found a
|
||||
large enough free block right after the existing block,
|
||||
so we extend into that space. */
|
||||
result = addr;
|
||||
|
||||
uint32_t delta = size-oldsize;
|
||||
|
||||
/* Update the free chain. */
|
||||
heapdesc stillfree = chunk;
|
||||
stillfree.offset += delta;
|
||||
stillfree.size -= delta;
|
||||
heapdesc *stillfreeptr = (heapdesc *) (heap + stillfree.offset);
|
||||
|
||||
if (stillfree.size == 0)
|
||||
/* The whole chunk was used. */
|
||||
stillfree = onward_chain;
|
||||
else
|
||||
/* The chunk was split, so restore the onward chain. */
|
||||
*stillfreeptr = onward_chain;
|
||||
|
||||
/* The previous free slot or root now points to stillfree. */
|
||||
if (prev_chunkptr)
|
||||
*prev_chunkptr = stillfree;
|
||||
else
|
||||
root = stillfree;
|
||||
}
|
||||
/* Else realloc in-place has failed and result remains NULL. */
|
||||
|
||||
/* Update the free chain root and release the lock. */
|
||||
root_ptr->size = root.size;
|
||||
__atomic_store_n (&root_ptr->offset, root.offset, MEMMODEL_RELEASE);
|
||||
|
||||
if (result == NULL)
|
||||
{
|
||||
/* The allocation could not be extended in place, so we simply
|
||||
allocate fresh memory and move the data. If we can't allocate
|
||||
from low-latency memory then we leave the original alloaction
|
||||
intact and return NULL.
|
||||
We could do a fall-back to main memory, but we don't know what
|
||||
the fall-back trait said to do. */
|
||||
result = basic_alloc_alloc (heap, size);
|
||||
if (result != NULL)
|
||||
{
|
||||
/* Inline memcpy in which we know oldsize is a multiple of 8. */
|
||||
uint64_t *from = addr, *to = result;
|
||||
for (unsigned i = 0; i < (unsigned) oldsize / 8; i++)
|
||||
to[i] = from[i];
|
||||
|
||||
basic_alloc_free (heap, addr, oldsize);
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
#undef ALIGN
|
||||
#undef fn1
|
||||
#undef fn
|
||||
#undef basic_alloc_init
|
||||
#undef basic_alloc_alloc
|
||||
#undef basic_alloc_free
|
||||
#undef basic_alloc_realloc
|
120
libgomp/config/nvptx/allocator.c
Normal file
120
libgomp/config/nvptx/allocator.c
Normal file
|
@ -0,0 +1,120 @@
|
|||
/* Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
|
||||
This file is part of the GNU Offloading and Multi Processing Library
|
||||
(libgomp).
|
||||
|
||||
Libgomp is free software; you can redistribute it and/or modify it
|
||||
under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 3, or (at your option)
|
||||
any later version.
|
||||
|
||||
Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
|
||||
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
||||
FOR A PARTICULAR PURPOSE. See the GNU General Public License for
|
||||
more details.
|
||||
|
||||
Under Section 7 of GPL version 3, you are granted additional
|
||||
permissions described in the GCC Runtime Library Exception, version
|
||||
3.1, as published by the Free Software Foundation.
|
||||
|
||||
You should have received a copy of the GNU General Public License and
|
||||
a copy of the GCC Runtime Library Exception along with this program;
|
||||
see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
|
||||
<http://www.gnu.org/licenses/>. */
|
||||
|
||||
/* The low-latency allocators use space reserved in .shared memory when the
|
||||
kernel is launched. The heap is initialized in gomp_nvptx_main and all
|
||||
allocations are forgotten when the kernel exits. Allocations to other
|
||||
memory spaces all use the system malloc syscall.
|
||||
|
||||
The root heap descriptor is stored elsewhere in shared memory, and each
|
||||
free chunk contains a similar descriptor for the next free chunk in the
|
||||
chain.
|
||||
|
||||
The descriptor is two 16-bit values: offset and size, which describe the
|
||||
location of a chunk of memory available for allocation. The offset is
|
||||
relative to the base of the heap. The special value 0xffff, 0xffff
|
||||
indicates that the heap is locked. The descriptor is encoded into a
|
||||
single 32-bit integer so that it may be easily accessed atomically.
|
||||
|
||||
Memory is allocated to the first free chunk that fits. The free chain
|
||||
is always stored in order of the offset to assist coalescing adjacent
|
||||
chunks. */
|
||||
|
||||
#include "libgomp.h"
|
||||
#include <stdlib.h>
|
||||
|
||||
#define BASIC_ALLOC_PREFIX __nvptx_lowlat
|
||||
#include "../../basic-allocator.c"
|
||||
|
||||
/* There should be some .shared space reserved for us. There's no way to
|
||||
express this magic extern sizeless array in C so use asm. */
|
||||
asm (".extern .shared .u8 __nvptx_lowlat_pool[];\n");
|
||||
|
||||
static void *
|
||||
nvptx_memspace_alloc (omp_memspace_handle_t memspace, size_t size)
|
||||
{
|
||||
if (memspace == omp_low_lat_mem_space)
|
||||
{
|
||||
char *shared_pool;
|
||||
asm ("cvta.shared.u64\t%0, __nvptx_lowlat_pool;" : "=r" (shared_pool));
|
||||
|
||||
return __nvptx_lowlat_alloc (shared_pool, size);
|
||||
}
|
||||
else
|
||||
return malloc (size);
|
||||
}
|
||||
|
||||
static void *
|
||||
nvptx_memspace_calloc (omp_memspace_handle_t memspace, size_t size)
|
||||
{
|
||||
if (memspace == omp_low_lat_mem_space)
|
||||
{
|
||||
char *shared_pool;
|
||||
asm ("cvta.shared.u64\t%0, __nvptx_lowlat_pool;" : "=r" (shared_pool));
|
||||
|
||||
return __nvptx_lowlat_calloc (shared_pool, size);
|
||||
}
|
||||
else
|
||||
return calloc (1, size);
|
||||
}
|
||||
|
||||
static void
|
||||
nvptx_memspace_free (omp_memspace_handle_t memspace, void *addr, size_t size)
|
||||
{
|
||||
if (memspace == omp_low_lat_mem_space)
|
||||
{
|
||||
char *shared_pool;
|
||||
asm ("cvta.shared.u64\t%0, __nvptx_lowlat_pool;" : "=r" (shared_pool));
|
||||
|
||||
__nvptx_lowlat_free (shared_pool, addr, size);
|
||||
}
|
||||
else
|
||||
free (addr);
|
||||
}
|
||||
|
||||
static void *
|
||||
nvptx_memspace_realloc (omp_memspace_handle_t memspace, void *addr,
|
||||
size_t oldsize, size_t size)
|
||||
{
|
||||
if (memspace == omp_low_lat_mem_space)
|
||||
{
|
||||
char *shared_pool;
|
||||
asm ("cvta.shared.u64\t%0, __nvptx_lowlat_pool;" : "=r" (shared_pool));
|
||||
|
||||
return __nvptx_lowlat_realloc (shared_pool, addr, oldsize, size);
|
||||
}
|
||||
else
|
||||
return realloc (addr, size);
|
||||
}
|
||||
|
||||
#define MEMSPACE_ALLOC(MEMSPACE, SIZE) \
|
||||
nvptx_memspace_alloc (MEMSPACE, SIZE)
|
||||
#define MEMSPACE_CALLOC(MEMSPACE, SIZE) \
|
||||
nvptx_memspace_calloc (MEMSPACE, SIZE)
|
||||
#define MEMSPACE_REALLOC(MEMSPACE, ADDR, OLDSIZE, SIZE) \
|
||||
nvptx_memspace_realloc (MEMSPACE, ADDR, OLDSIZE, SIZE)
|
||||
#define MEMSPACE_FREE(MEMSPACE, ADDR, SIZE) \
|
||||
nvptx_memspace_free (MEMSPACE, ADDR, SIZE)
|
||||
|
||||
#include "../../allocator.c"
|
|
@ -37,6 +37,12 @@ int __gomp_team_num __attribute__((shared,nocommon));
|
|||
static void gomp_thread_start (struct gomp_thread_pool *);
|
||||
extern void build_indirect_map (void);
|
||||
|
||||
/* There should be some .shared space reserved for us. There's no way to
|
||||
express this magic extern sizeless array in C so use asm. */
|
||||
asm (".extern .shared .u8 __nvptx_lowlat_pool[];\n");
|
||||
|
||||
/* Defined in basic-allocator.c via config/nvptx/allocator.c. */
|
||||
void __nvptx_lowlat_init (void *heap, size_t size);
|
||||
|
||||
/* This externally visible function handles target region entry. It
|
||||
sets up a per-team thread pool and transfers control by calling FN (FN_DATA)
|
||||
|
@ -68,6 +74,18 @@ gomp_nvptx_main (void (*fn) (void *), void *fn_data)
|
|||
nvptx_thrs = alloca (ntids * sizeof (*nvptx_thrs));
|
||||
memset (nvptx_thrs, 0, ntids * sizeof (*nvptx_thrs));
|
||||
|
||||
/* Find the low-latency heap details .... */
|
||||
uint32_t *shared_pool;
|
||||
uint32_t shared_pool_size = 0;
|
||||
asm ("cvta.shared.u64\t%0, __nvptx_lowlat_pool;" : "=r"(shared_pool));
|
||||
#if __PTX_ISA_VERSION_MAJOR__ > 4 \
|
||||
|| (__PTX_ISA_VERSION_MAJOR__ == 4 && __PTX_ISA_VERSION_MINOR__ >= 1)
|
||||
asm ("mov.u32\t%0, %%dynamic_smem_size;\n"
|
||||
: "=r"(shared_pool_size));
|
||||
#endif
|
||||
__nvptx_lowlat_init (shared_pool, shared_pool_size);
|
||||
|
||||
/* Initialize the thread pool. */
|
||||
struct gomp_thread_pool *pool = alloca (sizeof (*pool));
|
||||
pool->threads = alloca (ntids * sizeof (*pool->threads));
|
||||
for (tid = 0; tid < ntids; tid++)
|
||||
|
|
|
@ -3012,9 +3012,9 @@ value.
|
|||
@item omp_const_mem_alloc @tab omp_const_mem_space
|
||||
@item omp_high_bw_mem_alloc @tab omp_high_bw_mem_space
|
||||
@item omp_low_lat_mem_alloc @tab omp_low_lat_mem_space
|
||||
@item omp_cgroup_mem_alloc @tab --
|
||||
@item omp_pteam_mem_alloc @tab --
|
||||
@item omp_thread_mem_alloc @tab --
|
||||
@item omp_cgroup_mem_alloc @tab omp_low_lat_mem_space (implementation defined)
|
||||
@item omp_pteam_mem_alloc @tab omp_low_lat_mem_space (implementation defined)
|
||||
@item omp_thread_mem_alloc @tab omp_low_lat_mem_space (implementation defined)
|
||||
@end multitable
|
||||
|
||||
The predefined allocators use the default values for the traits,
|
||||
|
@ -3060,7 +3060,7 @@ OMP_ALLOCATOR=omp_low_lat_mem_space:pinned=true,partition=nearest
|
|||
|
||||
@item @emph{See also}:
|
||||
@ref{Memory allocation}, @ref{omp_get_default_allocator},
|
||||
@ref{omp_set_default_allocator}
|
||||
@ref{omp_set_default_allocator}, @ref{Offload-Target Specifics}
|
||||
|
||||
@item @emph{Reference}:
|
||||
@uref{https://www.openmp.org, OpenMP specification v5.0}, Section 6.21
|
||||
|
@ -5710,7 +5710,8 @@ For the memory spaces, the following applies:
|
|||
@itemize
|
||||
@item @code{omp_default_mem_space} is supported
|
||||
@item @code{omp_const_mem_space} maps to @code{omp_default_mem_space}
|
||||
@item @code{omp_low_lat_mem_space} maps to @code{omp_default_mem_space}
|
||||
@item @code{omp_low_lat_mem_space} is only available on supported devices,
|
||||
and maps to @code{omp_default_mem_space} otherwise.
|
||||
@item @code{omp_large_cap_mem_space} maps to @code{omp_default_mem_space},
|
||||
unless the memkind library is available
|
||||
@item @code{omp_high_bw_mem_space} maps to @code{omp_default_mem_space},
|
||||
|
|
|
@ -341,6 +341,11 @@ struct ptx_device
|
|||
|
||||
static struct ptx_device **ptx_devices;
|
||||
|
||||
/* OpenMP kernels reserve a small amount of ".shared" space for use by
|
||||
omp_alloc. The size is configured using GOMP_NVPTX_LOWLAT_POOL, but the
|
||||
default is set here. */
|
||||
static unsigned lowlat_pool_size = 8 * 1024;
|
||||
|
||||
static inline struct nvptx_thread *
|
||||
nvptx_thread (void)
|
||||
{
|
||||
|
@ -1219,6 +1224,22 @@ GOMP_OFFLOAD_init_device (int n)
|
|||
instantiated_devices++;
|
||||
}
|
||||
|
||||
const char *var_name = "GOMP_NVPTX_LOWLAT_POOL";
|
||||
const char *env_var = secure_getenv (var_name);
|
||||
notify_var (var_name, env_var);
|
||||
|
||||
if (env_var != NULL)
|
||||
{
|
||||
char *endptr;
|
||||
unsigned long val = strtoul (env_var, &endptr, 10);
|
||||
if (endptr == NULL || *endptr != '\0'
|
||||
|| errno == ERANGE || errno == EINVAL
|
||||
|| val > UINT_MAX)
|
||||
GOMP_PLUGIN_error ("Error parsing %s", var_name);
|
||||
else
|
||||
lowlat_pool_size = val;
|
||||
}
|
||||
|
||||
pthread_mutex_unlock (&ptx_dev_lock);
|
||||
|
||||
return dev != NULL;
|
||||
|
@ -2178,7 +2199,7 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
|
|||
" [(teams: %u), 1, 1] [(lanes: 32), (threads: %u), 1]\n",
|
||||
__FUNCTION__, fn_name, teams, threads);
|
||||
r = CUDA_CALL_NOCHECK (cuLaunchKernel, function, teams, 1, 1,
|
||||
32, threads, 1, 0, NULL, NULL, config);
|
||||
32, threads, 1, lowlat_pool_size, NULL, NULL, config);
|
||||
if (r != CUDA_SUCCESS)
|
||||
GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
|
||||
if (reverse_offload)
|
||||
|
|
56
libgomp/testsuite/libgomp.c/omp_alloc-1.c
Normal file
56
libgomp/testsuite/libgomp.c/omp_alloc-1.c
Normal file
|
@ -0,0 +1,56 @@
|
|||
/* { dg-do run } */
|
||||
|
||||
/* Test that omp_alloc returns usable memory. */
|
||||
|
||||
#include <omp.h>
|
||||
|
||||
#pragma omp requires dynamic_allocators
|
||||
|
||||
/* Allocate N ints via ALLOCATOR inside a target region, write a known
   pattern from a parallel region, verify it, and release the memory.  */
void
test (int n, omp_allocator_handle_t allocator)
{
#pragma omp target map(to:n) map(to:allocator)
  {
    int *buf = (int *) omp_alloc (n * sizeof (int), allocator);

#pragma omp parallel
    for (int idx = 0; idx < n; idx++)
      buf[idx] = idx;

    for (int idx = 0; idx < n; idx++)
      if (buf[idx] != idx)
	{
	  __builtin_printf ("data mismatch at %i\n", idx);
	  __builtin_abort ();
	}

    omp_free (buf, allocator);
  }
}
|
||||
|
||||
/* Exercise every predefined allocator twice: once with an allocation
   that fits in the low-latency pool, once with one that does not.  */
int
main ()
{
  omp_allocator_handle_t allocators[] = {
    omp_default_mem_alloc,
    omp_large_cap_mem_alloc,
    omp_const_mem_alloc,
    omp_high_bw_mem_alloc,
    omp_low_lat_mem_alloc,
    omp_cgroup_mem_alloc,
    omp_pteam_mem_alloc,
    omp_thread_mem_alloc,
  };
  int count = sizeof (allocators) / sizeof (allocators[0]);

  /* Smaller than the low-latency memory limit.  */
  for (int i = 0; i < count; i++)
    test (10, allocators[i]);

  /* Larger than the low-latency memory limit.  */
  for (int i = 0; i < count; i++)
    test (100000, allocators[i]);

  return 0;
}
|
64
libgomp/testsuite/libgomp.c/omp_alloc-2.c
Normal file
64
libgomp/testsuite/libgomp.c/omp_alloc-2.c
Normal file
|
@ -0,0 +1,64 @@
|
|||
/* { dg-do run } */
|
||||
|
||||
/* Test concurrent and repeated allocations. */
|
||||
|
||||
#include <omp.h>
|
||||
|
||||
#pragma omp requires dynamic_allocators
|
||||
|
||||
/* Allocate an array of N pointers, concurrently allocate a small block
   for each slot, verify the stored values, then free everything
   (concurrently for the small blocks).  */
void
test (int n, omp_allocator_handle_t allocator)
{
#pragma omp target map(to:n) map(to:allocator)
  {
    int **ptrs = (int **) omp_alloc (n * sizeof (int *), allocator);

#pragma omp parallel for
    for (int i = 0; i < n; i++)
      {
	/* Allocate 10x what is needed so the low-latency pool is
	   exhausted and its fall-back path gets exercised.  */
	ptrs[i] = omp_alloc (sizeof (int) * 10, allocator);
	ptrs[i][0] = i;
      }

    for (int i = 0; i < n; i++)
      if (ptrs[i][0] != i)
	{
	  __builtin_printf ("data mismatch at %i\n", i);
	  __builtin_abort ();
	}

#pragma omp parallel for
    for (int i = 0; i < n; i++)
      omp_free (ptrs[i], allocator);

    omp_free (ptrs, allocator);
  }
}
|
||||
|
||||
/* Exercise every predefined allocator twice: once comfortably within
   the low-latency pool, once exceeding it in aggregate.  */
int
main ()
{
  omp_allocator_handle_t allocators[] = {
    omp_default_mem_alloc,
    omp_large_cap_mem_alloc,
    omp_const_mem_alloc,
    omp_high_bw_mem_alloc,
    omp_low_lat_mem_alloc,
    omp_cgroup_mem_alloc,
    omp_pteam_mem_alloc,
    omp_thread_mem_alloc,
  };
  int count = sizeof (allocators) / sizeof (allocators[0]);

  /* Smaller than the low-latency memory limit.  */
  for (int i = 0; i < count; i++)
    test (10, allocators[i]);

  /* Larger than the low-latency memory limit (on aggregate).  */
  for (int i = 0; i < count; i++)
    test (1000, allocators[i]);

  return 0;
}
|
42
libgomp/testsuite/libgomp.c/omp_alloc-3.c
Normal file
42
libgomp/testsuite/libgomp.c/omp_alloc-3.c
Normal file
|
@ -0,0 +1,42 @@
|
|||
/* { dg-do run } */
|
||||
|
||||
/* Stress-test omp_alloc/omp_free under concurrency. */
|
||||
|
||||
#include <omp.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#pragma omp requires dynamic_allocators
|
||||
|
||||
#define N 1000
|
||||
|
||||
void
|
||||
test (omp_allocator_handle_t allocator)
|
||||
{
|
||||
#pragma omp target map(to:allocator)
|
||||
{
|
||||
#pragma omp parallel for
|
||||
for (int i = 0; i < N; i++)
|
||||
for (int j = 0; j < N; j++)
|
||||
{
|
||||
int *p = omp_alloc (sizeof (int), allocator);
|
||||
omp_free (p, allocator);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Run the stress test with each predefined allocator.  */
int
main ()
{
  omp_allocator_handle_t allocators[] = {
    omp_default_mem_alloc,
    omp_large_cap_mem_alloc,
    omp_const_mem_alloc,
    omp_high_bw_mem_alloc,
    omp_low_lat_mem_alloc,
    omp_cgroup_mem_alloc,
    omp_pteam_mem_alloc,
    omp_thread_mem_alloc,
  };
  int count = sizeof (allocators) / sizeof (allocators[0]);

  for (int i = 0; i < count; i++)
    test (allocators[i]);

  return 0;
}
|
199
libgomp/testsuite/libgomp.c/omp_alloc-4.c
Normal file
199
libgomp/testsuite/libgomp.c/omp_alloc-4.c
Normal file
|
@ -0,0 +1,199 @@
|
|||
/* { dg-do run } */
|
||||
|
||||
/* Test that low-latency free chains are sound. */
|
||||
|
||||
#include <stddef.h>
|
||||
#include <omp.h>
|
||||
|
||||
#pragma omp requires dynamic_allocators
|
||||
|
||||
/* Abort with message MSG if COND does not hold; no-op otherwise.  */
void
check (int cond, const char *msg)
{
  if (cond)
    return;
  __builtin_printf ("%s\n", msg);
  __builtin_abort ();
}
|
||||
|
||||
/* Verify the low-latency allocator's free-chain behavior: freed chunks
   are reused, adjacent free chunks coalesce (in forward, reverse, and
   mixed free order), and oversized chunks are split.  Every check is a
   pointer-identity test against the original allocation addresses.  */
int
main ()
{
  if (omp_get_initial_device () == omp_get_default_device ())
    return 0; /* This test isn't interesting with host-fallback. */

  #pragma omp target
  {
    /* Ensure that the memory we get *is* low-latency with a null-fallback. */
    omp_alloctrait_t traits[1]
      = { { omp_atk_fallback, omp_atv_null_fb } };
    omp_allocator_handle_t lowlat = omp_init_allocator (omp_low_lat_mem_space,
							1, traits);

    int size = 4;

    /* Four consecutive chunks from a fresh heap.  */
    char *a = omp_alloc (size, lowlat);
    char *b = omp_alloc (size, lowlat);
    char *c = omp_alloc (size, lowlat);
    char *d = omp_alloc (size, lowlat);

    /* There are headers and padding to account for. */
    int size2 = size + (b-a);
    int size3 = size + (c-a);
    int size4 = size + (d-a) + 100; /* Random larger amount. */

    check (a != NULL && b != NULL && c != NULL && d != NULL,
	   "omp_alloc returned NULL\n");

    /* Free-then-allocate of the same size must return the same chunk.  */
    omp_free (a, lowlat);
    char *p = omp_alloc (size, lowlat);
    check (p == a, "allocate did not reuse first chunk");

    omp_free (b, lowlat);
    p = omp_alloc (size, lowlat);
    check (p == b, "allocate did not reuse second chunk");

    omp_free (c, lowlat);
    p = omp_alloc (size, lowlat);
    check (p == c, "allocate did not reuse third chunk");

    /* Two adjacent free chunks must coalesce into one larger chunk,
       and a larger chunk must split back into two.  */
    omp_free (a, lowlat);
    omp_free (b, lowlat);
    p = omp_alloc (size2, lowlat);
    check (p == a, "allocate did not coalesce first two chunks");

    omp_free (p, lowlat);
    p = omp_alloc (size, lowlat);
    check (p == a, "allocate did not split first chunk (1)");
    p = omp_alloc (size, lowlat);
    check (p == b, "allocate did not split first chunk (2)");

    omp_free (b, lowlat);
    omp_free (c, lowlat);
    p = omp_alloc (size2, lowlat);
    check (p == b, "allocate did not coalesce middle two chunks");

    omp_free (p, lowlat);
    p = omp_alloc (size, lowlat);
    check (p == b, "allocate did not split second chunk (1)");
    p = omp_alloc (size, lowlat);
    check (p == c, "allocate did not split second chunk (2)");

    /* Same pair tests, freeing in reverse order.  */
    omp_free (b, lowlat);
    omp_free (a, lowlat);
    p = omp_alloc (size2, lowlat);
    check (p == a, "allocate did not coalesce first two chunks, reverse free");

    omp_free (p, lowlat);
    p = omp_alloc (size, lowlat);
    check (p == a, "allocate did not split first chunk (1), reverse free");
    p = omp_alloc (size, lowlat);
    check (p == b, "allocate did not split first chunk (2), reverse free");

    omp_free (c, lowlat);
    omp_free (b, lowlat);
    p = omp_alloc (size2, lowlat);
    check (p == b, "allocate did not coalesce second two chunks, reverse free");

    omp_free (p, lowlat);
    p = omp_alloc (size, lowlat);
    check (p == b, "allocate did not split second chunk (1), reverse free");
    p = omp_alloc (size, lowlat);
    check (p == c, "allocate did not split second chunk (2), reverse free");

    /* Three-chunk coalesce/split, forward free order.  */
    omp_free (a, lowlat);
    omp_free (b, lowlat);
    omp_free (c, lowlat);
    p = omp_alloc (size3, lowlat);
    check (p == a, "allocate did not coalesce first three chunks");

    omp_free (p, lowlat);
    p = omp_alloc (size, lowlat);
    check (p == a, "allocate did not split first chunk (1)");
    p = omp_alloc (size, lowlat);
    check (p == b, "allocate did not split first chunk (2)");
    p = omp_alloc (size, lowlat);
    check (p == c, "allocate did not split first chunk (3)");

    omp_free (b, lowlat);
    omp_free (c, lowlat);
    omp_free (d, lowlat);
    p = omp_alloc (size3, lowlat);
    check (p == b, "allocate did not coalesce last three chunks");

    omp_free (p, lowlat);
    p = omp_alloc (size, lowlat);
    check (p == b, "allocate did not split second chunk (1)");
    p = omp_alloc (size, lowlat);
    check (p == c, "allocate did not split second chunk (2)");
    p = omp_alloc (size, lowlat);
    check (p == d, "allocate did not split second chunk (3)");

    /* Three-chunk coalesce/split, reverse free order.  */
    omp_free (c, lowlat);
    omp_free (b, lowlat);
    omp_free (a, lowlat);
    p = omp_alloc (size3, lowlat);
    check (p == a, "allocate did not coalesce first three chunks, reverse free");

    omp_free (p, lowlat);
    p = omp_alloc (size, lowlat);
    check (p == a, "allocate did not split first chunk (1), reverse free");
    p = omp_alloc (size, lowlat);
    check (p == b, "allocate did not split first chunk (2), reverse free");
    p = omp_alloc (size, lowlat);
    check (p == c, "allocate did not split first chunk (3), reverse free");

    omp_free (d, lowlat);
    omp_free (c, lowlat);
    omp_free (b, lowlat);
    p = omp_alloc (size3, lowlat);
    check (p == b, "allocate did not coalesce second three chunks, reverse free");

    omp_free (p, lowlat);
    p = omp_alloc (size, lowlat);
    check (p == b, "allocate did not split second chunk (1), reverse free");
    p = omp_alloc (size, lowlat);
    check (p == c, "allocate did not split second chunk (2), reverse free");
    p = omp_alloc (size, lowlat);
    check (p == d, "allocate did not split second chunk (3), reverse free");

    /* Three-chunk coalesce/split, mixed free order.  */
    omp_free (c, lowlat);
    omp_free (a, lowlat);
    omp_free (b, lowlat);
    p = omp_alloc (size3, lowlat);
    check (p == a, "allocate did not coalesce first three chunks, mixed free");

    omp_free (p, lowlat);
    p = omp_alloc (size, lowlat);
    check (p == a, "allocate did not split first chunk (1), mixed free");
    p = omp_alloc (size, lowlat);
    check (p == b, "allocate did not split first chunk (2), mixed free");
    p = omp_alloc (size, lowlat);
    check (p == c, "allocate did not split first chunk (3), mixed free");

    omp_free (d, lowlat);
    omp_free (b, lowlat);
    omp_free (c, lowlat);
    p = omp_alloc (size3, lowlat);
    check (p == b, "allocate did not coalesce second three chunks, mixed free");

    omp_free (p, lowlat);
    p = omp_alloc (size, lowlat);
    check (p == b, "allocate did not split second chunk (1), mixed free");
    p = omp_alloc (size, lowlat);
    check (p == c, "allocate did not split second chunk (2), mixed free");
    p = omp_alloc (size, lowlat);
    check (p == d, "allocate did not split second chunk (3), mixed free");

    /* With everything free, one allocation can span the whole heap.  */
    omp_free (a, lowlat);
    omp_free (b, lowlat);
    omp_free (c, lowlat);
    omp_free (d, lowlat);
    p = omp_alloc (size4, lowlat);
    check (p == a, "allocate did not coalesce all memory");
  }

  return 0;
}
|
||||
|
63
libgomp/testsuite/libgomp.c/omp_alloc-5.c
Normal file
63
libgomp/testsuite/libgomp.c/omp_alloc-5.c
Normal file
|
@ -0,0 +1,63 @@
|
|||
/* { dg-do run } */
|
||||
|
||||
/* Test calloc with omp_alloc. */
|
||||
|
||||
#include <omp.h>
|
||||
|
||||
#pragma omp requires dynamic_allocators
|
||||
|
||||
/* Allocate N zero-initialized ints via omp_calloc, confirm the zeroing,
   then overwrite with a known pattern from a parallel region, verify,
   and release the memory.  */
void
test (int n, omp_allocator_handle_t allocator)
{
#pragma omp target map(to:n) map(to:allocator)
  {
    int *buf = (int *) omp_calloc (n, sizeof (int), allocator);

    for (int idx = 0; idx < n; idx++)
      if (buf[idx] != 0)
	{
	  __builtin_printf ("memory not zeroed at %i\n", idx);
	  __builtin_abort ();
	}

#pragma omp parallel
    for (int idx = 0; idx < n; idx++)
      buf[idx] = idx;

    for (int idx = 0; idx < n; idx++)
      if (buf[idx] != idx)
	{
	  __builtin_printf ("data mismatch at %i\n", idx);
	  __builtin_abort ();
	}

    omp_free (buf, allocator);
  }
}
|
||||
|
||||
/* Exercise omp_calloc with every predefined allocator, first within
   and then beyond the low-latency pool's capacity.  */
int
main ()
{
  omp_allocator_handle_t allocators[] = {
    omp_default_mem_alloc,
    omp_large_cap_mem_alloc,
    omp_const_mem_alloc,
    omp_high_bw_mem_alloc,
    omp_low_lat_mem_alloc,
    omp_cgroup_mem_alloc,
    omp_pteam_mem_alloc,
    omp_thread_mem_alloc,
  };
  int count = sizeof (allocators) / sizeof (allocators[0]);

  /* Smaller than the low-latency memory limit.  */
  for (int i = 0; i < count; i++)
    test (10, allocators[i]);

  /* Larger than the low-latency memory limit.  */
  for (int i = 0; i < count; i++)
    test (100000, allocators[i]);

  return 0;
}
|
120
libgomp/testsuite/libgomp.c/omp_alloc-6.c
Normal file
120
libgomp/testsuite/libgomp.c/omp_alloc-6.c
Normal file
|
@ -0,0 +1,120 @@
|
|||
/* { dg-do run } */
|
||||
|
||||
/* Test that low-latency realloc and free chains are sound. */
|
||||
|
||||
#include <stddef.h>
|
||||
#include <omp.h>
|
||||
|
||||
#pragma omp requires dynamic_allocators
|
||||
|
||||
/* Abort with message MSG if COND does not hold; no-op otherwise.  */
void
check (int cond, const char *msg)
{
  if (cond)
    return;
  __builtin_printf ("%s\n", msg);
  __builtin_abort ();
}
|
||||
|
||||
/* Verify omp_realloc against the low-latency allocator: shrinking and
   same-size reallocs stay in place; growth happens in place when the
   following chunk is free, otherwise the block moves; reallocs larger
   than the pool fail (NULL, thanks to the null fall-back trait).  The
   middle (b), first (a), and last (d) heap positions are each tested.  */
int
main ()
{
  if (omp_get_initial_device () == omp_get_default_device ())
    return 0; /* This test isn't interesting with host-fallback. */

  #pragma omp target
  {
    /* Ensure that the memory we get *is* low-latency with a null-fallback. */
    omp_alloctrait_t traits[1]
      = { { omp_atk_fallback, omp_atv_null_fb } };
    omp_allocator_handle_t lowlat = omp_init_allocator (omp_low_lat_mem_space,
							1, traits);

    int size = 16;

    /* Four consecutive chunks from a fresh heap.  */
    char *a = (char *) omp_alloc (size, lowlat);
    char *b = (char *) omp_alloc (size, lowlat);
    char *c = (char *) omp_alloc (size, lowlat);
    char *d = (char *) omp_alloc (size, lowlat);

    /* There are headers and padding to account for. */
    int size2 = size + (b-a);
    int size3 = size + (c-a);
    int size4 = size + (d-a) + 100; /* Random larger amount. */

    check (a != NULL && b != NULL && c != NULL && d != NULL,
	   "omp_alloc returned NULL\n");

    /* b is sandwiched between a and c: no room to grow in place.  */
    char *p = omp_realloc (b, size, lowlat, lowlat);
    check (p == b, "realloc did not reuse same size chunk, no space after");

    p = omp_realloc (b, size-8, lowlat, lowlat);
    check (p == b, "realloc did not reuse smaller chunk, no space after");

    p = omp_realloc (b, size, lowlat, lowlat);
    check (p == b, "realloc did not reuse original size chunk, no space after");

    /* Make space after b. */
    omp_free (c, lowlat);

    p = omp_realloc (b, size, lowlat, lowlat);
    check (p == b, "realloc did not reuse same size chunk");

    p = omp_realloc (b, size-8, lowlat, lowlat);
    check (p == b, "realloc did not reuse smaller chunk");

    p = omp_realloc (b, size, lowlat, lowlat);
    check (p == b, "realloc did not reuse original size chunk");

    p = omp_realloc (b, size+8, lowlat, lowlat);
    check (p == b, "realloc did not extend in place by a little");

    p = omp_realloc (b, size2, lowlat, lowlat);
    check (p == b, "realloc did not extend into whole next chunk");

    /* Growth beyond the free neighbor must relocate the block.  */
    p = omp_realloc (b, size3, lowlat, lowlat);
    check (p != b, "realloc did not move b elsewhere");
    omp_free (p, lowlat);


    /* a occupies the first position in the heap.  */
    p = omp_realloc (a, size, lowlat, lowlat);
    check (p == a, "realloc did not reuse same size chunk, first position");

    p = omp_realloc (a, size-8, lowlat, lowlat);
    check (p == a, "realloc did not reuse smaller chunk, first position");

    p = omp_realloc (a, size, lowlat, lowlat);
    check (p == a, "realloc did not reuse original size chunk, first position");

    p = omp_realloc (a, size+8, lowlat, lowlat);
    check (p == a, "realloc did not extend in place by a little, first position");

    p = omp_realloc (a, size3, lowlat, lowlat);
    check (p == a, "realloc did not extend into whole next chunk, first position");

    p = omp_realloc (a, size4, lowlat, lowlat);
    check (p != a, "realloc did not move a elsewhere, first position");
    omp_free (p, lowlat);


    /* d occupies the last position in the heap.  */
    p = omp_realloc (d, size, lowlat, lowlat);
    check (p == d, "realloc did not reuse same size chunk, last position");

    p = omp_realloc (d, size-8, lowlat, lowlat);
    check (p == d, "realloc did not reuse smaller chunk, last position");

    p = omp_realloc (d, size, lowlat, lowlat);
    check (p == d, "realloc did not reuse original size chunk, last position");

    p = omp_realloc (d, size+8, lowlat, lowlat);
    check (p == d, "realloc did not extend in place by d little, last position");

    /* Larger than low latency memory. */
    p = omp_realloc (d, 100000000, lowlat, lowlat);
    check (p == NULL, "realloc did not fail on OOM");
  }

  return 0;
}
|
||||
|
Loading…
Add table
Reference in a new issue