amdgcn, libgomp: low-latency allocator
This implements the OpenMP low-latency memory allocator for AMD GCN using the small per-team LDS memory (Local Data Store). Since addresses can now refer to LDS space, the "Global" address space is no longer compatible. This patch therefore switches the backend to use entirely "Flat" addressing (which supports both memories). A future patch will re-enable "global" instructions for cases where it is known to be safe to do so. gcc/ChangeLog: * config/gcn/gcn-builtins.def (DISPATCH_PTR): New built-in. * config/gcn/gcn.cc (gcn_init_machine_status): Disable global addressing. (gcn_expand_builtin_1): Implement GCN_BUILTIN_DISPATCH_PTR. libgomp/ChangeLog: * config/gcn/libgomp-gcn.h (TEAM_ARENA_START): Move to here. (TEAM_ARENA_FREE): Likewise. (TEAM_ARENA_END): Likewise. (GCN_LOWLAT_HEAP): New. * config/gcn/team.c (LITTLEENDIAN_CPU): New, and import hsa.h. (__gcn_lowlat_init): New prototype. (gomp_gcn_enter_kernel): Initialize the low-latency heap. * libgomp.h (TEAM_ARENA_START): Move to config/gcn/libgomp-gcn.h. (TEAM_ARENA_FREE): Likewise. (TEAM_ARENA_END): Likewise. * plugin/plugin-gcn.c (lowlat_size): New variable. (print_kernel_dispatch): Label the group_segment_size purpose. (init_environment_variables): Read GOMP_GCN_LOWLAT_POOL. (create_kernel_dispatch): Pass low-latency heap allocation to kernel. (run_kernel): Use shadow; don't assume values. * testsuite/libgomp.c/omp_alloc-traits.c: Enable for amdgcn. * config/gcn/allocator.c: New file. * libgomp.texi: Document low-latency implementation details.
This commit is contained in:
parent
e9a19ead49
commit
e7d6c277fa
9 changed files with 205 additions and 11 deletions
|
@ -164,6 +164,8 @@ DEF_BUILTIN (FIRST_CALL_THIS_THREAD_P, -1, "first_call_this_thread_p", B_INSN,
|
|||
_A1 (GCN_BTI_BOOL), gcn_expand_builtin_1)
|
||||
DEF_BUILTIN (KERNARG_PTR, -1, "kernarg_ptr", B_INSN, _A1 (GCN_BTI_VOIDPTR),
|
||||
gcn_expand_builtin_1)
|
||||
DEF_BUILTIN (DISPATCH_PTR, -1, "dispatch_ptr", B_INSN, _A1 (GCN_BTI_VOIDPTR),
|
||||
gcn_expand_builtin_1)
|
||||
DEF_BUILTIN (GET_STACK_LIMIT, -1, "get_stack_limit", B_INSN,
|
||||
_A1 (GCN_BTI_VOIDPTR), gcn_expand_builtin_1)
|
||||
|
||||
|
|
|
@ -110,7 +110,8 @@ gcn_init_machine_status (void)
|
|||
|
||||
f = ggc_cleared_alloc<machine_function> ();
|
||||
|
||||
if (TARGET_GCN3)
|
||||
// FIXME: re-enable global addressing with safety for LDS-flat addresses
|
||||
//if (TARGET_GCN3)
|
||||
f->use_flat_addressing = true;
|
||||
|
||||
return f;
|
||||
|
@ -4879,6 +4880,19 @@ gcn_expand_builtin_1 (tree exp, rtx target, rtx /*subtarget */ ,
|
|||
}
|
||||
return ptr;
|
||||
}
|
||||
case GCN_BUILTIN_DISPATCH_PTR:
|
||||
{
|
||||
rtx ptr;
|
||||
if (cfun->machine->args.reg[DISPATCH_PTR_ARG] >= 0)
|
||||
ptr = gen_rtx_REG (DImode,
|
||||
cfun->machine->args.reg[DISPATCH_PTR_ARG]);
|
||||
else
|
||||
{
|
||||
ptr = gen_reg_rtx (DImode);
|
||||
emit_move_insn (ptr, const0_rtx);
|
||||
}
|
||||
return ptr;
|
||||
}
|
||||
case GCN_BUILTIN_FIRST_CALL_THIS_THREAD_P:
|
||||
{
|
||||
/* Stash a marker in the unused upper 16 bits of s[0:1] to indicate
|
||||
|
|
127
libgomp/config/gcn/allocator.c
Normal file
127
libgomp/config/gcn/allocator.c
Normal file
|
@ -0,0 +1,127 @@
|
|||
/* Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
|
||||
This file is part of the GNU Offloading and Multi Processing Library
|
||||
(libgomp).
|
||||
|
||||
Libgomp is free software; you can redistribute it and/or modify it
|
||||
under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 3, or (at your option)
|
||||
any later version.
|
||||
|
||||
Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
|
||||
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
||||
FOR A PARTICULAR PURPOSE. See the GNU General Public License for
|
||||
more details.
|
||||
|
||||
Under Section 7 of GPL version 3, you are granted additional
|
||||
permissions described in the GCC Runtime Library Exception, version
|
||||
3.1, as published by the Free Software Foundation.
|
||||
|
||||
You should have received a copy of the GNU General Public License and
|
||||
a copy of the GCC Runtime Library Exception along with this program;
|
||||
see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
|
||||
<http://www.gnu.org/licenses/>. */
|
||||
|
||||
/* The low-latency allocators use space reserved in LDS memory when the
|
||||
kernel is launched. The heap is initialized in gomp_gcn_enter_kernel and
|
||||
all allocations are forgotten when the kernel exits. Allocations to other
|
||||
memory spaces all use the system malloc.
|
||||
|
||||
The pointers returned are 64-bit "Flat" addresses indistinguishable from
|
||||
regular pointers, but only compatible with the "flat_load/store"
|
||||
instructions. The compiler has been coded to assign default address
|
||||
spaces accordingly.
|
||||
|
||||
LDS memory is not visible to other teams, and therefore may only be used
|
||||
when the memspace access trait is set accordingly. */
|
||||
|
||||
#include "libgomp.h"
|
||||
#include <stdlib.h>
|
||||
|
||||
#define BASIC_ALLOC_PREFIX __gcn_lowlat
|
||||
#define BASIC_ALLOC_YIELD asm ("s_sleep 1" ::: "memory")
|
||||
#include "../../basic-allocator.c"
|
||||
|
||||
/* The low-latency heap is located in LDS memory, but we need the __flat
|
||||
address space for compatibility reasons. */
|
||||
#define FLAT_HEAP_PTR \
|
||||
((void *) (uintptr_t) (void __flat *) (void __lds *) GCN_LOWLAT_HEAP)
|
||||
|
||||
static void *
|
||||
gcn_memspace_alloc (omp_memspace_handle_t memspace, size_t size)
|
||||
{
|
||||
if (memspace == omp_low_lat_mem_space)
|
||||
{
|
||||
char *shared_pool = FLAT_HEAP_PTR;
|
||||
|
||||
return __gcn_lowlat_alloc (shared_pool, size);
|
||||
}
|
||||
else
|
||||
return malloc (size);
|
||||
}
|
||||
|
||||
static void *
|
||||
gcn_memspace_calloc (omp_memspace_handle_t memspace, size_t size)
|
||||
{
|
||||
if (memspace == omp_low_lat_mem_space)
|
||||
{
|
||||
char *shared_pool = FLAT_HEAP_PTR;
|
||||
|
||||
return __gcn_lowlat_calloc (shared_pool, size);
|
||||
}
|
||||
else
|
||||
return calloc (1, size);
|
||||
}
|
||||
|
||||
static void
|
||||
gcn_memspace_free (omp_memspace_handle_t memspace, void *addr, size_t size)
|
||||
{
|
||||
if (memspace == omp_low_lat_mem_space)
|
||||
{
|
||||
char *shared_pool = FLAT_HEAP_PTR;
|
||||
|
||||
__gcn_lowlat_free (shared_pool, addr, size);
|
||||
}
|
||||
else
|
||||
free (addr);
|
||||
}
|
||||
|
||||
static void *
|
||||
gcn_memspace_realloc (omp_memspace_handle_t memspace, void *addr,
|
||||
size_t oldsize, size_t size)
|
||||
{
|
||||
if (memspace == omp_low_lat_mem_space)
|
||||
{
|
||||
char *shared_pool = FLAT_HEAP_PTR;
|
||||
|
||||
return __gcn_lowlat_realloc (shared_pool, addr, oldsize, size);
|
||||
}
|
||||
else
|
||||
return realloc (addr, size);
|
||||
}
|
||||
|
||||
static inline int
|
||||
gcn_memspace_validate (omp_memspace_handle_t memspace, unsigned access)
|
||||
{
|
||||
/* Disallow use of low-latency memory when it must be accessible by
|
||||
all threads. */
|
||||
return (memspace != omp_low_lat_mem_space
|
||||
|| access != omp_atv_all);
|
||||
}
|
||||
|
||||
#define MEMSPACE_ALLOC(MEMSPACE, SIZE) \
|
||||
gcn_memspace_alloc (MEMSPACE, SIZE)
|
||||
#define MEMSPACE_CALLOC(MEMSPACE, SIZE) \
|
||||
gcn_memspace_calloc (MEMSPACE, SIZE)
|
||||
#define MEMSPACE_REALLOC(MEMSPACE, ADDR, OLDSIZE, SIZE) \
|
||||
gcn_memspace_realloc (MEMSPACE, ADDR, OLDSIZE, SIZE)
|
||||
#define MEMSPACE_FREE(MEMSPACE, ADDR, SIZE) \
|
||||
gcn_memspace_free (MEMSPACE, ADDR, SIZE)
|
||||
#define MEMSPACE_VALIDATE(MEMSPACE, ACCESS) \
|
||||
gcn_memspace_validate (MEMSPACE, ACCESS)
|
||||
|
||||
/* The default low-latency memspace implies omp_atv_all, which is incompatible
|
||||
with the LDS memory space. */
|
||||
#define OMP_LOW_LAT_MEM_ALLOC_INVALID 1
|
||||
|
||||
#include "../../allocator.c"
|
|
@ -33,6 +33,12 @@
|
|||
#define DEFAULT_GCN_STACK_SIZE (32*1024)
|
||||
#define DEFAULT_TEAM_ARENA_SIZE (64*1024)
|
||||
|
||||
/* These define the LDS location of data needed by OpenMP. */
|
||||
#define TEAM_ARENA_START 16 /* LDS offset of start pointer. */
|
||||
#define TEAM_ARENA_FREE 24 /* LDS offset of free pointer. */
|
||||
#define TEAM_ARENA_END 32 /* LDS offset of end pointer. */
|
||||
#define GCN_LOWLAT_HEAP 40 /* LDS offset of the OpenMP low-latency heap. */
|
||||
|
||||
struct heap
|
||||
{
|
||||
int64_t size;
|
||||
|
|
|
@ -29,6 +29,12 @@
|
|||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#define LITTLEENDIAN_CPU
|
||||
#include "hsa.h"
|
||||
|
||||
/* Defined in basic-allocator.c via config/gcn/allocator.c. */
|
||||
void __gcn_lowlat_init (void *heap, size_t size);
|
||||
|
||||
static void gomp_thread_start (struct gomp_thread_pool *);
|
||||
extern void build_indirect_map (void);
|
||||
|
||||
|
@ -75,6 +81,12 @@ gomp_gcn_enter_kernel (void)
|
|||
*arena_free = team_arena;
|
||||
*arena_end = team_arena + kernargs->arena_size_per_team;
|
||||
|
||||
/* Initialize the low-latency heap. The header is the size. */
|
||||
void __lds *lowlat = (void __lds *)GCN_LOWLAT_HEAP;
|
||||
hsa_kernel_dispatch_packet_t *queue_ptr = __builtin_gcn_dispatch_ptr ();
|
||||
__gcn_lowlat_init ((void*)(uintptr_t)(void __flat*)lowlat,
|
||||
queue_ptr->group_segment_size - GCN_LOWLAT_HEAP);
|
||||
|
||||
/* Allocate and initialize the team-local-storage data. */
|
||||
struct gomp_thread *thrs = team_malloc_cleared (sizeof (*thrs)
|
||||
* numthreads);
|
||||
|
|
|
@ -114,9 +114,6 @@ extern void gomp_aligned_free (void *);
|
|||
#ifdef __AMDGCN__
|
||||
#include "libgomp-gcn.h"
|
||||
/* The arena is initialized in config/gcn/team.c. */
|
||||
#define TEAM_ARENA_START 16 /* LDS offset of free pointer. */
|
||||
#define TEAM_ARENA_FREE 24 /* LDS offset of free pointer. */
|
||||
#define TEAM_ARENA_END 32 /* LDS offset of end pointer. */
|
||||
|
||||
static inline void * __attribute__((malloc))
|
||||
team_malloc (size_t size)
|
||||
|
|
|
@ -5836,6 +5836,19 @@ The implementation remark:
|
|||
available devices (``host fallback'').
|
||||
@item The available stack size can be changed using the @code{GCN_STACK_SIZE}
|
||||
environment variable; the default is 32 kiB per thread.
|
||||
@item Low-latency memory (@code{omp_low_lat_mem_space}) is supported when the
|
||||
@code{access} trait is set to @code{cgroup}. The default pool size
|
||||
is automatically scaled to share the 64 kiB LDS memory between the number
|
||||
of teams configured to run on each compute-unit, but may be adjusted at
|
||||
runtime by setting environment variable
|
||||
@code{GOMP_GCN_LOWLAT_POOL=@var{bytes}}.
|
||||
@item @code{omp_low_lat_mem_alloc} cannot be used with true low-latency memory
|
||||
because the definition implies the @code{omp_atv_all} trait; main
|
||||
graphics memory is used instead.
|
||||
@item @code{omp_cgroup_mem_alloc}, @code{omp_pteam_mem_alloc}, and
|
||||
@code{omp_thread_mem_alloc}, all use low-latency memory as first
|
||||
preference, and fall back to main graphics memory when the low-latency
|
||||
pool is exhausted.
|
||||
@end itemize
|
||||
|
||||
|
||||
|
|
|
@ -550,6 +550,7 @@ static size_t gcn_kernel_heap_size = DEFAULT_GCN_HEAP_SIZE;
|
|||
|
||||
static int team_arena_size = DEFAULT_TEAM_ARENA_SIZE;
|
||||
static int stack_size = DEFAULT_GCN_STACK_SIZE;
|
||||
static int lowlat_size = -1;
|
||||
|
||||
/* Flag to decide whether print to stderr information about what is going on.
|
||||
Set in init_debug depending on environment variables. */
|
||||
|
@ -1016,8 +1017,8 @@ print_kernel_dispatch (struct kernel_dispatch *dispatch, unsigned indent)
|
|||
fprintf (stderr, "%*sobject: %lu\n", indent, "", dispatch->object);
|
||||
fprintf (stderr, "%*sprivate_segment_size: %u\n", indent, "",
|
||||
dispatch->private_segment_size);
|
||||
fprintf (stderr, "%*sgroup_segment_size: %u\n", indent, "",
|
||||
dispatch->group_segment_size);
|
||||
fprintf (stderr, "%*sgroup_segment_size: %u (low-latency pool)\n", indent,
|
||||
"", dispatch->group_segment_size);
|
||||
fprintf (stderr, "\n");
|
||||
}
|
||||
|
||||
|
@ -1088,6 +1089,10 @@ init_environment_variables (void)
|
|||
if (tmp)
|
||||
stack_size = tmp;;
|
||||
}
|
||||
|
||||
const char *lowlat = secure_getenv ("GOMP_GCN_LOWLAT_POOL");
|
||||
if (lowlat)
|
||||
lowlat_size = atoi (lowlat);
|
||||
}
|
||||
|
||||
/* Return malloc'd string with name of SYMBOL. */
|
||||
|
@ -1930,7 +1935,25 @@ create_kernel_dispatch (struct kernel_info *kernel, int num_teams,
|
|||
|
||||
shadow->signal = sync_signal.handle;
|
||||
shadow->private_segment_size = kernel->private_segment_size;
|
||||
shadow->group_segment_size = kernel->group_segment_size;
|
||||
|
||||
if (lowlat_size < 0)
|
||||
{
|
||||
/* Divide the LDS between the number of running teams.
|
||||
Allocate not less than is defined in the kernel metadata. */
|
||||
int teams_per_cu = num_teams / get_cu_count (agent);
|
||||
int LDS_per_team = (teams_per_cu ? 65536 / teams_per_cu : 65536);
|
||||
shadow->group_segment_size
|
||||
= (kernel->group_segment_size > LDS_per_team
|
||||
? kernel->group_segment_size
|
||||
: LDS_per_team);;
|
||||
}
|
||||
else if (lowlat_size < GCN_LOWLAT_HEAP+8)
|
||||
/* Ensure that there's space for the OpenMP libgomp data. */
|
||||
shadow->group_segment_size = GCN_LOWLAT_HEAP+8;
|
||||
else
|
||||
shadow->group_segment_size = (lowlat_size > 65536
|
||||
? 65536
|
||||
: lowlat_size);
|
||||
|
||||
/* We expect kernels to request a single pointer, explicitly, and the
|
||||
rest of struct kernargs, implicitly. If they request anything else
|
||||
|
@ -2290,9 +2313,9 @@ run_kernel (struct kernel_info *kernel, void *vars,
|
|||
print_kernel_dispatch (shadow, 2);
|
||||
}
|
||||
|
||||
packet->private_segment_size = kernel->private_segment_size;
|
||||
packet->group_segment_size = kernel->group_segment_size;
|
||||
packet->kernel_object = kernel->object;
|
||||
packet->private_segment_size = shadow->private_segment_size;
|
||||
packet->group_segment_size = shadow->group_segment_size;
|
||||
packet->kernel_object = shadow->object;
|
||||
packet->kernarg_address = shadow->kernarg_address;
|
||||
hsa_signal_t s;
|
||||
s.handle = shadow->signal;
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
/* { dg-do run } */
|
||||
|
||||
/* { dg-require-effective-target offload_device } */
|
||||
/* { dg-xfail-if "not implemented" { ! offload_target_nvptx } } */
|
||||
/* { dg-xfail-if "not implemented" { ! { offload_target_nvptx || offload_target_amdgcn } } } */
|
||||
|
||||
/* Test that GPU low-latency allocation is limited to team access. */
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue