amdgcn, libgomp: low-latency allocator

This implements the OpenMP low-latency memory allocator for AMD GCN using the
small per-team LDS memory (Local Data Store).

Since addresses can now refer to LDS space, the "Global" address space is
no longer compatible.  This patch therefore switches the backend to use
entirely "Flat" addressing (which supports both memories).  A future patch
will re-enable "global" instructions for cases where it is known to be safe
to do so.

gcc/ChangeLog:

	* config/gcn/gcn-builtins.def (DISPATCH_PTR): New built-in.
	* config/gcn/gcn.cc (gcn_init_machine_status): Disable global
	addressing.
	(gcn_expand_builtin_1): Implement GCN_BUILTIN_DISPATCH_PTR.

libgomp/ChangeLog:

	* config/gcn/libgomp-gcn.h (TEAM_ARENA_START): Move to here.
	(TEAM_ARENA_FREE): Likewise.
	(TEAM_ARENA_END): Likewise.
	(GCN_LOWLAT_HEAP): New.
	* config/gcn/team.c (LITTLEENDIAN_CPU): New, and import hsa.h.
	(__gcn_lowlat_init): New prototype.
	(gomp_gcn_enter_kernel): Initialize the low-latency heap.
	* libgomp.h (TEAM_ARENA_START): Move to libgomp.h.
	(TEAM_ARENA_FREE): Likewise.
	(TEAM_ARENA_END): Likewise.
	* plugin/plugin-gcn.c (lowlat_size): New variable.
	(print_kernel_dispatch): Label the group_segment_size purpose.
	(init_environment_variables): Read GOMP_GCN_LOWLAT_POOL.
	(create_kernel_dispatch): Pass low-latency heap allocation to kernel.
	(run_kernel): Use shadow; don't assume values.
	* testsuite/libgomp.c/omp_alloc-traits.c: Enable for amdgcn.
	* config/gcn/allocator.c: New file.
	* libgomp.texi: Document low-latency implementation details.
This commit is contained in:
Andrew Stubbs 2023-01-30 14:43:00 +00:00
parent e9a19ead49
commit e7d6c277fa
9 changed files with 205 additions and 11 deletions

View file

@ -164,6 +164,8 @@ DEF_BUILTIN (FIRST_CALL_THIS_THREAD_P, -1, "first_call_this_thread_p", B_INSN,
_A1 (GCN_BTI_BOOL), gcn_expand_builtin_1)
DEF_BUILTIN (KERNARG_PTR, -1, "kernarg_ptr", B_INSN, _A1 (GCN_BTI_VOIDPTR),
gcn_expand_builtin_1)
DEF_BUILTIN (DISPATCH_PTR, -1, "dispatch_ptr", B_INSN, _A1 (GCN_BTI_VOIDPTR),
gcn_expand_builtin_1)
DEF_BUILTIN (GET_STACK_LIMIT, -1, "get_stack_limit", B_INSN,
_A1 (GCN_BTI_VOIDPTR), gcn_expand_builtin_1)

View file

@ -110,7 +110,8 @@ gcn_init_machine_status (void)
f = ggc_cleared_alloc<machine_function> ();
if (TARGET_GCN3)
// FIXME: re-enable global addressing with safety for LDS-flat addresses
//if (TARGET_GCN3)
f->use_flat_addressing = true;
return f;
@ -4879,6 +4880,19 @@ gcn_expand_builtin_1 (tree exp, rtx target, rtx /*subtarget */ ,
}
return ptr;
}
case GCN_BUILTIN_DISPATCH_PTR:
{
rtx ptr;
if (cfun->machine->args.reg[DISPATCH_PTR_ARG] >= 0)
ptr = gen_rtx_REG (DImode,
cfun->machine->args.reg[DISPATCH_PTR_ARG]);
else
{
ptr = gen_reg_rtx (DImode);
emit_move_insn (ptr, const0_rtx);
}
return ptr;
}
case GCN_BUILTIN_FIRST_CALL_THIS_THREAD_P:
{
/* Stash a marker in the unused upper 16 bits of s[0:1] to indicate

View file

@ -0,0 +1,127 @@
/* Copyright (C) 2023 Free Software Foundation, Inc.
This file is part of the GNU Offloading and Multi Processing Library
(libgomp).
Libgomp is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.
Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for
more details.
Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.
You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
<http://www.gnu.org/licenses/>. */
/* The low-latency allocators use space reserved in LDS memory when the
kernel is launched. The heap is initialized in gomp_gcn_enter_kernel and
all allocations are forgotten when the kernel exits. Allocations to other
memory spaces all use the system malloc syscall.
The pointers returned are 64-bit "Flat" addresses indistinguishable from
regular pointers, but only compatible with the "flat_load/store"
instructions. The compiler has been coded to assign default address
spaces accordingly.
LDS memory is not visible to other teams, and therefore may only be used
when the memspace access trait is set accordingly. */
#include "libgomp.h"
#include <stdlib.h>
#define BASIC_ALLOC_PREFIX __gcn_lowlat
#define BASIC_ALLOC_YIELD asm ("s_sleep 1" ::: "memory")
#include "../../basic-allocator.c"
/* The low-latency heap is located in LDS memory, but we need the __flat
address space for compatibility reasons. */
#define FLAT_HEAP_PTR \
((void *) (uintptr_t) (void __flat *) (void __lds *) GCN_LOWLAT_HEAP)
/* Allocate SIZE bytes from MEMSPACE.  Requests against the low-latency
   memspace are served from the per-team LDS heap; every other memspace
   is forwarded to the ordinary system malloc.  */
static void *
gcn_memspace_alloc (omp_memspace_handle_t memspace, size_t size)
{
  if (memspace != omp_low_lat_mem_space)
    return malloc (size);

  char *lowlat_heap = FLAT_HEAP_PTR;
  return __gcn_lowlat_alloc (lowlat_heap, size);
}
/* Allocate SIZE zero-initialized bytes from MEMSPACE.  Low-latency
   requests use the LDS heap's zeroing allocator; all others fall back
   to the system calloc.  */
static void *
gcn_memspace_calloc (omp_memspace_handle_t memspace, size_t size)
{
  if (memspace != omp_low_lat_mem_space)
    return calloc (1, size);

  char *lowlat_heap = FLAT_HEAP_PTR;
  return __gcn_lowlat_calloc (lowlat_heap, size);
}
/* Release the allocation at ADDR of SIZE bytes back to MEMSPACE.
   The LDS heap needs the size to reclaim the region; the system free
   ignores it.  */
static void
gcn_memspace_free (omp_memspace_handle_t memspace, void *addr, size_t size)
{
  if (memspace != omp_low_lat_mem_space)
    {
      free (addr);
      return;
    }

  char *lowlat_heap = FLAT_HEAP_PTR;
  __gcn_lowlat_free (lowlat_heap, addr, size);
}
/* Resize the allocation at ADDR from OLDSIZE to SIZE bytes within
   MEMSPACE.  Low-latency reallocations stay inside the LDS heap
   (OLDSIZE bounds the copy); other memspaces use the system realloc.  */
static void *
gcn_memspace_realloc (omp_memspace_handle_t memspace, void *addr,
		      size_t oldsize, size_t size)
{
  if (memspace != omp_low_lat_mem_space)
    return realloc (addr, size);

  char *lowlat_heap = FLAT_HEAP_PTR;
  return __gcn_lowlat_realloc (lowlat_heap, addr, oldsize, size);
}
/* Return nonzero if an allocation with the given ACCESS trait is
   permitted in MEMSPACE.  The only rejected combination is low-latency
   memory with all-thread access: LDS is private to each team, so it
   cannot back allocations every thread must be able to reach.  */
static inline int
gcn_memspace_validate (omp_memspace_handle_t memspace, unsigned access)
{
  return !(memspace == omp_low_lat_mem_space && access == omp_atv_all);
}
#define MEMSPACE_ALLOC(MEMSPACE, SIZE) \
gcn_memspace_alloc (MEMSPACE, SIZE)
#define MEMSPACE_CALLOC(MEMSPACE, SIZE) \
gcn_memspace_calloc (MEMSPACE, SIZE)
#define MEMSPACE_REALLOC(MEMSPACE, ADDR, OLDSIZE, SIZE) \
gcn_memspace_realloc (MEMSPACE, ADDR, OLDSIZE, SIZE)
#define MEMSPACE_FREE(MEMSPACE, ADDR, SIZE) \
gcn_memspace_free (MEMSPACE, ADDR, SIZE)
#define MEMSPACE_VALIDATE(MEMSPACE, ACCESS) \
gcn_memspace_validate (MEMSPACE, ACCESS)
/* The default low-latency memspace implies omp_atv_all, which is incompatible
with the LDS memory space. */
#define OMP_LOW_LAT_MEM_ALLOC_INVALID 1
#include "../../allocator.c"

View file

@ -33,6 +33,12 @@
#define DEFAULT_GCN_STACK_SIZE (32*1024)
#define DEFAULT_TEAM_ARENA_SIZE (64*1024)
/* These define the LDS location of data needed by OpenMP. */
#define TEAM_ARENA_START 16 /* LDS offset of arena start pointer. */
#define TEAM_ARENA_FREE 24 /* LDS offset of free pointer. */
#define TEAM_ARENA_END 32 /* LDS offset of end pointer. */
#define GCN_LOWLAT_HEAP 40 /* LDS offset of the OpenMP low-latency heap. */
struct heap
{
int64_t size;

View file

@ -29,6 +29,12 @@
#include <stdlib.h>
#include <string.h>
#define LITTLEENDIAN_CPU
#include "hsa.h"
/* Defined in basic-allocator.c via config/amdgcn/allocator.c. */
void __gcn_lowlat_init (void *heap, size_t size);
static void gomp_thread_start (struct gomp_thread_pool *);
extern void build_indirect_map (void);
@ -75,6 +81,12 @@ gomp_gcn_enter_kernel (void)
*arena_free = team_arena;
*arena_end = team_arena + kernargs->arena_size_per_team;
/* Initialize the low-latency heap. The header is the size. */
void __lds *lowlat = (void __lds *)GCN_LOWLAT_HEAP;
hsa_kernel_dispatch_packet_t *queue_ptr = __builtin_gcn_dispatch_ptr ();
__gcn_lowlat_init ((void*)(uintptr_t)(void __flat*)lowlat,
queue_ptr->group_segment_size - GCN_LOWLAT_HEAP);
/* Allocate and initialize the team-local-storage data. */
struct gomp_thread *thrs = team_malloc_cleared (sizeof (*thrs)
* numthreads);

View file

@ -114,9 +114,6 @@ extern void gomp_aligned_free (void *);
#ifdef __AMDGCN__
#include "libgomp-gcn.h"
/* The arena is initialized in config/gcn/team.c. */
#define TEAM_ARENA_START 16 /* LDS offset of free pointer. */
#define TEAM_ARENA_FREE 24 /* LDS offset of free pointer. */
#define TEAM_ARENA_END 32 /* LDS offset of end pointer. */
static inline void * __attribute__((malloc))
team_malloc (size_t size)

View file

@ -5836,6 +5836,19 @@ The implementation remark:
available devices (``host fallback'').
@item The available stack size can be changed using the @code{GCN_STACK_SIZE}
environment variable; the default is 32 kiB per thread.
@item Low-latency memory (@code{omp_low_lat_mem_space}) is supported when
the @code{access} trait is set to @code{cgroup}. The default pool size
is automatically scaled to share the 64 kiB LDS memory between the number
of teams configured to run on each compute-unit, but may be adjusted at
runtime by setting environment variable
@code{GOMP_GCN_LOWLAT_POOL=@var{bytes}}.
@item @code{omp_low_lat_mem_alloc} cannot be used with true low-latency memory
because the definition implies the @code{omp_atv_all} trait; main
graphics memory is used instead.
@item @code{omp_cgroup_mem_alloc}, @code{omp_pteam_mem_alloc}, and
@code{omp_thread_mem_alloc}, all use low-latency memory as first
preference, and fall back to main graphics memory when the low-latency
pool is exhausted.
@end itemize

View file

@ -550,6 +550,7 @@ static size_t gcn_kernel_heap_size = DEFAULT_GCN_HEAP_SIZE;
static int team_arena_size = DEFAULT_TEAM_ARENA_SIZE;
static int stack_size = DEFAULT_GCN_STACK_SIZE;
static int lowlat_size = -1;
/* Flag to decide whether print to stderr information about what is going on.
Set in init_debug depending on environment variables. */
@ -1016,8 +1017,8 @@ print_kernel_dispatch (struct kernel_dispatch *dispatch, unsigned indent)
fprintf (stderr, "%*sobject: %lu\n", indent, "", dispatch->object);
fprintf (stderr, "%*sprivate_segment_size: %u\n", indent, "",
dispatch->private_segment_size);
fprintf (stderr, "%*sgroup_segment_size: %u\n", indent, "",
dispatch->group_segment_size);
fprintf (stderr, "%*sgroup_segment_size: %u (low-latency pool)\n", indent,
"", dispatch->group_segment_size);
fprintf (stderr, "\n");
}
@ -1088,6 +1089,10 @@ init_environment_variables (void)
if (tmp)
stack_size = tmp;;
}
const char *lowlat = secure_getenv ("GOMP_GCN_LOWLAT_POOL");
if (lowlat)
lowlat_size = atoi (lowlat);
}
/* Return malloc'd string with name of SYMBOL. */
@ -1930,7 +1935,25 @@ create_kernel_dispatch (struct kernel_info *kernel, int num_teams,
shadow->signal = sync_signal.handle;
shadow->private_segment_size = kernel->private_segment_size;
shadow->group_segment_size = kernel->group_segment_size;
if (lowlat_size < 0)
{
/* Divide the LDS between the number of running teams.
Allocate not less than is defined in the kernel metadata. */
int teams_per_cu = num_teams / get_cu_count (agent);
int LDS_per_team = (teams_per_cu ? 65536 / teams_per_cu : 65536);
shadow->group_segment_size
= (kernel->group_segment_size > LDS_per_team
? kernel->group_segment_size
: LDS_per_team);;
}
else if (lowlat_size < GCN_LOWLAT_HEAP+8)
/* Ensure that there's space for the OpenMP libgomp data. */
shadow->group_segment_size = GCN_LOWLAT_HEAP+8;
else
shadow->group_segment_size = (lowlat_size > 65536
? 65536
: lowlat_size);
/* We expect kernels to request a single pointer, explicitly, and the
rest of struct kernargs, implicitly. If they request anything else
@ -2290,9 +2313,9 @@ run_kernel (struct kernel_info *kernel, void *vars,
print_kernel_dispatch (shadow, 2);
}
packet->private_segment_size = kernel->private_segment_size;
packet->group_segment_size = kernel->group_segment_size;
packet->kernel_object = kernel->object;
packet->private_segment_size = shadow->private_segment_size;
packet->group_segment_size = shadow->group_segment_size;
packet->kernel_object = shadow->object;
packet->kernarg_address = shadow->kernarg_address;
hsa_signal_t s;
s.handle = shadow->signal;

View file

@ -1,7 +1,7 @@
/* { dg-do run } */
/* { dg-require-effective-target offload_device } */
/* { dg-xfail-if "not implemented" { ! offload_target_nvptx } } */
/* { dg-xfail-if "not implemented" { ! { offload_target_nvptx || offload_target_amdgcn } } } */
/* Test that GPU low-latency allocation is limited to team access. */