[nvptx] Don't allow vector_length 64 with num_workers 16
When using a compiler build with: ... +#define PTX_DEFAULT_VECTOR_LENGTH PTX_CTA_SIZE ... consider a test-case: ... int main (void) { #pragma acc parallel vector_length (64) #pragma acc loop worker for (unsigned int i = 0; i < 32; i++) #pragma acc loop vector for (unsigned int j = 0; j < 64; j++) ; return 0; } ... If num_workers is 16, either because: - we add a "num_workers (16)" clause on the parallel directive, or - we set "GOMP_OPENACC_DIM=:16:", or - the libgomp plugin chooses 16 num_workers we run into an illegal instruction at runtime, because a bar.sync instruction tries to use a barrier 16. The instruction is illegal, because ptx supports only 16 barriers per CTA, and the valid range is 0..15. The problem is that with a warp-multiple vector length, we use a code generation scheme with a per-worker barrier. And because barrier zero is reserved for per-cta barrier, only the remaining 15 barriers can be used as per-worker barrier, and consequently we can't use num_workers larger than 15. This problem occurs only for vector_length 64. For vector_length 32, we use a different code generation scheme, and for vector_length >= 96, the maximum num_workers is not big enough not to trigger this problem. Also, this problem only occurs for num_workers 16. As explained above, num_workers 15 is safe to use, and 16 is already the maximum num_workers for vector_length 64. This patch fixes the problem in both the compiler (handling "num_workers (16)") and in the libgomp nvptx plugin (with and without "GOMP_OPENACC_DIM=:16:"). 2019-01-11 Tom de Vries <tdevries@suse.de> * config/nvptx/nvptx.c (PTX_CTA_NUM_BARRIERS, PTX_PER_CTA_BARRIER) (PTX_NUM_PER_CTA_BARRIER, PTX_FIRST_PER_WORKER_BARRIER) (PTX_NUM_PER_WORKER_BARRIERS): Define. (nvptx_apply_dim_limits): Prevent vector_length 64 and num_workers 16. * plugin/plugin-nvptx.c (nvptx_exec): Prevent vector_length 64 and num_workers 16. From-SVN: r267838
This commit is contained in:
parent
69b09a587d
commit
052aaaceed
4 changed files with 48 additions and 0 deletions
|
@ -1,3 +1,11 @@
|
|||
2019-01-11 Tom de Vries <tdevries@suse.de>
|
||||
|
||||
* config/nvptx/nvptx.c (PTX_CTA_NUM_BARRIERS, PTX_PER_CTA_BARRIER)
|
||||
(PTX_NUM_PER_CTA_BARRIER, PTX_FIRST_PER_WORKER_BARRIER)
|
||||
(PTX_NUM_PER_WORKER_BARRIERS): Define.
|
||||
(nvptx_apply_dim_limits): Prevent vector_length 64 and
|
||||
num_workers 16.
|
||||
|
||||
2019-01-11 Tom de Vries <tdevries@suse.de>
|
||||
|
||||
* config/nvptx/nvptx.c (PTX_CTA_SIZE): Move up.
|
||||
|
|
|
@ -87,8 +87,14 @@
|
|||
2.x. */
|
||||
#define PTX_CTA_SIZE 1024
|
||||
|
||||
#define PTX_CTA_NUM_BARRIERS 16
|
||||
#define PTX_WARP_SIZE 32
|
||||
|
||||
#define PTX_PER_CTA_BARRIER 0
|
||||
#define PTX_NUM_PER_CTA_BARRIERS 1
|
||||
#define PTX_FIRST_PER_WORKER_BARRIER (PTX_NUM_PER_CTA_BARRIERS)
|
||||
#define PTX_NUM_PER_WORKER_BARRIERS (PTX_CTA_NUM_BARRIERS - PTX_NUM_PER_CTA_BARRIERS)
|
||||
|
||||
#define PTX_DEFAULT_VECTOR_LENGTH PTX_WARP_SIZE
|
||||
#define PTX_MAX_VECTOR_LENGTH PTX_WARP_SIZE
|
||||
#define PTX_WORKER_LENGTH 32
|
||||
|
@ -5496,6 +5502,13 @@ nvptx_apply_dim_limits (int dims[])
|
|||
if (dims[GOMP_DIM_WORKER] > 0 && dims[GOMP_DIM_VECTOR] > 0
|
||||
&& dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR] > PTX_CTA_SIZE)
|
||||
dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
|
||||
|
||||
/* If we need a per-worker barrier ... . */
|
||||
if (dims[GOMP_DIM_WORKER] > 0 && dims[GOMP_DIM_VECTOR] > 0
|
||||
&& dims[GOMP_DIM_VECTOR] > PTX_WARP_SIZE)
|
||||
/* Don't use more barriers than available. */
|
||||
dims[GOMP_DIM_WORKER] = MIN (dims[GOMP_DIM_WORKER],
|
||||
PTX_NUM_PER_WORKER_BARRIERS);
|
||||
}
|
||||
|
||||
/* Return true if FNDECL contains calls to vector-partitionable routines. */
|
||||
|
|
|
@ -1,3 +1,8 @@
|
|||
2019-01-11 Tom de Vries <tdevries@suse.de>
|
||||
|
||||
* plugin/plugin-nvptx.c (nvptx_exec): Prevent vector_length 64 and
|
||||
num_workers 16.
|
||||
|
||||
2019-01-11 Tom de Vries <tdevries@suse.de>
|
||||
|
||||
* testsuite/libgomp.oacc-c-c++-common/reduction-1.c: Remove
|
||||
|
|
|
@ -1273,6 +1273,10 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
|
|||
: dims[GOMP_DIM_VECTOR]);
|
||||
workers = blocks / actual_vectors;
|
||||
workers = MAX (workers, 1);
|
||||
/* If we need a per-worker barrier ... . */
|
||||
if (actual_vectors > 32)
|
||||
/* Don't use more barriers than available. */
|
||||
workers = MIN (workers, 15);
|
||||
}
|
||||
|
||||
for (i = 0; i != GOMP_DIM_MAX; i++)
|
||||
|
@ -1303,6 +1307,24 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
|
|||
suggest_workers, suggest_workers);
|
||||
}
|
||||
|
||||
/* Check if the accelerator has sufficient barrier resources to
|
||||
launch the offloaded kernel. */
|
||||
if (dims[GOMP_DIM_WORKER] > 15 && dims[GOMP_DIM_VECTOR] > 32)
|
||||
{
|
||||
const char *msg
|
||||
= ("The Nvidia accelerator has insufficient barrier resources to launch"
|
||||
" '%s' with num_workers = %d and vector_length = %d"
|
||||
"; "
|
||||
"recompile the program with 'num_workers = x' on that offloaded"
|
||||
" region or '-fopenacc-dim=:x:' where x <= 15"
|
||||
"; "
|
||||
"or, recompile the program with 'vector_length = 32' on that"
|
||||
" offloaded region"
|
||||
".\n");
|
||||
GOMP_PLUGIN_fatal (msg, targ_fn->launch->fn, dims[GOMP_DIM_WORKER],
|
||||
dims[GOMP_DIM_VECTOR]);
|
||||
}
|
||||
|
||||
/* This reserves a chunk of a pre-allocated page of memory mapped on both
|
||||
the host and the device. HP is a host pointer to the new chunk, and DP is
|
||||
the corresponding device pointer. */
|
||||
|
|
Loading…
Add table
Reference in a new issue