diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 5495b9a6f4a..5d9c536e414 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,11 @@ +2019-01-11 Tom de Vries + + * config/nvptx/nvptx.c (PTX_CTA_NUM_BARRIERS, PTX_PER_CTA_BARRIER) + (PTX_NUM_PER_CTA_BARRIERS, PTX_FIRST_PER_WORKER_BARRIER) + (PTX_NUM_PER_WORKER_BARRIERS): Define. + (nvptx_apply_dim_limits): Prevent vector_length 64 and + num_workers 16. + 2019-01-11 Tom de Vries * config/nvptx/nvptx.c (PTX_CTA_SIZE): Move up. diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c index 643f5e86ccc..b37010ff58e 100644 --- a/gcc/config/nvptx/nvptx.c +++ b/gcc/config/nvptx/nvptx.c @@ -87,8 +87,14 @@ 2.x. */ #define PTX_CTA_SIZE 1024 +#define PTX_CTA_NUM_BARRIERS 16 #define PTX_WARP_SIZE 32 +#define PTX_PER_CTA_BARRIER 0 +#define PTX_NUM_PER_CTA_BARRIERS 1 +#define PTX_FIRST_PER_WORKER_BARRIER (PTX_NUM_PER_CTA_BARRIERS) +#define PTX_NUM_PER_WORKER_BARRIERS (PTX_CTA_NUM_BARRIERS - PTX_NUM_PER_CTA_BARRIERS) + #define PTX_DEFAULT_VECTOR_LENGTH PTX_WARP_SIZE #define PTX_MAX_VECTOR_LENGTH PTX_WARP_SIZE #define PTX_WORKER_LENGTH 32 @@ -5496,6 +5502,13 @@ nvptx_apply_dim_limits (int dims[]) if (dims[GOMP_DIM_WORKER] > 0 && dims[GOMP_DIM_VECTOR] > 0 && dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR] > PTX_CTA_SIZE) dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE; + + /* If we need a per-worker barrier ... . */ + if (dims[GOMP_DIM_WORKER] > 0 && dims[GOMP_DIM_VECTOR] > 0 + && dims[GOMP_DIM_VECTOR] > PTX_WARP_SIZE) + /* Don't use more barriers than available. */ + dims[GOMP_DIM_WORKER] = MIN (dims[GOMP_DIM_WORKER], + PTX_NUM_PER_WORKER_BARRIERS); } /* Return true if FNDECL contains calls to vector-partitionable routines. */ diff --git a/libgomp/ChangeLog b/libgomp/ChangeLog index ea339dec963..8c04d5a53f4 100644 --- a/libgomp/ChangeLog +++ b/libgomp/ChangeLog @@ -1,3 +1,8 @@ +2019-01-11 Tom de Vries + + * plugin/plugin-nvptx.c (nvptx_exec): Prevent vector_length 64 and + num_workers 16. 
+ 2019-01-11 Tom de Vries * testsuite/libgomp.oacc-c-c++-common/reduction-1.c: Remove diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c index 60553bdf3bd..c80da64c422 100644 --- a/libgomp/plugin/plugin-nvptx.c +++ b/libgomp/plugin/plugin-nvptx.c @@ -1273,6 +1273,10 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs, : dims[GOMP_DIM_VECTOR]); workers = blocks / actual_vectors; workers = MAX (workers, 1); + /* If we need a per-worker barrier ... . */ + if (actual_vectors > 32) + /* Don't use more barriers than available. */ + workers = MIN (workers, 15); } for (i = 0; i != GOMP_DIM_MAX; i++) @@ -1303,6 +1307,24 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs, suggest_workers, suggest_workers); } + /* Check if the accelerator has sufficient barrier resources to + launch the offloaded kernel. */ + if (dims[GOMP_DIM_WORKER] > 15 && dims[GOMP_DIM_VECTOR] > 32) + { + const char *msg + = ("The Nvidia accelerator has insufficient barrier resources to launch" + " '%s' with num_workers = %d and vector_length = %d" + "; " + "recompile the program with 'num_workers = x' on that offloaded" + " region or '-fopenacc-dim=:x:' where x <= 15" + "; " + "or, recompile the program with 'vector_length = 32' on that" + " offloaded region" + ".\n"); + GOMP_PLUGIN_fatal (msg, targ_fn->launch->fn, dims[GOMP_DIM_WORKER], + dims[GOMP_DIM_VECTOR]); + } + /* This reserves a chunk of a pre-allocated page of memory mapped on both the host and the device. HP is a host pointer to the new chunk, and DP is the corresponding device pointer. */