[libgomp, testsuite] Fix insufficient resources in test-cases
When running libgomp test-case broadcast-many.c on an nvptx accelerator
(T400, driver version 470.86), I run into:
...
libgomp: The Nvidia accelerator has insufficient resources to launch \
'main$_omp_fn$0' with num_workers = 32 and vector_length = 32; \
recompile the program with 'num_workers = x and vector_length = y' on \
that offloaded region or '-fopenacc-dim=:x:y' where x * y <= 896.
FAIL: libgomp.oacc-c/../libgomp.oacc-c-c++-common/broadcast-many.c \
-DACC_DEVICE_TYPE_nvidia=1 -DACC_MEM_SHARED=0 -foffload=nvptx-none \
-O0 execution test
...
The error does not occur when using GOMP_NVPTX_JIT=-O0.
Fix this by using 896 / 32 == 28 workers for ACC_DEVICE_TYPE_nvidia.
Likewise for some other test-cases.
Tested libgomp on x86_64 with nvptx accelerator.
libgomp/ChangeLog:
2022-01-27 Tom de Vries <tdevries@suse.de>
* testsuite/libgomp.oacc-c-c++-common/broadcast-many.c: Reduce
num_workers for nvidia accelerator to fix libgomp error 'insufficient
resources'.
* testsuite/libgomp.oacc-c-c++-common/par-loop-comb-reduction-4.c:
Same.
* testsuite/libgomp.oacc-c-c++-common/reduction-7.c: Same.
This commit is contained in:
parent
be362d5e12
commit
d43fbc7d3f
3 changed files with 25 additions and 3 deletions
|
@@ -5,6 +5,13 @@
|
|||
#include <assert.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#if ACC_DEVICE_TYPE_nvidia
|
||||
/* To avoid 'libgomp: The Nvidia accelerator has insufficient resources'. */
|
||||
#define NUM_WORKERS 28
|
||||
#else
|
||||
#define NUM_WORKERS 32
|
||||
#endif
|
||||
|
||||
#define LOCAL(n) double n = input;
|
||||
#define LOCALS(n) LOCAL(n##1) LOCAL(n##2) LOCAL(n##3) LOCAL(n##4) \
|
||||
LOCAL(n##5) LOCAL(n##6) LOCAL(n##7) LOCAL(n##8)
|
||||
|
@@ -23,7 +30,7 @@ int main (void)
|
|||
int ret;
|
||||
int input = 1;
|
||||
|
||||
#pragma acc parallel num_gangs(1) num_workers(32) copyout(ret)
|
||||
#pragma acc parallel num_gangs(1) num_workers(NUM_WORKERS) copyout(ret)
|
||||
{
|
||||
int w = 0;
|
||||
LOCALS2(h);
|
||||
|
|
|
@@ -1,5 +1,12 @@
|
|||
#include <assert.h>
|
||||
|
||||
#if ACC_DEVICE_TYPE_nvidia
|
||||
/* To avoid 'libgomp: The Nvidia accelerator has insufficient resources'. */
|
||||
#define NUM_WORKERS 24
|
||||
#else
|
||||
#define NUM_WORKERS 32
|
||||
#endif
|
||||
|
||||
/* Test of reduction on both parallel and loop directives (workers and vectors
|
||||
together in gang-partitioned mode, float type, multiple reductions). */
|
||||
|
||||
|
@@ -13,7 +20,8 @@ main (int argc, char *argv[])
|
|||
for (i = 0; i < 32768; i++)
|
||||
arr[i] = i % (32768 / 64);
|
||||
|
||||
#pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) \
|
||||
#pragma acc parallel \
|
||||
num_gangs(32) num_workers(NUM_WORKERS) vector_length(32) \
|
||||
reduction(+:res) reduction(max:mres) copy(res, mres)
|
||||
{
|
||||
#pragma acc loop gang /* { dg-warning "nested loop in reduction needs reduction clause for 'm\?res'" "TODO" } */
|
||||
|
|
|
@@ -181,6 +181,12 @@ void gwv_np_3()
|
|||
assert (res == hres);
|
||||
}
|
||||
|
||||
#if ACC_DEVICE_TYPE_nvidia
|
||||
/* To avoid 'libgomp: The Nvidia accelerator has insufficient resources'. */
|
||||
#define NUM_WORKERS 28
|
||||
#else
|
||||
#define NUM_WORKERS 32
|
||||
#endif
|
||||
|
||||
/* Test of reduction on loop directive (gangs, workers and vectors, multiple
|
||||
non-private reduction variables, float type). */
|
||||
|
@@ -194,7 +200,7 @@ void gwv_np_4()
|
|||
for (i = 0; i < 32768; i++)
|
||||
arr[i] = i % (32768 / 64);
|
||||
|
||||
#pragma acc parallel num_gangs(32) num_workers(32) vector_length(32)
|
||||
#pragma acc parallel num_gangs(32) num_workers(NUM_WORKERS) vector_length(32)
|
||||
{
|
||||
#pragma acc loop gang reduction(+:res) reduction(max:mres)
|
||||
for (j = 0; j < 32; j++)
|
||||
|
@@ -235,6 +241,7 @@ void gwv_np_4()
|
|||
assert (mres == hmres);
|
||||
}
|
||||
|
||||
#undef NUM_WORKERS
|
||||
|
||||
/* Test of reduction on loop directive (vectors, private reduction
|
||||
variable). */
|
||||
|
|
Loading…
Add table
Reference in a new issue