Update OpenACC tests for amdgcn

2020-01-20 Andrew Stubbs <ams@codesourcery.com> libgomp/ * testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c: Skip test on gcn. * testsuite/libgomp.oacc-c-c++-common/loop-dim-default.c (main): Adjust test dimensions for amdgcn. * testsuite/libgomp.oacc-c-c++-common/loop-gwv-1.c (main): Adjust gang/worker/vector expectations dynamically. * testsuite/libgomp.oacc-c-c++-common/loop-red-gwv-1.c (main): Likewise. * testsuite/libgomp.oacc-c-c++-common/loop-red-v-1.c (main): Likewise. * testsuite/libgomp.oacc-c-c++-common/loop-red-v-2.c (main): Likewise. * testsuite/libgomp.oacc-c-c++-common/loop-red-w-1.c (main): Likewise. * testsuite/libgomp.oacc-c-c++-common/loop-red-w-2.c (main): Likewise. * testsuite/libgomp.oacc-c-c++-common/loop-red-wv-1.c (main): Likewise. * testsuite/libgomp.oacc-c-c++-common/loop-v-1.c (main): Likewise. * testsuite/libgomp.oacc-c-c++-common/loop-w-1.c (main): Likewise. * testsuite/libgomp.oacc-c-c++-common/loop-wv-1.c (main): Likewise. * testsuite/libgomp.oacc-c-c++-common/parallel-dims.c (acc_gang): Recognise acc_device_radeon. (acc_worker): Likewise. (acc_vector): Likewise. (main): Set expectations for amdgcn. * testsuite/libgomp.oacc-c-c++-common/routine-gwv-1.c (main): Adjust gang/worker/vector expectations dynamically. * testsuite/libgomp.oacc-c-c++-common/routine-v-1.c (main): Likewise. * testsuite/libgomp.oacc-c-c++-common/routine-w-1.c (main): Likewise. * testsuite/libgomp.oacc-c-c++-common/routine-wv-1.c (main): Likewise. * testsuite/libgomp.oacc-c-c++-common/routine-wv-2.c: Set expectations for amdgcn.
2019-11-14 16:16:04 +00:00 · 2019-11-14 16:16:04 +00:00 · 09e0ad6253
commit 09e0ad6253
parent 3a43459715
19 changed files with 180 additions and 45 deletions
--- a/libgomp/ChangeLog
+++ b/libgomp/ChangeLog
@ -1,3 +1,33 @@
+2020-01-20  Andrew Stubbs  <ams@codesourcery.com>
+
+	* testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c: Skip test on gcn.
+	* testsuite/libgomp.oacc-c-c++-common/loop-dim-default.c (main):
+	Adjust test dimensions for amdgcn.
+	* testsuite/libgomp.oacc-c-c++-common/loop-gwv-1.c (main): Adjust
+	gang/worker/vector expectations dynamically.
+	* testsuite/libgomp.oacc-c-c++-common/loop-red-gwv-1.c
+	(main): Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/loop-red-v-1.c (main): Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/loop-red-v-2.c (main): Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/loop-red-w-1.c (main): Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/loop-red-w-2.c (main): Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/loop-red-wv-1.c (main): Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/loop-v-1.c (main): Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/loop-w-1.c (main): Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/loop-wv-1.c (main): Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/parallel-dims.c
+	(acc_gang): Recognise acc_device_radeon.
+	(acc_worker): Likewise.
+	(acc_vector): Likewise.
+	(main): Set expectations for amdgcn.
+	* testsuite/libgomp.oacc-c-c++-common/routine-gwv-1.c
+	(main): Adjust gang/worker/vector expectations dynamically.
+	* testsuite/libgomp.oacc-c-c++-common/routine-v-1.c (main): Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/routine-w-1.c (main): Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/routine-wv-1.c (main): Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/routine-wv-2.c: Set expectations
+	for amdgcn.
+
 2020-01-17  Andrew Stubbs  <ams@codesourcery.com>

 	* config/accel/openacc.f90 (openacc_kinds): Rename acc_device_gcn to
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c
@ -1,3 +1,6 @@
+/* AMD GCN does not use 32-lane vectors.
+   { dg-skip-if "unsuitable dimensions" { openacc_amdgcn_accel_selected } { "*" } { "" } } */
+
 /* { dg-additional-options "-fopenacc-dim=32" } */

 #include <stdio.h>
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-dim-default.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-dim-default.c
@ -128,5 +128,14 @@ int test_1 (int gp, int wp, int vp)

 int main ()
 {
+#ifdef ACC_DEVICE_TYPE_gcn
+  /* AMD GCN uses the autovectorizer for the vector dimension: the use
+     of a function call in vector-partitioned code in this test is not
+     currently supported.  */
+  /* AMD GCN does not currently support multiple workers.  This should be
+     set to 16 when that changes.  */
+  return test_1 (16, 1, 1);
+#else
  return test_1 (16, 16, 32);
+#endif
 }
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-gwv-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-gwv-1.c
@ -9,11 +9,13 @@ int main ()
  int ix;
  int exit = 0;
  int ondev = 0;
+  int gangsize, workersize, vectorsize;

  for (ix = 0; ix < N;ix++)
    ary[ix] = -1;
  
-#pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) copy(ary) copy(ondev)
+#pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) \
+	    copy(ary) copy(ondev) copyout(gangsize, workersize, vectorsize)
  {
 #pragma acc loop gang worker vector
    for (unsigned ix = 0; ix < N; ix++)
@ -32,6 +34,10 @@ int main ()
 	else
 	  ary[ix] = ix;
      }
+
+    gangsize = __builtin_goacc_parlevel_size (GOMP_DIM_GANG);
+    workersize = __builtin_goacc_parlevel_size (GOMP_DIM_WORKER);
+    vectorsize = __builtin_goacc_parlevel_size (GOMP_DIM_VECTOR);
  }

  for (ix = 0; ix < N; ix++)
@ -39,11 +45,12 @@ int main ()
      int expected = ix;
      if(ondev)
 	{
-	  int chunk_size = (N + 32*32*32 - 1) / (32*32*32);
+	  int chunk_size = (N + gangsize * workersize * vectorsize - 1)
+			   / (gangsize * workersize * vectorsize);
 	  
-	  int g = ix / (chunk_size * 32 * 32);
-	  int w = ix / 32 % 32;
-	  int v = ix % 32;
+	  int g = ix / (chunk_size * workersize * vectorsize);
+	  int w = (ix / vectorsize) % workersize;
+	  int v = ix % vectorsize;

 	  expected = (g << 16) | (w << 8) | v;
 	}
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-gwv-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-gwv-1.c
@ -8,8 +8,10 @@ int main ()
  int ix;
  int ondev = 0;
  int t = 0, h = 0;
-  
-#pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) copy(ondev)
+  int gangsize, workersize, vectorsize;
+
+#pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) \
+	copy(ondev) copyout(gangsize, workersize, vectorsize)
  {
 #pragma acc loop gang worker vector reduction(+:t)
    for (unsigned ix = 0; ix < N; ix++)
@ -28,18 +30,22 @@ int main ()
 	  }
 	t += val;
      }
+    gangsize = __builtin_goacc_parlevel_size (GOMP_DIM_GANG);
+    workersize = __builtin_goacc_parlevel_size (GOMP_DIM_WORKER);
+    vectorsize = __builtin_goacc_parlevel_size (GOMP_DIM_VECTOR);
  }

  for (ix = 0; ix < N; ix++)
    {
      int val = ix;
-      if(ondev)
+      if (ondev)
 	{
-	  int chunk_size = (N + 32*32*32 - 1) / (32*32*32);
+	  int chunk_size = (N + gangsize * workersize * vectorsize - 1)
+			   / (gangsize * workersize * vectorsize);
 	  
-	  int g = ix / (chunk_size * 32 * 32);
-	  int w = ix / 32 % 32;
-	  int v = ix % 32;
+	  int g = ix / (chunk_size * vectorsize * workersize);
+	  int w = ix / vectorsize % workersize;
+	  int v = ix % vectorsize;

 	  val = (g << 16) | (w << 8) | v;
 	}
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-v-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-v-1.c
@ -9,8 +9,9 @@ int main ()
  int ix;
  int ondev = 0;
  int t = 0,  h = 0;
+  int vectorsize;

-#pragma acc parallel vector_length(32) copy(ondev)
+#pragma acc parallel vector_length(32) copy(ondev) copyout(vectorsize)
  {
 #pragma acc loop vector reduction (+:t)
    for (unsigned ix = 0; ix < N; ix++)
@ -29,6 +30,7 @@ int main ()
 	  }
 	t += val;
      }
+    vectorsize = __builtin_goacc_parlevel_size (GOMP_DIM_VECTOR);
  }

  for (ix = 0; ix < N; ix++)
@ -38,7 +40,7 @@ int main ()
 	{
 	  int g = 0;
 	  int w = 0;
-	  int v = ix % 32;
+	  int v = ix % vectorsize;

 	  val = (g << 16) | (w << 8) | v;
 	}
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-v-2.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-v-2.c
@ -9,8 +9,9 @@ int main ()
  int ix;
  int ondev = 0;
  int q = 0,  h = 0;
+  int vectorsize;

-#pragma acc parallel vector_length(32) copy(q) copy(ondev)
+#pragma acc parallel vector_length(32) copy(q) copy(ondev) copyout(vectorsize)
  {
    int t = q;
    
@ -32,6 +33,7 @@ int main ()
 	t += val;
      }
    q = t;
+    vectorsize = __builtin_goacc_parlevel_size (GOMP_DIM_VECTOR);
  }

  for (ix = 0; ix < N; ix++)
@ -41,7 +43,7 @@ int main ()
 	{
 	  int g = 0;
 	  int w = 0;
-	  int v = ix % 32;
+	  int v = ix % vectorsize;

 	  val = (g << 16) | (w << 8) | v;
 	}
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-w-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-w-1.c
@ -8,8 +8,10 @@ int main ()
  int ix;
  int ondev = 0;
  int t = 0,  h = 0;
+  int workersize;

-#pragma acc parallel num_workers(32) vector_length(32) copy(ondev)
+#pragma acc parallel num_workers(32) vector_length(32) copy(ondev) \
+	    copyout(workersize)
  {
 #pragma acc loop worker reduction(+:t)
    for (unsigned ix = 0; ix < N; ix++)
@ -28,6 +30,7 @@ int main ()
 	  }
 	t += val;
      }
+    workersize = __builtin_goacc_parlevel_size (GOMP_DIM_WORKER);
  }

  for (ix = 0; ix < N; ix++)
@ -36,7 +39,7 @@ int main ()
      if(ondev)
 	{
 	  int g = 0;
-	  int w = ix % 32;
+	  int w = ix % workersize;
 	  int v = 0;

 	  val = (g << 16) | (w << 8) | v;
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-w-2.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-w-2.c
@ -8,8 +8,10 @@ int main ()
  int ix;
  int ondev = 0;
  int q = 0,  h = 0;
+  int workersize;

-#pragma acc parallel num_workers(32) vector_length(32) copy(q) copy(ondev)
+#pragma acc parallel num_workers(32) vector_length(32) copy(q) copy(ondev) \
+	    copyout(workersize)
  {
    int t = q;
    
@ -31,6 +33,7 @@ int main ()
 	t += val;
      }
    q = t;
+    workersize = __builtin_goacc_parlevel_size (GOMP_DIM_WORKER);
  }

  for (ix = 0; ix < N; ix++)
@ -39,7 +42,7 @@ int main ()
      if(ondev)
 	{
 	  int g = 0;
-	  int w = ix % 32;
+	  int w = ix % workersize;
 	  int v = 0;

 	  val = (g << 16) | (w << 8) | v;
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-wv-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-wv-1.c
@ -8,8 +8,10 @@ int main ()
  int ix;
  int ondev = 0;
  int t = 0, h = 0;
+  int workersize, vectorsize;
  
-#pragma acc parallel num_workers(32) vector_length(32) copy(ondev)
+#pragma acc parallel num_workers(32) vector_length(32) copy(ondev) \
+	    copyout(workersize, vectorsize)
  {
 #pragma acc loop worker vector reduction (+:t)
    for (unsigned ix = 0; ix < N; ix++)
@ -28,6 +30,8 @@ int main ()
 	  }
 	t += val;
      }
+    workersize = __builtin_goacc_parlevel_size (GOMP_DIM_WORKER);
+    vectorsize = __builtin_goacc_parlevel_size (GOMP_DIM_VECTOR);
  }

  for (ix = 0; ix < N; ix++)
@ -36,8 +40,8 @@ int main ()
      if(ondev)
 	{
 	  int g = 0;
-	  int w = (ix / 32) % 32;
-	  int v = ix % 32;
+	  int w = (ix / vectorsize) % workersize;
+	  int v = ix % vectorsize;

 	  val = (g << 16) | (w << 8) | v;
 	}
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-v-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-v-1.c
@ -9,11 +9,13 @@ int main ()
  int ix;
  int exit = 0;
  int ondev = 0;
+  int vectorsize;

  for (ix = 0; ix < N;ix++)
    ary[ix] = -1;
  
-#pragma acc parallel vector_length(32) copy(ary) copy(ondev)
+#pragma acc parallel vector_length(32) copy(ary) copy(ondev) \
+	    copyout(vectorsize)
  {
 #pragma acc loop vector
    for (unsigned ix = 0; ix < N; ix++)
@ -31,6 +33,7 @@ int main ()
 	else
 	  ary[ix] = ix;
      }
+    vectorsize = __builtin_goacc_parlevel_size (GOMP_DIM_VECTOR);
  }

  for (ix = 0; ix < N; ix++)
@ -40,7 +43,7 @@ int main ()
 	{
 	  int g = 0;
 	  int w = 0;
-	  int v = ix % 32;
+	  int v = ix % vectorsize;

 	  expected = (g << 16) | (w << 8) | v;
 	}
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-w-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-w-1.c
@ -9,11 +9,13 @@ int main ()
  int ix;
  int exit = 0;
  int ondev = 0;
+  int workersize;

  for (ix = 0; ix < N;ix++)
    ary[ix] = -1;
  
-#pragma acc parallel num_workers(32) vector_length(32) copy(ary) copy(ondev)
+#pragma acc parallel num_workers(32) vector_length(32) copy(ary) copy(ondev) \
+	    copyout(workersize)
  {
 #pragma acc loop worker
    for (unsigned ix = 0; ix < N; ix++)
@ -31,6 +33,7 @@ int main ()
 	else
 	  ary[ix] = ix;
      }
+    workersize = __builtin_goacc_parlevel_size (GOMP_DIM_WORKER);
  }

  for (ix = 0; ix < N; ix++)
@ -39,7 +42,7 @@ int main ()
      if(ondev)
 	{
 	  int g = 0;
-	  int w = ix % 32;
+	  int w = ix % workersize;
 	  int v = 0;

 	  expected = (g << 16) | (w << 8) | v;
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-wv-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-wv-1.c
@ -9,11 +9,13 @@ int main ()
  int ix;
  int exit = 0;
  int ondev = 0;
+  int workersize, vectorsize;

  for (ix = 0; ix < N;ix++)
    ary[ix] = -1;
  
-#pragma acc parallel num_workers(32) vector_length(32) copy(ary) copy(ondev)
+#pragma acc parallel num_workers(32) vector_length(32) copy(ary) copy(ondev) \
+	    copyout(workersize, vectorsize)
  {
 #pragma acc loop worker vector
    for (unsigned ix = 0; ix < N; ix++)
@ -31,6 +33,8 @@ int main ()
 	else
 	  ary[ix] = ix;
      }
+    workersize = __builtin_goacc_parlevel_size (GOMP_DIM_WORKER);
+    vectorsize = __builtin_goacc_parlevel_size (GOMP_DIM_VECTOR);
  }

  for (ix = 0; ix < N; ix++)
@ -39,8 +43,8 @@ int main ()
      if(ondev)
 	{
 	  int g = 0;
-	  int w = (ix / 32) % 32;
-	  int v = ix % 32;
+	  int w = (ix / vectorsize) % workersize;
+	  int v = ix % vectorsize;

 	  expected = (g << 16) | (w << 8) | v;
 	}
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c
@ -14,7 +14,8 @@ static unsigned int __attribute__ ((optimize ("O2"))) acc_gang ()
 {
  if (acc_on_device ((int) acc_device_host))
    return 0;
-  else if (acc_on_device ((int) acc_device_nvidia))
+  else if (acc_on_device ((int) acc_device_nvidia)
+	   || acc_on_device ((int) acc_device_radeon))
    return __builtin_goacc_parlevel_id (GOMP_DIM_GANG);
  else
    __builtin_abort ();
@ -25,7 +26,8 @@ static unsigned int __attribute__ ((optimize ("O2"))) acc_worker ()
 {
  if (acc_on_device ((int) acc_device_host))
    return 0;
-  else if (acc_on_device ((int) acc_device_nvidia))
+  else if (acc_on_device ((int) acc_device_nvidia)
+	   || acc_on_device ((int) acc_device_radeon))
    return __builtin_goacc_parlevel_id (GOMP_DIM_WORKER);
  else
    __builtin_abort ();
@ -36,7 +38,8 @@ static unsigned int __attribute__ ((optimize ("O2"))) acc_vector ()
 {
  if (acc_on_device ((int) acc_device_host))
    return 0;
-  else if (acc_on_device ((int) acc_device_nvidia))
+  else if (acc_on_device ((int) acc_device_nvidia)
+	   || acc_on_device ((int) acc_device_radeon))
    return __builtin_goacc_parlevel_id (GOMP_DIM_VECTOR);
  else
    __builtin_abort ();
@ -282,6 +285,12 @@ int main ()
 	  /* The GCC nvptx back end enforces num_workers (32).  */
 	  workers_actual = 32;
 	}
+      else if (acc_on_device (acc_device_radeon))
+	{
+	  /* The GCC GCN back end is limited to num_workers (16).
+	     Temporarily set this to 1 until multiple workers are permitted. */
+	  workers_actual = 1; // 16;
+	}
      else
 	__builtin_abort ();
 #pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
@ -328,6 +337,11 @@ int main ()
 	  /* We're actually executing with num_workers (32).  */
 	  /* workers_actual = 32; */
 	}
+      else if (acc_on_device (acc_device_radeon))
+	{
+	  /* The GCC GCN back end is limited to num_workers (16).  */
+	  workers_actual = 16;
+	}
      else
 	__builtin_abort ();
 #pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
@ -367,6 +381,11 @@ int main ()
 	  /* The GCC nvptx back end enforces vector_length (32).  */
 	  vectors_actual = 1024;
 	}
+      else if (acc_on_device (acc_device_radeon))
+	{
+	  /* The GCC GCN back end enforces vector_length (1): autovectorize. */
+	  vectors_actual = 1;
+	}
      else
 	__builtin_abort ();
 #pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
@ -407,6 +426,13 @@ int main ()
 	  /* The GCC nvptx back end enforces vector_length (32).  */
 	  vectors_actual = 32;
 	}
+      else if (acc_on_device (acc_device_radeon))
+	{
+	  /* Because of the way vectors are implemented for GCN, a vector loop
+	     containing a seq routine call will not vectorize calls to that
+	     routine.  Hence, we'll only get one "vector".  */
+	  vectors_actual = 1;
+	}
      else
 	__builtin_abort ();
 #pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
@ -433,6 +459,9 @@ int main ()
       in the following case.  So, limit ourselves here.  */
    if (acc_get_device_type () == acc_device_nvidia)
      gangs = 3;
+    /* Similar appears to be true for GCN.  */
+    if (acc_get_device_type () == acc_device_radeon)
+      gangs = 3;
    int gangs_actual = gangs;
 #define WORKERS 3
    int workers_actual = WORKERS;
@ -459,6 +488,13 @@ int main ()
 	  /* The GCC nvptx back end enforces vector_length (32).  */
 	  vectors_actual = 32;
 	}
+      else if (acc_on_device (acc_device_radeon))
+	{
+	  /* Temporary setting, until multiple workers are permitted.  */
+	  workers_actual = 1;
+	  /* See above comments about GCN vectors_actual.  */
+	  vectors_actual = 1;
+	}
      else
 	__builtin_abort ();
 #pragma acc loop gang reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-gwv-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-gwv-1.c
@ -30,14 +30,18 @@ int main ()
  int ix;
  int exit = 0;
  int ondev = 0;
+  int gangsize, workersize, vectorsize;

  for (ix = 0; ix < N;ix++)
    ary[ix] = -1;
  
-#pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) copy(ary) copy(ondev)
+#pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) copy(ary) copy(ondev) copyout(gangsize, workersize, vectorsize)
  {
    ondev = acc_on_device (acc_device_not_host);
    gang (ary);
+    gangsize = __builtin_goacc_parlevel_size (GOMP_DIM_GANG);
+    workersize = __builtin_goacc_parlevel_size (GOMP_DIM_WORKER);
+    vectorsize = __builtin_goacc_parlevel_size (GOMP_DIM_VECTOR);
  }

  for (ix = 0; ix < N; ix++)
@ -45,11 +49,12 @@ int main ()
      int expected = ix;
      if(ondev)
 	{
-	  int chunk_size = (N + 32*32*32 - 1) / (32*32*32);
+	  int chunk_size = (N + gangsize * workersize * vectorsize - 1)
+			   / (gangsize * workersize * vectorsize);
 	  
-	  int g = ix / (chunk_size * 32 * 32);
-	  int w = ix / 32 % 32;
-	  int v = ix % 32;
+	  int g = ix / (chunk_size * vectorsize * workersize);
+	  int w = (ix / vectorsize) % workersize;
+	  int v = ix % vectorsize;

 	  expected = (g << 16) | (w << 8) | v;
 	}
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-v-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-v-1.c
@ -30,14 +30,17 @@ int main ()
  int ix;
  int exit = 0;
  int ondev = 0;
+  int vectorsize;

  for (ix = 0; ix < N;ix++)
    ary[ix] = -1;
  
-#pragma acc parallel vector_length(32) copy(ary) copy(ondev)
+#pragma acc parallel vector_length(32) copy(ary) copy(ondev) \
+	    copyout(vectorsize)
  {
    ondev = acc_on_device (acc_device_not_host);
    vector (ary);
+    vectorsize = __builtin_goacc_parlevel_size (GOMP_DIM_VECTOR);
  }

  for (ix = 0; ix < N; ix++)
@ -47,7 +50,7 @@ int main ()
 	{
 	  int g = 0;
 	  int w = 0;
-	  int v = ix % 32;
+	  int v = ix % vectorsize;

 	  expected = (g << 16) | (w << 8) | v;
 	}
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-w-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-w-1.c
@ -30,14 +30,17 @@ int main ()
  int ix;
  int exit = 0;
  int ondev = 0;
+  int workersize;

  for (ix = 0; ix < N;ix++)
    ary[ix] = -1;
  
-#pragma acc parallel num_workers(32) vector_length(32) copy(ary) copy(ondev)
+#pragma acc parallel num_workers(32) vector_length(32) copy(ary) copy(ondev) \
+	    copyout(workersize)
  {
    ondev = acc_on_device (acc_device_not_host);
    worker (ary);
+    workersize = __builtin_goacc_parlevel_size (GOMP_DIM_WORKER);
  }

  for (ix = 0; ix < N; ix++)
@ -46,7 +49,7 @@ int main ()
      if(ondev)
 	{
 	  int g = 0;
-	  int w = ix % 32;
+	  int w = ix % workersize;
 	  int v = 0;

 	  expected = (g << 16) | (w << 8) | v;
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-wv-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-wv-1.c
@ -30,14 +30,18 @@ int main ()
  int ix;
  int exit = 0;
  int ondev = 0;
+  int workersize, vectorsize;

  for (ix = 0; ix < N;ix++)
    ary[ix] = -1;
  
-#pragma acc parallel num_workers(32) vector_length(32) copy(ary) copy(ondev)
+#pragma acc parallel num_workers(32) vector_length(32) copy(ary) copy(ondev) \
+	    copyout(workersize, vectorsize)
  {
    ondev = acc_on_device (acc_device_not_host);
    worker (ary);
+    workersize = __builtin_goacc_parlevel_size (GOMP_DIM_WORKER);
+    vectorsize = __builtin_goacc_parlevel_size (GOMP_DIM_VECTOR);
  }

  for (ix = 0; ix < N; ix++)
@ -46,8 +50,8 @@ int main ()
      if(ondev)
 	{
 	  int g = 0;
-	  int w = (ix / 32) % 32;
-	  int v = ix % 32;
+	  int w = (ix / vectorsize) % workersize;
+	  int v = ix % vectorsize;

 	  expected = (g << 16) | (w << 8) | v;
 	}
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-wv-2.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-wv-2.c
@ -2,8 +2,13 @@
 #include <openacc.h>
 #include <gomp-constants.h>

+#ifdef ACC_DEVICE_TYPE_gcn
+#define NUM_WORKERS 16
+#define NUM_VECTORS 1
+#else
 #define NUM_WORKERS 16
 #define NUM_VECTORS 32
+#endif
 #define WIDTH 64
 #define HEIGHT 32