Decompose OpenACC 'kernels' constructs into parts, a sequence of compute constructs

Not yet enabled by default: for now, the current mode of OpenACC 'kernels'
constructs handling still remains '-fopenacc-kernels=parloops', but that is to
change later.

	gcc/
	* omp-oacc-kernels-decompose.cc: New.
	* Makefile.in (OBJS): Add it.
	* passes.def: Instantiate it.
	* tree-pass.h (make_pass_omp_oacc_kernels_decompose): Declare.
	* flag-types.h (enum openacc_kernels): Add.
	* doc/invoke.texi (-fopenacc-kernels): Document.
	* gimple.h (enum gf_mask): Add
	'GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED',
	'GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE',
	'GF_OMP_TARGET_KIND_OACC_DATA_KERNELS'.
	(is_gimple_omp_oacc, is_gimple_omp_offloaded): Handle these.
	* gimple-pretty-print.c (dump_gimple_omp_target): Likewise.
	* omp-expand.c (expand_omp_target, build_omp_regions_1)
	(omp_make_gimple_edges): Likewise.
	* omp-low.c (scan_sharing_clauses, scan_omp_for)
	(check_omp_nesting_restrictions, lower_oacc_reductions)
	(lower_oacc_head_mark, lower_omp_target): Likewise.
	* omp-offload.c (execute_oacc_device_lower): Likewise.
	gcc/c-family/
	* c.opt (fopenacc-kernels): Add.
	gcc/fortran/
	* lang.opt (fopenacc-kernels): Add.
	gcc/testsuite/
	* c-c++-common/goacc/kernels-decompose-1.c: New.
	* c-c++-common/goacc/kernels-decompose-2.c: New.
	* c-c++-common/goacc/kernels-decompose-ice-1.c: New.
	* c-c++-common/goacc/kernels-decompose-ice-2.c: New.
	* gfortran.dg/goacc/kernels-decompose-1.f95: New.
	* gfortran.dg/goacc/kernels-decompose-2.f95: New.
	* c-c++-common/goacc/if-clause-2.c: Adjust.
	* gfortran.dg/goacc/kernels-tree.f95: Likewise.
	libgomp/
	* testsuite/libgomp.oacc-c-c++-common/declare-vla-kernels-decompose-ice-1.c:
	New.
	* testsuite/libgomp.oacc-c-c++-common/declare-vla-kernels-decompose.c:
	Likewise.
	* testsuite/libgomp.oacc-c-c++-common/kernels-decompose-1.c:
	Likewise.
	* testsuite/libgomp.oacc-c-c++-common/declare-vla.c: Adjust.
	* testsuite/libgomp.oacc-fortran/pr94358-1.f90: Likewise.

Co-authored-by: Thomas Schwinge <thomas@codesourcery.com>
This commit is contained in:
Gergö Barany 2019-02-01 00:59:30 +01:00 committed by Thomas Schwinge
parent bd78857554
commit e898ce7997
26 changed files with 2355 additions and 16 deletions

View file

@ -1480,6 +1480,7 @@ OBJS = \
omp-expand.o \
omp-general.o \
omp-low.o \
omp-oacc-kernels-decompose.o \
omp-simd-clone.o \
opt-problem.o \
optabs.o \

View file

@ -1796,6 +1796,19 @@ fopenacc-dim=
C ObjC C++ ObjC++ LTO Joined Var(flag_openacc_dims)
Specify default OpenACC compute dimensions.
fopenacc-kernels=
C ObjC C++ ObjC++ RejectNegative Joined Enum(openacc_kernels) Var(flag_openacc_kernels) Init(OPENACC_KERNELS_PARLOOPS)
-fopenacc-kernels=[decompose|parloops] Specify mode of OpenACC 'kernels' constructs handling.
Enum
Name(openacc_kernels) Type(enum openacc_kernels)
EnumValue
Enum(openacc_kernels) String(decompose) Value(OPENACC_KERNELS_DECOMPOSE)
EnumValue
Enum(openacc_kernels) String(parloops) Value(OPENACC_KERNELS_PARLOOPS)
fopenmp
C ObjC C++ ObjC++ LTO Var(flag_openmp)
Enable OpenMP (implies -frecursive in Fortran).

View file

@ -201,7 +201,7 @@ in the following sections.
-aux-info @var{filename} -fallow-parameterless-variadic-functions @gol
-fno-asm -fno-builtin -fno-builtin-@var{function} -fgimple@gol
-fhosted -ffreestanding @gol
-fopenacc -fopenacc-dim=@var{geom} @gol
-fopenacc -fopenacc-dim=@var{geom} -fopenacc-kernels=@var{mode} @gol
-fopenmp -fopenmp-simd @gol
-fms-extensions -fplan9-extensions -fsso-struct=@var{endianness} @gol
-fallow-single-precision -fcond-mismatch -flax-vector-conversions @gol
@ -2589,6 +2589,18 @@ not explicitly specify. The @var{geom} value is a triple of
':'-separated sizes, in order 'gang', 'worker' and, 'vector'. A size
can be omitted, to use a target-specific default value.
@item -fopenacc-kernels=@var{mode}
@opindex fopenacc-kernels
@cindex OpenACC accelerator programming
Specify mode of OpenACC `kernels' constructs handling.
With @option{-fopenacc-kernels=decompose}, OpenACC `kernels'
constructs are decomposed into parts, a sequence of compute
constructs, each then handled individually.
This is work in progress.
With @option{-fopenacc-kernels=parloops}, OpenACC `kernels' constructs
are handled by the @samp{parloops} pass, en bloc.
This is the current default.
@item -fopenmp
@opindex fopenmp
@cindex OpenMP parallel

View file

@ -415,6 +415,13 @@ enum evrp_mode
EVRP_MODE_RVRP_DEBUG = EVRP_MODE_RVRP_ONLY | EVRP_MODE_DEBUG
};
/* Modes of OpenACC 'kernels' constructs handling. */
enum openacc_kernels
{
OPENACC_KERNELS_DECOMPOSE,
OPENACC_KERNELS_PARLOOPS
};
#endif
#endif /* ! GCC_FLAG_TYPES_H */

View file

@ -687,6 +687,10 @@ fopenacc-dim=
Fortran LTO Joined Var(flag_openacc_dims)
; Documented in C
fopenacc-kernels=
Fortran RejectNegative Joined Enum(openacc_kernels) Var(flag_openacc_kernels) Init(OPENACC_KERNELS_PARLOOPS)
; Documented in C
fopenmp
Fortran LTO
; Documented in C

View file

@ -1700,6 +1700,15 @@ dump_gimple_omp_target (pretty_printer *buffer, const gomp_target *gs,
case GF_OMP_TARGET_KIND_OACC_HOST_DATA:
kind = " oacc_host_data";
break;
case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED:
kind = " oacc_parallel_kernels_parallelized";
break;
case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE:
kind = " oacc_parallel_kernels_gang_single";
break;
case GF_OMP_TARGET_KIND_OACC_DATA_KERNELS:
kind = " oacc_data_kernels";
break;
default:
gcc_unreachable ();
}

View file

@ -175,6 +175,15 @@ enum gf_mask {
GF_OMP_TARGET_KIND_OACC_ENTER_EXIT_DATA = 10,
GF_OMP_TARGET_KIND_OACC_DECLARE = 11,
GF_OMP_TARGET_KIND_OACC_HOST_DATA = 12,
/* A 'GF_OMP_TARGET_KIND_OACC_PARALLEL' representing an OpenACC 'kernels'
decomposed part, parallelized. */
GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED = 13,
/* A 'GF_OMP_TARGET_KIND_OACC_PARALLEL' representing an OpenACC 'kernels'
decomposed part, "gang-single". */
GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE = 14,
/* A 'GF_OMP_TARGET_KIND_OACC_DATA' representing an OpenACC 'kernels'
decomposed parts' 'data' construct. */
GF_OMP_TARGET_KIND_OACC_DATA_KERNELS = 15,
GF_OMP_TEAMS_HOST = 1 << 0,
/* True on an GIMPLE_OMP_RETURN statement if the return does not require
@ -6511,6 +6520,9 @@ is_gimple_omp_oacc (const gimple *stmt)
case GF_OMP_TARGET_KIND_OACC_ENTER_EXIT_DATA:
case GF_OMP_TARGET_KIND_OACC_DECLARE:
case GF_OMP_TARGET_KIND_OACC_HOST_DATA:
case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED:
case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE:
case GF_OMP_TARGET_KIND_OACC_DATA_KERNELS:
return true;
default:
return false;
@ -6536,6 +6548,8 @@ is_gimple_omp_offloaded (const gimple *stmt)
case GF_OMP_TARGET_KIND_OACC_PARALLEL:
case GF_OMP_TARGET_KIND_OACC_KERNELS:
case GF_OMP_TARGET_KIND_OACC_SERIAL:
case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED:
case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE:
return true;
default:
return false;

View file

@ -9257,11 +9257,14 @@ expand_omp_target (struct omp_region *region)
case GF_OMP_TARGET_KIND_OACC_UPDATE:
case GF_OMP_TARGET_KIND_OACC_ENTER_EXIT_DATA:
case GF_OMP_TARGET_KIND_OACC_DECLARE:
case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED:
case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE:
data_region = false;
break;
case GF_OMP_TARGET_KIND_DATA:
case GF_OMP_TARGET_KIND_OACC_DATA:
case GF_OMP_TARGET_KIND_OACC_HOST_DATA:
case GF_OMP_TARGET_KIND_OACC_DATA_KERNELS:
data_region = true;
break;
default:
@ -9307,6 +9310,16 @@ expand_omp_target (struct omp_region *region)
= tree_cons (get_identifier ("oacc serial"),
NULL_TREE, DECL_ATTRIBUTES (child_fn));
break;
case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED:
DECL_ATTRIBUTES (child_fn)
= tree_cons (get_identifier ("oacc parallel_kernels_parallelized"),
NULL_TREE, DECL_ATTRIBUTES (child_fn));
break;
case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE:
DECL_ATTRIBUTES (child_fn)
= tree_cons (get_identifier ("oacc parallel_kernels_gang_single"),
NULL_TREE, DECL_ATTRIBUTES (child_fn));
break;
default:
/* Make sure we don't miss any. */
gcc_checking_assert (!(is_gimple_omp_oacc (entry_stmt)
@ -9517,10 +9530,13 @@ expand_omp_target (struct omp_region *region)
case GF_OMP_TARGET_KIND_OACC_PARALLEL:
case GF_OMP_TARGET_KIND_OACC_KERNELS:
case GF_OMP_TARGET_KIND_OACC_SERIAL:
case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED:
case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE:
start_ix = BUILT_IN_GOACC_PARALLEL;
break;
case GF_OMP_TARGET_KIND_OACC_DATA:
case GF_OMP_TARGET_KIND_OACC_HOST_DATA:
case GF_OMP_TARGET_KIND_OACC_DATA_KERNELS:
start_ix = BUILT_IN_GOACC_DATA_START;
break;
case GF_OMP_TARGET_KIND_OACC_UPDATE:
@ -9993,6 +10009,9 @@ build_omp_regions_1 (basic_block bb, struct omp_region *parent,
case GF_OMP_TARGET_KIND_OACC_SERIAL:
case GF_OMP_TARGET_KIND_OACC_DATA:
case GF_OMP_TARGET_KIND_OACC_HOST_DATA:
case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED:
case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE:
case GF_OMP_TARGET_KIND_OACC_DATA_KERNELS:
break;
case GF_OMP_TARGET_KIND_UPDATE:
case GF_OMP_TARGET_KIND_ENTER_DATA:
@ -10247,6 +10266,9 @@ omp_make_gimple_edges (basic_block bb, struct omp_region **region,
case GF_OMP_TARGET_KIND_OACC_SERIAL:
case GF_OMP_TARGET_KIND_OACC_DATA:
case GF_OMP_TARGET_KIND_OACC_HOST_DATA:
case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED:
case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE:
case GF_OMP_TARGET_KIND_OACC_DATA_KERNELS:
break;
case GF_OMP_TARGET_KIND_UPDATE:
case GF_OMP_TARGET_KIND_ENTER_DATA:

View file

@ -193,8 +193,8 @@ static tree scan_omp_1_op (tree *, int *, void *);
*handled_ops_p = false; \
break;
/* Return true if CTX corresponds to an OpenACC 'parallel' or 'serial'
region. */
/* Return whether CTX represents an OpenACC 'parallel' or 'serial' construct.
(This doesn't include OpenACC 'kernels' decomposed parts.) */
static bool
is_oacc_parallel_or_serial (omp_context *ctx)
@ -207,7 +207,8 @@ is_oacc_parallel_or_serial (omp_context *ctx)
== GF_OMP_TARGET_KIND_OACC_SERIAL)));
}
/* Return true if CTX corresponds to an oacc kernels region. */
/* Return whether CTX represents an OpenACC 'kernels' construct.
(This doesn't include OpenACC 'kernels' decomposed parts.) */
static bool
is_oacc_kernels (omp_context *ctx)
@ -218,6 +219,21 @@ is_oacc_kernels (omp_context *ctx)
== GF_OMP_TARGET_KIND_OACC_KERNELS));
}
/* Return whether CTX represents an OpenACC 'kernels' decomposed part. */
static bool
is_oacc_kernels_decomposed_part (omp_context *ctx)
{
enum gimple_code outer_type = gimple_code (ctx->stmt);
return ((outer_type == GIMPLE_OMP_TARGET)
&& ((gimple_omp_target_kind (ctx->stmt)
== GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED)
|| (gimple_omp_target_kind (ctx->stmt)
== GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE)
|| (gimple_omp_target_kind (ctx->stmt)
== GF_OMP_TARGET_KIND_OACC_DATA_KERNELS)));
}
/* Return true if STMT corresponds to an OpenMP target region. */
static bool
is_omp_target (gimple *stmt)
@ -1200,6 +1216,8 @@ scan_sharing_clauses (tree clauses, omp_context *ctx)
{
/* No 'reduction' clauses on OpenACC 'kernels'. */
gcc_checking_assert (!is_oacc_kernels (ctx));
/* Likewise, on OpenACC 'kernels' decomposed parts. */
gcc_checking_assert (!is_oacc_kernels_decomposed_part (ctx));
ctx->local_reduction_clauses
= tree_cons (NULL, c, ctx->local_reduction_clauses);
@ -2415,7 +2433,9 @@ enclosing_target_ctx (omp_context *ctx)
return ctx;
}
/* Return true if ctx is part of an oacc kernels region. */
/* Return whether CTX's parent compute construct is an OpenACC 'kernels'
construct.
(This doesn't include OpenACC 'kernels' decomposed parts.) */
static bool
ctx_in_oacc_kernels_region (omp_context *ctx)
@ -2431,7 +2451,8 @@ ctx_in_oacc_kernels_region (omp_context *ctx)
return false;
}
/* Check the parallelism clauses inside a kernels regions.
/* Check the parallelism clauses inside a OpenACC 'kernels' region.
(This doesn't include OpenACC 'kernels' decomposed parts.)
Until kernels handling moves to use the same loop indirection
scheme as parallel, we need to do this checking early. */
@ -2533,6 +2554,10 @@ scan_omp_for (gomp_for *stmt, omp_context *outer_ctx)
if (c_op0)
{
/* By construction, this is impossible for OpenACC 'kernels'
decomposed parts. */
gcc_assert (!(tgt && is_oacc_kernels_decomposed_part (tgt)));
error_at (OMP_CLAUSE_LOCATION (c),
"argument not permitted on %qs clause",
omp_clause_code_name[OMP_CLAUSE_CODE (c)]);
@ -3070,6 +3095,8 @@ check_omp_nesting_restrictions (gimple *stmt, omp_context *ctx)
case GF_OMP_TARGET_KIND_OACC_PARALLEL:
case GF_OMP_TARGET_KIND_OACC_KERNELS:
case GF_OMP_TARGET_KIND_OACC_SERIAL:
case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED:
case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE:
ok = true;
break;
@ -3526,6 +3553,11 @@ check_omp_nesting_restrictions (gimple *stmt, omp_context *ctx)
case GF_OMP_TARGET_KIND_OACC_DECLARE: stmt_name = "declare"; break;
case GF_OMP_TARGET_KIND_OACC_HOST_DATA: stmt_name = "host_data";
break;
case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED:
case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE:
case GF_OMP_TARGET_KIND_OACC_DATA_KERNELS:
/* OpenACC 'kernels' decomposed parts. */
stmt_name = "kernels"; break;
default: gcc_unreachable ();
}
switch (gimple_omp_target_kind (ctx->stmt))
@ -3541,6 +3573,11 @@ check_omp_nesting_restrictions (gimple *stmt, omp_context *ctx)
case GF_OMP_TARGET_KIND_OACC_DATA: ctx_stmt_name = "data"; break;
case GF_OMP_TARGET_KIND_OACC_HOST_DATA:
ctx_stmt_name = "host_data"; break;
case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED:
case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE:
case GF_OMP_TARGET_KIND_OACC_DATA_KERNELS:
/* OpenACC 'kernels' decomposed parts. */
ctx_stmt_name = "kernels"; break;
default: gcc_unreachable ();
}
@ -6930,6 +6967,8 @@ lower_oacc_reductions (location_t loc, tree clauses, tree level, bool inner,
{
/* No 'reduction' clauses on OpenACC 'kernels'. */
gcc_checking_assert (!is_oacc_kernels (ctx));
/* Likewise, on OpenACC 'kernels' decomposed parts. */
gcc_checking_assert (!is_oacc_kernels_decomposed_part (ctx));
tree orig = OMP_CLAUSE_DECL (c);
tree var = maybe_lookup_decl (orig, ctx);
@ -7785,6 +7824,8 @@ lower_oacc_head_mark (location_t loc, tree ddvar, tree clauses,
else if (is_oacc_kernels (tgt))
/* Not using this loops handling inside OpenACC 'kernels' regions. */
gcc_unreachable ();
else if (is_oacc_kernels_decomposed_part (tgt))
;
else
gcc_unreachable ();
@ -7792,6 +7833,14 @@ lower_oacc_head_mark (location_t loc, tree ddvar, tree clauses,
if (!tgt || is_oacc_parallel_or_serial (tgt))
tag |= OLF_INDEPENDENT;
/* Loops inside OpenACC 'kernels' decomposed parts' regions are expected to
have an explicit 'seq' or 'independent' clause, and no 'auto' clause. */
if (tgt && is_oacc_kernels_decomposed_part (tgt))
{
gcc_assert (tag & (OLF_SEQ | OLF_INDEPENDENT));
gcc_assert (!(tag & OLF_AUTO));
}
if (tag & OLF_TILE)
/* Tiling could use all 3 levels. */
levels = 3;
@ -11639,11 +11688,14 @@ lower_omp_target (gimple_stmt_iterator *gsi_p, omp_context *ctx)
case GF_OMP_TARGET_KIND_OACC_UPDATE:
case GF_OMP_TARGET_KIND_OACC_ENTER_EXIT_DATA:
case GF_OMP_TARGET_KIND_OACC_DECLARE:
case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED:
case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE:
data_region = false;
break;
case GF_OMP_TARGET_KIND_DATA:
case GF_OMP_TARGET_KIND_OACC_DATA:
case GF_OMP_TARGET_KIND_OACC_HOST_DATA:
case GF_OMP_TARGET_KIND_OACC_DATA_KERNELS:
data_region = true;
break;
default:
@ -11829,6 +11881,8 @@ lower_omp_target (gimple_stmt_iterator *gsi_p, omp_context *ctx)
{
/* No 'firstprivate' clauses on OpenACC 'kernels'. */
gcc_checking_assert (!is_oacc_kernels (ctx));
/* Likewise, on OpenACC 'kernels' decomposed parts. */
gcc_checking_assert (!is_oacc_kernels_decomposed_part (ctx));
goto oacc_firstprivate;
}
@ -11861,6 +11915,8 @@ lower_omp_target (gimple_stmt_iterator *gsi_p, omp_context *ctx)
{
/* No 'private' clauses on OpenACC 'kernels'. */
gcc_checking_assert (!is_oacc_kernels (ctx));
/* Likewise, on OpenACC 'kernels' decomposed parts. */
gcc_checking_assert (!is_oacc_kernels_decomposed_part (ctx));
break;
}

File diff suppressed because it is too large Load diff

View file

@ -1771,11 +1771,19 @@ execute_oacc_device_lower ()
bool is_oacc_serial
= (lookup_attribute ("oacc serial",
DECL_ATTRIBUTES (current_function_decl)) != NULL);
bool is_oacc_parallel_kernels_parallelized
= (lookup_attribute ("oacc parallel_kernels_parallelized",
DECL_ATTRIBUTES (current_function_decl)) != NULL);
bool is_oacc_parallel_kernels_gang_single
= (lookup_attribute ("oacc parallel_kernels_gang_single",
DECL_ATTRIBUTES (current_function_decl)) != NULL);
int fn_level = oacc_fn_attrib_level (attrs);
bool is_oacc_routine = (fn_level >= 0);
gcc_checking_assert (is_oacc_parallel
+ is_oacc_kernels
+ is_oacc_serial
+ is_oacc_parallel_kernels_parallelized
+ is_oacc_parallel_kernels_gang_single
+ is_oacc_routine
== 1);
@ -1795,6 +1803,12 @@ execute_oacc_device_lower ()
? "parallelized" : "unparallelized"));
else if (is_oacc_serial)
fprintf (dump_file, "Function is OpenACC serial offload\n");
else if (is_oacc_parallel_kernels_parallelized)
fprintf (dump_file, "Function is %s OpenACC kernels offload\n",
"parallel_kernels_parallelized");
else if (is_oacc_parallel_kernels_gang_single)
fprintf (dump_file, "Function is %s OpenACC kernels offload\n",
"parallel_kernels_gang_single");
else if (is_oacc_routine)
fprintf (dump_file, "Function is OpenACC routine level %d\n",
fn_level);
@ -1838,6 +1852,11 @@ execute_oacc_device_lower ()
fprintf (dump_file, "]\n");
}
/* Verify that for OpenACC 'kernels' decomposed "gang-single" parts we launch
a single gang only. */
if (is_oacc_parallel_kernels_gang_single)
gcc_checking_assert (dims[GOMP_DIM_GANG] == 1);
oacc_loop_process (loops);
if (dump_file)
{

View file

@ -34,6 +34,7 @@ along with GCC; see the file COPYING3. If not see
NEXT_PASS (pass_warn_unused_result);
NEXT_PASS (pass_diagnose_omp_blocks);
NEXT_PASS (pass_diagnose_tm_blocks);
NEXT_PASS (pass_omp_oacc_kernels_decompose);
NEXT_PASS (pass_lower_omp);
NEXT_PASS (pass_lower_cf);
NEXT_PASS (pass_lower_tm);

View file

@ -1,11 +1,21 @@
/* { dg-additional-options "-fdump-tree-gimple" } */
/* { dg-additional-options "-fopenacc-kernels=decompose" }
{ dg-additional-options "-fdump-tree-omp_oacc_kernels_decompose" } */
void
f (short c)
{
#pragma acc parallel if(c)
;
#pragma acc kernels if(c)
;
#pragma acc data if(c)
;
#pragma acc update device(c) if(c)
#pragma acc parallel if(c) copy(c)
++c;
#pragma acc kernels if(c) copy(c)
/* { dg-final { scan-tree-dump-times {(?n)#pragma omp target oacc_kernels map\(tofrom:c \[len: [0-9]+\]\) if\(_[0-9]+\)$} 1 "gimple" } } */
/* { dg-final { scan-tree-dump-times {(?n)#pragma omp target oacc_data_kernels map\(tofrom:c \[len: [0-9]+\]\) if\(_[0-9]+\)$} 1 "omp_oacc_kernels_decompose" } }
{ dg-final { scan-tree-dump-times {(?n)#pragma omp target oacc_parallel_kernels_gang_single async\(-1\) num_gangs\(1\) map\(force_present:c \[len: [0-9]+\]\) if\(_[0-9]+\)$} 1 "omp_oacc_kernels_decompose" } } */
++c;
#pragma acc data if(c) copy(c)
++c;
#pragma acc update if(c) device(c)
}

View file

@ -0,0 +1,83 @@
/* Test OpenACC 'kernels' construct decomposition. */
/* { dg-additional-options "-fopt-info-omp-all" } */
/* { dg-additional-options "-fdump-tree-gimple" } */
/* { dg-additional-options "-fopenacc-kernels=decompose" }
{ dg-additional-options "-fdump-tree-omp_oacc_kernels_decompose" } */
/* See also '../../gfortran.dg/goacc/kernels-decompose-1.f95'. */
#define N 1024
unsigned int a[N];
int
main (void)
{
int i;
unsigned int sum = 1;
#pragma acc kernels copyin(a[0:N]) copy(sum)
/* { dg-bogus "optimized: assigned OpenACC seq loop parallelism" "TODO" { xfail *-*-* } .-1 }
TODO Is this maybe the report that belongs to the XFAILed report further down? */
{
#pragma acc loop /* { dg-line l_loop_i[incr c_loop_i] } */
/* { dg-message "note: forwarded loop nest in OpenACC 'kernels' region to 'parloops' for analysis" "" { target *-*-* } l_loop_i$c_loop_i } */
/* { dg-optimized "assigned OpenACC seq loop parallelism" "" { target *-*-* } l_loop_i$c_loop_i } */
for (i = 0; i < N; ++i)
sum += a[i];
sum++; /* { dg-message "note: beginning 'gang-single' part in OpenACC 'kernels' region" } */
a[0]++;
#pragma acc loop independent /* { dg-line l_loop_i[incr c_loop_i] } */
/* { dg-message "note: parallelized loop nest in OpenACC 'kernels' region" "" { target *-*-* } l_loop_i$c_loop_i } */
/* { dg-optimized "assigned OpenACC gang vector loop parallelism" "" { target *-*-* } l_loop_i$c_loop_i } */
for (i = 0; i < N; ++i)
sum += a[i];
if (sum > 10) /* { dg-message "note: beginning 'parloops' part in OpenACC 'kernels' region" } */
{
#pragma acc loop /* { dg-line l_loop_i[incr c_loop_i] } */
/* { dg-missed "unparallelized loop nest in OpenACC 'kernels' region: it's executed conditionally" "" { target *-*-* } l_loop_i$c_loop_i } */
/*TODO { dg-optimized "assigned OpenACC seq loop parallelism" "TODO" { xfail *-*-* } l_loop_i$c_loop_i } */
for (i = 0; i < N; ++i)
sum += a[i];
}
#pragma acc loop auto /* { dg-line l_loop_i[incr c_loop_i] } */
/* { dg-message "note: forwarded loop nest in OpenACC 'kernels' region to 'parloops' for analysis" "" { target *-*-* } l_loop_i$c_loop_i } */
/* { dg-optimized "assigned OpenACC seq loop parallelism" "" { target *-*-* } l_loop_i$c_loop_i } */
for (i = 0; i < N; ++i)
sum += a[i];
}
return 0;
}
/* { dg-final { scan-tree-dump-times {(?n)#pragma omp target oacc_kernels map\(tofrom:sum \[len: [0-9]+\]\) map\(to:a\[0\] \[len: [0-9]+\]\) map\(firstprivate:a \[pointer assign, bias: 0\]\)$} 1 "gimple" } }
{ dg-final { scan-tree-dump-times {(?n)#pragma acc loop private\(i\)$} 2 "gimple" } }
{ dg-final { scan-tree-dump-times {(?n)#pragma acc loop independent private\(i\)$} 1 "gimple" } }
{ dg-final { scan-tree-dump-times {(?n)#pragma acc loop auto private\(i\)$} 1 "gimple" } }
{ dg-final { scan-tree-dump-times {(?n)#pragma acc loop} 4 "gimple" } } */
/* Check that the OpenACC 'kernels' got decomposed into 'data' and an enclosed
sequence of compute constructs.
{ dg-final { scan-tree-dump-times {(?n)#pragma omp target oacc_data_kernels map\(tofrom:sum \[len: [0-9]+\]\) map\(to:a\[0\] \[len: [0-9]+\]\)$} 1 "omp_oacc_kernels_decompose" } }
As noted above, we get three "old-style" kernel regions, one gang-single region, and one parallelized loop region.
{ dg-final { scan-tree-dump-times {(?n)#pragma omp target oacc_kernels async\(-1\) map\(force_present:sum \[len: [0-9]+\]\) map\(force_present:a\[0\] \[len: [0-9]+\]\) map\(firstprivate:a \[pointer assign, bias: 0\]\)$} 3 "omp_oacc_kernels_decompose" } }
{ dg-final { scan-tree-dump-times {(?n)#pragma omp target oacc_parallel_kernels_parallelized async\(-1\) map\(force_present:sum \[len: [0-9]+\]\) map\(force_present:a\[0\] \[len: [0-9]+\]\) map\(firstprivate:a \[pointer assign, bias: 0\]\)$} 1 "omp_oacc_kernels_decompose" } }
{ dg-final { scan-tree-dump-times {(?n)#pragma omp target oacc_parallel_kernels_gang_single async\(-1\) num_gangs\(1\) map\(force_present:sum \[len: [0-9]+\]\) map\(force_present:a\[0\] \[len: [0-9]+\]\) map\(firstprivate:a \[pointer assign, bias: 0\]\)$} 1 "omp_oacc_kernels_decompose" } }
'data' plus five CCs.
{ dg-final { scan-tree-dump-times {(?n)#pragma omp target } 6 "omp_oacc_kernels_decompose" } }
{ dg-final { scan-tree-dump-times {(?n)#pragma acc loop private\(i\)$} 2 "omp_oacc_kernels_decompose" } }
{ dg-final { scan-tree-dump-times {(?n)#pragma acc loop independent private\(i\)$} 1 "omp_oacc_kernels_decompose" } }
{ dg-final { scan-tree-dump-times {(?n)#pragma acc loop auto private\(i\)$} 1 "omp_oacc_kernels_decompose" } }
{ dg-final { scan-tree-dump-times {(?n)#pragma acc loop} 4 "omp_oacc_kernels_decompose" } }
Each of the parallel regions is async, and there is a final call to
__builtin_GOACC_wait.
{ dg-final { scan-tree-dump-times "__builtin_GOACC_wait" 1 "omp_oacc_kernels_decompose" } } */

View file

@ -0,0 +1,141 @@
/* Test OpenACC 'kernels' construct decomposition. */
/* { dg-additional-options "-fopt-info-omp-all" } */
/* { dg-additional-options "-fopenacc-kernels=decompose" }
/* { dg-additional-options "-O2" } for 'parloops'. */
/* See also '../../gfortran.dg/goacc/kernels-decompose-2.f95'. */
#pragma acc routine gang
extern int
f_g (int);
#pragma acc routine worker
extern int
f_w (int);
#pragma acc routine vector
extern int
f_v (int);
#pragma acc routine seq
extern int
f_s (int);
int
main ()
{
int x, y, z;
#define N 10
int a[N], b[N], c[N];
#pragma acc kernels
{
x = 0; /* { dg-message "note: beginning 'gang-single' part in OpenACC 'kernels' region" } */
y = x < 10;
z = x++;
;
}
{ /*TODO Instead of using 'for (int i = 0; [...])', move 'int i' outside, to work around for ICE detailed in 'kernels-decompose-ice-1.c'. */
int i;
#pragma acc kernels /* { dg-optimized "assigned OpenACC gang loop parallelism" } */
for (i = 0; i < N; i++) /* { dg-message "note: beginning 'parloops' part in OpenACC 'kernels' region" } */
a[i] = 0;
}
#pragma acc kernels loop /* { dg-line l_loop_i[incr c_loop_i] } */
/* { dg-message "note: forwarded loop nest in OpenACC 'kernels' region to 'parloops' for analysis" "" { target *-*-* } l_loop_i$c_loop_i } */
/* { dg-optimized "assigned OpenACC seq loop parallelism" "" { target *-*-* } l_loop_i$c_loop_i } */
for (int i = 0; i < N; i++)
b[i] = a[N - i - 1];
#pragma acc kernels
{
#pragma acc loop /* { dg-line l_loop_i[incr c_loop_i] } */
/* { dg-message "note: forwarded loop nest in OpenACC 'kernels' region to 'parloops' for analysis" "" { target *-*-* } l_loop_i$c_loop_i } */
/* { dg-optimized "assigned OpenACC seq loop parallelism" "" { target *-*-* } l_loop_i$c_loop_i } */
for (int i = 0; i < N; i++)
b[i] = a[N - i - 1];
#pragma acc loop /* { dg-line l_loop_i[incr c_loop_i] } */
/* { dg-message "note: forwarded loop nest in OpenACC 'kernels' region to 'parloops' for analysis" "" { target *-*-* } l_loop_i$c_loop_i } */
/* { dg-optimized "assigned OpenACC seq loop parallelism" "" { target *-*-* } l_loop_i$c_loop_i } */
for (int i = 0; i < N; i++)
c[i] = a[i] * b[i];
a[z] = 0; /* { dg-message "note: beginning 'gang-single' part in OpenACC 'kernels' region" } */
#pragma acc loop /* { dg-line l_loop_i[incr c_loop_i] } */
/* { dg-message "note: forwarded loop nest in OpenACC 'kernels' region to 'parloops' for analysis" "" { target *-*-* } l_loop_i$c_loop_i } */
/* { dg-optimized "assigned OpenACC seq loop parallelism" "" { target *-*-* } l_loop_i$c_loop_i } */
for (int i = 0; i < N; i++)
c[i] += a[i];
#pragma acc loop seq /* { dg-line l_loop_i[incr c_loop_i] } */
/* { dg-message "note: parallelized loop nest in OpenACC 'kernels' region" "" { target *-*-* } l_loop_i$c_loop_i } */
/* { dg-optimized "assigned OpenACC seq loop parallelism" "" { target *-*-* } l_loop_i$c_loop_i } */
for (int i = 0 + 1; i < N; i++)
c[i] += c[i - 1];
}
#pragma acc kernels
/*TODO What does this mean?
TODO { dg-optimized "assigned OpenACC worker vector loop parallelism" "" { target *-*-* } .-2 } */
{
#pragma acc loop independent /* { dg-line l_loop_i[incr c_loop_i] } */
/* { dg-optimized "assigned OpenACC gang loop parallelism" "" { target *-*-* } l_loop_i$c_loop_i } */
/* { dg-message "note: parallelized loop nest in OpenACC 'kernels' region" "" { target *-*-* } l_loop_i$c_loop_i } */
for (int i = 0; i < N; ++i)
#pragma acc loop independent /* { dg-line l_loop_j[incr c_loop_j] } */
/* { dg-optimized "assigned OpenACC worker loop parallelism" "" { target *-*-* } l_loop_j$c_loop_j } */
for (int j = 0; j < N; ++j)
#pragma acc loop independent /* { dg-line l_loop_k[incr c_loop_k] } */
/* { dg-warning "insufficient partitioning available to parallelize loop" "" { target *-*-* } l_loop_k$c_loop_k } */
/* { dg-optimized "assigned OpenACC seq loop parallelism" "" { target *-*-* } l_loop_k$c_loop_k } */
for (int k = 0; k < N; ++k)
a[(i + j + k) % N]
= b[j]
+ f_v (c[k]); /* { dg-optimized "assigned OpenACC vector loop parallelism" } */
/*TODO Should the following turn into "gang-single" instead of "parloops"?
TODO The problem is that the first STMT is 'if (y <= 4) goto <D.2547>; else goto <D.2548>;', thus "parloops". */
if (y < 5) /* { dg-message "note: beginning 'parloops' part in OpenACC 'kernels' region" } */
#pragma acc loop independent /* { dg-line l_loop_j[incr c_loop_j] } */
/* { dg-missed "unparallelized loop nest in OpenACC 'kernels' region: it's executed conditionally" "" { target *-*-* } l_loop_j$c_loop_j } */
for (int j = 0; j < N; ++j)
b[j] = f_w (c[j]);
}
#pragma acc kernels
{
y = f_g (a[5]); /* { dg-line l_part[incr c_part] } */
/*TODO If such a construct is placed in its own part (like it is, here), can't this actually use gang paralelism, instead of "gang-single"?
{ dg-message "note: beginning 'gang-single' part in OpenACC 'kernels' region" "" { target *-*-* } l_part$c_part } */
/* { dg-optimized "assigned OpenACC gang worker vector loop parallelism" "" { target *-*-* } l_part$c_part } */
#pragma acc loop independent /* { dg-line l_loop_j[incr c_loop_j] } */
/* { dg-message "note: parallelized loop nest in OpenACC 'kernels' region" "" { target *-*-* } l_loop_j$c_loop_j } */
/* { dg-optimized "assigned OpenACC gang loop parallelism" "" { target *-*-* } l_loop_j$c_loop_j } */
for (int j = 0; j < N; ++j)
b[j] = y + f_w (c[j]); /* { dg-optimized "assigned OpenACC worker vector loop parallelism" } */
}
#pragma acc kernels
{
y = 3; /* { dg-message "note: beginning 'gang-single' part in OpenACC 'kernels' region" } */
#pragma acc loop independent /* { dg-line l_loop_j[incr c_loop_j] } */
/* { dg-message "note: parallelized loop nest in OpenACC 'kernels' region" "" { target *-*-* } l_loop_j$c_loop_j } */
/* { dg-optimized "assigned OpenACC gang worker loop parallelism" "" { target *-*-* } l_loop_j$c_loop_j } */
for (int j = 0; j < N; ++j)
b[j] = y + f_v (c[j]); /* { dg-optimized "assigned OpenACC vector loop parallelism" } */
z = 2; /* { dg-message "note: beginning 'gang-single' part in OpenACC 'kernels' region" } */
}
#pragma acc kernels /* { dg-message "note: beginning 'gang-single' part in OpenACC 'kernels' region" } */
;
return 0;
}

View file

@ -0,0 +1,108 @@
/* Test OpenACC 'kernels' construct decomposition. */
/* { dg-additional-options "-fopt-info-omp-all" } */
/* { dg-additional-options "-fopenacc-kernels=decompose" } */
/* { dg-ice "TODO" }
{ dg-prune-output "during GIMPLE pass: omplower" } */
/* Reduced from 'kernels-decompose-2.c'.
(Hopefully) similar instances:
- 'libgomp.oacc-c-c++-common/declare-vla-kernels-decompose-ice-1.c'
- 'libgomp.oacc-c-c++-common/kernels-decompose-1.c'
*/
int
main ()
{
#define N 10
#pragma acc kernels
for (int i = 0; i < N; i++) /* { dg-message "note: beginning 'parloops' part in OpenACC 'kernels' region" } */
;
return 0;
}
/*
In 'gimple' we've got:
main ()
{
int D.2087;
{
int a[10];
try
{
#pragma omp target oacc_kernels map(tofrom:a [len: 40])
{
{
int i;
i = 0;
goto <D.2085>;
[...]
..., which in 'omp_oacc_kernels_decompose' we turn into:
main ()
{
int D.2087;
{
int a[10];
try
{
#pragma omp target oacc_data_kernels map(tofrom:a [len: 40])
{
try
{
{
int i;
#pragma omp target oacc_data_kernels map(alloc:i [len: 4])
{
try
{
{
#pragma omp target oacc_kernels async(-1) map(force_present:i [len: 4]) map(force_present:a [len: 40])
{
i = 0;
goto <D.2085>;
[...]
..., which results in ICE in:
#1 0x0000000000d2247b in lower_omp_target (gsi_p=gsi_p@entry=0x7fffffffbc90, ctx=ctx@entry=0x2c994c0) at [...]/gcc/omp-low.c:11981
11981 gcc_assert (offloaded);
(gdb) list
11976 talign = TYPE_ALIGN_UNIT (TREE_TYPE (TREE_TYPE (ovar)));
11977 gimplify_assign (x, var, &ilist);
11978 }
11979 else if (is_gimple_reg (var))
11980 {
11981 gcc_assert (offloaded);
11982 tree avar = create_tmp_var (TREE_TYPE (var));
11983 mark_addressable (avar);
11984 enum gomp_map_kind map_kind = OMP_CLAUSE_MAP_KIND (c);
11985 if (GOMP_MAP_COPY_TO_P (map_kind)
(gdb) call debug_tree(var)
<var_decl 0x7ffff7feebd0 i
type <integer_type 0x7ffff67be5e8 int sizes-gimplified public SI
size <integer_cst 0x7ffff67a5f18 constant 32>
unit-size <integer_cst 0x7ffff67a5f30 constant 4>
align:32 warn_if_not_align:0 symtab:0 alias-set -1 canonical-type 0x7ffff67be5e8 precision:32 min <integer_cst 0x7ffff67a5ed0 -2147483648> max <integer_cst 0x7ffff67a5ee8 2147483647>
pointer_to_this <pointer_type 0x7ffff67c69d8>>
used read SI [...]:15:12 size <integer_cst 0x7ffff67a5f18 32> unit-size <integer_cst 0x7ffff67a5f30 4>
align:32 warn_if_not_align:0 context <function_decl 0x7ffff68eea00 main>>
Just defusing the 'assert' is not sufficient:
libgomp: present clause: !acc_is_present (0x7ffe29cba3ec, 4 (0x4))
TODO Can't the 'omp_oacc_kernels_decompose' transformation be much simpler, such that we avoid the intermediate 'data' if we've got just one compute construct inside it?
TODO But it's not clear if that'd just resolve one simple instance of the general problem?
*/

View file

@ -0,0 +1,16 @@
/* Test OpenACC 'kernels' construct decomposition. */
/* { dg-additional-options "-fopenacc-kernels=decompose" } */
/* { dg-ice "TODO" }
{ dg-prune-output "during GIMPLE pass: omp_oacc_kernels_decompose" } */
/* Reduced from 'kernels-decompose-ice-1.c'. */
int
main ()
{
#pragma acc kernels
{
int i;
}
}

View file

@ -0,0 +1,81 @@
! Test OpenACC 'kernels' construct decomposition.
! { dg-additional-options "-fopt-info-omp-all" }
! { dg-additional-options "-fdump-tree-gimple" }
! { dg-additional-options "-fopenacc-kernels=decompose" }
! { dg-additional-options "-fdump-tree-omp_oacc_kernels_decompose" }
! See also '../../c-c++-common/goacc/kernels-decompose-1.c'.
program main
implicit none
integer, parameter :: N = 1024
integer, dimension (1:N) :: a
integer :: i, sum
!$acc kernels copyin(a(1:N)) copy(sum)
! { dg-bogus "optimized: assigned OpenACC seq loop parallelism" "TODO" { xfail *-*-* } .-1 }
!TODO Is this maybe the report that belongs to the XFAILed report further down? */
!$acc loop ! { dg-line l_loop_i[incr c_loop_i] }
! { dg-message "note: forwarded loop nest in OpenACC 'kernels' region to 'parloops' for analysis" "" { target *-*-* } l_loop_i$c_loop_i }
! { dg-optimized "assigned OpenACC seq loop parallelism" "" { target *-*-* } l_loop_i$c_loop_i }
do i = 1, N
sum = sum + a(i)
end do
sum = sum + 1 ! { dg-message "note: beginning 'gang-single' part in OpenACC 'kernels' region" }
a(1) = a(1) + 1
!$acc loop independent ! { dg-line l_loop_i[incr c_loop_i] }
! { dg-message "note: parallelized loop nest in OpenACC 'kernels' region" "" { target *-*-* } l_loop_i$c_loop_i }
! { dg-optimized "assigned OpenACC gang vector loop parallelism" "" { target *-*-* } l_loop_i$c_loop_i }
do i = 1, N
sum = sum + a(i)
end do
if (sum .gt. 10) then ! { dg-message "note: beginning 'parloops' part in OpenACC 'kernels' region" }
!$acc loop ! { dg-line l_loop_i[incr c_loop_i] }
! { dg-missed "unparallelized loop nest in OpenACC 'kernels' region: it's executed conditionally" "" { target *-*-* } l_loop_i$c_loop_i }
!TODO { dg-optimized "assigned OpenACC seq loop parallelism" "TODO" { xfail *-*-* } l_loop_i$c_loop_i }
do i = 1, N
sum = sum + a(i)
end do
end if
!$acc loop auto ! { dg-line l_loop_i[incr c_loop_i] }
! { dg-message "note: forwarded loop nest in OpenACC 'kernels' region to 'parloops' for analysis" "" { target *-*-* } l_loop_i$c_loop_i }
! { dg-optimized "assigned OpenACC seq loop parallelism" "" { target *-*-* } l_loop_i$c_loop_i }
do i = 1, N
sum = sum + a(i)
end do
!$acc end kernels
end program main
! { dg-final { scan-tree-dump-times {(?n)#pragma omp target oacc_kernels map\(to:a\[_[0-9]+\] \[len: _[0-9]+\]\) map\(alloc:a \[pointer assign, bias: _[0-9]+\]\) map\(tofrom:sum \[len: [0-9]+\]\)$} 1 "gimple" } }
! { dg-final { scan-tree-dump-times {(?n)#pragma acc loop private\(i\)$} 2 "gimple" } }
! { dg-final { scan-tree-dump-times {(?n)#pragma acc loop private\(i\) independent$} 1 "gimple" } }
! { dg-final { scan-tree-dump-times {(?n)#pragma acc loop private\(i\) auto$} 1 "gimple" } }
! { dg-final { scan-tree-dump-times {(?n)#pragma acc loop} 4 "gimple" } }
! Check that the OpenACC 'kernels' got decomposed into 'data' and an enclosed
! sequence of compute constructs.
! { dg-final { scan-tree-dump-times {(?n)#pragma omp target oacc_data_kernels map\(to:a\[_[0-9]+\] \[len: _[0-9]+\]\) map\(tofrom:sum \[len: [0-9]+\]\)$} 1 "omp_oacc_kernels_decompose" } }
! As noted above, we get three "old-style" kernel regions, one gang-single region, and one parallelized loop region.
! { dg-final { scan-tree-dump-times {(?n)#pragma omp target oacc_kernels async\(-1\) map\(force_present:a\[_[0-9]+\] \[len: _[0-9]+\]\) map\(alloc:a \[pointer assign, bias: _[0-9]+\]\) map\(force_present:sum \[len: [0-9]+\]\)$} 3 "omp_oacc_kernels_decompose" } }
! { dg-final { scan-tree-dump-times {(?n)#pragma omp target oacc_parallel_kernels_parallelized async\(-1\) map\(force_present:a\[_[0-9]+\] \[len: _[0-9]+\]\) map\(alloc:a \[pointer assign, bias: _[0-9]+\]\) map\(force_present:sum \[len: [0-9]+\]\)$} 1 "omp_oacc_kernels_decompose" } }
! { dg-final { scan-tree-dump-times {(?n)#pragma omp target oacc_parallel_kernels_gang_single async\(-1\) num_gangs\(1\) map\(force_present:a\[_[0-9]+\] \[len: _[0-9]+\]\) map\(alloc:a \[pointer assign, bias: _[0-9]+\]\) map\(force_present:sum \[len: [0-9]+\]\)$} 1 "omp_oacc_kernels_decompose" } }
!
! 'data' plus five CCs.
! { dg-final { scan-tree-dump-times {(?n)#pragma omp target } 6 "omp_oacc_kernels_decompose" } }
! { dg-final { scan-tree-dump-times {(?n)#pragma acc loop private\(i\)$} 2 "omp_oacc_kernels_decompose" } }
! { dg-final { scan-tree-dump-times {(?n)#pragma acc loop private\(i\) independent$} 1 "omp_oacc_kernels_decompose" } }
! { dg-final { scan-tree-dump-times {(?n)#pragma acc loop private\(i\) auto} 1 "omp_oacc_kernels_decompose" } }
! { dg-final { scan-tree-dump-times {(?n)#pragma acc loop} 4 "omp_oacc_kernels_decompose" } }
! Each of the parallel regions is async, and there is a final call to
! __builtin_GOACC_wait.
! { dg-final { scan-tree-dump-times "__builtin_GOACC_wait" 1 "omp_oacc_kernels_decompose" } }

View file

@ -0,0 +1,142 @@
! Test OpenACC 'kernels' construct decomposition.
! { dg-additional-options "-fopt-info-omp-all" }
! { dg-additional-options "-fopenacc-kernels=decompose" }
! { dg-additional-options "-O2" } for 'parloops'.
! See also '../../c-c++-common/goacc/kernels-decompose-2.c'.
program main
implicit none
integer, external :: f_g
!$acc routine (f_g) gang
integer, external :: f_w
!$acc routine (f_w) worker
integer, external :: f_v
!$acc routine (f_v) vector
integer, external :: f_s
!$acc routine (f_s) seq
integer :: i, j, k
integer :: x, y, z
logical :: y_l
integer, parameter :: N = 10
integer :: a(N), b(N), c(N)
!$acc kernels
x = 0 ! { dg-message "note: beginning 'gang-single' part in OpenACC 'kernels' region" }
y = 0
y_l = x < 10
z = x
x = x + 1
;
!$acc end kernels
!$acc kernels ! { dg-optimized "assigned OpenACC gang loop parallelism" }
do i = 1, N ! { dg-message "note: beginning 'parloops' part in OpenACC 'kernels' region" }
a(i) = 0
end do
!$acc end kernels
!$acc kernels loop ! { dg-line l_loop_i[incr c_loop_i] }
! { dg-message "note: forwarded loop nest in OpenACC 'kernels' region to 'parloops' for analysis" "" { target *-*-* } l_loop_i$c_loop_i }
! { dg-optimized "assigned OpenACC seq loop parallelism" "" { target *-*-* } l_loop_i$c_loop_i }
do i = 1, N
b(i) = a(N - i + 1)
end do
!$acc kernels
!$acc loop ! { dg-line l_loop_i[incr c_loop_i] }
! { dg-message "note: forwarded loop nest in OpenACC 'kernels' region to 'parloops' for analysis" "" { target *-*-* } l_loop_i$c_loop_i }
! { dg-optimized "assigned OpenACC seq loop parallelism" "" { target *-*-* } l_loop_i$c_loop_i }
do i = 1, N
b(i) = a(N - i + 1)
end do
!$acc loop ! { dg-line l_loop_i[incr c_loop_i] }
! { dg-message "note: forwarded loop nest in OpenACC 'kernels' region to 'parloops' for analysis" "" { target *-*-* } l_loop_i$c_loop_i }
! { dg-optimized "assigned OpenACC seq loop parallelism" "" { target *-*-* } l_loop_i$c_loop_i }
do i = 1, N
c(i) = a(i) * b(i)
end do
a(z) = 0 ! { dg-message "note: beginning 'gang-single' part in OpenACC 'kernels' region" }
!$acc loop ! { dg-line l_loop_i[incr c_loop_i] }
! { dg-message "note: forwarded loop nest in OpenACC 'kernels' region to 'parloops' for analysis" "" { target *-*-* } l_loop_i$c_loop_i }
! { dg-optimized "assigned OpenACC seq loop parallelism" "" { target *-*-* } l_loop_i$c_loop_i }
do i = 1, N
c(i) = c(i) + a(i)
end do
!$acc loop seq ! { dg-line l_loop_i[incr c_loop_i] }
! { dg-message "note: parallelized loop nest in OpenACC 'kernels' region" "" { target *-*-* } l_loop_i$c_loop_i }
! { dg-optimized "assigned OpenACC seq loop parallelism" "" { target *-*-* } l_loop_i$c_loop_i }
do i = 1 + 1, N
c(i) = c(i) + c(i - 1)
end do
!$acc end kernels
!$acc kernels
!TODO What does this mean?
!TODO { dg-optimized "assigned OpenACC worker vector loop parallelism" "" { target *-*-* } .-2 }
!$acc loop independent ! { dg-line l_loop_i[incr c_loop_i] }
! { dg-message "note: parallelized loop nest in OpenACC 'kernels' region" "" { target *-*-* } l_loop_i$c_loop_i }
! { dg-optimized "assigned OpenACC gang loop parallelism" "" { target *-*-* } l_loop_i$c_loop_i }
do i = 1, N
!$acc loop independent ! { dg-line l_loop_j[incr c_loop_j] }
! { dg-optimized "assigned OpenACC worker loop parallelism" "" { target *-*-* } l_loop_j$c_loop_j }
do j = 1, N
!$acc loop independent ! { dg-line l_loop_k[incr c_loop_k] }
! { dg-warning "insufficient partitioning available to parallelize loop" "" { target *-*-* } l_loop_k$c_loop_k }
! { dg-optimized "assigned OpenACC seq loop parallelism" "" { target *-*-* } l_loop_k$c_loop_k }
do k = 1, N
a(1 + mod(i + j + k, N)) &
= b(j) &
+ f_v (c(k)) ! { dg-optimized "assigned OpenACC vector loop parallelism" }
end do
end do
end do
!TODO Should the following turn into "gang-single" instead of "parloops"?
!TODO The problem is that the first STMT is 'if (y <= 4) goto <D.2547>; else goto <D.2548>;', thus "parloops".
if (y < 5) then ! { dg-message "note: beginning 'parloops' part in OpenACC 'kernels' region" }
!$acc loop independent ! { dg-line l_loop_j[incr c_loop_j] }
! { dg-missed "unparallelized loop nest in OpenACC 'kernels' region: it's executed conditionally" "" { target *-*-* } l_loop_j$c_loop_j }
do j = 1, N
b(j) = f_w (c(j))
end do
end if
!$acc end kernels
!$acc kernels
y = f_g (a(5)) ! { dg-line l_part[incr c_part] }
!TODO If such a construct is placed in its own part (like it is, here), can't this actually use gang paralelism, instead of "gang-single"?
! { dg-message "note: beginning 'gang-single' part in OpenACC 'kernels' region" "" { target *-*-* } l_part$c_part }
! { dg-optimized "assigned OpenACC gang worker vector loop parallelism" "" { target *-*-* } l_part$c_part }
!$acc loop independent ! { dg-line l_loop_j[incr c_loop_j] }
! { dg-message "note: parallelized loop nest in OpenACC 'kernels' region" "" { target *-*-* } l_loop_j$c_loop_j }
! { dg-optimized "assigned OpenACC gang loop parallelism" "" { target *-*-* } l_loop_j$c_loop_j }
do j = 1, N
b(j) = y + f_w (c(j)) ! { dg-optimized "assigned OpenACC worker vector loop parallelism" }
end do
!$acc end kernels
!$acc kernels
y = 3 ! { dg-message "note: beginning 'gang-single' part in OpenACC 'kernels' region" }
!$acc loop independent ! { dg-line l_loop_j[incr c_loop_j] }
! { dg-message "note: parallelized loop nest in OpenACC 'kernels' region" "" { target *-*-* } l_loop_j$c_loop_j }
! { dg-optimized "assigned OpenACC gang worker loop parallelism" "" { target *-*-* } l_loop_j$c_loop_j }
do j = 1, N
b(j) = y + f_v (c(j)) ! { dg-optimized "assigned OpenACC vector loop parallelism" }
end do
z = 2 ! { dg-message "note: beginning 'gang-single' part in OpenACC 'kernels' region" }
!$acc end kernels
!$acc kernels ! { dg-message "note: beginning 'gang-single' part in OpenACC 'kernels' region" }
!$acc end kernels
end program main

View file

@ -1,5 +1,7 @@
! { dg-do compile }
! { dg-additional-options "-fdump-tree-original" }
! { dg-additional-options "-fopenacc-kernels=decompose" }
! { dg-additional-options "-fdump-tree-omp_oacc_kernels_decompose" }
program test
implicit none
@ -34,3 +36,6 @@ end program test
! { dg-final { scan-tree-dump-times "map\\(alloc:t\\)" 1 "original" } }
! { dg-final { scan-tree-dump-times "map\\(force_deviceptr:u\\)" 1 "original" } }
! { dg-final { scan-tree-dump-times {(?n)#pragma omp target oacc_data_kernels if\(D\.[0-9]+\)$} 1 "omp_oacc_kernels_decompose" } }
! { dg-final { scan-tree-dump-times {(?n)#pragma omp target oacc_parallel_kernels_gang_single num_gangs\(1\) if\(D\.[0-9]+\) async\(-1\)$} 1 "omp_oacc_kernels_decompose" } }

View file

@ -416,6 +416,7 @@ extern gimple_opt_pass *make_pass_lower_switch (gcc::context *ctxt);
extern gimple_opt_pass *make_pass_lower_switch_O0 (gcc::context *ctxt);
extern gimple_opt_pass *make_pass_lower_vector (gcc::context *ctxt);
extern gimple_opt_pass *make_pass_lower_vector_ssa (gcc::context *ctxt);
extern gimple_opt_pass *make_pass_omp_oacc_kernels_decompose (gcc::context *ctxt);
extern gimple_opt_pass *make_pass_lower_omp (gcc::context *ctxt);
extern gimple_opt_pass *make_pass_diagnose_omp_blocks (gcc::context *ctxt);
extern gimple_opt_pass *make_pass_expand_omp (gcc::context *ctxt);

View file

@ -0,0 +1,8 @@
/* { dg-additional-options "-fopenacc-kernels=decompose" } */
/* Hopefully, this is the same issue as '../../../gcc/testsuite/c-c++-common/goacc/kernels-decompose-ice-1.c'.
{ dg-ice "TODO" }
TODO { dg-prune-output "during GIMPLE pass: omplower" }
TODO { dg-do link } */
#undef KERNELS_DECOMPOSE_ICE_HACK
#include "declare-vla.c"

View file

@ -0,0 +1,6 @@
/* { dg-additional-options "-fopenacc-kernels=decompose" } */
/* See also 'declare-vla-kernels-decompose-ice-1.c'. */
#define KERNELS_DECOMPOSE_ICE_HACK
#include "declare-vla.c"

View file

@ -38,6 +38,12 @@ f_data (void)
for (i = 0; i < N; i++)
A[i] = -i;
/* See 'declare-vla-kernels-decompose.c'. */
#ifdef KERNELS_DECOMPOSE_ICE_HACK
(volatile int *) &i;
(volatile int *) &N;
#endif
# pragma acc kernels
for (i = 0; i < N; i++)
A[i] = i;

View file

@ -0,0 +1,38 @@
/* Test OpenACC 'kernels' construct decomposition. */
/* { dg-additional-options "-fopt-info-omp-all" } */
/* { dg-additional-options "-fopenacc-kernels=decompose" } */
#undef NDEBUG
#include <assert.h>
int main()
{
int a = 0;
/*TODO Without making 'a' addressable, for GCN offloading we will not see the expected value copied out. (But it does work for nvptx offloading, strange...) */
(volatile int *) &a;
#define N 123
int b[N] = { 0 };
#pragma acc kernels
{
int c = 234; /* { dg-message "note: beginning 'gang-single' part in OpenACC 'kernels' region" } */
/*TODO Hopefully, this is the same issue as '../../../gcc/testsuite/c-c++-common/goacc/kernels-decompose-ice-1.c'. */
(volatile int *) &c;
#pragma acc loop independent gang /* { dg-line l_loop_i[incr c_loop_i] } */
/* { dg-message "note: parallelized loop nest in OpenACC 'kernels' region" "" { target *-*-* } l_loop_i$c_loop_i } */
/* { dg-optimized "assigned OpenACC gang loop parallelism" "" { target *-*-* } l_loop_i$c_loop_i } */
for (int i = 0; i < N; ++i)
b[i] = c;
a = c; /* { dg-message "note: beginning 'gang-single' part in OpenACC 'kernels' region" } */
}
for (int i = 0; i < N; ++i)
assert (b[i] == 234);
assert (a == 234);
return 0;
}

View file

@ -1,17 +1,22 @@
! { dg-do run }
! { dg-additional-options "-fopt-info-omp-all" }
! { dg-additional-options "-fopenacc-kernels=decompose" }
subroutine kernel(lo, hi, a, b, c)
implicit none
integer :: lo, hi, i
real, dimension(lo:hi) :: a, b, c
!$acc kernels copyin(lo, hi) ! { dg-optimized "assigned OpenACC seq loop parallelism" }
!$acc loop independent
!$acc kernels copyin(lo, hi)
!$acc loop independent ! { dg-line l_loop_i[incr c_loop_i] }
! { dg-message "note: parallelized loop nest in OpenACC 'kernels' region" "" { target *-*-* } l_loop_i$c_loop_i }
! { dg-optimized "assigned OpenACC gang vector loop parallelism" "" { target *-*-* } l_loop_i$c_loop_i }
do i = lo, hi
b(i) = a(i)
end do
!$acc loop independent
!$acc loop independent ! { dg-line l_loop_i[incr c_loop_i] }
! { dg-message "note: parallelized loop nest in OpenACC 'kernels' region" "" { target *-*-* } l_loop_i$c_loop_i }
! { dg-optimized "assigned OpenACC gang vector loop parallelism" "" { target *-*-* } l_loop_i$c_loop_i }
do i = lo, hi
c(i) = b(i)
end do