New pass: loop flattening.

2010-09-09  Sebastian Pop  <sebastian.pop@amd.com>

	* Makefile.in (OBJS-common): Add graphite-flattening.o.
	(graphite-flattening.o): New rule.
	* common.opt (floop-flatten): New flag.
	* doc/invoke.texi (-floop-flatten): Documented.
	* graphite-flattening.c: New.
	* graphite-poly.c (apply_poly_transforms): Call flatten_all_loops.
	* graphite-poly.h (flatten_all_loops): Declared.
	(lst_remove_loop_and_inline_stmts_in_loop_father): New.
	* tree-ssa-loop.c (gate_graphite_transforms): When flag_loop_flatten
	is set, also set flag_graphite.

From-SVN: r164804
This commit is contained in:
Sebastian Pop 2010-09-30 21:20:45 +00:00 committed by Sebastian Pop
parent c498b9b997
commit 98af4c9ffe
9 changed files with 525 additions and 7 deletions

View file

@ -1,3 +1,16 @@
2010-09-30 Sebastian Pop <sebastian.pop@amd.com>
* Makefile.in (OBJS-common): Add graphite-flattening.o.
(graphite-flattening.o): New rule.
* common.opt (floop-flatten): New flag.
* doc/invoke.texi (-floop-flatten): Documented.
* graphite-flattening.c: New.
* graphite-poly.c (apply_poly_transforms): Call flatten_all_loops.
* graphite-poly.h (flatten_all_loops): Declared.
(lst_remove_loop_and_inline_stmts_in_loop_father): New.
* tree-ssa-loop.c (gate_graphite_transforms): When flag_loop_flatten
is set, also set flag_graphite.
2010-09-30 Sebastian Pop <sebastian.pop@amd.com>
* graphite-poly.c (cloog_checksum): New.

View file

@ -1,3 +1,16 @@
2010-09-09 Sebastian Pop <sebastian.pop@amd.com>
* Makefile.in (OBJS-common): Add graphite-flattening.o.
(graphite-flattening.o): New rule.
* common.opt (floop-flatten): New flag.
* doc/invoke.texi (-floop-flatten): Documented.
* graphite-flattening.c: New.
* graphite-poly.c (apply_poly_transforms): Call flatten_all_loops.
* graphite-poly.h (flatten_all_loops): Declared.
(lst_remove_loop_and_inline_stmts_in_loop_father): New.
* tree-ssa-loop.c (gate_graphite_transforms): When flag_loop_flatten
is set, also set flag_graphite.
2010-09-09 Sebastian Pop <sebastian.pop@amd.com>
* graphite-poly.c (cloog_checksum): New.

View file

@ -1244,6 +1244,7 @@ OBJS-common = \
graphite-clast-to-gimple.o \
graphite-cloog-util.o \
graphite-dependences.o \
graphite-flattening.o \
graphite-interchange.o \
graphite-poly.o \
graphite-ppl.o \
@ -2695,6 +2696,12 @@ graphite-dependences.o: graphite-dependences.c $(CONFIG_H) $(SYSTEM_H) \
$(TOPLEV_H) $(DIAGNOSTIC_CORE_H) $(TREE_FLOW_H) $(TREE_DUMP_H) $(TIMEVAR_H) $(CFGLOOP_H) \
$(GIMPLE_H) $(TREE_DATA_REF_H) tree-pass.h domwalk.h \
graphite.h graphite-poly.h graphite-ppl.h graphite-dependences.h
graphite-flattening.o: graphite-flattening.c $(CONFIG_H) $(SYSTEM_H) \
coretypes.h $(TM_H) $(GGC_H) $(TREE_H) $(RTL_H) output.h \
$(BASIC_BLOCK_H) $(DIAGNOSTIC_H) $(TOPLEV_H) $(TREE_FLOW_H) \
$(TREE_DUMP_H) $(TIMEVAR_H) $(CFGLOOP_H) $(GIMPLE_H) \
$(TREE_DATA_REF_H) tree-pass.h domwalk.h value-prof.h graphite.h \
graphite-poly.h graphite-ppl.h
graphite-interchange.o: graphite-interchange.c $(CONFIG_H) $(SYSTEM_H) \
coretypes.h \
$(TM_H) $(GGC_H) $(TREE_H) $(RTL_H) output.h $(BASIC_BLOCK_H) $(DIAGNOSTIC_H) \

View file

@ -870,6 +870,10 @@ floop-block
Common Report Var(flag_loop_block) Optimization
Enable Loop Blocking transformation
floop-flatten
Common Report Var(flag_loop_flatten) Optimization
Enable Loop Flattening transformation
fstrict-volatile-bitfields
Common Report Var(flag_strict_volatile_bitfields) Init(-1)
Force bitfield accesses to match their type width

View file

@ -352,7 +352,7 @@ Objective-C and Objective-C++ Dialects}.
-fira-loop-pressure -fno-ira-share-save-slots @gol
-fno-ira-share-spill-slots -fira-verbose=@var{n} @gol
-fivopts -fkeep-inline-functions -fkeep-static-consts @gol
-floop-block -floop-interchange -floop-strip-mine @gol
-floop-block -floop-flatten -floop-interchange -floop-strip-mine @gol
-floop-parallelize-all -flto -flto-compression-level -flto-report @gol
-fltrans -fltrans-output-list -fmerge-all-constants -fmerge-constants @gol
-fmodulo-sched -fmodulo-sched-allow-regmoves -fmove-loop-invariants @gol
@ -6798,6 +6798,7 @@ Perform linear loop transformations on tree. This flag can improve cache
performance and allow further loop optimizations to take place.
@item -floop-interchange
@opindex floop-interchange
Perform loop interchange transformations on loops. Interchanging two
nested loops switches the inner and outer loops. For example, given a
loop like:
@ -6826,6 +6827,7 @@ with @option{--with-ppl} and @option{--with-cloog} to enable the
Graphite loop transformation infrastructure.
@item -floop-strip-mine
@opindex floop-strip-mine
Perform loop strip mining transformations on loops. Strip mining
splits a loop into two nested loops. The outer loop has strides
equal to the strip size and the inner loop has strides of the
@ -6851,6 +6853,7 @@ be configured with @option{--with-ppl} and @option{--with-cloog} to
enable the Graphite loop transformation infrastructure.
@item -floop-block
@opindex floop-block
Perform loop blocking transformations on loops. Blocking strip mines
each loop in the loop nest such that the memory accesses of the
element loops fit inside caches. The strip length can be changed
@ -6892,7 +6895,14 @@ GIMPLE -> GRAPHITE -> GIMPLE transformation. Some minimal optimizations
are also performed by the code generator CLooG, like index splitting and
dead code elimination in loops.
@item -floop-flatten
@opindex floop-flatten
Removes the loop nesting structure: transforms the loop nest into a
single loop. This transformation can be useful to vectorize all the
levels of the loop nest.
@item -floop-parallelize-all
@opindex floop-parallelize-all
Use the Graphite data dependence analysis to identify loops that can
be parallelized. Parallelize all the loops that can be analyzed to
not contain loop carried dependences without checking that it is

442
gcc/graphite-flattening.c Normal file
View file

@ -0,0 +1,442 @@
/* Loop flattening for Graphite.
Copyright (C) 2010 Free Software Foundation, Inc.
Contributed by Sebastian Pop <sebastian.pop@amd.com>.
This file is part of GCC.
GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.
GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3. If not see
<http://www.gnu.org/licenses/>. */
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tm.h"
#include "ggc.h"
#include "tree.h"
#include "rtl.h"
#include "output.h"
#include "basic-block.h"
#include "diagnostic.h"
#include "tree-flow.h"
#include "toplev.h"
#include "tree-dump.h"
#include "timevar.h"
#include "cfgloop.h"
#include "tree-chrec.h"
#include "tree-data-ref.h"
#include "tree-scalar-evolution.h"
#include "tree-pass.h"
#include "domwalk.h"
#include "value-prof.h"
#include "pointer-set.h"
#include "gimple.h"
#include "params.h"
#ifdef HAVE_cloog
#include "ppl_c.h"
#include "sese.h"
#include "graphite-ppl.h"
#include "graphite.h"
#include "graphite-poly.h"
/* The loop flattening pass transforms loop nests into a single loop,
removing the loop nesting structure. The auto-vectorization can
then apply on the full loop body, without needing the outer-loop
vectorization.
The canonical example is as follows: suppose that we have a loop
nest with known iteration counts
| for (i = 1; i <= 6; i++)
| for (j = 1; j <= 6; j++)
| S1(i,j);
The loop flattening is performed by linearizing the iteration space
using the function "f (x) = 6 * i + j". In this case, CLooG would
produce this code:
| for (c1=7;c1<=42;c1++) {
| i = floord(c1-1,6);
| S1(i,c1-6*i);
| }
There are several limitations for loop flattening that are linked
to the expressivity of the polyhedral model. One has to take an
upper bound approximation to deal with the parametric case of loop
flattening. For example, in the loop nest:
| for (i = 1; i <= N; i++)
| for (j = 1; j <= M; j++)
| S1(i,j);
One would like to flatten this loop using a linearization function
like this "f (x) = M * i + j". However CLooG's schedules are not
expressive enough to deal with this case, and so the parameter M
has to be replaced by an integer upper bound approximation. If we
further know in the context of the scop that "M <= 6", then it is
possible to linearize the loop with "f (x) = 6 * i + j". In this
case, CLooG would produce this code:
| for (c1=7;c1<=6*M+N;c1++) {
| i = ceild(c1-N,6);
| if (i <= floord(c1-1,6)) {
| S1(i,c1-6*i);
| }
| }
For arbitrarily complex loop nests the algorithm proceeds in two
steps. First, the LST is flattened by removing the loop structure
and by inserting the statements in the order they appear in
depth-first order. Then, the scattering of each statement is
transformed such that it is expressed in the iteration domain of
the enclosing outer loop.
Supposing that the original program is represented by the following
LST:
| (loop_1
| stmt_1
| (loop_2 stmt_3
| (loop_3 stmt_4)
| (loop_4 stmt_5 stmt_6)
| stmt_7
| )
| stmt_2
| )
Loop flattening traverses the LST in depth-first order, and
flattens pairs of loops successively by projecting the inner loops
in the iteration domain of the outer loops:
lst_project_loop (loop_2, loop_3, stride)
| (loop_1
| stmt_1
| (loop_2 stmt_3 stmt_4
| (loop_4 stmt_5 stmt_6)
| stmt_7
| )
| stmt_2
| )
lst_project_loop (loop_2, loop_4, stride)
| (loop_1
| stmt_1
| (loop_2 stmt_3 stmt_4 stmt_5 stmt_6 stmt_7)
| stmt_2
| )
lst_project_loop (loop_1, loop_2, stride)
| (loop_1
| stmt_1 stmt_3 stmt_4 stmt_5 stmt_6 stmt_7 stmt_2
| )
At each step, the iteration domain of the outer loop is enlarged to
contain enough points to iterate over the inner loop domain. */
/* Stores in RES the number of iterations of the linearized loop LST,
   i.e. the cardinal of the iteration domain of LST.  The linearized
   counts of the loops directly nested in LST are summed up; when LST
   is itself a loop, that sum is multiplied by the number of
   iterations of LST, or replaced by it when the sum is zero.  */

static void
lst_linearized_niter (lst_p lst, mpz_t res)
{
  int ix;
  lst_p child;
  mpz_t count;

  mpz_init (count);
  mpz_set_si (res, 0);

  /* Accumulate the linearized sizes of the nested loops; plain
     statements do not contribute.  */
  FOR_EACH_VEC_ELT (lst_p, LST_SEQ (lst), ix, child)
    {
      if (!LST_LOOP_P (child))
        continue;

      lst_linearized_niter (child, count);
      mpz_add (res, res, count);
    }

  if (LST_LOOP_P (lst))
    {
      lst_niter_for_loop (lst, count);

      /* An innermost loop (no nested iterations) counts for its own
         number of iterations.  */
      if (mpz_cmp_si (res, 0) == 0)
        mpz_set (res, count);
      else
        mpz_mul (res, res, count);
    }

  mpz_clear (count);
}
/* Applies the translation "f (x) = x + OFFSET" to the loop containing
   STMT: the dynamic scattering dimension of the loop father of STMT is
   shifted by OFFSET in the transformed scattering of the PBB of
   STMT.  */

static void
lst_offset (lst_p stmt, mpz_t offset)
{
  lst_p inner = LST_LOOP_FATHER (stmt);
  poly_bb_p pbb = LST_PBB (stmt);
  ppl_Polyhedron_t poly = PBB_TRANSFORMED_SCATTERING (pbb);
  int inner_depth = lst_depth (inner);
  ppl_dimension_type inner_dim = psct_dynamic_dim (pbb, inner_depth);
  ppl_Linear_Expression_t expr;
  ppl_dimension_type dim;
  ppl_Coefficient_t one;
  mpz_t x;

  /* Build the denominator "1" used by the affine image.  */
  mpz_init (x);
  mpz_set_si (x, 1);
  ppl_new_Coefficient (&one);
  ppl_assign_Coefficient_from_mpz_t (one, x);

  ppl_Polyhedron_space_dimension (poly, &dim);
  ppl_new_Linear_Expression_with_dimension (&expr, dim);

  /* inner_dim = inner_dim + offset.  */
  ppl_set_coef (expr, inner_dim, 1);
  ppl_set_inhomogeneous_gmp (expr, offset);
  ppl_Polyhedron_affine_image (poly, inner_dim, expr, one);

  ppl_delete_Linear_Expression (expr);
  /* Release the temporary, as the sibling lst_scale does; it used to
     be leaked here.  */
  mpz_clear (x);
  ppl_delete_Coefficient (one);
}
/* Scale by FACTOR the loop LST containing STMT: in the transformed
   scattering of the PBB of STMT, the dynamic dimension at the depth
   of LST is multiplied by FACTOR.  */

static void
lst_scale (lst_p lst, lst_p stmt, mpz_t factor)
{
  poly_bb_p pbb = LST_PBB (stmt);
  ppl_Polyhedron_t scattering = PBB_TRANSFORMED_SCATTERING (pbb);
  int depth = lst_depth (lst);
  ppl_dimension_type scaled_dim = psct_dynamic_dim (pbb, depth);
  ppl_dimension_type nb_dims;
  ppl_Linear_Expression_t image;
  ppl_Coefficient_t denominator;
  mpz_t unit;

  /* The affine image is taken with a denominator of 1.  */
  mpz_init (unit);
  mpz_set_si (unit, 1);
  ppl_new_Coefficient (&denominator);
  ppl_assign_Coefficient_from_mpz_t (denominator, unit);

  ppl_Polyhedron_space_dimension (scattering, &nb_dims);
  ppl_new_Linear_Expression_with_dimension (&image, nb_dims);

  /* scaled_dim = factor * scaled_dim.  */
  ppl_set_coef_gmp (image, scaled_dim, factor);
  ppl_Polyhedron_affine_image (scattering, scaled_dim, image, denominator);

  ppl_delete_Linear_Expression (image);
  mpz_clear (unit);
  ppl_delete_Coefficient (denominator);
}
/* Project the INNER loop into the iteration domain of the OUTER loop.
   STRIDE is the number of iterations between two iterations of the
   outer loop.  For every statement of INNER, the outer dynamic
   dimension becomes "outer * STRIDE + inner", and the dynamic
   dimension of INNER together with the static schedule of its body
   are removed from the transformed scattering.  INNER must only
   contain statements, not loops.  */

static void
lst_project_loop (lst_p outer, lst_p inner, mpz_t stride)
{
  int i;
  lst_p stmt;
  mpz_t x;
  ppl_Coefficient_t one;
  int outer_depth = lst_depth (outer);
  int inner_depth = lst_depth (inner);

  /* Build the denominator "1" used by the affine images.  */
  mpz_init (x);
  mpz_set_si (x, 1);
  ppl_new_Coefficient (&one);
  ppl_assign_Coefficient_from_mpz_t (one, x);

  FOR_EACH_VEC_ELT (lst_p, LST_SEQ (inner), i, stmt)
    {
      poly_bb_p pbb;
      ppl_Polyhedron_t poly;
      ppl_dimension_type outer_dim, inner_dim;
      ppl_Linear_Expression_t expr;
      ppl_dimension_type dim;
      /* The two dimensions to remove; a fixed-size local array avoids
         a heap allocation per statement.  */
      ppl_dimension_type ds[2];

      /* There should be no loops under INNER.  Check this before
         reading the PBB of STMT, which is only meaningful for
         statements.  */
      gcc_assert (!LST_LOOP_P (stmt));

      pbb = LST_PBB (stmt);
      poly = PBB_TRANSFORMED_SCATTERING (pbb);
      outer_dim = psct_dynamic_dim (pbb, outer_depth);
      inner_dim = psct_dynamic_dim (pbb, inner_depth);

      ppl_Polyhedron_space_dimension (poly, &dim);
      ppl_new_Linear_Expression_with_dimension (&expr, dim);

      /* outer_dim = outer_dim * stride + inner_dim.  */
      ppl_set_coef (expr, inner_dim, 1);
      ppl_set_coef_gmp (expr, outer_dim, stride);
      ppl_Polyhedron_affine_image (poly, outer_dim, expr, one);
      ppl_delete_Linear_Expression (expr);

      /* Project on inner_dim: the image is the zero expression.  */
      ppl_new_Linear_Expression_with_dimension (&expr, dim - 1);
      ppl_Polyhedron_affine_image (poly, inner_dim, expr, one);
      ppl_delete_Linear_Expression (expr);

      /* Remove inner loop and the static schedule of its body.  */
      ds[0] = inner_dim;
      ds[1] = inner_dim + 1;
      ppl_Polyhedron_remove_space_dimensions (poly, ds, 2);
      PBB_NB_SCATTERING_TRANSFORM (pbb) -= 2;
    }

  mpz_clear (x);
  ppl_delete_Coefficient (one);
}
/* Flattens the loop nest LST. Return true when something changed,
that is when at least one loop was directly nested in LST.
INIT_OFFSET is used to compute the number of iterations of the
outermost loop before the current LST is executed; it is copied and
left unmodified. */
static bool
lst_flatten_loop (lst_p lst, mpz_t init_offset)
{
int i;
lst_p l;
bool res = false;
mpz_t n, one, offset, stride;
mpz_init (n);
mpz_init (one);
mpz_init (offset);
mpz_init (stride);
/* Work on a private copy of the offset: it is increased below each
time a nested loop is projected into LST. */
mpz_set (offset, init_offset);
mpz_set_si (one, 1);
/* STRIDE is the linearized number of iterations of LST divided
(truncating division) by the number of iterations of LST itself. */
lst_linearized_niter (lst, stride);
lst_niter_for_loop (lst, n);
mpz_tdiv_q (stride, stride, n);
FOR_EACH_VEC_ELT (lst_p, LST_SEQ (lst), i, l)
if (LST_LOOP_P (l))
{
res = true;
/* Flatten the nested loop first; its return value is not needed as
RES is already true. N is then reused for the number of iterations
of the nested loop L. */
lst_flatten_loop (l, offset);
lst_niter_for_loop (l, n);
lst_project_loop (lst, l, stride);
/* The offset is the number of iterations minus 1, as we want
to execute the next statements at the same iteration as the
last iteration of the loop. */
mpz_sub (n, n, one);
mpz_add (offset, offset, n);
}
else
{
/* A statement directly in LST: scale the dimension of LST by STRIDE
and shift it by the offset accumulated so far, if any. */
lst_scale (lst, l, stride);
if (mpz_cmp_si (offset, 0) != 0)
lst_offset (l, offset);
}
/* Second pass: remove the nested loops from the tree and inline
their statements in LST.
NOTE(review): the callee removes from and inserts into LST_SEQ (lst)
while it is being iterated here -- confirm that skipping the element
that ends up in slot I is intended. */
FOR_EACH_VEC_ELT (lst_p, LST_SEQ (lst), i, l)
if (LST_LOOP_P (l))
lst_remove_loop_and_inline_stmts_in_loop_father (l);
mpz_clear (n);
mpz_clear (one);
mpz_clear (offset);
mpz_clear (stride);
return res;
}
/* Remove all but the first 3 dimensions of the scattering of every
   statement in LST:
   - dim0: the static schedule for the loop
   - dim1: the dynamic schedule of the loop
   - dim2: the static schedule for the loop body.
   LST must have been flattened: it may only contain statements.
   Statements whose scattering has 3 dimensions or fewer are left
   untouched.  */

static void
remove_unused_scattering_dimensions (lst_p lst)
{
  int i;
  lst_p stmt;

  FOR_EACH_VEC_ELT (lst_p, LST_SEQ (lst), i, stmt)
    {
      poly_bb_p pbb;
      ppl_Polyhedron_t poly;
      int j, nb_dims_to_remove;
      ppl_dimension_type *ds;

      /* There should be no loops inside LST after flattening.  Check
         this before reading the PBB of STMT.  */
      gcc_assert (!LST_LOOP_P (stmt));

      pbb = LST_PBB (stmt);
      poly = PBB_TRANSFORMED_SCATTERING (pbb);
      nb_dims_to_remove = PBB_NB_SCATTERING_TRANSFORM (pbb) - 3;

      /* Nothing to remove; a negative count must never reach the
         allocation or the PPL call below.  */
      if (nb_dims_to_remove <= 0)
        continue;

      ds = XNEWVEC (ppl_dimension_type, nb_dims_to_remove);

      /* Dimensions 3, 4, ... are the ones to drop.  */
      for (j = 0; j < nb_dims_to_remove; j++)
        ds[j] = j + 3;

      ppl_Polyhedron_remove_space_dimensions (poly, ds, nb_dims_to_remove);
      PBB_NB_SCATTERING_TRANSFORM (pbb) -= nb_dims_to_remove;
      free (ds);
    }
}
/* Flattens all the loop nests of LST: every loop directly contained
   in LST is flattened with a zero initial offset and its statements
   lose their unused scattering dimensions, then the scattering of LST
   is updated.  Return true when something changed.  Nothing is done,
   and false is returned, when LST is NULL or is not a loop.  */

static bool
lst_do_flatten (lst_p lst)
{
  int ix;
  lst_p nest;
  bool changed = false;
  mpz_t no_offset;

  if (!lst)
    return false;

  if (!LST_LOOP_P (lst))
    return false;

  mpz_init (no_offset);
  mpz_set_si (no_offset, 0);

  FOR_EACH_VEC_ELT (lst_p, LST_SEQ (lst), ix, nest)
    {
      if (!LST_LOOP_P (nest))
        continue;

      if (lst_flatten_loop (nest, no_offset))
        changed = true;

      remove_unused_scattering_dimensions (nest);
    }

  lst_update_scattering (lst);
  mpz_clear (no_offset);

  return changed;
}
/* Entry point of the loop flattening pass: flatten all the loop nests
   of the transformed schedule of SCOP.  Returns true when something
   changed.  */

bool
flatten_all_loops (scop_p scop)
{
  lst_p schedule = SCOP_TRANSFORMED_SCHEDULE (scop);

  return lst_do_flatten (schedule);
}
#endif

View file

@ -783,6 +783,9 @@ apply_poly_transforms (scop_p scop)
transform_done |= scop_do_interchange (scop);
}
if (flag_loop_flatten)
transform_done |= flatten_all_loops (scop);
/* This feature is only enabled in the Graphite branch. */
if (0)
{
@ -1688,7 +1691,8 @@ pbb_number_of_iterations_at_time (poly_bb_p pbb,
ppl_delete_Constraint_System (cs);
}
/* Compute the lower bound on the original iteration domain. */
/* Compute the lower bound on the original iteration domain and add
it to the scattering. */
ppl_new_Pointset_Powerset_C_Polyhedron_from_C_Polyhedron
(&sctr_lb, PBB_TRANSFORMED_SCATTERING (pbb));
for (i = 0; i < (int) domain_dim; i++)

View file

@ -414,6 +414,7 @@ extern void debug_iteration_domains (scop_p, int);
extern bool scop_do_interchange (scop_p);
extern bool scop_do_strip_mine (scop_p);
extern bool scop_do_block (scop_p);
extern bool flatten_all_loops (scop_p);
extern void pbb_number_of_iterations_at_time (poly_bb_p, graphite_dim_t, mpz_t);
extern void pbb_remove_duplicate_pdrs (poly_bb_p);
@ -944,7 +945,7 @@ find_lst_loop (lst_p stmt, int loop_depth)
return loop;
}
/* Return the first lst representing a PBB statement in LST. */
/* Return the first LST representing a PBB statement in LST. */
static inline lst_p
lst_find_first_pbb (lst_p lst)
@ -968,7 +969,7 @@ lst_find_first_pbb (lst_p lst)
return NULL;
}
/* Returns true when LST is a loop that does not contains
/* Returns true when LST is a loop that does not contain
statements. */
static inline bool
@ -977,7 +978,7 @@ lst_empty_p (lst_p lst)
return !lst_find_first_pbb (lst);
}
/* Return the last lst representing a PBB statement in LST. */
/* Return the last LST representing a PBB statement in LST. */
static inline lst_p
lst_find_last_pbb (lst_p lst)
@ -1061,6 +1062,26 @@ lst_remove_from_sequence (lst_p lst)
LST_LOOP_FATHER (lst) = NULL;
}
/* Removes the loop LST from the sequence of its loop father and
   inlines its body in the father, at the position LST occupied and
   in the same order.  LST must not be NULL and must have a loop
   father.  */

static inline void
lst_remove_loop_and_inline_stmts_in_loop_father (lst_p lst)
{
  lst_p l, father;
  int i, dewey;

  /* Check LST before it is dereferenced to find its father.  */
  gcc_assert (lst);

  father = LST_LOOP_FATHER (lst);
  dewey = lst_dewey_number (lst);

  gcc_assert (father && dewey >= 0);

  /* Detach LST from its father.  */
  VEC_ordered_remove (lst_p, LST_SEQ (father), dewey);
  LST_LOOP_FATHER (lst) = NULL;

  /* Insert the elements of LST in the slot it occupied.  */
  FOR_EACH_VEC_ELT (lst_p, LST_SEQ (lst), i, l)
    {
      VEC_safe_insert (lst_p, heap, LST_SEQ (father), dewey + i, l);
      LST_LOOP_FATHER (l) = father;
    }
}
/* Sets NITER to the upper bound approximation of the number of
iterations of loop LST. */

View file

@ -303,8 +303,12 @@ gate_graphite_transforms (void)
{
/* Enable -fgraphite pass if any one of the graphite optimization flags
is turned on. */
if (flag_loop_block || flag_loop_interchange || flag_loop_strip_mine
|| flag_graphite_identity || flag_loop_parallelize_all)
if (flag_loop_block
|| flag_loop_interchange
|| flag_loop_strip_mine
|| flag_graphite_identity
|| flag_loop_parallelize_all
|| flag_loop_flatten)
flag_graphite = 1;
return flag_graphite != 0;