tree-loop-distribution.c (INCLUDE_ALGORITHM): New header file.

* tree-loop-distribution.c (INCLUDE_ALGORITHM): New header file.
	(tree-ssa-loop-ivopts.h): New header file.
	(struct builtin_info): New fields.
	(classify_builtin_1): Compute and record base and offset parts for
	memset builtin partition by calling strip_offset.
	(offset_cmp, fuse_memset_builtins): New functions.
	(finalize_partitions): Fuse adjacent memset partitions by calling
	above function.
	* tree-ssa-loop-ivopts.c (strip_offset): Delete static declaration.
	Expose the interface.
	* tree-ssa-loop-ivopts.h (strip_offset): New declaration.

	* gcc.dg/tree-ssa/ldist-17.c: Adjust test string.
	* gcc.dg/tree-ssa/ldist-32.c: New test.
	* gcc.dg/tree-ssa/ldist-35.c: New test.
	* gcc.dg/tree-ssa/ldist-36.c: New test.

From-SVN: r253859
This commit is contained in:
Bin Cheng 2017-10-18 16:04:16 +00:00 committed by Bin Cheng
parent 85aa9ed64b
commit 957f0d8faf
9 changed files with 232 additions and 7 deletions

View file

@ -1,3 +1,17 @@
2017-10-18 Bin Cheng <bin.cheng@arm.com>
* tree-loop-distribution.c (INCLUDE_ALGORITHM): New header file.
(tree-ssa-loop-ivopts.h): New header file.
(struct builtin_info): New fields.
(classify_builtin_1): Compute and record base and offset parts for
memset builtin partition by calling strip_offset.
(offset_cmp, fuse_memset_builtins): New functions.
(finalize_partitions): Fuse adjacent memset partitions by calling
above function.
* tree-ssa-loop-ivopts.c (strip_offset): Delete static declaration.
Expose the interface.
* tree-ssa-loop-ivopts.h (strip_offset): New declaration.
2017-10-18 Bin Cheng <bin.cheng@arm.com>
PR tree-optimization/82574

View file

@ -1,3 +1,10 @@
2017-10-18 Bin Cheng <bin.cheng@arm.com>
* gcc.dg/tree-ssa/ldist-17.c: Adjust test string.
* gcc.dg/tree-ssa/ldist-32.c: New test.
* gcc.dg/tree-ssa/ldist-35.c: New test.
* gcc.dg/tree-ssa/ldist-36.c: New test.
2017-10-18 Bin Cheng <bin.cheng@arm.com>
PR tree-optimization/82574

View file

@ -45,5 +45,5 @@ mad_synth_mute (struct mad_synth *synth)
return;
}
/* { dg-final { scan-tree-dump "distributed: split to 0 loops and 4 library calls" "ldist" } } */
/* { dg-final { scan-tree-dump-times "generated memset zero" 4 "ldist" } } */
/* { dg-final { scan-tree-dump "Loop nest . distributed: split to 0 loops and 1 library calls" "ldist" } } */
/* { dg-final { scan-tree-dump-times "generated memset zero" 1 "ldist" } } */

View file

@ -0,0 +1,29 @@
/* { dg-do compile } */
/* { dg-options "-O2 -ftree-loop-distribution -ftree-loop-distribute-patterns -fdump-tree-ldist-details" } */
#define M (256)
#define N (512)
struct st
{
int a[M][N];
int c[M];
int b[M][N];
};
void
foo (struct st *p)
{
for (unsigned i = 0; i < M; ++i)
{
p->c[i] = 0;
for (unsigned j = N; j > 0; --j)
{
p->a[i][j - 1] = 0;
p->b[i][j - 1] = 0;
}
}
}
/* { dg-final { scan-tree-dump-times "Loop nest . distributed: split to 0 loops and 1 library" 1 "ldist" } } */
/* { dg-final { scan-tree-dump-times "__builtin_memset \\(.*, 0, 1049600\\);" 1 "ldist" } } */

View file

@ -0,0 +1,28 @@
/* { dg-do compile } */
/* { dg-options "-O2 -ftree-loop-distribution -ftree-loop-distribute-patterns -fdump-tree-ldist-details" } */
#define M (256)
#define N (512)
struct st
{
int a[M][N];
int c[M];
int b[M][N];
};
void
foo (struct st * restrict p, struct st * restrict q)
{
for (unsigned i = 0; i < M; ++i)
{
p->c[i] = 0;
for (unsigned j = N; j > 0; --j)
{
p->a[i][j - 1] = q->a[i][j - 1];
p->b[i][j - 1] = 0;
}
}
}
/* { dg-final { scan-tree-dump-times "Loop nest . distributed: split to 0 loops and 1 library" 1 "ldist" { xfail *-*-* } } } */

View file

@ -0,0 +1,28 @@
/* { dg-do compile } */
/* { dg-options "-O2 -ftree-loop-distribution -ftree-loop-distribute-patterns -fdump-tree-ldist-details" } */
#define M (256)
#define N (512)
struct st
{
int a[M][N];
int c[M];
int b[M][N];
};
void
foo (struct st * restrict p)
{
for (unsigned i = 0; i < M; ++i)
{
p->c[i] = 0;
for (unsigned j = N; j > 0; --j)
{
p->b[i][j - 1] = p->a[i][j - 1];
p->a[i][j - 1] = 0;
}
}
}
/* { dg-final { scan-tree-dump-times "Loop nest . distributed: split to 0 loops and 3 library" 1 "ldist" } } */

View file

@ -90,6 +90,7 @@ along with GCC; see the file COPYING3. If not see
data reuse. */
#include "config.h"
#define INCLUDE_ALGORITHM /* stable_sort */
#include "system.h"
#include "coretypes.h"
#include "backend.h"
@ -106,6 +107,7 @@ along with GCC; see the file COPYING3. If not see
#include "stor-layout.h"
#include "tree-cfg.h"
#include "tree-ssa-loop-manip.h"
#include "tree-ssa-loop-ivopts.h"
#include "tree-ssa-loop.h"
#include "tree-into-ssa.h"
#include "tree-ssa.h"
@ -604,6 +606,10 @@ struct builtin_info
tree dst_base;
tree src_base;
tree size;
/* Base and offset part of dst_base after stripping constant offset. This
is only used in memset builtin distribution for now. */
tree dst_base_base;
unsigned HOST_WIDE_INT dst_base_offset;
};
/* Partition for loop distribution. */
@ -1504,7 +1510,11 @@ classify_builtin_st (loop_p loop, partition *partition, data_reference_p dr)
if (!compute_access_range (loop, dr, &base, &size))
return;
partition->builtin = alloc_builtin (dr, NULL, base, NULL_TREE, size);
struct builtin_info *builtin;
builtin = alloc_builtin (dr, NULL, base, NULL_TREE, size);
builtin->dst_base_base = strip_offset (builtin->dst_base,
&builtin->dst_base_offset);
partition->builtin = builtin;
partition->kind = PKIND_MEMSET;
}
@ -2480,6 +2490,113 @@ version_for_distribution_p (vec<struct partition *> *partitions,
return (alias_ddrs->length () > 0);
}
/* Compare base offset of builtin mem* partitions P1 and P2. */
static bool
offset_cmp (struct partition *p1, struct partition *p2)
{
gcc_assert (p1 != NULL && p1->builtin != NULL);
gcc_assert (p2 != NULL && p2->builtin != NULL);
return p1->builtin->dst_base_offset < p2->builtin->dst_base_offset;
}
/* Fuse adjacent memset builtin PARTITIONS if possible. This is a special
case optimization transforming below code:
__builtin_memset (&obj, 0, 100);
_1 = &obj + 100;
__builtin_memset (_1, 0, 200);
_2 = &obj + 300;
__builtin_memset (_2, 0, 100);
into:
__builtin_memset (&obj, 0, 400);
Note we don't have dependence information between different partitions
at this point, as a result, we can't handle nonadjacent memset builtin
partitions since dependence might be broken. */
static void
fuse_memset_builtins (vec<struct partition *> *partitions)
{
unsigned i, j;
struct partition *part1, *part2;
for (i = 0; partitions->iterate (i, &part1);)
{
if (part1->kind != PKIND_MEMSET)
{
i++;
continue;
}
/* Find sub-array of memset builtins of the same base. Index range
of the sub-array is [i, j) with "j > i". */
for (j = i + 1; partitions->iterate (j, &part2); ++j)
{
if (part2->kind != PKIND_MEMSET
|| !operand_equal_p (part1->builtin->dst_base_base,
part2->builtin->dst_base_base, 0))
break;
}
/* Stable sort is required in order to avoid breaking dependence. */
std::stable_sort (&(*partitions)[i],
&(*partitions)[i] + j - i, offset_cmp);
/* Continue with next partition. */
i = j;
}
/* Merge all consecutive memset builtin partitions. */
for (i = 0; i < partitions->length () - 1;)
{
part1 = (*partitions)[i];
if (part1->kind != PKIND_MEMSET)
{
i++;
continue;
}
part2 = (*partitions)[i + 1];
/* Only merge memset partitions of the same base and with constant
access sizes. */
if (part2->kind != PKIND_MEMSET
|| TREE_CODE (part1->builtin->size) != INTEGER_CST
|| TREE_CODE (part2->builtin->size) != INTEGER_CST
|| !operand_equal_p (part1->builtin->dst_base_base,
part2->builtin->dst_base_base, 0))
{
i++;
continue;
}
tree rhs1 = gimple_assign_rhs1 (DR_STMT (part1->builtin->dst_dr));
tree rhs2 = gimple_assign_rhs1 (DR_STMT (part2->builtin->dst_dr));
int bytev1 = const_with_all_bytes_same (rhs1);
int bytev2 = const_with_all_bytes_same (rhs2);
/* Only merge memset partitions of the same value. */
if (bytev1 != bytev2 || bytev1 == -1)
{
i++;
continue;
}
wide_int end1 = wi::add (part1->builtin->dst_base_offset,
wi::to_wide (part1->builtin->size));
/* Only merge adjacent memset partitions. */
if (wi::ne_p (end1, part2->builtin->dst_base_offset))
{
i++;
continue;
}
/* Merge partitions[i] and partitions[i+1]. */
part1->builtin->size = fold_build2 (PLUS_EXPR, sizetype,
part1->builtin->size,
part2->builtin->size);
partition_free (part2);
partitions->ordered_remove (i + 1);
}
}
/* Fuse PARTITIONS of LOOP if necessary before finalizing distribution.
ALIAS_DDRS contains ddrs which need runtime alias check. */
@ -2523,6 +2640,10 @@ finalize_partitions (struct loop *loop, vec<struct partition *> *partitions,
}
partitions->truncate (1);
}
/* Fuse memset builtins if possible. */
if (partitions->length () > 1)
fuse_memset_builtins (partitions);
}
/* Distributes the code from LOOP in such a way that producer statements

View file

@ -1550,9 +1550,6 @@ record_invariant (struct ivopts_data *data, tree op, bool nonlinear_use)
bitmap_set_bit (data->relevant, SSA_NAME_VERSION (op));
}
static tree
strip_offset (tree expr, unsigned HOST_WIDE_INT *offset);
/* Record a group of TYPE. */
static struct iv_group *
@ -2863,7 +2860,7 @@ strip_offset_1 (tree expr, bool inside_addr, bool top_compref,
/* Strips constant offsets from EXPR and stores them to OFFSET. */
static tree
tree
strip_offset (tree expr, unsigned HOST_WIDE_INT *offset)
{
HOST_WIDE_INT off;

View file

@ -28,6 +28,7 @@ extern void dump_cand (FILE *, struct iv_cand *);
extern bool contains_abnormal_ssa_name_p (tree);
extern struct loop *outermost_invariant_loop_for_expr (struct loop *, tree);
extern bool expr_invariant_in_loop_p (struct loop *, tree);
extern tree strip_offset (tree, unsigned HOST_WIDE_INT *);
bool may_be_nonaddressable_p (tree expr);
void tree_ssa_iv_optimize (void);