tree-ssa-loop-prefetch.c (schedule_prefetches): Cleanup and improve comments.

	* tree-ssa-loop-prefetch.c (schedule_prefetches): Cleanup and improve
	comments.
	(issue_prefetch_ref): Move assignment to write_p out of loop.
	(determine_unroll_factor): Do not take PARAM_MAX_UNROLL_TIMES and
	SIMULTANEOUS_PREFETCHES into account.
	(loop_prefetch_arrays): Do not pass ahead to determine_unroll_factor.
	* lambda-code.c (lcm): Renamed to ...
	(least_common_multiple): ... and exported.
	* tree-flow.h (least_common_multiple): Declare.

From-SVN: r118730
parent 946e1bc757
commit 911b3fdbe3
6 changed files with 94 additions and 56 deletions
gcc/ChangeLog
@@ -1,3 +1,15 @@
+2006-11-12  Zdenek Dvorak <dvorakz@suse.cz>
+
+	* tree-ssa-loop-prefetch.c (schedule_prefetches): Cleanup and improve
+	comments.
+	(issue_prefetch_ref): Move assignment to write_p out of loop.
+	(determine_unroll_factor): Do not take PARAM_MAX_UNROLL_TIMES and
+	SIMULTANEOUS_PREFETCHES into account.
+	(loop_prefetch_arrays): Do not pass ahead to determine_unroll_factor.
+	* lambda-code.c (lcm): Renamed to ...
+	(least_common_multiple): ... and exported.
+	* tree-flow.h (least_common_multiple): Declare.
+
 2006-11-12  Zdenek Dvorak <dvorakz@suse.cz>
 
 	* Makefile.in (tree-data-ref.o): Add langhooks.h dependency.

gcc/lambda-code.c
@@ -442,8 +442,8 @@ lambda_lattice_compute_base (lambda_loopnest nest)
 
 /* Compute the least common multiple of two numbers A and B .  */
 
-static int
-lcm (int a, int b)
+int
+least_common_multiple (int a, int b)
 {
   return (abs (a) * abs (b) / gcd (a, b));
 }
@@ -577,7 +577,7 @@ compute_nest_using_fourier_motzkin (int size,
 	{
 	  if (A[k][i] < 0)
 	    {
-	      multiple = lcm (A[j][i], A[k][i]);
+	      multiple = least_common_multiple (A[j][i], A[k][i]);
 	      f1 = multiple / A[j][i];
 	      f2 = -1 * multiple / A[k][i];
 

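(Editorial illustration, not part of the commit: the renamed helper is the textbook identity lcm(a, b) = |a * b| / gcd(a, b). The standalone sketch below restates that formula with a local Euclidean gcd, since lambda-code.c's own static gcd is not exported by this change.)

#include <stdio.h>
#include <stdlib.h>

/* Stand-in for lambda-code.c's static gcd (assumed Euclidean).  */
static int
gcd (int a, int b)
{
  a = abs (a);
  b = abs (b);
  while (b)
    {
      int t = a % b;
      a = b;
      b = t;
    }
  return a;
}

/* Same formula as the exported least_common_multiple.  */
static int
least_common_multiple (int a, int b)
{
  return (abs (a) * abs (b) / gcd (a, b));
}

int
main (void)
{
  printf ("%d\n", least_common_multiple (4, 6));   /* prints 12 */
  return 0;
}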
gcc/testsuite/ChangeLog
@@ -1,3 +1,7 @@
+2006-11-12  Zdenek Dvorak <dvorakz@suse.cz>
+
+	* gcc.dg/tree-ssa/prefetch-3.c: New test.
+
 2006-11-12  Roger Sayle <roger@eyesopen.com>
 
 	PR tree-optimization/13827

gcc/testsuite/gcc.dg/tree-ssa/prefetch-3.c (new file, 22 lines)
@@ -0,0 +1,22 @@
+/* Prefetching used to prefer nonsensical unroll factor of 5 in this testcase.  */
+
+/* { dg-do compile { target i?86-*-* } } */
+/* { dg-options "-O2 -fprefetch-loop-arrays -march=athlon -msse2 -mfpmath=sse -fdump-tree-aprefetch-details" } */
+
+#define N 1000000
+
+double a[N];
+
+double test(void)
+{
+  unsigned i;
+  double sum = 0;
+
+  for (i = 0; i < N; i += 2)
+    sum += (a[i] * a[i+1]);
+
+  return sum;
+}
+
+/* { dg-final { scan-tree-dump-times "unroll factor 4" 1 "aprefetch" } } */
+/* { dg-final { cleanup-tree-dump "aprefetch" } } */

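(Editorial illustration, not part of the commit: the "unroll factor 4" expectation can be sanity-checked with back-of-the-envelope arithmetic. The 64-byte cache line assumed below for -march=athlon is an assumption of this sketch, not something the test itself encodes.)

#include <stdio.h>

int
main (void)
{
  /* Assumed L1 cache line size for the athlon target in this sketch.  */
  unsigned line_size = 64;

  /* Each iteration of the test loop reads a[i] and a[i+1] and advances i
     by 2, so it consumes two adjacent doubles = 16 bytes per iteration.  */
  unsigned bytes_per_iter = 2 * sizeof (double);

  /* 64 / 16 = 4 iterations cover one line, matching the scanned dump.  */
  printf ("expected unroll factor: %u\n", line_size / bytes_per_iter);
  return 0;
}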
gcc/tree-flow.h
@@ -1036,4 +1036,7 @@ void swap_tree_operands (tree, tree *, tree *);
 
 extern void recalculate_used_alone (void);
 extern bool updating_used_alone;
+
+int least_common_multiple (int, int);
+
 #endif /* _TREE_FLOW_H */

gcc/tree-ssa-loop-prefetch.c
@@ -744,19 +744,21 @@ static bool
 schedule_prefetches (struct mem_ref_group *groups, unsigned unroll_factor,
 		     unsigned ahead)
 {
-  unsigned max_prefetches, n_prefetches;
+  unsigned remaining_prefetch_slots, n_prefetches, prefetch_slots;
+  unsigned slots_per_prefetch;
   struct mem_ref *ref;
   bool any = false;
 
-  max_prefetches = (SIMULTANEOUS_PREFETCHES * unroll_factor) / ahead;
-  if (max_prefetches > (unsigned) SIMULTANEOUS_PREFETCHES)
-    max_prefetches = SIMULTANEOUS_PREFETCHES;
+  /* At most SIMULTANEOUS_PREFETCHES should be running at the same time.  */
+  remaining_prefetch_slots = SIMULTANEOUS_PREFETCHES;
 
+  /* The prefetch will run for AHEAD iterations of the original loop, i.e.,
+     AHEAD / UNROLL_FACTOR iterations of the unrolled loop.  In each iteration,
+     it will need a prefetch slot.  */
+  slots_per_prefetch = (ahead + unroll_factor / 2) / unroll_factor;
   if (dump_file && (dump_flags & TDF_DETAILS))
-    fprintf (dump_file, "Max prefetches to issue: %d.\n", max_prefetches);
-
-  if (!max_prefetches)
-    return false;
+    fprintf (dump_file, "Each prefetch instruction takes %u prefetch slots.\n",
+	     slots_per_prefetch);
 
   /* For now we just take memory references one by one and issue
      prefetches for as many as possible.  The groups are sorted
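(Editorial illustration, not part of the commit: the rounded division used for slots_per_prefetch can be replayed standalone. The values of ahead and unroll_factor below are invented.)

#include <stdio.h>

int
main (void)
{
  /* Hypothetical values: prefetch issued AHEAD = 10 original iterations
     early, loop unrolled UNROLL_FACTOR = 4 times.  */
  unsigned ahead = 10, unroll_factor = 4;

  /* Same rounding-to-nearest division as schedule_prefetches:
     (10 + 4 / 2) / 4 = 3 slots occupied per prefetch instruction.  */
  unsigned slots_per_prefetch = (ahead + unroll_factor / 2) / unroll_factor;

  printf ("slots per prefetch: %u\n", slots_per_prefetch);
  return 0;
}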
@@ -769,16 +771,24 @@ schedule_prefetches (struct mem_ref_group *groups, unsigned unroll_factor,
 	if (!should_issue_prefetch_p (ref))
 	  continue;
 
-	ref->issue_prefetch_p = true;
-
-	/* If prefetch_mod is less then unroll_factor, we need to insert
-	   several prefetches for the reference.  */
+	/* If we need to prefetch the reference each PREFETCH_MOD iterations,
+	   and we unroll the loop UNROLL_FACTOR times, we need to insert
+	   ceil (UNROLL_FACTOR / PREFETCH_MOD) instructions in each
+	   iteration.  */
 	n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
 			/ ref->prefetch_mod);
-	if (max_prefetches <= n_prefetches)
-	  return true;
+	prefetch_slots = n_prefetches * slots_per_prefetch;
 
-	max_prefetches -= n_prefetches;
+	/* If more than half of the prefetches would be lost anyway, do not
+	   issue the prefetch.  */
+	if (2 * remaining_prefetch_slots < prefetch_slots)
+	  continue;
+
+	ref->issue_prefetch_p = true;
+
+	if (remaining_prefetch_slots <= prefetch_slots)
+	  return true;
+	remaining_prefetch_slots -= prefetch_slots;
 	any = true;
       }
 
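(Editorial illustration, not part of the commit: the new slot accounting can be mimicked outside GCC. The sketch below walks a made-up list of prefetch_mod values and applies the same ceil division, half-wasted cutoff, and remaining-slot bookkeeping; all numbers are hypothetical, and the break stands in for the function's "return true".)

#include <stdio.h>

int
main (void)
{
  /* Hypothetical setup: 3 slots available in total (SIMULTANEOUS_PREFETCHES),
     each prefetch occupies 1 slot, loop unrolled 4 times, three references
     with invented prefetch_mod values.  */
  unsigned remaining_prefetch_slots = 3;
  unsigned slots_per_prefetch = 1;
  unsigned unroll_factor = 4;
  unsigned prefetch_mods[3] = { 2, 1, 8 };
  unsigned i;

  for (i = 0; i < 3; i++)
    {
      /* ceil (UNROLL_FACTOR / PREFETCH_MOD) prefetches per iteration.  */
      unsigned n_prefetches
	= (unroll_factor + prefetch_mods[i] - 1) / prefetch_mods[i];
      unsigned prefetch_slots = n_prefetches * slots_per_prefetch;

      /* Skip if more than half of the prefetches would be lost anyway.  */
      if (2 * remaining_prefetch_slots < prefetch_slots)
	{
	  printf ("ref %u: skipped (%u slots wanted)\n", i, prefetch_slots);
	  continue;
	}

      printf ("ref %u: issued, %u slots\n", i, prefetch_slots);
      if (remaining_prefetch_slots <= prefetch_slots)
	break;
      remaining_prefetch_slots -= prefetch_slots;
    }
  return 0;
}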
@@ -822,6 +832,7 @@ issue_prefetch_ref (struct mem_ref *ref, unsigned unroll_factor, unsigned ahead)
 		  / ref->prefetch_mod);
   addr_base = build_fold_addr_expr_with_type (ref->mem, ptr_type_node);
   addr_base = force_gimple_operand_bsi (&bsi, unshare_expr (addr_base), true, NULL);
+  write_p = ref->write_p ? integer_one_node : integer_zero_node;
 
   for (ap = 0; ap < n_prefetches; ap++)
     {
@@ -832,10 +843,9 @@ issue_prefetch_ref (struct mem_ref *ref, unsigned unroll_factor, unsigned ahead)
       addr = force_gimple_operand_bsi (&bsi, unshare_expr (addr), true, NULL);
 
       /* Create the prefetch instruction.  */
-      write_p = ref->write_p ? integer_one_node : integer_zero_node;
       params = tree_cons (NULL_TREE, addr,
 			  tree_cons (NULL_TREE, write_p, NULL_TREE));
-
+
       prefetch = build_function_call_expr (built_in_decls[BUILT_IN_PREFETCH],
 					   params);
       bsi_insert_before (&bsi, prefetch, BSI_SAME_STMT);
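(Editorial illustration, not part of the commit: each call emitted by issue_prefetch_ref corresponds at the source level to GCC's __builtin_prefetch, taking the address and the write flag that write_p now computes once before the loop. A rough hand-written equivalent of what -fprefetch-loop-arrays inserts into the test loop is sketched below; the ahead distance of 64 elements is an invented value and the tail overshoot is ignored.)

#define N 1000000
double a[N];

double
test (void)
{
  unsigned i;
  double sum = 0;

  for (i = 0; i < N; i += 2)
    {
      /* Read prefetch (write flag 0) some invented distance ahead.  */
      __builtin_prefetch (&a[i + 64], 0);
      sum += a[i] * a[i + 1];
    }
  return sum;
}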
@@ -888,48 +898,36 @@ should_unroll_loop_p (struct loop *loop, struct tree_niter_desc *desc,
 
 static unsigned
 determine_unroll_factor (struct loop *loop, struct mem_ref_group *refs,
-			 unsigned ahead, unsigned ninsns,
-			 struct tree_niter_desc *desc)
+			 unsigned ninsns, struct tree_niter_desc *desc)
 {
-  unsigned upper_bound, size_factor, constraint_factor;
-  unsigned factor, max_mod_constraint, ahead_factor;
+  unsigned upper_bound;
+  unsigned nfactor, factor, mod_constraint;
   struct mem_ref_group *agp;
   struct mem_ref *ref;
 
-  upper_bound = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
-
-  /* First check whether the loop is not too large to unroll.  */
-  size_factor = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / ninsns;
-  if (size_factor <= 1)
+  /* First check whether the loop is not too large to unroll.  We ignore
+     PARAM_MAX_UNROLL_TIMES, because for small loops, it prevented us
+     from unrolling them enough to make exactly one cache line covered by each
+     iteration.  Also, the goal of PARAM_MAX_UNROLL_TIMES is to prevent
+     us from unrolling the loops too many times in cases where we only expect
+     gains from better scheduling and decreasing loop overhead, which is not
+     the case here.  */
+  upper_bound = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / ninsns;
+  if (upper_bound <= 1)
     return 1;
 
-  if (size_factor < upper_bound)
-    upper_bound = size_factor;
-
-  max_mod_constraint = 1;
+  /* Choose the factor so that we may prefetch each cache just once,
+     but bound the unrolling by UPPER_BOUND.  */
+  factor = 1;
   for (agp = refs; agp; agp = agp->next)
     for (ref = agp->refs; ref; ref = ref->next)
-      if (should_issue_prefetch_p (ref)
-	  && ref->prefetch_mod > max_mod_constraint)
-	max_mod_constraint = ref->prefetch_mod;
-
-  /* Set constraint_factor as large as needed to be able to satisfy the
-     largest modulo constraint.  */
-  constraint_factor = max_mod_constraint;
-
-  /* If ahead is too large in comparison with the number of available
-     prefetches, unroll the loop as much as needed to be able to prefetch
-     at least partially some of the references in the loop.  */
-  ahead_factor = ((ahead + SIMULTANEOUS_PREFETCHES - 1)
-		  / SIMULTANEOUS_PREFETCHES);
-
-  /* Unroll as much as useful, but bound the code size growth.  */
-  if (constraint_factor < ahead_factor)
-    factor = ahead_factor;
-  else
-    factor = constraint_factor;
-  if (factor > upper_bound)
-    factor = upper_bound;
+      if (should_issue_prefetch_p (ref))
+	{
+	  mod_constraint = ref->prefetch_mod;
+	  nfactor = least_common_multiple (mod_constraint, factor);
+	  if (nfactor <= upper_bound)
+	    factor = nfactor;
+	}
 
   if (!should_unroll_loop_p (loop, desc, factor))
     return 1;
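(Editorial illustration, not part of the commit: the new factor selection is an LCM accumulation clamped by the size bound. The standalone sketch below replays it on invented prefetch_mod values (8, 2, 3) with an invented upper_bound of 16, using the same lcm formula as in lambda-code.c.)

#include <stdio.h>
#include <stdlib.h>

static int
gcd (int a, int b)
{
  while (b)
    {
      int t = a % b;
      a = b;
      b = t;
    }
  return a;
}

/* Same formula as the function exported from lambda-code.c.  */
static int
least_common_multiple (int a, int b)
{
  return (abs (a) * abs (b) / gcd (a, b));
}

int
main (void)
{
  unsigned upper_bound = 16;                /* invented MAX_UNROLLED_INSNS / ninsns */
  unsigned prefetch_mods[3] = { 8, 2, 3 };  /* invented per-reference constraints */
  unsigned factor = 1, nfactor, i;

  for (i = 0; i < 3; i++)
    {
      nfactor = least_common_multiple (prefetch_mods[i], factor);
      /* Only grow the factor while it still fits the code-size bound.  */
      if (nfactor <= upper_bound)
	factor = nfactor;
    }

  /* 1 -> 8 -> 8; lcm (8, 3) = 24 > 16, so the final factor stays 8.  */
  printf ("unroll factor: %u\n", factor);
  return 0;
}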
@@ -964,8 +962,7 @@ loop_prefetch_arrays (struct loops *loops, struct loop *loop)
      instructions executed per iteration of the loop.  */
   ninsns = tree_num_loop_insns (loop);
   ahead = (PREFETCH_LATENCY + ninsns - 1) / ninsns;
-  unroll_factor = determine_unroll_factor (loop, refs, ahead, ninsns,
-					   &desc);
+  unroll_factor = determine_unroll_factor (loop, refs, ninsns, &desc);
   if (dump_file && (dump_flags & TDF_DETAILS))
     fprintf (dump_file, "Ahead %d, unroll factor %d\n", ahead, unroll_factor);
 
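(Editorial illustration, not part of the commit: loop_prefetch_arrays still computes the ahead distance as a ceiling division and still passes it to the scheduling code; the change only stops passing it to determine_unroll_factor. The latency and instruction count below are invented.)

#include <stdio.h>

int
main (void)
{
  /* Hypothetical numbers: a prefetch latency of 300 cycles (stand-in for
     PREFETCH_LATENCY) and 14 instructions per loop iteration.  */
  unsigned prefetch_latency = 300, ninsns = 14;

  /* ceil (PREFETCH_LATENCY / ninsns): prefetch 22 iterations ahead.  */
  unsigned ahead = (prefetch_latency + ninsns - 1) / ninsns;

  printf ("ahead = %u iterations\n", ahead);
  return 0;
}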