Fix profile update in tree_transform_and_unroll_loop
Fix profile update in tree_transform_and_unroll_loop, which is used by predictive commoning. I started by attempting to fix gcc.dg/tree-ssa/update-unroll-1.c, which I xfailed last week, but it turned out to be a harder job. The unrolling code was never updated for the changes in duplicate_loop_body_to_header_edge, which is now smarter about getting the profile right when some exits are eliminated. A lot of the manual profile updating can thus now be done using existing infrastructure. I also noticed that scale_dominated_blocks_in_loop does a job identical to a loop I wrote in scale_loop_profile, so I commonized the implementation and removed the recursion. I also extended duplicate_loop_body_to_header_edge to handle flat profiles the same way as we do in the vectorizer; without that we end up with a less-than-0 iteration count in gcc.dg/tree-ssa/update-unroll-1.c (it is unrolled 32 times but predicted to iterate fewer times). Finally, I added the missing code to update loop_info.

gcc/ChangeLog:

        * cfgloopmanip.cc (scale_dominated_blocks_in_loop): Move here
        from tree-ssa-loop-manip.cc and avoid recursion.
        (scale_loop_profile): Use scale_dominated_blocks_in_loop.
        (duplicate_loop_body_to_header_edge): Add DLTHE_FLAG_FLAT_PROFILE
        flag.
        * cfgloopmanip.h (DLTHE_FLAG_FLAT_PROFILE): Define.
        (scale_dominated_blocks_in_loop): Declare.
        * predict.cc (dump_prediction): Do not ICE on uninitialized
        probability.
        (change_edge_frequency): Remove.
        * predict.h (change_edge_frequency): Remove.
        * tree-ssa-loop-manip.cc (scale_dominated_blocks_in_loop): Move
        to cfgloopmanip.cc.
        (niter_for_unrolled_loop): Remove.
        (tree_transform_and_unroll_loop): Fix profile update.

gcc/testsuite/ChangeLog:

        * gcc.dg/pr102385.c: Check for no profile mismatches.
        * gcc.dg/pr96931.c: Check for no profile mismatches.
        * gcc.dg/tree-ssa/predcom-1.c: Check for no profile mismatches.
        * gcc.dg/tree-ssa/predcom-2.c: Check for no profile mismatches.
        * gcc.dg/tree-ssa/predcom-3.c: Check for no profile mismatches.
        * gcc.dg/tree-ssa/predcom-4.c: Check for no profile mismatches.
        * gcc.dg/tree-ssa/predcom-5.c: Check for no profile mismatches.
        * gcc.dg/tree-ssa/predcom-7.c: Check for one profile mismatch.
        * gcc.dg/tree-ssa/predcom-8.c: Check for no profile mismatches.
        * gcc.dg/tree-ssa/predcom-dse-1.c: Check for no profile mismatches.
        * gcc.dg/tree-ssa/predcom-dse-10.c: Check for no profile mismatches.
        * gcc.dg/tree-ssa/predcom-dse-11.c: Check for no profile mismatches.
        * gcc.dg/tree-ssa/predcom-dse-12.c: Check for no profile mismatches.
        * gcc.dg/tree-ssa/predcom-dse-2.c: Check for no profile mismatches.
        * gcc.dg/tree-ssa/predcom-dse-3.c: Check for no profile mismatches.
        * gcc.dg/tree-ssa/predcom-dse-4.c: Check for no profile mismatches.
        * gcc.dg/tree-ssa/predcom-dse-5.c: Check for no profile mismatches.
        * gcc.dg/tree-ssa/predcom-dse-6.c: Check for no profile mismatches.
        * gcc.dg/tree-ssa/predcom-dse-7.c: Check for no profile mismatches.
        * gcc.dg/tree-ssa/predcom-dse-8.c: Check for no profile mismatches.
        * gcc.dg/tree-ssa/predcom-dse-9.c: Check for no profile mismatches.
        * gcc.dg/tree-ssa/update-unroll-1.c: Unxfail.
parent 5b06b3b877, commit 9bd1ee6d33: 27 changed files with 158 additions and 275 deletions
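To see the "less than 0 iteration count" failure mode concretely: unrolling divides the estimated latch count by the unroll factor, converting through iteration counts with a +1/-1 pair, and a signed update goes negative whenever the predicted iteration count is below the factor. A minimal standalone sketch of that arithmetic (the numbers and variable names are made up; this is not GCC code):

#include <cstdio>

int main ()
{
  /* A loop predicted to iterate ~10 times but unrolled 32 times, as in
     gcc.dg/tree-ssa/update-unroll-1.c.  est_niter counts latch runs.  */
  long est_niter = 10;
  long factor = 32;

  /* Convert latch count to iteration count, divide, convert back;
     this is negative whenever est_niter + 1 < factor.  */
  long naive = (est_niter + 1) / factor - 1;
  long clamped = naive < 0 ? 0 : naive;   /* what flat handling avoids */

  printf ("naive = %ld, clamped = %ld\n", naive, clamped);   /* -1, 0 */
  return 0;
}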
gcc/cfgloopmanip.cc

@@ -499,6 +499,32 @@ scale_loop_frequencies (class loop *loop, profile_probability p)
   free (bbs);
 }
 
+/* Scales the frequencies of all basic blocks in LOOP that are strictly
+   dominated by BB by NUM/DEN.  */
+
+void
+scale_dominated_blocks_in_loop (class loop *loop, basic_block bb,
+				profile_count num, profile_count den)
+{
+  basic_block son;
+
+  if (!den.nonzero_p () && !(num == profile_count::zero ()))
+    return;
+  auto_vec <basic_block, 8> worklist;
+  worklist.safe_push (bb);
+
+  while (!worklist.is_empty ())
+    for (son = first_dom_son (CDI_DOMINATORS, worklist.pop ());
+	 son;
+	 son = next_dom_son (CDI_DOMINATORS, son))
+      {
+	if (!flow_bb_inside_loop_p (loop, son))
+	  continue;
+	son->count = son->count.apply_scale (num, den);
+	worklist.safe_push (son);
+      }
+}
+
 /* Scale profile in LOOP by P.
    If ITERATION_BOUND is not -1, scale even further if loop is predicted
    to iterate too many times.
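The function above replaces a recursive dominator-tree walk with an explicit worklist. The same pattern in a self-contained toy form (the Node type and helpers are invented for illustration and are unrelated to GCC's CFG types):

#include <cstdio>
#include <vector>

struct Node
{
  int id;
  std::vector<Node *> children;   /* stand-in for dominator-tree sons */
};

/* Visit every node strictly below ROOT without recursion, mirroring
   the worklist loop in scale_dominated_blocks_in_loop.  */
static void
visit_strict_descendants (Node *root)
{
  std::vector<Node *> worklist;
  worklist.push_back (root);
  while (!worklist.empty ())
    {
      Node *n = worklist.back ();
      worklist.pop_back ();
      for (Node *son : n->children)
	{
	  printf ("visiting %d\n", son->id);   /* scale the count here */
	  worklist.push_back (son);
	}
    }
}

int main ()
{
  Node c {2, {}}, d {3, {}}, b {1, {&c, &d}}, a {0, {&b}};
  visit_strict_descendants (&a);   /* visits 1, then 2 and 3 */
  return 0;
}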
@@ -649,19 +675,9 @@ scale_loop_profile (class loop *loop, profile_probability p,
       if (other_edge && other_edge->dest == loop->latch)
 	loop->latch->count -= new_exit_count - old_exit_count;
       else
-	{
-	  basic_block *body = get_loop_body (loop);
-	  profile_count new_count = exit_edge->src->count - new_exit_count;
-	  profile_count old_count = exit_edge->src->count - old_exit_count;
-
-	  for (unsigned int i = 0; i < loop->num_nodes; i++)
-	    if (body[i] != exit_edge->src
-		&& dominated_by_p (CDI_DOMINATORS, body[i], exit_edge->src))
-	      body[i]->count = body[i]->count.apply_scale (new_count,
-							   old_count);
-
-	  free (body);
-	}
+	scale_dominated_blocks_in_loop (loop, exit_edge->src,
+					exit_edge->src->count - new_exit_count,
+					exit_edge->src->count - old_exit_count);
     }
   else if (dump_file && (dump_flags & TDF_DETAILS))
     {
@@ -1237,6 +1253,7 @@ duplicate_loop_body_to_header_edge (class loop *loop, edge e,
 	     should've managed the flags so all except for original loop
 	     has won't exit set.  */
+	  scale_act = wanted_count.probability_in (count_in);
 
 	  /* Now simulate the duplication adjustments and compute header
 	     frequency of the last copy.  */
 	  for (i = 0; i < ndupl; i++)
@@ -1252,16 +1269,21 @@ duplicate_loop_body_to_header_edge (class loop *loop, edge e,
 	  profile_probability prob_pass_main = bitmap_bit_p (wont_exit, 0)
 					       ? prob_pass_wont_exit
 					       : prob_pass_thru;
-	  profile_probability p = prob_pass_main;
-	  profile_count scale_main_den = count_in;
-	  for (i = 0; i < ndupl; i++)
+	  if (!(flags & DLTHE_FLAG_FLAT_PROFILE))
 	    {
-	      scale_main_den += count_in.apply_probability (p);
-	      p = p * scale_step[i];
+	      profile_probability p = prob_pass_main;
+	      profile_count scale_main_den = count_in;
+	      for (i = 0; i < ndupl; i++)
+		{
+		  scale_main_den += count_in.apply_probability (p);
+		  p = p * scale_step[i];
+		}
+	      /* If original loop is executed COUNT_IN times, the unrolled
+		 loop will account SCALE_MAIN_DEN times.  */
+	      scale_main = count_in.probability_in (scale_main_den);
 	    }
-	  /* If original loop is executed COUNT_IN times, the unrolled
-	     loop will account SCALE_MAIN_DEN times.  */
-	  scale_main = count_in.probability_in (scale_main_den);
+	  else
+	    scale_main = profile_probability::always ();
 	  scale_act = scale_main * prob_pass_main;
 	}
       else
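A simplified floating-point model of the SCALE_MAIN computation above, assuming a constant per-copy pass probability in place of prob_pass_main and scale_step[] (plain doubles stand in for profile_count/profile_probability arithmetic):

#include <cstdio>

int main ()
{
  const int ndupl = 3;            /* number of extra body copies */
  const double count_in = 1000.0; /* entry count of the loop */
  const double prob_pass = 0.9;   /* chance of reaching the next copy */

  double p = prob_pass;
  double scale_main_den = count_in;
  for (int i = 0; i < ndupl; i++)
    {
      scale_main_den += count_in * p;   /* count_in.apply_probability (p) */
      p = p * prob_pass;                /* p = p * scale_step[i] */
    }

  /* Non-flat profile: body counts shrink by count_in / scale_main_den.
     With DLTHE_FLAG_FLAT_PROFILE, scale_main stays 1 instead.  */
  double scale_main = count_in / scale_main_den;
  printf ("scale_main = %.3f\n", scale_main);   /* about 0.29 */
  return 0;
}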
gcc/cfgloopmanip.h

@@ -32,6 +32,8 @@ enum
 					   field of newly create BB.  */
 #define DLTHE_FLAG_COMPLETTE_PEEL 4	/* Update frequencies expecting
 					   a complete peeling.  */
+#define DLTHE_FLAG_FLAT_PROFILE 8	/* Profile is flat; do not reduce
+					   count by unroll factor.  */
 extern edge mfb_kj_edge;
 
 extern bool remove_path (edge, bool * = NULL, bitmap = NULL);

@@ -64,5 +66,7 @@ class loop * loop_version (class loop *, void *,
 			   profile_probability, profile_probability,
 			   profile_probability, profile_probability, bool);
 void adjust_loop_info_after_peeling (class loop *loop, int npeel, bool precise);
+void scale_dominated_blocks_in_loop (class loop *loop, basic_block bb,
+				     profile_count num, profile_count den);
 
 #endif /* GCC_CFGLOOPMANIP_H */
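The DLTHE_* values are single bits combined with OR; later in this patch tree_transform_and_unroll_loop passes DLTHE_FLAG_UPDATE_FREQ | (flat ? DLTHE_FLAG_FLAT_PROFILE : 0). A standalone sketch of that flag test (the value 1 for DLTHE_FLAG_UPDATE_FREQ is assumed from the header; the rest is invented):

#include <cstdio>

#define DLTHE_FLAG_UPDATE_FREQ	 1	/* assumed value from cfgloopmanip.h */
#define DLTHE_FLAG_FLAT_PROFILE	 8

static void
duplicate_body (int flags)
{
  if (flags & DLTHE_FLAG_FLAT_PROFILE)
    printf ("flat: keep body counts, do not divide by unroll factor\n");
  else
    printf ("non-flat: scale body counts down\n");
}

int main ()
{
  bool flat = true;   /* stands in for maybe_flat_loop_profile (loop) */
  duplicate_body (DLTHE_FLAG_UPDATE_FREQ
		  | (flat ? DLTHE_FLAG_FLAT_PROFILE : 0));
  return 0;
}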
gcc/predict.cc

@@ -790,7 +790,7 @@ dump_prediction (FILE *file, enum br_predictor predictor, int probability,
     {
       fprintf (file, " exec ");
       bb->count.dump (file);
-      if (e)
+      if (e && e->count ().initialized_p () && bb->count.to_gcov_type ())
 	{
 	  fprintf (file, " hit ");
 	  e->count ().dump (file);
@@ -4634,43 +4634,6 @@ force_edge_cold (edge e, bool impossible)
     }
 }
 
-/* Change E's probability to NEW_E_PROB, redistributing the probabilities
-   of other outgoing edges proportionally.
-
-   Note that this function does not change the profile counts of any
-   basic blocks.  The caller must do that instead, using whatever
-   information it has about the region that needs updating.  */
-
-void
-change_edge_frequency (edge e, profile_probability new_e_prob)
-{
-  profile_probability old_e_prob = e->probability;
-  profile_probability old_other_prob = old_e_prob.invert ();
-  profile_probability new_other_prob = new_e_prob.invert ();
-
-  e->probability = new_e_prob;
-  profile_probability cumulative_prob = new_e_prob;
-
-  unsigned int num_other = EDGE_COUNT (e->src->succs) - 1;
-  edge other_e;
-  edge_iterator ei;
-  FOR_EACH_EDGE (other_e, ei, e->src->succs)
-    if (other_e != e)
-      {
-	num_other -= 1;
-	if (num_other == 0)
-	  /* Ensure that the probabilities add up to 1 without
-	     rounding error.  */
-	  other_e->probability = cumulative_prob.invert ();
-	else
-	  {
-	    other_e->probability /= old_other_prob;
-	    other_e->probability *= new_other_prob;
-	    cumulative_prob += other_e->probability;
-	  }
-      }
-}
-
 #if CHECKING_P
 
 namespace selftest {
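The removed change_edge_frequency rescaled the remaining outgoing edges so that probabilities still sum to 1; the last edge is set to the inverted cumulative sum to kill rounding error. The same redistribution in plain doubles (three successor edges assumed for illustration):

#include <cstdio>

int main ()
{
  /* Edge e goes from probability 0.2 to 0.5; the other two successors
     (0.5 and 0.3) are rescaled by (1 - 0.5) / (1 - 0.2).  */
  double old_e = 0.2, new_e = 0.5;
  double others[] = { 0.5, 0.3 };
  double scale = (1.0 - new_e) / (1.0 - old_e);

  double sum = new_e;
  for (double &p : others)
    {
      p *= scale;
      sum += p;
    }
  /* Prints 0.5000 0.3125 0.1875, sum = 1.0000.  */
  printf ("%.4f %.4f %.4f, sum = %.4f\n", new_e, others[0], others[1], sum);
  return 0;
}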
gcc/predict.h

@@ -100,7 +100,6 @@ extern void rebuild_frequencies (void);
 extern void report_predictor_hitrates (void);
 extern void force_edge_cold (edge, bool);
 extern void propagate_unlikely_bbs_forward (void);
-extern void change_edge_frequency (edge, profile_probability);
 
 extern void add_reg_br_prob_note (rtx_insn *, profile_probability);
 
gcc/testsuite/gcc.dg/pr102385.c

@@ -1,4 +1,4 @@
-/* { dg-options "-Wall -Wextra -O2 -fno-toplevel-reorder -fno-tree-ch -fno-tree-dce -fno-tree-dominator-opts -fno-tree-dse -fno-tree-loop-ivcanon -fpredictive-commoning" } */
+/* { dg-options "-Wall -Wextra -O2 -fno-toplevel-reorder -fno-tree-ch -fno-tree-dce -fno-tree-dominator-opts -fno-tree-dse -fno-tree-loop-ivcanon -fpredictive-commoning -fdump-tree-pcom-details-blocks -fdump-tree-lim-details-blocks" } */
 
 short a, b;
 int c[9];

@@ -12,3 +12,5 @@ void e() {
   }
 }
 int main() {return 0;}
+/* { dg-final { scan-tree-dump-not "Invalid sum" "pcom" } } */
+/* { dg-final { scan-tree-dump-not "Invalid sum" "lim2" } } */

gcc/testsuite/gcc.dg/pr96931.c

@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O1 -fpredictive-commoning -fno-tree-loop-im" } */
+/* { dg-options "-O1 -fpredictive-commoning -fno-tree-loop-im -fdump-tree-pcom-details-blocks" } */
 
 int bl;
 

@@ -17,3 +17,4 @@ ie (void)
 
   ie ();
 }
+/* { dg-final { scan-tree-dump-not "Invalid sum" "pcom" } } */

gcc/testsuite/gcc.dg/tree-ssa/predcom-1.c

@@ -1,6 +1,6 @@
-/* { dg-do compile } */
-/* { dg-options "-O2 -fno-tree-vectorize -funroll-loops --param max-unroll-times=8 -fpredictive-commoning -fdump-tree-pcom-details" } */
+/* { dg-do run } */
+/* { dg-options "-O2 -fno-tree-vectorize -funroll-loops --param max-unroll-times=8 -fpredictive-commoning -fdump-tree-pcom-details-blocks" } */
 
 void abort (void);
 

@@ -47,3 +47,4 @@ int main(void)
 
 /* Also check that we undid the transformation previously made by PRE.  */
 /* { dg-final { scan-tree-dump-times "looparound ref" 1 "pcom" } } */
+/* { dg-final { scan-tree-dump-not "Invalid sum" "pcom" } } */

gcc/testsuite/gcc.dg/tree-ssa/predcom-2.c

@@ -1,5 +1,5 @@
 /* { dg-do run } */
-/* { dg-options "-O2 -funroll-loops --param max-unroll-times=8 -fpredictive-commoning -fdump-tree-pcom-details -fno-tree-pre" } */
+/* { dg-options "-O2 -funroll-loops --param max-unroll-times=8 -fpredictive-commoning -fdump-tree-pcom-details-blocks -fno-tree-pre" } */
 /* { dg-additional-options "-fno-tree-vectorize" { target amdgcn-*-* } } */
 
 void abort (void);

@@ -44,3 +44,4 @@ int main(void)
 
 /* Verify that both loops were transformed and unrolled.  */
 /* { dg-final { scan-tree-dump-times "Unrolling 2 times." 2 "pcom"} } */
+/* { dg-final { scan-tree-dump-not "Invalid sum" "pcom" } } */

gcc/testsuite/gcc.dg/tree-ssa/predcom-3.c

@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2 -funroll-loops --param max-unroll-times=8 -fpredictive-commoning -fdump-tree-pcom-details -fno-tree-pre -fno-tree-loop-vectorize" } */
+/* { dg-options "-O2 -funroll-loops --param max-unroll-times=8 -fpredictive-commoning -fdump-tree-pcom-details-blocks -fno-tree-pre -fno-tree-loop-vectorize" } */
 
 int a[1000], b[1000];
 

@@ -13,3 +13,4 @@ void test(void)
 
 /* Verify that we used 3 temporary variables for the loop.  */
 /* { dg-final { scan-tree-dump-times "Unrolling 3 times." 1 "pcom"} } */
+/* { dg-final { scan-tree-dump-not "Invalid sum" "pcom" } } */

gcc/testsuite/gcc.dg/tree-ssa/predcom-4.c

@@ -1,5 +1,5 @@
 /* { dg-do run } */
-/* { dg-options "-O2 -funroll-loops --param max-unroll-times=8 -fpredictive-commoning -fdump-tree-pcom-details" } */
+/* { dg-options "-O2 -funroll-loops --param max-unroll-times=8 -fpredictive-commoning -fdump-tree-pcom-details-blocks" } */
 
 /* Test for predictive commoning of expressions, without reassociation.  */
 

@@ -26,3 +26,4 @@ int main(void)
 
 /* { dg-final { scan-tree-dump-times "Combination" 1 "pcom"} } */
 /* { dg-final { scan-tree-dump-times "Unrolling 3 times." 1 "pcom"} } */
+/* { dg-final { scan-tree-dump-not "Invalid sum" "pcom" } } */

gcc/testsuite/gcc.dg/tree-ssa/predcom-5.c

@@ -1,5 +1,5 @@
 /* { dg-do run } */
-/* { dg-options "-O2 -funroll-loops --param max-unroll-times=8 -fpredictive-commoning -fdump-tree-pcom-details" } */
+/* { dg-options "-O2 -funroll-loops --param max-unroll-times=8 -fpredictive-commoning -fdump-tree-pcom-details-blocks" } */
 
 /* Test for predictive commoning of expressions, with reassociation.  */
 

@@ -26,3 +26,4 @@ int main(void)
 
 /* { dg-final { scan-tree-dump-times "Combination" 2 "pcom"} } */
 /* { dg-final { scan-tree-dump-times "Unrolling 3 times." 1 "pcom"} } */
+/* { dg-final { scan-tree-dump-not "Invalid sum" "pcom" } } */

gcc/testsuite/gcc.dg/tree-ssa/predcom-7.c

@@ -1,5 +1,5 @@
 /* { dg-do run } */
-/* { dg-options "-O3 -fdump-tree-pcom-details" } */
+/* { dg-options "-O3 -fdump-tree-pcom-details-blocks" } */
 
 int b, f, d[5][2];
 unsigned int c;

@@ -15,3 +15,7 @@ main ()
 }
 
 /* { dg-final { scan-tree-dump "Executing predictive commoning" "pcom" } } */
+/* dom pass introduces one mismatch after simplifying a mispredicted
+   conditional on c being non-zero in the first iteration.  This happens
+   since c is a global variable and needs alias analysis.  */
+/* { dg-final { scan-tree-dump-times "Invalid sum" 1 "pcom" } } */

gcc/testsuite/gcc.dg/tree-ssa/predcom-8.c

@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O3 -fdump-tree-pcom-details" } */
+/* { dg-options "-O3 -fdump-tree-pcom-details-blocks" } */
 
 int is_sorted(int *a, int n)
 {

@@ -10,3 +10,4 @@ int is_sorted(int *a, int n)
 }
 
 /* { dg-final { scan-tree-dump "Executing predictive commoning without unrolling" "pcom" } } */
+/* { dg-final { scan-tree-dump-not "Invalid sum" "pcom" } } */

gcc/testsuite/gcc.dg/tree-ssa/predcom-dse-1.c

@@ -1,5 +1,5 @@
 /* { dg-do run } */
-/* { dg-options "-O2 -fno-inline -fno-tree-loop-distribute-patterns -fpredictive-commoning -fdump-tree-pcom-details" } */
+/* { dg-options "-O2 -fno-inline -fno-tree-loop-distribute-patterns -fpredictive-commoning -fdump-tree-pcom-details-blocks" } */
 
 int arr[105] = {2, 3, 5, 7, 11};
 int result0[10] = {2, 3, 5, 7, 11};

@@ -60,3 +60,4 @@ int main (void)
   return 0;
 }
 /* { dg-final { scan-tree-dump "Store-stores chain" "pcom"} } */
+/* { dg-final { scan-tree-dump-not "Invalid sum" "pcom" } } */

gcc/testsuite/gcc.dg/tree-ssa/predcom-dse-10.c

@@ -1,5 +1,5 @@
 /* { dg-do run } */
-/* { dg-options "-O2 -fno-inline -fpredictive-commoning -fdump-tree-pcom-details" } */
+/* { dg-options "-O2 -fno-inline -fpredictive-commoning -fdump-tree-pcom-details-blocks" } */
 
 int arr[105] = {2, 3, 5, 7, 11};
 int result0[10] = {2, 3, 5, 7, 11};

@@ -41,4 +41,4 @@ int main (void)
   return 0;
 }
 /* { dg-final { scan-tree-dump-not "Store-stores chain" "pcom"} } */
-
+/* { dg-final { scan-tree-dump-not "Invalid sum" "pcom" } } */

gcc/testsuite/gcc.dg/tree-ssa/predcom-dse-11.c

@@ -1,5 +1,5 @@
 /* { dg-do run } */
-/* { dg-options "-O2 -fno-inline -fpredictive-commoning -fdump-tree-pcom-details" } */
+/* { dg-options "-O2 -fno-inline -fpredictive-commoning -fdump-tree-pcom-details-blocks" } */
 
 int arr[105] = {2, 3, 5, 7, 11};
 int x[105] = {2, 3, 5, 7, 11};

@@ -48,4 +48,5 @@ int main (void)
 }
 /* { dg-final { scan-tree-dump "Store-stores chain" "pcom"} } */
 /* { dg-final { scan-tree-dump "Store-loads chain" "pcom"} } */
+/* { dg-final { scan-tree-dump-not "Invalid sum" "pcom" } } */
 

gcc/testsuite/gcc.dg/tree-ssa/predcom-dse-12.c

@@ -1,5 +1,5 @@
 /* { dg-do run } */
-/* { dg-options "-O2 -fno-inline -fpredictive-commoning -fdump-tree-pcom-details" } */
+/* { dg-options "-O2 -fno-inline -fpredictive-commoning -fdump-tree-pcom-details-blocks" } */
 
 int arr[105] = {2, 3, 5, 7, 11};
 int result0[10] = {2, 3, 5, 7, 11};

@@ -65,3 +65,4 @@ int main (void)
   return 0;
 }
 /* { dg-final { scan-tree-dump "Store-stores chain" "pcom"} } */
+/* { dg-final { scan-tree-dump-not "Invalid sum" "pcom" } } */

gcc/testsuite/gcc.dg/tree-ssa/predcom-dse-2.c

@@ -1,5 +1,5 @@
 /* { dg-do run } */
-/* { dg-options "-O2 -fno-inline -fno-tree-loop-distribute-patterns -fpredictive-commoning -fdump-tree-pcom-details" } */
+/* { dg-options "-O2 -fno-inline -fno-tree-loop-distribute-patterns -fpredictive-commoning -fdump-tree-pcom-details-blocks" } */
 
 int arr[105] = {2, 3, 5, 7, 11};
 int result0[10] = {2, 3, 5, 7, 11};

@@ -60,3 +60,4 @@ int main (void)
   return 0;
 }
 /* { dg-final { scan-tree-dump "Store-stores chain" "pcom"} } */
+/* { dg-final { scan-tree-dump-not "Invalid sum" "pcom" } } */

gcc/testsuite/gcc.dg/tree-ssa/predcom-dse-3.c

@@ -1,5 +1,5 @@
 /* { dg-do run } */
-/* { dg-options "-O2 -fno-tree-vectorize -fno-inline -fno-tree-loop-distribute-patterns -fpredictive-commoning -fdump-tree-pcom-details" } */
+/* { dg-options "-O2 -fno-tree-vectorize -fno-inline -fno-tree-loop-distribute-patterns -fpredictive-commoning -fdump-tree-pcom-details-blocks" } */
 
 int arr1[105] = {2, 3, 5, 7, 11, 13, 0};
 int arr2[105] = {2, 3, 5, 7, 11, 13, 0};

@@ -106,3 +106,4 @@ int main (void)
   return 0;
 }
 /* { dg-final { scan-tree-dump-times "Store-stores chain" 4 "pcom"} } */
+/* { dg-final { scan-tree-dump-not "Invalid sum" "pcom" } } */

gcc/testsuite/gcc.dg/tree-ssa/predcom-dse-4.c

@@ -1,5 +1,5 @@
 /* { dg-do run } */
-/* { dg-options "-O2 -fno-inline -fno-tree-loop-distribute-patterns -fpredictive-commoning -fdump-tree-pcom-details" } */
+/* { dg-options "-O2 -fno-inline -fno-tree-loop-distribute-patterns -fpredictive-commoning -fdump-tree-pcom-details-blocks" } */
 
 int arr[105] = {2, 3, 5, 7, 11};
 int result0[10] = {2, 3, 5, 7, 11};

@@ -59,3 +59,4 @@ int main (void)
   return 0;
 }
 /* { dg-final { scan-tree-dump "Store-stores chain" "pcom"} } */
+/* { dg-final { scan-tree-dump-not "Invalid sum" "pcom" } } */

gcc/testsuite/gcc.dg/tree-ssa/predcom-dse-5.c

@@ -1,5 +1,5 @@
 /* { dg-do run } */
-/* { dg-options "-O2 -fno-inline -fpredictive-commoning -fdump-tree-pcom-details" } */
+/* { dg-options "-O2 -fno-inline -fpredictive-commoning -fdump-tree-pcom-details-blocks" } */
 
 int arr[105] = {2, 3, 5, 7, 11};
 int result0[10] = {2, 3, 5, 7, 11};

@@ -61,3 +61,4 @@ int main (void)
   return 0;
 }
 /* { dg-final { scan-tree-dump "Store-stores chain" "pcom"} } */
+/* { dg-final { scan-tree-dump-not "Invalid sum" "pcom" } } */

gcc/testsuite/gcc.dg/tree-ssa/predcom-dse-6.c

@@ -1,5 +1,5 @@
 /* { dg-do run } */
-/* { dg-options "-O2 -fno-inline -fpredictive-commoning -fdump-tree-pcom-details" } */
+/* { dg-options "-O2 -fno-inline -fpredictive-commoning -fdump-tree-pcom-details-blocks" } */
 
 int arr[105] = {2, 3, 5, 7, 11, 13, 17, 19};
 int result0[10] = {2, 3, 5, 7, 11, 13, 17, 19};

@@ -63,3 +63,4 @@ int main (void)
   return 0;
 }
 /* { dg-final { scan-tree-dump "Store-stores chain" "pcom"} } */
+/* { dg-final { scan-tree-dump-not "Invalid sum" "pcom" } } */

gcc/testsuite/gcc.dg/tree-ssa/predcom-dse-7.c

@@ -1,5 +1,5 @@
 /* { dg-do run } */
-/* { dg-options "-O2 -fno-inline -fpredictive-commoning -fdump-tree-pcom-details" } */
+/* { dg-options "-O2 -fno-inline -fpredictive-commoning -fdump-tree-pcom-details-blocks" } */
 
 int arr[105] = {2, 3, 5, 7, 11, 13, 17, 19};
 int result0[10] = {2, 3, 5, 7, 11, 13, 17, 19};

@@ -61,3 +61,4 @@ int main (void)
   return 0;
 }
 /* { dg-final { scan-tree-dump "Store-stores chain" "pcom"} } */
+/* { dg-final { scan-tree-dump-not "Invalid sum" "pcom" } } */

gcc/testsuite/gcc.dg/tree-ssa/predcom-dse-8.c

@@ -1,5 +1,5 @@
 /* { dg-do run } */
-/* { dg-options "-O2 -fno-inline -fpredictive-commoning -fdump-tree-pcom-details" } */
+/* { dg-options "-O2 -fno-inline -fpredictive-commoning -fdump-tree-pcom-details-blocks" } */
 
 int arr[105] = {2, 3, 5, 7, 11, 13, 17, 19};
 int result0[10] = {2, 3, 5, 7, 11, 13, 17, 19};

@@ -58,3 +58,4 @@ int main (void)
 
   return 0;
 }
+/* { dg-final { scan-tree-dump-not "Invalid sum" "pcom" } } */

gcc/testsuite/gcc.dg/tree-ssa/predcom-dse-9.c

@@ -1,5 +1,5 @@
 /* { dg-do run } */
-/* { dg-options "-O2 -fno-inline -fpredictive-commoning -fdump-tree-pcom-details" } */
+/* { dg-options "-O2 -fno-inline -fpredictive-commoning -fdump-tree-pcom-details-blocks" } */
 
 int arr1[105] = {2, 3, 5, 7, 11, 13, 17, 19};
 int arr2[105] = {2, 3, 5, 7, 11, 13, 17, 19};

@@ -88,3 +88,4 @@ int main (void)
   return 0;
 }
 /* { dg-final { scan-tree-dump "Store-stores chain" "pcom"} } */
+/* { dg-final { scan-tree-dump-not "Invalid sum" "pcom" } } */

gcc/testsuite/gcc.dg/tree-ssa/update-unroll-1.c

@@ -16,5 +16,4 @@ int foo(unsigned n)
 /* We used to make the probability that the body of the loop (unrolled
    to enable prefetching) is entered 0, which is not correct.  */
 
-/* { dg-final { scan-tree-dump-not "Invalid sum" "aprefetch" { xfail *-*-* }} } */
 /* { dg-final { scan-tree-dump-not "SUCC: 7 .100.0%" "aprefetch"} } */
gcc/tree-ssa-loop-manip.cc

@@ -1040,71 +1040,6 @@ determine_exit_conditions (class loop *loop, class tree_niter_desc *desc,
   *exit_bound = bound;
 }
 
-/* Scales the frequencies of all basic blocks in LOOP that are strictly
-   dominated by BB by NUM/DEN.  */
-
-static void
-scale_dominated_blocks_in_loop (class loop *loop, basic_block bb,
-				profile_count num, profile_count den)
-{
-  basic_block son;
-
-  if (!den.nonzero_p () && !(num == profile_count::zero ()))
-    return;
-
-  for (son = first_dom_son (CDI_DOMINATORS, bb);
-       son;
-       son = next_dom_son (CDI_DOMINATORS, son))
-    {
-      if (!flow_bb_inside_loop_p (loop, son))
-	continue;
-      scale_bbs_frequencies_profile_count (&son, 1, num, den);
-      scale_dominated_blocks_in_loop (loop, son, num, den);
-    }
-}
-
-/* Return estimated niter for LOOP after unrolling by FACTOR times.  */
-
-gcov_type
-niter_for_unrolled_loop (class loop *loop, unsigned factor)
-{
-  gcc_assert (factor != 0);
-  bool profile_p = false;
-  gcov_type est_niter = expected_loop_iterations_unbounded (loop, &profile_p);
-  /* Note that this is really CEIL (est_niter + 1, factor) - 1, where the
-     "+ 1" converts latch iterations to loop iterations and the "- 1"
-     converts back.  */
-  gcov_type new_est_niter = est_niter / factor;
-
-  if (est_niter == -1)
-    return -1;
-
-  /* Without profile feedback, loops for which we do not know a better estimate
-     are assumed to roll 10 times.  When we unroll such loop, it appears to
-     roll too little, and it may even seem to be cold.  To avoid this, we
-     ensure that the created loop appears to roll at least 5 times (but at
-     most as many times as before unrolling).  Don't do adjustment if profile
-     feedback is present.  */
-  if (new_est_niter < 5 && !profile_p)
-    {
-      if (est_niter < 5)
-	new_est_niter = est_niter;
-      else
-	new_est_niter = 5;
-    }
-
-  if (loop->any_upper_bound)
-    {
-      /* As above, this is really CEIL (upper_bound + 1, factor) - 1.  */
-      widest_int bound = wi::udiv_floor (loop->nb_iterations_upper_bound,
-					 factor);
-      if (wi::ltu_p (bound, new_est_niter))
-	new_est_niter = bound.to_uhwi ();
-    }
-
-  return new_est_niter;
-}
-
 /* Unroll LOOP FACTOR times.  LOOP is known to have a single exit edge
    whose source block dominates the latch.  DESC describes the number of
    iterations of LOOP.
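The comment in the removed niter_for_unrolled_loop notes that est_niter / factor is really CEIL (est_niter + 1, factor) - 1. A throwaway check of that identity, plus the case from the commit message where a small estimate meets factor 32 (standalone, not GCC code):

#include <cassert>
#include <cstdio>

int main ()
{
  for (long est_niter = 0; est_niter < 200; est_niter++)
    for (long factor = 1; factor < 40; factor++)
      {
	/* The "+ 1" converts latch executions to loop iterations and
	   the "- 1" converts back.  */
	long ceil_form = (est_niter + 1 + factor - 1) / factor - 1;
	assert (ceil_form == est_niter / factor);
      }

  /* A loop predicted to run its latch 10 times, unrolled 32 times,
     is estimated to run the new latch 10 / 32 = 0 times.  */
  printf ("10 / 32 = %ld\n", 10L / 32);
  return 0;
}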
@@ -1169,47 +1104,39 @@ tree_transform_and_unroll_loop (class loop *loop, unsigned factor,
 				transform_callback transform,
 				void *data)
 {
-  gcov_type new_est_niter = niter_for_unrolled_loop (loop, factor);
   unsigned irr = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP;
 
   enum tree_code exit_cmp;
   tree enter_main_cond, exit_base, exit_step, exit_bound;
+  bool flat = maybe_flat_loop_profile (loop);
   determine_exit_conditions (loop, desc, factor,
 			     &enter_main_cond, &exit_base, &exit_step,
 			     &exit_cmp, &exit_bound);
   bool single_loop_p = !exit_base;
 
-  /* Let us assume that the unrolled loop is quite likely to be entered.  */
-  profile_probability prob_entry;
-  if (integer_nonzerop (enter_main_cond))
-    prob_entry = profile_probability::always ();
-  else
-    prob_entry = profile_probability::guessed_always ()
-		    .apply_scale (PROB_UNROLLED_LOOP_ENTERED, 100);
-
   gcond *exit_if = nullptr;
   class loop *new_loop = nullptr;
   edge new_exit;
   if (!single_loop_p)
     {
-      edge exit = single_dom_exit (loop);
+      profile_count entry_count = loop_preheader_edge (loop)->src->count;
+      /* Let us assume that the unrolled loop is quite likely to be
+	 entered.  */
+      profile_probability prob_entry;
+      if (integer_nonzerop (enter_main_cond))
+	prob_entry = profile_probability::always ();
+      else
+	prob_entry = profile_probability::guessed_always ()
+			.apply_scale (PROB_UNROLLED_LOOP_ENTERED, 100);
 
       /* The values for scales should keep profile consistent, and somewhat
-	 close to correct.
-
-	 TODO: The current value of SCALE_REST makes it appear that the loop
-	 that is created by splitting the remaining iterations of the unrolled
-	 loop is executed the same number of times as the original loop, and
-	 with the same frequencies, which is obviously wrong.  This does not
-	 appear to cause problems, so we do not bother with fixing it for now.
-	 To make the profile correct, we would need to change the probability
-	 of the exit edge of the loop, and recompute the distribution of
-	 frequencies in its body because of this change (scale the frequencies
-	 of blocks before and after the exit by appropriate factors).  */
-      profile_probability scale_unrolled = prob_entry;
+	 close to correct.  */
       new_loop = loop_version (loop, enter_main_cond, NULL, prob_entry,
-			       prob_entry.invert (), scale_unrolled,
-			       profile_probability::guessed_always (),
+			       prob_entry.invert (),
+			       prob_entry,
+			       /* We will later redirect exit from vectorized
+				  loop to new_loop.  */
+			       profile_probability::always (),
 			       true);
       gcc_assert (new_loop != NULL);
       update_ssa (TODO_update_ssa_no_phi);

@@ -1220,18 +1147,16 @@ tree_transform_and_unroll_loop (class loop *loop, unsigned factor,
       edge precond_edge = single_pred_edge (rest);
       split_edge (loop_latch_edge (loop));
       basic_block exit_bb = single_pred (loop->latch);
+      edge exit = single_dom_exit (loop);
 
       /* Since the exit edge will be removed, the frequency of all the blocks
-	 in the loop that are dominated by it must be scaled by
-	 1 / (1 - exit->probability).  */
+	 in the loop that are dominated by it must be scaled.  */
       if (exit->probability.initialized_p ())
 	scale_dominated_blocks_in_loop (loop, exit->src,
-					/* We are scaling up here so
-					   probability does not fit.  */
-					loop->header->count,
-					loop->header->count
-					- loop->header->count.apply_probability
-					    (exit->probability));
+					exit->src->count,
+					exit->src->count - exit->count ());
 
       gimple_stmt_iterator bsi = gsi_last_bb (exit_bb);
       exit_if = gimple_build_cond (EQ_EXPR, integer_zero_node,

@@ -1243,14 +1168,14 @@ tree_transform_and_unroll_loop (class loop *loop, unsigned factor,
       rescan_loop_exit (new_exit, true, false);
 
       /* Set the probability of new exit to the same of the old one.  Fix
-	 the frequency of the latch block, by scaling it back by
-	 1 - exit->probability.  */
+	 the count of the latch block.  */
       new_exit->probability = exit->probability;
       edge new_nonexit = single_pred_edge (loop->latch);
       new_nonexit->probability = exit->probability.invert ();
       new_nonexit->flags = EDGE_TRUE_VALUE;
-      if (new_nonexit->probability.initialized_p ())
-	scale_bbs_frequencies (&loop->latch, 1, new_nonexit->probability);
+      set_edge_probability_and_rescale_others
+	 (exit, profile_probability::never ());
+      loop->latch->count = new_nonexit->count ();
 
       edge old_entry = loop_preheader_edge (loop);
       edge new_entry = loop_preheader_edge (new_loop);

@@ -1296,12 +1221,21 @@ tree_transform_and_unroll_loop (class loop *loop, unsigned factor,
 	}
 
       remove_path (exit);
+      /* We will later redirect exit from vectorized loop to new_loop.  */
+      loop_preheader_edge (new_loop)->src->count = entry_count;
 
       /* The epilog loop latch executes at most factor - 1 times.
 	 Since the epilog is entered unconditionally it will need to handle
 	 up to factor executions of its body.  */
-      new_loop->any_upper_bound = 1;
+      new_loop->any_upper_bound = true;
       new_loop->nb_iterations_upper_bound = factor - 1;
+      /* We do not really know estimate on number of iterations, since we do
+	 not track any estimates modulo unroll factor.
+	 Drop estimate from loop_info and scale loop profile.
+	 It may be more realistic to scale loop profile to factor / 2 - 1,
+	 but vectorizer also uses factor - 1.  */
+      new_loop->any_estimate = false;
+      scale_loop_profile (new_loop, profile_probability::always (),
+			  factor - 1);
     }
   else
     new_exit = single_dom_exit (loop);

@@ -1318,10 +1252,10 @@ tree_transform_and_unroll_loop (class loop *loop, unsigned factor,
 
   auto_vec<edge> to_remove;
   bool ok
-    = gimple_duplicate_loop_body_to_header_edge (loop, loop_latch_edge (loop),
-						 factor - 1, wont_exit,
-						 new_exit, &to_remove,
-						 DLTHE_FLAG_UPDATE_FREQ);
+    = gimple_duplicate_loop_body_to_header_edge
+	 (loop, loop_latch_edge (loop), factor - 1, wont_exit,
+	  new_exit, &to_remove,
+	  DLTHE_FLAG_UPDATE_FREQ | (flat ? DLTHE_FLAG_FLAT_PROFILE : 0));
   gcc_assert (ok);
 
   for (edge e : to_remove)

@@ -1332,36 +1266,25 @@ tree_transform_and_unroll_loop (class loop *loop, unsigned factor,
   update_ssa (TODO_update_ssa);
 
   new_exit = single_dom_exit (loop);
+  /* gimple_duplicate_loop_body_to_header_edge depending on
+     DLTHE_FLAG_UPDATE_FREQ either keeps original frequency of the loop header
+     or scales it down accordingly.
+     However exit edge probability is kept as original.  Fix it if needed
+     and compensate.  */
+  profile_probability new_prob
+    = loop_preheader_edge
+	  (loop)->count ().probability_in (new_exit->src->count);
+  if (!(new_prob == new_exit->probability))
+    {
+      profile_count old_count = new_exit->src->count - new_exit->count ();
+      set_edge_probability_and_rescale_others (new_exit, new_prob);
+      profile_count new_count = new_exit->src->count - new_exit->count ();
+      scale_dominated_blocks_in_loop (loop, new_exit->src,
+				      new_count, old_count);
+    }
   if (!single_loop_p)
     {
-      /* Ensure that the frequencies in the loop match the new estimated
-	 number of iterations, and change the probability of the new
-	 exit edge.  */
-
-      profile_count freq_h = loop->header->count;
-      profile_count freq_e = (loop_preheader_edge (loop))->count ();
-      if (freq_h.nonzero_p ())
-	{
-	  /* Avoid dropping loop body profile counter to 0 because of zero
-	     count in loop's preheader.  */
-	  if (freq_h.nonzero_p () && !(freq_e == profile_count::zero ()))
-	    freq_e = freq_e.force_nonzero ();
-	  scale_loop_frequencies (loop, freq_e.probability_in (freq_h));
-	}
-
-      basic_block rest = new_exit->dest;
-      new_exit->probability
-	= (profile_probability::always () / (new_est_niter + 1));
-
-      rest->count += new_exit->count ();
-
-      edge new_nonexit = single_pred_edge (loop->latch);
-      profile_probability prob = new_nonexit->probability;
-      new_nonexit->probability = new_exit->probability.invert ();
-      prob = new_nonexit->probability / prob;
-      if (prob.initialized_p ())
-	scale_bbs_frequencies (&loop->latch, 1, prob);
-
       /* Finally create the new counter for number of iterations and add
 	 the new exit instruction.  */
       tree ctr_before, ctr_after;
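The compensation step above restores the invariant that the probability of the remaining exit equals the preheader count divided by the count of the exit's source block. A tiny numeric illustration with invented counts:

#include <cstdio>

int main ()
{
  /* A loop entered 100 times whose exit block runs 400 times should
     exit with probability 100/400; the loop then iterates about
     400/100 = 4 times per entry.  */
  double preheader_count = 100.0;
  double exit_src_count = 400.0;
  double exit_prob = preheader_count / exit_src_count;  /* probability_in */
  printf ("exit probability = %.2f, iterations per entry = %.1f\n",
	  exit_prob, exit_src_count / preheader_count);
  return 0;
}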
@@ -1374,66 +1297,15 @@ tree_transform_and_unroll_loop (class loop *loop, unsigned factor,
       gimple_cond_set_rhs (exit_if, exit_bound);
       update_stmt (exit_if);
     }
-  else
-    {
-      /* gimple_duplicate_loop_to_header_edge has adjusted the loop body's
-	 original profile counts in line with the unroll factor.  However,
-	 the old counts might not have been consistent with the old
-	 iteration count.
-
-	 Therefore, if the iteration count is known exactly, make sure that the
-	 profile counts of the loop header (and any other blocks that might be
-	 executed in the final iteration) are consistent with the combination
-	 of (a) the incoming profile count and (b) the new iteration count.  */
-      profile_count in_count = loop_preheader_edge (loop)->count ();
-      profile_count old_header_count = loop->header->count;
-      if (in_count.nonzero_p ()
-	  && old_header_count.nonzero_p ()
-	  && TREE_CODE (desc->niter) == INTEGER_CST)
-	{
-	  /* The + 1 converts latch counts to iteration counts.  */
-	  profile_count new_header_count = in_count * (new_est_niter + 1);
-	  basic_block *body = get_loop_body (loop);
-	  scale_bbs_frequencies_profile_count (body, loop->num_nodes,
-					       new_header_count,
-					       old_header_count);
-	  free (body);
-	}
-
-      /* gimple_duplicate_loop_to_header_edge discarded FACTOR - 1
-	 exit edges and adjusted the loop body's profile counts for the
-	 new probabilities of the remaining non-exit edges.  However,
-	 the remaining exit edge still has the same probability as it
-	 did before, even though it is now more likely.
-
-	 Therefore, all blocks executed after a failed exit test now have
-	 a profile count that is too high, and the sum of the profile counts
-	 for the header's incoming edges is greater than the profile count
-	 of the header itself.
-
-	 Adjust the profile counts of all code in the loop body after
-	 the exit test so that the sum of the counts on entry to the
-	 header agree.  */
-      profile_count old_latch_count = loop_latch_edge (loop)->count ();
-      profile_count new_latch_count = loop->header->count - in_count;
-      if (old_latch_count.nonzero_p () && new_latch_count.nonzero_p ())
-	scale_dominated_blocks_in_loop (loop, new_exit->src, new_latch_count,
-					old_latch_count);
-
-      /* Set the probability of the exit edge based on NEW_EST_NITER
-	 (which estimates latch counts rather than iteration counts).
-	 Update the probabilities of other edges to match.
-
-	 If the profile counts are large enough to give the required
-	 precision, the updates above will have made
-
-	   e->dest->count / e->src->count ~= new e->probability
-
-	 for every outgoing edge e of NEW_EXIT->src.  */
-      profile_probability new_exit_prob
-	= profile_probability::always () / (new_est_niter + 1);
-      change_edge_frequency (new_exit, new_exit_prob);
-    }
+  if (loop->any_upper_bound)
+    loop->nb_iterations_upper_bound = wi::udiv_floor
+	    (loop->nb_iterations_upper_bound + 1, factor) - 1;
+  if (loop->any_likely_upper_bound)
+    loop->nb_iterations_likely_upper_bound = wi::udiv_floor
+	    (loop->nb_iterations_likely_upper_bound + 1, factor) - 1;
+  if (loop->any_estimate)
+    loop->nb_iterations_estimate = wi::udiv_floor
+	    (loop->nb_iterations_estimate + 1, factor) - 1;
 
   checking_verify_flow_info ();
   checking_verify_loop_structure ();
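The loop_info updates at the end apply the same +1/-1 conversion to each recorded bound, since the bounds count latch executions rather than iterations. With made-up numbers:

#include <cstdio>

int main ()
{
  /* A loop bounded by 99 latch executions (100 iterations), unrolled
     4 times: the unrolled latch runs at most 100 / 4 - 1 = 24 times.  */
  unsigned long bound = 99, factor = 4;
  unsigned long new_bound = (bound + 1) / factor - 1;   /* wi::udiv_floor */
  printf ("new upper bound = %lu\n", new_bound);        /* 24 */
  return 0;
}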