Fix profile update in tree_transform_and_unroll_loop

Fix the profile update in tree_transform_and_unroll_loop, which is used
by predictive commoning.  I started by attempting to fix
gcc.dg/tree-ssa/update-unroll-1.c, which I xfailed last week, but it turned
out to be a harder job.

Unrolling was never updated for the changes in duplicate_loop_body_to_header_edge,
which is now smarter about getting the profile right when some exits are eliminated.
A lot of the manual profile updating can thus now be done using the existing
infrastructure.

I also noticed that scale_dominated_blocks_in_loop does a job identical
to a loop I wrote in scale_loop_profile, so I commonized the implementation
and removed the recursion.

I also extended duplicate_loop_body_to_header_edge to handle flat profiles the
same way as we do in the vectorizer.  Without it we end up with a less-than-0
iteration count in gcc.dg/tree-ssa/update-unroll-1.c (it is unrolled 32 times
but predicted to iterate fewer times), and added missing code to update loop_info.

gcc/ChangeLog:

	* cfgloopmanip.cc (scale_dominated_blocks_in_loop): Move here from
	tree-ssa-loop-manip.cc and avoid recursion.
	(scale_loop_profile): Use scale_dominated_blocks_in_loop.
	(duplicate_loop_body_to_header_edge): Add DLTHE_FLAG_FLAT_PROFILE
	flag.
	* cfgloopmanip.h (DLTHE_FLAG_FLAT_PROFILE): Define.
	(scale_dominated_blocks_in_loop): Declare.
	* predict.cc (dump_prediction): Do not ICE on uninitialized probability.
	(change_edge_frequency): Remove.
	* predict.h (change_edge_frequency): Remove.
	* tree-ssa-loop-manip.cc (scale_dominated_blocks_in_loop): Move to
	cfgloopmanip.cc.
	(niter_for_unrolled_loop): Remove.
	(tree_transform_and_unroll_loop): Fix profile update.

gcc/testsuite/ChangeLog:

	* gcc.dg/pr102385.c: Check for no profile mismatches.
	* gcc.dg/pr96931.c: Check for no profile mismatches.
	* gcc.dg/tree-ssa/predcom-1.c: Check for no profile mismatches.
	* gcc.dg/tree-ssa/predcom-2.c: Check for no profile mismatches.
	* gcc.dg/tree-ssa/predcom-3.c: Check for no profile mismatches.
	* gcc.dg/tree-ssa/predcom-4.c: Check for no profile mismatches.
	* gcc.dg/tree-ssa/predcom-5.c: Check for no profile mismatches.
	* gcc.dg/tree-ssa/predcom-7.c: Check for one profile mismatch.
	* gcc.dg/tree-ssa/predcom-8.c: Check for no profile mismatches.
	* gcc.dg/tree-ssa/predcom-dse-1.c: Check for no profile mismatches.
	* gcc.dg/tree-ssa/predcom-dse-10.c: Check for no profile mismatches.
	* gcc.dg/tree-ssa/predcom-dse-11.c: Check for no profile mismatches.
	* gcc.dg/tree-ssa/predcom-dse-12.c: Check for no profile mismatches.
	* gcc.dg/tree-ssa/predcom-dse-2.c: Check for no profile mismatches.
	* gcc.dg/tree-ssa/predcom-dse-3.c: Check for no profile mismatches.
	* gcc.dg/tree-ssa/predcom-dse-4.c: Check for no profile mismatches.
	* gcc.dg/tree-ssa/predcom-dse-5.c: Check for no profile mismatches.
	* gcc.dg/tree-ssa/predcom-dse-6.c: Check for no profile mismatches.
	* gcc.dg/tree-ssa/predcom-dse-7.c: Check for no profile mismatches.
	* gcc.dg/tree-ssa/predcom-dse-8.c: Check for no profile mismatches.
	* gcc.dg/tree-ssa/predcom-dse-9.c: Check for no profile mismatches.
	* gcc.dg/tree-ssa/update-unroll-1.c: Unxfail.
This commit is contained in:
Jan Hubicka 2023-07-27 16:17:59 +02:00
parent 5b06b3b877
commit 9bd1ee6d33
27 changed files with 158 additions and 275 deletions

View file

@ -499,6 +499,32 @@ scale_loop_frequencies (class loop *loop, profile_probability p)
free (bbs);
}
/* Scales the frequencies of all basic blocks in LOOP that are strictly
dominated by BB by NUM/DEN. */
void
scale_dominated_blocks_in_loop (class loop *loop, basic_block bb,
profile_count num, profile_count den)
{
basic_block son;
if (!den.nonzero_p () && !(num == profile_count::zero ()))
return;
auto_vec <basic_block, 8> worklist;
worklist.safe_push (bb);
while (!worklist.is_empty ())
for (son = first_dom_son (CDI_DOMINATORS, worklist.pop ());
son;
son = next_dom_son (CDI_DOMINATORS, son))
{
if (!flow_bb_inside_loop_p (loop, son))
continue;
son->count = son->count.apply_scale (num, den);
worklist.safe_push (son);
}
}
/* Scale profile in LOOP by P.
If ITERATION_BOUND is not -1, scale even further if loop is predicted
to iterate too many times.
@ -649,19 +675,9 @@ scale_loop_profile (class loop *loop, profile_probability p,
if (other_edge && other_edge->dest == loop->latch)
loop->latch->count -= new_exit_count - old_exit_count;
else
{
basic_block *body = get_loop_body (loop);
profile_count new_count = exit_edge->src->count - new_exit_count;
profile_count old_count = exit_edge->src->count - old_exit_count;
for (unsigned int i = 0; i < loop->num_nodes; i++)
if (body[i] != exit_edge->src
&& dominated_by_p (CDI_DOMINATORS, body[i], exit_edge->src))
body[i]->count = body[i]->count.apply_scale (new_count,
old_count);
free (body);
}
scale_dominated_blocks_in_loop (loop, exit_edge->src,
exit_edge->src->count - new_exit_count,
exit_edge->src->count - old_exit_count);
}
else if (dump_file && (dump_flags & TDF_DETAILS))
{
@ -1237,6 +1253,7 @@ duplicate_loop_body_to_header_edge (class loop *loop, edge e,
should've managed the flags so all except for original loop
has won't exist set. */
scale_act = wanted_count.probability_in (count_in);
/* Now simulate the duplication adjustments and compute header
frequency of the last copy. */
for (i = 0; i < ndupl; i++)
@ -1252,16 +1269,21 @@ duplicate_loop_body_to_header_edge (class loop *loop, edge e,
profile_probability prob_pass_main = bitmap_bit_p (wont_exit, 0)
? prob_pass_wont_exit
: prob_pass_thru;
profile_probability p = prob_pass_main;
profile_count scale_main_den = count_in;
for (i = 0; i < ndupl; i++)
if (!(flags & DLTHE_FLAG_FLAT_PROFILE))
{
scale_main_den += count_in.apply_probability (p);
p = p * scale_step[i];
profile_probability p = prob_pass_main;
profile_count scale_main_den = count_in;
for (i = 0; i < ndupl; i++)
{
scale_main_den += count_in.apply_probability (p);
p = p * scale_step[i];
}
/* If original loop is executed COUNT_IN times, the unrolled
loop will account SCALE_MAIN_DEN times. */
scale_main = count_in.probability_in (scale_main_den);
}
/* If original loop is executed COUNT_IN times, the unrolled
loop will account SCALE_MAIN_DEN times. */
scale_main = count_in.probability_in (scale_main_den);
else
scale_main = profile_probability::always ();
scale_act = scale_main * prob_pass_main;
}
else

View file

@ -32,6 +32,8 @@ enum
field of newly create BB. */
#define DLTHE_FLAG_COMPLETTE_PEEL 4 /* Update frequencies expecting
a complete peeling. */
#define DLTHE_FLAG_FLAT_PROFILE 8 /* Profile is flat; do not reduce
count by unroll factor. */
extern edge mfb_kj_edge;
extern bool remove_path (edge, bool * = NULL, bitmap = NULL);
@ -64,5 +66,7 @@ class loop * loop_version (class loop *, void *,
profile_probability, profile_probability,
profile_probability, profile_probability, bool);
void adjust_loop_info_after_peeling (class loop *loop, int npeel, bool precise);
void scale_dominated_blocks_in_loop (class loop *loop, basic_block bb,
profile_count num, profile_count den);
#endif /* GCC_CFGLOOPMANIP_H */

View file

@ -790,7 +790,7 @@ dump_prediction (FILE *file, enum br_predictor predictor, int probability,
{
fprintf (file, " exec ");
bb->count.dump (file);
if (e)
if (e && e->count ().initialized_p () && bb->count.to_gcov_type ())
{
fprintf (file, " hit ");
e->count ().dump (file);
@ -4634,43 +4634,6 @@ force_edge_cold (edge e, bool impossible)
}
}
/* Change E's probability to NEW_E_PROB, redistributing the probabilities
of other outgoing edges proportionally.
Note that this function does not change the profile counts of any
basic blocks. The caller must do that instead, using whatever
information it has about the region that needs updating. */
void
change_edge_frequency (edge e, profile_probability new_e_prob)
{
profile_probability old_e_prob = e->probability;
/* Probability mass previously shared by all successors other than E.  */
profile_probability old_other_prob = old_e_prob.invert ();
/* Probability mass the other successors must share afterwards.  */
profile_probability new_other_prob = new_e_prob.invert ();
e->probability = new_e_prob;
/* Running sum of the probabilities assigned so far; used so the last
edge can be given exactly the remaining mass.  */
profile_probability cumulative_prob = new_e_prob;
/* Number of outgoing edges other than E still left to update.  */
unsigned int num_other = EDGE_COUNT (e->src->succs) - 1;
edge other_e;
edge_iterator ei;
FOR_EACH_EDGE (other_e, ei, e->src->succs)
if (other_e != e)
{
num_other -= 1;
if (num_other == 0)
/* Ensure that the probabilities add up to 1 without
rounding error. */
other_e->probability = cumulative_prob.invert ();
else
{
/* Scale by new_other_prob / old_other_prob so the redistribution
is proportional to the old weights.  */
other_e->probability /= old_other_prob;
other_e->probability *= new_other_prob;
cumulative_prob += other_e->probability;
}
}
}
#if CHECKING_P
namespace selftest {

View file

@ -100,7 +100,6 @@ extern void rebuild_frequencies (void);
extern void report_predictor_hitrates (void);
extern void force_edge_cold (edge, bool);
extern void propagate_unlikely_bbs_forward (void);
extern void change_edge_frequency (edge, profile_probability);
extern void add_reg_br_prob_note (rtx_insn *, profile_probability);

View file

@ -1,4 +1,4 @@
/* { dg-options "-Wall -Wextra -O2 -fno-toplevel-reorder -fno-tree-ch -fno-tree-dce -fno-tree-dominator-opts -fno-tree-dse -fno-tree-loop-ivcanon -fpredictive-commoning" } */
/* { dg-options "-Wall -Wextra -O2 -fno-toplevel-reorder -fno-tree-ch -fno-tree-dce -fno-tree-dominator-opts -fno-tree-dse -fno-tree-loop-ivcanon -fpredictive-commoning -fdump-tree-pcom-details-blocks -fdump-tree-lim-details-blocks" } */
short a, b;
int c[9];
@ -12,3 +12,5 @@ void e() {
}
}
int main() {return 0;}
/* { dg-final { scan-tree-dump-not "Invalid sum" "pcom" } } */
/* { dg-final { scan-tree-dump-not "Invalid sum" "lim2" } } */

View file

@ -1,5 +1,5 @@
/* { dg-do compile } */
/* { dg-options "-O1 -fpredictive-commoning -fno-tree-loop-im" } */
/* { dg-options "-O1 -fpredictive-commoning -fno-tree-loop-im -fdump-tree-pcom-details-blocks" } */
int bl;
@ -17,3 +17,4 @@ ie (void)
ie ();
}
/* { dg-final { scan-tree-dump-not "Invalid sum" "pcom" } } */

View file

@ -1,6 +1,6 @@
/* { dg-do compile } */
/* { dg-do run } */
/* { dg-options "-O2 -fno-tree-vectorize -funroll-loops --param max-unroll-times=8 -fpredictive-commoning -fdump-tree-pcom-details" } */
/* { dg-options "-O2 -fno-tree-vectorize -funroll-loops --param max-unroll-times=8 -fpredictive-commoning -fdump-tree-pcom-details-blocks" } */
void abort (void);
@ -47,3 +47,4 @@ int main(void)
/* Also check that we undid the transformation previously made by PRE. */
/* { dg-final { scan-tree-dump-times "looparound ref" 1 "pcom" } } */
/* { dg-final { scan-tree-dump-not "Invalid sum" "pcom" } } */

View file

@ -1,5 +1,5 @@
/* { dg-do run } */
/* { dg-options "-O2 -funroll-loops --param max-unroll-times=8 -fpredictive-commoning -fdump-tree-pcom-details -fno-tree-pre" } */
/* { dg-options "-O2 -funroll-loops --param max-unroll-times=8 -fpredictive-commoning -fdump-tree-pcom-details-blocks -fno-tree-pre" } */
/* { dg-additional-options "-fno-tree-vectorize" { target amdgcn-*-* } } */
void abort (void);
@ -44,3 +44,4 @@ int main(void)
/* Verify that both loops were transformed and unrolled. */
/* { dg-final { scan-tree-dump-times "Unrolling 2 times." 2 "pcom"} } */
/* { dg-final { scan-tree-dump-not "Invalid sum" "pcom" } } */

View file

@ -1,5 +1,5 @@
/* { dg-do compile } */
/* { dg-options "-O2 -funroll-loops --param max-unroll-times=8 -fpredictive-commoning -fdump-tree-pcom-details -fno-tree-pre -fno-tree-loop-vectorize" } */
/* { dg-options "-O2 -funroll-loops --param max-unroll-times=8 -fpredictive-commoning -fdump-tree-pcom-details-blocks -fno-tree-pre -fno-tree-loop-vectorize" } */
int a[1000], b[1000];
@ -13,3 +13,4 @@ void test(void)
/* Verify that we used 3 temporary variables for the loop. */
/* { dg-final { scan-tree-dump-times "Unrolling 3 times." 1 "pcom"} } */
/* { dg-final { scan-tree-dump-not "Invalid sum" "pcom" } } */

View file

@ -1,5 +1,5 @@
/* { dg-do run } */
/* { dg-options "-O2 -funroll-loops --param max-unroll-times=8 -fpredictive-commoning -fdump-tree-pcom-details" } */
/* { dg-options "-O2 -funroll-loops --param max-unroll-times=8 -fpredictive-commoning -fdump-tree-pcom-details-blocks" } */
/* Test for predictive commoning of expressions, without reassociation. */
@ -26,3 +26,4 @@ int main(void)
/* { dg-final { scan-tree-dump-times "Combination" 1 "pcom"} } */
/* { dg-final { scan-tree-dump-times "Unrolling 3 times." 1 "pcom"} } */
/* { dg-final { scan-tree-dump-not "Invalid sum" "pcom" } } */

View file

@ -1,5 +1,5 @@
/* { dg-do run } */
/* { dg-options "-O2 -funroll-loops --param max-unroll-times=8 -fpredictive-commoning -fdump-tree-pcom-details" } */
/* { dg-options "-O2 -funroll-loops --param max-unroll-times=8 -fpredictive-commoning -fdump-tree-pcom-details-blocks" } */
/* Test for predictive commoning of expressions, with reassociation. */
@ -26,3 +26,4 @@ int main(void)
/* { dg-final { scan-tree-dump-times "Combination" 2 "pcom"} } */
/* { dg-final { scan-tree-dump-times "Unrolling 3 times." 1 "pcom"} } */
/* { dg-final { scan-tree-dump-not "Invalid sum" "pcom" } } */

View file

@ -1,5 +1,5 @@
/* { dg-do run } */
/* { dg-options "-O3 -fdump-tree-pcom-details" } */
/* { dg-options "-O3 -fdump-tree-pcom-details-blocks" } */
int b, f, d[5][2];
unsigned int c;
@ -15,3 +15,7 @@ main ()
}
/* { dg-final { scan-tree-dump "Executing predictive commoning" "pcom" } } */
/* dom pass introduces one mismatch after simplfying mispredicted conditional
on c being non-zero on first iteration. This happens since c is global variable
and needs alias analysis. */
/* { dg-final { scan-tree-dump-times "Invalid sum" 1 "pcom" } } */

View file

@ -1,5 +1,5 @@
/* { dg-do compile } */
/* { dg-options "-O3 -fdump-tree-pcom-details" } */
/* { dg-options "-O3 -fdump-tree-pcom-details-blocks" } */
int is_sorted(int *a, int n)
{
@ -10,3 +10,4 @@ int is_sorted(int *a, int n)
}
/* { dg-final { scan-tree-dump "Executing predictive commoning without unrolling" "pcom" } } */
/* { dg-final { scan-tree-dump-not "Invalid sum" "pcom" } } */

View file

@ -1,5 +1,5 @@
/* { dg-do run } */
/* { dg-options "-O2 -fno-inline -fno-tree-loop-distribute-patterns -fpredictive-commoning -fdump-tree-pcom-details" } */
/* { dg-options "-O2 -fno-inline -fno-tree-loop-distribute-patterns -fpredictive-commoning -fdump-tree-pcom-details-blocks" } */
int arr[105] = {2, 3, 5, 7, 11};
int result0[10] = {2, 3, 5, 7, 11};
@ -60,3 +60,4 @@ int main (void)
return 0;
}
/* { dg-final { scan-tree-dump "Store-stores chain" "pcom"} } */
/* { dg-final { scan-tree-dump-not "Invalid sum" "pcom" } } */

View file

@ -1,5 +1,5 @@
/* { dg-do run } */
/* { dg-options "-O2 -fno-inline -fpredictive-commoning -fdump-tree-pcom-details" } */
/* { dg-options "-O2 -fno-inline -fpredictive-commoning -fdump-tree-pcom-details-blocks" } */
int arr[105] = {2, 3, 5, 7, 11};
int result0[10] = {2, 3, 5, 7, 11};
@ -41,4 +41,4 @@ int main (void)
return 0;
}
/* { dg-final { scan-tree-dump-not "Store-stores chain" "pcom"} } */
/* { dg-final { scan-tree-dump-not "Invalid sum" "pcom" } } */

View file

@ -1,5 +1,5 @@
/* { dg-do run } */
/* { dg-options "-O2 -fno-inline -fpredictive-commoning -fdump-tree-pcom-details" } */
/* { dg-options "-O2 -fno-inline -fpredictive-commoning -fdump-tree-pcom-details-blocks" } */
int arr[105] = {2, 3, 5, 7, 11};
int x[105] = {2, 3, 5, 7, 11};
@ -48,4 +48,5 @@ int main (void)
}
/* { dg-final { scan-tree-dump "Store-stores chain" "pcom"} } */
/* { dg-final { scan-tree-dump "Store-loads chain" "pcom"} } */
/* { dg-final { scan-tree-dump-not "Invalid sum" "pcom" } } */

View file

@ -1,5 +1,5 @@
/* { dg-do run } */
/* { dg-options "-O2 -fno-inline -fpredictive-commoning -fdump-tree-pcom-details" } */
/* { dg-options "-O2 -fno-inline -fpredictive-commoning -fdump-tree-pcom-details-blocks" } */
int arr[105] = {2, 3, 5, 7, 11};
int result0[10] = {2, 3, 5, 7, 11};
@ -65,3 +65,4 @@ int main (void)
return 0;
}
/* { dg-final { scan-tree-dump "Store-stores chain" "pcom"} } */
/* { dg-final { scan-tree-dump-not "Invalid sum" "pcom" } } */

View file

@ -1,5 +1,5 @@
/* { dg-do run } */
/* { dg-options "-O2 -fno-inline -fno-tree-loop-distribute-patterns -fpredictive-commoning -fdump-tree-pcom-details" } */
/* { dg-options "-O2 -fno-inline -fno-tree-loop-distribute-patterns -fpredictive-commoning -fdump-tree-pcom-details-blocks" } */
int arr[105] = {2, 3, 5, 7, 11};
int result0[10] = {2, 3, 5, 7, 11};
@ -60,3 +60,4 @@ int main (void)
return 0;
}
/* { dg-final { scan-tree-dump "Store-stores chain" "pcom"} } */
/* { dg-final { scan-tree-dump-not "Invalid sum" "pcom" } } */

View file

@ -1,5 +1,5 @@
/* { dg-do run } */
/* { dg-options "-O2 -fno-tree-vectorize -fno-inline -fno-tree-loop-distribute-patterns -fpredictive-commoning -fdump-tree-pcom-details" } */
/* { dg-options "-O2 -fno-tree-vectorize -fno-inline -fno-tree-loop-distribute-patterns -fpredictive-commoning -fdump-tree-pcom-details-blocks" } */
int arr1[105] = {2, 3, 5, 7, 11, 13, 0};
int arr2[105] = {2, 3, 5, 7, 11, 13, 0};
@ -106,3 +106,4 @@ int main (void)
return 0;
}
/* { dg-final { scan-tree-dump-times "Store-stores chain" 4 "pcom"} } */
/* { dg-final { scan-tree-dump-not "Invalid sum" "pcom" } } */

View file

@ -1,5 +1,5 @@
/* { dg-do run } */
/* { dg-options "-O2 -fno-inline -fno-tree-loop-distribute-patterns -fpredictive-commoning -fdump-tree-pcom-details" } */
/* { dg-options "-O2 -fno-inline -fno-tree-loop-distribute-patterns -fpredictive-commoning -fdump-tree-pcom-details-blocks" } */
int arr[105] = {2, 3, 5, 7, 11};
int result0[10] = {2, 3, 5, 7, 11};
@ -59,3 +59,4 @@ int main (void)
return 0;
}
/* { dg-final { scan-tree-dump "Store-stores chain" "pcom"} } */
/* { dg-final { scan-tree-dump-not "Invalid sum" "pcom" } } */

View file

@ -1,5 +1,5 @@
/* { dg-do run } */
/* { dg-options "-O2 -fno-inline -fpredictive-commoning -fdump-tree-pcom-details" } */
/* { dg-options "-O2 -fno-inline -fpredictive-commoning -fdump-tree-pcom-details-blocks" } */
int arr[105] = {2, 3, 5, 7, 11};
int result0[10] = {2, 3, 5, 7, 11};
@ -61,3 +61,4 @@ int main (void)
return 0;
}
/* { dg-final { scan-tree-dump "Store-stores chain" "pcom"} } */
/* { dg-final { scan-tree-dump-not "Invalid sum" "pcom" } } */

View file

@ -1,5 +1,5 @@
/* { dg-do run } */
/* { dg-options "-O2 -fno-inline -fpredictive-commoning -fdump-tree-pcom-details" } */
/* { dg-options "-O2 -fno-inline -fpredictive-commoning -fdump-tree-pcom-details-blocks" } */
int arr[105] = {2, 3, 5, 7, 11, 13, 17, 19};
int result0[10] = {2, 3, 5, 7, 11, 13, 17, 19};
@ -63,3 +63,4 @@ int main (void)
return 0;
}
/* { dg-final { scan-tree-dump "Store-stores chain" "pcom"} } */
/* { dg-final { scan-tree-dump-not "Invalid sum" "pcom" } } */

View file

@ -1,5 +1,5 @@
/* { dg-do run } */
/* { dg-options "-O2 -fno-inline -fpredictive-commoning -fdump-tree-pcom-details" } */
/* { dg-options "-O2 -fno-inline -fpredictive-commoning -fdump-tree-pcom-details-blocks" } */
int arr[105] = {2, 3, 5, 7, 11, 13, 17, 19};
int result0[10] = {2, 3, 5, 7, 11, 13, 17, 19};
@ -61,3 +61,4 @@ int main (void)
return 0;
}
/* { dg-final { scan-tree-dump "Store-stores chain" "pcom"} } */
/* { dg-final { scan-tree-dump-not "Invalid sum" "pcom" } } */

View file

@ -1,5 +1,5 @@
/* { dg-do run } */
/* { dg-options "-O2 -fno-inline -fpredictive-commoning -fdump-tree-pcom-details" } */
/* { dg-options "-O2 -fno-inline -fpredictive-commoning -fdump-tree-pcom-details-blocks" } */
int arr[105] = {2, 3, 5, 7, 11, 13, 17, 19};
int result0[10] = {2, 3, 5, 7, 11, 13, 17, 19};
@ -58,3 +58,4 @@ int main (void)
return 0;
}
/* { dg-final { scan-tree-dump-not "Invalid sum" "pcom" } } */

View file

@ -1,5 +1,5 @@
/* { dg-do run } */
/* { dg-options "-O2 -fno-inline -fpredictive-commoning -fdump-tree-pcom-details" } */
/* { dg-options "-O2 -fno-inline -fpredictive-commoning -fdump-tree-pcom-details-blocks" } */
int arr1[105] = {2, 3, 5, 7, 11, 13, 17, 19};
int arr2[105] = {2, 3, 5, 7, 11, 13, 17, 19};
@ -88,3 +88,4 @@ int main (void)
return 0;
}
/* { dg-final { scan-tree-dump "Store-stores chain" "pcom"} } */
/* { dg-final { scan-tree-dump-not "Invalid sum" "pcom" } } */

View file

@ -16,5 +16,4 @@ int foo(unsigned n)
/* We used to make the probability that the body of the loop (unrolled
to enable prefetching) is entered 0, which is not correct. */
/* { dg-final { scan-tree-dump-not "Invalid sum" "aprefetch" { xfail *-*-* }} } */
/* { dg-final { scan-tree-dump-not "SUCC: 7 .100.0%" "aprefetch"} } */

View file

@ -1040,71 +1040,6 @@ determine_exit_conditions (class loop *loop, class tree_niter_desc *desc,
*exit_bound = bound;
}
/* Scales the frequencies of all basic blocks in LOOP that are strictly
dominated by BB by NUM/DEN. */
static void
scale_dominated_blocks_in_loop (class loop *loop, basic_block bb,
profile_count num, profile_count den)
{
basic_block son;
/* A zero denominator is only meaningful when the numerator is zero
too; otherwise leave the counts untouched.  */
if (!den.nonzero_p () && !(num == profile_count::zero ()))
return;
/* Recurse over the dominator subtree of BB; BB itself is not scaled,
only the blocks strictly dominated by it.  */
for (son = first_dom_son (CDI_DOMINATORS, bb);
son;
son = next_dom_son (CDI_DOMINATORS, son))
{
/* Dominated blocks outside LOOP (and their subtrees) are skipped.  */
if (!flow_bb_inside_loop_p (loop, son))
continue;
scale_bbs_frequencies_profile_count (&son, 1, num, den);
scale_dominated_blocks_in_loop (loop, son, num, den);
}
}
/* Return the estimated number of latch executions of LOOP after
   unrolling it FACTOR times, or -1 if no estimate is available.  */
gcov_type
niter_for_unrolled_loop (class loop *loop, unsigned factor)
{
  gcc_assert (factor != 0);
  bool profile_p = false;
  gcov_type est_niter = expected_loop_iterations_unbounded (loop, &profile_p);
  /* No estimate at all: bail out before doing any arithmetic on the
     sentinel value.  */
  if (est_niter == -1)
    return -1;
  /* Note that this is really CEIL (est_niter + 1, factor) - 1, where the
     "+ 1" converts latch iterations to loop iterations and the "- 1"
     converts back.  */
  gcov_type new_est_niter = est_niter / factor;
  /* Without profile feedback, loops for which we do not know a better estimate
     are assumed to roll 10 times.  When we unroll such loop, it appears to
     roll too little, and it may even seem to be cold.  To avoid this, we
     ensure that the created loop appears to roll at least 5 times (but at
     most as many times as before unrolling).  Don't do adjustment if profile
     feedback is present.  */
  if (new_est_niter < 5 && !profile_p)
    {
      if (est_niter < 5)
	new_est_niter = est_niter;
      else
	new_est_niter = 5;
    }
  /* Never estimate more iterations than the recorded upper bound allows.  */
  if (loop->any_upper_bound)
    {
      /* As above, this is really CEIL (upper_bound + 1, factor) - 1.  */
      widest_int bound = wi::udiv_floor (loop->nb_iterations_upper_bound,
					 factor);
      if (wi::ltu_p (bound, new_est_niter))
	new_est_niter = bound.to_uhwi ();
    }
  return new_est_niter;
}
/* Unroll LOOP FACTOR times. LOOP is known to have a single exit edge
whose source block dominates the latch. DESC describes the number of
iterations of LOOP.
@ -1169,47 +1104,39 @@ tree_transform_and_unroll_loop (class loop *loop, unsigned factor,
transform_callback transform,
void *data)
{
gcov_type new_est_niter = niter_for_unrolled_loop (loop, factor);
unsigned irr = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP;
enum tree_code exit_cmp;
tree enter_main_cond, exit_base, exit_step, exit_bound;
bool flat = maybe_flat_loop_profile (loop);
determine_exit_conditions (loop, desc, factor,
&enter_main_cond, &exit_base, &exit_step,
&exit_cmp, &exit_bound);
bool single_loop_p = !exit_base;
/* Let us assume that the unrolled loop is quite likely to be entered. */
profile_probability prob_entry;
if (integer_nonzerop (enter_main_cond))
prob_entry = profile_probability::always ();
else
prob_entry = profile_probability::guessed_always ()
.apply_scale (PROB_UNROLLED_LOOP_ENTERED, 100);
gcond *exit_if = nullptr;
class loop *new_loop = nullptr;
edge new_exit;
if (!single_loop_p)
{
edge exit = single_dom_exit (loop);
profile_count entry_count = loop_preheader_edge (loop)->src->count;
/* Let us assume that the unrolled loop is quite likely to be entered. */
profile_probability prob_entry;
if (integer_nonzerop (enter_main_cond))
prob_entry = profile_probability::always ();
else
prob_entry = profile_probability::guessed_always ()
.apply_scale (PROB_UNROLLED_LOOP_ENTERED, 100);
/* The values for scales should keep profile consistent, and somewhat
close to correct.
TODO: The current value of SCALE_REST makes it appear that the loop
that is created by splitting the remaining iterations of the unrolled
loop is executed the same number of times as the original loop, and
with the same frequencies, which is obviously wrong. This does not
appear to cause problems, so we do not bother with fixing it for now.
To make the profile correct, we would need to change the probability
of the exit edge of the loop, and recompute the distribution of
frequencies in its body because of this change (scale the frequencies
of blocks before and after the exit by appropriate factors). */
profile_probability scale_unrolled = prob_entry;
close to correct. */
new_loop = loop_version (loop, enter_main_cond, NULL, prob_entry,
prob_entry.invert (), scale_unrolled,
profile_probability::guessed_always (),
prob_entry.invert (),
prob_entry,
/* We will later redirect exit from vectorized
loop to new_loop. */
profile_probability::always (),
true);
gcc_assert (new_loop != NULL);
update_ssa (TODO_update_ssa_no_phi);
@ -1220,18 +1147,16 @@ tree_transform_and_unroll_loop (class loop *loop, unsigned factor,
edge precond_edge = single_pred_edge (rest);
split_edge (loop_latch_edge (loop));
basic_block exit_bb = single_pred (loop->latch);
edge exit = single_dom_exit (loop);
/* Since the exit edge will be removed, the frequency of all the blocks
in the loop that are dominated by it must be scaled by
1 / (1 - exit->probability). */
in the loop that are dominated by it must be scaled. */
if (exit->probability.initialized_p ())
scale_dominated_blocks_in_loop (loop, exit->src,
/* We are scaling up here so
probability does not fit. */
loop->header->count,
loop->header->count
- loop->header->count.apply_probability
(exit->probability));
exit->src->count,
exit->src->count - exit->count ());
gimple_stmt_iterator bsi = gsi_last_bb (exit_bb);
exit_if = gimple_build_cond (EQ_EXPR, integer_zero_node,
@ -1243,14 +1168,14 @@ tree_transform_and_unroll_loop (class loop *loop, unsigned factor,
rescan_loop_exit (new_exit, true, false);
/* Set the probability of new exit to the same of the old one. Fix
the frequency of the latch block, by scaling it back by
1 - exit->probability. */
the count of the latch block. */
new_exit->probability = exit->probability;
edge new_nonexit = single_pred_edge (loop->latch);
new_nonexit->probability = exit->probability.invert ();
new_nonexit->flags = EDGE_TRUE_VALUE;
if (new_nonexit->probability.initialized_p ())
scale_bbs_frequencies (&loop->latch, 1, new_nonexit->probability);
set_edge_probability_and_rescale_others
(exit, profile_probability::never ());
loop->latch->count = new_nonexit->count ();
edge old_entry = loop_preheader_edge (loop);
edge new_entry = loop_preheader_edge (new_loop);
@ -1296,12 +1221,21 @@ tree_transform_and_unroll_loop (class loop *loop, unsigned factor,
}
remove_path (exit);
/* We will later redirect exit from vectorized loop to new_loop. */
loop_preheader_edge (new_loop)->src->count = entry_count;
/* The epilog loop latch executes at most factor - 1 times.
Since the epilog is entered unconditionally it will need to handle
up to factor executions of its body. */
new_loop->any_upper_bound = 1;
new_loop->any_upper_bound = true;
new_loop->nb_iterations_upper_bound = factor - 1;
/* We do not really know estimate on number of iterations, since we do not
track any estimates modulo unroll factor.
Drop estimate from loop_info and scale loop profile.
It may be more realistic to scale loop profile to factor / 2 - 1,
but vectorizer also uses factor - 1. */
new_loop->any_estimate = false;
scale_loop_profile (new_loop, profile_probability::always (), factor - 1);
}
else
new_exit = single_dom_exit (loop);
@ -1318,10 +1252,10 @@ tree_transform_and_unroll_loop (class loop *loop, unsigned factor,
auto_vec<edge> to_remove;
bool ok
= gimple_duplicate_loop_body_to_header_edge (loop, loop_latch_edge (loop),
factor - 1, wont_exit,
new_exit, &to_remove,
DLTHE_FLAG_UPDATE_FREQ);
= gimple_duplicate_loop_body_to_header_edge
(loop, loop_latch_edge (loop), factor - 1, wont_exit,
new_exit, &to_remove,
DLTHE_FLAG_UPDATE_FREQ | (flat ? DLTHE_FLAG_FLAT_PROFILE : 0));
gcc_assert (ok);
for (edge e : to_remove)
@ -1332,36 +1266,25 @@ tree_transform_and_unroll_loop (class loop *loop, unsigned factor,
update_ssa (TODO_update_ssa);
new_exit = single_dom_exit (loop);
/* gimple_duplicate_loop_body_to_header_edge depending on
DLTHE_FLAG_UPDATE_FREQ either keeps original frequency of the loop header
or scales it down accordingly.
However exit edge probability is kept as original. Fix it if needed
and compensate. */
profile_probability new_prob
= loop_preheader_edge
(loop)->count ().probability_in (new_exit->src->count);
if (!(new_prob == new_exit->probability))
{
profile_count old_count = new_exit->src->count - new_exit->count ();
set_edge_probability_and_rescale_others (new_exit, new_prob);
profile_count new_count = new_exit->src->count - new_exit->count ();
scale_dominated_blocks_in_loop (loop, new_exit->src,
new_count, old_count);
}
if (!single_loop_p)
{
/* Ensure that the frequencies in the loop match the new estimated
number of iterations, and change the probability of the new
exit edge. */
profile_count freq_h = loop->header->count;
profile_count freq_e = (loop_preheader_edge (loop))->count ();
if (freq_h.nonzero_p ())
{
/* Avoid dropping loop body profile counter to 0 because of zero
count in loop's preheader. */
if (freq_h.nonzero_p () && !(freq_e == profile_count::zero ()))
freq_e = freq_e.force_nonzero ();
scale_loop_frequencies (loop, freq_e.probability_in (freq_h));
}
basic_block rest = new_exit->dest;
new_exit->probability
= (profile_probability::always () / (new_est_niter + 1));
rest->count += new_exit->count ();
edge new_nonexit = single_pred_edge (loop->latch);
profile_probability prob = new_nonexit->probability;
new_nonexit->probability = new_exit->probability.invert ();
prob = new_nonexit->probability / prob;
if (prob.initialized_p ())
scale_bbs_frequencies (&loop->latch, 1, prob);
/* Finally create the new counter for number of iterations and add
the new exit instruction. */
tree ctr_before, ctr_after;
@ -1374,66 +1297,15 @@ tree_transform_and_unroll_loop (class loop *loop, unsigned factor,
gimple_cond_set_rhs (exit_if, exit_bound);
update_stmt (exit_if);
}
else
{
/* gimple_duplicate_loop_to_header_edge has adjusted the loop body's
original profile counts in line with the unroll factor. However,
the old counts might not have been consistent with the old
iteration count.
Therefore, if the iteration count is known exactly, make sure that the
profile counts of the loop header (and any other blocks that might be
executed in the final iteration) are consistent with the combination
of (a) the incoming profile count and (b) the new iteration count. */
profile_count in_count = loop_preheader_edge (loop)->count ();
profile_count old_header_count = loop->header->count;
if (in_count.nonzero_p ()
&& old_header_count.nonzero_p ()
&& TREE_CODE (desc->niter) == INTEGER_CST)
{
/* The + 1 converts latch counts to iteration counts. */
profile_count new_header_count = in_count * (new_est_niter + 1);
basic_block *body = get_loop_body (loop);
scale_bbs_frequencies_profile_count (body, loop->num_nodes,
new_header_count,
old_header_count);
free (body);
}
/* gimple_duplicate_loop_to_header_edge discarded FACTOR - 1
exit edges and adjusted the loop body's profile counts for the
new probabilities of the remaining non-exit edges. However,
the remaining exit edge still has the same probability as it
did before, even though it is now more likely.
Therefore, all blocks executed after a failed exit test now have
a profile count that is too high, and the sum of the profile counts
for the header's incoming edges is greater than the profile count
of the header itself.
Adjust the profile counts of all code in the loop body after
the exit test so that the sum of the counts on entry to the
header agree. */
profile_count old_latch_count = loop_latch_edge (loop)->count ();
profile_count new_latch_count = loop->header->count - in_count;
if (old_latch_count.nonzero_p () && new_latch_count.nonzero_p ())
scale_dominated_blocks_in_loop (loop, new_exit->src, new_latch_count,
old_latch_count);
/* Set the probability of the exit edge based on NEW_EST_NITER
(which estimates latch counts rather than iteration counts).
Update the probabilities of other edges to match.
If the profile counts are large enough to give the required
precision, the updates above will have made
e->dest->count / e->src->count ~= new e->probability
for every outgoing edge e of NEW_EXIT->src. */
profile_probability new_exit_prob
= profile_probability::always () / (new_est_niter + 1);
change_edge_frequency (new_exit, new_exit_prob);
}
if (loop->any_upper_bound)
loop->nb_iterations_upper_bound = wi::udiv_floor
(loop->nb_iterations_upper_bound + 1, factor) - 1;
if (loop->any_likely_upper_bound)
loop->nb_iterations_likely_upper_bound = wi::udiv_floor
(loop->nb_iterations_likely_upper_bound + 1, factor) - 1;
if (loop->any_estimate)
loop->nb_iterations_estimate = wi::udiv_floor
(loop->nb_iterations_estimate + 1, factor) - 1;
checking_verify_flow_info ();
checking_verify_loop_structure ();