Fix profile update in tree_transform_and_unroll_loop

Fix the profile update in tree_transform_and_unroll_loop, which is used
by predictive commoning.  I started by attempting to fix
gcc.dg/tree-ssa/update-unroll-1.c, which I xfailed last week, but it turned
out to be a harder job.

Unrolling was never updated for the changes in duplicate_loop_body_to_header_edge,
which is now smarter about getting the profile right when some exits are eliminated.
A lot of the manual profile updating can thus now be done using the existing
infrastructure.

I also noticed that scale_dominated_blocks_in_loop does a job identical
to a loop I wrote in scale_loop_profile, so I commonized the implementation
and removed the recursion.

I also extended duplicate_loop_body_to_header_edge to handle flat profiles the
same way as we do in the vectorizer.  Without it we end up with a less-than-0
iteration count in gcc.dg/tree-ssa/update-unroll-1.c (it is unrolled 32 times
but predicted to iterate fewer times), and added missing code to update loop_info.

gcc/ChangeLog:

	* cfgloopmanip.cc (scale_dominated_blocks_in_loop): Move here from
	tree-ssa-loop-manip.cc and avoid recursion.
	(scale_loop_profile): Use scale_dominated_blocks_in_loop.
	(duplicate_loop_body_to_header_edge): Add DLTHE_FLAG_FLAT_PROFILE
	flag.
	* cfgloopmanip.h (DLTHE_FLAG_FLAT_PROFILE): Define.
	(scale_dominated_blocks_in_loop): Declare.
	* predict.cc (dump_prediction): Do not ICE on uninitialized probability.
	(change_edge_frequency): Remove.
	* predict.h (change_edge_frequency): Remove.
	* tree-ssa-loop-manip.cc (scale_dominated_blocks_in_loop): Move to
	cfgloopmanip.cc.
	(niter_for_unrolled_loop): Remove.
	(tree_transform_and_unroll_loop): Fix profile update.

gcc/testsuite/ChangeLog:

	* gcc.dg/pr102385.c: Check for no profile mismatches.
	* gcc.dg/pr96931.c: Check for no profile mismatches.
	* gcc.dg/tree-ssa/predcom-1.c: Check for no profile mismatches.
	* gcc.dg/tree-ssa/predcom-2.c: Check for no profile mismatches.
	* gcc.dg/tree-ssa/predcom-3.c: Check for no profile mismatches.
	* gcc.dg/tree-ssa/predcom-4.c: Check for no profile mismatches.
	* gcc.dg/tree-ssa/predcom-5.c: Check for no profile mismatches.
	* gcc.dg/tree-ssa/predcom-7.c: Check for one profile mismatch.
	* gcc.dg/tree-ssa/predcom-8.c: Check for no profile mismatches.
	* gcc.dg/tree-ssa/predcom-dse-1.c: Check for no profile mismatches.
	* gcc.dg/tree-ssa/predcom-dse-10.c: Check for no profile mismatches.
	* gcc.dg/tree-ssa/predcom-dse-11.c: Check for no profile mismatches.
	* gcc.dg/tree-ssa/predcom-dse-12.c: Check for no profile mismatches.
	* gcc.dg/tree-ssa/predcom-dse-2.c: Check for no profile mismatches.
	* gcc.dg/tree-ssa/predcom-dse-3.c: Check for no profile mismatches.
	* gcc.dg/tree-ssa/predcom-dse-4.c: Check for no profile mismatches.
	* gcc.dg/tree-ssa/predcom-dse-5.c: Check for no profile mismatches.
	* gcc.dg/tree-ssa/predcom-dse-6.c: Check for no profile mismatches.
	* gcc.dg/tree-ssa/predcom-dse-7.c: Check for no profile mismatches.
	* gcc.dg/tree-ssa/predcom-dse-8.c: Check for no profile mismatches.
	* gcc.dg/tree-ssa/predcom-dse-9.c: Check for no profile mismatches.
	* gcc.dg/tree-ssa/update-unroll-1.c: Unxfail.
This commit is contained in:
Jan Hubicka 2023-07-27 16:17:59 +02:00
parent 5b06b3b877
commit 9bd1ee6d33
27 changed files with 158 additions and 275 deletions

View file

@ -499,6 +499,32 @@ scale_loop_frequencies (class loop *loop, profile_probability p)
free (bbs);
}
/* Scales the frequencies of all basic blocks in LOOP that are strictly
dominated by BB by NUM/DEN. */
void
scale_dominated_blocks_in_loop (class loop *loop, basic_block bb,
profile_count num, profile_count den)
{
basic_block son;
if (!den.nonzero_p () && !(num == profile_count::zero ()))
return;
auto_vec <basic_block, 8> worklist;
worklist.safe_push (bb);
while (!worklist.is_empty ())
for (son = first_dom_son (CDI_DOMINATORS, worklist.pop ());
son;
son = next_dom_son (CDI_DOMINATORS, son))
{
if (!flow_bb_inside_loop_p (loop, son))
continue;
son->count = son->count.apply_scale (num, den);
worklist.safe_push (son);
}
}
/* Scale profile in LOOP by P.
If ITERATION_BOUND is not -1, scale even further if loop is predicted
to iterate too many times.
@ -649,19 +675,9 @@ scale_loop_profile (class loop *loop, profile_probability p,
if (other_edge && other_edge->dest == loop->latch)
loop->latch->count -= new_exit_count - old_exit_count;
else
{
basic_block *body = get_loop_body (loop);
profile_count new_count = exit_edge->src->count - new_exit_count;
profile_count old_count = exit_edge->src->count - old_exit_count;
for (unsigned int i = 0; i < loop->num_nodes; i++)
if (body[i] != exit_edge->src
&& dominated_by_p (CDI_DOMINATORS, body[i], exit_edge->src))
body[i]->count = body[i]->count.apply_scale (new_count,
old_count);
free (body);
}
scale_dominated_blocks_in_loop (loop, exit_edge->src,
exit_edge->src->count - new_exit_count,
exit_edge->src->count - old_exit_count);
}
else if (dump_file && (dump_flags & TDF_DETAILS))
{
@ -1237,6 +1253,7 @@ duplicate_loop_body_to_header_edge (class loop *loop, edge e,
should've managed the flags so all except for original loop
has won't exist set. */
scale_act = wanted_count.probability_in (count_in);
/* Now simulate the duplication adjustments and compute header
frequency of the last copy. */
for (i = 0; i < ndupl; i++)
@ -1252,16 +1269,21 @@ duplicate_loop_body_to_header_edge (class loop *loop, edge e,
profile_probability prob_pass_main = bitmap_bit_p (wont_exit, 0)
? prob_pass_wont_exit
: prob_pass_thru;
profile_probability p = prob_pass_main;
profile_count scale_main_den = count_in;
for (i = 0; i < ndupl; i++)
if (!(flags & DLTHE_FLAG_FLAT_PROFILE))
{
scale_main_den += count_in.apply_probability (p);
p = p * scale_step[i];
profile_probability p = prob_pass_main;
profile_count scale_main_den = count_in;
for (i = 0; i < ndupl; i++)
{
scale_main_den += count_in.apply_probability (p);
p = p * scale_step[i];
}
/* If original loop is executed COUNT_IN times, the unrolled
loop will account SCALE_MAIN_DEN times. */
scale_main = count_in.probability_in (scale_main_den);
}
/* If original loop is executed COUNT_IN times, the unrolled
loop will account SCALE_MAIN_DEN times. */
scale_main = count_in.probability_in (scale_main_den);
else
scale_main = profile_probability::always ();
scale_act = scale_main * prob_pass_main;
}
else

View file

@ -32,6 +32,8 @@ enum
field of newly create BB. */
#define DLTHE_FLAG_COMPLETTE_PEEL 4 /* Update frequencies expecting
a complete peeling. */
#define DLTHE_FLAG_FLAT_PROFILE 8 /* Profile is flat; do not reduce
count by unroll factor. */
extern edge mfb_kj_edge;
extern bool remove_path (edge, bool * = NULL, bitmap = NULL);
@ -64,5 +66,7 @@ class loop * loop_version (class loop *, void *,
profile_probability, profile_probability,
profile_probability, profile_probability, bool);
void adjust_loop_info_after_peeling (class loop *loop, int npeel, bool precise);
void scale_dominated_blocks_in_loop (class loop *loop, basic_block bb,
profile_count num, profile_count den);
#endif /* GCC_CFGLOOPMANIP_H */

View file

@ -790,7 +790,7 @@ dump_prediction (FILE *file, enum br_predictor predictor, int probability,
{
fprintf (file, " exec ");
bb->count.dump (file);
if (e)
if (e && e->count ().initialized_p () && bb->count.to_gcov_type ())
{
fprintf (file, " hit ");
e->count ().dump (file);
@ -4634,43 +4634,6 @@ force_edge_cold (edge e, bool impossible)
}
}
/* Change E's probability to NEW_E_PROB, redistributing the probabilities
of other outgoing edges proportionally.
Note that this function does not change the profile counts of any
basic blocks. The caller must do that instead, using whatever
information it has about the region that needs updating. */
void
change_edge_frequency (edge e, profile_probability new_e_prob)
{
profile_probability old_e_prob = e->probability;
/* Probability mass previously shared by all successors other than E.  */
profile_probability old_other_prob = old_e_prob.invert ();
/* Probability mass the other successors must share afterwards.  */
profile_probability new_other_prob = new_e_prob.invert ();
e->probability = new_e_prob;
/* Running sum of the probabilities assigned so far; used so the last
edge can be given exactly the remaining mass.  */
profile_probability cumulative_prob = new_e_prob;
/* Number of outgoing edges other than E still left to update.  */
unsigned int num_other = EDGE_COUNT (e->src->succs) - 1;
edge other_e;
edge_iterator ei;
FOR_EACH_EDGE (other_e, ei, e->src->succs)
if (other_e != e)
{
num_other -= 1;
if (num_other == 0)
/* Ensure that the probabilities add up to 1 without
rounding error. */
other_e->probability = cumulative_prob.invert ();
else
{
/* Scale by new_other_prob / old_other_prob so the redistribution
is proportional to the old weights.  */
other_e->probability /= old_other_prob;
other_e->probability *= new_other_prob;
cumulative_prob += other_e->probability;
}
}
}
#if CHECKING_P
namespace selftest {

View file

@ -100,7 +100,6 @@ extern void rebuild_frequencies (void);
extern void report_predictor_hitrates (void);
extern void force_edge_cold (edge, bool);
extern void propagate_unlikely_bbs_forward (void);
extern void change_edge_frequency (edge, profile_probability);
extern void add_reg_br_prob_note (rtx_insn *, profile_probability);

View file

@ -1,4 +1,4 @@
/* { dg-options "-Wall -Wextra -O2 -fno-toplevel-reorder -fno-tree-ch -fno-tree-dce -fno-tree-dominator-opts -fno-tree-dse -fno-tree-loop-ivcanon -fpredictive-commoning" } */
/* { dg-options "-Wall -Wextra -O2 -fno-toplevel-reorder -fno-tree-ch -fno-tree-dce -fno-tree-dominator-opts -fno-tree-dse -fno-tree-loop-ivcanon -fpredictive-commoning -fdump-tree-pcom-details-blocks -fdump-tree-lim-details-blocks" } */
short a, b;
int c[9];
@ -12,3 +12,5 @@ void e() {
}
}
int main() {return 0;}
/* { dg-final { scan-tree-dump-not "Invalid sum" "pcom" } } */
/* { dg-final { scan-tree-dump-not "Invalid sum" "lim2" } } */

View file

@ -1,5 +1,5 @@
/* { dg-do compile } */
/* { dg-options "-O1 -fpredictive-commoning -fno-tree-loop-im" } */
/* { dg-options "-O1 -fpredictive-commoning -fno-tree-loop-im -fdump-tree-pcom-details-blocks" } */
int bl;
@ -17,3 +17,4 @@ ie (void)
ie ();
}
/* { dg-final { scan-tree-dump-not "Invalid sum" "pcom" } } */

View file

@ -1,6 +1,6 @@
/* { dg-do compile } */
/* { dg-do run } */
/* { dg-options "-O2 -fno-tree-vectorize -funroll-loops --param max-unroll-times=8 -fpredictive-commoning -fdump-tree-pcom-details" } */
/* { dg-options "-O2 -fno-tree-vectorize -funroll-loops --param max-unroll-times=8 -fpredictive-commoning -fdump-tree-pcom-details-blocks" } */
void abort (void);
@ -47,3 +47,4 @@ int main(void)
/* Also check that we undid the transformation previously made by PRE. */
/* { dg-final { scan-tree-dump-times "looparound ref" 1 "pcom" } } */
/* { dg-final { scan-tree-dump-not "Invalid sum" "pcom" } } */

View file

@ -1,5 +1,5 @@
/* { dg-do run } */
/* { dg-options "-O2 -funroll-loops --param max-unroll-times=8 -fpredictive-commoning -fdump-tree-pcom-details -fno-tree-pre" } */
/* { dg-options "-O2 -funroll-loops --param max-unroll-times=8 -fpredictive-commoning -fdump-tree-pcom-details-blocks -fno-tree-pre" } */
/* { dg-additional-options "-fno-tree-vectorize" { target amdgcn-*-* } } */
void abort (void);
@ -44,3 +44,4 @@ int main(void)
/* Verify that both loops were transformed and unrolled. */
/* { dg-final { scan-tree-dump-times "Unrolling 2 times." 2 "pcom"} } */
/* { dg-final { scan-tree-dump-not "Invalid sum" "pcom" } } */

View file

@ -1,5 +1,5 @@
/* { dg-do compile } */
/* { dg-options "-O2 -funroll-loops --param max-unroll-times=8 -fpredictive-commoning -fdump-tree-pcom-details -fno-tree-pre -fno-tree-loop-vectorize" } */
/* { dg-options "-O2 -funroll-loops --param max-unroll-times=8 -fpredictive-commoning -fdump-tree-pcom-details-blocks -fno-tree-pre -fno-tree-loop-vectorize" } */
int a[1000], b[1000];
@ -13,3 +13,4 @@ void test(void)
/* Verify that we used 3 temporary variables for the loop. */
/* { dg-final { scan-tree-dump-times "Unrolling 3 times." 1 "pcom"} } */
/* { dg-final { scan-tree-dump-not "Invalid sum" "pcom" } } */

View file

@ -1,5 +1,5 @@
/* { dg-do run } */
/* { dg-options "-O2 -funroll-loops --param max-unroll-times=8 -fpredictive-commoning -fdump-tree-pcom-details" } */
/* { dg-options "-O2 -funroll-loops --param max-unroll-times=8 -fpredictive-commoning -fdump-tree-pcom-details-blocks" } */
/* Test for predictive commoning of expressions, without reassociation. */
@ -26,3 +26,4 @@ int main(void)
/* { dg-final { scan-tree-dump-times "Combination" 1 "pcom"} } */
/* { dg-final { scan-tree-dump-times "Unrolling 3 times." 1 "pcom"} } */
/* { dg-final { scan-tree-dump-not "Invalid sum" "pcom" } } */

View file

@ -1,5 +1,5 @@
/* { dg-do run } */
/* { dg-options "-O2 -funroll-loops --param max-unroll-times=8 -fpredictive-commoning -fdump-tree-pcom-details" } */
/* { dg-options "-O2 -funroll-loops --param max-unroll-times=8 -fpredictive-commoning -fdump-tree-pcom-details-blocks" } */
/* Test for predictive commoning of expressions, with reassociation. */
@ -26,3 +26,4 @@ int main(void)
/* { dg-final { scan-tree-dump-times "Combination" 2 "pcom"} } */
/* { dg-final { scan-tree-dump-times "Unrolling 3 times." 1 "pcom"} } */
/* { dg-final { scan-tree-dump-not "Invalid sum" "pcom" } } */

View file

@ -1,5 +1,5 @@
/* { dg-do run } */
/* { dg-options "-O3 -fdump-tree-pcom-details" } */
/* { dg-options "-O3 -fdump-tree-pcom-details-blocks" } */
int b, f, d[5][2];
unsigned int c;
@ -15,3 +15,7 @@ main ()
}
/* { dg-final { scan-tree-dump "Executing predictive commoning" "pcom" } } */
/* dom pass introduces one mismatch after simplfying mispredicted conditional
on c being non-zero on first iteration. This happens since c is global variable
and needs alias analysis. */
/* { dg-final { scan-tree-dump-times "Invalid sum" 1 "pcom" } } */

View file

@ -1,5 +1,5 @@
/* { dg-do compile } */
/* { dg-options "-O3 -fdump-tree-pcom-details" } */
/* { dg-options "-O3 -fdump-tree-pcom-details-blocks" } */
int is_sorted(int *a, int n)
{
@ -10,3 +10,4 @@ int is_sorted(int *a, int n)
}
/* { dg-final { scan-tree-dump "Executing predictive commoning without unrolling" "pcom" } } */
/* { dg-final { scan-tree-dump-not "Invalid sum" "pcom" } } */

View file

@ -1,5 +1,5 @@
/* { dg-do run } */
/* { dg-options "-O2 -fno-inline -fno-tree-loop-distribute-patterns -fpredictive-commoning -fdump-tree-pcom-details" } */
/* { dg-options "-O2 -fno-inline -fno-tree-loop-distribute-patterns -fpredictive-commoning -fdump-tree-pcom-details-blocks" } */
int arr[105] = {2, 3, 5, 7, 11};
int result0[10] = {2, 3, 5, 7, 11};
@ -60,3 +60,4 @@ int main (void)
return 0;
}
/* { dg-final { scan-tree-dump "Store-stores chain" "pcom"} } */
/* { dg-final { scan-tree-dump-not "Invalid sum" "pcom" } } */

View file

@ -1,5 +1,5 @@
/* { dg-do run } */
/* { dg-options "-O2 -fno-inline -fpredictive-commoning -fdump-tree-pcom-details" } */
/* { dg-options "-O2 -fno-inline -fpredictive-commoning -fdump-tree-pcom-details-blocks" } */
int arr[105] = {2, 3, 5, 7, 11};
int result0[10] = {2, 3, 5, 7, 11};
@ -41,4 +41,4 @@ int main (void)
return 0;
}
/* { dg-final { scan-tree-dump-not "Store-stores chain" "pcom"} } */
/* { dg-final { scan-tree-dump-not "Invalid sum" "pcom" } } */

View file

@ -1,5 +1,5 @@
/* { dg-do run } */
/* { dg-options "-O2 -fno-inline -fpredictive-commoning -fdump-tree-pcom-details" } */
/* { dg-options "-O2 -fno-inline -fpredictive-commoning -fdump-tree-pcom-details-blocks" } */
int arr[105] = {2, 3, 5, 7, 11};
int x[105] = {2, 3, 5, 7, 11};
@ -48,4 +48,5 @@ int main (void)
}
/* { dg-final { scan-tree-dump "Store-stores chain" "pcom"} } */
/* { dg-final { scan-tree-dump "Store-loads chain" "pcom"} } */
/* { dg-final { scan-tree-dump-not "Invalid sum" "pcom" } } */

View file

@ -1,5 +1,5 @@
/* { dg-do run } */
/* { dg-options "-O2 -fno-inline -fpredictive-commoning -fdump-tree-pcom-details" } */
/* { dg-options "-O2 -fno-inline -fpredictive-commoning -fdump-tree-pcom-details-blocks" } */
int arr[105] = {2, 3, 5, 7, 11};
int result0[10] = {2, 3, 5, 7, 11};
@ -65,3 +65,4 @@ int main (void)
return 0;
}
/* { dg-final { scan-tree-dump "Store-stores chain" "pcom"} } */
/* { dg-final { scan-tree-dump-not "Invalid sum" "pcom" } } */

View file

@ -1,5 +1,5 @@
/* { dg-do run } */
/* { dg-options "-O2 -fno-inline -fno-tree-loop-distribute-patterns -fpredictive-commoning -fdump-tree-pcom-details" } */
/* { dg-options "-O2 -fno-inline -fno-tree-loop-distribute-patterns -fpredictive-commoning -fdump-tree-pcom-details-blocks" } */
int arr[105] = {2, 3, 5, 7, 11};
int result0[10] = {2, 3, 5, 7, 11};
@ -60,3 +60,4 @@ int main (void)
return 0;
}
/* { dg-final { scan-tree-dump "Store-stores chain" "pcom"} } */
/* { dg-final { scan-tree-dump-not "Invalid sum" "pcom" } } */

View file

@ -1,5 +1,5 @@
/* { dg-do run } */
/* { dg-options "-O2 -fno-tree-vectorize -fno-inline -fno-tree-loop-distribute-patterns -fpredictive-commoning -fdump-tree-pcom-details" } */
/* { dg-options "-O2 -fno-tree-vectorize -fno-inline -fno-tree-loop-distribute-patterns -fpredictive-commoning -fdump-tree-pcom-details-blocks" } */
int arr1[105] = {2, 3, 5, 7, 11, 13, 0};
int arr2[105] = {2, 3, 5, 7, 11, 13, 0};
@ -106,3 +106,4 @@ int main (void)
return 0;
}
/* { dg-final { scan-tree-dump-times "Store-stores chain" 4 "pcom"} } */
/* { dg-final { scan-tree-dump-not "Invalid sum" "pcom" } } */

View file

@ -1,5 +1,5 @@
/* { dg-do run } */
/* { dg-options "-O2 -fno-inline -fno-tree-loop-distribute-patterns -fpredictive-commoning -fdump-tree-pcom-details" } */
/* { dg-options "-O2 -fno-inline -fno-tree-loop-distribute-patterns -fpredictive-commoning -fdump-tree-pcom-details-blocks" } */
int arr[105] = {2, 3, 5, 7, 11};
int result0[10] = {2, 3, 5, 7, 11};
@ -59,3 +59,4 @@ int main (void)
return 0;
}
/* { dg-final { scan-tree-dump "Store-stores chain" "pcom"} } */
/* { dg-final { scan-tree-dump-not "Invalid sum" "pcom" } } */

View file

@ -1,5 +1,5 @@
/* { dg-do run } */
/* { dg-options "-O2 -fno-inline -fpredictive-commoning -fdump-tree-pcom-details" } */
/* { dg-options "-O2 -fno-inline -fpredictive-commoning -fdump-tree-pcom-details-blocks" } */
int arr[105] = {2, 3, 5, 7, 11};
int result0[10] = {2, 3, 5, 7, 11};
@ -61,3 +61,4 @@ int main (void)
return 0;
}
/* { dg-final { scan-tree-dump "Store-stores chain" "pcom"} } */
/* { dg-final { scan-tree-dump-not "Invalid sum" "pcom" } } */

View file

@ -1,5 +1,5 @@
/* { dg-do run } */
/* { dg-options "-O2 -fno-inline -fpredictive-commoning -fdump-tree-pcom-details" } */
/* { dg-options "-O2 -fno-inline -fpredictive-commoning -fdump-tree-pcom-details-blocks" } */
int arr[105] = {2, 3, 5, 7, 11, 13, 17, 19};
int result0[10] = {2, 3, 5, 7, 11, 13, 17, 19};
@ -63,3 +63,4 @@ int main (void)
return 0;
}
/* { dg-final { scan-tree-dump "Store-stores chain" "pcom"} } */
/* { dg-final { scan-tree-dump-not "Invalid sum" "pcom" } } */

View file

@ -1,5 +1,5 @@
/* { dg-do run } */
/* { dg-options "-O2 -fno-inline -fpredictive-commoning -fdump-tree-pcom-details" } */
/* { dg-options "-O2 -fno-inline -fpredictive-commoning -fdump-tree-pcom-details-blocks" } */
int arr[105] = {2, 3, 5, 7, 11, 13, 17, 19};
int result0[10] = {2, 3, 5, 7, 11, 13, 17, 19};
@ -61,3 +61,4 @@ int main (void)
return 0;
}
/* { dg-final { scan-tree-dump "Store-stores chain" "pcom"} } */
/* { dg-final { scan-tree-dump-not "Invalid sum" "pcom" } } */

View file

@ -1,5 +1,5 @@
/* { dg-do run } */
/* { dg-options "-O2 -fno-inline -fpredictive-commoning -fdump-tree-pcom-details" } */
/* { dg-options "-O2 -fno-inline -fpredictive-commoning -fdump-tree-pcom-details-blocks" } */
int arr[105] = {2, 3, 5, 7, 11, 13, 17, 19};
int result0[10] = {2, 3, 5, 7, 11, 13, 17, 19};
@ -58,3 +58,4 @@ int main (void)
return 0;
}
/* { dg-final { scan-tree-dump-not "Invalid sum" "pcom" } } */

View file

@ -1,5 +1,5 @@
/* { dg-do run } */
/* { dg-options "-O2 -fno-inline -fpredictive-commoning -fdump-tree-pcom-details" } */
/* { dg-options "-O2 -fno-inline -fpredictive-commoning -fdump-tree-pcom-details-blocks" } */
int arr1[105] = {2, 3, 5, 7, 11, 13, 17, 19};
int arr2[105] = {2, 3, 5, 7, 11, 13, 17, 19};
@ -88,3 +88,4 @@ int main (void)
return 0;
}
/* { dg-final { scan-tree-dump "Store-stores chain" "pcom"} } */
/* { dg-final { scan-tree-dump-not "Invalid sum" "pcom" } } */

View file

@ -16,5 +16,4 @@ int foo(unsigned n)
/* We used to make the probability that the body of the loop (unrolled
to enable prefetching) is entered 0, which is not correct. */
/* { dg-final { scan-tree-dump-not "Invalid sum" "aprefetch" { xfail *-*-* }} } */
/* { dg-final { scan-tree-dump-not "SUCC: 7 .100.0%" "aprefetch"} } */

View file

@ -1040,71 +1040,6 @@ determine_exit_conditions (class loop *loop, class tree_niter_desc *desc,
*exit_bound = bound;
}
/* Scales the frequencies of all basic blocks in LOOP that are strictly
dominated by BB by NUM/DEN. */
static void
scale_dominated_blocks_in_loop (class loop *loop, basic_block bb,
profile_count num, profile_count den)
{
basic_block son;
/* A zero denominator is only meaningful when the numerator is zero
too; otherwise leave the counts untouched.  */
if (!den.nonzero_p () && !(num == profile_count::zero ()))
return;
/* Recurse over the dominator subtree of BB; BB itself is not scaled,
only the blocks strictly dominated by it.  */
for (son = first_dom_son (CDI_DOMINATORS, bb);
son;
son = next_dom_son (CDI_DOMINATORS, son))
{
/* Dominated blocks outside LOOP (and their subtrees) are skipped.  */
if (!flow_bb_inside_loop_p (loop, son))
continue;
scale_bbs_frequencies_profile_count (&son, 1, num, den);
scale_dominated_blocks_in_loop (loop, son, num, den);
}
}
/* Return the estimated number of latch executions of LOOP after
   unrolling it FACTOR times, or -1 if no estimate is available.  */
gcov_type
niter_for_unrolled_loop (class loop *loop, unsigned factor)
{
  gcc_assert (factor != 0);
  bool profile_p = false;
  gcov_type est_niter = expected_loop_iterations_unbounded (loop, &profile_p);
  /* No estimate at all: bail out before doing any arithmetic on the
     sentinel value.  */
  if (est_niter == -1)
    return -1;
  /* Note that this is really CEIL (est_niter + 1, factor) - 1, where the
     "+ 1" converts latch iterations to loop iterations and the "- 1"
     converts back.  */
  gcov_type new_est_niter = est_niter / factor;
  /* Without profile feedback, loops for which we do not know a better estimate
     are assumed to roll 10 times.  When we unroll such loop, it appears to
     roll too little, and it may even seem to be cold.  To avoid this, we
     ensure that the created loop appears to roll at least 5 times (but at
     most as many times as before unrolling).  Don't do adjustment if profile
     feedback is present.  */
  if (new_est_niter < 5 && !profile_p)
    {
      if (est_niter < 5)
	new_est_niter = est_niter;
      else
	new_est_niter = 5;
    }
  /* Never estimate more iterations than the recorded upper bound allows.  */
  if (loop->any_upper_bound)
    {
      /* As above, this is really CEIL (upper_bound + 1, factor) - 1.  */
      widest_int bound = wi::udiv_floor (loop->nb_iterations_upper_bound,
					 factor);
      if (wi::ltu_p (bound, new_est_niter))
	new_est_niter = bound.to_uhwi ();
    }
  return new_est_niter;
}
/* Unroll LOOP FACTOR times. LOOP is known to have a single exit edge
whose source block dominates the latch. DESC describes the number of
iterations of LOOP.
@ -1169,47 +1104,39 @@ tree_transform_and_unroll_loop (class loop *loop, unsigned factor,
transform_callback transform,
void *data)
{
gcov_type new_est_niter = niter_for_unrolled_loop (loop, factor);
unsigned irr = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP;
enum tree_code exit_cmp;
tree enter_main_cond, exit_base, exit_step, exit_bound;
bool flat = maybe_flat_loop_profile (loop);
determine_exit_conditions (loop, desc, factor,
&enter_main_cond, &exit_base, &exit_step,
&exit_cmp, &exit_bound);
bool single_loop_p = !exit_base;
/* Let us assume that the unrolled loop is quite likely to be entered. */
profile_probability prob_entry;
if (integer_nonzerop (enter_main_cond))
prob_entry = profile_probability::always ();
else
prob_entry = profile_probability::guessed_always ()
.apply_scale (PROB_UNROLLED_LOOP_ENTERED, 100);
gcond *exit_if = nullptr;
class loop *new_loop = nullptr;
edge new_exit;
if (!single_loop_p)
{
edge exit = single_dom_exit (loop);
profile_count entry_count = loop_preheader_edge (loop)->src->count;
/* Let us assume that the unrolled loop is quite likely to be entered. */
profile_probability prob_entry;
if (integer_nonzerop (enter_main_cond))
prob_entry = profile_probability::always ();
else
prob_entry = profile_probability::guessed_always ()
.apply_scale (PROB_UNROLLED_LOOP_ENTERED, 100);
/* The values for scales should keep profile consistent, and somewhat
close to correct.
TODO: The current value of SCALE_REST makes it appear that the loop
that is created by splitting the remaining iterations of the unrolled
loop is executed the same number of times as the original loop, and
with the same frequencies, which is obviously wrong. This does not
appear to cause problems, so we do not bother with fixing it for now.
To make the profile correct, we would need to change the probability
of the exit edge of the loop, and recompute the distribution of
frequencies in its body because of this change (scale the frequencies
of blocks before and after the exit by appropriate factors). */
profile_probability scale_unrolled = prob_entry;
close to correct. */
new_loop = loop_version (loop, enter_main_cond, NULL, prob_entry,
prob_entry.invert (), scale_unrolled,
profile_probability::guessed_always (),
prob_entry.invert (),
prob_entry,
/* We will later redirect exit from vectorized
loop to new_loop. */
profile_probability::always (),
true);
gcc_assert (new_loop != NULL);
update_ssa (TODO_update_ssa_no_phi);
@ -1220,18 +1147,16 @@ tree_transform_and_unroll_loop (class loop *loop, unsigned factor,
edge precond_edge = single_pred_edge (rest);
split_edge (loop_latch_edge (loop));
basic_block exit_bb = single_pred (loop->latch);
edge exit = single_dom_exit (loop);
/* Since the exit edge will be removed, the frequency of all the blocks
in the loop that are dominated by it must be scaled by
1 / (1 - exit->probability). */
in the loop that are dominated by it must be scaled. */
if (exit->probability.initialized_p ())
scale_dominated_blocks_in_loop (loop, exit->src,
/* We are scaling up here so
probability does not fit. */
loop->header->count,
loop->header->count
- loop->header->count.apply_probability
(exit->probability));
exit->src->count,
exit->src->count - exit->count ());
gimple_stmt_iterator bsi = gsi_last_bb (exit_bb);
exit_if = gimple_build_cond (EQ_EXPR, integer_zero_node,
@ -1243,14 +1168,14 @@ tree_transform_and_unroll_loop (class loop *loop, unsigned factor,
rescan_loop_exit (new_exit, true, false);
/* Set the probability of new exit to the same of the old one. Fix
the frequency of the latch block, by scaling it back by
1 - exit->probability. */
the count of the latch block. */
new_exit->probability = exit->probability;
edge new_nonexit = single_pred_edge (loop->latch);
new_nonexit->probability = exit->probability.invert ();
new_nonexit->flags = EDGE_TRUE_VALUE;
if (new_nonexit->probability.initialized_p ())
scale_bbs_frequencies (&loop->latch, 1, new_nonexit->probability);
set_edge_probability_and_rescale_others
(exit, profile_probability::never ());
loop->latch->count = new_nonexit->count ();
edge old_entry = loop_preheader_edge (loop);
edge new_entry = loop_preheader_edge (new_loop);
@ -1296,12 +1221,21 @@ tree_transform_and_unroll_loop (class loop *loop, unsigned factor,
}
remove_path (exit);
/* We will later redirect exit from vectorized loop to new_loop. */
loop_preheader_edge (new_loop)->src->count = entry_count;
/* The epilog loop latch executes at most factor - 1 times.
Since the epilog is entered unconditionally it will need to handle
up to factor executions of its body. */
new_loop->any_upper_bound = 1;
new_loop->any_upper_bound = true;
new_loop->nb_iterations_upper_bound = factor - 1;
/* We do not really know estimate on number of iterations, since we do not
track any estimates modulo unroll factor.
Drop estimate from loop_info and scale loop profile.
It may be more realistic to scale loop profile to factor / 2 - 1,
but vectorizer also uses factor - 1. */
new_loop->any_estimate = false;
scale_loop_profile (new_loop, profile_probability::always (), factor - 1);
}
else
new_exit = single_dom_exit (loop);
@ -1318,10 +1252,10 @@ tree_transform_and_unroll_loop (class loop *loop, unsigned factor,
auto_vec<edge> to_remove;
bool ok
= gimple_duplicate_loop_body_to_header_edge (loop, loop_latch_edge (loop),
factor - 1, wont_exit,
new_exit, &to_remove,
DLTHE_FLAG_UPDATE_FREQ);
= gimple_duplicate_loop_body_to_header_edge
(loop, loop_latch_edge (loop), factor - 1, wont_exit,
new_exit, &to_remove,
DLTHE_FLAG_UPDATE_FREQ | (flat ? DLTHE_FLAG_FLAT_PROFILE : 0));
gcc_assert (ok);
for (edge e : to_remove)
@ -1332,36 +1266,25 @@ tree_transform_and_unroll_loop (class loop *loop, unsigned factor,
update_ssa (TODO_update_ssa);
new_exit = single_dom_exit (loop);
/* gimple_duplicate_loop_body_to_header_edge depending on
DLTHE_FLAG_UPDATE_FREQ either keeps original frequency of the loop header
or scales it down accordingly.
However exit edge probability is kept as original. Fix it if needed
and compensate. */
profile_probability new_prob
= loop_preheader_edge
(loop)->count ().probability_in (new_exit->src->count);
if (!(new_prob == new_exit->probability))
{
profile_count old_count = new_exit->src->count - new_exit->count ();
set_edge_probability_and_rescale_others (new_exit, new_prob);
profile_count new_count = new_exit->src->count - new_exit->count ();
scale_dominated_blocks_in_loop (loop, new_exit->src,
new_count, old_count);
}
if (!single_loop_p)
{
/* Ensure that the frequencies in the loop match the new estimated
number of iterations, and change the probability of the new
exit edge. */
profile_count freq_h = loop->header->count;
profile_count freq_e = (loop_preheader_edge (loop))->count ();
if (freq_h.nonzero_p ())
{
/* Avoid dropping loop body profile counter to 0 because of zero
count in loop's preheader. */
if (freq_h.nonzero_p () && !(freq_e == profile_count::zero ()))
freq_e = freq_e.force_nonzero ();
scale_loop_frequencies (loop, freq_e.probability_in (freq_h));
}
basic_block rest = new_exit->dest;
new_exit->probability
= (profile_probability::always () / (new_est_niter + 1));
rest->count += new_exit->count ();
edge new_nonexit = single_pred_edge (loop->latch);
profile_probability prob = new_nonexit->probability;
new_nonexit->probability = new_exit->probability.invert ();
prob = new_nonexit->probability / prob;
if (prob.initialized_p ())
scale_bbs_frequencies (&loop->latch, 1, prob);
/* Finally create the new counter for number of iterations and add
the new exit instruction. */
tree ctr_before, ctr_after;
@ -1374,66 +1297,15 @@ tree_transform_and_unroll_loop (class loop *loop, unsigned factor,
gimple_cond_set_rhs (exit_if, exit_bound);
update_stmt (exit_if);
}
else
{
/* gimple_duplicate_loop_to_header_edge has adjusted the loop body's
original profile counts in line with the unroll factor. However,
the old counts might not have been consistent with the old
iteration count.
Therefore, if the iteration count is known exactly, make sure that the
profile counts of the loop header (and any other blocks that might be
executed in the final iteration) are consistent with the combination
of (a) the incoming profile count and (b) the new iteration count. */
profile_count in_count = loop_preheader_edge (loop)->count ();
profile_count old_header_count = loop->header->count;
if (in_count.nonzero_p ()
&& old_header_count.nonzero_p ()
&& TREE_CODE (desc->niter) == INTEGER_CST)
{
/* The + 1 converts latch counts to iteration counts. */
profile_count new_header_count = in_count * (new_est_niter + 1);
basic_block *body = get_loop_body (loop);
scale_bbs_frequencies_profile_count (body, loop->num_nodes,
new_header_count,
old_header_count);
free (body);
}
/* gimple_duplicate_loop_to_header_edge discarded FACTOR - 1
exit edges and adjusted the loop body's profile counts for the
new probabilities of the remaining non-exit edges. However,
the remaining exit edge still has the same probability as it
did before, even though it is now more likely.
Therefore, all blocks executed after a failed exit test now have
a profile count that is too high, and the sum of the profile counts
for the header's incoming edges is greater than the profile count
of the header itself.
Adjust the profile counts of all code in the loop body after
the exit test so that the sum of the counts on entry to the
header agree. */
profile_count old_latch_count = loop_latch_edge (loop)->count ();
profile_count new_latch_count = loop->header->count - in_count;
if (old_latch_count.nonzero_p () && new_latch_count.nonzero_p ())
scale_dominated_blocks_in_loop (loop, new_exit->src, new_latch_count,
old_latch_count);
/* Set the probability of the exit edge based on NEW_EST_NITER
(which estimates latch counts rather than iteration counts).
Update the probabilities of other edges to match.
If the profile counts are large enough to give the required
precision, the updates above will have made
e->dest->count / e->src->count ~= new e->probability
for every outgoing edge e of NEW_EXIT->src. */
profile_probability new_exit_prob
= profile_probability::always () / (new_est_niter + 1);
change_edge_frequency (new_exit, new_exit_prob);
}
if (loop->any_upper_bound)
loop->nb_iterations_upper_bound = wi::udiv_floor
(loop->nb_iterations_upper_bound + 1, factor) - 1;
if (loop->any_likely_upper_bound)
loop->nb_iterations_likely_upper_bound = wi::udiv_floor
(loop->nb_iterations_likely_upper_bound + 1, factor) - 1;
if (loop->any_estimate)
loop->nb_iterations_estimate = wi::udiv_floor
(loop->nb_iterations_estimate + 1, factor) - 1;
checking_verify_flow_info ();
checking_verify_loop_structure ();