Disallow loop rotation and loop header crossing in jump threaders.

There is a lot of fall-out from this patch, as there were many threading tests that assumed the restrictions introduced by this patch were valid. Some tests have merely shifted the threading to after loop optimizations, but others ended up with no threading opportunities at all. Surprisingly some tests ended up with more total threads. It was a crapshoot all around. On a postive note, there are 6 tests that no longer XFAIL, and one guality test which now passes. I felt a bit queasy about such a fundamental change wrt threading, so I ran it through my callgrind test harness (.ii files from a bootstrap). There was no change in overall compilation, DOM, or the VRP threaders. However, there was a slight increase of 1.63% in the backward threader. I'm pretty sure we could reduce this if we incorporated the restrictions into their profitability code. This way we could stop the search when we ran into one of these restrictions. Not sure it's worth it at this point. Tested on x86-64 Linux. Co-authored-by: Richard Biener <rguenther@suse.de> gcc/ChangeLog: * tree-ssa-threadupdate.c (cancel_thread): Dump threading reason on the same line as the threading cancellation. (jt_path_registry::cancel_invalid_paths): Avoid rotating loops. Avoid threading through loop headers where the path remains in the loop. libgomp/ChangeLog: * testsuite/libgomp.graphite/force-parallel-5.c: Remove xfail. gcc/testsuite/ChangeLog: * gcc.dg/Warray-bounds-87.c: Remove xfail. * gcc.dg/analyzer/pr94851-2.c: Remove xfail. * gcc.dg/graphite/pr69728.c: Remove xfail. * gcc.dg/graphite/scop-dsyr2k.c: Remove xfail. * gcc.dg/graphite/scop-dsyrk.c: Remove xfail. * gcc.dg/shrink-wrap-loop.c: Remove xfail. * gcc.dg/loop-8.c: Adjust for new threading restrictions. * gcc.dg/tree-ssa/ifc-20040816-1.c: Same. * gcc.dg/tree-ssa/pr21559.c: Same. * gcc.dg/tree-ssa/pr59597.c: Same. * gcc.dg/tree-ssa/pr71437.c: Same. * gcc.dg/tree-ssa/pr77445-2.c: Same. * gcc.dg/tree-ssa/ssa-dom-thread-4.c: Same. * gcc.dg/tree-ssa/ssa-dom-thread-7.c: Same. * gcc.dg/vect/bb-slp-16.c: Same. * gcc.dg/tree-ssa/ssa-dom-thread-6.c: Remove. * gcc.dg/tree-ssa/ssa-dom-thread-18.c: Remove. * gcc.dg/tree-ssa/ssa-dom-thread-2a.c: Remove. * gcc.dg/tree-ssa/ssa-thread-invalid.c: New test.
2021-10-04 09:47:02 +02:00 · 2021-10-04 09:47:02 +02:00 · d8edfadfc7
commit d8edfadfc7
parent f36240f8c8
21 changed files with 207 additions and 219 deletions
--- a/gcc/testsuite/gcc.dg/Warray-bounds-87.c
+++ b/gcc/testsuite/gcc.dg/Warray-bounds-87.c
@ -33,7 +33,7 @@ static unsigned int h (int i, int j)
    case 9:
      return j;
    case 10:
-      return a[i];  // { dg-bogus "-Warray-bounds" "pr101671" { xfail *-*-* } }
+      return a[i];  // { dg-bogus "-Warray-bounds" "pr101671" }
    }
  return 0;
 }
--- a/gcc/testsuite/gcc.dg/analyzer/pr94851-2.c
+++ b/gcc/testsuite/gcc.dg/analyzer/pr94851-2.c
@ -45,7 +45,7 @@ int pamark(void) {
    if (curbp->b_amark == (AMARK *)NULL)
      curbp->b_amark = p;
    else
-      last->m_next = p; /* { dg-warning "dereference of NULL 'last'" "deref" { xfail *-*-* } } */
+      last->m_next = p; /* { dg-warning "dereference of NULL 'last'" "deref" } */
  }

  p->m_name = (char)c; /* { dg-bogus "leak of 'p'" "bogus leak" } */
--- a/gcc/testsuite/gcc.dg/graphite/pr69728.c
+++ b/gcc/testsuite/gcc.dg/graphite/pr69728.c
@ -24,6 +24,4 @@ fn1 ()
   run into scheduling issues before here, not being able to handle
   empty domains.  */

-/* XFAILed by fix for PR86865.  */
-
-/* { dg-final { scan-tree-dump "loop nest optimized" "graphite" { xfail *-*-* } } }  */
+/* { dg-final { scan-tree-dump "loop nest optimized" "graphite" } } */
--- a/gcc/testsuite/gcc.dg/graphite/scop-dsyr2k.c
+++ b/gcc/testsuite/gcc.dg/graphite/scop-dsyr2k.c
@ -17,4 +17,4 @@ void dsyr2k(int N) {
 #pragma endscop
 }

-/* { dg-final { scan-tree-dump-times "number of SCoPs: 1" 1 "graphite" { xfail *-*-* } } } */ 
+/* { dg-final { scan-tree-dump-times "number of SCoPs: 1" 1 "graphite" } } */
--- a/gcc/testsuite/gcc.dg/graphite/scop-dsyrk.c
+++ b/gcc/testsuite/gcc.dg/graphite/scop-dsyrk.c
@ -19,4 +19,4 @@ void dsyrk(int N)
 #pragma endscop
 }

-/* { dg-final { scan-tree-dump-times "number of SCoPs: 1" 1 "graphite" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump-times "number of SCoPs: 1" 1 "graphite" } } */
--- a/gcc/testsuite/gcc.dg/loop-8.c
+++ b/gcc/testsuite/gcc.dg/loop-8.c
@ -11,18 +11,23 @@ f (int *a, int *b)
 {
  int i;

-  for (i = 0; i < 100; i++)
+  i = 100;
+  if (i > 0)
    {
-      int d = 42;
+      do
+	{
+	  int d = 42;

-      a[i] = d;
-      if (i % 2)
-	d = i;
-      b[i] = d;
+	  a[i] = d;
+	  if (i % 2)
+	    d = i;
+	  b[i] = d;
+	  ++i;
+	}
+      while (i < 100);
    }
 }

 /* Load of 42 is moved out of the loop, introducing a new pseudo register.  */
-/* { dg-final { scan-rtl-dump-times "Decided" 1 "loop2_invariant" } } */
 /* { dg-final { scan-rtl-dump-not "without introducing a new temporary register" "loop2_invariant" } } */

--- a/gcc/testsuite/gcc.dg/shrink-wrap-loop.c
+++ b/gcc/testsuite/gcc.dg/shrink-wrap-loop.c
@ -1,58 +1,6 @@
 /* { dg-do compile { target { { { i?86-*-* x86_64-*-* } && lp64 } || { arm_thumb2 } } } } */
 /* { dg-options "-O2 -fdump-rtl-pro_and_epilogue"  } */

-/*
-Our new threader is threading things a bit too early, and causing the
-testcase in gcc.dg/shrink-wrap-loop.c to fail.
-
-  The gist is this BB inside a loop:
-
-  <bb 6> :
-  # p_2 = PHI <p2_6(D)(2), p_12(5)>
-  if (p_2 != 0B)
-    goto <bb 3>; [INV]
-  else
-    goto <bb 7>; [INV]
-
-Our threader can move this check outside of the loop (good).  This is
-done before branch probabilities are calculated and causes the probs
-to be calculated as:
-
-<bb 2> [local count: 216361238]:
-  if (p2_6(D) != 0B)
-    goto <bb 7>; [54.59%]
-  else
-    goto <bb 6>; [45.41%]
-
-Logically this seems correct to me.  A simple check outside of a loop
-should slightly but not overwhelmingly favor a non-zero value.
-
-Interestingly however, the old threader couldn't get this, but the IL
-ended up identical, albeit with different probabilities.  What happens
-is that, because the old code could not thread this, the p2 != 0 check
-would remain inside the loop and probs would be calculated thusly:
-
-  <bb 6> [local count: 1073741824]:
-  # p_2 = PHI <p2_6(D)(2), p_12(5)>
-  if (p_2 != 0B)
-    goto <bb 3>; [94.50%]
-  else
-    goto <bb 7>; [5.50%]
-
-Then when the loop header copying pass ("ch") shuffled things around,
-the IL would end up identical to my early threader code, but with the
-probabilities would remain as 94.5/5.5.
-
-The above discrepancy causes the RTL ifcvt pass to generate different
-code, and by the time we get to the shrink wrapping pass, things look
-sufficiently different such that the legacy code can actually shrink
-wrap, whereas our new code does not.
-
-IMO, if the loop-ch pass moves conditionals outside of a loop, the
-probabilities should be adjusted, but that does mean the shrink wrap
-won't happen for this contrived testcase.
- */
-
 int foo (int *p1, int *p2);

 int
@ -68,4 +16,4 @@ test (int *p1, int *p2)

  return 1;
 }
-/* { dg-final { scan-rtl-dump "Performing shrink-wrapping" "pro_and_epilogue" { xfail *-*-* } } } */
+/* { dg-final { scan-rtl-dump "Performing shrink-wrapping" "pro_and_epilogue" } } */
--- a/gcc/testsuite/gcc.dg/tree-ssa/ifc-20040816-1.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ifc-20040816-1.c
@ -39,4 +39,4 @@ int main1 ()
   which is folded by vectorizer.  Both outgoing edges must have probability
   100% so the resulting profile match after folding.  */
 /* { dg-final { scan-tree-dump-times "Invalid sum of outgoing probabilities 200.0" 1 "ifcvt" } } */
-/* { dg-final { scan-tree-dump-times "Invalid sum of incoming counts" 1 "ifcvt" } } */
+/* { dg-final { scan-tree-dump-times "Invalid sum of incoming counts" 2 "ifcvt" } } */
--- a/gcc/testsuite/gcc.dg/tree-ssa/pr21559.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr21559.c
@ -35,10 +35,7 @@ void foo (void)
 /* First, we should simplify the bits < 0 test within the loop.  */
 /* { dg-final { scan-tree-dump-times "Simplified relational" 1 "evrp" } } */

-/* Second, we should thread the edge out of the loop via the break
-   statement.  We also realize that the final bytes == 0 test is useless,
-   and thread over it.  We also know that toread != 0 is useless when
-   entering while loop and thread over it.  */
-/* { dg-final { scan-tree-dump-times "Threaded jump" 3 "vrp-thread1" } } */
+/* We used to check for 3 threaded jumps here, but they all would
+   rotate the loop.  */


--- a/gcc/testsuite/gcc.dg/tree-ssa/pr59597.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr59597.c
@ -56,11 +56,7 @@ main (int argc, char argv[])
  return crc;
 }

-/* Previously we had 3 jump threads, but one of them crossed loops.
-   The reason the old threader was allowing it, was because there was
-   an ASSERT_EXPR getting in the way.  Without the ASSERT_EXPR, we
-   have an empty pre-header block as the final block in the thread,
-   which the threader will simply join with the next block which *is*
-   in a different loop.  */
-/* { dg-final { scan-tree-dump-times "Registering jump thread" 2 "vrp-thread1" } } */
+/* None of the threads we can get in vrp-thread1 are valid.  They all
+   cross or rotate loops.  */
+/* { dg-final { scan-tree-dump-not "Registering jump thread" "vrp-thread1" } } */
 /* { dg-final { scan-tree-dump-not "joiner" "vrp-thread1" } } */
--- a/gcc/testsuite/gcc.dg/tree-ssa/pr71437.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr71437.c
@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-ffast-math -O3 -fdump-tree-vrp-thread1-details" } */
+/* { dg-options "-ffast-math -O3 -fdump-tree-dom3-details" } */

 int I = 50, J = 50;
 int S, L;
@ -39,4 +39,8 @@ void foo (int K)
 	bar (LD, SD);
    }
 }
-/* { dg-final { scan-tree-dump-times "Threaded jump " 2 "vrp-thread1" } } */
+
+/* We used to get 1 vrp-thread1 candidates here, but they now get
+   deferred until after loop opts are done, because they were rotating
+   loops.  */
+/* { dg-final { scan-tree-dump-times "Threaded jump " 2 "dom3" } } */
--- a/gcc/testsuite/gcc.dg/tree-ssa/pr77445-2.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr77445-2.c
@ -123,8 +123,7 @@ enum STATES FMS( u8 **in , u32 *transitions) {
   aarch64 has the highest CASE_VALUES_THRESHOLD in GCC.  It's high enough
   to change decisions in switch expansion which in turn can expose new
   jump threading opportunities.  Skip the later tests on aarch64.  */
-/* { dg-final { scan-tree-dump "Jumps threaded: \[7-9\]" "thread1" } } */
-/* { dg-final { scan-tree-dump-times "Invalid sum" 1 "thread1" } } */
+/* { dg-final { scan-tree-dump "Jumps threaded: \[7-9\]" "thread2" } } */
 /* { dg-final { scan-tree-dump-not "optimizing for size" "thread1" } } */
 /* { dg-final { scan-tree-dump-not "optimizing for size" "thread2" } } */
 /* { dg-final { scan-tree-dump-not "optimizing for size" "thread3" { target { ! aarch64*-*-* } } } } */
--- a/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-18.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-18.c
@ -1,27 +0,0 @@
-/* { dg-do compile } */ 
-/* { dg-options "-O2 -fdump-tree-vrp-thread1-details -std=gnu89 --param logical-op-non-short-circuit=0" } */
-
-#include "ssa-dom-thread-4.c"
-
-/* On targets that define LOGICAL_OP_NON_SHORT_CIRCUIT to 0, we split both
-   "a_elt || b_elt" and "b_elt && kill_elt" into two conditions each,
-   rather than using "(var1 != 0) op (var2 != 0)".  Also, as on other targets,
-   we duplicate the header of the inner "while" loop.  There are then
-   4 threading opportunities:
-
-   1x "!a_elt && b_elt" in the outer "while" loop
-      -> the start of the inner "while" loop,
-	 skipping the known-true "b_elt" in the first condition.
-   1x "!b_elt" in the first condition
-      -> the outer "while" loop's continuation point,
-	 skipping the known-false "b_elt" in the second condition.
-   2x "kill_elt->indx >= b_elt->indx" in the first "while" loop
-      -> "kill_elt->indx == b_elt->indx" in the second condition,
-	 skipping the known-true "b_elt && kill_elt" in the second
-	 condition.
-
-   All the cases are picked up by VRP1 as jump threads.  */
-
-/* There used to be 6 jump threads found by thread1, but they all
-   depended on threading through distinct loops in ethread.  */
-/* { dg-final { scan-tree-dump-times "Threaded" 2 "vrp-thread1" } } */
--- a/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-2a.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-2a.c
@ -1,21 +0,0 @@
-/* { dg-do compile } */ 
-/* { dg-options "-O2 -fdump-tree-vrp-thread1-stats -fdump-tree-dom2-stats" } */
-
-void bla();
-
-/* In the following case, we should be able to thread edge through
-   the loop header.  */
-
-void thread_entry_through_header (void)
-{
-  int i;
-
-  for (i = 0; i < 170; i++)
-    bla ();
-}
-
-/* There's a single jump thread that should be handled by the VRP
-   jump threading pass.  */
-/* { dg-final { scan-tree-dump-times "Jumps threaded: 1" 1 "vrp-thread1"} } */
-/* { dg-final { scan-tree-dump-times "Jumps threaded: 2" 0 "vrp-thread1"} } */
-/* { dg-final { scan-tree-dump-not "Jumps threaded" "dom2"} } */
--- a/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-4.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-4.c
@ -1,5 +1,5 @@
 /* { dg-do compile } */ 
-/* { dg-options "-O2 -fdump-tree-vrp-thread1-details -fdump-tree-dom2-details -std=gnu89 --param logical-op-non-short-circuit=1" } */
+/* { dg-options "-O2 -fdump-tree-vrp-thread2-details -fdump-tree-dom2-details -std=gnu89 --param logical-op-non-short-circuit=1" } */
 struct bitmap_head_def;
 typedef struct bitmap_head_def *bitmap;
 typedef const struct bitmap_head_def *const_bitmap;
@ -53,10 +53,8 @@ bitmap_ior_and_compl (bitmap dst, const_bitmap a, const_bitmap b,

  return changed;
 }
-/* The block starting the second conditional has  3 incoming edges,
-   we should thread all three, but due to a bug in the threading
-   code we missed the edge when the first conditional is false
-   (b_elt is zero, which means the second conditional is always
-   zero.  VRP1 catches all three.  */
-/* { dg-final { scan-tree-dump-times "Registering jump thread" 2 "vrp-thread1" } } */
-/* { dg-final { scan-tree-dump-times "Path crosses loops" 1 "vrp-thread1" } } */
+/* We used to catch 3 jump threads in vrp-thread1, but they all
+   rotated the loop, so they were disallowed.  This in turn created
+   other opportunities for the other threaders which result in the the
+   post-loop threader (vrp-thread2) catching more.  */
+/* { dg-final { scan-tree-dump-times "Registering jump thread" 5 "vrp-thread2" } } */
--- a/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-6.c
@ -1,44 +0,0 @@
-/* { dg-do compile } */
-/* { dg-options "-O2 -fdump-tree-thread1-details -fdump-tree-thread3-details" } */
-
-/* { dg-final { scan-tree-dump-times "Registering jump" 6 "thread1" } } */
-/* { dg-final { scan-tree-dump-times "Registering jump" 1 "thread3" } } */
-
-int sum0, sum1, sum2, sum3;
-int foo (char *s, char **ret)
-{
-  int state=0;
-  char c;
-
-  for (; *s && state != 4; s++)
-    {
-      c = *s;
-      if (c == '*')
-	{
-	  s++;
-	  break;
-	}
-      switch (state)
-	{
-	case 0:
-	  if (c == '+')
-	    state = 1;
-	  else if (c != '-')
-	    sum0+=c;
-	  break;
-	case 1:
-	  if (c == '+')
-	    state = 2;
-	  else if (c == '-')
-	    state = 0;
-	  else
-	    sum1+=c;
-	  break;
-	default:
-	  break;
-	}
-
-    }
-  *ret = s;
-  return state;
-}
--- a/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-7.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-7.c
@ -1,15 +1,14 @@
 /* { dg-do compile } */
 /* { dg-options "-O2 -fdump-tree-thread1-stats -fdump-tree-thread2-stats -fdump-tree-dom2-stats -fdump-tree-thread3-stats -fdump-tree-dom3-stats -fdump-tree-vrp2-stats -fno-guess-branch-probability" } */

-/* { dg-final { scan-tree-dump "Jumps threaded: 12"  "thread1" } } */
-/* { dg-final { scan-tree-dump "Jumps threaded: 5" "thread3" { target { ! aarch64*-*-* } } } } */
+/* { dg-final { scan-tree-dump "Jumps threaded: 12"  "thread3" } } */
 /* { dg-final { scan-tree-dump-not "Jumps threaded"  "dom2" } } */

 /* aarch64 has the highest CASE_VALUES_THRESHOLD in GCC.  It's high enough
   to change decisions in switch expansion which in turn can expose new
   jump threading opportunities.  Skip the later tests on aarch64.  */
 /* { dg-final { scan-tree-dump-not "Jumps threaded"  "dom3" { target { ! aarch64*-*-* } } } } */
-/* { dg-final { scan-tree-dump-not "Jumps threaded"  "vrp2" { target { ! aarch64*-*-* } } } } */
+/* { dg-final { scan-tree-dump-not "Jumps threaded"  "vrp-thread2" { target { ! aarch64*-*-* } } } } */

 enum STATE {
  S0=0,
--- a/gcc/testsuite/gcc.dg/tree-ssa/ssa-thread-invalid.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-thread-invalid.c
@ -0,0 +1,102 @@
+// { dg-do compile }
+// { dg-options "-O2 -fgimple -fdump-statistics" }
+//
+// This is a collection of seemingly threadble paths that should not be allowed.
+
+void foobar (int);
+
+// Possible thread from 2->4->3, but it would rotate the loop.
+void __GIMPLE (ssa)
+f1 ()
+{
+  int i;
+
+  // Pre-header.
+  __BB(2):
+  goto __BB4;
+
+  // Latch.
+  __BB(3):
+  foobar (i_1);
+  i_5 = i_1 + 1;
+  goto __BB4;
+
+  __BB(4,loop_header(1)):
+  i_1 = __PHI (__BB2: 0, __BB3: i_5);
+  if (i_1 != 101)
+    goto __BB3;
+  else
+    goto __BB5;
+
+  __BB(5):
+  return;
+
+}
+
+// Possible thread from 2->3->5 but threading through the empty latch
+// would create a non-empty latch.
+void __GIMPLE (ssa)
+f2 ()
+{
+  int i;
+
+  // Pre-header.
+  __BB(2):
+  goto __BB3;
+
+  __BB(3,loop_header(1)):
+  i_8 = __PHI (__BB5: i_5, __BB2: 0);
+  foobar (i_8);
+  i_5 = i_8 + 1;
+  if (i_5 != 256)
+    goto __BB5;
+  else
+    goto __BB4;
+
+  // Latch.
+  __BB(5):
+  goto __BB3;
+
+  __BB(4):
+  return;
+
+}
+
+// Possible thread from 3->5->6->3 but this would thread through the
+// header but not exit the loop.
+int __GIMPLE (ssa)
+f3 (int a)
+{
+  int i;
+
+  __BB(2):
+  goto __BB6;
+
+  __BB(3):
+  if (i_1 != 0)
+    goto __BB4;
+  else
+    goto __BB5;
+
+  __BB(4):
+  foobar (5);
+  goto __BB5;
+
+  // Latch.
+  __BB(5):
+  i_7 = i_1 + 1;
+  goto __BB6;
+
+  __BB(6,loop_header(1)):
+  i_1 = __PHI (__BB2: 1, __BB5: i_7);
+  if (i_1 <= 99)
+    goto __BB3;
+  else
+    goto __BB7;
+
+  __BB(7):
+  return;
+
+}
+
+// { dg-final { scan-tree-dump-not "Jumps threaded" "statistics" } }
--- a/gcc/testsuite/gcc.dg/vect/bb-slp-16.c
+++ b/gcc/testsuite/gcc.dg/vect/bb-slp-16.c
@ -16,41 +16,52 @@ main1 (int dummy)
  unsigned int *pin = &in[0];
  unsigned int *pout = &out[0];
  unsigned int a = 0;
-  
-  for (i = 0; i < N; i++)
+
+  i = N;
+  if (i > 0)
    {
-      *pout++ = *pin++ + a;
-      *pout++ = *pin++ + a;
-      *pout++ = *pin++ + a;
-      *pout++ = *pin++ + a;
-      *pout++ = *pin++ + a;
-      *pout++ = *pin++ + a;
-      *pout++ = *pin++ + a;
-      *pout++ = *pin++ + a;
-      if (arr[i] = i)
-        a = i;
-      else
-        a = 2;
+      do
+	{
+	  *pout++ = *pin++ + a;
+	  *pout++ = *pin++ + a;
+	  *pout++ = *pin++ + a;
+	  *pout++ = *pin++ + a;
+	  *pout++ = *pin++ + a;
+	  *pout++ = *pin++ + a;
+	  *pout++ = *pin++ + a;
+	  *pout++ = *pin++ + a;
+	  if (arr[i] = i)
+	    a = i;
+	  else
+	    a = 2;
+	}
+      while (i < N);
    }

  a = 0;
-  /* check results: */ 
-  for (i = 0; i < N; i++)
+  /* check results: */
+  i = N;
+  if (i > 0)
    {
-      if (out[i*8] !=  in[i*8] + a
-         || out[i*8 + 1] != in[i*8 + 1] + a
-         || out[i*8 + 2] != in[i*8 + 2] + a
-         || out[i*8 + 3] != in[i*8 + 3] + a
-         || out[i*8 + 4] != in[i*8 + 4] + a
-         || out[i*8 + 5] != in[i*8 + 5] + a
-         || out[i*8 + 6] != in[i*8 + 6] + a
-         || out[i*8 + 7] != in[i*8 + 7] + a)
-	abort ();
+      do
+	{
+	  if (out[i*8] !=  in[i*8] + a
+	      || out[i*8 + 1] != in[i*8 + 1] + a
+	      || out[i*8 + 2] != in[i*8 + 2] + a
+	      || out[i*8 + 3] != in[i*8 + 3] + a
+	      || out[i*8 + 4] != in[i*8 + 4] + a
+	      || out[i*8 + 5] != in[i*8 + 5] + a
+	      || out[i*8 + 6] != in[i*8 + 6] + a
+	      || out[i*8 + 7] != in[i*8 + 7] + a)
+	    abort ();

-      if (arr[i] = i)
-        a = i;
-      else
-        a = 2;
+	  if (arr[i] = i)
+	    a = i;
+	  else
+	    a = 2;
+	  i++;
+	}
+      while (i < N);
    }

  return 0;
@ -66,4 +77,3 @@ int main (void)
 }

 /* { dg-final { scan-tree-dump-times "optimized: basic block" 1 "slp1" } } */
-  
--- a/gcc/tree-ssa-threadupdate.c
+++ b/gcc/tree-ssa-threadupdate.c
@ -278,7 +278,7 @@ cancel_thread (vec<jump_thread_edge *> *path, const char *reason = NULL)
  if (dump_file && (dump_flags & TDF_DETAILS))
    {
      if (reason)
-	fprintf (dump_file, "%s:\n", reason);
+	fprintf (dump_file, "%s: ", reason);

      dump_jump_thread_path (dump_file, *path, false);
      fprintf (dump_file, "\n");
@ -2771,6 +2771,7 @@ jt_path_registry::cancel_invalid_paths (vec<jump_thread_edge *> &path)
  bool seen_latch = false;
  int loops_crossed = 0;
  bool crossed_latch = false;
+  bool crossed_loop_header = false;
  // Use ->dest here instead of ->src to ignore the first block.  The
  // first block is allowed to be in a different loop, since it'll be
  // redirected.  See similar comment in profitable_path_p: "we don't
@ -2804,6 +2805,14 @@ jt_path_registry::cancel_invalid_paths (vec<jump_thread_edge *> &path)
 	  ++loops_crossed;
 	}

+      // ?? Avoid threading through loop headers that remain in the
+      // loop, as such threadings tend to create sub-loops which
+      // _might_ be OK ??.
+      if (e->dest->loop_father->header == e->dest
+	  && !flow_loop_nested_p (exit->dest->loop_father,
+				  e->dest->loop_father))
+	crossed_loop_header = true;
+
      if (flag_checking && !m_backedge_threads)
 	gcc_assert ((path[i]->e->flags & EDGE_DFS_BACK) == 0);
    }
@ -2829,6 +2838,21 @@ jt_path_registry::cancel_invalid_paths (vec<jump_thread_edge *> &path)
      cancel_thread (&path, "Path crosses loops");
      return true;
    }
+  // The path should either start and end in the same loop or exit the
+  // loop it starts in but never enter a loop.  This also catches
+  // creating irreducible loops, not only rotation.
+  if (entry->src->loop_father != exit->dest->loop_father
+      && !flow_loop_nested_p (exit->src->loop_father,
+			      entry->dest->loop_father))
+    {
+      cancel_thread (&path, "Path rotates loop");
+      return true;
+    }
+  if (crossed_loop_header)
+    {
+      cancel_thread (&path, "Path crosses loop header but does not exit it");
+      return true;
+    }
  return false;
 }

--- a/libgomp/testsuite/libgomp.graphite/force-parallel-5.c
+++ b/libgomp/testsuite/libgomp.graphite/force-parallel-5.c
@ -31,6 +31,6 @@ int main(void)
 }

 /* Check that parallel code generation part make the right answer.  */
-/* { dg-final { scan-tree-dump-times "2 loops carried no dependency" 1 "graphite" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump-times "2 loops carried no dependency" 1 "graphite" } } */
 /* { dg-final { scan-tree-dump-times "loopfn.0" 4 "optimized" } } */
 /* { dg-final { scan-tree-dump-times "loopfn.1" 4 "optimized" } } */