From 46ccce1de686c1b437eff43431dc20d20d4687c0 Mon Sep 17 00:00:00 2001
From: Tamar Christina
Date: Wed, 16 Apr 2025 13:09:05 +0100
Subject: [PATCH] middle-end: Fix incorrect codegen with PFA and VLS [PR119351]
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The following example:

#define N 512
#define START 2
#define END 505

int x[N] __attribute__((aligned(32)));

int __attribute__((noipa))
foo (void)
{
  for (signed int i = START; i < END; ++i)
    {
      if (x[i] == 0)
	return i;
    }
  return -1;
}

generates incorrect code with fixed-length SVE because for early break we
need to know which value to start the scalar loop with if we take an early
exit.

Historically this meant taking the first element of every induction.  This
is because there is an assumption in place that, even with masked loops, the
masks come from a whilel* instruction.  As such we reduce using a
BIT_FIELD_REF <, 0>.

When PFA (peeling for alignment) was added this assumption was still correct
for non-masked loops.  However, we assumed that PFA for VLA would not work
for now and disabled it using the alignment requirement checks, and we
expected VLS to peel for alignment using scalar loops.

However, as this PR shows, for VLS the vectorizer can, and in some
circumstances does, choose to peel using masks by masking the first
iteration of the loop with an additional alignment mask.  When this is done,
the first elements of the predicate can be inactive.  In this example
element 1 is inactive based on the calculated misalignment, hence the -1
value in the first vector IV element.  When we reduce using BIT_FIELD_REF we
get the wrong value.

This patch fixes that by creating a new scalar PHI that keeps track of
whether we are in the first iteration of the loop (with the additional
masking) or whether we have already taken a loop iteration.

The generated sequence:

pre-header:
  bb1:
    i_1 =

header:
  bb2:
    i_2 = PHI
    …

early-exit:
  bb3:
    i_3 = iv_step * i_2 + PHI

This eliminates the need for an expensive mask-based reduction.

This fixes gromacs with one OpenMP thread.  But with > 1 there is still an
issue.

gcc/ChangeLog:

	PR tree-optimization/119351
	* tree-vectorizer.h (LOOP_VINFO_MASK_NITERS_PFA_OFFSET,
	LOOP_VINFO_NON_LINEAR_IV): New.
	(class _loop_vec_info): Add mask_skip_niters_pfa_offset and
	nonlinear_iv.
	* tree-vect-loop.cc (_loop_vec_info::_loop_vec_info): Initialize
	them.
	(vect_analyze_scalar_cycles_1): Record non-linear inductions.
	(vectorizable_induction): If early break and PFA using masking
	create a new phi which tracks where the scalar code needs to
	start...
	(vectorizable_live_operation): ...and generate the adjustments
	here.
	(vect_use_loop_mask_for_alignment_p): Reject non-linear inductions
	and early break needing peeling.

gcc/testsuite/ChangeLog:

	PR tree-optimization/119351
	* gcc.target/aarch64/sve/peel_ind_10.c: New test.
	* gcc.target/aarch64/sve/peel_ind_10_run.c: New test.
	* gcc.target/aarch64/sve/peel_ind_5.c: New test.
	* gcc.target/aarch64/sve/peel_ind_5_run.c: New test.
	* gcc.target/aarch64/sve/peel_ind_6.c: New test.
	* gcc.target/aarch64/sve/peel_ind_6_run.c: New test.
	* gcc.target/aarch64/sve/peel_ind_7.c: New test.
	* gcc.target/aarch64/sve/peel_ind_7_run.c: New test.
	* gcc.target/aarch64/sve/peel_ind_8.c: New test.
	* gcc.target/aarch64/sve/peel_ind_8_run.c: New test.
	* gcc.target/aarch64/sve/peel_ind_9.c: New test.
	* gcc.target/aarch64/sve/peel_ind_9_run.c: New test.
---
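A rough scalar model of the restart-index adjustment described above, for
illustration only (not part of the patch; START, STEP and n_inactive are
assumed values rather than what the vectorizer computes for this exact
testcase):

#include <stdio.h>

#define START 2
#define STEP 1	/* iv_step of the induction variable i.  */

int
main (void)
{
  /* Assume peeling for alignment via masking left the first n_inactive
     lanes of the first vector iteration inactive.  */
  int n_inactive = 2;

  /* The vector IV is biased back by the skipped elements, so lane 0 of
     the first iteration holds START - n_inactive * STEP, not START.  */
  int iv_lane0 = START - n_inactive * STEP;

  /* Old behaviour: restart the scalar loop from BIT_FIELD_REF <iv, 0>,
     which is wrong when the leading lanes were masked off.  */
  int buggy_restart = iv_lane0;

  /* New behaviour: the pfa_iv_offset PHI is n_inactive on the first
     vector iteration and 0 on all later ones, so the correction
     i_3 = iv_step * i_2 + lane0 only applies where it is needed.  */
  int pfa_iv_offset = n_inactive;
  int fixed_restart = STEP * pfa_iv_offset + iv_lane0;

  printf ("buggy restart %d, fixed restart %d\n", buggy_restart,
	  fixed_restart);
  return fixed_restart == START ? 0 : 1;
}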
 .../gcc.target/aarch64/sve/peel_ind_10.c      | 24 +++++
 .../gcc.target/aarch64/sve/peel_ind_10_run.c  | 17 ++++
 .../gcc.target/aarch64/sve/peel_ind_5.c       | 24 +++++
 .../gcc.target/aarch64/sve/peel_ind_5_run.c   | 17 ++++
 .../gcc.target/aarch64/sve/peel_ind_6.c       | 24 +++++
 .../gcc.target/aarch64/sve/peel_ind_6_run.c   | 17 ++++
 .../gcc.target/aarch64/sve/peel_ind_7.c       | 24 +++++
 .../gcc.target/aarch64/sve/peel_ind_7_run.c   | 17 ++++
 .../gcc.target/aarch64/sve/peel_ind_8.c       | 24 +++++
 .../gcc.target/aarch64/sve/peel_ind_8_run.c   | 17 ++++
 .../gcc.target/aarch64/sve/peel_ind_9.c       | 25 +++++
 .../gcc.target/aarch64/sve/peel_ind_9_run.c   | 17 ++++
 gcc/tree-vect-loop.cc                         | 95 ++++++++++++++++++-
 gcc/tree-vectorizer.h                         | 18 +++-
 14 files changed, 357 insertions(+), 3 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/peel_ind_10.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/peel_ind_10_run.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/peel_ind_5.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/peel_ind_5_run.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/peel_ind_6.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/peel_ind_6_run.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/peel_ind_7.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/peel_ind_7_run.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/peel_ind_8.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/peel_ind_8_run.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/peel_ind_9.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/peel_ind_9_run.c

diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_10.c b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_10.c
new file mode 100644
index 00000000000..b7a7bc5cb0c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_10.c
@@ -0,0 +1,24 @@
+/* Fix for PR119351 alignment peeling with vectors and VLS.  */
+/* { dg-do compile } */
+/* { dg-options "-Ofast -msve-vector-bits=256 --param aarch64-autovec-preference=sve-only -fdump-tree-vect-details" } */
+
+#define N 512
+#define START 0
+#define END 505
+
+int x[N] __attribute__((aligned(32)));
+
+int __attribute__((noipa))
+foo (int start)
+{
+  for (unsigned int i = start; i < END; ++i)
+    {
+      if (x[i] == 0)
+	return i;
+    }
+  return -1;
+}
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump "pfa_iv_offset" "vect" } } */
+/* { dg-final { scan-tree-dump "Alignment of access forced using peeling" "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_10_run.c b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_10_run.c
new file mode 100644
index 00000000000..6169aebcc40
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_10_run.c
@@ -0,0 +1,17 @@
+/* Fix for PR119351 alignment peeling with vectors and VLS.  */
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-Ofast --param aarch64-autovec-preference=sve-only" } */
+/* { dg-additional-options "-msve-vector-bits=256" { target aarch64_sve256_hw } } */
+/* { dg-additional-options "-msve-vector-bits=128" { target aarch64_sve128_hw } } */
+
+#include "peel_ind_10.c"
+
+int __attribute__ ((optimize (1)))
+main (void)
+{
+  int res = foo (START);
+  asm volatile ("");
+  if (res != START)
+    __builtin_abort ();
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_5.c b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_5.c
new file mode 100644
index 00000000000..a03bb1dec21
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_5.c
@@ -0,0 +1,24 @@
+/* Fix for PR119351 alignment peeling with vectors and VLS.  */
+/* { dg-do compile } */
+/* { dg-options "-Ofast -msve-vector-bits=256 --param aarch64-autovec-preference=sve-only -fdump-tree-vect-details" } */
+
+#define N 512
+#define START 2
+#define END 505
+
+int x[N] __attribute__((aligned(32)));
+
+int __attribute__((noipa))
+foo (void)
+{
+  for (signed int i = START; i < END; ++i)
+    {
+      if (x[i] == 0)
+	return i;
+    }
+  return -1;
+}
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump "pfa_iv_offset" "vect" } } */
+/* { dg-final { scan-tree-dump "Alignment of access forced using peeling" "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_5_run.c b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_5_run.c
new file mode 100644
index 00000000000..f26befeab7e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_5_run.c
@@ -0,0 +1,17 @@
+/* Fix for PR119351 alignment peeling with vectors and VLS.  */
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-Ofast --param aarch64-autovec-preference=sve-only" } */
+/* { dg-additional-options "-msve-vector-bits=256" { target aarch64_sve256_hw } } */
+/* { dg-additional-options "-msve-vector-bits=128" { target aarch64_sve128_hw } } */
+
+#include "peel_ind_5.c"
+
+int __attribute__ ((optimize (1)))
+main (void)
+{
+  int res = foo ();
+  asm volatile ("");
+  if (res != START)
+    __builtin_abort ();
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_6.c b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_6.c
new file mode 100644
index 00000000000..9bfd1a65c4f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_6.c
@@ -0,0 +1,24 @@
+/* Fix for PR119351 alignment peeling with vectors and VLS.  */
+/* { dg-do compile } */
+/* { dg-options "-Ofast -msve-vector-bits=256 --param aarch64-autovec-preference=sve-only -fdump-tree-vect-details" } */
+
+#define N 512
+#define START 1
+#define END 505
+
+int x[N] __attribute__((aligned(32)));
+
+int __attribute__((noipa))
+foo (int start)
+{
+  for (unsigned int i = start; i < END; ++i)
+    {
+      if (x[i] == 0)
+	return i;
+    }
+  return -1;
+}
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump "pfa_iv_offset" "vect" } } */
+/* { dg-final { scan-tree-dump "Alignment of access forced using peeling" "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_6_run.c b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_6_run.c
new file mode 100644
index 00000000000..4fdf3e4e7ca
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_6_run.c
@@ -0,0 +1,17 @@
+/* Fix for PR119351 alignment peeling with vectors and VLS.  */
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-Ofast --param aarch64-autovec-preference=sve-only" } */
+/* { dg-additional-options "-msve-vector-bits=256" { target aarch64_sve256_hw } } */
+/* { dg-additional-options "-msve-vector-bits=128" { target aarch64_sve128_hw } } */
+
+#include "peel_ind_6.c"
+
+int __attribute__ ((optimize (1)))
+main (void)
+{
+  int res = foo (START);
+  asm volatile ("");
+  if (res != START)
+    __builtin_abort ();
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_7.c b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_7.c
new file mode 100644
index 00000000000..0182e131a17
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_7.c
@@ -0,0 +1,24 @@
+/* Fix for PR119351 alignment peeling with vectors and VLS.  */
+/* { dg-do compile } */
+/* { dg-options "-Ofast -msve-vector-bits=256 --param aarch64-autovec-preference=sve-only -fdump-tree-vect-details" } */
+
+#define N 512
+#define START 1
+#define END 505
+
+int x[N] __attribute__((aligned(32)));
+
+int __attribute__((noipa))
+foo (void)
+{
+  for (unsigned int i = START; i < END; ++i)
+    {
+      if (x[i] == 0)
+	return i;
+    }
+  return -1;
+}
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump "pfa_iv_offset" "vect" } } */
+/* { dg-final { scan-tree-dump "Alignment of access forced using peeling" "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_7_run.c b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_7_run.c
new file mode 100644
index 00000000000..05608dd85f1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_7_run.c
@@ -0,0 +1,17 @@
+/* Fix for PR119351 alignment peeling with vectors and VLS.  */
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-Ofast --param aarch64-autovec-preference=sve-only" } */
+/* { dg-additional-options "-msve-vector-bits=256" { target aarch64_sve256_hw } } */
+/* { dg-additional-options "-msve-vector-bits=128" { target aarch64_sve128_hw } } */
+
+#include "peel_ind_7.c"
+
+int __attribute__ ((optimize (1)))
+main (void)
+{
+  int res = foo ();
+  asm volatile ("");
+  if (res != START)
+    __builtin_abort ();
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_8.c b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_8.c
new file mode 100644
index 00000000000..043348b55d0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_8.c
@@ -0,0 +1,24 @@
+/* Fix for PR119351 alignment peeling with vectors and VLS.  */
+/* { dg-do compile } */
+/* { dg-options "-Ofast -msve-vector-bits=256 --param aarch64-autovec-preference=sve-only -fdump-tree-vect-details" } */
+
+#define N 512
+#define START 1
+#define END 505
+
+int x[N] __attribute__((aligned(32)));
+
+int __attribute__((noipa))
+foo (void)
+{
+  for (unsigned int i = START; i < END; i*=2)
+    {
+      if (x[i] == 0)
+	return i;
+    }
+  return -1;
+}
+
+/* { dg-final { scan-tree-dump-not "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump-not "pfa_iv_offset" "vect" } } */
+/* { dg-final { scan-tree-dump-not "Alignment of access forced using peeling" "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_8_run.c b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_8_run.c
new file mode 100644
index 00000000000..aa8612248bf
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_8_run.c
@@ -0,0 +1,17 @@
+/* Fix for PR119351 alignment peeling with vectors and VLS.  */
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-Ofast --param aarch64-autovec-preference=sve-only" } */
+/* { dg-additional-options "-msve-vector-bits=256" { target aarch64_sve256_hw } } */
+/* { dg-additional-options "-msve-vector-bits=128" { target aarch64_sve128_hw } } */
+
+#include "peel_ind_8.c"
+
+int __attribute__ ((optimize (1)))
+main (void)
+{
+  int res = foo ();
+  asm volatile ("");
+  if (res != START)
+    __builtin_abort ();
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_9.c b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_9.c
new file mode 100644
index 00000000000..cc904e88170
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_9.c
@@ -0,0 +1,25 @@
+/* Fix for PR119351 alignment peeling with vectors and VLS.  */
+/* { dg-do compile } */
+/* { dg-options "-Ofast -msve-vector-bits=256 --param aarch64-autovec-preference=sve-only -fdump-tree-vect-details" } */
+
+#define N 512
+#define START 1
+#define END 505
+
+int x[N] __attribute__((aligned(32)));
+
+int __attribute__((noipa))
+foo (void)
+{
+  for (int *p = x + START; p < x + END; p++)
+    {
+      if (*p == 0)
+	return START;
+    }
+  return -1;
+}
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* Peels using a scalar loop.  */
+/* { dg-final { scan-tree-dump-not "pfa_iv_offset" "vect" } } */
+/* { dg-final { scan-tree-dump "Alignment of access forced using peeling" "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_9_run.c b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_9_run.c
new file mode 100644
index 00000000000..767f8bd284c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_9_run.c
@@ -0,0 +1,17 @@
+/* Fix for PR119351 alignment peeling with vectors and VLS.  */
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-Ofast --param aarch64-autovec-preference=sve-only" } */
+/* { dg-additional-options "-msve-vector-bits=256" { target aarch64_sve256_hw } } */
+/* { dg-additional-options "-msve-vector-bits=128" { target aarch64_sve128_hw } } */
+
+#include "peel_ind_9.c"
+
+int __attribute__ ((optimize (1)))
+main (void)
+{
+  int res = foo ();
+  asm volatile ("");
+  if (res != START)
+    __builtin_abort ();
+  return 0;
+}
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 9413dcef702..958b829fa8d 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -653,6 +653,10 @@ vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop,
       if (dump_enabled_p ())
 	dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
+
+      /* Mark if we have a non-linear IV.  */
+      LOOP_VINFO_NON_LINEAR_IV (loop_vinfo)
+	= STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_vinfo) != vect_step_op_add;
     }
@@ -1046,12 +1050,14 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
     suggested_unroll_factor (1),
     max_vectorization_factor (0),
     mask_skip_niters (NULL_TREE),
+    mask_skip_niters_pfa_offset (NULL_TREE),
     rgroup_compare_type (NULL_TREE),
     simd_if_cond (NULL_TREE),
     partial_vector_style (vect_partial_vectors_none),
     unaligned_dr (NULL),
     peeling_for_alignment (0),
     ptr_mask (0),
+    nonlinear_iv (false),
     ivexpr_map (NULL),
     scan_map (NULL),
     slp_unrolling_factor (1),
@@ -10678,6 +10684,54 @@ vectorizable_induction (loop_vec_info loop_vinfo,
 					   LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
 	  peel_mul = gimple_build_vector_from_val (&init_stmts, step_vectype,
 						   peel_mul);
+
+	  /* If early break then we have to create a new PHI which we can use as
+	     an offset to adjust the induction reduction in early exits.
+
+	     This is because when peeling for alignment using masking, the first
+	     few elements of the vector can be inactive.  As such if we find the
+	     entry in the first iteration we have to adjust the starting point
+	     of the scalar code.
+
+	     We do this by creating a new scalar PHI that keeps track of whether
+	     we are in the first iteration of the loop (with the additional
+	     masking) or whether we have taken a loop iteration already.
+
+	     The generated sequence:
+
+	     pre-header:
+	       bb1:
+		 i_1 =
+
+	     header:
+	       bb2:
+		 i_2 = PHI
+		 …
+
+	     early-exit:
+	       bb3:
+		 i_3 = iv_step * i_2 + PHI
+
+	     The first part of the adjustment, creating i_1 and i_2, is done
+	     here and the last part, creating i_3, is done in
+	     vectorizable_live_operation when the induction extraction is
+	     materialized.  */
+	  if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
+	      && !LOOP_VINFO_MASK_NITERS_PFA_OFFSET (loop_vinfo))
+	    {
+	      auto skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
+	      tree ty_skip_niters = TREE_TYPE (skip_niters);
+	      tree break_lhs_phi = vect_get_new_vect_var (ty_skip_niters,
+							  vect_scalar_var,
+							  "pfa_iv_offset");
+	      gphi *nphi = create_phi_node (break_lhs_phi, bb);
+	      add_phi_arg (nphi, skip_niters, pe, UNKNOWN_LOCATION);
+	      add_phi_arg (nphi, build_zero_cst (ty_skip_niters),
+			   loop_latch_edge (iv_loop), UNKNOWN_LOCATION);
+
+	      LOOP_VINFO_MASK_NITERS_PFA_OFFSET (loop_vinfo)
+		= PHI_RESULT (nphi);
+	    }
 	}
       tree step_mul = NULL_TREE;
       unsigned ivn;
@@ -11565,8 +11619,10 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
 	  /* For early exit where the exit is not in the BB that leads to the
 	     latch then we're restarting the iteration in the scalar loop.  So
 	     get the first live value.  */
-	  if ((all_exits_as_early_p || !main_exit_edge)
-	      && STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
+	  bool early_break_first_element_p
+	    = (all_exits_as_early_p || !main_exit_edge)
+	      && STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def;
+	  if (early_break_first_element_p)
 	    {
 	      tmp_vec_lhs = vec_lhs0;
 	      tmp_bitstart = build_zero_cst (TREE_TYPE (bitstart));
@@ -11581,6 +11637,41 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
 						 lhs_type, &exit_gsi);
 
 	  auto gsi = gsi_for_stmt (use_stmt);
+	  if (early_break_first_element_p
+	      && LOOP_VINFO_MASK_NITERS_PFA_OFFSET (loop_vinfo))
+	    {
+	      tree step_expr
+		= STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
+	      tree break_lhs_phi
+		= LOOP_VINFO_MASK_NITERS_PFA_OFFSET (loop_vinfo);
+	      tree ty_skip_niters = TREE_TYPE (break_lhs_phi);
+	      gimple_seq iv_stmts = NULL;
+
+	      /* Now create the PHI for the outside loop usage to
+		 retrieve the value for the offset counter.  */
+	      tree rphi_step
+		= gimple_convert (&iv_stmts, ty_skip_niters, step_expr);
+	      tree tmp2
+		= gimple_build (&iv_stmts, MULT_EXPR,
+				ty_skip_niters, rphi_step,
+				break_lhs_phi);
+
+	      if (POINTER_TYPE_P (TREE_TYPE (new_tree)))
+		tmp2 = gimple_build (&iv_stmts, POINTER_PLUS_EXPR,
+				     TREE_TYPE (new_tree), new_tree, tmp2);
+	      else
+		{
+		  tmp2 = gimple_convert (&iv_stmts, TREE_TYPE (new_tree),
+					 tmp2);
+		  tmp2 = gimple_build (&iv_stmts, PLUS_EXPR,
+				       TREE_TYPE (new_tree), new_tree,
+				       tmp2);
+		}
+
+	      new_tree = tmp2;
+	      gsi_insert_seq_before (&exit_gsi, iv_stmts, GSI_SAME_STMT);
+	    }
+
 	  tree lhs_phi = gimple_phi_result (use_stmt);
 	  remove_phi_node (&gsi, false);
 	  gimple *copy = gimple_build_assign (lhs_phi, new_tree);
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 97caf61b345..01d19c77656 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -818,6 +818,11 @@ public:
      elements that should be false in the first mask).  */
   tree mask_skip_niters;
 
+  /* If we are using a loop mask to align memory addresses and we're in an
+     early break loop then this variable contains the number of elements that
+     were skipped during the initial iteration of the loop.  */
+  tree mask_skip_niters_pfa_offset;
+
   /* The type that the loop control IV should be converted to before
      testing which of the VF scalars are active and inactive.
      Only meaningful if LOOP_VINFO_USING_PARTIAL_VECTORS_P.  */
@@ -854,6 +859,9 @@ public:
   /* The mask used to check the alignment of pointers or arrays.  */
   int ptr_mask;
 
+  /* Indicates whether the loop has any non-linear IV.  */
+  bool nonlinear_iv;
+
   /* Data Dependence Relations defining address ranges that are candidates
      for a run-time aliasing check.  */
   auto_vec<ddr_p> may_alias_ddrs;
@@ -1064,6 +1072,7 @@ public:
 #define LOOP_VINFO_MASKS(L)                (L)->masks
 #define LOOP_VINFO_LENS(L)                 (L)->lens
 #define LOOP_VINFO_MASK_SKIP_NITERS(L)     (L)->mask_skip_niters
+#define LOOP_VINFO_MASK_NITERS_PFA_OFFSET(L) (L)->mask_skip_niters_pfa_offset
 #define LOOP_VINFO_RGROUP_COMPARE_TYPE(L)  (L)->rgroup_compare_type
 #define LOOP_VINFO_RGROUP_IV_TYPE(L)       (L)->rgroup_iv_type
 #define LOOP_VINFO_PARTIAL_VECTORS_STYLE(L) (L)->partial_vector_style
@@ -1073,6 +1082,7 @@ public:
 #define LOOP_VINFO_DDRS(L)                 (L)->shared->ddrs
 #define LOOP_VINFO_INT_NITERS(L)           (TREE_INT_CST_LOW ((L)->num_iters))
 #define LOOP_VINFO_PEELING_FOR_ALIGNMENT(L) (L)->peeling_for_alignment
+#define LOOP_VINFO_NON_LINEAR_IV(L)        (L)->nonlinear_iv
 #define LOOP_VINFO_UNALIGNED_DR(L)         (L)->unaligned_dr
 #define LOOP_VINFO_MAY_MISALIGN_STMTS(L)   (L)->may_misalign_stmts
 #define LOOP_VINFO_MAY_ALIAS_DDRS(L)       (L)->may_alias_ddrs
@@ -2138,8 +2148,14 @@ unlimited_cost_model (loop_p loop)
 inline bool
 vect_use_loop_mask_for_alignment_p (loop_vec_info loop_vinfo)
 {
+  /* With early break vectorization we don't know whether the accesses will
+     stay inside the loop or not.  TODO: The early break adjustment code can
+     be implemented the same way as vectorizable_linear_induction.  However we
+     can't test this today so reject it.  */
   return (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
-	  && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
+	  && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
+	  && !(LOOP_VINFO_NON_LINEAR_IV (loop_vinfo)
+	       && LOOP_VINFO_EARLY_BREAKS (loop_vinfo)));
 }
 
 /* Return the number of vectors of type VECTYPE that are needed to get
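For reference, the GIMPLE built by the new vectorizable_live_operation code
above corresponds roughly to the C helpers below.  This is an illustrative
sketch only, with hypothetical names: lane0 stands for the
BIT_FIELD_REF <vec_iv, 0> value, step for
STMT_VINFO_LOOP_PHI_EVOLUTION_PART, and pfa_offset for the pfa_iv_offset
PHI (the number of leading inactive lanes on the first vector iteration, 0
afterwards).

#include <stddef.h>

/* PLUS_EXPR path: an integer induction restarts at lane0 + step * offset.  */
static inline long
restart_int_iv (long lane0, long step, long pfa_offset)
{
  return lane0 + step * pfa_offset;
}

/* POINTER_PLUS_EXPR path: a pointer induction advances by step bytes.  */
static inline void *
restart_pointer_iv (void *lane0, ptrdiff_t step_bytes, ptrdiff_t pfa_offset)
{
  return (char *) lane0 + step_bytes * pfa_offset;
}

int
main (void)
{
  /* With step 1 and two leading lanes masked off, lane 0 held START - 2,
     so the restart index is corrected back to START (here 2).  */
  return restart_int_iv (0, 1, 2) == 2 ? 0 : 1;
}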