middle-end: Fix incorrect codegen with PFA and VLS [PR119351]

The following example:

#define N 512
#define START 2
#define END 505

int x[N] __attribute__((aligned(32)));

int __attribute__((noipa))
foo (void)
{
  for (signed int i = START; i < END; ++i)
    {
      if (x[i] == 0)
        return i;
    }
  return -1;
}

generates incorrect code with fixed-length SVE because, for early-break
vectorization, we need to know which value to start the scalar loop with if we
take an early exit.

Historically this meant that we take the first element of every induction,
because of the long-standing assumption that, even with masked loops, the
masks come from a whilel* instruction.

As such we reduce using a BIT_FIELD_REF <, 0>.
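
As a standalone sketch of why that was safe (not vectorizer code; VF, the
start value and the remaining iteration count are made up for illustration):

/* Illustrative only: a whilel*-style mask has its active lanes as a leading
   contiguous run, so lane 0 of the vector IV is always the first scalar
   iteration covered by the current vector iteration, and BIT_FIELD_REF
   <iv, 0> extracts a valid restart value.  */
#include <stdio.h>

#define VF 8

int
main (void)
{
  int start = 500, remaining = 5;   /* a final, partial vector iteration */
  int iv[VF], mask[VF];

  for (int lane = 0; lane < VF; lane++)
    {
      mask[lane] = lane < remaining;   /* whilelo-style: leading lanes active */
      iv[lane] = start + lane;
    }

  /* Lane 0 is active whenever any lane is.  */
  printf ("lane 0: iv = %d, active = %d\n", iv[0], mask[0]);
  return 0;
}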

When PFA (peeling for alignment) was added, this assumption remained correct
for non-masked loops.  However, we assumed that PFA for VLA wouldn't work for
now and disabled it using the alignment requirement checks.  We also expected
VLS to peel for alignment using scalar loops.

However, as this PR shows, for VLS the vectorizer can, and in some
circumstances does, choose to peel using masks by masking the first iteration
of the loop with an additional alignment mask.

When this is done, the first elements of the predicate can be inactive.  In
this example element 1 is inactive based on the calculated misalignment, hence
the -1 value in the first vector IV element.

When we reduce using BIT_FIELD_REF we get the wrong value.
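
To make the failure mode concrete, here is a standalone sketch (not vectorizer
code; VF, the step and the number of skipped lanes are made up):

/* Illustrative only: peeling for alignment with masking aligns the first
   vector access downwards, so the vector IV starts SKIP steps before the
   scalar start and those leading lanes are inactive.  BIT_FIELD_REF <iv, 0>
   then extracts a value the scalar loop must not restart from.  */
#include <stdio.h>

#define VF 8
#define SKIP 2   /* leading lanes made inactive by the alignment mask */

int
main (void)
{
  int start = 2, step = 1;
  int iv[VF], mask[VF];

  for (int lane = 0; lane < VF; lane++)
    {
      iv[lane] = start + (lane - SKIP) * step;
      mask[lane] = lane >= SKIP;   /* leading lanes inactive */
    }

  printf ("lane 0: iv = %d, active = %d; first active lane: iv = %d\n",
          iv[0], mask[0], iv[SKIP]);
  return 0;
}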

This patch fixes the problem by creating a new scalar PHI that keeps track of
whether we are in the first iteration of the loop (with the additional masking)
or whether we have already taken a loop iteration.

The generated sequence:

pre-header:
  bb1:
    i_1 = <number of leading inactive elements>

header:
  bb2:
    i_2 = PHI <i_1(bb1), 0(latch)>
    …

early-exit:
  bb3:
    i_3 = iv_step * i_2 + PHI<vector-iv>

This eliminates the need to do an expensive mask-based reduction.
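
With the same illustrative numbers as in the sketch above (again, not
vectorizer output): if the break is taken in the first, partially masked
iteration, i_2 still holds the number of leading inactive lanes and the
adjustment lands on the first active lane; in any later iteration i_2 is 0
and the extracted lane 0 value is used unchanged.

/* Illustrative only: the early-exit adjustment i_3 = iv_step * i_2 + lane0.  */
#include <stdio.h>

int
main (void)
{
  int iv_step = 1;
  int i_3_first = iv_step * 2 + 0;   /* first iteration: i_2 = 2, lane 0 holds 0 -> restart at 2 */
  int i_3_later = iv_step * 0 + 8;   /* later iteration: i_2 = 0, lane 0 holds 8 -> restart at 8 */

  printf ("%d %d\n", i_3_first, i_3_later);
  return 0;
}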

This fixes gromacs with one OpenMP thread, but with more than one thread there
is still an issue.

gcc/ChangeLog:

	PR tree-optimization/119351
	* tree-vectorizer.h (LOOP_VINFO_MASK_NITERS_PFA_OFFSET,
	LOOP_VINFO_NON_LINEAR_IV): New.
	(class _loop_vec_info): Add mask_skip_niters_pfa_offset and
	nonlinear_iv.
	* tree-vect-loop.cc (_loop_vec_info::_loop_vec_info): Initialize them.
	(vect_analyze_scalar_cycles_1): Record non-linear inductions.
	(vectorizable_induction): If early break and PFA using masking create a
	new phi which tracks where the scalar code needs to start...
	(vectorizable_live_operation): ...and generate the adjustments here.
	(vect_use_loop_mask_for_alignment_p): Reject non-linear inductions and
	early break needing peeling.

gcc/testsuite/ChangeLog:

	PR tree-optimization/119351
	* gcc.target/aarch64/sve/peel_ind_10.c: New test.
	* gcc.target/aarch64/sve/peel_ind_10_run.c: New test.
	* gcc.target/aarch64/sve/peel_ind_5.c: New test.
	* gcc.target/aarch64/sve/peel_ind_5_run.c: New test.
	* gcc.target/aarch64/sve/peel_ind_6.c: New test.
	* gcc.target/aarch64/sve/peel_ind_6_run.c: New test.
	* gcc.target/aarch64/sve/peel_ind_7.c: New test.
	* gcc.target/aarch64/sve/peel_ind_7_run.c: New test.
	* gcc.target/aarch64/sve/peel_ind_8.c: New test.
	* gcc.target/aarch64/sve/peel_ind_8_run.c: New test.
	* gcc.target/aarch64/sve/peel_ind_9.c: New test.
	* gcc.target/aarch64/sve/peel_ind_9_run.c: New test.
commit 46ccce1de6
parent 473dde5252
Author: Tamar Christina
Date:   2025-04-16 13:09:05 +01:00
14 changed files with 357 additions and 3 deletions

gcc/testsuite/gcc.target/aarch64/sve/peel_ind_10.c (new file)

@@ -0,0 +1,24 @@
/* Fix for PR119351 alignment peeling with vectors and VLS. */
/* { dg-do compile } */
/* { dg-options "-Ofast -msve-vector-bits=256 --param aarch64-autovec-preference=sve-only -fdump-tree-vect-details" } */
#define N 512
#define START 0
#define END 505
int x[N] __attribute__((aligned(32)));
int __attribute__((noipa))
foo (int start)
{
  for (unsigned int i = start; i < END; ++i)
    {
      if (x[i] == 0)
        return i;
    }
  return -1;
}
/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
/* { dg-final { scan-tree-dump "pfa_iv_offset" "vect" } } */
/* { dg-final { scan-tree-dump "Alignment of access forced using peeling" "vect" } } */

gcc/testsuite/gcc.target/aarch64/sve/peel_ind_10_run.c (new file)

@@ -0,0 +1,17 @@
/* Fix for PR119351 alignment peeling with vectors and VLS. */
/* { dg-do run { target aarch64_sve_hw } } */
/* { dg-options "-Ofast --param aarch64-autovec-preference=sve-only" } */
/* { dg-additional-options "-msve-vector-bits=256" { target aarch64_sve256_hw } } */
/* { dg-additional-options "-msve-vector-bits=128" { target aarch64_sve128_hw } } */
#include "peel_ind_10.c"
int __attribute__ ((optimize (1)))
main (void)
{
  int res = foo (START);
  asm volatile ("");
  if (res != START)
    __builtin_abort ();
  return 0;
}

gcc/testsuite/gcc.target/aarch64/sve/peel_ind_5.c (new file)

@@ -0,0 +1,24 @@
/* Fix for PR119351 alignment peeling with vectors and VLS. */
/* { dg-do compile } */
/* { dg-options "-Ofast -msve-vector-bits=256 --param aarch64-autovec-preference=sve-only -fdump-tree-vect-details" } */
#define N 512
#define START 2
#define END 505
int x[N] __attribute__((aligned(32)));
int __attribute__((noipa))
foo (void)
{
  for (signed int i = START; i < END; ++i)
    {
      if (x[i] == 0)
        return i;
    }
  return -1;
}
/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
/* { dg-final { scan-tree-dump "pfa_iv_offset" "vect" } } */
/* { dg-final { scan-tree-dump "Alignment of access forced using peeling" "vect" } } */

gcc/testsuite/gcc.target/aarch64/sve/peel_ind_5_run.c (new file)

@@ -0,0 +1,17 @@
/* Fix for PR119351 alignment peeling with vectors and VLS. */
/* { dg-do run { target aarch64_sve_hw } } */
/* { dg-options "-Ofast --param aarch64-autovec-preference=sve-only" } */
/* { dg-additional-options "-msve-vector-bits=256" { target aarch64_sve256_hw } } */
/* { dg-additional-options "-msve-vector-bits=128" { target aarch64_sve128_hw } } */
#include "peel_ind_5.c"
int __attribute__ ((optimize (1)))
main (void)
{
  int res = foo ();
  asm volatile ("");
  if (res != START)
    __builtin_abort ();
  return 0;
}

gcc/testsuite/gcc.target/aarch64/sve/peel_ind_6.c (new file)

@@ -0,0 +1,24 @@
/* Fix for PR119351 alignment peeling with vectors and VLS. */
/* { dg-do compile } */
/* { dg-options "-Ofast -msve-vector-bits=256 --param aarch64-autovec-preference=sve-only -fdump-tree-vect-details" } */
#define N 512
#define START 1
#define END 505
int x[N] __attribute__((aligned(32)));
int __attribute__((noipa))
foo (int start)
{
  for (unsigned int i = start; i < END; ++i)
    {
      if (x[i] == 0)
        return i;
    }
  return -1;
}
/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
/* { dg-final { scan-tree-dump "pfa_iv_offset" "vect" } } */
/* { dg-final { scan-tree-dump "Alignment of access forced using peeling" "vect" } } */

gcc/testsuite/gcc.target/aarch64/sve/peel_ind_6_run.c (new file)

@@ -0,0 +1,17 @@
/* Fix for PR119351 alignment peeling with vectors and VLS. */
/* { dg-do run { target aarch64_sve_hw } } */
/* { dg-options "-Ofast --param aarch64-autovec-preference=sve-only" } */
/* { dg-additional-options "-msve-vector-bits=256" { target aarch64_sve256_hw } } */
/* { dg-additional-options "-msve-vector-bits=128" { target aarch64_sve128_hw } } */
#include "peel_ind_6.c"
int __attribute__ ((optimize (1)))
main (void)
{
  int res = foo (START);
  asm volatile ("");
  if (res != START)
    __builtin_abort ();
  return 0;
}

gcc/testsuite/gcc.target/aarch64/sve/peel_ind_7.c (new file)

@@ -0,0 +1,24 @@
/* Fix for PR119351 alignment peeling with vectors and VLS. */
/* { dg-do compile } */
/* { dg-options "-Ofast -msve-vector-bits=256 --param aarch64-autovec-preference=sve-only -fdump-tree-vect-details" } */
#define N 512
#define START 1
#define END 505
int x[N] __attribute__((aligned(32)));
int __attribute__((noipa))
foo (void)
{
  for (unsigned int i = START; i < END; ++i)
    {
      if (x[i] == 0)
        return i;
    }
  return -1;
}
/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
/* { dg-final { scan-tree-dump "pfa_iv_offset" "vect" } } */
/* { dg-final { scan-tree-dump "Alignment of access forced using peeling" "vect" } } */

gcc/testsuite/gcc.target/aarch64/sve/peel_ind_7_run.c (new file)

@@ -0,0 +1,17 @@
/* Fix for PR119351 alignment peeling with vectors and VLS. */
/* { dg-do run { target aarch64_sve_hw } } */
/* { dg-options "-Ofast --param aarch64-autovec-preference=sve-only" } */
/* { dg-additional-options "-msve-vector-bits=256" { target aarch64_sve256_hw } } */
/* { dg-additional-options "-msve-vector-bits=128" { target aarch64_sve128_hw } } */
#include "peel_ind_7.c"
int __attribute__ ((optimize (1)))
main (void)
{
  int res = foo ();
  asm volatile ("");
  if (res != START)
    __builtin_abort ();
  return 0;
}

gcc/testsuite/gcc.target/aarch64/sve/peel_ind_8.c (new file)

@@ -0,0 +1,24 @@
/* Fix for PR119351 alignment peeling with vectors and VLS. */
/* { dg-do compile } */
/* { dg-options "-Ofast -msve-vector-bits=256 --param aarch64-autovec-preference=sve-only -fdump-tree-vect-details" } */
#define N 512
#define START 1
#define END 505
int x[N] __attribute__((aligned(32)));
int __attribute__((noipa))
foo (void)
{
  for (unsigned int i = START; i < END; i*=2)
    {
      if (x[i] == 0)
        return i;
    }
  return -1;
}
/* { dg-final { scan-tree-dump-not "LOOP VECTORIZED" "vect" } } */
/* { dg-final { scan-tree-dump-not "pfa_iv_offset" "vect" } } */
/* { dg-final { scan-tree-dump-not "Alignment of access forced using peeling" "vect" } } */

gcc/testsuite/gcc.target/aarch64/sve/peel_ind_8_run.c (new file)

@@ -0,0 +1,17 @@
/* Fix for PR119351 alignment peeling with vectors and VLS. */
/* { dg-do run { target aarch64_sve_hw } } */
/* { dg-options "-Ofast --param aarch64-autovec-preference=sve-only" } */
/* { dg-additional-options "-msve-vector-bits=256" { target aarch64_sve256_hw } } */
/* { dg-additional-options "-msve-vector-bits=128" { target aarch64_sve128_hw } } */
#include "peel_ind_8.c"
int __attribute__ ((optimize (1)))
main (void)
{
  int res = foo ();
  asm volatile ("");
  if (res != START)
    __builtin_abort ();
  return 0;
}

gcc/testsuite/gcc.target/aarch64/sve/peel_ind_9.c (new file)

@@ -0,0 +1,25 @@
/* Fix for PR119351 alignment peeling with vectors and VLS. */
/* { dg-do compile } */
/* { dg-options "-Ofast -msve-vector-bits=256 --param aarch64-autovec-preference=sve-only -fdump-tree-vect-details" } */
#define N 512
#define START 1
#define END 505
int x[N] __attribute__((aligned(32)));
int __attribute__((noipa))
foo (void)
{
  for (int *p = x + START; p < x + END; p++)
    {
      if (*p == 0)
        return START;
    }
  return -1;
}
/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
/* Peels using a scalar loop. */
/* { dg-final { scan-tree-dump-not "pfa_iv_offset" "vect" } } */
/* { dg-final { scan-tree-dump "Alignment of access forced using peeling" "vect" } } */

gcc/testsuite/gcc.target/aarch64/sve/peel_ind_9_run.c (new file)

@@ -0,0 +1,17 @@
/* Fix for PR119351 alignment peeling with vectors and VLS. */
/* { dg-do run { target aarch64_sve_hw } } */
/* { dg-options "-Ofast --param aarch64-autovec-preference=sve-only" } */
/* { dg-additional-options "-msve-vector-bits=256" { target aarch64_sve256_hw } } */
/* { dg-additional-options "-msve-vector-bits=128" { target aarch64_sve128_hw } } */
#include "peel_ind_9.c"
int __attribute__ ((optimize (1)))
main (void)
{
  int res = foo ();
  asm volatile ("");
  if (res != START)
    __builtin_abort ();
  return 0;
}

gcc/tree-vect-loop.cc

@@ -653,6 +653,10 @@ vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop,
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
/* Mark if we have a non-linear IV. */
LOOP_VINFO_NON_LINEAR_IV (loop_vinfo)
= STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_vinfo) != vect_step_op_add;
}
@@ -1046,12 +1050,14 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
suggested_unroll_factor (1),
max_vectorization_factor (0),
mask_skip_niters (NULL_TREE),
mask_skip_niters_pfa_offset (NULL_TREE),
rgroup_compare_type (NULL_TREE),
simd_if_cond (NULL_TREE),
partial_vector_style (vect_partial_vectors_none),
unaligned_dr (NULL),
peeling_for_alignment (0),
ptr_mask (0),
nonlinear_iv (false),
ivexpr_map (NULL),
scan_map (NULL),
slp_unrolling_factor (1),
@@ -10678,6 +10684,54 @@ vectorizable_induction (loop_vec_info loop_vinfo,
LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
peel_mul = gimple_build_vector_from_val (&init_stmts,
step_vectype, peel_mul);
/* If early break then we have to create a new PHI which we can use as
an offset to adjust the induction reduction in early exits.
This is because when peeling for alignment using masking, the first
few elements of the vector can be inactive. As such if we find the
entry in the first iteration we have to adjust the starting point of
the scalar code.
We do this by creating a new scalar PHI that keeps track of whether
we are the first iteration of the loop (with the additional masking)
or whether we have taken a loop iteration already.
The generated sequence:
pre-header:
bb1:
i_1 = <number of leading inactive elements>
header:
bb2:
i_2 = PHI <i_1(bb1), 0(latch)>
early-exit:
bb3:
i_3 = iv_step * i_2 + PHI<vector-iv>
The first part of the adjustment to create i_1 and i_2 are done here
and the last part creating i_3 is done in
vectorizable_live_operations when the induction extraction is
materialized. */
if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
&& !LOOP_VINFO_MASK_NITERS_PFA_OFFSET (loop_vinfo))
{
auto skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
tree ty_skip_niters = TREE_TYPE (skip_niters);
tree break_lhs_phi = vect_get_new_vect_var (ty_skip_niters,
vect_scalar_var,
"pfa_iv_offset");
gphi *nphi = create_phi_node (break_lhs_phi, bb);
add_phi_arg (nphi, skip_niters, pe, UNKNOWN_LOCATION);
add_phi_arg (nphi, build_zero_cst (ty_skip_niters),
loop_latch_edge (iv_loop), UNKNOWN_LOCATION);
LOOP_VINFO_MASK_NITERS_PFA_OFFSET (loop_vinfo)
= PHI_RESULT (nphi);
}
}
tree step_mul = NULL_TREE;
unsigned ivn;
@@ -11565,8 +11619,10 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
/* For early exit where the exit is not in the BB that leads
to the latch then we're restarting the iteration in the
scalar loop. So get the first live value. */
if ((all_exits_as_early_p || !main_exit_edge)
&& STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
bool early_break_first_element_p
= (all_exits_as_early_p || !main_exit_edge)
&& STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def;
if (early_break_first_element_p)
{
tmp_vec_lhs = vec_lhs0;
tmp_bitstart = build_zero_cst (TREE_TYPE (bitstart));
@@ -11581,6 +11637,41 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
lhs_type, &exit_gsi);
auto gsi = gsi_for_stmt (use_stmt);
if (early_break_first_element_p
&& LOOP_VINFO_MASK_NITERS_PFA_OFFSET (loop_vinfo))
{
tree step_expr
= STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
tree break_lhs_phi
= LOOP_VINFO_MASK_NITERS_PFA_OFFSET (loop_vinfo);
tree ty_skip_niters = TREE_TYPE (break_lhs_phi);
gimple_seq iv_stmts = NULL;
/* Now create the PHI for the outside loop usage to
retrieve the value for the offset counter. */
tree rphi_step
= gimple_convert (&iv_stmts, ty_skip_niters, step_expr);
tree tmp2
= gimple_build (&iv_stmts, MULT_EXPR,
ty_skip_niters, rphi_step,
break_lhs_phi);
if (POINTER_TYPE_P (TREE_TYPE (new_tree)))
tmp2 = gimple_build (&iv_stmts, POINTER_PLUS_EXPR,
TREE_TYPE (new_tree), new_tree, tmp2);
else
{
tmp2 = gimple_convert (&iv_stmts, TREE_TYPE (new_tree),
tmp2);
tmp2 = gimple_build (&iv_stmts, PLUS_EXPR,
TREE_TYPE (new_tree), new_tree,
tmp2);
}
new_tree = tmp2;
gsi_insert_seq_before (&exit_gsi, iv_stmts, GSI_SAME_STMT);
}
tree lhs_phi = gimple_phi_result (use_stmt);
remove_phi_node (&gsi, false);
gimple *copy = gimple_build_assign (lhs_phi, new_tree);

gcc/tree-vectorizer.h

@@ -818,6 +818,11 @@ public:
elements that should be false in the first mask). */
tree mask_skip_niters;
/* If we are using a loop mask to align memory addresses and we're in an
early break loop then this variable contains the number of elements that
were skipped during the initial iteration of the loop. */
tree mask_skip_niters_pfa_offset;
/* The type that the loop control IV should be converted to before
testing which of the VF scalars are active and inactive.
Only meaningful if LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
@@ -854,6 +859,9 @@ public:
/* The mask used to check the alignment of pointers or arrays. */
int ptr_mask;
/* Indicates whether the loop has any non-linear IV. */
bool nonlinear_iv;
/* Data Dependence Relations defining address ranges that are candidates
for a run-time aliasing check. */
auto_vec<ddr_p> may_alias_ddrs;
@@ -1064,6 +1072,7 @@ public:
#define LOOP_VINFO_MASKS(L) (L)->masks
#define LOOP_VINFO_LENS(L) (L)->lens
#define LOOP_VINFO_MASK_SKIP_NITERS(L) (L)->mask_skip_niters
#define LOOP_VINFO_MASK_NITERS_PFA_OFFSET(L) (L)->mask_skip_niters_pfa_offset
#define LOOP_VINFO_RGROUP_COMPARE_TYPE(L) (L)->rgroup_compare_type
#define LOOP_VINFO_RGROUP_IV_TYPE(L) (L)->rgroup_iv_type
#define LOOP_VINFO_PARTIAL_VECTORS_STYLE(L) (L)->partial_vector_style
@@ -1073,6 +1082,7 @@ public:
#define LOOP_VINFO_DDRS(L) (L)->shared->ddrs
#define LOOP_VINFO_INT_NITERS(L) (TREE_INT_CST_LOW ((L)->num_iters))
#define LOOP_VINFO_PEELING_FOR_ALIGNMENT(L) (L)->peeling_for_alignment
#define LOOP_VINFO_NON_LINEAR_IV(L) (L)->nonlinear_iv
#define LOOP_VINFO_UNALIGNED_DR(L) (L)->unaligned_dr
#define LOOP_VINFO_MAY_MISALIGN_STMTS(L) (L)->may_misalign_stmts
#define LOOP_VINFO_MAY_ALIAS_DDRS(L) (L)->may_alias_ddrs
@@ -2138,8 +2148,14 @@ unlimited_cost_model (loop_p loop)
inline bool
vect_use_loop_mask_for_alignment_p (loop_vec_info loop_vinfo)
{
/* With early break vectorization we don't know whether the accesses will stay
inside the loop or not. TODO: The early break adjustment code can be
implemented the same way as vectorizable_linear_induction. However we
can't test this today so reject it. */
return (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
&& LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
&& LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
&& !(LOOP_VINFO_NON_LINEAR_IV (loop_vinfo)
&& LOOP_VINFO_EARLY_BREAKS (loop_vinfo)));
}
/* Return the number of vectors of type VECTYPE that are needed to get