Vect: use a small step to calculate induction for the unrolled loop (PR tree-optimization/110449)

If a loop is unrolled n times during vectorization, two steps are used to
calculate the induction variable:
  - The small step for the unrolled ith-copy: vec_1 = vec_iv + (VF/n * Step)
  - The large step for the whole loop: vec_loop = vec_iv + (VF * Step)

This patch calculates an extra vec_n to replace vec_loop:
  vec_n = vec_prev + (VF/n * S) = vec_iv + (VF/n * S) * n = vec_loop.

This saves the large step register and the related operations.

gcc/ChangeLog:

	PR tree-optimization/110449
	* tree-vect-loop.cc (vectorizable_induction): use vec_n to replace
	vec_loop for the unrolled loop.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/pr110449.c: New testcase.
This commit is contained in:
Hao Liu OS 2023-07-06 10:04:46 -06:00 committed by Jeff Law
parent b90a70984a
commit 224fd59b2d
2 changed files with 58 additions and 3 deletions

View file

@ -0,0 +1,40 @@
/* { dg-do compile } */
/* { dg-options "-Ofast -mcpu=neoverse-n2 --param aarch64-vect-unroll-limit=2" } */
/* { dg-final { scan-assembler-not "8.0e\\+0" } } */
/* Calculate the vectorized induction with a smaller step for an unrolled loop.
before (suggested_unroll_factor=2):
fmov s30, 8.0e+0
fmov s31, 4.0e+0
dup v27.4s, v30.s[0]
dup v28.4s, v31.s[0]
.L6:
mov v30.16b, v31.16b
fadd v31.4s, v31.4s, v27.4s
fadd v29.4s, v30.4s, v28.4s
stp q30, q29, [x0]
add x0, x0, 32
cmp x1, x0
bne .L6
after:
fmov s31, 4.0e+0
dup v29.4s, v31.s[0]
.L6:
fadd v30.4s, v31.4s, v29.4s
stp q31, q30, [x0]
add x0, x0, 32
fadd v31.4s, v29.4s, v30.4s
cmp x0, x1
bne .L6 */
/* Fill the 1024 elements of ARR with an arithmetic sequence: element I
   receives the running value of FREQ, which starts at the passed-in
   value and is incremented by STEP after every store.  FREQ is thus a
   floating-point value carried from one loop iteration to the next.  */
void
foo2 (float *arr, float freq, float step)
{
for (int i = 0; i < 1024; i++)
{
arr[i] = freq;
/* Advance the running value for the next iteration.  */
freq += step;
}
}

View file

@ -10098,7 +10098,7 @@ vectorizable_induction (loop_vec_info loop_vinfo,
new_vec, step_vectype, NULL);
vec_def = induc_def;
for (i = 1; i < ncopies; i++)
for (i = 1; i < ncopies + 1; i++)
{
/* vec_i = vec_prev + vec_step */
gimple_seq stmts = NULL;
@ -10108,8 +10108,23 @@ vectorizable_induction (loop_vec_info loop_vinfo,
vec_def = gimple_convert (&stmts, vectype, vec_def);
gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
new_stmt = SSA_NAME_DEF_STMT (vec_def);
STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
if (i < ncopies)
{
new_stmt = SSA_NAME_DEF_STMT (vec_def);
STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
}
else
{
/* vec_1 = vec_iv + (VF/n * S)
vec_2 = vec_1 + (VF/n * S)
...
vec_n = vec_prev + (VF/n * S) = vec_iv + VF * S = vec_loop
vec_n is used as vec_loop to save the large step register and
related operations. */
add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
UNKNOWN_LOCATION);
}
}
}