Account for the cost of generating loop masks

vect_estimate_min_profitable_iters didn't take the cost of generating
loop masks into account, and so the vectorizer tended to underestimate
the cost of loops that need multiple masks.

2019-11-13  Richard Sandiford  <richard.sandiford@arm.com>

gcc/
	* tree-vect-loop.c (vect_estimate_min_profitable_iters): Include
	the cost of generating loop masks.

gcc/testsuite/
	* gcc.target/aarch64/sve/mask_struct_store_3.c: Add
	-fno-vect-cost-model.
	* gcc.target/aarch64/sve/mask_struct_store_3_run.c: Likewise.
	* gcc.target/aarch64/sve/peel_ind_2.c: Likewise.
	* gcc.target/aarch64/sve/peel_ind_2_run.c: Likewise.
	* gcc.target/aarch64/sve/peel_ind_3.c: Likewise.
	* gcc.target/aarch64/sve/peel_ind_3_run.c: Likewise.

From-SVN: r278125
This commit is contained in:
Richard Sandiford 2019-11-13 09:12:17 +00:00 committed by Richard Sandiford
parent 6eed64b96d
commit 61e5f2df03
9 changed files with 48 additions and 7 deletions

View file

@ -1,3 +1,8 @@
2019-11-13 Richard Sandiford <richard.sandiford@arm.com>
* tree-vect-loop.c (vect_estimate_min_profitable_iters): Include
the cost of generating loop masks.
2019-11-13 Richard Sandiford <richard.sandiford@arm.com>
* tree-vectorizer.h (vect_apply_runtime_profitability_check_p):

View file

@ -1,3 +1,13 @@
2019-11-13 Richard Sandiford <richard.sandiford@arm.com>
* gcc.target/aarch64/sve/mask_struct_store_3.c: Add
-fno-vect-cost-model.
* gcc.target/aarch64/sve/mask_struct_store_3_run.c: Likewise.
* gcc.target/aarch64/sve/peel_ind_2.c: Likewise.
* gcc.target/aarch64/sve/peel_ind_2_run.c: Likewise.
* gcc.target/aarch64/sve/peel_ind_3.c: Likewise.
* gcc.target/aarch64/sve/peel_ind_3_run.c: Likewise.
2019-11-13 Richard Sandiford <richard.sandiford@arm.com>
PR c++/92206

View file

@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */
+/* { dg-options "-O2 -ftree-vectorize -ffast-math -fno-vect-cost-model" } */
#include <stdint.h>

View file

@ -1,5 +1,5 @@
/* { dg-do run { target aarch64_sve_hw } } */
-/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */
+/* { dg-options "-O2 -ftree-vectorize -ffast-math -fno-vect-cost-model" } */
#include "mask_struct_store_3.c"

View file

@ -1,7 +1,7 @@
/* { dg-do compile } */
/* Pick an arbitrary target for which unaligned accesses are more
expensive. */
-/* { dg-options "-O3 -msve-vector-bits=256 -mtune=thunderx" } */
+/* { dg-options "-O3 -msve-vector-bits=256 -mtune=thunderx -fno-vect-cost-model" } */
#define N 512
#define START 7

View file

@ -1,6 +1,6 @@
/* { dg-do run { target aarch64_sve_hw } } */
/* { dg-options "-O3 -mtune=thunderx" } */
-/* { dg-options "-O3 -mtune=thunderx -msve-vector-bits=256" { target aarch64_sve256_hw } } */
+/* { dg-options "-O3 -mtune=thunderx -msve-vector-bits=256 -fno-vect-cost-model" { target aarch64_sve256_hw } } */
#include "peel_ind_2.c"

View file

@ -1,7 +1,7 @@
/* { dg-do compile } */
/* Pick an arbitrary target for which unaligned accesses are more
expensive. */
-/* { dg-options "-O3 -msve-vector-bits=256 -mtune=thunderx" } */
+/* { dg-options "-O3 -msve-vector-bits=256 -mtune=thunderx -fno-vect-cost-model" } */
#define N 32
#define MAX_START 8

View file

@ -1,6 +1,6 @@
/* { dg-do run { target aarch64_sve_hw } } */
-/* { dg-options "-O3 -mtune=thunderx" } */
-/* { dg-options "-O3 -mtune=thunderx -msve-vector-bits=256" { target aarch64_sve256_hw } } */
+/* { dg-options "-O3 -mtune=thunderx -fno-vect-cost-model" } */
+/* { dg-options "-O3 -mtune=thunderx -msve-vector-bits=256 -fno-vect-cost-model" { target aarch64_sve256_hw } } */
#include "peel_ind_3.c"

View file

@ -3291,6 +3291,32 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
si->kind, si->stmt_info, si->misalign,
vect_epilogue);
}
/* Calculate how many masks we need to generate. */
unsigned int num_masks = 0;
rgroup_masks *rgm;
unsigned int num_vectors_m1;
FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
if (rgm->mask_type)
num_masks += num_vectors_m1 + 1;
gcc_assert (num_masks > 0);
/* In the worst case, we need to generate each mask in the prologue
and in the loop body. One of the loop body mask instructions
replaces the comparison in the scalar loop, and since we don't
count the scalar comparison against the scalar body, we shouldn't
count that vector instruction against the vector body either.
Sometimes we can use unpacks instead of generating prologue
masks and sometimes the prologue mask will fold to a constant,
so the actual prologue cost might be smaller. However, it's
simpler and safer to use the worst-case cost; if this ends up
being the tie-breaker between vectorizing or not, then it's
probably better not to vectorize. */
(void) add_stmt_cost (target_cost_data, num_masks, vector_stmt,
NULL, 0, vect_prologue);
(void) add_stmt_cost (target_cost_data, num_masks - 1, vector_stmt,
NULL, 0, vect_body);
}
else if (npeel < 0)
{