move permutation validity check

This delays the SLP permutation check to vectorizable_load and optimizes
permutations only after all SLP instances have been generated and the
vectorization factor is determined.

2020-05-08  Richard Biener  <rguenther@suse.de>

	* tree-vectorizer.h (vec_info::slp_loads): New.
	(vect_optimize_slp): Declare.
	* tree-vect-slp.c (vect_attempt_slp_rearrange_stmts):  Do
	nothing when there are no loads.
	(vect_gather_slp_loads): Gather loads into a vector.
	(vect_supported_load_permutation_p): Remove.
	(vect_analyze_slp_instance): Do not verify permutation
	validity here.
	(vect_analyze_slp): Optimize permutations of reductions
	after all SLP instances have been gathered and gather
	all loads.
	(vect_optimize_slp): New function split out from
	vect_supported_load_permutation_p.  Elide some permutations.
	(vect_slp_analyze_bb_1): Call vect_optimize_slp.
	* tree-vect-loop.c (vect_analyze_loop_2): Likewise.
	* tree-vect-stmts.c (vectorizable_load): Check whether
	the load can be permuted.  When generating code assert we can.

	* gcc.dg/vect/bb-slp-pr68892.c: Adjust for not supported
	SLP permutations becoming builds from scalars.
	* gcc.dg/vect/bb-slp-pr78205.c: Likewise.
	* gcc.dg/vect/bb-slp-34.c: Likewise.
This commit is contained in:
Richard Biener 2020-02-24 15:36:40 +01:00
parent debfaee5d5
commit bc484e2509
9 changed files with 180 additions and 180 deletions

View file

@ -1,3 +1,23 @@
2020-05-08 Richard Biener <rguenther@suse.de>
* tree-vectorizer.h (vec_info::slp_loads): New.
(vect_optimize_slp): Declare.
* tree-vect-slp.c (vect_attempt_slp_rearrange_stmts): Do
nothing when there are no loads.
(vect_gather_slp_loads): Gather loads into a vector.
(vect_supported_load_permutation_p): Remove.
(vect_analyze_slp_instance): Do not verify permutation
validity here.
(vect_analyze_slp): Optimize permutations of reductions
after all SLP instances have been gathered and gather
all loads.
(vect_optimize_slp): New function split out from
vect_supported_load_permutation_p. Elide some permutations.
(vect_slp_analyze_bb_1): Call vect_optimize_slp.
* tree-vect-loop.c (vect_analyze_loop_2): Likewise.
* tree-vect-stmts.c (vectorizable_load): Check whether
the load can be permuted. When generating code assert we can.
2020-05-08 Richard Biener <rguenther@suse.de>
* tree-ssa-sccvn.c (rpo_avail): Change type to

View file

@ -1,3 +1,10 @@
2020-05-08 Richard Biener <rguenther@suse.de>
* gcc.dg/vect/bb-slp-pr68892.c: Adjust for not supported
SLP permutations becoming builds from scalars.
* gcc.dg/vect/bb-slp-pr78205.c: Likewise.
* gcc.dg/vect/bb-slp-34.c: Likewise.
2020-05-08 Nathan Sidwell <nathan@acm.org>
* c-c++-common/raw-string-6.c: Adjust EOF error location.

View file

@ -32,5 +32,4 @@ int main()
return 0;
}
/* ??? XFAILed because we access "excess" elements with the permutation. */
/* { dg-final { scan-tree-dump "basic block vectorized" "slp2" { target vect_perm xfail { ! { aarch64*-*-* arm*-*-* } } } } } */
/* { dg-final { scan-tree-dump "basic block vectorized" "slp2" { target vect_perm } } } */

View file

@ -13,7 +13,8 @@ void foo(void)
b[3] = a[3][0];
}
/* ??? The profitability check is not reached because we give up on the
gaps we access earlier. */
/* ??? Due to the gaps we fall back to scalar loads which makes the
vectorization profitable. */
/* { dg-final { scan-tree-dump "not profitable" "slp2" { xfail *-*-* } } } */
/* { dg-final { scan-tree-dump-times "Basic block will be vectorized" 0 "slp2" } } */
/* { dg-final { scan-tree-dump-times "BB vectorization with gaps at the end of a load is not supported" 1 "slp2" } } */
/* { dg-final { scan-tree-dump-times "Basic block will be vectorized" 1 "slp2" } } */

View file

@ -19,7 +19,9 @@ void foo ()
}
/* We may not vectorize the store to x[] as it accesses c out-of bounds
but we do want to vectorize the other two store groups. */
but we do want to vectorize the other two store groups. But we may
end up using scalar loads to vectorize the last group. */
/* { dg-final { scan-tree-dump-times "basic block vectorized" 1 "slp2" } } */
/* { dg-final { scan-tree-dump-times "x\\\[\[0-1\]\\\] = " 2 "optimized" } } */
/* { dg-final { scan-tree-dump-times "BB vectorization with gaps at the end of a load is not supported" 1 "slp2" } } */
/* { dg-final { scan-tree-dump-times " = c\\\[4\\\];" 1 "optimized" } } */

View file

@ -2049,6 +2049,9 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
/* Update the vectorization factor based on the SLP decision. */
vect_update_vf_for_slp (loop_vinfo);
/* Optimize the SLP graph with the vectorization factor fixed. */
vect_optimize_slp (loop_vinfo);
}
bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);

View file

@ -1815,6 +1815,9 @@ vect_attempt_slp_rearrange_stmts (slp_instance slp_instn)
unsigned int lidx;
slp_tree node, load;
if (SLP_INSTANCE_LOADS (slp_instn).is_empty ())
return false;
/* Compare all the permutation sequences to the first one. We know
that at least one load is permuted. */
node = SLP_INSTANCE_LOADS (slp_instn)[0];
@ -1889,7 +1892,7 @@ vect_attempt_slp_rearrange_stmts (slp_instance slp_instn)
/* Gather loads in the SLP graph NODE and populate the INST loads array. */
static void
vect_gather_slp_loads (slp_instance inst, slp_tree node,
vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
hash_set<slp_tree> &visited)
{
if (visited.add (node))
@ -1902,14 +1905,14 @@ vect_gather_slp_loads (slp_instance inst, slp_tree node,
stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
&& DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
SLP_INSTANCE_LOADS (inst).safe_push (node);
loads.safe_push (node);
}
else
{
unsigned i;
slp_tree child;
FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
vect_gather_slp_loads (inst, child, visited);
vect_gather_slp_loads (loads, child, visited);
}
}
@ -1917,137 +1920,7 @@ static void
vect_gather_slp_loads (slp_instance inst, slp_tree node)
{
hash_set<slp_tree> visited;
vect_gather_slp_loads (inst, node, visited);
}
/* Check if the required load permutations in the SLP instance
SLP_INSTN are supported. */
static bool
vect_supported_load_permutation_p (vec_info *vinfo, slp_instance slp_instn)
{
unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
unsigned int i, j, k, next;
slp_tree node;
if (dump_enabled_p ())
{
dump_printf_loc (MSG_NOTE, vect_location, "Load permutation ");
FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
if (node->load_permutation.exists ())
FOR_EACH_VEC_ELT (node->load_permutation, j, next)
dump_printf (MSG_NOTE, "%d ", next);
else
for (k = 0; k < group_size; ++k)
dump_printf (MSG_NOTE, "%d ", k);
dump_printf (MSG_NOTE, "\n");
}
/* In case of reduction every load permutation is allowed, since the order
of the reduction statements is not important (as opposed to the case of
grouped stores). The only condition we need to check is that all the
load nodes are of the same size and have the same permutation (and then
rearrange all the nodes of the SLP instance according to this
permutation). */
/* Check that all the load nodes are of the same size. */
/* ??? Can't we assert this? */
FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
if (SLP_TREE_SCALAR_STMTS (node).length () != (unsigned) group_size)
return false;
node = SLP_INSTANCE_TREE (slp_instn);
stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
/* Reduction (there are no data-refs in the root).
In reduction chain the order of the loads is not important. */
if (!STMT_VINFO_DATA_REF (stmt_info)
&& !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
vect_attempt_slp_rearrange_stmts (slp_instn);
/* In basic block vectorization we allow any subchain of an interleaving
chain.
FORNOW: not supported in loop SLP because of realignment compications. */
if (is_a <bb_vec_info> (vinfo))
{
/* Check whether the loads in an instance form a subchain and thus
no permutation is necessary. */
FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
{
if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
continue;
bool subchain_p = true;
stmt_vec_info next_load_info = NULL;
stmt_vec_info load_info;
FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
{
if (j != 0
&& (next_load_info != load_info
|| DR_GROUP_GAP (load_info) != 1))
{
subchain_p = false;
break;
}
next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
}
if (subchain_p)
SLP_TREE_LOAD_PERMUTATION (node).release ();
else
{
stmt_vec_info group_info = SLP_TREE_SCALAR_STMTS (node)[0];
group_info = DR_GROUP_FIRST_ELEMENT (group_info);
unsigned HOST_WIDE_INT nunits;
unsigned k, maxk = 0;
FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), j, k)
if (k > maxk)
maxk = k;
/* In BB vectorization we may not actually use a loaded vector
accessing elements in excess of DR_GROUP_SIZE. */
tree vectype = STMT_VINFO_VECTYPE (group_info);
if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
|| maxk >= (DR_GROUP_SIZE (group_info) & ~(nunits - 1)))
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
"BB vectorization with gaps at the end of "
"a load is not supported\n");
return false;
}
/* Verify the permutation can be generated. */
vec<tree> tem;
unsigned n_perms;
if (!vect_transform_slp_perm_load (vinfo, node, tem, NULL,
1, true, &n_perms))
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION,
vect_location,
"unsupported load permutation\n");
return false;
}
}
}
return true;
}
/* For loop vectorization verify we can generate the permutation. Be
conservative about the vectorization factor, there are permutations
that will use three vector inputs only starting from a specific factor
and the vectorization factor is not yet final.
??? The SLP instance unrolling factor might not be the maximum one. */
unsigned n_perms;
poly_uint64 test_vf
= force_common_multiple (SLP_INSTANCE_UNROLLING_FACTOR (slp_instn),
LOOP_VINFO_VECT_FACTOR
(as_a <loop_vec_info> (vinfo)));
FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
if (node->load_permutation.exists ()
&& !vect_transform_slp_perm_load (vinfo, node, vNULL, NULL, test_vf,
true, &n_perms))
return false;
return true;
vect_gather_slp_loads (SLP_INSTANCE_LOADS (inst), node, visited);
}
@ -2289,7 +2162,7 @@ vect_analyze_slp_instance (vec_info *vinfo,
"SLP size %u vs. limit %u.\n",
tree_size, max_tree_size);
/* Compute the load permutation. */
/* Check whether any load is possibly permuted. */
slp_tree load_node;
bool loads_permuted = false;
FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance), i, load_node)
@ -2298,43 +2171,15 @@ vect_analyze_slp_instance (vec_info *vinfo,
continue;
unsigned j;
stmt_vec_info load_info;
bool this_load_permuted = false;
stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT
(SLP_TREE_SCALAR_STMTS (load_node)[0]);
FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
{
this_load_permuted = true;
loads_permuted = true;
break;
}
if (!this_load_permuted
/* The load requires permutation when unrolling exposes
a gap either because the group is larger than the SLP
group-size or because there is a gap between the groups. */
&& (known_eq (unrolling_factor, 1U)
|| (group_size == DR_GROUP_SIZE (first_stmt_info)
&& DR_GROUP_GAP (first_stmt_info) == 0)))
{
SLP_TREE_LOAD_PERMUTATION (load_node).release ();
continue;
}
loads_permuted = true;
}
if (loads_permuted)
{
if (!vect_supported_load_permutation_p (vinfo, new_instance))
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
"Build SLP failed: unsupported load "
"permutation %G", stmt_info->stmt);
vect_free_slp_instance (new_instance, false);
return false;
}
}
/* If the loads and stores can be handled with load/store-lan
/* If the loads and stores can be handled with load/store-lane
instructions do not generate this SLP instance. */
if (is_a <loop_vec_info> (vinfo)
&& loads_permuted
@ -2514,9 +2359,93 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
vect_free_slp_tree ((*it).second, false);
delete bst_map;
/* Optimize permutations in SLP reductions. */
slp_instance instance;
FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
{
slp_tree node = SLP_INSTANCE_TREE (instance);
stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
/* Reduction (there are no data-refs in the root).
In reduction chain the order of the loads is not important. */
if (!STMT_VINFO_DATA_REF (stmt_info)
&& !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
vect_attempt_slp_rearrange_stmts (instance);
}
/* Gather all loads in the SLP graph. */
hash_set<slp_tree> visited;
FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
vect_gather_slp_loads (vinfo->slp_loads, SLP_INSTANCE_TREE (instance),
visited);
return opt_result::success ();
}
void
vect_optimize_slp (vec_info *vinfo)
{
slp_tree node;
unsigned i;
FOR_EACH_VEC_ELT (vinfo->slp_loads, i, node)
{
if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
continue;
/* In basic block vectorization we allow any subchain of an interleaving
chain.
FORNOW: not in loop SLP because of realignment complications. */
if (is_a <bb_vec_info> (vinfo))
{
bool subchain_p = true;
stmt_vec_info next_load_info = NULL;
stmt_vec_info load_info;
unsigned j;
FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
{
if (j != 0
&& (next_load_info != load_info
|| DR_GROUP_GAP (load_info) != 1))
{
subchain_p = false;
break;
}
next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
}
if (subchain_p)
{
SLP_TREE_LOAD_PERMUTATION (node).release ();
continue;
}
}
else
{
stmt_vec_info load_info;
bool this_load_permuted = false;
unsigned j;
FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
{
this_load_permuted = true;
break;
}
stmt_vec_info first_stmt_info
= DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
if (!this_load_permuted
/* The load requires permutation when unrolling exposes
a gap either because the group is larger than the SLP
group-size or because there is a gap between the groups. */
&& (known_eq (LOOP_VINFO_VECT_FACTOR (as_a <loop_vec_info> (vinfo)), 1U)
|| ((SLP_TREE_SCALAR_STMTS (node).length ()
== DR_GROUP_SIZE (first_stmt_info))
&& DR_GROUP_GAP (first_stmt_info) == 0)))
{
SLP_TREE_LOAD_PERMUTATION (node).release ();
continue;
}
}
}
}
/* For each possible SLP instance decide whether to SLP it and calculate overall
unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
@ -3180,6 +3109,9 @@ vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal)
return false;
}
/* Optimize permutations. */
vect_optimize_slp (bb_vinfo);
vect_record_base_alignments (bb_vinfo);
/* Analyze and verify the alignment of data references and the

View file

@ -8781,7 +8781,44 @@ vectorizable_load (vec_info *vinfo,
}
if (slp && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
slp_perm = true;
{
slp_perm = true;
if (!loop_vinfo)
{
/* In BB vectorization we may not actually use a loaded vector
accessing elements in excess of DR_GROUP_SIZE. */
stmt_vec_info group_info = SLP_TREE_SCALAR_STMTS (slp_node)[0];
group_info = DR_GROUP_FIRST_ELEMENT (group_info);
unsigned HOST_WIDE_INT nunits;
unsigned j, k, maxk = 0;
FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (slp_node), j, k)
if (k > maxk)
maxk = k;
tree vectype = STMT_VINFO_VECTYPE (group_info);
if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
|| maxk >= (DR_GROUP_SIZE (group_info) & ~(nunits - 1)))
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
"BB vectorization with gaps at the end of "
"a load is not supported\n");
return false;
}
}
auto_vec<tree> tem;
unsigned n_perms;
if (!vect_transform_slp_perm_load (vinfo, slp_node, tem, NULL, vf,
true, &n_perms))
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION,
vect_location,
"unsupported load permutation\n");
return false;
}
}
/* Invalidate assumptions made by dependence analysis when vectorization
on the unrolled body effectively re-orders stmts. */
@ -9896,12 +9933,9 @@ vectorizable_load (vec_info *vinfo,
if (slp_perm)
{
unsigned n_perms;
if (!vect_transform_slp_perm_load (vinfo, slp_node, dr_chain, gsi, vf,
false, &n_perms))
{
dr_chain.release ();
return false;
}
bool ok = vect_transform_slp_perm_load (vinfo, slp_node, dr_chain,
gsi, vf, false, &n_perms);
gcc_assert (ok);
}
else
{

View file

@ -324,8 +324,9 @@ public:
/* The mapping of GIMPLE UID to stmt_vec_info. */
vec<stmt_vec_info> stmt_vec_infos;
/* All SLP instances. */
/* The SLP graph. */
auto_vec<slp_instance> slp_instances;
auto_vec<slp_tree> slp_loads;
/* Maps base addresses to an innermost_loop_behavior that gives the maximum
known alignment for that base. */
@ -1858,6 +1859,7 @@ extern void vect_schedule_slp (vec_info *);
extern opt_result vect_analyze_slp (vec_info *, unsigned);
extern bool vect_make_slp_decision (loop_vec_info);
extern void vect_detect_hybrid_slp (loop_vec_info);
extern void vect_optimize_slp (vec_info *);
extern void vect_get_slp_defs (vec_info *, slp_tree, vec<vec<tree> > *,
unsigned n = -1U);
extern bool vect_slp_bb (basic_block);