Also lower SLP grouped loads with just one consumer
This makes sure to produce interleaving schemes or load-lanes for single-element interleaving and other permutes that otherwise would use more than three vectors.

It exposes the latent issue that single-element interleaving with large gaps can be inefficient - the mitigation in get_group_load_store_type doesn't trigger when we clear the load permutation.

It also exposes the fact that not all permutes can be lowered in the best way in a vector-length-agnostic way, so I've added an exception to keep power-of-two sized, contiguous, aligned chunks unlowered (unless we want load-lanes).

The optimal handling of load/store vectorization is going to continue to be a learning process.

* tree-vect-slp.cc (vect_lower_load_permutations): Also process single-use grouped loads. Avoid lowering contiguous aligned power-of-two sized chunks, those are better handled by the vector size specific SLP code generation.
* tree-vect-stmts.cc (get_group_load_store_type): Drop the unrelated requirement of a load permutation for the single-element interleaving limit.
* gcc.dg/vect/slp-46.c: Remove XFAIL.
This commit is contained in:
parent
4292297a0f
commit
7164d98266
3 changed files with 39 additions and 20 deletions
|
@ -98,4 +98,4 @@ main ()
|
|||
return 0;
|
||||
}
|
||||
|
||||
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 4 "vect" { xfail { vect_load_lanes && vect_variable_length } } } } */
|
||||
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 4 "vect" } } */
|
||||
|
|
|
@ -4315,6 +4315,37 @@ vect_lower_load_permutations (loop_vec_info loop_vinfo,
|
|||
&& ld_lanes_lanes == 0)
|
||||
continue;
|
||||
|
||||
/* Build the permute to get the original load permutation order. */
|
||||
bool contiguous = true;
|
||||
lane_permutation_t final_perm;
|
||||
final_perm.create (SLP_TREE_LANES (load));
|
||||
for (unsigned i = 0; i < SLP_TREE_LANES (load); ++i)
|
||||
{
|
||||
final_perm.quick_push
|
||||
(std::make_pair (0, SLP_TREE_LOAD_PERMUTATION (load)[i]));
|
||||
if (i != 0
|
||||
&& (SLP_TREE_LOAD_PERMUTATION (load)[i]
|
||||
!= SLP_TREE_LOAD_PERMUTATION (load)[i-1] + 1))
|
||||
contiguous = false;
|
||||
}
|
||||
|
||||
/* When the load permutation accesses a contiguous unpermuted,
|
||||
power-of-two aligned and sized chunk leave the load alone.
|
||||
We can likely (re-)load it more efficiently rather than
|
||||
extracting it from the larger load.
|
||||
??? Long-term some of the lowering should move to where
|
||||
the vector types involved are fixed. */
|
||||
if (ld_lanes_lanes == 0
|
||||
&& contiguous
|
||||
&& (SLP_TREE_LANES (load) > 1 || loads.size () == 1)
|
||||
&& pow2p_hwi (SLP_TREE_LANES (load))
|
||||
&& SLP_TREE_LOAD_PERMUTATION (load)[0] % SLP_TREE_LANES (load) == 0
|
||||
&& group_lanes % SLP_TREE_LANES (load) == 0)
|
||||
{
|
||||
final_perm.release ();
|
||||
continue;
|
||||
}
|
||||
|
||||
/* First build (and possibly re-use) a load node for the
|
||||
unpermuted group. Gaps in the middle and on the end are
|
||||
represented with NULL stmts. */
|
||||
|
@ -4338,13 +4369,6 @@ vect_lower_load_permutations (loop_vec_info loop_vinfo,
|
|||
&max_nunits, matches, &limit,
|
||||
&tree_size, bst_map);
|
||||
|
||||
/* Build the permute to get the original load permutation order. */
|
||||
lane_permutation_t final_perm;
|
||||
final_perm.create (SLP_TREE_LANES (load));
|
||||
for (unsigned i = 0; i < SLP_TREE_LANES (load); ++i)
|
||||
final_perm.quick_push
|
||||
(std::make_pair (0, SLP_TREE_LOAD_PERMUTATION (load)[i]));
|
||||
|
||||
if (ld_lanes_lanes != 0)
|
||||
{
|
||||
/* ??? If this is not in sync with what get_load_store_type
|
||||
|
@ -4503,20 +4527,16 @@ vect_lower_load_permutations (loop_vec_info loop_vinfo,
|
|||
&& STMT_VINFO_GROUPED_ACCESS (b0)
|
||||
&& DR_GROUP_FIRST_ELEMENT (a0) == DR_GROUP_FIRST_ELEMENT (b0))
|
||||
continue;
|
||||
/* Just one SLP load of a possible group, leave those alone. */
|
||||
if (i == firsti + 1)
|
||||
{
|
||||
firsti = i;
|
||||
continue;
|
||||
}
|
||||
/* Now we have multiple SLP loads of the same group from
|
||||
/* Now we have one or multiple SLP loads of the same group from
|
||||
firsti to i - 1. */
|
||||
vect_lower_load_permutations (loop_vinfo, bst_map,
|
||||
make_array_slice (&loads[firsti],
|
||||
i - firsti));
|
||||
if (STMT_VINFO_GROUPED_ACCESS (a0))
|
||||
vect_lower_load_permutations (loop_vinfo, bst_map,
|
||||
make_array_slice (&loads[firsti],
|
||||
i - firsti));
|
||||
firsti = i;
|
||||
}
|
||||
if (firsti < loads.length () - 1)
|
||||
if (firsti < loads.length ()
|
||||
&& STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (loads[firsti])[0]))
|
||||
vect_lower_load_permutations (loop_vinfo, bst_map,
|
||||
make_array_slice (&loads[firsti],
|
||||
loads.length () - firsti));
|
||||
|
|
|
@ -2187,7 +2187,6 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
|
|||
blow up memory, see PR65518). */
|
||||
if (loop_vinfo
|
||||
&& *memory_access_type == VMAT_CONTIGUOUS
|
||||
&& SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()
|
||||
&& single_element_p
|
||||
&& maybe_gt (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
|
||||
{
|
||||
|
|
Loading…
Add table
Reference in a new issue