tree-optimization/96043 - BB vectorization costing improvement

This makes the BB vectorizer cost independent SLP subgraphs
separately.  While on pristine trunk and for x86_64 I failed to
distill a testcase where the vectorizer would think _any_
basic-block vectorization opportunity is not profitable I do
have pending work that would make the cost savings of a
profitable opportunity make another independently not
profitable opportunity vectorized.

2020-09-08  Richard Biener  <rguenther@suse.de>

	PR tree-optimization/96043
	* tree-vectorizer.h (_slp_instance::cost_vec): New.
	(_slp_instance::subgraph_entries): Likewise.
	(BB_VINFO_TARGET_COST_DATA): Remove.
	* tree-vect-slp.c (vect_free_slp_instance): Free
	cost_vec and subgraph_entries.
	(vect_analyze_slp_instance): Initialize them.
	(vect_slp_analyze_operations): Defer passing costs to
	the target, instead record them in the SLP graph entry.
	(get_ultimate_leader): New helper for graph partitioning.
	(vect_bb_partition_graph_r): Likewise.
	(vect_bb_partition_graph): New function to partition the
	SLP graph into independently costable parts.
	(vect_bb_vectorization_profitable_p): Adjust to work on
	a subgraph.
	(vect_bb_vectorization_profitable_p): New wrapper,
	discarding non-profitable vectorization of subgraphs.
	(vect_slp_analyze_bb_1): Call vect_bb_partition_graph before
	costing.

	* gcc.dg/vect/costmodel/x86_64/costmodel-pr69297.c: Adjust.
This commit is contained in:
Richard Biener 2020-09-08 14:49:59 +02:00
parent 484af18ee1
commit 47ddf4c7b1
3 changed files with 194 additions and 19 deletions

View file

@ -74,10 +74,28 @@ foo (int* diff)
d[13] = m[12] - m[13];
d[14] = m[14] + m[15];
d[15] = m[15] - m[14];
/* The following obviously profitable part should not make
the former unprofitable one profitable. */
diff[16 + 16] = diff[16];
diff[17 + 16] = diff[17];
diff[18 + 16] = diff[18];
diff[19 + 16] = diff[19];
diff[20 + 16] = diff[20];
diff[21 + 16] = diff[21];
diff[22 + 16] = diff[22];
diff[23 + 16] = diff[23];
diff[24 + 16] = diff[24];
diff[25 + 16] = diff[25];
diff[26 + 16] = diff[26];
diff[27 + 16] = diff[27];
diff[28 + 16] = diff[28];
diff[29 + 16] = diff[29];
diff[30 + 16] = diff[30];
diff[31 + 16] = diff[31];
for (k=0; k<16; k++)
satd += abs(d[k]);
return satd;
}
/* { dg-final { scan-tree-dump "vectorization is not profitable" "slp1" } } */
/* { dg-final { scan-tree-dump-not "basic block vectorized" "slp1" } } */
/* { dg-final { scan-tree-dump-times "Vectorizing SLP tree" 1 "slp1" } } */

View file

@ -126,6 +126,8 @@ vect_free_slp_instance (slp_instance instance, bool final_p)
{
vect_free_slp_tree (SLP_INSTANCE_TREE (instance), final_p);
SLP_INSTANCE_LOADS (instance).release ();
instance->subgraph_entries.release ();
instance->cost_vec.release ();
free (instance);
}
@ -2263,6 +2265,8 @@ vect_analyze_slp_instance (vec_info *vinfo,
SLP_INSTANCE_LOADS (new_instance) = vNULL;
SLP_INSTANCE_ROOT_STMT (new_instance) = constructor ? stmt_info : NULL;
new_instance->reduc_phis = NULL;
new_instance->cost_vec = vNULL;
new_instance->subgraph_entries = vNULL;
vect_gather_slp_loads (new_instance, node);
if (dump_enabled_p ())
@ -3133,8 +3137,8 @@ vect_slp_analyze_operations (vec_info *vinfo)
visited.add (*x);
i++;
add_stmt_costs (vinfo, vinfo->target_cost_data, &cost_vec);
cost_vec.release ();
/* Remember the SLP graph entry cost for later. */
instance->cost_vec = cost_vec;
}
}
@ -3142,18 +3146,108 @@ vect_slp_analyze_operations (vec_info *vinfo)
if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
{
hash_set<stmt_vec_info> svisited;
stmt_vector_for_cost cost_vec;
cost_vec.create (2);
for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
instance, &cost_vec, svisited);
add_stmt_costs (vinfo, vinfo->target_cost_data, &cost_vec);
cost_vec.release ();
instance, &instance->cost_vec, svisited);
}
return !vinfo->slp_instances.is_empty ();
}
/* Get the SLP instance leader from INSTANCE_LEADER thereby transitively
closing the eventual chain. */
static slp_instance
get_ultimate_leader (slp_instance instance,
hash_map<slp_instance, slp_instance> &instance_leader)
{
auto_vec<slp_instance *, 8> chain;
slp_instance *tem;
while (*(tem = instance_leader.get (instance)) != instance)
{
chain.safe_push (tem);
instance = *tem;
}
while (!chain.is_empty ())
*chain.pop () = instance;
return instance;
}
/* Worker of vect_bb_partition_graph, recurse on NODE. */
static void
vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
slp_instance instance, slp_tree node,
hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
hash_map<slp_instance, slp_instance> &instance_leader)
{
stmt_vec_info stmt_info;
unsigned i;
bool all = true;
FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
{
bool existed_p;
slp_instance &stmt_instance
= stmt_to_instance.get_or_insert (stmt_info, &existed_p);
if (!existed_p)
all = false;
else if (stmt_instance != instance)
{
/* If we're running into a previously marked stmt make us the
leader of the current ultimate leader. This keeps the
leader chain acyclic and works even when the current instance
connects two previously independent graph parts. */
slp_instance stmt_leader
= get_ultimate_leader (stmt_instance, instance_leader);
if (stmt_leader != instance)
instance_leader.put (stmt_leader, instance);
}
stmt_instance = instance;
}
/* If not all stmts had been visited we have to recurse on children. */
if (all)
return;
slp_tree child;
FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
instance_leader);
}
/* Partition the SLP graph into pieces that can be costed independently. */
static void
vect_bb_partition_graph (bb_vec_info bb_vinfo)
{
DUMP_VECT_SCOPE ("vect_bb_partition_graph");
/* First walk the SLP graph assigning each involved scalar stmt a
corresponding SLP graph entry and upon visiting a previously
marked stmt, make the stmts leader the current SLP graph entry. */
hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
hash_map<slp_instance, slp_instance> instance_leader;
slp_instance instance;
for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
{
instance_leader.put (instance, instance);
vect_bb_partition_graph_r (bb_vinfo,
instance, SLP_INSTANCE_TREE (instance),
stmt_to_instance, instance_leader);
}
/* Then collect entries to each independent subgraph. */
for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
{
slp_instance leader = get_ultimate_leader (instance, instance_leader);
leader->subgraph_entries.safe_push (instance);
if (dump_enabled_p ()
&& leader != instance)
dump_printf_loc (MSG_NOTE, vect_location,
"instance %p is leader of %p\n",
leader, instance);
}
}
/* Compute the scalar cost of the SLP node NODE and its children
and return it. Do not account defs that are marked in LIFE and
@ -3261,18 +3355,22 @@ vect_bb_slp_scalar_cost (vec_info *vinfo,
}
}
/* Check if vectorization of the basic block is profitable. */
/* Check if vectorization of the basic block is profitable for the
subgraph denoted by SLP_INSTANCES. */
static bool
vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
vec<slp_instance> slp_instances)
{
vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
slp_instance instance;
int i;
unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
/* Calculate scalar cost. */
void *vect_target_cost_data = init_cost (NULL);
/* Calculate scalar cost and sum the cost for the vector stmts
previously collected. */
stmt_vector_for_cost scalar_costs;
scalar_costs.create (0);
hash_set<slp_tree> visited;
@ -3284,22 +3382,25 @@ vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
vect_bb_slp_scalar_cost (bb_vinfo,
SLP_INSTANCE_TREE (instance),
&life, &scalar_costs, visited);
add_stmt_costs (bb_vinfo, vect_target_cost_data, &instance->cost_vec);
instance->cost_vec.release ();
}
/* Unset visited flag. */
stmt_info_for_cost *si;
FOR_EACH_VEC_ELT (scalar_costs, i, si)
gimple_set_visited (si->stmt_info->stmt, false);
void *target_cost_data = init_cost (NULL);
add_stmt_costs (bb_vinfo, target_cost_data, &scalar_costs);
void *scalar_target_cost_data = init_cost (NULL);
add_stmt_costs (bb_vinfo, scalar_target_cost_data, &scalar_costs);
scalar_costs.release ();
unsigned dummy;
finish_cost (target_cost_data, &dummy, &scalar_cost, &dummy);
destroy_cost_data (target_cost_data);
finish_cost (scalar_target_cost_data, &dummy, &scalar_cost, &dummy);
destroy_cost_data (scalar_target_cost_data);
/* Complete the target-specific cost calculation. */
finish_cost (BB_VINFO_TARGET_COST_DATA (bb_vinfo), &vec_prologue_cost,
/* Complete the target-specific vector cost calculation. */
finish_cost (vect_target_cost_data, &vec_prologue_cost,
&vec_inside_cost, &vec_epilogue_cost);
destroy_cost_data (vect_target_cost_data);
vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
@ -3324,6 +3425,54 @@ vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
return true;
}
/* For each SLP subgraph determine profitability and remove parts not so.
Returns true if any profitable to vectorize subgraph remains. */
static bool
vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
{
slp_instance instance;
unsigned i;
auto_vec<slp_instance> subgraphs (BB_VINFO_SLP_INSTANCES (bb_vinfo).length ());
FOR_EACH_VEC_ELT (BB_VINFO_SLP_INSTANCES (bb_vinfo), i, instance)
if (!instance->subgraph_entries.is_empty ())
subgraphs.quick_push (instance);
BB_VINFO_SLP_INSTANCES (bb_vinfo).truncate (0);
for (i = 0; i < subgraphs.length ();)
{
instance = subgraphs[i];
if (!vect_bb_vectorization_profitable_p (bb_vinfo,
instance->subgraph_entries))
{
/* ??? We need to think of providing better dump/opt-report
locations here. */
if (dump_enabled_p ())
{
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
"not vectorized: vectorization is not "
"profitable.\n");
}
slp_instance entry;
unsigned j;
FOR_EACH_VEC_ELT (instance->subgraph_entries, j, entry)
if (entry != instance)
vect_free_slp_instance (entry, false);
vect_free_slp_instance (instance, false);
subgraphs.ordered_remove (i);
}
else
{
slp_instance entry;
unsigned j;
FOR_EACH_VEC_ELT (instance->subgraph_entries, j, entry)
BB_VINFO_SLP_INSTANCES (bb_vinfo).safe_push (entry);
++i;
}
}
return !BB_VINFO_SLP_INSTANCES (bb_vinfo).is_empty ();
}
/* Find any vectorizable constructors and add them to the grouped_store
array. */
@ -3465,6 +3614,8 @@ vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
return false;
}
vect_bb_partition_graph (bb_vinfo);
/* Cost model: check if the vectorization is worthwhile. */
if (!unlimited_cost_model (NULL)
&& !vect_bb_vectorization_profitable_p (bb_vinfo))

View file

@ -183,6 +183,13 @@ public:
/* The SLP node containing the reduction PHIs. */
slp_tree reduc_phis;
/* Vector cost of this entry to the SLP graph. */
stmt_vector_for_cost cost_vec;
/* If this instance is the main entry of a subgraph the set of
entries into the same subgraph, including itself. */
vec<_slp_instance *> subgraph_entries;
} *slp_instance;
@ -913,7 +920,6 @@ public:
#define BB_VINFO_SLP_INSTANCES(B) (B)->slp_instances
#define BB_VINFO_DATAREFS(B) (B)->shared->datarefs
#define BB_VINFO_DDRS(B) (B)->shared->ddrs
#define BB_VINFO_TARGET_COST_DATA(B) (B)->target_cost_data
static inline bb_vec_info
vec_info_for_bb (basic_block bb)