vect/rs6000: Support vector with length cost modeling

This patch is to add the cost modeling for vector with length,
it mainly follows what we generate for vector with length in
functions vect_set_loop_controls_directly and vect_gen_len
at the worst case.

For Power, the length is expected to be in bits 0-7 (high bits),
we have to model the cost of shifting bits, which is implemented
in adjust_vect_cost_per_loop.

Bootstrapped/regtested on powerpc64le-linux-gnu (P9) with explicit
param vect-partial-vector-usage=1.

gcc/ChangeLog:

	* config/rs6000/rs6000.c (rs6000_adjust_vect_cost_per_loop): New
	function.
	(rs6000_finish_cost): Call rs6000_adjust_vect_cost_per_loop.
	* tree-vect-loop.c (vect_estimate_min_profitable_iters): Add cost
	modeling for vector with length.
	(vect_rgroup_iv_might_wrap_p): New function, factored out from...
	* tree-vect-loop-manip.c (vect_set_loop_controls_directly): ...this.
	Update function comment.
	* tree-vect-stmts.c (vect_gen_len): Update function comment.
	* tree-vectorizer.h (vect_rgroup_iv_might_wrap_p): New declare.
This commit is contained in:
Kewen Lin 2020-07-29 14:38:39 +08:00 committed by Kewen Lin
parent ea858d0957
commit dfdf9085d3
5 changed files with 127 additions and 15 deletions

View file

@ -5181,6 +5181,34 @@ rs6000_add_stmt_cost (class vec_info *vinfo, void *data, int count,
return retval;
}
/* For some target specific vectorization cost which can't be handled per stmt,
we check the requisite conditions and adjust the vectorization cost
accordingly if satisfied. One typical example is to model shift cost for
vector with length by counting number of required lengths under condition
LOOP_VINFO_FULLY_WITH_LENGTH_P. */
static void
rs6000_adjust_vect_cost_per_loop (rs6000_cost_data *data)
{
struct loop *loop = data->loop_info;
gcc_assert (loop);
loop_vec_info loop_vinfo = loop_vec_info_for_loop (loop);
if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
{
rgroup_controls *rgc;
unsigned int num_vectors_m1;
unsigned int shift_cnt = 0;
FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
if (rgc->type)
/* Each length needs one shift to fill into bits 0-7. */
shift_cnt += num_vectors_m1 + 1;
rs6000_add_stmt_cost (loop_vinfo, (void *) data, shift_cnt, scalar_stmt,
NULL, NULL_TREE, 0, vect_body);
}
}
/* Implement targetm.vectorize.finish_cost. */
static void
@ -5190,7 +5218,10 @@ rs6000_finish_cost (void *data, unsigned *prologue_cost,
rs6000_cost_data *cost_data = (rs6000_cost_data*) data;
if (cost_data->loop_info)
rs6000_density_test (cost_data);
{
rs6000_adjust_vect_cost_per_loop (cost_data);
rs6000_density_test (cost_data);
}
/* Don't vectorize minimum-vectorization-factor, simple copy loops
that require versioning for any reason. The vectorization is at

View file

@ -412,7 +412,11 @@ vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_controls *dest_rgm,
This means that we cannot guarantee that such an induction variable
would ever hit a value that produces a set of all-false masks or zero
lengths for RGC. */
lengths for RGC.
Note: the cost of the code generated by this function is modeled
by vect_estimate_min_profitable_iters, so changes here may need
corresponding changes there. */
static tree
vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
@ -711,8 +715,6 @@ vect_set_loop_condition_partial_vectors (class loop *loop,
else
niters = gimple_convert (&preheader_seq, compare_type, niters);
widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
/* Iterate over all the rgroups and fill in their controls. We could use
the first control from any rgroup for the loop condition; here we
arbitrarily pick the last. */
@ -739,11 +741,7 @@ vect_set_loop_condition_partial_vectors (class loop *loop,
/* See whether zero-based IV would ever generate all-false masks
or zero length before wrapping around. */
unsigned nitems_per_iter = rgc->max_nscalars_per_iter * rgc->factor;
bool might_wrap_p
= (iv_limit == -1
|| (wi::min_precision (iv_limit * nitems_per_iter, UNSIGNED)
> compare_precision));
bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);
/* Set up all controls for this group. */
test_ctrl = vect_set_loop_controls_directly (loop, loop_vinfo,

View file

@ -3798,6 +3798,58 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
(void) add_stmt_cost (loop_vinfo, target_cost_data, num_masks - 1,
vector_stmt, NULL, NULL_TREE, 0, vect_body);
}
else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
{
/* Referring to the functions vect_set_loop_condition_partial_vectors
and vect_set_loop_controls_directly, we need to generate each
length in the prologue and in the loop body if required. Although
there are some possible optimizations, we consider the worst case
here. */
bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
bool need_iterate_p
= (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
&& !vect_known_niters_smaller_than_vf (loop_vinfo));
/* Calculate how many statements to be added. */
unsigned int prologue_stmts = 0;
unsigned int body_stmts = 0;
rgroup_controls *rgc;
unsigned int num_vectors_m1;
FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
if (rgc->type)
{
/* May need one SHIFT for nitems_total computation. */
unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
if (nitems != 1 && !niters_known_p)
prologue_stmts += 1;
/* May need one MAX and one MINUS for wrap around. */
if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
prologue_stmts += 2;
/* Need one MAX and one MINUS for each batch limit excepting for
the 1st one. */
prologue_stmts += num_vectors_m1 * 2;
unsigned int num_vectors = num_vectors_m1 + 1;
/* Need to set up lengths in prologue, only one MIN required
for each since start index is zero. */
prologue_stmts += num_vectors;
/* Each may need two MINs and one MINUS to update lengths in body
for next iteration. */
if (need_iterate_p)
body_stmts += 3 * num_vectors;
}
(void) add_stmt_cost (loop_vinfo, target_cost_data, prologue_stmts,
scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
(void) add_stmt_cost (loop_vinfo, target_cost_data, body_stmts,
scalar_stmt, NULL, NULL_TREE, 0, vect_body);
}
/* FORNOW: The scalar outside cost is incremented in one of the
following ways:
@ -3932,8 +3984,8 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
}
/* ??? The "if" arm is written to handle all cases; see below for what
we would do for !LOOP_VINFO_FULLY_MASKED_P. */
if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
{
/* Rewriting the condition above in terms of the number of
vector iterations (vniters) rather than the number of
@ -3960,7 +4012,7 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
min_vec_niters);
if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
{
/* Now that we know the minimum number of vector iterations,
find the minimum niters for which the scalar cost is larger:
@ -4015,6 +4067,10 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
&& min_profitable_iters < (assumed_vf + peel_iters_prologue))
/* We want the vectorized loop to execute at least once. */
min_profitable_iters = assumed_vf + peel_iters_prologue;
else if (min_profitable_iters < peel_iters_prologue)
/* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
vectorized loop executes at least once. */
min_profitable_iters = peel_iters_prologue;
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
@ -4032,7 +4088,7 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
if (vec_outside_cost <= 0)
min_profitable_estimate = 0;
else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
{
/* This is a repeat of the code above, but with + SOC rather
than - SOC. */
@ -4044,7 +4100,7 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
if (outside_overhead > 0)
min_vec_niters = outside_overhead / saving_per_viter + 1;
if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
{
int threshold = (vec_inside_cost * min_vec_niters
+ vec_outside_cost
@ -9351,3 +9407,25 @@ vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
return iv_limit;
}
/* For the given rgroup_controls RGC, check whether an induction variable
would ever hit a value that produces a set of all-false masks or zero
lengths before wrapping around. Return true if it's possible to wrap
around before hitting the desirable value, otherwise return false. */
bool
vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
{
widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
if (iv_limit == -1)
return true;
tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
unsigned int compare_precision = TYPE_PRECISION (compare_type);
unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
return true;
return false;
}

View file

@ -12109,7 +12109,11 @@ vect_get_vector_types_for_stmt (vec_info *vinfo, stmt_vec_info stmt_info,
min_of_start_and_end = min (START_INDEX, END_INDEX);
left_len = END_INDEX - min_of_start_and_end;
rhs = min (left_len, LEN_LIMIT);
LEN = rhs; */
LEN = rhs;
Note: the cost of the code generated by this function is modeled
by vect_estimate_min_profitable_iters, so changes here may need
corresponding changes there. */
gimple_seq
vect_gen_len (tree len, tree start_index, tree end_index, tree len_limit)

View file

@ -1960,6 +1960,7 @@ extern tree vect_create_addr_base_for_vector_ref (vec_info *,
/* In tree-vect-loop.c. */
extern widest_int vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo);
bool vect_rgroup_iv_might_wrap_p (loop_vec_info, rgroup_controls *);
/* Used in tree-vect-loop-manip.c */
extern void determine_peel_for_niter (loop_vec_info);
/* Used in gimple-loop-interchange.c and tree-parloops.c. */