Turn TODO_rebuild_frequencies into a pass

Currently we rebuild profile_counts from profile_probability after inlining,
because inlining large loop nests can produce unrealistically large
profile_count values.  This has been much less of a concern since we switched
to the new profile_count representation a while back.

This propagation can also compensate for profile inconsistencies caused by
optimization passes.  Since the inliner is followed by basic cleanup passes
that do not use the profile, we get a more realistic profile by delaying the
recomputation until the basic optimizations exposed by inlining have finished.

This does not fit into the TODO machinery, so I turn the rebuilding into a
stand-alone pass and schedule it before the first consumer of the profile in
the optimization queue.

I also added logic that avoids repropagating when the CFG profile is
consistent and not too close to overflow.  Propagation visits every basic
block loop_depth times, so it is not linear, and avoiding it may help a bit.

On tramp3d we get 14 functions repropagated and 916 that are OK.  The
repropagated functions are the RB-tree ones, where recursive inlining produces
crazy loop nests.  This is something to fix independently.
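For intuition on how such loop nests overflow counts (illustrative numbers,
assuming profile_count's current 61-bit m_val field): static estimation scales
the count of a loop body by its estimated trip count, so a nest of depth d
whose levels are each estimated to iterate about 1000 times gives inner-body
counts of roughly 1000^d, i.e. about 2^(10*d).  Around d = 6 that approaches
2^60 and the representable maximum; the new profile_count::very_large_p flags
anything within a factor of 65536 of max_count, i.e. roughly 2^45 and above,
as too close for comfort.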

gcc/ChangeLog:

	* passes.cc (execute_function_todo): Remove
	TODO_rebuild_frequencies.
	* passes.def: Add rebuild_frequencies pass.
	* predict.cc (estimate_bb_frequencies): Drop
	force parameter.
	(tree_estimate_probability): Update call of
	estimate_bb_frequencies.
	(rebuild_frequencies): Turn into a pass; verify CFG profile consistency
	first and do not rebuild if not necessary.
	(class pass_rebuild_frequencies): New.
	(make_pass_rebuild_frequencies): New.
	* profile-count.h: Add profile_count::very_large_p.
	* tree-inline.cc (optimize_inline_calls): Do not return
	TODO_rebuild_frequencies.
	* tree-pass.h (TODO_rebuild_frequencies): Remove.
	(make_pass_rebuild_frequencies): Declare.
Author: Jan Hubicka
Date:   2023-07-14 17:14:15 +02:00
Commit: aa6741ef2e (parent 0d2673e995)
6 changed files with 253 additions and 121 deletions

gcc/passes.cc

@@ -2075,9 +2075,6 @@ execute_function_todo (function *fn, void *data)
if (flags & TODO_remove_unused_locals)
remove_unused_locals ();
if (flags & TODO_rebuild_frequencies)
rebuild_frequencies ();
if (flags & TODO_rebuild_cgraph_edges)
cgraph_edge::rebuild_edges ();

gcc/passes.def

@@ -206,6 +206,10 @@ along with GCC; see the file COPYING3. If not see
NEXT_PASS (pass_post_ipa_warn);
/* Must run before loop unrolling. */
NEXT_PASS (pass_warn_access, /*early=*/true);
/* Profile count may overflow as a result of inlinining very large
loop nests. This pass should run before any late pass that makes
use of profile. */
NEXT_PASS (pass_rebuild_frequencies);
NEXT_PASS (pass_complete_unrolli);
NEXT_PASS (pass_backprop);
NEXT_PASS (pass_phiprop);
@@ -395,6 +399,10 @@ along with GCC; see the file COPYING3. If not see
to forward object-size and builtin folding results properly. */
NEXT_PASS (pass_copy_prop);
NEXT_PASS (pass_dce);
/* Profile count may overflow as a result of inlinining very large
loop nests. This pass should run before any late pass that makes
use of profile. */
NEXT_PASS (pass_rebuild_frequencies);
NEXT_PASS (pass_sancov);
NEXT_PASS (pass_asan);
NEXT_PASS (pass_tsan);

gcc/predict.cc

@@ -89,7 +89,7 @@ static void predict_paths_leading_to_edge (edge, enum br_predictor,
static bool can_predict_insn_p (const rtx_insn *);
static HOST_WIDE_INT get_predictor_value (br_predictor, HOST_WIDE_INT);
static void determine_unlikely_bbs ();
static void estimate_bb_frequencies (bool force);
static void estimate_bb_frequencies ();
/* Information we hold about each branch predictor.
Filled using information from predict.def. */
@@ -3169,8 +3169,9 @@ tree_estimate_probability (bool dry_run)
delete bb_predictions;
bb_predictions = NULL;
if (!dry_run)
estimate_bb_frequencies (false);
if (!dry_run
&& profile_status_for_fn (cfun) != PROFILE_READ)
estimate_bb_frequencies ();
free_dominance_info (CDI_POST_DOMINATORS);
remove_fake_exit_edges ();
}
@@ -3923,103 +3924,97 @@ determine_unlikely_bbs ()
}
/* Estimate and propagate basic block frequencies using the given branch
probabilities. If FORCE is true, the frequencies are used to estimate
the counts even when there are already non-zero profile counts. */
probabilities. */
static void
estimate_bb_frequencies (bool force)
estimate_bb_frequencies ()
{
basic_block bb;
sreal freq_max;
determine_unlikely_bbs ();
if (force || profile_status_for_fn (cfun) != PROFILE_READ
|| !update_max_bb_count ())
mark_dfs_back_edges ();
single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (cfun))->probability =
profile_probability::always ();
/* Set up block info for each basic block. */
alloc_aux_for_blocks (sizeof (block_info));
alloc_aux_for_edges (sizeof (edge_prob_info));
FOR_BB_BETWEEN (bb, ENTRY_BLOCK_PTR_FOR_FN (cfun), NULL, next_bb)
{
edge e;
edge_iterator ei;
mark_dfs_back_edges ();
single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (cfun))->probability =
profile_probability::always ();
/* Set up block info for each basic block. */
alloc_aux_for_blocks (sizeof (block_info));
alloc_aux_for_edges (sizeof (edge_prob_info));
FOR_BB_BETWEEN (bb, ENTRY_BLOCK_PTR_FOR_FN (cfun), NULL, next_bb)
FOR_EACH_EDGE (e, ei, bb->succs)
{
edge e;
edge_iterator ei;
FOR_EACH_EDGE (e, ei, bb->succs)
{
/* FIXME: Graphite is producing edges with no profile. Once
this is fixed, drop this. */
if (e->probability.initialized_p ())
EDGE_INFO (e)->back_edge_prob
= e->probability.to_sreal ();
else
/* back_edge_prob = 0.5 */
EDGE_INFO (e)->back_edge_prob = sreal (1, -1);
}
/* FIXME: Graphite is producing edges with no profile. Once
this is fixed, drop this. */
if (e->probability.initialized_p ())
EDGE_INFO (e)->back_edge_prob
= e->probability.to_sreal ();
else
/* back_edge_prob = 0.5 */
EDGE_INFO (e)->back_edge_prob = sreal (1, -1);
}
/* First compute frequencies locally for each loop from innermost
to outermost to examine frequencies for back edges. */
estimate_loops ();
freq_max = 0;
FOR_EACH_BB_FN (bb, cfun)
if (freq_max < BLOCK_INFO (bb)->frequency)
freq_max = BLOCK_INFO (bb)->frequency;
/* Scaling frequencies up to maximal profile count may result in
frequent overflows especially when inlining loops.
Small scalling results in unnecesary precision loss. Stay in
the half of the (exponential) range. */
freq_max = (sreal (1) << (profile_count::n_bits / 2)) / freq_max;
if (freq_max < 16)
freq_max = 16;
profile_count ipa_count = ENTRY_BLOCK_PTR_FOR_FN (cfun)->count.ipa ();
cfun->cfg->count_max = profile_count::uninitialized ();
FOR_BB_BETWEEN (bb, ENTRY_BLOCK_PTR_FOR_FN (cfun), NULL, next_bb)
{
sreal tmp = BLOCK_INFO (bb)->frequency;
if (tmp >= 1)
{
gimple_stmt_iterator gsi;
tree decl;
/* Self recursive calls can not have frequency greater than 1
or program will never terminate. This will result in an
inconsistent bb profile but it is better than greatly confusing
IPA cost metrics. */
for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
if (is_gimple_call (gsi_stmt (gsi))
&& (decl = gimple_call_fndecl (gsi_stmt (gsi))) != NULL
&& recursive_call_p (current_function_decl, decl))
{
if (dump_file)
fprintf (dump_file, "Dropping frequency of recursive call"
" in bb %i from %f\n", bb->index,
tmp.to_double ());
tmp = (sreal)9 / (sreal)10;
break;
}
}
tmp = tmp * freq_max + sreal (1, -1);
profile_count count = profile_count::from_gcov_type (tmp.to_int ());
/* If we have profile feedback in which this function was never
executed, then preserve this info. */
if (!(bb->count == profile_count::zero ()))
bb->count = count.guessed_local ().combine_with_ipa_count (ipa_count);
cfun->cfg->count_max = cfun->cfg->count_max.max (bb->count);
}
free_aux_for_blocks ();
free_aux_for_edges ();
}
/* First compute frequencies locally for each loop from innermost
to outermost to examine frequencies for back edges. */
estimate_loops ();
freq_max = 0;
FOR_EACH_BB_FN (bb, cfun)
if (freq_max < BLOCK_INFO (bb)->frequency)
freq_max = BLOCK_INFO (bb)->frequency;
/* Scaling frequencies up to maximal profile count may result in
frequent overflows especially when inlining loops.
Small scalling results in unnecesary precision loss. Stay in
the half of the (exponential) range. */
freq_max = (sreal (1) << (profile_count::n_bits / 2)) / freq_max;
if (freq_max < 16)
freq_max = 16;
profile_count ipa_count = ENTRY_BLOCK_PTR_FOR_FN (cfun)->count.ipa ();
cfun->cfg->count_max = profile_count::uninitialized ();
FOR_BB_BETWEEN (bb, ENTRY_BLOCK_PTR_FOR_FN (cfun), NULL, next_bb)
{
sreal tmp = BLOCK_INFO (bb)->frequency;
if (tmp >= 1)
{
gimple_stmt_iterator gsi;
tree decl;
/* Self recursive calls can not have frequency greater than 1
or program will never terminate. This will result in an
inconsistent bb profile but it is better than greatly confusing
IPA cost metrics. */
for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
if (is_gimple_call (gsi_stmt (gsi))
&& (decl = gimple_call_fndecl (gsi_stmt (gsi))) != NULL
&& recursive_call_p (current_function_decl, decl))
{
if (dump_file)
fprintf (dump_file, "Dropping frequency of recursive call"
" in bb %i from %f\n", bb->index,
tmp.to_double ());
tmp = (sreal)9 / (sreal)10;
break;
}
}
tmp = tmp * freq_max + sreal (1, -1);
profile_count count = profile_count::from_gcov_type (tmp.to_int ());
/* If we have profile feedback in which this function was never
executed, then preserve this info. */
if (!(bb->count == profile_count::zero ()))
bb->count = count.guessed_local ().combine_with_ipa_count (ipa_count);
cfun->cfg->count_max = cfun->cfg->count_max.max (bb->count);
}
free_aux_for_blocks ();
free_aux_for_edges ();
compute_function_frequency ();
}
@@ -4307,38 +4302,162 @@ make_pass_strip_predict_hints (gcc::context *ctxt)
void
rebuild_frequencies (void)
{
timevar_push (TV_REBUILD_FREQUENCIES);
/* If we have no profile, do nothing. Note that after inlining
profile_status_for_fn may not represent the actual presence/absence of
profile. */
if (profile_status_for_fn (cfun) == PROFILE_ABSENT
&& !ENTRY_BLOCK_PTR_FOR_FN (cfun)->count.initialized_p ())
return;
/* When the max bb count in the function is small, there is a higher
chance that there were truncation errors in the integer scaling
of counts by inlining and other optimizations. This could lead
to incorrect classification of code as being cold when it isn't.
In that case, force the estimation of bb counts/frequencies from the
branch probabilities, rather than computing frequencies from counts,
which may also lead to frequencies incorrectly reduced to 0. There
is less precision in the probabilities, so we only do this for small
max counts. */
cfun->cfg->count_max = profile_count::uninitialized ();
/* See if everything is OK and update count_max. */
basic_block bb;
FOR_BB_BETWEEN (bb, ENTRY_BLOCK_PTR_FOR_FN (cfun), NULL, next_bb)
cfun->cfg->count_max = cfun->cfg->count_max.max (bb->count);
bool inconsistency_found = false;
bool uninitialized_probablity_found = false;
bool uninitialized_count_found = false;
if (profile_status_for_fn (cfun) == PROFILE_GUESSED)
cfun->cfg->count_max = profile_count::uninitialized ();
FOR_BB_BETWEEN (bb, ENTRY_BLOCK_PTR_FOR_FN (cfun), NULL, next_bb)
{
loop_optimizer_init (LOOPS_HAVE_MARKED_IRREDUCIBLE_REGIONS);
connect_infinite_loops_to_exit ();
estimate_bb_frequencies (true);
remove_fake_exit_edges ();
loop_optimizer_finalize ();
cfun->cfg->count_max = cfun->cfg->count_max.max (bb->count);
/* Uninitialized count may be result of inlining or an omision in an
optimization pass. */
if (!bb->count.initialized_p ())
{
uninitialized_count_found = true;
if (dump_file)
fprintf (dump_file, "BB %i has uninitialized count\n",
bb->index);
}
if (bb != ENTRY_BLOCK_PTR_FOR_FN (cfun)
&& (!uninitialized_probablity_found || !inconsistency_found))
{
profile_count sum = profile_count::zero ();
edge e;
edge_iterator ei;
FOR_EACH_EDGE (e, ei, bb->preds)
{
sum += e->count ();
/* Uninitialized probability may be result of inlining or an
omision in an optimization pass. */
if (!e->probability.initialized_p ())
{
if (dump_file)
fprintf (dump_file,
"Edge %i->%i has uninitialized probability\n",
e->src->index, e->dest->index);
}
}
if (sum.differs_from_p (bb->count))
{
if (dump_file)
fprintf (dump_file,
"BB %i has invalid sum of incomming counts\n",
bb->index);
inconsistency_found = true;
}
}
}
else if (profile_status_for_fn (cfun) == PROFILE_READ)
update_max_bb_count ();
else if (profile_status_for_fn (cfun) == PROFILE_ABSENT
&& !flag_guess_branch_prob)
;
else
gcc_unreachable ();
timevar_pop (TV_REBUILD_FREQUENCIES);
/* If everything is OK, do not re-propagate frequencies. */
if (!inconsistency_found
&& (!uninitialized_count_found || uninitialized_probablity_found)
&& !cfun->cfg->count_max.very_large_p ())
{
if (dump_file)
fprintf (dump_file, "Profile is consistent\n");
return;
}
/* Do not re-propagate if we have profile feedback. Even if the profile is
inconsistent from previous transofrmations, it is probably more realistic
for hot part of the program than result of repropagating.
Consider example where we previously has
if (test)
then [large probability for true]
and we later proved that test is always 0. In this case, if profile was
read correctly, we must have duplicated the conditional (for example by
inlining) in to a context where test is false. From profile feedback
we know that most executions if the conditionals were true, so the
important copy is not the one we look on.
Propagating from probabilities would make profile look consistent, but
because probablities after code duplication may not be representative
for a given run, we would only propagate the error further. */
if (ENTRY_BLOCK_PTR_FOR_FN (cfun)->count.ipa ().nonzero_p ()
&& !uninitialized_count_found)
{
if (dump_file)
fprintf (dump_file,
"Profile is inconsistent but read from profile feedback;"
" not rebuilding\n");
return;
}
loop_optimizer_init (LOOPS_HAVE_MARKED_IRREDUCIBLE_REGIONS);
connect_infinite_loops_to_exit ();
estimate_bb_frequencies ();
remove_fake_exit_edges ();
loop_optimizer_finalize ();
if (dump_file)
fprintf (dump_file, "Rebuilt basic block counts\n");
return;
}
namespace {
const pass_data pass_data_rebuild_frequencies =
{
GIMPLE_PASS, /* type */
"rebuild_frequencies", /* name */
OPTGROUP_NONE, /* optinfo_flags */
TV_REBUILD_FREQUENCIES, /* tv_id */
PROP_cfg, /* properties_required */
0, /* properties_provided */
0, /* properties_destroyed */
0, /* todo_flags_start */
0, /* todo_flags_finish */
};
class pass_rebuild_frequencies : public gimple_opt_pass
{
public:
pass_rebuild_frequencies (gcc::context *ctxt)
: gimple_opt_pass (pass_data_rebuild_frequencies, ctxt)
{}
/* opt_pass methods: */
opt_pass * clone () final override
{
return new pass_rebuild_frequencies (m_ctxt);
}
void set_pass_param (unsigned int n, bool param) final override
{
gcc_assert (n == 0);
early_p = param;
}
unsigned int execute (function *) final override
{
rebuild_frequencies ();
return 0;
}
private:
bool early_p;
}; // class pass_rebuild_frequencies
} // anon namespace
gimple_opt_pass *
make_pass_rebuild_frequencies (gcc::context *ctxt)
{
return new pass_rebuild_frequencies (ctxt);
}
/* Perform a dry run of the branch prediction pass and report comparsion of

gcc/profile-count.h

@@ -1276,6 +1276,16 @@ public:
return ret;
}
/* Return true if profile count is very large, so we risk overflows
with loop transformations. */
bool
very_large_p ()
{
if (!initialized_p ())
return false;
return m_val > max_count / 65536;
}
int to_frequency (struct function *fun) const;
int to_cgraph_frequency (profile_count entry_bb_count) const;
sreal to_sreal_scale (profile_count in, bool *known = NULL) const;

gcc/tree-inline.cc

@@ -5526,9 +5526,7 @@ optimize_inline_calls (tree fn)
return (TODO_update_ssa
| TODO_cleanup_cfg
| (gimple_in_ssa_p (cfun) ? TODO_remove_unused_locals : 0)
| (gimple_in_ssa_p (cfun) ? TODO_update_address_taken : 0)
| (profile_status_for_fn (cfun) != PROFILE_ABSENT
? TODO_rebuild_frequencies : 0));
| (gimple_in_ssa_p (cfun) ? TODO_update_address_taken : 0));
}
/* Passed to walk_tree. Copies the node pointed to, if appropriate. */

gcc/tree-pass.h

@@ -239,7 +239,6 @@ protected:
#define TODO_verify_il (1 << 6)
#define TODO_dump_symtab (1 << 7)
#define TODO_remove_functions (1 << 8)
#define TODO_rebuild_frequencies (1 << 9)
/* To-do flags for calls to update_ssa. */
@@ -418,6 +417,7 @@ extern gimple_opt_pass *make_pass_pre (gcc::context *ctxt);
extern unsigned int tail_merge_optimize (bool);
extern gimple_opt_pass *make_pass_profile (gcc::context *ctxt);
extern gimple_opt_pass *make_pass_strip_predict_hints (gcc::context *ctxt);
extern gimple_opt_pass *make_pass_rebuild_frequencies (gcc::context *ctxt);
extern gimple_opt_pass *make_pass_lower_complex_O0 (gcc::context *ctxt);
extern gimple_opt_pass *make_pass_lower_complex (gcc::context *ctxt);
extern gimple_opt_pass *make_pass_lower_switch (gcc::context *ctxt);