aarch64: Make latency account for synthetic VEC_PERM_EXPRs [PR116901]

Another problem in pr110625_[24].c was that the latency calculations
were ignoring VEC_PERM_EXPRs that had no associated stmt_vec_info.
Such VEC_PERM_EXPRs are common and expected for SLP these days.

After this change, the number of general ops in the testcases seems
to be accurate apart from one remaining detail: we assume that the
extension in a permuted extending load is free, even though the
extension happens after the permutation.  Fixing that would require
more information from the vectoriser and so isn't GCC 15 material.
It also should cease to be a problem if we do end up moving the
permutation to its own node, rather than keeping it as part of
the load.

gcc/
	PR target/116901
	* config/aarch64/aarch64.cc (aarch64_vector_costs::count_ops): Allow
	stmt_info to be null.
	(aarch64_vector_costs::add_stmt_cost): Call count_ops even if
	stmt_info is null.
Author: Richard Sandiford
Date:   2025-03-12 09:40:11 +00:00
Parent: 855b61b61e
Commit: e406994e31

@@ -17498,7 +17498,8 @@ aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind,
 
   /* Calculate the minimum cycles per iteration imposed by a reduction
      operation.  */
-  if ((kind == scalar_stmt || kind == vector_stmt || kind == vec_to_scalar)
+  if (stmt_info
+      && (kind == scalar_stmt || kind == vector_stmt || kind == vec_to_scalar)
       && vect_is_reduction (stmt_info))
     {
       unsigned int base
@@ -17534,7 +17535,8 @@ aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind,
      costing when we see a vec_to_scalar on a stmt with VMAT_GATHER_SCATTER we
      are dealing with an emulated instruction and should adjust costing
      properly.  */
-  if (kind == vec_to_scalar
+  if (stmt_info
+      && kind == vec_to_scalar
       && (m_vec_flags & VEC_ADVSIMD)
       && vect_mem_access_type (stmt_info, node) == VMAT_GATHER_SCATTER)
     {
@@ -17590,7 +17592,8 @@ aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind,
     case vector_load:
     case unaligned_load:
       ops->loads += count;
-      if (m_vec_flags || FLOAT_TYPE_P (aarch64_dr_type (stmt_info)))
+      if (m_vec_flags
+          || (stmt_info && FLOAT_TYPE_P (aarch64_dr_type (stmt_info))))
        ops->general_ops += base_issue->fp_simd_load_general_ops * count;
       break;
 
@@ -17598,24 +17601,29 @@ aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind,
     case unaligned_store:
     case scalar_store:
       ops->stores += count;
-      if (m_vec_flags || FLOAT_TYPE_P (aarch64_dr_type (stmt_info)))
+      if (m_vec_flags
+          || (stmt_info && FLOAT_TYPE_P (aarch64_dr_type (stmt_info))))
        ops->general_ops += base_issue->fp_simd_store_general_ops * count;
       break;
     }
 
   /* Add any embedded comparison operations.  */
-  if ((kind == scalar_stmt || kind == vector_stmt || kind == vec_to_scalar)
+  if (stmt_info
+      && (kind == scalar_stmt || kind == vector_stmt || kind == vec_to_scalar)
       && vect_embedded_comparison_type (stmt_info))
     ops->general_ops += count;
 
   /* COND_REDUCTIONS need two sets of VEC_COND_EXPRs, whereas so far we
      have only accounted for one.  */
-  if ((kind == vector_stmt || kind == vec_to_scalar)
+  if (stmt_info
+      && (kind == vector_stmt || kind == vec_to_scalar)
       && vect_reduc_type (m_vinfo, stmt_info) == COND_REDUCTION)
     ops->general_ops += count;
 
   /* Count the predicate operations needed by an SVE comparison.  */
-  if (sve_issue && (kind == vector_stmt || kind == vec_to_scalar))
+  if (stmt_info
+      && sve_issue
+      && (kind == vector_stmt || kind == vec_to_scalar))
     if (tree type = vect_comparison_type (stmt_info))
       {
        unsigned int base = (FLOAT_TYPE_P (type)
@@ -17625,7 +17633,7 @@ aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind,
       }
 
   /* Add any extra overhead associated with LD[234] and ST[234] operations.  */
-  if (simd_issue)
+  if (stmt_info && simd_issue)
     switch (aarch64_ld234_st234_vectors (kind, stmt_info, node))
       {
       case 2:
@@ -17642,7 +17650,8 @@ aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind,
       }
 
   /* Add any overhead associated with gather loads and scatter stores.  */
-  if (sve_issue
+  if (stmt_info
+      && sve_issue
       && (kind == scalar_load || kind == scalar_store)
       && vect_mem_access_type (stmt_info, node) == VMAT_GATHER_SCATTER)
     {
@@ -17852,16 +17861,6 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
       stmt_cost = aarch64_adjust_stmt_cost (m_vinfo, kind, stmt_info, node,
                                             vectype, m_vec_flags, stmt_cost);
 
-      /* If we're recording a nonzero vector loop body cost for the
-         innermost loop, also estimate the operations that would need
-         to be issued by all relevant implementations of the loop.  */
-      if (loop_vinfo
-          && (m_costing_for_scalar || where == vect_body)
-          && (!LOOP_VINFO_LOOP (loop_vinfo)->inner || in_inner_loop_p)
-          && stmt_cost != 0)
-        for (auto &ops : m_ops)
-          count_ops (count, kind, stmt_info, node, &ops);
-
       /* If we're applying the SVE vs. Advanced SIMD unrolling heuristic,
          estimate the number of statements in the unrolled Advanced SIMD
          loop.  For simplicitly, we assume that one iteration of the
@@ -17886,6 +17885,16 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
            }
        }
 
+      /* If we're recording a nonzero vector loop body cost for the
+         innermost loop, also estimate the operations that would need
+         to be issued by all relevant implementations of the loop.  */
+      if (loop_vinfo
+          && (m_costing_for_scalar || where == vect_body)
+          && (!LOOP_VINFO_LOOP (loop_vinfo)->inner || in_inner_loop_p)
+          && stmt_cost != 0)
+        for (auto &ops : m_ops)
+          count_ops (count, kind, stmt_info, node, &ops);
+
       /* If the statement stores to a decl that is known to be the argument
          to a vld1 in the same function, ignore the store for costing purposes.
          See the comment above m_stores_to_vector_load_decl for more details.  */