diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h index 2ffa96ec24b..ca1ed9e8758 100644 --- a/gcc/config/aarch64/aarch64-protos.h +++ b/gcc/config/aarch64/aarch64-protos.h @@ -281,19 +281,177 @@ struct sve_vec_cost : simd_vec_cost const int scatter_store_elt_cost; }; +/* Base information about how the CPU issues code, containing + information that is relevant to scalar, Advanced SIMD and SVE + operations. + + The structure uses the general term "operation" to refer to + whichever subdivision of an instruction makes sense for the CPU. + These operations would typically be micro operations or macro + operations. + + Note that this structure and the ones derived from it are only + as general as they need to be for the CPUs that currently use them. + They will probably need to be extended or refined as more CPUs are + added. */ +struct aarch64_base_vec_issue_info +{ + /* How many loads and stores can be issued per cycle. */ + const unsigned int loads_stores_per_cycle; + + /* How many stores can be issued per cycle. */ + const unsigned int stores_per_cycle; + + /* How many integer or FP/SIMD operations can be issued per cycle. + + Currently we don't try to distinguish the two. For vector code, + we only really track FP/SIMD operations during vector costing; + we don't for example try to cost arithmetic operations like + address calculations, which are only decided later during ivopts. + + For scalar code, we effectively assume that code operates entirely + on integers or entirely on floating-point values. Again, we don't + try to take address calculations into account. + + This is not very precise, but it's only meant to be a heuristic. + We could certainly try to do better in future if there's an example + of something that would benefit. */ + const unsigned int general_ops_per_cycle; + + /* How many FP/SIMD operations to count for a floating-point or + vector load operation. + + When constructing an Advanced SIMD vector from elements that have + been loaded from memory, these values apply to each individual load. + When using an SVE gather load, the values apply to each element of + the gather. */ + const unsigned int fp_simd_load_general_ops; + + /* How many FP/SIMD operations to count for a floating-point or + vector store operation. + + When storing individual elements of an Advanced SIMD vector out to + memory, these values apply to each individual store. When using an + SVE scatter store, these values apply to each element of the scatter. */ + const unsigned int fp_simd_store_general_ops; +}; + +using aarch64_scalar_vec_issue_info = aarch64_base_vec_issue_info; + +/* Base information about the issue stage for vector operations. + This structure contains information that is relevant to both + Advanced SIMD and SVE. */ +struct aarch64_simd_vec_issue_info : aarch64_base_vec_issue_info +{ + constexpr aarch64_simd_vec_issue_info (aarch64_base_vec_issue_info base, + unsigned int ld2_st2_general_ops, + unsigned int ld3_st3_general_ops, + unsigned int ld4_st4_general_ops) + : aarch64_base_vec_issue_info (base), + ld2_st2_general_ops (ld2_st2_general_ops), + ld3_st3_general_ops (ld3_st3_general_ops), + ld4_st4_general_ops (ld4_st4_general_ops) + {} + + /* How many FP/SIMD operations to count for each vector loaded or + stored by an LD[234] or ST[234] operation, in addition to the + base costs given in the parent class. 
For example, the full + number of operations for an LD3 would be: + + load ops: 3 + general ops: 3 * (fp_simd_load_general_ops + ld3_st3_general_ops). */ + const unsigned int ld2_st2_general_ops; + const unsigned int ld3_st3_general_ops; + const unsigned int ld4_st4_general_ops; +}; + +using aarch64_advsimd_vec_issue_info = aarch64_simd_vec_issue_info; + +/* Information about the issue stage for SVE. The main thing this adds + is a concept of "predicate operations". */ +struct aarch64_sve_vec_issue_info : aarch64_simd_vec_issue_info +{ + constexpr aarch64_sve_vec_issue_info + (aarch64_simd_vec_issue_info base, + unsigned int pred_ops_per_cycle, + unsigned int while_pred_ops, + unsigned int int_cmp_pred_ops, + unsigned int fp_cmp_pred_ops, + unsigned int gather_scatter_pair_general_ops, + unsigned int gather_scatter_pair_pred_ops) + : aarch64_simd_vec_issue_info (base), + pred_ops_per_cycle (pred_ops_per_cycle), + while_pred_ops (while_pred_ops), + int_cmp_pred_ops (int_cmp_pred_ops), + fp_cmp_pred_ops (fp_cmp_pred_ops), + gather_scatter_pair_general_ops (gather_scatter_pair_general_ops), + gather_scatter_pair_pred_ops (gather_scatter_pair_pred_ops) + {} + + /* How many predicate operations can be issued per cycle. */ + const unsigned int pred_ops_per_cycle; + + /* How many predicate operations are generated by a WHILExx + instruction. */ + const unsigned int while_pred_ops; + + /* How many predicate operations are generated by an integer + comparison instruction. */ + const unsigned int int_cmp_pred_ops; + + /* How many predicate operations are generated by a floating-point + comparison instruction. */ + const unsigned int fp_cmp_pred_ops; + + /* How many general and predicate operations are generated by each pair + of elements in a gather load or scatter store. These values apply + on top of the per-element counts recorded in fp_simd_load_general_ops + and fp_simd_store_general_ops. + + The reason for using pairs is that that is the largest possible + granule size for 128-bit SVE, which can load and store 2 64-bit + elements or 4 32-bit elements. */ + const unsigned int gather_scatter_pair_general_ops; + const unsigned int gather_scatter_pair_pred_ops; +}; + +/* Information related to instruction issue for a particular CPU. */ +struct aarch64_vec_issue_info +{ + const aarch64_base_vec_issue_info *const scalar; + const aarch64_simd_vec_issue_info *const advsimd; + const aarch64_sve_vec_issue_info *const sve; +}; + /* Cost for vector insn classes. */ struct cpu_vector_cost { - const int scalar_int_stmt_cost; /* Cost of any int scalar operation, - excluding load and store. */ - const int scalar_fp_stmt_cost; /* Cost of any fp scalar operation, - excluding load and store. */ - const int scalar_load_cost; /* Cost of scalar load. */ - const int scalar_store_cost; /* Cost of scalar store. */ - const int cond_taken_branch_cost; /* Cost of taken branch. */ - const int cond_not_taken_branch_cost; /* Cost of not taken branch. */ - const advsimd_vec_cost *advsimd; /* Cost of Advanced SIMD operations. */ - const sve_vec_cost *sve; /* Cost of SVE operations. */ + /* Cost of any integer scalar operation, excluding load and store. */ + const int scalar_int_stmt_cost; + + /* Cost of any fp scalar operation, excluding load and store. */ + const int scalar_fp_stmt_cost; + + /* Cost of a scalar load. */ + const int scalar_load_cost; + + /* Cost of a scalar store. */ + const int scalar_store_cost; + + /* Cost of a taken branch. */ + const int cond_taken_branch_cost; + + /* Cost of a not-taken branch. 
*/ + const int cond_not_taken_branch_cost; + + /* Cost of an Advanced SIMD operations. */ + const advsimd_vec_cost *advsimd; + + /* Cost of an SVE operations, or null if SVE is not implemented. */ + const sve_vec_cost *sve; + + /* Issue information, or null if none is provided. */ + const aarch64_vec_issue_info *const issue_info; }; /* Branch costs. */ diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 6d18d82079c..6d961bea5dc 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -74,6 +74,8 @@ #include "intl.h" #include "expmed.h" #include "function-abi.h" +#include "gimple-pretty-print.h" +#include "tree-ssa-loop-niter.h" /* This file should be included last. */ #include "target-def.h" @@ -652,7 +654,8 @@ static const struct cpu_vector_cost generic_vector_cost = 3, /* cond_taken_branch_cost */ 1, /* cond_not_taken_branch_cost */ &generic_advsimd_vector_cost, /* advsimd */ - &generic_sve_vector_cost /* sve */ + &generic_sve_vector_cost, /* sve */ + nullptr /* issue_info */ }; static const advsimd_vec_cost a64fx_advsimd_vector_cost = @@ -719,7 +722,8 @@ static const struct cpu_vector_cost a64fx_vector_cost = 3, /* cond_taken_branch_cost */ 1, /* cond_not_taken_branch_cost */ &a64fx_advsimd_vector_cost, /* advsimd */ - &a64fx_sve_vector_cost /* sve */ + &a64fx_sve_vector_cost, /* sve */ + nullptr /* issue_info */ }; static const advsimd_vec_cost qdf24xx_advsimd_vector_cost = @@ -756,7 +760,8 @@ static const struct cpu_vector_cost qdf24xx_vector_cost = 3, /* cond_taken_branch_cost */ 1, /* cond_not_taken_branch_cost */ &qdf24xx_advsimd_vector_cost, /* advsimd */ - NULL /* sve */ + nullptr, /* sve */ + nullptr /* issue_info */ }; @@ -794,7 +799,8 @@ static const struct cpu_vector_cost thunderx_vector_cost = 3, /* cond_taken_branch_cost */ 3, /* cond_not_taken_branch_cost */ &thunderx_advsimd_vector_cost, /* advsimd */ - NULL /* sve */ + nullptr, /* sve */ + nullptr /* issue_info */ }; static const advsimd_vec_cost tsv110_advsimd_vector_cost = @@ -830,7 +836,8 @@ static const struct cpu_vector_cost tsv110_vector_cost = 1, /* cond_taken_branch_cost */ 1, /* cond_not_taken_branch_cost */ &tsv110_advsimd_vector_cost, /* advsimd */ - NULL, /* sve */ + nullptr, /* sve */ + nullptr /* issue_info */ }; static const advsimd_vec_cost cortexa57_advsimd_vector_cost = @@ -867,7 +874,8 @@ static const struct cpu_vector_cost cortexa57_vector_cost = 1, /* cond_taken_branch_cost */ 1, /* cond_not_taken_branch_cost */ &cortexa57_advsimd_vector_cost, /* advsimd */ - NULL /* sve */ + nullptr, /* sve */ + nullptr /* issue_info */ }; static const advsimd_vec_cost exynosm1_advsimd_vector_cost = @@ -903,7 +911,8 @@ static const struct cpu_vector_cost exynosm1_vector_cost = 1, /* cond_taken_branch_cost */ 1, /* cond_not_taken_branch_cost */ &exynosm1_advsimd_vector_cost, /* advsimd */ - NULL /* sve */ + nullptr, /* sve */ + nullptr /* issue_info */ }; static const advsimd_vec_cost xgene1_advsimd_vector_cost = @@ -940,7 +949,8 @@ static const struct cpu_vector_cost xgene1_vector_cost = 2, /* cond_taken_branch_cost */ 1, /* cond_not_taken_branch_cost */ &xgene1_advsimd_vector_cost, /* advsimd */ - NULL /* sve */ + nullptr, /* sve */ + nullptr /* issue_info */ }; static const advsimd_vec_cost thunderx2t99_advsimd_vector_cost = @@ -977,7 +987,8 @@ static const struct cpu_vector_cost thunderx2t99_vector_cost = 2, /* cond_taken_branch_cost */ 1, /* cond_not_taken_branch_cost */ &thunderx2t99_advsimd_vector_cost, /* advsimd */ - NULL /* sve */ + nullptr, /* sve */ + 
nullptr /* issue_info */ }; static const advsimd_vec_cost thunderx3t110_advsimd_vector_cost = @@ -1013,7 +1024,8 @@ static const struct cpu_vector_cost thunderx3t110_vector_cost = 2, /* cond_taken_branch_cost */ 1, /* cond_not_taken_branch_cost */ &thunderx3t110_advsimd_vector_cost, /* advsimd */ - NULL /* sve */ + nullptr, /* sve */ + nullptr /* issue_info */ }; @@ -1696,6 +1708,58 @@ static const sve_vec_cost neoversev1_sve_vector_cost = 3 /* scatter_store_elt_cost */ }; +static const aarch64_scalar_vec_issue_info neoversev1_scalar_issue_info = +{ + 3, /* loads_stores_per_cycle */ + 2, /* stores_per_cycle */ + 4, /* general_ops_per_cycle */ + 0, /* fp_simd_load_general_ops */ + 1 /* fp_simd_store_general_ops */ +}; + +static const aarch64_advsimd_vec_issue_info neoversev1_advsimd_issue_info = +{ + { + 3, /* loads_stores_per_cycle */ + 2, /* stores_per_cycle */ + 4, /* general_ops_per_cycle */ + 0, /* fp_simd_load_general_ops */ + 1 /* fp_simd_store_general_ops */ + }, + 2, /* ld2_st2_general_ops */ + 2, /* ld3_st3_general_ops */ + 3 /* ld4_st4_general_ops */ +}; + +static const aarch64_sve_vec_issue_info neoversev1_sve_issue_info = +{ + { + { + 2, /* loads_per_cycle */ + 2, /* stores_per_cycle */ + 2, /* general_ops_per_cycle */ + 0, /* fp_simd_load_general_ops */ + 1 /* fp_simd_store_general_ops */ + }, + 2, /* ld2_st2_general_ops */ + 2, /* ld3_st3_general_ops */ + 3 /* ld4_st4_general_ops */ + }, + 1, /* pred_ops_per_cycle */ + 2, /* while_pred_ops */ + 2, /* int_cmp_pred_ops */ + 1, /* fp_cmp_pred_ops */ + 1, /* gather_scatter_pair_general_ops */ + 1 /* gather_scatter_pair_pred_ops */ +}; + +static const aarch64_vec_issue_info neoversev1_vec_issue_info = +{ + &neoversev1_scalar_issue_info, + &neoversev1_advsimd_issue_info, + &neoversev1_sve_issue_info +}; + /* Neoverse V1 costs for vector insn classes. */ static const struct cpu_vector_cost neoversev1_vector_cost = { @@ -1706,7 +1770,8 @@ static const struct cpu_vector_cost neoversev1_vector_cost = 1, /* cond_taken_branch_cost */ 1, /* cond_not_taken_branch_cost */ &neoversev1_advsimd_vector_cost, /* advsimd */ - &neoversev1_sve_vector_cost /* sve */ + &neoversev1_sve_vector_cost, /* sve */ + &neoversev1_vec_issue_info /* issue_info */ }; static const struct tune_params neoversev1_tunings = @@ -14120,6 +14185,38 @@ aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn, /* Vectorizer cost model target hooks. */ +/* Information about how the CPU would issue the scalar, Advanced SIMD + or SVE version of a vector loop, using the scheme defined by the + aarch64_base_vec_issue_info hierarchy of structures. */ +struct aarch64_vec_op_count +{ + void dump () const; + + /* The number of individual "general" operations. See the comments + in aarch64_base_vec_issue_info for details. */ + unsigned int general_ops = 0; + + /* The number of load and store operations, under the same scheme + as above. */ + unsigned int loads = 0; + unsigned int stores = 0; + + /* The minimum number of cycles needed to execute all loop-carried + operations, which in the vector code become associated with + reductions. */ + unsigned int reduction_latency = 0; +}; + +/* Extends aarch64_vec_op_count with SVE-specific information. */ +struct aarch64_sve_op_count : aarch64_vec_op_count +{ + void dump () const; + + /* The number of individual predicate operations. See the comments + in aarch64_sve_vec_issue_info for details. */ + unsigned int pred_ops = 0; +}; + /* Information about vector code that we're in the process of costing. 
*/ struct aarch64_vector_costs { @@ -14138,6 +14235,10 @@ struct aarch64_vector_costs vectorization. */ bool is_loop = false; + /* True if we've seen an SVE operation that we cannot currently vectorize + using Advanced SIMD. */ + bool saw_sve_only_op = false; + /* - If VEC_FLAGS is zero then we're costing the original scalar code. - If VEC_FLAGS & VEC_ADVSIMD is nonzero then we're costing Advanced SIMD code. @@ -14159,6 +14260,32 @@ struct aarch64_vector_costs the heuristic. */ unsigned HOST_WIDE_INT unrolled_advsimd_niters = 0; unsigned HOST_WIDE_INT unrolled_advsimd_stmts = 0; + + /* If we're vectorizing a loop that executes a constant number of times, + this variable gives the number of times that the vector loop would + iterate, otherwise it is zero. */ + uint64_t num_vector_iterations = 0; + + /* Used only when vectorizing loops. Estimates the number and kind of scalar + operations that would be needed to perform the same work as one iteration + of the vector loop. */ + aarch64_vec_op_count scalar_ops; + + /* Used only when vectorizing loops. If VEC_FLAGS & VEC_ADVSIMD, + this structure estimates the number and kind of operations that the + vector loop would contain. If VEC_FLAGS & VEC_SVE, the structure + estimates what the equivalent Advanced SIMD-only code would need in + order to perform the same work as one iteration of the SVE loop. */ + aarch64_vec_op_count advsimd_ops; + + /* Used only when vectorizing loops with SVE. It estimates the number and + kind of operations that the SVE loop would contain. */ + aarch64_sve_op_count sve_ops; + + /* Used to detect cases in which we end up costing the same load twice, + once to account for results that are actually used and once to account + for unused results. */ + hash_map, unsigned int> seen_loads; }; /* Implement TARGET_VECTORIZE_INIT_COST. */ @@ -14190,6 +14317,16 @@ aarch64_simd_vec_costs (tree vectype) return costs->advsimd; } +/* Return the appropriate SIMD costs for vectors with VEC_* flags FLAGS. */ +static const simd_vec_cost * +aarch64_simd_vec_costs_for_flags (unsigned int flags) +{ + const cpu_vector_cost *costs = aarch64_tune_params.vec_costs; + if ((flags & VEC_ANY_SVE) && costs->sve) + return costs->sve; + return costs->advsimd; +} + /* Decide whether to use the unrolling heuristic described above aarch64_vector_costs::unrolled_advsimd_niters, updating that field if so. LOOP_VINFO describes the loop that we're vectorizing @@ -14250,6 +14387,19 @@ aarch64_analyze_loop_vinfo (loop_vec_info loop_vinfo, { costs->is_loop = true; + /* Record the number of times that the vector loop would execute, + if known. */ + class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); + auto scalar_niters = max_stmt_executions_int (loop); + if (scalar_niters >= 0) + { + unsigned int vf = vect_vf_for_cost (loop_vinfo); + if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ()) + costs->num_vector_iterations = scalar_niters / vf; + else + costs->num_vector_iterations = CEIL (scalar_niters, vf); + } + /* Detect whether we're costing the scalar code or the vector code. This is a bit hacky: it would be better if the vectorizer told us directly. @@ -14265,6 +14415,20 @@ aarch64_analyze_loop_vinfo (loop_vec_info loop_vinfo, apply the unrolling heuristic described above aarch64_vector_costs::unrolled_advsimd_niters. */ aarch64_record_potential_advsimd_unrolling (loop_vinfo, costs); + + /* Record the issue information for any SVE WHILE instructions that the + loop needs. 
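+     The accounting below assumes one WHILE per mask: it sums the number
+     of masks over all the mask rgroups and multiplies the total by
+     while_pred_ops.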
*/ + auto *issue_info = aarch64_tune_params.vec_costs->issue_info; + if (issue_info->sve && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()) + { + unsigned int num_masks = 0; + rgroup_controls *rgm; + unsigned int num_vectors_m1; + FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm) + if (rgm->type) + num_masks += num_vectors_m1 + 1; + costs->sve_ops.pred_ops += num_masks * issue_info->sve->while_pred_ops; + } } /* Do one-time initialization of COSTS given that we're costing the block @@ -14407,6 +14571,17 @@ aarch64_embedded_comparison_type (stmt_vec_info stmt_info) return NULL_TREE; } +/* If STMT_INFO is a comparison or contains an embedded comparison, return the + scalar type of the values being compared. Return null otherwise. */ +static tree +aarch64_comparison_type (stmt_vec_info stmt_info) +{ + if (auto *assign = dyn_cast (stmt_info->stmt)) + if (TREE_CODE_CLASS (gimple_assign_rhs_code (assign)) == tcc_comparison) + return TREE_TYPE (gimple_assign_rhs1 (assign)); + return aarch64_embedded_comparison_type (stmt_info); +} + /* Return true if creating multiple copies of STMT_INFO for Advanced SIMD vectors would produce a series of LDP or STP operations. KIND is the kind of statement that STMT_INFO represents. */ @@ -14470,6 +14645,79 @@ aarch64_integer_truncation_p (stmt_vec_info stmt_info) && TYPE_PRECISION (lhs_type) < TYPE_PRECISION (rhs_type)); } +/* Return true if STMT_INFO is the second part of a two-statement multiply-add + or multiply-subtract sequence that might be suitable for fusing into a + single instruction. */ +static bool +aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info stmt_info) +{ + gassign *assign = dyn_cast (stmt_info->stmt); + if (!assign) + return false; + tree_code code = gimple_assign_rhs_code (assign); + if (code != PLUS_EXPR && code != MINUS_EXPR) + return false; + + if (CONSTANT_CLASS_P (gimple_assign_rhs1 (assign)) + || CONSTANT_CLASS_P (gimple_assign_rhs2 (assign))) + return false; + + for (int i = 1; i < 3; ++i) + { + tree rhs = gimple_op (assign, i); + /* ??? Should we try to check for a single use as well? */ + if (TREE_CODE (rhs) != SSA_NAME) + continue; + + stmt_vec_info def_stmt_info = vinfo->lookup_def (rhs); + if (!def_stmt_info + || STMT_VINFO_DEF_TYPE (def_stmt_info) != vect_internal_def) + continue; + gassign *rhs_assign = dyn_cast (def_stmt_info->stmt); + if (!rhs_assign || gimple_assign_rhs_code (rhs_assign) != MULT_EXPR) + continue; + + return true; + } + return false; +} + +/* Return true if the vectorized form of STMT_INFO is something that is only + possible when using SVE instead of Advanced SIMD. VECTYPE is the type of + the vector that STMT_INFO is operating on. */ +static bool +aarch64_sve_only_stmt_p (stmt_vec_info stmt_info, tree vectype) +{ + if (!aarch64_sve_mode_p (TYPE_MODE (vectype))) + return false; + + if (STMT_VINFO_DATA_REF (stmt_info)) + { + /* Check for true gathers and scatters (rather than just strided accesses + that we've chosen to implement using gathers and scatters). Although + in principle we could use elementwise accesses for Advanced SIMD, + the vectorizer doesn't yet support that. */ + if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)) + return true; + + /* Check for masked loads and stores. */ + if (auto *call = dyn_cast (stmt_info->stmt)) + if (gimple_call_internal_p (call) + && internal_fn_mask_index (gimple_call_internal_fn (call)) >= 0) + return true; + } + + /* Check for 64-bit integer multiplications. 
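+     Advanced SIMD integer MUL has no 64-bit element form, so a general
+     multiplication of DImode elements is treated here as something that
+     only SVE can vectorize.  Multiplications by a power of 2 are
+     excluded, since those can be implemented as shifts.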
*/ + auto *assign = dyn_cast (stmt_info->stmt); + if (assign + && gimple_assign_rhs_code (assign) == MULT_EXPR + && GET_MODE_INNER (TYPE_MODE (vectype)) == DImode + && !integer_pow2p (gimple_assign_rhs2 (assign))) + return true; + + return false; +} + /* We are considering implementing STMT_INFO using SVE vector type VECTYPE. If STMT_INFO is an in-loop reduction that SVE supports directly, return its latency in cycles, otherwise return zero. SVE_COSTS specifies the @@ -14507,6 +14755,59 @@ aarch64_sve_in_loop_reduction_latency (vec_info *vinfo, return 0; } +/* STMT_INFO describes a loop-carried operation in the original scalar code + that we are considering implementing as a reduction. Return one of the + following values, depending on VEC_FLAGS: + + - If VEC_FLAGS is zero, return the loop carry latency of the original + scalar operation. + + - If VEC_FLAGS & VEC_ADVSIMD, return the loop carry latency of the + the Advanced SIMD implementation. + + - If VEC_FLAGS & VEC_ANY_SVE, return the loop carry latency of the + SVE implementation. + + VECTYPE is the type of vector that the vectorizer is considering using + for STMT_INFO, which might be different from the type of vector described + by VEC_FLAGS. */ +static unsigned int +aarch64_in_loop_reduction_latency (vec_info *vinfo, stmt_vec_info stmt_info, + tree vectype, unsigned int vec_flags) +{ + const cpu_vector_cost *vec_costs = aarch64_tune_params.vec_costs; + const sve_vec_cost *sve_costs = nullptr; + if (vec_flags & VEC_ANY_SVE) + sve_costs = aarch64_tune_params.vec_costs->sve; + + /* If the caller is asking for the SVE latency, check for forms of reduction + that only SVE can handle directly. */ + if (sve_costs) + { + unsigned int latency + = aarch64_sve_in_loop_reduction_latency (vinfo, stmt_info, vectype, + sve_costs); + if (latency) + return latency; + } + + /* Handle scalar costs. */ + if (vec_flags == 0) + { + if (FLOAT_TYPE_P (vectype)) + return vec_costs->scalar_fp_stmt_cost; + return vec_costs->scalar_int_stmt_cost; + } + + /* Otherwise, the loop body just contains normal integer or FP operations, + with a vector reduction outside the loop. */ + const simd_vec_cost *simd_costs + = aarch64_simd_vec_costs_for_flags (vec_flags); + if (FLOAT_TYPE_P (vectype)) + return simd_costs->fp_stmt_cost; + return simd_costs->int_stmt_cost; +} + /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost for STMT_INFO, which has cost kind KIND. If this is a scalar operation, try to subdivide the target-independent categorization provided by KIND @@ -14729,6 +15030,203 @@ aarch64_adjust_stmt_cost (vect_cost_for_stmt kind, stmt_vec_info stmt_info, return stmt_cost; } +/* VINFO, COSTS, COUNT, KIND, STMT_INFO and VECTYPE are the same as for + TARGET_VECTORIZE_ADD_STMT_COST and they describe an operation in the + body of a vector loop. Record issue information relating to the vector + operation in OPS, where OPS is one of COSTS->scalar_ops, COSTS->advsimd_ops + or COSTS->sve_ops; see the comments above those variables for details. + In addition: + + - VEC_FLAGS is zero if OPS is COSTS->scalar_ops. + + - VEC_FLAGS & VEC_ADVSIMD is nonzero if OPS is COSTS->advsimd_ops. + + - VEC_FLAGS & VEC_ANY_SVE is nonzero if OPS is COSTS->sve_ops. + + ISSUE_INFO provides the scalar, Advanced SIMD or SVE issue information + associated with OPS and VEC_FLAGS. FACTOR says how many iterations of + the loop described by VEC_FLAGS would be needed to match one iteration + of the vector loop in VINFO. 
*/ +static void +aarch64_count_ops (class vec_info *vinfo, aarch64_vector_costs *costs, + unsigned int count, enum vect_cost_for_stmt kind, + _stmt_vec_info *stmt_info, tree vectype, + unsigned int vec_flags, aarch64_vec_op_count *ops, + const aarch64_base_vec_issue_info *issue_info, + unsigned int factor) +{ + if (!issue_info) + return; + + const aarch64_simd_vec_issue_info *simd_issue = nullptr; + if (vec_flags) + simd_issue = static_cast (issue_info); + + const aarch64_sve_vec_issue_info *sve_issue = nullptr; + if (vec_flags & VEC_ANY_SVE) + sve_issue = static_cast (issue_info); + + /* Calculate the minimum cycles per iteration imposed by a reduction + operation. */ + if ((kind == vector_stmt || kind == vec_to_scalar) + && aarch64_is_reduction (stmt_info)) + { + unsigned int base + = aarch64_in_loop_reduction_latency (vinfo, stmt_info, vectype, + vec_flags); + if (aarch64_reduc_type (vinfo, stmt_info) == FOLD_LEFT_REDUCTION) + { + if (aarch64_sve_mode_p (TYPE_MODE (vectype))) + { + /* When costing an SVE FADDA, the vectorizer treats vec_to_scalar + as a single operation, whereas for Advanced SIMD it is a + per-element one. Increase the factor accordingly, both for + the reduction_latency calculation and for the op couting. */ + if (vec_flags & VEC_ADVSIMD) + factor = vect_nunits_for_cost (vectype); + } + else + /* An Advanced SIMD fold-left reduction is the same as a + scalar one and the vectorizer therefore treats vec_to_scalar + as a per-element cost. There is no extra factor to apply for + scalar code, either for reduction_latency or for the op + counting below. */ + factor = 1; + } + + /* ??? Ideally for vector code we'd do COUNT * FACTOR reductions in + parallel, but unfortunately that's not yet the case. */ + ops->reduction_latency = MAX (ops->reduction_latency, + base * count * factor); + } + + /* Assume that multiply-adds will become a single operation. */ + if (stmt_info && aarch64_multiply_add_p (vinfo, stmt_info)) + return; + + /* When costing scalar statements in vector code, the count already + includes the number of scalar elements in the vector, so we don't + need to apply the factor as well. */ + if (kind == scalar_load || kind == scalar_store || kind == scalar_stmt) + factor = 1; + + /* This can go negative with the load handling below. */ + int num_copies = count * factor; + + /* Count the basic operation cost associated with KIND. */ + switch (kind) + { + case cond_branch_taken: + case cond_branch_not_taken: + case vector_gather_load: + case vector_scatter_store: + /* We currently don't expect these to be used in a loop body. */ + break; + + case vec_perm: + case vec_promote_demote: + case vec_construct: + case vec_to_scalar: + case scalar_to_vec: + /* Assume that these operations have no overhead in the original + scalar code. */ + if (!vec_flags) + break; + /* Fallthrough. */ + case vector_stmt: + case scalar_stmt: + ops->general_ops += num_copies; + break; + + case scalar_load: + case vector_load: + case unaligned_load: + /* When costing scalars, detect cases in which we are called twice for + the same load. This happens for LD[234] operations if only some of + the results are used. The first time represents the cost of loading + the unused vectors, while the second time represents the cost of + loading the useful parts. Only the latter should count towards the + scalar costs. 
*/ + if (stmt_info && !vec_flags) + { + bool existed = false; + unsigned int &prev_count + = costs->seen_loads.get_or_insert (stmt_info, &existed); + if (existed) + num_copies -= prev_count; + else + prev_count = num_copies; + } + ops->loads += num_copies; + if (vec_flags || FLOAT_TYPE_P (vectype)) + ops->general_ops += issue_info->fp_simd_load_general_ops * num_copies; + break; + + case vector_store: + case unaligned_store: + case scalar_store: + ops->stores += num_copies; + if (vec_flags || FLOAT_TYPE_P (vectype)) + ops->general_ops += issue_info->fp_simd_store_general_ops * num_copies; + break; + } + + /* Add any embedded comparison operations. */ + if ((kind == scalar_stmt || kind == vector_stmt || kind == vec_to_scalar) + && aarch64_embedded_comparison_type (stmt_info)) + ops->general_ops += num_copies; + + /* Detect COND_REDUCTIONs and things that would need to become + COND_REDUCTIONs if they were implemented using Advanced SIMD. + There are then two sets of VEC_COND_EXPRs, whereas so far we + have only accounted for one. */ + if (vec_flags && (kind == vector_stmt || kind == vec_to_scalar)) + { + int reduc_type = aarch64_reduc_type (vinfo, stmt_info); + if ((reduc_type == EXTRACT_LAST_REDUCTION && (vec_flags & VEC_ADVSIMD)) + || reduc_type == COND_REDUCTION) + ops->general_ops += num_copies; + } + + /* Count the predicate operations needed by an SVE comparison. */ + if (sve_issue && (kind == vector_stmt || kind == vec_to_scalar)) + if (tree type = aarch64_comparison_type (stmt_info)) + { + unsigned int base = (FLOAT_TYPE_P (type) + ? sve_issue->fp_cmp_pred_ops + : sve_issue->int_cmp_pred_ops); + costs->sve_ops.pred_ops += base * num_copies; + } + + /* Add any extra overhead associated with LD[234] and ST[234] operations. */ + if (simd_issue) + switch (aarch64_ld234_st234_vectors (kind, stmt_info)) + { + case 2: + ops->general_ops += simd_issue->ld2_st2_general_ops * num_copies; + break; + + case 3: + ops->general_ops += simd_issue->ld3_st3_general_ops * num_copies; + break; + + case 4: + ops->general_ops += simd_issue->ld4_st4_general_ops * num_copies; + break; + } + + /* Add any overhead associated with gather loads and scatter stores. */ + if (sve_issue + && (kind == scalar_load || kind == scalar_store) + && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER) + { + unsigned int pairs = CEIL (count, 2); + costs->sve_ops.pred_ops + += sve_issue->gather_scatter_pair_pred_ops * pairs; + ops->general_ops += sve_issue->gather_scatter_pair_general_ops * pairs; + } +} + /* Implement targetm.vectorize.add_stmt_cost. */ static unsigned aarch64_add_stmt_cost (class vec_info *vinfo, void *data, int count, @@ -14760,6 +15258,9 @@ aarch64_add_stmt_cost (class vec_info *vinfo, void *data, int count, of just looking at KIND. */ if (stmt_info && aarch64_use_new_vector_costs_p ()) { + if (vectype && aarch64_sve_only_stmt_p (stmt_info, vectype)) + costs->saw_sve_only_op = true; + stmt_cost = aarch64_detect_scalar_stmt_subtype (vinfo, kind, stmt_info, stmt_cost); @@ -14781,6 +15282,44 @@ aarch64_add_stmt_cost (class vec_info *vinfo, void *data, int count, stmt_cost = aarch64_adjust_stmt_cost (kind, stmt_info, vectype, stmt_cost); + /* If we're recording a nonzero vector loop body cost, also estimate + the operations that would need to be issued by all relevant + implementations of the loop. 
*/ + auto *issue_info = aarch64_tune_params.vec_costs->issue_info; + if (loop_vinfo + && issue_info + && costs->vec_flags + && where == vect_body + && vectype + && stmt_cost != 0) + { + /* Record estimates for the scalar code. */ + aarch64_count_ops (vinfo, costs, count, kind, stmt_info, vectype, + 0, &costs->scalar_ops, issue_info->scalar, + vect_nunits_for_cost (vectype)); + + if (aarch64_sve_mode_p (vinfo->vector_mode) && issue_info->sve) + { + /* Record estimates for a possible Advanced SIMD version + of the SVE code. */ + aarch64_count_ops (vinfo, costs, count, kind, stmt_info, + vectype, VEC_ADVSIMD, &costs->advsimd_ops, + issue_info->advsimd, + aarch64_estimated_sve_vq ()); + + /* Record estimates for the SVE code itself. */ + aarch64_count_ops (vinfo, costs, count, kind, stmt_info, + vectype, VEC_ANY_SVE, &costs->sve_ops, + issue_info->sve, 1); + } + else + /* Record estimates for the Advanced SIMD code. Treat SVE like + Advanced SIMD if the CPU has no specific SVE costs. */ + aarch64_count_ops (vinfo, costs, count, kind, stmt_info, + vectype, VEC_ADVSIMD, &costs->advsimd_ops, + issue_info->advsimd, 1); + } + /* If we're applying the SVE vs. Advanced SIMD unrolling heuristic, estimate the number of statements in the unrolled Advanced SIMD loop. For simplicitly, we assume that one iteration of the @@ -14805,12 +15344,56 @@ aarch64_add_stmt_cost (class vec_info *vinfo, void *data, int count, return retval; } +/* Dump information about the structure. */ +void +aarch64_vec_op_count::dump () const +{ + dump_printf_loc (MSG_NOTE, vect_location, + " load operations = %d\n", loads); + dump_printf_loc (MSG_NOTE, vect_location, + " store operations = %d\n", stores); + dump_printf_loc (MSG_NOTE, vect_location, + " general operations = %d\n", general_ops); + dump_printf_loc (MSG_NOTE, vect_location, + " reduction latency = %d\n", reduction_latency); +} + +/* Dump information about the structure. */ +void +aarch64_sve_op_count::dump () const +{ + aarch64_vec_op_count::dump (); + dump_printf_loc (MSG_NOTE, vect_location, + " predicate operations = %d\n", pred_ops); +} + +/* Use ISSUE_INFO to estimate the minimum number of cycles needed to issue + the operations described by OPS. This is a very simplistic model! */ +static unsigned int +aarch64_estimate_min_cycles_per_iter + (const aarch64_vec_op_count *ops, + const aarch64_base_vec_issue_info *issue_info) +{ + unsigned int cycles = MAX (ops->reduction_latency, 1); + cycles = MAX (cycles, CEIL (ops->stores, issue_info->stores_per_cycle)); + cycles = MAX (cycles, CEIL (ops->loads + ops->stores, + issue_info->loads_stores_per_cycle)); + cycles = MAX (cycles, CEIL (ops->general_ops, + issue_info->general_ops_per_cycle)); + return cycles; +} + /* BODY_COST is the cost of a vector loop body recorded in COSTS. Adjust the cost as necessary and return the new cost. 
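    As a rough illustration of the issue model used below: with the
    Neoverse V1 scalar issue info defined above (3 loads/stores, 2 stores
    and 4 general ops per cycle), an iteration containing, say, 4 loads,
    1 store, 6 general ops and no reduction would be estimated by
    aarch64_estimate_min_cycles_per_iter at
    MAX (CEIL (1, 2), CEIL (5, 3), CEIL (6, 4)) = 2 cycles.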
*/ static unsigned int aarch64_adjust_body_cost (aarch64_vector_costs *costs, unsigned int body_cost) { unsigned int orig_body_cost = body_cost; + bool should_disparage = false; + + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "Original vector body cost = %d\n", body_cost); if (costs->unrolled_advsimd_stmts) { @@ -14841,10 +15424,203 @@ aarch64_adjust_body_cost (aarch64_vector_costs *costs, unsigned int body_cost) "Increasing body cost to %d to account for" " unrolling\n", min_cost); body_cost = min_cost; + should_disparage = true; } } } + auto *issue_info = aarch64_tune_params.vec_costs->issue_info; + if (!issue_info) + return body_cost; + + unsigned int scalar_cycles_per_iter + = aarch64_estimate_min_cycles_per_iter (&costs->scalar_ops, + issue_info->scalar); + unsigned int advsimd_cycles_per_iter + = aarch64_estimate_min_cycles_per_iter (&costs->advsimd_ops, + issue_info->advsimd); + bool could_use_advsimd + = ((costs->vec_flags & VEC_ADVSIMD) + || (aarch64_autovec_preference != 2 + && (aarch64_tune_params.extra_tuning_flags + & AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT) + && !costs->saw_sve_only_op)); + + if (dump_enabled_p ()) + { + if (IN_RANGE (costs->num_vector_iterations, 0, 65536)) + dump_printf_loc (MSG_NOTE, vect_location, + "Vector loop iterates at most %wd times\n", + costs->num_vector_iterations); + dump_printf_loc (MSG_NOTE, vect_location, "Scalar issue estimate:\n"); + costs->scalar_ops.dump (); + dump_printf_loc (MSG_NOTE, vect_location, + " estimated cycles per iteration = %d\n", + scalar_cycles_per_iter); + if (could_use_advsimd) + { + dump_printf_loc (MSG_NOTE, vect_location, + "Advanced SIMD issue estimate:\n"); + costs->advsimd_ops.dump (); + dump_printf_loc (MSG_NOTE, vect_location, + " estimated cycles per iteration = %d\n", + advsimd_cycles_per_iter); + } + else + dump_printf_loc (MSG_NOTE, vect_location, + "Loop could not use Advanced SIMD\n"); + } + + uint64_t vector_cycles_per_iter = advsimd_cycles_per_iter; + unsigned int vector_reduction_latency = costs->advsimd_ops.reduction_latency; + if ((costs->vec_flags & VEC_ANY_SVE) && issue_info->sve) + { + /* Estimate the minimum number of cycles per iteration needed to issue + non-predicate operations. */ + unsigned int sve_cycles_per_iter + = aarch64_estimate_min_cycles_per_iter (&costs->sve_ops, + issue_info->sve); + + /* Separately estimate the minimum number of cycles per iteration needed + to issue the predicate operations. */ + unsigned int pred_cycles_per_iter + = CEIL (costs->sve_ops.pred_ops, issue_info->sve->pred_ops_per_cycle); + + if (dump_enabled_p ()) + { + dump_printf_loc (MSG_NOTE, vect_location, "SVE issue estimate:\n"); + costs->sve_ops.dump (); + dump_printf_loc (MSG_NOTE, vect_location, + " estimated cycles per iteration for non-predicate" + " operations = %d\n", sve_cycles_per_iter); + if (costs->sve_ops.pred_ops) + dump_printf_loc (MSG_NOTE, vect_location, " estimated cycles per" + " iteration for predicate operations = %d\n", + pred_cycles_per_iter); + } + + vector_cycles_per_iter = MAX (sve_cycles_per_iter, pred_cycles_per_iter); + vector_reduction_latency = costs->sve_ops.reduction_latency; + + /* If the scalar version of the loop could issue at least as + quickly as the predicate parts of the SVE loop, make the SVE loop + prohibitively expensive. In this case vectorization is adding an + overhead that the original scalar code didn't have. 
+ + This is mostly intended to detect cases in which WHILELOs dominate + for very tight loops, which is something that normal latency-based + costs would not model. Adding this kind of cliffedge would be + too drastic for scalar_cycles_per_iter vs. sve_cycles_per_iter; + code later in the function handles that case in a more + conservative way. */ + uint64_t sve_estimate = pred_cycles_per_iter + 1; + if (scalar_cycles_per_iter < sve_estimate) + { + unsigned int min_cost + = orig_body_cost * estimated_poly_value (BYTES_PER_SVE_VECTOR); + if (body_cost < min_cost) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "Increasing body cost to %d because the" + " scalar code could issue within the limit" + " imposed by predicate operations\n", + min_cost); + body_cost = min_cost; + should_disparage = true; + } + } + + /* If it appears that the Advanced SIMD version of a loop could issue + more quickly than the SVE one, increase the SVE cost in proportion + to the difference. The intention is to make Advanced SIMD preferable + in cases where an Advanced SIMD version exists, without increasing + the costs so much that SVE won't be used at all. + + The reasoning is similar to the scalar vs. predicate comparison above: + if the issue rate of the SVE code is limited by predicate operations + (i.e. if pred_cycles_per_iter > sve_cycles_per_iter), and if the + Advanced SIMD code could issue within the limit imposed by the + predicate operations, the predicate operations are adding an + overhead that the original code didn't have and so we should prefer + the Advanced SIMD version. However, if the predicate operations + do not dominate in this way, we should only increase the cost of + the SVE code if sve_cycles_per_iter is strictly greater than + advsimd_cycles_per_iter. Given rounding effects, this should mean + that Advanced SIMD is either better or at least no worse. */ + if (sve_cycles_per_iter >= pred_cycles_per_iter) + sve_estimate = sve_cycles_per_iter; + if (could_use_advsimd && advsimd_cycles_per_iter < sve_estimate) + { + /* This ensures that min_cost > orig_body_cost * 2. */ + unsigned int min_cost + = orig_body_cost * CEIL (sve_estimate, advsimd_cycles_per_iter) + 1; + if (body_cost < min_cost) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "Increasing body cost to %d because Advanced" + " SIMD code could issue as quickly\n", + min_cost); + body_cost = min_cost; + should_disparage = true; + } + } + } + + /* Decide whether to stick to latency-based costs or whether to try to + take issue rates into account. */ + unsigned int threshold = aarch64_loop_vect_issue_rate_niters; + if (costs->vec_flags & VEC_ANY_SVE) + threshold = CEIL (threshold, aarch64_estimated_sve_vq ()); + + if (costs->num_vector_iterations >= 1 + && costs->num_vector_iterations < threshold) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "Low iteration count, so using pure latency" + " costs\n"); + } + /* Increase the cost of the vector code if it looks like the scalar code + could issue more quickly. These values are only rough estimates, + so minor differences should only result in minor changes. 
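+     For example, if the scalar code is estimated to issue an iteration
+     in 2 cycles while the vector code needs 3, the vector body cost is
+     scaled by 3/2, rounding up.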
*/ + else if (scalar_cycles_per_iter < vector_cycles_per_iter) + { + body_cost = CEIL (body_cost * vector_cycles_per_iter, + scalar_cycles_per_iter); + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "Increasing body cost to %d because scalar code" + " would issue more quickly\n", body_cost); + } + /* In general, it's expected that the proposed vector code would be able + to issue more quickly than the original scalar code. This should + already be reflected to some extent in the latency-based costs. + + However, the latency-based costs effectively assume that the scalar + code and the vector code execute serially, which tends to underplay + one important case: if the real (non-serialized) execution time of + a scalar iteration is dominated by loop-carried dependencies, + and if the vector code is able to reduce both the length of + the loop-carried dependencies *and* the number of cycles needed + to issue the code in general, we can be more confident that the + vector code is an improvement, even if adding the other (non-loop-carried) + latencies tends to hide this saving. We therefore reduce the cost of the + vector loop body in proportion to the saving. */ + else if (costs->scalar_ops.reduction_latency > vector_reduction_latency + && costs->scalar_ops.reduction_latency == scalar_cycles_per_iter + && scalar_cycles_per_iter > vector_cycles_per_iter + && !should_disparage) + { + body_cost = CEIL (body_cost * vector_cycles_per_iter, + scalar_cycles_per_iter); + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "Decreasing body cost to %d account for smaller" + " reduction latency\n", body_cost); + } + return body_cost; } diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt index 91e5c6151a7..32191cf1acf 100644 --- a/gcc/config/aarch64/aarch64.opt +++ b/gcc/config/aarch64/aarch64.opt @@ -277,3 +277,6 @@ The number of Newton iterations for calculating the reciprocal for double type. -param=aarch64-autovec-preference= Target Joined UInteger Var(aarch64_autovec_preference) Init(0) IntegerRange(0, 4) Param + +-param=aarch64-loop-vect-issue-rate-niters= +Target Joined UInteger Var(aarch64_loop_vect_issue_rate_niters) Init(6) IntegerRange(0, 65536) Param diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index ca204c7c3d8..7f1231303bf 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -14417,6 +14417,14 @@ Use both Advanced SIMD and SVE. Prefer SVE when the costs are deemed equal. @end table The default value is 0. +@item aarch64-loop-vect-issue-rate-niters +The tuning for some AArch64 CPUs tries to take both latencies and issue +rates into account when deciding whether a loop should be vectorized +using SVE, vectorized using Advanced SIMD, or not vectorized at all. +If this parameter is set to @var{n}, GCC will not use this heuristic +for loops that are known to execute in fewer than @var{n} Advanced +SIMD iterations. + @end table @end table