diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h index 2ffa96ec24b..ca1ed9e8758 100644 --- a/gcc/config/aarch64/aarch64-protos.h +++ b/gcc/config/aarch64/aarch64-protos.h @@ -281,19 +281,177 @@ struct sve_vec_cost : simd_vec_cost const int scatter_store_elt_cost; }; +/* Base information about how the CPU issues code, containing + information that is relevant to scalar, Advanced SIMD and SVE + operations. + + The structure uses the general term "operation" to refer to + whichever subdivision of an instruction makes sense for the CPU. + These operations would typically be micro operations or macro + operations. + + Note that this structure and the ones derived from it are only + as general as they need to be for the CPUs that currently use them. + They will probably need to be extended or refined as more CPUs are + added. */ +struct aarch64_base_vec_issue_info +{ + /* How many loads and stores can be issued per cycle. */ + const unsigned int loads_stores_per_cycle; + + /* How many stores can be issued per cycle. */ + const unsigned int stores_per_cycle; + + /* How many integer or FP/SIMD operations can be issued per cycle. + + Currently we don't try to distinguish the two. For vector code, + we only really track FP/SIMD operations during vector costing; + we don't for example try to cost arithmetic operations like + address calculations, which are only decided later during ivopts. + + For scalar code, we effectively assume that code operates entirely + on integers or entirely on floating-point values. Again, we don't + try to take address calculations into account. + + This is not very precise, but it's only meant to be a heuristic. + We could certainly try to do better in future if there's an example + of something that would benefit. */ + const unsigned int general_ops_per_cycle; + + /* How many FP/SIMD operations to count for a floating-point or + vector load operation. + + When constructing an Advanced SIMD vector from elements that have + been loaded from memory, these values apply to each individual load. + When using an SVE gather load, the values apply to each element of + the gather. */ + const unsigned int fp_simd_load_general_ops; + + /* How many FP/SIMD operations to count for a floating-point or + vector store operation. + + When storing individual elements of an Advanced SIMD vector out to + memory, these values apply to each individual store. When using an + SVE scatter store, these values apply to each element of the scatter. */ + const unsigned int fp_simd_store_general_ops; +}; + +using aarch64_scalar_vec_issue_info = aarch64_base_vec_issue_info; + +/* Base information about the issue stage for vector operations. + This structure contains information that is relevant to both + Advanced SIMD and SVE. */ +struct aarch64_simd_vec_issue_info : aarch64_base_vec_issue_info +{ + constexpr aarch64_simd_vec_issue_info (aarch64_base_vec_issue_info base, + unsigned int ld2_st2_general_ops, + unsigned int ld3_st3_general_ops, + unsigned int ld4_st4_general_ops) + : aarch64_base_vec_issue_info (base), + ld2_st2_general_ops (ld2_st2_general_ops), + ld3_st3_general_ops (ld3_st3_general_ops), + ld4_st4_general_ops (ld4_st4_general_ops) + {} + + /* How many FP/SIMD operations to count for each vector loaded or + stored by an LD[234] or ST[234] operation, in addition to the + base costs given in the parent class. 
For example, the full + number of operations for an LD3 would be: + + load ops: 3 + general ops: 3 * (fp_simd_load_general_ops + ld3_st3_general_ops). */ + const unsigned int ld2_st2_general_ops; + const unsigned int ld3_st3_general_ops; + const unsigned int ld4_st4_general_ops; +}; + +using aarch64_advsimd_vec_issue_info = aarch64_simd_vec_issue_info; + +/* Information about the issue stage for SVE. The main thing this adds + is a concept of "predicate operations". */ +struct aarch64_sve_vec_issue_info : aarch64_simd_vec_issue_info +{ + constexpr aarch64_sve_vec_issue_info + (aarch64_simd_vec_issue_info base, + unsigned int pred_ops_per_cycle, + unsigned int while_pred_ops, + unsigned int int_cmp_pred_ops, + unsigned int fp_cmp_pred_ops, + unsigned int gather_scatter_pair_general_ops, + unsigned int gather_scatter_pair_pred_ops) + : aarch64_simd_vec_issue_info (base), + pred_ops_per_cycle (pred_ops_per_cycle), + while_pred_ops (while_pred_ops), + int_cmp_pred_ops (int_cmp_pred_ops), + fp_cmp_pred_ops (fp_cmp_pred_ops), + gather_scatter_pair_general_ops (gather_scatter_pair_general_ops), + gather_scatter_pair_pred_ops (gather_scatter_pair_pred_ops) + {} + + /* How many predicate operations can be issued per cycle. */ + const unsigned int pred_ops_per_cycle; + + /* How many predicate operations are generated by a WHILExx + instruction. */ + const unsigned int while_pred_ops; + + /* How many predicate operations are generated by an integer + comparison instruction. */ + const unsigned int int_cmp_pred_ops; + + /* How many predicate operations are generated by a floating-point + comparison instruction. */ + const unsigned int fp_cmp_pred_ops; + + /* How many general and predicate operations are generated by each pair + of elements in a gather load or scatter store. These values apply + on top of the per-element counts recorded in fp_simd_load_general_ops + and fp_simd_store_general_ops. + + The reason for using pairs is that that is the largest possible + granule size for 128-bit SVE, which can load and store 2 64-bit + elements or 4 32-bit elements. */ + const unsigned int gather_scatter_pair_general_ops; + const unsigned int gather_scatter_pair_pred_ops; +}; + +/* Information related to instruction issue for a particular CPU. */ +struct aarch64_vec_issue_info +{ + const aarch64_base_vec_issue_info *const scalar; + const aarch64_simd_vec_issue_info *const advsimd; + const aarch64_sve_vec_issue_info *const sve; +}; + /* Cost for vector insn classes. */ struct cpu_vector_cost { - const int scalar_int_stmt_cost; /* Cost of any int scalar operation, - excluding load and store. */ - const int scalar_fp_stmt_cost; /* Cost of any fp scalar operation, - excluding load and store. */ - const int scalar_load_cost; /* Cost of scalar load. */ - const int scalar_store_cost; /* Cost of scalar store. */ - const int cond_taken_branch_cost; /* Cost of taken branch. */ - const int cond_not_taken_branch_cost; /* Cost of not taken branch. */ - const advsimd_vec_cost *advsimd; /* Cost of Advanced SIMD operations. */ - const sve_vec_cost *sve; /* Cost of SVE operations. */ + /* Cost of any integer scalar operation, excluding load and store. */ + const int scalar_int_stmt_cost; + + /* Cost of any fp scalar operation, excluding load and store. */ + const int scalar_fp_stmt_cost; + + /* Cost of a scalar load. */ + const int scalar_load_cost; + + /* Cost of a scalar store. */ + const int scalar_store_cost; + + /* Cost of a taken branch. */ + const int cond_taken_branch_cost; + + /* Cost of a not-taken branch. 
*/ + const int cond_not_taken_branch_cost; + + /* Cost of an Advanced SIMD operations. */ + const advsimd_vec_cost *advsimd; + + /* Cost of an SVE operations, or null if SVE is not implemented. */ + const sve_vec_cost *sve; + + /* Issue information, or null if none is provided. */ + const aarch64_vec_issue_info *const issue_info; }; /* Branch costs. */ diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 6d18d82079c..6d961bea5dc 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -74,6 +74,8 @@ #include "intl.h" #include "expmed.h" #include "function-abi.h" +#include "gimple-pretty-print.h" +#include "tree-ssa-loop-niter.h" /* This file should be included last. */ #include "target-def.h" @@ -652,7 +654,8 @@ static const struct cpu_vector_cost generic_vector_cost = 3, /* cond_taken_branch_cost */ 1, /* cond_not_taken_branch_cost */ &generic_advsimd_vector_cost, /* advsimd */ - &generic_sve_vector_cost /* sve */ + &generic_sve_vector_cost, /* sve */ + nullptr /* issue_info */ }; static const advsimd_vec_cost a64fx_advsimd_vector_cost = @@ -719,7 +722,8 @@ static const struct cpu_vector_cost a64fx_vector_cost = 3, /* cond_taken_branch_cost */ 1, /* cond_not_taken_branch_cost */ &a64fx_advsimd_vector_cost, /* advsimd */ - &a64fx_sve_vector_cost /* sve */ + &a64fx_sve_vector_cost, /* sve */ + nullptr /* issue_info */ }; static const advsimd_vec_cost qdf24xx_advsimd_vector_cost = @@ -756,7 +760,8 @@ static const struct cpu_vector_cost qdf24xx_vector_cost = 3, /* cond_taken_branch_cost */ 1, /* cond_not_taken_branch_cost */ &qdf24xx_advsimd_vector_cost, /* advsimd */ - NULL /* sve */ + nullptr, /* sve */ + nullptr /* issue_info */ }; @@ -794,7 +799,8 @@ static const struct cpu_vector_cost thunderx_vector_cost = 3, /* cond_taken_branch_cost */ 3, /* cond_not_taken_branch_cost */ &thunderx_advsimd_vector_cost, /* advsimd */ - NULL /* sve */ + nullptr, /* sve */ + nullptr /* issue_info */ }; static const advsimd_vec_cost tsv110_advsimd_vector_cost = @@ -830,7 +836,8 @@ static const struct cpu_vector_cost tsv110_vector_cost = 1, /* cond_taken_branch_cost */ 1, /* cond_not_taken_branch_cost */ &tsv110_advsimd_vector_cost, /* advsimd */ - NULL, /* sve */ + nullptr, /* sve */ + nullptr /* issue_info */ }; static const advsimd_vec_cost cortexa57_advsimd_vector_cost = @@ -867,7 +874,8 @@ static const struct cpu_vector_cost cortexa57_vector_cost = 1, /* cond_taken_branch_cost */ 1, /* cond_not_taken_branch_cost */ &cortexa57_advsimd_vector_cost, /* advsimd */ - NULL /* sve */ + nullptr, /* sve */ + nullptr /* issue_info */ }; static const advsimd_vec_cost exynosm1_advsimd_vector_cost = @@ -903,7 +911,8 @@ static const struct cpu_vector_cost exynosm1_vector_cost = 1, /* cond_taken_branch_cost */ 1, /* cond_not_taken_branch_cost */ &exynosm1_advsimd_vector_cost, /* advsimd */ - NULL /* sve */ + nullptr, /* sve */ + nullptr /* issue_info */ }; static const advsimd_vec_cost xgene1_advsimd_vector_cost = @@ -940,7 +949,8 @@ static const struct cpu_vector_cost xgene1_vector_cost = 2, /* cond_taken_branch_cost */ 1, /* cond_not_taken_branch_cost */ &xgene1_advsimd_vector_cost, /* advsimd */ - NULL /* sve */ + nullptr, /* sve */ + nullptr /* issue_info */ }; static const advsimd_vec_cost thunderx2t99_advsimd_vector_cost = @@ -977,7 +987,8 @@ static const struct cpu_vector_cost thunderx2t99_vector_cost = 2, /* cond_taken_branch_cost */ 1, /* cond_not_taken_branch_cost */ &thunderx2t99_advsimd_vector_cost, /* advsimd */ - NULL /* sve */ + nullptr, /* sve */ + 
nullptr /* issue_info */ }; static const advsimd_vec_cost thunderx3t110_advsimd_vector_cost = @@ -1013,7 +1024,8 @@ static const struct cpu_vector_cost thunderx3t110_vector_cost = 2, /* cond_taken_branch_cost */ 1, /* cond_not_taken_branch_cost */ &thunderx3t110_advsimd_vector_cost, /* advsimd */ - NULL /* sve */ + nullptr, /* sve */ + nullptr /* issue_info */ }; @@ -1696,6 +1708,58 @@ static const sve_vec_cost neoversev1_sve_vector_cost = 3 /* scatter_store_elt_cost */ }; +static const aarch64_scalar_vec_issue_info neoversev1_scalar_issue_info = +{ + 3, /* loads_stores_per_cycle */ + 2, /* stores_per_cycle */ + 4, /* general_ops_per_cycle */ + 0, /* fp_simd_load_general_ops */ + 1 /* fp_simd_store_general_ops */ +}; + +static const aarch64_advsimd_vec_issue_info neoversev1_advsimd_issue_info = +{ + { + 3, /* loads_stores_per_cycle */ + 2, /* stores_per_cycle */ + 4, /* general_ops_per_cycle */ + 0, /* fp_simd_load_general_ops */ + 1 /* fp_simd_store_general_ops */ + }, + 2, /* ld2_st2_general_ops */ + 2, /* ld3_st3_general_ops */ + 3 /* ld4_st4_general_ops */ +}; + +static const aarch64_sve_vec_issue_info neoversev1_sve_issue_info = +{ + { + { + 2, /* loads_per_cycle */ + 2, /* stores_per_cycle */ + 2, /* general_ops_per_cycle */ + 0, /* fp_simd_load_general_ops */ + 1 /* fp_simd_store_general_ops */ + }, + 2, /* ld2_st2_general_ops */ + 2, /* ld3_st3_general_ops */ + 3 /* ld4_st4_general_ops */ + }, + 1, /* pred_ops_per_cycle */ + 2, /* while_pred_ops */ + 2, /* int_cmp_pred_ops */ + 1, /* fp_cmp_pred_ops */ + 1, /* gather_scatter_pair_general_ops */ + 1 /* gather_scatter_pair_pred_ops */ +}; + +static const aarch64_vec_issue_info neoversev1_vec_issue_info = +{ + &neoversev1_scalar_issue_info, + &neoversev1_advsimd_issue_info, + &neoversev1_sve_issue_info +}; + /* Neoverse V1 costs for vector insn classes. */ static const struct cpu_vector_cost neoversev1_vector_cost = { @@ -1706,7 +1770,8 @@ static const struct cpu_vector_cost neoversev1_vector_cost = 1, /* cond_taken_branch_cost */ 1, /* cond_not_taken_branch_cost */ &neoversev1_advsimd_vector_cost, /* advsimd */ - &neoversev1_sve_vector_cost /* sve */ + &neoversev1_sve_vector_cost, /* sve */ + &neoversev1_vec_issue_info /* issue_info */ }; static const struct tune_params neoversev1_tunings = @@ -14120,6 +14185,38 @@ aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn, /* Vectorizer cost model target hooks. */ +/* Information about how the CPU would issue the scalar, Advanced SIMD + or SVE version of a vector loop, using the scheme defined by the + aarch64_base_vec_issue_info hierarchy of structures. */ +struct aarch64_vec_op_count +{ + void dump () const; + + /* The number of individual "general" operations. See the comments + in aarch64_base_vec_issue_info for details. */ + unsigned int general_ops = 0; + + /* The number of load and store operations, under the same scheme + as above. */ + unsigned int loads = 0; + unsigned int stores = 0; + + /* The minimum number of cycles needed to execute all loop-carried + operations, which in the vector code become associated with + reductions. */ + unsigned int reduction_latency = 0; +}; + +/* Extends aarch64_vec_op_count with SVE-specific information. */ +struct aarch64_sve_op_count : aarch64_vec_op_count +{ + void dump () const; + + /* The number of individual predicate operations. See the comments + in aarch64_sve_vec_issue_info for details. */ + unsigned int pred_ops = 0; +}; + /* Information about vector code that we're in the process of costing. 
*/ struct aarch64_vector_costs { @@ -14138,6 +14235,10 @@ struct aarch64_vector_costs vectorization. */ bool is_loop = false; + /* True if we've seen an SVE operation that we cannot currently vectorize + using Advanced SIMD. */ + bool saw_sve_only_op = false; + /* - If VEC_FLAGS is zero then we're costing the original scalar code. - If VEC_FLAGS & VEC_ADVSIMD is nonzero then we're costing Advanced SIMD code. @@ -14159,6 +14260,32 @@ struct aarch64_vector_costs the heuristic. */ unsigned HOST_WIDE_INT unrolled_advsimd_niters = 0; unsigned HOST_WIDE_INT unrolled_advsimd_stmts = 0; + + /* If we're vectorizing a loop that executes a constant number of times, + this variable gives the number of times that the vector loop would + iterate, otherwise it is zero. */ + uint64_t num_vector_iterations = 0; + + /* Used only when vectorizing loops. Estimates the number and kind of scalar + operations that would be needed to perform the same work as one iteration + of the vector loop. */ + aarch64_vec_op_count scalar_ops; + + /* Used only when vectorizing loops. If VEC_FLAGS & VEC_ADVSIMD, + this structure estimates the number and kind of operations that the + vector loop would contain. If VEC_FLAGS & VEC_SVE, the structure + estimates what the equivalent Advanced SIMD-only code would need in + order to perform the same work as one iteration of the SVE loop. */ + aarch64_vec_op_count advsimd_ops; + + /* Used only when vectorizing loops with SVE. It estimates the number and + kind of operations that the SVE loop would contain. */ + aarch64_sve_op_count sve_ops; + + /* Used to detect cases in which we end up costing the same load twice, + once to account for results that are actually used and once to account + for unused results. */ + hash_map, unsigned int> seen_loads; }; /* Implement TARGET_VECTORIZE_INIT_COST. */ @@ -14190,6 +14317,16 @@ aarch64_simd_vec_costs (tree vectype) return costs->advsimd; } +/* Return the appropriate SIMD costs for vectors with VEC_* flags FLAGS. */ +static const simd_vec_cost * +aarch64_simd_vec_costs_for_flags (unsigned int flags) +{ + const cpu_vector_cost *costs = aarch64_tune_params.vec_costs; + if ((flags & VEC_ANY_SVE) && costs->sve) + return costs->sve; + return costs->advsimd; +} + /* Decide whether to use the unrolling heuristic described above aarch64_vector_costs::unrolled_advsimd_niters, updating that field if so. LOOP_VINFO describes the loop that we're vectorizing @@ -14250,6 +14387,19 @@ aarch64_analyze_loop_vinfo (loop_vec_info loop_vinfo, { costs->is_loop = true; + /* Record the number of times that the vector loop would execute, + if known. */ + class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); + auto scalar_niters = max_stmt_executions_int (loop); + if (scalar_niters >= 0) + { + unsigned int vf = vect_vf_for_cost (loop_vinfo); + if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ()) + costs->num_vector_iterations = scalar_niters / vf; + else + costs->num_vector_iterations = CEIL (scalar_niters, vf); + } + /* Detect whether we're costing the scalar code or the vector code. This is a bit hacky: it would be better if the vectorizer told us directly. @@ -14265,6 +14415,20 @@ aarch64_analyze_loop_vinfo (loop_vec_info loop_vinfo, apply the unrolling heuristic described above aarch64_vector_costs::unrolled_advsimd_niters. */ aarch64_record_potential_advsimd_unrolling (loop_vinfo, costs); + + /* Record the issue information for any SVE WHILE instructions that the + loop needs. 
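+     The accounting below assumes one WHILE per mask: it sums the number
+     of masks over all the mask rgroups and multiplies the total by
+     while_pred_ops.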
*/ + auto *issue_info = aarch64_tune_params.vec_costs->issue_info; + if (issue_info->sve && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()) + { + unsigned int num_masks = 0; + rgroup_controls *rgm; + unsigned int num_vectors_m1; + FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm) + if (rgm->type) + num_masks += num_vectors_m1 + 1; + costs->sve_ops.pred_ops += num_masks * issue_info->sve->while_pred_ops; + } } /* Do one-time initialization of COSTS given that we're costing the block @@ -14407,6 +14571,17 @@ aarch64_embedded_comparison_type (stmt_vec_info stmt_info) return NULL_TREE; } +/* If STMT_INFO is a comparison or contains an embedded comparison, return the + scalar type of the values being compared. Return null otherwise. */ +static tree +aarch64_comparison_type (stmt_vec_info stmt_info) +{ + if (auto *assign = dyn_cast (stmt_info->stmt)) + if (TREE_CODE_CLASS (gimple_assign_rhs_code (assign)) == tcc_comparison) + return TREE_TYPE (gimple_assign_rhs1 (assign)); + return aarch64_embedded_comparison_type (stmt_info); +} + /* Return true if creating multiple copies of STMT_INFO for Advanced SIMD vectors would produce a series of LDP or STP operations. KIND is the kind of statement that STMT_INFO represents. */ @@ -14470,6 +14645,79 @@ aarch64_integer_truncation_p (stmt_vec_info stmt_info) && TYPE_PRECISION (lhs_type) < TYPE_PRECISION (rhs_type)); } +/* Return true if STMT_INFO is the second part of a two-statement multiply-add + or multiply-subtract sequence that might be suitable for fusing into a + single instruction. */ +static bool +aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info stmt_info) +{ + gassign *assign = dyn_cast (stmt_info->stmt); + if (!assign) + return false; + tree_code code = gimple_assign_rhs_code (assign); + if (code != PLUS_EXPR && code != MINUS_EXPR) + return false; + + if (CONSTANT_CLASS_P (gimple_assign_rhs1 (assign)) + || CONSTANT_CLASS_P (gimple_assign_rhs2 (assign))) + return false; + + for (int i = 1; i < 3; ++i) + { + tree rhs = gimple_op (assign, i); + /* ??? Should we try to check for a single use as well? */ + if (TREE_CODE (rhs) != SSA_NAME) + continue; + + stmt_vec_info def_stmt_info = vinfo->lookup_def (rhs); + if (!def_stmt_info + || STMT_VINFO_DEF_TYPE (def_stmt_info) != vect_internal_def) + continue; + gassign *rhs_assign = dyn_cast (def_stmt_info->stmt); + if (!rhs_assign || gimple_assign_rhs_code (rhs_assign) != MULT_EXPR) + continue; + + return true; + } + return false; +} + +/* Return true if the vectorized form of STMT_INFO is something that is only + possible when using SVE instead of Advanced SIMD. VECTYPE is the type of + the vector that STMT_INFO is operating on. */ +static bool +aarch64_sve_only_stmt_p (stmt_vec_info stmt_info, tree vectype) +{ + if (!aarch64_sve_mode_p (TYPE_MODE (vectype))) + return false; + + if (STMT_VINFO_DATA_REF (stmt_info)) + { + /* Check for true gathers and scatters (rather than just strided accesses + that we've chosen to implement using gathers and scatters). Although + in principle we could use elementwise accesses for Advanced SIMD, + the vectorizer doesn't yet support that. */ + if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)) + return true; + + /* Check for masked loads and stores. */ + if (auto *call = dyn_cast (stmt_info->stmt)) + if (gimple_call_internal_p (call) + && internal_fn_mask_index (gimple_call_internal_fn (call)) >= 0) + return true; + } + + /* Check for 64-bit integer multiplications. 
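+     Advanced SIMD integer MUL has no 64-bit element form, so a general
+     multiplication of DImode elements is treated here as something that
+     only SVE can vectorize.  Multiplications by a power of 2 are
+     excluded, since those can be implemented as shifts.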
*/ + auto *assign = dyn_cast (stmt_info->stmt); + if (assign + && gimple_assign_rhs_code (assign) == MULT_EXPR + && GET_MODE_INNER (TYPE_MODE (vectype)) == DImode + && !integer_pow2p (gimple_assign_rhs2 (assign))) + return true; + + return false; +} + /* We are considering implementing STMT_INFO using SVE vector type VECTYPE. If STMT_INFO is an in-loop reduction that SVE supports directly, return its latency in cycles, otherwise return zero. SVE_COSTS specifies the @@ -14507,6 +14755,59 @@ aarch64_sve_in_loop_reduction_latency (vec_info *vinfo, return 0; } +/* STMT_INFO describes a loop-carried operation in the original scalar code + that we are considering implementing as a reduction. Return one of the + following values, depending on VEC_FLAGS: + + - If VEC_FLAGS is zero, return the loop carry latency of the original + scalar operation. + + - If VEC_FLAGS & VEC_ADVSIMD, return the loop carry latency of the + the Advanced SIMD implementation. + + - If VEC_FLAGS & VEC_ANY_SVE, return the loop carry latency of the + SVE implementation. + + VECTYPE is the type of vector that the vectorizer is considering using + for STMT_INFO, which might be different from the type of vector described + by VEC_FLAGS. */ +static unsigned int +aarch64_in_loop_reduction_latency (vec_info *vinfo, stmt_vec_info stmt_info, + tree vectype, unsigned int vec_flags) +{ + const cpu_vector_cost *vec_costs = aarch64_tune_params.vec_costs; + const sve_vec_cost *sve_costs = nullptr; + if (vec_flags & VEC_ANY_SVE) + sve_costs = aarch64_tune_params.vec_costs->sve; + + /* If the caller is asking for the SVE latency, check for forms of reduction + that only SVE can handle directly. */ + if (sve_costs) + { + unsigned int latency + = aarch64_sve_in_loop_reduction_latency (vinfo, stmt_info, vectype, + sve_costs); + if (latency) + return latency; + } + + /* Handle scalar costs. */ + if (vec_flags == 0) + { + if (FLOAT_TYPE_P (vectype)) + return vec_costs->scalar_fp_stmt_cost; + return vec_costs->scalar_int_stmt_cost; + } + + /* Otherwise, the loop body just contains normal integer or FP operations, + with a vector reduction outside the loop. */ + const simd_vec_cost *simd_costs + = aarch64_simd_vec_costs_for_flags (vec_flags); + if (FLOAT_TYPE_P (vectype)) + return simd_costs->fp_stmt_cost; + return simd_costs->int_stmt_cost; +} + /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost for STMT_INFO, which has cost kind KIND. If this is a scalar operation, try to subdivide the target-independent categorization provided by KIND @@ -14729,6 +15030,203 @@ aarch64_adjust_stmt_cost (vect_cost_for_stmt kind, stmt_vec_info stmt_info, return stmt_cost; } +/* VINFO, COSTS, COUNT, KIND, STMT_INFO and VECTYPE are the same as for + TARGET_VECTORIZE_ADD_STMT_COST and they describe an operation in the + body of a vector loop. Record issue information relating to the vector + operation in OPS, where OPS is one of COSTS->scalar_ops, COSTS->advsimd_ops + or COSTS->sve_ops; see the comments above those variables for details. + In addition: + + - VEC_FLAGS is zero if OPS is COSTS->scalar_ops. + + - VEC_FLAGS & VEC_ADVSIMD is nonzero if OPS is COSTS->advsimd_ops. + + - VEC_FLAGS & VEC_ANY_SVE is nonzero if OPS is COSTS->sve_ops. + + ISSUE_INFO provides the scalar, Advanced SIMD or SVE issue information + associated with OPS and VEC_FLAGS. FACTOR says how many iterations of + the loop described by VEC_FLAGS would be needed to match one iteration + of the vector loop in VINFO. 
*/ +static void +aarch64_count_ops (class vec_info *vinfo, aarch64_vector_costs *costs, + unsigned int count, enum vect_cost_for_stmt kind, + _stmt_vec_info *stmt_info, tree vectype, + unsigned int vec_flags, aarch64_vec_op_count *ops, + const aarch64_base_vec_issue_info *issue_info, + unsigned int factor) +{ + if (!issue_info) + return; + + const aarch64_simd_vec_issue_info *simd_issue = nullptr; + if (vec_flags) + simd_issue = static_cast (issue_info); + + const aarch64_sve_vec_issue_info *sve_issue = nullptr; + if (vec_flags & VEC_ANY_SVE) + sve_issue = static_cast (issue_info); + + /* Calculate the minimum cycles per iteration imposed by a reduction + operation. */ + if ((kind == vector_stmt || kind == vec_to_scalar) + && aarch64_is_reduction (stmt_info)) + { + unsigned int base + = aarch64_in_loop_reduction_latency (vinfo, stmt_info, vectype, + vec_flags); + if (aarch64_reduc_type (vinfo, stmt_info) == FOLD_LEFT_REDUCTION) + { + if (aarch64_sve_mode_p (TYPE_MODE (vectype))) + { + /* When costing an SVE FADDA, the vectorizer treats vec_to_scalar + as a single operation, whereas for Advanced SIMD it is a + per-element one. Increase the factor accordingly, both for + the reduction_latency calculation and for the op couting. */ + if (vec_flags & VEC_ADVSIMD) + factor = vect_nunits_for_cost (vectype); + } + else + /* An Advanced SIMD fold-left reduction is the same as a + scalar one and the vectorizer therefore treats vec_to_scalar + as a per-element cost. There is no extra factor to apply for + scalar code, either for reduction_latency or for the op + counting below. */ + factor = 1; + } + + /* ??? Ideally for vector code we'd do COUNT * FACTOR reductions in + parallel, but unfortunately that's not yet the case. */ + ops->reduction_latency = MAX (ops->reduction_latency, + base * count * factor); + } + + /* Assume that multiply-adds will become a single operation. */ + if (stmt_info && aarch64_multiply_add_p (vinfo, stmt_info)) + return; + + /* When costing scalar statements in vector code, the count already + includes the number of scalar elements in the vector, so we don't + need to apply the factor as well. */ + if (kind == scalar_load || kind == scalar_store || kind == scalar_stmt) + factor = 1; + + /* This can go negative with the load handling below. */ + int num_copies = count * factor; + + /* Count the basic operation cost associated with KIND. */ + switch (kind) + { + case cond_branch_taken: + case cond_branch_not_taken: + case vector_gather_load: + case vector_scatter_store: + /* We currently don't expect these to be used in a loop body. */ + break; + + case vec_perm: + case vec_promote_demote: + case vec_construct: + case vec_to_scalar: + case scalar_to_vec: + /* Assume that these operations have no overhead in the original + scalar code. */ + if (!vec_flags) + break; + /* Fallthrough. */ + case vector_stmt: + case scalar_stmt: + ops->general_ops += num_copies; + break; + + case scalar_load: + case vector_load: + case unaligned_load: + /* When costing scalars, detect cases in which we are called twice for + the same load. This happens for LD[234] operations if only some of + the results are used. The first time represents the cost of loading + the unused vectors, while the second time represents the cost of + loading the useful parts. Only the latter should count towards the + scalar costs. 
*/ + if (stmt_info && !vec_flags) + { + bool existed = false; + unsigned int &prev_count + = costs->seen_loads.get_or_insert (stmt_info, &existed); + if (existed) + num_copies -= prev_count; + else + prev_count = num_copies; + } + ops->loads += num_copies; + if (vec_flags || FLOAT_TYPE_P (vectype)) + ops->general_ops += issue_info->fp_simd_load_general_ops * num_copies; + break; + + case vector_store: + case unaligned_store: + case scalar_store: + ops->stores += num_copies; + if (vec_flags || FLOAT_TYPE_P (vectype)) + ops->general_ops += issue_info->fp_simd_store_general_ops * num_copies; + break; + } + + /* Add any embedded comparison operations. */ + if ((kind == scalar_stmt || kind == vector_stmt || kind == vec_to_scalar) + && aarch64_embedded_comparison_type (stmt_info)) + ops->general_ops += num_copies; + + /* Detect COND_REDUCTIONs and things that would need to become + COND_REDUCTIONs if they were implemented using Advanced SIMD. + There are then two sets of VEC_COND_EXPRs, whereas so far we + have only accounted for one. */ + if (vec_flags && (kind == vector_stmt || kind == vec_to_scalar)) + { + int reduc_type = aarch64_reduc_type (vinfo, stmt_info); + if ((reduc_type == EXTRACT_LAST_REDUCTION && (vec_flags & VEC_ADVSIMD)) + || reduc_type == COND_REDUCTION) + ops->general_ops += num_copies; + } + + /* Count the predicate operations needed by an SVE comparison. */ + if (sve_issue && (kind == vector_stmt || kind == vec_to_scalar)) + if (tree type = aarch64_comparison_type (stmt_info)) + { + unsigned int base = (FLOAT_TYPE_P (type) + ? sve_issue->fp_cmp_pred_ops + : sve_issue->int_cmp_pred_ops); + costs->sve_ops.pred_ops += base * num_copies; + } + + /* Add any extra overhead associated with LD[234] and ST[234] operations. */ + if (simd_issue) + switch (aarch64_ld234_st234_vectors (kind, stmt_info)) + { + case 2: + ops->general_ops += simd_issue->ld2_st2_general_ops * num_copies; + break; + + case 3: + ops->general_ops += simd_issue->ld3_st3_general_ops * num_copies; + break; + + case 4: + ops->general_ops += simd_issue->ld4_st4_general_ops * num_copies; + break; + } + + /* Add any overhead associated with gather loads and scatter stores. */ + if (sve_issue + && (kind == scalar_load || kind == scalar_store) + && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER) + { + unsigned int pairs = CEIL (count, 2); + costs->sve_ops.pred_ops + += sve_issue->gather_scatter_pair_pred_ops * pairs; + ops->general_ops += sve_issue->gather_scatter_pair_general_ops * pairs; + } +} + /* Implement targetm.vectorize.add_stmt_cost. */ static unsigned aarch64_add_stmt_cost (class vec_info *vinfo, void *data, int count, @@ -14760,6 +15258,9 @@ aarch64_add_stmt_cost (class vec_info *vinfo, void *data, int count, of just looking at KIND. */ if (stmt_info && aarch64_use_new_vector_costs_p ()) { + if (vectype && aarch64_sve_only_stmt_p (stmt_info, vectype)) + costs->saw_sve_only_op = true; + stmt_cost = aarch64_detect_scalar_stmt_subtype (vinfo, kind, stmt_info, stmt_cost); @@ -14781,6 +15282,44 @@ aarch64_add_stmt_cost (class vec_info *vinfo, void *data, int count, stmt_cost = aarch64_adjust_stmt_cost (kind, stmt_info, vectype, stmt_cost); + /* If we're recording a nonzero vector loop body cost, also estimate + the operations that would need to be issued by all relevant + implementations of the loop. 
*/ + auto *issue_info = aarch64_tune_params.vec_costs->issue_info; + if (loop_vinfo + && issue_info + && costs->vec_flags + && where == vect_body + && vectype + && stmt_cost != 0) + { + /* Record estimates for the scalar code. */ + aarch64_count_ops (vinfo, costs, count, kind, stmt_info, vectype, + 0, &costs->scalar_ops, issue_info->scalar, + vect_nunits_for_cost (vectype)); + + if (aarch64_sve_mode_p (vinfo->vector_mode) && issue_info->sve) + { + /* Record estimates for a possible Advanced SIMD version + of the SVE code. */ + aarch64_count_ops (vinfo, costs, count, kind, stmt_info, + vectype, VEC_ADVSIMD, &costs->advsimd_ops, + issue_info->advsimd, + aarch64_estimated_sve_vq ()); + + /* Record estimates for the SVE code itself. */ + aarch64_count_ops (vinfo, costs, count, kind, stmt_info, + vectype, VEC_ANY_SVE, &costs->sve_ops, + issue_info->sve, 1); + } + else + /* Record estimates for the Advanced SIMD code. Treat SVE like + Advanced SIMD if the CPU has no specific SVE costs. */ + aarch64_count_ops (vinfo, costs, count, kind, stmt_info, + vectype, VEC_ADVSIMD, &costs->advsimd_ops, + issue_info->advsimd, 1); + } + /* If we're applying the SVE vs. Advanced SIMD unrolling heuristic, estimate the number of statements in the unrolled Advanced SIMD loop. For simplicitly, we assume that one iteration of the @@ -14805,12 +15344,56 @@ aarch64_add_stmt_cost (class vec_info *vinfo, void *data, int count, return retval; } +/* Dump information about the structure. */ +void +aarch64_vec_op_count::dump () const +{ + dump_printf_loc (MSG_NOTE, vect_location, + " load operations = %d\n", loads); + dump_printf_loc (MSG_NOTE, vect_location, + " store operations = %d\n", stores); + dump_printf_loc (MSG_NOTE, vect_location, + " general operations = %d\n", general_ops); + dump_printf_loc (MSG_NOTE, vect_location, + " reduction latency = %d\n", reduction_latency); +} + +/* Dump information about the structure. */ +void +aarch64_sve_op_count::dump () const +{ + aarch64_vec_op_count::dump (); + dump_printf_loc (MSG_NOTE, vect_location, + " predicate operations = %d\n", pred_ops); +} + +/* Use ISSUE_INFO to estimate the minimum number of cycles needed to issue + the operations described by OPS. This is a very simplistic model! */ +static unsigned int +aarch64_estimate_min_cycles_per_iter + (const aarch64_vec_op_count *ops, + const aarch64_base_vec_issue_info *issue_info) +{ + unsigned int cycles = MAX (ops->reduction_latency, 1); + cycles = MAX (cycles, CEIL (ops->stores, issue_info->stores_per_cycle)); + cycles = MAX (cycles, CEIL (ops->loads + ops->stores, + issue_info->loads_stores_per_cycle)); + cycles = MAX (cycles, CEIL (ops->general_ops, + issue_info->general_ops_per_cycle)); + return cycles; +} + /* BODY_COST is the cost of a vector loop body recorded in COSTS. Adjust the cost as necessary and return the new cost. 
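    As a rough illustration of the issue model used below: with the
    Neoverse V1 scalar issue info defined above (3 loads/stores, 2 stores
    and 4 general ops per cycle), an iteration containing, say, 4 loads,
    1 store, 6 general ops and no reduction would be estimated by
    aarch64_estimate_min_cycles_per_iter at
    MAX (CEIL (1, 2), CEIL (5, 3), CEIL (6, 4)) = 2 cycles.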
*/ static unsigned int aarch64_adjust_body_cost (aarch64_vector_costs *costs, unsigned int body_cost) { unsigned int orig_body_cost = body_cost; + bool should_disparage = false; + + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "Original vector body cost = %d\n", body_cost); if (costs->unrolled_advsimd_stmts) { @@ -14841,10 +15424,203 @@ aarch64_adjust_body_cost (aarch64_vector_costs *costs, unsigned int body_cost) "Increasing body cost to %d to account for" " unrolling\n", min_cost); body_cost = min_cost; + should_disparage = true; } } } + auto *issue_info = aarch64_tune_params.vec_costs->issue_info; + if (!issue_info) + return body_cost; + + unsigned int scalar_cycles_per_iter + = aarch64_estimate_min_cycles_per_iter (&costs->scalar_ops, + issue_info->scalar); + unsigned int advsimd_cycles_per_iter + = aarch64_estimate_min_cycles_per_iter (&costs->advsimd_ops, + issue_info->advsimd); + bool could_use_advsimd + = ((costs->vec_flags & VEC_ADVSIMD) + || (aarch64_autovec_preference != 2 + && (aarch64_tune_params.extra_tuning_flags + & AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT) + && !costs->saw_sve_only_op)); + + if (dump_enabled_p ()) + { + if (IN_RANGE (costs->num_vector_iterations, 0, 65536)) + dump_printf_loc (MSG_NOTE, vect_location, + "Vector loop iterates at most %wd times\n", + costs->num_vector_iterations); + dump_printf_loc (MSG_NOTE, vect_location, "Scalar issue estimate:\n"); + costs->scalar_ops.dump (); + dump_printf_loc (MSG_NOTE, vect_location, + " estimated cycles per iteration = %d\n", + scalar_cycles_per_iter); + if (could_use_advsimd) + { + dump_printf_loc (MSG_NOTE, vect_location, + "Advanced SIMD issue estimate:\n"); + costs->advsimd_ops.dump (); + dump_printf_loc (MSG_NOTE, vect_location, + " estimated cycles per iteration = %d\n", + advsimd_cycles_per_iter); + } + else + dump_printf_loc (MSG_NOTE, vect_location, + "Loop could not use Advanced SIMD\n"); + } + + uint64_t vector_cycles_per_iter = advsimd_cycles_per_iter; + unsigned int vector_reduction_latency = costs->advsimd_ops.reduction_latency; + if ((costs->vec_flags & VEC_ANY_SVE) && issue_info->sve) + { + /* Estimate the minimum number of cycles per iteration needed to issue + non-predicate operations. */ + unsigned int sve_cycles_per_iter + = aarch64_estimate_min_cycles_per_iter (&costs->sve_ops, + issue_info->sve); + + /* Separately estimate the minimum number of cycles per iteration needed + to issue the predicate operations. */ + unsigned int pred_cycles_per_iter + = CEIL (costs->sve_ops.pred_ops, issue_info->sve->pred_ops_per_cycle); + + if (dump_enabled_p ()) + { + dump_printf_loc (MSG_NOTE, vect_location, "SVE issue estimate:\n"); + costs->sve_ops.dump (); + dump_printf_loc (MSG_NOTE, vect_location, + " estimated cycles per iteration for non-predicate" + " operations = %d\n", sve_cycles_per_iter); + if (costs->sve_ops.pred_ops) + dump_printf_loc (MSG_NOTE, vect_location, " estimated cycles per" + " iteration for predicate operations = %d\n", + pred_cycles_per_iter); + } + + vector_cycles_per_iter = MAX (sve_cycles_per_iter, pred_cycles_per_iter); + vector_reduction_latency = costs->sve_ops.reduction_latency; + + /* If the scalar version of the loop could issue at least as + quickly as the predicate parts of the SVE loop, make the SVE loop + prohibitively expensive. In this case vectorization is adding an + overhead that the original scalar code didn't have. 
+ + This is mostly intended to detect cases in which WHILELOs dominate + for very tight loops, which is something that normal latency-based + costs would not model. Adding this kind of cliffedge would be + too drastic for scalar_cycles_per_iter vs. sve_cycles_per_iter; + code later in the function handles that case in a more + conservative way. */ + uint64_t sve_estimate = pred_cycles_per_iter + 1; + if (scalar_cycles_per_iter < sve_estimate) + { + unsigned int min_cost + = orig_body_cost * estimated_poly_value (BYTES_PER_SVE_VECTOR); + if (body_cost < min_cost) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "Increasing body cost to %d because the" + " scalar code could issue within the limit" + " imposed by predicate operations\n", + min_cost); + body_cost = min_cost; + should_disparage = true; + } + } + + /* If it appears that the Advanced SIMD version of a loop could issue + more quickly than the SVE one, increase the SVE cost in proportion + to the difference. The intention is to make Advanced SIMD preferable + in cases where an Advanced SIMD version exists, without increasing + the costs so much that SVE won't be used at all. + + The reasoning is similar to the scalar vs. predicate comparison above: + if the issue rate of the SVE code is limited by predicate operations + (i.e. if pred_cycles_per_iter > sve_cycles_per_iter), and if the + Advanced SIMD code could issue within the limit imposed by the + predicate operations, the predicate operations are adding an + overhead that the original code didn't have and so we should prefer + the Advanced SIMD version. However, if the predicate operations + do not dominate in this way, we should only increase the cost of + the SVE code if sve_cycles_per_iter is strictly greater than + advsimd_cycles_per_iter. Given rounding effects, this should mean + that Advanced SIMD is either better or at least no worse. */ + if (sve_cycles_per_iter >= pred_cycles_per_iter) + sve_estimate = sve_cycles_per_iter; + if (could_use_advsimd && advsimd_cycles_per_iter < sve_estimate) + { + /* This ensures that min_cost > orig_body_cost * 2. */ + unsigned int min_cost + = orig_body_cost * CEIL (sve_estimate, advsimd_cycles_per_iter) + 1; + if (body_cost < min_cost) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "Increasing body cost to %d because Advanced" + " SIMD code could issue as quickly\n", + min_cost); + body_cost = min_cost; + should_disparage = true; + } + } + } + + /* Decide whether to stick to latency-based costs or whether to try to + take issue rates into account. */ + unsigned int threshold = aarch64_loop_vect_issue_rate_niters; + if (costs->vec_flags & VEC_ANY_SVE) + threshold = CEIL (threshold, aarch64_estimated_sve_vq ()); + + if (costs->num_vector_iterations >= 1 + && costs->num_vector_iterations < threshold) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "Low iteration count, so using pure latency" + " costs\n"); + } + /* Increase the cost of the vector code if it looks like the scalar code + could issue more quickly. These values are only rough estimates, + so minor differences should only result in minor changes. 
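+     For example, if the scalar code is estimated to issue an iteration
+     in 2 cycles while the vector code needs 3, the vector body cost is
+     scaled by 3/2, rounding up.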
*/ + else if (scalar_cycles_per_iter < vector_cycles_per_iter) + { + body_cost = CEIL (body_cost * vector_cycles_per_iter, + scalar_cycles_per_iter); + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "Increasing body cost to %d because scalar code" + " would issue more quickly\n", body_cost); + } + /* In general, it's expected that the proposed vector code would be able + to issue more quickly than the original scalar code. This should + already be reflected to some extent in the latency-based costs. + + However, the latency-based costs effectively assume that the scalar + code and the vector code execute serially, which tends to underplay + one important case: if the real (non-serialized) execution time of + a scalar iteration is dominated by loop-carried dependencies, + and if the vector code is able to reduce both the length of + the loop-carried dependencies *and* the number of cycles needed + to issue the code in general, we can be more confident that the + vector code is an improvement, even if adding the other (non-loop-carried) + latencies tends to hide this saving. We therefore reduce the cost of the + vector loop body in proportion to the saving. */ + else if (costs->scalar_ops.reduction_latency > vector_reduction_latency + && costs->scalar_ops.reduction_latency == scalar_cycles_per_iter + && scalar_cycles_per_iter > vector_cycles_per_iter + && !should_disparage) + { + body_cost = CEIL (body_cost * vector_cycles_per_iter, + scalar_cycles_per_iter); + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "Decreasing body cost to %d account for smaller" + " reduction latency\n", body_cost); + } + return body_cost; } diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt index 91e5c6151a7..32191cf1acf 100644 --- a/gcc/config/aarch64/aarch64.opt +++ b/gcc/config/aarch64/aarch64.opt @@ -277,3 +277,6 @@ The number of Newton iterations for calculating the reciprocal for double type. -param=aarch64-autovec-preference= Target Joined UInteger Var(aarch64_autovec_preference) Init(0) IntegerRange(0, 4) Param + +-param=aarch64-loop-vect-issue-rate-niters= +Target Joined UInteger Var(aarch64_loop_vect_issue_rate_niters) Init(6) IntegerRange(0, 65536) Param diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index ca204c7c3d8..7f1231303bf 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -14417,6 +14417,14 @@ Use both Advanced SIMD and SVE. Prefer SVE when the costs are deemed equal. @end table The default value is 0. +@item aarch64-loop-vect-issue-rate-niters +The tuning for some AArch64 CPUs tries to take both latencies and issue +rates into account when deciding whether a loop should be vectorized +using SVE, vectorized using Advanced SIMD, or not vectorized at all. +If this parameter is set to @var{n}, GCC will not use this heuristic +for loops that are known to execute in fewer than @var{n} Advanced +SIMD iterations. + @end table @end table