RISC-V: Apply VLA vs. VLS mode heuristic to the vector cost model

This patch applies the VLA vs. VLS mode heuristic, which fixes the following FAILs:
FAIL: gcc.target/riscv/rvv/autovec/pr111751.c -O3 -ftree-vectorize
scan-assembler-not vset
FAIL: gcc.target/riscv/rvv/autovec/pr111751.c -O3 -ftree-vectorize
scan-assembler-times li\\s+[a-x0-9]+,0\\s+ret 2

The root cause of these FAILs is that we failed to pick a VLS mode for the vectorization.

Before this patch:

foo2:
        addi    sp,sp,-208
        addi    a2,sp,64
        addi    a5,sp,128
        lui     a6,%hi(.LANCHOR0)
        sd      ra,200(sp)
        addi    a6,a6,%lo(.LANCHOR0)
        mv      a0,a2
        mv      a1,a5
        li      a3,16
        mv      a4,sp
        vsetivli        zero,8,e64,m8,ta,ma
        vle64.v v8,0(a6)
        vse64.v v8,0(a2)
        vse64.v v8,0(a5)
.L4:
        vsetvli a5,a3,e32,m1,ta,ma
        slli    a2,a5,2
        vle32.v v2,0(a1)
        vle32.v v1,0(a0)
        sub     a3,a3,a5
        vadd.vv v1,v1,v2
        vse32.v v1,0(a4)
        add     a1,a1,a2
        add     a0,a0,a2
        add     a4,a4,a2
        bne     a3,zero,.L4
        lw      a4,128(sp)
        lw      a5,64(sp)
        addw    a5,a5,a4
        lw      a4,0(sp)
        bne     a4,a5,.L5
        lw      a4,132(sp)
        lw      a5,68(sp)
        addw    a5,a5,a4
        lw      a4,4(sp)
        bne     a4,a5,.L5
        lw      a4,136(sp)
        lw      a5,72(sp)
        addw    a5,a5,a4
        lw      a4,8(sp)
        bne     a4,a5,.L5
        lw      a4,140(sp)
        lw      a5,76(sp)
        addw    a5,a5,a4
        lw      a4,12(sp)
        bne     a4,a5,.L5
        lw      a4,144(sp)
        lw      a5,80(sp)
        addw    a5,a5,a4
        lw      a4,16(sp)
        bne     a4,a5,.L5
        lw      a4,148(sp)
        lw      a5,84(sp)
        addw    a5,a5,a4
        lw      a4,20(sp)
        bne     a4,a5,.L5
        lw      a4,152(sp)
        lw      a5,88(sp)
        addw    a5,a5,a4
        lw      a4,24(sp)
        bne     a4,a5,.L5
        lw      a4,156(sp)
        lw      a5,92(sp)
        addw    a5,a5,a4
        lw      a4,28(sp)
        bne     a4,a5,.L5
        lw      a4,160(sp)
        lw      a5,96(sp)
        addw    a5,a5,a4
        lw      a4,32(sp)
        bne     a4,a5,.L5
        lw      a4,164(sp)
        lw      a5,100(sp)
        addw    a5,a5,a4
        lw      a4,36(sp)
        bne     a4,a5,.L5
        lw      a4,168(sp)
        lw      a5,104(sp)
        addw    a5,a5,a4
        lw      a4,40(sp)
        bne     a4,a5,.L5
        lw      a4,172(sp)
        lw      a5,108(sp)
        addw    a5,a5,a4
        lw      a4,44(sp)
        bne     a4,a5,.L5
        lw      a4,176(sp)
        lw      a5,112(sp)
        addw    a5,a5,a4
        lw      a4,48(sp)
        bne     a4,a5,.L5
        lw      a4,180(sp)
        lw      a5,116(sp)
        addw    a5,a5,a4
        lw      a4,52(sp)
        bne     a4,a5,.L5
        lw      a4,184(sp)
        lw      a5,120(sp)
        addw    a5,a5,a4
        lw      a4,56(sp)
        bne     a4,a5,.L5
        lw      a4,188(sp)
        lw      a5,124(sp)
        addw    a5,a5,a4
        lw      a4,60(sp)
        bne     a4,a5,.L5
        ld      ra,200(sp)
        li      a0,0
        addi    sp,sp,208
        jr      ra
.L5:
        call    abort

After this patch:

        li      a0,0
        ret
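
For context, the testcase behind this assembly has roughly the following shape (a hedged reconstruction inferred from the code above, not the literal pr111751.c source; the names and the use of global arrays are illustrative):

int a[16], b[16], c[16];   /* a and b hold the same constant data.  */

int
foo2 (void)
{
  for (int i = 0; i < 16; i++)
    c[i] = a[i] + b[i];

  for (int i = 0; i < 16; i++)
    if (c[i] != a[i] + b[i])
      __builtin_abort ();

  return 0;
}

Once a VLS mode is chosen, the 16-iteration loop is fully unrolled, the checks fold away at compile time, and the whole function reduces to the "li a0,0; ret" sequence that the scan-assembler patterns expect.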

The heuristic is borrowed from the ARM SVE cost model. It has been fully tested, and we confirmed that the behavior matches both ARM SVE GCC and RVV Clang.
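
In rough terms, the heuristic prefers a VLS mode whenever the loop's trip count is a compile-time constant and the resulting fixed-length loop would fall within GCC's complete-unrolling limits, so it can become straight-line code. The following is a standalone sketch of that decision, not the GCC implementation; the parameters stand in for param_max_completely_peel_times, param_max_completely_peeled_insns and the per-iteration statement estimate accumulated in add_stmt_cost:

#include <stdbool.h>

/* Prefer the VLS loop over the VLA loop when the trip count is known
   and the unrolled VLS loop stays within the peeling limits.  */
static bool
prefer_vls_p (long known_niters, unsigned vls_vf, unsigned stmts_per_iter,
              unsigned max_peel_times, unsigned max_peeled_insns)
{
  if (known_niters < 0)
    return false;                    /* Trip count unknown: keep VLA.  */

  unsigned long unrolled_iters = (unsigned long) known_niters / vls_vf;
  unsigned long unrolled_stmts = unrolled_iters * stmts_per_iter;

  return unrolled_iters <= max_peel_times
         && unrolled_stmts <= max_peeled_insns;
}

For the 16-iteration loop above, a 128-bit VLS mode gives a VF of 4 int elements, so the unrolled VLS loop would contain only four iterations' worth of statements, comfortably within the default limits; the VLS candidate therefore wins and the vectorized loop is then completely unrolled.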

gcc/ChangeLog:

	* config/riscv/riscv-vector-costs.cc (costs::analyze_loop_vinfo): New function.
	(costs::record_potential_vls_unrolling): Ditto.
	(costs::prefer_unrolled_loop): Ditto.
	(costs::better_main_loop_than_p): Ditto.
	(costs::add_stmt_cost): Ditto.
	* config/riscv/riscv-vector-costs.h (enum cost_type_enum): New enum.
	* config/riscv/t-riscv: Add new include files.

gcc/testsuite/ChangeLog:

	* gcc.target/riscv/rvv/autovec/pr111313.c: Adapt test.
	* gcc.target/riscv/rvv/autovec/vls/shift-3.c: Ditto.
	* gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-1.c: New test.
	* gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-10.c: New test.
	* gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-11.c: New test.
	* gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-12.c: New test.
	* gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-2.c: New test.
	* gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-3.c: New test.
	* gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-4.c: New test.
	* gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-5.c: New test.
	* gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-6.c: New test.
	* gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-7.c: New test.
	* gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-8.c: New test.
	* gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-9.c: New test.
Author: Juzhe-Zhong, 2023-12-12 22:25:52 +08:00 (committed by Pan Li)
Commit: 8501edba91, parent: 26250632df
17 changed files with 408 additions and 4 deletions

gcc/config/riscv/riscv-vector-costs.cc

@@ -41,6 +41,7 @@ along with GCC; see the file COPYING3. If not see
#include "ssa.h"
#include "backend.h"
#include "tree-data-ref.h"
#include "tree-ssa-loop-niter.h"
/* This file should be included last. */
#include "riscv-vector-costs.h"
@@ -601,7 +602,101 @@ preferred_new_lmul_p (loop_vec_info other_loop_vinfo)
costs::costs (vec_info *vinfo, bool costing_for_scalar)
  : vector_costs (vinfo, costing_for_scalar)
{}
{
  if (costing_for_scalar)
    m_cost_type = SCALAR_COST;
  else if (riscv_v_ext_vector_mode_p (vinfo->vector_mode))
    m_cost_type = VLA_VECTOR_COST;
  else
    m_cost_type = VLS_VECTOR_COST;
}

/* Do one-time initialization of the costs given that we're
   costing the loop vectorization described by LOOP_VINFO. */
void
costs::analyze_loop_vinfo (loop_vec_info loop_vinfo)
{
  /* Record the number of times that the vector loop would execute,
     if known. */
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  auto scalar_niters = max_stmt_executions_int (loop);
  if (scalar_niters >= 0)
    {
      unsigned int vf = vect_vf_for_cost (loop_vinfo);
      if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
        m_num_vector_iterations = scalar_niters / vf;
      else
        m_num_vector_iterations = CEIL (scalar_niters, vf);
    }

  /* Detect whether we're vectorizing for VLA and should apply the unrolling
     heuristic described above m_unrolled_vls_niters. */
  record_potential_vls_unrolling (loop_vinfo);
}

/* Decide whether to use the unrolling heuristic described above
   m_unrolled_vls_niters, updating that field if so. LOOP_VINFO
   describes the loop that we're vectorizing. */
void
costs::record_potential_vls_unrolling (loop_vec_info loop_vinfo)
{
  /* We only want to apply the heuristic if LOOP_VINFO is being
     vectorized for VLA. */
  if (m_cost_type != VLA_VECTOR_COST)
    return;

  /* We don't want to apply the heuristic to outer loops, since it's
     harder to track two levels of unrolling. */
  if (LOOP_VINFO_LOOP (loop_vinfo)->inner)
    return;

  /* Only handle cases in which the number of VLS iterations
     would be known at compile time but the number of SVE iterations
     would not. */
  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      || BYTES_PER_RISCV_VECTOR.is_constant ())
    return;

  /* Guess how many times the VLS loop would iterate and make
     sure that it is within the complete unrolling limit. Even if the
     number of iterations is small enough, the number of statements might
     not be, which is why we need to estimate the number of statements too. */
  unsigned int vls_vf = vect_vf_for_cost (loop_vinfo);
  unsigned HOST_WIDE_INT unrolled_vls_niters
    = LOOP_VINFO_INT_NITERS (loop_vinfo) / vls_vf;
  if (unrolled_vls_niters > (unsigned int) param_max_completely_peel_times)
    return;

  /* Record that we're applying the heuristic and should try to estimate
     the number of statements in the VLS loop. */
  m_unrolled_vls_niters = unrolled_vls_niters;
}

/* Return true if (a) we're applying the VLS vs. VLA unrolling
   heuristic described above m_unrolled_vls_niters and (b) the heuristic
   says that we should prefer the VLS loop. */
bool
costs::prefer_unrolled_loop () const
{
  if (!m_unrolled_vls_stmts)
    return false;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "Number of insns in"
                     " unrolled VLS loop = " HOST_WIDE_INT_PRINT_UNSIGNED "\n",
                     m_unrolled_vls_stmts);

  /* The balance here is tricky. On the one hand, we can't be sure whether
     the code is vectorizable with VLS or not. However, even if
     it isn't vectorizable with VLS, there's a possibility that
     the scalar code could also be unrolled. Some of the code might then
     benefit from SLP, or from using LDP and STP. We therefore apply
     the heuristic regardless of can_use_vls_p. */
  return (m_unrolled_vls_stmts
          && (m_unrolled_vls_stmts
              <= (unsigned int) param_max_completely_peeled_insns));
}
bool
costs::better_main_loop_than_p (const vector_costs *uncast_other) const
@@ -618,6 +713,21 @@ costs::better_main_loop_than_p (const vector_costs *uncast_other) const
                     GET_MODE_NAME (other_loop_vinfo->vector_mode),
                     vect_vf_for_cost (other_loop_vinfo));

  /* Apply the unrolling heuristic described above m_unrolled_vls_niters. */
  if (bool (m_unrolled_vls_stmts) != bool (other->m_unrolled_vls_stmts))
    {
      bool this_prefer_unrolled = this->prefer_unrolled_loop ();
      bool other_prefer_unrolled = other->prefer_unrolled_loop ();
      if (this_prefer_unrolled != other_prefer_unrolled)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "Preferring VLS loop because"
                             " it can be unrolled\n");
          return other_prefer_unrolled;
        }
    }

  if (!LOOP_VINFO_NITERS_KNOWN_P (this_loop_vinfo)
      && riscv_autovec_lmul == RVV_DYNAMIC)
    {
@@ -643,6 +753,28 @@ costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
  /* TODO: Use default STMT cost model.
          We will support more accurate STMT cost model later. */
  int stmt_cost = default_builtin_vectorization_cost (kind, vectype, misalign);

  /* Do one-time initialization based on the vinfo. */
  loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo);
  if (!m_analyzed_vinfo)
    {
      if (loop_vinfo)
        analyze_loop_vinfo (loop_vinfo);
      m_analyzed_vinfo = true;
    }

  if (stmt_info)
    {
      /* If we're applying the VLA vs. VLS unrolling heuristic,
         estimate the number of statements in the unrolled VLS
         loop. For simplicitly, we assume that one iteration of the
         VLS loop would need the same number of statements
         as one iteration of the VLA loop. */
      if (where == vect_body && m_unrolled_vls_niters)
        m_unrolled_vls_stmts += count * m_unrolled_vls_niters;
    }

  return record_stmt_cost (stmt_info, where, count * stmt_cost);
}

gcc/config/riscv/riscv-vector-costs.h

@@ -30,6 +30,13 @@ struct stmt_point
  gimple *stmt;
};

enum cost_type_enum
{
  SCALAR_COST,
  VLA_VECTOR_COST,
  VLS_VECTOR_COST
};

/* Pair typedef used by live range: <start, end>. */
typedef std::pair<unsigned int, unsigned int> pair;
@@ -49,6 +56,42 @@
                              tree vectype, int misalign,
                              vect_cost_model_location where) override;
  void finish_cost (const vector_costs *) override;

  /* True if we have performed one-time initialization based on the
     vec_info. */
  bool m_analyzed_vinfo = false;

  /* - If M_COST_TYPE = SCALAR_COST then we're costing the original scalar code.
     - If M_COST_TYPE = VLA_VECTOR_COST is nonzero then we're costing VLA
       partial vectorization codes.
     - If M_COST_TYPE = VLS_VECTOR_COST is nonzero then we're costing VLS
       minimum length vector codes. */
  enum cost_type_enum m_cost_type;

  /* On some CPUs, VLA and VLS provide the same theoretical vector
     throughput, such as 4x128 VLS vs. 2x256 VLA. In those
     situations, we try to predict whether an VLS implementation
     of the loop could be completely unrolled and become straight-line code.
     If so, it is generally better to use the VLS version rather
     than length-agnostic VLA, since the VLA loop would execute an unknown
     number of times and so could not be completely unrolled in the same way.

     If we're applying this heuristic, M_UNROLLED_VLS_NITERS is the
     number of VLS loop iterations that would be unrolled and
     M_UNROLLED_VLS_STMTS estimates the total number of statements
     in the unrolled loop. Both values are zero if we're not applying
     the heuristic. */
  unsigned HOST_WIDE_INT m_unrolled_vls_niters = 0;
  unsigned HOST_WIDE_INT m_unrolled_vls_stmts = 0;

  /* If we're vectorizing a loop that executes a constant number of times,
     this variable gives the number of times that the vector loop would
     iterate, otherwise it is zero. */
  uint64_t m_num_vector_iterations = 0;

  void analyze_loop_vinfo (loop_vec_info);
  void record_potential_vls_unrolling (loop_vec_info);
  bool prefer_unrolled_loop () const;
};
} // namespace riscv_vector

gcc/config/riscv/t-riscv

@@ -74,7 +74,7 @@ riscv-vector-costs.o: $(srcdir)/config/riscv/riscv-vector-costs.cc \
  $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) $(TARGET_H) $(FUNCTION_H) \
  $(TREE_H) basic-block.h $(RTL_H) gimple.h targhooks.h cfgloop.h \
  fold-const.h $(TM_P_H) tree-vectorizer.h gimple-iterator.h bitmap.h \
  ssa.h backend.h \
  ssa.h backend.h tree-data-ref.h tree-ssa-loop-niter.h \
  $(srcdir)/config/riscv/riscv-vector-costs.h
        $(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \
                $(srcdir)/config/riscv/riscv-vector-costs.cc

gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-1.c

@@ -0,0 +1,13 @@
/* { dg-do compile } */
/* { dg-options "-march=rv64gcv -mabi=lp64d -O3" } */

void
foo (int *__restrict a, int *__restrict b, int *__restrict c)
{
  for (int i = 0; i < 16; i++)
    a[i] = b[i] + c[i];
}

/* { dg-final { scan-assembler-times {vsetivli\s+zero,\s*4,\s*e32,\s*m1,\s*t[au],\s*m[au]} 1 } } */
/* { dg-final { scan-assembler-times {vsetivli} 1 } } */
/* { dg-final { scan-assembler-not {vsetvli} } } */

gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-10.c

@@ -0,0 +1,28 @@
/* { dg-do compile } */
/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 --param=riscv-autovec-lmul=m4 -fno-schedule-insns -fno-schedule-insns2" } */

#include <stdint-gcc.h>

#define N 40
int a[N];

__attribute__ ((noinline)) int
foo (){
  int i,j;
  int sum,x;
  for (i = 0; i < N; i++) {
    sum = 0;
    for (j = 0; j < N; j++) {
      sum += (i + j);
    }
    a[i] = sum;
  }
  return 0;
}

/* { dg-final { scan-assembler-times {vsetivli\s+zero,\s*8,\s*e32,\s*m2,\s*t[au],\s*m[au]} 1 } } */
/* { dg-final { scan-assembler-times {vsetivli\s+zero,\s*16,\s*e32,\s*m4,\s*t[au],\s*m[au]} 1 } } */
/* { dg-final { scan-assembler-times {vsetivli} 2 } } */
/* { dg-final { scan-assembler-not {vsetvli} } } */

gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-11.c

@@ -0,0 +1,28 @@
/* { dg-do compile } */
/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 --param=riscv-autovec-lmul=m8 -fno-schedule-insns -fno-schedule-insns2" } */

#include <stdint-gcc.h>

#define N 40
int a[N];

__attribute__ ((noinline)) int
foo (){
  int i,j;
  int sum,x;
  for (i = 0; i < N; i++) {
    sum = 0;
    for (j = 0; j < N; j++) {
      sum += (i + j);
    }
    a[i] = sum;
  }
  return 0;
}

/* { dg-final { scan-assembler-times {vsetivli\s+zero,\s*8,\s*e32,\s*m2,\s*t[au],\s*m[au]} 1 } } */
/* { dg-final { scan-assembler-times {vsetvli\s+zero,\s*[a-x0-9]+,\s*e32,\s*m8,\s*t[au],\s*m[au]} 1 } } */
/* { dg-final { scan-assembler-times {vsetivli} 1 } } */
/* { dg-final { scan-assembler-times {vsetvli} 1 } } */

gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-12.c

@@ -0,0 +1,28 @@
/* { dg-do compile } */
/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 --param=riscv-autovec-lmul=m8 -fno-schedule-insns -fno-schedule-insns2" } */

#include <stdint-gcc.h>

#define N 40
int a[N];

__attribute__ ((noinline)) int
foo (){
  int i,j;
  int sum,x;
  for (i = 0; i < N; i++) {
    sum = 0;
    for (j = 0; j < N; j++) {
      sum += (i + j);
    }
    a[i] = sum;
  }
  return 0;
}

/* { dg-final { scan-assembler-times {vsetivli\s+zero,\s*8,\s*e32,\s*m2,\s*t[au],\s*m[au]} 1 } } */
/* { dg-final { scan-assembler-times {vsetvli\s+zero,\s*[a-x0-9]+,\s*e32,\s*m8,\s*t[au],\s*m[au]} 1 } } */
/* { dg-final { scan-assembler-times {vsetivli} 1 } } */
/* { dg-final { scan-assembler-times {vsetvli} 1 } } */

gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-2.c

@@ -0,0 +1,13 @@
/* { dg-do compile } */
/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 --param=riscv-autovec-lmul=m2" } */

void
foo (int *__restrict a, int *__restrict b, int *__restrict c)
{
  for (int i = 0; i < 16; i++)
    a[i] = b[i] + c[i];
}

/* { dg-final { scan-assembler-times {vsetivli\s+zero,\s*8,\s*e32,\s*m2,\s*t[au],\s*m[au]} 1 } } */
/* { dg-final { scan-assembler-times {vsetivli} 1 } } */
/* { dg-final { scan-assembler-not {vsetvli} } } */

gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-3.c

@@ -0,0 +1,13 @@
/* { dg-do compile } */
/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 --param=riscv-autovec-lmul=m4" } */

void
foo (int *__restrict a, int *__restrict b, int *__restrict c)
{
  for (int i = 0; i < 16; i++)
    a[i] = b[i] + c[i];
}

/* { dg-final { scan-assembler-times {vsetivli\s+zero,\s*16,\s*e32,\s*m4,\s*t[au],\s*m[au]} 1 } } */
/* { dg-final { scan-assembler-times {vsetivli} 1 } } */
/* { dg-final { scan-assembler-not {vsetvli} } } */

gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-4.c

@@ -0,0 +1,13 @@
/* { dg-do compile } */
/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 --param=riscv-autovec-lmul=m8" } */

void
foo (int *__restrict a, int *__restrict b, int *__restrict c)
{
  for (int i = 0; i < 16; i++)
    a[i] = b[i] + c[i];
}

/* { dg-final { scan-assembler-times {vsetivli\s+zero,\s*16,\s*e32,\s*m4,\s*t[au],\s*m[au]} 1 } } */
/* { dg-final { scan-assembler-times {vsetivli} 1 } } */
/* { dg-final { scan-assembler-not {vsetvli} } } */

gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-5.c

@@ -0,0 +1,13 @@
/* { dg-do compile } */
/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 --param=riscv-autovec-lmul=dynamic" } */

void
foo (int *__restrict a, int *__restrict b, int *__restrict c)
{
  for (int i = 0; i < 16; i++)
    a[i] = b[i] + c[i];
}

/* { dg-final { scan-assembler-times {vsetivli\s+zero,\s*16,\s*e32,\s*m4,\s*t[au],\s*m[au]} 1 } } */
/* { dg-final { scan-assembler-times {vsetivli} 1 } } */
/* { dg-final { scan-assembler-not {vsetvli} } } */

gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-6.c

@@ -0,0 +1,13 @@
/* { dg-do compile } */
/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 --param=riscv-autovec-lmul=m8" } */

void
foo (int *__restrict a, int *__restrict b, int *__restrict c)
{
  for (int i = 0; i < 32; i++)
    a[i] = b[i] + c[i];
}

/* { dg-final { scan-assembler-times {vsetvli\s+zero,\s*[a-x0-9]+,\s*e32,\s*m8,\s*t[au],\s*m[au]} 1 } } */
/* { dg-final { scan-assembler-times {vsetvli} 1 } } */
/* { dg-final { scan-assembler-not {vsetivli} } } */

gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-7.c

@@ -0,0 +1,13 @@
/* { dg-do compile } */
/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 --param=riscv-autovec-lmul=dynamic" } */

void
foo (int *__restrict a, int *__restrict b, int *__restrict c)
{
  for (int i = 0; i < 32; i++)
    a[i] = b[i] + c[i];
}

/* { dg-final { scan-assembler-times {vsetvli\s+zero,\s*[a-x0-9]+,\s*e32,\s*m8,\s*t[au],\s*m[au]} 1 } } */
/* { dg-final { scan-assembler-times {vsetvli} 1 } } */
/* { dg-final { scan-assembler-not {vsetivli} } } */

gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-8.c

@@ -0,0 +1,27 @@
/* { dg-do compile } */
/* { dg-options "-march=rv64gcv -mabi=lp64d -O3" } */

#include <stdint-gcc.h>

#define N 40
int a[N];

__attribute__ ((noinline)) int
foo (){
  int i,j;
  int sum,x;
  for (i = 0; i < N; i++) {
    sum = 0;
    for (j = 0; j < N; j++) {
      sum += (i + j);
    }
    a[i] = sum;
  }
  return 0;
}

/* { dg-final { scan-assembler-times {vsetivli\s+zero,\s*4,\s*e32,\s*m1,\s*t[au],\s*m[au]} 1 } } */
/* { dg-final { scan-assembler-times {vsetivli} 1 } } */
/* { dg-final { scan-assembler-not {vsetvli} } } */

gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-9.c

@@ -0,0 +1,27 @@
/* { dg-do compile } */
/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 --param=riscv-autovec-lmul=m2" } */

#include <stdint-gcc.h>

#define N 40
int a[N];

__attribute__ ((noinline)) int
foo (){
  int i,j;
  int sum,x;
  for (i = 0; i < N; i++) {
    sum = 0;
    for (j = 0; j < N; j++) {
      sum += (i + j);
    }
    a[i] = sum;
  }
  return 0;
}

/* { dg-final { scan-assembler-times {vsetivli\s+zero,\s*8,\s*e32,\s*m2,\s*t[au],\s*m[au]} 1 } } */
/* { dg-final { scan-assembler-times {vsetivli} 1 } } */
/* { dg-final { scan-assembler-not {vsetvli} } } */

gcc/testsuite/gcc.target/riscv/rvv/autovec/pr111313.c

@@ -1,5 +1,5 @@
/* { dg-do compile } */
/* { dg-options "-march=rv64gcv -mabi=lp64d --param=riscv-autovec-preference=scalable -O3 -fno-schedule-insns -fno-schedule-insns2" } */
/* { dg-options "-march=rv64gcv -mabi=lp64d --param=riscv-autovec-preference=scalable -O3 -fno-schedule-insns -fno-schedule-insns2 -fno-vect-cost-model" } */
#define K 32
short in[2*K][K];

gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/shift-3.c

@@ -53,5 +53,5 @@ DEF_OP_VV (shift, 128, int64_t, <<)
DEF_OP_VV (shift, 256, int64_t, <<)
DEF_OP_VV (shift, 512, int64_t, <<)
/* { dg-final { scan-assembler-times {vsll\.vv\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 43 } } */
/* { dg-final { scan-assembler-times {vsll\.vv\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 46 } } */
/* { dg-final { scan-assembler-not {csrr} } } */