RISC-V: Apply VLA vs. VLS mode heuristic to the vector cost model

This patch applies the VLA vs. VLS mode heuristic, which fixes the following FAILs:
FAIL: gcc.target/riscv/rvv/autovec/pr111751.c -O3 -ftree-vectorize
scan-assembler-not vset
FAIL: gcc.target/riscv/rvv/autovec/pr111751.c -O3 -ftree-vectorize
scan-assembler-times li\\s+[a-x0-9]+,0\\s+ret 2

The root cause of these FAILs is that we failed to pick a VLS mode for the vectorization.

Before this patch:

foo2:
        addi    sp,sp,-208
        addi    a2,sp,64
        addi    a5,sp,128
        lui     a6,%hi(.LANCHOR0)
        sd      ra,200(sp)
        addi    a6,a6,%lo(.LANCHOR0)
        mv      a0,a2
        mv      a1,a5
        li      a3,16
        mv      a4,sp
        vsetivli        zero,8,e64,m8,ta,ma
        vle64.v v8,0(a6)
        vse64.v v8,0(a2)
        vse64.v v8,0(a5)
.L4:
        vsetvli a5,a3,e32,m1,ta,ma
        slli    a2,a5,2
        vle32.v v2,0(a1)
        vle32.v v1,0(a0)
        sub     a3,a3,a5
        vadd.vv v1,v1,v2
        vse32.v v1,0(a4)
        add     a1,a1,a2
        add     a0,a0,a2
        add     a4,a4,a2
        bne     a3,zero,.L4
        lw      a4,128(sp)
        lw      a5,64(sp)
        addw    a5,a5,a4
        lw      a4,0(sp)
        bne     a4,a5,.L5
        lw      a4,132(sp)
        lw      a5,68(sp)
        addw    a5,a5,a4
        lw      a4,4(sp)
        bne     a4,a5,.L5
        lw      a4,136(sp)
        lw      a5,72(sp)
        addw    a5,a5,a4
        lw      a4,8(sp)
        bne     a4,a5,.L5
        lw      a4,140(sp)
        lw      a5,76(sp)
        addw    a5,a5,a4
        lw      a4,12(sp)
        bne     a4,a5,.L5
        lw      a4,144(sp)
        lw      a5,80(sp)
        addw    a5,a5,a4
        lw      a4,16(sp)
        bne     a4,a5,.L5
        lw      a4,148(sp)
        lw      a5,84(sp)
        addw    a5,a5,a4
        lw      a4,20(sp)
        bne     a4,a5,.L5
        lw      a4,152(sp)
        lw      a5,88(sp)
        addw    a5,a5,a4
        lw      a4,24(sp)
        bne     a4,a5,.L5
        lw      a4,156(sp)
        lw      a5,92(sp)
        addw    a5,a5,a4
        lw      a4,28(sp)
        bne     a4,a5,.L5
        lw      a4,160(sp)
        lw      a5,96(sp)
        addw    a5,a5,a4
        lw      a4,32(sp)
        bne     a4,a5,.L5
        lw      a4,164(sp)
        lw      a5,100(sp)
        addw    a5,a5,a4
        lw      a4,36(sp)
        bne     a4,a5,.L5
        lw      a4,168(sp)
        lw      a5,104(sp)
        addw    a5,a5,a4
        lw      a4,40(sp)
        bne     a4,a5,.L5
        lw      a4,172(sp)
        lw      a5,108(sp)
        addw    a5,a5,a4
        lw      a4,44(sp)
        bne     a4,a5,.L5
        lw      a4,176(sp)
        lw      a5,112(sp)
        addw    a5,a5,a4
        lw      a4,48(sp)
        bne     a4,a5,.L5
        lw      a4,180(sp)
        lw      a5,116(sp)
        addw    a5,a5,a4
        lw      a4,52(sp)
        bne     a4,a5,.L5
        lw      a4,184(sp)
        lw      a5,120(sp)
        addw    a5,a5,a4
        lw      a4,56(sp)
        bne     a4,a5,.L5
        lw      a4,188(sp)
        lw      a5,124(sp)
        addw    a5,a5,a4
        lw      a4,60(sp)
        bne     a4,a5,.L5
        ld      ra,200(sp)
        li      a0,0
        addi    sp,sp,208
        jr      ra
.L5:
        call    abort

After this patch:

        li      a0,0
        ret
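
For context, the testcase behind this assembly has roughly the following shape (a hedged reconstruction inferred from the code above, not the literal pr111751.c source; the names and the use of global arrays are illustrative):

int a[16], b[16], c[16];   /* a and b hold the same constant data.  */

int
foo2 (void)
{
  for (int i = 0; i < 16; i++)
    c[i] = a[i] + b[i];

  for (int i = 0; i < 16; i++)
    if (c[i] != a[i] + b[i])
      __builtin_abort ();

  return 0;
}

Once a VLS mode is chosen, the 16-iteration loop is fully unrolled, the checks fold away at compile time, and the whole function reduces to the "li a0,0; ret" sequence that the scan-assembler patterns expect.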

The heuristic is borrowed from the ARM SVE cost model. It has been fully tested, and we confirmed that the behavior matches both ARM SVE GCC and RVV Clang.
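
In rough terms, the heuristic prefers a VLS mode whenever the loop's trip count is a compile-time constant and the resulting fixed-length loop would fall within GCC's complete-unrolling limits, so it can become straight-line code. The following is a standalone sketch of that decision, not the GCC implementation; the parameters stand in for param_max_completely_peel_times, param_max_completely_peeled_insns and the per-iteration statement estimate accumulated in add_stmt_cost:

#include <stdbool.h>

/* Prefer the VLS loop over the VLA loop when the trip count is known
   and the unrolled VLS loop stays within the peeling limits.  */
static bool
prefer_vls_p (long known_niters, unsigned vls_vf, unsigned stmts_per_iter,
              unsigned max_peel_times, unsigned max_peeled_insns)
{
  if (known_niters < 0)
    return false;                    /* Trip count unknown: keep VLA.  */

  unsigned long unrolled_iters = (unsigned long) known_niters / vls_vf;
  unsigned long unrolled_stmts = unrolled_iters * stmts_per_iter;

  return unrolled_iters <= max_peel_times
         && unrolled_stmts <= max_peeled_insns;
}

For the 16-iteration loop above, a 128-bit VLS mode gives a VF of 4 int elements, so the unrolled VLS loop would contain only four iterations' worth of statements, comfortably within the default limits; the VLS candidate therefore wins and the vectorized loop is then completely unrolled.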

gcc/ChangeLog:

	* config/riscv/riscv-vector-costs.cc (costs::analyze_loop_vinfo): New function.
	(costs::record_potential_vls_unrolling): Ditto.
	(costs::prefer_unrolled_loop): Ditto.
	(costs::better_main_loop_than_p): Ditto.
	(costs::add_stmt_cost): Ditto.
	* config/riscv/riscv-vector-costs.h (enum cost_type_enum): New enum.
	* config/riscv/t-riscv: Add new include files.

gcc/testsuite/ChangeLog:

	* gcc.target/riscv/rvv/autovec/pr111313.c: Adapt test.
	* gcc.target/riscv/rvv/autovec/vls/shift-3.c: Ditto.
	* gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-1.c: New test.
	* gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-10.c: New test.
	* gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-11.c: New test.
	* gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-12.c: New test.
	* gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-2.c: New test.
	* gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-3.c: New test.
	* gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-4.c: New test.
	* gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-5.c: New test.
	* gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-6.c: New test.
	* gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-7.c: New test.
	* gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-8.c: New test.
	* gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-9.c: New test.
Author: Juzhe-Zhong, 2023-12-12 22:25:52 +08:00 (committed by Pan Li)
Commit: 8501edba91, parent: 26250632df
17 changed files with 408 additions and 4 deletions

gcc/config/riscv/riscv-vector-costs.cc

@@ -41,6 +41,7 @@ along with GCC; see the file COPYING3. If not see
#include "ssa.h"
#include "backend.h"
#include "tree-data-ref.h"
#include "tree-ssa-loop-niter.h"
/* This file should be included last. */
#include "riscv-vector-costs.h"
@@ -601,7 +602,101 @@ preferred_new_lmul_p (loop_vec_info other_loop_vinfo)
costs::costs (vec_info *vinfo, bool costing_for_scalar)
  : vector_costs (vinfo, costing_for_scalar)
{}
{
  if (costing_for_scalar)
    m_cost_type = SCALAR_COST;
  else if (riscv_v_ext_vector_mode_p (vinfo->vector_mode))
    m_cost_type = VLA_VECTOR_COST;
  else
    m_cost_type = VLS_VECTOR_COST;
}

/* Do one-time initialization of the costs given that we're
   costing the loop vectorization described by LOOP_VINFO. */
void
costs::analyze_loop_vinfo (loop_vec_info loop_vinfo)
{
  /* Record the number of times that the vector loop would execute,
     if known. */
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  auto scalar_niters = max_stmt_executions_int (loop);
  if (scalar_niters >= 0)
    {
      unsigned int vf = vect_vf_for_cost (loop_vinfo);
      if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
        m_num_vector_iterations = scalar_niters / vf;
      else
        m_num_vector_iterations = CEIL (scalar_niters, vf);
    }

  /* Detect whether we're vectorizing for VLA and should apply the unrolling
     heuristic described above m_unrolled_vls_niters. */
  record_potential_vls_unrolling (loop_vinfo);
}

/* Decide whether to use the unrolling heuristic described above
   m_unrolled_vls_niters, updating that field if so. LOOP_VINFO
   describes the loop that we're vectorizing. */
void
costs::record_potential_vls_unrolling (loop_vec_info loop_vinfo)
{
  /* We only want to apply the heuristic if LOOP_VINFO is being
     vectorized for VLA. */
  if (m_cost_type != VLA_VECTOR_COST)
    return;

  /* We don't want to apply the heuristic to outer loops, since it's
     harder to track two levels of unrolling. */
  if (LOOP_VINFO_LOOP (loop_vinfo)->inner)
    return;

  /* Only handle cases in which the number of VLS iterations
     would be known at compile time but the number of SVE iterations
     would not. */
  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      || BYTES_PER_RISCV_VECTOR.is_constant ())
    return;

  /* Guess how many times the VLS loop would iterate and make
     sure that it is within the complete unrolling limit. Even if the
     number of iterations is small enough, the number of statements might
     not be, which is why we need to estimate the number of statements too. */
  unsigned int vls_vf = vect_vf_for_cost (loop_vinfo);
  unsigned HOST_WIDE_INT unrolled_vls_niters
    = LOOP_VINFO_INT_NITERS (loop_vinfo) / vls_vf;
  if (unrolled_vls_niters > (unsigned int) param_max_completely_peel_times)
    return;

  /* Record that we're applying the heuristic and should try to estimate
     the number of statements in the VLS loop. */
  m_unrolled_vls_niters = unrolled_vls_niters;
}

/* Return true if (a) we're applying the VLS vs. VLA unrolling
   heuristic described above m_unrolled_vls_niters and (b) the heuristic
   says that we should prefer the VLS loop. */
bool
costs::prefer_unrolled_loop () const
{
  if (!m_unrolled_vls_stmts)
    return false;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "Number of insns in"
                     " unrolled VLS loop = " HOST_WIDE_INT_PRINT_UNSIGNED "\n",
                     m_unrolled_vls_stmts);

  /* The balance here is tricky. On the one hand, we can't be sure whether
     the code is vectorizable with VLS or not. However, even if
     it isn't vectorizable with VLS, there's a possibility that
     the scalar code could also be unrolled. Some of the code might then
     benefit from SLP, or from using LDP and STP. We therefore apply
     the heuristic regardless of can_use_vls_p. */
  return (m_unrolled_vls_stmts
          && (m_unrolled_vls_stmts
              <= (unsigned int) param_max_completely_peeled_insns));
}
bool
costs::better_main_loop_than_p (const vector_costs *uncast_other) const
@@ -618,6 +713,21 @@ costs::better_main_loop_than_p (const vector_costs *uncast_other) const
                     GET_MODE_NAME (other_loop_vinfo->vector_mode),
                     vect_vf_for_cost (other_loop_vinfo));

  /* Apply the unrolling heuristic described above m_unrolled_vls_niters. */
  if (bool (m_unrolled_vls_stmts) != bool (other->m_unrolled_vls_stmts))
    {
      bool this_prefer_unrolled = this->prefer_unrolled_loop ();
      bool other_prefer_unrolled = other->prefer_unrolled_loop ();
      if (this_prefer_unrolled != other_prefer_unrolled)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "Preferring VLS loop because"
                             " it can be unrolled\n");
          return other_prefer_unrolled;
        }
    }

  if (!LOOP_VINFO_NITERS_KNOWN_P (this_loop_vinfo)
      && riscv_autovec_lmul == RVV_DYNAMIC)
    {
@@ -643,6 +753,28 @@ costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
  /* TODO: Use default STMT cost model.
          We will support more accurate STMT cost model later. */
  int stmt_cost = default_builtin_vectorization_cost (kind, vectype, misalign);

  /* Do one-time initialization based on the vinfo. */
  loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo);
  if (!m_analyzed_vinfo)
    {
      if (loop_vinfo)
        analyze_loop_vinfo (loop_vinfo);
      m_analyzed_vinfo = true;
    }

  if (stmt_info)
    {
      /* If we're applying the VLA vs. VLS unrolling heuristic,
         estimate the number of statements in the unrolled VLS
         loop. For simplicitly, we assume that one iteration of the
         VLS loop would need the same number of statements
         as one iteration of the VLA loop. */
      if (where == vect_body && m_unrolled_vls_niters)
        m_unrolled_vls_stmts += count * m_unrolled_vls_niters;
    }

  return record_stmt_cost (stmt_info, where, count * stmt_cost);
}

gcc/config/riscv/riscv-vector-costs.h

@@ -30,6 +30,13 @@ struct stmt_point
  gimple *stmt;
};

enum cost_type_enum
{
  SCALAR_COST,
  VLA_VECTOR_COST,
  VLS_VECTOR_COST
};

/* Pair typedef used by live range: <start, end>. */
typedef std::pair<unsigned int, unsigned int> pair;
@@ -49,6 +56,42 @@
                              tree vectype, int misalign,
                              vect_cost_model_location where) override;
  void finish_cost (const vector_costs *) override;

  /* True if we have performed one-time initialization based on the
     vec_info. */
  bool m_analyzed_vinfo = false;

  /* - If M_COST_TYPE = SCALAR_COST then we're costing the original scalar code.
     - If M_COST_TYPE = VLA_VECTOR_COST is nonzero then we're costing VLA
       partial vectorization codes.
     - If M_COST_TYPE = VLS_VECTOR_COST is nonzero then we're costing VLS
       minimum length vector codes. */
  enum cost_type_enum m_cost_type;

  /* On some CPUs, VLA and VLS provide the same theoretical vector
     throughput, such as 4x128 VLS vs. 2x256 VLA. In those
     situations, we try to predict whether an VLS implementation
     of the loop could be completely unrolled and become straight-line code.
     If so, it is generally better to use the VLS version rather
     than length-agnostic VLA, since the VLA loop would execute an unknown
     number of times and so could not be completely unrolled in the same way.

     If we're applying this heuristic, M_UNROLLED_VLS_NITERS is the
     number of VLS loop iterations that would be unrolled and
     M_UNROLLED_VLS_STMTS estimates the total number of statements
     in the unrolled loop. Both values are zero if we're not applying
     the heuristic. */
  unsigned HOST_WIDE_INT m_unrolled_vls_niters = 0;
  unsigned HOST_WIDE_INT m_unrolled_vls_stmts = 0;

  /* If we're vectorizing a loop that executes a constant number of times,
     this variable gives the number of times that the vector loop would
     iterate, otherwise it is zero. */
  uint64_t m_num_vector_iterations = 0;

  void analyze_loop_vinfo (loop_vec_info);
  void record_potential_vls_unrolling (loop_vec_info);
  bool prefer_unrolled_loop () const;
};
} // namespace riscv_vector

gcc/config/riscv/t-riscv

@@ -74,7 +74,7 @@ riscv-vector-costs.o: $(srcdir)/config/riscv/riscv-vector-costs.cc \
  $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) $(TARGET_H) $(FUNCTION_H) \
  $(TREE_H) basic-block.h $(RTL_H) gimple.h targhooks.h cfgloop.h \
  fold-const.h $(TM_P_H) tree-vectorizer.h gimple-iterator.h bitmap.h \
  ssa.h backend.h \
  ssa.h backend.h tree-data-ref.h tree-ssa-loop-niter.h \
  $(srcdir)/config/riscv/riscv-vector-costs.h
        $(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \
                $(srcdir)/config/riscv/riscv-vector-costs.cc

gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-1.c

@@ -0,0 +1,13 @@
/* { dg-do compile } */
/* { dg-options "-march=rv64gcv -mabi=lp64d -O3" } */

void
foo (int *__restrict a, int *__restrict b, int *__restrict c)
{
  for (int i = 0; i < 16; i++)
    a[i] = b[i] + c[i];
}

/* { dg-final { scan-assembler-times {vsetivli\s+zero,\s*4,\s*e32,\s*m1,\s*t[au],\s*m[au]} 1 } } */
/* { dg-final { scan-assembler-times {vsetivli} 1 } } */
/* { dg-final { scan-assembler-not {vsetvli} } } */

gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-10.c

@@ -0,0 +1,28 @@
/* { dg-do compile } */
/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 --param=riscv-autovec-lmul=m4 -fno-schedule-insns -fno-schedule-insns2" } */

#include <stdint-gcc.h>

#define N 40
int a[N];

__attribute__ ((noinline)) int
foo (){
  int i,j;
  int sum,x;
  for (i = 0; i < N; i++) {
    sum = 0;
    for (j = 0; j < N; j++) {
      sum += (i + j);
    }
    a[i] = sum;
  }
  return 0;
}

/* { dg-final { scan-assembler-times {vsetivli\s+zero,\s*8,\s*e32,\s*m2,\s*t[au],\s*m[au]} 1 } } */
/* { dg-final { scan-assembler-times {vsetivli\s+zero,\s*16,\s*e32,\s*m4,\s*t[au],\s*m[au]} 1 } } */
/* { dg-final { scan-assembler-times {vsetivli} 2 } } */
/* { dg-final { scan-assembler-not {vsetvli} } } */

gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-11.c

@@ -0,0 +1,28 @@
/* { dg-do compile } */
/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 --param=riscv-autovec-lmul=m8 -fno-schedule-insns -fno-schedule-insns2" } */

#include <stdint-gcc.h>

#define N 40
int a[N];

__attribute__ ((noinline)) int
foo (){
  int i,j;
  int sum,x;
  for (i = 0; i < N; i++) {
    sum = 0;
    for (j = 0; j < N; j++) {
      sum += (i + j);
    }
    a[i] = sum;
  }
  return 0;
}

/* { dg-final { scan-assembler-times {vsetivli\s+zero,\s*8,\s*e32,\s*m2,\s*t[au],\s*m[au]} 1 } } */
/* { dg-final { scan-assembler-times {vsetvli\s+zero,\s*[a-x0-9]+,\s*e32,\s*m8,\s*t[au],\s*m[au]} 1 } } */
/* { dg-final { scan-assembler-times {vsetivli} 1 } } */
/* { dg-final { scan-assembler-times {vsetvli} 1 } } */

gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-12.c

@@ -0,0 +1,28 @@
/* { dg-do compile } */
/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 --param=riscv-autovec-lmul=m8 -fno-schedule-insns -fno-schedule-insns2" } */

#include <stdint-gcc.h>

#define N 40
int a[N];

__attribute__ ((noinline)) int
foo (){
  int i,j;
  int sum,x;
  for (i = 0; i < N; i++) {
    sum = 0;
    for (j = 0; j < N; j++) {
      sum += (i + j);
    }
    a[i] = sum;
  }
  return 0;
}

/* { dg-final { scan-assembler-times {vsetivli\s+zero,\s*8,\s*e32,\s*m2,\s*t[au],\s*m[au]} 1 } } */
/* { dg-final { scan-assembler-times {vsetvli\s+zero,\s*[a-x0-9]+,\s*e32,\s*m8,\s*t[au],\s*m[au]} 1 } } */
/* { dg-final { scan-assembler-times {vsetivli} 1 } } */
/* { dg-final { scan-assembler-times {vsetvli} 1 } } */

gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-2.c

@@ -0,0 +1,13 @@
/* { dg-do compile } */
/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 --param=riscv-autovec-lmul=m2" } */

void
foo (int *__restrict a, int *__restrict b, int *__restrict c)
{
  for (int i = 0; i < 16; i++)
    a[i] = b[i] + c[i];
}

/* { dg-final { scan-assembler-times {vsetivli\s+zero,\s*8,\s*e32,\s*m2,\s*t[au],\s*m[au]} 1 } } */
/* { dg-final { scan-assembler-times {vsetivli} 1 } } */
/* { dg-final { scan-assembler-not {vsetvli} } } */

gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-3.c

@@ -0,0 +1,13 @@
/* { dg-do compile } */
/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 --param=riscv-autovec-lmul=m4" } */

void
foo (int *__restrict a, int *__restrict b, int *__restrict c)
{
  for (int i = 0; i < 16; i++)
    a[i] = b[i] + c[i];
}

/* { dg-final { scan-assembler-times {vsetivli\s+zero,\s*16,\s*e32,\s*m4,\s*t[au],\s*m[au]} 1 } } */
/* { dg-final { scan-assembler-times {vsetivli} 1 } } */
/* { dg-final { scan-assembler-not {vsetvli} } } */

gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-4.c

@@ -0,0 +1,13 @@
/* { dg-do compile } */
/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 --param=riscv-autovec-lmul=m8" } */

void
foo (int *__restrict a, int *__restrict b, int *__restrict c)
{
  for (int i = 0; i < 16; i++)
    a[i] = b[i] + c[i];
}

/* { dg-final { scan-assembler-times {vsetivli\s+zero,\s*16,\s*e32,\s*m4,\s*t[au],\s*m[au]} 1 } } */
/* { dg-final { scan-assembler-times {vsetivli} 1 } } */
/* { dg-final { scan-assembler-not {vsetvli} } } */

gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-5.c

@@ -0,0 +1,13 @@
/* { dg-do compile } */
/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 --param=riscv-autovec-lmul=dynamic" } */

void
foo (int *__restrict a, int *__restrict b, int *__restrict c)
{
  for (int i = 0; i < 16; i++)
    a[i] = b[i] + c[i];
}

/* { dg-final { scan-assembler-times {vsetivli\s+zero,\s*16,\s*e32,\s*m4,\s*t[au],\s*m[au]} 1 } } */
/* { dg-final { scan-assembler-times {vsetivli} 1 } } */
/* { dg-final { scan-assembler-not {vsetvli} } } */

gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-6.c

@@ -0,0 +1,13 @@
/* { dg-do compile } */
/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 --param=riscv-autovec-lmul=m8" } */

void
foo (int *__restrict a, int *__restrict b, int *__restrict c)
{
  for (int i = 0; i < 32; i++)
    a[i] = b[i] + c[i];
}

/* { dg-final { scan-assembler-times {vsetvli\s+zero,\s*[a-x0-9]+,\s*e32,\s*m8,\s*t[au],\s*m[au]} 1 } } */
/* { dg-final { scan-assembler-times {vsetvli} 1 } } */
/* { dg-final { scan-assembler-not {vsetivli} } } */

gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-7.c

@@ -0,0 +1,13 @@
/* { dg-do compile } */
/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 --param=riscv-autovec-lmul=dynamic" } */

void
foo (int *__restrict a, int *__restrict b, int *__restrict c)
{
  for (int i = 0; i < 32; i++)
    a[i] = b[i] + c[i];
}

/* { dg-final { scan-assembler-times {vsetvli\s+zero,\s*[a-x0-9]+,\s*e32,\s*m8,\s*t[au],\s*m[au]} 1 } } */
/* { dg-final { scan-assembler-times {vsetvli} 1 } } */
/* { dg-final { scan-assembler-not {vsetivli} } } */

gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-8.c

@@ -0,0 +1,27 @@
/* { dg-do compile } */
/* { dg-options "-march=rv64gcv -mabi=lp64d -O3" } */

#include <stdint-gcc.h>

#define N 40
int a[N];

__attribute__ ((noinline)) int
foo (){
  int i,j;
  int sum,x;
  for (i = 0; i < N; i++) {
    sum = 0;
    for (j = 0; j < N; j++) {
      sum += (i + j);
    }
    a[i] = sum;
  }
  return 0;
}

/* { dg-final { scan-assembler-times {vsetivli\s+zero,\s*4,\s*e32,\s*m1,\s*t[au],\s*m[au]} 1 } } */
/* { dg-final { scan-assembler-times {vsetivli} 1 } } */
/* { dg-final { scan-assembler-not {vsetvli} } } */

gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-9.c

@@ -0,0 +1,27 @@
/* { dg-do compile } */
/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 --param=riscv-autovec-lmul=m2" } */

#include <stdint-gcc.h>

#define N 40
int a[N];

__attribute__ ((noinline)) int
foo (){
  int i,j;
  int sum,x;
  for (i = 0; i < N; i++) {
    sum = 0;
    for (j = 0; j < N; j++) {
      sum += (i + j);
    }
    a[i] = sum;
  }
  return 0;
}

/* { dg-final { scan-assembler-times {vsetivli\s+zero,\s*8,\s*e32,\s*m2,\s*t[au],\s*m[au]} 1 } } */
/* { dg-final { scan-assembler-times {vsetivli} 1 } } */
/* { dg-final { scan-assembler-not {vsetvli} } } */

gcc/testsuite/gcc.target/riscv/rvv/autovec/pr111313.c

@@ -1,5 +1,5 @@
/* { dg-do compile } */
/* { dg-options "-march=rv64gcv -mabi=lp64d --param=riscv-autovec-preference=scalable -O3 -fno-schedule-insns -fno-schedule-insns2" } */
/* { dg-options "-march=rv64gcv -mabi=lp64d --param=riscv-autovec-preference=scalable -O3 -fno-schedule-insns -fno-schedule-insns2 -fno-vect-cost-model" } */
#define K 32
short in[2*K][K];

gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/shift-3.c

@@ -53,5 +53,5 @@ DEF_OP_VV (shift, 128, int64_t, <<)
DEF_OP_VV (shift, 256, int64_t, <<)
DEF_OP_VV (shift, 512, int64_t, <<)
/* { dg-final { scan-assembler-times {vsll\.vv\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 43 } } */
/* { dg-final { scan-assembler-times {vsll\.vv\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 46 } } */
/* { dg-final { scan-assembler-not {csrr} } } */