RISC-V: Using merge approach to optimize repeating sequence in vec_init
This patch would like to optimize the VLS vector initialization of a
repeating sequence: from vslide1down to vmerge, using a simple cost model
in which every instruction has a cost of 1.

Given code with -march=rv64gcv_zvl256b --param riscv-autovec-preference=fixed-vlmax:

typedef int64_t vnx32di __attribute__ ((vector_size (256)));

__attribute__ ((noipa)) void
f_vnx32di (int64_t a, int64_t b, int64_t *out)
{
  vnx32di v = {
    a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b,
    a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b,
  };
  *(vnx32di *) out = v;
}

Before this patch:
vslide1down.vx (x31 times)

After this patch:
li a5,-1431654400
addi a5,a5,-1365
li a3,-1431654400
addi a3,a3,-1366
slli a5,a5,32
add a5,a5,a3
vsetvli a4,zero,e64,m8,ta,ma
vmv.v.x v8,a0
vmv.s.x v0,a5
vmerge.vxm v8,v8,a1,v0
vs8r.v v8,0(a2)

Since we don't have SEW = 128 in vec_duplicate, we can't combine "ab" into
a SEW = 128 element and then broadcast that big element.

Signed-off-by: Pan Li <pan2.li@intel.com>
Co-Authored by: Juzhe-Zhong <juzhe.zhong@rivai.ai>

gcc/ChangeLog:

	* config/riscv/riscv-protos.h (enum insn_type): New type.
	* config/riscv/riscv-v.cc (RVV_INSN_OPERANDS_MAX): New macro.
	(rvv_builder::can_duplicate_repeating_sequence_p): Align the
	referenced class member.
	(rvv_builder::get_merged_repeating_sequence): Ditto.
	(rvv_builder::repeating_sequence_use_merge_profitable_p): New
	function to evaluate the optimization cost.
	(rvv_builder::get_merge_scalar_mask): New function to get the
	merge mask.
	(emit_scalar_move_insn): New function to emit vmv.s.x.
	(emit_vlmax_integer_move_insn): New function to emit vlmax vmv.v.x.
	(emit_nonvlmax_integer_move_insn): New function to emit nonvlmax
	vmv.v.x.
	(get_repeating_sequence_dup_machine_mode): New function to get
	the dup machine mode.
	(expand_vector_init_merge_repeating_sequence): New function to
	perform the optimization.
	(expand_vec_init): Add this vector init optimization.
	* config/riscv/riscv.h (BITS_PER_WORD): New macro.

gcc/testsuite/ChangeLog:

	* gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-1.c: New test.
	* gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-2.c: New test.
	* gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-3.c: New test.
	* gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-4.c: New test.
	* gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-5.c: New test.
	* gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-run-1.c: New test.
	* gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-run-2.c: New test.
	* gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-run-3.c: New test.

Signed-off-by: Pan Li <pan2.li@intel.com>
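For reference, the li/addi/slli/add sequence in the after-patch codegen
materializes the 64-bit merge mask 0xAAAAAAAAAAAAAAAA, i.e. one set bit per
"b" lane. A minimal C sketch (not part of the patch) checking that arithmetic:

#include <assert.h>
#include <stdint.h>

int
main (void)
{
  /* li a5,-1431654400; addi a5,a5,-1365 -> upper half of the mask.  */
  int64_t a5 = (int64_t) -1431654400 + -1365;  /* 0xFFFFFFFFAAAAAAAB */
  /* li a3,-1431654400; addi a3,a3,-1366 -> lower half of the mask.  */
  int64_t a3 = (int64_t) -1431654400 + -1366;  /* 0xFFFFFFFFAAAAAAAA */
  /* slli a5,a5,32; add a5,a5,a3 -> the full 64-bit merge mask.  */
  uint64_t mask = ((uint64_t) a5 << 32) + (uint64_t) a3;
  assert (mask == 0xAAAAAAAAAAAAAAAAull);      /* bit i set <=> lane i is "b" */
  return 0;
}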
parent 8d1d9b1648, commit a99dc11fe2, 11 changed files with 457 additions and 6 deletions
gcc/config/riscv/riscv-protos.h
@@ -142,6 +142,7 @@ enum insn_type
  RVV_CMP_MU_OP = RVV_CMP_OP + 2, /* +2 means mask and maskoff operand.  */
  RVV_UNOP_MU = RVV_UNOP + 2, /* Likewise.  */
  RVV_TERNOP = 5,
+ RVV_SCALAR_MOV_OP = 4, /* +1 for VUNDEF according to vector.md.  */
};

enum vlmul_type
{
gcc/config/riscv/riscv-v.cc
@@ -21,6 +21,10 @@

#define IN_TARGET_CODE 1

+/* We have a maximum of 11 operands for RVV instruction patterns according to
+   the vector.md.  */
+#define RVV_INSN_OPERANDS_MAX 11
+
#include "config.h"
#include "system.h"
#include "coretypes.h"
@@ -1286,19 +1290,32 @@ public:
    : rtx_vector_builder (mode, npatterns, nelts_per_pattern)
  {
    m_inner_mode = GET_MODE_INNER (mode);
-   m_inner_size = GET_MODE_BITSIZE (m_inner_mode).to_constant ();
+   m_inner_bits_size = GET_MODE_BITSIZE (m_inner_mode);
+   m_inner_bytes_size = GET_MODE_SIZE (m_inner_mode);

    gcc_assert (
      int_mode_for_size (inner_bits_size (), 0).exists (&m_inner_int_mode));
  }

  bool can_duplicate_repeating_sequence_p ();
  rtx get_merged_repeating_sequence ();

+ bool repeating_sequence_use_merge_profitable_p ();
+ rtx get_merge_scalar_mask (unsigned int) const;

  machine_mode new_mode () const { return m_new_mode; }
  scalar_mode inner_mode () const { return m_inner_mode; }
  scalar_int_mode inner_int_mode () const { return m_inner_int_mode; }
+ unsigned int inner_bits_size () const { return m_inner_bits_size; }
+ unsigned int inner_bytes_size () const { return m_inner_bytes_size; }

private:
- machine_mode m_inner_mode;
+ scalar_mode m_inner_mode;
  scalar_int_mode m_inner_int_mode;
  machine_mode m_new_mode;
  scalar_int_mode m_new_inner_mode;
- unsigned int m_inner_size;
+ unsigned int m_inner_bits_size;
+ unsigned int m_inner_bytes_size;
};

/* Return true if the vector duplicated by a super element which is the fusion
@@ -1309,7 +1326,7 @@ bool
rvv_builder::can_duplicate_repeating_sequence_p ()
{
  poly_uint64 new_size = exact_div (full_nelts (), npatterns ());
- unsigned int new_inner_size = m_inner_size * npatterns ();
+ unsigned int new_inner_size = m_inner_bits_size * npatterns ();
  if (!int_mode_for_size (new_inner_size, 0).exists (&m_new_inner_mode)
      || GET_MODE_SIZE (m_new_inner_mode) > UNITS_PER_WORD
      || !get_vector_mode (m_new_inner_mode, new_size).exists (&m_new_mode))
@@ -1317,6 +1334,61 @@ rvv_builder::can_duplicate_repeating_sequence_p ()
  return repeating_sequence_p (0, full_nelts ().to_constant (), npatterns ());
}

/* Return true if it is a repeating sequence for which the merge approach
   gives better codegen than the default approach (slide1down).

     Sequence A:
       {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b}

     nelts = 16
     npatterns = 2

     for merging a we need mask 101010....
     for merging b we need mask 010101....

   For each element in the pattern, we need to build a mask in a scalar
   register.  Mostly we need 3 instructions (COST = 3): 2 scalar
   instructions and 1 scalar move to the v0 register.  Finally we need a
   vector merge to merge them.

     lui a5, #imm
     addi a5, a5, #imm
     vmv.s.x v0, a5
     vmerge.vxm v9, v9, a1, v0

   So the overall (rough) COST of Sequence A = (3 + 1) * npatterns = 8.
   If we use slide1down, the COST = nelts = 16 > 8 (COST of merge).
   So return true in this case as it is profitable.

     Sequence B:
       {a, b, c, d, e, f, g, h, a, b, c, d, e, f, g, h}

     nelts = 16
     npatterns = 8

   COST of merge approach = (3 + 1) * npatterns = 32
   COST of slide1down approach = nelts = 16
   Return false in this case as the merge approach is NOT profitable.  */
bool
rvv_builder::repeating_sequence_use_merge_profitable_p ()
{
  if (inner_bytes_size () > UNITS_PER_WORD)
    return false;

  unsigned int nelts = full_nelts ().to_constant ();

  if (!repeating_sequence_p (0, nelts, npatterns ()))
    return false;

  unsigned int merge_cost = 1;
  unsigned int build_merge_mask_cost = 3;
  unsigned int slide1down_cost = nelts;

  return (build_merge_mask_cost + merge_cost) * npatterns () < slide1down_cost;
}
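To make the cost arithmetic above concrete, here is a standalone sketch of
the same profitability check with the constants hard-coded (illustrative
only, not the GCC code path):

#include <assert.h>
#include <stdbool.h>

/* Merge needs a mask build (~3 insns) plus one vmerge per pattern;
   slide1down needs one instruction per element.  */
static bool
merge_profitable_p (unsigned int nelts, unsigned int npatterns)
{
  const unsigned int merge_cost = 1;
  const unsigned int build_merge_mask_cost = 3;
  const unsigned int slide1down_cost = nelts;
  return (build_merge_mask_cost + merge_cost) * npatterns < slide1down_cost;
}

int
main (void)
{
  assert (merge_profitable_p (16, 2));   /* Sequence A: 8 < 16.  */
  assert (!merge_profitable_p (16, 8));  /* Sequence B: 32 > 16.  */
  return 0;
}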

/* Merge the repeating sequence into a single element and return the RTX.  */
rtx
rvv_builder::get_merged_repeating_sequence ()
@@ -1324,11 +1396,11 @@ rvv_builder::get_merged_repeating_sequence ()
  scalar_int_mode mode = Pmode;
  rtx target = gen_reg_rtx (mode);
  emit_move_insn (target, const0_rtx);
- rtx imm = gen_int_mode ((1ULL << m_inner_size) - 1, mode);
+ rtx imm = gen_int_mode ((1ULL << m_inner_bits_size) - 1, mode);
  /* { a, b, a, b }: Generate duplicate element = b << bits | a.  */
  for (unsigned int i = 0; i < npatterns (); i++)
    {
-     unsigned int loc = m_inner_size * i;
+     unsigned int loc = m_inner_bits_size * i;
      rtx shift = gen_int_mode (loc, mode);
      rtx ele = gen_lowpart (mode, elt (i));
      rtx tmp = expand_simple_binop (mode, AND, ele, imm, NULL_RTX, false,
@@ -1344,6 +1416,29 @@ rvv_builder::get_merged_repeating_sequence ()
  return target;
}
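As an illustration of the packing loop above, a scalar sketch for an
{ a, b, a, b, ... } sequence with 32-bit elements (the values are
hypothetical):

#include <assert.h>
#include <stdint.h>

int
main (void)
{
  uint32_t a = 0x11111111, b = 0x22222222;
  uint64_t imm = (1ULL << 32) - 1;            /* mask of one inner element */
  uint64_t merged = 0;

  /* Duplicate element = b << bits | a, as in the comment above.  */
  merged |= ((uint64_t) a & imm) << (32 * 0); /* pattern element 0 */
  merged |= ((uint64_t) b & imm) << (32 * 1); /* pattern element 1 */

  assert (merged == 0x2222222211111111ull);   /* broadcastable super element */
  return 0;
}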

/* Get the mask for the merge approach.

   Consider the following case:
     {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b}
   To merge "a", the mask should be 1010....
   To merge "b", the mask should be 0101....  */
rtx
rvv_builder::get_merge_scalar_mask (unsigned int index_in_pattern) const
{
  unsigned HOST_WIDE_INT mask = 0;
  unsigned HOST_WIDE_INT base_mask = (1ULL << index_in_pattern);

  gcc_assert (BITS_PER_WORD % npatterns () == 0);

  int limit = BITS_PER_WORD / npatterns ();

  for (int i = 0; i < limit; i++)
    mask |= base_mask << (i * npatterns ());

  return gen_int_mode (mask, inner_int_mode ());
}
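For instance, the replication loop above with a 64-bit word, npatterns = 4
and index_in_pattern = 2 yields 0x4444444444444444. A sketch (the concrete
values are hypothetical):

#include <assert.h>
#include <stdint.h>

int
main (void)
{
  const unsigned int bits_per_word = 64;      /* assumed rv64 BITS_PER_WORD */
  unsigned int npatterns = 4, index_in_pattern = 2;
  uint64_t base_mask = 1ULL << index_in_pattern;
  uint64_t mask = 0;

  /* Replicate the pattern bit once per repetition of the pattern.  */
  for (unsigned int i = 0; i < bits_per_word / npatterns; i++)
    mask |= base_mask << (i * npatterns);

  assert (mask == 0x4444444444444444ull);     /* bit set every 4th lane */
  return 0;
}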

/* Subroutine of riscv_vector_expand_vector_init.
   Works as follows:
   (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
@@ -1371,6 +1466,111 @@ expand_vector_init_insert_elems (rtx target, const rvv_builder &builder,
    }
}

/* Emit vmv.s.x instruction.  */

static void
emit_scalar_move_insn (unsigned icode, rtx *ops)
{
  machine_mode data_mode = GET_MODE (ops[0]);
  machine_mode mask_mode = get_mask_mode (data_mode).require ();
  insn_expander<RVV_INSN_OPERANDS_MAX> e (riscv_vector::RVV_SCALAR_MOV_OP,
					  /* HAS_DEST_P */ true,
					  /* FULLY_UNMASKED_P */ false,
					  /* USE_REAL_MERGE_P */ true,
					  /* HAS_AVL_P */ true,
					  /* VLMAX_P */ false,
					  data_mode, mask_mode);
  e.set_policy (TAIL_ANY);
  e.set_policy (MASK_ANY);
  e.set_vl (CONST1_RTX (Pmode));
  e.emit_insn ((enum insn_code) icode, ops);
}

/* Emit vmv.v.x instruction with vlmax.  */

static void
emit_vlmax_integer_move_insn (unsigned icode, rtx *ops, rtx vl)
{
  emit_vlmax_insn (icode, riscv_vector::RVV_UNOP, ops, vl);
}

/* Emit vmv.v.x instruction with nonvlmax.  */

static void
emit_nonvlmax_integer_move_insn (unsigned icode, rtx *ops, rtx avl)
{
  emit_nonvlmax_insn (icode, riscv_vector::RVV_UNOP, ops, avl);
}

/* Get the dup machine mode of the repeating sequence.  */

static machine_mode
get_repeating_sequence_dup_machine_mode (const rvv_builder &builder)
{
  poly_uint64 dup_nunits = GET_MODE_NUNITS (builder.mode ());

  if (known_ge (GET_MODE_SIZE (builder.mode ()), BYTES_PER_RISCV_VECTOR))
    {
      dup_nunits = exact_div (BYTES_PER_RISCV_VECTOR,
			      builder.inner_bytes_size ());
    }

  return get_vector_mode (builder.inner_int_mode (), dup_nunits).require ();
}
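A worked example of this sizing, assuming a 256-bit vector register
(BYTES_PER_RISCV_VECTOR = 32 bytes) and 64-bit elements; all values here
are hypothetical:

#include <assert.h>

int
main (void)
{
  const unsigned int bytes_per_vector = 32; /* assumed: VLEN = 256 bits */
  unsigned int mode_bytes = 256;            /* e.g. a 2048-bit VLS mode */
  unsigned int inner_bytes = 8;             /* 64-bit elements */

  /* Start from the full VLS element count ...  */
  unsigned int dup_nunits = mode_bytes / inner_bytes;       /* 32 */
  /* ... but clamp the dup vector to one vector register.  */
  if (mode_bytes >= bytes_per_vector)
    dup_nunits = bytes_per_vector / inner_bytes;            /* 4 */

  assert (dup_nunits == 4);
  return 0;
}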

/* Use the merge approach to initialize the vector with a repeating sequence.
     v = {a, b, a, b, a, b, a, b}.

     v = broadcast (a).
     mask = 0b01010101....
     v = merge (v, b, mask)  */
static void
expand_vector_init_merge_repeating_sequence (rtx target,
					     const rvv_builder &builder)
{
  machine_mode dup_mode = get_repeating_sequence_dup_machine_mode (builder);
  machine_mode dup_mask_mode = get_mask_mode (dup_mode).require ();
  machine_mode mask_mode = get_mask_mode (builder.mode ()).require ();
  uint64_t full_nelts = builder.full_nelts ().to_constant ();

  /* Step 1: Broadcast the first pattern.  */
  rtx ops[] = {target, force_reg (GET_MODE_INNER (dup_mode), builder.elt (0))};
  emit_vlmax_integer_move_insn (code_for_pred_broadcast (builder.mode ()),
				ops, NULL_RTX);

  /* Step 2: Merge the rest of the pattern iterations.  */
  for (unsigned int i = 1; i < builder.npatterns (); i++)
    {
      /* Step 2-1: Generate the mask register v0 for each merge.  */
      rtx merge_mask = builder.get_merge_scalar_mask (i);
      rtx mask = gen_reg_rtx (mask_mode);
      rtx dup = gen_reg_rtx (dup_mode);

      if (full_nelts <= BITS_PER_WORD) /* vmv.s.x.  */
	{
	  rtx ops[] = {dup, gen_scalar_move_mask (dup_mask_mode),
		       RVV_VUNDEF (dup_mode), merge_mask};
	  emit_scalar_move_insn (code_for_pred_broadcast (GET_MODE (dup)),
				 ops);
	}
      else /* vmv.v.x.  */
	{
	  rtx ops[] = {dup, force_reg (GET_MODE_INNER (dup_mode), merge_mask)};
	  rtx vl = gen_int_mode (CEIL (full_nelts, BITS_PER_WORD), Pmode);
	  emit_nonvlmax_integer_move_insn (code_for_pred_broadcast (dup_mode),
					   ops, vl);
	}

      emit_move_insn (mask, gen_lowpart (mask_mode, dup));

      /* Step 2-2: Merge the pattern according to the mask.  */
      rtx ops[] = {target, target, builder.elt (i), mask};
      emit_vlmax_merge_insn (code_for_pred_merge_scalar (GET_MODE (target)),
			     riscv_vector::RVV_MERGE_OP, ops);
    }
}
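Putting the steps together, here is a plain-C emulation of what the
expansion computes for {a, b, a, b, ...}; this models the semantics only,
while the real function emits RVV instructions:

#include <assert.h>
#include <stdint.h>

int
main (void)
{
  enum { NELTS = 16, NPATTERNS = 2 };
  int64_t pat[NPATTERNS] = { 11, 22 };        /* the {a, b} pattern */
  int64_t v[NELTS];

  /* Step 1: broadcast the first pattern element (vmv.v.x).  */
  for (int i = 0; i < NELTS; i++)
    v[i] = pat[0];

  /* Step 2: merge each remaining pattern element under its mask.  */
  for (int p = 1; p < NPATTERNS; p++)
    {
      uint64_t mask = 0;                      /* get_merge_scalar_mask */
      for (int i = p; i < NELTS; i += NPATTERNS)
	mask |= 1ULL << i;
      for (int i = 0; i < NELTS; i++)         /* vmerge.vxm semantics */
	if (mask & (1ULL << i))
	  v[i] = pat[p];
    }

  for (int i = 0; i < NELTS; i++)
    assert (v[i] == pat[i % NPATTERNS]);
  return 0;
}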

/* Initialize register TARGET from the elements in PARALLEL rtx VALS.  */

void
@@ -1394,6 +1594,19 @@ expand_vec_init (rtx target, rtx vals)
      emit_move_insn (target, gen_lowpart (mode, dup));
      return;
    }

  /* Case 2: Optimize repeating sequence cases that Case 1 cannot
     handle, when it is profitable.  For example:
       ELEMENT BITSIZE = 64.
       v = {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b}.
     We can't find a vector mode for "ab" which would be combined into
     a 128-bit element to duplicate.  */
  if (v.repeating_sequence_use_merge_profitable_p ())
    {
      expand_vector_init_merge_repeating_sequence (target, v);
      return;
    }

  /* TODO: We will support more initialization of vectors in the future.  */
}
|
|
@ -150,6 +150,7 @@ ASM_MISA_SPEC
|
|||
|
||||
/* Width of a word, in units (bytes). */
|
||||
#define UNITS_PER_WORD (TARGET_64BIT ? 8 : 4)
|
||||
#define BITS_PER_WORD (BITS_PER_UNIT * UNITS_PER_WORD)
|
||||
#ifndef IN_LIBGCC2
|
||||
#define MIN_UNITS_PER_WORD 4
|
||||
#endif
|
||||
|
|
|
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-1.c
@@ -0,0 +1,21 @@
/* { dg-do compile } */
/* { dg-additional-options "-march=rv64gcv_zvl1024b -mabi=lp64d" } */

#include <stdint-gcc.h>

typedef int64_t vnx16di __attribute__ ((vector_size (1024)));

__attribute__ ((noipa)) void
f_vnx16di (int64_t a, int64_t b, int64_t *out)
{
  vnx16di v = {
    a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b,
    a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b,
    a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b,
    a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b,
  };
  *(vnx16di *) out = v;
}

/* { dg-final { scan-assembler-times {vmv\.v\.x\s+v[0-9]+,\s*[a-x0-9]+} 2 } } */
/* { dg-final { scan-assembler-times {vmerge\.vxm\s+v[0-9]+,\s*v[0-9]+,\s*[a-x0-9]+,\s*v0} 1 } } */
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-2.c
@@ -0,0 +1,24 @@
/* { dg-do compile } */
/* { dg-additional-options "-march=rv64gcv -mabi=lp64d" } */

#include <stdint-gcc.h>

typedef double vnx8df __attribute__ ((vector_size (64)));
typedef double vnx16df __attribute__ ((vector_size (128)));

__attribute__ ((noipa)) void
f_vnx8df (double a, double b, double *out)
{
  vnx8df v = {a, b, a, b, a, b, a, b};
  *(vnx8df *) out = v;
}

__attribute__ ((noipa)) void
f_vnx16df (double a, double b, double *out)
{
  vnx16df v = {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b};
  *(vnx16df *) out = v;
}

/* { dg-final { scan-assembler-times {vmv\.s\.x\tv[0-9]+,\s*[a-x0-9]+} 1 } } */
/* { dg-final { scan-assembler-times {vfmerge\.vfm\tv[0-9]+,\s*v[0-9]+,\s*[a-x0-9]+,\s*v0} 1 } } */
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-3.c
@@ -0,0 +1,25 @@
/* { dg-do compile } */
/* { dg-additional-options "-march=rv64gcv -mabi=lp64d" } */

#include <stdint-gcc.h>

typedef int64_t vnx8di __attribute__ ((vector_size (64)));
typedef int64_t vnx16di __attribute__ ((vector_size (128)));

__attribute__ ((noipa)) void
f_vnx8di (int64_t a, int64_t b, int64_t *out)
{
  vnx8di v = {a, b, a, b, a, b, a, b};
  *(vnx8di *) out = v;
}

__attribute__ ((noipa)) void
f_vnx16di (int64_t a, int64_t b, int64_t *out)
{
  vnx16di v = {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b};
  *(vnx16di *) out = v;
}

/* { dg-final { scan-assembler-times {vmv\.s\.x\tv[0-9]+,\s*[a-x0-9]+} 1 } } */
/* { dg-final { scan-assembler-times {vmerge\.vxm\tv[0-9]+,\s*v[0-9]+,\s*[a-x0-9]+,\s*v0} 1 } } */
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-4.c
@@ -0,0 +1,15 @@
/* { dg-do compile } */
/* { dg-additional-options "-march=rv64gcv -mabi=lp64d" } */

#include <stdint-gcc.h>

typedef int64_t vnx8di __attribute__ ((vector_size (64)));

__attribute__ ((noipa)) void
f_vnx8di (int64_t a, int64_t b, int64_t c, int64_t d, int64_t *out)
{
  vnx8di v = {a, b, c, d, a, b, c, d};
  *(vnx8di *) out = v;
}

/* { dg-final { scan-assembler-times {vslide1down\.vx\tv[0-9]+,\s*v[0-9]+,\s*[a-x0-9]+} 7 } } */
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-5.c
@@ -0,0 +1,17 @@
/* { dg-do compile } */
/* { dg-additional-options "-march=rv64gcv -mabi=lp64d" } */

#include <stdint-gcc.h>

typedef int64_t vnx16di __attribute__ ((vector_size (128)));

__attribute__ ((noipa)) void
f_vnx16di (int64_t a, int64_t b, int64_t c, int64_t d, int64_t *out)
{
  vnx16di v = {a, b, c, d, a, b, c, d, a, b, c, d, a, b, c, d,};
  *(vnx16di *) out = v;
}

/* { dg-final { scan-assembler-times {vmv\.v\.x\tv[0-9]+,\s*[a-x0-9]+} 1 } } */
/* { dg-final { scan-assembler-times {vmv\.s\.x\tv[0-9]+,\s*[a-x0-9]+} 0 } } */
/* { dg-final { scan-assembler-times {vmerge\.vxm\tv[0-9]+,\s*v[0-9]+,\s*[a-x0-9]+,\s*v0} 0 } } */
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-run-1.c
@@ -0,0 +1,47 @@
/* { dg-do run { target { riscv_vector } } } */
/* { dg-options "--param riscv-autovec-preference=fixed-vlmax -O3" } */

#include "init-repeat-sequence-2.c"

int
main ()
{
  double a = -1789089.23423;
  double b = -8916156.45644;

  double v_vnx8df[sizeof (vnx8df) / sizeof (double)];
  f_vnx8df (a, b, v_vnx8df);

  for (int i = 0; i < sizeof (vnx8df) / sizeof (double); i++)
    {
      if (i % 2 == 0)
	{
	  if (v_vnx8df[i] != a)
	    __builtin_abort ();
	}
      else
	{
	  if (v_vnx8df[i] != b)
	    __builtin_abort ();
	}
    }

  double v_vnx16df[sizeof (vnx16df) / sizeof (double)];
  f_vnx16df (a, b, v_vnx16df);
  for (int i = 0; i < sizeof (vnx16df) / sizeof (double); i++)
    {
      if (i % 2 == 0)
	{
	  if (v_vnx16df[i] != a)
	    __builtin_abort ();
	}
      else
	{
	  if (v_vnx16df[i] != b)
	    __builtin_abort ();
	}
    }

  return 0;
}
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-run-2.c
@@ -0,0 +1,46 @@
/* { dg-do run { target { riscv_vector } } } */
/* { dg-options "--param riscv-autovec-preference=fixed-vlmax -O3" } */

#include "init-repeat-sequence-3.c"

int
main ()
{
  int64_t a = -178908923423;
  int64_t b = -891615645644;

  int64_t v_vnx8di[sizeof (vnx8di) / sizeof (int64_t)];
  f_vnx8di (a, b, v_vnx8di);
  for (int i = 0; i < sizeof (vnx8di) / sizeof (int64_t); i++)
    {
      if (i % 2 == 0)
	{
	  if (v_vnx8di[i] != a)
	    __builtin_abort ();
	}
      else
	{
	  if (v_vnx8di[i] != b)
	    __builtin_abort ();
	}
    }

  int64_t v_vnx16di[sizeof (vnx16di) / sizeof (int64_t)];
  f_vnx16di (a, b, v_vnx16di);

  for (int i = 0; i < sizeof (vnx16di) / sizeof (int64_t); i++)
    {
      if (i % 2 == 0)
	{
	  if (v_vnx16di[i] != a)
	    __builtin_abort ();
	}
      else
	{
	  if (v_vnx16di[i] != b)
	    __builtin_abort ();
	}
    }

  return 0;
}
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-run-3.c
@@ -0,0 +1,41 @@
/* { dg-do run { target { riscv_vector } } } */
/* { dg-options "--param riscv-autovec-preference=fixed-vlmax -O3" } */

#include "init-repeat-sequence-5.c"

int
main ()
{
  int64_t a = -178908923423;
  int64_t b = -891615645644;
  int64_t c = 78908923423;
  int64_t d = 81615645644;

  int64_t v_vnx16di[sizeof (vnx16di) / sizeof (int64_t)];
  f_vnx16di (a, b, c, d, v_vnx16di);
  for (int i = 0; i < sizeof (vnx16di) / sizeof (int64_t); i++)
    {
      if (i % 4 == 0)
	{
	  if (v_vnx16di[i] != a)
	    __builtin_abort ();
	}
      else if (i % 4 == 1)
	{
	  if (v_vnx16di[i] != b)
	    __builtin_abort ();
	}
      else if (i % 4 == 2)
	{
	  if (v_vnx16di[i] != c)
	    __builtin_abort ();
	}
      else
	{
	  if (v_vnx16di[i] != d)
	    __builtin_abort ();
	}
    }

  return 0;
}