arm: Auto-vectorization for MVE: add pack/unpack patterns
This patch adds vec_unpack<US>_hi_<mode>, vec_unpack<US>_lo_<mode>,
vec_pack_trunc_<mode> patterns for MVE.

It does so by moving the unpack patterns from neon.md to
vec-common.md, while adding support for MVE to them. The pack expander
is derived from the Neon one (which in turn is renamed into
neon_quad_vec_pack_trunc_<mode>).

The patch introduces mve_vec_unpack<US>_lo_<mode> and
mve_vec_unpack<US>_hi_<mode>, which are similar to their Neon
counterparts, except for the assembly syntax.

The patch introduces mve_vec_pack_trunc_lo_<mode> to avoid the need
for a zero-initialized temporary, which is needed if the
vec_pack_trunc_<mode> expander calls @mve_vmovn[bt]q_<supf><mode>
instead.

With this patch, we can now vectorize the 16 and 8-bit versions of
vclz and vshl, although the generated code could still be improved.

For test_clz_s16, we now generate
	vldrh.16	q3, [r1]
	vmovlb.s16	q2, q3
	vmovlt.s16	q3, q3
	vclz.i32	q2, q2
	vclz.i32	q3, q3
	vmovnb.i32	q1, q2
	vmovnt.i32	q1, q3
	vstrh.16	q1, [r0]
which could be improved to
	vldrh.16	q3, [r1]
	vclz.i16	q1, q3
	vstrh.16	q1, [r0]
if we could avoid the need for unpack/pack steps.

For reference, clang-12 generates:
	vldrh.s32	q0, [r1]
	vldrh.s32	q1, [r1, #8]
	vclz.i32	q0, q0
	vstrh.32	q0, [r0]
	vclz.i32	q0, q1
	vstrh.32	q0, [r0, #8]

2021-06-11  Christophe Lyon  <christophe.lyon@linaro.org>

	gcc/
	* config/arm/mve.md (mve_vec_unpack<US>_lo_<mode>): New pattern.
	(mve_vec_unpack<US>_hi_<mode>): New pattern.
	(@mve_vec_pack_trunc_lo_<mode>): New pattern.
	(mve_vmovntq_<supf><mode>): Prefix with '@'.
	* config/arm/neon.md (vec_unpack<US>_hi_<mode>): Move to
	vec-common.md.
	(vec_unpack<US>_lo_<mode>): Likewise.
	(vec_pack_trunc_<mode>): Rename to
	neon_quad_vec_pack_trunc_<mode>.
	* config/arm/vec-common.md (vec_unpack<US>_hi_<mode>): New
	pattern.
	(vec_unpack<US>_lo_<mode>): New.
	(vec_pack_trunc_<mode>): New.

	gcc/testsuite/
	* gcc.target/arm/simd/mve-vclz.c: Update expected results.
	* gcc.target/arm/simd/mve-vshl.c: Likewise.
	* gcc.target/arm/simd/mve-vec-pack.c: New test.
	* gcc.target/arm/simd/mve-vec-unpack.c: New test.
This commit is contained in:
parent
12d13cf50f
commit
046a3beb16
7 changed files with 168 additions and 43 deletions
|
@ -535,6 +535,26 @@
|
|||
[(set_attr "type" "mve_move")
|
||||
])
|
||||
|
||||
(define_insn "mve_vec_unpack<US>_lo_<mode>"
|
||||
[(set (match_operand:<V_unpack> 0 "register_operand" "=w")
|
||||
(SE:<V_unpack> (vec_select:<V_HALF>
|
||||
(match_operand:MVE_3 1 "register_operand" "w")
|
||||
(match_operand:MVE_3 2 "vect_par_constant_low" ""))))]
|
||||
"TARGET_HAVE_MVE"
|
||||
"vmovlb.<US>%#<V_sz_elem> %q0, %q1"
|
||||
[(set_attr "type" "mve_move")]
|
||||
)
|
||||
|
||||
;; Sign/zero-extend the high half of an MVE vector (vmovlt).
(define_insn "mve_vec_unpack<US>_hi_<mode>"
  [(set (match_operand:<V_unpack> 0 "register_operand" "=w")
	(SE:<V_unpack> (vec_select:<V_HALF>
			 (match_operand:MVE_3 1 "register_operand" "w")
			 (match_operand:MVE_3 2 "vect_par_constant_high" ""))))]
  "TARGET_HAVE_MVE"
  "vmovlt.<US>%#<V_sz_elem>	%q0, %q1"
  [(set_attr "type" "mve_move")]
)
|
||||
|
||||
;;
|
||||
;; [vcvtpq_s, vcvtpq_u])
|
||||
;;
|
||||
|
@ -2199,10 +2219,23 @@
|
|||
[(set_attr "type" "mve_move")
|
||||
])
|
||||
|
||||
;; vmovnb pattern used by the vec_pack_trunc expander to avoid the
;; need for an uninitialized input operand.
(define_insn "@mve_vec_pack_trunc_lo_<mode>"
  [
   (set (match_operand:<V_narrow_pack> 0 "s_register_operand" "=w")
	(unspec:<V_narrow_pack> [(match_operand:MVE_5 1 "s_register_operand" "w")]
	 VMOVNBQ_S))
  ]
  "TARGET_HAVE_MVE"
  "vmovnb.i%#<V_sz_elem>	%q0, %q1"
  [(set_attr "type" "mve_move")
])
|
||||
|
||||
;;
|
||||
;; [vmovntq_s, vmovntq_u])
|
||||
;;
|
||||
(define_insn "mve_vmovntq_<supf><mode>"
|
||||
(define_insn "@mve_vmovntq_<supf><mode>"
|
||||
[
|
||||
(set (match_operand:<V_narrow_pack> 0 "s_register_operand" "=w")
|
||||
(unspec:<V_narrow_pack> [(match_operand:<V_narrow_pack> 1 "s_register_operand" "0")
|
||||
|
|
|
@ -5924,43 +5924,6 @@ if (BYTES_BIG_ENDIAN)
|
|||
[(set_attr "type" "neon_shift_imm_long")]
|
||||
)
|
||||
|
||||
(define_expand "vec_unpack<US>_hi_<mode>"
|
||||
[(match_operand:<V_unpack> 0 "register_operand")
|
||||
(SE:<V_unpack> (match_operand:VU 1 "register_operand"))]
|
||||
"TARGET_NEON && !BYTES_BIG_ENDIAN"
|
||||
{
|
||||
rtvec v = rtvec_alloc (<V_mode_nunits>/2) ;
|
||||
rtx t1;
|
||||
int i;
|
||||
for (i = 0; i < (<V_mode_nunits>/2); i++)
|
||||
RTVEC_ELT (v, i) = GEN_INT ((<V_mode_nunits>/2) + i);
|
||||
|
||||
t1 = gen_rtx_PARALLEL (<MODE>mode, v);
|
||||
emit_insn (gen_neon_vec_unpack<US>_hi_<mode> (operands[0],
|
||||
operands[1],
|
||||
t1));
|
||||
DONE;
|
||||
}
|
||||
)
|
||||
|
||||
(define_expand "vec_unpack<US>_lo_<mode>"
|
||||
[(match_operand:<V_unpack> 0 "register_operand")
|
||||
(SE:<V_unpack> (match_operand:VU 1 "register_operand"))]
|
||||
"TARGET_NEON && !BYTES_BIG_ENDIAN"
|
||||
{
|
||||
rtvec v = rtvec_alloc (<V_mode_nunits>/2) ;
|
||||
rtx t1;
|
||||
int i;
|
||||
for (i = 0; i < (<V_mode_nunits>/2) ; i++)
|
||||
RTVEC_ELT (v, i) = GEN_INT (i);
|
||||
t1 = gen_rtx_PARALLEL (<MODE>mode, v);
|
||||
emit_insn (gen_neon_vec_unpack<US>_lo_<mode> (operands[0],
|
||||
operands[1],
|
||||
t1));
|
||||
DONE;
|
||||
}
|
||||
)
|
||||
|
||||
(define_insn "neon_vec_<US>mult_lo_<mode>"
|
||||
[(set (match_operand:<V_unpack> 0 "register_operand" "=w")
|
||||
(mult:<V_unpack> (SE:<V_unpack> (vec_select:<V_HALF>
|
||||
|
@ -6176,7 +6139,7 @@ if (BYTES_BIG_ENDIAN)
|
|||
; because the ordering of vector elements in Q registers is different from what
|
||||
; the semantics of the instructions require.
|
||||
|
||||
(define_insn "vec_pack_trunc_<mode>"
|
||||
(define_insn "neon_quad_vec_pack_trunc_<mode>"
|
||||
[(set (match_operand:<V_narrow_pack> 0 "register_operand" "=&w")
|
||||
(vec_concat:<V_narrow_pack>
|
||||
(truncate:<V_narrow>
|
||||
|
|
|
@ -632,3 +632,73 @@
|
|||
"ARM_HAVE_<MODE>_ARITH
|
||||
&& !TARGET_REALLY_IWMMXT"
|
||||
)
|
||||
|
||||
;; vmovl[tb] are not available for V4SI on MVE
|
||||
(define_expand "vec_unpack<US>_hi_<mode>"
|
||||
[(set (match_operand:<V_unpack> 0 "register_operand")
|
||||
(SE:<V_unpack> (vec_select:<V_HALF>
|
||||
(match_operand:VU 1 "register_operand")
|
||||
(match_dup 2))))]
|
||||
"ARM_HAVE_<MODE>_ARITH
|
||||
&& !TARGET_REALLY_IWMMXT
|
||||
&& ! (<MODE>mode == V4SImode && TARGET_HAVE_MVE)
|
||||
&& !BYTES_BIG_ENDIAN"
|
||||
{
|
||||
rtvec v = rtvec_alloc (<V_mode_nunits>/2);
|
||||
int i;
|
||||
for (i = 0; i < (<V_mode_nunits>/2); i++)
|
||||
RTVEC_ELT (v, i) = GEN_INT ((<V_mode_nunits>/2) + i);
|
||||
|
||||
operands[2] = gen_rtx_PARALLEL (<MODE>mode, v);
|
||||
}
|
||||
)
|
||||
|
||||
;; vmovl[tb] are not available for V4SI on MVE
|
||||
(define_expand "vec_unpack<US>_lo_<mode>"
|
||||
[(set (match_operand:<V_unpack> 0 "register_operand")
|
||||
(SE:<V_unpack> (vec_select:<V_HALF>
|
||||
(match_operand:VU 1 "register_operand")
|
||||
(match_dup 2))))]
|
||||
"ARM_HAVE_<MODE>_ARITH
|
||||
&& !TARGET_REALLY_IWMMXT
|
||||
&& ! (<MODE>mode == V4SImode && TARGET_HAVE_MVE)
|
||||
&& !BYTES_BIG_ENDIAN"
|
||||
{
|
||||
rtvec v = rtvec_alloc (<V_mode_nunits>/2);
|
||||
int i;
|
||||
for (i = 0; i < (<V_mode_nunits>/2) ; i++)
|
||||
RTVEC_ELT (v, i) = GEN_INT (i);
|
||||
|
||||
operands[2] = gen_rtx_PARALLEL (<MODE>mode, v);
|
||||
|
||||
}
|
||||
)
|
||||
|
||||
;; vmovn[tb] are not available for V2DI on MVE
(define_expand "vec_pack_trunc_<mode>"
 [(set (match_operand:<V_narrow_pack> 0 "register_operand")
       (vec_concat:<V_narrow_pack>
		(truncate:<V_narrow>
			(match_operand:VN 1 "register_operand"))
		(truncate:<V_narrow>
			(match_operand:VN 2 "register_operand"))))]
 "ARM_HAVE_<MODE>_ARITH
  && !TARGET_REALLY_IWMMXT
  && ! (<MODE>mode == V2DImode && TARGET_HAVE_MVE)
  && !BYTES_BIG_ENDIAN"
 {
   if (TARGET_NEON)
     {
       emit_insn (gen_neon_quad_vec_pack_trunc_<mode> (operands[0], operands[1],
						       operands[2]));
     }
   else
     {
       rtx tmpreg = gen_reg_rtx (<V_narrow_pack>mode);
       emit_insn (gen_mve_vec_pack_trunc_lo (<MODE>mode, tmpreg, operands[1]));
       emit_insn (gen_mve_vmovntq (VMOVNTQ_S, <MODE>mode,
				   operands[0], tmpreg, operands[2]));
     }
   DONE;
 }
)
|
||||
|
|
|
@ -21,8 +21,9 @@ FUNC(u, uint, 16, clz)
|
|||
FUNC(s, int, 8, clz)
|
||||
FUNC(u, uint, 8, clz)
|
||||
|
||||
/* 16 and 8-bit versions are not vectorized because they need pack/unpack
|
||||
patterns since __builtin_clz uses 32-bit parameter and return value. */
|
||||
/* { dg-final { scan-assembler-times {vclz\.i32 q[0-9]+, q[0-9]+} 2 } } */
|
||||
/* 16 and 8-bit versions still use 32-bit intermediate temporaries, so for
|
||||
instance instead of using vclz.i8, we need 4 vclz.i32, leading to a total of
|
||||
14 vclz.i32 expected in this testcase. */
|
||||
/* { dg-final { scan-assembler-times {vclz\.i32 q[0-9]+, q[0-9]+} 14 } } */
|
||||
/* { dg-final { scan-assembler-times {vclz\.i16 q[0-9]+, q[0-9]+} 2 { xfail *-*-* } } } */
|
||||
/* { dg-final { scan-assembler-times {vclz\.i8 q[0-9]+, q[0-9]+} 2 { xfail *-*-* } } } */
|
||||
|
|
26
gcc/testsuite/gcc.target/arm/simd/mve-vec-pack.c
Normal file
26
gcc/testsuite/gcc.target/arm/simd/mve-vec-pack.c
Normal file
|
@ -0,0 +1,26 @@
|
|||
/* { dg-do compile } */
|
||||
/* { dg-require-effective-target arm_v8_1m_mve_ok } */
|
||||
/* { dg-add-options arm_v8_1m_mve } */
|
||||
/* { dg-additional-options "-O3" } */
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#define FUNC(SIGN, TYPE, DSTBITS, BITS, NAME) \
|
||||
void test_ ## NAME ##_ ## SIGN ## BITS (TYPE##DSTBITS##_t * __restrict__ dest, \
|
||||
TYPE##BITS##_t *a) { \
|
||||
int i; \
|
||||
for (i=0; i < (256 / BITS); i++) { \
|
||||
dest[i] = a[i]; \
|
||||
} \
|
||||
}
|
||||
|
||||
FUNC(s, int, 16, 32, pack)
|
||||
FUNC(u, uint, 16, 32, pack)
|
||||
FUNC(s, int, 8, 16, pack)
|
||||
FUNC(u, uint, 8, 16, pack)
|
||||
|
||||
/* { dg-final { scan-assembler-times {vmovnt\.i32\tq[0-9]+, q[0-9]+} 2 } } */
|
||||
/* { dg-final { scan-assembler-times {vmovnb\.i32\tq[0-9]+, q[0-9]+} 2 } } */
|
||||
/* { dg-final { scan-assembler-times {vmovnt\.i16\tq[0-9]+, q[0-9]+} 2 } } */
|
||||
/* { dg-final { scan-assembler-times {vmovnb\.i16\tq[0-9]+, q[0-9]+} 2 } } */
|
||||
/* { dg-final { scan-assembler-not {vldr\.64\td[0-9]+, \.L} } } */
|
29
gcc/testsuite/gcc.target/arm/simd/mve-vec-unpack.c
Normal file
29
gcc/testsuite/gcc.target/arm/simd/mve-vec-unpack.c
Normal file
|
@ -0,0 +1,29 @@
|
|||
/* { dg-do compile } */
|
||||
/* { dg-require-effective-target arm_v8_1m_mve_ok } */
|
||||
/* { dg-add-options arm_v8_1m_mve } */
|
||||
/* { dg-additional-options "-O3" } */
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#define FUNC(SIGN, TYPE, DSTBITS, BITS, NAME) \
|
||||
void test_ ## NAME ##_ ## SIGN ## BITS (TYPE##DSTBITS##_t * __restrict__ dest, \
|
||||
TYPE##BITS##_t *a) { \
|
||||
int i; \
|
||||
for (i=0; i < (128 / BITS); i++) { \
|
||||
dest[i] = a[i]; \
|
||||
} \
|
||||
}
|
||||
|
||||
FUNC(s, int, 32, 16, unpack)
|
||||
FUNC(u, uint, 32, 16, unpack)
|
||||
FUNC(s, int, 16, 8, unpack)
|
||||
FUNC(u, uint, 16, 8, unpack)
|
||||
|
||||
/* { dg-final { scan-assembler-times {vmovlt\.s16 q[0-9]+, q[0-9]+} 1 } } */
|
||||
/* { dg-final { scan-assembler-times {vmovlb\.s16 q[0-9]+, q[0-9]+} 1 } } */
|
||||
/* { dg-final { scan-assembler-times {vmovlt\.u16 q[0-9]+, q[0-9]+} 1 } } */
|
||||
/* { dg-final { scan-assembler-times {vmovlb\.u16 q[0-9]+, q[0-9]+} 1 } } */
|
||||
/* { dg-final { scan-assembler-times {vmovlt\.s8 q[0-9]+, q[0-9]+} 1 } } */
|
||||
/* { dg-final { scan-assembler-times {vmovlb\.s8 q[0-9]+, q[0-9]+} 1 } } */
|
||||
/* { dg-final { scan-assembler-times {vmovlt\.u8 q[0-9]+, q[0-9]+} 1 } } */
|
||||
/* { dg-final { scan-assembler-times {vmovlb\.u8 q[0-9]+, q[0-9]+} 1 } } */
|
|
@ -56,7 +56,10 @@ FUNC_IMM(u, uint, 8, 16, <<, vshlimm)
|
|||
/* MVE has only 128-bit vectors, so we can vectorize only half of the
|
||||
functions above. */
|
||||
/* We only emit vshl.u, which is equivalent to vshl.s anyway. */
|
||||
/* { dg-final { scan-assembler-times {vshl.u[0-9]+\tq[0-9]+, q[0-9]+} 2 } } */
|
||||
/* 16 and 8-bit versions still use 32-bit intermediate temporaries, so for
|
||||
instance instead of using vshl.u8, we need 4 vshl.i32, leading to a total of
|
||||
14 vshl.i32 expected in this testcase. */
|
||||
/* { dg-final { scan-assembler-times {vshl.u[0-9]+\tq[0-9]+, q[0-9]+} 14 } } */
|
||||
|
||||
/* We emit vshl.i when the shift amount is an immediate. */
|
||||
/* { dg-final { scan-assembler-times {vshl.i[0-9]+\tq[0-9]+, q[0-9]+} 6 } } */
|
||||
|
|
Loading…
Add table
Reference in a new issue