AArch64: Implement four and eight chunk VLA concats [PR118272]
The following testcase: #pragma GCC target ("+sve") extern char __attribute__ ((simd, const)) fn3 (int, short); void test_fn3 (float *a, float *b, double *c, int n) { for (int i = 0; i < n; ++i) a[i] = fn3 (b[i], c[i]); } at -Ofast ICEs because my previous patch only added support for combining 2 partial SVE vectors into a bigger vector. However, there can also be 4 and 8 piece subvectors. This patch fixes this by implementing the missing expansions. gcc/ChangeLog: PR target/96342 PR target/118272 * config/aarch64/aarch64-sve.md (vec_init<mode><Vquad>, vec_initvnx16qivnx2qi): New. * config/aarch64/aarch64.cc (aarch64_sve_expand_vector_init_subvector): Rewrite to support arbitrary combinations. * config/aarch64/iterators.md (SVE_NO2E): Update to use SVE_NO4E. (SVE_NO4E, Vquad): New. gcc/testsuite/ChangeLog: PR target/96342 PR target/118272 * gcc.target/aarch64/vect-simd-clone-3.c: New test.
This commit is contained in:
parent
f345ae3e6b
commit
830bead485
4 changed files with 93 additions and 11 deletions
|
@ -2839,6 +2839,7 @@
|
|||
}
|
||||
)
|
||||
|
||||
;; Vector constructor combining two half vectors { a, b }
|
||||
(define_expand "vec_init<mode><Vhalf>"
|
||||
[(match_operand:SVE_NO2E 0 "register_operand")
|
||||
(match_operand 1 "")]
|
||||
|
@ -2849,6 +2850,28 @@
|
|||
}
|
||||
)
|
||||
|
||||
;; Vector constructor combining four quarter-length sub-vectors { a, b, c, d }
;; into a single SVE_NO4E vector.  Operand 0 is the destination register and
;; operand 1 carries the sub-vectors; both are handed unchanged to
;; aarch64_sve_expand_vector_init_subvector, which emits the whole expansion.
|
||||
(define_expand "vec_init<mode><Vquad>"
|
||||
[(match_operand:SVE_NO4E 0 "register_operand")
|
||||
(match_operand 1 "")]
|
||||
"TARGET_SVE"
|
||||
{
|
||||
aarch64_sve_expand_vector_init_subvector (operands[0], operands[1]);
|
||||
DONE;
|
||||
}
|
||||
)
|
||||
|
||||
;; Vector constructor combining eight vectors { a, b, c, d, ... }
;; Only the VNx16QI-from-VNx2QI case is provided, so the pattern name is
;; written out in full instead of being built from a mode iterator.
;; Operand 0 is the destination register and operand 1 carries the
;; sub-vectors; the expansion itself is done by
;; aarch64_sve_expand_vector_init_subvector.
|
||||
(define_expand "vec_initvnx16qivnx2qi"
|
||||
[(match_operand:VNx16QI 0 "register_operand")
|
||||
(match_operand 1 "")]
|
||||
"TARGET_SVE"
|
||||
{
|
||||
aarch64_sve_expand_vector_init_subvector (operands[0], operands[1]);
|
||||
DONE;
|
||||
}
|
||||
)
|
||||
|
||||
;; Shift an SVE vector left and insert a scalar into element 0.
|
||||
(define_insn "vec_shl_insert_<mode>"
|
||||
[(set (match_operand:SVE_FULL 0 "register_operand")
|
||||
|
|
|
@ -24879,18 +24879,42 @@ aarch64_sve_expand_vector_init_subvector (rtx target, rtx vals)
|
|||
machine_mode mode = GET_MODE (target);
|
||||
int nelts = XVECLEN (vals, 0);
|
||||
|
||||
gcc_assert (nelts == 2);
|
||||
gcc_assert (nelts % 2 == 0);
|
||||
|
||||
rtx arg0 = XVECEXP (vals, 0, 0);
|
||||
rtx arg1 = XVECEXP (vals, 0, 1);
|
||||
|
||||
/* If we have two elements and are concatting vector. */
|
||||
machine_mode elem_mode = GET_MODE (arg0);
|
||||
/* We have to be concatting vector. */
|
||||
machine_mode elem_mode = GET_MODE (XVECEXP (vals, 0, 0));
|
||||
gcc_assert (VECTOR_MODE_P (elem_mode));
|
||||
|
||||
arg0 = force_reg (elem_mode, arg0);
|
||||
arg1 = force_reg (elem_mode, arg1);
|
||||
emit_insn (gen_aarch64_pack_partial (mode, target, arg0, arg1));
|
||||
auto_vec<rtx> worklist;
|
||||
machine_mode wider_mode = elem_mode;
|
||||
|
||||
for (int i = 0; i < nelts; i++)
|
||||
worklist.safe_push (force_reg (elem_mode, XVECEXP (vals, 0, i)));
|
||||
|
||||
/* Keep widening pairwise to have maximum throughput. */
|
||||
while (nelts >= 2)
|
||||
{
|
||||
wider_mode
|
||||
= related_vector_mode (wider_mode, GET_MODE_INNER (wider_mode),
|
||||
GET_MODE_NUNITS (wider_mode) * 2).require ();
|
||||
|
||||
for (int i = 0; i < nelts; i += 2)
|
||||
{
|
||||
rtx arg0 = worklist[i];
|
||||
rtx arg1 = worklist[i+1];
|
||||
gcc_assert (GET_MODE (arg0) == GET_MODE (arg1));
|
||||
|
||||
rtx tmp = gen_reg_rtx (wider_mode);
|
||||
emit_insn (gen_aarch64_pack_partial (wider_mode, tmp, arg0, arg1));
|
||||
worklist[i / 2] = tmp;
|
||||
}
|
||||
|
||||
nelts /= 2;
|
||||
}
|
||||
|
||||
gcc_assert (wider_mode == mode);
|
||||
emit_move_insn (target, worklist[0]);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
|
@ -140,9 +140,12 @@
|
|||
;; VQ without 2 element modes.
|
||||
(define_mode_iterator VQ_NO2E [V16QI V8HI V4SI V8HF V4SF V8BF])
|
||||
|
||||
;; SVE modes without 2 and 4 element modes.
|
||||
(define_mode_iterator SVE_NO4E [VNx16QI VNx8QI VNx8HI VNx8HF VNx8BF])
|
||||
|
||||
;; SVE modes without 2 element modes.
|
||||
(define_mode_iterator SVE_NO2E [VNx16QI VNx8QI VNx4QI VNx8HI VNx4HI VNx8HF
|
||||
VNx4HF VNx8BF VNx4BF VNx4SI VNx4SF])
|
||||
(define_mode_iterator SVE_NO2E [SVE_NO4E VNx4QI VNx4HI VNx4HF VNx4BF VNx4SI
|
||||
VNx4SF])
|
||||
|
||||
;; 2 element quad vector modes.
|
||||
(define_mode_iterator VQ_2E [V2DI V2DF])
|
||||
|
@ -1764,6 +1767,11 @@
|
|||
(VNx8BF "vnx4bf") (VNx4BF "vnx2bf")
|
||||
(VNx4SI "vnx2si") (VNx4SF "vnx2sf")])
|
||||
|
||||
;; Lower-case name of the mode with a quarter as many elements, for each of
;; the listed SVE modes (e.g. VNx16QI -> vnx4qi).  "Quad" here refers to the
;; four pieces that make up the full vector, not to 128-bit quad vectors.
|
||||
(define_mode_attr Vquad [(VNx16QI "vnx4qi") (VNx8QI "vnx2qi")
|
||||
(VNx8HI "vnx2hi") (VNx8HF "vnx2hf")
|
||||
(VNx8BF "vnx2bf")])
|
||||
|
||||
;; Single-element half modes of quad vector modes.
|
||||
(define_mode_attr V1HALF [(V2DI "V1DI") (V2DF "V1DF")])
|
||||
|
||||
|
|
27
gcc/testsuite/gcc.target/aarch64/vect-simd-clone-3.c
Normal file
27
gcc/testsuite/gcc.target/aarch64/vect-simd-clone-3.c
Normal file
|
@ -0,0 +1,27 @@
|
|||
/* { dg-do compile } */
|
||||
/* { dg-options "-std=c99" } */
|
||||
/* { dg-additional-options "-O3 -march=armv8-a" } */
|
||||
|
||||
#pragma GCC target ("+sve")
|
||||
extern char __attribute__ ((simd, const)) fn3 (int, short);
|
||||
/* For each i in [0, n), call the simd-attributed fn3 (int, short) with b[i]
   (float) and c[i] (double) and store its char result into a[i] (float).
   The element types on the call therefore differ in size: double and float
   inputs, int and short parameters, char return value.  */
void test_fn3 (float *a, float *b, double *c, int n)
|
||||
{
|
||||
for (int i = 0; i < n; ++i)
|
||||
a[i] = fn3 (b[i], c[i]);
|
||||
}
|
||||
|
||||
/* { dg-final { scan-assembler {\s+_ZGVsMxvv_fn3\n} } } */
|
||||
|
||||
extern char __attribute__ ((simd, const)) fn4 (int, char);
|
||||
/* Same loop shape as test_fn3, but the callee fn4 (int, char) takes a char
   as its second parameter, so c[i] (double) is converted to char on the
   call.  The char result is stored into a[i] (float).  */
void test_fn4 (float *a, float *b, double *c, int n)
|
||||
{
|
||||
for (int i = 0; i < n; ++i)
|
||||
a[i] = fn4 (b[i], c[i]);
|
||||
}
|
||||
|
||||
/* { dg-final { scan-assembler {\s+_ZGVsMxvv_fn4\n} } } */
|
||||
|
||||
/* { dg-final { scan-assembler-times {\s+uzp1\tz[0-9]+\.b, z[0-9]+\.b, z[0-9]+\.b\n} 6 } } */
|
||||
/* { dg-final { scan-assembler-times {\s+uzp1\tz[0-9]+\.h, z[0-9]+\.h, z[0-9]+\.h\n} 16 } } */
|
||||
/* { dg-final { scan-assembler-times {\s+uzp1\tz[0-9]+\.s, z[0-9]+\.s, z[0-9]+\.s\n} 24 } } */
|
||||
|
Loading…
Add table
Reference in a new issue