aarch64: remove extra XTN in vector concatenation
GIMPLE code which performs a narrowing truncation on the result of a
vector concatenation currently results in an unnecessary XTN being
emitted following a UZP1 used to concatenate the operands. In such
cases, UZP1 should instead use a smaller arrangement specifier, making
the XTN instruction redundant. This can be seen in the following GIMPLE
example:

    int32x2_t foo (svint64_t a, svint64_t b)
    {
      vector(2) int vect__2.8;
      long int _1;
      long int _3;
      vector(2) long int _12;

      <bb 2> [local count: 1073741824]:
      _1 = svaddv_s64 ({ -1, 0, 0, 0, 0, 0, 0, 0, ... }, a_6(D));
      _3 = svaddv_s64 ({ -1, 0, 0, 0, 0, 0, 0, 0, ... }, b_7(D));
      _12 = {_1, _3};
      vect__2.8_13 = (vector(2) int) _12;
      return vect__2.8_13;
    }

Original assembly generated:

    bar:
        ptrue   p3.b, all
        uaddv   d0, p3, z0.d
        uaddv   d1, p3, z1.d
        uzp1    v0.2d, v0.2d, v1.2d
        xtn     v0.2s, v0.2d
        ret

This patch therefore defines the *aarch64_trunc_concat<mode> insn, which
truncates the result of the concatenation rather than concatenating the
truncated operands (as *aarch64_narrow_trunc<mode> does), so that the
following optimised assembly is emitted instead:

    bar:
        ptrue   p3.b, all
        uaddv   d0, p3, z0.d
        uaddv   d1, p3, z1.d
        uzp1    v0.2s, v0.2s, v1.2s
        ret

This patch passes all regression tests on aarch64 with no new failures.
A supporting test for this optimisation is also included and passes.

OK for master? I do not have commit rights, so I cannot push the patch
myself.

gcc/ChangeLog:

    * config/aarch64/aarch64-simd.md (*aarch64_trunc_concat<mode>): New
    insn definition.

gcc/testsuite/ChangeLog:

    * gcc.target/aarch64/sve/truncated_concatenation_1.c: New test for
    the above example and the other modes covered by the insn
    definition.
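For reference, the GIMPLE above corresponds to a source function along
the lines of the f3 function in the new test further down this page; a
minimal C sketch is shown below (the name bar merely mirrors the label
in the assembly dumps above and is not part of the patch):

    #include <arm_neon.h>
    #include <arm_sve.h>

    /* Two SVE reductions packed into the lanes of a 64-bit NEON vector.
       Before the patch this compiles to UZP1 (.2d) followed by XTN (.2s);
       with the patch a single UZP1 with the .2s arrangement remains.  */
    int32x2_t bar (svint64_t a, svint64_t b)
    {
        int32x2_t ab = vdup_n_s32 (0);
        ab = vset_lane_s32 ((int) svaddv_s64 (svptrue_b64 (), a), ab, 0);
        ab = vset_lane_s32 ((int) svaddv_s64 (svptrue_b64 (), b), ab, 1);
        return ab;
    }

Compiled with the options used by the new test (-O3 -march=armv8.2-a+sve),
this is the case shown in the before/after assembly above.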
This commit is contained in:
parent 7a7903dec5
commit 6069f02a48

2 changed files with 48 additions and 0 deletions
gcc/config/aarch64/aarch64-simd.md
@@ -1874,6 +1874,22 @@
   [(set_attr "type" "neon_permute<q>")]
 )
 
+(define_insn "*aarch64_trunc_concat<mode>"
+  [(set (match_operand:<VNARROWQ> 0 "register_operand" "=w")
+	(truncate:<VNARROWQ>
+	  (vec_concat:VQN
+	    (match_operand:<VHALF> 1 "register_operand" "w")
+	    (match_operand:<VHALF> 2 "register_operand" "w"))))]
+  "TARGET_SIMD"
+{
+  if (!BYTES_BIG_ENDIAN)
+    return "uzp1\\t%0.<Vntype>, %1.<Vntype>, %2.<Vntype>";
+  else
+    return "uzp1\\t%0.<Vntype>, %2.<Vntype>, %1.<Vntype>";
+}
+  [(set_attr "type" "neon_permute<q>")]
+)
+
 ;; Packing doubles.
 
 (define_expand "vec_pack_trunc_<mode>"
gcc/testsuite/gcc.target/aarch64/sve/truncated_concatenation_1.c (new file)
@@ -0,0 +1,32 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -Wall -march=armv8.2-a+sve" } */
+
+#include <arm_neon.h>
+#include <arm_sve.h>
+
+int8x8_t f1 (int16x4_t a, int16x4_t b) {
+    int8x8_t ab = vdup_n_s8 (0);
+    int16x8_t ab_concat = vcombine_s16 (a, b);
+    ab = vmovn_s16 (ab_concat);
+    return ab;
+}
+
+int16x4_t f2 (int32x2_t a, int32x2_t b) {
+    int16x4_t ab = vdup_n_s16 (0);
+    int32x4_t ab_concat = vcombine_s32 (a, b);
+    ab = vmovn_s32 (ab_concat);
+    return ab;
+}
+
+int32x2_t f3 (svint64_t a, svint64_t b) {
+    int32x2_t ab = vdup_n_s32 (0);
+    ab = vset_lane_s32 ((int)svaddv_s64 (svptrue_b64 (), a), ab, 0);
+    ab = vset_lane_s32 ((int)svaddv_s64 (svptrue_b64 (), b), ab, 1);
+    return ab;
+}
+
+/* { dg-final { scan-assembler-not {\txtn\t} } }*/
+/* { dg-final { scan-assembler-not {\tfcvtn\t} } }*/
+/* { dg-final { scan-assembler-times {\tuzp1\tv[0-9]+\.8b, v[0-9]+\.8b, v[0-9]+\.8b} 1 } }*/
+/* { dg-final { scan-assembler-times {\tuzp1\tv[0-9]+\.4h, v[0-9]+\.4h, v[0-9]+\.4h} 1 } }*/
+/* { dg-final { scan-assembler-times {\tuzp1\tv[0-9]+\.2s, v[0-9]+\.2s, v[0-9]+\.2s} 1 } }*/