aarch64: Fold vget_low_* intrinsics to BIT_FIELD_REF [PR102171]
This patch folds vget_low_* intrinsics to BIT_FIELD_REF to open up more
optimization opportunities for gimple optimizers.  While we are here, we also
remove the vget_low_* definitions from arm_neon.h and use the new intrinsics
framework instead.

	PR target/102171

gcc/ChangeLog:

	* config/aarch64/aarch64-builtins.cc (AARCH64_SIMD_VGET_LOW_BUILTINS):
	New macro to create definitions for all vget_low intrinsics.
	(VGET_LOW_BUILTIN): Likewise.
	(enum aarch64_builtins): Add vget_low function codes.
	(aarch64_general_fold_builtin): Fold vget_low calls.
	* config/aarch64/aarch64-simd-builtins.def: Delete vget_low builtins.
	* config/aarch64/aarch64-simd.md (aarch64_get_low<mode>): Delete.
	(aarch64_vget_lo_halfv8bf): Likewise.
	* config/aarch64/arm_neon.h (__attribute__): Delete.
	(vget_low_f16): Likewise.
	(vget_low_f32): Likewise.
	(vget_low_f64): Likewise.
	(vget_low_p8): Likewise.
	(vget_low_p16): Likewise.
	(vget_low_p64): Likewise.
	(vget_low_s8): Likewise.
	(vget_low_s16): Likewise.
	(vget_low_s32): Likewise.
	(vget_low_s64): Likewise.
	(vget_low_u8): Likewise.
	(vget_low_u16): Likewise.
	(vget_low_u32): Likewise.
	(vget_low_u64): Likewise.
	(vget_low_bf16): Likewise.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/pr113573.c: Replace __builtin_aarch64_get_lowv8hi
	with vget_low_s16.
	* gcc.target/aarch64/vget_low_2.c: New test.
	* gcc.target/aarch64/vget_low_2_be.c: New test.

Signed-off-by: Pengxuan Zheng <quic_pzheng@quicinc.com>
This commit is contained in: parent e14c673ea9 · commit a2e4fe5a53
7 changed files with 124 additions and 132 deletions
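As a sketch of the effect (mirroring the new tests; the function name below is
hypothetical), a vget_low_* call is now exposed to the middle end as a
BIT_FIELD_REF instead of surviving as an opaque builtin call until expand:

    #include <arm_neon.h>

    int16x4_t
    low_half (int16x8_t vec)
    {
      /* Compiled with -O3 -fdump-tree-optimized, the optimized dump now
	 contains BIT_FIELD_REF <vec_1(D), 64, 0> (offset 64 on big-endian)
	 rather than a call to __builtin_aarch64_get_lowv8hi, so gimple
	 passes can CSE, constant-fold and combine the extraction.  */
      return vget_low_s16 (vec);
    }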
gcc/config/aarch64/aarch64-builtins.cc

@@ -658,6 +658,23 @@ static aarch64_simd_builtin_datum aarch64_simd_builtin_data[] = {
   VREINTERPRET_BUILTINS \
   VREINTERPRETQ_BUILTINS
 
+#define AARCH64_SIMD_VGET_LOW_BUILTINS \
+  VGET_LOW_BUILTIN(f16) \
+  VGET_LOW_BUILTIN(f32) \
+  VGET_LOW_BUILTIN(f64) \
+  VGET_LOW_BUILTIN(p8) \
+  VGET_LOW_BUILTIN(p16) \
+  VGET_LOW_BUILTIN(p64) \
+  VGET_LOW_BUILTIN(s8) \
+  VGET_LOW_BUILTIN(s16) \
+  VGET_LOW_BUILTIN(s32) \
+  VGET_LOW_BUILTIN(s64) \
+  VGET_LOW_BUILTIN(u8) \
+  VGET_LOW_BUILTIN(u16) \
+  VGET_LOW_BUILTIN(u32) \
+  VGET_LOW_BUILTIN(u64) \
+  VGET_LOW_BUILTIN(bf16)
+
 typedef struct
 {
   const char *name;

@@ -697,6 +714,9 @@ typedef struct
 #define VREINTERPRET_BUILTIN(A, B, L) \
   AARCH64_SIMD_BUILTIN_VREINTERPRET##L##_##A##_##B,
 
+#define VGET_LOW_BUILTIN(A) \
+  AARCH64_SIMD_BUILTIN_VGET_LOW_##A,
+
 #undef VAR1
 #define VAR1(T, N, MAP, FLAG, A) \
   AARCH64_SIMD_BUILTIN_##T##_##N##A,

@@ -732,6 +752,7 @@ enum aarch64_builtins
   AARCH64_CRC32_BUILTIN_MAX,
   /* SIMD intrinsic builtins. */
   AARCH64_SIMD_VREINTERPRET_BUILTINS
+  AARCH64_SIMD_VGET_LOW_BUILTINS
   /* ARMv8.3-A Pointer Authentication Builtins. */
   AARCH64_PAUTH_BUILTIN_AUTIA1716,
   AARCH64_PAUTH_BUILTIN_PACIA1716,

@@ -823,8 +844,37 @@ static aarch64_fcmla_laneq_builtin_datum aarch64_fcmla_lane_builtin_data[] = {
     && SIMD_INTR_QUAL(A) == SIMD_INTR_QUAL(B) \
   },
 
+#undef VGET_LOW_BUILTIN
+#define VGET_LOW_BUILTIN(A) \
+  {"vget_low_" #A, \
+   AARCH64_SIMD_BUILTIN_VGET_LOW_##A, \
+   2, \
+   { SIMD_INTR_MODE(A, d), SIMD_INTR_MODE(A, q) }, \
+   { SIMD_INTR_QUAL(A), SIMD_INTR_QUAL(A) }, \
+   FLAG_AUTO_FP, \
+   false \
+  },
+
+#define AARCH64_SIMD_VGET_LOW_BUILTINS \
+  VGET_LOW_BUILTIN(f16) \
+  VGET_LOW_BUILTIN(f32) \
+  VGET_LOW_BUILTIN(f64) \
+  VGET_LOW_BUILTIN(p8) \
+  VGET_LOW_BUILTIN(p16) \
+  VGET_LOW_BUILTIN(p64) \
+  VGET_LOW_BUILTIN(s8) \
+  VGET_LOW_BUILTIN(s16) \
+  VGET_LOW_BUILTIN(s32) \
+  VGET_LOW_BUILTIN(s64) \
+  VGET_LOW_BUILTIN(u8) \
+  VGET_LOW_BUILTIN(u16) \
+  VGET_LOW_BUILTIN(u32) \
+  VGET_LOW_BUILTIN(u64) \
+  VGET_LOW_BUILTIN(bf16)
+
 static const aarch64_simd_intrinsic_datum aarch64_simd_intrinsic_data[] = {
   AARCH64_SIMD_VREINTERPRET_BUILTINS
+  AARCH64_SIMD_VGET_LOW_BUILTINS
 };
 

@@ -3216,6 +3266,9 @@ aarch64_fold_builtin_lane_check (tree arg0, tree arg1, tree arg2)
 #define VREINTERPRET_BUILTIN(A, B, L) \
   case AARCH64_SIMD_BUILTIN_VREINTERPRET##L##_##A##_##B:
 
+#undef VGET_LOW_BUILTIN
+#define VGET_LOW_BUILTIN(A) \
+  case AARCH64_SIMD_BUILTIN_VGET_LOW_##A:
 
 /* Try to fold a call to the built-in function with subcode FCODE.  The
    function is passed the N_ARGS arguments in ARGS and it returns a value

@@ -3235,6 +3288,13 @@ aarch64_general_fold_builtin (unsigned int fcode, tree type,
       return fold_build1 (FLOAT_EXPR, type, args[0]);
     AARCH64_SIMD_VREINTERPRET_BUILTINS
       return fold_build1 (VIEW_CONVERT_EXPR, type, args[0]);
+    AARCH64_SIMD_VGET_LOW_BUILTINS
+      {
+	auto pos = BYTES_BIG_ENDIAN ? 64 : 0;
+
+	return fold_build3 (BIT_FIELD_REF, type, args[0], bitsize_int (64),
+			    bitsize_int (pos));
+      }
     case AARCH64_SIMD_BUILTIN_LANE_CHECK:
      gcc_assert (n_args == 3);
      if (aarch64_fold_builtin_lane_check (args[0], args[1], args[2]))
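For reference on the offsets chosen in the fold above: BIT_FIELD_REF <x, size,
pos> reads SIZE bits starting at bit POS of x in memory order, so the
architectural low half of a 128-bit vector sits at bit 0 on little-endian and
bit 64 on big-endian, hence the BYTES_BIG_ENDIAN test.  A little-endian sanity
check of that layout (a runnable sketch, not part of the patch):

    #include <arm_neon.h>
    #include <assert.h>
    #include <string.h>

    int
    main (void)
    {
      int16x8_t v = { 0, 1, 2, 3, 4, 5, 6, 7 };
      int16x4_t lo = vget_low_s16 (v);
      /* On little-endian, the low half is the first 8 bytes of V,
	 i.e. BIT_FIELD_REF <v, 64, 0>.  */
      assert (memcmp (&v, &lo, sizeof lo) == 0);
      return 0;
    }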
gcc/config/aarch64/aarch64-simd-builtins.def

@@ -65,8 +65,6 @@
   BUILTIN_VS (UNOP, ctz, 2, NONE)
   BUILTIN_VB (UNOP, popcount, 2, NONE)
 
-  /* Implemented by aarch64_get_low<mode>. */
-  BUILTIN_VQMOV (UNOP, get_low, 0, AUTO_FP)
   /* Implemented by aarch64_get_high<mode>. */
   BUILTIN_VQMOV (UNOP, get_high, 0, AUTO_FP)

@@ -960,8 +958,7 @@
   VAR1 (QUADOP_LANE, bfmlalb_lane_q, 0, FP, v4sf)
   VAR1 (QUADOP_LANE, bfmlalt_lane_q, 0, FP, v4sf)
 
-  /* Implemented by aarch64_vget_lo/hi_halfv8bf. */
-  VAR1 (UNOP, vget_lo_half, 0, AUTO_FP, v8bf)
+  /* Implemented by aarch64_vget_hi_halfv8bf. */
   VAR1 (UNOP, vget_hi_half, 0, AUTO_FP, v8bf)
 
   /* Implemented by aarch64_simd_<sur>mmlav16qi. */
gcc/config/aarch64/aarch64-simd.md

@@ -288,17 +288,6 @@
   }
 )
 
-(define_expand "aarch64_get_low<mode>"
-  [(match_operand:<VHALF> 0 "register_operand")
-   (match_operand:VQMOV 1 "register_operand")]
-  "TARGET_FLOAT"
-  {
-    rtx lo = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, false);
-    emit_insn (gen_aarch64_get_half<mode> (operands[0], operands[1], lo));
-    DONE;
-  }
-)
-
 (define_expand "aarch64_get_high<mode>"
   [(match_operand:<VHALF> 0 "register_operand")
    (match_operand:VQMOV 1 "register_operand")]

@@ -9774,17 +9763,7 @@
   [(set_attr "type" "neon_dot<VDQSF:q>")]
 )
 
-;; vget_low/high_bf16
-(define_expand "aarch64_vget_lo_halfv8bf"
-  [(match_operand:V4BF 0 "register_operand")
-   (match_operand:V8BF 1 "register_operand")]
-  "TARGET_BF16_SIMD"
-{
-  rtx p = aarch64_simd_vect_par_cnst_half (V8BFmode, 8, false);
-  emit_insn (gen_aarch64_get_halfv8bf (operands[0], operands[1], p));
-  DONE;
-})
-
+;; vget_high_bf16
 (define_expand "aarch64_vget_hi_halfv8bf"
   [(match_operand:V4BF 0 "register_operand")
    (match_operand:V8BF 1 "register_operand")]
gcc/config/aarch64/arm_neon.h

@@ -3027,104 +3027,6 @@ vsetq_lane_u64 (uint64_t __elem, uint64x2_t __vec, const int __index)
   return __aarch64_vset_lane_any (__elem, __vec, __index);
 }
 
-__extension__ extern __inline float16x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vget_low_f16 (float16x8_t __a)
-{
-  return __builtin_aarch64_get_lowv8hf (__a);
-}
-
-__extension__ extern __inline float32x2_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vget_low_f32 (float32x4_t __a)
-{
-  return __builtin_aarch64_get_lowv4sf (__a);
-}
-
-__extension__ extern __inline float64x1_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vget_low_f64 (float64x2_t __a)
-{
-  return (float64x1_t) {__builtin_aarch64_get_lowv2df (__a)};
-}
-
-__extension__ extern __inline poly8x8_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vget_low_p8 (poly8x16_t __a)
-{
-  return (poly8x8_t) __builtin_aarch64_get_lowv16qi ((int8x16_t) __a);
-}
-
-__extension__ extern __inline poly16x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vget_low_p16 (poly16x8_t __a)
-{
-  return (poly16x4_t) __builtin_aarch64_get_lowv8hi ((int16x8_t) __a);
-}
-
-__extension__ extern __inline poly64x1_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vget_low_p64 (poly64x2_t __a)
-{
-  return (poly64x1_t) __builtin_aarch64_get_lowv2di ((int64x2_t) __a);
-}
-
-__extension__ extern __inline int8x8_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vget_low_s8 (int8x16_t __a)
-{
-  return __builtin_aarch64_get_lowv16qi (__a);
-}
-
-__extension__ extern __inline int16x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vget_low_s16 (int16x8_t __a)
-{
-  return __builtin_aarch64_get_lowv8hi (__a);
-}
-
-__extension__ extern __inline int32x2_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vget_low_s32 (int32x4_t __a)
-{
-  return __builtin_aarch64_get_lowv4si (__a);
-}
-
-__extension__ extern __inline int64x1_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vget_low_s64 (int64x2_t __a)
-{
-  return (int64x1_t) {__builtin_aarch64_get_lowv2di (__a)};
-}
-
-__extension__ extern __inline uint8x8_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vget_low_u8 (uint8x16_t __a)
-{
-  return (uint8x8_t) __builtin_aarch64_get_lowv16qi ((int8x16_t) __a);
-}
-
-__extension__ extern __inline uint16x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vget_low_u16 (uint16x8_t __a)
-{
-  return (uint16x4_t) __builtin_aarch64_get_lowv8hi ((int16x8_t) __a);
-}
-
-__extension__ extern __inline uint32x2_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vget_low_u32 (uint32x4_t __a)
-{
-  return (uint32x2_t) __builtin_aarch64_get_lowv4si ((int32x4_t) __a);
-}
-
-__extension__ extern __inline uint64x1_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vget_low_u64 (uint64x2_t __a)
-{
-  return (uint64x1_t) {__builtin_aarch64_get_lowv2di ((int64x2_t) __a)};
-}
-
 __extension__ extern __inline float16x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vget_high_f16 (float16x8_t __a)

@@ -28479,13 +28381,6 @@ vbfmlaltq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b,
   return __builtin_aarch64_bfmlalt_lane_qv4sf (__r, __a, __b, __index);
 }
 
-__extension__ extern __inline bfloat16x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vget_low_bf16 (bfloat16x8_t __a)
-{
-  return __builtin_aarch64_vget_lo_halfv8bf (__a);
-}
-
 __extension__ extern __inline bfloat16x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vget_high_bf16 (bfloat16x8_t __a)
gcc/testsuite/gcc.target/aarch64/pr113573.c

@@ -26,7 +26,7 @@ void jsimd_extbgrx_ycc_convert_neon() {
   int y_l = vmull_laneq_u16(r);
   uint16x8_t __a = g;
   jsimd_extbgrx_ycc_convert_neon___trans_tmp_2 =
-      (uint16x4_t)__builtin_aarch64_get_lowv8hi((int16x8_t)__a);
+      (uint16x4_t)vget_low_s16((int16x8_t)__a);
   __a = b;
   int cb_l = scaled_128_5;
   int cb_h = scaled_128_5;
gcc/testsuite/gcc.target/aarch64/vget_low_2.c (new file, 30 lines)

@@ -0,0 +1,30 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fdump-tree-optimized" } */
+
+#include <arm_neon.h>
+
+#define VARIANTS \
+  VARIANT (uint8x8_t, uint8x16_t, u8) \
+  VARIANT (uint16x4_t, uint16x8_t, u16) \
+  VARIANT (uint32x2_t, uint32x4_t, u32) \
+  VARIANT (uint64x1_t, uint64x2_t, u64) \
+  VARIANT (int8x8_t, int8x16_t, s8) \
+  VARIANT (int16x4_t, int16x8_t, s16) \
+  VARIANT (int32x2_t, int32x4_t, s32) \
+  VARIANT (int64x1_t, int64x2_t, s64) \
+  VARIANT (float16x4_t, float16x8_t, f16) \
+  VARIANT (float32x2_t, float32x4_t, f32) \
+  VARIANT (float64x1_t, float64x2_t, f64) \
+  VARIANT (bfloat16x4_t, bfloat16x8_t, bf16)
+
+/* vget_low_* intrinsics should become BIT_FIELD_REF. */
+#define VARIANT(TYPE64, TYPE128, SUFFIX) \
+  TYPE64 \
+  test_vget_low_##SUFFIX (TYPE128 vec) \
+  { \
+    return vget_low_##SUFFIX (vec); \
+  }
+
+VARIANTS
+
+/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF <vec_\[0-9\]*\\\(D\\\), 64, 0>" 12 "optimized" } } */
gcc/testsuite/gcc.target/aarch64/vget_low_2_be.c (new file, 31 lines)

@@ -0,0 +1,31 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target stdint_types_mbig_endian } */
+/* { dg-options "-O3 -fdump-tree-optimized -mbig-endian" } */
+
+#include <arm_neon.h>
+
+#define VARIANTS \
+  VARIANT (uint8x8_t, uint8x16_t, u8) \
+  VARIANT (uint16x4_t, uint16x8_t, u16) \
+  VARIANT (uint32x2_t, uint32x4_t, u32) \
+  VARIANT (uint64x1_t, uint64x2_t, u64) \
+  VARIANT (int8x8_t, int8x16_t, s8) \
+  VARIANT (int16x4_t, int16x8_t, s16) \
+  VARIANT (int32x2_t, int32x4_t, s32) \
+  VARIANT (int64x1_t, int64x2_t, s64) \
+  VARIANT (float16x4_t, float16x8_t, f16) \
+  VARIANT (float32x2_t, float32x4_t, f32) \
+  VARIANT (float64x1_t, float64x2_t, f64) \
+  VARIANT (bfloat16x4_t, bfloat16x8_t, bf16)
+
+/* vget_low_* intrinsics should become BIT_FIELD_REF. */
+#define VARIANT(TYPE64, TYPE128, SUFFIX) \
+  TYPE64 \
+  test_vget_low_##SUFFIX (TYPE128 vec) \
+  { \
+    return vget_low_##SUFFIX (vec); \
+  }
+
+VARIANTS
+
+/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF <vec_\[0-9\]*\\\(D\\\), 64, 64>" 12 "optimized" } } */