[AArch64] Rewrite the vdup_lane intrinsics in C

gcc/
	* config/aarch64/aarch64-simd-builtins.def
	(dup_lane_scalar): Remove.
	* config/aarch64/aarch64-simd.md
	(aarch64_simd_dup): Add 'w->w' alternative.
	(aarch64_dup_lane<mode>): Allow for VALL.
	(aarch64_dup_lane_scalar<mode>): Remove.
	(aarch64_dup_lane_<vswap_width_name><mode>): New.
	(aarch64_get_lane_signed<mode>): Add w->w alternative.
	(aarch64_get_lane_unsigned<mode>): Likewise.
	(aarch64_get_lane<mode>): Likewise.
	* config/aarch64/aarch64.c (aarch64_evpc_dup): New.
	(aarch64_expand_vec_perm_const_1): Use aarch64_evpc_dup.
	* config/aarch64/iterators.md (VSWAP_WIDTH): New.
	(VCON): Change container of V2SF.
	(vswap_width_name): Likewise.
	* config/aarch64/arm_neon.h
	(__aarch64_vdup_lane_any): New.
	(__aarch64_vdup<q>_lane<q>_<fpsu><8,16,32,64>): Likewise.
	(vdup<q>_n_<psuf><8,16,32,64>): Convert to C implementation.
	(vdup<q>_lane<q>_<fpsu><8,16,32,64>): Likewise.

gcc/testsuite/
	* gcc.target/aarch64/scalar_intrinsics.c
	(vdup<bhsd>_lane<su><8,16,32,64>): Force values to SIMD registers.

From-SVN: r202180
This commit is contained in:
James Greenhalgh 2013-09-02 16:22:10 +00:00 committed by James Greenhalgh
parent d617d2d806
commit 91bd4114a7
7 changed files with 887 additions and 602 deletions

View file

@@ -1,3 +1,26 @@
2013-09-02 James Greenhalgh <james.greenhalgh@arm.com>
* config/aarch64/aarch64-simd-builtins.def
(dup_lane_scalar): Remove.
* config/aarch64/aarch64-simd.md
(aarch64_simd_dup): Add 'w->w' alternative.
(aarch64_dup_lane<mode>): Allow for VALL.
(aarch64_dup_lane_scalar<mode>): Remove.
(aarch64_dup_lane_<vswap_width_name><mode>): New.
(aarch64_get_lane_signed<mode>): Add w->w alternative.
(aarch64_get_lane_unsigned<mode>): Likewise.
(aarch64_get_lane<mode>): Likewise.
* config/aarch64/aarch64.c (aarch64_evpc_dup): New.
(aarch64_expand_vec_perm_const_1): Use aarch64_evpc_dup.
* config/aarch64/iterators.md (VSWAP_WIDTH): New.
(VCON): Change container of V2SF.
(vswap_width_name): Likewise.
* config/aarch64/arm_neon.h
(__aarch64_vdup_lane_any): New.
(__aarch64_vdup<q>_lane<q>_<fpsu><8,16,32,64>): Likewise.
(vdup<q>_n_<psuf><8,16,32,64>): Convert to C implementation.
(vdup<q>_lane<q>_<fpsu><8,16,32,64>): Likewise.
2013-09-02 Eric Botcazou <ebotcazou@adacore.com> 2013-09-02 Eric Botcazou <ebotcazou@adacore.com>
PR middle-end/56382 PR middle-end/56382

View file

@@ -336,24 +336,13 @@
}) })
(define_insn "aarch64_simd_dup<mode>" (define_insn "aarch64_simd_dup<mode>"
[(set (match_operand:VDQ 0 "register_operand" "=w") [(set (match_operand:VDQ 0 "register_operand" "=w, w")
(vec_duplicate:VDQ (match_operand:<VEL> 1 "register_operand" "r")))] (vec_duplicate:VDQ (match_operand:<VEL> 1 "register_operand" "r, w")))]
"TARGET_SIMD" "TARGET_SIMD"
"dup\\t%0.<Vtype>, %<vw>1" "@
[(set_attr "simd_type" "simd_dupgp") dup\\t%0.<Vtype>, %<vw>1
(set_attr "simd_mode" "<MODE>")] dup\\t%0.<Vtype>, %1.<Vetype>[0]"
) [(set_attr "simd_type" "simd_dupgp, simd_dup")
(define_insn "aarch64_dup_lane<mode>"
[(set (match_operand:VDQ_I 0 "register_operand" "=w")
(vec_duplicate:VDQ_I
(vec_select:<VEL>
(match_operand:<VCON> 1 "register_operand" "w")
(parallel [(match_operand:SI 2 "immediate_operand" "i")])
)))]
"TARGET_SIMD"
"dup\\t%<v>0<Vmtype>, %1.<Vetype>[%2]"
[(set_attr "simd_type" "simd_dup")
(set_attr "simd_mode" "<MODE>")] (set_attr "simd_mode" "<MODE>")]
) )
@@ -366,6 +355,32 @@
(set_attr "simd_mode" "<MODE>")] (set_attr "simd_mode" "<MODE>")]
) )
(define_insn "aarch64_dup_lane<mode>"
[(set (match_operand:VALL 0 "register_operand" "=w")
(vec_duplicate:VALL
(vec_select:<VEL>
(match_operand:VALL 1 "register_operand" "w")
(parallel [(match_operand:SI 2 "immediate_operand" "i")])
)))]
"TARGET_SIMD"
"dup\\t%0.<Vtype>, %1.<Vetype>[%2]"
[(set_attr "simd_type" "simd_dup")
(set_attr "simd_mode" "<MODE>")]
)
(define_insn "aarch64_dup_lane_<vswap_width_name><mode>"
[(set (match_operand:VALL 0 "register_operand" "=w")
(vec_duplicate:VALL
(vec_select:<VEL>
(match_operand:<VSWAP_WIDTH> 1 "register_operand" "w")
(parallel [(match_operand:SI 2 "immediate_operand" "i")])
)))]
"TARGET_SIMD"
"dup\\t%0.<Vtype>, %1.<Vetype>[%2]"
[(set_attr "simd_type" "simd_dup")
(set_attr "simd_mode" "<MODE>")]
)
(define_insn "*aarch64_simd_mov<mode>" (define_insn "*aarch64_simd_mov<mode>"
[(set (match_operand:VD 0 "aarch64_simd_nonimmediate_operand" [(set (match_operand:VD 0 "aarch64_simd_nonimmediate_operand"
"=w, Utv, w, ?r, ?w, ?r, w") "=w, Utv, w, ?r, ?w, ?r, w")

View file

@@ -7931,6 +7931,55 @@ aarch64_evpc_zip (struct expand_vec_perm_d *d)
return true; return true;
} }
static bool
aarch64_evpc_dup (struct expand_vec_perm_d *d)
{
rtx (*gen) (rtx, rtx, rtx);
rtx out = d->target;
rtx in0;
enum machine_mode vmode = d->vmode;
unsigned int i, elt, nelt = d->nelt;
rtx lane;
/* TODO: This may not be big-endian safe. */
if (BYTES_BIG_ENDIAN)
return false;
elt = d->perm[0];
for (i = 1; i < nelt; i++)
{
if (elt != d->perm[i])
return false;
}
/* The generic preparation in aarch64_expand_vec_perm_const_1
swaps the operand order and the permute indices if it finds
d->perm[0] to be in the second operand. Thus, we can always
use d->op0 and need not do any extra arithmetic to get the
correct lane number. */
in0 = d->op0;
lane = GEN_INT (elt);
switch (vmode)
{
case V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
case V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
case V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
case V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
case V4SImode: gen = gen_aarch64_dup_lanev4si; break;
case V2SImode: gen = gen_aarch64_dup_lanev2si; break;
case V2DImode: gen = gen_aarch64_dup_lanev2di; break;
case V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
case V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
case V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
default:
return false;
}
emit_insn (gen (out, in0, lane));
return true;
}
static bool static bool
aarch64_evpc_tbl (struct expand_vec_perm_d *d) aarch64_evpc_tbl (struct expand_vec_perm_d *d)
{ {
@@ -7988,6 +8037,8 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
return true; return true;
else if (aarch64_evpc_trn (d)) else if (aarch64_evpc_trn (d))
return true; return true;
else if (aarch64_evpc_dup (d))
return true;
return aarch64_evpc_tbl (d); return aarch64_evpc_tbl (d);
} }
return false; return false;

File diff suppressed because it is too large Load diff

View file

@@ -383,7 +383,7 @@
(V4HI "V8HI") (V8HI "V8HI") (V4HI "V8HI") (V8HI "V8HI")
(V2SI "V4SI") (V4SI "V4SI") (V2SI "V4SI") (V4SI "V4SI")
(DI "V2DI") (V2DI "V2DI") (DI "V2DI") (V2DI "V2DI")
(V2SF "V2SF") (V4SF "V4SF") (V2SF "V4SF") (V4SF "V4SF")
(V2DF "V2DF") (SI "V4SI") (V2DF "V2DF") (SI "V4SI")
(HI "V8HI") (QI "V16QI")]) (HI "V8HI") (QI "V16QI")])
@@ -527,6 +527,20 @@
(define_mode_attr fcvt_target [(V2DF "v2di") (V4SF "v4si") (V2SF "v2si")]) (define_mode_attr fcvt_target [(V2DF "v2di") (V4SF "v4si") (V2SF "v2si")])
(define_mode_attr FCVT_TARGET [(V2DF "V2DI") (V4SF "V4SI") (V2SF "V2SI")]) (define_mode_attr FCVT_TARGET [(V2DF "V2DI") (V4SF "V4SI") (V2SF "V2SI")])
(define_mode_attr VSWAP_WIDTH [(V8QI "V16QI") (V16QI "V8QI")
(V4HI "V8HI") (V8HI "V4HI")
(V2SI "V4SI") (V4SI "V2SI")
(DI "V2DI") (V2DI "DI")
(V2SF "V4SF") (V4SF "V2SF")
(DF "V2DF") (V2DF "DF")])
(define_mode_attr vswap_width_name [(V8QI "to_128") (V16QI "to_64")
(V4HI "to_128") (V8HI "to_64")
(V2SI "to_128") (V4SI "to_64")
(DI "to_128") (V2DI "to_64")
(V2SF "to_128") (V4SF "to_64")
(DF "to_128") (V2DF "to_64")])
;; ------------------------------------------------------------------- ;; -------------------------------------------------------------------
;; Code Iterators ;; Code Iterators
;; ------------------------------------------------------------------- ;; -------------------------------------------------------------------

View file

@@ -1,3 +1,8 @@
2013-09-02 James Greenhalgh <james.greenhalgh@arm.com>
* gcc.target/aarch64/scalar_intrinsics.c
(vdup<bhsd>_lane<su><8,16,32,64>): Force values to SIMD registers.
2013-09-02 Richard Biener <rguenther@suse.de> 2013-09-02 Richard Biener <rguenther@suse.de>
PR middle-end/57511 PR middle-end/57511

View file

@@ -198,13 +198,21 @@ test_vcltzd_s64 (int64x1_t a)
int8x1_t int8x1_t
test_vdupb_lane_s8 (int8x16_t a) test_vdupb_lane_s8 (int8x16_t a)
{ {
return vdupb_lane_s8 (a, 2); int8x1_t res;
force_simd (a);
res = vdupb_laneq_s8 (a, 2);
force_simd (res);
return res;
} }
uint8x1_t uint8x1_t
test_vdupb_lane_u8 (uint8x16_t a) test_vdupb_lane_u8 (uint8x16_t a)
{ {
return vdupb_lane_u8 (a, 2); uint8x1_t res;
force_simd (a);
res = vdupb_laneq_u8 (a, 2);
force_simd (res);
return res;
} }
/* { dg-final { scan-assembler-times "aarch64_get_lanev8hi" 2 } } */ /* { dg-final { scan-assembler-times "aarch64_get_lanev8hi" 2 } } */
@@ -212,13 +220,21 @@ test_vdupb_lane_u8 (uint8x16_t a)
int16x1_t int16x1_t
test_vduph_lane_s16 (int16x8_t a) test_vduph_lane_s16 (int16x8_t a)
{ {
return vduph_lane_s16 (a, 2); int16x1_t res;
force_simd (a);
res = vduph_laneq_s16 (a, 2);
force_simd (res);
return res;
} }
uint16x1_t uint16x1_t
test_vduph_lane_u16 (uint16x8_t a) test_vduph_lane_u16 (uint16x8_t a)
{ {
return vduph_lane_u16 (a, 2); uint16x1_t res;
force_simd (a);
res = vduph_laneq_u16 (a, 2);
force_simd (res);
return res;
} }
/* { dg-final { scan-assembler-times "aarch64_get_lanev4si" 2 } } */ /* { dg-final { scan-assembler-times "aarch64_get_lanev4si" 2 } } */
@@ -226,13 +242,21 @@ test_vduph_lane_u16 (uint16x8_t a)
int32x1_t int32x1_t
test_vdups_lane_s32 (int32x4_t a) test_vdups_lane_s32 (int32x4_t a)
{ {
return vdups_lane_s32 (a, 2); int32x1_t res;
force_simd (a);
res = vdups_laneq_s32 (a, 2);
force_simd (res);
return res;
} }
uint32x1_t uint32x1_t
test_vdups_lane_u32 (uint32x4_t a) test_vdups_lane_u32 (uint32x4_t a)
{ {
return vdups_lane_u32 (a, 2); uint32x1_t res;
force_simd (a);
res = vdups_laneq_u32 (a, 2);
force_simd (res);
return res;
} }
/* { dg-final { scan-assembler-times "aarch64_get_lanev2di" 2 } } */ /* { dg-final { scan-assembler-times "aarch64_get_lanev2di" 2 } } */
@@ -240,13 +264,21 @@ test_vdups_lane_u32 (uint32x4_t a)
int64x1_t int64x1_t
test_vdupd_lane_s64 (int64x2_t a) test_vdupd_lane_s64 (int64x2_t a)
{ {
return vdupd_lane_s64 (a, 1); int64x1_t res;
force_simd (a);
res = vdupd_laneq_s64 (a, 1);
force_simd (res);
return res;
} }
uint64x1_t uint64x1_t
test_vdupd_lane_u64 (uint64x2_t a) test_vdupd_lane_u64 (uint64x2_t a)
{ {
return vdupd_lane_u64 (a, 1); uint64x1_t res;
force_simd (a);
res = vdupd_laneq_u64 (a, 1);
force_simd (res);
return res;
} }
/* { dg-final { scan-assembler-times "\\tcmtst\\td\[0-9\]+, d\[0-9\]+, d\[0-9\]+" 2 } } */ /* { dg-final { scan-assembler-times "\\tcmtst\\td\[0-9\]+, d\[0-9\]+, d\[0-9\]+" 2 } } */