[AARCH64] Implement Vector Permute Support.

gcc/ * config/aarch64/aarch64-protos.h (aarch64_split_combinev16qi): New. (aarch64_expand_vec_perm): Likewise. (aarch64_expand_vec_perm_const): Likewise. * config/aarch64/aarch64-simd.md (vec_perm_const<mode>): New. (vec_perm<mode>): Likewise. (aarch64_tbl1<mode>): Likewise. (aarch64_tbl2v16qi): Likewise. (aarch64_combinev16qi): New. * config/aarch64/aarch64.c (aarch64_vectorize_vec_perm_const_ok): New. (aarch64_split_combinev16qi): Likewise. (MAX_VECT_LEN): Define. (expand_vec_perm_d): New. (aarch64_expand_vec_perm_1): Likewise. (aarch64_expand_vec_perm): Likewise. (aarch64_evpc_tbl): Likewise. (aarch64_expand_vec_perm_const_1): Likewise. (aarch64_expand_vec_perm_const): Likewise. (aarch64_vectorize_vec_perm_const_ok): Likewise. (TARGET_VECTORIZE_VEC_PERM_CONST_OK): Likewise. * config/aarch64/iterators.md (unspec): Add UNSPEC_TBL, UNSPEC_CONCAT. (V_cmp_result): Add mapping for V2DF. gcc/testsuite/ * lib/target-supports.exp (check_effective_target_vect_perm): Allow aarch64*-*-*. (check_effective_target_vect_perm_byte): Likewise. (check_effective_target_vect_perm_short): Likewise. (check_effective_target_vect_char_mult): Likewise. (check_effective_target_vect_extract_even_odd): Likewise. (check_effective_target_vect_interleave): Likewise. From-SVN: r194218
2012-12-05 11:36:00 +00:00 · 2012-12-05 11:36:00 +00:00 · 88b080739a
commit 88b080739a
parent 246ff1aef6
7 changed files with 420 additions and 6 deletions
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@ -1,3 +1,30 @@
+2012-12-05  James Greenhalgh  <james.greenhalgh@arm.com>
+
+	* config/aarch64/aarch64-protos.h
+	(aarch64_split_combinev16qi): New.
+	(aarch64_expand_vec_perm): Likewise.
+	(aarch64_expand_vec_perm_const): Likewise.
+	* config/aarch64/aarch64-simd.md (vec_perm_const<mode>): New.
+	(vec_perm<mode>): Likewise.
+	(aarch64_tbl1<mode>): Likewise.
+	(aarch64_tbl2v16qi): Likewise.
+	(aarch64_combinev16qi): New.
+	* config/aarch64/aarch64.c
+	(aarch64_vectorize_vec_perm_const_ok): New.
+	(aarch64_split_combinev16qi): Likewise.
+	(MAX_VECT_LEN): Define.
+	(expand_vec_perm_d): New.
+	(aarch64_expand_vec_perm_1): Likewise.
+	(aarch64_expand_vec_perm): Likewise.
+	(aarch64_evpc_tbl): Likewise.
+	(aarch64_expand_vec_perm_const_1): Likewise.
+	(aarch64_expand_vec_perm_const): Likewise.
+	(aarch64_vectorize_vec_perm_const_ok): Likewise.
+	(TARGET_VECTORIZE_VEC_PERM_CONST_OK): Likewise.
+	* config/aarch64/iterators.md
+	(unspec): Add UNSPEC_TBL, UNSPEC_CONCAT.
+	(V_cmp_result): Add mapping for V2DF.
+
 2012-12-05  Yufeng Zhang  <yufeng.zhang@arm.com>

 	* config/aarch64/aarch64.c (aarch64_simd_mangle_map_entry): New
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@ -241,4 +241,9 @@ aarch64_builtin_vectorized_function (tree fndecl,
 				     tree type_out,
 				     tree type_in);

+extern void aarch64_split_combinev16qi (rtx operands[3]);
+extern void aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel);
+extern bool
+aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel);
+
 #endif /* GCC_AARCH64_PROTOS_H */
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@ -3338,6 +3338,74 @@

 ;; Permuted-store expanders for neon intrinsics.

+;; Permute instructions
+
+;; vec_perm support
+
+(define_expand "vec_perm_const<mode>"
+  [(match_operand:VALL 0 "register_operand")
+   (match_operand:VALL 1 "register_operand")
+   (match_operand:VALL 2 "register_operand")
+   (match_operand:<V_cmp_result> 3)]
+  "TARGET_SIMD"
+{
+  if (aarch64_expand_vec_perm_const (operands[0], operands[1],
+				     operands[2], operands[3]))
+    DONE;
+  else
+    FAIL;
+})
+
+(define_expand "vec_perm<mode>"
+  [(match_operand:VB 0 "register_operand")
+   (match_operand:VB 1 "register_operand")
+   (match_operand:VB 2 "register_operand")
+   (match_operand:VB 3 "register_operand")]
+  "TARGET_SIMD"
+{
+  aarch64_expand_vec_perm (operands[0], operands[1],
+			   operands[2], operands[3]);
+  DONE;
+})
+
+(define_insn "aarch64_tbl1<mode>"
+  [(set (match_operand:VB 0 "register_operand" "=w")
+	(unspec:VB [(match_operand:V16QI 1 "register_operand" "w")
+		    (match_operand:VB 2 "register_operand" "w")]
+		   UNSPEC_TBL))]
+  "TARGET_SIMD"
+  "tbl\\t%0.<Vtype>, {%1.16b}, %2.<Vtype>"
+  [(set_attr "simd_type" "simd_tbl")
+   (set_attr "simd_mode" "<MODE>")]
+)
+
+;; Two source registers.
+
+(define_insn "aarch64_tbl2v16qi"
+  [(set (match_operand:V16QI 0 "register_operand" "=w")
+	(unspec:V16QI [(match_operand:OI 1 "register_operand" "w")
+		       (match_operand:V16QI 2 "register_operand" "w")]
+		      UNSPEC_TBL))]
+  "TARGET_SIMD"
+  "tbl\\t%0.16b, {%S1.16b - %T1.16b}, %2.16b"
+  [(set_attr "simd_type" "simd_tbl")
+   (set_attr "simd_mode" "V16QI")]
+)
+
+(define_insn_and_split "aarch64_combinev16qi"
+  [(set (match_operand:OI 0 "register_operand" "=w")
+	(unspec:OI [(match_operand:V16QI 1 "register_operand" "w")
+		    (match_operand:V16QI 2 "register_operand" "w")]
+		   UNSPEC_CONCAT))]
+  "TARGET_SIMD"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  aarch64_split_combinev16qi (operands);
+  DONE;
+})
+
 (define_insn "aarch64_st2<mode>_dreg"
  [(set (match_operand:TI 0 "aarch64_simd_struct_operand" "=Utv")
 	(unspec:TI [(match_operand:OI 1 "register_operand" "w")
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@ -110,6 +110,9 @@ static unsigned bit_count (unsigned HOST_WIDE_INT);
 static bool aarch64_const_vec_all_same_int_p (rtx,
 					      HOST_WIDE_INT, HOST_WIDE_INT);

+static bool aarch64_vectorize_vec_perm_const_ok (enum machine_mode vmode,
+						 const unsigned char *sel);
+
 /* The processor for which instructions should be scheduled.  */
 enum aarch64_processor aarch64_tune = generic;

@ -6782,6 +6785,292 @@ aarch64_c_mode_for_suffix (char suffix)
  return VOIDmode;
 }

+/* Split operands into moves from op[1] + op[2] into op[0].  */
+
+void
+aarch64_split_combinev16qi (rtx operands[3])
+{
+  unsigned int dest = REGNO (operands[0]);
+  unsigned int src1 = REGNO (operands[1]);
+  unsigned int src2 = REGNO (operands[2]);
+  enum machine_mode halfmode = GET_MODE (operands[1]);
+  unsigned int halfregs = HARD_REGNO_NREGS (src1, halfmode);
+  rtx destlo, desthi;
+
+  gcc_assert (halfmode == V16QImode);
+
+  if (src1 == dest && src2 == dest + halfregs)
+    {
+      /* No-op move.  Can't split to nothing; emit something.  */
+      emit_note (NOTE_INSN_DELETED);
+      return;
+    }
+
+  /* Preserve register attributes for variable tracking.  */
+  destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
+  desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
+			       GET_MODE_SIZE (halfmode));
+
+  /* Special case of reversed high/low parts.  */
+  if (reg_overlap_mentioned_p (operands[2], destlo)
+      && reg_overlap_mentioned_p (operands[1], desthi))
+    {
+      emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
+      emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
+      emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
+    }
+  else if (!reg_overlap_mentioned_p (operands[2], destlo))
+    {
+      /* Try to avoid unnecessary moves if part of the result
+	 is in the right place already.  */
+      if (src1 != dest)
+	emit_move_insn (destlo, operands[1]);
+      if (src2 != dest + halfregs)
+	emit_move_insn (desthi, operands[2]);
+    }
+  else
+    {
+      if (src2 != dest + halfregs)
+	emit_move_insn (desthi, operands[2]);
+      if (src1 != dest)
+	emit_move_insn (destlo, operands[1]);
+    }
+}
+
+/* vec_perm support.  */
+
+#define MAX_VECT_LEN 16
+
+struct expand_vec_perm_d
+{
+  rtx target, op0, op1;
+  unsigned char perm[MAX_VECT_LEN];
+  enum machine_mode vmode;
+  unsigned char nelt;
+  bool one_vector_p;
+  bool testing_p;
+};
+
+/* Generate a variable permutation.  */
+
+static void
+aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
+{
+  enum machine_mode vmode = GET_MODE (target);
+  bool one_vector_p = rtx_equal_p (op0, op1);
+
+  gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
+  gcc_checking_assert (GET_MODE (op0) == vmode);
+  gcc_checking_assert (GET_MODE (op1) == vmode);
+  gcc_checking_assert (GET_MODE (sel) == vmode);
+  gcc_checking_assert (TARGET_SIMD);
+
+  if (one_vector_p)
+    {
+      if (vmode == V8QImode)
+	{
+	  /* Expand the argument to a V16QI mode by duplicating it.  */
+	  rtx pair = gen_reg_rtx (V16QImode);
+	  emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
+	  emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
+	}
+      else
+	{
+	  emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
+	}
+    }
+  else
+    {
+      rtx pair;
+
+      if (vmode == V8QImode)
+	{
+	  pair = gen_reg_rtx (V16QImode);
+	  emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
+	  emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
+	}
+      else
+	{
+	  pair = gen_reg_rtx (OImode);
+	  emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
+	  emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
+	}
+    }
+}
+
+void
+aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
+{
+  enum machine_mode vmode = GET_MODE (target);
+  unsigned int i, nelt = GET_MODE_NUNITS (vmode);
+  bool one_vector_p = rtx_equal_p (op0, op1);
+  rtx rmask[MAX_VECT_LEN], mask;
+
+  gcc_checking_assert (!BYTES_BIG_ENDIAN);
+
+  /* The TBL instruction does not use a modulo index, so we must take care
+     of that ourselves.  */
+  mask = GEN_INT (one_vector_p ? nelt - 1 : 2 * nelt - 1);
+  for (i = 0; i < nelt; ++i)
+    rmask[i] = mask;
+  mask = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rmask));
+  sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
+
+  aarch64_expand_vec_perm_1 (target, op0, op1, sel);
+}
+
+static bool
+aarch64_evpc_tbl (struct expand_vec_perm_d *d)
+{
+  rtx rperm[MAX_VECT_LEN], sel;
+  enum machine_mode vmode = d->vmode;
+  unsigned int i, nelt = d->nelt;
+
+  /* TODO: ARM's TBL indexing is little-endian.  In order to handle GCC's
+     numbering of elements for big-endian, we must reverse the order.  */
+  if (BYTES_BIG_ENDIAN)
+    return false;
+
+  if (d->testing_p)
+    return true;
+
+  /* Generic code will try constant permutation twice.  Once with the
+     original mode and again with the elements lowered to QImode.
+     So wait and don't do the selector expansion ourselves.  */
+  if (vmode != V8QImode && vmode != V16QImode)
+    return false;
+
+  for (i = 0; i < nelt; ++i)
+    rperm[i] = GEN_INT (d->perm[i]);
+  sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
+  sel = force_reg (vmode, sel);
+
+  aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
+  return true;
+}
+
+static bool
+aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
+{
+  /* The pattern matching functions above are written to look for a small
+     number to begin the sequence (0, 1, N/2).  If we begin with an index
+     from the second operand, we can swap the operands.  */
+  if (d->perm[0] >= d->nelt)
+    {
+      unsigned i, nelt = d->nelt;
+      rtx x;
+
+      for (i = 0; i < nelt; ++i)
+	d->perm[i] = (d->perm[i] + nelt) & (2 * nelt - 1);
+
+      x = d->op0;
+      d->op0 = d->op1;
+      d->op1 = x;
+    }
+
+  if (TARGET_SIMD)
+    return aarch64_evpc_tbl (d);
+  return false;
+}
+
+/* Expand a vec_perm_const pattern.  */
+
+bool
+aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
+{
+  struct expand_vec_perm_d d;
+  int i, nelt, which;
+
+  d.target = target;
+  d.op0 = op0;
+  d.op1 = op1;
+
+  d.vmode = GET_MODE (target);
+  gcc_assert (VECTOR_MODE_P (d.vmode));
+  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
+  d.testing_p = false;
+
+  for (i = which = 0; i < nelt; ++i)
+    {
+      rtx e = XVECEXP (sel, 0, i);
+      int ei = INTVAL (e) & (2 * nelt - 1);
+      which |= (ei < nelt ? 1 : 2);
+      d.perm[i] = ei;
+    }
+
+  switch (which)
+    {
+    default:
+      gcc_unreachable ();
+
+    case 3:
+      d.one_vector_p = false;
+      if (!rtx_equal_p (op0, op1))
+	break;
+
+      /* The elements of PERM do not suggest that only the first operand
+	 is used, but both operands are identical.  Allow easier matching
+	 of the permutation by folding the permutation into the single
+	 input vector.  */
+      /* Fall Through.  */
+    case 2:
+      for (i = 0; i < nelt; ++i)
+	d.perm[i] &= nelt - 1;
+      d.op0 = op1;
+      d.one_vector_p = true;
+      break;
+
+    case 1:
+      d.op1 = op0;
+      d.one_vector_p = true;
+      break;
+    }
+
+  return aarch64_expand_vec_perm_const_1 (&d);
+}
+
+static bool
+aarch64_vectorize_vec_perm_const_ok (enum machine_mode vmode,
+				     const unsigned char *sel)
+{
+  struct expand_vec_perm_d d;
+  unsigned int i, nelt, which;
+  bool ret;
+
+  d.vmode = vmode;
+  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
+  d.testing_p = true;
+  memcpy (d.perm, sel, nelt);
+
+  /* Calculate whether all elements are in one vector.  */
+  for (i = which = 0; i < nelt; ++i)
+    {
+      unsigned char e = d.perm[i];
+      gcc_assert (e < 2 * nelt);
+      which |= (e < nelt ? 1 : 2);
+    }
+
+  /* If all elements are from the second vector, reindex as if from the
+     first vector.  */
+  if (which == 2)
+    for (i = 0; i < nelt; ++i)
+      d.perm[i] -= nelt;
+
+  /* Check whether the mask can be applied to a single vector.  */
+  d.one_vector_p = (which != 3);
+
+  d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
+  d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
+  if (!d.one_vector_p)
+    d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
+
+  start_sequence ();
+  ret = aarch64_expand_vec_perm_const_1 (&d);
+  end_sequence ();
+
+  return ret;
+}
+
 #undef TARGET_ADDRESS_COST
 #define TARGET_ADDRESS_COST aarch64_address_cost

@ -6985,6 +7274,12 @@ aarch64_c_mode_for_suffix (char suffix)
 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
  aarch64_simd_vector_alignment_reachable

+/* vec_perm support.  */
+
+#undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
+#define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
+  aarch64_vectorize_vec_perm_const_ok
+
 struct gcc_target targetm = TARGET_INITIALIZER;

 #include "gt-aarch64.h"
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@ -228,6 +228,8 @@
    UNSPEC_FMAX		; Used in aarch64-simd.md.
    UNSPEC_FMIN		; Used in aarch64-simd.md.
    UNSPEC_BSL		; Used in aarch64-simd.md.
+    UNSPEC_TBL		; Used in vector permute patterns.
+    UNSPEC_CONCAT	; Used in vector permute patterns.
 ])

 ;; -------------------------------------------------------------------
@ -415,8 +417,9 @@
 (define_mode_attr V_cmp_result [(V8QI "V8QI") (V16QI "V16QI")
 				(V4HI "V4HI") (V8HI  "V8HI")
 				(V2SI "V2SI") (V4SI  "V4SI")
+				(DI   "DI")   (V2DI  "V2DI")
 				(V2SF "V2SI") (V4SF  "V4SI")
-				(DI   "DI")   (V2DI  "V2DI")])
+				(V2DF "V2DI")])

 ;; Vm for lane instructions is restricted to FP_LO_REGS.
 (define_mode_attr vwx [(V4HI "x") (V8HI "x") (HI "x")
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@ -1,3 +1,13 @@
+2012-12-05  James Greenhalgh  <james.greenhalgh@arm.com>
+
+	* lib/target-supports.exp
+	(check_effective_target_vect_perm): Allow aarch64*-*-*.
+	(check_effective_target_vect_perm_byte): Likewise.
+	(check_effective_target_vect_perm_short): Likewise.
+	(check_effective_target_vect_char_mult): Likewise.
+	(check_effective_target_vect_extract_even_odd): Likewise.
+	(check_effective_target_vect_interleave): Likewise.
+
 2012-12-05  Yufeng Zhang  <yufeng.zhang@arm.com>

 	* g++.dg/abi/mangle-neon-aarch64.C: New test.
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@ -3014,6 +3014,7 @@ proc check_effective_target_vect_perm { } {
    } else {
        set et_vect_perm_saved 0
        if { [is-effective-target arm_neon_ok]
+	     || [istarget aarch64*-*-*]
 	     || [istarget powerpc*-*-*]
             || [istarget spu-*-*]
 	     || [istarget i?86-*-*]
@ -3040,6 +3041,7 @@ proc check_effective_target_vect_perm_byte { } {
    } else {
        set et_vect_perm_byte_saved 0
        if { [is-effective-target arm_neon_ok]
+	     || [istarget aarch64*-*-*]
 	     || [istarget powerpc*-*-*]
             || [istarget spu-*-*] } {
            set et_vect_perm_byte_saved 1
@ -3062,6 +3064,7 @@ proc check_effective_target_vect_perm_short { } {
    } else {
        set et_vect_perm_short_saved 0
        if { [is-effective-target arm_neon_ok]
+	     || [istarget aarch64*-*-*]
 	     || [istarget powerpc*-*-*]
             || [istarget spu-*-*] } {
            set et_vect_perm_short_saved 1
@ -3697,7 +3700,8 @@ proc check_effective_target_vect_char_mult { } {
 	verbose "check_effective_target_vect_char_mult: using cached result" 2
    } else {
 	set et_vect_char_mult_saved 0
-	if { [istarget ia64-*-*]
+	if { [istarget aarch64*-*-*]
+	     || [istarget ia64-*-*]
 	     || [istarget i?86-*-*]
 	     || [istarget x86_64-*-*]
            || [check_effective_target_arm32] } {
@ -3768,8 +3772,9 @@ proc check_effective_target_vect_extract_even_odd { } {
        verbose "check_effective_target_vect_extract_even_odd: using cached result" 2
    } else {
        set et_vect_extract_even_odd_saved 0 
-        if { [istarget powerpc*-*-*] 
-            || [is-effective-target arm_neon_ok]
+	if { [istarget aarch64*-*-*]
+	     || [istarget powerpc*-*-*]
+	     || [is-effective-target arm_neon_ok]
             || [istarget i?86-*-*]
             || [istarget x86_64-*-*]
             || [istarget ia64-*-*]
@ -3793,8 +3798,9 @@ proc check_effective_target_vect_interleave { } {
        verbose "check_effective_target_vect_interleave: using cached result" 2
    } else {
        set et_vect_interleave_saved 0
-        if { [istarget powerpc*-*-*]
-            || [is-effective-target arm_neon_ok]
+	if { [istarget aarch64*-*-*]
+	     || [istarget powerpc*-*-*]
+	     || [is-effective-target arm_neon_ok]
             || [istarget i?86-*-*]
             || [istarget x86_64-*-*]
             || [istarget ia64-*-*]