aarch64: Support permutes on unpacked SVE vectors

This patch adds support for permuting unpacked SVE vectors using:

- DUP
- EXT
- REV[BHW]
- REV
- TRN[12]
- UZP[12]
- ZIP[12]

This involves rewriting the REV[BHW] permute code so that the inputs
and outputs of the insn pattern have the same mode as the vectors
being permuted.  This is different from the ACLE form, where the
reversal happens within individual elements rather than within
groups of multiple elements.  For example, the permute form of REVW
takes a VNx4SI input and swaps the two SI elements within each
64-bit group, whereas the ACLE form takes a VNx2DI input and
reverses the two 32-bit halves of each DI element; both forms map
to the same instruction.

The patch does not add a conditional version of REV[BHW].  I'll come
back to that once we have partial-vector comparisons and selects.

The patch is really just enablement, adding an extra tool to the
toolbox; it doesn't bring any significant vectorisation opportunities
on its own.  However, the new slp_perm_8.c test is one (artificial)
example that is now vectorised in a better way than before.
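
As a minimal illustration (lifted from the new rev_2.c test below;
the PERM macros just spell out the indices 31, 30, ..., 1, 0): with
-O -msve-vector-bits=2048, this unpacked permute previously matched
none of the dedicated SVE permute patterns, but now becomes a single
REV on 64-bit containers:

typedef unsigned int v32si __attribute__((vector_size(128)));

#define PERM0(B) B, B - 1
#define PERM1(B) PERM0 (B), PERM0 (B - 2)
#define PERM2(B) PERM1 (B), PERM1 (B - 4)
#define PERM3(B) PERM2 (B), PERM2 (B - 8)
#define PERM4(B) PERM3 (B), PERM3 (B - 16)

/* Each SI element sits in a 64-bit container, so the reversal is
   emitted as "rev z0.d, z1.d".  */
v32si
si_rev_d (v32si x)
{
  return __builtin_shuffle (x, x, (v32si) { PERM4 (31) });
}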

gcc/
	* config/aarch64/aarch64-modes.def (VNx2BF, VNx4BF): Adjust nunits
	based on the current VG and set the 2-byte element alignment.
	* config/aarch64/iterators.md (SVE_ALL, SVE_24, SVE_2, SVE_4): Add
	partial SVE BF modes.
	(UNSPEC_REVBHW): New unspec.
	(Vetype, Vesize, Vctype, VEL, Vel, vwcore, V_INT_CONTAINER)
	(v_int_container, VPRED, vpred): Handle partial SVE BF modes.
	(container_bits, Vcwtype): New mode attributes.
	* config/aarch64/aarch64-sve.md
	(@aarch64_sve_revbhw_<SVE_ALL:mode><PRED_HSD:mode>): New pattern.
	(@aarch64_sve_dup_lane<mode>): Extend from SVE_FULL to SVE_ALL.
	(@aarch64_sve_rev<mode>, @aarch64_sve_<perm_insn><mode>): Likewise.
	(@aarch64_sve_ext<mode>): Likewise.
	* config/aarch64/aarch64.c (aarch64_classify_vector_mode): Handle
	E_VNx2BFmode and E_VNx4BFmode.
	(aarch64_evpc_rev_local): Base the analysis on the container size
	instead of the element size.  Use the new aarch64_sve_revbhw
	patterns for SVE.
	(aarch64_evpc_dup): Handle partial SVE data modes.  Use the
	container size instead of the element size when applying the
	SVE immediate limit.  Fix a previously incorrect bounds check.
	(aarch64_expand_vec_perm_const_1): Handle partial SVE data modes.

gcc/testsuite/
	* gcc.target/aarch64/sve/dup_lane_2.c: New test.
	* gcc.target/aarch64/sve/dup_lane_3.c: Likewise.
	* gcc.target/aarch64/sve/ext_4.c: Likewise.
	* gcc.target/aarch64/sve/rev_2.c: Likewise.
	* gcc.target/aarch64/sve/revhw_1.c: Likewise.
	* gcc.target/aarch64/sve/revhw_2.c: Likewise.
	* gcc.target/aarch64/sve/slp_perm_8.c: Likewise.
	* gcc.target/aarch64/sve/trn1_2.c: Likewise.
	* gcc.target/aarch64/sve/trn2_2.c: Likewise.
	* gcc.target/aarch64/sve/uzp1_2.c: Likewise.
	* gcc.target/aarch64/sve/uzp2_2.c: Likewise.
	* gcc.target/aarch64/sve/zip1_2.c: Likewise.
	* gcc.target/aarch64/sve/zip2_2.c: Likewise.
Richard Sandiford, 2020-11-06 16:49:28 +00:00
commit 6c3ce63b04 (parent 9b11203e33)
17 changed files with 3684 additions and 59 deletions

gcc/config/aarch64/aarch64-modes.def

@@ -136,11 +136,13 @@ ADJUST_NUNITS (VNx2QI, aarch64_sve_vg);
 ADJUST_NUNITS (VNx2HI, aarch64_sve_vg);
 ADJUST_NUNITS (VNx2SI, aarch64_sve_vg);
 ADJUST_NUNITS (VNx2HF, aarch64_sve_vg);
+ADJUST_NUNITS (VNx2BF, aarch64_sve_vg);
 ADJUST_NUNITS (VNx2SF, aarch64_sve_vg);
 ADJUST_NUNITS (VNx4QI, aarch64_sve_vg * 2);
 ADJUST_NUNITS (VNx4HI, aarch64_sve_vg * 2);
 ADJUST_NUNITS (VNx4HF, aarch64_sve_vg * 2);
+ADJUST_NUNITS (VNx4BF, aarch64_sve_vg * 2);
 ADJUST_NUNITS (VNx8QI, aarch64_sve_vg * 4);
@@ -151,7 +153,9 @@ ADJUST_ALIGNMENT (VNx8QI, 1);
 ADJUST_ALIGNMENT (VNx2HI, 2);
 ADJUST_ALIGNMENT (VNx4HI, 2);
 ADJUST_ALIGNMENT (VNx2HF, 2);
+ADJUST_ALIGNMENT (VNx2BF, 2);
 ADJUST_ALIGNMENT (VNx4HF, 2);
+ADJUST_ALIGNMENT (VNx4BF, 2);
 ADJUST_ALIGNMENT (VNx2SI, 4);
 ADJUST_ALIGNMENT (VNx2SF, 4);
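
A reference note (not part of the patch): aarch64_sve_vg is the
number of 64-bit granules in an SVE vector, so with these adjustments
a 2048-bit vector gives VNx2BF 32 elements (one per 64-bit container)
and VNx4BF 64 elements (one per 32-bit container), each keeping
bfloat16's natural 2-byte alignment.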

gcc/config/aarch64/aarch64-sve.md

@@ -3009,6 +3009,22 @@
   "<sve_int_op>\t%0.<Vetype>, %1/m, %2.<Vetype>"
 )
 
+;; Another way of expressing the REVB, REVH and REVW patterns, with this
+;; form being easier for permutes.  The predicate mode determines the
+;; number of lanes and the data mode decides the granularity of the
+;; reversal within each lane.
+(define_insn "@aarch64_sve_revbhw_<SVE_ALL:mode><PRED_HSD:mode>"
+  [(set (match_operand:SVE_ALL 0 "register_operand" "=w")
+	(unspec:SVE_ALL
+	  [(match_operand:PRED_HSD 1 "register_operand" "Upl")
+	   (unspec:SVE_ALL
+	     [(match_operand:SVE_ALL 2 "register_operand" "w")]
+	     UNSPEC_REVBHW)]
+	  UNSPEC_PRED_X))]
+  "TARGET_SVE && <PRED_HSD:elem_bits> > <SVE_ALL:container_bits>"
+  "rev<SVE_ALL:Vcwtype>\t%0.<PRED_HSD:Vetype>, %1/m, %2.<PRED_HSD:Vetype>"
+)
+
 ;; Predicated integer unary operations with merging.
 (define_insn "@cond_<optab><mode>"
   [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, ?&w, ?&w")
@@ -8273,14 +8289,14 @@
 ;; Duplicate one element of a vector.
 (define_insn "@aarch64_sve_dup_lane<mode>"
-  [(set (match_operand:SVE_FULL 0 "register_operand" "=w")
-	(vec_duplicate:SVE_FULL
+  [(set (match_operand:SVE_ALL 0 "register_operand" "=w")
+	(vec_duplicate:SVE_ALL
 	  (vec_select:<VEL>
-	    (match_operand:SVE_FULL 1 "register_operand" "w")
+	    (match_operand:SVE_ALL 1 "register_operand" "w")
 	    (parallel [(match_operand:SI 2 "const_int_operand")]))))]
   "TARGET_SVE
-   && IN_RANGE (INTVAL (operands[2]) * GET_MODE_SIZE (<VEL>mode), 0, 63)"
-  "dup\t%0.<Vetype>, %1.<Vetype>[%2]"
+   && IN_RANGE (INTVAL (operands[2]) * <container_bits> / 8, 0, 63)"
+  "dup\t%0.<Vctype>, %1.<Vctype>[%2]"
 )
 
 ;; Use DUP.Q to duplicate a 128-bit segment of a register.
@@ -8321,17 +8337,18 @@
 ;; Reverse the order of elements within a full vector.
 (define_insn "@aarch64_sve_rev<mode>"
-  [(set (match_operand:SVE_FULL 0 "register_operand" "=w")
-	(unspec:SVE_FULL
-	  [(match_operand:SVE_FULL 1 "register_operand" "w")]
+  [(set (match_operand:SVE_ALL 0 "register_operand" "=w")
+	(unspec:SVE_ALL
+	  [(match_operand:SVE_ALL 1 "register_operand" "w")]
 	  UNSPEC_REV))]
   "TARGET_SVE"
-  "rev\t%0.<Vetype>, %1.<Vetype>")
+  "rev\t%0.<Vctype>, %1.<Vctype>")
 
 ;; -------------------------------------------------------------------------
 ;; ---- [INT,FP] Special-purpose binary permutes
 ;; -------------------------------------------------------------------------
 ;; Includes:
 ;;  - EXT
 ;;  - SPLICE
 ;;  - TRN1
 ;;  - TRN2
@@ -8359,13 +8376,13 @@
 ;; Permutes that take half the elements from one vector and half the
 ;; elements from the other.
 (define_insn "@aarch64_sve_<perm_insn><mode>"
-  [(set (match_operand:SVE_FULL 0 "register_operand" "=w")
-	(unspec:SVE_FULL
-	  [(match_operand:SVE_FULL 1 "register_operand" "w")
-	   (match_operand:SVE_FULL 2 "register_operand" "w")]
+  [(set (match_operand:SVE_ALL 0 "register_operand" "=w")
+	(unspec:SVE_ALL
+	  [(match_operand:SVE_ALL 1 "register_operand" "w")
+	   (match_operand:SVE_ALL 2 "register_operand" "w")]
 	  PERMUTE))]
   "TARGET_SVE"
-  "<perm_insn>\t%0.<Vetype>, %1.<Vetype>, %2.<Vetype>"
+  "<perm_insn>\t%0.<Vctype>, %1.<Vctype>, %2.<Vctype>"
 )
 
 ;; Apply PERMUTE to 128-bit sequences.  The behavior of these patterns
@@ -8383,16 +8400,16 @@
 ;; Concatenate two vectors and extract a subvector.  Note that the
 ;; immediate (third) operand is the lane index not the byte index.
 (define_insn "@aarch64_sve_ext<mode>"
-  [(set (match_operand:SVE_FULL 0 "register_operand" "=w, ?&w")
-	(unspec:SVE_FULL
-	  [(match_operand:SVE_FULL 1 "register_operand" "0, w")
-	   (match_operand:SVE_FULL 2 "register_operand" "w, w")
+  [(set (match_operand:SVE_ALL 0 "register_operand" "=w, ?&w")
+	(unspec:SVE_ALL
+	  [(match_operand:SVE_ALL 1 "register_operand" "0, w")
+	   (match_operand:SVE_ALL 2 "register_operand" "w, w")
 	   (match_operand:SI 3 "const_int_operand")]
 	  UNSPEC_EXT))]
   "TARGET_SVE
-   && IN_RANGE (INTVAL (operands[3]) * GET_MODE_SIZE (<VEL>mode), 0, 255)"
+   && IN_RANGE (INTVAL (operands[3]) * <container_bits> / 8, 0, 255)"
   {
-    operands[3] = GEN_INT (INTVAL (operands[3]) * GET_MODE_SIZE (<VEL>mode));
+    operands[3] = GEN_INT (INTVAL (operands[3]) * <container_bits> / 8);
     return (which_alternative == 0
 	    ? "ext\\t%0.b, %0.b, %2.b, #%3"
 	    : "movprfx\t%0, %1\;ext\\t%0.b, %0.b, %2.b, #%3");

gcc/config/aarch64/aarch64.c

@@ -2226,6 +2226,9 @@ aarch64_classify_vector_mode (machine_mode mode)
     /* Partial SVE HF vectors.  */
     case E_VNx2HFmode:
     case E_VNx4HFmode:
+    /* Partial SVE BF vectors.  */
+    case E_VNx2BFmode:
+    case E_VNx4BFmode:
     /* Partial SVE SF vector.  */
     case E_VNx2SFmode:
       return TARGET_SVE ? VEC_SVE_DATA | VEC_PARTIAL : 0;
@@ -20468,18 +20471,21 @@ aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
       || !diff)
     return false;
 
-  size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
-  if (size == 8)
+  if (d->vec_flags & VEC_SVE_DATA)
+    size = (diff + 1) * aarch64_sve_container_bits (d->vmode);
+  else
+    size = (diff + 1) * GET_MODE_UNIT_BITSIZE (d->vmode);
+  if (size == 64)
     {
       unspec = UNSPEC_REV64;
       pred_mode = VNx2BImode;
     }
-  else if (size == 4)
+  else if (size == 32)
     {
       unspec = UNSPEC_REV32;
       pred_mode = VNx4BImode;
     }
-  else if (size == 2)
+  else if (size == 16)
     {
       unspec = UNSPEC_REV16;
       pred_mode = VNx8BImode;
@@ -20496,28 +20502,11 @@ aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
   if (d->testing_p)
     return true;
 
-  if (d->vec_flags == VEC_SVE_DATA)
+  if (d->vec_flags & VEC_SVE_DATA)
     {
-      machine_mode int_mode = aarch64_sve_int_mode (pred_mode);
-      rtx target = gen_reg_rtx (int_mode);
-      if (BYTES_BIG_ENDIAN)
-	/* The act of taking a subreg between INT_MODE and d->vmode
-	   is itself a reversing operation on big-endian targets;
-	   see the comment at the head of aarch64-sve.md for details.
-	   First reinterpret OP0 as INT_MODE without using a subreg
-	   and without changing the contents.  */
-	emit_insn (gen_aarch64_sve_reinterpret (int_mode, target, d->op0));
-      else
-	{
-	  /* For SVE we use REV[BHW] unspecs derived from the element size
-	     of v->mode and vector modes whose elements have SIZE bytes.
-	     This ensures that the vector modes match the predicate modes.  */
-	  int unspec = aarch64_sve_rev_unspec (d->vmode);
-	  rtx pred = aarch64_ptrue_reg (pred_mode);
-	  emit_insn (gen_aarch64_pred (unspec, int_mode, target, pred,
-				       gen_lowpart (int_mode, d->op0)));
-	}
-      emit_move_insn (d->target, gen_lowpart (d->vmode, target));
+      rtx pred = aarch64_ptrue_reg (pred_mode);
+      emit_insn (gen_aarch64_sve_revbhw (d->vmode, pred_mode,
+					 d->target, pred, d->op0));
       return true;
     }
 
   rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
@@ -20562,7 +20551,8 @@ aarch64_evpc_dup (struct expand_vec_perm_d *d)
       || !d->perm[0].is_constant (&elt))
     return false;
 
-  if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
+  if ((d->vec_flags & VEC_SVE_DATA)
+      && elt * (aarch64_sve_container_bits (vmode) / 8) >= 64)
     return false;
 
   /* Success! */
@@ -20782,6 +20772,7 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
   if ((d->vec_flags == VEC_ADVSIMD
        || d->vec_flags == VEC_SVE_DATA
+       || d->vec_flags == (VEC_SVE_DATA | VEC_PARTIAL)
       || d->vec_flags == VEC_SVE_PRED)
      && known_gt (nelt, 1))
    {
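
A worked instance of the corrected bound in aarch64_evpc_dup
(mirroring dup_lane_2.c and dup_lane_3.c below): the byte offset of
the selected container must fit DUP's 0-63 immediate range, so for a
mode with 64-bit containers the highest usable index is 7:

/* VNx2SI: 32-bit elements, one per 64-bit container.
   elt = 7:  7 * (64 / 8) = 56 <= 63  -> "dup z0.d, z1.d[7]"
   elt = 8:  8 * (64 / 8) = 64 > 63   -> no DUP; see dup_lane_3.c.  */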

gcc/config/aarch64/iterators.md

@@ -400,7 +400,7 @@
 (define_mode_iterator SVE_ALL [VNx16QI VNx8QI VNx4QI VNx2QI
			       VNx8HI VNx4HI VNx2HI
			       VNx8HF VNx4HF VNx2HF
-			       VNx8BF
+			       VNx8BF VNx4BF VNx2BF
			       VNx4SI VNx2SI
			       VNx4SF VNx2SF
			       VNx2DI
@@ -418,11 +418,13 @@
			       VNx2DI])
 
 ;; SVE modes with 2 or 4 elements.
-(define_mode_iterator SVE_24 [VNx2QI VNx2HI VNx2HF VNx2SI VNx2SF VNx2DI VNx2DF
-			      VNx4QI VNx4HI VNx4HF VNx4SI VNx4SF])
+(define_mode_iterator SVE_24 [VNx2QI VNx2HI VNx2HF VNx2BF VNx2SI VNx2SF
+			      VNx2DI VNx2DF
+			      VNx4QI VNx4HI VNx4HF VNx4BF VNx4SI VNx4SF])
 
 ;; SVE modes with 2 elements.
-(define_mode_iterator SVE_2 [VNx2QI VNx2HI VNx2HF VNx2SI VNx2SF VNx2DI VNx2DF])
+(define_mode_iterator SVE_2 [VNx2QI VNx2HI VNx2HF VNx2BF
+			     VNx2SI VNx2SF VNx2DI VNx2DF])
 
 ;; SVE integer modes with 2 elements, excluding the widest element.
 (define_mode_iterator SVE_2BHSI [VNx2QI VNx2HI VNx2SI])
@@ -431,7 +433,7 @@
 (define_mode_iterator SVE_2HSDI [VNx2HI VNx2SI VNx2DI])
 
 ;; SVE modes with 4 elements.
-(define_mode_iterator SVE_4 [VNx4QI VNx4HI VNx4HF VNx4SI VNx4SF])
+(define_mode_iterator SVE_4 [VNx4QI VNx4HI VNx4HF VNx4BF VNx4SI VNx4SF])
 
 ;; SVE integer modes with 4 elements, excluding the widest element.
 (define_mode_iterator SVE_4BHI [VNx4QI VNx4HI])
@@ -621,6 +623,7 @@
     UNSPEC_REVB		; Used in aarch64-sve.md.
     UNSPEC_REVH		; Used in aarch64-sve.md.
     UNSPEC_REVW		; Used in aarch64-sve.md.
+    UNSPEC_REVBHW	; Used in aarch64-sve.md.
     UNSPEC_SMUL_HIGHPART ; Used in aarch64-sve.md.
     UNSPEC_UMUL_HIGHPART ; Used in aarch64-sve.md.
     UNSPEC_FMLA		; Used in aarch64-sve.md.
@@ -968,6 +971,16 @@
			  (VNx4SI "32") (VNx2DI "64")
			  (VNx8HF "16") (VNx4SF "32") (VNx2DF "64")])
 
+;; The number of bits in a vector container.
+(define_mode_attr container_bits [(VNx16QI "8")
+				  (VNx8HI "16") (VNx8QI "16") (VNx8HF "16")
+				  (VNx8BF "16")
+				  (VNx4SI "32") (VNx4HI "32") (VNx4QI "32")
+				  (VNx4SF "32") (VNx4HF "32") (VNx4BF "32")
+				  (VNx2DI "64") (VNx2SI "64") (VNx2HI "64")
+				  (VNx2QI "64") (VNx2DF "64") (VNx2SF "64")
+				  (VNx2HF "64") (VNx2BF "64")])
+
 ;; Attribute to describe constants acceptable in logical operations
 (define_mode_attr lconst [(SI "K") (DI "L")])
@@ -1029,7 +1042,7 @@
		 (VNx16QI "b") (VNx8QI "b") (VNx4QI "b") (VNx2QI "b")
		 (VNx8HI "h") (VNx4HI "h") (VNx2HI "h")
		 (VNx8HF "h") (VNx4HF "h") (VNx2HF "h")
-		 (VNx8BF "h")
+		 (VNx8BF "h") (VNx4BF "h") (VNx2BF "h")
		 (VNx4SI "s") (VNx2SI "s")
		 (VNx4SF "s") (VNx2SF "s")
		 (VNx2DI "d")
@@ -1047,7 +1060,7 @@
 (define_mode_attr Vesize [(VNx16QI "b") (VNx8QI "b") (VNx4QI "b") (VNx2QI "b")
			  (VNx8HI "h") (VNx4HI "h") (VNx2HI "h")
			  (VNx8HF "h") (VNx4HF "h") (VNx2HF "h")
-			  (VNx8BF "h")
+			  (VNx8BF "h") (VNx4BF "h") (VNx2BF "h")
			  (VNx4SI "w") (VNx2SI "w")
			  (VNx4SF "w") (VNx2SF "w")
			  (VNx2DI "d")
@@ -1066,12 +1079,23 @@
 (define_mode_attr Vctype [(VNx16QI "b") (VNx8QI "h") (VNx4QI "s") (VNx2QI "d")
			  (VNx8HI "h") (VNx4HI "s") (VNx2HI "d")
			  (VNx8HF "h") (VNx4HF "s") (VNx2HF "d")
-			  (VNx8BF "h")
+			  (VNx8BF "h") (VNx4BF "s") (VNx2BF "d")
			  (VNx4SI "s") (VNx2SI "d")
			  (VNx4SF "s") (VNx2SF "d")
			  (VNx2DI "d")
			  (VNx2DF "d")])
 
+;; The instruction mnemonic suffix for an SVE mode's element container,
+;; i.e. the Vewtype of full SVE modes that have the same number of elements.
+(define_mode_attr Vcwtype [(VNx16QI "b") (VNx8QI "h") (VNx4QI "w") (VNx2QI "d")
+			   (VNx8HI "h") (VNx4HI "w") (VNx2HI "d")
+			   (VNx8HF "h") (VNx4HF "w") (VNx2HF "d")
+			   (VNx8BF "h") (VNx4BF "w") (VNx2BF "d")
+			   (VNx4SI "w") (VNx2SI "d")
+			   (VNx4SF "w") (VNx2SF "d")
+			   (VNx2DI "d")
+			   (VNx2DF "d")])
+
 ;; Vetype is used everywhere in scheduling type and assembly output,
 ;; sometimes they are not the same, for example HF modes on some
 ;; instructions.  stype is defined to represent scheduling type
@@ -1107,7 +1131,7 @@
		       (VNx16QI "QI") (VNx8QI "QI") (VNx4QI "QI") (VNx2QI "QI")
		       (VNx8HI "HI") (VNx4HI "HI") (VNx2HI "HI")
		       (VNx8HF "HF") (VNx4HF "HF") (VNx2HF "HF")
-		       (VNx8BF "BF")
+		       (VNx8BF "BF") (VNx4BF "BF") (VNx2BF "BF")
		       (VNx4SI "SI") (VNx2SI "SI")
		       (VNx4SF "SF") (VNx2SF "SF")
		       (VNx2DI "DI")
@@ -1127,7 +1151,7 @@
		       (VNx16QI "qi") (VNx8QI "qi") (VNx4QI "qi") (VNx2QI "qi")
		       (VNx8HI "hi") (VNx4HI "hi") (VNx2HI "hi")
		       (VNx8HF "hf") (VNx4HF "hf") (VNx2HF "hf")
-		       (VNx8BF "bf")
+		       (VNx8BF "bf") (VNx4BF "bf") (VNx2BF "bf")
		       (VNx4SI "si") (VNx2SI "si")
		       (VNx4SF "sf") (VNx2SF "sf")
		       (VNx2DI "di")
@@ -1310,7 +1334,7 @@
		       (VNx16QI "w") (VNx8QI "w") (VNx4QI "w") (VNx2QI "w")
		       (VNx8HI "w") (VNx4HI "w") (VNx2HI "w")
		       (VNx8HF "w") (VNx4HF "w") (VNx2HF "w")
-		       (VNx8BF "w")
+		       (VNx8BF "w") (VNx4BF "w") (VNx2BF "w")
		       (VNx4SI "w") (VNx2SI "w")
		       (VNx4SF "w") (VNx2SF "w")
		       (VNx2DI "x")
@@ -1380,6 +1404,8 @@
				  (VNx2DI "VNx2DI")
				  (VNx8HF "VNx8HI") (VNx4HF "VNx4SI")
				  (VNx2HF "VNx2DI")
+				  (VNx8BF "VNx8HI") (VNx4BF "VNx4SI")
+				  (VNx2BF "VNx2DI")
				  (VNx4SF "VNx4SI") (VNx2SF "VNx2DI")
				  (VNx2DF "VNx2DI")])
@@ -1392,6 +1418,8 @@
				  (VNx2DI "vnx2di")
				  (VNx8HF "vnx8hi") (VNx4HF "vnx4si")
				  (VNx2HF "vnx2di")
+				  (VNx8BF "vnx8hi") (VNx4BF "vnx4si")
+				  (VNx2BF "vnx2di")
				  (VNx4SF "vnx4si") (VNx2SF "vnx2di")
				  (VNx2DF "vnx2di")])
@@ -1617,7 +1645,7 @@
		      (VNx4QI "VNx4BI") (VNx2QI "VNx2BI")
		      (VNx8HI "VNx8BI") (VNx4HI "VNx4BI") (VNx2HI "VNx2BI")
		      (VNx8HF "VNx8BI") (VNx4HF "VNx4BI") (VNx2HF "VNx2BI")
-		      (VNx8BF "VNx8BI")
+		      (VNx8BF "VNx8BI") (VNx4BF "VNx4BI") (VNx2BF "VNx2BI")
		      (VNx4SI "VNx4BI") (VNx2SI "VNx2BI")
		      (VNx4SF "VNx4BI") (VNx2SF "VNx2BI")
		      (VNx2DI "VNx2BI")
@@ -1643,7 +1671,7 @@
		      (VNx4QI "vnx4bi") (VNx2QI "vnx2bi")
		      (VNx8HI "vnx8bi") (VNx4HI "vnx4bi") (VNx2HI "vnx2bi")
		      (VNx8HF "vnx8bi") (VNx4HF "vnx4bi") (VNx2HF "vnx2bi")
-		      (VNx8BF "vnx8bi")
+		      (VNx8BF "vnx8bi") (VNx4BF "vnx4bi") (VNx2BF "vnx2bi")
		      (VNx4SI "vnx4bi") (VNx2SI "vnx2bi")
		      (VNx4SF "vnx4bi") (VNx2SF "vnx2bi")
		      (VNx2DI "vnx2bi")

gcc/testsuite/gcc.target/aarch64/sve/dup_lane_2.c

@@ -0,0 +1,331 @@
/* { dg-do assemble { target aarch64_asm_sve_ok } } */
/* { dg-options "-O -msve-vector-bits=2048 -mlittle-endian --save-temps" } */
/* { dg-final { check-function-bodies "**" "" } } */
typedef unsigned char v128qi __attribute__((vector_size(128)));
typedef unsigned char v64qi __attribute__((vector_size(64)));
typedef unsigned char v32qi __attribute__((vector_size(32)));
typedef unsigned short v64hi __attribute__((vector_size(128)));
typedef unsigned short v32hi __attribute__((vector_size(64)));
typedef _Float16 v64hf __attribute__((vector_size(128)));
typedef _Float16 v32hf __attribute__((vector_size(64)));
typedef __bf16 v64bf __attribute__((vector_size(128)));
typedef __bf16 v32bf __attribute__((vector_size(64)));
typedef unsigned int v32si __attribute__((vector_size(128)));
typedef float v32sf __attribute__((vector_size(128)));
#define PERM0(B) B, B
#define PERM1(B) PERM0 (B), PERM0 (B)
#define PERM2(B) PERM1 (B), PERM1 (B)
#define PERM3(B) PERM2 (B), PERM2 (B)
#define PERM4(B) PERM3 (B), PERM3 (B)
#define PERM5(B) PERM4 (B), PERM4 (B)
#define PERM6(B) PERM5 (B), PERM5 (B)
/*
** qi_dup_h_1:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
** dup (z[0-9]+)\.h, \2\.h\[1\]
** st1b \3\.h, \1, \[x8\]
** ret
*/
v128qi
qi_dup_h_1 (v128qi x)
{
return __builtin_shuffle (x, x, (v128qi) { PERM6 (1) });
}
/*
** qi_dup_h_31:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
** dup (z[0-9]+)\.h, \2\.h\[31\]
** st1b \3\.h, \1, \[x8\]
** ret
*/
v128qi
qi_dup_h_31 (v128qi x)
{
return __builtin_shuffle (x, x, (v128qi) { PERM6 (31) });
}
/*
** qi_dup_s_1:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.s, \1/z, \[x0\]
** dup (z[0-9]+)\.s, \2\.s\[1\]
** st1b \3\.s, \1, \[x8\]
** ret
*/
v64qi
qi_dup_s_1 (v64qi x)
{
return __builtin_shuffle (x, x, (v64qi) { PERM5 (1) });
}
/*
** qi_dup_s_15:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.s, \1/z, \[x0\]
** dup (z[0-9]+)\.s, \2\.s\[15\]
** st1b \3\.s, \1, \[x8\]
** ret
*/
v64qi
qi_dup_s_15 (v64qi x)
{
return __builtin_shuffle (x, x, (v64qi) { PERM5 (15) });
}
/*
** qi_dup_d_1:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.d, \1/z, \[x0\]
** dup (z[0-9]+)\.d, \2\.d\[1\]
** st1b \3\.d, \1, \[x8\]
** ret
*/
v32qi
qi_dup_d_1 (v32qi x)
{
return __builtin_shuffle (x, x, (v32qi) { PERM4 (1) });
}
/*
** qi_dup_d_7:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.d, \1/z, \[x0\]
** dup (z[0-9]+)\.d, \2\.d\[7\]
** st1b \3\.d, \1, \[x8\]
** ret
*/
v32qi
qi_dup_d_7 (v32qi x)
{
return __builtin_shuffle (x, x, (v32qi) { PERM4 (7) });
}
/*
** hi_dup_s_1:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** dup (z[0-9]+)\.s, \2\.s\[1\]
** st1h \3\.s, \1, \[x8\]
** ret
*/
v64hi
hi_dup_s_1 (v64hi x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (1) });
}
/*
** hi_dup_s_15:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** dup (z[0-9]+)\.s, \2\.s\[15\]
** st1h \3\.s, \1, \[x8\]
** ret
*/
v64hi
hi_dup_s_15 (v64hi x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (15) });
}
/*
** hf_dup_s_1:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** dup (z[0-9]+)\.s, \2\.s\[1\]
** st1h \3\.s, \1, \[x8\]
** ret
*/
v64hf
hf_dup_s_1 (v64hf x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (1) });
}
/*
** hf_dup_s_11:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** dup (z[0-9]+)\.s, \2\.s\[11\]
** st1h \3\.s, \1, \[x8\]
** ret
*/
v64hf
hf_dup_s_11 (v64hf x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (11) });
}
/*
** bf_dup_s_1:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** dup (z[0-9]+)\.s, \2\.s\[1\]
** st1h \3\.s, \1, \[x8\]
** ret
*/
v64bf
bf_dup_s_1 (v64bf x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (1) });
}
/*
** bf_dup_s_13:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** dup (z[0-9]+)\.s, \2\.s\[13\]
** st1h \3\.s, \1, \[x8\]
** ret
*/
v64bf
bf_dup_s_13 (v64bf x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (13) });
}
/*
** hi_dup_d_1:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** dup (z[0-9]+)\.d, \2\.d\[1\]
** st1h \3\.d, \1, \[x8\]
** ret
*/
v32hi
hi_dup_d_1 (v32hi x)
{
return __builtin_shuffle (x, x, (v32hi) { PERM4 (1) });
}
/*
** hi_dup_d_7:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** dup (z[0-9]+)\.d, \2\.d\[7\]
** st1h \3\.d, \1, \[x8\]
** ret
*/
v32hi
hi_dup_d_7 (v32hi x)
{
return __builtin_shuffle (x, x, (v32hi) { PERM4 (7) });
}
/*
** hf_dup_d_1:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** dup (z[0-9]+)\.d, \2\.d\[1\]
** st1h \3\.d, \1, \[x8\]
** ret
*/
v32hf
hf_dup_d_1 (v32hf x)
{
return __builtin_shuffle (x, x, (v32hi) { PERM4 (1) });
}
/*
** hf_dup_d_5:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** dup (z[0-9]+)\.d, \2\.d\[5\]
** st1h \3\.d, \1, \[x8\]
** ret
*/
v32hf
hf_dup_d_5 (v32hf x)
{
return __builtin_shuffle (x, x, (v32hi) { PERM4 (5) });
}
/*
** bf_dup_d_1:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** dup (z[0-9]+)\.d, \2\.d\[1\]
** st1h \3\.d, \1, \[x8\]
** ret
*/
v32bf
bf_dup_d_1 (v32bf x)
{
return __builtin_shuffle (x, x, (v32hi) { PERM4 (1) });
}
/*
** bf_dup_d_6:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** dup (z[0-9]+)\.d, \2\.d\[6\]
** st1h \3\.d, \1, \[x8\]
** ret
*/
v32bf
bf_dup_d_6 (v32bf x)
{
return __builtin_shuffle (x, x, (v32hi) { PERM4 (6) });
}
/*
** si_dup_d_1:
** ptrue (p[0-7])\.b, vl256
** ld1w (z[0-9]+)\.d, \1/z, \[x0\]
** dup (z[0-9]+)\.d, \2\.d\[1\]
** st1w \3\.d, \1, \[x8\]
** ret
*/
v32si
si_dup_d_1 (v32si x)
{
return __builtin_shuffle (x, x, (v32si) { PERM4 (1) });
}
/*
** si_dup_d_7:
** ptrue (p[0-7])\.b, vl256
** ld1w (z[0-9]+)\.d, \1/z, \[x0\]
** dup (z[0-9]+)\.d, \2\.d\[7\]
** st1w \3\.d, \1, \[x8\]
** ret
*/
v32si
si_dup_d_7 (v32si x)
{
return __builtin_shuffle (x, x, (v32si) { PERM4 (7) });
}
/*
** sf_dup_d_1:
** ptrue (p[0-7])\.b, vl256
** ld1w (z[0-9]+)\.d, \1/z, \[x0\]
** dup (z[0-9]+)\.d, \2\.d\[1\]
** st1w \3\.d, \1, \[x8\]
** ret
*/
v32sf
sf_dup_d_1 (v32sf x)
{
return __builtin_shuffle (x, x, (v32si) { PERM4 (1) });
}
/*
** sf_dup_d_7:
** ptrue (p[0-7])\.b, vl256
** ld1w (z[0-9]+)\.d, \1/z, \[x0\]
** dup (z[0-9]+)\.d, \2\.d\[7\]
** st1w \3\.d, \1, \[x8\]
** ret
*/
v32sf
sf_dup_d_7 (v32sf x)
{
return __builtin_shuffle (x, x, (v32si) { PERM4 (7) });
}

gcc/testsuite/gcc.target/aarch64/sve/dup_lane_3.c

@@ -0,0 +1,90 @@
/* { dg-do assemble { target aarch64_asm_sve_ok } } */
/* { dg-options "-O -msve-vector-bits=2048 -mlittle-endian --save-temps" } */
typedef unsigned char v128qi __attribute__((vector_size(128)));
typedef unsigned char v64qi __attribute__((vector_size(64)));
typedef unsigned char v32qi __attribute__((vector_size(32)));
typedef unsigned short v64hi __attribute__((vector_size(128)));
typedef unsigned short v32hi __attribute__((vector_size(64)));
typedef _Float16 v64hf __attribute__((vector_size(128)));
typedef _Float16 v32hf __attribute__((vector_size(64)));
typedef __bf16 v64bf __attribute__((vector_size(128)));
typedef __bf16 v32bf __attribute__((vector_size(64)));
typedef unsigned int v32si __attribute__((vector_size(128)));
typedef float v32sf __attribute__((vector_size(128)));
#define PERM0(B) B, B
#define PERM1(B) PERM0 (B), PERM0 (B)
#define PERM2(B) PERM1 (B), PERM1 (B)
#define PERM3(B) PERM2 (B), PERM2 (B)
#define PERM4(B) PERM3 (B), PERM3 (B)
#define PERM5(B) PERM4 (B), PERM4 (B)
#define PERM6(B) PERM5 (B), PERM5 (B)
v128qi
qi_dup_h_32 (v128qi x)
{
return __builtin_shuffle (x, x, (v128qi) { PERM6 (32) });
}
v64qi
qi_dup_s_16 (v64qi x)
{
return __builtin_shuffle (x, x, (v64qi) { PERM5 (16) });
}
v32qi
qi_dup_d_8 (v32qi x)
{
return __builtin_shuffle (x, x, (v32qi) { PERM4 (8) });
}
v64hi
hi_dup_s_16 (v64hi x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (16) });
}
v64hf
hf_dup_s_16 (v64hf x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (16) });
}
v64bf
bf_dup_s_16 (v64bf x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (16) });
}
v32hi
hi_dup_d_8 (v32hi x)
{
return __builtin_shuffle (x, x, (v32hi) { PERM4 (8) });
}
v32hf
hf_dup_d_8 (v32hf x)
{
return __builtin_shuffle (x, x, (v32hi) { PERM4 (8) });
}
v32bf
bf_dup_d_8 (v32bf x)
{
return __builtin_shuffle (x, x, (v32hi) { PERM4 (8) });
}
v32si
si_dup_d_8 (v32si x)
{
return __builtin_shuffle (x, x, (v32si) { PERM4 (8) });
}
v32sf
sf_dup_d_8 (v32sf x)
{
return __builtin_shuffle (x, x, (v32si) { PERM4 (8) });
}
/* { dg-final { scan-assembler-not {\tdup\tz} } } */

gcc/testsuite/gcc.target/aarch64/sve/ext_4.c

@@ -0,0 +1,353 @@
/* { dg-do assemble { target aarch64_asm_sve_ok } } */
/* { dg-options "-O -msve-vector-bits=2048 -mlittle-endian --save-temps" } */
/* { dg-final { check-function-bodies "**" "" } } */
typedef unsigned char v128qi __attribute__((vector_size(128)));
typedef unsigned char v64qi __attribute__((vector_size(64)));
typedef unsigned char v32qi __attribute__((vector_size(32)));
typedef unsigned short v64hi __attribute__((vector_size(128)));
typedef unsigned short v32hi __attribute__((vector_size(64)));
typedef _Float16 v64hf __attribute__((vector_size(128)));
typedef _Float16 v32hf __attribute__((vector_size(64)));
typedef __bf16 v64bf __attribute__((vector_size(128)));
typedef __bf16 v32bf __attribute__((vector_size(64)));
typedef unsigned int v32si __attribute__((vector_size(128)));
typedef float v32sf __attribute__((vector_size(128)));
#define PERM0(B) B, B + 1
#define PERM1(B) PERM0 (B), PERM0 (B + 2)
#define PERM2(B) PERM1 (B), PERM1 (B + 4)
#define PERM3(B) PERM2 (B), PERM2 (B + 8)
#define PERM4(B) PERM3 (B), PERM3 (B + 16)
#define PERM5(B) PERM4 (B), PERM4 (B + 32)
#define PERM6(B) PERM5 (B), PERM5 (B + 64)
/*
** qi_ext_h_1:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
** ext \2\.b, \2\.b, \2\.b, #2
** st1b \2\.h, \1, \[x8\]
** ret
*/
v128qi
qi_ext_h_1 (v128qi x)
{
return __builtin_shuffle (x, x, (v128qi) { PERM6 (1) });
}
/*
** qi_ext_h_1_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1b (z[0-9]+)\.h, \1/z, \[x1\]
** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
** ext \3\.b, \3\.b, \2\.b, #2
** st1b \3\.h, \1, \[x8\]
** |
** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
** ld1b (z[0-9]+)\.h, \1/z, \[x1\]
** ext \4\.b, \4\.b, \5\.b, #2
** st1b \4\.h, \1, \[x8\]
** )
** ret
*/
v128qi
qi_ext_h_1_two_op (v128qi x, v128qi y)
{
return __builtin_shuffle (x, y, (v128qi) { PERM6 (1) });
}
/*
** qi_ext_h_127:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
** ext \2\.b, \2\.b, \2\.b, #254
** st1b \2\.h, \1, \[x8\]
** ret
*/
v128qi
qi_ext_h_127 (v128qi x)
{
return __builtin_shuffle (x, x, (v128qi) { PERM6 (127) });
}
/*
** qi_ext_s_1:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.s, \1/z, \[x0\]
** ext \2\.b, \2\.b, \2\.b, #4
** st1b \2\.s, \1, \[x8\]
** ret
*/
v64qi
qi_ext_s_1 (v64qi x)
{
return __builtin_shuffle (x, x, (v64qi) { PERM5 (1) });
}
/*
** qi_ext_s_63:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.s, \1/z, \[x0\]
** ext \2\.b, \2\.b, \2\.b, #252
** st1b \2\.s, \1, \[x8\]
** ret
*/
v64qi
qi_ext_s_63 (v64qi x)
{
return __builtin_shuffle (x, x, (v64qi) { PERM5 (63) });
}
/*
** qi_ext_d_1:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.d, \1/z, \[x0\]
** ext \2\.b, \2\.b, \2\.b, #8
** st1b \2\.d, \1, \[x8\]
** ret
*/
v32qi
qi_ext_d_1 (v32qi x)
{
return __builtin_shuffle (x, x, (v32qi) { PERM4 (1) });
}
/*
** qi_ext_d_31:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.d, \1/z, \[x0\]
** ext \2\.b, \2\.b, \2\.b, #248
** st1b \2\.d, \1, \[x8\]
** ret
*/
v32qi
qi_ext_d_31 (v32qi x)
{
return __builtin_shuffle (x, x, (v32qi) { PERM4 (31) });
}
/*
** hi_ext_s_1:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** ext \2\.b, \2\.b, \2\.b, #4
** st1h \2\.s, \1, \[x8\]
** ret
*/
v64hi
hi_ext_s_1 (v64hi x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (1) });
}
/*
** hi_ext_s_63:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** ext \2\.b, \2\.b, \2\.b, #252
** st1h \2\.s, \1, \[x8\]
** ret
*/
v64hi
hi_ext_s_63 (v64hi x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (63) });
}
/*
** hf_ext_s_1:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** ext \2\.b, \2\.b, \2\.b, #4
** st1h \2\.s, \1, \[x8\]
** ret
*/
v64hf
hf_ext_s_1 (v64hf x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (1) });
}
/*
** hf_ext_s_60:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** ext \2\.b, \2\.b, \2\.b, #240
** st1h \2\.s, \1, \[x8\]
** ret
*/
v64hf
hf_ext_s_60 (v64hf x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (60) });
}
/*
** bf_ext_s_1:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** ext \2\.b, \2\.b, \2\.b, #4
** st1h \2\.s, \1, \[x8\]
** ret
*/
v64bf
bf_ext_s_1 (v64bf x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (1) });
}
/*
** bf_ext_s_40:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** ext \2\.b, \2\.b, \2\.b, #160
** st1h \2\.s, \1, \[x8\]
** ret
*/
v64bf
bf_ext_s_40 (v64bf x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (40) });
}
/*
** hi_ext_d_1:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** ext \2\.b, \2\.b, \2\.b, #8
** st1h \2\.d, \1, \[x8\]
** ret
*/
v32hi
hi_ext_d_1 (v32hi x)
{
return __builtin_shuffle (x, x, (v32hi) { PERM4 (1) });
}
/*
** hi_ext_d_31:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** ext \2\.b, \2\.b, \2\.b, #248
** st1h \2\.d, \1, \[x8\]
** ret
*/
v32hi
hi_ext_d_31 (v32hi x)
{
return __builtin_shuffle (x, x, (v32hi) { PERM4 (31) });
}
/*
** hf_ext_d_1:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** ext \2\.b, \2\.b, \2\.b, #8
** st1h \2\.d, \1, \[x8\]
** ret
*/
v32hf
hf_ext_d_1 (v32hf x)
{
return __builtin_shuffle (x, x, (v32hi) { PERM4 (1) });
}
/*
** hf_ext_d_18:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** ext \2\.b, \2\.b, \2\.b, #144
** st1h \2\.d, \1, \[x8\]
** ret
*/
v32hf
hf_ext_d_18 (v32hf x)
{
return __builtin_shuffle (x, x, (v32hi) { PERM4 (18) });
}
/*
** bf_ext_d_1:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** ext \2\.b, \2\.b, \2\.b, #8
** st1h \2\.d, \1, \[x8\]
** ret
*/
v32bf
bf_ext_d_1 (v32bf x)
{
return __builtin_shuffle (x, x, (v32hi) { PERM4 (1) });
}
/*
** bf_ext_d_7:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** ext \2\.b, \2\.b, \2\.b, #56
** st1h \2\.d, \1, \[x8\]
** ret
*/
v32bf
bf_ext_d_7 (v32bf x)
{
return __builtin_shuffle (x, x, (v32hi) { PERM4 (7) });
}
/*
** si_ext_d_1:
** ptrue (p[0-7])\.b, vl256
** ld1w (z[0-9]+)\.d, \1/z, \[x0\]
** ext \2\.b, \2\.b, \2\.b, #8
** st1w \2\.d, \1, \[x8\]
** ret
*/
v32si
si_ext_d_1 (v32si x)
{
return __builtin_shuffle (x, x, (v32si) { PERM4 (1) });
}
/*
** si_ext_d_31:
** ptrue (p[0-7])\.b, vl256
** ld1w (z[0-9]+)\.d, \1/z, \[x0\]
** ext \2\.b, \2\.b, \2\.b, #248
** st1w \2\.d, \1, \[x8\]
** ret
*/
v32si
si_ext_d_31 (v32si x)
{
return __builtin_shuffle (x, x, (v32si) { PERM4 (31) });
}
/*
** sf_ext_d_1:
** ptrue (p[0-7])\.b, vl256
** ld1w (z[0-9]+)\.d, \1/z, \[x0\]
** ext \2\.b, \2\.b, \2\.b, #8
** st1w \2\.d, \1, \[x8\]
** ret
*/
v32sf
sf_ext_d_1 (v32sf x)
{
return __builtin_shuffle (x, x, (v32si) { PERM4 (1) });
}
/*
** sf_ext_d_31:
** ptrue (p[0-7])\.b, vl256
** ld1w (z[0-9]+)\.d, \1/z, \[x0\]
** ext \2\.b, \2\.b, \2\.b, #248
** st1w \2\.d, \1, \[x8\]
** ret
*/
v32sf
sf_ext_d_31 (v32sf x)
{
return __builtin_shuffle (x, x, (v32si) { PERM4 (31) });
}

gcc/testsuite/gcc.target/aarch64/sve/rev_2.c

@@ -0,0 +1,177 @@
/* { dg-do assemble { target aarch64_asm_sve_ok } } */
/* { dg-options "-O -msve-vector-bits=2048 -mlittle-endian --save-temps" } */
/* { dg-final { check-function-bodies "**" "" } } */
typedef unsigned char v128qi __attribute__((vector_size(128)));
typedef unsigned char v64qi __attribute__((vector_size(64)));
typedef unsigned char v32qi __attribute__((vector_size(32)));
typedef unsigned short v64hi __attribute__((vector_size(128)));
typedef unsigned short v32hi __attribute__((vector_size(64)));
typedef _Float16 v64hf __attribute__((vector_size(128)));
typedef _Float16 v32hf __attribute__((vector_size(64)));
typedef __bf16 v64bf __attribute__((vector_size(128)));
typedef __bf16 v32bf __attribute__((vector_size(64)));
typedef unsigned int v32si __attribute__((vector_size(128)));
typedef float v32sf __attribute__((vector_size(128)));
#define PERM0(B) B, B - 1
#define PERM1(B) PERM0 (B), PERM0 (B - 2)
#define PERM2(B) PERM1 (B), PERM1 (B - 4)
#define PERM3(B) PERM2 (B), PERM2 (B - 8)
#define PERM4(B) PERM3 (B), PERM3 (B - 16)
#define PERM5(B) PERM4 (B), PERM4 (B - 32)
#define PERM6(B) PERM5 (B), PERM5 (B - 64)
/*
** qi_rev_h:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
** rev (z[0-9]+)\.h, \2\.h
** st1b \3\.h, \1, \[x8\]
** ret
*/
v128qi
qi_rev_h (v128qi x)
{
return __builtin_shuffle (x, x, (v128qi) { PERM6 (127) });
}
/*
** qi_rev_s:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.s, \1/z, \[x0\]
** rev (z[0-9]+)\.s, \2\.s
** st1b \3\.s, \1, \[x8\]
** ret
*/
v64qi
qi_rev_s (v64qi x)
{
return __builtin_shuffle (x, x, (v64qi) { PERM5 (63) });
}
/*
** qi_rev_d:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.d, \1/z, \[x0\]
** rev (z[0-9]+)\.d, \2\.d
** st1b \3\.d, \1, \[x8\]
** ret
*/
v32qi
qi_rev_d (v32qi x)
{
return __builtin_shuffle (x, x, (v32qi) { PERM4 (31) });
}
/*
** hi_rev_s:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** rev (z[0-9]+)\.s, \2\.s
** st1h \3\.s, \1, \[x8\]
** ret
*/
v64hi
hi_rev_s (v64hi x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (63) });
}
/*
** hf_rev_s:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** rev (z[0-9]+)\.s, \2\.s
** st1h \3\.s, \1, \[x8\]
** ret
*/
v64hf
hf_rev_s (v64hf x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (63) });
}
/*
** bf_rev_s:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** rev (z[0-9]+)\.s, \2\.s
** st1h \3\.s, \1, \[x8\]
** ret
*/
v64bf
bf_rev_s (v64bf x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (63) });
}
/*
** hi_rev_d:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** rev (z[0-9]+)\.d, \2\.d
** st1h \3\.d, \1, \[x8\]
** ret
*/
v32hi
hi_rev_d (v32hi x)
{
return __builtin_shuffle (x, x, (v32hi) { PERM4 (31) });
}
/*
** hf_rev_d:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** rev (z[0-9]+)\.d, \2\.d
** st1h \3\.d, \1, \[x8\]
** ret
*/
v32hf
hf_rev_d (v32hf x)
{
return __builtin_shuffle (x, x, (v32hi) { PERM4 (31) });
}
/*
** bf_rev_d:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** rev (z[0-9]+)\.d, \2\.d
** st1h \3\.d, \1, \[x8\]
** ret
*/
v32bf
bf_rev_d (v32bf x)
{
return __builtin_shuffle (x, x, (v32hi) { PERM4 (31) });
}
/*
** si_rev_d:
** ptrue (p[0-7])\.b, vl256
** ld1w (z[0-9]+)\.d, \1/z, \[x0\]
** rev (z[0-9]+)\.d, \2\.d
** st1w \3\.d, \1, \[x8\]
** ret
*/
v32si
si_rev_d (v32si x)
{
return __builtin_shuffle (x, x, (v32si) { PERM4 (31) });
}
/*
** sf_rev_d:
** ptrue (p[0-7])\.b, vl256
** ld1w (z[0-9]+)\.d, \1/z, \[x0\]
** rev (z[0-9]+)\.d, \2\.d
** st1w \3\.d, \1, \[x8\]
** ret
*/
v32sf
sf_rev_d (v32sf x)
{
return __builtin_shuffle (x, x, (v32si) { PERM4 (31) });
}

gcc/testsuite/gcc.target/aarch64/sve/revhw_1.c

@@ -0,0 +1,127 @@
/* { dg-do assemble { target aarch64_asm_sve_ok } } */
/* { dg-options "-O -msve-vector-bits=2048 -mlittle-endian --save-temps" } */
/* { dg-final { check-function-bodies "**" "" } } */
typedef unsigned char v128qi __attribute__((vector_size(128)));
typedef unsigned char v64qi __attribute__((vector_size(64)));
typedef unsigned short v64hi __attribute__((vector_size(128)));
typedef _Float16 v64hf __attribute__((vector_size(128)));
typedef __bf16 v64bf __attribute__((vector_size(128)));
#define PERM0(B) B + 1, B
#define PERM1(B) PERM0 (B), PERM0 (B + 2)
#define PERM2(B) PERM1 (B), PERM1 (B + 4)
#define PERM3(B) PERM2 (B), PERM2 (B + 8)
#define PERM4(B) PERM3 (B), PERM3 (B + 16)
#define PERM5(B) PERM4 (B), PERM4 (B + 32)
#define PERM6(B) PERM5 (B), PERM5 (B + 64)
/*
** qi_revh_s:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
** revh (z[0-9]+)\.s, \1/m, \2\.s
** st1b \3\.h, \1, \[x8\]
** ret
*/
v128qi
qi_revh_s (v128qi x)
{
return __builtin_shuffle (x, x, (v128qi) { PERM6 (0) });
}
/*
** qi_revw_d:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.s, \1/z, \[x0\]
** revw (z[0-9]+)\.d, \1/m, \2\.d
** st1b \3\.s, \1, \[x8\]
** ret
*/
v64qi
qi_revw_d (v64qi x)
{
return __builtin_shuffle (x, x, (v64qi) { PERM5 (0) });
}
/*
** hi_revw_d:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** revw (z[0-9]+)\.d, \1/m, \2\.d
** st1h \3\.s, \1, \[x8\]
** ret
*/
v64hi
hi_revw_d (v64hi x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (0) });
}
/*
** hf_revw_d:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** revw (z[0-9]+)\.d, \1/m, \2\.d
** st1h \3\.s, \1, \[x8\]
** ret
*/
v64hf
hf_revw_d (v64hf x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (0) });
}
/*
** bf_revw_d:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** revw (z[0-9]+)\.d, \1/m, \2\.d
** st1h \3\.s, \1, \[x8\]
** ret
*/
v64bf
bf_revw_d (v64bf x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (0) });
}
#undef PERM1
#define PERM1(B) PERM0 (B + 2), PERM0 (B)
/*
** qi_revh_d:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
** revh (z[0-9]+)\.d, \1/m, \2\.d
** st1b \3\.h, \1, \[x8\]
** ret
*/
v128qi
qi_revh_d (v128qi x)
{
return __builtin_shuffle (x, x, (v128qi) { PERM6 (0) });
}
v64qi
qi_revw_q (v64qi x)
{
return __builtin_shuffle (x, x, (v64qi) { PERM5 (0) });
}
v64hi
hi_revw_q (v64hi x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (0) });
}
#undef PERM2
#define PERM2(B) PERM0 (B + 4), PERM0 (B)
v128qi
qi_revh_q (v128qi x)
{
return __builtin_shuffle (x, x, (v128qi) { PERM6 (0) });
}
/* { dg-final { scan-assembler-times {\trev.\t} 6 } } */

gcc/testsuite/gcc.target/aarch64/sve/revhw_2.c

@@ -0,0 +1,127 @@
/* { dg-do assemble { target aarch64_asm_sve_ok } } */
/* { dg-options "-O -msve-vector-bits=2048 -mbig-endian --save-temps" } */
/* { dg-final { check-function-bodies "**" "" } } */
typedef unsigned char v128qi __attribute__((vector_size(128)));
typedef unsigned char v64qi __attribute__((vector_size(64)));
typedef unsigned short v64hi __attribute__((vector_size(128)));
typedef _Float16 v64hf __attribute__((vector_size(128)));
typedef __bf16 v64bf __attribute__((vector_size(128)));
#define PERM0(B) B + 1, B
#define PERM1(B) PERM0 (B), PERM0 (B + 2)
#define PERM2(B) PERM1 (B), PERM1 (B + 4)
#define PERM3(B) PERM2 (B), PERM2 (B + 8)
#define PERM4(B) PERM3 (B), PERM3 (B + 16)
#define PERM5(B) PERM4 (B), PERM4 (B + 32)
#define PERM6(B) PERM5 (B), PERM5 (B + 64)
/*
** qi_revh_s:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
** revh (z[0-9]+)\.s, \1/m, \2\.s
** st1b \3\.h, \1, \[x8\]
** ret
*/
v128qi
qi_revh_s (v128qi x)
{
return __builtin_shuffle (x, x, (v128qi) { PERM6 (0) });
}
/*
** qi_revw_d:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.s, \1/z, \[x0\]
** revw (z[0-9]+)\.d, \1/m, \2\.d
** st1b \3\.s, \1, \[x8\]
** ret
*/
v64qi
qi_revw_d (v64qi x)
{
return __builtin_shuffle (x, x, (v64qi) { PERM5 (0) });
}
/*
** hi_revw_d:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** revw (z[0-9]+)\.d, \1/m, \2\.d
** st1h \3\.s, \1, \[x8\]
** ret
*/
v64hi
hi_revw_d (v64hi x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (0) });
}
/*
** hf_revw_d:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** revw (z[0-9]+)\.d, \1/m, \2\.d
** st1h \3\.s, \1, \[x8\]
** ret
*/
v64hf
hf_revw_d (v64hf x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (0) });
}
/*
** bf_revw_d:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** revw (z[0-9]+)\.d, \1/m, \2\.d
** st1h \3\.s, \1, \[x8\]
** ret
*/
v64bf
bf_revw_d (v64bf x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (0) });
}
#undef PERM1
#define PERM1(B) PERM0 (B + 2), PERM0 (B)
/*
** qi_revh_d:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
** revh (z[0-9]+)\.d, \1/m, \2\.d
** st1b \3\.h, \1, \[x8\]
** ret
*/
v128qi
qi_revh_d (v128qi x)
{
return __builtin_shuffle (x, x, (v128qi) { PERM6 (0) });
}
v64qi
qi_revw_q (v64qi x)
{
return __builtin_shuffle (x, x, (v64qi) { PERM5 (0) });
}
v64hi
hi_revw_q (v64hi x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (0) });
}
#undef PERM2
#define PERM2(B) PERM0 (B + 4), PERM0 (B)
v128qi
qi_revh_q (v128qi x)
{
return __builtin_shuffle (x, x, (v128qi) { PERM6 (0) });
}
/* { dg-final { scan-assembler-times {\trev.\t} 6 } } */

gcc/testsuite/gcc.target/aarch64/sve/slp_perm_8.c

@@ -0,0 +1,18 @@
/* { dg-do compile } */
/* { dg-options "-O2 -ftree-vectorize" } */
void
f (short *restrict s, signed char *restrict c)
{
for (int i = 0; i < 8; i += 2)
{
s[i] = c[i];
s[i + 1] = c[i];
}
}
/* Ideally this would use LD1SB, but currently we use LD1B and
sign-extend it after the permute. */
/* { dg-final { scan-assembler {\tptrue\tp[0-7]\.h, vl6\n} } } */
/* { dg-final { scan-assembler {\tld1s?b\tz[0-9]+\.h} } } */
/* { dg-final { scan-assembler {\ttrn1\tz[0-9]+\.h,} } } */

gcc/testsuite/gcc.target/aarch64/sve/trn1_2.c

@@ -0,0 +1,403 @@
/* { dg-do assemble { target aarch64_asm_sve_ok } } */
/* { dg-options "-O -msve-vector-bits=2048 -mlittle-endian --save-temps" } */
/* { dg-final { check-function-bodies "**" "" } } */
typedef unsigned char v128qi __attribute__((vector_size(128)));
typedef unsigned char v64qi __attribute__((vector_size(64)));
typedef unsigned char v32qi __attribute__((vector_size(32)));
typedef unsigned short v64hi __attribute__((vector_size(128)));
typedef unsigned short v32hi __attribute__((vector_size(64)));
typedef _Float16 v64hf __attribute__((vector_size(128)));
typedef _Float16 v32hf __attribute__((vector_size(64)));
typedef __bf16 v64bf __attribute__((vector_size(128)));
typedef __bf16 v32bf __attribute__((vector_size(64)));
typedef unsigned int v32si __attribute__((vector_size(128)));
typedef float v32sf __attribute__((vector_size(128)));
#define PERM0(B, C) B, B + C
#define PERM1(B, C) PERM0 (B, C), PERM0 (B + 2, C)
#define PERM2(B, C) PERM1 (B, C), PERM1 (B + 4, C)
#define PERM3(B, C) PERM2 (B, C), PERM2 (B + 8, C)
#define PERM4(B, C) PERM3 (B, C), PERM3 (B + 16, C)
#define PERM5(B, C) PERM4 (B, C), PERM4 (B + 32, C)
#define PERM6(B, C) PERM5 (B, C), PERM5 (B + 64, C)
/*
** qi_trn1_h_a:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
** trn1 (z[0-9]+)\.h, \2\.h, \2\.h
** st1b \3\.h, \1, \[x8\]
** ret
*/
v128qi
qi_trn1_h_a (v128qi x)
{
return __builtin_shuffle (x, x, (v128qi) { PERM6 (0, 0) });
}
/*
** qi_trn1_h_b:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
** trn1 (z[0-9]+)\.h, \2\.h, \2\.h
** st1b \3\.h, \1, \[x8\]
** ret
*/
v128qi
qi_trn1_h_b (v128qi x)
{
return __builtin_shuffle (x, x, (v128qi) { PERM6 (0, 128) });
}
/*
** qi_trn1_h_c:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
** trn1 (z[0-9]+)\.h, \2\.h, \2\.h
** st1b \3\.h, \1, \[x8\]
** ret
*/
v128qi
qi_trn1_h_c (v128qi x)
{
return __builtin_shuffle (x, x, (v128qi) { PERM6 (128, 0) });
}
/*
** qi_trn1_h_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1b (z[0-9]+)\.h, \1/z, \[x1\]
** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
** trn1 \3\.h, \3\.h, \2\.h
** st1b \3\.h, \1, \[x8\]
** |
** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
** ld1b (z[0-9]+)\.h, \1/z, \[x1\]
** trn1 \4\.h, \4\.h, \5\.h
** st1b \4\.h, \1, \[x8\]
** )
** ret
*/
v128qi
qi_trn1_h_two_op (v128qi x, v128qi y)
{
return __builtin_shuffle (x, y, (v128qi) { PERM6 (0, 128) });
}
/*
** qi_trn1_s:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.s, \1/z, \[x0\]
** trn1 (z[0-9]+)\.s, \2\.s, \2\.s
** st1b \3\.s, \1, \[x8\]
** ret
*/
v64qi
qi_trn1_s (v64qi x)
{
return __builtin_shuffle (x, x, (v64qi) { PERM5 (0, 64) });
}
/*
** qi_trn1_s_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1b (z[0-9]+)\.s, \1/z, \[x1\]
** ld1b (z[0-9]+)\.s, \1/z, \[x0\]
** trn1 \3\.s, \3\.s, \2\.s
** st1b \3\.s, \1, \[x8\]
** |
** ld1b (z[0-9]+)\.s, \1/z, \[x0\]
** ld1b (z[0-9]+)\.s, \1/z, \[x1\]
** trn1 \4\.s, \4\.s, \5\.s
** st1b \4\.s, \1, \[x8\]
** )
** ret
*/
v64qi
qi_trn1_s_two_op (v64qi x, v64qi y)
{
return __builtin_shuffle (x, y, (v64qi) { PERM5 (0, 64) });
}
/*
** qi_trn1_d:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.d, \1/z, \[x0\]
** trn1 (z[0-9]+)\.d, \2\.d, \2\.d
** st1b \3\.d, \1, \[x8\]
** ret
*/
v32qi
qi_trn1_d (v32qi x)
{
return __builtin_shuffle (x, x, (v32qi) { PERM4 (0, 32) });
}
/*
** qi_trn1_d_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1b (z[0-9]+)\.d, \1/z, \[x1\]
** ld1b (z[0-9]+)\.d, \1/z, \[x0\]
** trn1 \3\.d, \3\.d, \2\.d
** st1b \3\.d, \1, \[x8\]
** |
** ld1b (z[0-9]+)\.d, \1/z, \[x0\]
** ld1b (z[0-9]+)\.d, \1/z, \[x1\]
** trn1 \4\.d, \4\.d, \5\.d
** st1b \4\.d, \1, \[x8\]
** )
** ret
*/
v32qi
qi_trn1_d_two_op (v32qi x, v32qi y)
{
return __builtin_shuffle (x, y, (v32qi) { PERM4 (0, 32) });
}
/*
** hi_trn1_s:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** trn1 (z[0-9]+)\.s, \2\.s, \2\.s
** st1h \3\.s, \1, \[x8\]
** ret
*/
v64hi
hi_trn1_s (v64hi x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (0, 64) });
}
/*
** hi_trn1_s_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** trn1 \3\.s, \3\.s, \2\.s
** st1h \3\.s, \1, \[x8\]
** |
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
** trn1 \4\.s, \4\.s, \5\.s
** st1h \4\.s, \1, \[x8\]
** )
** ret
*/
v64hi
hi_trn1_s_two_op (v64hi x, v64hi y)
{
return __builtin_shuffle (x, y, (v64hi) { PERM5 (0, 64) });
}
/*
** hf_trn1_s:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** trn1 (z[0-9]+)\.s, \2\.s, \2\.s
** st1h \3\.s, \1, \[x8\]
** ret
*/
v64hf
hf_trn1_s (v64hf x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (0, 64) });
}
/*
** hf_trn1_s_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** trn1 \3\.s, \3\.s, \2\.s
** st1h \3\.s, \1, \[x8\]
** |
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
** trn1 \4\.s, \4\.s, \5\.s
** st1h \4\.s, \1, \[x8\]
** )
** ret
*/
v64hf
hf_trn1_s_two_op (v64hf x, v64hf y)
{
return __builtin_shuffle (x, y, (v64hi) { PERM5 (0, 64) });
}
/*
** bf_trn1_s:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** trn1 (z[0-9]+)\.s, \2\.s, \2\.s
** st1h \3\.s, \1, \[x8\]
** ret
*/
v64bf
bf_trn1_s (v64bf x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (0, 64) });
}
/*
** bf_trn1_s_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** trn1 \3\.s, \3\.s, \2\.s
** st1h \3\.s, \1, \[x8\]
** |
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
** trn1 \4\.s, \4\.s, \5\.s
** st1h \4\.s, \1, \[x8\]
** )
** ret
*/
v64bf
bf_trn1_s_two_op (v64bf x, v64bf y)
{
return __builtin_shuffle (x, y, (v64hi) { PERM5 (0, 64) });
}
/*
** hi_trn1_d:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** trn1 (z[0-9]+)\.d, \2\.d, \2\.d
** st1h \3\.d, \1, \[x8\]
** ret
*/
v32hi
hi_trn1_d (v32hi x)
{
return __builtin_shuffle (x, x, (v32hi) { PERM4 (0, 32) });
}
/*
** hi_trn1_d_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** trn1 \3\.d, \3\.d, \2\.d
** st1h \3\.d, \1, \[x8\]
** |
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
** trn1 \4\.d, \4\.d, \5\.d
** st1h \4\.d, \1, \[x8\]
** )
** ret
*/
v32hi
hi_trn1_d_two_op (v32hi x, v32hi y)
{
return __builtin_shuffle (x, y, (v32hi) { PERM4 (0, 32) });
}
/*
** hf_trn1_d:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** trn1 (z[0-9]+)\.d, \2\.d, \2\.d
** st1h \3\.d, \1, \[x8\]
** ret
*/
v32hf
hf_trn1_d (v32hf x)
{
return __builtin_shuffle (x, x, (v32hi) { PERM4 (0, 32) });
}
/*
** hf_trn1_d_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** trn1 \3\.d, \3\.d, \2\.d
** st1h \3\.d, \1, \[x8\]
** |
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
** trn1 \4\.d, \4\.d, \5\.d
** st1h \4\.d, \1, \[x8\]
** )
** ret
*/
v32hf
hf_trn1_d_two_op (v32hf x, v32hf y)
{
return __builtin_shuffle (x, y, (v32hi) { PERM4 (0, 32) });
}
/*
** bf_trn1_d:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** trn1 (z[0-9]+)\.d, \2\.d, \2\.d
** st1h \3\.d, \1, \[x8\]
** ret
*/
v32bf
bf_trn1_d (v32bf x)
{
return __builtin_shuffle (x, x, (v32hi) { PERM4 (0, 32) });
}
/*
** bf_trn1_d_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** trn1 \3\.d, \3\.d, \2\.d
** st1h \3\.d, \1, \[x8\]
** |
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
** trn1 \4\.d, \4\.d, \5\.d
** st1h \4\.d, \1, \[x8\]
** )
** ret
*/
v32bf
bf_trn1_d_two_op (v32bf x, v32bf y)
{
return __builtin_shuffle (x, y, (v32hi) { PERM4 (0, 32) });
}
/*
** si_trn1_d:
** ptrue (p[0-7])\.b, vl256
** ld1w (z[0-9]+)\.d, \1/z, \[x0\]
** trn1 (z[0-9]+)\.d, \2\.d, \2\.d
** st1w \3\.d, \1, \[x8\]
** ret
*/
v32si
si_trn1_d (v32si x)
{
return __builtin_shuffle (x, x, (v32si) { PERM4 (0, 32) });
}
/*
** sf_trn1_d:
** ptrue (p[0-7])\.b, vl256
** ld1w (z[0-9]+)\.d, \1/z, \[x0\]
** trn1 (z[0-9]+)\.d, \2\.d, \2\.d
** st1w \3\.d, \1, \[x8\]
** ret
*/
v32sf
sf_trn1_d (v32sf x)
{
return __builtin_shuffle (x, x, (v32si) { PERM4 (0, 32) });
}

gcc/testsuite/gcc.target/aarch64/sve/trn2_2.c

@@ -0,0 +1,403 @@
/* { dg-do assemble { target aarch64_asm_sve_ok } } */
/* { dg-options "-O -msve-vector-bits=2048 -mlittle-endian --save-temps" } */
/* { dg-final { check-function-bodies "**" "" } } */
typedef unsigned char v128qi __attribute__((vector_size(128)));
typedef unsigned char v64qi __attribute__((vector_size(64)));
typedef unsigned char v32qi __attribute__((vector_size(32)));
typedef unsigned short v64hi __attribute__((vector_size(128)));
typedef unsigned short v32hi __attribute__((vector_size(64)));
typedef _Float16 v64hf __attribute__((vector_size(128)));
typedef _Float16 v32hf __attribute__((vector_size(64)));
typedef __bf16 v64bf __attribute__((vector_size(128)));
typedef __bf16 v32bf __attribute__((vector_size(64)));
typedef unsigned int v32si __attribute__((vector_size(128)));
typedef float v32sf __attribute__((vector_size(128)));
#define PERM0(B, C) B, B + C
#define PERM1(B, C) PERM0 (B, C), PERM0 (B + 2, C)
#define PERM2(B, C) PERM1 (B, C), PERM1 (B + 4, C)
#define PERM3(B, C) PERM2 (B, C), PERM2 (B + 8, C)
#define PERM4(B, C) PERM3 (B, C), PERM3 (B + 16, C)
#define PERM5(B, C) PERM4 (B, C), PERM4 (B + 32, C)
#define PERM6(B, C) PERM5 (B, C), PERM5 (B + 64, C)
/*
** qi_trn2_h_a:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
** trn2 (z[0-9]+)\.h, \2\.h, \2\.h
** st1b \3\.h, \1, \[x8\]
** ret
*/
v128qi
qi_trn2_h_a (v128qi x)
{
return __builtin_shuffle (x, x, (v128qi) { PERM6 (1, 0) });
}
/*
** qi_trn2_h_b:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
** trn2 (z[0-9]+)\.h, \2\.h, \2\.h
** st1b \3\.h, \1, \[x8\]
** ret
*/
v128qi
qi_trn2_h_b (v128qi x)
{
return __builtin_shuffle (x, x, (v128qi) { PERM6 (1, 128) });
}
/*
** qi_trn2_h_c:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
** trn2 (z[0-9]+)\.h, \2\.h, \2\.h
** st1b \3\.h, \1, \[x8\]
** ret
*/
v128qi
qi_trn2_h_c (v128qi x)
{
return __builtin_shuffle (x, x, (v128qi) { PERM6 (1, 0) });
}
/*
** qi_trn2_h_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1b (z[0-9]+)\.h, \1/z, \[x1\]
** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
** trn2 \3\.h, \3\.h, \2\.h
** st1b \3\.h, \1, \[x8\]
** |
** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
** ld1b (z[0-9]+)\.h, \1/z, \[x1\]
** trn2 \4\.h, \4\.h, \5\.h
** st1b \4\.h, \1, \[x8\]
** )
** ret
*/
v128qi
qi_trn2_h_two_op (v128qi x, v128qi y)
{
return __builtin_shuffle (x, y, (v128qi) { PERM6 (1, 128) });
}
/*
** qi_trn2_s:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.s, \1/z, \[x0\]
** trn2 (z[0-9]+)\.s, \2\.s, \2\.s
** st1b \3\.s, \1, \[x8\]
** ret
*/
v64qi
qi_trn2_s (v64qi x)
{
return __builtin_shuffle (x, x, (v64qi) { PERM5 (1, 64) });
}
/*
** qi_trn2_s_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1b (z[0-9]+)\.s, \1/z, \[x1\]
** ld1b (z[0-9]+)\.s, \1/z, \[x0\]
** trn2 \3\.s, \3\.s, \2\.s
** st1b \3\.s, \1, \[x8\]
** |
** ld1b (z[0-9]+)\.s, \1/z, \[x0\]
** ld1b (z[0-9]+)\.s, \1/z, \[x1\]
** trn2 \4\.s, \4\.s, \5\.s
** st1b \4\.s, \1, \[x8\]
** )
** ret
*/
v64qi
qi_trn2_s_two_op (v64qi x, v64qi y)
{
return __builtin_shuffle (x, y, (v64qi) { PERM5 (1, 64) });
}
/*
** qi_trn2_d:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.d, \1/z, \[x0\]
** trn2 (z[0-9]+)\.d, \2\.d, \2\.d
** st1b \3\.d, \1, \[x8\]
** ret
*/
v32qi
qi_trn2_d (v32qi x)
{
return __builtin_shuffle (x, x, (v32qi) { PERM4 (1, 32) });
}
/*
** qi_trn2_d_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1b (z[0-9]+)\.d, \1/z, \[x1\]
** ld1b (z[0-9]+)\.d, \1/z, \[x0\]
** trn2 \3\.d, \3\.d, \2\.d
** st1b \3\.d, \1, \[x8\]
** |
** ld1b (z[0-9]+)\.d, \1/z, \[x0\]
** ld1b (z[0-9]+)\.d, \1/z, \[x1\]
** trn2 \4\.d, \4\.d, \5\.d
** st1b \4\.d, \1, \[x8\]
** )
** ret
*/
v32qi
qi_trn2_d_two_op (v32qi x, v32qi y)
{
return __builtin_shuffle (x, y, (v32qi) { PERM4 (1, 32) });
}
/*
** hi_trn2_s:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** trn2 (z[0-9]+)\.s, \2\.s, \2\.s
** st1h \3\.s, \1, \[x8\]
** ret
*/
v64hi
hi_trn2_s (v64hi x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (1, 64) });
}
/*
** hi_trn2_s_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** trn2 \3\.s, \3\.s, \2\.s
** st1h \3\.s, \1, \[x8\]
** |
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
** trn2 \4\.s, \4\.s, \5\.s
** st1h \4\.s, \1, \[x8\]
** )
** ret
*/
v64hi
hi_trn2_s_two_op (v64hi x, v64hi y)
{
return __builtin_shuffle (x, y, (v64hi) { PERM5 (1, 64) });
}
/*
** hf_trn2_s:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** trn2 (z[0-9]+)\.s, \2\.s, \2\.s
** st1h \3\.s, \1, \[x8\]
** ret
*/
v64hf
hf_trn2_s (v64hf x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (1, 64) });
}
/*
** hf_trn2_s_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** trn2 \3\.s, \3\.s, \2\.s
** st1h \3\.s, \1, \[x8\]
** |
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
** trn2 \4\.s, \4\.s, \5\.s
** st1h \4\.s, \1, \[x8\]
** )
** ret
*/
v64hf
hf_trn2_s_two_op (v64hf x, v64hf y)
{
return __builtin_shuffle (x, y, (v64hi) { PERM5 (1, 64) });
}
/*
** bf_trn2_s:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** trn2 (z[0-9]+)\.s, \2\.s, \2\.s
** st1h \3\.s, \1, \[x8\]
** ret
*/
v64bf
bf_trn2_s (v64bf x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (1, 64) });
}
/*
** bf_trn2_s_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** trn2 \3\.s, \3\.s, \2\.s
** st1h \3\.s, \1, \[x8\]
** |
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
** trn2 \4\.s, \4\.s, \5\.s
** st1h \4\.s, \1, \[x8\]
** )
** ret
*/
v64bf
bf_trn2_s_two_op (v64bf x, v64bf y)
{
return __builtin_shuffle (x, y, (v64hi) { PERM5 (1, 64) });
}
/*
** hi_trn2_d:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** trn2 (z[0-9]+)\.d, \2\.d, \2\.d
** st1h \3\.d, \1, \[x8\]
** ret
*/
v32hi
hi_trn2_d (v32hi x)
{
return __builtin_shuffle (x, x, (v32hi) { PERM4 (1, 32) });
}
/*
** hi_trn2_d_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** trn2 \3\.d, \3\.d, \2\.d
** st1h \3\.d, \1, \[x8\]
** |
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
** trn2 \4\.d, \4\.d, \5\.d
** st1h \4\.d, \1, \[x8\]
** )
** ret
*/
v32hi
hi_trn2_d_two_op (v32hi x, v32hi y)
{
return __builtin_shuffle (x, y, (v32hi) { PERM4 (1, 32) });
}
/*
** hf_trn2_d:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** trn2 (z[0-9]+)\.d, \2\.d, \2\.d
** st1h \3\.d, \1, \[x8\]
** ret
*/
v32hf
hf_trn2_d (v32hf x)
{
return __builtin_shuffle (x, x, (v32hi) { PERM4 (1, 32) });
}
/*
** hf_trn2_d_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** trn2 \3\.d, \3\.d, \2\.d
** st1h \3\.d, \1, \[x8\]
** |
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
** trn2 \4\.d, \4\.d, \5\.d
** st1h \4\.d, \1, \[x8\]
** )
** ret
*/
v32hf
hf_trn2_d_two_op (v32hf x, v32hf y)
{
return __builtin_shuffle (x, y, (v32hi) { PERM4 (1, 32) });
}
/*
** bf_trn2_d:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** trn2 (z[0-9]+)\.d, \2\.d, \2\.d
** st1h \3\.d, \1, \[x8\]
** ret
*/
v32bf
bf_trn2_d (v32bf x)
{
return __builtin_shuffle (x, x, (v32hi) { PERM4 (1, 32) });
}
/*
** bf_trn2_d_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** trn2 \3\.d, \3\.d, \2\.d
** st1h \3\.d, \1, \[x8\]
** |
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
** trn2 \4\.d, \4\.d, \5\.d
** st1h \4\.d, \1, \[x8\]
** )
** ret
*/
v32bf
bf_trn2_d_two_op (v32bf x, v32bf y)
{
return __builtin_shuffle (x, y, (v32hi) { PERM4 (1, 32) });
}
/*
** si_trn2_d:
** ptrue (p[0-7])\.b, vl256
** ld1w (z[0-9]+)\.d, \1/z, \[x0\]
** trn2 (z[0-9]+)\.d, \2\.d, \2\.d
** st1w \3\.d, \1, \[x8\]
** ret
*/
v32si
si_trn2_d (v32si x)
{
return __builtin_shuffle (x, x, (v32si) { PERM4 (1, 32) });
}
/*
** sf_trn2_d:
** ptrue (p[0-7])\.b, vl256
** ld1w (z[0-9]+)\.d, \1/z, \[x0\]
** trn2 (z[0-9]+)\.d, \2\.d, \2\.d
** st1w \3\.d, \1, \[x8\]
** ret
*/
v32sf
sf_trn2_d (v32sf x)
{
return __builtin_shuffle (x, x, (v32si) { PERM4 (1, 32) });
}
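/* Note on the masks above: TRN2 interleaves the odd-numbered container
   lanes of its two inputs, so for 32-lane operands a mask such as
   PERM4 (1, 32) is expected to enumerate 1, 33, 3, 35, ..., 31, 63
   (this assumes the PERM macros defined earlier in this file expand to
   the pairs 2k + 1, 2k + 1 + C, which is what the calls here suggest).  */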

View file

@@ -0,0 +1,375 @@
/* { dg-do assemble { target aarch64_asm_sve_ok } } */
/* { dg-options "-O -msve-vector-bits=2048 -mlittle-endian --save-temps" } */
/* { dg-final { check-function-bodies "**" "" } } */
typedef unsigned char v128qi __attribute__((vector_size(128)));
typedef unsigned char v64qi __attribute__((vector_size(64)));
typedef unsigned char v32qi __attribute__((vector_size(32)));
typedef unsigned short v64hi __attribute__((vector_size(128)));
typedef unsigned short v32hi __attribute__((vector_size(64)));
typedef _Float16 v64hf __attribute__((vector_size(128)));
typedef _Float16 v32hf __attribute__((vector_size(64)));
typedef __bf16 v64bf __attribute__((vector_size(128)));
typedef __bf16 v32bf __attribute__((vector_size(64)));
typedef unsigned int v32si __attribute__((vector_size(128)));
typedef float v32sf __attribute__((vector_size(128)));
#define PERM0(B) B, B + 2
#define PERM1(B) PERM0 (B), PERM0 (B + 4)
#define PERM2(B) PERM1 (B), PERM1 (B + 8)
#define PERM3(B) PERM2 (B), PERM2 (B + 16)
#define PERM4(B) PERM3 (B), PERM3 (B + 32)
#define PERM5(B) PERM4 (B), PERM4 (B + 64)
#define PERM6(B) PERM5 (B), PERM5 (B + 128)
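/* Worked example of the index masks used below: each PERM<n> (B) expands
   to an even-strided run starting at B, so PERM2 (0) is
   0, 2, 4, 6, 8, 10, 12, 14.  Picking every second lane of the
   concatenated { x, y }, starting at lane 0, is the UZP1 operation,
   which is why these shuffles should map to the uzp1 patterns checked
   above each function.  */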
/*
** qi_uzp1_h:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
** uzp1 (z[0-9]+)\.h, \2\.h, \2\.h
** st1b \3\.h, \1, \[x8\]
** ret
*/
v128qi
qi_uzp1_h (v128qi x)
{
return __builtin_shuffle (x, x, (v128qi) { PERM6 (0) });
}
/*
** qi_uzp1_h_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1b (z[0-9]+)\.h, \1/z, \[x1\]
** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
** uzp1 \3\.h, \3\.h, \2\.h
** st1b \3\.h, \1, \[x8\]
** |
** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
** ld1b (z[0-9]+)\.h, \1/z, \[x1\]
** uzp1 \4\.h, \4\.h, \5\.h
** st1b \4\.h, \1, \[x8\]
** )
** ret
*/
v128qi
qi_uzp1_h_two_op (v128qi x, v128qi y)
{
return __builtin_shuffle (x, y, (v128qi) { PERM6 (0) });
}
/*
** qi_uzp1_s:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.s, \1/z, \[x0\]
** uzp1 (z[0-9]+)\.s, \2\.s, \2\.s
** st1b \3\.s, \1, \[x8\]
** ret
*/
v64qi
qi_uzp1_s (v64qi x)
{
return __builtin_shuffle (x, x, (v64qi) { PERM5 (0) });
}
/*
** qi_uzp1_s_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1b (z[0-9]+)\.s, \1/z, \[x1\]
** ld1b (z[0-9]+)\.s, \1/z, \[x0\]
** uzp1 \3\.s, \3\.s, \2\.s
** st1b \3\.s, \1, \[x8\]
** |
** ld1b (z[0-9]+)\.s, \1/z, \[x0\]
** ld1b (z[0-9]+)\.s, \1/z, \[x1\]
** uzp1 \4\.s, \4\.s, \5\.s
** st1b \4\.s, \1, \[x8\]
** )
** ret
*/
v64qi
qi_uzp1_s_two_op (v64qi x, v64qi y)
{
return __builtin_shuffle (x, y, (v64qi) { PERM5 (0) });
}
/*
** qi_uzp1_d:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.d, \1/z, \[x0\]
** uzp1 (z[0-9]+)\.d, \2\.d, \2\.d
** st1b \3\.d, \1, \[x8\]
** ret
*/
v32qi
qi_uzp1_d (v32qi x)
{
return __builtin_shuffle (x, x, (v32qi) { PERM4 (0) });
}
/*
** qi_uzp1_d_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1b (z[0-9]+)\.d, \1/z, \[x1\]
** ld1b (z[0-9]+)\.d, \1/z, \[x0\]
** uzp1 \3\.d, \3\.d, \2\.d
** st1b \3\.d, \1, \[x8\]
** |
** ld1b (z[0-9]+)\.d, \1/z, \[x0\]
** ld1b (z[0-9]+)\.d, \1/z, \[x1\]
** uzp1 \4\.d, \4\.d, \5\.d
** st1b \4\.d, \1, \[x8\]
** )
** ret
*/
v32qi
qi_uzp1_d_two_op (v32qi x, v32qi y)
{
return __builtin_shuffle (x, y, (v32qi) { PERM4 (0) });
}
/*
** hi_uzp1_s:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** uzp1 (z[0-9]+)\.s, \2\.s, \2\.s
** st1h \3\.s, \1, \[x8\]
** ret
*/
v64hi
hi_uzp1_s (v64hi x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (0) });
}
/*
** hi_uzp1_s_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** uzp1 \3\.s, \3\.s, \2\.s
** st1h \3\.s, \1, \[x8\]
** |
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
** uzp1 \4\.s, \4\.s, \5\.s
** st1h \4\.s, \1, \[x8\]
** )
** ret
*/
v64hi
hi_uzp1_s_two_op (v64hi x, v64hi y)
{
return __builtin_shuffle (x, y, (v64hi) { PERM5 (0) });
}
/*
** hf_uzp1_s:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** uzp1 (z[0-9]+)\.s, \2\.s, \2\.s
** st1h \3\.s, \1, \[x8\]
** ret
*/
v64hf
hf_uzp1_s (v64hf x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (0) });
}
/*
** hf_uzp1_s_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** uzp1 \3\.s, \3\.s, \2\.s
** st1h \3\.s, \1, \[x8\]
** |
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
** uzp1 \4\.s, \4\.s, \5\.s
** st1h \4\.s, \1, \[x8\]
** )
** ret
*/
v64hf
hf_uzp1_s_two_op (v64hf x, v64hf y)
{
return __builtin_shuffle (x, y, (v64hi) { PERM5 (0) });
}
/*
** bf_uzp1_s:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** uzp1 (z[0-9]+)\.s, \2\.s, \2\.s
** st1h \3\.s, \1, \[x8\]
** ret
*/
v64bf
bf_uzp1_s (v64bf x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (0) });
}
/*
** bf_uzp1_s_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** uzp1 \3\.s, \3\.s, \2\.s
** st1h \3\.s, \1, \[x8\]
** |
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
** uzp1 \4\.s, \4\.s, \5\.s
** st1h \4\.s, \1, \[x8\]
** )
** ret
*/
v64bf
bf_uzp1_s_two_op (v64bf x, v64bf y)
{
return __builtin_shuffle (x, y, (v64hi) { PERM5 (0) });
}
/*
** hi_uzp1_d:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** uzp1 (z[0-9]+)\.d, \2\.d, \2\.d
** st1h \3\.d, \1, \[x8\]
** ret
*/
v32hi
hi_uzp1_d (v32hi x)
{
return __builtin_shuffle (x, x, (v32hi) { PERM4 (0) });
}
/*
** hi_uzp1_d_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** uzp1 \3\.d, \3\.d, \2\.d
** st1h \3\.d, \1, \[x8\]
** |
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
** uzp1 \4\.d, \4\.d, \5\.d
** st1h \4\.d, \1, \[x8\]
** )
** ret
*/
v32hi
hi_uzp1_d_two_op (v32hi x, v32hi y)
{
return __builtin_shuffle (x, y, (v32hi) { PERM4 (0) });
}
/*
** hf_uzp1_d:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** uzp1 (z[0-9]+)\.d, \2\.d, \2\.d
** st1h \3\.d, \1, \[x8\]
** ret
*/
v32hf
hf_uzp1_d (v32hf x)
{
return __builtin_shuffle (x, x, (v32hi) { PERM4 (0) });
}
/*
** hf_uzp1_d_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** uzp1 \3\.d, \3\.d, \2\.d
** st1h \3\.d, \1, \[x8\]
** |
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
** uzp1 \4\.d, \4\.d, \5\.d
** st1h \4\.d, \1, \[x8\]
** )
** ret
*/
v32hf
hf_uzp1_d_two_op (v32hf x, v32hf y)
{
return __builtin_shuffle (x, y, (v32hi) { PERM4 (0) });
}
/*
** bf_uzp1_d:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** uzp1 (z[0-9]+)\.d, \2\.d, \2\.d
** st1h \3\.d, \1, \[x8\]
** ret
*/
v32bf
bf_uzp1_d (v32bf x)
{
return __builtin_shuffle (x, x, (v32hi) { PERM4 (0) });
}
/*
** bf_uzp1_d_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** uzp1 \3\.d, \3\.d, \2\.d
** st1h \3\.d, \1, \[x8\]
** |
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
** uzp1 \4\.d, \4\.d, \5\.d
** st1h \4\.d, \1, \[x8\]
** )
** ret
*/
v32bf
bf_uzp1_d_two_op (v32bf x, v32bf y)
{
return __builtin_shuffle (x, y, (v32hi) { PERM4 (0) });
}
/*
** si_uzp1_d:
** ptrue (p[0-7])\.b, vl256
** ld1w (z[0-9]+)\.d, \1/z, \[x0\]
** uzp1 (z[0-9]+)\.d, \2\.d, \2\.d
** st1w \3\.d, \1, \[x8\]
** ret
*/
v32si
si_uzp1_d (v32si x)
{
return __builtin_shuffle (x, x, (v32si) { PERM4 (0) });
}
/*
** sf_uzp1_d:
** ptrue (p[0-7])\.b, vl256
** ld1w (z[0-9]+)\.d, \1/z, \[x0\]
** uzp1 (z[0-9]+)\.d, \2\.d, \2\.d
** st1w \3\.d, \1, \[x8\]
** ret
*/
v32sf
sf_uzp1_d (v32sf x)
{
return __builtin_shuffle (x, x, (v32si) { PERM4 (0) });
}

View file

@@ -0,0 +1,375 @@
/* { dg-do assemble { target aarch64_asm_sve_ok } } */
/* { dg-options "-O -msve-vector-bits=2048 -mlittle-endian --save-temps" } */
/* { dg-final { check-function-bodies "**" "" } } */
typedef unsigned char v128qi __attribute__((vector_size(128)));
typedef unsigned char v64qi __attribute__((vector_size(64)));
typedef unsigned char v32qi __attribute__((vector_size(32)));
typedef unsigned short v64hi __attribute__((vector_size(128)));
typedef unsigned short v32hi __attribute__((vector_size(64)));
typedef _Float16 v64hf __attribute__((vector_size(128)));
typedef _Float16 v32hf __attribute__((vector_size(64)));
typedef __bf16 v64bf __attribute__((vector_size(128)));
typedef __bf16 v32bf __attribute__((vector_size(64)));
typedef unsigned int v32si __attribute__((vector_size(128)));
typedef float v32sf __attribute__((vector_size(128)));
#define PERM0(B) B, B + 2
#define PERM1(B) PERM0 (B), PERM0 (B + 4)
#define PERM2(B) PERM1 (B), PERM1 (B + 8)
#define PERM3(B) PERM2 (B), PERM2 (B + 16)
#define PERM4(B) PERM3 (B), PERM3 (B + 32)
#define PERM5(B) PERM4 (B), PERM4 (B + 64)
#define PERM6(B) PERM5 (B), PERM5 (B + 128)
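/* Worked example: with the same macros as in uzp1_2.c, PERM<n> (1)
   expands to the odd indices, so PERM2 (1) is
   1, 3, 5, 7, 9, 11, 13, 15.  Picking every second lane of the
   concatenated { x, y }, starting at lane 1, is the UZP2 operation.  */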
/*
** qi_uzp2_h:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
** uzp2 (z[0-9]+)\.h, \2\.h, \2\.h
** st1b \3\.h, \1, \[x8\]
** ret
*/
v128qi
qi_uzp2_h (v128qi x)
{
return __builtin_shuffle (x, x, (v128qi) { PERM6 (1) });
}
/*
** qi_uzp2_h_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1b (z[0-9]+)\.h, \1/z, \[x1\]
** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
** uzp2 \3\.h, \3\.h, \2\.h
** st1b \3\.h, \1, \[x8\]
** |
** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
** ld1b (z[0-9]+)\.h, \1/z, \[x1\]
** uzp2 \4\.h, \4\.h, \5\.h
** st1b \4\.h, \1, \[x8\]
** )
** ret
*/
v128qi
qi_uzp2_h_two_op (v128qi x, v128qi y)
{
return __builtin_shuffle (x, y, (v128qi) { PERM6 (1) });
}
/*
** qi_uzp2_s:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.s, \1/z, \[x0\]
** uzp2 (z[0-9]+)\.s, \2\.s, \2\.s
** st1b \3\.s, \1, \[x8\]
** ret
*/
v64qi
qi_uzp2_s (v64qi x)
{
return __builtin_shuffle (x, x, (v64qi) { PERM5 (1) });
}
/*
** qi_uzp2_s_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1b (z[0-9]+)\.s, \1/z, \[x1\]
** ld1b (z[0-9]+)\.s, \1/z, \[x0\]
** uzp2 \3\.s, \3\.s, \2\.s
** st1b \3\.s, \1, \[x8\]
** |
** ld1b (z[0-9]+)\.s, \1/z, \[x0\]
** ld1b (z[0-9]+)\.s, \1/z, \[x1\]
** uzp2 \4\.s, \4\.s, \5\.s
** st1b \4\.s, \1, \[x8\]
** )
** ret
*/
v64qi
qi_uzp2_s_two_op (v64qi x, v64qi y)
{
return __builtin_shuffle (x, y, (v64qi) { PERM5 (1) });
}
/*
** qi_uzp2_d:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.d, \1/z, \[x0\]
** uzp2 (z[0-9]+)\.d, \2\.d, \2\.d
** st1b \3\.d, \1, \[x8\]
** ret
*/
v32qi
qi_uzp2_d (v32qi x)
{
return __builtin_shuffle (x, x, (v32qi) { PERM4 (1) });
}
/*
** qi_uzp2_d_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1b (z[0-9]+)\.d, \1/z, \[x1\]
** ld1b (z[0-9]+)\.d, \1/z, \[x0\]
** uzp2 \3\.d, \3\.d, \2\.d
** st1b \3\.d, \1, \[x8\]
** |
** ld1b (z[0-9]+)\.d, \1/z, \[x0\]
** ld1b (z[0-9]+)\.d, \1/z, \[x1\]
** uzp2 \4\.d, \4\.d, \5\.d
** st1b \4\.d, \1, \[x8\]
** )
** ret
*/
v32qi
qi_uzp2_d_two_op (v32qi x, v32qi y)
{
return __builtin_shuffle (x, y, (v32qi) { PERM4 (1) });
}
/*
** hi_uzp2_s:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** uzp2 (z[0-9]+)\.s, \2\.s, \2\.s
** st1h \3\.s, \1, \[x8\]
** ret
*/
v64hi
hi_uzp2_s (v64hi x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (1) });
}
/*
** hi_uzp2_s_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** uzp2 \3\.s, \3\.s, \2\.s
** st1h \3\.s, \1, \[x8\]
** |
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
** uzp2 \4\.s, \4\.s, \5\.s
** st1h \4\.s, \1, \[x8\]
** )
** ret
*/
v64hi
hi_uzp2_s_two_op (v64hi x, v64hi y)
{
return __builtin_shuffle (x, y, (v64hi) { PERM5 (1) });
}
/*
** hf_uzp2_s:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** uzp2 (z[0-9]+)\.s, \2\.s, \2\.s
** st1h \3\.s, \1, \[x8\]
** ret
*/
v64hf
hf_uzp2_s (v64hf x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (1) });
}
/*
** hf_uzp2_s_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** uzp2 \3\.s, \3\.s, \2\.s
** st1h \3\.s, \1, \[x8\]
** |
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
** uzp2 \4\.s, \4\.s, \5\.s
** st1h \4\.s, \1, \[x8\]
** )
** ret
*/
v64hf
hf_uzp2_s_two_op (v64hf x, v64hf y)
{
return __builtin_shuffle (x, y, (v64hi) { PERM5 (1) });
}
/*
** bf_uzp2_s:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** uzp2 (z[0-9]+)\.s, \2\.s, \2\.s
** st1h \3\.s, \1, \[x8\]
** ret
*/
v64bf
bf_uzp2_s (v64bf x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (1) });
}
/*
** bf_uzp2_s_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** uzp2 \3\.s, \3\.s, \2\.s
** st1h \3\.s, \1, \[x8\]
** |
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
** uzp2 \4\.s, \4\.s, \5\.s
** st1h \4\.s, \1, \[x8\]
** )
** ret
*/
v64bf
bf_uzp2_s_two_op (v64bf x, v64bf y)
{
return __builtin_shuffle (x, y, (v64hi) { PERM5 (1) });
}
/*
** hi_uzp2_d:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** uzp2 (z[0-9]+)\.d, \2\.d, \2\.d
** st1h \3\.d, \1, \[x8\]
** ret
*/
v32hi
hi_uzp2_d (v32hi x)
{
return __builtin_shuffle (x, x, (v32hi) { PERM4 (1) });
}
/*
** hi_uzp2_d_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** uzp2 \3\.d, \3\.d, \2\.d
** st1h \3\.d, \1, \[x8\]
** |
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
** uzp2 \4\.d, \4\.d, \5\.d
** st1h \4\.d, \1, \[x8\]
** )
** ret
*/
v32hi
hi_uzp2_d_two_op (v32hi x, v32hi y)
{
return __builtin_shuffle (x, y, (v32hi) { PERM4 (1) });
}
/*
** hf_uzp2_d:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** uzp2 (z[0-9]+)\.d, \2\.d, \2\.d
** st1h \3\.d, \1, \[x8\]
** ret
*/
v32hf
hf_uzp2_d (v32hf x)
{
return __builtin_shuffle (x, x, (v32hi) { PERM4 (1) });
}
/*
** hf_uzp2_d_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** uzp2 \3\.d, \3\.d, \2\.d
** st1h \3\.d, \1, \[x8\]
** |
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
** uzp2 \4\.d, \4\.d, \5\.d
** st1h \4\.d, \1, \[x8\]
** )
** ret
*/
v32hf
hf_uzp2_d_two_op (v32hf x, v32hf y)
{
return __builtin_shuffle (x, y, (v32hi) { PERM4 (1) });
}
/*
** bf_uzp2_d:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** uzp2 (z[0-9]+)\.d, \2\.d, \2\.d
** st1h \3\.d, \1, \[x8\]
** ret
*/
v32bf
bf_uzp2_d (v32bf x)
{
return __builtin_shuffle (x, x, (v32hi) { PERM4 (1) });
}
/*
** bf_uzp2_d_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** uzp2 \3\.d, \3\.d, \2\.d
** st1h \3\.d, \1, \[x8\]
** |
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
** uzp2 \4\.d, \4\.d, \5\.d
** st1h \4\.d, \1, \[x8\]
** )
** ret
*/
v32bf
bf_uzp2_d_two_op (v32bf x, v32bf y)
{
return __builtin_shuffle (x, y, (v32hi) { PERM4 (1) });
}
/*
** si_uzp2_d:
** ptrue (p[0-7])\.b, vl256
** ld1w (z[0-9]+)\.d, \1/z, \[x0\]
** uzp2 (z[0-9]+)\.d, \2\.d, \2\.d
** st1w \3\.d, \1, \[x8\]
** ret
*/
v32si
si_uzp2_d (v32si x)
{
return __builtin_shuffle (x, x, (v32si) { PERM4 (1) });
}
/*
** sf_uzp2_d:
** ptrue (p[0-7])\.b, vl256
** ld1w (z[0-9]+)\.d, \1/z, \[x0\]
** uzp2 (z[0-9]+)\.d, \2\.d, \2\.d
** st1w \3\.d, \1, \[x8\]
** ret
*/
v32sf
sf_uzp2_d (v32sf x)
{
return __builtin_shuffle (x, x, (v32si) { PERM4 (1) });
}

View file

@@ -0,0 +1,403 @@
/* { dg-do assemble { target aarch64_asm_sve_ok } } */
/* { dg-options "-O -msve-vector-bits=2048 -mlittle-endian --save-temps" } */
/* { dg-final { check-function-bodies "**" "" } } */
typedef unsigned char v128qi __attribute__((vector_size(128)));
typedef unsigned char v64qi __attribute__((vector_size(64)));
typedef unsigned char v32qi __attribute__((vector_size(32)));
typedef unsigned short v64hi __attribute__((vector_size(128)));
typedef unsigned short v32hi __attribute__((vector_size(64)));
typedef _Float16 v64hf __attribute__((vector_size(128)));
typedef _Float16 v32hf __attribute__((vector_size(64)));
typedef __bf16 v64bf __attribute__((vector_size(128)));
typedef __bf16 v32bf __attribute__((vector_size(64)));
typedef unsigned int v32si __attribute__((vector_size(128)));
typedef float v32sf __attribute__((vector_size(128)));
#define PERM0(B, C) B, B + C
#define PERM1(B, C) PERM0 (B, C), PERM0 (B + 1, C)
#define PERM2(B, C) PERM1 (B, C), PERM1 (B + 2, C)
#define PERM3(B, C) PERM2 (B, C), PERM2 (B + 4, C)
#define PERM4(B, C) PERM3 (B, C), PERM3 (B + 8, C)
#define PERM5(B, C) PERM4 (B, C), PERM4 (B + 16, C)
#define PERM6(B, C) PERM5 (B, C), PERM5 (B + 32, C)
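/* Worked example: PERM<n> (B, C) expands to the pairs B + k, B + k + C
   for consecutive k, so PERM2 (0, 8) is 0, 8, 1, 9, 2, 10, 3, 11 for
   8-lane operands.  With C equal to the number of lanes in one operand,
   this interleaves the low halves of x and y, which is the ZIP1
   operation; the (0, 0), (0, 128) and (128, 0) masks used for the
   qi_zip1_h tests below all describe the same lane selection because
   both shuffle operands are x.  */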
/*
** qi_zip1_h_a:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
** zip1 (z[0-9]+)\.h, \2\.h, \2\.h
** st1b \3\.h, \1, \[x8\]
** ret
*/
v128qi
qi_zip1_h_a (v128qi x)
{
return __builtin_shuffle (x, x, (v128qi) { PERM6 (0, 0) });
}
/*
** qi_zip1_h_b:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
** zip1 (z[0-9]+)\.h, \2\.h, \2\.h
** st1b \3\.h, \1, \[x8\]
** ret
*/
v128qi
qi_zip1_h_b (v128qi x)
{
return __builtin_shuffle (x, x, (v128qi) { PERM6 (0, 128) });
}
/*
** qi_zip1_h_c:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
** zip1 (z[0-9]+)\.h, \2\.h, \2\.h
** st1b \3\.h, \1, \[x8\]
** ret
*/
v128qi
qi_zip1_h_c (v128qi x)
{
return __builtin_shuffle (x, x, (v128qi) { PERM6 (128, 0) });
}
/*
** qi_zip1_h_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1b (z[0-9]+)\.h, \1/z, \[x1\]
** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
** zip1 \3\.h, \3\.h, \2\.h
** st1b \3\.h, \1, \[x8\]
** |
** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
** ld1b (z[0-9]+)\.h, \1/z, \[x1\]
** zip1 \4\.h, \4\.h, \5\.h
** st1b \4\.h, \1, \[x8\]
** )
** ret
*/
v128qi
qi_zip1_h_two_op (v128qi x, v128qi y)
{
return __builtin_shuffle (x, y, (v128qi) { PERM6 (0, 128) });
}
/*
** qi_zip1_s:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.s, \1/z, \[x0\]
** zip1 (z[0-9]+)\.s, \2\.s, \2\.s
** st1b \3\.s, \1, \[x8\]
** ret
*/
v64qi
qi_zip1_s (v64qi x)
{
return __builtin_shuffle (x, x, (v64qi) { PERM5 (0, 64) });
}
/*
** qi_zip1_s_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1b (z[0-9]+)\.s, \1/z, \[x1\]
** ld1b (z[0-9]+)\.s, \1/z, \[x0\]
** zip1 \3\.s, \3\.s, \2\.s
** st1b \3\.s, \1, \[x8\]
** |
** ld1b (z[0-9]+)\.s, \1/z, \[x0\]
** ld1b (z[0-9]+)\.s, \1/z, \[x1\]
** zip1 \4\.s, \4\.s, \5\.s
** st1b \4\.s, \1, \[x8\]
** )
** ret
*/
v64qi
qi_zip1_s_two_op (v64qi x, v64qi y)
{
return __builtin_shuffle (x, y, (v64qi) { PERM5 (0, 64) });
}
/*
** qi_zip1_d:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.d, \1/z, \[x0\]
** zip1 (z[0-9]+)\.d, \2\.d, \2\.d
** st1b \3\.d, \1, \[x8\]
** ret
*/
v32qi
qi_zip1_d (v32qi x)
{
return __builtin_shuffle (x, x, (v32qi) { PERM4 (0, 32) });
}
/*
** qi_zip1_d_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1b (z[0-9]+)\.d, \1/z, \[x1\]
** ld1b (z[0-9]+)\.d, \1/z, \[x0\]
** zip1 \3\.d, \3\.d, \2\.d
** st1b \3\.d, \1, \[x8\]
** |
** ld1b (z[0-9]+)\.d, \1/z, \[x0\]
** ld1b (z[0-9]+)\.d, \1/z, \[x1\]
** zip1 \4\.d, \4\.d, \5\.d
** st1b \4\.d, \1, \[x8\]
** )
** ret
*/
v32qi
qi_zip1_d_two_op (v32qi x, v32qi y)
{
return __builtin_shuffle (x, y, (v32qi) { PERM4 (0, 32) });
}
/*
** hi_zip1_s:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** zip1 (z[0-9]+)\.s, \2\.s, \2\.s
** st1h \3\.s, \1, \[x8\]
** ret
*/
v64hi
hi_zip1_s (v64hi x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (0, 64) });
}
/*
** hi_zip1_s_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** zip1 \3\.s, \3\.s, \2\.s
** st1h \3\.s, \1, \[x8\]
** |
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
** zip1 \4\.s, \4\.s, \5\.s
** st1h \4\.s, \1, \[x8\]
** )
** ret
*/
v64hi
hi_zip1_s_two_op (v64hi x, v64hi y)
{
return __builtin_shuffle (x, y, (v64hi) { PERM5 (0, 64) });
}
/*
** hf_zip1_s:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** zip1 (z[0-9]+)\.s, \2\.s, \2\.s
** st1h \3\.s, \1, \[x8\]
** ret
*/
v64hf
hf_zip1_s (v64hf x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (0, 64) });
}
/*
** hf_zip1_s_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** zip1 \3\.s, \3\.s, \2\.s
** st1h \3\.s, \1, \[x8\]
** |
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
** zip1 \4\.s, \4\.s, \5\.s
** st1h \4\.s, \1, \[x8\]
** )
** ret
*/
v64hf
hf_zip1_s_two_op (v64hf x, v64hf y)
{
return __builtin_shuffle (x, y, (v64hi) { PERM5 (0, 64) });
}
/*
** bf_zip1_s:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** zip1 (z[0-9]+)\.s, \2\.s, \2\.s
** st1h \3\.s, \1, \[x8\]
** ret
*/
v64bf
bf_zip1_s (v64bf x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (0, 64) });
}
/*
** bf_zip1_s_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** zip1 \3\.s, \3\.s, \2\.s
** st1h \3\.s, \1, \[x8\]
** |
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
** zip1 \4\.s, \4\.s, \5\.s
** st1h \4\.s, \1, \[x8\]
** )
** ret
*/
v64bf
bf_zip1_s_two_op (v64bf x, v64bf y)
{
return __builtin_shuffle (x, y, (v64hi) { PERM5 (0, 64) });
}
/*
** hi_zip1_d:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** zip1 (z[0-9]+)\.d, \2\.d, \2\.d
** st1h \3\.d, \1, \[x8\]
** ret
*/
v32hi
hi_zip1_d (v32hi x)
{
return __builtin_shuffle (x, x, (v32hi) { PERM4 (0, 32) });
}
/*
** hi_zip1_d_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** zip1 \3\.d, \3\.d, \2\.d
** st1h \3\.d, \1, \[x8\]
** |
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
** zip1 \4\.d, \4\.d, \5\.d
** st1h \4\.d, \1, \[x8\]
** )
** ret
*/
v32hi
hi_zip1_d_two_op (v32hi x, v32hi y)
{
return __builtin_shuffle (x, y, (v32hi) { PERM4 (0, 32) });
}
/*
** hf_zip1_d:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** zip1 (z[0-9]+)\.d, \2\.d, \2\.d
** st1h \3\.d, \1, \[x8\]
** ret
*/
v32hf
hf_zip1_d (v32hf x)
{
return __builtin_shuffle (x, x, (v32hi) { PERM4 (0, 32) });
}
/*
** hf_zip1_d_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** zip1 \3\.d, \3\.d, \2\.d
** st1h \3\.d, \1, \[x8\]
** |
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
** zip1 \4\.d, \4\.d, \5\.d
** st1h \4\.d, \1, \[x8\]
** )
** ret
*/
v32hf
hf_zip1_d_two_op (v32hf x, v32hf y)
{
return __builtin_shuffle (x, y, (v32hi) { PERM4 (0, 32) });
}
/*
** bf_zip1_d:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** zip1 (z[0-9]+)\.d, \2\.d, \2\.d
** st1h \3\.d, \1, \[x8\]
** ret
*/
v32bf
bf_zip1_d (v32bf x)
{
return __builtin_shuffle (x, x, (v32hi) { PERM4 (0, 32) });
}
/*
** bf_zip1_d_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** zip1 \3\.d, \3\.d, \2\.d
** st1h \3\.d, \1, \[x8\]
** |
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
** zip1 \4\.d, \4\.d, \5\.d
** st1h \4\.d, \1, \[x8\]
** )
** ret
*/
v32bf
bf_zip1_d_two_op (v32bf x, v32bf y)
{
return __builtin_shuffle (x, y, (v32hi) { PERM4 (0, 32) });
}
/*
** si_zip1_d:
** ptrue (p[0-7])\.b, vl256
** ld1w (z[0-9]+)\.d, \1/z, \[x0\]
** zip1 (z[0-9]+)\.d, \2\.d, \2\.d
** st1w \3\.d, \1, \[x8\]
** ret
*/
v32si
si_zip1_d (v32si x)
{
return __builtin_shuffle (x, x, (v32si) { PERM4 (0, 32) });
}
/*
** sf_zip1_d:
** ptrue (p[0-7])\.b, vl256
** ld1w (z[0-9]+)\.d, \1/z, \[x0\]
** zip1 (z[0-9]+)\.d, \2\.d, \2\.d
** st1w \3\.d, \1, \[x8\]
** ret
*/
v32sf
sf_zip1_d (v32sf x)
{
return __builtin_shuffle (x, x, (v32si) { PERM4 (0, 32) });
}

View file

@@ -0,0 +1,403 @@
/* { dg-do assemble { target aarch64_asm_sve_ok } } */
/* { dg-options "-O -msve-vector-bits=2048 -mlittle-endian --save-temps" } */
/* { dg-final { check-function-bodies "**" "" } } */
typedef unsigned char v128qi __attribute__((vector_size(128)));
typedef unsigned char v64qi __attribute__((vector_size(64)));
typedef unsigned char v32qi __attribute__((vector_size(32)));
typedef unsigned short v64hi __attribute__((vector_size(128)));
typedef unsigned short v32hi __attribute__((vector_size(64)));
typedef _Float16 v64hf __attribute__((vector_size(128)));
typedef _Float16 v32hf __attribute__((vector_size(64)));
typedef __bf16 v64bf __attribute__((vector_size(128)));
typedef __bf16 v32bf __attribute__((vector_size(64)));
typedef unsigned int v32si __attribute__((vector_size(128)));
typedef float v32sf __attribute__((vector_size(128)));
#define PERM0(B, C) B, B + C
#define PERM1(B, C) PERM0 (B, C), PERM0 (B + 1, C)
#define PERM2(B, C) PERM1 (B, C), PERM1 (B + 2, C)
#define PERM3(B, C) PERM2 (B, C), PERM2 (B + 4, C)
#define PERM4(B, C) PERM3 (B, C), PERM3 (B + 8, C)
#define PERM5(B, C) PERM4 (B, C), PERM4 (B + 16, C)
#define PERM6(B, C) PERM5 (B, C), PERM5 (B + 32, C)
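/* Worked example: with the same macros as in zip1_2.c, starting the
   pairs at the operand midpoint selects the high halves instead, so
   PERM2 (4, 8) is 4, 12, 5, 13, 6, 14, 7, 15 for 8-lane operands.
   Interleaving the high halves of x and y is the ZIP2 operation.  */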
/*
** qi_zip2_h_a:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
** zip2 (z[0-9]+)\.h, \2\.h, \2\.h
** st1b \3\.h, \1, \[x8\]
** ret
*/
v128qi
qi_zip2_h_a (v128qi x)
{
return __builtin_shuffle (x, x, (v128qi) { PERM6 (64, 0) });
}
/*
** qi_zip2_h_b:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
** zip2 (z[0-9]+)\.h, \2\.h, \2\.h
** st1b \3\.h, \1, \[x8\]
** ret
*/
v128qi
qi_zip2_h_b (v128qi x)
{
return __builtin_shuffle (x, x, (v128qi) { PERM6 (64, 128) });
}
/*
** qi_zip2_h_c:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
** zip2 (z[0-9]+)\.h, \2\.h, \2\.h
** st1b \3\.h, \1, \[x8\]
** ret
*/
v128qi
qi_zip2_h_c (v128qi x)
{
return __builtin_shuffle (x, x, (v128qi) { PERM6 (192, 0) });
}
/*
** qi_zip2_h_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1b (z[0-9]+)\.h, \1/z, \[x1\]
** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
** zip2 \3\.h, \3\.h, \2\.h
** st1b \3\.h, \1, \[x8\]
** |
** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
** ld1b (z[0-9]+)\.h, \1/z, \[x1\]
** zip2 \4\.h, \4\.h, \5\.h
** st1b \4\.h, \1, \[x8\]
** )
** ret
*/
v128qi
qi_zip2_h_two_op (v128qi x, v128qi y)
{
return __builtin_shuffle (x, y, (v128qi) { PERM6 (64, 128) });
}
/*
** qi_zip2_s:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.s, \1/z, \[x0\]
** zip2 (z[0-9]+)\.s, \2\.s, \2\.s
** st1b \3\.s, \1, \[x8\]
** ret
*/
v64qi
qi_zip2_s (v64qi x)
{
return __builtin_shuffle (x, x, (v64qi) { PERM5 (32, 64) });
}
/*
** qi_zip2_s_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1b (z[0-9]+)\.s, \1/z, \[x1\]
** ld1b (z[0-9]+)\.s, \1/z, \[x0\]
** zip2 \3\.s, \3\.s, \2\.s
** st1b \3\.s, \1, \[x8\]
** |
** ld1b (z[0-9]+)\.s, \1/z, \[x0\]
** ld1b (z[0-9]+)\.s, \1/z, \[x1\]
** zip2 \4\.s, \4\.s, \5\.s
** st1b \4\.s, \1, \[x8\]
** )
** ret
*/
v64qi
qi_zip2_s_two_op (v64qi x, v64qi y)
{
return __builtin_shuffle (x, y, (v64qi) { PERM5 (32, 64) });
}
/*
** qi_zip2_d:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.d, \1/z, \[x0\]
** zip2 (z[0-9]+)\.d, \2\.d, \2\.d
** st1b \3\.d, \1, \[x8\]
** ret
*/
v32qi
qi_zip2_d (v32qi x)
{
return __builtin_shuffle (x, x, (v32qi) { PERM4 (16, 32) });
}
/*
** qi_zip2_d_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1b (z[0-9]+)\.d, \1/z, \[x1\]
** ld1b (z[0-9]+)\.d, \1/z, \[x0\]
** zip2 \3\.d, \3\.d, \2\.d
** st1b \3\.d, \1, \[x8\]
** |
** ld1b (z[0-9]+)\.d, \1/z, \[x0\]
** ld1b (z[0-9]+)\.d, \1/z, \[x1\]
** zip2 \4\.d, \4\.d, \5\.d
** st1b \4\.d, \1, \[x8\]
** )
** ret
*/
v32qi
qi_zip2_d_two_op (v32qi x, v32qi y)
{
return __builtin_shuffle (x, y, (v32qi) { PERM4 (16, 32) });
}
/*
** hi_zip2_s:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** zip2 (z[0-9]+)\.s, \2\.s, \2\.s
** st1h \3\.s, \1, \[x8\]
** ret
*/
v64hi
hi_zip2_s (v64hi x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (32, 64) });
}
/*
** hi_zip2_s_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** zip2 \3\.s, \3\.s, \2\.s
** st1h \3\.s, \1, \[x8\]
** |
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
** zip2 \4\.s, \4\.s, \5\.s
** st1h \4\.s, \1, \[x8\]
** )
** ret
*/
v64hi
hi_zip2_s_two_op (v64hi x, v64hi y)
{
return __builtin_shuffle (x, y, (v64hi) { PERM5 (32, 64) });
}
/*
** hf_zip2_s:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** zip2 (z[0-9]+)\.s, \2\.s, \2\.s
** st1h \3\.s, \1, \[x8\]
** ret
*/
v64hf
hf_zip2_s (v64hf x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (32, 64) });
}
/*
** hf_zip2_s_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** zip2 \3\.s, \3\.s, \2\.s
** st1h \3\.s, \1, \[x8\]
** |
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
** zip2 \4\.s, \4\.s, \5\.s
** st1h \4\.s, \1, \[x8\]
** )
** ret
*/
v64hf
hf_zip2_s_two_op (v64hf x, v64hf y)
{
return __builtin_shuffle (x, y, (v64hi) { PERM5 (32, 64) });
}
/*
** bf_zip2_s:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** zip2 (z[0-9]+)\.s, \2\.s, \2\.s
** st1h \3\.s, \1, \[x8\]
** ret
*/
v64bf
bf_zip2_s (v64bf x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (32, 64) });
}
/*
** bf_zip2_s_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** zip2 \3\.s, \3\.s, \2\.s
** st1h \3\.s, \1, \[x8\]
** |
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
** zip2 \4\.s, \4\.s, \5\.s
** st1h \4\.s, \1, \[x8\]
** )
** ret
*/
v64bf
bf_zip2_s_two_op (v64bf x, v64bf y)
{
return __builtin_shuffle (x, y, (v64hi) { PERM5 (32, 64) });
}
/*
** hi_zip2_d:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** zip2 (z[0-9]+)\.d, \2\.d, \2\.d
** st1h \3\.d, \1, \[x8\]
** ret
*/
v32hi
hi_zip2_d (v32hi x)
{
return __builtin_shuffle (x, x, (v32hi) { PERM4 (16, 32) });
}
/*
** hi_zip2_d_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** zip2 \3\.d, \3\.d, \2\.d
** st1h \3\.d, \1, \[x8\]
** |
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
** zip2 \4\.d, \4\.d, \5\.d
** st1h \4\.d, \1, \[x8\]
** )
** ret
*/
v32hi
hi_zip2_d_two_op (v32hi x, v32hi y)
{
return __builtin_shuffle (x, y, (v32hi) { PERM4 (16, 32) });
}
/*
** hf_zip2_d:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** zip2 (z[0-9]+)\.d, \2\.d, \2\.d
** st1h \3\.d, \1, \[x8\]
** ret
*/
v32hf
hf_zip2_d (v32hf x)
{
return __builtin_shuffle (x, x, (v32hi) { PERM4 (16, 32) });
}
/*
** hf_zip2_d_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** zip2 \3\.d, \3\.d, \2\.d
** st1h \3\.d, \1, \[x8\]
** |
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
** zip2 \4\.d, \4\.d, \5\.d
** st1h \4\.d, \1, \[x8\]
** )
** ret
*/
v32hf
hf_zip2_d_two_op (v32hf x, v32hf y)
{
return __builtin_shuffle (x, y, (v32hi) { PERM4 (16, 32) });
}
/*
** bf_zip2_d:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** zip2 (z[0-9]+)\.d, \2\.d, \2\.d
** st1h \3\.d, \1, \[x8\]
** ret
*/
v32bf
bf_zip2_d (v32bf x)
{
return __builtin_shuffle (x, x, (v32hi) { PERM4 (16, 32) });
}
/*
** bf_zip2_d_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** zip2 \3\.d, \3\.d, \2\.d
** st1h \3\.d, \1, \[x8\]
** |
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
** zip2 \4\.d, \4\.d, \5\.d
** st1h \4\.d, \1, \[x8\]
** )
** ret
*/
v32bf
bf_zip2_d_two_op (v32bf x, v32bf y)
{
return __builtin_shuffle (x, y, (v32hi) { PERM4 (16, 32) });
}
/*
** si_zip2_d:
** ptrue (p[0-7])\.b, vl256
** ld1w (z[0-9]+)\.d, \1/z, \[x0\]
** zip2 (z[0-9]+)\.d, \2\.d, \2\.d
** st1w \3\.d, \1, \[x8\]
** ret
*/
v32si
si_zip2_d (v32si x)
{
return __builtin_shuffle (x, x, (v32si) { PERM4 (16, 32) });
}
/*
** sf_zip2_d:
** ptrue (p[0-7])\.b, vl256
** ld1w (z[0-9]+)\.d, \1/z, \[x0\]
** zip2 (z[0-9]+)\.d, \2\.d, \2\.d
** st1w \3\.d, \1, \[x8\]
** ret
*/
v32sf
sf_zip2_d (v32sf x)
{
return __builtin_shuffle (x, x, (v32si) { PERM4 (16, 32) });
}