aarch64: Support permutes on unpacked SVE vectors

This patch adds support for permuting unpacked SVE vectors using:

- DUP
- EXT
- REV[BHW]
- REV
- TRN[12]
- UZP[12]
- ZIP[12]

This involves rewriting the REV[BHW] permute code so that the inputs
and outputs of the insn pattern have the same mode as the vectors
being permuted.  This is different from the ACLE form, where the
reversal happens within individual elements rather than within
groups of multiple elements.  For example, the permute form of REVW
takes a VNx4SI input and swaps the two SI elements within each
64-bit group, whereas the ACLE form takes a VNx2DI input and
reverses the two 32-bit halves of each DI element; both forms map
to the same instruction.

The patch does not add a conditional version of REV[BHW].  I'll come
back to that once we have partial-vector comparisons and selects.

The patch is really just enablement, adding an extra tool to the
toolbox; it doesn't bring any significant vectorisation opportunities
on its own.  However, the new slp_perm_8.c test is one (artificial)
example that is now vectorised in a better way than before.
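
As a minimal illustration (lifted from the new rev_2.c test below;
the PERM macros just spell out the indices 31, 30, ..., 1, 0): with
-O -msve-vector-bits=2048, this unpacked permute previously matched
none of the dedicated SVE permute patterns, but now becomes a single
REV on 64-bit containers:

typedef unsigned int v32si __attribute__((vector_size(128)));

#define PERM0(B) B, B - 1
#define PERM1(B) PERM0 (B), PERM0 (B - 2)
#define PERM2(B) PERM1 (B), PERM1 (B - 4)
#define PERM3(B) PERM2 (B), PERM2 (B - 8)
#define PERM4(B) PERM3 (B), PERM3 (B - 16)

/* Each SI element sits in a 64-bit container, so the reversal is
   emitted as "rev z0.d, z1.d".  */
v32si
si_rev_d (v32si x)
{
  return __builtin_shuffle (x, x, (v32si) { PERM4 (31) });
}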

gcc/
	* config/aarch64/aarch64-modes.def (VNx2BF, VNx4BF): Adjust nunits
	based on the current VG and set the 2-byte element alignment.
	* config/aarch64/iterators.md (SVE_ALL, SVE_24, SVE_2, SVE_4): Add
	partial SVE BF modes.
	(UNSPEC_REVBHW): New unspec.
	(Vetype, Vesize, Vctype, VEL, Vel, vwcore, V_INT_CONTAINER)
	(v_int_container, VPRED, vpred): Handle partial SVE BF modes.
	(container_bits, Vcwtype): New mode attributes.
	* config/aarch64/aarch64-sve.md
	(@aarch64_sve_revbhw_<SVE_ALL:mode><PRED_HSD:mode>): New pattern.
	(@aarch64_sve_dup_lane<mode>): Extend from SVE_FULL to SVE_ALL.
	(@aarch64_sve_rev<mode>, @aarch64_sve_<perm_insn><mode>): Likewise.
	(@aarch64_sve_ext<mode>): Likewise.
	* config/aarch64/aarch64.c (aarch64_classify_vector_mode): Handle
	E_VNx2BFmode and E_VNx4BFmode.
	(aarch64_evpc_rev_local): Base the analysis on the container size
	instead of the element size.  Use the new aarch64_sve_revbhw
	patterns for SVE.
	(aarch64_evpc_dup): Handle partial SVE data modes.  Use the
	container size instead of the element size when applying the
	SVE immediate limit.  Fix a previously incorrect bounds check.
	(aarch64_expand_vec_perm_const_1): Handle partial SVE data modes.

gcc/testsuite/
	* gcc.target/aarch64/sve/dup_lane_2.c: New test.
	* gcc.target/aarch64/sve/dup_lane_3.c: Likewise.
	* gcc.target/aarch64/sve/ext_4.c: Likewise.
	* gcc.target/aarch64/sve/rev_2.c: Likewise.
	* gcc.target/aarch64/sve/revhw_1.c: Likewise.
	* gcc.target/aarch64/sve/revhw_2.c: Likewise.
	* gcc.target/aarch64/sve/slp_perm_8.c: Likewise.
	* gcc.target/aarch64/sve/trn1_2.c: Likewise.
	* gcc.target/aarch64/sve/trn2_2.c: Likewise.
	* gcc.target/aarch64/sve/uzp1_2.c: Likewise.
	* gcc.target/aarch64/sve/uzp2_2.c: Likewise.
	* gcc.target/aarch64/sve/zip1_2.c: Likewise.
	* gcc.target/aarch64/sve/zip2_2.c: Likewise.
Richard Sandiford, 2020-11-06 16:49:28 +00:00
commit 6c3ce63b04 (parent 9b11203e33)
17 changed files with 3684 additions and 59 deletions

gcc/config/aarch64/aarch64-modes.def

@@ -136,11 +136,13 @@ ADJUST_NUNITS (VNx2QI, aarch64_sve_vg);
 ADJUST_NUNITS (VNx2HI, aarch64_sve_vg);
 ADJUST_NUNITS (VNx2SI, aarch64_sve_vg);
 ADJUST_NUNITS (VNx2HF, aarch64_sve_vg);
+ADJUST_NUNITS (VNx2BF, aarch64_sve_vg);
 ADJUST_NUNITS (VNx2SF, aarch64_sve_vg);
 ADJUST_NUNITS (VNx4QI, aarch64_sve_vg * 2);
 ADJUST_NUNITS (VNx4HI, aarch64_sve_vg * 2);
 ADJUST_NUNITS (VNx4HF, aarch64_sve_vg * 2);
+ADJUST_NUNITS (VNx4BF, aarch64_sve_vg * 2);
 ADJUST_NUNITS (VNx8QI, aarch64_sve_vg * 4);
@@ -151,7 +153,9 @@ ADJUST_ALIGNMENT (VNx8QI, 1);
 ADJUST_ALIGNMENT (VNx2HI, 2);
 ADJUST_ALIGNMENT (VNx4HI, 2);
 ADJUST_ALIGNMENT (VNx2HF, 2);
+ADJUST_ALIGNMENT (VNx2BF, 2);
 ADJUST_ALIGNMENT (VNx4HF, 2);
+ADJUST_ALIGNMENT (VNx4BF, 2);
 ADJUST_ALIGNMENT (VNx2SI, 4);
 ADJUST_ALIGNMENT (VNx2SF, 4);
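
A reference note (not part of the patch): aarch64_sve_vg is the
number of 64-bit granules in an SVE vector, so with these adjustments
a 2048-bit vector gives VNx2BF 32 elements (one per 64-bit container)
and VNx4BF 64 elements (one per 32-bit container), each keeping
bfloat16's natural 2-byte alignment.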

gcc/config/aarch64/aarch64-sve.md

@@ -3009,6 +3009,22 @@
   "<sve_int_op>\t%0.<Vetype>, %1/m, %2.<Vetype>"
 )
 
+;; Another way of expressing the REVB, REVH and REVW patterns, with this
+;; form being easier for permutes.  The predicate mode determines the
+;; number of lanes and the data mode decides the granularity of the
+;; reversal within each lane.
+(define_insn "@aarch64_sve_revbhw_<SVE_ALL:mode><PRED_HSD:mode>"
+  [(set (match_operand:SVE_ALL 0 "register_operand" "=w")
+	(unspec:SVE_ALL
+	  [(match_operand:PRED_HSD 1 "register_operand" "Upl")
+	   (unspec:SVE_ALL
+	     [(match_operand:SVE_ALL 2 "register_operand" "w")]
+	     UNSPEC_REVBHW)]
+	  UNSPEC_PRED_X))]
+  "TARGET_SVE && <PRED_HSD:elem_bits> > <SVE_ALL:container_bits>"
+  "rev<SVE_ALL:Vcwtype>\t%0.<PRED_HSD:Vetype>, %1/m, %2.<PRED_HSD:Vetype>"
+)
+
 ;; Predicated integer unary operations with merging.
 (define_insn "@cond_<optab><mode>"
   [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, ?&w, ?&w")
@@ -8273,14 +8289,14 @@
 ;; Duplicate one element of a vector.
 (define_insn "@aarch64_sve_dup_lane<mode>"
-  [(set (match_operand:SVE_FULL 0 "register_operand" "=w")
-	(vec_duplicate:SVE_FULL
+  [(set (match_operand:SVE_ALL 0 "register_operand" "=w")
+	(vec_duplicate:SVE_ALL
 	  (vec_select:<VEL>
-	    (match_operand:SVE_FULL 1 "register_operand" "w")
+	    (match_operand:SVE_ALL 1 "register_operand" "w")
 	    (parallel [(match_operand:SI 2 "const_int_operand")]))))]
   "TARGET_SVE
-   && IN_RANGE (INTVAL (operands[2]) * GET_MODE_SIZE (<VEL>mode), 0, 63)"
-  "dup\t%0.<Vetype>, %1.<Vetype>[%2]"
+   && IN_RANGE (INTVAL (operands[2]) * <container_bits> / 8, 0, 63)"
+  "dup\t%0.<Vctype>, %1.<Vctype>[%2]"
 )
 
 ;; Use DUP.Q to duplicate a 128-bit segment of a register.
@@ -8321,17 +8337,18 @@
 ;; Reverse the order of elements within a full vector.
 (define_insn "@aarch64_sve_rev<mode>"
-  [(set (match_operand:SVE_FULL 0 "register_operand" "=w")
-	(unspec:SVE_FULL
-	  [(match_operand:SVE_FULL 1 "register_operand" "w")]
+  [(set (match_operand:SVE_ALL 0 "register_operand" "=w")
+	(unspec:SVE_ALL
+	  [(match_operand:SVE_ALL 1 "register_operand" "w")]
 	  UNSPEC_REV))]
   "TARGET_SVE"
-  "rev\t%0.<Vetype>, %1.<Vetype>")
+  "rev\t%0.<Vctype>, %1.<Vctype>")
 
 ;; -------------------------------------------------------------------------
 ;; ---- [INT,FP] Special-purpose binary permutes
 ;; -------------------------------------------------------------------------
 ;; Includes:
 ;;  - EXT
 ;;  - SPLICE
 ;;  - TRN1
 ;;  - TRN2
@@ -8359,13 +8376,13 @@
 ;; Permutes that take half the elements from one vector and half the
 ;; elements from the other.
 (define_insn "@aarch64_sve_<perm_insn><mode>"
-  [(set (match_operand:SVE_FULL 0 "register_operand" "=w")
-	(unspec:SVE_FULL
-	  [(match_operand:SVE_FULL 1 "register_operand" "w")
-	   (match_operand:SVE_FULL 2 "register_operand" "w")]
+  [(set (match_operand:SVE_ALL 0 "register_operand" "=w")
+	(unspec:SVE_ALL
+	  [(match_operand:SVE_ALL 1 "register_operand" "w")
+	   (match_operand:SVE_ALL 2 "register_operand" "w")]
 	  PERMUTE))]
   "TARGET_SVE"
-  "<perm_insn>\t%0.<Vetype>, %1.<Vetype>, %2.<Vetype>"
+  "<perm_insn>\t%0.<Vctype>, %1.<Vctype>, %2.<Vctype>"
 )
 
 ;; Apply PERMUTE to 128-bit sequences.  The behavior of these patterns
@@ -8383,16 +8400,16 @@
 ;; Concatenate two vectors and extract a subvector.  Note that the
 ;; immediate (third) operand is the lane index not the byte index.
 (define_insn "@aarch64_sve_ext<mode>"
-  [(set (match_operand:SVE_FULL 0 "register_operand" "=w, ?&w")
-	(unspec:SVE_FULL
-	  [(match_operand:SVE_FULL 1 "register_operand" "0, w")
-	   (match_operand:SVE_FULL 2 "register_operand" "w, w")
+  [(set (match_operand:SVE_ALL 0 "register_operand" "=w, ?&w")
+	(unspec:SVE_ALL
+	  [(match_operand:SVE_ALL 1 "register_operand" "0, w")
+	   (match_operand:SVE_ALL 2 "register_operand" "w, w")
 	   (match_operand:SI 3 "const_int_operand")]
 	  UNSPEC_EXT))]
   "TARGET_SVE
-   && IN_RANGE (INTVAL (operands[3]) * GET_MODE_SIZE (<VEL>mode), 0, 255)"
+   && IN_RANGE (INTVAL (operands[3]) * <container_bits> / 8, 0, 255)"
   {
-    operands[3] = GEN_INT (INTVAL (operands[3]) * GET_MODE_SIZE (<VEL>mode));
+    operands[3] = GEN_INT (INTVAL (operands[3]) * <container_bits> / 8);
     return (which_alternative == 0
 	    ? "ext\\t%0.b, %0.b, %2.b, #%3"
 	    : "movprfx\t%0, %1\;ext\\t%0.b, %0.b, %2.b, #%3");

gcc/config/aarch64/aarch64.c

@@ -2226,6 +2226,9 @@ aarch64_classify_vector_mode (machine_mode mode)
     /* Partial SVE HF vectors.  */
     case E_VNx2HFmode:
     case E_VNx4HFmode:
+    /* Partial SVE BF vectors.  */
+    case E_VNx2BFmode:
+    case E_VNx4BFmode:
     /* Partial SVE SF vector.  */
     case E_VNx2SFmode:
       return TARGET_SVE ? VEC_SVE_DATA | VEC_PARTIAL : 0;
@@ -20468,18 +20471,21 @@ aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
       || !diff)
     return false;
 
-  size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
-  if (size == 8)
+  if (d->vec_flags & VEC_SVE_DATA)
+    size = (diff + 1) * aarch64_sve_container_bits (d->vmode);
+  else
+    size = (diff + 1) * GET_MODE_UNIT_BITSIZE (d->vmode);
+  if (size == 64)
     {
       unspec = UNSPEC_REV64;
       pred_mode = VNx2BImode;
     }
-  else if (size == 4)
+  else if (size == 32)
     {
       unspec = UNSPEC_REV32;
       pred_mode = VNx4BImode;
     }
-  else if (size == 2)
+  else if (size == 16)
     {
       unspec = UNSPEC_REV16;
       pred_mode = VNx8BImode;
@@ -20496,28 +20502,11 @@ aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
   if (d->testing_p)
     return true;
 
-  if (d->vec_flags == VEC_SVE_DATA)
+  if (d->vec_flags & VEC_SVE_DATA)
     {
-      machine_mode int_mode = aarch64_sve_int_mode (pred_mode);
-      rtx target = gen_reg_rtx (int_mode);
-      if (BYTES_BIG_ENDIAN)
-	/* The act of taking a subreg between INT_MODE and d->vmode
-	   is itself a reversing operation on big-endian targets;
-	   see the comment at the head of aarch64-sve.md for details.
-	   First reinterpret OP0 as INT_MODE without using a subreg
-	   and without changing the contents.  */
-	emit_insn (gen_aarch64_sve_reinterpret (int_mode, target, d->op0));
-      else
-	{
-	  /* For SVE we use REV[BHW] unspecs derived from the element size
-	     of v->mode and vector modes whose elements have SIZE bytes.
-	     This ensures that the vector modes match the predicate modes.  */
-	  int unspec = aarch64_sve_rev_unspec (d->vmode);
-	  rtx pred = aarch64_ptrue_reg (pred_mode);
-	  emit_insn (gen_aarch64_pred (unspec, int_mode, target, pred,
-				       gen_lowpart (int_mode, d->op0)));
-	}
-      emit_move_insn (d->target, gen_lowpart (d->vmode, target));
+      rtx pred = aarch64_ptrue_reg (pred_mode);
+      emit_insn (gen_aarch64_sve_revbhw (d->vmode, pred_mode,
+					 d->target, pred, d->op0));
       return true;
     }
 
   rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
@@ -20562,7 +20551,8 @@ aarch64_evpc_dup (struct expand_vec_perm_d *d)
       || !d->perm[0].is_constant (&elt))
     return false;
 
-  if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
+  if ((d->vec_flags & VEC_SVE_DATA)
+      && elt * (aarch64_sve_container_bits (vmode) / 8) >= 64)
     return false;
 
   /* Success! */
@@ -20782,6 +20772,7 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
   if ((d->vec_flags == VEC_ADVSIMD
        || d->vec_flags == VEC_SVE_DATA
+       || d->vec_flags == (VEC_SVE_DATA | VEC_PARTIAL)
       || d->vec_flags == VEC_SVE_PRED)
      && known_gt (nelt, 1))
    {
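
A worked instance of the corrected bound in aarch64_evpc_dup
(mirroring dup_lane_2.c and dup_lane_3.c below): the byte offset of
the selected container must fit DUP's 0-63 immediate range, so for a
mode with 64-bit containers the highest usable index is 7:

/* VNx2SI: 32-bit elements, one per 64-bit container.
   elt = 7:  7 * (64 / 8) = 56 <= 63  -> "dup z0.d, z1.d[7]"
   elt = 8:  8 * (64 / 8) = 64 > 63   -> no DUP; see dup_lane_3.c.  */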

gcc/config/aarch64/iterators.md

@@ -400,7 +400,7 @@
 (define_mode_iterator SVE_ALL [VNx16QI VNx8QI VNx4QI VNx2QI
			       VNx8HI VNx4HI VNx2HI
			       VNx8HF VNx4HF VNx2HF
-			       VNx8BF
+			       VNx8BF VNx4BF VNx2BF
			       VNx4SI VNx2SI
			       VNx4SF VNx2SF
			       VNx2DI
@@ -418,11 +418,13 @@
			       VNx2DI])
 
 ;; SVE modes with 2 or 4 elements.
-(define_mode_iterator SVE_24 [VNx2QI VNx2HI VNx2HF VNx2SI VNx2SF VNx2DI VNx2DF
-			      VNx4QI VNx4HI VNx4HF VNx4SI VNx4SF])
+(define_mode_iterator SVE_24 [VNx2QI VNx2HI VNx2HF VNx2BF VNx2SI VNx2SF
+			      VNx2DI VNx2DF
+			      VNx4QI VNx4HI VNx4HF VNx4BF VNx4SI VNx4SF])
 
 ;; SVE modes with 2 elements.
-(define_mode_iterator SVE_2 [VNx2QI VNx2HI VNx2HF VNx2SI VNx2SF VNx2DI VNx2DF])
+(define_mode_iterator SVE_2 [VNx2QI VNx2HI VNx2HF VNx2BF
+			     VNx2SI VNx2SF VNx2DI VNx2DF])
 
 ;; SVE integer modes with 2 elements, excluding the widest element.
 (define_mode_iterator SVE_2BHSI [VNx2QI VNx2HI VNx2SI])
@@ -431,7 +433,7 @@
 (define_mode_iterator SVE_2HSDI [VNx2HI VNx2SI VNx2DI])
 
 ;; SVE modes with 4 elements.
-(define_mode_iterator SVE_4 [VNx4QI VNx4HI VNx4HF VNx4SI VNx4SF])
+(define_mode_iterator SVE_4 [VNx4QI VNx4HI VNx4HF VNx4BF VNx4SI VNx4SF])
 
 ;; SVE integer modes with 4 elements, excluding the widest element.
 (define_mode_iterator SVE_4BHI [VNx4QI VNx4HI])
@@ -621,6 +623,7 @@
     UNSPEC_REVB		; Used in aarch64-sve.md.
     UNSPEC_REVH		; Used in aarch64-sve.md.
     UNSPEC_REVW		; Used in aarch64-sve.md.
+    UNSPEC_REVBHW	; Used in aarch64-sve.md.
     UNSPEC_SMUL_HIGHPART ; Used in aarch64-sve.md.
     UNSPEC_UMUL_HIGHPART ; Used in aarch64-sve.md.
     UNSPEC_FMLA		; Used in aarch64-sve.md.
@@ -968,6 +971,16 @@
			  (VNx4SI "32") (VNx2DI "64")
			  (VNx8HF "16") (VNx4SF "32") (VNx2DF "64")])
 
+;; The number of bits in a vector container.
+(define_mode_attr container_bits [(VNx16QI "8")
+				  (VNx8HI "16") (VNx8QI "16") (VNx8HF "16")
+				  (VNx8BF "16")
+				  (VNx4SI "32") (VNx4HI "32") (VNx4QI "32")
+				  (VNx4SF "32") (VNx4HF "32") (VNx4BF "32")
+				  (VNx2DI "64") (VNx2SI "64") (VNx2HI "64")
+				  (VNx2QI "64") (VNx2DF "64") (VNx2SF "64")
+				  (VNx2HF "64") (VNx2BF "64")])
+
 ;; Attribute to describe constants acceptable in logical operations
 (define_mode_attr lconst [(SI "K") (DI "L")])
@@ -1029,7 +1042,7 @@
		 (VNx16QI "b") (VNx8QI "b") (VNx4QI "b") (VNx2QI "b")
		 (VNx8HI "h") (VNx4HI "h") (VNx2HI "h")
		 (VNx8HF "h") (VNx4HF "h") (VNx2HF "h")
-		 (VNx8BF "h")
+		 (VNx8BF "h") (VNx4BF "h") (VNx2BF "h")
		 (VNx4SI "s") (VNx2SI "s")
		 (VNx4SF "s") (VNx2SF "s")
		 (VNx2DI "d")
@@ -1047,7 +1060,7 @@
 (define_mode_attr Vesize [(VNx16QI "b") (VNx8QI "b") (VNx4QI "b") (VNx2QI "b")
			  (VNx8HI "h") (VNx4HI "h") (VNx2HI "h")
			  (VNx8HF "h") (VNx4HF "h") (VNx2HF "h")
-			  (VNx8BF "h")
+			  (VNx8BF "h") (VNx4BF "h") (VNx2BF "h")
			  (VNx4SI "w") (VNx2SI "w")
			  (VNx4SF "w") (VNx2SF "w")
			  (VNx2DI "d")
@@ -1066,12 +1079,23 @@
 (define_mode_attr Vctype [(VNx16QI "b") (VNx8QI "h") (VNx4QI "s") (VNx2QI "d")
			  (VNx8HI "h") (VNx4HI "s") (VNx2HI "d")
			  (VNx8HF "h") (VNx4HF "s") (VNx2HF "d")
-			  (VNx8BF "h")
+			  (VNx8BF "h") (VNx4BF "s") (VNx2BF "d")
			  (VNx4SI "s") (VNx2SI "d")
			  (VNx4SF "s") (VNx2SF "d")
			  (VNx2DI "d")
			  (VNx2DF "d")])
 
+;; The instruction mnemonic suffix for an SVE mode's element container,
+;; i.e. the Vewtype of full SVE modes that have the same number of elements.
+(define_mode_attr Vcwtype [(VNx16QI "b") (VNx8QI "h") (VNx4QI "w") (VNx2QI "d")
+			   (VNx8HI "h") (VNx4HI "w") (VNx2HI "d")
+			   (VNx8HF "h") (VNx4HF "w") (VNx2HF "d")
+			   (VNx8BF "h") (VNx4BF "w") (VNx2BF "d")
+			   (VNx4SI "w") (VNx2SI "d")
+			   (VNx4SF "w") (VNx2SF "d")
+			   (VNx2DI "d")
+			   (VNx2DF "d")])
+
 ;; Vetype is used everywhere in scheduling type and assembly output,
 ;; sometimes they are not the same, for example HF modes on some
 ;; instructions.  stype is defined to represent scheduling type
@@ -1107,7 +1131,7 @@
		       (VNx16QI "QI") (VNx8QI "QI") (VNx4QI "QI") (VNx2QI "QI")
		       (VNx8HI "HI") (VNx4HI "HI") (VNx2HI "HI")
		       (VNx8HF "HF") (VNx4HF "HF") (VNx2HF "HF")
-		       (VNx8BF "BF")
+		       (VNx8BF "BF") (VNx4BF "BF") (VNx2BF "BF")
		       (VNx4SI "SI") (VNx2SI "SI")
		       (VNx4SF "SF") (VNx2SF "SF")
		       (VNx2DI "DI")
@@ -1127,7 +1151,7 @@
		       (VNx16QI "qi") (VNx8QI "qi") (VNx4QI "qi") (VNx2QI "qi")
		       (VNx8HI "hi") (VNx4HI "hi") (VNx2HI "hi")
		       (VNx8HF "hf") (VNx4HF "hf") (VNx2HF "hf")
-		       (VNx8BF "bf")
+		       (VNx8BF "bf") (VNx4BF "bf") (VNx2BF "bf")
		       (VNx4SI "si") (VNx2SI "si")
		       (VNx4SF "sf") (VNx2SF "sf")
		       (VNx2DI "di")
@@ -1310,7 +1334,7 @@
		       (VNx16QI "w") (VNx8QI "w") (VNx4QI "w") (VNx2QI "w")
		       (VNx8HI "w") (VNx4HI "w") (VNx2HI "w")
		       (VNx8HF "w") (VNx4HF "w") (VNx2HF "w")
-		       (VNx8BF "w")
+		       (VNx8BF "w") (VNx4BF "w") (VNx2BF "w")
		       (VNx4SI "w") (VNx2SI "w")
		       (VNx4SF "w") (VNx2SF "w")
		       (VNx2DI "x")
@@ -1380,6 +1404,8 @@
				  (VNx2DI "VNx2DI")
				  (VNx8HF "VNx8HI") (VNx4HF "VNx4SI")
				  (VNx2HF "VNx2DI")
+				  (VNx8BF "VNx8HI") (VNx4BF "VNx4SI")
+				  (VNx2BF "VNx2DI")
				  (VNx4SF "VNx4SI") (VNx2SF "VNx2DI")
				  (VNx2DF "VNx2DI")])
@@ -1392,6 +1418,8 @@
				  (VNx2DI "vnx2di")
				  (VNx8HF "vnx8hi") (VNx4HF "vnx4si")
				  (VNx2HF "vnx2di")
+				  (VNx8BF "vnx8hi") (VNx4BF "vnx4si")
+				  (VNx2BF "vnx2di")
				  (VNx4SF "vnx4si") (VNx2SF "vnx2di")
				  (VNx2DF "vnx2di")])
@@ -1617,7 +1645,7 @@
		      (VNx4QI "VNx4BI") (VNx2QI "VNx2BI")
		      (VNx8HI "VNx8BI") (VNx4HI "VNx4BI") (VNx2HI "VNx2BI")
		      (VNx8HF "VNx8BI") (VNx4HF "VNx4BI") (VNx2HF "VNx2BI")
-		      (VNx8BF "VNx8BI")
+		      (VNx8BF "VNx8BI") (VNx4BF "VNx4BI") (VNx2BF "VNx2BI")
		      (VNx4SI "VNx4BI") (VNx2SI "VNx2BI")
		      (VNx4SF "VNx4BI") (VNx2SF "VNx2BI")
		      (VNx2DI "VNx2BI")
@@ -1643,7 +1671,7 @@
		      (VNx4QI "vnx4bi") (VNx2QI "vnx2bi")
		      (VNx8HI "vnx8bi") (VNx4HI "vnx4bi") (VNx2HI "vnx2bi")
		      (VNx8HF "vnx8bi") (VNx4HF "vnx4bi") (VNx2HF "vnx2bi")
-		      (VNx8BF "vnx8bi")
+		      (VNx8BF "vnx8bi") (VNx4BF "vnx4bi") (VNx2BF "vnx2bi")
		      (VNx4SI "vnx4bi") (VNx2SI "vnx2bi")
		      (VNx4SF "vnx4bi") (VNx2SF "vnx2bi")
		      (VNx2DI "vnx2bi")

gcc/testsuite/gcc.target/aarch64/sve/dup_lane_2.c

@@ -0,0 +1,331 @@
/* { dg-do assemble { target aarch64_asm_sve_ok } } */
/* { dg-options "-O -msve-vector-bits=2048 -mlittle-endian --save-temps" } */
/* { dg-final { check-function-bodies "**" "" } } */
typedef unsigned char v128qi __attribute__((vector_size(128)));
typedef unsigned char v64qi __attribute__((vector_size(64)));
typedef unsigned char v32qi __attribute__((vector_size(32)));
typedef unsigned short v64hi __attribute__((vector_size(128)));
typedef unsigned short v32hi __attribute__((vector_size(64)));
typedef _Float16 v64hf __attribute__((vector_size(128)));
typedef _Float16 v32hf __attribute__((vector_size(64)));
typedef __bf16 v64bf __attribute__((vector_size(128)));
typedef __bf16 v32bf __attribute__((vector_size(64)));
typedef unsigned int v32si __attribute__((vector_size(128)));
typedef float v32sf __attribute__((vector_size(128)));
#define PERM0(B) B, B
#define PERM1(B) PERM0 (B), PERM0 (B)
#define PERM2(B) PERM1 (B), PERM1 (B)
#define PERM3(B) PERM2 (B), PERM2 (B)
#define PERM4(B) PERM3 (B), PERM3 (B)
#define PERM5(B) PERM4 (B), PERM4 (B)
#define PERM6(B) PERM5 (B), PERM5 (B)
/*
** qi_dup_h_1:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
** dup (z[0-9]+)\.h, \2\.h\[1\]
** st1b \3\.h, \1, \[x8\]
** ret
*/
v128qi
qi_dup_h_1 (v128qi x)
{
return __builtin_shuffle (x, x, (v128qi) { PERM6 (1) });
}
/*
** qi_dup_h_31:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
** dup (z[0-9]+)\.h, \2\.h\[31\]
** st1b \3\.h, \1, \[x8\]
** ret
*/
v128qi
qi_dup_h_31 (v128qi x)
{
return __builtin_shuffle (x, x, (v128qi) { PERM6 (31) });
}
/*
** qi_dup_s_1:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.s, \1/z, \[x0\]
** dup (z[0-9]+)\.s, \2\.s\[1\]
** st1b \3\.s, \1, \[x8\]
** ret
*/
v64qi
qi_dup_s_1 (v64qi x)
{
return __builtin_shuffle (x, x, (v64qi) { PERM5 (1) });
}
/*
** qi_dup_s_15:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.s, \1/z, \[x0\]
** dup (z[0-9]+)\.s, \2\.s\[15\]
** st1b \3\.s, \1, \[x8\]
** ret
*/
v64qi
qi_dup_s_15 (v64qi x)
{
return __builtin_shuffle (x, x, (v64qi) { PERM5 (15) });
}
/*
** qi_dup_d_1:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.d, \1/z, \[x0\]
** dup (z[0-9]+)\.d, \2\.d\[1\]
** st1b \3\.d, \1, \[x8\]
** ret
*/
v32qi
qi_dup_d_1 (v32qi x)
{
return __builtin_shuffle (x, x, (v32qi) { PERM4 (1) });
}
/*
** qi_dup_d_7:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.d, \1/z, \[x0\]
** dup (z[0-9]+)\.d, \2\.d\[7\]
** st1b \3\.d, \1, \[x8\]
** ret
*/
v32qi
qi_dup_d_7 (v32qi x)
{
return __builtin_shuffle (x, x, (v32qi) { PERM4 (7) });
}
/*
** hi_dup_s_1:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** dup (z[0-9]+)\.s, \2\.s\[1\]
** st1h \3\.s, \1, \[x8\]
** ret
*/
v64hi
hi_dup_s_1 (v64hi x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (1) });
}
/*
** hi_dup_s_15:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** dup (z[0-9]+)\.s, \2\.s\[15\]
** st1h \3\.s, \1, \[x8\]
** ret
*/
v64hi
hi_dup_s_15 (v64hi x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (15) });
}
/*
** hf_dup_s_1:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** dup (z[0-9]+)\.s, \2\.s\[1\]
** st1h \3\.s, \1, \[x8\]
** ret
*/
v64hf
hf_dup_s_1 (v64hf x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (1) });
}
/*
** hf_dup_s_11:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** dup (z[0-9]+)\.s, \2\.s\[11\]
** st1h \3\.s, \1, \[x8\]
** ret
*/
v64hf
hf_dup_s_11 (v64hf x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (11) });
}
/*
** bf_dup_s_1:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** dup (z[0-9]+)\.s, \2\.s\[1\]
** st1h \3\.s, \1, \[x8\]
** ret
*/
v64bf
bf_dup_s_1 (v64bf x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (1) });
}
/*
** bf_dup_s_13:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** dup (z[0-9]+)\.s, \2\.s\[13\]
** st1h \3\.s, \1, \[x8\]
** ret
*/
v64bf
bf_dup_s_13 (v64bf x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (13) });
}
/*
** hi_dup_d_1:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** dup (z[0-9]+)\.d, \2\.d\[1\]
** st1h \3\.d, \1, \[x8\]
** ret
*/
v32hi
hi_dup_d_1 (v32hi x)
{
return __builtin_shuffle (x, x, (v32hi) { PERM4 (1) });
}
/*
** hi_dup_d_7:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** dup (z[0-9]+)\.d, \2\.d\[7\]
** st1h \3\.d, \1, \[x8\]
** ret
*/
v32hi
hi_dup_d_7 (v32hi x)
{
return __builtin_shuffle (x, x, (v32hi) { PERM4 (7) });
}
/*
** hf_dup_d_1:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** dup (z[0-9]+)\.d, \2\.d\[1\]
** st1h \3\.d, \1, \[x8\]
** ret
*/
v32hf
hf_dup_d_1 (v32hf x)
{
return __builtin_shuffle (x, x, (v32hi) { PERM4 (1) });
}
/*
** hf_dup_d_5:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** dup (z[0-9]+)\.d, \2\.d\[5\]
** st1h \3\.d, \1, \[x8\]
** ret
*/
v32hf
hf_dup_d_5 (v32hf x)
{
return __builtin_shuffle (x, x, (v32hi) { PERM4 (5) });
}
/*
** bf_dup_d_1:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** dup (z[0-9]+)\.d, \2\.d\[1\]
** st1h \3\.d, \1, \[x8\]
** ret
*/
v32bf
bf_dup_d_1 (v32bf x)
{
return __builtin_shuffle (x, x, (v32hi) { PERM4 (1) });
}
/*
** bf_dup_d_6:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** dup (z[0-9]+)\.d, \2\.d\[6\]
** st1h \3\.d, \1, \[x8\]
** ret
*/
v32bf
bf_dup_d_6 (v32bf x)
{
return __builtin_shuffle (x, x, (v32hi) { PERM4 (6) });
}
/*
** si_dup_d_1:
** ptrue (p[0-7])\.b, vl256
** ld1w (z[0-9]+)\.d, \1/z, \[x0\]
** dup (z[0-9]+)\.d, \2\.d\[1\]
** st1w \3\.d, \1, \[x8\]
** ret
*/
v32si
si_dup_d_1 (v32si x)
{
return __builtin_shuffle (x, x, (v32si) { PERM4 (1) });
}
/*
** si_dup_d_7:
** ptrue (p[0-7])\.b, vl256
** ld1w (z[0-9]+)\.d, \1/z, \[x0\]
** dup (z[0-9]+)\.d, \2\.d\[7\]
** st1w \3\.d, \1, \[x8\]
** ret
*/
v32si
si_dup_d_7 (v32si x)
{
return __builtin_shuffle (x, x, (v32si) { PERM4 (7) });
}
/*
** sf_dup_d_1:
** ptrue (p[0-7])\.b, vl256
** ld1w (z[0-9]+)\.d, \1/z, \[x0\]
** dup (z[0-9]+)\.d, \2\.d\[1\]
** st1w \3\.d, \1, \[x8\]
** ret
*/
v32sf
sf_dup_d_1 (v32sf x)
{
return __builtin_shuffle (x, x, (v32si) { PERM4 (1) });
}
/*
** sf_dup_d_7:
** ptrue (p[0-7])\.b, vl256
** ld1w (z[0-9]+)\.d, \1/z, \[x0\]
** dup (z[0-9]+)\.d, \2\.d\[7\]
** st1w \3\.d, \1, \[x8\]
** ret
*/
v32sf
sf_dup_d_7 (v32sf x)
{
return __builtin_shuffle (x, x, (v32si) { PERM4 (7) });
}

gcc/testsuite/gcc.target/aarch64/sve/dup_lane_3.c

@@ -0,0 +1,90 @@
/* { dg-do assemble { target aarch64_asm_sve_ok } } */
/* { dg-options "-O -msve-vector-bits=2048 -mlittle-endian --save-temps" } */
typedef unsigned char v128qi __attribute__((vector_size(128)));
typedef unsigned char v64qi __attribute__((vector_size(64)));
typedef unsigned char v32qi __attribute__((vector_size(32)));
typedef unsigned short v64hi __attribute__((vector_size(128)));
typedef unsigned short v32hi __attribute__((vector_size(64)));
typedef _Float16 v64hf __attribute__((vector_size(128)));
typedef _Float16 v32hf __attribute__((vector_size(64)));
typedef __bf16 v64bf __attribute__((vector_size(128)));
typedef __bf16 v32bf __attribute__((vector_size(64)));
typedef unsigned int v32si __attribute__((vector_size(128)));
typedef float v32sf __attribute__((vector_size(128)));
#define PERM0(B) B, B
#define PERM1(B) PERM0 (B), PERM0 (B)
#define PERM2(B) PERM1 (B), PERM1 (B)
#define PERM3(B) PERM2 (B), PERM2 (B)
#define PERM4(B) PERM3 (B), PERM3 (B)
#define PERM5(B) PERM4 (B), PERM4 (B)
#define PERM6(B) PERM5 (B), PERM5 (B)
v128qi
qi_dup_h_32 (v128qi x)
{
return __builtin_shuffle (x, x, (v128qi) { PERM6 (32) });
}
v64qi
qi_dup_s_16 (v64qi x)
{
return __builtin_shuffle (x, x, (v64qi) { PERM5 (16) });
}
v32qi
qi_dup_d_8 (v32qi x)
{
return __builtin_shuffle (x, x, (v32qi) { PERM4 (8) });
}
v64hi
hi_dup_s_16 (v64hi x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (16) });
}
v64hf
hf_dup_s_16 (v64hf x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (16) });
}
v64bf
bf_dup_s_16 (v64bf x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (16) });
}
v32hi
hi_dup_d_8 (v32hi x)
{
return __builtin_shuffle (x, x, (v32hi) { PERM4 (8) });
}
v32hf
hf_dup_d_8 (v32hf x)
{
return __builtin_shuffle (x, x, (v32hi) { PERM4 (8) });
}
v32bf
bf_dup_d_8 (v32bf x)
{
return __builtin_shuffle (x, x, (v32hi) { PERM4 (8) });
}
v32si
si_dup_d_8 (v32si x)
{
return __builtin_shuffle (x, x, (v32si) { PERM4 (8) });
}
v32sf
sf_dup_d_8 (v32sf x)
{
return __builtin_shuffle (x, x, (v32si) { PERM4 (8) });
}
/* { dg-final { scan-assembler-not {\tdup\tz} } } */

gcc/testsuite/gcc.target/aarch64/sve/ext_4.c

@@ -0,0 +1,353 @@
/* { dg-do assemble { target aarch64_asm_sve_ok } } */
/* { dg-options "-O -msve-vector-bits=2048 -mlittle-endian --save-temps" } */
/* { dg-final { check-function-bodies "**" "" } } */
typedef unsigned char v128qi __attribute__((vector_size(128)));
typedef unsigned char v64qi __attribute__((vector_size(64)));
typedef unsigned char v32qi __attribute__((vector_size(32)));
typedef unsigned short v64hi __attribute__((vector_size(128)));
typedef unsigned short v32hi __attribute__((vector_size(64)));
typedef _Float16 v64hf __attribute__((vector_size(128)));
typedef _Float16 v32hf __attribute__((vector_size(64)));
typedef __bf16 v64bf __attribute__((vector_size(128)));
typedef __bf16 v32bf __attribute__((vector_size(64)));
typedef unsigned int v32si __attribute__((vector_size(128)));
typedef float v32sf __attribute__((vector_size(128)));
#define PERM0(B) B, B + 1
#define PERM1(B) PERM0 (B), PERM0 (B + 2)
#define PERM2(B) PERM1 (B), PERM1 (B + 4)
#define PERM3(B) PERM2 (B), PERM2 (B + 8)
#define PERM4(B) PERM3 (B), PERM3 (B + 16)
#define PERM5(B) PERM4 (B), PERM4 (B + 32)
#define PERM6(B) PERM5 (B), PERM5 (B + 64)
/*
** qi_ext_h_1:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
** ext \2\.b, \2\.b, \2\.b, #2
** st1b \2\.h, \1, \[x8\]
** ret
*/
v128qi
qi_ext_h_1 (v128qi x)
{
return __builtin_shuffle (x, x, (v128qi) { PERM6 (1) });
}
/*
** qi_ext_h_1_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1b (z[0-9]+)\.h, \1/z, \[x1\]
** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
** ext \3\.b, \3\.b, \2\.b, #2
** st1b \3\.h, \1, \[x8\]
** |
** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
** ld1b (z[0-9]+)\.h, \1/z, \[x1\]
** ext \4\.b, \4\.b, \5\.b, #2
** st1b \4\.h, \1, \[x8\]
** )
** ret
*/
v128qi
qi_ext_h_1_two_op (v128qi x, v128qi y)
{
return __builtin_shuffle (x, y, (v128qi) { PERM6 (1) });
}
/*
** qi_ext_h_127:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
** ext \2\.b, \2\.b, \2\.b, #254
** st1b \2\.h, \1, \[x8\]
** ret
*/
v128qi
qi_ext_h_127 (v128qi x)
{
return __builtin_shuffle (x, x, (v128qi) { PERM6 (127) });
}
/*
** qi_ext_s_1:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.s, \1/z, \[x0\]
** ext \2\.b, \2\.b, \2\.b, #4
** st1b \2\.s, \1, \[x8\]
** ret
*/
v64qi
qi_ext_s_1 (v64qi x)
{
return __builtin_shuffle (x, x, (v64qi) { PERM5 (1) });
}
/*
** qi_ext_s_63:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.s, \1/z, \[x0\]
** ext \2\.b, \2\.b, \2\.b, #252
** st1b \2\.s, \1, \[x8\]
** ret
*/
v64qi
qi_ext_s_63 (v64qi x)
{
return __builtin_shuffle (x, x, (v64qi) { PERM5 (63) });
}
/*
** qi_ext_d_1:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.d, \1/z, \[x0\]
** ext \2\.b, \2\.b, \2\.b, #8
** st1b \2\.d, \1, \[x8\]
** ret
*/
v32qi
qi_ext_d_1 (v32qi x)
{
return __builtin_shuffle (x, x, (v32qi) { PERM4 (1) });
}
/*
** qi_ext_d_31:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.d, \1/z, \[x0\]
** ext \2\.b, \2\.b, \2\.b, #248
** st1b \2\.d, \1, \[x8\]
** ret
*/
v32qi
qi_ext_d_31 (v32qi x)
{
return __builtin_shuffle (x, x, (v32qi) { PERM4 (31) });
}
/*
** hi_ext_s_1:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** ext \2\.b, \2\.b, \2\.b, #4
** st1h \2\.s, \1, \[x8\]
** ret
*/
v64hi
hi_ext_s_1 (v64hi x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (1) });
}
/*
** hi_ext_s_63:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** ext \2\.b, \2\.b, \2\.b, #252
** st1h \2\.s, \1, \[x8\]
** ret
*/
v64hi
hi_ext_s_63 (v64hi x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (63) });
}
/*
** hf_ext_s_1:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** ext \2\.b, \2\.b, \2\.b, #4
** st1h \2\.s, \1, \[x8\]
** ret
*/
v64hf
hf_ext_s_1 (v64hf x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (1) });
}
/*
** hf_ext_s_60:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** ext \2\.b, \2\.b, \2\.b, #240
** st1h \2\.s, \1, \[x8\]
** ret
*/
v64hf
hf_ext_s_60 (v64hf x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (60) });
}
/*
** bf_ext_s_1:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** ext \2\.b, \2\.b, \2\.b, #4
** st1h \2\.s, \1, \[x8\]
** ret
*/
v64bf
bf_ext_s_1 (v64bf x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (1) });
}
/*
** bf_ext_s_40:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** ext \2\.b, \2\.b, \2\.b, #160
** st1h \2\.s, \1, \[x8\]
** ret
*/
v64bf
bf_ext_s_40 (v64bf x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (40) });
}
/*
** hi_ext_d_1:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** ext \2\.b, \2\.b, \2\.b, #8
** st1h \2\.d, \1, \[x8\]
** ret
*/
v32hi
hi_ext_d_1 (v32hi x)
{
return __builtin_shuffle (x, x, (v32hi) { PERM4 (1) });
}
/*
** hi_ext_d_31:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** ext \2\.b, \2\.b, \2\.b, #248
** st1h \2\.d, \1, \[x8\]
** ret
*/
v32hi
hi_ext_d_31 (v32hi x)
{
return __builtin_shuffle (x, x, (v32hi) { PERM4 (31) });
}
/*
** hf_ext_d_1:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** ext \2\.b, \2\.b, \2\.b, #8
** st1h \2\.d, \1, \[x8\]
** ret
*/
v32hf
hf_ext_d_1 (v32hf x)
{
return __builtin_shuffle (x, x, (v32hi) { PERM4 (1) });
}
/*
** hf_ext_d_18:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** ext \2\.b, \2\.b, \2\.b, #144
** st1h \2\.d, \1, \[x8\]
** ret
*/
v32hf
hf_ext_d_18 (v32hf x)
{
return __builtin_shuffle (x, x, (v32hi) { PERM4 (18) });
}
/*
** bf_ext_d_1:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** ext \2\.b, \2\.b, \2\.b, #8
** st1h \2\.d, \1, \[x8\]
** ret
*/
v32bf
bf_ext_d_1 (v32bf x)
{
return __builtin_shuffle (x, x, (v32hi) { PERM4 (1) });
}
/*
** bf_ext_d_7:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** ext \2\.b, \2\.b, \2\.b, #56
** st1h \2\.d, \1, \[x8\]
** ret
*/
v32bf
bf_ext_d_7 (v32bf x)
{
return __builtin_shuffle (x, x, (v32hi) { PERM4 (7) });
}
/*
** si_ext_d_1:
** ptrue (p[0-7])\.b, vl256
** ld1w (z[0-9]+)\.d, \1/z, \[x0\]
** ext \2\.b, \2\.b, \2\.b, #8
** st1w \2\.d, \1, \[x8\]
** ret
*/
v32si
si_ext_d_1 (v32si x)
{
return __builtin_shuffle (x, x, (v32si) { PERM4 (1) });
}
/*
** si_ext_d_31:
** ptrue (p[0-7])\.b, vl256
** ld1w (z[0-9]+)\.d, \1/z, \[x0\]
** ext \2\.b, \2\.b, \2\.b, #248
** st1w \2\.d, \1, \[x8\]
** ret
*/
v32si
si_ext_d_31 (v32si x)
{
return __builtin_shuffle (x, x, (v32si) { PERM4 (31) });
}
/*
** sf_ext_d_1:
** ptrue (p[0-7])\.b, vl256
** ld1w (z[0-9]+)\.d, \1/z, \[x0\]
** ext \2\.b, \2\.b, \2\.b, #8
** st1w \2\.d, \1, \[x8\]
** ret
*/
v32sf
sf_ext_d_1 (v32sf x)
{
return __builtin_shuffle (x, x, (v32si) { PERM4 (1) });
}
/*
** sf_ext_d_31:
** ptrue (p[0-7])\.b, vl256
** ld1w (z[0-9]+)\.d, \1/z, \[x0\]
** ext \2\.b, \2\.b, \2\.b, #248
** st1w \2\.d, \1, \[x8\]
** ret
*/
v32sf
sf_ext_d_31 (v32sf x)
{
return __builtin_shuffle (x, x, (v32si) { PERM4 (31) });
}

gcc/testsuite/gcc.target/aarch64/sve/rev_2.c

@@ -0,0 +1,177 @@
/* { dg-do assemble { target aarch64_asm_sve_ok } } */
/* { dg-options "-O -msve-vector-bits=2048 -mlittle-endian --save-temps" } */
/* { dg-final { check-function-bodies "**" "" } } */
typedef unsigned char v128qi __attribute__((vector_size(128)));
typedef unsigned char v64qi __attribute__((vector_size(64)));
typedef unsigned char v32qi __attribute__((vector_size(32)));
typedef unsigned short v64hi __attribute__((vector_size(128)));
typedef unsigned short v32hi __attribute__((vector_size(64)));
typedef _Float16 v64hf __attribute__((vector_size(128)));
typedef _Float16 v32hf __attribute__((vector_size(64)));
typedef __bf16 v64bf __attribute__((vector_size(128)));
typedef __bf16 v32bf __attribute__((vector_size(64)));
typedef unsigned int v32si __attribute__((vector_size(128)));
typedef float v32sf __attribute__((vector_size(128)));
#define PERM0(B) B, B - 1
#define PERM1(B) PERM0 (B), PERM0 (B - 2)
#define PERM2(B) PERM1 (B), PERM1 (B - 4)
#define PERM3(B) PERM2 (B), PERM2 (B - 8)
#define PERM4(B) PERM3 (B), PERM3 (B - 16)
#define PERM5(B) PERM4 (B), PERM4 (B - 32)
#define PERM6(B) PERM5 (B), PERM5 (B - 64)
/*
** qi_rev_h:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
** rev (z[0-9]+)\.h, \2\.h
** st1b \3\.h, \1, \[x8\]
** ret
*/
v128qi
qi_rev_h (v128qi x)
{
return __builtin_shuffle (x, x, (v128qi) { PERM6 (127) });
}
/*
** qi_rev_s:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.s, \1/z, \[x0\]
** rev (z[0-9]+)\.s, \2\.s
** st1b \3\.s, \1, \[x8\]
** ret
*/
v64qi
qi_rev_s (v64qi x)
{
return __builtin_shuffle (x, x, (v64qi) { PERM5 (63) });
}
/*
** qi_rev_d:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.d, \1/z, \[x0\]
** rev (z[0-9]+)\.d, \2\.d
** st1b \3\.d, \1, \[x8\]
** ret
*/
v32qi
qi_rev_d (v32qi x)
{
return __builtin_shuffle (x, x, (v32qi) { PERM4 (31) });
}
/*
** hi_rev_s:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** rev (z[0-9]+)\.s, \2\.s
** st1h \3\.s, \1, \[x8\]
** ret
*/
v64hi
hi_rev_s (v64hi x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (63) });
}
/*
** hf_rev_s:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** rev (z[0-9]+)\.s, \2\.s
** st1h \3\.s, \1, \[x8\]
** ret
*/
v64hf
hf_rev_s (v64hf x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (63) });
}
/*
** bf_rev_s:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** rev (z[0-9]+)\.s, \2\.s
** st1h \3\.s, \1, \[x8\]
** ret
*/
v64bf
bf_rev_s (v64bf x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (63) });
}
/*
** hi_rev_d:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** rev (z[0-9]+)\.d, \2\.d
** st1h \3\.d, \1, \[x8\]
** ret
*/
v32hi
hi_rev_d (v32hi x)
{
return __builtin_shuffle (x, x, (v32hi) { PERM4 (31) });
}
/*
** hf_rev_d:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** rev (z[0-9]+)\.d, \2\.d
** st1h \3\.d, \1, \[x8\]
** ret
*/
v32hf
hf_rev_d (v32hf x)
{
return __builtin_shuffle (x, x, (v32hi) { PERM4 (31) });
}
/*
** bf_rev_d:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** rev (z[0-9]+)\.d, \2\.d
** st1h \3\.d, \1, \[x8\]
** ret
*/
v32bf
bf_rev_d (v32bf x)
{
return __builtin_shuffle (x, x, (v32hi) { PERM4 (31) });
}
/*
** si_rev_d:
** ptrue (p[0-7])\.b, vl256
** ld1w (z[0-9]+)\.d, \1/z, \[x0\]
** rev (z[0-9]+)\.d, \2\.d
** st1w \3\.d, \1, \[x8\]
** ret
*/
v32si
si_rev_d (v32si x)
{
return __builtin_shuffle (x, x, (v32si) { PERM4 (31) });
}
/*
** sf_rev_d:
** ptrue (p[0-7])\.b, vl256
** ld1w (z[0-9]+)\.d, \1/z, \[x0\]
** rev (z[0-9]+)\.d, \2\.d
** st1w \3\.d, \1, \[x8\]
** ret
*/
v32sf
sf_rev_d (v32sf x)
{
return __builtin_shuffle (x, x, (v32si) { PERM4 (31) });
}

gcc/testsuite/gcc.target/aarch64/sve/revhw_1.c

@@ -0,0 +1,127 @@
/* { dg-do assemble { target aarch64_asm_sve_ok } } */
/* { dg-options "-O -msve-vector-bits=2048 -mlittle-endian --save-temps" } */
/* { dg-final { check-function-bodies "**" "" } } */
typedef unsigned char v128qi __attribute__((vector_size(128)));
typedef unsigned char v64qi __attribute__((vector_size(64)));
typedef unsigned short v64hi __attribute__((vector_size(128)));
typedef _Float16 v64hf __attribute__((vector_size(128)));
typedef __bf16 v64bf __attribute__((vector_size(128)));
#define PERM0(B) B + 1, B
#define PERM1(B) PERM0 (B), PERM0 (B + 2)
#define PERM2(B) PERM1 (B), PERM1 (B + 4)
#define PERM3(B) PERM2 (B), PERM2 (B + 8)
#define PERM4(B) PERM3 (B), PERM3 (B + 16)
#define PERM5(B) PERM4 (B), PERM4 (B + 32)
#define PERM6(B) PERM5 (B), PERM5 (B + 64)
/*
** qi_revh_s:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
** revh (z[0-9]+)\.s, \1/m, \2\.s
** st1b \3\.h, \1, \[x8\]
** ret
*/
v128qi
qi_revh_s (v128qi x)
{
return __builtin_shuffle (x, x, (v128qi) { PERM6 (0) });
}
/*
** qi_revw_d:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.s, \1/z, \[x0\]
** revw (z[0-9]+)\.d, \1/m, \2\.d
** st1b \3\.s, \1, \[x8\]
** ret
*/
v64qi
qi_revw_d (v64qi x)
{
return __builtin_shuffle (x, x, (v64qi) { PERM5 (0) });
}
/*
** hi_revw_d:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** revw (z[0-9]+)\.d, \1/m, \2\.d
** st1h \3\.s, \1, \[x8\]
** ret
*/
v64hi
hi_revw_d (v64hi x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (0) });
}
/*
** hf_revw_d:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** revw (z[0-9]+)\.d, \1/m, \2\.d
** st1h \3\.s, \1, \[x8\]
** ret
*/
v64hf
hf_revw_d (v64hf x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (0) });
}
/*
** bf_revw_d:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** revw (z[0-9]+)\.d, \1/m, \2\.d
** st1h \3\.s, \1, \[x8\]
** ret
*/
v64bf
bf_revw_d (v64bf x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (0) });
}
#undef PERM1
#define PERM1(B) PERM0 (B + 2), PERM0 (B)
/*
** qi_revh_d:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
** revh (z[0-9]+)\.d, \1/m, \2\.d
** st1b \3\.h, \1, \[x8\]
** ret
*/
v128qi
qi_revh_d (v128qi x)
{
return __builtin_shuffle (x, x, (v128qi) { PERM6 (0) });
}
v64qi
qi_revw_q (v64qi x)
{
return __builtin_shuffle (x, x, (v64qi) { PERM5 (0) });
}
v64hi
hi_revw_q (v64hi x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (0) });
}
#undef PERM2
#define PERM2(B) PERM0 (B + 4), PERM0 (B)
v128qi
qi_revh_q (v128qi x)
{
return __builtin_shuffle (x, x, (v128qi) { PERM6 (0) });
}
/* { dg-final { scan-assembler-times {\trev.\t} 6 } } */

gcc/testsuite/gcc.target/aarch64/sve/revhw_2.c

@@ -0,0 +1,127 @@
/* { dg-do assemble { target aarch64_asm_sve_ok } } */
/* { dg-options "-O -msve-vector-bits=2048 -mbig-endian --save-temps" } */
/* { dg-final { check-function-bodies "**" "" } } */
typedef unsigned char v128qi __attribute__((vector_size(128)));
typedef unsigned char v64qi __attribute__((vector_size(64)));
typedef unsigned short v64hi __attribute__((vector_size(128)));
typedef _Float16 v64hf __attribute__((vector_size(128)));
typedef __bf16 v64bf __attribute__((vector_size(128)));
#define PERM0(B) B + 1, B
#define PERM1(B) PERM0 (B), PERM0 (B + 2)
#define PERM2(B) PERM1 (B), PERM1 (B + 4)
#define PERM3(B) PERM2 (B), PERM2 (B + 8)
#define PERM4(B) PERM3 (B), PERM3 (B + 16)
#define PERM5(B) PERM4 (B), PERM4 (B + 32)
#define PERM6(B) PERM5 (B), PERM5 (B + 64)
/*
** qi_revh_s:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
** revh (z[0-9]+)\.s, \1/m, \2\.s
** st1b \3\.h, \1, \[x8\]
** ret
*/
v128qi
qi_revh_s (v128qi x)
{
return __builtin_shuffle (x, x, (v128qi) { PERM6 (0) });
}
/*
** qi_revw_d:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.s, \1/z, \[x0\]
** revw (z[0-9]+)\.d, \1/m, \2\.d
** st1b \3\.s, \1, \[x8\]
** ret
*/
v64qi
qi_revw_d (v64qi x)
{
return __builtin_shuffle (x, x, (v64qi) { PERM5 (0) });
}
/*
** hi_revw_d:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** revw (z[0-9]+)\.d, \1/m, \2\.d
** st1h \3\.s, \1, \[x8\]
** ret
*/
v64hi
hi_revw_d (v64hi x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (0) });
}
/*
** hf_revw_d:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** revw (z[0-9]+)\.d, \1/m, \2\.d
** st1h \3\.s, \1, \[x8\]
** ret
*/
v64hf
hf_revw_d (v64hf x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (0) });
}
/*
** bf_revw_d:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** revw (z[0-9]+)\.d, \1/m, \2\.d
** st1h \3\.s, \1, \[x8\]
** ret
*/
v64bf
bf_revw_d (v64bf x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (0) });
}
#undef PERM1
#define PERM1(B) PERM0 (B + 2), PERM0 (B)
/*
** qi_revh_d:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
** revh (z[0-9]+)\.d, \1/m, \2\.d
** st1b \3\.h, \1, \[x8\]
** ret
*/
v128qi
qi_revh_d (v128qi x)
{
return __builtin_shuffle (x, x, (v128qi) { PERM6 (0) });
}
v64qi
qi_revw_q (v64qi x)
{
return __builtin_shuffle (x, x, (v64qi) { PERM5 (0) });
}
v64hi
hi_revw_q (v64hi x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (0) });
}
#undef PERM2
#define PERM2(B) PERM0 (B + 4), PERM0 (B)
v128qi
qi_revh_q (v128qi x)
{
return __builtin_shuffle (x, x, (v128qi) { PERM6 (0) });
}
/* { dg-final { scan-assembler-times {\trev.\t} 6 } } */

gcc/testsuite/gcc.target/aarch64/sve/slp_perm_8.c

@@ -0,0 +1,18 @@
/* { dg-do compile } */
/* { dg-options "-O2 -ftree-vectorize" } */
void
f (short *restrict s, signed char *restrict c)
{
for (int i = 0; i < 8; i += 2)
{
s[i] = c[i];
s[i + 1] = c[i];
}
}
/* Ideally this would use LD1SB, but currently we use LD1B and
sign-extend it after the permute. */
/* { dg-final { scan-assembler {\tptrue\tp[0-7]\.h, vl6\n} } } */
/* { dg-final { scan-assembler {\tld1s?b\tz[0-9]+\.h} } } */
/* { dg-final { scan-assembler {\ttrn1\tz[0-9]+\.h,} } } */

gcc/testsuite/gcc.target/aarch64/sve/trn1_2.c

@@ -0,0 +1,403 @@
/* { dg-do assemble { target aarch64_asm_sve_ok } } */
/* { dg-options "-O -msve-vector-bits=2048 -mlittle-endian --save-temps" } */
/* { dg-final { check-function-bodies "**" "" } } */
typedef unsigned char v128qi __attribute__((vector_size(128)));
typedef unsigned char v64qi __attribute__((vector_size(64)));
typedef unsigned char v32qi __attribute__((vector_size(32)));
typedef unsigned short v64hi __attribute__((vector_size(128)));
typedef unsigned short v32hi __attribute__((vector_size(64)));
typedef _Float16 v64hf __attribute__((vector_size(128)));
typedef _Float16 v32hf __attribute__((vector_size(64)));
typedef __bf16 v64bf __attribute__((vector_size(128)));
typedef __bf16 v32bf __attribute__((vector_size(64)));
typedef unsigned int v32si __attribute__((vector_size(128)));
typedef float v32sf __attribute__((vector_size(128)));
#define PERM0(B, C) B, B + C
#define PERM1(B, C) PERM0 (B, C), PERM0 (B + 2, C)
#define PERM2(B, C) PERM1 (B, C), PERM1 (B + 4, C)
#define PERM3(B, C) PERM2 (B, C), PERM2 (B + 8, C)
#define PERM4(B, C) PERM3 (B, C), PERM3 (B + 16, C)
#define PERM5(B, C) PERM4 (B, C), PERM4 (B + 32, C)
#define PERM6(B, C) PERM5 (B, C), PERM5 (B + 64, C)
/*
** qi_trn1_h_a:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
** trn1 (z[0-9]+)\.h, \2\.h, \2\.h
** st1b \3\.h, \1, \[x8\]
** ret
*/
v128qi
qi_trn1_h_a (v128qi x)
{
return __builtin_shuffle (x, x, (v128qi) { PERM6 (0, 0) });
}
/*
** qi_trn1_h_b:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
** trn1 (z[0-9]+)\.h, \2\.h, \2\.h
** st1b \3\.h, \1, \[x8\]
** ret
*/
v128qi
qi_trn1_h_b (v128qi x)
{
return __builtin_shuffle (x, x, (v128qi) { PERM6 (0, 128) });
}
/*
** qi_trn1_h_c:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
** trn1 (z[0-9]+)\.h, \2\.h, \2\.h
** st1b \3\.h, \1, \[x8\]
** ret
*/
v128qi
qi_trn1_h_c (v128qi x)
{
return __builtin_shuffle (x, x, (v128qi) { PERM6 (128, 0) });
}
/*
** qi_trn1_h_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1b (z[0-9]+)\.h, \1/z, \[x1\]
** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
** trn1 \3\.h, \3\.h, \2\.h
** st1b \3\.h, \1, \[x8\]
** |
** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
** ld1b (z[0-9]+)\.h, \1/z, \[x1\]
** trn1 \4\.h, \4\.h, \5\.h
** st1b \4\.h, \1, \[x8\]
** )
** ret
*/
v128qi
qi_trn1_h_two_op (v128qi x, v128qi y)
{
return __builtin_shuffle (x, y, (v128qi) { PERM6 (0, 128) });
}
/*
** qi_trn1_s:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.s, \1/z, \[x0\]
** trn1 (z[0-9]+)\.s, \2\.s, \2\.s
** st1b \3\.s, \1, \[x8\]
** ret
*/
v64qi
qi_trn1_s (v64qi x)
{
return __builtin_shuffle (x, x, (v64qi) { PERM5 (0, 64) });
}
/*
** qi_trn1_s_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1b (z[0-9]+)\.s, \1/z, \[x1\]
** ld1b (z[0-9]+)\.s, \1/z, \[x0\]
** trn1 \3\.s, \3\.s, \2\.s
** st1b \3\.s, \1, \[x8\]
** |
** ld1b (z[0-9]+)\.s, \1/z, \[x0\]
** ld1b (z[0-9]+)\.s, \1/z, \[x1\]
** trn1 \4\.s, \4\.s, \5\.s
** st1b \4\.s, \1, \[x8\]
** )
** ret
*/
v64qi
qi_trn1_s_two_op (v64qi x, v64qi y)
{
return __builtin_shuffle (x, y, (v64qi) { PERM5 (0, 64) });
}
/*
** qi_trn1_d:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.d, \1/z, \[x0\]
** trn1 (z[0-9]+)\.d, \2\.d, \2\.d
** st1b \3\.d, \1, \[x8\]
** ret
*/
v32qi
qi_trn1_d (v32qi x)
{
return __builtin_shuffle (x, x, (v32qi) { PERM4 (0, 32) });
}
/*
** qi_trn1_d_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1b (z[0-9]+)\.d, \1/z, \[x1\]
** ld1b (z[0-9]+)\.d, \1/z, \[x0\]
** trn1 \3\.d, \3\.d, \2\.d
** st1b \3\.d, \1, \[x8\]
** |
** ld1b (z[0-9]+)\.d, \1/z, \[x0\]
** ld1b (z[0-9]+)\.d, \1/z, \[x1\]
** trn1 \4\.d, \4\.d, \5\.d
** st1b \4\.d, \1, \[x8\]
** )
** ret
*/
v32qi
qi_trn1_d_two_op (v32qi x, v32qi y)
{
return __builtin_shuffle (x, y, (v32qi) { PERM4 (0, 32) });
}
/*
** hi_trn1_s:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** trn1 (z[0-9]+)\.s, \2\.s, \2\.s
** st1h \3\.s, \1, \[x8\]
** ret
*/
v64hi
hi_trn1_s (v64hi x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (0, 64) });
}
/*
** hi_trn1_s_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** trn1 \3\.s, \3\.s, \2\.s
** st1h \3\.s, \1, \[x8\]
** |
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
** trn1 \4\.s, \4\.s, \5\.s
** st1h \4\.s, \1, \[x8\]
** )
** ret
*/
v64hi
hi_trn1_s_two_op (v64hi x, v64hi y)
{
return __builtin_shuffle (x, y, (v64hi) { PERM5 (0, 64) });
}
/*
** hf_trn1_s:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** trn1 (z[0-9]+)\.s, \2\.s, \2\.s
** st1h \3\.s, \1, \[x8\]
** ret
*/
v64hf
hf_trn1_s (v64hf x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (0, 64) });
}
/*
** hf_trn1_s_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** trn1 \3\.s, \3\.s, \2\.s
** st1h \3\.s, \1, \[x8\]
** |
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
** trn1 \4\.s, \4\.s, \5\.s
** st1h \4\.s, \1, \[x8\]
** )
** ret
*/
v64hf
hf_trn1_s_two_op (v64hf x, v64hf y)
{
return __builtin_shuffle (x, y, (v64hi) { PERM5 (0, 64) });
}
/*
** bf_trn1_s:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** trn1 (z[0-9]+)\.s, \2\.s, \2\.s
** st1h \3\.s, \1, \[x8\]
** ret
*/
v64bf
bf_trn1_s (v64bf x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (0, 64) });
}
/*
** bf_trn1_s_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** trn1 \3\.s, \3\.s, \2\.s
** st1h \3\.s, \1, \[x8\]
** |
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
** trn1 \4\.s, \4\.s, \5\.s
** st1h \4\.s, \1, \[x8\]
** )
** ret
*/
v64bf
bf_trn1_s_two_op (v64bf x, v64bf y)
{
return __builtin_shuffle (x, y, (v64hi) { PERM5 (0, 64) });
}
/*
** hi_trn1_d:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** trn1 (z[0-9]+)\.d, \2\.d, \2\.d
** st1h \3\.d, \1, \[x8\]
** ret
*/
v32hi
hi_trn1_d (v32hi x)
{
return __builtin_shuffle (x, x, (v32hi) { PERM4 (0, 32) });
}
/*
** hi_trn1_d_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** trn1 \3\.d, \3\.d, \2\.d
** st1h \3\.d, \1, \[x8\]
** |
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
** trn1 \4\.d, \4\.d, \5\.d
** st1h \4\.d, \1, \[x8\]
** )
** ret
*/
v32hi
hi_trn1_d_two_op (v32hi x, v32hi y)
{
return __builtin_shuffle (x, y, (v32hi) { PERM4 (0, 32) });
}
/*
** hf_trn1_d:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** trn1 (z[0-9]+)\.d, \2\.d, \2\.d
** st1h \3\.d, \1, \[x8\]
** ret
*/
v32hf
hf_trn1_d (v32hf x)
{
return __builtin_shuffle (x, x, (v32hi) { PERM4 (0, 32) });
}
/*
** hf_trn1_d_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** trn1 \3\.d, \3\.d, \2\.d
** st1h \3\.d, \1, \[x8\]
** |
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
** trn1 \4\.d, \4\.d, \5\.d
** st1h \4\.d, \1, \[x8\]
** )
** ret
*/
v32hf
hf_trn1_d_two_op (v32hf x, v32hf y)
{
return __builtin_shuffle (x, y, (v32hi) { PERM4 (0, 32) });
}
/*
** bf_trn1_d:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** trn1 (z[0-9]+)\.d, \2\.d, \2\.d
** st1h \3\.d, \1, \[x8\]
** ret
*/
v32bf
bf_trn1_d (v32bf x)
{
return __builtin_shuffle (x, x, (v32hi) { PERM4 (0, 32) });
}
/*
** bf_trn1_d_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** trn1 \3\.d, \3\.d, \2\.d
** st1h \3\.d, \1, \[x8\]
** |
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
** trn1 \4\.d, \4\.d, \5\.d
** st1h \4\.d, \1, \[x8\]
** )
** ret
*/
v32bf
bf_trn1_d_two_op (v32bf x, v32bf y)
{
return __builtin_shuffle (x, y, (v32hi) { PERM4 (0, 32) });
}
/*
** si_trn1_d:
** ptrue (p[0-7])\.b, vl256
** ld1w (z[0-9]+)\.d, \1/z, \[x0\]
** trn1 (z[0-9]+)\.d, \2\.d, \2\.d
** st1w \3\.d, \1, \[x8\]
** ret
*/
v32si
si_trn1_d (v32si x)
{
return __builtin_shuffle (x, x, (v32si) { PERM4 (0, 32) });
}
/*
** sf_trn1_d:
** ptrue (p[0-7])\.b, vl256
** ld1w (z[0-9]+)\.d, \1/z, \[x0\]
** trn1 (z[0-9]+)\.d, \2\.d, \2\.d
** st1w \3\.d, \1, \[x8\]
** ret
*/
v32sf
sf_trn1_d (v32sf x)
{
return __builtin_shuffle (x, x, (v32si) { PERM4 (0, 32) });
}

gcc/testsuite/gcc.target/aarch64/sve/trn2_2.c

@@ -0,0 +1,403 @@
/* { dg-do assemble { target aarch64_asm_sve_ok } } */
/* { dg-options "-O -msve-vector-bits=2048 -mlittle-endian --save-temps" } */
/* { dg-final { check-function-bodies "**" "" } } */
typedef unsigned char v128qi __attribute__((vector_size(128)));
typedef unsigned char v64qi __attribute__((vector_size(64)));
typedef unsigned char v32qi __attribute__((vector_size(32)));
typedef unsigned short v64hi __attribute__((vector_size(128)));
typedef unsigned short v32hi __attribute__((vector_size(64)));
typedef _Float16 v64hf __attribute__((vector_size(128)));
typedef _Float16 v32hf __attribute__((vector_size(64)));
typedef __bf16 v64bf __attribute__((vector_size(128)));
typedef __bf16 v32bf __attribute__((vector_size(64)));
typedef unsigned int v32si __attribute__((vector_size(128)));
typedef float v32sf __attribute__((vector_size(128)));
#define PERM0(B, C) B, B + C
#define PERM1(B, C) PERM0 (B, C), PERM0 (B + 2, C)
#define PERM2(B, C) PERM1 (B, C), PERM1 (B + 4, C)
#define PERM3(B, C) PERM2 (B, C), PERM2 (B + 8, C)
#define PERM4(B, C) PERM3 (B, C), PERM3 (B + 16, C)
#define PERM5(B, C) PERM4 (B, C), PERM4 (B + 32, C)
#define PERM6(B, C) PERM5 (B, C), PERM5 (B + 64, C)
/*
** qi_trn2_h_a:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
** trn2 (z[0-9]+)\.h, \2\.h, \2\.h
** st1b \3\.h, \1, \[x8\]
** ret
*/
v128qi
qi_trn2_h_a (v128qi x)
{
return __builtin_shuffle (x, x, (v128qi) { PERM6 (1, 0) });
}
/*
** qi_trn2_h_b:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
** trn2 (z[0-9]+)\.h, \2\.h, \2\.h
** st1b \3\.h, \1, \[x8\]
** ret
*/
v128qi
qi_trn2_h_b (v128qi x)
{
return __builtin_shuffle (x, x, (v128qi) { PERM6 (1, 128) });
}
/*
** qi_trn2_h_c:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
** trn2 (z[0-9]+)\.h, \2\.h, \2\.h
** st1b \3\.h, \1, \[x8\]
** ret
*/
v128qi
qi_trn2_h_c (v128qi x)
{
return __builtin_shuffle (x, x, (v128qi) { PERM6 (1, 0) });
}
/*
** qi_trn2_h_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1b (z[0-9]+)\.h, \1/z, \[x1\]
** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
** trn2 \3\.h, \3\.h, \2\.h
** st1b \3\.h, \1, \[x8\]
** |
** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
** ld1b (z[0-9]+)\.h, \1/z, \[x1\]
** trn2 \4\.h, \4\.h, \5\.h
** st1b \4\.h, \1, \[x8\]
** )
** ret
*/
v128qi
qi_trn2_h_two_op (v128qi x, v128qi y)
{
return __builtin_shuffle (x, y, (v128qi) { PERM6 (1, 128) });
}
/*
** qi_trn2_s:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.s, \1/z, \[x0\]
** trn2 (z[0-9]+)\.s, \2\.s, \2\.s
** st1b \3\.s, \1, \[x8\]
** ret
*/
v64qi
qi_trn2_s (v64qi x)
{
return __builtin_shuffle (x, x, (v64qi) { PERM5 (1, 64) });
}
/*
** qi_trn2_s_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1b (z[0-9]+)\.s, \1/z, \[x1\]
** ld1b (z[0-9]+)\.s, \1/z, \[x0\]
** trn2 \3\.s, \3\.s, \2\.s
** st1b \3\.s, \1, \[x8\]
** |
** ld1b (z[0-9]+)\.s, \1/z, \[x0\]
** ld1b (z[0-9]+)\.s, \1/z, \[x1\]
** trn2 \4\.s, \4\.s, \5\.s
** st1b \4\.s, \1, \[x8\]
** )
** ret
*/
v64qi
qi_trn2_s_two_op (v64qi x, v64qi y)
{
return __builtin_shuffle (x, y, (v64qi) { PERM5 (1, 64) });
}
/*
** qi_trn2_d:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.d, \1/z, \[x0\]
** trn2 (z[0-9]+)\.d, \2\.d, \2\.d
** st1b \3\.d, \1, \[x8\]
** ret
*/
v32qi
qi_trn2_d (v32qi x)
{
return __builtin_shuffle (x, x, (v32qi) { PERM4 (1, 32) });
}
/*
** qi_trn2_d_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1b (z[0-9]+)\.d, \1/z, \[x1\]
** ld1b (z[0-9]+)\.d, \1/z, \[x0\]
** trn2 \3\.d, \3\.d, \2\.d
** st1b \3\.d, \1, \[x8\]
** |
** ld1b (z[0-9]+)\.d, \1/z, \[x0\]
** ld1b (z[0-9]+)\.d, \1/z, \[x1\]
** trn2 \4\.d, \4\.d, \5\.d
** st1b \4\.d, \1, \[x8\]
** )
** ret
*/
v32qi
qi_trn2_d_two_op (v32qi x, v32qi y)
{
return __builtin_shuffle (x, y, (v32qi) { PERM4 (1, 32) });
}
/*
** hi_trn2_s:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** trn2 (z[0-9]+)\.s, \2\.s, \2\.s
** st1h \3\.s, \1, \[x8\]
** ret
*/
v64hi
hi_trn2_s (v64hi x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (1, 64) });
}
/*
** hi_trn2_s_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** trn2 \3\.s, \3\.s, \2\.s
** st1h \3\.s, \1, \[x8\]
** |
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
** trn2 \4\.s, \4\.s, \5\.s
** st1h \4\.s, \1, \[x8\]
** )
** ret
*/
v64hi
hi_trn2_s_two_op (v64hi x, v64hi y)
{
return __builtin_shuffle (x, y, (v64hi) { PERM5 (1, 64) });
}
/*
** hf_trn2_s:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** trn2 (z[0-9]+)\.s, \2\.s, \2\.s
** st1h \3\.s, \1, \[x8\]
** ret
*/
v64hf
hf_trn2_s (v64hf x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (1, 64) });
}
/*
** hf_trn2_s_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** trn2 \3\.s, \3\.s, \2\.s
** st1h \3\.s, \1, \[x8\]
** |
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
** trn2 \4\.s, \4\.s, \5\.s
** st1h \4\.s, \1, \[x8\]
** )
** ret
*/
v64hf
hf_trn2_s_two_op (v64hf x, v64hf y)
{
return __builtin_shuffle (x, y, (v64hi) { PERM5 (1, 64) });
}
/*
** bf_trn2_s:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** trn2 (z[0-9]+)\.s, \2\.s, \2\.s
** st1h \3\.s, \1, \[x8\]
** ret
*/
v64bf
bf_trn2_s (v64bf x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (1, 64) });
}
/*
** bf_trn2_s_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** trn2 \3\.s, \3\.s, \2\.s
** st1h \3\.s, \1, \[x8\]
** |
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
** trn2 \4\.s, \4\.s, \5\.s
** st1h \4\.s, \1, \[x8\]
** )
** ret
*/
v64bf
bf_trn2_s_two_op (v64bf x, v64bf y)
{
return __builtin_shuffle (x, y, (v64hi) { PERM5 (1, 64) });
}
/*
** hi_trn2_d:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** trn2 (z[0-9]+)\.d, \2\.d, \2\.d
** st1h \3\.d, \1, \[x8\]
** ret
*/
v32hi
hi_trn2_d (v32hi x)
{
return __builtin_shuffle (x, x, (v32hi) { PERM4 (1, 32) });
}
/*
** hi_trn2_d_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** trn2 \3\.d, \3\.d, \2\.d
** st1h \3\.d, \1, \[x8\]
** |
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
** trn2 \4\.d, \4\.d, \5\.d
** st1h \4\.d, \1, \[x8\]
** )
** ret
*/
v32hi
hi_trn2_d_two_op (v32hi x, v32hi y)
{
return __builtin_shuffle (x, y, (v32hi) { PERM4 (1, 32) });
}
/*
** hf_trn2_d:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** trn2 (z[0-9]+)\.d, \2\.d, \2\.d
** st1h \3\.d, \1, \[x8\]
** ret
*/
v32hf
hf_trn2_d (v32hf x)
{
return __builtin_shuffle (x, x, (v32hi) { PERM4 (1, 32) });
}
/*
** hf_trn2_d_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** trn2 \3\.d, \3\.d, \2\.d
** st1h \3\.d, \1, \[x8\]
** |
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
** trn2 \4\.d, \4\.d, \5\.d
** st1h \4\.d, \1, \[x8\]
** )
** ret
*/
v32hf
hf_trn2_d_two_op (v32hf x, v32hf y)
{
return __builtin_shuffle (x, y, (v32hi) { PERM4 (1, 32) });
}
/*
** bf_trn2_d:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** trn2 (z[0-9]+)\.d, \2\.d, \2\.d
** st1h \3\.d, \1, \[x8\]
** ret
*/
v32bf
bf_trn2_d (v32bf x)
{
return __builtin_shuffle (x, x, (v32hi) { PERM4 (1, 32) });
}
/*
** bf_trn2_d_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** trn2 \3\.d, \3\.d, \2\.d
** st1h \3\.d, \1, \[x8\]
** |
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
** trn2 \4\.d, \4\.d, \5\.d
** st1h \4\.d, \1, \[x8\]
** )
** ret
*/
v32bf
bf_trn2_d_two_op (v32bf x, v32bf y)
{
return __builtin_shuffle (x, y, (v32hi) { PERM4 (1, 32) });
}
/*
** si_trn2_d:
** ptrue (p[0-7])\.b, vl256
** ld1w (z[0-9]+)\.d, \1/z, \[x0\]
** trn2 (z[0-9]+)\.d, \2\.d, \2\.d
** st1w \3\.d, \1, \[x8\]
** ret
*/
v32si
si_trn2_d (v32si x)
{
return __builtin_shuffle (x, x, (v32si) { PERM4 (1, 32) });
}
/*
** sf_trn2_d:
** ptrue (p[0-7])\.b, vl256
** ld1w (z[0-9]+)\.d, \1/z, \[x0\]
** trn2 (z[0-9]+)\.d, \2\.d, \2\.d
** st1w \3\.d, \1, \[x8\]
** ret
*/
v32sf
sf_trn2_d (v32sf x)
{
return __builtin_shuffle (x, x, (v32si) { PERM4 (1, 32) });
}
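/* Note on the masks above: TRN2 interleaves the odd-numbered container
   lanes of its two inputs, so for 32-lane operands a mask such as
   PERM4 (1, 32) is expected to enumerate 1, 33, 3, 35, ..., 31, 63
   (this assumes the PERM macros defined earlier in this file expand to
   the pairs 2k + 1, 2k + 1 + C, which is what the calls here suggest).  */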

View file

@@ -0,0 +1,375 @@
/* { dg-do assemble { target aarch64_asm_sve_ok } } */
/* { dg-options "-O -msve-vector-bits=2048 -mlittle-endian --save-temps" } */
/* { dg-final { check-function-bodies "**" "" } } */
typedef unsigned char v128qi __attribute__((vector_size(128)));
typedef unsigned char v64qi __attribute__((vector_size(64)));
typedef unsigned char v32qi __attribute__((vector_size(32)));
typedef unsigned short v64hi __attribute__((vector_size(128)));
typedef unsigned short v32hi __attribute__((vector_size(64)));
typedef _Float16 v64hf __attribute__((vector_size(128)));
typedef _Float16 v32hf __attribute__((vector_size(64)));
typedef __bf16 v64bf __attribute__((vector_size(128)));
typedef __bf16 v32bf __attribute__((vector_size(64)));
typedef unsigned int v32si __attribute__((vector_size(128)));
typedef float v32sf __attribute__((vector_size(128)));
#define PERM0(B) B, B + 2
#define PERM1(B) PERM0 (B), PERM0 (B + 4)
#define PERM2(B) PERM1 (B), PERM1 (B + 8)
#define PERM3(B) PERM2 (B), PERM2 (B + 16)
#define PERM4(B) PERM3 (B), PERM3 (B + 32)
#define PERM5(B) PERM4 (B), PERM4 (B + 64)
#define PERM6(B) PERM5 (B), PERM5 (B + 128)
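/* Worked example of the index masks used below: each PERM<n> (B) expands
   to an even-strided run starting at B, so PERM2 (0) is
   0, 2, 4, 6, 8, 10, 12, 14.  Picking every second lane of the
   concatenated { x, y }, starting at lane 0, is the UZP1 operation,
   which is why these shuffles should map to the uzp1 patterns checked
   above each function.  */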
/*
** qi_uzp1_h:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
** uzp1 (z[0-9]+)\.h, \2\.h, \2\.h
** st1b \3\.h, \1, \[x8\]
** ret
*/
v128qi
qi_uzp1_h (v128qi x)
{
return __builtin_shuffle (x, x, (v128qi) { PERM6 (0) });
}
/*
** qi_uzp1_h_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1b (z[0-9]+)\.h, \1/z, \[x1\]
** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
** uzp1 \3\.h, \3\.h, \2\.h
** st1b \3\.h, \1, \[x8\]
** |
** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
** ld1b (z[0-9]+)\.h, \1/z, \[x1\]
** uzp1 \4\.h, \4\.h, \5\.h
** st1b \4\.h, \1, \[x8\]
** )
** ret
*/
v128qi
qi_uzp1_h_two_op (v128qi x, v128qi y)
{
return __builtin_shuffle (x, y, (v128qi) { PERM6 (0) });
}
/*
** qi_uzp1_s:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.s, \1/z, \[x0\]
** uzp1 (z[0-9]+)\.s, \2\.s, \2\.s
** st1b \3\.s, \1, \[x8\]
** ret
*/
v64qi
qi_uzp1_s (v64qi x)
{
return __builtin_shuffle (x, x, (v64qi) { PERM5 (0) });
}
/*
** qi_uzp1_s_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1b (z[0-9]+)\.s, \1/z, \[x1\]
** ld1b (z[0-9]+)\.s, \1/z, \[x0\]
** uzp1 \3\.s, \3\.s, \2\.s
** st1b \3\.s, \1, \[x8\]
** |
** ld1b (z[0-9]+)\.s, \1/z, \[x0\]
** ld1b (z[0-9]+)\.s, \1/z, \[x1\]
** uzp1 \4\.s, \4\.s, \5\.s
** st1b \4\.s, \1, \[x8\]
** )
** ret
*/
v64qi
qi_uzp1_s_two_op (v64qi x, v64qi y)
{
return __builtin_shuffle (x, y, (v64qi) { PERM5 (0) });
}
/*
** qi_uzp1_d:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.d, \1/z, \[x0\]
** uzp1 (z[0-9]+)\.d, \2\.d, \2\.d
** st1b \3\.d, \1, \[x8\]
** ret
*/
v32qi
qi_uzp1_d (v32qi x)
{
return __builtin_shuffle (x, x, (v32qi) { PERM4 (0) });
}
/*
** qi_uzp1_d_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1b (z[0-9]+)\.d, \1/z, \[x1\]
** ld1b (z[0-9]+)\.d, \1/z, \[x0\]
** uzp1 \3\.d, \3\.d, \2\.d
** st1b \3\.d, \1, \[x8\]
** |
** ld1b (z[0-9]+)\.d, \1/z, \[x0\]
** ld1b (z[0-9]+)\.d, \1/z, \[x1\]
** uzp1 \4\.d, \4\.d, \5\.d
** st1b \4\.d, \1, \[x8\]
** )
** ret
*/
v32qi
qi_uzp1_d_two_op (v32qi x, v32qi y)
{
return __builtin_shuffle (x, y, (v32qi) { PERM4 (0) });
}
/*
** hi_uzp1_s:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** uzp1 (z[0-9]+)\.s, \2\.s, \2\.s
** st1h \3\.s, \1, \[x8\]
** ret
*/
v64hi
hi_uzp1_s (v64hi x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (0) });
}
/*
** hi_uzp1_s_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** uzp1 \3\.s, \3\.s, \2\.s
** st1h \3\.s, \1, \[x8\]
** |
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
** uzp1 \4\.s, \4\.s, \5\.s
** st1h \4\.s, \1, \[x8\]
** )
** ret
*/
v64hi
hi_uzp1_s_two_op (v64hi x, v64hi y)
{
return __builtin_shuffle (x, y, (v64hi) { PERM5 (0) });
}
/*
** hf_uzp1_s:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** uzp1 (z[0-9]+)\.s, \2\.s, \2\.s
** st1h \3\.s, \1, \[x8\]
** ret
*/
v64hf
hf_uzp1_s (v64hf x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (0) });
}
/*
** hf_uzp1_s_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** uzp1 \3\.s, \3\.s, \2\.s
** st1h \3\.s, \1, \[x8\]
** |
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
** uzp1 \4\.s, \4\.s, \5\.s
** st1h \4\.s, \1, \[x8\]
** )
** ret
*/
v64hf
hf_uzp1_s_two_op (v64hf x, v64hf y)
{
return __builtin_shuffle (x, y, (v64hi) { PERM5 (0) });
}
/*
** bf_uzp1_s:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** uzp1 (z[0-9]+)\.s, \2\.s, \2\.s
** st1h \3\.s, \1, \[x8\]
** ret
*/
v64bf
bf_uzp1_s (v64bf x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (0) });
}
/*
** bf_uzp1_s_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** uzp1 \3\.s, \3\.s, \2\.s
** st1h \3\.s, \1, \[x8\]
** |
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
** uzp1 \4\.s, \4\.s, \5\.s
** st1h \4\.s, \1, \[x8\]
** )
** ret
*/
v64bf
bf_uzp1_s_two_op (v64bf x, v64bf y)
{
return __builtin_shuffle (x, y, (v64hi) { PERM5 (0) });
}
/*
** hi_uzp1_d:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** uzp1 (z[0-9]+)\.d, \2\.d, \2\.d
** st1h \3\.d, \1, \[x8\]
** ret
*/
v32hi
hi_uzp1_d (v32hi x)
{
return __builtin_shuffle (x, x, (v32hi) { PERM4 (0) });
}
/*
** hi_uzp1_d_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** uzp1 \3\.d, \3\.d, \2\.d
** st1h \3\.d, \1, \[x8\]
** |
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
** uzp1 \4\.d, \4\.d, \5\.d
** st1h \4\.d, \1, \[x8\]
** )
** ret
*/
v32hi
hi_uzp1_d_two_op (v32hi x, v32hi y)
{
return __builtin_shuffle (x, y, (v32hi) { PERM4 (0) });
}
/*
** hf_uzp1_d:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** uzp1 (z[0-9]+)\.d, \2\.d, \2\.d
** st1h \3\.d, \1, \[x8\]
** ret
*/
v32hf
hf_uzp1_d (v32hf x)
{
return __builtin_shuffle (x, x, (v32hi) { PERM4 (0) });
}
/*
** hf_uzp1_d_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** uzp1 \3\.d, \3\.d, \2\.d
** st1h \3\.d, \1, \[x8\]
** |
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
** uzp1 \4\.d, \4\.d, \5\.d
** st1h \4\.d, \1, \[x8\]
** )
** ret
*/
v32hf
hf_uzp1_d_two_op (v32hf x, v32hf y)
{
return __builtin_shuffle (x, y, (v32hi) { PERM4 (0) });
}
/*
** bf_uzp1_d:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** uzp1 (z[0-9]+)\.d, \2\.d, \2\.d
** st1h \3\.d, \1, \[x8\]
** ret
*/
v32bf
bf_uzp1_d (v32bf x)
{
return __builtin_shuffle (x, x, (v32hi) { PERM4 (0) });
}
/*
** bf_uzp1_d_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** uzp1 \3\.d, \3\.d, \2\.d
** st1h \3\.d, \1, \[x8\]
** |
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
** uzp1 \4\.d, \4\.d, \5\.d
** st1h \4\.d, \1, \[x8\]
** )
** ret
*/
v32bf
bf_uzp1_d_two_op (v32bf x, v32bf y)
{
return __builtin_shuffle (x, y, (v32hi) { PERM4 (0) });
}
/*
** si_uzp1_d:
** ptrue (p[0-7])\.b, vl256
** ld1w (z[0-9]+)\.d, \1/z, \[x0\]
** uzp1 (z[0-9]+)\.d, \2\.d, \2\.d
** st1w \3\.d, \1, \[x8\]
** ret
*/
v32si
si_uzp1_d (v32si x)
{
return __builtin_shuffle (x, x, (v32si) { PERM4 (0) });
}
/*
** sf_uzp1_d:
** ptrue (p[0-7])\.b, vl256
** ld1w (z[0-9]+)\.d, \1/z, \[x0\]
** uzp1 (z[0-9]+)\.d, \2\.d, \2\.d
** st1w \3\.d, \1, \[x8\]
** ret
*/
v32sf
sf_uzp1_d (v32sf x)
{
return __builtin_shuffle (x, x, (v32si) { PERM4 (0) });
}

View file

@@ -0,0 +1,375 @@
/* { dg-do assemble { target aarch64_asm_sve_ok } } */
/* { dg-options "-O -msve-vector-bits=2048 -mlittle-endian --save-temps" } */
/* { dg-final { check-function-bodies "**" "" } } */
typedef unsigned char v128qi __attribute__((vector_size(128)));
typedef unsigned char v64qi __attribute__((vector_size(64)));
typedef unsigned char v32qi __attribute__((vector_size(32)));
typedef unsigned short v64hi __attribute__((vector_size(128)));
typedef unsigned short v32hi __attribute__((vector_size(64)));
typedef _Float16 v64hf __attribute__((vector_size(128)));
typedef _Float16 v32hf __attribute__((vector_size(64)));
typedef __bf16 v64bf __attribute__((vector_size(128)));
typedef __bf16 v32bf __attribute__((vector_size(64)));
typedef unsigned int v32si __attribute__((vector_size(128)));
typedef float v32sf __attribute__((vector_size(128)));
#define PERM0(B) B, B + 2
#define PERM1(B) PERM0 (B), PERM0 (B + 4)
#define PERM2(B) PERM1 (B), PERM1 (B + 8)
#define PERM3(B) PERM2 (B), PERM2 (B + 16)
#define PERM4(B) PERM3 (B), PERM3 (B + 32)
#define PERM5(B) PERM4 (B), PERM4 (B + 64)
#define PERM6(B) PERM5 (B), PERM5 (B + 128)
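/* Worked example: with the same macros as in uzp1_2.c, PERM<n> (1)
   expands to the odd indices, so PERM2 (1) is
   1, 3, 5, 7, 9, 11, 13, 15.  Picking every second lane of the
   concatenated { x, y }, starting at lane 1, is the UZP2 operation.  */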
/*
** qi_uzp2_h:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
** uzp2 (z[0-9]+)\.h, \2\.h, \2\.h
** st1b \3\.h, \1, \[x8\]
** ret
*/
v128qi
qi_uzp2_h (v128qi x)
{
return __builtin_shuffle (x, x, (v128qi) { PERM6 (1) });
}
/*
** qi_uzp2_h_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1b (z[0-9]+)\.h, \1/z, \[x1\]
** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
** uzp2 \3\.h, \3\.h, \2\.h
** st1b \3\.h, \1, \[x8\]
** |
** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
** ld1b (z[0-9]+)\.h, \1/z, \[x1\]
** uzp2 \4\.h, \4\.h, \5\.h
** st1b \4\.h, \1, \[x8\]
** )
** ret
*/
v128qi
qi_uzp2_h_two_op (v128qi x, v128qi y)
{
return __builtin_shuffle (x, y, (v128qi) { PERM6 (1) });
}
/*
** qi_uzp2_s:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.s, \1/z, \[x0\]
** uzp2 (z[0-9]+)\.s, \2\.s, \2\.s
** st1b \3\.s, \1, \[x8\]
** ret
*/
v64qi
qi_uzp2_s (v64qi x)
{
return __builtin_shuffle (x, x, (v64qi) { PERM5 (1) });
}
/*
** qi_uzp2_s_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1b (z[0-9]+)\.s, \1/z, \[x1\]
** ld1b (z[0-9]+)\.s, \1/z, \[x0\]
** uzp2 \3\.s, \3\.s, \2\.s
** st1b \3\.s, \1, \[x8\]
** |
** ld1b (z[0-9]+)\.s, \1/z, \[x0\]
** ld1b (z[0-9]+)\.s, \1/z, \[x1\]
** uzp2 \4\.s, \4\.s, \5\.s
** st1b \4\.s, \1, \[x8\]
** )
** ret
*/
v64qi
qi_uzp2_s_two_op (v64qi x, v64qi y)
{
return __builtin_shuffle (x, y, (v64qi) { PERM5 (1) });
}
/*
** qi_uzp2_d:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.d, \1/z, \[x0\]
** uzp2 (z[0-9]+)\.d, \2\.d, \2\.d
** st1b \3\.d, \1, \[x8\]
** ret
*/
v32qi
qi_uzp2_d (v32qi x)
{
return __builtin_shuffle (x, x, (v32qi) { PERM4 (1) });
}
/*
** qi_uzp2_d_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1b (z[0-9]+)\.d, \1/z, \[x1\]
** ld1b (z[0-9]+)\.d, \1/z, \[x0\]
** uzp2 \3\.d, \3\.d, \2\.d
** st1b \3\.d, \1, \[x8\]
** |
** ld1b (z[0-9]+)\.d, \1/z, \[x0\]
** ld1b (z[0-9]+)\.d, \1/z, \[x1\]
** uzp2 \4\.d, \4\.d, \5\.d
** st1b \4\.d, \1, \[x8\]
** )
** ret
*/
v32qi
qi_uzp2_d_two_op (v32qi x, v32qi y)
{
return __builtin_shuffle (x, y, (v32qi) { PERM4 (1) });
}
/*
** hi_uzp2_s:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** uzp2 (z[0-9]+)\.s, \2\.s, \2\.s
** st1h \3\.s, \1, \[x8\]
** ret
*/
v64hi
hi_uzp2_s (v64hi x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (1) });
}
/*
** hi_uzp2_s_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** uzp2 \3\.s, \3\.s, \2\.s
** st1h \3\.s, \1, \[x8\]
** |
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
** uzp2 \4\.s, \4\.s, \5\.s
** st1h \4\.s, \1, \[x8\]
** )
** ret
*/
v64hi
hi_uzp2_s_two_op (v64hi x, v64hi y)
{
return __builtin_shuffle (x, y, (v64hi) { PERM5 (1) });
}
/*
** hf_uzp2_s:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** uzp2 (z[0-9]+)\.s, \2\.s, \2\.s
** st1h \3\.s, \1, \[x8\]
** ret
*/
v64hf
hf_uzp2_s (v64hf x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (1) });
}
/*
** hf_uzp2_s_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** uzp2 \3\.s, \3\.s, \2\.s
** st1h \3\.s, \1, \[x8\]
** |
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
** uzp2 \4\.s, \4\.s, \5\.s
** st1h \4\.s, \1, \[x8\]
** )
** ret
*/
v64hf
hf_uzp2_s_two_op (v64hf x, v64hf y)
{
return __builtin_shuffle (x, y, (v64hi) { PERM5 (1) });
}
/*
** bf_uzp2_s:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** uzp2 (z[0-9]+)\.s, \2\.s, \2\.s
** st1h \3\.s, \1, \[x8\]
** ret
*/
v64bf
bf_uzp2_s (v64bf x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (1) });
}
/*
** bf_uzp2_s_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** uzp2 \3\.s, \3\.s, \2\.s
** st1h \3\.s, \1, \[x8\]
** |
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
** uzp2 \4\.s, \4\.s, \5\.s
** st1h \4\.s, \1, \[x8\]
** )
** ret
*/
v64bf
bf_uzp2_s_two_op (v64bf x, v64bf y)
{
return __builtin_shuffle (x, y, (v64hi) { PERM5 (1) });
}
/*
** hi_uzp2_d:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** uzp2 (z[0-9]+)\.d, \2\.d, \2\.d
** st1h \3\.d, \1, \[x8\]
** ret
*/
v32hi
hi_uzp2_d (v32hi x)
{
return __builtin_shuffle (x, x, (v32hi) { PERM4 (1) });
}
/*
** hi_uzp2_d_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** uzp2 \3\.d, \3\.d, \2\.d
** st1h \3\.d, \1, \[x8\]
** |
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
** uzp2 \4\.d, \4\.d, \5\.d
** st1h \4\.d, \1, \[x8\]
** )
** ret
*/
v32hi
hi_uzp2_d_two_op (v32hi x, v32hi y)
{
return __builtin_shuffle (x, y, (v32hi) { PERM4 (1) });
}
/*
** hf_uzp2_d:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** uzp2 (z[0-9]+)\.d, \2\.d, \2\.d
** st1h \3\.d, \1, \[x8\]
** ret
*/
v32hf
hf_uzp2_d (v32hf x)
{
return __builtin_shuffle (x, x, (v32hi) { PERM4 (1) });
}
/*
** hf_uzp2_d_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** uzp2 \3\.d, \3\.d, \2\.d
** st1h \3\.d, \1, \[x8\]
** |
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
** uzp2 \4\.d, \4\.d, \5\.d
** st1h \4\.d, \1, \[x8\]
** )
** ret
*/
v32hf
hf_uzp2_d_two_op (v32hf x, v32hf y)
{
return __builtin_shuffle (x, y, (v32hi) { PERM4 (1) });
}
/*
** bf_uzp2_d:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** uzp2 (z[0-9]+)\.d, \2\.d, \2\.d
** st1h \3\.d, \1, \[x8\]
** ret
*/
v32bf
bf_uzp2_d (v32bf x)
{
return __builtin_shuffle (x, x, (v32hi) { PERM4 (1) });
}
/*
** bf_uzp2_d_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** uzp2 \3\.d, \3\.d, \2\.d
** st1h \3\.d, \1, \[x8\]
** |
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
** uzp2 \4\.d, \4\.d, \5\.d
** st1h \4\.d, \1, \[x8\]
** )
** ret
*/
v32bf
bf_uzp2_d_two_op (v32bf x, v32bf y)
{
return __builtin_shuffle (x, y, (v32hi) { PERM4 (1) });
}
/*
** si_uzp2_d:
** ptrue (p[0-7])\.b, vl256
** ld1w (z[0-9]+)\.d, \1/z, \[x0\]
** uzp2 (z[0-9]+)\.d, \2\.d, \2\.d
** st1w \3\.d, \1, \[x8\]
** ret
*/
v32si
si_uzp2_d (v32si x)
{
return __builtin_shuffle (x, x, (v32si) { PERM4 (1) });
}
/*
** sf_uzp2_d:
** ptrue (p[0-7])\.b, vl256
** ld1w (z[0-9]+)\.d, \1/z, \[x0\]
** uzp2 (z[0-9]+)\.d, \2\.d, \2\.d
** st1w \3\.d, \1, \[x8\]
** ret
*/
v32sf
sf_uzp2_d (v32sf x)
{
return __builtin_shuffle (x, x, (v32si) { PERM4 (1) });
}

View file

@@ -0,0 +1,403 @@
/* { dg-do assemble { target aarch64_asm_sve_ok } } */
/* { dg-options "-O -msve-vector-bits=2048 -mlittle-endian --save-temps" } */
/* { dg-final { check-function-bodies "**" "" } } */
typedef unsigned char v128qi __attribute__((vector_size(128)));
typedef unsigned char v64qi __attribute__((vector_size(64)));
typedef unsigned char v32qi __attribute__((vector_size(32)));
typedef unsigned short v64hi __attribute__((vector_size(128)));
typedef unsigned short v32hi __attribute__((vector_size(64)));
typedef _Float16 v64hf __attribute__((vector_size(128)));
typedef _Float16 v32hf __attribute__((vector_size(64)));
typedef __bf16 v64bf __attribute__((vector_size(128)));
typedef __bf16 v32bf __attribute__((vector_size(64)));
typedef unsigned int v32si __attribute__((vector_size(128)));
typedef float v32sf __attribute__((vector_size(128)));
#define PERM0(B, C) B, B + C
#define PERM1(B, C) PERM0 (B, C), PERM0 (B + 1, C)
#define PERM2(B, C) PERM1 (B, C), PERM1 (B + 2, C)
#define PERM3(B, C) PERM2 (B, C), PERM2 (B + 4, C)
#define PERM4(B, C) PERM3 (B, C), PERM3 (B + 8, C)
#define PERM5(B, C) PERM4 (B, C), PERM4 (B + 16, C)
#define PERM6(B, C) PERM5 (B, C), PERM5 (B + 32, C)
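/* Worked example: PERM<n> (B, C) expands to the pairs B + k, B + k + C
   for consecutive k, so PERM2 (0, 8) is 0, 8, 1, 9, 2, 10, 3, 11 for
   8-lane operands.  With C equal to the number of lanes in one operand,
   this interleaves the low halves of x and y, which is the ZIP1
   operation; the (0, 0), (0, 128) and (128, 0) masks used for the
   qi_zip1_h tests below all describe the same lane selection because
   both shuffle operands are x.  */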
/*
** qi_zip1_h_a:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
** zip1 (z[0-9]+)\.h, \2\.h, \2\.h
** st1b \3\.h, \1, \[x8\]
** ret
*/
v128qi
qi_zip1_h_a (v128qi x)
{
return __builtin_shuffle (x, x, (v128qi) { PERM6 (0, 0) });
}
/*
** qi_zip1_h_b:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
** zip1 (z[0-9]+)\.h, \2\.h, \2\.h
** st1b \3\.h, \1, \[x8\]
** ret
*/
v128qi
qi_zip1_h_b (v128qi x)
{
return __builtin_shuffle (x, x, (v128qi) { PERM6 (0, 128) });
}
/*
** qi_zip1_h_c:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
** zip1 (z[0-9]+)\.h, \2\.h, \2\.h
** st1b \3\.h, \1, \[x8\]
** ret
*/
v128qi
qi_zip1_h_c (v128qi x)
{
return __builtin_shuffle (x, x, (v128qi) { PERM6 (128, 0) });
}
/*
** qi_zip1_h_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1b (z[0-9]+)\.h, \1/z, \[x1\]
** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
** zip1 \3\.h, \3\.h, \2\.h
** st1b \3\.h, \1, \[x8\]
** |
** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
** ld1b (z[0-9]+)\.h, \1/z, \[x1\]
** zip1 \4\.h, \4\.h, \5\.h
** st1b \4\.h, \1, \[x8\]
** )
** ret
*/
v128qi
qi_zip1_h_two_op (v128qi x, v128qi y)
{
return __builtin_shuffle (x, y, (v128qi) { PERM6 (0, 128) });
}
/*
** qi_zip1_s:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.s, \1/z, \[x0\]
** zip1 (z[0-9]+)\.s, \2\.s, \2\.s
** st1b \3\.s, \1, \[x8\]
** ret
*/
v64qi
qi_zip1_s (v64qi x)
{
return __builtin_shuffle (x, x, (v64qi) { PERM5 (0, 64) });
}
/*
** qi_zip1_s_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1b (z[0-9]+)\.s, \1/z, \[x1\]
** ld1b (z[0-9]+)\.s, \1/z, \[x0\]
** zip1 \3\.s, \3\.s, \2\.s
** st1b \3\.s, \1, \[x8\]
** |
** ld1b (z[0-9]+)\.s, \1/z, \[x0\]
** ld1b (z[0-9]+)\.s, \1/z, \[x1\]
** zip1 \4\.s, \4\.s, \5\.s
** st1b \4\.s, \1, \[x8\]
** )
** ret
*/
v64qi
qi_zip1_s_two_op (v64qi x, v64qi y)
{
return __builtin_shuffle (x, y, (v64qi) { PERM5 (0, 64) });
}
/*
** qi_zip1_d:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.d, \1/z, \[x0\]
** zip1 (z[0-9]+)\.d, \2\.d, \2\.d
** st1b \3\.d, \1, \[x8\]
** ret
*/
v32qi
qi_zip1_d (v32qi x)
{
return __builtin_shuffle (x, x, (v32qi) { PERM4 (0, 32) });
}
/*
** qi_zip1_d_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1b (z[0-9]+)\.d, \1/z, \[x1\]
** ld1b (z[0-9]+)\.d, \1/z, \[x0\]
** zip1 \3\.d, \3\.d, \2\.d
** st1b \3\.d, \1, \[x8\]
** |
** ld1b (z[0-9]+)\.d, \1/z, \[x0\]
** ld1b (z[0-9]+)\.d, \1/z, \[x1\]
** zip1 \4\.d, \4\.d, \5\.d
** st1b \4\.d, \1, \[x8\]
** )
** ret
*/
v32qi
qi_zip1_d_two_op (v32qi x, v32qi y)
{
return __builtin_shuffle (x, y, (v32qi) { PERM4 (0, 32) });
}
/*
** hi_zip1_s:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** zip1 (z[0-9]+)\.s, \2\.s, \2\.s
** st1h \3\.s, \1, \[x8\]
** ret
*/
v64hi
hi_zip1_s (v64hi x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (0, 64) });
}
/*
** hi_zip1_s_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** zip1 \3\.s, \3\.s, \2\.s
** st1h \3\.s, \1, \[x8\]
** |
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
** zip1 \4\.s, \4\.s, \5\.s
** st1h \4\.s, \1, \[x8\]
** )
** ret
*/
v64hi
hi_zip1_s_two_op (v64hi x, v64hi y)
{
return __builtin_shuffle (x, y, (v64hi) { PERM5 (0, 64) });
}
/*
** hf_zip1_s:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** zip1 (z[0-9]+)\.s, \2\.s, \2\.s
** st1h \3\.s, \1, \[x8\]
** ret
*/
v64hf
hf_zip1_s (v64hf x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (0, 64) });
}
/*
** hf_zip1_s_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** zip1 \3\.s, \3\.s, \2\.s
** st1h \3\.s, \1, \[x8\]
** |
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
** zip1 \4\.s, \4\.s, \5\.s
** st1h \4\.s, \1, \[x8\]
** )
** ret
*/
v64hf
hf_zip1_s_two_op (v64hf x, v64hf y)
{
return __builtin_shuffle (x, y, (v64hi) { PERM5 (0, 64) });
}
/*
** bf_zip1_s:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** zip1 (z[0-9]+)\.s, \2\.s, \2\.s
** st1h \3\.s, \1, \[x8\]
** ret
*/
v64bf
bf_zip1_s (v64bf x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (0, 64) });
}
/*
** bf_zip1_s_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** zip1 \3\.s, \3\.s, \2\.s
** st1h \3\.s, \1, \[x8\]
** |
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
** zip1 \4\.s, \4\.s, \5\.s
** st1h \4\.s, \1, \[x8\]
** )
** ret
*/
v64bf
bf_zip1_s_two_op (v64bf x, v64bf y)
{
return __builtin_shuffle (x, y, (v64hi) { PERM5 (0, 64) });
}
/*
** hi_zip1_d:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** zip1 (z[0-9]+)\.d, \2\.d, \2\.d
** st1h \3\.d, \1, \[x8\]
** ret
*/
v32hi
hi_zip1_d (v32hi x)
{
return __builtin_shuffle (x, x, (v32hi) { PERM4 (0, 32) });
}
/*
** hi_zip1_d_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** zip1 \3\.d, \3\.d, \2\.d
** st1h \3\.d, \1, \[x8\]
** |
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
** zip1 \4\.d, \4\.d, \5\.d
** st1h \4\.d, \1, \[x8\]
** )
** ret
*/
v32hi
hi_zip1_d_two_op (v32hi x, v32hi y)
{
return __builtin_shuffle (x, y, (v32hi) { PERM4 (0, 32) });
}
/*
** hf_zip1_d:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** zip1 (z[0-9]+)\.d, \2\.d, \2\.d
** st1h \3\.d, \1, \[x8\]
** ret
*/
v32hf
hf_zip1_d (v32hf x)
{
return __builtin_shuffle (x, x, (v32hi) { PERM4 (0, 32) });
}
/*
** hf_zip1_d_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** zip1 \3\.d, \3\.d, \2\.d
** st1h \3\.d, \1, \[x8\]
** |
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
** zip1 \4\.d, \4\.d, \5\.d
** st1h \4\.d, \1, \[x8\]
** )
** ret
*/
v32hf
hf_zip1_d_two_op (v32hf x, v32hf y)
{
return __builtin_shuffle (x, y, (v32hi) { PERM4 (0, 32) });
}
/*
** bf_zip1_d:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** zip1 (z[0-9]+)\.d, \2\.d, \2\.d
** st1h \3\.d, \1, \[x8\]
** ret
*/
v32bf
bf_zip1_d (v32bf x)
{
return __builtin_shuffle (x, x, (v32hi) { PERM4 (0, 32) });
}
/*
** bf_zip1_d_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** zip1 \3\.d, \3\.d, \2\.d
** st1h \3\.d, \1, \[x8\]
** |
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
** zip1 \4\.d, \4\.d, \5\.d
** st1h \4\.d, \1, \[x8\]
** )
** ret
*/
v32bf
bf_zip1_d_two_op (v32bf x, v32bf y)
{
return __builtin_shuffle (x, y, (v32hi) { PERM4 (0, 32) });
}
/*
** si_zip1_d:
** ptrue (p[0-7])\.b, vl256
** ld1w (z[0-9]+)\.d, \1/z, \[x0\]
** zip1 (z[0-9]+)\.d, \2\.d, \2\.d
** st1w \3\.d, \1, \[x8\]
** ret
*/
v32si
si_zip1_d (v32si x)
{
return __builtin_shuffle (x, x, (v32si) { PERM4 (0, 32) });
}
/*
** sf_zip1_d:
** ptrue (p[0-7])\.b, vl256
** ld1w (z[0-9]+)\.d, \1/z, \[x0\]
** zip1 (z[0-9]+)\.d, \2\.d, \2\.d
** st1w \3\.d, \1, \[x8\]
** ret
*/
v32sf
sf_zip1_d (v32sf x)
{
return __builtin_shuffle (x, x, (v32si) { PERM4 (0, 32) });
}

View file

@@ -0,0 +1,403 @@
/* { dg-do assemble { target aarch64_asm_sve_ok } } */
/* { dg-options "-O -msve-vector-bits=2048 -mlittle-endian --save-temps" } */
/* { dg-final { check-function-bodies "**" "" } } */
typedef unsigned char v128qi __attribute__((vector_size(128)));
typedef unsigned char v64qi __attribute__((vector_size(64)));
typedef unsigned char v32qi __attribute__((vector_size(32)));
typedef unsigned short v64hi __attribute__((vector_size(128)));
typedef unsigned short v32hi __attribute__((vector_size(64)));
typedef _Float16 v64hf __attribute__((vector_size(128)));
typedef _Float16 v32hf __attribute__((vector_size(64)));
typedef __bf16 v64bf __attribute__((vector_size(128)));
typedef __bf16 v32bf __attribute__((vector_size(64)));
typedef unsigned int v32si __attribute__((vector_size(128)));
typedef float v32sf __attribute__((vector_size(128)));
#define PERM0(B, C) B, B + C
#define PERM1(B, C) PERM0 (B, C), PERM0 (B + 1, C)
#define PERM2(B, C) PERM1 (B, C), PERM1 (B + 2, C)
#define PERM3(B, C) PERM2 (B, C), PERM2 (B + 4, C)
#define PERM4(B, C) PERM3 (B, C), PERM3 (B + 8, C)
#define PERM5(B, C) PERM4 (B, C), PERM4 (B + 16, C)
#define PERM6(B, C) PERM5 (B, C), PERM5 (B + 32, C)
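/* Worked example: with the same macros as in zip1_2.c, starting the
   pairs at the operand midpoint selects the high halves instead, so
   PERM2 (4, 8) is 4, 12, 5, 13, 6, 14, 7, 15 for 8-lane operands.
   Interleaving the high halves of x and y is the ZIP2 operation.  */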
/*
** qi_zip2_h_a:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
** zip2 (z[0-9]+)\.h, \2\.h, \2\.h
** st1b \3\.h, \1, \[x8\]
** ret
*/
v128qi
qi_zip2_h_a (v128qi x)
{
return __builtin_shuffle (x, x, (v128qi) { PERM6 (64, 0) });
}
/*
** qi_zip2_h_b:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
** zip2 (z[0-9]+)\.h, \2\.h, \2\.h
** st1b \3\.h, \1, \[x8\]
** ret
*/
v128qi
qi_zip2_h_b (v128qi x)
{
return __builtin_shuffle (x, x, (v128qi) { PERM6 (64, 128) });
}
/*
** qi_zip2_h_c:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
** zip2 (z[0-9]+)\.h, \2\.h, \2\.h
** st1b \3\.h, \1, \[x8\]
** ret
*/
v128qi
qi_zip2_h_c (v128qi x)
{
return __builtin_shuffle (x, x, (v128qi) { PERM6 (192, 0) });
}
/*
** qi_zip2_h_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1b (z[0-9]+)\.h, \1/z, \[x1\]
** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
** zip2 \3\.h, \3\.h, \2\.h
** st1b \3\.h, \1, \[x8\]
** |
** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
** ld1b (z[0-9]+)\.h, \1/z, \[x1\]
** zip2 \4\.h, \4\.h, \5\.h
** st1b \4\.h, \1, \[x8\]
** )
** ret
*/
v128qi
qi_zip2_h_two_op (v128qi x, v128qi y)
{
return __builtin_shuffle (x, y, (v128qi) { PERM6 (64, 128) });
}
/*
** qi_zip2_s:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.s, \1/z, \[x0\]
** zip2 (z[0-9]+)\.s, \2\.s, \2\.s
** st1b \3\.s, \1, \[x8\]
** ret
*/
v64qi
qi_zip2_s (v64qi x)
{
return __builtin_shuffle (x, x, (v64qi) { PERM5 (32, 64) });
}
/*
** qi_zip2_s_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1b (z[0-9]+)\.s, \1/z, \[x1\]
** ld1b (z[0-9]+)\.s, \1/z, \[x0\]
** zip2 \3\.s, \3\.s, \2\.s
** st1b \3\.s, \1, \[x8\]
** |
** ld1b (z[0-9]+)\.s, \1/z, \[x0\]
** ld1b (z[0-9]+)\.s, \1/z, \[x1\]
** zip2 \4\.s, \4\.s, \5\.s
** st1b \4\.s, \1, \[x8\]
** )
** ret
*/
v64qi
qi_zip2_s_two_op (v64qi x, v64qi y)
{
return __builtin_shuffle (x, y, (v64qi) { PERM5 (32, 64) });
}
/*
** qi_zip2_d:
** ptrue (p[0-7])\.b, vl256
** ld1b (z[0-9]+)\.d, \1/z, \[x0\]
** zip2 (z[0-9]+)\.d, \2\.d, \2\.d
** st1b \3\.d, \1, \[x8\]
** ret
*/
v32qi
qi_zip2_d (v32qi x)
{
return __builtin_shuffle (x, x, (v32qi) { PERM4 (16, 32) });
}
/*
** qi_zip2_d_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1b (z[0-9]+)\.d, \1/z, \[x1\]
** ld1b (z[0-9]+)\.d, \1/z, \[x0\]
** zip2 \3\.d, \3\.d, \2\.d
** st1b \3\.d, \1, \[x8\]
** |
** ld1b (z[0-9]+)\.d, \1/z, \[x0\]
** ld1b (z[0-9]+)\.d, \1/z, \[x1\]
** zip2 \4\.d, \4\.d, \5\.d
** st1b \4\.d, \1, \[x8\]
** )
** ret
*/
v32qi
qi_zip2_d_two_op (v32qi x, v32qi y)
{
return __builtin_shuffle (x, y, (v32qi) { PERM4 (16, 32) });
}
/*
** hi_zip2_s:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** zip2 (z[0-9]+)\.s, \2\.s, \2\.s
** st1h \3\.s, \1, \[x8\]
** ret
*/
v64hi
hi_zip2_s (v64hi x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (32, 64) });
}
/*
** hi_zip2_s_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** zip2 \3\.s, \3\.s, \2\.s
** st1h \3\.s, \1, \[x8\]
** |
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
** zip2 \4\.s, \4\.s, \5\.s
** st1h \4\.s, \1, \[x8\]
** )
** ret
*/
v64hi
hi_zip2_s_two_op (v64hi x, v64hi y)
{
return __builtin_shuffle (x, y, (v64hi) { PERM5 (32, 64) });
}
/*
** hf_zip2_s:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** zip2 (z[0-9]+)\.s, \2\.s, \2\.s
** st1h \3\.s, \1, \[x8\]
** ret
*/
v64hf
hf_zip2_s (v64hf x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (32, 64) });
}
/*
** hf_zip2_s_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** zip2 \3\.s, \3\.s, \2\.s
** st1h \3\.s, \1, \[x8\]
** |
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
** zip2 \4\.s, \4\.s, \5\.s
** st1h \4\.s, \1, \[x8\]
** )
** ret
*/
v64hf
hf_zip2_s_two_op (v64hf x, v64hf y)
{
return __builtin_shuffle (x, y, (v64hi) { PERM5 (32, 64) });
}
/*
** bf_zip2_s:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** zip2 (z[0-9]+)\.s, \2\.s, \2\.s
** st1h \3\.s, \1, \[x8\]
** ret
*/
v64bf
bf_zip2_s (v64bf x)
{
return __builtin_shuffle (x, x, (v64hi) { PERM5 (32, 64) });
}
/*
** bf_zip2_s_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** zip2 \3\.s, \3\.s, \2\.s
** st1h \3\.s, \1, \[x8\]
** |
** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
** zip2 \4\.s, \4\.s, \5\.s
** st1h \4\.s, \1, \[x8\]
** )
** ret
*/
v64bf
bf_zip2_s_two_op (v64bf x, v64bf y)
{
return __builtin_shuffle (x, y, (v64hi) { PERM5 (32, 64) });
}
/*
** hi_zip2_d:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** zip2 (z[0-9]+)\.d, \2\.d, \2\.d
** st1h \3\.d, \1, \[x8\]
** ret
*/
v32hi
hi_zip2_d (v32hi x)
{
return __builtin_shuffle (x, x, (v32hi) { PERM4 (16, 32) });
}
/*
** hi_zip2_d_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** zip2 \3\.d, \3\.d, \2\.d
** st1h \3\.d, \1, \[x8\]
** |
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
** zip2 \4\.d, \4\.d, \5\.d
** st1h \4\.d, \1, \[x8\]
** )
** ret
*/
v32hi
hi_zip2_d_two_op (v32hi x, v32hi y)
{
return __builtin_shuffle (x, y, (v32hi) { PERM4 (16, 32) });
}
/*
** hf_zip2_d:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** zip2 (z[0-9]+)\.d, \2\.d, \2\.d
** st1h \3\.d, \1, \[x8\]
** ret
*/
v32hf
hf_zip2_d (v32hf x)
{
return __builtin_shuffle (x, x, (v32hi) { PERM4 (16, 32) });
}
/*
** hf_zip2_d_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** zip2 \3\.d, \3\.d, \2\.d
** st1h \3\.d, \1, \[x8\]
** |
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
** zip2 \4\.d, \4\.d, \5\.d
** st1h \4\.d, \1, \[x8\]
** )
** ret
*/
v32hf
hf_zip2_d_two_op (v32hf x, v32hf y)
{
return __builtin_shuffle (x, y, (v32hi) { PERM4 (16, 32) });
}
/*
** bf_zip2_d:
** ptrue (p[0-7])\.b, vl256
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** zip2 (z[0-9]+)\.d, \2\.d, \2\.d
** st1h \3\.d, \1, \[x8\]
** ret
*/
v32bf
bf_zip2_d (v32bf x)
{
return __builtin_shuffle (x, x, (v32hi) { PERM4 (16, 32) });
}
/*
** bf_zip2_d_two_op:
** ptrue (p[0-7])\.b, vl256
** (
** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** zip2 \3\.d, \3\.d, \2\.d
** st1h \3\.d, \1, \[x8\]
** |
** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
** zip2 \4\.d, \4\.d, \5\.d
** st1h \4\.d, \1, \[x8\]
** )
** ret
*/
v32bf
bf_zip2_d_two_op (v32bf x, v32bf y)
{
return __builtin_shuffle (x, y, (v32hi) { PERM4 (16, 32) });
}
/*
** si_zip2_d:
** ptrue (p[0-7])\.b, vl256
** ld1w (z[0-9]+)\.d, \1/z, \[x0\]
** zip2 (z[0-9]+)\.d, \2\.d, \2\.d
** st1w \3\.d, \1, \[x8\]
** ret
*/
v32si
si_zip2_d (v32si x)
{
return __builtin_shuffle (x, x, (v32si) { PERM4 (16, 32) });
}
/*
** sf_zip2_d:
** ptrue (p[0-7])\.b, vl256
** ld1w (z[0-9]+)\.d, \1/z, \[x0\]
** zip2 (z[0-9]+)\.d, \2\.d, \2\.d
** st1w \3\.d, \1, \[x8\]
** ret
*/
v32sf
sf_zip2_d (v32sf x)
{
return __builtin_shuffle (x, x, (v32si) { PERM4 (16, 32) });
}