i386: Implement 4-byte vector (V4QI/V2HI) constant permutations
2021-07-05 Uroš Bizjak <ubizjak@gmail.com> gcc/ * config/i386/i386-expand.c (ix86_split_mmx_punpck): Handle V4QI and V2HI modes. (expand_vec_perm_blend): Allow 4-byte vector modes with TARGET_SSE4_1. Handle V4QI mode. Emit mmx_pblendvb32 for 4-byte modes. (expand_vec_perm_pshufb): Rewrite to use switch statements. Handle 4-byte dual operands with TARGET_XOP and single operands with TARGET_SSSE3. Emit mmx_ppermv32 for TARGET_XOP and mmx_pshufbv4qi3 for TARGET_SSSE3. (expand_vec_perm_pblendv): Allow 4-byte vector modes with TARGET_SSE4_1. (expand_vec_perm_interleave2): Allow 4-byte vector modes. (expand_vec_perm_pshufb2): Allow 4-byte vector modes with TARGET_SSSE3. (expand_vec_perm_even_odd_1): Handle V4QI mode. (expand_vec_perm_broadcast_1): Handle V4QI mode. (ix86_vectorize_vec_perm_const): Handle V4QI mode. * config/i386/mmx.md (mmx_ppermv32): New insn pattern. (mmx_pshufbv4qi3): Ditto. (*mmx_pblendw32): Ditto. (*mmx_pblendw64): Rename from *mmx_pblendw. (mmx_punpckhbw_low): New insn_and_split pattern. (mmx_punpcklbw_low): Ditto.
This commit is contained in:
parent
8e0b3827bb
commit
be8749f939
2 changed files with 372 additions and 187 deletions
|
@ -933,6 +933,7 @@ ix86_split_mmx_punpck (rtx operands[], bool high_p)
|
|||
|
||||
switch (mode)
|
||||
{
|
||||
case E_V4QImode:
|
||||
case E_V8QImode:
|
||||
sse_mode = V16QImode;
|
||||
double_sse_mode = V32QImode;
|
||||
|
@ -949,6 +950,7 @@ ix86_split_mmx_punpck (rtx operands[], bool high_p)
|
|||
break;
|
||||
|
||||
case E_V4HImode:
|
||||
case E_V2HImode:
|
||||
sse_mode = V8HImode;
|
||||
double_sse_mode = V16HImode;
|
||||
mask = gen_rtx_PARALLEL (VOIDmode,
|
||||
|
@ -991,7 +993,7 @@ ix86_split_mmx_punpck (rtx operands[], bool high_p)
|
|||
rtx insn = gen_rtx_SET (dest, op2);
|
||||
emit_insn (insn);
|
||||
|
||||
/* Move bits 64:127 to bits 0:63. */
|
||||
/* Move high bits to low bits. */
|
||||
if (high_p)
|
||||
{
|
||||
if (sse_mode == V4SFmode)
|
||||
|
@ -1004,9 +1006,19 @@ ix86_split_mmx_punpck (rtx operands[], bool high_p)
|
|||
}
|
||||
else
|
||||
{
|
||||
mask = gen_rtx_PARALLEL (VOIDmode,
|
||||
gen_rtvec (4, GEN_INT (2), GEN_INT (3),
|
||||
GEN_INT (0), GEN_INT (1)));
|
||||
int sz = GET_MODE_SIZE (mode);
|
||||
|
||||
if (sz == 4)
|
||||
mask = gen_rtx_PARALLEL (VOIDmode,
|
||||
gen_rtvec (4, GEN_INT (1), GEN_INT (0),
|
||||
GEN_INT (0), GEN_INT (1)));
|
||||
else if (sz == 8)
|
||||
mask = gen_rtx_PARALLEL (VOIDmode,
|
||||
gen_rtvec (4, GEN_INT (2), GEN_INT (3),
|
||||
GEN_INT (0), GEN_INT (1)));
|
||||
else
|
||||
gcc_unreachable ();
|
||||
|
||||
dest = lowpart_subreg (V4SImode, dest, GET_MODE (dest));
|
||||
op1 = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
|
||||
}
|
||||
|
@ -17331,7 +17343,8 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d)
|
|||
else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
|
||||
;
|
||||
else if (TARGET_SSE4_1 && (GET_MODE_SIZE (vmode) == 16
|
||||
|| GET_MODE_SIZE (vmode) == 8))
|
||||
|| GET_MODE_SIZE (vmode) == 8
|
||||
|| GET_MODE_SIZE (vmode) == 4))
|
||||
;
|
||||
else
|
||||
return false;
|
||||
|
@ -17408,7 +17421,9 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d)
|
|||
vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
|
||||
vperm = force_reg (vmode, vperm);
|
||||
|
||||
if (GET_MODE_SIZE (vmode) == 8)
|
||||
if (GET_MODE_SIZE (vmode) == 4)
|
||||
emit_insn (gen_mmx_pblendvb32 (target, op0, op1, vperm));
|
||||
else if (GET_MODE_SIZE (vmode) == 8)
|
||||
emit_insn (gen_mmx_pblendvb64 (target, op0, op1, vperm));
|
||||
else if (GET_MODE_SIZE (vmode) == 16)
|
||||
emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
|
||||
|
@ -17440,6 +17455,16 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d)
|
|||
vmode = V4HImode;
|
||||
goto do_subreg;
|
||||
|
||||
case E_V4QImode:
|
||||
for (i = 0; i < 4; i += 2)
|
||||
if (d->perm[i] + 1 != d->perm[i + 1])
|
||||
goto use_pblendvb;
|
||||
|
||||
for (i = 0; i < 2; ++i)
|
||||
mask |= (d->perm[i * 2] >= 4) << i;
|
||||
vmode = V2HImode;
|
||||
goto do_subreg;
|
||||
|
||||
case E_V32QImode:
|
||||
/* See if bytes move in pairs. If not, vpblendvb must be used. */
|
||||
for (i = 0; i < 32; i += 2)
|
||||
|
@ -17697,163 +17722,176 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
|
|||
nelt = d->nelt;
|
||||
|
||||
if (!d->one_operand_p)
|
||||
{
|
||||
if (GET_MODE_SIZE (d->vmode) == 8)
|
||||
{
|
||||
if (!TARGET_XOP)
|
||||
return false;
|
||||
vmode = V8QImode;
|
||||
}
|
||||
else if (GET_MODE_SIZE (d->vmode) == 16)
|
||||
{
|
||||
if (!TARGET_XOP)
|
||||
return false;
|
||||
}
|
||||
else if (GET_MODE_SIZE (d->vmode) == 32)
|
||||
{
|
||||
if (!TARGET_AVX2)
|
||||
return false;
|
||||
|
||||
if (valid_perm_using_mode_p (V2TImode, d))
|
||||
{
|
||||
if (d->testing_p)
|
||||
return true;
|
||||
|
||||
/* Use vperm2i128 insn. The pattern uses
|
||||
V4DImode instead of V2TImode. */
|
||||
target = d->target;
|
||||
if (d->vmode != V4DImode)
|
||||
target = gen_reg_rtx (V4DImode);
|
||||
op0 = gen_lowpart (V4DImode, d->op0);
|
||||
op1 = gen_lowpart (V4DImode, d->op1);
|
||||
rperm[0]
|
||||
= GEN_INT ((d->perm[0] / (nelt / 2))
|
||||
| ((d->perm[nelt / 2] / (nelt / 2)) * 16));
|
||||
emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
|
||||
if (target != d->target)
|
||||
emit_move_insn (d->target, gen_lowpart (d->vmode, target));
|
||||
return true;
|
||||
}
|
||||
switch (GET_MODE_SIZE (d->vmode))
|
||||
{
|
||||
case 4:
|
||||
if (!TARGET_XOP)
|
||||
return false;
|
||||
}
|
||||
else
|
||||
vmode = V4QImode;
|
||||
break;
|
||||
|
||||
case 8:
|
||||
if (!TARGET_XOP)
|
||||
return false;
|
||||
vmode = V8QImode;
|
||||
break;
|
||||
|
||||
case 16:
|
||||
if (!TARGET_XOP)
|
||||
return false;
|
||||
break;
|
||||
|
||||
case 32:
|
||||
if (!TARGET_AVX2)
|
||||
return false;
|
||||
|
||||
if (valid_perm_using_mode_p (V2TImode, d))
|
||||
{
|
||||
if (d->testing_p)
|
||||
return true;
|
||||
|
||||
/* Use vperm2i128 insn. The pattern uses
|
||||
V4DImode instead of V2TImode. */
|
||||
target = d->target;
|
||||
if (d->vmode != V4DImode)
|
||||
target = gen_reg_rtx (V4DImode);
|
||||
op0 = gen_lowpart (V4DImode, d->op0);
|
||||
op1 = gen_lowpart (V4DImode, d->op1);
|
||||
rperm[0]
|
||||
= GEN_INT ((d->perm[0] / (nelt / 2))
|
||||
| ((d->perm[nelt / 2] / (nelt / 2)) * 16));
|
||||
emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
|
||||
if (target != d->target)
|
||||
emit_move_insn (d->target, gen_lowpart (d->vmode, target));
|
||||
return true;
|
||||
}
|
||||
/* FALLTHRU */
|
||||
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (GET_MODE_SIZE (d->vmode) == 8)
|
||||
{
|
||||
if (!TARGET_SSSE3)
|
||||
return false;
|
||||
vmode = V8QImode;
|
||||
}
|
||||
else if (GET_MODE_SIZE (d->vmode) == 16)
|
||||
{
|
||||
if (!TARGET_SSSE3)
|
||||
return false;
|
||||
}
|
||||
else if (GET_MODE_SIZE (d->vmode) == 32)
|
||||
{
|
||||
if (!TARGET_AVX2)
|
||||
return false;
|
||||
switch (GET_MODE_SIZE (d->vmode))
|
||||
{
|
||||
case 4:
|
||||
if (!TARGET_SSSE3)
|
||||
return false;
|
||||
vmode = V4QImode;
|
||||
break;
|
||||
|
||||
/* V4DImode should be already handled through
|
||||
expand_vselect by vpermq instruction. */
|
||||
gcc_assert (d->vmode != V4DImode);
|
||||
case 8:
|
||||
if (!TARGET_SSSE3)
|
||||
return false;
|
||||
vmode = V8QImode;
|
||||
break;
|
||||
|
||||
vmode = V32QImode;
|
||||
if (d->vmode == V8SImode
|
||||
|| d->vmode == V16HImode
|
||||
|| d->vmode == V32QImode)
|
||||
{
|
||||
/* First see if vpermq can be used for
|
||||
V8SImode/V16HImode/V32QImode. */
|
||||
if (valid_perm_using_mode_p (V4DImode, d))
|
||||
{
|
||||
for (i = 0; i < 4; i++)
|
||||
perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
|
||||
if (d->testing_p)
|
||||
case 16:
|
||||
if (!TARGET_SSSE3)
|
||||
return false;
|
||||
break;
|
||||
|
||||
case 32:
|
||||
if (!TARGET_AVX2)
|
||||
return false;
|
||||
|
||||
/* V4DImode should be already handled through
|
||||
expand_vselect by vpermq instruction. */
|
||||
gcc_assert (d->vmode != V4DImode);
|
||||
|
||||
vmode = V32QImode;
|
||||
if (d->vmode == V8SImode
|
||||
|| d->vmode == V16HImode
|
||||
|| d->vmode == V32QImode)
|
||||
{
|
||||
/* First see if vpermq can be used for
|
||||
V8SImode/V16HImode/V32QImode. */
|
||||
if (valid_perm_using_mode_p (V4DImode, d))
|
||||
{
|
||||
for (i = 0; i < 4; i++)
|
||||
perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
|
||||
if (d->testing_p)
|
||||
return true;
|
||||
target = gen_reg_rtx (V4DImode);
|
||||
if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
|
||||
perm, 4, false))
|
||||
{
|
||||
emit_move_insn (d->target,
|
||||
gen_lowpart (d->vmode, target));
|
||||
return true;
|
||||
target = gen_reg_rtx (V4DImode);
|
||||
if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
|
||||
perm, 4, false))
|
||||
{
|
||||
emit_move_insn (d->target,
|
||||
gen_lowpart (d->vmode, target));
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/* Next see if vpermd can be used. */
|
||||
if (valid_perm_using_mode_p (V8SImode, d))
|
||||
vmode = V8SImode;
|
||||
}
|
||||
/* Or if vpermps can be used. */
|
||||
else if (d->vmode == V8SFmode)
|
||||
vmode = V8SImode;
|
||||
/* Next see if vpermd can be used. */
|
||||
if (valid_perm_using_mode_p (V8SImode, d))
|
||||
vmode = V8SImode;
|
||||
}
|
||||
/* Or if vpermps can be used. */
|
||||
else if (d->vmode == V8SFmode)
|
||||
vmode = V8SImode;
|
||||
|
||||
if (vmode == V32QImode)
|
||||
{
|
||||
/* vpshufb only works intra lanes, it is not
|
||||
possible to shuffle bytes in between the lanes. */
|
||||
for (i = 0; i < nelt; ++i)
|
||||
if ((d->perm[i] ^ i) & (nelt / 2))
|
||||
return false;
|
||||
}
|
||||
}
|
||||
else if (GET_MODE_SIZE (d->vmode) == 64)
|
||||
{
|
||||
if (!TARGET_AVX512BW)
|
||||
return false;
|
||||
if (vmode == V32QImode)
|
||||
{
|
||||
/* vpshufb only works intra lanes, it is not
|
||||
possible to shuffle bytes in between the lanes. */
|
||||
for (i = 0; i < nelt; ++i)
|
||||
if ((d->perm[i] ^ i) & (nelt / 2))
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
|
||||
/* If vpermq didn't work, vpshufb won't work either. */
|
||||
if (d->vmode == V8DFmode || d->vmode == V8DImode)
|
||||
return false;
|
||||
case 64:
|
||||
if (!TARGET_AVX512BW)
|
||||
return false;
|
||||
|
||||
vmode = V64QImode;
|
||||
if (d->vmode == V16SImode
|
||||
|| d->vmode == V32HImode
|
||||
|| d->vmode == V64QImode)
|
||||
{
|
||||
/* First see if vpermq can be used for
|
||||
V16SImode/V32HImode/V64QImode. */
|
||||
if (valid_perm_using_mode_p (V8DImode, d))
|
||||
{
|
||||
for (i = 0; i < 8; i++)
|
||||
perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7;
|
||||
if (d->testing_p)
|
||||
/* If vpermq didn't work, vpshufb won't work either. */
|
||||
if (d->vmode == V8DFmode || d->vmode == V8DImode)
|
||||
return false;
|
||||
|
||||
vmode = V64QImode;
|
||||
if (d->vmode == V16SImode
|
||||
|| d->vmode == V32HImode
|
||||
|| d->vmode == V64QImode)
|
||||
{
|
||||
/* First see if vpermq can be used for
|
||||
V16SImode/V32HImode/V64QImode. */
|
||||
if (valid_perm_using_mode_p (V8DImode, d))
|
||||
{
|
||||
for (i = 0; i < 8; i++)
|
||||
perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7;
|
||||
if (d->testing_p)
|
||||
return true;
|
||||
target = gen_reg_rtx (V8DImode);
|
||||
if (expand_vselect (target, gen_lowpart (V8DImode, d->op0),
|
||||
perm, 8, false))
|
||||
{
|
||||
emit_move_insn (d->target,
|
||||
gen_lowpart (d->vmode, target));
|
||||
return true;
|
||||
target = gen_reg_rtx (V8DImode);
|
||||
if (expand_vselect (target, gen_lowpart (V8DImode, d->op0),
|
||||
perm, 8, false))
|
||||
{
|
||||
emit_move_insn (d->target,
|
||||
gen_lowpart (d->vmode, target));
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/* Next see if vpermd can be used. */
|
||||
if (valid_perm_using_mode_p (V16SImode, d))
|
||||
vmode = V16SImode;
|
||||
}
|
||||
/* Or if vpermps can be used. */
|
||||
else if (d->vmode == V16SFmode)
|
||||
vmode = V16SImode;
|
||||
if (vmode == V64QImode)
|
||||
{
|
||||
/* vpshufb only works intra lanes, it is not
|
||||
possible to shuffle bytes in between the lanes. */
|
||||
for (i = 0; i < nelt; ++i)
|
||||
if ((d->perm[i] ^ i) & (3 * nelt / 4))
|
||||
return false;
|
||||
}
|
||||
}
|
||||
else
|
||||
/* Next see if vpermd can be used. */
|
||||
if (valid_perm_using_mode_p (V16SImode, d))
|
||||
vmode = V16SImode;
|
||||
}
|
||||
/* Or if vpermps can be used. */
|
||||
else if (d->vmode == V16SFmode)
|
||||
vmode = V16SImode;
|
||||
if (vmode == V64QImode)
|
||||
{
|
||||
/* vpshufb only works intra lanes, it is not
|
||||
possible to shuffle bytes in between the lanes. */
|
||||
for (i = 0; i < nelt; ++i)
|
||||
if ((d->perm[i] ^ i) & (3 * nelt / 4))
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
if (d->testing_p)
|
||||
return true;
|
||||
|
@ -17893,23 +17931,28 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
|
|||
|
||||
machine_mode vpmode = vmode;
|
||||
|
||||
if (vmode == V8QImode)
|
||||
if (vmode == V4QImode
|
||||
|| vmode == V8QImode)
|
||||
{
|
||||
rtx m128 = GEN_INT (-128);
|
||||
|
||||
/* Remap elements from the second operand, as we have to
|
||||
account for inactive top 8 elements from the first operand. */
|
||||
account for inactive top elements from the first operand. */
|
||||
if (!d->one_operand_p)
|
||||
for (i = 0; i < nelt; ++i)
|
||||
{
|
||||
int ival = INTVAL (rperm[i]);
|
||||
if (ival >= 8)
|
||||
ival += 8;
|
||||
rperm[i] = GEN_INT (ival);
|
||||
}
|
||||
{
|
||||
int sz = GET_MODE_SIZE (vmode);
|
||||
|
||||
/* V8QI is emulated with V16QI instruction, fill inactive
|
||||
elements in the top 8 positions with zeros. */
|
||||
for (i = 0; i < nelt; ++i)
|
||||
{
|
||||
int ival = INTVAL (rperm[i]);
|
||||
if (ival >= sz)
|
||||
ival += 16-sz;
|
||||
rperm[i] = GEN_INT (ival);
|
||||
}
|
||||
}
|
||||
|
||||
/* V4QI/V8QI is emulated with V16QI instruction, fill inactive
|
||||
elements in the top positions with zeros. */
|
||||
for (i = nelt; i < 16; ++i)
|
||||
rperm[i] = m128;
|
||||
|
||||
|
@ -17931,7 +17974,9 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
|
|||
{
|
||||
rtx (*gen) (rtx, rtx, rtx);
|
||||
|
||||
if (vmode == V8QImode)
|
||||
if (vmode == V4QImode)
|
||||
gen = gen_mmx_pshufbv4qi3;
|
||||
else if (vmode == V8QImode)
|
||||
gen = gen_mmx_pshufbv8qi3;
|
||||
else if (vmode == V16QImode)
|
||||
gen = gen_ssse3_pshufbv16qi3;
|
||||
|
@ -17958,7 +18003,9 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
|
|||
|
||||
op1 = gen_lowpart (vmode, d->op1);
|
||||
|
||||
if (vmode == V8QImode)
|
||||
if (vmode == V4QImode)
|
||||
gen = gen_mmx_ppermv32;
|
||||
else if (vmode == V8QImode)
|
||||
gen = gen_mmx_ppermv64;
|
||||
else if (vmode == V16QImode)
|
||||
gen = gen_xop_pperm;
|
||||
|
@ -18405,7 +18452,8 @@ expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
|
|||
;
|
||||
else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
|
||||
;
|
||||
else if (TARGET_SSE4_1 && (GET_MODE_SIZE (vmode) == 8
|
||||
else if (TARGET_SSE4_1 && (GET_MODE_SIZE (vmode) == 4
|
||||
|| GET_MODE_SIZE (vmode) == 8
|
||||
|| GET_MODE_SIZE (vmode) == 16))
|
||||
;
|
||||
else
|
||||
|
@ -18485,7 +18533,8 @@ expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
|
|||
rtx_insn *seq;
|
||||
bool ok, same_halves = false;
|
||||
|
||||
if (GET_MODE_SIZE (d->vmode) == 8
|
||||
if (GET_MODE_SIZE (d->vmode) == 4
|
||||
|| GET_MODE_SIZE (d->vmode) == 8
|
||||
|| GET_MODE_SIZE (d->vmode) == 16)
|
||||
{
|
||||
if (d->one_operand_p)
|
||||
|
@ -18521,7 +18570,8 @@ expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
|
|||
memset (remap, 0xff, sizeof (remap));
|
||||
dremap = *d;
|
||||
|
||||
if (GET_MODE_SIZE (d->vmode) == 8)
|
||||
if (GET_MODE_SIZE (d->vmode) == 4
|
||||
|| GET_MODE_SIZE (d->vmode) == 8)
|
||||
{
|
||||
unsigned HOST_WIDE_INT h1, h2, h3, h4;
|
||||
|
||||
|
@ -19269,7 +19319,8 @@ expand_vec_perm_2perm_pblendv (struct expand_vec_perm_d *d, bool two_insn)
|
|||
else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
|
||||
;
|
||||
else if (TARGET_SSE4_1 && (GET_MODE_SIZE (vmode) == 16
|
||||
|| GET_MODE_SIZE (vmode) == 8))
|
||||
|| GET_MODE_SIZE (vmode) == 8
|
||||
|| GET_MODE_SIZE (vmode) == 4))
|
||||
;
|
||||
else
|
||||
return false;
|
||||
|
@ -19530,7 +19581,8 @@ expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
|
|||
rtx (*gen) (rtx, rtx, rtx);
|
||||
|
||||
if (!TARGET_SSSE3 || (GET_MODE_SIZE (d->vmode) != 16
|
||||
&& GET_MODE_SIZE (d->vmode) != 8))
|
||||
&& GET_MODE_SIZE (d->vmode) != 8
|
||||
&& GET_MODE_SIZE (d->vmode) != 4))
|
||||
return false;
|
||||
gcc_assert (!d->one_operand_p);
|
||||
|
||||
|
@ -19539,6 +19591,10 @@ expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
|
|||
|
||||
switch (GET_MODE_SIZE (d->vmode))
|
||||
{
|
||||
case 4:
|
||||
mode = V4QImode;
|
||||
gen = gen_mmx_pshufbv4qi3;
|
||||
break;
|
||||
case 8:
|
||||
mode = V8QImode;
|
||||
gen = gen_mmx_pshufbv8qi3;
|
||||
|
@ -20025,6 +20081,26 @@ expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
|
|||
return false;
|
||||
break;
|
||||
|
||||
case E_V4QImode:
|
||||
if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
|
||||
return expand_vec_perm_pshufb2 (d);
|
||||
else
|
||||
{
|
||||
if (d->testing_p)
|
||||
break;
|
||||
/* We need 2*log2(N)-1 operations to achieve odd/even
|
||||
with interleave. */
|
||||
t1 = gen_reg_rtx (V4QImode);
|
||||
emit_insn (gen_mmx_punpckhbw_low (t1, d->op0, d->op1));
|
||||
emit_insn (gen_mmx_punpcklbw_low (d->target, d->op0, d->op1));
|
||||
if (odd)
|
||||
t2 = gen_mmx_punpckhbw_low (d->target, d->target, t1);
|
||||
else
|
||||
t2 = gen_mmx_punpcklbw_low (d->target, d->target, t1);
|
||||
emit_insn (t2);
|
||||
}
|
||||
break;
|
||||
|
||||
case E_V4HImode:
|
||||
if (TARGET_SSE4_1)
|
||||
return expand_vec_perm_even_odd_pack (d);
|
||||
|
@ -20214,6 +20290,7 @@ expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
|
|||
{
|
||||
unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
|
||||
machine_mode vmode = d->vmode;
|
||||
rtx (*gen) (rtx, rtx, rtx);
|
||||
unsigned char perm2[4];
|
||||
rtx op0 = d->op0, dest;
|
||||
bool ok;
|
||||
|
@ -20238,24 +20315,48 @@ expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
|
|||
/* These are always implementable using standard shuffle patterns. */
|
||||
gcc_unreachable ();
|
||||
|
||||
case E_V4QImode:
|
||||
/* This can be implemented via interleave and pshuflw. */
|
||||
if (d->testing_p)
|
||||
return true;
|
||||
|
||||
if (elt >= nelt2)
|
||||
{
|
||||
gen = gen_mmx_punpckhbw_low;
|
||||
elt -= nelt2;
|
||||
}
|
||||
else
|
||||
gen = gen_mmx_punpcklbw_low;
|
||||
|
||||
dest = gen_reg_rtx (vmode);
|
||||
emit_insn (gen (dest, op0, op0));
|
||||
vmode = get_mode_wider_vector (vmode);
|
||||
op0 = gen_lowpart (vmode, dest);
|
||||
|
||||
memset (perm2, elt, 2);
|
||||
dest = gen_reg_rtx (vmode);
|
||||
ok = expand_vselect (dest, op0, perm2, 2, d->testing_p);
|
||||
gcc_assert (ok);
|
||||
|
||||
emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
|
||||
return true;
|
||||
|
||||
case E_V8QImode:
|
||||
/* These can be implemented via interleave. We save one insn by
|
||||
/* This can be implemented via interleave. We save one insn by
|
||||
stopping once we have promoted to V2SImode and then use pshufd. */
|
||||
if (d->testing_p)
|
||||
return true;
|
||||
do
|
||||
{
|
||||
rtx dest;
|
||||
rtx (*gen) (rtx, rtx, rtx)
|
||||
= vmode == V8QImode ? gen_mmx_punpcklbw
|
||||
: gen_mmx_punpcklwd;
|
||||
|
||||
if (elt >= nelt2)
|
||||
{
|
||||
gen = vmode == V8QImode ? gen_mmx_punpckhbw
|
||||
: gen_mmx_punpckhwd;
|
||||
elt -= nelt2;
|
||||
}
|
||||
else
|
||||
gen = vmode == V8QImode ? gen_mmx_punpcklbw
|
||||
: gen_mmx_punpcklwd;
|
||||
nelt2 /= 2;
|
||||
|
||||
dest = gen_reg_rtx (vmode);
|
||||
|
@ -20266,11 +20367,11 @@ expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
|
|||
while (vmode != V2SImode);
|
||||
|
||||
memset (perm2, elt, 2);
|
||||
dest = gen_reg_rtx (V2SImode);
|
||||
dest = gen_reg_rtx (vmode);
|
||||
ok = expand_vselect (dest, op0, perm2, 2, d->testing_p);
|
||||
gcc_assert (ok);
|
||||
if (!d->testing_p)
|
||||
emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
|
||||
|
||||
emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
|
||||
return true;
|
||||
|
||||
case E_V8HImode:
|
||||
|
@ -20281,17 +20382,15 @@ expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
|
|||
return true;
|
||||
do
|
||||
{
|
||||
rtx dest;
|
||||
rtx (*gen) (rtx, rtx, rtx)
|
||||
= vmode == V16QImode ? gen_vec_interleave_lowv16qi
|
||||
: gen_vec_interleave_lowv8hi;
|
||||
|
||||
if (elt >= nelt2)
|
||||
{
|
||||
gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
|
||||
: gen_vec_interleave_highv8hi;
|
||||
elt -= nelt2;
|
||||
}
|
||||
else
|
||||
gen = vmode == V16QImode ? gen_vec_interleave_lowv16qi
|
||||
: gen_vec_interleave_lowv8hi;
|
||||
nelt2 /= 2;
|
||||
|
||||
dest = gen_reg_rtx (vmode);
|
||||
|
@ -20302,11 +20401,11 @@ expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
|
|||
while (vmode != V4SImode);
|
||||
|
||||
memset (perm2, elt, 4);
|
||||
dest = gen_reg_rtx (V4SImode);
|
||||
dest = gen_reg_rtx (vmode);
|
||||
ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
|
||||
gcc_assert (ok);
|
||||
if (!d->testing_p)
|
||||
emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
|
||||
|
||||
emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
|
||||
return true;
|
||||
|
||||
case E_V64QImode:
|
||||
|
@ -20787,6 +20886,10 @@ ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
|
|||
if (d.testing_p)
|
||||
return true;
|
||||
break;
|
||||
case E_V4QImode:
|
||||
if (!TARGET_SSE2)
|
||||
return false;
|
||||
break;
|
||||
case E_V2DImode:
|
||||
case E_V2DFmode:
|
||||
if (!TARGET_SSE)
|
||||
|
|
|
@ -2362,6 +2362,18 @@
|
|||
[(set_attr "type" "sse4arg")
|
||||
(set_attr "mode" "TI")])
|
||||
|
||||
(define_insn "mmx_ppermv32"
|
||||
[(set (match_operand:V4QI 0 "register_operand" "=x")
|
||||
(unspec:V4QI
|
||||
[(match_operand:V4QI 1 "register_operand" "x")
|
||||
(match_operand:V4QI 2 "register_operand" "x")
|
||||
(match_operand:V16QI 3 "nonimmediate_operand" "xm")]
|
||||
UNSPEC_XOP_PERMUTE))]
|
||||
"TARGET_XOP"
|
||||
"vpperm\t{%3, %2, %1, %0|%0, %1, %2, %3}"
|
||||
[(set_attr "type" "sse4arg")
|
||||
(set_attr "mode" "TI")])
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;;
|
||||
;; Parallel integral logical operations
|
||||
|
@ -2550,6 +2562,23 @@
|
|||
(set_attr "type" "mmxcvt,sselog,sselog")
|
||||
(set_attr "mode" "DI,TI,TI")])
|
||||
|
||||
(define_insn_and_split "mmx_punpckhbw_low"
|
||||
[(set (match_operand:V4QI 0 "register_operand" "=x,Yw")
|
||||
(vec_select:V4QI
|
||||
(vec_concat:V8QI
|
||||
(match_operand:V4QI 1 "register_operand" "0,Yw")
|
||||
(match_operand:V4QI 2 "register_operand" "x,Yw"))
|
||||
(parallel [(const_int 2) (const_int 6)
|
||||
(const_int 3) (const_int 7)])))]
|
||||
"TARGET_SSE2"
|
||||
"#"
|
||||
"&& reload_completed"
|
||||
[(const_int 0)]
|
||||
"ix86_split_mmx_punpck (operands, true); DONE;"
|
||||
[(set_attr "isa" "noavx,avx")
|
||||
(set_attr "type" "sselog")
|
||||
(set_attr "mode" "TI")])
|
||||
|
||||
(define_insn_and_split "mmx_punpcklbw"
|
||||
[(set (match_operand:V8QI 0 "register_operand" "=y,x,Yw")
|
||||
(vec_select:V8QI
|
||||
|
@ -2573,6 +2602,23 @@
|
|||
(set_attr "type" "mmxcvt,sselog,sselog")
|
||||
(set_attr "mode" "DI,TI,TI")])
|
||||
|
||||
(define_insn_and_split "mmx_punpcklbw_low"
|
||||
[(set (match_operand:V4QI 0 "register_operand" "=x,Yw")
|
||||
(vec_select:V4QI
|
||||
(vec_concat:V8QI
|
||||
(match_operand:V4QI 1 "register_operand" "0,Yw")
|
||||
(match_operand:V4QI 2 "register_operand" "x,Yw"))
|
||||
(parallel [(const_int 0) (const_int 4)
|
||||
(const_int 1) (const_int 5)])))]
|
||||
"TARGET_SSE2"
|
||||
"#"
|
||||
"&& reload_completed"
|
||||
[(const_int 0)]
|
||||
"ix86_split_mmx_punpck (operands, false); DONE;"
|
||||
[(set_attr "isa" "noavx,avx")
|
||||
(set_attr "type" "sselog")
|
||||
(set_attr "mode" "TI")])
|
||||
|
||||
(define_insn_and_split "mmx_punpckhwd"
|
||||
[(set (match_operand:V4HI 0 "register_operand" "=y,x,Yw")
|
||||
(vec_select:V4HI
|
||||
|
@ -2930,6 +2976,24 @@
|
|||
(set_attr "btver2_decode" "vector")
|
||||
(set_attr "mode" "TI")])
|
||||
|
||||
(define_insn "mmx_pshufbv4qi3"
|
||||
[(set (match_operand:V4QI 0 "register_operand" "=x,Yw")
|
||||
(unspec:V4QI
|
||||
[(match_operand:V4QI 1 "register_operand" "0,Yw")
|
||||
(match_operand:V16QI 2 "vector_operand" "xBm,Ywm")]
|
||||
UNSPEC_PSHUFB))]
|
||||
"TARGET_SSSE3"
|
||||
"@
|
||||
pshufb\t{%2, %0|%0, %2}
|
||||
vpshufb\t{%2, %1, %0|%0, %1, %2}"
|
||||
[(set_attr "isa" "noavx,avx")
|
||||
(set_attr "type" "sselog1")
|
||||
(set_attr "prefix_data16" "1,*")
|
||||
(set_attr "prefix_extra" "1")
|
||||
(set_attr "prefix" "orig,maybe_evex")
|
||||
(set_attr "btver2_decode" "vector")
|
||||
(set_attr "mode" "TI")])
|
||||
|
||||
(define_expand "mmx_pshufw"
|
||||
[(match_operand:V4HI 0 "register_operand")
|
||||
(match_operand:V4HI 1 "register_mmxmem_operand")
|
||||
|
@ -3002,12 +3066,12 @@
|
|||
(set_attr "length_immediate" "1")
|
||||
(set_attr "mode" "TI")])
|
||||
|
||||
(define_insn "*mmx_pblendw"
|
||||
(define_insn "*mmx_pblendw64"
|
||||
[(set (match_operand:V4HI 0 "register_operand" "=Yr,*x,x")
|
||||
(vec_merge:V4HI
|
||||
(match_operand:V4HI 2 "register_operand" "Yr,*x,x")
|
||||
(match_operand:V4HI 1 "register_operand" "0,0,x")
|
||||
(match_operand:SI 3 "const_0_to_63_operand" "n,n,n")))]
|
||||
(match_operand:SI 3 "const_0_to_15_operand" "n,n,n")))]
|
||||
"TARGET_SSE4_1 && TARGET_MMX_WITH_SSE"
|
||||
"@
|
||||
pblendw\t{%3, %2, %0|%0, %2, %3}
|
||||
|
@ -3020,6 +3084,24 @@
|
|||
(set_attr "prefix" "orig,orig,vex")
|
||||
(set_attr "mode" "TI")])
|
||||
|
||||
(define_insn "*mmx_pblendw32"
|
||||
[(set (match_operand:V2HI 0 "register_operand" "=Yr,*x,x")
|
||||
(vec_merge:V2HI
|
||||
(match_operand:V2HI 2 "register_operand" "Yr,*x,x")
|
||||
(match_operand:V2HI 1 "register_operand" "0,0,x")
|
||||
(match_operand:SI 3 "const_0_to_7_operand" "n,n,n")))]
|
||||
"TARGET_SSE4_1"
|
||||
"@
|
||||
pblendw\t{%3, %2, %0|%0, %2, %3}
|
||||
pblendw\t{%3, %2, %0|%0, %2, %3}
|
||||
vpblendw\t{%3, %2, %1, %0|%0, %1, %2, %3}"
|
||||
[(set_attr "isa" "noavx,noavx,avx")
|
||||
(set_attr "type" "ssemov")
|
||||
(set_attr "prefix_extra" "1")
|
||||
(set_attr "length_immediate" "1")
|
||||
(set_attr "prefix" "orig,orig,vex")
|
||||
(set_attr "mode" "TI")])
|
||||
|
||||
;; Optimize V2SImode load from memory, swapping the elements and
|
||||
;; storing back into the memory into DImode rotate of the memory by 32.
|
||||
(define_split
|
||||
|
|
Loading…
Add table
Reference in a new issue