ix86: Support V{2, 4}DImode arithmetic right shifts for SSE2+ [PR98856]
As mentioned in the PR, we don't support arithmetic right shifts in V2DImode or V4DImode on x86 without -mavx512vl or -mxop. The ISAs indeed don't have {,v}psraq instructions until AVX512VL, but we can actually emulate them quite easily. One case is arithmetic >> 63, we can just emit {,v}pxor; {,v}pcmpgt for that for SSE4.2+, or for SSE2 psrad $31; pshufd $0xf5. Then arithmetic >> by constant > 32, that can be done with {,v}psrad $31 and {,v}psrad $(cst-32) and two operand permutation, arithmetic >> 32 can be done as {,v}psrad $31 and permutation of that and the original operand. Arithmetic >> by constant < 32 can be done as {,v}psrad $cst and {,v}psrlq $cst and two operand permutation. And arithmetic >> by variable scalar amount can be done as arithmetic >> 63, logical >> by the amount, << by (64 - amount) of the >> 63 result (note that a vector << 64 results in 0), and ORing the two together. I had to improve the permutation generation so that it actually handles the needed permutations (or handles them better). 2021-05-13 Jakub Jelinek <jakub@redhat.com> PR tree-optimization/98856 * config/i386/i386.c (ix86_shift_rotate_cost): Add CODE argument. Expect V2DI and V4DI arithmetic right shifts to be emulated. (ix86_rtx_costs, ix86_add_stmt_cost): Adjust ix86_shift_rotate_cost caller. * config/i386/i386-expand.c (expand_vec_perm_2perm_interleave, expand_vec_perm_2perm_pblendv): New functions. (ix86_expand_vec_perm_const_1): Use them. * config/i386/sse.md (ashr<mode>3<mask_name>): Rename to ... (<mask_codefor>ashr<mode>3<mask_name>): ... this. (ashr<mode>3): New define_expand with VI248_AVX512BW iterator. (ashrv4di3): New define_expand. (ashrv2di3): Change condition to TARGET_SSE2, handle !TARGET_XOP and !TARGET_AVX512VL expansion. * gcc.target/i386/sse2-psraq-1.c: New test. * gcc.target/i386/sse4_2-psraq-1.c: New test. * gcc.target/i386/avx-psraq-1.c: New test. * gcc.target/i386/avx2-psraq-1.c: New test. * gcc.target/i386/avx-pr82370.c: Adjust expected number of vpsrad instructions.
* gcc.target/i386/avx2-pr82370.c: Likewise. * gcc.target/i386/avx512f-pr82370.c: Likewise. * gcc.target/i386/avx512bw-pr82370.c: Likewise. * gcc.dg/torture/vshuf-4.inc: Add two further permutations. * gcc.dg/torture/vshuf-8.inc: Likewise.
This commit is contained in:
parent
f1693741cb
commit
829c4bea06
13 changed files with 660 additions and 15 deletions
|
@ -18662,6 +18662,242 @@ expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
|
|||
return true;
|
||||
}
|
||||
|
||||
/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
|
||||
a two vector permutation using two single vector permutations and
|
||||
{,v}{,p}unpckl{ps,pd,bw,wd,dq}. If two_insn, succeed only if one
|
||||
of dfirst or dsecond is identity permutation. */
|
||||
|
||||
static bool
|
||||
expand_vec_perm_2perm_interleave (struct expand_vec_perm_d *d, bool two_insn)
|
||||
{
|
||||
unsigned i, nelt = d->nelt, nelt2 = nelt / 2, lane = nelt;
|
||||
struct expand_vec_perm_d dfirst, dsecond, dfinal;
|
||||
bool ident1 = true, ident2 = true;
|
||||
|
||||
if (d->one_operand_p)
|
||||
return false;
|
||||
|
||||
if (GET_MODE_SIZE (d->vmode) == 16)
|
||||
{
|
||||
if (!TARGET_SSE)
|
||||
return false;
|
||||
if (d->vmode != V4SFmode && d->vmode != V2DFmode && !TARGET_SSE2)
|
||||
return false;
|
||||
}
|
||||
else if (GET_MODE_SIZE (d->vmode) == 32)
|
||||
{
|
||||
if (!TARGET_AVX)
|
||||
return false;
|
||||
if (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2)
|
||||
return false;
|
||||
lane = nelt2;
|
||||
}
|
||||
else
|
||||
return false;
|
||||
|
||||
for (i = 1; i < nelt; i++)
|
||||
if ((d->perm[i] >= nelt) != ((d->perm[0] >= nelt) ^ (i & 1)))
|
||||
return false;
|
||||
|
||||
dfirst = *d;
|
||||
dsecond = *d;
|
||||
dfinal = *d;
|
||||
dfirst.op1 = dfirst.op0;
|
||||
dfirst.one_operand_p = true;
|
||||
dsecond.op0 = dsecond.op1;
|
||||
dsecond.one_operand_p = true;
|
||||
|
||||
for (i = 0; i < nelt; i++)
|
||||
if (d->perm[i] >= nelt)
|
||||
{
|
||||
dsecond.perm[i / 2 + (i >= lane ? lane / 2 : 0)] = d->perm[i] - nelt;
|
||||
if (d->perm[i] - nelt != i / 2 + (i >= lane ? lane / 2 : 0))
|
||||
ident2 = false;
|
||||
dsecond.perm[i / 2 + (i >= lane ? lane : lane / 2)]
|
||||
= d->perm[i] - nelt;
|
||||
}
|
||||
else
|
||||
{
|
||||
dfirst.perm[i / 2 + (i >= lane ? lane / 2 : 0)] = d->perm[i];
|
||||
if (d->perm[i] != i / 2 + (i >= lane ? lane / 2 : 0))
|
||||
ident1 = false;
|
||||
dfirst.perm[i / 2 + (i >= lane ? lane : lane / 2)] = d->perm[i];
|
||||
}
|
||||
|
||||
if (two_insn && !ident1 && !ident2)
|
||||
return false;
|
||||
|
||||
if (!d->testing_p)
|
||||
{
|
||||
if (!ident1)
|
||||
dfinal.op0 = dfirst.target = gen_reg_rtx (d->vmode);
|
||||
if (!ident2)
|
||||
dfinal.op1 = dsecond.target = gen_reg_rtx (d->vmode);
|
||||
if (d->perm[0] >= nelt)
|
||||
std::swap (dfinal.op0, dfinal.op1);
|
||||
}
|
||||
|
||||
bool ok;
|
||||
rtx_insn *seq1 = NULL, *seq2 = NULL;
|
||||
|
||||
if (!ident1)
|
||||
{
|
||||
start_sequence ();
|
||||
ok = expand_vec_perm_1 (&dfirst);
|
||||
seq1 = get_insns ();
|
||||
end_sequence ();
|
||||
|
||||
if (!ok)
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!ident2)
|
||||
{
|
||||
start_sequence ();
|
||||
ok = expand_vec_perm_1 (&dsecond);
|
||||
seq2 = get_insns ();
|
||||
end_sequence ();
|
||||
|
||||
if (!ok)
|
||||
return false;
|
||||
}
|
||||
|
||||
if (d->testing_p)
|
||||
return true;
|
||||
|
||||
for (i = 0; i < nelt; i++)
|
||||
{
|
||||
dfinal.perm[i] = i / 2;
|
||||
if (i >= lane)
|
||||
dfinal.perm[i] += lane / 2;
|
||||
if ((i & 1) != 0)
|
||||
dfinal.perm[i] += nelt;
|
||||
}
|
||||
emit_insn (seq1);
|
||||
emit_insn (seq2);
|
||||
ok = expand_vselect_vconcat (dfinal.target, dfinal.op0, dfinal.op1,
|
||||
dfinal.perm, dfinal.nelt, false);
|
||||
gcc_assert (ok);
|
||||
return true;
|
||||
}
|
||||
|
||||
/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
|
||||
the permutation using two single vector permutations and the SSE4_1 pblendv
|
||||
instruction. If two_insn, succeed only if one of dfirst or dsecond is
|
||||
identity permutation. */
|
||||
|
||||
static bool
|
||||
expand_vec_perm_2perm_pblendv (struct expand_vec_perm_d *d, bool two_insn)
|
||||
{
|
||||
unsigned i, nelt = d->nelt;
|
||||
struct expand_vec_perm_d dfirst, dsecond, dfinal;
|
||||
machine_mode vmode = d->vmode;
|
||||
bool ident1 = true, ident2 = true;
|
||||
|
||||
/* Use the same checks as in expand_vec_perm_blend. */
|
||||
if (d->one_operand_p)
|
||||
return false;
|
||||
if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
|
||||
;
|
||||
else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
|
||||
;
|
||||
else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
|
||||
;
|
||||
else
|
||||
return false;
|
||||
|
||||
dfirst = *d;
|
||||
dsecond = *d;
|
||||
dfinal = *d;
|
||||
dfirst.op1 = dfirst.op0;
|
||||
dfirst.one_operand_p = true;
|
||||
dsecond.op0 = dsecond.op1;
|
||||
dsecond.one_operand_p = true;
|
||||
|
||||
for (i = 0; i < nelt; ++i)
|
||||
if (d->perm[i] >= nelt)
|
||||
{
|
||||
dfirst.perm[i] = 0xff;
|
||||
dsecond.perm[i] = d->perm[i] - nelt;
|
||||
if (d->perm[i] != i + nelt)
|
||||
ident2 = false;
|
||||
}
|
||||
else
|
||||
{
|
||||
dsecond.perm[i] = 0xff;
|
||||
dfirst.perm[i] = d->perm[i];
|
||||
if (d->perm[i] != i)
|
||||
ident1 = false;
|
||||
}
|
||||
|
||||
if (two_insn && !ident1 && !ident2)
|
||||
return false;
|
||||
|
||||
/* For now. Ideally treat 0xff as a wildcard. */
|
||||
for (i = 0; i < nelt; ++i)
|
||||
if (dfirst.perm[i] == 0xff)
|
||||
{
|
||||
if (GET_MODE_SIZE (vmode) == 32
|
||||
&& dfirst.perm[i ^ (nelt / 2)] != 0xff)
|
||||
dfirst.perm[i] = dfirst.perm[i ^ (nelt / 2)] ^ (nelt / 2);
|
||||
else
|
||||
dfirst.perm[i] = i;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (GET_MODE_SIZE (vmode) == 32
|
||||
&& dsecond.perm[i ^ (nelt / 2)] != 0xff)
|
||||
dsecond.perm[i] = dsecond.perm[i ^ (nelt / 2)] ^ (nelt / 2);
|
||||
else
|
||||
dsecond.perm[i] = i;
|
||||
}
|
||||
|
||||
if (!d->testing_p)
|
||||
{
|
||||
if (!ident1)
|
||||
dfinal.op0 = dfirst.target = gen_reg_rtx (d->vmode);
|
||||
if (!ident2)
|
||||
dfinal.op1 = dsecond.target = gen_reg_rtx (d->vmode);
|
||||
}
|
||||
|
||||
bool ok;
|
||||
rtx_insn *seq1 = NULL, *seq2 = NULL;
|
||||
|
||||
if (!ident1)
|
||||
{
|
||||
start_sequence ();
|
||||
ok = expand_vec_perm_1 (&dfirst);
|
||||
seq1 = get_insns ();
|
||||
end_sequence ();
|
||||
|
||||
if (!ok)
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!ident2)
|
||||
{
|
||||
start_sequence ();
|
||||
ok = expand_vec_perm_1 (&dsecond);
|
||||
seq2 = get_insns ();
|
||||
end_sequence ();
|
||||
|
||||
if (!ok)
|
||||
return false;
|
||||
}
|
||||
|
||||
if (d->testing_p)
|
||||
return true;
|
||||
|
||||
for (i = 0; i < nelt; ++i)
|
||||
dfinal.perm[i] = (d->perm[i] >= nelt ? i + nelt : i);
|
||||
|
||||
emit_insn (seq1);
|
||||
emit_insn (seq2);
|
||||
ok = expand_vec_perm_blend (&dfinal);
|
||||
gcc_assert (ok);
|
||||
return true;
|
||||
}
|
||||
|
||||
/* A subroutine of ix86_expand_vec_perm_const_1. Implement a V4DF
|
||||
permutation using two vperm2f128, followed by a vshufpd insn blending
|
||||
the two vectors together. */
|
||||
|
@ -19773,6 +20009,12 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
|
|||
if (expand_vec_perm_pblendv (d))
|
||||
return true;
|
||||
|
||||
if (expand_vec_perm_2perm_interleave (d, true))
|
||||
return true;
|
||||
|
||||
if (expand_vec_perm_2perm_pblendv (d, true))
|
||||
return true;
|
||||
|
||||
/* Try sequences of three instructions. */
|
||||
|
||||
if (expand_vec_perm_even_odd_pack (d))
|
||||
|
@ -19790,6 +20032,12 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
|
|||
if (expand_vec_perm_vperm2f128_vblend (d))
|
||||
return true;
|
||||
|
||||
if (expand_vec_perm_2perm_interleave (d, false))
|
||||
return true;
|
||||
|
||||
if (expand_vec_perm_2perm_pblendv (d, false))
|
||||
return true;
|
||||
|
||||
/* Try sequences of four instructions. */
|
||||
|
||||
if (expand_vec_perm_even_odd_trunc (d))
|
||||
|
|
|
@ -19732,6 +19732,7 @@ ix86_division_cost (const struct processor_costs *cost,
|
|||
|
||||
static int
|
||||
ix86_shift_rotate_cost (const struct processor_costs *cost,
|
||||
enum rtx_code code,
|
||||
enum machine_mode mode, bool constant_op1,
|
||||
HOST_WIDE_INT op1_val,
|
||||
bool speed,
|
||||
|
@ -19770,6 +19771,19 @@ ix86_shift_rotate_cost (const struct processor_costs *cost,
|
|||
count = 7;
|
||||
return ix86_vec_cost (mode, cost->sse_op * count);
|
||||
}
|
||||
/* V*DImode arithmetic right shift is emulated. */
|
||||
else if (code == ASHIFTRT
|
||||
&& (mode == V2DImode || mode == V4DImode)
|
||||
&& !TARGET_XOP
|
||||
&& !TARGET_AVX512VL)
|
||||
{
|
||||
int count = 4;
|
||||
if (constant_op1 && op1_val == 63 && TARGET_SSE4_2)
|
||||
count = 2;
|
||||
else if (constant_op1)
|
||||
count = 3;
|
||||
return ix86_vec_cost (mode, cost->sse_op * count);
|
||||
}
|
||||
else
|
||||
return ix86_vec_cost (mode, cost->sse_op);
|
||||
}
|
||||
|
@ -19939,13 +19953,15 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
|
|||
case LSHIFTRT:
|
||||
case ROTATERT:
|
||||
bool skip_op0, skip_op1;
|
||||
*total = ix86_shift_rotate_cost (cost, mode, CONSTANT_P (XEXP (x, 1)),
|
||||
*total = ix86_shift_rotate_cost (cost, code, mode,
|
||||
CONSTANT_P (XEXP (x, 1)),
|
||||
CONST_INT_P (XEXP (x, 1))
|
||||
? INTVAL (XEXP (x, 1)) : -1,
|
||||
speed,
|
||||
GET_CODE (XEXP (x, 1)) == AND,
|
||||
SUBREG_P (XEXP (x, 1))
|
||||
&& GET_CODE (XEXP (XEXP (x, 1), 0)) == AND,
|
||||
&& GET_CODE (XEXP (XEXP (x, 1),
|
||||
0)) == AND,
|
||||
&skip_op0, &skip_op1);
|
||||
if (skip_op0 || skip_op1)
|
||||
{
|
||||
|
@ -22383,11 +22399,16 @@ ix86_add_stmt_cost (class vec_info *vinfo, void *data, int count,
|
|||
case LROTATE_EXPR:
|
||||
case RROTATE_EXPR:
|
||||
{
|
||||
tree op1 = gimple_assign_rhs1 (stmt_info->stmt);
|
||||
tree op2 = gimple_assign_rhs2 (stmt_info->stmt);
|
||||
stmt_cost = ix86_shift_rotate_cost
|
||||
(ix86_cost, mode,
|
||||
(ix86_cost,
|
||||
(subcode == RSHIFT_EXPR
|
||||
&& !TYPE_UNSIGNED (TREE_TYPE (op1)))
|
||||
? ASHIFTRT : LSHIFTRT, mode,
|
||||
TREE_CODE (op2) == INTEGER_CST,
|
||||
cst_and_fits_in_hwi (op2) ? int_cst_value (op2) : -1,
|
||||
cst_and_fits_in_hwi (op2)
|
||||
? int_cst_value (op2) : -1,
|
||||
true, false, false, NULL, NULL);
|
||||
}
|
||||
break;
|
||||
|
|
|
@ -12468,7 +12468,7 @@
|
|||
(set_attr "prefix" "orig,vex")
|
||||
(set_attr "mode" "<sseinsnmode>")])
|
||||
|
||||
(define_insn "ashr<mode>3<mask_name>"
|
||||
(define_insn "<mask_codefor>ashr<mode>3<mask_name>"
|
||||
[(set (match_operand:VI248_AVX512BW_AVX512VL 0 "register_operand" "=v,v")
|
||||
(ashiftrt:VI248_AVX512BW_AVX512VL
|
||||
(match_operand:VI248_AVX512BW_AVX512VL 1 "nonimmediate_operand" "v,vm")
|
||||
|
@ -12482,6 +12482,126 @@
|
|||
(const_string "0")))
|
||||
(set_attr "mode" "<sseinsnmode>")])
|
||||
|
||||
;; Named expander for arithmetic right shift of the VI248_AVX512BW vector
;; modes by a scalar DImode count, enabled under TARGET_AVX512F.  It has no
;; preparation statements, so it simply generates the RTL template below.
(define_expand "ashr<mode>3"
|
||||
[(set (match_operand:VI248_AVX512BW 0 "register_operand")
|
||||
(ashiftrt:VI248_AVX512BW
|
||||
(match_operand:VI248_AVX512BW 1 "nonimmediate_operand")
|
||||
(match_operand:DI 2 "nonmemory_operand")))]
|
||||
"TARGET_AVX512F")
|
||||
|
||||
;; Arithmetic right shift of a V4DImode vector by a scalar DImode count.
;; With AVX512VL the RTL template below is generated as is; otherwise the
;; shift is emulated in the preparation statements from 32-bit arithmetic
;; shifts, 64-bit logical shifts, a compare against zero and a permutation.
(define_expand "ashrv4di3"
  [(set (match_operand:V4DI 0 "register_operand")
	(ashiftrt:V4DI
	  (match_operand:V4DI 1 "nonimmediate_operand")
	  (match_operand:DI 2 "nonmemory_operand")))]
  "TARGET_AVX2"
{
  if (!TARGET_AVX512VL)
    {
      rtx count = operands[2];
      const bool const_count = CONST_INT_P (count);

      /* Constant count of 63 or more: every element becomes all ones or
	 all zeros depending on its sign, i.e. the result of 0 > x.  */
      if (const_count && UINTVAL (count) >= 63)
	{
	  rtx zeros = force_reg (V4DImode, CONST0_RTX (V4DImode));
	  emit_insn (gen_avx2_gtv4di3 (operands[0], zeros, operands[1]));
	  DONE;
	}

      /* Shift by zero is a plain copy.  */
      if (count == const0_rtx)
	{
	  emit_move_insn (operands[0], operands[1]);
	  DONE;
	}

      operands[1] = force_reg (V4DImode, operands[1]);

      if (const_count)
	{
	  /* Other constant counts: view the input as V8SImode, compute
	     the low and high 32-bit halves of every 64-bit result
	     separately and merge them with a two-operand permutation.  */
	  const HOST_WIDE_INT bits = INTVAL (count);
	  vec_perm_builder sel (8, 8, 1);
	  sel.quick_grow (8);
	  rtx low_src, high_src;
	  rtx src = lowpart_subreg (V8SImode, operands[1], V4DImode);
	  rtx result = gen_reg_rtx (V8SImode);

	  if (bits > 32)
	    {
	      /* High halves are the sign (>> 31); low halves are the
		 original high halves shifted by the remaining bits.  */
	      low_src = gen_reg_rtx (V8SImode);
	      high_src = gen_reg_rtx (V8SImode);
	      emit_insn (gen_ashrv8si3 (high_src, src, GEN_INT (31)));
	      emit_insn (gen_ashrv8si3 (low_src, src, GEN_INT (bits - 32)));
	    }
	  else if (bits == 32)
	    {
	      /* High halves are the sign (>> 31); low halves are the
		 original high halves unchanged.  */
	      low_src = src;
	      high_src = gen_reg_rtx (V8SImode);
	      emit_insn (gen_ashrv8si3 (high_src, src, GEN_INT (31)));
	    }
	  else
	    {
	      /* Low halves come from a 64-bit logical shift, high halves
		 from a 32-bit arithmetic shift by the same count.  */
	      low_src = gen_reg_rtx (V4DImode);
	      high_src = gen_reg_rtx (V8SImode);
	      emit_insn (gen_lshrv4di3 (low_src, operands[1], operands[2]));
	      emit_insn (gen_ashrv8si3 (high_src, src, operands[2]));
	      low_src = lowpart_subreg (V8SImode, low_src, V4DImode);
	    }

	  /* Even result elements come from LOW_SRC: its odd elements when
	     shifting by 32 or more ({ 1, 9, 3, 11, 5, 13, 7, 15 }), its
	     even elements otherwise ({ 0, 9, 2, 11, 4, 13, 6, 15 }).  Odd
	     result elements are always the odd elements of HIGH_SRC.  */
	  for (int k = 0; k < 8; k += 2)
	    {
	      sel[k] = bits >= 32 ? k + 1 : k;
	      sel[k + 1] = k + 9;
	    }

	  vec_perm_indices indices (sel, 2, 8);
	  bool ok = targetm.vectorize.vec_perm_const (V8SImode, result,
						      low_src, high_src,
						      indices);
	  gcc_assert (ok);
	  emit_move_insn (operands[0],
			  lowpart_subreg (V4DImode, result, V8SImode));
	  DONE;
	}

      /* Variable count: OR the logical right shift with the sign mask
	 (0 > x) shifted left by 64 - count; a vector shift left by 64
	 yields zero, which covers a count of zero.  */
      rtx zeros = force_reg (V4DImode, CONST0_RTX (V4DImode));
      rtx sign_mask = gen_reg_rtx (V4DImode);
      emit_insn (gen_avx2_gtv4di3 (sign_mask, zeros, operands[1]));
      rtx logical = gen_reg_rtx (V4DImode);
      emit_insn (gen_lshrv4di3 (logical, operands[1], operands[2]));
      rtx sign_fill = gen_reg_rtx (V4DImode);
      rtx left_count;
      if (TARGET_64BIT)
	{
	  left_count = gen_reg_rtx (DImode);
	  emit_insn (gen_subdi3 (left_count,
				 force_reg (DImode, GEN_INT (64)),
				 operands[2]));
	}
      else
	{
	  /* Without 64-bit integer registers compute 64 - count in
	     SImode and place it in element 0 of a zeroed V4SImode
	     vector.  */
	  rtx count32 = gen_reg_rtx (SImode);
	  emit_insn (gen_subsi3 (count32,
				 force_reg (SImode, GEN_INT (64)),
				 lowpart_subreg (SImode, operands[2],
						 DImode)));
	  left_count = gen_reg_rtx (V4SImode);
	  emit_insn (gen_vec_setv4si_0 (left_count, CONST0_RTX (V4SImode),
					count32));
	}
      left_count = lowpart_subreg (DImode, left_count,
				   GET_MODE (left_count));
      emit_insn (gen_ashlv4di3 (sign_fill, sign_mask, left_count));
      emit_insn (gen_iorv4di3 (operands[0], logical, sign_fill));
      DONE;
    }
})
|
||||
|
||||
(define_insn "<mask_codefor><insn><mode>3<mask_name>"
|
||||
[(set (match_operand:VI248_AVX512BW_2 0 "register_operand" "=v,v")
|
||||
(any_lshift:VI248_AVX512BW_2
|
||||
|
@ -20329,10 +20449,132 @@
|
|||
(ashiftrt:V2DI
|
||||
(match_operand:V2DI 1 "register_operand")
|
||||
(match_operand:DI 2 "nonmemory_operand")))]
|
||||
"TARGET_XOP || TARGET_AVX512VL"
|
||||
"TARGET_SSE2"
|
||||
{
|
||||
if (!TARGET_AVX512VL)
|
||||
{
|
||||
if (TARGET_SSE4_2
|
||||
&& CONST_INT_P (operands[2])
|
||||
&& UINTVAL (operands[2]) >= 63)
|
||||
{
|
||||
rtx zero = force_reg (V2DImode, CONST0_RTX (V2DImode));
|
||||
emit_insn (gen_sse4_2_gtv2di3 (operands[0], zero, operands[1]));
|
||||
DONE;
|
||||
}
|
||||
if (operands[2] == const0_rtx)
|
||||
{
|
||||
emit_move_insn (operands[0], operands[1]);
|
||||
DONE;
|
||||
}
|
||||
if (CONST_INT_P (operands[2])
|
||||
&& (!TARGET_XOP || UINTVAL (operands[2]) >= 63))
|
||||
{
|
||||
vec_perm_builder sel (4, 4, 1);
|
||||
sel.quick_grow (4);
|
||||
rtx arg0, arg1;
|
||||
rtx op1 = lowpart_subreg (V4SImode, operands[1], V2DImode);
|
||||
rtx target = gen_reg_rtx (V4SImode);
|
||||
if (UINTVAL (operands[2]) >= 63)
|
||||
{
|
||||
arg0 = arg1 = gen_reg_rtx (V4SImode);
|
||||
emit_insn (gen_ashrv4si3 (arg0, op1, GEN_INT (31)));
|
||||
sel[0] = 1;
|
||||
sel[1] = 1;
|
||||
sel[2] = 3;
|
||||
sel[3] = 3;
|
||||
}
|
||||
else if (INTVAL (operands[2]) > 32)
|
||||
{
|
||||
arg0 = gen_reg_rtx (V4SImode);
|
||||
arg1 = gen_reg_rtx (V4SImode);
|
||||
emit_insn (gen_ashrv4si3 (arg1, op1, GEN_INT (31)));
|
||||
emit_insn (gen_ashrv4si3 (arg0, op1,
|
||||
GEN_INT (INTVAL (operands[2]) - 32)));
|
||||
sel[0] = 1;
|
||||
sel[1] = 5;
|
||||
sel[2] = 3;
|
||||
sel[3] = 7;
|
||||
}
|
||||
else if (INTVAL (operands[2]) == 32)
|
||||
{
|
||||
arg0 = op1;
|
||||
arg1 = gen_reg_rtx (V4SImode);
|
||||
emit_insn (gen_ashrv4si3 (arg1, op1, GEN_INT (31)));
|
||||
sel[0] = 1;
|
||||
sel[1] = 5;
|
||||
sel[2] = 3;
|
||||
sel[3] = 7;
|
||||
}
|
||||
else
|
||||
{
|
||||
arg0 = gen_reg_rtx (V2DImode);
|
||||
arg1 = gen_reg_rtx (V4SImode);
|
||||
emit_insn (gen_lshrv2di3 (arg0, operands[1], operands[2]));
|
||||
emit_insn (gen_ashrv4si3 (arg1, op1, operands[2]));
|
||||
arg0 = lowpart_subreg (V4SImode, arg0, V2DImode);
|
||||
sel[0] = 0;
|
||||
sel[1] = 5;
|
||||
sel[2] = 2;
|
||||
sel[3] = 7;
|
||||
}
|
||||
vec_perm_indices indices (sel, arg0 != arg1 ? 2 : 1, 4);
|
||||
bool ok = targetm.vectorize.vec_perm_const (V4SImode, target,
|
||||
arg0, arg1, indices);
|
||||
gcc_assert (ok);
|
||||
emit_move_insn (operands[0],
|
||||
lowpart_subreg (V2DImode, target, V4SImode));
|
||||
DONE;
|
||||
}
|
||||
if (!TARGET_XOP)
|
||||
{
|
||||
rtx zero = force_reg (V2DImode, CONST0_RTX (V2DImode));
|
||||
rtx zero_or_all_ones;
|
||||
if (TARGET_SSE4_2)
|
||||
{
|
||||
zero_or_all_ones = gen_reg_rtx (V2DImode);
|
||||
emit_insn (gen_sse4_2_gtv2di3 (zero_or_all_ones, zero,
|
||||
operands[1]));
|
||||
}
|
||||
else
|
||||
{
|
||||
rtx temp = gen_reg_rtx (V4SImode);
|
||||
emit_insn (gen_ashrv4si3 (temp, lowpart_subreg (V4SImode,
|
||||
operands[1],
|
||||
V2DImode),
|
||||
GEN_INT (31)));
|
||||
zero_or_all_ones = gen_reg_rtx (V4SImode);
|
||||
emit_insn (gen_sse2_pshufd_1 (zero_or_all_ones, temp,
|
||||
const1_rtx, const1_rtx,
|
||||
GEN_INT (3), GEN_INT (3)));
|
||||
zero_or_all_ones = lowpart_subreg (V2DImode, zero_or_all_ones,
|
||||
V4SImode);
|
||||
}
|
||||
rtx lshr_res = gen_reg_rtx (V2DImode);
|
||||
emit_insn (gen_lshrv2di3 (lshr_res, operands[1], operands[2]));
|
||||
rtx ashl_res = gen_reg_rtx (V2DImode);
|
||||
rtx amount;
|
||||
if (TARGET_64BIT)
|
||||
{
|
||||
amount = gen_reg_rtx (DImode);
|
||||
emit_insn (gen_subdi3 (amount, force_reg (DImode, GEN_INT (64)),
|
||||
operands[2]));
|
||||
}
|
||||
else
|
||||
{
|
||||
rtx temp = gen_reg_rtx (SImode);
|
||||
emit_insn (gen_subsi3 (temp, force_reg (SImode, GEN_INT (64)),
|
||||
lowpart_subreg (SImode, operands[2],
|
||||
DImode)));
|
||||
amount = gen_reg_rtx (V4SImode);
|
||||
emit_insn (gen_vec_setv4si_0 (amount, CONST0_RTX (V4SImode),
|
||||
temp));
|
||||
}
|
||||
amount = lowpart_subreg (DImode, amount, GET_MODE (amount));
|
||||
emit_insn (gen_ashlv2di3 (ashl_res, zero_or_all_ones, amount));
|
||||
emit_insn (gen_iorv2di3 (operands[0], lshr_res, ashl_res));
|
||||
DONE;
|
||||
}
|
||||
|
||||
rtx reg = gen_reg_rtx (V2DImode);
|
||||
rtx par;
|
||||
bool negate = false;
|
||||
|
|
|
@ -25,7 +25,9 @@ T (21, 2, 6, 3, 7) \
|
|||
T (22, 1, 2, 3, 0) \
|
||||
T (23, 2, 1, 0, 3) \
|
||||
T (24, 2, 5, 6, 3) \
|
||||
T (25, 0, 1, 4, 5)
|
||||
T (25, 0, 1, 4, 5) \
|
||||
T (26, 1, 5, 3, 7) \
|
||||
T (27, 0, 5, 2, 7)
|
||||
#define EXPTESTS \
|
||||
T (116, 1, 2, 4, 3) \
|
||||
T (117, 7, 3, 3, 0) \
|
||||
|
|
|
@ -27,7 +27,9 @@ T (23, 6, 5, 4, 3, 2, 1, 0, 7) \
|
|||
T (24, 0, 1, 2, 3, 8, 9, 10, 11) \
|
||||
T (25, 0, 1, 2, 3, 12, 13, 14, 15) \
|
||||
T (26, 0, 1, 8, 9, 10, 11, 12, 13) \
|
||||
T (27, 0, 8, 9, 10, 11, 12, 13, 14)
|
||||
T (27, 0, 8, 9, 10, 11, 12, 13, 14) \
|
||||
T (28, 1, 9, 3, 11, 5, 13, 7, 15) \
|
||||
T (29, 0, 9, 2, 11, 4, 13, 6, 15)
|
||||
#define EXPTESTS \
|
||||
T (116, 9, 3, 9, 4, 7, 0, 0, 6) \
|
||||
T (117, 4, 14, 12, 8, 9, 6, 0, 10) \
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
/* { dg-final { scan-assembler-times "vpslld\[ \t]\+\\\$7, %xmm\[0-9]\+, %xmm\[0-9]\+" 3 } } */
|
||||
/* { dg-final { scan-assembler-times "vpsllq\[ \t]\+\\\$7, %xmm\[0-9]\+, %xmm\[0-9]\+" 3 } } */
|
||||
/* { dg-final { scan-assembler-times "vpsllw\[ \t]\+\\\$7, %xmm\[0-9]\+, %xmm\[0-9]\+" 3 } } */
|
||||
/* { dg-final { scan-assembler-times "vpsrad\[ \t]\+\\\$3, %xmm\[0-9]\+, %xmm\[0-9]\+" 3 } } */
|
||||
/* { dg-final { scan-assembler-times "vpsrad\[ \t]\+\\\$3, %xmm\[0-9]\+, %xmm\[0-9]\+" 6 } } */
|
||||
/* { dg-final { scan-assembler-times "vpsraq\[ \t]\+\\\$3, %xmm\[0-9]\+, %xmm\[0-9]\+" 0 } } */
|
||||
/* { dg-final { scan-assembler-times "vpsraw\[ \t]\+\\\$3, %xmm\[0-9]\+, %xmm\[0-9]\+" 3 } } */
|
||||
/* { dg-final { scan-assembler-times "vpsrld\[ \t]\+\\\$5, %xmm\[0-9]\+, %xmm\[0-9]\+" 3 } } */
|
||||
|
|
13
gcc/testsuite/gcc.target/i386/avx-psraq-1.c
Normal file
13
gcc/testsuite/gcc.target/i386/avx-psraq-1.c
Normal file
|
@ -0,0 +1,13 @@
|
|||
/* { dg-do run } */
|
||||
/* { dg-options "-O2 -mavx -mno-avx2" } */
|
||||
/* { dg-require-effective-target avx } */
|
||||
|
||||
#ifndef CHECK_H
|
||||
#define CHECK_H "avx-check.h"
|
||||
#endif
|
||||
|
||||
#ifndef TEST
|
||||
#define TEST avx_test
|
||||
#endif
|
||||
|
||||
#include "sse2-psraq-1.c"
|
|
@ -4,7 +4,7 @@
|
|||
/* { dg-final { scan-assembler-times "vpslld\[ \t]\+\\\$7, %xmm\[0-9]\+, %xmm\[0-9]\+" 1 } } */
|
||||
/* { dg-final { scan-assembler-times "vpsllq\[ \t]\+\\\$7, %xmm\[0-9]\+, %xmm\[0-9]\+" 1 } } */
|
||||
/* { dg-final { scan-assembler-times "vpsllw\[ \t]\+\\\$7, %xmm\[0-9]\+, %xmm\[0-9]\+" 1 } } */
|
||||
/* { dg-final { scan-assembler-times "vpsrad\[ \t]\+\\\$3, %xmm\[0-9]\+, %xmm\[0-9]\+" 1 } } */
|
||||
/* { dg-final { scan-assembler-times "vpsrad\[ \t]\+\\\$3, %xmm\[0-9]\+, %xmm\[0-9]\+" 2 } } */
|
||||
/* { dg-final { scan-assembler-times "vpsraq\[ \t]\+\\\$3, %xmm\[0-9]\+, %xmm\[0-9]\+" 0 } } */
|
||||
/* { dg-final { scan-assembler-times "vpsraw\[ \t]\+\\\$3, %xmm\[0-9]\+, %xmm\[0-9]\+" 1 } } */
|
||||
/* { dg-final { scan-assembler-times "vpsrld\[ \t]\+\\\$5, %xmm\[0-9]\+, %xmm\[0-9]\+" 1 } } */
|
||||
|
@ -13,7 +13,7 @@
|
|||
/* { dg-final { scan-assembler-times "vpslld\[ \t]\+\\\$7, %ymm\[0-9]\+, %ymm\[0-9]\+" 1 } } */
|
||||
/* { dg-final { scan-assembler-times "vpsllq\[ \t]\+\\\$7, %ymm\[0-9]\+, %ymm\[0-9]\+" 1 } } */
|
||||
/* { dg-final { scan-assembler-times "vpsllw\[ \t]\+\\\$7, %ymm\[0-9]\+, %ymm\[0-9]\+" 1 } } */
|
||||
/* { dg-final { scan-assembler-times "vpsrad\[ \t]\+\\\$3, %ymm\[0-9]\+, %ymm\[0-9]\+" 1 } } */
|
||||
/* { dg-final { scan-assembler-times "vpsrad\[ \t]\+\\\$3, %ymm\[0-9]\+, %ymm\[0-9]\+" 2 } } */
|
||||
/* { dg-final { scan-assembler-times "vpsraq\[ \t]\+\\\$3, %ymm\[0-9]\+, %ymm\[0-9]\+" 0 } } */
|
||||
/* { dg-final { scan-assembler-times "vpsraw\[ \t]\+\\\$3, %ymm\[0-9]\+, %ymm\[0-9]\+" 1 } } */
|
||||
/* { dg-final { scan-assembler-times "vpsrld\[ \t]\+\\\$5, %ymm\[0-9]\+, %ymm\[0-9]\+" 1 } } */
|
||||
|
|
51
gcc/testsuite/gcc.target/i386/avx2-psraq-1.c
Normal file
51
gcc/testsuite/gcc.target/i386/avx2-psraq-1.c
Normal file
|
@ -0,0 +1,51 @@
|
|||
/* { dg-do run } */
|
||||
/* { dg-options "-O2 -mavx2 -mno-avx512f" } */
|
||||
/* { dg-require-effective-target avx2 } */
|
||||
|
||||
#ifndef CHECK_H
|
||||
#define CHECK_H "avx2-check.h"
|
||||
#endif
|
||||
|
||||
#ifndef TEST
|
||||
#define TEST avx2_test
|
||||
#endif
|
||||
|
||||
#include CHECK_H
|
||||
|
||||
typedef long long V __attribute__((vector_size (32)));
|
||||
|
||||
#define TESTN(N) \
|
||||
static V \
|
||||
__attribute__((noipa)) \
|
||||
test##N (V x) \
|
||||
{ \
|
||||
return x >> N; \
|
||||
}
|
||||
|
||||
#define TESTS TESTN (63) TESTN (49) TESTN (32) TESTN (31) TESTN (18)
|
||||
TESTS
|
||||
|
||||
struct
|
||||
{
|
||||
int n;
|
||||
V (*fn) (V);
|
||||
} tests[] = {
|
||||
#undef TESTN
|
||||
#define TESTN(N) { N, test##N },
|
||||
TESTS
|
||||
};
|
||||
|
||||
static void
|
||||
TEST (void)
|
||||
{
|
||||
V a = (V) { 0xdeadbeefcafebabeULL, 0x123456789abcdef0ULL,
|
||||
0x173a74be8a95134cULL, 0x817bae35ac0ebf12ULL };
|
||||
int i;
|
||||
for (i = 0; tests[i].n; i++)
|
||||
{
|
||||
V c = tests[i].fn (a);
|
||||
if (c[0] != a[0] >> tests[i].n || c[1] != a[1] >> tests[i].n
|
||||
|| c[2] != a[2] >> tests[i].n || c[3] != a[3] >> tests[i].n)
|
||||
abort ();
|
||||
}
|
||||
}
|
|
@ -4,7 +4,7 @@
|
|||
/* { dg-final { scan-assembler-times "vpslld\[ \t]\+\\\$7, %xmm\[0-9]\+, %xmm\[0-9]\+" 1 } } */
|
||||
/* { dg-final { scan-assembler-times "vpsllq\[ \t]\+\\\$7, %xmm\[0-9]\+, %xmm\[0-9]\+" 1 } } */
|
||||
/* { dg-final { scan-assembler-times "vpsllw\[ \t]\+\\\$7, %xmm\[0-9]\+, %xmm\[0-9]\+" 1 } } */
|
||||
/* { dg-final { scan-assembler-times "vpsrad\[ \t]\+\\\$3, %xmm\[0-9]\+, %xmm\[0-9]\+" 1 } } */
|
||||
/* { dg-final { scan-assembler-times "vpsrad\[ \t]\+\\\$3, %xmm\[0-9]\+, %xmm\[0-9]\+" 2 } } */
|
||||
/* { dg-final { scan-assembler-times "vpsraq\[ \t]\+\\\$3, %xmm\[0-9]\+, %xmm\[0-9]\+" 0 } } */
|
||||
/* { dg-final { scan-assembler-times "vpsraw\[ \t]\+\\\$3, %xmm\[0-9]\+, %xmm\[0-9]\+" 1 } } */
|
||||
/* { dg-final { scan-assembler-times "vpsrld\[ \t]\+\\\$5, %xmm\[0-9]\+, %xmm\[0-9]\+" 1 } } */
|
||||
|
@ -13,7 +13,7 @@
|
|||
/* { dg-final { scan-assembler-times "vpslld\[ \t]\+\\\$7, %ymm\[0-9]\+, %ymm\[0-9]\+" 1 } } */
|
||||
/* { dg-final { scan-assembler-times "vpsllq\[ \t]\+\\\$7, %ymm\[0-9]\+, %ymm\[0-9]\+" 1 } } */
|
||||
/* { dg-final { scan-assembler-times "vpsllw\[ \t]\+\\\$7, %ymm\[0-9]\+, %ymm\[0-9]\+" 1 } } */
|
||||
/* { dg-final { scan-assembler-times "vpsrad\[ \t]\+\\\$3, %ymm\[0-9]\+, %ymm\[0-9]\+" 1 } } */
|
||||
/* { dg-final { scan-assembler-times "vpsrad\[ \t]\+\\\$3, %ymm\[0-9]\+, %ymm\[0-9]\+" 2 } } */
|
||||
/* { dg-final { scan-assembler-times "vpsraq\[ \t]\+\\\$3, %ymm\[0-9]\+, %ymm\[0-9]\+" 0 } } */
|
||||
/* { dg-final { scan-assembler-times "vpsraw\[ \t]\+\\\$3, %ymm\[0-9]\+, %ymm\[0-9]\+" 1 } } */
|
||||
/* { dg-final { scan-assembler-times "vpsrld\[ \t]\+\\\$5, %ymm\[0-9]\+, %ymm\[0-9]\+" 1 } } */
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
/* { dg-final { scan-assembler-times "vpslld\[ \t]\+\\\$7, %xmm\[0-9]\+, %xmm\[0-9]\+" 1 } } */
|
||||
/* { dg-final { scan-assembler-times "vpsllq\[ \t]\+\\\$7, %xmm\[0-9]\+, %xmm\[0-9]\+" 1 } } */
|
||||
/* { dg-final { scan-assembler-times "vpsllw\[ \t]\+\\\$7, %xmm\[0-9]\+, %xmm\[0-9]\+" 1 } } */
|
||||
/* { dg-final { scan-assembler-times "vpsrad\[ \t]\+\\\$3, %xmm\[0-9]\+, %xmm\[0-9]\+" 1 } } */
|
||||
/* { dg-final { scan-assembler-times "vpsrad\[ \t]\+\\\$3, %xmm\[0-9]\+, %xmm\[0-9]\+" 2 } } */
|
||||
/* { dg-final { scan-assembler-times "vpsraq\[ \t]\+\\\$3, %xmm\[0-9]\+, %xmm\[0-9]\+" 0 } } */
|
||||
/* { dg-final { scan-assembler-times "vpsraw\[ \t]\+\\\$3, %xmm\[0-9]\+, %xmm\[0-9]\+" 1 } } */
|
||||
/* { dg-final { scan-assembler-times "vpsrld\[ \t]\+\\\$5, %xmm\[0-9]\+, %xmm\[0-9]\+" 1 } } */
|
||||
|
@ -13,7 +13,7 @@
|
|||
/* { dg-final { scan-assembler-times "vpslld\[ \t]\+\\\$7, %ymm\[0-9]\+, %ymm\[0-9]\+" 1 } } */
|
||||
/* { dg-final { scan-assembler-times "vpsllq\[ \t]\+\\\$7, %ymm\[0-9]\+, %ymm\[0-9]\+" 1 } } */
|
||||
/* { dg-final { scan-assembler-times "vpsllw\[ \t]\+\\\$7, %ymm\[0-9]\+, %ymm\[0-9]\+" 3 } } */
|
||||
/* { dg-final { scan-assembler-times "vpsrad\[ \t]\+\\\$3, %ymm\[0-9]\+, %ymm\[0-9]\+" 1 } } */
|
||||
/* { dg-final { scan-assembler-times "vpsrad\[ \t]\+\\\$3, %ymm\[0-9]\+, %ymm\[0-9]\+" 2 } } */
|
||||
/* { dg-final { scan-assembler-times "vpsraq\[ \t]\+\\\$3, %ymm\[0-9]\+, %ymm\[0-9]\+" 0 } } */
|
||||
/* { dg-final { scan-assembler-times "vpsraw\[ \t]\+\\\$3, %ymm\[0-9]\+, %ymm\[0-9]\+" 3 } } */
|
||||
/* { dg-final { scan-assembler-times "vpsrld\[ \t]\+\\\$5, %ymm\[0-9]\+, %ymm\[0-9]\+" 1 } } */
|
||||
|
|
53
gcc/testsuite/gcc.target/i386/sse2-psraq-1.c
Normal file
53
gcc/testsuite/gcc.target/i386/sse2-psraq-1.c
Normal file
|
@ -0,0 +1,53 @@
|
|||
/* { dg-do run } */
|
||||
/* { dg-options "-O2 -msse2 -mno-sse3" } */
|
||||
/* { dg-require-effective-target sse2 } */
|
||||
|
||||
#ifndef CHECK_H
|
||||
#define CHECK_H "sse2-check.h"
|
||||
#endif
|
||||
|
||||
#ifndef TEST
|
||||
#define TEST sse2_test
|
||||
#endif
|
||||
|
||||
#include CHECK_H
|
||||
|
||||
typedef long long V __attribute__((vector_size (16)));
|
||||
|
||||
#define TESTN(N) \
|
||||
static V \
|
||||
__attribute__((noipa)) \
|
||||
test##N (V x) \
|
||||
{ \
|
||||
return x >> N; \
|
||||
}
|
||||
|
||||
#define TESTS TESTN (63) TESTN (49) TESTN (32) TESTN (31) TESTN (18)
|
||||
TESTS
|
||||
|
||||
struct
|
||||
{
|
||||
int n;
|
||||
V (*fn) (V);
|
||||
} tests[] = {
|
||||
#undef TESTN
|
||||
#define TESTN(N) { N, test##N },
|
||||
TESTS
|
||||
};
|
||||
|
||||
static void
|
||||
TEST (void)
|
||||
{
|
||||
V a = (V) { 0xdeadbeefcafebabeULL, 0x123456789abcdef0ULL };
|
||||
V b = (V) { 0x173a74be8a95134cULL, 0x817bae35ac0ebf12ULL };
|
||||
int i;
|
||||
for (i = 0; tests[i].n; i++)
|
||||
{
|
||||
V c = tests[i].fn (a);
|
||||
if (c[0] != a[0] >> tests[i].n || c[1] != a[1] >> tests[i].n)
|
||||
abort ();
|
||||
c = tests[i].fn (b);
|
||||
if (c[0] != b[0] >> tests[i].n || c[1] != b[1] >> tests[i].n)
|
||||
abort ();
|
||||
}
|
||||
}
|
13
gcc/testsuite/gcc.target/i386/sse4_2-psraq-1.c
Normal file
13
gcc/testsuite/gcc.target/i386/sse4_2-psraq-1.c
Normal file
|
@ -0,0 +1,13 @@
|
|||
/* { dg-do run } */
|
||||
/* { dg-options "-O2 -msse4.2 -mno-avx" } */
|
||||
/* { dg-require-effective-target sse4 } */
|
||||
|
||||
#ifndef CHECK_H
|
||||
#define CHECK_H "sse4_2-check.h"
|
||||
#endif
|
||||
|
||||
#ifndef TEST
|
||||
#define TEST sse4_2_test
|
||||
#endif
|
||||
|
||||
#include "sse2-psraq-1.c"
|
Loading…
Add table
Reference in a new issue