Simplify ix86_expand_vector_move_misalign
Since the mov<mode>_internal patterns handle both aligned and unaligned
loads and stores, we can simplify ix86_avx256_split_vector_move_misalign
and ix86_expand_vector_move_misalign.

	* config/i386/i386.c (ix86_avx256_split_vector_move_misalign):
	Short-cut unaligned load and store cases.  Handle all integer
	vector modes.
	(ix86_expand_vector_move_misalign): Short-cut unaligned load
	and store cases.  Call ix86_avx256_split_vector_move_misalign
	directly without checking mode class.

From-SVN: r235283
commit 7ccc95364c
parent 9e3e4fab84
2 changed files with 89 additions and 170 deletions
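For context, a minimal sketch (not part of this commit; the type and function names are illustrative) of the kind of misaligned 256-bit vector access that ix86_expand_vector_move_misalign expands. Built with -O2 -mavx, the under-aligned vector type forces the movmisalign path; whether the 32-byte access is then emitted as a single unaligned 256-bit move or split into two 128-bit halves depends on the -mavx256-split-unaligned-load / -mavx256-split-unaligned-store tunings, which the simplified ix86_avx256_split_vector_move_misalign now checks up front.

/* Illustrative only: an under-aligned vector type makes GCC expand the
   load and store below through the movmisalign pattern, which ends up
   in ix86_expand_vector_move_misalign.  */
typedef float v8sf_u __attribute__ ((vector_size (32), aligned (4)));

void
copy256 (v8sf_u *dst, const v8sf_u *src)
{
  *dst = *src;  /* misaligned 32-byte load followed by a misaligned store */
}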
gcc/ChangeLog

@@ -1,3 +1,12 @@
+2016-04-20  H.J. Lu  <hongjiu.lu@intel.com>
+
+	* config/i386/i386.c (ix86_avx256_split_vector_move_misalign):
+	Short-cut unaligned load and store cases.  Handle all integer
+	vector modes.
+	(ix86_expand_vector_move_misalign): Short-cut unaligned load
+	and store cases.  Call ix86_avx256_split_vector_move_misalign
+	directly without checking mode class.
+
 2016-04-20  Andrew Pinski  <apinski@cavium.com>
 	    Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
 
gcc/config/i386/i386.c

@@ -18807,7 +18807,39 @@ ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
   rtx (*extract) (rtx, rtx, rtx);
   machine_mode mode;
 
-  switch (GET_MODE (op0))
+  if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
+      || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
+    {
+      emit_insn (gen_rtx_SET (op0, op1));
+      return;
+    }
+
+  rtx orig_op0 = NULL_RTX;
+  mode = GET_MODE (op0);
+  switch (GET_MODE_CLASS (mode))
+    {
+    case MODE_VECTOR_INT:
+    case MODE_INT:
+      if (mode != V32QImode)
+        {
+          if (!MEM_P (op0))
+            {
+              orig_op0 = op0;
+              op0 = gen_reg_rtx (V32QImode);
+            }
+          else
+            op0 = gen_lowpart (V32QImode, op0);
+          op1 = gen_lowpart (V32QImode, op1);
+          mode = V32QImode;
+        }
+      break;
+    case MODE_VECTOR_FLOAT:
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  switch (mode)
     {
     default:
       gcc_unreachable ();
@@ -18827,34 +18859,25 @@ ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
 
   if (MEM_P (op1))
     {
-      if (TARGET_AVX256_SPLIT_UNALIGNED_LOAD
-          && optimize_insn_for_speed_p ())
-        {
-          rtx r = gen_reg_rtx (mode);
-          m = adjust_address (op1, mode, 0);
-          emit_move_insn (r, m);
-          m = adjust_address (op1, mode, 16);
-          r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
-          emit_move_insn (op0, r);
-        }
-      else
-        emit_insn (gen_rtx_SET (op0, op1));
+      rtx r = gen_reg_rtx (mode);
+      m = adjust_address (op1, mode, 0);
+      emit_move_insn (r, m);
+      m = adjust_address (op1, mode, 16);
+      r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
+      emit_move_insn (op0, r);
     }
   else if (MEM_P (op0))
     {
-      if (TARGET_AVX256_SPLIT_UNALIGNED_STORE
-          && optimize_insn_for_speed_p ())
-        {
-          m = adjust_address (op0, mode, 0);
-          emit_insn (extract (m, op1, const0_rtx));
-          m = adjust_address (op0, mode, 16);
-          emit_insn (extract (m, op1, const1_rtx));
-        }
-      else
-        emit_insn (gen_rtx_SET (op0, op1));
+      m = adjust_address (op0, mode, 0);
+      emit_insn (extract (m, op1, const0_rtx));
+      m = adjust_address (op0, mode, 16);
+      emit_insn (extract (m, op1, const1_rtx));
     }
   else
     gcc_unreachable ();
+
+  if (orig_op0)
+    emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
 }
 
 /* Implement the movmisalign patterns for SSE.  Non-SSE modes go
@@ -18912,118 +18935,50 @@
 void
 ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
 {
-  rtx op0, op1, orig_op0 = NULL_RTX, m;
+  rtx op0, op1, m;
 
   op0 = operands[0];
   op1 = operands[1];
 
-  if (GET_MODE_SIZE (mode) == 64)
+  /* Use unaligned load/store for AVX512 or when optimizing for size.  */
+  if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
     {
-      switch (GET_MODE_CLASS (mode))
-        {
-        case MODE_VECTOR_INT:
-        case MODE_INT:
-          if (GET_MODE (op0) != V16SImode)
-            {
-              if (!MEM_P (op0))
-                {
-                  orig_op0 = op0;
-                  op0 = gen_reg_rtx (V16SImode);
-                }
-              else
-                op0 = gen_lowpart (V16SImode, op0);
-            }
-          op1 = gen_lowpart (V16SImode, op1);
-          /* FALLTHRU */
-
-        case MODE_VECTOR_FLOAT:
-
-          emit_insn (gen_rtx_SET (op0, op1));
-          if (orig_op0)
-            emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
-          break;
-
-        default:
-          gcc_unreachable ();
-        }
-
+      emit_insn (gen_rtx_SET (op0, op1));
       return;
     }
 
-  if (TARGET_AVX
-      && GET_MODE_SIZE (mode) == 32)
+  if (TARGET_AVX)
     {
-      switch (GET_MODE_CLASS (mode))
-        {
-        case MODE_VECTOR_INT:
-        case MODE_INT:
-          if (GET_MODE (op0) != V32QImode)
-            {
-              if (!MEM_P (op0))
-                {
-                  orig_op0 = op0;
-                  op0 = gen_reg_rtx (V32QImode);
-                }
-              else
-                op0 = gen_lowpart (V32QImode, op0);
-            }
-          op1 = gen_lowpart (V32QImode, op1);
-          /* FALLTHRU */
+      if (GET_MODE_SIZE (mode) == 32)
+        ix86_avx256_split_vector_move_misalign (op0, op1);
+      else
+        /* Always use 128-bit mov<mode>_internal pattern for AVX.  */
+        emit_insn (gen_rtx_SET (op0, op1));
+      return;
+    }
 
-        case MODE_VECTOR_FLOAT:
-          ix86_avx256_split_vector_move_misalign (op0, op1);
-          if (orig_op0)
-            emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
-          break;
-
-        default:
-          gcc_unreachable ();
-        }
+  if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
+      || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
+    {
+      emit_insn (gen_rtx_SET (op0, op1));
+      return;
+    }
 
+  /* ??? If we have typed data, then it would appear that using
+     movdqu is the only way to get unaligned data loaded with
+     integer type.  */
+  if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
+    {
+      emit_insn (gen_rtx_SET (op0, op1));
       return;
     }
 
   if (MEM_P (op1))
     {
-      /* Normal *mov<mode>_internal pattern will handle
-         unaligned loads just fine if misaligned_operand
-         is true, and without the UNSPEC it can be combined
-         with arithmetic instructions.  */
-      if (TARGET_AVX
-          && (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
-              || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
-          && misaligned_operand (op1, GET_MODE (op1)))
-        emit_insn (gen_rtx_SET (op0, op1));
-      /* ??? If we have typed data, then it would appear that using
-         movdqu is the only way to get unaligned data loaded with
-         integer type.  */
-      else if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
-        {
-          if (GET_MODE (op0) != V16QImode)
-            {
-              orig_op0 = op0;
-              op0 = gen_reg_rtx (V16QImode);
-            }
-          op1 = gen_lowpart (V16QImode, op1);
-          /* We will eventually emit movups based on insn attributes.  */
-          emit_insn (gen_rtx_SET (op0, op1));
-          if (orig_op0)
-            emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
-        }
-      else if (TARGET_SSE2 && mode == V2DFmode)
+      if (TARGET_SSE2 && mode == V2DFmode)
         {
           rtx zero;
 
-          if (TARGET_AVX
-              || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
-              || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
-              || optimize_insn_for_size_p ())
-            {
-              /* We will eventually emit movups based on insn attributes.  */
-              emit_insn (gen_rtx_SET (op0, op1));
-              return;
-            }
-
           /* When SSE registers are split into halves, we can avoid
              writing to the top half twice.  */
           if (TARGET_SSE_SPLIT_REGS)
@@ -19053,24 +19008,6 @@ ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
         {
           rtx t;
 
-          if (TARGET_AVX
-              || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
-              || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
-              || optimize_insn_for_size_p ())
-            {
-              if (GET_MODE (op0) != V4SFmode)
-                {
-                  orig_op0 = op0;
-                  op0 = gen_reg_rtx (V4SFmode);
-                }
-              op1 = gen_lowpart (V4SFmode, op1);
-              emit_insn (gen_rtx_SET (op0, op1));
-              if (orig_op0)
-                emit_move_insn (orig_op0,
-                                gen_lowpart (GET_MODE (orig_op0), op0));
-              return;
-            }
-
           if (mode != V4SFmode)
             t = gen_reg_rtx (V4SFmode);
           else
@@ -19091,49 +19028,22 @@ ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
     }
   else if (MEM_P (op0))
     {
-      if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
-        {
-          op0 = gen_lowpart (V16QImode, op0);
-          op1 = gen_lowpart (V16QImode, op1);
-          /* We will eventually emit movups based on insn attributes.  */
-          emit_insn (gen_rtx_SET (op0, op1));
-        }
-      else if (TARGET_SSE2 && mode == V2DFmode)
+      if (TARGET_SSE2 && mode == V2DFmode)
         {
-          if (TARGET_AVX
-              || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
-              || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
-              || optimize_insn_for_size_p ())
-            /* We will eventually emit movups based on insn attributes.  */
-            emit_insn (gen_rtx_SET (op0, op1));
-          else
-            {
-              m = adjust_address (op0, DFmode, 0);
-              emit_insn (gen_sse2_storelpd (m, op1));
-              m = adjust_address (op0, DFmode, 8);
-              emit_insn (gen_sse2_storehpd (m, op1));
-            }
+          m = adjust_address (op0, DFmode, 0);
+          emit_insn (gen_sse2_storelpd (m, op1));
+          m = adjust_address (op0, DFmode, 8);
+          emit_insn (gen_sse2_storehpd (m, op1));
         }
       else
        {
          if (mode != V4SFmode)
            op1 = gen_lowpart (V4SFmode, op1);
 
-          if (TARGET_AVX
-              || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
-              || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
-              || optimize_insn_for_size_p ())
-            {
-              op0 = gen_lowpart (V4SFmode, op0);
-              emit_insn (gen_rtx_SET (op0, op1));
-            }
-          else
-            {
-              m = adjust_address (op0, V2SFmode, 0);
-              emit_insn (gen_sse_storelps (m, op1));
-              m = adjust_address (op0, V2SFmode, 8);
-              emit_insn (gen_sse_storehps (m, op1));
-            }
+          m = adjust_address (op0, V2SFmode, 0);
+          emit_insn (gen_sse_storelps (m, op1));
+          m = adjust_address (op0, V2SFmode, 8);
+          emit_insn (gen_sse_storehps (m, op1));
        }
     }
   else