UNSPEC_PALIGNR optimizations and clean-ups on x86.

This patch is a follow-up to Hongtao's fix for PR target/105854.  That
fix is perfectly correct, but the thing that caught my eye was why is
the compiler generating a shift by zero at all.  Digging deeper it
turns out that we can easily optimize __builtin_ia32_palignr for
alignments of 0 and 64, which may be simplified to a move of the
highpart or lowpart respectively.

After adding optimizations to simplify the 64-bit DImode palignr, I
started to add the corresponding optimizations for vpalignr (i.e.
128-bit).  The first oddity is that sse.md uses TImode and a special
SSESCALARMODE iterator, rather than V1TImode, and indeed the comment
above SSESCALARMODE hints that this should be "dropped in favor of
VIMAX_AVX2_AVX512BW".  Hence this patch includes the migration of
<ssse3_avx2>_palignr<mode> to use VIMAX_AVX2_AVX512BW, basically
using V1TImode instead of TImode for 128-bit palignr.

This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
and make -k check, both with and without --target_board=unix{-m32,},
with no new failures.  Ok for mainline?

2022-07-05  Roger Sayle  <roger@nextmovesoftware.com>
	    Hongtao Liu  <hongtao.liu@intel.com>

gcc/ChangeLog
	* config/i386/i386-builtin.def (__builtin_ia32_palignr128): Change
	CODE_FOR_ssse3_palignrti to CODE_FOR_ssse3_palignrv1ti.
	* config/i386/i386-expand.cc (expand_vec_perm_palignr): Use V1TImode
	and gen_ssse3_palignrv1ti instead of TImode.
	* config/i386/sse.md (SSESCALARMODE): Delete.
	(define_mode_attr ssse3_avx2): Handle V1TImode instead of TImode.
	(<ssse3_avx2>_palignr<mode>): Use VIMAX_AVX2_AVX512BW as a mode
	iterator instead of SSESCALARMODE.
	(ssse3_palignrdi): Optimize cases where operands[3] is 0 or 64,
	using a single move instruction (if required).

gcc/testsuite/ChangeLog
	* gcc.target/i386/ssse3-palignr-2.c: New test case.
This commit is contained in:
Roger Sayle 2022-07-05 18:00:00 +01:00
parent d458c53a6f
commit 02e2e15ec4
4 changed files with 53 additions and 15 deletions

View file

@ -900,7 +900,7 @@ BDESC (OPTION_MASK_ISA_SSSE3, 0, CODE_FOR_ssse3_psignv4si3, "__builtin_ia32_psig
BDESC (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX, 0, CODE_FOR_ssse3_psignv2si3, "__builtin_ia32_psignd", IX86_BUILTIN_PSIGND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI)
/* SSSE3. */
BDESC (OPTION_MASK_ISA_SSSE3, 0, CODE_FOR_ssse3_palignrti, "__builtin_ia32_palignr128", IX86_BUILTIN_PALIGNR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT_CONVERT)
BDESC (OPTION_MASK_ISA_SSSE3, 0, CODE_FOR_ssse3_palignrv1ti, "__builtin_ia32_palignr128", IX86_BUILTIN_PALIGNR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT_CONVERT)
BDESC (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX, 0, CODE_FOR_ssse3_palignrdi, "__builtin_ia32_palignr", IX86_BUILTIN_PALIGNR, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_INT_CONVERT)
/* SSE4.1 */

View file

@ -19548,9 +19548,11 @@ expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p)
shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode));
if (GET_MODE_SIZE (d->vmode) == 16)
{
target = gen_reg_rtx (TImode);
emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, dcopy.op1),
gen_lowpart (TImode, dcopy.op0), shift));
target = gen_reg_rtx (V1TImode);
emit_insn (gen_ssse3_palignrv1ti (target,
gen_lowpart (V1TImode, dcopy.op1),
gen_lowpart (V1TImode, dcopy.op0),
shift));
}
else
{

View file

@ -575,10 +575,6 @@
(define_mode_iterator VIMAX_AVX2
[(V2TI "TARGET_AVX2") V1TI])
;; ??? This should probably be dropped in favor of VIMAX_AVX2_AVX512BW.
(define_mode_iterator SSESCALARMODE
[(V4TI "TARGET_AVX512BW") (V2TI "TARGET_AVX2") TI])
(define_mode_iterator VI12_AVX2
[(V32QI "TARGET_AVX2") V16QI
(V16HI "TARGET_AVX2") V8HI])
@ -712,7 +708,7 @@
(V4HI "ssse3") (V8HI "ssse3") (V16HI "avx2") (V32HI "avx512bw")
(V4SI "ssse3") (V8SI "avx2")
(V2DI "ssse3") (V4DI "avx2")
(TI "ssse3") (V2TI "avx2") (V4TI "avx512bw")])
(V1TI "ssse3") (V2TI "avx2") (V4TI "avx512bw")])
(define_mode_attr sse4_1_avx2
[(V16QI "sse4_1") (V32QI "avx2") (V64QI "avx512bw")
@ -21108,10 +21104,10 @@
(set_attr "mode" "<sseinsnmode>")])
(define_insn "<ssse3_avx2>_palignr<mode>"
[(set (match_operand:SSESCALARMODE 0 "register_operand" "=x,<v_Yw>")
(unspec:SSESCALARMODE
[(match_operand:SSESCALARMODE 1 "register_operand" "0,<v_Yw>")
(match_operand:SSESCALARMODE 2 "vector_operand" "xBm,<v_Yw>m")
[(set (match_operand:VIMAX_AVX2_AVX512BW 0 "register_operand" "=x,<v_Yw>")
(unspec:VIMAX_AVX2_AVX512BW
[(match_operand:VIMAX_AVX2_AVX512BW 1 "register_operand" "0,<v_Yw>")
(match_operand:VIMAX_AVX2_AVX512BW 2 "vector_operand" "xBm,<v_Yw>m")
(match_operand:SI 3 "const_0_to_255_mul_8_operand")]
UNSPEC_PALIGNR))]
"TARGET_SSSE3"
@ -21157,11 +21153,30 @@
gcc_unreachable ();
}
}
"TARGET_SSSE3 && reload_completed
&& SSE_REGNO_P (REGNO (operands[0]))"
"(TARGET_SSSE3 && reload_completed
&& SSE_REGNO_P (REGNO (operands[0])))
|| operands[3] == const0_rtx
|| INTVAL (operands[3]) == 64"
[(set (match_dup 0)
(lshiftrt:V1TI (match_dup 0) (match_dup 3)))]
{
if (operands[3] == const0_rtx)
{
if (!rtx_equal_p (operands[0], operands[2]))
emit_move_insn (operands[0], operands[2]);
else
emit_note (NOTE_INSN_DELETED);
DONE;
}
else if (INTVAL (operands[3]) == 64)
{
if (!rtx_equal_p (operands[0], operands[1]))
emit_move_insn (operands[0], operands[1]);
else
emit_note (NOTE_INSN_DELETED);
DONE;
}
/* Emulate MMX palignrdi with SSE psrldq. */
rtx op0 = lowpart_subreg (V2DImode, operands[0],
GET_MODE (operands[0]));

View file

@ -0,0 +1,21 @@
/* { dg-do compile } */
/* { dg-options "-O2 -mssse3" } */
/* 64-bit (single-element) vector type matching MMX palignr operands.  */
typedef long long __attribute__ ((__vector_size__ (8))) T;
T x;
T y;
T z;
/* Alignment of 0: the result is just the second operand, so the
   compiler should emit a plain move — none of the shuffle/shift
   instructions checked by the scan-assembler-not directives below.  */
void foo()
{
z = __builtin_ia32_palignr (x, y, 0);
}
/* Alignment of 64 (the full 64-bit width): the result is just the
   first operand, so again a single move suffices and no psrldq/
   pshufd/punpcklqdq should appear in the output.  */
void bar()
{
z = __builtin_ia32_palignr (x, y, 64);
}
/* { dg-final { scan-assembler-not "punpcklqdq" } } */
/* { dg-final { scan-assembler-not "pshufd" } } */
/* { dg-final { scan-assembler-not "psrldq" } } */