Optimize __builtin_shuffle when it's used to zero the upper bits of the dest. [PR target/94680]

If the second operand of __builtin_shuffle is a const zero vector and the
mask takes the lower half of the destination from the first operand and the
upper half from the zero vector, the shuffle can be optimized to a single
movq/vmovaps.

For example:
foo128:
-       vxorps  %xmm1, %xmm1, %xmm1
-       vmovlhps        %xmm1, %xmm0, %xmm0
+       vmovq   %xmm0, %xmm0

 foo256:
-       vxorps  %xmm1, %xmm1, %xmm1
-       vshuff32x4      $0, %ymm1, %ymm0, %ymm0
+       vmovaps %xmm0, %xmm0

 foo512:
-       vxorps  %xmm1, %xmm1, %xmm1
-       vshuff32x4      $68, %zmm1, %zmm0, %zmm0
+       vmovaps %ymm0, %ymm0

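The source for foo128/foo256/foo512 is not part of the commit; a minimal
sketch of what it roughly looks like follows (function names, element types
and masks are assumed here; foo256/foo512 additionally need -mavx/-mavx512f):

typedef float v4sf  __attribute__ ((vector_size (16)));
typedef float v8sf  __attribute__ ((vector_size (32)));
typedef float v16sf __attribute__ ((vector_size (64)));
typedef int   v4si  __attribute__ ((vector_size (16)));
typedef int   v8si  __attribute__ ((vector_size (32)));
typedef int   v16si __attribute__ ((vector_size (64)));

v4sf
foo128 (v4sf x)
{
  /* Keep the low 64 bits of x, zero the rest -> vmovq %xmm0, %xmm0.  */
  return __builtin_shuffle (x, (v4sf) { 0, 0, 0, 0 }, (v4si) { 0, 1, 4, 5 });
}

v8sf
foo256 (v8sf x)
{
  /* Keep the low 128 bits of x, zero the rest -> vmovaps %xmm0, %xmm0.  */
  return __builtin_shuffle (x, (v8sf) { 0, 0, 0, 0, 0, 0, 0, 0 },
                            (v8si) { 0, 1, 2, 3, 8, 9, 10, 11 });
}

v16sf
foo512 (v16sf x)
{
  /* Keep the low 256 bits of x, zero the rest -> vmovaps %ymm0, %ymm0.  */
  return __builtin_shuffle (x, (v16sf) { 0, 0, 0, 0, 0, 0, 0, 0,
                                         0, 0, 0, 0, 0, 0, 0, 0 },
                            (v16si) { 0, 1, 2, 3, 4, 5, 6, 7,
                                      16, 17, 18, 19, 20, 21, 22, 23 });
}
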
gcc/ChangeLog:

	PR target/94680
	* config/i386/sse.md (ssedoublevecmode): Add attribute for
	V64QI/V32HI/V16SI/V8DI.
	(ssehalfvecmode): Add attribute for V2DI/V2DF.
	(*vec_concatv4si_0): Extend to VI124_128.
	(*vec_concat<mode>_0_1): New pre-reload splitter.
	* config/i386/predicates.md (movq_parallel): New predicate.

gcc/testsuite/ChangeLog:

	PR target/94680
	* gcc.target/i386/avx-pr94680.c: New test.
	* gcc.target/i386/avx512f-pr94680.c: New test.
	* gcc.target/i386/sse2-pr94680.c: New test.
Author: liuhongt
Date:   2021-04-22 15:33:16 +08:00
Commit: 94de7e225c (parent 0ff3a0f2b9)

 5 files changed, 403 insertions(+), 8 deletions(-)

gcc/config/i386/predicates.md

@@ -1535,6 +1535,38 @@
   (and (match_code "mem")
        (match_test "MEM_ALIGN (op) < GET_MODE_BITSIZE (mode)")))
 
+;; Return true if OP is a parallel for an mov{d,q,dqa,ps,pd} vec_select,
+;; where one of the two operands of the vec_concat is const0_operand.
+(define_predicate "movq_parallel"
+  (match_code "parallel")
+{
+  unsigned nelt = XVECLEN (op, 0);
+  unsigned nelt2 = nelt >> 1;
+  unsigned i;
+
+  if (nelt < 2)
+    return false;
+
+  /* Validate that all of the elements are constants,
+     lower halves of permute are lower halves of the first operand,
+     upper halves of permute come from any of the second operand.  */
+  for (i = 0; i < nelt; ++i)
+    {
+      rtx er = XVECEXP (op, 0, i);
+      unsigned HOST_WIDE_INT ei;
+
+      if (!CONST_INT_P (er))
+        return 0;
+      ei = INTVAL (er);
+      if (i < nelt2 && ei != i)
+        return 0;
+      if (i >= nelt2 && (ei < nelt || ei >= nelt << 1))
+        return 0;
+    }
+
+  return 1;
+})
+
 ;; Return true if OP is a vzeroall operation, known to be a PARALLEL.
 (define_predicate "vzeroall_operation"
   (match_code "parallel")

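As a side note, the index constraint that movq_parallel enforces can be
sketched in plain C against a selector array instead of an rtx PARALLEL
(the helper name movq_mask_p below is made up for illustration):

#include <stdbool.h>
#include <stddef.h>

/* Mirror of the predicate's constraint on the selector: the low half of
   the permutation must be the identity on operand 1, and the high half
   may pick any element of the (zero) second operand.  */
static bool
movq_mask_p (const unsigned long long *sel, size_t nelt)
{
  size_t nelt2 = nelt >> 1;

  if (nelt < 2)
    return false;

  for (size_t i = 0; i < nelt; ++i)
    {
      if (i < nelt2 && sel[i] != i)
        return false;
      if (i >= nelt2 && (sel[i] < nelt || sel[i] >= 2 * nelt))
        return false;
    }
  return true;
}

For V4SI (nelt == 4), for instance, the selector { 0, 1, 6, 7 } is accepted
while { 0, 2, 4, 5 } is rejected.
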
gcc/config/i386/sse.md

@@ -811,19 +811,22 @@
 
 ;; Mapping of vector modes to a vector mode of double size
 (define_mode_attr ssedoublevecmode
-  [(V32QI "V64QI") (V16HI "V32HI") (V8SI "V16SI") (V4DI "V8DI")
+  [(V64QI "V128QI") (V32HI "V64HI") (V16SI "V32SI") (V8DI "V16DI")
+   (V32QI "V64QI") (V16HI "V32HI") (V8SI "V16SI") (V4DI "V8DI")
    (V16QI "V32QI") (V8HI "V16HI") (V4SI "V8SI") (V2DI "V4DI")
+   (V16SF "V32SF") (V8DF "V16DF")
    (V8SF "V16SF") (V4DF "V8DF")
    (V4SF "V8SF") (V2DF "V4DF")])
 
 ;; Mapping of vector modes to a vector mode of half size
+;; instead of V1DI/V1DF, DI/DF are used for V2DI/V2DF although they are scalar.
 (define_mode_attr ssehalfvecmode
   [(V64QI "V32QI") (V32HI "V16HI") (V16SI "V8SI") (V8DI "V4DI") (V4TI "V2TI")
    (V32QI "V16QI") (V16HI "V8HI") (V8SI "V4SI") (V4DI "V2DI")
-   (V16QI "V8QI") (V8HI "V4HI") (V4SI "V2SI")
+   (V16QI "V8QI") (V8HI "V4HI") (V4SI "V2SI") (V2DI "DI")
    (V16SF "V8SF") (V8DF "V4DF")
    (V8SF "V4SF") (V4DF "V2DF")
-   (V4SF "V2SF")])
+   (V4SF "V2SF") (V2DF "DF")])
 
 (define_mode_attr ssehalfvecmodelower
   [(V64QI "v32qi") (V32HI "v16hi") (V16SI "v8si") (V8DI "v4di") (V4TI "v2ti")
@@ -15939,11 +15942,11 @@
    (set_attr "prefix" "orig,maybe_evex,orig,orig,maybe_evex")
    (set_attr "mode" "TI,TI,V4SF,V2SF,V2SF")])
 
-(define_insn "*vec_concatv4si_0"
-  [(set (match_operand:V4SI 0 "register_operand" "=v,x")
-        (vec_concat:V4SI
-          (match_operand:V2SI 1 "nonimmediate_operand" "vm,?!*y")
-          (match_operand:V2SI 2 "const0_operand" " C,C")))]
+(define_insn "*vec_concat<mode>_0"
+  [(set (match_operand:VI124_128 0 "register_operand" "=v,x")
+        (vec_concat:VI124_128
+          (match_operand:<ssehalfvecmode> 1 "nonimmediate_operand" "vm,?!*y")
+          (match_operand:<ssehalfvecmode> 2 "const0_operand" " C,C")))]
   "TARGET_SSE2"
   "@
    %vmovq\t{%1, %0|%0, %1}
@@ -22158,6 +22161,24 @@
    (set_attr "prefix" "maybe_evex")
    (set_attr "mode" "<sseinsnmode>")])
 
+(define_insn_and_split "*vec_concat<mode>_0_1"
+  [(set (match_operand:V 0 "register_operand")
+        (vec_select:V
+          (vec_concat:<ssedoublevecmode>
+            (match_operand:V 1 "nonimmediate_operand")
+            (match_operand:V 2 "const0_operand"))
+          (match_parallel 3 "movq_parallel"
+            [(match_operand 4 "const_int_operand")])))]
+  "ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+        (vec_concat:V (match_dup 1) (match_dup 5)))]
+{
+  operands[1] = gen_lowpart (<ssehalfvecmode>mode, operands[1]);
+  operands[5] = CONST0_RTX (<ssehalfvecmode>mode);
+})
+
 (define_insn "vcvtph2ps<mask_name>"
   [(set (match_operand:V4SF 0 "register_operand" "=v")
        (vec_select:V4SF

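To illustrate the effect of the new pre-reload splitter, here is a hedged
C-level sketch (not from the patch; function names are assumed): a shuffle
whose selector satisfies movq_parallel is rewritten as the low half of
operand 1 concatenated with zeros, which the extended *vec_concat<mode>_0
insn then emits as a single move:

typedef int v4si __attribute__ ((vector_size (16)));

v4si
before_split (v4si x)
{
  /* vec_select of (vec_concat x {0,0,0,0}) with a movq_parallel mask.  */
  return __builtin_shuffle (x, (v4si) { 0, 0, 0, 0 }, (v4si) { 0, 1, 6, 7 });
}

v4si
after_split_equivalent (v4si x)
{
  /* What the splitter produces: the low half of x concatenated with
     zeros, which *vec_concat<mode>_0 emits as a single movq.  */
  return (v4si) { x[0], x[1], 0, 0 };
}
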
gcc/testsuite/gcc.target/i386/avx-pr94680.c

@@ -0,0 +1,107 @@
/* { dg-do compile } */
/* { dg-options "-mavx -mno-avx512f -O2" } */
/* { dg-final { scan-assembler-times {(?n)vmov[a-z0-9]*[ \t]*%xmm[0-9]} 12 } } */
/* { dg-final { scan-assembler-not "pxor" } } */
typedef float v8sf __attribute__((vector_size(32)));
typedef double v4df __attribute__ ((vector_size (32)));
typedef long long v4di __attribute__((vector_size(32)));
typedef int v8si __attribute__((vector_size(32)));
typedef short v16hi __attribute__ ((vector_size (32)));
typedef char v32qi __attribute__ ((vector_size (32)));
v4df
foo_v4df (v4df x)
{
return __builtin_shuffle (x, (v4df) { 0, 0, 0, 0 }, (v4di) { 0, 1, 4, 5 });
}
v4df
foo_v4df_l (v4df x)
{
return __builtin_shuffle ((v4df) { 0, 0, 0, 0 }, x, (v4di) { 4, 5, 1, 2 });
}
v4di
foo_v4di (v4di x)
{
return __builtin_shuffle (x, (v4di) { 0, 0, 0, 0 }, (v4di) { 0, 1, 4, 7 });
}
v4di
foo_v4di_l (v4di x)
{
return __builtin_shuffle ((v4di) { 0, 0, 0, 0 }, x, (v4di) { 4, 5, 3, 1 });
}
v8sf
foo_v8sf (v8sf x)
{
return __builtin_shuffle ((v8sf) { 0, 0, 0, 0, 0, 0, 0, 0 }, x,
(v8si) { 8, 9, 10, 11, 0, 1, 2, 3 });
}
v8sf
foo_v8sf_l (v8sf x)
{
return __builtin_shuffle (x, (v8sf) { 0, 0, 0, 0, 0, 0, 0, 0 },
(v8si) { 0, 1, 2, 3, 8, 9, 10, 11 });
}
v8si
foo_v8si (v8si x)
{
return __builtin_shuffle (x, (v8si) { 0, 0, 0, 0, 0, 0, 0, 0 },
(v8si) { 0, 1, 2, 3, 13, 12, 11, 15 });
}
v8si
foo_v8si_l (v8si x)
{
return __builtin_shuffle ((v8si) { 0, 0, 0, 0, 0, 0, 0, 0 }, x,
(v8si) { 8, 9, 10, 11, 7, 6, 5, 4 });
}
v16hi
foo_v16hi (v16hi x)
{
return __builtin_shuffle (x, (v16hi) { 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0 },
(v16hi) { 0, 1, 2, 3, 4, 5, 6, 7,
24, 17, 26, 19, 28, 21, 30, 23 });
}
v16hi
foo_v16hi_l (v16hi x)
{
return __builtin_shuffle ((v16hi) { 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0 }, x,
(v16hi) { 16, 17, 18, 20, 21, 22, 23,
15, 0, 13, 2, 11, 4, 9, 6 });
}
v32qi
foo_v32qi (v32qi x)
{
return __builtin_shuffle (x, (v32qi) { 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0 },
(v32qi) { 0, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13, 14, 15,
32, 49, 34, 58, 36, 53, 38, 39,
40, 60, 42, 43, 63, 45, 46, 47 });
}
v32qi
foo_v32qi_l (v32qi x)
{
return __builtin_shuffle ((v32qi) { 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0 }, x,
(v32qi) { 32, 33, 34, 35, 36, 37, 38, 39,
40, 41, 42, 43, 44, 45, 46, 47,
31, 0, 29, 2, 27, 4, 25, 6,
23, 8, 21, 10, 19, 12, 17, 14 });
}

gcc/testsuite/gcc.target/i386/avx512f-pr94680.c

@@ -0,0 +1,144 @@
/* { dg-do compile } */
/* { dg-options "-mavx512bw -mavx512vbmi -O2" } */
/* { dg-final { scan-assembler-times {(?n)vmov[a-z0-9]*[ \t]*%ymm[0-9]} 12 } } */
/* { dg-final { scan-assembler-not "pxor" } } */
typedef float v16sf __attribute__((vector_size(64)));
typedef double v8df __attribute__ ((vector_size (64)));
typedef long long v8di __attribute__((vector_size(64)));
typedef int v16si __attribute__((vector_size(64)));
typedef short v32hi __attribute__ ((vector_size (64)));
typedef char v64qi __attribute__ ((vector_size (64)));
v8df
foo_v8df (v8df x)
{
return __builtin_shuffle (x, (v8df) { 0, 0, 0, 0, 0, 0, 0, 0 },
(v8di) { 0, 1, 2, 3, 15, 14, 10, 11 });
}
v8df
foo_v8df_l (v8df x)
{
return __builtin_shuffle ((v8df) { 0, 0, 0, 0, 0, 0, 0, 0 }, x,
(v8di) { 8, 9, 10, 11, 0, 1, 2, 3 });
}
v8di
foo_v8di (v8di x)
{
return __builtin_shuffle (x, (v8di) { 0, 0, 0, 0, 0, 0, 0, 0 },
(v8di) { 0, 1, 2, 3, 8, 9, 10, 11 });
}
v8di
foo_v8di_l (v8di x)
{
return __builtin_shuffle ((v8di) { 0, 0, 0, 0, 0, 0, 0, 0 }, x,
(v8di) { 8, 9, 10, 11, 7, 6, 5, 4 });
}
v16sf
foo_v16sf (v16sf x)
{
return __builtin_shuffle (x, (v16sf) { 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0 },
(v16si) { 0, 1, 2, 3, 4, 5, 6, 7,
16, 17, 18, 19, 20, 21, 22, 23 });
}
v16sf
foo_v16sf_l (v16sf x)
{
return __builtin_shuffle ((v16sf) { 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0 }, x,
(v16si) { 16, 17, 18, 19, 20, 21, 22, 23,
0, 15, 2, 13, 4, 11, 6, 9 });
}
v16si
foo_v16si (v16si x)
{
return __builtin_shuffle (x, (v16si) { 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0 },
(v16si) { 0, 1, 2, 3, 4, 5, 6, 7,
31, 30, 29, 28, 20, 21, 22, 23 });
}
v16si
foo_v16si_l (v16si x)
{
return __builtin_shuffle ((v16si) { 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0 }, x,
(v16si) { 16, 17, 18, 19, 20, 21, 22, 23,
15, 0, 13, 2, 11, 4, 9, 6 });
}
v32hi
foo_v32hi (v32hi x)
{
return __builtin_shuffle (x, (v32hi) { 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0 },
(v32hi) { 0, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13, 14, 15,
63, 33, 61, 35, 59, 37, 57, 39,
55, 41, 53, 43, 51, 45, 49, 47 });
}
v32hi
foo_v32hi_l (v32hi x)
{
return __builtin_shuffle ((v32hi) { 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0 }, x,
(v32hi) { 32, 33, 34, 35, 36, 37, 38, 39,
40, 41, 42, 43, 44, 45, 46, 47,
31, 0, 29, 2, 27, 4, 25, 6,
23, 8, 21, 10, 19, 12, 17, 14 });
}
v64qi
foo_v64qi (v64qi x)
{
return __builtin_shuffle (x, (v64qi) { 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0 },
(v64qi) {0, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23,
24, 25, 26, 27, 28, 29, 30, 31,
64, 127, 66, 125, 68, 123, 70, 121,
72, 119, 74, 117, 76, 115, 78, 113,
80, 111, 82, 109, 84, 107, 86, 105,
88, 103, 90, 101, 92, 99, 94, 97 });
}
v64qi
foo_v64qi_l (v64qi x)
{
return __builtin_shuffle ((v64qi) { 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0 }, x,
(v64qi) { 64, 65, 66, 67, 68, 69, 70, 71,
72, 73, 74, 75, 76, 77, 78, 79,
80, 81, 82, 83, 84, 85, 86, 87,
88, 89, 90, 91, 92, 93, 94, 95,
0, 63, 2, 61, 4, 59, 6, 57,
8, 55, 10, 53, 12, 51, 14, 49,
16, 47, 18, 45, 20, 43, 22, 41,
24, 39, 26, 37, 28, 35, 30, 33 });
}

gcc/testsuite/gcc.target/i386/sse2-pr94680.c

@@ -0,0 +1,91 @@
/* { dg-do compile } */
/* { dg-options "-msse2 -mno-sse4.1 -O2" } */
/* { dg-final { scan-assembler-times {(?n)(?:mov|psrldq).*%xmm[0-9]} 12 } } */
/* { dg-final { scan-assembler-not "pxor" } } */
typedef float v4sf __attribute__((vector_size(16)));
typedef double v2df __attribute__ ((vector_size (16)));
typedef long long v2di __attribute__((vector_size(16)));
typedef int v4si __attribute__((vector_size(16)));
typedef short v8hi __attribute__ ((vector_size (16)));
typedef char v16qi __attribute__ ((vector_size (16)));
v2df
foo_v2df (v2df x)
{
return __builtin_shuffle (x, (v2df) { 0, 0 }, (v2di) {0, 2});
}
v2df
foo_v2df_l (v2df x)
{
return __builtin_shuffle ((v2df) { 0, 0 }, x, (v2di) {3, 1});
}
v2di
foo_v2di (v2di x)
{
return __builtin_shuffle (x, (v2di) { 0, 0 }, (v2di) {0, 3});
}
v2di
foo_v2di_l (v2di x)
{
return __builtin_shuffle ((v2di) { 0, 0 }, x, (v2di) {3, 0});
}
v4sf
foo_v4sf (v4sf x)
{
return __builtin_shuffle (x, (v4sf) { 0, 0, 0, 0 }, (v4si) {0, 1, 4, 5});
}
v4sf
foo_v4sf_l (v4sf x)
{
return __builtin_shuffle ((v4sf) { 0, 0, 0, 0 }, x, (v4si) {4, 5, 3, 1});
}
v4si
foo_v4si (v4si x)
{
return __builtin_shuffle (x, (v4si) { 0, 0, 0, 0 }, (v4si) {0, 1, 6, 7});
}
v4si
foo_v4si_l (v4si x)
{
return __builtin_shuffle ((v4si) { 0, 0, 0, 0 }, x, (v4si) {4, 5, 1, 2});
}
v8hi
foo_v8hi (v8hi x)
{
return __builtin_shuffle (x, (v8hi) { 0, 0, 0, 0, 0, 0, 0, 0 },
(v8hi) { 0, 1, 2, 3, 8, 12, 10, 13 });
}
v8hi
foo_v8hi_l (v8hi x)
{
return __builtin_shuffle ((v8hi) { 0, 0, 0, 0, 0, 0, 0, 0 }, x,
(v8hi) { 8, 9, 10, 11, 7, 6, 5, 4 });
}
v16qi
foo_v16qi (v16qi x)
{
return __builtin_shuffle (x, (v16qi) { 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0 },
(v16qi) {0, 1, 2, 3, 4, 5, 6, 7,
16, 24, 18, 26, 20, 28, 22, 30 });
}
v16qi
foo_v16qi_l (v16qi x)
{
return __builtin_shuffle ((v16qi) { 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0 }, x,
(v16qi) { 16, 17, 18, 19, 20, 21, 22, 23,
15, 0, 13, 2, 11, 4, 9, 6 });
}