diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 4061bd0b998..b1b46326306 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,5 +1,12 @@ 2020-01-24 Jakub Jelinek + PR target/93395 + * config/i386/sse.md (*avx_vperm_broadcast_v4sf, + *avx_vperm_broadcast_, + _vpermil, + *_vpermilp): + Move before avx2_perm/avx512f_perm. + PR target/93376 * simplify-rtx.c (simplify_const_unary_operation, simplify_const_binary_operation): Punt for mode precision above diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index b8d41b7ed54..f2f4a4e1515 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -19875,6 +19875,164 @@ (set_attr "prefix" "") (set_attr "mode" "")]) +;; Recognize broadcast as a vec_select as produced by builtin_vec_perm. +;; If it so happens that the input is in memory, use vbroadcast. +;; Otherwise use vpermilp (and in the case of 256-bit modes, vperm2f128). +(define_insn "*avx_vperm_broadcast_v4sf" + [(set (match_operand:V4SF 0 "register_operand" "=v,v,v") + (vec_select:V4SF + (match_operand:V4SF 1 "nonimmediate_operand" "m,o,v") + (match_parallel 2 "avx_vbroadcast_operand" + [(match_operand 3 "const_int_operand" "C,n,n")])))] + "TARGET_AVX" +{ + int elt = INTVAL (operands[3]); + switch (which_alternative) + { + case 0: + case 1: + operands[1] = adjust_address_nv (operands[1], SFmode, elt * 4); + return "vbroadcastss\t{%1, %0|%0, %k1}"; + case 2: + operands[2] = GEN_INT (elt * 0x55); + return "vpermilps\t{%2, %1, %0|%0, %1, %2}"; + default: + gcc_unreachable (); + } +} + [(set_attr "type" "ssemov,ssemov,sselog1") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "0,0,1") + (set_attr "prefix" "maybe_evex") + (set_attr "mode" "SF,SF,V4SF")]) + +(define_insn_and_split "*avx_vperm_broadcast_" + [(set (match_operand:VF_256 0 "register_operand" "=v,v,v") + (vec_select:VF_256 + (match_operand:VF_256 1 "nonimmediate_operand" "m,o,?v") + (match_parallel 2 "avx_vbroadcast_operand" + [(match_operand 3 "const_int_operand" "C,n,n")])))] + "TARGET_AVX" + "#" + "&& reload_completed && (mode != V4DFmode || !TARGET_AVX2)" + [(set (match_dup 0) (vec_duplicate:VF_256 (match_dup 1)))] +{ + rtx op0 = operands[0], op1 = operands[1]; + int elt = INTVAL (operands[3]); + + if (REG_P (op1)) + { + int mask; + + if (TARGET_AVX2 && elt == 0) + { + emit_insn (gen_vec_dup (op0, gen_lowpart (mode, + op1))); + DONE; + } + + /* Shuffle element we care about into all elements of the 128-bit lane. + The other lane gets shuffled too, but we don't care. */ + if (mode == V4DFmode) + mask = (elt & 1 ? 15 : 0); + else + mask = (elt & 3) * 0x55; + emit_insn (gen_avx_vpermil (op0, op1, GEN_INT (mask))); + + /* Shuffle the lane we care about into both lanes of the dest. */ + mask = (elt / ( / 2)) * 0x11; + if (EXT_REX_SSE_REG_P (op0)) + { + /* There is no EVEX VPERM2F128, but we can use either VBROADCASTSS + or VSHUFF128. */ + gcc_assert (mode == V8SFmode); + if ((mask & 1) == 0) + emit_insn (gen_avx2_vec_dupv8sf (op0, + gen_lowpart (V4SFmode, op0))); + else + emit_insn (gen_avx512vl_shuf_f32x4_1 (op0, op0, op0, + GEN_INT (4), GEN_INT (5), + GEN_INT (6), GEN_INT (7), + GEN_INT (12), GEN_INT (13), + GEN_INT (14), GEN_INT (15))); + DONE; + } + + emit_insn (gen_avx_vperm2f1283 (op0, op0, op0, GEN_INT (mask))); + DONE; + } + + operands[1] = adjust_address (op1, mode, + elt * GET_MODE_SIZE (mode)); +}) + +(define_expand "_vpermil" + [(set (match_operand:VF2 0 "register_operand") + (vec_select:VF2 + (match_operand:VF2 1 "nonimmediate_operand") + (match_operand:SI 2 "const_0_to_255_operand")))] + "TARGET_AVX && " +{ + int mask = INTVAL (operands[2]); + rtx perm[]; + + int i; + for (i = 0; i < ; i = i + 2) + { + perm[i] = GEN_INT (((mask >> i) & 1) + i); + perm[i + 1] = GEN_INT (((mask >> (i + 1)) & 1) + i); + } + + operands[2] + = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (, perm)); +}) + +(define_expand "_vpermil" + [(set (match_operand:VF1 0 "register_operand") + (vec_select:VF1 + (match_operand:VF1 1 "nonimmediate_operand") + (match_operand:SI 2 "const_0_to_255_operand")))] + "TARGET_AVX && " +{ + int mask = INTVAL (operands[2]); + rtx perm[]; + + int i; + for (i = 0; i < ; i = i + 4) + { + perm[i] = GEN_INT (((mask >> 0) & 3) + i); + perm[i + 1] = GEN_INT (((mask >> 2) & 3) + i); + perm[i + 2] = GEN_INT (((mask >> 4) & 3) + i); + perm[i + 3] = GEN_INT (((mask >> 6) & 3) + i); + } + + operands[2] + = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (, perm)); +}) + +;; This pattern needs to come before the avx2_perm*/avx512f_perm* +;; patterns, as they have the same RTL representation (vpermilp* +;; being a subset of what vpermp* can do), but vpermilp* has shorter +;; latency as it never crosses lanes. +(define_insn "*_vpermilp" + [(set (match_operand:VF 0 "register_operand" "=v") + (vec_select:VF + (match_operand:VF 1 "nonimmediate_operand" "vm") + (match_parallel 2 "" + [(match_operand 3 "const_int_operand")])))] + "TARGET_AVX && + && avx_vpermilp_parallel (operands[2], mode)" +{ + int mask = avx_vpermilp_parallel (operands[2], mode) - 1; + operands[2] = GEN_INT (mask); + return "vpermil\t{%2, %1, %0|%0, %1, %2}"; +} + [(set_attr "type" "sselog") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set_attr "prefix" "") + (set_attr "mode" "")]) + (define_expand "avx2_perm" [(match_operand:VI8F_256 0 "register_operand") (match_operand:VI8F_256 1 "nonimmediate_operand") @@ -20376,160 +20534,6 @@ (set_attr "prefix" "evex") (set_attr "mode" "XI")]) -;; Recognize broadcast as a vec_select as produced by builtin_vec_perm. -;; If it so happens that the input is in memory, use vbroadcast. -;; Otherwise use vpermilp (and in the case of 256-bit modes, vperm2f128). -(define_insn "*avx_vperm_broadcast_v4sf" - [(set (match_operand:V4SF 0 "register_operand" "=v,v,v") - (vec_select:V4SF - (match_operand:V4SF 1 "nonimmediate_operand" "m,o,v") - (match_parallel 2 "avx_vbroadcast_operand" - [(match_operand 3 "const_int_operand" "C,n,n")])))] - "TARGET_AVX" -{ - int elt = INTVAL (operands[3]); - switch (which_alternative) - { - case 0: - case 1: - operands[1] = adjust_address_nv (operands[1], SFmode, elt * 4); - return "vbroadcastss\t{%1, %0|%0, %k1}"; - case 2: - operands[2] = GEN_INT (elt * 0x55); - return "vpermilps\t{%2, %1, %0|%0, %1, %2}"; - default: - gcc_unreachable (); - } -} - [(set_attr "type" "ssemov,ssemov,sselog1") - (set_attr "prefix_extra" "1") - (set_attr "length_immediate" "0,0,1") - (set_attr "prefix" "maybe_evex") - (set_attr "mode" "SF,SF,V4SF")]) - -(define_insn_and_split "*avx_vperm_broadcast_" - [(set (match_operand:VF_256 0 "register_operand" "=v,v,v") - (vec_select:VF_256 - (match_operand:VF_256 1 "nonimmediate_operand" "m,o,?v") - (match_parallel 2 "avx_vbroadcast_operand" - [(match_operand 3 "const_int_operand" "C,n,n")])))] - "TARGET_AVX" - "#" - "&& reload_completed && (mode != V4DFmode || !TARGET_AVX2)" - [(set (match_dup 0) (vec_duplicate:VF_256 (match_dup 1)))] -{ - rtx op0 = operands[0], op1 = operands[1]; - int elt = INTVAL (operands[3]); - - if (REG_P (op1)) - { - int mask; - - if (TARGET_AVX2 && elt == 0) - { - emit_insn (gen_vec_dup (op0, gen_lowpart (mode, - op1))); - DONE; - } - - /* Shuffle element we care about into all elements of the 128-bit lane. - The other lane gets shuffled too, but we don't care. */ - if (mode == V4DFmode) - mask = (elt & 1 ? 15 : 0); - else - mask = (elt & 3) * 0x55; - emit_insn (gen_avx_vpermil (op0, op1, GEN_INT (mask))); - - /* Shuffle the lane we care about into both lanes of the dest. */ - mask = (elt / ( / 2)) * 0x11; - if (EXT_REX_SSE_REG_P (op0)) - { - /* There is no EVEX VPERM2F128, but we can use either VBROADCASTSS - or VSHUFF128. */ - gcc_assert (mode == V8SFmode); - if ((mask & 1) == 0) - emit_insn (gen_avx2_vec_dupv8sf (op0, - gen_lowpart (V4SFmode, op0))); - else - emit_insn (gen_avx512vl_shuf_f32x4_1 (op0, op0, op0, - GEN_INT (4), GEN_INT (5), - GEN_INT (6), GEN_INT (7), - GEN_INT (12), GEN_INT (13), - GEN_INT (14), GEN_INT (15))); - DONE; - } - - emit_insn (gen_avx_vperm2f1283 (op0, op0, op0, GEN_INT (mask))); - DONE; - } - - operands[1] = adjust_address (op1, mode, - elt * GET_MODE_SIZE (mode)); -}) - -(define_expand "_vpermil" - [(set (match_operand:VF2 0 "register_operand") - (vec_select:VF2 - (match_operand:VF2 1 "nonimmediate_operand") - (match_operand:SI 2 "const_0_to_255_operand")))] - "TARGET_AVX && " -{ - int mask = INTVAL (operands[2]); - rtx perm[]; - - int i; - for (i = 0; i < ; i = i + 2) - { - perm[i] = GEN_INT (((mask >> i) & 1) + i); - perm[i + 1] = GEN_INT (((mask >> (i + 1)) & 1) + i); - } - - operands[2] - = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (, perm)); -}) - -(define_expand "_vpermil" - [(set (match_operand:VF1 0 "register_operand") - (vec_select:VF1 - (match_operand:VF1 1 "nonimmediate_operand") - (match_operand:SI 2 "const_0_to_255_operand")))] - "TARGET_AVX && " -{ - int mask = INTVAL (operands[2]); - rtx perm[]; - - int i; - for (i = 0; i < ; i = i + 4) - { - perm[i] = GEN_INT (((mask >> 0) & 3) + i); - perm[i + 1] = GEN_INT (((mask >> 2) & 3) + i); - perm[i + 2] = GEN_INT (((mask >> 4) & 3) + i); - perm[i + 3] = GEN_INT (((mask >> 6) & 3) + i); - } - - operands[2] - = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (, perm)); -}) - -(define_insn "*_vpermilp" - [(set (match_operand:VF 0 "register_operand" "=v") - (vec_select:VF - (match_operand:VF 1 "nonimmediate_operand" "vm") - (match_parallel 2 "" - [(match_operand 3 "const_int_operand")])))] - "TARGET_AVX && - && avx_vpermilp_parallel (operands[2], mode)" -{ - int mask = avx_vpermilp_parallel (operands[2], mode) - 1; - operands[2] = GEN_INT (mask); - return "vpermil\t{%2, %1, %0|%0, %1, %2}"; -} - [(set_attr "type" "sselog") - (set_attr "prefix_extra" "1") - (set_attr "length_immediate" "1") - (set_attr "prefix" "") - (set_attr "mode" "")]) - (define_insn "_vpermilvar3" [(set (match_operand:VF 0 "register_operand" "=v") (unspec:VF diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index d1213ccd44f..1fc95b334a8 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,9 @@ +2020-01-24 Jakub Jelinek + + PR target/93395 + * gcc.target/i386/pr93395.c: New test. + * gcc.target/i386/avx512vl-vpermilpdi-1.c: Remove xfail. + 2020-01-24 Marek Polacek PR c++/93299 - ICE in tsubst_copy with parenthesized expression. diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-vpermilpdi-1.c b/gcc/testsuite/gcc.target/i386/avx512vl-vpermilpdi-1.c index b4c8c111a17..cbbec3a1849 100644 --- a/gcc/testsuite/gcc.target/i386/avx512vl-vpermilpdi-1.c +++ b/gcc/testsuite/gcc.target/i386/avx512vl-vpermilpdi-1.c @@ -1,7 +1,7 @@ /* { dg-do compile } */ /* { dg-options "-mavx512vl -O2" } */ -/* { dg-final { scan-assembler-times "vpermilpd\[ \\t\]+\[^\{\n\]*13\[^\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */ -/* { dg-final { scan-assembler-times "vpermilpd\[ \\t\]+\[^\{\n\]*13\[^\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */ +/* { dg-final { scan-assembler-times "vpermilpd\[ \\t\]+\[^\{\n\]*13\[^\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vpermilpd\[ \\t\]+\[^\{\n\]*13\[^\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ /* { dg-final { scan-assembler-times "vpermilpd\[ \\t\]+\[^\{\n\]*3\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ /* { dg-final { scan-assembler-times "vpermilpd\[ \\t\]+\[^\{\n\]*3\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr93395.c b/gcc/testsuite/gcc.target/i386/pr93395.c new file mode 100644 index 00000000000..bdc3a5531ed --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr93395.c @@ -0,0 +1,44 @@ +/* PR target/93395 */ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx512f -masm=att" } */ +/* { dg-final { scan-assembler-times "vpermilpd\t.5, %ymm" 3 } } */ +/* { dg-final { scan-assembler-times "vpermilpd\t.85, %zmm" 3 } } */ +/* { dg-final { scan-assembler-not "vpermpd\t" } } */ + +#include + +__m256d +foo1 (__m256d a) +{ + return _mm256_permute4x64_pd (a, 177); +} + +__m256d +foo2 (__m256d a) +{ + return _mm256_permute_pd (a, 5); +} + +__m256d +foo3 (__m256d a) +{ + return __builtin_shuffle (a, (__v4di) { 1, 0, 3, 2 }); +} + +__m512d +foo4 (__m512d a) +{ + return _mm512_permutex_pd (a, 177); +} + +__m512d +foo5 (__m512d a) +{ + return _mm512_permute_pd (a, 85); +} + +__m512d +foo6 (__m512d a) +{ + return __builtin_shuffle (a, (__v8di) { 1, 0, 3, 2, 5, 4, 7, 6 }); +}