diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index 354a9a713b7..cba2880dc8d 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -16204,18 +16204,8 @@ ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt) } return; - case E_V8HFmode: - if (TARGET_AVX2) - { - mmode = SImode; - gen_blendm = gen_sse4_1_pblendph; - blendm_const = true; - } - else - use_vec_merge = true; - break; - case E_V8HImode: + case E_V8HFmode: case E_V2HImode: use_vec_merge = TARGET_SSE2; break; diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 4e9fae80479..9d7d1161d15 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -4656,15 +4656,7 @@ rtx tmp = gen_reg_rtx (V8HFmode); rtx zero = force_reg (V8HFmode, CONST0_RTX (V8HFmode)); - if (TARGET_AVX2) - { - rtx dup = gen_reg_rtx (V8HFmode); - emit_move_insn (dup, gen_rtx_VEC_DUPLICATE (V8HFmode, operands[1])); - emit_move_insn (tmp, gen_rtx_VEC_MERGE (V8HFmode, dup, - zero, const1_rtx)); - } - else - emit_insn (gen_sse2_pinsrph (tmp, zero, operands[1], const1_rtx)); + emit_insn (gen_vec_setv8hf_0 (tmp, zero, operands[1])); emit_insn (gen_vcvtph2ps (res, gen_lowpart (V8HImode, tmp))); emit_move_insn (operands[0], gen_lowpart (SFmode, res)); DONE; diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 08bdcddc111..f8b34a15cc6 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -827,7 +827,7 @@ (V32HF "TARGET_AVX512BW")]) ;; Int-float size matches -(define_mode_iterator VI2F [V8HI V16HI V32HI V8HF V16HF V32HF]) +(define_mode_iterator VI2F_256_512 [V16HI V32HI V16HF V32HF]) (define_mode_iterator VI4F_128 [V4SI V4SF]) (define_mode_iterator VI8F_128 [V2DI V2DF]) (define_mode_iterator VI4F_256 [V8SI V8SF]) @@ -10170,13 +10170,84 @@ ] (symbol_ref "true")))]) +(define_insn "vec_set_0" + [(set (match_operand:V8_128 0 "register_operand" + "=v,v,v,x,x,Yr,*x,x,x,x,v,v") + (vec_merge:V8_128 + (vec_duplicate:V8_128 + (match_operand: 2 "nonimmediate_operand" + " r,m,v,r,m,Yr,*x,r,m,x,r,m")) + (match_operand:V8_128 1 "reg_or_0_operand" + " C,C,v,0,0,0 ,0 ,x,x,x,v,v") + (const_int 1)))] + "TARGET_SSE2" + "@ + vmovw\t{%k2, %0|%0, %k2} + vmovw\t{%2, %0|%0, %2} + vmovsh\t{%2, %1, %0|%0, %1, %2} + pinsrw\t{$0, %k2, %0|%0, %k2, 0} + pinsrw\t{$0, %2, %0|%0, %2, 0} + pblendw\t{$1, %2, %0|%0, %2, 1} + pblendw\t{$1, %2, %0|%0, %2, 1} + vpinsrw\t{$0, %k2, %1, %0|%0, %1, %k2, 0} + vpinsrw\t{$0, %2, %1, %0|%0, %1, %2, 0} + vpblendw\t{$1, %2, %1, %0|%0, %1, %2, 1} + vpinsrw\t{$0, %k2, %1, %0|%0, %1, %k2, 0} + vpinsrw\t{$0, %2, %1, %0|%0, %1, %2, 0}" + [(set (attr "isa") + (cond [(eq_attr "alternative" "0,1,2") + (const_string "avx512fp16") + (eq_attr "alternative" "3") + (const_string "noavx") + (eq_attr "alternative" "4,5,6") + (const_string "sse4_noavx") + (eq_attr "alternative" "7,8,9") + (const_string "avx") + (eq_attr "alternative" "10,11") + (const_string "avx512bw") + ] + (const_string "*"))) + (set (attr "type") + (if_then_else (eq_attr "alternative" "0,1,2,5,6,9") + (const_string "ssemov") + (const_string "sselog"))) + (set (attr "prefix_data16") + (if_then_else (eq_attr "alternative" "3,4") + (const_string "1") + (const_string "*"))) + (set (attr "prefix_extra") + (if_then_else (eq_attr "alternative" "5,6,7,8,9") + (const_string "1") + (const_string "*"))) + (set (attr "length_immediate") + (if_then_else (eq_attr "alternative" "0,1,2") + (const_string "*") + (const_string "1"))) + (set (attr "prefix") + (cond [(eq_attr "alternative" "0,1,2,10,11") + (const_string "evex") + (eq_attr "alternative" "7,8,9") + (const_string "vex") + ] + (const_string "orig"))) + (set (attr "mode") + (if_then_else (eq_attr "alternative" "0,1,2") + (const_string "HF") + (const_string "TI"))) + (set (attr "enabled") + (cond [(and (not (match_test "mode == V8HFmode")) + (eq_attr "alternative" "2")) + (symbol_ref "false") + ] + (const_string "*")))]) + ;; vmovw clears also the higer bits (define_insn "vec_set_0" - [(set (match_operand:VI2F 0 "register_operand" "=v,v") - (vec_merge:VI2F - (vec_duplicate:VI2F + [(set (match_operand:VI2F_256_512 0 "register_operand" "=v,v") + (vec_merge:VI2F_256_512 + (vec_duplicate:VI2F_256_512 (match_operand: 2 "nonimmediate_operand" "r,m")) - (match_operand:VI2F 1 "const0_operand" "C,C") + (match_operand:VI2F_256_512 1 "const0_operand" "C,C") (const_int 1)))] "TARGET_AVX512FP16" "@ @@ -10186,19 +10257,6 @@ (set_attr "prefix" "evex") (set_attr "mode" "HF")]) -(define_insn "*avx512fp16_movsh" - [(set (match_operand:V8HF 0 "register_operand" "=v") - (vec_merge:V8HF - (vec_duplicate:V8HF - (match_operand:HF 2 "register_operand" "v")) - (match_operand:V8HF 1 "register_operand" "v") - (const_int 1)))] - "TARGET_AVX512FP16" - "vmovsh\t{%2, %1, %0|%0, %1, %2}" - [(set_attr "type" "ssemov") - (set_attr "prefix" "evex") - (set_attr "mode" "HF")]) - (define_insn "avx512fp16_movsh" [(set (match_operand:V8HF 0 "register_operand" "=v") (vec_merge:V8HF @@ -17312,20 +17370,20 @@ (V4SI "avx512dq") (V2DI "avx512dq")]) ;; sse4_1_pinsrd must come before sse2_loadld since it is preferred. -;; For V8HFmode and TARGET_AVX2, broadcastw + pblendw should be better. (define_insn "_pinsr" - [(set (match_operand:PINSR_MODE 0 "register_operand" "=x,x,x,x,v,v") + [(set (match_operand:PINSR_MODE 0 "register_operand" "=x,x,x,x,v,v,x") (vec_merge:PINSR_MODE (vec_duplicate:PINSR_MODE - (match_operand: 2 "nonimmediate_operand" "r,m,r,m,r,m")) - (match_operand:PINSR_MODE 1 "register_operand" "0,0,x,x,v,v") + (match_operand: 2 "nonimmediate_operand" "r,m,r,m,r,m,x")) + (match_operand:PINSR_MODE 1 "register_operand" "0,0,x,x,v,v,x") (match_operand:SI 3 "const_int_operand")))] "TARGET_SSE2 && ((unsigned) exact_log2 (INTVAL (operands[3])) - < GET_MODE_NUNITS (mode)) - && !(mode == V8HFmode && TARGET_AVX2)" + < GET_MODE_NUNITS (mode))" { - operands[3] = GEN_INT (exact_log2 (INTVAL (operands[3]))); + HOST_WIDE_INT items = INTVAL (operands[3]); + + operands[3] = GEN_INT (exact_log2 (items)); switch (which_alternative) { @@ -17343,33 +17401,83 @@ case 3: case 5: return "vpinsr\t{%3, %2, %1, %0|%0, %1, %2, %3}"; + case 6: + /* This pattern needs to be shadowed with vec_set{v8hi,v8hf}_0. */ + gcc_assert (items > 1); + return "#"; default: gcc_unreachable (); } } - [(set_attr "isa" "noavx,noavx,avx,avx,,") + [(set_attr "isa" "noavx,noavx,avx,avx,,,avx2") (set_attr "type" "sselog") (set (attr "prefix_rex") (if_then_else (and (not (match_test "TARGET_AVX")) - (eq (const_string "mode") (const_string "V2DImode"))) + (match_test "GET_MODE_NUNITS (mode) == 2")) (const_string "1") (const_string "*"))) (set (attr "prefix_data16") (if_then_else (and (not (match_test "TARGET_AVX")) - (eq (const_string "mode") (const_string "V8HImode"))) + (match_test "GET_MODE_NUNITS (mode) == 8")) (const_string "1") (const_string "*"))) (set (attr "prefix_extra") (if_then_else (and (not (match_test "TARGET_AVX")) - (eq (const_string "mode") (const_string "V8HImode"))) + (match_test "GET_MODE_NUNITS (mode) == 8")) (const_string "*") (const_string "1"))) (set_attr "length_immediate" "1") - (set_attr "prefix" "orig,orig,vex,vex,evex,evex") - (set_attr "mode" "TI")]) + (set_attr "prefix" "orig,orig,vex,vex,evex,evex,vex") + (set_attr "mode" "TI") + (set (attr "enabled") + (cond [(and (not (match_test "GET_MODE_NUNITS (mode) == 8")) + (eq_attr "alternative" "6")) + (symbol_ref "false") + ] + (const_string "*")))]) + +;; For TARGET_AVX2, implement insert from XMM reg with PBROADCASTW + PBLENDW. +;; First try to get a scratch register and go through it. In case this fails, +;; overwrite source reg with broadcasted value and blend from there. +(define_peephole2 + [(match_scratch:V8_128 4 "x") + (set (match_operand:V8_128 0 "sse_reg_operand") + (vec_merge:V8_128 + (vec_duplicate:V8_128 + (match_operand: 2 "sse_reg_operand")) + (match_operand:V8_128 1 "sse_reg_operand") + (match_operand:SI 3 "const_int_operand")))] + "TARGET_AVX2 + && INTVAL (operands[3]) > 1 + && ((unsigned) exact_log2 (INTVAL (operands[3])) + < GET_MODE_NUNITS (mode))" + [(set (match_dup 4) + (vec_duplicate:V8_128 (match_dup 2))) + (set (match_dup 0) + (vec_merge:V8_128 (match_dup 4) (match_dup 1) (match_dup 3)))]) + +(define_split + [(set (match_operand:V8_128 0 "sse_reg_operand") + (vec_merge:V8_128 + (vec_duplicate:V8_128 + (match_operand: 2 "sse_reg_operand")) + (match_operand:V8_128 1 "sse_reg_operand") + (match_operand:SI 3 "const_int_operand")))] + "TARGET_AVX2 && epilogue_completed + && INTVAL (operands[3]) > 1 + && ((unsigned) exact_log2 (INTVAL (operands[3])) + < GET_MODE_NUNITS (mode))" + [(set (match_dup 4) + (vec_duplicate:V8_128 (match_dup 2))) + (set (match_dup 0) + (vec_merge:V8_128 (match_dup 4) (match_dup 1) (match_dup 3)))] +{ + operands[4] = lowpart_subreg (mode, operands[2], + mode); +}) (define_expand "_vinsert_mask" [(match_operand:AVX512_VEC 0 "register_operand") diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-1c.c b/gcc/testsuite/gcc.target/i386/avx512fp16-1c.c index b41a90b7c9d..096688690da 100644 --- a/gcc/testsuite/gcc.target/i386/avx512fp16-1c.c +++ b/gcc/testsuite/gcc.target/i386/avx512fp16-1c.c @@ -1,8 +1,11 @@ /* { dg-do compile } */ /* { dg-options "-mavx512fp16 -O2" } */ -/* { dg-final { scan-assembler-times "vmovsh" 1 } } */ -/* { dg-final { scan-assembler-times "vpblendw" 1 } } */ -/* { dg-final { scan-assembler "vpbroadcastw" } } */ + +/* { dg-final { scan-assembler-times "vpbroadcastw" 1 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpblendw" 1 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vmovsh" 1 { target { ! ia32 } } } } */ + +/* { dg-final { scan-assembler-times "vpinsrw" 2 { target ia32 } } } */ typedef _Float16 __v8hf __attribute__ ((__vector_size__ (16))); typedef _Float16 __m128h __attribute__ ((__vector_size__ (16), __may_alias__)); diff --git a/gcc/testsuite/gcc.target/i386/pr102327-1.c b/gcc/testsuite/gcc.target/i386/pr102327-1.c index 47439261b61..bd02f534600 100644 --- a/gcc/testsuite/gcc.target/i386/pr102327-1.c +++ b/gcc/testsuite/gcc.target/i386/pr102327-1.c @@ -60,6 +60,11 @@ VEC_SET (v32hf, _Float16, 14); VEC_SET (v32hf, _Float16, 16); VEC_SET (v32hf, _Float16, 24); VEC_SET (v32hf, _Float16, 28); -/* { dg-final { scan-assembler-times "vpbroadcastw" 10 } } */ -/* { dg-final { scan-assembler-times "vpblendw" 4 } } */ +/* { dg-final { scan-assembler-times "vpbroadcastw" 10 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpblendw" 4 { target { ! ia32 } } } } */ + +/* { dg-final { scan-assembler-times "vpbroadcastw" 9 { target ia32 } } } */ +/* { dg-final { scan-assembler-times "vpblendw" 3 { target ia32 } } } */ +/* { dg-final { scan-assembler-times "vpinsrw" 1 { target ia32 } } } */ + /* { dg-final { scan-assembler-times "vpblendd" 3 } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr102811-1.c b/gcc/testsuite/gcc.target/i386/pr102811-1.c new file mode 100644 index 00000000000..a1952d11ed2 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr102811-1.c @@ -0,0 +1,11 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mf16c -mno-avx512fp16" } */ +/* { dg-final { scan-assembler-times "vpxor" 1 } } */ +/* { dg-final { scan-assembler-times "vpblendw" 2 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpinsrw" 2 { target ia32 } } } */ +/* { dg-final { scan-assembler-not "vpextrw"} } */ +/* { dg-final { scan-assembler-not "vpbroadcastw"} } */ +_Float16 test (_Float16 a, _Float16 b) +{ + return a + b; +} diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-vcvtps2ph-pr102811.c b/gcc/testsuite/gcc.target/i386/pr102811.c similarity index 100% rename from gcc/testsuite/gcc.target/i386/avx512vl-vcvtps2ph-pr102811.c rename to gcc/testsuite/gcc.target/i386/pr102811.c