i386: Support vectorized BF16 add/sub/mul/div with AVX10.2 instructions
AVX10.2 introduces several non-exception instructions for BF16 vector. Enable vectorized BF add/sub/mul/div operation by supporting standard optab for them. gcc/ChangeLog: * config/i386/sse.md (div<mode>3): New expander for BFmode div. (VF_BHSD): New mode iterator with vector BFmodes. (<insn><mode>3<mask_name><round_name>): Change mode to VF_BHSD. (mul<mode>3<mask_name><round_name>): Likewise. gcc/testsuite/ChangeLog: * gcc.target/i386/avx10_2-512-bf-vector-operations-1.c: New test. * gcc.target/i386/avx10_2-bf-vector-operations-1.c: Ditto.
This commit is contained in:
parent
3b1decef83
commit
f82fa0da4d
3 changed files with 162 additions and 8 deletions
|
@ -391,6 +391,19 @@
|
|||
(V8DF "TARGET_AVX512F && TARGET_EVEX512") (V4DF "TARGET_AVX")
|
||||
(V2DF "TARGET_SSE2")])
|
||||
|
||||
(define_mode_iterator VF_BHSD
|
||||
[(V32HF "TARGET_AVX512FP16 && TARGET_EVEX512")
|
||||
(V16HF "TARGET_AVX512FP16 && TARGET_AVX512VL")
|
||||
(V8HF "TARGET_AVX512FP16 && TARGET_AVX512VL")
|
||||
(V16SF "TARGET_AVX512F && TARGET_EVEX512")
|
||||
(V8SF "TARGET_AVX") V4SF
|
||||
(V8DF "TARGET_AVX512F && TARGET_EVEX512")
|
||||
(V4DF "TARGET_AVX") (V2DF "TARGET_SSE2")
|
||||
(V32BF "TARGET_AVX10_2_512")
|
||||
(V16BF "TARGET_AVX10_2_256")
|
||||
(V8BF "TARGET_AVX10_2_256")
|
||||
])
|
||||
|
||||
;; 128-, 256- and 512-bit float vector modes for bitwise operations
|
||||
(define_mode_iterator VFB
|
||||
[(V32BF "TARGET_AVX512F && TARGET_EVEX512")
|
||||
|
@ -2527,10 +2540,10 @@
|
|||
})
|
||||
|
||||
(define_expand "<insn><mode>3<mask_name><round_name>"
|
||||
[(set (match_operand:VFH 0 "register_operand")
|
||||
(plusminus:VFH
|
||||
(match_operand:VFH 1 "<round_nimm_predicate>")
|
||||
(match_operand:VFH 2 "<round_nimm_predicate>")))]
|
||||
[(set (match_operand:VF_BHSD 0 "register_operand")
|
||||
(plusminus:VF_BHSD
|
||||
(match_operand:VF_BHSD 1 "<round_nimm_predicate>")
|
||||
(match_operand:VF_BHSD 2 "<round_nimm_predicate>")))]
|
||||
"TARGET_SSE && <mask_mode512bit_condition> && <round_mode_condition>"
|
||||
"ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);")
|
||||
|
||||
|
@ -2616,10 +2629,10 @@
|
|||
})
|
||||
|
||||
(define_expand "mul<mode>3<mask_name><round_name>"
|
||||
[(set (match_operand:VFH 0 "register_operand")
|
||||
(mult:VFH
|
||||
(match_operand:VFH 1 "<round_nimm_predicate>")
|
||||
(match_operand:VFH 2 "<round_nimm_predicate>")))]
|
||||
[(set (match_operand:VF_BHSD 0 "register_operand")
|
||||
(mult:VF_BHSD
|
||||
(match_operand:VF_BHSD 1 "<round_nimm_predicate>")
|
||||
(match_operand:VF_BHSD 2 "<round_nimm_predicate>")))]
|
||||
"TARGET_SSE && <mask_mode512bit_condition> && <round_mode_condition>"
|
||||
"ix86_fixup_binary_operands_no_copy (MULT, <MODE>mode, operands);")
|
||||
|
||||
|
@ -2734,6 +2747,26 @@
|
|||
}
|
||||
})
|
||||
|
||||
(define_expand "div<mode>3"
|
||||
[(set (match_operand:VBF_AVX10_2 0 "register_operand")
|
||||
(div:VBF_AVX10_2
|
||||
(match_operand:VBF_AVX10_2 1 "register_operand")
|
||||
(match_operand:VBF_AVX10_2 2 "vector_operand")))]
|
||||
"TARGET_AVX10_2_256"
|
||||
{
|
||||
if (TARGET_RECIP_VEC_DIV
|
||||
&& optimize_insn_for_speed_p ()
|
||||
&& flag_finite_math_only
|
||||
&& flag_unsafe_math_optimizations)
|
||||
{
|
||||
rtx op = gen_reg_rtx (<MODE>mode);
|
||||
operands[2] = force_reg (<MODE>mode, operands[2]);
|
||||
emit_insn (gen_avx10_2_rcppbf16_<mode> (op, operands[2]));
|
||||
emit_insn (gen_avx10_2_mulnepbf16_<mode> (operands[0], operands[1], op));
|
||||
DONE;
|
||||
}
|
||||
})
|
||||
|
||||
(define_expand "cond_div<mode>"
|
||||
[(set (match_operand:VFH 0 "register_operand")
|
||||
(vec_merge:VFH
|
||||
|
|
|
@ -0,0 +1,42 @@
|
|||
/* { dg-do compile } */
|
||||
/* { dg-options "-mavx10.2-512 -O2" } */
|
||||
/* { dg-final { scan-assembler-times "vmulnepbf16\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */
|
||||
/* { dg-final { scan-assembler-times "vaddnepbf16\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
|
||||
/* { dg-final { scan-assembler-times "vdivnepbf16\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
|
||||
/* { dg-final { scan-assembler-times "vsubnepbf16\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
|
||||
/* { dg-final { scan-assembler-times "vrcppbf16\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
|
||||
|
||||
#include <immintrin.h>
|
||||
|
||||
typedef __bf16 v32bf __attribute__ ((__vector_size__ (64)));
|
||||
|
||||
v32bf
|
||||
foo_mul (v32bf a, v32bf b)
|
||||
{
|
||||
return a * b;
|
||||
}
|
||||
|
||||
v32bf
|
||||
foo_add (v32bf a, v32bf b)
|
||||
{
|
||||
return a + b;
|
||||
}
|
||||
|
||||
v32bf
|
||||
foo_div (v32bf a, v32bf b)
|
||||
{
|
||||
return a / b;
|
||||
}
|
||||
|
||||
v32bf
|
||||
foo_sub (v32bf a, v32bf b)
|
||||
{
|
||||
return a - b;
|
||||
}
|
||||
|
||||
__attribute__((optimize("fast-math")))
|
||||
v32bf
|
||||
foo_div_fast_math (v32bf a, v32bf b)
|
||||
{
|
||||
return a / b;
|
||||
}
|
|
@ -0,0 +1,79 @@
|
|||
/* { dg-do compile } */
|
||||
/* { dg-options "-mavx10.2 -O2" } */
|
||||
/* { dg-final { scan-assembler-times "vmulnepbf16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */
|
||||
/* { dg-final { scan-assembler-times "vaddnepbf16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
|
||||
/* { dg-final { scan-assembler-times "vdivnepbf16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
|
||||
/* { dg-final { scan-assembler-times "vsubnepbf16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
|
||||
/* { dg-final { scan-assembler-times "vmulnepbf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */
|
||||
/* { dg-final { scan-assembler-times "vaddnepbf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
|
||||
/* { dg-final { scan-assembler-times "vdivnepbf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
|
||||
/* { dg-final { scan-assembler-times "vsubnepbf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
|
||||
/* { dg-final { scan-assembler-times "vrcppbf16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
|
||||
/* { dg-final { scan-assembler-times "vrcppbf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
|
||||
|
||||
#include <immintrin.h>
|
||||
|
||||
typedef __bf16 v16bf __attribute__ ((__vector_size__ (32)));
|
||||
typedef __bf16 v8bf __attribute__ ((__vector_size__ (16)));
|
||||
|
||||
v16bf
|
||||
foo_mul_256 (v16bf a, v16bf b)
|
||||
{
|
||||
return a * b;
|
||||
}
|
||||
|
||||
v16bf
|
||||
foo_add_256 (v16bf a, v16bf b)
|
||||
{
|
||||
return a + b;
|
||||
}
|
||||
|
||||
v16bf
|
||||
foo_div_256 (v16bf a, v16bf b)
|
||||
{
|
||||
return a / b;
|
||||
}
|
||||
|
||||
v16bf
|
||||
foo_sub_256 (v16bf a, v16bf b)
|
||||
{
|
||||
return a - b;
|
||||
}
|
||||
|
||||
__attribute__((optimize("fast-math")))
|
||||
v16bf
|
||||
foo_div_fast_math_256 (v16bf a, v16bf b)
|
||||
{
|
||||
return a / b;
|
||||
}
|
||||
|
||||
v8bf
|
||||
foo_mul_128 (v8bf a, v8bf b)
|
||||
{
|
||||
return a * b;
|
||||
}
|
||||
|
||||
v8bf
|
||||
foo_add_128 (v8bf a, v8bf b)
|
||||
{
|
||||
return a + b;
|
||||
}
|
||||
|
||||
v8bf
|
||||
foo_div_128 (v8bf a, v8bf b)
|
||||
{
|
||||
return a / b;
|
||||
}
|
||||
|
||||
v8bf
|
||||
foo_sub_128 (v8bf a, v8bf b)
|
||||
{
|
||||
return a - b;
|
||||
}
|
||||
|
||||
__attribute__((optimize("fast-math")))
|
||||
v8bf
|
||||
foo_div_fast_math_128 (v8bf a, v8bf b)
|
||||
{
|
||||
return a / b;
|
||||
}
|
Loading…
Add table
Reference in a new issue