Optimize vlddqu + inserti128 to vbroadcasti128
Compared with vbroadcasti128, vlddqu + vinserti128 uses a shuffle port in addition to the load port. From a latency perspective, vbroadcasti128 is no worse than vlddqu + vinserti128. gcc/ChangeLog: * config/i386/sse.md (*avx2_lddqu_inserti_to_bcasti): New pre_reload define_insn_and_split. gcc/testsuite/ChangeLog: * gcc.target/i386/vlddqu_vinserti128.c: New test.
This commit is contained in:
parent
5b501863ac
commit
1b446a9760
2 changed files with 29 additions and 0 deletions
|
@ -26609,6 +26609,24 @@
|
|||
(set_attr "prefix" "vex,evex,evex")
|
||||
(set_attr "mode" "OI")])
|
||||
|
||||
;; Optimize vlddqu + vinserti128 to vbroadcasti128: the former uses an
|
||||
;; extra shuffle port in addition to the load port, unlike the latter.
|
||||
;; From a latency perspective, vbroadcasti128 is no worse.
|
||||
;; Matches a V4DI vec_concat whose two V2DI halves are both subregs (offset 0)
;; of an UNSPEC_LDDQU of the same V16QI memory operand (operand 1 and a
;; match_dup of it), with a register destination (operand 0).
;; The insn condition requires TARGET_AVX2 and ix86_pre_reload_split (); the
;; output template is "#", so the insn is expected to be split rather than
;; printed.  The split condition "&& 1" adds nothing to the insn condition.
;; The replacement is a plain vec_concat:V4DI of operand 1 with itself; the
;; preparation statement first re-modes operand 1 to V2DImode at offset 0
;; with adjust_address_nv so that it fits that vec_concat.
;; NOTE(review): the ChangeLog entry spells this pattern with a leading '*'
;; ("*avx2_lddqu_inserti_to_bcasti") while the name below has none -- confirm
;; which spelling is intended.
(define_insn_and_split "avx2_lddqu_inserti_to_bcasti"
|
||||
[(set (match_operand:V4DI 0 "register_operand" "=x,v,v")
|
||||
(vec_concat:V4DI
|
||||
(subreg:V2DI
|
||||
(unspec:V16QI [(match_operand:V16QI 1 "memory_operand")]
|
||||
UNSPEC_LDDQU) 0)
|
||||
(subreg:V2DI (unspec:V16QI [(match_dup 1)]
|
||||
UNSPEC_LDDQU) 0)))]
|
||||
"TARGET_AVX2 && ix86_pre_reload_split ()"
|
||||
"#"
|
||||
"&& 1"
|
||||
[(set (match_dup 0)
|
||||
(vec_concat:V4DI (match_dup 1) (match_dup 1)))]
|
||||
"operands[1] = adjust_address_nv (operands[1], V2DImode, 0);")
|
||||
|
||||
;; Modes handled by AVX vec_dup patterns.
|
||||
(define_mode_iterator AVX_VEC_DUP_MODE
|
||||
[V8SI V8SF V4DI V4DF])
|
||||
|
|
11
gcc/testsuite/gcc.target/i386/vlddqu_vinserti128.c
Normal file
11
gcc/testsuite/gcc.target/i386/vlddqu_vinserti128.c
Normal file
|
@ -0,0 +1,11 @@
|
|||
/* { dg-do compile } */
|
||||
/* { dg-options "-mavx2 -O2" } */
|
||||
/* { dg-final { scan-assembler-times "vbroadcasti128" 1 } } */
|
||||
/* { dg-final { scan-assembler-not {(?n)vlddqu.*xmm} } } */
|
||||
|
||||
#include <immintrin.h>
|
||||
/* Testcase body: load a 128-bit value from DATA with _mm_lddqu_si128 and
   pass it to _mm256_broadcastsi128_si256, returning the 256-bit result.
   The dg-final directives in this file expect exactly one vbroadcasti128
   and no vlddqu on an xmm register in the generated assembly.  */
__m256i foo(void *data) {
|
||||
__m128i X1 = _mm_lddqu_si128((__m128i*)data);
|
||||
__m256i V1 = _mm256_broadcastsi128_si256 (X1);
|
||||
return V1;
|
||||
}
|
Loading…
Add table
Reference in a new issue