Improve code generation for x86_64 [PR 92180]
This patch catches a missed optimization opportunity where GCC currently generates worse code than LLVM. The issue, as nicely analyzed in bugzilla, boils down to the following three insns in combine: (insn 6 5 7 2 (parallel [ (set (reg:DI 85) (ashift:DI (reg:DI 85) (const_int 32 [0x20]))) (clobber (reg:CC 17 flags)) ]) "pr92180.c":4:10 564 {*ashldi3_1} (expr_list:REG_UNUSED (reg:CC 17 flags) (nil))) (insn 7 6 14 2 (parallel [ (set (reg:DI 84) (ior:DI (reg:DI 84) (reg:DI 85))) (clobber (reg:CC 17 flags)) ]) "pr92180.c":4:10 454 {*iordi_1} (expr_list:REG_DEAD (reg:DI 85) (expr_list:REG_UNUSED (reg:CC 17 flags) (nil)))) (insn 14 7 15 2 (set (reg/i:SI 0 ax) (subreg:SI (reg:DI 84) 0)) "pr92180.c":5:1 67 {*movsi_internal} (expr_list:REG_DEAD (reg:DI 84) (nil))) Normally, combine/simplify-rtx would notice that insns 6 and 7 (which update highpart bits) are unnecessary as the final insn 14 only requires to lowpart bits. The complication is that insn 14 sets a hard register in targetm.class_likely_spilled_p which prevents combine from performing its simplifications, and removing the redundant instructions. At first glance a fix would appear to require changes to combine, potentially affecting code generation on all small register class targets... An alternate (and I think clever) solution is to spot that this problematic situation can be avoided by the backend. At RTL expansion time, the middle-end has a clear separation between pseudos and hard registers, so the RTL initially contains: (insn 9 8 10 2 (set (reg:SI 86) (subreg:SI (reg:DI 82 [ _1 ]) 0)) "pr92180.c":6:10 -1 (nil)) (insn 10 9 14 2 (set (reg:SI 83 [ <retval> ]) (reg:SI 86)) "pr92180.c":6:10 -1 (nil)) (insn 14 10 15 2 (set (reg/i:SI 0 ax) (reg:SI 83 [ <retval> ])) "pr92180.c":7:1 -1 (nil)) which can be optimized without problems by combine; it is only the intervening passes (initially fwprop1) that propagate computations into sets of hard registers, and disable those opportunities. The solution proposed here is to have the x86 backend/recog prevent early RTL passes composing instructions (that set likely_spilled hard registers) that they (combine) can't simplify, until after reload. We allow sets from pseudo registers, immediate constants and memory accesses, but anything more complicated is performed via a temporary pseudo. Not only does this simplify things for the register allocator, but any remaining register-to-register moves are easily cleaned up by the late optimization passes after reload, such as peephole2 and cprop_hardreg. This patch has been tested on x86_64-pc-linux-gnu with a "make bootstrap" and a "make -k check" with no new failures. Ok for mainline? gcc PR rtl-optimization/92180 * config/i386/i386.c (ix86_hardreg_mov_ok): New function to determine whether (set DST SRC) should be allowed at this point. * config/i386/i386-protos.h (ix86_hardreg_mov_ok): Prototype here. * config/i386/i386-expand.c (ix86_expand_move): Check whether this is a complex set of a likely spilled hard register, and if so place the value in a pseudo, and load the hard reg from it. * config/i386/i386.md (*movdi_internal, *movsi_internal) (*movhi_internal, *movqi_internal): Make these instructions conditional on ix86_hardreg_mov_ok. (*lea<mode>): Make this define_insn_and_split conditional on ix86_hardreg_mov_ok. gcc/testsuite PR rtl-optimization/92180 * gcc.target/i386/pr92180.c: New test.
This commit is contained in:
parent
814e016318
commit
be39636d9f
5 changed files with 48 additions and 5 deletions
|
@ -196,6 +196,17 @@ ix86_expand_move (machine_mode mode, rtx operands[])
|
|||
op0 = operands[0];
|
||||
op1 = operands[1];
|
||||
|
||||
/* Avoid complex sets of likely spilled hard registers before reload. */
|
||||
if (!ix86_hardreg_mov_ok (op0, op1))
|
||||
{
|
||||
tmp = gen_reg_rtx (mode);
|
||||
operands[0] = tmp;
|
||||
ix86_expand_move (mode, operands);
|
||||
operands[0] = op0;
|
||||
operands[1] = tmp;
|
||||
op1 = tmp;
|
||||
}
|
||||
|
||||
switch (GET_CODE (op1))
|
||||
{
|
||||
case CONST:
|
||||
|
|
|
@ -163,6 +163,7 @@ extern rtx ix86_find_base_term (rtx);
|
|||
extern bool ix86_check_movabs (rtx, int);
|
||||
extern bool ix86_check_no_addr_space (rtx);
|
||||
extern void ix86_split_idivmod (machine_mode, rtx[], bool);
|
||||
extern bool ix86_hardreg_mov_ok (rtx, rtx);
|
||||
|
||||
extern rtx assign_386_stack_local (machine_mode, enum ix86_stack_slot);
|
||||
extern int ix86_attr_length_immediate_default (rtx_insn *, bool);
|
||||
|
|
|
@ -18889,6 +18889,22 @@ ix86_class_likely_spilled_p (reg_class_t rclass)
|
|||
return false;
|
||||
}
|
||||
|
||||
/* Return true if a set of DST by the expression SRC should be allowed.
|
||||
This prevents complex sets of likely_spilled hard regs before reload. */
|
||||
|
||||
bool
|
||||
ix86_hardreg_mov_ok (rtx dst, rtx src)
|
||||
{
|
||||
/* Avoid complex sets of likely_spilled hard registers before reload. */
|
||||
if (REG_P (dst) && HARD_REGISTER_P (dst)
|
||||
&& !REG_P (src) && !MEM_P (src)
|
||||
&& !x86_64_immediate_operand (src, GET_MODE (dst))
|
||||
&& ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (dst)))
|
||||
&& !reload_completed)
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
/* If we are copying between registers from different register sets
|
||||
(e.g. FP and integer), we may need a memory location.
|
||||
|
||||
|
|
|
@ -2089,7 +2089,8 @@
|
|||
"=r ,o ,r,r ,r,m ,*y,*y,?*y,?m,?r,?*y,*v,*v,*v,m ,m,?r ,?*Yd,?r,?*v,?*y,?*x,*k,*k ,*r,*m,*k")
|
||||
(match_operand:DI 1 "general_operand"
|
||||
"riFo,riF,Z,rem,i,re,C ,*y,m ,*y,*y,r ,C ,*v,m ,*v,v,*Yd,r ,*v,r ,*x ,*y ,*r,*km,*k,*k,CBC"))]
|
||||
"!(MEM_P (operands[0]) && MEM_P (operands[1]))"
|
||||
"!(MEM_P (operands[0]) && MEM_P (operands[1]))
|
||||
&& ix86_hardreg_mov_ok (operands[0], operands[1])"
|
||||
{
|
||||
switch (get_attr_type (insn))
|
||||
{
|
||||
|
@ -2309,7 +2310,8 @@
|
|||
"=r,m ,*y,*y,?*y,?m,?r,?*y,*v,*v,*v,m ,?r,?*v,*k,*k ,*rm,*k")
|
||||
(match_operand:SI 1 "general_operand"
|
||||
"g ,re,C ,*y,m ,*y,*y,r ,C ,*v,m ,*v,*v,r ,*r,*km,*k ,CBC"))]
|
||||
"!(MEM_P (operands[0]) && MEM_P (operands[1]))"
|
||||
"!(MEM_P (operands[0]) && MEM_P (operands[1]))
|
||||
&& ix86_hardreg_mov_ok (operands[0], operands[1])"
|
||||
{
|
||||
switch (get_attr_type (insn))
|
||||
{
|
||||
|
@ -2417,7 +2419,9 @@
|
|||
(define_insn "*movhi_internal"
|
||||
[(set (match_operand:HI 0 "nonimmediate_operand" "=r,r ,r ,m ,*k,*k ,*r,*m,*k")
|
||||
(match_operand:HI 1 "general_operand" "r ,rn,rm,rn,*r,*km,*k,*k,CBC"))]
|
||||
"!(MEM_P (operands[0]) && MEM_P (operands[1]))"
|
||||
"!(MEM_P (operands[0]) && MEM_P (operands[1]))
|
||||
&& ix86_hardreg_mov_ok (operands[0], operands[1])"
|
||||
|
||||
{
|
||||
switch (get_attr_type (insn))
|
||||
{
|
||||
|
@ -2506,7 +2510,9 @@
|
|||
"=Q,R,r,q,q,r,r ,?r,m ,*k,*k,*r,*m,*k,*k,*k")
|
||||
(match_operand:QI 1 "general_operand"
|
||||
"Q ,R,r,n,m,q,rn, m,qn,*r,*k,*k,*k,*m,C,BC"))]
|
||||
"!(MEM_P (operands[0]) && MEM_P (operands[1]))"
|
||||
"!(MEM_P (operands[0]) && MEM_P (operands[1]))
|
||||
&& ix86_hardreg_mov_ok (operands[0], operands[1])"
|
||||
|
||||
{
|
||||
char buf[128];
|
||||
const char *ops;
|
||||
|
@ -5174,7 +5180,7 @@
|
|||
(define_insn_and_split "*lea<mode>"
|
||||
[(set (match_operand:SWI48 0 "register_operand" "=r")
|
||||
(match_operand:SWI48 1 "address_no_seg_operand" "Ts"))]
|
||||
""
|
||||
"ix86_hardreg_mov_ok (operands[0], operands[1])"
|
||||
{
|
||||
if (SImode_address_operand (operands[1], VOIDmode))
|
||||
{
|
||||
|
|
9
gcc/testsuite/gcc.target/i386/pr92180.c
Normal file
9
gcc/testsuite/gcc.target/i386/pr92180.c
Normal file
|
@ -0,0 +1,9 @@
|
|||
/* PR rtl-optimization/92180 */
|
||||
/* { dg-do compile } */
|
||||
/* { dg-options "-O2" } */
|
||||
|
||||
unsigned int foo() {
|
||||
return __builtin_ia32_rdtsc();
|
||||
}
|
||||
|
||||
/* { dg-final { scan-assembler-not "sal" } } */
|
Loading…
Add table
Reference in a new issue