Improve code generation for x86_64 [PR 92180]

This patch catches a missed optimization opportunity where GCC currently
generates worse code than LLVM.  The issue, as nicely analyzed in bugzilla,
boils down to the following three insns in combine:

(insn 6 5 7 2 (parallel [
            (set (reg:DI 85)
                (ashift:DI (reg:DI 85)
                    (const_int 32 [0x20])))
            (clobber (reg:CC 17 flags))
        ]) "pr92180.c":4:10 564 {*ashldi3_1}
     (expr_list:REG_UNUSED (reg:CC 17 flags)
        (nil)))
(insn 7 6 14 2 (parallel [
            (set (reg:DI 84)
                (ior:DI (reg:DI 84)
                    (reg:DI 85)))
            (clobber (reg:CC 17 flags))
        ]) "pr92180.c":4:10 454 {*iordi_1}
     (expr_list:REG_DEAD (reg:DI 85)
        (expr_list:REG_UNUSED (reg:CC 17 flags)
            (nil))))
(insn 14 7 15 2 (set (reg/i:SI 0 ax)
        (subreg:SI (reg:DI 84) 0)) "pr92180.c":5:1 67 {*movsi_internal}
     (expr_list:REG_DEAD (reg:DI 84)
        (nil)))

Normally, combine/simplify-rtx would notice that insns 6 and 7
(which update highpart bits) are unnecessary as the final insn 14
only requires the lowpart bits.  The complication is that insn 14
sets a hard register in targetm.class_likely_spilled_p which
prevents combine from performing its simplifications, and removing
the redundant instructions.

At first glance a fix would appear to require changes to combine,
potentially affecting code generation on all small register class
targets...  An alternate (and I think clever) solution is to spot
that this problematic situation can be avoided by the backend.

At RTL expansion time, the middle-end has a clear separation between
pseudos and hard registers, so the RTL initially contains:

(insn 9 8 10 2 (set (reg:SI 86)
        (subreg:SI (reg:DI 82 [ _1 ]) 0)) "pr92180.c":6:10 -1
     (nil))
(insn 10 9 14 2 (set (reg:SI 83 [ <retval> ])
        (reg:SI 86)) "pr92180.c":6:10 -1
     (nil))
(insn 14 10 15 2 (set (reg/i:SI 0 ax)
        (reg:SI 83 [ <retval> ])) "pr92180.c":7:1 -1
     (nil))

which can be optimized without problems by combine; it is only the
intervening passes (initially fwprop1) that propagate computations
into sets of hard registers, and disable those opportunities.

The solution proposed here is to have the x86 backend/recog prevent
early RTL passes composing instructions (that set likely_spilled hard
registers) that they (combine) can't simplify, until after reload.
We allow sets from pseudo registers, immediate constants and memory
accesses, but anything more complicated is performed via a temporary
pseudo.  Not only does this simplify things for the register allocator,
but any remaining register-to-register moves are easily cleaned up
by the late optimization passes after reload, such as peephole2 and
cprop_hardreg.

This patch has been tested on x86_64-pc-linux-gnu with a
"make bootstrap" and a "make -k check" with no new failures.
Ok for mainline?

gcc
	PR rtl-optimization/92180
	* config/i386/i386.c (ix86_hardreg_mov_ok): New function to
	determine whether (set DST SRC) should be allowed at this point.
	* config/i386/i386-protos.h (ix86_hardreg_mov_ok): Prototype here.
	* config/i386/i386-expand.c (ix86_expand_move): Check whether
	this is a complex set of a likely spilled hard register, and if
	so place the value in a pseudo, and load the hard reg from it.
	* config/i386/i386.md (*movdi_internal, *movsi_internal)
	(*movhi_internal, *movqi_internal): Make these instructions
	conditional on ix86_hardreg_mov_ok.
	(*lea<mode>): Make this define_insn_and_split conditional on
	ix86_hardreg_mov_ok.

gcc/testsuite
	PR rtl-optimization/92180
	* gcc.target/i386/pr92180.c: New test.
This commit is contained in:
Roger Sayle 2020-11-16 16:55:29 -07:00 committed by Jeff Law
parent 814e016318
commit be39636d9f
5 changed files with 48 additions and 5 deletions

View file

@ -196,6 +196,17 @@ ix86_expand_move (machine_mode mode, rtx operands[])
op0 = operands[0];
op1 = operands[1];
/* Avoid complex sets of likely spilled hard registers before reload. */
if (!ix86_hardreg_mov_ok (op0, op1))
{
tmp = gen_reg_rtx (mode);
operands[0] = tmp;
ix86_expand_move (mode, operands);
operands[0] = op0;
operands[1] = tmp;
op1 = tmp;
}
switch (GET_CODE (op1))
{
case CONST:

View file

@ -163,6 +163,7 @@ extern rtx ix86_find_base_term (rtx);
extern bool ix86_check_movabs (rtx, int);
extern bool ix86_check_no_addr_space (rtx);
extern void ix86_split_idivmod (machine_mode, rtx[], bool);
extern bool ix86_hardreg_mov_ok (rtx, rtx);
extern rtx assign_386_stack_local (machine_mode, enum ix86_stack_slot);
extern int ix86_attr_length_immediate_default (rtx_insn *, bool);

View file

@ -18889,6 +18889,22 @@ ix86_class_likely_spilled_p (reg_class_t rclass)
return false;
}
/* Return true if a set of DST by the expression SRC should be allowed.
   This prevents complex sets of likely_spilled hard regs before reload.
   DST is the destination rtx, SRC the value being assigned.  Returns
   false only for the problematic case: a pre-reload set of a hard
   register in a likely-spilled class from anything other than a
   register, a memory reference, or a valid immediate.  */
bool
ix86_hardreg_mov_ok (rtx dst, rtx src)
{
/* After reload the register allocator has already done its job;
   any remaining hard-register sets are fine.  */
if (reload_completed)
return true;
/* Only pre-reload sets of likely-spilled HARD registers are restricted.  */
if (!REG_P (dst) || !HARD_REGISTER_P (dst))
return true;
/* Simple sources (register, memory, immediate) are always acceptable;
   combine and friends can handle these without trouble.  */
if (REG_P (src) || MEM_P (src))
return true;
if (x86_64_immediate_operand (src, GET_MODE (dst)))
return true;
/* Anything more complex is disallowed when the destination's class
   is likely to be spilled.  */
return !ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (dst)));
}
/* If we are copying between registers from different register sets
(e.g. FP and integer), we may need a memory location.

View file

@ -2089,7 +2089,8 @@
"=r ,o ,r,r ,r,m ,*y,*y,?*y,?m,?r,?*y,*v,*v,*v,m ,m,?r ,?*Yd,?r,?*v,?*y,?*x,*k,*k ,*r,*m,*k")
(match_operand:DI 1 "general_operand"
"riFo,riF,Z,rem,i,re,C ,*y,m ,*y,*y,r ,C ,*v,m ,*v,v,*Yd,r ,*v,r ,*x ,*y ,*r,*km,*k,*k,CBC"))]
"!(MEM_P (operands[0]) && MEM_P (operands[1]))"
"!(MEM_P (operands[0]) && MEM_P (operands[1]))
&& ix86_hardreg_mov_ok (operands[0], operands[1])"
{
switch (get_attr_type (insn))
{
@ -2309,7 +2310,8 @@
"=r,m ,*y,*y,?*y,?m,?r,?*y,*v,*v,*v,m ,?r,?*v,*k,*k ,*rm,*k")
(match_operand:SI 1 "general_operand"
"g ,re,C ,*y,m ,*y,*y,r ,C ,*v,m ,*v,*v,r ,*r,*km,*k ,CBC"))]
"!(MEM_P (operands[0]) && MEM_P (operands[1]))"
"!(MEM_P (operands[0]) && MEM_P (operands[1]))
&& ix86_hardreg_mov_ok (operands[0], operands[1])"
{
switch (get_attr_type (insn))
{
@ -2417,7 +2419,9 @@
(define_insn "*movhi_internal"
[(set (match_operand:HI 0 "nonimmediate_operand" "=r,r ,r ,m ,*k,*k ,*r,*m,*k")
(match_operand:HI 1 "general_operand" "r ,rn,rm,rn,*r,*km,*k,*k,CBC"))]
"!(MEM_P (operands[0]) && MEM_P (operands[1]))"
"!(MEM_P (operands[0]) && MEM_P (operands[1]))
&& ix86_hardreg_mov_ok (operands[0], operands[1])"
{
switch (get_attr_type (insn))
{
@ -2506,7 +2510,9 @@
"=Q,R,r,q,q,r,r ,?r,m ,*k,*k,*r,*m,*k,*k,*k")
(match_operand:QI 1 "general_operand"
"Q ,R,r,n,m,q,rn, m,qn,*r,*k,*k,*k,*m,C,BC"))]
"!(MEM_P (operands[0]) && MEM_P (operands[1]))"
"!(MEM_P (operands[0]) && MEM_P (operands[1]))
&& ix86_hardreg_mov_ok (operands[0], operands[1])"
{
char buf[128];
const char *ops;
@ -5174,7 +5180,7 @@
(define_insn_and_split "*lea<mode>"
[(set (match_operand:SWI48 0 "register_operand" "=r")
(match_operand:SWI48 1 "address_no_seg_operand" "Ts"))]
""
"ix86_hardreg_mov_ok (operands[0], operands[1])"
{
if (SImode_address_operand (operands[1], VOIDmode))
{

View file

@ -0,0 +1,9 @@
/* PR rtl-optimization/92180 */
/* { dg-do compile } */
/* { dg-options "-O2" } */
unsigned int foo() {
/* rdtsc produces a 64-bit timestamp, but only the low 32 bits are
   returned here; the shift/or composing the high half must be
   optimized away, so no "sal" should appear (see dg-final below).  */
return __builtin_ia32_rdtsc();
}
/* { dg-final { scan-assembler-not "sal" } } */