dse: Remove partial load after full store for high part access [PR71309]

v5 updates per review comments:
1. Move const_rhs out of the loop;
2. Iterate new_mode starting from the size of read_mode.

This patch optimizes the following sequence (it works for char/short/int/void*):

6: r119:TI=[r118:DI+0x10]
7: [r118:DI]=r119:TI
8: r121:DI=[r118:DI+0x8]

=>

6: r119:TI=[r118:DI+0x10]
16: r122:DI=r119:TI#8
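
Insn 16 reads the high 64 bits of r119 directly as a subreg of the
stored register ("#8" is the subreg byte offset), instead of reloading
them through memory.  A minimal standalone sketch of the semantics,
assuming a little-endian target (plain C, not GCC internals):

  #include <assert.h>
  #include <stdint.h>
  #include <string.h>

  int main (void)
  {
    /* The 16-byte value sitting in register r119 (insn 6).  */
    unsigned __int128 r119 = ((unsigned __int128) 0x1122334455667788ULL << 64)
                             | 0x99aabbccddeeff00ULL;
    unsigned char mem[16];
    memcpy (mem, &r119, 16);                 /* insn 7: full 16-byte store.  */

    uint64_t r121;
    memcpy (&r121, mem + 8, 8);              /* insn 8: the partial load DSE removes.  */

    uint64_t r122 = (uint64_t) (r119 >> 64); /* insn 16: subreg of the high part.  */
    assert (r121 == r122);
    return 0;
  }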

The final assembly is as below, with no partial load after the full store (stxv+ld):
  ld 10,16(3)
  mr 9,3
  ld 3,24(3)
  std 10,0(9)
  std 3,8(9)
  blr

This achieves a ~25% performance improvement for typical cases on
Power9.  Bootstrapped and regression tested on Power9-LE.

On AArch64, this patch replaces one ldr with a mov:

ldp     x2, x3, [x0, 16]
stp     x2, x3, [x0]
ldr     x0, [x0, 8]

=>

mov     x1, x0
ldp     x2, x0, [x0, 16]
stp     x2, x0, [x1]
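
Both sequences come from a full structure copy followed by a load of one
member; a reduced form of the new testcase below (a sketch with TYPE and
TYPE2 expanded to void *):

  struct path { void *mnt; void *dentry; };
  struct nameidata { struct path path; struct path root; };

  void *foo (struct nameidata *nd)
  {
    nd->path = nd->root;     /* insns 6/7: full 16-byte copy.  */
    return nd->path.dentry;  /* insn 8: 8-byte load of the high part.  */
  }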

gcc/ChangeLog:

2020-08-04  Xionghu Luo  <luoxhu@linux.ibm.com>

	PR rtl-optimization/71309
	* dse.c (find_shift_sequence): Use a subreg of the high part of
	the stored register to avoid loading from the address.

gcc/testsuite/ChangeLog:

2020-08-04  Xionghu Luo  <luoxhu@linux.ibm.com>

	PR rtl-optimization/71309
	* gcc.target/powerpc/pr71309.c: New test.

--- a/gcc/dse.c
+++ b/gcc/dse.c

@@ -1720,6 +1720,35 @@ find_shift_sequence (poly_int64 access_size,
   scalar_int_mode new_mode;
   rtx read_reg = NULL;
 
+  /* If a constant was stored into memory, try to simplify it here,
+     otherwise the cost of the shift might preclude this optimization
+     e.g. at -Os, even when no actual shift will be needed.  */
+  if (store_info->const_rhs)
+    {
+      auto new_mode = smallest_int_mode_for_size (access_size * BITS_PER_UNIT);
+      auto byte = subreg_lowpart_offset (new_mode, store_mode);
+      rtx ret
+        = simplify_subreg (new_mode, store_info->const_rhs, store_mode, byte);
+      if (ret && CONSTANT_P (ret))
+        {
+          rtx shift_rtx = gen_int_shift_amount (new_mode, shift);
+          ret = simplify_const_binary_operation (LSHIFTRT, new_mode, ret,
+                                                 shift_rtx);
+          if (ret && CONSTANT_P (ret))
+            {
+              byte = subreg_lowpart_offset (read_mode, new_mode);
+              ret = simplify_subreg (read_mode, ret, new_mode, byte);
+              if (ret && CONSTANT_P (ret)
+                  && (set_src_cost (ret, read_mode, speed)
+                      <= COSTS_N_INSNS (1)))
+                return ret;
+            }
+        }
+    }
+
+  if (require_cst)
+    return NULL_RTX;
+
   /* Some machines like the x86 have shift insns for each size of
      operand.  Other machines like the ppc or the ia-64 may only have
      shift insns that shift values within 32 or 64 bit registers.
@@ -1729,7 +1758,7 @@ find_shift_sequence (poly_int64 access_size,
 
   opt_scalar_int_mode new_mode_iter;
   FOR_EACH_MODE_FROM (new_mode_iter,
-                      smallest_int_mode_for_size (access_size * BITS_PER_UNIT))
+                      smallest_int_mode_for_size (GET_MODE_BITSIZE (read_mode)))
     {
       rtx target, new_reg, new_lhs;
       rtx_insn *shift_seq, *insn;
@@ -1739,34 +1768,6 @@ find_shift_sequence (poly_int64 access_size,
       if (GET_MODE_BITSIZE (new_mode) > BITS_PER_WORD)
        break;
 
-      /* If a constant was stored into memory, try to simplify it here,
-         otherwise the cost of the shift might preclude this optimization
-         e.g. at -Os, even when no actual shift will be needed.  */
-      if (store_info->const_rhs)
-        {
-          poly_uint64 byte = subreg_lowpart_offset (new_mode, store_mode);
-          rtx ret = simplify_subreg (new_mode, store_info->const_rhs,
-                                     store_mode, byte);
-          if (ret && CONSTANT_P (ret))
-            {
-              rtx shift_rtx = gen_int_shift_amount (new_mode, shift);
-              ret = simplify_const_binary_operation (LSHIFTRT, new_mode,
-                                                     ret, shift_rtx);
-              if (ret && CONSTANT_P (ret))
-                {
-                  byte = subreg_lowpart_offset (read_mode, new_mode);
-                  ret = simplify_subreg (read_mode, ret, new_mode, byte);
-                  if (ret && CONSTANT_P (ret)
-                      && (set_src_cost (ret, read_mode, speed)
-                          <= COSTS_N_INSNS (1)))
-                    return ret;
-                }
-            }
-        }
-
-      if (require_cst)
-        return NULL_RTX;
-
       /* Try a wider mode if truncating the store mode to NEW_MODE
          requires a real instruction.  */
       if (maybe_lt (GET_MODE_SIZE (new_mode), GET_MODE_SIZE (store_mode))
@@ -1779,6 +1780,25 @@ find_shift_sequence (poly_int64 access_size,
           && !targetm.modes_tieable_p (new_mode, store_mode))
        continue;
 
+      if (multiple_p (shift, GET_MODE_BITSIZE (new_mode))
+          && known_le (GET_MODE_SIZE (new_mode), GET_MODE_SIZE (store_mode)))
+        {
+          /* Try to implement the shift using a subreg.  */
+          poly_int64 offset
+            = subreg_offset_from_lsb (new_mode, store_mode, shift);
+          rtx rhs_subreg = simplify_gen_subreg (new_mode, store_info->rhs,
+                                                store_mode, offset);
+          if (rhs_subreg)
+            {
+              read_reg
+                = extract_low_bits (read_mode, new_mode, copy_rtx (rhs_subreg));
+              break;
+            }
+        }
+
+      if (maybe_lt (GET_MODE_SIZE (new_mode), access_size))
+        continue;
+
       new_reg = gen_reg_rtx (new_mode);
 
       start_sequence ();
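
For the PR71309 sequence above, the new branch fires with store_mode
TImode, read_mode DImode and shift 64: multiple_p (64, 64) holds and
DImode fits within TImode, so subreg_offset_from_lsb yields byte offset
8 on little-endian, simplify_gen_subreg produces
(subreg:DI (reg:TI 119) 8), and extract_low_bits returns it unchanged
(read_mode == new_mode), giving insn 16's r122:DI=r119:TI#8 with no
memory access.  The new maybe_lt check is needed because the iteration
now starts from read_mode's size, which can be smaller than access_size.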

--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr71309.c

@@ -0,0 +1,34 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target powerpc_p9vector_ok } */
+/* { dg-require-effective-target lp64 } */
+/* { dg-options "-O2 -mdejagnu-cpu=power9" } */
+
+#define TYPE void*
+#define TYPE2 void*
+
+struct path {
+  TYPE2 mnt;
+  TYPE dentry;
+};
+
+struct nameidata {
+  struct path path;
+  struct path root;
+};
+
+__attribute__ ((noinline))
+TYPE foo(struct nameidata *nd)
+{
+  TYPE d;
+  TYPE2 d2;
+
+  nd->path = nd->root;
+  d = nd->path.dentry;
+  d2 = nd->path.mnt;
+  return d;
+}
+
+/* { dg-final { scan-assembler-not {\mlxv\M} } } */
+/* { dg-final { scan-assembler-not {\mstxv\M} } } */
+/* { dg-final { scan-assembler-times {\mld\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mstd\M} 2 } } */