dse: Remove partial load after full store for high part access[PR71309]
v5 update per review comments: 1. Move the const_rhs handling out of the loop; 2. Start iterating from the smallest integer mode that fits read_mode.

This patch can optimize the following sequence (works for char/short/int/void*):

6: r119:TI=[r118:DI+0x10]
7: [r118:DI]=r119:TI
8: r121:DI=[r118:DI+0x8]

=>

6: r119:TI=[r118:DI+0x10]
16: r122:DI=r119:TI#8

The final ASM will be as below, without the partial load after the full store (stxv+ld):

ld 10,16(3)
mr 9,3
ld 3,24(3)
std 10,0(9)
std 3,8(9)
blr

It can achieve a ~25% performance improvement for typical cases on Power9. Bootstrapped and regression tested on Power9-LE.

For AArch64, one ldr is replaced by a mov with this patch:

ldp x2, x3, [x0, 16]
stp x2, x3, [x0]
ldr x0, [x0, 8]

=>

mov x1, x0
ldp x2, x0, [x0, 16]
stp x2, x0, [x1]

gcc/ChangeLog:

2020-08-04 Xionghu Luo <luoxhu@linux.ibm.com>

PR rtl-optimization/71309
* dse.c (find_shift_sequence): Use subreg of shifted from high part register to avoid loading from address.

gcc/testsuite/ChangeLog:

2020-08-04 Xionghu Luo <luoxhu@linux.ibm.com>

PR rtl-optimization/71309
* gcc.target/powerpc/pr71309.c: New test.
This commit is contained in:
parent
6a1ad710ad
commit
265d817b1e
2 changed files with 83 additions and 29 deletions
78
gcc/dse.c
78
gcc/dse.c
|
@ -1720,6 +1720,35 @@ find_shift_sequence (poly_int64 access_size,
|
|||
scalar_int_mode new_mode;
|
||||
rtx read_reg = NULL;
|
||||
|
||||
/* If a constant was stored into memory, try to simplify it here,
|
||||
otherwise the cost of the shift might preclude this optimization
|
||||
e.g. at -Os, even when no actual shift will be needed. */
|
||||
if (store_info->const_rhs)
|
||||
{
|
||||
auto new_mode = smallest_int_mode_for_size (access_size * BITS_PER_UNIT);
|
||||
auto byte = subreg_lowpart_offset (new_mode, store_mode);
|
||||
rtx ret
|
||||
= simplify_subreg (new_mode, store_info->const_rhs, store_mode, byte);
|
||||
if (ret && CONSTANT_P (ret))
|
||||
{
|
||||
rtx shift_rtx = gen_int_shift_amount (new_mode, shift);
|
||||
ret = simplify_const_binary_operation (LSHIFTRT, new_mode, ret,
|
||||
shift_rtx);
|
||||
if (ret && CONSTANT_P (ret))
|
||||
{
|
||||
byte = subreg_lowpart_offset (read_mode, new_mode);
|
||||
ret = simplify_subreg (read_mode, ret, new_mode, byte);
|
||||
if (ret && CONSTANT_P (ret)
|
||||
&& (set_src_cost (ret, read_mode, speed)
|
||||
<= COSTS_N_INSNS (1)))
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (require_cst)
|
||||
return NULL_RTX;
|
||||
|
||||
/* Some machines like the x86 have shift insns for each size of
|
||||
operand. Other machines like the ppc or the ia-64 may only have
|
||||
shift insns that shift values within 32 or 64 bit registers.
|
||||
|
@ -1729,7 +1758,7 @@ find_shift_sequence (poly_int64 access_size,
|
|||
|
||||
opt_scalar_int_mode new_mode_iter;
|
||||
FOR_EACH_MODE_FROM (new_mode_iter,
|
||||
smallest_int_mode_for_size (access_size * BITS_PER_UNIT))
|
||||
smallest_int_mode_for_size (GET_MODE_BITSIZE (read_mode)))
|
||||
{
|
||||
rtx target, new_reg, new_lhs;
|
||||
rtx_insn *shift_seq, *insn;
|
||||
|
@ -1739,34 +1768,6 @@ find_shift_sequence (poly_int64 access_size,
|
|||
if (GET_MODE_BITSIZE (new_mode) > BITS_PER_WORD)
|
||||
break;
|
||||
|
||||
/* If a constant was stored into memory, try to simplify it here,
|
||||
otherwise the cost of the shift might preclude this optimization
|
||||
e.g. at -Os, even when no actual shift will be needed. */
|
||||
if (store_info->const_rhs)
|
||||
{
|
||||
poly_uint64 byte = subreg_lowpart_offset (new_mode, store_mode);
|
||||
rtx ret = simplify_subreg (new_mode, store_info->const_rhs,
|
||||
store_mode, byte);
|
||||
if (ret && CONSTANT_P (ret))
|
||||
{
|
||||
rtx shift_rtx = gen_int_shift_amount (new_mode, shift);
|
||||
ret = simplify_const_binary_operation (LSHIFTRT, new_mode,
|
||||
ret, shift_rtx);
|
||||
if (ret && CONSTANT_P (ret))
|
||||
{
|
||||
byte = subreg_lowpart_offset (read_mode, new_mode);
|
||||
ret = simplify_subreg (read_mode, ret, new_mode, byte);
|
||||
if (ret && CONSTANT_P (ret)
|
||||
&& (set_src_cost (ret, read_mode, speed)
|
||||
<= COSTS_N_INSNS (1)))
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (require_cst)
|
||||
return NULL_RTX;
|
||||
|
||||
/* Try a wider mode if truncating the store mode to NEW_MODE
|
||||
requires a real instruction. */
|
||||
if (maybe_lt (GET_MODE_SIZE (new_mode), GET_MODE_SIZE (store_mode))
|
||||
|
@ -1779,6 +1780,25 @@ find_shift_sequence (poly_int64 access_size,
|
|||
&& !targetm.modes_tieable_p (new_mode, store_mode))
|
||||
continue;
|
||||
|
||||
if (multiple_p (shift, GET_MODE_BITSIZE (new_mode))
|
||||
&& known_le (GET_MODE_SIZE (new_mode), GET_MODE_SIZE (store_mode)))
|
||||
{
|
||||
/* Try to implement the shift using a subreg. */
|
||||
poly_int64 offset
|
||||
= subreg_offset_from_lsb (new_mode, store_mode, shift);
|
||||
rtx rhs_subreg = simplify_gen_subreg (new_mode, store_info->rhs,
|
||||
store_mode, offset);
|
||||
if (rhs_subreg)
|
||||
{
|
||||
read_reg
|
||||
= extract_low_bits (read_mode, new_mode, copy_rtx (rhs_subreg));
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (maybe_lt (GET_MODE_SIZE (new_mode), access_size))
|
||||
continue;
|
||||
|
||||
new_reg = gen_reg_rtx (new_mode);
|
||||
|
||||
start_sequence ();
|
||||
|
|
34
gcc/testsuite/gcc.target/powerpc/pr71309.c
Normal file
34
gcc/testsuite/gcc.target/powerpc/pr71309.c
Normal file
|
@ -0,0 +1,34 @@
|
|||
/* { dg-do compile } */
|
||||
/* { dg-require-effective-target powerpc_p9vector_ok } */
|
||||
/* { dg-require-effective-target lp64 } */
|
||||
/* { dg-options "-O2 -mdejagnu-cpu=power9" } */
|
||||
|
||||
#define TYPE void*
|
||||
#define TYPE2 void*
|
||||
|
||||
/* Pair of members declared through the TYPE2 and TYPE macros (both
   defined as void* in this file).  */
struct path {
|
||||
TYPE2 mnt;
|
||||
TYPE dentry;	/* Second member; foo returns the value read from it.  */
|
||||
};
|
||||
|
||||
/* Two struct path objects side by side; foo assigns ROOT to PATH as a
   whole and then reads members of PATH.  */
struct nameidata {
|
||||
struct path path;
|
||||
struct path root;
|
||||
};
|
||||
|
||||
/* Copy ND->root into ND->path with a single struct assignment, then
   read both members of the just-written ND->path back.  Returns the
   value of path.dentry; the value read from path.mnt (d2) is not used
   afterwards.  The dg-final directives in this file scan the assembly
   generated for this code.  */
__attribute__ ((noinline))
|
||||
TYPE foo(struct nameidata *nd)
|
||||
{
|
||||
TYPE d;
|
||||
TYPE2 d2;
|
||||
|
||||
nd->path = nd->root;	/* Whole-struct store into nd->path.  */
|
||||
d = nd->path.dentry;	/* Read back the second member of the struct just stored.  */
|
||||
d2 = nd->path.mnt;	/* Read back the first member; result is unused.  */
|
||||
return d;
|
||||
}
|
||||
|
||||
/* { dg-final { scan-assembler-not {\mlxv\M} } } */
|
||||
/* { dg-final { scan-assembler-not {\mstxv\M} } } */
|
||||
/* { dg-final { scan-assembler-times {\mld\M} 2 } } */
|
||||
/* { dg-final { scan-assembler-times {\mstd\M} 2 } } */
|
Loading…
Add table
Reference in a new issue