aarch64: Add internal tune flag to minimise VL-based scalar ops

This patch introduces an internal tune flag to break up VL-based scalar ops
into GP-reg scalar ops, with the VL read kept separate so that it can be
CSE'd and hoisted out of loops. This can be preferable on some CPUs.

I went for a tune param rather than extending the rtx costs, as our RTX cost
tables aren't set up to track this intricacy.
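
In C terms the transformation is roughly the following (a sketch using the
ACLE svcntw () intrinsic from arm_sve.h, which reads the vector length in
32-bit elements; the snippet is illustrative and not part of the patch):

  /* Before: the VL read is embedded in the add; maps to "incw x4".  */
  i += svcntw ();

  /* After: the VL read is a separate, loop-invariant value:
     "cntw x5" once outside the loop, "add x4, x4, x5" inside it.  */
  uint64_t vl = svcntw ();
  ...
  i += vl;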

I've confirmed that on the simple loop:
void vadd (int *dst, int *op1, int *op2, int count)
{
  for (int i = 0; i < count; ++i)
    dst[i] = op1[i] + op2[i];
}

we now split the incw into a cntw outside the loop and an add inside:

+       cntw    x5
...
loop:
-       incw    x4
+       add     x4, x4, x5

gcc/ChangeLog:

	* config/aarch64/aarch64-tuning-flags.def (cse_sve_vl_constants):
	Define.
	* config/aarch64/aarch64.c (neoversev1_tunings): Enable
	AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS in tune_flags.
	(aarch64_rtx_costs): Use AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
	to increase the cost of CONST_POLY_INT add immediates.
	* config/aarch64/aarch64.md (add<mode>3): Force CONST_POLY_INT
	immediates into a register when the above is enabled.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/sve/cse_sve_vl_constants_1.c: New test.

Author: Kyrylo Tkachov
Date:   2021-02-22 21:24:41 +0000
Commit: a65b9ad863 (parent 692ba083d9)

 4 files changed, 35 insertions(+), 3 deletions(-)

--- a/gcc/config/aarch64/aarch64-tuning-flags.def
+++ b/gcc/config/aarch64/aarch64-tuning-flags.def
@@ -46,4 +46,6 @@ AARCH64_EXTRA_TUNING_OPTION ("no_ldp_stp_qregs", NO_LDP_STP_QREGS)
 
 AARCH64_EXTRA_TUNING_OPTION ("rename_load_regs", RENAME_LOAD_REGS)
 
+AARCH64_EXTRA_TUNING_OPTION ("cse_sve_vl_constants", CSE_SVE_VL_CONSTANTS)
+
 #undef AARCH64_EXTRA_TUNING_OPTION

--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -1492,7 +1492,7 @@ static const struct tune_params neoversev1_tunings =
   2,	/* min_div_recip_mul_df.  */
   0,	/* max_case_values.  */
   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
+  (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS),	/* tune_flags.  */
   &generic_prefetch_tune
 };
 
@@ -12589,8 +12589,18 @@ cost_plus:
 
 	  *cost += rtx_cost (op0, mode, PLUS, 0, speed);
 
 	  if (speed)
-	    /* ADD (immediate).  */
-	    *cost += extra_cost->alu.arith;
+	    {
+	      /* ADD (immediate).  */
+	      *cost += extra_cost->alu.arith;
+
+	      /* Some tunings prefer to not use the VL-based scalar ops.
+		 Increase the cost of the poly immediate to prevent their
+		 formation.  */
+	      if (GET_CODE (op1) == CONST_POLY_INT
+		  && (aarch64_tune_params.extra_tuning_flags
+		      & AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS))
+		*cost += COSTS_N_INSNS (1);
+	    }
 	  return true;
 	}

--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -1933,6 +1933,14 @@
       && (!REG_P (op1)
 	  || !REGNO_PTR_FRAME_P (REGNO (op1))))
     operands[2] = force_reg (<MODE>mode, operands[2]);
+  /* Some tunings prefer to avoid VL-based operations.
+     Split off the poly immediate here.  The rtx costs hook will reject
+     attempts to combine them back.  */
+  else if (GET_CODE (operands[2]) == CONST_POLY_INT
+	   && can_create_pseudo_p ()
+	   && (aarch64_tune_params.extra_tuning_flags
+	       & AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS))
+    operands[2] = force_reg (<MODE>mode, operands[2]);
   /* Expand polynomial additions now if the destination is the stack
      pointer, since we don't want to use that as a temporary.  */
   else if (operands[0] == stack_pointer_rtx

--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cse_sve_vl_constants_1.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -moverride=tune=cse_sve_vl_constants" } */
+
+void __attribute__((noinline, noclone))
+vadd (int *dst, int *op1, int *op2, int count)
+{
+  for (int i = 0; i < count; ++i)
+    dst[i] = op1[i] + op2[i];
+}
+
+/* { dg-final { scan-assembler-not {\tincw\tx[0-9]+} } } */
+
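
For reference, the tuning can also be enabled from the command line through
the -moverride mechanism the test above relies on. An illustrative compile
line (the -march value is an assumption; any SVE-enabled target works):

gcc -O3 -march=armv8.2-a+sve -moverride=tune=cse_sve_vl_constants -S vadd.c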