AArch64: Add fma_reassoc_width [PR107413]

Add a reassociation width for FMA in per-CPU tuning structures. Keep
the existing setting of 1 for cores with 2 FMA pipes (this disables
reassociation), and use 4 for cores with 4 FMA pipes.  This improves
SPECFP2017 on Neoverse V1 by ~1.5%.

gcc/
	PR tree-optimization/107413
	* config/aarch64/aarch64.cc (struct tune_params): Add
	fma_reassoc_width to all CPU tuning structures.
	(aarch64_reassociation_width): Use fma_reassoc_width.
	* config/aarch64/aarch64-protos.h (struct tune_params): Add
	fma_reassoc_width.
This commit is contained in:
Wilco Dijkstra 2022-11-23 17:27:19 +00:00
parent 138ee8f745
commit 0c1b0a23f1
2 changed files with 34 additions and 3 deletions

View file

@ -540,6 +540,7 @@ struct tune_params
const char *loop_align;
int int_reassoc_width;
int fp_reassoc_width;
int fma_reassoc_width;
int vec_reassoc_width;
int min_div_recip_mul_sf;
int min_div_recip_mul_df;

View file

@ -1346,6 +1346,7 @@ static const struct tune_params generic_tunings =
"8", /* loop_align. */
2, /* int_reassoc_width. */
4, /* fp_reassoc_width. */
1, /* fma_reassoc_width. */
1, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
@ -1382,6 +1383,7 @@ static const struct tune_params cortexa35_tunings =
"8", /* loop_align. */
2, /* int_reassoc_width. */
4, /* fp_reassoc_width. */
1, /* fma_reassoc_width. */
1, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
@ -1415,6 +1417,7 @@ static const struct tune_params cortexa53_tunings =
"8", /* loop_align. */
2, /* int_reassoc_width. */
4, /* fp_reassoc_width. */
1, /* fma_reassoc_width. */
1, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
@ -1448,6 +1451,7 @@ static const struct tune_params cortexa57_tunings =
"8", /* loop_align. */
2, /* int_reassoc_width. */
4, /* fp_reassoc_width. */
1, /* fma_reassoc_width. */
1, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
@ -1481,6 +1485,7 @@ static const struct tune_params cortexa72_tunings =
"8", /* loop_align. */
2, /* int_reassoc_width. */
4, /* fp_reassoc_width. */
1, /* fma_reassoc_width. */
1, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
@ -1514,6 +1519,7 @@ static const struct tune_params cortexa73_tunings =
"8", /* loop_align. */
2, /* int_reassoc_width. */
4, /* fp_reassoc_width. */
1, /* fma_reassoc_width. */
1, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
@ -1548,6 +1554,7 @@ static const struct tune_params exynosm1_tunings =
"4", /* loop_align. */
2, /* int_reassoc_width. */
4, /* fp_reassoc_width. */
1, /* fma_reassoc_width. */
1, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
@ -1580,6 +1587,7 @@ static const struct tune_params thunderxt88_tunings =
"8", /* loop_align. */
2, /* int_reassoc_width. */
4, /* fp_reassoc_width. */
1, /* fma_reassoc_width. */
1, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
@ -1612,6 +1620,7 @@ static const struct tune_params thunderx_tunings =
"8", /* loop_align. */
2, /* int_reassoc_width. */
4, /* fp_reassoc_width. */
1, /* fma_reassoc_width. */
1, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
@ -1646,6 +1655,7 @@ static const struct tune_params tsv110_tunings =
"8", /* loop_align. */
2, /* int_reassoc_width. */
4, /* fp_reassoc_width. */
1, /* fma_reassoc_width. */
1, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
@ -1678,6 +1688,7 @@ static const struct tune_params xgene1_tunings =
"16", /* loop_align. */
2, /* int_reassoc_width. */
4, /* fp_reassoc_width. */
1, /* fma_reassoc_width. */
1, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
@ -1710,6 +1721,7 @@ static const struct tune_params emag_tunings =
"16", /* loop_align. */
2, /* int_reassoc_width. */
4, /* fp_reassoc_width. */
1, /* fma_reassoc_width. */
1, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
@ -1743,6 +1755,7 @@ static const struct tune_params qdf24xx_tunings =
"16", /* loop_align. */
2, /* int_reassoc_width. */
4, /* fp_reassoc_width. */
1, /* fma_reassoc_width. */
1, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
@ -1778,6 +1791,7 @@ static const struct tune_params saphira_tunings =
"16", /* loop_align. */
2, /* int_reassoc_width. */
4, /* fp_reassoc_width. */
1, /* fma_reassoc_width. */
1, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
@ -1811,6 +1825,7 @@ static const struct tune_params thunderx2t99_tunings =
"16", /* loop_align. */
3, /* int_reassoc_width. */
2, /* fp_reassoc_width. */
1, /* fma_reassoc_width. */
2, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
@ -1844,6 +1859,7 @@ static const struct tune_params thunderx3t110_tunings =
"16", /* loop_align. */
3, /* int_reassoc_width. */
2, /* fp_reassoc_width. */
1, /* fma_reassoc_width. */
2, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
@ -1876,6 +1892,7 @@ static const struct tune_params neoversen1_tunings =
"32:16", /* loop_align. */
2, /* int_reassoc_width. */
4, /* fp_reassoc_width. */
1, /* fma_reassoc_width. */
2, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
@ -1912,6 +1929,7 @@ static const struct tune_params ampere1_tunings =
"32:16", /* loop_align. */
2, /* int_reassoc_width. */
4, /* fp_reassoc_width. */
1, /* fma_reassoc_width. */
2, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
@ -1949,6 +1967,7 @@ static const struct tune_params ampere1a_tunings =
"32:16", /* loop_align. */
2, /* int_reassoc_width. */
4, /* fp_reassoc_width. */
1, /* fma_reassoc_width. */
2, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
@ -2126,6 +2145,7 @@ static const struct tune_params neoversev1_tunings =
"32:16", /* loop_align. */
2, /* int_reassoc_width. */
4, /* fp_reassoc_width. */
4, /* fma_reassoc_width. */
2, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
@ -2263,6 +2283,7 @@ static const struct tune_params neoverse512tvb_tunings =
"32:16", /* loop_align. */
2, /* int_reassoc_width. */
4, /* fp_reassoc_width. */
4, /* fma_reassoc_width. */
2, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
@ -2451,6 +2472,7 @@ static const struct tune_params neoversen2_tunings =
"32:16", /* loop_align. */
2, /* int_reassoc_width. */
4, /* fp_reassoc_width. */
1, /* fma_reassoc_width. */
2, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
@ -2640,6 +2662,7 @@ static const struct tune_params neoversev2_tunings =
"32:16", /* loop_align. */
3, /* int_reassoc_width. */
6, /* fp_reassoc_width. */
4, /* fma_reassoc_width. */
3, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
@ -2675,6 +2698,7 @@ static const struct tune_params a64fx_tunings =
"32", /* loop_align. */
4, /* int_reassoc_width. */
2, /* fp_reassoc_width. */
1, /* fma_reassoc_width. */
2, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
@ -3387,9 +3411,15 @@ aarch64_reassociation_width (unsigned opc, machine_mode mode)
return aarch64_tune_params.vec_reassoc_width;
if (INTEGRAL_MODE_P (mode))
return aarch64_tune_params.int_reassoc_width;
/* Avoid reassociating floating point addition so we emit more FMAs. */
if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
return aarch64_tune_params.fp_reassoc_width;
/* Reassociation reduces the number of FMAs which may result in worse
performance. Use a per-CPU setting for FMA reassociation which allows
narrow CPUs with few FP pipes to switch it off (value of 1), and wider
CPUs with many FP pipes to enable reassociation.
Since the reassociation pass doesn't understand FMA at all, assume
that any FP addition might turn into FMA. */
if (FLOAT_MODE_P (mode))
return opc == PLUS_EXPR ? aarch64_tune_params.fma_reassoc_width
: aarch64_tune_params.fp_reassoc_width;
return 1;
}