From 2d56d6ba9c2ebceb8865a65c0fd2657773abf76b Mon Sep 17 00:00:00 2001 From: Kyrylo Tkachov Date: Fri, 7 Dec 2018 16:48:39 +0000 Subject: [PATCH] [AArch64][1/2] Implement TARGET_ESTIMATED_POLY_VALUE The hook TARGET_ESTIMATED_POLY_VALUE allows a target to give an estimate for a poly_int run-time value. It is used exclusively in tuning decisions, things like estimated loop iterations, probabilities etc. It is not relied on for correctness. If we know the SVE width implemented in hardware we can make more informed decisions in the implementation of TARGET_ESTIMATED_POLY_VALUE, even when compiling for VLA vectorisation. This patch adds an sve_width field to our tuning structs and sets it for the current CPU tunings. A new value is introduced to the aarch64_sve_vector_bits_enum enum that indicates that SVE is not available: SVE_NOT_IMPLEMENTED. I set it to the same value as SVE_SCALABLE so that parts of the aarch64 backend that follow the pattern: if (vector_width == SVE_SCALABLE) do_vla_friendly_action () else assume_specific_width_for_correctness () continue to work without change, but the CPU tuning structs can use a more appropriate moniker for indicating the absence of SVE. This sets sve_width to SVE_NOT_IMPLEMENTED for all cores. I aim to add an -moverride switch in the next patch that allows a power user to experiment with different values of it for investigations. * config/aarch64/aarch64-opts.h (aarch64_sve_vector_bits_enum): Add SVE_NOT_IMPLEMENTED value. * config/aarch64/aarch64-protos.h (struct tune_params): Add sve_width field. * config/aarch64/aarch64.c (generic_tunings, cortexa35_tunings, cortexa53_tunings, cortexa57_tunings, cortexa72_tunings, cortexa73_tunings, exynosm1_tunings, thunderxt88_tunings, thunderx_tunings, tsv110_tunings, xgene1_tunings, qdf24xx_tunings, saphira_tunings, thunderx2t99_tunings, emag_tunings): Specify sve_width. (aarch64_estimated_poly_value): Define. (TARGET_ESTIMATED_POLY_VALUE): Define. 
From-SVN: r266896 --- gcc/ChangeLog | 15 ++++++++++++ gcc/config/aarch64/aarch64-opts.h | 1 + gcc/config/aarch64/aarch64-protos.h | 4 ++++ gcc/config/aarch64/aarch64.c | 37 +++++++++++++++++++++++++++++ 4 files changed, 57 insertions(+) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 39765211a7a..3f83c29dc36 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,18 @@ +2018-12-07 Kyrylo Tkachov + + * config/aarch64/aarch64-opts.h (aarch64_sve_vector_bits_enum): + Add SVE_NOT_IMPLEMENTED value. + * config/aarch64/aarch64-protos.h (struct tune_params): Add sve_width + field. + * config/aarch64/aarch64.c (generic_tunings,cortexa35_tunings, + cortexa53_tunings, cortexa57_tunings, cortexa72_tunings, + cortexa73_tunings, exynosm1_tunings, thunderx_tunings, + thunderx_tunings, tsv110_tunings, xgene1_tunings, qdf24xx_tunings, + saphira_tunings, thunderx2t99_tunings, emag_tunings): + Specify sve_width. + (aarch64_estimated_poly_value): Define. + (TARGET_ESTIMATED_POLY_VALUE): Define. + 2018-12-07 Paul A. Clarke PR target/88408 diff --git a/gcc/config/aarch64/aarch64-opts.h b/gcc/config/aarch64/aarch64-opts.h index 7a5c6d7664f..1ac056b66da 100644 --- a/gcc/config/aarch64/aarch64-opts.h +++ b/gcc/config/aarch64/aarch64-opts.h @@ -84,6 +84,7 @@ enum aarch64_function_type { /* SVE vector register sizes. */ enum aarch64_sve_vector_bits_enum { SVE_SCALABLE, + SVE_NOT_IMPLEMENTED = SVE_SCALABLE, SVE_128 = 128, SVE_256 = 256, SVE_512 = 512, diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h index 1fe1a50d52a..4ed886bd200 100644 --- a/gcc/config/aarch64/aarch64-protos.h +++ b/gcc/config/aarch64/aarch64-protos.h @@ -252,6 +252,10 @@ struct tune_params const struct cpu_vector_cost *vec_costs; const struct cpu_branch_cost *branch_costs; const struct cpu_approx_modes *approx_modes; + /* Width of the SVE registers or SVE_NOT_IMPLEMENTED if not applicable. + Only used for tuning decisions, does not disable VLA + vectorization. 
*/ + enum aarch64_sve_vector_bits_enum sve_width; int memmov_cost; int issue_rate; unsigned int fusible_ops; diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 490a2038b8e..ba9b5ad4efe 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -681,6 +681,7 @@ static const struct tune_params generic_tunings = &generic_vector_cost, &generic_branch_cost, &generic_approx_modes, + SVE_NOT_IMPLEMENTED, /* sve_width */ 4, /* memmov_cost */ 2, /* issue_rate */ (AARCH64_FUSE_AES_AESMC), /* fusible_ops */ @@ -706,6 +707,7 @@ static const struct tune_params cortexa35_tunings = &generic_vector_cost, &generic_branch_cost, &generic_approx_modes, + SVE_NOT_IMPLEMENTED, /* sve_width */ 4, /* memmov_cost */ 1, /* issue_rate */ (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD @@ -732,6 +734,7 @@ static const struct tune_params cortexa53_tunings = &generic_vector_cost, &generic_branch_cost, &generic_approx_modes, + SVE_NOT_IMPLEMENTED, /* sve_width */ 4, /* memmov_cost */ 2, /* issue_rate */ (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD @@ -758,6 +761,7 @@ static const struct tune_params cortexa57_tunings = &cortexa57_vector_cost, &generic_branch_cost, &generic_approx_modes, + SVE_NOT_IMPLEMENTED, /* sve_width */ 4, /* memmov_cost */ 3, /* issue_rate */ (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD @@ -784,6 +788,7 @@ static const struct tune_params cortexa72_tunings = &cortexa57_vector_cost, &generic_branch_cost, &generic_approx_modes, + SVE_NOT_IMPLEMENTED, /* sve_width */ 4, /* memmov_cost */ 3, /* issue_rate */ (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD @@ -810,6 +815,7 @@ static const struct tune_params cortexa73_tunings = &cortexa57_vector_cost, &generic_branch_cost, &generic_approx_modes, + SVE_NOT_IMPLEMENTED, /* sve_width */ 4, /* memmov_cost. */ 2, /* issue_rate. 
*/ (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD @@ -838,6 +844,7 @@ static const struct tune_params exynosm1_tunings = &exynosm1_vector_cost, &generic_branch_cost, &exynosm1_approx_modes, + SVE_NOT_IMPLEMENTED, /* sve_width */ 4, /* memmov_cost */ 3, /* issue_rate */ (AARCH64_FUSE_AES_AESMC), /* fusible_ops */ @@ -863,6 +870,7 @@ static const struct tune_params thunderxt88_tunings = &thunderx_vector_cost, &generic_branch_cost, &generic_approx_modes, + SVE_NOT_IMPLEMENTED, /* sve_width */ 6, /* memmov_cost */ 2, /* issue_rate */ AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */ @@ -888,6 +896,7 @@ static const struct tune_params thunderx_tunings = &thunderx_vector_cost, &generic_branch_cost, &generic_approx_modes, + SVE_NOT_IMPLEMENTED, /* sve_width */ 6, /* memmov_cost */ 2, /* issue_rate */ AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */ @@ -914,6 +923,7 @@ static const struct tune_params tsv110_tunings = &tsv110_vector_cost, &generic_branch_cost, &generic_approx_modes, + SVE_NOT_IMPLEMENTED, /* sve_width */ 4, /* memmov_cost */ 4, /* issue_rate */ (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH @@ -940,6 +950,7 @@ static const struct tune_params xgene1_tunings = &xgene1_vector_cost, &generic_branch_cost, &xgene1_approx_modes, + SVE_NOT_IMPLEMENTED, /* sve_width */ 6, /* memmov_cost */ 4, /* issue_rate */ AARCH64_FUSE_NOTHING, /* fusible_ops */ @@ -965,6 +976,7 @@ static const struct tune_params emag_tunings = &xgene1_vector_cost, &generic_branch_cost, &xgene1_approx_modes, + SVE_NOT_IMPLEMENTED, 6, /* memmov_cost */ 4, /* issue_rate */ AARCH64_FUSE_NOTHING, /* fusible_ops */ @@ -990,6 +1002,7 @@ static const struct tune_params qdf24xx_tunings = &qdf24xx_vector_cost, &generic_branch_cost, &generic_approx_modes, + SVE_NOT_IMPLEMENTED, /* sve_width */ 4, /* memmov_cost */ 4, /* issue_rate */ (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD @@ -1018,6 +1031,7 @@ static const struct tune_params saphira_tunings = &generic_vector_cost, 
&generic_branch_cost, &generic_approx_modes, + SVE_NOT_IMPLEMENTED, /* sve_width */ 4, /* memmov_cost */ 4, /* issue_rate */ (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD @@ -1044,6 +1058,7 @@ static const struct tune_params thunderx2t99_tunings = &thunderx2t99_vector_cost, &generic_branch_cost, &generic_approx_modes, + SVE_NOT_IMPLEMENTED, /* sve_width */ 4, /* memmov_cost. */ 4, /* issue_rate. */ (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC @@ -17869,6 +17884,25 @@ aarch64_speculation_safe_value (machine_mode mode, return result; } +/* Implement TARGET_ESTIMATED_POLY_VALUE. + Look into the tuning structure for an estimate. + VAL.coeffs[1] is multiplied by the number of VQ chunks over the initial + Advanced SIMD 128 bits. */ + +static HOST_WIDE_INT +aarch64_estimated_poly_value (poly_int64 val) +{ + enum aarch64_sve_vector_bits_enum width_source + = aarch64_tune_params.sve_width; + + /* If we still don't have an estimate, use the default. */ + if (width_source == SVE_SCALABLE) + return default_estimated_poly_value (val); + + HOST_WIDE_INT over_128 = width_source - 128; + return val.coeffs[0] + val.coeffs[1] * over_128 / 128; +} + /* Target-specific selftests. */ #if CHECKING_P @@ -18348,6 +18382,9 @@ aarch64_libgcc_floating_mode_supported_p #undef TARGET_SPECULATION_SAFE_VALUE #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value +#undef TARGET_ESTIMATED_POLY_VALUE +#define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value + #if CHECKING_P #undef TARGET_RUN_TARGET_SELFTESTS #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests