aarch64: Add support for Ampere-1A (-mcpu=ampere1a) CPU

This patch adds support for the Ampere-1A CPU:
 - recognizes the name of the core and provides detection for -mcpu=native,
 - updates the extra_costs,
 - adds a new fusion pair for A+B+1 and A-B-1.

The timing differences between Ampere-1A and Ampere-1 are larger than
the extra costs indicate, but they don't propagate through to the
headline items in our extra costs (e.g. the change in latency for
scalar sqrt doesn't have a corresponding table entry).

gcc/ChangeLog:

	* config/aarch64/aarch64-cores.def (AARCH64_CORE): Add ampere1a.
	* config/aarch64/aarch64-cost-tables.h: Add ampere1a_extra_costs.
	* config/aarch64/aarch64-fusion-pairs.def (AARCH64_FUSION_PAIR):
	Define a new fusion pair for A+B+1/A-B-1 (i.e., add/subtract two
	registers and then +1/-1).
	* config/aarch64/aarch64-tune.md: Regenerate.
	* config/aarch64/aarch64.cc (aarch_macro_fusion_pair_p): Implement
	idiom-matcher for the new fusion pair.
	* doc/invoke.texi: Add ampere1a.
This commit is contained in:
Philipp Tomsich 2022-11-07 14:22:21 +01:00
parent 5ba25973e2
commit 590a06afbf
6 changed files with 175 additions and 2 deletions

View file

@ -70,6 +70,7 @@ AARCH64_CORE("thunderxt83", thunderxt83, thunderx, V8A, (CRC, CRYPTO), thu
/* Ampere Computing ('\xC0') cores. */
AARCH64_CORE("ampere1", ampere1, cortexa57, V8_6A, (F16, RNG, AES, SHA3), ampere1, 0xC0, 0xac3, -1)
AARCH64_CORE("ampere1a", ampere1a, cortexa57, V8_6A, (F16, RNG, AES, SHA3, MEMTAG), ampere1a, 0xC0, 0xac4, -1)
/* Do not swap around "emag" and "xgene1",
this order is required to handle variant correctly. */
AARCH64_CORE("emag", emag, xgene1, V8A, (CRC, CRYPTO), emag, 0x50, 0x000, 3)

View file

@ -775,4 +775,111 @@ const struct cpu_cost_table ampere1_extra_costs =
}
};
/* Extra instruction cost table for the Ampere-1A core; it is referenced
   by ampere1a_tunings.  The initializer is positional: each value is
   annotated with the field it fills, grouped into ALU, MULT (SImode and
   DImode), LD/ST, FP (SFmode and DFmode) and Vector sections.  */
const struct cpu_cost_table ampere1a_extra_costs =
{
/* ALU */
{
0, /* arith. */
0, /* logical. */
0, /* shift. */
COSTS_N_INSNS (1), /* shift_reg. */
0, /* arith_shift. */
COSTS_N_INSNS (1), /* arith_shift_reg. */
0, /* log_shift. */
COSTS_N_INSNS (1), /* log_shift_reg. */
0, /* extend. */
COSTS_N_INSNS (1), /* extend_arith. */
0, /* bfi. */
0, /* bfx. */
0, /* clz. */
0, /* rev. */
0, /* non_exec. */
true /* non_exec_costs_exec. */
},
{
/* MULT SImode */
{
COSTS_N_INSNS (3), /* simple. */
COSTS_N_INSNS (3), /* flag_setting. */
COSTS_N_INSNS (3), /* extend. */
COSTS_N_INSNS (4), /* add. */
COSTS_N_INSNS (4), /* extend_add. */
COSTS_N_INSNS (19) /* idiv. */
},
/* MULT DImode */
{
COSTS_N_INSNS (3), /* simple. */
0, /* flag_setting (N/A). */
COSTS_N_INSNS (3), /* extend. */
COSTS_N_INSNS (4), /* add. */
COSTS_N_INSNS (4), /* extend_add. */
COSTS_N_INSNS (35) /* idiv. */
}
},
/* LD/ST */
{
COSTS_N_INSNS (4), /* load. */
COSTS_N_INSNS (4), /* load_sign_extend. */
0, /* ldrd (n/a). */
0, /* ldm_1st. */
0, /* ldm_regs_per_insn_1st. */
0, /* ldm_regs_per_insn_subsequent. */
COSTS_N_INSNS (5), /* loadf. */
COSTS_N_INSNS (5), /* loadd. */
COSTS_N_INSNS (5), /* load_unaligned. */
0, /* store. */
0, /* strd. */
0, /* stm_1st. */
0, /* stm_regs_per_insn_1st. */
0, /* stm_regs_per_insn_subsequent. */
COSTS_N_INSNS (2), /* storef. */
COSTS_N_INSNS (2), /* stored. */
COSTS_N_INSNS (2), /* store_unaligned. */
COSTS_N_INSNS (3), /* loadv. */
COSTS_N_INSNS (3) /* storev. */
},
{
/* FP SFmode */
{
COSTS_N_INSNS (25), /* div. */
COSTS_N_INSNS (4), /* mult. */
COSTS_N_INSNS (4), /* mult_addsub. */
COSTS_N_INSNS (4), /* fma. */
COSTS_N_INSNS (4), /* addsub. */
COSTS_N_INSNS (2), /* fpconst. */
COSTS_N_INSNS (4), /* neg. */
COSTS_N_INSNS (4), /* compare. */
COSTS_N_INSNS (4), /* widen. */
COSTS_N_INSNS (4), /* narrow. */
COSTS_N_INSNS (4), /* toint. */
COSTS_N_INSNS (4), /* fromint. */
COSTS_N_INSNS (4) /* roundint. */
},
/* FP DFmode */
{
COSTS_N_INSNS (34), /* div. */
COSTS_N_INSNS (5), /* mult. */
COSTS_N_INSNS (5), /* mult_addsub. */
COSTS_N_INSNS (5), /* fma. */
COSTS_N_INSNS (5), /* addsub. */
COSTS_N_INSNS (2), /* fpconst. */
COSTS_N_INSNS (5), /* neg. */
COSTS_N_INSNS (5), /* compare. */
COSTS_N_INSNS (5), /* widen. */
COSTS_N_INSNS (5), /* narrow. */
COSTS_N_INSNS (6), /* toint. */
COSTS_N_INSNS (6), /* fromint. */
COSTS_N_INSNS (5) /* roundint. */
}
},
/* Vector */
{
COSTS_N_INSNS (3), /* alu. */
COSTS_N_INSNS (3), /* mult. */
COSTS_N_INSNS (2), /* movi. */
COSTS_N_INSNS (2), /* dup. */
COSTS_N_INSNS (2) /* extract. */
}
};
#endif

View file

@ -36,5 +36,6 @@ AARCH64_FUSION_PAIR ("cmp+branch", CMP_BRANCH)
AARCH64_FUSION_PAIR ("aes+aesmc", AES_AESMC)
AARCH64_FUSION_PAIR ("alu+branch", ALU_BRANCH)
AARCH64_FUSION_PAIR ("alu+cbz", ALU_CBZ)
AARCH64_FUSION_PAIR ("addsub_2reg_const1", ADDSUB_2REG_CONST1)
#undef AARCH64_FUSION_PAIR

View file

@ -1,5 +1,5 @@
;; -*- buffer-read-only: t -*-
;; Generated automatically by gentune.sh from aarch64-cores.def
(define_attr "tune"
"cortexa34,cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,octeontx,octeontxt81,octeontxt83,thunderxt81,thunderxt83,ampere1,emag,xgene1,falkor,qdf24xx,exynosm1,phecda,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,cortexa76,cortexa76ae,cortexa77,cortexa78,cortexa78ae,cortexa78c,cortexa65,cortexa65ae,cortexx1,cortexx1c,ares,neoversen1,neoversee1,octeontx2,octeontx2t98,octeontx2t96,octeontx2t93,octeontx2f95,octeontx2f95n,octeontx2f95mm,a64fx,tsv110,thunderx3t110,zeus,neoversev1,neoverse512tvb,saphira,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55,cortexa76cortexa55,cortexr82,cortexa510,cortexa710,cortexa715,cortexx2,neoversen2,demeter,neoversev2"
"cortexa34,cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,octeontx,octeontxt81,octeontxt83,thunderxt81,thunderxt83,ampere1,ampere1a,emag,xgene1,falkor,qdf24xx,exynosm1,phecda,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,cortexa76,cortexa76ae,cortexa77,cortexa78,cortexa78ae,cortexa78c,cortexa65,cortexa65ae,cortexx1,cortexx1c,ares,neoversen1,neoversee1,octeontx2,octeontx2t98,octeontx2t96,octeontx2t93,octeontx2f95,octeontx2f95n,octeontx2f95mm,a64fx,tsv110,thunderx3t110,zeus,neoversev1,neoverse512tvb,saphira,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55,cortexa76cortexa55,cortexr82,cortexa510,cortexa710,cortexa715,cortexx2,neoversen2,demeter,neoversev2"
(const (symbol_ref "((enum attr_tune) aarch64_tune)")))

View file

@ -1921,6 +1921,43 @@ static const struct tune_params ampere1_tunings =
&ampere1_prefetch_tune
};
/* Tuning parameters for the Ampere-1A core.  Uses the dedicated
   ampere1a_extra_costs table, reuses the Ampere-1 vector cost and
   prefetch tables (ampere1_vector_cost, ampere1_prefetch_tune) and the
   generic address, register-move, branch and approximation tables, and
   enables the ADDSUB_2REG_CONST1 (A+B+1 / A-B-1) fusion pair among its
   fusible ops.  */
static const struct tune_params ampere1a_tunings =
{
&ampere1a_extra_costs,
&generic_addrcost_table,
&generic_regmove_cost,
&ampere1_vector_cost,
&generic_branch_cost,
&generic_approx_modes,
SVE_NOT_IMPLEMENTED, /* sve_width */
{ 4, /* load_int. */
4, /* store_int. */
4, /* load_fp. */
4, /* store_fp. */
4, /* load_pred. */
4 /* store_pred. */
}, /* memmov_cost. */
4, /* issue_rate */
(AARCH64_FUSE_ADRP_ADD | AARCH64_FUSE_AES_AESMC |
AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_MOVK_MOVK |
AARCH64_FUSE_ALU_BRANCH /* adds, ands, bics, ccmp, ccmn */ |
AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_ALU_CBZ |
AARCH64_FUSE_ADDSUB_2REG_CONST1),
/* fusible_ops */
"32", /* function_align. */
"4", /* jump_align. */
"32:16", /* loop_align. */
2, /* int_reassoc_width. */
4, /* fp_reassoc_width. */
2, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
0, /* max_case_values. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
&ampere1_prefetch_tune
};
static const advsimd_vec_cost neoversev1_advsimd_vector_cost =
{
2, /* int_stmt_cost */
@ -25539,6 +25576,33 @@ aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
}
}
/* Fuse A+B+1 and A-B-1 */
if (simple_sets_p
    && aarch64_fusion_enabled_p (AARCH64_FUSE_ADDSUB_2REG_CONST1))
  {
    /* We're trying to match:
	 prev == (set (r0) (plus (r0) (r1)))
	 curr == (set (r0) (plus (r0) (const_int 1)))
       or:
	 prev == (set (r0) (minus (r0) (r1)))
	 curr == (set (r0) (plus (r0) (const_int -1))) */
    rtx prev_src = SET_SRC (prev_set);
    rtx curr_src = SET_SRC (curr_set);

    /* The constant added by CURR must have the same sign as the
       operation in PREV: +1 after a PLUS, -1 after a MINUS.  */
    int polarity = 1;
    if (GET_CODE (prev_src) == MINUS)
      polarity = -1;

    /* REGNO is only meaningful on a REG.  The destination of PREV may
       be something else (e.g. a MEM or a SUBREG), so verify it is a
       register before comparing register numbers.  */
    if (GET_CODE (curr_src) == PLUS
	&& (GET_CODE (prev_src) == PLUS || GET_CODE (prev_src) == MINUS)
	&& CONST_INT_P (XEXP (curr_src, 1))
	&& INTVAL (XEXP (curr_src, 1)) == polarity
	&& REG_P (XEXP (curr_src, 0))
	&& REG_P (SET_DEST (prev_set))
	&& REGNO (SET_DEST (prev_set)) == REGNO (XEXP (curr_src, 0)))
      return true;
  }
return false;
}

View file

@ -19995,7 +19995,7 @@ performance of the code. Permissible values for this option are:
@samp{cortex-a75.cortex-a55}, @samp{cortex-a76.cortex-a55},
@samp{cortex-r82}, @samp{cortex-x1}, @samp{cortex-x1c}, @samp{cortex-x2},
@samp{cortex-a510}, @samp{cortex-a710}, @samp{cortex-a715}, @samp{ampere1},
@samp{native}.
@samp{ampere1a}, and @samp{native}.
The values @samp{cortex-a57.cortex-a53}, @samp{cortex-a72.cortex-a53},
@samp{cortex-a73.cortex-a35}, @samp{cortex-a73.cortex-a53},