arm.c (arm_mac_accumulator_is_mul_result): New.

gcc/
	* config/arm/arm.c (arm_mac_accumulator_is_mul_result): New.
	* config/arm/arm-protos.h (arm_mac_accumulator_is_mul_result): New.
	* config/arm/cortex-a8.md: New.
	* config/arm/cortex-a8-neon.md: New.
	* config/arm/neon-schedgen.ml: New.
	* config/arm/neon.md (vqh_mnem): New.
	(neon_type): New.
	(Is_float_mode): New.
	(Scalar_mul_8_16): New.
	(Is_d_reg): New.
	(V_mode_nunits): New.
	(All instruction patterns): Annotate with neon_type attribute
	values.
	* config/arm/arm.md: Include cortex-a8.md.
	(insn): Add smmla, umaal, smlald, smlsld, clz, mrs, msr and xtab
	values.
	Annotate instruction patterns accordingly.
	(generic_sched): Do not use generic scheduling for Cortex-A8.
	(generic_vfp): Do not use generic VFP scheduling for Cortex-A8.


Co-Authored-By: Julian Brown <julian@codesourcery.com>

From-SVN: r126953
This commit is contained in:
Mark Shinwell 2007-07-26 12:04:02 +00:00 committed by Julian Brown
parent 0c4d4efbde
commit c956e102df
8 changed files with 3209 additions and 215 deletions

View file

@ -1,3 +1,26 @@
2007-07-26 Mark Shinwell <shinwell@codesourcery.com>
Julian Brown <julian@codesourcery.com>
* config/arm/arm.c (arm_mac_accumulator_is_mul_result): New.
* config/arm/arm-protos.h (arm_mac_accumulator_is_mul_result): New.
* config/arm/cortex-a8.md: New.
* config/arm/cortex-a8-neon.md: New.
* config/arm/neon-schedgen.ml: New.
* config/arm/neon.md (vqh_mnem): New.
(neon_type): New.
(Is_float_mode): New.
(Scalar_mul_8_16): New.
(Is_d_reg): New.
(V_mode_nunits): New.
(All instruction patterns): Annotate with neon_type attribute
values.
* config/arm/arm.md: Include cortex-a8.md.
(insn): Add smmla, umaal, smlald, smlsld, clz, mrs, msr and xtab
values.
Annotate instruction patterns accordingly.
(generic_sched): Do not use generic scheduling for Cortex-A8.
(generic_vfp): Do not use generic VFP scheduling for Cortex-A8.
2007-07-26 Daniel Jacobowitz <dan@codesourcery.com>
* fold-const.c (fold_read_from_constant_string): Use

View file

@ -94,6 +94,7 @@ extern int arm_no_early_store_addr_dep (rtx, rtx);
extern int arm_no_early_alu_shift_dep (rtx, rtx);
extern int arm_no_early_alu_shift_value_dep (rtx, rtx);
extern int arm_no_early_mul_dep (rtx, rtx);
extern int arm_mac_accumulator_is_mul_result (rtx, rtx);
extern int tls_mentioned_p (rtx);
extern int symbol_mentioned_p (rtx);

View file

@ -18167,6 +18167,39 @@ arm_cxx_guard_type (void)
return TARGET_AAPCS_BASED ? integer_type_node : long_long_integer_type_node;
}
/* Return non-zero if the consumer (a multiply-accumulate instruction)
has an accumulator dependency on the result of the producer (a
multiplication instruction) and no other dependency on that result.
Used as a define_bypass guard in the Cortex-A8 scheduling description
so that a multiply result forwarded into a MAC's accumulator input
gets the shorter bypass latency.  */
int
arm_mac_accumulator_is_mul_result (rtx producer, rtx consumer)
{
rtx mul = PATTERN (producer);
rtx mac = PATTERN (consumer);
rtx mul_result;
rtx mac_op0, mac_op1, mac_acc;
/* Look through any conditional-execution wrapper to the underlying
SET of each instruction.  */
if (GET_CODE (mul) == COND_EXEC)
mul = COND_EXEC_CODE (mul);
if (GET_CODE (mac) == COND_EXEC)
mac = COND_EXEC_CODE (mac);
/* Check that mul is of the form (set (...) (mult ...))
and mla is of the form (set (...) (plus (mult ...) (...))). */
if ((GET_CODE (mul) != SET || GET_CODE (XEXP (mul, 1)) != MULT)
|| (GET_CODE (mac) != SET || GET_CODE (XEXP (mac, 1)) != PLUS
|| GET_CODE (XEXP (XEXP (mac, 1), 0)) != MULT))
return 0;
mul_result = XEXP (mul, 0);
/* Operands of the MAC's inner (mult ...) and its accumulator addend.  */
mac_op0 = XEXP (XEXP (XEXP (mac, 1), 0), 0);
mac_op1 = XEXP (XEXP (XEXP (mac, 1), 0), 1);
mac_acc = XEXP (XEXP (mac, 1), 1);
/* The multiply result must feed the accumulator operand and must not
overlap either multiplicand of the MAC.  */
return (reg_overlap_mentioned_p (mul_result, mac_acc)
&& !reg_overlap_mentioned_p (mul_result, mac_op0)
&& !reg_overlap_mentioned_p (mul_result, mac_op1));
}
/* The EABI says test the least significant bit of a guard variable. */

View file

@ -184,7 +184,7 @@
;; scheduling information.
(define_attr "insn"
"smulxy,smlaxy,smlalxy,smulwy,smlawx,mul,muls,mla,mlas,umull,umulls,umlal,umlals,smull,smulls,smlal,smlals,smlawy,smuad,smuadx,smlad,smladx,smusd,smusdx,smlsd,smlsdx,smmul,smmulr,other"
"mov,mvn,smulxy,smlaxy,smlalxy,smulwy,smlawx,mul,muls,mla,mlas,umull,umulls,umlal,umlals,smull,smulls,smlal,smlals,smlawy,smuad,smuadx,smlad,smladx,smusd,smusdx,smlsd,smlsdx,smmul,smmulr,smmla,umaal,smlald,smlsld,clz,mrs,msr,xtab,other"
(const_string "other"))
; TYPE attribute is used to detect floating point instructions which, if
@ -235,8 +235,9 @@
; mav_farith Floating point arithmetic (4 cycle)
; mav_dmult Double multiplies (7 cycle)
;
(define_attr "type"
"alu,alu_shift,alu_shift_reg,mult,block,float,fdivx,fdivd,fdivs,fmul,ffmul,farith,ffarith,f_flag,float_em,f_load,f_store,f_loads,f_loadd,f_stores,f_stored,f_mem_r,r_mem_f,f_2_r,r_2_f,f_cvt,branch,call,load_byte,load1,load2,load3,load4,store1,store2,store3,store4,mav_farith,mav_dmult"
"alu,alu_shift,alu_shift_reg,mult,block,float,fdivx,fdivd,fdivs,fmul,fmuls,fmuld,fmacs,fmacd,ffmul,farith,ffarith,f_flag,float_em,f_load,f_store,f_loads,f_loadd,f_stores,f_stored,f_mem_r,r_mem_f,f_2_r,r_2_f,f_cvt,branch,call,load_byte,load1,load2,load3,load4,store1,store2,store3,store4,mav_farith,mav_dmult"
(if_then_else
(eq_attr "insn" "smulxy,smlaxy,smlalxy,smulwy,smlawx,mul,muls,mla,mlas,umull,umulls,umlal,umlals,smull,smulls,smlal,smlals")
(const_string "mult")
@ -332,14 +333,14 @@
(define_attr "generic_sched" "yes,no"
(const (if_then_else
(eq_attr "tune" "arm926ejs,arm1020e,arm1026ejs,arm1136js,arm1136jfs")
(eq_attr "tune" "arm926ejs,arm1020e,arm1026ejs,arm1136js,arm1136jfs,cortexa8")
(const_string "no")
(const_string "yes"))))
(define_attr "generic_vfp" "yes,no"
(const (if_then_else
(and (eq_attr "fpu" "vfp")
(eq_attr "tune" "!arm1020e,arm1022e"))
(eq_attr "tune" "!arm1020e,arm1022e,cortexa8"))
(const_string "yes")
(const_string "no"))))
@ -348,6 +349,7 @@
(include "arm1020e.md")
(include "arm1026ejs.md")
(include "arm1136jfs.md")
(include "cortex-a8.md")
;;---------------------------------------------------------------------------
@ -3869,6 +3871,7 @@
"TARGET_INT_SIMD"
"uxtab%?\\t%0, %2, %1"
[(set_attr "predicable" "yes")
(set_attr "insn" "xtab")
(set_attr "type" "alu_shift")]
)
@ -4242,6 +4245,7 @@
"TARGET_INT_SIMD"
"sxtab%?\\t%0, %2, %1"
[(set_attr "type" "alu_shift")
(set_attr "insn" "xtab")
(set_attr "predicable" "yes")]
)
@ -10772,7 +10776,8 @@
(clz:SI (match_operand:SI 1 "s_register_operand" "r")))]
"TARGET_32BIT && arm_arch5"
"clz%?\\t%0, %1"
[(set_attr "predicable" "yes")])
[(set_attr "predicable" "yes")
(set_attr "insn" "clz")])
(define_expand "ffssi2"
[(set (match_operand:SI 0 "s_register_operand" "")

File diff suppressed because it is too large Load diff

272
gcc/config/arm/cortex-a8.md Normal file
View file

@ -0,0 +1,272 @@
;; ARM Cortex-A8 scheduling description.
;; Copyright (C) 2007 Free Software Foundation, Inc.
;; Contributed by CodeSourcery.
;; This file is part of GCC.
;; GCC is distributed in the hope that it will be useful, but WITHOUT
;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
;; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
;; License for more details.
;; You should have received a copy of the GNU General Public License
;; along with GCC; see the file COPYING. If not, write to
;; the Free Software Foundation, 51 Franklin Street, Fifth Floor,
;; Boston, MA 02110-1301, USA.
(define_automaton "cortex_a8")
;; Only one load/store instruction can be issued per cycle
;; (although reservation of this unit is only required for single
;; loads and stores -- see below).
(define_cpu_unit "cortex_a8_issue_ls" "cortex_a8")
;; Only one branch instruction can be issued per cycle.
(define_cpu_unit "cortex_a8_issue_branch" "cortex_a8")
;; The two ALU pipelines.
(define_cpu_unit "cortex_a8_alu0" "cortex_a8")
(define_cpu_unit "cortex_a8_alu1" "cortex_a8")
;; The usual flow of an instruction through the pipelines.
(define_reservation "cortex_a8_default"
"cortex_a8_alu0|cortex_a8_alu1")
;; The flow of a branch instruction through the pipelines.
(define_reservation "cortex_a8_branch"
"(cortex_a8_alu0+cortex_a8_issue_branch)|\
(cortex_a8_alu1+cortex_a8_issue_branch)")
;; The flow of a load or store instruction through the pipeline in
;; the case where that instruction consists of only one micro-op...
(define_reservation "cortex_a8_load_store_1"
"(cortex_a8_alu0+cortex_a8_issue_ls)|\
(cortex_a8_alu1+cortex_a8_issue_ls)")
;; ...and in the case of two micro-ops. We don't need to reserve
;; cortex_a8_issue_ls here because dual issue is altogether forbidden
;; during the issue cycle of the first micro-op. (Instead of modelling
;; a separate issue unit, we instead reserve alu0 and alu1 to
;; prevent any other instructions from being issued upon that first cycle.)
;; Even though the load/store pipeline is usually available in either
;; ALU pipe, multi-cycle instructions always issue in pipeline 0. This
;; reservation is therefore the same as cortex_a8_multiply_2 below.
(define_reservation "cortex_a8_load_store_2"
"cortex_a8_alu0+cortex_a8_alu1,\
cortex_a8_alu0")
;; The flow of a single-cycle multiplication.
(define_reservation "cortex_a8_multiply"
"cortex_a8_alu0")
;; The flow of a multiplication instruction that gets decomposed into
;; two micro-ops. The two micro-ops will be issued to pipeline 0 on
;; successive cycles. Dual issue cannot happen at the same time as the
;; first of the micro-ops.
(define_reservation "cortex_a8_multiply_2"
"cortex_a8_alu0+cortex_a8_alu1,\
cortex_a8_alu0")
;; Similarly, the flow of a multiplication instruction that gets
;; decomposed into three micro-ops. Dual issue cannot occur except on
;; the cycle upon which the third micro-op is issued.
(define_reservation "cortex_a8_multiply_3"
"cortex_a8_alu0+cortex_a8_alu1,\
cortex_a8_alu0+cortex_a8_alu1,\
cortex_a8_alu0")
;; The model given here assumes that all instructions are unconditional.
;; Data processing instructions, but not move instructions.
;; We include CLZ with these since it has the same execution pattern
;; (source read in E2 and destination available at the end of that cycle).
(define_insn_reservation "cortex_a8_alu" 2
(and (eq_attr "tune" "cortexa8")
(ior (and (eq_attr "type" "alu")
(not (eq_attr "insn" "mov,mvn")))
(eq_attr "insn" "clz")))
"cortex_a8_default")
(define_insn_reservation "cortex_a8_alu_shift" 2
(and (eq_attr "tune" "cortexa8")
(and (eq_attr "type" "alu_shift")
(not (eq_attr "insn" "mov,mvn"))))
"cortex_a8_default")
(define_insn_reservation "cortex_a8_alu_shift_reg" 2
(and (eq_attr "tune" "cortexa8")
(and (eq_attr "type" "alu_shift_reg")
(not (eq_attr "insn" "mov,mvn"))))
"cortex_a8_default")
;; Move instructions.
(define_insn_reservation "cortex_a8_mov" 1
(and (eq_attr "tune" "cortexa8")
(and (eq_attr "type" "alu,alu_shift,alu_shift_reg")
(eq_attr "insn" "mov,mvn")))
"cortex_a8_default")
;; Exceptions to the default latencies for data processing instructions.
;; A move followed by an ALU instruction with no early dep.
;; (Such a pair can be issued in parallel, hence latency zero.)
(define_bypass 0 "cortex_a8_mov" "cortex_a8_alu")
(define_bypass 0 "cortex_a8_mov" "cortex_a8_alu_shift"
"arm_no_early_alu_shift_dep")
(define_bypass 0 "cortex_a8_mov" "cortex_a8_alu_shift_reg"
"arm_no_early_alu_shift_value_dep")
;; An ALU instruction followed by an ALU instruction with no early dep.
(define_bypass 1 "cortex_a8_alu,cortex_a8_alu_shift,cortex_a8_alu_shift_reg"
"cortex_a8_alu")
(define_bypass 1 "cortex_a8_alu,cortex_a8_alu_shift,cortex_a8_alu_shift_reg"
"cortex_a8_alu_shift"
"arm_no_early_alu_shift_dep")
(define_bypass 1 "cortex_a8_alu,cortex_a8_alu_shift,cortex_a8_alu_shift_reg"
"cortex_a8_alu_shift_reg"
"arm_no_early_alu_shift_value_dep")
;; Multiplication instructions. These are categorized according to their
;; reservation behaviour and the need below to distinguish certain
;; varieties for bypasses. Results are available at the E5 stage
;; (but some of these are multi-cycle instructions which explains the
;; latencies below).
(define_insn_reservation "cortex_a8_mul" 6
(and (eq_attr "tune" "cortexa8")
(eq_attr "insn" "mul,smulxy,smmul"))
"cortex_a8_multiply_2")
(define_insn_reservation "cortex_a8_mla" 6
(and (eq_attr "tune" "cortexa8")
(eq_attr "insn" "mla,smlaxy,smlawy,smmla,smlad,smlsd"))
"cortex_a8_multiply_2")
(define_insn_reservation "cortex_a8_mull" 7
(and (eq_attr "tune" "cortexa8")
(eq_attr "insn" "smull,umull,smlal,umlal,umaal,smlalxy"))
"cortex_a8_multiply_3")
(define_insn_reservation "cortex_a8_smulwy" 5
(and (eq_attr "tune" "cortexa8")
(eq_attr "insn" "smulwy,smuad,smusd"))
"cortex_a8_multiply")
;; smlald and smlsld are multiply-accumulate instructions but do not
;; receive bypassed data from other multiplication results; thus, they
;; cannot go in cortex_a8_mla above. (See below for bypass details.)
(define_insn_reservation "cortex_a8_smlald" 6
(and (eq_attr "tune" "cortexa8")
(eq_attr "insn" "smlald,smlsld"))
"cortex_a8_multiply_2")
;; A multiply with a single-register result or an MLA, followed by an
;; MLA with an accumulator dependency, has its result forwarded so two
;; such instructions can issue back-to-back.
(define_bypass 1 "cortex_a8_mul,cortex_a8_mla,cortex_a8_smulwy"
"cortex_a8_mla"
"arm_mac_accumulator_is_mul_result")
;; A multiply followed by an ALU instruction needing the multiply
;; result only at E2 has lower latency than one needing it at E1.
(define_bypass 4 "cortex_a8_mul,cortex_a8_mla,cortex_a8_mull,\
cortex_a8_smulwy,cortex_a8_smlald"
"cortex_a8_alu")
(define_bypass 4 "cortex_a8_mul,cortex_a8_mla,cortex_a8_mull,\
cortex_a8_smulwy,cortex_a8_smlald"
"cortex_a8_alu_shift"
"arm_no_early_alu_shift_dep")
(define_bypass 4 "cortex_a8_mul,cortex_a8_mla,cortex_a8_mull,\
cortex_a8_smulwy,cortex_a8_smlald"
"cortex_a8_alu_shift_reg"
"arm_no_early_alu_shift_value_dep")
;; Load instructions.
;; The presence of any register writeback is ignored here.
;; A load result has latency 3 unless the dependent instruction has
;; no early dep, in which case it is only latency two.
;; We assume 64-bit alignment for doubleword loads.
(define_insn_reservation "cortex_a8_load1_2" 3
(and (eq_attr "tune" "cortexa8")
(eq_attr "type" "load1,load2,load_byte"))
"cortex_a8_load_store_1")
(define_bypass 2 "cortex_a8_load1_2"
"cortex_a8_alu")
(define_bypass 2 "cortex_a8_load1_2"
"cortex_a8_alu_shift"
"arm_no_early_alu_shift_dep")
(define_bypass 2 "cortex_a8_load1_2"
"cortex_a8_alu_shift_reg"
"arm_no_early_alu_shift_value_dep")
;; We do not currently model the fact that loads with scaled register
;; offsets that are not LSL #2 have an extra cycle latency (they issue
;; as two micro-ops).
;; A load multiple of three registers is usually issued as two micro-ops.
;; The first register will be available at E3 of the first iteration,
;; the second at E3 of the second iteration, and the third at E4 of
;; the second iteration. A load multiple of four registers is usually
;; issued as two micro-ops.
(define_insn_reservation "cortex_a8_load3_4" 5
(and (eq_attr "tune" "cortexa8")
(eq_attr "type" "load3,load4"))
"cortex_a8_load_store_2")
(define_bypass 4 "cortex_a8_load3_4"
"cortex_a8_alu")
(define_bypass 4 "cortex_a8_load3_4"
"cortex_a8_alu_shift"
"arm_no_early_alu_shift_dep")
(define_bypass 4 "cortex_a8_load3_4"
"cortex_a8_alu_shift_reg"
"arm_no_early_alu_shift_value_dep")
;; Store instructions.
;; Writeback is again ignored.
(define_insn_reservation "cortex_a8_store1_2" 0
(and (eq_attr "tune" "cortexa8")
(eq_attr "type" "store1,store2"))
"cortex_a8_load_store_1")
(define_insn_reservation "cortex_a8_store3_4" 0
(and (eq_attr "tune" "cortexa8")
(eq_attr "type" "store3,store4"))
"cortex_a8_load_store_2")
;; An ALU instruction acting as a producer for a store instruction
;; that only uses the result as the value to be stored (as opposed to
;; using it to calculate the address) has latency zero; the store
;; reads the value to be stored at the start of E3 and the ALU insn
;; writes it at the end of E2. Move instructions actually produce the
;; result at the end of E1, but since we don't have delay slots, the
;; scheduling behaviour will be the same.
(define_bypass 0 "cortex_a8_alu,cortex_a8_alu_shift,\
cortex_a8_alu_shift_reg,cortex_a8_mov"
"cortex_a8_store1_2,cortex_a8_store3_4"
"arm_no_early_store_addr_dep")
;; Branch instructions
(define_insn_reservation "cortex_a8_branch" 0
(and (eq_attr "tune" "cortexa8")
(eq_attr "type" "branch"))
"cortex_a8_branch")
;; Call latencies are not predictable. A semi-arbitrary very large
;; number is used as "positive infinity" so that everything should be
;; finished by the time of return.
(define_insn_reservation "cortex_a8_call" 32
(and (eq_attr "tune" "cortexa8")
(eq_attr "type" "call"))
"cortex_a8_issue_branch")
;; NEON (including VFP) instructions.
(include "cortex-a8-neon.md")

View file

@ -0,0 +1,497 @@
(* Emission of the core of the Cortex-A8 NEON scheduling description.
Copyright (C) 2007 Free Software Foundation, Inc.
Contributed by CodeSourcery.
This file is part of GCC.
GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 2, or (at your option) any later
version.
GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
for more details.
You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING. If not, write to the Free
Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301, USA.
*)
(* This scheduling description generator works as follows.
- Each group of instructions has source and destination requirements
specified. The source requirements may be specified using
Source (the stage at which all source operands not otherwise
described are read), Source_m (the stage at which Rm operands are
read), Source_n (likewise for Rn) and Source_d (likewise for Rd).
- For each group of instructions the earliest stage where a source
operand may be required is calculated.
- Each group of instructions is selected in turn as a producer.
The latencies between this group and every other group are then
calculated, yielding up to four values for each combination:
1. Producer -> consumer Rn latency
2. Producer -> consumer Rm latency
3. Producer -> consumer Rd (as a source) latency
4. Producer -> consumer worst-case latency.
Value 4 is calculated from the destination availability requirements
of the consumer and the earliest source availability requirements
of the producer.
- The largest Value 4 calculated for the current producer is the
worst-case latency, L, for that instruction group. This value is written
out in a define_insn_reservation for the producer group.
- For each producer and consumer pair, the latencies calculated above
are collated. The average (of up to four values) is calculated and
if this average is different from the worst-case latency, an
unguarded define_bypass construction is issued for that pair.
(For each pair only one define_bypass construction will be emitted,
and at present we do not emit specific guards.)
*)
open Utils
(* Symbolic names for the NEON pipeline stages N1..N9 at which operands
   are read or results become available; used throughout the
   availability table below instead of bare integers. *)
let n1 = 1 and n2 = 2 and n3 = 3 and n4 = 4 and n5 = 5 and n6 = 6
and n7 = 7 and n8 = 8 and n9 = 9
(* An operand-timing requirement for an instruction group.  Source* give
   the stage at which a kind of source operand is read (Source is the
   default for operands not otherwise described; _n, _m, _d are for Rn,
   Rm and Rd-as-source respectively).  Dest gives the stage at the end of
   which the result appears; Dest_n_after (k, st) means the result
   appears at stage st, k cycles after the first issue cycle of a
   multi-cycle instruction. *)
type availability = Source of int
| Source_n of int
| Source_m of int
| Source_d of int
| Dest of int
| Dest_n_after of int * int
(* Which operand-specific C guard function (if any) a bypass should be
   emitted with; Guard_none means an unguarded bypass. *)
type guard = Guard_none | Guard_only_m | Guard_only_n | Guard_only_d
(* Reservation behaviours.  All but the last row here correspond to one
   pipeline each.  Each constructor will correspond to one
   define_reservation.  The integer argument of Permute and Ls is the
   number of issue cycles occupied. *)
type reservation =
Mul | Mul_2cycle | Mul_4cycle
| Shift | Shift_2cycle
| ALU | ALU_2cycle
| Fmul | Fmul_2cycle
| Fadd | Fadd_2cycle
(* | VFP *)
| Permute of int
| Ls of int
| Fmul_then_fadd | Fmul_then_fadd_2
(* This table must be kept as short as possible by conflating
entries with the same availability behaviour.
First components: instruction group names
Second components: availability requirements, in the order in which
they should appear in the comments in the .md file.
Third components: reservation info
*)
(* Each first component names a value of the neon_type attribute defined
   in neon.md; the emitted define_insn_reservations match on it. *)
let availability_table = [
(* NEON integer ALU instructions. *)
(* vbit vbif vbsl vorr vbic vnot vcls vclz vcnt vadd vand vorr
veor vbic vorn ddd qqq *)
"neon_int_1", [Source n2; Dest n3], ALU;
(* vadd vsub qqd vsub ddd qqq *)
"neon_int_2", [Source_m n1; Source_n n2; Dest n3], ALU;
(* vsum vneg dd qq vadd vsub qdd *)
"neon_int_3", [Source n1; Dest n3], ALU;
(* vabs vceqz vcgez vcbtz vclez vcltz vadh vradh vsbh vrsbh dqq *)
(* vhadd vrhadd vqadd vtst ddd qqq *)
"neon_int_4", [Source n2; Dest n4], ALU;
(* vabd qdd vhsub vqsub vabd vceq vcge vcgt vmax vmin vfmx vfmn ddd ddd *)
"neon_int_5", [Source_m n1; Source_n n2; Dest n4], ALU;
(* vqneg vqabs dd qq *)
"neon_vqneg_vqabs", [Source n1; Dest n4], ALU;
(* vmov vmvn *)
"neon_vmov", [Dest n3], ALU;
(* vaba *)
"neon_vaba", [Source_n n2; Source_m n1; Source_d n3; Dest n6], ALU;
"neon_vaba_qqq",
[Source_n n2; Source_m n1; Source_d n3; Dest_n_after (1, n6)], ALU_2cycle;
(* vsma *)
"neon_vsma", [Source_m n1; Source_d n3; Dest n6], ALU;
(* NEON integer multiply instructions. *)
(* vmul, vqdmlh, vqrdmlh *)
(* vmul, vqdmul, qdd 16/8 long 32/16 long *)
"neon_mul_ddd_8_16_qdd_16_8_long_32_16_long", [Source n2; Dest n6], Mul;
"neon_mul_qqq_8_16_32_ddd_32", [Source n2; Dest_n_after (1, n6)], Mul_2cycle;
(* vmul, vqdmul again *)
"neon_mul_qdd_64_32_long_qqd_16_ddd_32_scalar_64_32_long_scalar",
[Source_n n2; Source_m n1; Dest_n_after (1, n6)], Mul_2cycle;
(* vmla, vmls *)
"neon_mla_ddd_8_16_qdd_16_8_long_32_16_long",
[Source_n n2; Source_m n2; Source_d n3; Dest n6], Mul;
"neon_mla_qqq_8_16",
[Source_n n2; Source_m n2; Source_d n3; Dest_n_after (1, n6)], Mul_2cycle;
"neon_mla_ddd_32_qqd_16_ddd_32_scalar_qdd_64_32_long_scalar_qdd_64_32_long",
[Source_n n2; Source_m n1; Source_d n3; Dest_n_after (1, n6)], Mul_2cycle;
"neon_mla_qqq_32_qqd_32_scalar",
[Source_n n2; Source_m n1; Source_d n3; Dest_n_after (3, n6)], Mul_4cycle;
(* vmul, vqdmulh, vqrdmulh *)
(* vmul, vqdmul *)
"neon_mul_ddd_16_scalar_32_16_long_scalar",
[Source_n n2; Source_m n1; Dest n6], Mul;
"neon_mul_qqd_32_scalar",
[Source_n n2; Source_m n1; Dest_n_after (3, n6)], Mul_4cycle;
(* vmla, vmls *)
(* vmla, vmla, vqdmla, vqdmls *)
"neon_mla_ddd_16_scalar_qdd_32_16_long_scalar",
[Source_n n2; Source_m n1; Source_d n3; Dest n6], Mul;
(* NEON integer shift instructions. *)
(* vshr/vshl immediate, vshr_narrow, vshl_vmvh, vsli_vsri_ddd *)
"neon_shift_1", [Source n1; Dest n3], Shift;
(* vqshl, vrshr immediate; vqshr, vqmov, vrshr, vqrshr narrow;
vqshl_vrshl_vqrshl_ddd *)
"neon_shift_2", [Source n1; Dest n4], Shift;
(* vsli, vsri and vshl for qqq *)
"neon_shift_3", [Source n1; Dest_n_after (1, n3)], Shift_2cycle;
"neon_vshl_ddd", [Source n1; Dest n1], Shift;
"neon_vqshl_vrshl_vqrshl_qqq", [Source n1; Dest_n_after (1, n4)],
Shift_2cycle;
"neon_vsra_vrsra", [Source_m n1; Source_d n3; Dest n6], Shift;
(* NEON floating-point instructions. *)
(* vadd, vsub, vabd, vmul, vceq, vcge, vcgt, vcage, vcagt, vmax, vmin *)
(* vabs, vneg, vceqz, vcgez, vcgtz, vclez, vcltz, vrecpe, vrsqrte, vcvt *)
"neon_fp_vadd_ddd_vabs_dd", [Source n2; Dest n5], Fadd;
"neon_fp_vadd_qqq_vabs_qq", [Source n2; Dest_n_after (1, n5)],
Fadd_2cycle;
(* vsum, vfmx, vfmn *)
"neon_fp_vsum", [Source n1; Dest n5], Fadd;
"neon_fp_vmul_ddd", [Source_n n2; Source_m n1; Dest n5], Fmul;
"neon_fp_vmul_qqd", [Source_n n2; Source_m n1; Dest_n_after (1, n5)],
Fmul_2cycle;
(* vmla, vmls *)
"neon_fp_vmla_ddd",
[Source_n n2; Source_m n2; Source_d n3; Dest n9], Fmul_then_fadd;
"neon_fp_vmla_qqq",
[Source_n n2; Source_m n2; Source_d n3; Dest_n_after (1, n9)],
Fmul_then_fadd_2;
"neon_fp_vmla_ddd_scalar",
[Source_n n2; Source_m n1; Source_d n3; Dest n9], Fmul_then_fadd;
"neon_fp_vmla_qqq_scalar",
[Source_n n2; Source_m n1; Source_d n3; Dest_n_after (1, n9)],
Fmul_then_fadd_2;
"neon_fp_vrecps_vrsqrts_ddd", [Source n2; Dest n9], Fmul_then_fadd;
"neon_fp_vrecps_vrsqrts_qqq", [Source n2; Dest_n_after (1, n9)],
Fmul_then_fadd_2;
(* NEON byte permute instructions. *)
(* vmov; vtrn and vswp for dd; vzip for dd; vuzp for dd; vrev; vext for dd *)
"neon_bp_simple", [Source n1; Dest n2], Permute 1;
(* vswp for qq; vext for qqq; vtbl with {Dn} or {Dn, Dn1};
similarly for vtbx *)
"neon_bp_2cycle", [Source n1; Dest_n_after (1, n2)], Permute 2;
(* all the rest *)
"neon_bp_3cycle", [Source n1; Dest_n_after (2, n2)], Permute 3;
(* NEON load/store instructions. *)
"neon_ldr", [Dest n1], Ls 1;
"neon_str", [Source n1], Ls 1;
"neon_vld1_1_2_regs", [Dest_n_after (1, n1)], Ls 2;
"neon_vld1_3_4_regs", [Dest_n_after (2, n1)], Ls 3;
"neon_vld2_2_regs_vld1_vld2_all_lanes", [Dest_n_after (1, n2)], Ls 2;
"neon_vld2_4_regs", [Dest_n_after (2, n2)], Ls 3;
"neon_vld3_vld4", [Dest_n_after (3, n2)], Ls 4;
"neon_vst1_1_2_regs_vst2_2_regs", [Source n1], Ls 2;
"neon_vst1_3_4_regs", [Source n1], Ls 3;
"neon_vst2_4_regs_vst3_vst4", [Source n1], Ls 4;
"neon_vst3_vst4", [Source n1], Ls 4;
"neon_vld1_vld2_lane", [Source n1; Dest_n_after (2, n2)], Ls 3;
"neon_vld3_vld4_lane", [Source n1; Dest_n_after (4, n2)], Ls 5;
"neon_vst1_vst2_lane", [Source n1], Ls 2;
"neon_vst3_vst4_lane", [Source n1], Ls 3;
"neon_vld3_vld4_all_lanes", [Dest_n_after (1, n2)], Ls 3;
(* NEON register transfer instructions. *)
"neon_mcr", [Dest n2], Permute 1;
"neon_mcr_2_mcrr", [Dest n2], Permute 2;
(* MRC instructions are in the .tpl file. *)
]
(* Augment each tuple of the availability table with a fourth component:
   the earliest pipeline stage at which any source operand is read, or
   None when the entry has no source requirements at all. *)
let calculate_sources =
  List.map (fun (name, avail, res) ->
    let earliest requirements =
      List.fold_left
        (fun acc requirement ->
           match requirement with
             Source st | Source_n st | Source_m st | Source_d st ->
               (match acc with
                  Some best when best <= st -> acc
                | _ -> Some st)
           | _ -> acc)
        None requirements
    in
    (name, avail, res, earliest avail))
(* Find the stage, if any, at the end of which a group produces a result.
   Returns Some stage (with the cycle offset of Dest_n_after folded in)
   or None for groups, such as stores, that have no Dest component.
   find_with_result (from Utils) raises Not_found when no component
   matches; we translate that to None.
   Fix: the first tuple component was bound to an unused name ("attr"),
   which triggers OCaml's unused-variable warning; use "_" instead. *)
let find_dest (_, avail, _, _) =
  try
    find_with_result
      (fun av -> match av with
                   Dest st -> Some (Some st)
                 | Dest_n_after (after, st) -> Some (Some (after + st))
                 | _ -> None) avail
  with Not_found -> None
(* Worst-case latency from PRODUCER to one consumer: the distance from
   the stage where the producer's result appears to the earliest stage
   at which the consumer reads any source operand, plus one.  None when
   the consumer has no source requirements or the producer yields no
   result (e.g. a store insn). *)
let worst_case_latency producer (_, _, _, earliest_required) =
  match find_dest producer, earliest_required with
    Some produced, Some consumed -> Some (produced - consumed + 1)
  | _, _ -> None
(* Helper for the per-operand latency functions below.  F selects the
   relevant Source_* stage from the consumer's availability list.
   Returns Some latency, clamped at zero (we have no delay slots), or
   None when the consumer has no operand selected by F. *)
let latency_calc f producer (_, avail, _, _) =
  try
    let read_at = find_with_result f avail in
    let raw =
      match find_dest producer with
        None -> 0  (* The producer does not produce a result. *)
      | Some written_at -> written_at - read_at + 1
    in
    Some (if raw < 0 then 0 else raw)
  with Not_found -> None
(* Find any Rm latency between a producer and a consumer, as a singleton
   guarded list; empty when the consumer has no explicit Rm source
   requirement (the caller treats that as "positive infinity"). *)
let get_m_latency producer consumer =
  let selector = function Source_m stage -> Some stage | _ -> None in
  match latency_calc selector producer consumer with
    Some latency -> [(Guard_only_m, latency)]
  | None -> []
(* Likewise for Rn. *)
let get_n_latency producer consumer =
  let selector = function Source_n stage -> Some stage | _ -> None in
  match latency_calc selector producer consumer with
    Some latency -> [(Guard_only_n, latency)]
  | None -> []
(* Likewise for Rd (as a source). *)
let get_d_latency producer consumer =
  let selector = function Source_d stage -> Some stage | _ -> None in
  match latency_calc selector producer consumer with
    Some latency -> [(Guard_only_d, latency)]
  | None -> []
(* Given a producer and a consumer, work out the latency of the producer
   to the consumer in each of the four cases (availability information
   permitting) identified at the top of this file.  Return the consumer,
   the worst-case unguarded latency and any guarded latencies. *)
let calculate_latencies producer consumer =
  let worst = worst_case_latency producer consumer in
  let guarded =
    get_m_latency producer consumer
    @ get_n_latency producer consumer
    @ get_d_latency producer consumer
  in
  (consumer, worst, guarded)
(* Choose the single bypass latency to emit for one producer/consumer
   pair: the ceiling of the average over the unguarded worst-case latency
   (if any) and all guarded latencies.  Returns None when there is
   nothing to average, or when the rounded average equals LARGEST (the
   producer's overall worst case, already expressed in its
   define_insn_reservation, so no bypass is needed).
   Fix: test emptiness by pattern matching rather than the O(n)
   "List.length guards = 0". *)
let pick_latency largest worst guards =
  let guards =
    match worst with
      None -> guards
    | Some worst -> (Guard_none, worst) :: guards
  in
  match guards with
    [] -> None
  | _ ->
      let total_latency =
        List.fold_left (fun acc (_, latency) -> acc + latency) 0 guards
      in
      let average_latency = (float_of_int total_latency) /.
                            (float_of_int (List.length guards)) in
      let rounded_latency = int_of_float (ceil average_latency) in
      if rounded_latency = largest then None
      else Some (Guard_none, rounded_latency)
(* Collate all bypasses for a particular producer as required in
   worst_case_latencies_and_bypasses.  (By this stage there is a maximum
   of one bypass from this producer to any particular consumer listed
   in LATENCIES.)  Use a hash table to collate bypasses with the
   same latency and guard.
   Fix: use Hashtbl.mem for the membership test instead of the
   try/ignore/Not_found idiom around Hashtbl.find. *)
let collate_bypasses (producer_name, _, _, _) largest latencies =
  let ht = Hashtbl.create 42 in
  (* Keys are remembered separately so we can walk them later; Hashtbl
     itself has no ordered key iteration we rely on here. *)
  let keys = ref [] in
  List.iter (
    fun ((consumer, _, _, _), worst, guards) ->
      (* Find out which latency to use.  Ignoring latencies that match
         the *overall* worst-case latency for this producer (which will
         be in define_insn_reservation), we have to examine:
         1. the latency with no guard between this producer and this
            consumer; and
         2. any guarded latency. *)
      let guard_latency_opt = pick_latency largest worst guards in
      match guard_latency_opt with
        None -> ()
      | Some (guard, latency) ->
          begin
            (* Record each (guard, latency) key the first time it is
               seen; consumers sharing a key are collated below. *)
            if not (Hashtbl.mem ht (guard, latency)) then
              keys := (guard, latency) :: !keys;
            Hashtbl.add ht (guard, latency) consumer
          end
  ) latencies;
  (* The hash table now has bypasses collated so that ones with the
     same latency and guard have the same keys.  Walk through all the
     keys, extract the associated bypasses, and concatenate the names
     of the consumers for each bypass. *)
  List.map (
    fun ((guard, latency) as key) ->
      let consumers = Hashtbl.find_all ht key in
      (producer_name,
       String.concat ",\\\n " consumers,
       latency,
       guard)
  ) !keys
(* For every producer, find the worst-case latency between it and
*any* consumer. Also determine (if such a thing exists) the
lowest-latency bypass from each producer to each consumer. Group
the output in such a way that all bypasses with the same producer
and latency are together, and so that bypasses with the worst-case
latency are ignored. *)
let worst_case_latencies_and_bypasses =
let rec f (worst_acc, bypasses_acc) prev xs =
match xs with
[] -> (worst_acc, bypasses_acc)
| ((producer_name, producer_avail, res_string, _) as producer)::next ->
(* For this particular producer, work out the latencies between
it and every consumer.  Note that (prev @ xs) includes the
current producer itself, so producer-to-self latencies are
considered too. *)
let latencies =
List.fold_left (fun acc -> fun consumer ->
(calculate_latencies producer consumer) :: acc)
[] (prev @ xs)
in
(* Now work out what the overall worst case latency was for this
particular producer. *)
match latencies with
[] -> assert false
| _ ->
(* Sort in descending order of the (int option) latency using
polymorphic comparison; None sorts below any Some, so the
head is the largest latency when one exists. *)
let comp_fn (_, l1, _) (_, l2, _) =
if l1 > l2 then -1 else if l1 = l2 then 0 else 1
in
let largest =
match List.hd (List.sort comp_fn latencies) with
(_, None, _) -> 0 (* Producer has no consumers. *)
| (_, Some worst, _) -> worst
in
(* Having got the largest latency, collect all bypasses for
this producer and filter out those with that larger
latency. Record the others for later emission. *)
let bypasses = collate_bypasses producer largest latencies in
(* Go on to process remaining producers, having noted
the result for this one. *)
f ((producer_name, producer_avail, largest,
res_string) :: worst_acc,
bypasses @ bypasses_acc)
(prev @ [producer]) next
in
f ([], []) []
(* Emit a helpful comment for a define_insn_reservation, describing the
stages at which the group's operands are read and its result produced.
The word "read" is printed only before the first Source_* component;
SEP breaks the comment onto a new ";;" line every other component. *)
let write_comment producer avail =
let seen_source = ref false in
let describe info =
let read = if !seen_source then "" else "read " in
match info with
Source stage ->
seen_source := true;
Printf.printf "%stheir source operands at N%d" read stage
| Source_n stage ->
seen_source := true;
Printf.printf "%stheir (D|Q)n operands at N%d" read stage
| Source_m stage ->
seen_source := true;
Printf.printf "%stheir (D|Q)m operands at N%d" read stage
| Source_d stage ->
(* NOTE(review): unlike the other Source_* cases, this one does not
set seen_source, so a Source_d followed by another Source_* will
print "read " again — confirm whether that is intentional. *)
Printf.printf "%stheir (D|Q)d operands at N%d" read stage
| Dest stage ->
Printf.printf "produce a result at N%d" stage
| Dest_n_after (after, stage) ->
Printf.printf "produce a result at N%d on cycle %d" stage (after + 1)
in
Printf.printf ";; Instructions using this reservation ";
let rec f infos x =
let sep = if x mod 2 = 1 then "" else "\n;;" in
match infos with
[] -> assert false
| [info] -> describe info; Printf.printf ".\n"
| info::(_::[] as infos) ->
describe info; Printf.printf ", and%s " sep; f infos (x+1)
| info::infos -> describe info; Printf.printf ",%s " sep; f infos (x+1)
in
f avail 0
(* Emit a define_insn_reservation for each producer. The latency
written in will be its worst-case latency.  The reservation constructor
is mapped to a cortex_a8_neon_* define_reservation name; note that
Mul/Shift/ALU/Fmul variants deliberately share the "dp" names. *)
let emit_insn_reservations =
List.iter (
fun (producer, avail, latency, reservation) ->
write_comment producer avail;
Printf.printf "(define_insn_reservation \"%s\" %d\n" producer latency;
Printf.printf " (and (eq_attr \"tune\" \"cortexa8\")\n";
Printf.printf " (eq_attr \"neon_type\" \"%s\"))\n" producer;
let str =
match reservation with
Mul -> "dp" | Mul_2cycle -> "dp_2" | Mul_4cycle -> "dp_4"
| Shift -> "dp" | Shift_2cycle -> "dp_2"
| ALU -> "dp" | ALU_2cycle -> "dp_2"
| Fmul -> "dp" | Fmul_2cycle -> "dp_2"
| Fadd -> "fadd" | Fadd_2cycle -> "fadd_2"
(* The literal patterns "Ls 1" / "Permute 1" must precede the
general "Ls n" / "Permute n" cases to get the unsuffixed names. *)
| Ls 1 -> "ls"
| Ls n -> "ls_" ^ (string_of_int n)
| Permute 1 -> "perm"
| Permute n -> "perm_" ^ (string_of_int n)
| Fmul_then_fadd -> "fmul_then_fadd"
| Fmul_then_fadd_2 -> "fmul_then_fadd_2"
in
Printf.printf " \"cortex_a8_neon_%s\")\n\n" str
)
(* Given a guard description, return the name of the C function to be
   used as the guard for define_bypass.  Guard_none bypasses are emitted
   without a guard and must never reach this function. *)
let guard_fn = function
    Guard_only_m -> "arm_neon_only_m_dependency"
  | Guard_only_n -> "arm_neon_only_n_dependency"
  | Guard_only_d -> "arm_neon_only_d_dependency"
  | Guard_none -> assert false
(* Emit a define_bypass for each bypass, adding the guard-function line
   only for guarded bypasses. *)
let emit_bypasses =
  List.iter (
    fun (producer, consumers, latency, guard) ->
      Printf.printf "(define_bypass %d \"%s\"\n" latency producer;
      match guard with
        Guard_none ->
          Printf.printf " \"%s\")\n\n" consumers
      | _ ->
          begin
            Printf.printf " \"%s\"\n" consumers;
            Printf.printf " \"%s\")\n\n" (guard_fn guard)
          end
  )
(* Program entry point.  Evaluated at module-initialisation time: writes
the define_insn_reservations (worst cases were accumulated in reverse,
hence List.rev) followed by the exception bypasses, all to stdout. *)
let main =
let table = calculate_sources availability_table in
let worst_cases, bypasses = worst_case_latencies_and_bypasses table in
emit_insn_reservations (List.rev worst_cases);
Printf.printf ";; Exceptions to the default latencies.\n\n";
emit_bypasses bypasses

File diff suppressed because it is too large Load diff