arm.c (arm_mac_accumulator_is_mul_result): New.
gcc/ * config/arm/arm.c (arm_mac_accumulator_is_mul_result): New. * config/arm/arm-protos.h (arm_mac_accumulator_is_mul_result): New. * config/arm/cortex-a8.md: New. * config/arm/cortex-a8-neon.md: New. * config/arm/neon-schedgen.ml: New. * config/arm/neon.md (vqh_mnem): New. (neon_type): New. (Is_float_mode): New. (Scalar_mul_8_16): New. (Is_d_reg): New. (V_mode_nunits): New. (All instruction patterns): Annotate with neon_type attribute values. * config/arm/arm.md: Include cortex-a8.md. (insn): Add smmla, umaal, smlald, smlsld, clz, mrs, msr and xtab values. Annotate instruction patterns accordingly. (generic_sched): Do not use generic scheduling for Cortex-A8. (generic_vfp): Do not use generic VFP scheduling for Cortex-A8. Co-Authored-By: Julian Brown <julian@codesourcery.com> From-SVN: r126953
This commit is contained in:
parent
0c4d4efbde
commit
c956e102df
8 changed files with 3209 additions and 215 deletions
|
@ -1,3 +1,26 @@
|
|||
2007-07-26 Mark Shinwell <shinwell@codesourcery.com>
|
||||
Julian Brown <julian@codesourcery.com>
|
||||
|
||||
* config/arm/arm.c (arm_mac_accumulator_is_mul_result): New.
|
||||
* config/arm/arm-protos.h (arm_mac_accumulator_is_mul_result): New.
|
||||
* config/arm/cortex-a8.md: New.
|
||||
* config/arm/cortex-a8-neon.md: New.
|
||||
* config/arm/neon-schedgen.ml: New.
|
||||
* config/arm/neon.md (vqh_mnem): New.
|
||||
(neon_type): New.
|
||||
(Is_float_mode): New.
|
||||
(Scalar_mul_8_16): New.
|
||||
(Is_d_reg): New.
|
||||
(V_mode_nunits): New.
|
||||
(All instruction patterns): Annotate with neon_type attribute
|
||||
values.
|
||||
* config/arm/arm.md: Include cortex-a8.md.
|
||||
(insn): Add smmla, umaal, smlald, smlsld, clz, mrs, msr and xtab
|
||||
values.
|
||||
Annotate instruction patterns accordingly.
|
||||
(generic_sched): Do not use generic scheduling for Cortex-A8.
|
||||
(generic_vfp): Do not use generic VFP scheduling for Cortex-A8.
|
||||
|
||||
2007-07-26 Daniel Jacobowitz <dan@codesourcery.com>
|
||||
|
||||
* fold-const.c (fold_read_from_constant_string): Use
|
||||
|
|
|
@ -94,6 +94,7 @@ extern int arm_no_early_store_addr_dep (rtx, rtx);
|
|||
extern int arm_no_early_alu_shift_dep (rtx, rtx);
|
||||
extern int arm_no_early_alu_shift_value_dep (rtx, rtx);
|
||||
extern int arm_no_early_mul_dep (rtx, rtx);
|
||||
extern int arm_mac_accumulator_is_mul_result (rtx, rtx);
|
||||
|
||||
extern int tls_mentioned_p (rtx);
|
||||
extern int symbol_mentioned_p (rtx);
|
||||
|
|
|
@ -18167,6 +18167,39 @@ arm_cxx_guard_type (void)
|
|||
return TARGET_AAPCS_BASED ? integer_type_node : long_long_integer_type_node;
|
||||
}
|
||||
|
||||
/* Return non-zero if the consumer (a multiply-accumulate instruction)
|
||||
has an accumulator dependency on the result of the producer (a
|
||||
multiplication instruction) and no other dependency on that result. */
|
||||
int
|
||||
arm_mac_accumulator_is_mul_result (rtx producer, rtx consumer)
|
||||
{
|
||||
rtx mul = PATTERN (producer);
|
||||
rtx mac = PATTERN (consumer);
|
||||
rtx mul_result;
|
||||
rtx mac_op0, mac_op1, mac_acc;
|
||||
|
||||
if (GET_CODE (mul) == COND_EXEC)
|
||||
mul = COND_EXEC_CODE (mul);
|
||||
if (GET_CODE (mac) == COND_EXEC)
|
||||
mac = COND_EXEC_CODE (mac);
|
||||
|
||||
/* Check that mul is of the form (set (...) (mult ...))
|
||||
and mla is of the form (set (...) (plus (mult ...) (...))). */
|
||||
if ((GET_CODE (mul) != SET || GET_CODE (XEXP (mul, 1)) != MULT)
|
||||
|| (GET_CODE (mac) != SET || GET_CODE (XEXP (mac, 1)) != PLUS
|
||||
|| GET_CODE (XEXP (XEXP (mac, 1), 0)) != MULT))
|
||||
return 0;
|
||||
|
||||
mul_result = XEXP (mul, 0);
|
||||
mac_op0 = XEXP (XEXP (XEXP (mac, 1), 0), 0);
|
||||
mac_op1 = XEXP (XEXP (XEXP (mac, 1), 0), 1);
|
||||
mac_acc = XEXP (XEXP (mac, 1), 1);
|
||||
|
||||
return (reg_overlap_mentioned_p (mul_result, mac_acc)
|
||||
&& !reg_overlap_mentioned_p (mul_result, mac_op0)
|
||||
&& !reg_overlap_mentioned_p (mul_result, mac_op1));
|
||||
}
|
||||
|
||||
|
||||
/* The EABI says test the least significant bit of a guard variable. */
|
||||
|
||||
|
|
|
@ -184,7 +184,7 @@
|
|||
;; scheduling information.
|
||||
|
||||
(define_attr "insn"
|
||||
"smulxy,smlaxy,smlalxy,smulwy,smlawx,mul,muls,mla,mlas,umull,umulls,umlal,umlals,smull,smulls,smlal,smlals,smlawy,smuad,smuadx,smlad,smladx,smusd,smusdx,smlsd,smlsdx,smmul,smmulr,other"
|
||||
"mov,mvn,smulxy,smlaxy,smlalxy,smulwy,smlawx,mul,muls,mla,mlas,umull,umulls,umlal,umlals,smull,smulls,smlal,smlals,smlawy,smuad,smuadx,smlad,smladx,smusd,smusdx,smlsd,smlsdx,smmul,smmulr,smmla,umaal,smlald,smlsld,clz,mrs,msr,xtab,other"
|
||||
(const_string "other"))
|
||||
|
||||
; TYPE attribute is used to detect floating point instructions which, if
|
||||
|
@ -235,8 +235,9 @@
|
|||
; mav_farith Floating point arithmetic (4 cycle)
|
||||
; mav_dmult Double multiplies (7 cycle)
|
||||
;
|
||||
|
||||
(define_attr "type"
|
||||
"alu,alu_shift,alu_shift_reg,mult,block,float,fdivx,fdivd,fdivs,fmul,ffmul,farith,ffarith,f_flag,float_em,f_load,f_store,f_loads,f_loadd,f_stores,f_stored,f_mem_r,r_mem_f,f_2_r,r_2_f,f_cvt,branch,call,load_byte,load1,load2,load3,load4,store1,store2,store3,store4,mav_farith,mav_dmult"
|
||||
"alu,alu_shift,alu_shift_reg,mult,block,float,fdivx,fdivd,fdivs,fmul,fmuls,fmuld,fmacs,fmacd,ffmul,farith,ffarith,f_flag,float_em,f_load,f_store,f_loads,f_loadd,f_stores,f_stored,f_mem_r,r_mem_f,f_2_r,r_2_f,f_cvt,branch,call,load_byte,load1,load2,load3,load4,store1,store2,store3,store4,mav_farith,mav_dmult"
|
||||
(if_then_else
|
||||
(eq_attr "insn" "smulxy,smlaxy,smlalxy,smulwy,smlawx,mul,muls,mla,mlas,umull,umulls,umlal,umlals,smull,smulls,smlal,smlals")
|
||||
(const_string "mult")
|
||||
|
@ -332,14 +333,14 @@
|
|||
|
||||
(define_attr "generic_sched" "yes,no"
|
||||
(const (if_then_else
|
||||
(eq_attr "tune" "arm926ejs,arm1020e,arm1026ejs,arm1136js,arm1136jfs")
|
||||
(eq_attr "tune" "arm926ejs,arm1020e,arm1026ejs,arm1136js,arm1136jfs,cortexa8")
|
||||
(const_string "no")
|
||||
(const_string "yes"))))
|
||||
|
||||
(define_attr "generic_vfp" "yes,no"
|
||||
(const (if_then_else
|
||||
(and (eq_attr "fpu" "vfp")
|
||||
(eq_attr "tune" "!arm1020e,arm1022e"))
|
||||
(eq_attr "tune" "!arm1020e,arm1022e,cortexa8"))
|
||||
(const_string "yes")
|
||||
(const_string "no"))))
|
||||
|
||||
|
@ -348,6 +349,7 @@
|
|||
(include "arm1020e.md")
|
||||
(include "arm1026ejs.md")
|
||||
(include "arm1136jfs.md")
|
||||
(include "cortex-a8.md")
|
||||
|
||||
|
||||
;;---------------------------------------------------------------------------
|
||||
|
@ -3869,6 +3871,7 @@
|
|||
"TARGET_INT_SIMD"
|
||||
"uxtab%?\\t%0, %2, %1"
|
||||
[(set_attr "predicable" "yes")
|
||||
(set_attr "insn" "xtab")
|
||||
(set_attr "type" "alu_shift")]
|
||||
)
|
||||
|
||||
|
@ -4242,6 +4245,7 @@
|
|||
"TARGET_INT_SIMD"
|
||||
"sxtab%?\\t%0, %2, %1"
|
||||
[(set_attr "type" "alu_shift")
|
||||
(set_attr "insn" "xtab")
|
||||
(set_attr "predicable" "yes")]
|
||||
)
|
||||
|
||||
|
@ -10772,7 +10776,8 @@
|
|||
(clz:SI (match_operand:SI 1 "s_register_operand" "r")))]
|
||||
"TARGET_32BIT && arm_arch5"
|
||||
"clz%?\\t%0, %1"
|
||||
[(set_attr "predicable" "yes")])
|
||||
[(set_attr "predicable" "yes")
|
||||
(set_attr "insn" "clz")])
|
||||
|
||||
(define_expand "ffssi2"
|
||||
[(set (match_operand:SI 0 "s_register_operand" "")
|
||||
|
|
1307
gcc/config/arm/cortex-a8-neon.md
Normal file
1307
gcc/config/arm/cortex-a8-neon.md
Normal file
File diff suppressed because it is too large
Load diff
272
gcc/config/arm/cortex-a8.md
Normal file
272
gcc/config/arm/cortex-a8.md
Normal file
|
@ -0,0 +1,272 @@
|
|||
;; ARM Cortex-A8 scheduling description.
|
||||
;; Copyright (C) 2007 Free Software Foundation, Inc.
|
||||
;; Contributed by CodeSourcery.
|
||||
|
||||
;; This file is part of GCC.
|
||||
|
||||
;; GCC is distributed in the hope that it will be useful, but WITHOUT
|
||||
;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
||||
;; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
|
||||
;; License for more details.
|
||||
|
||||
;; You should have received a copy of the GNU General Public License
|
||||
;; along with GCC; see the file COPYING. If not, write to
|
||||
;; the Free Software Foundation, 51 Franklin Street, Fifth Floor,
|
||||
;; Boston, MA 02110-1301, USA.
|
||||
|
||||
(define_automaton "cortex_a8")
|
||||
|
||||
;; Only one load/store instruction can be issued per cycle
|
||||
;; (although reservation of this unit is only required for single
|
||||
;; loads and stores -- see below).
|
||||
(define_cpu_unit "cortex_a8_issue_ls" "cortex_a8")
|
||||
|
||||
;; Only one branch instruction can be issued per cycle.
|
||||
(define_cpu_unit "cortex_a8_issue_branch" "cortex_a8")
|
||||
|
||||
;; The two ALU pipelines.
|
||||
(define_cpu_unit "cortex_a8_alu0" "cortex_a8")
|
||||
(define_cpu_unit "cortex_a8_alu1" "cortex_a8")
|
||||
|
||||
;; The usual flow of an instruction through the pipelines.
|
||||
(define_reservation "cortex_a8_default"
|
||||
"cortex_a8_alu0|cortex_a8_alu1")
|
||||
|
||||
;; The flow of a branch instruction through the pipelines.
|
||||
(define_reservation "cortex_a8_branch"
|
||||
"(cortex_a8_alu0+cortex_a8_issue_branch)|\
|
||||
(cortex_a8_alu1+cortex_a8_issue_branch)")
|
||||
|
||||
;; The flow of a load or store instruction through the pipeline in
|
||||
;; the case where that instruction consists of only one micro-op...
|
||||
(define_reservation "cortex_a8_load_store_1"
|
||||
"(cortex_a8_alu0+cortex_a8_issue_ls)|\
|
||||
(cortex_a8_alu1+cortex_a8_issue_ls)")
|
||||
|
||||
;; ...and in the case of two micro-ops. We don't need to reserve
|
||||
;; cortex_a8_issue_ls here because dual issue is altogether forbidden
|
||||
;; during the issue cycle of the first micro-op. (Instead of modelling
|
||||
;; a separate issue unit, we instead reserve alu0 and alu1 to
|
||||
;; prevent any other instructions from being issued upon that first cycle.)
|
||||
;; Even though the load/store pipeline is usually available in either
|
||||
;; ALU pipe, multi-cycle instructions always issue in pipeline 0. This
|
||||
;; reservation is therefore the same as cortex_a8_multiply_2 below.
|
||||
(define_reservation "cortex_a8_load_store_2"
|
||||
"cortex_a8_alu0+cortex_a8_alu1,\
|
||||
cortex_a8_alu0")
|
||||
|
||||
;; The flow of a single-cycle multiplication.
|
||||
(define_reservation "cortex_a8_multiply"
|
||||
"cortex_a8_alu0")
|
||||
|
||||
;; The flow of a multiplication instruction that gets decomposed into
|
||||
;; two micro-ops. The two micro-ops will be issued to pipeline 0 on
|
||||
;; successive cycles. Dual issue cannot happen at the same time as the
|
||||
;; first of the micro-ops.
|
||||
(define_reservation "cortex_a8_multiply_2"
|
||||
"cortex_a8_alu0+cortex_a8_alu1,\
|
||||
cortex_a8_alu0")
|
||||
|
||||
;; Similarly, the flow of a multiplication instruction that gets
|
||||
;; decomposed into three micro-ops. Dual issue cannot occur except on
|
||||
;; the cycle upon which the third micro-op is issued.
|
||||
(define_reservation "cortex_a8_multiply_3"
|
||||
"cortex_a8_alu0+cortex_a8_alu1,\
|
||||
cortex_a8_alu0+cortex_a8_alu1,\
|
||||
cortex_a8_alu0")
|
||||
|
||||
;; The model given here assumes that all instructions are unconditional.
|
||||
|
||||
;; Data processing instructions, but not move instructions.
|
||||
|
||||
;; We include CLZ with these since it has the same execution pattern
|
||||
;; (source read in E2 and destination available at the end of that cycle).
|
||||
(define_insn_reservation "cortex_a8_alu" 2
|
||||
(and (eq_attr "tune" "cortexa8")
|
||||
(ior (and (eq_attr "type" "alu")
|
||||
(not (eq_attr "insn" "mov,mvn")))
|
||||
(eq_attr "insn" "clz")))
|
||||
"cortex_a8_default")
|
||||
|
||||
(define_insn_reservation "cortex_a8_alu_shift" 2
|
||||
(and (eq_attr "tune" "cortexa8")
|
||||
(and (eq_attr "type" "alu_shift")
|
||||
(not (eq_attr "insn" "mov,mvn"))))
|
||||
"cortex_a8_default")
|
||||
|
||||
(define_insn_reservation "cortex_a8_alu_shift_reg" 2
|
||||
(and (eq_attr "tune" "cortexa8")
|
||||
(and (eq_attr "type" "alu_shift_reg")
|
||||
(not (eq_attr "insn" "mov,mvn"))))
|
||||
"cortex_a8_default")
|
||||
|
||||
;; Move instructions.
|
||||
|
||||
(define_insn_reservation "cortex_a8_mov" 1
|
||||
(and (eq_attr "tune" "cortexa8")
|
||||
(and (eq_attr "type" "alu,alu_shift,alu_shift_reg")
|
||||
(eq_attr "insn" "mov,mvn")))
|
||||
"cortex_a8_default")
|
||||
|
||||
;; Exceptions to the default latencies for data processing instructions.
|
||||
|
||||
;; A move followed by an ALU instruction with no early dep.
|
||||
;; (Such a pair can be issued in parallel, hence latency zero.)
|
||||
(define_bypass 0 "cortex_a8_mov" "cortex_a8_alu")
|
||||
(define_bypass 0 "cortex_a8_mov" "cortex_a8_alu_shift"
|
||||
"arm_no_early_alu_shift_dep")
|
||||
(define_bypass 0 "cortex_a8_mov" "cortex_a8_alu_shift_reg"
|
||||
"arm_no_early_alu_shift_value_dep")
|
||||
|
||||
;; An ALU instruction followed by an ALU instruction with no early dep.
|
||||
(define_bypass 1 "cortex_a8_alu,cortex_a8_alu_shift,cortex_a8_alu_shift_reg"
|
||||
"cortex_a8_alu")
|
||||
(define_bypass 1 "cortex_a8_alu,cortex_a8_alu_shift,cortex_a8_alu_shift_reg"
|
||||
"cortex_a8_alu_shift"
|
||||
"arm_no_early_alu_shift_dep")
|
||||
(define_bypass 1 "cortex_a8_alu,cortex_a8_alu_shift,cortex_a8_alu_shift_reg"
|
||||
"cortex_a8_alu_shift_reg"
|
||||
"arm_no_early_alu_shift_value_dep")
|
||||
|
||||
;; Multiplication instructions. These are categorized according to their
|
||||
;; reservation behaviour and the need below to distinguish certain
|
||||
;; varieties for bypasses. Results are available at the E5 stage
|
||||
;; (but some of these are multi-cycle instructions which explains the
|
||||
;; latencies below).
|
||||
|
||||
(define_insn_reservation "cortex_a8_mul" 6
|
||||
(and (eq_attr "tune" "cortexa8")
|
||||
(eq_attr "insn" "mul,smulxy,smmul"))
|
||||
"cortex_a8_multiply_2")
|
||||
|
||||
(define_insn_reservation "cortex_a8_mla" 6
|
||||
(and (eq_attr "tune" "cortexa8")
|
||||
(eq_attr "insn" "mla,smlaxy,smlawy,smmla,smlad,smlsd"))
|
||||
"cortex_a8_multiply_2")
|
||||
|
||||
(define_insn_reservation "cortex_a8_mull" 7
|
||||
(and (eq_attr "tune" "cortexa8")
|
||||
(eq_attr "insn" "smull,umull,smlal,umlal,umaal,smlalxy"))
|
||||
"cortex_a8_multiply_3")
|
||||
|
||||
(define_insn_reservation "cortex_a8_smulwy" 5
|
||||
(and (eq_attr "tune" "cortexa8")
|
||||
(eq_attr "insn" "smulwy,smuad,smusd"))
|
||||
"cortex_a8_multiply")
|
||||
|
||||
;; smlald and smlsld are multiply-accumulate instructions but do not
|
||||
;; receive bypassed data from other multiplication results; thus, they
|
||||
;; cannot go in cortex_a8_mla above. (See below for bypass details.)
|
||||
(define_insn_reservation "cortex_a8_smlald" 6
|
||||
(and (eq_attr "tune" "cortexa8")
|
||||
(eq_attr "insn" "smlald,smlsld"))
|
||||
"cortex_a8_multiply_2")
|
||||
|
||||
;; A multiply with a single-register result or an MLA, followed by an
|
||||
;; MLA with an accumulator dependency, has its result forwarded so two
|
||||
;; such instructions can issue back-to-back.
|
||||
(define_bypass 1 "cortex_a8_mul,cortex_a8_mla,cortex_a8_smulwy"
|
||||
"cortex_a8_mla"
|
||||
"arm_mac_accumulator_is_mul_result")
|
||||
|
||||
;; A multiply followed by an ALU instruction needing the multiply
|
||||
;; result only at E2 has lower latency than one needing it at E1.
|
||||
(define_bypass 4 "cortex_a8_mul,cortex_a8_mla,cortex_a8_mull,\
|
||||
cortex_a8_smulwy,cortex_a8_smlald"
|
||||
"cortex_a8_alu")
|
||||
(define_bypass 4 "cortex_a8_mul,cortex_a8_mla,cortex_a8_mull,\
|
||||
cortex_a8_smulwy,cortex_a8_smlald"
|
||||
"cortex_a8_alu_shift"
|
||||
"arm_no_early_alu_shift_dep")
|
||||
(define_bypass 4 "cortex_a8_mul,cortex_a8_mla,cortex_a8_mull,\
|
||||
cortex_a8_smulwy,cortex_a8_smlald"
|
||||
"cortex_a8_alu_shift_reg"
|
||||
"arm_no_early_alu_shift_value_dep")
|
||||
|
||||
;; Load instructions.
|
||||
;; The presence of any register writeback is ignored here.
|
||||
|
||||
;; A load result has latency 3 unless the dependent instruction has
|
||||
;; no early dep, in which case it is only latency two.
|
||||
;; We assume 64-bit alignment for doubleword loads.
|
||||
(define_insn_reservation "cortex_a8_load1_2" 3
|
||||
(and (eq_attr "tune" "cortexa8")
|
||||
(eq_attr "type" "load1,load2,load_byte"))
|
||||
"cortex_a8_load_store_1")
|
||||
|
||||
(define_bypass 2 "cortex_a8_load1_2"
|
||||
"cortex_a8_alu")
|
||||
(define_bypass 2 "cortex_a8_load1_2"
|
||||
"cortex_a8_alu_shift"
|
||||
"arm_no_early_alu_shift_dep")
|
||||
(define_bypass 2 "cortex_a8_load1_2"
|
||||
"cortex_a8_alu_shift_reg"
|
||||
"arm_no_early_alu_shift_value_dep")
|
||||
|
||||
;; We do not currently model the fact that loads with scaled register
|
||||
;; offsets that are not LSL #2 have an extra cycle latency (they issue
|
||||
;; as two micro-ops).
|
||||
|
||||
;; A load multiple of three registers is usually issued as two micro-ops.
|
||||
;; The first register will be available at E3 of the first iteration,
|
||||
;; the second at E3 of the second iteration, and the third at E4 of
|
||||
;; the second iteration. A load multiple of four registers is usually
|
||||
;; issued as two micro-ops.
|
||||
(define_insn_reservation "cortex_a8_load3_4" 5
|
||||
(and (eq_attr "tune" "cortexa8")
|
||||
(eq_attr "type" "load3,load4"))
|
||||
"cortex_a8_load_store_2")
|
||||
|
||||
(define_bypass 4 "cortex_a8_load3_4"
|
||||
"cortex_a8_alu")
|
||||
(define_bypass 4 "cortex_a8_load3_4"
|
||||
"cortex_a8_alu_shift"
|
||||
"arm_no_early_alu_shift_dep")
|
||||
(define_bypass 4 "cortex_a8_load3_4"
|
||||
"cortex_a8_alu_shift_reg"
|
||||
"arm_no_early_alu_shift_value_dep")
|
||||
|
||||
;; Store instructions.
|
||||
;; Writeback is again ignored.
|
||||
|
||||
(define_insn_reservation "cortex_a8_store1_2" 0
|
||||
(and (eq_attr "tune" "cortexa8")
|
||||
(eq_attr "type" "store1,store2"))
|
||||
"cortex_a8_load_store_1")
|
||||
|
||||
(define_insn_reservation "cortex_a8_store3_4" 0
|
||||
(and (eq_attr "tune" "cortexa8")
|
||||
(eq_attr "type" "store3,store4"))
|
||||
"cortex_a8_load_store_2")
|
||||
|
||||
;; An ALU instruction acting as a producer for a store instruction
|
||||
;; that only uses the result as the value to be stored (as opposed to
|
||||
;; using it to calculate the address) has latency zero; the store
|
||||
;; reads the value to be stored at the start of E3 and the ALU insn
|
||||
;; writes it at the end of E2. Move instructions actually produce the
|
||||
;; result at the end of E1, but since we don't have delay slots, the
|
||||
;; scheduling behaviour will be the same.
|
||||
(define_bypass 0 "cortex_a8_alu,cortex_a8_alu_shift,\
|
||||
cortex_a8_alu_shift_reg,cortex_a8_mov"
|
||||
"cortex_a8_store1_2,cortex_a8_store3_4"
|
||||
"arm_no_early_store_addr_dep")
|
||||
|
||||
;; Branch instructions
|
||||
|
||||
(define_insn_reservation "cortex_a8_branch" 0
|
||||
(and (eq_attr "tune" "cortexa8")
|
||||
(eq_attr "type" "branch"))
|
||||
"cortex_a8_branch")
|
||||
|
||||
;; Call latencies are not predictable. A semi-arbitrary very large
|
||||
;; number is used as "positive infinity" so that everything should be
|
||||
;; finished by the time of return.
|
||||
(define_insn_reservation "cortex_a8_call" 32
|
||||
(and (eq_attr "tune" "cortexa8")
|
||||
(eq_attr "type" "call"))
|
||||
"cortex_a8_issue_branch")
|
||||
|
||||
;; NEON (including VFP) instructions.
|
||||
|
||||
(include "cortex-a8-neon.md")
|
||||
|
497
gcc/config/arm/neon-schedgen.ml
Normal file
497
gcc/config/arm/neon-schedgen.ml
Normal file
|
@ -0,0 +1,497 @@
|
|||
(* Emission of the core of the Cortex-A8 NEON scheduling description.
|
||||
Copyright (C) 2007 Free Software Foundation, Inc.
|
||||
Contributed by CodeSourcery.
|
||||
|
||||
This file is part of GCC.
|
||||
|
||||
GCC is free software; you can redistribute it and/or modify it under
|
||||
the terms of the GNU General Public License as published by the Free
|
||||
Software Foundation; either version 2, or (at your option) any later
|
||||
version.
|
||||
|
||||
GCC is distributed in the hope that it will be useful, but WITHOUT ANY
|
||||
WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with GCC; see the file COPYING. If not, write to the Free
|
||||
Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
|
||||
02110-1301, USA.
|
||||
*)
|
||||
|
||||
(* This scheduling description generator works as follows.
|
||||
- Each group of instructions has source and destination requirements
|
||||
specified. The source requirements may be specified using
|
||||
Source (the stage at which all source operands not otherwise
|
||||
described are read), Source_m (the stage at which Rm operands are
|
||||
read), Source_n (likewise for Rn) and Source_d (likewise for Rd).
|
||||
- For each group of instructions the earliest stage where a source
|
||||
operand may be required is calculated.
|
||||
- Each group of instructions is selected in turn as a producer.
|
||||
The latencies between this group and every other group are then
|
||||
calculated, yielding up to four values for each combination:
|
||||
1. Producer -> consumer Rn latency
|
||||
2. Producer -> consumer Rm latency
|
||||
3. Producer -> consumer Rd (as a source) latency
|
||||
4. Producer -> consumer worst-case latency.
|
||||
Value 4 is calculated from the destination availability requirements
|
||||
of the producer and the earliest source availability requirements
|
||||
of the consumer.
|
||||
- The largest Value 4 calculated for the current producer is the
|
||||
worst-case latency, L, for that instruction group. This value is written
|
||||
out in a define_insn_reservation for the producer group.
|
||||
- For each producer and consumer pair, the latencies calculated above
|
||||
are collated. The average (of up to four values) is calculated and
|
||||
if this average is different from the worst-case latency, an
|
||||
unguarded define_bypass construction is issued for that pair.
|
||||
(For each pair only one define_bypass construction will be emitted,
|
||||
and at present we do not emit specific guards.)
|
||||
*)
|
||||
|
||||
open Utils
|
||||
|
||||
(* Integer constants 1 to 9, used as the stage arguments of the
   availability constructors in the table below.  *)
let n1 = 1
let n2 = 2
let n3 = 3
let n4 = 4
let n5 = 5
let n6 = 6
let n7 = 7
let n8 = 8
let n9 = 9
|
||||
|
||||
(* One availability requirement of an instruction group.  The integer
   carried by each constructor is a pipeline stage number.  *)
type availability =
  | Source of int
      (* Stage at which all source operands not otherwise described
         are read.  *)
  | Source_n of int
      (* Stage at which Rn operands are read.  *)
  | Source_m of int
      (* Stage at which Rm operands are read.  *)
  | Source_d of int
      (* Stage at which Rd, used as a source, is read.  *)
  | Dest of int
      (* Stage at the end of which the result is produced.  *)
  | Dest_n_after of int * int
      (* (after, stage): find_dest treats the result as produced at
         after + stage.  *)
|
||||
|
||||
(* Tag attached to a computed latency: Guard_none marks the unguarded
   worst-case figure, the others mark the Rm, Rn and Rd latencies.  *)
type guard =
  | Guard_none
  | Guard_only_m
  | Guard_only_n
  | Guard_only_d
|
||||
|
||||
(* Reservation behaviours.  Each row of constructors below, apart from
   the final row, corresponds to one pipeline.  Every constructor will
   correspond to one define_reservation.  *)
type reservation =
  | Mul | Mul_2cycle | Mul_4cycle
  | Shift | Shift_2cycle
  | ALU | ALU_2cycle
  | Fmul | Fmul_2cycle
  | Fadd | Fadd_2cycle
  (* | VFP *)
  | Permute of int
  | Ls of int
  | Fmul_then_fadd | Fmul_then_fadd_2
|
||||
|
||||
(* This table must be kept as short as possible by conflating
|
||||
entries with the same availability behaviour.
|
||||
|
||||
First components: instruction group names
|
||||
Second components: availability requirements, in the order in which
|
||||
they should appear in the comments in the .md file.
|
||||
Third components: reservation info
|
||||
*)
|
||||
let availability_table = [
|
||||
(* NEON integer ALU instructions. *)
|
||||
(* vbit vbif vbsl vorr vbic vnot vcls vclz vcnt vadd vand vorr
|
||||
veor vbic vorn ddd qqq *)
|
||||
"neon_int_1", [Source n2; Dest n3], ALU;
|
||||
(* vadd vsub qqd vsub ddd qqq *)
|
||||
"neon_int_2", [Source_m n1; Source_n n2; Dest n3], ALU;
|
||||
(* vsum vneg dd qq vadd vsub qdd *)
|
||||
"neon_int_3", [Source n1; Dest n3], ALU;
|
||||
(* vabs vceqz vcgez vcgtz vclez vcltz vadh vradh vsbh vrsbh dqq *)
|
||||
(* vhadd vrhadd vqadd vtst ddd qqq *)
|
||||
"neon_int_4", [Source n2; Dest n4], ALU;
|
||||
(* vabd qdd vhsub vqsub vabd vceq vcge vcgt vmax vmin vfmx vfmn ddd ddd *)
|
||||
"neon_int_5", [Source_m n1; Source_n n2; Dest n4], ALU;
|
||||
(* vqneg vqabs dd qq *)
|
||||
"neon_vqneg_vqabs", [Source n1; Dest n4], ALU;
|
||||
(* vmov vmvn *)
|
||||
"neon_vmov", [Dest n3], ALU;
|
||||
(* vaba *)
|
||||
"neon_vaba", [Source_n n2; Source_m n1; Source_d n3; Dest n6], ALU;
|
||||
"neon_vaba_qqq",
|
||||
[Source_n n2; Source_m n1; Source_d n3; Dest_n_after (1, n6)], ALU_2cycle;
|
||||
(* vsma *)
|
||||
"neon_vsma", [Source_m n1; Source_d n3; Dest n6], ALU;
|
||||
|
||||
(* NEON integer multiply instructions. *)
|
||||
(* vmul, vqdmlh, vqrdmlh *)
|
||||
(* vmul, vqdmul, qdd 16/8 long 32/16 long *)
|
||||
"neon_mul_ddd_8_16_qdd_16_8_long_32_16_long", [Source n2; Dest n6], Mul;
|
||||
"neon_mul_qqq_8_16_32_ddd_32", [Source n2; Dest_n_after (1, n6)], Mul_2cycle;
|
||||
(* vmul, vqdmul again *)
|
||||
"neon_mul_qdd_64_32_long_qqd_16_ddd_32_scalar_64_32_long_scalar",
|
||||
[Source_n n2; Source_m n1; Dest_n_after (1, n6)], Mul_2cycle;
|
||||
(* vmla, vmls *)
|
||||
"neon_mla_ddd_8_16_qdd_16_8_long_32_16_long",
|
||||
[Source_n n2; Source_m n2; Source_d n3; Dest n6], Mul;
|
||||
"neon_mla_qqq_8_16",
|
||||
[Source_n n2; Source_m n2; Source_d n3; Dest_n_after (1, n6)], Mul_2cycle;
|
||||
"neon_mla_ddd_32_qqd_16_ddd_32_scalar_qdd_64_32_long_scalar_qdd_64_32_long",
|
||||
[Source_n n2; Source_m n1; Source_d n3; Dest_n_after (1, n6)], Mul_2cycle;
|
||||
"neon_mla_qqq_32_qqd_32_scalar",
|
||||
[Source_n n2; Source_m n1; Source_d n3; Dest_n_after (3, n6)], Mul_4cycle;
|
||||
(* vmul, vqdmulh, vqrdmulh *)
|
||||
(* vmul, vqdmul *)
|
||||
"neon_mul_ddd_16_scalar_32_16_long_scalar",
|
||||
[Source_n n2; Source_m n1; Dest n6], Mul;
|
||||
"neon_mul_qqd_32_scalar",
|
||||
[Source_n n2; Source_m n1; Dest_n_after (3, n6)], Mul_4cycle;
|
||||
(* vmla, vmls *)
|
||||
(* vmla, vmla, vqdmla, vqdmls *)
|
||||
"neon_mla_ddd_16_scalar_qdd_32_16_long_scalar",
|
||||
[Source_n n2; Source_m n1; Source_d n3; Dest n6], Mul;
|
||||
|
||||
(* NEON integer shift instructions. *)
|
||||
(* vshr/vshl immediate, vshr_narrow, vshl_vmvh, vsli_vsri_ddd *)
|
||||
"neon_shift_1", [Source n1; Dest n3], Shift;
|
||||
(* vqshl, vrshr immediate; vqshr, vqmov, vrshr, vqrshr narrow;
|
||||
vqshl_vrshl_vqrshl_ddd *)
|
||||
"neon_shift_2", [Source n1; Dest n4], Shift;
|
||||
(* vsli, vsri and vshl for qqq *)
|
||||
"neon_shift_3", [Source n1; Dest_n_after (1, n3)], Shift_2cycle;
|
||||
"neon_vshl_ddd", [Source n1; Dest n1], Shift;
|
||||
"neon_vqshl_vrshl_vqrshl_qqq", [Source n1; Dest_n_after (1, n4)],
|
||||
Shift_2cycle;
|
||||
"neon_vsra_vrsra", [Source_m n1; Source_d n3; Dest n6], Shift;
|
||||
|
||||
(* NEON floating-point instructions. *)
|
||||
(* vadd, vsub, vabd, vmul, vceq, vcge, vcgt, vcage, vcagt, vmax, vmin *)
|
||||
(* vabs, vneg, vceqz, vcgez, vcgtz, vclez, vcltz, vrecpe, vrsqrte, vcvt *)
|
||||
"neon_fp_vadd_ddd_vabs_dd", [Source n2; Dest n5], Fadd;
|
||||
"neon_fp_vadd_qqq_vabs_qq", [Source n2; Dest_n_after (1, n5)],
|
||||
Fadd_2cycle;
|
||||
(* vsum, vfmx, vfmn *)
|
||||
"neon_fp_vsum", [Source n1; Dest n5], Fadd;
|
||||
"neon_fp_vmul_ddd", [Source_n n2; Source_m n1; Dest n5], Fmul;
|
||||
"neon_fp_vmul_qqd", [Source_n n2; Source_m n1; Dest_n_after (1, n5)],
|
||||
Fmul_2cycle;
|
||||
(* vmla, vmls *)
|
||||
"neon_fp_vmla_ddd",
|
||||
[Source_n n2; Source_m n2; Source_d n3; Dest n9], Fmul_then_fadd;
|
||||
"neon_fp_vmla_qqq",
|
||||
[Source_n n2; Source_m n2; Source_d n3; Dest_n_after (1, n9)],
|
||||
Fmul_then_fadd_2;
|
||||
"neon_fp_vmla_ddd_scalar",
|
||||
[Source_n n2; Source_m n1; Source_d n3; Dest n9], Fmul_then_fadd;
|
||||
"neon_fp_vmla_qqq_scalar",
|
||||
[Source_n n2; Source_m n1; Source_d n3; Dest_n_after (1, n9)],
|
||||
Fmul_then_fadd_2;
|
||||
"neon_fp_vrecps_vrsqrts_ddd", [Source n2; Dest n9], Fmul_then_fadd;
|
||||
"neon_fp_vrecps_vrsqrts_qqq", [Source n2; Dest_n_after (1, n9)],
|
||||
Fmul_then_fadd_2;
|
||||
|
||||
(* NEON byte permute instructions. *)
|
||||
(* vmov; vtrn and vswp for dd; vzip for dd; vuzp for dd; vrev; vext for dd *)
|
||||
"neon_bp_simple", [Source n1; Dest n2], Permute 1;
|
||||
(* vswp for qq; vext for qqq; vtbl with {Dn} or {Dn, Dn1};
|
||||
similarly for vtbx *)
|
||||
"neon_bp_2cycle", [Source n1; Dest_n_after (1, n2)], Permute 2;
|
||||
(* all the rest *)
|
||||
"neon_bp_3cycle", [Source n1; Dest_n_after (2, n2)], Permute 3;
|
||||
|
||||
(* NEON load/store instructions. *)
|
||||
"neon_ldr", [Dest n1], Ls 1;
|
||||
"neon_str", [Source n1], Ls 1;
|
||||
"neon_vld1_1_2_regs", [Dest_n_after (1, n1)], Ls 2;
|
||||
"neon_vld1_3_4_regs", [Dest_n_after (2, n1)], Ls 3;
|
||||
"neon_vld2_2_regs_vld1_vld2_all_lanes", [Dest_n_after (1, n2)], Ls 2;
|
||||
"neon_vld2_4_regs", [Dest_n_after (2, n2)], Ls 3;
|
||||
"neon_vld3_vld4", [Dest_n_after (3, n2)], Ls 4;
|
||||
"neon_vst1_1_2_regs_vst2_2_regs", [Source n1], Ls 2;
|
||||
"neon_vst1_3_4_regs", [Source n1], Ls 3;
|
||||
"neon_vst2_4_regs_vst3_vst4", [Source n1], Ls 4;
|
||||
"neon_vst3_vst4", [Source n1], Ls 4;
|
||||
"neon_vld1_vld2_lane", [Source n1; Dest_n_after (2, n2)], Ls 3;
|
||||
"neon_vld3_vld4_lane", [Source n1; Dest_n_after (4, n2)], Ls 5;
|
||||
"neon_vst1_vst2_lane", [Source n1], Ls 2;
|
||||
"neon_vst3_vst4_lane", [Source n1], Ls 3;
|
||||
"neon_vld3_vld4_all_lanes", [Dest_n_after (1, n2)], Ls 3;
|
||||
|
||||
(* NEON register transfer instructions. *)
|
||||
"neon_mcr", [Dest n2], Permute 1;
|
||||
"neon_mcr_2_mcrr", [Dest n2], Permute 2;
|
||||
(* MRC instructions are in the .tpl file. *)
|
||||
]
|
||||
|
||||
(* Extend every (name, availability list, reservation) triple with a
   fourth component: the smallest stage named by any Source, Source_n,
   Source_m or Source_d entry of its availability list, or None when
   the list has no source entry at all.  *)
let calculate_sources =
  (* Stage at which a requirement reads a source, if it is one.  *)
  let source_stage = function
    | Source stage | Source_n stage | Source_m stage | Source_d stage ->
        Some stage
    | Dest _ | Dest_n_after _ -> None
  in
  (* Keep the smaller of the running minimum and this requirement.  *)
  let earliest best req =
    match source_stage req, best with
    | None, _ -> best
    | Some stage, None -> Some stage
    | Some stage, Some seen -> if stage < seen then Some stage else best
  in
  List.map (fun (name, avail, res) ->
    (name, avail, res, List.fold_left earliest None avail))
|
||||
|
||||
(* Return the stage, if any, at the end of which a group produces a
   result: Some st for a Dest st entry, Some (after + st) for a
   Dest_n_after (after, st) entry, and None when find_with_result
   raises Not_found.
   NOTE(review): find_with_result is not defined in this part of the
   file (presumably it comes from Utils); this assumes it returns the
   payload of the first entry mapped to Some -- confirm.  *)
let find_dest (_name, avail, _, _) =
  let result_stage = function
    | Dest stage -> Some (Some stage)
    | Dest_n_after (after, stage) -> Some (Some (after + stage))
    | Source _ | Source_n _ | Source_m _ | Source_d _ -> None
  in
  try find_with_result result_stage avail with Not_found -> None
|
||||
|
||||
(* Worst-case latency from a producer to a consumer, computed as
   (producer result stage) - (consumer earliest source stage) + 1.
   The answer is None when the consumer has no source requirements or
   when the producer yields no result (e.g. a store insn).  *)
let worst_case_latency producer (_, _, _, earliest_required) =
  let produced_at = find_dest producer in
  match earliest_required with
  | None ->
      (* The consumer reads no sources.  *)
      None
  | Some consumed ->
      (match produced_at with
       | None ->
           (* The producer writes no result.  *)
           None
       | Some produced -> Some (produced - consumed + 1))
|
||||
|
||||
(* Shared helper for the per-operand latency functions.  F selects the
   stage of one kind of source requirement from the consumer's
   availability list.  The result is None when find_with_result raises
   Not_found, Some 0 when the producer produces no result, and
   otherwise Some of (result stage - source stage + 1), with negative
   values raised to zero because there are no delay slots.  *)
let latency_calc f producer (_, avail, _, _) =
  try
    let read_stage = find_with_result f avail in
    let latency =
      match find_dest producer with
      | None ->
          (* The producer does not produce a result.  *)
          0
      | Some written_stage -> max 0 (written_stage - read_stage + 1)
    in
    Some latency
  with Not_found -> None
|
||||
|
||||
(* Rm latency between a producer and a consumer.  Returns the
   singleton list [(Guard_only_m, latency)] when latency_calc yields a
   latency for the consumer's Source_m requirement, and the empty list
   otherwise.  *)
let get_m_latency producer consumer =
  let rm_stage = function
    | Source_m stage -> Some stage
    | Source _ | Source_n _ | Source_d _ | Dest _ | Dest_n_after _ -> None
  in
  match latency_calc rm_stage producer consumer with
  | Some latency -> [(Guard_only_m, latency)]
  | None -> []
|
||||
|
||||
(* Find any Rn latency between a producer and a consumer.  The result is
   a list: empty when the consumer specifies no explicit Rn source
   requirement, otherwise the single pair [(Guard_only_n, latency)].  *)
let get_n_latency producer consumer =
  let n_stage = function
    | Source_n stage -> Some stage
    | _ -> None
  in
  match latency_calc n_stage producer consumer with
  | Some latency -> [(Guard_only_n, latency)]
  | None -> []
|
||||
|
||||
(* Find any Rd latency between a producer and a consumer.  The result is
   a list: empty when the consumer specifies no explicit Rd source
   requirement, otherwise the single pair [(Guard_only_d, latency)].  *)
let get_d_latency producer consumer =
  let d_stage = function
    | Source_d stage -> Some stage
    | _ -> None
  in
  match latency_calc d_stage producer consumer with
  | Some latency -> [(Guard_only_d, latency)]
  | None -> []
|
||||
|
||||
(* Given a producer and a consumer, work out the latency of the producer
   to the consumer: the unguarded worst case, plus (availability
   information permitting) the latencies guarded on an Rm-only, Rn-only
   or Rd-only dependency.  Returns the triple
   (consumer, worst-case latency, guarded latencies), with the guarded
   latencies listed in the order Rm, Rn, Rd.  *)
let calculate_latencies producer consumer =
  let guarded =
    List.concat
      [ get_m_latency producer consumer;
        get_n_latency producer consumer;
        get_d_latency producer consumer ]
  in
  (consumer, worst_case_latency producer consumer, guarded)
|
||||
|
||||
(* Choose the single bypass latency to use between one producer and one
   consumer.  WORST is the optional unguarded latency and GUARDS the
   list of (guard, latency) pairs.  When WORST is present it joins the
   candidates under [Guard_none].  The chosen latency is the mean of all
   candidate latencies, rounded up.

   Returns [None] when there are no candidates, or when the rounded mean
   equals LARGEST (the producer's overall worst case, which needs no
   bypass).  Otherwise returns [Some (Guard_none, latency)].  *)
let pick_latency largest worst guards =
  let candidates =
    match worst with
    | Some unguarded -> (Guard_none, unguarded) :: guards
    | None -> guards
  in
  match candidates with
  | [] -> None
  | _ ->
    let count = List.length candidates in
    let sum = List.fold_left (fun acc (_, latency) -> acc + latency) 0 candidates in
    let mean = float_of_int sum /. float_of_int count in
    let rounded = int_of_float (ceil mean) in
    if rounded <> largest then Some (Guard_none, rounded) else None
|
||||
|
||||
(* Collate all bypasses for one producer.  LATENCIES holds one
   (consumer, worst, guards) triple per consumer, and LARGEST is the
   producer's overall worst-case latency.

   For each consumer, [pick_latency] decides whether a bypass is needed
   and, if so, its guard and latency; a latency equal to LARGEST yields
   no bypass.  Consumers sharing the same (guard, latency) key are
   gathered in a hash table.  The result has one
   (producer name, joined consumer names, latency, guard) tuple per
   distinct key.  *)
let collate_bypasses (producer_name, _, _, _) largest latencies =
  let by_key = Hashtbl.create 42 in
  (* Distinct keys, most recently discovered first.  *)
  let key_order = ref [] in
  let record ((consumer, _, _, _), worst, guards) =
    match pick_latency largest worst guards with
    | None -> ()
    | Some key ->
      if not (Hashtbl.mem by_key key) then key_order := key :: !key_order;
      (* [Hashtbl.add] keeps earlier bindings, so each key accumulates
         every consumer that shares it.  *)
      Hashtbl.add by_key key consumer
  in
  List.iter record latencies;
  (* Walk the keys, pull out every consumer bound to each one and join
     their names into a single string.  *)
  List.map
    (fun ((guard, latency) as key) ->
       let consumers = Hashtbl.find_all by_key key in
       (producer_name, String.concat ",\\\n " consumers, latency, guard))
    !key_order
|
||||
|
||||
(* For every producer in TABLE, find the worst-case latency between it
   and *any* consumer (every table entry, the producer itself included,
   is treated as a potential consumer).  Also collect, through
   [collate_bypasses], the bypasses from that producer whose latency
   differs from its worst case.

   Returns a pair: the list of
   (name, availability, worst-case latency, reservation) tuples, built
   in reverse table order, and the list of all bypasses.  *)
let worst_case_latencies_and_bypasses table =
  let rec walk (worst_acc, bypasses_acc) seen remaining =
    match remaining with
    | [] -> (worst_acc, bypasses_acc)
    | ((name, avail, res_string, _) as producer) :: rest ->
      (* Latencies from this producer to every entry in the table.  *)
      let latencies =
        List.rev_map (calculate_latencies producer) (seen @ remaining)
      in
      begin match latencies with
        | [] -> assert false
        | _ ->
          (* Overall worst case: the greatest optional latency, where
             [None] orders below every [Some _].  *)
          let best =
            List.fold_left (fun acc (_, worst, _) -> max acc worst)
              None latencies
          in
          let largest =
            match best with
            | None -> 0 (* Producer has no consumers.  *)
            | Some worst -> worst
          in
          (* Bypasses that match LARGEST are dropped by
             [collate_bypasses]; the rest are kept for later emission.  *)
          let bypasses = collate_bypasses producer largest latencies in
          walk
            ((name, avail, largest, res_string) :: worst_acc,
             bypasses @ bypasses_acc)
            (seen @ [producer])
            rest
      end
  in
  walk ([], []) [] table
|
||||
|
||||
(* Emit a helpful comment for a define_insn_reservation.  AVAIL is the
   reservation's availability list; each entry is described in turn on
   standard output, separated by commas with ", and" before the final
   entry, and a line break (continued with ";;") after every second
   separator starting with the first.  The word "read " prefixes only
   the first source operand described.  AVAIL must be non-empty.
   PRODUCER is currently unused.  *)
let write_comment producer avail =
  let seen_source = ref false in
  let describe info =
    (* Only the first source operand in the sentence gets the verb.  *)
    let read = if !seen_source then "" else "read " in
    match info with
      Source stage ->
        seen_source := true;
        Printf.printf "%stheir source operands at N%d" read stage
    | Source_n stage ->
        seen_source := true;
        Printf.printf "%stheir (D|Q)n operands at N%d" read stage
    | Source_m stage ->
        seen_source := true;
        Printf.printf "%stheir (D|Q)m operands at N%d" read stage
    | Source_d stage ->
        (* Record that a source has been described, exactly as the other
           source cases do, so a later source entry does not repeat
           "read ".  *)
        seen_source := true;
        Printf.printf "%stheir (D|Q)d operands at N%d" read stage
    | Dest stage ->
        Printf.printf "produce a result at N%d" stage
    | Dest_n_after (after, stage) ->
        Printf.printf "produce a result at N%d on cycle %d" stage (after + 1)
  in
  Printf.printf ";; Instructions using this reservation ";
  let rec f infos x =
    (* Break the comment line after every even-numbered separator.  *)
    let sep = if x mod 2 = 1 then "" else "\n;;" in
    match infos with
      [] -> assert false
    | [info] -> describe info; Printf.printf ".\n"
    | info::(_::[] as infos) ->
        describe info; Printf.printf ", and%s " sep; f infos (x+1)
    | info::infos -> describe info; Printf.printf ",%s " sep; f infos (x+1)
  in
  f avail 0
|
||||
|
||||
(* Emit a define_insn_reservation for each producer in WORST_CASES, each
   preceded by its explanatory comment.  Every element is a
   (producer, availability, latency, reservation) tuple; the latency
   written out is the producer's worst-case latency, and the reservation
   selects the "cortex_a8_neon_*" unit named in the output.  *)
let emit_insn_reservations worst_cases =
  (* Suffix of the reservation unit's name for each reservation kind.  *)
  let unit_suffix = function
    | Mul | Shift | ALU | Fmul -> "dp"
    | Mul_2cycle | Shift_2cycle | ALU_2cycle | Fmul_2cycle -> "dp_2"
    | Mul_4cycle -> "dp_4"
    | Fadd -> "fadd"
    | Fadd_2cycle -> "fadd_2"
    | Ls 1 -> "ls"
    | Ls n -> "ls_" ^ string_of_int n
    | Permute 1 -> "perm"
    | Permute n -> "perm_" ^ string_of_int n
    | Fmul_then_fadd -> "fmul_then_fadd"
    | Fmul_then_fadd_2 -> "fmul_then_fadd_2"
  in
  let emit_one (producer, avail, latency, reservation) =
    write_comment producer avail;
    Printf.printf "(define_insn_reservation \"%s\" %d\n" producer latency;
    Printf.printf " (and (eq_attr \"tune\" \"cortexa8\")\n";
    Printf.printf " (eq_attr \"neon_type\" \"%s\"))\n" producer;
    Printf.printf " \"cortex_a8_neon_%s\")\n\n" (unit_suffix reservation)
  in
  List.iter emit_one worst_cases
|
||||
|
||||
(* Map a guard description to the name of the C function used as the
   guard in a define_bypass.  [Guard_none] has no guard function, so
   passing it is a programming error and trips an assertion.  *)
let guard_fn = function
  | Guard_only_m -> "arm_neon_only_m_dependency"
  | Guard_only_n -> "arm_neon_only_n_dependency"
  | Guard_only_d -> "arm_neon_only_d_dependency"
  | Guard_none -> assert false
|
||||
|
||||
(* Emit a define_bypass for each (producer, consumers, latency, guard)
   tuple in BYPASSES.  An unguarded bypass closes right after the
   consumer string; a guarded one adds the guard function's name as a
   further operand.  *)
let emit_bypasses bypasses =
  let emit_one (producer, consumers, latency, guard) =
    Printf.printf "(define_bypass %d \"%s\"\n" latency producer;
    match guard with
    | Guard_none -> Printf.printf " \"%s\")\n\n" consumers
    | _ ->
      Printf.printf " \"%s\"\n" consumers;
      Printf.printf " \"%s\")\n\n" (guard_fn guard)
  in
  List.iter emit_one bypasses
|
||||
|
||||
(* Program entry point: derive the per-producer worst-case latencies and
   bypasses from the availability table, then print the insn
   reservations (in table order), a separator comment, and the
   bypasses.  *)
let main =
  let worst_cases, bypasses =
    worst_case_latencies_and_bypasses (calculate_sources availability_table)
  in
  emit_insn_reservations (List.rev worst_cases);
  Printf.printf ";; Exceptions to the default latencies.\n\n";
  emit_bypasses bypasses
|
||||
|
File diff suppressed because it is too large
Load diff
Loading…
Add table
Reference in a new issue