From 6f6ea27d17e9bbc917b94ffea1c933755e736bdc Mon Sep 17 00:00:00 2001 From: mayshao Date: Wed, 19 Jun 2024 16:03:25 +0200 Subject: [PATCH] i386: Zhaoxin shijidadao enablement This patch enables -march/-mtune=shijidadao, costs and tunings are set according to the characteristics of the processor. gcc/ChangeLog: * common/config/i386/cpuinfo.h (get_zhaoxin_cpu): Recognize shijidadao. * common/config/i386/i386-common.cc: Add shijidadao. * common/config/i386/i386-cpuinfo.h (enum processor_subtypes): Add ZHAOXIN_FAM7H_SHIJIDADAO. * config.gcc: Add shijidadao. * config/i386/driver-i386.cc (host_detect_local_cpu): Let -march=native recognize shijidadao processors. * config/i386/i386-c.cc (ix86_target_macros_internal): Add shijidadao. * config/i386/i386-options.cc (m_ZHAOXIN): Add m_SHIJIDADAO. (m_SHIJIDADAO): New definition. * config/i386/i386.h (enum processor_type): Add PROCESSOR_SHIJIDADAO. * config/i386/x86-tune-costs.h (struct processor_costs): Add shijidadao_cost. * config/i386/x86-tune-sched.cc (ix86_issue_rate): Add shijidadao. (ix86_adjust_cost): Ditto. * config/i386/x86-tune.def (X86_TUNE_USE_GATHER_2PARTS): Add m_SHIJIDADAO. (X86_TUNE_USE_GATHER_4PARTS): Ditto. (X86_TUNE_USE_GATHER_8PARTS): Ditto. (X86_TUNE_AVOID_128FMA_CHAINS): Ditto. * doc/extend.texi: Add details about shijidadao. * doc/invoke.texi: Ditto. gcc/testsuite/ChangeLog: * g++.target/i386/mv32.C: Handle new -march * gcc.target/i386/funcspec-56.inc: Ditto. --- gcc/common/config/i386/cpuinfo.h | 8 +- gcc/common/config/i386/i386-common.cc | 8 +- gcc/common/config/i386/i386-cpuinfo.h | 1 + gcc/config.gcc | 14 ++- gcc/config/i386/driver-i386.cc | 11 +- gcc/config/i386/i386-c.cc | 7 ++ gcc/config/i386/i386-options.cc | 4 +- gcc/config/i386/i386.h | 1 + gcc/config/i386/x86-tune-costs.h | 116 ++++++++++++++++++ gcc/config/i386/x86-tune-sched.cc | 2 + gcc/config/i386/x86-tune.def | 8 +- gcc/doc/extend.texi | 3 + gcc/doc/invoke.texi | 6 + gcc/testsuite/g++.target/i386/mv32.C | 6 + gcc/testsuite/gcc.target/i386/funcspec-56.inc | 2 + 15 files changed, 183 insertions(+), 14 deletions(-) diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h index 4610bf6d6a4..936039725ab 100644 --- a/gcc/common/config/i386/cpuinfo.h +++ b/gcc/common/config/i386/cpuinfo.h @@ -667,12 +667,18 @@ get_zhaoxin_cpu (struct __processor_model *cpu_model, reset_cpu_feature (cpu_model, cpu_features2, FEATURE_F16C); cpu_model->__cpu_subtype = ZHAOXIN_FAM7H_LUJIAZUI; } - else if (model >= 0x5b) + else if (model == 0x5b) { cpu = "yongfeng"; CHECK___builtin_cpu_is ("yongfeng"); cpu_model->__cpu_subtype = ZHAOXIN_FAM7H_YONGFENG; } + else if (model >= 0x6b) + { + cpu = "shijidadao"; + CHECK___builtin_cpu_is ("shijidadao"); + cpu_model->__cpu_subtype = ZHAOXIN_FAM7H_SHIJIDADAO; + } break; default: break; diff --git a/gcc/common/config/i386/i386-common.cc b/gcc/common/config/i386/i386-common.cc index 5d9c188c9c7..e38b1b22ffb 100644 --- a/gcc/common/config/i386/i386-common.cc +++ b/gcc/common/config/i386/i386-common.cc @@ -2066,6 +2066,7 @@ const char *const processor_names[] = "intel", "lujiazui", "yongfeng", + "shijidadao", "geode", "k6", "athlon", @@ -2271,10 +2272,13 @@ const pta processor_alias_table[] = | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR, 0, P_NONE}, {"lujiazui", PROCESSOR_LUJIAZUI, CPU_LUJIAZUI, PTA_LUJIAZUI, - M_CPU_SUBTYPE (ZHAOXIN_FAM7H_LUJIAZUI), P_NONE}, + M_CPU_SUBTYPE (ZHAOXIN_FAM7H_LUJIAZUI), P_PROC_BMI}, {"yongfeng", PROCESSOR_YONGFENG, CPU_YONGFENG, PTA_YONGFENG, - M_CPU_SUBTYPE (ZHAOXIN_FAM7H_YONGFENG), P_NONE}, + M_CPU_SUBTYPE (ZHAOXIN_FAM7H_YONGFENG), P_PROC_AVX2}, + {"shijidadao", PROCESSOR_SHIJIDADAO, CPU_YONGFENG, + PTA_YONGFENG, + M_CPU_SUBTYPE (ZHAOXIN_FAM7H_SHIJIDADAO), P_PROC_AVX2}, {"k8", PROCESSOR_K8, CPU_K8, PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR, 0, P_NONE}, diff --git a/gcc/common/config/i386/i386-cpuinfo.h b/gcc/common/config/i386/i386-cpuinfo.h index 3ec9e005a6a..ccc6deb6385 100644 --- a/gcc/common/config/i386/i386-cpuinfo.h +++ b/gcc/common/config/i386/i386-cpuinfo.h @@ -104,6 +104,7 @@ enum processor_subtypes INTEL_COREI7_PANTHERLAKE, ZHAOXIN_FAM7H_YONGFENG, AMDFAM1AH_ZNVER5, + ZHAOXIN_FAM7H_SHIJIDADAO, CPU_SUBTYPE_MAX }; diff --git a/gcc/config.gcc b/gcc/config.gcc index e500ba63e32..644c456290d 100644 --- a/gcc/config.gcc +++ b/gcc/config.gcc @@ -711,9 +711,9 @@ atom slm nehalem westmere sandybridge ivybridge haswell broadwell bonnell \ silvermont skylake-avx512 cannonlake icelake-client icelake-server \ skylake goldmont goldmont-plus tremont cascadelake tigerlake cooperlake \ sapphirerapids alderlake rocketlake eden-x2 nano nano-1000 nano-2000 nano-3000 \ -nano-x2 eden-x4 nano-x4 lujiazui yongfeng x86-64 x86-64-v2 x86-64-v3 x86-64-v4 \ -sierraforest graniterapids graniterapids-d grandridge arrowlake arrowlake-s \ -clearwaterforest pantherlake native" +nano-x2 eden-x4 nano-x4 lujiazui yongfeng shijidadao x86-64 x86-64-v2 \ +x86-64-v3 x86-64-v4 sierraforest graniterapids graniterapids-d grandridge \ +arrowlake arrowlake-s clearwaterforest pantherlake native" # Additional x86 processors supported by --with-cpu=. Each processor # MUST be separated by exactly one space. @@ -3855,6 +3855,10 @@ case ${target} in arch=yongfeng cpu=yongfeng ;; + shijidadao-*) + arch=shijidadao + cpu=shijidadao + ;; pentium2-*) arch=pentium2 cpu=pentium2 @@ -3980,6 +3984,10 @@ case ${target} in arch=yongfeng cpu=yongfeng ;; + shijidadao-*) + arch=shijidadao + cpu=shijidadao + ;; nocona-*) arch=nocona cpu=nocona diff --git a/gcc/config/i386/driver-i386.cc b/gcc/config/i386/driver-i386.cc index 0176d8b6cd2..11470eaea12 100644 --- a/gcc/config/i386/driver-i386.cc +++ b/gcc/config/i386/driver-i386.cc @@ -558,10 +558,12 @@ const char *host_detect_local_cpu (int argc, const char **argv) switch (family) { case 7: - if (model == 0x3b) - processor = PROCESSOR_LUJIAZUI; - else if (model >= 0x5b) + if (model >= 0x6b) + processor = PROCESSOR_SHIJIDADAO; + else if (model == 0x5b) processor = PROCESSOR_YONGFENG; + else if (model == 0x3b) + processor = PROCESSOR_LUJIAZUI; break; default: break; @@ -853,6 +855,9 @@ const char *host_detect_local_cpu (int argc, const char **argv) case PROCESSOR_YONGFENG: cpu = "yongfeng"; break; + case PROCESSOR_SHIJIDADAO: + cpu = "shijidadao"; + break; default: /* Use something reasonable. */ diff --git a/gcc/config/i386/i386-c.cc b/gcc/config/i386/i386-c.cc index 7b0ad9e9181..403475d5b6b 100644 --- a/gcc/config/i386/i386-c.cc +++ b/gcc/config/i386/i386-c.cc @@ -156,6 +156,10 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag, def_or_undef (parse_in, "__yongfeng"); def_or_undef (parse_in, "__yongfeng__"); break; + case PROCESSOR_SHIJIDADAO: + def_or_undef (parse_in, "__shijidadao"); + def_or_undef (parse_in, "__shijidadao__"); + break; case PROCESSOR_PENTIUM4: def_or_undef (parse_in, "__pentium4"); def_or_undef (parse_in, "__pentium4__"); @@ -386,6 +390,9 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag, case PROCESSOR_YONGFENG: def_or_undef (parse_in, "__tune_yongfeng__"); break; + case PROCESSOR_SHIJIDADAO: + def_or_undef (parse_in, "__tune_shijidadao__"); + break; case PROCESSOR_PENTIUM4: def_or_undef (parse_in, "__tune_pentium4__"); break; diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc index f2cecc0e254..65c5bad9c28 100644 --- a/gcc/config/i386/i386-options.cc +++ b/gcc/config/i386/i386-options.cc @@ -155,7 +155,8 @@ along with GCC; see the file COPYING3. If not see #define m_LUJIAZUI (HOST_WIDE_INT_1U<integer move cost is 2. */ + 8, /* cost for loading QImode using movzbl. */ + {8, 8, 8}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). */ + {8, 8, 8}, /* cost of storing integer registers. */ + 2, /* cost of reg,reg fld/fst. */ + {8, 8, 8}, /* cost of loading fp registers + in SFmode, DFmode and XFmode. */ + {8, 8, 8}, /* cost of storing fp registers + in SFmode, DFmode and XFmode. */ + 2, /* cost of moving MMX register. */ + {8, 8}, /* cost of loading MMX registers + in SImode and DImode. */ + {8, 8}, /* cost of storing MMX registers + in SImode and DImode. */ + 2, 3, 4, /* cost of moving XMM,YMM,ZMM register. */ + {8, 8, 8, 10, 15}, /* cost of loading SSE registers + in 32,64,128,256 and 512-bit. */ + {8, 8, 8, 10, 15}, /* cost of storing SSE registers + in 32,64,128,256 and 512-bit. */ + 8, 8, /* SSE->integer and integer->SSE moves. */ + 8, 8, /* mask->integer and integer->mask moves. */ + {8, 8, 8}, /* cost of loading mask register + in QImode, HImode, SImode. */ + {8, 8, 8}, /* cost if storing mask register + in QImode, HImode, SImode. */ + 2, /* cost of moving mask register. */ + /* End of register allocator costs. */ + }, + + COSTS_N_INSNS (1), /* cost of an add instruction. */ + COSTS_N_INSNS (1), /* cost of a lea instruction. */ + COSTS_N_INSNS (1), /* variable shift costs. */ + COSTS_N_INSNS (1), /* constant shift costs. */ + {COSTS_N_INSNS (2), /* cost of starting multiply for QI. */ + COSTS_N_INSNS (3), /* HI. */ + COSTS_N_INSNS (2), /* SI. */ + COSTS_N_INSNS (2), /* DI. */ + COSTS_N_INSNS (3)}, /* other. */ + 0, /* cost of multiply per each bit set. */ + {COSTS_N_INSNS (9), /* cost of a divide/mod for QI. */ + COSTS_N_INSNS (10), /* HI. */ + COSTS_N_INSNS (9), /* SI. */ + COSTS_N_INSNS (50), /* DI. */ + COSTS_N_INSNS (50)}, /* other. */ + COSTS_N_INSNS (1), /* cost of movsx. */ + COSTS_N_INSNS (1), /* cost of movzx. */ + 8, /* "large" insn. */ + 17, /* MOVE_RATIO. */ + 6, /* CLEAR_RATIO. */ + {8, 8, 8}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). */ + {8, 8, 8}, /* cost of storing integer registers. */ + {8, 8, 8, 12, 15}, /* cost of loading SSE register + in 32bit, 64bit, 128bit, 256bit and 512bit. */ + {8, 8, 8, 12, 15}, /* cost of storing SSE register + in 32bit, 64bit, 128bit, 256bit and 512bit. */ + {8, 8, 8, 12, 15}, /* cost of unaligned loads. */ + {8, 8, 8, 12, 15}, /* cost of unaligned storess. */ + 2, 3, 4, /* cost of moving XMM,YMM,ZMM register. */ + 8, /* cost of moving SSE register to integer. */ + 18, 6, /* Gather load static, per_elt. */ + 18, 6, /* Gather store static, per_elt. */ + 32, /* size of l1 cache. */ + 256, /* size of l2 cache. */ + 64, /* size of prefetch block. */ + 12, /* number of parallel prefetches. */ + 3, /* Branch cost. */ + COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */ + COSTS_N_INSNS (3), /* cost of FMUL instruction. */ + COSTS_N_INSNS (13), /* cost of FDIV instruction. */ + COSTS_N_INSNS (2), /* cost of FABS instruction. */ + COSTS_N_INSNS (2), /* cost of FCHS instruction. */ + COSTS_N_INSNS (44), /* cost of FSQRT instruction. */ + + COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ + COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ + COSTS_N_INSNS (3), /* cost of MULSS instruction. */ + COSTS_N_INSNS (3), /* cost of MULSD instruction. */ + COSTS_N_INSNS (5), /* cost of FMA SS instruction. */ + COSTS_N_INSNS (5), /* cost of FMA SD instruction. */ + COSTS_N_INSNS (11), /* cost of DIVSS instruction. */ + COSTS_N_INSNS (14), /* cost of DIVSD instruction. */ + COSTS_N_INSNS (11), /* cost of SQRTSS instruction. */ + COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */ + 4, 4, 4, 4, /* reassoc int, fp, vec_int, vec_fp. */ + shijidadao_memcpy, + shijidadao_memset, + COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ + COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ + "16:11:8", /* Loop alignment. */ + "16:11:8", /* Jump alignment. */ + "0:0:8", /* Label alignment. */ + "16", /* Func alignment. */ + 4, /* Small unroll limit. */ + 2, /* Small unroll factor. */ +}; + + /* Generic should produce code tuned for Core-i7 (and newer chips) and btver1 (and newer chips). */ diff --git a/gcc/config/i386/x86-tune-sched.cc b/gcc/config/i386/x86-tune-sched.cc index f70846e628e..d77298b0e34 100644 --- a/gcc/config/i386/x86-tune-sched.cc +++ b/gcc/config/i386/x86-tune-sched.cc @@ -79,6 +79,7 @@ ix86_issue_rate (void) case PROCESSOR_CANNONLAKE: case PROCESSOR_ALDERLAKE: case PROCESSOR_YONGFENG: + case PROCESSOR_SHIJIDADAO: case PROCESSOR_GENERIC: return 4; @@ -446,6 +447,7 @@ ix86_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn *dep_insn, int cost, break; case PROCESSOR_YONGFENG: + case PROCESSOR_SHIJIDADAO: /* Stack engine allows to execute push&pop instructions in parallel. */ if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP) && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP)) diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def index 66512992b7b..343c32c291f 100644 --- a/gcc/config/i386/x86-tune.def +++ b/gcc/config/i386/x86-tune.def @@ -477,7 +477,7 @@ DEF_TUNE (X86_TUNE_AVOID_4BYTE_PREFIXES, "avoid_4byte_prefixes", elements. */ DEF_TUNE (X86_TUNE_USE_GATHER_2PARTS, "use_gather_2parts", ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_CORE_HYBRID - | m_YONGFENG | m_CORE_ATOM | m_GENERIC | m_GDS)) + | m_YONGFENG | m_SHIJIDADAO | m_CORE_ATOM | m_GENERIC | m_GDS)) /* X86_TUNE_USE_SCATTER_2PARTS: Use scater instructions for vectors with 2 elements. */ @@ -488,7 +488,7 @@ DEF_TUNE (X86_TUNE_USE_SCATTER_2PARTS, "use_scatter_2parts", elements. */ DEF_TUNE (X86_TUNE_USE_GATHER_4PARTS, "use_gather_4parts", ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_CORE_HYBRID - | m_YONGFENG | m_CORE_ATOM | m_GENERIC | m_GDS)) + | m_YONGFENG | m_SHIJIDADAO | m_CORE_ATOM | m_GENERIC | m_GDS)) /* X86_TUNE_USE_SCATTER_4PARTS: Use scater instructions for vectors with 4 elements. */ @@ -499,7 +499,7 @@ DEF_TUNE (X86_TUNE_USE_SCATTER_4PARTS, "use_scatter_4parts", elements. */ DEF_TUNE (X86_TUNE_USE_GATHER_8PARTS, "use_gather_8parts", ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER4 | m_CORE_HYBRID | m_CORE_ATOM - | m_YONGFENG | m_GENERIC | m_GDS)) + | m_YONGFENG | m_SHIJIDADAO | m_GENERIC | m_GDS)) /* X86_TUNE_USE_SCATTER: Use scater instructions for vectors with 8 or more elements. */ @@ -509,7 +509,7 @@ DEF_TUNE (X86_TUNE_USE_SCATTER_8PARTS, "use_scatter_8parts", /* X86_TUNE_AVOID_128FMA_CHAINS: Avoid creating loops with tight 128bit or smaller FMA chain. */ DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 - | m_YONGFENG | m_GENERIC) + | m_YONGFENG | m_SHIJIDADAO | m_GENERIC) /* X86_TUNE_AVOID_256FMA_CHAINS: Avoid creating loops with tight 256bit or smaller FMA chain. */ diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi index 173cdef0131..b2e41a581dd 100644 --- a/gcc/doc/extend.texi +++ b/gcc/doc/extend.texi @@ -26245,6 +26245,9 @@ ZHAOXIN lujiazui CPU. @item yongfeng ZHAOXIN yongfeng CPU. +@item shijidadao +ZHAOXIN shijidadao CPU. + @item amdfam10h AMD Family 10h CPU. diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index 5d7a87fde86..c790e2f3518 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -34873,6 +34873,12 @@ SSE4.2, AVX, POPCNT, AES, PCLMUL, RDRND, XSAVE, XSAVEOPT, FSGSBASE, CX16, ABM, BMI, BMI2, F16C, FXSR, RDSEED, AVX2, FMA, SHA, LZCNT instruction set support. +@item shijidadao +ZHAOXIN shijidadao CPU with x86-64, MOVBE, MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, +SSE4.2, AVX, POPCNT, AES, PCLMUL, RDRND, XSAVE, XSAVEOPT, FSGSBASE, CX16, +ABM, BMI, BMI2, F16C, FXSR, RDSEED, AVX2, FMA, SHA, LZCNT +instruction set support. + @item geode AMD Geode embedded processor with MMX and 3DNow!@: instruction set support. @end table diff --git a/gcc/testsuite/g++.target/i386/mv32.C b/gcc/testsuite/g++.target/i386/mv32.C index 6c993218d01..b311c35baa3 100644 --- a/gcc/testsuite/g++.target/i386/mv32.C +++ b/gcc/testsuite/g++.target/i386/mv32.C @@ -21,6 +21,10 @@ int __attribute__ ((target("arch=yongfeng"))) foo () { return 2; } +int __attribute__ ((target("arch=shijidadao"))) foo () { + return 3; +} + int main () { int val = foo (); @@ -29,6 +33,8 @@ int main () assert (val == 1); else if (__builtin_cpu_is ("yongfeng")) assert (val == 2); + else if (__builtin_cpu_is ("shijidadao")) + assert (val == 3); else assert (val == 0); diff --git a/gcc/testsuite/gcc.target/i386/funcspec-56.inc b/gcc/testsuite/gcc.target/i386/funcspec-56.inc index 2a50f5bf67c..c4dc89367ef 100644 --- a/gcc/testsuite/gcc.target/i386/funcspec-56.inc +++ b/gcc/testsuite/gcc.target/i386/funcspec-56.inc @@ -208,6 +208,7 @@ extern void test_arch_arrowlake_s (void) __attribute__((__target__("arch=arrowla extern void test_arch_pantherlake (void) __attribute__((__target__("arch=pantherlake"))); extern void test_arch_lujiazui (void) __attribute__((__target__("arch=lujiazui"))); extern void test_arch_yongfeng (void) __attribute__((__target__("arch=yongfeng"))); +extern void test_arch_shijidadao (void) __attribute__((__target__("arch=shijidadao"))); extern void test_arch_k8 (void) __attribute__((__target__("arch=k8"))); extern void test_arch_k8_sse3 (void) __attribute__((__target__("arch=k8-sse3"))); extern void test_arch_opteron (void) __attribute__((__target__("arch=opteron"))); @@ -233,6 +234,7 @@ extern void test_tune_corei7_avx (void) __attribute__((__target__("tune=corei7- extern void test_tune_core_avx2 (void) __attribute__((__target__("tune=core-avx2"))); extern void test_tune_lujiazui (void) __attribute__((__target__("tune=lujiazui"))); extern void test_tune_yongfeng (void) __attribute__((__target__("tune=yongfeng"))); +extern void test_tune_shijidadao (void) __attribute__((__target__("tune=shijidadao"))); extern void test_tune_k8 (void) __attribute__((__target__("tune=k8"))); extern void test_tune_k8_sse3 (void) __attribute__((__target__("tune=k8-sse3"))); extern void test_tune_opteron (void) __attribute__((__target__("tune=opteron")));