i386: Zhaoxin shijidadao enablement

This patch enables -march/-mtune=shijidadao.  Costs and tunings are set
according to the characteristics of the processor.

gcc/ChangeLog:

	* common/config/i386/cpuinfo.h (get_zhaoxin_cpu): Recognize shijidadao.
	* common/config/i386/i386-common.cc: Add shijidadao.
	* common/config/i386/i386-cpuinfo.h (enum processor_subtypes):
	Add ZHAOXIN_FAM7H_SHIJIDADAO.
	* config.gcc: Add shijidadao.
	* config/i386/driver-i386.cc (host_detect_local_cpu):
	Let -march=native recognize shijidadao processors.
	* config/i386/i386-c.cc (ix86_target_macros_internal): Add shijidadao.
	* config/i386/i386-options.cc (m_ZHAOXIN): Add m_SHIJIDADAO.
	(m_SHIJIDADAO): New definition.
	* config/i386/i386.h (enum processor_type): Add PROCESSOR_SHIJIDADAO.
	* config/i386/x86-tune-costs.h (struct processor_costs):
	Add shijidadao_cost.
	* config/i386/x86-tune-sched.cc (ix86_issue_rate): Add shijidadao.
	(ix86_adjust_cost): Ditto.
	* config/i386/x86-tune.def (X86_TUNE_USE_GATHER_2PARTS): Add m_SHIJIDADAO.
	(X86_TUNE_USE_GATHER_4PARTS): Ditto.
	(X86_TUNE_USE_GATHER_8PARTS): Ditto.
	(X86_TUNE_AVOID_128FMA_CHAINS): Ditto.
	* doc/extend.texi: Add details about shijidadao.
	* doc/invoke.texi: Ditto.

gcc/testsuite/ChangeLog:

	* g++.target/i386/mv32.C: Handle new -march.
	* gcc.target/i386/funcspec-56.inc: Ditto.
This commit is contained in:
mayshao 2024-06-19 16:03:25 +02:00 committed by Uros Bizjak
parent 0982552bc4
commit 6f6ea27d17
15 changed files with 183 additions and 14 deletions

View file

@ -667,12 +667,18 @@ get_zhaoxin_cpu (struct __processor_model *cpu_model,
reset_cpu_feature (cpu_model, cpu_features2, FEATURE_F16C);
cpu_model->__cpu_subtype = ZHAOXIN_FAM7H_LUJIAZUI;
}
else if (model >= 0x5b)
else if (model == 0x5b)
{
cpu = "yongfeng";
CHECK___builtin_cpu_is ("yongfeng");
cpu_model->__cpu_subtype = ZHAOXIN_FAM7H_YONGFENG;
}
else if (model >= 0x6b)
{
cpu = "shijidadao";
CHECK___builtin_cpu_is ("shijidadao");
cpu_model->__cpu_subtype = ZHAOXIN_FAM7H_SHIJIDADAO;
}
break;
default:
break;

View file

@ -2066,6 +2066,7 @@ const char *const processor_names[] =
"intel",
"lujiazui",
"yongfeng",
"shijidadao",
"geode",
"k6",
"athlon",
@ -2271,10 +2272,13 @@ const pta processor_alias_table[] =
| PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR, 0, P_NONE},
{"lujiazui", PROCESSOR_LUJIAZUI, CPU_LUJIAZUI,
PTA_LUJIAZUI,
M_CPU_SUBTYPE (ZHAOXIN_FAM7H_LUJIAZUI), P_NONE},
M_CPU_SUBTYPE (ZHAOXIN_FAM7H_LUJIAZUI), P_PROC_BMI},
{"yongfeng", PROCESSOR_YONGFENG, CPU_YONGFENG,
PTA_YONGFENG,
M_CPU_SUBTYPE (ZHAOXIN_FAM7H_YONGFENG), P_NONE},
M_CPU_SUBTYPE (ZHAOXIN_FAM7H_YONGFENG), P_PROC_AVX2},
{"shijidadao", PROCESSOR_SHIJIDADAO, CPU_YONGFENG,
PTA_YONGFENG,
M_CPU_SUBTYPE (ZHAOXIN_FAM7H_SHIJIDADAO), P_PROC_AVX2},
{"k8", PROCESSOR_K8, CPU_K8,
PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
| PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR, 0, P_NONE},

View file

@ -104,6 +104,7 @@ enum processor_subtypes
INTEL_COREI7_PANTHERLAKE,
ZHAOXIN_FAM7H_YONGFENG,
AMDFAM1AH_ZNVER5,
ZHAOXIN_FAM7H_SHIJIDADAO,
CPU_SUBTYPE_MAX
};

View file

@ -711,9 +711,9 @@ atom slm nehalem westmere sandybridge ivybridge haswell broadwell bonnell \
silvermont skylake-avx512 cannonlake icelake-client icelake-server \
skylake goldmont goldmont-plus tremont cascadelake tigerlake cooperlake \
sapphirerapids alderlake rocketlake eden-x2 nano nano-1000 nano-2000 nano-3000 \
nano-x2 eden-x4 nano-x4 lujiazui yongfeng x86-64 x86-64-v2 x86-64-v3 x86-64-v4 \
sierraforest graniterapids graniterapids-d grandridge arrowlake arrowlake-s \
clearwaterforest pantherlake native"
nano-x2 eden-x4 nano-x4 lujiazui yongfeng shijidadao x86-64 x86-64-v2 \
x86-64-v3 x86-64-v4 sierraforest graniterapids graniterapids-d grandridge \
arrowlake arrowlake-s clearwaterforest pantherlake native"
# Additional x86 processors supported by --with-cpu=. Each processor
# MUST be separated by exactly one space.
@ -3855,6 +3855,10 @@ case ${target} in
arch=yongfeng
cpu=yongfeng
;;
shijidadao-*)
arch=shijidadao
cpu=shijidadao
;;
pentium2-*)
arch=pentium2
cpu=pentium2
@ -3980,6 +3984,10 @@ case ${target} in
arch=yongfeng
cpu=yongfeng
;;
shijidadao-*)
arch=shijidadao
cpu=shijidadao
;;
nocona-*)
arch=nocona
cpu=nocona

View file

@ -558,10 +558,12 @@ const char *host_detect_local_cpu (int argc, const char **argv)
switch (family)
{
case 7:
if (model == 0x3b)
processor = PROCESSOR_LUJIAZUI;
else if (model >= 0x5b)
if (model >= 0x6b)
processor = PROCESSOR_SHIJIDADAO;
else if (model == 0x5b)
processor = PROCESSOR_YONGFENG;
else if (model == 0x3b)
processor = PROCESSOR_LUJIAZUI;
break;
default:
break;
@ -853,6 +855,9 @@ const char *host_detect_local_cpu (int argc, const char **argv)
case PROCESSOR_YONGFENG:
cpu = "yongfeng";
break;
case PROCESSOR_SHIJIDADAO:
cpu = "shijidadao";
break;
default:
/* Use something reasonable. */

View file

@ -156,6 +156,10 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag,
def_or_undef (parse_in, "__yongfeng");
def_or_undef (parse_in, "__yongfeng__");
break;
case PROCESSOR_SHIJIDADAO:
def_or_undef (parse_in, "__shijidadao");
def_or_undef (parse_in, "__shijidadao__");
break;
case PROCESSOR_PENTIUM4:
def_or_undef (parse_in, "__pentium4");
def_or_undef (parse_in, "__pentium4__");
@ -386,6 +390,9 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag,
case PROCESSOR_YONGFENG:
def_or_undef (parse_in, "__tune_yongfeng__");
break;
case PROCESSOR_SHIJIDADAO:
def_or_undef (parse_in, "__tune_shijidadao__");
break;
case PROCESSOR_PENTIUM4:
def_or_undef (parse_in, "__tune_pentium4__");
break;

View file

@ -155,7 +155,8 @@ along with GCC; see the file COPYING3. If not see
#define m_LUJIAZUI (HOST_WIDE_INT_1U<<PROCESSOR_LUJIAZUI)
#define m_YONGFENG (HOST_WIDE_INT_1U<<PROCESSOR_YONGFENG)
#define m_ZHAOXIN (m_LUJIAZUI | m_YONGFENG)
#define m_SHIJIDADAO (HOST_WIDE_INT_1U<<PROCESSOR_SHIJIDADAO)
#define m_ZHAOXIN (m_LUJIAZUI | m_YONGFENG | m_SHIJIDADAO)
#define m_GEODE (HOST_WIDE_INT_1U<<PROCESSOR_GEODE)
#define m_K6 (HOST_WIDE_INT_1U<<PROCESSOR_K6)
@ -793,6 +794,7 @@ static const struct processor_costs *processor_cost_table[] =
&intel_cost,
&lujiazui_cost,
&yongfeng_cost,
&shijidadao_cost,
&geode_cost,
&k6_cost,
&athlon_cost,

View file

@ -2304,6 +2304,7 @@ enum processor_type
PROCESSOR_INTEL,
PROCESSOR_LUJIAZUI,
PROCESSOR_YONGFENG,
PROCESSOR_SHIJIDADAO,
PROCESSOR_GEODE,
PROCESSOR_K6,
PROCESSOR_ATHLON,

View file

@ -3644,6 +3644,122 @@ struct processor_costs yongfeng_cost = {
2, /* Small unroll factor. */
};
/* shijidadao_cost should produce code tuned for ZHAOXIN shijidadao CPU. */
/* String-operation strategy table that shijidadao_cost uses for its
memcpy entry.  It holds two strategy sets; each one names a leading
algorithm followed by a list of {size, algorithm, flag} triples that
ends with a -1 size entry.
NOTE(review): presumably element 0 applies to 32-bit code and element 1
to 64-bit code, and the leading algorithm is the choice for unknown
sizes -- confirm against the stringop_algs definition. */
static stringop_algs shijidadao_memcpy[2] = {
{libcall, {{8, unrolled_loop, true}, {256, unrolled_loop, false},
{-1, libcall, false}}},
{libcall, {{10, loop, true}, {256, unrolled_loop, false},
{-1, libcall, false}}}};
/* String-operation strategy table that shijidadao_cost uses for its
memset entry.  It holds two strategy sets; each one names a leading
algorithm followed by a list of {size, algorithm, flag} triples that
ends with a -1 size entry.  The second set has one more size step than
the first (rep_prefix_4_byte, loop, vector_loop, then libcall).
NOTE(review): presumably element 0 applies to 32-bit code and element 1
to 64-bit code -- confirm against the stringop_algs definition. */
static stringop_algs shijidadao_memset[2] = {
{libcall, {{4, loop, true}, {128, unrolled_loop, false},
{-1, libcall, false}}},
{libcall, {{1, rep_prefix_4_byte, false}, {14, loop, true},
{1024, vector_loop, false},
{-1, libcall, false}}}};
/* Cost table for the ZHAOXIN shijidadao tuning.  The initializer is
positional: a nested group of register allocator costs comes first,
followed by instruction costs, memory and cache parameters, the
shijidadao_memcpy / shijidadao_memset string-operation strategies, and
finally branch costs and alignment strings.  Its address is stored in
processor_cost_table as &shijidadao_cost. */
static const
struct processor_costs shijidadao_cost = {
{
/* Start of register allocator costs. integer->integer move cost is 2. */
8, /* cost for loading QImode using movzbl. */
{8, 8, 8}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{8, 8, 8}, /* cost of storing integer registers. */
2, /* cost of reg,reg fld/fst. */
{8, 8, 8}, /* cost of loading fp registers
in SFmode, DFmode and XFmode. */
{8, 8, 8}, /* cost of storing fp registers
in SFmode, DFmode and XFmode. */
2, /* cost of moving MMX register. */
{8, 8}, /* cost of loading MMX registers
in SImode and DImode. */
{8, 8}, /* cost of storing MMX registers
in SImode and DImode. */
2, 3, 4, /* cost of moving XMM,YMM,ZMM register. */
{8, 8, 8, 10, 15}, /* cost of loading SSE registers
in 32,64,128,256 and 512-bit. */
{8, 8, 8, 10, 15}, /* cost of storing SSE registers
in 32,64,128,256 and 512-bit. */
8, 8, /* SSE->integer and integer->SSE moves. */
8, 8, /* mask->integer and integer->mask moves. */
{8, 8, 8}, /* cost of loading mask register
in QImode, HImode, SImode. */
{8, 8, 8}, /* cost of storing mask register
in QImode, HImode, SImode. */
2, /* cost of moving mask register. */
/* End of register allocator costs. */
},
COSTS_N_INSNS (1), /* cost of an add instruction. */
COSTS_N_INSNS (1), /* cost of a lea instruction. */
COSTS_N_INSNS (1), /* variable shift costs. */
COSTS_N_INSNS (1), /* constant shift costs. */
{COSTS_N_INSNS (2), /* cost of starting multiply for QI. */
COSTS_N_INSNS (3), /* HI. */
COSTS_N_INSNS (2), /* SI. */
COSTS_N_INSNS (2), /* DI. */
COSTS_N_INSNS (3)}, /* other. */
0, /* cost of multiply per each bit set. */
{COSTS_N_INSNS (9), /* cost of a divide/mod for QI. */
COSTS_N_INSNS (10), /* HI. */
COSTS_N_INSNS (9), /* SI. */
COSTS_N_INSNS (50), /* DI. */
COSTS_N_INSNS (50)}, /* other. */
COSTS_N_INSNS (1), /* cost of movsx. */
COSTS_N_INSNS (1), /* cost of movzx. */
8, /* "large" insn. */
17, /* MOVE_RATIO. */
6, /* CLEAR_RATIO. */
{8, 8, 8}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{8, 8, 8}, /* cost of storing integer registers. */
{8, 8, 8, 12, 15}, /* cost of loading SSE register
in 32bit, 64bit, 128bit, 256bit and 512bit. */
{8, 8, 8, 12, 15}, /* cost of storing SSE register
in 32bit, 64bit, 128bit, 256bit and 512bit. */
{8, 8, 8, 12, 15}, /* cost of unaligned loads. */
{8, 8, 8, 12, 15}, /* cost of unaligned stores. */
2, 3, 4, /* cost of moving XMM,YMM,ZMM register. */
8, /* cost of moving SSE register to integer. */
18, 6, /* Gather load static, per_elt. */
18, 6, /* Gather store static, per_elt. */
32, /* size of l1 cache. */
256, /* size of l2 cache. */
64, /* size of prefetch block. */
12, /* number of parallel prefetches. */
3, /* Branch cost. */
COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
COSTS_N_INSNS (3), /* cost of FMUL instruction. */
COSTS_N_INSNS (13), /* cost of FDIV instruction. */
COSTS_N_INSNS (2), /* cost of FABS instruction. */
COSTS_N_INSNS (2), /* cost of FCHS instruction. */
COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
COSTS_N_INSNS (3), /* cost of MULSS instruction. */
COSTS_N_INSNS (3), /* cost of MULSD instruction. */
COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
COSTS_N_INSNS (11), /* cost of DIVSS instruction. */
COSTS_N_INSNS (14), /* cost of DIVSD instruction. */
COSTS_N_INSNS (11), /* cost of SQRTSS instruction. */
COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */
4, 4, 4, 4, /* reassoc int, fp, vec_int, vec_fp. */
shijidadao_memcpy,
shijidadao_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
"16:11:8", /* Loop alignment. */
"16:11:8", /* Jump alignment. */
"0:0:8", /* Label alignment. */
"16", /* Func alignment. */
4, /* Small unroll limit. */
2, /* Small unroll factor. */
};
/* Generic should produce code tuned for Core-i7 (and newer chips)
and btver1 (and newer chips). */

View file

@ -79,6 +79,7 @@ ix86_issue_rate (void)
case PROCESSOR_CANNONLAKE:
case PROCESSOR_ALDERLAKE:
case PROCESSOR_YONGFENG:
case PROCESSOR_SHIJIDADAO:
case PROCESSOR_GENERIC:
return 4;
@ -446,6 +447,7 @@ ix86_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn *dep_insn, int cost,
break;
case PROCESSOR_YONGFENG:
case PROCESSOR_SHIJIDADAO:
/* Stack engine allows to execute push&pop instructions in parallel. */
if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
&& (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))

View file

@ -477,7 +477,7 @@ DEF_TUNE (X86_TUNE_AVOID_4BYTE_PREFIXES, "avoid_4byte_prefixes",
elements. */
DEF_TUNE (X86_TUNE_USE_GATHER_2PARTS, "use_gather_2parts",
~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_CORE_HYBRID
| m_YONGFENG | m_CORE_ATOM | m_GENERIC | m_GDS))
| m_YONGFENG | m_SHIJIDADAO | m_CORE_ATOM | m_GENERIC | m_GDS))
/* X86_TUNE_USE_SCATTER_2PARTS: Use scatter instructions for vectors with 2
elements. */
@ -488,7 +488,7 @@ DEF_TUNE (X86_TUNE_USE_SCATTER_2PARTS, "use_scatter_2parts",
elements. */
DEF_TUNE (X86_TUNE_USE_GATHER_4PARTS, "use_gather_4parts",
~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_CORE_HYBRID
| m_YONGFENG | m_CORE_ATOM | m_GENERIC | m_GDS))
| m_YONGFENG | m_SHIJIDADAO | m_CORE_ATOM | m_GENERIC | m_GDS))
/* X86_TUNE_USE_SCATTER_4PARTS: Use scatter instructions for vectors with 4
elements. */
@ -499,7 +499,7 @@ DEF_TUNE (X86_TUNE_USE_SCATTER_4PARTS, "use_scatter_4parts",
elements. */
DEF_TUNE (X86_TUNE_USE_GATHER_8PARTS, "use_gather_8parts",
~(m_ZNVER1 | m_ZNVER2 | m_ZNVER4 | m_CORE_HYBRID | m_CORE_ATOM
| m_YONGFENG | m_GENERIC | m_GDS))
| m_YONGFENG | m_SHIJIDADAO | m_GENERIC | m_GDS))
/* X86_TUNE_USE_SCATTER_8PARTS: Use scatter instructions for vectors with 8 or more
elements. */
@ -509,7 +509,7 @@ DEF_TUNE (X86_TUNE_USE_SCATTER_8PARTS, "use_scatter_8parts",
/* X86_TUNE_AVOID_128FMA_CHAINS: Avoid creating loops with tight 128bit or
smaller FMA chain. */
DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4
| m_YONGFENG | m_GENERIC)
| m_YONGFENG | m_SHIJIDADAO | m_GENERIC)
/* X86_TUNE_AVOID_256FMA_CHAINS: Avoid creating loops with tight 256bit or
smaller FMA chain. */

View file

@ -26245,6 +26245,9 @@ ZHAOXIN lujiazui CPU.
@item yongfeng
ZHAOXIN yongfeng CPU.
@item shijidadao
ZHAOXIN shijidadao CPU.
@item amdfam10h
AMD Family 10h CPU.

View file

@ -34873,6 +34873,12 @@ SSE4.2, AVX, POPCNT, AES, PCLMUL, RDRND, XSAVE, XSAVEOPT, FSGSBASE, CX16,
ABM, BMI, BMI2, F16C, FXSR, RDSEED, AVX2, FMA, SHA, LZCNT
instruction set support.
@item shijidadao
ZHAOXIN shijidadao CPU with x86-64, MOVBE, MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1,
SSE4.2, AVX, POPCNT, AES, PCLMUL, RDRND, XSAVE, XSAVEOPT, FSGSBASE, CX16,
ABM, BMI, BMI2, F16C, FXSR, RDSEED, AVX2, FMA, SHA, LZCNT
instruction set support.
@item geode
AMD Geode embedded processor with MMX and 3DNow!@: instruction set support.
@end table

View file

@ -21,6 +21,10 @@ int __attribute__ ((target("arch=yongfeng"))) foo () {
return 2;
}
/* Version of foo carrying the target("arch=shijidadao") attribute.
   It returns 3, the value main asserts when
   __builtin_cpu_is ("shijidadao") is true.  */
int __attribute__ ((target("arch=shijidadao"))) foo () {
return 3;
}
int main ()
{
int val = foo ();
@ -29,6 +33,8 @@ int main ()
assert (val == 1);
else if (__builtin_cpu_is ("yongfeng"))
assert (val == 2);
else if (__builtin_cpu_is ("shijidadao"))
assert (val == 3);
else
assert (val == 0);

View file

@ -208,6 +208,7 @@ extern void test_arch_arrowlake_s (void) __attribute__((__target__("arch=arrowla
extern void test_arch_pantherlake (void) __attribute__((__target__("arch=pantherlake")));
extern void test_arch_lujiazui (void) __attribute__((__target__("arch=lujiazui")));
extern void test_arch_yongfeng (void) __attribute__((__target__("arch=yongfeng")));
extern void test_arch_shijidadao (void) __attribute__((__target__("arch=shijidadao")));
extern void test_arch_k8 (void) __attribute__((__target__("arch=k8")));
extern void test_arch_k8_sse3 (void) __attribute__((__target__("arch=k8-sse3")));
extern void test_arch_opteron (void) __attribute__((__target__("arch=opteron")));
@ -233,6 +234,7 @@ extern void test_tune_corei7_avx (void) __attribute__((__target__("tune=corei7-
extern void test_tune_core_avx2 (void) __attribute__((__target__("tune=core-avx2")));
extern void test_tune_lujiazui (void) __attribute__((__target__("tune=lujiazui")));
extern void test_tune_yongfeng (void) __attribute__((__target__("tune=yongfeng")));
extern void test_tune_shijidadao (void) __attribute__((__target__("tune=shijidadao")));
extern void test_tune_k8 (void) __attribute__((__target__("tune=k8")));
extern void test_tune_k8_sse3 (void) __attribute__((__target__("tune=k8-sse3")));
extern void test_tune_opteron (void) __attribute__((__target__("tune=opteron")));