diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
index 088e6af2258..3e4de64ec24 100644
--- a/gcc/config/i386/i386-expand.c
+++ b/gcc/config/i386/i386-expand.c
@@ -23138,4 +23138,104 @@ ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
   *rem_p = rem;
 }
 
+/* Expand an atomic logic operation on MEM as a relaxed compare-and-swap
+   loop.  CODE is the rtx code applied to the old memory value and VAL;
+   CODE == NOT selects NAND, i.e. ~(old & VAL).  TARGET receives the new
+   value when AFTER is true (atomic_op_fetch) and the old value otherwise
+   (atomic_fetch_op).  DOUBLEWORD makes the early compare work on the low
+   and high halves of the value separately.
+
+   Each iteration reloads MEM before the cmpxchg.  If the reloaded value
+   differs from the expected one, the cmpxchg is skipped: the expected
+   value is refreshed, a pause is executed and the loop is retried.  */
+
+void ix86_expand_atomic_fetch_op_loop (rtx target, rtx mem, rtx val,
+                                       enum rtx_code code, bool after,
+                                       bool doubleword)
+{
+  rtx old_reg, new_reg, old_mem, success, oldval, new_mem;
+  rtx_code_label *loop_label, *pause_label, *done_label;
+  machine_mode mode = GET_MODE (target);
+
+  old_reg = gen_reg_rtx (mode);
+  new_reg = old_reg;
+  loop_label = gen_label_rtx ();
+  pause_label = gen_label_rtx ();
+  done_label = gen_label_rtx ();
+  old_mem = copy_to_reg (mem);
+  emit_label (loop_label);
+  emit_move_insn (old_reg, old_mem);
+
+  /* Return value for atomic_fetch_op.  */
+  if (!after)
+    emit_move_insn (target, old_reg);
+
+  if (code == NOT)
+    {
+      new_reg = expand_simple_binop (mode, AND, new_reg, val, NULL_RTX,
+                                     true, OPTAB_LIB_WIDEN);
+      new_reg = expand_simple_unop (mode, code, new_reg, NULL_RTX, true);
+    }
+  else
+    new_reg = expand_simple_binop (mode, code, new_reg, val, NULL_RTX,
+                                   true, OPTAB_LIB_WIDEN);
+
+  /* Return value for atomic_op_fetch.  */
+  if (after)
+    emit_move_insn (target, new_reg);
+
+  /* Load memory again inside loop.  */
+  new_mem = copy_to_reg (mem);
+
+  /* Compare mem value with expected value.  */
+  if (doubleword)
+    {
+      machine_mode half_mode = (mode == DImode) ? SImode : DImode;
+      rtx low_new_mem = gen_lowpart (half_mode, new_mem);
+      rtx low_old_mem = gen_lowpart (half_mode, old_mem);
+      rtx high_new_mem = gen_highpart (half_mode, new_mem);
+      rtx high_old_mem = gen_highpart (half_mode, old_mem);
+      emit_cmp_and_jump_insns (low_new_mem, low_old_mem, NE, NULL_RTX,
+                               half_mode, 1, pause_label,
+                               profile_probability::guessed_never ());
+      emit_cmp_and_jump_insns (high_new_mem, high_old_mem, NE, NULL_RTX,
+                               half_mode, 1, pause_label,
+                               profile_probability::guessed_never ());
+    }
+  else
+    emit_cmp_and_jump_insns (new_mem, old_mem, NE, NULL_RTX,
+                             GET_MODE (old_mem), 1, pause_label,
+                             profile_probability::guessed_never ());
+
+  success = NULL_RTX;
+  oldval = old_mem;
+  expand_atomic_compare_and_swap (&success, &oldval, mem, old_reg,
+                                  new_reg, false, MEMMODEL_SYNC_SEQ_CST,
+                                  MEMMODEL_RELAXED);
+  if (oldval != old_mem)
+    emit_move_insn (old_mem, oldval);
+
+  /* If the cmpxchg failed, loop back; OLD_MEM has been updated from
+     OLDVAL above.  */
+  emit_cmp_and_jump_insns (success, const0_rtx, EQ, const0_rtx,
+                           GET_MODE (success), 1, loop_label,
+                           profile_probability::guessed_never ());
+
+  /* The cmpxchg succeeded: skip the pause block, which would otherwise
+     send control back to the start of the loop.  */
+  emit_jump_insn (gen_jump (done_label));
+  emit_barrier ();
+
+  /* If mem is not expected, make the freshly loaded value the new
+     expected value (the early compare could not succeed otherwise),
+     pause and loop back.  */
+  emit_label (pause_label);
+  emit_move_insn (old_mem, new_mem);
+  emit_insn (gen_pause ());
+  emit_jump_insn (gen_jump (loop_label));
+  emit_barrier ();
+
+  emit_label (done_label);
+}
+
 #include "gt-i386-expand.h"
diff --git a/gcc/config/i386/i386-options.c b/gcc/config/i386/i386-options.c
index a8cc0664f11..feff2584f41 100644
--- a/gcc/config/i386/i386-options.c
+++ b/gcc/config/i386/i386-options.c
@@ -397,7 +397,8 @@ ix86_target_string (HOST_WIDE_INT isa, HOST_WIDE_INT isa2,
     { "-mstv", MASK_STV },
     { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD },
     { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE },
-    { "-mcall-ms2sysv-xlogues", MASK_CALL_MS2SYSV_XLOGUES }
+    { "-mcall-ms2sysv-xlogues", MASK_CALL_MS2SYSV_XLOGUES },
+    { "-mrelax-cmpxchg-loop", MASK_RELAX_CMPXCHG_LOOP }
   };
 
   /* Additional flag options.
*/ @@ -1092,6 +1093,10 @@ ix86_valid_target_attribute_inner_p (tree fndecl, tree args, char *p_strings[], IX86_ATTR_IX86_YES ("general-regs-only", OPT_mgeneral_regs_only, OPTION_MASK_GENERAL_REGS_ONLY), + + IX86_ATTR_YES ("relax-cmpxchg-loop", + OPT_mrelax_cmpxchg_loop, + MASK_RELAX_CMPXCHG_LOOP), }; location_t loc diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h index bd52450a148..7e05510c679 100644 --- a/gcc/config/i386/i386-protos.h +++ b/gcc/config/i386/i386-protos.h @@ -217,6 +217,8 @@ extern void ix86_move_vector_high_sse_to_mmx (rtx); extern void ix86_split_mmx_pack (rtx[], enum rtx_code); extern void ix86_split_mmx_punpck (rtx[], bool); extern void ix86_expand_avx_vzeroupper (void); +extern void ix86_expand_atomic_fetch_op_loop (rtx, rtx, rtx, enum rtx_code, + bool, bool); #ifdef TREE_CODE extern void init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx, tree, int); diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt index ad366974b5b..46fad3cc038 100644 --- a/gcc/config/i386/i386.opt +++ b/gcc/config/i386/i386.opt @@ -404,6 +404,10 @@ momit-leaf-frame-pointer Target Mask(OMIT_LEAF_FRAME_POINTER) Save Omit the frame pointer in leaf functions. +mrelax-cmpxchg-loop +Target Mask(RELAX_CMPXCHG_LOOP) Save +Relax cmpxchg loop for atomic_fetch_{or,xor,and,nand} by adding load and cmp before cmpxchg, execute pause and loop back to load and compare if load value is not expected. + mpc32 Target RejectNegative Set 80387 floating-point precision to 32-bit. 
diff --git a/gcc/config/i386/sync.md b/gcc/config/i386/sync.md
index 9716a0b2f2c..cc4fe727bd9 100644
--- a/gcc/config/i386/sync.md
+++ b/gcc/config/i386/sync.md
@@ -525,6 +525,130 @@
    (set (reg:CCZ FLAGS_REG)
         (unspec_volatile:CCZ [(const_int 0)] UNSPECV_CMPXCHG))])])
 
+;; Relaxed cmpxchg loop expanders for the atomic logic operations.  They
+;; are enabled only with TARGET_RELAX_CMPXCHG_LOOP and pass operands 0-2
+;; to ix86_expand_atomic_fetch_op_loop; operand 3 is accepted but not used
+;; by the expansion.  For nand the rtx code NOT is passed as the operation.
+
+(define_expand "atomic_fetch_<logic><mode>"
+  [(match_operand:SWI124 0 "register_operand")
+   (any_logic:SWI124
+    (match_operand:SWI124 1 "memory_operand")
+    (match_operand:SWI124 2 "register_operand"))
+   (match_operand:SI 3 "const_int_operand")]
+  "TARGET_CMPXCHG && TARGET_RELAX_CMPXCHG_LOOP"
+{
+  ix86_expand_atomic_fetch_op_loop (operands[0], operands[1],
+                                    operands[2], <CODE>, false,
+                                    false);
+  DONE;
+})
+
+(define_expand "atomic_<logic>_fetch<mode>"
+  [(match_operand:SWI124 0 "register_operand")
+   (any_logic:SWI124
+    (match_operand:SWI124 1 "memory_operand")
+    (match_operand:SWI124 2 "register_operand"))
+   (match_operand:SI 3 "const_int_operand")]
+  "TARGET_CMPXCHG && TARGET_RELAX_CMPXCHG_LOOP"
+{
+  ix86_expand_atomic_fetch_op_loop (operands[0], operands[1],
+                                    operands[2], <CODE>, true,
+                                    false);
+  DONE;
+})
+
+(define_expand "atomic_fetch_nand<mode>"
+  [(match_operand:SWI124 0 "register_operand")
+   (match_operand:SWI124 1 "memory_operand")
+   (match_operand:SWI124 2 "register_operand")
+   (match_operand:SI 3 "const_int_operand")]
+  "TARGET_CMPXCHG && TARGET_RELAX_CMPXCHG_LOOP"
+{
+  ix86_expand_atomic_fetch_op_loop (operands[0], operands[1],
+                                    operands[2], NOT, false,
+                                    false);
+  DONE;
+})
+
+(define_expand "atomic_nand_fetch<mode>"
+  [(match_operand:SWI124 0 "register_operand")
+   (match_operand:SWI124 1 "memory_operand")
+   (match_operand:SWI124 2 "register_operand")
+   (match_operand:SI 3 "const_int_operand")]
+  "TARGET_CMPXCHG && TARGET_RELAX_CMPXCHG_LOOP"
+{
+  ix86_expand_atomic_fetch_op_loop (operands[0], operands[1],
+                                    operands[2], NOT, true,
+                                    false);
+  DONE;
+})
+
+;; The same expanders for the CASMODE modes.  DImode without TARGET_64BIT
+;; and TImode are flagged as doubleword operations.
+(define_expand "atomic_fetch_<logic><mode>"
+  [(match_operand:CASMODE 0 "register_operand")
+   (any_logic:CASMODE
+    (match_operand:CASMODE 1 "memory_operand")
+    (match_operand:CASMODE 2 "register_operand"))
+   (match_operand:SI 3 "const_int_operand")]
+  "TARGET_CMPXCHG && TARGET_RELAX_CMPXCHG_LOOP"
+{
+  bool doubleword = (<MODE>mode == DImode && !TARGET_64BIT)
+                    || (<MODE>mode == TImode);
+  ix86_expand_atomic_fetch_op_loop (operands[0], operands[1],
+                                    operands[2], <CODE>, false,
+                                    doubleword);
+  DONE;
+})
+
+(define_expand "atomic_<logic>_fetch<mode>"
+  [(match_operand:CASMODE 0 "register_operand")
+   (any_logic:CASMODE
+    (match_operand:CASMODE 1 "memory_operand")
+    (match_operand:CASMODE 2 "register_operand"))
+   (match_operand:SI 3 "const_int_operand")]
+  "TARGET_CMPXCHG && TARGET_RELAX_CMPXCHG_LOOP"
+{
+  bool doubleword = (<MODE>mode == DImode && !TARGET_64BIT)
+                    || (<MODE>mode == TImode);
+  ix86_expand_atomic_fetch_op_loop (operands[0], operands[1],
+                                    operands[2], <CODE>, true,
+                                    doubleword);
+  DONE;
+})
+
+(define_expand "atomic_fetch_nand<mode>"
+  [(match_operand:CASMODE 0 "register_operand")
+   (match_operand:CASMODE 1 "memory_operand")
+   (match_operand:CASMODE 2 "register_operand")
+   (match_operand:SI 3 "const_int_operand")]
+  "TARGET_CMPXCHG && TARGET_RELAX_CMPXCHG_LOOP"
+{
+  bool doubleword = (<MODE>mode == DImode && !TARGET_64BIT)
+                    || (<MODE>mode == TImode);
+  ix86_expand_atomic_fetch_op_loop (operands[0], operands[1],
+                                    operands[2], NOT, false,
+                                    doubleword);
+  DONE;
+})
+
+(define_expand "atomic_nand_fetch<mode>"
+  [(match_operand:CASMODE 0 "register_operand")
+   (match_operand:CASMODE 1 "memory_operand")
+   (match_operand:CASMODE 2 "register_operand")
+   (match_operand:SI 3 "const_int_operand")]
+  "TARGET_CMPXCHG && TARGET_RELAX_CMPXCHG_LOOP"
+{
+  bool doubleword = (<MODE>mode == DImode && !TARGET_64BIT)
+                    || (<MODE>mode == TImode);
+  ix86_expand_atomic_fetch_op_loop (operands[0], operands[1],
+                                    operands[2], NOT, true,
+                                    doubleword);
+  DONE;
+})
+
+
 ;; For operand 2 nonmemory_operand predicate is used instead of
 ;; register_operand to allow combiner to better optimize atomic
 ;; additions of constants.
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index 2d9c1782f33..6070288856c 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -1423,7 +1423,7 @@ See RS/6000 and PowerPC Options. -mstack-protector-guard-reg=@var{reg} @gol -mstack-protector-guard-offset=@var{offset} @gol -mstack-protector-guard-symbol=@var{symbol} @gol --mgeneral-regs-only -mcall-ms2sysv-xlogues @gol +-mgeneral-regs-only -mcall-ms2sysv-xlogues -mrelax-cmpxchg-loop @gol -mindirect-branch=@var{choice} -mfunction-return=@var{choice} @gol -mindirect-branch-register -mneeded} @@ -32330,6 +32330,13 @@ Generate code that uses only the general-purpose registers. This prevents the compiler from using floating-point, vector, mask and bound registers. +@item -mrelax-cmpxchg-loop +@opindex mrelax-cmpxchg-loop +Relax the cmpxchg loop by emitting an early load and compare before the cmpxchg, +and execute pause if the loaded value is not the expected one. This reduces +excessive cacheline bouncing and works for all atomic logic fetch builtins +that generate a compare-and-swap loop. + @item -mindirect-branch=@var{choice} @opindex mindirect-branch Convert indirect call and jump with @var{choice}.
The default is
diff --git a/gcc/testsuite/gcc.target/i386/pr103069-1.c b/gcc/testsuite/gcc.target/i386/pr103069-1.c
new file mode 100644
index 00000000000..f819af4409c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr103069-1.c
@@ -0,0 +1,37 @@
+/* PR target/103069 */
+/* { dg-do compile } */
+/* { dg-additional-options "-O2 -march=x86-64 -mtune=generic -mrelax-cmpxchg-loop" } */
+/* { dg-final { scan-assembler-times "rep;?\[ \\t\]+nop" 32 } } */
+
+#include <stdint.h>
+
+/* Define f_TYPE_OP_fetch and f_TYPE_fetch_OP wrappers around the
+   corresponding __atomic builtins.  */
+#define FUNC_ATOMIC(TYPE, OP) \
+__attribute__ ((noinline, noclone)) \
+TYPE f_##TYPE##_##OP##_fetch (TYPE *a, TYPE b) \
+{ \
+  return __atomic_##OP##_fetch (a, b, __ATOMIC_RELAXED); \
+} \
+__attribute__ ((noinline, noclone)) \
+TYPE f_##TYPE##_fetch_##OP (TYPE *a, TYPE b) \
+{ \
+  return __atomic_fetch_##OP (a, b, __ATOMIC_RELAXED); \
+}
+
+FUNC_ATOMIC (int64_t, and)
+FUNC_ATOMIC (int64_t, nand)
+FUNC_ATOMIC (int64_t, or)
+FUNC_ATOMIC (int64_t, xor)
+FUNC_ATOMIC (int, and)
+FUNC_ATOMIC (int, nand)
+FUNC_ATOMIC (int, or)
+FUNC_ATOMIC (int, xor)
+FUNC_ATOMIC (short, and)
+FUNC_ATOMIC (short, nand)
+FUNC_ATOMIC (short, or)
+FUNC_ATOMIC (short, xor)
+FUNC_ATOMIC (char, and)
+FUNC_ATOMIC (char, nand)
+FUNC_ATOMIC (char, or)
+FUNC_ATOMIC (char, xor)
diff --git a/gcc/testsuite/gcc.target/i386/pr103069-2.c b/gcc/testsuite/gcc.target/i386/pr103069-2.c
new file mode 100644
index 00000000000..8ac824cc8e8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr103069-2.c
@@ -0,0 +1,74 @@
+/* PR target/103069 */
+/* { dg-do run } */
+/* { dg-additional-options "-O2 -march=x86-64 -mtune=generic" } */
+
+#include <stdlib.h>
+#include "pr103069-1.c"
+
+#define FUNC_ATOMIC_RELAX(TYPE, OP) \
+__attribute__ ((noinline, noclone, target ("relax-cmpxchg-loop"))) \
+TYPE relax_##TYPE##_##OP##_fetch (TYPE *a, TYPE b) \
+{ \
+  return __atomic_##OP##_fetch (a, b, __ATOMIC_RELAXED); \
+} \
+__attribute__ ((noinline, noclone, target ("relax-cmpxchg-loop"))) \
+TYPE relax_##TYPE##_fetch_##OP (TYPE *a, TYPE b) \
+{ \
+  return __atomic_fetch_##OP (a, b, __ATOMIC_RELAXED); \
+}
+
+FUNC_ATOMIC_RELAX (int64_t, and)
+FUNC_ATOMIC_RELAX (int64_t, nand)
+FUNC_ATOMIC_RELAX (int64_t, or)
+FUNC_ATOMIC_RELAX (int64_t, xor)
+FUNC_ATOMIC_RELAX (int, and)
+FUNC_ATOMIC_RELAX (int, nand)
+FUNC_ATOMIC_RELAX (int, or)
+FUNC_ATOMIC_RELAX (int, xor)
+FUNC_ATOMIC_RELAX (short, and)
+FUNC_ATOMIC_RELAX (short, nand)
+FUNC_ATOMIC_RELAX (short, or)
+FUNC_ATOMIC_RELAX (short, xor)
+FUNC_ATOMIC_RELAX (char, and)
+FUNC_ATOMIC_RELAX (char, nand)
+FUNC_ATOMIC_RELAX (char, or)
+FUNC_ATOMIC_RELAX (char, xor)
+
+/* Run the relaxed and the reference version of each builtin on separate
+   copies of the same operands; both the returned value and the value left
+   in memory must match.  */
+#define TEST_ATOMIC_FETCH_LOGIC(TYPE, OP) \
+{ \
+  TYPE a = 11, b = 101, res, exp; \
+  TYPE c = 11, d = 101; \
+  res = relax_##TYPE##_##OP##_fetch (&a, b); \
+  exp = f_##TYPE##_##OP##_fetch (&c, d); \
+  if (res != exp || a != c) \
+    abort (); \
+  a = c = 21, b = d = 92; \
+  res = relax_##TYPE##_fetch_##OP (&a, b); \
+  exp = f_##TYPE##_fetch_##OP (&c, d); \
+  if (res != exp || a != c) \
+    abort (); \
+}
+
+int main (void)
+{
+  TEST_ATOMIC_FETCH_LOGIC (int64_t, and)
+  TEST_ATOMIC_FETCH_LOGIC (int64_t, nand)
+  TEST_ATOMIC_FETCH_LOGIC (int64_t, or)
+  TEST_ATOMIC_FETCH_LOGIC (int64_t, xor)
+  TEST_ATOMIC_FETCH_LOGIC (int, and)
+  TEST_ATOMIC_FETCH_LOGIC (int, nand)
+  TEST_ATOMIC_FETCH_LOGIC (int, or)
+  TEST_ATOMIC_FETCH_LOGIC (int, xor)
+  TEST_ATOMIC_FETCH_LOGIC (short, and)
+  TEST_ATOMIC_FETCH_LOGIC (short, nand)
+  TEST_ATOMIC_FETCH_LOGIC (short, or)
+  TEST_ATOMIC_FETCH_LOGIC (short, xor)
+  TEST_ATOMIC_FETCH_LOGIC (char, and)
+  TEST_ATOMIC_FETCH_LOGIC (char, nand)
+  TEST_ATOMIC_FETCH_LOGIC (char, or)
+  TEST_ATOMIC_FETCH_LOGIC (char, xor)
+  return 0;
+}