diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index bcb879ba94b..3c7c476c4c6 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -8510,15 +8510,11 @@ aarch64_layout_frame (void) && !crtl->abi->clobbers_full_reg_p (regno)) frame.reg_offset[regno] = SLOT_REQUIRED; - /* With stack-clash, LR must be saved in non-leaf functions. The saving of - LR counts as an implicit probe which allows us to maintain the invariant - described in the comment at expand_prologue. */ - gcc_assert (crtl->is_leaf - || maybe_ne (frame.reg_offset[R30_REGNUM], SLOT_NOT_REQUIRED)); poly_int64 offset = crtl->outgoing_args_size; gcc_assert (multiple_p (offset, STACK_BOUNDARY / BITS_PER_UNIT)); frame.bytes_below_saved_regs = offset; + frame.sve_save_and_probe = INVALID_REGNUM; /* Now assign stack slots for the registers. Start with the predicate registers, since predicate LDR and STR have a relatively small @@ -8526,6 +8522,8 @@ aarch64_layout_frame (void) for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++) if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED)) { + if (frame.sve_save_and_probe == INVALID_REGNUM) + frame.sve_save_and_probe = regno; frame.reg_offset[regno] = offset; offset += BYTES_PER_SVE_PRED; } @@ -8563,6 +8561,8 @@ aarch64_layout_frame (void) for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++) if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED)) { + if (frame.sve_save_and_probe == INVALID_REGNUM) + frame.sve_save_and_probe = regno; frame.reg_offset[regno] = offset; offset += vector_save_size; } @@ -8572,10 +8572,18 @@ aarch64_layout_frame (void) frame.below_hard_fp_saved_regs_size = offset - frame.bytes_below_saved_regs; bool saves_below_hard_fp_p = maybe_ne (frame.below_hard_fp_saved_regs_size, 0); + gcc_assert (!saves_below_hard_fp_p + || (frame.sve_save_and_probe != INVALID_REGNUM + && known_eq (frame.reg_offset[frame.sve_save_and_probe], + frame.bytes_below_saved_regs))); + frame.bytes_below_hard_fp = offset; + frame.hard_fp_save_and_probe = INVALID_REGNUM; auto allocate_gpr_slot = [&](unsigned int regno) { + if (frame.hard_fp_save_and_probe == INVALID_REGNUM) + frame.hard_fp_save_and_probe = regno; frame.reg_offset[regno] = offset; if (frame.wb_push_candidate1 == INVALID_REGNUM) frame.wb_push_candidate1 = regno; @@ -8609,6 +8617,8 @@ aarch64_layout_frame (void) for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++) if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED)) { + if (frame.hard_fp_save_and_probe == INVALID_REGNUM) + frame.hard_fp_save_and_probe = regno; /* If there is an alignment gap between integer and fp callee-saves, allocate the last fp register to it if possible. */ if (regno == last_fp_reg @@ -8632,6 +8642,17 @@ aarch64_layout_frame (void) offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT); frame.saved_regs_size = offset - frame.bytes_below_saved_regs; + gcc_assert (known_eq (frame.saved_regs_size, + frame.below_hard_fp_saved_regs_size) + || (frame.hard_fp_save_and_probe != INVALID_REGNUM + && known_eq (frame.reg_offset[frame.hard_fp_save_and_probe], + frame.bytes_below_hard_fp))); + + /* With stack-clash, a register must be saved in non-leaf functions. + The saving of the bottommost register counts as an implicit probe, + which allows us to maintain the invariant described in the comment + at expand_prologue. */ + gcc_assert (crtl->is_leaf || maybe_ne (frame.saved_regs_size, 0)); offset += get_frame_size (); offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT); @@ -8762,6 +8783,25 @@ aarch64_layout_frame (void) frame.final_adjust = frame.bytes_below_saved_regs; } + /* The frame is allocated in pieces, with each non-final piece + including a register save at offset 0 that acts as a probe for + the following piece. In addition, the save of the bottommost register + acts as a probe for callees and allocas. Roll back any probes that + aren't needed. + + A probe isn't needed if it is associated with the final allocation + (including callees and allocas) that happens before the epilogue is + executed. */ + if (crtl->is_leaf + && !cfun->calls_alloca + && known_eq (frame.final_adjust, 0)) + { + if (maybe_ne (frame.sve_callee_adjust, 0)) + frame.sve_save_and_probe = INVALID_REGNUM; + else + frame.hard_fp_save_and_probe = INVALID_REGNUM; + } + /* Make sure the individual adjustments add up to the full frame size. */ gcc_assert (known_eq (frame.initial_adjust + frame.callee_adjust @@ -9393,13 +9433,6 @@ aarch64_get_separate_components (void) poly_int64 offset = frame.reg_offset[regno]; - /* If the register is saved in the first SVE save slot, we use - it as a stack probe for -fstack-clash-protection. */ - if (flag_stack_clash_protection - && maybe_ne (frame.below_hard_fp_saved_regs_size, 0) - && known_eq (offset, frame.bytes_below_saved_regs)) - continue; - /* Get the offset relative to the register we'll use. */ if (frame_pointer_needed) offset -= frame.bytes_below_hard_fp; @@ -9434,6 +9467,13 @@ aarch64_get_separate_components (void) bitmap_clear_bit (components, LR_REGNUM); bitmap_clear_bit (components, SP_REGNUM); + if (flag_stack_clash_protection) + { + if (frame.sve_save_and_probe != INVALID_REGNUM) + bitmap_clear_bit (components, frame.sve_save_and_probe); + if (frame.hard_fp_save_and_probe != INVALID_REGNUM) + bitmap_clear_bit (components, frame.hard_fp_save_and_probe); + } return components; } @@ -9980,8 +10020,8 @@ aarch64_epilogue_uses (int regno) When probing is needed, we emit a probe at the start of the prologue and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter. - We have to track how much space has been allocated and the only stores - to the stack we track as implicit probes are the FP/LR stores. + We can also use register saves as probes. These are stored in + sve_save_and_probe and hard_fp_save_and_probe. For outgoing arguments we probe if the size is larger than 1KB, such that the ABI specified buffer is maintained for the next callee. diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h index 3e891107e5d..8fcebeb5206 100644 --- a/gcc/config/aarch64/aarch64.h +++ b/gcc/config/aarch64/aarch64.h @@ -862,6 +862,14 @@ struct GTY (()) aarch64_frame This is the register they should use. */ unsigned spare_pred_reg; + /* An SVE register that is saved below the hard frame pointer and that acts + as a probe for later allocations, or INVALID_REGNUM if none. */ + unsigned sve_save_and_probe; + + /* A register that is saved at the hard frame pointer and that acts + as a probe for later allocations, or INVALID_REGNUM if none. */ + unsigned hard_fp_save_and_probe; + bool laid_out; /* True if shadow call stack should be enabled for the current function. */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_3.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_3.c index 3e01ec36c3a..3530a0d504b 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_3.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_3.c @@ -11,11 +11,10 @@ ** mov x11, sp ** ... ** sub sp, sp, x13 -** str p4, \[sp\] ** cbz w0, [^\n]* +** str p4, \[sp\] ** ... ** ptrue p0\.b, all -** ldr p4, \[sp\] ** addvl sp, sp, #1 ** ldr x24, \[sp\], 32 ** ret @@ -39,13 +38,12 @@ test_1 (int n) ** mov x11, sp ** ... ** sub sp, sp, x13 -** str p4, \[sp\] ** cbz w0, [^\n]* +** str p4, \[sp\] ** str p5, \[sp, #1, mul vl\] ** str p6, \[sp, #2, mul vl\] ** ... ** ptrue p0\.b, all -** ldr p4, \[sp\] ** addvl sp, sp, #1 ** ldr x24, \[sp\], 32 ** ret