nvptx: PTX 'alloca' for '-mptx=7.3'+, '-march=sm_52'+ [PR65181]
..., and use it for '-mno-soft-stack': PTX "native" stacks. PR target/65181 gcc/ * config/nvptx/nvptx.cc (nvptx_get_drap_rtx): Handle '!TARGET_SOFT_STACK'. * config/nvptx/nvptx.md (define_c_enum "unspec"): Add 'UNSPEC_STACKSAVE', 'UNSPEC_STACKRESTORE'. (define_expand "allocate_stack", define_expand "save_stack_block") (define_expand "restore_stack_block"): Handle '!TARGET_SOFT_STACK', PTX 'alloca'. (define_insn "@nvptx_alloca_<mode>") (define_insn "@nvptx_stacksave_<mode>") (define_insn "@nvptx_stackrestore_<mode>"): New. * doc/invoke.texi (Nvidia PTX Options): Update '-msoft-stack', '-mno-soft-stack'. * doc/sourcebuild.texi (nvptx-specific attributes): Document 'nvptx_runtime_alloca_ptx'. (Add Options): Document 'nvptx_alloca_ptx'. gcc/testsuite/ * gcc.target/nvptx/alloca-1.c: Evolve into... * gcc.target/nvptx/alloca-1-O0.c: ... this, ... * gcc.target/nvptx/alloca-1-O1.c: ... this, and... * gcc.target/nvptx/alloca-1-sm_30.c: ... this. * gcc.target/nvptx/vla-1.c: Evolve into... * gcc.target/nvptx/vla-1-O0.c: ... this, ... * gcc.target/nvptx/vla-1-O1.c: ... this, and... * gcc.target/nvptx/vla-1-sm_30.c: ... this. * gcc.c-torture/execute/pr36321.c: Adjust. * gcc.target/nvptx/__builtin_alloca_0-1-O0.c: Likewise. * gcc.target/nvptx/__builtin_alloca_0-1-O1.c: Likewise. * gcc.target/nvptx/__builtin_stack_save___builtin_stack_restore-1.c: Likewise. * gcc.target/nvptx/softstack.c: Likewise. * gcc.target/nvptx/__builtin_stack_save___builtin_stack_restore-1-sm_30.c: New. * gcc.target/nvptx/alloca-2-O0.c: Likewise. * gcc.target/nvptx/alloca-3-O1.c: Likewise. * gcc.target/nvptx/alloca-4-O3.c: Likewise. * gcc.target/nvptx/alloca-5.c: Likewise. * lib/target-supports.exp (check_effective_target_alloca): Adjust. (check_nvptx_default_ptx_isa_target_architecture_at_least) (check_nvptx_runtime_ptx_isa_target_architecture_at_least) (check_effective_target_nvptx_runtime_alloca_ptx) (add_options_for_nvptx_alloca_ptx): New. libgomp/ * fortran.c (omp_get_device_from_uid_): Adjust. 
* testsuite/libgomp.oacc-fortran/privatized-ref-2.f90: Likewise.
This commit is contained in:
parent
1db025c67f
commit
3861d362ec
23 changed files with 611 additions and 35 deletions
|
@ -245,7 +245,7 @@ default_ptx_version_option (void)
|
|||
warp convergence. */
|
||||
res = MAX (res, PTX_VERSION_6_0);
|
||||
|
||||
/* For sm_52+, pick at least 7.3. */
|
||||
/* For sm_52+, pick at least 7.3, to enable PTX 'alloca'. */
|
||||
if (ptx_isa_option >= PTX_ISA_SM52)
|
||||
res = MAX (res, PTX_VERSION_7_3);
|
||||
|
||||
|
@ -1797,7 +1797,7 @@ nvptx_function_ok_for_sibcall (tree, tree)
|
|||
static rtx
|
||||
nvptx_get_drap_rtx (void)
|
||||
{
|
||||
if (TARGET_SOFT_STACK && stack_realign_drap)
|
||||
if (stack_realign_drap)
|
||||
return arg_pointer_rtx;
|
||||
return NULL_RTX;
|
||||
}
|
||||
|
|
|
@ -35,8 +35,9 @@
|
|||
UNSPEC_FPINT_NEARBYINT
|
||||
|
||||
UNSPEC_ALLOCA
|
||||
|
||||
UNSPEC_SET_SOFTSTACK
|
||||
UNSPEC_STACKSAVE
|
||||
UNSPEC_STACKRESTORE
|
||||
|
||||
UNSPEC_DIM_SIZE
|
||||
|
||||
|
@ -1663,22 +1664,47 @@
|
|||
(match_operand 1 "nvptx_register_operand")]
|
||||
""
|
||||
{
|
||||
if (TARGET_SOFT_STACK)
|
||||
if (!TARGET_SOFT_STACK
|
||||
&& TARGET_PTX_7_3
|
||||
&& TARGET_SM52)
|
||||
emit_insn (gen_nvptx_alloca (Pmode, operands[0], operands[1]));
|
||||
else if (!TARGET_SOFT_STACK)
|
||||
{
|
||||
sorry ("target cannot support alloca");
|
||||
emit_insn (gen_nop ());
|
||||
}
|
||||
else if (TARGET_SOFT_STACK)
|
||||
{
|
||||
emit_move_insn (stack_pointer_rtx,
|
||||
gen_rtx_MINUS (Pmode, stack_pointer_rtx, operands[1]));
|
||||
emit_insn (gen_set_softstack (Pmode, stack_pointer_rtx));
|
||||
emit_move_insn (operands[0], virtual_stack_dynamic_rtx);
|
||||
DONE;
|
||||
}
|
||||
/* The ptx documentation specifies an alloca intrinsic (for 32 bit
|
||||
only) but notes it is not implemented. The assembler emits a
|
||||
confused error message. Issue a blunt one now instead. */
|
||||
sorry ("target cannot support alloca");
|
||||
emit_insn (gen_nop ());
|
||||
else
|
||||
gcc_unreachable ();
|
||||
DONE;
|
||||
})
|
||||
|
||||
(define_insn "@nvptx_alloca_<mode>"
|
||||
[(set (match_operand:P 0 "nvptx_register_operand" "=R")
|
||||
(unspec:P [(match_operand:P 1 "nvptx_nonmemory_operand" "Ri")]
|
||||
UNSPEC_ALLOCA))]
|
||||
"TARGET_PTX_7_3
|
||||
&& TARGET_SM52"
|
||||
{
|
||||
/* Convert the address from '.local' state space to generic. That way,
|
||||
we don't have to use 'st.local', 'ld.local', and can easily pass the
|
||||
address to other "generic functions".
|
||||
TODO 'gcc.target/nvptx/alloca-5.c' */
|
||||
output_asm_insn ("{", NULL);
|
||||
output_asm_insn ("\\t.reg%t0\\t%0_local;", operands);
|
||||
output_asm_insn ("\\talloca%u0\\t%0_local, %1;", operands);
|
||||
output_asm_insn ("\\tcvta.local%u0\\t%0, %0_local;", operands);
|
||||
output_asm_insn ("}", NULL);
|
||||
return "";
|
||||
}
|
||||
[(set_attr "predicable" "no")])
|
||||
|
||||
(define_insn "@set_softstack_<mode>"
|
||||
[(unspec [(match_operand:P 0 "nvptx_register_operand" "R")]
|
||||
UNSPEC_SET_SOFTSTACK)]
|
||||
|
@ -1692,30 +1718,64 @@
|
|||
(match_operand 1 "register_operand" "")]
|
||||
"!TARGET_SOFT_STACK"
|
||||
{
|
||||
/* The concept of a '%stack' pointer doesn't apply like this for
|
||||
PTX "native" stacks. GCC however occasionally synthesizes
|
||||
'__builtin_stack_save ()', '__builtin_stack_restore ()', and isn't able to
|
||||
optimize them all away. Just submit a dummy -- user code shouldn't be
|
||||
able to observe this. */
|
||||
emit_move_insn (operands[0], GEN_INT (0xdeadbeef));
|
||||
if (TARGET_PTX_7_3
|
||||
&& TARGET_SM52)
|
||||
{
|
||||
gcc_checking_assert (REG_P (operands[0]));
|
||||
emit_insn (gen_nvptx_stacksave (Pmode, operands[0], operands[1]));
|
||||
}
|
||||
else
|
||||
{
|
||||
/* The concept of a '%stack' pointer doesn't apply like this.
|
||||
GCC however occasionally synthesizes '__builtin_stack_save ()',
|
||||
'__builtin_stack_restore ()', and isn't able to optimize them all
|
||||
away. Just submit a dummy -- user code shouldn't be able to observe
|
||||
this. */
|
||||
emit_move_insn (operands[0], GEN_INT (0xdeadbeef));
|
||||
}
|
||||
DONE;
|
||||
})
|
||||
|
||||
(define_insn "@nvptx_stacksave_<mode>"
|
||||
[(set (match_operand:P 0 "nvptx_register_operand" "=R")
|
||||
(unspec:P [(match_operand:P 1 "register_operand" "R")]
|
||||
UNSPEC_STACKSAVE))]
|
||||
"TARGET_PTX_7_3
|
||||
&& TARGET_SM52"
|
||||
"%.\\tstacksave%u0\\t%0;")
|
||||
|
||||
(define_expand "restore_stack_block"
|
||||
[(match_operand 0 "register_operand" "")
|
||||
(match_operand 1 "register_operand" "")]
|
||||
""
|
||||
{
|
||||
if (!TARGET_SOFT_STACK)
|
||||
if (!TARGET_SOFT_STACK
|
||||
&& TARGET_PTX_7_3
|
||||
&& TARGET_SM52)
|
||||
{
|
||||
operands[1] = force_reg (Pmode, operands[1]);
|
||||
emit_insn (gen_nvptx_stackrestore (Pmode, operands[0], operands[1]));
|
||||
}
|
||||
else if (!TARGET_SOFT_STACK)
|
||||
; /* See 'save_stack_block'. */
|
||||
else
|
||||
else if (TARGET_SOFT_STACK)
|
||||
{
|
||||
emit_move_insn (operands[0], operands[1]);
|
||||
emit_insn (gen_set_softstack (Pmode, operands[0]));
|
||||
}
|
||||
else
|
||||
gcc_unreachable ();
|
||||
DONE;
|
||||
})
|
||||
|
||||
(define_insn "@nvptx_stackrestore_<mode>"
|
||||
[(set (match_operand:P 0 "nvptx_register_operand" "=R")
|
||||
(unspec:P [(match_operand:P 1 "nvptx_register_operand" "R")]
|
||||
UNSPEC_STACKRESTORE))]
|
||||
"TARGET_PTX_7_3
|
||||
&& TARGET_SM52"
|
||||
"%.\\tstackrestore%u1\\t%1;")
|
||||
|
||||
(define_expand "save_stack_function"
|
||||
[(match_operand 0 "register_operand" "")
|
||||
(match_operand 1 "register_operand" "")]
|
||||
|
|
|
@ -30232,8 +30232,19 @@ Apply partitioned execution optimizations. This is the default when any
|
|||
level of optimization is selected.
|
||||
|
||||
@opindex msoft-stack
|
||||
@opindex mno-soft-stack
|
||||
@item -msoft-stack
|
||||
Generate code that does not use @code{.local} memory
|
||||
@itemx -mno-soft-stack
|
||||
For @option{-mno-soft-stack} (the default, unless @option{-mgomp} has
|
||||
been specified), use PTX ``native'' stacks, that is,
|
||||
generate code that uses @code{.local} memory or PTX @code{alloca}
|
||||
directly for stack storage.
|
||||
Unless @option{-mptx=7.3} or higher and @option{-march=sm_52} or
|
||||
higher are active, variable-length arrays and dynamically allocating
|
||||
memory on the stack with @code{alloca} are not supported.
|
||||
|
||||
For @option{-msoft-stack} (implied by @option{-mgomp}),
|
||||
generate code that does not use @code{.local} memory or PTX @code{alloca}
|
||||
directly for stack storage. Instead, a per-warp stack pointer is
|
||||
maintained explicitly. This enables variable-length stack allocation (with
|
||||
variable-length arrays or @code{alloca}), and when global memory is used for
|
||||
|
|
|
@ -2434,6 +2434,9 @@ nvptx code by default compiles for at least PTX ISA version 6.0.
|
|||
The nvptx runtime environment supports the PTX ISA directive
|
||||
@code{.alias}.
|
||||
|
||||
@item nvptx_runtime_alloca_ptx
|
||||
The nvptx runtime environment supports the PTX @code{alloca} instruction.
|
||||
|
||||
@item nvptx_softstack
|
||||
nvptx @option{-msoft-stack} is enabled.
|
||||
@end table
|
||||
|
@ -3359,6 +3362,9 @@ Only MIPS targets support this feature, and only then in certain modes.
|
|||
@item nvptx_alias_ptx
|
||||
Enable using the PTX ISA directive @code{.alias} on nvptx targets.
|
||||
|
||||
@item nvptx_alloca_ptx
|
||||
Enable using the PTX @code{alloca} instruction on nvptx targets.
|
||||
|
||||
@item riscv_a
|
||||
Add the 'A' extension to the -march string on RISC-V targets.
|
||||
|
||||
|
|
|
@ -1,4 +1,7 @@
|
|||
/* { dg-skip-if "requires alloca" { ! alloca } { "-O0" } { "" } } */
|
||||
/* See 'gcc.target/nvptx/__builtin_alloca_0-1-O0.c'.
|
||||
{ dg-xfail-if TODO { nvptx-*-* && { ! nvptx_softstack } } { "-O0" } { "" } } */
|
||||
|
||||
extern void abort (void);
|
||||
|
||||
extern __SIZE_TYPE__ strlen (const char *);
|
||||
|
|
|
@ -6,6 +6,8 @@
|
|||
/* { dg-additional-options -save-temps } */
|
||||
/* { dg-final { check-function-bodies {** } {} } } */
|
||||
|
||||
/* See 'gcc.c-torture/execute/pr36321.c', '-O0'. */
|
||||
|
||||
void sink(void *);
|
||||
|
||||
void f(void)
|
||||
|
|
|
@ -6,6 +6,8 @@
|
|||
/* { dg-additional-options -save-temps } */
|
||||
/* { dg-final { check-function-bodies {** } {} } } */
|
||||
|
||||
/* See 'gcc.c-torture/execute/pr36321.c', '-O0'. */
|
||||
|
||||
void sink(void *);
|
||||
|
||||
void f(void)
|
||||
|
|
|
@ -0,0 +1,28 @@
|
|||
/* Document what we do for '__builtin_stack_save()', '__builtin_stack_restore()'. */
|
||||
|
||||
/* { dg-do assemble } */
|
||||
/* { dg-options {-O3 -mno-soft-stack} } */
|
||||
/* { dg-additional-options -march=sm_30 } */
|
||||
/* { dg-additional-options -save-temps } */
|
||||
/* { dg-final { check-function-bodies {** } {} } } */
|
||||
|
||||
void *p;
|
||||
|
||||
void f(void)
|
||||
{
|
||||
// 0xdeadbeef
|
||||
p = __builtin_stack_save();
|
||||
asm volatile ("" : : : "memory");
|
||||
// no-op
|
||||
__builtin_stack_restore(p);
|
||||
asm volatile ("" : : : "memory");
|
||||
}
|
||||
/*
|
||||
** f:
|
||||
** \.visible \.func f
|
||||
** {
|
||||
** \.reg\.u64 (%r[0-9]+);
|
||||
** mov\.u64 \1, 3735928559;
|
||||
** st\.global\.u64 \[p\], \1;
|
||||
** ret;
|
||||
*/
|
|
@ -2,6 +2,7 @@
|
|||
|
||||
/* { dg-do assemble } */
|
||||
/* { dg-options {-O3 -mno-soft-stack} } */
|
||||
/* { dg-add-options nvptx_alloca_ptx } */
|
||||
/* { dg-additional-options -save-temps } */
|
||||
/* { dg-final { check-function-bodies {** } {} } } */
|
||||
|
||||
|
@ -9,10 +10,8 @@ void *p;
|
|||
|
||||
void f(void)
|
||||
{
|
||||
// 0xdeadbeef
|
||||
p = __builtin_stack_save();
|
||||
asm volatile ("" : : : "memory");
|
||||
// no-op
|
||||
__builtin_stack_restore(p);
|
||||
asm volatile ("" : : : "memory");
|
||||
}
|
||||
|
@ -21,7 +20,10 @@ void f(void)
|
|||
** \.visible \.func f
|
||||
** {
|
||||
** \.reg\.u64 (%r[0-9]+);
|
||||
** mov\.u64 \1, 3735928559;
|
||||
** \.reg\.u64 (%r[0-9]+);
|
||||
** stacksave\.u64 \1;
|
||||
** st\.global\.u64 \[p\], \1;
|
||||
** ld\.global\.u64 \2, \[p\];
|
||||
** stackrestore\.u64 \2;
|
||||
** ret;
|
||||
*/
|
||||
|
|
49
gcc/testsuite/gcc.target/nvptx/alloca-1-O0.c
Normal file
49
gcc/testsuite/gcc.target/nvptx/alloca-1-O0.c
Normal file
|
@ -0,0 +1,49 @@
|
|||
/* { dg-do assemble } */
|
||||
/* { dg-options {-O0 -mno-soft-stack} } */
|
||||
/* { dg-add-options nvptx_alloca_ptx } */
|
||||
/* { dg-additional-options -save-temps } */
|
||||
/* { dg-final { check-function-bodies {** } {} } } */
|
||||
|
||||
void sink(void *);
|
||||
|
||||
void f(void)
|
||||
{
|
||||
sink(__builtin_alloca(123));
|
||||
}
|
||||
/*
|
||||
** f:
|
||||
** \.visible \.func f
|
||||
** {
|
||||
** \.reg\.u64 (%r[0-9]+);
|
||||
** \.reg\.u64 (%r[0-9]+);
|
||||
** \.reg\.u64 (%r[0-9]+);
|
||||
** \.reg\.u64 (%r[0-9]+);
|
||||
** \.reg\.u64 (%r[0-9]+);
|
||||
** \.reg\.u64 (%r[0-9]+);
|
||||
** \.reg\.u64 (%r[0-9]+);
|
||||
** \.reg\.u64 (%r[0-9]+);
|
||||
** \.reg\.u64 (%r[0-9]+);
|
||||
** \.reg\.u64 (%r[0-9]+);
|
||||
** \.reg\.u64 (%r[0-9]+);
|
||||
** mov\.u64 \11, 16;
|
||||
** add\.u64 \2, \11, -1;
|
||||
** add\.u64 \3, \2, 123;
|
||||
** div\.u64 \4, \3, 16;
|
||||
** mul\.lo\.u64 \5, \4, 16;
|
||||
** {
|
||||
** \.reg\.u64 \6_local;
|
||||
** alloca\.u64 \6_local, \5;
|
||||
** cvta\.local\.u64 \6, \6_local;
|
||||
** }
|
||||
** add\.u64 \7, \6, 15;
|
||||
** shr\.u64 \8, \7, 4;
|
||||
** shl\.b64 \9, \8, 4;
|
||||
** mov\.u64 \1, \9;
|
||||
** mov\.u64 \10, \1;
|
||||
** {
|
||||
** \.param\.u64 %out_arg1;
|
||||
** st\.param\.u64 \[%out_arg1\], \10;
|
||||
** call sink, \(%out_arg1\);
|
||||
** }
|
||||
** ret;
|
||||
*/
|
33
gcc/testsuite/gcc.target/nvptx/alloca-1-O1.c
Normal file
33
gcc/testsuite/gcc.target/nvptx/alloca-1-O1.c
Normal file
|
@ -0,0 +1,33 @@
|
|||
/* { dg-do assemble } */
|
||||
/* { dg-options {-O1 -mno-soft-stack} } */
|
||||
/* { dg-add-options nvptx_alloca_ptx } */
|
||||
/* { dg-additional-options -save-temps } */
|
||||
/* { dg-final { check-function-bodies {** } {} } } */
|
||||
|
||||
void sink(void *);
|
||||
|
||||
void f(void)
|
||||
{
|
||||
sink(__builtin_alloca(123));
|
||||
}
|
||||
/*
|
||||
** f:
|
||||
** \.visible \.func f
|
||||
** {
|
||||
** \.reg\.u64 (%r[0-9]+);
|
||||
** \.reg\.u64 (%r[0-9]+);
|
||||
** \.reg\.u64 (%r[0-9]+);
|
||||
** {
|
||||
** \.reg\.u64 \1_local;
|
||||
** alloca\.u64 \1_local, 128;
|
||||
** cvta\.local\.u64 \1, \1_local;
|
||||
** }
|
||||
** add\.u64 \2, \1, 15;
|
||||
** and\.b64 \3, \2, -16;
|
||||
** {
|
||||
** \.param\.u64 %out_arg1;
|
||||
** st\.param\.u64 \[%out_arg1\], \3;
|
||||
** call sink, \(%out_arg1\);
|
||||
** }
|
||||
** ret;
|
||||
*/
|
|
@ -1,5 +1,6 @@
|
|||
/* { dg-do compile } */
|
||||
/* { dg-options -mno-soft-stack } */
|
||||
/* { dg-additional-options -march=sm_30 } */
|
||||
|
||||
void sink(void *);
|
||||
|
12
gcc/testsuite/gcc.target/nvptx/alloca-2-O0.c
Normal file
12
gcc/testsuite/gcc.target/nvptx/alloca-2-O0.c
Normal file
|
@ -0,0 +1,12 @@
|
|||
/* { dg-do link } */
|
||||
/* { dg-do run { target nvptx_runtime_alloca_ptx } } */
|
||||
/* { dg-options {-O0 -mno-soft-stack} } */
|
||||
/* { dg-add-options nvptx_alloca_ptx } */
|
||||
/* { dg-additional-options -save-temps } */
|
||||
|
||||
int
|
||||
main(void)
|
||||
{
|
||||
return !(__builtin_alloca(100) != __builtin_alloca(10));
|
||||
}
|
||||
/* { dg-final { scan-assembler-times {(?n)\talloca\.u64\t%r[0-9]+_local, %r[0-9]+;$} 2 } } */
|
40
gcc/testsuite/gcc.target/nvptx/alloca-3-O1.c
Normal file
40
gcc/testsuite/gcc.target/nvptx/alloca-3-O1.c
Normal file
|
@ -0,0 +1,40 @@
|
|||
/* { dg-do assemble } */
|
||||
/* { dg-options {-O1 -mno-soft-stack} } */
|
||||
/* { dg-add-options nvptx_alloca_ptx } */
|
||||
/* { dg-additional-options -save-temps } */
|
||||
/* { dg-final { check-function-bodies {** } {} } } */
|
||||
|
||||
void sink(void *);
|
||||
|
||||
void *p;
|
||||
|
||||
void f(void)
|
||||
{
|
||||
p = __builtin_stack_save();
|
||||
sink(__builtin_alloca(25));
|
||||
__builtin_stack_restore(p);
|
||||
}
|
||||
/*
|
||||
** f:
|
||||
** \.visible \.func f
|
||||
** {
|
||||
** \.reg\.u64 (%r[0-9]+);
|
||||
** \.reg\.u64 (%r[0-9]+);
|
||||
** \.reg\.u64 (%r[0-9]+);
|
||||
** \.reg\.u64 (%r[0-9]+);
|
||||
** stacksave\.u64 \1;
|
||||
** st\.global\.u64 \[p\], \1;
|
||||
** {
|
||||
** \.reg\.u64 \2_local;
|
||||
** alloca\.u64 \2_local, 32;
|
||||
** cvta\.local\.u64 \2, \2_local;
|
||||
** }
|
||||
** add\.u64 \3, \2, 15;
|
||||
** and\.b64 \4, \3, -16;
|
||||
** {
|
||||
** \.param\.u64 %out_arg1;
|
||||
** st\.param\.u64 \[%out_arg1\], \4;
|
||||
** call sink, \(%out_arg1\);
|
||||
** }
|
||||
** ret;
|
||||
*/
|
55
gcc/testsuite/gcc.target/nvptx/alloca-4-O3.c
Normal file
55
gcc/testsuite/gcc.target/nvptx/alloca-4-O3.c
Normal file
|
@ -0,0 +1,55 @@
|
|||
/* { dg-do assemble } */
|
||||
/* { dg-options {-O3 -mno-soft-stack} } */
|
||||
/* { dg-add-options nvptx_alloca_ptx } */
|
||||
/* { dg-additional-options -save-temps } */
|
||||
/* { dg-final { check-function-bodies {** } {} } } */
|
||||
|
||||
void sink(void *);
|
||||
|
||||
void f(void)
|
||||
{
|
||||
void *p;
|
||||
p = __builtin_stack_save();
|
||||
sink(__builtin_alloca(25));
|
||||
__builtin_stack_restore(p);
|
||||
sink(__builtin_alloca(13));
|
||||
}
|
||||
/*
|
||||
** f:
|
||||
** .visible .func f
|
||||
** {
|
||||
** \.reg\.u64 (%r[0-9]+);
|
||||
** \.reg\.u64 (%r[0-9]+);
|
||||
** \.reg\.u64 (%r[0-9]+);
|
||||
** \.reg\.u64 (%r[0-9]+);
|
||||
** \.reg\.u64 (%r[0-9]+);
|
||||
** \.reg\.u64 (%r[0-9]+);
|
||||
** \.reg\.u64 (%r[0-9]+);
|
||||
** stacksave\.u64 \1;
|
||||
** {
|
||||
** \.reg\.u64 \2_local;
|
||||
** alloca\.u64 \2_local, 32;
|
||||
** cvta\.local\.u64 \2, \2_local;
|
||||
** }
|
||||
** add\.u64 \3, \2, 15;
|
||||
** and\.b64 \4, \3, -16;
|
||||
** {
|
||||
** \.param\.u64 %out_arg1;
|
||||
** st\.param\.u64 \[%out_arg1\], \4;
|
||||
** call sink, \(%out_arg1\);
|
||||
** }
|
||||
** stackrestore\.u64 \1;
|
||||
** {
|
||||
** \.reg\.u64 \5_local;
|
||||
** alloca\.u64 \5_local, 16;
|
||||
** cvta\.local\.u64 \5, \5_local;
|
||||
** }
|
||||
** add\.u64 \6, \5, 15;
|
||||
** and\.b64 \7, \6, -16;
|
||||
** {
|
||||
** \.param\.u64 %out_arg1;
|
||||
** st\.param\.u64 \[%out_arg1\], \7;
|
||||
** call sink, \(%out_arg1\);
|
||||
** }
|
||||
** ret;
|
||||
*/
|
107
gcc/testsuite/gcc.target/nvptx/alloca-5.c
Normal file
107
gcc/testsuite/gcc.target/nvptx/alloca-5.c
Normal file
|
@ -0,0 +1,107 @@
|
|||
/* { dg-do link } */
|
||||
/* { dg-do run { target nvptx_runtime_alloca_ptx } } */
|
||||
/* { dg-options {-O2 -mno-soft-stack} } */
|
||||
/* { dg-add-options nvptx_alloca_ptx } */
|
||||
/* { dg-additional-options -save-temps } */
|
||||
/* { dg-final { check-function-bodies {** } {} } } */
|
||||
|
||||
/* See also 'gcc.target/nvptx/softstack.c'. */
|
||||
|
||||
static __attribute__((noipa)) int f(int *p)
|
||||
{
|
||||
return __sync_lock_test_and_set(p, 1);
|
||||
}
|
||||
/*
|
||||
** f:
|
||||
** \.func \(\.param\.u32 %value_out\) f \(\.param\.u64 %in_ar0\)
|
||||
** {
|
||||
** \.reg\.u32 %value;
|
||||
** \.reg\.u64 %ar0;
|
||||
** ld\.param\.u64 %ar0, \[%in_ar0\];
|
||||
** \.reg\.u32 (%r[0-9]+);
|
||||
** \.reg\.u64 (%r[0-9]+);
|
||||
** mov\.u64 \2, %ar0;
|
||||
** atom\.exch\.b32 \1, \[\2\], 1;
|
||||
** membar\.sys;
|
||||
** mov\.u32 %value, \1;
|
||||
** st\.param\.u32 \[%value_out\], %value;
|
||||
** ret;
|
||||
*/
|
||||
|
||||
static __attribute__((noipa)) int g(int n)
|
||||
{
|
||||
/* Check that variable-length stack allocation works. */
|
||||
int v[n];
|
||||
v[0] = 0;
|
||||
/* Check that atomic operations can be applied to auto data. */
|
||||
return f(v) == 0 && v[0] == 1;
|
||||
}
|
||||
/*
|
||||
** g:
|
||||
** \.func \(\.param\.u32 %value_out\) g \(\.param\.u32 %in_ar0\)
|
||||
** {
|
||||
** \.reg\.u32 %value;
|
||||
** \.reg\.u32 %ar0;
|
||||
** ld\.param\.u32 %ar0, \[%in_ar0\];
|
||||
** \.reg\.u32 (%r[0-9]+);
|
||||
** \.reg\.u32 (%r[0-9]+);
|
||||
** \.reg\.u64 (%r[0-9]+);
|
||||
** \.reg\.u64 (%r[0-9]+);
|
||||
** \.reg\.u64 (%r[0-9]+);
|
||||
** \.reg\.u64 (%r[0-9]+);
|
||||
** \.reg\.u64 (%r[0-9]+);
|
||||
** \.reg\.u64 (%r[0-9]+);
|
||||
** \.reg\.u64 (%r[0-9]+);
|
||||
** \.reg\.u32 (%r[0-9]+);
|
||||
** \.reg\.u32 (%r[0-9]+);
|
||||
** \.reg\.pred (%r[0-9]+);
|
||||
** \.reg\.u32 (%r[0-9]+);
|
||||
** \.reg\.pred (%r[0-9]+);
|
||||
** mov\.u32 \2, %ar0;
|
||||
** cvt\.s64\.s32 \3, \2;
|
||||
** shl\.b64 \4, \3, 2;
|
||||
** add\.u64 \5, \4, 15;
|
||||
** and\.b64 \6, \5, -16;
|
||||
** {
|
||||
** \.reg\.u64 \7_local;
|
||||
** alloca\.u64 \7_local, \6;
|
||||
** cvta\.local\.u64 \7, \7_local;
|
||||
** }
|
||||
** add\.u64 \8, \7, 3;
|
||||
** and\.b64 \9, \8, -4;
|
||||
** mov\.u32 \10, 0;
|
||||
** st\.u32 \[\9\], \10;
|
||||
** {
|
||||
** \.param\.u32 %value_in;
|
||||
** \.param\.u64 %out_arg1;
|
||||
** st\.param\.u64 \[%out_arg1\], \9;
|
||||
** call \(%value_in\), f, \(%out_arg1\);
|
||||
** ld\.param\.u32 \11, \[%value_in\];
|
||||
** }
|
||||
** setp\.ne\.u32 \12, \11, 0;
|
||||
** @\12 bra (\$L[0-9]+);
|
||||
** ld\.u32 \13, \[\9\];
|
||||
** setp\.eq\.u32 \14, \13, 1;
|
||||
** selp\.u32 \1, 1, 0, \14;
|
||||
** bra (\$L[0-9]+);
|
||||
** \15:
|
||||
** mov\.u32 \1, \10;
|
||||
** \16:
|
||||
** mov\.u32 %value, \1;
|
||||
** st\.param\.u32 \[%value_out\], %value;
|
||||
** ret;
|
||||
*/
|
||||
|
||||
int main()
|
||||
{
|
||||
if (!g(1))
|
||||
__builtin_abort();
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* PTX 'atom' isn't acceptable for '.local' memory:
|
||||
'operation not supported on global/shared address space' [sic]
|
||||
('CUDA_ERROR_INVALID_ADDRESS_SPACE'), thus FAILs for 'alloca'ed memory.
|
||||
We'd have to use the 'nvptx_mem_local_p' replacements, but currently lack a
|
||||
mechanism for doing so (TODO).
|
||||
{ dg-xfail-run-if TODO { *-*-* } } */
|
|
@ -1,6 +1,8 @@
|
|||
/* { dg-options "-O2 -msoft-stack" } */
|
||||
/* { dg-do run } */
|
||||
|
||||
/* See also 'gcc.target/nvptx/alloca-5.c'. */
|
||||
|
||||
static __attribute__((noinline,noclone)) int f(int *p)
|
||||
{
|
||||
return __sync_lock_test_and_set(p, 1);
|
||||
|
|
29
gcc/testsuite/gcc.target/nvptx/vla-1-O0.c
Normal file
29
gcc/testsuite/gcc.target/nvptx/vla-1-O0.c
Normal file
|
@ -0,0 +1,29 @@
|
|||
/* { dg-do assemble } */
|
||||
/* { dg-options {-O0 -mno-soft-stack} } */
|
||||
/* { dg-add-options nvptx_alloca_ptx } */
|
||||
/* { dg-additional-options -save-temps } */
|
||||
/* { dg-final { check-function-bodies {**} {} } } */
|
||||
|
||||
void sink(void *);
|
||||
|
||||
void f(int s)
|
||||
{
|
||||
char a[s];
|
||||
sink(a);
|
||||
}
|
||||
/*
|
||||
** f:
|
||||
** ...
|
||||
** cvt\.s64\.s32 (%r[0-9]+), (%r[0-9]+);
|
||||
** mov\.u64 (%r[0-9]+), 16;
|
||||
** add\.u64 (%r[0-9]+), \3, -1;
|
||||
** add\.u64 (%r[0-9]+), \1, \4;
|
||||
** div\.u64 (%r[0-9]+), \5, 16;
|
||||
** mul\.lo\.u64 (%r[0-9]+), \6, 16;
|
||||
** {
|
||||
** \.reg\.u64 (%r[0-9]+)_local;
|
||||
** alloca\.u64 \8_local, \7;
|
||||
** cvta\.local\.u64 \8, \8_local;
|
||||
** }
|
||||
** ...
|
||||
*/
|
40
gcc/testsuite/gcc.target/nvptx/vla-1-O1.c
Normal file
40
gcc/testsuite/gcc.target/nvptx/vla-1-O1.c
Normal file
|
@ -0,0 +1,40 @@
|
|||
/* { dg-do assemble } */
|
||||
/* { dg-options {-O1 -mno-soft-stack} } */
|
||||
/* { dg-add-options nvptx_alloca_ptx } */
|
||||
/* { dg-additional-options -save-temps } */
|
||||
/* { dg-final { check-function-bodies {** } {} } } */
|
||||
|
||||
void sink(void *);
|
||||
|
||||
void f(int s)
|
||||
{
|
||||
char a[s];
|
||||
sink(a);
|
||||
}
|
||||
/*
|
||||
** f:
|
||||
** \.visible \.func f \(\.param\.u32 %in_ar0\)
|
||||
** {
|
||||
** \.reg\.u32 %ar0;
|
||||
** ld\.param\.u32 %ar0, \[%in_ar0\];
|
||||
** \.reg\.u32 (%r[0-9]+);
|
||||
** \.reg\.u64 (%r[0-9]+);
|
||||
** \.reg\.u64 (%r[0-9]+);
|
||||
** \.reg\.u64 (%r[0-9]+);
|
||||
** \.reg\.u64 (%r[0-9]+);
|
||||
** mov\.u32 \1, %ar0;
|
||||
** cvt\.s64\.s32 \2, \1;
|
||||
** add\.u64 \3, \2, 15;
|
||||
** and\.b64 \4, \3, -16;
|
||||
** {
|
||||
** \.reg\.u64 \5_local;
|
||||
** alloca\.u64 \5_local, \4;
|
||||
** cvta\.local\.u64 \5, \5_local;
|
||||
** }
|
||||
** {
|
||||
** \.param\.u64 %out_arg1;
|
||||
** st\.param\.u64 \[%out_arg1\], \5;
|
||||
** call sink, \(%out_arg1\);
|
||||
** }
|
||||
** ret;
|
||||
*/
|
|
@ -1,5 +1,6 @@
|
|||
/* { dg-do compile } */
|
||||
/* { dg-options -mno-soft-stack } */
|
||||
/* { dg-additional-options -march=sm_30 } */
|
||||
|
||||
void sink(void *);
|
||||
|
|
@ -1009,9 +1009,37 @@ proc check_effective_target_alloca {} {
|
|||
return 0
|
||||
}
|
||||
if { [istarget nvptx-*-*] } {
|
||||
# For nvptx, 'alloca' support depends on the configuration. In case
|
||||
# of PTX "native" stacks, for 'dg-do run', it additionally depends on
|
||||
# runtime support.
|
||||
if { ![check_effective_target_nvptx_softstack] } {
|
||||
return 0
|
||||
# '-mno-soft-stack': PTX "native" stacks
|
||||
|
||||
# Not supported unless '-mptx=7.3'+ and '-march=sm_52'+.
|
||||
if { !([check_nvptx_default_ptx_isa_version_at_least 7 3]
|
||||
&& [check_nvptx_default_ptx_isa_target_architecture_at_least sm_52]) } {
|
||||
return 0
|
||||
}
|
||||
|
||||
# Find 'dg-do-what' in an outer frame.
|
||||
set level 1
|
||||
while true {
|
||||
upvar $level dg-do-what dg-do-what
|
||||
if [info exists dg-do-what] then break
|
||||
incr level
|
||||
}
|
||||
verbose "check_effective_target_alloca: found dg-do-what at level $level" 2
|
||||
|
||||
if { [string equal [lindex ${dg-do-what} 0] run] } {
|
||||
# For 'dg-do run', it additionally depends on runtime support.
|
||||
# (If not supported, we don't try to demote 'run' to 'link',
|
||||
# but instead simply fail the effective-target 'alloca' check.)
|
||||
return [check_effective_target_nvptx_runtime_alloca_ptx]
|
||||
} else {
|
||||
return 1
|
||||
}
|
||||
} else {
|
||||
# '-msoft-stack'
|
||||
return 1
|
||||
}
|
||||
}
|
||||
|
@ -14100,6 +14128,35 @@ proc check_effective_target_nvptx_default_ptx_isa_version_at_least_6_0 { } {
|
|||
return [check_nvptx_default_ptx_isa_version_at_least 6 0]
|
||||
}
|
||||
|
||||
# Return 1 if nvptx code by default compiles for at least the specified PTX ISA
|
||||
# target architecture.
|
||||
|
||||
proc check_nvptx_default_ptx_isa_target_architecture_at_least { ta } {
|
||||
set name nvptx_default_ptx_isa_target_architecture_at_least_${ta}
|
||||
|
||||
if [regexp {^sm_(\d+)$} $ta dummy ptx_sm] {
|
||||
set ptx_sm "${ptx_sm}0"
|
||||
} else {
|
||||
error "check_nvptx_default_ptx_isa_target_architecture_at_least: illegal argument: $ta"
|
||||
}
|
||||
|
||||
set supported_p \
|
||||
[concat \
|
||||
"(__PTX_SM__ >= $ptx_sm)"]
|
||||
|
||||
set src \
|
||||
[list \
|
||||
"#if $supported_p" \
|
||||
"#else" \
|
||||
"#error unsupported" \
|
||||
"#endif"]
|
||||
set src [join $src "\n"]
|
||||
|
||||
set res [check_no_compiler_messages $name assembly $src ""]
|
||||
|
||||
return $res
|
||||
}
|
||||
|
||||
# Return 1 if nvptx '-msoft-stack' is enabled.
|
||||
|
||||
proc check_effective_target_nvptx_softstack { } {
|
||||
|
@ -14132,6 +14189,28 @@ proc check_nvptx_runtime_ptx_isa_version_at_least { major minor } {
|
|||
return $res
|
||||
}
|
||||
|
||||
# Return 1 if nvptx code with the specified PTX ISA target architecture or
|
||||
# higher can be run.
|
||||
|
||||
proc check_nvptx_runtime_ptx_isa_target_architecture_at_least { ta } {
|
||||
set name nvptx_runtime_ptx_isa_target_architecture_${ta}
|
||||
|
||||
set default \
|
||||
[check_nvptx_default_ptx_isa_target_architecture_at_least ${ta}]
|
||||
|
||||
if { $default } {
|
||||
set flag ""
|
||||
} else {
|
||||
set flag "-march=$ta -mptx=_"
|
||||
}
|
||||
|
||||
set res [check_runtime $name {
|
||||
int main (void) { return 0; }
|
||||
} $flag]
|
||||
|
||||
return $res
|
||||
}
|
||||
|
||||
# Return 1 if the nvptx runtime environment supports the PTX ISA directive
|
||||
# '.alias'.
|
||||
|
||||
|
@ -14139,6 +14218,13 @@ proc check_effective_target_nvptx_runtime_alias_ptx { } {
|
|||
return [check_nvptx_runtime_ptx_isa_version_at_least 6 3]
|
||||
}
|
||||
|
||||
# Return 1 if the nvptx runtime environment supports PTX 'alloca'.
|
||||
|
||||
proc check_effective_target_nvptx_runtime_alloca_ptx { } {
|
||||
return [expr { [check_nvptx_runtime_ptx_isa_version_at_least 7 3]
|
||||
&& [check_nvptx_runtime_ptx_isa_target_architecture_at_least sm_52] }]
|
||||
}
|
||||
|
||||
# Add options to enable nvptx using the PTX ISA directive '.alias'.
|
||||
|
||||
proc add_options_for_nvptx_alias_ptx { flags } {
|
||||
|
@ -14150,3 +14236,20 @@ proc add_options_for_nvptx_alias_ptx { flags } {
|
|||
|
||||
return $flags
|
||||
}
|
||||
|
||||
# Add options to enable nvptx using PTX 'alloca'.
|
||||
|
||||
proc add_options_for_nvptx_alloca_ptx { flags } {
|
||||
# We don't add '-mno-soft-stack' here; the users should take care of that
|
||||
# explicitly.
|
||||
|
||||
if { ![check_nvptx_default_ptx_isa_version_at_least 7 3] } {
|
||||
append flags " -mptx=7.3"
|
||||
}
|
||||
|
||||
if { ![check_nvptx_default_ptx_isa_target_architecture_at_least sm_52] } {
|
||||
append flags " -march=sm_52"
|
||||
}
|
||||
|
||||
return $flags
|
||||
}
|
||||
|
|
|
@ -846,8 +846,8 @@ omp_get_device_from_uid_ (const char *uid, size_t uid_len)
|
|||
/* Inside the target region, invoking this routine is undefined
|
||||
behavior; thus, resolve it already here - instead of inside
|
||||
libgomp/config/.../target.c.
|
||||
Note that on nvptx __builtin_alloca is defined, but fails with a sorry
|
||||
during compilation, as it is unsupported until isa 7.3 / sm_52. */
|
||||
This also circumvents issues due to not all nvptx configurations
|
||||
supporting 'alloca'. */
|
||||
return omp_invalid_device;
|
||||
#endif
|
||||
}
|
||||
|
|
|
@ -1,12 +1,5 @@
|
|||
! { dg-do run }
|
||||
|
||||
! PR65181 "Support for alloca in nvptx"
|
||||
! { dg-excess-errors "lto1, mkoffload and lto-wrapper fatal errors" { target openacc_nvidia_accel_selected } }
|
||||
! Aside from restricting this testcase to non-nvptx offloading, and duplicating
|
||||
! it with 'dg-do link' for nvptx offloading, there doesn't seem to be a way to
|
||||
! XFAIL the "UNRESOLVED: [...] compilation failed to produce executable", or
|
||||
! get rid of it, unfortunately.
|
||||
|
||||
! { dg-additional-options "-fopt-info-note-omp" }
|
||||
! { dg-additional-options "--param=openacc-privatization=noisy" }
|
||||
! { dg-additional-options "-foffload=-fopt-info-note-omp" }
|
||||
|
@ -59,7 +52,6 @@ contains
|
|||
! { dg-note {variable 'array' in 'private' clause is candidate for adjusting OpenACC privatization level} "" { target *-*-* } l_loop$c_loop }
|
||||
! { dg-note {variable 'array' ought to be adjusted for OpenACC privatization level: 'gang'} "" { target *-*-* } l_loop$c_loop }
|
||||
! { dg-note {variable 'array' adjusted for OpenACC privatization level: 'gang'} "" { target { ! { openacc_host_selected || { openacc_nvidia_accel_selected && __OPTIMIZE__ } } } } l_loop$c_loop }
|
||||
! { dg-message {sorry, unimplemented: target cannot support alloca} PR65181 { target openacc_nvidia_accel_selected } l_loop$c_loop }
|
||||
do i = 1, 10
|
||||
array(i) = i
|
||||
end do
|
||||
|
@ -91,7 +83,6 @@ contains
|
|||
! { dg-note {variable 'array\.[0-9]+' in 'private' clause is candidate for adjusting OpenACC privatization level} "" { target *-*-* } l_loop$c_loop }
|
||||
! { dg-note {variable 'array\.[0-9]+' ought to be adjusted for OpenACC privatization level: 'gang'} "" { target *-*-* } l_loop$c_loop }
|
||||
! { dg-note {variable 'array\.[0-9]+' adjusted for OpenACC privatization level: 'gang'} "" { target { ! { openacc_host_selected || { openacc_nvidia_accel_selected && __OPTIMIZE__ } } } } l_loop$c_loop }
|
||||
! { dg-message {sorry, unimplemented: target cannot support alloca} PR65181 { target openacc_nvidia_accel_selected } l_loop$c_loop }
|
||||
do i = 1, 10
|
||||
array(i) = 9*i
|
||||
end do
|
||||
|
@ -117,7 +108,6 @@ contains
|
|||
! { dg-note {variable 'str' ought to be adjusted for OpenACC privatization level: 'gang'} "" { target *-*-* } l_loop$c_loop }
|
||||
! { dg-note {variable 'str' adjusted for OpenACC privatization level: 'gang'} "" { target { ! { openacc_host_selected || { openacc_nvidia_accel_selected && __OPTIMIZE__ } } } } l_loop$c_loop }
|
||||
! { dg-note {variable 'char\.[0-9]+' declared in block isn't candidate for adjusting OpenACC privatization level: artificial} "" { target *-*-* } l_loop$c_loop }
|
||||
! { dg-message {sorry, unimplemented: target cannot support alloca} PR65181 { target openacc_nvidia_accel_selected } l_loop$c_loop }
|
||||
do i = 1, 10
|
||||
str(i:i) = achar(ichar('A') + i)
|
||||
end do
|
||||
|
|
Loading…
Add table
Reference in a new issue