nvptx: PTX 'alloca' for '-mptx=7.3'+, '-march=sm_52'+ [PR65181]

..., and use it for '-mno-soft-stack': PTX "native" stacks.

	PR target/65181
	gcc/
	* config/nvptx/nvptx.cc (nvptx_get_drap_rtx): Handle
	'!TARGET_SOFT_STACK'.
	* config/nvptx/nvptx.md (define_c_enum "unspec"): Add
	'UNSPEC_STACKSAVE', 'UNSPEC_STACKRESTORE'.
	(define_expand "allocate_stack", define_expand "save_stack_block")
	(define_expand "restore_stack_block"): Handle '!TARGET_SOFT_STACK',
	PTX 'alloca'.
	(define_insn "@nvptx_alloca_<mode>")
	(define_insn "@nvptx_stacksave_<mode>")
	(define_insn "@nvptx_stackrestore_<mode>"): New.
	* doc/invoke.texi (Nvidia PTX Options): Update '-msoft-stack',
	'-mno-soft-stack'.
	* doc/sourcebuild.texi (nvptx-specific attributes): Document
	'nvptx_runtime_alloca_ptx'.
	(Add Options): Document 'nvptx_alloca_ptx'.
	gcc/testsuite/
	* gcc.target/nvptx/alloca-1.c: Evolve into...
	* gcc.target/nvptx/alloca-1-O0.c: ... this, ...
	* gcc.target/nvptx/alloca-1-O1.c: ... this, and...
	* gcc.target/nvptx/alloca-1-sm_30.c: ... this.
	* gcc.target/nvptx/vla-1.c: Evolve into...
	* gcc.target/nvptx/vla-1-O0.c: ... this, ...
	* gcc.target/nvptx/vla-1-O1.c: ... this, and...
	* gcc.target/nvptx/vla-1-sm_30.c: ... this.
	* gcc.c-torture/execute/pr36321.c: Adjust.
	* gcc.target/nvptx/__builtin_alloca_0-1-O0.c: Likewise.
	* gcc.target/nvptx/__builtin_alloca_0-1-O1.c: Likewise.
	* gcc.target/nvptx/__builtin_stack_save___builtin_stack_restore-1.c:
	Likewise.
	* gcc.target/nvptx/softstack.c: Likewise.
	* gcc.target/nvptx/__builtin_stack_save___builtin_stack_restore-1-sm_30.c:
	New.
	* gcc.target/nvptx/alloca-2-O0.c: Likewise.
	* gcc.target/nvptx/alloca-3-O1.c: Likewise.
	* gcc.target/nvptx/alloca-4-O3.c: Likewise.
	* gcc.target/nvptx/alloca-5.c: Likewise.
	* lib/target-supports.exp (check_effective_target_alloca): Adjust.
	(check_nvptx_default_ptx_isa_target_architecture_at_least)
	(check_nvptx_runtime_ptx_isa_target_architecture_at_least)
	(check_effective_target_nvptx_runtime_alloca_ptx)
	(add_options_for_nvptx_alloca_ptx): New.
	libgomp/
	* fortran.c (omp_get_device_from_uid_): Adjust.
	* testsuite/libgomp.oacc-fortran/privatized-ref-2.f90: Likewise.
This commit is contained in:
Thomas Schwinge 2024-12-07 00:17:49 +01:00
parent 1db025c67f
commit 3861d362ec
23 changed files with 611 additions and 35 deletions

View file

@ -245,7 +245,7 @@ default_ptx_version_option (void)
warp convergence. */
res = MAX (res, PTX_VERSION_6_0);
/* For sm_52+, pick at least 7.3. */
/* For sm_52+, pick at least 7.3, to enable PTX 'alloca'. */
if (ptx_isa_option >= PTX_ISA_SM52)
res = MAX (res, PTX_VERSION_7_3);
@ -1797,7 +1797,7 @@ nvptx_function_ok_for_sibcall (tree, tree)
static rtx
nvptx_get_drap_rtx (void)
{
if (TARGET_SOFT_STACK && stack_realign_drap)
if (stack_realign_drap)
return arg_pointer_rtx;
return NULL_RTX;
}

View file

@ -35,8 +35,9 @@
UNSPEC_FPINT_NEARBYINT
UNSPEC_ALLOCA
UNSPEC_SET_SOFTSTACK
UNSPEC_STACKSAVE
UNSPEC_STACKRESTORE
UNSPEC_DIM_SIZE
@ -1663,22 +1664,47 @@
(match_operand 1 "nvptx_register_operand")]
""
{
if (TARGET_SOFT_STACK)
if (!TARGET_SOFT_STACK
&& TARGET_PTX_7_3
&& TARGET_SM52)
emit_insn (gen_nvptx_alloca (Pmode, operands[0], operands[1]));
else if (!TARGET_SOFT_STACK)
{
sorry ("target cannot support alloca");
emit_insn (gen_nop ());
}
else if (TARGET_SOFT_STACK)
{
emit_move_insn (stack_pointer_rtx,
gen_rtx_MINUS (Pmode, stack_pointer_rtx, operands[1]));
emit_insn (gen_set_softstack (Pmode, stack_pointer_rtx));
emit_move_insn (operands[0], virtual_stack_dynamic_rtx);
DONE;
}
/* The ptx documentation specifies an alloca intrinsic (for 32 bit
only) but notes it is not implemented. The assembler emits a
confused error message. Issue a blunt one now instead. */
sorry ("target cannot support alloca");
emit_insn (gen_nop ());
else
gcc_unreachable ();
DONE;
})
(define_insn "@nvptx_alloca_<mode>"
[(set (match_operand:P 0 "nvptx_register_operand" "=R")
(unspec:P [(match_operand:P 1 "nvptx_nonmemory_operand" "Ri")]
UNSPEC_ALLOCA))]
"TARGET_PTX_7_3
&& TARGET_SM52"
{
/* Convert the address from '.local' state space to generic. That way,
we don't have to use 'st.local', 'ld.local', and can easily pass the
address to other "generic functions".
TODO 'gcc.target/nvptx/alloca-5.c' */
output_asm_insn ("{", NULL);
output_asm_insn ("\\t.reg%t0\\t%0_local;", operands);
output_asm_insn ("\\talloca%u0\\t%0_local, %1;", operands);
output_asm_insn ("\\tcvta.local%u0\\t%0, %0_local;", operands);
output_asm_insn ("}", NULL);
return "";
}
[(set_attr "predicable" "no")])
(define_insn "@set_softstack_<mode>"
[(unspec [(match_operand:P 0 "nvptx_register_operand" "R")]
UNSPEC_SET_SOFTSTACK)]
@ -1692,30 +1718,64 @@
(match_operand 1 "register_operand" "")]
"!TARGET_SOFT_STACK"
{
/* The concept of a '%stack' pointer doesn't apply like this for
PTX "native" stacks. GCC however occasionally synthesizes
'__builtin_stack_save ()', '__builtin_stack_restore ()', and isn't able to
optimize them all away. Just submit a dummy -- user code shouldn't be
able to observe this. */
emit_move_insn (operands[0], GEN_INT (0xdeadbeef));
if (TARGET_PTX_7_3
&& TARGET_SM52)
{
gcc_checking_assert (REG_P (operands[0]));
emit_insn (gen_nvptx_stacksave (Pmode, operands[0], operands[1]));
}
else
{
/* The concept of a '%stack' pointer doesn't apply like this.
GCC however occasionally synthesizes '__builtin_stack_save ()',
'__builtin_stack_restore ()', and isn't able to optimize them all
away. Just submit a dummy -- user code shouldn't be able to observe
this. */
emit_move_insn (operands[0], GEN_INT (0xdeadbeef));
}
DONE;
})
(define_insn "@nvptx_stacksave_<mode>"
[(set (match_operand:P 0 "nvptx_register_operand" "=R")
(unspec:P [(match_operand:P 1 "register_operand" "R")]
UNSPEC_STACKSAVE))]
"TARGET_PTX_7_3
&& TARGET_SM52"
"%.\\tstacksave%u0\\t%0;")
(define_expand "restore_stack_block"
[(match_operand 0 "register_operand" "")
(match_operand 1 "register_operand" "")]
""
{
if (!TARGET_SOFT_STACK)
if (!TARGET_SOFT_STACK
&& TARGET_PTX_7_3
&& TARGET_SM52)
{
operands[1] = force_reg (Pmode, operands[1]);
emit_insn (gen_nvptx_stackrestore (Pmode, operands[0], operands[1]));
}
else if (!TARGET_SOFT_STACK)
; /* See 'save_stack_block'. */
else
else if (TARGET_SOFT_STACK)
{
emit_move_insn (operands[0], operands[1]);
emit_insn (gen_set_softstack (Pmode, operands[0]));
}
else
gcc_unreachable ();
DONE;
})
(define_insn "@nvptx_stackrestore_<mode>"
[(set (match_operand:P 0 "nvptx_register_operand" "=R")
(unspec:P [(match_operand:P 1 "nvptx_register_operand" "R")]
UNSPEC_STACKRESTORE))]
"TARGET_PTX_7_3
&& TARGET_SM52"
"%.\\tstackrestore%u1\\t%1;")
(define_expand "save_stack_function"
[(match_operand 0 "register_operand" "")
(match_operand 1 "register_operand" "")]

View file

@ -30232,8 +30232,19 @@ Apply partitioned execution optimizations. This is the default when any
level of optimization is selected.
@opindex msoft-stack
@opindex mno-soft-stack
@item -msoft-stack
Generate code that does not use @code{.local} memory
@itemx -mno-soft-stack
For @option{-mno-soft-stack} (the default, unless @option{-mgomp} has
been specified), use PTX ``native'' stacks, that is,
generate code that uses @code{.local} memory or PTX @code{alloca}
directly for stack storage.
Unless @option{-mptx=7.3} or higher and @option{-march=sm_52} or
higher are active, variable-length arrays and dynamically allocating
memory on the stack with @code{alloca} are not supported.
For @option{-msoft-stack} (implied by @option{-mgomp}),
generate code that does not use @code{.local} memory or PTX @code{alloca}
directly for stack storage. Instead, a per-warp stack pointer is
maintained explicitly. This enables variable-length stack allocation (with
variable-length arrays or @code{alloca}), and when global memory is used for

View file

@ -2434,6 +2434,9 @@ nvptx code by default compiles for at least PTX ISA version 6.0.
The nvptx runtime environment supports the PTX ISA directive
@code{.alias}.
@item nvptx_runtime_alloca_ptx
The nvptx runtime environment supports PTX 'alloca'.
@item nvptx_softstack
nvptx @option{-msoft-stack} is enabled.
@end table
@ -3359,6 +3362,9 @@ Only MIPS targets support this feature, and only then in certain modes.
@item nvptx_alias_ptx
Enable using the PTX ISA directive @code{.alias} on nvptx targets.
@item nvptx_alloca_ptx
Enable PTX 'alloca' on nvptx targets.
@item riscv_a
Add the 'A' extension to the -march string on RISC-V targets.

View file

@ -1,4 +1,7 @@
/* { dg-skip-if "requires alloca" { ! alloca } { "-O0" } { "" } } */
/* See 'gcc.target/nvptx/__builtin_alloca_0-1-O0.c'.
{ dg-xfail-if TODO { nvptx-*-* && { ! nvptx_softstack } } { "-O0" } { "" } } */
extern void abort (void);
extern __SIZE_TYPE__ strlen (const char *);

View file

@ -6,6 +6,8 @@
/* { dg-additional-options -save-temps } */
/* { dg-final { check-function-bodies {** } {} } } */
/* See 'gcc.c-torture/execute/pr36321.c', '-O0'. */
void sink(void *);
void f(void)

View file

@ -6,6 +6,8 @@
/* { dg-additional-options -save-temps } */
/* { dg-final { check-function-bodies {** } {} } } */
/* See 'gcc.c-torture/execute/pr36321.c', '-O0'. */
void sink(void *);
void f(void)

View file

@ -0,0 +1,28 @@
/* Document what we do for '__builtin_stack_save()', '__builtin_stack_restore()'. */
/* { dg-do assemble } */
/* { dg-options {-O3 -mno-soft-stack} } */
/* { dg-additional-options -march=sm_30 } */
/* { dg-additional-options -save-temps } */
/* { dg-final { check-function-bodies {** } {} } } */
void *p;
void f(void)
{
// 0xdeadbeef
p = __builtin_stack_save();
asm volatile ("" : : : "memory");
// no-op
__builtin_stack_restore(p);
asm volatile ("" : : : "memory");
}
/*
** f:
** \.visible \.func f
** {
** \.reg\.u64 (%r[0-9]+);
** mov\.u64 \1, 3735928559;
** st\.global\.u64 \[p\], \1;
** ret;
*/

View file

@ -2,6 +2,7 @@
/* { dg-do assemble } */
/* { dg-options {-O3 -mno-soft-stack} } */
/* { dg-add-options nvptx_alloca_ptx } */
/* { dg-additional-options -save-temps } */
/* { dg-final { check-function-bodies {** } {} } } */
@ -9,10 +10,8 @@ void *p;
void f(void)
{
// 0xdeadbeef
p = __builtin_stack_save();
asm volatile ("" : : : "memory");
// no-op
__builtin_stack_restore(p);
asm volatile ("" : : : "memory");
}
@ -21,7 +20,10 @@ void f(void)
** \.visible \.func f
** {
** \.reg\.u64 (%r[0-9]+);
** mov\.u64 \1, 3735928559;
** \.reg\.u64 (%r[0-9]+);
** stacksave\.u64 \1;
** st\.global\.u64 \[p\], \1;
** ld\.global\.u64 \2, \[p\];
** stackrestore\.u64 \2;
** ret;
*/

View file

@ -0,0 +1,49 @@
/* { dg-do assemble } */
/* { dg-options {-O0 -mno-soft-stack} } */
/* { dg-add-options nvptx_alloca_ptx } */
/* { dg-additional-options -save-temps } */
/* { dg-final { check-function-bodies {** } {} } } */
void sink(void *);
void f(void)
{
sink(__builtin_alloca(123));
}
/*
** f:
** \.visible \.func f
** {
** \.reg\.u64 (%r[0-9]+);
** \.reg\.u64 (%r[0-9]+);
** \.reg\.u64 (%r[0-9]+);
** \.reg\.u64 (%r[0-9]+);
** \.reg\.u64 (%r[0-9]+);
** \.reg\.u64 (%r[0-9]+);
** \.reg\.u64 (%r[0-9]+);
** \.reg\.u64 (%r[0-9]+);
** \.reg\.u64 (%r[0-9]+);
** \.reg\.u64 (%r[0-9]+);
** \.reg\.u64 (%r[0-9]+);
** mov\.u64 \11, 16;
** add\.u64 \2, \11, -1;
** add\.u64 \3, \2, 123;
** div\.u64 \4, \3, 16;
** mul\.lo\.u64 \5, \4, 16;
** {
** \.reg\.u64 \6_local;
** alloca\.u64 \6_local, \5;
** cvta\.local\.u64 \6, \6_local;
** }
** add\.u64 \7, \6, 15;
** shr\.u64 \8, \7, 4;
** shl\.b64 \9, \8, 4;
** mov\.u64 \1, \9;
** mov\.u64 \10, \1;
** {
** \.param\.u64 %out_arg1;
** st\.param\.u64 \[%out_arg1\], \10;
** call sink, \(%out_arg1\);
** }
** ret;
*/

View file

@ -0,0 +1,33 @@
/* { dg-do assemble } */
/* { dg-options {-O1 -mno-soft-stack} } */
/* { dg-add-options nvptx_alloca_ptx } */
/* { dg-additional-options -save-temps } */
/* { dg-final { check-function-bodies {** } {} } } */
void sink(void *);
void f(void)
{
sink(__builtin_alloca(123));
}
/*
** f:
** \.visible \.func f
** {
** \.reg\.u64 (%r[0-9]+);
** \.reg\.u64 (%r[0-9]+);
** \.reg\.u64 (%r[0-9]+);
** {
** \.reg\.u64 \1_local;
** alloca\.u64 \1_local, 128;
** cvta\.local\.u64 \1, \1_local;
** }
** add\.u64 \2, \1, 15;
** and\.b64 \3, \2, -16;
** {
** \.param\.u64 %out_arg1;
** st\.param\.u64 \[%out_arg1\], \3;
** call sink, \(%out_arg1\);
** }
** ret;
*/

View file

@ -1,5 +1,6 @@
/* { dg-do compile } */
/* { dg-options -mno-soft-stack } */
/* { dg-additional-options -march=sm_30 } */
void sink(void *);

View file

@ -0,0 +1,12 @@
/* { dg-do link } */
/* { dg-do run { target nvptx_runtime_alloca_ptx } } */
/* { dg-options {-O0 -mno-soft-stack} } */
/* { dg-add-options nvptx_alloca_ptx } */
/* { dg-additional-options -save-temps } */
int
main(void)
{
return !(__builtin_alloca(100) != __builtin_alloca(10));
}
/* { dg-final { scan-assembler-times {(?n)\talloca\.u64\t%r[0-9]+_local, %r[0-9]+;$} 2 } } */

View file

@ -0,0 +1,40 @@
/* { dg-do assemble } */
/* { dg-options {-O1 -mno-soft-stack} } */
/* { dg-add-options nvptx_alloca_ptx } */
/* { dg-additional-options -save-temps } */
/* { dg-final { check-function-bodies {** } {} } } */
void sink(void *);
void *p;
void f(void)
{
p = __builtin_stack_save();
sink(__builtin_alloca(25));
__builtin_stack_restore(p);
}
/*
** f:
** \.visible \.func f
** {
** \.reg\.u64 (%r[0-9]+);
** \.reg\.u64 (%r[0-9]+);
** \.reg\.u64 (%r[0-9]+);
** \.reg\.u64 (%r[0-9]+);
** stacksave\.u64 \1;
** st\.global\.u64 \[p\], \1;
** {
** \.reg\.u64 \2_local;
** alloca\.u64 \2_local, 32;
** cvta\.local\.u64 \2, \2_local;
** }
** add\.u64 \3, \2, 15;
** and\.b64 \4, \3, -16;
** {
** \.param\.u64 %out_arg1;
** st\.param\.u64 \[%out_arg1\], \4;
** call sink, \(%out_arg1\);
** }
** ret;
*/

View file

@ -0,0 +1,55 @@
/* { dg-do assemble } */
/* { dg-options {-O3 -mno-soft-stack} } */
/* { dg-add-options nvptx_alloca_ptx } */
/* { dg-additional-options -save-temps } */
/* { dg-final { check-function-bodies {** } {} } } */
void sink(void *);
void f(void)
{
void *p;
p = __builtin_stack_save();
sink(__builtin_alloca(25));
__builtin_stack_restore(p);
sink(__builtin_alloca(13));
}
/*
** f:
** \.visible \.func f
** {
** \.reg\.u64 (%r[0-9]+);
** \.reg\.u64 (%r[0-9]+);
** \.reg\.u64 (%r[0-9]+);
** \.reg\.u64 (%r[0-9]+);
** \.reg\.u64 (%r[0-9]+);
** \.reg\.u64 (%r[0-9]+);
** \.reg\.u64 (%r[0-9]+);
** stacksave\.u64 \1;
** {
** \.reg\.u64 \2_local;
** alloca\.u64 \2_local, 32;
** cvta\.local\.u64 \2, \2_local;
** }
** add\.u64 \3, \2, 15;
** and\.b64 \4, \3, -16;
** {
** \.param\.u64 %out_arg1;
** st\.param\.u64 \[%out_arg1\], \4;
** call sink, \(%out_arg1\);
** }
** stackrestore\.u64 \1;
** {
** \.reg\.u64 \5_local;
** alloca\.u64 \5_local, 16;
** cvta\.local\.u64 \5, \5_local;
** }
** add\.u64 \6, \5, 15;
** and\.b64 \7, \6, -16;
** {
** \.param\.u64 %out_arg1;
** st\.param\.u64 \[%out_arg1\], \7;
** call sink, \(%out_arg1\);
** }
** ret;
*/

View file

@ -0,0 +1,107 @@
/* { dg-do link } */
/* { dg-do run { target nvptx_runtime_alloca_ptx } } */
/* { dg-options {-O2 -mno-soft-stack} } */
/* { dg-add-options nvptx_alloca_ptx } */
/* { dg-additional-options -save-temps } */
/* { dg-final { check-function-bodies {** } {} } } */
/* See also 'gcc.target/nvptx/softstack.c'. */
static __attribute__((noipa)) int f(int *p)
{
return __sync_lock_test_and_set(p, 1);
}
/*
** f:
** \.func \(\.param\.u32 %value_out\) f \(\.param\.u64 %in_ar0\)
** {
** \.reg\.u32 %value;
** \.reg\.u64 %ar0;
** ld\.param\.u64 %ar0, \[%in_ar0\];
** \.reg\.u32 (%r[0-9]+);
** \.reg\.u64 (%r[0-9]+);
** mov\.u64 \2, %ar0;
** atom\.exch\.b32 \1, \[\2\], 1;
** membar\.sys;
** mov\.u32 %value, \1;
** st\.param\.u32 \[%value_out\], %value;
** ret;
*/
static __attribute__((noipa)) int g(int n)
{
/* Check that variable-length stack allocation works. */
int v[n];
v[0] = 0;
/* Check that atomic operations can be applied to auto data. */
return f(v) == 0 && v[0] == 1;
}
/*
** g:
** \.func \(\.param\.u32 %value_out\) g \(\.param\.u32 %in_ar0\)
** {
** \.reg\.u32 %value;
** \.reg\.u32 %ar0;
** ld\.param\.u32 %ar0, \[%in_ar0\];
** \.reg\.u32 (%r[0-9]+);
** \.reg\.u32 (%r[0-9]+);
** \.reg\.u64 (%r[0-9]+);
** \.reg\.u64 (%r[0-9]+);
** \.reg\.u64 (%r[0-9]+);
** \.reg\.u64 (%r[0-9]+);
** \.reg\.u64 (%r[0-9]+);
** \.reg\.u64 (%r[0-9]+);
** \.reg\.u64 (%r[0-9]+);
** \.reg\.u32 (%r[0-9]+);
** \.reg\.u32 (%r[0-9]+);
** \.reg\.pred (%r[0-9]+);
** \.reg\.u32 (%r[0-9]+);
** \.reg\.pred (%r[0-9]+);
** mov\.u32 \2, %ar0;
** cvt\.s64\.s32 \3, \2;
** shl\.b64 \4, \3, 2;
** add\.u64 \5, \4, 15;
** and\.b64 \6, \5, -16;
** {
** \.reg\.u64 \7_local;
** alloca\.u64 \7_local, \6;
** cvta\.local\.u64 \7, \7_local;
** }
** add\.u64 \8, \7, 3;
** and\.b64 \9, \8, -4;
** mov\.u32 \10, 0;
** st\.u32 \[\9\], \10;
** {
** \.param\.u32 %value_in;
** \.param\.u64 %out_arg1;
** st\.param\.u64 \[%out_arg1\], \9;
** call \(%value_in\), f, \(%out_arg1\);
** ld\.param\.u32 \11, \[%value_in\];
** }
** setp\.ne\.u32 \12, \11, 0;
** @\12 bra (\$L[0-9]+);
** ld\.u32 \13, \[\9\];
** setp\.eq\.u32 \14, \13, 1;
** selp\.u32 \1, 1, 0, \14;
** bra (\$L[0-9]+);
** \15:
** mov\.u32 \1, \10;
** \16:
** mov\.u32 %value, \1;
** st\.param\.u32 \[%value_out\], %value;
** ret;
*/
int main()
{
if (!g(1))
__builtin_abort();
return 0;
}
/* PTX 'atom' isn't acceptable for '.local' memory:
'operation not supported on global/shared address space' [sic]
('CUDA_ERROR_INVALID_ADDRESS_SPACE'), thus FAILs for 'alloca'ed memory.
We'd have to use the 'nvptx_mem_local_p' replacements, but currently lack a
mechanism for doing so (TODO).
{ dg-xfail-run-if TODO { *-*-* } } */

View file

@ -1,6 +1,8 @@
/* { dg-options "-O2 -msoft-stack" } */
/* { dg-do run } */
/* See also 'gcc.target/nvptx/alloca-5.c'. */
static __attribute__((noinline,noclone)) int f(int *p)
{
return __sync_lock_test_and_set(p, 1);

View file

@ -0,0 +1,29 @@
/* { dg-do assemble } */
/* { dg-options {-O0 -mno-soft-stack} } */
/* { dg-add-options nvptx_alloca_ptx } */
/* { dg-additional-options -save-temps } */
/* { dg-final { check-function-bodies {**} {} } } */
void sink(void *);
void f(int s)
{
char a[s];
sink(a);
}
/*
** f:
** ...
** cvt\.s64\.s32 (%r[0-9]+), (%r[0-9]+);
** mov\.u64 (%r[0-9]+), 16;
** add\.u64 (%r[0-9]+), \3, -1;
** add\.u64 (%r[0-9]+), \1, \4;
** div\.u64 (%r[0-9]+), \5, 16;
** mul\.lo\.u64 (%r[0-9]+), \6, 16;
** {
** \.reg\.u64 (%r[0-9]+)_local;
** alloca\.u64 \8_local, \7;
** cvta\.local\.u64 \8, \8_local;
** }
** ...
*/

View file

@ -0,0 +1,40 @@
/* { dg-do assemble } */
/* { dg-options {-O1 -mno-soft-stack} } */
/* { dg-add-options nvptx_alloca_ptx } */
/* { dg-additional-options -save-temps } */
/* { dg-final { check-function-bodies {** } {} } } */
void sink(void *);
void f(int s)
{
char a[s];
sink(a);
}
/*
** f:
** \.visible \.func f \(\.param\.u32 %in_ar0\)
** {
** \.reg\.u32 %ar0;
** ld\.param\.u32 %ar0, \[%in_ar0\];
** \.reg\.u32 (%r[0-9]+);
** \.reg\.u64 (%r[0-9]+);
** \.reg\.u64 (%r[0-9]+);
** \.reg\.u64 (%r[0-9]+);
** \.reg\.u64 (%r[0-9]+);
** mov\.u32 \1, %ar0;
** cvt\.s64\.s32 \2, \1;
** add\.u64 \3, \2, 15;
** and\.b64 \4, \3, -16;
** {
** \.reg\.u64 \5_local;
** alloca\.u64 \5_local, \4;
** cvta\.local\.u64 \5, \5_local;
** }
** {
** \.param\.u64 %out_arg1;
** st\.param\.u64 \[%out_arg1\], \5;
** call sink, \(%out_arg1\);
** }
** ret;
*/

View file

@ -1,5 +1,6 @@
/* { dg-do compile } */
/* { dg-options -mno-soft-stack } */
/* { dg-additional-options -march=sm_30 } */
void sink(void *);

View file

@ -1009,9 +1009,37 @@ proc check_effective_target_alloca {} {
return 0
}
if { [istarget nvptx-*-*] } {
# For nvptx, 'alloca' support depends on the configuration. In case
# of PTX "native" stacks, for 'dg-do run', it additionally depends on
# runtime support.
if { ![check_effective_target_nvptx_softstack] } {
return 0
# '-mno-soft-stack': PTX "native" stacks
# Not supported unless '-mptx=7.3'+ and '-march=sm_52'+.
if { !([check_nvptx_default_ptx_isa_version_at_least 7 3]
&& [check_nvptx_default_ptx_isa_target_architecture_at_least sm_52]) } {
return 0
}
# Find 'dg-do-what' in an outer frame.
set level 1
while true {
upvar $level dg-do-what dg-do-what
if [info exists dg-do-what] then break
incr level
}
verbose "check_effective_target_alloca: found dg-do-what at level $level" 2
if { [string equal [lindex ${dg-do-what} 0] run] } {
# For 'dg-do run', it additionally depends on runtime support.
# (If not supported, we don't try to demote 'run' to 'link',
# but instead simply fail the effective-target 'alloca' check.)
return [check_effective_target_nvptx_runtime_alloca_ptx]
} else {
return 1
}
} else {
# '-msoft-stack'
return 1
}
}
@ -14100,6 +14128,35 @@ proc check_effective_target_nvptx_default_ptx_isa_version_at_least_6_0 { } {
return [check_nvptx_default_ptx_isa_version_at_least 6 0]
}
# Return 1 if nvptx code by default compiles for at least the specified PTX ISA
# target architecture.
proc check_nvptx_default_ptx_isa_target_architecture_at_least { ta } {
set name nvptx_default_ptx_isa_target_architecture_at_least_${ta}
if [regexp {^sm_(\d+)$} $ta dummy ptx_sm] {
set ptx_sm "${ptx_sm}0"
} else {
error "check_nvptx_default_ptx_isa_target_architecture_at_least: illegal argument: $ta"
}
set supported_p \
[concat \
"(__PTX_SM__ >= $ptx_sm)"]
set src \
[list \
"#if $supported_p" \
"#else" \
"#error unsupported" \
"#endif"]
set src [join $src "\n"]
set res [check_no_compiler_messages $name assembly $src ""]
return $res
}
# Return 1 if nvptx '-msoft-stack' is enabled.
proc check_effective_target_nvptx_softstack { } {
@ -14132,6 +14189,28 @@ proc check_nvptx_runtime_ptx_isa_version_at_least { major minor } {
return $res
}
# Return 1 if nvptx code with the specified PTX ISA target architecture or
# higher can be run.
proc check_nvptx_runtime_ptx_isa_target_architecture_at_least { ta } {
set name nvptx_runtime_ptx_isa_target_architecture_${ta}
set default \
[check_nvptx_default_ptx_isa_target_architecture_at_least ${ta}]
if { $default } {
set flag ""
} else {
set flag "-march=$ta -mptx=_"
}
set res [check_runtime $name {
int main (void) { return 0; }
} $flag]
return $res
}
# Return 1 if the nvptx runtime environment supports the PTX ISA directive
# '.alias'.
@ -14139,6 +14218,13 @@ proc check_effective_target_nvptx_runtime_alias_ptx { } {
return [check_nvptx_runtime_ptx_isa_version_at_least 6 3]
}
# Return 1 if the nvptx runtime environment supports PTX 'alloca'.
proc check_effective_target_nvptx_runtime_alloca_ptx { } {
return [expr { [check_nvptx_runtime_ptx_isa_version_at_least 7 3]
&& [check_nvptx_runtime_ptx_isa_target_architecture_at_least sm_52] }]
}
# Add options to enable nvptx using the PTX ISA directive '.alias'.
proc add_options_for_nvptx_alias_ptx { flags } {
@ -14150,3 +14236,20 @@ proc add_options_for_nvptx_alias_ptx { flags } {
return $flags
}
# Add options to enable nvptx using PTX 'alloca'.
proc add_options_for_nvptx_alloca_ptx { flags } {
# We don't add '-mno-soft-stack' here; the users should take care of that
# explicitly.
if { ![check_nvptx_default_ptx_isa_version_at_least 7 3] } {
append flags " -mptx=7.3"
}
if { ![check_nvptx_default_ptx_isa_target_architecture_at_least sm_52] } {
append flags " -march=sm_52"
}
return $flags
}

View file

@ -846,8 +846,8 @@ omp_get_device_from_uid_ (const char *uid, size_t uid_len)
/* Inside the target region, invoking this routine is undefined
behavior; thus, resolve it already here - instead of inside
libgomp/config/.../target.c.
Note that on nvptx __builtin_alloca is defined, but fails with a sorry
during compilation, as it is unsupported until isa 7.3 / sm_52. */
This also circumvents issues due to not all nvptx configurations
supporting 'alloca'. */
return omp_invalid_device;
#endif
}

View file

@ -1,12 +1,5 @@
! { dg-do run }
! PR65181 "Support for alloca in nvptx"
! { dg-excess-errors "lto1, mkoffload and lto-wrapper fatal errors" { target openacc_nvidia_accel_selected } }
! Aside from restricting this testcase to non-nvptx offloading, and duplicating
! it with 'dg-do link' for nvptx offloading, there doesn't seem to be a way to
! XFAIL the "UNRESOLVED: [...] compilation failed to produce executable", or
! get rid of it, unfortunately.
! { dg-additional-options "-fopt-info-note-omp" }
! { dg-additional-options "--param=openacc-privatization=noisy" }
! { dg-additional-options "-foffload=-fopt-info-note-omp" }
@ -59,7 +52,6 @@ contains
! { dg-note {variable 'array' in 'private' clause is candidate for adjusting OpenACC privatization level} "" { target *-*-* } l_loop$c_loop }
! { dg-note {variable 'array' ought to be adjusted for OpenACC privatization level: 'gang'} "" { target *-*-* } l_loop$c_loop }
! { dg-note {variable 'array' adjusted for OpenACC privatization level: 'gang'} "" { target { ! { openacc_host_selected || { openacc_nvidia_accel_selected && __OPTIMIZE__ } } } } l_loop$c_loop }
! { dg-message {sorry, unimplemented: target cannot support alloca} PR65181 { target openacc_nvidia_accel_selected } l_loop$c_loop }
do i = 1, 10
array(i) = i
end do
@ -91,7 +83,6 @@ contains
! { dg-note {variable 'array\.[0-9]+' in 'private' clause is candidate for adjusting OpenACC privatization level} "" { target *-*-* } l_loop$c_loop }
! { dg-note {variable 'array\.[0-9]+' ought to be adjusted for OpenACC privatization level: 'gang'} "" { target *-*-* } l_loop$c_loop }
! { dg-note {variable 'array\.[0-9]+' adjusted for OpenACC privatization level: 'gang'} "" { target { ! { openacc_host_selected || { openacc_nvidia_accel_selected && __OPTIMIZE__ } } } } l_loop$c_loop }
! { dg-message {sorry, unimplemented: target cannot support alloca} PR65181 { target openacc_nvidia_accel_selected } l_loop$c_loop }
do i = 1, 10
array(i) = 9*i
end do
@ -117,7 +108,6 @@ contains
! { dg-note {variable 'str' ought to be adjusted for OpenACC privatization level: 'gang'} "" { target *-*-* } l_loop$c_loop }
! { dg-note {variable 'str' adjusted for OpenACC privatization level: 'gang'} "" { target { ! { openacc_host_selected || { openacc_nvidia_accel_selected && __OPTIMIZE__ } } } } l_loop$c_loop }
! { dg-note {variable 'char\.[0-9]+' declared in block isn't candidate for adjusting OpenACC privatization level: artificial} "" { target *-*-* } l_loop$c_loop }
! { dg-message {sorry, unimplemented: target cannot support alloca} PR65181 { target openacc_nvidia_accel_selected } l_loop$c_loop }
do i = 1, 10
str(i:i) = achar(ichar('A') + i)
end do