re PR fortran/83017 (DO CONCURRENT not parallelizing)
2017-11-17 Richard Biener <rguenther@suse.de> PR tree-optimization/83017 * tree-parloops.c (MIN_PER_THREAD): Use --param parloops-min-per-thread. (gen_parallel_loop): Properly count iterations. (parallelize_loops): Handle loop->can_be_parallel independent of flag_loop_parallelize_all. Make static profitability test match the runtime one. * params.def (PARAM_PARLOOPS_MIN_PER_THREAD): New. * invoke.texi (parloops-min-per-thread): Document. * gcc.dg/autopar/pr49960.c: Adjust. From-SVN: r254867
This commit is contained in:
parent
e716496a55
commit
a851ce04f7
6 changed files with 46 additions and 21 deletions
|
@ -1,3 +1,14 @@
|
|||
2017-11-17 Richard Biener <rguenther@suse.de>
|
||||
|
||||
PR tree-optimization/83017
|
||||
* tree-parloops.c (MIN_PER_THREAD): Use --param parloops-min-per-thread.
|
||||
(gen_parallel_loop): Properly count iterations.
|
||||
(parallelize_loops): Handle loop->can_be_parallel independent
|
||||
of flag_loop_parallelize_all. Make static profitability test match
|
||||
the runtime one.
|
||||
* params.def (PARAM_PARLOOPS_MIN_PER_THREAD): New.
|
||||
* invoke.texi (parloops-min-per-thread): Document.
|
||||
|
||||
2017-11-17 Vineet Gupta <vgupta@synopsys.com>
|
||||
|
||||
* config/arc/linux.h: GLIBC_DYNAMIC_LINKER update per glibc
|
||||
|
|
|
@ -10816,6 +10816,12 @@ is 0.
|
|||
Schedule type of omp schedule for loops parallelized by parloops (static,
|
||||
dynamic, guided, auto, runtime). The default is static.
|
||||
|
||||
@item parloops-min-per-thread
|
||||
The minimum number of iterations per thread of an innermost parallelized
|
||||
loop for which the parallelized variant is preferred over the single threaded
|
||||
one. The default is 100. Note that for a parallelized loop nest the
|
||||
minimum number of iterations of the outermost loop per thread is two.
|
||||
|
||||
@item max-ssa-name-query-depth
|
||||
Maximum depth of recursion when querying properties of SSA names in things
|
||||
like fold routines. One level of recursion corresponds to following a
|
||||
|
|
|
@ -1240,6 +1240,12 @@ DEFPARAMENUM5 (PARAM_PARLOOPS_SCHEDULE,
|
|||
static,
|
||||
static, dynamic, guided, auto, runtime)
|
||||
|
||||
DEFPARAM (PARAM_PARLOOPS_MIN_PER_THREAD,
|
||||
"parloops-min-per-thread",
|
||||
"Minimum number of iterations per thread of an innermost "
|
||||
"parallelized loop.",
|
||||
100, 2, 0)
|
||||
|
||||
DEFPARAM (PARAM_MAX_SSA_NAME_QUERY_DEPTH,
|
||||
"max-ssa-name-query-depth",
|
||||
"Maximum recursion depth allowed when querying a property of an"
|
||||
|
|
|
@ -1,3 +1,8 @@
|
|||
2017-11-17 Richard Biener <rguenther@suse.de>
|
||||
|
||||
PR tree-optimization/83017
|
||||
* gcc.dg/autopar/pr49960.c: Adjust.
|
||||
|
||||
2017-11-17 Sudakshina Das <sudi.das@arm.com>
|
||||
|
||||
* gcc.target/arm/armv8_2-fp16-move-1.c: Edit vmov scan-assembler
|
||||
|
|
|
@ -7,7 +7,8 @@
|
|||
#define MA 400
|
||||
|
||||
int T[MA][MB],A[MA][NA],B[MB][NA];
|
||||
void MRTRBR(int MA_1, int NA_1, int MB_1)
|
||||
void __attribute__((noinline))
|
||||
MRTRBR(int MA_1, int NA_1, int MB_1)
|
||||
{
|
||||
int i,j, t,k;
|
||||
|
||||
|
@ -21,7 +22,7 @@ void MRTRBR(int MA_1, int NA_1, int MB_1)
|
|||
/* The outer most loop is not parallel because for different k's there
|
||||
is write-write dependency for T[i][j]. */
|
||||
|
||||
/* The two inner loops don't get parallelized due to low number of
|
||||
/* The innermost loop doesn't get parallelized due to low number of
|
||||
iterations. */
|
||||
|
||||
for (k = 3; k < NA_1; k++)
|
||||
|
@ -38,7 +39,10 @@ void main ()
|
|||
|
||||
for (i = 3; i < MA; i++)
|
||||
for (j = 3; j < MB; j++)
|
||||
T[i][j] = (i>j?i:j);
|
||||
{
|
||||
__asm__ volatile ("" : : : "memory");
|
||||
T[i][j] = (i>j?i:j);
|
||||
}
|
||||
|
||||
MRTRBR (MA,NA,MB);
|
||||
|
||||
|
@ -48,7 +52,7 @@ void main ()
|
|||
}
|
||||
|
||||
|
||||
/* Check that the outer most loop doesn't get parallelized (thus no loop gets parallelized) */
|
||||
/* Check that the outermost loop doesn't get parallelized. */
|
||||
|
||||
/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 0 "parloops2" } } */
|
||||
/* { dg-final { scan-tree-dump-times "loopfn" 0 "optimized" } } */
|
||||
/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 1 "parloops2" } } */
|
||||
/* { dg-final { scan-tree-dump-times "__builtin_GOMP_parallel" 1 "optimized" } } */
|
||||
|
|
|
@ -184,7 +184,7 @@ parloop
|
|||
|
||||
/* Minimal number of iterations of a loop that should be executed in each
|
||||
thread. */
|
||||
#define MIN_PER_THREAD 100
|
||||
#define MIN_PER_THREAD PARAM_VALUE (PARAM_PARLOOPS_MIN_PER_THREAD)
|
||||
|
||||
/* Element of the hashtable, representing a
|
||||
reduction in the current loop. */
|
||||
|
@ -2336,7 +2336,7 @@ gen_parallel_loop (struct loop *loop,
|
|||
gcc_checking_assert (n_threads != 0);
|
||||
many_iterations_cond =
|
||||
fold_build2 (GE_EXPR, boolean_type_node,
|
||||
nit, build_int_cst (type, m_p_thread * n_threads));
|
||||
nit, build_int_cst (type, m_p_thread * n_threads - 1));
|
||||
|
||||
many_iterations_cond
|
||||
= fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
|
||||
|
@ -3299,15 +3299,6 @@ parallelize_loops (bool oacc_kernels_p)
|
|||
fprintf (dump_file, "loop %d is innermost\n",loop->num);
|
||||
}
|
||||
|
||||
/* If we use autopar in graphite pass, we use its marked dependency
|
||||
checking results. */
|
||||
if (flag_loop_parallelize_all && !loop->can_be_parallel)
|
||||
{
|
||||
if (dump_file && (dump_flags & TDF_DETAILS))
|
||||
fprintf (dump_file, "loop is not parallel according to graphite\n");
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!single_dom_exit (loop))
|
||||
{
|
||||
|
||||
|
@ -3325,15 +3316,17 @@ parallelize_loops (bool oacc_kernels_p)
|
|||
|| loop_has_vector_phi_nodes (loop))
|
||||
continue;
|
||||
|
||||
estimated = estimated_stmt_executions_int (loop);
|
||||
estimated = estimated_loop_iterations_int (loop);
|
||||
if (estimated == -1)
|
||||
estimated = likely_max_stmt_executions_int (loop);
|
||||
estimated = get_likely_max_loop_iterations_int (loop);
|
||||
/* FIXME: Bypass this check as graphite doesn't update the
|
||||
count and frequency correctly now. */
|
||||
if (!flag_loop_parallelize_all
|
||||
&& !oacc_kernels_p
|
||||
&& ((estimated != -1
|
||||
&& estimated <= (HOST_WIDE_INT) n_threads * MIN_PER_THREAD)
|
||||
&& (estimated
|
||||
< ((HOST_WIDE_INT) n_threads
|
||||
* (loop->inner ? 2 : MIN_PER_THREAD) - 1)))
|
||||
/* Do not bother with loops in cold areas. */
|
||||
|| optimize_loop_nest_for_size_p (loop)))
|
||||
continue;
|
||||
|
@ -3347,7 +3340,7 @@ parallelize_loops (bool oacc_kernels_p)
|
|||
if (loop_has_phi_with_address_arg (loop))
|
||||
continue;
|
||||
|
||||
if (!flag_loop_parallelize_all
|
||||
if (!loop->can_be_parallel
|
||||
&& !loop_parallel_p (loop, &parloop_obstack))
|
||||
continue;
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue