re PR fortran/83017 (DO CONCURRENT not parallelizing)

2017-11-17  Richard Biener  <rguenther@suse.de>

	PR tree-optimization/83017
	* tree-parloops.c (MIN_PER_THREAD): Use --param parloops-min-per-thread.
	(gen_parallel_loop): Properly count iterations.
	(parallelize_loops): Handle loop->can_be_parallel independent
	of flag_loop_parallelize_all.  Make static profitability test match
	the runtime one.
	* params.def (PARAM_PARLOOPS_MIN_PER_THREAD): New.
	* invoke.texi (parloops-min-per-thread): Document.

	* gcc.dg/autopar/pr49960.c: Adjust.

From-SVN: r254867
This commit is contained in:
Richard Biener 2017-11-17 13:15:34 +00:00 committed by Richard Biener
parent e716496a55
commit a851ce04f7
6 changed files with 46 additions and 21 deletions

View file

@ -1,3 +1,14 @@
2017-11-17 Richard Biener <rguenther@suse.de>
PR tree-optimization/83017
* tree-parloops.c (MIN_PER_THREAD): Use --param parloops-min-per-thread.
(gen_parallel_loop): Properly count iterations.
(parallelize_loops): Handle loop->can_be_parallel independent
of flag_loop_parallelize_all. Make static profitability test match
the runtime one.
* params.def (PARAM_PARLOOPS_MIN_PER_THREAD): New.
* invoke.texi (parloops-min-per-thread): Document.
2017-11-17 Vineet Gupta <vgupta@synopsys.com>
* config/arc/linux.h: GLIBC_DYNAMIC_LINKER update per glibc

View file

@ -10816,6 +10816,12 @@ is 0.
Schedule type of omp schedule for loops parallelized by parloops (static,
dynamic, guided, auto, runtime). The default is static.
@item parloops-min-per-thread
The minimum number of iterations per thread of an innermost parallelized
loop for which the parallelized variant is prefered over the single threaded
one. The default is 100. Note that for a parallelized loop nest the
minimum number of iterations of the outermost loop per thread is two.
@item max-ssa-name-query-depth
Maximum depth of recursion when querying properties of SSA names in things
like fold routines. One level of recursion corresponds to following a

View file

@ -1240,6 +1240,12 @@ DEFPARAMENUM5 (PARAM_PARLOOPS_SCHEDULE,
static,
static, dynamic, guided, auto, runtime)
DEFPARAM (PARAM_PARLOOPS_MIN_PER_THREAD,
"parloops-min-per-thread",
"Minimum number of iterations per thread of an innermost "
"parallelized loop.",
100, 2, 0)
DEFPARAM (PARAM_MAX_SSA_NAME_QUERY_DEPTH,
"max-ssa-name-query-depth",
"Maximum recursion depth allowed when querying a property of an"

View file

@ -1,3 +1,8 @@
2017-11-17 Richard Biener <rguenther@suse.de>
PR tree-optimization/83017
* gcc.dg/autopar/pr49960.c: Adjust.
2017-11-17 Sudakshina Das <sudi.das@arm.com>
* gcc.target/arm/armv8_2-fp16-move-1.c: Edit vmov scan-assembler

View file

@ -7,7 +7,8 @@
#define MA 400
int T[MA][MB],A[MA][NA],B[MB][NA];
void MRTRBR(int MA_1, int NA_1, int MB_1)
void __attribute__((noinline))
MRTRBR(int MA_1, int NA_1, int MB_1)
{
int i,j, t,k;
@ -21,7 +22,7 @@ void MRTRBR(int MA_1, int NA_1, int MB_1)
/* The outer most loop is not parallel because for different k's there
is write-write dependency for T[i][j]. */
/* The two inner loops don't get parallelized due to low number of
/* The innermost loop doesn't get parallelized due to low number of
iterations. */
for (k = 3; k < NA_1; k++)
@ -38,7 +39,10 @@ void main ()
for (i = 3; i < MA; i++)
for (j = 3; j < MB; j++)
T[i][j] = (i>j?i:j);
{
__asm__ volatile ("" : : : "memory");
T[i][j] = (i>j?i:j);
}
MRTRBR (MA,NA,MB);
@ -48,7 +52,7 @@ void main ()
}
/* Check that the outer most loop doesn't get parallelized (thus no loop gets parallelized) */
/* Check that the outer most loop doesn't get parallelized. */
/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 0 "parloops2" } } */
/* { dg-final { scan-tree-dump-times "loopfn" 0 "optimized" } } */
/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 1 "parloops2" } } */
/* { dg-final { scan-tree-dump-times "__builtin_GOMP_parallel" 1 "optimized" } } */

View file

@ -184,7 +184,7 @@ parloop
/* Minimal number of iterations of a loop that should be executed in each
thread. */
#define MIN_PER_THREAD 100
#define MIN_PER_THREAD PARAM_VALUE (PARAM_PARLOOPS_MIN_PER_THREAD)
/* Element of the hashtable, representing a
reduction in the current loop. */
@ -2336,7 +2336,7 @@ gen_parallel_loop (struct loop *loop,
gcc_checking_assert (n_threads != 0);
many_iterations_cond =
fold_build2 (GE_EXPR, boolean_type_node,
nit, build_int_cst (type, m_p_thread * n_threads));
nit, build_int_cst (type, m_p_thread * n_threads - 1));
many_iterations_cond
= fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
@ -3299,15 +3299,6 @@ parallelize_loops (bool oacc_kernels_p)
fprintf (dump_file, "loop %d is innermost\n",loop->num);
}
/* If we use autopar in graphite pass, we use its marked dependency
checking results. */
if (flag_loop_parallelize_all && !loop->can_be_parallel)
{
if (dump_file && (dump_flags & TDF_DETAILS))
fprintf (dump_file, "loop is not parallel according to graphite\n");
continue;
}
if (!single_dom_exit (loop))
{
@ -3325,15 +3316,17 @@ parallelize_loops (bool oacc_kernels_p)
|| loop_has_vector_phi_nodes (loop))
continue;
estimated = estimated_stmt_executions_int (loop);
estimated = estimated_loop_iterations_int (loop);
if (estimated == -1)
estimated = likely_max_stmt_executions_int (loop);
estimated = get_likely_max_loop_iterations_int (loop);
/* FIXME: Bypass this check as graphite doesn't update the
count and frequency correctly now. */
if (!flag_loop_parallelize_all
&& !oacc_kernels_p
&& ((estimated != -1
&& estimated <= (HOST_WIDE_INT) n_threads * MIN_PER_THREAD)
&& (estimated
< ((HOST_WIDE_INT) n_threads
* (loop->inner ? 2 : MIN_PER_THREAD) - 1)))
/* Do not bother with loops in cold areas. */
|| optimize_loop_nest_for_size_p (loop)))
continue;
@ -3347,7 +3340,7 @@ parallelize_loops (bool oacc_kernels_p)
if (loop_has_phi_with_address_arg (loop))
continue;
if (!flag_loop_parallelize_all
if (!loop->can_be_parallel
&& !loop_parallel_p (loop, &parloop_obstack))
continue;