diff --git a/gcc/testsuite/gcc.dg/vect/slp-simd-clone-1.c b/gcc/testsuite/gcc.dg/vect/slp-simd-clone-1.c new file mode 100644 index 00000000000..6ccbb39b567 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/slp-simd-clone-1.c @@ -0,0 +1,46 @@ +/* { dg-require-effective-target vect_simd_clones } */ +/* { dg-additional-options "-fopenmp-simd" } */ + +#include "tree-vect.h" + +int x[1024]; + +#pragma omp declare simd simdlen(4) notinbranch +__attribute__((noinline)) int +foo (int a, int b) +{ + return a + b; +} + +void __attribute__((noipa)) +bar (void) +{ +#pragma omp simd + for (int i = 0; i < 512; i++) + { + x[2*i+0] = foo (x[2*i+0], x[2*i+0]); + x[2*i+1] = foo (x[2*i+1], x[2*i+1]); + } +} + +int +main () +{ + int i; + check_vect (); + +#pragma GCC novector + for (i = 0; i < 1024; i++) + x[i] = i; + + bar (); + +#pragma GCC novector + for (i = 0; i < 1024; i++) + if (x[i] != i + i) + abort (); + + return 0; +} + +/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/slp-simd-clone-2.c b/gcc/testsuite/gcc.dg/vect/slp-simd-clone-2.c new file mode 100644 index 00000000000..98387c92486 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/slp-simd-clone-2.c @@ -0,0 +1,57 @@ +/* { dg-require-effective-target vect_simd_clones } */ +/* { dg-additional-options "-fopenmp-simd" } */ +/* { dg-additional-options "-mavx2" { target avx2_runtime } } */ + +#include "tree-vect.h" + +int x[1024]; + +#pragma omp declare simd simdlen(4) inbranch +__attribute__((noinline)) int +foo (int a, int b) +{ + return a + b; +} + +void __attribute__((noipa)) +bar (void) +{ +#pragma omp simd + for (int i = 0; i < 512; i++) + { + if (x[2*i+0] < 10) + x[2*i+0] = foo (x[2*i+0], x[2*i+0]); + if (x[2*i+1] < 20) + x[2*i+1] = foo (x[2*i+1], x[2*i+1]); + } +} + +int +main () +{ + int i; + check_vect (); + +#pragma GCC novector + for (i = 0; i < 1024; i++) + x[i] = i; + + bar (); + +#pragma GCC novector + for (i = 0; i < 1024; i++) + { + if (((i & 1) && i < 20) + || (!(i & 1) && i < 10)) + { + if (x[i] != i + i) + abort (); + } + else if (x[i] != i) + abort (); + } + + return 0; +} + +/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" { target avx2_runtime } } } */ diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc index fa098f9ff4e..af8f5031bd2 100644 --- a/gcc/tree-vect-slp.cc +++ b/gcc/tree-vect-slp.cc @@ -505,6 +505,14 @@ static const int arg2_map[] = { 1, 2 }; static const int arg1_arg4_map[] = { 2, 1, 4 }; static const int arg3_arg2_map[] = { 2, 3, 2 }; static const int op1_op0_map[] = { 2, 1, 0 }; +static const int mask_call_maps[6][7] = { + { 1, 1, }, + { 2, 1, 2, }, + { 3, 1, 2, 3, }, + { 4, 1, 2, 3, 4, }, + { 5, 1, 2, 3, 4, 5, }, + { 6, 1, 2, 3, 4, 5, 6 }, +}; /* For most SLP statements, there is a one-to-one mapping between gimple arguments and child nodes. If that is not true for STMT, @@ -547,6 +555,15 @@ vect_get_operand_map (const gimple *stmt, unsigned char swap = 0) case IFN_MASK_STORE: return arg3_arg2_map; + case IFN_MASK_CALL: + { + unsigned nargs = gimple_call_num_args (call); + if (nargs >= 2 && nargs <= 7) + return mask_call_maps[nargs-2]; + else + return nullptr; + } + default: break; } @@ -1070,7 +1087,7 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap, if (call_stmt) { combined_fn cfn = gimple_call_combined_fn (call_stmt); - if (cfn != CFN_LAST) + if (cfn != CFN_LAST && cfn != CFN_MASK_CALL) rhs_code = cfn; else rhs_code = CALL_EXPR; @@ -1085,6 +1102,7 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap, rhs_code = CFN_MASK_STORE; } else if ((cfn != CFN_LAST + && cfn != CFN_MASK_CALL && internal_fn_p (cfn) && !vectorizable_internal_fn_p (as_internal_fn (cfn))) || gimple_call_tail_p (call_stmt) diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index ce925cc1d53..33b557c2a49 100644 --- a/gcc/tree-vect-stmts.cc +++ b/gcc/tree-vect-stmts.cc @@ -4208,10 +4208,6 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info, if (loop_vinfo && nested_in_vect_loop_p (loop, stmt_info)) return false; - /* FORNOW */ - if (slp_node) - return false; - /* Process function arguments. */ nargs = gimple_call_num_args (stmt) - arg_offset; @@ -4220,6 +4216,8 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info, return false; arginfo.reserve (nargs, true); + auto_vec slp_op; + slp_op.safe_grow_cleared (nargs); for (i = 0; i < nargs; i++) { @@ -4231,9 +4229,12 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info, thisarginfo.op = NULL_TREE; thisarginfo.simd_lane_linear = false; - op = gimple_call_arg (stmt, i + arg_offset); - if (!vect_is_simple_use (op, vinfo, &thisarginfo.dt, - &thisarginfo.vectype) + int op_no = i + arg_offset; + if (slp_node) + op_no = vect_slp_child_index_for_operand (stmt, op_no); + if (!vect_is_simple_use (vinfo, stmt_info, slp_node, + op_no, &op, &slp_op[i], + &thisarginfo.dt, &thisarginfo.vectype) || thisarginfo.dt == vect_uninitialized_def) { if (dump_enabled_p ()) @@ -4244,7 +4245,13 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info, if (thisarginfo.dt == vect_constant_def || thisarginfo.dt == vect_external_def) - gcc_assert (thisarginfo.vectype == NULL_TREE); + { + gcc_assert (vec_stmt || thisarginfo.vectype == NULL_TREE); + if (!vec_stmt) + thisarginfo.vectype = get_vectype_for_scalar_type (vinfo, + TREE_TYPE (op), + slp_node); + } else gcc_assert (thisarginfo.vectype != NULL_TREE); @@ -4301,15 +4308,14 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info, && thisarginfo.dt != vect_constant_def && thisarginfo.dt != vect_external_def && loop_vinfo - && !slp_node && TREE_CODE (op) == SSA_NAME) vect_simd_lane_linear (op, loop, &thisarginfo); arginfo.quick_push (thisarginfo); } - poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); - if (!vf.is_constant ()) + if (loop_vinfo + && !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()) { if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, @@ -4318,6 +4324,8 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info, return false; } + poly_uint64 vf = loop_vinfo ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) : 1; + unsigned group_size = slp_node ? SLP_TREE_LANES (slp_node) : 1; unsigned int badness = 0; struct cgraph_node *bestn = NULL; if (STMT_VINFO_SIMD_CLONE_INFO (stmt_info).exists ()) @@ -4328,7 +4336,8 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info, { unsigned int this_badness = 0; unsigned int num_calls; - if (!constant_multiple_p (vf, n->simdclone->simdlen, &num_calls) + if (!constant_multiple_p (vf * group_size, + n->simdclone->simdlen, &num_calls) || n->simdclone->nargs != nargs) continue; if (num_calls != 1) @@ -4454,7 +4463,10 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info, fndecl = bestn->decl; nunits = bestn->simdclone->simdlen; - ncopies = vector_unroll_factor (vf, nunits); + if (slp_node) + ncopies = vector_unroll_factor (vf * group_size, nunits); + else + ncopies = vector_unroll_factor (vf, nunits); /* If the function isn't const, only allow it in simd loops where user has asserted that at least nunits consecutive iterations can be @@ -4469,6 +4481,15 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info, if (!vec_stmt) /* transformation not required. */ { + if (slp_node) + for (unsigned i = 0; i < nargs; ++i) + if (!vect_maybe_update_slp_op_vectype (slp_op[i], arginfo[i].vectype)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "incompatible vector types for invariants\n"); + return false; + } /* When the original call is pure or const but the SIMD ABI dictates an aggregate return we will have to use a virtual definition and in a loop eventually even need to add a virtual PHI. That's @@ -4477,6 +4498,11 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info, && !gimple_vdef (stmt) && TREE_CODE (TREE_TYPE (TREE_TYPE (bestn->decl))) == ARRAY_TYPE) vinfo->any_known_not_updated_vssa = true; + /* ??? For SLP code-gen we end up inserting after the last + vector argument def rather than at the original call position + so automagic virtual operand updating doesn't work. */ + if (gimple_vuse (stmt) && slp_node) + vinfo->any_known_not_updated_vssa = true; STMT_VINFO_SIMD_CLONE_INFO (stmt_info).safe_push (bestn->decl); for (i = 0; i < nargs; i++) if ((bestn->simdclone->args[i].arg_type @@ -4526,8 +4552,14 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info, auto_vec > vec_oprnds; auto_vec vec_oprnds_i; - vec_oprnds.safe_grow_cleared (nargs, true); vec_oprnds_i.safe_grow_cleared (nargs, true); + if (slp_node) + { + vec_oprnds.reserve_exact (nargs); + vect_get_slp_defs (vinfo, slp_node, &vec_oprnds); + } + else + vec_oprnds.safe_grow_cleared (nargs, true); for (j = 0; j < ncopies; ++j) { /* Build argument list for the vectorized call. */ @@ -4558,9 +4590,10 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info, gcc_assert ((k & (k - 1)) == 0); if (m == 0) { - vect_get_vec_defs_for_operand (vinfo, stmt_info, - ncopies * o / k, op, - &vec_oprnds[i]); + if (!slp_node) + vect_get_vec_defs_for_operand (vinfo, stmt_info, + ncopies * o / k, op, + &vec_oprnds[i]); vec_oprnds_i[i] = 0; vec_oprnd0 = vec_oprnds[i][vec_oprnds_i[i]++]; } @@ -4596,10 +4629,11 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info, { if (m == 0 && l == 0) { - vect_get_vec_defs_for_operand (vinfo, stmt_info, - k * o * ncopies, - op, - &vec_oprnds[i]); + if (!slp_node) + vect_get_vec_defs_for_operand (vinfo, stmt_info, + k * o * ncopies, + op, + &vec_oprnds[i]); vec_oprnds_i[i] = 0; vec_oprnd0 = vec_oprnds[i][vec_oprnds_i[i]++]; } @@ -4670,10 +4704,11 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info, elements as the current function. */ if (m == 0) { - vect_get_vec_defs_for_operand (vinfo, stmt_info, - o * ncopies, - op, - &vec_oprnds[i]); + if (!slp_node) + vect_get_vec_defs_for_operand (vinfo, stmt_info, + o * ncopies, + op, + &vec_oprnds[i]); vec_oprnds_i[i] = 0; } vec_oprnd0 = vec_oprnds[i][vec_oprnds_i[i]++]; @@ -4817,7 +4852,11 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info, if (j == 0 && l == 0) *vec_stmt = new_stmt; - STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt); + if (slp_node) + SLP_TREE_VEC_DEFS (slp_node) + .quick_push (gimple_assign_lhs (new_stmt)); + else + STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt); } if (ratype) @@ -4860,7 +4899,11 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info, if ((unsigned) j == k - 1) *vec_stmt = new_stmt; - STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt); + if (slp_node) + SLP_TREE_VEC_DEFS (slp_node) + .quick_push (gimple_assign_lhs (new_stmt)); + else + STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt); continue; } else if (ratype) @@ -4883,7 +4926,10 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info, if (j == 0) *vec_stmt = new_stmt; - STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt); + if (slp_node) + SLP_TREE_VEC_DEFS (slp_node).quick_push (gimple_get_lhs (new_stmt)); + else + STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt); } for (i = 0; i < nargs; ++i)