From 3179ad72f67f31824c444ef30ef171ad7495d274 Mon Sep 17 00:00:00 2001 From: Richard Biener Date: Fri, 13 Oct 2023 12:32:51 +0200 Subject: [PATCH] OMP SIMD inbranch call vectorization for AVX512 style masks The following teaches vectorizable_simd_clone_call to handle integer mode masks. The tricky bit is to second-guess the number of lanes represented by a single mask argument - the following uses simdlen and the number of mask arguments to calculate that, assuming ABIs have them uniform. Similar to the VOIDmode handling there's a restriction on not supporting splitting/merging of incoming vector masks to more/less SIMD call arguments. PR tree-optimization/111795 * tree-vect-stmts.cc (vectorizable_simd_clone_call): Handle integer mode mask arguments. * gcc.target/i386/vect-simd-clone-avx512-1.c: New testcase. * gcc.target/i386/vect-simd-clone-avx512-2.c: Likewise. * gcc.target/i386/vect-simd-clone-avx512-3.c: Likewise. --- .../i386/vect-simd-clone-avx512-1.c | 43 +++++ .../i386/vect-simd-clone-avx512-2.c | 6 + .../i386/vect-simd-clone-avx512-3.c | 6 + gcc/tree-vect-stmts.cc | 150 ++++++++++++++---- 4 files changed, 175 insertions(+), 30 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/vect-simd-clone-avx512-1.c create mode 100644 gcc/testsuite/gcc.target/i386/vect-simd-clone-avx512-2.c create mode 100644 gcc/testsuite/gcc.target/i386/vect-simd-clone-avx512-3.c diff --git a/gcc/testsuite/gcc.target/i386/vect-simd-clone-avx512-1.c b/gcc/testsuite/gcc.target/i386/vect-simd-clone-avx512-1.c new file mode 100644 index 00000000000..e350996439e --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/vect-simd-clone-avx512-1.c @@ -0,0 +1,43 @@ +/* { dg-do run } */ +/* { dg-require-effective-target avx512vl } */ +/* { dg-options "-O2 -fopenmp-simd -mavx512vl" } */ + +#include "avx512vl-check.h" + +#ifndef SIMDLEN +#define SIMDLEN 4 +#endif + +int x[1024]; + +#pragma omp declare simd simdlen(SIMDLEN) +__attribute__((noinline)) int +foo (int a, int b) +{ + return a + b; +} + +void __attribute__((noipa)) +bar (void) +{ +#pragma omp simd + for (int i = 0; i < 1024; i++) + if (x[i] < 20) + x[i] = foo (x[i], x[i]); +} + +void avx512vl_test () +{ + int i; +#pragma GCC novector + for (i = 0; i < 1024; i++) + x[i] = i; + + bar (); + +#pragma GCC novector + for (i = 0; i < 1024; i++) + if ((i < 20 && x[i] != i + i) + || (i >= 20 && x[i] != i)) + abort (); +} diff --git a/gcc/testsuite/gcc.target/i386/vect-simd-clone-avx512-2.c b/gcc/testsuite/gcc.target/i386/vect-simd-clone-avx512-2.c new file mode 100644 index 00000000000..d9968ae30f5 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/vect-simd-clone-avx512-2.c @@ -0,0 +1,6 @@ +/* { dg-do run } */ +/* { dg-require-effective-target avx512vl } */ +/* { dg-options "-O2 -fopenmp-simd -mavx512vl" } */ + +#define SIMDLEN 8 +#include "vect-simd-clone-avx512-1.c" diff --git a/gcc/testsuite/gcc.target/i386/vect-simd-clone-avx512-3.c b/gcc/testsuite/gcc.target/i386/vect-simd-clone-avx512-3.c new file mode 100644 index 00000000000..c05f6c8ce91 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/vect-simd-clone-avx512-3.c @@ -0,0 +1,6 @@ +/* { dg-do run } */ +/* { dg-require-effective-target avx512vl } */ +/* { dg-options "-O2 -fopenmp-simd -mavx512vl" } */ + +#define SIMDLEN 16 +#include "vect-simd-clone-avx512-1.c" diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index 33b557c2a49..b3a56498595 100644 --- a/gcc/tree-vect-stmts.cc +++ b/gcc/tree-vect-stmts.cc @@ -4385,6 +4385,9 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info, i = -1; break; case SIMD_CLONE_ARG_TYPE_MASK: + if (SCALAR_INT_MODE_P (n->simdclone->mask_mode) + != SCALAR_INT_MODE_P (TYPE_MODE (arginfo[i].vectype))) + i = -1; break; } if (i == (size_t) -1) @@ -4410,6 +4413,12 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info, if (bestn == NULL) return false; + unsigned int num_mask_args = 0; + if (SCALAR_INT_MODE_P (bestn->simdclone->mask_mode)) + for (i = 0; i < nargs; i++) + if (bestn->simdclone->args[i].arg_type == SIMD_CLONE_ARG_TYPE_MASK) + num_mask_args++; + for (i = 0; i < nargs; i++) { if ((arginfo[i].dt == vect_constant_def @@ -4434,30 +4443,50 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info, return false; } - if (bestn->simdclone->args[i].arg_type == SIMD_CLONE_ARG_TYPE_MASK - && bestn->simdclone->mask_mode == VOIDmode - && (simd_clone_subparts (bestn->simdclone->args[i].vector_type) - != simd_clone_subparts (arginfo[i].vectype))) + if (bestn->simdclone->args[i].arg_type == SIMD_CLONE_ARG_TYPE_MASK) { - /* FORNOW we only have partial support for vector-type masks that - can't hold all of simdlen. */ - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, - vect_location, - "in-branch vector clones are not yet" - " supported for mismatched vector sizes.\n"); - return false; - } - if (bestn->simdclone->args[i].arg_type == SIMD_CLONE_ARG_TYPE_MASK - && bestn->simdclone->mask_mode != VOIDmode) - { - /* FORNOW don't support integer-type masks. */ - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, - vect_location, - "in-branch vector clones are not yet" - " supported for integer mask modes.\n"); - return false; + if (bestn->simdclone->mask_mode == VOIDmode) + { + if (simd_clone_subparts (bestn->simdclone->args[i].vector_type) + != simd_clone_subparts (arginfo[i].vectype)) + { + /* FORNOW we only have partial support for vector-type masks + that can't hold all of simdlen. */ + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, + vect_location, + "in-branch vector clones are not yet" + " supported for mismatched vector sizes.\n"); + return false; + } + } + else if (SCALAR_INT_MODE_P (bestn->simdclone->mask_mode)) + { + if (!SCALAR_INT_MODE_P (TYPE_MODE (arginfo[i].vectype)) + || maybe_ne (exact_div (bestn->simdclone->simdlen, + num_mask_args), + simd_clone_subparts (arginfo[i].vectype))) + { + /* FORNOW we only have partial support for integer-type masks + that represent the same number of lanes as the + vectorized mask inputs. */ + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, + vect_location, + "in-branch vector clones are not yet " + "supported for mismatched vector sizes.\n"); + return false; + } + } + else + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, + vect_location, + "in-branch vector clones not supported" + " on this target.\n"); + return false; + } } } @@ -4674,14 +4703,9 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info, } break; case SIMD_CLONE_ARG_TYPE_MASK: - atype = bestn->simdclone->args[i].vector_type; - if (bestn->simdclone->mask_mode != VOIDmode) - { - /* FORNOW: this is disabled above. */ - gcc_unreachable (); - } - else + if (bestn->simdclone->mask_mode == VOIDmode) { + atype = bestn->simdclone->args[i].vector_type; tree elt_type = TREE_TYPE (atype); tree one = fold_convert (elt_type, integer_one_node); tree zero = fold_convert (elt_type, integer_zero_node); @@ -4732,6 +4756,72 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info, } } } + else if (SCALAR_INT_MODE_P (bestn->simdclone->mask_mode)) + { + atype = bestn->simdclone->args[i].vector_type; + /* Guess the number of lanes represented by atype. */ + unsigned HOST_WIDE_INT atype_subparts + = exact_div (bestn->simdclone->simdlen, + num_mask_args).to_constant (); + o = vector_unroll_factor (nunits, atype_subparts); + for (m = j * o; m < (j + 1) * o; m++) + { + if (m == 0) + { + if (!slp_node) + vect_get_vec_defs_for_operand (vinfo, stmt_info, + o * ncopies, + op, + &vec_oprnds[i]); + vec_oprnds_i[i] = 0; + } + if (atype_subparts + < simd_clone_subparts (arginfo[i].vectype)) + { + /* The mask argument has fewer elements than the + input vector. */ + /* FORNOW */ + gcc_unreachable (); + } + else if (atype_subparts + == simd_clone_subparts (arginfo[i].vectype)) + { + /* The vector mask argument matches the input + in the number of lanes, but not necessarily + in the mode. */ + vec_oprnd0 = vec_oprnds[i][vec_oprnds_i[i]++]; + tree st = lang_hooks.types.type_for_mode + (TYPE_MODE (TREE_TYPE (vec_oprnd0)), 1); + vec_oprnd0 = build1 (VIEW_CONVERT_EXPR, st, + vec_oprnd0); + gassign *new_stmt + = gimple_build_assign (make_ssa_name (st), + vec_oprnd0); + vect_finish_stmt_generation (vinfo, stmt_info, + new_stmt, gsi); + if (!types_compatible_p (atype, st)) + { + new_stmt + = gimple_build_assign (make_ssa_name (atype), + NOP_EXPR, + gimple_assign_lhs + (new_stmt)); + vect_finish_stmt_generation (vinfo, stmt_info, + new_stmt, gsi); + } + vargs.safe_push (gimple_assign_lhs (new_stmt)); + } + else + { + /* The mask argument has more elements than the + input vector. */ + /* FORNOW */ + gcc_unreachable (); + } + } + } + else + gcc_unreachable (); break; case SIMD_CLONE_ARG_TYPE_UNIFORM: vargs.safe_push (op);