RISC-V: Add popcount fallback expander.
I didn't manage to get back to the generic vectorizer fallback for popcount, so I figured I'd rather create a popcount fallback in the RISC-V backend. It uses the Wilkes-Wheeler-Gill (WWG) algorithm that libgcc also uses.

gcc/ChangeLog:

	* config/riscv/autovec.md (popcount<mode>2): New expander.
	* config/riscv/riscv-protos.h (expand_popcount): Declare.
	* config/riscv/riscv-v.cc (expand_popcount): Vectorize popcount with
	the WWG algorithm.

gcc/testsuite/ChangeLog:

	* gcc.target/riscv/rvv/autovec/unop/popcount-1.c: New test.
	* gcc.target/riscv/rvv/autovec/unop/popcount-2.c: New test.
	* gcc.target/riscv/rvv/autovec/unop/popcount-run-1.c: New test.
	* gcc.target/riscv/rvv/autovec/unop/popcount.c: New test.
This commit is contained in:
parent
458db9b614
commit
82bbbb73c6
7 changed files with 1638 additions and 0 deletions
|
@ -1484,6 +1484,20 @@
|
|||
DONE;
|
||||
})
|
||||
|
||||
;; -------------------------------------------------------------------------
|
||||
;; ---- [INT] Popcount
|
||||
;; -------------------------------------------------------------------------
|
||||
|
||||
;; Expander for the popcount<mode>2 pattern on V_VLSI modes.  Operand 0 is
;; the destination register and operand 1 the source register, both of the
;; same V_VLSI mode.  The pattern is enabled under TARGET_VECTOR and emits
;; no RTL template of its own: all work is delegated to
;; riscv_vector::expand_popcount, after which DONE ends the expansion.
(define_expand "popcount<mode>2"
|
||||
[(match_operand:V_VLSI 0 "register_operand")
|
||||
(match_operand:V_VLSI 1 "register_operand")]
|
||||
"TARGET_VECTOR"
|
||||
{
|
||||
riscv_vector::expand_popcount (operands);
|
||||
DONE;
|
||||
})
|
||||
|
||||
|
||||
;; -------------------------------------------------------------------------
|
||||
;; ---- [INT] Highpart multiplication
|
||||
;; -------------------------------------------------------------------------
|
||||
|
|
|
@ -521,6 +521,7 @@ void expand_fold_extract_last (rtx *);
|
|||
void expand_cond_unop (unsigned, rtx *);
|
||||
void expand_cond_binop (unsigned, rtx *);
|
||||
void expand_cond_ternop (unsigned, rtx *);
|
||||
/* Expand a vector popcount; called from the popcount<mode>2 expander with
   its operands array (operand 0 destination, operand 1 source).  */
void expand_popcount (rtx *);
|
||||
|
||||
/* Rounding mode bitfield for fixed point VXRM. */
|
||||
enum fixed_point_rounding_mode
|
||||
|
|
|
@ -4364,4 +4364,75 @@ expand_vec_lfloor (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
|
|||
emit_vec_cvt_x_f (op_0, op_1, UNARY_OP_FRM_RDN, vec_fp_mode);
|
||||
}
|
||||
|
||||
/* Vectorize popcount by the Wilkes-Wheeler-Gill algorithm that libgcc uses as
|
||||
well. */
|
||||
void
|
||||
expand_popcount (rtx *ops)
|
||||
{
|
||||
rtx dst = ops[0];
|
||||
rtx src = ops[1];
|
||||
machine_mode mode = GET_MODE (dst);
|
||||
scalar_mode imode = GET_MODE_INNER (mode);
|
||||
static const uint64_t m5 = 0x5555555555555555ULL;
|
||||
static const uint64_t m3 = 0x3333333333333333ULL;
|
||||
static const uint64_t mf = 0x0F0F0F0F0F0F0F0FULL;
|
||||
static const uint64_t m1 = 0x0101010101010101ULL;
|
||||
|
||||
rtx x1 = gen_reg_rtx (mode);
|
||||
rtx x2 = gen_reg_rtx (mode);
|
||||
rtx x3 = gen_reg_rtx (mode);
|
||||
rtx x4 = gen_reg_rtx (mode);
|
||||
|
||||
/* x1 = src - (src >> 1) & 0x555...); */
|
||||
rtx shift1 = expand_binop (mode, lshr_optab, src, GEN_INT (1), NULL, true,
|
||||
OPTAB_DIRECT);
|
||||
|
||||
rtx and1 = gen_reg_rtx (mode);
|
||||
rtx ops1[] = {and1, shift1, gen_int_mode (m5, imode)};
|
||||
emit_vlmax_insn (code_for_pred_scalar (AND, mode), riscv_vector::BINARY_OP,
|
||||
ops1);
|
||||
|
||||
x1 = expand_binop (mode, sub_optab, src, and1, NULL, true, OPTAB_DIRECT);
|
||||
|
||||
/* x2 = (x1 & 0x3333333333333333ULL) + ((x1 >> 2) & 0x3333333333333333ULL);
|
||||
*/
|
||||
rtx and2 = gen_reg_rtx (mode);
|
||||
rtx ops2[] = {and2, x1, gen_int_mode (m3, imode)};
|
||||
emit_vlmax_insn (code_for_pred_scalar (AND, mode), riscv_vector::BINARY_OP,
|
||||
ops2);
|
||||
|
||||
rtx shift2 = expand_binop (mode, lshr_optab, x1, GEN_INT (2), NULL, true,
|
||||
OPTAB_DIRECT);
|
||||
|
||||
rtx and22 = gen_reg_rtx (mode);
|
||||
rtx ops22[] = {and22, shift2, gen_int_mode (m3, imode)};
|
||||
emit_vlmax_insn (code_for_pred_scalar (AND, mode), riscv_vector::BINARY_OP,
|
||||
ops22);
|
||||
|
||||
x2 = expand_binop (mode, add_optab, and2, and22, NULL, true, OPTAB_DIRECT);
|
||||
|
||||
/* x3 = (x2 + (x2 >> 4)) & 0x0f0f0f0f0f0f0f0fULL; */
|
||||
rtx shift3 = expand_binop (mode, lshr_optab, x2, GEN_INT (4), NULL, true,
|
||||
OPTAB_DIRECT);
|
||||
|
||||
rtx plus3
|
||||
= expand_binop (mode, add_optab, x2, shift3, NULL, true, OPTAB_DIRECT);
|
||||
|
||||
rtx ops3[] = {x3, plus3, gen_int_mode (mf, imode)};
|
||||
emit_vlmax_insn (code_for_pred_scalar (AND, mode), riscv_vector::BINARY_OP,
|
||||
ops3);
|
||||
|
||||
/* dest = (x3 * 0x0101010101010101ULL) >> 56; */
|
||||
rtx mul4 = gen_reg_rtx (mode);
|
||||
rtx ops4[] = {mul4, x3, gen_int_mode (m1, imode)};
|
||||
emit_vlmax_insn (code_for_pred_scalar (MULT, mode), riscv_vector::BINARY_OP,
|
||||
ops4);
|
||||
|
||||
x4 = expand_binop (mode, lshr_optab, mul4,
|
||||
GEN_INT (GET_MODE_BITSIZE (imode) - 8), NULL, true,
|
||||
OPTAB_DIRECT);
|
||||
|
||||
emit_move_insn (dst, x4);
|
||||
}
|
||||
|
||||
} // namespace riscv_vector
|
||||
|
|
20
gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/popcount-1.c
Normal file
20
gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/popcount-1.c
Normal file
|
@ -0,0 +1,20 @@
|
|||
/* { dg-do compile } */
|
||||
/* { dg-additional-options "-march=rv64gcv -mabi=lp64d --param=riscv-autovec-preference=scalable -fno-vect-cost-model -fdump-tree-vect-details" } */
|
||||
|
||||
#include <stdint-gcc.h>
|
||||
|
||||
/* For every i in [0, SIZE), store __builtin_popcount (SRC[i]) into DST[i].
   Both pointers are restrict-qualified and the function is marked noipa.
   The dg-final scan in this file expects the loop in each of the two
   functions here to be vectorized.  */
void __attribute__ ((noipa))
|
||||
popcount_32 (uint32_t *restrict dst, uint32_t *restrict src, int size)
|
||||
{
|
||||
for (int i = 0; i < size; ++i)
|
||||
dst[i] = __builtin_popcount (src[i]);
|
||||
}
|
||||
|
||||
/* 64-bit counterpart of popcount_32: for every i in [0, SIZE), store
   __builtin_popcountll (SRC[i]) into DST[i].  Both pointers are
   restrict-qualified and the function is marked noipa.  */
void __attribute__ ((noipa))
|
||||
popcount_64 (uint64_t *restrict dst, uint64_t *restrict src, int size)
|
||||
{
|
||||
for (int i = 0; i < size; ++i)
|
||||
dst[i] = __builtin_popcountll (src[i]);
|
||||
}
|
||||
|
||||
/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 2 "vect" } } */
|
19
gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/popcount-2.c
Normal file
19
gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/popcount-2.c
Normal file
|
@ -0,0 +1,19 @@
|
|||
/* { dg-do compile } */
|
||||
/* { dg-additional-options "-march=rv64gcv -mabi=lp64d --param=riscv-autovec-preference=scalable -fno-vect-cost-model -fdump-tree-slp-details" } */
|
||||
|
||||
/* Result (x) and input (y) arrays used by foo; eight ints each.  */
int x[8];
|
||||
int y[8];
|
||||
|
||||
/* Eight straight-line statements, each storing __builtin_popcount (y[k])
   into x[k] for k = 0..7.  There is deliberately no loop: the dg-final scan
   in this file expects the "slp2" dump to report "vectorizing stmts using
   SLP" exactly once.  */
void foo ()
|
||||
{
|
||||
x[0] = __builtin_popcount (y[0]);
|
||||
x[1] = __builtin_popcount (y[1]);
|
||||
x[2] = __builtin_popcount (y[2]);
|
||||
x[3] = __builtin_popcount (y[3]);
|
||||
x[4] = __builtin_popcount (y[4]);
|
||||
x[5] = __builtin_popcount (y[5]);
|
||||
x[6] = __builtin_popcount (y[6]);
|
||||
x[7] = __builtin_popcount (y[7]);
|
||||
}
|
||||
|
||||
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "slp2" } } */
|
|
@ -0,0 +1,49 @@
|
|||
/* { dg-do run { target { riscv_v } } } */
|
||||
|
||||
#include "popcount-1.c"
|
||||
|
||||
extern void abort (void) __attribute__ ((noreturn));
|
||||
|
||||
/* Test vectors stored as consecutive pairs: data[2 * k] is a 32-bit input
   value and data[2 * k + 1] is the number of set bits in it.  main reads the
   even entries as inputs and the odd entries as expected results.  */
unsigned int data[] = {
|
||||
0x11111100, 6,
|
||||
0xe0e0f0f0, 14,
|
||||
0x9900aab3, 13,
|
||||
0x00040003, 3,
|
||||
0x000e000c, 5,
|
||||
0x22227777, 16,
|
||||
0x12341234, 10,
|
||||
0x0, 0
|
||||
};
|
||||
|
||||
/* Run-time check of popcount_32 and popcount_64 against the data table.
   Calls abort on the first mismatch and returns 0 when every result matches.
   NOTE(review): the optimize (1) attribute presumably keeps this driver
   itself from being aggressively optimized - confirm intent.  */
int __attribute__ ((optimize (1)))
|
||||
main (void)
|
||||
{
|
||||
/* Number of (value, expected count) pairs in data.  */
unsigned int count = sizeof (data) / sizeof (data[0]) / 2;
|
||||
|
||||
uint32_t in32[count];
|
||||
uint32_t out32[count];
|
||||
/* Copy the input values (even entries of data) into in32.  */
for (unsigned int i = 0; i < count; ++i)
|
||||
{
|
||||
in32[i] = data[i * 2];
|
||||
/* Empty asm with a "memory" clobber; presumably a compiler barrier so the
   setup loop is left alone - confirm intent.  */
asm volatile ("" ::: "memory");
|
||||
}
|
||||
popcount_32 (out32, in32, count);
|
||||
/* Compare each result with the expected count (odd entries of data).  */
for (unsigned int i = 0; i < count; ++i)
|
||||
if (out32[i] != data[i * 2 + 1])
|
||||
abort ();
|
||||
|
||||
/* The 64-bit test combines two adjacent pairs per element, so it uses half
   as many elements.  */
count /= 2;
|
||||
uint64_t in64[count];
|
||||
uint64_t out64[count];
|
||||
for (unsigned int i = 0; i < count; ++i)
|
||||
{
|
||||
/* High half from the first pair's value, low half from the second's.  */
in64[i] = ((uint64_t) data[i * 4] << 32) | data[i * 4 + 2];
|
||||
asm volatile ("" ::: "memory");
|
||||
}
|
||||
popcount_64 (out64, in64, count);
|
||||
/* Expected 64-bit count is the sum of the two 32-bit expected counts.  */
for (unsigned int i = 0; i < count; ++i)
|
||||
if (out64[i] != data[i * 4 + 1] + data[i * 4 + 3])
|
||||
abort ();
|
||||
|
||||
return 0;
|
||||
}
|
1464
gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/popcount.c
Normal file
1464
gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/popcount.c
Normal file
File diff suppressed because it is too large
Load diff
Loading…
Add table
Reference in a new issue