From 215c7e3084ea9ec75dcc803f73c94b36e2751e54 Mon Sep 17 00:00:00 2001 From: Andrew Pinski Date: Wed, 28 Aug 2024 15:03:53 -0700 Subject: [PATCH] expand: Allow widdening optab when expanding popcount==1 [PR116508] After adding popcount{qi,hi}2 to the aarch64 backend, I noticed that the expansion for popcount==1 was no longer trying to do the trick of handling popcount==1 as `(arg ^ (arg - 1)) > arg - 1`. The problem is the expansion was using OPTAB_DIRECT, when using OPTAB_WIDEN will allow modes which are smaller than SImode (in the aarch64 case). Note QImode's cost still needs some improvements so part of popcnt-eq-1.c is xfailed. Though there is a check to make sure the costs are compared now. Built and tested on aarch64-linux-gnu. PR middle-end/116508 gcc/ChangeLog: * internal-fn.cc (expand_POPCOUNT): Use OPTAB_WIDEN for PLUS and XOR/AND expansion. gcc/testsuite/ChangeLog: * gcc.target/aarch64/popcnt-eq-1.c: New test. Signed-off-by: Andrew Pinski --- gcc/internal-fn.cc | 4 +- .../gcc.target/aarch64/popcnt-eq-1.c | 45 +++++++++++++++++++ 2 files changed, 47 insertions(+), 2 deletions(-) create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt-eq-1.c diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc index 78997ef056a..4e33db365ac 100644 --- a/gcc/internal-fn.cc +++ b/gcc/internal-fn.cc @@ -5332,11 +5332,11 @@ expand_POPCOUNT (internal_fn fn, gcall *stmt) start_sequence (); rtx op0 = expand_normal (arg); rtx argm1 = expand_simple_binop (mode, PLUS, op0, constm1_rtx, NULL_RTX, - 1, OPTAB_DIRECT); + 1, OPTAB_WIDEN); if (argm1 == NULL_RTX) goto fail; rtx argxorargm1 = expand_simple_binop (mode, nonzero_arg ? AND : XOR, op0, - argm1, NULL_RTX, 1, OPTAB_DIRECT); + argm1, NULL_RTX, 1, OPTAB_WIDEN); if (argxorargm1 == NULL_RTX) goto fail; rtx cmp; diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt-eq-1.c b/gcc/testsuite/gcc.target/aarch64/popcnt-eq-1.c new file mode 100644 index 00000000000..bb9e2bf0a54 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/popcnt-eq-1.c @@ -0,0 +1,45 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fdump-rtl-expand-details" } */ +/* { dg-final { check-function-bodies "**" "" } } */ +/* PR middle-end/116508 */ + +#pragma GCC target "+nocssc" + +/* +** h16: +** sub w([0-9]+), w0, #1 +** eor w([0-9]+), w0, w\1 +** and w([0-9]+), w\1, 65535 +** cmp w\3, w\2, uxth +** cset w0, cc +** ret +*/ + +/* when expanding popcount == 1, should use + `(arg ^ (arg - 1)) > arg - 1` as that has a lower latency + than doing the popcount then comparing against 1. + The popcount/addv can be costly. */ +unsigned h16 (const unsigned short a) { + return __builtin_popcountg (a) == 1; +} + +/* unsigned char should also do the same trick */ +/* Currently xfailed since the cost does not take into account the + moving between gprs and vector regs correctly. */ +/* +** h8: { xfail *-*-* } +** sub w([0-9]+), w0, #1 +** eor w([0-9]+), w0, w\1 +** and w([0-9]+), w\1, 255 +** cmp w\3, w\2, uxtb +** cset w0, cc +** ret +*/ + + +unsigned h8 (const unsigned char a) { + return __builtin_popcountg (a) == 1; +} + +/* There should be printing out the costs for h8 and h16's popcount == 1 */ +/* { dg-final { scan-rtl-dump-times "popcount == 1:" 2 "expand"} } */