diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc index 78997ef056a..4e33db365ac 100644 --- a/gcc/internal-fn.cc +++ b/gcc/internal-fn.cc @@ -5332,11 +5332,11 @@ expand_POPCOUNT (internal_fn fn, gcall *stmt) start_sequence (); rtx op0 = expand_normal (arg); rtx argm1 = expand_simple_binop (mode, PLUS, op0, constm1_rtx, NULL_RTX, - 1, OPTAB_DIRECT); + 1, OPTAB_WIDEN); if (argm1 == NULL_RTX) goto fail; rtx argxorargm1 = expand_simple_binop (mode, nonzero_arg ? AND : XOR, op0, - argm1, NULL_RTX, 1, OPTAB_DIRECT); + argm1, NULL_RTX, 1, OPTAB_WIDEN); if (argxorargm1 == NULL_RTX) goto fail; rtx cmp; diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt-eq-1.c b/gcc/testsuite/gcc.target/aarch64/popcnt-eq-1.c new file mode 100644 index 00000000000..bb9e2bf0a54 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/popcnt-eq-1.c @@ -0,0 +1,45 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fdump-rtl-expand-details" } */ +/* { dg-final { check-function-bodies "**" "" } } */ +/* PR middle-end/116508 */ + +#pragma GCC target "+nocssc" + +/* +** h16: +** sub w([0-9]+), w0, #1 +** eor w([0-9]+), w0, w\1 +** and w([0-9]+), w\1, 65535 +** cmp w\3, w\2, uxth +** cset w0, cc +** ret +*/ + +/* When expanding popcount == 1, the compiler should use + `(arg ^ (arg - 1)) > arg - 1` as that has a lower latency + than doing the popcount and then comparing against 1. + The popcount/addv can be costly. */ +unsigned h16 (const unsigned short a) { + return __builtin_popcountg (a) == 1; +} + +/* unsigned char should also use the same trick. */ +/* Currently xfailed since the cost does not take into account the + moving between GPRs and vector regs correctly. 
*/ +/* +** h8: { xfail *-*-* } +** sub w([0-9]+), w0, #1 +** eor w([0-9]+), w0, w\1 +** and w([0-9]+), w\1, 255 +** cmp w\3, w\2, uxtb +** cset w0, cc +** ret +*/ + + +unsigned h8 (const unsigned char a) { + return __builtin_popcountg (a) == 1; +} + +/* The expand dump should print the popcount == 1 costs for both h8 and h16. */ +/* { dg-final { scan-rtl-dump-times "popcount == 1:" 2 "expand"} } */