[PATCH 3/7] RISC-V: Fix vector memcpy smaller LMUL generation
If riscv_vector::expand_block_move is generating a straight-line memcpy using a predicated store, it tries to use a smaller LMUL to reduce register pressure if it still allows an entire transfer. This happens in the inner loop of riscv_vector::expand_block_move, however, the vmode chosen by this loop gets overwritten later in the function, so I have added the missing break from the outer loop. I have also addressed a couple of issues with the conditions of the if statement within the inner loop. The first condition did not make sense to me: ``` TARGET_MIN_VLEN * lmul <= nunits * BITS_PER_UNIT ``` I think this was supposed to be checking that the length fits within the given LMUL, so I have changed it to do that. The second condition: ``` /* Avoid loosing the option of using vsetivli . */ && (nunits <= 31 * lmul || nunits > 31 * 8) ``` seems to imply that lmul affects the range of AVL immediate that vsetivli can take but I don't think that is correct. Anyway, I don't think this condition is necessary because if we find a suitable mode we should stick with it, regardless of whether it allowed vsetivli, rather than continuing to try larger lmul which would increase register pressure or smaller potential_ew which would increase AVL. I have removed this condition. gcc/ChangeLog: * config/riscv/riscv-string.cc (expand_block_move): Fix condition for using smaller LMUL. Break outer loop if a suitable vmode has been found. gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/vsetvl/pr112929-1.c: Expect smaller lmul. * gcc.target/riscv/rvv/vsetvl/pr112988-1.c: Likewise. * gcc.target/riscv/rvv/base/cpymem-3.c: New test.
This commit is contained in:
parent
212d8685e4
commit
b039d06c9a
4 changed files with 92 additions and 5 deletions
|
@ -1153,9 +1153,7 @@ expand_block_move (rtx dst_in, rtx src_in, rtx length_in)
|
|||
Still, by choosing a lower LMUL factor that still allows
|
||||
an entire transfer, we can reduce register pressure. */
|
||||
for (unsigned lmul = 1; lmul <= 4; lmul <<= 1)
|
||||
if (TARGET_MIN_VLEN * lmul <= nunits * BITS_PER_UNIT
|
||||
/* Avoid loosing the option of using vsetivli . */
|
||||
&& (nunits <= 31 * lmul || nunits > 31 * 8)
|
||||
if (length * BITS_PER_UNIT <= TARGET_MIN_VLEN * lmul
|
||||
&& multiple_p (BYTES_PER_RISCV_VECTOR * lmul, potential_ew)
|
||||
&& (riscv_vector::get_vector_mode
|
||||
(elem_mode, exact_div (BYTES_PER_RISCV_VECTOR * lmul,
|
||||
|
@ -1163,6 +1161,10 @@ expand_block_move (rtx dst_in, rtx src_in, rtx length_in)
|
|||
break;
|
||||
}
|
||||
|
||||
/* Stop searching if a suitable vmode has been found. */
|
||||
if (vmode != VOIDmode)
|
||||
break;
|
||||
|
||||
/* The RVVM8?I modes are notionally 8 * BYTES_PER_RISCV_VECTOR bytes
|
||||
wide. BYTES_PER_RISCV_VECTOR can't be evenly divided by
|
||||
the sizes of larger element types; the LMUL factor of 8 can at
|
||||
|
|
85
gcc/testsuite/gcc.target/riscv/rvv/base/cpymem-3.c
Normal file
85
gcc/testsuite/gcc.target/riscv/rvv/base/cpymem-3.c
Normal file
|
@ -0,0 +1,85 @@
|
|||
/* { dg-do compile } */
|
||||
/* { dg-additional-options "-O1 -fno-schedule-insns -fno-schedule-insns2 -mrvv-max-lmul=m8" } */
|
||||
/* { dg-add-options riscv_v } */
|
||||
/* { dg-final { check-function-bodies "**" "" } } */
|
||||
|
||||
#define MIN_VECTOR_BYTES (__riscv_v_min_vlen / 8)
|
||||
|
||||
/* Check that vector memcpy with predicated store uses smaller LMUL where
|
||||
possible.
|
||||
|
||||
/* m1
|
||||
** f1:
|
||||
** (
|
||||
** vsetivli\s+zero,\d+,e8,m1,ta,ma
|
||||
** |
|
||||
** li\s+[ta][0-7],\d+
|
||||
** vsetvli\s+zero,[ta][0-7],e8,m1,ta,ma
|
||||
** )
|
||||
** vle8.v\s+v\d+,0\(a1\)
|
||||
** vse8.v\s+v\d+,0\(a0\)
|
||||
** ret
|
||||
*/
|
||||
|
||||
void f1 (char *d, char *s)
|
||||
{
|
||||
__builtin_memcpy (d, s, MIN_VECTOR_BYTES - 1);
|
||||
}
|
||||
|
||||
/* m2
|
||||
** f2:
|
||||
** (
|
||||
** vsetivli\s+zero,\d+,e8,m2,ta,ma
|
||||
** |
|
||||
** li\s+[ta][0-7],\d+
|
||||
** vsetvli\s+zero,[ta][0-7],e8,m2,ta,ma
|
||||
** )
|
||||
** vle8.v\s+v\d+,0\(a1\)
|
||||
** vse8.v\s+v\d+,0\(a0\)
|
||||
** ret
|
||||
*/
|
||||
|
||||
void f2 (char *d, char *s)
|
||||
{
|
||||
__builtin_memcpy (d, s, 2 * MIN_VECTOR_BYTES - 1);
|
||||
}
|
||||
|
||||
/* m4
|
||||
** f3:
|
||||
** (
|
||||
** vsetivli\s+zero,\d+,e8,m4,ta,ma
|
||||
** |
|
||||
** li\s+[ta][0-7],\d+
|
||||
** vsetvli\s+zero,[ta][0-7],e8,m4,ta,ma
|
||||
** )
|
||||
** vle8.v\s+v\d+,0\(a1\)
|
||||
** vse8.v\s+v\d+,0\(a0\)
|
||||
** ret
|
||||
*/
|
||||
|
||||
void f3 (char *d, char *s)
|
||||
{
|
||||
__builtin_memcpy (d, s, 4 * MIN_VECTOR_BYTES - 1);
|
||||
}
|
||||
|
||||
/* m8
|
||||
** f4:
|
||||
** (
|
||||
** vsetivli\s+zero,\d+,e8,m8,ta,ma
|
||||
** |
|
||||
** li\s+[ta][0-7],\d+
|
||||
** vsetvli\s+zero,[ta][0-7],e8,m8,ta,ma
|
||||
** |
|
||||
** li\s+[ta][0-7],\d+
|
||||
** addi\s+[ta][0-7],[ta][0-7],-?\d+
|
||||
** vsetvli\s+zero,[ta][0-7],e8,m8,ta,ma
|
||||
** )
|
||||
** vle8.v\s+v\d+,0\(a1\)
|
||||
** vse8.v\s+v\d+,0\(a0\)
|
||||
** ret
|
||||
*/
|
||||
|
||||
void f4 (char *d, char *s)
|
||||
{
|
||||
__builtin_memcpy (d, s, 8 * MIN_VECTOR_BYTES - 1);
|
||||
}
|
|
@ -54,5 +54,5 @@ int main() {
|
|||
|
||||
/* { dg-final { scan-assembler-times {vsetvli} 2 { target { no-opts "-O0" no-opts "-Os" no-opts "-Oz" no-opts "-funroll-loops" no-opts "-g" } } } } */
|
||||
/* { dg-final { scan-assembler-not {vsetivli} } } */
|
||||
/* { dg-final { scan-assembler-times {vsetvli\tzero,\s*[a-x0-9]+,\s*e8,\s*m8,\s*t[au],\s*m[au]} 2 { target { no-opts "-O0" no-opts "-Os" no-opts "-Oz" no-opts "-funroll-loops" no-opts "-g" } } } } */
|
||||
/* { dg-final { scan-assembler-times {vsetvli\tzero,\s*[a-x0-9]+,\s*e8,\s*m2,\s*t[au],\s*m[au]} 2 { target { no-opts "-O0" no-opts "-Os" no-opts "-Oz" no-opts "-funroll-loops" no-opts "-g" } } } } */
|
||||
/* { dg-final { scan-assembler-times {li\t[a-x0-9]+,\s*32} 2 { target { no-opts "-O0" no-opts "-Os" no-opts "-Oz" no-opts "-funroll-loops" no-opts "-g" } } } } */
|
||||
|
|
|
@ -64,5 +64,5 @@ int main() {
|
|||
|
||||
/* { dg-final { scan-assembler-times {vsetvli} 4 } } */
|
||||
/* { dg-final { scan-assembler-not {vsetivli} } } */
|
||||
/* { dg-final { scan-assembler-times {vsetvli\tzero,\s*[a-x0-9]+,\s*e8,\s*m8,\s*t[au],\s*m[au]} 1 } } */
|
||||
/* { dg-final { scan-assembler-times {vsetvli\tzero,\s*[a-x0-9]+,\s*e8,\s*m2,\s*t[au],\s*m[au]} 1 } } */
|
||||
/* { dg-final { scan-assembler-times {li\t[a-x0-9]+,\s*32} 1 } } */
|
||||
|
|
Loading…
Add table
Reference in a new issue