arm.h (MAX_LDM_STM_OPS): New macro.

* config/arm/arm.h (MAX_LDM_STM_OPS): New macro.
	* config/arm/arm.c (multiple_operation_profitable_p,
	compute_offset_order): New static functions.
	(load_multiple_sequence, store_multiple_sequence): Use them.
	Replace constant 4 with MAX_LDM_STM_OPS.  Compute order[0] from
	memory offsets, not register numbers.
	(emit_ldm_seq, emit_stm_seq): Replace constant 4 with MAX_LDM_STM_OPS.

From-SVN: r159089
Bernd Schmidt, 2010-05-05 22:46:38 +00:00
commit 93b338c396 (parent 5e7b92b9ff)
3 changed files with 170 additions and 153 deletions

gcc/ChangeLog

@@ -1,3 +1,13 @@
2010-05-06  Bernd Schmidt  <bernds@codesourcery.com>

	* config/arm/arm.h (MAX_LDM_STM_OPS): New macro.
	* config/arm/arm.c (multiple_operation_profitable_p,
	compute_offset_order): New static functions.
	(load_multiple_sequence, store_multiple_sequence): Use them.
	Replace constant 4 with MAX_LDM_STM_OPS.  Compute order[0] from
	memory offsets, not register numbers.
	(emit_ldm_seq, emit_stm_seq): Replace constant 4 with MAX_LDM_STM_OPS.

2010-05-05  Steven Bosscher  <steven@gcc.gnu.org>

	* stor-layout.c (pending_sizes): Change the type to

gcc/config/arm/arm.c

@@ -9073,21 +9073,105 @@ adjacent_mem_locations (rtx a, rtx b)
return 0;
}
/* Return true iff it would be profitable to turn a sequence of NOPS loads
or stores (depending on IS_STORE) into a load-multiple or store-multiple
instruction. ADD_OFFSET is nonzero if the base address register needs
to be modified with an add instruction before we can use it. */
static bool
multiple_operation_profitable_p (bool is_store ATTRIBUTE_UNUSED,
int nops, HOST_WIDE_INT add_offset)
{
/* For ARM8,9 & StrongARM, 2 ldr instructions are faster than an ldm
if the offset isn't small enough. The reason 2 ldrs are faster
is because these ARMs are able to do more than one cache access
in a single cycle. The ARM9 and StrongARM have Harvard caches,
whilst the ARM8 has a double bandwidth cache. This means that
these cores can do both an instruction fetch and a data fetch in
a single cycle, so the trick of calculating the address into a
scratch register (one of the result regs) and then doing a load
multiple actually becomes slower (and no smaller in code size).
That is the transformation
ldr rd1, [rbase + offset]
ldr rd2, [rbase + offset + 4]
to
add rd1, rbase, offset
ldmia rd1, {rd1, rd2}
produces worse code -- '3 cycles + any stalls on rd2' instead of
'2 cycles + any stalls on rd2'. On ARMs with only one cache
access per cycle, the first sequence could never complete in less
than 6 cycles, whereas the ldm sequence would only take 5 and
would make better use of sequential accesses if not hitting the
cache.
We cheat here and test 'arm_ld_sched' which we currently know to
only be true for the ARM8, ARM9 and StrongARM. If this ever
changes, then the test below needs to be reworked. */
if (nops == 2 && arm_ld_sched && add_offset != 0)
return false;
return true;
}
/* Subroutine of load_multiple_sequence and store_multiple_sequence.
Given an array of UNSORTED_OFFSETS, of which there are NOPS, compute
an array ORDER which describes the sequence to use when accessing the
offsets that produces an ascending order. In this sequence, each
offset must be larger by exactly 4 than the previous one. ORDER[0]
must have been filled in with the lowest offset by the caller.
If UNSORTED_REGS is nonnull, it is an array of register numbers that
we use to verify that ORDER produces an ascending order of registers.
Return true if it was possible to construct such an order, false if
not. */
static bool
compute_offset_order (int nops, HOST_WIDE_INT *unsorted_offsets, int *order,
int *unsorted_regs)
{
int i;
for (i = 1; i < nops; i++)
{
int j;
order[i] = order[i - 1];
for (j = 0; j < nops; j++)
if (unsorted_offsets[j] == unsorted_offsets[order[i - 1]] + 4)
{
/* We must find exactly one offset that is higher than the
previous one by 4. */
if (order[i] != order[i - 1])
return false;
order[i] = j;
}
if (order[i] == order[i - 1])
return false;
/* The register numbers must be ascending. */
if (unsorted_regs != NULL
&& unsorted_regs[order[i]] <= unsorted_regs[order[i - 1]])
return false;
}
return true;
}
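
(Illustrative aside, not part of the patch: the new ordering walk can be tried in isolation.  The sketch below re-implements the same adjacency check with plain C types -- "long" standing in for HOST_WIDE_INT -- and runs it on made-up offsets and register numbers; the function and variable names are invented for the example.)

#include <stdbool.h>
#include <stdio.h>

/* Standalone sketch of the adjacency walk: order[0] must already hold the
   index of the lowest offset; each later entry must name the offset exactly
   4 above the previous one, and the register numbers must ascend along the
   same order.  */
static bool
sketch_offset_order (int nops, const long *offsets, int *order,
		     const int *regs)
{
  int i, j;

  for (i = 1; i < nops; i++)
    {
      order[i] = order[i - 1];
      for (j = 0; j < nops; j++)
	if (offsets[j] == offsets[order[i - 1]] + 4)
	  {
	    /* A second match would mean a duplicate offset.  */
	    if (order[i] != order[i - 1])
	      return false;
	    order[i] = j;
	  }
      /* No match at all: the offsets are not adjacent.  */
      if (order[i] == order[i - 1])
	return false;
      /* ldm/stm need the register numbers to ascend with the addresses.  */
      if (regs != NULL && regs[order[i]] <= regs[order[i - 1]])
	return false;
    }
  return true;
}

int
main (void)
{
  /* Offsets and registers as they might appear in the insn stream.  */
  long offsets[4] = { 8, 0, 12, 4 };
  int regs[4] = { 3, 1, 4, 2 };
  int order[4] = { 1, 0, 0, 0 };	/* Index 1 holds the lowest offset.  */
  int i;

  if (sketch_offset_order (4, offsets, order, regs))
    for (i = 0; i < 4; i++)
      printf ("access %d: offset %ld -> r%d\n", i,
	      offsets[order[i]], regs[order[i]]);
  return 0;
}

With this example data it prints the four accesses in ascending offset order (0, 4, 8, 12) mapped to r1..r4, i.e. a run that could become a single ldmia.
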
int
load_multiple_sequence (rtx *operands, int nops, int *regs, int *base,
HOST_WIDE_INT *load_offset)
{
int unsorted_regs[4];
HOST_WIDE_INT unsorted_offsets[4];
int order[4];
int unsorted_regs[MAX_LDM_STM_OPS];
HOST_WIDE_INT unsorted_offsets[MAX_LDM_STM_OPS];
int order[MAX_LDM_STM_OPS];
int base_reg = -1;
int i;
int i, ldm_case;
/* Can only handle 2, 3, or 4 insns at present,
though could be easily extended if required. */
gcc_assert (nops >= 2 && nops <= 4);
/* Can only handle up to MAX_LDM_STM_OPS insns at present, though could be
easily extended if required. */
gcc_assert (nops >= 2 && nops <= MAX_LDM_STM_OPS);
memset (order, 0, 4 * sizeof (int));
memset (order, 0, MAX_LDM_STM_OPS * sizeof (int));
/* Loop over the operands and check that the memory references are
suitable (i.e. immediate offsets from the same base register). At
@@ -9123,25 +9207,16 @@ load_multiple_sequence (rtx *operands, int nops, int *regs, int *base,
== CONST_INT)))
{
if (i == 0)
{
base_reg = REGNO (reg);
unsorted_regs[0] = (GET_CODE (operands[i]) == REG
? REGNO (operands[i])
: REGNO (SUBREG_REG (operands[i])));
order[0] = 0;
}
base_reg = REGNO (reg);
else
{
if (base_reg != (int) REGNO (reg))
/* Not addressed from the same base register. */
return 0;
unsorted_regs[i] = (GET_CODE (operands[i]) == REG
? REGNO (operands[i])
: REGNO (SUBREG_REG (operands[i])));
if (unsorted_regs[i] < unsorted_regs[order[0]])
order[0] = i;
}
unsorted_regs[i] = (GET_CODE (operands[i]) == REG
? REGNO (operands[i])
: REGNO (SUBREG_REG (operands[i])));
/* If it isn't an integer register, or if it overwrites the
base register but isn't the last insn in the list, then
@@ -9151,6 +9226,8 @@ load_multiple_sequence (rtx *operands, int nops, int *regs, int *base,
return 0;
unsorted_offsets[i] = INTVAL (offset);
if (i == 0 || unsorted_offsets[i] < unsorted_offsets[order[0]])
order[0] = i;
}
else
/* Not a suitable memory address. */
@@ -9159,30 +9236,11 @@ load_multiple_sequence (rtx *operands, int nops, int *regs, int *base,
/* All the useful information has now been extracted from the
operands into unsorted_regs and unsorted_offsets; additionally,
order[0] has been set to the lowest numbered register in the
list. Sort the registers into order, and check that the memory
offsets are ascending and adjacent. */
for (i = 1; i < nops; i++)
{
int j;
order[i] = order[i - 1];
for (j = 0; j < nops; j++)
if (unsorted_regs[j] > unsorted_regs[order[i - 1]]
&& (order[i] == order[i - 1]
|| unsorted_regs[j] < unsorted_regs[order[i]]))
order[i] = j;
/* Have we found a suitable register? if not, one must be used more
than once. */
if (order[i] == order[i - 1])
return 0;
/* Is the memory address adjacent and ascending? */
if (unsorted_offsets[order[i]] != unsorted_offsets[order[i - 1]] + 4)
return 0;
}
order[0] has been set to the lowest offset in the list. Sort
the offsets into order, verifying that they are adjacent, and
check that the register numbers are ascending. */
if (!compute_offset_order (nops, unsorted_offsets, order, unsorted_regs))
return 0;
if (base)
{
@@ -9195,59 +9253,31 @@ load_multiple_sequence (rtx *operands, int nops, int *regs, int *base,
}
if (unsorted_offsets[order[0]] == 0)
return 1; /* ldmia */
if (TARGET_ARM && unsorted_offsets[order[0]] == 4)
return 2; /* ldmib */
if (TARGET_ARM && unsorted_offsets[order[nops - 1]] == 0)
return 3; /* ldmda */
if (unsorted_offsets[order[nops - 1]] == -4)
return 4; /* ldmdb */
/* For ARM8,9 & StrongARM, 2 ldr instructions are faster than an ldm
if the offset isn't small enough. The reason 2 ldrs are faster
is because these ARMs are able to do more than one cache access
in a single cycle. The ARM9 and StrongARM have Harvard caches,
whilst the ARM8 has a double bandwidth cache. This means that
these cores can do both an instruction fetch and a data fetch in
a single cycle, so the trick of calculating the address into a
scratch register (one of the result regs) and then doing a load
multiple actually becomes slower (and no smaller in code size).
That is the transformation
ldr rd1, [rbase + offset]
ldr rd2, [rbase + offset + 4]
to
add rd1, rbase, offset
ldmia rd1, {rd1, rd2}
produces worse code -- '3 cycles + any stalls on rd2' instead of
'2 cycles + any stalls on rd2'. On ARMs with only one cache
access per cycle, the first sequence could never complete in less
than 6 cycles, whereas the ldm sequence would only take 5 and
would make better use of sequential accesses if not hitting the
cache.
We cheat here and test 'arm_ld_sched' which we currently know to
only be true for the ARM8, ARM9 and StrongARM. If this ever
changes, then the test below needs to be reworked. */
if (nops == 2 && arm_ld_sched)
ldm_case = 1; /* ldmia */
else if (TARGET_ARM && unsorted_offsets[order[0]] == 4)
ldm_case = 2; /* ldmib */
else if (TARGET_ARM && unsorted_offsets[order[nops - 1]] == 0)
ldm_case = 3; /* ldmda */
else if (unsorted_offsets[order[nops - 1]] == -4)
ldm_case = 4; /* ldmdb */
else if (const_ok_for_arm (unsorted_offsets[order[0]])
|| const_ok_for_arm (-unsorted_offsets[order[0]]))
ldm_case = 5;
else
return 0;
/* Can't do it without setting up the offset, only do this if it takes
no more than one insn. */
return (const_ok_for_arm (unsorted_offsets[order[0]])
|| const_ok_for_arm (-unsorted_offsets[order[0]])) ? 5 : 0;
if (!multiple_operation_profitable_p (false, nops,
ldm_case == 5
? unsorted_offsets[order[0]] : 0))
return 0;
return ldm_case;
}
const char *
emit_ldm_seq (rtx *operands, int nops)
{
int regs[4];
int regs[MAX_LDM_STM_OPS];
int base_reg;
HOST_WIDE_INT offset;
char buf[100];
@@ -9306,17 +9336,17 @@ int
store_multiple_sequence (rtx *operands, int nops, int *regs, int *base,
HOST_WIDE_INT * load_offset)
{
int unsorted_regs[4];
HOST_WIDE_INT unsorted_offsets[4];
int order[4];
int unsorted_regs[MAX_LDM_STM_OPS];
HOST_WIDE_INT unsorted_offsets[MAX_LDM_STM_OPS];
int order[MAX_LDM_STM_OPS];
int base_reg = -1;
int i;
int i, stm_case;
/* Can only handle 2, 3, or 4 insns at present, though could be easily
extended if required. */
gcc_assert (nops >= 2 && nops <= 4);
/* Can only handle up to MAX_LDM_STM_OPS insns at present, though could be
easily extended if required. */
gcc_assert (nops >= 2 && nops <= MAX_LDM_STM_OPS);
memset (order, 0, 4 * sizeof (int));
memset (order, 0, MAX_LDM_STM_OPS * sizeof (int));
/* Loop over the operands and check that the memory references are
suitable (i.e. immediate offsets from the same base register). At
@@ -9351,32 +9381,22 @@ store_multiple_sequence (rtx *operands, int nops, int *regs, int *base,
&& (GET_CODE (offset = XEXP (XEXP (operands[nops + i], 0), 1))
== CONST_INT)))
{
unsorted_regs[i] = (GET_CODE (operands[i]) == REG
? REGNO (operands[i])
: REGNO (SUBREG_REG (operands[i])));
if (i == 0)
{
base_reg = REGNO (reg);
unsorted_regs[0] = (GET_CODE (operands[i]) == REG
? REGNO (operands[i])
: REGNO (SUBREG_REG (operands[i])));
order[0] = 0;
}
else
{
if (base_reg != (int) REGNO (reg))
/* Not addressed from the same base register. */
return 0;
unsorted_regs[i] = (GET_CODE (operands[i]) == REG
? REGNO (operands[i])
: REGNO (SUBREG_REG (operands[i])));
if (unsorted_regs[i] < unsorted_regs[order[0]])
order[0] = i;
}
base_reg = REGNO (reg);
else if (base_reg != (int) REGNO (reg))
/* Not addressed from the same base register. */
return 0;
/* If it isn't an integer register, then we can't do this. */
if (unsorted_regs[i] < 0 || unsorted_regs[i] > 14)
return 0;
unsorted_offsets[i] = INTVAL (offset);
if (i == 0 || unsorted_offsets[i] < unsorted_offsets[order[0]])
order[0] = i;
}
else
/* Not a suitable memory address. */
@@ -9385,30 +9405,11 @@ store_multiple_sequence (rtx *operands, int nops, int *regs, int *base,
/* All the useful information has now been extracted from the
operands into unsorted_regs and unsorted_offsets; additionally,
order[0] has been set to the lowest numbered register in the
list. Sort the registers into order, and check that the memory
offsets are ascending and adjacent. */
for (i = 1; i < nops; i++)
{
int j;
order[i] = order[i - 1];
for (j = 0; j < nops; j++)
if (unsorted_regs[j] > unsorted_regs[order[i - 1]]
&& (order[i] == order[i - 1]
|| unsorted_regs[j] < unsorted_regs[order[i]]))
order[i] = j;
/* Have we found a suitable register? if not, one must be used more
than once. */
if (order[i] == order[i - 1])
return 0;
/* Is the memory address adjacent and ascending? */
if (unsorted_offsets[order[i]] != unsorted_offsets[order[i - 1]] + 4)
return 0;
}
order[0] has been set to the lowest offset in the list. Sort
the offsets into order, verifying that they are adjacent, and
check that the register numbers are ascending. */
if (!compute_offset_order (nops, unsorted_offsets, order, unsorted_regs))
return 0;
if (base)
{
@@ -9421,24 +9422,26 @@ store_multiple_sequence (rtx *operands, int nops, int *regs, int *base,
}
if (unsorted_offsets[order[0]] == 0)
return 1; /* stmia */
stm_case = 1; /* stmia */
else if (TARGET_ARM && unsorted_offsets[order[0]] == 4)
stm_case = 2; /* stmib */
else if (TARGET_ARM && unsorted_offsets[order[nops - 1]] == 0)
stm_case = 3; /* stmda */
else if (unsorted_offsets[order[nops - 1]] == -4)
stm_case = 4; /* stmdb */
else
return 0;
if (unsorted_offsets[order[0]] == 4)
return 2; /* stmib */
if (!multiple_operation_profitable_p (false, nops, 0))
return 0;
if (unsorted_offsets[order[nops - 1]] == 0)
return 3; /* stmda */
if (unsorted_offsets[order[nops - 1]] == -4)
return 4; /* stmdb */
return 0;
return stm_case;
}
const char *
emit_stm_seq (rtx *operands, int nops)
{
int regs[4];
int regs[MAX_LDM_STM_OPS];
int base_reg;
HOST_WIDE_INT offset;
char buf[100];

gcc/config/arm/arm.h

@@ -2769,4 +2769,8 @@ enum arm_builtins
#define NEED_INDICATE_EXEC_STACK 0
#endif
/* The maximum number of parallel loads or stores we support in an ldm/stm
instruction. */
#define MAX_LDM_STM_OPS 4
#endif /* ! GCC_ARM_H */
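
(Illustrative aside, not part of the patch: the case numbers returned by load_multiple_sequence map onto ARM addressing modes roughly as sketched below.  fits_add_immediate is a made-up stand-in for GCC's const_ok_for_arm, which really tests the 8-bit rotated-immediate encoding.)

#include <stdbool.h>

/* Stand-in for const_ok_for_arm (): assume a small signed range here.  */
static bool
fits_add_immediate (long offset)
{
  return offset >= -255 && offset <= 255;
}

/* LOW is the smallest offset in the sorted run, HIGH the largest
   (LOW + 4 * (nops - 1)).  Returns the same case numbering as
   load_multiple_sequence, or 0 if no ldm form applies.  */
static int
classify_ldm (long low, long high, bool target_arm)
{
  if (low == 0)
    return 1;		/* ldmia: base points at the first word.  */
  if (target_arm && low == 4)
    return 2;		/* ldmib: base just below the first word.  */
  if (target_arm && high == 0)
    return 3;		/* ldmda: base points at the last word.  */
  if (high == -4)
    return 4;		/* ldmdb: base just above the last word.  */
  if (fits_add_immediate (low) || fits_add_immediate (-low))
    return 5;		/* A single add insn can form the base.  */
  return 0;
}

In the patched code, case 5 is the only one that reaches multiple_operation_profitable_p with a nonzero add_offset, which is what lets it reject the two-load sequence on cores where arm_ld_sched holds (ARM8, ARM9, StrongARM).
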