ia64.h (INIT_TARGET_OPTABS): Remove.
* config/ia64/ia64.h (INIT_TARGET_OPTABS): Remove. * config/ia64/lib1funcs.asm (__divdi3): Update from Intel IA-64 Optimization Guide, minimum latency alternative. (__moddi3, __udivdi3, __umoddi3): Likewise. (__divsi3, __modsi3, __udivsi3, __umodsi3): Likewise. From-SVN: r36169
This commit is contained in:
parent
1056d2281e
commit
d8d7a2867b
3 changed files with 145 additions and 243 deletions
|
@ -1,3 +1,11 @@
|
|||
2000-09-05 Richard Henderson <rth@cygnus.com>
|
||||
|
||||
* config/ia64/ia64.h (INIT_TARGET_OPTABS): Remove.
|
||||
* config/ia64/lib1funcs.asm (__divdi3): Update from Intel IA-64
|
||||
Optimization Guide, minimum latency alternative.
|
||||
(__moddi3, __udivdi3, __umoddi3): Likewise.
|
||||
(__divsi3, __modsi3, __udivsi3, __umodsi3): Likewise.
|
||||
|
||||
2000-09-05 Bruce Korb <bkorb@gnu.org>
|
||||
|
||||
* gcc/fixinc/fixincl.c (load_file): Always read header files
|
||||
|
|
|
@ -1694,18 +1694,6 @@ do { \
|
|||
for lib1funcs.asm modules, e.g. __divdi3 vs _divdi3. Since lib1funcs.asm
|
||||
goes into libgcc.a first, the linker will find it first. */
|
||||
|
||||
/* Define this macro as a C statement that declares additional library routines
|
||||
or renames existing ones. */
|
||||
|
||||
/* ??? Disable the SImode divide routines for now. */
|
||||
#define INIT_TARGET_OPTABS \
|
||||
do { \
|
||||
sdiv_optab->handlers[(int) SImode].libfunc = 0; \
|
||||
udiv_optab->handlers[(int) SImode].libfunc = 0; \
|
||||
smod_optab->handlers[(int) SImode].libfunc = 0; \
|
||||
umod_optab->handlers[(int) SImode].libfunc = 0; \
|
||||
} while (0)
|
||||
|
||||
/* Define this macro if GNU CC should generate calls to the System V (and ANSI
|
||||
C) library functions `memcpy' and `memset' rather than the BSD functions
|
||||
`bcopy' and `bzero'. */
|
||||
|
|
|
@ -116,16 +116,10 @@ __divsf3:
|
|||
#ifdef L__divdi3
|
||||
// Compute a 64-bit integer quotient.
|
||||
//
|
||||
// Use reciprocal approximation and Newton-Raphson iteration to compute the
|
||||
// quotient. frcpa gives 8.6 significant bits, so we need 3 iterations
|
||||
// to get more than the 64 bits of precision that we need for DImode.
|
||||
// From the Intel IA-64 Optimization Guide, choose the minimum latency
|
||||
// alternative.
|
||||
//
|
||||
// Must use max precision for the reciprocal computations to get 64 bits of
|
||||
// precision.
|
||||
//
|
||||
// r32/f8 holds the dividend. r33/f9 holds the divisor.
|
||||
// f10 holds the value 2.0. f11 holds the reciprocal approximation.
|
||||
// f12 is a temporary.
|
||||
// in0 holds the dividend. in1 holds the divisor.
|
||||
|
||||
.text
|
||||
.align 16
|
||||
|
@ -143,31 +137,26 @@ __divdi3:
|
|||
;;
|
||||
// Compute the reciprocal approximation.
|
||||
frcpa.s1 f10, p6 = f8, f9
|
||||
;;
|
||||
// 3 Newton-Raphson iterations.
|
||||
(p6) fma.s1 f11 = farg0, f10, f0
|
||||
(p6) fnma.s1 f12 = farg1, f10, f1
|
||||
(p6) fnma.s1 f11 = f9, f10, f1
|
||||
(p6) fmpy.s1 f12 = f8, f10
|
||||
;;
|
||||
(p6) fma.s1 f11 = f12, f11, f11
|
||||
(p6) fma.s1 f13 = f12, f12, f0
|
||||
(p6) fma.s1 f10 = f12, f10, f10
|
||||
(p6) fmpy.s1 f13 = f11, f11
|
||||
(p6) fma.s1 f12 = f11, f12, f12
|
||||
;;
|
||||
(p6) fma.s1 f10 = f11, f10, f10
|
||||
(p6) fma.s1 f11 = f13, f12, f12
|
||||
;;
|
||||
(p6) fma.s1 f11 = f13, f11, f11
|
||||
(p6) fma.s1 f12 = f13, f13, f0
|
||||
(p6) fma.s1 f10 = f13, f10, f10
|
||||
(p6) fnma.s1 f12 = f9, f11, f8
|
||||
;;
|
||||
(p6) fma.s1 f11 = f12, f11, f11
|
||||
(p6) fma.s1 f10 = f12, f10, f10
|
||||
;;
|
||||
(p6) fnma.s1 f8 = f9, f11, f8
|
||||
;;
|
||||
(p6) fma.s1 f10 = f8, f10, f11
|
||||
(p6) fma.s1 f10 = f12, f10, f11
|
||||
;;
|
||||
// Round quotient to an integer.
|
||||
fcvt.fx.trunc.s1 f8 = f10
|
||||
fcvt.fx.trunc.s1 f10 = f10
|
||||
;;
|
||||
// Transfer result to GP registers.
|
||||
getf.sig ret0 = f8
|
||||
getf.sig ret0 = f10
|
||||
br.ret.sptk rp
|
||||
;;
|
||||
.endp __divdi3
|
||||
|
@ -176,16 +165,10 @@ __divdi3:
|
|||
#ifdef L__moddi3
|
||||
// Compute a 64-bit integer modulus.
|
||||
//
|
||||
// Use reciprocal approximation and Newton-Raphson iteration to compute the
|
||||
// quotient. frcpa gives 8.6 significant bits, so we need 3 iterations
|
||||
// to get more than the 64 bits of precision that we need for DImode.
|
||||
// From the Intel IA-64 Optimization Guide, choose the minimum latency
|
||||
// alternative.
|
||||
//
|
||||
// Must use max precision for the reciprocal computations to get 64 bits of
|
||||
// precision.
|
||||
//
|
||||
// r32/f8 holds the dividend. r33/f9 holds the divisor.
|
||||
// f10 holds the value 2.0. f11 holds the reciprocal approximation.
|
||||
// f12 is a temporary.
|
||||
// in0 holds the dividend (a). in1 holds the divisor (b).
|
||||
|
||||
.text
|
||||
.align 16
|
||||
|
@ -194,49 +177,40 @@ __divdi3:
|
|||
__moddi3:
|
||||
.regstk 2,0,0,0
|
||||
// Transfer inputs to FP registers.
|
||||
setf.sig f8 = in0
|
||||
setf.sig f14 = in0
|
||||
setf.sig f9 = in1
|
||||
;;
|
||||
// Convert the inputs to FP, so that they won't be treated as unsigned.
|
||||
fcvt.xf f8 = f8
|
||||
fcvt.xf f8 = f14
|
||||
fcvt.xf f9 = f9
|
||||
;;
|
||||
// Compute the reciprocal approximation.
|
||||
frcpa.s1 f10, p6 = f8, f9
|
||||
;;
|
||||
// 3 Newton-Raphson iterations.
|
||||
(p6) fma.s1 f11 = farg0, f10, f0
|
||||
(p6) fnma.s1 f12 = farg1, f10, f1
|
||||
(p6) fmpy.s1 f12 = f8, f10
|
||||
(p6) fnma.s1 f11 = f9, f10, f1
|
||||
;;
|
||||
(p6) fma.s1 f11 = f12, f11, f11
|
||||
(p6) fma.s1 f13 = f12, f12, f0
|
||||
(p6) fma.s1 f10 = f12, f10, f10
|
||||
(p6) fma.s1 f12 = f11, f12, f12
|
||||
(p6) fmpy.s1 f13 = f11, f11
|
||||
;;
|
||||
(p6) fma.s1 f11 = f13, f11, f11
|
||||
(p6) fma.s1 f12 = f13, f13, f0
|
||||
(p6) fma.s1 f10 = f11, f10, f10
|
||||
(p6) fma.s1 f11 = f13, f12, f12
|
||||
;;
|
||||
sub in1 = r0, in1
|
||||
(p6) fma.s1 f10 = f13, f10, f10
|
||||
;;
|
||||
(p6) fma.s1 f11 = f12, f11, f11
|
||||
(p6) fma.s1 f10 = f12, f10, f10
|
||||
;;
|
||||
(p6) fnma.s1 f12 = f9, f11, f8
|
||||
;;
|
||||
setf.sig f9 = in1
|
||||
(p6) fma.s1 f10 = f12, f10, f11
|
||||
;;
|
||||
// Round quotient to an integer.
|
||||
fcvt.fx.trunc.s1 f10 = f10
|
||||
;;
|
||||
// Renormalize.
|
||||
fcvt.xf f10 = f10
|
||||
;;
|
||||
// Compute remainder.
|
||||
fnma.s1 f8 = f10, f9, f8
|
||||
;;
|
||||
// Round remainder to an integer.
|
||||
fcvt.fx.trunc.s1 f8 = f8
|
||||
// r = q * (-b) + a
|
||||
xma.l f10 = f10, f9, f14
|
||||
;;
|
||||
// Transfer result to GP registers.
|
||||
getf.sig ret0 = f8
|
||||
getf.sig ret0 = f10
|
||||
br.ret.sptk rp
|
||||
;;
|
||||
.endp __moddi3
|
||||
|
@ -245,16 +219,10 @@ __moddi3:
|
|||
#ifdef L__udivdi3
|
||||
// Compute a 64-bit unsigned integer quotient.
|
||||
//
|
||||
// Use reciprocal approximation and Newton-Raphson iteration to compute the
|
||||
// quotient. frcpa gives 8.6 significant bits, so we need 3 iterations
|
||||
// to get more than the 64 bits of precision that we need for DImode.
|
||||
// From the Intel IA-64 Optimization Guide, choose the minimum latency
|
||||
// alternative.
|
||||
//
|
||||
// Must use max precision for the reciprocal computations to get 64 bits of
|
||||
// precision.
|
||||
//
|
||||
// r32/f8 holds the dividend. r33/f9 holds the divisor.
|
||||
// f10 holds the value 2.0. f11 holds the reciprocal approximation.
|
||||
// f12 is a temporary.
|
||||
// in0 holds the dividend. in1 holds the divisor.
|
||||
|
||||
.text
|
||||
.align 16
|
||||
|
@ -274,29 +242,25 @@ __udivdi3:
|
|||
frcpa.s1 f10, p6 = f8, f9
|
||||
;;
|
||||
// 3 Newton-Raphson iterations.
|
||||
(p6) fma.s1 f11 = farg0, f10, f0
|
||||
(p6) fnma.s1 f12 = farg1, f10, f1
|
||||
(p6) fnma.s1 f11 = f9, f10, f1
|
||||
(p6) fmpy.s1 f12 = f8, f10
|
||||
;;
|
||||
(p6) fma.s1 f11 = f12, f11, f11
|
||||
(p6) fma.s1 f13 = f12, f12, f0
|
||||
(p6) fma.s1 f10 = f12, f10, f10
|
||||
(p6) fmpy.s1 f13 = f11, f11
|
||||
(p6) fma.s1 f12 = f11, f12, f12
|
||||
;;
|
||||
(p6) fma.s1 f10 = f11, f10, f10
|
||||
(p6) fma.s1 f11 = f13, f12, f12
|
||||
;;
|
||||
(p6) fma.s1 f11 = f13, f11, f11
|
||||
(p6) fma.s1 f12 = f13, f13, f0
|
||||
(p6) fma.s1 f10 = f13, f10, f10
|
||||
(p6) fnma.s1 f12 = f9, f11, f8
|
||||
;;
|
||||
(p6) fma.s1 f11 = f12, f11, f11
|
||||
(p6) fma.s1 f10 = f12, f10, f10
|
||||
;;
|
||||
(p6) fnma.s1 f8 = f9, f11, f8
|
||||
;;
|
||||
(p6) fma.s1 f10 = f8, f10, f11
|
||||
(p6) fma.s1 f10 = f2, f10, f11
|
||||
;;
|
||||
// Round quotient to an unsigned integer.
|
||||
fcvt.fxu.trunc.s1 f8 = f10
|
||||
fcvt.fxu.trunc.s1 f10 = f10
|
||||
;;
|
||||
// Transfer result to GP registers.
|
||||
getf.sig ret0 = f8
|
||||
getf.sig ret0 = f10
|
||||
br.ret.sptk rp
|
||||
;;
|
||||
.endp __udivdi3
|
||||
|
@ -305,16 +269,10 @@ __udivdi3:
|
|||
#ifdef L__umoddi3
|
||||
// Compute a 64-bit unsigned integer modulus.
|
||||
//
|
||||
// Use reciprocal approximation and Newton-Raphson iteration to compute the
|
||||
// quotient. frcpa gives 8.6 significant bits, so we need 3 iterations
|
||||
// to get more than the 64 bits of precision that we need for DImode.
|
||||
// From the Intel IA-64 Optimization Guide, choose the minimum latency
|
||||
// alternative.
|
||||
//
|
||||
// Must use max precision for the reciprocal computations to get 64 bits of
|
||||
// precision.
|
||||
//
|
||||
// r32/f8 holds the dividend. r33/f9 holds the divisor.
|
||||
// f10 holds the value 2.0. f11 holds the reciprocal approximation.
|
||||
// f12 is a temporary.
|
||||
// in0 holds the dividend (a). in1 holds the divisor (b).
|
||||
|
||||
.text
|
||||
.align 16
|
||||
|
@ -323,49 +281,41 @@ __udivdi3:
|
|||
__umoddi3:
|
||||
.regstk 2,0,0,0
|
||||
// Transfer inputs to FP registers.
|
||||
setf.sig f8 = in0
|
||||
setf.sig f14 = in0
|
||||
setf.sig f9 = in1
|
||||
;;
|
||||
// Convert the inputs to FP, to avoid FP software assist faults.
|
||||
fcvt.xuf.s1 f8 = f8
|
||||
fcvt.xuf.s1 f8 = f14
|
||||
fcvt.xuf.s1 f9 = f9
|
||||
;;
|
||||
// Compute the reciprocal approximation.
|
||||
frcpa.s1 f10, p6 = f8, f9
|
||||
;;
|
||||
// 3 Newton-Raphson iterations.
|
||||
(p6) fma.s1 f11 = farg0, f10, f0
|
||||
(p6) fnma.s1 f12 = farg1, f10, f1
|
||||
(p6) fmpy.s1 f12 = f8, f10
|
||||
(p6) fnma.s1 f11 = f9, f10, f1
|
||||
;;
|
||||
(p6) fma.s1 f11 = f12, f11, f11
|
||||
(p6) fma.s1 f13 = f12, f12, f0
|
||||
(p6) fma.s1 f10 = f12, f10, f10
|
||||
(p6) fma.s1 f12 = f11, f12, f12
|
||||
(p6) fmpy.s1 f13 = f11, f11
|
||||
;;
|
||||
(p6) fma.s1 f11 = f13, f11, f11
|
||||
(p6) fma.s1 f12 = f13, f13, f0
|
||||
(p6) fma.s1 f10 = f11, f10, f10
|
||||
(p6) fma.s1 f11 = f13, f12, f12
|
||||
;;
|
||||
sub in1 = r0, in1
|
||||
(p6) fma.s1 f10 = f13, f10, f10
|
||||
;;
|
||||
(p6) fma.s1 f11 = f12, f11, f11
|
||||
(p6) fma.s1 f10 = f12, f10, f10
|
||||
;;
|
||||
(p6) fnma.s1 f12 = f9, f11, f8
|
||||
;;
|
||||
setf.sig f9 = in1
|
||||
(p6) fma.s1 f10 = f12, f10, f11
|
||||
;;
|
||||
// Round quotient to an unsigned integer.
|
||||
fcvt.fxu.trunc.s1 f10 = f10
|
||||
;;
|
||||
// Renormalize.
|
||||
fcvt.xuf.s1 f10 = f10
|
||||
;;
|
||||
// Compute remainder.
|
||||
fnma.s1 f8 = f10, f9, f8
|
||||
;;
|
||||
// Round remainder to an integer.
|
||||
fcvt.fxu.trunc.s1 f8 = f8
|
||||
// r = q * (-b) + a
|
||||
xma.l f10 = f10, f9, f14
|
||||
;;
|
||||
// Transfer result to GP registers.
|
||||
getf.sig ret0 = f8
|
||||
getf.sig ret0 = f10
|
||||
br.ret.sptk rp
|
||||
;;
|
||||
.endp __umoddi3
|
||||
|
@ -374,21 +324,10 @@ __umoddi3:
|
|||
#ifdef L__divsi3
|
||||
// Compute a 32-bit integer quotient.
|
||||
//
|
||||
// Use reciprocal approximation and Newton-Raphson iteration to compute the
|
||||
// quotient. frcpa gives 8.6 significant bits, so we need 2 iterations
|
||||
// to get more than the 32 bits of precision that we need for SImode.
|
||||
// From the Intel IA-64 Optimization Guide, choose the minimum latency
|
||||
// alternative.
|
||||
//
|
||||
// ??? This is currently not used. It needs to be fixed to be more like the
|
||||
// above DImode routines.
|
||||
//
|
||||
// ??? Check to see if the error is less than .5ulp. We may need
|
||||
// some adjustment code to get precise enough results.
|
||||
//
|
||||
// ??? Should probably use max precision for the reciprocal computations.
|
||||
//
|
||||
// r32/f8 holds the dividend. r33/f9 holds the divisor.
|
||||
// f10 holds the value 2.0. f11 holds the reciprocal approximation.
|
||||
// f12 is a temporary.
|
||||
// in0 holds the dividend. in1 holds the divisor.
|
||||
|
||||
.text
|
||||
.align 16
|
||||
|
@ -396,28 +335,30 @@ __umoddi3:
|
|||
.proc __divsi3
|
||||
__divsi3:
|
||||
.regstk 2,0,0,0
|
||||
sxt4 in0 = in0
|
||||
sxt4 in1 = in1
|
||||
;;
|
||||
setf.sig f8 = in0
|
||||
setf.sig f9 = in1
|
||||
;;
|
||||
mov r2 = 0x0ffdd
|
||||
fcvt.xf f8 = f8
|
||||
fcvt.xf f9 = f9
|
||||
;;
|
||||
frcpa f11, p6 = f8, f9
|
||||
fadd f10 = f1, f1
|
||||
setf.exp f11 = r2
|
||||
frcpa f10, p6 = f8, f9
|
||||
;;
|
||||
fnma f12 = f9, f11, f10
|
||||
(p6) fmpy.s1 f8 = f8, f10
|
||||
(p6) fnma.s1 f9 = f9, f10, f1
|
||||
;;
|
||||
fmpy f11 = f11, f12
|
||||
(p6) fma.s1 f8 = f9, f8, f8
|
||||
(p6) fma.s1 f9 = f9, f9, f11
|
||||
;;
|
||||
fnma f12 = f9, f11, f10
|
||||
(p6) fma.s1 f10 = f9, f8, f8
|
||||
;;
|
||||
fmpy f11 = f11, f12
|
||||
fcvt.fx.trunc.s1 f10 = f10
|
||||
;;
|
||||
fmpy f8 = f8, f11
|
||||
;;
|
||||
fcvt.fx.trunc f8 = f8
|
||||
;;
|
||||
getf.sig ret0 = f8
|
||||
getf.sig ret0 = f10
|
||||
br.ret.sptk rp
|
||||
;;
|
||||
.endp __divsi3
|
||||
|
@ -426,21 +367,10 @@ __divsi3:
|
|||
#ifdef L__modsi3
|
||||
// Compute a 32-bit integer modulus.
|
||||
//
|
||||
// Use reciprocal approximation and Newton-Raphson iteration to compute the
|
||||
// quotient. frcpa gives 8.6 significant bits, so we need 2 iterations
|
||||
// to get more than the 32 bits of precision that we need for SImode.
|
||||
// From the Intel IA-64 Optimization Guide, choose the minimum latency
|
||||
// alternative.
|
||||
//
|
||||
// ??? This is currently not used. It needs to be fixed to be more like the
|
||||
// above DImode routines.
|
||||
//
|
||||
// ??? Check to see if the error is less than .5ulp. We may need
|
||||
// some adjustment code to get precise enough results.
|
||||
//
|
||||
// ??? Should probably use max precision for the reciprocal computations.
|
||||
//
|
||||
// r32/f8 holds the dividend. r33/f9 holds the divisor.
|
||||
// f10 holds the value 2.0. f11 holds the reciprocal approximation.
|
||||
// f12 is a temporary.
|
||||
// in0 holds the dividend. in1 holds the divisor.
|
||||
|
||||
.text
|
||||
.align 16
|
||||
|
@ -448,34 +378,34 @@ __divsi3:
|
|||
.proc __modsi3
|
||||
__modsi3:
|
||||
.regstk 2,0,0,0
|
||||
setf.sig f8 = r32
|
||||
mov r2 = 0x0ffdd
|
||||
sxt4 in0 = in0
|
||||
sxt4 in1 = in1
|
||||
;;
|
||||
setf.sig f13 = r32
|
||||
setf.sig f9 = r33
|
||||
;;
|
||||
fcvt.xf f8 = f8
|
||||
sub in1 = r0, in1
|
||||
fcvt.xf f8 = f13
|
||||
fcvt.xf f9 = f9
|
||||
;;
|
||||
frcpa f11, p6 = f8, f9
|
||||
fadd f10 = f1, f1
|
||||
setf.exp f11 = r2
|
||||
frcpa f10, p6 = f8, f9
|
||||
;;
|
||||
fnma f12 = f9, f11, f10
|
||||
(p6) fmpy.s1 f12 = f8, f10
|
||||
(p6) fnma.s1 f10 = f9, f10, f1
|
||||
;;
|
||||
fmpy f11 = f11, f12
|
||||
setf.sig f9 = in1
|
||||
(p6) fma.s1 f12 = f10, f12, f12
|
||||
(p6) fma.s1 f10 = f10, f10, f11
|
||||
;;
|
||||
fnma f12 = f9, f11, f10
|
||||
(p6) fma.s1 f10 = f10, f12, f12
|
||||
;;
|
||||
fmpy f11 = f11, f12
|
||||
fcvt.fx.trunc.s1 f10 = f10
|
||||
;;
|
||||
fmpy f10 = f8, f11
|
||||
xma.l f10 = f10, f9, f13
|
||||
;;
|
||||
fcvt.fx.trunc f10 = f10
|
||||
;;
|
||||
fcvt.xf f10 = f10
|
||||
;;
|
||||
fnma f8 = f10, f9, f8
|
||||
;;
|
||||
fcvt.fx f8 = f8
|
||||
;;
|
||||
getf.sig r32 = f8
|
||||
getf.sig ret0 = f10
|
||||
br.ret.sptk rp
|
||||
;;
|
||||
.endp __modsi3
|
||||
|
@ -484,24 +414,10 @@ __modsi3:
|
|||
#ifdef L__udivsi3
|
||||
// Compute a 32-bit unsigned integer quotient.
|
||||
//
|
||||
// Use reciprocal approximation and Newton-Raphson iteration to compute the
|
||||
// quotient. frcpa gives 8.6 significant bits, so we need 2 iterations
|
||||
// to get more than the 32 bits of precision that we need for SImode.
|
||||
// From the Intel IA-64 Optimization Guide, choose the minimum latency
|
||||
// alternative.
|
||||
//
|
||||
// ??? This is currently not used. It needs to be fixed to be more like the
|
||||
// above DImode routines.
|
||||
//
|
||||
// ??? Check to see if the error is less than .5ulp. We may need
|
||||
// some adjustment code to get precise enough results.
|
||||
//
|
||||
// ??? Should probably use max precision for the reciprocal computations.
|
||||
//
|
||||
// r32/f8 holds the dividend. r33/f9 holds the divisor.
|
||||
// f10 holds the value 2.0. f11 holds the reciprocal approximation.
|
||||
// f12 is a temporary.
|
||||
//
|
||||
// This is the same as divsi3, except that we don't need fcvt instructions
|
||||
// before the frcpa.
|
||||
// in0 holds the dividend. in1 holds the divisor.
|
||||
|
||||
.text
|
||||
.align 16
|
||||
|
@ -509,25 +425,27 @@ __modsi3:
|
|||
.proc __udivsi3
|
||||
__udivsi3:
|
||||
.regstk 2,0,0,0
|
||||
setf.sig f8 = r32
|
||||
setf.sig f9 = r33
|
||||
mov r2 = 0x0ffdd
|
||||
zxt4 in0 = in0
|
||||
zxt4 in1 = in1
|
||||
;;
|
||||
frcpa f11, p6 = f8, f9
|
||||
fadd f10 = f1, f1
|
||||
setf.sig f8 = in0
|
||||
setf.sig f9 = in1
|
||||
;;
|
||||
fnma f12 = f9, f11, f10
|
||||
setf.exp f11 = r2
|
||||
frcpa f10, p6 = f8, f9
|
||||
;;
|
||||
fmpy f11 = f11, f12
|
||||
(p6) fmpy.s1 f8 = f8, f10
|
||||
(p6) fnma.s1 f9 = f9, f10, f1
|
||||
;;
|
||||
fnma f12 = f9, f11, f10
|
||||
(p6) fma.s1 f8 = f9, f8, f8
|
||||
(p6) fma.s1 f9 = f9, f9, f11
|
||||
;;
|
||||
fmpy f11 = f11, f12
|
||||
(p6) fma.s1 f10 = f9, f8, f8
|
||||
;;
|
||||
fmpy f8 = f8, f11
|
||||
fcvt.fxu.trunc.s1 f10 = f10
|
||||
;;
|
||||
fcvt.fxu.trunc f8 = f8
|
||||
;;
|
||||
getf.sig ret0 = f8
|
||||
getf.sig ret0 = f10
|
||||
br.ret.sptk rp
|
||||
;;
|
||||
.endp __udivsi3
|
||||
|
@ -536,24 +454,10 @@ __udivsi3:
|
|||
#ifdef L__umodsi3
|
||||
// Compute a 32-bit unsigned integer modulus.
|
||||
//
|
||||
// Use reciprocal approximation and Newton-Raphson iteration to compute the
|
||||
// quotient. frcpa gives 8.6 significant bits, so we need 2 iterations
|
||||
// to get more than the 32 bits of precision that we need for SImode.
|
||||
// From the Intel IA-64 Optimization Guide, choose the minimum latency
|
||||
// alternative.
|
||||
//
|
||||
// ??? This is currently not used. It needs to be fixed to be more like the
|
||||
// above DImode routines.
|
||||
//
|
||||
// ??? Check to see if the error is less than .5ulp. We may need
|
||||
// some adjustment code to get precise enough results.
|
||||
//
|
||||
// ??? Should probably use max precision for the reciprocal computations.
|
||||
//
|
||||
// r32/f8 holds the dividend. r33/f9 holds the divisor.
|
||||
// f10 holds the value 2.0. f11 holds the reciprocal approximation.
|
||||
// f12 is a temporary.
|
||||
//
|
||||
// This is the same as modsi3, except that we don't need fcvt instructions
|
||||
// before the frcpa.
|
||||
// in0 holds the dividend. in1 holds the divisor.
|
||||
|
||||
.text
|
||||
.align 16
|
||||
|
@ -561,31 +465,33 @@ __udivsi3:
|
|||
.proc __umodsi3
|
||||
__umodsi3:
|
||||
.regstk 2,0,0,0
|
||||
setf.sig f8 = r32
|
||||
setf.sig f9 = r33
|
||||
mov r2 = 0x0ffdd
|
||||
zxt4 in0 = in0
|
||||
zxt4 in1 = in1
|
||||
;;
|
||||
frcpa f11, p6 = f8, f9
|
||||
fadd f10 = f1, f1
|
||||
setf.sig f13 = in0
|
||||
setf.sig f9 = in1
|
||||
;;
|
||||
fnma f12 = f9, f11, f10
|
||||
sub in1 = r0, in1
|
||||
fcvt.xf f8 = f13
|
||||
fcvt.xf f9 = f9
|
||||
;;
|
||||
fmpy f11 = f11, f12
|
||||
setf.exp f11 = r2
|
||||
frcpa f10, p6 = f8, f9
|
||||
;;
|
||||
fnma f12 = f9, f11, f10
|
||||
(p6) fmpy.s1 f12 = f8, f10
|
||||
(p6) fnma.s1 f10 = f9, f10, f1
|
||||
;;
|
||||
fmpy f11 = f11, f12
|
||||
(p6) fma.s1 f12 = f10, f12, f12
|
||||
(p6) fma.s1 f10 = f10, f10, f11
|
||||
;;
|
||||
fmpy f10 = f8, f11
|
||||
(p6) fma.s1 f10 = f10, f12, f12
|
||||
;;
|
||||
fcvt.fxu.trunc f10 = f10
|
||||
fcvt.fxu.trunc.s1 f10 = f10
|
||||
;;
|
||||
fcvt.xuf f10 = f10
|
||||
xma.l f10 = f10, f9, f13
|
||||
;;
|
||||
fnma f8 = f10, f9, f8
|
||||
;;
|
||||
fcvt.fxu f8 = f8
|
||||
;;
|
||||
getf.sig r32 = f8
|
||||
getf.sig ret0 = f10
|
||||
br.ret.sptk rp
|
||||
;;
|
||||
.endp __umodsi3
|
||||
|
|
Loading…
Add table
Reference in a new issue