From 6883a6662fc7728a539230ea1b02efd47815d705 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Thu, 21 Oct 2004 04:22:01 +0000 Subject: [PATCH] ieee754-sf.S: Large speed improvements. * config/arm/ieee754-sf.S: Large speed improvements. Fix NAN handling. * config/arm/ieee754-df.S: Ditto. From-SVN: r89364 --- gcc/ChangeLog | 5 + gcc/config/arm/ieee754-df.S | 971 +++++++++++++++++------------------- gcc/config/arm/ieee754-sf.S | 870 +++++++++++++++----------------- 3 files changed, 857 insertions(+), 989 deletions(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 7b3b80ce182..e5265e70a0d 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,8 @@ +2004-10-21 Nicolas Pitre + + * config/arm/ieee754-sf.S: Large speed improvements. Fix NAN handling. + * config/arm/ieee754-df.S: Ditto. + 2004-10-20 Zack Weinberg * dbxout.c (asmfile): Delete. All uses changed to asm_out_file. diff --git a/gcc/config/arm/ieee754-df.S b/gcc/config/arm/ieee754-df.S index af32b9e2c08..b9cf52e6458 100644 --- a/gcc/config/arm/ieee754-df.S +++ b/gcc/config/arm/ieee754-df.S @@ -60,6 +60,7 @@ ARM_FUNC_START negdf2 ARM_FUNC_ALIAS aeabi_dneg negdf2 + @ flip sign bit eor xh, xh, #0x80000000 RET @@ -76,10 +77,10 @@ ARM_FUNC_START aeabi_drsub eor xh, xh, #0x80000000 @ flip sign bit of first arg b 1f - ARM_FUNC_START subdf3 +ARM_FUNC_START subdf3 ARM_FUNC_ALIAS aeabi_dsub subdf3 - @ flip sign bit of second arg - eor yh, yh, #0x80000000 + + eor yh, yh, #0x80000000 @ flip sign bit of second arg #if defined(__thumb__) && !defined(__THUMB_INTERWORK__) b 1f @ Skip Thumb-code prologue #endif @@ -87,36 +88,23 @@ ARM_FUNC_ALIAS aeabi_dsub subdf3 ARM_FUNC_START adddf3 ARM_FUNC_ALIAS aeabi_dadd adddf3 -1: @ Compare both args, return zero if equal but the sign. - teq xl, yl - eoreq ip, xh, yh - teqeq ip, #0x80000000 - beq LSYM(Lad_z) +1: stmfd sp!, {r4, r5, lr} - @ If first arg is 0 or -0, return second arg. - @ If second arg is 0 or -0, return first arg. - orrs ip, xl, xh, lsl #1 - moveq xl, yl - moveq xh, yh - orrnes ip, yl, yh, lsl #1 - RETc(eq) - - stmfd sp!, {r4, r5, lr} - - @ Mask out exponents. - mov ip, #0x7f000000 - orr ip, ip, #0x00f00000 - and r4, xh, ip - and r5, yh, ip - - @ If either of them is 0x7ff, result will be INF or NAN - teq r4, ip - teqne r5, ip - beq LSYM(Lad_i) + @ Look for zeroes, equal values, INF, or NAN. + mov r4, xh, lsl #1 + mov r5, yh, lsl #1 + teq r4, r5 + teqeq xl, yl + orrnes ip, r4, xl + orrnes ip, r5, yl + mvnnes ip, r4, asr #21 + mvnnes ip, r5, asr #21 + beq LSYM(Lad_s) @ Compute exponent difference. Make largest exponent in r4, @ corresponding arg in xh-xl, and positive exponent difference in r5. - subs r5, r5, r4 + mov r4, r4, lsr #21 + rsbs r5, r4, r5, lsr #21 rsblt r5, r5, #0 ble 1f add r4, r4, r5 @@ -127,24 +115,24 @@ ARM_FUNC_ALIAS aeabi_dadd adddf3 eor yl, xl, yl eor yh, xh, yh 1: - @ If exponent difference is too large, return largest argument @ already in xh-xl. We need up to 54 bit to handle proper rounding @ of 0x1p54 - 1.1. - cmp r5, #(54 << 20) + cmp r5, #54 RETLDM "r4, r5" hi @ Convert mantissa to signed integer. 
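The addition path above works directly on the raw word pairs: it rules out the special encodings by shifting away the sign, then forms the exponent difference and returns the larger operand outright once the difference exceeds 54 bits. A minimal C model of that unpack-and-align step follows; all names are illustrative and not part of the library.

#include <stdint.h>

/* Hypothetical helper type; the real code keeps everything in registers. */
struct dbl_parts { int sign; int exp; uint64_t mant; };

static struct dbl_parts unpack(uint64_t bits)
{
    struct dbl_parts p;
    p.sign = (int)(bits >> 63);
    p.exp  = (int)((bits >> 52) & 0x7ff);       /* biased exponent */
    p.mant = bits & 0x000fffffffffffffULL;       /* 52 fraction bits */
    if (p.exp != 0)
        p.mant |= 1ULL << 52;                    /* implicit leading 1 */
    return p;
}

/* Swap so 'a' has the larger exponent, then align 'b' onto it.
   Returns 0 when the difference is beyond 54 bits, in which case the
   result is simply 'a', matching the early return above.  */
static int align_for_add(struct dbl_parts *a, struct dbl_parts *b)
{
    if (a->exp < b->exp) { struct dbl_parts t = *a; *a = *b; *b = t; }
    int diff = a->exp - b->exp;
    if (diff > 54)
        return 0;
    b->mant >>= diff;   /* the assembly also keeps the shifted-out bits for rounding */
    b->exp = a->exp;
    return 1;
}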
tst xh, #0x80000000 - bic xh, xh, ip, lsl #1 - orr xh, xh, #0x00100000 + mov xh, xh, lsl #12 + mov ip, #0x00100000 + orr xh, ip, xh, lsr #12 beq 1f rsbs xl, xl, #0 rsc xh, xh, #0 1: tst yh, #0x80000000 - bic yh, yh, ip, lsl #1 - orr yh, yh, #0x00100000 + mov yh, yh, lsl #12 + orr yh, ip, yh, lsr #12 beq 1f rsbs yl, yl, #0 rsc yh, yh, #0 @@ -154,42 +142,30 @@ ARM_FUNC_ALIAS aeabi_dadd adddf3 teq r4, r5 beq LSYM(Lad_d) LSYM(Lad_x): - @ Scale down second arg with exponent difference. - @ Apply shift one bit left to first arg and the rest to second arg - @ to simplify things later, but only if exponent does not become 0. - mov ip, #0 - movs r5, r5, lsr #20 - beq 3f - teq r4, #(1 << 20) - beq 1f - movs xl, xl, lsl #1 - adc xh, ip, xh, lsl #1 - sub r4, r4, #(1 << 20) - subs r5, r5, #1 - beq 3f - @ Shift yh-yl right per r5, keep leftover bits into ip. -1: rsbs lr, r5, #32 - blt 2f + @ Compensate for the exponent overlapping the mantissa MSB added later + sub r4, r4, #1 + + @ Shift yh-yl right per r5, add to xh-xl, keep leftover bits into ip. + rsbs lr, r5, #32 + blt 1f mov ip, yl, lsl lr - mov yl, yl, lsr r5 - orr yl, yl, yh, lsl lr - mov yh, yh, asr r5 - b 3f -2: sub r5, r5, #32 + adds xl, xl, yl, lsr r5 + adc xh, xh, #0 + adds xl, xl, yh, lsl lr + adcs xh, xh, yh, asr r5 + b 2f +1: sub r5, r5, #32 add lr, lr, #32 cmp yl, #1 - adc ip, ip, yh, lsl lr - mov yl, yh, asr r5 - mov yh, yh, asr #32 -3: - @ the actual addition - adds xl, xl, yl - adc xh, xh, yh - + mov ip, yh, lsl lr + orrcs ip, ip, #2 @ 2 not 1, to allow lsr #1 later + adds xl, xl, yh, asr r5 + adcs xh, xh, yh, asr #31 +2: @ We now have a result in xh-xl-ip. - @ Keep absolute value in xh-xl-ip, sign in r5. - ands r5, xh, #0x80000000 + @ Keep absolute value in xh-xl-ip, sign in r5 (the n bit was set above) + and r5, xh, #0x80000000 bpl LSYM(Lad_p) rsbs ip, ip, #0 rscs xl, xl, #0 @@ -198,75 +174,66 @@ LSYM(Lad_x): @ Determine how to normalize the result. LSYM(Lad_p): cmp xh, #0x00100000 - bcc LSYM(Lad_l) + bcc LSYM(Lad_a) cmp xh, #0x00200000 - bcc LSYM(Lad_r0) - cmp xh, #0x00400000 - bcc LSYM(Lad_r1) + bcc LSYM(Lad_e) @ Result needs to be shifted right. movs xh, xh, lsr #1 movs xl, xl, rrx - movs ip, ip, rrx - orrcs ip, ip, #1 - add r4, r4, #(1 << 20) -LSYM(Lad_r1): - movs xh, xh, lsr #1 - movs xl, xl, rrx - movs ip, ip, rrx - orrcs ip, ip, #1 - add r4, r4, #(1 << 20) + mov ip, ip, rrx + add r4, r4, #1 + + @ Make sure we did not bust our exponent. + mov r2, r4, lsl #21 + cmn r2, #(2 << 21) + bcs LSYM(Lad_o) @ Our result is now properly aligned into xh-xl, remaining bits in ip. @ Round with MSB of ip. If halfway between two numbers, round towards @ LSB of xl = 0. -LSYM(Lad_r0): - adds xl, xl, ip, lsr #31 - adc xh, xh, #0 - teq ip, #0x80000000 - biceq xl, xl, #1 - - @ One extreme rounding case may add a new MSB. Adjust exponent. - @ That MSB will be cleared when exponent is merged below. - tst xh, #0x00200000 - addne r4, r4, #(1 << 20) - - @ Make sure we did not bust our exponent. - adds ip, r4, #(1 << 20) - bmi LSYM(Lad_o) - @ Pack final result together. LSYM(Lad_e): - bic xh, xh, #0x00300000 - orr xh, xh, r4 + cmp ip, #0x80000000 + moveqs ip, xl, lsr #1 + adcs xl, xl, #0 + adc xh, xh, r4, lsl #20 orr xh, xh, r5 RETLDM "r4, r5" -LSYM(Lad_l): @ Result must be shifted left and exponent adjusted. - @ No rounding necessary since ip will always be 0. +LSYM(Lad_a): + movs ip, ip, lsl #1 + adcs xl, xl, xl + adc xh, xh, xh + tst xh, #0x00100000 + sub r4, r4, #1 + bne LSYM(Lad_e) + + @ No rounding necessary since ip will always be 0 at this point. 
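The cmp/moveqs/adcs sequence in LSYM(Lad_e) above implements round to nearest with ties to even: the guard bit is folded in through the carry flag, and on an exact tie the carry is replaced by the result's own least significant bit. A small C sketch of the same policy, with illustrative names only:

#include <stdint.h>

/* 'mant' is the 53-bit result, 'rest' holds the discarded bits with the
   guard bit in its MSB, playing the role of register ip above.  */
static uint64_t round_nearest_even(uint64_t mant, uint32_t rest)
{
    uint64_t r = mant + (rest >> 31);    /* add the guard bit */
    if (rest == 0x80000000u)             /* exactly halfway between two values */
        r &= ~1ULL;                      /* force the LSB to 0: ties to even */
    return r;
}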
+LSYM(Lad_l): + #if __ARM_ARCH__ < 5 teq xh, #0 - movne r3, #-11 - moveq r3, #21 + movne r3, #20 + moveq r3, #52 moveq xh, xl moveq xl, #0 mov r2, xh - movs ip, xh, lsr #16 - moveq r2, r2, lsl #16 - addeq r3, r3, #16 - tst r2, #0xff000000 - moveq r2, r2, lsl #8 - addeq r3, r3, #8 - tst r2, #0xf0000000 - moveq r2, r2, lsl #4 - addeq r3, r3, #4 - tst r2, #0xc0000000 - moveq r2, r2, lsl #2 - addeq r3, r3, #2 - tst r2, #0x80000000 - addeq r3, r3, #1 + cmp r2, #(1 << 16) + movhs r2, r2, lsr #16 + subhs r3, r3, #16 + cmp r2, #(1 << 8) + movhs r2, r2, lsr #8 + subhs r3, r3, #8 + cmp r2, #(1 << 4) + movhs r2, r2, lsr #4 + subhs r3, r3, #4 + cmp r2, #(1 << 2) + subhs r3, r3, #2 + sublo r3, r3, r2, lsr #1 + sub r3, r3, r2, lsr #3 #else @@ -302,13 +269,15 @@ LSYM(Lad_l): movle xl, xl, lsl r2 @ adjust exponent accordingly. -3: subs r4, r4, r3, lsl #20 - bgt LSYM(Lad_e) +3: subs r4, r4, r3 + addge xh, xh, r4, lsl #20 + orrge xh, xh, r5 + RETLDM "r4, r5" ge @ Exponent too small, denormalize result. @ Find out proper shift value. - mvn r4, r4, asr #20 - subs r4, r4, #30 + mvn r4, r4 + subs r4, r4, #31 bge 2f adds r4, r4, #12 bgt 1f @@ -337,23 +306,49 @@ LSYM(Lad_l): RETLDM "r4, r5" @ Adjust exponents for denormalized arguments. + @ Note that r4 must not remain equal to 0. LSYM(Lad_d): teq r4, #0 - eoreq xh, xh, #0x00100000 - addeq r4, r4, #(1 << 20) eor yh, yh, #0x00100000 - subne r5, r5, #(1 << 20) + eoreq xh, xh, #0x00100000 + addeq r4, r4, #1 + subne r5, r5, #1 b LSYM(Lad_x) - @ Result is x - x = 0, unless x = INF or NAN. -LSYM(Lad_z): - sub ip, ip, #0x00100000 @ ip becomes 0x7ff00000 - and r2, xh, ip - teq r2, ip - orreq xh, ip, #0x00080000 + +LSYM(Lad_s): + mvns ip, r4, asr #21 + mvnnes ip, r5, asr #21 + beq LSYM(Lad_i) + + teq r4, r5 + teqeq xl, yl + beq 1f + + @ Result is x + 0.0 = x or 0.0 + y = y. + teq r4, #0 + moveq xh, yh + moveq xl, yl + RETLDM "r4, r5" + +1: teq xh, yh + + @ Result is x - x = 0. movne xh, #0 - mov xl, #0 - RET + movne xl, #0 + RETLDM "r4, r5" ne + + @ Result is x + x = 2x. + movs ip, r4, lsr #21 + bne 2f + movs xl, xl, lsl #1 + adcs xh, xh, xh + orrcs xh, xh, #0x80000000 + RETLDM "r4, r5" +2: adds r4, r4, #(2 << 21) + addcc xh, xh, #(1 << 20) + RETLDM "r4, r5" cc + and r5, xh, #0x80000000 @ Overflow: return INF. 
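On pre-ARMv5 cores the normalisation above has no clz instruction to lean on, so it locates the leading 1 with a short chain of range comparisons and derives the left shift from its position. An illustrative C version of that search; the bit-20 target comes from the double layout, whose high word carries 20 fraction bits, and the names are not from the source.

#include <stdint.h>

static int msb_index(uint32_t x)   /* x != 0; returns 0..31 */
{
    int n = 0;
    if (x >= 1u << 16) { x >>= 16; n += 16; }
    if (x >= 1u << 8)  { x >>= 8;  n += 8;  }
    if (x >= 1u << 4)  { x >>= 4;  n += 4;  }
    if (x >= 1u << 2)  { x >>= 2;  n += 2;  }
    return n + (x >> 1);           /* x is now 1, 2 or 3 */
}

/* Left shift that brings the leading 1 onto the implicit-bit position
   (bit 20 of the high word, i.e. bit 52 of the whole mantissa).  In this
   path the high word is already below the implicit bit, so the shift is
   never negative.  */
static int normalize_shift(uint32_t hi, uint32_t lo)
{
    return hi ? 20 - msb_index(hi) : 52 - msb_index(lo);
}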
LSYM(Lad_o): @@ -367,19 +362,18 @@ LSYM(Lad_o): @ if yh-yl != INF/NAN: return xh-xl (which is INF/NAN) @ if either is NAN: return NAN @ if opposite sign: return NAN - @ return xh-xl (which is INF or -INF) + @ otherwise return xh-xl (which is INF or -INF) LSYM(Lad_i): - teq r4, ip + mvns ip, r4, asr #21 movne xh, yh movne xl, yl - teqeq r5, ip - RETLDM "r4, r5" ne - + mvneqs ip, r5, asr #21 + movne yh, xh + movne yl, xl orrs r4, xl, xh, lsl #12 - orreqs r4, yl, yh, lsl #12 + orreqs r5, yl, yh, lsl #12 teqeq xh, yh - orrne xh, r5, #0x00080000 - movne xl, #0 + orrne xh, xh, #0x00080000 @ quiet NAN RETLDM "r4, r5" FUNC_END aeabi_dsub @@ -389,14 +383,17 @@ LSYM(Lad_i): ARM_FUNC_START floatunsidf ARM_FUNC_ALIAS aeabi_ui2d floatunsidf + teq r0, #0 moveq r1, #0 RETc(eq) stmfd sp!, {r4, r5, lr} - mov r4, #(0x400 << 20) @ initial exponent - add r4, r4, #((52-1) << 20) + mov r4, #0x400 @ initial exponent + add r4, r4, #(52-1 - 1) mov r5, #0 @ sign bit is 0 + .ifnc xl, r0 mov xl, r0 + .endif mov xh, #0 b LSYM(Lad_l) @@ -405,15 +402,18 @@ ARM_FUNC_ALIAS aeabi_ui2d floatunsidf ARM_FUNC_START floatsidf ARM_FUNC_ALIAS aeabi_i2d floatsidf + teq r0, #0 moveq r1, #0 RETc(eq) stmfd sp!, {r4, r5, lr} - mov r4, #(0x400 << 20) @ initial exponent - add r4, r4, #((52-1) << 20) + mov r4, #0x400 @ initial exponent + add r4, r4, #(52-1 - 1) ands r5, r0, #0x80000000 @ sign bit in r5 rsbmi r0, r0, #0 @ absolute value + .ifnc xl, r0 mov xl, r0 + .endif mov xh, #0 b LSYM(Lad_l) @@ -422,26 +422,23 @@ ARM_FUNC_ALIAS aeabi_i2d floatsidf ARM_FUNC_START extendsfdf2 ARM_FUNC_ALIAS aeabi_f2d extendsfdf2 - - movs r2, r0, lsl #1 - beq 1f @ value is 0.0 or -0.0 + + movs r2, r0, lsl #1 @ toss sign bit mov xh, r2, asr #3 @ stretch exponent mov xh, xh, rrx @ retrieve sign bit mov xl, r2, lsl #28 @ retrieve remaining bits - ands r2, r2, #0xff000000 @ isolate exponent - beq 2f @ exponent was 0 but not mantissa - teq r2, #0xff000000 @ check if INF or NAN + andnes r3, r2, #0xff000000 @ isolate exponent + teqne r3, #0xff000000 @ if not 0, check if INF or NAN eorne xh, xh, #0x38000000 @ fixup exponent otherwise. - RET + RETc(ne) @ and return it. -1: mov xh, r0 - mov xl, #0 - RET + teq r2, #0 @ if actually 0 + teqne r3, #0xff000000 @ or INF or NAN + RETc(eq) @ we are done already. -2: @ value was denormalized. We can normalize it now. + @ value was denormalized. We can normalize it now. stmfd sp!, {r4, r5, lr} - mov r4, #(0x380 << 20) @ setup corresponding exponent - add r4, r4, #(1 << 20) + mov r4, #0x380 @ setup corresponding exponent and r5, xh, #0x80000000 @ move sign bit in r5 bic xh, xh, #0x80000000 b LSYM(Lad_l) @@ -451,76 +448,90 @@ ARM_FUNC_ALIAS aeabi_f2d extendsfdf2 ARM_FUNC_START floatundidf ARM_FUNC_ALIAS aeabi_ul2d floatundidf - + orrs r2, r0, r1 #if !defined (__VFP_FP__) && !defined(__SOFTFP__) mvfeqd f0, #0.0 #endif RETc(eq) + #if !defined (__VFP_FP__) && !defined(__SOFTFP__) @ For hard FPA code we want to return via the tail below so that @ we can return the result in f0 as well as in r0/r1 for backwards @ compatibility. - adr ip, 1f + adr ip, LSYM(f0_ret) stmfd sp!, {r4, r5, ip, lr} #else stmfd sp!, {r4, r5, lr} #endif + mov r5, #0 b 2f ARM_FUNC_START floatdidf ARM_FUNC_ALIAS aeabi_l2d floatdidf + orrs r2, r0, r1 #if !defined (__VFP_FP__) && !defined(__SOFTFP__) mvfeqd f0, #0.0 #endif RETc(eq) + #if !defined (__VFP_FP__) && !defined(__SOFTFP__) @ For hard FPA code we want to return via the tail below so that @ we can return the result in f0 as well as in r0/r1 for backwards @ compatibility. 
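The extendsfdf2 code above widens a float by stretching the exponent field and fixing the bias with an eor of 0x38000000; zeros, infinities, NaNs and denormals take the separate paths shown. The same normal-number case written out plainly in C, as a sketch rather than the shipped algorithm:

#include <stdint.h>

static uint64_t f2d_bits(uint32_t f)
{
    uint32_t sign = f >> 31;
    uint32_t exp  = (f >> 23) & 0xff;
    uint32_t frac = f & 0x007fffff;

    /* Only the normal case is modelled; exp == 0 (zero/denormal) and
       exp == 255 (INF/NaN) are handled by the dedicated paths above.  */
    uint64_t dexp = exp + (1023 - 127);   /* rebias: the 0x380 adjustment */
    return ((uint64_t)sign << 63) | (dexp << 52) | ((uint64_t)frac << 29);
}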
- adr ip, 1f + adr ip, LSYM(f0_ret) stmfd sp!, {r4, r5, ip, lr} #else stmfd sp!, {r4, r5, lr} #endif + ands r5, ah, #0x80000000 @ sign bit in r5 bpl 2f rsbs al, al, #0 rsc ah, ah, #0 2: - mov r4, #(0x400 << 20) @ initial exponent - add r4, r4, #((52 - 1) << 20) -#if !defined (__VFP_FP__) && !defined(__ARMEB__) + mov r4, #0x400 @ initial exponent + add r4, r4, #(52-1 - 1) + @ FPA little-endian: must swap the word order. + .ifnc xh, ah mov ip, al mov xh, ah mov xl, ip -#endif - movs ip, xh, lsr #23 + .endif + + movs ip, xh, lsr #22 beq LSYM(Lad_p) - @ The value's too big. Scale it down a bit... + + @ The value is too big. Scale it down a bit... mov r2, #3 movs ip, ip, lsr #3 addne r2, r2, #3 movs ip, ip, lsr #3 addne r2, r2, #3 + add r2, r2, ip + rsb r3, r2, #32 mov ip, xl, lsl r3 mov xl, xl, lsr r2 orr xl, xl, xh, lsl r3 mov xh, xh, lsr r2 - add r4, r4, r2, lsl #20 + add r4, r4, r2 b LSYM(Lad_p) + #if !defined (__VFP_FP__) && !defined(__SOFTFP__) -1: + @ Legacy code expects the result to be returned in f0. Copy it @ there as well. +LSYM(f0_ret): stmfd sp!, {r0, r1} ldfd f0, [sp], #8 RETLDM + #endif + FUNC_END floatdidf FUNC_END aeabi_l2d FUNC_END floatundidf @@ -534,46 +545,38 @@ ARM_FUNC_START muldf3 ARM_FUNC_ALIAS aeabi_dmul muldf3 stmfd sp!, {r4, r5, r6, lr} - @ Mask out exponents. - mov ip, #0x7f000000 - orr ip, ip, #0x00f00000 - and r4, xh, ip - and r5, yh, ip - - @ Trap any INF/NAN. - teq r4, ip + @ Mask out exponents, trap any zero/denormal/INF/NAN. + mov ip, #0xff + orr ip, ip, #0x700 + ands r4, ip, xh, lsr #20 + andnes r5, ip, yh, lsr #20 + teqne r4, ip teqne r5, ip - beq LSYM(Lml_s) + bleq LSYM(Lml_s) - @ Trap any multiplication by 0. - orrs r6, xl, xh, lsl #1 - orrnes r6, yl, yh, lsl #1 - beq LSYM(Lml_z) + @ Add exponents together + add r4, r4, r5 - @ Shift exponents right one bit to make room for overflow bit. - @ If either of them is 0, scale denormalized arguments off line. - @ Then add both exponents together. - movs r4, r4, lsr #1 - teqne r5, #0 - beq LSYM(Lml_d) -LSYM(Lml_x): - add r4, r4, r5, asr #1 - - @ Preserve final sign in r4 along with exponent for now. - teq xh, yh - orrmi r4, r4, #0x8000 + @ Determine final sign. + eor r6, xh, yh @ Convert mantissa to unsigned integer. - bic xh, xh, ip, lsl #1 - bic yh, yh, ip, lsl #1 + @ If power of two, branch to a separate path. + bic xh, xh, ip, lsl #21 + bic yh, yh, ip, lsl #21 + orrs r5, xl, xh, lsl #12 + orrnes r5, yl, yh, lsl #12 orr xh, xh, #0x00100000 orr yh, yh, #0x00100000 + beq LSYM(Lml_1) #if __ARM_ARCH__ < 4 + @ Put sign bit in r6, which will be restored in yl later. + and r6, r6, #0x80000000 + @ Well, no way to make it shorter without the umull instruction. - @ We must perform that 53 x 53 bit multiplication by hand. - stmfd sp!, {r7, r8, r9, sl, fp} + stmfd sp!, {r6, r7, r8, r9, sl, fp} mov r7, xl, lsr #16 mov r8, yl, lsr #16 mov r9, xh, lsr #16 @@ -625,92 +628,83 @@ LSYM(Lml_x): mul fp, xh, yh adcs r5, r5, fp adc r6, r6, #0 - ldmfd sp!, {r7, r8, r9, sl, fp} + ldmfd sp!, {yl, r7, r8, r9, sl, fp} #else - @ Here is the actual multiplication: 53 bits * 53 bits -> 106 bits. + @ Here is the actual multiplication. umull ip, lr, xl, yl mov r5, #0 - umlal lr, r5, xl, yh umlal lr, r5, xh, yl + and yl, r6, #0x80000000 + umlal lr, r5, xl, yh mov r6, #0 umlal r5, r6, xh, yh #endif @ The LSBs in ip are only significant for the final rounding. - @ Fold them into one bit of lr. + @ Fold them into lr. teq ip, #0 orrne lr, lr, #1 - @ Put final sign in xh. 
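The multiplication above needs the full 53 x 53 bit product: on ARMv4 and later it comes from the umull/umlal sequence, while the older path builds it from 16-bit partial products by hand. The same idea one level up, in C: a 64 x 64 to 128 bit multiply assembled from 32-bit halves (illustrative, with hypothetical names).

#include <stdint.h>

struct u128 { uint64_t hi, lo; };

static struct u128 mul64x64(uint64_t a, uint64_t b)
{
    uint64_t a0 = (uint32_t)a, a1 = a >> 32;
    uint64_t b0 = (uint32_t)b, b1 = b >> 32;

    uint64_t p00 = a0 * b0;
    uint64_t p01 = a0 * b1;
    uint64_t p10 = a1 * b0;
    uint64_t p11 = a1 * b1;

    uint64_t mid  = p01 + (p00 >> 32);        /* cannot overflow 64 bits */
    uint64_t mid2 = (uint32_t)mid + p10;      /* low halves of the middle terms */

    struct u128 r;
    r.lo = (uint64_t)(uint32_t)p00 | (mid2 << 32);
    r.hi = p11 + (mid >> 32) + (mid2 >> 32);
    return r;
}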
- mov xh, r4, lsl #16 - bic r4, r4, #0x8000 - - @ Adjust result if one extra MSB appeared (one of four times). - tst r6, #(1 << 9) - beq 1f - add r4, r4, #(1 << 19) - movs r6, r6, lsr #1 - movs r5, r5, rrx - movs lr, lr, rrx - orrcs lr, lr, #1 + @ Adjust result upon the MSB position. + sub r4, r4, #0xff + cmp r6, #(1 << (20-11)) + sbc r4, r4, #0x300 + bcs 1f + movs lr, lr, lsl #1 + adcs r5, r5, r5 + adc r6, r6, r6 1: - @ Scale back to 53 bits. - @ xh contains sign bit already. - orr xh, xh, r6, lsl #12 - orr xh, xh, r5, lsr #20 - mov xl, r5, lsl #12 - orr xl, xl, lr, lsr #20 + @ Shift to final position, add sign to result. + orr xh, yl, r6, lsl #11 + orr xh, xh, r5, lsr #21 + mov xl, r5, lsl #11 + orr xl, xl, lr, lsr #21 + mov lr, lr, lsl #11 - @ Apply exponent bias, check range for underflow. - sub r4, r4, #0x00f80000 - subs r4, r4, #0x1f000000 - ble LSYM(Lml_u) + @ Check exponent range for under/overflow. + subs ip, r4, #(254 - 1) + cmphi ip, #0x700 + bhi LSYM(Lml_u) - @ Round the result. - movs lr, lr, lsl #12 - bpl 1f - adds xl, xl, #1 - adc xh, xh, #0 - teq lr, #0x80000000 - biceq xl, xl, #1 - - @ Rounding may have produced an extra MSB here. - @ The extra bit is cleared before merging the exponent below. - tst xh, #0x00200000 - addne r4, r4, #(1 << 19) -1: - @ Check exponent for overflow. - adds ip, r4, #(1 << 19) - tst ip, #(1 << 30) - bne LSYM(Lml_o) - - @ Add final exponent. - bic xh, xh, #0x00300000 - orr xh, xh, r4, lsl #1 + @ Round the result, merge final exponent. + cmp lr, #0x80000000 + moveqs lr, xl, lsr #1 + adcs xl, xl, #0 + adc xh, xh, r4, lsl #20 RETLDM "r4, r5, r6" - @ Result is 0, but determine sign anyway. -LSYM(Lml_z): + @ Multiplication by 0x1p*: let''s shortcut a lot of code. +LSYM(Lml_1): + and r6, r6, #0x80000000 + orr xh, r6, xh + orr xl, xl, yl eor xh, xh, yh -LSYM(Ldv_z): - bic xh, xh, #0x7fffffff - mov xl, #0 - RETLDM "r4, r5, r6" + subs r4, r4, ip, lsr #1 + rsbgts r5, r4, ip + orrgt xh, xh, r4, lsl #20 + RETLDM "r4, r5, r6" gt + + @ Under/overflow: fix things up for the code below. + orr xh, xh, #0x00100000 + mov lr, #0 + subs r4, r4, #1 + +LSYM(Lml_u): + @ Overflow? + bgt LSYM(Lml_o) @ Check if denormalized result is possible, otherwise return signed 0. -LSYM(Lml_u): - cmn r4, #(53 << 19) + cmn r4, #(53 + 1) movle xl, #0 bicle xh, xh, #0x7fffffff RETLDM "r4, r5, r6" le @ Find out proper shift value. -LSYM(Lml_r): - mvn r4, r4, asr #19 - subs r4, r4, #30 + rsb r4, r4, #0 + subs r4, r4, #32 bge 2f adds r4, r4, #12 bgt 1f @@ -721,14 +715,12 @@ LSYM(Lml_r): mov r3, xl, lsl r5 mov xl, xl, lsr r4 orr xl, xl, xh, lsl r5 - movs xh, xh, lsl #1 - mov xh, xh, lsr r4 - mov xh, xh, rrx + and r2, xh, #0x80000000 + bic xh, xh, #0x80000000 adds xl, xl, r3, lsr #31 - adc xh, xh, #0 - teq lr, #0 - teqeq r3, #0x80000000 - biceq xl, xl, #1 + adc xh, r2, xh, lsr r4 + orrs lr, lr, r3, lsl #1 + biceq xl, xl, r3, lsr #31 RETLDM "r4, r5, r6" @ shift result right of 21 to 31 bits, or left 11 to 1 bits after @@ -741,54 +733,71 @@ LSYM(Lml_r): bic xh, xh, #0x7fffffff adds xl, xl, r3, lsr #31 adc xh, xh, #0 - teq lr, #0 - teqeq r3, #0x80000000 - biceq xl, xl, #1 + orrs lr, lr, r3, lsl #1 + biceq xl, xl, r3, lsr #31 RETLDM "r4, r5, r6" @ Shift value right of 32 to 64 bits, or 0 to 32 bits after a switch @ from xh to xl. Leftover bits are in r3-r6-lr for rounding. 
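Two details above are worth spelling out. The product of two mantissas in [1,2) lies in [1,4), so at most one right shift renormalises it, which lets a single cmp/sbc pair fix the exponent bias and account for that shift at the same time. And because the code keeps the biased exponent minus one (the implicit bit carries into the exponent field when merged), one unsigned comparison catches underflow and overflow together; the subs/cmphi pair exists only because 2045 is not a valid ARM immediate. In C terms, with an illustrative name:

/* 'exp_m1' is the biased exponent minus one, as kept in r4 above.
   Valid results need it in [0, 2045]; anything else under- or overflows.  */
static int exponent_in_range(int exp_m1)
{
    return (unsigned)exp_m1 <= 2045u;
}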
2: rsb r5, r4, #32 - mov r6, xl, lsl r5 + orr lr, lr, xl, lsl r5 mov r3, xl, lsr r4 orr r3, r3, xh, lsl r5 mov xl, xh, lsr r4 bic xh, xh, #0x7fffffff bic xl, xl, xh, lsr r4 add xl, xl, r3, lsr #31 - orrs r6, r6, lr - teqeq r3, #0x80000000 - biceq xl, xl, #1 + orrs lr, lr, r3, lsl #1 + biceq xl, xl, r3, lsr #31 RETLDM "r4, r5, r6" @ One or both arguments are denormalized. @ Scale them leftwards and preserve sign bit. LSYM(Lml_d): - mov lr, #0 teq r4, #0 bne 2f and r6, xh, #0x80000000 1: movs xl, xl, lsl #1 - adc xh, lr, xh, lsl #1 + adc xh, xh, xh tst xh, #0x00100000 - subeq r4, r4, #(1 << 19) + subeq r4, r4, #1 beq 1b orr xh, xh, r6 teq r5, #0 - bne LSYM(Lml_x) + movne pc, lr 2: and r6, yh, #0x80000000 3: movs yl, yl, lsl #1 - adc yh, lr, yh, lsl #1 + adc yh, yh, yh tst yh, #0x00100000 - subeq r5, r5, #(1 << 20) + subeq r5, r5, #1 beq 3b orr yh, yh, r6 - b LSYM(Lml_x) + mov pc, lr - @ One or both args are INF or NAN. LSYM(Lml_s): + @ Isolate the INF and NAN cases away + teq r4, ip + and r5, ip, yh, lsr #20 + teqne r5, ip + beq 1f + + @ Here, one or more arguments are either denormalized or zero. orrs r6, xl, xh, lsl #1 orrnes r6, yl, yh, lsl #1 + bne LSYM(Lml_d) + + @ Result is 0, but determine sign anyway. +LSYM(Lml_z): + eor xh, xh, yh + bic xh, xh, #0x7fffffff + mov xl, #0 + RETLDM "r4, r5, r6" + +1: @ One or both args are INF or NAN. + orrs r6, xl, xh, lsl #1 + moveq xl, yl + moveq xh, yh + orrnes r6, yl, yh, lsl #1 beq LSYM(Lml_n) @ 0 * INF or INF * 0 -> NAN teq r4, ip bne 1f @@ -797,6 +806,8 @@ LSYM(Lml_s): 1: teq r5, ip bne LSYM(Lml_i) orrs r6, yl, yh, lsl #12 + movne xl, yl + movne xh, yh bne LSYM(Lml_n) @ * NAN -> NAN @ Result is INF, but we need to determine its sign. @@ -811,9 +822,9 @@ LSYM(Lml_o): mov xl, #0 RETLDM "r4, r5, r6" - @ Return NAN. + @ Return a quiet NAN. LSYM(Lml_n): - mov xh, #0x7f000000 + orr xh, xh, #0x7f000000 orr xh, xh, #0x00f80000 RETLDM "r4, r5, r6" @@ -825,41 +836,31 @@ ARM_FUNC_ALIAS aeabi_ddiv divdf3 stmfd sp!, {r4, r5, r6, lr} - @ Mask out exponents. - mov ip, #0x7f000000 - orr ip, ip, #0x00f00000 - and r4, xh, ip - and r5, yh, ip - - @ Trap any INF/NAN or zeroes. - teq r4, ip + @ Mask out exponents, trap any zero/denormal/INF/NAN. + mov ip, #0xff + orr ip, ip, #0x700 + ands r4, ip, xh, lsr #20 + andnes r5, ip, yh, lsr #20 + teqne r4, ip teqne r5, ip - orrnes r6, xl, xh, lsl #1 - orrnes r6, yl, yh, lsl #1 - beq LSYM(Ldv_s) + bleq LSYM(Ldv_s) - @ Shift exponents right one bit to make room for overflow bit. - @ If either of them is 0, scale denormalized arguments off line. - @ Then substract divisor exponent from dividend''s. - movs r4, r4, lsr #1 - teqne r5, #0 - beq LSYM(Ldv_d) -LSYM(Ldv_x): - sub r4, r4, r5, asr #1 + @ Substract divisor exponent from dividend''s. + sub r4, r4, r5 @ Preserve final sign into lr. eor lr, xh, yh @ Convert mantissa to unsigned integer. @ Dividend -> r5-r6, divisor -> yh-yl. - mov r5, #0x10000000 + orrs r5, yl, yh, lsl #12 + mov xh, xh, lsl #12 + beq LSYM(Ldv_1) mov yh, yh, lsl #12 + mov r5, #0x10000000 orr yh, r5, yh, lsr #4 orr yh, yh, yl, lsr #24 - movs yl, yl, lsl #8 - mov xh, xh, lsl #12 - teqeq yh, r5 - beq LSYM(Ldv_1) + mov yl, yl, lsl #8 orr r5, r5, xh, lsr #4 orr r5, r5, xl, lsr #24 mov r6, xl, lsl #8 @@ -868,21 +869,15 @@ LSYM(Ldv_x): and xh, lr, #0x80000000 @ Ensure result will land to known bit position. + @ Apply exponent bias accordingly. 
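The Lml_d path above brings a denormal operand back into normal form: shift its mantissa left until the leading 1 reaches the implicit-bit position, decrementing the exponent each time (possibly driving it negative, which the underflow code deals with later). A one-loop C model with illustrative names:

#include <stdint.h>

/* 'mant' holds the 52 fraction bits of a non-zero denormal, 'exp' starts
   at the minimum exponent.  */
static void normalize_denormal(uint64_t *mant, int *exp)
{
    while (!(*mant & (1ULL << 52))) {
        *mant <<= 1;
        (*exp)--;
    }
}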
cmp r5, yh cmpeq r6, yl + adc r4, r4, #(255 - 2) + add r4, r4, #0x300 bcs 1f - sub r4, r4, #(1 << 19) movs yh, yh, lsr #1 mov yl, yl, rrx 1: - @ Apply exponent bias, check range for over/underflow. - add r4, r4, #0x1f000000 - add r4, r4, #0x00f80000 - cmn r4, #(53 << 19) - ble LSYM(Ldv_z) - cmp r4, ip, lsr #1 - bge LSYM(Lml_o) - @ Perform first substraction to align result to a nibble. subs r6, r6, yl sbc r5, r5, yh @@ -944,73 +939,42 @@ LSYM(Ldv_x): orreq xh, xh, xl moveq xl, #0 3: - @ Check if denormalized result is needed. - cmp r4, #0 - ble LSYM(Ldv_u) + @ Check exponent range for under/overflow. + subs ip, r4, #(254 - 1) + cmphi ip, #0x700 + bhi LSYM(Lml_u) - @ Apply proper rounding. + @ Round the result, merge final exponent. subs ip, r5, yh subeqs ip, r6, yl + moveqs ip, xl, lsr #1 adcs xl, xl, #0 - adc xh, xh, #0 - teq ip, #0 - biceq xl, xl, #1 - - @ Add exponent to result. - bic xh, xh, #0x00100000 - orr xh, xh, r4, lsl #1 + adc xh, xh, r4, lsl #20 RETLDM "r4, r5, r6" @ Division by 0x1p*: shortcut a lot of code. LSYM(Ldv_1): and lr, lr, #0x80000000 orr xh, lr, xh, lsr #12 - add r4, r4, #0x1f000000 - add r4, r4, #0x00f80000 - cmp r4, ip, lsr #1 - bge LSYM(Lml_o) - cmp r4, #0 - orrgt xh, xh, r4, lsl #1 + adds r4, r4, ip, lsr #1 + rsbgts r5, r4, ip + orrgt xh, xh, r4, lsl #20 RETLDM "r4, r5, r6" gt - cmn r4, #(53 << 19) - ble LSYM(Ldv_z) orr xh, xh, #0x00100000 mov lr, #0 - b LSYM(Lml_r) + subs r4, r4, #1 + b LSYM(Lml_u) - @ Result must be denormalized: put remainder in lr for - @ rounding considerations. + @ Result mightt need to be denormalized: put remainder bits + @ in lr for rounding considerations. LSYM(Ldv_u): orr lr, r5, r6 - b LSYM(Lml_r) - - @ One or both arguments are denormalized. - @ Scale them leftwards and preserve sign bit. -LSYM(Ldv_d): - mov lr, #0 - teq r4, #0 - bne 2f - and r6, xh, #0x80000000 -1: movs xl, xl, lsl #1 - adc xh, lr, xh, lsl #1 - tst xh, #0x00100000 - subeq r4, r4, #(1 << 19) - beq 1b - orr xh, xh, r6 - teq r5, #0 - bne LSYM(Ldv_x) -2: and r6, yh, #0x80000000 -3: movs yl, yl, lsl #1 - adc yh, lr, yh, lsl #1 - tst yh, #0x00100000 - subeq r5, r5, #(1 << 20) - beq 3b - orr yh, yh, r6 - b LSYM(Ldv_x) + b LSYM(Lml_u) @ One or both arguments is either INF, NAN or zero. LSYM(Ldv_s): + and r5, ip, yh, lsr #20 teq r4, ip teqeq r5, ip beq LSYM(Lml_n) @ INF/NAN / INF/NAN -> NAN @@ -1018,13 +982,23 @@ LSYM(Ldv_s): bne 1f orrs r4, xl, xh, lsl #12 bne LSYM(Lml_n) @ NAN / -> NAN - b LSYM(Lml_i) @ INF / -> INF + teq r5, ip + bne LSYM(Lml_i) @ INF / -> INF + mov xl, yl + mov xh, yh + b LSYM(Lml_n) @ INF / (INF or NAN) -> NAN 1: teq r5, ip bne 2f orrs r5, yl, yh, lsl #12 - bne LSYM(Lml_n) @ / NAN -> NAN - b LSYM(Lml_z) @ / INF -> 0 -2: @ One or both arguments are 0. + beq LSYM(Lml_z) @ / INF -> 0 + mov xl, yl + mov xh, yh + b LSYM(Lml_n) @ / NAN -> NAN +2: @ If both are non-zero, we need to normalize and resume above. + orrs r6, xl, xh, lsl #1 + orrnes r6, yl, yh, lsl #1 + bne LSYM(Lml_d) + @ One or both arguments are 0. orrs r4, xl, xh, lsl #1 bne LSYM(Lml_i) @ / 0 -> INF orrs r5, yl, yh, lsl #1 @@ -1038,6 +1012,8 @@ LSYM(Ldv_s): #ifdef L_cmpdf2 +@ Note: only r0 (return value) and ip are clobbered here. + ARM_FUNC_START gtdf2 ARM_FUNC_ALIAS gedf2 gtdf2 mov ip, #-1 @@ -1053,15 +1029,13 @@ ARM_FUNC_ALIAS nedf2 cmpdf2 ARM_FUNC_ALIAS eqdf2 cmpdf2 mov ip, #1 @ how should we specify unordered here? -1: stmfd sp!, {r4, r5, lr} +1: str ip, [sp, #-4] @ Trap any INF/NAN first. 
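The rounding step of the division above never materialises a guard bit as such: the loop leaves the remainder pre-scaled so that comparing it against the divisor yields the round decision, and an exact tie falls back on the quotient's own LSB (ties to even), which is what the subs/subeqs/moveqs/adcs sequence encodes. A sketch of that policy, under the stated pre-scaling assumption and with illustrative names:

#include <stdint.h>

/* 'rem2' is the final remainder, assumed scaled so that rem2 == divisor
   means the true result sits exactly halfway between two representable
   values.  */
static uint64_t round_quotient(uint64_t q, uint64_t rem2, uint64_t divisor)
{
    if (rem2 > divisor)
        return q + 1;                /* round bit 1, sticky bits present */
    if (rem2 == divisor)
        return (q + 1) & ~1ULL;      /* exact tie: round to even */
    return q;                        /* round bit 0: truncate */
}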
- mov lr, #0x7f000000 - orr lr, lr, #0x00f00000 - and r4, xh, lr - and r5, yh, lr - teq r4, lr - teqne r5, lr + mov ip, xh, lsl #1 + mvns ip, ip, asr #21 + mov ip, yh, lsl #1 + mvnnes ip, ip, asr #21 beq 3f @ Test for equality. @@ -1071,37 +1045,37 @@ ARM_FUNC_ALIAS eqdf2 cmpdf2 teqne xh, yh @ or xh == yh teqeq xl, yl @ and xl == yl moveq r0, #0 @ then equal. - RETLDM "r4, r5" eq + RETc(eq) - @ Check for sign difference. + @ Clear C flag + cmn r0, #0 + + @ Compare sign, teq xh, yh - movmi r0, xh, asr #31 - orrmi r0, r0, #1 - RETLDM "r4, r5" mi - @ Compare exponents. - cmp r4, r5 - - @ Compare mantissa if exponents are equal. - moveq xh, xh, lsl #12 - cmpeq xh, yh, lsl #12 + @ Compare values if same sign + cmppl xh, yh cmpeq xl, yl + + @ Result: movcs r0, yh, asr #31 mvncc r0, yh, asr #31 orr r0, r0, #1 - RETLDM "r4, r5" + RET @ Look for a NAN. -3: teq r4, lr +3: mov ip, xh, lsl #1 + mvns ip, ip, asr #21 bne 4f - orrs xl, xl, xh, lsl #12 + orrs ip, xl, xh, lsl #12 bne 5f @ x is NAN -4: teq r5, lr +4: mov ip, yh, lsl #1 + mvns ip, ip, asr #21 bne 2b - orrs yl, yl, yh, lsl #12 + orrs ip, yl, yh, lsl #12 beq 2b @ y is not NAN -5: mov r0, ip @ return unordered code from ip - RETLDM "r4, r5" +5: ldr r0, [sp, #-4] @ unordered return code + RET FUNC_END gedf2 FUNC_END gtdf2 @@ -1112,6 +1086,7 @@ ARM_FUNC_ALIAS eqdf2 cmpdf2 FUNC_END cmpdf2 ARM_FUNC_START aeabi_cdrcmple + mov ip, r0 mov r0, r2 mov r2, ip @@ -1122,85 +1097,95 @@ ARM_FUNC_START aeabi_cdrcmple ARM_FUNC_START aeabi_cdcmpeq ARM_FUNC_ALIAS aeabi_cdcmple aeabi_cdcmpeq + @ The status-returning routines are required to preserve all @ registers except ip, lr, and cpsr. -6: stmfd sp!, {r0, r1, r2, r3, lr} +6: stmfd sp!, {r0, lr} ARM_CALL cmpdf2 @ Set the Z flag correctly, and the C flag unconditionally. cmp r0, #0 @ Clear the C flag if the return value was -1, indicating @ that the first operand was smaller than the second. cmnmi r0, #0 - RETLDM "r0, r1, r2, r3" + RETLDM "r0" + FUNC_END aeabi_cdcmple FUNC_END aeabi_cdcmpeq + FUNC_END aeabi_cdrcmple ARM_FUNC_START aeabi_dcmpeq + str lr, [sp, #-4]! ARM_CALL aeabi_cdcmple moveq r0, #1 @ Equal to. movne r0, #0 @ Less than, greater than, or unordered. RETLDM + FUNC_END aeabi_dcmpeq ARM_FUNC_START aeabi_dcmplt + str lr, [sp, #-4]! ARM_CALL aeabi_cdcmple movcc r0, #1 @ Less than. movcs r0, #0 @ Equal to, greater than, or unordered. RETLDM + FUNC_END aeabi_dcmplt ARM_FUNC_START aeabi_dcmple + str lr, [sp, #-4]! ARM_CALL aeabi_cdcmple movls r0, #1 @ Less than or equal to. movhi r0, #0 @ Greater than or unordered. RETLDM + FUNC_END aeabi_dcmple ARM_FUNC_START aeabi_dcmpge + str lr, [sp, #-4]! ARM_CALL aeabi_cdrcmple movls r0, #1 @ Operand 2 is less than or equal to operand 1. movhi r0, #0 @ Operand 2 greater than operand 1, or unordered. RETLDM + FUNC_END aeabi_dcmpge ARM_FUNC_START aeabi_dcmpgt + str lr, [sp, #-4]! ARM_CALL aeabi_cdrcmple movcc r0, #1 @ Operand 2 is less than operand 1. movcs r0, #0 @ Operand 2 is greater than or equal to operand 1, @ or they are unordered. RETLDM + FUNC_END aeabi_dcmpgt - + #endif /* L_cmpdf2 */ #ifdef L_unorddf2 ARM_FUNC_START unorddf2 ARM_FUNC_ALIAS aeabi_dcmpun unorddf2 - - str lr, [sp, #-4]! 
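Once NaNs have been excluded, the comparison above never treats the operands as floating point at all: identical bit patterns (or +0 against -0) compare equal, a sign mismatch decides immediately, and operands of the same sign order like sign-magnitude integers, with the order reversed when both are negative. The same strategy over the raw 64-bit patterns in C, as an illustration only:

#include <stdint.h>

static int compare_bits(uint64_t a, uint64_t b)   /* returns -1, 0 or +1 */
{
    if (a == b || !((a | b) << 1))
        return 0;                     /* identical, or +0 vs -0 */
    int sa = (int)(a >> 63), sb = (int)(b >> 63);
    if (sa != sb)
        return sa ? -1 : 1;           /* negative < positive */
    if (a > b)                        /* same sign: magnitude order...   */
        return sa ? -1 : 1;           /* ...flips when both are negative */
    return sa ? 1 : -1;
}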
- mov ip, #0x7f000000 - orr ip, ip, #0x00f00000 - and lr, xh, ip - teq lr, ip + + mov ip, xh, lsl #1 + mvns ip, ip, asr #21 bne 1f - orrs xl, xl, xh, lsl #12 + orrs ip, xl, xh, lsl #12 bne 3f @ x is NAN -1: and lr, yh, ip - teq lr, ip +1: mov ip, yh, lsl #1 + mvns ip, ip, asr #21 bne 2f - orrs yl, yl, yh, lsl #12 + orrs ip, yl, yh, lsl #12 bne 3f @ y is NAN 2: mov r0, #0 @ arguments are ordered. - RETLDM + RET 3: mov r0, #1 @ arguments are unordered. - RETLDM + RET FUNC_END aeabi_dcmpun FUNC_END unorddf2 @@ -1211,31 +1196,22 @@ ARM_FUNC_ALIAS aeabi_dcmpun unorddf2 ARM_FUNC_START fixdfsi ARM_FUNC_ALIAS aeabi_d2iz fixdfsi - orrs ip, xl, xh, lsl #1 - beq 1f @ value is 0. - - mov r3, r3, rrx @ preserve C flag (the actual sign) @ check exponent range. - mov ip, #0x7f000000 - orr ip, ip, #0x00f00000 - and r2, xh, ip - teq r2, ip - beq 2f @ value is INF or NAN - bic ip, ip, #0x40000000 - cmp r2, ip - bcc 1f @ value is too small - add ip, ip, #(31 << 20) - cmp r2, ip - bcs 3f @ value is too large + mov r2, xh, lsl #1 + adds r2, r2, #(1 << 21) + bcs 2f @ value is INF or NAN + bpl 1f @ value is too small + mov r3, #(0xfffffc00 + 31) + subs r2, r3, r2, asr #21 + bls 3f @ value is too large - rsb r2, r2, ip - mov ip, xh, lsl #11 - orr ip, ip, #0x80000000 - orr ip, ip, xl, lsr #21 - mov r2, r2, lsr #20 - tst r3, #0x80000000 @ the sign bit - mov r0, ip, lsr r2 + @ scale value + mov r3, xh, lsl #11 + orr r3, r3, #0x80000000 + orr r3, r3, xl, lsr #21 + tst xh, #0x80000000 @ the sign bit + mov r0, r3, lsr r2 rsbne r0, r0, #0 RET @@ -1243,8 +1219,8 @@ ARM_FUNC_ALIAS aeabi_d2iz fixdfsi RET 2: orrs xl, xl, xh, lsl #12 - bne 4f @ r0 is NAN. -3: ands r0, r3, #0x80000000 @ the sign bit + bne 4f @ x is NAN. +3: ands r0, xh, #0x80000000 @ the sign bit moveq r0, #0x7fffffff @ maximum signed positive si RET @@ -1260,29 +1236,22 @@ ARM_FUNC_ALIAS aeabi_d2iz fixdfsi ARM_FUNC_START fixunsdfsi ARM_FUNC_ALIAS aeabi_d2uiz fixunsdfsi - orrs ip, xl, xh, lsl #1 - movcss r0, #0 @ value is negative - RETc(eq) @ or 0 (xl, xh overlap r0) @ check exponent range. - mov ip, #0x7f000000 - orr ip, ip, #0x00f00000 - and r2, xh, ip - teq r2, ip - beq 2f @ value is INF or NAN - bic ip, ip, #0x40000000 - cmp r2, ip - bcc 1f @ value is too small - add ip, ip, #(31 << 20) - cmp r2, ip - bhi 3f @ value is too large + movs r2, xh, lsl #1 + bcs 1f @ value is negative + adds r2, r2, #(1 << 21) + bcs 2f @ value is INF or NAN + bpl 1f @ value is too small + mov r3, #(0xfffffc00 + 31) + subs r2, r3, r2, asr #21 + bmi 3f @ value is too large - rsb r2, r2, ip - mov ip, xh, lsl #11 - orr ip, ip, #0x80000000 - orr ip, ip, xl, lsr #21 - mov r2, r2, lsr #20 - mov r0, ip, lsr r2 + @ scale value + mov r3, xh, lsl #11 + orr r3, r3, #0x80000000 + orr r3, r3, xl, lsr #21 + mov r0, r3, lsr r2 RET 1: mov r0, #0 @@ -1305,91 +1274,61 @@ ARM_FUNC_ALIAS aeabi_d2uiz fixunsdfsi ARM_FUNC_START truncdfsf2 ARM_FUNC_ALIAS aeabi_d2f truncdfsf2 - orrs r2, xl, xh, lsl #1 - moveq r0, r2, rrx - RETc(eq) @ value is 0.0 or -0.0 - + @ check exponent range. - mov ip, #0x7f000000 - orr ip, ip, #0x00f00000 - and r2, ip, xh - teq r2, ip - beq 2f @ value is INF or NAN - bic xh, xh, ip - cmp r2, #(0x380 << 20) - bls 4f @ value is too small + mov r2, xh, lsl #1 + subs r3, r2, #((1023 - 127) << 21) + subcss ip, r3, #(1 << 21) + rsbcss ip, ip, #(254 << 21) + bls 2f @ value is out of range - @ shift and round mantissa -1: movs r3, xl, lsr #29 - adc r3, r3, xh, lsl #3 - - @ if halfway between two numbers, round towards LSB = 0. 
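The fixdfsi code above turns the conversion into plain integer work: read the exponent, bail out for small, huge or NaN inputs, place the implicit 1 and the top fraction bits at the top of a word, shift right by 31 minus the unbiased exponent, and negate if the sign bit was set. A C rendition of those steps; names are illustrative, and like the assembly it sends NaN to 0 and saturates on overflow.

#include <stdint.h>

static int32_t double_to_int(uint64_t bits)
{
    int exp = (int)((bits >> 52) & 0x7ff) - 1023;        /* unbiased exponent */
    if (exp < 0)
        return 0;                                        /* |value| < 1, incl. zero */
    if (exp >= 31) {
        if (exp == 1024 && (bits << 12))                 /* NaN: fraction non-zero */
            return 0;
        return (bits >> 63) ? INT32_MIN : INT32_MAX;     /* saturate */
    }
    uint32_t m = 0x80000000u                             /* implicit 1 ... */
               | (uint32_t)((bits & 0x000fffffffffffffULL) >> 21); /* + top 31 fraction bits */
    uint32_t r = m >> (31 - exp);
    return (bits >> 63) ? -(int32_t)r : (int32_t)r;
}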
- mov xl, xl, lsl #3 - teq xl, #0x80000000 - biceq r3, r3, #1 - - @ rounding might have created an extra MSB. If so adjust exponent. - tst r3, #0x00800000 - addne r2, r2, #(1 << 20) - bicne r3, r3, #0x00800000 - - @ check exponent for overflow - mov ip, #(0x400 << 20) - orr ip, ip, #(0x07f << 20) - cmp r2, ip - bcs 3f @ overflow - - @ adjust exponent, merge with sign bit and mantissa. - movs xh, xh, lsl #1 - mov r2, r2, lsl #4 - orr r0, r3, r2, rrx - eor r0, r0, #0x40000000 +1: @ shift and round mantissa + and ip, xh, #0x80000000 + mov r2, xl, lsl #3 + orr xl, ip, xl, lsr #29 + cmp r2, #0x80000000 + adc r0, xl, r3, lsl #2 + biceq r0, r0, #1 RET -2: @ chech for NAN - orrs xl, xl, xh, lsl #12 +2: @ either overflow or underflow + tst xh, #0x40000000 + bne 3f @ overflow + + @ check if denormalized value is possible + adds r2, r3, #(23 << 21) + andlt r0, xh, #0x80000000 @ too small, return signed 0. + RETc(lt) + + @ denormalize value so we can resume with the code above afterwards. + orr xh, xh, #0x00100000 + mov r2, r2, lsr #21 + rsb r2, r2, #24 + rsb ip, r2, #32 + movs r3, xl, lsl ip + mov xl, xl, lsr r2 + orrne xl, xl, #1 @ fold r3 for rounding considerations. + mov r3, xh, lsl #11 + mov r3, r3, lsr #11 + orr xl, xl, r3, lsl ip + mov r3, r3, lsr r2 + mov r3, r3, lsl #1 + b 1b + +3: @ chech for NAN + mvns r3, r2, asr #21 + bne 5f @ simple overflow + orrs r3, xl, xh, lsl #12 movne r0, #0x7f000000 orrne r0, r0, #0x00c00000 RETc(ne) @ return NAN -3: @ return INF with sign +5: @ return INF with sign and r0, xh, #0x80000000 orr r0, r0, #0x7f000000 orr r0, r0, #0x00800000 RET -4: @ check if denormalized value is possible - subs r2, r2, #((0x380 - 24) << 20) - andle r0, xh, #0x80000000 @ too small, return signed 0. - RETc(le) - - @ denormalize value so we can resume with the code above afterwards. - orr xh, xh, #0x00100000 - mov r2, r2, lsr #20 - rsb r2, r2, #25 - cmp r2, #20 - bgt 6f - - rsb ip, r2, #32 - mov r3, xl, lsl ip - mov xl, xl, lsr r2 - orr xl, xl, xh, lsl ip - movs xh, xh, lsl #1 - mov xh, xh, lsr r2 - mov xh, xh, rrx -5: teq r3, #0 @ fold r3 bits into the LSB - orrne xl, xl, #1 @ for rounding considerations. - mov r2, #(0x380 << 20) @ equivalent to the 0 float exponent - b 1b - -6: rsb r2, r2, #(12 + 20) - rsb ip, r2, #32 - mov r3, xl, lsl r2 - mov xl, xl, lsr ip - orr xl, xl, xh, lsl r2 - and xh, xh, #0x80000000 - b 5b - FUNC_END aeabi_d2f FUNC_END truncdfsf2 diff --git a/gcc/config/arm/ieee754-sf.S b/gcc/config/arm/ieee754-sf.S index d82fa8c84f7..8eae6e9325d 100644 --- a/gcc/config/arm/ieee754-sf.S +++ b/gcc/config/arm/ieee754-sf.S @@ -42,7 +42,7 @@ ARM_FUNC_START negsf2 ARM_FUNC_ALIAS aeabi_fneg negsf2 - + eor r0, r0, #0x80000000 @ flip sign bit RET @@ -56,11 +56,11 @@ ARM_FUNC_ALIAS aeabi_fneg negsf2 ARM_FUNC_START aeabi_frsub eor r0, r0, #0x80000000 @ flip sign bit of first arg - b 1f - + b 1f + ARM_FUNC_START subsf3 ARM_FUNC_ALIAS aeabi_fsub subsf3 - + eor r1, r1, #0x80000000 @ flip sign bit of second arg #if defined(__thumb__) && !defined(__THUMB_INTERWORK__) b 1f @ Skip Thumb-code prologue @@ -68,32 +68,19 @@ ARM_FUNC_ALIAS aeabi_fsub subsf3 ARM_FUNC_START addsf3 ARM_FUNC_ALIAS aeabi_fadd addsf3 - -1: @ Compare both args, return zero if equal but the sign. - eor r2, r0, r1 - teq r2, #0x80000000 - beq LSYM(Lad_z) - @ If first arg is 0 or -0, return second arg. - @ If second arg is 0 or -0, return first arg. - bics r2, r0, #0x80000000 - moveq r0, r1 - bicnes r2, r1, #0x80000000 - RETc(eq) - - @ Mask out exponents. 
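For in-range inputs the truncation above is again integer manipulation: rebias the exponent from 1023 to 127, keep the top 23 fraction bits, and round to nearest on the 29 bits that fall off, with ties going to even; overflow, denormal results and NaNs use the extra paths shown. A C model of the normal case only, assuming the rebiased exponent lands in 1..254 and using illustrative names:

#include <stdint.h>

static uint32_t d2f_normal(uint64_t bits)
{
    uint32_t sign = (uint32_t)(bits >> 63) << 31;
    int exp = (int)((bits >> 52) & 0x7ff) - 1023 + 127;    /* rebias */
    uint32_t frac = (uint32_t)((bits >> 29) & 0x007fffff); /* top 23 fraction bits */
    uint32_t rest = (uint32_t)(bits << 3);                 /* 29 discarded bits, MSB-aligned */

    uint32_t r = sign | ((uint32_t)exp << 23) | frac;
    r += rest >> 31;                 /* add the guard bit; may carry into the exponent */
    if (rest == 0x80000000u)         /* exact tie: round to even */
        r &= ~1u;
    return r;
}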
- mov ip, #0xff000000 - and r2, r0, ip, lsr #1 - and r3, r1, ip, lsr #1 - - @ If either of them is 255, result will be INF or NAN - teq r2, ip, lsr #1 - teqne r3, ip, lsr #1 - beq LSYM(Lad_i) +1: @ Look for zeroes, equal values, INF, or NAN. + movs r2, r0, lsl #1 + movnes r3, r1, lsl #1 + teqne r2, r3 + mvnnes ip, r2, asr #24 + mvnnes ip, r3, asr #24 + beq LSYM(Lad_s) @ Compute exponent difference. Make largest exponent in r2, @ corresponding arg in r0, and positive exponent difference in r3. - subs r3, r3, r2 + mov r2, r2, lsr #24 + rsbs r3, r2, r3, lsr #24 addgt r2, r2, r3 eorgt r1, r0, r1 eorgt r0, r1, r0 @@ -103,7 +90,7 @@ ARM_FUNC_ALIAS aeabi_fadd addsf3 @ If exponent difference is too large, return largest argument @ already in r0. We need up to 25 bit to handle proper rounding @ of 0x1p25 - 1.1. - cmp r3, #(25 << 23) + cmp r3, #25 RETc(hi) @ Convert mantissa to signed integer. @@ -122,25 +109,17 @@ ARM_FUNC_ALIAS aeabi_fadd addsf3 beq LSYM(Lad_d) LSYM(Lad_x): - @ Scale down second arg with exponent difference. - @ Apply shift one bit left to first arg and the rest to second arg - @ to simplify things later, but only if exponent does not become 0. - movs r3, r3, lsr #23 - teqne r2, #(1 << 23) - movne r0, r0, lsl #1 - subne r2, r2, #(1 << 23) - subne r3, r3, #1 + @ Compensate for the exponent overlapping the mantissa MSB added later + sub r2, r2, #1 - @ Shift second arg into ip, keep leftover bits into r1. - mov ip, r1, asr r3 + @ Shift and add second arg to first arg in r0. + @ Keep leftover bits into r1. + adds r0, r0, r1, asr r3 rsb r3, r3, #32 mov r1, r1, lsl r3 - add r0, r0, ip @ the actual addition - - @ We now have a 64 bit result in r0-r1. - @ Keep absolute value in r0-r1, sign in r3. - ands r3, r0, #0x80000000 + @ Keep absolute value in r0-r1, sign in r3 (the n bit was set above) + and r3, r0, #0x80000000 bpl LSYM(Lad_p) rsbs r1, r1, #0 rsc r0, r0, #0 @@ -148,104 +127,118 @@ LSYM(Lad_x): @ Determine how to normalize the result. LSYM(Lad_p): cmp r0, #0x00800000 - bcc LSYM(Lad_l) + bcc LSYM(Lad_a) cmp r0, #0x01000000 - bcc LSYM(Lad_r0) - cmp r0, #0x02000000 - bcc LSYM(Lad_r1) + bcc LSYM(Lad_e) @ Result needs to be shifted right. movs r0, r0, lsr #1 mov r1, r1, rrx - add r2, r2, #(1 << 23) -LSYM(Lad_r1): - movs r0, r0, lsr #1 - mov r1, r1, rrx - add r2, r2, #(1 << 23) - - @ Our result is now properly aligned into r0, remaining bits in r1. - @ Round with MSB of r1. If halfway between two numbers, round towards - @ LSB of r0 = 0. -LSYM(Lad_r0): - add r0, r0, r1, lsr #31 - teq r1, #0x80000000 - biceq r0, r0, #1 - - @ Rounding may have added a new MSB. Adjust exponent. - @ That MSB will be cleared when exponent is merged below. - tst r0, #0x01000000 - addne r2, r2, #(1 << 23) + add r2, r2, #1 @ Make sure we did not bust our exponent. - cmp r2, #(254 << 23) - bhi LSYM(Lad_o) + cmp r2, #254 + bhs LSYM(Lad_o) + @ Our result is now properly aligned into r0, remaining bits in r1. @ Pack final result together. + @ Round with MSB of r1. If halfway between two numbers, round towards + @ LSB of r0 = 0. LSYM(Lad_e): - bic r0, r0, #0x01800000 - orr r0, r0, r2 + cmp r1, #0x80000000 + adc r0, r0, r2, lsl #23 + biceq r0, r0, #1 orr r0, r0, r3 RET - @ Result must be shifted left. - @ No rounding necessary since r1 will always be 0. + @ Result must be shifted left and exponent adjusted. +LSYM(Lad_a): + movs r1, r1, lsl #1 + adc r0, r0, r0 + tst r0, #0x00800000 + sub r2, r2, #1 + bne LSYM(Lad_e) + + @ No rounding necessary since r1 will always be 0 at this point. 
LSYM(Lad_l): #if __ARM_ARCH__ < 5 movs ip, r0, lsr #12 moveq r0, r0, lsl #12 - subeq r2, r2, #(12 << 23) + subeq r2, r2, #12 tst r0, #0x00ff0000 moveq r0, r0, lsl #8 - subeq r2, r2, #(8 << 23) + subeq r2, r2, #8 tst r0, #0x00f00000 moveq r0, r0, lsl #4 - subeq r2, r2, #(4 << 23) + subeq r2, r2, #4 tst r0, #0x00c00000 moveq r0, r0, lsl #2 - subeq r2, r2, #(2 << 23) - tst r0, #0x00800000 - moveq r0, r0, lsl #1 - subeq r2, r2, #(1 << 23) - cmp r2, #0 - bgt LSYM(Lad_e) + subeq r2, r2, #2 + cmp r0, #0x00800000 + movcc r0, r0, lsl #1 + sbcs r2, r2, #0 #else clz ip, r0 sub ip, ip, #8 + subs r2, r2, ip mov r0, r0, lsl ip - subs r2, r2, ip, lsl #23 - bgt LSYM(Lad_e) #endif - @ Exponent too small, denormalize result. - mvn r2, r2, asr #23 - add r2, r2, #2 - orr r0, r3, r0, lsr r2 + @ Final result with sign + @ If exponent negative, denormalize result. + addge r0, r0, r2, lsl #23 + rsblt r2, r2, #0 + orrge r0, r0, r3 + orrlt r0, r3, r0, lsr r2 RET @ Fixup and adjust bit position for denormalized arguments. @ Note that r2 must not remain equal to 0. LSYM(Lad_d): teq r2, #0 - eoreq r0, r0, #0x00800000 - addeq r2, r2, #(1 << 23) eor r1, r1, #0x00800000 - subne r3, r3, #(1 << 23) + eoreq r0, r0, #0x00800000 + addeq r2, r2, #1 + subne r3, r3, #1 b LSYM(Lad_x) - @ Result is x - x = 0, unless x is INF or NAN. -LSYM(Lad_z): - mov ip, #0xff000000 - and r2, r0, ip, lsr #1 - teq r2, ip, lsr #1 - moveq r0, ip, asr #2 - movne r0, #0 +LSYM(Lad_s): + mov r3, r1, lsl #1 + + mvns ip, r2, asr #24 + mvnnes ip, r3, asr #24 + beq LSYM(Lad_i) + + teq r2, r3 + beq 1f + + @ Result is x + 0.0 = x or 0.0 + y = y. + teq r2, #0 + moveq r0, r1 RET +1: teq r0, r1 + + @ Result is x - x = 0. + movne r0, #0 + RETc(ne) + + @ Result is x + x = 2x. + tst r2, #0xff000000 + bne 2f + movs r0, r0, lsl #1 + orrcs r0, r0, #0x80000000 + RET +2: adds r2, r2, #(2 << 24) + addcc r0, r0, #(1 << 23) + RETc(cc) + and r3, r0, #0x80000000 + @ Overflow: return INF. LSYM(Lad_o): orr r0, r3, #0x7f000000 @@ -257,16 +250,16 @@ LSYM(Lad_o): @ if r1 != INF/NAN: return r0 (which is INF/NAN) @ if r0 or r1 is NAN: return NAN @ if opposite sign: return NAN - @ return r0 (which is INF or -INF) + @ otherwise return r0 (which is INF or -INF) LSYM(Lad_i): - teq r2, ip, lsr #1 + mvns r2, r2, asr #24 movne r0, r1 - teqeq r3, ip, lsr #1 - RETc(ne) + mvneqs r3, r3, asr #24 + movne r1, r0 movs r2, r0, lsl #9 - moveqs r2, r1, lsl #9 + moveqs r3, r1, lsl #9 teqeq r0, r1 - orrne r0, r3, #0x00400000 @ NAN + orrne r0, r0, #0x00400000 @ quiet NAN RET FUNC_END aeabi_frsub @@ -287,28 +280,17 @@ ARM_FUNC_ALIAS aeabi_i2f floatsisf ands r3, r0, #0x80000000 rsbmi r0, r0, #0 -1: teq r0, #0 +1: movs ip, r0 RETc(eq) -3: - mov r1, #0 - mov r2, #((127 + 23) << 23) - tst r0, #0xfc000000 - beq LSYM(Lad_p) + @ Add initial exponent to sign + orr r3, r3, #((127 + 23) << 23) - @ We need to scale the value a little before branching to code above. 
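The tail of the normalisation above packs the result in one step: the mantissa still carries its leading 1, so adding the exponent minus one shifted into place produces the right exponent field, and when that exponent would drop to zero or below the mantissa is instead shifted right and stored with an exponent field of 0, giving a denormal (no rounding is needed because the discarded bits are zero on this path). In C, with illustrative names:

#include <stdint.h>

/* 'mant24' has its leading 1 at bit 23; 'exp_m1' is the biased exponent
   minus one.  */
static uint32_t pack_single(uint32_t sign, int exp_m1, uint32_t mant24)
{
    if (exp_m1 >= 0)
        return sign | (mant24 + ((uint32_t)exp_m1 << 23));   /* normal number */
    return sign | (mant24 >> -exp_m1);        /* exponent field 0: denormal */
}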
- tst r0, #0xf0000000 -4: - orrne r1, r1, r0, lsl #28 - movne r0, r0, lsr #4 - addne r2, r2, #(4 << 23) - tst r0, #0x0c000000 - beq LSYM(Lad_p) - mov r1, r1, lsr #2 - orr r1, r1, r0, lsl #30 - mov r0, r0, lsr #2 - add r2, r2, #(2 << 23) - b LSYM(Lad_p) + .ifnc ah, r0 + mov ah, r0 + .endif + mov al, #0 + b 2f FUNC_END aeabi_i2f FUNC_END floatsisf @@ -317,22 +299,15 @@ ARM_FUNC_ALIAS aeabi_i2f floatsisf ARM_FUNC_START floatundisf ARM_FUNC_ALIAS aeabi_ul2f floatundisf + orrs r2, r0, r1 #if !defined (__VFP_FP__) && !defined(__SOFTFP__) mvfeqs f0, #0.0 #endif RETc(eq) - -#if !defined (__VFP_FP__) && !defined(__SOFTFP__) - @ For hard FPA code we want to return via the tail below so that - @ we can return the result in f0 as well as in r0 for backwards - @ compatibility. - str lr, [sp, #-4]! - adr lr, 4f -#endif mov r3, #0 - b 2f + b 1f ARM_FUNC_START floatdisf ARM_FUNC_ALIAS aeabi_l2f floatdisf @@ -342,78 +317,80 @@ ARM_FUNC_ALIAS aeabi_l2f floatdisf mvfeqs f0, #0.0 #endif RETc(eq) - + + ands r3, ah, #0x80000000 @ sign bit in r3 + bpl 1f + rsbs al, al, #0 + rsc ah, ah, #0 +1: #if !defined (__VFP_FP__) && !defined(__SOFTFP__) @ For hard FPA code we want to return via the tail below so that @ we can return the result in f0 as well as in r0 for backwards @ compatibility. str lr, [sp, #-4]! - adr lr, 4f + adr lr, LSYM(f0_ret) #endif - ands r3, ah, #0x80000000 @ sign bit in r3 - bpl 2f - rsbs al, al, #0 - rsc ah, ah, #0 -2: + movs ip, ah -#ifdef __ARMEB__ - moveq r0, al -#endif - beq 3b - mov r2, #((127 + 23 + 32) << 23) @ initial exponent -#ifndef __ARMEB__ - mov r1, al - mov r0, ip -#endif - tst r0, #0xfc000000 - bne 3f + moveq ip, al + + @ Add initial exponent to sign + orr r3, r3, #((127 + 23 + 32) << 23) + subeq r3, r3, #(32 << 23) +2: sub r3, r3, #(1 << 23) #if __ARM_ARCH__ < 5 - cmp r0, #(1 << 13) - movlo ip, #13 - movlo r0, r0, lsl #13 - movhs ip, #0 - tst r0, #0x03fc0000 - addeq ip, ip, #8 - moveq r0, r0, lsl #8 - tst r0, #0x03c00000 - addeq ip, ip, #4 - moveq r0, r0, lsl #4 - tst r0, #0x03000000 - addeq ip, ip, #2 - moveq r0, r0, lsl #2 + + mov r2, #23 + cmp ip, #(1 << 16) + movhs ip, ip, lsr #16 + subhs r2, r2, #16 + cmp ip, #(1 << 8) + movhs ip, ip, lsr #8 + subhs r2, r2, #8 + cmp ip, #(1 << 4) + movhs ip, ip, lsr #4 + subhs r2, r2, #4 + cmp ip, #(1 << 2) + subhs r2, r2, #2 + sublo r2, r2, ip, lsr #1 + subs r2, r2, ip, lsr #3 + #else - clz ip, r0 - sub ip, ip, #6 - mov r0, r0, lsl ip + + clz r2, ip + subs r2, r2, #8 + #endif - sub r2, r2, ip, lsl #23 - rsb ip, ip, #32 - orr r0, r0, r1, lsr ip - rsb ip, ip, #32 - mov r1, r1, asl ip - @ At this point we no-longer care about the precise value in r1, only - @ whether only the top bit is set, or if the top bit and some others - @ are set. - and ip, r1, #0xff - orr r1, r1, ip, lsl #8 - b LSYM(Lad_p) -3: - @ We need to scale the value a little before branching to code above. - @ At this point we no-longer care about the precise value in r1, only - @ whether only the top bit is set, or if the top bit and some others - @ are set. - and ip, r1, #0xff - orr r1, r1, ip, lsl #8 - tst r0, #0xf0000000 - movne r1, r1, lsr #4 - b 4b + + sub r3, r3, r2, lsl #23 + blt 3f + + add r3, r3, ah, lsl r2 + mov ip, al, lsl r2 + rsb r2, r2, #32 + cmp ip, #0x80000000 + adc r0, r3, al, lsr r2 + biceq r0, r0, #1 + RET + +3: add r2, r2, #32 + mov ip, ah, lsl r2 + rsb r2, r2, #32 + orrs al, al, ip, lsl #1 + adc r0, r3, ah, lsr r2 + biceq r0, r0, ip, lsr #31 + RET + #if !defined (__VFP_FP__) && !defined(__SOFTFP__) -4: + +LSYM(f0_ret) str r0, [sp, #-4]! 
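The 64-bit to float conversions above follow one recipe: take the magnitude, locate its most significant bit (clz, or the comparison chain on older cores), keep 24 significant bits, round to nearest with ties to even on whatever was shifted out, then put the sign back. A self-contained C model of that recipe; the function names are illustrative, not the library entry points.

#include <stdint.h>

static uint32_t u64_to_float_bits(uint64_t v)
{
    if (v == 0)
        return 0;

    int msb = 63;                             /* position of the leading 1 */
    while (!(v & (1ULL << msb)))
        msb--;

    uint32_t exp = (uint32_t)msb + 127;       /* biased exponent */
    if (msb <= 23)                            /* exact: no bits are discarded */
        return (exp << 23) + (((uint32_t)v << (23 - msb)) & 0x007fffff);

    int sh = msb - 23;                        /* number of discarded bits */
    uint64_t rest = v << (64 - sh);           /* discarded bits, MSB-aligned */
    uint32_t r = (exp << 23)
               + ((uint32_t)(v >> sh) & 0x007fffff);

    r += (uint32_t)(rest >> 63);              /* add the guard bit */
    if (rest == 0x8000000000000000ULL)
        r &= ~1u;                             /* exact tie: round to even */
    return r;
}

static uint32_t i64_to_float_bits(int64_t v)
{
    uint32_t sign = v < 0 ? 0x80000000u : 0;
    uint64_t mag  = v < 0 ? 0 - (uint64_t)v : (uint64_t)v;
    return sign | u64_to_float_bits(mag);
}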
ldfs f0, [sp], #4 RETLDM + #endif + FUNC_END floatdisf FUNC_END aeabi_l2f FUNC_END floatundisf @@ -425,139 +402,117 @@ ARM_FUNC_ALIAS aeabi_l2f floatdisf ARM_FUNC_START mulsf3 ARM_FUNC_ALIAS aeabi_fmul mulsf3 - - @ Mask out exponents. - mov ip, #0xff000000 - and r2, r0, ip, lsr #1 - and r3, r1, ip, lsr #1 - @ Trap any INF/NAN. - teq r2, ip, lsr #1 - teqne r3, ip, lsr #1 + @ Mask out exponents, trap any zero/denormal/INF/NAN. + mov ip, #0xff + ands r2, ip, r0, lsr #23 + andnes r3, ip, r1, lsr #23 + teqne r2, ip + teqne r3, ip beq LSYM(Lml_s) - - @ Trap any multiplication by 0. - bics ip, r0, #0x80000000 - bicnes ip, r1, #0x80000000 - beq LSYM(Lml_z) - - @ Shift exponents right one bit to make room for overflow bit. - @ If either of them is 0, scale denormalized arguments off line. - @ Then add both exponents together. - movs r2, r2, lsr #1 - teqne r3, #0 - beq LSYM(Lml_d) LSYM(Lml_x): - add r2, r2, r3, asr #1 - @ Preserve final sign in r2 along with exponent for now. - teq r0, r1 - orrmi r2, r2, #0x8000 + @ Add exponents together + add r2, r2, r3 + + @ Determine final sign. + eor ip, r0, r1 @ Convert mantissa to unsigned integer. - bic r0, r0, #0xff000000 - bic r1, r1, #0xff000000 - orr r0, r0, #0x00800000 - orr r1, r1, #0x00800000 + @ If power of two, branch to a separate path. + @ Make up for final alignment. + movs r0, r0, lsl #9 + movnes r1, r1, lsl #9 + beq LSYM(Lml_1) + mov r3, #0x08000000 + orr r0, r3, r0, lsr #5 + orr r1, r3, r1, lsr #5 #if __ARM_ARCH__ < 4 + @ Put sign bit in r3, which will be restored into r0 later. + and r3, ip, #0x80000000 + @ Well, no way to make it shorter without the umull instruction. - @ We must perform that 24 x 24 -> 48 bit multiplication by hand. - stmfd sp!, {r4, r5} + stmfd sp!, {r3, r4, r5} mov r4, r0, lsr #16 mov r5, r1, lsr #16 - bic r0, r0, #0x00ff0000 - bic r1, r1, #0x00ff0000 + bic r0, r0, r4, lsl #16 + bic r1, r1, r5, lsl #16 mul ip, r4, r5 mul r3, r0, r1 mul r0, r5, r0 mla r0, r4, r1, r0 adds r3, r3, r0, lsl #16 - adc ip, ip, r0, lsr #16 - ldmfd sp!, {r4, r5} + adc r1, ip, r0, lsr #16 + ldmfd sp!, {r0, r4, r5} #else - umull r3, ip, r0, r1 @ The actual multiplication. + @ The actual multiplication. + umull r3, r1, r0, r1 + + @ Put final sign in r0. + and r0, ip, #0x80000000 #endif - @ Put final sign in r0. - mov r0, r2, lsl #16 - bic r2, r2, #0x8000 + @ Adjust result upon the MSB position. + cmp r1, #(1 << 23) + movcc r1, r1, lsl #1 + orrcc r1, r1, r3, lsr #31 + movcc r3, r3, lsl #1 - @ Adjust result if one extra MSB appeared. - @ The LSB may be lost but this never changes the result in this case. - tst ip, #(1 << 15) - addne r2, r2, #(1 << 22) - movnes ip, ip, lsr #1 - movne r3, r3, rrx + @ Add sign to result. + orr r0, r0, r1 - @ Apply exponent bias, check range for underflow. - subs r2, r2, #(127 << 22) - ble LSYM(Lml_u) + @ Apply exponent bias, check for under/overflow. + sbc r2, r2, #127 + cmp r2, #(254 - 1) + bhi LSYM(Lml_u) - @ Scale back to 24 bits with rounding. - @ r0 contains sign bit already. - orrs r0, r0, r3, lsr #23 - adc r0, r0, ip, lsl #9 - - @ If halfway between two numbers, rounding should be towards LSB = 0. - mov r3, r3, lsl #9 - teq r3, #0x80000000 + @ Round the result, merge final exponent. + cmp r3, #0x80000000 + adc r0, r0, r2, lsl #23 biceq r0, r0, #1 - - @ Note: rounding may have produced an extra MSB here. - @ The extra bit is cleared before merging the exponent below. - tst r0, #0x01000000 - addne r2, r2, #(1 << 22) - - @ Check for exponent overflow - cmp r2, #(255 << 22) - bge LSYM(Lml_o) - - @ Add final exponent. 
- bic r0, r0, #0x01800000 - orr r0, r0, r2, lsl #1 RET - @ Result is 0, but determine sign anyway. -LSYM(Lml_z): - eor r0, r0, r1 - bic r0, r0, #0x7fffffff - RET + @ Multiplication by 0x1p*: let''s shortcut a lot of code. +LSYM(Lml_1): + teq r0, #0 + and ip, ip, #0x80000000 + moveq r1, r1, lsl #9 + orr r0, ip, r0, lsr #9 + orr r0, r0, r1, lsr #9 + subs r2, r2, #127 + rsbgts r3, r2, #255 + orrgt r0, r0, r2, lsl #23 + RETc(gt) + + @ Under/overflow: fix things up for the code below. + orr r0, r0, #0x00800000 + mov r3, #0 + subs r2, r2, #1 + +LSYM(Lml_u): + @ Overflow? + bgt LSYM(Lml_o) @ Check if denormalized result is possible, otherwise return signed 0. -LSYM(Lml_u): - cmn r2, #(24 << 22) + cmn r2, #(24 + 1) + bicle r0, r0, #0x7fffffff RETc(le) - @ Find out proper shift value. - mvn r1, r2, asr #22 - subs r1, r1, #7 - bgt LSYM(Lml_ur) - - @ Shift value left, round, etc. - add r1, r1, #32 - orrs r0, r0, r3, lsr r1 - rsb r1, r1, #32 - adc r0, r0, ip, lsl r1 - mov ip, r3, lsl r1 - teq ip, #0x80000000 - biceq r0, r0, #1 - RET - @ Shift value right, round, etc. - @ Note: r1 must not be 0 otherwise carry does not get set. -LSYM(Lml_ur): - orrs r0, r0, ip, lsr r1 + rsb r2, r2, #0 + movs r1, r0, lsl #1 + mov r1, r1, lsr r2 + rsb r2, r2, #32 + mov ip, r0, lsl r2 + movs r0, r1, rrx adc r0, r0, #0 - rsb r1, r1, #32 - mov ip, ip, lsl r1 - teq r3, #0 - teqeq ip, #0x80000000 - biceq r0, r0, #1 + orrs r3, r3, ip, lsl #1 + biceq r0, r0, ip, lsr #31 RET @ One or both arguments are denormalized. @@ -567,32 +522,51 @@ LSYM(Lml_d): and ip, r0, #0x80000000 1: moveq r0, r0, lsl #1 tsteq r0, #0x00800000 - subeq r2, r2, #(1 << 22) + subeq r2, r2, #1 beq 1b orr r0, r0, ip teq r3, #0 and ip, r1, #0x80000000 2: moveq r1, r1, lsl #1 tsteq r1, #0x00800000 - subeq r3, r3, #(1 << 23) + subeq r3, r3, #1 beq 2b orr r1, r1, ip b LSYM(Lml_x) - @ One or both args are INF or NAN. LSYM(Lml_s): + @ Isolate the INF and NAN cases away + and r3, ip, r1, lsr #23 + teq r2, ip + teqne r3, ip + beq 1f + + @ Here, one or more arguments are either denormalized or zero. + bics ip, r0, #0x80000000 + bicnes ip, r1, #0x80000000 + bne LSYM(Lml_d) + + @ Result is 0, but determine sign anyway. +LSYM(Lml_z): + eor r0, r0, r1 + bic r0, r0, #0x7fffffff + RET + +1: @ One or both args are INF or NAN. teq r0, #0x0 - teqne r1, #0x0 teqne r0, #0x80000000 + moveq r0, r1 + teqne r1, #0x0 teqne r1, #0x80000000 beq LSYM(Lml_n) @ 0 * INF or INF * 0 -> NAN - teq r2, ip, lsr #1 + teq r2, ip bne 1f movs r2, r0, lsl #9 bne LSYM(Lml_n) @ NAN * -> NAN -1: teq r3, ip, lsr #1 +1: teq r3, ip bne LSYM(Lml_i) movs r3, r1, lsl #9 + movne r0, r1 bne LSYM(Lml_n) @ * NAN -> NAN @ Result is INF, but we need to determine its sign. @@ -606,9 +580,9 @@ LSYM(Lml_o): orr r0, r0, #0x00800000 RET - @ Return NAN. + @ Return a quiet NAN. LSYM(Lml_n): - mov r0, #0x7f000000 + orr r0, r0, #0x7f000000 orr r0, r0, #0x00c00000 RET @@ -617,37 +591,28 @@ LSYM(Lml_n): ARM_FUNC_START divsf3 ARM_FUNC_ALIAS aeabi_fdiv divsf3 - - @ Mask out exponents. - mov ip, #0xff000000 - and r2, r0, ip, lsr #1 - and r3, r1, ip, lsr #1 - @ Trap any INF/NAN or zeroes. - teq r2, ip, lsr #1 - teqne r3, ip, lsr #1 - bicnes ip, r0, #0x80000000 - bicnes ip, r1, #0x80000000 + @ Mask out exponents, trap any zero/denormal/INF/NAN. + mov ip, #0xff + ands r2, ip, r0, lsr #23 + andnes r3, ip, r1, lsr #23 + teqne r2, ip + teqne r3, ip beq LSYM(Ldv_s) - - @ Shift exponents right one bit to make room for overflow bit. - @ If either of them is 0, scale denormalized arguments off line. 
- @ Then substract divisor exponent from dividend''s. - movs r2, r2, lsr #1 - teqne r3, #0 - beq LSYM(Ldv_d) LSYM(Ldv_x): - sub r2, r2, r3, asr #1 + + @ Substract divisor exponent from dividend''s + sub r2, r2, r3 @ Preserve final sign into ip. eor ip, r0, r1 @ Convert mantissa to unsigned integer. @ Dividend -> r3, divisor -> r1. - mov r3, #0x10000000 movs r1, r1, lsl #9 mov r0, r0, lsl #9 beq LSYM(Ldv_1) + mov r3, #0x10000000 orr r1, r3, r1, lsr #4 orr r3, r3, r0, lsr #4 @@ -655,16 +620,10 @@ LSYM(Ldv_x): and r0, ip, #0x80000000 @ Ensure result will land to known bit position. + @ Apply exponent bias accordingly. cmp r3, r1 - subcc r2, r2, #(1 << 22) movcc r3, r3, lsl #1 - - @ Apply exponent bias, check range for over/underflow. - add r2, r2, #(127 << 22) - cmn r2, #(24 << 22) - RETc(le) - cmp r2, #(255 << 22) - bge LSYM(Lml_o) + adc r2, r2, #(127 - 2) @ The actual division loop. mov ip, #0x00800000 @@ -684,44 +643,29 @@ LSYM(Ldv_x): movnes ip, ip, lsr #4 bne 1b - @ Check if denormalized result is needed. - cmp r2, #0 - ble LSYM(Ldv_u) + @ Check exponent for under/overflow. + cmp r2, #(254 - 1) + bhi LSYM(Lml_u) - @ Apply proper rounding. + @ Round the result, merge final exponent. cmp r3, r1 - addcs r0, r0, #1 + adc r0, r0, r2, lsl #23 biceq r0, r0, #1 - - @ Add exponent to result. - bic r0, r0, #0x00800000 - orr r0, r0, r2, lsl #1 RET @ Division by 0x1p*: let''s shortcut a lot of code. LSYM(Ldv_1): and ip, ip, #0x80000000 orr r0, ip, r0, lsr #9 - add r2, r2, #(127 << 22) - cmp r2, #(255 << 22) - bge LSYM(Lml_o) - cmp r2, #0 - orrgt r0, r0, r2, lsl #1 + adds r2, r2, #127 + rsbgts r3, r2, #255 + orrgt r0, r0, r2, lsl #23 RETc(gt) - cmn r2, #(24 << 22) - movle r0, ip - RETc(le) + orr r0, r0, #0x00800000 mov r3, #0 - - @ Result must be denormalized: prepare parameters to use code above. - @ r3 already contains remainder for rounding considerations. -LSYM(Ldv_u): - bic ip, r0, #0x80000000 - and r0, r0, #0x80000000 - mvn r1, r2, asr #22 - add r1, r1, #2 - b LSYM(Lml_ur) + subs r2, r2, #1 + b LSYM(Lml_u) @ One or both arguments are denormalized. @ Scale them leftwards and preserve sign bit. @@ -730,35 +674,40 @@ LSYM(Ldv_d): and ip, r0, #0x80000000 1: moveq r0, r0, lsl #1 tsteq r0, #0x00800000 - subeq r2, r2, #(1 << 22) + subeq r2, r2, #1 beq 1b orr r0, r0, ip teq r3, #0 and ip, r1, #0x80000000 2: moveq r1, r1, lsl #1 tsteq r1, #0x00800000 - subeq r3, r3, #(1 << 23) + subeq r3, r3, #1 beq 2b orr r1, r1, ip b LSYM(Ldv_x) - @ One or both arguments is either INF, NAN or zero. + @ One or both arguments are either INF, NAN, zero or denormalized. LSYM(Ldv_s): - mov ip, #0xff000000 - teq r2, ip, lsr #1 - teqeq r3, ip, lsr #1 - beq LSYM(Lml_n) @ INF/NAN / INF/NAN -> NAN - teq r2, ip, lsr #1 + and r3, ip, r1, lsr #23 + teq r2, ip bne 1f movs r2, r0, lsl #9 bne LSYM(Lml_n) @ NAN / -> NAN - b LSYM(Lml_i) @ INF / -> INF -1: teq r3, ip, lsr #1 + teq r3, ip + bne LSYM(Lml_i) @ INF / -> INF + mov r0, r1 + b LSYM(Lml_n) @ INF / (INF or NAN) -> NAN +1: teq r3, ip bne 2f movs r3, r1, lsl #9 - bne LSYM(Lml_n) @ / NAN -> NAN - b LSYM(Lml_z) @ / INF -> 0 -2: @ One or both arguments are 0. + beq LSYM(Lml_z) @ / INF -> 0 + mov r0, r1 + b LSYM(Lml_n) @ / NAN -> NAN +2: @ If both are non-zero, we need to normalize and resume above. + bics ip, r0, #0x80000000 + bicnes ip, r1, #0x80000000 + bne LSYM(Ldv_d) + @ One or both arguments are zero. 
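The division loop above is a classic restoring division unrolled to retire four quotient bits per pass: the remainder is tested against the divisor at four successive alignments, each hit subtracts and sets the corresponding quotient bit, and the remainder is then scaled up by 16 for the next pass. A C model of that scheme; it assumes operands aligned the way the code above sets them up (divisor with its leading 1 at bit 28, so the right shifts and the left scaling never lose or overflow bits), and the names are illustrative.

#include <stdint.h>

static uint32_t divide_mantissa(uint32_t dividend, uint32_t divisor,
                                uint32_t *rem, int nbits)
{
    uint32_t q = 0, r = dividend;

    for (int done = 0; done < nbits; done += 4) {
        for (int i = 0; i < 4; i++) {         /* four alignments per pass */
            q <<= 1;
            if (r >= (divisor >> i)) {
                r -= divisor >> i;
                q |= 1;
            }
        }
        r <<= 4;                              /* rescale the remainder */
    }
    *rem = r;                                 /* kept for the rounding step */
    return q;
}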
bics r2, r0, #0x80000000 bne LSYM(Lml_i) @ / 0 -> INF bics r3, r1, #0x80000000 @@ -789,85 +738,50 @@ LSYM(Ldv_s): ARM_FUNC_START gtsf2 ARM_FUNC_ALIAS gesf2 gtsf2 - mov r3, #-1 + mov ip, #-1 b 1f ARM_FUNC_START ltsf2 ARM_FUNC_ALIAS lesf2 ltsf2 - mov r3, #1 + mov ip, #1 b 1f ARM_FUNC_START cmpsf2 ARM_FUNC_ALIAS nesf2 cmpsf2 ARM_FUNC_ALIAS eqsf2 cmpsf2 - mov r3, #1 @ how should we specify unordered here? + mov ip, #1 @ how should we specify unordered here? - @ Both Inf and NaN have an exponent of 255. Therefore, we - @ compute (r1 & 0x8f80000) || (r2 & 0x8f8000). -1: mov ip, #0xff000000 - and r2, r1, ip, lsr #1 - teq r2, ip, lsr #1 - and r2, r0, ip, lsr #1 - teqne r2, ip, lsr #1 +1: str ip, [sp, #-4] + + @ Trap any INF/NAN first. + mov r2, r0, lsl #1 + mov r3, r1, lsl #1 + mvns ip, r2, asr #24 + mvnnes ip, r3, asr #24 beq 3f - @ Test for equality. The representations of +0.0 and -0.0 - @ have all bits set to zero, except for the sign bit. Since - @ 0.0 is equal to -0.0, we begin by testing - @ ((r0 | r1) & ~0x8000000). -2: orr r3, r0, r1 - @ If the result of the bitwise and is zero, then the Z flag - @ will be set. In any case, the C flag will be set. - bics r3, r3, #0x80000000 @ either 0.0 or -0.0 - teqne r0, r1 @ or both the same - @ If the Z flag is set, the two operands were equal. Return zero. - moveq r0, #0 - RETc(eq) + @ Compare values. + @ Note that 0.0 is equal to -0.0. +2: orrs ip, r2, r3, lsr #1 @ test if both are 0, clear C flag + teqne r0, r1 @ if not 0 compare sign + subpls r0, r2, r3 @ if same sign compare values, set r0 - @ Check for sign difference. The N flag is set (due to the - @ use of teq above) if the sign bit is set on exactly one - @ of the operands. Return the sign of the first operand. - movmi r0, r0, asr #31 - orrmi r0, r0, #1 - RETc(mi) - - @ Compare exponents. - and r3, r1, ip, lsr #1 - cmp r2, r3 - - @ Compare mantissa if exponents are equal - moveq r0, r0, lsl #9 - cmpeq r0, r1, lsl #9 - - @ We know the operands cannot be equal at this point, so the - @ Z flag is clear. The C flag is set if the first operand has - @ the greater exponent, or the exponents are equal and the - @ first operand has the greater mantissa. Therefore, if the C - @ flag is set, the first operand is greater iff the sign is - @ positive. These next two instructions will put zero in - @ r0 if the first operand is greater, and -1 if the second - @ operand is greater. - movcs r0, r1, asr #31 - mvncc r0, r1, asr #31 - @ If r0 is 0, the first operand is greater, so return 1. Leave - @ -1 unchanged. - orr r0, r0, #1 + @ Result: + movhi r0, r1, asr #31 + mvnlo r0, r1, asr #31 + orrne r0, r0, #1 RET - @ We know that at least one argument is either Inf or NaN. - @ Look for a NaN. -3: and r2, r1, ip, lsr #1 - teq r2, ip, lsr #1 + @ Look for a NAN. +3: mvns ip, r2, asr #24 bne 4f - movs r2, r1, lsl #9 - bne 5f @ r1 is NAN -4: and r2, r0, ip, lsr #1 - teq r2, ip, lsr #1 - bne 2b movs ip, r0, lsl #9 - beq 2b @ r0 is not NAN -5: @ The Z flag is clear at this point. - mov r0, r3 @ return unordered code from r3. + bne 5f @ r0 is NAN +4: mvns ip, r3, asr #24 + bne 2b + movs ip, r1, lsl #9 + beq 2b @ r1 is not NAN +5: ldr r0, [sp, #-4] @ return unordered code. RET FUNC_END gesf2 @@ -879,13 +793,15 @@ ARM_FUNC_ALIAS eqsf2 cmpsf2 FUNC_END cmpsf2 ARM_FUNC_START aeabi_cfrcmple + mov ip, r0 mov r0, r1 mov r1, ip b 6f - + ARM_FUNC_START aeabi_cfcmpeq ARM_FUNC_ALIAS aeabi_cfcmple aeabi_cfcmpeq + @ The status-returning routines are required to preserve all @ registers except ip, lr, and cpsr. 
@@ -879,13 +793,15 @@ ARM_FUNC_ALIAS eqsf2 cmpsf2
 	FUNC_END cmpsf2
 
 ARM_FUNC_START aeabi_cfrcmple
+
 	mov	ip, r0
 	mov	r0, r1
 	mov	r1, ip
 	b	6f
-
+
 ARM_FUNC_START aeabi_cfcmpeq
 ARM_FUNC_ALIAS aeabi_cfcmple aeabi_cfcmpeq
+
 	@ The status-returning routines are required to preserve all
 	@ registers except ip, lr, and cpsr.
 6:	stmfd	sp!, {r0, r1, r2, r3, lr}
@@ -896,68 +812,79 @@ ARM_FUNC_ALIAS aeabi_cfcmple aeabi_cfcmpeq
 	@ that the first operand was smaller than the second.
 	cmnmi	r0, #0
 	RETLDM	"r0, r1, r2, r3"
+
 	FUNC_END aeabi_cfcmple
 	FUNC_END aeabi_cfcmpeq
-
+	FUNC_END aeabi_cfrcmple
+
 ARM_FUNC_START aeabi_fcmpeq
+	str	lr, [sp, #-4]!
 	ARM_CALL aeabi_cfcmple
 	moveq	r0, #1			@ Equal to.
 	movne	r0, #0			@ Less than, greater than, or unordered.
 	RETLDM
+
 	FUNC_END aeabi_fcmpeq
 
 ARM_FUNC_START aeabi_fcmplt
+	str	lr, [sp, #-4]!
 	ARM_CALL aeabi_cfcmple
 	movcc	r0, #1			@ Less than.
 	movcs	r0, #0			@ Equal to, greater than, or unordered.
 	RETLDM
+
 	FUNC_END aeabi_fcmplt
 
 ARM_FUNC_START aeabi_fcmple
+	str	lr, [sp, #-4]!
 	ARM_CALL aeabi_cfcmple
 	movls	r0, #1			@ Less than or equal to.
 	movhi	r0, #0			@ Greater than or unordered.
 	RETLDM
+
	FUNC_END aeabi_fcmple
 
 ARM_FUNC_START aeabi_fcmpge
+	str	lr, [sp, #-4]!
 	ARM_CALL aeabi_cfrcmple
 	movls	r0, #1			@ Operand 2 is less than or equal to operand 1.
 	movhi	r0, #0			@ Operand 2 greater than operand 1, or unordered.
 	RETLDM
+
 	FUNC_END aeabi_fcmpge
 
 ARM_FUNC_START aeabi_fcmpgt
+	str	lr, [sp, #-4]!
 	ARM_CALL aeabi_cfrcmple
 	movcc	r0, #1			@ Operand 2 is less than operand 1.
 	movcs	r0, #0			@ Operand 2 is greater than or equal to operand 1,
 					@ or they are unordered.
 	RETLDM
+
 	FUNC_END aeabi_fcmpgt
-
+
 #endif /* L_cmpsf2 */
 
 #ifdef L_unordsf2
 
 ARM_FUNC_START unordsf2
 ARM_FUNC_ALIAS aeabi_fcmpun unordsf2
-
-	mov	ip, #0xff000000
-	and	r2, r1, ip, lsr #1
-	teq	r2, ip, lsr #1
+
+	mov	r2, r0, lsl #1
+	mov	r3, r1, lsl #1
+	mvns	ip, r2, asr #24
 	bne	1f
-	movs	r2, r1, lsl #9
-	bne	3f			@ r1 is NAN
-1:	and	r2, r0, ip, lsr #1
-	teq	r2, ip, lsr #1
-	bne	2f
-	movs	r2, r0, lsl #9
+	movs	ip, r0, lsl #9
 	bne	3f			@ r0 is NAN
+1:	mvns	ip, r3, asr #24
+	bne	2f
+	movs	ip, r1, lsl #9
+	bne	3f			@ r1 is NAN
 2:	mov	r0, #0			@ arguments are ordered.
 	RET
 3:	mov	r0, #1			@ arguments are unordered.
@@ -972,37 +899,35 @@ ARM_FUNC_ALIAS aeabi_fcmpun unordsf2
 
 ARM_FUNC_START fixsfsi
 ARM_FUNC_ALIAS aeabi_f2iz fixsfsi
-	movs	r0, r0, lsl #1
-	RETc(eq)			@ value is 0.
-
-	mov	r1, r1, rrx		@ preserve C flag (the actual sign)
 
 	@ check exponent range.
-	and	r2, r0, #0xff000000
+	mov	r2, r0, lsl #1
 	cmp	r2, #(127 << 24)
-	movcc	r0, #0			@ value is too small
-	RETc(cc)
-	cmp	r2, #((127 + 31) << 24)
-	bcs	1f			@ value is too large
+	bcc	1f			@ value is too small
+	mov	r3, #(127 + 31)
+	subs	r2, r3, r2, lsr #24
+	bls	2f			@ value is too large
 
-	mov	r0, r0, lsl #7
-	orr	r0, r0, #0x80000000
-	mov	r2, r2, lsr #24
-	rsb	r2, r2, #(127 + 31)
-	tst	r1, #0x80000000		@ the sign bit
-	mov	r0, r0, lsr r2
+	@ scale value
+	mov	r3, r0, lsl #8
+	orr	r3, r3, #0x80000000
+	tst	r0, #0x80000000		@ the sign bit
+	mov	r0, r3, lsr r2
 	rsbne	r0, r0, #0
 	RET
 
-1:	teq	r2, #0xff000000
-	bne	2f
-	movs	r0, r0, lsl #8
-	bne	3f			@ r0 is NAN.
-2:	ands	r0, r1, #0x80000000	@ the sign bit
+1:	mov	r0, #0
+	RET
+
+2:	cmp	r2, #(127 + 31 - 0xff)
+	bne	3f
+	movs	r2, r0, lsl #9
+	bne	4f			@ r0 is NAN.
+3:	ands	r0, r0, #0x80000000	@ the sign bit
 	moveq	r0, #0x7fffffff		@ the maximum signed positive si
 	RET
 
-3:	mov	r0, #0			@ What should we convert NAN to?
+4:	mov	r0, #0			@ What should we convert NAN to?
 	RET
 
 	FUNC_END aeabi_f2iz
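The new fixsfsi works directly on the operand shifted left by one: a biased exponent below 127 truncates to 0, exponents of 158 and above saturate (after the NAN check), and in-range values are rebuilt as a 1.xx mantissa at bit 31 and shifted right by 158 minus the exponent. A small C model of this behaviour follows, under a hypothetical name; the real entry points are fixsfsi / aeabi_f2iz.

    #include <stdint.h>

    /* Hedged C model of the truncating float -> int32 conversion above. */
    static int32_t fixsfsi_model(uint32_t x)
    {
        uint32_t exp = (x << 1) >> 24;              /* biased exponent */

        if (exp < 127)                              /* |value| < 1.0 */
            return 0;
        if (exp >= 127 + 31) {                      /* overflow, INF or NAN */
            if (exp == 0xff && (x << 9) != 0)
                return 0;                           /* NAN -> 0, same choice as the asm */
            return (x >> 31) ? INT32_MIN : INT32_MAX;
        }

        /* Mantissa with its implicit leading 1 placed at bit 31, then
           shifted down by (158 - exp) to truncate toward zero. */
        uint32_t mant = (x << 8) | 0x80000000u;
        int32_t  val  = (int32_t)(mant >> (127 + 31 - exp));
        return (x >> 31) ? -val : val;
    }

As in the assembly, out-of-range values saturate to INT32_MAX or INT32_MIN and NAN is arbitrarily converted to 0.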
@@ -1014,34 +939,33 @@ ARM_FUNC_ALIAS aeabi_f2iz fixsfsi
 
 ARM_FUNC_START fixunssfsi
 ARM_FUNC_ALIAS aeabi_f2uiz fixunssfsi
-	movs	r0, r0, lsl #1
-	movcss	r0, #0			@ value is negative...
-	RETc(eq)			@ ... or 0.
-
 	@ check exponent range.
-	and	r2, r0, #0xff000000
+	movs	r2, r0, lsl #1
+	bcs	1f			@ value is negative
 	cmp	r2, #(127 << 24)
-	movcc	r0, #0			@ value is too small
-	RETc(cc)
-	cmp	r2, #((127 + 32) << 24)
-	bcs	1f			@ value is too large
+	bcc	1f			@ value is too small
+	mov	r3, #(127 + 31)
+	subs	r2, r3, r2, lsr #24
+	bmi	2f			@ value is too large
 
-	mov	r0, r0, lsl #7
-	orr	r0, r0, #0x80000000
-	mov	r2, r2, lsr #24
-	rsb	r2, r2, #(127 + 31)
-	mov	r0, r0, lsr r2
+	@ scale the value
+	mov	r3, r0, lsl #8
+	orr	r3, r3, #0x80000000
+	mov	r0, r3, lsr r2
 	RET
 
-1:	teq	r2, #0xff000000
-	bne	2f
-	movs	r0, r0, lsl #8
-	bne	3f			@ r0 is NAN.
-2:	mov	r0, #0xffffffff		@ maximum unsigned si
+1:	mov	r0, #0
 	RET
 
-3:	mov	r0, #0			@ What should we convert NAN to?
+2:	cmp	r2, #(127 + 31 - 0xff)
+	bne	3f
+	movs	r2, r0, lsl #9
+	bne	4f			@ r0 is NAN.
+3:	mov	r0, #0xffffffff		@ maximum unsigned si
+	RET
+
+4:	mov	r0, #0			@ What should we convert NAN to?
 	RET
 
 	FUNC_END aeabi_f2uiz
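The unsigned variant follows the same pattern with three differences visible above: negative inputs (carry set by the first movs) go to 0, the usable exponent range is one step larger since all 32 result bits are available, and overflow saturates to 0xffffffff. A matching hedged C sketch, again under a made-up helper name:

    #include <stdint.h>

    /* Hedged C model of the truncating float -> uint32 conversion above. */
    static uint32_t fixunssfsi_model(uint32_t x)
    {
        uint32_t exp = (x << 1) >> 24;          /* biased exponent */

        if ((x >> 31) || exp < 127)             /* negative or |value| < 1.0 */
            return 0;
        if (exp >= 127 + 32) {                  /* too large, INF or NAN */
            if (exp == 0xff && (x << 9) != 0)
                return 0;                       /* NAN -> 0 */
            return 0xffffffffu;
        }
        uint32_t mant = (x << 8) | 0x80000000u; /* implicit 1 at bit 31 */
        return mant >> (127 + 31 - exp);        /* shift is 0..31 here */
    }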