From 6883a6662fc7728a539230ea1b02efd47815d705 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Thu, 21 Oct 2004 04:22:01 +0000 Subject: [PATCH] ieee754-sf.S: Large speed improvements. * config/arm/ieee754-sf.S: Large speed improvements. Fix NAN handling. * config/arm/ieee754-df.S: Ditto. From-SVN: r89364 --- gcc/ChangeLog | 5 + gcc/config/arm/ieee754-df.S | 971 +++++++++++++++++------------------- gcc/config/arm/ieee754-sf.S | 870 +++++++++++++++----------------- 3 files changed, 857 insertions(+), 989 deletions(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 7b3b80ce182..e5265e70a0d 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,8 @@ +2004-10-21 Nicolas Pitre + + * config/arm/ieee754-sf.S: Large speed improvements. Fix NAN handling. + * config/arm/ieee754-df.S: Ditto. + 2004-10-20 Zack Weinberg * dbxout.c (asmfile): Delete. All uses changed to asm_out_file. diff --git a/gcc/config/arm/ieee754-df.S b/gcc/config/arm/ieee754-df.S index af32b9e2c08..b9cf52e6458 100644 --- a/gcc/config/arm/ieee754-df.S +++ b/gcc/config/arm/ieee754-df.S @@ -60,6 +60,7 @@ ARM_FUNC_START negdf2 ARM_FUNC_ALIAS aeabi_dneg negdf2 + @ flip sign bit eor xh, xh, #0x80000000 RET @@ -76,10 +77,10 @@ ARM_FUNC_START aeabi_drsub eor xh, xh, #0x80000000 @ flip sign bit of first arg b 1f - ARM_FUNC_START subdf3 +ARM_FUNC_START subdf3 ARM_FUNC_ALIAS aeabi_dsub subdf3 - @ flip sign bit of second arg - eor yh, yh, #0x80000000 + + eor yh, yh, #0x80000000 @ flip sign bit of second arg #if defined(__thumb__) && !defined(__THUMB_INTERWORK__) b 1f @ Skip Thumb-code prologue #endif @@ -87,36 +88,23 @@ ARM_FUNC_ALIAS aeabi_dsub subdf3 ARM_FUNC_START adddf3 ARM_FUNC_ALIAS aeabi_dadd adddf3 -1: @ Compare both args, return zero if equal but the sign. - teq xl, yl - eoreq ip, xh, yh - teqeq ip, #0x80000000 - beq LSYM(Lad_z) +1: stmfd sp!, {r4, r5, lr} - @ If first arg is 0 or -0, return second arg. - @ If second arg is 0 or -0, return first arg. - orrs ip, xl, xh, lsl #1 - moveq xl, yl - moveq xh, yh - orrnes ip, yl, yh, lsl #1 - RETc(eq) - - stmfd sp!, {r4, r5, lr} - - @ Mask out exponents. - mov ip, #0x7f000000 - orr ip, ip, #0x00f00000 - and r4, xh, ip - and r5, yh, ip - - @ If either of them is 0x7ff, result will be INF or NAN - teq r4, ip - teqne r5, ip - beq LSYM(Lad_i) + @ Look for zeroes, equal values, INF, or NAN. + mov r4, xh, lsl #1 + mov r5, yh, lsl #1 + teq r4, r5 + teqeq xl, yl + orrnes ip, r4, xl + orrnes ip, r5, yl + mvnnes ip, r4, asr #21 + mvnnes ip, r5, asr #21 + beq LSYM(Lad_s) @ Compute exponent difference. Make largest exponent in r4, @ corresponding arg in xh-xl, and positive exponent difference in r5. - subs r5, r5, r4 + mov r4, r4, lsr #21 + rsbs r5, r4, r5, lsr #21 rsblt r5, r5, #0 ble 1f add r4, r4, r5 @@ -127,24 +115,24 @@ ARM_FUNC_ALIAS aeabi_dadd adddf3 eor yl, xl, yl eor yh, xh, yh 1: - @ If exponent difference is too large, return largest argument @ already in xh-xl. We need up to 54 bit to handle proper rounding @ of 0x1p54 - 1.1. - cmp r5, #(54 << 20) + cmp r5, #54 RETLDM "r4, r5" hi @ Convert mantissa to signed integer. 
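The addition path above works directly on the raw word pairs: it rules out the special encodings by shifting away the sign, then forms the exponent difference and returns the larger operand outright once the difference exceeds 54 bits. A minimal C model of that unpack-and-align step follows; all names are illustrative and not part of the library.

#include <stdint.h>

/* Hypothetical helper type; the real code keeps everything in registers. */
struct dbl_parts { int sign; int exp; uint64_t mant; };

static struct dbl_parts unpack(uint64_t bits)
{
    struct dbl_parts p;
    p.sign = (int)(bits >> 63);
    p.exp  = (int)((bits >> 52) & 0x7ff);       /* biased exponent */
    p.mant = bits & 0x000fffffffffffffULL;       /* 52 fraction bits */
    if (p.exp != 0)
        p.mant |= 1ULL << 52;                    /* implicit leading 1 */
    return p;
}

/* Swap so 'a' has the larger exponent, then align 'b' onto it.
   Returns 0 when the difference is beyond 54 bits, in which case the
   result is simply 'a', matching the early return above.  */
static int align_for_add(struct dbl_parts *a, struct dbl_parts *b)
{
    if (a->exp < b->exp) { struct dbl_parts t = *a; *a = *b; *b = t; }
    int diff = a->exp - b->exp;
    if (diff > 54)
        return 0;
    b->mant >>= diff;   /* the assembly also keeps the shifted-out bits for rounding */
    b->exp = a->exp;
    return 1;
}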
tst xh, #0x80000000 - bic xh, xh, ip, lsl #1 - orr xh, xh, #0x00100000 + mov xh, xh, lsl #12 + mov ip, #0x00100000 + orr xh, ip, xh, lsr #12 beq 1f rsbs xl, xl, #0 rsc xh, xh, #0 1: tst yh, #0x80000000 - bic yh, yh, ip, lsl #1 - orr yh, yh, #0x00100000 + mov yh, yh, lsl #12 + orr yh, ip, yh, lsr #12 beq 1f rsbs yl, yl, #0 rsc yh, yh, #0 @@ -154,42 +142,30 @@ ARM_FUNC_ALIAS aeabi_dadd adddf3 teq r4, r5 beq LSYM(Lad_d) LSYM(Lad_x): - @ Scale down second arg with exponent difference. - @ Apply shift one bit left to first arg and the rest to second arg - @ to simplify things later, but only if exponent does not become 0. - mov ip, #0 - movs r5, r5, lsr #20 - beq 3f - teq r4, #(1 << 20) - beq 1f - movs xl, xl, lsl #1 - adc xh, ip, xh, lsl #1 - sub r4, r4, #(1 << 20) - subs r5, r5, #1 - beq 3f - @ Shift yh-yl right per r5, keep leftover bits into ip. -1: rsbs lr, r5, #32 - blt 2f + @ Compensate for the exponent overlapping the mantissa MSB added later + sub r4, r4, #1 + + @ Shift yh-yl right per r5, add to xh-xl, keep leftover bits into ip. + rsbs lr, r5, #32 + blt 1f mov ip, yl, lsl lr - mov yl, yl, lsr r5 - orr yl, yl, yh, lsl lr - mov yh, yh, asr r5 - b 3f -2: sub r5, r5, #32 + adds xl, xl, yl, lsr r5 + adc xh, xh, #0 + adds xl, xl, yh, lsl lr + adcs xh, xh, yh, asr r5 + b 2f +1: sub r5, r5, #32 add lr, lr, #32 cmp yl, #1 - adc ip, ip, yh, lsl lr - mov yl, yh, asr r5 - mov yh, yh, asr #32 -3: - @ the actual addition - adds xl, xl, yl - adc xh, xh, yh - + mov ip, yh, lsl lr + orrcs ip, ip, #2 @ 2 not 1, to allow lsr #1 later + adds xl, xl, yh, asr r5 + adcs xh, xh, yh, asr #31 +2: @ We now have a result in xh-xl-ip. - @ Keep absolute value in xh-xl-ip, sign in r5. - ands r5, xh, #0x80000000 + @ Keep absolute value in xh-xl-ip, sign in r5 (the n bit was set above) + and r5, xh, #0x80000000 bpl LSYM(Lad_p) rsbs ip, ip, #0 rscs xl, xl, #0 @@ -198,75 +174,66 @@ LSYM(Lad_x): @ Determine how to normalize the result. LSYM(Lad_p): cmp xh, #0x00100000 - bcc LSYM(Lad_l) + bcc LSYM(Lad_a) cmp xh, #0x00200000 - bcc LSYM(Lad_r0) - cmp xh, #0x00400000 - bcc LSYM(Lad_r1) + bcc LSYM(Lad_e) @ Result needs to be shifted right. movs xh, xh, lsr #1 movs xl, xl, rrx - movs ip, ip, rrx - orrcs ip, ip, #1 - add r4, r4, #(1 << 20) -LSYM(Lad_r1): - movs xh, xh, lsr #1 - movs xl, xl, rrx - movs ip, ip, rrx - orrcs ip, ip, #1 - add r4, r4, #(1 << 20) + mov ip, ip, rrx + add r4, r4, #1 + + @ Make sure we did not bust our exponent. + mov r2, r4, lsl #21 + cmn r2, #(2 << 21) + bcs LSYM(Lad_o) @ Our result is now properly aligned into xh-xl, remaining bits in ip. @ Round with MSB of ip. If halfway between two numbers, round towards @ LSB of xl = 0. -LSYM(Lad_r0): - adds xl, xl, ip, lsr #31 - adc xh, xh, #0 - teq ip, #0x80000000 - biceq xl, xl, #1 - - @ One extreme rounding case may add a new MSB. Adjust exponent. - @ That MSB will be cleared when exponent is merged below. - tst xh, #0x00200000 - addne r4, r4, #(1 << 20) - - @ Make sure we did not bust our exponent. - adds ip, r4, #(1 << 20) - bmi LSYM(Lad_o) - @ Pack final result together. LSYM(Lad_e): - bic xh, xh, #0x00300000 - orr xh, xh, r4 + cmp ip, #0x80000000 + moveqs ip, xl, lsr #1 + adcs xl, xl, #0 + adc xh, xh, r4, lsl #20 orr xh, xh, r5 RETLDM "r4, r5" -LSYM(Lad_l): @ Result must be shifted left and exponent adjusted. - @ No rounding necessary since ip will always be 0. +LSYM(Lad_a): + movs ip, ip, lsl #1 + adcs xl, xl, xl + adc xh, xh, xh + tst xh, #0x00100000 + sub r4, r4, #1 + bne LSYM(Lad_e) + + @ No rounding necessary since ip will always be 0 at this point. 
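The cmp/moveqs/adcs sequence in LSYM(Lad_e) above implements round to nearest with ties to even: the guard bit is folded in through the carry flag, and on an exact tie the carry is replaced by the result's own least significant bit. A small C sketch of the same policy, with illustrative names only:

#include <stdint.h>

/* 'mant' is the 53-bit result, 'rest' holds the discarded bits with the
   guard bit in its MSB, playing the role of register ip above.  */
static uint64_t round_nearest_even(uint64_t mant, uint32_t rest)
{
    uint64_t r = mant + (rest >> 31);    /* add the guard bit */
    if (rest == 0x80000000u)             /* exactly halfway between two values */
        r &= ~1ULL;                      /* force the LSB to 0: ties to even */
    return r;
}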
+LSYM(Lad_l): + #if __ARM_ARCH__ < 5 teq xh, #0 - movne r3, #-11 - moveq r3, #21 + movne r3, #20 + moveq r3, #52 moveq xh, xl moveq xl, #0 mov r2, xh - movs ip, xh, lsr #16 - moveq r2, r2, lsl #16 - addeq r3, r3, #16 - tst r2, #0xff000000 - moveq r2, r2, lsl #8 - addeq r3, r3, #8 - tst r2, #0xf0000000 - moveq r2, r2, lsl #4 - addeq r3, r3, #4 - tst r2, #0xc0000000 - moveq r2, r2, lsl #2 - addeq r3, r3, #2 - tst r2, #0x80000000 - addeq r3, r3, #1 + cmp r2, #(1 << 16) + movhs r2, r2, lsr #16 + subhs r3, r3, #16 + cmp r2, #(1 << 8) + movhs r2, r2, lsr #8 + subhs r3, r3, #8 + cmp r2, #(1 << 4) + movhs r2, r2, lsr #4 + subhs r3, r3, #4 + cmp r2, #(1 << 2) + subhs r3, r3, #2 + sublo r3, r3, r2, lsr #1 + sub r3, r3, r2, lsr #3 #else @@ -302,13 +269,15 @@ LSYM(Lad_l): movle xl, xl, lsl r2 @ adjust exponent accordingly. -3: subs r4, r4, r3, lsl #20 - bgt LSYM(Lad_e) +3: subs r4, r4, r3 + addge xh, xh, r4, lsl #20 + orrge xh, xh, r5 + RETLDM "r4, r5" ge @ Exponent too small, denormalize result. @ Find out proper shift value. - mvn r4, r4, asr #20 - subs r4, r4, #30 + mvn r4, r4 + subs r4, r4, #31 bge 2f adds r4, r4, #12 bgt 1f @@ -337,23 +306,49 @@ LSYM(Lad_l): RETLDM "r4, r5" @ Adjust exponents for denormalized arguments. + @ Note that r4 must not remain equal to 0. LSYM(Lad_d): teq r4, #0 - eoreq xh, xh, #0x00100000 - addeq r4, r4, #(1 << 20) eor yh, yh, #0x00100000 - subne r5, r5, #(1 << 20) + eoreq xh, xh, #0x00100000 + addeq r4, r4, #1 + subne r5, r5, #1 b LSYM(Lad_x) - @ Result is x - x = 0, unless x = INF or NAN. -LSYM(Lad_z): - sub ip, ip, #0x00100000 @ ip becomes 0x7ff00000 - and r2, xh, ip - teq r2, ip - orreq xh, ip, #0x00080000 + +LSYM(Lad_s): + mvns ip, r4, asr #21 + mvnnes ip, r5, asr #21 + beq LSYM(Lad_i) + + teq r4, r5 + teqeq xl, yl + beq 1f + + @ Result is x + 0.0 = x or 0.0 + y = y. + teq r4, #0 + moveq xh, yh + moveq xl, yl + RETLDM "r4, r5" + +1: teq xh, yh + + @ Result is x - x = 0. movne xh, #0 - mov xl, #0 - RET + movne xl, #0 + RETLDM "r4, r5" ne + + @ Result is x + x = 2x. + movs ip, r4, lsr #21 + bne 2f + movs xl, xl, lsl #1 + adcs xh, xh, xh + orrcs xh, xh, #0x80000000 + RETLDM "r4, r5" +2: adds r4, r4, #(2 << 21) + addcc xh, xh, #(1 << 20) + RETLDM "r4, r5" cc + and r5, xh, #0x80000000 @ Overflow: return INF. 
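On pre-ARMv5 cores the normalisation above has no clz instruction to lean on, so it locates the leading 1 with a short chain of range comparisons and derives the left shift from its position. An illustrative C version of that search; the bit-20 target comes from the double layout, whose high word carries 20 fraction bits, and the names are not from the source.

#include <stdint.h>

static int msb_index(uint32_t x)   /* x != 0; returns 0..31 */
{
    int n = 0;
    if (x >= 1u << 16) { x >>= 16; n += 16; }
    if (x >= 1u << 8)  { x >>= 8;  n += 8;  }
    if (x >= 1u << 4)  { x >>= 4;  n += 4;  }
    if (x >= 1u << 2)  { x >>= 2;  n += 2;  }
    return n + (x >> 1);           /* x is now 1, 2 or 3 */
}

/* Left shift that brings the leading 1 onto the implicit-bit position
   (bit 20 of the high word, i.e. bit 52 of the whole mantissa).  In this
   path the high word is already below the implicit bit, so the shift is
   never negative.  */
static int normalize_shift(uint32_t hi, uint32_t lo)
{
    return hi ? 20 - msb_index(hi) : 52 - msb_index(lo);
}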
LSYM(Lad_o): @@ -367,19 +362,18 @@ LSYM(Lad_o): @ if yh-yl != INF/NAN: return xh-xl (which is INF/NAN) @ if either is NAN: return NAN @ if opposite sign: return NAN - @ return xh-xl (which is INF or -INF) + @ otherwise return xh-xl (which is INF or -INF) LSYM(Lad_i): - teq r4, ip + mvns ip, r4, asr #21 movne xh, yh movne xl, yl - teqeq r5, ip - RETLDM "r4, r5" ne - + mvneqs ip, r5, asr #21 + movne yh, xh + movne yl, xl orrs r4, xl, xh, lsl #12 - orreqs r4, yl, yh, lsl #12 + orreqs r5, yl, yh, lsl #12 teqeq xh, yh - orrne xh, r5, #0x00080000 - movne xl, #0 + orrne xh, xh, #0x00080000 @ quiet NAN RETLDM "r4, r5" FUNC_END aeabi_dsub @@ -389,14 +383,17 @@ LSYM(Lad_i): ARM_FUNC_START floatunsidf ARM_FUNC_ALIAS aeabi_ui2d floatunsidf + teq r0, #0 moveq r1, #0 RETc(eq) stmfd sp!, {r4, r5, lr} - mov r4, #(0x400 << 20) @ initial exponent - add r4, r4, #((52-1) << 20) + mov r4, #0x400 @ initial exponent + add r4, r4, #(52-1 - 1) mov r5, #0 @ sign bit is 0 + .ifnc xl, r0 mov xl, r0 + .endif mov xh, #0 b LSYM(Lad_l) @@ -405,15 +402,18 @@ ARM_FUNC_ALIAS aeabi_ui2d floatunsidf ARM_FUNC_START floatsidf ARM_FUNC_ALIAS aeabi_i2d floatsidf + teq r0, #0 moveq r1, #0 RETc(eq) stmfd sp!, {r4, r5, lr} - mov r4, #(0x400 << 20) @ initial exponent - add r4, r4, #((52-1) << 20) + mov r4, #0x400 @ initial exponent + add r4, r4, #(52-1 - 1) ands r5, r0, #0x80000000 @ sign bit in r5 rsbmi r0, r0, #0 @ absolute value + .ifnc xl, r0 mov xl, r0 + .endif mov xh, #0 b LSYM(Lad_l) @@ -422,26 +422,23 @@ ARM_FUNC_ALIAS aeabi_i2d floatsidf ARM_FUNC_START extendsfdf2 ARM_FUNC_ALIAS aeabi_f2d extendsfdf2 - - movs r2, r0, lsl #1 - beq 1f @ value is 0.0 or -0.0 + + movs r2, r0, lsl #1 @ toss sign bit mov xh, r2, asr #3 @ stretch exponent mov xh, xh, rrx @ retrieve sign bit mov xl, r2, lsl #28 @ retrieve remaining bits - ands r2, r2, #0xff000000 @ isolate exponent - beq 2f @ exponent was 0 but not mantissa - teq r2, #0xff000000 @ check if INF or NAN + andnes r3, r2, #0xff000000 @ isolate exponent + teqne r3, #0xff000000 @ if not 0, check if INF or NAN eorne xh, xh, #0x38000000 @ fixup exponent otherwise. - RET + RETc(ne) @ and return it. -1: mov xh, r0 - mov xl, #0 - RET + teq r2, #0 @ if actually 0 + teqne r3, #0xff000000 @ or INF or NAN + RETc(eq) @ we are done already. -2: @ value was denormalized. We can normalize it now. + @ value was denormalized. We can normalize it now. stmfd sp!, {r4, r5, lr} - mov r4, #(0x380 << 20) @ setup corresponding exponent - add r4, r4, #(1 << 20) + mov r4, #0x380 @ setup corresponding exponent and r5, xh, #0x80000000 @ move sign bit in r5 bic xh, xh, #0x80000000 b LSYM(Lad_l) @@ -451,76 +448,90 @@ ARM_FUNC_ALIAS aeabi_f2d extendsfdf2 ARM_FUNC_START floatundidf ARM_FUNC_ALIAS aeabi_ul2d floatundidf - + orrs r2, r0, r1 #if !defined (__VFP_FP__) && !defined(__SOFTFP__) mvfeqd f0, #0.0 #endif RETc(eq) + #if !defined (__VFP_FP__) && !defined(__SOFTFP__) @ For hard FPA code we want to return via the tail below so that @ we can return the result in f0 as well as in r0/r1 for backwards @ compatibility. - adr ip, 1f + adr ip, LSYM(f0_ret) stmfd sp!, {r4, r5, ip, lr} #else stmfd sp!, {r4, r5, lr} #endif + mov r5, #0 b 2f ARM_FUNC_START floatdidf ARM_FUNC_ALIAS aeabi_l2d floatdidf + orrs r2, r0, r1 #if !defined (__VFP_FP__) && !defined(__SOFTFP__) mvfeqd f0, #0.0 #endif RETc(eq) + #if !defined (__VFP_FP__) && !defined(__SOFTFP__) @ For hard FPA code we want to return via the tail below so that @ we can return the result in f0 as well as in r0/r1 for backwards @ compatibility. 
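The extendsfdf2 code above widens a float by stretching the exponent field and fixing the bias with an eor of 0x38000000; zeros, infinities, NaNs and denormals take the separate paths shown. The same normal-number case written out plainly in C, as a sketch rather than the shipped algorithm:

#include <stdint.h>

static uint64_t f2d_bits(uint32_t f)
{
    uint32_t sign = f >> 31;
    uint32_t exp  = (f >> 23) & 0xff;
    uint32_t frac = f & 0x007fffff;

    /* Only the normal case is modelled; exp == 0 (zero/denormal) and
       exp == 255 (INF/NaN) are handled by the dedicated paths above.  */
    uint64_t dexp = exp + (1023 - 127);   /* rebias: the 0x380 adjustment */
    return ((uint64_t)sign << 63) | (dexp << 52) | ((uint64_t)frac << 29);
}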
- adr ip, 1f + adr ip, LSYM(f0_ret) stmfd sp!, {r4, r5, ip, lr} #else stmfd sp!, {r4, r5, lr} #endif + ands r5, ah, #0x80000000 @ sign bit in r5 bpl 2f rsbs al, al, #0 rsc ah, ah, #0 2: - mov r4, #(0x400 << 20) @ initial exponent - add r4, r4, #((52 - 1) << 20) -#if !defined (__VFP_FP__) && !defined(__ARMEB__) + mov r4, #0x400 @ initial exponent + add r4, r4, #(52-1 - 1) + @ FPA little-endian: must swap the word order. + .ifnc xh, ah mov ip, al mov xh, ah mov xl, ip -#endif - movs ip, xh, lsr #23 + .endif + + movs ip, xh, lsr #22 beq LSYM(Lad_p) - @ The value's too big. Scale it down a bit... + + @ The value is too big. Scale it down a bit... mov r2, #3 movs ip, ip, lsr #3 addne r2, r2, #3 movs ip, ip, lsr #3 addne r2, r2, #3 + add r2, r2, ip + rsb r3, r2, #32 mov ip, xl, lsl r3 mov xl, xl, lsr r2 orr xl, xl, xh, lsl r3 mov xh, xh, lsr r2 - add r4, r4, r2, lsl #20 + add r4, r4, r2 b LSYM(Lad_p) + #if !defined (__VFP_FP__) && !defined(__SOFTFP__) -1: + @ Legacy code expects the result to be returned in f0. Copy it @ there as well. +LSYM(f0_ret): stmfd sp!, {r0, r1} ldfd f0, [sp], #8 RETLDM + #endif + FUNC_END floatdidf FUNC_END aeabi_l2d FUNC_END floatundidf @@ -534,46 +545,38 @@ ARM_FUNC_START muldf3 ARM_FUNC_ALIAS aeabi_dmul muldf3 stmfd sp!, {r4, r5, r6, lr} - @ Mask out exponents. - mov ip, #0x7f000000 - orr ip, ip, #0x00f00000 - and r4, xh, ip - and r5, yh, ip - - @ Trap any INF/NAN. - teq r4, ip + @ Mask out exponents, trap any zero/denormal/INF/NAN. + mov ip, #0xff + orr ip, ip, #0x700 + ands r4, ip, xh, lsr #20 + andnes r5, ip, yh, lsr #20 + teqne r4, ip teqne r5, ip - beq LSYM(Lml_s) + bleq LSYM(Lml_s) - @ Trap any multiplication by 0. - orrs r6, xl, xh, lsl #1 - orrnes r6, yl, yh, lsl #1 - beq LSYM(Lml_z) + @ Add exponents together + add r4, r4, r5 - @ Shift exponents right one bit to make room for overflow bit. - @ If either of them is 0, scale denormalized arguments off line. - @ Then add both exponents together. - movs r4, r4, lsr #1 - teqne r5, #0 - beq LSYM(Lml_d) -LSYM(Lml_x): - add r4, r4, r5, asr #1 - - @ Preserve final sign in r4 along with exponent for now. - teq xh, yh - orrmi r4, r4, #0x8000 + @ Determine final sign. + eor r6, xh, yh @ Convert mantissa to unsigned integer. - bic xh, xh, ip, lsl #1 - bic yh, yh, ip, lsl #1 + @ If power of two, branch to a separate path. + bic xh, xh, ip, lsl #21 + bic yh, yh, ip, lsl #21 + orrs r5, xl, xh, lsl #12 + orrnes r5, yl, yh, lsl #12 orr xh, xh, #0x00100000 orr yh, yh, #0x00100000 + beq LSYM(Lml_1) #if __ARM_ARCH__ < 4 + @ Put sign bit in r6, which will be restored in yl later. + and r6, r6, #0x80000000 + @ Well, no way to make it shorter without the umull instruction. - @ We must perform that 53 x 53 bit multiplication by hand. - stmfd sp!, {r7, r8, r9, sl, fp} + stmfd sp!, {r6, r7, r8, r9, sl, fp} mov r7, xl, lsr #16 mov r8, yl, lsr #16 mov r9, xh, lsr #16 @@ -625,92 +628,83 @@ LSYM(Lml_x): mul fp, xh, yh adcs r5, r5, fp adc r6, r6, #0 - ldmfd sp!, {r7, r8, r9, sl, fp} + ldmfd sp!, {yl, r7, r8, r9, sl, fp} #else - @ Here is the actual multiplication: 53 bits * 53 bits -> 106 bits. + @ Here is the actual multiplication. umull ip, lr, xl, yl mov r5, #0 - umlal lr, r5, xl, yh umlal lr, r5, xh, yl + and yl, r6, #0x80000000 + umlal lr, r5, xl, yh mov r6, #0 umlal r5, r6, xh, yh #endif @ The LSBs in ip are only significant for the final rounding. - @ Fold them into one bit of lr. + @ Fold them into lr. teq ip, #0 orrne lr, lr, #1 - @ Put final sign in xh. 
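The multiplication above needs the full 53 x 53 bit product: on ARMv4 and later it comes from the umull/umlal sequence, while the older path builds it from 16-bit partial products by hand. The same idea one level up, in C: a 64 x 64 to 128 bit multiply assembled from 32-bit halves (illustrative, with hypothetical names).

#include <stdint.h>

struct u128 { uint64_t hi, lo; };

static struct u128 mul64x64(uint64_t a, uint64_t b)
{
    uint64_t a0 = (uint32_t)a, a1 = a >> 32;
    uint64_t b0 = (uint32_t)b, b1 = b >> 32;

    uint64_t p00 = a0 * b0;
    uint64_t p01 = a0 * b1;
    uint64_t p10 = a1 * b0;
    uint64_t p11 = a1 * b1;

    uint64_t mid  = p01 + (p00 >> 32);        /* cannot overflow 64 bits */
    uint64_t mid2 = (uint32_t)mid + p10;      /* low halves of the middle terms */

    struct u128 r;
    r.lo = (uint64_t)(uint32_t)p00 | (mid2 << 32);
    r.hi = p11 + (mid >> 32) + (mid2 >> 32);
    return r;
}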
- mov xh, r4, lsl #16 - bic r4, r4, #0x8000 - - @ Adjust result if one extra MSB appeared (one of four times). - tst r6, #(1 << 9) - beq 1f - add r4, r4, #(1 << 19) - movs r6, r6, lsr #1 - movs r5, r5, rrx - movs lr, lr, rrx - orrcs lr, lr, #1 + @ Adjust result upon the MSB position. + sub r4, r4, #0xff + cmp r6, #(1 << (20-11)) + sbc r4, r4, #0x300 + bcs 1f + movs lr, lr, lsl #1 + adcs r5, r5, r5 + adc r6, r6, r6 1: - @ Scale back to 53 bits. - @ xh contains sign bit already. - orr xh, xh, r6, lsl #12 - orr xh, xh, r5, lsr #20 - mov xl, r5, lsl #12 - orr xl, xl, lr, lsr #20 + @ Shift to final position, add sign to result. + orr xh, yl, r6, lsl #11 + orr xh, xh, r5, lsr #21 + mov xl, r5, lsl #11 + orr xl, xl, lr, lsr #21 + mov lr, lr, lsl #11 - @ Apply exponent bias, check range for underflow. - sub r4, r4, #0x00f80000 - subs r4, r4, #0x1f000000 - ble LSYM(Lml_u) + @ Check exponent range for under/overflow. + subs ip, r4, #(254 - 1) + cmphi ip, #0x700 + bhi LSYM(Lml_u) - @ Round the result. - movs lr, lr, lsl #12 - bpl 1f - adds xl, xl, #1 - adc xh, xh, #0 - teq lr, #0x80000000 - biceq xl, xl, #1 - - @ Rounding may have produced an extra MSB here. - @ The extra bit is cleared before merging the exponent below. - tst xh, #0x00200000 - addne r4, r4, #(1 << 19) -1: - @ Check exponent for overflow. - adds ip, r4, #(1 << 19) - tst ip, #(1 << 30) - bne LSYM(Lml_o) - - @ Add final exponent. - bic xh, xh, #0x00300000 - orr xh, xh, r4, lsl #1 + @ Round the result, merge final exponent. + cmp lr, #0x80000000 + moveqs lr, xl, lsr #1 + adcs xl, xl, #0 + adc xh, xh, r4, lsl #20 RETLDM "r4, r5, r6" - @ Result is 0, but determine sign anyway. -LSYM(Lml_z): + @ Multiplication by 0x1p*: let''s shortcut a lot of code. +LSYM(Lml_1): + and r6, r6, #0x80000000 + orr xh, r6, xh + orr xl, xl, yl eor xh, xh, yh -LSYM(Ldv_z): - bic xh, xh, #0x7fffffff - mov xl, #0 - RETLDM "r4, r5, r6" + subs r4, r4, ip, lsr #1 + rsbgts r5, r4, ip + orrgt xh, xh, r4, lsl #20 + RETLDM "r4, r5, r6" gt + + @ Under/overflow: fix things up for the code below. + orr xh, xh, #0x00100000 + mov lr, #0 + subs r4, r4, #1 + +LSYM(Lml_u): + @ Overflow? + bgt LSYM(Lml_o) @ Check if denormalized result is possible, otherwise return signed 0. -LSYM(Lml_u): - cmn r4, #(53 << 19) + cmn r4, #(53 + 1) movle xl, #0 bicle xh, xh, #0x7fffffff RETLDM "r4, r5, r6" le @ Find out proper shift value. -LSYM(Lml_r): - mvn r4, r4, asr #19 - subs r4, r4, #30 + rsb r4, r4, #0 + subs r4, r4, #32 bge 2f adds r4, r4, #12 bgt 1f @@ -721,14 +715,12 @@ LSYM(Lml_r): mov r3, xl, lsl r5 mov xl, xl, lsr r4 orr xl, xl, xh, lsl r5 - movs xh, xh, lsl #1 - mov xh, xh, lsr r4 - mov xh, xh, rrx + and r2, xh, #0x80000000 + bic xh, xh, #0x80000000 adds xl, xl, r3, lsr #31 - adc xh, xh, #0 - teq lr, #0 - teqeq r3, #0x80000000 - biceq xl, xl, #1 + adc xh, r2, xh, lsr r4 + orrs lr, lr, r3, lsl #1 + biceq xl, xl, r3, lsr #31 RETLDM "r4, r5, r6" @ shift result right of 21 to 31 bits, or left 11 to 1 bits after @@ -741,54 +733,71 @@ LSYM(Lml_r): bic xh, xh, #0x7fffffff adds xl, xl, r3, lsr #31 adc xh, xh, #0 - teq lr, #0 - teqeq r3, #0x80000000 - biceq xl, xl, #1 + orrs lr, lr, r3, lsl #1 + biceq xl, xl, r3, lsr #31 RETLDM "r4, r5, r6" @ Shift value right of 32 to 64 bits, or 0 to 32 bits after a switch @ from xh to xl. Leftover bits are in r3-r6-lr for rounding. 
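Two details above are worth spelling out. The product of two mantissas in [1,2) lies in [1,4), so at most one right shift renormalises it, which lets a single cmp/sbc pair fix the exponent bias and account for that shift at the same time. And because the code keeps the biased exponent minus one (the implicit bit carries into the exponent field when merged), one unsigned comparison catches underflow and overflow together; the subs/cmphi pair exists only because 2045 is not a valid ARM immediate. In C terms, with an illustrative name:

/* 'exp_m1' is the biased exponent minus one, as kept in r4 above.
   Valid results need it in [0, 2045]; anything else under- or overflows.  */
static int exponent_in_range(int exp_m1)
{
    return (unsigned)exp_m1 <= 2045u;
}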
2: rsb r5, r4, #32 - mov r6, xl, lsl r5 + orr lr, lr, xl, lsl r5 mov r3, xl, lsr r4 orr r3, r3, xh, lsl r5 mov xl, xh, lsr r4 bic xh, xh, #0x7fffffff bic xl, xl, xh, lsr r4 add xl, xl, r3, lsr #31 - orrs r6, r6, lr - teqeq r3, #0x80000000 - biceq xl, xl, #1 + orrs lr, lr, r3, lsl #1 + biceq xl, xl, r3, lsr #31 RETLDM "r4, r5, r6" @ One or both arguments are denormalized. @ Scale them leftwards and preserve sign bit. LSYM(Lml_d): - mov lr, #0 teq r4, #0 bne 2f and r6, xh, #0x80000000 1: movs xl, xl, lsl #1 - adc xh, lr, xh, lsl #1 + adc xh, xh, xh tst xh, #0x00100000 - subeq r4, r4, #(1 << 19) + subeq r4, r4, #1 beq 1b orr xh, xh, r6 teq r5, #0 - bne LSYM(Lml_x) + movne pc, lr 2: and r6, yh, #0x80000000 3: movs yl, yl, lsl #1 - adc yh, lr, yh, lsl #1 + adc yh, yh, yh tst yh, #0x00100000 - subeq r5, r5, #(1 << 20) + subeq r5, r5, #1 beq 3b orr yh, yh, r6 - b LSYM(Lml_x) + mov pc, lr - @ One or both args are INF or NAN. LSYM(Lml_s): + @ Isolate the INF and NAN cases away + teq r4, ip + and r5, ip, yh, lsr #20 + teqne r5, ip + beq 1f + + @ Here, one or more arguments are either denormalized or zero. orrs r6, xl, xh, lsl #1 orrnes r6, yl, yh, lsl #1 + bne LSYM(Lml_d) + + @ Result is 0, but determine sign anyway. +LSYM(Lml_z): + eor xh, xh, yh + bic xh, xh, #0x7fffffff + mov xl, #0 + RETLDM "r4, r5, r6" + +1: @ One or both args are INF or NAN. + orrs r6, xl, xh, lsl #1 + moveq xl, yl + moveq xh, yh + orrnes r6, yl, yh, lsl #1 beq LSYM(Lml_n) @ 0 * INF or INF * 0 -> NAN teq r4, ip bne 1f @@ -797,6 +806,8 @@ LSYM(Lml_s): 1: teq r5, ip bne LSYM(Lml_i) orrs r6, yl, yh, lsl #12 + movne xl, yl + movne xh, yh bne LSYM(Lml_n) @ * NAN -> NAN @ Result is INF, but we need to determine its sign. @@ -811,9 +822,9 @@ LSYM(Lml_o): mov xl, #0 RETLDM "r4, r5, r6" - @ Return NAN. + @ Return a quiet NAN. LSYM(Lml_n): - mov xh, #0x7f000000 + orr xh, xh, #0x7f000000 orr xh, xh, #0x00f80000 RETLDM "r4, r5, r6" @@ -825,41 +836,31 @@ ARM_FUNC_ALIAS aeabi_ddiv divdf3 stmfd sp!, {r4, r5, r6, lr} - @ Mask out exponents. - mov ip, #0x7f000000 - orr ip, ip, #0x00f00000 - and r4, xh, ip - and r5, yh, ip - - @ Trap any INF/NAN or zeroes. - teq r4, ip + @ Mask out exponents, trap any zero/denormal/INF/NAN. + mov ip, #0xff + orr ip, ip, #0x700 + ands r4, ip, xh, lsr #20 + andnes r5, ip, yh, lsr #20 + teqne r4, ip teqne r5, ip - orrnes r6, xl, xh, lsl #1 - orrnes r6, yl, yh, lsl #1 - beq LSYM(Ldv_s) + bleq LSYM(Ldv_s) - @ Shift exponents right one bit to make room for overflow bit. - @ If either of them is 0, scale denormalized arguments off line. - @ Then substract divisor exponent from dividend''s. - movs r4, r4, lsr #1 - teqne r5, #0 - beq LSYM(Ldv_d) -LSYM(Ldv_x): - sub r4, r4, r5, asr #1 + @ Substract divisor exponent from dividend''s. + sub r4, r4, r5 @ Preserve final sign into lr. eor lr, xh, yh @ Convert mantissa to unsigned integer. @ Dividend -> r5-r6, divisor -> yh-yl. - mov r5, #0x10000000 + orrs r5, yl, yh, lsl #12 + mov xh, xh, lsl #12 + beq LSYM(Ldv_1) mov yh, yh, lsl #12 + mov r5, #0x10000000 orr yh, r5, yh, lsr #4 orr yh, yh, yl, lsr #24 - movs yl, yl, lsl #8 - mov xh, xh, lsl #12 - teqeq yh, r5 - beq LSYM(Ldv_1) + mov yl, yl, lsl #8 orr r5, r5, xh, lsr #4 orr r5, r5, xl, lsr #24 mov r6, xl, lsl #8 @@ -868,21 +869,15 @@ LSYM(Ldv_x): and xh, lr, #0x80000000 @ Ensure result will land to known bit position. + @ Apply exponent bias accordingly. 
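The Lml_d path above brings a denormal operand back into normal form: shift its mantissa left until the leading 1 reaches the implicit-bit position, decrementing the exponent each time (possibly driving it negative, which the underflow code deals with later). A one-loop C model with illustrative names:

#include <stdint.h>

/* 'mant' holds the 52 fraction bits of a non-zero denormal, 'exp' starts
   at the minimum exponent.  */
static void normalize_denormal(uint64_t *mant, int *exp)
{
    while (!(*mant & (1ULL << 52))) {
        *mant <<= 1;
        (*exp)--;
    }
}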
cmp r5, yh cmpeq r6, yl + adc r4, r4, #(255 - 2) + add r4, r4, #0x300 bcs 1f - sub r4, r4, #(1 << 19) movs yh, yh, lsr #1 mov yl, yl, rrx 1: - @ Apply exponent bias, check range for over/underflow. - add r4, r4, #0x1f000000 - add r4, r4, #0x00f80000 - cmn r4, #(53 << 19) - ble LSYM(Ldv_z) - cmp r4, ip, lsr #1 - bge LSYM(Lml_o) - @ Perform first substraction to align result to a nibble. subs r6, r6, yl sbc r5, r5, yh @@ -944,73 +939,42 @@ LSYM(Ldv_x): orreq xh, xh, xl moveq xl, #0 3: - @ Check if denormalized result is needed. - cmp r4, #0 - ble LSYM(Ldv_u) + @ Check exponent range for under/overflow. + subs ip, r4, #(254 - 1) + cmphi ip, #0x700 + bhi LSYM(Lml_u) - @ Apply proper rounding. + @ Round the result, merge final exponent. subs ip, r5, yh subeqs ip, r6, yl + moveqs ip, xl, lsr #1 adcs xl, xl, #0 - adc xh, xh, #0 - teq ip, #0 - biceq xl, xl, #1 - - @ Add exponent to result. - bic xh, xh, #0x00100000 - orr xh, xh, r4, lsl #1 + adc xh, xh, r4, lsl #20 RETLDM "r4, r5, r6" @ Division by 0x1p*: shortcut a lot of code. LSYM(Ldv_1): and lr, lr, #0x80000000 orr xh, lr, xh, lsr #12 - add r4, r4, #0x1f000000 - add r4, r4, #0x00f80000 - cmp r4, ip, lsr #1 - bge LSYM(Lml_o) - cmp r4, #0 - orrgt xh, xh, r4, lsl #1 + adds r4, r4, ip, lsr #1 + rsbgts r5, r4, ip + orrgt xh, xh, r4, lsl #20 RETLDM "r4, r5, r6" gt - cmn r4, #(53 << 19) - ble LSYM(Ldv_z) orr xh, xh, #0x00100000 mov lr, #0 - b LSYM(Lml_r) + subs r4, r4, #1 + b LSYM(Lml_u) - @ Result must be denormalized: put remainder in lr for - @ rounding considerations. + @ Result mightt need to be denormalized: put remainder bits + @ in lr for rounding considerations. LSYM(Ldv_u): orr lr, r5, r6 - b LSYM(Lml_r) - - @ One or both arguments are denormalized. - @ Scale them leftwards and preserve sign bit. -LSYM(Ldv_d): - mov lr, #0 - teq r4, #0 - bne 2f - and r6, xh, #0x80000000 -1: movs xl, xl, lsl #1 - adc xh, lr, xh, lsl #1 - tst xh, #0x00100000 - subeq r4, r4, #(1 << 19) - beq 1b - orr xh, xh, r6 - teq r5, #0 - bne LSYM(Ldv_x) -2: and r6, yh, #0x80000000 -3: movs yl, yl, lsl #1 - adc yh, lr, yh, lsl #1 - tst yh, #0x00100000 - subeq r5, r5, #(1 << 20) - beq 3b - orr yh, yh, r6 - b LSYM(Ldv_x) + b LSYM(Lml_u) @ One or both arguments is either INF, NAN or zero. LSYM(Ldv_s): + and r5, ip, yh, lsr #20 teq r4, ip teqeq r5, ip beq LSYM(Lml_n) @ INF/NAN / INF/NAN -> NAN @@ -1018,13 +982,23 @@ LSYM(Ldv_s): bne 1f orrs r4, xl, xh, lsl #12 bne LSYM(Lml_n) @ NAN / -> NAN - b LSYM(Lml_i) @ INF / -> INF + teq r5, ip + bne LSYM(Lml_i) @ INF / -> INF + mov xl, yl + mov xh, yh + b LSYM(Lml_n) @ INF / (INF or NAN) -> NAN 1: teq r5, ip bne 2f orrs r5, yl, yh, lsl #12 - bne LSYM(Lml_n) @ / NAN -> NAN - b LSYM(Lml_z) @ / INF -> 0 -2: @ One or both arguments are 0. + beq LSYM(Lml_z) @ / INF -> 0 + mov xl, yl + mov xh, yh + b LSYM(Lml_n) @ / NAN -> NAN +2: @ If both are non-zero, we need to normalize and resume above. + orrs r6, xl, xh, lsl #1 + orrnes r6, yl, yh, lsl #1 + bne LSYM(Lml_d) + @ One or both arguments are 0. orrs r4, xl, xh, lsl #1 bne LSYM(Lml_i) @ / 0 -> INF orrs r5, yl, yh, lsl #1 @@ -1038,6 +1012,8 @@ LSYM(Ldv_s): #ifdef L_cmpdf2 +@ Note: only r0 (return value) and ip are clobbered here. + ARM_FUNC_START gtdf2 ARM_FUNC_ALIAS gedf2 gtdf2 mov ip, #-1 @@ -1053,15 +1029,13 @@ ARM_FUNC_ALIAS nedf2 cmpdf2 ARM_FUNC_ALIAS eqdf2 cmpdf2 mov ip, #1 @ how should we specify unordered here? -1: stmfd sp!, {r4, r5, lr} +1: str ip, [sp, #-4] @ Trap any INF/NAN first. 
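The rounding step of the division above never materialises a guard bit as such: the loop leaves the remainder pre-scaled so that comparing it against the divisor yields the round decision, and an exact tie falls back on the quotient's own LSB (ties to even), which is what the subs/subeqs/moveqs/adcs sequence encodes. A sketch of that policy, under the stated pre-scaling assumption and with illustrative names:

#include <stdint.h>

/* 'rem2' is the final remainder, assumed scaled so that rem2 == divisor
   means the true result sits exactly halfway between two representable
   values.  */
static uint64_t round_quotient(uint64_t q, uint64_t rem2, uint64_t divisor)
{
    if (rem2 > divisor)
        return q + 1;                /* round bit 1, sticky bits present */
    if (rem2 == divisor)
        return (q + 1) & ~1ULL;      /* exact tie: round to even */
    return q;                        /* round bit 0: truncate */
}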
- mov lr, #0x7f000000 - orr lr, lr, #0x00f00000 - and r4, xh, lr - and r5, yh, lr - teq r4, lr - teqne r5, lr + mov ip, xh, lsl #1 + mvns ip, ip, asr #21 + mov ip, yh, lsl #1 + mvnnes ip, ip, asr #21 beq 3f @ Test for equality. @@ -1071,37 +1045,37 @@ ARM_FUNC_ALIAS eqdf2 cmpdf2 teqne xh, yh @ or xh == yh teqeq xl, yl @ and xl == yl moveq r0, #0 @ then equal. - RETLDM "r4, r5" eq + RETc(eq) - @ Check for sign difference. + @ Clear C flag + cmn r0, #0 + + @ Compare sign, teq xh, yh - movmi r0, xh, asr #31 - orrmi r0, r0, #1 - RETLDM "r4, r5" mi - @ Compare exponents. - cmp r4, r5 - - @ Compare mantissa if exponents are equal. - moveq xh, xh, lsl #12 - cmpeq xh, yh, lsl #12 + @ Compare values if same sign + cmppl xh, yh cmpeq xl, yl + + @ Result: movcs r0, yh, asr #31 mvncc r0, yh, asr #31 orr r0, r0, #1 - RETLDM "r4, r5" + RET @ Look for a NAN. -3: teq r4, lr +3: mov ip, xh, lsl #1 + mvns ip, ip, asr #21 bne 4f - orrs xl, xl, xh, lsl #12 + orrs ip, xl, xh, lsl #12 bne 5f @ x is NAN -4: teq r5, lr +4: mov ip, yh, lsl #1 + mvns ip, ip, asr #21 bne 2b - orrs yl, yl, yh, lsl #12 + orrs ip, yl, yh, lsl #12 beq 2b @ y is not NAN -5: mov r0, ip @ return unordered code from ip - RETLDM "r4, r5" +5: ldr r0, [sp, #-4] @ unordered return code + RET FUNC_END gedf2 FUNC_END gtdf2 @@ -1112,6 +1086,7 @@ ARM_FUNC_ALIAS eqdf2 cmpdf2 FUNC_END cmpdf2 ARM_FUNC_START aeabi_cdrcmple + mov ip, r0 mov r0, r2 mov r2, ip @@ -1122,85 +1097,95 @@ ARM_FUNC_START aeabi_cdrcmple ARM_FUNC_START aeabi_cdcmpeq ARM_FUNC_ALIAS aeabi_cdcmple aeabi_cdcmpeq + @ The status-returning routines are required to preserve all @ registers except ip, lr, and cpsr. -6: stmfd sp!, {r0, r1, r2, r3, lr} +6: stmfd sp!, {r0, lr} ARM_CALL cmpdf2 @ Set the Z flag correctly, and the C flag unconditionally. cmp r0, #0 @ Clear the C flag if the return value was -1, indicating @ that the first operand was smaller than the second. cmnmi r0, #0 - RETLDM "r0, r1, r2, r3" + RETLDM "r0" + FUNC_END aeabi_cdcmple FUNC_END aeabi_cdcmpeq + FUNC_END aeabi_cdrcmple ARM_FUNC_START aeabi_dcmpeq + str lr, [sp, #-4]! ARM_CALL aeabi_cdcmple moveq r0, #1 @ Equal to. movne r0, #0 @ Less than, greater than, or unordered. RETLDM + FUNC_END aeabi_dcmpeq ARM_FUNC_START aeabi_dcmplt + str lr, [sp, #-4]! ARM_CALL aeabi_cdcmple movcc r0, #1 @ Less than. movcs r0, #0 @ Equal to, greater than, or unordered. RETLDM + FUNC_END aeabi_dcmplt ARM_FUNC_START aeabi_dcmple + str lr, [sp, #-4]! ARM_CALL aeabi_cdcmple movls r0, #1 @ Less than or equal to. movhi r0, #0 @ Greater than or unordered. RETLDM + FUNC_END aeabi_dcmple ARM_FUNC_START aeabi_dcmpge + str lr, [sp, #-4]! ARM_CALL aeabi_cdrcmple movls r0, #1 @ Operand 2 is less than or equal to operand 1. movhi r0, #0 @ Operand 2 greater than operand 1, or unordered. RETLDM + FUNC_END aeabi_dcmpge ARM_FUNC_START aeabi_dcmpgt + str lr, [sp, #-4]! ARM_CALL aeabi_cdrcmple movcc r0, #1 @ Operand 2 is less than operand 1. movcs r0, #0 @ Operand 2 is greater than or equal to operand 1, @ or they are unordered. RETLDM + FUNC_END aeabi_dcmpgt - + #endif /* L_cmpdf2 */ #ifdef L_unorddf2 ARM_FUNC_START unorddf2 ARM_FUNC_ALIAS aeabi_dcmpun unorddf2 - - str lr, [sp, #-4]! 
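Once NaNs have been excluded, the comparison above never treats the operands as floating point at all: identical bit patterns (or +0 against -0) compare equal, a sign mismatch decides immediately, and operands of the same sign order like sign-magnitude integers, with the order reversed when both are negative. The same strategy over the raw 64-bit patterns in C, as an illustration only:

#include <stdint.h>

static int compare_bits(uint64_t a, uint64_t b)   /* returns -1, 0 or +1 */
{
    if (a == b || !((a | b) << 1))
        return 0;                     /* identical, or +0 vs -0 */
    int sa = (int)(a >> 63), sb = (int)(b >> 63);
    if (sa != sb)
        return sa ? -1 : 1;           /* negative < positive */
    if (a > b)                        /* same sign: magnitude order...   */
        return sa ? -1 : 1;           /* ...flips when both are negative */
    return sa ? 1 : -1;
}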
- mov ip, #0x7f000000 - orr ip, ip, #0x00f00000 - and lr, xh, ip - teq lr, ip + + mov ip, xh, lsl #1 + mvns ip, ip, asr #21 bne 1f - orrs xl, xl, xh, lsl #12 + orrs ip, xl, xh, lsl #12 bne 3f @ x is NAN -1: and lr, yh, ip - teq lr, ip +1: mov ip, yh, lsl #1 + mvns ip, ip, asr #21 bne 2f - orrs yl, yl, yh, lsl #12 + orrs ip, yl, yh, lsl #12 bne 3f @ y is NAN 2: mov r0, #0 @ arguments are ordered. - RETLDM + RET 3: mov r0, #1 @ arguments are unordered. - RETLDM + RET FUNC_END aeabi_dcmpun FUNC_END unorddf2 @@ -1211,31 +1196,22 @@ ARM_FUNC_ALIAS aeabi_dcmpun unorddf2 ARM_FUNC_START fixdfsi ARM_FUNC_ALIAS aeabi_d2iz fixdfsi - orrs ip, xl, xh, lsl #1 - beq 1f @ value is 0. - - mov r3, r3, rrx @ preserve C flag (the actual sign) @ check exponent range. - mov ip, #0x7f000000 - orr ip, ip, #0x00f00000 - and r2, xh, ip - teq r2, ip - beq 2f @ value is INF or NAN - bic ip, ip, #0x40000000 - cmp r2, ip - bcc 1f @ value is too small - add ip, ip, #(31 << 20) - cmp r2, ip - bcs 3f @ value is too large + mov r2, xh, lsl #1 + adds r2, r2, #(1 << 21) + bcs 2f @ value is INF or NAN + bpl 1f @ value is too small + mov r3, #(0xfffffc00 + 31) + subs r2, r3, r2, asr #21 + bls 3f @ value is too large - rsb r2, r2, ip - mov ip, xh, lsl #11 - orr ip, ip, #0x80000000 - orr ip, ip, xl, lsr #21 - mov r2, r2, lsr #20 - tst r3, #0x80000000 @ the sign bit - mov r0, ip, lsr r2 + @ scale value + mov r3, xh, lsl #11 + orr r3, r3, #0x80000000 + orr r3, r3, xl, lsr #21 + tst xh, #0x80000000 @ the sign bit + mov r0, r3, lsr r2 rsbne r0, r0, #0 RET @@ -1243,8 +1219,8 @@ ARM_FUNC_ALIAS aeabi_d2iz fixdfsi RET 2: orrs xl, xl, xh, lsl #12 - bne 4f @ r0 is NAN. -3: ands r0, r3, #0x80000000 @ the sign bit + bne 4f @ x is NAN. +3: ands r0, xh, #0x80000000 @ the sign bit moveq r0, #0x7fffffff @ maximum signed positive si RET @@ -1260,29 +1236,22 @@ ARM_FUNC_ALIAS aeabi_d2iz fixdfsi ARM_FUNC_START fixunsdfsi ARM_FUNC_ALIAS aeabi_d2uiz fixunsdfsi - orrs ip, xl, xh, lsl #1 - movcss r0, #0 @ value is negative - RETc(eq) @ or 0 (xl, xh overlap r0) @ check exponent range. - mov ip, #0x7f000000 - orr ip, ip, #0x00f00000 - and r2, xh, ip - teq r2, ip - beq 2f @ value is INF or NAN - bic ip, ip, #0x40000000 - cmp r2, ip - bcc 1f @ value is too small - add ip, ip, #(31 << 20) - cmp r2, ip - bhi 3f @ value is too large + movs r2, xh, lsl #1 + bcs 1f @ value is negative + adds r2, r2, #(1 << 21) + bcs 2f @ value is INF or NAN + bpl 1f @ value is too small + mov r3, #(0xfffffc00 + 31) + subs r2, r3, r2, asr #21 + bmi 3f @ value is too large - rsb r2, r2, ip - mov ip, xh, lsl #11 - orr ip, ip, #0x80000000 - orr ip, ip, xl, lsr #21 - mov r2, r2, lsr #20 - mov r0, ip, lsr r2 + @ scale value + mov r3, xh, lsl #11 + orr r3, r3, #0x80000000 + orr r3, r3, xl, lsr #21 + mov r0, r3, lsr r2 RET 1: mov r0, #0 @@ -1305,91 +1274,61 @@ ARM_FUNC_ALIAS aeabi_d2uiz fixunsdfsi ARM_FUNC_START truncdfsf2 ARM_FUNC_ALIAS aeabi_d2f truncdfsf2 - orrs r2, xl, xh, lsl #1 - moveq r0, r2, rrx - RETc(eq) @ value is 0.0 or -0.0 - + @ check exponent range. - mov ip, #0x7f000000 - orr ip, ip, #0x00f00000 - and r2, ip, xh - teq r2, ip - beq 2f @ value is INF or NAN - bic xh, xh, ip - cmp r2, #(0x380 << 20) - bls 4f @ value is too small + mov r2, xh, lsl #1 + subs r3, r2, #((1023 - 127) << 21) + subcss ip, r3, #(1 << 21) + rsbcss ip, ip, #(254 << 21) + bls 2f @ value is out of range - @ shift and round mantissa -1: movs r3, xl, lsr #29 - adc r3, r3, xh, lsl #3 - - @ if halfway between two numbers, round towards LSB = 0. 
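The fixdfsi code above turns the conversion into plain integer work: read the exponent, bail out for small, huge or NaN inputs, place the implicit 1 and the top fraction bits at the top of a word, shift right by 31 minus the unbiased exponent, and negate if the sign bit was set. A C rendition of those steps; names are illustrative, and like the assembly it sends NaN to 0 and saturates on overflow.

#include <stdint.h>

static int32_t double_to_int(uint64_t bits)
{
    int exp = (int)((bits >> 52) & 0x7ff) - 1023;        /* unbiased exponent */
    if (exp < 0)
        return 0;                                        /* |value| < 1, incl. zero */
    if (exp >= 31) {
        if (exp == 1024 && (bits << 12))                 /* NaN: fraction non-zero */
            return 0;
        return (bits >> 63) ? INT32_MIN : INT32_MAX;     /* saturate */
    }
    uint32_t m = 0x80000000u                             /* implicit 1 ... */
               | (uint32_t)((bits & 0x000fffffffffffffULL) >> 21); /* + top 31 fraction bits */
    uint32_t r = m >> (31 - exp);
    return (bits >> 63) ? -(int32_t)r : (int32_t)r;
}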
- mov xl, xl, lsl #3 - teq xl, #0x80000000 - biceq r3, r3, #1 - - @ rounding might have created an extra MSB. If so adjust exponent. - tst r3, #0x00800000 - addne r2, r2, #(1 << 20) - bicne r3, r3, #0x00800000 - - @ check exponent for overflow - mov ip, #(0x400 << 20) - orr ip, ip, #(0x07f << 20) - cmp r2, ip - bcs 3f @ overflow - - @ adjust exponent, merge with sign bit and mantissa. - movs xh, xh, lsl #1 - mov r2, r2, lsl #4 - orr r0, r3, r2, rrx - eor r0, r0, #0x40000000 +1: @ shift and round mantissa + and ip, xh, #0x80000000 + mov r2, xl, lsl #3 + orr xl, ip, xl, lsr #29 + cmp r2, #0x80000000 + adc r0, xl, r3, lsl #2 + biceq r0, r0, #1 RET -2: @ chech for NAN - orrs xl, xl, xh, lsl #12 +2: @ either overflow or underflow + tst xh, #0x40000000 + bne 3f @ overflow + + @ check if denormalized value is possible + adds r2, r3, #(23 << 21) + andlt r0, xh, #0x80000000 @ too small, return signed 0. + RETc(lt) + + @ denormalize value so we can resume with the code above afterwards. + orr xh, xh, #0x00100000 + mov r2, r2, lsr #21 + rsb r2, r2, #24 + rsb ip, r2, #32 + movs r3, xl, lsl ip + mov xl, xl, lsr r2 + orrne xl, xl, #1 @ fold r3 for rounding considerations. + mov r3, xh, lsl #11 + mov r3, r3, lsr #11 + orr xl, xl, r3, lsl ip + mov r3, r3, lsr r2 + mov r3, r3, lsl #1 + b 1b + +3: @ chech for NAN + mvns r3, r2, asr #21 + bne 5f @ simple overflow + orrs r3, xl, xh, lsl #12 movne r0, #0x7f000000 orrne r0, r0, #0x00c00000 RETc(ne) @ return NAN -3: @ return INF with sign +5: @ return INF with sign and r0, xh, #0x80000000 orr r0, r0, #0x7f000000 orr r0, r0, #0x00800000 RET -4: @ check if denormalized value is possible - subs r2, r2, #((0x380 - 24) << 20) - andle r0, xh, #0x80000000 @ too small, return signed 0. - RETc(le) - - @ denormalize value so we can resume with the code above afterwards. - orr xh, xh, #0x00100000 - mov r2, r2, lsr #20 - rsb r2, r2, #25 - cmp r2, #20 - bgt 6f - - rsb ip, r2, #32 - mov r3, xl, lsl ip - mov xl, xl, lsr r2 - orr xl, xl, xh, lsl ip - movs xh, xh, lsl #1 - mov xh, xh, lsr r2 - mov xh, xh, rrx -5: teq r3, #0 @ fold r3 bits into the LSB - orrne xl, xl, #1 @ for rounding considerations. - mov r2, #(0x380 << 20) @ equivalent to the 0 float exponent - b 1b - -6: rsb r2, r2, #(12 + 20) - rsb ip, r2, #32 - mov r3, xl, lsl r2 - mov xl, xl, lsr ip - orr xl, xl, xh, lsl r2 - and xh, xh, #0x80000000 - b 5b - FUNC_END aeabi_d2f FUNC_END truncdfsf2 diff --git a/gcc/config/arm/ieee754-sf.S b/gcc/config/arm/ieee754-sf.S index d82fa8c84f7..8eae6e9325d 100644 --- a/gcc/config/arm/ieee754-sf.S +++ b/gcc/config/arm/ieee754-sf.S @@ -42,7 +42,7 @@ ARM_FUNC_START negsf2 ARM_FUNC_ALIAS aeabi_fneg negsf2 - + eor r0, r0, #0x80000000 @ flip sign bit RET @@ -56,11 +56,11 @@ ARM_FUNC_ALIAS aeabi_fneg negsf2 ARM_FUNC_START aeabi_frsub eor r0, r0, #0x80000000 @ flip sign bit of first arg - b 1f - + b 1f + ARM_FUNC_START subsf3 ARM_FUNC_ALIAS aeabi_fsub subsf3 - + eor r1, r1, #0x80000000 @ flip sign bit of second arg #if defined(__thumb__) && !defined(__THUMB_INTERWORK__) b 1f @ Skip Thumb-code prologue @@ -68,32 +68,19 @@ ARM_FUNC_ALIAS aeabi_fsub subsf3 ARM_FUNC_START addsf3 ARM_FUNC_ALIAS aeabi_fadd addsf3 - -1: @ Compare both args, return zero if equal but the sign. - eor r2, r0, r1 - teq r2, #0x80000000 - beq LSYM(Lad_z) - @ If first arg is 0 or -0, return second arg. - @ If second arg is 0 or -0, return first arg. - bics r2, r0, #0x80000000 - moveq r0, r1 - bicnes r2, r1, #0x80000000 - RETc(eq) - - @ Mask out exponents. 
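For in-range inputs the truncation above is again integer manipulation: rebias the exponent from 1023 to 127, keep the top 23 fraction bits, and round to nearest on the 29 bits that fall off, with ties going to even; overflow, denormal results and NaNs use the extra paths shown. A C model of the normal case only, assuming the rebiased exponent lands in 1..254 and using illustrative names:

#include <stdint.h>

static uint32_t d2f_normal(uint64_t bits)
{
    uint32_t sign = (uint32_t)(bits >> 63) << 31;
    int exp = (int)((bits >> 52) & 0x7ff) - 1023 + 127;    /* rebias */
    uint32_t frac = (uint32_t)((bits >> 29) & 0x007fffff); /* top 23 fraction bits */
    uint32_t rest = (uint32_t)(bits << 3);                 /* 29 discarded bits, MSB-aligned */

    uint32_t r = sign | ((uint32_t)exp << 23) | frac;
    r += rest >> 31;                 /* add the guard bit; may carry into the exponent */
    if (rest == 0x80000000u)         /* exact tie: round to even */
        r &= ~1u;
    return r;
}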
- mov ip, #0xff000000 - and r2, r0, ip, lsr #1 - and r3, r1, ip, lsr #1 - - @ If either of them is 255, result will be INF or NAN - teq r2, ip, lsr #1 - teqne r3, ip, lsr #1 - beq LSYM(Lad_i) +1: @ Look for zeroes, equal values, INF, or NAN. + movs r2, r0, lsl #1 + movnes r3, r1, lsl #1 + teqne r2, r3 + mvnnes ip, r2, asr #24 + mvnnes ip, r3, asr #24 + beq LSYM(Lad_s) @ Compute exponent difference. Make largest exponent in r2, @ corresponding arg in r0, and positive exponent difference in r3. - subs r3, r3, r2 + mov r2, r2, lsr #24 + rsbs r3, r2, r3, lsr #24 addgt r2, r2, r3 eorgt r1, r0, r1 eorgt r0, r1, r0 @@ -103,7 +90,7 @@ ARM_FUNC_ALIAS aeabi_fadd addsf3 @ If exponent difference is too large, return largest argument @ already in r0. We need up to 25 bit to handle proper rounding @ of 0x1p25 - 1.1. - cmp r3, #(25 << 23) + cmp r3, #25 RETc(hi) @ Convert mantissa to signed integer. @@ -122,25 +109,17 @@ ARM_FUNC_ALIAS aeabi_fadd addsf3 beq LSYM(Lad_d) LSYM(Lad_x): - @ Scale down second arg with exponent difference. - @ Apply shift one bit left to first arg and the rest to second arg - @ to simplify things later, but only if exponent does not become 0. - movs r3, r3, lsr #23 - teqne r2, #(1 << 23) - movne r0, r0, lsl #1 - subne r2, r2, #(1 << 23) - subne r3, r3, #1 + @ Compensate for the exponent overlapping the mantissa MSB added later + sub r2, r2, #1 - @ Shift second arg into ip, keep leftover bits into r1. - mov ip, r1, asr r3 + @ Shift and add second arg to first arg in r0. + @ Keep leftover bits into r1. + adds r0, r0, r1, asr r3 rsb r3, r3, #32 mov r1, r1, lsl r3 - add r0, r0, ip @ the actual addition - - @ We now have a 64 bit result in r0-r1. - @ Keep absolute value in r0-r1, sign in r3. - ands r3, r0, #0x80000000 + @ Keep absolute value in r0-r1, sign in r3 (the n bit was set above) + and r3, r0, #0x80000000 bpl LSYM(Lad_p) rsbs r1, r1, #0 rsc r0, r0, #0 @@ -148,104 +127,118 @@ LSYM(Lad_x): @ Determine how to normalize the result. LSYM(Lad_p): cmp r0, #0x00800000 - bcc LSYM(Lad_l) + bcc LSYM(Lad_a) cmp r0, #0x01000000 - bcc LSYM(Lad_r0) - cmp r0, #0x02000000 - bcc LSYM(Lad_r1) + bcc LSYM(Lad_e) @ Result needs to be shifted right. movs r0, r0, lsr #1 mov r1, r1, rrx - add r2, r2, #(1 << 23) -LSYM(Lad_r1): - movs r0, r0, lsr #1 - mov r1, r1, rrx - add r2, r2, #(1 << 23) - - @ Our result is now properly aligned into r0, remaining bits in r1. - @ Round with MSB of r1. If halfway between two numbers, round towards - @ LSB of r0 = 0. -LSYM(Lad_r0): - add r0, r0, r1, lsr #31 - teq r1, #0x80000000 - biceq r0, r0, #1 - - @ Rounding may have added a new MSB. Adjust exponent. - @ That MSB will be cleared when exponent is merged below. - tst r0, #0x01000000 - addne r2, r2, #(1 << 23) + add r2, r2, #1 @ Make sure we did not bust our exponent. - cmp r2, #(254 << 23) - bhi LSYM(Lad_o) + cmp r2, #254 + bhs LSYM(Lad_o) + @ Our result is now properly aligned into r0, remaining bits in r1. @ Pack final result together. + @ Round with MSB of r1. If halfway between two numbers, round towards + @ LSB of r0 = 0. LSYM(Lad_e): - bic r0, r0, #0x01800000 - orr r0, r0, r2 + cmp r1, #0x80000000 + adc r0, r0, r2, lsl #23 + biceq r0, r0, #1 orr r0, r0, r3 RET - @ Result must be shifted left. - @ No rounding necessary since r1 will always be 0. + @ Result must be shifted left and exponent adjusted. +LSYM(Lad_a): + movs r1, r1, lsl #1 + adc r0, r0, r0 + tst r0, #0x00800000 + sub r2, r2, #1 + bne LSYM(Lad_e) + + @ No rounding necessary since r1 will always be 0 at this point. 
LSYM(Lad_l): #if __ARM_ARCH__ < 5 movs ip, r0, lsr #12 moveq r0, r0, lsl #12 - subeq r2, r2, #(12 << 23) + subeq r2, r2, #12 tst r0, #0x00ff0000 moveq r0, r0, lsl #8 - subeq r2, r2, #(8 << 23) + subeq r2, r2, #8 tst r0, #0x00f00000 moveq r0, r0, lsl #4 - subeq r2, r2, #(4 << 23) + subeq r2, r2, #4 tst r0, #0x00c00000 moveq r0, r0, lsl #2 - subeq r2, r2, #(2 << 23) - tst r0, #0x00800000 - moveq r0, r0, lsl #1 - subeq r2, r2, #(1 << 23) - cmp r2, #0 - bgt LSYM(Lad_e) + subeq r2, r2, #2 + cmp r0, #0x00800000 + movcc r0, r0, lsl #1 + sbcs r2, r2, #0 #else clz ip, r0 sub ip, ip, #8 + subs r2, r2, ip mov r0, r0, lsl ip - subs r2, r2, ip, lsl #23 - bgt LSYM(Lad_e) #endif - @ Exponent too small, denormalize result. - mvn r2, r2, asr #23 - add r2, r2, #2 - orr r0, r3, r0, lsr r2 + @ Final result with sign + @ If exponent negative, denormalize result. + addge r0, r0, r2, lsl #23 + rsblt r2, r2, #0 + orrge r0, r0, r3 + orrlt r0, r3, r0, lsr r2 RET @ Fixup and adjust bit position for denormalized arguments. @ Note that r2 must not remain equal to 0. LSYM(Lad_d): teq r2, #0 - eoreq r0, r0, #0x00800000 - addeq r2, r2, #(1 << 23) eor r1, r1, #0x00800000 - subne r3, r3, #(1 << 23) + eoreq r0, r0, #0x00800000 + addeq r2, r2, #1 + subne r3, r3, #1 b LSYM(Lad_x) - @ Result is x - x = 0, unless x is INF or NAN. -LSYM(Lad_z): - mov ip, #0xff000000 - and r2, r0, ip, lsr #1 - teq r2, ip, lsr #1 - moveq r0, ip, asr #2 - movne r0, #0 +LSYM(Lad_s): + mov r3, r1, lsl #1 + + mvns ip, r2, asr #24 + mvnnes ip, r3, asr #24 + beq LSYM(Lad_i) + + teq r2, r3 + beq 1f + + @ Result is x + 0.0 = x or 0.0 + y = y. + teq r2, #0 + moveq r0, r1 RET +1: teq r0, r1 + + @ Result is x - x = 0. + movne r0, #0 + RETc(ne) + + @ Result is x + x = 2x. + tst r2, #0xff000000 + bne 2f + movs r0, r0, lsl #1 + orrcs r0, r0, #0x80000000 + RET +2: adds r2, r2, #(2 << 24) + addcc r0, r0, #(1 << 23) + RETc(cc) + and r3, r0, #0x80000000 + @ Overflow: return INF. LSYM(Lad_o): orr r0, r3, #0x7f000000 @@ -257,16 +250,16 @@ LSYM(Lad_o): @ if r1 != INF/NAN: return r0 (which is INF/NAN) @ if r0 or r1 is NAN: return NAN @ if opposite sign: return NAN - @ return r0 (which is INF or -INF) + @ otherwise return r0 (which is INF or -INF) LSYM(Lad_i): - teq r2, ip, lsr #1 + mvns r2, r2, asr #24 movne r0, r1 - teqeq r3, ip, lsr #1 - RETc(ne) + mvneqs r3, r3, asr #24 + movne r1, r0 movs r2, r0, lsl #9 - moveqs r2, r1, lsl #9 + moveqs r3, r1, lsl #9 teqeq r0, r1 - orrne r0, r3, #0x00400000 @ NAN + orrne r0, r0, #0x00400000 @ quiet NAN RET FUNC_END aeabi_frsub @@ -287,28 +280,17 @@ ARM_FUNC_ALIAS aeabi_i2f floatsisf ands r3, r0, #0x80000000 rsbmi r0, r0, #0 -1: teq r0, #0 +1: movs ip, r0 RETc(eq) -3: - mov r1, #0 - mov r2, #((127 + 23) << 23) - tst r0, #0xfc000000 - beq LSYM(Lad_p) + @ Add initial exponent to sign + orr r3, r3, #((127 + 23) << 23) - @ We need to scale the value a little before branching to code above. 
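The tail of the normalisation above packs the result in one step: the mantissa still carries its leading 1, so adding the exponent minus one shifted into place produces the right exponent field, and when that exponent would drop to zero or below the mantissa is instead shifted right and stored with an exponent field of 0, giving a denormal (no rounding is needed because the discarded bits are zero on this path). In C, with illustrative names:

#include <stdint.h>

/* 'mant24' has its leading 1 at bit 23; 'exp_m1' is the biased exponent
   minus one.  */
static uint32_t pack_single(uint32_t sign, int exp_m1, uint32_t mant24)
{
    if (exp_m1 >= 0)
        return sign | (mant24 + ((uint32_t)exp_m1 << 23));   /* normal number */
    return sign | (mant24 >> -exp_m1);        /* exponent field 0: denormal */
}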
- tst r0, #0xf0000000 -4: - orrne r1, r1, r0, lsl #28 - movne r0, r0, lsr #4 - addne r2, r2, #(4 << 23) - tst r0, #0x0c000000 - beq LSYM(Lad_p) - mov r1, r1, lsr #2 - orr r1, r1, r0, lsl #30 - mov r0, r0, lsr #2 - add r2, r2, #(2 << 23) - b LSYM(Lad_p) + .ifnc ah, r0 + mov ah, r0 + .endif + mov al, #0 + b 2f FUNC_END aeabi_i2f FUNC_END floatsisf @@ -317,22 +299,15 @@ ARM_FUNC_ALIAS aeabi_i2f floatsisf ARM_FUNC_START floatundisf ARM_FUNC_ALIAS aeabi_ul2f floatundisf + orrs r2, r0, r1 #if !defined (__VFP_FP__) && !defined(__SOFTFP__) mvfeqs f0, #0.0 #endif RETc(eq) - -#if !defined (__VFP_FP__) && !defined(__SOFTFP__) - @ For hard FPA code we want to return via the tail below so that - @ we can return the result in f0 as well as in r0 for backwards - @ compatibility. - str lr, [sp, #-4]! - adr lr, 4f -#endif mov r3, #0 - b 2f + b 1f ARM_FUNC_START floatdisf ARM_FUNC_ALIAS aeabi_l2f floatdisf @@ -342,78 +317,80 @@ ARM_FUNC_ALIAS aeabi_l2f floatdisf mvfeqs f0, #0.0 #endif RETc(eq) - + + ands r3, ah, #0x80000000 @ sign bit in r3 + bpl 1f + rsbs al, al, #0 + rsc ah, ah, #0 +1: #if !defined (__VFP_FP__) && !defined(__SOFTFP__) @ For hard FPA code we want to return via the tail below so that @ we can return the result in f0 as well as in r0 for backwards @ compatibility. str lr, [sp, #-4]! - adr lr, 4f + adr lr, LSYM(f0_ret) #endif - ands r3, ah, #0x80000000 @ sign bit in r3 - bpl 2f - rsbs al, al, #0 - rsc ah, ah, #0 -2: + movs ip, ah -#ifdef __ARMEB__ - moveq r0, al -#endif - beq 3b - mov r2, #((127 + 23 + 32) << 23) @ initial exponent -#ifndef __ARMEB__ - mov r1, al - mov r0, ip -#endif - tst r0, #0xfc000000 - bne 3f + moveq ip, al + + @ Add initial exponent to sign + orr r3, r3, #((127 + 23 + 32) << 23) + subeq r3, r3, #(32 << 23) +2: sub r3, r3, #(1 << 23) #if __ARM_ARCH__ < 5 - cmp r0, #(1 << 13) - movlo ip, #13 - movlo r0, r0, lsl #13 - movhs ip, #0 - tst r0, #0x03fc0000 - addeq ip, ip, #8 - moveq r0, r0, lsl #8 - tst r0, #0x03c00000 - addeq ip, ip, #4 - moveq r0, r0, lsl #4 - tst r0, #0x03000000 - addeq ip, ip, #2 - moveq r0, r0, lsl #2 + + mov r2, #23 + cmp ip, #(1 << 16) + movhs ip, ip, lsr #16 + subhs r2, r2, #16 + cmp ip, #(1 << 8) + movhs ip, ip, lsr #8 + subhs r2, r2, #8 + cmp ip, #(1 << 4) + movhs ip, ip, lsr #4 + subhs r2, r2, #4 + cmp ip, #(1 << 2) + subhs r2, r2, #2 + sublo r2, r2, ip, lsr #1 + subs r2, r2, ip, lsr #3 + #else - clz ip, r0 - sub ip, ip, #6 - mov r0, r0, lsl ip + + clz r2, ip + subs r2, r2, #8 + #endif - sub r2, r2, ip, lsl #23 - rsb ip, ip, #32 - orr r0, r0, r1, lsr ip - rsb ip, ip, #32 - mov r1, r1, asl ip - @ At this point we no-longer care about the precise value in r1, only - @ whether only the top bit is set, or if the top bit and some others - @ are set. - and ip, r1, #0xff - orr r1, r1, ip, lsl #8 - b LSYM(Lad_p) -3: - @ We need to scale the value a little before branching to code above. - @ At this point we no-longer care about the precise value in r1, only - @ whether only the top bit is set, or if the top bit and some others - @ are set. - and ip, r1, #0xff - orr r1, r1, ip, lsl #8 - tst r0, #0xf0000000 - movne r1, r1, lsr #4 - b 4b + + sub r3, r3, r2, lsl #23 + blt 3f + + add r3, r3, ah, lsl r2 + mov ip, al, lsl r2 + rsb r2, r2, #32 + cmp ip, #0x80000000 + adc r0, r3, al, lsr r2 + biceq r0, r0, #1 + RET + +3: add r2, r2, #32 + mov ip, ah, lsl r2 + rsb r2, r2, #32 + orrs al, al, ip, lsl #1 + adc r0, r3, ah, lsr r2 + biceq r0, r0, ip, lsr #31 + RET + #if !defined (__VFP_FP__) && !defined(__SOFTFP__) -4: + +LSYM(f0_ret) str r0, [sp, #-4]! 
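The 64-bit to float conversions above follow one recipe: take the magnitude, locate its most significant bit (clz, or the comparison chain on older cores), keep 24 significant bits, round to nearest with ties to even on whatever was shifted out, then put the sign back. A self-contained C model of that recipe; the function names are illustrative, not the library entry points.

#include <stdint.h>

static uint32_t u64_to_float_bits(uint64_t v)
{
    if (v == 0)
        return 0;

    int msb = 63;                             /* position of the leading 1 */
    while (!(v & (1ULL << msb)))
        msb--;

    uint32_t exp = (uint32_t)msb + 127;       /* biased exponent */
    if (msb <= 23)                            /* exact: no bits are discarded */
        return (exp << 23) + (((uint32_t)v << (23 - msb)) & 0x007fffff);

    int sh = msb - 23;                        /* number of discarded bits */
    uint64_t rest = v << (64 - sh);           /* discarded bits, MSB-aligned */
    uint32_t r = (exp << 23)
               + ((uint32_t)(v >> sh) & 0x007fffff);

    r += (uint32_t)(rest >> 63);              /* add the guard bit */
    if (rest == 0x8000000000000000ULL)
        r &= ~1u;                             /* exact tie: round to even */
    return r;
}

static uint32_t i64_to_float_bits(int64_t v)
{
    uint32_t sign = v < 0 ? 0x80000000u : 0;
    uint64_t mag  = v < 0 ? 0 - (uint64_t)v : (uint64_t)v;
    return sign | u64_to_float_bits(mag);
}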
ldfs f0, [sp], #4 RETLDM + #endif + FUNC_END floatdisf FUNC_END aeabi_l2f FUNC_END floatundisf @@ -425,139 +402,117 @@ ARM_FUNC_ALIAS aeabi_l2f floatdisf ARM_FUNC_START mulsf3 ARM_FUNC_ALIAS aeabi_fmul mulsf3 - - @ Mask out exponents. - mov ip, #0xff000000 - and r2, r0, ip, lsr #1 - and r3, r1, ip, lsr #1 - @ Trap any INF/NAN. - teq r2, ip, lsr #1 - teqne r3, ip, lsr #1 + @ Mask out exponents, trap any zero/denormal/INF/NAN. + mov ip, #0xff + ands r2, ip, r0, lsr #23 + andnes r3, ip, r1, lsr #23 + teqne r2, ip + teqne r3, ip beq LSYM(Lml_s) - - @ Trap any multiplication by 0. - bics ip, r0, #0x80000000 - bicnes ip, r1, #0x80000000 - beq LSYM(Lml_z) - - @ Shift exponents right one bit to make room for overflow bit. - @ If either of them is 0, scale denormalized arguments off line. - @ Then add both exponents together. - movs r2, r2, lsr #1 - teqne r3, #0 - beq LSYM(Lml_d) LSYM(Lml_x): - add r2, r2, r3, asr #1 - @ Preserve final sign in r2 along with exponent for now. - teq r0, r1 - orrmi r2, r2, #0x8000 + @ Add exponents together + add r2, r2, r3 + + @ Determine final sign. + eor ip, r0, r1 @ Convert mantissa to unsigned integer. - bic r0, r0, #0xff000000 - bic r1, r1, #0xff000000 - orr r0, r0, #0x00800000 - orr r1, r1, #0x00800000 + @ If power of two, branch to a separate path. + @ Make up for final alignment. + movs r0, r0, lsl #9 + movnes r1, r1, lsl #9 + beq LSYM(Lml_1) + mov r3, #0x08000000 + orr r0, r3, r0, lsr #5 + orr r1, r3, r1, lsr #5 #if __ARM_ARCH__ < 4 + @ Put sign bit in r3, which will be restored into r0 later. + and r3, ip, #0x80000000 + @ Well, no way to make it shorter without the umull instruction. - @ We must perform that 24 x 24 -> 48 bit multiplication by hand. - stmfd sp!, {r4, r5} + stmfd sp!, {r3, r4, r5} mov r4, r0, lsr #16 mov r5, r1, lsr #16 - bic r0, r0, #0x00ff0000 - bic r1, r1, #0x00ff0000 + bic r0, r0, r4, lsl #16 + bic r1, r1, r5, lsl #16 mul ip, r4, r5 mul r3, r0, r1 mul r0, r5, r0 mla r0, r4, r1, r0 adds r3, r3, r0, lsl #16 - adc ip, ip, r0, lsr #16 - ldmfd sp!, {r4, r5} + adc r1, ip, r0, lsr #16 + ldmfd sp!, {r0, r4, r5} #else - umull r3, ip, r0, r1 @ The actual multiplication. + @ The actual multiplication. + umull r3, r1, r0, r1 + + @ Put final sign in r0. + and r0, ip, #0x80000000 #endif - @ Put final sign in r0. - mov r0, r2, lsl #16 - bic r2, r2, #0x8000 + @ Adjust result upon the MSB position. + cmp r1, #(1 << 23) + movcc r1, r1, lsl #1 + orrcc r1, r1, r3, lsr #31 + movcc r3, r3, lsl #1 - @ Adjust result if one extra MSB appeared. - @ The LSB may be lost but this never changes the result in this case. - tst ip, #(1 << 15) - addne r2, r2, #(1 << 22) - movnes ip, ip, lsr #1 - movne r3, r3, rrx + @ Add sign to result. + orr r0, r0, r1 - @ Apply exponent bias, check range for underflow. - subs r2, r2, #(127 << 22) - ble LSYM(Lml_u) + @ Apply exponent bias, check for under/overflow. + sbc r2, r2, #127 + cmp r2, #(254 - 1) + bhi LSYM(Lml_u) - @ Scale back to 24 bits with rounding. - @ r0 contains sign bit already. - orrs r0, r0, r3, lsr #23 - adc r0, r0, ip, lsl #9 - - @ If halfway between two numbers, rounding should be towards LSB = 0. - mov r3, r3, lsl #9 - teq r3, #0x80000000 + @ Round the result, merge final exponent. + cmp r3, #0x80000000 + adc r0, r0, r2, lsl #23 biceq r0, r0, #1 - - @ Note: rounding may have produced an extra MSB here. - @ The extra bit is cleared before merging the exponent below. - tst r0, #0x01000000 - addne r2, r2, #(1 << 22) - - @ Check for exponent overflow - cmp r2, #(255 << 22) - bge LSYM(Lml_o) - - @ Add final exponent. 
- bic r0, r0, #0x01800000 - orr r0, r0, r2, lsl #1 RET - @ Result is 0, but determine sign anyway. -LSYM(Lml_z): - eor r0, r0, r1 - bic r0, r0, #0x7fffffff - RET + @ Multiplication by 0x1p*: let''s shortcut a lot of code. +LSYM(Lml_1): + teq r0, #0 + and ip, ip, #0x80000000 + moveq r1, r1, lsl #9 + orr r0, ip, r0, lsr #9 + orr r0, r0, r1, lsr #9 + subs r2, r2, #127 + rsbgts r3, r2, #255 + orrgt r0, r0, r2, lsl #23 + RETc(gt) + + @ Under/overflow: fix things up for the code below. + orr r0, r0, #0x00800000 + mov r3, #0 + subs r2, r2, #1 + +LSYM(Lml_u): + @ Overflow? + bgt LSYM(Lml_o) @ Check if denormalized result is possible, otherwise return signed 0. -LSYM(Lml_u): - cmn r2, #(24 << 22) + cmn r2, #(24 + 1) + bicle r0, r0, #0x7fffffff RETc(le) - @ Find out proper shift value. - mvn r1, r2, asr #22 - subs r1, r1, #7 - bgt LSYM(Lml_ur) - - @ Shift value left, round, etc. - add r1, r1, #32 - orrs r0, r0, r3, lsr r1 - rsb r1, r1, #32 - adc r0, r0, ip, lsl r1 - mov ip, r3, lsl r1 - teq ip, #0x80000000 - biceq r0, r0, #1 - RET - @ Shift value right, round, etc. - @ Note: r1 must not be 0 otherwise carry does not get set. -LSYM(Lml_ur): - orrs r0, r0, ip, lsr r1 + rsb r2, r2, #0 + movs r1, r0, lsl #1 + mov r1, r1, lsr r2 + rsb r2, r2, #32 + mov ip, r0, lsl r2 + movs r0, r1, rrx adc r0, r0, #0 - rsb r1, r1, #32 - mov ip, ip, lsl r1 - teq r3, #0 - teqeq ip, #0x80000000 - biceq r0, r0, #1 + orrs r3, r3, ip, lsl #1 + biceq r0, r0, ip, lsr #31 RET @ One or both arguments are denormalized. @@ -567,32 +522,51 @@ LSYM(Lml_d): and ip, r0, #0x80000000 1: moveq r0, r0, lsl #1 tsteq r0, #0x00800000 - subeq r2, r2, #(1 << 22) + subeq r2, r2, #1 beq 1b orr r0, r0, ip teq r3, #0 and ip, r1, #0x80000000 2: moveq r1, r1, lsl #1 tsteq r1, #0x00800000 - subeq r3, r3, #(1 << 23) + subeq r3, r3, #1 beq 2b orr r1, r1, ip b LSYM(Lml_x) - @ One or both args are INF or NAN. LSYM(Lml_s): + @ Isolate the INF and NAN cases away + and r3, ip, r1, lsr #23 + teq r2, ip + teqne r3, ip + beq 1f + + @ Here, one or more arguments are either denormalized or zero. + bics ip, r0, #0x80000000 + bicnes ip, r1, #0x80000000 + bne LSYM(Lml_d) + + @ Result is 0, but determine sign anyway. +LSYM(Lml_z): + eor r0, r0, r1 + bic r0, r0, #0x7fffffff + RET + +1: @ One or both args are INF or NAN. teq r0, #0x0 - teqne r1, #0x0 teqne r0, #0x80000000 + moveq r0, r1 + teqne r1, #0x0 teqne r1, #0x80000000 beq LSYM(Lml_n) @ 0 * INF or INF * 0 -> NAN - teq r2, ip, lsr #1 + teq r2, ip bne 1f movs r2, r0, lsl #9 bne LSYM(Lml_n) @ NAN * -> NAN -1: teq r3, ip, lsr #1 +1: teq r3, ip bne LSYM(Lml_i) movs r3, r1, lsl #9 + movne r0, r1 bne LSYM(Lml_n) @ * NAN -> NAN @ Result is INF, but we need to determine its sign. @@ -606,9 +580,9 @@ LSYM(Lml_o): orr r0, r0, #0x00800000 RET - @ Return NAN. + @ Return a quiet NAN. LSYM(Lml_n): - mov r0, #0x7f000000 + orr r0, r0, #0x7f000000 orr r0, r0, #0x00c00000 RET @@ -617,37 +591,28 @@ LSYM(Lml_n): ARM_FUNC_START divsf3 ARM_FUNC_ALIAS aeabi_fdiv divsf3 - - @ Mask out exponents. - mov ip, #0xff000000 - and r2, r0, ip, lsr #1 - and r3, r1, ip, lsr #1 - @ Trap any INF/NAN or zeroes. - teq r2, ip, lsr #1 - teqne r3, ip, lsr #1 - bicnes ip, r0, #0x80000000 - bicnes ip, r1, #0x80000000 + @ Mask out exponents, trap any zero/denormal/INF/NAN. + mov ip, #0xff + ands r2, ip, r0, lsr #23 + andnes r3, ip, r1, lsr #23 + teqne r2, ip + teqne r3, ip beq LSYM(Ldv_s) - - @ Shift exponents right one bit to make room for overflow bit. - @ If either of them is 0, scale denormalized arguments off line. 
- @ Then substract divisor exponent from dividend''s. - movs r2, r2, lsr #1 - teqne r3, #0 - beq LSYM(Ldv_d) LSYM(Ldv_x): - sub r2, r2, r3, asr #1 + + @ Substract divisor exponent from dividend''s + sub r2, r2, r3 @ Preserve final sign into ip. eor ip, r0, r1 @ Convert mantissa to unsigned integer. @ Dividend -> r3, divisor -> r1. - mov r3, #0x10000000 movs r1, r1, lsl #9 mov r0, r0, lsl #9 beq LSYM(Ldv_1) + mov r3, #0x10000000 orr r1, r3, r1, lsr #4 orr r3, r3, r0, lsr #4 @@ -655,16 +620,10 @@ LSYM(Ldv_x): and r0, ip, #0x80000000 @ Ensure result will land to known bit position. + @ Apply exponent bias accordingly. cmp r3, r1 - subcc r2, r2, #(1 << 22) movcc r3, r3, lsl #1 - - @ Apply exponent bias, check range for over/underflow. - add r2, r2, #(127 << 22) - cmn r2, #(24 << 22) - RETc(le) - cmp r2, #(255 << 22) - bge LSYM(Lml_o) + adc r2, r2, #(127 - 2) @ The actual division loop. mov ip, #0x00800000 @@ -684,44 +643,29 @@ LSYM(Ldv_x): movnes ip, ip, lsr #4 bne 1b - @ Check if denormalized result is needed. - cmp r2, #0 - ble LSYM(Ldv_u) + @ Check exponent for under/overflow. + cmp r2, #(254 - 1) + bhi LSYM(Lml_u) - @ Apply proper rounding. + @ Round the result, merge final exponent. cmp r3, r1 - addcs r0, r0, #1 + adc r0, r0, r2, lsl #23 biceq r0, r0, #1 - - @ Add exponent to result. - bic r0, r0, #0x00800000 - orr r0, r0, r2, lsl #1 RET @ Division by 0x1p*: let''s shortcut a lot of code. LSYM(Ldv_1): and ip, ip, #0x80000000 orr r0, ip, r0, lsr #9 - add r2, r2, #(127 << 22) - cmp r2, #(255 << 22) - bge LSYM(Lml_o) - cmp r2, #0 - orrgt r0, r0, r2, lsl #1 + adds r2, r2, #127 + rsbgts r3, r2, #255 + orrgt r0, r0, r2, lsl #23 RETc(gt) - cmn r2, #(24 << 22) - movle r0, ip - RETc(le) + orr r0, r0, #0x00800000 mov r3, #0 - - @ Result must be denormalized: prepare parameters to use code above. - @ r3 already contains remainder for rounding considerations. -LSYM(Ldv_u): - bic ip, r0, #0x80000000 - and r0, r0, #0x80000000 - mvn r1, r2, asr #22 - add r1, r1, #2 - b LSYM(Lml_ur) + subs r2, r2, #1 + b LSYM(Lml_u) @ One or both arguments are denormalized. @ Scale them leftwards and preserve sign bit. @@ -730,35 +674,40 @@ LSYM(Ldv_d): and ip, r0, #0x80000000 1: moveq r0, r0, lsl #1 tsteq r0, #0x00800000 - subeq r2, r2, #(1 << 22) + subeq r2, r2, #1 beq 1b orr r0, r0, ip teq r3, #0 and ip, r1, #0x80000000 2: moveq r1, r1, lsl #1 tsteq r1, #0x00800000 - subeq r3, r3, #(1 << 23) + subeq r3, r3, #1 beq 2b orr r1, r1, ip b LSYM(Ldv_x) - @ One or both arguments is either INF, NAN or zero. + @ One or both arguments are either INF, NAN, zero or denormalized. LSYM(Ldv_s): - mov ip, #0xff000000 - teq r2, ip, lsr #1 - teqeq r3, ip, lsr #1 - beq LSYM(Lml_n) @ INF/NAN / INF/NAN -> NAN - teq r2, ip, lsr #1 + and r3, ip, r1, lsr #23 + teq r2, ip bne 1f movs r2, r0, lsl #9 bne LSYM(Lml_n) @ NAN / -> NAN - b LSYM(Lml_i) @ INF / -> INF -1: teq r3, ip, lsr #1 + teq r3, ip + bne LSYM(Lml_i) @ INF / -> INF + mov r0, r1 + b LSYM(Lml_n) @ INF / (INF or NAN) -> NAN +1: teq r3, ip bne 2f movs r3, r1, lsl #9 - bne LSYM(Lml_n) @ / NAN -> NAN - b LSYM(Lml_z) @ / INF -> 0 -2: @ One or both arguments are 0. + beq LSYM(Lml_z) @ / INF -> 0 + mov r0, r1 + b LSYM(Lml_n) @ / NAN -> NAN +2: @ If both are non-zero, we need to normalize and resume above. + bics ip, r0, #0x80000000 + bicnes ip, r1, #0x80000000 + bne LSYM(Ldv_d) + @ One or both arguments are zero. 
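The division loop above is a classic restoring division unrolled to retire four quotient bits per pass: the remainder is tested against the divisor at four successive alignments, each hit subtracts and sets the corresponding quotient bit, and the remainder is then scaled up by 16 for the next pass. A C model of that scheme; it assumes operands aligned the way the code above sets them up (divisor with its leading 1 at bit 28, so the right shifts and the left scaling never lose or overflow bits), and the names are illustrative.

#include <stdint.h>

static uint32_t divide_mantissa(uint32_t dividend, uint32_t divisor,
                                uint32_t *rem, int nbits)
{
    uint32_t q = 0, r = dividend;

    for (int done = 0; done < nbits; done += 4) {
        for (int i = 0; i < 4; i++) {         /* four alignments per pass */
            q <<= 1;
            if (r >= (divisor >> i)) {
                r -= divisor >> i;
                q |= 1;
            }
        }
        r <<= 4;                              /* rescale the remainder */
    }
    *rem = r;                                 /* kept for the rounding step */
    return q;
}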
bics r2, r0, #0x80000000 bne LSYM(Lml_i) @ / 0 -> INF bics r3, r1, #0x80000000 @@ -789,85 +738,50 @@ LSYM(Ldv_s): ARM_FUNC_START gtsf2 ARM_FUNC_ALIAS gesf2 gtsf2 - mov r3, #-1 + mov ip, #-1 b 1f ARM_FUNC_START ltsf2 ARM_FUNC_ALIAS lesf2 ltsf2 - mov r3, #1 + mov ip, #1 b 1f ARM_FUNC_START cmpsf2 ARM_FUNC_ALIAS nesf2 cmpsf2 ARM_FUNC_ALIAS eqsf2 cmpsf2 - mov r3, #1 @ how should we specify unordered here? + mov ip, #1 @ how should we specify unordered here? - @ Both Inf and NaN have an exponent of 255. Therefore, we - @ compute (r1 & 0x8f80000) || (r2 & 0x8f8000). -1: mov ip, #0xff000000 - and r2, r1, ip, lsr #1 - teq r2, ip, lsr #1 - and r2, r0, ip, lsr #1 - teqne r2, ip, lsr #1 +1: str ip, [sp, #-4] + + @ Trap any INF/NAN first. + mov r2, r0, lsl #1 + mov r3, r1, lsl #1 + mvns ip, r2, asr #24 + mvnnes ip, r3, asr #24 beq 3f - @ Test for equality. The representations of +0.0 and -0.0 - @ have all bits set to zero, except for the sign bit. Since - @ 0.0 is equal to -0.0, we begin by testing - @ ((r0 | r1) & ~0x8000000). -2: orr r3, r0, r1 - @ If the result of the bitwise and is zero, then the Z flag - @ will be set. In any case, the C flag will be set. - bics r3, r3, #0x80000000 @ either 0.0 or -0.0 - teqne r0, r1 @ or both the same - @ If the Z flag is set, the two operands were equal. Return zero. - moveq r0, #0 - RETc(eq) + @ Compare values. + @ Note that 0.0 is equal to -0.0. +2: orrs ip, r2, r3, lsr #1 @ test if both are 0, clear C flag + teqne r0, r1 @ if not 0 compare sign + subpls r0, r2, r3 @ if same sign compare values, set r0 - @ Check for sign difference. The N flag is set (due to the - @ use of teq above) if the sign bit is set on exactly one - @ of the operands. Return the sign of the first operand. - movmi r0, r0, asr #31 - orrmi r0, r0, #1 - RETc(mi) - - @ Compare exponents. - and r3, r1, ip, lsr #1 - cmp r2, r3 - - @ Compare mantissa if exponents are equal - moveq r0, r0, lsl #9 - cmpeq r0, r1, lsl #9 - - @ We know the operands cannot be equal at this point, so the - @ Z flag is clear. The C flag is set if the first operand has - @ the greater exponent, or the exponents are equal and the - @ first operand has the greater mantissa. Therefore, if the C - @ flag is set, the first operand is greater iff the sign is - @ positive. These next two instructions will put zero in - @ r0 if the first operand is greater, and -1 if the second - @ operand is greater. - movcs r0, r1, asr #31 - mvncc r0, r1, asr #31 - @ If r0 is 0, the first operand is greater, so return 1. Leave - @ -1 unchanged. - orr r0, r0, #1 + @ Result: + movhi r0, r1, asr #31 + mvnlo r0, r1, asr #31 + orrne r0, r0, #1 RET - @ We know that at least one argument is either Inf or NaN. - @ Look for a NaN. -3: and r2, r1, ip, lsr #1 - teq r2, ip, lsr #1 + @ Look for a NAN. +3: mvns ip, r2, asr #24 bne 4f - movs r2, r1, lsl #9 - bne 5f @ r1 is NAN -4: and r2, r0, ip, lsr #1 - teq r2, ip, lsr #1 - bne 2b movs ip, r0, lsl #9 - beq 2b @ r0 is not NAN -5: @ The Z flag is clear at this point. - mov r0, r3 @ return unordered code from r3. + bne 5f @ r0 is NAN +4: mvns ip, r3, asr #24 + bne 2b + movs ip, r1, lsl #9 + beq 2b @ r1 is not NAN +5: ldr r0, [sp, #-4] @ return unordered code. RET FUNC_END gesf2 @@ -879,13 +793,15 @@ ARM_FUNC_ALIAS eqsf2 cmpsf2 FUNC_END cmpsf2 ARM_FUNC_START aeabi_cfrcmple + mov ip, r0 mov r0, r1 mov r1, ip b 6f - + ARM_FUNC_START aeabi_cfcmpeq ARM_FUNC_ALIAS aeabi_cfcmple aeabi_cfcmpeq + @ The status-returning routines are required to preserve all @ registers except ip, lr, and cpsr. 
@@ -879,13 +793,15 @@ ARM_FUNC_ALIAS eqsf2 cmpsf2
 	FUNC_END cmpsf2
 
 ARM_FUNC_START aeabi_cfrcmple
+
 	mov	ip, r0
 	mov	r0, r1
 	mov	r1, ip
 	b	6f
-
+
 ARM_FUNC_START aeabi_cfcmpeq
 ARM_FUNC_ALIAS aeabi_cfcmple aeabi_cfcmpeq
+
 	@ The status-returning routines are required to preserve all
 	@ registers except ip, lr, and cpsr.
 6:	stmfd	sp!, {r0, r1, r2, r3, lr}
@@ -896,68 +812,79 @@ ARM_FUNC_ALIAS aeabi_cfcmple aeabi_cfcmpeq
 	@ that the first operand was smaller than the second.
 	cmnmi	r0, #0
 	RETLDM	"r0, r1, r2, r3"
+
 	FUNC_END aeabi_cfcmple
 	FUNC_END aeabi_cfcmpeq
-
+	FUNC_END aeabi_cfrcmple
+
 ARM_FUNC_START aeabi_fcmpeq
+	str	lr, [sp, #-4]!
 	ARM_CALL aeabi_cfcmple
 	moveq	r0, #1			@ Equal to.
 	movne	r0, #0			@ Less than, greater than, or unordered.
 	RETLDM
+
 	FUNC_END aeabi_fcmpeq
 
 ARM_FUNC_START aeabi_fcmplt
+	str	lr, [sp, #-4]!
 	ARM_CALL aeabi_cfcmple
 	movcc	r0, #1			@ Less than.
 	movcs	r0, #0			@ Equal to, greater than, or unordered.
 	RETLDM
+
 	FUNC_END aeabi_fcmplt
 
 ARM_FUNC_START aeabi_fcmple
+	str	lr, [sp, #-4]!
 	ARM_CALL aeabi_cfcmple
 	movls	r0, #1			@ Less than or equal to.
 	movhi	r0, #0			@ Greater than or unordered.
 	RETLDM
+
	FUNC_END aeabi_fcmple
 
 ARM_FUNC_START aeabi_fcmpge
+	str	lr, [sp, #-4]!
 	ARM_CALL aeabi_cfrcmple
 	movls	r0, #1			@ Operand 2 is less than or equal to operand 1.
 	movhi	r0, #0			@ Operand 2 greater than operand 1, or unordered.
 	RETLDM
+
 	FUNC_END aeabi_fcmpge
 
 ARM_FUNC_START aeabi_fcmpgt
+	str	lr, [sp, #-4]!
 	ARM_CALL aeabi_cfrcmple
 	movcc	r0, #1			@ Operand 2 is less than operand 1.
 	movcs	r0, #0			@ Operand 2 is greater than or equal to operand 1,
 					@ or they are unordered.
 	RETLDM
+
 	FUNC_END aeabi_fcmpgt
-
+
 #endif /* L_cmpsf2 */
 
 #ifdef L_unordsf2
 
 ARM_FUNC_START unordsf2
 ARM_FUNC_ALIAS aeabi_fcmpun unordsf2
-
-	mov	ip, #0xff000000
-	and	r2, r1, ip, lsr #1
-	teq	r2, ip, lsr #1
+
+	mov	r2, r0, lsl #1
+	mov	r3, r1, lsl #1
+	mvns	ip, r2, asr #24
 	bne	1f
-	movs	r2, r1, lsl #9
-	bne	3f			@ r1 is NAN
-1:	and	r2, r0, ip, lsr #1
-	teq	r2, ip, lsr #1
-	bne	2f
-	movs	r2, r0, lsl #9
+	movs	ip, r0, lsl #9
 	bne	3f			@ r0 is NAN
+1:	mvns	ip, r3, asr #24
+	bne	2f
+	movs	ip, r1, lsl #9
+	bne	3f			@ r1 is NAN
 2:	mov	r0, #0			@ arguments are ordered.
 	RET
 3:	mov	r0, #1			@ arguments are unordered.
@@ -972,37 +899,35 @@ ARM_FUNC_ALIAS aeabi_fcmpun unordsf2
 
 ARM_FUNC_START fixsfsi
 ARM_FUNC_ALIAS aeabi_f2iz fixsfsi
-	movs	r0, r0, lsl #1
-	RETc(eq)			@ value is 0.
-
-	mov	r1, r1, rrx		@ preserve C flag (the actual sign)
 
 	@ check exponent range.
-	and	r2, r0, #0xff000000
+	mov	r2, r0, lsl #1
 	cmp	r2, #(127 << 24)
-	movcc	r0, #0			@ value is too small
-	RETc(cc)
-	cmp	r2, #((127 + 31) << 24)
-	bcs	1f			@ value is too large
+	bcc	1f			@ value is too small
+	mov	r3, #(127 + 31)
+	subs	r2, r3, r2, lsr #24
+	bls	2f			@ value is too large
 
-	mov	r0, r0, lsl #7
-	orr	r0, r0, #0x80000000
-	mov	r2, r2, lsr #24
-	rsb	r2, r2, #(127 + 31)
-	tst	r1, #0x80000000		@ the sign bit
-	mov	r0, r0, lsr r2
+	@ scale value
+	mov	r3, r0, lsl #8
+	orr	r3, r3, #0x80000000
+	tst	r0, #0x80000000		@ the sign bit
+	mov	r0, r3, lsr r2
 	rsbne	r0, r0, #0
 	RET
 
-1:	teq	r2, #0xff000000
-	bne	2f
-	movs	r0, r0, lsl #8
-	bne	3f			@ r0 is NAN.
-2:	ands	r0, r1, #0x80000000	@ the sign bit
+1:	mov	r0, #0
+	RET
+
+2:	cmp	r2, #(127 + 31 - 0xff)
+	bne	3f
+	movs	r2, r0, lsl #9
+	bne	4f			@ r0 is NAN.
+3:	ands	r0, r0, #0x80000000	@ the sign bit
 	moveq	r0, #0x7fffffff		@ the maximum signed positive si
 	RET
 
-3:	mov	r0, #0			@ What should we convert NAN to?
+4:	mov	r0, #0			@ What should we convert NAN to?
 	RET
 
 	FUNC_END aeabi_f2iz
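The new fixsfsi works directly on the operand shifted left by one: a biased exponent below 127 truncates to 0, exponents of 158 and above saturate (after the NAN check), and in-range values are rebuilt as a 1.xx mantissa at bit 31 and shifted right by 158 minus the exponent. A small C model of this behaviour follows, under a hypothetical name; the real entry points are fixsfsi / aeabi_f2iz.

    #include <stdint.h>

    /* Hedged C model of the truncating float -> int32 conversion above. */
    static int32_t fixsfsi_model(uint32_t x)
    {
        uint32_t exp = (x << 1) >> 24;              /* biased exponent */

        if (exp < 127)                              /* |value| < 1.0 */
            return 0;
        if (exp >= 127 + 31) {                      /* overflow, INF or NAN */
            if (exp == 0xff && (x << 9) != 0)
                return 0;                           /* NAN -> 0, same choice as the asm */
            return (x >> 31) ? INT32_MIN : INT32_MAX;
        }

        /* Mantissa with its implicit leading 1 placed at bit 31, then
           shifted down by (158 - exp) to truncate toward zero. */
        uint32_t mant = (x << 8) | 0x80000000u;
        int32_t  val  = (int32_t)(mant >> (127 + 31 - exp));
        return (x >> 31) ? -val : val;
    }

As in the assembly, out-of-range values saturate to INT32_MAX or INT32_MIN and NAN is arbitrarily converted to 0.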
@@ -1014,34 +939,33 @@ ARM_FUNC_ALIAS aeabi_f2iz fixsfsi
 
 ARM_FUNC_START fixunssfsi
 ARM_FUNC_ALIAS aeabi_f2uiz fixunssfsi
-	movs	r0, r0, lsl #1
-	movcss	r0, #0			@ value is negative...
-	RETc(eq)			@ ... or 0.
-
 	@ check exponent range.
-	and	r2, r0, #0xff000000
+	movs	r2, r0, lsl #1
+	bcs	1f			@ value is negative
 	cmp	r2, #(127 << 24)
-	movcc	r0, #0			@ value is too small
-	RETc(cc)
-	cmp	r2, #((127 + 32) << 24)
-	bcs	1f			@ value is too large
+	bcc	1f			@ value is too small
+	mov	r3, #(127 + 31)
+	subs	r2, r3, r2, lsr #24
+	bmi	2f			@ value is too large
 
-	mov	r0, r0, lsl #7
-	orr	r0, r0, #0x80000000
-	mov	r2, r2, lsr #24
-	rsb	r2, r2, #(127 + 31)
-	mov	r0, r0, lsr r2
+	@ scale the value
+	mov	r3, r0, lsl #8
+	orr	r3, r3, #0x80000000
+	mov	r0, r3, lsr r2
 	RET
 
-1:	teq	r2, #0xff000000
-	bne	2f
-	movs	r0, r0, lsl #8
-	bne	3f			@ r0 is NAN.
-2:	mov	r0, #0xffffffff		@ maximum unsigned si
+1:	mov	r0, #0
 	RET
 
-3:	mov	r0, #0			@ What should we convert NAN to?
+2:	cmp	r2, #(127 + 31 - 0xff)
+	bne	3f
+	movs	r2, r0, lsl #9
+	bne	4f			@ r0 is NAN.
+3:	mov	r0, #0xffffffff		@ maximum unsigned si
+	RET
+
+4:	mov	r0, #0			@ What should we convert NAN to?
 	RET
 
 	FUNC_END aeabi_f2uiz
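The unsigned variant follows the same pattern with three differences visible above: negative inputs (carry set by the first movs) go to 0, the usable exponent range is one step larger since all 32 result bits are available, and overflow saturates to 0xffffffff. A matching hedged C sketch, again under a made-up helper name:

    #include <stdint.h>

    /* Hedged C model of the truncating float -> uint32 conversion above. */
    static uint32_t fixunssfsi_model(uint32_t x)
    {
        uint32_t exp = (x << 1) >> 24;          /* biased exponent */

        if ((x >> 31) || exp < 127)             /* negative or |value| < 1.0 */
            return 0;
        if (exp >= 127 + 32) {                  /* too large, INF or NAN */
            if (exp == 0xff && (x << 9) != 0)
                return 0;                       /* NAN -> 0 */
            return 0xffffffffu;
        }
        uint32_t mant = (x << 8) | 0x80000000u; /* implicit 1 at bit 31 */
        return mant >> (127 + 31 - exp);        /* shift is 0..31 here */
    }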