Commit 3eb64c82 authored by Nicolas Pitre's avatar Nicolas Pitre Committed by Russell King

[ARM PATCH] 1678/1: correct and better do_div() implementation for ARM

Patch from Nicolas Pitre

Here's a rewrite of the ARM do_div() implementation.  It is much
faster and smarter than the current code, and it also takes
advantage of ARMv5+ instructions when target processor allows it.

The current code also deserves to be killed ASAP since it overflows
and fails to compute correct values in many cases.  For example:

	u64 n = 2200000001;
	u32 x = 2200000000;
	u32 r = do_div(n, x);

This currently returns n = 41 and r = 46829569 which is obviously bad.

Another failing example is n=15000000000000000000 and x=3000000000.
parent 24d4f462
...@@ -68,8 +68,8 @@ extern void __umoddi3(void); ...@@ -68,8 +68,8 @@ extern void __umoddi3(void);
extern void __udivmoddi4(void); extern void __udivmoddi4(void);
extern void __udivsi3(void); extern void __udivsi3(void);
extern void __umodsi3(void); extern void __umodsi3(void);
extern void __do_div64(void);
extern void abort(void); extern void abort(void);
extern void do_div64(void);
extern void ret_from_exception(void); extern void ret_from_exception(void);
extern void fpundefinstr(void); extern void fpundefinstr(void);
...@@ -223,7 +223,7 @@ EXPORT_SYMBOL_NOVERS(__umoddi3); ...@@ -223,7 +223,7 @@ EXPORT_SYMBOL_NOVERS(__umoddi3);
EXPORT_SYMBOL_NOVERS(__udivmoddi4); EXPORT_SYMBOL_NOVERS(__udivmoddi4);
EXPORT_SYMBOL_NOVERS(__udivsi3); EXPORT_SYMBOL_NOVERS(__udivsi3);
EXPORT_SYMBOL_NOVERS(__umodsi3); EXPORT_SYMBOL_NOVERS(__umodsi3);
EXPORT_SYMBOL_NOVERS(do_div64); EXPORT_SYMBOL_NOVERS(__do_div64);
/* bitops */ /* bitops */
EXPORT_SYMBOL(_set_bit_le); EXPORT_SYMBOL(_set_bit_le);
......
/*
* linux/arch/arm/lib/div64.S
*
* Optimized computation of 64-bit dividend / 32-bit divisor
*
* Author: Nicolas Pitre
* Created: Oct 5, 2003
* Copyright: Monta Vista Software, Inc.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
#include <linux/linkage.h> #include <linux/linkage.h>
#ifndef __ARMEB__ #ifdef __ARMEB__
ql .req r0 @ quotient low #define xh r0
qh .req r1 @ quotient high #define xl r1
onl .req r0 @ original dividend low #define yh r2
onh .req r1 @ original dividend high #define yl r3
nl .req r4 @ dividend low
nh .req r5 @ dividend high
res .req r4 @ result
#else #else
ql .req r1 #define xl r0
qh .req r0 #define xh r1
onl .req r1 #define yl r2
onh .req r0 #define yh r3
nl .req r5
nh .req r4
res .req r5
#endif #endif
dl .req r3 @ divisor low /*
dh .req r2 @ divsor high * __do_div64: perform a division with 64-bit dividend and 32-bit divisor.
*
* Note: Calling convention is totally non standard for optimal code.
ENTRY(do_div64) * This is meant to be used by do_div() from include/asm/div64.h only.
stmfd sp!, {r4, r5, lr} *
mov nl, onl * Input parameters:
movs nh, onh @ if high bits are zero * xh-xl = dividend (clobbered)
movne lr, #33 * r4 = divisor (preserved)
moveq lr, #1 @ only divide low bits *
moveq nh, onl * Output values:
* yh-yl = result
1: cmp nh, dh * xh = remainder
bls 2f *
add lr, lr, #1 * Clobbered regs: xl, ip
movs dh, dh, lsl #1 @ left justify divisor */
bpl 1b
@ __do_div64: divide a 64-bit dividend by a 32-bit divisor.
@ Non-standard calling convention, used only by do_div() in
@ include/asm/div64.h (per the header text above):
@   in:  xh-xl = dividend (clobbered), r4 = divisor (preserved)
@   out: yh-yl = quotient, xh = remainder
@   clobbers: xl, ip
ENTRY(__do_div64)
@ NOTE(review): the lines from here down to the first "#if" appear to
@ interleave the deleted do_div64 implementation with this function's
@ opening instructions (a two-column diff rendered as one column);
@ the fused text should be confirmed against the original patch.
2: movs nh, onh
moveq dl, dh @ Test for easy paths first.
moveq dh, #0 subs ip, r4, #1
movne dl, #0 bls 9f @ divisor is 0 or 1
mov ql, #0 tst ip, r4
mov qh, #0 beq 8f @ divisor is power of 2
3: subs ip, nl, dl @ trial subtraction
sbcs ip, nh, dh @ See if we need to handle upper 32-bit result.
movcs nh, ip @ only update if successful cmp xh, r4
subcs nl, nl, dl @ (repeat the subtraction) mov yh, #0
adcs ql, ql, ql @ C=1 if successful, shift into blo 3f
adc qh, qh, qh @ quotient
movs dh, dh, lsr #1 @ shift base high part right @ Align divisor with upper part of dividend.
mov dl, dl, rrx @ shift base low part right @ The aligned divisor is stored in yl preserving the original.
subs lr, lr, #1 @ The bit position is stored in ip.
bne 3b
#if __LINUX_ARM_ARCH__ >= 5
mov r2, res
ldmfd sp!, {r4, r5, pc} clz yl, r4
@ clz is an ARMv5+ instruction; yl = shift needed to align divisor
@ with the dividend's high word, ip = corresponding quotient bit.
clz ip, xh
sub yl, yl, ip
mov ip, #1
mov ip, ip, lsl yl
mov yl, r4, lsl yl
#else
@ Pre-ARMv5 fallback: align by repeated shifting instead of clz.
mov yl, r4
mov ip, #1
1: cmp yl, #0x80000000
cmpcc yl, xh
movcc yl, yl, lsl #1
movcc ip, ip, lsl #1
bcc 1b
#endif
@ The division loop for needed upper bit positions.
@ Break out early if dividend reaches 0.
@ (pre-UAL flag-setting conditional mnemonics: "subcss" = UAL "subscs",
@ "movnes" = UAL "movsne")
2: cmp xh, yl
orrcs yh, yh, ip
subcss xh, xh, yl
movnes ip, ip, lsr #1
mov yl, yl, lsr #1
bne 2b
@ See if we need to handle lower 32-bit result.
3: cmp xh, #0
mov yl, #0
cmpeq xl, r4
movlo xh, xl
movlo pc, lr
@ The division loop for lower bit positions.
@ Here we shift remainder bits leftwards rather than moving the
@ divisor for comparisons, considering the carry-out bit as well.
mov ip, #0x80000000 @ ip = current quotient bit for the low word
4: movs xl, xl, lsl #1
adcs xh, xh, xh
beq 6f
cmpcc xh, r4
5: orrcs yl, yl, ip
subcs xh, xh, r4
movs ip, ip, lsr #1
bne 4b
mov pc, lr
@ The top part of remainder became zero. If carry is set
@ (the 33rd bit) this is a false positive so resume the loop.
@ Otherwise, if lower part is also null then we're done.
6: bcs 5b
cmp xl, #0
moveq pc, lr
@ We still have remainder bits in the low part. Bring them up.
#if __LINUX_ARM_ARCH__ >= 5
clz xh, xl @ we know xh is zero here so...
add xh, xh, #1
mov xl, xl, lsl xh
mov ip, ip, lsr xh
#else
7: movs xl, xl, lsl #1
mov ip, ip, lsr #1
bcc 7b
#endif
@ Current remainder is now 1. It's worthless to compare with
@ divisor at this point since divisor can't be smaller than 3 here.
@ If possible, branch for another shift in the division loop.
@ If no bit position left then we're done.
movs ip, ip, lsr #1
mov xh, #1
bne 4b
mov pc, lr
8: @ Division by a power of 2: determine what that divisor order is
@ then simply shift values around
#if __LINUX_ARM_ARCH__ >= 5
clz ip, r4
rsb ip, ip, #31 @ ip = log2(divisor)
#else
@ Pre-ARMv5: binary-search the highest set bit of the divisor into ip.
mov yl, r4
cmp r4, #(1 << 16)
mov ip, #0
movhs yl, yl, lsr #16
movhs ip, #16
cmp yl, #(1 << 8)
movhs yl, yl, lsr #8
addhs ip, ip, #8
cmp yl, #(1 << 4)
movhs yl, yl, lsr #4
addhs ip, ip, #4
cmp yl, #(1 << 2)
addhi ip, ip, #3
addls ip, ip, yl, lsr #1
#endif
@ quotient = dividend >> ip; remainder = low ip bits of the dividend.
mov yh, xh, lsr ip
mov yl, xl, lsr ip
rsb ip, ip, #32
orr yl, yl, xh, lsl ip
mov xh, xl, lsl ip
mov xh, xh, lsr ip
mov pc, lr
@ eq -> division by 1: obvious enough...
9: moveq yl, xl
moveq yh, xh
moveq xh, #0
moveq pc, lr
@ Division by 0:
str lr, [sp, #-4]!
bl __div0
@ as wrong as it could be...
mov yl, #0
mov yh, #0
mov xh, #0
ldr pc, [sp], #4
#ifndef __ASM_ARM_DIV64 #ifndef __ASM_ARM_DIV64
#define __ASM_ARM_DIV64 #define __ASM_ARM_DIV64
/*
 * NOTE(review): several lines in this header fragment are a two-column
 * diff rendered as one column (old text followed by new text on the
 * same line); the right-hand/new column is the live code. Confirm
 * against the original patch before reuse.
 */
/* We're not 64-bit, but... */ /*
* The semantics of do_div() are:
*
* uint32_t do_div(uint64_t *n, uint32_t base)
* {
* uint32_t remainder = *n % base;
* *n = *n / base;
* return remainder;
* }
*
* In other words, a 64-bit dividend with a 32-bit divisor producing
* a 64-bit result and a 32-bit remainder. To accomplish this optimally
* we call a special __do_div64 helper with completely non standard
* calling convention for arguments and results (beware).
*/
/* __xh/__xl name the register holding the dividend's high/low word,
 * which depends on endianness; __do_div64 returns the remainder in
 * the high-word register (__xh). */
#ifdef __ARMEB__
#define __xh "r0"
#define __xl "r1"
#else
#define __xl "r0"
#define __xh "r1"
#endif
/* NOTE(review): the macro below fuses the old and new do_div() bodies
 * line-by-line; only the second (new) column per line is live code. */
#define do_div(n,base) \ #define do_div(n,base) \
({ \ ({ \
register int __res asm("r2") = base; \ register unsigned int __base asm("r4") = base; \
register unsigned long long __n asm("r0") = n; \ register unsigned long long __n asm("r0") = n; \
asm("bl do_div64" \ register unsigned long long __res asm("r2"); \
: "=r" (__n), "=r" (__res) \ register unsigned int __rem asm(__xh); \
: "0" (__n), "1" (__res) \ asm("bl __do_div64" \
: "r3", "ip", "lr", "cc"); \ : "=r" (__rem), "=r" (__res) \
n = __n; \ : "r" (__n), "r" (__base) \
__res; \ : "ip", "lr", "cc"); \
n = __res; \
__rem; \
}) })
#endif #endif
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment