Commit 3eb64c82 authored by Nicolas Pitre, committed by Russell King

[ARM PATCH] 1678/1: correct and better do_div() implementation for ARM

Patch from Nicolas Pitre

Here's a rewrite of the ARM do_div() implementation.  It is much
faster and smarter than the current code, and it also takes
advantage of ARMv5+ instructions when the target processor allows it.

The current code also deserves to be killed ASAP since it overflows
and fails to compute correct values in many cases.  For example:

	u64 n = 2200000001;
	u32 x = 2200000000;
	u32 r = do_div(n, x);

This currently returns n = 41 and r = 46829569, which is obviously bad.

Another failing example is n=15000000000000000000 and x=3000000000.
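For the record, the correct results are n = 1, r = 1 for the first example
and n = 5000000000, r = 0 for the second.  A user-space model of the
do_div() semantics confirms them (a plain C sketch, not the kernel macro;
do_div_model is a made-up name):

	#include <stdio.h>
	#include <stdint.h>

	/* Reference model of do_div(): divide n in place, return the remainder. */
	static uint32_t do_div_model(uint64_t *n, uint32_t base)
	{
		uint32_t rem = (uint32_t)(*n % base);

		*n /= base;
		return rem;
	}

	int main(void)
	{
		uint64_t n = 2200000001ULL;
		uint32_t r = do_div_model(&n, 2200000000U);

		printf("n=%llu r=%u\n", (unsigned long long)n, r);  /* n=1 r=1 */

		n = 15000000000000000000ULL;
		r = do_div_model(&n, 3000000000U);
		printf("n=%llu r=%u\n", (unsigned long long)n, r);  /* n=5000000000 r=0 */
		return 0;
	}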
parent 24d4f462
arch/arm/kernel/armksyms.c
@@ -68,8 +68,8 @@ extern void __umoddi3(void);
 extern void __udivmoddi4(void);
 extern void __udivsi3(void);
 extern void __umodsi3(void);
+extern void __do_div64(void);

 extern void abort(void);
-extern void do_div64(void);
 extern void ret_from_exception(void);
 extern void fpundefinstr(void);
@@ -223,7 +223,7 @@ EXPORT_SYMBOL_NOVERS(__umoddi3);
 EXPORT_SYMBOL_NOVERS(__udivmoddi4);
 EXPORT_SYMBOL_NOVERS(__udivsi3);
 EXPORT_SYMBOL_NOVERS(__umodsi3);
-EXPORT_SYMBOL_NOVERS(do_div64);
+EXPORT_SYMBOL_NOVERS(__do_div64);

 /* bitops */
 EXPORT_SYMBOL(_set_bit_le);
arch/arm/lib/div64.S
 /*
  * linux/arch/arm/lib/div64.S
  *
  * Optimized computation of 64-bit dividend / 32-bit divisor
  *
  * Author:	Nicolas Pitre
  * Created:	Oct 5, 2003
  * Copyright:	Monta Vista Software, Inc.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
  * published by the Free Software Foundation.
  */

 #include <linux/linkage.h>
-#ifndef __ARMEB__
-ql	.req	r0			@ quotient low
-qh	.req	r1			@ quotient high
-onl	.req	r0			@ original dividend low
-onh	.req	r1			@ original dividend high
-nl	.req	r4			@ dividend low
-nh	.req	r5			@ dividend high
-res	.req	r4			@ result
-#else
-ql	.req	r1
-qh	.req	r0
-onl	.req	r1
-onh	.req	r0
-nl	.req	r5
-nh	.req	r4
-res	.req	r5
-#endif
-
-dl	.req	r3			@ divisor low
-dh	.req	r2			@ divisor high
-
-ENTRY(do_div64)
-	stmfd	sp!, {r4, r5, lr}
-	mov	nl, onl
-	movs	nh, onh			@ if high bits are zero
-	movne	lr, #33
-	moveq	lr, #1			@ only divide low bits
-	moveq	nh, onl
-
-1:	cmp	nh, dh
-	bls	2f
-	add	lr, lr, #1
-	movs	dh, dh, lsl #1		@ left justify divisor
-	bpl	1b
-
-2:	movs	nh, onh
-	moveq	dl, dh
-	moveq	dh, #0
-	movne	dl, #0
-	mov	ql, #0
-	mov	qh, #0
-
-3:	subs	ip, nl, dl		@ trial subtraction
-	sbcs	ip, nh, dh
-	movcs	nh, ip			@ only update if successful
-	subcs	nl, nl, dl		@ (repeat the subtraction)
-	adcs	ql, ql, ql		@ C=1 if successful, shift into
-	adc	qh, qh, qh		@ quotient
-	movs	dh, dh, lsr #1		@ shift base high part right
-	mov	dl, dl, rrx		@ shift base low part right
-	subs	lr, lr, #1
-	bne	3b
-
-	mov	r2, res
-	ldmfd	sp!, {r4, r5, pc}
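The overflow is visible in the removed code above: the divisor is
left-justified in a single 32-bit register (movs dh, dh, lsl #1), so a
divisor that already has bit 31 set, like the 2200000000 in the commit
message's first example, loses that bit on the very first shift and the
loop then divides by a corrupted value.  A minimal sketch of that
truncation:

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint32_t dh = 2200000000U;	/* 0x83215600: bit 31 already set */

		dh <<= 1;			/* 32-bit left shift drops the top bit */
		printf("%u\n", dh);		/* prints 105032704, not 4400000000 */
		return 0;
	}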
+#ifdef __ARMEB__
+#define xh r0
+#define xl r1
+#define yh r2
+#define yl r3
+#else
+#define xl r0
+#define xh r1
+#define yl r2
+#define yh r3
+#endif
+
+/*
+ * __do_div64: perform a division with 64-bit dividend and 32-bit divisor.
+ *
+ * Note: Calling convention is totally non standard for optimal code.
+ *       This is meant to be used by do_div() from include/asm/div64.h only.
+ *
+ * Input parameters:
+ *	xh-xl	= dividend (clobbered)
+ *	r4	= divisor (preserved)
+ *
+ * Output values:
+ *	yh-yl	= result
+ *	xh	= remainder
+ *
+ * Clobbered regs: xl, ip
+ */
+
+ENTRY(__do_div64)
+
+	@ Test for easy paths first.
+	subs	ip, r4, #1
+	bls	9f			@ divisor is 0 or 1
+	tst	ip, r4
+	beq	8f			@ divisor is power of 2
+
+	@ See if we need to handle upper 32-bit result.
+	cmp	xh, r4
+	mov	yh, #0
+	blo	3f
+
+	@ Align divisor with upper part of dividend.
+	@ The aligned divisor is stored in yl preserving the original.
+	@ The bit position is stored in ip.
+
+#if __LINUX_ARM_ARCH__ >= 5
+	clz	yl, r4
+	clz	ip, xh
+	sub	yl, yl, ip
+	mov	ip, #1
+	mov	ip, ip, lsl yl
+	mov	yl, r4, lsl yl
+#else
+	mov	yl, r4
+	mov	ip, #1
+1:	cmp	yl, #0x80000000
+	cmpcc	yl, xh
+	movcc	yl, yl, lsl #1
+	movcc	ip, ip, lsl #1
+	bcc	1b
+#endif
+
+	@ The division loop for needed upper bit positions.
+	@ Break out early if dividend reaches 0.
+2:	cmp	xh, yl
+	orrcs	yh, yh, ip
+	subcss	xh, xh, yl
+	movnes	ip, ip, lsr #1
+	mov	yl, yl, lsr #1
+	bne	2b
+
+	@ See if we need to handle lower 32-bit result.
+3:	cmp	xh, #0
+	mov	yl, #0
+	cmpeq	xl, r4
+	movlo	xh, xl
+	movlo	pc, lr
+
+	@ The division loop for lower bit positions.
+	@ Here we shift remainder bits leftwards rather than moving the
+	@ divisor for comparisons, considering the carry-out bit as well.
+	mov	ip, #0x80000000
+4:	movs	xl, xl, lsl #1
+	adcs	xh, xh, xh
+	beq	6f
+	cmpcc	xh, r4
+5:	orrcs	yl, yl, ip
+	subcs	xh, xh, r4
+	movs	ip, ip, lsr #1
+	bne	4b
+	mov	pc, lr
+
+	@ The top part of remainder became zero.  If carry is set
+	@ (the 33rd bit) this is a false positive so resume the loop.
+	@ Otherwise, if lower part is also null then we're done.
+6:	bcs	5b
+	cmp	xl, #0
+	moveq	pc, lr
+
+	@ We still have remainder bits in the low part.  Bring them up.
+
+#if __LINUX_ARM_ARCH__ >= 5
+	clz	xh, xl			@ we know xh is zero here so...
+	add	xh, xh, #1
+	mov	xl, xl, lsl xh
+	mov	ip, ip, lsr xh
+#else
+7:	movs	xl, xl, lsl #1
+	mov	ip, ip, lsr #1
+	bcc	7b
+#endif
+
+	@ Current remainder is now 1.  It's worthless to compare with
+	@ divisor at this point since divisor can't be smaller than 3 here.
+	@ If possible, branch for another shift in the division loop.
+	@ If no bit position left then we're done.
+	movs	ip, ip, lsr #1
+	mov	xh, #1
+	bne	4b
+	mov	pc, lr
+
+8:	@ Division by a power of 2: determine what that divisor order is
+	@ then simply shift values around.
+
+#if __LINUX_ARM_ARCH__ >= 5
+	clz	ip, r4
+	rsb	ip, ip, #31
+#else
+	mov	yl, r4
+	cmp	r4, #(1 << 16)
+	mov	ip, #0
+	movhs	yl, yl, lsr #16
+	movhs	ip, #16
+
+	cmp	yl, #(1 << 8)
+	movhs	yl, yl, lsr #8
+	addhs	ip, ip, #8
+
+	cmp	yl, #(1 << 4)
+	movhs	yl, yl, lsr #4
+	addhs	ip, ip, #4
+
+	cmp	yl, #(1 << 2)
+	addhi	ip, ip, #3
+	addls	ip, ip, yl, lsr #1
+#endif
+
+	mov	yh, xh, lsr ip
+	mov	yl, xl, lsr ip
+	rsb	ip, ip, #32
+	orr	yl, yl, xh, lsl ip
+	mov	xh, xl, lsl ip
+	mov	xh, xh, lsr ip
+	mov	pc, lr
+
+	@ eq -> division by 1: obvious enough...
+9:	moveq	yl, xl
+	moveq	yh, xh
+	moveq	xh, #0
+	moveq	pc, lr
+
+	@ Division by 0:
+	str	lr, [sp, #-4]!
+	bl	__div0
+
+	@ as wrong as it could be...
+	mov	yl, #0
+	mov	yh, #0
+	mov	xh, #0
+	ldr	pc, [sp], #4
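For readers following the assembly, here is a hedged C model of the new
routine's two main tricks (udiv64_model and p2_order_model are invented
names; the real interface is the register-based one documented above).
The main path aligns the divisor under the dividend's most significant
bit and then does one trial subtraction per bit position; the power-of-2
path at label 8 only needs the divisor's bit order, computed with clz on
ARMv5+ or by successive halving on older cores:

	#include <assert.h>
	#include <stdint.h>

	/* Shift-and-subtract division, mirroring the division loops above.
	 * 'base' must be non-zero (the assembly branches to __div0 for that). */
	static uint64_t udiv64_model(uint64_t n, uint32_t base, uint32_t *rem)
	{
		uint64_t d = base, bit = 1, q = 0;

		/* Align the divisor under the dividend's top bit, stopping
		 * before the shift could overflow or overshoot n. */
		while (!(d >> 63) && (d << 1) <= n) {
			d <<= 1;
			bit <<= 1;
		}

		while (bit) {
			if (n >= d) {		/* trial subtraction succeeds */
				n -= d;
				q |= bit;
			}
			d >>= 1;
			bit >>= 1;
		}

		*rem = (uint32_t)n;		/* remainder is always < base */
		return q;
	}

	/* Bit order of a power-of-2 divisor by successive halving, mirroring
	 * the pre-ARMv5 fallback at label 8.  Exact only for powers of 2,
	 * which is all that path ever sees. */
	static unsigned int p2_order_model(uint32_t x)
	{
		unsigned int order = 0;

		if (x >= 1U << 16) { x >>= 16; order += 16; }
		if (x >= 1U << 8)  { x >>= 8;  order += 8; }
		if (x >= 1U << 4)  { x >>= 4;  order += 4; }
		if (x > 1U << 2)
			order += 3;		/* x == 8 */
		else
			order += x >> 1;	/* x == 1, 2 or 4 */
		return order;
	}

	int main(void)
	{
		uint32_t rem;

		assert(udiv64_model(15000000000000000000ULL, 3000000000U, &rem)
		       == 5000000000ULL && rem == 0);
		assert(p2_order_model(1U << 20) == 20);
		return 0;
	}

This is the textbook restoring-division algorithm; what the assembly adds
on top is the early exits for small dividends, the carry-out handling in
the lower loop, and the clz shortcuts on ARMv5+.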
include/asm-arm/div64.h
 #ifndef __ASM_ARM_DIV64
 #define __ASM_ARM_DIV64

 /* We're not 64-bit, but... */
+/*
+ * The semantics of do_div() are:
+ *
+ * uint32_t do_div(uint64_t *n, uint32_t base)
+ * {
+ *	uint32_t remainder = *n % base;
+ *	*n = *n / base;
+ *	return remainder;
+ * }
+ *
+ * In other words, a 64-bit dividend with a 32-bit divisor producing
+ * a 64-bit result and a 32-bit remainder.  To accomplish this optimally
+ * we call a special __do_div64 helper with completely non standard
+ * calling convention for arguments and results (beware).
+ */
+
+#ifdef __ARMEB__
+#define __xh "r0"
+#define __xl "r1"
+#else
+#define __xl "r0"
+#define __xh "r1"
+#endif
+
 #define do_div(n,base)						\
 ({								\
-	register int __res asm("r2") = base;			\
-	register unsigned long long __n asm("r0") = n;		\
-	asm("bl do_div64"					\
-		: "=r" (__n), "=r" (__res)			\
-		: "0" (__n), "1" (__res)			\
-		: "r3", "ip", "lr", "cc");			\
-	n = __n;						\
-	__res;							\
+	register unsigned int __base asm("r4") = base;		\
+	register unsigned long long __n asm("r0") = n;		\
+	register unsigned long long __res asm("r2");		\
+	register unsigned int __rem asm(__xh);			\
+	asm("bl __do_div64"					\
+		: "=r" (__rem), "=r" (__res)			\
+		: "r" (__n), "r" (__base)			\
+		: "ip", "lr", "cc");				\
+	n = __res;						\
+	__rem;							\
 })

 #endif
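For illustration, a typical call site looks like this (a sketch;
timestamp_ns is a hypothetical variable, u64/u32 the usual kernel
typedefs).  do_div() evaluates to the remainder and rewrites its first
argument with the quotient, so that argument must be a 64-bit lvalue:

	u64 us = timestamp_ns;		/* 64-bit nanosecond count to convert */
	u32 ns_rem = do_div(us, 1000);	/* us = quotient in microseconds,
					   ns_rem = leftover nanoseconds */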