From 50649a967c1e3ce08106976fdf3dad345de50776 Mon Sep 17 00:00:00 2001
From: Brian Kessler <brian.m.kessler@gmail.com>
Date: Mon, 28 Aug 2017 22:38:39 -0700
Subject: [PATCH] math/big: implement Lehmer's extended GCD algorithm
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Updates #15833

The extended GCD algorithm can be implemented using
Lehmer's algorithm with additional updates for the
cosequences following Algorithm 10.45 from Cohen et al.
"Handbook of Elliptic and Hyperelliptic Curve Cryptography" pp 192.
This brings the speed of the extended GCD calculation within
~2x of the base GCD calculation.  There is a slight degradation in
the non-extended GCD speed for small inputs (1-2 words) due to the
additional code to handle the extended updates.

name                          old time/op    new time/op    delta
GCD10x10/WithoutXY-4             262ns Â± 1%     266ns Â± 2%     ~     (p=0.333 n=5+5)
GCD10x10/WithXY-4               1.42Âµs Â± 2%    0.74Âµs Â± 3%  -47.90%  (p=0.008 n=5+5)
GCD10x100/WithoutXY-4            520ns Â± 2%     539ns Â± 1%   +3.81%  (p=0.008 n=5+5)
GCD10x100/WithXY-4              2.32Âµs Â± 1%    1.67Âµs Â± 0%  -27.80%  (p=0.008 n=5+5)
GCD10x1000/WithoutXY-4          1.40Âµs Â± 1%    1.45Âµs Â± 2%   +3.26%  (p=0.016 n=4+5)
GCD10x1000/WithXY-4             4.78Âµs Â± 1%    3.43Âµs Â± 1%  -28.37%  (p=0.008 n=5+5)
GCD10x10000/WithoutXY-4         10.0Âµs Â± 0%    10.2Âµs Â± 3%   +1.80%  (p=0.008 n=5+5)
GCD10x10000/WithXY-4            20.9Âµs Â± 3%    17.9Âµs Â± 1%  -14.20%  (p=0.008 n=5+5)
GCD10x100000/WithoutXY-4        96.8Âµs Â± 0%    96.3Âµs Â± 1%     ~     (p=0.310 n=5+5)
GCD10x100000/WithXY-4            196Âµs Â± 3%     159Âµs Â± 2%  -18.61%  (p=0.008 n=5+5)
GCD100x100/WithoutXY-4          2.53Âµs Â±15%    2.34Âµs Â± 0%   -7.35%  (p=0.008 n=5+5)
GCD100x100/WithXY-4             19.3Âµs Â± 0%     3.9Âµs Â± 1%  -79.58%  (p=0.008 n=5+5)
GCD100x1000/WithoutXY-4         4.23Âµs Â± 0%    4.17Âµs Â± 3%     ~     (p=0.127 n=5+5)
GCD100x1000/WithXY-4            22.8Âµs Â± 1%     7.5Âµs Â±10%  -67.00%  (p=0.008 n=5+5)
GCD100x10000/WithoutXY-4        19.1Âµs Â± 0%    19.0Âµs Â± 0%     ~     (p=0.095 n=5+5)
GCD100x10000/WithXY-4           75.1Âµs Â± 2%    30.5Âµs Â± 2%  -59.38%  (p=0.008 n=5+5)
GCD100x100000/WithoutXY-4        170Âµs Â± 5%     167Âµs Â± 1%     ~     (p=1.000 n=5+5)
GCD100x100000/WithXY-4           542Âµs Â± 2%     267Âµs Â± 2%  -50.79%  (p=0.008 n=5+5)
GCD1000x1000/WithoutXY-4        28.0Âµs Â± 0%    27.1Âµs Â± 0%   -3.29%  (p=0.008 n=5+5)
GCD1000x1000/WithXY-4            329Âµs Â± 0%      42Âµs Â± 1%  -87.12%  (p=0.008 n=5+5)
GCD1000x10000/WithoutXY-4       47.2Âµs Â± 0%    46.4Âµs Â± 0%   -1.65%  (p=0.016 n=5+4)
GCD1000x10000/WithXY-4           607Âµs Â± 9%     123Âµs Â± 1%  -79.70%  (p=0.008 n=5+5)
GCD1000x100000/WithoutXY-4       260Âµs Â±17%     245Âµs Â± 0%     ~     (p=0.056 n=5+5)
GCD1000x100000/WithXY-4         3.64ms Â± 1%    0.93ms Â± 1%  -74.41%  (p=0.016 n=4+5)
GCD10000x10000/WithoutXY-4       513Âµs Â± 0%     507Âµs Â± 0%   -1.22%  (p=0.008 n=5+5)
GCD10000x10000/WithXY-4         7.44ms Â± 1%    1.00ms Â± 0%  -86.58%  (p=0.008 n=5+5)
GCD10000x100000/WithoutXY-4     1.23ms Â± 0%    1.23ms Â± 1%     ~     (p=0.056 n=5+5)
GCD10000x100000/WithXY-4        37.3ms Â± 0%     7.3ms Â± 1%  -80.45%  (p=0.008 n=5+5)
GCD100000x100000/WithoutXY-4    24.2ms Â± 0%    24.2ms Â± 0%     ~     (p=0.841 n=5+5)
GCD100000x100000/WithXY-4        505ms Â± 1%      56ms Â± 1%  -88.92%  (p=0.008 n=5+5)

Change-Id: I25f42ab8c55033acb83cc32bb03c12c1963925e8
Reviewed-on: https://go-review.googlesource.com/78755
Reviewed-by: Robert Griesemer <gri@golang.org>
Run-TryBot: Robert Griesemer <gri@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
---
 src/math/big/int.go      | 287 ++++++++++++++++++++++++---------------
 src/math/big/int_test.go |  64 +++++++--
 src/math/big/rat.go      |   2 +-
 3 files changed, 228 insertions(+), 125 deletions(-)

diff --git a/src/math/big/int.go b/src/math/big/int.go
index caebde92fa..b1d09cdad8 100644
--- a/src/math/big/int.go
+++ b/src/math/big/int.go
@@ -485,162 +485,223 @@ func (z *Int) GCD(x, y, a, b *Int) *Int {
 		}
 		return z
 	}
-	if x == nil && y == nil {
-		return z.lehmerGCD(a, b)
-	}
 
-	A := new(Int).Set(a)
-	B := new(Int).Set(b)
+	return z.lehmerGCD(x, y, a, b)
+}
 
-	X := new(Int)
-	lastX := new(Int).SetInt64(1)
+// lehmerSimulate attempts to simulate several Euclidean update steps
+// using the leading digits of A and B.  It returns u0, u1, v0, v1
+// such that A and B can be updated as:
+//		A = u0*A + v0*B
+//		B = u1*A + v1*B
+// Requirements: A >= B and len(B.abs) >= 2
+// Since we are calculating with full words to avoid overflow,
+// we use 'even' to track the sign of the cosequences.
+// For even iterations: u0, v1 >= 0 && u1, v0 <= 0
+// For odd  iterations: u0, v1 <= 0 && u1, v0 >= 0
+func lehmerSimulate(A, B *Int) (u0, u1, v0, v1 Word, even bool) {
+	// initialize the digits
+	var a1, a2, u2, v2 Word
+
+	m := len(B.abs) // m >= 2
+	n := len(A.abs) // n >= m >= 2
+
+	// extract the top Word of bits from A and B
+	h := nlz(A.abs[n-1])
+	a1 = A.abs[n-1]<<h | A.abs[n-2]>>(_W-h)
+	// B may have implicit zero words in the high bits if the lengths differ
+	switch {
+	case n == m:
+		a2 = B.abs[n-1]<<h | B.abs[n-2]>>(_W-h)
+	case n == m+1:
+		a2 = B.abs[n-2] >> (_W - h)
+	default:
+		a2 = 0
+	}
 
-	q := new(Int)
-	temp := new(Int)
+	// Since we are calculating with full words to avoid overflow,
+	// we use 'even' to track the sign of the cosequences.
+	// For even iterations: u0, v1 >= 0 && u1, v0 <= 0
+	// For odd  iterations: u0, v1 <= 0 && u1, v0 >= 0
+	// The first iteration starts with k=1 (odd).
+	even = false
+	// variables to track the cosequences
+	u0, u1, u2 = 0, 1, 0
+	v0, v1, v2 = 0, 0, 1
+
+	// Calculate the quotient and cosequences using Collins' stopping condition.
+	// Note that overflow of a Word is not possible when computing the remainder
+	// sequence and cosequences since the cosequence size is bounded by the input size.
+	// See section 4.2 of Jebelean for details.
+	for a2 >= v2 && a1-a2 >= v1+v2 {
+		q, r := a1/a2, a1%a2
+		a1, a2 = a2, r
+		u0, u1, u2 = u1, u2, u1+q*u2
+		v0, v1, v2 = v1, v2, v1+q*v2
+		even = !even
+	}
+	return
+}
 
-	r := new(Int)
-	for len(B.abs) > 0 {
-		q, r = q.QuoRem(A, B, r)
+// lehmerUpdate updates the inputs A and B such that:
+//		A = u0*A + v0*B
+//		B = u1*A + v1*B
+// where the signs of u0, u1, v0, v1 are given by even
+// For even == true: u0, v1 >= 0 && u1, v0 <= 0
+// For even == false: u0, v1 <= 0 && u1, v0 >= 0
+// q, r, s, t are temporary variables to avoid allocations in the multiplication
+func lehmerUpdate(A, B, q, r, s, t *Int, u0, u1, v0, v1 Word, even bool) {
+
+	t.abs = t.abs.setWord(u0)
+	s.abs = s.abs.setWord(v0)
+	t.neg = !even
+	s.neg = even
+
+	t.Mul(A, t)
+	s.Mul(B, s)
+
+	r.abs = r.abs.setWord(u1)
+	q.abs = q.abs.setWord(v1)
+	r.neg = even
+	q.neg = !even
+
+	r.Mul(A, r)
+	q.Mul(B, q)
+
+	A.Add(t, s)
+	B.Add(r, q)
+}
 
-		A, B, r = B, r, A
+// euclidUpdate performs a single step of the Euclidean GCD algorithm
+// if extended is true, it also updates the cosequence Ua, Ub
+func euclidUpdate(A, B, Ua, Ub, q, r, s, t *Int, extended bool) {
+	q, r = q.QuoRem(A, B, r)
 
-		temp.Set(X)
-		X.Mul(X, q)
-		X.Sub(lastX, X)
-		lastX.Set(temp)
-	}
+	*A, *B, *r = *B, *r, *A
 
-	if x != nil {
-		*x = *lastX
+	if extended {
+		// Ua, Ub = Ub, Ua - q*Ub
+		t.Set(Ub)
+		s.Mul(Ub, q)
+		Ub.Sub(Ua, s)
+		Ua.Set(t)
 	}
-
-	if y != nil {
-		// y = (z - a*x)/b
-		y.Mul(a, lastX)
-		y.Sub(A, y)
-		y.Div(y, b)
-	}
-
-	*z = *A
-	return z
 }
 
 // lehmerGCD sets z to the greatest common divisor of a and b,
 // which both must be > 0, and returns z.
+// If x or y are not nil, their values are set such that z = a*x + b*y.
 // See Knuth, The Art of Computer Programming, Vol. 2, Section 4.5.2, Algorithm L.
 // This implementation uses the improved condition by Collins requiring only one
 // quotient and avoiding the possibility of single Word overflow.
 // See Jebelean, "Improving the multiprecision Euclidean algorithm",
 // Design and Implementation of Symbolic Computation Systems, pp 45-58.
-func (z *Int) lehmerGCD(a, b *Int) *Int {
-	// ensure a >= b
-	if a.abs.cmp(b.abs) < 0 {
-		a, b = b, a
-	}
+// The cosequences are updated according to Algorithm 10.45 from
+// Cohen et al. "Handbook of Elliptic and Hyperelliptic Curve Cryptography" pp 192.
+func (z *Int) lehmerGCD(x, y, a, b *Int) *Int {
+	var A, B, Ua, Ub *Int
+
+	A = new(Int).Set(a)
+	B = new(Int).Set(b)
+
+	extended := x != nil || y != nil
 
-	// don't destroy incoming values of a and b
-	B := new(Int).Set(b) // must be set first in case b is an alias of z
-	A := z.Set(a)
+	if extended {
+		// Ua (Ub) tracks how many times input a has been accumulated into A (B).
+		Ua = new(Int).SetInt64(1)
+		Ub = new(Int)
+	}
 
 	// temp variables for multiprecision update
-	t := new(Int)
+	q := new(Int)
 	r := new(Int)
 	s := new(Int)
-	w := new(Int)
+	t := new(Int)
+
+	// ensure A >= B
+	if A.abs.cmp(B.abs) < 0 {
+		A, B = B, A
+		Ub, Ua = Ua, Ub
+	}
 
 	// loop invariant A >= B
 	for len(B.abs) > 1 {
-		// initialize the digits
-		var a1, a2, u0, u1, u2, v0, v1, v2 Word
-
-		m := len(B.abs) // m >= 2
-		n := len(A.abs) // n >= m >= 2
-
-		// extract the top Word of bits from A and B
-		h := nlz(A.abs[n-1])
-		a1 = (A.abs[n-1] << h) | (A.abs[n-2] >> (_W - h))
-		// B may have implicit zero words in the high bits if the lengths differ
-		switch {
-		case n == m:
-			a2 = (B.abs[n-1] << h) | (B.abs[n-2] >> (_W - h))
-		case n == m+1:
-			a2 = (B.abs[n-2] >> (_W - h))
-		default:
-			a2 = 0
-		}
+		// Attempt to calculate in single-precision using leading words of A and B.
+		u0, u1, v0, v1, even := lehmerSimulate(A, B)
 
-		// Since we are calculating with full words to avoid overflow,
-		// we use 'even' to track the sign of the cosequences.
-		// For even iterations: u0, v1 >= 0 && u1, v0 <= 0
-		// For odd  iterations: u0, v1 <= 0 && u1, v0 >= 0
-		// The first iteration starts with k=1 (odd).
-		even := false
-		// variables to track the cosequences
-		u0, u1, u2 = 0, 1, 0
-		v0, v1, v2 = 0, 0, 1
-
-		// Calculate the quotient and cosequences using Collins' stopping condition.
-		// Note that overflow of a Word is not possible when computing the remainder
-		// sequence and cosequences since the cosequence size is bounded by the input size.
-		// See section 4.2 of Jebelean for details.
-		for a2 >= v2 && a1-a2 >= v1+v2 {
-			q := a1 / a2
-			a1, a2 = a2, a1-q*a2
-			u0, u1, u2 = u1, u2, u1+q*u2
-			v0, v1, v2 = v1, v2, v1+q*v2
-			even = !even
-		}
-
-		// multiprecision step
+		// multiprecision Step
 		if v0 != 0 {
-			// simulate the effect of the single precision steps using the cosequences
+			// Simulate the effect of the single-precision steps using the cosequences.
 			// A = u0*A + v0*B
 			// B = u1*A + v1*B
+			lehmerUpdate(A, B, q, r, s, t, u0, u1, v0, v1, even)
 
-			t.abs = t.abs.setWord(u0)
-			s.abs = s.abs.setWord(v0)
-			t.neg = !even
-			s.neg = even
-
-			t.Mul(A, t)
-			s.Mul(B, s)
-
-			r.abs = r.abs.setWord(u1)
-			w.abs = w.abs.setWord(v1)
-			r.neg = even
-			w.neg = !even
-
-			r.Mul(A, r)
-			w.Mul(B, w)
-
-			A.Add(t, s)
-			B.Add(r, w)
+			if extended {
+				// Ua = u0*Ua + v0*Ub
+				// Ub = u1*Ua + v1*Ub
+				lehmerUpdate(Ua, Ub, q, r, s, t, u0, u1, v0, v1, even)
+			}
 
 		} else {
-			// single-digit calculations failed to simulate any quotients
-			// do a standard Euclidean step
-			t.Rem(A, B)
-			A, B, t = B, t, A
+			// Single-digit calculations failed to simulate any quotients.
+			// Do a standard Euclidean step.
+			euclidUpdate(A, B, Ua, Ub, q, r, s, t, extended)
 		}
 	}
 
 	if len(B.abs) > 0 {
-		// standard Euclidean algorithm base case for B a single Word
+		// extended Euclidean algorithm base case if B is a single Word
 		if len(A.abs) > 1 {
-			// A is longer than a single Word
-			t.Rem(A, B)
-			A, B, t = B, t, A
+			// A is longer than a single Word, so one update is needed.
+			euclidUpdate(A, B, Ua, Ub, q, r, s, t, extended)
 		}
 		if len(B.abs) > 0 {
-			// A and B are both a single Word
-			a1, a2 := A.abs[0], B.abs[0]
-			for a2 != 0 {
-				a1, a2 = a2, a1%a2
+			// A and B are both a single Word.
+			aWord, bWord := A.abs[0], B.abs[0]
+			if extended {
+				var ua, ub, va, vb Word
+				ua, ub = 1, 0
+				va, vb = 0, 1
+				even := true
+				for bWord != 0 {
+					q, r := aWord/bWord, aWord%bWord
+					aWord, bWord = bWord, r
+					ua, ub = ub, ua+q*ub
+					va, vb = vb, va+q*vb
+					even = !even
+				}
+
+				t.abs = t.abs.setWord(ua)
+				s.abs = s.abs.setWord(va)
+				t.neg = !even
+				s.neg = even
+
+				t.Mul(Ua, t)
+				s.Mul(Ub, s)
+
+				Ua.Add(t, s)
+			} else {
+				for bWord != 0 {
+					aWord, bWord = bWord, aWord%bWord
+				}
 			}
-			A.abs[0] = a1
+			A.abs[0] = aWord
 		}
 	}
+
+	if x != nil {
+		*x = *Ua
+	}
+
+	if y != nil {
+		// y = (z - a*x)/b
+		y.Mul(a, Ua)
+		y.Sub(A, y)
+		y.Div(y, b)
+	}
+
 	*z = *A
+
 	return z
 }
 
diff --git a/src/math/big/int_test.go b/src/math/big/int_test.go
index b660d53523..1ef4d150b8 100644
--- a/src/math/big/int_test.go
+++ b/src/math/big/int_test.go
@@ -675,21 +675,43 @@ func checkGcd(aBytes, bBytes []byte) bool {
 	return x.Cmp(d) == 0
 }
 
-// euclidGCD is a reference implementation of Euclid's GCD
-// algorithm for testing against optimized algorithms.
+// euclidExtGCD is a reference implementation of Euclid's
+// extended GCD algorithm for testing against optimized algorithms.
 // Requirements: a, b > 0
-func euclidGCD(a, b *Int) *Int {
-
+func euclidExtGCD(a, b *Int) (g, x, y *Int) {
 	A := new(Int).Set(a)
 	B := new(Int).Set(b)
-	t := new(Int)
 
+	// A = Ua*a + Va*b
+	// B = Ub*a + Vb*b
+	Ua := new(Int).SetInt64(1)
+	Va := new(Int)
+
+	Ub := new(Int)
+	Vb := new(Int).SetInt64(1)
+
+	q := new(Int)
+	temp := new(Int)
+
+	r := new(Int)
 	for len(B.abs) > 0 {
-		// A, B = B, A % B
-		t.Rem(A, B)
-		A, B, t = B, t, A
+		q, r = q.QuoRem(A, B, r)
+
+		A, B, r = B, r, A
+
+		// Ua, Ub = Ub, Ua-q*Ub
+		temp.Set(Ub)
+		Ub.Mul(Ub, q)
+		Ub.Sub(Ua, Ub)
+		Ua.Set(temp)
+
+		// Va, Vb = Vb, Va-q*Vb
+		temp.Set(Vb)
+		Vb.Mul(Vb, q)
+		Vb.Sub(Va, Vb)
+		Va.Set(temp)
 	}
-	return A
+	return A, Ua, Va
 }
 
 func checkLehmerGcd(aBytes, bBytes []byte) bool {
@@ -700,12 +722,28 @@ func checkLehmerGcd(aBytes, bBytes []byte) bool {
 		return true // can only test positive arguments
 	}
 
-	d := new(Int).lehmerGCD(a, b)
-	d0 := euclidGCD(a, b)
+	d := new(Int).lehmerGCD(nil, nil, a, b)
+	d0, _, _ := euclidExtGCD(a, b)
 
 	return d.Cmp(d0) == 0
 }
 
+func checkLehmerExtGcd(aBytes, bBytes []byte) bool {
+	a := new(Int).SetBytes(aBytes)
+	b := new(Int).SetBytes(bBytes)
+	x := new(Int)
+	y := new(Int)
+
+	if a.Sign() <= 0 || b.Sign() <= 0 {
+		return true // can only test positive arguments
+	}
+
+	d := new(Int).lehmerGCD(x, y, a, b)
+	d0, x0, y0 := euclidExtGCD(a, b)
+
+	return d.Cmp(d0) == 0 && x.Cmp(x0) == 0 && y.Cmp(y0) == 0
+}
+
 var gcdTests = []struct {
 	d, x, y, a, b string
 }{
@@ -785,6 +823,10 @@ func TestGcd(t *testing.T) {
 	if err := quick.Check(checkLehmerGcd, nil); err != nil {
 		t.Error(err)
 	}
+
+	if err := quick.Check(checkLehmerExtGcd, nil); err != nil {
+		t.Error(err)
+	}
 }
 
 type intShiftTest struct {
diff --git a/src/math/big/rat.go b/src/math/big/rat.go
index b33fc696ad..46d58fcf36 100644
--- a/src/math/big/rat.go
+++ b/src/math/big/rat.go
@@ -422,7 +422,7 @@ func (z *Rat) norm() *Rat {
 		neg := z.a.neg
 		z.a.neg = false
 		z.b.neg = false
-		if f := NewInt(0).lehmerGCD(&z.a, &z.b); f.Cmp(intOne) != 0 {
+		if f := NewInt(0).lehmerGCD(nil, nil, &z.a, &z.b); f.Cmp(intOne) != 0 {
 			z.a.abs, _ = z.a.abs.div(nil, z.a.abs, f.abs)
 			z.b.abs, _ = z.b.abs.div(nil, z.b.abs, f.abs)
 			if z.b.abs.cmp(natOne) == 0 {
-- 
2.30.9