float.go 40.8 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// This file implements multi-precision floating-point numbers.
// Like in the GNU MPFR library (http://www.mpfr.org/), operands
// can be of mixed precision. Unlike MPFR, the rounding mode is
// not specified with each operation, but with each operand. The
// rounding mode of the result operand determines the rounding
// mode of an operation. This is a from-scratch implementation.

package big

import (
	"fmt"
	"math"
)

const debugFloat = true // enable for debugging

21
// A nonzero finite Float represents a multi-precision floating point number
22
//
23
//   sign × mantissa × 2**exponent
24
//
Robert Griesemer's avatar
Robert Griesemer committed
25
// with 0.5 <= mantissa < 1.0, and MinExp <= exponent <= MaxExp.
26 27 28
// A Float may also be zero (+0, -0) or infinite (+Inf, -Inf).
// All Floats are ordered, and the ordering of two Floats x and y
// is defined by x.Cmp(y).
29
//
Robert Griesemer's avatar
Robert Griesemer committed
30
// Each Float value also has a precision, rounding mode, and accuracy.
31
// The precision is the maximum number of mantissa bits available to
32 33 34
// represent the value. The rounding mode specifies how a result should
// be rounded to fit into the mantissa bits, and accuracy describes the
// rounding error with respect to the exact result.
35
//
36 37 38 39
// Unless specified otherwise, all operations (including setters) that
// specify a *Float variable for the result (usually via the receiver
// with the exception of MantExp), round the numeric result according
// to the precision and rounding mode of the result variable.
40
//
Robert Griesemer's avatar
Robert Griesemer committed
41 42 43 44 45 46
// If the provided result precision is 0 (see below), it is set to the
// precision of the argument with the largest precision value before any
// rounding takes place, and the rounding mode remains unchanged. Thus,
// uninitialized Floats provided as result arguments will have their
// precision set to a reasonable value determined by the operands and
// their mode is the zero value for RoundingMode (ToNearestEven).
47
//
48 49
// By setting the desired precision to 24 or 53 and using matching rounding
// mode (typically ToNearestEven), Float operations produce the same results
50 51 52 53
// as the corresponding float32 or float64 IEEE-754 arithmetic for operands
// that correspond to normal (i.e., not denormal) float32 or float64 numbers.
// Exponent underflow and overflow lead to a 0 or an Infinity for different
// values than IEEE-754 because Float exponents have a much larger range.
54
//
55 56
// The zero (uninitialized) value for a Float is ready to use and represents
// the number +0.0 exactly, with precision 0 and rounding mode ToNearestEven.
57 58
//
type Float struct {
59
	prec uint32
60 61
	mode RoundingMode
	acc  Accuracy
62
	form form
63 64 65 66 67
	neg  bool
	mant nat
	exp  int32
}

68 69 70 71 72 73
// Float operations that would lead to a NaN under IEEE-754 rules cause
// a run-time panic of ErrNaN type.
type ErrNaN struct {
	msg string
}

74 75
// NewFloat allocates and returns a new Float set to x,
// with precision 53 and rounding mode ToNearestEven.
76
// NewFloat panics with ErrNaN if x is a NaN.
77
func NewFloat(x float64) *Float {
78 79 80
	if math.IsNaN(x) {
		panic(ErrNaN{"NewFloat(NaN)"})
	}
81 82 83
	return new(Float).SetFloat64(x)
}

84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99
// Exponent and precision limits.
const (
	MaxExp  = math.MaxInt32  // largest supported exponent
	MinExp  = math.MinInt32  // smallest supported exponent
	MaxPrec = math.MaxUint32 // largest (theoretically) supported precision; likely memory-limited
)

// Internal representation: The mantissa bits x.mant of a nonzero finite
// Float x are stored in a nat slice long enough to hold up to x.prec bits;
// the slice may (but doesn't have to) be shorter if the mantissa contains
// trailing 0 bits. x.mant is normalized if the msb of x.mant == 1 (i.e.,
// the msb is shifted all the way "to the left"). Thus, if the mantissa has
// trailing 0 bits or x.prec is not a multiple of the the Word size _W,
// x.mant[0] has trailing zero bits. The msb of the mantissa corresponds
// to the value 0.5; the exponent x.exp shifts the binary point as needed.
//
100
// A zero or non-finite Float x ignores x.mant and x.exp.
101 102 103 104 105 106
//
// x                 form      neg      mant         exp
// ----------------------------------------------------------
// ±0                zero      sign     -            -
// 0 < |x| < +Inf    finite    sign     mantissa     exponent
// ±Inf              inf       sign     -            -
Robert Griesemer's avatar
Robert Griesemer committed
107

108 109 110 111
// A form value describes the internal representation.
type form byte

// The form value order is relevant - do not change!
112
const (
113 114 115
	zero form = iota
	finite
	inf
116
)
117 118 119 120

// RoundingMode determines how a Float value is rounded to the
// desired precision. Rounding may change the Float value; the
// rounding error is described by the Float's Accuracy.
121
type RoundingMode byte
122 123 124 125 126 127 128 129 130 131 132

// The following rounding modes are supported.
const (
	ToNearestEven RoundingMode = iota // == IEEE 754-2008 roundTiesToEven
	ToNearestAway                     // == IEEE 754-2008 roundTiesToAway
	ToZero                            // == IEEE 754-2008 roundTowardZero
	AwayFromZero                      // no IEEE 754-2008 equivalent
	ToNegativeInf                     // == IEEE 754-2008 roundTowardNegative
	ToPositiveInf                     // == IEEE 754-2008 roundTowardPositive
)

133 134 135 136
//go:generate stringer -type=RoundingMode

// Accuracy describes the rounding error produced by the most recent
// operation that generated a Float value, relative to the exact value.
137
type Accuracy int8
138 139 140

// Constants describing the Accuracy of a Float.
const (
141
	Below Accuracy = -1
142
	Exact Accuracy = 0
143
	Above Accuracy = +1
144 145 146
)

//go:generate stringer -type=Accuracy
147

148 149 150
// SetPrec sets z's precision to prec and returns the (possibly) rounded
// value of z. Rounding occurs according to z's rounding mode if the mantissa
// cannot be represented in prec bits without loss of precision.
151 152
// SetPrec(0) maps all finite values to ±0; infinite values remain unchanged.
// If prec > MaxPrec, it is set to MaxPrec.
153
func (z *Float) SetPrec(prec uint) *Float {
154
	z.acc = Exact // optimistically assume no rounding is needed
Robert Griesemer's avatar
Robert Griesemer committed
155 156

	// special case
157 158
	if prec == 0 {
		z.prec = 0
159 160
		if z.form == finite {
			// truncate z to 0
161
			z.acc = makeAcc(z.neg)
162
			z.form = zero
163 164 165
		}
		return z
	}
Robert Griesemer's avatar
Robert Griesemer committed
166

167 168 169 170
	// general case
	if prec > MaxPrec {
		prec = MaxPrec
	}
171
	old := z.prec
172 173
	z.prec = uint32(prec)
	if z.prec < old {
174 175 176 177 178
		z.round(0)
	}
	return z
}

179 180
func makeAcc(above bool) Accuracy {
	if above {
181 182 183 184 185
		return Above
	}
	return Below
}

186
// SetMode sets z's rounding mode to mode and returns an exact z.
187
// z remains unchanged otherwise.
188
// z.SetMode(z.Mode()) is a cheap way to set z's accuracy to Exact.
189 190
func (z *Float) SetMode(mode RoundingMode) *Float {
	z.mode = mode
191
	z.acc = Exact
192 193 194 195
	return z
}

// Prec returns the mantissa precision of x in bits.
196
// The result may be 0 for |x| == 0 and |x| == Inf.
197
func (x *Float) Prec() uint {
198 199 200
	return uint(x.prec)
}

201 202
// MinPrec returns the minimum precision required to represent x exactly
// (i.e., the smallest prec before x.SetPrec(prec) would start rounding x).
203
// The result is 0 for |x| == 0 and |x| == Inf.
204
func (x *Float) MinPrec() uint {
205 206 207
	if x.form != finite {
		return 0
	}
208 209 210
	return uint(len(x.mant))*_W - x.mant.trailingZeroBits()
}

211 212 213 214 215
// Mode returns the rounding mode of x.
func (x *Float) Mode() RoundingMode {
	return x.mode
}

216 217 218 219 220
// Acc returns the accuracy of x produced by the most recent operation.
func (x *Float) Acc() Accuracy {
	return x.acc
}

221 222
// Sign returns:
//
Robert Griesemer's avatar
Robert Griesemer committed
223
//	-1 if x <   0
224
//	 0 if x is ±0
Robert Griesemer's avatar
Robert Griesemer committed
225
//	+1 if x >   0
226 227
//
func (x *Float) Sign() int {
Robert Griesemer's avatar
Robert Griesemer committed
228
	if debugFloat {
229
		x.validate()
Robert Griesemer's avatar
Robert Griesemer committed
230
	}
231
	if x.form == zero {
Robert Griesemer's avatar
Robert Griesemer committed
232
		return 0
233 234
	}
	if x.neg {
Robert Griesemer's avatar
Robert Griesemer committed
235
		return -1
236
	}
Robert Griesemer's avatar
Robert Griesemer committed
237
	return 1
238 239
}

240 241 242 243 244 245 246
// MantExp breaks x into its mantissa and exponent components
// and returns the exponent. If a non-nil mant argument is
// provided its value is set to the mantissa of x, with the
// same precision and rounding mode as x. The components
// satisfy x == mant × 2**exp, with 0.5 <= |mant| < 1.0.
// Calling MantExp with a nil argument is an efficient way to
// get the exponent of the receiver.
247 248 249
//
// Special cases are:
//
250 251
//	(  ±0).MantExp(mant) = 0, with mant set to   ±0
//	(±Inf).MantExp(mant) = 0, with mant set to ±Inf
252
//
253 254 255
// x and mant may be the same in which case x is set to its
// mantissa value.
func (x *Float) MantExp(mant *Float) (exp int) {
Robert Griesemer's avatar
Robert Griesemer committed
256
	if debugFloat {
257
		x.validate()
Robert Griesemer's avatar
Robert Griesemer committed
258
	}
259
	if x.form == finite {
260
		exp = int(x.exp)
261 262 263
	}
	if mant != nil {
		mant.Copy(x)
264
		if mant.form == finite {
265 266
			mant.exp = 0
		}
267 268 269 270
	}
	return
}

271 272 273 274 275 276
func (z *Float) setExpAndRound(exp int64, sbit uint) {
	if exp < MinExp {
		// underflow
		z.acc = makeAcc(z.neg)
		z.form = zero
		return
277
	}
278 279 280 281 282 283

	if exp > MaxExp {
		// overflow
		z.acc = makeAcc(!z.neg)
		z.form = inf
		return
284
	}
285 286 287 288

	z.form = finite
	z.exp = int32(exp)
	z.round(sbit)
289 290
}

291 292 293 294 295
// SetMantExp sets z to mant × 2**exp and and returns z.
// The result z has the same precision and rounding mode
// as mant. SetMantExp is an inverse of MantExp but does
// not require 0.5 <= |mant| < 1.0. Specifically:
//
296
//	mant := new(Float)
297
//	new(Float).SetMantExp(mant, x.SetMantExp(mant)).Cmp(x).Eql() is true
298 299 300 301 302 303
//
// Special cases are:
//
//	z.SetMantExp(  ±0, exp) =   ±0
//	z.SetMantExp(±Inf, exp) = ±Inf
//
304 305
// z and mant may be the same in which case z's exponent
// is set to exp.
306
func (z *Float) SetMantExp(mant *Float, exp int) *Float {
Robert Griesemer's avatar
Robert Griesemer committed
307
	if debugFloat {
308 309
		z.validate()
		mant.validate()
Robert Griesemer's avatar
Robert Griesemer committed
310
	}
311
	z.Copy(mant)
312
	if z.form != finite {
313 314
		return z
	}
315
	z.setExpAndRound(int64(z.exp)+int64(exp), 0)
316 317 318
	return z
}

319 320 321
// Signbit returns true if x is negative or negative zero.
func (x *Float) Signbit() bool {
	return x.neg
322 323
}

324
// IsInf reports whether x is +Inf or -Inf.
325
func (x *Float) IsInf() bool {
326
	return x.form == inf
327 328
}

329
// IsInt reports whether x is an integer.
330
// ±Inf values are not integers.
331
func (x *Float) IsInt() bool {
332
	if debugFloat {
333
		x.validate()
334
	}
335 336 337 338 339
	// special cases
	if x.form != finite {
		return x.form == zero
	}
	// x.form == finite
340
	if x.exp <= 0 {
341
		return false
342 343
	}
	// x.exp > 0
344
	return x.prec <= uint32(x.exp) || x.MinPrec() <= uint(x.exp) // not enough bits for fractional mantissa
345 346
}

347
// debugging support
348
func (x *Float) validate() {
Robert Griesemer's avatar
Robert Griesemer committed
349 350 351 352
	if !debugFloat {
		// avoid performance bugs
		panic("validate called but debugFloat is not set")
	}
353 354 355
	if x.form != finite {
		return
	}
356 357
	m := len(x.mant)
	if m == 0 {
358
		panic("nonzero finite number with empty mantissa")
359
	}
360
	const msb = 1 << (_W - 1)
361 362 363
	if x.mant[m-1]&msb == 0 {
		panic(fmt.Sprintf("msb not set in last word %#x of %s", x.mant[m-1], x.Format('p', 0)))
	}
364 365
	if x.prec == 0 {
		panic("zero precision finite number")
366 367 368 369 370
	}
}

// round rounds z according to z.mode to z.prec bits and sets z.acc accordingly.
// sbit must be 0 or 1 and summarizes any "sticky bit" information one might
Robert Griesemer's avatar
Robert Griesemer committed
371 372
// have before calling round. z's mantissa must be normalized (with the msb set)
// or empty.
373 374 375 376
//
// CAUTION: The rounding modes ToNegativeInf, ToPositiveInf are affected by the
// sign of z. For correct rounding, the sign of z must be set correctly before
// calling round.
377
func (z *Float) round(sbit uint) {
378
	if debugFloat {
379
		z.validate()
380 381 382
		if z.form > finite {
			panic(fmt.Sprintf("round called for non-finite value %s", z))
		}
383
	}
384
	// z.form <= finite
385

386
	z.acc = Exact
387
	if z.form == zero {
388 389
		return
	}
390
	// z.form == finite && len(z.mant) > 0
391
	// m > 0 implies z.prec > 0 (checked by validate)
392

393 394
	m := uint32(len(z.mant)) // present mantissa length in words
	bits := m * _W           // present mantissa bits
395 396
	if bits <= z.prec {
		// mantissa fits => nothing to do
397 398
		return
	}
399
	// bits > z.prec
400 401 402 403 404

	n := (z.prec + (_W - 1)) / _W // mantissa length in words for desired precision

	// Rounding is based on two bits: the rounding bit (rbit) and the
	// sticky bit (sbit). The rbit is the bit immediately before the
405 406
	// z.prec leading mantissa bits (the "0.5"). The sbit is set if any
	// of the bits before the rbit are set (the "0.25", "0.125", etc.):
407 408 409 410 411 412 413 414 415
	//
	//   rbit  sbit  => "fractional part"
	//
	//   0     0        == 0
	//   0     1        >  0  , < 0.5
	//   1     0        == 0.5
	//   1     1        >  0.5, < 1.0

	// bits > z.prec: mantissa too large => round
416 417
	r := uint(bits - z.prec - 1) // rounding bit position; r >= 0
	rbit := z.mant.bit(r)        // rounding bit
418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450
	if sbit == 0 {
		sbit = z.mant.sticky(r)
	}
	if debugFloat && sbit&^1 != 0 {
		panic(fmt.Sprintf("invalid sbit %#x", sbit))
	}

	// convert ToXInf rounding modes
	mode := z.mode
	switch mode {
	case ToNegativeInf:
		mode = ToZero
		if z.neg {
			mode = AwayFromZero
		}
	case ToPositiveInf:
		mode = AwayFromZero
		if z.neg {
			mode = ToZero
		}
	}

	// cut off extra words
	if m > n {
		copy(z.mant, z.mant[m-n:]) // move n last words to front
		z.mant = z.mant[:n]
	}

	// determine number of trailing zero bits t
	t := n*_W - z.prec // 0 <= t < _W
	lsb := Word(1) << t

	// make rounding decision
451
	// TODO(gri) This can be simplified (see Bits.round in bits_test.go).
452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496
	switch mode {
	case ToZero:
		// nothing to do
	case ToNearestEven, ToNearestAway:
		if rbit == 0 {
			// rounding bits == 0b0x
			mode = ToZero
		} else if sbit == 1 {
			// rounding bits == 0b11
			mode = AwayFromZero
		}
	case AwayFromZero:
		if rbit|sbit == 0 {
			mode = ToZero
		}
	default:
		// ToXInf modes have been converted to ToZero or AwayFromZero
		panic("unreachable")
	}

	// round and determine accuracy
	switch mode {
	case ToZero:
		if rbit|sbit != 0 {
			z.acc = Below
		}

	case ToNearestEven, ToNearestAway:
		if debugFloat && rbit != 1 {
			panic("internal error in rounding")
		}
		if mode == ToNearestEven && sbit == 0 && z.mant[0]&lsb == 0 {
			z.acc = Below
			break
		}
		// mode == ToNearestAway || sbit == 1 || z.mant[0]&lsb != 0
		fallthrough

	case AwayFromZero:
		// add 1 to mantissa
		if addVW(z.mant, z.mant, lsb) != 0 {
			// overflow => shift mantissa right by 1 and add msb
			shrVU(z.mant, z.mant, 1)
			z.mant[n-1] |= 1 << (_W - 1)
			// adjust exponent
497 498 499 500 501 502 503 504
			if z.exp < MaxExp {
				z.exp++
			} else {
				// exponent overflow
				z.acc = makeAcc(!z.neg)
				z.form = inf
				return
			}
505 506 507 508 509 510 511 512
		}
		z.acc = Above
	}

	// zero out trailing bits in least-significant word
	z.mant[0] &^= lsb - 1

	// update accuracy
513
	if z.acc != Exact && z.neg {
514
		z.acc = -z.acc
515 516 517
	}

	if debugFloat {
518
		z.validate()
519 520 521 522 523 524 525 526 527 528 529
	}

	return
}

// nlz returns the number of leading zero bits in x.
func nlz(x Word) uint {
	return _W - uint(bitLen(x))
}

func nlz64(x uint64) uint {
530 531 532 533 534 535 536 537 538
	// TODO(gri) this can be done more nicely
	if _W == 32 {
		if x>>32 == 0 {
			return 32 + nlz(Word(x))
		}
		return nlz(Word(x >> 32))
	}
	if _W == 64 {
		return nlz(Word(x))
539
	}
540
	panic("unreachable")
541 542
}

543
func (z *Float) setBits64(neg bool, x uint64) *Float {
Robert Griesemer's avatar
Robert Griesemer committed
544 545 546 547
	if z.prec == 0 {
		z.prec = 64
	}
	z.acc = Exact
548
	z.neg = neg
549
	if x == 0 {
550
		z.form = zero
551 552
		return z
	}
Robert Griesemer's avatar
Robert Griesemer committed
553
	// x != 0
554
	z.form = finite
555 556
	s := nlz64(x)
	z.mant = z.mant.setUint64(x << s)
Robert Griesemer's avatar
Robert Griesemer committed
557 558 559 560
	z.exp = int32(64 - s) // always fits
	if z.prec < 64 {
		z.round(0)
	}
561 562 563
	return z
}

564 565 566 567 568 569 570
// SetUint64 sets z to the (possibly rounded) value of x and returns z.
// If z's precision is 0, it is changed to 64 (and rounding will have
// no effect).
func (z *Float) SetUint64(x uint64) *Float {
	return z.setBits64(false, x)
}

Robert Griesemer's avatar
Robert Griesemer committed
571 572 573
// SetInt64 sets z to the (possibly rounded) value of x and returns z.
// If z's precision is 0, it is changed to 64 (and rounding will have
// no effect).
574 575 576 577 578
func (z *Float) SetInt64(x int64) *Float {
	u := x
	if u < 0 {
		u = -u
	}
579 580 581
	// We cannot simply call z.SetUint64(uint64(u)) and change
	// the sign afterwards because the sign affects rounding.
	return z.setBits64(x < 0, uint64(u))
582 583
}

Robert Griesemer's avatar
Robert Griesemer committed
584
// SetFloat64 sets z to the (possibly rounded) value of x and returns z.
Robert Griesemer's avatar
Robert Griesemer committed
585
// If z's precision is 0, it is changed to 53 (and rounding will have
586
// no effect). SetFloat64 panics with ErrNaN if x is a NaN.
587
func (z *Float) SetFloat64(x float64) *Float {
Robert Griesemer's avatar
Robert Griesemer committed
588 589 590
	if z.prec == 0 {
		z.prec = 53
	}
Robert Griesemer's avatar
Robert Griesemer committed
591
	if math.IsNaN(x) {
592
		panic(ErrNaN{"Float.SetFloat64(NaN)"})
Robert Griesemer's avatar
Robert Griesemer committed
593
	}
Robert Griesemer's avatar
Robert Griesemer committed
594
	z.acc = Exact
Robert Griesemer's avatar
Robert Griesemer committed
595
	z.neg = math.Signbit(x) // handle -0, -Inf correctly
596 597
	if x == 0 {
		z.form = zero
598 599
		return z
	}
600 601
	if math.IsInf(x, 0) {
		z.form = inf
602 603
		return z
	}
Robert Griesemer's avatar
Robert Griesemer committed
604
	// normalized x != 0
605
	z.form = finite
606 607
	fmant, exp := math.Frexp(x) // get normalized mantissa
	z.mant = z.mant.setUint64(1<<63 | math.Float64bits(fmant)<<11)
Robert Griesemer's avatar
Robert Griesemer committed
608 609 610 611
	z.exp = int32(exp) // always fits
	if z.prec < 53 {
		z.round(0)
	}
612 613 614 615
	return z
}

// fnorm normalizes mantissa m by shifting it to the left
Robert Griesemer's avatar
Robert Griesemer committed
616 617
// such that the msb of the most-significant word (msw) is 1.
// It returns the shift amount. It assumes that len(m) != 0.
618
func fnorm(m nat) int64 {
619 620 621 622 623 624 625 626 627 628
	if debugFloat && (len(m) == 0 || m[len(m)-1] == 0) {
		panic("msw of mantissa is 0")
	}
	s := nlz(m[len(m)-1])
	if s > 0 {
		c := shlVU(m, m, s)
		if debugFloat && c != 0 {
			panic("nlz or shlVU incorrect")
		}
	}
629
	return int64(s)
630 631
}

Robert Griesemer's avatar
Robert Griesemer committed
632
// SetInt sets z to the (possibly rounded) value of x and returns z.
633 634
// If z's precision is 0, it is changed to the larger of x.BitLen()
// or 64 (and rounding will have no effect).
635
func (z *Float) SetInt(x *Int) *Float {
Robert Griesemer's avatar
Robert Griesemer committed
636 637 638
	// TODO(gri) can be more efficient if z.prec > 0
	// but small compared to the size of x, or if there
	// are many trailing 0's.
639
	bits := uint32(x.BitLen())
Robert Griesemer's avatar
Robert Griesemer committed
640
	if z.prec == 0 {
641
		z.prec = umax32(bits, 64)
Robert Griesemer's avatar
Robert Griesemer committed
642 643 644
	}
	z.acc = Exact
	z.neg = x.neg
645
	if len(x.abs) == 0 {
646
		z.form = zero
647 648 649 650
		return z
	}
	// x != 0
	z.mant = z.mant.set(x.abs)
Robert Griesemer's avatar
Robert Griesemer committed
651
	fnorm(z.mant)
652
	z.setExpAndRound(int64(bits), 0)
653 654 655
	return z
}

Robert Griesemer's avatar
Robert Griesemer committed
656
// SetRat sets z to the (possibly rounded) value of x and returns z.
657 658
// If z's precision is 0, it is changed to the largest of a.BitLen(),
// b.BitLen(), or 64; with x = a/b.
Robert Griesemer's avatar
Robert Griesemer committed
659
func (z *Float) SetRat(x *Rat) *Float {
660 661 662
	if x.IsInt() {
		return z.SetInt(x.Num())
	}
Robert Griesemer's avatar
Robert Griesemer committed
663 664 665 666
	var a, b Float
	a.SetInt(x.Num())
	b.SetInt(x.Denom())
	if z.prec == 0 {
667
		z.prec = umax32(a.prec, b.prec)
Robert Griesemer's avatar
Robert Griesemer committed
668 669
	}
	return z.Quo(&a, &b)
670 671
}

672 673 674 675 676
// SetInf sets z to the infinite Float -Inf if signbit is
// set, or +Inf if signbit is not set, and returns z. The
// precision of z is unchanged and the result is always
// Exact.
func (z *Float) SetInf(signbit bool) *Float {
677
	z.acc = Exact
678
	z.form = inf
679
	z.neg = signbit
680 681 682
	return z
}

683 684 685 686 687 688
// Set sets z to the (possibly rounded) value of x and returns z.
// If z's precision is 0, it is changed to the precision of x
// before setting z (and rounding will have no effect).
// Rounding is performed according to z's precision and rounding
// mode; and z's accuracy reports the result error relative to the
// exact (not rounded) result.
689
func (z *Float) Set(x *Float) *Float {
Robert Griesemer's avatar
Robert Griesemer committed
690
	if debugFloat {
691
		x.validate()
Robert Griesemer's avatar
Robert Griesemer committed
692 693
	}
	z.acc = Exact
694
	if z != x {
695
		z.form = x.form
696
		z.neg = x.neg
697 698 699 700 701 702 703
		if x.form == finite {
			z.exp = x.exp
			z.mant = z.mant.set(x.mant)
		}
		if z.prec == 0 {
			z.prec = x.prec
		} else if z.prec < x.prec {
704 705 706 707 708 709
			z.round(0)
		}
	}
	return z
}

710 711 712
// Copy sets z to x, with the same precision, rounding mode, and
// accuracy as x, and returns z. x is not changed even if z and
// x are the same.
713
func (z *Float) Copy(x *Float) *Float {
Robert Griesemer's avatar
Robert Griesemer committed
714
	if debugFloat {
715
		x.validate()
Robert Griesemer's avatar
Robert Griesemer committed
716
	}
717
	if z != x {
718
		z.prec = x.prec
719
		z.mode = x.mode
720
		z.acc = x.acc
721
		z.form = x.form
722
		z.neg = x.neg
723 724 725 726
		if z.form == finite {
			z.mant = z.mant.set(x.mant)
			z.exp = x.exp
		}
727 728 729 730
	}
	return z
}

731 732 733 734 735
func high32(x nat) uint32 {
	// TODO(gri) This can be done more efficiently on 32bit platforms.
	return uint32(high64(x) >> 32)
}

736
func high64(x nat) uint64 {
737 738
	i := len(x)
	if i == 0 {
739 740
		return 0
	}
741 742
	// i > 0
	v := uint64(x[i-1])
743 744
	if _W == 32 {
		v <<= 32
745 746
		if i > 1 {
			v |= uint64(x[i-2])
747
		}
748 749 750 751
	}
	return v
}

752
// Uint64 returns the unsigned integer resulting from truncating x
753 754
// towards zero. If 0 <= x <= math.MaxUint64, the result is Exact
// if x is an integer and Below otherwise.
755 756
// The result is (0, Above) for x < 0, and (math.MaxUint64, Below)
// for x > math.MaxUint64.
757 758
func (x *Float) Uint64() (uint64, Accuracy) {
	if debugFloat {
759
		x.validate()
760
	}
Robert Griesemer's avatar
Robert Griesemer committed
761

762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777
	switch x.form {
	case finite:
		if x.neg {
			return 0, Above
		}
		// 0 < x < +Inf
		if x.exp <= 0 {
			// 0 < x < 1
			return 0, Below
		}
		// 1 <= x < Inf
		if x.exp <= 64 {
			// u = trunc(x) fits into a uint64
			u := high64(x.mant) >> (64 - uint32(x.exp))
			if x.MinPrec() <= 64 {
				return u, Exact
778
			}
779
			return u, Below // x truncated
780
		}
781 782
		// x too large
		return math.MaxUint64, Below
Robert Griesemer's avatar
Robert Griesemer committed
783

784 785 786 787 788 789
	case zero:
		return 0, Exact

	case inf:
		if x.neg {
			return 0, Above
Robert Griesemer's avatar
Robert Griesemer committed
790
		}
791
		return math.MaxUint64, Below
Robert Griesemer's avatar
Robert Griesemer committed
792
	}
793 794

	panic("unreachable")
795 796
}

797 798 799
// Int64 returns the integer resulting from truncating x towards zero.
// If math.MinInt64 <= x <= math.MaxInt64, the result is Exact if x is
// an integer, and Above (x < 0) or Below (x > 0) otherwise.
800
// The result is (math.MinInt64, Above) for x < math.MinInt64,
801
// and (math.MaxInt64, Below) for x > math.MaxInt64.
802 803
func (x *Float) Int64() (int64, Accuracy) {
	if debugFloat {
804
		x.validate()
805
	}
806

807 808 809
	switch x.form {
	case finite:
		// 0 < |x| < +Inf
810
		acc := makeAcc(x.neg)
811 812 813 814 815 816 817 818 819 820
		if x.exp <= 0 {
			// 0 < |x| < 1
			return 0, acc
		}
		// x.exp > 0

		// 1 <= |x| < +Inf
		if x.exp <= 63 {
			// i = trunc(x) fits into an int64 (excluding math.MinInt64)
			i := int64(high64(x.mant) >> (64 - uint32(x.exp)))
821
			if x.neg {
822
				i = -i
823
			}
824
			if x.MinPrec() <= uint(x.exp) {
825 826 827
				return i, Exact
			}
			return i, acc // x truncated
828
		}
829 830 831 832 833 834 835 836 837
		if x.neg {
			// check for special case x == math.MinInt64 (i.e., x == -(0.5 << 64))
			if x.exp == 64 && x.MinPrec() == 1 {
				acc = Exact
			}
			return math.MinInt64, acc
		}
		// x too large
		return math.MaxInt64, Below
Robert Griesemer's avatar
Robert Griesemer committed
838

839 840
	case zero:
		return 0, Exact
Robert Griesemer's avatar
Robert Griesemer committed
841

842
	case inf:
843
		if x.neg {
844
			return math.MinInt64, Above
Robert Griesemer's avatar
Robert Griesemer committed
845
		}
846
		return math.MaxInt64, Below
Robert Griesemer's avatar
Robert Griesemer committed
847
	}
848 849

	panic("unreachable")
850 851
}

852 853 854 855 856 857 858
// TODO(gri) Float32 and Float64 are very similar internally but for the
// floatxx parameters and some conversions. Should factor out shared code.

// Float32 returns the float32 value nearest to x. If x is too small to be
// represented by a float32 (|x| < math.SmallestNonzeroFloat32), the result
// is (0, Below) or (-0, Above), respectively, depending on the sign of x.
// If x is too large to be represented by a float32 (|x| > math.MaxFloat32),
859
// the result is (+Inf, Above) or (-Inf, Below), depending on the sign of x.
860
func (x *Float) Float32() (float32, Accuracy) {
Robert Griesemer's avatar
Robert Griesemer committed
861
	if debugFloat {
862
		x.validate()
863
	}
Robert Griesemer's avatar
Robert Griesemer committed
864

865 866 867
	switch x.form {
	case finite:
		// 0 < |x| < +Inf
868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894

		const (
			fbits = 32                //        float size
			mbits = 23                //        mantissa size (excluding implicit msb)
			ebits = fbits - mbits - 1 //     8  exponent size
			bias  = 1<<(ebits-1) - 1  //   127  exponent bias
			dmin  = 1 - bias - mbits  //  -149  smallest unbiased exponent (denormal)
			emin  = 1 - bias          //  -126  smallest unbiased exponent (normal)
			emax  = bias              //   127  largest unbiased exponent (normal)
		)

		// Float mantissae m have an explicit msb and are in the range 0.5 <= m < 1.0.
		// floatxx mantissae have an implicit msb and are in the range 1.0 <= m < 2.0.
		// For a given mantissa m, we need to add 1 to a floatxx exponent to get the
		// corresponding Float exponent.
		// (see also implementation of math.Ldexp for similar code)

		if x.exp < dmin+1 {
			// underflow
			if x.neg {
				var z float32
				return -z, Above
			}
			return 0.0, Below
		}
		// x.exp >= dmin+1

895
		var r Float
896 897 898 899 900
		r.prec = mbits + 1 // +1 for implicit msb
		if x.exp < emin+1 {
			// denormal number - round to fewer bits
			r.prec = uint32(x.exp - dmin)
		}
901
		r.Set(x)
902

903 904
		// Rounding may have caused r to overflow to ±Inf
		// (rounding never causes underflows to 0).
905
		if r.form == inf {
906
			r.exp = emax + 2 // cause overflow below
907 908
		}

909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936
		if r.exp > emax+1 {
			// overflow
			if x.neg {
				return float32(math.Inf(-1)), Below
			}
			return float32(math.Inf(+1)), Above
		}
		// dmin+1 <= r.exp <= emax+1

		var s uint32
		if r.neg {
			s = 1 << (fbits - 1)
		}

		m := high32(r.mant) >> ebits & (1<<mbits - 1) // cut off msb (implicit 1 bit)

		// Rounding may have caused a denormal number to
		// become normal. Check again.
		c := float32(1.0)
		if r.exp < emin+1 {
			// denormal number
			r.exp += mbits
			c = 1.0 / (1 << mbits) // 2**-mbits
		}
		// emin+1 <= r.exp <= emax+1
		e := uint32(r.exp-emin) << mbits

		return c * math.Float32frombits(s|e|m), r.acc
937

938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985
	case zero:
		if x.neg {
			var z float32
			return -z, Exact
		}
		return 0.0, Exact

	case inf:
		if x.neg {
			return float32(math.Inf(-1)), Exact
		}
		return float32(math.Inf(+1)), Exact
	}

	panic("unreachable")
}

// Float64 returns the float64 value nearest to x. If x is too small to be
// represented by a float64 (|x| < math.SmallestNonzeroFloat64), the result
// is (0, Below) or (-0, Above), respectively, depending on the sign of x.
// If x is too large to be represented by a float64 (|x| > math.MaxFloat64),
// the result is (+Inf, Above) or (-Inf, Below), depending on the sign of x.
func (x *Float) Float64() (float64, Accuracy) {
	if debugFloat {
		x.validate()
	}

	switch x.form {
	case finite:
		// 0 < |x| < +Inf

		const (
			fbits = 64                //        float size
			mbits = 52                //        mantissa size (excluding implicit msb)
			ebits = fbits - mbits - 1 //    11  exponent size
			bias  = 1<<(ebits-1) - 1  //  1023  exponent bias
			dmin  = 1 - bias - mbits  // -1074  smallest unbiased exponent (denormal)
			emin  = 1 - bias          // -1022  smallest unbiased exponent (normal)
			emax  = bias              //  1023  largest unbiased exponent (normal)
		)

		// Float mantissae m have an explicit msb and are in the range 0.5 <= m < 1.0.
		// floatxx mantissae have an implicit msb and are in the range 1.0 <= m < 2.0.
		// For a given mantissa m, we need to add 1 to a floatxx exponent to get the
		// corresponding Float exponent.
		// (see also implementation of math.Ldexp for similar code)

		if x.exp < dmin+1 {
986 987
			// underflow
			if x.neg {
988
				var z float64
989 990 991 992
				return -z, Above
			}
			return 0.0, Below
		}
993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007
		// x.exp >= dmin+1

		var r Float
		r.prec = mbits + 1 // +1 for implicit msb
		if x.exp < emin+1 {
			// denormal number - round to fewer bits
			r.prec = uint32(x.exp - dmin)
		}
		r.Set(x)

		// Rounding may have caused r to overflow to ±Inf
		// (rounding never causes underflows to 0).
		if r.form == inf {
			r.exp = emax + 2 // cause overflow below
		}
1008

1009
		if r.exp > emax+1 {
1010 1011 1012 1013 1014 1015
			// overflow
			if x.neg {
				return math.Inf(-1), Below
			}
			return math.Inf(+1), Above
		}
1016
		// dmin+1 <= r.exp <= emax+1
1017

1018
		var s uint64
1019
		if r.neg {
1020
			s = 1 << (fbits - 1)
Robert Griesemer's avatar
Robert Griesemer committed
1021
		}
1022 1023 1024 1025 1026 1027 1028 1029 1030 1031

		m := high64(r.mant) >> ebits & (1<<mbits - 1) // cut off msb (implicit 1 bit)

		// Rounding may have caused a denormal number to
		// become normal. Check again.
		c := 1.0
		if r.exp < emin+1 {
			// denormal number
			r.exp += mbits
			c = 1.0 / (1 << mbits) // 2**-mbits
1032
		}
1033 1034 1035 1036
		// emin+1 <= r.exp <= emax+1
		e := uint64(r.exp-emin) << mbits

		return c * math.Float64frombits(s|e|m), r.acc
1037 1038 1039

	case zero:
		if x.neg {
1040
			var z float64
1041
			return -z, Exact
1042
		}
1043
		return 0.0, Exact
1044 1045 1046

	case inf:
		if x.neg {
1047
			return math.Inf(-1), Exact
1048
		}
1049
		return math.Inf(+1), Exact
1050
	}
1051 1052

	panic("unreachable")
1053 1054
}

1055
// Int returns the result of truncating x towards zero;
1056
// or nil if x is an infinity.
1057 1058 1059 1060
// The result is Exact if x.IsInt(); otherwise it is Below
// for x > 0, and Above for x < 0.
// If a non-nil *Int argument z is provided, Int stores
// the result in z instead of allocating a new Int.
1061
func (x *Float) Int(z *Int) (*Int, Accuracy) {
1062
	if debugFloat {
1063
		x.validate()
1064
	}
Robert Griesemer's avatar
Robert Griesemer committed
1065

1066
	if z == nil && x.form <= finite {
Robert Griesemer's avatar
Robert Griesemer committed
1067 1068 1069
		z = new(Int)
	}

1070 1071 1072
	switch x.form {
	case finite:
		// 0 < |x| < +Inf
1073
		acc := makeAcc(x.neg)
1074 1075 1076
		if x.exp <= 0 {
			// 0 < |x| < 1
			return z.SetInt64(0), acc
Robert Griesemer's avatar
Robert Griesemer committed
1077
		}
1078
		// x.exp > 0
Robert Griesemer's avatar
Robert Griesemer committed
1079

1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100
		// 1 <= |x| < +Inf
		// determine minimum required precision for x
		allBits := uint(len(x.mant)) * _W
		exp := uint(x.exp)
		if x.MinPrec() <= exp {
			acc = Exact
		}
		// shift mantissa as needed
		if z == nil {
			z = new(Int)
		}
		z.neg = x.neg
		switch {
		case exp > allBits:
			z.abs = z.abs.shl(x.mant, exp-allBits)
		default:
			z.abs = z.abs.set(x.mant)
		case exp < allBits:
			z.abs = z.abs.shr(x.mant, allBits-exp)
		}
		return z, acc
Robert Griesemer's avatar
Robert Griesemer committed
1101

1102 1103 1104 1105
	case zero:
		return z.SetInt64(0), Exact

	case inf:
1106
		return nil, makeAcc(x.neg)
1107
	}
1108 1109

	panic("unreachable")
1110 1111
}

1112
// Rat returns the rational number corresponding to x;
1113 1114
// or nil if x is an infinity.
// The result is Exact is x is not an Inf.
1115 1116
// If a non-nil *Rat argument z is provided, Rat stores
// the result in z instead of allocating a new Rat.
Robert Griesemer's avatar
Robert Griesemer committed
1117
func (x *Float) Rat(z *Rat) (*Rat, Accuracy) {
1118
	if debugFloat {
1119
		x.validate()
1120
	}
Robert Griesemer's avatar
Robert Griesemer committed
1121

1122
	if z == nil && x.form <= finite {
Robert Griesemer's avatar
Robert Griesemer committed
1123 1124 1125
		z = new(Rat)
	}

1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145
	switch x.form {
	case finite:
		// 0 < |x| < +Inf
		allBits := int32(len(x.mant)) * _W
		// build up numerator and denominator
		z.a.neg = x.neg
		switch {
		case x.exp > allBits:
			z.a.abs = z.a.abs.shl(x.mant, uint(x.exp-allBits))
			z.b.abs = z.b.abs[:0] // == 1 (see Rat)
			// z already in normal form
		default:
			z.a.abs = z.a.abs.set(x.mant)
			z.b.abs = z.b.abs[:0] // == 1 (see Rat)
			// z already in normal form
		case x.exp < allBits:
			z.a.abs = z.a.abs.set(x.mant)
			t := z.b.abs.setUint64(1)
			z.b.abs = t.shl(t, uint(allBits-x.exp))
			z.norm()
1146
		}
1147 1148 1149 1150 1151 1152
		return z, Exact

	case zero:
		return z.SetInt64(0), Exact

	case inf:
1153
		return nil, makeAcc(x.neg)
1154
	}
Robert Griesemer's avatar
Robert Griesemer committed
1155

1156
	panic("unreachable")
1157 1158
}

1159 1160
// Abs sets z to the (possibly rounded) value |x| (the absolute value of x)
// and returns z.
1161 1162 1163 1164 1165 1166
func (z *Float) Abs(x *Float) *Float {
	z.Set(x)
	z.neg = false
	return z
}

1167 1168
// Neg sets z to the (possibly rounded) value of x with its sign negated,
// and returns z.
1169 1170 1171 1172 1173 1174
func (z *Float) Neg(x *Float) *Float {
	z.Set(x)
	z.neg = !z.neg
	return z
}

1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190
func validateBinaryOperands(x, y *Float) {
	if !debugFloat {
		// avoid performance bugs
		panic("validateBinaryOperands called but debugFloat is not set")
	}
	if len(x.mant) == 0 {
		panic("empty mantissa for x")
	}
	if len(y.mant) == 0 {
		panic("empty mantissa for y")
	}
}

// z = x + y, ignoring signs of x and y for the addition
// but using the sign of z for rounding the result.
// x and y must have a non-empty mantissa and valid exponent.
1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201
func (z *Float) uadd(x, y *Float) {
	// Note: This implementation requires 2 shifts most of the
	// time. It is also inefficient if exponents or precisions
	// differ by wide margins. The following article describes
	// an efficient (but much more complicated) implementation
	// compatible with the internal representation used here:
	//
	// Vincent Lefèvre: "The Generic Multiple-Precision Floating-
	// Point Addition With Exact Rounding (as in the MPFR Library)"
	// http://www.vinc17.net/research/papers/rnc6.pdf

1202 1203
	if debugFloat {
		validateBinaryOperands(x, y)
1204 1205 1206 1207
	}

	// compute exponents ex, ey for mantissa with "binary point"
	// on the right (mantissa.0) - use int64 to avoid overflow
1208 1209
	ex := int64(x.exp) - int64(len(x.mant))*_W
	ey := int64(y.exp) - int64(len(y.mant))*_W
1210

1211 1212
	// TODO(gri) having a combined add-and-shift primitive
	//           could make this code significantly faster
1213 1214
	switch {
	case ex < ey:
1215 1216 1217
		// cannot re-use z.mant w/o testing for aliasing
		t := nat(nil).shl(y.mant, uint(ey-ex))
		z.mant = z.mant.add(x.mant, t)
1218
	default:
1219
		// ex == ey, no shift needed
1220 1221
		z.mant = z.mant.add(x.mant, y.mant)
	case ex > ey:
1222 1223 1224
		// cannot re-use z.mant w/o testing for aliasing
		t := nat(nil).shl(x.mant, uint(ex-ey))
		z.mant = z.mant.add(t, y.mant)
1225
		ex = ey
1226
	}
1227
	// len(z.mant) > 0
1228

1229
	z.setExpAndRound(ex+int64(len(z.mant))*_W-fnorm(z.mant), 0)
1230 1231
}

1232 1233 1234
// z = x - y for |x| > |y|, ignoring signs of x and y for the subtraction
// but using the sign of z for rounding the result.
// x and y must have a non-empty mantissa and valid exponent.
1235
func (z *Float) usub(x, y *Float) {
1236 1237 1238 1239 1240
	// This code is symmetric to uadd.
	// We have not factored the common code out because
	// eventually uadd (and usub) should be optimized
	// by special-casing, and the code will diverge.

1241 1242
	if debugFloat {
		validateBinaryOperands(x, y)
1243 1244
	}

1245 1246
	ex := int64(x.exp) - int64(len(x.mant))*_W
	ey := int64(y.exp) - int64(len(y.mant))*_W
1247

1248 1249
	switch {
	case ex < ey:
1250 1251
		// cannot re-use z.mant w/o testing for aliasing
		t := nat(nil).shl(y.mant, uint(ey-ex))
1252 1253
		z.mant = t.sub(x.mant, t)
	default:
1254
		// ex == ey, no shift needed
1255 1256
		z.mant = z.mant.sub(x.mant, y.mant)
	case ex > ey:
1257 1258
		// cannot re-use z.mant w/o testing for aliasing
		t := nat(nil).shl(x.mant, uint(ex-ey))
1259
		z.mant = t.sub(t, y.mant)
1260
		ex = ey
1261
	}
1262

1263 1264 1265
	// operands may have cancelled each other out
	if len(z.mant) == 0 {
		z.acc = Exact
1266
		z.form = zero
1267
		z.neg = false
1268
		return
1269
	}
1270
	// len(z.mant) > 0
1271

1272
	z.setExpAndRound(ex+int64(len(z.mant))*_W-fnorm(z.mant), 0)
1273 1274
}

1275 1276 1277
// z = x * y, ignoring signs of x and y for the multiplication
// but using the sign of z for rounding the result.
// x and y must have a non-empty mantissa and valid exponent.
1278
func (z *Float) umul(x, y *Float) {
1279 1280
	if debugFloat {
		validateBinaryOperands(x, y)
1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291
	}

	// Note: This is doing too much work if the precision
	// of z is less than the sum of the precisions of x
	// and y which is often the case (e.g., if all floats
	// have the same precision).
	// TODO(gri) Optimize this for the common case.

	e := int64(x.exp) + int64(y.exp)
	z.mant = z.mant.mul(x.mant, y.mant)

1292
	z.setExpAndRound(e-fnorm(z.mant), 0)
1293 1294
}

1295 1296 1297
// z = x / y, ignoring signs of x and y for the division
// but using the sign of z for rounding the result.
// x and y must have a non-empty mantissa and valid exponent.
1298
func (z *Float) uquo(x, y *Float) {
1299 1300
	if debugFloat {
		validateBinaryOperands(x, y)
1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318
	}

	// mantissa length in words for desired result precision + 1
	// (at least one extra bit so we get the rounding bit after
	// the division)
	n := int(z.prec/_W) + 1

	// compute adjusted x.mant such that we get enough result precision
	xadj := x.mant
	if d := n - len(x.mant) + len(y.mant); d > 0 {
		// d extra words needed => add d "0 digits" to x
		xadj = make(nat, len(x.mant)+d)
		copy(xadj[d:], x.mant)
	}
	// TODO(gri): If we have too many digits (d < 0), we should be able
	// to shorten x for faster division. But we must be extra careful
	// with rounding in that case.

1319 1320 1321 1322
	// Compute d before division since there may be aliasing of x.mant
	// (via xadj) or y.mant with z.mant.
	d := len(xadj) - len(y.mant)

1323 1324 1325
	// divide
	var r nat
	z.mant, r = z.mant.div(nil, xadj, y.mant)
1326
	e := int64(x.exp) - int64(y.exp) - int64(d-len(z.mant))*_W
1327 1328 1329 1330 1331 1332 1333 1334 1335

	// The result is long enough to include (at least) the rounding bit.
	// If there's a non-zero remainder, the corresponding fractional part
	// (if it were computed), would have a non-zero sticky bit (if it were
	// zero, it couldn't have a non-zero remainder).
	var sbit uint
	if len(r) > 0 {
		sbit = 1
	}
1336 1337

	z.setExpAndRound(e-fnorm(z.mant), sbit)
1338 1339
}

1340 1341
// ucmp returns -1, 0, or +1, depending on whether
// |x| < |y|, |x| == |y|, or |x| > |y|.
1342
// x and y must have a non-empty mantissa and valid exponent.
1343
func (x *Float) ucmp(y *Float) int {
1344 1345
	if debugFloat {
		validateBinaryOperands(x, y)
1346 1347 1348 1349
	}

	switch {
	case x.exp < y.exp:
1350
		return -1
1351
	case x.exp > y.exp:
1352
		return +1
1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370
	}
	// x.exp == y.exp

	// compare mantissas
	i := len(x.mant)
	j := len(y.mant)
	for i > 0 || j > 0 {
		var xm, ym Word
		if i > 0 {
			i--
			xm = x.mant[i]
		}
		if j > 0 {
			j--
			ym = y.mant[j]
		}
		switch {
		case xm < ym:
1371
			return -1
1372
		case xm > ym:
1373
			return +1
1374 1375 1376
		}
	}

1377
	return 0
1378 1379
}

1380
// Handling of sign bit as defined by IEEE 754-2008, section 6.3:
1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395
//
// When neither the inputs nor result are NaN, the sign of a product or
// quotient is the exclusive OR of the operands’ signs; the sign of a sum,
// or of a difference x−y regarded as a sum x+(−y), differs from at most
// one of the addends’ signs; and the sign of the result of conversions,
// the quantize operation, the roundToIntegral operations, and the
// roundToIntegralExact (see 5.3.1) is the sign of the first or only operand.
// These rules shall apply even when operands or results are zero or infinite.
//
// When the sum of two operands with opposite signs (or the difference of
// two operands with like signs) is exactly zero, the sign of that sum (or
// difference) shall be +0 in all rounding-direction attributes except
// roundTowardNegative; under that attribute, the sign of an exact zero
// sum (or difference) shall be −0. However, x+x = x−(−x) retains the same
// sign as x even when x is zero.
Robert Griesemer's avatar
Robert Griesemer committed
1396 1397
//
// See also: http://play.golang.org/p/RtH3UCt5IH
1398

1399 1400 1401 1402
// Add sets z to the rounded sum x+y and returns z. If z's precision is 0,
// it is changed to the larger of x's or y's precision before the operation.
// Rounding is performed according to z's precision and rounding mode; and
// z's accuracy reports the result error relative to the exact (not rounded)
1403 1404 1405
// result. Add panics with ErrNaN if x and y are infinities with opposite
// signs. The value of z is undefined in that case.
//
1406
// BUG(gri) When rounding ToNegativeInf, the sign of Float values rounded to 0 is incorrect.
1407
func (z *Float) Add(x, y *Float) *Float {
1408
	if debugFloat {
1409 1410
		x.validate()
		y.validate()
1411 1412
	}

1413
	if z.prec == 0 {
1414
		z.prec = umax32(x.prec, y.prec)
1415 1416
	}

1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431
	if x.form == finite && y.form == finite {
		// x + y (commom case)
		z.neg = x.neg
		if x.neg == y.neg {
			// x + y == x + y
			// (-x) + (-y) == -(x + y)
			z.uadd(x, y)
		} else {
			// x + (-y) == x - y == -(y - x)
			// (-x) + y == y - x == -(x - y)
			if x.ucmp(y) > 0 {
				z.usub(x, y)
			} else {
				z.neg = !z.neg
				z.usub(y, x)
Robert Griesemer's avatar
Robert Griesemer committed
1432 1433
			}
		}
1434
		return z
1435 1436
	}

1437 1438 1439 1440 1441 1442 1443 1444
	if x.form == inf && y.form == inf && x.neg != y.neg {
		// +Inf + -Inf
		// -Inf + +Inf
		// value of z is undefined but make sure it's valid
		z.acc = Exact
		z.form = zero
		z.neg = false
		panic(ErrNaN{"addition of infinities with opposite signs"})
1445
	}
Robert Griesemer's avatar
Robert Griesemer committed
1446

1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463
	if x.form == zero && y.form == zero {
		// ±0 + ±0
		z.acc = Exact
		z.form = zero
		z.neg = x.neg && y.neg // -0 + -0 == -0
		return z
	}

	if x.form == inf || y.form == zero {
		// ±Inf + y
		// x + ±0
		return z.Set(x)
	}

	// ±0 + y
	// x + ±Inf
	return z.Set(y)
1464 1465 1466
}

// Sub sets z to the rounded difference x-y and returns z.
1467
// Precision, rounding, and accuracy reporting are as for Add.
1468 1469
// Sub panics with ErrNaN if x and y are infinities with equal
// signs. The value of z is undefined in that case.
1470
func (z *Float) Sub(x, y *Float) *Float {
1471
	if debugFloat {
1472 1473
		x.validate()
		y.validate()
1474 1475
	}

1476
	if z.prec == 0 {
1477
		z.prec = umax32(x.prec, y.prec)
1478 1479
	}

1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494
	if x.form == finite && y.form == finite {
		// x - y (common case)
		z.neg = x.neg
		if x.neg != y.neg {
			// x - (-y) == x + y
			// (-x) - y == -(x + y)
			z.uadd(x, y)
		} else {
			// x - y == x - y == -(y - x)
			// (-x) - (-y) == y - x == -(x - y)
			if x.ucmp(y) > 0 {
				z.usub(x, y)
			} else {
				z.neg = !z.neg
				z.usub(y, x)
Robert Griesemer's avatar
Robert Griesemer committed
1495 1496
			}
		}
1497
		return z
1498 1499
	}

1500 1501 1502 1503 1504 1505 1506 1507
	if x.form == inf && y.form == inf && x.neg == y.neg {
		// +Inf - +Inf
		// -Inf - -Inf
		// value of z is undefined but make sure it's valid
		z.acc = Exact
		z.form = zero
		z.neg = false
		panic(ErrNaN{"subtraction of infinities with equal signs"})
1508
	}
Robert Griesemer's avatar
Robert Griesemer committed
1509

1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526
	if x.form == zero && y.form == zero {
		// ±0 - ±0
		z.acc = Exact
		z.form = zero
		z.neg = x.neg && !y.neg // -0 - +0 == -0
		return z
	}

	if x.form == inf || y.form == zero {
		// ±Inf - y
		// x - ±0
		return z.Set(x)
	}

	// ±0 - y
	// x - ±Inf
	return z.Neg(y)
1527 1528 1529
}

// Mul sets z to the rounded product x*y and returns z.
1530
// Precision, rounding, and accuracy reporting are as for Add.
1531 1532
// Mul panics with ErrNaN if one operand is zero and the other
// operand an infinity. The value of z is undefined in that case.
1533
func (z *Float) Mul(x, y *Float) *Float {
1534
	if debugFloat {
1535 1536
		x.validate()
		y.validate()
1537 1538
	}

1539
	if z.prec == 0 {
1540
		z.prec = umax32(x.prec, y.prec)
1541 1542
	}

Robert Griesemer's avatar
Robert Griesemer committed
1543 1544
	z.neg = x.neg != y.neg

1545 1546 1547
	if x.form == finite && y.form == finite {
		// x * y (common case)
		z.umul(x, y)
Robert Griesemer's avatar
Robert Griesemer committed
1548 1549
		return z
	}
1550

1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566
	z.acc = Exact
	if x.form == zero && y.form == inf || x.form == inf && y.form == zero {
		// ±0 * ±Inf
		// ±Inf * ±0
		// value of z is undefined but make sure it's valid
		z.form = zero
		z.neg = false
		panic(ErrNaN{"multiplication of zero with infinity"})
	}

	if x.form == inf || y.form == inf {
		// ±Inf * y
		// x * ±Inf
		z.form = inf
		return z
	}
1567

1568 1569 1570
	// ±0 * y
	// x * ±0
	z.form = zero
1571 1572 1573 1574
	return z
}

// Quo sets z to the rounded quotient x/y and returns z.
1575
// Precision, rounding, and accuracy reporting are as for Add.
1576 1577
// Quo panics with ErrNaN if both operands are zero or infinities.
// The value of z is undefined in that case.
1578
func (z *Float) Quo(x, y *Float) *Float {
1579
	if debugFloat {
1580 1581
		x.validate()
		y.validate()
1582 1583
	}

1584
	if z.prec == 0 {
1585
		z.prec = umax32(x.prec, y.prec)
1586 1587 1588 1589
	}

	z.neg = x.neg != y.neg

1590 1591 1592
	if x.form == finite && y.form == finite {
		// x / y (common case)
		z.uquo(x, y)
1593
		return z
1594 1595
	}

1596 1597 1598 1599 1600 1601 1602 1603 1604
	z.acc = Exact
	if x.form == zero && y.form == zero || x.form == inf && y.form == inf {
		// ±0 / ±0
		// ±Inf / ±Inf
		// value of z is undefined but make sure it's valid
		z.form = zero
		z.neg = false
		panic(ErrNaN{"division of zero by zero or infinity by infinity"})
	}
1605

1606 1607 1608 1609 1610 1611 1612 1613 1614 1615
	if x.form == zero || y.form == inf {
		// ±0 / y
		// x / ±Inf
		z.form = zero
		return z
	}

	// x / ±0
	// ±Inf / y
	z.form = inf
1616 1617 1618 1619 1620
	return z
}

// Cmp compares x and y and returns:
//
1621 1622 1623
//   -1 if x <  y
//    0 if x == y (incl. -0 == 0, -Inf == -Inf, and +Inf == +Inf)
//   +1 if x >  y
1624
//
1625
func (x *Float) Cmp(y *Float) int {
1626
	if debugFloat {
1627 1628
		x.validate()
		y.validate()
1629 1630
	}

1631 1632
	mx := x.ord()
	my := y.ord()
1633
	switch {
1634
	case mx < my:
1635
		return -1
1636
	case mx > my:
1637
		return +1
1638
	}
1639
	// mx == my
1640

1641 1642 1643
	// only if |mx| == 1 we have to compare the mantissae
	switch mx {
	case -1:
1644
		return y.ucmp(x)
1645
	case +1:
1646
		return x.ucmp(y)
1647
	}
1648

1649
	return 0
1650
}
1651

1652
// ord classifies x and returns:
1653
//
1654 1655 1656 1657
//	-2 if -Inf == x
//	-1 if -Inf < x < 0
//	 0 if x == 0 (signed or unsigned)
//	+1 if 0 < x < +Inf
1658 1659
//	+2 if x == +Inf
//
1660
func (x *Float) ord() int {
1661 1662 1663 1664 1665 1666 1667 1668
	var m int
	switch x.form {
	case finite:
		m = 1
	case zero:
		return 0
	case inf:
		m = 2
1669 1670 1671 1672 1673 1674
	}
	if x.neg {
		m = -m
	}
	return m
}
1675 1676 1677 1678 1679 1680 1681

func umax32(x, y uint32) uint32 {
	if x > y {
		return x
	}
	return y
}