Commit 8d267b9b authored by Robert Griesemer

math/big: fixed Float.Float64, implemented Float.Float32

- fix bounds checks for exponent range of denormalized numbers
- use correct rounding precision for denormalized numbers
- added extra tests

Change-Id: I6be56399afd0d9a603300a2e44b5539e08d6f592
Reviewed-on: https://go-review.googlesource.com/8096
Reviewed-by: Alan Donovan <adonovan@google.com>
parent f8fd5502
...@@ -750,6 +750,11 @@ func (z *Float) Copy(x *Float) *Float { ...@@ -750,6 +750,11 @@ func (z *Float) Copy(x *Float) *Float {
return z return z
} }
// high32 returns the upper 32 bits of the 64-bit value produced by high64(x).
func high32(x nat) uint32 {
	// TODO(gri) This can be done more efficiently on 32bit platforms.
	top := high64(x)
	return uint32(top >> 32)
}
func high64(x nat) uint64 { func high64(x nat) uint64 {
i := len(x) i := len(x)
if i == 0 { if i == 0 {
...@@ -872,15 +877,16 @@ func (x *Float) Int64() (int64, Accuracy) { ...@@ -872,15 +877,16 @@ func (x *Float) Int64() (int64, Accuracy) {
panic("unreachable") panic("unreachable")
} }
// Float64 returns the float64 value nearest to x by rounding ToNearestEven // TODO(gri) Float32 and Float64 are very similar internally but for the
// with 53 bits of precision. // floatxx parameters and some conversions. Should factor out shared code.
// If x is too small to be represented by a float64
// (|x| < math.SmallestNonzeroFloat64), the result is (0, Below) or // Float32 returns the float32 value nearest to x. If x is too small to be
// (-0, Above), respectively, depending on the sign of x. // represented by a float32 (|x| < math.SmallestNonzeroFloat32), the result
// If x is too large to be represented by a float64 (|x| > math.MaxFloat64), // is (0, Below) or (-0, Above), respectively, depending on the sign of x.
// If x is too large to be represented by a float32 (|x| > math.MaxFloat32),
// the result is (+Inf, Above) or (-Inf, Below), depending on the sign of x. // the result is (+Inf, Above) or (-Inf, Below), depending on the sign of x.
// The result is (NaN, Undef) for NaNs. // The result is (NaN, Undef) for NaNs.
func (x *Float) Float64() (float64, Accuracy) { func (x *Float) Float32() (float32, Accuracy) {
if debugFloat { if debugFloat {
x.validate() x.validate()
} }
...@@ -888,61 +894,183 @@ func (x *Float) Float64() (float64, Accuracy) { ...@@ -888,61 +894,183 @@ func (x *Float) Float64() (float64, Accuracy) {
switch x.form { switch x.form {
case finite: case finite:
// 0 < |x| < +Inf // 0 < |x| < +Inf
const (
fbits = 32 // float size
mbits = 23 // mantissa size (excluding implicit msb)
ebits = fbits - mbits - 1 // 8 exponent size
bias = 1<<(ebits-1) - 1 // 127 exponent bias
dmin = 1 - bias - mbits // -149 smallest unbiased exponent (denormal)
emin = 1 - bias // -126 smallest unbiased exponent (normal)
emax = bias // 127 largest unbiased exponent (normal)
)
// Float mantissae m have an explicit msb and are in the range 0.5 <= m < 1.0.
// floatxx mantissae have an implicit msb and are in the range 1.0 <= m < 2.0.
// For a given mantissa m, we need to add 1 to a floatxx exponent to get the
// corresponding Float exponent.
// (see also implementation of math.Ldexp for similar code)
if x.exp < dmin+1 {
// underflow
if x.neg {
var z float32
return -z, Above
}
return 0.0, Below
}
// x.exp >= dmin+1
var r Float var r Float
r.prec = 53 r.prec = mbits + 1 // +1 for implicit msb
if x.exp < emin+1 {
// denormal number - round to fewer bits
r.prec = uint32(x.exp - dmin)
}
r.Set(x) r.Set(x)
// Rounding via Set may have caused r to overflow // Rounding may have caused r to overflow to ±Inf
// to ±Inf (rounding never causes underflows to 0). // (rounding never causes underflows to 0).
if r.form == inf { if r.form == inf {
r.exp = 10000 // cause overflow below r.exp = emax + 2 // cause overflow below
} }
// see also implementation of math.Ldexp if r.exp > emax+1 {
// overflow
if x.neg {
return float32(math.Inf(-1)), Below
}
return float32(math.Inf(+1)), Above
}
// dmin+1 <= r.exp <= emax+1
var s uint32
if r.neg {
s = 1 << (fbits - 1)
}
m := high32(r.mant) >> ebits & (1<<mbits - 1) // cut off msb (implicit 1 bit)
// Rounding may have caused a denormal number to
// become normal. Check again.
c := float32(1.0)
if r.exp < emin+1 {
// denormal number
r.exp += mbits
c = 1.0 / (1 << mbits) // 2**-mbits
}
// emin+1 <= r.exp <= emax+1
e := uint32(r.exp-emin) << mbits
return c * math.Float32frombits(s|e|m), r.acc
e := int64(r.exp) + 1022 case zero:
if e <= -52 { if x.neg {
var z float32
return -z, Exact
}
return 0.0, Exact
case inf:
if x.neg {
return float32(math.Inf(-1)), Exact
}
return float32(math.Inf(+1)), Exact
case nan:
return float32(math.NaN()), Undef
}
panic("unreachable")
}
// Float64 returns the float64 value nearest to x. If x is too small to be
// represented by a float64 (|x| < math.SmallestNonzeroFloat64), the result
// is (0, Below) or (-0, Above), respectively, depending on the sign of x.
// If x is too large to be represented by a float64 (|x| > math.MaxFloat64),
// the result is (+Inf, Above) or (-Inf, Below), depending on the sign of x.
// The result is (NaN, Undef) for NaNs.
func (x *Float) Float64() (float64, Accuracy) {
if debugFloat {
x.validate()
}
switch x.form {
case finite:
// 0 < |x| < +Inf
const (
fbits = 64 // float size
mbits = 52 // mantissa size (excluding implicit msb)
ebits = fbits - mbits - 1 // 11 exponent size
bias = 1<<(ebits-1) - 1 // 1023 exponent bias
dmin = 1 - bias - mbits // -1074 smallest unbiased exponent (denormal)
emin = 1 - bias // -1022 smallest unbiased exponent (normal)
emax = bias // 1023 largest unbiased exponent (normal)
)
// Float mantissae m have an explicit msb and are in the range 0.5 <= m < 1.0.
// floatxx mantissae have an implicit msb and are in the range 1.0 <= m < 2.0.
// For a given mantissa m, we need to add 1 to a floatxx exponent to get the
// corresponding Float exponent.
// (see also implementation of math.Ldexp for similar code)
if x.exp < dmin+1 {
// underflow // underflow
if x.neg { if x.neg {
z := 0.0 var z float64
return -z, Above return -z, Above
} }
return 0.0, Below return 0.0, Below
} }
// e > -52 // x.exp >= dmin+1
var r Float
r.prec = mbits + 1 // +1 for implicit msb
if x.exp < emin+1 {
// denormal number - round to fewer bits
r.prec = uint32(x.exp - dmin)
}
r.Set(x)
// Rounding may have caused r to overflow to ±Inf
// (rounding never causes underflows to 0).
if r.form == inf {
r.exp = emax + 2 // cause overflow below
}
if e >= 2047 { if r.exp > emax+1 {
// overflow // overflow
if x.neg { if x.neg {
return math.Inf(-1), Below return math.Inf(-1), Below
} }
return math.Inf(+1), Above return math.Inf(+1), Above
} }
// -52 < e < 2047 // dmin+1 <= r.exp <= emax+1
denormal := false
if e < 0 {
denormal = true
e += 52
}
// 0 < e < 2047
s := uint64(0) var s uint64
if r.neg { if r.neg {
s = 1 << 63 s = 1 << (fbits - 1)
} }
m := high64(r.mant) >> 11 & (1<<52 - 1) // cut off msb (implicit 1 bit)
z := math.Float64frombits(s | uint64(e)<<52 | m) m := high64(r.mant) >> ebits & (1<<mbits - 1) // cut off msb (implicit 1 bit)
if denormal {
// adjust for denormal // Rounding may have caused a denormal number to
// TODO(gri) does this change accuracy? // become normal. Check again.
z /= 1 << 52 c := 1.0
if r.exp < emin+1 {
// denormal number
r.exp += mbits
c = 1.0 / (1 << mbits) // 2**-mbits
} }
return z, r.acc // emin+1 <= r.exp <= emax+1
e := uint64(r.exp-emin) << mbits
return c * math.Float64frombits(s|e|m), r.acc
case zero: case zero:
if x.neg { if x.neg {
z := 0.0 var z float64
return -z, Exact return -z, Exact
} }
return 0.0, Exact return 0.0, Exact
......
...@@ -537,14 +537,14 @@ func TestFloatRound(t *testing.T) { ...@@ -537,14 +537,14 @@ func TestFloatRound(t *testing.T) {
// TestFloatRound24 tests that rounding a float64 to 24 bits // TestFloatRound24 tests that rounding a float64 to 24 bits
// matches IEEE-754 rounding to nearest when converting a // matches IEEE-754 rounding to nearest when converting a
// float64 to a float32. // float64 to a float32 (excluding denormal numbers).
func TestFloatRound24(t *testing.T) { func TestFloatRound24(t *testing.T) {
const x0 = 1<<26 - 0x10 // 11...110000 (26 bits) const x0 = 1<<26 - 0x10 // 11...110000 (26 bits)
for d := 0; d <= 0x10; d++ { for d := 0; d <= 0x10; d++ {
x := float64(x0 + d) x := float64(x0 + d)
f := new(Float).SetPrec(24).SetFloat64(x) f := new(Float).SetPrec(24).SetFloat64(x)
got, _ := f.Float64() got, _ := f.Float32()
want := float64(float32(x)) want := float32(x)
if got != want { if got != want {
t.Errorf("Round(%g, 24) = %g; want %g", x, got, want) t.Errorf("Round(%g, 24) = %g; want %g", x, got, want)
} }
...@@ -837,7 +837,70 @@ func TestFloatInt64(t *testing.T) { ...@@ -837,7 +837,70 @@ func TestFloatInt64(t *testing.T) {
} }
} }
// TestFloatFloat32 checks Float.Float32 against a table of inputs covering
// infinities, overflow (including overflow caused by rounding), the largest
// and smallest representable float32 magnitudes, underflow to zero, and NaN.
//
// For every table entry it verifies:
//   - the table value agrees with strconv.ParseFloat where strconv accepts
//     the input syntax (guards against incorrect test data),
//   - the value and Accuracy returned by Float32,
//   - the sign bit of the result (see below),
//   - that converting the result back via SetFloat64 and calling Float32
//     again yields the same value with accuracy Exact.
func TestFloatFloat32(t *testing.T) {
	for _, test := range []struct {
		x   string
		out float32
		acc Accuracy
	}{
		{"-Inf", float32(math.Inf(-1)), Exact},
		{"-0x1.ffffff0p2147483646", float32(-math.Inf(+1)), Below}, // overflow in rounding
		{"-1e10000", float32(math.Inf(-1)), Below}, // overflow
		{"-0x1p128", float32(math.Inf(-1)), Below}, // overflow
		{"-0x1.ffffff0p127", float32(-math.Inf(+1)), Below}, // overflow
		{"-0x1.fffffe8p127", -math.MaxFloat32, Above},
		{"-0x1.fffffe0p127", -math.MaxFloat32, Exact},
		{"-12345.000000000000000000001", -12345, Above},
		{"-12345.0", -12345, Exact},
		{"-1.000000000000000000001", -1, Above},
		{"-1", -1, Exact},
		{"-0x0.000002p-126", -math.SmallestNonzeroFloat32, Exact},
		{"-0x0.000002p-127", -0, Above}, // underflow
		{"-1e-1000", -0, Above}, // underflow
		{"0", 0, Exact},
		{"1e-1000", 0, Below}, // underflow
		{"0x0.000002p-127", 0, Below}, // underflow
		{"0x0.000002p-126", math.SmallestNonzeroFloat32, Exact},
		{"1", 1, Exact},
		{"1.000000000000000000001", 1, Below},
		{"12345.0", 12345, Exact},
		{"12345.000000000000000000001", 12345, Below},
		{"0x1.fffffe0p127", math.MaxFloat32, Exact},
		{"0x1.fffffe8p127", math.MaxFloat32, Below},
		{"0x1.ffffff0p127", float32(math.Inf(+1)), Above}, // overflow
		{"0x1p128", float32(math.Inf(+1)), Above}, // overflow
		{"1e10000", float32(math.Inf(+1)), Above}, // overflow
		{"0x1.ffffff0p2147483646", float32(math.Inf(+1)), Above}, // overflow in rounding
		{"+Inf", float32(math.Inf(+1)), Exact},
	} {
		// conversion should match strconv where syntax is agreeable
		if f, err := strconv.ParseFloat(test.x, 32); err == nil && float32(f) != test.out {
			t.Errorf("%s: got %g; want %g (incorrect test data)", test.x, f, test.out)
		}

		x := makeFloat(test.x)
		out, acc := x.Float32()
		if out != test.out || acc != test.acc {
			t.Errorf("%s: got %g (%#x, %s); want %g (%#x, %s)", test.x, out, math.Float32bits(out), acc, test.out, math.Float32bits(test.out), test.acc)
		}

		// The comparison above cannot tell -0 from +0: the two compare equal,
		// and a "-0" constant in the table is just the untyped constant 0
		// (positive zero). Check the sign bit explicitly against the sign of
		// the input literal so that negative underflow is verified to produce
		// a negative zero.
		wantNeg := test.x[0] == '-'
		if gotNeg := math.Signbit(float64(out)); gotNeg != wantNeg {
			t.Errorf("%s: got sign bit %t; want %t", test.x, gotNeg, wantNeg)
		}

		// test that x.SetFloat64(float64(f)).Float32() == f
		var x2 Float
		out2, acc2 := x2.SetFloat64(float64(out)).Float32()
		if out2 != out || acc2 != Exact {
			t.Errorf("idempotency test: got %g (%s); want %g (Exact)", out2, acc2, out)
		}
	}

	// test NaN
	// (out == out is false only for NaN, so a true comparison means a non-NaN result)
	x := makeFloat("NaN")
	if out, acc := x.Float32(); out == out || acc != Undef {
		t.Errorf("NaN: got %g (%s); want NaN (Undef)", out, acc)
	}
}
func TestFloatFloat64(t *testing.T) { func TestFloatFloat64(t *testing.T) {
const smallestNormalFloat64 = 2.2250738585072014e-308 // 1p-1022
for _, test := range []struct { for _, test := range []struct {
x string x string
out float64 out float64
...@@ -849,7 +912,7 @@ func TestFloatFloat64(t *testing.T) { ...@@ -849,7 +912,7 @@ func TestFloatFloat64(t *testing.T) {
{"-0x1p1024", math.Inf(-1), Below}, // overflow {"-0x1p1024", math.Inf(-1), Below}, // overflow
{"-0x1.fffffffffffff8p1023", -math.Inf(+1), Below}, // overflow {"-0x1.fffffffffffff8p1023", -math.Inf(+1), Below}, // overflow
{"-0x1.fffffffffffff4p1023", -math.MaxFloat64, Above}, {"-0x1.fffffffffffff4p1023", -math.MaxFloat64, Above},
{"-0x1.fffffffffffffp1023", -math.MaxFloat64, Exact}, {"-0x1.fffffffffffff0p1023", -math.MaxFloat64, Exact},
{"-12345.000000000000000000001", -12345, Above}, {"-12345.000000000000000000001", -12345, Above},
{"-12345.0", -12345, Exact}, {"-12345.0", -12345, Exact},
{"-1.000000000000000000001", -1, Above}, {"-1.000000000000000000001", -1, Above},
...@@ -865,18 +928,39 @@ func TestFloatFloat64(t *testing.T) { ...@@ -865,18 +928,39 @@ func TestFloatFloat64(t *testing.T) {
{"1.000000000000000000001", 1, Below}, {"1.000000000000000000001", 1, Below},
{"12345.0", 12345, Exact}, {"12345.0", 12345, Exact},
{"12345.000000000000000000001", 12345, Below}, {"12345.000000000000000000001", 12345, Below},
{"0x1.fffffffffffffp1023", math.MaxFloat64, Exact}, {"0x1.fffffffffffff0p1023", math.MaxFloat64, Exact},
{"0x1.fffffffffffff4p1023", math.MaxFloat64, Below}, {"0x1.fffffffffffff4p1023", math.MaxFloat64, Below},
{"0x1.fffffffffffff8p1023", math.Inf(+1), Above}, // overflow {"0x1.fffffffffffff8p1023", math.Inf(+1), Above}, // overflow
{"0x1p1024", math.Inf(+1), Above}, // overflow {"0x1p1024", math.Inf(+1), Above}, // overflow
{"1e10000", math.Inf(+1), Above}, // overflow {"1e10000", math.Inf(+1), Above}, // overflow
{"0x1.fffffffffffff8p2147483646", math.Inf(+1), Above}, // overflow in rounding {"0x1.fffffffffffff8p2147483646", math.Inf(+1), Above}, // overflow in rounding
{"+Inf", math.Inf(+1), Exact}, {"+Inf", math.Inf(+1), Exact},
// selected denormalized values that were handled incorrectly in the past
{"0x.fffffffffffffp-1022", smallestNormalFloat64 - math.SmallestNonzeroFloat64, Exact},
{"4503599627370495p-1074", smallestNormalFloat64 - math.SmallestNonzeroFloat64, Exact},
// http://www.exploringbinary.com/php-hangs-on-numeric-value-2-2250738585072011e-308/
{"2.2250738585072011e-308", 2.225073858507201e-308, Below},
// http://www.exploringbinary.com/java-hangs-when-converting-2-2250738585072012e-308/
{"2.2250738585072012e-308", 2.2250738585072014e-308, Above},
} { } {
// conversion should match strconv where syntax is agreeable
if f, err := strconv.ParseFloat(test.x, 64); err == nil && f != test.out {
t.Errorf("%s: got %g; want %g (incorrect test data)", test.x, f, test.out)
}
x := makeFloat(test.x) x := makeFloat(test.x)
out, acc := x.Float64() out, acc := x.Float64()
if out != test.out || acc != test.acc { if out != test.out || acc != test.acc {
t.Errorf("%s: got %g (%s); want %g (%s)", test.x, out, acc, test.out, test.acc) t.Errorf("%s: got %g (%#x, %s); want %g (%#x, %s)", test.x, out, math.Float64bits(out), acc, test.out, math.Float64bits(test.out), test.acc)
}
// test that x.SetFloat64(f).Float64() == f
var x2 Float
out2, acc2 := x2.SetFloat64(out).Float64()
if out2 != out || acc2 != Exact {
t.Errorf("idempotency test: got %g (%s); want %g (Exact)", out2, acc2, out)
} }
} }
...@@ -1108,7 +1192,8 @@ func TestFloatAdd(t *testing.T) { ...@@ -1108,7 +1192,8 @@ func TestFloatAdd(t *testing.T) {
} }
// TestFloatAdd32 tests that Float.Add/Sub of numbers with // TestFloatAdd32 tests that Float.Add/Sub of numbers with
// 24bit mantissa behaves like float32 addition/subtraction. // 24bit mantissa behaves like float32 addition/subtraction
// (excluding denormal numbers).
func TestFloatAdd32(t *testing.T) { func TestFloatAdd32(t *testing.T) {
// chose base such that we cross the mantissa precision limit // chose base such that we cross the mantissa precision limit
const base = 1<<26 - 0x10 // 11...110000 (26 bits) const base = 1<<26 - 0x10 // 11...110000 (26 bits)
...@@ -1124,15 +1209,15 @@ func TestFloatAdd32(t *testing.T) { ...@@ -1124,15 +1209,15 @@ func TestFloatAdd32(t *testing.T) {
z := new(Float).SetPrec(24) z := new(Float).SetPrec(24)
z.Add(x, y) z.Add(x, y)
got, acc := z.Float64() got, acc := z.Float32()
want := float64(float32(y0) + float32(x0)) want := float32(y0) + float32(x0)
if got != want || acc != Exact { if got != want || acc != Exact {
t.Errorf("d = %d: %g + %g = %g (%s); want %g (Exact)", d, x0, y0, got, acc, want) t.Errorf("d = %d: %g + %g = %g (%s); want %g (Exact)", d, x0, y0, got, acc, want)
} }
z.Sub(z, y) z.Sub(z, y)
got, acc = z.Float64() got, acc = z.Float32()
want = float64(float32(want) - float32(y0)) want = float32(want) - float32(y0)
if got != want || acc != Exact { if got != want || acc != Exact {
t.Errorf("d = %d: %g - %g = %g (%s); want %g (Exact)", d, x0+y0, y0, got, acc, want) t.Errorf("d = %d: %g - %g = %g (%s); want %g (Exact)", d, x0+y0, y0, got, acc, want)
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment