hash/maphash: revise API to be more idiomatic

This CL makes these changes to the hash/maphash API to make it fit a bit more into the standard library: - Move some of the package doc onto type Hash, so that `go doc maphash.Hash` shows it. - Instead of having identical AddBytes and Write methods, standardize on Write, the usual name for this function. Similarly, AddString -> WriteString, AddByte -> WriteByte. - Instead of having identical Hash and Sum64 methods, standardize on Sum64 (for hash.Hash64). Dropping the "Hash" method also helps because Hash is usually reserved to mean the state of a hash function (hash.Hash etc), not the hash value itself. - Make an uninitialized hash.Hash auto-seed with a random seed. It is critical that users not use the same seed for all hash functions in their program, at least not accidentally. So the Hash implementation must either panic if uninitialized or initialize itself. Initializing itself is less work for users and can be done lazily. - Now that the zero hash.Hash is useful, drop maphash.New in favor of new(maphash.Hash) or simply declaring a maphash.Hash. - Add a [0]func()-typed field to the Hash so that Hashes cannot be compared. (I considered doing the same for Seed but comparing seeds seems OK.) - Drop the integer argument from MakeSeed, to match the original design in golang.org/issue/28322. There is no point to giving users control over the specific seed bits, since we want the interpretation of those bits to be different in every different process. The only thing users need is to be able to create a new random seed at each call. (Fixes a TODO in MakeSeed's public doc comment.) This API is new in Go 1.14, so these changes do not violate the compatibility promise. Fixes #35060. Fixes #35348. Change-Id: Ie6fecc441f3f5ef66388c6ead92e875c0871f805 Reviewed-on: https://go-review.googlesource.com/c/go/+/205069 Run-TryBot: Russ Cox <rsc@golang.org> Reviewed-by: Alan Donovan <adonovan@google.com> Reviewed-by: Keith Randall <khr@golang.org>

hash/maphash: revise API to be more idiomatic
This CL makes these changes to the hash/maphash API to make it fit a bit more into the standard library: - Move some of the package doc onto type Hash, so that `go doc maphash.Hash` shows it. - Instead of having identical AddBytes and Write methods, standardize on Write, the usual name for this function. Similarly, AddString -> WriteString, AddByte -> WriteByte. - Instead of having identical Hash and Sum64 methods, standardize on Sum64 (for hash.Hash64). Dropping the "Hash" method also helps because Hash is usually reserved to mean the state of a hash function (hash.Hash etc), not the hash value itself. - Make an uninitialized hash.Hash auto-seed with a random seed. It is critical that users not use the same seed for all hash functions in their program, at least not accidentally. So the Hash implementation must either panic if uninitialized or initialize itself. Initializing itself is less work for users and can be done lazily. - Now that the zero hash.Hash is useful, drop maphash.New in favor of new(maphash.Hash) or simply declaring a maphash.Hash. - Add a [0]func()-typed field to the Hash so that Hashes cannot be compared. (I considered doing the same for Seed but comparing seeds seems OK.) - Drop the integer argument from MakeSeed, to match the original design in golang.org/issue/28322. There is no point to giving users control over the specific seed bits, since we want the interpretation of those bits to be different in every different process. The only thing users need is to be able to create a new random seed at each call. (Fixes a TODO in MakeSeed's public doc comment.) This API is new in Go 1.14, so these changes do not violate the compatibility promise. Fixes #35060. Fixes #35348. Change-Id: Ie6fecc441f3f5ef66388c6ead92e875c0871f805 Reviewed-on: https://go-review.googlesource.com/c/go/+/205069 Run-TryBot: Russ Cox <rsc@golang.org> Reviewed-by: Alan Donovan <adonovan@google.com> Reviewed-by: Keith Randall <khr@golang.org>
5a7c571e · Russ Cox · 03aca99f · 5a7c571e · 5a7c571e · 5a7c571e
Commit 5a7c571e authored Nov 04, 2019 by Russ Cox
Showing with 168 additions and 125 deletions

src/hash/maphash/maphash.go src/hash/maphash/maphash.go +113 -82

src/hash/maphash/maphash_test.go src/hash/maphash/maphash_test.go +31 -23

src/hash/maphash/smhasher_test.go src/hash/maphash/smhasher_test.go +24 -20

No files found.
--- a/src/hash/maphash/maphash.go
+++ b/src/hash/maphash/maphash.go
@@ -2,65 +2,89 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
-// Package hash/maphash provides hash functions on byte sequences. These
+// Package maphash provides hash functions on byte sequences.
-// hash functions are intended to be used to implement hash tables or
+// These hash functions are intended to be used to implement hash tables or
 // other data structures that need to map arbitrary strings or byte
-// sequences to a uniform distribution of integers. The hash functions
+// sequences to a uniform distribution of integers.
-// are collision-resistant but are not cryptographically secure (use
-// one of the hash functions in crypto/* if you need that).
 //
-// The produced hashes depend only on the sequence of bytes provided
+// The hash functions are collision-resistant but not cryptographically secure.
-// to the Hash object, not on the way in which they are provided. For
+// (See crypto/sha256 and crypto/sha512 for cryptographic use.)
-// example, the calls
-//     h.AddString("foo")
-//     h.AddBytes([]byte{'f','o','o'})
-//     h.AddByte('f'); h.AddByte('o'); h.AddByte('o')
-// will all have the same effect.
-//
-// Two Hash instances in the same process using the same seed
-// behave identically.
-//
-// Two Hash instances with the same seed in different processes are
-// not guaranteed to behave identically, even if the processes share
-// the same binary.
-//
-// Hashes are intended to be collision-resistant, even for situations
-// where an adversary controls the byte sequences being hashed.
-// All bits of the Hash result are close to uniformly and
-// independently distributed, so can be safely restricted to a range
-// using bit masking, shifting, or modular arithmetic.
 package maphash
-import (
+import "unsafe"
-	"unsafe"
-)
-// A Seed controls the behavior of a Hash.  Two Hash objects with the
+// A Seed is a random value that selects the specific hash function
-// same seed in the same process will behave identically.  Two Hash
+// computed by a Hash. If two Hashes use the same Seeds, they
-// objects with different seeds will very likely behave differently.
+// will compute the same hash values for any given input.
+// If two Hashes use different Seeds, they are very likely to compute
+// distinct hash values for any given input.
+//
+// A Seed must be initialized by calling MakeSeed.
+// The zero seed is uninitialized and not valid for use with Hash's SetSeed method.
+//
+// Each Seed value is local to a single process and cannot be serialized
+// or otherwise recreated in a different process.
 type Seed struct {
 	s uint64
 }
-// A Hash object is used to compute the hash of a byte sequence.
+// A Hash computes a seeded hash of a byte sequence.
+//
+// The zero Hash is a valid Hash ready to use.
+// A zero Hash chooses a random seed for itself during
+// the first call to a Reset, Write, Seed, Sum64, or Seed method.
+// For control over the seed, use SetSeed.
+//
+// The computed hash values depend only on the initial seed and
+// the sequence of bytes provided to the Hash object, not on the way
+// in which the bytes are provided. For example, the three sequences
+//
+//     h.Write([]byte{'f','o','o'})
+//     h.WriteByte('f'); h.WriteByte('o'); h.WriteByte('o')
+//     h.WriteString("foo")
+//
+// all have the same effect.
+//
+// Hashes are intended to be collision-resistant, even for situations
+// where an adversary controls the byte sequences being hashed.
+//
+// A Hash is not safe for concurrent use by multiple goroutines, but a Seed is.
+// If multiple goroutines must compute the same seeded hash,
+// each can declare its own Hash and call SetSeed with a common Seed.
 type Hash struct {
+	_     [0]func() // not comparable
 	seed  Seed      // initial seed used for this hash
 	state Seed      // current hash of all flushed bytes
 	buf   [64]byte  // unflushed byte buffer
 	n     int       // number of unflushed bytes
 }
-// AddByte adds b to the sequence of bytes hashed by h.
+// initSeed seeds the hash if necessary.
-func (h *Hash) AddByte(b byte) {
+// initSeed is called lazily before any operation that actually uses h.seed/h.state.
+// Note that this does not include Write/WriteByte/WriteString in the case
+// where they only add to h.buf. (If they write too much, they call h.flush,
+// which does call h.initSeed.)
+func (h *Hash) initSeed() {
+	if h.seed.s == 0 {
+		h.SetSeed(MakeSeed())
+	}
+}
+// WriteByte adds b to the sequence of bytes hashed by h.
+// It never fails; the error result is for implementing io.ByteWriter.
+func (h *Hash) WriteByte(b byte) error {
 	if h.n == len(h.buf) {
 		h.flush()
 	}
 	h.buf[h.n] = b
 	h.n++
+	return nil
 }
-// AddBytes adds b to the sequence of bytes hashed by h.
+// Write adds b to the sequence of bytes hashed by h.
-func (h *Hash) AddBytes(b []byte) {
+// It always writes all of b and never fails; the count and error result are for implementing io.Writer.
+func (h *Hash) Write(b []byte) (int, error) {
+	size := len(b)
 	for h.n+len(b) > len(h.buf) {
 		k := copy(h.buf[h.n:], b)
 		h.n = len(h.buf)
@@ -68,10 +92,13 @@ func (h *Hash) AddBytes(b []byte) {
 		h.flush()
 	}
 	h.n += copy(h.buf[h.n:], b)
+	return size, nil
 }
-// AddString adds the bytes of s to the sequence of bytes hashed by h.
+// WriteString adds the bytes of s to the sequence of bytes hashed by h.
-func (h *Hash) AddString(s string) {
+// It always writes all of s and never fails; the count and error result are for implementing io.StringWriter.
+func (h *Hash) WriteString(s string) (int, error) {
+	size := len(s)
 	for h.n+len(s) > len(h.buf) {
 		k := copy(h.buf[h.n:], s)
 		h.n = len(h.buf)
@@ -79,19 +106,24 @@ func (h *Hash) AddString(s string) {
 		h.flush()
 	}
 	h.n += copy(h.buf[h.n:], s)
+	return size, nil
 }
-// Seed returns the seed value specified in the most recent call to
+// Seed returns h's seed value.
-// SetSeed, or the initial seed if SetSeed was never called.
 func (h *Hash) Seed() Seed {
+	h.initSeed()
 	return h.seed
 }
-// SetSeed sets the seed used by h. Two Hash objects with the same
+// SetSeed sets h to use seed, which must have been returned by MakeSeed
-// seed in the same process will behave identically.  Two Hash objects
+// or by another Hash's Seed method.
-// with different seeds will very likely behave differently.  Any
+// Two Hash objects with the same seed behave identically.
-// bytes added to h previous to this call will be discarded.
+// Two Hash objects with different seeds will very likely behave differently.
+// Any bytes added to h before this call will be discarded.
 func (h *Hash) SetSeed(seed Seed) {
+	if seed.s == 0 {
+		panic("maphash: use of uninitialized Seed")
+	}
 	h.seed = seed
 	h.state = seed
 	h.n = 0
@@ -100,6 +132,7 @@ func (h *Hash) SetSeed(seed Seed) {
 // Reset discards all bytes added to h.
 // (The seed remains the same.)
 func (h *Hash) Reset() {
+	h.initSeed()
 	h.state = h.seed
 	h.n = 0
 }
@@ -107,36 +140,38 @@ func (h *Hash) Reset() {
 // precondition: buffer is full.
 func (h *Hash) flush() {
 	if h.n != len(h.buf) {
-		panic("flush of partially full buffer")
+		panic("maphash: flush of partially full buffer")
 	}
+	h.initSeed()
 	h.state.s = rthash(h.buf[:], h.state.s)
 	h.n = 0
 }
-// Hash returns a value which depends on h's seed and the sequence of
+// Sum64 returns h's current 64-bit value, which depends on
-// bytes added to h (since the last call to Reset or SetSeed).
+// h's seed and the sequence of bytes added to h since the
-func (h *Hash) Hash() uint64 {
+// last call to Reset or SetSeed.
+//
+// All bits of the Sum64 result are close to uniformly and
+// independently distributed, so it can be safely reduced
+// by using bit masking, shifting, or modular arithmetic.
+func (h *Hash) Sum64() uint64 {
+	h.initSeed()
 	return rthash(h.buf[:h.n], h.state.s)
 }
-// MakeSeed returns a Seed initialized using the bits in s.
+// MakeSeed returns a new random seed.
-// Two seeds generated with the same s are guaranteed to be equal.
+func MakeSeed() Seed {
-// Two seeds generated with different s are very likely to be different.
+	var s1, s2 uint64
-// TODO: disallow this? See Alan's comment in the issue.
+	for {
-func MakeSeed(s uint64) Seed {
+		s1 = uint64(runtime_fastrand())
-	return Seed{s: s}
+		s2 = uint64(runtime_fastrand())
-}
+		// We use seed 0 to indicate an uninitialized seed/hash,
+		// so keep trying until we get a non-zero seed.
-// New returns a new Hash object. Different hash objects allocated by
+		if s1|s2 != 0 {
-// this function will very likely have different seeds.
+			break
-func New() *Hash {
-	s1 := uint64(runtime_fastrand())
-	s2 := uint64(runtime_fastrand())
-	seed := Seed{s: s1<<32 + s2}
-	return &Hash{
-		seed:  seed,
-		state: seed,
 		}
+	}
+	return Seed{s: s1<<32 + s2}
 }
 //go:linkname runtime_fastrand runtime.fastrand
@@ -154,22 +189,17 @@ func rthash(b []byte, seed uint64) uint64 {
 	}
 	lo := runtime_memhash(unsafe.Pointer(&b[0]), uintptr(seed), uintptr(len(b)))
 	hi := runtime_memhash(unsafe.Pointer(&b[0]), uintptr(seed>>32), uintptr(len(b)))
-	// TODO: mix lo/hi? Get 64 bits some other way?
 	return uint64(hi)<<32 | uint64(lo)
 }
 //go:linkname runtime_memhash runtime.memhash
 func runtime_memhash(p unsafe.Pointer, seed, s uintptr) uintptr
-// Wrapper functions so that a hash/maphash.Hash implements
+// Sum appends the hash's current 64-bit value to b.
-// the hash.Hash and hash.Hash64 interfaces.
+// It exists for implementing hash.Hash.
+// For direct calls, it is more efficient to use Sum64.
-func (h *Hash) Write(b []byte) (int, error) {
-	h.AddBytes(b)
-	return len(b), nil
-}
 func (h *Hash) Sum(b []byte) []byte {
-	x := h.Hash()
+	x := h.Sum64()
 	return append(b,
 		byte(x>>0),
 		byte(x>>8),
@@ -180,8 +210,9 @@ func (h *Hash) Sum(b []byte) []byte {
 		byte(x>>48),
 		byte(x>>56))
 }
-func (h *Hash) Sum64() uint64 {
-	return h.Hash()
+// Size returns h's hash value size, 8 bytes.
-}
 func (h *Hash) Size() int { return 8 }
+// BlockSize returns h's block size.
 func (h *Hash) BlockSize() int { return len(h.buf) }
--- a/src/hash/maphash/maphash_test.go
+++ b/src/hash/maphash/maphash_test.go
@@ -2,19 +2,18 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
-package maphash_test
+package maphash
 import (
 	"hash"
-	"hash/maphash"
 	"testing"
 )
 func TestUnseededHash(t *testing.T) {
 	m := map[uint64]struct{}{}
 	for i := 0; i < 1000; i++ {
-		h := maphash.New()
+		h := new(Hash)
-		m[h.Hash()] = struct{}{}
+		m[h.Sum64()] = struct{}{}
 	}
 	if len(m) < 900 {
 		t.Errorf("empty hash not sufficiently random: got %d, want 1000", len(m))
@@ -22,12 +21,12 @@ func TestUnseededHash(t *testing.T) {
 }
 func TestSeededHash(t *testing.T) {
-	s := maphash.MakeSeed(1234)
+	s := MakeSeed()
 	m := map[uint64]struct{}{}
 	for i := 0; i < 1000; i++ {
-		h := maphash.New()
+		h := new(Hash)
 		h.SetSeed(s)
-		m[h.Hash()] = struct{}{}
+		m[h.Sum64()] = struct{}{}
 	}
 	if len(m) != 1 {
 		t.Errorf("seeded hash is random: got %d, want 1", len(m))
@@ -36,14 +35,17 @@ func TestSeededHash(t *testing.T) {
 func TestHashGrouping(t *testing.T) {
 	b := []byte("foo")
-	h1 := maphash.New()
+	h1 := new(Hash)
-	h2 := maphash.New()
+	h2 := new(Hash)
 	h2.SetSeed(h1.Seed())
-	h1.AddBytes(b)
+	h1.Write(b)
 	for _, x := range b {
-		h2.AddByte(x)
+		err := h2.WriteByte(x)
+		if err != nil {
+			t.Fatalf("WriteByte: %v", err)
 		}
-	if h1.Hash() != h2.Hash() {
+	}
+	if h1.Sum64() != h2.Sum64() {
 		t.Errorf("hash of \"foo\" and \"f\",\"o\",\"o\" not identical")
 	}
 }
@@ -51,13 +53,19 @@ func TestHashGrouping(t *testing.T) {
 func TestHashBytesVsString(t *testing.T) {
 	s := "foo"
 	b := []byte(s)
-	h1 := maphash.New()
+	h1 := new(Hash)
-	h2 := maphash.New()
+	h2 := new(Hash)
 	h2.SetSeed(h1.Seed())
-	h1.AddString(s)
+	n1, err1 := h1.WriteString(s)
-	h2.AddBytes(b)
+	if n1 != len(s) || err1 != nil {
-	if h1.Hash() != h2.Hash() {
+		t.Fatalf("WriteString(s) = %d, %v, want %d, nil", n1, err1, len(s))
-		t.Errorf("hash of string and byts not identical")
+	}
+	n2, err2 := h2.Write(b)
+	if n2 != len(b) || err2 != nil {
+		t.Fatalf("Write(b) = %d, %v, want %d, nil", n2, err2, len(b))
+	}
+	if h1.Sum64() != h2.Sum64() {
+		t.Errorf("hash of string and bytes not identical")
 	}
 }
@@ -66,9 +74,9 @@ func TestHashHighBytes(t *testing.T) {
 	const N = 10
 	m := map[uint64]struct{}{}
 	for i := 0; i < N; i++ {
-		h := maphash.New()
+		h := new(Hash)
-		h.AddString("foo")
+		h.WriteString("foo")
-		m[h.Hash()>>32] = struct{}{}
+		m[h.Sum64()>>32] = struct{}{}
 	}
 	if len(m) < N/2 {
 		t.Errorf("from %d seeds, wanted at least %d different hashes; got %d", N, N/2, len(m))
@@ -76,5 +84,5 @@ func TestHashHighBytes(t *testing.T) {
 }
 // Make sure a Hash implements the hash.Hash and hash.Hash64 interfaces.
-var _ hash.Hash = &maphash.Hash{}
+var _ hash.Hash = &Hash{}
-var _ hash.Hash64 = &maphash.Hash{}
+var _ hash.Hash64 = &Hash{}
--- a/src/hash/maphash/smhasher_test.go
+++ b/src/hash/maphash/smhasher_test.go
@@ -2,11 +2,10 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
-package maphash_test
+package maphash
 import (
 	"fmt"
-	"hash/maphash"
 	"math"
 	"math/rand"
 	"runtime"
@@ -19,6 +18,8 @@ import (
 // https://code.google.com/p/smhasher/
 // This code is a port of some of the Smhasher tests to Go.
+var fixedSeed = MakeSeed()
 // Sanity checks.
 // hash should not depend on values outside key.
 // hash should not depend on alignment.
@@ -36,7 +37,7 @@ func TestSmhasherSanity(t *testing.T) {
 				randBytes(r, b[:])
 				randBytes(r, c[:])
 				copy(c[PAD+i:PAD+i+n], b[PAD:PAD+n])
-				if bytesHash(b[PAD:PAD+n], 0) != bytesHash(c[PAD+i:PAD+i+n], 0) {
+				if bytesHash(b[PAD:PAD+n]) != bytesHash(c[PAD+i:PAD+i+n]) {
 					t.Errorf("hash depends on bytes outside key")
 				}
 			}
@@ -44,17 +45,17 @@ func TestSmhasherSanity(t *testing.T) {
 	}
 }
-func bytesHash(b []byte, seed uint64) uint64 {
+func bytesHash(b []byte) uint64 {
-	h := maphash.New()
+	var h Hash
-	h.SetSeed(maphash.MakeSeed(seed))
+	h.SetSeed(fixedSeed)
-	h.AddBytes(b)
+	h.Write(b)
-	return h.Hash()
+	return h.Sum64()
 }
-func stringHash(s string, seed uint64) uint64 {
+func stringHash(s string) uint64 {
-	h := maphash.New()
+	var h Hash
-	h.SetSeed(maphash.MakeSeed(seed))
+	h.SetSeed(fixedSeed)
-	h.AddString(s)
+	h.WriteString(s)
-	return h.Hash()
+	return h.Sum64()
 }
 const hashSize = 64
@@ -77,13 +78,16 @@ func (s *hashSet) add(h uint64) {
 	s.n++
 }
 func (s *hashSet) addS(x string) {
-	s.add(stringHash(x, 0))
+	s.add(stringHash(x))
 }
 func (s *hashSet) addB(x []byte) {
-	s.add(bytesHash(x, 0))
+	s.add(bytesHash(x))
 }
-func (s *hashSet) addS_seed(x string, seed uint64) {
+func (s *hashSet) addS_seed(x string, seed Seed) {
-	s.add(stringHash(x, seed))
+	var h Hash
+	h.SetSeed(seed)
+	h.WriteString(x)
+	s.add(h.Sum64())
 }
 func (s *hashSet) check(t *testing.T) {
 	const SLOP = 10.0
@@ -312,7 +316,7 @@ func (k *bytesKey) flipBit(i int) {
 	k.b[i>>3] ^= byte(1 << uint(i&7))
 }
 func (k *bytesKey) hash() uint64 {
-	return bytesHash(k.b, 0)
+	return bytesHash(k.b)
 }
 func (k *bytesKey) name() string {
 	return fmt.Sprintf("bytes%d", len(k.b))
@@ -458,8 +462,8 @@ func TestSmhasherSeed(t *testing.T) {
 	const N = 100000
 	s := "hello"
 	for i := 0; i < N; i++ {
-		h.addS_seed(s, uint64(i))
+		h.addS_seed(s, Seed{s: uint64(i + 1)})
-		h.addS_seed(s, uint64(i)<<32) // make sure high bits are used
+		h.addS_seed(s, Seed{s: uint64(i+1) << 32}) // make sure high bits are used
 	}
 	h.check(t)
 }