Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
C
cpython
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
Kirill Smelkov
cpython
Commits
aeb1be58
Commit
aeb1be58
authored
Oct 28, 2018
by
jdemeyer
Committed by
Raymond Hettinger
Oct 27, 2018
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
bpo-34751: improved hash function for tuples (GH-9471)
parent
53125a53
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
143 additions
and
43 deletions
+143
-43
Lib/test/test_tuple.py
Lib/test/test_tuple.py
+93
-18
Misc/NEWS.d/next/Core and Builtins/2018-09-20-15-41-58.bpo-34751.Yiv0pV.rst
...ore and Builtins/2018-09-20-15-41-58.bpo-34751.Yiv0pV.rst
+4
-0
Objects/tupleobject.c
Objects/tupleobject.c
+46
-25
No files found.
Lib/test/test_tuple.py
View file @
aeb1be58
...
...
@@ -62,29 +62,104 @@ class TupleTest(seq_tests.CommonTest):
yield
i
self
.
assertEqual
(
list
(
tuple
(
f
())),
list
(
range
(
1000
)))
def
test_hash
(
self
):
# See SF bug 942952: Weakness in tuple hash
# The hash should:
# be non-commutative
# should spread-out closely spaced values
# should not exhibit cancellation in tuples like (x,(x,y))
# should be distinct from element hashes: hash(x)!=hash((x,))
# This test exercises those cases.
# For a pure random hash and N=50, the expected number of occupied
# buckets when tossing 252,600 balls into 2**32 buckets
# is 252,592.6, or about 7.4 expected collisions. The
# standard deviation is 2.73. On a box with 64-bit hash
# codes, no collisions are expected. Here we accept no
# more than 15 collisions. Any worse and the hash function
# is sorely suspect.
# Various tests for hashing of tuples to check that we get few collisions.
#
# Earlier versions of the tuple hash algorithm had collisions
# reported at:
# - https://bugs.python.org/issue942952
# - https://bugs.python.org/issue34751
#
# Notes:
# - The hash of tuples is deterministic: if the test passes once on a given
# system, it will always pass. So the probabilities mentioned in the
# test_hash functions below should be interpreted assuming that the
# hashes are random.
# - Due to the structure in the testsuite inputs, collisions are not
# independent. For example, if hash((a,b)) == hash((c,d)), then also
# hash((a,b,x)) == hash((c,d,x)). But the quoted probabilities assume
# independence anyway.
# - We limit the hash to 32 bits in the tests to have a good test on
# 64-bit systems too. Furthermore, this is also a sanity check that the
# lower 32 bits of a 64-bit hash are sufficiently random too.
def
test_hash1
(
self
):
# Check for hash collisions between small integers in range(50) and
# certain tuples and nested tuples of such integers.
N
=
50
base
=
list
(
range
(
N
))
xp
=
[(
i
,
j
)
for
i
in
base
for
j
in
base
]
inps
=
base
+
[(
i
,
j
)
for
i
in
base
for
j
in
xp
]
+
\
[(
i
,
j
)
for
i
in
xp
for
j
in
base
]
+
xp
+
list
(
zip
(
base
))
collisions
=
len
(
inps
)
-
len
(
set
(
map
(
hash
,
inps
)))
self
.
assertTrue
(
collisions
<=
15
)
self
.
assertEqual
(
len
(
inps
),
252600
)
hashes
=
set
(
hash
(
x
)
%
2
**
32
for
x
in
inps
)
collisions
=
len
(
inps
)
-
len
(
hashes
)
# For a pure random 32-bit hash and N = 252,600 test items, the
# expected number of collisions equals
#
# 2**(-32) * N(N-1)/2 = 7.4
#
# We allow up to 15 collisions, which suffices to make the test
# pass with 99.5% confidence.
self
.
assertLessEqual
(
collisions
,
15
)
def
test_hash2
(
self
):
# Check for hash collisions between small integers (positive and
# negative), tuples and nested tuples of such integers.
# All numbers in the interval [-n, ..., n] except -1 because
# hash(-1) == hash(-2).
n
=
5
A
=
[
x
for
x
in
range
(
-
n
,
n
+
1
)
if
x
!=
-
1
]
B
=
A
+
[(
a
,)
for
a
in
A
]
L2
=
[(
a
,
b
)
for
a
in
A
for
b
in
A
]
L3
=
L2
+
[(
a
,
b
,
c
)
for
a
in
A
for
b
in
A
for
c
in
A
]
L4
=
L3
+
[(
a
,
b
,
c
,
d
)
for
a
in
A
for
b
in
A
for
c
in
A
for
d
in
A
]
# T = list of testcases. These consist of all (possibly nested
# at most 2 levels deep) tuples containing at most 4 items from
# the set A.
T
=
A
T
+=
[(
a
,)
for
a
in
B
+
L4
]
T
+=
[(
a
,
b
)
for
a
in
L3
for
b
in
B
]
T
+=
[(
a
,
b
)
for
a
in
L2
for
b
in
L2
]
T
+=
[(
a
,
b
)
for
a
in
B
for
b
in
L3
]
T
+=
[(
a
,
b
,
c
)
for
a
in
B
for
b
in
B
for
c
in
L2
]
T
+=
[(
a
,
b
,
c
)
for
a
in
B
for
b
in
L2
for
c
in
B
]
T
+=
[(
a
,
b
,
c
)
for
a
in
L2
for
b
in
B
for
c
in
B
]
T
+=
[(
a
,
b
,
c
,
d
)
for
a
in
B
for
b
in
B
for
c
in
B
for
d
in
B
]
self
.
assertEqual
(
len
(
T
),
345130
)
hashes
=
set
(
hash
(
x
)
%
2
**
32
for
x
in
T
)
collisions
=
len
(
T
)
-
len
(
hashes
)
# For a pure random 32-bit hash and N = 345,130 test items, the
# expected number of collisions equals
#
# 2**(-32) * N(N-1)/2 = 13.9
#
# We allow up to 20 collisions, which suffices to make the test
# pass with 95.5% confidence.
self
.
assertLessEqual
(
collisions
,
20
)
def
test_hash3
(
self
):
# Check for hash collisions between tuples containing 0.0 and 0.5.
# The hashes of 0.0 and 0.5 itself differ only in one high bit.
# So this implicitly tests propagation of high bits to low bits.
from
itertools
import
product
T
=
list
(
product
([
0.0
,
0.5
],
repeat
=
18
))
self
.
assertEqual
(
len
(
T
),
262144
)
hashes
=
set
(
hash
(
x
)
%
2
**
32
for
x
in
T
)
collisions
=
len
(
T
)
-
len
(
hashes
)
# For a pure random 32-bit hash and N = 262,144 test items, the
# expected number of collisions equals
#
# 2**(-32) * N(N-1)/2 = 8.0
#
# We allow up to 15 collisions, which suffices to make the test
# pass with 99.1% confidence.
self
.
assertLessEqual
(
collisions
,
15
)
def
test_repr
(
self
):
l0
=
tuple
()
...
...
Misc/NEWS.d/next/Core and Builtins/2018-09-20-15-41-58.bpo-34751.Yiv0pV.rst
0 → 100644
View file @
aeb1be58
The hash function for tuples is now based on xxHash
which gives better collision results on (formerly) pathological cases.
Additionally, on 64-bit systems it improves tuple hashes in general.
Patch by Jeroen Demeyer with substantial contributions by Tim Peters.
Objects/tupleobject.c
View file @
aeb1be58
...
...
@@ -333,39 +333,60 @@ error:
return
NULL
;
}
/* The addend 82520 was selected from the range(0, 1000000) for
generating the greatest number of prime multipliers for tuples
up to length eight:
1082527, 1165049, 1082531, 1165057, 1247581, 1330103, 1082533,
1330111, 1412633, 1165069, 1247599, 1495177, 1577699
Tests have shown that caching the hash value is not worthwhile; see
issue #9685.
/* Hash for tuples. This is a slightly simplified version of the xxHash
non-cryptographic hash:
- we do not use any parallelism: there is only 1 accumulator.
- we drop the final mixing since this is just a permutation of the
output space: it does not help against collisions.
- at the end, we mangle the length with a single constant.
For the xxHash specification, see
https://github.com/Cyan4973/xxHash/blob/master/doc/xxhash_spec.md
Below are the official constants from the xxHash specification. Optimizing
compilers should emit a single "rotate" instruction for the
_PyHASH_XXROTATE() expansion. If that doesn't happen for some important
platform, the macro could be changed to expand to a platform-specific rotate
spelling instead.
*/
/* xxHash primes and rotation, selected by the width of Py_uhash_t.
   _PyHASH_XXROTATE() parenthesises its argument so that an expression
   argument (e.g. "a | b") is rotated as a whole.  The argument is
   evaluated twice, so it must not have side effects. */
#if SIZEOF_PY_UHASH_T > 4
#define _PyHASH_XXPRIME_1 ((Py_uhash_t)11400714785074694791ULL)
#define _PyHASH_XXPRIME_2 ((Py_uhash_t)14029467366897019727ULL)
#define _PyHASH_XXPRIME_5 ((Py_uhash_t)2870177450012600261ULL)
/* Rotate left 31 bits */
#define _PyHASH_XXROTATE(x) (((x) << 31) | ((x) >> 33))
#else
#define _PyHASH_XXPRIME_1 ((Py_uhash_t)2654435761UL)
#define _PyHASH_XXPRIME_2 ((Py_uhash_t)2246822519UL)
#define _PyHASH_XXPRIME_5 ((Py_uhash_t)374761393UL)
/* Rotate left 13 bits */
#define _PyHASH_XXROTATE(x) (((x) << 13) | ((x) >> 19))
#endif
/* Tests have shown that caching the hash value is not worthwhile; see
https://bugs.python.org/issue9685 */
/* Compute the hash of tuple `v` by folding the hash of every item, in
   order, into a single accumulator (one multiply-add, one rotate and one
   multiply per item), then mixing in the tuple length.

   Returns -1 as soon as PyObject_Hash() returns -1 for an item;
   otherwise returns the accumulated value, which is never -1. */
static Py_hash_t
tuplehash(PyTupleObject *v)
{
    Py_ssize_t i, len = Py_SIZE(v);
    PyObject **item = v->ob_item;

    /* Unsigned accumulator, for defined overflow behavior. */
    Py_uhash_t acc = _PyHASH_XXPRIME_5;
    for (i = 0; i < len; i++) {
        Py_uhash_t lane = PyObject_Hash(item[i]);
        if (lane == (Py_uhash_t)-1) {
            /* Hashing the item failed: pass the -1 on to the caller. */
            return -1;
        }
        acc += lane * _PyHASH_XXPRIME_2;
        acc = _PyHASH_XXROTATE(acc);
        acc *= _PyHASH_XXPRIME_1;
    }

    /* Add input length, mangled to keep the historical value of hash(()). */
    acc += len ^ (_PyHASH_XXPRIME_5 ^ 3527539UL);

    /* -1 is the failure value returned above, so a successful hash that
       happens to equal it is replaced by a fixed constant. */
    if (acc == (Py_uhash_t)-1) {
        return 1546275796;
    }
    return acc;
}
static
Py_ssize_t
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment