Commit b5e39d4a authored by Kirill Smelkov's avatar Kirill Smelkov

X wcfs/treegen: allstructs: Do not keep all tree structures in memory

We only need n random samples from AllStructs, and we can draw them
without first keeping the whole AllStructs result in memory.

Starting from maxdepth=4 and nkeys=10 there are too many entries in that
whole list. For example the following input was causing treegen
allstructs to be OOM killed on my machine:

    4 2 20/8808469856591574482 0:f,1:d,2:g 0:d,1:e,2:a,3:h,4:d,5:h,6:d,7:c

Now it works ok without eating memory.
parent e5601af1
......@@ -122,6 +122,7 @@ from ZODB import DB
from ZODB.Connection import Connection
from ZODB.MappingStorage import MappingStorage
import transaction
import itertools
import random
import six
......@@ -292,10 +293,6 @@ def AllStructs(kv1txt, kv2txt, maxdepth, maxsplit, n, seed=None):
commit('kv2')
t2struct0 = xbtree.StructureOf(ztree)
# all tree topologies that can represent kv1 and kv2
t1AllStructs = list(xbtree.AllStructs(kv1.keys(), maxdepth, maxsplit, kv=kv1))
t2AllStructs = list(xbtree.AllStructs(kv2.keys(), maxdepth, maxsplit, kv=kv2))
# seed
if seed is None:
seed = time.now()
......@@ -303,12 +300,16 @@ def AllStructs(kv1txt, kv2txt, maxdepth, maxsplit, n, seed=None):
random.seed(seed)
print("# maxdepth=%d maxsplit=%d n=%d seed=%d" % (maxdepth, maxsplit, n, seed))
# make 2·n random samples from all tree topologies that can represent kv1 and kv2
t1structv = rsample(xbtree.AllStructs(kv1.keys(), maxdepth, maxsplit, kv=kv1), n)
t2structv = rsample(xbtree.AllStructs(kv2.keys(), maxdepth, maxsplit, kv=kv2), n)
# all tree1 and tree2 topologies jumps in between we are going to emit:
# native + n random ones.
if t1struct0 in t1AllStructs: t1AllStructs.remove(t1struct0) # avoid dups
if t2struct0 in t2AllStructs: t2AllStructs.remove(t2struct0)
t1structv = [t1struct0] + random.sample(t1AllStructs, min(n, len(t1AllStructs)))
t2structv = [t2struct0] + random.sample(t2AllStructs, min(n, len(t2AllStructs)))
if t1struct0 in t1structv: t1structv.remove(t1struct0) # avoid dups
if t2struct0 in t2structv: t2structv.remove(t2struct0)
t1structv.insert(0, t1struct0)
t2structv.insert(0, t2struct0)
# emit topologies for tree1->tree2 and tree1<-tree2 transitions for all
# combinations of tree1 and tree2.
......@@ -321,6 +322,24 @@ def AllStructs(kv1txt, kv2txt, maxdepth, maxsplit, n, seed=None):
print(zctx.TopoEncode(tstruct))
# rsample returns k random samples from seq.
# it differs from random.sample in that it does not keep whole list(seq) in memory.
#
# seq may be any iterable, including a one-shot generator; it is consumed
# in a single pass. If seq yields fewer than k items, all of them are
# returned in shuffled order.
#
# k=0 returns [] (like random.sample does); k<0 raises ValueError.
def rsample(seq, k): # -> [] of items;  len <= k
    # based on https://stackoverflow.com/a/35671225/9456786
    # https://en.wikipedia.org/wiki/Reservoir_sampling
    if k < 0:
        raise ValueError("negative sample size")
    if k == 0:
        return []   # nothing to pick; seq is not iterated at all

    it = iter(seq)

    # fill the reservoir with the first k items
    sample = list(itertools.islice(it, k))
    random.shuffle(sample)

    # i is the 1-based ordinal of item in seq; the item takes over a random
    # reservoir slot with probability k/i.
    for i, item in enumerate(it, k + 1):
        j = random.randrange(i) # [0,i)
        if j < k:
            sample[j] = item

    return sample
# bitravel2Way generates travel path through all A<->B edges such
# that every edge a->b and a<-b is traveled exactly once.
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment