Commit bf9a7405 authored by Kirill Smelkov

X No longer rely on ZODB cache invariant for invalidations

Move all aspects of what is tracked into ΔFtail.
It is more robust if ΔFtail does not rely on properties of the outside
zconn.Cache in order to work correctly.
It also potentially allows moving ΔFtail out of the zdata package.

And it is even more memory efficient, as before it was

	Connection.cache[zoid] -> ZBlk(ghost)->inΔFtail

and now it is

	ΔFtail.trackSetZBlk[zoid] -> zblkTrack

because the Persistent embedded in ZBlk, even in ghost state, alone
occupies ~ 16 words.

Now we keep only what actually needs to be kept for tracking.
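
To make the layout change concrete, here is a condensed sketch of both
variants, assembled from the diff below (not verbatim source; SetOid and
SetI64 are the set types used throughout wcfs):

    // before: tracking state was embedded into every ZBlk*, so keeping it
    // alive meant keeping the whole ZBlk object (with its Persistent header
    // and a mutex) pinned in the ZODB live cache, even for ghosts
    type zblkInΔFtail struct {
        mu     sync.Mutex          // to support multiple concurrent loaders
        infile map[zodb.Oid]SetI64 // {} foid -> set(#blk)
    }

    // after: only the tracking payload is kept, owned by ΔFtail and keyed by oid
    type zblkTrack struct {
        infile map[zodb.Oid]SetI64 // {} foid -> {}blk
    }

    type ΔFtail struct {
        ...
        trackSetZFile SetOid                  // set of tracked ZBigFiles as of @head
        trackSetZBlk  map[zodb.Oid]*zblkTrack // zblk -> {} root -> {}blk as of @head
    }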
parent ef74aebc
@@ -60,7 +60,6 @@ import (
 // ZBlk is the interface that every ZBlk* block implements.
 type ZBlk interface {
     zodb.IPersistent
-    _ZBlkInΔFtail

     // LoadBlkData loads from database and returns data block stored by this ZBlk.
     //
@@ -79,7 +78,6 @@ var _ ZBlk = (*ZBlk1)(nil)
 // ZBlk0 mimics ZBlk0 from python.
 type ZBlk0 struct {
     zodb.Persistent
-    zblkInΔFtail

     // NOTE py source uses bytes(buf) but on python2 it still results in str
     blkdata string
@@ -157,7 +155,6 @@ func (zd *zDataState) PySetState(pystate interface{}) error {
 // ZBlk1 mimics ZBlk1 from python.
 type ZBlk1 struct {
     zodb.Persistent
-    zblkInΔFtail

     chunktab *btree.IOBTree // {} offset -> ZData(chunk)
 }
...
@@ -22,8 +22,6 @@ package zdata
 import (
     "context"
     "fmt"
-    "runtime"
-    "sync"

     "lab.nexedi.com/kirr/go123/xerr"
     "lab.nexedi.com/kirr/neo/go/zodb"
@@ -79,6 +77,8 @@ type ΔFtail struct {
     δBtail  *xbtree.ΔBtail
     fileIdx map[zodb.Oid]SetOid // tree-root -> {} ZBigFile<oid> as of @head

+    trackSetZFile SetOid                  // set of tracked ZBigFiles as of @head
+    trackSetZBlk  map[zodb.Oid]*zblkTrack // zblk -> {} root -> {}blk as of @head
     // XXX kill
     ///*
     // XXX don't need vδF - everything is reconstructed at runtime from .δBtail.vδT
@@ -89,10 +89,16 @@ type ΔFtail struct {
     // tracked ZBlk that are not yet taken into account in current vδF.
     // grows on new track requests; flushes on queries and update.
-    trackNew map[zodb.Oid]map[zodb.Oid]*zblkInΔFtail // {} foid -> {} zoid -> zblk
+    trackNew map[zodb.Oid]map[zodb.Oid]*zblkTrack // {} foid -> {} zoid -> zblk
     //*/
 }

+// zblkTrack keeps information in which root/blocks ZBlk is present as of @head.
+type zblkTrack struct {
+    // inroot map[zodb.Oid]SetI64 // {} root -> {}blk  XXX later switch to this
+    infile map[zodb.Oid]SetI64 // {} foid -> {}blk
+}
+
 // ΔF represents a change in files space.
 type ΔF struct {
     Rev zodb.Tid
@@ -106,24 +112,6 @@ type ΔFile struct {
     Size bool // whether file size changed  XXX -> ΔSize?
 }

-// zblkInΔFtail is ΔFtail-related volatile data embedded into ZBlk*.
-//
-// The data is preserved even when ZBlk comes to ghost state, but is lost if
-// ZBlk is garbage collected. The data is transient - it is _not_ included into
-// persistent state.
-type zblkInΔFtail struct {
-    mu sync.Mutex // to support multiple concurrent loaders
-
-    // XXX change vvv to intree_parent? {} Bucket -> set(#blk)
-    // (this is uniform with building in-RAM reverse child->parents relation for
-    //  tree nodes and for tree_root->file)
-
-    // with which files/blocks this ZBlk is associated with as of @head state
-    infile map[zodb.Oid]SetI64 // {} foid -> set(#blk)
-}
-
-type _ZBlkInΔFtail interface { inΔFtail() *zblkInΔFtail }
-
-func (z *zblkInΔFtail) inΔFtail() *zblkInΔFtail { return z }

 // NewΔFtail creates new ΔFtail object.
 //
@@ -135,8 +123,10 @@ func (z *zblkInΔFtail) inΔFtail() *zblkInΔFtail { return z }
 func NewΔFtail(at0 zodb.Tid, db *zodb.DB) *ΔFtail {
     return &ΔFtail{
         δBtail:   xbtree.NewΔBtail(at0, db),
-        fileIdx:  make(map[zodb.Oid]SetOid),
-        trackNew: make(map[zodb.Oid]map[zodb.Oid]*zblkInΔFtail),
+        fileIdx:       map[zodb.Oid]SetOid{},
+        trackSetZFile: SetOid{},
+        trackSetZBlk:  map[zodb.Oid]*zblkTrack{},
+        trackNew:      map[zodb.Oid]map[zodb.Oid]*zblkTrack{},
     }
 }
@@ -158,6 +148,8 @@ func (δFtail *ΔFtail) Tail() zodb.Tid { return δFtail.δBtail.Tail() }
 //
 // A root can be associated with several files (each provided on different Track call).
 func (δFtail *ΔFtail) Track(file *ZBigFile, blk int64, path []btree.LONode, zblk ZBlk) {
+    // XXX locking
+
     foid := file.POid()
     if blk == -1 {
         // XXX blk = ∞ from beginning ?
@@ -167,6 +159,7 @@ func (δFtail *ΔFtail) Track(file *ZBigFile, blk int64, path []btree.LONode, zb
     if err != nil {
         panic(err) // XXX -> error? errctx
     }
+
     root := path[0].(*btree.LOBTree)
     files, ok := δFtail.fileIdx[root.POid()]
     if !ok {
@@ -175,30 +168,35 @@ func (δFtail *ΔFtail) Track(file *ZBigFile, blk int64, path []btree.LONode, zb
     }
     files.Add(foid)
+    δFtail.trackSetZFile.Add(foid)
+
     // associate zblk with file, if it was not hole
     if zblk != nil {
-        z := zblk.inΔFtail()
-        z.mu.Lock()
-        blocks, ok := z.infile[foid]
+        zoid := zblk.POid()
+        zt, ok := δFtail.trackSetZBlk[zoid]
+        if !ok {
+            zt = &zblkTrack{}
+            δFtail.trackSetZBlk[zoid] = zt
+        }
+
+        blocks, ok := zt.infile[foid]
         if !ok {
             blocks = make(SetI64, 1)
-            if z.infile == nil {
-                z.infile = make(map[zodb.Oid]SetI64)
+            if zt.infile == nil {
+                zt.infile = make(map[zodb.Oid]SetI64)
             }
-            z.infile[foid] = blocks
+            zt.infile[foid] = blocks
         }
         blocks.Add(blk)
-        z.mu.Unlock()

-        // XXX locking
         if !ok {
             // zblk was not associated with this file
-            zt := δFtail.trackNew[foid]
-            if zt == nil {
-                zt = make(map[zodb.Oid]*zblkInΔFtail, 1)
-                δFtail.trackNew[foid] = zt
+            ft := δFtail.trackNew[foid]
+            if ft == nil {
+                ft = make(map[zodb.Oid]*zblkTrack, 1)
+                δFtail.trackNew[foid] = ft
             }
-            zt[zblk.POid()] = z
+            ft[zoid] = zt
         }
     }
@@ -227,6 +225,7 @@ func (δFtail *ΔFtail) Update(δZ *zodb.EventCommit, zhead *xzodb.ZConn) (_ ΔF
     // XXX δFtail.update() first?
     // XXX verify zhead.At() == δFtail.Head()
+    // XXX locking

     δB, err := δFtail.δBtail.Update(δZ)
     if err != nil {
@@ -263,42 +262,29 @@ func (δFtail *ΔFtail) Update(δZ *zodb.EventCommit, zhead *xzodb.ZConn) (_ ΔF
     // take zblk changes into account
     for _, oid := range δZ.Changev {
-        // XXX cache lock/unlock
-        obj := zhead.Cache().Get(oid)
-        if obj == nil {
-            //fmt.Printf("%s: not in cache\n", oid)
-            continue // nothing to do - see invariant
-        }
-
-        //fmt.Printf("%s: in cache (%s)\n", oid, typeOf(obj))
-
-        switch obj := obj.(type) {
-        case ZBlk: // ZBlk*
-            // z.infile locking: since we write-locked head.zheadMu
-            //  - no other fuse reads are running, and thus no one
-            //  is mutating z.infile. XXX recheck
-            z := obj.inΔFtail()
-            for file, blocks := range z.infile {
-                δfile, ok := δF.ByFile[file]
-                if !ok {
-                    δfile = &ΔFile{Rev: δF.Rev, Blocks: make(SetI64)}
-                    δF.ByFile[file] = δfile
-                }
-                δfile.Blocks.Update(blocks)
-            }
-
-            // XXX update z.infile according to btree changes
-
-        case *ZBigFile:
+        if δFtail.trackSetZFile.Has(oid) {
             // TODO check that .blksize and .blktab (it is only
             // persistent reference) do not change.

             return ΔF{}, fmt.Errorf("ZBigFile<%s> changed @%s", oid, δZ.Tid)
         }

-        // make sure obj won't be garbage-collected until we finish handling it.
-        runtime.KeepAlive(obj)
+        zt, ok := δFtail.trackSetZBlk[oid]
+        if !ok {
+            continue // not tracked
+        }
+
+        for foid, blocks := range zt.infile {
+            δfile, ok := δF.ByFile[foid]
+            if !ok {
+                δfile = &ΔFile{Rev: δF.Rev, Blocks: make(SetI64)}
+                δF.ByFile[foid] = δfile
+            }
+            δfile.Blocks.Update(blocks)
+        }
+
+        // XXX update zt.infile according to btree changes
     }

     δFtail.vδF = append(δFtail.vδF, δF)
...
@@ -257,21 +257,19 @@ package main
 // 2) head/bigfile/* of all bigfiles represent state as of zhead.At .
 // 3) for head/bigfile/* the following invariant is maintained:
 //
-//    #blk ∈ OS file cache  =>  ZBlk(#blk) + all BTree/Bucket that lead to it ∈ zhead live cache(%)
-//                              (ZBlk* in ghost state)
-//
-//    => all BTree/Bucket that lead to blk are tracked (XXX)
+//    #blk ∈ OS file cache  =>  all BTree/Bucket/ZBlk that lead to blk are tracked(%)
 //
-//    The invariant helps on invalidation: if we see a changed oid, and
-//    zhead.cache.lookup(oid) = ø -> we know we don't have to invalidate OS
-//    cache for any part of any file (even if oid relates to a file block - that
-//    block is not cached and will trigger ZODB load on file read).
+//    The invariant helps on invalidation: when δFtail (see below) sees a
+//    changed oid, it is guaranteed that if the change affects block that was
+//    ever provided to OS, δFtail will detect that this block has changed. XXX review
+//    And if oid relates to a file block but is not in δFtail's tracking set -
+//    we know that block is not cached and will trigger ZODB load on a future
+//    file read.
 //
-//    XXX explain why tracked
-//
-//    Currently we maintain this invariant by simply never evicting ZBlk/LOBTree/LOBucket
-//    objects from ZODB Connection cache. In the future we may want to try to
-//    synchronize to kernel freeing its pagecache pages.
+//    Currently we maintain this invariant by adding ZBlk/LOBTree/LOBucket
+//    objects to δFtail on every access, and never shrinking that tracking set.
+//    In the future we may want to try to synchronize to kernel freeing its
+//    pagecache pages.
 //
 // 4) when we receive an invalidation message from ZODB - we process it and
 //    propagate invalidations to OS file cache of head/bigfile/*:
@@ -301,6 +299,8 @@ package main
 //        Eager invalidation would require full scan - Trees _and_
 //        Buckets, which makes it prohibitively expensive - see (+).
 //
+// FIXME all ^^^ is outdated -> XXX δFtail
+//
 // 4.4) for all file/blk to invalidate we do:
 //
 //    - try to retrieve head/bigfile/file[blk] from OS file cache(*);
@@ -718,7 +718,7 @@ type blkPinState struct {
     err error
 }

-// -------- 3) Cache invariant --------
+// -------- ZODB cache control --------

 // zodbCacheControl implements zodb.LiveCacheControl to tune ZODB to never evict
 // LOBTree/LOBucket from live cache. We want to keep LOBTree/LOBucket always alive
@@ -726,34 +726,28 @@ type blkPinState struct {
 //
 // For the data itself - we put it to kernel pagecache and always deactivate
 // from ZODB right after that.
+//
+// See "3) for */head/data the following invariant is maintained..."
 type zodbCacheControl struct {}

 func (_ *zodbCacheControl) PCacheClassify(obj zodb.IPersistent) zodb.PCachePolicy {
     switch obj.(type) {
-    // ZBlk* should be in cache but without data
+    // don't let ZBlk*/ZData to pollute the cache
     case *ZBlk0:
-        return zodb.PCachePinObject | zodb.PCacheDropState
+        return zodb.PCacheDropObject | zodb.PCacheDropState
     case *ZBlk1:
-        return zodb.PCachePinObject | zodb.PCacheDropState
-
-    // ZBigFile btree index should be in cache with data
-    case *btree.LOBTree:
-        return zodb.PCachePinObject | zodb.PCacheKeepState
-    case *btree.LOBucket:
-        return zodb.PCachePinObject | zodb.PCacheKeepState
-
-    // don't let ZData to pollute the cache
+        return zodb.PCacheDropObject | zodb.PCacheDropState
     case *ZData:
         return zodb.PCacheDropObject | zodb.PCacheDropState

-    // for performance reason we also keep ZBigFile in cache.
+    // keep ZBigFile and its btree index in cache to speedup file data access.
     //
     // ZBigFile is top-level object that is used on every block load, and
     // it would be a waste to evict ZBigFile from cache.
     case *ZBigFile:
         return zodb.PCachePinObject | zodb.PCacheKeepState
+    case *btree.LOBTree:
+        return zodb.PCachePinObject | zodb.PCacheKeepState
+    case *btree.LOBucket:
+        return zodb.PCachePinObject | zodb.PCacheKeepState
     }

     return 0
@@ -960,6 +954,7 @@ retry:
         }
         file.size = size
+        // see "3) for */head/data the following invariant is maintained..."
         bfdir.δFtail.Track(zfile, -1, sizePath, nil)

         // XXX we can miss a change to file if δblk is not yet tracked
@@ -1503,6 +1498,7 @@ func (f *BigFile) readPinWatchers(ctx context.Context, blk int64, treepath []btr
     // update δFtail index  XXX -> move upper into readBlk ?
     // (δFtail is just for δZ -> δF invalidation handling and is needed without isolation protocol)
     // XXX ^^^ no - also need to query to send pins
+    // see "3) for */head/data the following invariant is maintained..."
     bfdir := f.head.bfdir
     δFtail := bfdir.δFtail
     bfdir.δFmu.Lock() // XXX locking correct? XXX -> better push down?
@@ -2206,6 +2202,7 @@ func (head *Head) bigopen(ctx context.Context, oid zodb.Oid) (_ *BigFile, err er
     // only head/ needs δFtail, f.δtail and watches.
     if head.rev == 0 {
+        // see "3) for */head/data the following invariant is maintained..."
         head.bfdir.δFmu.Lock() // XXX locking ok?
         head.bfdir.δFtail.Track(f.zfile, -1, sizePath, nil)
         head.bfdir.δFmu.Unlock()
@@ -2393,8 +2390,8 @@ func _main() (err error) {
     zhead, err := xzodb.ZOpen(ctx, zdb, &zodb.ConnOptions{
         At: at0,

-        // we need zhead.cache to be maintained across several transactions.
-        // see "3) for head/bigfile/* the following invariant is maintained ..."
+        // preserve zhead.cache across several transactions.
+        // see "ZODB cache control"
         NoPool: true,
     })
     if err != nil {
...