Commit 643e267f authored by Jim Fulton

Made a number of blob changes:

- Unwritten blobs can now be read, and are empty.

- Blobs are considered modified when opened for writing.  This is a
   little more conservative than before, but it fixes a bug: a file
   opened with 'w' actually is modified, yet this wasn't treated as a
   change before.

- Optimistic savepoints now work.

- Fixed bug: could open multiple files for writing.

- Fixed bug: aborting a transaction removed uncommitted data for
   uncommitted blobs.

Todo:
   Remove the uncommitted data file when a blob is garbage collected,
   even if the transaction isn't aborted and even if the blob was never
   added to anything.

- No longer close files on transaction boundaries.

This allows us to get rid of the transaction-manager dance.
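A rough sketch of the resulting behavior (an illustrative session; the
blob `b` and its data are made up for this note, not part of the
commit):

>>> from ZODB.blob import Blob
>>> b = Blob()
>>> b.open('r').read()    # unwritten blobs now read as empty
''
>>> b.open('w').write('some data')
>>> b.open('r').read()    # the uncommitted working copy is readable
'some data'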
parent 3ccbf36d
@@ -16,12 +16,14 @@
import base64
import logging
import os
import shutil
import sys
import tempfile
import threading
import time
import weakref

import zope.interface
@@ -42,78 +44,123 @@ BLOB_SUFFIX = ".blob"
valid_modes = 'r', 'w', 'r+', 'a'
# Threading issues:
# We want to support closing blob files when they are destroyed.
# This introduces a threading issue, since a blob file may be destroyed
# via GC in any thread.
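# To do that, open file handles are tracked in the readers and writers
# lists below as weakref.ref objects; each weakref callback just drops
# the dead reference from its list, so a handle collected in another
# thread only mutates a list and never touches the blob's persistent
# state.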
class Blob(persistent.Persistent):
"""A BLOB supports efficient handling of large data within ZODB."""
zope.interface.implements(ZODB.interfaces.IBlob)
_os_link = os.rename
_p_blob_readers = 0
_p_blob_writers = 0
_p_blob_uncommitted = None # Filename of the uncommitted (dirty) data
_p_blob_data = None # Filename of the committed data
# All persistent objects store a reference to their data manager, a
# database connection, in the _p_jar attribute. So we are going to do
# the same with blobs here.
_p_blob_manager = None
# Blobs need to participate in transactions even when not connected to
# a database yet. If you want to use a non-default transaction manager,
# you can override it via _p_blob_transaction. This is currently
# required for unit testing.
_p_blob_transaction = None
_p_blob_committed = None # Filename of the committed data
readers = writers = None
def __setstate__(self, state=None):
# We use lists here because they allow us to add and remove entries
# atomically.
self.readers = []
self.writers = []
__init__ = __setstate__
def __getstate__(self):
return None
def _p_deactivate(self):
# Only ghostify if we are unopened.
if self.readers or self.writers:
return
super(Blob, self)._p_deactivate()
def _p_invalidate(self):
# Force-close any open readers or writers,
# XXX should we warn of this? Maybe?
if self._p_changed is None:
return
for ref in self.readers+self.writers:
f = ref()
if f is not None:
f.close()
if (self._p_blob_uncommitted
and os.path.exists(self._p_blob_uncommitted)
):
os.remove(self._p_blob_uncommitted)
super(Blob, self)._p_invalidate()
@property
def opened(self):
return bool(self.readers or self.writers)
def closed(self, f):
# We use try/except below because, if the file is GCed, another
# thread might remove the ref after we check it.
for file_refs in (self.readers, self.writers):
for ref in file_refs:
if ref() is f:
try:
file_refs.remove(ref)
except ValueError:
pass
return
def open(self, mode="r"):
    """Returns a file(-like) object representing blob data."""
    if mode not in valid_modes:
        raise ValueError("invalid mode", mode)

    if self.writers:
        raise BlobError("Already opened for writing.")

    if mode == 'r':
        if self._current_filename() is None:
            self._create_uncommitted_file()

        result = BlobFile(self._current_filename(), mode, self)

        def destroyed(ref, readers=self.readers):
            try:
                readers.remove(ref)
            except ValueError:
                pass

        self.readers.append(weakref.ref(result, destroyed))
    else:
        if self.readers:
            raise BlobError("Already opened for reading.")

        if mode == 'w':
            if self._p_blob_uncommitted is None:
                self._create_uncommitted_file()
            result = BlobFile(self._p_blob_uncommitted, mode, self)
        else:
            if self._p_blob_uncommitted is None:
                # Create a new working copy
                self._create_uncommitted_file()
                result = BlobFile(self._p_blob_uncommitted, mode, self)
                utils.cp(file(self._p_blob_committed), result)
                if mode == 'r+':
                    result.seek(0)
            else:
                # Re-use existing working copy
                result = BlobFile(self._p_blob_uncommitted, mode, self)

        def destroyed(ref, writers=self.writers):
            try:
                writers.remove(ref)
            except ValueError:
                pass

        self.writers.append(weakref.ref(result, destroyed))

        self._p_changed = True

    return result
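    # A short note on the mode handling above: 'r' serves the current
    # data, creating an empty uncommitted file for a brand-new blob;
    # 'w' opens (and truncates) the uncommitted working copy, creating
    # it if necessary; 'a' and 'r+' copy the committed data into a new
    # working copy first (when none exists yet), so existing content is
    # preserved.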
def openDetached(self, class_=file):
@@ -123,7 +170,7 @@ class Blob(persistent.Persistent):
"""
if self._current_filename() is None:
raise BlobError("Blob does not exist.")
if self.writers:
raise BlobError("Already opened for writing.")
# XXX this should increase the reader number and have a test !?!
return class_(self._current_filename(), "rb")
@@ -132,9 +179,9 @@ class Blob(persistent.Persistent):
"""Will replace the current data of the blob with the file given under
filename.
"""
if self.writers:
raise BlobError("Already opened for writing.")
if self.readers:
raise BlobError("Already opened for reading.")
previous_uncommitted = bool(self._p_blob_uncommitted)
@@ -151,7 +198,7 @@ class Blob(persistent.Persistent):
os.unlink(target)
try:
os.rename(filename, target)
except:
# Recover from the failed consumption: First remove the file, it
# might exist and mark the pointer to the uncommitted file.
@@ -175,14 +222,14 @@ class Blob(persistent.Persistent):
# We changed the blob state and have to make sure we join the
# transaction.
self._p_changed = True
# utility methods
def _current_filename(self):
# NOTE: _p_blob_committed and _p_blob_uncommitted appear by virtue of
# Connection._setstate
return self._p_blob_uncommitted or self._p_blob_committed
def _create_uncommitted_file(self):
assert self._p_blob_uncommitted is None, (
@@ -191,148 +238,6 @@ class Blob(persistent.Persistent):
self._p_blob_uncommitted = utils.mktemp(dir=tempdir)
return self._p_blob_uncommitted
def _change(self):
self._p_changed = 1
def _setup_transaction_manager(self, result):
# We join the transaction with our own data manager in order to be
# notified of commit/vote/abort events. We do this because at
# transaction boundaries, we need to fix up _p_ reference counts
# that keep track of open readers and writers and close any
# writable filehandles we've opened.
if self._p_blob_manager is None:
# Blobs need to always participate in transactions.
if self._p_jar is not None:
# If we are connected to a database, then we use the
# transaction manager that belongs to this connection
tm = self._p_jar.transaction_manager
else:
# If we are not connected to a database, we check whether
# we have been given an explicit transaction manager
if self._p_blob_transaction:
tm = self._p_blob_transaction
else:
# Otherwise we use the default
# transaction manager as an educated guess.
tm = transaction.manager
# Create our data manager and join the current transaction.
dm = BlobDataManager(self, result, tm)
tm.get().join(dm)
elif result:
# Each blob data manager should manage only the one blob
# assigned to it. Assert that this is the case and it is the
# correct blob
assert self._p_blob_manager.blob is self
self._p_blob_manager.register_fh(result)
# utility methods which should not cause the object's state to be
# loaded if they are called while the object is a ghost. Thus,
# they are named with the _p_ convention and only operate against
# other _p_ instance attributes. We conventionally name these methods
# and attributes with a _p_blob prefix.
def _p_blob_clear(self):
self._p_blob_readers = 0
self._p_blob_writers = 0
def _p_blob_decref(self, mode):
if mode == 'r':
self._p_blob_readers = max(0, self._p_blob_readers - 1)
else:
assert mode in valid_modes, "Invalid mode %r" % mode
self._p_blob_writers = max(0, self._p_blob_writers - 1)
def _p_blob_refcounts(self):
# used by unit tests
return self._p_blob_readers, self._p_blob_writers
class BlobDataManager:
"""Special data manager to handle transaction boundaries for blobs.
Blobs need some special care-taking on transaction boundaries. As
a) the ghost objects might get reused, the _p_reader and _p_writer
refcount attributes must be set to a consistent state
b) the file objects might get passed out of the thread/transaction
and must deny any relationship to the original blob.
c) writable blob filehandles must be closed at the end of a txn so
as to not allow reuse between two transactions.
"""
zope.interface.implements(transaction.interfaces.IDataManager)
def __init__(self, blob, filehandle, tm):
self.blob = blob
self.transaction = tm.get()
# we keep a weakref to the file handle because we don't want to
# keep it alive if all other references to it die (e.g. in the
# case it's opened without assigning it to a name).
self.fhrefs = utils.WeakSet()
self.register_fh(filehandle)
self.sortkey = time.time()
self.prepared = False
# Blob specific methods
def register_fh(self, filehandle):
self.fhrefs.add(filehandle)
def _remove_uncommitted_data(self):
self.blob._p_blob_clear()
self.fhrefs.map(lambda fhref: fhref.close())
if (self.blob._p_blob_uncommitted is not None and
os.path.exists(self.blob._p_blob_uncommitted)):
os.unlink(self.blob._p_blob_uncommitted)
self.blob._p_blob_uncommitted = None
# IDataManager
def tpc_begin(self, transaction):
if self.prepared:
raise TypeError('Already prepared')
self._checkTransaction(transaction)
self.prepared = True
self.transaction = transaction
self.fhrefs.map(lambda fhref: fhref.close())
def commit(self, transaction):
if not self.prepared:
raise TypeError('Not prepared to commit')
self._checkTransaction(transaction)
self.transaction = None
self.prepared = False
self.blob._p_blob_clear()
def abort(self, transaction):
self.tpc_abort(transaction)
def tpc_abort(self, transaction):
self._checkTransaction(transaction)
if self.transaction is not None:
self.transaction = None
self.prepared = False
self._remove_uncommitted_data()
def tpc_finish(self, transaction):
pass
def tpc_vote(self, transaction):
pass
def sortKey(self):
return self.sortkey
def _checkTransaction(self, transaction):
if (self.transaction is not None and
self.transaction is not transaction):
raise TypeError("Transaction missmatch",
transaction, self.transaction)
class BlobFile(file):
"""A BlobFile that holds a file handle to actual blob data.
@@ -349,35 +254,10 @@ class BlobFile(file):
def __init__(self, name, mode, blob):
super(BlobFile, self).__init__(name, mode+'b')
self.blob = blob
def close(self):
    self.blob.closed(self)
    file.close(self)
_pid = str(os.getpid())
@@ -26,12 +26,10 @@ A blob implements the IBlob interface:
>>> IBlob.providedBy(myblob)
True
We can open a new blob file for reading, but it won't have any data:

>>> myblob.open("r").read()
''
But we can write data to a new Blob by opening it for writing:
@@ -35,21 +35,53 @@ Putting a Blob into a Connection works like any other Persistent object::
>>> blob1 = Blob()
>>> blob1.open('w').write('this is blob 1')
>>> root1['blob1'] = blob1
>>> 'blob1' in root1
True

Aborting a blob add leaves the blob unchanged:

>>> transaction.abort()
>>> 'blob1' in root1
False

>>> blob1._p_oid
>>> blob1._p_jar
>>> blob1.open().read()
'this is blob 1'

It doesn't clear the file because there is no previously committed version:

>>> fname = blob1._p_blob_uncommitted
>>> import os
>>> os.path.exists(fname)
True

Let's put the blob back into the root and commit the change:

>>> root1['blob1'] = blob1
>>> transaction.commit()

Now, if we make a change and abort it, we'll return to the committed
state:

>>> os.path.exists(fname)
False
>>> blob1._p_blob_uncommitted

>>> blob1.open('w').write('this is new blob 1')
>>> blob1.open().read()
'this is new blob 1'
>>> fname = blob1._p_blob_uncommitted
>>> os.path.exists(fname)
True

>>> transaction.abort()
>>> os.path.exists(fname)
False

>>> blob1._p_blob_uncommitted
>>> blob1.open().read()
'this is blob 1'
Opening a blob gives us a filehandle. Getting data out of the
resulting filehandle is accomplished via the filehandle's read method::
@@ -57,66 +89,44 @@ resulting filehandle is accomplished via the filehandle's read method::
>>> connection2 = database.open()
>>> root2 = connection2.root()
>>> blob1a = root2['blob1']
>>> blob1afh1 = blob1a.open("r")
>>> blob1afh1.read()
'this is blob 1'

Let's make another read-only filehandle for blob1a. Each file
handle has a reference to the (same) underlying blob::

>>> blob1afh2 = blob1a.open("r")
>>> blob1afh2.blob is blob1afh1.blob
True

Let's close the first filehandle we got from the blob::

>>> blob1afh1.close()

Let's abort this transaction, and ensure that the filehandles that we
opened are still open::

>>> transaction.abort()
>>> blob1afh2.read()
'this is blob 1'

>>> blob1afh2.close()

If we open a blob for append, writing any number of bytes to the
blobfile should result in the blob being marked "dirty" in the
connection (we just aborted above, so the object should be "clean"
when we start)::

>>> bool(blob1a._p_changed)
False
>>> blob1a.open('r').read()
'this is blob 1'
>>> blob1afh3 = blob1a.open('a')
>>> bool(blob1a._p_changed)
True
>>> blob1afh3.write('woot!')
We can open more than one blob object during the course of a single
transaction::
@@ -125,10 +135,6 @@ transaction::
>>> blob2.open('w').write('this is blob 3')
>>> root2['blob2'] = blob2
>>> transaction.commit()
>>> blob2._p_blob_refcounts()
(0, 0)
>>> blob1._p_blob_refcounts()
(0, 0)
Since we committed the current transaction above, the aggregate
changes we've made to blob, blob1a (these refer to the same object) and
@@ -200,7 +206,7 @@ int on 64-bit)::
Savepoints and Blobs
--------------------
We do support optimistic savepoints:
>>> connection5 = database.open()
>>> root5 = connection5.root()
@@ -222,17 +228,16 @@ We do support optimistic savepoints ::
"I'm a happy blob. And I'm singing."
>>> transaction.get().commit()
We support optimistic savepoints too:

>>> root5['blob'].open("a").write(" And I'm dancing.")
>>> root5['blob'].open("r").read()
"I'm a happy blob. And I'm singing. And I'm dancing."
>>> savepoint = transaction.savepoint()

>>> root5['blob'].open("w").write(" And the weather is beautiful.")
>>> savepoint.rollback()
>>> root5['blob'].open("r").read()
"I'm a happy blob. And I'm singing. And I'm dancing."
>>> transaction.abort()
Reading Blobs outside of a transaction