Commit 2c1cc2db authored by Vincent Pelletier

testERP5Catalog: Test indexation/unindexation parallelism.

parent bd45b9e6
@@ -29,12 +29,15 @@
from random import randint
import sys
import threading
import traceback
import unittest
import httplib
from AccessControl import getSecurityManager
from AccessControl.SecurityManagement import newSecurityManager
from DateTime import DateTime
from OFS.ObjectManager import ObjectManager
from Products.CMFActivity import ActivityTool
from Products.ERP5Type.tests.ERP5TypeTestCase import ERP5TypeTestCase
from Products.ERP5Type.tests.utils import LogInterceptor, createZODBPythonScript, todo_erp5, getExtraSqlConnectionStringList
from Products.PageTemplates.Expressions import getEngine
@@ -42,6 +45,42 @@ from Products.ZSQLCatalog.SQLCatalog import Query, ComplexQuery, SimpleQuery
from Testing import ZopeTestCase
from zLOG import LOG

def print_all_stacks():
  """
  Equivalent of traceback.print_stack which prints stack traces for all threads
  in the current process, not just the current thread.
  """
  for thread_id, frame in sys._current_frames().iteritems():
    print 'Thread', thread_id
    print ' ', ' '.join(traceback.format_stack(frame))
    del frame

class TransactionThread(threading.Thread):
  """
  Run payload(portal, **payload_kw) within a separate transaction.
  """
  def __init__(self, portal, payload, payload_kw=()):
    super(TransactionThread, self).__init__()
    self.daemon = True
    self.zodb = portal._p_jar.db()
    self.root_physical_path = portal.getPhysicalPath()
    self.payload = payload
    self.payload_kw = payload_kw

  def run(self):
    # Get a new portal, in a new transactional connection bound to the default
    # transaction manager (which should be the threaded transaction manager).
    portal = self.zodb.open().root()['Application'].unrestrictedTraverse(
      self.root_physical_path,
    )
    # Trigger ERP5Site magic
    portal.getSiteManager()
    # Trigger skin magic
    portal.changeSkin(None)
    # Login
    newSecurityManager(None, portal.acl_users.getUser('ERP5TypeTestCase'))
    self.payload(portal=portal, **dict(self.payload_kw))

class IndexableDocument(ObjectManager):

  # this property is required for dummy providesIMovement
@@ -221,6 +260,71 @@ class TestERP5Catalog(ERP5TypeTestCase, LogInterceptor):
    self.checkRelativeUrlNotInSQLPathList(path_list)
    self.tic()
    self.checkRelativeUrlNotInSQLPathList(path_list)
    # Now delete the document while its indexation is running
    # (both started and not committed yet).
    # First, create a person but do not index it.
    person = person_module.newContent(id='4', portal_type='Person')
    path_list = [person.getRelativeUrl()]
    self.commit()
    self.checkRelativeUrlNotInSQLPathList(path_list)
    rendez_vous = threading.Event()
    unblock_activity = threading.Event()
    # Prepare an isolated transaction to act as one activity node.
    def runValidablePendingActivities(portal, node_id):
      """
      Validate messages once, execute whatever is immediately executable.
      """
      activity_tool = portal.portal_activities
      activity_tool.distribute()
      # XXX: duplicates ActivityTool.tic, without locking, as we are acting as
      # multiple activity nodes in a single process.
      for activity in ActivityTool.activity_dict.itervalues():
        while not activity.dequeueMessage(activity_tool, node_id, ()):
          pass
    activity_thread = TransactionThread(
      portal=self.portal,
      payload=runValidablePendingActivities,
      payload_kw={'node_id': 2},
    )
    # Monkey-patch the catalog to synchronise between the main thread and the
    # isolated transaction.
    catalog_tool_class = self.portal.portal_catalog.__class__
    orig_catalogObjectList = catalog_tool_class.catalogObjectList
    def catalogObjectList(*args, **kw):
      # Note: rendez-vous *before* modifying tables, otherwise unindexation's
      # synchronous catalog alteration would wait for the indexation
      # transaction to finish, which cannot happen until unindexation happened:
      # a deadlock, resolved only by a timeout.
      rendez_vous.set()
      assert unblock_activity.wait(10), print_all_stacks()
  • aah yes of course!

    and I checked quickly: assertion methods do not flag a failure, they just raise in the thread, so we only get an Exception in thread Thread-1 printed and no test failure
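
    For illustration, a minimal sketch of that behaviour (hypothetical worker function, plain threading.Thread): the assert only produces the "Exception in thread Thread-1" report on stderr, and join() returns as if nothing happened.

    import threading

    def worker():
        # This AssertionError is reported by the threading module on stderr
        # ("Exception in thread Thread-1: ..."), but it never propagates to
        # the thread calling join(), so a test driving this thread still
        # passes.
        assert 1 == 2

    t = threading.Thread(target=worker)
    t.start()
    t.join()
    print 'join() returned normally, no failure recorded'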

    Apologies for chiming in. I just noticed this comment accidentally. Pygolang provides sync.WorkGroup, which allows spawning several threads and waiting for them. If an exception happens in any of the threads, the other threads are canceled and the exception is propagated to the main workgroup user, for example:

    #!/usr/bin/env python
    
    from golang import select
    from golang import sync, context, time
    
    def main():
        ctx, cancel = context.with_timeout(context.background(), 3*time.second)
        wg = sync.WorkGroup(ctx)
    
        # thread 1 - sleep 1s and simulate a failure
        def _(ctx):
            # sleep but also react to cancelation.
            # if sleep time here is > timeout ^^^ - the error from the group
            # will be "deadline exceeded"
            _, _rx = select(
                ctx.done().recv,                # 0
                time.after(1*time.second).recv, # 1
            )
            if _ == 0:
                raise ctx.err()
    
            assert 1 == 2               
        wg.go(_)
    
        # thread 2 - simulate being stuck - wait indefinitely until the work is canceled
        def _(ctx):
            print 'T2: wait...'
            ctx.done().recv()
            print 'T2: woken up'
            raise ctx.err()
        wg.go(_)
    
        # wait for the workgroup to complete
        # this will reraise an exception if it happened in a worker
        # the exception will have a proper traceback
        wg.wait()
        
    if __name__ == '__main__':
        main()
    Output:

    T2: wait...
    T2: woken up
    Traceback (most recent call last):
      File "./x.py", line 38, in <module>
        main()
      File "./x.py", line 34, in main
        wg.wait()
      File "/home/kirr/src/tools/go/pygolang/golang/sync.py", line 142, in wait
        six.reraise(g._err, None, g._err.__traceback__)
      File "/home/kirr/src/tools/go/pygolang/golang/sync.py", line 122, in _
        f(g._ctx, *argv, **kw)
      File "./x.py", line 22, in _
        assert 1 == 2               
    AssertionError

    https://pypi.org/project/pygolang/#concurrency

  • (feel free to ignore if ^^^ comment is not useful)

  • Pygolang provides sync.WorkGroup, which allows spawning several threads and waiting for them.

    Well, not sure we want to extend ERP5 dependencies for this, but this gave me the idea of a large try..except block in run and a raise in join once the thread is dead. This is simple enough to me, and losing the exception does indeed bother me.

    The downside is that the original exception (I'm using exc_info) will be raised during join without join being mentioned in the traceback... Unless there is a way to get py3-style "exception raised while handling another exception" output, I don't know of any.
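
    For illustration, a rough sketch of that idea (hypothetical PayloadThread helper; remember sys.exc_info() in run, re-raise it in join once the thread is dead):

    import sys
    import threading

    class PayloadThread(threading.Thread):
        # Hypothetical helper: any exception raised by the payload is caught
        # in run() and re-raised in the joining thread instead of being lost.
        def __init__(self, payload):
            super(PayloadThread, self).__init__()
            self.payload = payload
            self.exc_info = None

        def run(self):
            try:
                self.payload()
            except Exception:
                self.exc_info = sys.exc_info()

        def join(self, timeout=None):
            super(PayloadThread, self).join(timeout)
            if not self.is_alive() and self.exc_info is not None:
                exc_type, exc_value, exc_tb = self.exc_info
                raise exc_type, exc_value, exc_tb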

  • Vincent, thanks for feedback.

    Well, not sure we want to extend ERP5 dependencies for this, but this gave me the idea of a large try..except block in run and a raise in join once the thread is dead. This is simple enough to me, and losing the exception does indeed bother me.

    Ok. Just for reference: pygolang is a small egg with six, decorator and gevent (for gpython) dependencies. I guess all of these Python packages are already, or will be, in the ERP5 dependency set. Also, in my experience there is a "cancellation" issue - if a test spawns several testing threads and one asserts or fails some other way without notifying the other threads to cancel, the whole test gets stuck and it is hard to tell what is going on. sync.WorkGroup + contexts take care of that, and this property is used for real in wcfs tests, for which sync.WorkGroup was actually developed after I got tired of regularly hitting deadlocks and manually injecting prints into test code to find the failure place and understand what was going on. See e.g. here for real usage:

    https://lab.nexedi.com/kirr/wendelin.core/blob/c462756b/wcfs/wcfs_test.py#L1049-1116
    https://lab.nexedi.com/kirr/wendelin.core/blob/c462756b/wcfs/wcfs_test.py#L1025-1046

    The downside is that the original exception (I'm using exc_info) will be raised during join without join being mentioned in the traceback... Unless there is a way to get py3-style "exception raised while handling another exception" output, I don't know of any.

    I'm not sure I understand you correctly, but please see above where wg.wait() is included in the traceback. Is that what you are talking about? I'm using six.reraise and exc.__traceback__ even on python2:

    https://lab.nexedi.com/kirr/pygolang/blob/924a808c/golang/sync.py#L126-131 https://lab.nexedi.com/kirr/pygolang/blob/924a808c/golang/sync.py#L138-142
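
    In isolation, the pattern looks roughly like this on Python 2 (a sketch, not pygolang's actual code):

    import sys
    import six

    def capture():
        try:
            1 / 0
        except Exception as exc:
            # remember the traceback on the exception object itself, py3-style
            exc.__traceback__ = sys.exc_info()[2]
            return exc

    exc = capture()
    # later, possibly from another frame or thread: re-raise with the
    # original traceback attached
    six.reraise(type(exc), exc, exc.__traceback__)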

    For completeness about the py3-style exception chaining story: I too recently ran into a similar issue and here are some links on the topic which might be useful:

    https://pypi.org/project/pep3134/
    zodbtools!13 (merged) (discussion there)

    For myself I've solved this issue by extending defer to also print the cause exception. The print is temporary - in the future defer will probably be reworked to include the cause exception in the raised exception object:

    https://lab.nexedi.com/kirr/wendelin.core/blob/c462756b/wcfs/wcfs_test.py#L1533-1563

    Kirill

    Edited by Kirill Smelkov
  • I'm using six.reraise and exc.__traceback__ even on python2:

    Good idea, this seems simple enough.

    BTW, doesn't exc.__traceback__ = sys.exc_info()[2] cause a reference cycle? The current frame references self, which references the exception instance, which references the traceback, which references the current frame. Also, the current frame references the exception, but that's just a subcycle of the previous one.

  • BTW, doesn't exc.__traceback__ = sys.exc_info()[2] cause a reference cycle?

    Yes, but it is the same as with real exception chaining on Python 3, where it is exc -> traceback -> frame -> self -> exc too, because exc is referenced from self after except Exception as exc. The cycle would hurt if Python did not have a cyclic GC, but it does have it, right?

    (/cc @jerome for everything starting from 2c1cc2db (comment 83771) - strangely this discussion moved out of !895 (closed) and is seen only on this commit now)

  • The cycle would hurt if Python did not have cyclic GC, but it does have it, right?

    Not sure. I've seen it claimed both that it has one and that it has not. Or that it is less efficient, or run less often, or... There is also a list accessible on the gc module of stuff which cannot be garbage collected, but I do not remember if that is merely resisting collection or actually impossible until interpreter shutdown. And maybe it depends on whether C objects are involved, or maybe custom __del__ methods, etc. There is a lot of shady stuff when looking at fine memory management details in Python...

    I have the reflex of breaking cycles when I see them (and I'm sure I'm missing a lot), because this is cheap even if Python could collect these, and it avoids the issue if it could not.
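
    For what it's worth, a quick sketch (assuming CPython 2.7 with the collector enabled and no __del__ anywhere in the cycle) suggests this particular cycle is collectable by the cyclic GC, even though refcounting alone cannot free it:

    import gc
    import sys
    import weakref

    class Holder(object):
        def run(self):
            try:
                raise ValueError('boom')
            except Exception as exc:
                exc.__traceback__ = sys.exc_info()[2]
                self.exc = exc  # self -> exc -> traceback -> frame -> self

    holder = Holder()
    holder.run()
    ref = weakref.ref(holder)
    del holder
    print ref() is not None  # True: refcounting alone does not free the cycle
    gc.collect()
    print ref() is None      # True: the cyclic GC collected it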

    strangely this discussion moved out of !895 (closed) and is seen only on this commit now

    Indeed. This is caused by my rebase & push, but I think at least sometimes GitLab preserves the link. Somehow this was not the case here. Thanks for linking back.

  • It is easy to break the cycle through self by setting exc = None. However the cycle between <exception object> -> traceback -> some frame -> <exception object> will likely remain, because a frame object has references to the exception object raised in it:

    https://github.com/python/cpython/blob/v2.7.16-104-gadcdb1e4f5e/Include/frameobject.h#L30-L37

    It does indeed make a difference whether an object is part of a cycle or not. Without a cycle, on CPython it is garbage collected immediately when its reference counter goes to zero. With a cycle, the reference counter never goes to zero just by the user releasing his reference to the object, and CPython tries to break the cycle at GC time. Objects with __del__ finalizers, if I recall correctly, indeed could not be collected this way.

    Anyway, thanks for the suggestion. I will try to see what could be done about breaking cycles in sync.WorkGroup next time I'm working on it. Alternatively, if someone does it before me, maybe it could be a good idea to send a patch.

  • However the cycle between <exception object> -> traceback -> some frame -> <exception object> will likely remain, because a frame object has references to the exception object raised in it:

    Does it? Both in your code (wait) and in my code (join), the frame re-raising the exception is a frame which is not part of the original exception stack, so it should not cause a cycle on its own.

  • I think the traceback object holds a reference to the original frame where the exception object was raised.

  • I think the traceback object holds a reference to the original frame where the exception object was raised.

    There are 2 traceback objects (one created during thread execution and one created when re-raising); I think the latter is not very interesting as far as loops are concerned (no sys.exc_info magic dance), so I assume you mean the former.

    It does reference it, yes, which is why it's a good idea (...or just "does not hurt"?) to delete any local variable referencing such a traceback - which is not needed in our implementations, as we do not assign that value to a local variable: we rather assign it to an attribute of self.

    Of course, self is itself referenced from the frame the traceback references (hence the cycle I mentioned), but this extra indirection allows the cycle to be broken later - for example in the function which re-raises the exception (what I suggested above).

    Then, as you mentioned, there is the whole question of whether the gc is able to sort it out on its own or not. I consider it guilty until proven innocent, and it's faster to explicitly break the cycle than to do all the mental gymnastics needed. And given the risk that future code evolution may also not want to invest such effort, I think having a comment on such explicit cycle breaking is also safer in the long run; see the sketch below.
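
    Applied to the join-based sketch earlier in this thread, the explicit break plus its comment could look like this (hypothetical code, same PayloadThread idea):

    def join(self, timeout=None):
        super(PayloadThread, self).join(timeout)
        if not self.is_alive() and self.exc_info is not None:
            exc_type, exc_value, exc_tb = self.exc_info
            # Break the reference cycle self.exc_info -> traceback ->
            # run()'s frame -> self explicitly, so freeing the thread object
            # does not depend on the cyclic GC.
            self.exc_info = None
            raise exc_type, exc_value, exc_tb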

  • Vincent, I've tried to look briefly into how long things take to run, but surprisingly, after a quick investigation, I was not able to prove to myself with numbers that deleting a variable to break a cycle is better. I was playing both with the sync.WorkGroup benchmarks and with benchmarks like the one below

    import sys

    def raise_nocycle():
        exc = RuntimeError('aaa')
        try:
            raise exc
        finally:
            exc = None
    
    # call f and, if it raises, remember the traceback on the exception
    # object itself (the cycle-creating pattern discussed above)
    def zzz(f):
        try:
            f()
        except Exception as exc:
            exc.__traceback__ = sys.exc_info()[2]
    
    
    def bench_raise_nocycle(b):
        for i in xrange(b.N):
            zzz(raise_nocycle)

    I did not study it thoroughly, because right now is not a good time for me to get distracted from my main topic due to deadline pressure. On the other hand I do not want to make changes blindly, so on my side investigating this topic will have to wait. If you can prove with numbers that doing something is better, please show it either as a snippet or as a patch.

    Along the way I've made a couple of improvements though:

    kirr/pygolang@3c55ca59
    kirr/pygolang@94c6160b

    and learned that it is easy to generate profiles for tests/benchmarks via py.test/py.bench that look e.g. like this:

    https://lab.nexedi.com/kirr/misc/raw/009c4fee/pygolang/prof_workgroup_empty.svg

    (just use e.g. py.bench -k empty golang/sync_test.py --profile-svg with pytest-profiling installed).

    Apologies that for now I cannot give a thorough answer on the cycles topic,
    Kirill

    /cc @jerome

    Edited by Kirill Smelkov
  • Another patch I had to do for sync.WorkGroup: kirr/pygolang@79aab7df

    I did reset both exc and tb locals to None for consistency because there was no tb local before the patch. However the performance difference is within noise, e.g.:

    name             old time/op  new time/op  delta
    workgroup_empty   109µs ± 1%   112µs ± 1%  +2.90%  (p=0.000 n=10+10)
    workgroup_raise   138µs ± 2%   135µs ± 1%  -2.68%  (p=0.000 n=10+10)

    and even with fixed frequencies etc. it fluctuates between runs - e.g. here are two runs, both for the state after the patch:

    workgroup_empty   108µs ± 1%   112µs ± 1%  +3.35%  (p=0.000 n=10+10)
    workgroup_raise   134µs ± 1%   135µs ± 1%    ~     (p=0.529 n=10+10)
  • Just for reference: as of Pygolang 0.0.5, compared to 2c1cc2db (comment 84233), Python-level sync.WorkGroup timings are as below:

    name             old time/op  new time/op  delta
    workgroup_empty   109µs ± 1%   24.1µs ± 1%  -78%
    workgroup_raise   138µs ± 2%   28.2µs ± 0%  -80%

    i.e. ~5x speedup.

    See the following commits for details:

    kirr/pygolang@3b241983
    kirr/pygolang@78d85cdc
    kirr/pygolang@c5c3071b

    /cc @jerome, @vpelletier

      orig_catalogObjectList(*args, **kw)
    catalog_tool_class.catalogObjectList = catalogObjectList
    try:
      # Let pending activities (indexation) start.
      activity_thread.start()
      # Wait until indexation is indeed initiated.
      assert rendez_vous.wait(10), print_all_stacks()
      # Delete the object, which will try to modify catalog content and spawn
      # an unindexation activity.
      person_module.manage_delObjects(ids=['4'])
      self.commit()
      # Try to run this activity. It should not run, as it must wait for
      # indexation to be over.
      runValidablePendingActivities(self.portal, 1)
      # Let indexation carry on, it is still able to access the object.
      unblock_activity.set()
      activity_thread.join(10)
      assert not activity_thread.is_alive()
    finally:
      # Un-monkey-patch.
      catalog_tool_class.catalogObjectList = orig_catalogObjectList
    # The document must be indexed: unindexation must have waited for
    # indexation to finish, so runValidablePendingActivities(..., 1) must have
    # been a no-op.
    self.checkRelativeUrlInSQLPathList(path_list)
    self.tic()
    # And now it's gone.
    self.checkRelativeUrlNotInSQLPathList(path_list)

  def test_04_SearchFolderWithDeletedObjects(self):
    person_module = self.getPersonModule()