Commit 38e98a12 by Julien Muchembled

qa: new tool to stress-test NEO

Example output:

    stress: yes (toggle with F1)
    cluster state: RUNNING
    last oid: 0x44c0
    last tid: 0x3cdee272ef19355 (2019-02-26 15:35:11.002419)
    clients: 2308, 2311, 2302, 2173, 2226, 2215, 2306, 2255, 2314, 2356 (+48)
            8m53.988s (42.633861/s)
    pt id: 4107
        RRRDDRRR
     0: OU......
     1: ..UO....
     2: ....OU..
     3: ......UU
     4: OU......
     5: ..UO....
     6: ....OU..
     7: ......UU
     8: OU......
     9: ..UO....
    10: ....OU..
    11: ......UU
    12: OU......
    13: ..UO....
    14: ....OU..
    15: ......UU
    16: OU......
    17: ..UO....
    18: ....OU..
    19: ......UU
    20: OU......
    21: ..UO....
    22: ....OU..
    23: ......UU
1 parent ce25e429
......@@ -6,5 +6,6 @@
/build/
/dist/
/htmlcov/
/neo/tests/ConflictFree.py
/neo/tests/mock.py
/neoppod.egg-info/
......@@ -45,50 +45,12 @@ if IF == 'pdb':
#('ZPublisher.Publish', 'publish_module_standard'),
)
import errno, socket, threading, weakref
# Unfortunately, IPython does not always print to given stdout.
#from neo.lib.debug import getPdb
import socket, threading, weakref
from neo.lib.debug import PdbSocket
# We don't use the one from neo.lib.debug because unfortunately,
# IPython does not always print to given stdout.
from pdb import Pdb as getPdb
class Socket(object):
def __init__(self, socket):
# In case that the default timeout is not None.
socket.settimeout(None)
self._socket = socket
self._buf = ''
def write(self, data):
self._socket.send(data)
def readline(self):
recv = self._socket.recv
data = self._buf
while True:
i = 1 + data.find('\n')
if i:
self._buf = data[i:]
return data[:i]
d = recv(4096)
data += d
if not d:
self._buf = ''
return data
def flush(self):
pass
def closed(self):
self._socket.setblocking(0)
try:
self._socket.recv(0)
return True
except socket.error, (err, _):
if err != errno.EAGAIN:
raise
self._socket.setblocking(1)
return False
def pdb():
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
try:
......@@ -98,7 +60,7 @@ if IF == 'pdb':
s.listen(0)
print 'Listening to %u' % s.getsockname()[1]
sys.stdout.flush() # BBB: On Python 3, print() takes a 'flush' arg.
_socket = Socket(s.accept()[0])
_socket = PdbSocket(s.accept()[0])
finally:
s.close()
try:
......@@ -155,9 +117,12 @@ if IF == 'pdb':
if BP:
setupBreakPoints(BP)
else:
threading.Thread(target=pdb).start()
threading.Thread(target=pdb, name='pdb').start()
elif IF == 'frames':
# WARNING: Because of https://bugs.python.org/issue17094, the output is
# usually incorrect for subprocesses started by the functional
# test framework.
import traceback
write = sys.stderr.write
for thread_id, frame in sys._current_frames().iteritems():
......
......@@ -34,6 +34,7 @@ class SocketConnector(object):
is_closed = is_server = None
connect_limit = {}
CONNECT_LIMIT = 1
KEEPALIVE = 60, 3, 10
SOMAXCONN = 5 # for threaded tests
def __new__(cls, addr, s=None):
......@@ -66,9 +67,10 @@ class SocketConnector(object):
# The following 3 lines are specific to Linux. It seems that OSX
# has similar options (TCP_KEEPALIVE/TCP_KEEPINTVL/TCP_KEEPCNT),
# and Windows has SIO_KEEPALIVE_VALS (fixed count of 10).
s.setsockopt(socket.IPPROTO_TCP, socket.TCP_KEEPIDLE, 60)
s.setsockopt(socket.IPPROTO_TCP, socket.TCP_KEEPCNT, 3)
s.setsockopt(socket.IPPROTO_TCP, socket.TCP_KEEPINTVL, 10)
idle, cnt, intvl = self.KEEPALIVE
s.setsockopt(socket.IPPROTO_TCP, socket.TCP_KEEPIDLE, idle)
s.setsockopt(socket.IPPROTO_TCP, socket.TCP_KEEPCNT, cnt)
s.setsockopt(socket.IPPROTO_TCP, socket.TCP_KEEPINTVL, intvl)
s.setsockopt(socket.SOL_SOCKET, socket.SO_KEEPALIVE, 1)
# disable Nagle algorithm to reduce latency
s.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1)
......
......@@ -14,11 +14,7 @@
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import traceback
import signal
import imp
import os
import sys
import errno, imp, os, signal, socket, sys, traceback
from functools import wraps
import neo
......@@ -82,9 +78,55 @@ def winpdb(depth=0):
os.abort()
def register(on_log=None):
if on_log is not None:
@safe_handler
def on_log_signal(signum, signal):
on_log()
signal.signal(signal.SIGRTMIN+2, on_log_signal)
signal.signal(signal.SIGRTMIN+3, debugHandler)
try:
if on_log is not None:
@safe_handler
def on_log_signal(signum, signal):
on_log()
signal.signal(signal.SIGRTMIN+2, on_log_signal)
signal.signal(signal.SIGRTMIN+3, debugHandler)
except ValueError: # signal only works in main thread
pass
class PdbSocket(object):
    """Minimal file-like wrapper (write/readline/flush/close) around a socket.

    Data read from the socket but not yet returned by readline() is kept in
    an internal buffer between calls.
    """

    def __init__(self, socket):
        # Force blocking mode, in case the default timeout is not None.
        socket.settimeout(None)
        self._socket = socket
        self._buf = ''

    def close(self):
        """Close the underlying socket."""
        self._socket.close()

    def write(self, data):
        """Send all of `data` to the peer."""
        # sendall() instead of send(): send() may transmit only part of the
        # data, which would silently truncate the output.
        self._socket.sendall(data)

    def readline(self):
        """Return the next line, including its trailing newline.

        If the peer closes the connection before a newline is received,
        whatever was buffered is returned (possibly an empty string).
        """
        recv = self._socket.recv
        data = self._buf
        while True:
            # i is 0 when there is no newline, else the index just past it.
            i = 1 + data.find('\n')
            if i:
                self._buf = data[i:]
                return data[:i]
            d = recv(4096)
            data += d
            if not d:
                # End of stream: hand back what is left.
                self._buf = ''
                return data

    def flush(self):
        """No-op: write() does not buffer."""
        pass

    def closed(self):
        """Probe the socket with a non-blocking zero-length recv.

        Returns True if recv() completes, and False if it fails with EAGAIN
        (in which case blocking mode is restored). Any other socket error
        is propagated.
        """
        self._socket.setblocking(0)
        try:
            self._socket.recv(0)
            return True
        except socket.error as e:
            if e.errno != errno.EAGAIN:
                raise
            self._socket.setblocking(1)
        return False
......@@ -15,9 +15,8 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import thread, threading, weakref
from . import logging
from . import debug, logging
from .app import BaseApplication
from .debug import register as registerLiveDebugger
from .dispatcher import Dispatcher
from .locking import SimpleQueue
......@@ -28,7 +27,10 @@ class app_set(weakref.WeakSet):
app.log()
app_set = app_set()
registerLiveDebugger(app_set.on_log)
def registerLiveDebugger():
    """Register the debugging signal handlers for threaded applications.

    Delegates to debug.register(), passing app_set.on_log as the callback
    to run when the logging signal is received.
    """
    debug.register(app_set.on_log)
registerLiveDebugger()
class ThreadContainer(threading.local):
......
......@@ -86,8 +86,6 @@ SSL = SSL + "ca.crt", SSL + "node.crt", SSL + "node.key"
logging.default_root_handler.handle = lambda record: None
debug.register()
# prevent "signal only works in main thread" errors in subprocesses
debug.register = lambda on_log=None: None
def mockDefaultValue(name, function):
def method(self, *args, **kw):
......
......@@ -118,7 +118,7 @@ class PortAllocator(object):
class Process(object):
_coverage_fd = None
_coverage_prefix = os.path.join(getTempDirectory(), 'coverage-')
_coverage_prefix = None
_coverage_index = 0
on_fork = [logging.resetNids]
pid = 0
......@@ -147,6 +147,9 @@ class Process(object):
if coverage:
cls = self.__class__
cls._coverage_index += 1
if not cls._coverage_prefix:
cls._coverage_prefix = os.path.join(
getTempDirectory(), 'coverage-')
coverage_data_path = cls._coverage_prefix + str(cls._coverage_index)
self._coverage_fd, w = os.pipe()
def save_coverage(*args):
......@@ -294,6 +297,10 @@ class NEOProcess(Process):
"""
self.uuid = uuid
@property
def logfile(self):
    """Return the 'logfile' entry of this process's argument dictionary.

    Raises KeyError if no 'logfile' argument was given.
    """
    return self.arg_dict['logfile']
class NEOCluster(object):
SSL = None
......@@ -485,14 +492,15 @@ class NEOCluster(object):
except (AlreadyStopped, NodeProcessError):
pass
def getZODBStorage(self, **kw):
master_nodes = self.master_nodes.replace('/', ' ')
def getClientConfig(self, **kw):
kw['name'] = self.cluster_name
kw['master_nodes'] = self.master_nodes.replace('/', ' ')
if self.SSL:
kw['ca'], kw['cert'], kw['key'] = self.SSL
result = Storage(
master_nodes=master_nodes,
name=self.cluster_name,
**kw)
return kw
def getZODBStorage(self, **kw):
result = Storage(**self.getClientConfig(**kw))
result.app.max_reconnection_to_master = 10
self.zodb_storage_list.append(result)
return result
......
# tools/stress is split in such a way that this file can be reused to
# implement another tool to stress an existing cluster, which would be filled
# by a real application.
import curses, os, random, re, select, threading, time
from collections import deque
from neo.lib import logging, protocol
from neo.lib.app import BaseApplication
from neo.lib.debug import register as registerLiveDebugger
from neo.lib.exception import PrimaryFailure
from neo.lib.protocol import ClusterStates, NodeStates, NodeTypes, Packets
from neo.admin.app import Application as AdminApplication
from neo.admin.handler import MasterEventHandler
class Handler(MasterEventHandler):
    """Master event handler that relays cluster updates to the stress app.

    Most callbacks let the parent handler process the packet first and then
    ask the application to redraw the part of the screen that changed.
    """

    def answerClusterState(self, conn, state):
        """Record the new cluster state, then redraw the 'state' line."""
        super(Handler, self).answerClusterState(conn, state)
        self.app.refresh('state')

    def answerPartitionTable(self, *args):
        """Load the partition table, then redraw it."""
        super(Handler, self).answerPartitionTable(*args)
        self.app.refresh('pt')

    def sendPartitionTable(self, *args):
        """This packet is not expected here: always fail."""
        raise AssertionError

    def notifyPartitionChanges(self, *args):
        """Apply partition table changes, then redraw the table."""
        super(Handler, self).notifyPartitionChanges(*args)
        self.app.refresh('pt')

    def answerLastIDs(self, conn, *args):
        """Forward the last ids to the application (connection dropped)."""
        self.app.answerLastIDs(*args)

    def notifyNodeInformation(self, conn, timestamp, node_list):
        """Process node updates and report storages that started running.

        If any client node is notified as UNKNOWN, the application's
        clientDown() is called (once). The application is then given the
        set of storage nodes that are running after the update but were not
        running before it.
        """
        client_lost = any(
            node_type == NodeTypes.CLIENT and state == NodeStates.UNKNOWN
            for node_type, _addr, _uuid, state, _id_timestamp in node_list)
        if client_lost:
            self.app.clientDown()
        list_storages = self.app.nm.getStorageList
        # Snapshot taken before the parent handler updates node states.
        previously_running = [
            storage for storage in list_storages() if storage.isRunning()]
        super(Handler, self).notifyNodeInformation(conn, timestamp, node_list)
        now_running = {
            storage for storage in list_storages() if storage.isRunning()}
        self.app.notifyNodeInformation(
            now_running.difference(previously_running))
class StressApplication(AdminApplication):
    """Base class of curses tools that monitor and stress a cluster.

    It connects to the primary master, displays the cluster state, the last
    ids and the partition table and, while stress mode is on (toggled with
    F1), repeatedly picks storage nodes to disturb, via tcpReset() or
    restartStorages().

    Subclasses must implement startCluster(), stopCluster(), tcpReset(),
    restartStorages() and refresh_ids(). They must also provide the
    `_ids_height` and `_fault_count` attributes, which are read here but
    never initialised by this class.
    """

    cluster_state = server = uuid = None
    listening_conn = True
    # Values <= 1 enable tcpReset() faults; with the default, selected
    # running nodes are always restarted instead.
    restart_ratio = float('inf') # no firewall support
    _stress = False

    def __init__(self, ssl, master_nodes):
        """Set up the application for the given master node addresses.

        Note that BaseApplication.__init__ is called directly, not the
        constructor of AdminApplication.
        """
        BaseApplication.__init__(self, ssl)
        for address in master_nodes:
            self.nm.createMaster(address=address)
        self.pt = None
        self.master_event_handler = Handler(self)
        self.reset()
        registerLiveDebugger(on_log=self.log)
        # UUIDs of the storage nodes currently chosen to fail.
        self.failing = set()
        # Serializes the restartStorages() calls made from helper threads.
        self.restart_lock = threading.Lock()

    def close(self):
        """Close the application using BaseApplication.close only."""
        BaseApplication.close(self)

    def run(self):
        """Initialise curses, run the main loop and restore the terminal."""
        visibility = None
        from logging import disable, ERROR
        # NOTE(review): presumably silences log output that would otherwise
        # corrupt the curses screen -- confirm.
        disable(ERROR)
        self.stdscr = curses.initscr()
        try:
            curses.noecho()
            curses.cbreak()
            self.stdscr.keypad(1)
            visibility = curses.curs_set(0)
            self._run()
        finally:
            # Only restore the cursor if it was previously visible.
            if visibility:
                curses.curs_set(visibility)
            self.stdscr.keypad(0)
            curses.echo()
            curses.nocbreak()
            curses.endwin()

    def _run(self):
        """Main loop: poll the event manager and handle keyboard input.

        A helper thread waits for input on stdin and wakes up the event
        manager when keys arrive. It exits as soon as something is written
        to the internal pipe, which is done when this method terminates.
        """
        stdscr = self.stdscr
        r, w = os.pipe()
        l = threading.Lock()    # protects input_queue and getch()
        stdscr.nodelay(1)
        input_queue = deque()
        def input_read():
            # Drain all pending key codes into a single queue entry and
            # return the queue (truthy if there is anything to process).
            x = []
            while 1:
                c = stdscr.getch()
                if c < 0:
                    if x:
                        input_queue.append(x)
                    return input_queue
                x.append(c)
        def input_thread():
            try:
                poll = select.poll()
                poll.register(0, select.POLLIN)
                poll.register(r, select.POLLIN)
                while 1:
                    for fd, _ in poll.poll():
                        if fd:
                            # Event on the pipe: we are asked to stop.
                            return
                        with l:
                            empty = not input_queue
                            # Only wake up the main loop when the queue
                            # goes from empty to non-empty.
                            if input_read() and empty:
                                self.em.wakeup()
            finally:
                os.close(r)
        t = threading.Thread(target=input_thread)
        # Fixed typo ('deamon'), which only set an unused attribute and
        # left the thread non-daemon.
        t.daemon = True
        wait = None
        try:
            t.start()
            self.startCluster()
            self.refresh('stress', False)
            while 1:
                self.failing.clear()
                try:
                    self.connectToPrimary()
                    self.askLastIDs()
                    while 1:
                        self.em.poll(1)
                        with l:
                            if input_read():
                                for x in input_queue:
                                    # Ignore bursts of several key codes.
                                    try:
                                        x, = x
                                    except ValueError:
                                        continue
                                    if x == curses.KEY_RESIZE:
                                        self.refresh()
                                    elif x == curses.KEY_F1:
                                        self.stress()
                                    else:
                                        try:
                                            x = chr(x)
                                        except ValueError:
                                            continue
                                        if x == 'q':
                                            return
                                input_queue.clear()
                except PrimaryFailure:
                    logging.error('primary master is down')
                    if self.cluster_state == ClusterStates.STOPPING:
                        break
                    self.primaryFailure()
                finally:
                    # Leave stress mode whenever the connected loop ends.
                    if self._stress:
                        self.stress()
            # Only reached when the cluster was stopping (not on 'q').
            wait = time.time()
        finally:
            # Tell the input thread to exit, then wait for it.
            os.write(w, '\0')
            os.close(w)
            t.join()
            self.stopCluster(wait)

    def primaryFailure(self):
        """Hook called from the PrimaryFailure handler of _run().

        By default, the exception being handled is re-raised.
        """
        raise

    def startCluster(self):
        """Start the cluster to stress. Must be implemented by subclasses."""
        raise NotImplementedError

    def stopCluster(self, wait):
        """Stop the cluster. Must be implemented by subclasses.

        `wait` is the time at which the main loop ended because the cluster
        was stopping, or None if it ended for another reason.
        """
        raise NotImplementedError

    def clientDown(self):
        """Ask the master to flush logs and to stop the cluster."""
        send = self.master_conn.send
        send(Packets.FlushLog())
        send(Packets.SetClusterState(ClusterStates.STOPPING))

    def notifyNodeInformation(self, node_list):
        """Forget that the given (now running) nodes were failing."""
        for node in node_list:
            self.failing.discard(node.getUUID())

    def askLastIDs(self):
        """Ask the master for the last ids, if connected."""
        conn = self.master_conn
        if conn:
            conn.ask(Packets.AskLastIDs())

    def answerLastIDs(self, loid, ltid):
        """Store the last ids, schedule the next request and inject faults.

        In stress mode, storage nodes are visited in random order and added
        to the failing set as long as the partition table remains
        operational without them, until `_fault_count` nodes are failing.
        """
        self.loid = loid
        self.ltid = ltid
        self.em.setTimeout(int(time.time() + 1), self.askLastIDs)
        if self._stress:
            node_list = self.nm.getStorageList()
            random.shuffle(node_list)
            fw = []     # nodes to disturb with tcpReset()
            kill = []   # nodes to restart
            restart_ratio = self.restart_ratio
            for node in node_list:
                nid = node.getUUID()
                if nid in self.failing:
                    if restart_ratio <= 1:
                        fw.append(nid)
                    continue
                running = node.isRunning()
                if running or restart_ratio <= 1:
                    # Tentatively mark the node as failing and check that
                    # the cluster would remain operational.
                    self.failing.add(nid)
                    if self.pt.operational(self.failing):
                        (kill if running and random.random() < restart_ratio
                         else fw).append(nid)
                        if len(self.failing) == self._fault_count:
                            break
                    else:
                        self.failing.remove(nid)
            if fw or kill:
                for nid in fw:
                    self.tcpReset(nid)
                if kill:
                    # Restart in a separate thread, off the main loop.
                    t = threading.Thread(target=self._restart, args=kill)
                    t.daemon = 1
                    t.start()
                self.refresh('pt', False)
        self.refresh('ids')

    def _restart(self, *nids):
        """Restart the given storages, one batch at a time."""
        with self.restart_lock:
            self.restartStorages(nids)

    def tcpReset(self, nid):
        """Disturb the connections of a node. For subclasses to implement."""
        raise NotImplementedError

    def restartStorages(self, nids):
        """Restart the given storage nodes. For subclasses to implement."""
        raise NotImplementedError

    def refresh(self, what=None, do=True):
        """Redraw part of the screen, or all of it.

        `what` is None (everything), 'stress', 'state', 'ids' or 'pt'.
        The screen is actually updated only if `do` is true. Curses errors
        raised while drawing are ignored.
        """
        stdscr = self.stdscr
        try:
            y = 0
            if what in (None, 'stress'):
                stdscr.addstr(y, 0, 'stress: %s (toggle with F1)\n'
                    % ('yes' if self._stress else 'no'))
            y += 1
            if what in (None, 'state'):
                stdscr.addstr(y, 0, 'cluster state: %s\n' % self.cluster_state)
            y += 1
            if what in (None, 'ids'):
                self.refresh_ids(y)
                h = stdscr.getyx()[0] - y
                clear = self._ids_height - h
                if clear:
                    # The ids section changed height: everything below it
                    # moved, so also redraw the partition table.
                    self._ids_height = h
                    what = None
            else:
                clear = None
            y += self._ids_height
            if what in (None, 'pt'):
                pt = self.pt
                n = len(str(pt.np-1))
                node_list = sorted(pt.count_dict)
                attr = curses.A_NORMAL, curses.A_BOLD
                stdscr.addstr(y, 0, 'pt id: %s\n %s' % (pt.getID(), ' ' * n))
                for node in node_list:
                    # Failing nodes are shown in bold.
                    stdscr.addstr(
                        protocol.node_state_prefix_dict[node.getState()],
                        attr[node.getUUID() in self.failing])
                stdscr.addstr('\n')
                x = '%{}s'.format(n)
                n = pt.nr + 1
                split = re.compile('[^OC]+|[OC]+').findall
                for i, r in enumerate(pt._formatRows(node_list)):
                    # Bold partition number unless it has nr + 1 'U' cells.
                    stdscr.addstr(x % i, attr[r.count('U') != n])
                    # The row starts with ': ', so odd chunks are the runs
                    # of 'O'/'C' cells, which are shown in bold.
                    for i, r in enumerate(split(': %s\n' % r)):
                        stdscr.addstr(r, attr[i & 1])
                if clear:
                    stdscr.addstr('\n' * clear)
        except curses.error:
            pass
        if do:
            stdscr.refresh()

    # _ids_height

    def refresh_ids(self, y):
        """Draw the ids section from line `y`. For subclasses to implement."""
        raise NotImplementedError

    def stress(self):
        """Toggle stress mode and redraw the corresponding line."""
        self._stress = not self._stress
        self.refresh('stress')
......@@ -14,20 +14,35 @@ Topic :: Database
Topic :: Software Development :: Libraries :: Python Modules
"""
mock = 'neo/tests/mock.py'
if not os.path.exists(mock):
import cStringIO, hashlib, subprocess, urllib, zipfile
x = 'pythonmock-0.1.0.zip'
def get3rdParty(name, tag, url, h, extract=lambda content, name: content):
path = 'neo/tests/' + name
if os.path.exists(path):
return
import hashlib, subprocess, urllib
try:
x = subprocess.check_output(('git', 'cat-file', 'blob', x))
x = subprocess.check_output(('git', 'cat-file', 'blob', tag))
except (OSError, subprocess.CalledProcessError):
x = urllib.urlopen(
'http://downloads.sf.net/sourceforge/python-mock/' + x).read()
mock_py = zipfile.ZipFile(cStringIO.StringIO(x)).read('mock.py')
if (hashlib.sha256(mock_py).hexdigest() !=
'c6ed26e4312ed82160016637a9b6f8baa71cf31a67c555d44045a1ef1d60d1bc'):
raise EnvironmentError("SHA checksum mismatch downloading 'mock.py'")
open(mock, 'w').write(mock_py)
x = urllib.urlopen(url).read()
x = extract(x, name)
if hashlib.sha256(x).hexdigest() != h:
raise EnvironmentError("SHA checksum mismatch downloading '%s'" % name)
with open(path, 'wb') as f:
f.write(x)
def unzip(content, name):
    """Return the bytes of member `name` of the zip archive `content`.

    `content` is the whole archive, as a byte string.
    """
    import io
    import zipfile
    with zipfile.ZipFile(io.BytesIO(content)) as archive:
        return archive.read(name)
x = 'pythonmock-0.1.0.zip'
get3rdParty('mock.py', x,
'http://downloads.sf.net/sourceforge/python-mock/' + x,
'c6ed26e4312ed82160016637a9b6f8baa71cf31a67c555d44045a1ef1d60d1bc',
unzip)
x = 'ConflictFree.py'
get3rdParty(x, '3rdparty/' + x, 'https://lab.nexedi.com/nexedi/erp5'
'/raw/14b0fcdcc31c5791646f9590678ca028f5d221f5/product/ERP5Type/' + x,
'abb7970856540fd02150edd1fa9a3a3e8d0074ec526ab189684ef7ea9b41825f')
zodb_require = ['ZODB3>=3.10dev']
......@@ -42,6 +57,9 @@ extras_require = {
}
extras_require['tests'] = ['coverage', 'zope.testing', 'psutil>=2',
'neoppod[%s]' % ', '.join(extras_require)]
extras_require['stress'] = ['NetfilterQueue', 'gevent', 'neoppod[tests]',
'cython-zstd', # recommended (log rotation)
]
try:
from docutils.core import publish_string
......
This diff is collapsed. Click to expand it.
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!