Commit 76872d95 authored by Vincent Pelletier's avatar Vincent Pelletier

cli.updater: Retry on network error.

Maybe name resolution is flapping, or server is unreachable or
misbehaving... This must not cause caucase-update to exit.
parent bd783f0f
...@@ -24,8 +24,10 @@ import datetime ...@@ -24,8 +24,10 @@ import datetime
import httplib import httplib
import json import json
import os import os
import socket
import struct import struct
import sys import sys
import traceback
from cryptography import x509 from cryptography import x509
from cryptography.hazmat.backends import default_backend from cryptography.hazmat.backends import default_backend
from . import utils from . import utils
...@@ -36,6 +38,35 @@ from .client import ( ...@@ -36,6 +38,35 @@ from .client import (
HTTPSOnlyCaucaseClient, HTTPSOnlyCaucaseClient,
) )
class RetryingCaucaseClient(CaucaseClient):
_until = utils.until
def _request(self, connection, method, url, body=None, headers=None):
while True:
try:
return super(RetryingCaucaseClient, self)._request(
connection=connection,
method=method,
url=url,
body=body,
headers=headers,
)
except (
socket.error,
# Note: all exceptions below inherit from httplib.HTTPException,
# but so do errors which are either sign of code bugs
# (ImproperConnectionState) or sign of garbage values provided by
# caller/user, and these should be let through.
httplib.BadStatusLine,
httplib.LineTooLong,
httplib.UnknownProtocol,
httplib.IncompleteRead,
):
connection.close() # Resets HTTPConnection state machine.
print 'Got a network error, retrying in a bit...'
traceback.print_exc()
self._until(datetime.datetime.now() + datetime.timedelta(0, 10))
  • @vpelletier @luke I noticed this by accident while looking at https://lab.nexedi.com/luke/kedifa/merge_requests/1#note_68884

    pylint complain about this line:

    E: 69, 8: Too many positional arguments for function call (too-many-function-args)

    I guess because self. counts as one argument.

    $ for p in repro/*py; do echo "#$p" ; cat $p; done
    #repro/__init__.py
    #repro/a.py
    from . import utils
    
    class X(object):
        _until = utils.until
        def request(self):
            print (self._until(3))
    
    X().request()
    
    #repro/utils.py
    def until(a):
        print ("until", a)
    $ PYTHONPATH=. python2 -m repro.a
    Traceback (most recent call last):
      File "/opt/slapgrid/b0bcd9831b23f334bc85e4a193c748a0/parts/python2.7/lib/python2.7/runpy.py", line 174, in _run_module_as_main
        "__main__", fname, loader, pkg_name)
      File "/opt/slapgrid/b0bcd9831b23f334bc85e4a193c748a0/parts/python2.7/lib/python2.7/runpy.py", line 72, in _run_code
        exec code in run_globals
      File "/tmp/repro/a.py", line 8, in <module>
        X().request()
      File "/tmp/repro/a.py", line 6, in request
        print (self._until(3))
    TypeError: until() takes exactly 1 argument (2 given)
    $ PYTHONPATH=. python3 -m repro.a
    Traceback (most recent call last):
      File "/usr/lib/python3.5/runpy.py", line 193, in _run_module_as_main
        "__main__", mod_spec)
      File "/usr/lib/python3.5/runpy.py", line 85, in _run_code
        exec(code, run_globals)
      File "/tmp/repro/a.py", line 8, in <module>
        X().request()
      File "/tmp/repro/a.py", line 6, in request
        print (self._until(3))
    TypeError: until() takes 1 positional argument but 2 were given
                         
  • Indeed, this was not picked by my pylint for some reason, and was not caught in tests because they patch this attribute (which exists to be patchable in tests, actually...). Fixed in master.

Please register or sign in to reply
_cryptography_backend = default_backend() _cryptography_backend = default_backend()
STATUS_ERROR = 1 STATUS_ERROR = 1
...@@ -693,11 +724,11 @@ def updater(argv=None, until=utils.until): ...@@ -693,11 +724,11 @@ def updater(argv=None, until=utils.until):
}[args.mode] }[args.mode]
threshold = datetime.timedelta(args.threshold, 0) threshold = datetime.timedelta(args.threshold, 0)
max_sleep = datetime.timedelta(args.max_sleep, 0) max_sleep = datetime.timedelta(args.max_sleep, 0)
updated = CaucaseClient.updateCAFile( updated = RetryingCaucaseClient.updateCAFile(
cas_url, cas_url,
args.cas_ca, args.cas_ca,
) and args.cas_ca == args.ca ) and args.cas_ca == args.ca
client = CaucaseClient( client = RetryingCaucaseClient(
ca_url=ca_url, ca_url=ca_url,
ca_crt_pem_list=utils.getCertList(args.cas_ca) ca_crt_pem_list=utils.getCertList(args.cas_ca)
) )
...@@ -733,15 +764,15 @@ def updater(argv=None, until=utils.until): ...@@ -733,15 +764,15 @@ def updater(argv=None, until=utils.until):
) )
now = until(next_deadline) now = until(next_deadline)
next_deadline = now + max_sleep next_deadline = now + max_sleep
if args.cas_ca != args.ca and CaucaseClient.updateCAFile( if args.cas_ca != args.ca and RetryingCaucaseClient.updateCAFile(
cas_url, cas_url,
args.cas_ca, args.cas_ca,
): ):
client = CaucaseClient( client = RetryingCaucaseClient(
ca_url=ca_url, ca_url=ca_url,
ca_crt_pem_list=utils.getCertList(args.cas_ca) ca_crt_pem_list=utils.getCertList(args.cas_ca)
) )
if CaucaseClient.updateCAFile(ca_url, args.ca): if RetryingCaucaseClient.updateCAFile(ca_url, args.ca):
print 'Got new CA' print 'Got new CA'
updated = True updated = True
# Note: CRL expiration should happen several time during CA renewal # Note: CRL expiration should happen several time during CA renewal
...@@ -751,7 +782,7 @@ def updater(argv=None, until=utils.until): ...@@ -751,7 +782,7 @@ def updater(argv=None, until=utils.until):
utils.load_ca_certificate(x) utils.load_ca_certificate(x)
for x in utils.getCertList(args.ca) for x in utils.getCertList(args.ca)
] ]
if CaucaseClient.updateCRLFile(ca_url, args.crl, ca_crt_list): if RetryingCaucaseClient.updateCRLFile(ca_url, args.crl, ca_crt_list):
print 'Got new CRL' print 'Got new CRL'
updated = True updated = True
next_deadline = min( next_deadline = min(
......
...@@ -25,6 +25,7 @@ from cStringIO import StringIO ...@@ -25,6 +25,7 @@ from cStringIO import StringIO
import datetime import datetime
import errno import errno
import glob import glob
import httplib
import ipaddress import ipaddress
import os import os
import random import random
...@@ -1934,6 +1935,43 @@ class CaucaseTest(unittest.TestCase): ...@@ -1934,6 +1935,43 @@ class CaucaseTest(unittest.TestCase):
self.assertFalse(os.path.exists(re_crt_path)) self.assertFalse(os.path.exists(re_crt_path))
updater_event = threading.Event() updater_event = threading.Event()
until_updater = UntilEvent(updater_event) until_updater = UntilEvent(updater_event)
network_issue_event = threading.Event()
until_network_issue = UntilEvent(network_issue_event)
# pylint: disable=protected-access
cli.RetryingCaucaseClient._until = until_network_issue
until_network_issue.action = ON_EVENT_EXPIRE
# pylint: enable=protected-access
original_HTTPConnection = cli.RetryingCaucaseClient.HTTPConnection
class ErrorInjector(object):
"""
Callable instances which set a flag when called and raise provided
exception.
"""
reached = False
def __init__(self, exception):
self.exception = exception
def __call__(self, *_, **__):
self.reached = True
raise self.exception
class HTTPConnectionNetworkFaultInjector(original_HTTPConnection):
"""
Place holder for error injectors (to not poison the real HTTPConnection).
"""
pass
def reraise(*_, **__): # pragma: no cover
"""
Just reraise latest error.
For replacing RetryingCaucaseClient._until only.
"""
# pylint: disable=misplaced-bare-raise
raise
# pylint: enable=misplaced-bare-raise
cli.RetryingCaucaseClient.HTTPConnection = HTTPConnectionNetworkFaultInjector
# Prime the error pump.
HTTPConnectionNetworkFaultInjector.connect = injector = ErrorInjector(
socket.gaierror(-3, 'Temporary failure in name resolution'),
)
updater_thread = threading.Thread( updater_thread = threading.Thread(
target=cli.updater, target=cli.updater,
kwargs={ kwargs={
...@@ -1952,6 +1990,40 @@ class CaucaseTest(unittest.TestCase): ...@@ -1952,6 +1990,40 @@ class CaucaseTest(unittest.TestCase):
updater_thread.daemon = True updater_thread.daemon = True
updater_thread.start() updater_thread.start()
try: try:
# XXX: this error injection technique only tests the first query. Assume
# the code handle network errors consistently everywhere
# (IOW, RetryingCaucaseClient is used everywhere).
for index, (HTTPConnection_method_id, injector) in enumerate((
('connect', injector), # Primed above
('connect', ErrorInjector(socket.error(111, 'Connection refused'))),
('getresponse', ErrorInjector(httplib.BadStatusLine('blah'))),
('getresponse', ErrorInjector(httplib.LineTooLong('header line'))),
('getresponse', ErrorInjector(httplib.UnknownProtocol('HTTP/pigeon'))),
('getresponse', ErrorInjector(httplib.IncompleteRead('you are my '))),
)):
if index:
# Set next fault
setattr(
HTTPConnectionNetworkFaultInjector,
HTTPConnection_method_id,
injector,
)
# Let code reach it
network_issue_event.set()
# Wait for the code to reach the fault.
# XXX: how to fast-fail if thread dies ?
until_network_issue.wait()
# Check injector really got triggered.
self.assertTrue(injector.reached)
# Cleanup and prepare next error
delattr(HTTPConnectionNetworkFaultInjector, HTTPConnection_method_id)
cli.RetryingCaucaseClient.HTTPConnection = original_HTTPConnection
# pylint: disable=protected-access
cli.RetryingCaucaseClient._until = reraise
# pylint: enable=protected-access
# Let code carry on.
network_issue_event.set()
until_updater.wait() until_updater.wait()
# CSR must have been submitted # CSR must have been submitted
csr_line_list = self._runClient( csr_line_list = self._runClient(
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment