Commit 23b6a66a authored by Julien Muchembled's avatar Julien Muchembled

Reimplement election (of the primary master)

The election is not a separate process anymore.
It happens during the RECOVERING phase, and there's no use of timeouts anymore.

Each master node keeps a timestamp of when it started to play the primary role,
and the node with the smallest timestamp is elected. The election stops when
the cluster is started: as long as it is operational, the primary master can't
be deposed.

An election must happen whenever the cluster is not operational anymore, to
handle the case of a network cut between a primary master and all other nodes:
then another master node (secondary) takes over and when the initial primary
master is back, it loses against the new primary master if the cluster is
already started.
parent 0a3dba8b
......@@ -24,8 +24,6 @@
This is mainly the case for :
- Client rejected before the cluster is operational
- Empty storages rejected during recovery process
Masters implies in the election process should still reject any connection
as the primary master is still unknown.
- Implement transaction garbage collection API (FEATURE)
NEO packing implementation does not update transaction metadata when
deleting object revisions. This inconsistency must be made possible to
......
......@@ -194,17 +194,17 @@ class Application(ThreadedApplication):
self.nm.reset()
if self.primary_master_node is not None:
# If I know a primary master node, pinpoint it.
self.trying_master_node = self.primary_master_node
node = self.primary_master_node
self.primary_master_node = None
else:
# Otherwise, check one by one.
master_list = self.nm.getMasterList()
index = (index + 1) % len(master_list)
self.trying_master_node = master_list[index]
node = master_list[index]
# Connect to master
conn = MTClientConnection(self,
self.notifications_handler,
node=self.trying_master_node,
node=node,
dispatcher=self.dispatcher)
p = Packets.RequestIdentification(
NodeTypes.CLIENT, self.uuid, None, self.name, None)
......@@ -212,10 +212,8 @@ class Application(ThreadedApplication):
ask(conn, p, handler=handler)
except ConnectionClosed:
fail_count += 1
continue
# If we reached the primary master node, mark as connected
if self.primary_master_node is not None and \
self.primary_master_node is self.trying_master_node:
else:
self.primary_master_node = node
break
else:
raise NEOPrimaryMasterLost(
......
......@@ -15,10 +15,10 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from neo.lib import logging
from neo.lib.exception import PrimaryElected
from neo.lib.handler import MTEventHandler
from neo.lib.pt import MTPartitionTable as PartitionTable
from neo.lib.protocol import NodeStates, ProtocolError
from neo.lib.util import dump
from neo.lib.protocol import NodeStates
from . import AnswerBaseHandler
from ..exception import NEOStorageError
......@@ -26,10 +26,6 @@ from ..exception import NEOStorageError
class PrimaryBootstrapHandler(AnswerBaseHandler):
""" Bootstrap handler used when looking for the primary master """
def notReady(self, conn, message):
self.app.trying_master_node = None
conn.close()
def answerPartitionTable(self, conn, ptid, row_list):
assert row_list
self.app.pt.load(ptid, row_list, self.app.nm)
......@@ -40,57 +36,14 @@ class PrimaryBootstrapHandler(AnswerBaseHandler):
class PrimaryNotificationsHandler(MTEventHandler):
""" Handler that process the notifications from the primary master """
def _acceptIdentification(self, node, uuid, num_partitions,
num_replicas, your_uuid, primary, known_master_list):
app = self.app
# Register new master nodes.
found = False
conn_address = node.getAddress()
for node_address, node_uuid in known_master_list:
if node_address == conn_address:
assert uuid == node_uuid, (dump(uuid), dump(node_uuid))
found = True
n = app.nm.getByAddress(node_address)
if n is None:
n = app.nm.createMaster(address=node_address)
if node_uuid is not None and n.getUUID() != node_uuid:
n.setUUID(node_uuid)
assert found, (node, dump(uuid), known_master_list)
conn = node.getConnection()
if primary is not None:
primary_node = app.nm.getByAddress(primary)
if primary_node is None:
# I don't know such a node. Probably this information
# is old. So ignore it.
logging.warning('Unknown primary master: %s. Ignoring.',
primary)
return
else:
if app.trying_master_node is not primary_node:
app.trying_master_node = None
conn.close()
app.primary_master_node = primary_node
else:
if app.primary_master_node is not None:
# The primary master node is not a primary master node
# any longer.
app.primary_master_node = None
app.trying_master_node = None
conn.close()
return
# the master must give an UUID
if your_uuid is None:
raise ProtocolError('No UUID supplied')
app.uuid = your_uuid
logging.info('Got an UUID: %s', dump(app.uuid))
app.id_timestamp = None
def notPrimaryMaster(self, *args):
try:
super(PrimaryNotificationsHandler, self).notPrimaryMaster(*args)
except PrimaryElected, e:
app.primary_master_node, = e.args
# Always create partition table
app.pt = PartitionTable(num_partitions, num_replicas)
def _acceptIdentification(self, node, num_partitions, num_replicas):
self.app.pt = PartitionTable(num_partitions, num_replicas)
def answerLastTransaction(self, conn, ltid):
app = self.app
......
......@@ -42,13 +42,8 @@ class StorageEventHandler(MTEventHandler):
self.app.cp.removeConnection(node)
super(StorageEventHandler, self).connectionFailed(conn)
def _acceptIdentification(self, node,
uuid, num_partitions, num_replicas, your_uuid, primary,
master_list):
assert self.app.master_conn is None or \
primary == self.app.master_conn.getAddress(), (
primary, self.app.master_conn)
assert uuid == node.getUUID(), (uuid, node.getUUID())
def _acceptIdentification(*args):
pass
class StorageBootstrapHandler(AnswerBaseHandler):
""" Handler used when connecting to a storage node """
......
......@@ -15,8 +15,9 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from . import logging
from .exception import PrimaryElected
from .handler import EventHandler
from .protocol import uuid_str, Packets
from .protocol import Packets
from .connection import ClientConnection
......@@ -24,7 +25,6 @@ class BootstrapManager(EventHandler):
"""
Manage the bootstrap stage, lookup for the primary master then connect to it
"""
accepted = False
def __init__(self, app, node_type, server=None):
"""
......@@ -32,85 +32,30 @@ class BootstrapManager(EventHandler):
primary master node, connect to it then returns when the master node
is ready.
"""
self.primary = None
self.server = server
self.node_type = node_type
self.num_replicas = None
self.num_partitions = None
self.current = None
app.nm.reset()
uuid = property(lambda self: self.app.uuid)
def announcePrimary(self, conn):
# We found the primary master early enough to be notified of election
# end. Lucky. Anyway, we must carry on with identification request, so
# nothing to do here.
pass
def connectionCompleted(self, conn):
"""
Triggered when the network connection is successful.
Now ask who's the primary.
"""
EventHandler.connectionCompleted(self, conn)
self.current.setRunning()
conn.ask(Packets.RequestIdentification(self.node_type, self.uuid,
self.server, self.app.name, None))
def connectionFailed(self, conn):
"""
Triggered when the network connection failed.
Restart bootstrap.
"""
EventHandler.connectionFailed(self, conn)
self.current = None
def connectionLost(self, conn, new_state):
"""
Triggered when an established network connection is lost.
Restart bootstrap.
"""
self.current.setTemporarilyDown()
self.current = None
def notReady(self, conn, message):
"""
The primary master send this message when it is still not ready to
handle the client node.
Close connection and restart.
"""
conn.close()
def _acceptIdentification(self, node, uuid, num_partitions,
num_replicas, your_uuid, primary, known_master_list):
nm = self.app.nm
# Register new master nodes.
for address, uuid in known_master_list:
master_node = nm.getByAddress(address)
if master_node is None:
master_node = nm.createMaster(address=address)
master_node.setUUID(uuid)
self.primary = nm.getByAddress(primary)
if self.primary is None or self.current is not self.primary:
# three cases here:
# - something goes wrong (unknown UUID)
# - this master doesn't know who's the primary
# - got the primary's uuid, so cut here
node.getConnection().close()
return
logging.info('connected to a primary master node')
def _acceptIdentification(self, node, num_partitions, num_replicas):
assert self.current is node, (self.current, node)
self.num_partitions = num_partitions
self.num_replicas = num_replicas
if self.uuid != your_uuid:
# got an uuid from the primary master
self.app.uuid = your_uuid
logging.info('Got a new UUID: %s', uuid_str(self.uuid))
self.app.id_timestamp = None
self.accepted = True
def getPrimaryConnection(self):
"""
......@@ -122,25 +67,26 @@ class BootstrapManager(EventHandler):
poll = app.em.poll
index = 0
self.current = None
conn = None
# retry until identified to the primary
while not self.accepted:
if self.current is None:
# conn closed
conn = None
while True:
try:
while self.current:
if self.current.isIdentified():
return (self.current, self.current.getConnection(),
self.num_partitions, self.num_replicas)
poll(1)
except PrimaryElected, e:
if self.current:
self.current.getConnection().close()
self.current, = e.args
index = app.nm.getMasterList().index(self.current)
else:
# select a master
master_list = app.nm.getMasterList()
index = (index + 1) % len(master_list)
self.current = master_list[index]
if conn is None:
# open the connection
conn = ClientConnection(app, self, self.current)
# Yes, the connection may be already closed. This happens when
# the kernel reacts so quickly to a closed port that 'connect'
# fails on the first call. In such case, poll(1) would deadlock
# if there's no other connection to timeout.
if conn.isClosed():
continue
# still processing
poll(1)
return self.current, conn, self.num_partitions, self.num_replicas
ClientConnection(app, self, self.current)
# Note that the connection may be already closed. This happens when
# the kernel reacts so quickly to a closed port that 'connect'
# fails on the first call. In such case, poll(1) would deadlock
# if there's no other connection to timeout.
......@@ -259,10 +259,12 @@ class BaseConnection(object):
)
def setHandler(self, handler):
if self._handlers.setHandler(handler):
logging.debug('Set handler %r on %r', handler, self)
changed = self._handlers.setHandler(handler)
if changed:
logging.debug('Handler changed on %r', self)
else:
logging.debug('Delay handler %r on %r', handler, self)
return changed
def getUUID(self):
return None
......
......@@ -17,7 +17,7 @@
class NeoException(Exception):
pass
class ElectionFailure(NeoException):
class PrimaryElected(NeoException):
pass
class PrimaryFailure(NeoException):
......
......@@ -19,9 +19,10 @@ from collections import deque
from operator import itemgetter
from . import logging
from .connection import ConnectionClosed
from .protocol import (
NodeStates, Packets, Errors, BackendNotImplemented, NonReadableCell,
NotReadyError, PacketMalformedError, ProtocolError, UnexpectedPacketError)
from .exception import PrimaryElected
from .protocol import (NodeStates, NodeTypes, Packets, uuid_str,
Errors, BackendNotImplemented, NonReadableCell, NotReadyError,
PacketMalformedError, ProtocolError, UnexpectedPacketError)
from .util import cached_property
......@@ -147,16 +148,41 @@ class EventHandler(object):
# Packet handlers.
def acceptIdentification(self, conn, node_type, *args):
try:
acceptIdentification = self._acceptIdentification
except AttributeError:
raise UnexpectedPacketError('no handler found')
node = self.app.nm.getByAddress(conn.getAddress())
def notPrimaryMaster(self, conn, primary, known_master_list):
nm = self.app.nm
for address in known_master_list:
nm.createMaster(address=address)
if primary is not None:
primary = known_master_list[primary]
assert primary != self.app.server
raise PrimaryElected(nm.getByAddress(primary))
def _acceptIdentification(*args):
pass
def acceptIdentification(self, conn, node_type, uuid,
num_partitions, num_replicas, your_uuid):
app = self.app
node = app.nm.getByAddress(conn.getAddress())
assert node.getConnection() is conn, (node.getConnection(), conn)
if node.getType() == node_type:
if node_type == NodeTypes.MASTER:
other = app.nm.getByUUID(uuid)
if other is not None:
other.setUUID(None)
node.setUUID(uuid)
node.setRunning()
if your_uuid is None:
raise ProtocolError('No UUID supplied')
logging.info('connected to a primary master node')
if app.uuid != your_uuid:
app.uuid = your_uuid
logging.info('Got a new UUID: %s', uuid_str(your_uuid))
app.id_timestamp = None
elif node.getUUID() != uuid or app.uuid != your_uuid != None:
raise ProtocolError('invalid uuids')
node.setIdentified()
acceptIdentification(node, *args)
self._acceptIdentification(node, num_partitions, num_replicas)
return
conn.close()
......
......@@ -423,7 +423,7 @@ class NodeManager(EventQueue):
# lookup in current table
node_by_uuid = self.getByUUID(uuid)
node_by_addr = self.getByAddress(addr)
node = node_by_uuid or node_by_addr
node = node_by_addr or node_by_uuid
log_args = node_type, uuid_str(uuid), addr, state, id_timestamp
if node is None:
......@@ -434,10 +434,11 @@ class NodeManager(EventQueue):
else:
assert isinstance(node, klass), 'node %r is not ' \
'of expected type: %r' % (node, klass)
assert None in (node_by_uuid, node_by_addr) or \
node_by_uuid is node_by_addr, \
if None is not node_by_uuid is not node_by_addr is not None:
assert added_list is not None, \
'Discrepancy between node_by_uuid (%r) and ' \
'node_by_addr (%r)' % (node_by_uuid, node_by_addr)
node_by_uuid.setUUID(None)
if state == NodeStates.DOWN:
logging.debug('dropping node %r (%r), found with %s '
'%s %s %s %s', node, node.isConnected(), *log_args)
......
......@@ -578,12 +578,14 @@ class PChecksum(PItem):
def _decode(self, reader):
return reader(20)
class PUUID(PStructItemOrNone):
class PSignedNull(PStructItemOrNone):
_fmt = '!l'
_None = Struct(_fmt).pack(0)
class PUUID(PSignedNull):
"""
An UUID (node identifier, 4-bytes signed integer)
"""
_fmt = '!l'
_None = Struct(_fmt).pack(0)
class PTID(PItem):
"""
......@@ -715,13 +717,6 @@ class RequestIdentification(Packet):
PNumber('num_partitions'),
PNumber('num_replicas'),
PUUID('your_uuid'),
PAddress('primary'),
PList('known_master_list',
PStruct('master',
PAddress('address'),
PUUID('uuid'),
),
),
)
def __init__(self, *args, **kw):
......@@ -742,15 +737,16 @@ class PrimaryMaster(Packet):
PUUID('primary_uuid'),
)
class AnnouncePrimary(Packet):
"""
Announce a primary master node election. PM -> SM.
"""
class ReelectPrimary(Packet):
class NotPrimaryMaster(Packet):
"""
Force a re-election of a primary master node. M -> M.
Send list of known master nodes. SM -> Any.
"""
_fmt = PStruct('not_primary_master',
PSignedNull('primary'),
PList('known_master_list',
PAddress('address'),
),
)
class Recovery(Packet):
"""
......@@ -1687,10 +1683,8 @@ class Packets(dict):
Notify)
AskPrimary, AnswerPrimary = register(
PrimaryMaster)
AnnouncePrimary = register(
AnnouncePrimary)
ReelectPrimary = register(
ReelectPrimary)
NotPrimaryMaster = register(
NotPrimaryMaster)
NotifyNodeInformation = register(
NotifyNodeInformation)
AskRecovery, AnswerRecovery = register(
......
This diff is collapsed.
......@@ -29,34 +29,9 @@ class MasterHandler(EventHandler):
if new is None:
super(MasterHandler, self).connectionCompleted(conn)
def requestIdentification(self, conn, node_type, uuid, address, name, _):
self.checkClusterName(name)
app = self.app
node = app.nm.getByUUID(uuid)
if node:
if node_type is NodeTypes.MASTER and not (
None != address == node.getAddress()):
raise ProtocolError
peer_uuid = self._setupNode(conn, node_type, uuid, address, node)
if app.primary:
primary_address = app.server
elif app.primary_master_node is not None:
primary_address = app.primary_master_node.getAddress()
else:
primary_address = None
known_master_list = []
for n in app.nm.getMasterList():
known_master_list.append((n.getAddress(), n.getUUID()))
conn.answer(Packets.AcceptIdentification(
NodeTypes.MASTER,
app.uuid,
app.pt.getPartitions(),
app.pt.getReplicas(),
peer_uuid,
primary_address,
known_master_list),
)
def connectionLost(self, conn, new_state=None):
if self.app.listening_conn: # if running
self._connectionLost(conn)
def askClusterState(self, conn):
state = self.app.getClusterState()
......
......@@ -22,16 +22,15 @@ from . import MasterHandler
class ClientServiceHandler(MasterHandler):
""" Handler dedicated to client during service state """
def connectionLost(self, conn, new_state):
def _connectionLost(self, conn):
# cancel its transactions and forgot the node
app = self.app
if app.listening_conn: # if running
node = app.nm.getByUUID(conn.getUUID())
assert node is not None
app.tm.clientLost(node)
node.setState(NodeStates.DOWN)
app.broadcastNodesInformation([node])
app.nm.remove(node)
node = app.nm.getByUUID(conn.getUUID())
assert node is not None, conn
app.tm.clientLost(node)
node.setState(NodeStates.DOWN)
app.broadcastNodesInformation([node])
app.nm.remove(node)
def askBeginTransaction(self, conn, tid):
"""
......
#
# Copyright (C) 2006-2017 Nexedi SA
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from neo.lib import logging
from neo.lib.protocol import uuid_str, NodeTypes, Packets
from neo.lib.protocol import NotReadyError
from neo.lib.exception import ElectionFailure
from neo.lib.handler import EventHandler
from . import MasterHandler
class BaseElectionHandler(EventHandler):
def _notifyNodeInformation(self, conn):
pass
def reelectPrimary(self, conn):
raise ElectionFailure, 'reelection requested'
def announcePrimary(self, conn):
app = self.app
if app.primary:
# I am also the primary... So restart the election.
raise ElectionFailure, 'another primary arises'
try:
address = app.master_address_dict[conn]
assert conn.isServer()
except KeyError:
address = conn.getAddress()
assert conn.isClient()
app.primary = False
app.primary_master_node = node = app.nm.getByAddress(address)
app.negotiating_master_node_set.clear()
logging.info('%s is the primary', node)
def elect(self, conn, peer_address):
app = self.app
if app.server < peer_address:
app.primary = False
if conn is not None:
app.master_address_dict[conn] = peer_address
app.negotiating_master_node_set.discard(peer_address)
class ClientElectionHandler(BaseElectionHandler):
def notifyNodeInformation(self, conn, timestamp, node_list):
# XXX: For the moment, do nothing because
# we'll close this connection and reconnect.
pass
def connectionFailed(self, conn):
addr = conn.getAddress()
node = self.app.nm.getByAddress(addr)
assert node is not None, (uuid_str(self.app.uuid), addr)
# node may still be in unknown state
self.app.negotiating_master_node_set.discard(addr)
super(ClientElectionHandler, self).connectionFailed(conn)
def connectionCompleted(self, conn):
app = self.app
conn.ask(Packets.RequestIdentification(
NodeTypes.MASTER,
app.uuid,
app.server,
app.name,
None,
))
super(ClientElectionHandler, self).connectionCompleted(conn)
def connectionLost(self, conn, new_state):
# Retry connection. Either the node just died (and we will end up in
# connectionFailed) or it just got elected (and we must not ignore
# that node).
addr = conn.getAddress()
self.app.unconnected_master_node_set.add(addr)
self.app.negotiating_master_node_set.discard(addr)
def _acceptIdentification(self, node, peer_uuid, num_partitions,
num_replicas, your_uuid, primary, known_master_list):
app = self.app
# Register new master nodes.
for address, uuid in known_master_list:
if app.server == address:
# This is self.
assert node.getAddress() != primary or uuid == your_uuid, (
uuid_str(uuid), uuid_str(your_uuid))
continue
n = app.nm.getByAddress(address)
if n is None:
n = app.nm.createMaster(address=address)
if primary is not None:
# The primary master is defined.
if app.primary_master_node is not None \
and app.primary_master_node.getAddress() != primary:
# There are multiple primary master nodes. This is
# dangerous.
raise ElectionFailure, 'multiple primary master nodes'
primary_node = app.nm.getByAddress(primary)
if primary_node is None:
# I don't know such a node. Probably this information
# is old. So ignore it.
logging.warning('received an unknown primary node')
else:
# Whatever the situation is, I trust this master.
app.primary = False
app.primary_master_node = primary_node
# Stop waiting for connections than primary master's to
# complete to exit election phase ASAP.
app.negotiating_master_node_set.clear()
return
self.elect(None, node.getAddress())
class ServerElectionHandler(BaseElectionHandler, MasterHandler):
def _setupNode(self, conn, node_type, uuid, address, node):
app = self.app
if node_type != NodeTypes.MASTER:
logging.info('reject a connection from a non-master')
raise NotReadyError
if node is None is app.nm.getByAddress(address):
app.nm.createMaster(address=address)
self.elect(conn, address)
return uuid
......@@ -15,27 +15,25 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from neo.lib import logging
from neo.lib.exception import PrimaryElected
from neo.lib.handler import EventHandler
from neo.lib.protocol import ClusterStates, NodeStates, NodeTypes, \
NotReadyError, ProtocolError, uuid_str
NotReadyError, Packets, ProtocolError, uuid_str
from ..app import monotonic_time
from . import MasterHandler
class IdentificationHandler(MasterHandler):
class IdentificationHandler(EventHandler):
def requestIdentification(self, conn, *args, **kw):
super(IdentificationHandler, self).requestIdentification(conn, *args,
**kw)
handler = conn.getHandler()
assert not isinstance(handler, IdentificationHandler), handler
handler._notifyNodeInformation(conn)
handler.connectionCompleted(conn, True)
def _setupNode(self, conn, node_type, uuid, address, node):
def requestIdentification(self, conn, node_type, uuid,
address, name, id_timestamp):
app = self.app
self.checkClusterName(name)