app.py 14.4 KB
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406
#
# Copyright (C) 2006-2014  Nexedi SA
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

import sys
from collections import deque

from neo.lib import logging
from neo.lib.protocol import uuid_str, \
    CellStates, ClusterStates, NodeTypes, Packets
from neo.lib.node import NodeManager
from neo.lib.event import EventManager
from neo.lib.connection import ListeningConnection
from neo.lib.exception import OperationFailure, PrimaryFailure
from neo.lib.connector import getConnectorHandler
from neo.lib.pt import PartitionTable
from neo.lib.util import dump
from neo.lib.bootstrap import BootstrapManager
from .checker import Checker
from .database import buildDatabaseManager
from .exception import AlreadyPendingError
from .handlers import identification, verification, initialization
from .handlers import master, hidden
from .replicator import Replicator
from .transactions import TransactionManager

from neo.lib.debug import register as registerLiveDebugger

class Application(object):
    """The storage node application."""

    def __init__(self, config):
        # set the cluster name
        self.name = config.getCluster()

        # Internal attributes.
        self.em = EventManager()
        self.nm = NodeManager(config.getDynamicMasterList())
        self.tm = TransactionManager(self)
        self.dm = buildDatabaseManager(config.getAdapter(),
            (config.getDatabase(), config.getWait())
        )

        # load master nodes
        master_addresses, connector_name = config.getMasters()
        self.connector_handler = getConnectorHandler(connector_name)
        for master_address in master_addresses :
            self.nm.createMaster(address=master_address)

        # set the bind address
        self.server = config.getBind()
        logging.debug('IP address is %s, port is %d', *self.server)

        # The partition table is initialized after getting the number of
        # partitions.
        self.pt = None

        self.checker = Checker(self)
        self.replicator = Replicator(self)
        self.listening_conn = None
        self.master_conn = None
        self.master_node = None

        # operation related data
        self.event_queue = None
        self.event_queue_dict = None
        self.operational = False

        # ready is True when operational and got all informations
        self.ready = False

        self.dm.setup(reset=config.getReset())
        self.loadConfiguration()

        # force node uuid from command line argument, for testing purpose only
        if config.getUUID() is not None:
            self.uuid = config.getUUID()

        registerLiveDebugger(on_log=self.log)

    def close(self):
        self.listening_conn = None
        self.nm.close()
        self.em.close()
        try:
            self.dm.close()
        except AttributeError:
            pass
        del self.__dict__

    def _poll(self):
        self.em.poll(1)

    def log(self):
        self.em.log()
        self.logQueuedEvents()
        self.nm.log()
        self.tm.log()
        if self.pt is not None:
            self.pt.log()

    def loadConfiguration(self):
        """Load persistent configuration data from the database.
        If data is not present, generate it."""

        dm = self.dm

        # check cluster name
        name = dm.getName()
        if name is None:
            dm.setName(self.name)
        elif name != self.name:
            raise RuntimeError('name %r does not match with the database: %r'
                               % (self.name, name))

        # load configuration
        self.uuid = dm.getUUID()
        num_partitions = dm.getNumPartitions()
        num_replicas = dm.getNumReplicas()
        ptid = dm.getPTID()

        # check partition table configuration
        if num_partitions is not None and num_replicas is not None:
            if num_partitions <= 0:
                raise RuntimeError, 'partitions must be more than zero'
            # create a partition table
            self.pt = PartitionTable(num_partitions, num_replicas)

        logging.info('Configuration loaded:')
        logging.info('UUID      : %s', uuid_str(self.uuid))
        logging.info('PTID      : %s', dump(ptid))
        logging.info('Name      : %s', self.name)
        logging.info('Partitions: %s', num_partitions)
        logging.info('Replicas  : %s', num_replicas)

    def loadPartitionTable(self):
        """Load a partition table from the database."""
        ptid = self.dm.getPTID()
        cell_list = self.dm.getPartitionTable()
        new_cell_list = []
        for offset, uuid, state in cell_list:
            # convert from int to Enum
            state = CellStates[state]
            # register unknown nodes
            if self.nm.getByUUID(uuid) is None:
                self.nm.createStorage(uuid=uuid)
            new_cell_list.append((offset, uuid, state))
        # load the partition table in manager
        self.pt.clear()
        self.pt.update(ptid, new_cell_list, self.nm)

    def run(self):
        try:
            self._run()
        except Exception:
            logging.exception('Pre-mortem data:')
            self.log()
            raise

    def _run(self):
        """Make sure that the status is sane and start a loop."""
        if len(self.name) == 0:
            raise RuntimeError, 'cluster name must be non-empty'

        # Make a listening port
        handler = identification.IdentificationHandler(self)
        self.listening_conn = ListeningConnection(self.em, handler,
            addr=self.server, connector=self.connector_handler())
        self.server = self.listening_conn.getAddress()

        # Connect to a primary master node, verify data, and
        # start the operation. This cycle will be executed permanently,
        # until the user explicitly requests a shutdown.
        while True:
            self.cluster_state = None
            self.ready = False
            self.operational = False
            if self.master_node is None:
                # look for the primary master
                self.connectToPrimary()
            # check my state
            node = self.nm.getByUUID(self.uuid)
            if node is not None and node.isHidden():
                self.wait()
            # drop any client node
            for conn in self.em.getConnectionList():
                if conn not in (self.listening_conn, self.master_conn):
                    conn.close()
            # create/clear event queue
            self.event_queue = deque()
            self.event_queue_dict = {}
            try:
                self.verifyData()
                self.initialize()
                self.doOperation()
                raise RuntimeError, 'should not reach here'
            except OperationFailure, msg:
                logging.error('operation stopped: %s', msg)
                if self.cluster_state == ClusterStates.STOPPING_BACKUP:
                    self.dm.setBackupTID(None)
            except PrimaryFailure, msg:
                logging.error('primary master is down: %s', msg)
            finally:
                self.checker = Checker(self)

    def connectToPrimary(self):
        """Find a primary master node, and connect to it.

        If a primary master node is not elected or ready, repeat
        the attempt of a connection periodically.

        Note that I do not accept any connection from non-master nodes
        at this stage."""
        pt = self.pt

        # First of all, make sure that I have no connection.
        for conn in self.em.getConnectionList():
            if not conn.isListening():
                conn.close()

        # search, find, connect and identify to the primary master
        bootstrap = BootstrapManager(self, self.name,
                NodeTypes.STORAGE, self.uuid, self.server)
        data = bootstrap.getPrimaryConnection(self.connector_handler)
        (node, conn, uuid, num_partitions, num_replicas) = data
        self.master_node = node
        self.master_conn = conn
        logging.info('I am %s', uuid_str(uuid))
        self.uuid = uuid
        self.dm.setUUID(uuid)

        # Reload a partition table from the database. This is necessary
        # when a previous primary master died while sending a partition
        # table, because the table might be incomplete.
        if pt is not None:
            self.loadPartitionTable()
            if num_partitions != pt.getPartitions():
                raise RuntimeError('the number of partitions is inconsistent')

        if pt is None or pt.getReplicas() != num_replicas:
            # changing number of replicas is not an issue
            self.dm.setNumPartitions(num_partitions)
            self.dm.setNumReplicas(num_replicas)
            self.pt = PartitionTable(num_partitions, num_replicas)
            self.loadPartitionTable()

    def verifyData(self):
        """Verify data under the control by a primary master node.
        Connections from client nodes may not be accepted at this stage."""
        logging.info('verifying data')

        handler = verification.VerificationHandler(self)
        self.master_conn.setHandler(handler)
        _poll = self._poll

        while not self.operational:
            _poll()

    def initialize(self):
        """ Retreive partition table and node informations from the primary """
        logging.debug('initializing...')
        _poll = self._poll
        handler = initialization.InitializationHandler(self)
        self.master_conn.setHandler(handler)

        # ask node list and partition table
        self.pt.clear()
        self.master_conn.ask(Packets.AskNodeInformation())
        self.master_conn.ask(Packets.AskPartitionTable())
        while self.master_conn.isPending():
            _poll()
        self.ready = True
        self.replicator.populate()
        self.master_conn.notify(Packets.NotifyReady())

    def doOperation(self):
        """Handle everything, including replications and transactions."""
        logging.info('doing operation')

        _poll = self._poll
        isIdle = self.em.isIdle

        handler = master.MasterOperationHandler(self)
        self.master_conn.setHandler(handler)

        # Forget all unfinished data.
        self.dm.dropUnfinishedData()
        self.tm.reset()

        self.task_queue = task_queue = deque()
        try:
            while True:
                while task_queue and isIdle():
                    try:
                        task_queue[-1].next()
                        task_queue.rotate()
                    except StopIteration:
                        task_queue.pop()
                _poll()
        finally:
            del self.task_queue
            # XXX: Although no handled exception should happen between
            #      replicator.populate() and the beginning of this 'try'
            #      clause, the replicator should be reset in a safer place.
            self.replicator = Replicator(self)
            # Abort any replication, whether we are feeding or out-of-date.
            for node in self.nm.getStorageList(only_identified=True):
                node.getConnection().close()

    def changeClusterState(self, state):
        self.cluster_state = state
        if state == ClusterStates.STOPPING_BACKUP:
            self.replicator.stop()

    def wait(self):
        # change handler
        logging.info("waiting in hidden state")
        _poll = self._poll
        handler = hidden.HiddenHandler(self)
        for conn in self.em.getConnectionList():
            conn.setHandler(handler)

        node = self.nm.getByUUID(self.uuid)
        while True:
            _poll()
            if not node.isHidden():
                break

    def queueEvent(self, some_callable, conn=None, args=(), key=None,
            raise_on_duplicate=True):
        msg_id = None if conn is None else conn.getPeerId()
        event_queue_dict = self.event_queue_dict
        if raise_on_duplicate and key in event_queue_dict:
            raise AlreadyPendingError()
        else:
            self.event_queue.append((key, some_callable, msg_id, conn, args))
            if key is not None:
                try:
                    event_queue_dict[key] += 1
                except KeyError:
                    event_queue_dict[key] = 1

    def executeQueuedEvents(self):
        l = len(self.event_queue)
        p = self.event_queue.popleft
        event_queue_dict = self.event_queue_dict
        for _ in xrange(l):
            key, some_callable, msg_id, conn, args = p()
            if key is not None:
                event_queue_dict[key] -= 1
                if event_queue_dict[key] == 0:
                    del event_queue_dict[key]
            if conn is None:
                some_callable(*args)
            elif not conn.isClosed():
                orig_msg_id = conn.getPeerId()
                try:
                    conn.setPeerId(msg_id)
                    some_callable(conn, *args)
                finally:
                    conn.setPeerId(orig_msg_id)

    def logQueuedEvents(self):
        if self.event_queue is None:
            return
        logging.info("Pending events:")
        for key, event, _msg_id, _conn, args in self.event_queue:
            logging.info('  %r:%r: %r:%r %r %r', key, event.__name__,
                _msg_id, _conn, args)

    def newTask(self, iterator):
        try:
            iterator.next()
        except StopIteration:
            return
        self.task_queue.appendleft(iterator)

    def closeClient(self, connection):
        if connection is not self.replicator.getCurrentConnection() and \
           connection not in self.checker.conn_dict:
            connection.closeClient()

    def shutdown(self, erase=False):
        """Close all connections and exit"""
        for c in self.em.getConnectionList():
            try:
                c.close()
            except PrimaryFailure:
                pass
        # clear database to avoid polluting the cluster at restart
        if erase:
            self.dm.erase()
        logging.info("Application has been asked to shut down")
        sys.exit()