Commit c4872a03 authored by Alain Takoudjou's avatar Alain Takoudjou

slapos node boot: Start computer partitiong without connecting to master

See merge request nexedi/slapos.core!431
parents f85280ef cada4581
Pipeline #23994 failed with stage
in 0 seconds
......@@ -31,6 +31,7 @@ from __future__ import print_function
import subprocess
from six.moves.urllib.parse import urlparse
from six.moves import xmlrpc_client as xmlrpclib
from time import sleep
import glob
import os
......@@ -41,6 +42,11 @@ from slapos.cli.command import check_root_user
from slapos.cli.entry import SlapOSApp
from slapos.cli.config import ConfigCommand
from slapos.format import isGlobalScopeAddress
from slapos.grid.slapgrid import (COMPUTER_PARTITION_REQUESTED_STATE_FILENAME,
COMPUTER_PARTITION_STARTED_STATE)
from slapos.grid.svcbackend import (_getSupervisordSocketPath,
getSupervisorRPC,
launchSupervisord)
from slapos.util import string_to_boolean
import argparse
import logging
......@@ -59,6 +65,48 @@ def _removeTimestamp(instancehome, partition_base_name):
logger.info("Removing %s", timestamp_path)
os.remove(timestamp_path)
def _startComputerPartition(partition_id, supervisord_socket):
"""
With supervisord, start the instance that was deployed
"""
try:
with getSupervisorRPC(supervisord_socket) as supervisor:
supervisor.startProcessGroup(partition_id, False)
except xmlrpclib.Fault as exc:
if exc.faultString.startswith('BAD_NAME:'):
logger.info("Nothing to start on %s...", partition_id)
else:
raise
else:
logger.info("Requested start of %s...", partition_id)
def _startComputerPartitionList(instance_root, partition_base_name):
"""
Start services for partition which has requested state to 'started'
"""
partition_glob_path = os.path.join(
instance_root,
"%s*" % partition_base_name)
launchSupervisord(instance_root=instance_root, logger=logger)
for partition_path in glob.glob(partition_glob_path):
partition_state_path = os.path.join(
partition_path,
COMPUTER_PARTITION_REQUESTED_STATE_FILENAME
)
supervisord_socket_path = _getSupervisordSocketPath(
instance_root,
logger
)
if os.path.exists(partition_state_path):
partition_state = ""
with open(partition_state_path) as f:
partition_state = f.read()
if partition_state == COMPUTER_PARTITION_STARTED_STATE:
# Call start for this computer partition
_startComputerPartition(
os.path.basename(partition_path.rstrip('/')),
supervisord_socket_path
)
def _runBang(app):
"""
......@@ -76,7 +124,9 @@ def _runFormat(app):
Launch slapos node format.
"""
logger.info("[BOOT] Invoking slapos node format...")
result = app.run(['node', 'format', '--now', '--verbose'])
# '--local' parameter is to prevent node format command to post data to
# master, so this command can work without internet and setup partitions IP.
result = app.run(['node', 'format', '--now', '--local', '--verbose'])
if result == 1:
return 0
return 1
......@@ -196,6 +246,15 @@ class BootCommand(ConfigCommand):
if ipv6_interface is not None:
_waitIpv6Ready(ipv6_interface)
app = SlapOSApp()
# Make sure slapos node format returns ok
while not _runFormat(app):
logger.error("[BOOT] Fail to format, try again in 15 seconds...")
sleep(15)
# Start computer partition services
_startComputerPartitionList(instance_root, partition_base_name)
# Check that node can ping master
if valid_ipv4(master_hostname):
_test_ping(master_hostname)
......@@ -205,12 +264,6 @@ class BootCommand(ConfigCommand):
# hostname
_ping_hostname(master_hostname)
app = SlapOSApp()
# Make sure slapos node format returns ok
while not _runFormat(app):
logger.error("[BOOT] Fail to format, try again in 15 seconds...")
sleep(15)
# Make sure slapos node bang returns ok
while not _runBang(app):
logger.error("[BOOT] Fail to bang, try again in 15 seconds...")
......
......@@ -83,6 +83,12 @@ class FormatCommand(ConfigCommand):
help='Launch slapformat without delay'
' (default: %(default)s)')
ap.add_argument('--local',
default=False, # can have a default as it is not in .cfg
action="store_true",
help='Keep format data locally, do not post xml to master'
' (default: %(default)s)')
ap.add_argument('-n', '--dry_run',
default=False, # can have a default as it is not in .cfg
action="store_true",
......
......@@ -1408,6 +1408,7 @@ def do_format(conf):
computer.dump(path_to_xml=conf.computer_xml,
path_to_json=conf.computer_json,
logger=conf.logger)
if not conf.local:
conf.logger.info('Posting information to %r' % conf.master_url)
computer.send(conf)
conf.logger.info('slapos successfully prepared the computer.')
......
......@@ -90,6 +90,7 @@ SLAPGRID_PROMISE_FAIL = 2
PROMISE_TIMEOUT = 20
COMPUTER_PARTITION_TIMESTAMP_FILENAME = '.timestamp'
COMPUTER_PARTITION_REQUESTED_STATE_FILENAME = '.requested_state'
COMPUTER_PARTITION_LATEST_BANG_TIMESTAMP_FILENAME = '.slapos_latest_bang_timestamp'
COMPUTER_PARTITION_INSTALL_ERROR_FILENAME = '.slapgrid-%s-error.log'
COMPUTER_PARTITION_WAIT_LIST_FILENAME = '.slapos-report-wait-service-list'
......@@ -1125,6 +1126,10 @@ stderr_logfile_backups=1
instance_path,
COMPUTER_PARTITION_TIMESTAMP_FILENAME
)
partition_state_path = os.path.join(
instance_path,
COMPUTER_PARTITION_REQUESTED_STATE_FILENAME
)
parameter_dict = computer_partition.getInstanceParameterDict()
timestamp = parameter_dict.get('timestamp')
......@@ -1225,6 +1230,7 @@ stderr_logfile_backups=1
return
os.remove(timestamp_path)
os.remove(partition_state_path)
# Include Partition Logging
log_folder_path = "%s/.slapgrid/log" % instance_path
......@@ -1339,6 +1345,8 @@ stderr_logfile_backups=1
if timestamp:
with open(timestamp_path, 'w') as f:
f.write(str(timestamp))
with open(partition_state_path, 'w') as f:
f.write(str(computer_partition_state))
def FilterComputerPartitionList(self, computer_partition_list):
"""
......
......@@ -415,8 +415,13 @@ class TestCliBoot(CliMixin):
os.mkdir(os.path.join(instance_root, partition_base_name + '1'))
timestamp = os.path.join(
instance_root, partition_base_name + '1', '.timestamp')
requested_state_path = os.path.join(instance_root,
partition_base_name + '1',
'.requested_state')
with open(timestamp, 'w') as f:
f.write("1578552471")
with open(requested_state_path, 'w') as f:
f.write("started")
# make a config file using this instance root
with tempfile.NamedTemporaryFile(mode='w') as slapos_conf:
......@@ -441,17 +446,21 @@ class TestCliBoot(CliMixin):
patch(
'slapos.cli.boot.netifaces.ifaddresses',
return_value={socket.AF_INET6: ({'addr': '2000::1'},),},) as ifaddresses,\
patch('slapos.cli.boot._startComputerPartition', return_value=None) as start_partition,\
patch('slapos.cli.boot.launchSupervisord', return_value=None),\
patch('slapos.cli.boot._ping_hostname', return_value=1) as _ping_hostname:
app.run(('node', 'boot'))
# boot command runs as root
check_root_user.assert_called_once()
# Computer partition was started during boot
start_partition.assert_called_once()
# it waits for interface to have an IPv6 address
ifaddresses.assert_called_once_with('interface_name_from_config')
# then ping master hostname to wait for connectivity
_ping_hostname.assert_called_once_with('slap.vifib.com')
# then format and bang
SlapOSApp().run.assert_any_call(['node', 'format', '--now', '--verbose'])
SlapOSApp().run.assert_any_call(['node', 'format', '--now', '--local', '--verbose'])
SlapOSApp().run.assert_any_call(['node', 'bang', '-m', 'Reboot'])
# timestamp files have been removed
......@@ -473,6 +482,7 @@ class TestCliBoot(CliMixin):
patch('slapos.cli.boot.netifaces.ifaddresses',
side_effect=[net1, net2, net3]),\
patch('slapos.cli.boot._ping_hostname', return_value=0),\
patch('slapos.cli.boot._startComputerPartitionList', return_value=None) as start_partition,\
patch('slapos.cli.format.check_root_user', return_value=True),\
patch('slapos.cli.format.logging.FileHandler', return_value=logging.NullHandler()),\
patch('slapos.cli.bang.check_root_user', return_value=True),\
......@@ -482,6 +492,7 @@ class TestCliBoot(CliMixin):
app.run(('node', 'boot'))
check_root_user.assert_called_once()
start_partition.assert_called_once()
self.assertEqual(do_format.call_count, 3)
self.assertEqual(do_bang.call_count, 3)
......
......@@ -1401,7 +1401,7 @@ class TestSlapgridCPPartitionProcessing(MasterMixin, unittest.TestCase):
self.assertInstanceDirectoryListEqual(['0'])
partition = os.path.join(self.instance_root, '0')
six.assertCountEqual(self, os.listdir(partition),
['.slapgrid', '.timestamp', 'buildout.cfg', 'software_release', 'worked', '.slapos-retention-lock-delay'])
['.slapgrid', '.timestamp', '.requested_state', 'buildout.cfg', 'software_release', 'worked', '.slapos-retention-lock-delay'])
six.assertCountEqual(self, os.listdir(self.software_root), [instance.software.software_hash])
timestamp_path = os.path.join(instance.partition_path, '.timestamp')
self.setSlapgrid()
......@@ -1422,7 +1422,7 @@ class TestSlapgridCPPartitionProcessing(MasterMixin, unittest.TestCase):
self.assertInstanceDirectoryListEqual(['0'])
partition = os.path.join(self.instance_root, '0')
six.assertCountEqual(self, os.listdir(partition),
['.slapgrid', '.timestamp', 'buildout.cfg',
['.slapgrid', '.timestamp', '.requested_state', 'buildout.cfg',
'software_release', 'worked', '.slapos-retention-lock-delay'])
six.assertCountEqual(self, os.listdir(self.software_root), [instance.software.software_hash])
......@@ -1445,7 +1445,7 @@ class TestSlapgridCPPartitionProcessing(MasterMixin, unittest.TestCase):
self.assertInstanceDirectoryListEqual(['0'])
partition = os.path.join(self.instance_root, '0')
six.assertCountEqual(self, os.listdir(partition),
['.slapgrid', '.timestamp', 'buildout.cfg', 'software_release', 'worked', '.slapos-retention-lock-delay'])
['.slapgrid', '.timestamp', '.requested_state', 'buildout.cfg', 'software_release', 'worked', '.slapos-retention-lock-delay'])
six.assertCountEqual(self, os.listdir(self.software_root), [instance.software.software_hash])
instance.timestamp = str(int(timestamp) - 1)
self.assertEqual(self.launchSlapgrid(), slapgrid.SLAPGRID_SUCCESS)
......@@ -1463,7 +1463,7 @@ class TestSlapgridCPPartitionProcessing(MasterMixin, unittest.TestCase):
self.assertInstanceDirectoryListEqual(['0'])
partition = os.path.join(self.instance_root, '0')
six.assertCountEqual(self, os.listdir(partition),
['.slapgrid', '.timestamp', 'buildout.cfg', 'software_release', 'worked', '.slapos-retention-lock-delay'])
['.slapgrid', '.timestamp', '.requested_state', 'buildout.cfg', 'software_release', 'worked', '.slapos-retention-lock-delay'])
six.assertCountEqual(self, os.listdir(self.software_root), [instance.software.software_hash])
instance.timestamp = str(int(timestamp) + 1)
self.assertEqual(self.launchSlapgrid(), slapgrid.SLAPGRID_SUCCESS)
......@@ -1491,7 +1491,7 @@ class TestSlapgridCPPartitionProcessing(MasterMixin, unittest.TestCase):
self.assertInstanceDirectoryListEqual(['0'])
partition = os.path.join(self.instance_root, '0')
six.assertCountEqual(self, os.listdir(partition),
['.slapgrid', '.timestamp', 'buildout.cfg', 'software_release', 'worked', '.slapos-retention-lock-delay'])
['.slapgrid', '.timestamp', '.requested_state', 'buildout.cfg', 'software_release', 'worked', '.slapos-retention-lock-delay'])
six.assertCountEqual(self, os.listdir(self.software_root),
[instance.software.software_hash])
instance.timestamp = None
......@@ -1523,7 +1523,7 @@ class TestSlapgridCPPartitionProcessing(MasterMixin, unittest.TestCase):
self.launchSlapgrid()
partition = os.path.join(self.instance_root, '0')
six.assertCountEqual(self, os.listdir(partition),
['.slapgrid', '.timestamp', 'buildout.cfg',
['.slapgrid', '.timestamp', '.requested_state', 'buildout.cfg',
'software_release', 'worked', '.slapos-retention-lock-delay'])
time.sleep(2)
......@@ -1533,7 +1533,7 @@ class TestSlapgridCPPartitionProcessing(MasterMixin, unittest.TestCase):
self.launchSlapgrid()
six.assertCountEqual(self, os.listdir(partition),
['.slapgrid', '.timestamp', 'buildout.cfg',
['.slapgrid', '.timestamp', '.requested_state', 'buildout.cfg',
'software_release', 'worked', '.slapos-retention-lock-delay'])
def test_one_partition_periodicity_from_file_does_not_disturb_others(self):
......@@ -1710,6 +1710,43 @@ class TestSlapgridCPPartitionProcessing(MasterMixin, unittest.TestCase):
self.launchSlapgrid()
self.assertEqual(mock_method.call_count, 2)
def test_partition_requested_state_created(self):
computer = self.getTestComputerClass()(self.software_root, self.instance_root)
with httmock.HTTMock(computer.request_handler):
instance = computer.instance_list[0]
timestamp = str(int(time.time()))
instance.timestamp = timestamp
self.assertEqual(self.grid.processComputerPartitionList(), slapgrid.SLAPGRID_SUCCESS)
self.assertInstanceDirectoryListEqual(['0'])
partition = os.path.join(self.instance_root, '0')
six.assertCountEqual(self, os.listdir(partition),
['.slapgrid', '.timestamp', '.requested_state', 'buildout.cfg',
'software_release', 'worked', '.slapos-retention-lock-delay'])
six.assertCountEqual(self, os.listdir(self.software_root), [instance.software.software_hash])
requested_state_path = os.path.join(instance.partition_path, '.requested_state')
with open(requested_state_path) as f:
self.assertEqual(f.read(), slapgrid.COMPUTER_PARTITION_STOPPED_STATE)
self.assertEqual(instance.sequence,
['/stoppedComputerPartition'])
def test_partition_requested_state_not_created_if_failed(self):
computer = self.getTestComputerClass()(self.software_root, self.instance_root)
with httmock.HTTMock(computer.request_handler):
instance = computer.instance_list[0]
timestamp = str(int(time.time()))
instance.timestamp = timestamp
instance.software.setBuildout("""#!/bin/sh
exit 3""")
self.assertEqual(self.grid.processComputerPartitionList(), slapgrid.SLAPGRID_FAIL)
self.assertInstanceDirectoryListEqual(['0'])
self.assertEqual(instance.sequence,
['/softwareInstanceError'])
requested_state_path = os.path.join(instance.partition_path, '.requested_state')
self.assertFalse(os.path.exists(requested_state_path))
def test_one_partition_buildout_fail_does_not_disturb_others(self):
"""
1. We set up two instance one using a corrupted buildout
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment