Commit 4e44bf95 authored by Xavier Thompson

slapgrid: Fix connectionless instance processing

See merge request nexedi/slapos.core!572
parents be11fd4f c9b394c1
Pipeline #29855 failed with stage
in 0 seconds
...@@ -59,6 +59,7 @@ from requests.exceptions import RequestException ...@@ -59,6 +59,7 @@ from requests.exceptions import RequestException
from lxml import etree from lxml import etree
from slapos import manager as slapmanager from slapos import manager as slapmanager
from slapos.slap.exception import ConnectionError
from slapos.slap.slap import NotFoundError from slapos.slap.slap import NotFoundError
from slapos.slap.slap import ServerError from slapos.slap.slap import ServerError
from slapos.slap.slap import COMPUTER_PARTITION_REQUEST_LIST_TEMPLATE_FILENAME from slapos.slap.slap import COMPUTER_PARTITION_REQUEST_LIST_TEMPLATE_FILENAME
...@@ -1425,7 +1426,7 @@ stderr_logfile_backups=1 ...@@ -1425,7 +1426,7 @@ stderr_logfile_backups=1
def processComputerPartitionList(self): def processComputerPartitionList(self):
try: try:
return self.processComputerPartitionListOnline() return self.processComputerPartitionListOnline()
except RequestException: except (RequestException, ConnectionError):
return self.processComputerPartitionListOffline() return self.processComputerPartitionListOffline()
def processComputerPartitionListOnline(self): def processComputerPartitionListOnline(self):
...@@ -1456,7 +1457,7 @@ stderr_logfile_backups=1 ...@@ -1456,7 +1457,7 @@ stderr_logfile_backups=1
self.processComputerPartition(computer_partition) self.processComputerPartition(computer_partition)
# Handle connection loss at the next level # Handle connection loss at the next level
except RequestException: except (RequestException, ConnectionError):
raise raise
# Send log before exiting # Send log before exiting
...@@ -1517,6 +1518,40 @@ stderr_logfile_backups=1 ...@@ -1517,6 +1518,40 @@ stderr_logfile_backups=1
def processComputerPartitionListOffline(self): def processComputerPartitionListOffline(self):
self.logger.info('Processing computer partitions offline...') self.logger.info('Processing computer partitions offline...')
# Backwards compatibility: remove stopped services
for name in os.listdir(self.instance_root):
instance_path = os.path.join(self.instance_root, name)
state_path = os.path.join(instance_path, '.requested_state')
try:
with open(state_path) as f:
requested_state = f.read()
os.remove(state_path)
except (IOError, OSError) as e:
if e.errno != errno.ENOENT and e.errno != errno.ENOTDIR:
raise
requested_state = None
if requested_state == 'stopped':
local_partition = Partition(
software_path=None,
instance_path=instance_path,
shared_part_list='',
supervisord_partition_configuration_dir=(
_getSupervisordConfigurationDirectory(self.instance_root)),
supervisord_socket=self.supervisord_socket,
computer_partition=None,
computer_id=self.computer_id,
partition_id=name,
server_url=self.master_url,
software_release_url='toto',
certificate_repository_path=self.certificate_repository_path,
buildout=self.buildout,
buildout_debug=self.buildout_debug,
logger=self.logger,
instance_storage_home=self.instance_storage_home,
ipv4_global_network=self.ipv4_global_network,
)
local_partition.stop()
# Offline: start all existing services
try: try:
supervisord_socket_path = _getSupervisordSocketPath( supervisord_socket_path = _getSupervisordSocketPath(
self.instance_root, self.instance_root,
......
...@@ -308,7 +308,9 @@ class TestBasicSlapgridCP(BasicMixin, unittest.TestCase): ...@@ -308,7 +308,9 @@ class TestBasicSlapgridCP(BasicMixin, unittest.TestCase):
def test_no_master(self): def test_no_master(self):
os.mkdir(self.software_root) os.mkdir(self.software_root)
os.mkdir(self.instance_root) os.mkdir(self.instance_root)
self.assertRaises(ConnectionError, self.grid.processComputerPartitionList) self.assertEqual(
self.grid.processComputerPartitionList(),
slapgrid.SLAPGRID_OFFLINE_SUCCESS)
def test_environment_variable_HOME(self): def test_environment_variable_HOME(self):
# When running instance, $HOME is set to the partition path # When running instance, $HOME is set to the partition path
...@@ -1087,6 +1089,71 @@ exit 1 ...@@ -1087,6 +1089,71 @@ exit 1
'/getComputerPartitionCertificate' # /getFullComputerInformation is cached '/getComputerPartitionCertificate' # /getFullComputerInformation is cached
]) ])
# Regression test: a partition that was stopped through a '.requested_state'
# file (written by an older version) must stay stopped when slapgrid falls
# back to offline processing, while the other partition is started again.
# NOTE(review): indentation was lost in this rendering of the diff; confirm
# the nesting of the statements below against the real test file.
def test_stopped_partition_remains_stopped_after_master_connection_loss(self):
# Two partitions, both requested as 'started'.
computer = self.getTestComputerClass()(
self.software_root, self.instance_root, instance_amount=2)
for i in range(2):
partition = computer.instance_list[i]
partition.requested_state = 'started'
partition.software.setBuildout()
# Each partition gets an executable etc/run/runner script that creates a
# 'runner_worked' marker file when it is executed.
# NOTE(review): the marker path is relative; presumably the script runs with
# the partition directory as working directory -- confirm.
run_path = os.path.join(partition.partition_path, 'etc', 'run')
os.makedirs(run_path)
with open(os.path.join(run_path, 'runner'), 'w') as f:
f.write("#!/bin/sh\necho 'Working'\ntouch 'runner_worked'")
os.fchmod(f.fileno(), 0o755)
# Partition 0 is the control (stays started); partition 1 is the one that
# gets stopped below.
control_partition = computer.instance_list[0]
test_partition = computer.instance_list[1]
control_file = os.path.join(control_partition.partition_path, 'runner_worked')
test_file = os.path.join(test_partition.partition_path, 'runner_worked')
# Poll for the marker file: up to 50 checks, 0.1 s apart. If it never
# appears, the for/else branch fails the test through assertTrue.
def assertRunnerWorked(path):
for _ in range(50):
if os.path.exists(path):
break
time.sleep(0.1)
else:
self.assertTrue(os.path.exists(path))
# First run: expected to return SLAPGRID_SUCCESS, with both runner scripts
# executed.
with httmock.HTTMock(computer.request_handler):
self.assertEqual(self.grid.processComputerPartitionList(), slapgrid.SLAPGRID_SUCCESS)
self.assertInstanceDirectoryListEqual(['0', '1'])
assertRunnerWorked(control_file)
assertRunnerWorked(test_file)
# Check the exact content of each partition directory after the first run.
for i in range(2):
six.assertCountEqual(self, os.listdir(computer.instance_list[i].partition_path),
['.slapgrid', '.%d_runner.log' % i, 'buildout.cfg', 'etc',
'runner_worked', 'software_release', 'worked',
'.slapos-retention-lock-delay'])
self.assertEqual(control_partition.state, 'started')
self.assertEqual(test_partition.state, 'started')
# simulate stopping the partition with old version
test_partition.state = 'stopped'
state_path = os.path.join(test_partition.partition_path, '.requested_state')
with open(state_path, 'w') as f:
f.write('stopped')
# NOTE(review): presumably makes the mocked master answer every request with
# HTTP 503 -- confirm against the test computer's request_handler.
computer.status_code = 503 # connection loss
# Remove both marker files so that a fresh execution of each runner script
# can be detected after the second run.
os.unlink(control_file)
os.unlink(test_file)
# Second run: expected to fall back to offline processing and return
# SLAPGRID_OFFLINE_SUCCESS.
with httmock.HTTMock(computer.request_handler):
self.assertEqual(self.grid.processComputerPartitionList(), slapgrid.SLAPGRID_OFFLINE_SUCCESS)
self.assertInstanceDirectoryListEqual(['0', '1'])
# The control runner must have run again; the stopped partition's runner must
# not have. The negative check is made once, right after the control marker
# appears, without any further waiting.
assertRunnerWorked(control_file)
self.assertFalse(os.path.exists(test_file))
# Requests the mocked master is expected to have recorded over both runs.
self.assertEqual(computer.sequence, [
'/getFullComputerInformation',
'/getComputerPartitionCertificate',
'/startedComputerPartition',
'/getComputerPartitionCertificate',
'/startedComputerPartition',
'/getComputerPartitionCertificate' # /getFullComputerInformation is cached
])
class TestSlapgridCPWithMasterWatchdog(MasterMixin, unittest.TestCase): class TestSlapgridCPWithMasterWatchdog(MasterMixin, unittest.TestCase):
def setUp(self): def setUp(self):
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment