Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
S
slapos.core
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Labels
Merge Requests
21
Merge Requests
21
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Analytics
Analytics
CI / CD
Repository
Value Stream
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Jobs
Commits
Open sidebar
nexedi
slapos.core
Commits
1a44bfe4
Commit
1a44bfe4
authored
Mar 31, 2023
by
Xavier Thompson
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
slapgrid: Handle connection loss to master
parent
0f2564f8
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
108 additions
and
15 deletions
+108
-15
slapos/grid/slapgrid.py
slapos/grid/slapgrid.py
+29
-1
slapos/tests/test_slapgrid.py
slapos/tests/test_slapgrid.py
+79
-14
No files found.
slapos/grid/slapgrid.py
View file @
1a44bfe4
...
...
@@ -53,6 +53,8 @@ if sys.version_info < (2, 6):
warnings
.
warn
(
'Used python version (%s) is old and has problems with'
' IPv6 connections'
%
sys
.
version
.
split
(
'
\
n
'
)[
0
])
from
requests.exceptions
import
RequestException
from
lxml
import
etree
from
slapos
import
manager
as
slapmanager
...
...
@@ -69,7 +71,8 @@ from slapos.grid.SlapObject import Software, Partition
from
slapos.grid.svcbackend
import
(
launchSupervisord
,
createSupervisordConfiguration
,
_getSupervisordConfigurationDirectory
,
_getSupervisordSocketPath
)
_getSupervisordSocketPath
,
getSupervisorRPC
)
from
slapos.grid.utils
import
(
md5digest
,
dropPrivileges
,
SlapPopen
,
...
...
@@ -90,6 +93,7 @@ COMPUTER_PARTITION_STOPPED_STATE = 'stopped'
SLAPGRID_SUCCESS
=
0
SLAPGRID_FAIL
=
1
SLAPGRID_PROMISE_FAIL
=
2
SLAPGRID_OFFLINE_SUCCESS
=
3
PROMISE_TIMEOUT
=
20
COMPUTER_PARTITION_TIMESTAMP_FILENAME
=
'.timestamp'
...
...
@@ -1423,6 +1427,12 @@ stderr_logfile_backups=1
return
filtered_computer_partition_list
def processComputerPartitionList(self):
    """
    Process the computer partitions, falling back to offline mode.

    The online processing is attempted first. If it raises a
    RequestException, the offline processing is run instead and its
    status is returned. Any other exception propagates to the caller.

    Returns the status code produced by whichever of the two
    processing methods completed.
    """
    try:
        status = self.processComputerPartitionListOnline()
    except RequestException:
        # NOTE(review): presumably raised when the master cannot be
        # reached -- confirm against the slap client library.
        status = self.processComputerPartitionListOffline()
    return status
def
processComputerPartitionListOnline
(
self
):
"""
Will start supervisord and process each Computer Partition.
"""
...
...
@@ -1449,6 +1459,10 @@ stderr_logfile_backups=1
# Process the partition itself
self
.
processComputerPartition
(
computer_partition
)
# Handle connection loss at the next level
except
RequestException
:
raise
# Send log before exiting
except
(
SystemExit
,
KeyboardInterrupt
):
computer_partition
.
error
(
traceback
.
format_exc
(),
logger
=
self
.
logger
)
...
...
@@ -1505,6 +1519,20 @@ stderr_logfile_backups=1
return
SLAPGRID_PROMISE_FAIL
return
SLAPGRID_SUCCESS
def processComputerPartitionListOffline(self):
    """
    Ask supervisord to start all processes without involving the master.

    The supervisord socket path is derived from the instance root, then
    startAllProcesses is called through the supervisor RPC.

    Returns SLAPGRID_OFFLINE_SUCCESS when the RPC call went through, or
    SLAPGRID_FAIL when anything raised on the way (the traceback is
    logged in that case).
    """
    log = self.logger
    log.info('Processing computer partitions offline...')
    try:
        socket_path = _getSupervisordSocketPath(self.instance_root, log)
        with getSupervisorRPC(socket_path) as rpc:
            # NOTE(review): False is presumably supervisor's `wait`
            # argument, i.e. do not block until processes are up -- confirm.
            rpc.startAllProcesses(False)
    except Exception:
        # Best-effort mode: report the failure through the return code
        # instead of letting the exception escape.
        log.exception('Error in offline mode while starting partitions:')
        return SLAPGRID_FAIL
    else:
        return SLAPGRID_OFFLINE_SUCCESS
def
processPromiseList
(
self
):
"""
Will check and process promises for each Computer Partition.
...
...
slapos/tests/test_slapgrid.py
View file @
1a44bfe4
...
...
@@ -388,7 +388,8 @@ class ComputerForTest(object):
software_root
,
instance_root
,
instance_amount
=
1
,
software_amount
=
1
):
software_amount
=
1
,
status_code
=
200
):
"""
Will set up instances, software and sequence
"""
...
...
@@ -397,6 +398,7 @@ class ComputerForTest(object):
self
.
software_amount
=
software_amount
self
.
software_root
=
software_root
self
.
instance_root
=
instance_root
self
.
status_code
=
status_code
self
.
ip_address_list
=
[
(
'interface1'
,
'10.0.8.3'
),
(
'interface2'
,
'10.0.8.4'
),
...
...
@@ -425,18 +427,18 @@ class ComputerForTest(object):
and
'computer_id'
in
qs
):
slap_computer
=
self
.
getComputer
(
qs
[
'computer_id'
][
0
])
return
{
'status_code'
:
200
,
'status_code'
:
self
.
status_code
,
'content'
:
dumps
(
slap_computer
)
}
elif
url
.
path
==
'/getHostingSubscriptionIpList'
:
ip_address_list
=
self
.
ip_address_list
return
{
'status_code'
:
200
,
'status_code'
:
self
.
status_code
,
'content'
:
dumps
(
ip_address_list
)
}
elif
url
.
path
==
'/getComputerPartitionCertificate'
:
return
{
'status_code'
:
200
,
'status_code'
:
self
.
status_code
,
'content'
:
dumps
({
'certificate'
:
'SLAPOS_cert'
,
'key'
:
'SLAPOS_key'
})
}
if
req
.
method
==
'POST'
and
'computer_partition_id'
in
qs
:
...
...
@@ -445,17 +447,17 @@ class ComputerForTest(object):
instance
.
header_list
.
append
(
req
.
headers
)
if
url
.
path
==
'/startedComputerPartition'
:
instance
.
state
=
'started'
return
{
'status_code'
:
200
}
return
{
'status_code'
:
self
.
status_code
}
if
url
.
path
==
'/stoppedComputerPartition'
:
instance
.
state
=
'stopped'
return
{
'status_code'
:
200
}
return
{
'status_code'
:
self
.
status_code
}
if
url
.
path
==
'/destroyedComputerPartition'
:
instance
.
state
=
'destroyed'
return
{
'status_code'
:
200
}
return
{
'status_code'
:
self
.
status_code
}
if
url
.
path
==
'/softwareInstanceBang'
:
return
{
'status_code'
:
200
}
return
{
'status_code'
:
self
.
status_code
}
if
url
.
path
==
"/updateComputerPartitionRelatedInstanceList"
:
return
{
'status_code'
:
200
}
return
{
'status_code'
:
self
.
status_code
}
if
url
.
path
==
'/softwareInstanceError'
:
instance
.
error_log
=
'
\
n
'
.
join
(
[
...
...
@@ -465,18 +467,18 @@ class ComputerForTest(object):
]
)
instance
.
error
=
True
return
{
'status_code'
:
200
}
return
{
'status_code'
:
self
.
status_code
}
elif
req
.
method
==
'POST'
and
'url'
in
qs
:
# XXX hardcoded to first software release!
software
=
self
.
software_list
[
0
]
software
.
sequence
.
append
(
url
.
path
)
if
url
.
path
==
'/availableSoftwareRelease'
:
return
{
'status_code'
:
200
}
return
{
'status_code'
:
self
.
status_code
}
if
url
.
path
==
'/buildingSoftwareRelease'
:
return
{
'status_code'
:
200
}
return
{
'status_code'
:
self
.
status_code
}
if
url
.
path
==
'/destroyedSoftwareRelease'
:
return
{
'status_code'
:
200
}
return
{
'status_code'
:
self
.
status_code
}
if
url
.
path
==
'/softwareReleaseError'
:
software
.
error_log
=
'
\
n
'
.
join
(
[
...
...
@@ -486,7 +488,7 @@ class ComputerForTest(object):
]
)
software
.
error
=
True
return
{
'status_code'
:
200
}
return
{
'status_code'
:
self
.
status_code
}
else
:
return
{
'status_code'
:
500
}
...
...
@@ -1021,6 +1023,69 @@ exit 1
'/stoppedComputerPartition'
])
self
.
assertEqual
(
'stopped'
,
instance
.
state
)
def test_one_partition_started_no_master(self):
    """
    Offline fallback when the test master answers 503 from the start.

    The test computer is created with status_code=503. Processing the
    partition list must then report SLAPGRID_OFFLINE_SUCCESS, leave the
    partition directory empty, and have issued only the
    /getFullComputerInformation request.
    """
    computer_class = self.getTestComputerClass()
    computer = computer_class(
        self.software_root,
        self.instance_root,
        status_code=503,
    )
    with httmock.HTTMock(computer.request_handler):
        instance = computer.instance_list[0]
        instance.requested_state = 'started'
        instance.software.setBuildout()

        status = self.grid.processComputerPartitionList()
        self.assertEqual(status, slapgrid.SLAPGRID_OFFLINE_SUCCESS)
        self.assertInstanceDirectoryListEqual(['0'])

        # buildout hasn't run
        partition_content = os.listdir(instance.partition_path)
        six.assertCountEqual(self, partition_content, [])

        software_content = os.listdir(self.software_root)
        six.assertCountEqual(
            self, software_content, [instance.software.software_hash])

        self.assertEqual(computer.sequence, ['/getFullComputerInformation'])
        # No state was ever reported for the partition.
        self.assertEqual(instance.state, None)
def test_one_partition_started_after_master_connection_loss(self):
    """
    A started partition keeps being run once the master starts failing.

    First pass: the test master answers normally, the partition is
    processed, and its 'runner' script runs once. Second pass: the test
    master's status code is switched to 503, processing must report
    SLAPGRID_OFFLINE_SUCCESS, and the runner script must have run a
    second time.
    """
    computer = self.getTestComputerClass()(
        self.software_root, self.instance_root)
    partition = computer.instance_list[0]
    partition.requested_state = 'started'
    partition.software.setBuildout()

    # Install an executable script under etc/run that leaves a marker
    # file and a line of output each time it runs.
    script_dir = os.path.join(partition.partition_path, 'etc', 'run')
    os.makedirs(script_dir)
    script_path = os.path.join(script_dir, 'runner')
    with open(script_path, 'w') as script:
        script.write("#!/bin/sh\necho 'Working'\ntouch 'runner_worked'")
        os.fchmod(script.fileno(), 0o755)

    runner_worked_file = os.path.join(
        partition.partition_path, 'runner_worked')
    runner_log_path = os.path.join(
        partition.partition_path, '.0_runner.log')

    def assertRunnerWorked():
        # Poll for the marker file (50 tries, 0.1s apart); only fail
        # once every try has been used up.
        for _ in range(50):
            if os.path.exists(runner_worked_file):
                break
            time.sleep(0.1)
        else:
            self.assertTrue(os.path.exists(runner_worked_file))

    def readRunnerLog():
        with open(runner_log_path) as log_file:
            return log_file.read()

    # First pass: master reachable.
    with httmock.HTTMock(computer.request_handler):
        status = self.grid.processComputerPartitionList()
        self.assertEqual(status, slapgrid.SLAPGRID_SUCCESS)
        self.assertInstanceDirectoryListEqual(['0'])
        assertRunnerWorked()
        expected_content = [
            '.slapgrid',
            '.0_runner.log',
            'buildout.cfg',
            'etc',
            'runner_worked',
            'software_release',
            'worked',
            '.slapos-retention-lock-delay',
        ]
        six.assertCountEqual(
            self, os.listdir(partition.partition_path), expected_content)
        self.assertEqual(readRunnerLog(), 'Working\n')
        self.assertEqual(partition.state, 'started')

    computer.status_code = 503  # connection loss
    # Remove the marker so the second pass has to recreate it.
    os.unlink(runner_worked_file)

    # Second pass: master answers 503.
    with httmock.HTTMock(computer.request_handler):
        status = self.grid.processComputerPartitionList()
        self.assertEqual(status, slapgrid.SLAPGRID_OFFLINE_SUCCESS)
        self.assertInstanceDirectoryListEqual(['0'])
        assertRunnerWorked()
        self.assertEqual(readRunnerLog(), 'Working\n' * 2)
        self.assertEqual(computer.sequence, [
            '/getFullComputerInformation',
            '/getComputerPartitionCertificate',
            '/startedComputerPartition',
            '/getComputerPartitionCertificate',
            # /getFullComputerInformation is cached
        ])
class
TestSlapgridCPWithMasterWatchdog
(
MasterMixin
,
unittest
.
TestCase
):
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment