Commit 32459e14 authored by Alain Takoudjou

monitor sr: add promise to check cpu load and free memory on server

parent 0b1dab76
......@@ -10,6 +10,8 @@ parts =
monitor-collect-csv-wrapper
fluentd-wrapper
monitor-base
monitor-check-memory-usage
monitor-check-cpu-usage
publish-connection-information
......@@ -99,11 +101,38 @@ command-line =
wrapper-path = ${monitor-directory:reports}/monitor-collect-csv-dump
parameters-extra = true
[monitor-check-cpu-usage]
# Promise wrapper: runs the system health script in "cpu" mode and passes it
# the file that holds the accepted CPU load per core.
recipe = slapos.cookbook:wrapper
wrapper-path = ${monitor-directory:promises}/system-CPU-load-check
command-line = ${monitor-directory:bin}/python {{ monitor_check_system_health }} cpu ${init-monitor-parameters:cpu-load-file}
[monitor-check-memory-usage]
# Promise wrapper: runs the system health script in "mem" mode. Arguments are
# the file holding the minimum free-memory percentage and the directory given
# to the script as its base directory.
recipe = slapos.cookbook:wrapper
wrapper-path = ${monitor-directory:promises}/system-MEMORY-usage-check
command-line = ${monitor-directory:bin}/python {{ monitor_check_system_health }} mem ${init-monitor-parameters:mem-free-file} ${directory:monitor}
[publish-connection-information]
recipe = slapos.cookbook:publish
# Location of the server logs, built from the monitor base URL and the
# instance's private hash.
server_log_url = ${publish:monitor-base-url}/${slap-configuration:private-hash}/
# Link to the monitoring web application settings page, with this instance's
# monitor URL and credentials filled in as URL parameters.
monitor-setup-url = https://monitor.app.officejs.com/#page=settings_configurator&url=${publish:monitor-url}&username=${publish:monitor-user}&password=${publish:monitor-password}
[monitor-instance-parameter]
# One entry per line: "file <parameter-name> <path>".
# NOTE(review): presumably each entry exposes the content of <path> as an
# editable monitor parameter named <parameter-name> - confirm against the
# monitor stack that consumes instance-configuration.
instance-configuration =
  file max-cpu-load-per-core ${init-monitor-parameters:cpu-load-file}
  file min-free-mem-percent ${init-monitor-parameters:mem-free-file}
[init-monitor-parameters]
# Seeds the threshold files read by the CPU and memory promises.
recipe = plone.recipe.command
# Accepted CPU load per core (seeded with 1.5).
cpu-load-file = ${directory:monitor}/cpu-load-tolerance
# Minimum percentage of free memory (seeded with 7.0).
mem-free-file = ${directory:monitor}/mem-free-limit
# A file is only written when it is missing or empty ([ ! -s ]), so a value
# already stored in it is never overwritten. Redirect targets are quoted like
# the tests above them so both agree when the path contains blanks.
command =
  if [ ! -s "${:cpu-load-file}" ]; then
    echo "1.5" > "${:cpu-load-file}"
  fi
  if [ ! -s "${:mem-free-file}" ]; then
    echo "7.0" > "${:mem-free-file}"
  fi
[slap-configuration]
recipe = slapos.cookbook:slapconfiguration.serialised
computer = ${slap-connection:computer-id}
......
......@@ -24,6 +24,7 @@ context = key buildout buildout:bin-directory
raw fluentd_location ${fluentd:location}
raw fluent_conf_output ${fluentd-agent-conf:output}
raw monitor_collect_csv_dump ${monitor-collect-csv-dump:output}
raw monitor_check_system_health ${monitor-system-health:output}
mode = 0644
[instance-base-distributor]
......@@ -51,4 +52,3 @@ partition = $${slap-connection:partition-id}
url = $${slap-connection:server-url}
key = $${slap-connection:key-file}
cert = $${slap-connection:cert-file}
#!/usr/bin/env python
import json
import os
import re
import subprocess
import sys
# Argument vectors of the external tools this script runs through subprocess.

# Prints the number of processing units; used to scale the load threshold.
cpu_core_cmd_list = ['nproc']
# A single batch-mode iteration of top, truncated to its first five lines.
cpu_command_list = ['top', '-n', '1', '-b']
head_command_list = ['head', '-n', '5']
# Memory and swap figures expressed in mebibytes.
mem_command_list = ['free', '-m']
def cpu_usage(tolerance=1.5):
    """Compare the system load with a per-core tolerance.

    The last load figure printed by ``uptime`` is checked against
    ``tolerance`` multiplied by the number of cores reported by ``nproc``.

    tolerance -- accepted load per core; 1.5 accepts up to 150% CPU load.

    Returns a message (the three load figures followed by the first lines
    of ``top``) when the load is above the threshold, None otherwise.
    Raises subprocess.CalledProcessError if one of the tools fails.
    """
    uptime_result = subprocess.check_output(['uptime'])
    # the uptime line ends with the three load averages
    line = uptime_result.strip().split(' ')
    load, load5, long_load = line[-3:]
    core_count = int(subprocess.check_output(cpu_core_cmd_list).strip())
    threshold = core_count * tolerance
    if float(long_load) > threshold:
        # display top statistics
        top = subprocess.Popen(cpu_command_list, stdout=subprocess.PIPE)
        try:
            result = subprocess.check_output(head_command_list, stdin=top.stdout)
        finally:
            # Close our end of the pipe BEFORE waiting: once head has exited,
            # top must find no reader left, otherwise it could block on a
            # full pipe and wait() would never return. wait() then reaps the
            # child instead of leaving a zombie behind.
            top.stdout.close()
            top.wait()
        message = "CPU load is high: %s %s %s\n\n" % (load, load5, long_load)
        message += result
        return message
def check_last_result(file, last_value, threshold=7.0, elt_count=5):
    """Record a new sample and return the rolling average when it is too low.

    ``file`` stores the previous samples as blank-separated numbers. The new
    sample is appended, the history is trimmed to the newest ``elt_count``
    samples and written back.

    file       -- path of the history file (created when missing).
    last_value -- newest sample, stored with str().
    threshold  -- the average is only reported when strictly below it.
    elt_count  -- number of samples in the averaging window.

    Returns the average rounded to two decimals when the file already held
    at least ``elt_count`` samples and the average is below ``threshold``;
    returns 0.0 in every other case (not enough history, or average fine).
    Raises ValueError if the file contains a non-numeric token.
    """
    mem_average = 0.0
    value_list = []
    if os.path.exists(file):
        with open(file) as f:
            # split() without argument ignores repeated blanks and newlines,
            # so an empty file gives an empty history instead of [''].
            value_list = f.read().split()
        size = len(value_list)
        value_list.append(str(last_value))
        if size >= elt_count:
            # keep only the newest elt_count samples
            value_list = value_list[-elt_count:]
            # Divide by the number of samples actually summed; dividing by
            # the pre-trim count understated the average whenever the file
            # held more than elt_count values.
            average = sum(float(v) for v in value_list) / float(len(value_list))
            if average < threshold:
                mem_average = round(average, 2)
    else:
        value_list.append(str(last_value))
    with open(file, 'w') as f:
        f.write(' '.join(value_list))
    return mem_average
def memory_usage(storage_file, threshold=7.0, elt_count=5):
    """Check free memory and free swap using the output of ``free -m``.

    The percentage of free memory is recorded through check_last_result();
    a message is returned when the rolling average it reports is below
    ``threshold``. Otherwise swap is checked: when more than 1 MiB of swap
    is configured and less than ``threshold * 1.7`` percent of it is free,
    a swap message is returned.

    storage_file -- file where check_last_result() keeps the history.
    threshold    -- minimum accepted percentage of free memory.
    elt_count    -- number of samples averaged by check_last_result().

    Returns the alert message, or None when nothing is wrong. Because
    check_last_result() uses 0.0 for "nothing to report", an average of
    exactly 0.0 does not raise the memory alert.
    """
    mem_stats = subprocess.check_output(mem_command_list)
    # Collapse runs of blanks and drop empty rows so rows can be indexed.
    rows = [re.sub(r'\s+', ' ', row).strip() for row in mem_stats.split('\n')]
    rows = [row for row in rows if row]
    mem_fields = rows[1].split(' ')
    mem_total = float(mem_fields[1])
    if len(rows) > 2 and rows[2].startswith('-/+'):
        # Legacy layout: a "-/+ buffers/cache:" row follows the Mem row and
        # ends with the memory free once buffers/cache are discounted; the
        # Swap row comes after it.
        mem_free = float(rows[2].split(' ')[-1])
        swap_row = rows[3] if len(rows) > 3 else ''
    else:
        # Newer layout: no "-/+" row; the Mem row is directly followed by
        # the Swap row.
        # NOTE(review): assumes the last Mem column is "available", as
        # printed by recent procps-ng - confirm on the target systems.
        mem_free = float(mem_fields[-1])
        swap_row = rows[2] if len(rows) > 2 else ''
    if mem_free == 0.0:
        mem_available = 0.0
    else:
        mem_available = round(mem_free * 100 / (mem_total * 1.0), 2)
    average = check_last_result(
        storage_file,
        mem_available,
        threshold=threshold,
        elt_count=elt_count)
    if average != 0.0 and average < threshold:
        # mem used at (threshold)% at least
        message = "Memory usage is high. %s%% is available (%s%% for last %s minutes).\n\n" % (
            mem_available, average, elt_count)
        message += mem_stats
        return message
    swap_fields = swap_row.split(' ')
    if len(swap_fields) < 4:
        # no usable swap row in the output: nothing more to check
        return None
    swap_total = float(swap_fields[1])
    swap_free = float(swap_fields[3])
    if swap_total > 1:
        if swap_free == 0.0:
            swap_available = 0.0
        else:
            # Already a percentage; the former extra "* 100" inflated it a
            # hundredfold, which all but disabled the check below.
            swap_available = round(swap_free * 100 / (swap_total * 1.0), 2)
        if swap_available < threshold*1.7:
            message = "Memory SWAP usage is high. %s%% is available.\n\n" % swap_available
            message += mem_stats
            return message
if __name__ == '__main__':
    # Command line: <script> [cpu | mem] CONFIG_FILE [BASE_DIR]
    #   CONFIG_FILE holds the threshold as a single positive number.
    #   BASE_DIR (mem only) is where the memory history file is kept.
    # Exit status: 0 = check passed, 1 = check failed (message printed),
    #              2 = missing arguments, 3 = unknown check type.
    if len(sys.argv) < 2:
        print("Usage: %s [cpu | mem] CONFIG_FILE [BASE_DIR]" % os.path.basename(sys.argv[0]))
        sys.exit(2)
    check_type = sys.argv[1]
    threshold = None
    if len(sys.argv) >= 3:
        config_file = sys.argv[2]
        if os.path.exists(config_file):
            with open(config_file) as f:
                try:
                    threshold = float(f.read())
                    if not threshold > 0:
                        # zero, negative or NaN: use the built-in default
                        threshold = None
                except ValueError:
                    # not a number: deliberately fall back to the default
                    pass
    if check_type == "cpu":
        result = cpu_usage(threshold or 1.5)
        if result:
            print(result)
            sys.exit(1)
    elif check_type == "mem":
        directory = ""
        if len(sys.argv) >= 4:
            directory = sys.argv[3]
        # isdir() is False for a missing path too, so it covers both cases
        if not os.path.isdir(directory):
            directory = os.getcwd()
        storage_file = os.path.join(directory, 'mem-usage.mo')
        result = memory_usage(storage_file, threshold=(threshold or 4.0), elt_count=10)
        if result:
            print(result)
            sys.exit(1)
    else:
        sys.exit(3)
    sys.exit(0)
\ No newline at end of file
......@@ -25,14 +25,14 @@ parts =
recipe = slapos.recipe.template
url = ${:_profile_base_location_}/instance.cfg
output = ${buildout:directory}/template.cfg
md5sum = 0c5e94beede86a91d7b01f61a7290f86
md5sum = 3ff5fb2710bf0ea84632c6d6d3894dd9
mode = 0644
[template-monitor]
# Downloads the monitor instance template into the buildout directory.
recipe = slapos.recipe.build:download
url = ${:_profile_base_location_}/instance-monitor.cfg.jinja2
destination = ${buildout:directory}/template-base-monitor.cfg
# Checksum of instance-monitor.cfg.jinja2; must be refreshed whenever that
# template changes. Only the post-commit value is kept, since a key may
# appear once per section.
md5sum = c513cf70b8671840b2ba049d2826a661
mode = 0644
[template-monitor-distributor]
......@@ -77,6 +77,13 @@ filename = collect_csv_dump.py
output = ${:destination}/${:filename}
md5sum = cad2402bbd21907cfed6bc5af8c5d3ab
[monitor-system-health]
<= monitor-template-script
# Script behind the CPU load and memory usage promises.
filename = check_system_health.py
url = ${:_profile_base_location_}/script/${:filename}
output = ${:destination}/${:filename}
# Checksum of the script above; must be refreshed whenever the script changes.
md5sum = 7eb74a0be4995c6a1015a9a1eb6874c6
[extra-eggs]
<= monitor-eggs
interpreter = pythonwitheggs
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment