# backupserver_check_backup.py
from zope.interface import implementer
from slapos.grid.promise import interface
from slapos.grid.promise.generic import GenericPromise
from slapos.grid.promise.generic import TestResult

import re
import sys
import pytz
from os.path import isfile, getmtime
from datetime import datetime
from croniter import croniter
from dateutil.parser import parse
from tzlocal import get_localzone


@implementer(interface.IPromise)
class RunPromise(GenericPromise):
  """
    Promise checking the outcome of the last scheduled backupserver run.

    It reads the status file written by the backup script and compares the
    dates found there with the previous due date of the configured cron
    frequency. The verdict is reported through self.logger (error or info).
  """

  # Layout of a status line: "<date>, <field>, <field>, backup <state>".
  # Compiled once here instead of being rebuilt for every line of the file.
  _STATUS_LINE = re.compile(r"(.*), (.*), (.*), backup (.*)$")

  def __init__(self, config):
    super(RunPromise, self).__init__(config)
    # check backup ran OK every 5 minutes
    self.setPeriodicity(minute=5)

  @staticmethod
  def _localized(moment, tz):
    """
      Return `moment` as a timezone-aware datetime.

      An already aware datetime is returned untouched; a naive one is
      interpreted as being expressed in `tz`.
    """
    if moment.tzinfo is not None:
      return moment
    if hasattr(tz, 'localize'):
      # pytz-style timezone: localize() selects the right UTC offset for the date.
      return tz.localize(moment)
    # Timezone objects without localize() (e.g. zoneinfo, as returned by recent
    # tzlocal releases) follow the standard tzinfo protocol and can be attached
    # directly.
    return moment.replace(tzinfo=tz)

  def _read_status(self, status, local_tz):
    """
      Parse the status file located at `status`.

      Returns a (backup_start, backup_end, backup_failed) tuple:
        * backup_start: date of the last "backup running" line, None if absent
        * backup_end: date of the last "backup failed" / "backup success" line,
          None if absent
        * backup_failed: True when the last end line is "backup failed"
      Dates without UTC offset are assumed to be local time.
      NOTE(review): this assumes the status file is written on this machine;
      confirm against the backup script.
    """
    backup_start = None
    backup_end = None
    backup_failed = False
    with open(status, 'r') as f:
      for line in f:
        m = self._STATUS_LINE.match(line)
        if not m:
          continue
        # strip() tolerates trailing whitespace such as the "\r" of CRLF files,
        # which would otherwise stay glued to the state keyword.
        state = m.group(4).strip()
        if state == "running":
          backup_start = self._localized(parse(m.group(1)), local_tz)
        elif state in ("failed", "success"):
          backup_end = self._localized(parse(m.group(1)), local_tz)
          backup_failed = state == "failed"
    return backup_start, backup_end, backup_failed

  def sense(self):
    """
      backupserver run rdiff-backup and log everything in a text file.
      At the beginning of the backup, we have "backup running" printed in the text file.
      At the end of the backup, we can have one of the following printed in the text file:
         * "backup failed" -> backup failed
         * "backup success" -> backup succeeded
      A backup is valid only if we have the 2 conditions:
         * we can grep "backup running" in the text file
         * we can't grep "backup failed" in the text file

      The result is logged with self.logger: error() for a failed, missing or
      late backup, info() for a running, successful or not yet due backup.
    """
    script = self.getConfig('script_fullpath')
    status = self.getConfig('status_fullpath')

    local_tz = get_localzone()
    # date of the previous time cron launched, made timezone-aware so that it
    # can be compared with the dates below
    prev_cron = self._localized(
      croniter(self.getConfig('cron_frequency'), datetime.now()).get_prev(datetime),
      local_tz)

    status_url = "{}/private/{}/{}".format(
      self.getConfig("monitor_url"),
      self.getConfig("status_dirbasename"),
      self.getConfig("status_name"))
    statistic_url = "{}/private/{}/{}".format(
      self.getConfig("monitor_url"),
      self.getConfig("statistic_dirbasename"),
      self.getConfig("statistic_name"))

    # If log file is not present, it can be OK if we launched the instance after the last cron due date
    if not isfile(status):
      # The modification time of the backup script is used as the instance launch date.
      script_date = datetime.fromtimestamp(getmtime(script), pytz.utc)
      if script_date < prev_cron:
        self.logger.error("Backup status file is not present")
      else:
        self.logger.info("Backup was never launched")
      return

    # First, parse the log file
    backup_start, backup_end, backup_failed = self._read_status(status, local_tz)

    # Then check result
    if backup_end is not None and backup_failed:
      self.logger.error("Backup FAILED at {} (see {} ).".format(backup_end, status_url))
    elif backup_start is None:
      self.logger.error("Can't find backup start date. Is there a problem with status file? (see {} ).".format(status_url))
    elif backup_start < prev_cron:
      self.logger.error("Backup didn't start at correct time: it started at {} but should have started after {}. (see {} ).".format(backup_start, prev_cron, status_url))
    elif backup_end is None:
      self.logger.info("Backup currently running, started at {} (see {} ).".format(backup_start, status_url))
    else:
      self.logger.info("Backup OK, started at {} and lasted {} (see full stats at {} and status at {} ).".format(
        backup_start,
        backup_end - backup_start,
        statistic_url,
        status_url
      ))

  def test(self):
    """
      This is the default test function. Could be commented.
    """
    return self._test(result_count=1, failure_amount=1)

  def anomaly(self):
    """
      Anomaly returns a TestResult instead of AnomalyResult because we don't
      want to call bang when there is a problem. Usually the problem won't be
      in the deployment of this instance but rather in the instance we are
      backing up. This will need a human intervention.
    """
    return self._test(result_count=1, failure_amount=1)