#!{{ bash.location }}/bin/bash
# run program under watchdog
# watcher <restart-codes> <prog> [<progargs> ...]
#
# <restart-codes> = code1,code2,...
#
# if the program terminates with status in <restart-codes> - it is restarted after grace period.
# if the program terminates otherwise - whole process terminates.
#
# code can be numeric or symbolic - referring to a signal name. example:
#
# watcher 0,SIGKILL <prog> ...

# die <msg> ...
# Print the message (all arguments joined by single spaces) on stderr and
# terminate the current (sub)shell with status 1.
die() {
    # printf rather than echo: echo would take a message such as "-n" or "-e"
    # for one of its own options and print nothing.
    printf '%s\n' "$*" 1>&2
    exit 1
}

# Command line: the first argument is the restart-code list, everything after
# it is the program to supervise together with its arguments.
(( $# >= 2 )) || die "Usage: watcher <restart-codes> <prog> [<progargs> ...]"
restart_codes=$1
shift
# program and its arguments, flattened into one space-separated string
prog="$*"

# signumber <signame> -> #sig
# Print on stdout the number of the signal called <signame>, looked up in the
# table printed by `kill -l` (the name must be spelled as in that table,
# e.g. SIGSEGV). Dies with an error message if no entry matches.
signumber() {
    local signame=$1 sigentry
    # a table entry looks like "11) SIGSEGV "
    sigentry=$(kill -l | grep -o "[0-9]\+) $signame\(\s\|$\)") ||
        die "E: $signame is not a signal"
    # Keep only what precedes the first ")". Collecting every run of digits
    # instead would also return the digits that are part of names such as
    # SIGUSR1 or SIGRTMIN+3.
    printf '%s\n' "${sigentry%%)*}"
}

# restart codes as set: restarts[<exit status>]=y for every exit status after
# which the program has to be restarted.
declare -A restarts

# Split the list on commas (and whitespace) into an array. Going through
# `read -a` instead of an unquoted command substitution keeps a code such as
# "*" from being expanded to file names, and avoids handing the list to echo,
# which would swallow a list like "-n".
# read hits end-of-input before the NUL delimiter and therefore returns
# non-zero even though the array has been filled - that is expected.
read -r -d '' -a codes <<<"${restart_codes//,/ }" || true

for code in "${codes[@]}"; do
    case $code in
        *[!0-9]*)
            # non-number - treat it as signal name
            signo=$(signumber "$code") || exit 1
            code=$((128 + $signo))  # exit code of process terminated by signal #signo
            ;;

        *)
            # already number: force base 10 and drop leading zeros so that
            # e.g. "01" is stored as "1", the form in which $? reports it
            code=$((10#$code))
            ;;
    esac
    restarts[$code]=y
done

progpid=""

# Exit hook: do not leave children behind when this shell goes away
# (e.g. when `slapos node stop ...` kills us).
# Sends the default kill signal to every background job that still exists.
atexit() {
    local pending
    pending=$(jobs -p)
    # $pending is left unquoted on purpose: it holds one pid per job
    [ -n "$pending" ] && kill $pending
}
trap atexit EXIT

# run prog under monitoring
# Start the program in the background, wait for it, and look its exit status
# up in the restarts set: on a hit start it again after a one second pause,
# otherwise leave the watcher with that same status.
while true; do
    echo "run $prog"
    # Run the positional parameters (what is left after the restart codes
    # were shifted off) instead of the flattened $prog string: re-splitting
    # $prog would break arguments that contain whitespace or glob characters.
    "$@" &
    progpid=$!
    echo "wait $progpid"
    wait "$progpid"
    status=$?   # 128 + <signal number> if the program was killed by a signal
    echo "-> $status"

    # if program terminated not with expected status - exit
    if [[ ${restarts[$status]} != y ]]; then
        echo "exit $status"
        exit "$status"
    fi

    # otherwise sleep a bit and restart
    sleep 1
done