#!/bin/sh

# ClonePanel - Manages duplicate accounts on two or more webservers,
# including snapshot backups, monitoring and failover dns.
# Copyright (C)2006 Chris Cheers, Internet Lynx.
# Contact chris[at]clonepanel[dot]com.
# Internet Lynx, PO Box 7117, Mannering Park, NSW 2259, Australia
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.

Version=0.33
# error_manager -h for built-in documentation

set -u
# Be strict about variable declaration: expanding an unset variable is
# treated as an error instead of silently yielding an empty string

unset PATH
# Avoid use of $PATH - limit script to system commands we choose.
# From here on external programs are run via absolute paths or via
# variables holding them (presumably defined by the includes file - confirm)

PROGRAM_DIR=$(/bin/echo $0 |/bin/sed -e "s/\/[a-zA-Z0-9_]*$//")
# Extract the working directory from command line by stripping a final
# "/name" component made up of letters, digits and underscores.
# If $0 has no such trailing component (no slash, or other characters in
# the script name) the value is left as $0 unchanged.
# NB - if your system doesn't have echo and sed in these locations
# then this line will need to be changed in all shell scripts

# Standard include files:
. $PROGRAM_DIR/includes -q
# -q = Don't print out copyleft info on run (non-interactive program)


MAIL=/bin/mail
PS=/bin/ps
# Not included with other commands - don't want this enabled by default.
# PS is used by the already-running check below; MAIL is not referenced in
# the rest of this script as shown.

# Single-instance guard: count the lines of "ps u" output that mention
# error_manager and stop quietly (exit status 0) when there are more than 3.
if [ `$PS u |$GREP error_manager |$WC -l` -gt 3 ]; then
# NOTE(review): an earlier note here gave the baseline as 2 (this instance
# plus the grep), but the test uses 3 - presumably the command-substitution
# subshell is listed as well. Confirm before changing the threshold.
# Already / still running - don't allow second instance
	cecho -c $info "error_manager already running - stopping here"
	exit 0
fi

handled=''
# Space-separated list of hosts that had errors and were processed below;
# read later to find hosts that no longer have any errors

# count_host_errors HCLASS MINUTES
# Print the number of lines, in error logs modified within the last MINUTES
# minutes, whose second field is exactly HCLASS.
# The match is on the whole field (the same field the main count below is
# grouped on) so that e.g. m_h_web1 does not also count lines logged for
# m_h_web10, as a plain substring grep would.
# Reads globals FIND, ERROR_LOG_DIR, CAT and AWK. Always prints an integer.
count_host_errors() {
	$FIND $ERROR_LOG_DIR -type f -mmin "-$2" -exec $CAT {} \; |
		$AWK -v hclass="$1" '$2 == hclass { n++ } END { print n + 0 }'
}

# Count errors for each host in recent logs.
# Each input line is "<count> <class>" as produced by uniq -c at the bottom
# of the loop.
while read -r count hclass
do
	if [ ! "$count" ]; then
# Empty line - no errors were found in the recent logs
		continue
	fi
	host=`$ECHO $hclass |$SED -e "s/m_h_//"`
# Host name is the log class with the m_h_ marker removed
	handled="$handled $host"
	. $STATUS_FILE
# Re-read on every pass (presumably set_status rewrites this file - confirm)
	eval status=\$$host
# Indirect variable ref - pull status value for this host
	eval status_up=\$"STAT_${status}_UP"
	eval status_down=\$"STAT_${status}_DOWN"
# and statuses above and below this
	new_status=$status
# Variable for display only

	if [ "$status_up" ]; then
# If there is a higher status, test whether we need to go there for this host
		eval count_up=\$"STATCOUNT_${status}_UP"
		eval counttime=\$"STATTIME_${status}_UP"
		this_count=$count
		if [ "$counttime" -ne "$ERROR_LOG_MINUTES" ]; then
# Recalculate count for different time limit on this transition
			this_count=`count_host_errors "$hclass" "$counttime"`
		fi
		if [ "$this_count" -ge "$count_up" ]; then
			cecho -c $warning "Raising status of $host to $status_up"
			$PROGRAM_DIR/set_status -H $host -s $status_up
			new_status=$status_up
		fi
	fi
	if [ "$status_down" ]; then
# If there is a lower status, test whether we need to go there for this host
		eval count_down=\$"STATCOUNT_${status}_DOWN"
		eval counttime=\$"STATTIME_${status}_DOWN"
		this_count=$count
		if [ "$counttime" -ne "$ERROR_LOG_MINUTES" ]; then
# Recalculate count for different time limit on this transition
			this_count=`count_host_errors "$hclass" "$counttime"`
		fi
		if [ "$this_count" -le "$count_down" ]; then
			cecho -c $warning "Lowering status of $host to $status_down"
			$PROGRAM_DIR/set_status -H $host -s $status_down
			new_status=$status_down
		fi
	fi

done <<END
`$FIND $ERROR_LOG_DIR -type f -mmin -$ERROR_LOG_MINUTES -exec $CAT {} \; |$AWK '{print $2}' | $SORT | $UNIQ -c`
END


# host_in_list HOST LIST
# Succeed (status 0) if HOST is one of the space-separated words in LIST,
# otherwise return 1. Whole words are compared, so a host whose name is
# contained in another host's name (web1 / web10) is not a false match.
host_in_list() {
	case " $2 " in
		*" $1 "*) return 0 ;;
	esac
	return 1
}

ndhosts=`$GREP -v "=$STAT_DEFAULT" $STATUS_FILE |$CUT -d= -f1`
# Find any hosts with non-default status: the names before "=" on the
# status-file lines that do not contain "=<default status>"

for host in $ndhosts
do
# If not handled above then no errors were logged for this host
	if host_in_list "$host" "$handled"; then
		continue
	fi
	cecho -c $warning "Lowering status of $host to default"
	$ECHO `$PROGRAM_DIR/set_status -H $host -s $STAT_DEFAULT`
done

hosts_failed=''
# Hosts whose task command has failed during this run

if [ -n "$($LS $PENDING_TASKS_DIR)" ]; then
# Something is waiting in the pending-tasks directory - work through the
# plain files found there, in numeric order of file name
	for task in $($FIND $PENDING_TASKS_DIR -maxdepth 1 -type f -printf "%f\n" | $SORT -n)
	do
# Skip any task file that mentions a host which already failed this run
		skip_task=''
		for failed_host in $hosts_failed
		do
			if [ -n "$($GREP $failed_host $PENDING_TASKS_DIR/$task)" ]; then
				skip_task=y
			fi
		done
		if [ -n "$skip_task" ]; then
			continue
		fi

# The task file is sourced and is expected to set $command
# (and presumably $host, used in the failure message - confirm)
		. $PENDING_TASKS_DIR/$task
		$command
		if [ $? -eq 0 ]; then
# Task done - remove it so that it is not run again
			$RM $PENDING_TASKS_DIR/$task
			checkerror $? $E_DELETE_TASK_FAILED
		else
# Leave the task file in place and remember the host
			cecho -c $error "Command failed on $host - will try again next run"
			hosts_failed="$hosts_failed $host"
		fi
	done
fi

# NB - Now keeping error manager active through cron at all times.
