#!/bin/bash
# NB - bash is required: this script uses arrays, the "function" keyword,
# "==" inside [ ] and C-style for (( )) loops, none of which are POSIX sh.

# ClonePanel - Manages duplicate accounts on two or more webservers,
# including snapshot backups, monitoring and failover dns.
# Copyright (C)2006 Chris Cheers, Internet Lynx.
# Contact chris[at]clonepanel[dot]com.
# Internet Lynx, PO Box 7117, Mannering Park, NSW 2259, Australia

# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.

Version=0.33
# monitor -h for built-in documentation

set -u
# Be strict about variable declaration

unset PATH
#avoid use of $PATH - limit script to system commands we choose

case $0 in
	*/*)	PROGRAM_DIR=${0%/*} ;;
	*)	PROGRAM_DIR=. ;;
esac
# Extract the working directory from the command line.
# Done with shell parameter expansion only, so no external echo/sed is
# needed and script names containing characters such as "." or "-" are
# handled as well. When $0 carries no directory part (e.g. "bash monitor")
# the current directory is used.


# Standard include files:
. "$PROGRAM_DIR/includes"

# Built-in documentation:
showdocs() {
	# Write the built-in usage text to stdout via $CAT, then end the script
	# with the $DOC_REQUEST exit status. The <<- form strips leading tabs
	# from the text and from the terminator line.
	$CAT <<-USAGE_TEXT
	Monitor and record services on remote servers.		
	Usage:
	./monitor
	Command line options: 
	-h		Print these instructions and exit
	-i		Interactive mode - output to terminal
	-r y|n		Retry once on failed connection (default y)
	-H hostname	Optional Host name. For multiple hosts use
	-H host1 -H host2 etc. If no host given then check all hosts

	USAGE_TEXT
	exit $DOC_REQUEST
}

function checkbyproxy {
# Check status of $host using another monitored host with proxy check enabled.
# Reads:  host, repeat, temp_res, temp_time, HOSTS_DIR, the command variables
#         and the MONITOR_* values that config loads for each candidate.
# Writes: fail - 0 when the proxy returned a result containing "uptime",
#                2 when the proxy itself could not be contacted,
#                left unchanged otherwise (including "no proxy available").
# Whenever another host's configuration was loaded while looking for a proxy,
# the configuration for $host is reloaded before returning.
	local switched='n'

	if [ "$repeat" -eq 0 ]; then
		reverse=''
	else
		reverse='-r'
	fi
# On a repeat check use sort -r to reverse the host listings and get an
# alternative proxy monitor if available.
	for proxy in $($FIND $HOSTS_DIR -maxdepth 1 -mindepth 1 -type d -printf "%f\n" | $SORT $reverse)
	do
		if [ "$host" == "$proxy" ]; then
			continue
		fi
		config -H $proxy
		switched='y'
# From here on the loaded configuration belongs to $proxy, not $host
		if [ "$MONITOR_PROXY" != 'y' ]; then
			continue
		fi
		cecho -c $warning "Checking $host via $proxy"
		plwp="$LWP_REQUEST -t $MONITOR_TIMEOUT_BY_PROXY -H Host:$MONITOR_HOST http://$MONITOR_IP/$MONITOR_PATH?CHECK=$host"
# As in check_host but going via proxy
		if $TIME $plwp > "$temp_res" 2> "$temp_time"; then
			cecho -c $info "Connected via proxy $proxy" >&2
# Connection is ok - contacted proxy.
			if $GREP -q uptime "$temp_res"; then
# Proxy did get a result, so use it (good or bad)
				fail=0
			fi
		else
			cecho -c $error "LWP failed for $proxy - Assuming failed connection" >&2
			fail=2
# Likely a bad connection this end - can't contact proxy. Don't use results.
		fi
		config -H $host
# Restore config to normal for this host
		return

	done
	if [ "$switched" == 'y' ]; then
		config -H $host
# Candidates were examined but none was a usable proxy: put back the
# configuration for $host so the caller does not carry on (e.g. retry the
# direct check) with another host's MONITOR_* settings.
	fi
	cecho -c $warning "No suitable monitor proxy for $host - cannot check by proxy"
# NB - in this case we don't change the value of $fail.
	return
}

function check_host {
# Monitor one host. The host name is taken from the global $host.
# Steps, in order:
#   1. Load the host's configuration; if it names a MONITOR_RESULT_USER,
#      remember it (and possibly its WEB_SLAVE) as an upload target in the
#      result_hosts / result_dirs / result_users / result_ips arrays.
#   2. Return early if the host has no MONITOR_IP or it is "NONE".
#   3. Request the monitor page directly with LWP, falling back to
#      checkbyproxy and (when $retry is "y") one delayed retry.
#   4. Pass the timing and result files to monitor.pl for processing.
# Values of $fail used below: 0 = have a result, 1 = direct request failed,
# 2 = set by checkbyproxy when the proxy could not be contacted either.
# Globals left behind for the caller include M, output_path and output_file.
	MONITOR=''
	MONITOR_RESULT_USER=''
# Blank these before loading the configuration for $host
	config -H $host
	if [ "$MONITOR_RESULT_USER" ]; then
		mr_user=$MONITOR_RESULT_USER
# Upload results to each host with this set - remember it for later
		config -H $host -u $mr_user
# In case of any special configuration options for this user
		result_hosts[$n_result_hosts]=$host
		result_dirs[$n_result_hosts]=$REMOTEHOST_HOME/$MONITOR_RESULT_DIR
		result_users[$n_result_hosts]=$REMOTEHOST_USERNAME
		result_ips[$n_result_hosts]=$A
		n_result_hosts=`$EXPR $n_result_hosts + 1`
# What if the monitor result host is in trouble?
		eval mr_status=\$$host
# Indirect reference: reads the variable whose name is the value of $host
# Does it have a slave we can send to instead / as well?
		if [ "$mr_status" != "GREEN" -a "$WEB_SLAVE" ]; then
			slavehost=$WEB_SLAVE
			config -H $slavehost
# And is the slave a monitor result host too? If so let it get its own update!
# NOTE(review): MONITOR_RESULT_USER is not blanked before this config call.
# If config leaves unset options untouched, the value from $host would still
# be present here and the slave would never be added - confirm against config.
		        if [ ! "$MONITOR_RESULT_USER" ]; then
				config -H $slavehost -u $mr_user
# In case of any special configuration options for this user
				result_hosts[$n_result_hosts]=$slavehost
				result_dirs[$n_result_hosts]=$REMOTEHOST_HOME/$MONITOR_RESULT_DIR
				result_users[$n_result_hosts]=$REMOTEHOST_USERNAME
				result_ips[$n_result_hosts]=$A
				n_result_hosts=`$EXPR $n_result_hosts + 1`
			fi
			config -H $host -u $mr_user
# We now return you to our scheduled programs...
# (i.e. the configuration for $host is loaded again)
		fi

	fi

	if [ ! "$MONITOR_IP" ]; then
		cecho -c $warning "Warning - No monitor IP address for $host"
		return
	fi
	if [ "$MONITOR_IP" == "NONE" ]; then
# "NONE" means this host is not monitored - return without a warning
		return
	fi

	dhm=(`$DATE -u +"%Y %m %d %H %M %S"`)
# dhm holds the UTC time as: [0] year [1] month [2] day [3] hour [4] minute [5] second
	temp_res=$TEMP_MONITOR_FILE.$host.${dhm[2]}${dhm[3]}${dhm[4]}.res
	temp_time=$TEMP_MONITOR_FILE.$host.${dhm[2]}${dhm[3]}${dhm[4]}.time
	temp_out=$TEMP_MONITOR_FILE.$host.${dhm[2]}${dhm[3]}${dhm[4]}.out
# Temporary file names are built from host name plus day, hour and minute
	fail=0

	output_path=$MONITOR_DIR
	output_file=${dhm[0]}-${dhm[1]}-${dhm[2]}-${dhm[3]}
	summary_file=${dhm[0]}-${dhm[1]}
# One monitor file per hour, one summary file per month

	if [ ! -f "$output_path/$output_file" ]; then
# Monitor file doesn't yet exist (new hour) - start with months uptime summary
		uptime_summary
	fi
        M=${dhm[4]}
        if [ $MONITOR_FREQUENCY -gt 1 ]; then
                Mr=`$EXPR $M % $MONITOR_FREQUENCY`
                M=`$EXPR $M - $Mr`
#               Round minutes down to a multiple of MONITOR_FREQUENCY
        fi

# Using LWP because it allows specifying the headers (Host and User-Agent) while
# using the IP to connect, thus eliminating DNS requests and timing issues
	$TIME $LWP_REQUEST -t $MONITOR_TIMEOUT -H Host:$MONITOR_HOST -H "User-Agent:$MONITOR_USER_AGENT" http://$MONITOR_IP/$MONITOR_PATH > $temp_res 2> $temp_time

# Combining the command into a single variable doesn't work (quotes not respected)

# Note that command variable $TIME enforces use of full path to time
# command, avoiding the builtin GNU time which doesn't seem to support
# STDERR redirection (at least not in this way)

# Only comments separate the request above from this test, so $? is still
# the exit status of that request.
	if [ $? -gt 0 ]; then
		$ECHO "LWP failed for $host" >&2
		repeat=0
		fail=1
		checkbyproxy
# checkbyproxy may set fail to 0 or 2, or leave it at 1
	fi

	if [ $fail -gt 0 ]; then
		cecho -c $error "Proxy check failed" >&2
		if [ "$retry" == "y" ]; then
			$ECHO "Retrying in $MONITOR_RETRY_DELAY seconds..." >&2
			$SLEEP $MONITOR_RETRY_DELAY
# Same direct request as the first attempt
$TIME $LWP_REQUEST -t $MONITOR_TIMEOUT -H Host:$MONITOR_HOST -H "User-Agent:$MONITOR_USER_AGENT" http://$MONITOR_IP/$MONITOR_PATH > $temp_res 2> $temp_time
			if [ $? -gt 0 ]; then
				$ECHO "LWP retry failed for $host" >&2
				fail=1
				repeat=1
# repeat=1 makes checkbyproxy walk the host list in reverse order
				checkbyproxy
			else
				fail=0
			fi
		fi
	fi
	if [ $fail -eq 2 ]; then
# After (possible) retry, again failed to connect directly or by proxy
# local connection error - enter zero data as result
		$ECHO "noconnect=1" >$temp_res
# On processing, log this as a fail for the monitor host
	else
# We have a result, possibly a "Can't connect" from a proxy
		if [ ! "`$GREP uptime $temp_res`" ]; then
			cecho -c $error "FAIL recorded for $host" >&2
		fi
	fi
# Failed LWP / wget is recorded in results file - see monitor.pl

	class="m_h_$host m_min_$M"
# NOTE(review): $class is unquoted below, so it reaches monitor.pl as two
# separate arguments (m_h_<host> and m_min_<minute>) - presumably intended.
	$PERL -I $PROGRAM_DIR $PROGRAM_DIR/monitor.pl $temp_time $temp_res $output_path $output_file $interactive $PROGRAM_DIR/$ERROR_ACTION_SCRIPT $ERROR_ACTION_THRESHOLD $MY_NAME $class
#	Processing the data is just so much easier in perl...
	if [ -f $temp_res ]; then
		$RM $temp_res
#		Would normally be done within monitor.pl anyway
	fi

}

function uptime_summary {
# Rebuild the monthly uptime summary file $output_path/$summary_file.
# For every host a pair of divs is written: the uptime percentage for the
# month so far and the host's current status (from $SYSTEM_DIR/status).
# Reads:  output_path, summary_file, host, MONITOR_DIR, ERROR_LOG_DIR,
#         HOSTS_DIR, SYSTEM_DIR and the command variables.
# Writes: the summary file; globals up_handled, up_total, up_count, up_hclass,
#         up_host, up_status, uptime, month, monthab, year.
# The configuration for $host is reloaded before returning.

	if [ -f "$output_path/$summary_file" ]; then
		$RM "$output_path/$summary_file"
# Clear old summary data
	fi
	up_handled=''
# Variable to keep track of hosts with errors handled below
	month=$($DATE -u +%m)
	monthab=$($DATE -u +%b)
	year=$($DATE -u +%Y)

	. "$SYSTEM_DIR/status"
# Read status file for all hosts

# The -name pattern is quoted so that find, not the shell, does the matching
	if [ $($FIND $MONITOR_DIR -name "${year}-${month}*" |$WC -l) -gt 0 ]; then
		up_total=$($GREP -Po " m_min_\w+" $MONITOR_DIR/${year}-${month}* |$UNIQ |$WC -l)
# Search monitor files this month, select only filename and minutes,
# unique to list just the measurement times, count results.
# Result: total number of measurements taken this month
		if [ "${up_total:-0}" -eq 0 ]; then
			up_total=1
# Files exist but hold no measurements yet - avoid divide by zero below
		fi
	else
		up_total=1
# Avoid "file not found" and divide by zero during first measurement of new month.
	fi

# Next count fatal errors for each host in this months error logs
	if [ $($FIND $ERROR_LOG_DIR -name "${year}-${month}*" |$WC -l) -gt 0 ]; then
		while read -r up_count up_hclass
		do
			if [ ! "$up_count" ]; then
				continue
			fi
			up_host=${up_hclass/m_h_/}
# strip the first "m_h_" from the class to get the host name
			up_handled="$up_handled $up_host"
# remember which hosts we've handled for later
# TODO - switch to true array syntax
			eval up_status=\$$up_host
# get the current status of this host (indirect ref)
			uptime=$($ECHO "scale=2; 100.0 - $up_count.0 * 100.0 / $up_total.0" | $BC)
# and calculate uptime percentage for the month so far
			$CAT >>"$output_path/$summary_file" <<ENDXML
<div class="$up_hclass m_uptime_monthly">$monthab:$uptime%</div>
<div class="$up_hclass m_status">$up_status</div>
ENDXML
#output is a pair of divs containing this information for each host

		done <<END
$($GREP -Po "^1000 m_h\w+ m_min_\w+" $ERROR_LOG_DIR/${year}-${month}* |$UNIQ |$AWK '{print $2}' | $SORT | $UNIQ -c)
END
# Search error files this month for 1000 (maximum error), select only filename,
# score, host and minutes, unique to skip multiple errors found at the same
# time, awk to select only host name, sort and count unique hosts.
# Result: list of hosts for which fatal errors have been found, and
# total number of fails for each
	fi
# End check for relevant error files

# Now handle the hosts that have no errors
	for up_host in $($FIND $HOSTS_DIR -maxdepth 1 -mindepth 1 -type d -printf "%f ")
	do
		case " $up_handled " in
			*" $up_host "*)	continue ;;
		esac
# skip hosts already handled (those with existing errors). The comparison is
# on whole, space-delimited names, so a host whose name is a substring of
# another (web / web2) is no longer skipped by mistake.
		config -H $up_host
		eval up_status=\$$up_host
		if [ "$MONITOR_IP" != "NONE" ]; then
			$CAT >>"$output_path/$summary_file" <<ENDXML
<div class="m_h_${up_host} m_uptime_monthly">$monthab:100.0%</div>
<div class="m_h_${up_host} m_status">$up_status</div>
ENDXML
# Any monitored hosts not already handled have zero errors so 100%
		fi
	done
	config -H $host
# Restore any settings changed by reading config for up_host above

}



# These two commands are defined here rather than in the common commands
# script, because they should not be generally available:
BC=/usr/bin/bc
LWP_REQUEST=/usr/bin/lwp-request
# Probably not necessary to keep this available only here?


# Starting values for the command line options...
retry='y'
interactive='n'
host=''
mhost=()
# ...and for the list of hosts that results get uploaded to
n_result_hosts=0
result_ips=''
result_users=''
result_dirs=''
result_hosts=''

# Load the status file, which covers all hosts
. $SYSTEM_DIR/status

# Start real script

# Check for help request and input variables in the command line options.
# The leading ":" in the option string selects silent error reporting:
# getopts then returns "?" for an unknown option and ":" for an option whose
# argument is missing, with the offending option letter in $OPTARG.
while getopts ":hH:r:i" opt
do
	case $opt in
		h)	showdocs
			;;
		i)	interactive=y
			retry=n
			;;
		r)	retry=$OPTARG
			;;
		H)	mhost[${#mhost[@]}]=$OPTARG
			;;
		:)	$ECHO "Option -$OPTARG requires an argument. Use -h for instructions" >&2; exit $E_UNKNOWN_OPT
			;;
		*)	$ECHO "Unknown option -$OPTARG. Use -h for instructions" >&2; exit $E_UNKNOWN_OPT
			;;
	esac
done
shift $((OPTIND - 1))

OPTIND=1
# reset getopts for next time


# No -H option was given: fall back to every host directory in $HOSTS_DIR
if [ "${#mhost[@]}" -eq 0 ]; then
	mhost=$($FIND $HOSTS_DIR -maxdepth 1 -mindepth 1 -type d -printf "%f ")
fi

# Monitor the selected hosts one by one; check_host reads the global $host
for host in ${mhost[*]}; do
	check_host
done



# Finished, except for uploading results to a host for display...

if [ "$n_result_hosts" -ne 0 ] && [ "$interactive" == 'n' ]; then
	if [ -z "${M:-}" ]; then
# check_host returned early for every host (no monitor IP, or "NONE"), so
# M, output_path and output_file were never set. With "set -u" in force,
# using them would abort the script - report and skip the upload instead.
		cecho -c $warning "No hosts were monitored - nothing to upload" >&2
	else
		if [ $M -eq 0 ]; then
# Special case for first of hour - sync all monitor files
# (prevents loss of data recorded for other hosts since last time this one ran)
			output_file=''
		fi

		for (( i = 0 ; i < n_result_hosts ; i++ ))
		do
			host=${result_hosts[$i]}
			dir=${result_dirs[$i]}
			username=${result_users[$i]}
			ip=${result_ips[$i]}

			$RSYNC 	--bwlimit=$RSYNC_BWLIMIT		\
				-avz 					\
				-e "$SSH -p $SSH_PORT -i $AUTH_DIR/$host"		\
				$output_path/$output_file $username@$ip:$dir/ 2>&1
# Often get error "stdin: is not a tty". Note that 2>&1 merges rsync's
# stderr into stdout; it does not discard it.
# NOTE(review): the original intent was to keep this out of cron output -
# confirm how stdout is handled by the cron entry.
			checkerror $? $E_RSYNC_MONITOR_RESULTS_FAILED
			cecho -c $info "Uploaded results to $host" 2>&1
		done
	fi
fi

cecho -c $ok "Monitor done!" ;
