#!/usr/bin/perl -w

# ClonePanel - Manages duplicate accounts on two or more webservers,
# including snapshot backups, monitoring and failover dns.
# Copyright (C)2006 Chris Cheers, Internet Lynx.
# Contact chris[at]clonepanel[dot]com.
# Internet Lynx, PO Box 7117, Mannering Park, NSW 2259, Australia

# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.

# Version=0.33
# monitor.pl handles results text processing for monitor shell script


use strict;
use warnings;

# Positional command-line arguments. The first eight are fixed; everything
# after them is collected as class names:
#   time_file, result_file   - temp files read (and later deleted) by this script
#   output_path, output_file - together name the file report output is appended to
#   interactive              - 'y' means print to the default output handle instead
#   error_action             - command run when a score reaches the threshold
#   error_action_threshold   - score at or above which error_action is run
#   monitor_hostname         - used to label "no connection" monitor errors
my (
	$time_file,
	$result_file,
	$output_path,
	$output_file,
	$interactive,
	$error_action,
	$error_action_threshold,
	$monitor_hostname,
	@classes,
) = @ARGV;

# Becomes 1 when the monitor itself (not the monitored host) has a problem
my $monitor_error = 0;

# At least one class name is required - otherwise show the usage line and stop
if (!@classes) {
	die "Syntax: ./monitor.pl time_file result_file output_path output_file interactive error_action error_threshold monitor_hostname class1 [class2 class3 ..]";
}

# All class names as a single space-separated string
my $class = join(' ', @classes);

sub fix_scores {
# Quantise a raw score onto a fixed ladder so that only a limited set of
# values is ever produced: whole numbers 1-10, multiples of 10 up to 100,
# multiples of 100 up to 1000.
# Takes one numeric argument; the fractional part is discarded first.
# Anything below 1 becomes 1, anything above 1000 becomes 1000.
	my ($raw) = @_;
	my $whole = int $raw;

	return 1      if $whole < 1;
	return $whole if $whole <= 10;
	return 1000   if $whole > 1000;

#	Round to the nearest multiple of the step for this band
	my $step = ($whole <= 100) ? 10 : 100;
	return $step * int($whole / $step + 0.5);
}

# $score holds the most recently computed score; it is reused by each check
# in turn and by the timing check at the end of the script.
my ($score);
# Set to 1 once a fetch (timing) error has been passed to the error action
my $fetch_error=0;

# Unless running interactively, append all report output to the output file
# by making it the default handle for print. Interactive runs keep printing
# to the existing default output handle.
if ($interactive ne 'y') {
#	Three-argument open keeps the mode separate from the path, so the path
#	is always treated as a plain file name. The bareword handle OUT is kept
#	because it is closed by name at the end of the script.
	open OUT, '>>', "$output_path/$output_file"
		or die "Unable to open output file $output_path/$output_file for append: $!";
	select(OUT);
}

my $fullcheck = ($interactive eq 'y' || $class=~/m_min_0/) ? 1 : 0;
# Record complete data only on the hour or for interactive calls

# Read the results file one line at a time. Each line is expected to look
# like "command=data"; the command name selects how the data is scored and
# which <div> is printed. Whenever a score reaches $error_action_threshold
# the external $error_action command is run through backticks (its captured
# standard output is discarded).
#
# State carried across lines:
#   $continue  - if set, the command name to apply to the next line
#   $command   - command name for the line being handled
#   $processed - number of lines counted as useful results
#   $error     - sanitised text of the most recent unparseable line
#   $proxy     - 1 once a 'proxy' line has been seen
my ($continue, $command, $processed, $error, $proxy) = (q//, q//, 0, q//, 0);

# Three-argument open with a lexical handle, so the file name can never be
# interpreted as an open mode or a piped command. If the file cannot be
# opened nothing is processed and the "no results" handling that follows
# this loop takes over.
if (open my $result_fh, '<', $result_file) {
	while (my $line = <$result_fh>) {
		if ($continue) {
			$command=$continue;
#			Use continue value to scan multi-line results
		} elsif ($line =~ s/^(\w+)\=//) {
#			The "command=" prefix is stripped; $line now holds only the data
			$command=$1;
		} else {
			warn "Bad syntax in results file: $line";
			chomp ($error = $line);
# If this is an error connecting to / reading the site (most likely),
# then no results will be available and the error will be printed later
			$error =~ s/[^\w\s]/#/g;
#			Error comes from remote server - can't be entirely trusted

			next;
		}
		$continue=q//;
		$processed++;
		if ($command eq 'uptime') {
			my ($days, $loads, $load) = (0, 'unknown', 0);
#			Capture digits only: this value comes from the remote server,
#			is compared numerically and is printed into the HTML output
			if ($line =~ /up\s*(\d+)\s*day/) {
				$days=$1;
			}
			if ($line =~ /load average:\s*([\d\.]+),\s*([\d\.]+),\s*([\d\.]+)/) {
				$loads="$1 $2 $3";
#				Score on the largest of the three load averages
				$load=armax([$1,$2,$3]);
			}
			if ($days == 0) { $score = 100 }
			elsif ($days == 1) { $score = 50 }
			else { $score = 1 }
#			Arbitrary scoring - uptime days don't signify much
			print "<div class=\"$class m_updays m_$score\">$days</div>\n" if $fullcheck;
			$score = fix_scores($load * 10);
#			Load average >=100 is max score (red flashing etc!)
			print "<div class=\"$class m_load m_$score\">$loads</div>\n";
			`$error_action $output_file $score $class m_load "$loads"` if $score >= $error_action_threshold;

		} elsif ($command eq 'df') {
			my ($used) = ('unknown');
			if ($line =~ /([\d\.]+)\%/) {
				$used=$1;
			}
			if ($used eq 'unknown') { $score = 100 }
			elsif ($used > 80) { $score = fix_scores(50 * ($used - 80)) }
			else { $score = 1 }
#			Score from 0 to 1000 as used space goes from 80-100%
			print "<div class=\"$class m_disk m_$score\">$used\%</div>\n" if $fullcheck;
			`$error_action $output_file $score $class m_disk "$used"` if $score >= $error_action_threshold;

		} elsif ($command eq 'free') {
			my $frees = 'unknown';
			if ($line =~ /Mem:\s*([\d\.]+)\s*([\d\.]+)\s*([\d\.]+)/) {
				$frees = "$1 $2 $3";
			}
			if ($frees eq 'unknown') { $score = 100 }
			else { $score = 1 }
#			Score not meaningful for free memory - Linux uses all it can
			print "<div class=\"$class m_free m_$score\">$frees</div>\n" if $fullcheck;
			$continue='swap';
#			Go around again for swap
			`$error_action $output_file $score $class m_free "$frees"` if $score >= $error_action_threshold;

		} elsif ($command eq 'swap') {
			my ($swap, $swaps) = (0, 'unknown');
			if ($line =~ /Swap:\s*([\d\.]+)\s*([\d\.]+)\s*([\d\.]+)/) {
				$swap = ($1 > 0) ? $2 / $1 : 0;
				#proportion of used swap space (permit zero swap)
				$swaps = "$1 $2 $3";
			}
			if ($swaps eq 'unknown') { $score = 100 }
			elsif ($swap > 0.6) { $score = fix_scores(2250 * ($swap - 0.6)) }
			else { $score = 1 }
#			Score from 0 to 900 as used swap goes from 60-100%
#			NB - avoid scoring fatal error on swap
			print "<div class=\"$class m_swap m_$score\">$swaps</div>\n" if $fullcheck;
			`$error_action $output_file $score $class m_swap "$swaps"` if $score >= $error_action_threshold;

		} elsif ($command eq 'mysql') {
			chomp (my $myerr = $line);
			$myerr =~ s/[^\w\s]/#/g;
#			Error comes from remote server - can't be entirely trusted
			if ($myerr eq 'ok') { $score = 1 }
			else { $score = 1000 }
#			Any error should be included in line
			print "<div class=\"$class m_mysql m_$score\">$myerr</div>\n";
			`$error_action $output_file $score $class m_mysql "$myerr"` if $score >= $error_action_threshold;
		} elsif ($command eq 'noconnect') {
# Special label indicating that backup server has no connection
# ie. it cannot connect to the monitored host OR to a proxy - zero results
			print "<div class=\"$class m_updays m_0\">N/A</div>\n" if $fullcheck;
			print "<div class=\"$class m_load m_0\">N/A</div>\n";
			print "<div class=\"$class m_disk m_0\">N/A</div>\n" if $fullcheck;
			print "<div class=\"$class m_free m_0\">N/A</div>\n" if $fullcheck;
			print "<div class=\"$class m_swap m_0\">N/A</div>\n" if $fullcheck;
			print "<div class=\"$class m_mysql m_0\">N/A</div>\n";
# NOTE(review): second increment for this line (the first is at the top of
# the loop body) - presumably deliberate; confirm
			$processed++;
			$monitor_error=1;
# Don't log this as a host error! It's a monitor error.
			`$error_action $output_file 1000 m_h_$monitor_hostname m_connect "No connection"`;
		} elsif ($command eq 'proxy') {
			chomp (my $myerr = $line);
			$myerr =~ s/[^\w\s]/#/g;
#			Error comes from remote server - can't be entirely trusted
			print "<!-- Result obtained via proxy: $myerr -->\n";
			$score = 0;
			$proxy = 1;
			$processed--;
# Don't include proxy tag as processed line
		} else {
			warn "Unknown item '$command' in monitor result.";
# May take this one out once we're sure of what errors we get...
			$processed--;
		}
	}
	close $result_fh;
}
# Nothing useful was processed: either the results file could not be opened
# or it contained no usable data.
if ($processed < 1) {
	if (!($error || $proxy)) {
#		No error text and no proxy marker - just show "no data" for load
		print "<div class=\"$class m_load m_0\">N/A</div>\n";
	} else {
#		A proxy marker with no error text gets a default message
		$error = "No results from proxy" unless $error;
		print "<div class=\"$class m_load m_1000\">Error $error</div>\n";
# Log 2 errors if down (fetch and load) but suppress fetch error if already logged
		`$error_action $output_file 1000 $class m_load "$error"`;
		if (!$fetch_error) {
			`$error_action $output_file 1000 $class m_fetch "$error"`;
		}
	}
#	mysql is reported as "no data" in either case
	print "<div class=\"$class m_mysql m_0\">N/A</div>\n";
}


# Finally check timing data, but only if no monitor errors

# Elapsed minutes and seconds read from the time file; -1 means "not read"
my ($m, $s) = (-1, -1);
if ($monitor_error != 0) {
	warn "Monitor error - ignoring timing data";
	$score=0;
} else {
#	Three-argument open with a lexical handle: the file name is never
#	interpreted as a mode or pipe, and the loop uses a named variable
#	instead of overwriting the global $_.
	if (open my $time_fh, '<', $time_file) {
		while (my $time_line = <$time_fh>) {
#			Use the first line of the form "<n>user <n>system <m>:<s>elapsed"
			if ($time_line =~ /[\d\.]+\s*user\s*[\d\.]+\s*system\s*(\d+):([\d\.]+)elapsed/) {
				($m, $s) = ($1, $2);
				last;
			}
		}
		close $time_fh;
	}
	if ($s < 0) {
		warn "Failed to read timing data";
		$score=0;					# Score 0 means no data
	} else {
		my $stot = $s + $m * 60; # Convert minutes - should be unnecessary
		$score = ($stot <= 5) ? fix_scores(2 * $stot) : fix_scores(40 * ($stot - 4.75));
		# Keep score low for <=5s (possible overhead), Score 1000 at 30s (timeout)
		$score = 900 if $score == 1000;
		# Quick fix - limit max score to 900 so it doesn't show as fatal error
		# (timing more often monitor problem than host). Note that if a request
		# times out completely then it will be logged as a fail on load too.
	}
}

# Report the fetch time, running the error action first if the score is high
if ($score >= $error_action_threshold) {
	`$error_action $output_file $score $class m_fetch "$m:$s"`;
# NOTE(review): $fetch_error is only tested in the no-results handling
# earlier in the script, which has already run by this point - confirm
# whether this assignment still has any effect.
	$fetch_error=1;
}
print "<div class=\"$class m_fetch m_$score\">$m:$s</div>\n";



# Close the output file (it is only opened for non-interactive runs).
# Buffered write errors surface at close, so a failure is reported here
# rather than silently ignored.
if ($interactive ne 'y') {
	close OUT or warn "Unable to close output file $output_path/$output_file: $!";
}

# Remove the temporary input files; a failure is reported (with the system
# error text) but is not fatal.
unlink $time_file or warn "Unable to delete temp time file $time_file: $!";
unlink $result_file or warn "Unable to delete temp result file $result_file: $!";


sub armax {
# Given an arrayref of numbers, returns the element with the largest
# numeric value (the element itself, so a string such as '1.50' is
# returned unchanged). Returns undef for an empty array or an undefined
# argument.
# A single pass is used instead of sorting the whole list, and no external
# comparator sub is needed.
	my $values = shift;
	my ($largest, $seen);
	return $largest unless defined $values;

	for my $candidate (@$values) {
#		>= so that, among equal values, the later element is the one kept
		if (!$seen || $candidate >= $largest) {
			$largest = $candidate;
			$seen = 1;
		}
	}
	return $largest;
}
sub num {
# Comparator for use with sort: orders the sort variables $a and $b
# numerically, ascending.
	my $ordering = ($a <=> $b);
	return $ordering;
}

	
