From: Martin Zobel-Helas
Date: Tue, 14 Sep 2010 19:56:33 +0000 (+0200)
Subject: add some rra checks for nagios to our checks
X-Git-Url: https://wiki.adam-barratt.org.uk/gitweb/?a=commitdiff_plain;h=f79acba81afea4a50183dd6754d7aeb064e0e4e1;p=mirror%2Fdsa-nagios.git

add some rra checks for nagios to our checks
---

diff --git a/dsa-nagios-checks/checks/dsa-check-afs-bos b/dsa-nagios-checks/checks/dsa-check-afs-bos
new file mode 100644
index 0000000..8828d89
--- /dev/null
+++ b/dsa-nagios-checks/checks/dsa-check-afs-bos
@@ -0,0 +1,225 @@
+#!/usr/bin/perl -w
+$ID = q$Id: check_bos,v 1.7 2006/03/17 23:06:54 quanah Exp $;
+#
+# check_bos -- Monitor AFS bos output for problems in Nagios.
+#
+# Written by Russ Allbery
+# Based on an earlier script by Neil Crellin
+# Copyright 2003, 2004 Board of Trustees, Leland Stanford Jr. University
+#
+# This program is free software; you may redistribute it and/or modify it
+# under the same terms as Perl itself.
+#
+# Given an AFS server (file or VLDB), runs bos status on each one. Checks to
+# see if there is a communication failure, and also checks to see if anything
+# in the output looks unusual or wrong. If either of these conditions are
+# true, print that information to STDOUT. Suitable for being run inside
+# Nagios.
+
+##############################################################################
+# Site configuration
+##############################################################################
+
+# The full path to bos. Make sure that this is on local disk so that
+# monitoring doesn't have an AFS dependency.
+($BOS) = grep { -x $_ } qw(/usr/bin/bos /usr/local/bin/bos);
+$BOS ||= '/usr/bin/bos';
+
+# The default timeout in seconds (implemented by alarm) for bos.
+$TIMEOUT = 10;
+
+# The list of regular expressions matching expected output. You may need to
+# customize this for what you're running at your site. Any output from bos
+# that doesn't match one of these regular expressions will throw a critical
+# error.
+@OKAY = (
+    qr/^\s*$/,
+    qr/^Instance\ \S+,\ \(type\ is\ \S+\)(\ has\ core\ file,)?
+       \ currently\ running\ normally\.$/x,
+    qr/^\s*Auxiliary status is: file server running\.$/,
+    qr/^\s*Process last started at /,
+    qr/^\s*Last exit at /,
+    qr/^\s*Last error exit at /,
+    qr/^\s*Command \d+ is /
+);
+
+##############################################################################
+# Modules and declarations
+##############################################################################
+
+require 5.005;
+
+use strict;
+use vars qw($BOS $ID @OKAY $TIMEOUT);
+
+use Getopt::Long qw(GetOptions);
+
+##############################################################################
+# Implementation
+##############################################################################
+
+# Parse command line options. -H is mandatory; without it bos has no server.
+my ($help, $host, $version);
+Getopt::Long::config ('bundling', 'no_ignore_case');
+GetOptions ('hostname|H=s' => \$host,
+            'help|h'       => \$help,
+            'timeout|t=i'  => \$TIMEOUT,
+            'version|V'    => \$version) or exit 3;
+if ($help) {
+    print "Feeding myself to perldoc, please wait....\n";
+    exec ('perldoc', '-t', $0) or die "Cannot fork: $!\n";
+} elsif ($version) {
+    my $version = join (' ', (split (' ', $ID))[1..3]);
+    $version =~ s/,v\b//;
+    $version =~ s/(\S+)$/($1)/;
+    $version =~ tr%/%-%;
+    print $version, "\n";
+    exit 0;
+}
+if (@ARGV || !defined ($host)) {
+    print "Usage: $0 [-hV] [-t <timeout>] -H <host>\n";
+    warn "Usage: $0 [-hV] [-t <timeout>] -H <host>\n";
+    exit 3;
+}
+
+# Set up the alarm.
+$SIG{ALRM} = sub {
+    print "BOS CRITICAL - network timeout after $TIMEOUT seconds\n";
+    exit 2;
+};
+alarm ($TIMEOUT);
+
+# Collect the bos output into a variable.
+unless (open (BOS, "$BOS status $host -noauth -long 2>&1 |")) {
+    print "BOS UNKNOWN - cannot run bos\n";
+    exit 3;
+}
+my @bos = <BOS>;
+close BOS;
+
+# Make sure that bos was successful. Note that it generally does return
+# success even if it can't contact the bos server.
+if ($? != 0) {
+    print "BOS CRITICAL - bos status failed\n";
+    exit 2;
+}
+
+# Scan the output. If we see anything that we don't expect, immediately
+# report it as a fatal error.
+for my $line (@bos) {
+    my $okay = 0;
+    for my $regex (@OKAY) {
+        if ($line =~ /$regex/) {
+            $okay = 1;
+            last;
+        }
+    }
+    unless ($okay) {
+        $line =~ s/^\s+//;
+        $line =~ s/\s+$//;
+        print "BOS CRITICAL - $line\n";
+        exit 2;
+    }
+}
+print "BOS OK\n";
+exit 0;
+
+##############################################################################
+# Documentation
+##############################################################################
+
+=head1 NAME
+
+check_bos - Monitor AFS bos output for problems in Nagios
+
+=head1 SYNOPSIS
+
+check_bos [B<-hV>] [B<-t> I<timeout>] B<-H> I<host>
+
+=head1 DESCRIPTION
+
+B<check_bos> is a Nagios plugin for querying the AFS bosserver for process
+status and reporting an alert if there are any unexpected lines in the bos
+output. The acceptable lines of output from B<bos status> are configured at the
+top of this script; they should be generally suitable for most sites, but
+may require some customization.
+
+B<check_bos> will always print out a single line of output. If there is a
+line that isn't matched by any regexes identifying acceptable lines, it will
+output the first non-matching line prefixed by C<BOS CRITICAL ->. Otherwise,
+it will output B<BOS OK>. Note that this monitoring may not catch such
+things as a service being constantly restarted if it happens to be up and
+running normally each time the probe runs; it doesn't pay any attention to
+the last start time, the last error exit status, the presence of core files,
+and the like. It mostly just looks for the "running normally" part of the
+B<bos> output and makes sure the auxiliary status is also "running
+normally" for a file server process.
+
+=head1 OPTIONS
+
+=over 4
+
+=item B<-H> I<host>, B<--hostname>=I<host>
+
+The AFS server whose B<bos> status B<check_bos> should check. This option
+is required.
+
+=item B<-h>, B<--help>
+
+Print out this documentation (which is done simply by feeding the script
+to C<perldoc -t>).
+
+=item B<-t> I<timeout>, B<--timeout>=I<timeout>
+
+Change the timeout for the B<bos> command. The default timeout is 10
+seconds.
+
+=item B<-V>, B<--version>
+
+Print out the version of B<check_bos> and quit.
+
+=back
+
+=head1 EXIT STATUS
+
+B<check_bos> follows the standard Nagios exit status requirements. This
+means that it will exit with status 0 if there are no problems or with
+status 2 if there is a problem detected. For other errors, such as invalid
+syntax, B<check_bos> will exit with status 3.
+
+=head1 BUGS
+
+The standard B<-v> verbose Nagios plugin option is not supported. It should
+display the complete bos status output.
+
+The usage message for invalid options and for the B<-h> option doesn't
+conform to Nagios standards.
+
+=head1 CAVEATS
+
+This script does not use the Nagios util library or any of the defaults that
+it provides, which makes it somewhat deficient as a Nagios plugin. This is
+intentional, though, since this script can be used with other monitoring
+systems as well. It's not clear what a good solution to this would be.
+
+=head1 SEE ALSO
+
+The current version of this and other AFS monitoring plugins for Nagios are
+available from the AFS monitoring tools page at
+L.
+
+=head1 AUTHORS
+
+The original idea behind this script was from Neil Crellin. Russ Allbery
+updated it to work with Nagios and stripped out some
+rather neat but now unnecessary code to look for any changes in the bos
+output, instead just scanning it for acceptable lines.
+
+=head1 COPYRIGHT AND LICENSE
+
+Copyright 2003, 2004 Board of Trustees, Leland Stanford Jr. University.
+
+This program is free software; you may redistribute it and/or modify it
+under the same terms as Perl itself.
+
+=cut
diff --git a/dsa-nagios-checks/checks/dsa-check-afs-rxdebug b/dsa-nagios-checks/checks/dsa-check-afs-rxdebug
new file mode 100644
index 0000000..37e843a
--- /dev/null
+++ b/dsa-nagios-checks/checks/dsa-check-afs-rxdebug
@@ -0,0 +1,227 @@
+#!/usr/bin/perl -w
+$ID = q$Id: check_rxdebug,v 1.11 2006/03/17 23:06:54 quanah Exp $;
+#
+# check_rxdebug -- Nagios AFS server check for waiting connections.
+#
+# Written by Quanah Gibson-Mount based on work by Neil Crellin
+# Updated by Russ Allbery
+# Copyright 2003, 2004, 2005 Board of Trustees, Leland Stanford Jr. University
+#
+# This program is free software; you may redistribute it and/or modify it
+# under the same terms as Perl itself.
+#
+# Expects a file server with the -H option and runs rxdebug against that file
+# server, looking for any connections that are waiting for a thread. Exits
+# with status 1 if there are at least two connections in that state (a
+# warning) and with status 2 if there are at least eight connections in that
+# state. The thresholds can be overridden from the command line.
+
+##############################################################################
+# Site configuration
+##############################################################################
+
+# The default count of blocked connections at which to warn or send a critical
+# alert. These can be overridden with the -w and -c command-line options.
+$WARNINGS = 2;
+$CRITICAL = 8;
+
+# The default timeout in seconds (implemented by alarm) for rxdebug.
+$TIMEOUT = 60;
+
+# The full path to rxdebug. Make sure that this is on local disk so that
+# monitoring doesn't have an AFS dependency.
+($RXDEBUG) = grep { -x $_ } qw(/usr/bin/rxdebug /usr/local/bin/rxdebug);
+$RXDEBUG ||= '/usr/bin/rxdebug';
+
+##############################################################################
+# Modules and declarations
+##############################################################################
+
+require 5.003;
+
+use strict;
+use vars qw($CRITICAL $ID $RXDEBUG $TIMEOUT $WARNINGS);
+
+use Getopt::Long qw(GetOptions);
+
+##############################################################################
+# Implementation
+##############################################################################
+
+# Parse command line options. -H is mandatory; rxdebug needs a server name.
+my ($help, $host, $version);
+Getopt::Long::config ('bundling', 'no_ignore_case');
+GetOptions ('critical|c=i' => \$CRITICAL,
+            'hostname|H=s' => \$host,
+            'help|h'       => \$help,
+            'timeout|t=i'  => \$TIMEOUT,
+            'version|V'    => \$version,
+            'warning|w=i'  => \$WARNINGS) or exit 3;
+if ($help) {
+    print "Feeding myself to perldoc, please wait....\n";
+    exec ('perldoc', '-t', $0) or die "Cannot fork: $!\n";
+} elsif ($version) {
+    my $version = join (' ', (split (' ', $ID))[1..3]);
+    $version =~ s/,v\b//;
+    $version =~ s/(\S+)$/($1)/;
+    $version =~ tr%/%-%;
+    print $version, "\n";
+    exit 0;
+}
+if (@ARGV || !defined ($host)) {
+    warn "Usage: $0 [-hV] [-c <count>] [-w <count>] [-t <timeout>] -H <host>\n";
+    exit 3;
+}
+if ($WARNINGS > $CRITICAL) {
+    warn "$0: warning level $WARNINGS greater than critical level $CRITICAL\n";
+    exit 3;
+}
+
+# Set up the alarm.
+$SIG{ALRM} = sub {
+    print "AFS CRITICAL - network timeout after $TIMEOUT seconds\n";
+    exit 2;
+};
+alarm ($TIMEOUT);
+
+# Run rxdebug and parse the output, counting the number of waiting for process
+# connections that we have.
+unless (open (RXDEBUG, "$RXDEBUG $host -noconn |")) {
+    warn "$0: cannot run rxdebug\n";
+    exit 3;
+}
+my $blocked;
+while (<RXDEBUG>) {
+    if (/^(\d+) calls waiting for a thread/) {
+        $blocked = $1;
+        last;
+    }
+}
+close RXDEBUG;
+if ($? != 0) {
+    print "AFS CRITICAL - cannot contact server\n";
+    exit 2;
+}
+unless (defined $blocked) {
+    print "AFS CRITICAL - cannot parse rxdebug output\n";
+    exit 2;
+}
+
+# Check the connection count against our limits and make sure that it's okay.
+if ($blocked >= $CRITICAL) {
+    print "AFS CRITICAL - $blocked blocked connections\n";
+    exit 2;
+} elsif ($blocked >= $WARNINGS) {
+    print "AFS WARNING - $blocked blocked connections\n";
+    exit 1;
+} else {
+    print "AFS OK - $blocked blocked connections\n";
+    exit 0;
+}
+
+##############################################################################
+# Documentation
+##############################################################################
+
+=head1 NAME
+
+check_rxdebug - Check AFS servers for blocked connections in Nagios
+
+=head1 SYNOPSIS
+
+check_rxdebug [B<-hV>] [B<-c> I<threshold>] [B<-w> I<threshold>]
+[B<-t> I<timeout>] B<-H> I<host>
+
+=head1 DESCRIPTION
+
+B<check_rxdebug> is a Nagios plugin for checking AFS file servers to see if
+there are client connections waiting for a free thread. If there are more
+than a few of these, AFS performance tends to be very slow; this is a fairly
+reliable way to catch overloaded file servers. By default, B<check_rxdebug>
+returns a critical error if there are eight or more connections waiting
+for a free thread and a warning if there are two or more. These
+thresholds can be changed with the B<-c> and B<-w> options.
+
+B<check_rxdebug> will always print out a single line of output including the
+number of blocked connections, displaying whether this is critical, a
+warning, or okay.
+
+=head1 OPTIONS
+
+=over 4
+
+=item B<-c> I<threshold>, B<--critical>=I<threshold>
+
+Change the critical blocked connection count threshold to I<threshold>,
+which should be an integer. The default is 8.
+
+=item B<-H> I<host>, B<--hostname>=I<host>
+
+The AFS file server whose connections B<check_rxdebug> should check. This
+option is required.
+
+=item B<-h>, B<--help>
+
+Print out this documentation (which is done simply by feeding the script
+to C<perldoc -t>).
+
+=item B<-t> I<timeout>, B<--timeout>=I<timeout>
+
+Change the timeout for the B<rxdebug> command. The default timeout is 60
+seconds.
+
+=item B<-V>, B<--version>
+
+Print out the version of B<check_rxdebug> and quit.
+
+=item B<-w> I<threshold>, B<--warning>=I<threshold>
+
+Change the warning blocked connection threshold to I<threshold>, which
+should be an integer. The default is 2.
+
+=back
+
+=head1 EXIT STATUS
+
+B<check_rxdebug> follows the standard Nagios exit status requirements. This
+means that it will exit with status 0 if there are no problems, with status
+1 if there is a warning, and with status 2 if there is a critical problem.
+For other errors, such as invalid syntax, B<check_rxdebug> will exit with
+status 3.
+
+=head1 BUGS
+
+The standard B<-v> verbose Nagios plugin option is not supported, although
+it's not entirely clear what it would add.
+
+The usage message for invalid options and for the B<-h> option doesn't
+conform to Nagios standards.
+
+=head1 CAVEATS
+
+This script does not use the Nagios util library or any of the defaults that
+it provides, which makes it somewhat deficient as a Nagios plugin. This is
+intentional, though, since this script can be used with other monitoring
+systems as well. It's not clear what a good solution to this would be.
+
+=head1 SEE ALSO
+
+The current version of this and other AFS monitoring plugins for Nagios are
+available from the AFS monitoring tools page at
+L.
+
+=head1 AUTHORS
+
+The original idea behind this script was from Neil Crellin. It was updated
+by Quanah Gibson-Mount to work with Nagios, and then further updated by Russ
+Allbery to support more standard options and to use a
+more uniform coding style.
+
+=head1 COPYRIGHT AND LICENSE
+
+Copyright 2003, 2004, 2005 Board of Trustees, Leland Stanford Jr. University.
+
+This program is free software; you may redistribute it and/or modify it
+under the same terms as Perl itself.
+
+=cut
diff --git a/dsa-nagios-checks/checks/dsa-check-afs-space b/dsa-nagios-checks/checks/dsa-check-afs-space
new file mode 100644
index 0000000..0df6771
--- /dev/null
+++ b/dsa-nagios-checks/checks/dsa-check-afs-space
@@ -0,0 +1,229 @@
+#!/usr/bin/perl -w
+$ID = q$Id: check_afsspace,v 1.16 2006/03/17 23:06:54 quanah Exp $;
+#
+# check_afsspace -- Monitor AFS disk space usage under Nagios.
+#
+# Written by Susan Feng
+# Updated by Russ Allbery
+# Copyright 2003, 2004 Board of Trustees, Leland Stanford Jr. University
+#
+# This program is free software; you may redistribute it and/or modify it
+# under the same terms as Perl itself.
+#
+# Expects a host with the -H option and checks the partition usage with
+# vos partinfo. Exits with status 1 if the used space reaches the warning
+# percentage and with status 2 if the used space reaches the critical
+# percentage (this works with the Nagios check architecture).
+
+##############################################################################
+# Site configuration
+##############################################################################
+
+# The default percentage full at which to warn and at which to send a critical
+# alert. These can be overridden with the -w and -c command-line options.
+$WARNINGS = 85;
+$CRITICAL = 90;
+
+# The default timeout in seconds (implemented by alarm) for vos partinfo.
+$TIMEOUT = 300;
+
+# The full path to vos. Make sure that this is on local disk so that
+# monitoring doesn't have an AFS dependency.
+($VOS) = grep { -x $_ } qw(/usr/bin/vos /usr/local/bin/vos);
+$VOS ||= '/usr/bin/vos';
+
+##############################################################################
+# Modules and declarations
+##############################################################################
+
+require 5.003;
+
+use strict;
+use vars qw($CRITICAL $ID $TIMEOUT $VOS $WARNINGS);
+
+use Getopt::Long qw(GetOptions);
+
+##############################################################################
+# Implementation
+##############################################################################
+
+# Parse command line options. -H is mandatory; vos partinfo needs a server.
+my ($help, $host, $version);
+Getopt::Long::config ('bundling', 'no_ignore_case');
+GetOptions ('critical|c=i' => \$CRITICAL,
+            'hostname|H=s' => \$host,
+            'help|h'       => \$help,
+            'timeout|t=i'  => \$TIMEOUT,
+            'version|V'    => \$version,
+            'warning|w=i'  => \$WARNINGS) or exit 3;
+if ($help) {
+    print "Feeding myself to perldoc, please wait....\n";
+    exec ('perldoc', '-t', $0) or die "Cannot fork: $!\n";
+} elsif ($version) {
+    my $version = join (' ', (split (' ', $ID))[1..3]);
+    $version =~ s/,v\b//;
+    $version =~ s/(\S+)$/($1)/;
+    $version =~ tr%/%-%;
+    print $version, "\n";
+    exit 0;
+}
+if (@ARGV || !defined ($host)) {
+    warn "Usage: $0 [-hV] [-c <percent>] [-w <percent>] [-t <timeout>] -H <host>\n";
+    exit 3;
+}
+if ($WARNINGS > $CRITICAL) {
+    warn "$0: warning level $WARNINGS greater than critical level $CRITICAL\n";
+    exit 3;
+}
+
+# Set up the alarm.
+$SIG{ALRM} = sub {
+    print "AFS CRITICAL - network timeout after $TIMEOUT seconds\n";
+    exit 2;
+};
+alarm ($TIMEOUT);
+
+# Get the partinfo information and calculate the percentage used for each
+# partition. Accumulate critical messages in @critical and warnings in
+# @warnings. Accumulate all percentages in @all.
+my (@critical, @warnings, @all);
+my @data = `$VOS partinfo '$host' 2> /dev/null`;
+if ($? != 0) {
+    print "AFS CRITICAL - cannot contact server\n";
+    exit 2;
+}
+for (@data) {
+    my ($partition, $free, $total) = (split)[4,5,11];
+    # Skip anything that isn't a partition line with a usable total so that
+    # stray output can't trigger a division by zero.
+    next unless defined ($total) && $total =~ /^\d+$/ && $total > 0;
+    my $percent = int ((($total - $free) / $total) * 100);
+    if ($percent >= $CRITICAL) {
+        push (@critical, "$partition$percent% (free $free)");
+    } elsif ($percent >= $WARNINGS) {
+        push (@warnings, "$partition$percent% (free $free)");
+    }
+    push (@all, "$partition$percent%");
+}
+
+# Exit with the appropriate error messages.
+if (@critical) {
+    print "AFS CRITICAL - @critical\n";
+    exit 2;
+} elsif (@warnings) {
+    print "AFS WARNING - @warnings\n";
+    exit 1;
+} else {
+    print "AFS OK - @all\n";
+    exit 0;
+}
+
+##############################################################################
+# Documentation
+##############################################################################
+
+=head1 NAME
+
+check_afsspace - Monitor AFS disk space usage under Nagios
+
+=head1 SYNOPSIS
+
+check_afsspace [B<-hV>] [B<-c> I<threshold>] [B<-w> I<threshold>]
+[B<-t> I<timeout>] B<-H> I<host>
+
+=head1 DESCRIPTION
+
+B<check_afsspace> is a Nagios plugin for checking free space on AFS server
+partitions. It uses C<vos partinfo> to obtain the free space on the
+partitions on an AFS server and will return an alert if the percentage of
+used space reaches a threshold. By default, it returns a critical error if
+the used space is at least 90% and a warning if it is at least 85%
+(changeable with the B<-c> and B<-w> options).
+
+B<check_afsspace> will always print out a single line of output, giving the
+critical errors if any, otherwise giving the warnings if any, otherwise
+listing in an abbreviated form the percentage used for all partitions.
+
+=head1 OPTIONS
+
+=over 4
+
+=item B<-c> I<threshold>, B<--critical>=I<threshold>
+
+Change the critical percentage threshold to I<threshold>, which should be an
+integer percentage. The default is 90.
+
+=item B<-H> I<host>, B<--hostname>=I<host>
+
+The AFS file server whose free space B<check_afsspace> should check. This
+option is required.
+
+=item B<-h>, B<--help>
+
+Print out this documentation (which is done simply by feeding the script
+to C<perldoc -t>).
+
+=item B<-t> I<timeout>, B<--timeout>=I<timeout>
+
+Change the timeout for the C<vos partinfo> command. The default timeout is
+300 seconds.
+
+=item B<-V>, B<--version>
+
+Print out the version of B<check_afsspace> and quit.
+
+=item B<-w> I<threshold>, B<--warning>=I<threshold>
+
+Change the warning percentage threshold to I<threshold>, which should be an
+integer percentage. The default is 85.
+
+=back
+
+=head1 EXIT STATUS
+
+B<check_afsspace> follows the standard Nagios exit status requirements.
+This means that it will exit with status 0 if there are no problems, with
+status 2 if there is at least one critical partition for that server, and
+with status 1 if there are no critical partitions but at least one warning
+partition. For other errors, such as invalid syntax, B<check_afsspace> will
+exit with status 3.
+
+=head1 BUGS
+
+The standard B<-v> verbose Nagios plugin option is not supported and should
+be. (For example, under B<-vv> we would want to show the actual total,
+free, and used byte counts, not just the percentages.)
+
+The usage message for invalid options and for the B<-h> option doesn't
+conform to Nagios standards.
+
+=head1 CAVEATS
+
+This script does not use the Nagios util library or any of the defaults that
+it provides, which makes it somewhat deficient as a Nagios plugin. This is
+intentional, though, since this script can be used with other monitoring
+systems as well. It's not clear what a good solution to this would be.
+
+=head1 SEE ALSO
+
+vos(1)
+
+The current version of this and other AFS monitoring plugins for Nagios are
+available from the AFS monitoring tools page at
+L.
+
+=head1 AUTHORS
+
+Originally written by Susan Feng for use with mon. Updated by Quanah
+Gibson-Mount to work with Nagios, and then further updated by Russ Allbery
+to support more standard options and to use a more
+uniform coding style.
+
+=head1 COPYRIGHT AND LICENSE
+
+Copyright 2003, 2004 Board of Trustees, Leland Stanford Jr. University.
+
+This program is free software; you may redistribute it and/or modify it
+under the same terms as Perl itself.
+
+=cut
diff --git a/dsa-nagios-checks/checks/dsa-check-afs-udebug b/dsa-nagios-checks/checks/dsa-check-afs-udebug
new file mode 100644
index 0000000..ba17d0b
--- /dev/null
+++ b/dsa-nagios-checks/checks/dsa-check-afs-udebug
@@ -0,0 +1,203 @@
+#!/usr/bin/perl -w
+$ID = q$Id: check_udebug,v 1.3 2006/03/17 23:06:54 quanah Exp $;
+#
+# check_udebug -- Check AFS database servers using udebug for Nagios.
+#
+# Written by Russ Allbery
+# Copyright 2004 Board of Trustees, Leland Stanford Jr. University
+#
+# This program is free software; you may redistribute it and/or modify it
+# under the same terms as Perl itself.
+#
+# Takes a hostname and a port number and checks the udebug output for that
+# host and port. Reports an error if the recovery state is not 1f on the sync
+# site (ensuring that it considers all of the other servers up-to-date) or if
+# any of the servers don't believe there is a sync site.
+
+##############################################################################
+# Site configuration
+##############################################################################
+
+# The default timeout in seconds (implemented by alarm) for udebug.
+$TIMEOUT = 10;
+
+# The full path to udebug. Make sure that this is on local disk so that
+# monitoring doesn't have an AFS dependency.
+($UDEBUG) = grep { -x $_ } qw(/usr/bin/udebug /usr/local/bin/udebug);
+$UDEBUG ||= '/usr/bin/udebug';
+
+##############################################################################
+# Modules and declarations
+##############################################################################
+
+require 5.003;
+
+use strict;
+use vars qw($ID $TIMEOUT $UDEBUG);
+
+use Getopt::Long qw(GetOptions);
+
+##############################################################################
+# Implementation
+##############################################################################
+
+# Parse command line options.
+my ($help, $host, $port, $version);
+Getopt::Long::config ('bundling', 'no_ignore_case');
+GetOptions ('hostname|H=s' => \$host,
+            'help|h'       => \$help,
+            'port|p=i'     => \$port,
+            'timeout|t=i'  => \$TIMEOUT,
+            'version|V'    => \$version) or exit 3;
+if ($help) {
+    print "Feeding myself to perldoc, please wait....\n";
+    exec ('perldoc', '-t', $0) or die "Cannot fork: $!\n";
+} elsif ($version) {
+    my $version = join (' ', (split (' ', $ID))[1..3]);
+    $version =~ s/,v\b//;
+    $version =~ s/(\S+)$/($1)/;
+    $version =~ tr%/%-%;
+    print $version, "\n";
+    exit 0;
+}
+if (@ARGV || !(defined ($host) && defined ($port))) {
+    warn "Usage: $0 [-hV] [-t <timeout>] -H <host> -p <port>\n";
+    exit 3;
+}
+
+# Set up the alarm.
+$SIG{ALRM} = sub {
+    print "UBIK CRITICAL - network timeout after $TIMEOUT seconds\n";
+    exit 2;
+};
+alarm ($TIMEOUT);
+
+# Run udebug and parse the output. We're looking for three things: first,
+# we're looking to see if this host claims to be the sync site. If so, check
+# that recovery state is 1f. Otherwise, make sure that there's a defined sync
+# host.
+unless (open (UDEBUG, "$UDEBUG $host $port |")) {
+    warn "$0: cannot run udebug\n";
+    exit 3;
+}
+my ($issync, $recovery, $synchost);
+while (<UDEBUG>) {
+    $issync = 1 if /^I am sync site /;
+    $recovery = 1 if /^Recovery state 1f/;
+    $synchost = 1 if /^Sync host \d+(\.\d+){3} was set /;
+}
+close UDEBUG;
+if ($? != 0) {
+    print "UBIK CRITICAL - udebug failed\n";
+    exit 2;
+}
+
+# Check the results.
+if ($issync && !$recovery) {
+    print "UBIK CRITICAL - recovery state not 1f\n";
+    exit 2;
+} elsif (!$issync && !$synchost) {
+    print "UBIK CRITICAL - no sync site\n";
+    exit 2;
+} else {
+    print "UBIK OK\n";
+    exit 0;
+}
+
+##############################################################################
+# Documentation
+##############################################################################
+
+=head1 NAME
+
+check_udebug - Check AFS database servers using udebug for Nagios
+
+=head1 SYNOPSIS
+
+check_udebug [B<-hV>] [B<-t> I<timeout>] B<-H> I<host> B<-p> I<port>
+
+=head1 DESCRIPTION
+
+B<check_udebug> is a Nagios plugin for checking AFS database servers to make
+sure the Ubik replication between the database servers is running correctly.
+B<udebug> is used to connect to the specified port, which should generally
+be one of 7002 (ptserver), 7003 (vlserver), or 7004 (kaserver), on the
+specified server. The resulting output is checked to make sure that the
+recovery state is 1f if that server is the sync site, or that a sync site is
+known if that server doesn't claim to be the sync site.
+
+B<check_udebug> will always print out a single line of output. That line
+will be C<UBIK OK> if everything is fine, or C<UBIK CRITICAL -> followed by
+an error message otherwise.
+
+=head1 OPTIONS
+
+=over 4
+
+=item B<-H> I<host>, B<--hostname>=I<host>
+
+The AFS database server whose Ubik status B<check_udebug> should check.
+This option is required.
+
+=item B<-h>, B<--help>
+
+Print out this documentation (which is done simply by feeding the script
+to C<perldoc -t>).
+
+=item B<-p> I<port>, B<--port>=I<port>
+
+The port to connect to on the AFS database server. This should generally be
+one of 7002 (ptserver), 7003 (vlserver), or 7004 (kaserver). This option is
+required.
+
+=item B<-t> I<timeout>, B<--timeout>=I<timeout>
+
+Change the timeout for the B<udebug> command. The default timeout is 10
+seconds.
+
+=item B<-V>, B<--version>
+
+Print out the version of B<check_udebug> and quit.
+
+=back
+
+=head1 EXIT STATUS
+
+B<check_udebug> follows the standard Nagios exit status requirements. This
+means that it will exit with status 0 if there are no problems or with
+status 2 if there are critical problems. For other errors, such as invalid
+syntax, B<check_udebug> will exit with status 3.
+
+=head1 BUGS
+
+The standard B<-v> verbose Nagios plugin option is not supported. It should
+print out the full B<udebug> output.
+
+The usage message for invalid options and for the B<-h> option doesn't
+conform to Nagios standards.
+
+=head1 CAVEATS
+
+This script does not use the Nagios util library or any of the defaults that
+it provides, which makes it somewhat deficient as a Nagios plugin. This is
+intentional, though, since this script can be used with other monitoring
+systems as well. It's not clear what a good solution to this would be.
+
+=head1 SEE ALSO
+
+The current version of this and other AFS monitoring plugins for Nagios are
+available from the AFS monitoring tools page at
+L.
+
+=head1 AUTHORS
+
+Russ Allbery
+
+=head1 COPYRIGHT AND LICENSE
+
+Copyright 2004 Board of Trustees, Leland Stanford Jr. University.
+
+This program is free software; you may redistribute it and/or modify it
+under the same terms as Perl itself.
+
+=cut