Nagios – BAFM

Generate Nagios config for check_netapp-api.pl

February 5, 2013June 21, 2013 Christian Leave a comment

As so often, I wanted a script, that’ll crawl my filers and regenerate the configuration if there are any new volumes/snapvaults/snapmirrors or if one of them has been removed.

#!/bin/bash
#
# Regenerate the per-filer Nagios service definitions that drive
# check_netapp-api.pl.
#
# Every filer with a host definition matching
# /etc/nagios/objects/hosts/san/fas*{a,b}.cfg is queried over ssh, and
# <host>-vol.cfg in the same directory is rewritten with:
#   - one LUNSPACE service per volume,
#   - one SNAPRESERVE service per volume whose snap reserve is not "0%",
#   - one SNAPVAULT / SNAPMIRROR service per relation whose path (column 2 of
#     the status output) starts with "<host>:".
# The API credentials embedded in the check commands are read from
# /etc/netapp-sdk/<host> (lines containing USER= and PASS=).
# Everything appended to the config file is also echoed on stdout.

readonly NAGIOS_SAN_DIR=/etc/nagios/objects/hosts/san
readonly NETAPP_SDK_DIR=/etc/netapp-sdk

#######################################
# Print the value of a KEY=value line from a credentials file.
# Arguments: $1 - key name (e.g. USER), $2 - path of the credentials file
# Outputs:   everything after the first '=' of each matching line, so that
#            values which themselves contain '=' are not truncated
#######################################
read_credential() {
	local key=$1 file=$2

	grep -- "${key}=" "$file" | cut -d= -f2-
}

#######################################
# Reduce `vol status` output (stdin) to a sorted, de-duplicated list of
# volume names: first column only, names starting with "vol", and skipping
# names ending in "vol0" (presumably the root volume).
#######################################
filter_volumes() {
	awk '{ print $1 }' | grep '^vol' | sort -u | grep -v 'vol0$'
}

#######################################
# Extract the reserve percentage from `snap reserve <volume>` output (stdin).
# Takes the 7th space-separated field.
# NOTE(review): assumes the output has the form
#   "Volume <name>: current snapshot reserve is <N>% or ..." - verify against
#   the Data ONTAP release in use.
#######################################
parse_snap_reserve() {
	cut -d' ' -f7
}

#######################################
# Print one Nagios service definition followed by a blank line.
# Arguments: $1 - complete check_command value
#            $2 - check_interval
#            $3 - host_name
#            $4 - service_description
#            $5 - optional text for a "# ..." comment line placed in front of
#                 service_description
#######################################
emit_service() {
	local check_command=$1 interval=$2 host=$3 description=$4 note=${5:-}

	printf 'define service {\n'
	printf '\tuse\t\t\t\tgeneric-service-san-perfdata\n'
	printf '\n'
	printf '\tcheck_command\t\t\t%s\n' "$check_command"
	printf '\tcheck_interval\t\t\t%s\n' "$interval"
	printf '\thost_name\t\t\t%s\n' "$host"
	printf '\tnotifications_enabled\t\t0\n'
	printf '\tnotification_interval\t\t720\n'
	if [[ -n "$note" ]]; then
		printf '\t# %s\n' "$note"
	fi
	printf '\tservice_description\t\t%s\n' "$description"
	printf '}\n'
	printf '\n'
}

#######################################
# Query one filer over ssh and print all service definitions for it.
# Arguments: $1 - filer host name (also the name of its credentials file)
# Outputs:   Nagios service definitions on stdout
#######################################
generate_host_services() {
	local host=$1
	local user pass volume reserve relation vol
	local -a volumes=() relations=()

	# The credentials are per host, so read them once instead of per volume.
	user="$(read_credential USER "${NETAPP_SDK_DIR}/${host}")"
	pass="$(read_credential PASS "${NETAPP_SDK_DIR}/${host}")"

	# Get the volume list
	mapfile -t volumes < <(ssh "$host" vol status | filter_volumes)
	for volume in "${volumes[@]}"; do
		# Disabled: plain volume free-space check (template generic-service):
		#   check_command        check_netapp-volfree!$user!$pass!${volume}!92!98
		#   check_interval       5
		#   service_description  VOLSPACE ${volume}
		printf '\n'
		emit_service "check_netapp-lunspace!${user}!${pass}!${volume}" \
			5 "$host" "LUNSPACE ${volume}"

		# Volumes without a snapshot reserve get no SNAPRESERVE check.
		reserve="$(ssh "$host" snap reserve "$volume" | parse_snap_reserve)"
		if [[ "$reserve" != "0%" ]]; then
			emit_service "check_netapp-snapreserve!${user}!${pass}!${volume}" \
				10 "$host" "SNAPRESERVE ${volume}" $'SR:\t\t\t\t'"${reserve}"
		fi
	done

	# Check snapvault foo
	mapfile -t relations < <(ssh "$host" snapvault status -l 2>/dev/null \
		| awk '{ print $2 }' | grep vol)
	for relation in "${relations[@]}"; do
		# only do the checks on sv_secondary
		if [[ "${relation%%:*}" == "$host" ]]; then
			vol="$(printf '%s\n' "$relation" | cut -d/ -f3)"
			emit_service "check_netapp-snapvault!${user}!${pass}!${vol}!38!42!" \
				60 "$host" "SNAPVAULT ${vol}"
		fi
	done

	# Check snapmirror foo
	mapfile -t relations < <(ssh "$host" snapmirror status 2>/dev/null \
		| awk '{ print $2 }' | grep vol | grep -F -- "$host")
	for relation in "${relations[@]}"; do
		# only do the checks on sm_secondary
		if [[ "${relation%%:*}" == "$host" ]]; then
			vol="$(printf '%s\n' "$relation" | cut -d/ -f3 | cut -d: -f2)"
			emit_service "check_netapp-snapmirror!${user}!${pass}!${vol}!38!42!" \
				60 "$host" "SNAPMIRROR ${vol}"
		fi
	done
}

#######################################
# Rewrite <host>-vol.cfg for every fas*a / fas*b host definition found.
#######################################
main() {
	local cfg host output_file

	# Glob instead of parsing `ls`; an unmatched pattern stays literal and is
	# skipped by the -e test.
	for cfg in "${NAGIOS_SAN_DIR}"/fas*{a,b}.cfg; do
		[[ -e "$cfg" ]] || continue

		# Host name = file name up to the first dot.
		host="${cfg##*/}"
		host="${host%%.*}"
		output_file="${NAGIOS_SAN_DIR}/${host}-vol.cfg"

		# Clear the output file
		printf '\n' > "$output_file"

		generate_host_services "$host" | tee -a "$output_file"
	done
}

main "$@"

#!/bin/bash

FAS_HOSTS="$( ls /etc/nagios/objects/hosts/san/fas*{a,b}.cfg | cut -d/ -f7 | cut -d. -f1 )"

for host in $FAS_HOSTS; do

OUTPUT_FILE=/etc/nagios/objects/hosts/san/$host-vol.cfg

# Clear the output file

echo "" > $OUTPUT_FILE

# Get the volume list

for volume in `ssh $host vol status | awk '{ print $1 }' | grep ^vol | sort -u | grep -v vol0$`; do

user="$( grep "USER=" /etc/netapp-sdk/$host | cut -d= -f2 )"

pass="$( grep "PASS=" /etc/netapp-sdk/$host | cut -d= -f2 )"

# echo "define service {"

# echo " use generic-service"

# echo ""

# echo " check_command check_netapp-volfree!$user!$pass!${volume}!92!98"

# echo " check_interval 5"

# echo " host_name ${host}"

# echo " notifications_enabled 0"

# echo " notification_interval 720"

# echo " service_description VOLSPACE ${volume}"

# echo "}"

echo

echo "define service {"

echo " use generic-service-san-perfdata"

echo ""

echo " check_command check_netapp-lunspace!$user!$pass!${volume}"

echo " check_interval 5"

echo " host_name ${host}"

echo " notifications_enabled 0"

echo " notification_interval 720"

echo " service_description LUNSPACE ${volume}"

echo "}"

echo

SR="$( ssh $host snap reserve $volume | cut -d -f7 )"

if [ "$SR" != "0%" ] ; then

echo "define service {"

echo " use generic-service-san-perfdata"

echo ""

echo " check_command check_netapp-snapreserve!$user!$pass!${volume}"

echo " check_interval 10"

echo " host_name ${host}"

echo " notifications_enabled 0"

echo " notification_interval 720"

echo " # SR: $SR"

echo " service_description SNAPRESERVE ${volume}"

echo "}"

echo

done | tee -a $OUTPUT_FILE

# Check snapvault foo

for sv in `ssh $host snapvault status -l 2>/dev/null | awk '{ print $2 }' | grep vol`; do

# only do the checks on sv_secondary

if [ "$( echo $sv | grep $host | cut -d: -f1 )" == "${host}" ]; then

vol="$( echo $sv | cut -d/ -f3 )"

user="$( grep "USER=" /etc/netapp-sdk/$host | cut -d= -f2 )"

pass="$( grep "PASS=" /etc/netapp-sdk/$host | cut -d= -f2 )"

echo "define service {"

echo " use generic-service-san-perfdata"

echo ""

echo " check_command check_netapp-snapvault!$user!$pass!$vol!38!42!"

echo " check_interval 60"

echo " host_name ${host}"

echo " notifications_enabled 0"

echo " notification_interval 720"

echo " service_description SNAPVAULT ${vol}"

echo "}"

echo

done | tee -a $OUTPUT_FILE

# Check snapmirror foo

for sm in `ssh $host snapmirror status 2>/dev/null | awk '{ print $2 }' | grep vol | grep $host`; do

# only do the checks on sm_secondary

if [ "$( echo $sm | grep $host | cut -d: -f1 )" == "${host}" ]; then

vol="$( echo $sm | cut -d/ -f3 | cut -d: -f2 )"

user="$( grep "USER=" /etc/netapp-sdk/$host | cut -d= -f2 )"

pass="$( grep "PASS=" /etc/netapp-sdk/$host | cut -d= -f2 )"

echo "define service {"

echo " use generic-service-san-perfdata"

echo ""

echo " check_command check_netapp-snapmirror!$user!$pass!$vol!38!42!"

echo " check_interval 60"

echo " host_name ${host}"

echo " notifications_enabled 0"

echo " notification_interval 720"

echo " service_description SNAPMIRROR ${vol}"

echo "}"

echo

done | tee -a $OUTPUT_FILE

done

Generate Nagios config for NetApp filers

January 1, 2013June 21, 2013 Christian Leave a comment

At some point in the last few weeks, I repeatedly had to recreate my Nagios config for currently six filers. After doing that a few times, I ended up (like sooo often) writing a short Bash script, that’ll do this for me – without any fuss.

The only thing the script needs is that the filers and their service processors (the -sp names) are registered in DNS … Here’s an example:

fas3240a      IN   A     172.31.76.150
fas3240a-sp   IN   A     172.31.74.150
fas3240b      IN   A     172.31.76.151
fas3240b-sp   IN   A     172.31.74.151

fas3240a IN A 172.31.76.150

fas3240a-sp IN A 172.31.74.150

fas3240b IN A 172.31.76.151

fas3240b-sp IN A 172.31.74.151

With that done, the script will create the necessary Nagios config for those filers.

NetApp: Monitoring of SnapVault/SnapMirror/LUN/Snapshot information with Nagios

December 29, 2012September 10, 2014 Christian 4 Comments

As I wrote before, we have a bunch of filers (and a ton of volumes w/ luns on them), that I need to monitor. At first, I tried the existing NetApp Nagios-Plugin(s), but they all use SNMP and with that I can either watch all volumes or none. And that didn’t satisfy me.

Don’t get me wrong, the existing plugins are okay and I still use them for stuff (like GLOBALSTATUS or FAN/CPU/POWER) which isn’t present in the API or real hard to get at, however I wanted more. So I ended up looking at the NetApp API, and ended up writing a “short” plugin for Nagios using Perl.

Maybe if I’m ever bored, I’ll rewrite it using C, but for now the Perl plugin has to suffice.

So far the plugin supports the following things:

Monitoring FlexVolumes (simply watching the free space)
Monitoring LUN space (the allocated space inside a FlexVolume for iSCSI/FC LUNs)
Monitoring Snapshot space (the allocated space inside a FlexVolume for Snapshots)
Monitoring SnapVault relations (and their age)
Monitoring SnapMirror relations (and their age)

The plugin will return performance data for most (if not all) of those classes. It needs a user on the filer you wish to monitor – which sadly needs to have the admin role.

Monitoring Brocade FC switches with SNMP/Nagios

December 21, 2009August 8, 2014 Christian 1 Comment

I looked into the mess a bit more, and as it turns out, the weird crap I was talking about only happens if you have a port with LossofSynchronization, LossofSignal or LinkFailures value with the base of ten (i.e. 10, 101 or 10.000).

Additionally, the OID’s for those three failure elements seem to be dependent on the firmware version, as with 6.3.x they appear as different OIDs. So I may need to introduce another command-line switch, which selects the firmware version and depending on that, the OID.

Even despite those problems I just described, I ended up using the plugin to watch our SAN infrastructure. I even wrote a simple pnp4nagios template, so all the data would show up in a single graph and not a graph per data source.

check_snmp_brocade_fcport Graph: 4 Hours

Monitoring Brocade FC switches with Nagios

November 23, 2009August 8, 2014 Christian 1 Comment

The last four days I spent looking for ways on monitoring a Brocade Fibrechannel switch (in my case IBM 2145 B32/F40). The first thing I came up with, is using SNMP. As it was already configured for the previous monitoring with Munin, getting information should be quite easy. After looking through Google for a bit, there is already one script that worked for me.

The only trouble I had with that script is that it crams every single port into one result. As I wanted something that a) could watch a single port and b) return performance data, I went ahead and used the script to do a basic rewrite. But after a short while, I grew antsy and started writing a script from scratch, using the OIDs I got from that script and a Cacti template.

So far, I got a good plugin, but it’s still lacking a few things:

Support for warning/critical thresholds for each error category
Sadly the important errors (er_link_fail, er_loss_sync and er_loss_sig) are kept in a separate table structure (swEndDeviceRlsEntry), which I can’t seem to access right now; even though the entries are mandatory and according to the MIB should be at least read-only.
The plugin isn’t doing a proper $session->close(); . After moving the snmp stuff into a subroutine, Perl refuses to do the session closing. Don’t know why right now.

Right now, the plugin supports two modes. The first just checks if the port is operational and in sync and the second checks the port status, but also returns the performance data.

Only do a basic check if the Port is in operational status

./check_snmp_brocade_fcport.pl -H 10.0.0.50 -C public -P 2 -N
SNMP_BROCADE_FCPORT OK - FC port 0/2's swFCPortPhyState is inSync

./check_snmp_brocade_fcport.pl -H 10.0.0.50 -C public -P 2 -N

SNMP_BROCADE_FCPORT OK - FC port 0/2's swFCPortPhyState is inSync

Check the port status, but also return performance data

./check_snmp_brocade_fcport.pl -H 10.0.0.50 -C public -P 2
SNMP_BROCADE_FCPORT OK - FC port 0/2's swFCPortPhyState is inSync|stat_wtx=577976968;0;0;0;0 stat_wrx=4069984468;0;0;0;0 stat_ftx=422378205;0;0;0;0 stat_frx=123789748;0;0;0;0 er_enc_in=0;0;0;0;0 er_crc=0;0;0;0;0 er_trunc=0;0;0;0;0 er_toolong=0;0;0;0;0 er_bad_eof=0;0;0;0;0 er_enc_out=0;0;0;0;0 er_c3_timeout=0;0;0;0;0

./check_snmp_brocade_fcport.pl -H 10.0.0.50 -C public -P 2

SNMP_BROCADE_FCPORT OK - FC port 0/2's swFCPortPhyState is inSync|stat_wtx=577976968;0;0;0;0 stat_wrx=4069984468;0;0;0;0 stat_ftx=422378205;0;0;0;0 stat_frx=123789748;0;0;0;0 er_enc_in=0;0;0;0;0 er_crc=0;0;0;0;0 er_trunc=0;0;0;0;0 er_toolong=0;0;0;0;0 er_bad_eof=0;0;0;0;0 er_enc_out=0;0;0;0;0 er_c3_timeout=0;0;0;0;0

That might look like much, but Nagios is gonna pass everything after “|” to your performance data command.

List of OIDs, which hold the various information:

   swFCPort
   PhyState: .1.3.6.1.4.1.1588.2.1.1.1.6.2.1.3.
   OpStatus: .1.3.6.1.4.1.1588.2.1.1.1.6.2.1.4.
  AdmStatus: .1.3.6.1.4.1.1588.2.1.1.1.6.2.1.5.
  LinkState: .1.3.6.1.4.1.1588.2.1.1.1.6.2.1.6.
    TxWords: .1.3.6.1.4.1.1588.2.1.1.1.6.2.1.11. (stat_wtx)
    RxWords: .1.3.6.1.4.1.1588.2.1.1.1.6.2.1.12. (stat_wrx)
   TxFrames: .1.3.6.1.4.1.1588.2.1.1.1.6.2.1.13. (stat_ftx)
   RxFrames: .1.3.6.1.4.1.1588.2.1.1.1.6.2.1.14. (stat_frx)
 RxEncInFrs: .1.3.6.1.4.1.1588.2.1.1.1.6.2.1.21. (er_enc_in)
     RxCrcs: .1.3.6.1.4.1.1588.2.1.1.1.6.2.1.22. (er_crc)
   RxTruncs: .1.3.6.1.4.1.1588.2.1.1.1.6.2.1.23. (er_trunc)
 RxTooLongs: .1.3.6.1.4.1.1588.2.1.1.1.6.2.1.24. (er_toolong)
  RxBadEofs: .1.3.6.1.4.1.1588.2.1.1.1.6.2.1.25. (er_bad_eof)
RxEncOutFrs: .1.3.6.1.4.1.1588.2.1.1.1.6.2.1.26. (er_enc_out)
 C3Discards: .1.3.6.1.4.1.1588.2.1.1.1.6.2.1.28. (er_c3_timeout)

swEndDevice
LinkFailure: .1.3.6.1.4.1.1588.2.1.1.1.21.1.1.4. (er_link_fail)
   SyncLoss: .1.3.6.1.4.1.1588.2.1.1.1.21.1.1.5. (er_loss_sync)
    SigLoss: .1.3.6.1.4.1.1588.2.1.1.1.21.1.1.6. (er_loss_sig)

swFCPort

PhyState: .1.3.6.1.4.1.1588.2.1.1.1.6.2.1.3.

OpStatus: .1.3.6.1.4.1.1588.2.1.1.1.6.2.1.4.

AdmStatus: .1.3.6.1.4.1.1588.2.1.1.1.6.2.1.5.

LinkState: .1.3.6.1.4.1.1588.2.1.1.1.6.2.1.6.

TxWords: .1.3.6.1.4.1.1588.2.1.1.1.6.2.1.11. (stat_wtx)

RxWords: .1.3.6.1.4.1.1588.2.1.1.1.6.2.1.12. (stat_wrx)

TxFrames: .1.3.6.1.4.1.1588.2.1.1.1.6.2.1.13. (stat_ftx)

RxFrames: .1.3.6.1.4.1.1588.2.1.1.1.6.2.1.14. (stat_frx)

RxEncInFrs: .1.3.6.1.4.1.1588.2.1.1.1.6.2.1.21. (er_enc_in)

RxCrcs: .1.3.6.1.4.1.1588.2.1.1.1.6.2.1.22. (er_crc)

RxTruncs: .1.3.6.1.4.1.1588.2.1.1.1.6.2.1.23. (er_trunc)

RxTooLongs: .1.3.6.1.4.1.1588.2.1.1.1.6.2.1.24. (er_toolong)

RxBadEofs: .1.3.6.1.4.1.1588.2.1.1.1.6.2.1.25. (er_bad_eof)

RxEncOutFrs: .1.3.6.1.4.1.1588.2.1.1.1.6.2.1.26. (er_enc_out)

C3Discards: .1.3.6.1.4.1.1588.2.1.1.1.6.2.1.28. (er_c3_timeout)

swEndDevice

LinkFailure: .1.3.6.1.4.1.1588.2.1.1.1.21.1.1.4. (er_link_fail)

SyncLoss: .1.3.6.1.4.1.1588.2.1.1.1.21.1.1.5. (er_loss_sync)

SigLoss: .1.3.6.1.4.1.1588.2.1.1.1.21.1.1.6. (er_loss_sig)

The last three OIDs, as well as the ones in FCMGMT-MIB (as I mentioned in the TODO), sadly don’t exist (or I’m doing something wrong ? — no clue right now), so I can’t incorporate them into the script at this time.

However, I found something in a separate OID-tree (also the FCMGMT-MIB), which seems to be exactly what I’m looking for.

connUnitPortStatCount
LossofSynchronization: .1.3.6.1.3.94.4.5.1.44.16.0.0.5.30.52.240.185.0.0.0.0.0.0.0.0.<fcport>
         LossofSignal: .1.3.6.1.3.94.4.5.1.43.16.0.0.5.30.52.240.185.0.0.0.0.0.0.0.0.<fcport>
         LinkFailures: .1.3.6.1.3.94.4.5.1.39.16.0.0.5.30.52.240.185.0.0.0.0.0.0.0.0.<fcport>

connUnitPortStatCount

LossofSynchronization: .1.3.6.1.3.94.4.5.1.44.16.0.0.5.30.52.240.185.0.0.0.0.0.0.0.0.<fcport>

LossofSignal: .1.3.6.1.3.94.4.5.1.43.16.0.0.5.30.52.240.185.0.0.0.0.0.0.0.0.<fcport>

LinkFailures: .1.3.6.1.3.94.4.5.1.39.16.0.0.5.30.52.240.185.0.0.0.0.0.0.0.0.<fcport>

Only trouble with those OIDs is, that they are OCTET STRING’s, which right now just return crap (either nothing or just a new-line) with my script. Gonna have to work on that.

If you’re interested in the Perl script (for now, lacking some options, performance data, $session->close();), you’ll find it here.

Configuring nagios-plugins-zypper

November 12, 2009June 21, 2013 Christian Leave a comment

Since I’m running check_zypper via nrpe (which in turn runs as nobody), I need to set up sudo. In order for the plugin to work, we need to add the following line to /etc/sudoers (by means of visudo):

nobody ALL = NOPASSWD: /usr/bin/zypper sl, /usr/bin/zypper --non-interactive --no-gpg-checks --terse list-updates

nobody ALL = NOPASSWD: /usr/bin/zypper sl, /usr/bin/zypper --non-interactive --no-gpg-checks --terse list-updates

(Keep in mind this needs to be a single line …)

Praxisbuch Nagios by Tobias Scherbaum

July 25, 2009July 25, 2009 Christian Leave a comment

Tobi recently finished writing yet another book, which he also talked about in a blog post. Shortly after, I asked him a rather curious question. What exactly is the plant or animal on the cover of the book ? He was kind enough to send a voucher copy of the book my way.

He actually mentions it in the credits at the beginning of the book. Turns out it is an animal, a sea pen or sea feather (I’m guessing at Pennatula aculeata).

Now as for the content of the book itself, I do have to admit that I haven’t read the whole book. I just picked a few topics (SNMP-Traps with Nagios, notifications) which I did find rather well written. My (soon ex-) trainee, Michel, however already bugged Tobias about some errors in the book itself, or rather some changes which happened after 3.0.3 (that’s the Nagios version the book is based on).

All in all, I guess I can congratulate Tobias on yet another well-written book!

Nagios: Service Check Timed Out

April 3, 2009June 21, 2013 Christian 6 Comments

Since I got the pleasure of watching some Windows boxen with Nagios, I took the Windows Update plugin from Michal Jankowski and implemented it. It took me some time, to initially set up the nsclient++ correctly so it just works, but up till now the check plugin sometimes reported the usual “Service Check Timed Out”.

Usually I ended up increasing the cscript timeout, or the nsclient++ socket timeout, but it still kept showing up. Since I rely heavily on my surveillance tools, I have the demand, that as few as possible false positives show up. So I ended up chasing down this error today, and after that I have to say it was quite simple.

In my case, it wasn’t cscript (that timeout is set to 300 seconds), neither nsclient++ (socket timeout is set to 300 seconds too), nor the nrpe plugin itself (that has 300 seconds as well).

As it turns out, Nagios got an additional setting controlling these things, called service_check_timeout which defaults to 60 seconds. Sadly the plugin, or rather Windows needs longer than those 60 seconds to figure out whether or not it needs updating, thus Nagios is killing the plugin and returning a CRITICAL message.

After increasing the value of service_check_timeout that’ll be fixed hopefully.

Nagios: SNMP OID’s for IBM’s RSA II adapter

April 1, 2009June 21, 2013 Christian Leave a comment

Well, after some poking around I finally found some OID’s for the RSA’s (only through these two links: check_rsa_fan and check_rsa_temp).

For Nagios, I dismissed the fans, since the fan speed is only passed on in percent values. So I only added this:

# Host group for the Remote Supervisor Adapters; every service below is
# attached to this group via hostgroup_name.
define hostgroup{
  hostgroup_name                  rsa-snmp
  alias                           Remote Supervisor Adapter (allowing SNMP connections)
}

# Temperature check against OID ...2.1.2.1.1 (listed further down as CPU1).
# NOTE(review): the arguments after the OID look like warning threshold,
# critical threshold, unit and label - confirm against the definition of the
# check_rsa_snmpv1_public command.
define service{
  use                             generic-perfdata

  check_command                   check_rsa_snmpv1_public!.1.3.6.1.4.1.2.3.51.1.2.1.2.1.1!45!60!°C!Temperature CPU0!
  hostgroup_name                  rsa-snmp
  service_description             TEMP CPU0
}

# Temperature check against OID ...2.1.2.2.1 (listed further down as CPU2).
define service{
  use                             generic-perfdata

  check_command                   check_rsa_snmpv1_public!.1.3.6.1.4.1.2.3.51.1.2.1.2.2.1!45!60!°C!Temperature CPU1!
  hostgroup_name                  rsa-snmp
  service_description             TEMP CPU1
}

# Temperature check against OID ...2.1.5.1.0 (listed further down as Ambient),
# with lower limits (29/35) than the CPU checks.
define service{
  use                             generic-perfdata

  check_command                   check_rsa_snmpv1_public!.1.3.6.1.4.1.2.3.51.1.2.1.5.1.0!29!35!°C!Temperature Ambient!
  hostgroup_name                  rsa-snmp
  service_description             TEMP AMBIENT
}

define hostgroup{

hostgroup_name rsa-snmp

alias Remote Supervisor Adapter (allowing SNMP connections)

}

define service{

use generic-perfdata

check_command check_rsa_snmpv1_public!.1.3.6.1.4.1.2.3.51.1.2.1.2.1.1!45!60!°C!Temperature CPU0!

hostgroup_name rsa-snmp

service_description TEMP CPU0

}

define service{

use generic-perfdata

check_command check_rsa_snmpv1_public!.1.3.6.1.4.1.2.3.51.1.2.1.2.2.1!45!60!°C!Temperature CPU1!

hostgroup_name rsa-snmp

service_description TEMP CPU1

}

define service{

use generic-perfdata

check_command check_rsa_snmpv1_public!.1.3.6.1.4.1.2.3.51.1.2.1.5.1.0!29!35!°C!Temperature Ambient!

hostgroup_name rsa-snmp

service_description TEMP AMBIENT

}

Oh, and if anyone else is curious like me, here’s the list with the OID’s, courtesy of Gerhard Gschlad and Leonardo Calamai.

For the fans:

Fan1: .1.3.6.1.4.1.2.3.51.1.2.3.1.0
Fan2: .1.3.6.1.4.1.2.3.51.1.2.3.2.0
Fan3: .1.3.6.1.4.1.2.3.51.1.2.3.3.0
Fan4: .1.3.6.1.4.1.2.3.51.1.2.3.4.0
Fan5: .1.3.6.1.4.1.2.3.51.1.2.3.5.0
Fan6: .1.3.6.1.4.1.2.3.51.1.2.3.6.0
Fan7: .1.3.6.1.4.1.2.3.51.1.2.3.7.0
Fan8: .1.3.6.1.4.1.2.3.51.1.2.3.8.0
Fan9: .1.3.6.1.4.1.2.3.51.1.2.3.9.0
Fan10: .1.3.6.1.4.1.2.3.51.1.2.3.10.0
Fan11: .1.3.6.1.4.1.2.3.51.1.2.3.11.0
Fan12: .1.3.6.1.4.1.2.3.51.1.2.3.12.0

Fan1: .1.3.6.1.4.1.2.3.51.1.2.3.1.0

Fan2: .1.3.6.1.4.1.2.3.51.1.2.3.2.0

Fan3: .1.3.6.1.4.1.2.3.51.1.2.3.3.0

Fan4: .1.3.6.1.4.1.2.3.51.1.2.3.4.0

Fan5: .1.3.6.1.4.1.2.3.51.1.2.3.5.0

Fan6: .1.3.6.1.4.1.2.3.51.1.2.3.6.0

Fan7: .1.3.6.1.4.1.2.3.51.1.2.3.7.0

Fan8: .1.3.6.1.4.1.2.3.51.1.2.3.8.0

Fan9: .1.3.6.1.4.1.2.3.51.1.2.3.9.0

Fan10: .1.3.6.1.4.1.2.3.51.1.2.3.10.0

Fan11: .1.3.6.1.4.1.2.3.51.1.2.3.11.0

Fan12: .1.3.6.1.4.1.2.3.51.1.2.3.12.0

And for the temperatures:

CPU1: .1.3.6.1.4.1.2.3.51.1.2.1.2.1.1
CPU2: .1.3.6.1.4.1.2.3.51.1.2.1.2.2.1
CPU3: .1.3.6.1.4.1.2.3.51.1.2.1.2.3.1
CPU4: .1.3.6.1.4.1.2.3.51.1.2.1.2.4.1
Ambient: .1.3.6.1.4.1.2.3.51.1.2.1.5.1.0

CPU1: .1.3.6.1.4.1.2.3.51.1.2.1.2.1.1

CPU2: .1.3.6.1.4.1.2.3.51.1.2.1.2.2.1

CPU3: .1.3.6.1.4.1.2.3.51.1.2.1.2.3.1

CPU4: .1.3.6.1.4.1.2.3.51.1.2.1.2.4.1

Ambient: .1.3.6.1.4.1.2.3.51.1.2.1.5.1.0

I just found a proper list of OID’s for the IBM RSA adapter. That’s rather nice, since I really was looking for the OID’s for the VRM failure OID and other warning/critical events.

Nagios: check_snmp again

February 27, 2009June 21, 2013 Christian Leave a comment

Well, today I had to grind my head again, regarding the way check_snmp handles WARNING and CRITICAL events. From my point of view, check_snmp is really just retarded sometimes.

As you know, all the other plugins accept WARNING and CRITICAL-thresholds based on the calculation, if the return integer is above this threshold it reached WARNING/CRITICAL state. But check_snmp doesn’t play that way.

It expects only ranges, which are NOT gonna result in warning or critical events. Which is kinda stupid, since you gotta rethink twice about the thresholds 😛

# check_snmp based service for the "Chassis Cooling - Bay 1" reading (unit RPM)
# on host ibm-bc1-mgmt.
# NOTE(review): the two arguments after the OID are check_snmp ranges, not
# simple upper limits - confirm which one the check_snmpv1_public command maps
# to warning and which to critical.
define service {
  use                   generic-service
  host_name             ibm-bc1-mgmt
  service_description   Chassis Cooling - Bay 1
  # Kept on a single line: a directive's value may not be wrapped across
  # lines without a continuation.
  check_command         check_snmpv1_public!.1.3.6.1.4.1.2.3.51.2.2.3.20.0!1900:8000!1900:0,10000:8000!RPM!Chassis Cooling - Bay 1
  action_url            /pnp/index.php?host=$HOSTNAME$&srv=$SERVICEDESC$
  notes                 View PNP RRD graph
}

define service {

use generic-service

host_name ibm-bc1-mgmt

service_description Chassis Cooling - Bay 1

check_command check_snmpv1_public!.1.3.6.1.4.1.2.3.51.2.2.3.20.0!

1900:8000!1900:0,10000:8000!

RPM!Chassis Cooling - Bay 1

action_url /pnp/index.php?host=$HOSTNAME$&srv=$SERVICEDESC$

notes View PNP RRD graph

}

All in all, another lesson learned 😮