summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--contrib/check_snmp_process_monitor.pl250
1 files changed, 177 insertions, 73 deletions
diff --git a/contrib/check_snmp_process_monitor.pl b/contrib/check_snmp_process_monitor.pl
index 263255b..c98ee7d 100644
--- a/contrib/check_snmp_process_monitor.pl
+++ b/contrib/check_snmp_process_monitor.pl
@@ -19,7 +19,8 @@ use lib qw( /opt/nagios/libexec /usr/local/libexec );
19use utils qw(%ERRORS $TIMEOUT &print_revision &support &usage); 19use utils qw(%ERRORS $TIMEOUT &print_revision &support &usage);
20use SNMP 5.0; 20use SNMP 5.0;
21use Getopt::Long; 21use Getopt::Long;
22use vars qw( $exit $opt_version $opt_timeout $opt_help $opt_command $opt_host $opt_community $opt_verbose $opt_warning $opt_critical $opt_memory $opt_cpu $opt_port $opt_regex $opt_stats %processes $snmp_session $PROGNAME $TIMEOUT ); 22use Storable;
23use vars qw( $exit $opt_version $opt_timeout $opt_help $opt_command $opt_host $opt_community $opt_verbose $opt_warning $opt_critical $opt_memory $opt_cpu $opt_port $opt_regex $opt_stats $opt_cache $opt_nocache $cache_exp $interpreters $snmp_session $PROGNAME $TIMEOUT );
23 24
24$PROGNAME = "snmp_process_monitor.pl"; 25$PROGNAME = "snmp_process_monitor.pl";
25$opt_verbose = undef; 26$opt_verbose = undef;
@@ -31,8 +32,13 @@ $opt_critical = [ 1, -1 ];
31$opt_memory = undef; 32$opt_memory = undef;
32$opt_cpu = undef; 33$opt_cpu = undef;
33$opt_port = 161; 34$opt_port = 161;
34%processes = (); 35$opt_cache = 1;
35$exit = 'OK'; 36$opt_nocache = undef;
37$cache_exp = 600;
38$exit = $ERRORS{OK};
39$interpreters = '(perl|/bin/sh|/usr/bin/sh|/bin/bash|/bin/ksh|python)';
40our $cachefile = '/var/opt/nagios/tmp/'; # completed later
41our %processes = ();
36 42
37sub process_options { 43sub process_options {
38 my( $opt_crit, $opt_warn ) = (); 44 my( $opt_crit, $opt_warn ) = ();
@@ -42,6 +48,7 @@ sub process_options {
42 'v' => \$opt_verbose, 'verbose' => \$opt_verbose, 48 'v' => \$opt_verbose, 'verbose' => \$opt_verbose,
43 'h' => \$opt_help, 'help' => \$opt_help, 49 'h' => \$opt_help, 'help' => \$opt_help,
44 's' => \$opt_stats, 'statistics' => \$opt_stats, 50 's' => \$opt_stats, 'statistics' => \$opt_stats,
51 'nocache' => \$opt_nocache,
45 'H:s' => \$opt_host, 'hostname:s' => \$opt_host, 52 'H:s' => \$opt_host, 'hostname:s' => \$opt_host,
46 'p:i' => \$opt_port, 'port:i' => \$opt_port, 53 'p:i' => \$opt_port, 'port:i' => \$opt_port,
47 'C:s' => \$opt_community, 'community:s' => \$opt_community, 54 'C:s' => \$opt_community, 'community:s' => \$opt_community,
@@ -75,6 +82,12 @@ sub process_options {
75 $opt_warning = [ $opt_crit, -1 ]; 82 $opt_warning = [ $opt_crit, -1 ];
76 } 83 }
77 } 84 }
85 if ( defined($opt_memory) ) { $opt_memory = 0 }
86 if ( defined($opt_cpu) ) { $opt_cpu = 0 }
87 if ( defined($opt_nocache)) { $opt_cache = 0 }
88
89 # complete the cachefile's name
90 $cachefile .= $opt_host . '.proc';
78} 91}
79 92
80sub local_print_revision { 93sub local_print_revision {
@@ -82,7 +95,7 @@ sub local_print_revision {
82} 95}
83 96
84sub print_usage { 97sub print_usage {
85 print "Usage: $PROGNAME -H <host> -C <snmp_community> -e <command> [-w <low>,<high>] [-c <low>,<high>] [-t <timeout>]\n"; 98 print "Usage: $PROGNAME -H <host> -C <snmp_community> -e <command> [-w <low>,<high>] [-c <low>,<high>] [-t <timeout>] [-s|--statistics] [--memory] [--cpu] [--nocache]\n";
86} 99}
87 100
88sub print_help { 101sub print_help {
@@ -107,6 +120,10 @@ sub print_help {
107 minimum and maximum number of processes before a warning is issued (Default 1,-1) 120 minimum and maximum number of processes before a warning is issued (Default 1,-1)
108-c, --critical=INTEGER[,INTEGER] 121-c, --critical=INTEGER[,INTEGER]
109 minimum and maximum number of processes before a critical is issued (Default 1,-1) 122 minimum and maximum number of processes before a critical is issued (Default 1,-1)
123--memory
124 combined with '-s', will print the number of bytes of real memory used by process
125--cpu
126 combined with '-s', will print the number of seconds of cpu time consumed by process
110EOT 127EOT
111} 128}
112 129
@@ -117,11 +134,129 @@ sub verbose (@) {
117 134
118sub check_for_errors { 135sub check_for_errors {
119 if ( $snmp_session->{ErrorNum} ) { 136 if ( $snmp_session->{ErrorNum} ) {
137 %processes = ();
120 print "UNKNOWN - error retrieving SNMP data: $snmp_session->{ErrorStr}\n"; 138 print "UNKNOWN - error retrieving SNMP data: $snmp_session->{ErrorStr}\n";
121 exit $ERRORS{UNKNOWN}; 139 exit $ERRORS{UNKNOWN};
122 } 140 }
123} 141}
124 142
# Populate the global %processes hash from the on-disk cache.
#
# Reads:   $opt_cache (caching switch) and $cachefile (path given to retrieve()).
# Writes:  %processes - the retrieved hash on success, emptied otherwise.
# Returns: nothing meaningful; an unreadable or corrupt cache is never fatal.
sub init_cache {
    # $opt_cache starts out as 1 and is set to 0 by --nocache, so it is never
    # undef.  Test its truth rather than definedness, otherwise --nocache
    # would still load (and later trust) the stale cache file.
    if ( !$opt_cache ) {
        %processes = ();
        return;
    }

    # no readable cache file yet - start from a blank slate
    if ( !-r $cachefile ) {
        %processes = ();
        return;
    }

    # retrieve() may die on a truncated or corrupt file; fall back to an
    # empty cache instead of aborting the check
    eval {
        verbose( "loading cache from $cachefile\n" );
        %processes = %{ retrieve( $cachefile ) };
    };
    if ( $@ ) {
        verbose( "cache loading failed - using blank cache: $@\n" );
        %processes = ();
    }
    return;
}
162
# Fetch one SNMP value through the global $snmp_session.
#
# Takes the varbind specification supplied by the caller (the callers in this
# file pass an array ref such as ['hrSWRunName', $index]) and returns the
# value held by the varbind after the GET.  check_for_errors() is run after
# the request; it prints UNKNOWN and exits when the session reports an error.
sub snmpget {
    my ($spec) = @_;

    my $varbind = SNMP::Varbind->new( $spec );
    $snmp_session->get( $varbind );
    check_for_errors();

    return $varbind->val;
}
169
# Refresh the global %processes hash from the remote host's process table,
# unless the cache that is already loaded is still fresh.
#
# Returns 1 when the existing cache was recent enough and nothing was fetched,
#         0 after a full process listing was retrieved over SNMP.
# Exits with UNKNOWN (directly, or through check_for_errors) on SNMP errors
# or when far fewer rows come back than hrSystemProcesses reports.
sub update_cache {
    # expire the cache after $cache_exp seconds
    if ( $opt_cache != 0 && exists($processes{__last_update})
        && $processes{__last_update} >= time - $cache_exp ) {
        verbose( "cache file is recent enough - using it\n" );
        return 1;
    }

    verbose( "retrieving full listing of processes from $opt_host\n" );
    my $process_count = snmpget( ['hrSystemProcesses', 0] );

    # retrieve the data from the remote host
    my ($names) = $snmp_session->bulkwalk( 0, $process_count + 1, [['hrSWRunName']] );
    check_for_errors();

    # make sure the number of processes from the bulkwalk is close to hrSystemProcesses
    if ( scalar(@$names) + 10 < $process_count ) {
        print "UNKNOWN - only ", scalar(@$names), " of ", $process_count, " processes returned\n";
        exit $ERRORS{UNKNOWN};
    }

    # A full listing replaces the cache: drop entries left over from an
    # expired cache so vanished processes do not linger in %processes (and
    # in the stored cache file) forever.
    %processes = ();

    # compile the interpreter pattern once instead of on every row
    my $interp_re = qr#$interpreters$#;

    # sort through the process names and create a nice hash of processes
    foreach my $row ( @$names ) {
        my $full_name = $row->val;

        # start from an empty hash; assigning {} here would insert a bogus
        # stringified-reference key into every record
        my %hash = ();
        $hash{name}     = $full_name;
        $hash{abs_name} = $full_name;
        $hash{name} =~ s#.*/##;    # strip path

        # Remember which interpreter (if any) runs this process.  The capture
        # is taken right at the match so the verbose message below never
        # reports a stale $1 from some earlier, unrelated match.
        my $interpreter;
        if ( $full_name =~ $interp_re ) {
            $interpreter = $1;
        }

        # an interpreter process is only looked into when the user is not
        # searching for the interpreter itself
        my $wrapped = defined($interpreter)
            && ( !defined($opt_command) || $opt_command !~ $interp_re );

        if ( defined($opt_regex) || $wrapped ) {

            # fetch the runtime parameters of the process
            my $parameters = snmpget( ['hrSWRunParameters', $row->iid] );

            # only strip if we're looking for a specific command
            if ( defined($opt_command) ) {
                verbose( "process ", $row->iid, " uses $interpreter as an interpreter - getting parameters\n" )
                    if ( defined($interpreter) );
                $hash{name} = $parameters;
                # NOTE(review): the greedy .*/ also swallows slashes that
                # appear in later arguments - confirm this is intended
                $hash{name} =~ s#.*/##;       # strip path name off the front
                $hash{name} =~ s/\s+.*$//;    # strip everything from the first space to the end
            }
            else {
                # use the full 'ps -efl' style listing for regular expression matching
                my $path = snmpget( ['hrSWRunPath', $row->iid] );
                $hash{name} = "$path $parameters";
            }
        }

        # store in the global hash
        $processes{$row->iid} = \%hash;
    }

    # update the timestamp so the cache can expire
    $processes{__last_update} = time;
    return 0;
}
226
# Walk the %processes hash and count the entries that match the requested
# command (-e, exact string comparison) or regular expression (-r).
#
# When caching is enabled every candidate is re-verified with a live SNMP GET
# of hrSWRunName, so a stale cache entry is not counted.  When --memory or
# --cpu were requested, $opt_memory / $opt_cpu accumulate hrSWRunPerfMem /
# hrSWRunPerfCPU over all matching processes.
#
# Returns the number of matching processes.
sub check_for_matches {
    my $ret_match = 0;
    foreach my $key ( keys(%processes) ) {
        # the timestamp bookkeeping entry is not a process
        next if ( $key eq '__last_update' );

        my $entry = $processes{$key};
        my $match = 0;

        # static matches are letter-for-letter (-e)
        if ( defined($opt_command) && $entry->{name} eq $opt_command ) { $match++; }
        # use /o to make sure the user-supplied regex (-r) is only compiled once
        elsif ( defined($opt_regex) && $entry->{name} =~ /$opt_regex/o ) { $match++; }

        # verify the cache's entry by doing an snmpget
        if ( $match > 0 && $opt_cache != 0 ) {
            my $proc = snmpget( ['hrSWRunName', $key] );
            --$match if ( !$proc || $proc ne $entry->{abs_name} );
        }
        # get the process memory usage if requested
        if ( $match > 0 && defined($opt_memory) ) {
            $opt_memory += snmpget( ['hrSWRunPerfMem', $key] );
        }
        # get the process cpu usage if requested
        if ( $match > 0 && defined($opt_cpu) ) {
            $opt_cpu += snmpget( ['hrSWRunPerfCPU', $key] );
        }

        # process records only carry 'name' and 'abs_name' - there is no
        # 'pid' field to report, so print the table index alone
        verbose( "process '$entry->{name}' has index $key\n" )
            if ( $match > 0 );

        $ret_match += $match;
    }
    return $ret_match;
}
125# =========================================================================== # 260# =========================================================================== #
126# =====> MAIN 261# =====> MAIN
127# =========================================================================== # 262# =========================================================================== #
@@ -129,6 +264,10 @@ process_options();
129 264
130alarm( $TIMEOUT ); # make sure we don't hang Nagios 265alarm( $TIMEOUT ); # make sure we don't hang Nagios
131 266
267# initialize the cache, if it's enabled
268init_cache();
269
270# create a session for conversing with the remote SNMP agent
132$snmp_session = new SNMP::Session( 271$snmp_session = new SNMP::Session(
133 DestHost => $opt_host, 272 DestHost => $opt_host,
134 Community => $opt_community, 273 Community => $opt_community,
@@ -136,92 +275,57 @@ $snmp_session = new SNMP::Session(
136 Version => '2c' 275 Version => '2c'
137); 276);
138 277
139my $process_count = SNMP::Varbind->new( ['hrSystemProcesses', 0] ); 278my $usage = update_cache();
140$snmp_session->get( $process_count ); 279my $count = check_for_matches();
141check_for_errors();
142
143# retrieve the data from the remote host
144my( $names, $index ) = $snmp_session->bulkwalk( 0, $process_count->val, [['hrSWRunName'], ['hrSWRunIndex']] );
145check_for_errors();
146
147alarm( 0 ); # all done with the network connection
148
149my %namecount = ();
150foreach my $row ( @$names ) {
151 $processes{$row->iid}->{name} = $row->val;
152 $processes{$row->iid}->{name} =~ s#.*/##; # strip path
153
154 if ( defined($opt_regex) ||
155 ($row->val =~ /(perl|\/usr\/bin\/sh|\/bin\/bash|\/bin\/sh)$/
156 && $opt_command !~ /(perl|\/usr\/bin\/sh|\/bin\/bash|\/bin\/sh)$/) ) {
157
158 # fetch the runtime parameters of the process
159 my $parm_var = SNMP::Varbind->new( ['hrSWRunParameters', $row->iid] );
160 $snmp_session->get( $parm_var );
161 check_for_errors();
162
163 # only strip if we're looking for a specific command
164 if ( defined($opt_command) ) {
165 verbose "process ",$row->iid," uses $1 as an interpreter - getting parameters\n";
166 $processes{$row->iid}->{name} = $parm_var->val;
167 # strip path name off the front
168 $processes{$row->iid}->{name} =~ s#.*/##;
169 # strip everything from the first space to the end
170 $processes{$row->iid}->{name} =~ s/\s+.*$//;
171 }
172 else {
173 # get the longer full-path style listing
174 my $path_var = SNMP::Varbind->new( ['hrSWRunPath', $row->iid] );
175 $snmp_session->get( $path_var );
176 check_for_errors();
177 280
178 # use the full 'ps -efl' style listing for regular expression matching 281# always try twice if caching is enabled - once with cache and once without
179 $processes{$row->iid}->{name} = $path_var->val.' '.$parm_var->val; 282if ( $usage != 0 && $opt_cache != 0 && $count <= 0 ) {
180 } 283 verbose "did not find process in cache - trying a refresh\n";
181 } 284 %processes = ();
182} 285 update_cache();
183foreach my $row ( @$index ) { 286 $count = check_for_matches();
184 $processes{$row->iid}->{pid} = $row->val;
185} 287}
186 288
187my @pids = (); 289
188my @matches = (); 290# the default, OK message
189foreach my $key ( keys(%processes) ) { 291my $message = "OK - $count process(es) found resembling '". ($opt_command || $opt_regex);
190 if ( defined($opt_command) && $processes{$key}->{name} eq $opt_command ) {
191 push( @matches, $processes{$key} );
192 push( @pids, $processes{$key}->{pid} );
193 verbose "process '$processes{$key}->{name}' has pid ",
194 "$processes{$key}->{pid} and index $key\n";
195 }
196 elsif ( defined($opt_regex) && $processes{$key}->{name} =~ /$opt_regex/o ) {
197 push( @matches, $processes{$key} );
198 push( @pids, $processes{$key}->{pid} );
199 verbose "process '$processes{$key}->{name}' has pid ",
200 "$processes{$key}->{pid} and index $key\n";
201 }
202}
203my $count = @matches;
204 292
205# warning, critical 293# warning, critical
206if ( ($opt_warning->[0] > 0 && $opt_warning->[0] > $count) 294if ( ($opt_warning->[0] > 0 && $opt_warning->[0] > $count)
207 || ($opt_warning->[1] > 0 && $opt_warning->[1] <= $count) ) { 295 || ($opt_warning->[1] > 0 && $opt_warning->[1] <= $count) ) {
208 $exit = 'WARNING'; 296 $message = "WARNING - no processes found resembling '". ($opt_command || $opt_regex);
297 $exit = $ERRORS{WARNING};
209} 298}
210if ( ($opt_critical->[0] > 0 && $opt_critical->[0] > $count) 299if ( ($opt_critical->[0] > 0 && $opt_critical->[0] > $count)
211 || ($opt_critical->[1] > 0 && $opt_critical->[1] <= $count) ) { 300 || ($opt_critical->[1] > 0 && $opt_critical->[1] <= $count) ) {
212 $exit = 'CRITICAL'; 301 $message = "CRITICAL - no processes found resembling '". ($opt_command || $opt_regex);
302 $exit = $ERRORS{CRITICAL};
213} 303}
214 304
215print "$exit - $count processes with pid(s) ",join(',',@pids); 305# output the status message
306print $message, "'";
216 307
217# print the number of processes if statistics are requested 308# print the number of processes if statistics are requested
218if ( defined($opt_stats) ) { 309if ( defined($opt_stats) ) {
219 print "|count:$count\n"; 310 print "|count=$count";
311 if ( defined($opt_memory) ) {
312 print ":memory=", $opt_memory;
313 }
314 if ( defined($opt_cpu) ) {
315 $opt_cpu = $opt_cpu / 100;
316 printf ":cpu=%.2f", $opt_cpu;
317 }
220} 318}
221else { 319
222 print "\n"; 320# store a copy of the %processes hash if we're using caching
321if ( $exit == $ERRORS{OK} && $opt_cache != 0 ) {
322 eval {
323 unlink( $cachefile ) if ( -e $cachefile );
324 store( \%processes, $cachefile );
325 };
223} 326}
224 327
225exit $ERRORS{$exit}; 328print "\n";
329exit $exit;
226 330
227 331