diff options
-rw-r--r-- | contrib/check_snmp_process_monitor.pl | 250 |
1 files changed, 177 insertions, 73 deletions
diff --git a/contrib/check_snmp_process_monitor.pl b/contrib/check_snmp_process_monitor.pl index 263255b5..c98ee7dc 100644 --- a/contrib/check_snmp_process_monitor.pl +++ b/contrib/check_snmp_process_monitor.pl | |||
@@ -19,7 +19,8 @@ use lib qw( /opt/nagios/libexec /usr/local/libexec ); | |||
19 | use utils qw(%ERRORS $TIMEOUT &print_revision &support &usage); | 19 | use utils qw(%ERRORS $TIMEOUT &print_revision &support &usage); |
20 | use SNMP 5.0; | 20 | use SNMP 5.0; |
21 | use Getopt::Long; | 21 | use Getopt::Long; |
22 | use vars qw( $exit $opt_version $opt_timeout $opt_help $opt_command $opt_host $opt_community $opt_verbose $opt_warning $opt_critical $opt_memory $opt_cpu $opt_port $opt_regex $opt_stats %processes $snmp_session $PROGNAME $TIMEOUT ); | 22 | use Storable; |
23 | use vars qw( $exit $opt_version $opt_timeout $opt_help $opt_command $opt_host $opt_community $opt_verbose $opt_warning $opt_critical $opt_memory $opt_cpu $opt_port $opt_regex $opt_stats $opt_cache $opt_nocache $cache_exp $interpreters $snmp_session $PROGNAME $TIMEOUT ); | ||
23 | 24 | ||
24 | $PROGNAME = "snmp_process_monitor.pl"; | 25 | $PROGNAME = "snmp_process_monitor.pl"; |
25 | $opt_verbose = undef; | 26 | $opt_verbose = undef; |
@@ -31,8 +32,13 @@ $opt_critical = [ 1, -1 ]; | |||
31 | $opt_memory = undef; | 32 | $opt_memory = undef; |
32 | $opt_cpu = undef; | 33 | $opt_cpu = undef; |
33 | $opt_port = 161; | 34 | $opt_port = 161; |
34 | %processes = (); | 35 | $opt_cache = 1; |
35 | $exit = 'OK'; | 36 | $opt_nocache = undef; |
37 | $cache_exp = 600; | ||
38 | $exit = $ERRORS{OK}; | ||
39 | $interpreters = '(perl|/bin/sh|/usr/bin/sh|/bin/bash|/bin/ksh|python)'; | ||
40 | our $cachefile = '/var/opt/nagios/tmp/'; # completed later | ||
41 | our %processes = (); | ||
36 | 42 | ||
37 | sub process_options { | 43 | sub process_options { |
38 | my( $opt_crit, $opt_warn ) = (); | 44 | my( $opt_crit, $opt_warn ) = (); |
@@ -42,6 +48,7 @@ sub process_options { | |||
42 | 'v' => \$opt_verbose, 'verbose' => \$opt_verbose, | 48 | 'v' => \$opt_verbose, 'verbose' => \$opt_verbose, |
43 | 'h' => \$opt_help, 'help' => \$opt_help, | 49 | 'h' => \$opt_help, 'help' => \$opt_help, |
44 | 's' => \$opt_stats, 'statistics' => \$opt_stats, | 50 | 's' => \$opt_stats, 'statistics' => \$opt_stats, |
51 | 'nocache' => \$opt_nocache, | ||
45 | 'H:s' => \$opt_host, 'hostname:s' => \$opt_host, | 52 | 'H:s' => \$opt_host, 'hostname:s' => \$opt_host, |
46 | 'p:i' => \$opt_port, 'port:i' => \$opt_port, | 53 | 'p:i' => \$opt_port, 'port:i' => \$opt_port, |
47 | 'C:s' => \$opt_community, 'community:s' => \$opt_community, | 54 | 'C:s' => \$opt_community, 'community:s' => \$opt_community, |
@@ -75,6 +82,12 @@ sub process_options { | |||
75 | $opt_warning = [ $opt_crit, -1 ]; | 82 | $opt_warning = [ $opt_crit, -1 ]; |
76 | } | 83 | } |
77 | } | 84 | } |
85 | if ( defined($opt_memory) ) { $opt_memory = 0 } | ||
86 | if ( defined($opt_cpu) ) { $opt_cpu = 0 } | ||
87 | if ( defined($opt_nocache)) { $opt_cache = 0 } | ||
88 | |||
89 | # complete the cachefile's name | ||
90 | $cachefile .= $opt_host . '.proc'; | ||
78 | } | 91 | } |
79 | 92 | ||
80 | sub local_print_revision { | 93 | sub local_print_revision { |
@@ -82,7 +95,7 @@ sub local_print_revision { | |||
82 | } | 95 | } |
83 | 96 | ||
84 | sub print_usage { | 97 | sub print_usage { |
85 | print "Usage: $PROGNAME -H <host> -C <snmp_community> -e <command> [-w <low>,<high>] [-c <low>,<high>] [-t <timeout>]\n"; | 98 | print "Usage: $PROGNAME -H <host> -C <snmp_community> -e <command> [-w <low>,<high>] [-c <low>,<high>] [-t <timeout>] [-s|--statistics] [--memory] [--cpu] [--nocache]\n"; |
86 | } | 99 | } |
87 | 100 | ||
88 | sub print_help { | 101 | sub print_help { |
@@ -107,6 +120,10 @@ sub print_help { | |||
107 | minimum and maximum number of processes before a warning is issued (Default 1,-1) | 120 | minimum and maximum number of processes before a warning is issued (Default 1,-1) |
108 | -c, --critical=INTEGER[,INTEGER] | 121 | -c, --critical=INTEGER[,INTEGER] |
109 | minimum and maximum number of processes before a critical is issued (Default 1,-1) | 122 | minimum and maximum number of processes before a critical is issued (Default 1,-1) |
123 | --memory | ||
124 | combined with '-s', will print the number of bytes of real memory used by process | ||
125 | --cpu | ||
126 | combined with '-s', will print the number of seconds of cpu time consumed by process | ||
110 | EOT | 127 | EOT |
111 | } | 128 | } |
112 | 129 | ||
@@ -117,11 +134,129 @@ sub verbose (@) { | |||
117 | 134 | ||
118 | sub check_for_errors { | 135 | sub check_for_errors { |
119 | if ( $snmp_session->{ErrorNum} ) { | 136 | if ( $snmp_session->{ErrorNum} ) { |
137 | %processes = (); | ||
120 | print "UNKNOWN - error retrieving SNMP data: $snmp_session->{ErrorStr}\n"; | 138 | print "UNKNOWN - error retrieving SNMP data: $snmp_session->{ErrorStr}\n"; |
121 | exit $ERRORS{UNKNOWN}; | 139 | exit $ERRORS{UNKNOWN}; |
122 | } | 140 | } |
123 | } | 141 | } |
124 | 142 | ||
143 | sub init_cache { | ||
144 | if ( !defined($opt_cache) ) { | ||
145 | %processes = (); | ||
146 | return; | ||
147 | } | ||
148 | if ( -r $cachefile ) { | ||
149 | eval { | ||
150 | verbose "loading cache from $cachefile\n"; | ||
151 | %processes = %{ retrieve( $cachefile ) }; | ||
152 | }; | ||
153 | if ( $@ ) { | ||
154 | verbose "cache loading failed - using blank cache: $@\n"; | ||
155 | %processes = () | ||
156 | } | ||
157 | } | ||
158 | else { | ||
159 | %processes = (); | ||
160 | } | ||
161 | } | ||
162 | |||
163 | sub snmpget { | ||
164 | my $tmpvar = SNMP::Varbind->new( shift ); | ||
165 | $snmp_session->get( $tmpvar ); | ||
166 | check_for_errors(); | ||
167 | return $tmpvar->val; | ||
168 | } | ||
169 | |||
170 | sub update_cache { | ||
171 | # expire the cache after $cache_exp seconds | ||
172 | if ( $opt_cache != 0 && exists($processes{__last_update}) | ||
173 | && $processes{__last_update} >= time - $cache_exp ) { | ||
174 | verbose "cache file is recent enough - using it\n"; | ||
175 | return 1; | ||
176 | } | ||
177 | |||
178 | verbose "retrieving full listing of processes from $opt_host\n"; | ||
179 | my $process_count = snmpget( ['hrSystemProcesses', 0] ); | ||
180 | |||
181 | # retrieve the data from the remote host | ||
182 | my ($names) = $snmp_session->bulkwalk( 0, $process_count + 1, [['hrSWRunName']] ); | ||
183 | check_for_errors(); | ||
184 | |||
185 | # make sure the number of processes from the bulkwalk is close to hrSystemProcesses | ||
186 | if ( scalar(@$names) + 10 < $process_count ) { | ||
187 | print "UNKNOWN - only ", scalar(@$names), " of ",$process_count, " processes returned\n";; | ||
188 | exit $ERRORS{UNKNOWN}; | ||
189 | } | ||
190 | |||
191 | # sort through the process names and create a nice hash of processes | ||
192 | foreach my $row ( @$names ) { | ||
193 | my %hash = {}; | ||
194 | $hash{name} = $row->val; | ||
195 | $hash{abs_name} = $row->val; | ||
196 | $hash{name} =~ s#.*/##; # strip path | ||
197 | |||
198 | if ( defined($opt_regex) || | ||
199 | ($row->val =~ m#$interpreters$# | ||
200 | && $opt_command !~ m#$interpreters$#) ) { | ||
201 | |||
202 | # fetch the runtime parameters of the process | ||
203 | my $parameters = snmpget( ['hrSWRunParameters', $row->iid] ); | ||
204 | |||
205 | # only strip if we're looking for a specific command | ||
206 | if ( defined($opt_command) ) { | ||
207 | verbose "process ",$row->iid," uses $1 as an interpreter - getting parameters\n"; | ||
208 | $hash{name} = $parameters; | ||
209 | $hash{name} =~ s#.*/##; # strip path name off the front | ||
210 | $hash{name} =~ s/\s+.*$//; # strip everything from the first space to the end | ||
211 | } | ||
212 | else { | ||
213 | # use the full 'ps -efl' style listing for regular expression matching | ||
214 | my $path = snmpget( ['hrSWRunPath', $row->iid] ); | ||
215 | $hash{name} = "$path $parameters"; | ||
216 | } | ||
217 | } | ||
218 | # store in the global hash | ||
219 | $processes{$row->iid} = \%hash; | ||
220 | } | ||
221 | |||
222 | # update the timestamp so the cache can expire | ||
223 | $processes{__last_update} = time; | ||
224 | return 0; | ||
225 | } | ||
226 | |||
227 | # process the %processes hash and see if there are any matches for our command or regex | ||
228 | sub check_for_matches { | ||
229 | my $ret_match = 0; | ||
230 | foreach my $key ( keys(%processes) ) { | ||
231 | next if ( $key eq '__last_update' ); | ||
232 | my $match = 0; | ||
233 | |||
234 | # static matches are letter-for-letter (-e) | ||
235 | if ( defined($opt_command) && $processes{$key}->{name} eq $opt_command ) { $match++; } | ||
236 | # use /o to make sure the user-supplied regex (-r) is only compiled once | ||
237 | elsif ( defined($opt_regex) && $processes{$key}->{name} =~ /$opt_regex/o ) { $match++; } | ||
238 | |||
239 | # verify the cache's entry by doing an snmpget | ||
240 | if ( $match > 0 && $opt_cache != 0 ) { | ||
241 | my $proc = snmpget( ['hrSWRunName', $key] ); | ||
242 | --$match if ( !$proc || $proc ne $processes{$key}->{abs_name} ); | ||
243 | } | ||
244 | # get the process memory usage if requested | ||
245 | if ( $match > 0 && defined($opt_memory) ) { | ||
246 | $opt_memory += snmpget( ['hrSWRunPerfMem', $key] ); | ||
247 | } | ||
248 | # get the process cpu usage if requested | ||
249 | if ( $match > 0 && defined($opt_cpu) ) { | ||
250 | $opt_cpu += snmpget( ['hrSWRunPerfCPU', $key] ); | ||
251 | } | ||
252 | |||
253 | verbose "process '$processes{$key}->{name}' has pid $processes{$key}->{pid} and index $key\n" | ||
254 | if ( $match > 0 ); | ||
255 | |||
256 | $ret_match += $match; | ||
257 | } | ||
258 | return $ret_match; | ||
259 | } | ||
125 | # =========================================================================== # | 260 | # =========================================================================== # |
126 | # =====> MAIN | 261 | # =====> MAIN |
127 | # =========================================================================== # | 262 | # =========================================================================== # |
@@ -129,6 +264,10 @@ process_options(); | |||
129 | 264 | ||
130 | alarm( $TIMEOUT ); # make sure we don't hang Nagios | 265 | alarm( $TIMEOUT ); # make sure we don't hang Nagios |
131 | 266 | ||
267 | # initialize the cache, if it's enabled | ||
268 | init_cache(); | ||
269 | |||
270 | # create a session for conversing with the remote SNMP agent | ||
132 | $snmp_session = new SNMP::Session( | 271 | $snmp_session = new SNMP::Session( |
133 | DestHost => $opt_host, | 272 | DestHost => $opt_host, |
134 | Community => $opt_community, | 273 | Community => $opt_community, |
@@ -136,92 +275,57 @@ $snmp_session = new SNMP::Session( | |||
136 | Version => '2c' | 275 | Version => '2c' |
137 | ); | 276 | ); |
138 | 277 | ||
139 | my $process_count = SNMP::Varbind->new( ['hrSystemProcesses', 0] ); | 278 | my $usage = update_cache(); |
140 | $snmp_session->get( $process_count ); | 279 | my $count = check_for_matches(); |
141 | check_for_errors(); | ||
142 | |||
143 | # retrieve the data from the remote host | ||
144 | my( $names, $index ) = $snmp_session->bulkwalk( 0, $process_count->val, [['hrSWRunName'], ['hrSWRunIndex']] ); | ||
145 | check_for_errors(); | ||
146 | |||
147 | alarm( 0 ); # all done with the network connection | ||
148 | |||
149 | my %namecount = (); | ||
150 | foreach my $row ( @$names ) { | ||
151 | $processes{$row->iid}->{name} = $row->val; | ||
152 | $processes{$row->iid}->{name} =~ s#.*/##; # strip path | ||
153 | |||
154 | if ( defined($opt_regex) || | ||
155 | ($row->val =~ /(perl|\/usr\/bin\/sh|\/bin\/bash|\/bin\/sh)$/ | ||
156 | && $opt_command !~ /(perl|\/usr\/bin\/sh|\/bin\/bash|\/bin\/sh)$/) ) { | ||
157 | |||
158 | # fetch the runtime parameters of the process | ||
159 | my $parm_var = SNMP::Varbind->new( ['hrSWRunParameters', $row->iid] ); | ||
160 | $snmp_session->get( $parm_var ); | ||
161 | check_for_errors(); | ||
162 | |||
163 | # only strip if we're looking for a specific command | ||
164 | if ( defined($opt_command) ) { | ||
165 | verbose "process ",$row->iid," uses $1 as an interpreter - getting parameters\n"; | ||
166 | $processes{$row->iid}->{name} = $parm_var->val; | ||
167 | # strip path name off the front | ||
168 | $processes{$row->iid}->{name} =~ s#.*/##; | ||
169 | # strip everything from the first space to the end | ||
170 | $processes{$row->iid}->{name} =~ s/\s+.*$//; | ||
171 | } | ||
172 | else { | ||
173 | # get the longer full-path style listing | ||
174 | my $path_var = SNMP::Varbind->new( ['hrSWRunPath', $row->iid] ); | ||
175 | $snmp_session->get( $path_var ); | ||
176 | check_for_errors(); | ||
177 | 280 | ||
178 | # use the full 'ps -efl' style listing for regular expression matching | 281 | # always try twice if caching is enabled - once with cache and once without |
179 | $processes{$row->iid}->{name} = $path_var->val.' '.$parm_var->val; | 282 | if ( $usage != 0 && $opt_cache != 0 && $count <= 0 ) { |
180 | } | 283 | verbose "did not find process in cache - trying a refresh\n"; |
181 | } | 284 | %processes = (); |
182 | } | 285 | update_cache(); |
183 | foreach my $row ( @$index ) { | 286 | $count = check_for_matches(); |
184 | $processes{$row->iid}->{pid} = $row->val; | ||
185 | } | 287 | } |
186 | 288 | ||
187 | my @pids = (); | 289 | |
188 | my @matches = (); | 290 | # the default, OK message |
189 | foreach my $key ( keys(%processes) ) { | 291 | my $message = "OK - $count process(es) found resembling '". ($opt_command || $opt_regex); |
190 | if ( defined($opt_command) && $processes{$key}->{name} eq $opt_command ) { | ||
191 | push( @matches, $processes{$key} ); | ||
192 | push( @pids, $processes{$key}->{pid} ); | ||
193 | verbose "process '$processes{$key}->{name}' has pid ", | ||
194 | "$processes{$key}->{pid} and index $key\n"; | ||
195 | } | ||
196 | elsif ( defined($opt_regex) && $processes{$key}->{name} =~ /$opt_regex/o ) { | ||
197 | push( @matches, $processes{$key} ); | ||
198 | push( @pids, $processes{$key}->{pid} ); | ||
199 | verbose "process '$processes{$key}->{name}' has pid ", | ||
200 | "$processes{$key}->{pid} and index $key\n"; | ||
201 | } | ||
202 | } | ||
203 | my $count = @matches; | ||
204 | 292 | ||
205 | # warning, critical | 293 | # warning, critical |
206 | if ( ($opt_warning->[0] > 0 && $opt_warning->[0] > $count) | 294 | if ( ($opt_warning->[0] > 0 && $opt_warning->[0] > $count) |
207 | || ($opt_warning->[1] > 0 && $opt_warning->[1] <= $count) ) { | 295 | || ($opt_warning->[1] > 0 && $opt_warning->[1] <= $count) ) { |
208 | $exit = 'WARNING'; | 296 | $message = "WARNING - no processes found resembling '". ($opt_command || $opt_regex); |
297 | $exit = $ERRORS{WARNING}; | ||
209 | } | 298 | } |
210 | if ( ($opt_critical->[0] > 0 && $opt_critical->[0] > $count) | 299 | if ( ($opt_critical->[0] > 0 && $opt_critical->[0] > $count) |
211 | || ($opt_critical->[1] > 0 && $opt_critical->[1] <= $count) ) { | 300 | || ($opt_critical->[1] > 0 && $opt_critical->[1] <= $count) ) { |
212 | $exit = 'CRITICAL'; | 301 | $message = "CRITICAL - no processes found resembling '". ($opt_command || $opt_regex); |
302 | $exit = $ERRORS{CRITICAL}; | ||
213 | } | 303 | } |
214 | 304 | ||
215 | print "$exit - $count processes with pid(s) ",join(',',@pids); | 305 | # output the status message |
306 | print $message, "'"; | ||
216 | 307 | ||
217 | # print the number of processes if statistics are requested | 308 | # print the number of processes if statistics are requested |
218 | if ( defined($opt_stats) ) { | 309 | if ( defined($opt_stats) ) { |
219 | print "|count:$count\n"; | 310 | print "|count=$count"; |
311 | if ( defined($opt_memory) ) { | ||
312 | print ":memory=", $opt_memory; | ||
313 | } | ||
314 | if ( defined($opt_cpu) ) { | ||
315 | $opt_cpu = $opt_cpu / 100; | ||
316 | printf ":cpu=%.2f", $opt_cpu; | ||
317 | } | ||
220 | } | 318 | } |
221 | else { | 319 | |
222 | print "\n"; | 320 | # store a copy of the %processes hash if we're using caching |
321 | if ( $exit == $ERRORS{OK} && $opt_cache != 0 ) { | ||
322 | eval { | ||
323 | unlink( $cachefile ) if ( -e $cachefile ); | ||
324 | store( \%processes, $cachefile ); | ||
325 | }; | ||
223 | } | 326 | } |
224 | 327 | ||
225 | exit $ERRORS{$exit}; | 328 | print "\n"; |
329 | exit $exit; | ||
226 | 330 | ||
227 | 331 | ||