--- check_procs.c 2009-02-21 09:59:24.000000000 +0000
+++ check_procs.c.new 2009-05-19 10:41:14.000000000 +0000
@@ -27,7 +27,8 @@
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see .
-*
+*
+* State file stuff originally by: Alain Williams
*
*****************************************************************************/
@@ -42,13 +43,7 @@
#include "regex.h"
#include
-
-int process_arguments (int, char **);
-int validate_arguments (void);
-int check_thresholds (int);
-int convert_to_seconds (char *);
-void print_help (void);
-void print_usage (void);
+#include
int wmax = -1;
int cmax = -1;
@@ -77,6 +72,7 @@
METRIC_ELAPSED
};
enum metric metric = METRIC_PROCS;
+char metric_state_name = 'P'; /* Metric name in the state file */
int verbose = 0;
int uid;
@@ -92,9 +88,98 @@
char *fmt;
char *fails;
char tmp[MAX_INPUT_BUFFER];
+time_t now;
+time_t state_limit_start;
FILE *ps_input = NULL;
+/* Optionally trigger an alert if a process has been in a state for
+ * some time. This time will be measured in minutes, ie much longer than
+ * this program runs for - thus a state file is needed to store this
+ * between runs of this program.
+ * This happens if --state-file is specified.
+ *
+ * The state file records information about processes that exceed some criteria
+ * for a warning or critical notice.
+ * The file will contain one 'V' line.
+ * If the metric is PROCS there will be one N line, else zero or more P lines.
+ * Format of the state file:
+ * Max line length of 500
+ * Empty lines and lines starting '#' are ignored
+ * Lines consist of a type character, a space and optional extra information
+ * V version_number
+ * P pid ppid name MS secs
+ * name is limited to a max 20 characters
+ * M is the metric:
+ * V virtual memory size
+ * R resident set memory size
+ * C percentage CPU
+ * E time elapsed in seconds
+ * S is the state:
+ * W Warning
+ * C Critical
+ * N MS secs
+ * M will be 'P'
+ * S is the state as above
+ * secs is the epoch time the metric was first exceeded - hex number
+ * The 'MS secs' group may appear twice on a P or N line, as it is possible for a process (or the
+ * max # processes) to exceed both the warning and critical thresholds, each since a different time.
+ * If something is C then it is implicitly W.
+ *
+ * There might be a trailing space on a N line.
+ *
+ * Eg:
+ * P 1234 1200 cpu_hog CW 4a05a817 CC 4a05a91f
+ * N PW 4a05a91f
+ */
+/* Constants describing the state file format */
+#define STATE_VERSION 1 /* Written on the 'V' line and checked on read - change me if the file format changes */
+#define STATE_MAX_LINE 500 /* Longest line - size of the buffer used to read the file */
+#define MAX_PROG_NAME 20 /* Longest name of program - search for this if you change it (it is repeated in a sscanf format) */
+#define METRIC_CODES "PVRCE" /* Valid metric characters - for input validation */
+#define STATE_CODES "WC" /* Valid state characters - for input validation */
+#define STATE2state(x) ((x) == STATE_WARNING ? 'W' : 'C') /* Convert STATE_WARNING or STATE_CRITICAL to 'W' or 'C' (anything that is not STATE_WARNING gives 'C') */
+
+/* A process can exceed various limits. This describes one of them.
+ * Entries are chained through pl_next, one chain per process (or one chain
+ * for the number of processes).
+ */
+typedef struct plimit {
+ struct plimit* pl_next; /* NULL terminated list */
+ time_t pl_when; /* When it first exceeded this limit */
+ int pl_state; /* STATE_WARNING or STATE_CRITICAL */
+ int pl_seen; /* Exceeded this run - only seen entries are written back to the state file */
+ char pl_metric; /* What is exceeded - as in file */
+} PLimit;
+
+/* Something to describe a process that is exceeding something.
+ * Entries are created from 'P' lines in the state file and for processes
+ * found to exceed a limit during this run.
+ */
+typedef struct exproc {
+ struct exproc* ep_next; /* NULL terminated list */
+ pid_t ep_pid; /* Process ID */
+ pid_t ep_ppid; /* Parent PID */
+ char* ep_prog; /* Program name - allocated with strdup() */
+ PLimit* ep_limits; /* Limits exceeded list */
+ int ep_seen; /* Updated/noticed this run - unseen entries are not written back */
+} ExProc;
+
+/* Globals for the state file handling */
+char* state_filename; /* File that we store this in - NULL unless --state-file was given */
+int state_time = 5; /* Trigger time - minutes (set by --state-time) */
+ExProc* state_list; /* Used for process specific metrics - ie metric is *not* PROCS */
+PLimit* state_nprocs; /* Info on # procs exceeded - used if metric is PROCS */
+int state_changed; /* Ie need to write back to file */
+int must_rewrite; /* Set this if there is a syntax error in the file, or
+ * some other reason which means we must rewrite it */
+
+/* Prototypes that this patch moves down to here, below the state type definitions */
+int process_arguments (int, char **);
+int validate_arguments (void);
+int check_thresholds (int);
+int convert_to_seconds (char *);
+void print_help (void);
+void print_usage (void);
+/* State file support */
+void read_state_file(void);
+void write_state_file(char** argv);
+void record_state(pid_t procpid, pid_t procppid, char* procprog, char prog_metric, int state, time_t start_time);
+void record_limit(PLimit** l_ref, int state, char proc_metric, time_t start_time);
+void read_limit_line(const char* in_line, PLimit** ppl, char* state_filename, int line_no);
+int check_limit(PLimit* pl);
int
main (int argc, char **argv)
@@ -129,13 +214,16 @@
int result = STATE_UNKNOWN;
output chld_out, chld_err;
+ now = time(NULL);
+
setlocale (LC_ALL, "");
bindtextdomain (PACKAGE, LOCALEDIR);
textdomain (PACKAGE);
setlocale(LC_NUMERIC, "POSIX");
- input_buffer = malloc (MAX_INPUT_BUFFER);
- procprog = malloc (MAX_INPUT_BUFFER);
+ if( ! (input_buffer = malloc (MAX_INPUT_BUFFER)) ||
+ ! (procprog = malloc (MAX_INPUT_BUFFER)))
+ die(STATE_UNKNOWN, _("Out of memory in startup\n"));
asprintf (&metric_name, "PROCS");
metric = METRIC_PROCS;
@@ -168,6 +256,9 @@
result = cmd_file_read( input_filename, &chld_out, 0);
}
+ /* What do we remember from last time ? */
+ read_state_file();
+
/* flush first line: j starts at 1 */
for (j = 1; j < chld_out.lines; j++) {
input_line = chld_out.line[j];
@@ -237,6 +328,10 @@
procetime, procprog, procargs);
}
+ /* This is all made simpler because metric can only talk about
+ * one metric, ie can't check more than one thing at a time.
+ * This means that metric_state_name is the char equivalent of metric.
+ */
if (metric == METRIC_VSZ)
i = check_thresholds (procvsz);
else if (metric == METRIC_RSS)
@@ -248,15 +343,29 @@
i = check_thresholds (procseconds);
if (metric != METRIC_PROCS) {
- if (i == STATE_WARNING) {
- warn++;
- asprintf (&fails, "%s%s%s", fails, (strcmp(fails,"") ? ", " : ""), procprog);
- result = max_state (result, i);
- }
- if (i == STATE_CRITICAL) {
- crit++;
- asprintf (&fails, "%s%s%s", fails, (strcmp(fails,"") ? ", " : ""), procprog);
- result = max_state (result, i);
+ if(state_filename) {
+ /* State is being stored - ie don't report immediately.
+ * Note what we have found:
+ */
+ if(i == STATE_WARNING || i == STATE_CRITICAL)
+ record_state(procpid, procppid, procprog, metric_state_name, i, now);
+ } else {
+ if (i == STATE_WARNING) {
+ char* str = fails;
+ warn++;
+ asprintf (&fails, "%s%s%s", fails, (strcmp(fails,"") ? ", " : ""), procprog);
+ result = max_state (result, i);
+ if(str)
+ free(str);
+ }
+ if (i == STATE_CRITICAL) {
+ char* str = fails;
+ crit++;
+ asprintf (&fails, "%s%s%s", fails, (strcmp(fails,"") ? ", " : ""), procprog);
+ result = max_state (result, i);
+ if(str)
+ free(str);
+ }
}
}
}
@@ -276,7 +385,59 @@
/* Needed if procs found, but none match filter */
if ( metric == METRIC_PROCS ) {
- result = max_state (result, check_thresholds (procs) );
+ int threshold = check_thresholds(procs);
+ int putative_result = max_state(result, threshold);
+
+ if(state_filename) { /* Do not report immediately - note what we found */
+ /* Only record something if we may need to report it */
+ if(putative_result == STATE_WARNING || putative_result == STATE_CRITICAL)
+ record_limit(&state_nprocs, putative_result, 'P', now);
+ } else
+ result = putative_result;
+ }
+
+ /* If we have a state file, the above has just stored the results away, so have
+ * a look and see if there is anything that we should note.
+ * The slight subtlety is that we could have something recorded as both a warning
+ * & a critical - in this case only report the critical.
+ */
+ if(state_filename) {
+ /* Compute the start time of any state that we must report.
+ * Ie any state younger than this we keep quiet about.
+ */
+ state_limit_start = (time_t)((unsigned long)now - state_time * 60);
+
+ if(verbose >= 3)
+ printf("Checking metric %c, limit_start %s", metric_state_name, ctime(&state_limit_start));
+
+ if(metric == METRIC_PROCS) {
+ result = check_limit(state_nprocs);
+ } else {
+ ExProc* pp;
+
+ for(pp = state_list; pp; pp = pp->ep_next) {
+ char* str = fails;
+
+ /* What is the state of this recorded process ? */
+ int res = check_limit(pp->ep_limits);
+
+ switch(res) {
+ case STATE_OK:
+ continue; /* Don't do the stuff below */
+ case STATE_WARNING:
+ warn++;
+ break;
+ case STATE_CRITICAL:
+ crit++;
+ break;
+ }
+
+ asprintf (&fails, "%s%s%s", fails, (strcmp(fails,"") ? ", " : ""), pp->ep_prog);
+ result = max_state(result, res);
+ if(str)
+ free(str);
+ }
+ }
}
if ( result == STATE_OK ) {
@@ -302,6 +463,9 @@
printf (" [%s]", fails);
printf ("\n");
+
+ write_state_file(argv);
+
return result;
}
@@ -336,6 +500,8 @@
{"verbose", no_argument, 0, 'v'},
{"ereg-argument-array", required_argument, 0, CHAR_MAX+1},
{"input-file", required_argument, 0, CHAR_MAX+2},
+ {"state-file", required_argument, 0, 'S'},
+ {"state-time", required_argument, 0, 'T'},
{0, 0, 0, 0}
};
@@ -344,7 +510,7 @@
strcpy (argv[c], "-t");
while (1) {
- c = getopt_long (argc, argv, "Vvht:c:w:p:s:u:C:a:z:r:m:P:",
+ c = getopt_long (argc, argv, "Vvht:c:w:p:s:u:C:a:z:r:m:P:S:T:",
longopts, &option);
if (c == -1 || c == EOF)
@@ -479,22 +645,27 @@
asprintf (&metric_name, "%s", optarg);
if ( strcmp(optarg, "PROCS") == 0) {
metric = METRIC_PROCS;
+ metric_state_name = 'P';
break;
}
else if ( strcmp(optarg, "VSZ") == 0) {
metric = METRIC_VSZ;
+ metric_state_name = 'V';
break;
}
else if ( strcmp(optarg, "RSS") == 0 ) {
metric = METRIC_RSS;
+ metric_state_name = 'R';
break;
}
else if ( strcmp(optarg, "CPU") == 0 ) {
metric = METRIC_CPU;
+ metric_state_name = 'C';
break;
}
else if ( strcmp(optarg, "ELAPSED") == 0) {
metric = METRIC_ELAPSED;
+ metric_state_name = 'E';
break;
}
@@ -505,6 +676,16 @@
case CHAR_MAX+2:
input_filename = optarg;
break;
+ case 'S': /* state-file */
+ state_filename = optarg;
+ break;
+ case 'T': /* state-time */
+ if (!is_integer (optarg))
+ usage2 (_("state-time must be a positive integer"), optarg);
+ else
+ if((state_time = atoi (optarg)) < 0) /* Treat -ve time as zero */
+ state_time = 0;
+ break;
}
}
@@ -727,6 +908,12 @@
printf (" %s\n", "-C, --command=COMMAND");
printf (" %s\n", _("Only scan for exact matches of COMMAND (without path)."));
+ printf ("\n");
+ printf ( "%s\n", "State memory (complain if a process exceeds a limit for a long time):");
+ printf ( " %s\n", "-S, --state-file=StateFile");
+ printf ( " %s\n", _("Store process information in this file"));
+ printf ( " %s\n", "-T, --state-time=minutes");
+
printf(_("\n\
RANGEs are specified 'min:max' or 'min:' or ':max' (or 'max'). If\n\
specified 'max:min', a warning status will be generated if the\n\
@@ -755,7 +942,9 @@
printf (" %s\n", "check_procs -w 50000 -c 100000 --metric=VSZ");
printf (" %s\n\n", _("Alert if VSZ of any processes over 50K or 100K"));
printf (" %s\n", "check_procs -w 10 -c 20 --metric=CPU");
- printf (" %s\n", _("Alert if CPU of any processes over 10%% or 20%%"));
+ printf (" %s\n\n", _("Alert if CPU of any processes over 10% or 20%"));
+ printf (" %s\n", "check_procs -w 80 -c 90 --metric=CPU --state-time=10 --state-file=/tmp/CPU-state");
+ printf (" %s\n", _("Alert if CPU of any processes over 80% or 90% for at least 10 minutes, record state in /tmp/CPU-state"));
printf (_(UT_SUPPORT));
}
@@ -766,5 +955,423 @@
printf (_("Usage: "));
printf ("%s -w -c [-m metric] [-s state] [-p ppid]\n", progname);
printf (" [-u user] [-r rss] [-z vsz] [-P %%cpu] [-a argument-array]\n");
- printf (" [-C command] [-t timeout] [-v]\n");
+ printf (" [-C command] [-t timeout] [-v] [-S state_file] [-T state_time_minutes]\n");
+}
+
+/* Read the state file - if there is one.
+ *
+ * Returns at once if no state file was requested. A file that does not exist
+ * yet is not an error: must_rewrite is set so that it gets created when the
+ * state is written back. Any other failure to open the file is fatal.
+ *
+ * 'P' lines are pushed onto state_list and the limits of 'N' lines onto
+ * state_nprocs. On a syntax error or a version mismatch a message is printed,
+ * must_rewrite is set and reading stops; whatever was read from earlier lines
+ * is kept.
+ *
+ * This file is not locked. On a horribly over loaded system it might happen that reads & writes
+ * could overlap in the wrong way. Locking could make things worse, you may end up with many
+ * instances of this program waiting on the lock.
+ */
+void
+read_state_file(void)
+{
+	FILE* sf;
+	char* errstr;
+	char* str;
+	char in_buf[STATE_MAX_LINE];	/* Input buffer */
+	int line_no = 0;
+	char prog_name[MAX_PROG_NAME + 1];
+	int pid;
+	int ppid;
+	int eaten;
+	ExProc* pp;
+
+	if( ! state_filename)	/* No file specified */
+		return;
+
+	if( ! (sf = fopen(state_filename, "r"))) {
+		/* It is OK if it doesn't exist, we just haven't created it yet */
+		if(errno == ENOENT) {
+			must_rewrite = 1;	/* Force it to be created */
+			return;
+		}
+
+		/* Anything else - should not happen */
+		errstr = strerror(errno);
+		die(STATE_UNKNOWN, _("Can't open %s for reading as: %s"), state_filename, errstr);
+	}
+
+	/* Read a line at a time */
+	while(fgets(in_buf, STATE_MAX_LINE, sf)) {
+		line_no++;
+		if( ! (str = strchr(in_buf, '\n'))) {
+			printf(_("State file %s corrupt, line too long, at line %d\n"), state_filename, line_no);
+			must_rewrite = 1;	/* Force write */
+			goto read_off;		/* Will be fixed when we rewrite it in a moment */
+		}
+		*str = '\0';
+
+		/* Empty line or comment ? */
+		if(in_buf[0] == '\0' || in_buf[0] == '#')
+			continue;
+
+		if(in_buf[1] != ' ') {
+			printf(_("State file %s corrupt, no space at position 1, at line %d\n"), state_filename, line_no);
+			must_rewrite = 1;	/* Force write */
+			goto read_off;
+		}
+
+		/* What line type ? */
+		switch(in_buf[0]) {
+		case 'V':	/* In case we are running 1st time after upgrade */
+			if(atoi(in_buf + 2) != STATE_VERSION) {
+				printf(_("State file %s is wrong version, expecting %d. File ignored\n"), state_filename, STATE_VERSION);
+				must_rewrite = 1;	/* Force write */
+				goto read_off;
+			}
+			break;
+		case 'P':	/* Info about a process */
+			/* P 1234 1200 cpu_hog CW 4a05a817 CC 4a05a92f */
+
+			/* Parse into locals before allocating anything: a half filled
+			 * entry (in particular one with a NULL ep_prog) must never be
+			 * linked into state_list, as record_state() hands ep_prog to strcmp().
+			 * Using int locals also keeps %d away from pid_t pointers.
+			 * MAX_PROG_NAME on next line
+			 */
+			if(sscanf(in_buf, "P %d %d %20s%n", &pid, &ppid, prog_name, &eaten) != 3) {
+				printf(_("State file corrupt, bad process line, file %s line %d\n"), state_filename, line_no);
+				must_rewrite = 1;	/* Force write */
+				goto read_off;
+			}
+
+			if( ! (pp = calloc(1, sizeof(*pp))))
+				die(STATE_UNKNOWN, _("Out of memory reading %s line %d\n"), state_filename, line_no);
+
+			if( ! (pp->ep_prog = strdup(prog_name)))
+				die(STATE_UNKNOWN, _("Out of memory reading %s line %d\n"), state_filename, line_no);
+
+			pp->ep_pid = pid;
+			pp->ep_ppid = ppid;
+
+			/* The entry is complete - now it is safe to link it in */
+			pp->ep_next = state_list;
+			state_list = pp;
+
+			if(verbose >= 3)
+				printf("Read pid %d ppid %d proc %s\n", pp->ep_pid, pp->ep_ppid, pp->ep_prog);
+
+			read_limit_line(in_buf + eaten, &pp->ep_limits, state_filename, line_no);
+			break;
+		case 'N':	/* Number of procs exceeded */
+			if(verbose >= 3)
+				printf("Read N:\n");
+			read_limit_line(in_buf + 1, &state_nprocs, state_filename, line_no);
+			break;
+		default:
+			printf(_("State file %s corrupt, unknown line type, at line %d\n"), state_filename, line_no);
+			must_rewrite = 1;	/* Force write */
+			goto read_off;
+		}
+	}
+
+	/* Ignore changes so far */
+read_off:
+	state_changed = 0;
+
+	fclose(sf);
+}
+
+
+/* Parse the limit groups that form a 'N' line or the tail of a 'P' line.
+ * in_line is expected to be empty or to begin with a space.
+ * Args:
+ *	in_line		the text to parse
+ *	ppl		address of the head of the list that receives what is read
+ *	state_filename	the name of the file being read - for messages
+ *	line_no		the line number that was read - for messages
+ *
+ * Each group is a metric code, a state code and a hex time, eg:
+ *	PW 4a05a91f PC 4a05a817
+ *
+ * Every group parsed is pushed onto the front of *ppl. At the first bad
+ * group a message is printed, must_rewrite is set and parsing stops; groups
+ * already stored are kept.
+ */
+void
+read_limit_line(const char* in_line, PLimit** ppl, char* state_filename, int line_no)
+{
+	const char* cursor = in_line;
+
+	for(;;) {
+		PLimit* entry;
+		unsigned long stamp;
+		int used;
+
+		/* Step over the spaces between groups, stop at the end of the text */
+		while(*cursor == ' ')
+			cursor++;
+		if(*cursor == '\0')
+			break;
+
+		/* There is a group here, get somewhere to keep it */
+		entry = calloc(sizeof(PLimit), 1);
+		if(entry == NULL)
+			die(STATE_UNKNOWN, _("Out of memory reading %s line %d\n"), state_filename, line_no);
+
+		/* First character: the metric */
+		if(strchr(METRIC_CODES, *cursor) == NULL) {
+			printf(_("State file %s corrupt, unknown metric code, at line %d\n"), state_filename, line_no);
+			must_rewrite = 1;
+			free(entry);
+			return;
+		}
+		entry->pl_metric = *cursor;
+		cursor++;
+
+		/* Second character: the state */
+		if(*cursor == '\0' || strchr(STATE_CODES, *cursor) == NULL) {
+			printf(_("State file %s corrupt, unknown state code, at line %d\n"), state_filename, line_no);
+			must_rewrite = 1;
+			free(entry);
+			return;
+		}
+		if(*cursor == 'W')
+			entry->pl_state = STATE_WARNING;
+		else
+			entry->pl_state = STATE_CRITICAL;
+		cursor++;
+
+		/* Then the time, in hex */
+		if(sscanf(cursor, " %lx%n", &stamp, &used) != 1) {
+			printf(_("State file %s corrupt, bad time, at line %d\n"), state_filename, line_no);
+			must_rewrite = 1;
+			free(entry);
+			return;
+		}
+		entry->pl_when = (time_t)stamp;
+		cursor += used;
+
+		if(verbose >= 3)
+			printf(" metric=%c state=%c since %s", entry->pl_metric, STATE2state(entry->pl_state), ctime(&entry->pl_when));
+
+		/* Push onto the front of the list */
+		entry->pl_next = *ppl;
+		*ppl = entry;
+	}
+
+	if(verbose >= 3)
+		printf("\n");
+}
+
+
+/* Return 1 if any limit on the list was not seen during this run, else 0 */
+static int
+limits_have_unseen(const PLimit* pl)
+{
+	for(; pl; pl = pl->pl_next)
+		if( ! pl->pl_seen)
+			return 1;
+
+	return 0;
+}
+
+/* Append a " MS secs" group to sf for every limit on the list seen this run */
+static void
+write_seen_limits(FILE* sf, const PLimit* pl)
+{
+	for(; pl; pl = pl->pl_next)
+		if(pl->pl_seen)
+			fprintf(sf, " %c%c %lx", pl->pl_metric, STATE2state(pl->pl_state), (unsigned long)pl->pl_when);
+}
+
+/* Write back to the state file.
+ * Does nothing if no state file is in use, or if nothing has changed and
+ * must_rewrite is not set.
+ * Args:
+ *	argv	NULL terminated argument vector, written to the file as a comment
+ */
+void
+write_state_file(char** argv)
+{
+	FILE* out;
+	char* errstr;
+	ExProc* proc;
+	char** arg;
+
+	if( ! state_filename)	/* No file specified */
+		return;
+
+	/* Decide whether the file needs writing.
+	 * Anything that has not been seen this run came from the file and must
+	 * now be dropped from it - which needs a rewrite to lose the entry.
+	 * Other changes have already set state_changed as they were made.
+	 */
+	if(metric == METRIC_PROCS) {
+		if(limits_have_unseen(state_nprocs))
+			state_changed = 1;
+	} else {
+		for(proc = state_list; proc; proc = proc->ep_next)
+			if( ! proc->ep_seen || limits_have_unseen(proc->ep_limits))
+				state_changed = 1;
+	}
+
+	if(verbose >= 3)
+		printf("Write state, changed=%d\n", state_changed);
+
+	/* No change to the state file ? */
+	if( ! (state_changed || must_rewrite))
+		return;
+
+	out = fopen(state_filename, "w");
+	if(out == NULL) {
+		errstr = strerror(errno);	/* Fetch this before anything can disturb errno */
+		die(STATE_UNKNOWN, _("Can't open %s for writing as: %s"), state_filename, errstr);
+	}
+
+	/* Header: who wrote the file, with what arguments, and the format version */
+	fprintf(out, "# Process state file written by %s - DO NOT HAND EDIT\n", progname);
+	fprintf(out, "# Args:");
+	for(arg = argv; *arg; arg++)
+		fprintf(out, " %s", *arg);
+	fprintf(out, "\n");
+	fprintf(out, "V %d\n", STATE_VERSION);
+
+	if(metric == METRIC_PROCS) {
+		/* The # processes is what is being checked: a single N line */
+		fprintf(out, "N");
+		write_seen_limits(out, state_nprocs);
+
+		/* Space before \n is important - else get error on read if no limits follow -- which
+		 * will happen if all is well.
+		 */
+		fprintf(out, " \n");
+	} else {
+		/* One P line for each process that was noticed this run */
+		for(proc = state_list; proc; proc = proc->ep_next) {
+			if(proc->ep_seen) {
+				fprintf(out, "P %d %d %.*s", proc->ep_pid, proc->ep_ppid, MAX_PROG_NAME, proc->ep_prog);
+				write_seen_limits(out, proc->ep_limits);
+				fprintf(out, "\n");
+			}
+		}
+	}
+
+	fclose(out);
+}
+
+
+/* Record a state for a program.
+ * Create a new entry if we need to, or update an existing one.
+ * An existing entry is found by pid. If its ppid or program name differ the
+ * process is taken to have mutated: the limits recorded for it are thrown
+ * away and the entry is reused.
+ * Args:
+ *	procpid		Process ID
+ *	procppid	Parent process ID
+ *	procprog	Program name
+ *	prog_metric	What we are measuring (METRIC_something but represented as the character in the file)
+ *	state		Warning or critical (STATE_something)
+ *	start_time	Time to note as the start of this state - passed on to record_limit()
+ */
+void
+record_state(pid_t procpid, pid_t procppid, char* procprog, char prog_metric, int state, time_t start_time)
+{
+	ExProc* pp;
+
+	/* Look for the process */
+	for(pp = state_list; pp; pp = pp->ep_next)
+		if(pp->ep_pid == procpid)
+			break;
+
+	if(pp) {
+		/* An entry left behind by a bad line in the state file can have no
+		 * name at all - treat that as a changed name rather than hand NULL to strcmp().
+		 */
+		int prog_differs = ! pp->ep_prog || strcmp(pp->ep_prog, procprog) != 0;
+
+		/* Right process, but if it has mutated - throw it away and start again.
+		 * This doesn't detect processes that exec() a lot w/out fork(), but that is rare.
+		 */
+		if(pp->ep_ppid != procppid || prog_differs) {
+			PLimit* pl;
+
+			while((pl = pp->ep_limits) != NULL) {
+				pp->ep_limits = pl->pl_next;
+				free(pl);
+			}
+			pp->ep_ppid = procppid;
+			if(prog_differs) {
+				char* fresh = strdup(procprog);
+
+				if( ! fresh)
+					die(STATE_UNKNOWN, _("Out of memory"));
+				free(pp->ep_prog);	/* free(NULL) is harmless */
+				pp->ep_prog = fresh;
+			}
+
+			state_changed = 1;
+		}
+
+		if(verbose >= 3)
+			printf("Record found: pid %d %s\n", pp->ep_pid, pp->ep_prog);
+	} else {
+		/* Didn't find the process, allocate a new entry */
+		if( ! (pp = calloc(1, sizeof(*pp))))
+			die(STATE_UNKNOWN, _("Out of memory"));
+		pp->ep_pid = procpid;
+		pp->ep_ppid = procppid;
+		if( ! (pp->ep_prog = strdup(procprog)))
+			die(STATE_UNKNOWN, _("Out of memory"));
+
+		pp->ep_next = state_list;
+		state_list = pp;
+		state_changed = 1;
+
+		if(verbose >= 3)
+			printf("Record alloc: pid %d %s\n", pp->ep_pid, pp->ep_prog);
+	}
+
+	pp->ep_seen = 1;	/* Ensure that this gets written out */
+
+	record_limit(&pp->ep_limits, state, prog_metric, start_time);
+}
+
+
+/* Store a limit
+ *	l_ref		address of head of limits chain
+ *	state		Warning or critical (STATE_WARNING or STATE_CRITICAL)
+ *	proc_metric	What we are measuring (METRIC_something but represented as the character in the file)
+ *	start_time	The time to record when it started, if we already record this - don't change the time
+ *			unless this is older.
+ *
+ * If something is C then it is implicitly W. This is important: if something goes from
+ * W to C, it might remain at C for less than the state time (which it might do W -> C
+ * & back again several times) - but the time above the W level might be notifiable.
+ * So recording a critical also marks an existing warning for the same metric as
+ * seen, or creates one if there is none.
+ */
+void
+record_limit(PLimit** l_ref, int state, char proc_metric, time_t start_time)
+{
+	PLimit* pl;
+	PLimit* pl_found = NULL;
+	int seen_warning = 0;
+
+	/* Find the individual process limit.
+	 * Scan the whole lot since we want to 'seen' a Warning if we have Critical.
+	 */
+	for(pl = *l_ref; pl; pl = pl->pl_next) {
+		if(pl->pl_metric != proc_metric)
+			continue;
+
+		if(state == STATE_CRITICAL && pl->pl_state == STATE_WARNING) {
+			pl->pl_seen = 1;	/* Ensure that it is output */
+			seen_warning = 1;
+		}
+
+		if(pl->pl_state == state)
+			pl_found = pl;	/* We found what we were looking for */
+	}
+
+	pl = pl_found;
+
+	if( ! pl) {
+		/* Didn't find it, allocate a new one */
+		if( ! (pl = calloc(1, sizeof(*pl))))
+			die(STATE_UNKNOWN, _("Out of memory"));
+		pl->pl_next = *l_ref;
+		*l_ref = pl;
+		pl->pl_when = start_time;
+		pl->pl_state = state;
+		pl->pl_metric = proc_metric;
+
+		state_changed = 1;
+	} else if(pl->pl_when > start_time) {
+		/* It is possible that the time was set earlier when a 'C' generated
+		 * an implicit 'W'. Since the 'W' prob started earlier than the 'C'
+		 * we may have recorded the later 'C' time rather than the 'W' time.
+		 */
+		pl->pl_when = start_time;
+	}
+
+	pl->pl_seen = 1;	/* Ensure that it is output */
+
+	if(verbose >= 3)
+		printf("Record limit: metric=%c state=%c since %s", pl->pl_metric, STATE2state(pl->pl_state), ctime(&pl->pl_when));
+
+	/* If this is a critical, but we didn't see the warning - generate the warning.
+	 * state holds a STATE_something value, not the 'W'/'C' character used in the
+	 * file, so it must be compared with (and the call made with) the STATE_ values.
+	 * The nested call is made with STATE_WARNING, so it goes no deeper.
+	 */
+	if(state == STATE_CRITICAL && ! seen_warning)
+		record_limit(l_ref, STATE_WARNING, proc_metric, start_time);
+}
+
+
+/* Work out what should be reported for a list of limits.
+ * A limit counts only if it was exceeded again this run, is for the metric
+ * being checked (metric_state_name, whatever else is stored) and has been
+ * exceeded since state_limit_start or earlier.
+ *
+ * Args:
+ *	pl	List of limit values
+ *
+ * Return: STATE_OK, STATE_WARNING or STATE_CRITICAL
+ */
+int
+check_limit(PLimit* pl)
+{
+	int worst = STATE_OK;
+	PLimit* entry = pl;
+
+	while(entry) {
+		if(entry->pl_seen
+		   && entry->pl_metric == metric_state_name
+		   && entry->pl_when <= state_limit_start) {
+			/* Been going on long enough to report: the first one sets
+			 * the result, after that a critical overrides a warning.
+			 */
+			if(worst == STATE_OK)
+				worst = entry->pl_state;
+			else if(worst == STATE_WARNING && entry->pl_state == STATE_CRITICAL)
+				worst = STATE_CRITICAL;
+		}
+
+		entry = entry->pl_next;
+	}
+
+	return worst;
}