aboutsummaryrefslogtreecommitdiffstats
path: root/modules/xymon/templates/hobbit-clients.cfg
blob: 87c039a0f0925e1293c3e228e7fec31dd89150fc (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
# hobbit-clients.cfg - configuration file for clients reporting to Xymon
#
# This file is used by the hobbitd_client module, when it builds the
# cpu, disk, files, memory, msgs and procs status messages from the 
# information reported by clients running on the monitored systems.
#
# This file must be installed on the Xymon server - client installations
# do not need this file.
#
# The file defines a series of rules:
#    UP     : Changes the "cpu" status when the system has rebooted recently,
#             or when it has been running for too long.
#    LOAD   : Changes the "cpu" status according to the system load.
#    CLOCK  : Changes the "cpu" status if the client system clock is
#             not synchronized with the clock of the Xymon server.
#    DISK   : Changes the "disk" status, depending on the amount of space
#             used of filesystems.
#    MEMPHYS: Changes the "memory" status, based on the percentage of real
#             memory used.
#    MEMACT : Changes the "memory" status, based on the percentage of "actual"
#             memory used. Note: Not all systems report an "actual" value.
#    MEMSWAP: Changes the "memory" status, based on the percentage of swap
#             space used.
#    PROC   : Changes the "procs" status according to which processes were found
#             in the "ps" listing from the client.
#    LOG    : Changes the "msgs" status according to entries in text-based logfiles.
#             Note: The "client-local.cfg" file controls which logfiles the client will report.
#    FILE   : Changes the "files" status according to meta-data for files.
#             Note: The "client-local.cfg" file controls which files the client will report.
#    DIR    : Changes the "files" status according to the size of a directory.
#             Note: The "client-local.cfg" file controls which directories the client will report.
#    PORT   : Changes the "ports" status according to which tcp ports were found
#             in the "netstat" listing from the client.
#    DEFAULT: Set the default values that apply if no other rules match.
#
# All rules can be qualified so they apply only to certain hosts, or on certain
# times of the day (see below).
#
# Each type of rule takes a number of parameters:
#    UP bootlimit toolonglimit
#             The cpu status goes yellow if the system has been up for less than
#             "bootlimit" time, or longer than "toolonglimit". The time is in
#             minutes, or you can add h/d/w for hours/days/weeks - eg. "2h" for
#             two hours, or "4w" for 4 weeks.
#             Defaults: bootlimit=1h, toolonglimit=-1 (infinite).
#
#    LOAD warnlevel paniclevel
#             If the system load exceeds "warnlevel" or "paniclevel", the "cpu"
#             status will go yellow or red, respectively. These are decimal
#             numbers.
#             Defaults: warnlevel=5.0, paniclevel=10.0
#
#    CLOCK maximum-offset
#             If the system clock of the client differs from that of the Xymon
#             server by more than "maximum-offset" seconds, then the CPU status
#             column will go yellow. Note that the accuracy of this test is limited,
#             since it is affected by the time it takes a client status report to
#             go from the client to the Xymon server and be processed. You should
#             therefore allow for a few seconds (5-10) of slack when you define
#             your max. offset.
#             It is not wise to use this test, unless your servers are synchronized
#             to a common clock, e.g. through NTP.
#
#    DISK filesystem warnlevel paniclevel
#    DISK filesystem IGNORE
#             If the utilization of "filesystem" is reported to exceed "warnlevel"
#             or "paniclevel", the "disk" status will go yellow or red, respectively.
#             "warnlevel" and "paniclevel" are either the percentage used, or the
#             space available as reported by the local "df" command on the host.
#             For the latter type of check, the "warnlevel" must be followed by the
#             letter "U", e.g. "1024U".
#             The special keyword "IGNORE" causes this filesystem to be ignored
#             completely, i.e. it will not appear in the "disk" status column and
#             it will not be tracked in a graph. This is useful for e.g. removable
#             devices, backup-disks and similar hardware.
#             "filesystem" is the mount-point where the filesystem is mounted, e.g.
#             "/usr" or "/home". A filesystem-name that begins with "%" is interpreted
#             as a Perl-compatible regular expression; e.g. "%^/oracle.*/" will match
#             any filesystem whose mountpoint begins with "/oracle".
#             Defaults: warnlevel=90%, paniclevel=95%
#
#    MEMPHYS warnlevel paniclevel
#    MEMACT warnlevel paniclevel
#    MEMSWAP warnlevel paniclevel
#             If the memory utilization exceeds the "warnlevel" or "paniclevel", the
#             "memory" status will change to yellow or red, respectively.
#             Note: The words "PHYS", "ACT" and "SWAP" are also recognized.
#             Defaults: MEMPHYS warnlevel=100 paniclevel=101 (i.e. it will never go red)
#                       MEMSWAP warnlevel=50 paniclevel=80
#                       MEMACT  warnlevel=90 paniclevel=97
#
#    PROC processname minimumcount maximumcount color [TRACK=id] [TEXT=displaytext]
#             The "ps" listing sent by the client will be scanned for how many
#             processes containing "processname" are running, and this is then
#             matched against the min/max settings defined here. If the running
#             count is outside the thresholds, the color of the "procs" status
#             changes to "color".
#             To check for a process that must NOT be running: Set minimum and 
#             maximum to 0.
#
#             "processname" can be a simple string, in which case this string must
#             show up in the "ps" listing as a command. The scanner will find
#             a ps-listing of e.g. "/usr/sbin/cron" if you only specify "processname"
#             as "cron".
#             "processname" can also be a Perl-compatible regular expression, e.g.
#             "%java.*inst[0123]" can be used to find entries in the ps-listing for
#             "java -Xmx512m inst2" and "java -Xmx256 inst3". In that case, 
#             "processname" must begin with "%" followed by the reg.expression.
#             If "processname" contains whitespace (blanks or TAB), you must enclose
#             the full string in double quotes - including the "%" if you use regular
#             expression matching. E.g.
#                 PROC "%hobbitd_channel --channel=data.*hobbitd_rrd" 1 1 yellow
#             or
#                 PROC "java -DCLASSPATH=/opt/java/lib" 2 5
#
#             You can have multiple "PROC" entries for the same host, all of the
#             checks are merged into the "procs" status and the most severe
#             check defines the color of the status.
#
#             The TRACK=id option causes the number of processes found to be recorded
#             in an RRD file, with "id" as part of the filename. This graph will then
#             appear on the "procs" page as well as on the "trends" page. Note that
#             "id" must be unique among the processes tracked for each host.
#
#             The TEXT=displaytext option affects how the process appears on the
#             "procs" status page. By default, the process is listed with the
#             "processname" as identification, but if this is a regular expression
#             it may be a bit difficult to understand. You can then use e.g.
#             "TEXT=Apache" to make these processes appear with the name "Apache"
#             instead.
#
#             Defaults: mincount=1, maxcount=-1 (unlimited), color="red".
#                       Note: No processes are checked by default.
#
#             Example: Check that "cron" is running:
#                 PROC cron
#             Example: Check that at least 5 "httpd" processes are running, but
#             not more than 20:
#                 PROC httpd 5 20
#
#    LOG filename match-pattern [COLOR=color] [IGNORE=ignore-pattern] [TEXT=displaytext]
#             In the "client-local.cfg" file, you can list any number of files
#             that the client will collect log data from. These are sent to the
#             Xymon server together with the other client data, and you can then
#             choose how to analyze the log data with LOG entries.
#
#                             ************ IMPORTANT ***************
#             To monitor a logfile, you *MUST* configure both client-local.cfg
#             and hobbit-clients.cfg. If you configure only the client-local.cfg
#             file, the client will collect the log data and you can view it in 
#             the "client data" display, but it will not affect the color of the
#             "msgs" status. On the other hand, if you configure only the
#             hobbit-clients.cfg file, then there will be no log data to inspect,
#             and you will not see any updates of the "msgs" status either.
#
#             "filename" is a filename or pattern. The set of files reported by
#             the client is matched against "filename", and if they match then
#             this LOG entry is processed against the data from a file.
#
#             "match-pattern": The log data is matched against this pattern. If
#             there is a match, this log file causes a status change to "color".
#
#             "ignore-pattern": The log data that matched "match-pattern" is also
#             matched against "ignore-pattern". If the data matches the "ignore-pattern",
#             this line of data does not affect the status color. In other words,
#             the "ignore-pattern" can be used to refine the strings which cause
#             a match.
#             Note: The "ignore-pattern" is optional.
#
#             "color": The color which this match will trigger.
#             Note: "color" is optional, if omitted then "red" will be used.
#
#             Example: Go yellow if the text "WARNING" shows up in any logfile.
#                 LOG %.* WARNING COLOR=yellow
#
#             Example: Go red if the text "I/O error" or "read error" appears.
#                 LOG %/var/(adm|log)/messages %(I/O|read).error COLOR=red
#
#    FILE filename [color] [things to check] [TRACK]
#             NB: The files you wish to monitor must be listed in a "file:..."
#             entry in the client-local.cfg file, in order for the client to 
#             report any data about them.
#
#             "filename" is a filename or pattern. The set of files reported by
#             the client is matched against "filename", and if they match then
#             this FILE entry is processed against the data from that file.
#
#             [things to check] can be one or more of the following:
#             - "NOEXIST" triggers a warning if the file exists. By default,
#               a warning is triggered for files that have a FILE entry, but
#               which do not exist.
#             - "TYPE=type" where "type" is one of "file", "dir", "char", "block", 
#               "fifo", or "socket". Triggers warning if the file is not of the
#               specified type.
#             - "OWNERID=owner" and "GROUPID=group" trigger a warning if the owner
#               or group does not match what is listed here. "owner" and "group" are
#               specified either with the numeric uid/gid, or the user/group name.
#             - "MODE=mode" triggers a warning if the file permissions are not
#               as listed. "mode" is written in the standard octal notation, e.g.
#               "644" for the rw-r--r-- permissions.
#             - "SIZE<max.size" and "SIZE>min.size" trigger a warning if the file
#               size is greater than "max.size" or less than "min.size", respectively.
#               You can append "K" (KB), "M" (MB), "G" (GB) or "T" (TB) to the size.
#               If there is no such modifier, KB is assumed.
#               E.g. to warn if a file grows larger than 1MB (1024 KB): "SIZE<1M".
#             - "SIZE=size" triggers a warning if the file size is not what is listed.
#             - "MTIME>min.mtime" and "MTIME<max.mtime" checks how long ago the file
#               was last modified (in seconds). E.g. to check if a file was updated
#               within the past 10 minutes (600 seconds): "MTIME<600". Or to check 
#               that a file has NOT been updated in the past 24 hours: "MTIME>86400".
#             - "MTIME=timestamp" checks if a file was last modified at "timestamp".
#               "timestamp" is a unix epoch time (seconds since midnight Jan 1 1970 UTC).
#             - "CTIME>min.ctime", "CTIME<max.ctime", "CTIME=timestamp" act as the
#               mtime checks, but for the ctime timestamp (when the file's directory
#               entry was last changed, e.g. by chown, chgrp or chmod).
#             - "MD5=md5sum", "SHA1=sha1sum", "RMD160=rmd160sum" trigger a warning 
#               if the file checksum using the MD5, SHA1 or RMD160 message digest
#               algorithms do not match the one configured here. Note: The "file"
#               entry in the client-local.cfg file must specify which algorithm to use.
#
#             "TRACK" causes the size of this file to be tracked in an RRD file, and
#             shown on the graph on the "files" display.
#
#             Example: Check that the /var/log/messages file is not empty and was updated
#                      within the past 10 minutes, and go yellow if either fails:
#                 FILE /var/log/messages SIZE>0 MTIME<600 yellow
#
#             Example: Check the timestamp, size and SHA-1 hash of the /bin/sh program:
#                 FILE /bin/sh MTIME=1128514608 SIZE=645140 SHA1=5bd81afecf0eb93849a2fd9df54e8bcbe3fefd72
#
#    DIR directory [color] [SIZE<maxsize] [SIZE>minsize] [TRACK]
#             NB: The directories you wish to monitor must be listed in a "dir:..."
#             entry in the client-local.cfg file, in order for the client to 
#             report any data about them.
#
#             "directory" is a filename or pattern. The set of directories reported by
#             the client is matched against "directory", and if they match then
#             this DIR entry is processed against the data for that directory.
#
#             "SIZE<maxsize" and "SIZE>minsize" defines the size limits that the
#             directory must stay within. If it goes outside these limits, a warning
#             will trigger. Note that Xymon uses the raw number reported by the
#             local "du" command on the client. This is commonly KB, but it may be
#             disk blocks which are often 512 bytes.
#
#             "TRACK" causes the size of this directory to be tracked in an RRD file, 
#             and shown on the graph on the "files" display.
#
#    PORT [LOCAL=addr] [EXLOCAL=addr] [REMOTE=addr] [EXREMOTE=addr] [STATE=state] [EXSTATE=state] [MIN=mincount] [MAX=maxcount] [COLOR=color] [TRACK=id] [TEXT=displaytext]
#             The "netstat" listing sent by the client will be scanned for how many
#             sockets match the criteria listed.
#             "addr" is a (partial) address specification in the format used on 
#             the output from netstat. This is typically "10.0.0.1:80" for the IP 
#             10.0.0.1, port 80. Or "*:80" for any local address, port 80.
#             NB: The Xymon clients normally report only the numeric data for
#                 IP-addresses and port-numbers, so you must specify the port
#                 number (e.g. "80") instead of the service name ("www").
#             "state" causes only the sockets in the specified state to be included;
#             it is usually LISTEN or ESTABLISHED.
#             The socket count is then matched against the min/max settings defined 
#             here. If the count is outside the thresholds, the color of the "ports" 
#             status changes to "color".
#             To check for a socket that must NOT exist: Set minimum and
#             maximum to 0.
#
#             "addr" and "state" can be simple strings, in which case these strings must
#             show up in the "netstat" listing in the appropriate column.
#             "addr" and "state" can also be Perl-compatible regular expressions, e.g.
#             "LOCAL=%(:80|:443)" can be used to find entries in the netstat local port for
#             both http (port 80) and https (port 443). In that case, "addr" or "state" must
#             begin with "%" followed by the reg.expression.
#
#             The TRACK=id option causes the number of sockets found to be recorded
#             in an RRD file, with "id" as part of the filename. This graph will then
#             appear on the "ports" page as well as on the "trends" page. Note that
#             "id" must be unique among the ports tracked for each host.
#
#             The TEXT=displaytext option affects how the port appears on the
#             "ports" status page. By default, the port is listed with the
#             local/remote/state rules as identification, but this may be somewhat
#             difficult to understand. You can then use e.g. "TEXT=Secure Shell" to make 
#             these ports appear with the name "Secure Shell" instead.
#
#             Defaults: state="LISTEN", mincount=1, maxcount=-1 (unlimited), color="red".
#                       Note: No ports are checked by default.
#
#             Example: Check that there is someone listening on the https port:
#                 PORT "LOCAL=%([.:]443)$" state=LISTEN TEXT=https
#
#             Example: Check that at least 5 "ssh" connections are established, but
#             not more than 20; warn but do not error; graph the connection count:
#                 PORT "LOCAL=%([.:]22)$" state=ESTABLISHED min=5 max=20 color=yellow TRACK=ssh "TEXT=SSH logins"
#
#             Example: Check that ONLY ports 22, 80 and 443 are open for incoming connections:
#                 PORT STATE=LISTEN LOCAL=%0.0.0.0[.:].* EXLOCAL=%[.:](22|80|443)$ MAX=0 "TEXT=Bad listeners"
#
#
# To apply rules to specific hosts, you can use the "HOST=", "EXHOST=", "PAGE=" 
# "EXPAGE=", "CLASS=" or "EXCLASS=" qualifiers.  (These act just as in the 
# hobbit-alerts.cfg file).
#
# Hostnames are either a comma-separated list of hostnames (from the bb-hosts file), 
# "*" to indicate "all hosts", or a Perl-compatible regular expression.
# E.g. "HOST=dns.foo.com,www.foo.com" identifies two specific hosts; 
# "HOST=%www.*.foo.com EXHOST=www-test.foo.com" matches all hosts with a name
# beginning with "www", except the "www-test" host.
# "PAGE" and "EXPAGE" match the hostnames against the page where they are
# located in the bb-hosts file, via the bb-hosts' page/subpage/subparent
# directives. This can be convenient to pick out all hosts on a specific page.
#
# Rules can be dependent on time-of-day, using the standard Xymon syntax
# (see the bb-hosts(5) man page about the NKTIME parameter). E.g. "TIME=W:0800:2200"
# applied to a rule will make this rule active only on week-days between
# 8AM and 10PM.
#
# You can also associate a GROUP id with a rule. The group-id is passed to
# the alert module, which can then use it to control who gets an alert when
# a failure occurs. E.g. the following associates the "httpd" process check 
# with the "web" group, and the "sshd" check with the "admins" group:
#    PROC httpd 5 GROUP=web
#    PROC sshd 1 GROUP=admins
# In the hobbit-alerts.cfg file, you could then have rules like
#    GROUP=web
#       MAIL webmaster@foo.com
#    GROUP=admins
#       MAIL root@foo.com
#
# Qualifiers must be placed after each rule, e.g.
#    LOAD 8.0 12.0  HOST=db.foo.com TIME=*:0800:1600
#
# If you have multiple rules that you want to apply the same qualifiers to,
# you can write the qualifiers *only* on one line, followed by the rules. E.g.
#    HOST=%db.*.foo.com TIME=W:0800:1600
#       LOAD 8.0 12.0
#       DISK /db  98 100
#       PROC mysqld 1
# will apply the three rules to all of the "db" hosts on week-days between 8AM
# and 4PM. This can be combined with per-rule qualifiers, in which case the
# per-rule qualifier overrides the general qualifier; e.g.
#    HOST=%.*.foo.com
#       LOAD 7.0 12.0 HOST=bax.foo.com
#       LOAD 3.0 8.0
# will result in the load-limits being 7.0/12.0 for the "bax.foo.com" host,
# and 3.0/8.0 for all other foo.com hosts.
#
# The special DEFAULT section can modify the built-in defaults - this must
# be placed at the end of the file.

# Per-host overrides. Each HOST= line qualifies the indented rules below it;
# the domain part of every hostname is filled in by the template.

# rabbit: ignore every filesystem whose mount point ends in "stage2", so it
# does not appear in the "disk" status column and is not graphed.
HOST=rabbit.<%= domain %>
	DISK    %.*stage2$ IGNORE
 
# jonund has 24 cores and we try to utilise it as much as possible;
# a load average of up to 1.5*cores is probably not problematic.
# Yellow above 36.0 (1.5 * 24), red above 48.0 (2 * 24).
HOST=jonund.<%= domain %>
	LOAD	36.0 48.0

# ecosse has 8 cores, is a builder, and we try to use them all.
# Yellow above 12.0 (1.5 * 8), red above 16.0 (2 * 8).
HOST=ecosse.<%= domain %>
    LOAD    12.0 16.0

# rabbit has 8 cores and mksquashfs uses all of them.
# Same thresholds as ecosse: yellow above 12.0, red above 16.0.
HOST=rabbit.<%= domain %>
	LOAD	12.0	16.0

# Fallback rules for every host not matched above. Per the documentation at
# the top of this file, the DEFAULT section must stay at the end of the file.
DEFAULT
	# UP, LOAD, "DISK *" and the MEM* rules restate the built-in defaults
	# listed in the header; the remaining rules are site-wide additions.
	UP      1h
	LOAD    5.0 10.0
	# NOTE(review): thresholds above 100% presumably keep a mounted cdrom
	# from ever turning the "disk" status yellow/red - confirm.
	DISK    %^/mnt/cdrom 101 101
	DISK    * 90 95
	MEMPHYS 100 101
	MEMSWAP 50 80
	MEMACT  90 97
	# "cpu" goes yellow if the client clock is more than 60 seconds off.
	CLOCK	60
	# Yellow if the puppet state file was not modified within the last
	# 5400 seconds (90 minutes).
	FILE	/var/lib/puppet/state/state.yaml yellow mtime<5400
	# Require at least one socket listening on port 22.
	PORT    state=LISTEN "LOCAL=%([.:]22)$" MIN=1 TEXT=ssh
    # Red if more than 3 puppetd processes are found; zero is acceptable.
    PROC    puppetd 0 3 red
    # At least 1 crond must run; the upper limit of 10 is there just in case
    # something goes wrong.
    PROC    crond 1 10 red