Configuration options for error regex and log file in the config now

Making it possible for one to overwrite the default set of regexes
used to search for device block errors in the log file. Also making
the log file naming pattern configurable by setting them in the
drive-audit.conf file.

Updating "Detecting Failed Drives" section on the admin guide as well.

Change-Id: I7bd3acffed196da3e09db4c9dcbb48a20bdd1cf0
This commit is contained in:
Marcelo Martins 2013-06-28 14:51:15 -05:00
parent c6e53721e0
commit d2dd3e5488
3 changed files with 63 additions and 23 deletions

View File

@ -25,13 +25,6 @@ from ConfigParser import ConfigParser
from swift.common.utils import backward, get_logger
# To search for more types of errors, add the regex to the list below
error_re = [
re.compile(r'\berror\b.*\b(sd[a-z]{1,2}\d?)\b'),
re.compile(r'\b(sd[a-z]{1,2}\d?)\b.*\berror\b'),
]
def get_devices(device_dir, logger):
devices = []
for line in open('/proc/mounts').readlines():
@ -61,12 +54,17 @@ def get_devices(device_dir, logger):
return devices
def get_errors(minutes):
def get_errors(error_re, log_file_pattern, minutes):
# Assuming log rotation is being used, we need to examine
# recently rotated files in case the rotation occured
# just before the script is being run - the data we are
# looking for may have rotated.
log_files = [f for f in glob.glob('/var/log/kern.*[!.][!g][!z]')]
#
# The globbing used before would not work with all out-of-box
# distro setup for logrotate and syslog therefore moving this
# to the config where one can set it with the desired
# globbing pattern.
log_files = [f for f in glob.glob(log_file_pattern)]
log_files.sort()
now_time = datetime.datetime.now()
@ -143,13 +141,30 @@ if __name__ == '__main__':
device_dir = conf.get('device_dir', '/srv/node')
minutes = int(conf.get('minutes', 60))
error_limit = int(conf.get('error_limit', 1))
log_file_pattern = conf.get('log_file_pattern',
'/var/log/kern.*[!.][!g][!z]')
error_re = []
for conf_key in conf:
if conf_key.startswith('regex_pattern_'):
error_pattern = conf[conf_key]
try:
r = re.compile(error_pattern)
except re.error:
sys.exit('Error: unable to compile regex pattern "%s"' %
error_pattern)
error_re.append(r)
if not error_re:
error_re = [
re.compile(r'\berror\b.*\b(sd[a-z]{1,2}\d?)\b'),
re.compile(r'\b(sd[a-z]{1,2}\d?)\b.*\berror\b'),
]
conf['log_name'] = conf.get('log_name', 'drive-audit')
logger = get_logger(conf, log_route='drive-audit')
devices = get_devices(device_dir, logger)
logger.debug("Devices found: %s" % str(devices))
if not devices:
logger.error("Error: No devices found!")
errors = get_errors(minutes)
errors = get_errors(error_re, log_file_pattern, minutes)
logger.debug("Errors found: %s" % str(errors))
unmounts = 0
for kernel_device, count in errors.items():

View File

@ -156,20 +156,30 @@ settings:
[drive-audit]
================== ========== ===========================================
Option Default Description
------------------ ---------- -------------------------------------------
log_facility LOG_LOCAL0 Syslog log facility
log_level INFO Log level
device_dir /srv/node Directory devices are mounted under
minutes 60 Number of minutes to look back in
`/var/log/kern.log`
error_limit 1 Number of errors to find before a device
is unmounted
================== ========== ===========================================
================== ============== ===========================================
Option Default Description
------------------ -------------- -------------------------------------------
log_facility LOG_LOCAL0 Syslog log facility
log_level INFO Log level
device_dir /srv/node Directory devices are mounted under
minutes 60 Number of minutes to look back in
`/var/log/kern.log`
error_limit 1 Number of errors to find before a device
is unmounted
log_file_pattern /var/log/kern* Location of the log file with globbing
pattern to check against device errors
regex_pattern_X (see below) Regular expression patterns to be used to
locate device blocks with errors in the
log file
================== ============== ===========================================
This script has only been tested on Ubuntu 10.04, so if you are using a
different distro or OS, some care should be taken before using in production.
The default regex pattern used to locate device blocks with errors are
`\berror\b.*\b(sd[a-z]{1,2}\d?)\b` and `\b(sd[a-z]{1,2}\d?)\b.*\berror\b`.
One is able to overwrite the default above by providing new expressions
using the format `regex_pattern_X = regex_expression`, where `X` is a number.
This script has been tested on Ubuntu 10.04 and Ubuntu 12.04, so if you are
using a different distro or OS, some care should be taken before using in production.
--------------
Cluster Health

View File

@ -5,3 +5,18 @@
# log_address = /dev/log
# minutes = 60
# error_limit = 1
#
# Location of the log file with globbing
# pattern to check against device errors.
# log_file_pattern = /var/log/kern*
#
# Regular expression patterns to be used to locate
# device blocks with errors in the log file. Currently
# the default ones are as follows:
# \berror\b.*\b(sd[a-z]{1,2}\d?)\b
# \b(sd[a-z]{1,2}\d?)\b.*\berror\b
# One can overwrite the default ones by providing
# new expressions using the format below:
# Format: regex_pattern_X = regex_expression
# Example:
# regex_pattern_1 = \berror\b.*\b(dm-[0-9]{1,2}\d?)\b