swift/bin/swift-drive-audit

#!/usr/bin/env python
# Copyright (c) 2010-2012 OpenStack Foundation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import datetime
import glob
import os
import re
import subprocess
import sys


from six.moves.configparser import ConfigParser

from swift.common.utils import backward, get_logger, dump_recon_cache, \
    config_true_value


def get_devices(device_dir, logger):
    devices = []
    for line in open('/proc/mounts').readlines():
        data = line.strip().split()
        block_device = data[0]
        mount_point = data[1]
        if mount_point.startswith(device_dir):
            device = {}
            device['mount_point'] = mount_point
            device['block_device'] = block_device
            try:
                device_num = os.stat(block_device).st_rdev
            except OSError:
                # If we can't stat the device, then something weird is going on
                logger.error("Error: Could not stat %s!" %
                             block_device)
                continue
            device['major'] = str(os.major(device_num))
            device['minor'] = str(os.minor(device_num))
            devices.append(device)
    for line in open('/proc/partitions').readlines()[2:]:
        major, minor, blocks, kernel_device = line.strip().split()
        device = [d for d in devices
                  if d['major'] == major and d['minor'] == minor]
        if device:
            device[0]['kernel_device'] = kernel_device
    return devices


def get_errors(error_re, log_file_pattern, minutes, logger):
    # Assuming log rotation is being used, we need to examine
    # recently rotated files in case the rotation occurred
    # just before the script is being run - the data we are
    # looking for may have rotated.
    #
    # The globbing used before would not work with all out-of-box
    # distro setup for logrotate and syslog therefore moving this
    # to the config where one can set it with the desired
    # globbing pattern.
    log_files = [f for f in glob.glob(log_file_pattern)]
    try:
        log_files.sort(key=lambda f: os.stat(f).st_mtime, reverse=True)
    except (IOError, OSError) as exc:
        logger.error(exc)
        print(exc)
        sys.exit(1)

    now_time = datetime.datetime.now()
    end_time = now_time - datetime.timedelta(minutes=minutes)
    # kern.log does not contain the year so we need to keep
    # track of the year and month in case the year recently
    # ticked over
    year = now_time.year
    prev_entry_month = now_time.month
    errors = {}

    reached_old_logs = False
    for path in log_files:
        try:
            f = open(path)
        except IOError:
            logger.error("Error: Unable to open " + path)
            print("Unable to open " + path)
            sys.exit(1)
        for line in backward(f):
            if '[    0.000000]' in line \
                or 'KERNEL supported cpus:' in line \
                    or 'BIOS-provided physical RAM map:' in line:
                # Ignore anything before the last boot
                reached_old_logs = True
                break
            # Solves the problem with year change - kern.log does not
            # keep track of the year.
            log_time_entry = line.split()[:3]
            if log_time_entry[0] == 'Dec' and prev_entry_month == 'Jan':
                year -= 1
            prev_entry_month = log_time_entry[0]
            log_time_string = '%s %s' % (year, ' '.join(log_time_entry))
            try:
                log_time = datetime.datetime.strptime(
                    log_time_string, '%Y %b %d %H:%M:%S')
            except ValueError:
                continue
            if log_time > end_time:
                for err in error_re:
                    for device in err.findall(line):
                        errors[device] = errors.get(device, 0) + 1
            else:
                reached_old_logs = True
                break
        if reached_old_logs:
            break
    return errors


def comment_fstab(mount_point):
    with open('/etc/fstab', 'r') as fstab:
        with open('/etc/fstab.new', 'w') as new_fstab:
            for line in fstab:
                parts = line.split()
                if len(parts) > 2 \
                    and parts[1] == mount_point \
                        and not line.startswith('#'):
                    new_fstab.write('#' + line)
                else:
                    new_fstab.write(line)
    os.rename('/etc/fstab.new', '/etc/fstab')


if __name__ == '__main__':
    c = ConfigParser()
    try:
        conf_path = sys.argv[1]
    except Exception:
        print("Usage: %s CONF_FILE" % sys.argv[0].split('/')[-1])
        sys.exit(1)
    if not c.read(conf_path):
        print("Unable to read config file %s" % conf_path)
        sys.exit(1)
    conf = dict(c.items('drive-audit'))
    device_dir = conf.get('device_dir', '/srv/node')
    minutes = int(conf.get('minutes', 60))
    error_limit = int(conf.get('error_limit', 1))
    recon_cache_path = conf.get('recon_cache_path', "/var/cache/swift")
    log_file_pattern = conf.get('log_file_pattern',
                                '/var/log/kern.*[!.][!g][!z]')
    log_to_console = config_true_value(conf.get('log_to_console', False))
    error_re = []
    for conf_key in conf:
        if conf_key.startswith('regex_pattern_'):
            error_pattern = conf[conf_key]
            try:
                r = re.compile(error_pattern)
            except re.error:
                sys.exit('Error: unable to compile regex pattern "%s"' %
                         error_pattern)
            error_re.append(r)
    if not error_re:
        error_re = [
            re.compile(r'\berror\b.*\b(sd[a-z]{1,2}\d?)\b'),
            re.compile(r'\b(sd[a-z]{1,2}\d?)\b.*\berror\b'),
        ]
    conf['log_name'] = conf.get('log_name', 'drive-audit')
    logger = get_logger(conf, log_to_console=log_to_console,
                        log_route='drive-audit')
    devices = get_devices(device_dir, logger)
    logger.debug("Devices found: %s" % str(devices))
    if not devices:
        logger.error("Error: No devices found!")
    recon_errors = {}
    total_errors = 0
    for device in devices:
        recon_errors[device['mount_point']] = 0
    errors = get_errors(error_re, log_file_pattern, minutes, logger)
    logger.debug("Errors found: %s" % str(errors))
    unmounts = 0
    for kernel_device, count in errors.items():
        if count >= error_limit:
            device = \
                [d for d in devices if d['kernel_device'] == kernel_device]
            if device:
                mount_point = device[0]['mount_point']
                if mount_point.startswith(device_dir):
                    if config_true_value(conf.get('unmount_failed_device',
                                                  True)):
                        logger.info("Unmounting %s with %d errors" %
                                    (mount_point, count))
                        subprocess.call(['umount', '-fl', mount_point])
                        logger.info("Commenting out %s from /etc/fstab" %
                                    (mount_point))
                        comment_fstab(mount_point)
                        unmounts += 1
                    else:
                        logger.info("Detected %s with %d errors "
                                    "(Device not unmounted)" %
                                    (mount_point, count))
                    recon_errors[mount_point] = count
                    total_errors += count
    recon_file = recon_cache_path + "/drive.recon"
    dump_recon_cache(recon_errors, recon_file, logger)
    dump_recon_cache({'drive_audit_errors': total_errors}, recon_file, logger,
                     set_owner=conf.get("user", "swift"))

    if unmounts == 0:
        logger.info("No drives were unmounted")