#!/usr/bin/env python
# Copyright (c) 2010-2012 OpenStack, LLC.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import datetime
import glob
import os
import re
import subprocess
import sys
from ConfigParser import ConfigParser

from swift.common.utils import backward, get_logger


# To search for more types of errors, add the regex to the list below
error_re = [
    re.compile(r'\berror\b.*\b(sd[a-z]{1,2}\d?)\b'),
    re.compile(r'\b(sd[a-z]{1,2}\d?)\b.*\berror\b'),
]

# Lines that mark the start of a boot.  The kernel pads the seconds field
# of its log timestamps, so the whitespace after '[' is matched loosely
# instead of with a fixed number of spaces.
_BOOT_TIMESTAMP_RE = re.compile(r'\[\s*0\.000000\]')
_BOOT_MARKERS = (
    'KERNEL supported cpus:',
    'BIOS-provided physical RAM map:',
)


def _is_boot_line(line):
    """Return True if *line* looks like it was logged at kernel boot."""
    if _BOOT_TIMESTAMP_RE.search(line):
        return True
    return any(marker in line for marker in _BOOT_MARKERS)


def get_devices(device_dir, logger):
    """
    Collect the mounted devices whose mount point starts with device_dir.

    :param device_dir: mount point prefix to look for (e.g. /srv/node)
    :param logger: logger used to report block devices that cannot be
                   stat'ed
    :returns: list of dicts with the keys 'mount_point', 'block_device',
              'major' and 'minor' (both strings); 'kernel_device' is added
              when a matching major/minor pair is listed in
              /proc/partitions
    """
    devices = []
    with open('/proc/mounts') as mounts:
        for line in mounts:
            fields = line.split()
            if len(fields) < 2:
                # Not a "device mount_point ..." entry; nothing to record
                continue
            block_device, mount_point = fields[0], fields[1]
            if not mount_point.startswith(device_dir):
                continue
            try:
                device_num = os.stat(block_device).st_rdev
            except OSError:
                # If we can't stat the device, then something weird is
                # going on
                logger.error("Error: Could not stat %s!" % block_device)
                continue
            devices.append({
                'mount_point': mount_point,
                'block_device': block_device,
                'major': str(os.major(device_num)),
                'minor': str(os.minor(device_num)),
            })

    with open('/proc/partitions') as partitions:
        # The first two lines are the column header and a blank line
        partition_lines = partitions.readlines()[2:]
    for line in partition_lines:
        fields = line.split()
        if len(fields) != 4:
            continue
        major, minor, _blocks, kernel_device = fields
        # Only the first device with this major/minor pair is tagged
        for device in devices:
            if device['major'] == major and device['minor'] == minor:
                device['kernel_device'] = kernel_device
                break
    return devices


def get_errors(minutes, logger=None):
    """
    Count recent kernel log lines that mention an error on an sd* device.

    The kernel logs are read newest entry first.  Reading stops at the
    first entry that is older than the requested window, or at the first
    line that marks a boot.

    :param minutes: size of the time window, counted back from now
    :param logger: optional logger used to report a log file that cannot
                   be opened
    :returns: dict mapping kernel device name to the number of matches
    """
    # Assuming log rotation is being used, we need to examine
    # recently rotated files in case the rotation occurred
    # just before the script is being run - the data we are
    # looking for may have rotated.
    log_files = sorted(glob.glob('/var/log/kern.*[!.][!g][!z]'))

    now_time = datetime.datetime.now()
    oldest_allowed = now_time - datetime.timedelta(minutes=minutes)

    # kern.log does not contain the year so we need to keep
    # track of the year and month in case the year recently
    # ticked over.  The month is kept as an abbreviated name so that it
    # can be compared with the month token read from the log.
    year = now_time.year
    prev_entry_month = now_time.strftime('%b')

    errors = {}
    reached_old_logs = False
    for path in log_files:
        try:
            f = open(path)
        except IOError:
            if logger is not None:
                logger.error("Error: Unable to open " + path)
            print("Unable to open " + path)
            sys.exit(1)
        with f:
            for line in backward(f):
                if _is_boot_line(line):
                    # Ignore anything before the last boot
                    reached_old_logs = True
                    break

                log_time_entry = line.split()[:3]
                if not log_time_entry:
                    # Blank line, there is no timestamp to look at
                    continue

                # Solves the problem with year change - kern.log does not
                # keep track of the year.  We walk backwards in time, so
                # going from a January entry to a December one means we
                # crossed into the previous year.
                entry_month = log_time_entry[0]
                entry_year = year
                if entry_month == 'Dec' and prev_entry_month == 'Jan':
                    entry_year -= 1
                log_time_string = '%s %s' % (entry_year,
                                             ' '.join(log_time_entry))
                try:
                    log_time = datetime.datetime.strptime(
                        log_time_string, '%Y %b %d %H:%M:%S')
                except ValueError:
                    # Not a timestamped entry; leave the tracking state
                    # untouched
                    continue
                year = entry_year
                prev_entry_month = entry_month

                if log_time <= oldest_allowed:
                    reached_old_logs = True
                    break
                for err in error_re:
                    for device in err.findall(line):
                        errors[device] = errors.get(device, 0) + 1
        if reached_old_logs:
            break
    return errors


def comment_fstab(mount_point):
    """
    Comment out the /etc/fstab entries for the given mount point.

    The new content is written to /etc/fstab.new, which is then renamed
    over /etc/fstab.

    :param mount_point: mount point (second fstab field) to comment out
    """
    with open('/etc/fstab', 'r') as fstab:
        with open('/etc/fstab.new', 'w') as new_fstab:
            for line in fstab:
                parts = line.split()
                is_active_entry = (
                    len(parts) > 2 and
                    parts[1] == mount_point and
                    not parts[0].startswith('#'))
                if is_active_entry:
                    new_fstab.write('#' + line)
                else:
                    new_fstab.write(line)
    os.rename('/etc/fstab.new', '/etc/fstab')


def main():
    """
    Unmount devices that logged too many kernel errors.

    Reads the 'drive-audit' section of the config file named on the
    command line, then unmounts every audited device whose error count
    reaches error_limit and comments it out of /etc/fstab.
    """
    c = ConfigParser()
    try:
        conf_path = sys.argv[1]
    except IndexError:
        print("Usage: %s CONF_FILE" % sys.argv[0].split('/')[-1])
        sys.exit(1)
    if not c.read(conf_path):
        print("Unable to read config file %s" % conf_path)
        sys.exit(1)
    conf = dict(c.items('drive-audit'))
    device_dir = conf.get('device_dir', '/srv/node')
    minutes = int(conf.get('minutes', 60))
    error_limit = int(conf.get('error_limit', 1))
    conf['log_name'] = conf.get('log_name', 'drive-audit')
    logger = get_logger(conf, log_route='drive-audit')

    devices = get_devices(device_dir, logger)
    logger.debug("Devices found: %s" % str(devices))
    if not devices:
        logger.error("Error: No devices found!")

    errors = get_errors(minutes, logger)
    logger.debug("Errors found: %s" % str(errors))

    unmounts = 0
    for kernel_device, count in errors.items():
        if count < error_limit:
            continue
        # Devices that were not matched in /proc/partitions have no
        # 'kernel_device' key, hence the .get()
        matches = [d for d in devices
                   if d.get('kernel_device') == kernel_device]
        if not matches:
            continue
        mount_point = matches[0]['mount_point']
        if mount_point.startswith(device_dir):
            logger.info("Unmounting %s with %d errors" %
                        (mount_point, count))
            subprocess.call(['umount', '-fl', mount_point])
            logger.info("Commenting out %s from /etc/fstab" %
                        (mount_point))
            comment_fstab(mount_point)
            unmounts += 1
    if unmounts == 0:
        logger.info("No drives were unmounted")


if __name__ == '__main__':
    main()