a88b412e17
I've been doing this with cluster-wide log searches for far too long. This adds support for reporting the oldest replication pass completion as well as the most recent. This is quite useful for finding those odd replicators that have hung up for some reason and need intervention. Change-Id: I7fd7260eca162d6b085f3e82aaa3cf90670f2d53
841 lines
34 KiB
Python
Executable File
841 lines
34 KiB
Python
Executable File
#! /usr/bin/env python
|
|
"""
|
|
cmdline utility to perform cluster reconnaissance
|
|
"""
|
|
|
|
|
|
from eventlet.green import urllib2
|
|
from swift.common.ring import Ring
|
|
from urlparse import urlparse
|
|
try:
|
|
import simplejson as json
|
|
except ImportError:
|
|
import json
|
|
from hashlib import md5
|
|
import eventlet
|
|
import optparse
|
|
import time
|
|
import sys
|
|
import os
|
|
|
|
|
|
def seconds2timeunit(seconds):
    """
    Scale a duration given in seconds up to the largest unit that fits.

    The value is promoted from seconds to minutes, hours and finally days,
    stopping at the first unit whose threshold is not reached.

    :param seconds: duration in seconds
    :returns: tuple of (scaled value, unit name); the unit name is one of
              'seconds', 'minutes', 'hours' or 'days'
    """
    value, unit = seconds, 'seconds'
    # Each step: (how many of the current unit make one of the next, name).
    steps = ((60.0, 'minutes'), (60.0, 'hours'), (24.0, 'days'))
    for factor, bigger_unit in steps:
        if not value >= factor:
            break
        value, unit = value / factor, bigger_unit
    return value, unit
|
|
|
|
|
|
class Scout(object):
    """
    Fetch swift recon telemetry from a single host over HTTP.
    """

    def __init__(self, recon_type, verbose=False, suppress_errors=False,
                 timeout=5):
        # recon_type is the path component appended to the recon base url.
        self.recon_type = recon_type
        self.verbose = verbose
        self.suppress_errors = suppress_errors
        # timeout is handed to urlopen() for every request.
        self.timeout = timeout

    def scout_host(self, base_url, recon_type):
        """
        Perform the actual HTTP request to obtain swift recon telemetry.

        :param base_url: the base url of the host you wish to check. str of
                         the format 'http://127.0.0.1:6000/recon/'
        :param recon_type: the swift recon check to request.
        :returns: tuple of (recon url used, response body, and status).
                  On success the body is the decoded JSON and status is 200;
                  on an HTTP error the body is the exception and status is
                  its code; on any other URL error the body is the exception
                  and status is -1.
        """
        url = base_url + recon_type
        # Errors are shown unless suppressed; verbose mode always shows them.
        report_errors = self.verbose or not self.suppress_errors
        try:
            raw_body = urllib2.urlopen(url, timeout=self.timeout).read()
            content = json.loads(raw_body)
            if self.verbose:
                print("-> %s: %s" % (url, content))
            status = 200
        except urllib2.HTTPError as err:
            if report_errors:
                print("-> %s: %s" % (url, err))
            content, status = err, err.code
        except urllib2.URLError as err:
            if report_errors:
                print("-> %s: %s" % (url, err))
            content, status = err, -1
        return url, content, status

    def scout(self, host):
        """
        Obtain telemetry from a host running the swift recon middleware.

        :param host: host to check, indexable as (ip, port)
        :returns: tuple of (recon url used, response body, and status)
        """
        recon_base = "http://%s:%s/recon/" % (host[0], host[1])
        url, content, status = self.scout_host(recon_base, self.recon_type)
        return url, content, status
|
|
|
|
|
|
class SwiftRecon(object):
|
|
"""
|
|
Retrieve and report cluster info from hosts running recon middleware.
|
|
"""
|
|
|
|
def __init__(self):
    # Output/connection settings; main() overwrites these from the
    # command line options.
    self.timeout = 5
    self.suppress_errors = False
    self.verbose = False
    # Server types accepted as the positional argument, and the default.
    self.check_types = ['account', 'container', 'object']
    self.server_type = 'object'
    # Green thread pool used to query hosts concurrently.
    self.pool_size = 30
    self.pool = eventlet.GreenPool(self.pool_size)
|
|
|
|
def _gen_stats(self, stats, name=None):
|
|
""" compute various stats from a list of values """
|
|
cstats = [x for x in stats if x is not None]
|
|
if len(cstats) > 0:
|
|
ret_dict = {'low': min(cstats), 'high': max(cstats),
|
|
'total': sum(cstats), 'reported': len(cstats),
|
|
'number_none': len(stats) - len(cstats), 'name': name}
|
|
ret_dict['average'] = \
|
|
ret_dict['total'] / float(len(cstats))
|
|
ret_dict['perc_none'] = \
|
|
ret_dict['number_none'] * 100.0 / len(stats)
|
|
else:
|
|
ret_dict = {'reported': 0}
|
|
return ret_dict
|
|
|
|
def _print_stats(self, stats):
|
|
"""
|
|
print out formatted stats to console
|
|
|
|
:param stats: dict of stats generated by _gen_stats
|
|
"""
|
|
print '[%(name)s] low: %(low)d, high: %(high)d, avg: ' \
|
|
'%(average).1f, total: %(total)d, ' \
|
|
'Failed: %(perc_none).1f%%, no_result: %(number_none)d, ' \
|
|
'reported: %(reported)d' % stats
|
|
|
|
def _ptime(self, timev=None):
|
|
"""
|
|
:param timev: a unix timestamp or None
|
|
:returns: a pretty string of the current time or provided time
|
|
"""
|
|
if timev:
|
|
return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(timev))
|
|
else:
|
|
return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
|
|
|
|
def get_devices(self, zone_filter, swift_dir, ring_name):
    """
    Get a list of hosts in the ring

    :param zone_filter: Only list zones matching given filter; None means
                        no filtering
    :param swift_dir: Directory of swift config, usually /etc/swift
    :param ring_name: Name of the ring, such as 'object'
    :returns: a set of tuples containing the ip and port of hosts
    """
    ring_data = Ring(swift_dir, ring_name=ring_name)
    # ring_data.devs may contain empty (falsy) slots; skip those.
    devs = (dev for dev in ring_data.devs if dev)
    # Compare against None explicitly: zone 0 is a legitimate zone id and
    # a plain truthiness test would silently disable the filter for it.
    if zone_filter is not None:
        devs = (dev for dev in devs if dev['zone'] == zone_filter)
    return set((dev['ip'], dev['port']) for dev in devs)
|
|
|
|
def get_ringmd5(self, hosts, ringfile):
    """
    Compare the md5sum of the local ring file with the one each remote
    host reports, and print a summary.

    :param hosts: set of hosts to check. in the format of:
        set([('127.0.0.1', 6020), ('127.0.0.2', 6030)])
    :param ringfile: The local ring file to compare the md5sum with. The
                     same path is used as the key into each host's response.
    """
    matches = 0
    errors = 0
    # Hash the local ring in 4 KiB blocks.
    md5sum = md5()
    with open(ringfile, 'rb') as ring_fp:
        for block in iter(lambda: ring_fp.read(4096), b''):
            md5sum.update(block)
    ring_sum = md5sum.hexdigest()
    recon = Scout("ringmd5", self.verbose, self.suppress_errors,
                  self.timeout)
    print("[%s] Checking ring md5sums" % self._ptime())
    if self.verbose:
        print("-> On disk %s md5sum: %s" % (ringfile, ring_sum))
    for url, response, status in self.pool.imap(recon.scout, hosts):
        if status != 200:
            errors += 1
            continue
        remote_sum = response[ringfile]
        if remote_sum != ring_sum:
            print("!! %s (%s) doesn't match on disk md5sum" %
                  (url, remote_sum))
            continue
        matches += 1
        if self.verbose:
            print("-> %s matches." % url)
    print("%s/%s hosts matched, %s error[s] while checking hosts."
          % (matches, len(hosts), errors))
    print("=" * 79)
|
|
|
|
def async_check(self, hosts):
    """
    Query every host for its async pending count and print summary stats.

    :param hosts: set of hosts to check. in the format of:
        set([('127.0.0.1', 6020), ('127.0.0.2', 6030)])
    """
    recon = Scout("async", self.verbose, self.suppress_errors,
                  self.timeout)
    print("[%s] Checking async pendings" % self._ptime())
    # Only hosts that answered with status 200 contribute a value.
    pending_by_url = dict(
        (url, response['async_pending'])
        for url, response, status in self.pool.imap(recon.scout, hosts)
        if status == 200)
    summary = self._gen_stats(pending_by_url.values(), 'async_pending')
    if summary['reported'] > 0:
        self._print_stats(summary)
    else:
        print("[async_pending] - No hosts returned valid data.")
    print("=" * 79)
|
|
|
|
def umount_check(self, hosts):
    """
    Query every host for unmounted drives and print one line per drive.

    :param hosts: set of hosts to check. in the format of:
        set([('127.0.0.1', 6020), ('127.0.0.2', 6030)])
    """
    recon = Scout("unmounted", self.verbose, self.suppress_errors,
                  self.timeout)
    print("[%s] Getting unmounted drives from %s hosts..." %
          (self._ptime(), len(hosts)))
    # Map each responding url to the 'device' of every entry it returned.
    unmounted = {}
    for url, response, status in self.pool.imap(recon.scout, hosts):
        if status == 200:
            unmounted[url] = [entry['device'] for entry in response]
    for url in unmounted:
        node = urlparse(url).netloc
        for device in unmounted[url]:
            print("Not mounted: %s on %s" % (device, node))
    print("=" * 79)
|
|
|
|
def expirer_check(self, hosts):
    """
    Query every host for expirer data and print summary stats for the
    'object_expiration_pass' and 'expired_last_pass' values.

    :param hosts: set of hosts to check. in the format of:
        set([('127.0.0.1', 6020), ('127.0.0.2', 6030)])
    """
    stats = {'object_expiration_pass': [], 'expired_last_pass': []}
    recon = Scout("expirer/%s" % self.server_type, self.verbose,
                  self.suppress_errors, self.timeout)
    print("[%s] Checking on expirers" % self._ptime())
    for url, response, status in self.pool.imap(recon.scout, hosts):
        if status != 200:
            continue
        # A missing key is recorded as None and later counted as no_result.
        for field in ('object_expiration_pass', 'expired_last_pass'):
            stats[field].append(response.get(field))
    for key in stats:
        summary = None
        if stats[key]:
            summary = self._gen_stats(stats[key], name=key)
        if summary is not None and summary['reported'] > 0:
            self._print_stats(summary)
        else:
            print("[%s] - No hosts returned valid data." % key)
    print("=" * 79)
|
|
|
|
def replication_check(self, hosts):
    """
    Obtain and print replication statistics

    Prints summary stats for replication time and for the attempted /
    failure / success counters, followed by the oldest and the most recent
    replication pass completion and the host that reported each.

    :param hosts: set of hosts to check. in the format of:
        set([('127.0.0.1', 6020), ('127.0.0.2', 6030)])
    """
    stats = {'replication_time': [], 'failure': [], 'success': [],
             'attempted': []}
    recon = Scout("replication/%s" % self.server_type, self.verbose,
                  self.suppress_errors, self.timeout)
    print("[%s] Checking on replication" % self._ptime())
    # Sentinels: any real timestamp is below the first and above the second.
    least_recent_time = 9999999999
    least_recent_url = None
    most_recent_time = 0
    most_recent_url = None
    for url, response, status in self.pool.imap(recon.scout, hosts):
        if status == 200:
            stats['replication_time'].append(
                response.get('replication_time'))
            repl_stats = response['replication_stats']
            if repl_stats:
                for stat_key in ['attempted', 'failure', 'success']:
                    stats[stat_key].append(repl_stats.get(stat_key))
            # A missing or null value means "never completed"; normalise
            # it to 0 so the comparisons below never involve None.
            last = response.get('replication_last') or 0
            if last < least_recent_time:
                least_recent_time = last
                least_recent_url = url
            if last > most_recent_time:
                most_recent_time = last
                most_recent_url = url
    for k in stats:
        if stats[k]:
            if k != 'replication_time':
                computed = self._gen_stats(stats[k],
                                           name='replication_%s' % k)
            else:
                computed = self._gen_stats(stats[k], name=k)
            if computed['reported'] > 0:
                self._print_stats(computed)
            else:
                print("[%s] - No hosts returned valid data." % k)
        else:
            print("[%s] - No hosts returned valid data." % k)
    if least_recent_url is not None:
        # Report the host that actually holds the oldest completion, not
        # whichever url the scan loop happened to yield last.
        host = urlparse(least_recent_url).netloc
        if not least_recent_time:
            print('Oldest completion was NEVER by %s.' % host)
        else:
            elapsed = time.time() - least_recent_time
            elapsed, elapsed_unit = seconds2timeunit(elapsed)
            print('Oldest completion was %s (%d %s ago) by %s.' % (
                time.strftime('%Y-%m-%d %H:%M:%S',
                              time.gmtime(least_recent_time)),
                elapsed, elapsed_unit, host))
    if most_recent_url is not None:
        host = urlparse(most_recent_url).netloc
        elapsed = time.time() - most_recent_time
        elapsed, elapsed_unit = seconds2timeunit(elapsed)
        print('Most recent completion was %s (%d %s ago) by %s.' % (
            time.strftime('%Y-%m-%d %H:%M:%S',
                          time.gmtime(most_recent_time)),
            elapsed, elapsed_unit, host))
    print("=" * 79)
|
|
|
|
def object_replication_check(self, hosts):
    """
    Obtain and print replication statistics from object servers

    Prints summary stats for the reported replication times, followed by
    the oldest and the most recent replication pass completion and the host
    that reported each.

    :param hosts: set of hosts to check. in the format of:
        set([('127.0.0.1', 6020), ('127.0.0.2', 6030)])
    """
    stats = {}
    recon = Scout("replication", self.verbose, self.suppress_errors,
                  self.timeout)
    print("[%s] Checking on replication" % self._ptime())
    # Sentinels: any real timestamp is below the first and above the second.
    least_recent_time = 9999999999
    least_recent_url = None
    most_recent_time = 0
    most_recent_url = None
    for url, response, status in self.pool.imap(recon.scout, hosts):
        if status == 200:
            stats[url] = response['object_replication_time']
            # A missing or null value means "never completed"; normalise
            # it to 0 so the comparisons below never involve None.
            last = response.get('object_replication_last') or 0
            if last < least_recent_time:
                least_recent_time = last
                least_recent_url = url
            if last > most_recent_time:
                most_recent_time = last
                most_recent_url = url
    times = [x for x in stats.values() if x is not None]
    if len(stats) > 0 and len(times) > 0:
        computed = self._gen_stats(times, 'replication_time')
        if computed['reported'] > 0:
            self._print_stats(computed)
        else:
            print("[replication_time] - No hosts returned valid data.")
    else:
        print("[replication_time] - No hosts returned valid data.")
    if least_recent_url is not None:
        # Report the host that actually holds the oldest completion, not
        # whichever url the scan loop happened to yield last.
        host = urlparse(least_recent_url).netloc
        if not least_recent_time:
            print('Oldest completion was NEVER by %s.' % host)
        else:
            elapsed = time.time() - least_recent_time
            elapsed, elapsed_unit = seconds2timeunit(elapsed)
            print('Oldest completion was %s (%d %s ago) by %s.' % (
                time.strftime('%Y-%m-%d %H:%M:%S',
                              time.gmtime(least_recent_time)),
                elapsed, elapsed_unit, host))
    if most_recent_url is not None:
        host = urlparse(most_recent_url).netloc
        elapsed = time.time() - most_recent_time
        elapsed, elapsed_unit = seconds2timeunit(elapsed)
        print('Most recent completion was %s (%d %s ago) by %s.' % (
            time.strftime('%Y-%m-%d %H:%M:%S',
                          time.gmtime(most_recent_time)),
            elapsed, elapsed_unit, host))
    print("=" * 79)
|
|
|
|
def updater_check(self, hosts):
    """
    Query every host for its updater sweep value and print summary stats.

    :param hosts: set of hosts to check. in the format of:
        set([('127.0.0.1', 6020), ('127.0.0.2', 6030)])
    """
    sweeps = []
    recon = Scout("updater/%s" % self.server_type, self.verbose,
                  self.suppress_errors, self.timeout)
    print("[%s] Checking updater times" % self._ptime())
    for url, response, status in self.pool.imap(recon.scout, hosts):
        if status != 200:
            continue
        # Falsy sweep values (None, 0) are not recorded at all.
        sweep = response['%s_updater_sweep' % self.server_type]
        if sweep:
            sweeps.append(sweep)
    summary = None
    if sweeps:
        summary = self._gen_stats(sweeps, name='updater_last_sweep')
    if summary is not None and summary['reported'] > 0:
        self._print_stats(summary)
    else:
        print("[updater_last_sweep] - No hosts returned valid data.")
    print("=" * 79)
|
|
|
|
def auditor_check(self, hosts):
    """
    Query every host for auditor data of the current server type and print
    summary stats plus the oldest/newest/average last-pass time.

    :param hosts: set of hosts to check. in the format of:
        set([('127.0.0.1', 6020), ('127.0.0.2', 6030)])
    """
    scan = {}
    adone = '%s_auditor_pass_completed' % self.server_type
    afail = '%s_audits_failed' % self.server_type
    apass = '%s_audits_passed' % self.server_type
    asince = '%s_audits_since' % self.server_type
    recon = Scout("auditor/%s" % self.server_type, self.verbose,
                  self.suppress_errors, self.timeout)
    print("[%s] Checking auditor stats" % self._ptime())
    for url, response, status in self.pool.imap(recon.scout, hosts):
        if status == 200:
            scan[url] = response
    if not scan:
        print("Error: No hosts available")
        return
    # Collect the non-None values of each field across all hosts.
    stats = {}
    for field in (adone, afail, apass, asince):
        stats[field] = [scan[host][field] for host in scan
                        if scan[host][field] is not None]
    for field in stats:
        if len(stats[field]) < 1:
            print("[%s] - No hosts returned valid data." % field)
        elif field != asince:
            # The "since" values are timestamps; they get their own
            # report line below instead of numeric stats.
            summary = self._gen_stats(stats[field], field)
            if summary['reported'] > 0:
                self._print_stats(summary)
    since_values = stats[asince]
    if since_values:
        oldest = min(since_values)
        newest = max(since_values)
        average = sum(since_values) / len(since_values)
        print('[last_pass] oldest: %s, newest: %s, avg: %s' %
              (self._ptime(oldest), self._ptime(newest),
               self._ptime(average)))
    print("=" * 79)
|
|
|
|
def object_auditor_check(self, hosts):
    """
    Query every host for object auditor data and print summary stats for
    the 'object_auditor_stats_ALL' and 'object_auditor_stats_ZBF' results.

    :param hosts: set of hosts to check. in the format of:
        set([('127.0.0.1', 6020), ('127.0.0.2', 6030)])
    """
    all_scan = {}
    zbf_scan = {}
    recon = Scout("auditor/object", self.verbose, self.suppress_errors,
                  self.timeout)
    print("[%s] Checking auditor stats " % self._ptime())
    for url, response, status in self.pool.imap(recon.scout, hosts):
        if status != 200:
            continue
        # Hosts with an empty/None section are left out of that scan.
        if response['object_auditor_stats_ALL']:
            all_scan[url] = response['object_auditor_stats_ALL']
        if response['object_auditor_stats_ZBF']:
            zbf_scan[url] = response['object_auditor_stats_ZBF']
    # Each report: (scan data, fields to summarise, stat name template,
    # message printed when nothing usable was reported).
    reports = (
        (all_scan,
         ('audit_time', 'bytes_processed', 'passes', 'errors',
          'quarantined'),
         'ALL_%s_last_path',
         "[ALL_auditor] - No hosts returned valid data."),
        (zbf_scan,
         ('audit_time', 'bytes_processed', 'errors', 'quarantined'),
         'ZBF_%s_last_path',
         "[ZBF_auditor] - No hosts returned valid data."),
    )
    for scan, fields, name_template, empty_message in reports:
        if len(scan) == 0:
            print(empty_message)
            continue
        stats = {}
        for field in fields:
            stats[field] = [scan[host][field] for host in scan]
        for k in stats:
            values = [x for x in stats[k] if x is not None]
            if len(values) < 1:
                print("[Auditor %s] - No hosts returned valid data." % k)
                continue
            computed = self._gen_stats(values, name=name_template % k)
            if computed['reported'] > 0:
                self._print_stats(computed)
            else:
                print(empty_message)
    print("=" * 79)
|
|
|
|
def load_check(self, hosts):
    """
    Query every host for its load averages and print summary stats for the
    1, 5 and 15 minute values.

    :param hosts: set of hosts to check. in the format of:
        set([('127.0.0.1', 6020), ('127.0.0.2', 6030)])
    """
    # One url -> value mapping per load average window.
    stats = {"1m": {}, "5m": {}, "15m": {}}
    recon = Scout("load", self.verbose, self.suppress_errors,
                  self.timeout)
    print("[%s] Checking load averages" % self._ptime())
    for url, response, status in self.pool.imap(recon.scout, hosts):
        if status != 200:
            continue
        for window in ("1m", "5m", "15m"):
            stats[window][url] = response[window]
    for item in stats:
        if len(stats[item]) == 0:
            print("[%s_load_avg] - No hosts returned valid data." % item)
            continue
        summary = self._gen_stats(stats[item].values(),
                                  name='%s_load_avg' % item)
        self._print_stats(summary)
    print("=" * 79)
|
|
|
|
def quarantine_check(self, hosts):
    """
    Query every host for its quarantine counts and print summary stats for
    objects, containers and accounts.

    :param hosts: set of hosts to check. in the format of:
        set([('127.0.0.1', 6020), ('127.0.0.2', 6030)])
    """
    # One url -> count mapping per quarantined item type.
    stats = {"objects": {}, "containers": {}, "accounts": {}}
    recon = Scout("quarantined", self.verbose, self.suppress_errors,
                  self.timeout)
    print("[%s] Checking quarantine" % self._ptime())
    for url, response, status in self.pool.imap(recon.scout, hosts):
        if status != 200:
            continue
        for kind in ("objects", "containers", "accounts"):
            stats[kind][url] = response[kind]
    for item in stats:
        if len(stats[item]) == 0:
            print("No hosts returned valid data.")
            continue
        summary = self._gen_stats(stats[item].values(),
                                  name='quarantined_%s' % item)
        self._print_stats(summary)
    print("=" * 79)
|
|
|
|
def socket_usage(self, hosts):
    """
    Query every host for its sockstat data and print summary stats for
    each reported counter.

    :param hosts: set of hosts to check. in the format of:
        set([('127.0.0.1', 6020), ('127.0.0.2', 6030)])
    """
    # One url -> value mapping per sockstat counter.
    stats = {"tcp_in_use": {}, "tcp_mem_allocated_bytes": {},
             "tcp6_in_use": {}, "time_wait": {},
             "orphan": {}}
    recon = Scout("sockstat", self.verbose, self.suppress_errors,
                  self.timeout)
    print("[%s] Checking socket usage" % self._ptime())
    counters = ("tcp_in_use", "tcp_mem_allocated_bytes", "tcp6_in_use",
                "time_wait", "orphan")
    for url, response, status in self.pool.imap(recon.scout, hosts):
        if status != 200:
            continue
        for counter in counters:
            stats[counter][url] = response[counter]
    for item in stats:
        if len(stats[item]) == 0:
            print("No hosts returned valid data.")
            continue
        summary = self._gen_stats(stats[item].values(), item)
        self._print_stats(summary)
    print("=" * 79)
|
|
|
|
def disk_usage(self, hosts, top=0):
    """
    Query every host for disk usage, then print a distribution graph, the
    overall totals and optionally the fullest devices.

    :param hosts: set of hosts to check. in the format of:
        set([('127.0.0.1', 6020), ('127.0.0.2', 6030)])
    :param top: number of fullest devices to list after the summary;
                0 disables that listing
    """
    usage_by_url = {}
    host_highs = []
    host_lows = []
    raw_total_used = []
    raw_total_avail = []
    percents = {}
    # Fixed-size leaderboard of (ident, percent used), seeded with
    # placeholder entries that any real device displaces.
    top_percents = [(None, 0)] * top
    recon = Scout("diskusage", self.verbose, self.suppress_errors,
                  self.timeout)
    print("[%s] Checking disk usage now" % self._ptime())
    for url, response, status in self.pool.imap(recon.scout, hosts):
        if status != 200:
            continue
        hostusage = []
        for entry in response:
            if not entry['mounted']:
                continue
            used = float(entry['used']) / float(entry['size']) * 100.0
            raw_total_used.append(entry['used'])
            raw_total_avail.append(entry['avail'])
            hostusage.append(round(used, 2))
            # Insert into the leaderboard when this device beats any
            # current entry, then drop the smallest to keep its size.
            if any(oused < used for _ident, oused in top_percents):
                top_percents.append((url + ' ' + entry['device'], used))
                top_percents.sort(key=lambda x: -x[1])
                top_percents.pop()
        usage_by_url[url] = hostusage

    for url in usage_by_url:
        host_percents = usage_by_url[url]
        if len(host_percents) == 0:
            print("-> %s: Error. No drive info available." % url)
            continue
        # get per host hi/los for another day
        host_highs.append(max(host_percents))
        host_lows.append(min(host_percents))
        # Bucket every device by its integer percentage.
        for percent in host_percents:
            bucket = int(percent)
            percents[bucket] = percents.get(bucket, 0) + 1

    if len(host_lows) > 0:
        low = min(host_lows)
        high = max(host_highs)
        # dist graph shamelessly stolen from https://github.com/gholt/tcod
        print("Distribution Graph:")
        # Scale so the fullest bucket is drawn 69 stars wide.
        mul = 69.0 / max(percents.values())
        for percent in sorted(percents):
            count = percents[percent]
            print('% 3d%%%5d %s' % (percent, count, '*' * int(count * mul)))
        raw_used = sum(raw_total_used)
        raw_avail = sum(raw_total_avail)
        raw_total = raw_used + raw_avail
        avg_used = 100.0 * raw_used / raw_total
        print("Disk usage: space used: %s of %s" % (raw_used, raw_total))
        print("Disk usage: space free: %s of %s" % (raw_avail, raw_total))
        print("Disk usage: lowest: %s%%, highest: %s%%, avg: %s%%" %
              (low, high, avg_used))
    else:
        print("No hosts returned valid data.")
    print("=" * 79)
    if top_percents:
        print('TOP %s' % top)
        for ident, used in top_percents:
            if not ident:
                # Placeholder that was never displaced by a real device.
                continue
            url, device = ident.split()
            host = urlparse(url).netloc.split(':')[0]
            print('%.02f%% %s' % (used, '%-15s %s' % (host, device)))
|
|
|
|
def main(self):
    """
    Retrieve and report cluster info from hosts running recon middleware.

    Parses the command line, loads the host list for the selected server
    type from its ring, and runs the requested checks. Prints help and
    exits with status 0 when called without arguments or with more than
    one positional argument; exits with status 1 on an invalid server type.
    """
    print("=" * 79)
    usage = '''
usage: %prog <server_type> [-v] [--suppress] [-a] [-r] [-u] [-d]
[-l] [--md5] [--auditor] [--updater] [--expirer] [--sockstat]

<server_type>\taccount|container|object
Defaults to object server.

ex: %prog container -l --auditor
'''
    args = optparse.OptionParser(usage)
    args.add_option('--verbose', '-v', action="store_true",
                    help="Print verbose info")
    args.add_option('--suppress', action="store_true",
                    help="Suppress most connection related errors")
    args.add_option('--async', '-a', action="store_true",
                    help="Get async stats")
    args.add_option('--replication', '-r', action="store_true",
                    help="Get replication stats")
    args.add_option('--auditor', action="store_true",
                    help="Get auditor stats")
    args.add_option('--updater', action="store_true",
                    help="Get updater stats")
    args.add_option('--expirer', action="store_true",
                    help="Get expirer stats")
    args.add_option('--unmounted', '-u', action="store_true",
                    help="Check cluster for unmounted devices")
    args.add_option('--diskusage', '-d', action="store_true",
                    help="Get disk usage stats")
    args.add_option('--loadstats', '-l', action="store_true",
                    help="Get cluster load average stats")
    args.add_option('--quarantined', '-q', action="store_true",
                    help="Get cluster quarantine stats")
    args.add_option('--md5', action="store_true",
                    help="Get md5sum of servers ring and compare to "
                         "local copy")
    args.add_option('--sockstat', action="store_true",
                    help="Get cluster socket usage stats")
    args.add_option('--top', type='int', metavar='COUNT', default=0,
                    help='Also show the top COUNT entries in rank order.')
    args.add_option('--all', action="store_true",
                    help="Perform all checks. Equal to -arudlq --md5 "
                         "--sockstat")
    args.add_option('--zone', '-z', type="int",
                    help="Only query servers in specified zone")
    args.add_option('--timeout', '-t', type="int", metavar="SECONDS",
                    help="Time to wait for a response from a server",
                    default=5)
    args.add_option('--swiftdir', default="/etc/swift",
                    help="Default = /etc/swift")
    options, arguments = args.parse_args()

    if len(sys.argv) <= 1 or len(arguments) > 1:
        args.print_help()
        sys.exit(0)

    if arguments:
        if arguments[0] in self.check_types:
            self.server_type = arguments[0]
        else:
            print("Invalid Server Type")
            args.print_help()
            sys.exit(1)
    else:
        self.server_type = 'object'

    swift_dir = options.swiftdir
    ring_file = os.path.join(swift_dir, '%s.ring.gz' % self.server_type)
    self.verbose = options.verbose
    self.suppress_errors = options.suppress
    self.timeout = options.timeout

    # --zone has no default, so options.zone is None unless it was given.
    # Pass it through unchanged so that "--zone 0" is not collapsed into
    # "no zone filter" at this call site.
    hosts = self.get_devices(options.zone, swift_dir, self.server_type)

    print("--> Starting reconnaissance on %s hosts" % len(hosts))
    print("=" * 79)

    if options.all:
        if self.server_type == 'object':
            self.async_check(hosts)
            self.object_replication_check(hosts)
            self.object_auditor_check(hosts)
            self.updater_check(hosts)
            self.expirer_check(hosts)
        elif self.server_type == 'container':
            self.replication_check(hosts)
            self.auditor_check(hosts)
            self.updater_check(hosts)
        elif self.server_type == 'account':
            self.replication_check(hosts)
            self.auditor_check(hosts)
        self.umount_check(hosts)
        self.load_check(hosts)
        # --all implies -d, so honour --top here exactly as -d does.
        self.disk_usage(hosts, options.top)
        self.get_ringmd5(hosts, ring_file)
        self.quarantine_check(hosts)
        self.socket_usage(hosts)
    else:
        # "async" is a reserved word on newer Python versions, so the
        # option value is read with getattr instead of attribute syntax.
        if getattr(options, 'async'):
            if self.server_type == 'object':
                self.async_check(hosts)
            else:
                print("Error: Can't check async's on non object servers.")
        if options.unmounted:
            self.umount_check(hosts)
        if options.replication:
            if self.server_type == 'object':
                self.object_replication_check(hosts)
            else:
                self.replication_check(hosts)
        if options.auditor:
            if self.server_type == 'object':
                self.object_auditor_check(hosts)
            else:
                self.auditor_check(hosts)
        if options.updater:
            if self.server_type == 'account':
                print("Error: Can't check updaters on account servers.")
            else:
                self.updater_check(hosts)
        if options.expirer:
            if self.server_type == 'object':
                self.expirer_check(hosts)
            else:
                print("Error: Can't check expired on non object servers.")
        if options.loadstats:
            self.load_check(hosts)
        if options.diskusage:
            self.disk_usage(hosts, options.top)
        if options.md5:
            self.get_ringmd5(hosts, ring_file)
        if options.quarantined:
            self.quarantine_check(hosts)
        if options.sockstat:
            self.socket_usage(hosts)
|
|
|
|
|
|
# Script entry point: run the recon tool; on Ctrl-C print a newline
# instead of a traceback.
if __name__ == '__main__':
    try:
        reconnoiter = SwiftRecon()
        reconnoiter.main()
    except KeyboardInterrupt:
        print('\n')
|