swift/bin/swift-recon
gholt 18f755a287 Added --top option to swift-recon -d
When showing the disk usage dispersion graph it is often useful to
know what those top full drives are so you can do further research.
Now you can run 'swift-recon -d --top 10' to list the top ten devices
by fullness after the usual dispersion graph.

Change-Id: I7ddb2141e55e0613f69750fabe544940192c2d48
2012-11-29 22:18:18 +00:00

766 lines
31 KiB
Python
Executable File

#! /usr/bin/env python
"""
cmdline utility to perform cluster reconnaissance
"""
from hashlib import md5
import optparse
import os
import socket
import sys
import time
from urlparse import urlparse

import eventlet
from eventlet.green import urllib2
try:
    import simplejson as json
except ImportError:
    import json

from swift.common.ring import Ring
class Scout(object):
    """
    Obtain swift recon information

    An instance is bound to one recon check and may be pointed at any
    number of hosts through :meth:`scout`.
    """

    def __init__(self, recon_type, verbose=False, suppress_errors=False,
                 timeout=5):
        """
        :param recon_type: the swift recon check to request; appended to
                           the '/recon/' url of each host
        :param verbose: print every url together with its decoded response
                        (errors are then printed as well)
        :param suppress_errors: do not print request errors unless verbose
        :param timeout: timeout handed to urlopen for every request
        """
        self.recon_type = recon_type
        self.verbose = verbose
        self.suppress_errors = suppress_errors
        self.timeout = timeout

    def scout_host(self, base_url, recon_type):
        """
        Perform the actual HTTP request to obtain swift recon telemetry.

        :param base_url: the base url of the host you wish to check. str of
                         the format 'http://127.0.0.1:6000/recon/'
        :param recon_type: the swift recon check to request.
        :returns: tuple of (recon url used, response body, and status).
                  On success the body is the decoded JSON and status is
                  200. On an HTTP error the body is the exception and
                  status is its HTTP code. When the host cannot be reached
                  or times out the body is the exception and status is -1.
        """
        url = base_url + recon_type
        try:
            body = urllib2.urlopen(url, timeout=self.timeout).read()
            content = json.loads(body)
            if self.verbose:
                print("-> %s: %s" % (url, content))
            status = 200
        except urllib2.HTTPError as err:
            if not self.suppress_errors or self.verbose:
                print("-> %s: %s" % (url, err))
            content = err
            status = err.code
        except (urllib2.URLError, socket.timeout) as err:
            # A timeout while reading the body surfaces as a bare
            # socket.timeout rather than a URLError; treat it like any
            # other unreachable host instead of aborting the whole run.
            if not self.suppress_errors or self.verbose:
                print("-> %s: %s" % (url, err))
            content = err
            status = -1
        return url, content, status

    def scout(self, host):
        """
        Obtain telemetry from a host running the swift recon middleware.

        :param host: host to check, an (ip, port) pair
        :returns: tuple of (recon url used, response body, and status)
        """
        base_url = "http://%s:%s/recon/" % (host[0], host[1])
        return self.scout_host(base_url, self.recon_type)
class SwiftRecon(object):
    """
    Retrieve and report cluster info from hosts running recon middleware.

    Every check method queries the given hosts through a green thread pool
    and prints its report to stdout. ``print`` is always called with one
    parenthesised argument, which behaves exactly like the Python 2 print
    statement this script is written for.
    """

    def __init__(self):
        self.verbose = False
        self.suppress_errors = False
        self.timeout = 5
        # size of the green thread pool used to query hosts
        self.pool_size = 30
        self.pool = eventlet.GreenPool(self.pool_size)
        # server types accepted as the positional command line argument
        self.check_types = ['account', 'container', 'object']
        self.server_type = 'object'

    def _gen_stats(self, stats, name=None):
        """
        compute various stats from a list of values

        :param stats: list of numbers; None entries are counted as hosts
                      without a result and excluded from the math
        :param name: label stored in the result for later printing
        :returns: dict with the keys low, high, total, reported,
                  number_none, name, average and perc_none, or just
                  {'reported': 0} when there is no usable value
        """
        cstats = [x for x in stats if x is not None]
        if not cstats:
            return {'reported': 0}
        ret_dict = {'low': min(cstats), 'high': max(cstats),
                    'total': sum(cstats), 'reported': len(cstats),
                    'number_none': len(stats) - len(cstats), 'name': name}
        ret_dict['average'] = ret_dict['total'] / float(len(cstats))
        ret_dict['perc_none'] = ret_dict['number_none'] * 100.0 / len(stats)
        return ret_dict

    def _print_stats(self, stats):
        """
        print out formatted stats to console

        :param stats: dict of stats generated by _gen_stats; must hold at
                      least one reported value
        """
        print('[%(name)s] low: %(low)d, high: %(high)d, avg: '
              '%(average).1f, total: %(total)d, '
              'Failed: %(perc_none).1f%%, no_result: %(number_none)d, '
              'reported: %(reported)d' % stats)

    def _report_stats(self, values, name, missing_msg):
        """
        print the stats for values, or missing_msg if nothing was reported

        :param values: list of numbers and/or None as taken by _gen_stats
        :param name: label used in the printed stats line
        :param missing_msg: line printed when no usable value is present
        """
        computed = self._gen_stats(values, name)
        if computed['reported'] > 0:
            self._print_stats(computed)
        else:
            print(missing_msg)

    def _good_responses(self, recon_type, hosts):
        """
        query all hosts and yield (url, response) for every status 200

        :param recon_type: the swift recon check to request
        :param hosts: set of (ip, port) tuples to query
        """
        recon = Scout(recon_type, self.verbose, self.suppress_errors,
                      self.timeout)
        for url, response, status in self.pool.imap(recon.scout, hosts):
            if status == 200:
                yield url, response

    def _ptime(self, timev=None):
        """
        :param timev: a unix timestamp or None
        :returns: a pretty string of the current time or provided time
        """
        # Only None means "now"; 0 is a valid timestamp (the epoch).
        if timev is not None:
            return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(timev))
        return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())

    def get_devices(self, zone_filter, swift_dir, ring_name):
        """
        Get a list of hosts in the ring

        :param zone_filter: Only list zones matching given filter; None
                            disables filtering
        :param swift_dir: Directory of swift config, usually /etc/swift
        :param ring_name: Name of the ring, such as 'object'
        :returns: a set of tuples containing the ip and port of hosts
        """
        ring_data = Ring(swift_dir, ring_name=ring_name)
        # Compare against None so that zone 0 can be selected as well.
        if zone_filter is not None:
            return set((n['ip'], n['port']) for n in ring_data.devs
                       if n and n['zone'] == zone_filter)
        return set((n['ip'], n['port']) for n in ring_data.devs if n)

    def get_ringmd5(self, hosts, ringfile):
        """
        Compare ring md5sum's with those on remote host

        :param hosts: set of hosts to check. in the format of:
            set([('127.0.0.1', 6020), ('127.0.0.2', 6030)])
        :param ringfile: The local ring file to compare the md5sum with.
        """
        matches = 0
        errors = 0
        md5sum = md5()
        with open(ringfile, 'rb') as f:
            block = f.read(4096)
            while block:
                md5sum.update(block)
                block = f.read(4096)
        ring_sum = md5sum.hexdigest()
        recon = Scout("ringmd5", self.verbose, self.suppress_errors,
                      self.timeout)
        print("[%s] Checking ring md5sums" % self._ptime())
        if self.verbose:
            print("-> On disk %s md5sum: %s" % (ringfile, ring_sum))
        for url, response, status in self.pool.imap(recon.scout, hosts):
            if status != 200:
                errors += 1
                continue
            # the remote answer is keyed by ring file path
            remote_sum = response[ringfile]
            if remote_sum != ring_sum:
                print("!! %s (%s) doesn't match on disk md5sum" %
                      (url, remote_sum))
            else:
                matches += 1
                if self.verbose:
                    print("-> %s matches." % url)
        print("%s/%s hosts matched, %s error[s] while checking hosts."
              % (matches, len(hosts), errors))
        print("=" * 79)

    def async_check(self, hosts):
        """
        Obtain and print async pending statistics

        :param hosts: set of hosts to check. in the format of:
            set([('127.0.0.1', 6020), ('127.0.0.2', 6030)])
        """
        scan = {}
        print("[%s] Checking async pendings" % self._ptime())
        for url, response in self._good_responses("async", hosts):
            scan[url] = response['async_pending']
        self._report_stats(scan.values(), 'async_pending',
                           "[async_pending] - No hosts returned valid data.")
        print("=" * 79)

    def umount_check(self, hosts):
        """
        Check for and print unmounted drives

        :param hosts: set of hosts to check. in the format of:
            set([('127.0.0.1', 6020), ('127.0.0.2', 6030)])
        """
        stats = {}
        print("[%s] Getting unmounted drives from %s hosts..." %
              (self._ptime(), len(hosts)))
        for url, response in self._good_responses("unmounted", hosts):
            stats[url] = [i['device'] for i in response]
        for host in stats:
            node = urlparse(host).netloc
            for entry in stats[host]:
                print("Not mounted: %s on %s" % (entry, node))
        print("=" * 79)

    def expirer_check(self, hosts):
        """
        Obtain and print expirer statistics

        :param hosts: set of hosts to check. in the format of:
            set([('127.0.0.1', 6020), ('127.0.0.2', 6030)])
        """
        stats = {'object_expiration_pass': [], 'expired_last_pass': []}
        print("[%s] Checking on expirers" % self._ptime())
        for url, response in self._good_responses(
                "expirer/%s" % self.server_type, hosts):
            for key in stats:
                stats[key].append(response.get(key))
        for k in stats:
            self._report_stats(stats[k], k,
                               "[%s] - No hosts returned valid data." % k)
        print("=" * 79)

    def replication_check(self, hosts):
        """
        Obtain and print replication statistics

        :param hosts: set of hosts to check. in the format of:
            set([('127.0.0.1', 6020), ('127.0.0.2', 6030)])
        """
        stats = {'replication_time': [], 'failure': [], 'success': [],
                 'attempted': []}
        print("[%s] Checking on replication" % self._ptime())
        for url, response in self._good_responses(
                "replication/%s" % self.server_type, hosts):
            stats['replication_time'].append(
                response.get('replication_time'))
            # .get(): a host without the key is skipped instead of
            # raising KeyError, matching the lookup just above
            repl_stats = response.get('replication_stats')
            if repl_stats:
                for stat_key in ['attempted', 'failure', 'success']:
                    stats[stat_key].append(repl_stats.get(stat_key))
        for k in stats:
            if k != 'replication_time':
                name = 'replication_%s' % k
            else:
                name = k
            self._report_stats(stats[k], name,
                               "[%s] - No hosts returned valid data." % k)
        print("=" * 79)

    def object_replication_check(self, hosts):
        """
        Obtain and print replication statistics from object servers

        :param hosts: set of hosts to check. in the format of:
            set([('127.0.0.1', 6020), ('127.0.0.2', 6030)])
        """
        stats = {}
        print("[%s] Checking on replication" % self._ptime())
        for url, response in self._good_responses("replication", hosts):
            stats[url] = response['object_replication_time']
        # hosts without a time are dropped before computing the stats
        times = [x for x in stats.values() if x is not None]
        self._report_stats(
            times, 'replication_time',
            "[replication_time] - No hosts returned valid data.")
        print("=" * 79)

    def updater_check(self, hosts):
        """
        Obtain and print updater statistics

        :param hosts: set of hosts to check. in the format of:
            set([('127.0.0.1', 6020), ('127.0.0.2', 6030)])
        """
        stats = []
        sweep_key = '%s_updater_sweep' % self.server_type
        print("[%s] Checking updater times" % self._ptime())
        for url, response in self._good_responses(
                "updater/%s" % self.server_type, hosts):
            if response[sweep_key]:
                stats.append(response[sweep_key])
        self._report_stats(
            stats, 'updater_last_sweep',
            "[updater_last_sweep] - No hosts returned valid data.")
        print("=" * 79)

    def auditor_check(self, hosts):
        """
        Obtain and print obj auditor statistics

        :param hosts: set of hosts to check. in the format of:
            set([('127.0.0.1', 6020), ('127.0.0.2', 6030)])
        """
        scan = {}
        adone = '%s_auditor_pass_completed' % self.server_type
        afail = '%s_audits_failed' % self.server_type
        apass = '%s_audits_passed' % self.server_type
        asince = '%s_audits_since' % self.server_type
        print("[%s] Checking auditor stats" % self._ptime())
        for url, response in self._good_responses(
                "auditor/%s" % self.server_type, hosts):
            scan[url] = response
        if len(scan) < 1:
            print("Error: No hosts available")
            return
        stats = {}
        for key in (adone, afail, apass, asince):
            stats[key] = [scan[i][key] for i in scan
                          if scan[i][key] is not None]
        for k in stats:
            if len(stats[k]) < 1:
                print("[%s] - No hosts returned valid data." % k)
            elif k != asince:
                # the "since" timestamps get their own line below
                computed = self._gen_stats(stats[k], k)
                if computed['reported'] > 0:
                    self._print_stats(computed)
        if len(stats[asince]) >= 1:
            low = min(stats[asince])
            high = max(stats[asince])
            total = sum(stats[asince])
            average = total / len(stats[asince])
            print('[last_pass] oldest: %s, newest: %s, avg: %s' %
                  (self._ptime(low), self._ptime(high),
                   self._ptime(average)))
        print("=" * 79)

    def object_auditor_check(self, hosts):
        """
        Obtain and print obj auditor statistics

        :param hosts: set of hosts to check. in the format of:
            set([('127.0.0.1', 6020), ('127.0.0.2', 6030)])
        """
        all_scan = {}
        zbf_scan = {}
        atime = 'audit_time'
        bprocessed = 'bytes_processed'
        passes = 'passes'
        errors = 'errors'
        quarantined = 'quarantined'
        print("[%s] Checking auditor stats " % self._ptime())
        for url, response in self._good_responses("auditor/object", hosts):
            if response['object_auditor_stats_ALL']:
                all_scan[url] = response['object_auditor_stats_ALL']
            if response['object_auditor_stats_ZBF']:
                zbf_scan[url] = response['object_auditor_stats_ZBF']
        # the ZBF report has no 'passes' entry, otherwise both are alike
        reports = (
            ('ALL', all_scan,
             (atime, bprocessed, passes, errors, quarantined)),
            ('ZBF', zbf_scan,
             (atime, bprocessed, errors, quarantined)),
        )
        for prefix, scan, keys in reports:
            if len(scan) < 1:
                print("[%s_auditor] - No hosts returned valid data." % prefix)
                continue
            stats = {}
            for k in keys:
                stats[k] = [scan[i][k] for i in scan]
            for k in stats:
                values = [x for x in stats[k] if x is not None]
                self._report_stats(
                    values, '%s_%s_last_path' % (prefix, k),
                    "[Auditor %s] - No hosts returned valid data." % k)
        print("=" * 79)

    def load_check(self, hosts):
        """
        Obtain and print load average statistics

        :param hosts: set of hosts to check. in the format of:
            set([('127.0.0.1', 6020), ('127.0.0.2', 6030)])
        """
        stats = {"1m": {}, "5m": {}, "15m": {}}
        print("[%s] Checking load averages" % self._ptime())
        for url, response in self._good_responses("load", hosts):
            for item in stats:
                stats[item][url] = response[item]
        for item in stats:
            # also covers hosts that answered but reported only nulls
            self._report_stats(
                stats[item].values(), '%s_load_avg' % item,
                "[%s_load_avg] - No hosts returned valid data." % item)
        print("=" * 79)

    def quarantine_check(self, hosts):
        """
        Obtain and print quarantine statistics

        :param hosts: set of hosts to check. in the format of:
            set([('127.0.0.1', 6020), ('127.0.0.2', 6030)])
        """
        stats = {"objects": {}, "containers": {}, "accounts": {}}
        print("[%s] Checking quarantine" % self._ptime())
        for url, response in self._good_responses("quarantined", hosts):
            for item in stats:
                stats[item][url] = response[item]
        for item in stats:
            # also covers hosts that answered but reported only nulls
            self._report_stats(stats[item].values(),
                               'quarantined_%s' % item,
                               "No hosts returned valid data.")
        print("=" * 79)

    def socket_usage(self, hosts):
        """
        Obtain and print /proc/net/sockstat statistics

        :param hosts: set of hosts to check. in the format of:
            set([('127.0.0.1', 6020), ('127.0.0.2', 6030)])
        """
        stats = {"tcp_in_use": {}, "tcp_mem_allocated_bytes": {},
                 "tcp6_in_use": {}, "time_wait": {},
                 "orphan": {}}
        print("[%s] Checking socket usage" % self._ptime())
        for url, response in self._good_responses("sockstat", hosts):
            for item in stats:
                stats[item][url] = response[item]
        for item in stats:
            # also covers hosts that answered but reported only nulls
            self._report_stats(stats[item].values(), item,
                               "No hosts returned valid data.")
        print("=" * 79)

    def disk_usage(self, hosts, top=0):
        """
        Obtain and print disk usage statistics

        :param hosts: set of hosts to check. in the format of:
            set([('127.0.0.1', 6020), ('127.0.0.2', 6030)])
        :param top: when greater than zero, also list that many of the
                    fullest devices after the usual report
        """
        stats = {}
        highs = []
        lows = []
        raw_total_used = []
        raw_total_avail = []
        percents = {}
        # (url, device, percent used) of each mounted device with a usage
        # above zero; only collected when a top list was requested
        device_usage = []
        print("[%s] Checking disk usage now" % self._ptime())
        for url, response in self._good_responses("diskusage", hosts):
            hostusage = []
            for entry in response:
                if not entry['mounted']:
                    continue
                used = float(entry['used']) / float(entry['size']) * 100.0
                raw_total_used.append(entry['used'])
                raw_total_avail.append(entry['avail'])
                hostusage.append(round(used, 2))
                if top > 0 and used > 0:
                    device_usage.append((url, entry['device'], used))
            stats[url] = hostusage
        for url in stats:
            if len(stats[url]) > 0:
                #get per host hi/los for another day
                lows.append(min(stats[url]))
                highs.append(max(stats[url]))
                for percent in stats[url]:
                    bucket = int(percent)
                    percents[bucket] = percents.get(bucket, 0) + 1
            else:
                print("-> %s: Error. No drive info available." % url)
        if len(lows) > 0:
            low = min(lows)
            high = max(highs)
            #dist graph shamelessly stolen from https://github.com/gholt/tcod
            print("Distribution Graph:")
            # scale so that the most populated bucket is 69 stars wide
            mul = 69.0 / max(percents.values())
            for percent in sorted(percents):
                print('% 3d%%%5d %s' % (percent, percents[percent],
                                        '*' * int(percents[percent] * mul)))
            raw_used = sum(raw_total_used)
            raw_avail = sum(raw_total_avail)
            raw_total = raw_used + raw_avail
            avg_used = 100.0 * raw_used / raw_total
            print("Disk usage: space used: %s of %s" % (raw_used, raw_total))
            print("Disk usage: space free: %s of %s" % (raw_avail, raw_total))
            print("Disk usage: lowest: %s%%, highest: %s%%, avg: %s%%" %
                  (low, high, avg_used))
        else:
            print("No hosts returned valid data.")
        print("=" * 79)
        if top > 0:
            print('TOP %s' % top)
            # sorted() is stable: devices with equal usage stay in the
            # order in which they were reported
            fullest = sorted(device_usage, key=lambda x: -x[2])[:top]
            for url, device, used in fullest:
                host = urlparse(url).netloc.split(':')[0]
                print('%.02f%% %s' % (used, '%-15s %s' % (host, device)))

    def main(self):
        """
        Retrieve and report cluster info from hosts running recon middleware.

        Parses the command line, looks the hosts up in the ring of the
        chosen server type and runs the requested checks. Exits with
        status 0 after printing the help when called without arguments or
        with more than one positional argument, and with status 1 when
        the server type is not valid.
        """
        print("=" * 79)
        usage = (
            "\n"
            "usage: %prog <server_type> [-v] [--suppress] [-a] [-r] [-u] "
            "[-d]\n"
            "[-l] [--md5] [--auditor] [--updater] [--expirer] [--sockstat]\n"
            "<server_type>\taccount|container|object\n"
            "Defaults to object server.\n"
            "ex: %prog container -l --auditor\n"
        )
        args = optparse.OptionParser(usage)
        args.add_option('--verbose', '-v', action="store_true",
                        help="Print verbose info")
        args.add_option('--suppress', action="store_true",
                        help="Suppress most connection related errors")
        args.add_option('--async', '-a', action="store_true",
                        help="Get async stats")
        args.add_option('--replication', '-r', action="store_true",
                        help="Get replication stats")
        args.add_option('--auditor', action="store_true",
                        help="Get auditor stats")
        args.add_option('--updater', action="store_true",
                        help="Get updater stats")
        args.add_option('--expirer', action="store_true",
                        help="Get expirer stats")
        args.add_option('--unmounted', '-u', action="store_true",
                        help="Check cluster for unmounted devices")
        args.add_option('--diskusage', '-d', action="store_true",
                        help="Get disk usage stats")
        args.add_option('--loadstats', '-l', action="store_true",
                        help="Get cluster load average stats")
        args.add_option('--quarantined', '-q', action="store_true",
                        help="Get cluster quarantine stats")
        args.add_option('--md5', action="store_true",
                        help="Get md5sum of servers ring and compare to "
                        "local copy")
        args.add_option('--sockstat', action="store_true",
                        help="Get cluster socket usage stats")
        args.add_option('--top', type='int', metavar='COUNT', default=0,
                        help='Also show the top COUNT entries in rank order.')
        args.add_option('--all', action="store_true",
                        help="Perform all checks. Equal to -arudlq --md5 "
                        "--sockstat")
        args.add_option('--zone', '-z', type="int",
                        help="Only query servers in specified zone")
        args.add_option('--timeout', '-t', type="int", metavar="SECONDS",
                        help="Time to wait for a response from a server",
                        default=5)
        args.add_option('--swiftdir', default="/etc/swift",
                        help="Default = /etc/swift")
        options, arguments = args.parse_args()
        if len(sys.argv) <= 1 or len(arguments) > 1:
            args.print_help()
            sys.exit(0)
        if arguments:
            if arguments[0] in self.check_types:
                self.server_type = arguments[0]
            else:
                print("Invalid Server Type")
                args.print_help()
                sys.exit(1)
        else:
            self.server_type = 'object'
        swift_dir = options.swiftdir
        ring_file = os.path.join(swift_dir, '%s.ring.gz' % self.server_type)
        self.verbose = options.verbose
        self.suppress_errors = options.suppress
        self.timeout = options.timeout
        # options.zone is None unless --zone was given; get_devices takes
        # None as "all zones", so the value can be handed over unchanged
        hosts = self.get_devices(options.zone, swift_dir, self.server_type)
        print("--> Starting reconnaissance on %s hosts" % len(hosts))
        print("=" * 79)
        if options.all:
            if self.server_type == 'object':
                self.async_check(hosts)
                self.object_replication_check(hosts)
                self.object_auditor_check(hosts)
                self.updater_check(hosts)
                self.expirer_check(hosts)
            elif self.server_type == 'container':
                self.replication_check(hosts)
                self.auditor_check(hosts)
                self.updater_check(hosts)
            elif self.server_type == 'account':
                self.replication_check(hosts)
                self.auditor_check(hosts)
            self.umount_check(hosts)
            self.load_check(hosts)
            # honour --top together with --all (defaults to 0, no list)
            self.disk_usage(hosts, options.top)
            self.get_ringmd5(hosts, ring_file)
            self.quarantine_check(hosts)
            self.socket_usage(hosts)
        else:
            # 'async' is a reserved word from Python 3.7 on; getattr
            # reads the same attribute and keeps this file parseable there
            if getattr(options, 'async'):
                if self.server_type == 'object':
                    self.async_check(hosts)
                else:
                    print("Error: Can't check async's on non object servers.")
            if options.unmounted:
                self.umount_check(hosts)
            if options.replication:
                if self.server_type == 'object':
                    self.object_replication_check(hosts)
                else:
                    self.replication_check(hosts)
            if options.auditor:
                if self.server_type == 'object':
                    self.object_auditor_check(hosts)
                else:
                    self.auditor_check(hosts)
            if options.updater:
                if self.server_type == 'account':
                    print("Error: Can't check updaters on account servers.")
                else:
                    self.updater_check(hosts)
            if options.expirer:
                if self.server_type == 'object':
                    self.expirer_check(hosts)
                else:
                    print("Error: Can't check expired on non object servers.")
            if options.loadstats:
                self.load_check(hosts)
            if options.diskusage:
                self.disk_usage(hosts, options.top)
            if options.md5:
                self.get_ringmd5(hosts, ring_file)
            if options.quarantined:
                self.quarantine_check(hosts)
            if options.sockstat:
                self.socket_usage(hosts)
if __name__ == '__main__':
    # Script entry point: run the report; on Ctrl-C finish the output
    # with a newline instead of showing a traceback.
    try:
        SwiftRecon().main()
    except KeyboardInterrupt:
        print('\n')