swift/bin/swift-recon
Florian Hines ccb6334c17 Expand recon middleware support
Expand recon middleware to include support for account and container
servers in addition to the existing object servers. Also add support
for retrieving recent information from auditors, replicators, and
updaters. In the case of certain checks (such as container auditors)
the stats returned are only for the most recent path processed.

The middleware has also been refactored and should now also handle
errors better in cases where stats are unavailable.

While new check's have been added the output from pre-existing
check's has not changed. This should allow existing 3rd party
utilities such as the Swift ZenPack to continue to function.

Change-Id: Ib9893a77b9b8a2f03179f2a73639bc4a6e264df7
2012-05-24 14:50:00 -05:00

744 lines
29 KiB
Python
Executable File

#! /usr/bin/env python
"""
cmdline utility to perform cluster reconnaissance
"""
from eventlet.green import urllib2
from swift.common.ring import Ring
from urlparse import urlparse
try:
import simplejson as json
except ImportError:
import json
from hashlib import md5
import eventlet
import optparse
import time
import sys
import os
class Scout(object):
"""
Obtain swift recon information
"""
def __init__(self, recon_type, verbose=False, suppress_errors=False,
timeout=5):
self.recon_type = recon_type
self.verbose = verbose
self.suppress_errors = suppress_errors
self.timeout = timeout
def scout_host(self, base_url, recon_type):
"""
Perform the actual HTTP request to obtain swift recon telemtry.
:param base_url: the base url of the host you wish to check. str of the
format 'http://127.0.0.1:6000/recon/'
:param recon_type: the swift recon check to request.
:returns: tuple of (recon url used, response body, and status)
"""
url = base_url + recon_type
try:
body = urllib2.urlopen(url, timeout=self.timeout).read()
content = json.loads(body)
if self.verbose:
print "-> %s: %s" % (url, content)
status = 200
except urllib2.HTTPError as err:
if not self.suppress_errors or self.verbose:
print "-> %s: %s" % (url, err)
content = err
status = err.code
except urllib2.URLError as err:
if not self.suppress_errors or self.verbose:
print "-> %s: %s" % (url, err)
content = err
status = -1
return url, content, status
def scout(self, host):
"""
Obtain telemetry from a host running the swift recon middleware.
:param host: host to check
:returns: tuple of (recon url used, response body, and status)
"""
base_url = "http://%s:%s/recon/" % (host[0], host[1])
url, content, status = self.scout_host(base_url, self.recon_type)
return url, content, status
class SwiftRecon(object):
"""
Retrieve and report cluster info from hosts running recon middleware.
"""
def __init__(self):
self.verbose = False
self.suppress_errors = False
self.timeout = 5
self.pool_size = 30
self.pool = eventlet.GreenPool(self.pool_size)
self.check_types = ['account', 'container', 'object']
self.server_type = 'object'
def _gen_stats(self, stats, name=None):
""" compute various stats from a list of values """
cstats = [x for x in stats if x is not None]
if len(cstats) > 0:
ret_dict = {'low': min(cstats), 'high': max(cstats),
'total': sum(cstats), 'reported': len(cstats),
'number_none': len(stats) - len(cstats), 'name': name}
ret_dict['average'] = \
ret_dict['total'] / float(len(cstats))
ret_dict['perc_none'] = \
ret_dict['number_none'] * 100.0 / len(stats)
else:
ret_dict = {'reported': 0}
return ret_dict
def _print_stats(self, stats):
"""
print out formatted stats to console
:param stats: dict of stats generated by _gen_stats
"""
print '[%(name)s] low: %(low)d, high: %(high)d, avg: ' \
'%(average).1f, total: %(total)d, ' \
'Failed: %(perc_none).1f%%, no_result: %(number_none)d, ' \
'reported: %(reported)d' % stats
def _ptime(self, timev=None):
"""
:param timev: a unix timestamp or None
:returns: a pretty string of the current time or provided time
"""
if timev:
return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(timev))
else:
return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
def get_devices(self, zone_filter, swift_dir, ring_name):
"""
Get a list of hosts in the ring
:param zone_filter: Only list zones matching given filter
:param swift_dir: Directory of swift config, usually /etc/swift
:param ring_name: Name of the ring, such as 'object'
:returns: a set of tuples containing the ip and port of hosts
"""
ring_data = Ring(swift_dir, ring_name=ring_name)
if zone_filter:
ips = set((n['ip'], n['port']) for n in ring_data.devs if n \
if n['zone'] == zone_filter)
else:
ips = set((n['ip'], n['port']) for n in ring_data.devs if n)
return ips
def get_ringmd5(self, hosts, ringfile):
"""
Compare ring md5sum's with those on remote host
:param hosts: set of hosts to check. in the format of:
set([('127.0.0.1', 6020), ('127.0.0.2', 6030)])
:param ringfile: The local ring file to compare the md5sum with.
"""
stats = {}
matches = 0
errors = 0
md5sum = md5()
with open(ringfile, 'rb') as f:
block = f.read(4096)
while block:
md5sum.update(block)
block = f.read(4096)
ring_sum = md5sum.hexdigest()
recon = Scout("ringmd5", self.verbose, self.suppress_errors,
self.timeout)
print "[%s] Checking ring md5sums" % self._ptime()
if self.verbose:
print "-> On disk %s md5sum: %s" % (ringfile, ring_sum)
for url, response, status in self.pool.imap(recon.scout, hosts):
if status == 200:
stats[url] = response[ringfile]
if response[ringfile] != ring_sum:
print "!! %s (%s) doesn't match on disk md5sum" % \
(url, response[ringfile])
else:
matches = matches + 1
if self.verbose:
print "-> %s matches." % url
else:
errors = errors + 1
print "%s/%s hosts matched, %s error[s] while checking hosts." % \
(matches, len(hosts), errors)
print "=" * 79
def async_check(self, hosts):
"""
Obtain and print async pending statistics
:param hosts: set of hosts to check. in the format of:
set([('127.0.0.1', 6020), ('127.0.0.2', 6030)])
"""
scan = {}
recon = Scout("async", self.verbose, self.suppress_errors,
self.timeout)
print "[%s] Checking async pendings" % self._ptime()
for url, response, status in self.pool.imap(recon.scout, hosts):
if status == 200:
scan[url] = response['async_pending']
stats = self._gen_stats(scan.values(), 'async_pending')
if stats['reported'] > 0:
self._print_stats(stats)
else:
print "[async_pending] - No hosts returned valid data."
print "=" * 79
def umount_check(self, hosts):
"""
Check for and print unmounted drives
:param hosts: set of hosts to check. in the format of:
set([('127.0.0.1', 6020), ('127.0.0.2', 6030)])
"""
stats = {}
recon = Scout("unmounted", self.verbose, self.suppress_errors,
self.timeout)
print "[%s] Getting unmounted drives from %s hosts..." % \
(self._ptime(), len(hosts))
for url, response, status in self.pool.imap(recon.scout, hosts):
if status == 200:
for i in response:
stats[url] = i['device']
for host in stats:
node = urlparse(host).netloc
print "Not mounted: %s on %s" % (stats[host], node)
print "=" * 79
def expirer_check(self, hosts):
"""
Obtain and print expirer statistics
:param hosts: set of hosts to check. in the format of:
set([('127.0.0.1', 6020), ('127.0.0.2', 6030)])
"""
stats = {'object_expiration_pass': [], 'expired_last_pass': []}
recon = Scout("expirer/%s" % self.server_type, self.verbose,
self.suppress_errors, self.timeout)
print "[%s] Checking on expirers" % self._ptime()
for url, response, status in self.pool.imap(recon.scout, hosts):
if status == 200:
stats['object_expiration_pass'].append(
response.get('object_expiration_pass'))
stats['expired_last_pass'].append(
response.get('expired_last_pass'))
for k in stats:
if stats[k]:
computed = self._gen_stats(stats[k], name=k)
if computed['reported'] > 0:
self._print_stats(computed)
else:
print "[%s] - No hosts returned valid data." % k
else:
print "[%s] - No hosts returned valid data." % k
print "=" * 79
def replication_check(self, hosts):
"""
Obtain and print replication statistics
:param hosts: set of hosts to check. in the format of:
set([('127.0.0.1', 6020), ('127.0.0.2', 6030)])
"""
stats = {'replication_time': [], 'failure': [], 'success': [],
'attempted': []}
recon = Scout("replication/%s" % self.server_type, self.verbose,
self.suppress_errors, self.timeout)
print "[%s] Checking on replication" % self._ptime()
for url, response, status in self.pool.imap(recon.scout, hosts):
if status == 200:
stats['replication_time'].append(
response.get('replication_time'))
repl_stats = response['replication_stats']
if repl_stats:
for stat_key in ['attempted', 'failure', 'success']:
stats[stat_key].append(repl_stats.get(stat_key))
for k in stats:
if stats[k]:
if k != 'replication_time':
computed = self._gen_stats(stats[k],
name='replication_%s' % k)
else:
computed = self._gen_stats(stats[k], name=k)
if computed['reported'] > 0:
self._print_stats(computed)
else:
print "[%s] - No hosts returned valid data." % k
else:
print "[%s] - No hosts returned valid data." % k
print "=" * 79
def object_replication_check(self, hosts):
"""
Obtain and print replication statistics from object servers
:param hosts: set of hosts to check. in the format of:
set([('127.0.0.1', 6020), ('127.0.0.2', 6030)])
"""
stats = {}
recon = Scout("replication", self.verbose, self.suppress_errors,
self.timeout)
print "[%s] Checking on replication" % self._ptime()
for url, response, status in self.pool.imap(recon.scout, hosts):
if status == 200:
stats[url] = response['object_replication_time']
times = [x for x in stats.values() if x is not None]
if len(stats) > 0 and len(times) > 0:
computed = self._gen_stats(times, 'replication_time')
if computed['reported'] > 0:
self._print_stats(computed)
else:
print "[replication_time] - No hosts returned valid data."
else:
print "[replication_time] - No hosts returned valid data."
print "=" * 79
def updater_check(self, hosts):
"""
Obtain and print updater statistics
:param hosts: set of hosts to check. in the format of:
set([('127.0.0.1', 6020), ('127.0.0.2', 6030)])
"""
stats = []
recon = Scout("updater/%s" % self.server_type, self.verbose,
self.suppress_errors, self.timeout)
print "[%s] Checking updater times" % self._ptime()
for url, response, status in self.pool.imap(recon.scout, hosts):
if status == 200:
if response['%s_updater_sweep' % self.server_type]:
stats.append(response['%s_updater_sweep' %
self.server_type])
if len(stats) > 0:
computed = self._gen_stats(stats, name='updater_last_sweep')
if computed['reported'] > 0:
self._print_stats(computed)
else:
print "[updater_last_sweep] - No hosts returned valid data."
else:
print "[updater_last_sweep] - No hosts returned valid data."
print "=" * 79
def auditor_check(self, hosts):
"""
Obtain and print obj auditor statistics
:param hosts: set of hosts to check. in the format of:
set([('127.0.0.1', 6020), ('127.0.0.2', 6030)])
"""
scan = {}
adone = '%s_auditor_pass_completed' % self.server_type
afail = '%s_audits_failed' % self.server_type
apass = '%s_audits_passed' % self.server_type
asince = '%s_audits_since' % self.server_type
recon = Scout("auditor/%s" % self.server_type, self.verbose,
self.suppress_errors, self.timeout)
print "[%s] Checking auditor stats" % self._ptime()
for url, response, status in self.pool.imap(recon.scout, hosts):
if status == 200:
scan[url] = response
if len(scan) < 1:
print "Error: No hosts available"
return
stats = {}
stats[adone] = [scan[i][adone] for i in scan
if scan[i][adone] is not None]
stats[afail] = [scan[i][afail] for i in scan
if scan[i][afail] is not None]
stats[apass] = [scan[i][apass] for i in scan
if scan[i][apass] is not None]
stats[asince] = [scan[i][asince] for i in scan
if scan[i][asince] is not None]
for k in stats:
if len(stats[k]) < 1:
print "[%s] - No hosts returned valid data." % k
else:
if k != asince:
computed = self._gen_stats(stats[k], k)
if computed['reported'] > 0:
self._print_stats(computed)
if len(stats[asince]) >= 1:
low = min(stats[asince])
high = max(stats[asince])
total = sum(stats[asince])
average = total / len(stats[asince])
print '[last_pass] oldest: %s, newest: %s, avg: %s' % \
(self._ptime(low), self._ptime(high), self._ptime(average))
print "=" * 79
def object_auditor_check(self, hosts):
"""
Obtain and print obj auditor statistics
:param hosts: set of hosts to check. in the format of:
set([('127.0.0.1', 6020), ('127.0.0.2', 6030)])
"""
all_scan = {}
zbf_scan = {}
atime = 'audit_time'
bprocessed = 'bytes_processed'
passes = 'passes'
errors = 'errors'
quarantined = 'quarantined'
recon = Scout("auditor/object", self.verbose, self.suppress_errors,
self.timeout)
print "[%s] Checking auditor stats " % self._ptime()
for url, response, status in self.pool.imap(recon.scout, hosts):
if status == 200:
if response['object_auditor_stats_ALL']:
all_scan[url] = response['object_auditor_stats_ALL']
if response['object_auditor_stats_ZBF']:
zbf_scan[url] = response['object_auditor_stats_ZBF']
if len(all_scan) > 0:
stats = {}
stats[atime] = [all_scan[i][atime] for i in all_scan]
stats[bprocessed] = [all_scan[i][bprocessed] for i in all_scan]
stats[passes] = [all_scan[i][passes] for i in all_scan]
stats[errors] = [all_scan[i][errors] for i in all_scan]
stats[quarantined] = [all_scan[i][quarantined] for i in all_scan]
for k in stats:
if None in stats[k]:
stats[k] = [x for x in stats[k] if x is not None]
if len(stats[k]) < 1:
print "[Auditor %s] - No hosts returned valid data." % k
else:
computed = self._gen_stats(stats[k],
name='ALL_%s_last_path' % k)
if computed['reported'] > 0:
self._print_stats(computed)
else:
print "[ALL_auditor] - No hosts returned valid data."
else:
print "[ALL_auditor] - No hosts returned valid data."
if len(zbf_scan) > 0:
stats = {}
stats[atime] = [zbf_scan[i][atime] for i in zbf_scan]
stats[bprocessed] = [zbf_scan[i][bprocessed] for i in zbf_scan]
stats[errors] = [zbf_scan[i][errors] for i in zbf_scan]
stats[quarantined] = [zbf_scan[i][quarantined] for i in zbf_scan]
for k in stats:
if None in stats[k]:
stats[k] = [x for x in stats[k] if x is not None]
if len(stats[k]) < 1:
print "[Auditor %s] - No hosts returned valid data." % k
else:
computed = self._gen_stats(stats[k],
name='ZBF_%s_last_path' % k)
if computed['reported'] > 0:
self._print_stats(computed)
else:
print "[ZBF_auditor] - No hosts returned valid data."
else:
print "[ZBF_auditor] - No hosts returned valid data."
print "=" * 79
def load_check(self, hosts):
"""
Obtain and print load average statistics
:param hosts: set of hosts to check. in the format of:
set([('127.0.0.1', 6020), ('127.0.0.2', 6030)])
"""
load1 = {}
load5 = {}
load15 = {}
recon = Scout("load", self.verbose, self.suppress_errors,
self.timeout)
print "[%s] Checking load averages" % self._ptime()
for url, response, status in self.pool.imap(recon.scout, hosts):
if status == 200:
load1[url] = response['1m']
load5[url] = response['5m']
load15[url] = response['15m']
stats = {"1m": load1, "5m": load5, "15m": load15}
for item in stats:
if len(stats[item]) > 0:
computed = self._gen_stats(stats[item].values(),
name='%s_load_avg' % item)
self._print_stats(computed)
else:
print "[%s_load_avg] - No hosts returned valid data." % item
print "=" * 79
def quarantine_check(self, hosts):
"""
Obtain and print quarantine statistics
:param hosts: set of hosts to check. in the format of:
set([('127.0.0.1', 6020), ('127.0.0.2', 6030)])
"""
objq = {}
conq = {}
acctq = {}
recon = Scout("quarantined", self.verbose, self.suppress_errors,
self.timeout)
print "[%s] Checking quarantine" % self._ptime()
for url, response, status in self.pool.imap(recon.scout, hosts):
if status == 200:
objq[url] = response['objects']
conq[url] = response['containers']
acctq[url] = response['accounts']
stats = {"objects": objq, "containers": conq, "accounts": acctq}
for item in stats:
if len(stats[item]) > 0:
computed = self._gen_stats(stats[item].values(),
name='quarantined_%s' % item)
self._print_stats(computed)
else:
print "No hosts returned valid data."
print "=" * 79
def socket_usage(self, hosts):
"""
Obtain and print /proc/net/sockstat statistics
:param hosts: set of hosts to check. in the format of:
set([('127.0.0.1', 6020), ('127.0.0.2', 6030)])
"""
inuse4 = {}
mem = {}
inuse6 = {}
timewait = {}
orphan = {}
recon = Scout("sockstat", self.verbose, self.suppress_errors,
self.timeout)
print "[%s] Checking socket usage" % self._ptime()
for url, response, status in self.pool.imap(recon.scout, hosts):
if status == 200:
inuse4[url] = response['tcp_in_use']
mem[url] = response['tcp_mem_allocated_bytes']
inuse6[url] = response['tcp6_in_use']
timewait[url] = response['time_wait']
orphan[url] = response['orphan']
stats = {"tcp_in_use": inuse4, "tcp_mem_allocated_bytes": mem,
"tcp6_in_use": inuse6, "time_wait": timewait,
"orphan": orphan}
for item in stats:
if len(stats[item]) > 0:
computed = self._gen_stats(stats[item].values(), item)
self._print_stats(computed)
else:
print "No hosts returned valid data."
print "=" * 79
def disk_usage(self, hosts):
"""
Obtain and print disk usage statistics
:param hosts: set of hosts to check. in the format of:
set([('127.0.0.1', 6020), ('127.0.0.2', 6030)])
"""
stats = {}
highs = []
lows = []
raw_total_used = []
raw_total_avail = []
percents = {}
recon = Scout("diskusage", self.verbose, self.suppress_errors,
self.timeout)
print "[%s] Checking disk usage now" % self._ptime()
for url, response, status in self.pool.imap(recon.scout, hosts):
if status == 200:
hostusage = []
for entry in response:
if entry['mounted']:
used = float(entry['used']) / float(entry['size']) \
* 100.0
raw_total_used.append(entry['used'])
raw_total_avail.append(entry['avail'])
hostusage.append(round(used, 2))
stats[url] = hostusage
for url in stats:
if len(stats[url]) > 0:
#get per host hi/los for another day
low = min(stats[url])
high = max(stats[url])
highs.append(high)
lows.append(low)
for percent in stats[url]:
percents[int(percent)] = percents.get(int(percent), 0) + 1
else:
print "-> %s: Error. No drive info available." % url
if len(lows) > 0:
low = min(lows)
high = max(highs)
#dist graph shamelessly stolen from https://github.com/gholt/tcod
print "Distribution Graph:"
mul = 69.0 / max(percents.values())
for percent in sorted(percents):
print '% 3d%%%5d %s' % (percent, percents[percent], \
'*' * int(percents[percent] * mul))
raw_used = sum(raw_total_used)
raw_avail = sum(raw_total_avail)
raw_total = raw_used + raw_avail
avg_used = 100.0 * raw_used / raw_total
print "Disk usage: space used: %s of %s" % (raw_used, raw_total)
print "Disk usage: space free: %s of %s" % (raw_avail, raw_total)
print "Disk usage: lowest: %s%%, highest: %s%%, avg: %s%%" % \
(low, high, avg_used)
else:
print "No hosts returned valid data."
print "=" * 79
def main(self):
"""
Retrieve and report cluster info from hosts running recon middleware.
"""
print "=" * 79
usage = '''
usage: %prog <server_type> [-v] [--suppress] [-a] [-r] [-u] [-d]
[-l] [--md5] [--auditor] [--updater] [--expirer] [--sockstat]
<server_type>\taccount|container|object
Defaults to object server.
ex: %prog container -l --auditor
'''
args = optparse.OptionParser(usage)
args.add_option('--verbose', '-v', action="store_true",
help="Print verbose info")
args.add_option('--suppress', action="store_true",
help="Suppress most connection related errors")
args.add_option('--async', '-a', action="store_true",
help="Get async stats")
args.add_option('--replication', '-r', action="store_true",
help="Get replication stats")
args.add_option('--auditor', action="store_true",
help="Get auditor stats")
args.add_option('--updater', action="store_true",
help="Get updater stats")
args.add_option('--expirer', action="store_true",
help="Get expirer stats")
args.add_option('--unmounted', '-u', action="store_true",
help="Check cluster for unmounted devices")
args.add_option('--diskusage', '-d', action="store_true",
help="Get disk usage stats")
args.add_option('--loadstats', '-l', action="store_true",
help="Get cluster load average stats")
args.add_option('--quarantined', '-q', action="store_true",
help="Get cluster quarantine stats")
args.add_option('--md5', action="store_true",
help="Get md5sum of servers ring and compare to local copy")
args.add_option('--sockstat', action="store_true",
help="Get cluster socket usage stats")
args.add_option('--all', action="store_true",
help="Perform all checks. Equal to -arudlq --md5 --sockstat")
args.add_option('--zone', '-z', type="int",
help="Only query servers in specified zone")
args.add_option('--timeout', '-t', type="int", metavar="SECONDS",
help="Time to wait for a response from a server", default=5)
args.add_option('--swiftdir', default="/etc/swift",
help="Default = /etc/swift")
options, arguments = args.parse_args()
if len(sys.argv) <= 1 or len(arguments) > 1:
args.print_help()
sys.exit(0)
if arguments:
if arguments[0] in self.check_types:
self.server_type = arguments[0]
else:
print "Invalid Server Type"
args.print_help()
sys.exit(1)
else:
self.server_type = 'object'
swift_dir = options.swiftdir
ring_file = os.path.join(swift_dir, '%s.ring.gz' % self.server_type)
self.verbose = options.verbose
self.suppress_errors = options.suppress
self.timeout = options.timeout
if options.zone:
hosts = self.get_devices(options.zone, swift_dir, self.server_type)
else:
hosts = self.get_devices(None, swift_dir, self.server_type)
print "--> Starting reconnaissance on %s hosts" % len(hosts)
print "=" * 79
if options.all:
if self.server_type == 'object':
self.async_check(hosts)
self.object_replication_check(hosts)
self.object_auditor_check(hosts)
self.updater_check(hosts)
self.expirer_check(hosts)
elif self.server_type == 'container':
self.replication_check(hosts)
self.auditor_check(hosts)
self.updater_check(hosts)
elif self.server_type == 'account':
self.replication_check(hosts)
self.auditor_check(hosts)
self.umount_check(hosts)
self.load_check(hosts)
self.disk_usage(hosts)
self.get_ringmd5(hosts, ring_file)
self.quarantine_check(hosts)
self.socket_usage(hosts)
else:
if options.async:
if self.server_type == 'object':
self.async_check(hosts)
else:
print "Error: Can't check async's on non object servers."
if options.unmounted:
self.umount_check(hosts)
if options.replication:
if self.server_type == 'object':
self.object_replication_check(hosts)
else:
self.replication_check(hosts)
if options.auditor:
if self.server_type == 'object':
self.object_auditor_check(hosts)
else:
self.auditor_check(hosts)
if options.updater:
if self.server_type == 'account':
print "Error: Can't check updaters on account servers."
else:
self.updater_check(hosts)
if options.expirer:
if self.server_type == 'object':
self.expirer_check(hosts)
else:
print "Error: Can't check expired on non object servers."
if options.loadstats:
self.load_check(hosts)
if options.diskusage:
self.disk_usage(hosts)
if options.md5:
self.get_ringmd5(hosts, ring_file)
if options.quarantined:
self.quarantine_check(hosts)
if options.sockstat:
self.socket_usage(hosts)
if __name__ == '__main__':
try:
reconnoiter = SwiftRecon()
reconnoiter.main()
except KeyboardInterrupt:
print '\n'