swift-recon cli cleanup

Refactored swift-recon completely. It's broken out into classes and no
longer uses globals. In addition, I pulled out the previous individual
scout_XXXX functions that were getting added for each check. All the
checks now use the same method from the Scout class to obtain stats
and telemetry.

Change-Id: I512ab50f29e1ef4d10bd3adbf1cac642308e7cf1
This commit is contained in:
Florian Hines 2012-02-25 20:33:15 +00:00
parent a6567e60ab
commit c18a4e4f43

View File

@ -6,7 +6,11 @@
from eventlet.green import urllib2
from swift.common.ring import Ring
from urlparse import urlparse
try:
import simplejson as json
except ImportError:
import json
from hashlib import md5
import datetime
import eventlet
@ -14,12 +18,84 @@ import optparse
import sys
import os
VERBOSE = False
SUPPRESS_ERRORS = False
TIMEOUT = 5
class Scout(object):
    """
    Obtain swift recon information

    A Scout is bound to one recon check type at construction time and
    fetches that check's JSON telemetry from hosts running the swift recon
    middleware.
    """

    # Recon middleware endpoints this client knows how to query.
    RECON_TYPES = ("ringmd5", "async", "replication", "load", "diskusage",
                   "unmounted", "quarantined", "sockstat")

    def __init__(self, recon_type, verbose=False, suppress_errors=False,
                 timeout=5):
        """
        :param recon_type: the swift recon check this scout will request;
                           must be one of RECON_TYPES
        :param verbose: print every response (and every error) received
        :param suppress_errors: do not print request errors unless verbose
        :param timeout: timeout passed to urlopen for each request
        :raises ValueError: if recon_type is not a known recon check
        """
        if recon_type not in self.RECON_TYPES:
            # ValueError is still an Exception, so callers that catch the
            # generic Exception keep working.
            raise ValueError("Invalid scout type requested")
        self.recon_type = recon_type
        self.verbose = verbose
        self.suppress_errors = suppress_errors
        self.timeout = timeout

    def _report_error(self, url, err):
        """
        Print a failed request unless errors are suppressed; verbose mode
        always prints.
        """
        if not self.suppress_errors or self.verbose:
            print("-> %s: %s" % (url, err))

    def scout_host(self, base_url, recon_type):
        """
        Perform the actual HTTP request to obtain swift recon telemetry.

        :param base_url: the base url of the host you wish to check. str of the
                         format 'http://127.0.0.1:6000/recon/'
        :param recon_type: the swift recon check to request.
        :returns: tuple of (recon url used, response body, and status).
                  On success the body is the decoded JSON and status is 200.
                  On an HTTP error the body is the exception and status is
                  its HTTP code. On any other failure (unreachable host,
                  timeout, undecodable body) the body is the exception and
                  status is -1.
        """
        # Imported locally so the block does not depend on a module-level
        # import that the surrounding file does not have.
        import socket

        url = base_url + recon_type
        try:
            body = urllib2.urlopen(url, timeout=self.timeout).read()
            content = json.loads(body)
        except urllib2.HTTPError as err:
            # HTTPError subclasses URLError, so it must be handled first.
            self._report_error(url, err)
            return url, err, err.code
        except (urllib2.URLError, socket.timeout, ValueError) as err:
            # socket.timeout: a read() that stalls is not wrapped in URLError.
            # ValueError: the host answered but the body was not valid JSON.
            # Either way one bad host must not abort the whole sweep.
            self._report_error(url, err)
            return url, err, -1
        if self.verbose:
            print("-> %s: %s" % (url, content))
        return url, content, 200

    def scout(self, host):
        """
        Obtain telemetry from a host running the swift recon middleware.

        :param host: host to check, as an (ip, port) pair
        :returns: tuple of (recon url used, response body, and status)
        """
        base_url = "http://%s:%s/recon/" % (host[0], host[1])
        return self.scout_host(base_url, self.recon_type)
def get_devices(zone_filter, ring_file):
class SwiftRecon(object):
"""
Retrieve and report cluster info from hosts running recon middleware.
"""
def __init__(self):
self.verbose = False
self.suppress_errors = False
self.timeout = 5
self.pool_size = 30
self.pool = eventlet.GreenPool(self.pool_size)
def get_devices(self, zone_filter, ring_file):
"""
Get a list of hosts in the ring
:param zone_filter: Only list zones matching given filter
:param ring_file: Ring file to obtain hosts from
:returns: a set of tuples containing the ip and port of hosts
"""
ring_data = Ring(ring_file)
if zone_filter:
ips = set((n['ip'], n['port']) for n in ring_data.devs if n \
@ -28,78 +104,14 @@ def get_devices(zone_filter, ring_file):
ips = set((n['ip'], n['port']) for n in ring_data.devs if n)
return ips
def get_ringmd5(self, hosts, ringfile):
"""
Compare ring md5sum's with those on remote host
def scout(base_url, recon_type):
global VERBOSE, SUPPRESS_ERRORS
url = base_url + recon_type
try:
body = urllib2.urlopen(url, timeout=TIMEOUT).read()
content = json.loads(body)
if VERBOSE:
print "-> %s: %s" % (url, content)
status = 200
except urllib2.HTTPError as e:
if not SUPPRESS_ERRORS or VERBOSE:
print "-> %s: %s" % (url, e)
content = e
status = e.code
except urllib2.URLError as e:
if not SUPPRESS_ERRORS or VERBOSE:
print "-> %s: %s" % (url, e)
content = e
status = -1
return url, content, status
def scout_md5(host):
base_url = "http://%s:%s/recon/" % (host[0], host[1])
url, content, status = scout(base_url, "ringmd5")
return url, content, status
def scout_async(host):
base_url = "http://%s:%s/recon/" % (host[0], host[1])
url, content, status = scout(base_url, "async")
return url, content, status
def scout_replication(host):
base_url = "http://%s:%s/recon/" % (host[0], host[1])
url, content, status = scout(base_url, "replication")
return url, content, status
def scout_load(host):
base_url = "http://%s:%s/recon/" % (host[0], host[1])
url, content, status = scout(base_url, "load")
return url, content, status
def scout_du(host):
base_url = "http://%s:%s/recon/" % (host[0], host[1])
url, content, status = scout(base_url, "diskusage")
return url, content, status
def scout_umount(host):
base_url = "http://%s:%s/recon/" % (host[0], host[1])
url, content, status = scout(base_url, "unmounted")
return url, content, status
def scout_quarantine(host):
base_url = "http://%s:%s/recon/" % (host[0], host[1])
url, content, status = scout(base_url, "quarantined")
return url, content, status
def scout_sockstat(host):
base_url = "http://%s:%s/recon/" % (host[0], host[1])
url, content, status = scout(base_url, "sockstat")
return url, content, status
def get_ringmd5(hosts, ringfile):
:param hosts: set of hosts to check. in the format of:
set([('127.0.0.1', 6020), ('127.0.0.2', 6030)])
:param ringfile: The local ring file to compare the md5sum with.
"""
stats = {}
matches = 0
errors = 0
@ -110,22 +122,21 @@ def get_ringmd5(hosts, ringfile):
md5sum.update(block)
block = f.read(4096)
ring_sum = md5sum.hexdigest()
pool = eventlet.GreenPool(20)
recon = Scout("ringmd5", self.verbose, self.suppress_errors,
self.timeout)
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
print "[%s] Checking ring md5sum's on %s hosts..." % (now, len(hosts))
if VERBOSE:
if self.verbose:
print "-> On disk md5sum: %s" % ring_sum
for url, response, status in pool.imap(scout_md5, hosts):
for url, response, status in self.pool.imap(recon.scout, hosts):
if status == 200:
#fixme - need to grab from config
stats[url] = response[ringfile]
if response[ringfile] != ring_sum:
ringsmatch = False
print "!! %s (%s) doesn't match on disk md5sum" % \
(url, response[ringfile])
else:
matches = matches + 1
if VERBOSE:
if self.verbose:
print "-> %s matches." % url
else:
errors = errors + 1
@ -133,13 +144,19 @@ def get_ringmd5(hosts, ringfile):
(matches, len(hosts), errors)
print "=" * 79
def async_check(self, hosts):
"""
Obtain and print async pending statistics
def async_check(hosts):
:param hosts: set of hosts to check. in the format of:
set([('127.0.0.1', 6020), ('127.0.0.2', 6030)])
"""
stats = {}
pool = eventlet.GreenPool(20)
recon = Scout("async", self.verbose, self.suppress_errors,
self.timeout)
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
print "[%s] Checking async pendings on %s hosts..." % (now, len(hosts))
for url, response, status in pool.imap(scout_async, hosts):
for url, response, status in self.pool.imap(recon.scout, hosts):
if status == 200:
stats[url] = response['async_pending']
if len(stats) > 0:
@ -153,27 +170,42 @@ def async_check(hosts):
print "Error: No hosts available or returned valid information."
print "=" * 79
def umount_check(self, hosts):
"""
Check for and print unmounted drives
def umount_check(hosts):
:param hosts: set of hosts to check. in the format of:
set([('127.0.0.1', 6020), ('127.0.0.2', 6030)])
"""
stats = {}
pool = eventlet.GreenPool(20)
recon = Scout("unmounted", self.verbose, self.suppress_errors,
self.timeout)
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
print "[%s] Getting unmounted drives from %s hosts..." % (now, len(hosts))
for url, response, status in pool.imap(scout_umount, hosts):
print "[%s] Getting unmounted drives from %s hosts..." % \
(now, len(hosts))
for url, response, status in self.pool.imap(recon.scout, hosts):
if status == 200:
for i in response:
stats[url] = i['device']
for host in stats:
print "Not mounted: %s on %s" % (stats[host], host)
node = urlparse(host).netloc
print "Not mounted: %s on %s" % (stats[host], node)
print "=" * 79
def replication_check(self, hosts):
"""
Obtain and print replication statistics
def replication_check(hosts):
:param hosts: set of hosts to check. in the format of:
set([('127.0.0.1', 6020), ('127.0.0.2', 6030)])
"""
stats = {}
pool = eventlet.GreenPool(20)
recon = Scout("replication", self.verbose, self.suppress_errors,
self.timeout)
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
print "[%s] Checking replication times on %s hosts..." % (now, len(hosts))
for url, response, status in pool.imap(scout_replication, hosts):
print "[%s] Checking replication times on %s hosts..." % \
(now, len(hosts))
for url, response, status in self.pool.imap(recon.scout, hosts):
if status == 200:
stats[url] = response['object_replication_time']
if len(stats) > 0:
@ -187,15 +219,21 @@ def replication_check(hosts):
print "Error: No hosts available or returned valid information."
print "=" * 79
def load_check(self, hosts):
"""
Obtain and print load average statistics
def load_check(hosts):
:param hosts: set of hosts to check. in the format of:
set([('127.0.0.1', 6020), ('127.0.0.2', 6030)])
"""
load1 = {}
load5 = {}
load15 = {}
pool = eventlet.GreenPool(20)
recon = Scout("load", self.verbose, self.suppress_errors,
self.timeout)
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
print "[%s] Checking load avg's on %s hosts..." % (now, len(hosts))
for url, response, status in pool.imap(scout_load, hosts):
for url, response, status in self.pool.imap(recon.scout, hosts):
if status == 200:
load1[url] = response['1m']
load5[url] = response['5m']
@ -210,18 +248,24 @@ def load_check(hosts):
print "[%s load average] lowest: %s, highest: %s, avg: %s" % \
(item, low, high, average)
else:
print "Error: No hosts available or returned valid information."
print "Error: No hosts available or returned valid info."
print "=" * 79
def quarantine_check(self, hosts):
"""
Obtain and print quarantine statistics
def quarantine_check(hosts):
:param hosts: set of hosts to check. in the format of:
set([('127.0.0.1', 6020), ('127.0.0.2', 6030)])
"""
objq = {}
conq = {}
acctq = {}
pool = eventlet.GreenPool(20)
recon = Scout("quarantined", self.verbose, self.suppress_errors,
self.timeout)
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
print "[%s] Checking quarantine dirs on %s hosts..." % (now, len(hosts))
for url, response, status in pool.imap(scout_quarantine, hosts):
print "[%s] Checking quarantine on %s hosts..." % (now, len(hosts))
for url, response, status in self.pool.imap(recon.scout, hosts):
if status == 200:
objq[url] = response['objects']
conq[url] = response['containers']
@ -233,31 +277,38 @@ def quarantine_check(hosts):
high = max(stats[item].values())
total = sum(stats[item].values())
average = total / len(stats[item])
print "[Quarantined %s] low: %d, high: %d, avg: %d, total: %d" % \
(item, low, high, average, total)
print ("[Quarantined %s] low: %d, high: %d, avg: %d, total: %d"
% (item, low, high, average, total))
else:
print "Error: No hosts available or returned valid information."
print "Error: No hosts available or returned valid info."
print "=" * 79
def socket_usage(self, hosts):
"""
Obtain and print /proc/net/sockstat statistics
def socket_usage(hosts):
:param hosts: set of hosts to check. in the format of:
set([('127.0.0.1', 6020), ('127.0.0.2', 6030)])
"""
inuse4 = {}
mem = {}
inuse6 = {}
timewait = {}
orphan = {}
pool = eventlet.GreenPool(20)
recon = Scout("sockstat", self.verbose, self.suppress_errors,
self.timeout)
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
print "[%s] Checking socket usage on %s hosts..." % (now, len(hosts))
for url, response, status in pool.imap(scout_sockstat, hosts):
for url, response, status in self.pool.imap(recon.scout, hosts):
if status == 200:
inuse4[url] = response['tcp_in_use']
mem[url] = response['tcp_mem_allocated_bytes']
inuse6[url] = response['tcp6_in_use']
timewait[url] = response['time_wait']
orphan[url] = response['orphan']
stats = {"tcp_in_use": inuse4, "tcp_mem_allocated_bytes": mem, \
"tcp6_in_use": inuse6, "time_wait": timewait, "orphan": orphan}
stats = {"tcp_in_use": inuse4, "tcp_mem_allocated_bytes": mem,
"tcp6_in_use": inuse6, "time_wait": timewait,
"orphan": orphan}
for item in stats:
if len(stats[item]) > 0:
low = min(stats[item].values())
@ -270,22 +321,29 @@ def socket_usage(hosts):
print "Error: No hosts or info available."
print "=" * 79
def disk_usage(self, hosts):
"""
Obtain and print disk usage statistics
def disk_usage(hosts):
:param hosts: set of hosts to check. in the format of:
set([('127.0.0.1', 6020), ('127.0.0.2', 6030)])
"""
stats = {}
highs = []
lows = []
averages = []
percents = {}
pool = eventlet.GreenPool(20)
recon = Scout("diskusage", self.verbose, self.suppress_errors,
self.timeout)
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
print "[%s] Checking disk usage on %s hosts..." % (now, len(hosts))
for url, response, status in pool.imap(scout_du, hosts):
for url, response, status in self.pool.imap(recon.scout, hosts):
if status == 200:
hostusage = []
for entry in response:
if entry['mounted']:
used = float(entry['used']) / float(entry['size']) * 100.0
used = float(entry['used']) / float(entry['size']) \
* 100.0
hostusage.append(round(used, 2))
stats[url] = hostusage
@ -308,22 +366,22 @@ def disk_usage(hosts):
low = min(lows)
high = max(highs)
average = sum(averages) / len(averages)
#distrib graph shamelessly stolen from https://github.com/gholt/tcod
#dist graph shamelessly stolen from https://github.com/gholt/tcod
print "Distribution Graph:"
mul = 69.0 / max(percents.values())
for percent in sorted(percents):
print '% 3d%% % 4d %s' % (percent, percents[percent], \
print '% 3d%%%5d %s' % (percent, percents[percent], \
'*' * int(percents[percent] * mul))
print "Disk usage: lowest: %s%%, highest: %s%%, avg: %s%%" % \
(low, high, average)
else:
print "Error: No hosts available or returned valid information."
print "=" * 79
def main():
global VERBOSE, SUPPRESS_ERRORS, TIMEOUT, swift_dir, pool
def main(self):
"""
Retrieve and report cluster info from hosts running recon middleware.
"""
print "=" * 79
usage = '''
usage: %prog [-v] [--suppress] [-a] [-r] [-u] [-d] [-l] [--objmd5]
@ -350,65 +408,63 @@ def main():
args.add_option('--sockstat', action="store_true",
help="Get cluster socket usage stats")
args.add_option('--all', action="store_true",
help="Perform all checks. Equivalent to -arudlq --objmd5 --sockstat")
help="Perform all checks. Equal to -arudlq --objmd5 --sockstat")
args.add_option('--zone', '-z', type="int",
help="Only query servers in specified zone")
args.add_option('--timeout', '-t', type="int", metavar="SECONDS",
help="Time to wait for a response from a server")
help="Time to wait for a response from a server", default=5)
args.add_option('--swiftdir', default="/etc/swift",
help="Default = /etc/swift")
options, arguments = args.parse_args()
if len(sys.argv) <= 1:
args.print_help()
sys.exit(0)
swift_dir = options.swiftdir
obj_ring = os.path.join(swift_dir, 'object.ring.gz')
con_ring = os.path.join(swift_dir, 'container.ring.gz')
acct_ring = os.path.join(swift_dir, 'account.ring.gz')
VERBOSE = options.verbose
SUPPRESS_ERRORS = options.suppress
self.verbose = options.verbose
self.suppress_errors = options.suppress
self.timeout = options.timeout
if options.zone:
hosts = get_devices(options.zone, obj_ring)
hosts = self.get_devices(options.zone, obj_ring)
else:
hosts = get_devices(None, obj_ring)
if options.timeout:
TIMEOUT = options.timeout
hosts = self.get_devices(None, obj_ring)
if options.all:
async_check(hosts)
umount_check(hosts)
replication_check(hosts)
load_check(hosts)
disk_usage(hosts)
get_ringmd5(hosts, obj_ring)
quarantine_check(hosts)
socket_usage(hosts)
self.async_check(hosts)
self.umount_check(hosts)
self.replication_check(hosts)
self.load_check(hosts)
self.disk_usage(hosts)
self.get_ringmd5(hosts, obj_ring)
self.quarantine_check(hosts)
self.socket_usage(hosts)
else:
if options.async:
async_check(hosts)
self.async_check(hosts)
if options.unmounted:
umount_check(hosts)
self.umount_check(hosts)
if options.replication:
replication_check(hosts)
self.replication_check(hosts)
if options.loadstats:
load_check(hosts)
self.load_check(hosts)
if options.diskusage:
disk_usage(hosts)
self.disk_usage(hosts)
if options.objmd5:
get_ringmd5(hosts, obj_ring)
self.get_ringmd5(hosts, obj_ring)
if options.quarantined:
quarantine_check(hosts)
self.quarantine_check(hosts)
if options.sockstat:
socket_usage(hosts)
self.socket_usage(hosts)
if __name__ == '__main__':
try:
main()
reconnoiter = SwiftRecon()
reconnoiter.main()
except KeyboardInterrupt:
print '\n'