swift-recon cli cleanup

Refactored swift-recon completely. It's broken out into classes and no
longer uses globals. In addition, I pulled out the previous individual
scout_XXXX functions that were getting added for each check. All the
checks now use the same method from the Scout class to obtain stats
and telemetry.

Change-Id: I512ab50f29e1ef4d10bd3adbf1cac642308e7cf1
This commit is contained in:
Florian Hines 2012-02-25 20:33:15 +00:00
parent a6567e60ab
commit c18a4e4f43

View File

@ -6,7 +6,11 @@
from eventlet.green import urllib2 from eventlet.green import urllib2
from swift.common.ring import Ring from swift.common.ring import Ring
import simplejson as json from urlparse import urlparse
try:
import simplejson as json
except ImportError:
import json
from hashlib import md5 from hashlib import md5
import datetime import datetime
import eventlet import eventlet
@ -14,12 +18,84 @@ import optparse
import sys import sys
import os import os
VERBOSE = False
SUPPRESS_ERRORS = False class Scout(object):
TIMEOUT = 5 """
Obtain swift recon information
"""
def __init__(self, recon_type, verbose=False, suppress_errors=False,
             timeout=5):
    """
    Bind this scout to a single recon check.

    :param recon_type: name of the recon check to request; must be one
                       of the known check names listed in the body
    :param verbose: stored on the instance as self.verbose
    :param suppress_errors: stored on the instance as
                            self.suppress_errors
    :param timeout: stored on the instance as self.timeout
    :raises Exception: if recon_type is not a known recon check
    """
    known_checks = ("ringmd5", "async", "replication", "load",
                    "diskusage", "unmounted", "quarantined", "sockstat")
    # Guard clause: reject an unknown check before touching the instance.
    if recon_type not in known_checks:
        raise Exception("Invalid scout type requested")
    self.recon_type = recon_type
    self.verbose = verbose
    self.suppress_errors = suppress_errors
    self.timeout = timeout
def scout_host(self, base_url, recon_type):
"""
Perform the actual HTTP request to obtain swift recon telemtry.
:param base_url: the base url of the host you wish to check. str of the
format 'http://127.0.0.1:6000/recon/'
:param recon_type: the swift recon check to request.
:returns: tuple of (recon url used, response body, and status)
"""
url = base_url + recon_type
try:
body = urllib2.urlopen(url, timeout=self.timeout).read()
content = json.loads(body)
if self.verbose:
print "-> %s: %s" % (url, content)
status = 200
except urllib2.HTTPError as err:
if not self.suppress_errors or self.verbose:
print "-> %s: %s" % (url, err)
content = err
status = err.code
except urllib2.URLError as err:
if not self.suppress_errors or self.verbose:
print "-> %s: %s" % (url, err)
content = err
status = -1
return url, content, status
def scout(self, host):
    """
    Run this scout's recon check against a single host.

    :param host: host to check, indexed as (ip, port)
    :returns: tuple of (recon url used, response body, and status), as
              produced by scout_host
    """
    ip, port = host[0], host[1]
    return self.scout_host("http://%s:%s/recon/" % (ip, port),
                           self.recon_type)
def get_devices(zone_filter, ring_file): class SwiftRecon(object):
"""
Retrieve and report cluster info from hosts running recon middleware.
"""
def __init__(self):
    """
    Start with default reporting options and a green pool of 30 slots.
    """
    # Size of the green pool the check methods fan their requests out on.
    self.pool_size = 30
    self.pool = eventlet.GreenPool(self.pool_size)
    # Defaults; main() overwrites these three from the parsed options.
    self.timeout = 5
    self.suppress_errors = False
    self.verbose = False
def get_devices(self, zone_filter, ring_file):
"""
Get a list of hosts in the ring
:param zone_filter: Only list zones matching given filter
:param ring_file: Ring file to obtain hosts from
:returns: a set of tuples containing the ip and port of hosts
"""
ring_data = Ring(ring_file) ring_data = Ring(ring_file)
if zone_filter: if zone_filter:
ips = set((n['ip'], n['port']) for n in ring_data.devs if n \ ips = set((n['ip'], n['port']) for n in ring_data.devs if n \
@ -28,78 +104,14 @@ def get_devices(zone_filter, ring_file):
ips = set((n['ip'], n['port']) for n in ring_data.devs if n) ips = set((n['ip'], n['port']) for n in ring_data.devs if n)
return ips return ips
def get_ringmd5(self, hosts, ringfile):
"""
Compare ring md5sum's with those on remote host
def scout(base_url, recon_type): :param hosts: set of hosts to check. in the format of:
global VERBOSE, SUPPRESS_ERRORS set([('127.0.0.1', 6020), ('127.0.0.2', 6030)])
url = base_url + recon_type :param ringfile: The local ring file to compare the md5sum with.
try: """
body = urllib2.urlopen(url, timeout=TIMEOUT).read()
content = json.loads(body)
if VERBOSE:
print "-> %s: %s" % (url, content)
status = 200
except urllib2.HTTPError as e:
if not SUPPRESS_ERRORS or VERBOSE:
print "-> %s: %s" % (url, e)
content = e
status = e.code
except urllib2.URLError as e:
if not SUPPRESS_ERRORS or VERBOSE:
print "-> %s: %s" % (url, e)
content = e
status = -1
return url, content, status
def scout_md5(host):
    """
    Request the "ringmd5" recon check from one host.

    :param host: host to check, indexed as (ip, port)
    :returns: tuple of (recon url used, response body, and status)
    """
    recon_base = "http://%s:%s/recon/" % (host[0], host[1])
    return scout(recon_base, "ringmd5")
def scout_async(host):
    """
    Request the "async" recon check from one host.

    :param host: host to check, indexed as (ip, port)
    :returns: tuple of (recon url used, response body, and status)
    """
    recon_base = "http://%s:%s/recon/" % (host[0], host[1])
    return scout(recon_base, "async")
def scout_replication(host):
    """
    Request the "replication" recon check from one host.

    :param host: host to check, indexed as (ip, port)
    :returns: tuple of (recon url used, response body, and status)
    """
    recon_base = "http://%s:%s/recon/" % (host[0], host[1])
    return scout(recon_base, "replication")
def scout_load(host):
    """
    Request the "load" recon check from one host.

    :param host: host to check, indexed as (ip, port)
    :returns: tuple of (recon url used, response body, and status)
    """
    recon_base = "http://%s:%s/recon/" % (host[0], host[1])
    return scout(recon_base, "load")
def scout_du(host):
    """
    Request the "diskusage" recon check from one host.

    :param host: host to check, indexed as (ip, port)
    :returns: tuple of (recon url used, response body, and status)
    """
    recon_base = "http://%s:%s/recon/" % (host[0], host[1])
    return scout(recon_base, "diskusage")
def scout_umount(host):
    """
    Request the "unmounted" recon check from one host.

    :param host: host to check, indexed as (ip, port)
    :returns: tuple of (recon url used, response body, and status)
    """
    recon_base = "http://%s:%s/recon/" % (host[0], host[1])
    return scout(recon_base, "unmounted")
def scout_quarantine(host):
    """
    Request the "quarantined" recon check from one host.

    :param host: host to check, indexed as (ip, port)
    :returns: tuple of (recon url used, response body, and status)
    """
    recon_base = "http://%s:%s/recon/" % (host[0], host[1])
    return scout(recon_base, "quarantined")
def scout_sockstat(host):
    """
    Request the "sockstat" recon check from one host.

    :param host: host to check, indexed as (ip, port)
    :returns: tuple of (recon url used, response body, and status)
    """
    recon_base = "http://%s:%s/recon/" % (host[0], host[1])
    return scout(recon_base, "sockstat")
def get_ringmd5(hosts, ringfile):
stats = {} stats = {}
matches = 0 matches = 0
errors = 0 errors = 0
@ -110,22 +122,21 @@ def get_ringmd5(hosts, ringfile):
md5sum.update(block) md5sum.update(block)
block = f.read(4096) block = f.read(4096)
ring_sum = md5sum.hexdigest() ring_sum = md5sum.hexdigest()
pool = eventlet.GreenPool(20) recon = Scout("ringmd5", self.verbose, self.suppress_errors,
self.timeout)
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
print "[%s] Checking ring md5sum's on %s hosts..." % (now, len(hosts)) print "[%s] Checking ring md5sum's on %s hosts..." % (now, len(hosts))
if VERBOSE: if self.verbose:
print "-> On disk md5sum: %s" % ring_sum print "-> On disk md5sum: %s" % ring_sum
for url, response, status in pool.imap(scout_md5, hosts): for url, response, status in self.pool.imap(recon.scout, hosts):
if status == 200: if status == 200:
#fixme - need to grab from config
stats[url] = response[ringfile] stats[url] = response[ringfile]
if response[ringfile] != ring_sum: if response[ringfile] != ring_sum:
ringsmatch = False
print "!! %s (%s) doesn't match on disk md5sum" % \ print "!! %s (%s) doesn't match on disk md5sum" % \
(url, response[ringfile]) (url, response[ringfile])
else: else:
matches = matches + 1 matches = matches + 1
if VERBOSE: if self.verbose:
print "-> %s matches." % url print "-> %s matches." % url
else: else:
errors = errors + 1 errors = errors + 1
@ -133,13 +144,19 @@ def get_ringmd5(hosts, ringfile):
(matches, len(hosts), errors) (matches, len(hosts), errors)
print "=" * 79 print "=" * 79
def async_check(self, hosts):
"""
Obtain and print async pending statistics
def async_check(hosts): :param hosts: set of hosts to check. in the format of:
set([('127.0.0.1', 6020), ('127.0.0.2', 6030)])
"""
stats = {} stats = {}
pool = eventlet.GreenPool(20) recon = Scout("async", self.verbose, self.suppress_errors,
self.timeout)
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
print "[%s] Checking async pendings on %s hosts..." % (now, len(hosts)) print "[%s] Checking async pendings on %s hosts..." % (now, len(hosts))
for url, response, status in pool.imap(scout_async, hosts): for url, response, status in self.pool.imap(recon.scout, hosts):
if status == 200: if status == 200:
stats[url] = response['async_pending'] stats[url] = response['async_pending']
if len(stats) > 0: if len(stats) > 0:
@ -153,27 +170,42 @@ def async_check(hosts):
print "Error: No hosts available or returned valid information." print "Error: No hosts available or returned valid information."
print "=" * 79 print "=" * 79
def umount_check(self, hosts):
"""
Check for and print unmounted drives
def umount_check(hosts): :param hosts: set of hosts to check. in the format of:
set([('127.0.0.1', 6020), ('127.0.0.2', 6030)])
"""
stats = {} stats = {}
pool = eventlet.GreenPool(20) recon = Scout("unmounted", self.verbose, self.suppress_errors,
self.timeout)
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
print "[%s] Getting unmounted drives from %s hosts..." % (now, len(hosts)) print "[%s] Getting unmounted drives from %s hosts..." % \
for url, response, status in pool.imap(scout_umount, hosts): (now, len(hosts))
for url, response, status in self.pool.imap(recon.scout, hosts):
if status == 200: if status == 200:
for i in response: for i in response:
stats[url] = i['device'] stats[url] = i['device']
for host in stats: for host in stats:
print "Not mounted: %s on %s" % (stats[host], host) node = urlparse(host).netloc
print "Not mounted: %s on %s" % (stats[host], node)
print "=" * 79 print "=" * 79
def replication_check(self, hosts):
"""
Obtain and print replication statistics
def replication_check(hosts): :param hosts: set of hosts to check. in the format of:
set([('127.0.0.1', 6020), ('127.0.0.2', 6030)])
"""
stats = {} stats = {}
pool = eventlet.GreenPool(20) recon = Scout("replication", self.verbose, self.suppress_errors,
self.timeout)
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
print "[%s] Checking replication times on %s hosts..." % (now, len(hosts)) print "[%s] Checking replication times on %s hosts..." % \
for url, response, status in pool.imap(scout_replication, hosts): (now, len(hosts))
for url, response, status in self.pool.imap(recon.scout, hosts):
if status == 200: if status == 200:
stats[url] = response['object_replication_time'] stats[url] = response['object_replication_time']
if len(stats) > 0: if len(stats) > 0:
@ -187,15 +219,21 @@ def replication_check(hosts):
print "Error: No hosts available or returned valid information." print "Error: No hosts available or returned valid information."
print "=" * 79 print "=" * 79
def load_check(self, hosts):
"""
Obtain and print load average statistics
def load_check(hosts): :param hosts: set of hosts to check. in the format of:
set([('127.0.0.1', 6020), ('127.0.0.2', 6030)])
"""
load1 = {} load1 = {}
load5 = {} load5 = {}
load15 = {} load15 = {}
pool = eventlet.GreenPool(20) recon = Scout("load", self.verbose, self.suppress_errors,
self.timeout)
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
print "[%s] Checking load avg's on %s hosts..." % (now, len(hosts)) print "[%s] Checking load avg's on %s hosts..." % (now, len(hosts))
for url, response, status in pool.imap(scout_load, hosts): for url, response, status in self.pool.imap(recon.scout, hosts):
if status == 200: if status == 200:
load1[url] = response['1m'] load1[url] = response['1m']
load5[url] = response['5m'] load5[url] = response['5m']
@ -210,18 +248,24 @@ def load_check(hosts):
print "[%s load average] lowest: %s, highest: %s, avg: %s" % \ print "[%s load average] lowest: %s, highest: %s, avg: %s" % \
(item, low, high, average) (item, low, high, average)
else: else:
print "Error: No hosts available or returned valid information." print "Error: No hosts available or returned valid info."
print "=" * 79 print "=" * 79
def quarantine_check(self, hosts):
"""
Obtain and print quarantine statistics
def quarantine_check(hosts): :param hosts: set of hosts to check. in the format of:
set([('127.0.0.1', 6020), ('127.0.0.2', 6030)])
"""
objq = {} objq = {}
conq = {} conq = {}
acctq = {} acctq = {}
pool = eventlet.GreenPool(20) recon = Scout("quarantined", self.verbose, self.suppress_errors,
self.timeout)
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
print "[%s] Checking quarantine dirs on %s hosts..." % (now, len(hosts)) print "[%s] Checking quarantine on %s hosts..." % (now, len(hosts))
for url, response, status in pool.imap(scout_quarantine, hosts): for url, response, status in self.pool.imap(recon.scout, hosts):
if status == 200: if status == 200:
objq[url] = response['objects'] objq[url] = response['objects']
conq[url] = response['containers'] conq[url] = response['containers']
@ -233,31 +277,38 @@ def quarantine_check(hosts):
high = max(stats[item].values()) high = max(stats[item].values())
total = sum(stats[item].values()) total = sum(stats[item].values())
average = total / len(stats[item]) average = total / len(stats[item])
print "[Quarantined %s] low: %d, high: %d, avg: %d, total: %d" % \ print ("[Quarantined %s] low: %d, high: %d, avg: %d, total: %d"
(item, low, high, average, total) % (item, low, high, average, total))
else: else:
print "Error: No hosts available or returned valid information." print "Error: No hosts available or returned valid info."
print "=" * 79 print "=" * 79
def socket_usage(self, hosts):
"""
Obtain and print /proc/net/sockstat statistics
def socket_usage(hosts): :param hosts: set of hosts to check. in the format of:
set([('127.0.0.1', 6020), ('127.0.0.2', 6030)])
"""
inuse4 = {} inuse4 = {}
mem = {} mem = {}
inuse6 = {} inuse6 = {}
timewait = {} timewait = {}
orphan = {} orphan = {}
pool = eventlet.GreenPool(20) recon = Scout("sockstat", self.verbose, self.suppress_errors,
self.timeout)
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
print "[%s] Checking socket usage on %s hosts..." % (now, len(hosts)) print "[%s] Checking socket usage on %s hosts..." % (now, len(hosts))
for url, response, status in pool.imap(scout_sockstat, hosts): for url, response, status in self.pool.imap(recon.scout, hosts):
if status == 200: if status == 200:
inuse4[url] = response['tcp_in_use'] inuse4[url] = response['tcp_in_use']
mem[url] = response['tcp_mem_allocated_bytes'] mem[url] = response['tcp_mem_allocated_bytes']
inuse6[url] = response['tcp6_in_use'] inuse6[url] = response['tcp6_in_use']
timewait[url] = response['time_wait'] timewait[url] = response['time_wait']
orphan[url] = response['orphan'] orphan[url] = response['orphan']
stats = {"tcp_in_use": inuse4, "tcp_mem_allocated_bytes": mem, \ stats = {"tcp_in_use": inuse4, "tcp_mem_allocated_bytes": mem,
"tcp6_in_use": inuse6, "time_wait": timewait, "orphan": orphan} "tcp6_in_use": inuse6, "time_wait": timewait,
"orphan": orphan}
for item in stats: for item in stats:
if len(stats[item]) > 0: if len(stats[item]) > 0:
low = min(stats[item].values()) low = min(stats[item].values())
@ -270,22 +321,29 @@ def socket_usage(hosts):
print "Error: No hosts or info available." print "Error: No hosts or info available."
print "=" * 79 print "=" * 79
def disk_usage(self, hosts):
"""
Obtain and print disk usage statistics
def disk_usage(hosts): :param hosts: set of hosts to check. in the format of:
set([('127.0.0.1', 6020), ('127.0.0.2', 6030)])
"""
stats = {} stats = {}
highs = [] highs = []
lows = [] lows = []
averages = [] averages = []
percents = {} percents = {}
pool = eventlet.GreenPool(20) recon = Scout("diskusage", self.verbose, self.suppress_errors,
self.timeout)
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
print "[%s] Checking disk usage on %s hosts..." % (now, len(hosts)) print "[%s] Checking disk usage on %s hosts..." % (now, len(hosts))
for url, response, status in pool.imap(scout_du, hosts): for url, response, status in self.pool.imap(recon.scout, hosts):
if status == 200: if status == 200:
hostusage = [] hostusage = []
for entry in response: for entry in response:
if entry['mounted']: if entry['mounted']:
used = float(entry['used']) / float(entry['size']) * 100.0 used = float(entry['used']) / float(entry['size']) \
* 100.0
hostusage.append(round(used, 2)) hostusage.append(round(used, 2))
stats[url] = hostusage stats[url] = hostusage
@ -308,22 +366,22 @@ def disk_usage(hosts):
low = min(lows) low = min(lows)
high = max(highs) high = max(highs)
average = sum(averages) / len(averages) average = sum(averages) / len(averages)
#distrib graph shamelessly stolen from https://github.com/gholt/tcod #dist graph shamelessly stolen from https://github.com/gholt/tcod
print "Distribution Graph:" print "Distribution Graph:"
mul = 69.0 / max(percents.values()) mul = 69.0 / max(percents.values())
for percent in sorted(percents): for percent in sorted(percents):
print '% 3d%% % 4d %s' % (percent, percents[percent], \ print '% 3d%%%5d %s' % (percent, percents[percent], \
'*' * int(percents[percent] * mul)) '*' * int(percents[percent] * mul))
print "Disk usage: lowest: %s%%, highest: %s%%, avg: %s%%" % \ print "Disk usage: lowest: %s%%, highest: %s%%, avg: %s%%" % \
(low, high, average) (low, high, average)
else: else:
print "Error: No hosts available or returned valid information." print "Error: No hosts available or returned valid information."
print "=" * 79 print "=" * 79
def main(self):
def main(): """
global VERBOSE, SUPPRESS_ERRORS, TIMEOUT, swift_dir, pool Retrieve and report cluster info from hosts running recon middleware.
"""
print "=" * 79 print "=" * 79
usage = ''' usage = '''
usage: %prog [-v] [--suppress] [-a] [-r] [-u] [-d] [-l] [--objmd5] usage: %prog [-v] [--suppress] [-a] [-r] [-u] [-d] [-l] [--objmd5]
@ -350,65 +408,63 @@ def main():
args.add_option('--sockstat', action="store_true", args.add_option('--sockstat', action="store_true",
help="Get cluster socket usage stats") help="Get cluster socket usage stats")
args.add_option('--all', action="store_true", args.add_option('--all', action="store_true",
help="Perform all checks. Equivalent to -arudlq --objmd5 --sockstat") help="Perform all checks. Equal to -arudlq --objmd5 --sockstat")
args.add_option('--zone', '-z', type="int", args.add_option('--zone', '-z', type="int",
help="Only query servers in specified zone") help="Only query servers in specified zone")
args.add_option('--timeout', '-t', type="int", metavar="SECONDS", args.add_option('--timeout', '-t', type="int", metavar="SECONDS",
help="Time to wait for a response from a server") help="Time to wait for a response from a server", default=5)
args.add_option('--swiftdir', default="/etc/swift", args.add_option('--swiftdir', default="/etc/swift",
help="Default = /etc/swift") help="Default = /etc/swift")
options, arguments = args.parse_args() options, arguments = args.parse_args()
if len(sys.argv) <= 1: if len(sys.argv) <= 1:
args.print_help() args.print_help()
sys.exit(0)
swift_dir = options.swiftdir swift_dir = options.swiftdir
obj_ring = os.path.join(swift_dir, 'object.ring.gz') obj_ring = os.path.join(swift_dir, 'object.ring.gz')
con_ring = os.path.join(swift_dir, 'container.ring.gz')
acct_ring = os.path.join(swift_dir, 'account.ring.gz')
VERBOSE = options.verbose self.verbose = options.verbose
SUPPRESS_ERRORS = options.suppress self.suppress_errors = options.suppress
self.timeout = options.timeout
if options.zone: if options.zone:
hosts = get_devices(options.zone, obj_ring) hosts = self.get_devices(options.zone, obj_ring)
else: else:
hosts = get_devices(None, obj_ring) hosts = self.get_devices(None, obj_ring)
if options.timeout:
TIMEOUT = options.timeout
if options.all: if options.all:
async_check(hosts) self.async_check(hosts)
umount_check(hosts) self.umount_check(hosts)
replication_check(hosts) self.replication_check(hosts)
load_check(hosts) self.load_check(hosts)
disk_usage(hosts) self.disk_usage(hosts)
get_ringmd5(hosts, obj_ring) self.get_ringmd5(hosts, obj_ring)
quarantine_check(hosts) self.quarantine_check(hosts)
socket_usage(hosts) self.socket_usage(hosts)
else: else:
if options.async: if options.async:
async_check(hosts) self.async_check(hosts)
if options.unmounted: if options.unmounted:
umount_check(hosts) self.umount_check(hosts)
if options.replication: if options.replication:
replication_check(hosts) self.replication_check(hosts)
if options.loadstats: if options.loadstats:
load_check(hosts) self.load_check(hosts)
if options.diskusage: if options.diskusage:
disk_usage(hosts) self.disk_usage(hosts)
if options.objmd5: if options.objmd5:
get_ringmd5(hosts, obj_ring) self.get_ringmd5(hosts, obj_ring)
if options.quarantined: if options.quarantined:
quarantine_check(hosts) self.quarantine_check(hosts)
if options.sockstat: if options.sockstat:
socket_usage(hosts) self.socket_usage(hosts)
if __name__ == '__main__': if __name__ == '__main__':
try: try:
main() reconnoiter = SwiftRecon()
reconnoiter.main()
except KeyboardInterrupt: except KeyboardInterrupt:
print '\n' print '\n'