Merge "swift-recon cli cleanup"
This commit is contained in:
commit
c51b8b2f20
352
bin/swift-recon
352
bin/swift-recon
@ -6,7 +6,11 @@
|
||||
|
||||
from eventlet.green import urllib2
|
||||
from swift.common.ring import Ring
|
||||
from urlparse import urlparse
|
||||
try:
|
||||
import simplejson as json
|
||||
except ImportError:
|
||||
import json
|
||||
from hashlib import md5
|
||||
import datetime
|
||||
import eventlet
|
||||
@ -14,12 +18,84 @@ import optparse
|
||||
import sys
|
||||
import os
|
||||
|
||||
VERBOSE = False
|
||||
SUPPRESS_ERRORS = False
|
||||
TIMEOUT = 5
|
||||
|
||||
class Scout(object):
    """
    Obtain swift recon information

    A Scout is bound to one recon check type and can then be pointed at any
    number of hosts running the swift recon middleware.

    :param recon_type: the swift recon check to request; must be one of
                       RECON_TYPES
    :param verbose: print every url queried together with its decoded
                    response
    :param suppress_errors: do not print request errors (has no effect when
                            verbose is set)
    :param timeout: seconds to wait for a response from a host
    :raises Exception: if recon_type is not a known recon check
    """

    # Recon checks that can be requested below /recon/
    RECON_TYPES = ("ringmd5", "async", "replication", "load", "diskusage",
                   "unmounted", "quarantined", "sockstat")

    def __init__(self, recon_type, verbose=False, suppress_errors=False,
                 timeout=5):
        if recon_type not in self.RECON_TYPES:
            raise Exception("Invalid scout type requested")
        self.recon_type = recon_type
        self.verbose = verbose
        self.suppress_errors = suppress_errors
        self.timeout = timeout

    def _report_error(self, url, err):
        """
        Print a failed request unless errors are suppressed.

        :param url: the recon url that was requested
        :param err: the exception raised while requesting/decoding it
        """
        if self.verbose or not self.suppress_errors:
            print("-> %s: %s" % (url, err))

    def scout_host(self, base_url, recon_type):
        """
        Perform the actual HTTP request to obtain swift recon telemetry.

        :param base_url: the base url of the host you wish to check. str of
                         the format 'http://127.0.0.1:6000/recon/'
        :param recon_type: the swift recon check to request.
        :returns: tuple of (recon url used, response body, and status).
                  On success the body is the decoded JSON and status is 200.
                  On an HTTP error the body is the exception and status is
                  the HTTP error code. On any other request or decoding
                  failure the body is the exception and status is -1.
        """
        url = base_url + recon_type
        try:
            response = urllib2.urlopen(url, timeout=self.timeout)
            try:
                body = response.read()
            finally:
                # release the connection even if the read fails
                response.close()
            content = json.loads(body)
            if self.verbose:
                print("-> %s: %s" % (url, content))
            status = 200
        except urllib2.HTTPError as err:
            # must stay ahead of the broader handler below, HTTPError being
            # a URLError (and therefore an IOError) itself
            self._report_error(url, err)
            content = err
            status = err.code
        except (IOError, ValueError) as err:
            # IOError covers urllib2.URLError as well as socket errors and
            # timeouts raised while reading the body; ValueError covers a
            # body that is not valid JSON. One misbehaving host must not
            # abort the scan of the remaining hosts.
            self._report_error(url, err)
            content = err
            status = -1
        return url, content, status

    def scout(self, host):
        """
        Obtain telemetry from a host running the swift recon middleware.

        :param host: host to check, as an (ip, port) pair
        :returns: tuple of (recon url used, response body, and status)
        """
        base_url = "http://%s:%s/recon/" % (host[0], host[1])
        return self.scout_host(base_url, self.recon_type)
|
||||
|
||||
|
||||
def get_devices(zone_filter, ring_file):
|
||||
class SwiftRecon(object):
|
||||
"""
|
||||
Retrieve and report cluster info from hosts running recon middleware.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
self.verbose = False
|
||||
self.suppress_errors = False
|
||||
self.timeout = 5
|
||||
self.pool_size = 30
|
||||
self.pool = eventlet.GreenPool(self.pool_size)
|
||||
|
||||
def get_devices(self, zone_filter, ring_file):
|
||||
"""
|
||||
Get a list of hosts in the ring
|
||||
|
||||
:param zone_filter: Only list zones matching given filter
|
||||
:param ring_file: Ring file to obtain hosts from
|
||||
:returns: a set of tuples containing the ip and port of hosts
|
||||
"""
|
||||
ring_data = Ring(ring_file)
|
||||
if zone_filter:
|
||||
ips = set((n['ip'], n['port']) for n in ring_data.devs if n \
|
||||
@ -28,78 +104,14 @@ def get_devices(zone_filter, ring_file):
|
||||
ips = set((n['ip'], n['port']) for n in ring_data.devs if n)
|
||||
return ips
|
||||
|
||||
def get_ringmd5(self, hosts, ringfile):
|
||||
"""
|
||||
Compare ring md5sum's with those on remote host
|
||||
|
||||
def scout(base_url, recon_type):
|
||||
global VERBOSE, SUPPRESS_ERRORS
|
||||
url = base_url + recon_type
|
||||
try:
|
||||
body = urllib2.urlopen(url, timeout=TIMEOUT).read()
|
||||
content = json.loads(body)
|
||||
if VERBOSE:
|
||||
print "-> %s: %s" % (url, content)
|
||||
status = 200
|
||||
except urllib2.HTTPError as e:
|
||||
if not SUPPRESS_ERRORS or VERBOSE:
|
||||
print "-> %s: %s" % (url, e)
|
||||
content = e
|
||||
status = e.code
|
||||
except urllib2.URLError as e:
|
||||
if not SUPPRESS_ERRORS or VERBOSE:
|
||||
print "-> %s: %s" % (url, e)
|
||||
content = e
|
||||
status = -1
|
||||
return url, content, status
|
||||
|
||||
|
||||
def scout_md5(host):
|
||||
base_url = "http://%s:%s/recon/" % (host[0], host[1])
|
||||
url, content, status = scout(base_url, "ringmd5")
|
||||
return url, content, status
|
||||
|
||||
|
||||
def scout_async(host):
|
||||
base_url = "http://%s:%s/recon/" % (host[0], host[1])
|
||||
url, content, status = scout(base_url, "async")
|
||||
return url, content, status
|
||||
|
||||
|
||||
def scout_replication(host):
|
||||
base_url = "http://%s:%s/recon/" % (host[0], host[1])
|
||||
url, content, status = scout(base_url, "replication")
|
||||
return url, content, status
|
||||
|
||||
|
||||
def scout_load(host):
|
||||
base_url = "http://%s:%s/recon/" % (host[0], host[1])
|
||||
url, content, status = scout(base_url, "load")
|
||||
return url, content, status
|
||||
|
||||
|
||||
def scout_du(host):
|
||||
base_url = "http://%s:%s/recon/" % (host[0], host[1])
|
||||
url, content, status = scout(base_url, "diskusage")
|
||||
return url, content, status
|
||||
|
||||
|
||||
def scout_umount(host):
|
||||
base_url = "http://%s:%s/recon/" % (host[0], host[1])
|
||||
url, content, status = scout(base_url, "unmounted")
|
||||
return url, content, status
|
||||
|
||||
|
||||
def scout_quarantine(host):
|
||||
base_url = "http://%s:%s/recon/" % (host[0], host[1])
|
||||
url, content, status = scout(base_url, "quarantined")
|
||||
return url, content, status
|
||||
|
||||
|
||||
def scout_sockstat(host):
|
||||
base_url = "http://%s:%s/recon/" % (host[0], host[1])
|
||||
url, content, status = scout(base_url, "sockstat")
|
||||
return url, content, status
|
||||
|
||||
|
||||
def get_ringmd5(hosts, ringfile):
|
||||
:param hosts: set of hosts to check. in the format of:
|
||||
set([('127.0.0.1', 6020), ('127.0.0.2', 6030)])
|
||||
:param ringfile: The local ring file to compare the md5sum with.
|
||||
"""
|
||||
stats = {}
|
||||
matches = 0
|
||||
errors = 0
|
||||
@ -110,22 +122,21 @@ def get_ringmd5(hosts, ringfile):
|
||||
md5sum.update(block)
|
||||
block = f.read(4096)
|
||||
ring_sum = md5sum.hexdigest()
|
||||
pool = eventlet.GreenPool(20)
|
||||
recon = Scout("ringmd5", self.verbose, self.suppress_errors,
|
||||
self.timeout)
|
||||
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
||||
print "[%s] Checking ring md5sum's on %s hosts..." % (now, len(hosts))
|
||||
if VERBOSE:
|
||||
if self.verbose:
|
||||
print "-> On disk md5sum: %s" % ring_sum
|
||||
for url, response, status in pool.imap(scout_md5, hosts):
|
||||
for url, response, status in self.pool.imap(recon.scout, hosts):
|
||||
if status == 200:
|
||||
#fixme - need to grab from config
|
||||
stats[url] = response[ringfile]
|
||||
if response[ringfile] != ring_sum:
|
||||
ringsmatch = False
|
||||
print "!! %s (%s) doesn't match on disk md5sum" % \
|
||||
(url, response[ringfile])
|
||||
else:
|
||||
matches = matches + 1
|
||||
if VERBOSE:
|
||||
if self.verbose:
|
||||
print "-> %s matches." % url
|
||||
else:
|
||||
errors = errors + 1
|
||||
@ -133,13 +144,19 @@ def get_ringmd5(hosts, ringfile):
|
||||
(matches, len(hosts), errors)
|
||||
print "=" * 79
|
||||
|
||||
def async_check(self, hosts):
|
||||
"""
|
||||
Obtain and print async pending statistics
|
||||
|
||||
def async_check(hosts):
|
||||
:param hosts: set of hosts to check. in the format of:
|
||||
set([('127.0.0.1', 6020), ('127.0.0.2', 6030)])
|
||||
"""
|
||||
stats = {}
|
||||
pool = eventlet.GreenPool(20)
|
||||
recon = Scout("async", self.verbose, self.suppress_errors,
|
||||
self.timeout)
|
||||
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
||||
print "[%s] Checking async pendings on %s hosts..." % (now, len(hosts))
|
||||
for url, response, status in pool.imap(scout_async, hosts):
|
||||
for url, response, status in self.pool.imap(recon.scout, hosts):
|
||||
if status == 200:
|
||||
stats[url] = response['async_pending']
|
||||
if len(stats) > 0:
|
||||
@ -153,27 +170,42 @@ def async_check(hosts):
|
||||
print "Error: No hosts available or returned valid information."
|
||||
print "=" * 79
|
||||
|
||||
def umount_check(self, hosts):
|
||||
"""
|
||||
Check for and print unmounted drives
|
||||
|
||||
def umount_check(hosts):
|
||||
:param hosts: set of hosts to check. in the format of:
|
||||
set([('127.0.0.1', 6020), ('127.0.0.2', 6030)])
|
||||
"""
|
||||
stats = {}
|
||||
pool = eventlet.GreenPool(20)
|
||||
recon = Scout("unmounted", self.verbose, self.suppress_errors,
|
||||
self.timeout)
|
||||
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
||||
print "[%s] Getting unmounted drives from %s hosts..." % (now, len(hosts))
|
||||
for url, response, status in pool.imap(scout_umount, hosts):
|
||||
print "[%s] Getting unmounted drives from %s hosts..." % \
|
||||
(now, len(hosts))
|
||||
for url, response, status in self.pool.imap(recon.scout, hosts):
|
||||
if status == 200:
|
||||
for i in response:
|
||||
stats[url] = i['device']
|
||||
for host in stats:
|
||||
print "Not mounted: %s on %s" % (stats[host], host)
|
||||
node = urlparse(host).netloc
|
||||
print "Not mounted: %s on %s" % (stats[host], node)
|
||||
print "=" * 79
|
||||
|
||||
def replication_check(self, hosts):
|
||||
"""
|
||||
Obtain and print replication statistics
|
||||
|
||||
def replication_check(hosts):
|
||||
:param hosts: set of hosts to check. in the format of:
|
||||
set([('127.0.0.1', 6020), ('127.0.0.2', 6030)])
|
||||
"""
|
||||
stats = {}
|
||||
pool = eventlet.GreenPool(20)
|
||||
recon = Scout("replication", self.verbose, self.suppress_errors,
|
||||
self.timeout)
|
||||
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
||||
print "[%s] Checking replication times on %s hosts..." % (now, len(hosts))
|
||||
for url, response, status in pool.imap(scout_replication, hosts):
|
||||
print "[%s] Checking replication times on %s hosts..." % \
|
||||
(now, len(hosts))
|
||||
for url, response, status in self.pool.imap(recon.scout, hosts):
|
||||
if status == 200:
|
||||
stats[url] = response['object_replication_time']
|
||||
if len(stats) > 0:
|
||||
@ -187,15 +219,21 @@ def replication_check(hosts):
|
||||
print "Error: No hosts available or returned valid information."
|
||||
print "=" * 79
|
||||
|
||||
def load_check(self, hosts):
|
||||
"""
|
||||
Obtain and print load average statistics
|
||||
|
||||
def load_check(hosts):
|
||||
:param hosts: set of hosts to check. in the format of:
|
||||
set([('127.0.0.1', 6020), ('127.0.0.2', 6030)])
|
||||
"""
|
||||
load1 = {}
|
||||
load5 = {}
|
||||
load15 = {}
|
||||
pool = eventlet.GreenPool(20)
|
||||
recon = Scout("load", self.verbose, self.suppress_errors,
|
||||
self.timeout)
|
||||
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
||||
print "[%s] Checking load avg's on %s hosts..." % (now, len(hosts))
|
||||
for url, response, status in pool.imap(scout_load, hosts):
|
||||
for url, response, status in self.pool.imap(recon.scout, hosts):
|
||||
if status == 200:
|
||||
load1[url] = response['1m']
|
||||
load5[url] = response['5m']
|
||||
@ -210,18 +248,24 @@ def load_check(hosts):
|
||||
print "[%s load average] lowest: %s, highest: %s, avg: %s" % \
|
||||
(item, low, high, average)
|
||||
else:
|
||||
print "Error: No hosts available or returned valid information."
|
||||
print "Error: No hosts available or returned valid info."
|
||||
print "=" * 79
|
||||
|
||||
def quarantine_check(self, hosts):
|
||||
"""
|
||||
Obtain and print quarantine statistics
|
||||
|
||||
def quarantine_check(hosts):
|
||||
:param hosts: set of hosts to check. in the format of:
|
||||
set([('127.0.0.1', 6020), ('127.0.0.2', 6030)])
|
||||
"""
|
||||
objq = {}
|
||||
conq = {}
|
||||
acctq = {}
|
||||
pool = eventlet.GreenPool(20)
|
||||
recon = Scout("quarantined", self.verbose, self.suppress_errors,
|
||||
self.timeout)
|
||||
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
||||
print "[%s] Checking quarantine dirs on %s hosts..." % (now, len(hosts))
|
||||
for url, response, status in pool.imap(scout_quarantine, hosts):
|
||||
print "[%s] Checking quarantine on %s hosts..." % (now, len(hosts))
|
||||
for url, response, status in self.pool.imap(recon.scout, hosts):
|
||||
if status == 200:
|
||||
objq[url] = response['objects']
|
||||
conq[url] = response['containers']
|
||||
@ -233,31 +277,38 @@ def quarantine_check(hosts):
|
||||
high = max(stats[item].values())
|
||||
total = sum(stats[item].values())
|
||||
average = total / len(stats[item])
|
||||
print "[Quarantined %s] low: %d, high: %d, avg: %d, total: %d" % \
|
||||
(item, low, high, average, total)
|
||||
print ("[Quarantined %s] low: %d, high: %d, avg: %d, total: %d"
|
||||
% (item, low, high, average, total))
|
||||
else:
|
||||
print "Error: No hosts available or returned valid information."
|
||||
print "Error: No hosts available or returned valid info."
|
||||
print "=" * 79
|
||||
|
||||
def socket_usage(self, hosts):
|
||||
"""
|
||||
Obtain and print /proc/net/sockstat statistics
|
||||
|
||||
def socket_usage(hosts):
|
||||
:param hosts: set of hosts to check. in the format of:
|
||||
set([('127.0.0.1', 6020), ('127.0.0.2', 6030)])
|
||||
"""
|
||||
inuse4 = {}
|
||||
mem = {}
|
||||
inuse6 = {}
|
||||
timewait = {}
|
||||
orphan = {}
|
||||
pool = eventlet.GreenPool(20)
|
||||
recon = Scout("sockstat", self.verbose, self.suppress_errors,
|
||||
self.timeout)
|
||||
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
||||
print "[%s] Checking socket usage on %s hosts..." % (now, len(hosts))
|
||||
for url, response, status in pool.imap(scout_sockstat, hosts):
|
||||
for url, response, status in self.pool.imap(recon.scout, hosts):
|
||||
if status == 200:
|
||||
inuse4[url] = response['tcp_in_use']
|
||||
mem[url] = response['tcp_mem_allocated_bytes']
|
||||
inuse6[url] = response['tcp6_in_use']
|
||||
timewait[url] = response['time_wait']
|
||||
orphan[url] = response['orphan']
|
||||
stats = {"tcp_in_use": inuse4, "tcp_mem_allocated_bytes": mem, \
|
||||
"tcp6_in_use": inuse6, "time_wait": timewait, "orphan": orphan}
|
||||
stats = {"tcp_in_use": inuse4, "tcp_mem_allocated_bytes": mem,
|
||||
"tcp6_in_use": inuse6, "time_wait": timewait,
|
||||
"orphan": orphan}
|
||||
for item in stats:
|
||||
if len(stats[item]) > 0:
|
||||
low = min(stats[item].values())
|
||||
@ -270,22 +321,29 @@ def socket_usage(hosts):
|
||||
print "Error: No hosts or info available."
|
||||
print "=" * 79
|
||||
|
||||
def disk_usage(self, hosts):
|
||||
"""
|
||||
Obtain and print disk usage statistics
|
||||
|
||||
def disk_usage(hosts):
|
||||
:param hosts: set of hosts to check. in the format of:
|
||||
set([('127.0.0.1', 6020), ('127.0.0.2', 6030)])
|
||||
"""
|
||||
stats = {}
|
||||
highs = []
|
||||
lows = []
|
||||
averages = []
|
||||
percents = {}
|
||||
pool = eventlet.GreenPool(20)
|
||||
recon = Scout("diskusage", self.verbose, self.suppress_errors,
|
||||
self.timeout)
|
||||
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
||||
print "[%s] Checking disk usage on %s hosts..." % (now, len(hosts))
|
||||
for url, response, status in pool.imap(scout_du, hosts):
|
||||
for url, response, status in self.pool.imap(recon.scout, hosts):
|
||||
if status == 200:
|
||||
hostusage = []
|
||||
for entry in response:
|
||||
if entry['mounted']:
|
||||
used = float(entry['used']) / float(entry['size']) * 100.0
|
||||
used = float(entry['used']) / float(entry['size']) \
|
||||
* 100.0
|
||||
hostusage.append(round(used, 2))
|
||||
stats[url] = hostusage
|
||||
|
||||
@ -308,22 +366,22 @@ def disk_usage(hosts):
|
||||
low = min(lows)
|
||||
high = max(highs)
|
||||
average = sum(averages) / len(averages)
|
||||
#distrib graph shamelessly stolen from https://github.com/gholt/tcod
|
||||
#dist graph shamelessly stolen from https://github.com/gholt/tcod
|
||||
print "Distribution Graph:"
|
||||
mul = 69.0 / max(percents.values())
|
||||
for percent in sorted(percents):
|
||||
print '% 3d%% % 4d %s' % (percent, percents[percent], \
|
||||
print '% 3d%%%5d %s' % (percent, percents[percent], \
|
||||
'*' * int(percents[percent] * mul))
|
||||
|
||||
print "Disk usage: lowest: %s%%, highest: %s%%, avg: %s%%" % \
|
||||
(low, high, average)
|
||||
else:
|
||||
print "Error: No hosts available or returned valid information."
|
||||
print "=" * 79
|
||||
|
||||
|
||||
def main():
|
||||
global VERBOSE, SUPPRESS_ERRORS, TIMEOUT, swift_dir, pool
|
||||
def main(self):
|
||||
"""
|
||||
Retrieve and report cluster info from hosts running recon middleware.
|
||||
"""
|
||||
print "=" * 79
|
||||
usage = '''
|
||||
usage: %prog [-v] [--suppress] [-a] [-r] [-u] [-d] [-l] [--objmd5]
|
||||
@ -350,65 +408,63 @@ def main():
|
||||
args.add_option('--sockstat', action="store_true",
|
||||
help="Get cluster socket usage stats")
|
||||
args.add_option('--all', action="store_true",
|
||||
help="Perform all checks. Equivalent to -arudlq --objmd5 --sockstat")
|
||||
help="Perform all checks. Equal to -arudlq --objmd5 --sockstat")
|
||||
args.add_option('--zone', '-z', type="int",
|
||||
help="Only query servers in specified zone")
|
||||
args.add_option('--timeout', '-t', type="int", metavar="SECONDS",
|
||||
help="Time to wait for a response from a server")
|
||||
|
||||
help="Time to wait for a response from a server", default=5)
|
||||
args.add_option('--swiftdir', default="/etc/swift",
|
||||
help="Default = /etc/swift")
|
||||
options, arguments = args.parse_args()
|
||||
|
||||
|
||||
if len(sys.argv) <= 1:
|
||||
args.print_help()
|
||||
sys.exit(0)
|
||||
|
||||
swift_dir = options.swiftdir
|
||||
obj_ring = os.path.join(swift_dir, 'object.ring.gz')
|
||||
con_ring = os.path.join(swift_dir, 'container.ring.gz')
|
||||
acct_ring = os.path.join(swift_dir, 'account.ring.gz')
|
||||
|
||||
VERBOSE = options.verbose
|
||||
SUPPRESS_ERRORS = options.suppress
|
||||
self.verbose = options.verbose
|
||||
self.suppress_errors = options.suppress
|
||||
self.timeout = options.timeout
|
||||
|
||||
if options.zone:
|
||||
hosts = get_devices(options.zone, obj_ring)
|
||||
hosts = self.get_devices(options.zone, obj_ring)
|
||||
else:
|
||||
hosts = get_devices(None, obj_ring)
|
||||
|
||||
if options.timeout:
|
||||
TIMEOUT = options.timeout
|
||||
hosts = self.get_devices(None, obj_ring)
|
||||
|
||||
if options.all:
|
||||
async_check(hosts)
|
||||
umount_check(hosts)
|
||||
replication_check(hosts)
|
||||
load_check(hosts)
|
||||
disk_usage(hosts)
|
||||
get_ringmd5(hosts, obj_ring)
|
||||
quarantine_check(hosts)
|
||||
socket_usage(hosts)
|
||||
self.async_check(hosts)
|
||||
self.umount_check(hosts)
|
||||
self.replication_check(hosts)
|
||||
self.load_check(hosts)
|
||||
self.disk_usage(hosts)
|
||||
self.get_ringmd5(hosts, obj_ring)
|
||||
self.quarantine_check(hosts)
|
||||
self.socket_usage(hosts)
|
||||
else:
|
||||
if options.async:
|
||||
async_check(hosts)
|
||||
self.async_check(hosts)
|
||||
if options.unmounted:
|
||||
umount_check(hosts)
|
||||
self.umount_check(hosts)
|
||||
if options.replication:
|
||||
replication_check(hosts)
|
||||
self.replication_check(hosts)
|
||||
if options.loadstats:
|
||||
load_check(hosts)
|
||||
self.load_check(hosts)
|
||||
if options.diskusage:
|
||||
disk_usage(hosts)
|
||||
self.disk_usage(hosts)
|
||||
if options.objmd5:
|
||||
get_ringmd5(hosts, obj_ring)
|
||||
self.get_ringmd5(hosts, obj_ring)
|
||||
if options.quarantined:
|
||||
quarantine_check(hosts)
|
||||
self.quarantine_check(hosts)
|
||||
if options.sockstat:
|
||||
socket_usage(hosts)
|
||||
self.socket_usage(hosts)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
try:
|
||||
main()
|
||||
reconnoiter = SwiftRecon()
|
||||
reconnoiter.main()
|
||||
except KeyboardInterrupt:
|
||||
print '\n'
|
||||
|
Loading…
Reference in New Issue
Block a user