elastic-recheck/elastic_recheck/cmd/graph.py
Matt Riedemann 57aab15c11 Retry change queries on a 502 response
When gerrit is running slowly we get 502 responses
back, which kills the graph builder. We can retry
these requests from the client to keep going. Generally,
a single retry fixes it.

Change-Id: I745d7c9b80ab8861972193d82c037df76af69e06
2017-10-09 20:18:58 -04:00

251 lines
9.6 KiB
Python
Executable File

#!/usr/bin/env python
# Copyright 2013 OpenStack Foundation
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import argparse
from datetime import datetime
import json
import os
import sys
from launchpadlib import launchpad
import pyelasticsearch
import pytz
import requests
try:
    # Disable InsecurePlatformWarning warnings as documented here:
    # https://github.com/kennethreitz/requests/issues/2214
    # First try the copy of urllib3 that is bundled with requests.
    from requests.packages.urllib3.exceptions import InsecurePlatformWarning
    requests.packages.urllib3.disable_warnings(InsecurePlatformWarning)
except ImportError:
    # If there's an import error, then urllib3 may be packaged
    # separately, so silence the warning on the standalone module instead.
    import urllib3
    from urllib3.exceptions import InsecurePlatformWarning
    urllib3.disable_warnings(InsecurePlatformWarning)
import elastic_recheck.config as er_conf
import elastic_recheck.elasticRecheck as er
from elastic_recheck import log as logging
import elastic_recheck.query_builder as qb
import elastic_recheck.results as er_results
# Width of one graph bucket in milliseconds (3600000 ms == one hour); main()
# uses it as the range() step when walking millisecond timestamps.
STEP = 3600000
# Cache directory passed to launchpadlib's anonymous login.
LPCACHEDIR = os.path.expanduser('~/.launchpadlib/cache')
# Logger shared by every function in this module.
LOG = logging.getLogger('ergraph')
def get_launchpad_bug(bug):
    """Summarize a Launchpad bug for the graph page.

    :param bug: the Launchpad bug number to look up.
    :returns: a dict with three keys: ``name`` (the bug title),
        ``affects`` (a comma separated string holding one
        "(target - status)" entry per bug task) and ``reviews`` (the
        numbers of the open gerrit changes that mention the bug).
        Placeholder values are returned when the bug cannot be looked
        up or when an HTTP request fails.
    """
    try:
        lp = launchpad.Launchpad.login_anonymously(
            'grabbing bugs', 'production', LPCACHEDIR)
        lp_bug = lp.bugs[bug]
        title = lp_bug.title
        affected = ", ".join(
            ["(%s - %s)" % (task.bug_target_name, task.status)
             for task in lp_bug.bug_tasks])
        open_reviews = get_open_reviews(bug)
    except KeyError:
        # if someone makes a bug private, we lose access to it.
        return {'name': 'Unknown (Private Bug)',
                'affects': 'Unknown (Private Bug)',
                'reviews': []}
    except requests.exceptions.RequestException:
        # Keep building the graph data even if an HTTP request failed.
        LOG.exception("Failed to get Launchpad data for bug %s", bug)
        return {'name': 'Unable to get launchpad data',
                'affects': 'Unknown',
                'reviews': []}
    return {'name': title, 'affects': affected, 'reviews': open_reviews}
def get_open_reviews(bug_number, attempt=0):
    """Return the list of open gerrit reviews for a given bug.

    :param bug_number: bug number to search for in change messages.
    :param attempt: number of retries already used up; a query that comes
        back with a 502 is retried for as long as this is below 3.
    :returns: list of the ``_number`` value of every matching change.
    :raises ValueError: if the final response body is not valid JSON.
    """
    url = ("https://review.openstack.org:443/changes/"
           "?q=status:open++message:`%s`+NOT+"
           "project:openstack-infra/elastic-recheck" % bug_number)
    while True:
        response = requests.get(url)
        # If we got a proxy error let's retry until we're out of attempts.
        if response.status_code != 502 or attempt >= 3:
            break
        attempt += 1
        LOG.debug('Retry changes query for bug %s. Attempt %s of 3.',
                  bug_number, attempt)

    # Drop the first four characters because 'the JSON response body starts
    # with a magic prefix line that must be stripped before feeding the rest
    # of the response body to a JSON parser'
    # https://review.openstack.org/Documentation/rest-api.html
    try:
        changes = json.loads(response.text[4:])
    except ValueError:
        LOG.debug("gerrit response '%s' is not valid JSON",
                  response.text.strip())
        raise
    return [change['_number'] for change in changes]
def main():
    """Generate the JSON data consumed by the elastic-recheck graphs.

    Parses the command line, then runs every query known to the
    classifier and buckets the hits per build status into STEP sized
    slots covering the last ``days`` days. The per-bug results, together
    with some header data (current time, time of the most recently
    indexed event, how far indexing is behind, cluster health), are
    serialized as JSON and written to the file given with ``-o``, or to
    stdout when ``-o`` is omitted.
    """
    parser = argparse.ArgumentParser(description='Generate data for graphs.')
    parser.add_argument(dest='queries',
                        help='path to query file')
    parser.add_argument('-o', dest='output',
                        help='output filename. Omit for stdout')
    parser.add_argument('-q', dest='queue',
                        help='limit results to a build queue regex')
    parser.add_argument('--es-query-suffix',
                        help='further limit results with an '
                        'elastic search query suffix. This will be ANDed '
                        'to all queries. '
                        'For example, to limit all queries to a '
                        'specific branch use: '
                        ' --es-query-suffix "build_branch:\\"stable/'
                        'liberty\\""')
    parser.add_argument('-c', '--conf', help="Elastic Recheck Configuration "
                        "file to use for data_source options such as "
                        "elastic search url, logstash url, and database "
                        "uri.")
    parser.add_argument('-v', dest='verbose',
                        action='store_true', default=False,
                        help='print out details as we go')
    args = parser.parse_args()

    config = er_conf.Config(config_file=args.conf)

    classifier = er.Classifier(args.queries, config=config)

    buglist = []

    # if you don't hate timezones, you don't program enough
    epoch = datetime.utcfromtimestamp(0).replace(tzinfo=pytz.utc)
    ts = datetime.utcnow().replace(tzinfo=pytz.utc)
    # rawnow is useful for sending to javascript
    rawnow = int(((ts - epoch).total_seconds()) * 1000)

    # Truncate to the start of the current hour so buckets line up on STEP.
    ts = datetime(ts.year, ts.month, ts.day, ts.hour).replace(tzinfo=pytz.utc)
    # ms since epoch
    now = int(((ts - epoch).total_seconds()) * 1000)
    # number of days to match to, this should be the same as we are
    # indexing in logstash
    days = 10
    # How far back to start in the graphs
    start = now - (days * 24 * STEP)
    # ER timeframe for search, in seconds. Floor division keeps this an
    # integer on Python 3 as well; the value is unchanged on Python 2.
    timeframe = days * 24 * STEP // 1000

    last_indexed = int(
        ((classifier.most_recent() - epoch).total_seconds()) * 1000)
    behind = now - last_indexed

    # the data we're going to return, including interesting headers
    jsondata = {
        'now': rawnow,
        'last_indexed': last_indexed,
        'behind': behind,
        'buglist': []
    }

    # Get the cluster health for the header
    es = pyelasticsearch.ElasticSearch(config.es_url)
    jsondata['status'] = es.health()['status']

    for query in classifier.queries:
        # The command line filters are appended to the query in place, so
        # both the logstash URL and the search below see the narrowed query.
        if args.queue:
            query['query'] += ' AND build_queue:%s' % args.queue
        if args.es_query_suffix:
            query['query'] += ' AND (%s)' % args.es_query_suffix

        if query.get('suppress-graph'):
            continue
        if args.verbose:
            LOG.debug("Starting query for bug %s", query['bug'])
        logstash_query = qb.encode_logstash_query(query['query'],
                                                  timeframe=timeframe)
        logstash_url = ("%s/#/dashboard/file/logstash.json?%s"
                        % (config.ls_url, logstash_query))
        bug_data = get_launchpad_bug(query['bug'])
        bug = dict(number=query['bug'],
                   query=query['query'],
                   logstash_url=logstash_url,
                   bug_data=bug_data,
                   fails=0,
                   fails24=0,
                   data=[],
                   voting=(False if query.get('allow-nonvoting') else True))
        # The bug is listed even if the search below fails; it then simply
        # keeps its zero counters and empty data.
        buglist.append(bug)
        try:
            results = classifier.hits_by_query(query['query'],
                                               args.queue,
                                               size=3000,
                                               days=days)
        except pyelasticsearch.exceptions.InvalidJsonResponseError:
            LOG.exception("Invalid Json while collecting metrics for query %s",
                          query['query'])
            continue
        except requests.exceptions.ReadTimeout:
            LOG.exception("Timeout while collecting metrics for query %s",
                          query['query'])
            continue

        # Total failures: one entry per build_uuid under FAILURE.
        facets_for_fail = er_results.FacetSet()
        facets_for_fail.detect_facets(results,
                                      ["build_status", "build_uuid"])
        if "FAILURE" in facets_for_fail:
            bug['fails'] = len(facets_for_fail['FAILURE'])

        facets = er_results.FacetSet()
        facets.detect_facets(results,
                             ["build_status", "timestamp", "build_uuid"])

        for status in facets.keys():
            data = []
            # One point per STEP between start and now; slots without any
            # hit are filled with an explicit zero.
            for slot in range(start, now, STEP):
                if slot in facets[status]:
                    fails = len(facets[status][slot])
                    data.append([slot, fails])
                    # get the last 24 hr count as well, can't wait to have
                    # the pandas code and able to do it that way
                    if status == "FAILURE" and slot > (now - (24 * STEP)):
                        bug['fails24'] += fails
                else:
                    data.append([slot, 0])
            bug["data"].append(dict(label=status, data=data))

    # the sort order is a little odd, but basically sort by failures in
    # the last 24 hours, then with all failures for ones that we haven't
    # seen in the last 24 hours.
    buglist = sorted(buglist,
                     key=lambda bug: -(bug['fails24'] * 100000 + bug['fails']))

    jsondata['buglist'] = buglist

    if args.output:
        # Serialize before opening the file so that a serialization error
        # cannot leave a truncated file behind. The json output is indented
        # when we're writing to a file.
        payload = json.dumps(jsondata, indent=4)
        with open(args.output, 'w') as out:
            out.write(payload)
    else:
        # stdout is not ours to close; flush it instead so the data is out
        # by the time we return.
        sys.stdout.write(json.dumps(jsondata))
        sys.stdout.flush()
# Only generate the graph data when run as a script, not when imported.
if __name__ == "__main__":
    main()