elastic-recheck/elastic_recheck/elasticRecheck.py
Sean Dague 8ef26cbed2 objectify the gerrit event for our purposes
instead of passing around complex data structures, create an
event object for our purposes that means we can pass around the
payload relevant to us. Simplifies some things, and will make
adding build_uuid tighter.

Change-Id: I8172b25ae3c60e38d63cf7f4d8a0f6c854bae766
2014-01-20 16:25:07 -05:00

317 lines
11 KiB
Python
Executable File

#!/usr/bin/env python
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import gerritlib.gerrit
import pyelasticsearch
import ConfigParser
import datetime
import logging
import os
import re
import sys
import time
import elastic_recheck.loader as loader
import elastic_recheck.query_builder as qb
from elastic_recheck import results
LOG = logging.getLogger("recheckwatchbot")
ES_URL = "http://logstash.openstack.org/elasticsearch"
def required_files(job):
files = ['console.html']
if re.match("tempest-dsvm", job):
files.extend([
'logs/screen-n-api.txt',
'logs/screen-n-cpu.txt',
'logs/screen-n-sch.txt',
'logs/screen-c-api.txt',
'logs/screen-c-vol.txt',
'logs/syslog.txt'])
return files
class ConsoleNotReady(Exception):
def __init__(self, msg):
self.msg = msg
class FilesNotReady(Exception):
def __init__(self, msg):
self.msg = msg
class ResultTimedOut(Exception):
def __init__(self, msg):
self.msg = msg
class FailEvent(object):
change = None
rev = None
project = None
url = None
bugs = []
def __init__(self, event):
self.change = event['change']['number']
self.rev = event['patchSet']['number']
self.project = event['change']['project']
self.url = event['change']['url']
def name(self):
return "%s,%s" % (self.change, self.rev)
def bug_urls(self):
urls = ['https://bugs.launchpad.net/bugs/%s' % x for x in self.bugs]
return ' and '.join(urls)
class Stream(object):
"""Gerrit Stream.
Monitors gerrit stream looking for tempest-devstack failures.
"""
def __init__(self, user, host, key, thread=True):
port = 29418
self.gerrit = gerritlib.gerrit.Gerrit(host, user, port, key)
self.es = results.SearchEngine(ES_URL)
if thread:
self.gerrit.startWatching()
@staticmethod
def parse_jenkins_failure(event):
"""Is this comment a jenkins failure comment."""
if event.get('type', '') != 'comment-added':
LOG.debug("Skipping event type %s" % event.get('type', ''))
return False
username = event['author'].get('username', '')
if (username != 'jenkins'):
LOG.debug("Skipping comment from %s" %
event['author'].get('username', ''))
return False
if not ("Build failed. For information on how to proceed" in
event['comment']):
change = event['change']['number']
rev = event['patchSet']['number']
LOG.debug("Skipping passing job %s,%s" % (change, rev))
return False
failed_tests = {}
for line in event['comment'].split("\n"):
m = re.search("- ([\w-]+)\s*(http://\S+)\s*:\s*FAILURE", line)
if m:
failed_tests[m.group(1)] = m.group(2)
return failed_tests
def _job_console_uploaded(self, change, patch, name):
query = qb.result_ready(change, patch, name)
r = self.es.search(query, size='10')
if len(r) == 0:
msg = ("Console logs not ready for %s %s,%s" %
(name, change, patch))
raise ConsoleNotReady(msg)
else:
LOG.debug("Console ready for %s %s,%s" %
(name, change, patch))
def _has_required_files(self, change, patch, name):
query = qb.files_ready(change, patch)
r = self.es.search(query, size='80')
files = [x['term'] for x in r.terms]
required = required_files(name)
missing_files = [x for x in required if x not in files]
if len(missing_files) != 0:
msg = ("%s missing for %s %s,%s" % (
change, patch, name, missing_files))
raise FilesNotReady(msg)
def _is_openstack_project(self, event):
return "tempest-dsvm-full" in event["comment"]
def _does_es_have_data(self, change_number, patch_number, job_fails):
"""Wait till ElasticSearch is ready, but return False if timeout."""
NUMBER_OF_RETRIES = 20
SLEEP_TIME = 40
started_at = datetime.datetime.now()
# this checks that we've got the console log uploaded, need to retry
# in case ES goes bonkers on cold data, which it does some times.
for i in range(NUMBER_OF_RETRIES):
try:
for job_name in job_fails:
self._job_console_uploaded(
change_number, patch_number, job_name)
break
except ConsoleNotReady as e:
LOG.debug(e.msg)
time.sleep(SLEEP_TIME)
continue
except pyelasticsearch.exceptions.InvalidJsonResponseError:
# If ElasticSearch returns an error code, sleep and retry
# TODO(jogo): if this works pull out search into a helper
# function that does this.
LOG.exception(
"Elastic Search not responding on attempt %d" % i)
time.sleep(NUMBER_OF_RETRIES)
continue
if i == NUMBER_OF_RETRIES - 1:
elapsed = datetime.datetime.now() - started_at
msg = ("Console logs not available after %ss for %s %s,%s" %
(elapsed, job_name, change_number, patch_number))
raise ResultTimedOut(msg)
LOG.debug(
"Found hits for change_number: %s, patch_number: %s"
% (change_number, patch_number))
for i in range(NUMBER_OF_RETRIES):
try:
for job_name in job_fails:
self._has_required_files(
change_number, patch_number, job_name)
LOG.info(
"All files present for change_number: %s, patch_number: %s"
% (change_number, patch_number))
time.sleep(10)
return True
except FilesNotReady:
time.sleep(SLEEP_TIME)
# if we get to the end, we're broken
elapsed = datetime.datetime.now() - started_at
msg = ("Required files not ready after %ss for %s %d,%d" %
(elapsed, job_name, change_number, patch_number))
raise ResultTimedOut(msg)
def get_failed_tempest(self):
LOG.debug("entering get_failed_tempest")
while True:
event = self.gerrit.getEvent()
failed_jobs = Stream.parse_jenkins_failure(event)
if not failed_jobs:
# nothing to see here, lets try the next event
continue
# bail if it's not an openstack project
if not self._is_openstack_project(event):
continue
fevent = FailEvent(event)
LOG.info("Looking for failures in %s,%s on %s" %
(fevent.change, fevent.rev, ", ".join(failed_jobs)))
if self._does_es_have_data(fevent.change, fevent.rev, failed_jobs):
return fevent
def leave_comment(self, project, commit, bugs=None):
if bugs:
bug_urls = ['https://bugs.launchpad.net/bugs/%s' % x for x in bugs]
message = """I noticed tempest failed, I think you hit bug(s):
- %(bugs)s
We don't automatically recheck or reverify, so please consider
doing that manually if someone hasn't already. For a code review
which is not yet approved, you can recheck by leaving a code
review comment with just the text:
recheck bug %(bug)s
For a code review which has been approved but failed to merge,
you can reverify by leaving a comment like this:
reverify bug %(bug)s""" % {'bugs': "\n- ".join(bug_urls),
'bug': bugs[0]}
else:
message = ("I noticed tempest failed, refer to: "
"https://wiki.openstack.org/wiki/"
"GerritJenkinsGithub#Test_Failures")
self.gerrit.review(project, commit, message)
class Classifier():
"""Classify failed tempest-devstack jobs based.
Given a change and revision, query logstash with a list of known queries
that are mapped to specific bugs.
"""
queries = None
def __init__(self, queries_dir):
self.es = results.SearchEngine(ES_URL)
self.queries_dir = queries_dir
self.queries = loader.load(self.queries_dir)
def hits_by_query(self, query, facet=None, size=100):
es_query = qb.generic(query, facet=facet)
return self.es.search(es_query, size=size)
def classify(self, change_number, patch_number, skip_resolved=True):
"""Returns either empty list or list with matched bugs."""
LOG.debug("Entering classify")
#Reload each time
self.queries = loader.load(self.queries_dir, skip_resolved)
bug_matches = []
for x in self.queries:
LOG.debug(
"Looking for bug: https://bugs.launchpad.net/bugs/%s"
% x['bug'])
query = qb.single_patch(x['query'], change_number, patch_number)
results = self.es.search(query, size='10')
if len(results) > 0:
bug_matches.append(x['bug'])
return bug_matches
def main():
config = ConfigParser.ConfigParser()
if len(sys.argv) is 2:
config_path = sys.argv[1]
else:
config_path = 'elasticRecheck.conf'
config.read(config_path)
user = config.get('gerrit', 'user', 'jogo')
host = config.get('gerrit', 'host', 'review.openstack.org')
queries = config.get('gerrit', 'query_file', 'queries.yaml')
queries = os.path.expanduser(queries)
key = config.get('gerrit', 'key')
classifier = Classifier(queries)
stream = Stream(user, host, key)
while True:
event = stream.get_failed_tempest()
change = event['change']['number']
rev = event['patchSet']['number']
print "======================="
print "https://review.openstack.org/#/c/%(change)s/%(rev)s" % locals()
bug_numbers = classifier.classify(change, rev)
if not bug_numbers:
print "unable to classify failure"
else:
for bug_number in bug_numbers:
print("Found bug: https://bugs.launchpad.net/bugs/%s"
% bug_number)
if __name__ == "__main__":
main()