Rewrite maintainers.py functionality

Base it on maintainer-only votes cast within the requisite timeframe
rather than attempting to parse Gerrit ACLs and query groups.

Change-Id: I982cb2e422f267b2834b4b20b11f1fd011516548
This commit is contained in:
Jeremy Stanley
2025-03-21 22:14:45 +00:00
parent 3242fadc59
commit fed34c981b

View File

@@ -1,6 +1,4 @@
#!/usr/bin/env python # Copyright OpenDev Contributors
# Copyright (c) 2015 OpenStack Foundation
# #
# Licensed under the Apache License, Version 2.0 (the "License"); # Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License. # you may not use this file except in compliance with the License.
@@ -8,154 +6,169 @@
# #
# http://www.apache.org/licenses/LICENSE-2.0 # http://www.apache.org/licenses/LICENSE-2.0
# #
# Unless required by applicable law or agreed to in writing, software # Unless required by applicable law or agreed to in writing,
# distributed under the License is distributed on an "AS IS" BASIS, # software distributed under the License is distributed on an "AS
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or # IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# implied. # express or implied. See the License for the specific language
# See the License for the specific language governing permissions and # governing permissions and limitations under the License.
# limitations under the License.
# Description: When run using OpenStack's Gerrit server, this builds import datetime
# JSON and YAML representations of repos with information on the import os
# official owning project team if any, deliverable tags, and groups
# with approve rights listing the members of each along with their
# Gerrit preferred E-mail addresses and usernames when available.
# Rationale: It was done as a demonstration to a representative of a
# foundation member company who requested a list of the "core
# reviewers" for official projects, optionally broken down by
# integrated vs. other. I'm attempting to show that this data is
# already publicly available and can be extracted/analyzed by anyone
# without needing to request it.
# Use: This needs your Gerrit username passed as the command-line
# parameter, found at https://review.opendev.org/#/settings/ when
# authenticated in the WebUI. It also prompts for an HTTP password
# which https://review.opendev.org/#/settings/http-password will
# allow you to generate. The results end up in files named
# approvers.json and approvers.yaml. At the time of writing, it
# takes approximately 6.5 minutes to run on a well-connected machine
# with 70-80ms round-trip latency to review.opendev.org.
# Example:
#
# $ virtualenv approvers
# [...]
# $ ./approvers/bin/pip install pyyaml requests
# [...]
# $ ./approvers/bin/python tools/who-approves.py fungi
# Password:
# [wait for completion]
# $ ./approvers/bin/python
# >>> import yaml
# >>>
# >>> def get_approvers(repos):
# ... approvers = set()
# ... for repo in repos:
# ... for group in repos[repo]['approvers']:
# ... for approver in repos[repo]['approvers'][group]:
# ... approvers.add(approver)
# ... return(approvers)
# ...
# >>> p = yaml.safe_load(open('approvers.yaml'))
# >>> print('Total repos: %s' % len(p))
# Total repos: 751
# >>> print('Total approvers: %s' % len(get_approvers(p)))
# Total approvers: 849
# >>>
# >>> o = {k: v for k, v in p.iteritems() if 'team' in v}
# >>> print('Repos for official teams: %s' % len(o))
# Repos for official teams: 380
# >>> print('OpenStack repo approvers: %s' % len(get_approvers(o)))
# OpenStack repo approvers: 456
# >>>
# >>> i = {k: v for k, v in p.iteritems() if 'tags' in v
# ... and 'release:managed' in v['tags']}
# >>> print('Repos under release management: %s' % len(i))
# Repos under release management: 77
# >>> print('Managed release repo approvers: %s' % len(get_approvers(i)))
# Managed release repo approvers: 245
import getpass
import json
import re import re
import sys import sys
import requests from engagement.stats import (
from_gerrit_time,
get_projects,
query_gerrit,
report_times,
to_gerrit_time,
)
import yaml import yaml
def usage_error():
    """Write a generic usage message to stderr and exit nonzero"""

    message = (
        'ERROR: specify report period like YEAR, YEAR-H[1-2], YEAR-Q[1-4],\n'
        '    YEAR-[01-12], or YYYY-MM-DD..YYYY-MM-DD for a date range\n'
        '    (start date is inclusive, end date is exclusive)\n')
    sys.stderr.write(message)
    sys.exit(1)
acl_path = (
'gitweb?p=%s.git;a=blob_plain;f=project.config;hb=refs/meta/config')
def parse_report_period(when):
    """Parse a supplied report period string.

    Accepts YEAR, YEAR-H[1-2], YEAR-Q[1-4], YEAR-[01-12], or an explicit
    YYYY-MM-DD..YYYY-MM-DD range (start date inclusive, end date
    exclusive) and returns a tuple of (after, before) datetime objects.
    Exits via usage_error() when the string matches no known format.
    """

    daterange = re.compile(
        r'^(\d{4})-(\d{2})-(\d{2})\.\.(\d{4})-(\d{2})-(\d{2})$')
    monthly = re.compile(r'^(\d{4})-(\d{2})$')
    quarterly = re.compile(r'^(\d{4})-q([1-4])$', re.IGNORECASE)
    # A year only has two halves; allowing h3/h4 here would compute a
    # start month of 13 or 19 and crash the datetime() constructor below
    halfyearly = re.compile(r'^(\d{4})-h([1-2])$', re.IGNORECASE)
    yearly = re.compile(r'^\d{4}$')
    # TODO: merge this functionality into engagement.stats.parse_report_period

    # Match each pattern once and reuse the match object, rather than
    # re-running the same regex for every group() lookup
    if found := daterange.match(when):
        after = datetime.datetime(
            int(found.group(1)), int(found.group(2)), int(found.group(3)))
        before = datetime.datetime(
            int(found.group(4)), int(found.group(5)), int(found.group(6)))
        return after, before
    if found := monthly.match(when):
        start_year = int(found.group(1))
        start_month = int(found.group(2))
        span = 1
    elif found := quarterly.match(when):
        start_year = int(found.group(1))
        start_month = 1 + 3 * (int(found.group(2)) - 1)
        span = 3
    elif found := halfyearly.match(when):
        start_year = int(found.group(1))
        start_month = 1 + 6 * (int(found.group(2)) - 1)
        span = 6
    elif yearly.match(when):
        start_year = int(when)
        start_month = 1
        span = 12
    else:
        usage_error()
    # The period ends at the start of the first month after it, rolling
    # over the year boundary when the period runs through December
    last_month = start_month + span - 1
    end_year = start_year + last_month // 12
    end_month = 1 + last_month % 12
    after = datetime.datetime(start_year, start_month, 1)
    before = datetime.datetime(end_year, end_month, 1)
    return after, before
if 'name' in approver:
approver_details = '"%s"' % approver['name']
def parse_command_line():
    """Parse the command line to obtain the report period, then return it"""

    # Guard clause: anything other than exactly one argument is an error
    if len(sys.argv) != 2:
        usage_error()
    return sys.argv[1]
if 'email' in approver:
if approver_details:
def main(verbose=0):
    """Utility entry point.

    Queries Gerrit for all changes updated after the start of the report
    period, collects maintainer-level votes (Code-Review -2/+2 and
    Workflow +1) cast within the period, and writes a YAML report plus
    per-namespace text dumps under the maintainers/ directory.
    """

    argument = parse_command_line()
    after, before = parse_report_period(argument)
    changes = dict()

    # TODO: deduplicate this and the similar version in stats.main
    # Shard querying by project, to help with the inherent instability of
    # result pagination from the Gerrit API
    for project in get_projects(verbose=verbose):
        if verbose >= 1:
            print("Checking project: %s" % project)
        offset = 0
        # Loop due to unavoidable query result pagination
        while offset >= 0:
            # We only constrain the query by the after date, as changes created
            # between the before and after date may have been updated more
            # recently with a new revision or comment
            new_changes = query_gerrit("changes/", params={
                "q": "project:%s after:{%s}" % (
                    project, to_gerrit_time(after)),
                "no-limit": "1",
                "start": offset,
                "o": ["DETAILED_ACCOUNTS", "DETAILED_LABELS", "SKIP_DIFFSTAT"],
            }, verbose=verbose)
            # Since we redundantly query ranges with offsets to help combat
            # pagination instability, we must deduplicate results
            for change in new_changes:
                if change["id"] not in changes:
                    changes[change["id"]] = change
            # Offset additional pages by half the returned entry count to help
            # avoid missing changes due to pagination instability
            if new_changes and new_changes[-1].get("_more_changes", False):
                offset += len(new_changes) // 2
            else:
                offset = -1

    report = {"namespaces": dict()}
    report_times(report, after, before)
    maintainers = dict()
    for change in changes.values():
        # The namespace is the leading path component of the project name
        namespace = change["project"].split("/")[0]
        if namespace not in report["namespaces"]:
            report["namespaces"][namespace] = set()
        if "labels" in change:
            # Maintainer-only votes: blocking/approving code review and
            # the Workflow +1 used to submit changes
            for label, maintvotes in {
                    "Code-Review": (-2, 2), "Workflow": (1,)}.items():
                if label in change["labels"]:
                    for vote in change["labels"][label].get("all", []):
                        when = vote.get("date")
                        # Only count identifiable voters whose vote was
                        # cast within the requested report period
                        if ("name" in vote and "email" in vote
                                and vote.get("value", 0) in maintvotes and when
                                and after < from_gerrit_time(when) < before):
                            if namespace not in maintainers:
                                maintainers[namespace] = set()
                            maintainers[namespace].add('"%s" <%s>' % (
                                vote["name"], vote["email"]))
    for namespace in maintainers:
        report["namespaces"][namespace] = sorted(list(maintainers[namespace]))
    # Operate on a copy of the keys since we'll be altering the dict
    for namespace in list(report["namespaces"].keys()):
        # Cull inactive namespaces from the report
        if not report["namespaces"][namespace]:
            del report["namespaces"][namespace]

    # Write the full YAML structured data report; use a context manager
    # so the file handle is flushed and closed deterministically
    os.makedirs("maintainers", exist_ok=True)
    with open("maintainers/%s.yaml" % argument, "w") as reportfile:
        reportfile.write(yaml.dump(report))
    # Write per-namespace text dumps of names/addresses
    for namespace, maintlist in list(report["namespaces"].items()):
        with open("maintainers/%s_%s.txt" % (
                argument, namespace), "w", encoding="utf-8") as dumpfile:
            for maintainer in maintlist:
                dumpfile.write(maintainer + "\n")