Rewrite maintainers.py functionality

Base it on maintainer-only votes cast within the requisite timeframe
rather than attempting to parse Gerrit ACLs and query groups.

Change-Id: I982cb2e422f267b2834b4b20b11f1fd011516548
This commit is contained in:
Jeremy Stanley
2025-03-21 22:14:45 +00:00
parent 3242fadc59
commit fed34c981b

View File

@@ -1,6 +1,4 @@
#!/usr/bin/env python # Copyright OpenDev Contributors
# Copyright (c) 2015 OpenStack Foundation
# #
# Licensed under the Apache License, Version 2.0 (the "License"); # Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License. # you may not use this file except in compliance with the License.
@@ -8,154 +6,169 @@
# #
# http://www.apache.org/licenses/LICENSE-2.0 # http://www.apache.org/licenses/LICENSE-2.0
# #
# Unless required by applicable law or agreed to in writing, software # Unless required by applicable law or agreed to in writing,
# distributed under the License is distributed on an "AS IS" BASIS, # software distributed under the License is distributed on an "AS
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or # IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# implied. # express or implied. See the License for the specific language
# See the License for the specific language governing permissions and # governing permissions and limitations under the License.
# limitations under the License.
# Description: When run using OpenStack's Gerrit server, this builds import datetime
# JSON and YAML representations of repos with information on the import os
# official owning project team if any, deliverable tags, and groups
# with approve rights listing the members of each along with their
# Gerrit preferred E-mail addresses and usernames when available.
# Rationale: It was done as a demonstration to a representative of a
# foundation member company who requested a list of the "core
# reviewers" for official projects, optionally broken down by
# integrated vs. other. I'm attempting to show that this data is
# already publicly available and can be extracted/analyzed by anyone
# without needing to request it.
# Use: This needs your Gerrit username passed as the command-line
# parameter, found at https://review.opendev.org/#/settings/ when
# authenticated in the WebUI. It also prompts for an HTTP password
# which https://review.opendev.org/#/settings/http-password will
# allow you to generate. The results end up in files named
# approvers.json and approvers.yaml. At the time of writing, it
# takes approximately 6.5 minutes to run on a well-connected machine
# with 70-80ms round-trip latency to review.opendev.org.
# Example:
#
# $ virtualenv approvers
# [...]
# $ ./approvers/bin/pip install pyyaml requests
# [...]
# $ ./approvers/bin/python tools/who-approves.py fungi
# Password:
# [wait for completion]
# $ ./approvers/bin/python
# >>> import yaml
# >>>
# >>> def get_approvers(repos):
# ... approvers = set()
# ... for repo in repos:
# ... for group in repos[repo]['approvers']:
# ... for approver in repos[repo]['approvers'][group]:
# ... approvers.add(approver)
# ... return(approvers)
# ...
# >>> p = yaml.safe_load(open('approvers.yaml'))
# >>> print('Total repos: %s' % len(p))
# Total repos: 751
# >>> print('Total approvers: %s' % len(get_approvers(p)))
# Total approvers: 849
# >>>
# >>> o = {k: v for k, v in p.iteritems() if 'team' in v}
# >>> print('Repos for official teams: %s' % len(o))
# Repos for official teams: 380
# >>> print('OpenStack repo approvers: %s' % len(get_approvers(o)))
# OpenStack repo approvers: 456
# >>>
# >>> i = {k: v for k, v in p.iteritems() if 'tags' in v
# ... and 'release:managed' in v['tags']}
# >>> print('Repos under release management: %s' % len(i))
# Repos under release management: 77
# >>> print('Managed release repo approvers: %s' % len(get_approvers(i)))
# Managed release repo approvers: 245
import getpass
import json
import re import re
import sys import sys
import requests from engagement.stats import (
from_gerrit_time,
get_projects,
query_gerrit,
report_times,
to_gerrit_time,
)
import yaml import yaml
def usage_error():
    """Write a generic usage message to stderr and exit nonzero"""

    message = (
        'ERROR: specify report period like YEAR, YEAR-H[1-2], YEAR-Q[1-4],\n'
        '    YEAR-[01-12], or YYYY-MM-DD..YYYY-MM-DD for a date range\n'
        '    (start date is inclusive, end date is exclusive)\n')
    sys.stderr.write(message)
    sys.exit(1)
acl_path = (
'gitweb?p=%s.git;a=blob_plain;f=project.config;hb=refs/meta/config')
def parse_report_period(when):
    """Parse a supplied report period string.

    Accepts YEAR, YEAR-H[1-2], YEAR-Q[1-4], YEAR-[01-12], or an explicit
    YYYY-MM-DD..YYYY-MM-DD range (start date inclusive, end date
    exclusive) and returns a tuple of (after, before) datetime objects.
    Exits via usage_error() when the string matches no known format.
    """

    daterange = re.compile(
        r'^(\d{4})-(\d{2})-(\d{2})\.\.(\d{4})-(\d{2})-(\d{2})$')
    monthly = re.compile(r'^(\d{4})-(\d{2})$')
    quarterly = re.compile(r'^(\d{4})-q([1-4])$', re.IGNORECASE)
    # A year only has two halves; allowing h3/h4 here would compute a
    # start month of 13 or 19 and crash the datetime() constructor below
    halfyearly = re.compile(r'^(\d{4})-h([1-2])$', re.IGNORECASE)
    yearly = re.compile(r'^\d{4}$')
    # TODO: merge this functionality into engagement.stats.parse_report_period

    # Match each pattern once and reuse the match object, rather than
    # re-running the same regex for every group() lookup
    if found := daterange.match(when):
        after = datetime.datetime(
            int(found.group(1)), int(found.group(2)), int(found.group(3)))
        before = datetime.datetime(
            int(found.group(4)), int(found.group(5)), int(found.group(6)))
        return after, before
    if found := monthly.match(when):
        start_year = int(found.group(1))
        start_month = int(found.group(2))
        span = 1
    elif found := quarterly.match(when):
        start_year = int(found.group(1))
        start_month = 1 + 3 * (int(found.group(2)) - 1)
        span = 3
    elif found := halfyearly.match(when):
        start_year = int(found.group(1))
        start_month = 1 + 6 * (int(found.group(2)) - 1)
        span = 6
    elif yearly.match(when):
        start_year = int(when)
        start_month = 1
        span = 12
    else:
        usage_error()
    # The period ends at the start of the first month after it, rolling
    # over the year boundary when the period runs through December
    last_month = start_month + span - 1
    end_year = start_year + last_month // 12
    end_month = 1 + last_month % 12
    after = datetime.datetime(start_year, start_month, 1)
    before = datetime.datetime(end_year, end_month, 1)
    return after, before
if 'name' in approver:
approver_details = '"%s"' % approver['name']
def parse_command_line():
    """Parse the command line to obtain the report period, then return it"""

    # Guard clause: anything other than exactly one argument is an error
    if len(sys.argv) != 2:
        usage_error()
    return sys.argv[1]
if 'email' in approver:
if approver_details:
def main(verbose=0):
    """Utility entry point.

    Queries Gerrit for all changes updated after the start of the report
    period, collects maintainer-level votes (Code-Review -2/+2 and
    Workflow +1) cast within the period, and writes a YAML report plus
    per-namespace text dumps under the maintainers/ directory.
    """

    argument = parse_command_line()
    after, before = parse_report_period(argument)
    changes = dict()

    # TODO: deduplicate this and the similar version in stats.main
    # Shard querying by project, to help with the inherent instability of
    # result pagination from the Gerrit API
    for project in get_projects(verbose=verbose):
        if verbose >= 1:
            print("Checking project: %s" % project)
        offset = 0
        # Loop due to unavoidable query result pagination
        while offset >= 0:
            # We only constrain the query by the after date, as changes created
            # between the before and after date may have been updated more
            # recently with a new revision or comment
            new_changes = query_gerrit("changes/", params={
                "q": "project:%s after:{%s}" % (
                    project, to_gerrit_time(after)),
                "no-limit": "1",
                "start": offset,
                "o": ["DETAILED_ACCOUNTS", "DETAILED_LABELS", "SKIP_DIFFSTAT"],
            }, verbose=verbose)
            # Since we redundantly query ranges with offsets to help combat
            # pagination instability, we must deduplicate results
            for change in new_changes:
                if change["id"] not in changes:
                    changes[change["id"]] = change
            # Offset additional pages by half the returned entry count to help
            # avoid missing changes due to pagination instability
            if new_changes and new_changes[-1].get("_more_changes", False):
                offset += len(new_changes) // 2
            else:
                offset = -1

    report = {"namespaces": dict()}
    report_times(report, after, before)
    maintainers = dict()
    for change in changes.values():
        # The namespace is the leading path component of the project name
        namespace = change["project"].split("/")[0]
        if namespace not in report["namespaces"]:
            report["namespaces"][namespace] = set()
        if "labels" in change:
            # Maintainer-only votes: blocking/approving code review and
            # the Workflow +1 used to submit changes
            for label, maintvotes in {
                    "Code-Review": (-2, 2), "Workflow": (1,)}.items():
                if label in change["labels"]:
                    for vote in change["labels"][label].get("all", []):
                        when = vote.get("date")
                        # Only count identifiable voters whose vote was
                        # cast within the requested report period
                        if ("name" in vote and "email" in vote
                                and vote.get("value", 0) in maintvotes and when
                                and after < from_gerrit_time(when) < before):
                            if namespace not in maintainers:
                                maintainers[namespace] = set()
                            maintainers[namespace].add('"%s" <%s>' % (
                                vote["name"], vote["email"]))
    for namespace in maintainers:
        report["namespaces"][namespace] = sorted(list(maintainers[namespace]))
    # Operate on a copy of the keys since we'll be altering the dict
    for namespace in list(report["namespaces"].keys()):
        # Cull inactive namespaces from the report
        if not report["namespaces"][namespace]:
            del report["namespaces"][namespace]

    # Write the full YAML structured data report; use a context manager
    # so the file handle is flushed and closed deterministically
    os.makedirs("maintainers", exist_ok=True)
    with open("maintainers/%s.yaml" % argument, "w") as reportfile:
        reportfile.write(yaml.dump(report))
    # Write per-namespace text dumps of names/addresses
    for namespace, maintlist in list(report["namespaces"].items()):
        with open("maintainers/%s_%s.txt" % (
                argument, namespace), "w", encoding="utf-8") as dumpfile:
            for maintainer in maintlist:
                dumpfile.write(maintainer + "\n")