Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 7 additions & 12 deletions .github/workflows/regenerate_dashboard.yml
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
name: Regenerate dashboard webpages

on:
# schedule:
# - cron: '*/10 * * * *' # Runs every 10 minutes, for now. Github allows at most "every 5 minutes"
schedule:
- cron: '*/5 * * * *' # Runs every 5 minutes, the fastest Github allows
workflow_dispatch: # Allows manual execution

jobs:
Expand All @@ -15,24 +15,19 @@ jobs:
steps:
- name: Checkout repository
uses: actions/checkout@v4
with:
sparse-checkout: 'processed_data/open_pr_data.json'
sparse-checkout-cone-mode: false

- name: Set up environment
run: |
sudo apt-get update
sudo apt-get install -y jq curl gh python3 python3-pip
sudo apt-get install -y python3 python3-pip
pip3 install python-dateutil

- name: "Download .json files for all open PRs"
id: "download-json"
env:
GH_TOKEN: ${{ github.token }}
run: |
bash ./dashboard.sh

- name: "Regenerate the dashboard webpages"
run: |
python3 ./dashboard.py "all-open-PRs-1.json" "all-open-PRs-2.json" > ./index-old.html
rm *.json
python3 ./dashboard.py

- name: Commit and push changes
run: |
Expand Down
40 changes: 0 additions & 40 deletions .github/workflows/update_metadata.yml
Original file line number Diff line number Diff line change
Expand Up @@ -74,43 +74,3 @@ jobs:
# FIXME: make this more robust about incoming changes
# The other workflow does not push to this branch, so this should be fine.
git push

- name: "Regenerate the dashboard webpages"
if: ${{ !cancelled() }} && (steps.download-json.outcome == 'success')
run: |
python3 ./dashboard.py "all-open-PRs-1.json" "all-open-PRs-2.json" > ./index-old.html
rm *.json

- name: Commit and push changes
if: ${{ !cancelled() }} && (steps.download-json.outcome == 'success')
run: |
git config --local user.email "action@github.com"
git config --local user.name "GitHub Action"
# Temporarily rename the generated HTML files
# to avoid a name collision with existing files on the gh-pages branch.
mv index.html tmp.html
mv index-old.html tmp1.html
mv on_the_queue.html tmp2.html
mv review_dashboard.html tmp3.html
mv maintainers_quick.html tmp4.html
mv help_out.html tmp5.html
mv triage.html tmp6.html
git fetch --depth 1 origin gh-pages
git checkout gh-pages
mv tmp.html index.html
mv tmp1.html index-old.html
mv tmp2.html on_the_queue.html
mv tmp3.html review_dashboard.html
mv tmp4.html maintainers_quick.html
mv tmp5.html help_out.html
mv tmp6.html triage.html
git add index.html
git add index-old.html
git add on_the_queue.html
git add review_dashboard.html
git add maintainers_quick.html
git add help_out.html
git add triage.html
git checkout master -- style.css
git commit -m "Update dashboard" -a || echo "No changes to commit"
git push
2 changes: 1 addition & 1 deletion ARCHITECTURE.md
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ There are several levels at which this project can be tested. Currently, there a

TODO: there are more generated HTML files now; recommend copying the folder instead...
- changes to just `dashboard.py` can be tested using the JSON files in the `test` directory: run the following from the `test` directory.
`python3 ../dashboard.py all-open-PRs-1.json all-open-PRs-2.json`.
`python3 ../dashboard.py`.
This creates two webpages named `on_the_queue.html` and `index.html`, overwriting any previous files named thus.
You can run this command before and after your changes and compare the resulting files (using `diff` or a similar tool). Because of the overwriting, take care to copy e.g. the old version of the output to a different file before running the tool again.
(The output file needs to be in the top-level directory in order for the styling to work.)
Expand Down
26 changes: 6 additions & 20 deletions compute_dashboard_prs.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,11 +115,9 @@ class AggregatePRInfo(NamedTuple):
first_on_queue: Tuple[DataStatus, datetime | None] | None
total_queue_time: TotalQueueTime | None

# Missing aggregate information will be replaced by this default item.
PLACEHOLDER_AGGREGATE_INFO = AggregatePRInfo(
False, CIStatus.Missing, "master", "leanprover-community", "open", datetime.now(timezone.utc),
"unknown", "unknown title", [], -1, -1, -1, [], [], None, None, None, None,
)
# NOTE: this was decorated @staticmethod while taking |self| and reading
# instance attributes — a contradiction. It is a plain instance method;
# the existing class-level call style `AggregatePRInfo.toBasicPRInformation(data, number)`
# still works (explicit instance as first argument), and the natural
# `info.toBasicPRInformation(number)` form now works as well.
def toBasicPRInformation(self, number: int) -> BasicPRInformation:
    """Project this aggregate record onto a BasicPRInformation for PR |number|.

    Only the fields BasicPRInformation carries (author, title, URL, labels,
    last-updated time) are copied; all other aggregate data is dropped.
    """
    return BasicPRInformation(number, self.author, self.title, infer_pr_url(number), self.labels, self.last_updated)


# Parse the contents |data| of an aggregate json file into a dictionary pr number -> AggregatePRInfo.
Expand Down Expand Up @@ -195,7 +193,6 @@ def toLabel(name: str) -> Label:
# (`BasicPRInformation` is not hashable, hence cannot be used as a dictionary key.)
# 'aggregate_info' contains aggregate information about each PR's CI state,
# draft status and base branch (and more, which we do not use).
# If no detailed information was available for a given PR number, 'None' is returned.
def compute_pr_statusses(aggregate_info: dict[int, AggregatePRInfo], prs: List[BasicPRInformation]) -> dict[int, PRStatus]:
def determine_status(aggregate_info: AggregatePRInfo) -> PRStatus:
# Ignore all "other" labels, which are not relevant for this anyway.
Expand All @@ -204,7 +201,7 @@ def determine_status(aggregate_info: AggregatePRInfo) -> PRStatus:
state = PRState(labels, aggregate_info.CI_status, aggregate_info.is_draft, from_fork)
return determine_PR_status(datetime.now(timezone.utc), state)

return {info.number: determine_status(aggregate_info[info.number] or PLACEHOLDER_AGGREGATE_INFO) for info in prs}
return {info.number: determine_status(aggregate_info[info.number]) for info in prs}


# Does a PR have a given label?
Expand Down Expand Up @@ -391,16 +388,13 @@ def has_topic_label(pr: BasicPRInformation) -> bool:
return (with_bad_title, prs_without_topic_label, prs_with_contradictory_labels)


# use_aggregate_queue: if True, determine the review queue (and everything depending on it)
# from the aggregate data and not queue.json
def determine_pr_dashboards(
all_open_prs: List[BasicPRInformation],
nondraft_PRs: List[BasicPRInformation],
base_branch: dict[int, str],
prs_from_fork: List[BasicPRInformation],
CI_status: dict[int, CIStatus],
aggregate_info: dict[int, AggregatePRInfo],
use_aggregate_queue: bool,
aggregate_info: dict[int, AggregatePRInfo]
) -> dict[Dashboard, List[BasicPRInformation]]:
approved = [pr for pr in nondraft_PRs if aggregate_info[pr.number].approvals]
prs_to_list: dict[Dashboard, List[BasicPRInformation]] = dict()
Expand Down Expand Up @@ -445,15 +439,7 @@ def determine_pr_dashboards(
foo = [pr for pr in interesting_CI if base_branch[pr.number] == "master" and pr not in prs_from_fork]
prs_to_list[Dashboard.InessentialCIFails] = prs_without_any_label(foo, other_labels + ["merge-conflict"])

queue_prs2 = None
with open("queue.json", "r") as queuefile:
queue_prs2 = _extract_prs(json.load(queuefile))
queue_pr_numbers2 = [pr.number for pr in queue_prs2]
msg = "comparing this page's review dashboard (left) with the Github #queue (right)"
if my_assert_eq(msg, [pr.number for pr in queue_prs], queue_pr_numbers2):
print("Review dashboard and #queue match, hooray!", file=sys.stderr)

prs_to_list[Dashboard.Queue] = queue_prs if use_aggregate_queue else queue_prs2
prs_to_list[Dashboard.Queue] = queue_prs
queue = prs_to_list[Dashboard.Queue]
prs_to_list[Dashboard.QueueNewContributor] = prs_with_label(queue, "new-contributor")
prs_to_list[Dashboard.QueueEasy] = prs_with_label(queue, "easy")
Expand Down
44 changes: 11 additions & 33 deletions dashboard.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
#!/usr/bin/env python3

# This script accepts json files as command line arguments and displays the data in an HTML dashboard.
# It assumes that for each PR N which should appear in some dashboard,
# there is a file N.json in the `data` directory, which contains all necessary detailed information about that PR.
# This script uses the aggregate information in processed_data/open_pr_data.json
# to generate the several HTML webpages making up the dashboard.

import json
import sys
Expand All @@ -16,7 +15,7 @@
from ci_status import CIStatus
from classify_pr_state import PRState, PRStatus
from compute_dashboard_prs import (AggregatePRInfo, BasicPRInformation, Label, DataStatus, LastStatusChange, TotalQueueTime,
PLACEHOLDER_AGGREGATE_INFO, compute_pr_statusses, determine_pr_dashboards, infer_pr_url, link_to, parse_aggregate_file, gather_pr_statistics, _extract_prs)
compute_pr_statusses, determine_pr_dashboards, infer_pr_url, link_to, parse_aggregate_file, gather_pr_statistics, _extract_prs)
from mathlib_dashboards import Dashboard, short_description, long_description, getIdTitle
from util import my_assert_eq, format_delta, timedelta_tryParse, relativedelta_tryParse

Expand All @@ -27,26 +26,14 @@
class JSONInputData(NamedTuple):
# All aggregate information stored for every open PR.
aggregate_info: dict[int, AggregatePRInfo]
# Information about all open PRs
all_open_prs: List[BasicPRInformation]


# Validate the command-line arguments and try to read all data passed in via JSON files.
# Any number of JSON files passed in is fine; we interpret them all as containing open PRs.
def read_json_files() -> JSONInputData:
if len(sys.argv) == 1:
print("error: need to pass in some JSON files with open PRs")
sys.exit(1)
all_open_prs = []
for i in range(1, len(sys.argv)):
with open(sys.argv[i]) as prfile:
open_prs = _extract_prs(json.load(prfile))
if len(open_prs) >= 900:
print(f"warning: file {sys.argv[i]} contains at least 900 PRs: the REST API will never return more than 1000 PRs. Please split the list into more files as necessary.", file=sys.stderr)
all_open_prs.extend(open_prs)
with open(path.join("processed_data", "open_pr_data.json"), "r") as f:
aggregate_info = parse_aggregate_file(json.load(f))
return JSONInputData(aggregate_info, all_open_prs)
return JSONInputData(aggregate_info)


### Helper methods: writing HTML code for various parts of the generated webpage ###
Expand Down Expand Up @@ -866,18 +853,11 @@ def main() -> None:
# Populate basic information from the input data: splitting into draft and non-draft PRs
# (mostly, we only use the latter); extract separate dictionaries for CI status and base branch.

# NB. We handle missing metadata by adding "default" values for its aggregate data
# (ready for review, open, against master, failing CI and just updated now).
aggregate_info = input_data.aggregate_info.copy()
for pr in input_data.all_open_prs:
if pr.number not in input_data.aggregate_info:
print(f"warning: found no aggregate information for PR {pr.number}; filling in defaults", file=sys.stderr)
aggregate_info[pr.number] = PLACEHOLDER_AGGREGATE_INFO
draft_PRs = [pr for pr in input_data.all_open_prs if aggregate_info[pr.number].is_draft]
nondraft_PRs = [pr for pr in input_data.all_open_prs if not aggregate_info[pr.number].is_draft]

# The only exception is for the "on the queue" page,
# which points out missing information explicitly, hence is passed the non-filled in data.
aggregate_info = input_data.aggregate_info
all_open_prs = [AggregatePRInfo.toBasicPRInformation(data, number) for (number, data) in aggregate_info.items()]
draft_PRs = [pr for pr in all_open_prs if aggregate_info[pr.number].is_draft]
nondraft_PRs = [pr for pr in all_open_prs if not aggregate_info[pr.number].is_draft]

CI_status: dict[int, CIStatus] = dict()
for pr in nondraft_PRs:
if pr.number in input_data.aggregate_info:
Expand All @@ -888,12 +868,10 @@ def main() -> None:
for pr in nondraft_PRs:
base_branch[pr.number] = aggregate_info[pr.number].base_branch
prs_from_fork = [pr for pr in nondraft_PRs if aggregate_info[pr.number].head_repo != "leanprover-community"]
all_pr_status = compute_pr_statusses(aggregate_info, input_data.all_open_prs)
all_pr_status = compute_pr_statusses(aggregate_info, all_open_prs)
write_on_the_queue_page(all_pr_status, aggregate_info, nondraft_PRs, prs_from_fork, CI_status, base_branch)

# TODO: try to enable |use_aggregate_queue| 'queue_prs' again, once all the root causes
# for PRs getting 'dropped' by 'gather_stats.sh' are found and fixed.
prs_to_list = determine_pr_dashboards(input_data.all_open_prs, nondraft_PRs, base_branch, prs_from_fork, CI_status, aggregate_info, False)
prs_to_list = determine_pr_dashboards(all_open_prs, nondraft_PRs, base_branch, prs_from_fork, CI_status, aggregate_info)

# FUTURE: can this time be displayed in the local time zone of the user viewing this page?
updated = datetime.now(timezone.utc).strftime("%B %d, %Y at %H:%M UTC")
Expand Down
2 changes: 1 addition & 1 deletion docs/Overview.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ This is not difficult, but requires changes in multiple stages.
1. Edit `process.py`, change the `get_aggregate_data` function to extract the data you want.
Pay attention to the fact that there is "basic" and "full" PR info; not all information might be available in both files.
2. Run `process.py` locally to update the generated file; commit the changes (except for the changed time-stamp). This step is optional; you can also just push the previous step and let CI perform this update.
3. Once step 2 is complete, edit the definition of `AggregatePRInfo` in `dashboard.py` to include your new data field. Update `PLACEHOLDER_AGGREGATE_INFO` as well. Update `read_json_files` to parse this field as well.
3. Once step 2 is complete, edit the definition of `AggregatePRInfo` in `compute_dashboard_prs.py` to include your new data field. Update `parse_aggregate_file` (called from `read_json_files` in `dashboard.py`) to parse this field as well.
4. Congratulations, now you have made some new metadata available to the dashboard processing. (For making use of this, see the previous bullet point for changing the dashboard.)


Expand Down
4 changes: 1 addition & 3 deletions generate_assigment_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,12 +36,10 @@

# Assumes the aggregate data is correct: no cross-filling in of placeholder data.
def compute_pr_list_from_aggregate_data_only(aggregate_data: dict[int, AggregatePRInfo]) -> dict[Dashboard, List[BasicPRInformation]]:
all_open_prs: List[BasicPRInformation] = []
nondraft_PRs: List[BasicPRInformation] = []
for number, data in aggregate_data.items():
if data.state == "open":
info = BasicPRInformation(number, data.author, data.title, infer_pr_url(number), data.labels, data.last_updated)
all_open_prs.append(info)
if not data.is_draft:
nondraft_PRs.append(info)
CI_status: dict[int, CIStatus] = dict()
Expand All @@ -54,7 +52,7 @@ def compute_pr_list_from_aggregate_data_only(aggregate_data: dict[int, Aggregate
for pr in nondraft_PRs:
base_branch[pr.number] = aggregate_data[pr.number].base_branch
prs_from_fork = [pr for pr in nondraft_PRs if aggregate_data[pr.number].head_repo != "leanprover-community"]
return determine_pr_dashboards(all_open_prs, nondraft_PRs, base_branch, prs_from_fork, CI_status, aggregate_data, True)
return determine_pr_dashboards(all_open_prs, nondraft_PRs, base_branch, prs_from_fork, CI_status, aggregate_data)


class ReviewerInfo(NamedTuple):
Expand Down