Method to compare summary xlsx reports from CI; better organization of baseline and new results for CI comparisons.
trynthink · Nov 20, 2024 · 0e46d9f · 0e46d9f
1 parent ecb8f6a
commit 0e46d9f
Show file tree

Hide file tree

Showing 2 changed files with 67 additions and 52 deletions.
diff --git a/.github/workflows/integration_tests.yml b/.github/workflows/integration_tests.yml
@@ -66,14 +66,17 @@ jobs:
       - name: Compare integration test results
         run: |
           #FIXME temporarily pull from ci_outputs
-          git fetch
-          git show origin/ci_outputs:tests/integration_testing/results/agg_results.json > tests/integration_testing/results/agg_results_master.json
-          git show origin/ci_outputs:tests/integration_testing/results/ecm_results.json > tests/integration_testing/results/ecm_results_master.json
-          git show origin/ci_outputs:tests/integration_testing/results/plots/tech_potential/Summary_Data-TP.xlsx > tests/integration_testing/results/plots/tech_potential/Summary_Data-TP_master.xlsx
-          git show origin/ci_outputs:tests/integration_testing/results/plots/max_adopt_potential/Summary_Data-MAP.xlsx > tests/integration_testing/results/plots/tech_potential/Summary_Data-MAP_master.xlsx
-
-          python tests/integration_testing/compare_results.py -d tests/integration_testing/results --baseline_suffix _master
+          git fetch origin master ci_outputs
+          branch_name="${{ github.ref }}"
+          if [[ $(git diff --exit-code origin/master ./tests/integration_testing/results/agg_results.json ./tests/integration_testing/results/ecm_results.json) ]]; then
+            mkdir tests/integration_testing/base_results
+            git show origin/ci_outputs:tests/integration_testing/results/agg_results.json > tests/integration_testing/base_results/agg_results.json
+            git show origin/ci_outputs:tests/integration_testing/results/ecm_results.json > tests/integration_testing/base_results/ecm_results.json
+            git show origin/ci_outputs:tests/integration_testing/results/plots/tech_potential/Summary_Data-TP.xlsx > tests/integration_testing/base_results/Summary_Data-TP.xlsx
+            git show origin/ci_outputs:tests/integration_testing/results/plots/max_adopt_potential/Summary_Data-MAP.xlsx > tests/integration_testing/base_results/Summary_Data-MAP.xlsx
 
+            python tests/integration_testing/compare_results.py --base-dir tests/integration_testing/base_results --new-dir tests/integration_testing/results
+          fi
       - name: Upload artifacts
         uses: actions/upload-artifact@v3
         with:
@@ -85,7 +88,7 @@ jobs:
           git pull origin $branch_name
           git add ./tests/integration_testing/results/*.json
           if [[ $(git diff --cached --exit-code) ]]; then
-            git add ./tests/integration_testing/results
+            git add ./tests/integration_testing/results/plots
             git config --system user.email "[email protected]"
             git config --system user.name "GitHub Action"
             git commit -m "Upload results files from CI build"

diff --git a/tests/integration_testing/compare_results.py b/tests/integration_testing/compare_results.py
@@ -15,12 +15,17 @@ def load_json(file_path):
         with open(file_path, 'r') as file:
             return json.load(file)
 
+    @staticmethod
+    def load_summary_report(file_path):
+        df = pd.read_excel(file_path, index_col=list(range(5)))
+        return df
+
     def compare_dict_keys(self, dict1, dict2, paths, path='', key_diffs=None):
         """Compares nested keys across two dictionaries by recursively searching each level
 
         Args:
-            dict1 (dict): dictionary to compare
-            dict2 (dict): dictionary to compare
+            dict1 (dict): baseline dictionary to compare
+            dict2 (dict): new dictionary to compare
             paths (list): paths to the original files from which the dictionaries are imported
             path (str, optional): current dictionary path at whcih to compare. Defaults to ''.
             key_diffs (pd.DataFrame, optional): existing summary of difference. Defaults to None.
@@ -37,12 +42,12 @@ def compare_dict_keys(self, dict1, dict2, paths, path='', key_diffs=None):
         only_in_dict2 = keys2 - keys1
 
         if only_in_dict1:
-            new_row = pd.DataFrame({"Results file": paths[0].stem,
+            new_row = pd.DataFrame({"Results file": f"{paths[0].parent.name}/{paths[0].name}",
                                     "Unique key": str(only_in_dict1),
                                     "Found at": path[2:]}, index=[0])
             key_diffs = pd.concat([key_diffs, new_row], ignore_index=True)
         if only_in_dict2:
-            new_row = pd.DataFrame({"Results file": paths[1].stem,
+            new_row = pd.DataFrame({"Results file": f"{paths[1].parent.name}/{paths[1].name}",
                                     "Unique key": str(only_in_dict2),
                                     "Found at": path[2:]}, index=[0])
             key_diffs = pd.concat([key_diffs, new_row], ignore_index=True)
@@ -62,8 +67,8 @@ def compare_dict_values(self, dict1, dict2, percent_threshold=10, abs_threshold=
             values at common paths. Both thresholds must be met to report results.
 
         Args:
-            dict1 (dict): dictionary to compare
-            dict2 (dict): dictionary to compare
+            dict1 (dict): baseline dictionary to compare
+            dict2 (dict): new dictionary to compare
             percent_threshold (int, optional): the percent difference threshold at which
                                                differences are reported. Defaults to 10.
             abs_threshold (int, optional): the abosolute difference threshold at which differences
@@ -96,40 +101,55 @@ def write_dict_key_report(self, diff_report, output_path):
         if diff_report.empty:
             return
         diff_report.to_csv(output_path, index=False)
+        print(f"Wrote dictionary key report to {output_path}")
 
     def write_dict_value_report(self, diff_report, output_path):
         df = pd.DataFrame(columns=["Results path", "Percent difference"],
                           data=list(zip(diff_report.keys(), diff_report.values())))
         if df.empty:
             return
         df.to_csv(output_path, index=False)
+        print(f"Wrote dictionary value report to {output_path}")
 
-    def compare_jsons(self, json1_path, json2_path, write_reports=True):
+    def compare_jsons(self, json1_path, json2_path, output_dir=None):
         """Compare two jsons and report differences in keys and in values
 
         Args:
-            json1_path (Path): json file to compare
-            json2_path (Path): json file to compare
+            json1_path (Path): baseline json file to compare
+            json2_path (Path): new json file to compare
-            write_reports (bool, optional): _description_. Defaults to True.
+            output_dir (Path, optional): report directory; None uses json2_path's parent.
         """
         json1 = self.load_json(json1_path)
         json2 = self.load_json(json2_path)
 
         # Compare differences in json keys
         key_diffs = self.compare_dict_keys(json1, json2, [json1_path, json2_path])
-        if write_reports:
-            out_path = json2_path.parent / f"{json2_path.stem}_key_diffs.csv"
-            self.write_dict_key_report(key_diffs, out_path)
+        if output_dir is None:
+            output_dir = json2_path.parent
+        self.write_dict_key_report(key_diffs, output_dir / f"{json2_path.stem}_key_diffs.csv")
 
         # Compare differences in json values
         val_diffs = self.compare_dict_values(json1, json2)
-        if write_reports:
-            out_path = json2_path.parent / f"{json2_path.stem}_value_diffs.csv"
-            self.write_dict_value_report(val_diffs, out_path)
+        self.write_dict_value_report(val_diffs, output_dir / f"{json2_path.stem}_value_diffs.csv")
+
+    def compare_summary_reports(self, report1_path, report2_path, output_dir=None):
+        """Compare Summary_Data-TP.xlsx and Summary_Data-MAP.xlsx with baseline files
+
+        Args:
+            report1_path (Path): baseline summary report to compare
+            report2_path (Path): new summary report to compare
+            output_dir (Path, optional): report directory; None uses report2_path's parent.
+        """
 
-    def compare_summary_reports(self, report1_path, report2_path, write_reports=True):
-        # Compare Summary_Data-TP.xlsx and Summary_Data-MAP.xlsx with baseline files
-        pass
+        report1 = self.load_summary_report(report1_path)
+        report2 = self.load_summary_report(report2_path)
+
+        diff = ((report2 - report1)/report1).round(2)
+        if output_dir is None:
+            output_dir = report2_path.parent
+        output_path = output_dir / f"{report2_path.stem}_percent_diffs.csv"
+        diff.to_csv(output_path)
+        print(f"Wrote Summary_Data percent difference report to {output_path}")
 
 
 def main():
@@ -140,38 +160,30 @@ def main():
                         help="Path to the baseline summary report (Excel file)")
     parser.add_argument("--summary-new", type=Path,
                         help="Path to the new summary report (Excel file)")
-    parser.add_argument("-d", "--directory", type=Path,
-                        help="Directory containing files to compare")
-    parser.add_argument("--baseline_suffix", type=str, default="_master",
-                        help="If using the --directory argument, specify the suffix for the "
-                        "baseline files (e.g., '_master')")
+    parser.add_argument("--new-dir", type=Path, help="Directory containing files to compare")
+    parser.add_argument("--base-dir", type=Path, help="Directory containing files to compare")
     parser.add_argument("--threshold", type=float, default=10,
                         help="Threshold for percent difference")
     args = parser.parse_args()
 
     compare = ScoutCompare()
-    if args.directory:
+    if args.base_dir and args.new_dir:
         # Compare all files
-        results_dir = args.directory.resolve()
-        agg_results_json_base = results_dir / f"agg_results{args.baseline_suffix}.json"
-        agg_results_json = results_dir / "agg_results.json"
-        compare.compare_jsons(agg_results_json_base, agg_results_json)
-
-        ecm_results_json_base = results_dir / f"ecm_results{args.baseline_suffix}.json"
-        ecm_results_json = results_dir / "ecm_results.json"
-        compare.compare_jsons(ecm_results_json_base, ecm_results_json)
-
-        plots_dir = results_dir / "plots"
-        summary_tp_base = plots_dir / "tech_potential" / \
-            f"Summary_Data-TP{args.baseline_suffix}.xlsx"
-        summary_tp = plots_dir / "tech_potential" / "Summary_Data-TP.xlsx"
-        compare.compare_summary_reports(summary_tp_base, summary_tp)
-
-        summary_map_base = (plots_dir / "max_adopt_potential" /
-                            f"Summary_Data-MAP{args.baseline_suffix}.xlsx")
-        summary_map = plots_dir / "tech_potential" / "Summary_Data-MAP.xlsx"
-        compare.compare_summary_reports(summary_map_base, summary_map)
-
+        base_dir = args.base_dir.resolve()
+        new_dir = args.new_dir.resolve()
+        agg_json_base = base_dir / "agg_results.json"
+        agg_json_new = new_dir / "agg_results.json"
+        compare.compare_jsons(agg_json_base, agg_json_new, output_dir=new_dir)
+        ecm_json_base = base_dir / "ecm_results.json"
+        ecm_json_new = new_dir / "ecm_results.json"
+        compare.compare_jsons(ecm_json_base, ecm_json_new, output_dir=new_dir)
+
+        summary_tp_base = base_dir / "Summary_Data-TP.xlsx"
+        summary_tp_new = new_dir / "plots" / "tech_potential" / "Summary_Data-TP.xlsx"
+        compare.compare_summary_reports(summary_tp_base, summary_tp_new, output_dir=new_dir)
+        summary_map_base = base_dir / "Summary_Data-MAP.xlsx"
+        summary_map_new = new_dir / "plots" / "max_adopt_potential" / "Summary_Data-MAP.xlsx"
+        compare.compare_summary_reports(summary_map_base, summary_map_new, output_dir=new_dir)
     else:
         # Compare only as specified by the arguments
         if args.json_baseline and args.json_new: