Add provenance info to written report

astronomy-commons · Jan 25, 2025 · d083658 · d083658
1 parent 347357f
commit d083658
Show file tree

Hide file tree

Showing 2 changed files with 23 additions and 8 deletions.
diff --git a/src/hats_import/verification/run_verification.py b/src/hats_import/verification/run_verification.py
@@ -16,6 +16,7 @@
 from hats import read_hats
 from hats.pixel_math.spatial_index import SPATIAL_INDEX_COLUMN
 
+import hats_import
 from hats_import.verification.arguments import VerificationArguments
 
 
@@ -179,8 +180,7 @@ def test_is_valid_catalog(self) -> bool:
         -------
             bool: True if the test passed, else False.
         """
-        version = f"hats version {hats.__version__}"
-        test, description = "valid hats", f"Test hats.io.validation.is_valid_catalog ({version})."
+        test, description = "valid hats", "Test hats.io.validation.is_valid_catalog."
         target = self.args.input_catalog_path
         self.print_if_verbose(f"\nStarting: {description}")
 
@@ -223,7 +223,8 @@ def test_file_sets(self) -> bool:
     def test_num_rows(self) -> bool:
         """Test the number of rows in the dataset. Add `Result`s to `results`.
 
-        File footers are compared with _metadata and the user-supplied truth (if provided).
+        Row counts in parquet file footers are compared with the '_metadata' file,
+        HATS 'properties' file, and (if provided) the user-supplied truth.
 
         Returns
         -------
@@ -463,11 +464,25 @@ def write_results(self, *, write_mode: Literal["a", "w", "x"] = "a") -> None:
         Parameters
         ----------
         write_mode : Literal["a", "w", "x"], optional
-            Mode to be used when writing output file. Passed to pandas.DataFrame.to_csv as `mode`.
+            Mode to be used when writing the output file. Options have the typical meanings:
+                - 'w': truncate the file first
+                - 'x': exclusive creation, failing if the file already exists
+                - 'a': append to the end of file if it exists
         """
         self.args.output_file_path.parent.mkdir(exist_ok=True, parents=True)
-        header = not (write_mode == "a" and self.args.output_file_path.exists())
-        self.results_df.to_csv(self.args.output_file_path, mode=write_mode, header=header, index=False)
+        # Write provenance info
+        with open(self.args.output_file_path, write_mode, encoding="utf8") as fout:
+            fout.writelines(
+                [
+                    "# HATS verification results for\n",
+                    f"# {self.args.input_catalog_path}\n",
+                    f"# Package versions: hats v{hats.__version__}; hats-import v{hats_import.__version__}\n",
+                    f"# User-supplied truth schema: {self.args.truth_schema}\n",
+                    f"# User-supplied truth total rows: {self.args.truth_total_rows}\n",
+                ]
+            )
+        # Write results
+        self.results_df.to_csv(self.args.output_file_path, mode="a", header=True, index=False)
         self.print_if_verbose(f"\nVerifier results written to {self.args.output_file_path}")
 
     def print_if_verbose(self, message):

diff --git a/tests/hats_import/verification/test_run_verification.py b/tests/hats_import/verification/test_run_verification.py
@@ -24,15 +24,15 @@ def test_runner(small_sky_object_catalog, wrong_files_and_rows_dir, tmp_path):
     )
     verifier = runner.run(args, write_mode="w")
     assert verifier.all_tests_passed, "good catalog failed"
-    written_results = pd.read_csv(args.output_path / args.output_filename)
+    written_results = pd.read_csv(args.output_path / args.output_filename, comment="#")
     assert written_results[result_cols].equals(verifier.results_df[result_cols]), "report failed"
 
     args = VerificationArguments(
         input_catalog_path=wrong_files_and_rows_dir, output_path=tmp_path, verbose=False
     )
     verifier = runner.run(args, write_mode="w")
     assert not verifier.all_tests_passed, "bad catalog passed"
-    written_results = pd.read_csv(args.output_path / args.output_filename)
+    written_results = pd.read_csv(args.output_path / args.output_filename, comment="#")
     assert written_results[result_cols].equals(verifier.results_df[result_cols]), "report failed"