From 2eff4ed40ebfe87434ca947c55d8006971247d89 Mon Sep 17 00:00:00 2001 From: Melissa DeLucchi Date: Wed, 22 Jan 2025 08:51:04 -0500 Subject: [PATCH] Add test against catalog info total num rows. --- .../verification/run_verification.py | 18 +++++++++++++++--- .../wrong_files_and_rows/partition_info.csv | 2 ++ tests/data/wrong_files_and_rows/properties | 8 ++++++++ .../verification/test_run_verification.py | 2 +- 4 files changed, 26 insertions(+), 4 deletions(-) create mode 100644 tests/data/wrong_files_and_rows/partition_info.csv create mode 100644 tests/data/wrong_files_and_rows/properties diff --git a/src/hats_import/verification/run_verification.py b/src/hats_import/verification/run_verification.py index f745a4a6..e2e0931a 100644 --- a/src/hats_import/verification/run_verification.py +++ b/src/hats_import/verification/run_verification.py @@ -10,6 +10,7 @@ import pandas as pd import pyarrow as pa import pyarrow.dataset as pds +from hats import read_hats from hats.pixel_math.spatial_index import SPATIAL_INDEX_COLUMN from hats_import.verification.arguments import VerificationArguments @@ -219,9 +220,20 @@ def test_num_rows(self) -> bool: description = "Test that number of rows are equal." print(f"\nStarting: {description}") + catalog_prop_len = read_hats(self.args.input_catalog_path).catalog_info.total_rows + # get the number of rows in each file, indexed by file path. we treat this as truth. files_df = self._load_nrows(self.files_ds) - files_df_total = f"file footers ({files_df.num_rows.sum():,})" + files_df_sum = files_df.num_rows.sum() + files_df_total = f"file footers ({files_df_sum:,})" + + target = "file footers vs catalog properties" + print(f"\t{target}") + passed_cat = catalog_prop_len == files_df_sum + _description = f" {files_df_total} vs catalog properties ({catalog_prop_len:,})." + self.results.append( + Result(passed=passed_cat, test=test, target=target, description=description + _description) + ) # check _metadata target = "file footers vs _metadata" @@ -245,7 +257,7 @@ def test_num_rows(self) -> bool: if self.args.truth_total_rows is not None: target = "file footers vs truth" print(f"\t{target}") - passed_th = self.args.truth_total_rows == files_df.num_rows.sum() + passed_th = self.args.truth_total_rows == files_df_sum _description = f" {files_df_total} vs user-provided truth ({self.args.truth_total_rows:,})." self.results.append( Result(passed=passed_th, test=test, target=target, description=description + _description) @@ -253,7 +265,7 @@ def test_num_rows(self) -> bool: else: passed_th = True # this test did not fail. this is only needed for the return value. - all_passed = all([passed_md, passed_th]) + all_passed = all([passed_md, passed_th, passed_cat]) print(f"Result: {'PASSED' if all_passed else 'FAILED'}") return all_passed diff --git a/tests/data/wrong_files_and_rows/partition_info.csv b/tests/data/wrong_files_and_rows/partition_info.csv new file mode 100644 index 00000000..bf77935e --- /dev/null +++ b/tests/data/wrong_files_and_rows/partition_info.csv @@ -0,0 +1,2 @@ +Norder,Npix +0,11 diff --git a/tests/data/wrong_files_and_rows/properties b/tests/data/wrong_files_and_rows/properties new file mode 100644 index 00000000..801bc768 --- /dev/null +++ b/tests/data/wrong_files_and_rows/properties @@ -0,0 +1,8 @@ +#HATS catalog +obs_collection=wrong_files_and_rows +dataproduct_type=object +hats_nrows=600 +hats_col_ra=source_ra +hats_col_dec=source_dec +hats_order=2 + diff --git a/tests/hats_import/verification/test_run_verification.py b/tests/hats_import/verification/test_run_verification.py index 068ce561..697416e4 100644 --- a/tests/hats_import/verification/test_run_verification.py +++ b/tests/hats_import/verification/test_run_verification.py @@ -83,7 +83,7 @@ def test_test_num_rows(small_sky_object_catalog, wrong_files_and_rows_dir, tmp_p all_failed = not results.passed.any() assert all_failed, "bad catalog passed" - targets = {"file footers vs _metadata", "file footers vs truth"} + targets = {"file footers vs catalog properties", "file footers vs _metadata", "file footers vs truth"} assert targets == set(results.target), "wrong targets" expected_bad_file_names = {