fixing up validation

usds · Sep 21, 2022 · 046d16a · 046d16a
1 parent 86698c5
commit 046d16a
Show file tree

Hide file tree

Showing 2 changed files with 19 additions and 8 deletions.
diff --git a/data/data-pipeline/data_pipeline/etl/base.py b/data/data-pipeline/data_pipeline/etl/base.py
@@ -119,6 +119,12 @@ class ExtractTransformLoad:
     # the YAML files?
     LOAD_YAML_CONFIG: bool = False
 
+    # Some datasets have multiple rows of data per tract. For those datasets, set
+    # this variable to `True` to skip the row-count and duplicate-geography checks.
+    # However, note that datasets with multiple rows per tract *cannot* be used
+    # in the score process.
+    VALIDATE_SHOULD_SKIP_DUPLICATE_GEOGRAPHIES_AND_GEOGRAPHY_COUNT: bool = False
+
     # We use output_df as the final dataframe to use to write to the CSV
     # It is used on the "load" base class method
     output_df: pd.DataFrame = None
@@ -279,7 +285,10 @@ def validate(self) -> None:
                         f"Must have `{geo_field}` in columns if "
                         f"specifying geo level as `{geo_level} "
                     )
-                if self.output_df.shape[0] > expected_rows:
+                if (
+                    self.output_df.shape[0] > expected_rows
+                    and not self.VALIDATE_SHOULD_SKIP_DUPLICATE_GEOGRAPHIES_AND_GEOGRAPHY_COUNT
+                ):
                     raise ValueError(
                         f"Too many rows: `{self.output_df.shape[0]}` rows in "
                         f"output exceeds expectation of `{expected_rows}` "
@@ -305,7 +314,10 @@ def validate(self) -> None:
                     self.output_df[geo_field].shape[0]
                     - self.output_df[geo_field].nunique()
                 )
-                if duplicate_geo_field_values > 0:
+                if (
+                    duplicate_geo_field_values > 0
+                    and not self.VALIDATE_SHOULD_SKIP_DUPLICATE_GEOGRAPHIES_AND_GEOGRAPHY_COUNT
+                ):
                     raise ValueError(
                         f"Duplicate values: There are {duplicate_geo_field_values} "
                         f"duplicate values in "

diff --git a/data/data-pipeline/data_pipeline/etl/sources/geocorr_alternatives/etl.py b/data/data-pipeline/data_pipeline/etl/sources/geocorr_alternatives/etl.py
@@ -24,12 +24,17 @@ class GeoCorrAlternativesETL(ExtractTransformLoad):
     # Metadata for the baseclass
     NAME = "geocorr_alternatives"
     GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
+    PUERTO_RICO_EXPECTED_IN_DATA = False
 
     INPUT_GEOCORR_TRACT_FIELD = "tract"
     INPUT_GEOCORR_COUNTY_FIELD = "county"
     INPUT_GEOCORR_ZIP_FIELD = "zcta5"
     INPUT_GEOCORR_ALLOCATION_FIELD = "afact"
 
+    # Skip some validation checks, because there will be multiple rows per tract in this
+    # geocorr dataset.
+    VALIDATE_SHOULD_SKIP_DUPLICATE_GEOGRAPHIES_AND_GEOGRAPHY_COUNT = True
+
     # GeoCorr downloads have a field definition in the second row of the CSV.
     # This parameter skips the second row for pandas `read_csv`.
     GEOCORR_SKIP_ROWS: typing.List[int] = [1]
@@ -98,10 +103,4 @@ def transform(self) -> None:
             ".", "", regex=False
         )
 
-        logger.info(zip_codes_to_tracts_df.head())
-
         self.output_df = zip_codes_to_tracts_df
-
-    # TODO: DELETE
-    def validate(self) -> None:
-        pass