Skip to content

Commit

Permalink
fixing up validation
Browse files Browse the repository at this point in the history
  • Loading branch information
lucasmbrown-usds committed Sep 21, 2022
1 parent 86698c5 commit 046d16a
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 8 deletions.
16 changes: 14 additions & 2 deletions data/data-pipeline/data_pipeline/etl/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,12 @@ class ExtractTransformLoad:
# the YAML files?
LOAD_YAML_CONFIG: bool = False

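# Some data sets will have multiple rows of data per tract. For those data sets,
# set this variable to `True` to skip two validation steps: the check that the
# output does not have more rows than expected, and the check that the geography
# field contains no duplicate values.
# However, note that datasets with multiple rows per tract *cannot* be used
# in the score process.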
VALIDATE_SHOULD_SKIP_DUPLICATE_GEOGRAPHIES_AND_GEOGRAPHY_COUNT: bool = False

# We use output_df as the final dataframe to use to write to the CSV
# It is used on the "load" base class method
output_df: pd.DataFrame = None
Expand Down Expand Up @@ -279,7 +285,10 @@ def validate(self) -> None:
f"Must have `{geo_field}` in columns if "
f"specifying geo level as `{geo_level} "
)
if self.output_df.shape[0] > expected_rows:
if (
self.output_df.shape[0] > expected_rows
and not self.VALIDATE_SHOULD_SKIP_DUPLICATE_GEOGRAPHIES_AND_GEOGRAPHY_COUNT
):
raise ValueError(
f"Too many rows: `{self.output_df.shape[0]}` rows in "
f"output exceeds expectation of `{expected_rows}` "
Expand All @@ -305,7 +314,10 @@ def validate(self) -> None:
self.output_df[geo_field].shape[0]
- self.output_df[geo_field].nunique()
)
if duplicate_geo_field_values > 0:
if (
duplicate_geo_field_values > 0
and not self.VALIDATE_SHOULD_SKIP_DUPLICATE_GEOGRAPHIES_AND_GEOGRAPHY_COUNT
):
raise ValueError(
f"Duplicate values: There are {duplicate_geo_field_values} "
f"duplicate values in "
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,17 @@ class GeoCorrAlternativesETL(ExtractTransformLoad):
# Metadata for the baseclass
NAME = "geocorr_alternatives"
GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
PUERTO_RICO_EXPECTED_IN_DATA = False

INPUT_GEOCORR_TRACT_FIELD = "tract"
INPUT_GEOCORR_COUNTY_FIELD = "county"
INPUT_GEOCORR_ZIP_FIELD = "zcta5"
INPUT_GEOCORR_ALLOCATION_FIELD = "afact"

# Skip the "too many rows" and "duplicate geography values" validation checks,
# because there will be multiple rows per tract in this geocorr dataset.
VALIDATE_SHOULD_SKIP_DUPLICATE_GEOGRAPHIES_AND_GEOGRAPHY_COUNT = True

# GeoCorr downloads have a field definition in the second row of the CSV.
# This parameter skips the second row for pandas `read_csv`.
GEOCORR_SKIP_ROWS: typing.List[int] = [1]
Expand Down Expand Up @@ -98,10 +103,4 @@ def transform(self) -> None:
".", "", regex=False
)

logger.info(zip_codes_to_tracts_df.head())

self.output_df = zip_codes_to_tracts_df

# TODO: DELETE
# NOTE(review): no-op override -- appears to replace the `validate` inherited
# from `ExtractTransformLoad`, so none of the base-class checks run for this
# ETL. Confirm against the full class before relying on this.
def validate(self) -> None:
pass

0 comments on commit 046d16a

Please sign in to comment.