Merge pull request #416 from ONSdigital/RDRP-1149_fix_validation_error
RDRP-1149_fix_validation_error
AnneONS authored Feb 5, 2025
2 parents b1a2901 + 2b3cc49 commit 075dafb
Showing 4 changed files with 11 additions and 14 deletions.
7 changes: 5 additions & 2 deletions src/outputs/frozen_group.py
```diff
@@ -19,7 +19,7 @@ def output_frozen_group(
     config: Dict[str, Any],
     intram_tot_dict: Dict[str, int],
     write_csv: Callable,
-    deduplicate: bool = True,
+    deduplicate: bool = False,
 ) -> Dict[str, int]:
     """Creates a "frozen group" output for the entire UK. In BERD (GB) data,
     creates foreign ownership and cora status. Selects the columns we need for
@@ -175,6 +175,9 @@ def output_frozen_group(
     df = pd.concat([df_gb_need, df_ni_need], ignore_index=True, axis=0)

     # Deduplicate by aggregation
+    # TODO: this code fails in DAP for PNP. Think whether it's necessary and
+    # TODO then refactor this, using a list of columns from the config
+    # TODO and considering whether there are extra cols in the PNP case.
     if deduplicate:
         df_agg = df.groupby(category_columns).agg("sum").reset_index()
     else:
```
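For context on the step that is now off by default: deduplication collapses rows sharing the same category-column values by summing their numeric columns, which is plausibly what fails in DAP when the PNP data carries extra columns that cannot be summed (the TODO above flags exactly this). A minimal sketch of the pattern, with made-up column names standing in for the pipeline's real `category_columns`:

```python
import pandas as pd

# Illustrative stand-ins for the pipeline's category_columns
category_columns = ["period", "reference", "product_group"]

df = pd.DataFrame(
    {
        "period": [2023, 2023, 2023],
        "reference": ["A1", "A1", "B2"],
        "product_group": ["AA", "AA", "BB"],
        "intram": [100, 50, 30],
    }
)

# Rows with identical category values collapse into one; numeric columns are summed.
df_agg = df.groupby(category_columns).agg("sum").reset_index()
# The two ("A1", "AA") rows merge into a single row with intram == 150.
```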
```diff
@@ -206,6 +209,6 @@ def output_frozen_group(

     # Outputting the CSV file
     filename = filename_amender("output_frozen_group", config)
-    write_csv(f"{output_path}output_frozen_group/{filename}", output)
+    write_csv(f"{output_path}/output_frozen_group/{filename}", output)

     return intram_tot_dict
```
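The second hunk is a one-character path fix: without the separator, the f-string glues the parent directory straight onto the subdirectory name. An illustration with a made-up `output_path`:

```python
output_path = "/dapsen/workspace_zone/outputs"  # hypothetical path, for illustration only

broken = f"{output_path}output_frozen_group/file.csv"
fixed = f"{output_path}/output_frozen_group/file.csv"

print(broken)  # /dapsen/workspace_zone/outputsoutput_frozen_group/file.csv
print(fixed)   # /dapsen/workspace_zone/outputs/output_frozen_group/file.csv
```

Note the fix assumes `output_path` does not already end in a slash; if it did, the old string would have been correct and the new one would contain a double slash (usually harmless on most filesystems).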
4 changes: 1 addition & 3 deletions src/staging/validation.py
```diff
@@ -156,9 +156,7 @@ def validate_data_with_schema(survey_df: pd.DataFrame, schema_path: str):  # noq
                     f"Failed to convert column '{column}' to datetime. Please check"
                     " the data."
                 )
-            else:
-                if survey_df[column].isna().all() is False:
-                    survey_df[column] = survey_df[column].astype(dtypes_dict[column])
+            survey_df[column] = survey_df[column].astype(dtypes_dict[column])
         except Exception as e:
             ValidationLogger.error(e)
     ValidationLogger.info("Validation successful")
```
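This deletion is very likely the validation error the branch name refers to: `Series.isna().all()` returns a NumPy bool, not Python's `False` singleton, so the identity check `... is False` never held and the `astype` cast never ran. Casting unconditionally is also safe for empty columns, because pandas' nullable dtypes accept all-NA input. A small check of both points (illustrative, not from the repo):

```python
import pandas as pd

col = pd.Series([None, None, None])  # an all-NA column, dtype object

# The removed guard: identity comparison against the bool singleton.
# Series.all() returns numpy.bool_, which is never the Python False object,
# so this prints False whether or not the column holds data.
print(col.isna().all() is False)  # False

# The new unconditional cast: nullable pandas dtypes handle all-NA columns.
print(col.astype("Int64").dtype)   # Int64
print(col.astype("string").dtype)  # string
```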
2 changes: 1 addition & 1 deletion src/user_config.yaml
```diff
@@ -2,7 +2,7 @@ config_validation:
   validate: True
   path: src/user_config_schema.yaml
 survey:
-  survey_type: "BERD"
+  survey_type: "PNP"
   survey_year: 2023
 global:
   # Staging and validation settings
```
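The default survey type flips from BERD to PNP, which ties the config to the frozen-group and validation changes above. For orientation only, a sketch of how a config like this is typically consumed; the loader shown is an assumption, not the pipeline's actual code:

```python
import yaml

with open("src/user_config.yaml") as f:
    user_config = yaml.safe_load(f)

survey_type = user_config["survey"]["survey_type"]
print(survey_type)  # "PNP" after this change

# Downstream steps can branch on it, e.g. the frozen-group deduplication
# above, which PNP runs now skip by default.
```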
12 changes: 4 additions & 8 deletions tests/test_staging/test_validation.py
```diff
@@ -88,14 +88,10 @@ def test_validate_data_with_schema(mock_load_schema):
     # Call the function to be tested
     validate_data_with_schema(dumy_data, "mock_schema.toml")

-    # NOTE: we can't just check for data type 'int', the python built-in type, as the data type
-    # of a pandas series is a numpy dtype, eg. numpy.int64 (copilot help)
-    # Apparently this is not the case for string and float, so we can use the python built-in types
-
-    assert dumy_data["col1"].dtypes == np.int64
-    assert dumy_data["col2"].dtypes == pd.StringDtype()
-    assert dumy_data["col3"].dtypes == float
-    assert pd.api.types.is_datetime64_any_dtype(dumy_data["col4"].dtypes)
+    assert pd.api.types.is_integer_dtype(dumy_data["col1"].dtype), "col1 should be of integer type"
+    assert pd.api.types.is_string_dtype(dumy_data["col2"].dtype), "col2 should be of string type"
+    assert pd.api.types.is_float_dtype(dumy_data["col3"].dtype), "col3 should be of float type"
+    assert pd.api.types.is_datetime64_any_dtype(dumy_data["col4"].dtype), "col4 should be of datetime type"


 # Mock the schemas data
```
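The rewritten assertions swap exact dtype equality (and the `.dtypes` alias) for pandas' dtype-family helpers on `.dtype`, so the test no longer cares whether a column lands as `int64`, `int32`, or nullable `Int64`. A short demonstration of why the family check is more robust:

```python
import numpy as np
import pandas as pd

s = pd.Series([1, 2, 3], dtype="Int64")  # nullable integer dtype

print(s.dtype == np.int64)                     # False: exact equality is too strict
print(pd.api.types.is_integer_dtype(s.dtype))  # True: any integer flavour passes
```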
