From a0f4546b576ace5be9a3a6228073126d8993bbd1 Mon Sep 17 00:00:00 2001 From: Lewis Date: Wed, 29 Jan 2025 09:42:21 +0000 Subject: [PATCH 1/3] removed previous fix --- src/staging/validation.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/staging/validation.py b/src/staging/validation.py index c733fb81b..5611604a7 100644 --- a/src/staging/validation.py +++ b/src/staging/validation.py @@ -156,9 +156,7 @@ def validate_data_with_schema(survey_df: pd.DataFrame, schema_path: str): # noq f"Failed to convert column '{column}' to datetime. Please check" " the data." ) - else: - if survey_df[column].isna().all() is False: - survey_df[column] = survey_df[column].astype(dtypes_dict[column]) + survey_df[column] = survey_df[column].astype(dtypes_dict[column]) except Exception as e: ValidationLogger.error(e) ValidationLogger.info("Validation successful") From 91f661a3ca32b2f77cd21e02f1a1c40a8ed82546 Mon Sep 17 00:00:00 2001 From: Anne Griffith Date: Tue, 4 Feb 2025 14:04:33 +0000 Subject: [PATCH 2/3] fix data types unit test --- tests/test_staging/test_validation.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/tests/test_staging/test_validation.py b/tests/test_staging/test_validation.py index c15fc3e0a..1ce0d44f8 100644 --- a/tests/test_staging/test_validation.py +++ b/tests/test_staging/test_validation.py @@ -88,14 +88,10 @@ def test_validate_data_with_schema(mock_load_schema): # Call the function to be tested validate_data_with_schema(dumy_data, "mock_schema.toml") - # NOTE: we can't just check for data type 'int', the python built-in type, as the data type - # of a pandas series is a numpy dtype, eg. numpy.int64 (copilot help) - # Apparently this is not the case for string and float, so we can use the python built-in types - - assert dumy_data["col1"].dtypes == np.int64 - assert dumy_data["col2"].dtypes == pd.StringDtype() - assert dumy_data["col3"].dtypes == float - assert pd.api.types.is_datetime64_any_dtype(dumy_data["col4"].dtypes) + assert pd.api.types.is_integer_dtype(dumy_data["col1"].dtype), "col1 should be of integer type" + assert pd.api.types.is_string_dtype(dumy_data["col2"].dtype), "col2 should be of string type" + assert pd.api.types.is_float_dtype(dumy_data["col3"].dtype), "col3 should be of float type" + assert pd.api.types.is_datetime64_any_dtype(dumy_data["col4"].dtype), "col4 should be of datetime type" # Mock the schemas data From 2b3cc491b2ee25f6127b0163bfae1a356642f2e4 Mon Sep 17 00:00:00 2001 From: Anne Griffith Date: Tue, 4 Feb 2025 16:56:41 +0000 Subject: [PATCH 3/3] remove de-aggregation in frozen group output --- src/outputs/frozen_group.py | 7 +++++-- src/user_config.yaml | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/outputs/frozen_group.py b/src/outputs/frozen_group.py index ad5a11a46..3fa5d2b3b 100644 --- a/src/outputs/frozen_group.py +++ b/src/outputs/frozen_group.py @@ -19,7 +19,7 @@ def output_frozen_group( config: Dict[str, Any], intram_tot_dict: Dict[str, int], write_csv: Callable, - deduplicate: bool = True, + deduplicate: bool = False, ) -> Dict[str, int]: """Creates a "frozen group" output for the entire UK. In BERD (GB) data, creates foreign ownership and cora status. Selects the columns we need for @@ -175,6 +175,9 @@ def output_frozen_group( df = pd.concat([df_gb_need, df_ni_need], ignore_index=True, axis=0) # Deduplicate by aggregation + # TODO: this code fails in DAP for PNP. Think whether it's necessary and + # TODO then refactor this, using a list of columns from the config + # TODO and considering whether there are extra cols in the PNP case. if deduplicate: df_agg = df.groupby(category_columns).agg("sum").reset_index() else: @@ -206,6 +209,6 @@ def output_frozen_group( # Outputting the CSV file filename = filename_amender("output_frozen_group", config) - write_csv(f"{output_path}output_frozen_group/{filename}", output) + write_csv(f"{output_path}/output_frozen_group/{filename}", output) return intram_tot_dict diff --git a/src/user_config.yaml b/src/user_config.yaml index 44afb5317..89a9d2189 100644 --- a/src/user_config.yaml +++ b/src/user_config.yaml @@ -2,7 +2,7 @@ config_validation: validate: True path: src/user_config_schema.yaml survey: - survey_type: "BERD" + survey_type: "PNP" survey_year: 2023 global: # Staging and validation settings