Merge pull request #416 from ONSdigital/RDRP-1149_fix_validation_error
RDRP-1149_fix_validation_error
AnneONS authored Feb 5, 2025
2 parents b1a2901 + 2b3cc49 commit 075dafb
Showing 4 changed files with 11 additions and 14 deletions.
7 changes: 5 additions & 2 deletions src/outputs/frozen_group.py
```diff
@@ -19,7 +19,7 @@ def output_frozen_group(
     config: Dict[str, Any],
     intram_tot_dict: Dict[str, int],
     write_csv: Callable,
-    deduplicate: bool = True,
+    deduplicate: bool = False,
 ) -> Dict[str, int]:
     """Creates a "frozen group" output for the entire UK. In BERD (GB) data,
     creates foreign ownership and cora status. Selects the columns we need for
@@ -175,6 +175,9 @@ def output_frozen_group(
     df = pd.concat([df_gb_need, df_ni_need], ignore_index=True, axis=0)

     # Deduplicate by aggregation
+    # TODO: this code fails in DAP for PNP. Think whether it's necessary and
+    # TODO then refactor this, using a list of columns from the config
+    # TODO and considering whether there are extra cols in the PNP case.
     if deduplicate:
         df_agg = df.groupby(category_columns).agg("sum").reset_index()
     else:
```
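For context on the step that is now off by default: deduplication collapses rows sharing the same category-column values by summing their numeric columns, which is plausibly what fails in DAP when the PNP data carries extra columns that cannot be summed (the TODO above flags exactly this). A minimal sketch of the pattern, with made-up column names standing in for the pipeline's real `category_columns`:

```python
import pandas as pd

# Illustrative stand-ins for the pipeline's category_columns
category_columns = ["period", "reference", "product_group"]

df = pd.DataFrame(
    {
        "period": [2023, 2023, 2023],
        "reference": ["A1", "A1", "B2"],
        "product_group": ["AA", "AA", "BB"],
        "intram": [100, 50, 30],
    }
)

# Rows with identical category values collapse into one; numeric columns are summed.
df_agg = df.groupby(category_columns).agg("sum").reset_index()
# The two ("A1", "AA") rows merge into a single row with intram == 150.
```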
```diff
@@ -206,6 +209,6 @@ def output_frozen_group(

     # Outputting the CSV file
     filename = filename_amender("output_frozen_group", config)
-    write_csv(f"{output_path}output_frozen_group/{filename}", output)
+    write_csv(f"{output_path}/output_frozen_group/{filename}", output)

     return intram_tot_dict
```
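The second hunk is a one-character path fix: without the separator, the f-string glues the parent directory straight onto the subdirectory name. An illustration with a made-up `output_path`:

```python
output_path = "/dapsen/workspace_zone/outputs"  # hypothetical path, for illustration only

broken = f"{output_path}output_frozen_group/file.csv"
fixed = f"{output_path}/output_frozen_group/file.csv"

print(broken)  # /dapsen/workspace_zone/outputsoutput_frozen_group/file.csv
print(fixed)   # /dapsen/workspace_zone/outputs/output_frozen_group/file.csv
```

Note the fix assumes `output_path` does not already end in a slash; if it did, the old string would have been correct and the new one would contain a double slash (usually harmless on most filesystems).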
4 changes: 1 addition & 3 deletions src/staging/validation.py
```diff
@@ -156,9 +156,7 @@ def validate_data_with_schema(survey_df: pd.DataFrame, schema_path: str):  # noq
                     f"Failed to convert column '{column}' to datetime. Please check"
                     " the data."
                 )
-            else:
-                if survey_df[column].isna().all() is False:
-                    survey_df[column] = survey_df[column].astype(dtypes_dict[column])
+            survey_df[column] = survey_df[column].astype(dtypes_dict[column])
         except Exception as e:
             ValidationLogger.error(e)
     ValidationLogger.info("Validation successful")
```
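This deletion is very likely the validation error the branch name refers to: `Series.isna().all()` returns a NumPy bool, not Python's `False` singleton, so the identity check `... is False` never held and the `astype` cast never ran. Casting unconditionally is also safe for empty columns, because pandas' nullable dtypes accept all-NA input. A small check of both points (illustrative, not from the repo):

```python
import pandas as pd

col = pd.Series([None, None, None])  # an all-NA column, dtype object

# The removed guard: identity comparison against the bool singleton.
# Series.all() returns numpy.bool_, which is never the Python False object,
# so this prints False whether or not the column holds data.
print(col.isna().all() is False)  # False

# The new unconditional cast: nullable pandas dtypes handle all-NA columns.
print(col.astype("Int64").dtype)   # Int64
print(col.astype("string").dtype)  # string
```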
2 changes: 1 addition & 1 deletion src/user_config.yaml
```diff
@@ -2,7 +2,7 @@ config_validation:
   validate: True
   path: src/user_config_schema.yaml
 survey:
-  survey_type: "BERD"
+  survey_type: "PNP"
   survey_year: 2023
 global:
   # Staging and validation settings
```
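The default survey type flips from BERD to PNP, which ties the config to the frozen-group and validation changes above. For orientation only, a sketch of how a config like this is typically consumed; the loader shown is an assumption, not the pipeline's actual code:

```python
import yaml

with open("src/user_config.yaml") as f:
    user_config = yaml.safe_load(f)

survey_type = user_config["survey"]["survey_type"]
print(survey_type)  # "PNP" after this change

# Downstream steps can branch on it, e.g. the frozen-group deduplication
# above, which PNP runs now skip by default.
```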
12 changes: 4 additions & 8 deletions tests/test_staging/test_validation.py
```diff
@@ -88,14 +88,10 @@ def test_validate_data_with_schema(mock_load_schema):
     # Call the function to be tested
     validate_data_with_schema(dumy_data, "mock_schema.toml")

-    # NOTE: we can't just check for data type 'int', the python built-in type, as the data type
-    # of a pandas series is a numpy dtype, eg. numpy.int64 (copilot help)
-    # Apparently this is not the case for string and float, so we can use the python built-in types
-
-    assert dumy_data["col1"].dtypes == np.int64
-    assert dumy_data["col2"].dtypes == pd.StringDtype()
-    assert dumy_data["col3"].dtypes == float
-    assert pd.api.types.is_datetime64_any_dtype(dumy_data["col4"].dtypes)
+    assert pd.api.types.is_integer_dtype(dumy_data["col1"].dtype), "col1 should be of integer type"
+    assert pd.api.types.is_string_dtype(dumy_data["col2"].dtype), "col2 should be of string type"
+    assert pd.api.types.is_float_dtype(dumy_data["col3"].dtype), "col3 should be of float type"
+    assert pd.api.types.is_datetime64_any_dtype(dumy_data["col4"].dtype), "col4 should be of datetime type"


 # Mock the schemas data
```
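The rewritten assertions swap exact dtype equality (and the `.dtypes` alias) for pandas' dtype-family helpers on `.dtype`, so the test no longer cares whether a column lands as `int64`, `int32`, or nullable `Int64`. A short demonstration of why the family check is more robust:

```python
import numpy as np
import pandas as pd

s = pd.Series([1, 2, 3], dtype="Int64")  # nullable integer dtype

print(s.dtype == np.int64)                     # False: exact equality is too strict
print(pd.api.types.is_integer_dtype(s.dtype))  # True: any integer flavour passes
```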
