Separate functions and correct tests

digital-land · Apr 15, 2024 · adddaa4 · adddaa4
1 parent 24594c2
commit adddaa4
Show file tree

Hide file tree

Showing 3 changed files with 71 additions and 77 deletions.
diff --git a/digital_land/expectations/checkpoints/converted_resource.py b/digital_land/expectations/checkpoints/converted_resource.py
@@ -1,13 +1,9 @@
-# checkpoint needs to assemble class state
-# it needs to validate inputs specific for that checkpoint
-# it then needs to run expectations
-# then it needs to be able to save those expectation resultts
-# a checkpoint represents the moment in the process where we tell it the
-# type of data it is validating and where the data is
-# the primary different between checkpoints is how it loads expectations (i.e. where that are loaded from)
 from pathlib import Path
-import csv
 from .base import BaseCheckpoint
+from ..expectation_functions.resource_validations import (
+    check_for_duplicate_references,
+    validate_references,
+)
 
 
 class ConvertedResourceCheckpoint(BaseCheckpoint):
@@ -18,68 +14,15 @@ def __init__(self, data_path):
     def load(self):
         self.expectations = [
             {
-                "function": self.check_for_duplicate_references,
+                "function": check_for_duplicate_references(self.csv_path),
                 "name": "Check for Duplicate References",
                 "severity": "error",
                 "responsibility": "system",
             },
             {
-                "function": self.validate_references,
+                "function": validate_references(self.csv_path),
                 "name": "Validate References",
                 "severity": "error",
                 "responsibility": "system",
             },
         ]
-
-    def check_for_duplicate_references(self):
-        duplicates = {}
-        issues = []
-
-        with self.csv_path.open(newline="") as csvfile:
-            reader = csv.DictReader(csvfile)
-            for row_number, row in enumerate(reader, start=1):
-                ref = row.get("reference")
-                if ref in duplicates:
-                    duplicates[ref].append(row_number)
-                else:
-                    duplicates[ref] = [row_number]
-
-        for ref, rows in duplicates.items():
-            if len(rows) > 1:
-                issues.append(
-                    {
-                        "scope": "row-group",
-                        "message": f"Duplicate reference '{ref}' found on rows: {', '.join(map(str, rows))}",
-                        "dataset": "dataset",
-                        "table_name": "resource",
-                        "rows": rows,
-                        "row_id": str(rows[0]),
-                        "organisation": "organisation",
-                    }
-                )
-
-        return True, "Checked for duplicate references.", issues
-
-    def validate_references(self):
-        issues = []
-
-        with self.csv_path.open(newline="") as csvfile:
-            reader = csv.DictReader(csvfile)
-            for row_number, row in enumerate(reader, start=1):
-                ref = row.get("reference")
-
-                if not ref:  # This will be True for both None and empty strings
-                    issues.append(
-                        {
-                            "scope": "value",
-                            "message": f"Reference is missing on row {row_number}.",
-                            "dataset": "dataset",
-                            "table_name": "resource",
-                            "field_name": "reference",
-                            "row_id": str(row_number),
-                            "value": "reference",
-                            "organisation": "organisation",
-                        }
-                    )
-
-        return len(issues) == 0, "Checked for unpopulated references.", issues
diff --git a/digital_land/expectations/expectation_functions/resource_validations.py b/digital_land/expectations/expectation_functions/resource_validations.py
@@ -0,0 +1,56 @@
+import csv
+
+
+def check_for_duplicate_references(csv_path):
+    duplicates = {}
+    issues = []
+
+    with csv_path.open(newline="") as csvfile:
+        reader = csv.DictReader(csvfile)
+        for row_number, row in enumerate(reader, start=1):
+            ref = row.get("reference")
+            if ref in duplicates:
+                duplicates[ref].append(row_number)
+            else:
+                duplicates[ref] = [row_number]
+
+    for ref, rows in duplicates.items():
+        if len(rows) > 1:
+            issues.append(
+                {
+                    "scope": "row-group",
+                    "message": f"Duplicate reference '{ref}' found on rows: {', '.join(map(str, rows))}",
+                    "dataset": "dataset",
+                    "table_name": "resource",
+                    "rows": rows,
+                    "row_id": str(rows[0]),
+                    "organisation": "organisation",
+                }
+            )
+
+    return issues
+
+
+def validate_references(csv_path):
+    issues = []
+
+    with csv_path.open(newline="") as csvfile:
+        reader = csv.DictReader(csvfile)
+        for row_number, row in enumerate(reader, start=1):
+            ref = row.get("reference")
+
+            if not ref:  # This will be True for both None and empty strings
+                issues.append(
+                    {
+                        "scope": "value",
+                        "message": f"Reference is missing on row {row_number}.",
+                        "dataset": "dataset",
+                        "table_name": "resource",
+                        "field_name": "reference",
+                        "row_id": str(row_number),
+                        "value": "Missing",
+                        "organisation": "organisation",
+                    }
+                )
+
+    return issues
diff --git a/tests/integration/expectations/test_checkpoint.py b/tests/integration/expectations/test_checkpoint.py
@@ -4,8 +4,9 @@
 import pandas as pd
 from csv import DictReader, DictWriter
 from digital_land.expectations.checkpoints.dataset import DatasetCheckpoint
-from digital_land.expectations.checkpoints.converted_resource import (
-    ConvertedResourceCheckpoint,
+from digital_land.expectations.expectation_functions.resource_validations import (
+    check_for_duplicate_references,
+    validate_references,
 )
 
 
@@ -148,30 +149,24 @@ def test_run_checkpoint_failure(tmp_path, sqlite3_with_entity_tables_path):
 
 
 def test_check_for_duplicate_references(csv_path):
-    checkpoint = ConvertedResourceCheckpoint(data_path=csv_path)
-    checkpoint.load()
-
-    success, message, issues = checkpoint.check_for_duplicate_references()
+    issues = check_for_duplicate_references(csv_path)
 
-    assert success is True, "The function should successfully identify issues."
+    assert issues, "The function should successfully identify issues."
     assert len(issues) == 1, "There should be one issue identified."
     assert (
-        issues[0]["scope"] == "duplicate_reference"
+        issues[0]["scope"] == "row-group"
     ), "The issue should be identified as a duplicate reference."
     assert (
         "REF-001" in issues[0]["message"]
     ), "REF-001 should be identified as a duplicate."
 
 
 def test_validate_references(csv_path):
-    checkpoint = ConvertedResourceCheckpoint(data_path=csv_path)
-    checkpoint.load()
-
-    success, message, issues = checkpoint.validate_references()
+    issues = validate_references(csv_path)
 
-    assert success is False, "The function should fail due to invalid references."
+    assert issues, "The function should fail due to invalid references."
     assert len(issues) == 1, "There should be one issue identified."
     assert (
-        issues[0]["scope"] == "invalid_reference"
+        issues[0]["scope"] == "value"
     ), "The issue should be identified as an invalid reference."
     assert "" in issues[0]["message"], " 4th value should be identified as invalid."