Skip to content

Commit

Permalink
Separate functions and correct tests
Browse files Browse the repository at this point in the history
  • Loading branch information
JbannisterScottLogic committed Apr 15, 2024
1 parent 24594c2 commit adddaa4
Show file tree
Hide file tree
Showing 3 changed files with 71 additions and 77 deletions.
69 changes: 6 additions & 63 deletions digital_land/expectations/checkpoints/converted_resource.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,9 @@
# checkpoint needs to assemble class state
# it needs to validate inputs specific for that checkpoint
# it then needs to run expectations
# then it needs to be able to save those expectation results
# a checkpoint represents the moment in the process where we tell it the
# type of data it is validating and where the data is
# the primary difference between checkpoints is how each one loads its expectations (i.e. where they are loaded from)
from pathlib import Path
import csv
from .base import BaseCheckpoint
from ..expectation_functions.resource_validations import (
check_for_duplicate_references,
validate_references,
)


class ConvertedResourceCheckpoint(BaseCheckpoint):
Expand All @@ -18,68 +14,15 @@ def __init__(self, data_path):
def load(self):
self.expectations = [
{
"function": self.check_for_duplicate_references,
"function": check_for_duplicate_references(self.csv_path),
"name": "Check for Duplicate References",
"severity": "error",
"responsibility": "system",
},
{
"function": self.validate_references,
"function": validate_references(self.csv_path),
"name": "Validate References",
"severity": "error",
"responsibility": "system",
},
]

def check_for_duplicate_references(self):
    """Find ``reference`` values that occur on more than one row of the CSV.

    Reads the file at ``self.csv_path`` (opened via its ``open`` method) and
    groups 1-based data-row numbers by the value of the ``reference`` column;
    the header row is not counted.

    Rows whose reference is blank or absent are skipped: they are missing
    values rather than duplicates of one another, and grouping them would
    produce a misleading "Duplicate reference ''" / "'None'" issue.

    Returns:
        A ``(success, message, issues)`` tuple. ``success`` is always
        ``True`` (the check ran), ``message`` is a fixed description, and
        ``issues`` holds one ``"row-group"`` dict per duplicated reference,
        in order of first appearance.
    """
    rows_by_reference = {}

    with self.csv_path.open(newline="") as csvfile:
        for row_number, row in enumerate(csv.DictReader(csvfile), start=1):
            ref = row.get("reference")
            if not ref:
                # Blank/absent reference: a missing value, not a duplicate.
                continue
            rows_by_reference.setdefault(ref, []).append(row_number)

    # NOTE(review): "dataset" and "organisation" are literal placeholder
    # strings here -- confirm whether real values should be supplied.
    issues = [
        {
            "scope": "row-group",
            "message": f"Duplicate reference '{ref}' found on rows: {', '.join(map(str, rows))}",
            "dataset": "dataset",
            "table_name": "resource",
            "rows": rows,
            # The first row carrying the reference identifies the group.
            "row_id": str(rows[0]),
            "organisation": "organisation",
        }
        for ref, rows in rows_by_reference.items()
        if len(rows) > 1
    ]

    return True, "Checked for duplicate references.", issues

def validate_references(self):
    """Flag every row of the CSV whose ``reference`` value is missing.

    Reads the file at ``self.csv_path`` (opened via its ``open`` method).
    A reference counts as missing when it is an empty string or absent
    altogether. Row numbers are 1-based and count data rows only; the
    header row is not counted.

    Returns:
        A ``(success, message, issues)`` tuple. ``success`` is ``True`` only
        when no row is missing its reference, ``message`` is a fixed
        description, and ``issues`` holds one ``"value"`` dict per
        offending row, in file order.
    """
    with self.csv_path.open(newline="") as handle:
        missing = [
            {
                "scope": "value",
                "message": f"Reference is missing on row {line_no}.",
                "dataset": "dataset",
                "table_name": "resource",
                "field_name": "reference",
                "row_id": str(line_no),
                "value": "reference",
                "organisation": "organisation",
            }
            for line_no, record in enumerate(csv.DictReader(handle), start=1)
            # Falsy covers both None (column absent) and the empty string.
            if not record.get("reference")
        ]

    return not missing, "Checked for unpopulated references.", missing
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
import csv


def check_for_duplicate_references(csv_path):
    """Report every ``reference`` value that occurs on more than one CSV row.

    Rows whose reference is blank or absent are skipped: they are missing
    values rather than duplicates of one another, and grouping them would
    produce a misleading "Duplicate reference ''" / "'None'" issue.

    Args:
        csv_path: Location of the CSV file to inspect. Anything the built-in
            ``open`` accepts works (``str`` or ``pathlib.Path``).

    Returns:
        A list with one ``"row-group"`` issue dict per duplicated reference,
        in order of first appearance. Row numbers are 1-based and count data
        rows only (the header row is not counted). The list is empty when no
        reference is repeated.
    """
    rows_by_reference = {}

    # Built-in open() takes str and path-like objects alike, so callers are
    # not forced to wrap the path in pathlib.Path first.
    with open(csv_path, newline="") as csvfile:
        for row_number, row in enumerate(csv.DictReader(csvfile), start=1):
            ref = row.get("reference")
            if not ref:
                # Blank/absent reference: a missing value, not a duplicate.
                continue
            rows_by_reference.setdefault(ref, []).append(row_number)

    # NOTE(review): "dataset" and "organisation" are literal placeholder
    # strings here -- confirm whether real values should be supplied.
    return [
        {
            "scope": "row-group",
            "message": f"Duplicate reference '{ref}' found on rows: {', '.join(map(str, rows))}",
            "dataset": "dataset",
            "table_name": "resource",
            "rows": rows,
            # The first row carrying the reference identifies the group.
            "row_id": str(rows[0]),
            "organisation": "organisation",
        }
        for ref, rows in rows_by_reference.items()
        if len(rows) > 1
    ]


def validate_references(csv_path):
    """Report every CSV row whose ``reference`` value is missing.

    A reference counts as missing when it is an empty string or absent
    altogether (for example, when the file has no ``reference`` column).

    Args:
        csv_path: Location of the CSV file to inspect. Anything the built-in
            ``open`` accepts works (``str`` or ``pathlib.Path``).

    Returns:
        A list with one ``"value"`` issue dict per offending row, in file
        order. Row numbers are 1-based and count data rows only (the header
        row is not counted). The list is empty when every row has a
        reference.
    """
    # Built-in open() takes str and path-like objects alike, so callers are
    # not forced to wrap the path in pathlib.Path first.
    with open(csv_path, newline="") as csvfile:
        # NOTE(review): "dataset" and "organisation" are literal placeholder
        # strings here -- confirm whether real values should be supplied.
        return [
            {
                "scope": "value",
                "message": f"Reference is missing on row {row_number}.",
                "dataset": "dataset",
                "table_name": "resource",
                "field_name": "reference",
                "row_id": str(row_number),
                "value": "Missing",
                "organisation": "organisation",
            }
            for row_number, row in enumerate(csv.DictReader(csvfile), start=1)
            # Falsy covers both None (column absent) and the empty string.
            if not row.get("reference")
        ]
23 changes: 9 additions & 14 deletions tests/integration/expectations/test_checkpoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,9 @@
import pandas as pd
from csv import DictReader, DictWriter
from digital_land.expectations.checkpoints.dataset import DatasetCheckpoint
from digital_land.expectations.checkpoints.converted_resource import (
ConvertedResourceCheckpoint,
from digital_land.expectations.expectation_functions.resource_validations import (
check_for_duplicate_references,
validate_references,
)


Expand Down Expand Up @@ -148,30 +149,24 @@ def test_run_checkpoint_failure(tmp_path, sqlite3_with_entity_tables_path):


def test_check_for_duplicate_references(csv_path):
checkpoint = ConvertedResourceCheckpoint(data_path=csv_path)
checkpoint.load()

success, message, issues = checkpoint.check_for_duplicate_references()
issues = check_for_duplicate_references(csv_path)

assert success is True, "The function should successfully identify issues."
assert issues, "The function should successfully identify issues."
assert len(issues) == 1, "There should be one issue identified."
assert (
issues[0]["scope"] == "duplicate_reference"
issues[0]["scope"] == "row-group"
), "The issue should be identified as a duplicate reference."
assert (
"REF-001" in issues[0]["message"]
), "REF-001 should be identified as a duplicate."


def test_validate_references(csv_path):
checkpoint = ConvertedResourceCheckpoint(data_path=csv_path)
checkpoint.load()

success, message, issues = checkpoint.validate_references()
issues = validate_references(csv_path)

assert success is False, "The function should fail due to invalid references."
assert issues, "The function should fail due to invalid references."
assert len(issues) == 1, "There should be one issue identified."
assert (
issues[0]["scope"] == "invalid_reference"
issues[0]["scope"] == "value"
), "The issue should be identified as an invalid reference."
assert "" in issues[0]["message"], " 4th value should be identified as invalid."

0 comments on commit adddaa4

Please sign in to comment.