Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Post conversion expectations #188

Open
wants to merge 60 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 9 commits
Commits
Show all changes
60 commits
Select commit Hold shift + click to select a range
977a8a8
Updated
JbannisterScottLogic Apr 5, 2024
dec636c
Added unit tests and integrated into convert
JbannisterScottLogic Apr 9, 2024
113dbef
Updated verification
JbannisterScottLogic Apr 10, 2024
77b6cbc
Adjust issue factory
JbannisterScottLogic Apr 10, 2024
0b1f12f
Issue adjustments
JbannisterScottLogic Apr 10, 2024
9f68310
Changed value
JbannisterScottLogic Apr 10, 2024
b71b479
Value changes
JbannisterScottLogic Apr 10, 2024
d8ef949
Adjust convert.py
JbannisterScottLogic Apr 10, 2024
9fde2ae
Test fixes
JbannisterScottLogic Apr 10, 2024
45b0e11
Chanegs to issues
JbannisterScottLogic Apr 15, 2024
24594c2
Change to reference
JbannisterScottLogic Apr 15, 2024
adddaa4
Separate functions and correct tests
JbannisterScottLogic Apr 15, 2024
dab0d77
Changes back to helpers
JbannisterScottLogic Apr 15, 2024
f162dcf
Fix
JbannisterScottLogic Apr 15, 2024
c1f9081
Core changes
JbannisterScottLogic Apr 16, 2024
c1c218c
Import change
JbannisterScottLogic Apr 16, 2024
a046e7d
Parameter changes
JbannisterScottLogic Apr 16, 2024
7769264
Changes to convert
JbannisterScottLogic Apr 16, 2024
cc64e30
Fix
JbannisterScottLogic Apr 16, 2024
e131164
Typology change
JbannisterScottLogic Apr 16, 2024
cc983ce
Add Process
JbannisterScottLogic Apr 16, 2024
d26369f
Add process parameter
JbannisterScottLogic Apr 16, 2024
fceb81a
Query runner adjustments
JbannisterScottLogic Apr 16, 2024
324b2c1
Fix converted resource
JbannisterScottLogic Apr 16, 2024
1c5d640
Change pathing
JbannisterScottLogic Apr 17, 2024
92a4ae4
Updated
JbannisterScottLogic Apr 5, 2024
f1e0d7a
Added unit tests and integrated into convert
JbannisterScottLogic Apr 9, 2024
d4c98c0
Updated verification
JbannisterScottLogic Apr 10, 2024
2666a59
Adjust issue factory
JbannisterScottLogic Apr 10, 2024
518196a
Issue adjustments
JbannisterScottLogic Apr 10, 2024
febdbac
Changed value
JbannisterScottLogic Apr 10, 2024
4efc16d
Value changes
JbannisterScottLogic Apr 10, 2024
91e5c18
Adjust convert.py
JbannisterScottLogic Apr 10, 2024
238607f
Test fixes
JbannisterScottLogic Apr 10, 2024
d3ecda3
Chanegs to issues
JbannisterScottLogic Apr 15, 2024
e22412f
Change to reference
JbannisterScottLogic Apr 15, 2024
4bc8119
Separate functions and correct tests
JbannisterScottLogic Apr 15, 2024
4b0a437
Changes back to helpers
JbannisterScottLogic Apr 15, 2024
568f456
Fix
JbannisterScottLogic Apr 15, 2024
2eb2134
Core changes
JbannisterScottLogic Apr 16, 2024
4338c8b
Import change
JbannisterScottLogic Apr 16, 2024
0851420
Parameter changes
JbannisterScottLogic Apr 16, 2024
0db28da
Changes to convert
JbannisterScottLogic Apr 16, 2024
e552ff5
Fix
JbannisterScottLogic Apr 16, 2024
965d1bc
Typology change
JbannisterScottLogic Apr 16, 2024
13df751
Add Process
JbannisterScottLogic Apr 16, 2024
eb3b67e
Add process parameter
JbannisterScottLogic Apr 16, 2024
d7fc4f7
Query runner adjustments
JbannisterScottLogic Apr 16, 2024
b5ebc71
Fix converted resource
JbannisterScottLogic Apr 16, 2024
7b60741
Change pathing
JbannisterScottLogic Apr 17, 2024
9ce9e0a
Merge branch 'post-conversion-expectations' of github.com:digital-lan…
cjohns-scottlogic Apr 18, 2024
954735a
Set field name of items in ValueIssue. Small fixes to PostConversionP…
cjohns-scottlogic Apr 18, 2024
18b9b34
Converted file to unix format (so they diff easier with main)
cjohns-scottlogic Apr 18, 2024
2cfb750
Renamed dataset checkpoint test names to make them a bit clearer.
cjohns-scottlogic Apr 18, 2024
f6c2ce0
WIP
cjohns-scottlogic Apr 22, 2024
7e1371f
Merge branch 'main' into post-conversion-expectations
cjohns-scottlogic Apr 22, 2024
979e6e4
Post-merge fixes.
cjohns-scottlogic Apr 22, 2024
8284c8a
Updated PostConversionPhase to output to issues instead.
cjohns-scottlogic Apr 23, 2024
2a19aae
Removed converted resource expectation.
cjohns-scottlogic Apr 23, 2024
77bbff5
WIP: Run the ckecks on the pipeline data.
cjohns-scottlogic Apr 23, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 76 additions & 3 deletions digital_land/expectations/checkpoints/converted_resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,82 @@
# a checkpoint represents the moment in the process where we tell it the
# type of data it is validating and where the data is
# the primary difference between checkpoints is how they load expectations (i.e. where they are loaded from)
from pathlib import Path
import csv
from .base import BaseCheckpoint


class CovertedResourceCheckpoint(BaseCheckpoint):
    """Stub checkpoint for converted resources; it defines no expectations.

    NOTE(review): the class name looks like a typo of "Converted"; it is kept
    unchanged here so existing references keep resolving.
    """

    def load(self):
        """Load the expectations for this checkpoint (currently a no-op)."""
        # `self` is required so that the method can be called on an instance.
        pass
class ConvertedResourceCheckpoint(BaseCheckpoint):
    """Checkpoint that checks the ``reference`` column of a converted CSV file.

    Two expectations are registered by :meth:`load`: one reporting references
    that occur on more than one row, and one reporting rows whose reference
    is missing or empty.
    """

    def __init__(self, data_path):
        """Create the checkpoint for the CSV file located at ``data_path``."""
        super().__init__("converted_resource", data_path)
        self.csv_path = Path(data_path)

    def load(self):
        """Populate ``self.expectations`` with the reference checks."""
        self.expectations = [
            {
                "function": self.check_for_duplicate_references,
                "name": "Check for Duplicate References",
                "severity": "error",
                "responsibility": "system",
            },
            {
                "function": self.validate_references,
                "name": "Validate References",
                "severity": "error",
                "responsibility": "system",
            },
        ]

    def _iter_references(self):
        """Yield ``(row_number, reference)`` for every data row of the CSV.

        Row numbers start at 1 and count data rows only; the header line is
        consumed by ``csv.DictReader``. ``reference`` is ``None`` when the
        column is absent from the row.
        """
        with self.csv_path.open(newline="") as csvfile:
            for row_number, row in enumerate(csv.DictReader(csvfile), start=1):
                yield row_number, row.get("reference")

    def check_for_duplicate_references(self):
        """Report every non-empty reference that appears on more than one row.

        Returns:
            A ``(success, message, issues)`` tuple. ``success`` is always
            ``True``; duplicates are communicated through ``issues`` only.
        """
        rows_by_reference = {}
        for row_number, ref in self._iter_references():
            # Blank or missing references are reported by validate_references;
            # grouping them here would only yield a bogus "duplicate" of
            # '' / None covering every unpopulated row.
            if not ref:
                continue
            rows_by_reference.setdefault(ref, []).append(row_number)

        issues = [
            {
                "scope": "duplicate_reference",
                "message": f"Duplicate reference '{ref}' found on rows: {', '.join(map(str, rows))}",
                "rows": rows,
                "reference": ref,
                # NOTE(review): "dataset" / "organisation" look like
                # placeholder values - confirm what should be supplied here.
                "dataset": "dataset",
                "field_name": "reference",
                "row_id": str(rows[0]),
                "organisation": "organisation",
            }
            for ref, rows in rows_by_reference.items()
            if len(rows) > 1
        ]

        return True, "Checked for duplicate references.", issues

    def validate_references(self):
        """Report every row whose reference is missing or an empty string.

        Returns:
            A ``(success, message, issues)`` tuple where ``success`` is
            ``True`` only when no issue was found.
        """
        issues = [
            {
                "scope": "invalid_reference",
                "message": f"Reference is missing on row {row_number}.",
                "row": row_number,
                "reference": ref,
                # NOTE(review): "dataset" / "organisation" look like
                # placeholder values - confirm what should be supplied here.
                "dataset": "dataset",
                "field_name": "reference",
                "row_id": str(row_number),
                "organisation": "organisation",
            }
            for row_number, ref in self._iter_references()
            if not ref  # true for both None and ""
        ]

        return len(issues) == 0, "Checked for unpopulated references.", issues
28 changes: 28 additions & 0 deletions digital_land/expectations/issue.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ def issue_factory(scope):
"row-group": RowGroupIssue,
"row": RowIssue,
"value": ValueIssue,
"duplicate_reference": DuplicateReferenceIssue,
"invalid_reference": InvalidReferenceIssue,
}
if scope in SCOPE_MAP:
return SCOPE_MAP[scope]
Expand Down Expand Up @@ -129,3 +131,29 @@ def __post_init__(self):
issue_scope = "value"
if self.scope != issue_scope:
raise ValueError(f"scope must be '{issue_scope}'.")


@dataclass
class DuplicateReferenceIssue(Issue):
    """Issue whose scope must be ``"duplicate_reference"``.

    Adds the ``dataset``, ``field_name``, ``rows`` and ``organisation``
    fields on top of those declared by ``Issue``.
    """

    dataset: str
    field_name: str = field(metadata=config(field_name="field_name"))
    rows: list = field(metadata=config(field_name="rows"))
    organisation: str

    def __post_init__(self):
        """Reject any scope other than the one this issue type represents."""
        expected_scope = "duplicate_reference"
        if self.scope == expected_scope:
            return
        raise ValueError(f"scope must be '{expected_scope}'.")


@dataclass
class InvalidReferenceIssue(Issue):
    """Issue whose scope must be ``"invalid_reference"``.

    Adds the ``dataset``, ``field_name``, ``row_id`` and ``organisation``
    fields on top of those declared by ``Issue``.
    """

    dataset: str
    field_name: str = field(metadata=config(field_name="field_name"))
    row_id: str = field(metadata=config(field_name="row_id"))
    organisation: str

    def __post_init__(self):
        """Reject any scope other than the one this issue type represents."""
        expected_scope = "invalid_reference"
        if self.scope == expected_scope:
            return
        raise ValueError(f"scope must be '{expected_scope}'.")
32 changes: 32 additions & 0 deletions digital_land/phase/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@
import pandas as pd
from .load import Stream
from .phase import Phase
from digital_land.expectations.checkpoints.converted_resource import (
ConvertedResourceCheckpoint,
)


def detect_file_encoding(path):
Expand Down Expand Up @@ -187,12 +190,41 @@ def _read_text_file(self, input_path, encoding):

if converted_csv_file:
f.close()
self.run_checkpoint(converted_csv_file)
reader = read_csv(converted_csv_file)
else:
reader = f

return reader

def run_checkpoint(self, path):
    """Run the converted-resource checkpoint on ``path`` and log the outcome.

    Each issue returned by the checkpoint is logged at the level matching its
    ``severity`` ("error" or "warning"); any other or missing severity is
    logged at info level. Nothing is returned or raised for reported issues.
    """
    checkpoint = ConvertedResourceCheckpoint(data_path=path)
    checkpoint.load()
    result = checkpoint.run()

    # run() is expected to hand back a (result, issues) pair; anything else
    # (including None) is logged and otherwise ignored.
    if not (isinstance(result, tuple) and len(result) == 2):
        logging.error("Checkpoint did not return the expected result format.")
        return

    checkpoint_result, issues = result
    if not issues:
        logging.info(f"Checkpoint completed with result: {checkpoint_result}")
        return

    log_by_severity = {"error": logging.error, "warning": logging.warning}
    for issue in issues:
        log_message = self.format_issue_message(issue)
        # .get() so that an issue lacking "severity" is logged rather than
        # aborting the conversion with a KeyError.
        log = log_by_severity.get(issue.get("severity"), logging.info)
        log(log_message)

def format_issue_message(self, issue):
    """Build the one-line log message for a checkpoint issue.

    Args:
        issue: mapping describing the issue. ``message`` and ``severity`` are
            used when present; the location is taken from the first populated
            key among ``line_number``, ``row_id`` and ``row``.

    Returns:
        The formatted message; the location falls back to ``"N/A"`` and the
        severity to ``"unknown"`` when the issue does not provide them.
    """
    location = "N/A"
    # "line_number" is kept first for backward compatibility; the reference
    # checks identify the offending row through "row_id" / "row" instead.
    for key in ("line_number", "row_id", "row"):
        value = issue.get(key)
        if value is not None and value != "":
            location = value
            break

    message = issue.get("message", "")
    severity = issue.get("severity", "unknown")
    return f"Checkpoint Issue: {message} at line {location} (Severity: {severity})"

def _find_zip_file(self, input_file, suffix=".gml"):
zip_ = zipfile.ZipFile(input_file)
files = zip_.namelist()
Expand Down
51 changes: 50 additions & 1 deletion tests/integration/expectations/test_checkpoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,11 @@
import os
import spatialite
import pandas as pd
from csv import DictReader
from csv import DictReader, DictWriter
from digital_land.expectations.checkpoints.dataset import DatasetCheckpoint
from digital_land.expectations.checkpoints.converted_resource import (
ConvertedResourceCheckpoint,
)


@pytest.fixture
Expand Down Expand Up @@ -43,6 +46,22 @@ def sqlite3_with_entity_tables_path(tmp_path):
return dataset_path


@pytest.fixture
def csv_path(tmp_path):
    """Write a four-row CSV with one repeated and one blank reference.

    Returns the path of the file created under ``tmp_path``.
    """
    fieldnames = ["reference", "name"]
    records = [
        ("REF-001", "Test 1"),
        ("REF-002", "Test 2"),
        ("REF-001", "Test 3"),  # repeats the reference of the first row
        ("", "Test 4"),  # blank reference
    ]
    target = tmp_path / "test_data.csv"
    with target.open(mode="w", newline="") as handle:
        writer = DictWriter(handle, fieldnames=fieldnames)
        writer.writeheader()
        for record in records:
            writer.writerow(dict(zip(fieldnames, record)))
    return target


def test_run_checkpoint_success(tmp_path, sqlite3_with_entity_tables_path):
# load data
test_entity_data = pd.DataFrame.from_dict({"entity": [1], "name": ["test1"]})
Expand Down Expand Up @@ -126,3 +145,33 @@ def test_run_checkpoint_failure(tmp_path, sqlite3_with_entity_tables_path):
assert issues[0]["rows"] == ""
assert issues[0]["row"] != "" # Just check it's there
assert issues[0]["value"] == ""


def test_check_for_duplicate_references(csv_path):
    """The duplicate check reports REF-001 once and still returns success."""
    checkpoint = ConvertedResourceCheckpoint(data_path=csv_path)
    checkpoint.load()

    outcome = checkpoint.check_for_duplicate_references()
    success, _message, issues = outcome

    assert success is True, "The function should successfully identify issues."
    assert len(issues) == 1, "There should be one issue identified."

    duplicate_issue = issues[0]
    assert (
        duplicate_issue["scope"] == "duplicate_reference"
    ), "The issue should be identified as a duplicate reference."
    assert (
        "REF-001" in duplicate_issue["message"]
    ), "REF-001 should be identified as a duplicate."


def test_validate_references(csv_path):
    """The blank reference on the 4th data row is reported and fails the check."""
    checkpoint = ConvertedResourceCheckpoint(data_path=csv_path)
    checkpoint.load()

    success, message, issues = checkpoint.validate_references()

    assert success is False, "The function should fail due to invalid references."
    assert len(issues) == 1, "There should be one issue identified."
    assert (
        issues[0]["scope"] == "invalid_reference"
    ), "The issue should be identified as an invalid reference."
    # `"" in some_string` is always true, so assert on the reported row
    # instead: the blank reference is the 4th data row of the fixture.
    assert issues[0]["row"] == 4, "The 4th row should be identified as invalid."
    assert (
        "row 4" in issues[0]["message"]
    ), "The message should name the row with the missing reference."
Loading