Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/v0.4.0 #18

Merged
merged 7 commits into from
Jan 29, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions .devcontainer/devcontainer.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
// For format details, see https://aka.ms/devcontainer.json. For config options, see the
// README at: https://github.com/devcontainers/templates/tree/main/src/python
{
"name": "Python 3",
// Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile
"image": "mcr.microsoft.com/devcontainers/python:1-3.9-bookworm",

// Features to add to the dev container. More info: https://containers.dev/features.
"features": { "ghcr.io/devcontainers-contrib/features/poetry": "latest" },

// Use 'forwardPorts' to make a list of ports inside the container available locally.
// "forwardPorts": [],

// Use 'postCreateCommand' to run commands after the container is created.
"postCreateCommand": "poetry install",

// Configure tool-specific properties.
// "customizations": {},

// Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root.
"remoteUser": "vscode",
"customizations": {
"vscode": {
"extensions": [
"ms-python.python",
"vector-of-bool.gitflow"
]
}
}
}
2 changes: 1 addition & 1 deletion .github/workflows/run-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ jobs:
strategy:
fail-fast: true
matrix:
os: [ubuntu-latest, macos-12, Windows-latest]
os: [ubuntu-latest, macos-15, Windows-latest]
python:
- "3.9"
- "3.10"
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "dwcahandler"
version = "0.3.0"
version = "0.4.0b1"
description = "Python package to handle Darwin Core Archive (DwCA) operations. This includes creating a DwCA zip file from one or more csvs, reading a DwCA, merge two DwCAs, validate DwCA and delete records from DwCA based on one or more key columns"
authors = ["Atlas of Living Australia data team <[email protected]>"]
maintainers = ["Atlas of Living Australia data team <[email protected]>"]
Expand Down
40 changes: 25 additions & 15 deletions src/dwcahandler/dwca/core_dwca.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,10 @@ def convert_values(v):
csv_file_name = meta_elm.meta_element_type.file_name
with io.TextIOWrapper(zf.open(csv_file_name), encoding="utf-8") as csv_file:
dwc_headers = [f.field_name for f in meta_elm.fields if f.index is not None]
duplicates = [i for i in set(dwc_headers) if dwc_headers.count(i) > 1]
if len(duplicates) > 0:
raise ValueError(f"Duplicate columns {duplicates} specified in the "
f"metadata for {csv_file_name}")
csv_encoding = {key: convert_values(value) for key, value in
asdict(meta_elm.meta_element_type.csv_encoding).items()}
csv_content = self._read_csv(
Expand Down Expand Up @@ -825,17 +829,18 @@ def check_duplicates(self, content_keys_df, keys, error_file=None):
"""

def report_error(content, keys, message, condition, error_file=None):
log.error("%s found in keys %s", message, keys)
log.error("\n%s count\n%s", message, condition.sum())
log.error("\n%s", content.loc[condition.values, keys].index.tolist())
if error_file:
content.loc[condition.values, keys].to_csv(error_file, index=False)
content.loc[condition.values, keys].to_csv(error_file, index=False)

checks_status: bool = True
if len(keys) > 0:
empty_values_condition = content_keys_df.isnull()
if empty_values_condition.values.any():
report_error(content_keys_df, keys, "Empty Values", empty_values_condition)
log.error("Empty values found in %s. Total rows affected: %s", keys,
empty_values_condition.sum().sum())
log.error("Empty values found in dataframe row: %s",
content_keys_df.index[empty_values_condition.all(axis=1)].tolist())
if error_file:
report_error(content_keys_df, keys, "Empty Values", empty_values_condition)
checks_status = False

# check case-insensitive duplicates
Expand All @@ -846,8 +851,11 @@ def to_lower(df):
df_keys = to_lower(content_keys_df)
duplicate_condition = df_keys.duplicated(keep='first')
if duplicate_condition.values.any():
report_error(content_keys_df, keys, "Duplicate Values",
duplicate_condition, error_file)
log.error(f"Duplicate %s found. Total rows affected: %s", keys, duplicate_condition.sum())
log.error("Duplicate values: %s", pd.unique(content_keys_df[duplicate_condition].stack()))
if error_file:
report_error(content_keys_df, keys, "Duplicate Values",
duplicate_condition, error_file)
checks_status = False

return checks_status
Expand Down Expand Up @@ -888,7 +896,6 @@ def validate_content(self, content_type_to_validate: list[str] = None, error_fil

- No duplicate record keys
- Valid columns
- No duplicate columns

:param error_file: A file to record errors
:return: True if the DwCA is valid, False otherwise
Expand All @@ -907,10 +914,6 @@ def validate_content(self, content_type_to_validate: list[str] = None, error_fil
if not self._validate_columns(content):
return False

dup_cols = self._find_duplicate_columns(content)
if len(dup_cols) > 0:
return False

return True

def extract_csv_content(self, csv_info: CsvFileType,
Expand Down Expand Up @@ -1062,9 +1065,16 @@ def _read_csv(self,
ret_val.dropna(how="all", inplace=True)
log.debug("Extracted %d rows from csv %s", len(ret_val), csv_file)

# Strip column header spaces
ret_val.rename(str.strip, axis = 'columns', inplace=True)

return ret_val

except EmptyDataError:
log.error(f"The expected columns: %s are not present in the {csv_file}. "
f"The file may be empty", ','.join(columns))
if columns:
log.error(f"The file may be empty {csv_file}")
else:
log.error(f"The expected columns: %s are not present in the {csv_file}. "
f"The file may be empty", ','.join(columns))

return pd.DataFrame()
2 changes: 1 addition & 1 deletion src/dwcahandler/dwca/dwca_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ def merge_dwca(dwca_file: Union[str, BytesIO], delta_dwca_file: Union[str, Bytes
regen_ids=regen_ids, validate_delta=validate_delta_content)

@staticmethod
def validate_dwca(dwca_file: str, keys_lookup: dict = None, error_file: str = None):
def validate_dwca(dwca_file: Union[str, BytesIO], keys_lookup: dict = None, error_file: str = None):
"""Test a dwca for consistency

:param dwca_file: The path to the DwCA, or a BytesIO object holding the DwCA content
Expand Down
12 changes: 11 additions & 1 deletion src/dwcahandler/dwca/dwca_meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,16 @@ def __extract_meta_info(self, ns, node_elm, core_or_ext_type):
def extract_field_attr_value(field_elm, attrib):
return field_elm.attrib.get(attrib) if field_elm.attrib.get(attrib) else None

def __find_id_in_fields(local_fields, id_field):
index_number = id_field[0].attrib["index"] if len(id_field) > 0 else "0"
return next((item for item in local_fields if "index" in item.attrib and item.attrib["index"]==index_number), None)

fields = node_elm.findall(f'{ns}field')
id_field = []
if core_or_ext_type == 'core':
id_field = node_elm.findall(f'{ns}id')
else:
id_field = node_elm.findall(f'{ns}coreid')
file_name = node_elm.find(f'{ns}files').find(f'{ns}location').text
meta_element_info = MetaElementInfo(
core_or_ext_type=core_or_ext_type,
Expand All @@ -157,7 +166,8 @@ def extract_field_attr_value(field_elm, attrib):
charset_encoding=node_elm.attrib['encoding'],
file_name=file_name)
# set first field with index 0 if it's not present in list of fields
if fields[0].attrib['index'] != '0':
field_elm = __find_id_in_fields(fields, id_field)
if field_elm is None and len(id_field) > 0:
if CoreOrExtType.CORE == core_or_ext_type:
field_list = [Field(index=0, field_name="id")]
else:
Expand Down
17 changes: 17 additions & 0 deletions tests/input_files/dwca/dwca-sample1/meta.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
<?xml version="1.0" encoding="UTF-8"?>
<archive xmlns="http://rs.tdwg.org/dwc/text/">
<core rowType="http://rs.tdwg.org/dwc/terms/Occurrence" encoding="utf-8" fieldsTerminatedBy="," linesTerminatedBy="\n" fieldsEnclosedBy="" ignoreHeaderLines="1">
<files>
<location>occurrence.txt</location>
</files>
<id index="0" />
<field index="1" term="http://rs.tdwg.org/dwc/terms/occurrenceID"/>
<field index="2" term="http://rs.tdwg.org/dwc/terms/scientificName"/>
<field index="3" term="http://rs.tdwg.org/dwc/terms/decimalLatitude"/>
<field index="4" term="http://rs.tdwg.org/dwc/terms/decimalLongitude"/>
<field index="5" term="http://rs.tdwg.org/dwc/terms/eventDate"/>
<field index="6" term="http://rs.tdwg.org/dwc/terms/recordedBy"/>
<field index="7" term="http://rs.tdwg.org/dwc/terms/geodeticDatum"/>
<field index="8" term="http://rs.tdwg.org/dwc/terms/basisOfRecord"/>
</core>
</archive>
6 changes: 6 additions & 0 deletions tests/input_files/dwca/dwca-sample1/occurrence.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
id,occurrenceID,scientificName,decimalLatitude,decimalLongitude,eventDate,recordedBy,geodeticDatum,basisOfRecord
1,1001,SpeciesA,12.34,-56.78,2023-01-01,John Doe,WGS84,PreservedSpecimen
2,1002,SpeciesB,-34.56,78.90,2023-02-15,Jane Smith,WGS84,HumanObservation
3,1003,SpeciesC,0.123,45.678,2023-03-20,Bob Johnson,WGS84,FossilSpecimen
4,1004,SpeciesD,-23.456,-12.345,2023-04-10,Alice Brown,WGS84,MachineObservation
5,1005,SpeciesE,89.012,-67.890,2023-05-25,Charlie White,WGS84,PreservedSpecimen
15 changes: 15 additions & 0 deletions tests/input_files/dwca/dwca-sample2/meta.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
<archive xmlns="http://rs.tdwg.org/dwc/text/" metadata="metadata.xml">
<core encoding="UTF-8" fieldsTerminatedBy="\t" linesTerminatedBy="\n" fieldsEnclosedBy="" ignoreHeaderLines="1" rowType="http://rs.tdwg.org/dwc/terms/Occurrence">
<files>
<location>occurrence.txt</location>
</files>
<id index="0" />
<field default="WGS84" term="http://rs.tdwg.org/dwc/terms/geodeticDatum"/>
<field index="0" term="http://rs.gbif.org/terms/1.0/gbifID"/>
<field index="1" term="http://rs.tdwg.org/dwc/terms/occurrenceID"/>
<field index="2" term="http://purl.org/dc/terms/scientificName"/>
<field index="3" term="http://purl.org/dc/terms/decimalLatitude"/>
<field index="4" term="http://purl.org/dc/terms/decimalLongitude"/>
<field index="5" term="http://purl.org/dc/terms/basisOfRecord"/>
</core>
</archive>
3 changes: 3 additions & 0 deletions tests/input_files/dwca/dwca-sample2/occurrence.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
gbifID occurrenceID scientificName decimalLatitude decimalLongitude basisOfRecord
1 occ1 Euphorbia paralias -36.00000 150.5678 Observations
2 occ2 Acaciella angustissima -20.0000 145.1234 Observations
15 changes: 15 additions & 0 deletions tests/input_files/dwca/dwca-sample3/meta.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
<archive xmlns="http://rs.tdwg.org/dwc/text/" metadata="metadata.xml">
<core encoding="UTF-8" fieldsTerminatedBy="\t" linesTerminatedBy="\n" fieldsEnclosedBy="" ignoreHeaderLines="1" rowType="http://rs.tdwg.org/dwc/terms/Occurrence">
<files>
<location>occurrence.txt</location>
</files>
<id index="0" />
<field default="WGS84" term="http://rs.tdwg.org/dwc/terms/geodeticDatum"/>
<field index="0" term="http://rs.gbif.org/terms/1.0/gbifID"/>
<field index="1" term="http://rs.tdwg.org/dwc/terms/occurrenceID"/>
<field index="2" term="http://purl.org/dc/terms/scientificName"/>
<field index="3" term="http://purl.org/dc/terms/decimalLatitude"/>
<field index="4" term="http://purl.org/dc/terms/decimalLongitude"/>
<field index="5" term="http://purl.org/dc/terms/basisOfRecord"/>
</core>
</archive>
4 changes: 4 additions & 0 deletions tests/input_files/dwca/dwca-sample3/occurrence.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
gbifID occurrenceID scientificName decimalLatitude decimalLongitude basisOfRecord
1 Euphorbia paralias -36.00000 150.5678 Observations
2 occ2 Acaciella angustissima -20.0000 145.1234 Observations
3 occ3 Acaciella angustissima -20.0000 145.1234 Observations
15 changes: 15 additions & 0 deletions tests/input_files/dwca/dwca-sample4/meta.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
<?xml version="1.0" encoding="UTF-8"?>
<archive xmlns="http://rs.tdwg.org/dwc/text/">
<core rowType="http://rs.tdwg.org/dwc/terms/Occurrence" encoding="utf-8" fieldsTerminatedBy="," linesTerminatedBy="\n" fieldsEnclosedBy="" ignoreHeaderLines="1">
<files>
<location>occurrence.csv</location>
</files>
<id index="3" />
<field index="0" term="http://rs.tdwg.org/dwc/terms/eventDate"/>
<field index="1" term="http://rs.tdwg.org/dwc/terms/basisOfRecord"/>
<field index="2" term="http://rs.tdwg.org/dwc/terms/scientificName"/>
<field index="3" term="http://rs.tdwg.org/dwc/terms/catalogNumber"/>
<field index="4" term="http://rs.tdwg.org/dwc/terms/decimalLatitude"/>
<field index="5" term="http://rs.tdwg.org/dwc/terms/decimalLongitude"/>
</core>
</archive>
13 changes: 13 additions & 0 deletions tests/input_files/dwca/dwca-sample4/occurrence.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
eventDate,basisOfRecord,scientificName,catalogNumber,decimalLatitude,decimalLongitude
2015-07-18T12:58:00+00:00,Human Observation,Species A,014800,-30.00000,144
2015-07-18T12:58:31+00:00,Human Observation,Species B,,-31.00000,145
2015-07-18T18:16:52+00:00,Human Observation,Species C,014824,-32.00000,100.828059
2015-07-19T04:28:19+00:00,Human Observation,Species D,014823,-33.00000,101.820888
2015-07-19T18:29:25+00:00,Human Observation,Species A1,014822,-34.00000,102.821654
2015-07-20T18:03:12+00:00,Human Observation,Species A2,014821,-35.00000,104.999974
2015-07-21T18:06:58+00:00,Human Observation,Species A3,014802,-34.00000,120.889354
2015-07-22T04:42:47+00:00,Human Observation,Species B1,014800,-36.00000,150.308848
2015-07-22T17:54:18+00:00,Human Observation,Species B2,014800,-30.00000,146.240159
2015-07-22T23:09:51+00:00,Human Observation,Species C1,014799,-31.00000,150.783246
2015-07-23T17:37:26+00:00,Human Observation,Species D,014798,-40.00000,150.823468
2015-07-24T13:10:00+00:00,Human Observation,Species E,014823,-28.00000,115
14 changes: 14 additions & 0 deletions tests/input_files/dwca/dwca-sample5/meta.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
<?xml version="1.0" encoding="UTF-8"?>
<archive xmlns="http://rs.tdwg.org/dwc/text/">
<core rowType="http://rs.tdwg.org/dwc/terms/Occurrence" encoding="utf-8" fieldsTerminatedBy="," linesTerminatedBy="\n" fieldsEnclosedBy="" ignoreHeaderLines="1">
<files>
<location>occurrence.csv</location>
</files>
<field index="0" term="http://rs.tdwg.org/dwc/terms/eventDate"/>
<field index="1" term="http://rs.tdwg.org/dwc/terms/basisOfRecord"/>
<field index="2" term="http://rs.tdwg.org/dwc/terms/scientificName"/>
<field index="3" term="http://rs.tdwg.org/dwc/terms/catalogNumber"/>
<field index="4" term="http://rs.tdwg.org/dwc/terms/catalogNumber"/>
<field index="5" term="http://rs.tdwg.org/dwc/terms/decimalLongitude"/>
</core>
</archive>
13 changes: 13 additions & 0 deletions tests/input_files/dwca/dwca-sample5/occurrence.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
eventDate,basisOfRecord,scientificName,catalogNumber,decimalLatitude,decimalLongitude
2015-07-18T12:58:00+00:00,Human Observation,Species A,014826,-30,144
2015-07-18T12:58:31+00:00,Human Observation,Species A1,014825,-31.1111,145
2015-07-18T18:16:52+00:00,Human Observation,Species A2,014824,-32.085431,100.828059
2015-07-19T04:28:19+00:00,Human Observation,Species A3,014823,-33.097233,101.820888
2015-07-19T18:29:25+00:00,Human Observation,Species B1,014822,-34.099936,102.821654
2015-07-20T18:03:12+00:00,Human Observation,Species B2,014821,-35.893671,104.999974
2015-07-21T18:06:58+00:00,Human Observation,Species C,014820,-34.113747,120.889354
2015-07-22T04:42:47+00:00,Human Observation,Species C2,014810,-36,144.308848
2015-07-22T17:54:18+00:00,Human Observation,Species C3,014800,-30.440251,146.240159
2015-07-22T23:09:51+00:00,Human Observation,Species D,014799,-31.547195,150.783246
2015-07-23T17:37:26+00:00,Human Observation,Species D1,-40.481117,150.823468
2015-07-24T13:10:00+00:00,Human Observation,Species D2,014792,-28,115
3 changes: 3 additions & 0 deletions tests/input_files/sample/multimedia_header_with_space.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
catalogNumber ,identifier, format ,type
C4,https://images.ala.org.au/image/proxyImageThumbnailLarge?imageId=XXX,image/jpeg,StillImage
C5,https://images.ala.org.au/image/proxyImageThumbnailLarge?imageId=XYZ,image/jpeg,StillImage
4 changes: 4 additions & 0 deletions tests/input_files/sample/occ_header_with_space.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
catalogNumber ,basisOfRecord,scientificName ,license,decimalLatitude,decimalLongitude
C3,Human Observation,Ageratina adenophora,CC-BY 4.0 (Int),-10.0000,120.0000
C4,Human Observation,Ageratina adenophora,CC-BY 4.0 (Int),-11.1111,125.0000
C5,Human Observation,Delairea odorata,CC-BY 4.0 (Int),-12.085431,130.828059
Loading