Skip to content

Commit

Permalink
Release v0.4.0
Browse files Browse the repository at this point in the history
Release v0.4.0
  • Loading branch information
patkyn authored Jan 30, 2025
2 parents 836b6ef + d7bfd60 commit 56cbab2
Show file tree
Hide file tree
Showing 20 changed files with 330 additions and 20 deletions.
30 changes: 30 additions & 0 deletions .devcontainer/devcontainer.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
// For format details, see https://aka.ms/devcontainer.json. For config options, see the
// README at: https://github.com/devcontainers/templates/tree/main/src/python
{
"name": "Python 3",
// Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile
"image": "mcr.microsoft.com/devcontainers/python:1-3.9-bookworm",

// Features to add to the dev container. More info: https://containers.dev/features.
"features": { "ghcr.io/devcontainers-contrib/features/poetry": "latest" },

// Use 'forwardPorts' to make a list of ports inside the container available locally.
// "forwardPorts": [],

// Use 'postCreateCommand' to run commands after the container is created.
"postCreateCommand": "poetry install",

// Configure tool-specific properties.
// "customizations": {},

// Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root.
"remoteUser": "vscode",
"customizations": {
"vscode": {
"extensions": [
"ms-python.python",
"vector-of-bool.gitflow"
]
}
}
}
2 changes: 1 addition & 1 deletion .github/workflows/run-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ jobs:
strategy:
fail-fast: true
matrix:
os: [ubuntu-latest, macos-12, Windows-latest]
os: [ubuntu-latest, macos-15, Windows-latest]
python:
- "3.9"
- "3.10"
Expand Down
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "dwcahandler"
version = "0.3.0"
version = "0.4.0"
description = "Python package to handle Darwin Core Archive (DwCA) operations. This includes creating a DwCA zip file from one or more csvs, reading a DwCA, merge two DwCAs, validate DwCA and delete records from DwCA based on one or more key columns"
authors = ["Atlas of Living Australia data team <[email protected]>"]
maintainers = ["Atlas of Living Australia data team <[email protected]>"]
Expand All @@ -26,4 +26,4 @@ requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"

[tool.pytest.ini_options]
pythonpath = "src"
pythonpath = "src"
40 changes: 25 additions & 15 deletions src/dwcahandler/dwca/core_dwca.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,10 @@ def convert_values(v):
csv_file_name = meta_elm.meta_element_type.file_name
with io.TextIOWrapper(zf.open(csv_file_name), encoding="utf-8") as csv_file:
dwc_headers = [f.field_name for f in meta_elm.fields if f.index is not None]
duplicates = [i for i in set(dwc_headers) if dwc_headers.count(i) > 1]
if len(duplicates) > 0:
raise ValueError(f"Duplicate columns {duplicates} specified in the "
f"metadata for {csv_file_name}")
csv_encoding = {key: convert_values(value) for key, value in
asdict(meta_elm.meta_element_type.csv_encoding).items()}
csv_content = self._read_csv(
Expand Down Expand Up @@ -825,17 +829,18 @@ def check_duplicates(self, content_keys_df, keys, error_file=None):
"""

def report_error(content, keys, message, condition, error_file=None):
log.error("%s found in keys %s", message, keys)
log.error("\n%s count\n%s", message, condition.sum())
log.error("\n%s", content.loc[condition.values, keys].index.tolist())
if error_file:
content.loc[condition.values, keys].to_csv(error_file, index=False)
content.loc[condition.values, keys].to_csv(error_file, index=False)

checks_status: bool = True
if len(keys) > 0:
empty_values_condition = content_keys_df.isnull()
if empty_values_condition.values.any():
report_error(content_keys_df, keys, "Empty Values", empty_values_condition)
log.error("Empty values found in %s. Total rows affected: %s", keys,
empty_values_condition.sum().sum())
log.error("Empty values found in dataframe row: %s",
content_keys_df.index[empty_values_condition.all(axis=1)].tolist())
if error_file:
report_error(content_keys_df, keys, "Empty Values", empty_values_condition)
checks_status = False

# check case-insensitive duplicates
Expand All @@ -846,8 +851,11 @@ def to_lower(df):
df_keys = to_lower(content_keys_df)
duplicate_condition = df_keys.duplicated(keep='first')
if duplicate_condition.values.any():
report_error(content_keys_df, keys, "Duplicate Values",
duplicate_condition, error_file)
log.error(f"Duplicate %s found. Total rows affected: %s", keys, duplicate_condition.sum())
log.error("Duplicate values: %s", pd.unique(content_keys_df[duplicate_condition].stack()))
if error_file:
report_error(content_keys_df, keys, "Duplicate Values",
duplicate_condition, error_file)
checks_status = False

return checks_status
Expand Down Expand Up @@ -888,7 +896,6 @@ def validate_content(self, content_type_to_validate: list[str] = None, error_fil
- No duplicate record keys
- Valid columns
- No duplicate columns
:param error_file: A file to record errors
:return: True if the DwCA is valid, False otherwise
Expand All @@ -907,10 +914,6 @@ def validate_content(self, content_type_to_validate: list[str] = None, error_fil
if not self._validate_columns(content):
return False

dup_cols = self._find_duplicate_columns(content)
if len(dup_cols) > 0:
return False

return True

def extract_csv_content(self, csv_info: CsvFileType,
Expand Down Expand Up @@ -1062,9 +1065,16 @@ def _read_csv(self,
ret_val.dropna(how="all", inplace=True)
log.debug("Extracted %d rows from csv %s", len(ret_val), csv_file)

# Strip column header spaces
ret_val.rename(str.strip, axis = 'columns', inplace=True)

return ret_val

except EmptyDataError:
log.error(f"The expected columns: %s are not present in the {csv_file}. "
f"The file may be empty", ','.join(columns))
if columns:
log.error(f"The file may be empty {csv_file}")
else:
log.error(f"The expected columns: %s are not present in the {csv_file}. "
f"The file may be empty", ','.join(columns))

return pd.DataFrame()
2 changes: 1 addition & 1 deletion src/dwcahandler/dwca/dwca_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ def merge_dwca(dwca_file: Union[str, BytesIO], delta_dwca_file: Union[str, Bytes
regen_ids=regen_ids, validate_delta=validate_delta_content)

@staticmethod
def validate_dwca(dwca_file: str, keys_lookup: dict = None, error_file: str = None):
def validate_dwca(dwca_file: Union[str, BytesIO], keys_lookup: dict = None, error_file: str = None):
"""Test a dwca for consistency
:param dwca_file: The path to the DwCA, or the DwCA content as an in-memory BytesIO
Expand Down
12 changes: 11 additions & 1 deletion src/dwcahandler/dwca/dwca_meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,16 @@ def __extract_meta_info(self, ns, node_elm, core_or_ext_type):
def extract_field_attr_value(field_elm, attrib):
return field_elm.attrib.get(attrib) if field_elm.attrib.get(attrib) else None

def __find_id_in_fields(local_fields, id_field):
index_number = id_field[0].attrib["index"] if len(id_field) > 0 else "0"
return next((item for item in local_fields if "index" in item.attrib and item.attrib["index"]==index_number), None)

fields = node_elm.findall(f'{ns}field')
id_field = []
if core_or_ext_type == 'core':
id_field = node_elm.findall(f'{ns}id')
else:
id_field = node_elm.findall(f'{ns}coreid')
file_name = node_elm.find(f'{ns}files').find(f'{ns}location').text
meta_element_info = MetaElementInfo(
core_or_ext_type=core_or_ext_type,
Expand All @@ -157,7 +166,8 @@ def extract_field_attr_value(field_elm, attrib):
charset_encoding=node_elm.attrib['encoding'],
file_name=file_name)
# set first field with index 0 if it's not present in list of fields
if fields[0].attrib['index'] != '0':
field_elm = __find_id_in_fields(fields, id_field)
if field_elm is None and len(id_field) > 0:
if CoreOrExtType.CORE == core_or_ext_type:
field_list = [Field(index=0, field_name="id")]
else:
Expand Down
17 changes: 17 additions & 0 deletions tests/input_files/dwca/dwca-sample1/meta.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
<?xml version="1.0" encoding="UTF-8"?>
<archive xmlns="http://rs.tdwg.org/dwc/text/">
<core rowType="http://rs.tdwg.org/dwc/terms/Occurrence" encoding="utf-8" fieldsTerminatedBy="," linesTerminatedBy="\n" fieldsEnclosedBy="" ignoreHeaderLines="1">
<files>
<location>occurrence.txt</location>
</files>
<id index="0" />
<field index="1" term="http://rs.tdwg.org/dwc/terms/occurrenceID"/>
<field index="2" term="http://rs.tdwg.org/dwc/terms/scientificName"/>
<field index="3" term="http://rs.tdwg.org/dwc/terms/decimalLatitude"/>
<field index="4" term="http://rs.tdwg.org/dwc/terms/decimalLongitude"/>
<field index="5" term="http://rs.tdwg.org/dwc/terms/eventDate"/>
<field index="6" term="http://rs.tdwg.org/dwc/terms/recordedBy"/>
<field index="7" term="http://rs.tdwg.org/dwc/terms/geodeticDatum"/>
<field index="8" term="http://rs.tdwg.org/dwc/terms/basisOfRecord"/>
</core>
</archive>
6 changes: 6 additions & 0 deletions tests/input_files/dwca/dwca-sample1/occurrence.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
id,occurrenceID,scientificName,decimalLatitude,decimalLongitude,eventDate,recordedBy,geodeticDatum,basisOfRecord
1,1001,SpeciesA,12.34,-56.78,2023-01-01,John Doe,WGS84,PreservedSpecimen
2,1002,SpeciesB,-34.56,78.90,2023-02-15,Jane Smith,WGS84,HumanObservation
3,1003,SpeciesC,0.123,45.678,2023-03-20,Bob Johnson,WGS84,FossilSpecimen
4,1004,SpeciesD,-23.456,-12.345,2023-04-10,Alice Brown,WGS84,MachineObservation
5,1005,SpeciesE,89.012,-67.890,2023-05-25,Charlie White,WGS84,PreservedSpecimen
15 changes: 15 additions & 0 deletions tests/input_files/dwca/dwca-sample2/meta.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
<archive xmlns="http://rs.tdwg.org/dwc/text/" metadata="metadata.xml">
<core encoding="UTF-8" fieldsTerminatedBy="\t" linesTerminatedBy="\n" fieldsEnclosedBy="" ignoreHeaderLines="1" rowType="http://rs.tdwg.org/dwc/terms/Occurrence">
<files>
<location>occurrence.txt</location>
</files>
<id index="0" />
<field default="WGS84" term="http://rs.tdwg.org/dwc/terms/geodeticDatum"/>
<field index="0" term="http://rs.gbif.org/terms/1.0/gbifID"/>
<field index="1" term="http://rs.tdwg.org/dwc/terms/occurrenceID"/>
<field index="2" term="http://purl.org/dc/terms/scientificName"/>
<field index="3" term="http://purl.org/dc/terms/decimalLatitude"/>
<field index="4" term="http://purl.org/dc/terms/decimalLongitude"/>
<field index="5" term="http://purl.org/dc/terms/basisOfRecord"/>
</core>
</archive>
3 changes: 3 additions & 0 deletions tests/input_files/dwca/dwca-sample2/occurrence.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
gbifID occurrenceID scientificName decimalLatitude decimalLongitude basisOfRecord
1 occ1 Euphorbia paralias -36.00000 150.5678 Observations
2 occ2 Acaciella angustissima -20.0000 145.1234 Observations
15 changes: 15 additions & 0 deletions tests/input_files/dwca/dwca-sample3/meta.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
<archive xmlns="http://rs.tdwg.org/dwc/text/" metadata="metadata.xml">
<core encoding="UTF-8" fieldsTerminatedBy="\t" linesTerminatedBy="\n" fieldsEnclosedBy="" ignoreHeaderLines="1" rowType="http://rs.tdwg.org/dwc/terms/Occurrence">
<files>
<location>occurrence.txt</location>
</files>
<id index="0" />
<field default="WGS84" term="http://rs.tdwg.org/dwc/terms/geodeticDatum"/>
<field index="0" term="http://rs.gbif.org/terms/1.0/gbifID"/>
<field index="1" term="http://rs.tdwg.org/dwc/terms/occurrenceID"/>
<field index="2" term="http://purl.org/dc/terms/scientificName"/>
<field index="3" term="http://purl.org/dc/terms/decimalLatitude"/>
<field index="4" term="http://purl.org/dc/terms/decimalLongitude"/>
<field index="5" term="http://purl.org/dc/terms/basisOfRecord"/>
</core>
</archive>
4 changes: 4 additions & 0 deletions tests/input_files/dwca/dwca-sample3/occurrence.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
gbifID occurrenceID scientificName decimalLatitude decimalLongitude basisOfRecord
1 Euphorbia paralias -36.00000 150.5678 Observations
2 occ2 Acaciella angustissima -20.0000 145.1234 Observations
3 occ3 Acaciella angustissima -20.0000 145.1234 Observations
15 changes: 15 additions & 0 deletions tests/input_files/dwca/dwca-sample4/meta.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
<?xml version="1.0" encoding="UTF-8"?>
<archive xmlns="http://rs.tdwg.org/dwc/text/">
<core rowType="http://rs.tdwg.org/dwc/terms/Occurrence" encoding="utf-8" fieldsTerminatedBy="," linesTerminatedBy="\n" fieldsEnclosedBy="" ignoreHeaderLines="1">
<files>
<location>occurrence.csv</location>
</files>
<id index="3" />
<field index="0" term="http://rs.tdwg.org/dwc/terms/eventDate"/>
<field index="1" term="http://rs.tdwg.org/dwc/terms/basisOfRecord"/>
<field index="2" term="http://rs.tdwg.org/dwc/terms/scientificName"/>
<field index="3" term="http://rs.tdwg.org/dwc/terms/catalogNumber"/>
<field index="4" term="http://rs.tdwg.org/dwc/terms/decimalLatitude"/>
<field index="5" term="http://rs.tdwg.org/dwc/terms/decimalLongitude"/>
</core>
</archive>
13 changes: 13 additions & 0 deletions tests/input_files/dwca/dwca-sample4/occurrence.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
eventDate,basisOfRecord,scientificName,catalogNumber,decimalLatitude,decimalLongitude
2015-07-18T12:58:00+00:00,Human Observation,Species A,014800,-30.00000,144
2015-07-18T12:58:31+00:00,Human Observation,Species B,,-31.00000,145
2015-07-18T18:16:52+00:00,Human Observation,Species C,014824,-32.00000,100.828059
2015-07-19T04:28:19+00:00,Human Observation,Species D,014823,-33.00000,101.820888
2015-07-19T18:29:25+00:00,Human Observation,Species A1,014822,-34.00000,102.821654
2015-07-20T18:03:12+00:00,Human Observation,Species A2,014821,-35.00000,104.999974
2015-07-21T18:06:58+00:00,Human Observation,Species A3,014802,-34.00000,120.889354
2015-07-22T04:42:47+00:00,Human Observation,Species B1,014800,-36.00000,150.308848
2015-07-22T17:54:18+00:00,Human Observation,Species B2,014800,-30.00000,146.240159
2015-07-22T23:09:51+00:00,Human Observation,Species C1,014799,-31.00000,150.783246
2015-07-23T17:37:26+00:00,Human Observation,Species D,014798,-40.00000,150.823468
2015-07-24T13:10:00+00:00,Human Observation,Species E,014823,-28.00000,115
14 changes: 14 additions & 0 deletions tests/input_files/dwca/dwca-sample5/meta.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
<?xml version="1.0" encoding="UTF-8"?>
<archive xmlns="http://rs.tdwg.org/dwc/text/">
<core rowType="http://rs.tdwg.org/dwc/terms/Occurrence" encoding="utf-8" fieldsTerminatedBy="," linesTerminatedBy="\n" fieldsEnclosedBy="" ignoreHeaderLines="1">
<files>
<location>occurrence.csv</location>
</files>
<field index="0" term="http://rs.tdwg.org/dwc/terms/eventDate"/>
<field index="1" term="http://rs.tdwg.org/dwc/terms/basisOfRecord"/>
<field index="2" term="http://rs.tdwg.org/dwc/terms/scientificName"/>
<field index="3" term="http://rs.tdwg.org/dwc/terms/catalogNumber"/>
<field index="4" term="http://rs.tdwg.org/dwc/terms/catalogNumber"/>
<field index="5" term="http://rs.tdwg.org/dwc/terms/decimalLongitude"/>
</core>
</archive>
13 changes: 13 additions & 0 deletions tests/input_files/dwca/dwca-sample5/occurrence.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
eventDate,basisOfRecord,scientificName,catalogNumber,decimalLatitude,decimalLongitude
2015-07-18T12:58:00+00:00,Human Observation,Species A,014826,-30,144
2015-07-18T12:58:31+00:00,Human Observation,Species A1,014825,-31.1111,145
2015-07-18T18:16:52+00:00,Human Observation,Species A2,014824,-32.085431,100.828059
2015-07-19T04:28:19+00:00,Human Observation,Species A3,014823,-33.097233,101.820888
2015-07-19T18:29:25+00:00,Human Observation,Species B1,014822,-34.099936,102.821654
2015-07-20T18:03:12+00:00,Human Observation,Species B2,014821,-35.893671,104.999974
2015-07-21T18:06:58+00:00,Human Observation,Species C,014820,-34.113747,120.889354
2015-07-22T04:42:47+00:00,Human Observation,Species C2,014810,-36,144.308848
2015-07-22T17:54:18+00:00,Human Observation,Species C3,014800,-30.440251,146.240159
2015-07-22T23:09:51+00:00,Human Observation,Species D,014799,-31.547195,150.783246
2015-07-23T17:37:26+00:00,Human Observation,Species D1,-40.481117,150.823468
2015-07-24T13:10:00+00:00,Human Observation,Species D2,014792,-28,115
3 changes: 3 additions & 0 deletions tests/input_files/sample/multimedia_header_with_space.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
catalogNumber ,identifier, format ,type
C4,https://images.ala.org.au/image/proxyImageThumbnailLarge?imageId=XXX,image/jpeg,StillImage
C5,https://images.ala.org.au/image/proxyImageThumbnailLarge?imageId=XYZ,image/jpeg,StillImage
4 changes: 4 additions & 0 deletions tests/input_files/sample/occ_header_with_space.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
catalogNumber ,basisOfRecord,scientificName ,license,decimalLatitude,decimalLongitude
C3,Human Observation,Ageratina adenophora,CC-BY 4.0 (Int),-10.0000,120.0000
C4,Human Observation,Ageratina adenophora,CC-BY 4.0 (Int),-11.1111,125.0000
C5,Human Observation,Delairea odorata,CC-BY 4.0 (Int),-12.085431,130.828059
Loading

0 comments on commit 56cbab2

Please sign in to comment.