Skip to content

Commit

Permalink
dwcahandler package to list darwin core term and pytest
Browse files Browse the repository at this point in the history
  • Loading branch information
patkyn committed Nov 29, 2023
1 parent 952f88c commit dc83d74
Show file tree
Hide file tree
Showing 13 changed files with 642 additions and 92 deletions.
37 changes: 37 additions & 0 deletions .github/workflows/run-tests.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
name: Run pytests

on:
push:
branches: [ "develop" ]
pull_request:
branches: [ "develop" ]

permissions:
contents: read

jobs:
build:

runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v3
- name: Set up Python 3.10
uses: actions/setup-python@v3
with:
python-version: "3.10"
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install flake8 pytest
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
- name: Lint with flake8
run: |
# stop the build if there are Python syntax errors or undefined names
flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
- name: Test with pytest
run: |
pytest
49 changes: 49 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,11 @@ poetry shell
poetry install
```

* To update the Darwin Core terms supported by the dwcahandler package
```
poetry run update-dwc-terms
```

### Build
To build dwcahandler package
```
Expand Down Expand Up @@ -62,4 +67,48 @@ ext_csvs = [CsvFileType(files=['/tmp/multimedia.csv'], type='multimedia')]
DwcaHandler.create_dwca(core_csv=core_csv, ext_csv_list=ext_csvs, output_dwca_path='/tmp/dwca.zip')
```

* Merge Darwin Core Archive
```
from dwcahandler import DwcaHandler
DwcaHandler.merge_dwca(dwca_file='/tmp/dwca.zip', delta_dwca_file='/tmp/delta-dwca.zip',
output_dwca_path='/tmp/new-dwca.zip',
keys_lookup={'occurrence':'occurrenceID'})
```

* Delete Rows from core file in Darwin Core Archive
```
from dwcahandler import CsvFileType
from dwcahandler import DwcaHandler
delete_csv = CsvFileType(files=['/tmp/old-records.csv'], type='occurrence', keys='occurrenceID')
DwcaHandler.delete_records(dwca_file='/tmp/dwca.zip',
records_to_delete=delete_csv,
output_dwca_path='/tmp/new-dwca.zip')
```

* Other usages may include subclassing the dwca class, modifying the core dataframe content and rebuilding the dwca.
```
from dwcahandler import Dwca
class DerivedDwca(Dwca):
"""
Derived class to perform other custom operations that are not included as part of the core operations
"""
def _drop_columns(self):
"""
Drop existing column in the core content
"""
self.core_content.df_content.drop(columns=['column1', 'column2'], inplace=True)
self._update_meta_fields(self.core_content)
dwca = DerivedDwca(dwca_file_loc='/tmp/dwca.zip')
dwca._extract_dwca()
dwca._drop_columns()
dwca._generate_eml()
dwca._generate_meta()
dwca._write_dwca('/tmp/newdwca.zip')
```
304 changes: 301 additions & 3 deletions poetry.lock

Large diffs are not rendered by default.

12 changes: 9 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "dwcahandler"
version = "0.0.1"
version = "0.0.2"
description = "Python package to handle Darwin Core Archive (DwCA) operations. This includes creating a DwCA zip file from one or more csvs, reading a DwCA, merge two DwCAs, validate DwCA and delete records from DwCA based on one or more key columns"
authors = ["Atlas of Living Australia data team <[email protected]>"]
maintainers = ["Atlas of Living Australia data team <[email protected]>"]
Expand All @@ -9,8 +9,14 @@ license = "MPL-1.1"
readme = "README.md"

[tool.poetry.dependencies]
python = ">=3.9,<3.13"
pandas = "^2.1.1"
python = ">=3.9"
pandas = "^2.1.0"
requests = "^2.31.0"
pytest = "^7.4.3"
pytest-mock = "^3.12.0"

[tool.poetry.scripts]
update-dwc-terms = "dwcahandler.scripts.update_dwc_terms:update_terms"

[build-system]
requires = ["poetry-core"]
Expand Down
4 changes: 4 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
pandas==2.1.0
requests==2.31.0
pytest==7.4.3
pytest-mock==3.12.0
1 change: 1 addition & 0 deletions src/dwcahandler/dwca/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,7 @@ def __str__(self) -> str:


# Imports at end of file to allow classes to be used
from .terms import Terms
from .dwca_meta import Element, MetaElementTypes, MetaElementInfo, MetaDwCA
from .base_dwca import BaseDwca
from .core_dwca import Dwca, DfContent
Expand Down
10 changes: 9 additions & 1 deletion src/dwcahandler/dwca/dwca_factory.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@

from abc import ABCMeta, abstractmethod
from dwcahandler.dwca import CsvFileType, BaseDwca, Dwca, LargeDwca
import pandas as pd
from dwcahandler.dwca import CsvFileType, BaseDwca, Dwca, LargeDwca, Terms
import logging
from pathlib import Path

Expand Down Expand Up @@ -141,6 +143,12 @@ def get_dwca_from_dwca_file(dwca_file: str, use_chunking: bool = False, work_dir


class DwcaHandler:
    """Perform various DwCA operations"""

    @staticmethod
    def list_dwc_terms() -> pd.DataFrame:
        """List the Darwin Core terms known to the package.

        :return: The ``dwc_terms_df`` DataFrame of a freshly constructed
                 ``Terms`` instance.
        """
        return Terms().dwc_terms_df

@staticmethod
Expand Down
44 changes: 14 additions & 30 deletions src/dwcahandler/dwca/dwca_meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,21 +8,16 @@
"""
import xml.etree.ElementTree as ET
from xml.dom import minidom
import pandas as pd
from dwcahandler.dwca import CSVEncoding, CoreOrExtType
from dwcahandler.dwca import CSVEncoding, CoreOrExtType, Terms
import urllib
from pathlib import Path
from dataclasses import dataclass, field, asdict
from typing import ClassVar
from typing import Optional
import os
import re

this_dir, this_filename = os.path.split(__file__)


@dataclass()
class Element():
@dataclass
class Element:
"""A mapping of a name to a URI, giving the class of a row type"""
name: str
row_type_ns: str
Expand Down Expand Up @@ -113,41 +108,30 @@ class Field:


@dataclass
class MetaElementAttributes():
class MetaElementAttributes:
"""A meta-description of a DwCA file"""
meta_element_type: MetaElementInfo
fields: list[Field] = field(default_factory=list)


def absolute_file_paths(directory):
"""Convert files in a directory into absolute paths and return
as a a generator
:param directory: The directory to scan.
:return: An absolute file path.
"""
for dirpath, _, filenames in os.walk(directory):
for f in filenames:
if re.fullmatch(r'.+\..*', f):
yield os.path.abspath(os.path.join(dirpath, f))


@dataclass()
@dataclass
class MetaDwCA:
"""Complete Metadata for a DwCA including dataset metadata and schema information"""
EML_XML_FILENAME: str = field(default='eml.xml')
dwca_meta: ET.Element = field(init=False)
meta_elements: list[MetaElementAttributes] = field(default_factory=list, init=False)
TERMS: list[Path] = field(default_factory=lambda: [c for c in absolute_file_paths(f"{this_dir}/terms")], init=False)

def __post_init__(self):
self.terms_df = pd.DataFrame()
for term in self.TERMS:
self.terms_df = Terms().terms_df

"""
for term in self.TERMS_PATH:
df = pd.read_csv(term, dtype='str')
if not self.terms_df.empty:
self.terms_df = self.terms_df.merge(df, how='outer', left_on=['term', 'uri'], right_on=['term', 'uri'])
else:
self.terms_df = df
"""
# initialise own instance of meta content
self.dwca_meta = ET.Element('archive')

Expand All @@ -161,10 +145,10 @@ def extract_field_attr_value(field, attrib):
meta_element_info = MetaElementInfo(core_or_ext_type=core_or_ext_type,
type=MetaElementTypes.get_element_by_row_type(node_elm.attrib['rowType']),
csv_encoding=CSVEncoding(
csv_delimiter=node_elm.attrib['fieldsTerminatedBy'],
csv_eol=node_elm.attrib['linesTerminatedBy'],
csv_text_enclosure=node_elm.attrib['fieldsEnclosedBy'] if
node_elm.attrib['fieldsEnclosedBy'] != '' else '"'),
csv_delimiter=node_elm.attrib['fieldsTerminatedBy'],
csv_eol=node_elm.attrib['linesTerminatedBy'],
csv_text_enclosure=node_elm.attrib['fieldsEnclosedBy'] if
node_elm.attrib['fieldsEnclosedBy'] != '' else '"'),
ignore_header_lines=node_elm.attrib['ignoreHeaderLines'],
charset_encoding=node_elm.attrib['encoding'],
file_name=file_name)
Expand Down
61 changes: 61 additions & 0 deletions src/dwcahandler/dwca/terms.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import pandas as pd
from dataclasses import dataclass, field, asdict
import os
from pathlib import Path
import re

this_dir, this_filename = os.path.split(__file__)

def absolute_file_paths(directory):
    """Walk a directory tree and lazily produce the absolute paths of its files.

    Only file names containing a dot that is preceded by at least one
    character (for example ``terms.csv``) are produced; names without a dot,
    or whose only dot is the leading character, are skipped.

    :param directory: The directory to scan (sub-directories included).
    :return: A generator of absolute file paths.
    """
    has_extension = re.compile(r'.+\..*').fullmatch
    for root, _subdirs, names in os.walk(directory):
        yield from (os.path.abspath(os.path.join(root, name))
                    for name in names if has_extension(name))


@dataclass
class Terms:
    """Vocabulary terms (Darwin Core, Dublin Core, ...) shipped with the package.

    Instantiating the class reads every file yielded by
    ``absolute_file_paths(TERMS_DIR)`` as a CSV of strings and outer-merges
    the files on their ``term`` and ``uri`` columns.

    ``terms_df`` holds the merge of all files; ``dwc_terms_df`` holds the
    merge of only the Darwin Core and Dublin Core files.
    """
    # Remote CSV read by update_dwc_terms() to refresh the Darwin Core file.
    TERMS_DWC_URL = "https://raw.githubusercontent.com/tdwg/rs.tdwg.org/master/terms/terms.csv"
    DWC_FILENAME = 'darwin-core-terms.csv'
    DUBLIN_CORE_FILENAME = 'dublin-core-terms.csv'
    # Directory, next to this module, that holds the bundled term CSV files.
    TERMS_DIR = f"{this_dir}/terms"
    DWC_FILE_PATH = f"{TERMS_DIR}/{DWC_FILENAME}"
    DUBLIN_CORE_PATH = f"{TERMS_DIR}/{DUBLIN_CORE_FILENAME}"

    # Absolute paths (as strings) of the term files found under TERMS_DIR.
    terms_path: list[str] = field(default_factory=lambda: list(absolute_file_paths(Terms.TERMS_DIR)),
                                  init=False)
    # Outer merge of every term file.
    terms_df: pd.DataFrame = field(default_factory=pd.DataFrame, init=False)
    # Outer merge of the Darwin Core and Dublin Core term files only.
    dwc_terms_df: pd.DataFrame = field(default_factory=pd.DataFrame, init=False)

    def __post_init__(self):
        """Load the term files into ``terms_df`` and ``dwc_terms_df``."""

        def _canonical(path) -> str:
            # Entries of terms_path went through os.path.abspath, whereas the
            # class-level path constants are plain f-strings. Normalise both
            # sides so the membership test below does not silently fail when
            # __file__ is relative or the path separators differ.
            return os.path.normcase(os.path.abspath(path))

        def _add_to_df(existing_df: pd.DataFrame, df: pd.DataFrame) -> pd.DataFrame:
            # The first file becomes the frame; later files are outer-merged in.
            if not existing_df.empty:
                return existing_df.merge(df, how='outer', left_on=['term', 'uri'], right_on=['term', 'uri'])
            return df

        dwc_paths = {_canonical(Terms.DWC_FILE_PATH), _canonical(Terms.DUBLIN_CORE_PATH)}

        for term_path in self.terms_path:
            df = pd.read_csv(term_path, dtype='str')
            self.terms_df = _add_to_df(self.terms_df, df)
            if _canonical(term_path) in dwc_paths:
                self.dwc_terms_df = _add_to_df(self.dwc_terms_df, df)

    @staticmethod
    def update_dwc_terms():
        """Refresh the bundled Darwin Core term file from ``TERMS_DWC_URL``.

        Downloads the remote terms CSV, keeps the rows whose
        ``term_deprecated`` value is empty, and overwrites ``DWC_FILE_PATH``
        with two columns: ``term`` (``term_localName``) and ``uri``
        (``term_isDefinedBy`` concatenated with ``term_localName``).

        NOTE: marked as work in progress by the original author.

        :return: The DataFrame that was written to ``DWC_FILE_PATH``.
        """
        df = pd.read_csv(Terms.TERMS_DWC_URL, delimiter=",", encoding='utf-8', dtype='str')
        # A null term_deprecated marks a term that is still current.
        df = df[df["term_deprecated"].isnull()]
        dwc_df = pd.DataFrame()
        dwc_df['term'] = df['term_localName']
        dwc_df['uri'] = df['term_isDefinedBy'] + df['term_localName']
        dwc_df.to_csv(Terms.DWC_FILE_PATH, index=False)
        return dwc_df
Loading

0 comments on commit dc83d74

Please sign in to comment.