dwcahandler package to list darwin core term and pytest

AtlasOfLivingAustralia · Nov 29, 2023 · dc83d74 · dc83d74
1 parent 952f88c
commit dc83d74
Show file tree

Hide file tree

Showing 13 changed files with 642 additions and 92 deletions.
diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml
@@ -0,0 +1,37 @@
+name: Run pytests
+
+on:
+  push:
+    branches: [ "develop" ]
+  pull_request:
+    branches: [ "develop" ]
+
+permissions:
+  contents: read
+
+jobs:
+  build:
+
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v3
+    - name: Set up Python 3.10
+      uses: actions/setup-python@v3
+      with:
+        python-version: "3.10"
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install flake8 pytest
+        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
+    - name: Lint with flake8
+      run: |
+        # stop the build if there are Python syntax errors or undefined names
+        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
+        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
+        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
+    - name: Test with pytest
+      run: |
+        pytest
+
diff --git a/README.md b/README.md
@@ -30,6 +30,11 @@ poetry shell
 poetry install
 ```
 
+* To update the Darwin Core terms supported in the dwcahandler package
+```
+poetry run update-dwc-terms
+```
+
 ### Build
 To build dwcahandler package
 ```
@@ -62,4 +67,48 @@ ext_csvs = [CsvFileType(files=['/tmp/multimedia.csv'], type='multimedia')]
 DwcaHandler.create_dwca(core_csv=core_csv, ext_csv_list=ext_csvs, output_dwca_path='/tmp/dwca.zip')
 ```
 
+* Merge Darwin Core Archive
+```
+from dwcahandler import DwcaHandler
 
+DwcaHandler.merge_dwca(dwca_file='/tmp/dwca.zip', delta_dwca_file='/tmp/delta-dwca.zip',
+                       output_dwca_path='/tmp/new-dwca.zip', 
+                       keys_lookup={'occurrence':'occurrenceID'})
+```
+
+* Delete Rows from core file in Darwin Core Archive
+```
+from dwcahandler import CsvFileType
+from dwcahandler import DwcaHandler
+
+delete_csv = CsvFileType(files=['/tmp/old-records.csv'], type='occurrence', keys='occurrenceID')
+
+DwcaHandler.delete_records(dwca_file='/tmp/dwca.zip',
+                           records_to_delete=delete_csv, 
+                           output_dwca_path='/tmp/new-dwca.zip')
+```
+
+* Other usages may include subclassing the dwca class, modifying the core dataframe content and rebuilding the dwca.
+```
+from dwcahandler import Dwca
+
+class DerivedDwca(Dwca):
+    """
+    Derived class to perform other custom operations that are not included as part of the core operations
+    """
+    def _drop_columns(self):
+        """
+        Drop existing column in the core content
+        """
+        self.core_content.df_content.drop(columns=['column1', 'column2'], inplace=True)
+        self._update_meta_fields(self.core_content)
+
+
+dwca = DerivedDwca(dwca_file_loc='/tmp/dwca.zip')
+dwca._extract_dwca()
+dwca._drop_columns()
+dwca._generate_eml()
+dwca._generate_meta()
+dwca._write_dwca('/tmp/newdwca.zip')
+
+```
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "dwcahandler"
-version = "0.0.1"
+version = "0.0.2"
 description = "Python package to handle Darwin Core Archive (DwCA) operations. This includes creating a DwCA zip file from one or more csvs, reading a DwCA, merge two DwCAs, validate DwCA and delete records from DwCA based on one or more key columns"
 authors = ["Atlas of Living Australia data team <[email protected]>"]
 maintainers = ["Atlas of Living Australia data team <[email protected]>"]
@@ -9,8 +9,14 @@ license = "MPL-1.1"
 readme = "README.md"
 
 [tool.poetry.dependencies]
-python = ">=3.9,<3.13"
-pandas = "^2.1.1"
+python = ">=3.9"
+pandas = "^2.1.0"
+requests = "^2.31.0"
+pytest = "^7.4.3"
+pytest-mock = "^3.12.0"
+
+[tool.poetry.scripts]
+update-dwc-terms = "dwcahandler.scripts.update_dwc_terms:update_terms"
 
 [build-system]
 requires = ["poetry-core"]

diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,4 @@
+pandas==2.1.0
+requests==2.31.0
+pytest==7.4.3
+pytest-mock==3.12.0
diff --git a/src/dwcahandler/dwca/__init__.py b/src/dwcahandler/dwca/__init__.py
@@ -137,6 +137,7 @@ def __str__(self) -> str:
 
 
 # Imports at end of file to allow classes to be used
+from .terms import Terms
 from .dwca_meta import Element, MetaElementTypes, MetaElementInfo, MetaDwCA
 from .base_dwca import BaseDwca
 from .core_dwca import Dwca, DfContent

diff --git a/src/dwcahandler/dwca/dwca_factory.py b/src/dwcahandler/dwca/dwca_factory.py
@@ -1,5 +1,7 @@
+
 from abc import ABCMeta, abstractmethod
-from dwcahandler.dwca import CsvFileType, BaseDwca, Dwca, LargeDwca
+import pandas as pd
+from dwcahandler.dwca import CsvFileType, BaseDwca, Dwca, LargeDwca, Terms
 import logging
 from pathlib import Path
 
@@ -141,6 +143,12 @@ def get_dwca_from_dwca_file(dwca_file: str, use_chunking: bool = False, work_dir
 
 
 class DwcaHandler:
+
+    @staticmethod
+    def list_dwc_terms() -> pd.DataFrame:
+        return Terms().dwc_terms_df
+
+
     """Perform various DwCA operations"""
 
     @staticmethod

diff --git a/src/dwcahandler/dwca/dwca_meta.py b/src/dwcahandler/dwca/dwca_meta.py
@@ -8,21 +8,16 @@
 """
 import xml.etree.ElementTree as ET
 from xml.dom import minidom
-import pandas as pd
-from dwcahandler.dwca import CSVEncoding, CoreOrExtType
+from dwcahandler.dwca import CSVEncoding, CoreOrExtType, Terms
 import urllib
-from pathlib import Path
 from dataclasses import dataclass, field, asdict
 from typing import ClassVar
 from typing import Optional
-import os
 import re
 
-this_dir, this_filename = os.path.split(__file__)
 
-
-@dataclass()
-class Element():
+@dataclass
+class Element:
     """A mapping of a name to a URI, giving the class of a row type"""
     name: str
     row_type_ns: str
@@ -113,41 +108,30 @@ class Field:
 
 
 @dataclass
-class MetaElementAttributes():
+class MetaElementAttributes:
     """A meta-description of a DwCA file"""
     meta_element_type: MetaElementInfo
     fields: list[Field] = field(default_factory=list)
 
 
-def absolute_file_paths(directory):
-    """Convert files in a directory into absolute paths and return
-    as a a generator
-
-    :param directory: The directory to scan.
-    :return: An absolute file path.
-    """
-    for dirpath, _, filenames in os.walk(directory):
-        for f in filenames:
-            if re.fullmatch(r'.+\..*', f):
-                yield os.path.abspath(os.path.join(dirpath, f))
-
-
-@dataclass()
+@dataclass
 class MetaDwCA:
     """Complete Metadata for a DwCA including dataset metadata and schema information"""
     EML_XML_FILENAME: str = field(default='eml.xml')
     dwca_meta: ET.Element = field(init=False)
     meta_elements: list[MetaElementAttributes] = field(default_factory=list, init=False)
-    TERMS: list[Path] = field(default_factory=lambda: [c for c in absolute_file_paths(f"{this_dir}/terms")], init=False)
 
     def __post_init__(self):
-        self.terms_df = pd.DataFrame()
-        for term in self.TERMS:
+        self.terms_df = Terms().terms_df
+
+        """
+        for term in self.TERMS_PATH:
             df = pd.read_csv(term, dtype='str')
             if not self.terms_df.empty:
                 self.terms_df = self.terms_df.merge(df, how='outer', left_on=['term', 'uri'], right_on=['term', 'uri'])
             else:
                 self.terms_df = df
+        """
         # initialise own instance of meta content
         self.dwca_meta = ET.Element('archive')
 
@@ -161,10 +145,10 @@ def extract_field_attr_value(field, attrib):
         meta_element_info = MetaElementInfo(core_or_ext_type=core_or_ext_type,
                                             type=MetaElementTypes.get_element_by_row_type(node_elm.attrib['rowType']),
                                             csv_encoding=CSVEncoding(
-                                                csv_delimiter=node_elm.attrib['fieldsTerminatedBy'],
-                                                csv_eol=node_elm.attrib['linesTerminatedBy'],
-                                                csv_text_enclosure=node_elm.attrib['fieldsEnclosedBy'] if
-                                                node_elm.attrib['fieldsEnclosedBy'] != '' else '"'),
+                                            csv_delimiter=node_elm.attrib['fieldsTerminatedBy'],
+                                            csv_eol=node_elm.attrib['linesTerminatedBy'],
+                                            csv_text_enclosure=node_elm.attrib['fieldsEnclosedBy'] if
+                                            node_elm.attrib['fieldsEnclosedBy'] != '' else '"'),
                                             ignore_header_lines=node_elm.attrib['ignoreHeaderLines'],
                                             charset_encoding=node_elm.attrib['encoding'],
                                             file_name=file_name)

diff --git a/src/dwcahandler/dwca/terms.py b/src/dwcahandler/dwca/terms.py
@@ -0,0 +1,61 @@
+import pandas as pd
+from dataclasses import dataclass, field, asdict
+import os
+from pathlib import Path
+import re
+
+this_dir, this_filename = os.path.split(__file__)
+
+def absolute_file_paths(directory):
+    """Lazily yield the absolute path of each matching file below a directory.
+
+    The directory tree is walked recursively. Only file names that have at
+    least one character before a dot are yielded.
+
+    :param directory: The directory to scan.
+    :return: A generator of absolute file paths.
+    """
+    has_dot_after_first_char = re.compile(r'.+\..*').fullmatch
+    for folder, _subdirs, names in os.walk(directory):
+        yield from (os.path.abspath(os.path.join(folder, name))
+                    for name in names
+                    if has_dot_after_first_char(name))
+
+
+@dataclass
+class Terms:
+    """Load the term vocabularies bundled in the package's ``terms`` directory.
+
+    On construction every file yielded by ``absolute_file_paths(TERMS_DIR)`` is
+    read as a CSV and outer-merged on ``term``/``uri`` into ``terms_df``. The
+    Darwin Core and Dublin Core files are additionally merged into
+    ``dwc_terms_df``.
+    """
+    # TDWG-hosted terms CSV read by update_dwc_terms()
+    TERMS_DWC_URL = "https://raw.githubusercontent.com/tdwg/rs.tdwg.org/master/terms/terms.csv"
+    DWC_FILENAME = 'darwin-core-terms.csv'
+    DUBLIN_CORE_FILENAME = 'dublin-core-terms.csv'
+    TERMS_DIR = f"{this_dir}/terms"
+    DWC_FILE_PATH = f"{TERMS_DIR}/{DWC_FILENAME}"
+    DUBLIN_CORE_PATH = f"{TERMS_DIR}/{DUBLIN_CORE_FILENAME}"
+
+    # Absolute paths (as strings) of every term file found under TERMS_DIR
+    terms_path: list[str] = field(default_factory=lambda: list(absolute_file_paths(Terms.TERMS_DIR)),
+                                  init=False)
+    # All term files merged together
+    terms_df: pd.DataFrame = field(default_factory=pd.DataFrame, init=False)
+    # Only the Darwin Core and Dublin Core term files merged together
+    dwc_terms_df: pd.DataFrame = field(default_factory=pd.DataFrame, init=False)
+
+    def __post_init__(self):
+        """Read every term file and populate ``terms_df`` and ``dwc_terms_df``."""
+        def _add_to_df(existing_df: pd.DataFrame, df: pd.DataFrame):
+            # The first non-empty frame is taken as-is; later ones are outer-merged into it
+            if not existing_df.empty:
+                return existing_df.merge(df, how='outer', left_on=['term', 'uri'], right_on=['term', 'uri'])
+            return df
+
+        def _normalise(path) -> str:
+            # terms_path entries come from os.path.abspath (OS-native separators), whereas
+            # the class constants are joined with "/"; normalise both sides before comparing
+            # so the match does not depend on the platform or on how __file__ is spelled.
+            return os.path.normcase(os.path.abspath(path))
+
+        dwc_paths = {_normalise(Terms.DWC_FILE_PATH), _normalise(Terms.DUBLIN_CORE_PATH)}
+
+        for term_path in self.terms_path:
+            df = pd.read_csv(term_path, dtype='str')
+            self.terms_df = _add_to_df(self.terms_df, df)
+            if _normalise(term_path) in dwc_paths:
+                self.dwc_terms_df = _add_to_df(self.dwc_terms_df, df)
+
+    @staticmethod
+    def update_dwc_terms():
+        """
+        Pull the latest terms from the TDWG terms csv url (``TERMS_DWC_URL``) and overwrite
+        the darwin core vocab terms file in the package (``DWC_FILE_PATH``).
+
+        Rows with a value in ``term_deprecated`` are dropped. Each remaining row is written
+        as ``term`` (``term_localName``) and ``uri`` (``term_isDefinedBy`` + ``term_localName``).
+
+        NOTE: This is still WIP.
+
+        :return: The dataframe of terms that was written to the package file
+        """
+        df = pd.read_csv(Terms.TERMS_DWC_URL, delimiter=",", encoding='utf-8', dtype='str')
+        df = df[df["term_deprecated"].isnull()]
+        dwc_df = pd.DataFrame()
+        dwc_df['term'] = df['term_localName']
+        dwc_df['uri'] = df['term_isDefinedBy'] + df['term_localName']
+        dwc_df.to_csv(Terms.DWC_FILE_PATH, index=False)
+        return dwc_df