Skip to content

Commit

Permalink
Merge pull request #16 from AtlasOfLivingAustralia/release/0.3.0
Browse files Browse the repository at this point in the history
Release v0.3.0
  • Loading branch information
patkyn authored Sep 26, 2024
2 parents e629d37 + b43cfd3 commit 836b6ef
Show file tree
Hide file tree
Showing 12 changed files with 498 additions and 110 deletions.
2 changes: 1 addition & 1 deletion .flake8
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[flake8]
max-line-length = 120
max-complexity = 19
max-complexity = 20
select = C,E,F,W,B,B950
ignore = E126,E203,E501,W503,W504
exclude =
Expand Down
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -85,15 +85,15 @@ DwcaHandler.create_dwca(core_csv=core_csv, ext_csv_list=ext_csvs, eml_content=em

```python
from dwcahandler import DwcaHandler
from dwcahandler.dwca import DataFrameType
from dwcahandler.dwca import CsvFileType
from dwcahandler import Eml
import pandas as pd

core_df = pd.read_csv("/tmp/occurrence.csv")
core_frame = DataFrameType(df=core_df, type='occurrence', keys=['occurrenceID'])
core_frame = CsvFileType(files=core_df, type='occurrence', keys=['occurrenceID'])

ext_df = pd.read_csv("/tmp/multimedia.csv")
ext_frame = [DataFrameType(df=ext_df, type='multimedia', keys=['occurrenceID'])]
ext_frame = [CsvFileType(files=ext_df, type='multimedia', keys=['occurrenceID'])]

eml = Eml(dataset_name='Test Dataset',
description='Dataset description',
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "dwcahandler"
version = "0.2.0"
version = "0.3.0"
description = "Python package to handle Darwin Core Archive (DwCA) operations. This includes creating a DwCA zip file from one or more csvs, reading a DwCA, merge two DwCAs, validate DwCA and delete records from DwCA based on one or more key columns"
authors = ["Atlas of Living Australia data team <[email protected]>"]
maintainers = ["Atlas of Living Australia data team <[email protected]>"]
Expand Down
32 changes: 14 additions & 18 deletions src/dwcahandler/dwca/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
"""
from collections import namedtuple
from dataclasses import dataclass, field
from typing import Optional
from typing import Optional, Union
import logging
import pandas as pd
from functools import wraps
Expand Down Expand Up @@ -64,7 +64,7 @@ def __convert_values(self, v):
class CsvFileType:
"""A description of a CSV file in a DwCA
"""
files: list # can accept more than one file
files: Union[list[str], pd.DataFrame] # can accept more than one file or a dataframe
type: str # 'occurrence', 'taxon', 'event', multimedia,...
keys: Optional[list] = None # must be supplied for csv extensions to link extension records to core record
# when creating dwca. for core other than occurrence, this needs to be supplied as key.
Expand All @@ -77,16 +77,6 @@ class CsvFileType:
# file delimiter type when reading the csv. if not supplied, the collectory setting delimiter is read in for the dr


@dataclass
class DataFrameType:
df: pd.DataFrame
type: str # 'occurrence', 'taxon', 'event', multimedia,...
keys: Optional[list] = None # must be supplied for csv extensions to link extension records to core record
# when creating dwca. for core other than occurrence, this needs to be supplied as key.
# column keys lookup in core or extension for delete records
associated_files_loc: Optional[str] = None # in case there are associated media that need to be packaged in dwca


class Stat:
"""Record statistics for a DwCA"""
start_record_count: int = 0
Expand Down Expand Up @@ -153,12 +143,18 @@ def __str__(self) -> str:
def record_diff_stat(func):
"""Record stats for dataframe content"""
@wraps(func)
def wrapper_function(self, record_content, content, *args, **kwargs):
ret_value = func(self, record_content, content, *args, **kwargs)
record_content.stat.set_stat(self.count_stat(ret_value))
logging.debug("%s %s %s stats shows %s",
func.__name__, record_content.meta_info.core_or_ext_type,
record_content.meta_info.type.name, str(record_content.stat))
def wrapper_function(self, *args, **kwargs):
params = list(kwargs.keys())
if len(params) >= 1:
record_content = kwargs[params[0]]
ret_value = func(self, *args, **kwargs)
record_content.stat.set_stat(self.count_stat(ret_value))
logging.debug("%s %s %s stats shows %s",
func.__name__, record_content.meta_info.core_or_ext_type,
record_content.meta_info.type.name, str(record_content.stat))
return ret_value

ret_value = func(self, *args, **kwargs)
return ret_value

return wrapper_function
Expand Down
11 changes: 6 additions & 5 deletions src/dwcahandler/dwca/base_dwca.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@
from __future__ import annotations
from abc import ABCMeta, abstractmethod
from typing import Union
from dwcahandler.dwca import CoreOrExtType, CsvFileType, DataFrameType, MetaElementTypes
from io import BytesIO
from dwcahandler.dwca import CoreOrExtType, CsvFileType, MetaElementTypes
from dwcahandler.dwca.eml import Eml


Expand Down Expand Up @@ -116,14 +117,14 @@ def remove_extensions(self, exclude_ext_files: list, output_dwca_path: str):
self.generate_meta()
self.write_dwca(output_dwca_path)

def delete_records_in_dwca(self, records_to_delete: CsvFileType, output_dwca_path: str):
def delete_records_in_dwca(self, records_to_delete: CsvFileType, output_dwca_path: Union[str, BytesIO]):
self.extract_dwca()
self.delete_records(records_to_delete)
self.generate_eml()
self.generate_meta()
self.write_dwca(output_dwca_path)

def create_dwca(self, core_csv: Union[CsvFileType, DataFrameType], output_dwca_path: str,
def create_dwca(self, core_csv: CsvFileType, output_dwca_path: str,
ext_csv_list: list[CsvFileType] = None, validate_content: bool = True,
eml_content: Union[str, Eml] = ''):

Expand Down Expand Up @@ -154,8 +155,8 @@ def create_dwca(self, core_csv: Union[CsvFileType, DataFrameType], output_dwca_p
# keys_lookup keys used for merging 2 dwcas
# regen_ids will generate new uuids for core csv and link coreids extensions to core records.
# https://peps.python.org/pep-0484/#forward-references
def merge_dwca(self, delta_dwca: BaseDwca, output_dwca_path: str, keys_lookup: dict = None, extension_sync=False,
regen_ids: bool = False, validate_delta: bool = True):
def merge_dwca(self, delta_dwca: BaseDwca, output_dwca_path: Union[str, BytesIO], keys_lookup: dict = None,
extension_sync=False, regen_ids: bool = False, validate_delta: bool = True):
self.extract_dwca()
delta_dwca.extract_dwca()
self.set_keys(keys_lookup)
Expand Down
46 changes: 26 additions & 20 deletions src/dwcahandler/dwca/core_dwca.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import mimetypes
import re
import uuid
from io import BytesIO
import zipfile
from dataclasses import MISSING, asdict, dataclass, field
from pathlib import Path
Expand All @@ -21,9 +22,8 @@
from numpy import nan
from pandas.errors import EmptyDataError
from pandas.io import parsers

from dwcahandler.dwca import (BaseDwca, CoreOrExtType, CSVEncoding,
CsvFileType, DataFrameType, Defaults, Eml,
CsvFileType, Defaults, Eml,
MetaDwCA, MetaElementInfo, MetaElementTypes,
Stat, record_diff_stat)

Expand All @@ -46,7 +46,7 @@ class Dwca(BaseDwca):
"""
A concrete implementation of a Darwin Core Archive.
"""
dwca_file_loc: str = field(default='./')
dwca_file_loc: Union[str, BytesIO] = field(default='./')
core_content: DfContent = field(init=False)
ext_content: list[DfContent] = field(init=False, default_factory=list)
defaults_prop: Defaults = field(init=False, default_factory=Defaults)
Expand Down Expand Up @@ -188,9 +188,10 @@ def convert_values(v):
files = zf.namelist()

log.info("Reading from %s", self.dwca_file_loc)
with zf.open(self.defaults_prop.meta_xml_filename) as meta_xml_file:
self.meta_content.read_meta_file(meta_xml_file)
meta_xml_file.close()

with io.TextIOWrapper(zf.open(self.defaults_prop.meta_xml_filename)) as meta_xml:
self.meta_content.read_meta_file(meta_xml)
meta_xml.close()

if self.meta_content.eml_xml_filename in files:
with io.TextIOWrapper(zf.open(self.meta_content.eml_xml_filename),
Expand Down Expand Up @@ -234,7 +235,9 @@ def _add_new_columns(self, df_content, delta_df_content):
delta_df_columns = delta_df_content.columns.to_list()
new_columns = list(set(delta_df_columns) - set(df_columns))
if len(new_columns) > 0:
df_content[new_columns] = nan
# Set to empty string instead of nan to resolve warning message
# see https://pandas.pydata.org/pdeps/0006-ban-upcasting.html
df_content[new_columns] = ""

return new_columns

Expand Down Expand Up @@ -530,8 +533,12 @@ def delete_records(self, records_to_delete: CsvFileType):
:param records_to_delete: A CSV file of records to delete, keyed to the DwCA file
"""
delete_content = self._combine_contents(
records_to_delete.files, records_to_delete.csv_encoding, use_chunking=False)
delete_content = pd.DataFrame()
if isinstance(records_to_delete.files, pd.DataFrame):
delete_content = records_to_delete.files.copy(deep=True)
else:
delete_content = self._combine_contents(records_to_delete.files, records_to_delete.csv_encoding,
use_chunking=False)
valid_delete_file = (all(col in delete_content.columns for col in records_to_delete.keys)
or len(delete_content) > 0)
if not valid_delete_file:
Expand Down Expand Up @@ -562,7 +569,7 @@ def delete_records(self, records_to_delete: CsvFileType):
if core_or_ext == CoreOrExtType.CORE:
for ext in self.ext_content:
log.info("Removing records from ext: %s", ext.meta_info.type.name)
ext.df_content = self._delete_content(content=ext.df_content,
ext.df_content = self._delete_content(content=ext,
delete_content=delete_content)

def _add_ext_lookup_key(self, df_content, core_df_content, core_keys, keys):
Expand Down Expand Up @@ -622,8 +629,9 @@ def merge_contents(self, delta_dwca: Dwca, extension_sync: bool = False,
self.ext_content.append(delta_content)
self._update_meta_fields(delta_content)

self.core_content.df_content = self._merge_df_content(self.core_content, delta_dwca.core_content,
self.core_content.keys)
self.core_content.df_content = self._merge_df_content(content=self.core_content,
delta_content=delta_dwca.core_content,
keys=self.core_content.keys)

if regen_ids:
self._update_core_ids(self.core_content.df_content)
Expand Down Expand Up @@ -760,7 +768,7 @@ def convert_associated_media_to_extension(self):
if len(image_df) > 0:
self._update_meta_fields(self.core_content)
log.info("%s associated media extracted", str(len(image_df)))
return CsvFileType(files=[image_df], type='multimedia', keys=image_df.index.names)
return CsvFileType(files=image_df, type='multimedia', keys=image_df.index.names)

log.info("Nothing to extract from associated media")

Expand All @@ -776,7 +784,7 @@ def _combine_contents(self, contents: list, csv_encoding, use_chunking=False):
"""
if len(contents) > 0:
if isinstance(contents[0], pd.DataFrame):
return contents[0]
return contents[0].copy(deep=True)

df_content = pd.DataFrame()
for content in contents:
Expand Down Expand Up @@ -905,15 +913,15 @@ def validate_content(self, content_type_to_validate: list[str] = None, error_fil

return True

def extract_csv_content(self, csv_info: Union[CsvFileType, DataFrameType],
def extract_csv_content(self, csv_info: CsvFileType,
core_ext_type: CoreOrExtType):
"""Read the files from a CSV description into a content frame and include it in the Dwca.
:param csv_info: The CSV file(s)
:param core_ext_type: Whether this is a core or extension content frame
"""
if isinstance(csv_info, DataFrameType):
csv_content = csv_info.df
if isinstance(csv_info.files, pd.DataFrame):
csv_content = csv_info.files.copy(deep=True)
else:
csv_content = self._combine_contents(csv_info.files, csv_info.csv_encoding)

Expand Down Expand Up @@ -982,14 +990,12 @@ def _write_associated_files(self, dwca_zip: ZipFile):
for file in self.embedded_files:
dwca_zip.write(file, file.name)

def write_dwca(self, output_dwca_path: str):
def write_dwca(self, output_dwca_path: Union[str | BytesIO]):
"""Write a full DwCA to a zip file
Any parent directories needed are created during writing.
:param output_dwca_path: The file path to write the .zip file to
"""
dwca_path = Path(output_dwca_path)
dwca_path.parent.mkdir(parents=True, exist_ok=True)
with ZipFile(output_dwca_path, 'w', allowZip64=True,
compression=zipfile.ZIP_DEFLATED) as dwca_zip:
self._write_df_content_to_zip_file(dwca_zip=dwca_zip, content=self.core_content)
Expand Down
19 changes: 10 additions & 9 deletions src/dwcahandler/dwca/dwca_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@
import logging
from typing import Union
import pandas as pd
from dwcahandler.dwca import CsvFileType, DataFrameType, Dwca, Terms, Eml

from dwcahandler.dwca import CsvFileType, Dwca, Terms, Eml
from io import BytesIO

logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', level=logging.INFO)
log = logging.getLogger("DwcaFactoryManager")
Expand All @@ -22,9 +22,9 @@ def list_dwc_terms() -> pd.DataFrame:
"""Perform various DwCA operations"""

@staticmethod
def create_dwca(core_csv: Union[CsvFileType, DataFrameType],
output_dwca_path: str,
ext_csv_list: list[Union[CsvFileType, DataFrameType]] = None,
def create_dwca(core_csv: CsvFileType,
output_dwca_path: Union[str, BytesIO],
ext_csv_list: list[CsvFileType] = None,
validate_content: bool = True,
eml_content: Union[str, Eml] = ''):
"""Create a suitable DwCA from a list of CSV files
Expand All @@ -50,8 +50,8 @@ def remove_extension_files(dwca_file: str, ext_files: list, output_dwca_path: st
output_dwca_path=output_dwca_path)

@staticmethod
def delete_records(dwca_file: str, records_to_delete: CsvFileType,
output_dwca_path: str):
def delete_records(dwca_file: Union[str, BytesIO], records_to_delete: CsvFileType,
output_dwca_path: Union[str, BytesIO]):
"""Delete core records listed in the records_to_delete file from DwCA.
The specified keys listed in records_to_delete param must exist in the dwca core file
Expand All @@ -63,8 +63,9 @@ def delete_records(dwca_file: str, records_to_delete: CsvFileType,
output_dwca_path=output_dwca_path)

@staticmethod
def merge_dwca(dwca_file: str, delta_dwca_file: str, output_dwca_path: str, keys_lookup: dict = None,
extension_sync: bool = False, regen_ids: bool = False, validate_delta_content: bool = True):
def merge_dwca(dwca_file: Union[str, BytesIO], delta_dwca_file: Union[str, BytesIO], output_dwca_path: Union[str, BytesIO],
keys_lookup: dict = None, extension_sync: bool = False, regen_ids: bool = False,
validate_delta_content: bool = True):
"""Merge a DwCA with a delta DwCA of changes.
:param dwca_file: The path to the existing DwCA
Expand Down
Loading

0 comments on commit 836b6ef

Please sign in to comment.