From 8003c8cafc71c10a9af48014bc03d96f2d2bfbd4 Mon Sep 17 00:00:00 2001 From: Patricia Koh Date: Fri, 24 Jan 2025 11:39:33 +1100 Subject: [PATCH 1/7] https://github.com/AtlasOfLivingAustralia/dwcahandler/issues/17 - Fix for metadata fields and add validate dwca test cases --- src/dwcahandler/dwca/core_dwca.py | 38 ++++++++++++------- src/dwcahandler/dwca/dwca_factory.py | 2 +- src/dwcahandler/dwca/dwca_meta.py | 12 +++++- tests/input_files/dwca/dwca-sample1/meta.xml | 17 +++++++++ .../dwca/dwca-sample1/occurrence.txt | 6 +++ tests/input_files/dwca/dwca-sample2/meta.xml | 15 ++++++++ .../dwca/dwca-sample2/occurrence.txt | 3 ++ tests/input_files/dwca/dwca-sample3/meta.xml | 15 ++++++++ .../dwca/dwca-sample3/occurrence.txt | 4 ++ tests/input_files/dwca/dwca-sample4/meta.xml | 15 ++++++++ .../dwca/dwca-sample4/occurrence.csv | 13 +++++++ tests/input_files/dwca/dwca-sample5/meta.xml | 14 +++++++ .../dwca/dwca-sample5/occurrence.csv | 13 +++++++ 13 files changed, 152 insertions(+), 15 deletions(-) create mode 100755 tests/input_files/dwca/dwca-sample1/meta.xml create mode 100755 tests/input_files/dwca/dwca-sample1/occurrence.txt create mode 100644 tests/input_files/dwca/dwca-sample2/meta.xml create mode 100644 tests/input_files/dwca/dwca-sample2/occurrence.txt create mode 100644 tests/input_files/dwca/dwca-sample3/meta.xml create mode 100644 tests/input_files/dwca/dwca-sample3/occurrence.txt create mode 100755 tests/input_files/dwca/dwca-sample4/meta.xml create mode 100644 tests/input_files/dwca/dwca-sample4/occurrence.csv create mode 100755 tests/input_files/dwca/dwca-sample5/meta.xml create mode 100644 tests/input_files/dwca/dwca-sample5/occurrence.csv diff --git a/src/dwcahandler/dwca/core_dwca.py b/src/dwcahandler/dwca/core_dwca.py index a594cd1..a8040cd 100644 --- a/src/dwcahandler/dwca/core_dwca.py +++ b/src/dwcahandler/dwca/core_dwca.py @@ -207,6 +207,10 @@ def convert_values(v): csv_file_name = meta_elm.meta_element_type.file_name with io.TextIOWrapper(zf.open(csv_file_name), encoding="utf-8") as csv_file: dwc_headers = [f.field_name for f in meta_elm.fields if f.index is not None] + duplicates = [i for i in set(dwc_headers) if dwc_headers.count(i) > 1] + if len(duplicates) > 0: + raise ValueError(f"Duplicate columns {duplicates} specified in the " + f"metadata for {csv_file_name}") csv_encoding = {key: convert_values(value) for key, value in asdict(meta_elm.meta_element_type.csv_encoding).items()} csv_content = self._read_csv( @@ -825,17 +829,18 @@ def check_duplicates(self, content_keys_df, keys, error_file=None): """ def report_error(content, keys, message, condition, error_file=None): - log.error("%s found in keys %s", message, keys) - log.error("\n%s count\n%s", message, condition.sum()) - log.error("\n%s", content.loc[condition.values, keys].index.tolist()) - if error_file: - content.loc[condition.values, keys].to_csv(error_file, index=False) + content.loc[condition.values, keys].to_csv(error_file, index=False) checks_status: bool = True if len(keys) > 0: empty_values_condition = content_keys_df.isnull() if empty_values_condition.values.any(): - report_error(content_keys_df, keys, "Empty Values", empty_values_condition) + log.error("Empty values found in %s. Total rows affected: %s", keys, + empty_values_condition.sum().sum()) + log.error("Empty values found in dataframe row: %s", + content_keys_df.index[empty_values_condition.all(axis=1)].tolist()) + if error_file: + report_error(content_keys_df, keys, "Empty Values", empty_values_condition) checks_status = False # check incase-sensitive duplicates @@ -846,8 +851,11 @@ def to_lower(df): df_keys = to_lower(content_keys_df) duplicate_condition = df_keys.duplicated(keep='first') if duplicate_condition.values.any(): - report_error(content_keys_df, keys, "Duplicate Values", - duplicate_condition, error_file) + log.error(f"Duplicate %s found. Total rows affected: %s", keys, duplicate_condition.sum()) + log.error("Duplicate values: %s", pd.unique(content_keys_df[duplicate_condition].stack())) + if error_file: + report_error(content_keys_df, keys, "Duplicate Values", + duplicate_condition, error_file) checks_status = False return checks_status @@ -907,9 +915,9 @@ def validate_content(self, content_type_to_validate: list[str] = None, error_fil if not self._validate_columns(content): return False - dup_cols = self._find_duplicate_columns(content) - if len(dup_cols) > 0: - return False + #dup_cols = self._find_duplicate_columns(content) + #if len(dup_cols) > 0: + # return False return True @@ -1065,6 +1073,10 @@ def _read_csv(self, return ret_val except EmptyDataError: - log.error(f"The expected columns: %s are not present in the {csv_file}. " - f"The file may be empty", ','.join(columns)) + if columns: + log.error(f"The file may be empty {csv_file}") + else: + log.error(f"The expected columns: %s are not present in the {csv_file}. " + f"The file may be empty", ','.join(columns)) + return pd.DataFrame() diff --git a/src/dwcahandler/dwca/dwca_factory.py b/src/dwcahandler/dwca/dwca_factory.py index ac68eae..0082b2a 100644 --- a/src/dwcahandler/dwca/dwca_factory.py +++ b/src/dwcahandler/dwca/dwca_factory.py @@ -82,7 +82,7 @@ def merge_dwca(dwca_file: Union[str, BytesIO], delta_dwca_file: Union[str, Bytes regen_ids=regen_ids, validate_delta=validate_delta_content) @staticmethod - def validate_dwca(dwca_file: str, keys_lookup: dict = None, error_file: str = None): + def validate_dwca(dwca_file: Union[str, BytesIO], keys_lookup: dict = None, error_file: str = None): """Test a dwca for consistency :param dwca_file: The path to the DwCA diff --git a/src/dwcahandler/dwca/dwca_meta.py b/src/dwcahandler/dwca/dwca_meta.py index b8451fc..772a5b5 100644 --- a/src/dwcahandler/dwca/dwca_meta.py +++ b/src/dwcahandler/dwca/dwca_meta.py @@ -143,7 +143,16 @@ def __extract_meta_info(self, ns, node_elm, core_or_ext_type): def extract_field_attr_value(field_elm, attrib): return field_elm.attrib.get(attrib) if field_elm.attrib.get(attrib) else None + def __find_id_in_fields(local_fields, id_field): + index_number = id_field[0].attrib["index"] if len(id_field) > 0 else "0" + return next((item for item in local_fields if "index" in item.attrib and item.attrib["index"]==index_number), None) + fields = node_elm.findall(f'{ns}field') + id_field = [] + if core_or_ext_type == 'core': + id_field = node_elm.findall(f'{ns}id') + else: + id_field = node_elm.findall(f'{ns}coreid') file_name = node_elm.find(f'{ns}files').find(f'{ns}location').text meta_element_info = MetaElementInfo( core_or_ext_type=core_or_ext_type, @@ -157,7 +166,8 @@ def extract_field_attr_value(field_elm, attrib): charset_encoding=node_elm.attrib['encoding'], file_name=file_name) # set first field with index 0 if it's not present in list of fields - if fields[0].attrib['index'] != '0': + field_elm = __find_id_in_fields(fields, id_field) + if field_elm is None and len(id_field) > 0: if CoreOrExtType.CORE == core_or_ext_type: field_list = [Field(index=0, field_name="id")] else: diff --git a/tests/input_files/dwca/dwca-sample1/meta.xml b/tests/input_files/dwca/dwca-sample1/meta.xml new file mode 100755 index 0000000..f851204 --- /dev/null +++ b/tests/input_files/dwca/dwca-sample1/meta.xml @@ -0,0 +1,17 @@ + + + + + occurrence.txt + + + + + + + + + + + + diff --git a/tests/input_files/dwca/dwca-sample1/occurrence.txt b/tests/input_files/dwca/dwca-sample1/occurrence.txt new file mode 100755 index 0000000..180a4f6 --- /dev/null +++ b/tests/input_files/dwca/dwca-sample1/occurrence.txt @@ -0,0 +1,6 @@ +id,occurrenceID,scientificName,decimalLatitude,decimalLongitude,eventDate,recordedBy,geodeticDatum,basisOfRecord +1,1001,SpeciesA,12.34,-56.78,2023-01-01,John Doe,WGS84,PreservedSpecimen +2,1002,SpeciesB,-34.56,78.90,2023-02-15,Jane Smith,WGS84,HumanObservation +3,1003,SpeciesC,0.123,45.678,2023-03-20,Bob Johnson,WGS84,FossilSpecimen +4,1004,SpeciesD,-23.456,-12.345,2023-04-10,Alice Brown,WGS84,MachineObservation +5,1005,SpeciesE,89.012,-67.890,2023-05-25,Charlie White,WGS84,PreservedSpecimen \ No newline at end of file diff --git a/tests/input_files/dwca/dwca-sample2/meta.xml b/tests/input_files/dwca/dwca-sample2/meta.xml new file mode 100644 index 0000000..73f0d1b --- /dev/null +++ b/tests/input_files/dwca/dwca-sample2/meta.xml @@ -0,0 +1,15 @@ + + + + occurrence.txt + + + + + + + + + + + \ No newline at end of file diff --git a/tests/input_files/dwca/dwca-sample2/occurrence.txt b/tests/input_files/dwca/dwca-sample2/occurrence.txt new file mode 100644 index 0000000..7b1942d --- /dev/null +++ b/tests/input_files/dwca/dwca-sample2/occurrence.txt @@ -0,0 +1,3 @@ +gbifID occurrenceID scientificName decimalLatitude decimalLongitude basisOfRecord +1 occ1 Euphorbia paralias -36.00000 150.5678 Observations +2 occ2 Acaciella angustissima -20.0000 145.1234 Observations \ No newline at end of file diff --git a/tests/input_files/dwca/dwca-sample3/meta.xml b/tests/input_files/dwca/dwca-sample3/meta.xml new file mode 100644 index 0000000..73f0d1b --- /dev/null +++ b/tests/input_files/dwca/dwca-sample3/meta.xml @@ -0,0 +1,15 @@ + + + + occurrence.txt + + + + + + + + + + + \ No newline at end of file diff --git a/tests/input_files/dwca/dwca-sample3/occurrence.txt b/tests/input_files/dwca/dwca-sample3/occurrence.txt new file mode 100644 index 0000000..2dde9e0 --- /dev/null +++ b/tests/input_files/dwca/dwca-sample3/occurrence.txt @@ -0,0 +1,4 @@ +gbifID occurrenceID scientificName decimalLatitude decimalLongitude basisOfRecord +1 Euphorbia paralias -36.00000 150.5678 Observations +2 occ2 Acaciella angustissima -20.0000 145.1234 Observations +3 occ3 Acaciella angustissima -20.0000 145.1234 Observations \ No newline at end of file diff --git a/tests/input_files/dwca/dwca-sample4/meta.xml b/tests/input_files/dwca/dwca-sample4/meta.xml new file mode 100755 index 0000000..5e16a94 --- /dev/null +++ b/tests/input_files/dwca/dwca-sample4/meta.xml @@ -0,0 +1,15 @@ + + + + + occurrence.csv + + + + + + + + + + diff --git a/tests/input_files/dwca/dwca-sample4/occurrence.csv b/tests/input_files/dwca/dwca-sample4/occurrence.csv new file mode 100644 index 0000000..bd6de2e --- /dev/null +++ b/tests/input_files/dwca/dwca-sample4/occurrence.csv @@ -0,0 +1,13 @@ +eventDate,basisOfRecord,scientificName,catalogNumber,decimalLatitude,decimalLongitude +2015-07-18T12:58:00+00:00,Human Observation,Species A,014800,-30.00000,144 +2015-07-18T12:58:31+00:00,Human Observation,Species B,,-31.00000,145 +2015-07-18T18:16:52+00:00,Human Observation,Species C,014824,-32.00000,100.828059 +2015-07-19T04:28:19+00:00,Human Observation,Species D,014823,-33.00000,101.820888 +2015-07-19T18:29:25+00:00,Human Observation,Species A1,014822,-34.00000,102.821654 +2015-07-20T18:03:12+00:00,Human Observation,Species A2,014821,-35.00000,104.999974 +2015-07-21T18:06:58+00:00,Human Observation,Species A3,014802,-34.00000,120.889354 +2015-07-22T04:42:47+00:00,Human Observation,Species B1,014800,-36.00000,150.308848 +2015-07-22T17:54:18+00:00,Human Observation,Species B2,014800,-30.00000,146.240159 +2015-07-22T23:09:51+00:00,Human Observation,Species C1,014799,-31.00000,150.783246 +2015-07-23T17:37:26+00:00,Human Observation,Species D,014798,-40.00000,150.823468 +2015-07-24T13:10:00+00:00,Human Observation,Species E,014823,-28.00000,115 \ No newline at end of file diff --git a/tests/input_files/dwca/dwca-sample5/meta.xml b/tests/input_files/dwca/dwca-sample5/meta.xml new file mode 100755 index 0000000..84096f6 --- /dev/null +++ b/tests/input_files/dwca/dwca-sample5/meta.xml @@ -0,0 +1,14 @@ + + + + + occurrence.csv + + + + + + + + + diff --git a/tests/input_files/dwca/dwca-sample5/occurrence.csv b/tests/input_files/dwca/dwca-sample5/occurrence.csv new file mode 100644 index 0000000..422a533 --- /dev/null +++ b/tests/input_files/dwca/dwca-sample5/occurrence.csv @@ -0,0 +1,13 @@ +eventDate,basisOfRecord,scientificName,catalogNumber,decimalLatitude,decimalLongitude +2015-07-18T12:58:00+00:00,Human Observation,Species A,014826,-30,144 +2015-07-18T12:58:31+00:00,Human Observation,Species A1,014825,-31.1111,145 +2015-07-18T18:16:52+00:00,Human Observation,Species A2,014824,-32.085431,100.828059 +2015-07-19T04:28:19+00:00,Human Observation,Species A3,014823,-33.097233,101.820888 +2015-07-19T18:29:25+00:00,Human Observation,Species B1,014822,-34.099936,102.821654 +2015-07-20T18:03:12+00:00,Human Observation,Species B2,014821,-35.893671,104.999974 +2015-07-21T18:06:58+00:00,Human Observation,Species C,014820,-34.113747,120.889354 +2015-07-22T04:42:47+00:00,Human Observation,Species C2,014810,-36,144.308848 +2015-07-22T17:54:18+00:00,Human Observation,Species C3,014800,-30.440251,146.240159 +2015-07-22T23:09:51+00:00,Human Observation,Species D,014799,-31.547195,150.783246 +2015-07-23T17:37:26+00:00,Human Observation,Species D1,-40.481117,150.823468 +2015-07-24T13:10:00+00:00,Human Observation,Species D2,014792,-28,115 \ No newline at end of file From 885f23e691fc69ed39a947a3c3e6365cbe3d47cb Mon Sep 17 00:00:00 2001 From: Patricia Koh Date: Fri, 24 Jan 2025 11:43:02 +1100 Subject: [PATCH 2/7] https://github.com/AtlasOfLivingAustralia/dwcahandler/issues/17 - Remove commented code --- src/dwcahandler/dwca/core_dwca.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/dwcahandler/dwca/core_dwca.py b/src/dwcahandler/dwca/core_dwca.py index a8040cd..359aa1d 100644 --- a/src/dwcahandler/dwca/core_dwca.py +++ b/src/dwcahandler/dwca/core_dwca.py @@ -896,7 +896,6 @@ def validate_content(self, content_type_to_validate: list[str] = None, error_fil - No duplicate record keys - Valid columns - - No duplicate columns :param error_file: A file to record errors :return: True if the DwCA is value, False otherwise @@ -915,10 +914,6 @@ def validate_content(self, content_type_to_validate: list[str] = None, error_fil if not self._validate_columns(content): return False - #dup_cols = self._find_duplicate_columns(content) - #if len(dup_cols) > 0: - # return False - return True def extract_csv_content(self, csv_info: CsvFileType, From c6d7ac040f131b8bdcd2fcfa7d94a0868bf02d4f Mon Sep 17 00:00:00 2001 From: Patricia Koh Date: Fri, 24 Jan 2025 13:54:26 +1100 Subject: [PATCH 3/7] https://github.com/AtlasOfLivingAustralia/dwcahandler/issues/17 - Validate unit test --- tests/test_validate_dwca.py | 76 +++++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 tests/test_validate_dwca.py diff --git a/tests/test_validate_dwca.py b/tests/test_validate_dwca.py new file mode 100644 index 0000000..3f68fba --- /dev/null +++ b/tests/test_validate_dwca.py @@ -0,0 +1,76 @@ +from io import BytesIO +from zipfile import ZipFile +import zipfile +from pathlib import Path +from dwcahandler import DwcaHandler +import logging +import pytest + +input_folder = "./input_files/dwca" + + +def make_zip_from_folder_contents(folder: str): + zip_buffer = BytesIO() + with ZipFile(file=zip_buffer, mode="w", compression=zipfile.ZIP_DEFLATED, allowZip64=True) as zf: + for path in Path(folder).rglob("*"): + zf.write(path, arcname=path.name) + zf.close() + return zip_buffer + + +class TestValidateDwca: + + def test_validate_dwca(self): + """ + Test for read and extract dwca. Validate core content + """ + simple_dwca = make_zip_from_folder_contents(f"{input_folder}/dwca-sample1") + keys_lookup = {'occurrence': 'occurrenceID'} + dwca_result = DwcaHandler.validate_dwca(dwca_file=simple_dwca, keys_lookup=keys_lookup) + assert dwca_result + + def test_validate_dwca2(self): + """ + Test for read and extract dwca. Validate core content + """ + simple_dwca = make_zip_from_folder_contents(f"{input_folder}/dwca-sample2") + keys_lookup = {'occurrence': 'occurrenceID'} + dwca_result = DwcaHandler.validate_dwca(dwca_file=simple_dwca, keys_lookup=keys_lookup) + assert dwca_result + + def test_empty_keys(self, caplog): + """ + Test for read and extract dwca. Validate core content with empty keys + """ + caplog.set_level(logging.INFO) + simple_dwca = make_zip_from_folder_contents(f"{input_folder}/dwca-sample3") + keys_lookup = {'occurrence': 'occurrenceID'} + dwca_result = DwcaHandler.validate_dwca(dwca_file=simple_dwca, keys_lookup=keys_lookup) + assert not dwca_result + assert "Empty values found in ['occurrenceID']. Total rows affected: 1" in caplog.messages + assert "Empty values found in dataframe row: [0]" in caplog.messages + + def test_duplicate_key(self, caplog): + """ + Test for read and extract dwca. Validate core content with duplicate keys + """ + caplog.set_level(logging.INFO) + simple_dwca = make_zip_from_folder_contents(f"{input_folder}/dwca-sample4") + keys_lookup = {'occurrence': 'catalogNumber'} + dwca_result = DwcaHandler.validate_dwca(dwca_file=simple_dwca, keys_lookup=keys_lookup) + assert not dwca_result + assert "Duplicate ['catalogNumber'] found. Total rows affected: 3" in caplog.messages + assert "Duplicate values: ['014800' '014823']" in caplog.messages + + def test_duplicate_columns_in_dwca(self): + """ + Test for read and extract dwca. Validate duplicate columns specified in metadata of dwca + """ + simple_dwca = make_zip_from_folder_contents(f"{input_folder}/dwca-sample5") + keys_lookup = {'occurrence': 'catalogNumber'} + + with pytest.raises(ValueError) as exc_info: + DwcaHandler.validate_dwca(dwca_file=simple_dwca, keys_lookup=keys_lookup) + + assert "Duplicate columns ['catalogNumber'] specified in the " \ + "metadata for occurrence.csv" in str(exc_info.value) From e2467882f41668ed5fd248fadaffb3baf1048a3c Mon Sep 17 00:00:00 2001 From: Patricia Koh Date: Tue, 28 Jan 2025 14:51:04 +1100 Subject: [PATCH 4/7] Update macOS version for test --- .github/workflows/run-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml index 96366dc..8759921 100644 --- a/.github/workflows/run-tests.yml +++ b/.github/workflows/run-tests.yml @@ -15,7 +15,7 @@ jobs: strategy: fail-fast: true matrix: - os: [ubuntu-latest, macos-12, Windows-latest] + os: [ubuntu-latest, macos-15, Windows-latest] python: - "3.9" - "3.10" From 302141f4def1a05fc1330f07acd4932bd806de83 Mon Sep 17 00:00:00 2001 From: Patricia Koh Date: Tue, 28 Jan 2025 18:03:47 +1100 Subject: [PATCH 5/7] https://github.com/AtlasOfLivingAustralia/preingestion/issues/272 - Strip column header spaces in csv files. Add test cases to test the core and ext dataframe --- src/dwcahandler/dwca/core_dwca.py | 3 + .../sample/multimedia_header_with_space.csv | 3 + .../sample/occ_header_with_space.csv | 4 ++ tests/test_create_core_and_ext_content.py | 62 +++++++++++++++++++ 4 files changed, 72 insertions(+) create mode 100644 tests/input_files/sample/multimedia_header_with_space.csv create mode 100644 tests/input_files/sample/occ_header_with_space.csv diff --git a/src/dwcahandler/dwca/core_dwca.py b/src/dwcahandler/dwca/core_dwca.py index 359aa1d..08b6a87 100644 --- a/src/dwcahandler/dwca/core_dwca.py +++ b/src/dwcahandler/dwca/core_dwca.py @@ -1065,6 +1065,9 @@ def _read_csv(self, ret_val.dropna(how="all", inplace=True) log.debug("Extracted %d rows from csv %s", len(ret_val), csv_file) + # Strip column header spaces + ret_val.rename(str.strip, axis = 'columns', inplace=True) + return ret_val except EmptyDataError: diff --git a/tests/input_files/sample/multimedia_header_with_space.csv b/tests/input_files/sample/multimedia_header_with_space.csv new file mode 100644 index 0000000..b2655de --- /dev/null +++ b/tests/input_files/sample/multimedia_header_with_space.csv @@ -0,0 +1,3 @@ + catalogNumber ,identifier, format ,type +C4,https://images.ala.org.au/image/proxyImageThumbnailLarge?imageId=XXX,image/jpeg,StillImage +C5,https://images.ala.org.au/image/proxyImageThumbnailLarge?imageId=XYZ,image/jpeg,StillImage \ No newline at end of file diff --git a/tests/input_files/sample/occ_header_with_space.csv b/tests/input_files/sample/occ_header_with_space.csv new file mode 100644 index 0000000..b5ef621 --- /dev/null +++ b/tests/input_files/sample/occ_header_with_space.csv @@ -0,0 +1,4 @@ + catalogNumber ,basisOfRecord,scientificName ,license,decimalLatitude,decimalLongitude +C3,Human Observation,Ageratina adenophora,CC-BY 4.0 (Int),-10.0000,120.0000 +C4,Human Observation,Ageratina adenophora,CC-BY 4.0 (Int),-11.1111,125.0000 +C5,Human Observation,Delairea odorata,CC-BY 4.0 (Int),-12.085431,130.828059 diff --git a/tests/test_create_core_and_ext_content.py b/tests/test_create_core_and_ext_content.py index d721cef..8091987 100644 --- a/tests/test_create_core_and_ext_content.py +++ b/tests/test_create_core_and_ext_content.py @@ -3,6 +3,7 @@ from operator import attrgetter import pytest import pandas as pd +from pandas import testing as pdtest from dwcahandler.dwca import CSVEncoding, CsvFileType, CoreOrExtType, MetaElementTypes from dwcahandler.dwca.core_dwca import Dwca @@ -15,6 +16,10 @@ "delimiter": "\t"} duplicates_csv_occ_test = {"file_paths": single_csv_occ_test["file_paths"] + multiple_csv_occ_test["file_paths"], "delimiter": ","} +csv_occ_with_space = {"file_paths": ['./input_files/occurrence/occ_file1.csv', './input_files/sample/occ_header_with_space.csv'], + "delimiter": ","} +multimedia_with_space = {"file_paths": ['./input_files/multimedia/multimedia_file.csv', './input_files/sample/multimedia_header_with_space.csv'], + "delimiter": ","} def get_expected_combined_occ_df(file_paths: list, keys: list, delimiter: str = ","): @@ -149,3 +154,60 @@ def test_extract_tsv_ext_content(self): # Test that the meta content extension if of multimedia type assert (dwca_creator.meta_content.meta_elements[1].meta_element_type.type == MetaElementTypes.get_element('multimedia')) + + def test_extract_csv_with_header_space(self): + """ + Test extract records from csv with header space + """ + + dwca_creator = Dwca() + + dwca_creator.extract_csv_content(csv_info=CsvFileType(files=csv_occ_with_space['file_paths'], + type='occurrence', + keys=['catalogNumber'], + csv_encoding=CSVEncoding( + csv_delimiter=csv_occ_with_space["delimiter"])), + core_ext_type=CoreOrExtType.CORE) + + expected_column_list = ["id", "catalogNumber", "basisOfRecord", "scientificName", + "license","decimalLatitude","decimalLongitude"] + assert set(dwca_creator.core_content.df_content.columns) == set(expected_column_list) + assert len(dwca_creator.core_content.df_content) == 5 + pdtest.assert_series_equal(dwca_creator.core_content.df_content["catalogNumber"], + pd.Series(["C1", "C2", "C3", "C4", "C5"], dtype=str, name='catalogNumber'), + check_index_type=False, check_index=False) + + def test_extract_csv_ext_with_header_space(self): + """ + Test extract records from multimedia csv with header space + """ + + dwca_creator = Dwca() + + dwca_creator.extract_csv_content(csv_info=CsvFileType(files=csv_occ_with_space['file_paths'], + type='occurrence', + keys=['catalogNumber'], + csv_encoding=CSVEncoding( + csv_delimiter=csv_occ_with_space["delimiter"])), + core_ext_type=CoreOrExtType.CORE) + + dwca_creator.extract_csv_content(csv_info=CsvFileType(files=multimedia_with_space['file_paths'], + type='multimedia', + keys=['catalogNumber'], + csv_encoding=CSVEncoding(csv_delimiter=',')), + core_ext_type=CoreOrExtType.EXTENSION) + + expected_column_list = ["id", "catalogNumber", "basisOfRecord", "scientificName", + "license","decimalLatitude","decimalLongitude"] + assert set(dwca_creator.core_content.df_content.columns) == set(expected_column_list) + assert len(dwca_creator.core_content.df_content) == 5 + pdtest.assert_series_equal(dwca_creator.core_content.df_content["catalogNumber"], + pd.Series(["C1", "C2", "C3", "C4", "C5"], dtype=str, name="catalogNumber"), + check_index_type=False, check_index=False) + + expected_column_list = ["coreid", "catalogNumber", "identifier", "format", "type"] + assert set(dwca_creator.ext_content[0].df_content.columns) == set(expected_column_list) + assert len(dwca_creator.ext_content[0].df_content) == 5 + pdtest.assert_series_equal(dwca_creator.ext_content[0].df_content["catalogNumber"], + pd.Series(["C1", "C2", "C3", "C4", "C5"], dtype=str, name="catalogNumber"), + check_index_type=False, check_index=False) From e374b1ddd88ed80b6391715936e1c862b36c9d28 Mon Sep 17 00:00:00 2001 From: Patricia Koh Date: Wed, 29 Jan 2025 10:22:56 +1100 Subject: [PATCH 6/7] increase test version --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index a5dcc6b..314aab1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "dwcahandler" -version = "0.3.0" +version = "0.4.0b1" description = "Python package to handle Darwin Core Archive (DwCA) operations. This includes creating a DwCA zip file from one or more csvs, reading a DwCA, merge two DwCAs, validate DwCA and delete records from DwCA based on one or more key columns" authors = ["Atlas of Living Australia data team "] maintainers = ["Atlas of Living Australia data team "] From b6f656c54a230504283d6df375ae7345499c62e3 Mon Sep 17 00:00:00 2001 From: Mahmoud Date: Wed, 29 Jan 2025 01:45:52 +0000 Subject: [PATCH 7/7] Added devcontainer --- .devcontainer/devcontainer.json | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 .devcontainer/devcontainer.json diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json new file mode 100644 index 0000000..3817ad5 --- /dev/null +++ b/.devcontainer/devcontainer.json @@ -0,0 +1,30 @@ +// For format details, see https://aka.ms/devcontainer.json. For config options, see the +// README at: https://github.com/devcontainers/templates/tree/main/src/python +{ + "name": "Python 3", + // Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile + "image": "mcr.microsoft.com/devcontainers/python:1-3.9-bookworm", + + // Features to add to the dev container. More info: https://containers.dev/features. + "features": { "ghcr.io/devcontainers-contrib/features/poetry": "latest" }, + + // Use 'forwardPorts' to make a list of ports inside the container available locally. + // "forwardPorts": [], + + // Use 'postCreateCommand' to run commands after the container is created. + "postCreateCommand": "poetry install", + + // Configure tool-specific properties. + // "customizations": {}, + + // Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root. + "remoteUser": "vscode", + "customizations": { + "vscode": { + "extensions": [ + "ms-python.python", + "vector-of-bool.gitflow" + ] + } + } +}