From 8003c8cafc71c10a9af48014bc03d96f2d2bfbd4 Mon Sep 17 00:00:00 2001
From: Patricia Koh <patricia.koh@csiro.au>
Date: Fri, 24 Jan 2025 11:39:33 +1100
Subject: [PATCH 1/7] 
 https://github.com/AtlasOfLivingAustralia/dwcahandler/issues/17 - Fix for
 metadata fields and add validate dwca test cases

---
 src/dwcahandler/dwca/core_dwca.py             | 38 ++++++++++++-------
 src/dwcahandler/dwca/dwca_factory.py          |  2 +-
 src/dwcahandler/dwca/dwca_meta.py             | 12 +++++-
 tests/input_files/dwca/dwca-sample1/meta.xml  | 17 +++++++++
 .../dwca/dwca-sample1/occurrence.txt          |  6 +++
 tests/input_files/dwca/dwca-sample2/meta.xml  | 15 ++++++++
 .../dwca/dwca-sample2/occurrence.txt          |  3 ++
 tests/input_files/dwca/dwca-sample3/meta.xml  | 15 ++++++++
 .../dwca/dwca-sample3/occurrence.txt          |  4 ++
 tests/input_files/dwca/dwca-sample4/meta.xml  | 15 ++++++++
 .../dwca/dwca-sample4/occurrence.csv          | 13 +++++++
 tests/input_files/dwca/dwca-sample5/meta.xml  | 14 +++++++
 .../dwca/dwca-sample5/occurrence.csv          | 13 +++++++
 13 files changed, 152 insertions(+), 15 deletions(-)
 create mode 100755 tests/input_files/dwca/dwca-sample1/meta.xml
 create mode 100755 tests/input_files/dwca/dwca-sample1/occurrence.txt
 create mode 100644 tests/input_files/dwca/dwca-sample2/meta.xml
 create mode 100644 tests/input_files/dwca/dwca-sample2/occurrence.txt
 create mode 100644 tests/input_files/dwca/dwca-sample3/meta.xml
 create mode 100644 tests/input_files/dwca/dwca-sample3/occurrence.txt
 create mode 100755 tests/input_files/dwca/dwca-sample4/meta.xml
 create mode 100644 tests/input_files/dwca/dwca-sample4/occurrence.csv
 create mode 100755 tests/input_files/dwca/dwca-sample5/meta.xml
 create mode 100644 tests/input_files/dwca/dwca-sample5/occurrence.csv

diff --git a/src/dwcahandler/dwca/core_dwca.py b/src/dwcahandler/dwca/core_dwca.py
index a594cd1..a8040cd 100644
--- a/src/dwcahandler/dwca/core_dwca.py
+++ b/src/dwcahandler/dwca/core_dwca.py
@@ -207,6 +207,10 @@ def convert_values(v):
                 csv_file_name = meta_elm.meta_element_type.file_name
                 with io.TextIOWrapper(zf.open(csv_file_name), encoding="utf-8") as csv_file:
                     dwc_headers = [f.field_name for f in meta_elm.fields if f.index is not None]
+                    # Sorted so the error message is stable when several headers repeat.
+                    if duplicates := sorted({h for h in dwc_headers if dwc_headers.count(h) > 1}):
+                        raise ValueError(f"Duplicate columns {duplicates} specified in the "
+                                         f"metadata for {csv_file_name}")
                     csv_encoding = {key: convert_values(value) for key, value in
                                     asdict(meta_elm.meta_element_type.csv_encoding).items()}
                     csv_content = self._read_csv(
@@ -825,17 +829,18 @@ def check_duplicates(self, content_keys_df, keys, error_file=None):
         """
 
         def report_error(content, keys, message, condition, error_file=None):
-            log.error("%s found in keys %s", message, keys)
-            log.error("\n%s count\n%s", message, condition.sum())
-            log.error("\n%s", content.loc[condition.values, keys].index.tolist())
-            if error_file:
-                content.loc[condition.values, keys].to_csv(error_file, index=False)
+            content.loc[condition.values, keys].to_csv(error_file, index=False)
 
         checks_status: bool = True
         if len(keys) > 0:
             empty_values_condition = content_keys_df.isnull()
             if empty_values_condition.values.any():
-                report_error(content_keys_df, keys, "Empty Values", empty_values_condition)
+                # Flag a row when ANY key column is empty, so partially empty multi-key rows are reported.
+                empty_rows = empty_values_condition.any(axis=1)
+                log.error("Empty values found in %s. Total rows affected: %s", keys, empty_rows.sum())
+                log.error("Empty values found in dataframe row: %s", content_keys_df.index[empty_rows].tolist())
+                if error_file:
+                    report_error(content_keys_df, keys, "Empty Values", empty_rows, error_file)
                 checks_status = False
 
             # check incase-sensitive duplicates
@@ -846,8 +851,11 @@ def to_lower(df):
             df_keys = to_lower(content_keys_df)
             duplicate_condition = df_keys.duplicated(keep='first')
             if duplicate_condition.values.any():
-                report_error(content_keys_df, keys, "Duplicate Values",
-                             duplicate_condition, error_file)
+                # Only the repeats are counted: duplicated(keep='first') leaves the first occurrence unflagged.
+                log.error("Duplicate %s found. Total rows affected: %s", keys, duplicate_condition.sum())
+                log.error("Duplicate values: %s", pd.unique(content_keys_df[duplicate_condition].stack()))
+                if error_file:
+                    report_error(content_keys_df, keys, "Duplicate Values", duplicate_condition, error_file)
                 checks_status = False
 
         return checks_status
@@ -907,9 +915,9 @@ def validate_content(self, content_type_to_validate: list[str] = None, error_fil
             if not self._validate_columns(content):
                 return False
 
-            dup_cols = self._find_duplicate_columns(content)
-            if len(dup_cols) > 0:
-                return False
+            #dup_cols = self._find_duplicate_columns(content)
+            #if len(dup_cols) > 0:
+            #    return False
 
         return True
 
@@ -1065,6 +1073,10 @@ def _read_csv(self,
             return ret_val
 
         except EmptyDataError:
-            log.error(f"The expected columns: %s are not present in the {csv_file}. "
-                      f"The file may be empty", ','.join(columns))
+            # Name the expected columns only when the caller actually supplied some.
+            if columns:
+                log.error("The expected columns: %s are not present in the %s. "
+                          "The file may be empty", ','.join(columns), csv_file)
+            else:
+                log.error("The file may be empty %s", csv_file)
             return pd.DataFrame()
diff --git a/src/dwcahandler/dwca/dwca_factory.py b/src/dwcahandler/dwca/dwca_factory.py
index ac68eae..0082b2a 100644
--- a/src/dwcahandler/dwca/dwca_factory.py
+++ b/src/dwcahandler/dwca/dwca_factory.py
@@ -82,7 +82,7 @@ def merge_dwca(dwca_file: Union[str, BytesIO], delta_dwca_file: Union[str, Bytes
                                                  regen_ids=regen_ids, validate_delta=validate_delta_content)
 
     @staticmethod
-    def validate_dwca(dwca_file: str, keys_lookup: dict = None, error_file: str = None):
+    def validate_dwca(dwca_file: Union[str, BytesIO], keys_lookup: dict = None, error_file: str = None):
         """Test a dwca for consistency
 
         :param dwca_file: The path to the DwCA
diff --git a/src/dwcahandler/dwca/dwca_meta.py b/src/dwcahandler/dwca/dwca_meta.py
index b8451fc..772a5b5 100644
--- a/src/dwcahandler/dwca/dwca_meta.py
+++ b/src/dwcahandler/dwca/dwca_meta.py
@@ -143,7 +143,16 @@ def __extract_meta_info(self, ns, node_elm, core_or_ext_type):
         def extract_field_attr_value(field_elm, attrib):
             return field_elm.attrib.get(attrib) if field_elm.attrib.get(attrib) else None
 
+        def __find_id_in_fields(local_fields, id_field):
+            index_number = id_field[0].attrib["index"] if len(id_field) > 0 else "0"
+            return next((item for item in local_fields if "index" in item.attrib and item.attrib["index"]==index_number), None)
+
         fields = node_elm.findall(f'{ns}field')
+        # <id> names the key column of a core file and <coreid> that of an extension.
+        # Compare against the enum member, as the id-field fallback further down does.
+        id_tag = 'id' if CoreOrExtType.CORE == core_or_ext_type else 'coreid'
+        # The element is optional, so this list may be empty.
+        id_field = node_elm.findall(f'{ns}{id_tag}')
         file_name = node_elm.find(f'{ns}files').find(f'{ns}location').text
         meta_element_info = MetaElementInfo(
             core_or_ext_type=core_or_ext_type,
@@ -157,7 +166,8 @@ def extract_field_attr_value(field_elm, attrib):
             charset_encoding=node_elm.attrib['encoding'],
             file_name=file_name)
         # set first field with index 0 if it's not present in list of fields
-        if fields[0].attrib['index'] != '0':
+        field_elm = __find_id_in_fields(fields, id_field)
+        if field_elm is None and len(id_field) > 0:
             if CoreOrExtType.CORE == core_or_ext_type:
                 field_list = [Field(index=0, field_name="id")]
             else:
diff --git a/tests/input_files/dwca/dwca-sample1/meta.xml b/tests/input_files/dwca/dwca-sample1/meta.xml
new file mode 100755
index 0000000..f851204
--- /dev/null
+++ b/tests/input_files/dwca/dwca-sample1/meta.xml
@@ -0,0 +1,17 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<archive xmlns="http://rs.tdwg.org/dwc/text/">
+  <core rowType="http://rs.tdwg.org/dwc/terms/Occurrence" encoding="utf-8" fieldsTerminatedBy="," linesTerminatedBy="\n" fieldsEnclosedBy="" ignoreHeaderLines="1">
+    <files>
+      <location>occurrence.txt</location>
+    </files>
+    <id index="0" />
+    <field index="1" term="http://rs.tdwg.org/dwc/terms/occurrenceID"/>
+    <field index="2" term="http://rs.tdwg.org/dwc/terms/scientificName"/>
+    <field index="3" term="http://rs.tdwg.org/dwc/terms/decimalLatitude"/>
+    <field index="4" term="http://rs.tdwg.org/dwc/terms/decimalLongitude"/>
+    <field index="5" term="http://rs.tdwg.org/dwc/terms/eventDate"/>
+    <field index="6" term="http://rs.tdwg.org/dwc/terms/recordedBy"/>
+    <field index="7" term="http://rs.tdwg.org/dwc/terms/geodeticDatum"/>
+    <field index="8" term="http://rs.tdwg.org/dwc/terms/basisOfRecord"/>
+  </core>
+</archive>
diff --git a/tests/input_files/dwca/dwca-sample1/occurrence.txt b/tests/input_files/dwca/dwca-sample1/occurrence.txt
new file mode 100755
index 0000000..180a4f6
--- /dev/null
+++ b/tests/input_files/dwca/dwca-sample1/occurrence.txt
@@ -0,0 +1,6 @@
+id,occurrenceID,scientificName,decimalLatitude,decimalLongitude,eventDate,recordedBy,geodeticDatum,basisOfRecord
+1,1001,SpeciesA,12.34,-56.78,2023-01-01,John Doe,WGS84,PreservedSpecimen
+2,1002,SpeciesB,-34.56,78.90,2023-02-15,Jane Smith,WGS84,HumanObservation
+3,1003,SpeciesC,0.123,45.678,2023-03-20,Bob Johnson,WGS84,FossilSpecimen
+4,1004,SpeciesD,-23.456,-12.345,2023-04-10,Alice Brown,WGS84,MachineObservation
+5,1005,SpeciesE,89.012,-67.890,2023-05-25,Charlie White,WGS84,PreservedSpecimen
\ No newline at end of file
diff --git a/tests/input_files/dwca/dwca-sample2/meta.xml b/tests/input_files/dwca/dwca-sample2/meta.xml
new file mode 100644
index 0000000..73f0d1b
--- /dev/null
+++ b/tests/input_files/dwca/dwca-sample2/meta.xml
@@ -0,0 +1,15 @@
+<archive xmlns="http://rs.tdwg.org/dwc/text/" metadata="metadata.xml">
+    <core encoding="UTF-8" fieldsTerminatedBy="\t" linesTerminatedBy="\n" fieldsEnclosedBy="" ignoreHeaderLines="1" rowType="http://rs.tdwg.org/dwc/terms/Occurrence">
+        <files>
+            <location>occurrence.txt</location>
+        </files>
+        <id index="0" />
+        <field default="WGS84" term="http://rs.tdwg.org/dwc/terms/geodeticDatum"/>
+        <field index="0" term="http://rs.gbif.org/terms/1.0/gbifID"/>
+        <field index="1" term="http://rs.tdwg.org/dwc/terms/occurrenceID"/>
+        <field index="2" term="http://purl.org/dc/terms/scientificName"/>
+        <field index="3" term="http://purl.org/dc/terms/decimalLatitude"/>
+        <field index="4" term="http://purl.org/dc/terms/decimalLongitude"/>
+        <field index="5" term="http://purl.org/dc/terms/basisOfRecord"/>
+    </core>
+</archive>
\ No newline at end of file
diff --git a/tests/input_files/dwca/dwca-sample2/occurrence.txt b/tests/input_files/dwca/dwca-sample2/occurrence.txt
new file mode 100644
index 0000000..7b1942d
--- /dev/null
+++ b/tests/input_files/dwca/dwca-sample2/occurrence.txt
@@ -0,0 +1,3 @@
+gbifID	occurrenceID    scientificName	decimalLatitude	decimalLongitude	basisOfRecord
+1	occ1	Euphorbia paralias	-36.00000	150.5678	Observations
+2	occ2	Acaciella angustissima	-20.0000	145.1234	Observations
\ No newline at end of file
diff --git a/tests/input_files/dwca/dwca-sample3/meta.xml b/tests/input_files/dwca/dwca-sample3/meta.xml
new file mode 100644
index 0000000..73f0d1b
--- /dev/null
+++ b/tests/input_files/dwca/dwca-sample3/meta.xml
@@ -0,0 +1,15 @@
+<archive xmlns="http://rs.tdwg.org/dwc/text/" metadata="metadata.xml">
+    <core encoding="UTF-8" fieldsTerminatedBy="\t" linesTerminatedBy="\n" fieldsEnclosedBy="" ignoreHeaderLines="1" rowType="http://rs.tdwg.org/dwc/terms/Occurrence">
+        <files>
+            <location>occurrence.txt</location>
+        </files>
+        <id index="0" />
+        <field default="WGS84" term="http://rs.tdwg.org/dwc/terms/geodeticDatum"/>
+        <field index="0" term="http://rs.gbif.org/terms/1.0/gbifID"/>
+        <field index="1" term="http://rs.tdwg.org/dwc/terms/occurrenceID"/>
+        <field index="2" term="http://purl.org/dc/terms/scientificName"/>
+        <field index="3" term="http://purl.org/dc/terms/decimalLatitude"/>
+        <field index="4" term="http://purl.org/dc/terms/decimalLongitude"/>
+        <field index="5" term="http://purl.org/dc/terms/basisOfRecord"/>
+    </core>
+</archive>
\ No newline at end of file
diff --git a/tests/input_files/dwca/dwca-sample3/occurrence.txt b/tests/input_files/dwca/dwca-sample3/occurrence.txt
new file mode 100644
index 0000000..2dde9e0
--- /dev/null
+++ b/tests/input_files/dwca/dwca-sample3/occurrence.txt
@@ -0,0 +1,4 @@
+gbifID	occurrenceID    scientificName	decimalLatitude	decimalLongitude	basisOfRecord
+1		Euphorbia paralias	-36.00000	150.5678	Observations
+2	occ2	Acaciella angustissima	-20.0000	145.1234	Observations
+3	occ3	Acaciella angustissima	-20.0000	145.1234	Observations
\ No newline at end of file
diff --git a/tests/input_files/dwca/dwca-sample4/meta.xml b/tests/input_files/dwca/dwca-sample4/meta.xml
new file mode 100755
index 0000000..5e16a94
--- /dev/null
+++ b/tests/input_files/dwca/dwca-sample4/meta.xml
@@ -0,0 +1,15 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<archive xmlns="http://rs.tdwg.org/dwc/text/">
+  <core rowType="http://rs.tdwg.org/dwc/terms/Occurrence" encoding="utf-8" fieldsTerminatedBy="," linesTerminatedBy="\n" fieldsEnclosedBy="" ignoreHeaderLines="1">
+    <files>
+      <location>occurrence.csv</location>
+    </files>
+    <id index="3" />
+    <field index="0" term="http://rs.tdwg.org/dwc/terms/eventDate"/>
+    <field index="1" term="http://rs.tdwg.org/dwc/terms/basisOfRecord"/>
+    <field index="2" term="http://rs.tdwg.org/dwc/terms/scientificName"/>
+    <field index="3" term="http://rs.tdwg.org/dwc/terms/catalogNumber"/>
+    <field index="4" term="http://rs.tdwg.org/dwc/terms/decimalLatitude"/>
+    <field index="5" term="http://rs.tdwg.org/dwc/terms/decimalLongitude"/>
+  </core>
+</archive>
diff --git a/tests/input_files/dwca/dwca-sample4/occurrence.csv b/tests/input_files/dwca/dwca-sample4/occurrence.csv
new file mode 100644
index 0000000..bd6de2e
--- /dev/null
+++ b/tests/input_files/dwca/dwca-sample4/occurrence.csv
@@ -0,0 +1,13 @@
+eventDate,basisOfRecord,scientificName,catalogNumber,decimalLatitude,decimalLongitude
+2015-07-18T12:58:00+00:00,Human Observation,Species A,014800,-30.00000,144
+2015-07-18T12:58:31+00:00,Human Observation,Species B,,-31.00000,145
+2015-07-18T18:16:52+00:00,Human Observation,Species C,014824,-32.00000,100.828059
+2015-07-19T04:28:19+00:00,Human Observation,Species D,014823,-33.00000,101.820888
+2015-07-19T18:29:25+00:00,Human Observation,Species A1,014822,-34.00000,102.821654
+2015-07-20T18:03:12+00:00,Human Observation,Species A2,014821,-35.00000,104.999974
+2015-07-21T18:06:58+00:00,Human Observation,Species A3,014802,-34.00000,120.889354
+2015-07-22T04:42:47+00:00,Human Observation,Species B1,014800,-36.00000,150.308848
+2015-07-22T17:54:18+00:00,Human Observation,Species B2,014800,-30.00000,146.240159
+2015-07-22T23:09:51+00:00,Human Observation,Species C1,014799,-31.00000,150.783246
+2015-07-23T17:37:26+00:00,Human Observation,Species D,014798,-40.00000,150.823468
+2015-07-24T13:10:00+00:00,Human Observation,Species E,014823,-28.00000,115
\ No newline at end of file
diff --git a/tests/input_files/dwca/dwca-sample5/meta.xml b/tests/input_files/dwca/dwca-sample5/meta.xml
new file mode 100755
index 0000000..84096f6
--- /dev/null
+++ b/tests/input_files/dwca/dwca-sample5/meta.xml
@@ -0,0 +1,14 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<archive xmlns="http://rs.tdwg.org/dwc/text/">
+  <core rowType="http://rs.tdwg.org/dwc/terms/Occurrence" encoding="utf-8" fieldsTerminatedBy="," linesTerminatedBy="\n" fieldsEnclosedBy="" ignoreHeaderLines="1">
+    <files>
+      <location>occurrence.csv</location>
+    </files>
+    <field index="0" term="http://rs.tdwg.org/dwc/terms/eventDate"/>
+    <field index="1" term="http://rs.tdwg.org/dwc/terms/basisOfRecord"/>
+    <field index="2" term="http://rs.tdwg.org/dwc/terms/scientificName"/>
+    <field index="3" term="http://rs.tdwg.org/dwc/terms/catalogNumber"/>
+    <field index="4" term="http://rs.tdwg.org/dwc/terms/catalogNumber"/>
+    <field index="5" term="http://rs.tdwg.org/dwc/terms/decimalLongitude"/>
+  </core>
+</archive>
diff --git a/tests/input_files/dwca/dwca-sample5/occurrence.csv b/tests/input_files/dwca/dwca-sample5/occurrence.csv
new file mode 100644
index 0000000..422a533
--- /dev/null
+++ b/tests/input_files/dwca/dwca-sample5/occurrence.csv
@@ -0,0 +1,13 @@
+eventDate,basisOfRecord,scientificName,catalogNumber,decimalLatitude,decimalLongitude
+2015-07-18T12:58:00+00:00,Human Observation,Species A,014826,-30,144
+2015-07-18T12:58:31+00:00,Human Observation,Species A1,014825,-31.1111,145
+2015-07-18T18:16:52+00:00,Human Observation,Species A2,014824,-32.085431,100.828059
+2015-07-19T04:28:19+00:00,Human Observation,Species A3,014823,-33.097233,101.820888
+2015-07-19T18:29:25+00:00,Human Observation,Species B1,014822,-34.099936,102.821654
+2015-07-20T18:03:12+00:00,Human Observation,Species B2,014821,-35.893671,104.999974
+2015-07-21T18:06:58+00:00,Human Observation,Species C,014820,-34.113747,120.889354
+2015-07-22T04:42:47+00:00,Human Observation,Species C2,014810,-36,144.308848
+2015-07-22T17:54:18+00:00,Human Observation,Species C3,014800,-30.440251,146.240159
+2015-07-22T23:09:51+00:00,Human Observation,Species D,014799,-31.547195,150.783246
+2015-07-23T17:37:26+00:00,Human Observation,Species D1,-40.481117,150.823468
+2015-07-24T13:10:00+00:00,Human Observation,Species D2,014792,-28,115
\ No newline at end of file

From 885f23e691fc69ed39a947a3c3e6365cbe3d47cb Mon Sep 17 00:00:00 2001
From: Patricia Koh <patricia.koh@csiro.au>
Date: Fri, 24 Jan 2025 11:43:02 +1100
Subject: [PATCH 2/7] 
 https://github.com/AtlasOfLivingAustralia/dwcahandler/issues/17 - Remove
 commented code

---
 src/dwcahandler/dwca/core_dwca.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/src/dwcahandler/dwca/core_dwca.py b/src/dwcahandler/dwca/core_dwca.py
index a8040cd..359aa1d 100644
--- a/src/dwcahandler/dwca/core_dwca.py
+++ b/src/dwcahandler/dwca/core_dwca.py
@@ -896,7 +896,6 @@ def validate_content(self, content_type_to_validate: list[str] = None, error_fil
 
         - No duplicate record keys
         - Valid columns
-        - No duplicate columns
 
         :param error_file: A file to record errors
         :return: True if the DwCA is value, False otherwise
@@ -915,10 +914,6 @@ def validate_content(self, content_type_to_validate: list[str] = None, error_fil
             if not self._validate_columns(content):
                 return False
 
-            #dup_cols = self._find_duplicate_columns(content)
-            #if len(dup_cols) > 0:
-            #    return False
-
         return True
 
     def extract_csv_content(self, csv_info: CsvFileType,

From c6d7ac040f131b8bdcd2fcfa7d94a0868bf02d4f Mon Sep 17 00:00:00 2001
From: Patricia Koh <patricia.koh@csiro.au>
Date: Fri, 24 Jan 2025 13:54:26 +1100
Subject: [PATCH 3/7] 
 https://github.com/AtlasOfLivingAustralia/dwcahandler/issues/17 - Validate
 unit test

---
 tests/test_validate_dwca.py | 76 +++++++++++++++++++++++++++++++++++++
 1 file changed, 76 insertions(+)
 create mode 100644 tests/test_validate_dwca.py

diff --git a/tests/test_validate_dwca.py b/tests/test_validate_dwca.py
new file mode 100644
index 0000000..3f68fba
--- /dev/null
+++ b/tests/test_validate_dwca.py
@@ -0,0 +1,76 @@
+from io import BytesIO
+from zipfile import ZipFile
+import zipfile
+from pathlib import Path
+from dwcahandler import DwcaHandler
+import logging
+import pytest
+
+input_folder = "./input_files/dwca"
+
+
+def make_zip_from_folder_contents(folder: str):
+    """Zip every path found under *folder* into an in-memory buffer, flattened to bare file names."""
+    zip_buffer = BytesIO()
+    with ZipFile(file=zip_buffer, mode="w", compression=zipfile.ZIP_DEFLATED, allowZip64=True) as zf:
+        for path in Path(folder).rglob("*"):
+            zf.write(path, arcname=path.name)
+    return zip_buffer
+
+
+class TestValidateDwca:
+
+    def test_validate_dwca(self):
+        """
+        Validate the comma-delimited dwca-sample1, whose <id index="0"> has no matching <field>
+        """
+        simple_dwca = make_zip_from_folder_contents(f"{input_folder}/dwca-sample1")
+        keys_lookup = {'occurrence': 'occurrenceID'}
+        dwca_result = DwcaHandler.validate_dwca(dwca_file=simple_dwca, keys_lookup=keys_lookup)
+        assert dwca_result
+
+    def test_validate_dwca2(self):
+        """
+        Validate the tab-delimited dwca-sample2, whose <id> shares index 0 with a declared <field>
+        """
+        simple_dwca = make_zip_from_folder_contents(f"{input_folder}/dwca-sample2")
+        keys_lookup = {'occurrence': 'occurrenceID'}
+        dwca_result = DwcaHandler.validate_dwca(dwca_file=simple_dwca, keys_lookup=keys_lookup)
+        assert dwca_result
+
+    def test_empty_keys(self, caplog):
+        """
+        Test for read and extract dwca. Validate core content with empty keys
+        """
+        caplog.set_level(logging.INFO)
+        simple_dwca = make_zip_from_folder_contents(f"{input_folder}/dwca-sample3")
+        keys_lookup = {'occurrence': 'occurrenceID'}
+        dwca_result = DwcaHandler.validate_dwca(dwca_file=simple_dwca, keys_lookup=keys_lookup)
+        assert not dwca_result
+        assert "Empty values found in ['occurrenceID']. Total rows affected: 1" in caplog.messages
+        assert "Empty values found in dataframe row: [0]" in caplog.messages
+
+    def test_duplicate_key(self, caplog):
+        """
+        Test for read and extract dwca. Validate core content with duplicate keys
+        """
+        caplog.set_level(logging.INFO)
+        simple_dwca = make_zip_from_folder_contents(f"{input_folder}/dwca-sample4")
+        keys_lookup = {'occurrence': 'catalogNumber'}
+        dwca_result = DwcaHandler.validate_dwca(dwca_file=simple_dwca, keys_lookup=keys_lookup)
+        assert not dwca_result
+        assert "Duplicate ['catalogNumber'] found. Total rows affected: 3" in caplog.messages
+        assert "Duplicate values: ['014800' '014823']" in caplog.messages
+
+    def test_duplicate_columns_in_dwca(self):
+        """
+        Test for read and extract dwca. Validate duplicate columns specified in metadata of dwca
+        """
+        simple_dwca = make_zip_from_folder_contents(f"{input_folder}/dwca-sample5")
+        keys_lookup = {'occurrence': 'catalogNumber'}
+
+        with pytest.raises(ValueError) as exc_info:
+            DwcaHandler.validate_dwca(dwca_file=simple_dwca, keys_lookup=keys_lookup)
+
+        assert "Duplicate columns ['catalogNumber'] specified in the " \
+                   "metadata for occurrence.csv" in str(exc_info.value)

From e2467882f41668ed5fd248fadaffb3baf1048a3c Mon Sep 17 00:00:00 2001
From: Patricia Koh <patricia.koh@csiro.au>
Date: Tue, 28 Jan 2025 14:51:04 +1100
Subject: [PATCH 4/7] Update macOS version for test

---
 .github/workflows/run-tests.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml
index 96366dc..8759921 100644
--- a/.github/workflows/run-tests.yml
+++ b/.github/workflows/run-tests.yml
@@ -15,7 +15,7 @@ jobs:
     strategy:
       fail-fast: true
       matrix:
-        os: [ubuntu-latest, macos-12, Windows-latest]
+        os: [ubuntu-latest, macos-15, Windows-latest]
         python:
           - "3.9"
           - "3.10"

From 302141f4def1a05fc1330f07acd4932bd806de83 Mon Sep 17 00:00:00 2001
From: Patricia Koh <patricia.koh@csiro.au>
Date: Tue, 28 Jan 2025 18:03:47 +1100
Subject: [PATCH 5/7] 
 https://github.com/AtlasOfLivingAustralia/preingestion/issues/272 - Strip
 column header spaces in csv files. Add test cases to test the core and ext
 dataframe

---
 src/dwcahandler/dwca/core_dwca.py             |  3 +
 .../sample/multimedia_header_with_space.csv   |  3 +
 .../sample/occ_header_with_space.csv          |  4 ++
 tests/test_create_core_and_ext_content.py     | 62 +++++++++++++++++++
 4 files changed, 72 insertions(+)
 create mode 100644 tests/input_files/sample/multimedia_header_with_space.csv
 create mode 100644 tests/input_files/sample/occ_header_with_space.csv

diff --git a/src/dwcahandler/dwca/core_dwca.py b/src/dwcahandler/dwca/core_dwca.py
index 359aa1d..08b6a87 100644
--- a/src/dwcahandler/dwca/core_dwca.py
+++ b/src/dwcahandler/dwca/core_dwca.py
@@ -1065,6 +1065,9 @@ def _read_csv(self,
                 ret_val.dropna(how="all", inplace=True)
                 log.debug("Extracted %d rows from csv %s", len(ret_val), csv_file)
 
+                # Strip column header spaces
+                ret_val.rename(str.strip, axis = 'columns', inplace=True)
+
             return ret_val
 
         except EmptyDataError:
diff --git a/tests/input_files/sample/multimedia_header_with_space.csv b/tests/input_files/sample/multimedia_header_with_space.csv
new file mode 100644
index 0000000..b2655de
--- /dev/null
+++ b/tests/input_files/sample/multimedia_header_with_space.csv
@@ -0,0 +1,3 @@
+ catalogNumber ,identifier, format ,type
+C4,https://images.ala.org.au/image/proxyImageThumbnailLarge?imageId=XXX,image/jpeg,StillImage
+C5,https://images.ala.org.au/image/proxyImageThumbnailLarge?imageId=XYZ,image/jpeg,StillImage
\ No newline at end of file
diff --git a/tests/input_files/sample/occ_header_with_space.csv b/tests/input_files/sample/occ_header_with_space.csv
new file mode 100644
index 0000000..b5ef621
--- /dev/null
+++ b/tests/input_files/sample/occ_header_with_space.csv
@@ -0,0 +1,4 @@
+  catalogNumber ,basisOfRecord,scientificName ,license,decimalLatitude,decimalLongitude
+C3,Human Observation,Ageratina adenophora,CC-BY 4.0 (Int),-10.0000,120.0000
+C4,Human Observation,Ageratina adenophora,CC-BY 4.0 (Int),-11.1111,125.0000
+C5,Human Observation,Delairea odorata,CC-BY 4.0 (Int),-12.085431,130.828059
diff --git a/tests/test_create_core_and_ext_content.py b/tests/test_create_core_and_ext_content.py
index d721cef..8091987 100644
--- a/tests/test_create_core_and_ext_content.py
+++ b/tests/test_create_core_and_ext_content.py
@@ -3,6 +3,7 @@
 from operator import attrgetter
 import pytest
 import pandas as pd
+from pandas import testing as pdtest
 from dwcahandler.dwca import CSVEncoding, CsvFileType, CoreOrExtType, MetaElementTypes
 from dwcahandler.dwca.core_dwca import Dwca
 
@@ -15,6 +16,10 @@
                          "delimiter": "\t"}
 duplicates_csv_occ_test = {"file_paths": single_csv_occ_test["file_paths"] + multiple_csv_occ_test["file_paths"],
                            "delimiter": ","}
+csv_occ_with_space = {"file_paths": ['./input_files/occurrence/occ_file1.csv', './input_files/sample/occ_header_with_space.csv'],
+                       "delimiter": ","}
+multimedia_with_space = {"file_paths": ['./input_files/multimedia/multimedia_file.csv', './input_files/sample/multimedia_header_with_space.csv'],
+                      "delimiter": ","}
 
 
 def get_expected_combined_occ_df(file_paths: list, keys: list, delimiter: str = ","):
@@ -149,3 +154,60 @@ def test_extract_tsv_ext_content(self):
         # Test that the meta content extension if of multimedia type
         assert (dwca_creator.meta_content.meta_elements[1].meta_element_type.type ==
                 MetaElementTypes.get_element('multimedia'))
+
+    def test_extract_csv_with_header_space(self):
+        """
+        Test extract records from csv with header space
+        """
+
+        dwca_creator = Dwca()
+
+        dwca_creator.extract_csv_content(csv_info=CsvFileType(files=csv_occ_with_space['file_paths'],
+                                                              type='occurrence',
+                                                              keys=['catalogNumber'],
+                                                              csv_encoding=CSVEncoding(
+                                                                  csv_delimiter=csv_occ_with_space["delimiter"])),
+                                         core_ext_type=CoreOrExtType.CORE)
+
+        expected_column_list = ["id", "catalogNumber", "basisOfRecord", "scientificName",
+                                "license","decimalLatitude","decimalLongitude"]
+        assert set(dwca_creator.core_content.df_content.columns) == set(expected_column_list)
+        assert len(dwca_creator.core_content.df_content) == 5
+        pdtest.assert_series_equal(dwca_creator.core_content.df_content["catalogNumber"],
+                               pd.Series(["C1", "C2", "C3", "C4", "C5"], dtype=str, name='catalogNumber'),
+                               check_index_type=False, check_index=False)
+
+    def test_extract_csv_ext_with_header_space(self):
+        """
+        Test extract records from multimedia csv with header space
+        """
+
+        dwca_creator = Dwca()
+
+        dwca_creator.extract_csv_content(csv_info=CsvFileType(files=csv_occ_with_space['file_paths'],
+                                                              type='occurrence',
+                                                              keys=['catalogNumber'],
+                                                              csv_encoding=CSVEncoding(
+                                                                  csv_delimiter=csv_occ_with_space["delimiter"])),
+                                         core_ext_type=CoreOrExtType.CORE)
+
+        dwca_creator.extract_csv_content(csv_info=CsvFileType(files=multimedia_with_space['file_paths'],
+                                                              type='multimedia',
+                                                              keys=['catalogNumber'],
+                                                              csv_encoding=CSVEncoding(csv_delimiter=',')),
+                                         core_ext_type=CoreOrExtType.EXTENSION)
+
+        expected_column_list = ["id", "catalogNumber", "basisOfRecord", "scientificName",
+                                "license","decimalLatitude","decimalLongitude"]
+        assert set(dwca_creator.core_content.df_content.columns) == set(expected_column_list)
+        assert len(dwca_creator.core_content.df_content) == 5
+        pdtest.assert_series_equal(dwca_creator.core_content.df_content["catalogNumber"],
+                               pd.Series(["C1", "C2", "C3", "C4", "C5"], dtype=str, name="catalogNumber"),
+                               check_index_type=False, check_index=False)
+
+        expected_column_list = ["coreid", "catalogNumber", "identifier", "format", "type"]
+        assert set(dwca_creator.ext_content[0].df_content.columns) == set(expected_column_list)
+        assert len(dwca_creator.ext_content[0].df_content) == 5
+        pdtest.assert_series_equal(dwca_creator.ext_content[0].df_content["catalogNumber"],
+                               pd.Series(["C1", "C2", "C3", "C4", "C5"], dtype=str, name="catalogNumber"),
+                               check_index_type=False, check_index=False)

From e374b1ddd88ed80b6391715936e1c862b36c9d28 Mon Sep 17 00:00:00 2001
From: Patricia Koh <patricia.koh@csiro.au>
Date: Wed, 29 Jan 2025 10:22:56 +1100
Subject: [PATCH 6/7] increase test version

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index a5dcc6b..314aab1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "dwcahandler"
-version = "0.3.0"
+version = "0.4.0b1"
 description = "Python package to handle Darwin Core Archive (DwCA) operations. This includes creating a DwCA zip file from one or more csvs, reading a DwCA, merge two DwCAs, validate DwCA and delete records from DwCA based on one or more key columns"
 authors = ["Atlas of Living Australia data team <support@ala.org.au>"]
 maintainers = ["Atlas of Living Australia data team <support@ala.org.au>"]

From b6f656c54a230504283d6df375ae7345499c62e3 Mon Sep 17 00:00:00 2001
From: Mahmoud <sadeghim@gmail.com>
Date: Wed, 29 Jan 2025 01:45:52 +0000
Subject: [PATCH 7/7] Added devcontainer

---
 .devcontainer/devcontainer.json | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)
 create mode 100644 .devcontainer/devcontainer.json

diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
new file mode 100644
index 0000000..3817ad5
--- /dev/null
+++ b/.devcontainer/devcontainer.json
@@ -0,0 +1,30 @@
+// For format details, see https://aka.ms/devcontainer.json. For config options, see the
+// README at: https://github.com/devcontainers/templates/tree/main/src/python
+{
+	"name": "Python 3",
+	// Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile
+	"image": "mcr.microsoft.com/devcontainers/python:1-3.9-bookworm",
+	
+	// Features to add to the dev container. More info: https://containers.dev/features.
+	"features": { "ghcr.io/devcontainers-contrib/features/poetry": "latest" },
+
+	// Use 'forwardPorts' to make a list of ports inside the container available locally.
+	// "forwardPorts": [],
+
+	// Use 'postCreateCommand' to run commands after the container is created.
+	"postCreateCommand": "poetry install",
+
+	// Configure tool-specific properties.
+	// "customizations": {},
+
+	// Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root.
+	"remoteUser": "vscode",
+	"customizations": {
+		"vscode": {
+			"extensions": [
+				"ms-python.python",
+				"vector-of-bool.gitflow"
+			]
+		}
+	}
+}