Remove checking multimedia content type (#12)

* #10 - Remove getting the content type from the url header, as this causes a performance issue * Fix warnings * Fix error in test * Mock the mimetype test when run in a GitHub Action, because the Python image for GitHub Actions does not support mimetype * Resolve deprecation warning for Python 3.12 * Minor fix * Update readme * Update build version * Update build version to v0.2.0
AtlasOfLivingAustralia · Aug 28, 2024 · ad453cc · ad453cc
1 parent bd01465
commit ad453cc
Show file tree

Hide file tree

Showing 10 changed files with 398 additions and 214 deletions.
diff --git a/.github/workflows/publish-release.yml b/.github/workflows/publish-release.yml
@@ -35,7 +35,7 @@ jobs:
         run: |
           echo ${{ github.workspace }}
           cd ${{ github.workspace }}/tests
-          poetry run pytest
+          poetry run pytest --github-action-run=True
       - name: Build
         id: build-step
         run: |

diff --git a/.github/workflows/publish-test.yml b/.github/workflows/publish-test.yml
@@ -33,7 +33,7 @@ jobs:
         run: |
           echo ${{ github.workspace }}
           cd ${{ github.workspace }}/tests
-          poetry run pytest
+          poetry run pytest --github-action-run=True
       - name: Build
         id: build-step
         run: |

diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml
@@ -48,5 +48,5 @@ jobs:
       run: |
         echo ${{ github.workspace }}
         cd ${{ github.workspace }}/tests
-        poetry run pytest
+        poetry run pytest --cov=dwcahandler --github-action-run=True
 
diff --git a/README.md b/README.md
@@ -61,15 +61,15 @@ pip install -i https://test.pypi.org/simple/ dwcahandler
 ### Examples of dwcahandler usages:
 
 * Create Darwin Core Archive from csv file
-* In creating a dwca with multimedia extension, provide format and type values in the Simple Multimedia extension, otherwise, dwcahandler will attempt to fill these info by guessing the mimetype from url or extracting content type of the url which will slow down the creation of dwca depending on how large the dataset is.
+* When creating a dwca with a multimedia extension, provide format and type values in the Simple Multimedia extension; otherwise, dwcahandler will attempt to fill in this info by guessing the mimetype from the url.
 
 ```python
 from dwcahandler import CsvFileType
 from dwcahandler import DwcaHandler
 from dwcahandler import Eml
 
 core_csv = CsvFileType(files=['/tmp/occurrence.csv'], type='occurrence', keys=['occurrenceID'])
-ext_csvs = [CsvFileType(files=['/tmp/multimedia.csv'], type='multimedia')]
+ext_csvs = [CsvFileType(files=['/tmp/multimedia.csv'], type='multimedia', keys=['occurrenceID'])]
 
 eml = Eml(dataset_name='Test Dataset',
           description='Dataset description',
@@ -81,6 +81,7 @@ DwcaHandler.create_dwca(core_csv=core_csv, ext_csv_list=ext_csvs, eml_content=em
 ```
 &nbsp;
 * Create Darwin Core Archive from pandas dataframe
+* When creating a dwca with a multimedia extension, provide format and type values in the Simple Multimedia extension; otherwise, dwcahandler will attempt to fill in this info by guessing the mimetype from the url.
 
 ```python
 from dwcahandler import DwcaHandler
@@ -92,7 +93,7 @@ core_df = pd.read_csv("/tmp/occurrence.csv")
 core_frame = DataFrameType(df=core_df, type='occurrence', keys=['occurrenceID'])
 
 ext_df = pd.read_csv("/tmp/multimedia.csv")
-ext_frame = [DataFrameType(df=ext_df, type='multimedia')]
+ext_frame = [DataFrameType(df=ext_df, type='multimedia', keys=['occurrenceID'])]
 
 eml = Eml(dataset_name='Test Dataset',
           description='Dataset description',

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "dwcahandler"
-version = "0.2.0.b2"
+version = "0.2.0"
 description = "Python package to handle Darwin Core Archive (DwCA) operations. This includes creating a DwCA zip file from one or more csvs, reading a DwCA, merge two DwCAs, validate DwCA and delete records from DwCA based on one or more key columns"
 authors = ["Atlas of Living Australia data team <[email protected]>"]
 maintainers = ["Atlas of Living Australia data team <[email protected]>"]
@@ -14,6 +14,7 @@ pandas = "^2.2.0"
 requests = "^2.32.0"
 pytest = "^8.2.0"
 pytest-mock = "^3.12.0"
+pytest-cov = "^5.0.0"
 metapype = "^0.0.26"
 flake8 = "^7.1.1"
 
@@ -25,4 +26,4 @@ requires = ["poetry-core"]
 build-backend = "poetry.core.masonry.api"
 
 [tool.pytest.ini_options]
-pythonpath = "src"
+pythonpath = "src"
diff --git a/src/dwcahandler/dwca/core_dwca.py b/src/dwcahandler/dwca/core_dwca.py
@@ -18,7 +18,6 @@
 from zipfile import ZipFile
 
 import pandas as pd
-import requests
 from numpy import nan
 from pandas.errors import EmptyDataError
 from pandas.io import parsers
@@ -651,7 +650,7 @@ def get_content(self, name_space):
     def add_multimedia_info_to_content(self, multimedia_content: DfContent):
         """
         Attempt to populate the format and type from the url provided in the multimedia ext if none is provided
-        :param multimedia_content: Multimedia content type derived from the extension of this Dwca class object
+        :param multimedia_content: Multimedia content derived from the extension of this Dwca class object
         """
         def get_media_format_prefix(media_format: str):
             media_format_prefixes = ["image", "audio", "video"]
@@ -678,59 +677,50 @@ def get_media_type(media_format: str):
 
         def get_multimedia_format_type(row: dict):
             url = row['identifier']
-            mime_type = mimetypes.guess_type(url)
             media_format = None
-            if mime_type and len(mime_type) > 0 and mime_type[0]:
-                media_format = mime_type[0]
-            else:
+            if url:
                 try:
-                    # Just check header without downloading content
-                    response = requests.head(url, allow_redirects=True)
-                    if 'content-type' in response.headers:
-                        content_type = response.headers['content-type']
-                        if get_media_format_prefix(content_type):
-                            media_format = content_type
-
+                    mime_type = mimetypes.guess_type(url)
+                    if mime_type and len(mime_type) > 0 and mime_type[0]:
+                        media_format = mime_type[0]
                 except Exception as error:
-                    log.error("Error getting header info from url %s: %s", url, error)
+                    log.error("Error getting mimetype from url %s: %s", url, error)
 
             media_type = ''
-            if 'type' not in row or not row['type']:
+            if 'type' not in row or not row['type'] or row['type'] is nan:
                 media_type = get_media_type(media_format)
             else:
                 media_type = row['type']
 
-            row['format'] = media_format if media_format else nan
-            row['type'] = media_type if media_type else nan
+            row['format'] = media_format if media_format else None
+            row['type'] = media_type if media_type else None
             return row
 
-        def populate_format_type(row: dict):
-            return get_multimedia_format_type(row)
+        if len(multimedia_content.df_content) > 0:
 
-        multimedia_df = multimedia_content.df_content
+            multimedia_df = multimedia_content.df_content
 
-        if 'format' in multimedia_df.columns:
-            multimedia_without_format = multimedia_df[multimedia_df['format'].isnull()]
-            if len(multimedia_without_format) > 0:
-                multimedia_without_format = multimedia_without_format.apply(
-                                                                lambda row: populate_format_type(row),
-                                                                axis=1)
-                multimedia_df.update(multimedia_without_format)
-        else:
-            multimedia_df = multimedia_df.apply(
-                lambda row: populate_format_type(row), axis=1)
+            if 'format' in multimedia_df.columns:
+                multimedia_without_format = multimedia_df[multimedia_df['format'].isnull()]
+                if len(multimedia_without_format) > 0:
+                    multimedia_without_format = multimedia_without_format.apply(
+                                                                    lambda row: get_multimedia_format_type(row),
+                                                                    axis=1)
+                    multimedia_df.update(multimedia_without_format)
+            else:
+                multimedia_df = multimedia_df.apply(lambda row: get_multimedia_format_type(row), axis=1)
 
-        multimedia_without_type = multimedia_df
-        # In case if the type was not populated from format
-        if 'type' in multimedia_df.columns:
-            multimedia_without_type = multimedia_df[multimedia_df['type'].isnull()]
-            multimedia_without_type = multimedia_without_type[multimedia_without_type['format'].notnull()]
+            multimedia_without_type = multimedia_df
+            # In case if the type was not populated from format
+            if 'type' in multimedia_df.columns:
+                multimedia_without_type = multimedia_df[multimedia_df['type'].isnull()]
+                multimedia_without_type = multimedia_without_type[multimedia_without_type['format'].notnull()]
 
-        if len(multimedia_without_type) > 0:
-            multimedia_without_type.loc[:, 'type'] = multimedia_without_type['format'].map(lambda x: get_media_type(x))
-            multimedia_df.update(multimedia_without_type)
+            if len(multimedia_without_type) > 0:
+                multimedia_without_type.loc[:, 'type'] = multimedia_without_type['format'].map(lambda x: get_media_type(x))
+                multimedia_df.update(multimedia_without_type)
 
-        multimedia_content.df_content = multimedia_df
+            multimedia_content.df_content = multimedia_df
 
     def _extract_media(self, content, assoc_media_col: str):
         """Extract embedded associated media and place it in a media extension data frame

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -0,0 +1,5 @@
+
+def pytest_addoption(parser):
+    parser.addoption(
+        "--github-action-run", action="store", default=False, help="Set this to True if it's been called from github action"
+    )