From 3d20ddea48da196c32c09868108d519be4f1a727 Mon Sep 17 00:00:00 2001
From: Patricia Koh <patricia.koh@csiro.au>
Date: Tue, 13 Aug 2024 14:33:42 +1000
Subject: [PATCH] Handle non-media content type returned for deleted media

---
 src/dwcahandler/dwca/core_dwca.py | 30 ++++++++++++++++++++----------
 tests/test_multimedia_content.py  | 10 +++++++---
 2 files changed, 27 insertions(+), 13 deletions(-)

diff --git a/src/dwcahandler/dwca/core_dwca.py b/src/dwcahandler/dwca/core_dwca.py
index 02f0596..5bfad23 100644
--- a/src/dwcahandler/dwca/core_dwca.py
+++ b/src/dwcahandler/dwca/core_dwca.py
@@ -653,17 +653,24 @@ def add_multimedia_info_to_content(self, multimedia_content: DfContent):
         Attempt to populate the format and type from the url provided in the multimedia ext if none is provided
         :param multimedia_content: Multimedia content type derived from the extension of this Dwca class object
         """
+        def get_media_format_prefix(media_format: str):
+            """Return the lowercased top-level MIME type if image/audio/video, else None."""
+            if isinstance(media_format, str) and '/' in media_format:
+                prefix = media_format.split('/')[0].strip().lower()  # MIME types are case-insensitive
+                if prefix in ("image", "audio", "video"):
+                    return prefix
+
+            return None
 
         def get_media_type(media_format: str):
             media_type = None
-            if media_format and '/' in media_format:
-                m_type = media_format.split('/')[0]
-                if m_type == 'image':
-                    media_type = 'StillImage'
-                elif m_type == 'audio':
-                    media_type = 'Sound'
-                elif m_type == 'video':
-                    media_type = 'MovingImage'
+            m_type = get_media_format_prefix(media_format)
+            if m_type == 'image':
+                media_type = 'StillImage'
+            elif m_type == 'audio':
+                media_type = 'Sound'
+            elif m_type == 'video':
+                media_type = 'MovingImage'
             if media_type is None and media_format:
                 log.warning("Unknown media type for format %s", media_format)
 
@@ -672,7 +679,7 @@ def get_media_type(media_format: str):
         def get_multimedia_format_type(row: dict):
             url = row['identifier']
             mime_type = mimetypes.guess_type(url)
-            media_format = ''
+            media_format = None
             if mime_type and len(mime_type) > 0 and mime_type[0]:
                 media_format = mime_type[0]
             else:
@@ -680,7 +687,10 @@ def get_multimedia_format_type(row: dict):
                     # Just check header without downloading content
                     response = requests.head(url, allow_redirects=True)
                     if 'content-type' in response.headers:
-                        media_format = response.headers['content-type']
+                        content_type = response.headers['content-type']
+                        if get_media_format_prefix(content_type):
+                            media_format = content_type
+
                 except Exception as error:
                     log.error("Error getting header info from url %s: %s", url, error)
 
diff --git a/tests/test_multimedia_content.py b/tests/test_multimedia_content.py
index 49cf2a5..d5b9275 100644
--- a/tests/test_multimedia_content.py
+++ b/tests/test_multimedia_content.py
@@ -9,6 +9,7 @@
 AUDIO_URL = "https://images.ala.org.au/image/proxyImage?imageId=480f5f5e-e96c-4ae3-8230-c53a37bc542e"
 VIDEO_URL = "https://images.ala.org.au/image/proxyImage?imageId=537799d7-f4d6-490c-a24c-6a94bfd5e857"
 INVALID_URL = "test"
+DELETED_MEDIA_URL = "https://images.ala.org.au/image/proxyImageThumbnailLarge?imageId=nonexistent"
 
 image_ext = CsvFileType(files=[pd.DataFrame(data=[["1", IMAGE_URL],
                                                   ["2", AUDIO_URL],
@@ -106,7 +107,8 @@ def test_fill_multimedia_info_with_format_type_partially_supplied(self):
                                                                                 ["3", "species3"],
                                                                                 ["4", "species4"],
                                                                                 ["5", "species5"],
-                                                                                ["6", "species6"]],
+                                                                                ["6", "species6"],
+                                                                                ["7", "species7"]],
                                                                           columns=['occurrenceID', 'scientificName'])],
                                                       type='occurrence',
                                                       keys=['occurrenceID']),
@@ -117,7 +119,8 @@ def test_fill_multimedia_info_with_format_type_partially_supplied(self):
                       ["3", VIDEO_URL, numpy.nan, "MovingImage"],
                       ["4", INVALID_URL, numpy.nan, numpy.nan],
                       ["5", INVALID_URL, 'invalidformat', numpy.nan],
-                      ["6", INVALID_URL, 'image/jpeg', numpy.nan]]
+                      ["6", INVALID_URL, 'image/jpeg', numpy.nan],
+                      ["7", DELETED_MEDIA_URL, numpy.nan, numpy.nan]]
 
         # Extract multimedia ext without format
         dwca.extract_csv_content(csv_info=CsvFileType(files=[pd.DataFrame(data=image_data,
@@ -135,7 +138,8 @@ def test_fill_multimedia_info_with_format_type_partially_supplied(self):
                                ["3", VIDEO_URL, "video/quicktime", "MovingImage"],
                                ["4", INVALID_URL, numpy.nan, numpy.nan],
                                ["5", INVALID_URL, 'invalidformat', numpy.nan],
-                               ["6", INVALID_URL, 'image/jpeg', 'StillImage']]
+                               ["6", INVALID_URL, 'image/jpeg', 'StillImage'],
+                               ["7", DELETED_MEDIA_URL, numpy.nan, numpy.nan]]
 
         expected_multimedia_df = pd.DataFrame(data=expected_image_data,
                                               columns=['occurrenceID', 'identifier', 'format', 'type'])