From 3d20ddea48da196c32c09868108d519be4f1a727 Mon Sep 17 00:00:00 2001 From: Patricia Koh Date: Tue, 13 Aug 2024 14:33:42 +1000 Subject: [PATCH] Handle for deleted media content type --- src/dwcahandler/dwca/core_dwca.py | 30 ++++++++++++++++++++---------- tests/test_multimedia_content.py | 10 +++++++--- 2 files changed, 27 insertions(+), 13 deletions(-) diff --git a/src/dwcahandler/dwca/core_dwca.py b/src/dwcahandler/dwca/core_dwca.py index 02f0596..5bfad23 100644 --- a/src/dwcahandler/dwca/core_dwca.py +++ b/src/dwcahandler/dwca/core_dwca.py @@ -653,17 +653,24 @@ def add_multimedia_info_to_content(self, multimedia_content: DfContent): Attempt to populate the format and type from the url provided in the multimedia ext if none is provided :param multimedia_content: Multimedia content type derived from the extension of this Dwca class object """ + def get_media_format_prefix(media_format: str): + media_format_prefixes = ["image", "audio", "video"] + if media_format and isinstance(media_format, str) and '/' in media_format: + prefix = media_format.split('/')[0] + if prefix in media_format_prefixes: + return prefix + + return None def get_media_type(media_format: str): media_type = None - if media_format and '/' in media_format: - m_type = media_format.split('/')[0] - if m_type == 'image': - media_type = 'StillImage' - elif m_type == 'audio': - media_type = 'Sound' - elif m_type == 'video': - media_type = 'MovingImage' + m_type = get_media_format_prefix(media_format) + if m_type == 'image': + media_type = 'StillImage' + elif m_type == 'audio': + media_type = 'Sound' + elif m_type == 'video': + media_type = 'MovingImage' if media_type is None and media_format: log.warning("Unknown media type for format %s", media_format) @@ -672,7 +679,7 @@ def get_media_type(media_format: str): def get_multimedia_format_type(row: dict): url = row['identifier'] mime_type = mimetypes.guess_type(url) - media_format = '' + media_format = None if mime_type and len(mime_type) > 0 and mime_type[0]: media_format = mime_type[0] else: @@ -680,7 +687,10 @@ def get_multimedia_format_type(row: dict): # Just check header without downloading content response = requests.head(url, allow_redirects=True) if 'content-type' in response.headers: - media_format = response.headers['content-type'] + content_type = response.headers['content-type'] + if get_media_format_prefix(content_type): + media_format = content_type + except Exception as error: log.error("Error getting header info from url %s: %s", url, error) diff --git a/tests/test_multimedia_content.py b/tests/test_multimedia_content.py index 49cf2a5..d5b9275 100644 --- a/tests/test_multimedia_content.py +++ b/tests/test_multimedia_content.py @@ -9,6 +9,7 @@ AUDIO_URL = "https://images.ala.org.au/image/proxyImage?imageId=480f5f5e-e96c-4ae3-8230-c53a37bc542e" VIDEO_URL = "https://images.ala.org.au/image/proxyImage?imageId=537799d7-f4d6-490c-a24c-6a94bfd5e857" INVALID_URL = "test" +DELETED_MEDIA_URL = "https://images.ala.org.au/image/proxyImageThumbnailLarge?imageId=nonexistent" image_ext = CsvFileType(files=[pd.DataFrame(data=[["1", IMAGE_URL], ["2", AUDIO_URL], @@ -106,7 +107,8 @@ def test_fill_multimedia_info_with_format_type_partially_supplied(self): ["3", "species3"], ["4", "species4"], ["5", "species5"], - ["6", "species6"]], + ["6", "species6"], + ["7", "species7"]], columns=['occurrenceID', 'scientificName'])], type='occurrence', keys=['occurrenceID']), @@ -117,7 +119,8 @@ def test_fill_multimedia_info_with_format_type_partially_supplied(self): ["3", VIDEO_URL, numpy.nan, "MovingImage"], ["4", INVALID_URL, numpy.nan, numpy.nan], ["5", INVALID_URL, 'invalidformat', numpy.nan], - ["6", INVALID_URL, 'image/jpeg', numpy.nan]] + ["6", INVALID_URL, 'image/jpeg', numpy.nan], + ["7", DELETED_MEDIA_URL, numpy.nan, numpy.nan]] # Extract multimedia ext without format dwca.extract_csv_content(csv_info=CsvFileType(files=[pd.DataFrame(data=image_data, @@ -135,7 +138,8 @@ def test_fill_multimedia_info_with_format_type_partially_supplied(self): ["3", VIDEO_URL, "video/quicktime", "MovingImage"], ["4", INVALID_URL, numpy.nan, numpy.nan], ["5", INVALID_URL, 'invalidformat', numpy.nan], - ["6", INVALID_URL, 'image/jpeg', 'StillImage']] + ["6", INVALID_URL, 'image/jpeg', 'StillImage'], + ["7", DELETED_MEDIA_URL, numpy.nan, numpy.nan]] expected_multimedia_df = pd.DataFrame(data=expected_image_data, columns=['occurrenceID', 'identifier', 'format', 'type'])