Skip to content

Commit

Permalink
Handle for deleted media content type
Browse files Browse the repository at this point in the history
  • Loading branch information
patkyn committed Aug 13, 2024
1 parent 784de57 commit 3d20dde
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 13 deletions.
30 changes: 20 additions & 10 deletions src/dwcahandler/dwca/core_dwca.py
Original file line number Diff line number Diff line change
Expand Up @@ -653,17 +653,24 @@ def add_multimedia_info_to_content(self, multimedia_content: DfContent):
Attempt to populate the format and type from the url provided in the multimedia ext if none is provided
:param multimedia_content: Multimedia content type derived from the extension of this Dwca class object
"""
def get_media_format_prefix(media_format: str):
    """
    Return the top-level media type of a MIME string if it is image, audio or video.

    The prefix is compared case-insensitively and with surrounding whitespace removed,
    because MIME type names are case-insensitive and header values may be padded.

    :param media_format: MIME type or Content-Type header value, e.g. "image/jpeg"
    :return: "image", "audio" or "video" (lower case), or None for anything else
    """
    media_format_prefixes = ("image", "audio", "video")

    # Non-string values (None, NaN) and strings without a subtype separator are not media formats
    if not isinstance(media_format, str) or '/' not in media_format:
        return None

    # Only the part before the first '/' matters; parameters such as "; charset=..." come after it
    prefix = media_format.split('/', 1)[0].strip().lower()
    if prefix in media_format_prefixes:
        return prefix

    return None

def get_media_type(media_format: str):
media_type = None
if media_format and '/' in media_format:
m_type = media_format.split('/')[0]
if m_type == 'image':
media_type = 'StillImage'
elif m_type == 'audio':
media_type = 'Sound'
elif m_type == 'video':
media_type = 'MovingImage'
m_type = get_media_format_prefix(media_format)
if m_type == 'image':
media_type = 'StillImage'
elif m_type == 'audio':
media_type = 'Sound'
elif m_type == 'video':
media_type = 'MovingImage'
if media_type is None and media_format:
log.warning("Unknown media type for format %s", media_format)

Expand All @@ -672,15 +679,18 @@ def get_media_type(media_format: str):
def get_multimedia_format_type(row: dict):
url = row['identifier']
mime_type = mimetypes.guess_type(url)
media_format = ''
media_format = None
if mime_type and len(mime_type) > 0 and mime_type[0]:
media_format = mime_type[0]
else:
try:
# Just check header without downloading content
response = requests.head(url, allow_redirects=True)
if 'content-type' in response.headers:
media_format = response.headers['content-type']
content_type = response.headers['content-type']
if get_media_format_prefix(content_type):
media_format = content_type

except Exception as error:
log.error("Error getting header info from url %s: %s", url, error)

Expand Down
10 changes: 7 additions & 3 deletions tests/test_multimedia_content.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
AUDIO_URL = "https://images.ala.org.au/image/proxyImage?imageId=480f5f5e-e96c-4ae3-8230-c53a37bc542e"
VIDEO_URL = "https://images.ala.org.au/image/proxyImage?imageId=537799d7-f4d6-490c-a24c-6a94bfd5e857"
INVALID_URL = "test"
DELETED_MEDIA_URL = "https://images.ala.org.au/image/proxyImageThumbnailLarge?imageId=nonexistent"

image_ext = CsvFileType(files=[pd.DataFrame(data=[["1", IMAGE_URL],
["2", AUDIO_URL],
Expand Down Expand Up @@ -106,7 +107,8 @@ def test_fill_multimedia_info_with_format_type_partially_supplied(self):
["3", "species3"],
["4", "species4"],
["5", "species5"],
["6", "species6"]],
["6", "species6"],
["7", "species7"]],
columns=['occurrenceID', 'scientificName'])],
type='occurrence',
keys=['occurrenceID']),
Expand All @@ -117,7 +119,8 @@ def test_fill_multimedia_info_with_format_type_partially_supplied(self):
["3", VIDEO_URL, numpy.nan, "MovingImage"],
["4", INVALID_URL, numpy.nan, numpy.nan],
["5", INVALID_URL, 'invalidformat', numpy.nan],
["6", INVALID_URL, 'image/jpeg', numpy.nan]]
["6", INVALID_URL, 'image/jpeg', numpy.nan],
["7", DELETED_MEDIA_URL, numpy.nan, numpy.nan]]

# Extract multimedia ext without format
dwca.extract_csv_content(csv_info=CsvFileType(files=[pd.DataFrame(data=image_data,
Expand All @@ -135,7 +138,8 @@ def test_fill_multimedia_info_with_format_type_partially_supplied(self):
["3", VIDEO_URL, "video/quicktime", "MovingImage"],
["4", INVALID_URL, numpy.nan, numpy.nan],
["5", INVALID_URL, 'invalidformat', numpy.nan],
["6", INVALID_URL, 'image/jpeg', 'StillImage']]
["6", INVALID_URL, 'image/jpeg', 'StillImage'],
["7", DELETED_MEDIA_URL, numpy.nan, numpy.nan]]

expected_multimedia_df = pd.DataFrame(data=expected_image_data,
columns=['occurrenceID', 'identifier', 'format', 'type'])
Expand Down

0 comments on commit 3d20dde

Please sign in to comment.