Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 13 additions & 2 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ pydantic-core = ">=2.18.2"
pydash = ">=7.0.0,<8.0.0"
python-dateutil = ">=2.8.0,<3.0.0"
python-dotenv = ">=1.0.0,<2.0.0"
python-magic = "==0.4.27"
pytz = ">=2022.0,<2026.0"
pyyaml = ">=6.0.0,<7.0.0"
requests = ">=2.31.0,<3.0.0"
Expand Down
33 changes: 31 additions & 2 deletions src/vellum/utils/files/extensions.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,32 @@
"""File extension inference utilities."""

import logging
import mimetypes
import os
from typing import Optional
from typing import IO, Optional, Union

try:
import magic
except ImportError:
magic = None # type: ignore

from vellum.utils.files.constants import EXTENSION_OVERRIDES, MIME_TYPE_TO_EXTENSION

logger = logging.getLogger(__name__)


def ensure_filename_with_extension(filename: Optional[str], mime_type: str) -> str:
def ensure_filename_with_extension(
filename: Optional[str], mime_type: str, contents: Optional[Union[bytes, IO[bytes]]] = None
) -> str:
"""
Ensure the filename has an appropriate extension, inferring one based on the provided MIME type if necessary.

Args:
filename: Optional filename provided by the user
mime_type: The MIME type of the file (e.g., "application/pdf", "image/png"). This'll be used to infer the
extension if the filename lacks one.
contents: Optional file contents (bytes or file-like object) to use for MIME type detection via python-magic
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[q] echoing Sidd's question, especially this will require Dockerfile changes. Do we need to use the inferred mime type on the sdk side before we call the upload api?

when the provided MIME type is generic or missing.

Returns:
A filename with an appropriate extension
Expand Down Expand Up @@ -46,6 +58,23 @@ def ensure_filename_with_extension(filename: Optional[str], mime_type: str) -> s
if filename and has_extension:
return filename

if mime_type == "application/octet-stream" and contents and magic:
try:
if isinstance(contents, bytes):
sample = contents[:2048]
else:
original_position = contents.tell() if hasattr(contents, "tell") else None
sample = contents.read(2048)
if hasattr(contents, "seek") and original_position is not None:
contents.seek(original_position)

detected_mime_type = magic.from_buffer(sample, mime=True)
if detected_mime_type:
# Strip any charset parameters from detected MIME type
mime_type = detected_mime_type.split(";")[0].strip()
except Exception:
logger.exception("Failed to guess content type using python-magic")

# Otherwise, infer extension from MIME type
extension = mimetypes.guess_extension(mime_type)

Expand Down
94 changes: 94 additions & 0 deletions src/vellum/utils/files/tests/test_extensions.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
"""Tests for file extension inference utilities."""

import pytest
from io import BytesIO
from unittest.mock import patch
Comment on lines 1 to +5

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Place new extension tests under collected test paths

Pytest is configured via pyproject.toml with testpaths = ["tests"], and the default make test just runs pytest, so modules outside that directory are skipped. This new test module lives under src/vellum/utils/files/tests, meaning none of the python-magic coverage will execute in CI. Consider moving it under tests/ or broadening testpaths so these tests actually run.

Useful? React with 👍 / 👎.


from vellum.utils.files.extensions import ensure_filename_with_extension

Expand Down Expand Up @@ -52,3 +54,95 @@ def test_ensure_filename_with_extension(filename, mime_type, expected):
"""Test filename extension inference for various filename and MIME type combinations."""
result = ensure_filename_with_extension(filename, mime_type)
assert result == expected


@patch("vellum.utils.files.extensions.magic")
def test_ensure_filename_with_extension__with_pdf_bytes_and_octet_stream_mime(mock_magic):
    """
    PDF bytes paired with a generic octet-stream MIME type should resolve to a ".pdf" filename.
    """
    # GIVEN python-magic (patched) reports the sampled buffer as a PDF
    mock_magic.from_buffer.return_value = "application/pdf"

    # AND raw bytes beginning with a PDF header
    payload = b"%PDF-1.4\n%\xe2\xe3\xcf\xd3\n"

    # WHEN no filename is given and the declared MIME type is application/octet-stream
    resolved = ensure_filename_with_extension(None, "application/octet-stream", payload)

    # THEN the filename should have a .pdf extension
    assert resolved == "file.pdf"


@patch("vellum.utils.files.extensions.magic")
def test_ensure_filename_with_extension__with_text_bytes_and_octet_stream_mime(mock_magic):
    """
    Plain-text bytes paired with a generic octet-stream MIME type should resolve to a ".txt" filename.
    """
    # GIVEN python-magic (patched) reports the sampled buffer as text/plain
    mock_magic.from_buffer.return_value = "text/plain"

    # AND a short plain-text payload
    payload = b"Hello, this is a plain text file.\n"

    # WHEN no filename is given and the declared MIME type is application/octet-stream
    resolved = ensure_filename_with_extension(None, "application/octet-stream", payload)

    # THEN the filename should have a .txt extension
    assert resolved == "file.txt"


@patch("vellum.utils.files.extensions.magic")
def test_ensure_filename_with_extension__with_bytesio_and_octet_stream_mime(mock_magic):
    """
    File-like contents (BytesIO) are accepted, and the stream position is unchanged afterwards.
    """
    # GIVEN python-magic (patched) reports the sampled buffer as a PDF
    mock_magic.from_buffer.return_value = "application/pdf"

    # AND a BytesIO stream wrapping PDF bytes, positioned at the start
    stream = BytesIO(b"%PDF-1.4\n%\xe2\xe3\xcf\xd3\n")
    assert stream.tell() == 0

    # WHEN an extension-less filename is resolved against the stream
    resolved = ensure_filename_with_extension("document", "application/octet-stream", stream)

    # THEN the filename should have a .pdf extension
    assert resolved == "document.pdf"

    # AND the stream is back at its original position
    assert stream.tell() == 0


def test_ensure_filename_with_extension__with_existing_extension_ignores_contents():
    """
    A filename that already carries an extension is returned as-is, even if the contents look like another type.
    """
    # GIVEN a filename with .txt extension
    original_name = "document.txt"

    # AND contents that begin with a PDF header
    payload = b"%PDF-1.4\n%\xe2\xe3\xcf\xd3\n"

    # WHEN the filename is resolved with a generic MIME type and those contents
    resolved = ensure_filename_with_extension(original_name, "application/octet-stream", payload)

    # THEN the original filename is preserved
    assert resolved == "document.txt"


def test_ensure_filename_with_extension__with_non_octet_stream_mime_ignores_contents():
    """
    A specific (non octet-stream) MIME type determines the extension; the contents are not consulted.
    """
    # GIVEN a specific MIME type
    declared_mime = "image/jpeg"

    # AND contents that begin with a PDF header
    payload = b"%PDF-1.4\n%\xe2\xe3\xcf\xd3\n"

    # WHEN no filename is given
    resolved = ensure_filename_with_extension(None, declared_mime, payload)

    # THEN the extension follows the declared MIME type rather than the contents
    assert resolved == "file.jpg"


@patch("vellum.utils.files.extensions.magic")
def test_ensure_filename_with_extension__with_charset_in_detected_mime(mock_magic):
    """
    A detected MIME type that carries a charset parameter still maps to the right extension.
    """
    # GIVEN python-magic (patched) returns a MIME type with a charset parameter
    mock_magic.from_buffer.return_value = "text/html; charset=utf-8"

    # AND an HTML payload
    payload = b"<!DOCTYPE html><html><body>Hello</body></html>"

    # WHEN no filename is given and the declared MIME type is application/octet-stream
    resolved = ensure_filename_with_extension(None, "application/octet-stream", payload)

    # THEN the filename should have an .html extension (charset should be stripped)
    assert resolved == "file.html"
4 changes: 2 additions & 2 deletions src/vellum/utils/files/upload.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ def upload_vellum_file(
decoded = base64.b64decode(base64_content)

# Ensure filename has appropriate extension
resolved_filename = ensure_filename_with_extension(filename, mime_type)
resolved_filename = ensure_filename_with_extension(filename, mime_type, decoded)
file_content: File = (resolved_filename, BytesIO(decoded), mime_type)

try:
Expand Down Expand Up @@ -130,7 +130,7 @@ def upload_vellum_file(
content_type = response.headers.get("content-type", "application/octet-stream")

# Ensure filename has appropriate extension
resolved_filename = ensure_filename_with_extension(filename, content_type)
resolved_filename = ensure_filename_with_extension(filename, content_type, content)
file_content = (resolved_filename, BytesIO(content), content_type)

try:
Expand Down