diff --git a/poetry.lock b/poetry.lock index 7c02aeb0a9..0b4deb6a9b 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.5 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. [[package]] name = "annotated-types" @@ -1508,6 +1508,17 @@ files = [ [package.extras] cli = ["click (>=5.0)"] +[[package]] +name = "python-magic" +version = "0.4.27" +description = "File type identification using libmagic" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +files = [ + {file = "python-magic-0.4.27.tar.gz", hash = "sha256:c1ba14b08e4a5f5c31a302b7721239695b2f0f058d125bd5ce1ee36b9d9d3c3b"}, + {file = "python_magic-0.4.27-py2.py3-none-any.whl", hash = "sha256:c212960ad306f700aa0d01e5d7a325d20548ff97eb9920dcd29513174f0294d3"}, +] + [[package]] name = "pytz" version = "2025.2" @@ -1939,4 +1950,4 @@ files = [ [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "1b8454cccabd078901e3f0c18b5346e23207195c7b9082617d57e862703002fe" +content-hash = "4956474fab0c68f081b170c162156d89f85218918c802781fcc6c2130cf4db17" diff --git a/pyproject.toml b/pyproject.toml index a5a4820e4a..993b63cf44 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,6 +47,7 @@ pydantic-core = ">=2.18.2" pydash = ">=7.0.0,<8.0.0" python-dateutil = ">=2.8.0,<3.0.0" python-dotenv = ">=1.0.0,<2.0.0" +python-magic = "==0.4.27" pytz = ">=2022.0,<2026.0" pyyaml = ">=6.0.0,<7.0.0" requests = ">=2.31.0,<3.0.0" diff --git a/src/vellum/utils/files/extensions.py b/src/vellum/utils/files/extensions.py index de81acc55f..e152355e9b 100644 --- a/src/vellum/utils/files/extensions.py +++ b/src/vellum/utils/files/extensions.py @@ -1,13 +1,23 @@ """File extension inference utilities.""" +import logging import mimetypes import os -from typing import Optional +from typing import IO, Optional, Union + +try: + import magic +except ImportError: + 
magic = None # type: ignore from vellum.utils.files.constants import EXTENSION_OVERRIDES, MIME_TYPE_TO_EXTENSION +logger = logging.getLogger(__name__) + -def ensure_filename_with_extension(filename: Optional[str], mime_type: str) -> str: +def ensure_filename_with_extension( + filename: Optional[str], mime_type: str, contents: Optional[Union[bytes, IO[bytes]]] = None +) -> str: """ Ensure the filename has an appropriate extension, infering one based on the provided MIME type if necessary. @@ -15,6 +25,8 @@ def ensure_filename_with_extension(filename: Optional[str], mime_type: str) -> s filename: Optional filename provided by the user mime_type: The MIME type of the file (e.g., "application/pdf", "image/png"). This'll be used to infer the extension if the filename lacks one. + contents: Optional file contents (bytes or file-like object) to use for MIME type detection via python-magic + when the provided MIME type is exactly "application/octet-stream". Returns: A filename with an appropriate extension @@ -46,6 +58,23 @@ def ensure_filename_with_extension(filename: Optional[str], mime_type: str) -> s if filename and has_extension: return filename + if mime_type == "application/octet-stream" and contents and magic: + try: + if isinstance(contents, bytes): + sample = contents[:2048] + else: + original_position = contents.tell() if hasattr(contents, "tell") else None + sample = contents.read(2048) + if hasattr(contents, "seek") and original_position is not None: + contents.seek(original_position) + + detected_mime_type = magic.from_buffer(sample, mime=True) + if detected_mime_type: + # Strip any charset parameters from detected MIME type + mime_type = detected_mime_type.split(";")[0].strip() + except Exception: + logger.exception("Failed to guess content type using python-magic") + # Otherwise, infer extension from MIME type extension = mimetypes.guess_extension(mime_type) diff --git a/src/vellum/utils/files/tests/test_extensions.py b/src/vellum/utils/files/tests/test_extensions.py index
a4b1844f8d..80c761ebb4 100644 --- a/src/vellum/utils/files/tests/test_extensions.py +++ b/src/vellum/utils/files/tests/test_extensions.py @@ -1,6 +1,8 @@ """Tests for file extension inference utilities.""" import pytest +from io import BytesIO +from unittest.mock import patch from vellum.utils.files.extensions import ensure_filename_with_extension @@ -52,3 +54,95 @@ def test_ensure_filename_with_extension(filename, mime_type, expected): """Test filename extension inference for various filename and MIME type combinations.""" result = ensure_filename_with_extension(filename, mime_type) assert result == expected + + +@patch("vellum.utils.files.extensions.magic") +def test_ensure_filename_with_extension__with_pdf_bytes_and_octet_stream_mime(mock_magic): + """ + Test that python-magic detects PDF from bytes when MIME type is application/octet-stream. + """ + mock_magic.from_buffer.return_value = "application/pdf" + + pdf_bytes = b"%PDF-1.4\n%\xe2\xe3\xcf\xd3\n" + + result = ensure_filename_with_extension(None, "application/octet-stream", pdf_bytes) + + # THEN the filename should have a .pdf extension + assert result == "file.pdf" + + +@patch("vellum.utils.files.extensions.magic") +def test_ensure_filename_with_extension__with_text_bytes_and_octet_stream_mime(mock_magic): + """ + Test that python-magic detects text/plain from bytes when MIME type is application/octet-stream. + """ + mock_magic.from_buffer.return_value = "text/plain" + + text_bytes = b"Hello, this is a plain text file.\n" + + result = ensure_filename_with_extension(None, "application/octet-stream", text_bytes) + + # THEN the filename should have a .txt extension + assert result == "file.txt" + + +@patch("vellum.utils.files.extensions.magic") +def test_ensure_filename_with_extension__with_bytesio_and_octet_stream_mime(mock_magic): + """ + Test that python-magic works with BytesIO objects and seeks back to original position. 
+ """ + mock_magic.from_buffer.return_value = "application/pdf" + + pdf_bytes = b"%PDF-1.4\n%\xe2\xe3\xcf\xd3\n" + bytes_io = BytesIO(pdf_bytes) + + assert bytes_io.tell() == 0 + + result = ensure_filename_with_extension("document", "application/octet-stream", bytes_io) + + # THEN the filename should have a .pdf extension + assert result == "document.pdf" + + assert bytes_io.tell() == 0 + + +def test_ensure_filename_with_extension__with_existing_extension_ignores_contents(): + """ + Test that existing filename extensions are preserved even when contents suggest a different type. + """ + # GIVEN a filename with .txt extension + filename = "document.txt" + + pdf_bytes = b"%PDF-1.4\n%\xe2\xe3\xcf\xd3\n" + + result = ensure_filename_with_extension(filename, "application/octet-stream", pdf_bytes) + + assert result == "document.txt" + + +def test_ensure_filename_with_extension__with_non_octet_stream_mime_ignores_contents(): + """ + Test that python-magic is only used when MIME type is application/octet-stream. + """ + mime_type = "image/jpeg" + + pdf_bytes = b"%PDF-1.4\n%\xe2\xe3\xcf\xd3\n" + + result = ensure_filename_with_extension(None, mime_type, pdf_bytes) + + assert result == "file.jpg" + + +@patch("vellum.utils.files.extensions.magic") +def test_ensure_filename_with_extension__with_charset_in_detected_mime(mock_magic): + """ + Test that charset parameters are stripped from python-magic detected MIME types. 
+ """ + mock_magic.from_buffer.return_value = "text/html; charset=utf-8" + + html_bytes = b"Hello" + + result = ensure_filename_with_extension(None, "application/octet-stream", html_bytes) + + # THEN the filename should have an .html extension (charset should be stripped) + assert result == "file.html" diff --git a/src/vellum/utils/files/upload.py b/src/vellum/utils/files/upload.py index 78513f51bd..52e12dbffe 100644 --- a/src/vellum/utils/files/upload.py +++ b/src/vellum/utils/files/upload.py @@ -92,7 +92,7 @@ def upload_vellum_file( decoded = base64.b64decode(base64_content) # Ensure filename has appropriate extension - resolved_filename = ensure_filename_with_extension(filename, mime_type) + resolved_filename = ensure_filename_with_extension(filename, mime_type, decoded) file_content: File = (resolved_filename, BytesIO(decoded), mime_type) try: @@ -130,7 +130,7 @@ def upload_vellum_file( content_type = response.headers.get("content-type", "application/octet-stream") # Ensure filename has appropriate extension - resolved_filename = ensure_filename_with_extension(filename, content_type) + resolved_filename = ensure_filename_with_extension(filename, content_type, content) file_content = (resolved_filename, BytesIO(content), content_type) try: