diff --git a/.gitignore b/.gitignore index 6de83ed8..4c6c70d9 100755 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,4 @@ __pycache__/ .idea/ openapi.json openapi_client.json +.env diff --git a/.speakeasy/gen.lock b/.speakeasy/gen.lock index d712b6ba..3bda029b 100755 --- a/.speakeasy/gen.lock +++ b/.speakeasy/gen.lock @@ -1,10 +1,10 @@ lockVersion: 2.0.0 id: 8b5fa338-9106-4734-abf0-e30d67044a90 management: - docChecksum: 21f469b38bb72725739ee9d9d0fc8780 - docVersion: 1.0.51 - speakeasyVersion: 1.424.0 - generationVersion: 2.445.1 + docChecksum: 80b2dc9fb0c56267e34c1679522a1794 + docVersion: 1.0.52 + speakeasyVersion: 1.421.0 + generationVersion: 2.438.15 releaseVersion: 0.27.0 configChecksum: 6ece96f34cb076ad455a9c66b68c30b0 repoURL: https://github.com/Unstructured-IO/unstructured-python-client.git @@ -13,6 +13,7 @@ management: published: true features: python: + acceptHeaders: 3.0.0 additionalDependencies: 1.0.0 constsAndDefaults: 1.0.4 core: 5.6.1 @@ -107,6 +108,7 @@ examples: responses: "200": application/json: [{"type": "Title", "element_id": "6aa0ff22f91bbe7e26e8e25ca8052acd", "text": "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis", "metadata": {"languages": ["eng"], "page_number": 1, "filename": "layout-parser-paper.pdf", "filetype": "application/pdf"}}] + text/csv: "" "422": application/json: {"detail": []} 5XX: diff --git a/CHANGELOG.md b/CHANGELOG.md index 8ac13563..a02b3187 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ ### Fixes * Use the configured server_url for our split page "dummy" request +* Handle `text/csv` output format and return accordingly when passing the argument ## 0.26.0 diff --git a/Makefile b/Makefile index 4233335c..b02ec280 100644 --- a/Makefile +++ b/Makefile @@ -77,6 +77,15 @@ client-generate-local: speakeasy overlay apply -s ./openapi.json -o ./overlay_client.yaml > ./openapi_client.json speakeasy generate sdk -s ./openapi_client.json -o ./ -l python +## client-generate-localhost: Generate the SDK using the openapi.json from the unstructured-api running at localhost:5000 +.PHONY: client-generate-localhost +client-generate-localhost: + curl -o openapi.json http://localhost:5000/general/openapi.json || { echo "Failed to download openapi.json"; exit 1; } + speakeasy overlay validate -o ./overlay_client.yaml + speakeasy overlay apply -s ./openapi.json -o ./overlay_client.yaml > ./openapi_client.json + python3 -c 'import sys, yaml, json; sys.stdout.write(json.dumps(yaml.safe_load(sys.stdin), indent=2))' < ./openapi_client.json > temp.json && mv temp.json ./openapi_client.json + speakeasy generate sdk -s ./openapi_client.json -o ./ -l python + .PHONY: publish publish: ./scripts/publish.sh diff --git a/_test_unstructured_client/conftest.py b/_test_unstructured_client/conftest.py new file mode 100644 index 00000000..d2c3a168 --- /dev/null +++ b/_test_unstructured_client/conftest.py @@ -0,0 +1,19 @@ +from __future__ import annotations + +import os +from pathlib import Path +from typing import Generator +import pytest + +from unstructured_client.sdk import UnstructuredClient + + +@pytest.fixture(scope="module") +def client() -> Generator[UnstructuredClient, None, None]: + _client = UnstructuredClient(api_key_auth=os.getenv("UNSTRUCTURED_API_KEY"), server='free-api') + yield _client + + +@pytest.fixture(scope="module") +def doc_path() -> Path: + return Path(__file__).resolve().parents[1] / "_sample_docs" diff --git a/_test_unstructured_client/integration/test_decorators.py b/_test_unstructured_client/integration/test_decorators.py index dc94bc61..a6855df7 100644 --- a/_test_unstructured_client/integration/test_decorators.py +++ b/_test_unstructured_client/integration/test_decorators.py @@ -15,6 +15,7 @@ from unstructured_client import UnstructuredClient from unstructured_client.models import shared, operations from unstructured_client.models.errors import HTTPValidationError +from unstructured_client.models.shared.partition_parameters import OutputFormat from unstructured_client.utils.retries import BackoffStrategy, RetryConfig from unstructured_client._hooks.custom import form_utils from unstructured_client._hooks.custom import split_pdf_hook @@ -22,6 +23,33 @@ FAKE_KEY = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" +@pytest.mark.parametrize("split_pdf_page", [True, False]) +def test_integration_split_csv_response(split_pdf_page, client, doc_path): + filename = "layout-parser-paper.pdf" + with open(doc_path / filename, "rb") as f: + files = shared.Files( + content=f.read(), + file_name=filename, + ) + req = operations.PartitionRequest( + partition_parameters=shared.PartitionParameters( + files=files, + output_format=OutputFormat.TEXT_CSV, + split_pdf_page=split_pdf_page, + ) + ) + + resp = client.general.partition(request=req) + + assert resp.status_code == 200 + assert resp.content_type == "text/csv; charset=utf-8" + assert resp.elements is None + assert resp.csv_elements is not None + assert resp.csv_elements.startswith( + "type,element_id,text,filetype,languages,page_number,filename,parent_id" + ) + + @pytest.mark.parametrize("concurrency_level", [1, 2, 5]) @pytest.mark.parametrize( ("filename", "expected_ok", "strategy"), @@ -40,10 +68,10 @@ def test_integration_split_pdf_has_same_output_as_non_split( concurrency_level: int, filename: str, expected_ok: bool, strategy: str ): """ - Tests that output that we get from the split-by-page pdf is the same as from non-split. + Test that the output we get from the split-by-page pdf is the same as from non-split. Requires unstructured-api running in bg. See Makefile for how to run it. - Doesn't check for raw_response as there's no clear patter for how it changes with the number of pages / concurrency_level. + Doesn't check for raw_response as there's no clear pattern for how it changes with the number of pages / concurrency_level. """ try: response = requests.get("http://localhost:8000/general/docs") diff --git a/_test_unstructured_client/integration/test_integration_freemium.py b/_test_unstructured_client/integration/test_integration_freemium.py index 8f05d6b2..5e931c59 100644 --- a/_test_unstructured_client/integration/test_integration_freemium.py +++ b/_test_unstructured_client/integration/test_integration_freemium.py @@ -3,27 +3,16 @@ import asyncio import json import os -from pathlib import Path import pytest from deepdiff import DeepDiff + from unstructured_client import UnstructuredClient from unstructured_client.models import shared, operations from unstructured_client.models.errors import SDKError, ServerError, HTTPValidationError from unstructured_client.utils.retries import BackoffStrategy, RetryConfig -@pytest.fixture(scope="module") -def client() -> UnstructuredClient: - _client = UnstructuredClient(api_key_auth=os.getenv("UNSTRUCTURED_API_KEY"), server='free-api') - yield _client - - -@pytest.fixture(scope="module") -def doc_path() -> Path: - return Path(__file__).resolve().parents[2] / "_sample_docs" - - @pytest.mark.parametrize("split_pdf", [True, False]) @pytest.mark.parametrize("strategy", ["fast", "ocr_only", "hi_res"]) def test_partition_strategies(split_pdf, strategy, client, doc_path): diff --git a/_test_unstructured_client/unit/test_request_utils.py b/_test_unstructured_client/unit/test_request_utils.py index 7f28d6e8..dcbf33f5 100644 --- a/_test_unstructured_client/unit/test_request_utils.py +++ b/_test_unstructured_client/unit/test_request_utils.py @@ -1,9 +1,10 @@ -# Get unit tests for request_utils.py module +from __future__ import annotations + import httpx +import json import pytest -from unstructured_client._hooks.custom.request_utils import create_pdf_chunk_request_params, get_multipart_stream_fields -from unstructured_client.models import shared +from unstructured_client._hooks.custom.request_utils import create_pdf_chunk_request_params, create_response, get_multipart_stream_fields # make the above test using @pytest.mark.parametrize @@ -30,6 +31,7 @@ def test_get_multipart_stream_fields(input_request, expected): fields = get_multipart_stream_fields(input_request) assert fields == expected + def test_multipart_stream_fields_raises_value_error_when_filename_is_not_set(): with pytest.raises(ValueError): get_multipart_stream_fields(httpx.Request( @@ -40,6 +42,7 @@ def test_multipart_stream_fields_raises_value_error_when_filename_is_not_set(): headers={"Content-Type": "multipart/form-data; boundary=----WebKitFormBoundary7MA4YWxkTrZu0gW"}), ) + @pytest.mark.parametrize(("input_form_data", "page_number", "expected_form_data"), [ ( {"hello": "world"}, @@ -70,3 +73,26 @@ def test_multipart_stream_fields_raises_value_error_when_filename_is_not_set(): def test_create_pdf_chunk_request_params(input_form_data, page_number, expected_form_data): form_data = create_pdf_chunk_request_params(input_form_data, page_number) assert form_data == expected_form_data + + +def test_create_response_for_json(): + elements = [ + {"type": "Title", "text": "Hello, World!"}, + {"type": "NarrativeText", "text": "Goodbye!"}, + ] + response = create_response(elements) + assert response.status_code == 200 + assert response.json() == elements + assert response.headers["Content-Type"] == "application/json" + + +def test_create_response_for_csv(): + elements = [ + b'type,element_id,text,languages,page_number,filename,filetype,parent_id' \ + b'\nTitle,f73329878fbbb0bb131a83e7b6daacbe,Module One - Introduction to Product' \ + b' Development and Quality Assurance,[\'eng\'],1,list-item-example-1.pdf,application/pdf,' + ] + response = create_response(elements) + assert response.status_code == 200 + pytest.raises(json.decoder.JSONDecodeError, response.json) + assert response.headers["Content-Type"] == "text/csv; charset=utf-8" diff --git a/_test_unstructured_client/unit/test_split_pdf_hook.py b/_test_unstructured_client/unit/test_split_pdf_hook.py index adc743a8..16898cfa 100644 --- a/_test_unstructured_client/unit/test_split_pdf_hook.py +++ b/_test_unstructured_client/unit/test_split_pdf_hook.py @@ -1,17 +1,13 @@ from __future__ import annotations import asyncio -import io -import logging from asyncio import Task from collections import Counter from functools import partial -from typing import Coroutine -import httpx import pytest import requests -from requests_toolbelt import MultipartDecoder, MultipartEncoder +from requests_toolbelt import MultipartDecoder from unstructured_client._hooks.custom import form_utils, pdf_utils, request_utils from unstructured_client._hooks.custom.form_utils import ( diff --git a/docs/models/operations/partitionresponse.md b/docs/models/operations/partitionresponse.md index b2beb267..d19430ae 100644 --- a/docs/models/operations/partitionresponse.md +++ b/docs/models/operations/partitionresponse.md @@ -8,4 +8,5 @@ | `content_type` | *str* | :heavy_check_mark: | HTTP response content type for this operation | | `status_code` | *int* | :heavy_check_mark: | HTTP response status code for this operation | | `raw_response` | [httpx.Response](https://www.python-httpx.org/api/#response) | :heavy_check_mark: | Raw HTTP response; suitable for custom response parsing | +| `csv_elements` | *Optional[str]* | :heavy_minus_sign: | Successful Response | | `elements` | List[Dict[str, *Any*]] | :heavy_minus_sign: | Successful Response | \ No newline at end of file diff --git a/docs/models/shared/strategy.md b/docs/models/shared/strategy.md index 2fb70b2d..4bad9a6e 100644 --- a/docs/models/shared/strategy.md +++ b/docs/models/shared/strategy.md @@ -10,4 +10,5 @@ The strategy to use for partitioning PDF/image. Options are fast, hi_res, auto. | `FAST` | fast | | `HI_RES` | hi_res | | `AUTO` | auto | -| `OCR_ONLY` | ocr_only | \ No newline at end of file +| `OCR_ONLY` | ocr_only | +| `OD_ONLY` | od_only | \ No newline at end of file diff --git a/src/unstructured_client/_hooks/custom/request_utils.py b/src/unstructured_client/_hooks/custom/request_utils.py index 0e116ab7..304e40eb 100644 --- a/src/unstructured_client/_hooks/custom/request_utils.py +++ b/src/unstructured_client/_hooks/custom/request_utils.py @@ -4,7 +4,7 @@ import io import json import logging -from typing import Tuple, Any, BinaryIO +from typing import Any, BinaryIO, Tuple import httpx from httpx._multipart import DataField, FileField @@ -207,7 +207,8 @@ def prepare_request_headers( new_headers.pop("Content-Length", None) return new_headers -def create_response(elements: list) -> httpx.Response: + +def create_response(elements: list[dict[str, Any] | bytes]) -> httpx.Response: """ Creates a modified response object with updated content. @@ -218,8 +219,12 @@ def create_response(elements: list) -> httpx.Response: Returns: The modified response object with updated content. """ - response = httpx.Response(status_code=200, headers={"Content-Type": "application/json"}) - content = json.dumps(elements).encode() + if isinstance(elements, list) and all(isinstance(element, bytes) for element in elements): + response = httpx.Response(status_code=200, headers={"Content-Type": "text/csv; charset=utf-8"}) + content = b''.join(elements) # type: ignore + else: + response = httpx.Response(status_code=200, headers={"Content-Type": "application/json"}) + content = json.dumps(elements).encode() content_length = str(len(content)) response.headers.update({"Content-Length": content_length}) setattr(response, "_content", content) diff --git a/src/unstructured_client/_hooks/custom/split_pdf_hook.py b/src/unstructured_client/_hooks/custom/split_pdf_hook.py index e21b145c..dfb0dd1d 100644 --- a/src/unstructured_client/_hooks/custom/split_pdf_hook.py +++ b/src/unstructured_client/_hooks/custom/split_pdf_hook.py @@ -576,10 +576,13 @@ def _await_elements( response_number, ) successful_responses.append(res) - if self.cache_tmp_data_feature: - elements.append(load_elements_from_response(res)) - else: - elements.append(res.json()) + if res.headers["Content-Type"] == "application/json": + if self.cache_tmp_data_feature: + elements.append(load_elements_from_response(res)) + else: + elements.append(res.json()) + else: # -- Response contains csv data + elements.append(res.content) # type: ignore else: error_message = f"Failed to partition set {response_number}." @@ -591,7 +594,12 @@ def _await_elements( self.api_successful_responses[operation_id] = successful_responses self.api_failed_responses[operation_id] = failed_responses - flattened_elements = [element for sublist in elements for element in sublist] + flattened_elements = [] + for sublist in elements: + if isinstance(sublist, list): + flattened_elements.extend(sublist) + else: + flattened_elements.append(sublist) return flattened_elements def after_success( @@ -613,7 +621,6 @@ def after_success( """ # Grab the correct id out of the dummy request operation_id = response.request.headers.get("operation_id") - elements = self._await_elements(operation_id) # if fails are disallowed, return the first failed response diff --git a/src/unstructured_client/general.py b/src/unstructured_client/general.py index adc4a6ab..40c56d16 100644 --- a/src/unstructured_client/general.py +++ b/src/unstructured_client/general.py @@ -1,6 +1,7 @@ """Code generated by Speakeasy (https://speakeasy.com). DO NOT EDIT.""" from .basesdk import BaseSDK +from enum import Enum from typing import Any, Dict, List, Optional, Union, cast from unstructured_client import utils from unstructured_client._hooks import HookContext @@ -8,6 +9,11 @@ from unstructured_client.types import BaseModel, OptionalNullable, UNSET +class PartitionAcceptEnum(str, Enum): + APPLICATION_JSON = "application/json" + TEXT_CSV = "text/csv" + + class General(BaseSDK): def partition( self, @@ -18,6 +24,7 @@ def partition( retries: OptionalNullable[utils.RetryConfig] = UNSET, server_url: Optional[str] = None, timeout_ms: Optional[int] = None, + accept_header_override: Optional[PartitionAcceptEnum] = None, ) -> operations.PartitionResponse: r"""Summary @@ -27,6 +34,7 @@ def partition( :param retries: Override the default retry configuration for this method :param server_url: Override the default server URL for this method :param timeout_ms: Override the default request timeout configuration for this method in milliseconds + :param accept_header_override: Override the default accept header for this method """ base_url = None url_variables = None @@ -50,7 +58,9 @@ def partition( request_has_path_params=False, request_has_query_params=True, user_agent_header="user-agent", - accept_header_value="application/json", + accept_header_value=accept_header_override.value + if accept_header_override is not None + else "application/json;q=1, text/csv;q=0", security=self.sdk_configuration.security, get_serialized_body=lambda: utils.serialize_request_body( request.partition_parameters, @@ -95,6 +105,13 @@ def partition( content_type=http_res.headers.get("Content-Type") or "", raw_response=http_res, ) + if utils.match_response(http_res, "200", "text/csv"): + return operations.PartitionResponse( + csv_elements=http_res.text, + status_code=http_res.status_code, + content_type=http_res.headers.get("Content-Type") or "", + raw_response=http_res, + ) if utils.match_response(http_res, "422", "application/json"): data = utils.unmarshal_json(http_res.text, errors.HTTPValidationErrorData) raise errors.HTTPValidationError(data=data) @@ -125,6 +142,7 @@ async def partition_async( retries: OptionalNullable[utils.RetryConfig] = UNSET, server_url: Optional[str] = None, timeout_ms: Optional[int] = None, + accept_header_override: Optional[PartitionAcceptEnum] = None, ) -> operations.PartitionResponse: r"""Summary @@ -134,6 +152,7 @@ async def partition_async( :param retries: Override the default retry configuration for this method :param server_url: Override the default server URL for this method :param timeout_ms: Override the default request timeout configuration for this method in milliseconds + :param accept_header_override: Override the default accept header for this method """ base_url = None url_variables = None @@ -157,7 +176,9 @@ async def partition_async( request_has_path_params=False, request_has_query_params=True, user_agent_header="user-agent", - accept_header_value="application/json", + accept_header_value=accept_header_override.value + if accept_header_override is not None + else "application/json;q=1, text/csv;q=0", security=self.sdk_configuration.security, get_serialized_body=lambda: utils.serialize_request_body( request.partition_parameters, @@ -202,6 +223,13 @@ async def partition_async( content_type=http_res.headers.get("Content-Type") or "", raw_response=http_res, ) + if utils.match_response(http_res, "200", "text/csv"): + return operations.PartitionResponse( + csv_elements=http_res.text, + status_code=http_res.status_code, + content_type=http_res.headers.get("Content-Type") or "", + raw_response=http_res, + ) if utils.match_response(http_res, "422", "application/json"): data = utils.unmarshal_json(http_res.text, errors.HTTPValidationErrorData) raise errors.HTTPValidationError(data=data) diff --git a/src/unstructured_client/models/operations/partition.py b/src/unstructured_client/models/operations/partition.py index d57cfcd1..1b256ff2 100644 --- a/src/unstructured_client/models/operations/partition.py +++ b/src/unstructured_client/models/operations/partition.py @@ -74,6 +74,8 @@ class PartitionResponseTypedDict(TypedDict): r"""HTTP response status code for this operation""" raw_response: httpx.Response r"""Raw HTTP response; suitable for custom response parsing""" + csv_elements: NotRequired[str] + r"""Successful Response""" elements: NotRequired[List[Dict[str, Any]]] r"""Successful Response""" @@ -88,5 +90,8 @@ class PartitionResponse(BaseModel): raw_response: httpx.Response r"""Raw HTTP response; suitable for custom response parsing""" + csv_elements: Optional[str] = None + r"""Successful Response""" + elements: Optional[List[Dict[str, Any]]] = None r"""Successful Response""" diff --git a/src/unstructured_client/models/shared/partition_parameters.py b/src/unstructured_client/models/shared/partition_parameters.py index 8670f5fd..2bb87453 100644 --- a/src/unstructured_client/models/shared/partition_parameters.py +++ b/src/unstructured_client/models/shared/partition_parameters.py @@ -67,6 +67,7 @@ class Strategy(str, Enum, metaclass=utils.OpenEnumMeta): HI_RES = "hi_res" AUTO = "auto" OCR_ONLY = "ocr_only" + OD_ONLY = "od_only" class PartitionParametersTypedDict(TypedDict): diff --git a/src/unstructured_client/sdkconfiguration.py b/src/unstructured_client/sdkconfiguration.py index a365b43f..26c70efb 100644 --- a/src/unstructured_client/sdkconfiguration.py +++ b/src/unstructured_client/sdkconfiguration.py @@ -33,10 +33,10 @@ class SDKConfiguration: server_url: Optional[str] = "" server: Optional[str] = "" language: str = "python" - openapi_doc_version: str = "1.0.51" + openapi_doc_version: str = "1.0.52" sdk_version: str = "0.27.0" - gen_version: str = "2.445.1" - user_agent: str = "speakeasy-sdk/python 0.27.0 2.445.1 1.0.51 unstructured-client" + gen_version: str = "2.438.15" + user_agent: str = "speakeasy-sdk/python 0.27.0 2.438.15 1.0.52 unstructured-client" retry_config: OptionalNullable[RetryConfig] = Field(default_factory=lambda: UNSET) timeout_ms: Optional[int] = None