diff --git a/docs/docs/providers/file_processor/index.mdx b/docs/docs/providers/file_processor/index.mdx new file mode 100644 index 0000000000..3355112f43 --- /dev/null +++ b/docs/docs/providers/file_processor/index.mdx @@ -0,0 +1,10 @@ +--- +sidebar_label: File Processor +title: File_Processor +--- + +# File_Processor + +## Overview + +This section contains documentation for all available providers for the **file_processor** API. diff --git a/docs/docs/providers/file_processor/inline_reference.mdx b/docs/docs/providers/file_processor/inline_reference.mdx new file mode 100644 index 0000000000..9fe76f51f8 --- /dev/null +++ b/docs/docs/providers/file_processor/inline_reference.mdx @@ -0,0 +1,17 @@ +--- +description: "Reference file processor implementation (placeholder for development)" +sidebar_label: Reference +title: inline::reference +--- + +# inline::reference + +## Description + +Reference file processor implementation (placeholder for development) + +## Sample Configuration + +```yaml +{} +``` diff --git a/src/llama_stack/apis/datatypes.py b/src/llama_stack/apis/datatypes.py index ae01c5dfc5..126dee1d73 100644 --- a/src/llama_stack/apis/datatypes.py +++ b/src/llama_stack/apis/datatypes.py @@ -127,6 +127,7 @@ class Api(Enum, metaclass=DynamicApiMeta): files = "files" prompts = "prompts" conversations = "conversations" + file_processor = "file_processor" # built-in API inspect = "inspect" diff --git a/src/llama_stack/apis/file_processor/__init__.py b/src/llama_stack/apis/file_processor/__init__.py new file mode 100644 index 0000000000..295141a21e --- /dev/null +++ b/src/llama_stack/apis/file_processor/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from .file_processor import * diff --git a/src/llama_stack/apis/file_processor/file_processor.py b/src/llama_stack/apis/file_processor/file_processor.py new file mode 100644 index 0000000000..a0785682fc --- /dev/null +++ b/src/llama_stack/apis/file_processor/file_processor.py @@ -0,0 +1,96 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from typing import Any, Protocol, runtime_checkable + +from pydantic import BaseModel + +from llama_stack.apis.common.tracing import telemetry_traceable +from llama_stack.apis.vector_io.vector_io import Chunk, VectorStoreChunkingStrategy +from llama_stack.apis.version import LLAMA_STACK_API_V1ALPHA +from llama_stack.schema_utils import json_schema_type, webmethod + + +@json_schema_type +class ProcessFileRequest(BaseModel): + """Request for processing a file into structured content.""" + + file_data: bytes + """Raw file data to process.""" + + filename: str + """Original filename for format detection and processing hints.""" + + options: dict[str, Any] | None = None + """Optional processing options. 
Provider-specific parameters.""" + + chunking_strategy: VectorStoreChunkingStrategy | None = None + """Optional chunking strategy for splitting content into chunks.""" + + include_embeddings: bool = False + """Whether to generate embeddings for chunks.""" + + +@json_schema_type +class ProcessedContent(BaseModel): + """Result of file processing operation.""" + + content: str + """Extracted text content from the file.""" + + chunks: list[Chunk] | None = None + """Optional chunks if chunking strategy was provided.""" + + embeddings: list[list[float]] | None = None + """Optional embeddings for chunks if requested.""" + + metadata: dict[str, Any] + """Processing metadata including processor name, timing, and provider-specific data.""" + + +@telemetry_traceable +@runtime_checkable +class FileProcessor(Protocol): + """ + File Processor API for converting files into structured, processable content. + + This API provides a flexible interface for processing various file formats + (PDFs, documents, images, etc.) into text content that can be used for + vector store ingestion, RAG applications, or standalone content extraction. + + The API supports: + - Multiple file formats through extensible provider architecture + - Configurable processing options per provider + - Integration with vector store chunking strategies + - Optional embedding generation for chunks + - Rich metadata about processing results + + Future providers can extend this interface to support additional formats, + processing capabilities, and optimization strategies. + """ + + @webmethod(route="/file-processor/process", method="POST", level=LLAMA_STACK_API_V1ALPHA) + async def process_file( + self, + file_data: bytes, + filename: str, + options: dict[str, Any] | None = None, + chunking_strategy: VectorStoreChunkingStrategy | None = None, + include_embeddings: bool = False, + ) -> ProcessedContent: + """ + Process a file into structured content with optional chunking and embeddings. + + This method processes raw file data and converts it into text content for applications such as vector store ingestion. + + :param file_data: Raw bytes of the file to process. + :param filename: Original filename for format detection. + :param options: Provider-specific processing options (e.g., OCR settings, output format). + :param chunking_strategy: Optional strategy for splitting content into chunks. + :param include_embeddings: Whether to generate embeddings for chunks. + :returns: ProcessedContent with extracted text, optional chunks, and metadata. + """ + ... 
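
A minimal sketch (not part of this diff) of how the request/response models defined above fit together; it assumes this patch is installed so that `llama_stack.apis.file_processor` is importable, and the `options` key shown is a hypothetical provider-specific setting:

```python
# Illustrative sketch only -- not part of this diff.
from llama_stack.apis.file_processor import ProcessedContent, ProcessFileRequest

# Build a request: raw bytes plus a filename for format detection; `options` is an
# opaque, provider-specific dict, and chunking/embeddings are opt-in.
request = ProcessFileRequest(
    file_data=b"%PDF-1.4 ...",
    filename="report.pdf",
    options={"ocr": True},  # hypothetical provider option
    include_embeddings=False,
)

# A provider returns extracted text plus metadata; chunks and embeddings stay None
# unless a chunking strategy was supplied and embeddings were requested.
result = ProcessedContent(
    content="Extracted text ...",
    metadata={"processor": "example", "filename": request.filename},
)
print(result.content, result.metadata)
```
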
diff --git a/src/llama_stack/core/resolver.py b/src/llama_stack/core/resolver.py index 8bf371fed5..e78bcc1fa1 100644 --- a/src/llama_stack/core/resolver.py +++ b/src/llama_stack/core/resolver.py @@ -16,6 +16,7 @@ from llama_stack.apis.datasets import Datasets from llama_stack.apis.datatypes import ExternalApiSpec from llama_stack.apis.eval import Eval +from llama_stack.apis.file_processor import FileProcessor from llama_stack.apis.files import Files from llama_stack.apis.inference import Inference, InferenceProvider from llama_stack.apis.inspect import Inspect @@ -96,6 +97,7 @@ def api_protocol_map(external_apis: dict[Api, ExternalApiSpec] | None = None) -> Api.files: Files, Api.prompts: Prompts, Api.conversations: Conversations, + Api.file_processor: FileProcessor, } if external_apis: diff --git a/src/llama_stack/distributions/ci-tests/build.yaml b/src/llama_stack/distributions/ci-tests/build.yaml index f29ac77126..ef2e552c05 100644 --- a/src/llama_stack/distributions/ci-tests/build.yaml +++ b/src/llama_stack/distributions/ci-tests/build.yaml @@ -29,6 +29,8 @@ distribution_spec: - provider_type: remote::weaviate files: - provider_type: inline::localfs + file_processor: + - provider_type: inline::reference safety: - provider_type: inline::llama-guard - provider_type: inline::code-scanner diff --git a/src/llama_stack/distributions/ci-tests/run.yaml b/src/llama_stack/distributions/ci-tests/run.yaml index 1118d2ad1c..73af100014 100644 --- a/src/llama_stack/distributions/ci-tests/run.yaml +++ b/src/llama_stack/distributions/ci-tests/run.yaml @@ -5,6 +5,7 @@ apis: - batches - datasetio - eval +- file_processor - files - inference - post_training @@ -154,6 +155,9 @@ providers: metadata_store: table_name: files_metadata backend: sql_default + file_processor: + - provider_id: reference + provider_type: inline::reference safety: - provider_id: llama-guard provider_type: inline::llama-guard diff --git a/src/llama_stack/distributions/starter-gpu/build.yaml b/src/llama_stack/distributions/starter-gpu/build.yaml index 10cbb13896..e286bc3d8b 100644 --- a/src/llama_stack/distributions/starter-gpu/build.yaml +++ b/src/llama_stack/distributions/starter-gpu/build.yaml @@ -30,6 +30,8 @@ distribution_spec: - provider_type: remote::weaviate files: - provider_type: inline::localfs + file_processor: + - provider_type: inline::reference safety: - provider_type: inline::llama-guard - provider_type: inline::code-scanner diff --git a/src/llama_stack/distributions/starter-gpu/run-with-postgres-store.yaml b/src/llama_stack/distributions/starter-gpu/run-with-postgres-store.yaml index 1920ebd9d0..e0cab6618e 100644 --- a/src/llama_stack/distributions/starter-gpu/run-with-postgres-store.yaml +++ b/src/llama_stack/distributions/starter-gpu/run-with-postgres-store.yaml @@ -5,6 +5,7 @@ apis: - batches - datasetio - eval +- file_processor - files - inference - post_training @@ -154,6 +155,9 @@ providers: metadata_store: table_name: files_metadata backend: sql_default + file_processor: + - provider_id: reference + provider_type: inline::reference safety: - provider_id: llama-guard provider_type: inline::llama-guard diff --git a/src/llama_stack/distributions/starter-gpu/run.yaml b/src/llama_stack/distributions/starter-gpu/run.yaml index 7149b86597..367788b3d5 100644 --- a/src/llama_stack/distributions/starter-gpu/run.yaml +++ b/src/llama_stack/distributions/starter-gpu/run.yaml @@ -5,6 +5,7 @@ apis: - batches - datasetio - eval +- file_processor - files - inference - post_training @@ -154,6 +155,9 @@ providers: metadata_store: 
table_name: files_metadata backend: sql_default + file_processor: + - provider_id: reference + provider_type: inline::reference safety: - provider_id: llama-guard provider_type: inline::llama-guard diff --git a/src/llama_stack/distributions/starter/build.yaml b/src/llama_stack/distributions/starter/build.yaml index acd51f773e..e673cc3e6d 100644 --- a/src/llama_stack/distributions/starter/build.yaml +++ b/src/llama_stack/distributions/starter/build.yaml @@ -30,6 +30,8 @@ distribution_spec: - provider_type: remote::weaviate files: - provider_type: inline::localfs + file_processor: + - provider_type: inline::reference safety: - provider_type: inline::llama-guard - provider_type: inline::code-scanner diff --git a/src/llama_stack/distributions/starter/run-with-postgres-store.yaml b/src/llama_stack/distributions/starter/run-with-postgres-store.yaml index 702f953813..0f1616766b 100644 --- a/src/llama_stack/distributions/starter/run-with-postgres-store.yaml +++ b/src/llama_stack/distributions/starter/run-with-postgres-store.yaml @@ -5,6 +5,7 @@ apis: - batches - datasetio - eval +- file_processor - files - inference - post_training @@ -154,6 +155,9 @@ providers: metadata_store: table_name: files_metadata backend: sql_default + file_processor: + - provider_id: reference + provider_type: inline::reference safety: - provider_id: llama-guard provider_type: inline::llama-guard diff --git a/src/llama_stack/distributions/starter/run.yaml b/src/llama_stack/distributions/starter/run.yaml index 0ce3928105..3cf9f7d04a 100644 --- a/src/llama_stack/distributions/starter/run.yaml +++ b/src/llama_stack/distributions/starter/run.yaml @@ -5,6 +5,7 @@ apis: - batches - datasetio - eval +- file_processor - files - inference - post_training @@ -154,6 +155,9 @@ providers: metadata_store: table_name: files_metadata backend: sql_default + file_processor: + - provider_id: reference + provider_type: inline::reference safety: - provider_id: llama-guard provider_type: inline::llama-guard diff --git a/src/llama_stack/distributions/starter/starter.py b/src/llama_stack/distributions/starter/starter.py index 88cd3a4fe6..068c6ded80 100644 --- a/src/llama_stack/distributions/starter/starter.py +++ b/src/llama_stack/distributions/starter/starter.py @@ -128,6 +128,7 @@ def get_distribution_template(name: str = "starter") -> DistributionTemplate: BuildProvider(provider_type="remote::weaviate"), ], "files": [BuildProvider(provider_type="inline::localfs")], + "file_processor": [BuildProvider(provider_type="inline::reference")], "safety": [ BuildProvider(provider_type="inline::llama-guard"), BuildProvider(provider_type="inline::code-scanner"), diff --git a/src/llama_stack/log.py b/src/llama_stack/log.py index c11c2c06f3..83e6b96b66 100644 --- a/src/llama_stack/log.py +++ b/src/llama_stack/log.py @@ -45,6 +45,7 @@ class LoggingConfig(BaseModel): "providers", "models", "files", + "file_processor", "vector_io", "tool_runtime", "cli", diff --git a/src/llama_stack/providers/inline/file_processor/__init__.py b/src/llama_stack/providers/inline/file_processor/__init__.py new file mode 100644 index 0000000000..756f351d88 --- /dev/null +++ b/src/llama_stack/providers/inline/file_processor/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
diff --git a/src/llama_stack/providers/inline/file_processor/reference/__init__.py b/src/llama_stack/providers/inline/file_processor/reference/__init__.py new file mode 100644 index 0000000000..3c8b6a7ec0 --- /dev/null +++ b/src/llama_stack/providers/inline/file_processor/reference/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from .config import ReferenceFileProcessorImplConfig + + +async def get_provider_impl(config: ReferenceFileProcessorImplConfig, deps): + from .reference import ReferenceFileProcessorImpl + + impl = ReferenceFileProcessorImpl(config, deps) + await impl.initialize() + return impl diff --git a/src/llama_stack/providers/inline/file_processor/reference/config.py b/src/llama_stack/providers/inline/file_processor/reference/config.py new file mode 100644 index 0000000000..7c6de7483e --- /dev/null +++ b/src/llama_stack/providers/inline/file_processor/reference/config.py @@ -0,0 +1,15 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from pydantic import BaseModel + + +class ReferenceFileProcessorImplConfig(BaseModel): + """Configuration for the reference file processor implementation.""" + + @staticmethod + def sample_run_config(**kwargs): + return {} diff --git a/src/llama_stack/providers/inline/file_processor/reference/reference.py b/src/llama_stack/providers/inline/file_processor/reference/reference.py new file mode 100644 index 0000000000..1aaf1efa30 --- /dev/null +++ b/src/llama_stack/providers/inline/file_processor/reference/reference.py @@ -0,0 +1,42 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from typing import Any + +from llama_stack.apis.file_processor import FileProcessor, ProcessedContent +from llama_stack.apis.vector_io import VectorStoreChunkingStrategy + +from .config import ReferenceFileProcessorImplConfig + + +class ReferenceFileProcessorImpl(FileProcessor): + """Reference implementation of the FileProcessor API.""" + + def __init__(self, config: ReferenceFileProcessorImplConfig, deps: dict[str, Any]): + self.config = config + self.deps = deps + + async def initialize(self) -> None: + pass + + async def process_file( + self, + file_data: bytes, + filename: str, + options: dict[str, Any] | None = None, + chunking_strategy: VectorStoreChunkingStrategy | None = None, + include_embeddings: bool = False, + ) -> ProcessedContent: + """Process a file into structured content.""" + return ProcessedContent( + content="Placeholder content", + chunks=None, + embeddings=None, + metadata={ + "processor": "reference", + "filename": filename, + }, + ) diff --git a/src/llama_stack/providers/registry/file_processor.py b/src/llama_stack/providers/registry/file_processor.py new file mode 100644 index 0000000000..173e5a3939 --- /dev/null +++ b/src/llama_stack/providers/registry/file_processor.py @@ -0,0 +1,20 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+ +from llama_stack.providers.datatypes import Api, InlineProviderSpec, ProviderSpec + + +def available_providers() -> list[ProviderSpec]: + return [ + InlineProviderSpec( + api=Api.file_processor, + provider_type="inline::reference", + pip_packages=[], + module="llama_stack.providers.inline.file_processor.reference", + config_class="llama_stack.providers.inline.file_processor.reference.config.ReferenceFileProcessorImplConfig", + description="Reference file processor implementation (placeholder for development)", + ), + ]
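
For completeness, a hedged sketch of how the registered reference provider could be constructed and invoked outside a running stack, using only the factory, config, and implementation added in this diff (the reference provider has no dependencies, so `deps` is empty):

```python
# Illustrative sketch only -- not part of this diff. Assumes the patch is installed.
import asyncio

from llama_stack.providers.inline.file_processor.reference import get_provider_impl
from llama_stack.providers.inline.file_processor.reference.config import (
    ReferenceFileProcessorImplConfig,
)


async def main() -> None:
    # get_provider_impl constructs the impl and awaits initialize() before returning.
    impl = await get_provider_impl(ReferenceFileProcessorImplConfig(), deps={})
    result = await impl.process_file(file_data=b"hello world", filename="notes.txt")
    # The placeholder implementation always returns canned content plus metadata.
    print(result.content)   # "Placeholder content"
    print(result.metadata)  # {"processor": "reference", "filename": "notes.txt"}


asyncio.run(main())
```
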