22 changes: 14 additions & 8 deletions DOCS.md
@@ -91,7 +91,9 @@ synthetic-data-kit/
│ │ └── save_as.py # Format conversion
│ ├── models/ # LLM integration
│ │ ├── __init__.py
│ │ └── llm_client.py # VLLM client
│ │ ├── llm_client.py # Common LLM interface
│ │ ├── base.py # Base class for providers
│ │ └── openai_provider.py # OpenAI-compatible provider
│ ├── parsers/ # Document parsers
│ │ ├── __init__.py
│ │ ├── pdf_parser.py # PDF parser
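
The new `base.py` / `openai_provider.py` split suggests a small provider abstraction behind `llm_client.py`. A minimal sketch of what such an interface could look like, assuming an abstract base class plus an OpenAI-compatible implementation (class and method names here are illustrative, not the toolkit's actual API):

```python
# Hypothetical sketch of a provider abstraction, assuming base.py defines an
# abstract interface and openai_provider.py wraps an OpenAI-compatible endpoint.
from abc import ABC, abstractmethod
from typing import Dict, List

from openai import OpenAI  # pip install openai


class LLMProvider(ABC):
    """Assumed base class: one chat-completion entry point per provider."""

    @abstractmethod
    def chat_completion(self, messages: List[Dict[str, str]], **kwargs) -> str:
        ...


class OpenAIProvider(LLMProvider):
    """Assumed provider for any OpenAI-compatible endpoint (vLLM, Llama API, ...)."""

    def __init__(self, api_base: str, api_key: str, model: str,
                 http_request_timeout: int = 300):
        # The OpenAI client accepts a custom base_url and per-request timeout.
        self.client = OpenAI(base_url=api_base, api_key=api_key,
                             timeout=http_request_timeout)
        self.model = model

    def chat_completion(self, messages, **kwargs) -> str:
        response = self.client.chat.completions.create(
            model=self.model, messages=messages, **kwargs
        )
        return response.choices[0].message.content
```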
@@ -142,6 +144,7 @@ classDiagram
+api_base: str
+model: str
+max_retries: int
+http_request_timeout: int
+retry_delay: float
+config: Dict
+_check_server() tuple
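
The class diagram now carries `http_request_timeout` alongside `max_retries` and `retry_delay`. A hedged sketch of how these three settings typically interact in a request loop (illustrative only, not the toolkit's implementation):

```python
# Illustrative retry loop only -- not the toolkit's actual code. It assumes the
# settings carry the meanings documented here: bounded retries, a growing delay
# between attempts, and a per-request HTTP timeout.
import time

import requests


def post_with_retries(url: str, payload: dict, max_retries: int = 3,
                      retry_delay: float = 1.0,
                      http_request_timeout: int = 300) -> dict:
    last_error = None
    for attempt in range(max_retries):
        try:
            resp = requests.post(url, json=payload, timeout=http_request_timeout)
            resp.raise_for_status()
            return resp.json()
        except requests.RequestException as exc:
            last_error = exc
            # Back off before the next attempt (1x, 2x, 4x the initial delay).
            time.sleep(retry_delay * (2 ** attempt))
    raise RuntimeError(f"request failed after {max_retries} attempts") from last_error
```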
@@ -539,13 +542,16 @@ paths:
cleaned: "data/cleaned"
final: "data/final"

# vllm: Configure VLLM server settings
vllm:
api_base: "http://localhost:8000/v1"
port: 8000
model: "meta-llama/Llama-3.3-70B-Instruct"
max_retries: 3
retry_delay: 1.0
# OpenAI-compatible endpoint configuration
openai-endpoint:
api_base: "https://api.llama.com/v1" # Base URL for OpenAI-compatible API
api_key: "llama-api-key" # API key for the endpoint (can also use env vars)
model: "Llama-4-Maverick-17B-128E-Instruct-FP8" # Default model to use
max_retries: 3 # Number of retries for API calls
retry_delay: 1.0 # Initial delay between retries (seconds)
sleep_time: 0.5 # Pause between batch chunks (seconds)
http_request_timeout: 300 # Timeout for HTTP requests (seconds)
max_concurrent_requests: 32 # Maximum concurrent requests during batching

# generation: Content generation parameters
generation:
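
For readers wiring this up themselves, the `openai-endpoint` block above maps directly onto an OpenAI-compatible client. A sketch, assuming the block sits at the top level of the YAML as in the example and that the key may instead come from an environment variable (the variable name is an assumption for illustration):

```python
# Sketch of reading the openai-endpoint block above; the env-var name
# (OPENAI_API_KEY) and the config path are assumptions, not toolkit behavior.
import os

import yaml  # pip install pyyaml
from openai import OpenAI

with open("configs/config.yaml") as f:
    cfg = yaml.safe_load(f)

endpoint = cfg["openai-endpoint"]
api_key = os.environ.get("OPENAI_API_KEY", endpoint.get("api_key"))

client = OpenAI(base_url=endpoint["api_base"], api_key=api_key,
                timeout=endpoint.get("http_request_timeout", 300))
```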
12 changes: 6 additions & 6 deletions README.md
@@ -155,11 +155,11 @@ The toolkit uses a YAML configuration file (default: `configs/config.yaml`).
Note, this can be overridden via either CLI arguments OR passing a custom YAML file

```yaml
# Example configuration using vLLM
# Example configuration using a local OpenAI-compatible API (e.g. served via `vllm serve`)
llm:
provider: "vllm"
provider: "openai-endpoint"

vllm:
openai-endpoint:
api_base: "http://localhost:8000/v1"
model: "meta-llama/Llama-3.3-70B-Instruct"
sleep_time: 0.1
@@ -175,12 +175,11 @@ curate:
batch_size: 8
```
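
Before pointing the toolkit at a locally served model, it can help to confirm the endpoint at `api_base` is actually reachable. A quick check with the `openai` Python client (the placeholder key is an assumption; local servers usually ignore it unless started with `--api-key`):

```python
# Connectivity check against a local OpenAI-compatible server
# (e.g. one started with `vllm serve`).
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="not-needed")

# List the models the server exposes; this should include the served model name.
for model in client.models.list():
    print(model.id)
```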

or using an API endpoint:

or using an external OpenAI-compatible API:
```yaml
# Example configuration using the llama API
llm:
provider: "api-endpoint"
provider: "openai-endpoint"

api-endpoint:
api_base: "https://api.llama.com/v1"
@@ -455,6 +454,7 @@ graph LR
If you encounter CUDA out of memory errors:
- Use a smaller model
- Reduce batch size in config
- Reduce the maximum sequence/model length (e.g. `--max-model-len 12000`)
- Start vLLM with `--gpu-memory-utilization 0.85`

### JSON Parsing Issues
23 changes: 13 additions & 10 deletions configs/config.yaml
@@ -14,26 +14,29 @@ paths:

# LLM Provider configuration
llm:
# Provider selection: "vllm" or "api-endpoint"
provider: "api-endpoint"
# Provider selection: "openai-endpoint" (OpenAI-compatible APIs)
provider: "openai-endpoint"

# VLLM server configuration
# Legacy vLLM server configuration (merged automatically into openai-endpoint)
vllm:
api_base: "http://localhost:8000/v1" # Base URL for VLLM API
port: 8000 # Port for VLLM server
model: "meta-llama/Llama-3.3-70B-Instruct" # Default model to use
max_retries: 3 # Number of retries for API calls
retry_delay: 1.0 # Initial delay between retries (seconds)
sleep_time: 0.1 # Small delay in seconds between batches to avoid rate limits
http_request_timeout: 180 # HTTP request timeout in seconds (3 minutes)

# API endpoint configuration
api-endpoint:
api_base: "https://api.llama.com/v1" # Optional base URL for API endpoint (null for default API)
api_key: "llama_api_key" # API key for API endpoint or compatible service (can use env var instead)
# OpenAI-compatible endpoint configuration
openai-endpoint:
api_base: "https://api.llama.com/v1" # Base URL for OpenAI-compatible API
api_key: "llama_api_key" # API key for the endpoint (can also use env vars)
model: "Llama-4-Maverick-17B-128E-Instruct-FP8" # Default model to use
max_retries: 3 # Number of retries for API calls
retry_delay: 1.0 # Initial delay between retries (seconds)
sleep_time: 0.5 # Small delay in seconds between batches to avoid rate limits
http_request_timeout: 300 # Timeout for HTTP requests (seconds)
max_concurrent_requests: 32 # Maximum concurrent requests during batching

# Ingest configuration
ingest:
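
The two batching knobs, `max_concurrent_requests` and `sleep_time`, are easiest to picture with a small concurrency-limited loop. The sketch below is illustrative only and assumes an OpenAI-compatible async client, not the toolkit's actual batching code:

```python
# Illustration of the two batching knobs above, not the toolkit's actual code:
# max_concurrent_requests caps in-flight calls, sleep_time pauses between chunks.
import asyncio

from openai import AsyncOpenAI


async def run_batch(prompts, api_base, api_key, model,
                    max_concurrent_requests=32, sleep_time=0.5):
    client = AsyncOpenAI(base_url=api_base, api_key=api_key)
    semaphore = asyncio.Semaphore(max_concurrent_requests)

    async def one(prompt):
        async with semaphore:  # never more than N requests in flight
            resp = await client.chat.completions.create(
                model=model, messages=[{"role": "user", "content": prompt}]
            )
            await asyncio.sleep(sleep_time)  # brief pause to avoid rate limits
            return resp.choices[0].message.content

    return await asyncio.gather(*(one(p) for p in prompts))
```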