Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -172,3 +172,5 @@ cython_debug/

# PyPI configuration file
.pypirc

*.txt
101 changes: 101 additions & 0 deletions docs/api/llm.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
# LLM API
A FastAPI application for serving LLM inferences.

## Features

New: `POST /v1/llm/` endpoint for generic LLM inference with a prompt and optional context, powered by Groq Cloud (LLaMA 3.1).
Models defined in `src/core/api/models/llm/request.py` and `src/core/api/models/llm/response.py`.


## API Documentation
### LLM Endpoint
`POST /v1/llm/`
Generate text using an LLM provider with a prompt and optional context.
Request Body
```
{
"prompt": "string", // Required: The main task or query for the LLM
"context": "string", // Optional: Supplementary information to guide the response
"model": "string", // Optional: LLM model (default: "llama-3.1-8b-instant")
"max_tokens": integer, // Optional: Maximum tokens in response (default: 512)
"temperature": float, // Optional: Sampling temperature (default: 0.7)
"top_p": float, // Optional: Top-p sampling (default: 0.9)
"stream": boolean // Optional: Enable streaming response (default: false)
}
```
Responses

200 OK (Non-Streaming):
```
{
"response": "string", // LLM-generated text
"model": "string", // Model used
"usage": { // Optional: Token usage
"prompt_tokens": integer,
"completion_tokens": integer,
"total_tokens": integer
}
}
```

200 OK (Streaming): `text/event-stream` response carrying chunks of text, ending with `[DONE]`.
500 Internal Server Error: If the LLM provider fails.

Example
```
curl -X POST http://localhost:8000/v1/llm/ \
-H "Content-Type: application/json" \
-d '{
"prompt": "What is the capital of Brazil?",
"context": "Answer concisely with no explanation.",
"model": "llama-3.1-8b-instant",
"max_tokens": 50,
"temperature": 0.7,
"stream": false
}'
```


## Usage Guide
### Using the LLM Endpoint
The `/v1/llm/` endpoint allows clients to send a prompt and optional context to generate text using an LLM (e.g., LLaMA 3.1 via Groq Cloud). The request and response models are defined in `src/core/api/models/llm/request.py` and `src/core/api/models/llm/response.py`, respectively.
### Example: Non-Streaming Request
Send a prompt to get a concise response:
```
curl -X POST http://localhost:8000/v1/llm/ \
-H "Content-Type: application/json" \
-d '{
"prompt": "What is the capital of Brazil?",
"context": "Answer concisely with no explanation.",
"model": "llama-3.1-8b-instant",
"max_tokens": 50
}'
```
Response:
```
{
"response": "The capital is Brasília.",
"model": "llama-3.1-8b-instant",
"usage": {
"prompt_tokens": 12,
"completion_tokens": 6,
"total_tokens": 18
}
}
```

### Example: Streaming Request
Stream a response for a creative task:
```
curl -X POST http://localhost:8000/v1/llm/ \
-H "Content-Type: application/json" \
-d '{
"prompt": "Tell me a short story.",
"context": "The story should be about a dragon in a fantasy setting.",
"model": "llama-3.1-8b-instant",
"max_tokens": 100,
"stream": true
}'
```
Response: Streams tokens like "Once ", "upon ", "a time...", ending with `[DONE]`.

18 changes: 18 additions & 0 deletions pytest.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
[pytest]
# Configure asyncio for async tests
asyncio_mode = strict
asyncio_default_fixture_loop_scope = function

# Test discovery patterns with wildcards
python_files = test_*.py
python_classes = Test*
python_functions = test_*

# Make sure Python can find your modules
pythonpath = .

# Directories to skip during recursive test discovery
norecursedirs = .* venv build dist

markers =
e2e: Mark end-to-end test
6 changes: 6 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,9 @@ mkdocs-mermaid2-plugin
networkx
fastapi
uvicorn
dotenv
groq
pytest
pytest-asyncio
pytest-mock
httpx
4 changes: 4 additions & 0 deletions run.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
import uvicorn
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

if __name__ == "__main__":
uvicorn.run("src.core.main:app",
Expand Down
19 changes: 19 additions & 0 deletions src/core/api/models/llm/request.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from pydantic import BaseModel, Field, field_validator
from typing import Optional

class LLMRequest(BaseModel):
    """Payload accepted by the LLM generation endpoint.

    Only ``prompt`` is mandatory; every other field carries a default and
    may be omitted by the client.
    """

    # Main task or query for the model; must contain non-whitespace text.
    prompt: str
    # Optional supplementary instructions to guide the answer.
    context: Optional[str] = None
    # Sampling/generation settings and their defaults.
    model: Optional[str] = 'llama-3.1-8b-instant'
    max_tokens: Optional[int] = 512
    temperature: Optional[float] = 0.7
    top_p: Optional[float] = 0.9
    # When true the caller asks for a streamed reply.
    stream: Optional[bool] = False

    @field_validator('prompt')
    @classmethod
    def prompt_must_not_be_empty(cls, v):
        """Reject prompts that are empty or consist solely of whitespace."""
        if v.strip():
            return v
        raise ValueError('prompt cannot be empty')
9 changes: 9 additions & 0 deletions src/core/api/models/llm/response.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
from pydantic import BaseModel
from typing import Optional

class LLMResponse(BaseModel):
    """ Response model for LLM API (body of a non-streaming reply). """

    # Text generated by the model.
    response: str
    # Identifier of the model that produced the text.
    model: str
    # Token-usage mapping when the provider reports one; otherwise None.
    usage: Optional[dict] = None
83 changes: 83 additions & 0 deletions src/core/api/routes/llm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
from fastapi import APIRouter, HTTPException
from fastapi.responses import StreamingResponse
from groq import Groq
import os
from dotenv import load_dotenv
from src.core.api.models.llm.request import LLMRequest
from src.core.api.models.llm.response import LLMResponse

# Pull GROQ_API_KEY (and any other settings) from a local .env file, if present.
load_dotenv()

router = APIRouter(prefix="/v1/llm", tags=["llm"])

# Validate configuration *before* building the client and fail fast with a
# plain configuration error. HTTPException is only meaningful inside a request
# handler, where FastAPI turns it into an HTTP response; raised at import time
# it would merely abort start-up with a misleading exception type.
_groq_api_key = os.getenv("GROQ_API_KEY")
if not _groq_api_key:
    raise RuntimeError("Groq API key is not set.")

# Shared, module-level Groq client used by every route in this module.
client = Groq(api_key=_groq_api_key)

@router.post("/", response_model=LLMResponse)
async def generate_response(request: LLMRequest):
    """Generate a response from the LLM provider for a prompt and optional context.

    When ``request.context`` is provided it is sent as a system message ahead
    of the user prompt.

    Returns:
        A ``StreamingResponse`` that emits text chunks followed by a final
        ``[DONE]`` marker when ``request.stream`` is true; otherwise an
        ``LLMResponse`` with the generated text, the model name and, when the
        provider reports it, token usage.

    Raises:
        HTTPException: 500 if the provider call or response handling fails.
    """
    # Imported locally so this handler does not depend on edits to the
    # module-level import block.
    from fastapi.concurrency import run_in_threadpool

    # Combine context (if any) and prompt into the chat message list.
    messages = []
    if request.context:
        messages.append({"role": "system", "content": request.context})
    messages.append({"role": "user", "content": request.prompt})

    try:
        # The Groq client used here is synchronous. Calling it directly inside
        # an ``async def`` handler would block the event loop for the whole
        # round trip, so hand it to the worker threadpool instead.
        response = await run_in_threadpool(
            client.chat.completions.create,
            messages=messages,
            model=request.model,
            max_tokens=request.max_tokens,
            temperature=request.temperature,
            top_p=request.top_p,
            stream=request.stream,
        )

        if request.stream:
            def stream_response():
                for chunk in response:
                    # Skip chunks that carry no choices instead of failing
                    # with IndexError part-way through the stream.
                    if not chunk.choices:
                        continue
                    content = chunk.choices[0].delta.content
                    if content:
                        yield content
                yield "[DONE]"

            # NOTE(review): chunks are emitted as raw text rather than
            # ``data:``-framed events even though the media type is
            # text/event-stream; left as-is so existing clients keep working.
            return StreamingResponse(stream_response(), media_type="text/event-stream")

        # Non-streaming: return the full completion in one body.
        usage = getattr(response, "usage", None)
        return LLMResponse(
            response=response.choices[0].message.content,
            model=response.model,
            usage=usage.dict() if usage else None,
        )

    except Exception as e:
        # Chain the original error so the real cause stays in the traceback.
        raise HTTPException(status_code=500, detail=f"Error communicating with LLM provider: {str(e)}") from e



@router.get("/health")
async def health_check():
    """Health check for the LLM endpoint.

    Sends a minimal chat completion to the provider to verify connectivity.

    Returns:
        ``{"status": "healthy"}`` when the provider call succeeds.

    Raises:
        HTTPException: 503 if the provider call fails for any reason.
    """
    # Imported locally so this handler does not depend on edits to the
    # module-level import block.
    from fastapi.concurrency import run_in_threadpool

    try:
        # The client is synchronous; run it off the event loop. The reply
        # itself is irrelevant here — only success or failure matters.
        await run_in_threadpool(
            client.chat.completions.create,
            messages=[{"role": "user", "content": "Hello!"}],
            model="llama-3.1-8b-instant",
            max_tokens=10,
        )
    except Exception as e:
        # Chain the original error so the real cause stays in the traceback.
        raise HTTPException(status_code=503, detail=f"LLM provider is not healthy: {str(e)}") from e

    return {
        "status": "healthy"
    }
7 changes: 7 additions & 0 deletions src/core/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from src.core.domains.cooking.matchers import CookingMatcher
from src.core.domains.cooking.validators import CookingValidator
from src.core.registry.domain_registry import DomainRegistry
from src.core.api.routes.llm import router as llm_router

# Create FastAPI app
app = FastAPI(title="Open Matching Engine API")
Expand All @@ -13,8 +14,14 @@ async def health_check():
"""Simple health check endpoint."""
return {"status": "ok", "domains": list(DomainRegistry._extractors.keys())}

# @app.get("/llm")
# async def llm_check():
# """Simple LLM check endpoint."""
# return {"status": "ok", "llm": "gpt-3.5-turbo"}

# Register routes
app.include_router(match_router, tags=["matching"])
app.include_router(llm_router, tags=["llm"])

# Register domain components
DomainRegistry.register_extractor("cooking", CookingExtractor())
Expand Down
2 changes: 1 addition & 1 deletion src/core/models/supply_trees.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from dataclasses import dataclass, field
from datetime import datetime, timedelta
from typing import Dict, List, Optional, Set, Union, Tuple
from typing import Any, Dict, List, Optional, Set, Union, Tuple
from enum import Enum
from uuid import UUID, uuid4
import networkx as nx
Expand Down
24 changes: 24 additions & 0 deletions tests/llm/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import pytest
from fastapi.testclient import TestClient
from src.core.main import app
from unittest.mock import MagicMock, patch

@pytest.fixture
def client():
    """Provide a ``TestClient`` bound to the FastAPI application under test."""
    test_client = TestClient(app)
    return test_client

@pytest.fixture
def mock_groq():
    """Swap the LLM route module's Groq client for a ``MagicMock``.

    Yields the replacement client for the duration of the test. Tests shape
    the canned completion through
    ``mock_groq.chat.completions.create.return_value``.
    """
    fake_client = MagicMock()
    # Give ``create`` a dedicated, plain (non-awaitable) object to return.
    fake_client.chat.completions.create.return_value = MagicMock()

    # The route looks up ``client`` in its module globals at call time, so
    # patching that attribute is enough to intercept every provider call.
    with patch("src.core.api.routes.llm.client", new=fake_client):
        yield fake_client
Empty file added tests/llm/e2e/__init__.py
Empty file.
27 changes: 27 additions & 0 deletions tests/llm/e2e/test_llm_workflow.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import pytest
from fastapi.testclient import TestClient
from unittest.mock import MagicMock

# The ``e2e`` marker is registered in pytest.ini, so it is used as-is here.
# Re-assigning ``pytest.mark.e2e`` to a ``skipif`` decorator would make this
# test carry ``skipif`` instead of ``e2e`` and hide it from ``-m e2e`` runs.
@pytest.mark.e2e
def test_llm_workflow(client, mock_groq):
    """End-to-end test for LLM workflow.

    Posts a prompt plus context to ``/v1/llm/`` with the Groq client mocked
    and checks that the mocked completion text reaches the response body.
    """
    # Shape the canned completion returned by the mocked Groq client.
    mock_response = mock_groq.chat.completions.create.return_value
    mock_response.choices = [MagicMock()]
    mock_response.choices[0].message = MagicMock()
    mock_response.choices[0].message.content = '{"result": "processed data"}'
    mock_response.model = "llama-3.1-8b-instant"
    mock_response.usage = MagicMock()
    mock_response.usage.dict.return_value = {"total_tokens": 20}

    # Call endpoint
    response = client.post(
        "/v1/llm/",
        json={"prompt": "Process this text", "context": "Return JSON"}
    )

    assert response.status_code == 200
    assert "result" in response.json()["response"]
Empty file.
25 changes: 25 additions & 0 deletions tests/llm/integration/test_llm_routes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import pytest
from fastapi.testclient import TestClient
from unittest.mock import MagicMock

def test_llm_endpoint_non_streaming(client, mock_groq):
    """A non-streaming POST returns the mocked completion text with HTTP 200."""
    # Build the single choice the route will read the text from.
    choice = MagicMock()
    choice.message.content = "Test response"

    # Attach it, plus model and usage data, to the mocked provider reply.
    completion = mock_groq.chat.completions.create.return_value
    completion.choices = [choice]
    completion.model = "llama-3.1-8b-instant"
    completion.usage.dict.return_value = {"total_tokens": 10}

    # Exercise the endpoint with only the required field.
    reply = client.post("/v1/llm/", json={"prompt": "Test prompt"})

    assert reply.status_code == 200
    assert reply.json()["response"] == "Test response"

# Similar updates for other failing tests...
Empty file added tests/llm/unit/__init__.py
Empty file.
Loading