helpfulengineering · CelestineAkpanoko · May 20, 2025 · May 20, 2025 · May 20, 2025 · May 20, 2025
diff --git a/.gitignore b/.gitignore
@@ -172,3 +172,5 @@ cython_debug/
 
 # PyPI configuration file
 .pypirc
+
+*.txt
diff --git a/docs/api/llm.md b/docs/api/llm.md
@@ -0,0 +1,101 @@
+# LLM API
+A FastAPI application for serving LLM inferences.
+
+## Features
+
+New: `POST /v1/llm/` endpoint for generic LLM inference with a prompt and optional context, powered by Groq Cloud (LLaMA 3.1).
+Models defined in `src/core/api/models/llm/request.py` and `src/core/api/models/llm/response.py`.
+
+
+## API Documentation
+### LLM Endpoint
+`POST /v1/llm/`
+Generate text using an LLM provider with a prompt and optional context.
+Request Body
+```
+{
+  "prompt": "string", // Required: The main task or query for the LLM
+  "context": "string", // Optional: Supplementary information to guide the response
+  "model": "string", // Optional: LLM model (default: "llama-3.1-8b-instant")
+  "max_tokens": integer, // Optional: Maximum tokens in response (default: 512)
+  "temperature": float, // Optional: Sampling temperature (default: 0.7)
+  "top_p": float, // Optional: Top-p sampling (default: 0.9)
+  "stream": boolean // Optional: Enable streaming response (default: false)
+}
+```
+Responses
+
+200 OK (Non-Streaming):
+```
+{
+  "response": "string", // LLM-generated text
+  "model": "string", // Model used
+  "usage": { // Optional: Token usage
+    "prompt_tokens": integer,
+    "completion_tokens": integer,
+    "total_tokens": integer
+  }
+}
+```
+
+200 OK (Streaming): Text/event-stream with chunks of text, ending with [DONE].
+500 Internal Server Error: If the LLM provider fails.
+
+Example
+```
+curl -X POST http://localhost:8000/v1/llm/ \
+  -H "Content-Type: application/json" \
+  -d '{
+    "prompt": "What is the capital of Brazil?",
+    "context": "Answer concisely with no explanation.",
+    "model": "llama-3.1-8b-instant",
+    "max_tokens": 50,
+    "temperature": 0.7,
+    "stream": false
+  }'
+```
+
+
+## Usage Guide
+### Using the LLM Endpoint
+The `/v1/llm/` endpoint allows clients to send a prompt and optional context to generate text using an LLM (e.g., LLaMA 3.1 via Groq Cloud). The request and response models are defined in `src/core/api/models/llm/request.py` and `src/core/api/models/llm/response.py`, respectively.
+### Example: Non-Streaming Request
+Send a prompt to get a concise response:
+```
+curl -X POST http://localhost:8000/v1/llm/ \
+  -H "Content-Type: application/json" \
+  -d '{
+    "prompt": "What is the capital of Brazil?",
+    "context": "Answer concisely with no explanation.",
+    "model": "llama-3.1-8b-instant",
+    "max_tokens": 50
+  }'
+```
+Response:
+```
+{
+  "response": "The capital is Brasília.",
+  "model": "llama-3.1-8b-instant",
+  "usage": {
+    "prompt_tokens": 12,
+    "completion_tokens": 6,
+    "total_tokens": 18
+  }
+}
+```
+
+### Example: Streaming Request
+Stream a response for a creative task:
+```
+curl -X POST http://localhost:8000/v1/llm/ \
+  -H "Content-Type: application/json" \
+  -d '{
+    "prompt": "Tell me a short story.",
+    "context": "The story should be about a dragon in a fantasy setting.",
+    "model": "llama-3.1-8b-instant",
+    "max_tokens": 100,
+    "stream": true
+  }'
+```
+Response: Streams tokens like "Once ", "upon ", "a time...", ending with `[DONE]`.
+
diff --git a/pytest.ini b/pytest.ini
@@ -0,0 +1,18 @@
+[pytest]
+# Configure asyncio for async tests
+asyncio_mode = strict
+asyncio_default_fixture_loop_scope = function
+
+# Test discovery patterns with wildcards
+python_files = test_*.py
+python_classes = Test*
+python_functions = test_*
+
+# Make sure Python can find your modules
+pythonpath = .
+
+# Directories that pytest must NOT descend into during test discovery
+norecursedirs = .* venv build dist
+
+markers =
+    e2e: Mark end-to-end test
diff --git a/requirements.txt b/requirements.txt
@@ -4,3 +4,9 @@ mkdocs-mermaid2-plugin
 networkx
 fastapi
 uvicorn
+python-dotenv
+groq
+pytest
+pytest-asyncio
+pytest-mock
+httpx
diff --git a/run.py b/run.py
@@ -1,4 +1,8 @@
 import uvicorn
+from dotenv import load_dotenv
+
+# Load environment variables from .env file
+load_dotenv()
 
 if __name__ == "__main__":
     uvicorn.run("src.core.main:app", 

diff --git a/src/core/api/models/llm/request.py b/src/core/api/models/llm/request.py
@@ -0,0 +1,19 @@
+from pydantic import BaseModel, Field, field_validator
+from typing import Optional
+
+class LLMRequest(BaseModel):
+    """ Request model for LLM API. """
+    prompt: str
+    context: Optional[str] = None
+    model: Optional[str] = 'llama-3.1-8b-instant'
+    max_tokens: Optional[int] = 512
+    temperature: Optional[float] = 0.7
+    top_p: Optional[float] = 0.9
+    stream: Optional[bool] = False
+
+    @field_validator('prompt')
+    @classmethod  # This is now required in V2
+    def prompt_must_not_be_empty(cls, v):
+        if not v or v.strip() == '':
+            raise ValueError('prompt cannot be empty')
+        return v
diff --git a/src/core/api/models/llm/response.py b/src/core/api/models/llm/response.py
@@ -0,0 +1,9 @@
+from pydantic import BaseModel
+from typing import Optional
+
+class LLMResponse(BaseModel):
+    """Response model for a non-streaming LLM API reply."""
+
+    # Text generated by the model
+    response: str
+    # Model identifier associated with the completion
+    model: str
+    # Token usage counters as a plain dict; None when no usage is available
+    usage: Optional[dict] = None
diff --git a/src/core/api/routes/llm.py b/src/core/api/routes/llm.py
@@ -0,0 +1,83 @@
+from fastapi import APIRouter, HTTPException
+from fastapi.responses import StreamingResponse
+from groq import Groq
+import os
+from dotenv import load_dotenv
+from src.core.api.models.llm.request import LLMRequest
+from src.core.api.models.llm.response import LLMResponse
+
+load_dotenv()
+
+router = APIRouter(prefix="/v1/llm", tags=["llm"])
+
+client = Groq(api_key=os.getenv("GROQ_API_KEY"))
+if not client.api_key:
+    raise HTTPException(status_code=500, detail="Groq API key is not set.")
+
+@router.post("/", response_model=LLMResponse)
+async def generate_response(request: LLMRequest):
+    """ Generate response the LLM provider based on prompt and optional context.
+    Supports streaming and non-streaming responses. """
+
+    try:
+        # combine prompt and context (if provided) in the request
+        messages = []
+        if request.context:
+            messages.append({"role": "system", "content": request.context})
+        messages.append({"role": "user", "content": request.prompt})
+
+        # Call the Groq API
+        response = client.chat.completions.create(
+            messages=messages,
+            model=request.model,
+            max_tokens=request.max_tokens,
+            temperature=request.temperature,
+            top_p=request.top_p,
+            stream=request.stream
+        )
+
+        # If streaming is enabled, return a streaming response
+        if request.stream:
+            def stream_response():
+                for chunk in response:
+                    content = chunk.choices[0].delta.content
+                    if content:
+                        yield content
+                yield "[DONE]"
+            return StreamingResponse(stream_response(), media_type="text/event-stream")
+        else:
+            # if not streaming, return the full response
+            full_response = response.choices[0].message.content
+            usage = (
+                response.usage.dict()
+                if hasattr(response, "usage") and response.usage
+                else None
+            )
+            return LLMResponse(
+                response=full_response,
+                model=response.model,
+                usage=usage
+            )
+
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Error communicating with LLM provider: {str(e)}")
+
+
+
+@router.get("/health")
+async def health_check():
+    """ Health check for the LLM Endpoint """
+
+    try:
+        # test a simple API call to verify Groq connection
+        response = client.chat.completions.create(
+            messages=[{"role": "user", "content": "Hello!"}],
+            model="llama-3.1-8b-instant",
+            max_tokens=10,
+        )
+        return {
+            "status": "healthy"
+        }
+
+    except Exception as e:
+        raise HTTPException(status_code=503, detail=f"LLM provider is not healthy: {str(e)}")
diff --git a/src/core/main.py b/src/core/main.py
@@ -4,6 +4,7 @@
 from src.core.domains.cooking.matchers import CookingMatcher
 from src.core.domains.cooking.validators import CookingValidator
 from src.core.registry.domain_registry import DomainRegistry
+from src.core.api.routes.llm import router as llm_router
 
 # Create FastAPI app
 app = FastAPI(title="Open Matching Engine API")
@@ -13,8 +14,14 @@ async def health_check():
     """Simple health check endpoint."""
     return {"status": "ok", "domains": list(DomainRegistry._extractors.keys())}
 
+# @app.get("/llm")
+# async def llm_check():
+#     """Simple LLM check endpoint."""
+#     return {"status": "ok", "llm": "gpt-3.5-turbo"}
+
 # Register routes
 app.include_router(match_router, tags=["matching"])
+app.include_router(llm_router, tags=["llm"])
 
 # Register domain components
 DomainRegistry.register_extractor("cooking", CookingExtractor())

diff --git a/src/core/models/supply_trees.py b/src/core/models/supply_trees.py
@@ -1,6 +1,6 @@
 from dataclasses import dataclass, field
 from datetime import datetime, timedelta
-from typing import Dict, List, Optional, Set, Union, Tuple
+from typing import Any, Dict, List, Optional, Set, Union, Tuple
 from enum import Enum
 from uuid import UUID, uuid4
 import networkx as nx

diff --git a/tests/llm/conftest.py b/tests/llm/conftest.py
@@ -0,0 +1,24 @@
+import pytest
+from fastapi.testclient import TestClient
+from src.core.main import app
+from unittest.mock import MagicMock, patch
+
+@pytest.fixture
+def client():
+    """Return a FastAPI TestClient wrapping the application imported from src.core.main."""
+    return TestClient(app)
+
+@pytest.fixture
+def mock_groq():
+    """Fixture to mock Groq client."""
+    # Create a mock with the properly configured return value
+    mock_client = MagicMock()
+
+    # Configure the mock to work with async code
+    mock_create = MagicMock()
+    # Make the mock.return_value property directly accessible (not awaitable)
+    mock_client.chat.completions.create.return_value = mock_create
+
+    # Patch the client in the LLM route module
+    with patch("src.core.api.routes.llm.client", mock_client):
+        yield mock_client
diff --git a/tests/llm/e2e/__init__.py b/tests/llm/e2e/__init__.py
diff --git a/tests/llm/e2e/test_llm_workflow.py b/tests/llm/e2e/test_llm_workflow.py
@@ -0,0 +1,27 @@
+import pytest
+from fastapi.testclient import TestClient
+from unittest.mock import MagicMock
+
+# Register the e2e marker to avoid the warning
+pytest.mark.e2e = pytest.mark.skipif(False, reason="E2E test marker")
+
+@pytest.mark.e2e
+def test_llm_workflow(client, mock_groq):
+    """End-to-end test for LLM workflow"""
+    # Setup mock response at the correct path in the mock structure
+    mock_response = mock_groq.chat.completions.create.return_value
+    mock_response.choices = [MagicMock()]
+    mock_response.choices[0].message = MagicMock()
+    mock_response.choices[0].message.content = '{"result": "processed data"}'
+    mock_response.model = "llama-3.1-8b-instant"
+    mock_response.usage = MagicMock()
+    mock_response.usage.dict.return_value = {"total_tokens": 20}
+
+    # Call endpoint
+    response = client.post(
+        "/v1/llm/",
+        json={"prompt": "Process this text", "context": "Return JSON"}
+    )
+
+    assert response.status_code == 200
+    assert "result" in response.json()["response"]
diff --git a/tests/llm/integration/__init__.py b/tests/llm/integration/__init__.py
diff --git a/tests/llm/integration/test_llm_routes.py b/tests/llm/integration/test_llm_routes.py
@@ -0,0 +1,25 @@
+import pytest
+from fastapi.testclient import TestClient
+from unittest.mock import MagicMock
+
+def test_llm_endpoint_non_streaming(client, mock_groq):
+    """Test non-streaming LLM endpoint"""
+    # Setup mock response at the correct place in the mock structure
+    mock_response = mock_groq.chat.completions.create.return_value
+    mock_response.choices = [MagicMock()]
+    mock_response.choices[0].message = MagicMock()
+    mock_response.choices[0].message.content = "Test response"
+    mock_response.model = "llama-3.1-8b-instant"
+    mock_response.usage = MagicMock()
+    mock_response.usage.dict.return_value = {"total_tokens": 10}
+
+    # Make request to endpoint
+    response = client.post(
+        "/v1/llm/",
+        json={"prompt": "Test prompt"}
+    )
+
+    assert response.status_code == 200
+    assert response.json()["response"] == "Test response"
+
+# Similar updates for other failing tests...
diff --git a/tests/llm/unit/__init__.py b/tests/llm/unit/__init__.py
Original file line number	Diff line number	Diff line change
Expand Up		@@ -172,3 +172,5 @@ cython_debug/

		# PyPI configuration file
		.pypirc

		*.txt