crate · amotl · May 16, 2025 · May 10, 2025 · May 10, 2025 · May 10, 2025
diff --git a/.github/workflows/release-pypi.yml b/.github/workflows/release-pypi.yml
@@ -60,7 +60,7 @@ jobs:
           # `uv publish` does not understand `--skip-existing`.
           # https://github.com/astral-sh/uv/issues/7917
           # https://github.com/astral-sh/uv/issues/12369
-          uvx twine upload --non-interactive --repository-url https://test.pypi.org/legacy/ --skip-existing dist/*
+          uvx twine upload --non-interactive --verbose --repository-url https://test.pypi.org/legacy/ --skip-existing dist/*
 
       - name: Publish package to PyPI
         if: startsWith(github.event.ref, 'refs/tags')

diff --git a/cratedb_mcp/__main__.py b/cratedb_mcp/__main__.py
@@ -2,21 +2,24 @@
 import httpx
 from mcp.server.fastmcp import FastMCP
 
-from .knowledge import DOCUMENTATION_INDEX, Queries
-from .settings import DOCS_CACHE_TTL, HTTP_URL
+from .knowledge import DocumentationIndex, Queries, documentation_url_permitted
+from .settings import DOCS_CACHE_TTL, HTTP_TIMEOUT, HTTP_URL
 
 # Configure Hishel, an httpx client with caching.
 # Define one hour of caching time.
 controller = hishel.Controller(allow_stale=True)
 storage = hishel.SQLiteStorage(ttl=DOCS_CACHE_TTL)
 client = hishel.CacheClient(controller=controller, storage=storage)
 
+# Load CrateDB documentation outline.
+documentation_index = DocumentationIndex()
+
 # Create FastMCP application object.
 mcp = FastMCP("cratedb-mcp")
 
 
 def query_cratedb(query: str) -> list[dict]:
-    return httpx.post(f'{HTTP_URL}/_sql', json={'stmt': query}).json()
+    return httpx.post(f'{HTTP_URL}/_sql', json={'stmt': query}, timeout=HTTP_TIMEOUT).json()
-    return httpx.post(f'{HTTP_URL}/_sql', json={'stmt': query}, timeout=HTTP_TIMEOUT).json()
+def query_cratedb(query: str) -> list[dict]:
+    try:
+        response = httpx.post(f'{HTTP_URL}/_sql', json={'stmt': query}, timeout=HTTP_TIMEOUT)
+        response.raise_for_status()  # Raise an exception for 4XX/5XX responses
+        return response.json()
+    except httpx.TimeoutException:
+        raise ValueError(f"Database query timed out after {HTTP_TIMEOUT} seconds")
+    except httpx.HTTPStatusError as e:
+        raise ValueError(f"HTTP error {e.response.status_code} while querying database")
+    except httpx.RequestError as e:
+        raise ValueError(f"Request error while querying database: {str(e)}")
+    except ValueError as e:
+        raise ValueError(f"Invalid JSON response from database: {str(e)}")
-    return httpx.post(f'{HTTP_URL}/_sql', json={'stmt': query}, timeout=HTTP_TIMEOUT).json()
+def query_cratedb(query: str) -> list[dict]:
+    try:
+        response = httpx.post(f'{HTTP_URL}/_sql', json={'stmt': query}, timeout=HTTP_TIMEOUT)
+        response.raise_for_status()  # Raise an exception for 4XX/5XX responses
+        return response.json()
+    except httpx.TimeoutException:
+        raise ValueError(f"Database query timed out after {HTTP_TIMEOUT} seconds")
+    except httpx.HTTPStatusError as e:
+        raise ValueError(f"HTTP error {e.response.status_code} while querying database")
+    except httpx.RequestError as e:
+        raise ValueError(f"Request error while querying database: {str(e)}")
+    except ValueError as e:
+        raise ValueError(f"Invalid JSON response from database: {str(e)}")
 
 
 @mcp.tool(description="Send a SQL query to CrateDB, only 'SELECT' queries are allows, queries that"
@@ -27,17 +30,17 @@ def query_sql(query: str):
     return query_cratedb(query)
 
 @mcp.tool(description='Gets an index with CrateDB documentation links to fetch, should download docs'
-                      ' before answering questions. Has documentation name, description and link.')
+                      ' before answering questions. Has documentation title, description, and link.')
 def get_cratedb_documentation_index():
-    return DOCUMENTATION_INDEX
+    return documentation_index.items()
-    return documentation_index.items()
+def get_cratedb_documentation_index():
+    try:
+        return documentation_index.items()
+    except Exception as e:
+        import logging
+        logging.error(f"Failed to access documentation index: {e}")
+        # Return a minimal fallback
+        return []
-    return documentation_index.items()
+def get_cratedb_documentation_index():
+    try:
+        return documentation_index.items()
+    except Exception as e:
+        import logging
+        logging.error(f"Failed to access documentation index: {e}")
+        # Return a minimal fallback
+        return []
 
 @mcp.tool(description='Downloads the latest CrateDB documentation piece by link.'
                       ' Only used to download CrateDB docs.')
 def fetch_cratedb_docs(link: str):
-    """Fetches a CrateDB documentation link from GitHub raw content."""
-    if 'https://raw.githubusercontent.com/crate/crate/' not in link:
-        raise ValueError('Only github cratedb links can be fetched.')
-    return client.get(link).text
+    """Fetches a CrateDB documentation link."""
+    if not documentation_url_permitted(link):
+        raise ValueError(f'Link is not permitted: {link}')
+    return client.get(link, timeout=HTTP_TIMEOUT).text
-    return client.get(link, timeout=HTTP_TIMEOUT).text
+    try:
+        response = client.get(link, timeout=HTTP_TIMEOUT)
+        response.raise_for_status()  # Raise an exception for 4XX/5XX responses
+        return response.text
+    except httpx.TimeoutException:
+        raise ValueError(f"Request timed out after {HTTP_TIMEOUT} seconds: {link}")
+    except httpx.HTTPStatusError as e:
+        raise ValueError(f"HTTP error {e.response.status_code} while fetching: {link}")
+    except httpx.RequestError as e:
+        raise ValueError(f"Request error while fetching: {link} - {str(e)}")
-    return client.get(link, timeout=HTTP_TIMEOUT).text
+    try:
+        response = client.get(link, timeout=HTTP_TIMEOUT)
+        response.raise_for_status()  # Raise an exception for 4XX/5XX responses
+        return response.text
+    except httpx.TimeoutException:
+        raise ValueError(f"Request timed out after {HTTP_TIMEOUT} seconds: {link}")
+    except httpx.HTTPStatusError as e:
+        raise ValueError(f"HTTP error {e.response.status_code} while fetching: {link}")
+    except httpx.RequestError as e:
+        raise ValueError(f"Request error while fetching: {link} - {str(e)}")
 
 @mcp.tool(description="Returns an aggregation of all CrateDB's schema, tables and their metadata")
 def get_table_metadata() -> list[dict]:

diff --git a/cratedb_mcp/knowledge.py b/cratedb_mcp/knowledge.py
@@ -1,4 +1,7 @@
 # ruff: noqa: E501
+import cachetools
+from cratedb_about import CrateDbKnowledgeOutline
+
 
 class Queries:
     TABLES_METADATA = """
@@ -83,20 +86,38 @@ class Queries:
     ORDER BY severity DESC"""
 
 
-# 'description' is very important, it gives context to the LLMs to properly decide which one to use.
-DOCUMENTATION_INDEX = [
-    # TODO: Add all there are.
-    {
-        "name": "about/overview",
-        "description": "The most important factual and technical information about CrateDB per medium-sized (~300kB) llms.txt context file.",
-        "link": "https://cdn.crate.io/about/v1/llms.txt"},
-    {
-        "name": "scalar functions",
-        "description": "documentation about specific scalar/methods/functions for CrateDB SQL",
-        "link": "https://raw.githubusercontent.com/crate/crate/refs/heads/5.10/docs/general/builtins/scalar-functions.rst"},
-    {
-        "name": "optimize query 101",
-        "description": "documentation about optimizing CrateDB SQL statements",
-        "link": "https://raw.githubusercontent.com/crate/cratedb-guide/9ab661997d7704ecbb63af9c3ee33535957e24e6/docs/performance/optimization.rst"
-    }
-]
+class DocumentationIndex:
+    """
+    Define documentation sections supplied to the MCP server.
+    Load knowledge outline from YAML file and read all items.
+
+    The `description` attribute is very important, it gives context
+    to the LLM to properly decide which one to use.
+
+    Canonical source: https://github.com/crate/about/blob/main/src/cratedb_about/outline/cratedb-outline.yaml
+
+    Examples:
+    ```yaml
+    - title: "CrateDB SQL functions"
+      link: https://cratedb.com/docs/crate/reference/en/latest/_sources/general/builtins/scalar-functions.rst.txt
+      description: The reference documentation about all SQL functions CrateDB provides.
+
+    - title: "Guide: CrateDB query optimization"
+      link: https://cratedb.com/docs/guide/_sources/performance/optimization.rst.txt
+      description: Essential principles for optimizing queries in CrateDB while avoiding the most common pitfalls.
+    ```
+    """
+
+    def __init__(self):
+        """Load the knowledge outline and set up this instance's result cache."""
+        self.outline = CrateDbKnowledgeOutline.load()
+        # Per-instance cache. A decorator-level `cache={}` would be a single
+        # dictionary shared by all instances and keyed on `self`, which keeps
+        # every instance alive for the lifetime of the process.
+        self._items_cache = {}
+
+    @cachetools.cachedmethod(lambda self: self._items_cache)
+    def items(self):
+        """Return the result of `find_items().to_dict()` on the outline, computed once per instance."""
+        return self.outline.find_items().to_dict()
+
+
+def documentation_url_permitted(url: str) -> bool:
+    """
+    Check whether `url` may be fetched by the documentation tool.
+
+    Permitted are URLs on `cratedb.com`, and URLs below the `crate`
+    account on `github.com` and `raw.githubusercontent.com`.
+
+    Each prefix ends with a slash, so that look-alike account names such
+    as `https://github.com/crate-evil/` do not pass the check.
+    """
+    permitted_prefixes = (
+        "https://cratedb.com/",
+        "https://github.com/crate/",
+        "https://raw.githubusercontent.com/crate/",
+    )
+    # `str.startswith` accepts a tuple and returns True on the first match.
+    return url.startswith(permitted_prefixes)
diff --git a/cratedb_mcp/settings.py b/cratedb_mcp/settings.py
@@ -12,3 +12,6 @@
     # TODO: Add software test after refactoring away from module scope.
     warnings.warn(f"Environment variable `CRATEDB_MCP_DOCS_CACHE_TTL` invalid: {e}. "
                   f"Using default value: {DOCS_CACHE_TTL}.", category=UserWarning, stacklevel=2)
+
+# Configure the timeout, in seconds, applied to all outbound HTTP requests.
+HTTP_TIMEOUT = 10.0
diff --git a/pyproject.toml b/pyproject.toml
@@ -20,6 +20,8 @@ dynamic = [
   "version",
 ]
 dependencies = [
+  "cachetools<6",
+  "cratedb-about==0.0.4",
   "hishel<0.2",
   "mcp[cli]>=1.5.0",
 ]

diff --git a/tests/test_knowledge.py b/tests/test_knowledge.py
@@ -1,10 +1,14 @@
-from cratedb_mcp.knowledge import DOCUMENTATION_INDEX, Queries
+from cratedb_mcp.knowledge import DocumentationIndex, Queries
 
 
 def test_documentation_index():
-    assert len(DOCUMENTATION_INDEX) == 3
-    assert DOCUMENTATION_INDEX[1]["name"] == "scalar functions"
-    assert DOCUMENTATION_INDEX[2]["name"] == "optimize query 101"
+    documentation_index = DocumentationIndex()
+    titles = [item["title"] for item in documentation_index.items()]
+    assert len(titles) >= 50
+    assert "CrateDB database" in titles
+    assert "CrateDB features" in titles
+    assert "CrateDB SQL reference: Scalar functions" in titles
+    assert "Guide: CrateDB query optimization" in titles
 
 
 def test_queries():

diff --git a/tests/test_mcp.py b/tests/test_mcp.py
@@ -15,15 +15,20 @@ def test_get_documentation_index():
 
 def test_fetch_docs_forbidden():
     with pytest.raises(ValueError) as ex:
-        fetch_cratedb_docs("https://cratedb.com/docs/crate/reference/en/latest/_sources/general/builtins/scalar-functions.rst.txt")
-    assert ex.match("Only github cratedb links can be fetched")
+        fetch_cratedb_docs("https://example.com")
+    assert ex.match("Link is not permitted: https://example.com")
 
 
-def test_fetch_docs_permitted():
+def test_fetch_docs_permitted_github():
     response = fetch_cratedb_docs("https://raw.githubusercontent.com/crate/crate/refs/heads/5.10/docs/general/builtins/scalar-functions.rst")
     assert "initcap" in response
 
 
+def test_fetch_docs_permitted_cratedb_com():
+    response = fetch_cratedb_docs("https://cratedb.com/docs/crate/reference/en/latest/_sources/general/builtins/scalar-functions.rst.txt")
+    assert "initcap" in response
+
+
 def test_query_sql_forbidden():
     with pytest.raises(ValueError) as ex:
         assert "RelationUnknown" in str(query_sql("INSERT INTO foobar (id) VALUES (42) RETURNING id"))