diff --git a/.github/workflows/release-pypi.yml b/.github/workflows/release-pypi.yml index d180562..e32472c 100644 --- a/.github/workflows/release-pypi.yml +++ b/.github/workflows/release-pypi.yml @@ -60,7 +60,7 @@ jobs: # `uv publish` does not understand `--skip-existing`. # https://github.com/astral-sh/uv/issues/7917 # https://github.com/astral-sh/uv/issues/12369 - uvx twine upload --non-interactive --repository-url https://test.pypi.org/legacy/ --skip-existing dist/* + uvx twine upload --non-interactive --verbose --repository-url https://test.pypi.org/legacy/ --skip-existing dist/* - name: Publish package to PyPI if: startsWith(github.event.ref, 'refs/tags') diff --git a/cratedb_mcp/__main__.py b/cratedb_mcp/__main__.py index 3dd235d..77dbd7d 100644 --- a/cratedb_mcp/__main__.py +++ b/cratedb_mcp/__main__.py @@ -2,8 +2,8 @@ import httpx from mcp.server.fastmcp import FastMCP -from .knowledge import DOCUMENTATION_INDEX, Queries -from .settings import DOCS_CACHE_TTL, HTTP_URL +from .knowledge import DocumentationIndex, Queries, documentation_url_permitted +from .settings import DOCS_CACHE_TTL, HTTP_TIMEOUT, HTTP_URL # Configure Hishel, an httpx client with caching. # Define one hour of caching time. @@ -11,12 +11,15 @@ storage = hishel.SQLiteStorage(ttl=DOCS_CACHE_TTL) client = hishel.CacheClient(controller=controller, storage=storage) +# Load CrateDB documentation outline. +documentation_index = DocumentationIndex() + # Create FastMCP application object. 
mcp = FastMCP("cratedb-mcp") def query_cratedb(query: str) -> list[dict]: - return httpx.post(f'{HTTP_URL}/_sql', json={'stmt': query}).json() + return httpx.post(f'{HTTP_URL}/_sql', json={'stmt': query}, timeout=HTTP_TIMEOUT).json() @mcp.tool(description="Send a SQL query to CrateDB, only 'SELECT' queries are allows, queries that" @@ -27,17 +30,17 @@ def query_sql(query: str): return query_cratedb(query) @mcp.tool(description='Gets an index with CrateDB documentation links to fetch, should download docs' - ' before answering questions. Has documentation name, description and link.') + ' before answering questions. Has documentation title, description, and link.') def get_cratedb_documentation_index(): - return DOCUMENTATION_INDEX + return documentation_index.items() @mcp.tool(description='Downloads the latest CrateDB documentation piece by link.' ' Only used to download CrateDB docs.') def fetch_cratedb_docs(link: str): - """Fetches a CrateDB documentation link from GitHub raw content.""" - if 'https://raw.githubusercontent.com/crate/crate/' not in link: - raise ValueError('Only github cratedb links can be fetched.') - return client.get(link).text + """Fetches a CrateDB documentation link.""" + if not documentation_url_permitted(link): + raise ValueError(f'Link is not permitted: {link}') + return client.get(link, timeout=HTTP_TIMEOUT).text @mcp.tool(description="Returns an aggregation of all CrateDB's schema, tables and their metadata") def get_table_metadata() -> list[dict]: diff --git a/cratedb_mcp/knowledge.py b/cratedb_mcp/knowledge.py index f3a4e2e..4ae5a3b 100644 --- a/cratedb_mcp/knowledge.py +++ b/cratedb_mcp/knowledge.py @@ -1,4 +1,7 @@ # ruff: noqa: E501 +import cachetools +from cratedb_about import CrateDbKnowledgeOutline + class Queries: TABLES_METADATA = """ @@ -83,20 +86,38 @@ class Queries: ORDER BY severity DESC""" -# 'description' is very important, it gives context to the LLMs to properly decide which one to use. 
-DOCUMENTATION_INDEX = [ - # TODO: Add all there are. - { - "name": "about/overview", - "description": "The most important factual and technical information about CrateDB per medium-sized (~300kB) llms.txt context file.", - "link": "https://cdn.crate.io/about/v1/llms.txt"}, - { - "name": "scalar functions", - "description": "documentation about specific scalar/methods/functions for CrateDB SQL", - "link": "https://raw.githubusercontent.com/crate/crate/refs/heads/5.10/docs/general/builtins/scalar-functions.rst"}, - { - "name": "optimize query 101", - "description": "documentation about optimizing CrateDB SQL statements", - "link": "https://raw.githubusercontent.com/crate/cratedb-guide/9ab661997d7704ecbb63af9c3ee33535957e24e6/docs/performance/optimization.rst" - } -] +class DocumentationIndex: + """ + Define documentation sections supplied to the MCP server. + Load knowledge outline from YAML file and read all items. + + The `description` attribute is very important, it gives context + to the LLM to properly decide which one to use. + + Canonical source: https://github.com/crate/about/blob/main/src/cratedb_about/outline/cratedb-outline.yaml + + Examples: + ```yaml + - title: "CrateDB SQL functions" + link: https://cratedb.com/docs/crate/reference/en/latest/_sources/general/builtins/scalar-functions.rst.txt + description: The reference documentation about all SQL functions CrateDB provides. + + - title: "Guide: CrateDB query optimization" + link: https://cratedb.com/docs/guide/_sources/performance/optimization.rst.txt + description: Essential principles for optimizing queries in CrateDB while avoiding the most common pitfalls. 
+ ``` + """ + + def __init__(self): + self.outline = CrateDbKnowledgeOutline.load() + + @cachetools.cached(cache={}) + def items(self): + return self.outline.find_items().to_dict() + + +def documentation_url_permitted(url: str) -> bool: + return (  # Trailing "/" pins the exact host and the "crate" org; a bare prefix would also match e.g. "crate-evil". + url.startswith("https://cratedb.com/") or + url.startswith("https://github.com/crate/") or + url.startswith("https://raw.githubusercontent.com/crate/")) diff --git a/cratedb_mcp/settings.py b/cratedb_mcp/settings.py index 6988f0d..76acdca 100644 --- a/cratedb_mcp/settings.py +++ b/cratedb_mcp/settings.py @@ -12,3 +12,6 @@ # TODO: Add software test after refactoring away from module scope. warnings.warn(f"Environment variable `CRATEDB_MCP_DOCS_CACHE_TTL` invalid: {e}. " f"Using default value: {DOCS_CACHE_TTL}.", category=UserWarning, stacklevel=2) + +# Configure HTTP timeout for all conversations. +HTTP_TIMEOUT = 10.0 diff --git a/pyproject.toml b/pyproject.toml index ecc6007..4d404d5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,6 +20,8 @@ dynamic = [ "version", ] dependencies = [ + "cachetools<6", + "cratedb-about==0.0.4", "hishel<0.2", "mcp[cli]>=1.5.0", ] diff --git a/tests/test_knowledge.py b/tests/test_knowledge.py index 0b4018e..8855d97 100644 --- a/tests/test_knowledge.py +++ b/tests/test_knowledge.py @@ -1,10 +1,14 @@ -from cratedb_mcp.knowledge import DOCUMENTATION_INDEX, Queries +from cratedb_mcp.knowledge import DocumentationIndex, Queries def test_documentation_index(): - assert len(DOCUMENTATION_INDEX) == 3 - assert DOCUMENTATION_INDEX[1]["name"] == "scalar functions" - assert DOCUMENTATION_INDEX[2]["name"] == "optimize query 101" + documentation_index = DocumentationIndex() + titles = [item["title"] for item in documentation_index.items()] + assert len(titles) >= 50 + assert "CrateDB database" in titles + assert "CrateDB features" in titles + assert "CrateDB SQL reference: Scalar functions" in titles + assert "Guide: CrateDB query optimization" in titles def test_queries(): diff --git 
a/tests/test_mcp.py b/tests/test_mcp.py index 8ea823a..d62a2de 100644 --- a/tests/test_mcp.py +++ b/tests/test_mcp.py @@ -15,15 +15,20 @@ def test_get_documentation_index(): def test_fetch_docs_forbidden(): with pytest.raises(ValueError) as ex: - fetch_cratedb_docs("https://cratedb.com/docs/crate/reference/en/latest/_sources/general/builtins/scalar-functions.rst.txt") - assert ex.match("Only github cratedb links can be fetched") + fetch_cratedb_docs("https://example.com") + assert ex.match("Link is not permitted: https://example.com") -def test_fetch_docs_permitted(): +def test_fetch_docs_permitted_github(): response = fetch_cratedb_docs("https://raw.githubusercontent.com/crate/crate/refs/heads/5.10/docs/general/builtins/scalar-functions.rst") assert "initcap" in response +def test_fetch_docs_permitted_cratedb_com(): + response = fetch_cratedb_docs("https://cratedb.com/docs/crate/reference/en/latest/_sources/general/builtins/scalar-functions.rst.txt") + assert "initcap" in response + + def test_query_sql_forbidden(): with pytest.raises(ValueError) as ex: assert "RelationUnknown" in str(query_sql("INSERT INTO foobar (id) VALUES (42) RETURNING id"))