diff --git a/README.md b/README.md
index a92cc77..3dcceb7 100644
--- a/README.md
+++ b/README.md
@@ -92,10 +92,14 @@ with GlobalGPUController(gpu_ids=[0, 1], vram_to_keep="750MB", interval=90, busy
 ### MCP endpoint (experimental)
 
-- Start a simple JSON-RPC server on stdin/stdout:
+- Start a simple JSON-RPC server on stdin/stdout (default):
   ```bash
   keep-gpu-mcp-server
   ```
+- Or expose it over HTTP (JSON-RPC via POST):
+  ```bash
+  keep-gpu-mcp-server --mode http --host 0.0.0.0 --port 8765
+  ```
 - Example request (one per line):
   ```json
   {"id": 1, "method": "start_keep", "params": {"gpu_ids": [0], "vram": "512MB", "interval": 60, "busy_threshold": 20}}
@@ -108,11 +112,31 @@
       command: ["keep-gpu-mcp-server"]
       adapter: stdio
   ```
+- Minimal client config (HTTP MCP):
+  ```yaml
+  servers:
+    keepgpu:
+      url: http://127.0.0.1:8765/
+      adapter: http
+  ```
+- Remote/SSH tunnel example (HTTP):
+  ```bash
+  keep-gpu-mcp-server --mode http --host 0.0.0.0 --port 8765
+  ```
+  Client config (replace hostname/tunnel as needed):
+  ```yaml
+  servers:
+    keepgpu:
+      url: http://gpu-box.example.com:8765/
+      adapter: http
+  ```
+  For untrusted networks, put the server behind your own auth/reverse-proxy or
+  tunnel over SSH (for example, `ssh -L 8765:localhost:8765 gpu-box`).
 
 ## Contributing
 
 Contributions are welcome—especially around ROCm support, platform fallbacks,
 and scheduler-specific recipes. Open an issue or PR if you hit edge cases on
 your cluster.
-See `docs/contributing.md` for dev setup, test commands, and PR tips.
+See [docs/contributing.md](docs/contributing.md) for dev setup, test commands, and PR tips.
 
 ## Credits
diff --git a/docs/contributing.md b/docs/contributing.md
index 9754a33..03ff180 100644
--- a/docs/contributing.md
+++ b/docs/contributing.md
@@ -39,14 +39,29 @@ expectations so you can get productive quickly and avoid surprises in CI.
    pre-commit run --all-files
    ```
 
+## Docs
+
+- Live preview:
+  ```bash
+  mkdocs serve
+  ```
+- Build the static site:
+  ```bash
+  mkdocs build
+  ```
+
 ## MCP server (experimental)
 
 - Start: `keep-gpu-mcp-server` (stdin/stdout JSON-RPC)
+- HTTP option: `keep-gpu-mcp-server --mode http --host 0.0.0.0 --port 8765`
 - Methods: `start_keep`, `stop_keep`, `status`, `list_gpus`
 - Example request:
   ```json
   {"id":1,"method":"start_keep","params":{"gpu_ids":[0],"vram":"512MB","interval":60,"busy_threshold":20}}
   ```
+- Remote tip: for shared clusters, prefer HTTP behind your own auth/reverse-proxy
+  or tunnel with SSH (`ssh -L 8765:localhost:8765 gpu-box`), then point your MCP
+  client at `http://127.0.0.1:8765/`.
 
 ## Pull requests
diff --git a/docs/getting-started.md b/docs/getting-started.md
index 00b8f55..1576e2a 100644
--- a/docs/getting-started.md
+++ b/docs/getting-started.md
@@ -9,9 +9,10 @@ understand the minimum knobs you need to keep a GPU occupied.
 - Python 3.9+ (matching the version in your environment/cluster image).
 - Optional but recommended: `nvidia-smi` in `PATH` for utilization monitoring
   (CUDA) or `rocm-smi` if you install the `rocm` extra.
 
-!!! warning "ROCm & multi-tenant clusters"
-    The current release focuses on CUDA devices. ROCm/AMD support is experimental;
-    controllers will raise `NotImplementedError` if CUDA is unavailable.
+!!! info "Platforms"
+    CUDA is the primary path; ROCm is supported via the `rocm` extra
+    (requires a ROCm-enabled PyTorch build). CPU-only environments can import
+    the package but controllers will not start.
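+
+Not sure which path applies on your machine? Below is a minimal sketch using
+PyTorch's standard build/device queries (`torch.version.hip` is `None` on
+CUDA-only builds):
+
+```python
+import torch
+
+# ROCm builds of PyTorch still expose devices through torch.cuda.*;
+# torch.version.hip tells the two builds apart.
+if torch.version.hip is not None:
+    print("ROCm build detected")
+elif torch.cuda.is_available():
+    print(f"CUDA build, {torch.cuda.device_count()} device(s) visible")
+else:
+    print("CPU-only build: KeepGPU controllers will not start here")
+```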
 
 ## Install
@@ -39,44 +40,6 @@ understand the minimum knobs you need to keep a GPU occupied.
     pip install keep-gpu
     ```
 
-## For contributors
-
-- Install dev extras: `pip install -e ".[dev]"` (append `.[rocm]` if you need ROCm SMI).
-- Fast CUDA checks: `pytest tests/cuda_controller tests/global_controller tests/utilities/test_platform_manager.py tests/test_cli_thresholds.py`
-- ROCm-only tests are marked `rocm`; run with `pytest --run-rocm tests/rocm_controller`.
-
-## MCP endpoint (experimental)
-
-For automation clients that speak JSON-RPC (MCP-style), KeepGPU ships a tiny
-stdin/stdout server:
-
-```bash
-keep-gpu-mcp-server
-# each request is a single JSON line; example:
-echo '{"id":1,"method":"start_keep","params":{"gpu_ids":[0],"vram":"512MB","interval":60,"busy_threshold":20}}' | keep-gpu-mcp-server
-```
-
-Supported methods:
-- `start_keep(gpu_ids?, vram?, interval?, busy_threshold?, job_id?)`
-- `status(job_id?)`
-- `stop_keep(job_id?)` (no job_id stops all)
-- `list_gpus()` (basic info)
-
-### Example MCP client config (stdio)
-
-If your agent expects an MCP server definition, a minimal stdio config looks like:
-
-```yaml
-servers:
-  keepgpu:
-    description: "KeepGPU MCP server"
-    command: ["keep-gpu-mcp-server"]
-    adapter: stdio
-```
-
-Tools exposed: `start_keep`, `stop_keep`, `status`, `list_gpus`. Each request is
-a single JSON line; see above for an example payload.
-
 === "Editable dev install"
     ```bash
     git clone https://github.com/Wangmerlyn/KeepGPU.git
     cd KeepGPU
     pip install -e .[dev]
     ```
 
+## Pick your interface
+
+- **CLI** – fastest way to reserve GPUs from a shell; see [CLI Playbook](guides/cli.md).
+- **Python module** – embed keep-alive loops inside orchestration code; see [Python API Recipes](guides/python.md).
+- **MCP server** – expose KeepGPU over JSON-RPC (stdio or HTTP) for agents; see [MCP Server](guides/mcp.md).
+
 ## Sanity check
 
 1. Make sure PyTorch can see at least one device:
@@ -119,7 +88,8 @@ ready to hand the GPU back, hit `Ctrl+C`—controllers will release VRAM and exi
 ## KeepGPU inside Python
 
-The CLI wraps the same controllers you can import directly:
+Prefer code-level control? Import the controllers directly (full recipes in
+[Python API Recipes](guides/python.md)):
 
 ```python
 from keep_gpu.single_gpu_controller.cuda_gpu_controller import CudaGPUController
@@ -141,3 +111,8 @@ with GlobalGPUController(gpu_ids=[0, 1], vram_to_keep="750MB", interval=60):
 
 From here, jump to the CLI Playbook for scenario-driven guidance or the API
 recipes if you need to embed KeepGPU in orchestration scripts.
+
+## For contributors
+
+Developing locally? See [Contributing](contributing.md) for dev install, test
+commands (including CUDA/ROCm markers), and PR tips.
diff --git a/docs/guides/mcp.md b/docs/guides/mcp.md
new file mode 100644
index 0000000..f8b437c
--- /dev/null
+++ b/docs/guides/mcp.md
@@ -0,0 +1,92 @@
+# MCP Server
+
+Expose KeepGPU as a minimal JSON-RPC server (MCP-style) so agents or remote
+orchestrators can start/stop keep-alive jobs and inspect GPU state.
+
+## When to use this
+
+- You run KeepGPU from an agent (LangChain, custom orchestrator, etc.) instead
+  of a shell (a minimal Python client sketch follows this list).
+- You want to keep GPUs alive on a remote box over TCP rather than stdio.
+- You need a quick way to list GPU utilization/memory through the same interface.
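+
+If your orchestrator is Python, the exchange is easy to drive directly. The
+sketch below is a hypothetical client (not part of KeepGPU): it spawns the
+stdio server with `subprocess` and sends one JSON request per line, which is
+all the protocol requires.
+
+```python
+import json
+import subprocess
+
+# Spawn the stdio transport; each request and response is a single JSON line.
+proc = subprocess.Popen(
+    ["keep-gpu-mcp-server"],
+    stdin=subprocess.PIPE,
+    stdout=subprocess.PIPE,
+    text=True,
+)
+proc.stdin.write(json.dumps({"id": 1, "method": "list_gpus"}) + "\n")
+proc.stdin.flush()
+print(json.loads(proc.stdout.readline()))  # {"id": 1, "result": {"gpus": [...]}}
+proc.stdin.close()
+proc.wait()
+```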
+
+## Quick start
+
+=== "stdio (default)"
+    ```bash
+    keep-gpu-mcp-server
+    ```
+
+    Send one JSON request per line:
+
+    ```bash
+    echo '{"id":1,"method":"start_keep","params":{"gpu_ids":[0],"vram":"512MB","interval":60,"busy_threshold":20}}' | keep-gpu-mcp-server
+    ```
+
+=== "HTTP"
+    ```bash
+    keep-gpu-mcp-server --mode http --host 0.0.0.0 --port 8765
+    ```
+
+    Query it over HTTP:
+
+    ```bash
+    curl -X POST http://127.0.0.1:8765/ \
+      -H "content-type: application/json" \
+      -d '{"id":1,"method":"status"}'
+    ```
+
+Supported methods:
+
+- `start_keep(gpu_ids?, vram?, interval?, busy_threshold?, job_id?)`
+- `stop_keep(job_id?)` (omit `job_id` to stop all)
+- `status(job_id?)` (omit `job_id` to list active jobs)
+- `list_gpus()` (detailed info via NVML/ROCm SMI/torch)
+
+## Client configs (MCP-style)
+
+=== "stdio adapter"
+    ```yaml
+    servers:
+      keepgpu:
+        description: "KeepGPU MCP server"
+        command: ["keep-gpu-mcp-server"]
+        adapter: stdio
+    ```
+
+=== "HTTP adapter"
+    ```yaml
+    servers:
+      keepgpu:
+        url: http://127.0.0.1:8765/
+        adapter: http
+    ```
+
+## Remote/cluster usage
+
+- Run on the GPU host:
+  ```bash
+  keep-gpu-mcp-server --mode http --host 0.0.0.0 --port 8765
+  ```
+- Point your client at the host:
+  ```yaml
+  servers:
+    keepgpu:
+      url: http://gpu-box.example.com:8765/
+      adapter: http
+  ```
+- If the network is untrusted, tunnel instead of exposing the port:
+  ```bash
+  ssh -L 8765:localhost:8765 gpu-box.example.com
+  ```
+  Then use `http://127.0.0.1:8765/` in your MCP config. For multi-user clusters,
+  consider fronting the service with your own auth/reverse-proxy.
+
+## Responses you can expect
+
+```json
+{"id":1,"result":{"job_id":"<job-id>"}}    # start_keep
+{"id":2,"result":{"stopped":["<job-id>"]}} # stop_keep
+{"id":3,"result":{"active":true,"job_id":"<job-id>","params":{"gpu_ids":[0]}}}
+{"id":4,"result":{"active_jobs":[{"job_id":"<job-id>","params":{"gpu_ids":[0]}}]}}
+{"id":5,"result":{"gpus":[{"id":0,"platform":"cuda","name":"A100","memory_total":...,"memory_used":...,"utilization":12}]}}
+```
diff --git a/docs/index.md b/docs/index.md
index 36bde1a..57938fb 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -32,11 +32,13 @@ during longer CPU-bound sections of your workflow.
   for pinning cards on clusters, workstations, or Jupyter.
 - :material-code-tags: **[Python API Recipes](guides/python.md)** – Drop-in
   snippets for wrapping preprocessing stages or orchestration scripts.
+- :material-lan: **[MCP Server](guides/mcp.md)** – Expose KeepGPU via JSON-RPC
+  (stdio/HTTP) for agents and remote orchestration.
 - :material-diagram-project: **[How KeepGPU Works](concepts/architecture.md)** –
   Learn how controllers allocate VRAM and throttle themselves.
 - :material-book-open-outline: **[Reference](reference/cli.md)** – Full option
   list plus mkdocstrings API reference.
 
 !!! tip "Prefer a fast skim?"
-    The left sidebar mirrors the lifecycle: overview → guides → concepts →
+    The left sidebar mirrors the lifecycle: overview → usage → concepts →
     references. Jump straight to what you need; sections stand on their own.
diff --git a/mkdocs.yml b/mkdocs.yml
index d26c9b8..210f6df 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -15,9 +15,10 @@ nav:
   - Overview:
       - Welcome: index.md
       - Getting Started: getting-started.md
-  - Guides:
+  - Usage:
      - CLI Playbook: guides/cli.md
      - Python API Recipes: guides/python.md
+      - MCP Server: guides/mcp.md
   - Concepts:
       - How KeepGPU Works: concepts/architecture.md
   - Reference:
@@ -34,6 +35,8 @@ plugins:
 markdown_extensions:
   - admonition
   - codehilite
+  - pymdownx.tabbed:
+      alternate_style: true
   - toc:
       permalink: true
   - pymdownx.emoji:
diff --git a/src/keep_gpu/mcp/server.py b/src/keep_gpu/mcp/server.py
index 47a06f4..e4d255a 100644
--- a/src/keep_gpu/mcp/server.py
+++ b/src/keep_gpu/mcp/server.py
@@ -1,11 +1,12 @@
 """
 Minimal MCP-style JSON-RPC server for KeepGPU.
 
-The server reads JSON lines from stdin and writes JSON responses to stdout.
+Run over stdin/stdout (default) or a lightweight HTTP server.
 
 Supported methods:
 - start_keep(gpu_ids, vram, interval, busy_threshold, job_id)
 - stop_keep(job_id=None)   # None stops all
 - status(job_id=None)      # None lists all
+- list_gpus()
 """
 
 from __future__ import annotations
@@ -14,6 +15,9 @@ import json
 import sys
 import uuid
 
+import argparse
+import threading
+from http.server import BaseHTTPRequestHandler, HTTPServer
 from dataclasses import dataclass
 from typing import Any, Callable, Dict, List, Optional
@@ -47,6 +52,22 @@ def start_keep(
         busy_threshold: int = -1,
         job_id: Optional[str] = None,
     ) -> Dict[str, Any]:
+        """
+        Start a KeepGPU session that reserves VRAM on one or more GPUs.
+
+        Args:
+            gpu_ids: GPU indices to target; None uses all available GPUs.
+            vram: Human-readable VRAM size to keep (for example, "1GiB").
+            interval: Seconds between controller checks/actions.
+            busy_threshold: Utilization above which the controller backs off.
+            job_id: Optional session identifier; a UUID is generated if omitted.
+
+        Returns:
+            Dict with the started session's job_id, e.g. ``{"job_id": "<job-id>"}``.
+
+        Raises:
+            ValueError: If the provided job_id already exists.
+        """
         job_id = job_id or str(uuid.uuid4())
         if job_id in self._sessions:
             raise ValueError(f"job_id {job_id} already exists")
@@ -70,12 +91,29 @@
         logger.info("Started keep session %s on GPUs %s", job_id, gpu_ids)
         return {"job_id": job_id}
 
-    def stop_keep(self, job_id: Optional[str] = None) -> Dict[str, Any]:
+    def stop_keep(
+        self, job_id: Optional[str] = None, quiet: bool = False
+    ) -> Dict[str, Any]:
+        """
+        Stop one or all active keep sessions.
+
+        If job_id is supplied, only that session is stopped; otherwise all active
+        sessions are released. When quiet=True, informational logging is skipped.
+
+        Args:
+            job_id: Session identifier to stop; None stops every session.
+            quiet: Suppress informational logs about stopped sessions.
+
+        Returns:
+            Dict with a "stopped" list of job ids. If a specific job_id was not
+            found, the result also carries an explanatory "message" field.
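+
+        Example (illustrative job id):
+            >>> server.stop_keep("1f2e3d")
+            {'stopped': ['1f2e3d']}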
+ """ if job_id: session = self._sessions.pop(job_id, None) if session: session.controller.release() - logger.info("Stopped keep session %s", job_id) + if not quiet: + logger.info("Stopped keep session %s", job_id) return {"stopped": [job_id]} return {"stopped": [], "message": "job_id not found"} @@ -83,7 +121,7 @@ def stop_keep(self, job_id: Optional[str] = None) -> Dict[str, Any]: for job_id in stopped_ids: session = self._sessions.pop(job_id) session.controller.release() - if stopped_ids: + if stopped_ids and not quiet: logger.info("Stopped sessions: %s", stopped_ids) return {"stopped": stopped_ids} @@ -110,14 +148,25 @@ def list_gpus(self) -> Dict[str, Any]: return {"gpus": infos} def shutdown(self) -> None: + """Stop all sessions quietly; ignore errors during interpreter teardown.""" try: - self.stop_keep(None) + self.stop_keep(None, quiet=True) except Exception: # pragma: no cover - defensive # Avoid noisy errors during interpreter teardown return def _handle_request(server: KeepGPUServer, payload: Dict[str, Any]) -> Dict[str, Any]: + """ + Dispatch a JSON-RPC payload to the server and return a response dict. + + Args: + server: Target KeepGPUServer. + payload: Dict with "method", optional "params", and optional "id". + + Returns: + JSON-RPC-style dict containing either "result" or "error" plus "id". + """ method = payload.get("method") params = payload.get("params", {}) or {} req_id = payload.get("id") @@ -138,8 +187,40 @@ def _handle_request(server: KeepGPUServer, payload: Dict[str, Any]) -> Dict[str, return {"id": req_id, "error": {"message": str(exc)}} -def main() -> None: - server = KeepGPUServer() +class _JSONRPCHandler(BaseHTTPRequestHandler): + server_version = "KeepGPU-MCP/0.1" + + def do_POST(self): # noqa: N802 + """ + Handle an HTTP JSON-RPC request and write a JSON response. + + Expects application/json bodies containing {"method", "params", "id"}. + Returns 400 with an error object if parsing fails. 
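+
+        Example exchange (illustrative):
+            request body:  {"id": 1, "method": "status"}
+            response body: {"id": 1, "result": {"active_jobs": []}}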
+        """
+        try:
+            length = int(self.headers.get("content-length", "0"))
+            body = self.rfile.read(length).decode("utf-8")
+            payload = json.loads(body)
+            server_ref = self.server.keepgpu_server  # type: ignore[attr-defined]
+            response = _handle_request(server_ref, payload)
+            status = 200
+        except (json.JSONDecodeError, ValueError, UnicodeDecodeError) as exc:
+            response = {"error": {"message": f"Bad request: {exc}"}}
+            status = 400
+        data = json.dumps(response).encode()
+        self.send_response(status)
+        self.send_header("content-type", "application/json")
+        self.send_header("content-length", str(len(data)))
+        self.end_headers()
+        self.wfile.write(data)
+
+    def log_message(self, format, *args):  # noqa: A003
+        """Suppress default request logging."""
+        return
+
+
+def run_stdio(server: KeepGPUServer) -> None:
+    """Serve JSON-RPC requests over stdin/stdout (one JSON object per line)."""
     for line in sys.stdin:
         line = line.strip()
         if not line:
@@ -153,5 +234,53 @@
     sys.stdout.flush()
 
 
+def run_http(server: KeepGPUServer, host: str = "127.0.0.1", port: int = 8765) -> None:
+    """Run a lightweight HTTP JSON-RPC server on the given host/port."""
+
+    class _Server(HTTPServer):
+        allow_reuse_address = True
+
+    httpd = _Server((host, port), _JSONRPCHandler)
+    httpd.keepgpu_server = server  # type: ignore[attr-defined]
+
+    def _serve():
+        """Run the HTTP server loop until shutdown."""
+        httpd.serve_forever()
+
+    thread = threading.Thread(target=_serve)
+    thread.start()
+    logger.info(
+        "MCP HTTP server listening on http://%s:%s", host, httpd.server_address[1]
+    )
+    try:
+        thread.join()
+    except KeyboardInterrupt:
+        pass
+    finally:
+        httpd.shutdown()
+        httpd.server_close()
+        server.shutdown()
+
+
+def main() -> None:
+    """CLI entry point for the KeepGPU MCP server."""
+    parser = argparse.ArgumentParser(description="KeepGPU MCP server")
+    parser.add_argument(
+        "--mode",
+        choices=["stdio", "http"],
+        default="stdio",
+        help="Transport mode (default: stdio)",
+    )
+    parser.add_argument("--host", default="127.0.0.1", help="HTTP host (http mode)")
+    parser.add_argument("--port", type=int, default=8765, help="HTTP port (http mode)")
+    args = parser.parse_args()
+
+    server = KeepGPUServer()
+    if args.mode == "stdio":
+        run_stdio(server)
+    else:
+        run_http(server, host=args.host, port=args.port)
+
+
 if __name__ == "__main__":
     main()