-
Notifications
You must be signed in to change notification settings - Fork 5
[mcp] feat: add JSON-RPC server, GPU info listing, and contribution guide #59
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
0bfa7b4
9f8ada3
58169dd
14002d9
0c70e08
da690f9
b73e532
69c64da
c4ff82d
a35cea1
dfcb31f
683010b
999b19b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,62 @@ | ||
| # Contributing & Development | ||
|
|
||
| Thanks for helping improve KeepGPU! This page collects the key commands and | ||
| expectations so you can get productive quickly and avoid surprises in CI. | ||
|
|
||
| ## Setup | ||
|
|
||
| - Clone and install dev extras: | ||
| ```bash | ||
| git clone https://github.com/Wangmerlyn/KeepGPU.git | ||
| cd KeepGPU | ||
| pip install -e ".[dev]" # add .[rocm] if you need ROCm SMI | ||
| ``` | ||
| - Ensure you have the right torch build for your platform (CUDA/ROCm/CPU). | ||
| - Optional: install `nvidia-ml-py` (CUDA) or `rocm-smi` (ROCm) for telemetry. | ||
|
|
||
| ## Tests | ||
|
|
||
| - Fast CUDA suite: | ||
| ```bash | ||
| pytest tests/cuda_controller tests/global_controller \ | ||
| tests/utilities/test_platform_manager.py tests/test_cli_thresholds.py | ||
| ``` | ||
| - ROCm-only tests are marked `rocm` and skipped by default; run with: | ||
| ```bash | ||
| pytest --run-rocm tests/rocm_controller | ||
| ``` | ||
| - MCP + utilities: | ||
| ```bash | ||
| pytest tests/mcp tests/utilities/test_gpu_info.py | ||
| ``` | ||
| - All tests honor markers `rocm` and `large_memory`; avoid enabling | ||
| `large_memory` in CI. | ||
|
|
||
| ## Lint/format | ||
|
|
||
| - Run pre-commit hooks locally before pushing: | ||
| ```bash | ||
| pre-commit run --all-files | ||
| ``` | ||
|
|
||
| ## MCP server (experimental) | ||
|
|
||
| - Start: `keep-gpu-mcp-server` (stdin/stdout JSON-RPC) | ||
| - Methods: `start_keep`, `stop_keep`, `status`, `list_gpus` | ||
| - Example request: | ||
| ```json | ||
| {"id":1,"method":"start_keep","params":{"gpu_ids":[0],"vram":"512MB","interval":60,"busy_threshold":20}} | ||
| ``` | ||
|
|
||
| ## Pull requests | ||
|
|
||
| - Keep changesets focused; small commits are welcome. | ||
| - Add/adjust tests for new behavior; GPU-specific tests are skipped in CI via markers. | ||
| - Update docs/README when behavior or interfaces change. | ||
| - Stick to the existing style (Typer CLI, Rich logging) and keep code paths | ||
| simple—avoid over-engineering. | ||
|
|
||
| ## Support | ||
|
|
||
| - Issues/PRs: https://github.com/Wangmerlyn/KeepGPU | ||
| - Code of Conduct: see `CODE_OF_CONDUCT.rst` |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,157 @@ | ||
| """ | ||
| Minimal MCP-style JSON-RPC server for KeepGPU. | ||
|
|
||
| The server reads JSON lines from stdin and writes JSON responses to stdout. | ||
| Supported methods: | ||
| - start_keep(gpu_ids, vram, interval, busy_threshold, job_id) | ||
| - stop_keep(job_id=None)  # None stops all | ||
| - status(job_id=None)     # None lists all | ||
| - list_gpus()             # detailed info for each visible GPU | ||
| """ | ||
|
|
||
| from __future__ import annotations | ||
|
|
||
| import atexit | ||
| import json | ||
| import sys | ||
| import uuid | ||
| from dataclasses import dataclass | ||
| from typing import Any, Callable, Dict, List, Optional | ||
|
|
||
| from keep_gpu.global_gpu_controller.global_gpu_controller import GlobalGPUController | ||
| from keep_gpu.utilities.gpu_info import get_gpu_info | ||
| from keep_gpu.utilities.logger import setup_logger | ||
|
|
||
| logger = setup_logger(__name__) | ||
|
|
||
|
|
||
@dataclass
class Session:
    """A single active keep-alive job: the controller doing the work plus
    the request parameters it was started with (echoed back by ``status``)."""

    controller: GlobalGPUController
    params: Dict[str, Any]
|
|
||
|
|
||
class KeepGPUServer:
    """In-process JSON-RPC request handler managing keep-alive GPU sessions.

    Each session owns a ``GlobalGPUController`` and is addressed by a
    ``job_id``. All sessions are released via :meth:`stop_keep` or, as a
    safety net, at interpreter exit through :meth:`shutdown`.
    """

    def __init__(
        self,
        controller_factory: Optional[Callable[..., GlobalGPUController]] = None,
    ) -> None:
        """Create a server.

        Args:
            controller_factory: Injectable factory used to build controllers
                (lets tests run without real GPUs). Defaults to
                ``GlobalGPUController``.
        """
        self._sessions: Dict[str, Session] = {}
        self._controller_factory = controller_factory or GlobalGPUController
        # Ensure controllers are released even if the process exits without
        # an explicit stop_keep().
        atexit.register(self.shutdown)

    def start_keep(
        self,
        gpu_ids: Optional[List[int]] = None,
        vram: str = "1GiB",
        interval: int = 300,
        busy_threshold: int = -1,
        job_id: Optional[str] = None,
    ) -> Dict[str, Any]:
        """Start a new keep session and return ``{"job_id": ...}``.

        Raises:
            ValueError: if the supplied ``job_id`` is already in use.
        """
        job_id = job_id or str(uuid.uuid4())
        if job_id in self._sessions:
            raise ValueError(f"job_id {job_id} already exists")

        controller = self._controller_factory(
            gpu_ids=gpu_ids,
            interval=interval,
            vram_to_keep=vram,
            busy_threshold=busy_threshold,
        )
        try:
            controller.keep()
        except Exception:
            # Fix: don't leak an allocated controller when starting it fails;
            # release best-effort, then surface the original error.
            try:
                controller.release()
            except Exception:  # pragma: no cover - best-effort cleanup
                pass
            raise
        self._sessions[job_id] = Session(
            controller=controller,
            params={
                "gpu_ids": gpu_ids,
                "vram": vram,
                "interval": interval,
                "busy_threshold": busy_threshold,
            },
        )
        logger.info("Started keep session %s on GPUs %s", job_id, gpu_ids)
        return {"job_id": job_id}

    def stop_keep(self, job_id: Optional[str] = None) -> Dict[str, Any]:
        """Stop one session (by ``job_id``) or all sessions (``job_id=None``).

        Returns a dict with the list of stopped job ids; an unknown
        ``job_id`` yields ``{"stopped": [], "message": "job_id not found"}``.
        """
        if job_id:
            session = self._sessions.pop(job_id, None)
            if session:
                session.controller.release()
                logger.info("Stopped keep session %s", job_id)
                return {"stopped": [job_id]}
            return {"stopped": [], "message": "job_id not found"}

        # job_id is None: stop everything. Use a distinct loop variable so
        # the parameter is not shadowed.
        stopped_ids = list(self._sessions.keys())
        for jid in stopped_ids:
            self._sessions.pop(jid).controller.release()
        if stopped_ids:
            logger.info("Stopped sessions: %s", stopped_ids)
        return {"stopped": stopped_ids}

    def status(self, job_id: Optional[str] = None) -> Dict[str, Any]:
        """Report one session's status, or list all active sessions.

        With a ``job_id``: ``{"active": bool, "job_id": ..., ["params": ...]}``.
        Without: ``{"active_jobs": [{"job_id": ..., "params": ...}, ...]}``.
        """
        if job_id:
            session = self._sessions.get(job_id)
            if not session:
                return {"active": False, "job_id": job_id}
            return {
                "active": True,
                "job_id": job_id,
                "params": session.params,
            }
        return {
            "active_jobs": [
                {"job_id": jid, "params": sess.params}
                for jid, sess in self._sessions.items()
            ]
        }

    def list_gpus(self) -> Dict[str, Any]:
        """Return detailed GPU info (id, name, memory, utilization)."""
        infos = get_gpu_info()
        return {"gpus": infos}

    def shutdown(self) -> None:
        """Release every session; safe to call during interpreter teardown."""
        try:
            self.stop_keep(None)
        except Exception:  # pragma: no cover - defensive
            # Avoid noisy errors during interpreter teardown
            return
|
|
||
|
|
||
| def _handle_request(server: KeepGPUServer, payload: Dict[str, Any]) -> Dict[str, Any]: | ||
| method = payload.get("method") | ||
| params = payload.get("params", {}) or {} | ||
| req_id = payload.get("id") | ||
| try: | ||
| if method == "start_keep": | ||
| result = server.start_keep(**params) | ||
| elif method == "stop_keep": | ||
| result = server.stop_keep(**params) | ||
| elif method == "status": | ||
| result = server.status(**params) | ||
| elif method == "list_gpus": | ||
| result = server.list_gpus() | ||
| else: | ||
| raise ValueError(f"Unknown method: {method}") | ||
| return {"id": req_id, "result": result} | ||
| except Exception as exc: # pragma: no cover - defensive | ||
| logger.exception("Request failed") | ||
| return {"id": req_id, "error": {"message": str(exc)}} | ||
|
|
||
|
|
||
def main() -> None:
    """Run the server loop: one JSON request per stdin line, one reply per line."""
    server = KeepGPUServer()
    for raw_line in sys.stdin:
        request_text = raw_line.strip()
        if not request_text:
            # Ignore blank lines between requests.
            continue
        try:
            response = _handle_request(server, json.loads(request_text))
        except Exception as exc:
            # Malformed JSON (or an unexpected dispatch crash): reply with an
            # error object that carries no request id.
            response = {"error": {"message": str(exc)}}
        sys.stdout.write(json.dumps(response) + "\n")
        sys.stdout.flush()
|
|
||
|
|
||
if __name__ == "__main__":
    # Entry point when executed directly (also used by the console script).
    main()
Uh oh!
There was an error while loading. Please reload this page.