diff --git a/README.md b/README.md index d0a89a9..a92cc77 100644 --- a/README.md +++ b/README.md @@ -90,9 +90,29 @@ with GlobalGPUController(gpu_ids=[0, 1], vram_to_keep="750MB", interval=90, busy - ROCm-only tests carry `@pytest.mark.rocm`; run with `pytest --run-rocm tests/rocm_controller`. - Markers: `rocm` (needs ROCm stack) and `large_memory` (opt-in locally). +### MCP endpoint (experimental) + +- Start a simple JSON-RPC server on stdin/stdout: + ```bash + keep-gpu-mcp-server + ``` +- Example request (one per line): + ```json + {"id": 1, "method": "start_keep", "params": {"gpu_ids": [0], "vram": "512MB", "interval": 60, "busy_threshold": 20}} + ``` +- Methods: `start_keep`, `stop_keep` (optional `job_id`, default stops all), `status` (optional `job_id`), `list_gpus` (basic info). +- Minimal client config (stdio MCP): + ```yaml + servers: + keepgpu: + command: ["keep-gpu-mcp-server"] + adapter: stdio + ``` + ## Contributing Contributions are welcome—especially around ROCm support, platform fallbacks, and scheduler-specific recipes. Open an issue or PR if you hit edge cases on your cluster. +See `docs/contributing.md` for dev setup, test commands, and PR tips. ## Credits diff --git a/docs/contributing.md b/docs/contributing.md new file mode 100644 index 0000000..9754a33 --- /dev/null +++ b/docs/contributing.md @@ -0,0 +1,62 @@ +# Contributing & Development + +Thanks for helping improve KeepGPU! This page collects the key commands and +expectations so you can get productive quickly and avoid surprises in CI. + +## Setup + +- Clone and install dev extras: + ```bash + git clone https://github.com/Wangmerlyn/KeepGPU.git + cd KeepGPU + pip install -e ".[dev]" # add .[rocm] if you need ROCm SMI + ``` +- Ensure you have the right torch build for your platform (CUDA/ROCm/CPU). +- Optional: install `nvidia-ml-py` (CUDA) or `rocm-smi` (ROCm) for telemetry. + +## Tests + +- Fast CUDA suite: + ```bash + pytest tests/cuda_controller tests/global_controller \ + tests/utilities/test_platform_manager.py tests/test_cli_thresholds.py + ``` +- ROCm-only tests are marked `rocm` and skipped by default; run with: + ```bash + pytest --run-rocm tests/rocm_controller + ``` +- MCP + utilities: + ```bash + pytest tests/mcp tests/utilities/test_gpu_info.py + ``` +- All tests honor markers `rocm` and `large_memory`; avoid enabling + `large_memory` in CI. + +## Lint/format + +- Run pre-commit hooks locally before pushing: + ```bash + pre-commit run --all-files + ``` + +## MCP server (experimental) + +- Start: `keep-gpu-mcp-server` (stdin/stdout JSON-RPC) +- Methods: `start_keep`, `stop_keep`, `status`, `list_gpus` +- Example request: + ```json + {"id":1,"method":"start_keep","params":{"gpu_ids":[0],"vram":"512MB","interval":60,"busy_threshold":20}} + ``` + +## Pull requests + +- Keep changesets focused; small commits are welcome. +- Add/adjust tests for new behavior; skip GPU-specific tests in CI by way of markers. +- Update docs/README when behavior or interfaces change. +- Stick to the existing style (Typer CLI, Rich logging) and keep code paths + simple—avoid over-engineering. + +## Support + +- Issues/PRs: https://github.com/Wangmerlyn/KeepGPU +- Code of Conduct: see `CODE_OF_CONDUCT.rst` diff --git a/docs/getting-started.md b/docs/getting-started.md index f3949fe..00b8f55 100644 --- a/docs/getting-started.md +++ b/docs/getting-started.md @@ -45,6 +45,38 @@ understand the minimum knobs you need to keep a GPU occupied. 
- Fast CUDA checks: `pytest tests/cuda_controller tests/global_controller tests/utilities/test_platform_manager.py tests/test_cli_thresholds.py` - ROCm-only tests are marked `rocm`; run with `pytest --run-rocm tests/rocm_controller`. +## MCP endpoint (experimental) + +For automation clients that speak JSON-RPC (MCP-style), KeepGPU ships a tiny +stdin/stdout server: + +```bash +keep-gpu-mcp-server +# each request is a single JSON line; example: +echo '{"id":1,"method":"start_keep","params":{"gpu_ids":[0],"vram":"512MB","interval":60,"busy_threshold":20}}' | keep-gpu-mcp-server +``` + +Supported methods: +- `start_keep(gpu_ids?, vram?, interval?, busy_threshold?, job_id?)` +- `status(job_id?)` +- `stop_keep(job_id?)` (no job_id stops all) +- `list_gpus()` (basic info) + +### Example MCP client config (stdio) + +If your agent expects an MCP server definition, a minimal stdio config looks like: + +```yaml +servers: + keepgpu: + description: "KeepGPU MCP server" + command: ["keep-gpu-mcp-server"] + adapter: stdio +``` + +Tools exposed: `start_keep`, `stop_keep`, `status`, `list_gpus`. Each request is +a single JSON line; see above for an example payload. + === "Editable dev install" ```bash git clone https://github.com/Wangmerlyn/KeepGPU.git diff --git a/mkdocs.yml b/mkdocs.yml index 6c3a200..d26c9b8 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -23,6 +23,8 @@ nav: - Reference: - CLI Reference: reference/cli.md - API Reference: reference/api.md + - Project: + - Contributing: contributing.md plugins: - search diff --git a/pyproject.toml b/pyproject.toml index dedf135..c5b859b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,6 +34,7 @@ dependencies = [ [project.scripts] keep-gpu = "keep_gpu.cli:app" +keep-gpu-mcp-server = "keep_gpu.mcp.server:main" [project.optional-dependencies] dev = [ diff --git a/src/keep_gpu/mcp/server.py b/src/keep_gpu/mcp/server.py new file mode 100644 index 0000000..47a06f4 --- /dev/null +++ b/src/keep_gpu/mcp/server.py @@ -0,0 +1,157 @@ +""" +Minimal MCP-style JSON-RPC server for KeepGPU. + +The server reads JSON lines from stdin and writes JSON responses to stdout. 
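+Each response echoes the request "id" and carries either a "result" payload or
+an "error" object with a "message" string.
+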
+Supported methods: + - start_keep(gpu_ids, vram, interval, busy_threshold, job_id) + - stop_keep(job_id=None) # None stops all + - status(job_id=None) # None lists all +""" + +from __future__ import annotations + +import atexit +import json +import sys +import uuid +from dataclasses import dataclass +from typing import Any, Callable, Dict, List, Optional + +from keep_gpu.global_gpu_controller.global_gpu_controller import GlobalGPUController +from keep_gpu.utilities.gpu_info import get_gpu_info +from keep_gpu.utilities.logger import setup_logger + +logger = setup_logger(__name__) + + +@dataclass +class Session: + controller: GlobalGPUController + params: Dict[str, Any] + + +class KeepGPUServer: + def __init__( + self, + controller_factory: Optional[Callable[..., GlobalGPUController]] = None, + ) -> None: + self._sessions: Dict[str, Session] = {} + self._controller_factory = controller_factory or GlobalGPUController + atexit.register(self.shutdown) + + def start_keep( + self, + gpu_ids: Optional[List[int]] = None, + vram: str = "1GiB", + interval: int = 300, + busy_threshold: int = -1, + job_id: Optional[str] = None, + ) -> Dict[str, Any]: + job_id = job_id or str(uuid.uuid4()) + if job_id in self._sessions: + raise ValueError(f"job_id {job_id} already exists") + + controller = self._controller_factory( + gpu_ids=gpu_ids, + interval=interval, + vram_to_keep=vram, + busy_threshold=busy_threshold, + ) + controller.keep() + self._sessions[job_id] = Session( + controller=controller, + params={ + "gpu_ids": gpu_ids, + "vram": vram, + "interval": interval, + "busy_threshold": busy_threshold, + }, + ) + logger.info("Started keep session %s on GPUs %s", job_id, gpu_ids) + return {"job_id": job_id} + + def stop_keep(self, job_id: Optional[str] = None) -> Dict[str, Any]: + if job_id: + session = self._sessions.pop(job_id, None) + if session: + session.controller.release() + logger.info("Stopped keep session %s", job_id) + return {"stopped": [job_id]} + return {"stopped": [], "message": "job_id not found"} + + stopped_ids = list(self._sessions.keys()) + for job_id in stopped_ids: + session = self._sessions.pop(job_id) + session.controller.release() + if stopped_ids: + logger.info("Stopped sessions: %s", stopped_ids) + return {"stopped": stopped_ids} + + def status(self, job_id: Optional[str] = None) -> Dict[str, Any]: + if job_id: + session = self._sessions.get(job_id) + if not session: + return {"active": False, "job_id": job_id} + return { + "active": True, + "job_id": job_id, + "params": session.params, + } + return { + "active_jobs": [ + {"job_id": jid, "params": sess.params} + for jid, sess in self._sessions.items() + ] + } + + def list_gpus(self) -> Dict[str, Any]: + """Return detailed GPU info (id, name, memory, utilization).""" + infos = get_gpu_info() + return {"gpus": infos} + + def shutdown(self) -> None: + try: + self.stop_keep(None) + except Exception: # pragma: no cover - defensive + # Avoid noisy errors during interpreter teardown + return + + +def _handle_request(server: KeepGPUServer, payload: Dict[str, Any]) -> Dict[str, Any]: + method = payload.get("method") + params = payload.get("params", {}) or {} + req_id = payload.get("id") + try: + if method == "start_keep": + result = server.start_keep(**params) + elif method == "stop_keep": + result = server.stop_keep(**params) + elif method == "status": + result = server.status(**params) + elif method == "list_gpus": + result = server.list_gpus() + else: + raise ValueError(f"Unknown method: {method}") + return {"id": req_id, "result": 
result} + except Exception as exc: # pragma: no cover - defensive + logger.exception("Request failed") + return {"id": req_id, "error": {"message": str(exc)}} + + +def main() -> None: + server = KeepGPUServer() + for line in sys.stdin: + line = line.strip() + if not line: + continue + try: + payload = json.loads(line) + response = _handle_request(server, payload) + except Exception as exc: + response = {"error": {"message": str(exc)}} + sys.stdout.write(json.dumps(response) + "\n") + sys.stdout.flush() + + +if __name__ == "__main__": + main() diff --git a/src/keep_gpu/utilities/gpu_info.py b/src/keep_gpu/utilities/gpu_info.py new file mode 100644 index 0000000..babf540 --- /dev/null +++ b/src/keep_gpu/utilities/gpu_info.py @@ -0,0 +1,161 @@ +from __future__ import annotations + +from typing import Any, Dict, List + +import torch + +from keep_gpu.utilities.logger import setup_logger + +logger = setup_logger(__name__) + + +def _query_nvml() -> List[Dict[str, Any]]: + import pynvml + + pynvml.nvmlInit() + infos: List[Dict[str, Any]] = [] + try: + count = pynvml.nvmlDeviceGetCount() + for idx in range(count): + handle = pynvml.nvmlDeviceGetHandleByIndex(idx) + mem = pynvml.nvmlDeviceGetMemoryInfo(handle) + util = pynvml.nvmlDeviceGetUtilizationRates(handle).gpu + name = pynvml.nvmlDeviceGetName(handle) + if isinstance(name, bytes): + name = name.decode(errors="ignore") + infos.append( + { + "id": idx, + "platform": "cuda", + "name": name, + "memory_total": int(mem.total), + "memory_used": int(mem.used), + "utilization": int(util), + } + ) + finally: + try: + pynvml.nvmlShutdown() + except Exception: + pass + return infos + + +def _query_rocm() -> List[Dict[str, Any]]: + try: + import rocm_smi # type: ignore + except Exception as exc: # pragma: no cover - env-specific + logger.debug("rocm_smi import failed: %s", exc) + return [] + + infos: List[Dict[str, Any]] = [] + current_device = None + try: + rocm_smi.rsmi_init() + if torch.cuda.is_available(): + current_device = torch.cuda.current_device() + # Use torch to enumerate devices for names/memory + count = torch.cuda.device_count() if torch.cuda.is_available() else 0 + for idx in range(count): + util = None + try: + util = int(rocm_smi.rsmi_dev_busy_percent_get(idx)) + except Exception as exc: + logger.debug("ROCm util query failed for %s: %s", idx, exc) + + try: + torch.cuda.set_device(idx) + free, total = torch.cuda.mem_get_info() + used = total - free + except Exception: + total = used = None + + try: + name = torch.cuda.get_device_name(idx) + except Exception: + name = f"rocm:{idx}" + + infos.append( + { + "id": idx, + "platform": "rocm", + "name": name, + "memory_total": int(total) if total is not None else None, + "memory_used": int(used) if used is not None else None, + "utilization": util, + } + ) + finally: + if current_device is not None: + try: + torch.cuda.set_device(current_device) + except Exception: + pass + try: + rocm_smi.rsmi_shut_down() + except Exception: + pass + return infos + + +def _query_torch() -> List[Dict[str, Any]]: + infos: List[Dict[str, Any]] = [] + if not torch.cuda.is_available(): + return infos + current_device = torch.cuda.current_device() + try: + count = torch.cuda.device_count() + for idx in range(count): + torch.cuda.set_device(idx) + try: + free, total = torch.cuda.mem_get_info() + used = total - free + except Exception: + total = used = None + try: + name = torch.cuda.get_device_name(idx) + except Exception: + name = f"cuda:{idx}" + infos.append( + { + "id": idx, + "platform": "cuda" if 
torch.version.hip is None else "rocm", + "name": name, + "memory_total": int(total) if total is not None else None, + "memory_used": int(used) if used is not None else None, + "utilization": None, + } + ) + except Exception as exc: # pragma: no cover - defensive + logger.debug("Torch GPU info failed: %s", exc) + finally: + try: + torch.cuda.set_device(current_device) + except Exception: + pass + return infos + + +def get_gpu_info() -> List[Dict[str, Any]]: + """ + Return a list of GPU info dicts: id, platform, name, memory_total, memory_used, utilization. + Tries NVML first (CUDA), then ROCm SMI, then falls back to torch.cuda data. + """ + try: + infos = _query_nvml() + if infos: + return infos + except Exception as exc: + logger.debug("NVML info failed: %s", exc) + + try: + infos = _query_rocm() + if infos: + return infos + except Exception as exc: + logger.debug("ROCm info failed: %s", exc) + + return _query_torch() + + +__all__ = ["get_gpu_info"] diff --git a/tests/conftest.py b/tests/conftest.py index 22d1680..33fd177 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -11,11 +11,6 @@ def pytest_addoption(parser): ) -def pytest_configure(config): - config.addinivalue_line("markers", "rocm: tests that require ROCm stack") - config.addinivalue_line("markers", "large_memory: tests that use large VRAM") - - def pytest_collection_modifyitems(config, items): if config.getoption("--run-rocm"): return diff --git a/tests/mcp/test_server.py b/tests/mcp/test_server.py new file mode 100644 index 0000000..494deb4 --- /dev/null +++ b/tests/mcp/test_server.py @@ -0,0 +1,95 @@ +from keep_gpu.mcp.server import KeepGPUServer, _handle_request + + +class DummyController: + def __init__(self, gpu_ids=None, interval=0, vram_to_keep=None, busy_threshold=0): + self.gpu_ids = gpu_ids + self.interval = interval + self.vram_to_keep = vram_to_keep + self.busy_threshold = busy_threshold + self.kept = False + self.released = False + + def keep(self): + self.kept = True + + def release(self): + self.released = True + + +def dummy_factory(**kwargs): + return DummyController(**kwargs) + + +def test_start_status_stop_cycle(): + server = KeepGPUServer(controller_factory=dummy_factory) + res = server.start_keep(gpu_ids=[1], vram="2GiB", interval=5, busy_threshold=20) + job_id = res["job_id"] + + status = server.status(job_id) + assert status["active"] + assert status["params"]["gpu_ids"] == [1] + assert status["params"]["vram"] == "2GiB" + assert status["params"]["interval"] == 5 + assert status["params"]["busy_threshold"] == 20 + + stopped = server.stop_keep(job_id) + assert job_id in stopped["stopped"] + assert server.status(job_id)["active"] is False + + +def test_stop_all(): + server = KeepGPUServer(controller_factory=dummy_factory) + job_a = server.start_keep()["job_id"] + job_b = server.start_keep()["job_id"] + + stopped = server.stop_keep() + assert set(stopped["stopped"]) == {job_a, job_b} + assert server.status(job_a)["active"] is False + assert server.status(job_b)["active"] is False + + +def test_list_gpus(): + server = KeepGPUServer(controller_factory=dummy_factory) + info = server.list_gpus() + assert "gpus" in info + + +def test_end_to_end_jsonrpc(): + server = KeepGPUServer(controller_factory=dummy_factory) + # start_keep + req = { + "id": 1, + "method": "start_keep", + "params": {"gpu_ids": [0], "vram": "256MB", "interval": 1, "busy_threshold": 5}, + } + resp = _handle_request(server, req) + assert "result" in resp and "job_id" in resp["result"] + job_id = resp["result"]["job_id"] + + # status + 
status_req = {"id": 2, "method": "status", "params": {"job_id": job_id}} + status_resp = _handle_request(server, status_req) + assert status_resp["result"]["active"] is True + + # stop_keep + stop_req = {"id": 3, "method": "stop_keep", "params": {"job_id": job_id}} + stop_resp = _handle_request(server, stop_req) + assert job_id in stop_resp["result"]["stopped"] + + +def test_status_all(): + server = KeepGPUServer(controller_factory=dummy_factory) + job_a = server.start_keep(gpu_ids=[0])["job_id"] + job_b = server.start_keep(gpu_ids=[1])["job_id"] + + status = server.status() + assert "active_jobs" in status + assert len(status["active_jobs"]) == 2 + + job_statuses = {job["job_id"]: job for job in status["active_jobs"]} + assert job_a in job_statuses + assert job_b in job_statuses + assert job_statuses[job_a]["params"]["gpu_ids"] == [0] + assert job_statuses[job_b]["params"]["gpu_ids"] == [1] + assert "controller" not in job_statuses[job_a] diff --git a/tests/utilities/test_gpu_info.py b/tests/utilities/test_gpu_info.py new file mode 100644 index 0000000..f07eae9 --- /dev/null +++ b/tests/utilities/test_gpu_info.py @@ -0,0 +1,126 @@ +import sys + +import pytest + +from keep_gpu.utilities import gpu_info + + +class DummyNVMLMemory: + def __init__(self, total: int, used: int): + self.total = total + self.used = used + + +class DummyNVMLUtil: + def __init__(self, gpu: int): + self.gpu = gpu + + +@pytest.mark.skipif( + not hasattr(gpu_info, "torch") or not gpu_info.torch.cuda.is_available(), + reason="CUDA not available for NVML path", +) +def test_get_gpu_info_nvml(monkeypatch): + class DummyNVML: + def __init__(self): + self.shutdown_calls = 0 + + @staticmethod + def nvmlInit(): + return None + + @staticmethod + def nvmlDeviceGetCount(): + return 1 + + @staticmethod + def nvmlDeviceGetHandleByIndex(index): + assert index == 0 + return "handle" + + @staticmethod + def nvmlDeviceGetMemoryInfo(handle): + return DummyNVMLMemory(total=2048, used=1024) + + @staticmethod + def nvmlDeviceGetUtilizationRates(handle): + return DummyNVMLUtil(gpu=55) + + @staticmethod + def nvmlDeviceGetName(handle): + return b"Mock GPU" + + def nvmlShutdown(self): + self.shutdown_calls += 1 + + dummy_nvml = DummyNVML() + monkeypatch.setitem(sys.modules, "pynvml", dummy_nvml) + + infos = gpu_info.get_gpu_info() + assert len(infos) == 1 + info = infos[0] + assert info["name"] == "Mock GPU" + assert info["memory_total"] == 2048 + assert info["memory_used"] == 1024 + assert info["utilization"] == 55 + + +@pytest.mark.rocm +def test_get_gpu_info_rocm(monkeypatch): + # remove nvml so ROCm path is used + monkeypatch.setitem(sys.modules, "pynvml", None) + + class DummyTorchCuda: + @staticmethod + def is_available(): + return True + + @staticmethod + def device_count(): + return 1 + + @staticmethod + def mem_get_info(): + return (50, 100) + + @staticmethod + def get_device_name(idx): + return f"ROCm {idx}" + + @staticmethod + def set_device(idx): + return None + + monkeypatch.setattr( + gpu_info, + "torch", + type( + "T", (), {"cuda": DummyTorchCuda, "version": type("V", (), {"hip": "6.0"})} + ), + ) + + class DummyROCM: + calls = 0 + + @classmethod + def rsmi_init(cls): + cls.calls += 1 + + @classmethod + def rsmi_dev_busy_percent_get(cls, idx): + assert idx == 0 + return 77 + + @classmethod + def rsmi_shut_down(cls): + cls.calls += 1 + + monkeypatch.setitem(sys.modules, "rocm_smi", DummyROCM) + + infos = gpu_info.get_gpu_info() + assert len(infos) == 1 + info = infos[0] + assert info["platform"] == "rocm" + assert 
info["utilization"] == 77 + assert info["memory_total"] == 100 + assert info["memory_used"] == 50