-
Notifications
You must be signed in to change notification settings - Fork 5
[cuda, rocm, ci, tests] fix: resolve CodeRabbit review findings for controller safety and test reliability #63
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -50,7 +50,7 @@ def __init__( | |
| Args: | ||
| rank (int): Local CUDA device index to occupy. | ||
| interval (float, optional): Sleep time (seconds) between workload | ||
| batches. Defaults to 0.5. | ||
| batches. Defaults to 1.0. | ||
| matmul_iterations (int, optional): Number of matmul ops per batch. | ||
| vram_to_keep (int or str, optional): Amount of VRAM to keep busy, | ||
| e.g. `"1000 MB"`, `"20 GB"`, or an integer like `1000 * 1000`. | ||
|
|
@@ -126,8 +126,11 @@ def _keep_loop(self) -> None: | |
| matrix = None | ||
| while not self._stop_evt.is_set(): | ||
| try: | ||
| num_elements = int(self.vram_to_keep) | ||
| if num_elements <= 0: | ||
| raise ValueError("vram_to_keep must be positive") | ||
| matrix = torch.rand( | ||
| self.vram_to_keep, | ||
| num_elements, | ||
| device=self.device, | ||
| dtype=torch.float32, | ||
| requires_grad=False, | ||
|
|
@@ -166,7 +169,7 @@ def _keep_loop(self) -> None: | |
| # ------------------------------------------------------------------ | ||
| @torch.no_grad() | ||
| def _run_mat_batch(self, matrix: torch.Tensor) -> None: | ||
|
||
| """Run a batch of dummy matmuls to keep GPU busy.""" | ||
| """Run a batch of in-place ReLU ops to keep GPU busy.""" | ||
|
|
||
| tic = time.time() | ||
| for _ in range(self.matmul_iterations): | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -24,14 +24,17 @@ def __init__( | |
| vram_to_keep: str | int = "1000 MB", | ||
| busy_threshold: int = 10, | ||
| iterations: int = 5000, | ||
| max_allocation_retries: int = 3, | ||
| ): | ||
| super().__init__(vram_to_keep=vram_to_keep, interval=interval) | ||
| self.rank = rank | ||
| self.device = torch.device(f"cuda:{rank}") | ||
| self.busy_threshold = busy_threshold | ||
| self.iterations = iterations | ||
| self.max_allocation_retries = max_allocation_retries | ||
| self._stop_evt: Optional[threading.Event] = None | ||
| self._thread: Optional[threading.Thread] = None | ||
| self._failure_exc: Optional[Exception] = None | ||
|
|
||
| # Lazy rocm_smi import; keep handle for reuse | ||
| try: | ||
|
|
@@ -46,6 +49,7 @@ def keep(self) -> None: | |
| if self._thread and self._thread.is_alive(): | ||
| logger.warning("rank %s: keep thread already running", self.rank) | ||
| return | ||
| self._failure_exc = None | ||
| if self._rocm_smi: | ||
| try: | ||
| self._rocm_smi.rsmi_init() | ||
|
|
@@ -62,12 +66,12 @@ def keep(self) -> None: | |
| logger.info("rank %s: ROCm keep thread started", self.rank) | ||
|
|
||
| def release(self) -> None: | ||
| if not (self._thread and self._thread.is_alive()): | ||
| if self._thread and self._thread.is_alive(): | ||
| self._stop_evt.set() | ||
| self._thread.join() | ||
| torch.cuda.empty_cache() | ||
| else: | ||
| logger.warning("rank %s: keep thread not running", self.rank) | ||
| return | ||
| self._stop_evt.set() | ||
| self._thread.join() | ||
| torch.cuda.empty_cache() | ||
| if self._rocm_smi: | ||
| try: | ||
| self._rocm_smi.rsmi_shut_down() | ||
|
|
@@ -95,21 +99,35 @@ def _query_utilization(self) -> Optional[int]: | |
| def _keep_loop(self) -> None: | ||
| torch.cuda.set_device(self.rank) | ||
| tensor = None | ||
| while not self._stop_evt.is_set(): | ||
| attempts = 0 | ||
| while not self._stop_evt.is_set() and attempts < self.max_allocation_retries: | ||
|
||
| try: | ||
| num_elements = int(self.vram_to_keep) | ||
| if num_elements <= 0: | ||
| raise ValueError("vram_to_keep must be positive") | ||
| tensor = torch.rand( | ||
| self.vram_to_keep, | ||
| num_elements, | ||
| device=self.device, | ||
| dtype=torch.float32, | ||
| requires_grad=False, | ||
| ) | ||
| break | ||
| except RuntimeError: | ||
| logger.exception("rank %s: failed to allocate tensor", self.rank) | ||
| except (RuntimeError, ValueError) as exc: | ||
| attempts += 1 | ||
| logger.error( | ||
| "rank %s: failed to allocate tensor (attempt %d/%d): %s", | ||
| self.rank, | ||
| attempts, | ||
| self.max_allocation_retries, | ||
| exc, | ||
| ) | ||
| time.sleep(self.interval) | ||
| if tensor is None: | ||
| logger.error("rank %s: failed to allocate tensor, exiting loop", self.rank) | ||
| raise RuntimeError("Failed to allocate tensor for ROCm GPU keeping") | ||
| self._failure_exc = RuntimeError( | ||
| f"rank {self.rank}: failed to allocate tensor after {attempts} attempts" | ||
| ) | ||
| logger.error("%s", self._failure_exc) | ||
| return | ||
|
|
||
| while not self._stop_evt.is_set(): | ||
| try: | ||
|
|
@@ -127,6 +145,10 @@ def _keep_loop(self) -> None: | |
| logger.exception("rank %s: unexpected error", self.rank) | ||
| time.sleep(self.interval) | ||
|
|
||
| def allocation_status(self) -> Optional[Exception]: | ||
| """Return allocation failure exception captured in worker thread, if any.""" | ||
| return self._failure_exc | ||
|
|
||
| @torch.no_grad() | ||
| def _run_batch(self, tensor: torch.Tensor) -> None: | ||
| tic = time.time() | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -19,20 +19,22 @@ def test_cuda_controller_basic(): | |
| time.sleep(0.2) | ||
| assert ctrl._thread and ctrl._thread.is_alive() | ||
|
|
||
| assert ctrl._thread is not None | ||
| ctrl.release() | ||
| ctrl._thread.join(timeout=2) | ||
|
||
| assert not (ctrl._thread and ctrl._thread.is_alive()) | ||
|
|
||
| ctrl.keep() | ||
| time.sleep(0.2) | ||
| assert ctrl._thread and ctrl._thread.is_alive() | ||
| assert ctrl._thread is not None | ||
| ctrl.release() | ||
| ctrl._thread.join(timeout=2) | ||
| assert not (ctrl._thread and ctrl._thread.is_alive()) | ||
|
|
||
| with ctrl: | ||
| assert ctrl._thread and ctrl._thread.is_alive() | ||
| time.sleep(0.2) | ||
| assert ctrl._thread is not None | ||
| ctrl._thread.join(timeout=2) | ||
| assert not (ctrl._thread and ctrl._thread.is_alive()) | ||
|
|
||
|
|
||
| if __name__ == "__main__": | ||
| test_cuda_controller_basic() | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The new `num_elements <= 0` guard raises `ValueError` inside `_keep_loop`, but this loop only catches `RuntimeError`, so `keep()` can report success and then the background thread immediately dies when `vram_to_keep` resolves to 0 (for example `--vram 0` from CLI/API). This leaves the controller inactive without a synchronous failure path; validation should happen before starting the thread, or `ValueError` should be handled and surfaced.

Useful? React with 👍 / 👎.