From c32d6e747400829c481f4ac02fd460161906e9a3 Mon Sep 17 00:00:00 2001 From: Atul Nair Date: Thu, 11 Jun 2026 20:29:50 -0400 Subject: [PATCH] Skip in-container sshd setup for single-node container runs ContainerOrchestrator.setup_sshd() ran a fixed command list ending in `/usr/sbin/sshd -p2224` and asserted every step succeeded, for every container run regardless of node count. The orch pytest fixture calls it unconditionally, so a single-node run on a minimal image with no /usr/sbin/sshd failed the whole fixture with a generic "SSH setup command failed" message and never ran the workload. The in-container sshd exists only so MPI (mpirun's plm_rsh_args -p 2224) can reach peer ranks on other nodes. A single-node run execs directly via docker exec and never distributes over MPI, so the sshd setup is dead weight there. Guard setup_sshd() to return True early when len(self.hosts) <= 1, after the container_id precondition. The host count lives on the orchestrator, so the decision belongs there; multinode runs are unchanged. --- cvs/core/orchestrators/container.py | 14 ++++++- .../orchestrators/unittests/test_container.py | 37 +++++++++++++++++++ 2 files changed, 50 insertions(+), 1 deletion(-) diff --git a/cvs/core/orchestrators/container.py b/cvs/core/orchestrators/container.py index 3116a4e2..0d1d2026 100644 --- a/cvs/core/orchestrators/container.py +++ b/cvs/core/orchestrators/container.py @@ -446,8 +446,16 @@ def setup_sshd(self): - Starts sshd on port 2224 - Validates SSH daemon started successfully + The in-container sshd exists solely so MPI (mpirun's ``plm_rsh_args -p + 2224``) can reach peer ranks on other nodes. A single-node run never + distributes over MPI -- it execs directly via ``docker exec`` -- so sshd + is dead weight there. On a single-host cluster this is a no-op, which + also lets minimal images that ship no ``/usr/sbin/sshd`` run single-node + without tripping the start/validate steps below. + Returns: - bool: True if SSH setup succeeded on all nodes, False otherwise + bool: True if SSH setup succeeded on all nodes (or was skipped as + unnecessary for a single-node run), False otherwise Raises: RuntimeError: If no containers are currently running @@ -455,6 +463,10 @@ def setup_sshd(self): if not self.container_id: raise RuntimeError("No containers running. Call setup_containers() first.") + if len(self.hosts) <= 1: + self.log.info("Single-node run: skipping in-container sshd setup (no inter-node MPI/SSH needed)") + return True + self.log.info(f"Setting up SSH daemon in containers: {self.container_id}") # Execute SSH setup commands diff --git a/cvs/core/orchestrators/unittests/test_container.py b/cvs/core/orchestrators/unittests/test_container.py index e71f582d..ee9eccec 100644 --- a/cvs/core/orchestrators/unittests/test_container.py +++ b/cvs/core/orchestrators/unittests/test_container.py @@ -222,6 +222,43 @@ def test_partition_by_status_missing_host_is_probe_failed(self): self.assertEqual(absent, []) self.assertEqual(probe_failed, ["h_gone"]) + # ------------------------------------------------------------------ + # setup_sshd single-node guard + # ------------------------------------------------------------------ + + def test_setup_sshd_single_node_skips_and_returns_true(self): + # The in-container sshd is only needed for multinode MPI. On a single-host + # cluster setup_sshd must short-circuit: no exec into the container, no + # dependency on the image shipping /usr/sbin/sshd. + orch, runtime = self._make(lifetime="per_run") + orch.hosts = ["10.0.0.1"] + orch.container_id = "cvs_iter_test" + self.assertTrue(orch.setup_sshd()) + runtime.exec.assert_not_called() + + def test_setup_sshd_requires_container_even_single_node(self): + # The container_id precondition is checked BEFORE the single-node guard, + # so a single-node orch with no running container still raises rather than + # silently returning True. + orch, _ = self._make(lifetime="per_run") + orch.hosts = ["10.0.0.1"] + self.assertIsNone(orch.container_id) + with self.assertRaises(RuntimeError): + orch.setup_sshd() + + @patch("time.sleep", lambda *_a, **_k: None) + def test_setup_sshd_multinode_attempts_setup(self): + # The guard must NOT skip a genuine multinode run: every setup command and + # the final validation probe are exec'd into the container. + orch, runtime = self._make(lifetime="per_run") + orch.container_id = "cvs_iter_test" + runtime.exec.return_value = { + "10.0.0.1": {"exit_code": 0}, + "10.0.0.2": {"exit_code": 0}, + } + self.assertTrue(orch.setup_sshd()) + self.assertTrue(runtime.exec.called) + # ------------------------------------------------------------------ # teardown_containers lifetime branching # ------------------------------------------------------------------