Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 13 additions & 1 deletion cvs/core/orchestrators/container.py
Original file line number Diff line number Diff line change
Expand Up @@ -446,15 +446,27 @@ def setup_sshd(self):
- Starts sshd on port 2224
- Validates SSH daemon started successfully

The in-container sshd exists solely so MPI (mpirun's ``plm_rsh_args -p
2224``) can reach peer ranks on other nodes. A single-node run never
distributes over MPI -- it execs directly via ``docker exec`` -- so sshd
is dead weight there. On a single-host cluster this is a no-op, which
also lets minimal images that ship no ``/usr/sbin/sshd`` run single-node
without tripping the start/validate steps below.

Returns:
bool: True if SSH setup succeeded on all nodes, False otherwise
bool: True if SSH setup succeeded on all nodes (or was skipped as
unnecessary for a single-node run), False otherwise

Raises:
RuntimeError: If no containers are currently running
"""
if not self.container_id:
raise RuntimeError("No containers running. Call setup_containers() first.")

if len(self.hosts) <= 1:
self.log.info("Single-node run: skipping in-container sshd setup (no inter-node MPI/SSH needed)")
return True

self.log.info(f"Setting up SSH daemon in containers: {self.container_id}")

# Execute SSH setup commands
Expand Down
37 changes: 37 additions & 0 deletions cvs/core/orchestrators/unittests/test_container.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,43 @@ def test_partition_by_status_missing_host_is_probe_failed(self):
self.assertEqual(absent, [])
self.assertEqual(probe_failed, ["h_gone"])

# ------------------------------------------------------------------
# setup_sshd single-node guard
# ------------------------------------------------------------------

def test_setup_sshd_single_node_skips_and_returns_true(self):
    # Only multinode MPI needs the in-container sshd. With exactly one host
    # configured, setup_sshd has to return early with success: nothing may be
    # exec'd into the container, so an image without /usr/sbin/sshd still works.
    orchestrator, runtime_mock = self._make(lifetime="per_run")
    orchestrator.hosts = ["10.0.0.1"]
    orchestrator.container_id = "cvs_iter_test"

    succeeded = orchestrator.setup_sshd()

    self.assertTrue(succeeded)
    # The short-circuit must happen before any command reaches the runtime.
    runtime_mock.exec.assert_not_called()

def test_setup_sshd_requires_container_even_single_node(self):
    # Ordering check: setup_sshd validates container_id first and only then
    # applies the single-node shortcut. A one-host orchestrator that has no
    # running container must therefore raise, not quietly report success.
    orchestrator, _runtime = self._make(lifetime="per_run")
    orchestrator.hosts = ["10.0.0.1"]
    # Fixture precondition: no container has been started yet.
    self.assertIsNone(orchestrator.container_id)
    self.assertRaises(RuntimeError, orchestrator.setup_sshd)

@patch("time.sleep", lambda *_a, **_k: None)
def test_setup_sshd_multinode_attempts_setup(self):
    # The single-node guard must NOT swallow a genuine multinode run: with more
    # than one host, setup_sshd has to go on and exec into the containers.
    # time.sleep is patched to a no-op so the test does not wait on real delays
    # (presumably the setup path sleeps while sshd comes up -- confirm against
    # setup_sshd's body).
    orch, runtime = self._make(lifetime="per_run")
    # Pin the fixture precondition this test depends on instead of relying on
    # it silently: the guard only lets the run through when there are 2+ hosts.
    self.assertGreater(
        len(orch.hosts),
        1,
        "fixture must provide a multinode cluster for this test to be meaningful",
    )
    orch.container_id = "cvs_iter_test"
    # Every exec reports success on both hosts, so setup is expected to pass.
    runtime.exec.return_value = {
        "10.0.0.1": {"exit_code": 0},
        "10.0.0.2": {"exit_code": 0},
    }
    self.assertTrue(orch.setup_sshd())
    # Only "at least one exec happened" is asserted here; the exact number of
    # setup/validation commands is deliberately not pinned.
    runtime.exec.assert_called()

# ------------------------------------------------------------------
# teardown_containers lifetime branching
# ------------------------------------------------------------------
Expand Down
Loading