diff --git a/cvs/core/orchestrators/container.py b/cvs/core/orchestrators/container.py index 61666fe4..3116a4e2 100644 --- a/cvs/core/orchestrators/container.py +++ b/cvs/core/orchestrators/container.py @@ -80,7 +80,6 @@ def __init__(self, log, config, stop_on_errors=False): if not self.container_config: raise ValueError("ContainerOrchestrator requires 'container' config in OrchestratorConfig") - self.container_enabled = True self.container_id = None # Track running container ID # Initialize container runtime @@ -252,6 +251,35 @@ def is_privileged(self): runtime_args = runtime_config.get('args', {}) return runtime_args.get('privileged', DEFAULT_CONTAINER_ARGS['privileged']) + def _partition_by_status(self, status): + """Partition the expected hosts into (running, absent, probe_failed). + + The single source of truth for interpreting a runtime is_running() result. + Both the persistent setup branch and verify_containers_running consume this + so they cannot drift apart on what counts as running vs unreachable. + + running : probe succeeded and the named container is up. + absent : probe succeeded and no such container is running. + probe_failed : the probe cannot be trusted -- non-zero exit_code (an + SSH/sudo/docker error or a timeout) OR the host is missing + from status entirely (pruned as unreachable by an earlier + command). Either way the container's true state is unknown. + + Iterates the expected host set (self.hosts), not status.keys(), so a host + that dropped out of the probe is surfaced as probe_failed rather than + silently ignored. + """ + running, absent, probe_failed = [], [], [] + for host in self.hosts: + info = status.get(host) + if info is None or info.get('exit_code') != 0: + probe_failed.append(host) + elif info.get('running'): + running.append(host) + else: + absent.append(host) + return running, absent, probe_failed + def setup_containers( self, volumes=None, @@ -263,12 +291,15 @@ def setup_containers( ulimits=None, ): """ - Set up containers based on configuration. + Set up containers according to the configured container.lifetime policy. This method should be called explicitly by tests when they need containers. - It respects the 'enabled' flag and handles container lifecycle appropriately. - If 'launch' is true, checks that containers are already running and sets container_id. - If containers are not running when expected, fails and informs the user. + Behavior branches on container.lifetime: + - 'no_launch' : verify a container with the configured name is running + and set container_id; never starts anything. + - 'per_run' : start fresh containers on all hosts. + - 'persistent' : attach to a container already running on all hosts, + otherwise start fresh. Idempotent. Args: volumes: Optional list of volume mounts (uses standards if not provided) @@ -282,67 +313,128 @@ def setup_containers( Returns: bool: True if containers were set up successfully or no setup needed """ - if not self.container_config or not self.container_config.get('enabled', False): - self.log.debug("Container mode not enabled, skipping setup") - return True - - if self.container_config.get('launch', False): - # Launch containers - self.log.info("Launching containers...") - container_name = self.get_container_name(self.container_config, self.container_config['image']) - self.container_id = container_name - - # Use provided parameters or get standards - volumes = volumes if volumes is not None else self.get_volumes() - devices = devices if devices is not None else self.get_devices() - capabilities = capabilities if capabilities is not None else self.get_capabilities() - security_opts = security_opts if security_opts is not None else self.get_security_opts() - environment = environment if environment is not None else self.get_environment() - groups = groups if groups is not None else self.get_groups() - ulimits = ulimits if ulimits is not None else self.get_ulimits() - - # Create a modified container config with standard settings - modified_config = dict(self.container_config) - - # Ensure runtime config exists - if 'runtime' not in modified_config: - modified_config['runtime'] = {} - if 'args' not in modified_config['runtime']: - modified_config['runtime']['args'] = {} - - # Set standard runtime args if not already set - runtime_args = modified_config['runtime']['args'] - if 'network' not in runtime_args: - runtime_args['network'] = self.get_network_mode() - if 'ipc' not in runtime_args: - runtime_args['ipc'] = self.get_ipc_mode() - if 'privileged' not in runtime_args: - runtime_args['privileged'] = self.is_privileged() - - # Add InfiniBand device discovery via shell expansion (per-host) - ib_device_expansion = '$(for dev in /dev/infiniband/*; do echo -n "--device $dev:$dev "; done)' - - return self.runtime.setup_containers( - modified_config, - container_name, - volumes=volumes, - devices=devices, - capabilities=capabilities, - security_opts=security_opts, - environment=environment, - groups=groups, - ulimits=ulimits, - device_expansion=ib_device_expansion, - ) + lifetime = self.container_config.get('lifetime', 'per_run') - # Assume containers are running image = self.container_config.get('image') if not image: self.log.error("Container image not specified in config") return False - container_name = self.get_container_name(self.container_config, image) - return self.verify_containers_running(container_name) + + if lifetime == 'no_launch': + # CVS never launches it: verify only, never start. + return self.verify_containers_running(container_name) + + if lifetime == 'persistent': + status = self.runtime.is_running(container_name) + running_hosts, absent_hosts, probe_failed_hosts = self._partition_by_status(status) + + if probe_failed_hosts: + # The probe could not be trusted on these hosts (SSH/sudo/docker + # error, timeout, or the host dropped out). We cannot tell "absent" + # from "actually running but unreachable", and treating unreachable + # as absent would let the cold-start path below force-remove a + # container that is in fact running -- destroying its overlay. Refuse. + self.log.error( + f"Cannot determine state of persistent container '{container_name}' on " + f"{probe_failed_hosts} (probe failed: SSH/sudo/docker error, timeout, or " + f"host unreachable). Refusing to act on an untrustworthy probe -- resolve " + f"host/daemon health and rerun." + ) + return False + + if running_hosts and not absent_hosts: + # Running on every host: attach. + self.container_id = container_name + self.log.info(f"Attaching to running container '{container_name}'") + return True + + if running_hosts and absent_hosts: + # Partial: refuse to auto-relaunch. _launch_containers force-removes + # the same-named container on ALL hosts before recreating, which would + # destroy the overlay (installs, clones) on the still-running hosts -- + # the opposite of what 'persistent' promises. Fail loudly and let the + # user choose: remove on all hosts and rerun (clean rebuild), or + # restart the container on the missing hosts to reattach. + self.log.error( + f"Persistent container '{container_name}' is running on {running_hosts} " + f"but missing on {absent_hosts}. Refusing to auto-relaunch: that would " + f"force-remove and rebuild the containers on the still-running hosts, " + f"destroying their overlay. Either remove '{container_name}' on all hosts " + f"and rerun, or restart it on {absent_hosts} to reattach." + ) + return False + + # Genuinely absent on every host: legitimate cold start, launch fresh on all. + self.log.info("Persistent container not running on any host, launching...") + return self._launch_containers(volumes, devices, capabilities, security_opts, environment, groups, ulimits) + + # 'per_run' (default): always start fresh. + return self._launch_containers(volumes, devices, capabilities, security_opts, environment, groups, ulimits) + + def _launch_containers( + self, + volumes=None, + devices=None, + capabilities=None, + security_opts=None, + environment=None, + groups=None, + ulimits=None, + ): + """Start fresh containers on all hosts via the runtime. + + Shared by the 'per_run' path and the 'persistent' path when no container + is already running. The runtime force-removes any stale same-named + container before starting. + """ + self.log.info("Launching containers...") + container_name = self.get_container_name(self.container_config, self.container_config['image']) + self.container_id = container_name + + # Use provided parameters or get standards + volumes = volumes if volumes is not None else self.get_volumes() + devices = devices if devices is not None else self.get_devices() + capabilities = capabilities if capabilities is not None else self.get_capabilities() + security_opts = security_opts if security_opts is not None else self.get_security_opts() + environment = environment if environment is not None else self.get_environment() + groups = groups if groups is not None else self.get_groups() + ulimits = ulimits if ulimits is not None else self.get_ulimits() + + # Create a modified container config with standard settings + modified_config = dict(self.container_config) + + # Ensure runtime config exists + if 'runtime' not in modified_config: + modified_config['runtime'] = {} + if 'args' not in modified_config['runtime']: + modified_config['runtime']['args'] = {} + + # Set standard runtime args if not already set + runtime_args = modified_config['runtime']['args'] + if 'network' not in runtime_args: + runtime_args['network'] = self.get_network_mode() + if 'ipc' not in runtime_args: + runtime_args['ipc'] = self.get_ipc_mode() + if 'privileged' not in runtime_args: + runtime_args['privileged'] = self.is_privileged() + + # Add InfiniBand device discovery via shell expansion (per-host) + ib_device_expansion = '$(for dev in /dev/infiniband/*; do echo -n "--device $dev:$dev "; done)' + + launched = self.runtime.setup_containers( + modified_config, + container_name, + volumes=volumes, + devices=devices, + capabilities=capabilities, + security_opts=security_opts, + environment=environment, + groups=groups, + ulimits=ulimits, + device_expansion=ib_device_expansion, + ) + return launched def setup_sshd(self): """ @@ -414,21 +506,14 @@ def verify_containers_running(self, container_name): bool: True if container is running on all hosts, False otherwise """ self.log.debug(f"Checking if container '{container_name}' is running on all hosts") - result = self.runtime.is_running(container_name) - - # Verify container is running on all hosts - failed_hosts = [] - for host, info in result.items(): - exit_code = info.get('exit_code') - if exit_code != 0: - failed_hosts.append(f"{host} (exit code {exit_code})") - continue - if not info.get('running'): - running_name = info.get('name', '') - failed_hosts.append(f"{host} (container not running, found: '{running_name}')") - - if failed_hosts: - self.log.error(f"Container '{container_name}' not running on hosts: {failed_hosts}") + status = self.runtime.is_running(container_name) + _running, absent_hosts, probe_failed_hosts = self._partition_by_status(status) + + if absent_hosts or probe_failed_hosts: + self.log.error( + f"Container '{container_name}' not running on all hosts: " + f"not running on {absent_hosts}, probe failed on {probe_failed_hosts}" + ) return False self.container_id = container_name @@ -437,21 +522,22 @@ def verify_containers_running(self, container_name): def teardown_containers(self): """ - Tear down containers if they are running. + Tear down containers according to the configured container.lifetime policy. - This method should be called explicitly by tests for cleanup. - It respects the 'enabled' flag and handles container lifecycle appropriately. - If 'launch' is False, assumes containers should not be stopped (externally managed). + This method should be called explicitly by tests for cleanup. Behavior + branches on container.lifetime: + - 'no_launch' : no-op (CVS does not own a container it did not launch). + - 'persistent' : no-op (left running for the next run; user removes it + explicitly). + - 'per_run' : force-remove the container CVS started. Returns: bool: True if containers were torn down successfully or no teardown needed """ - if not self.container_config or not self.container_config.get('enabled', False): - self.log.debug("Container mode not enabled, skipping teardown") - return True + lifetime = self.container_config.get('lifetime', 'per_run') - if not self.container_config.get('launch', False): - self.log.debug("launch is False, not stopping externally managed containers") + if lifetime in ('no_launch', 'persistent'): + self.log.debug(f"lifetime={lifetime}, leaving containers running") return True if not self.container_id: diff --git a/cvs/core/orchestrators/factory.py b/cvs/core/orchestrators/factory.py index 35203a2f..f780f08d 100644 --- a/cvs/core/orchestrators/factory.py +++ b/cvs/core/orchestrators/factory.py @@ -9,6 +9,51 @@ from cvs.core.orchestrators.container import ContainerOrchestrator +VALID_CONTAINER_LIFETIMES = ("no_launch", "per_run", "persistent") + + +def _resolve_container_lifetime(container): + """Normalize a container config block to a single resolved ``lifetime`` key. + + The legacy two-axis schema (``enabled`` + ``launch``) is removed in favor of + one tri-valued ``container.lifetime``. Mutates and returns the passed dict. + + Resolution rules (first match wins): + - ``enabled`` present (any value) -> ``ValueError`` (removed field). + - ``launch`` present (any value) -> ``ValueError`` (removed field). Both + removed fields fail loudly rather than being silently mapped, so a stale + flag can never quietly override an explicit ``lifetime``. + - ``lifetime`` present -> validated, kept as-is. + - none of the above -> default ``per_run``. + + An empty/absent container block is returned untouched (baremetal path). + """ + if not container: + return container + + if 'enabled' in container: + raise ValueError( + "container.enabled is removed; delete the field and set " + "container.lifetime to one of 'no_launch', 'per_run', 'persistent'" + ) + + if 'launch' in container: + raise ValueError( + "container.launch is removed; delete the field and set " + "container.lifetime ('launch: true' -> 'per_run', " + "'launch: false' -> 'no_launch')" + ) + + if 'lifetime' in container: + lifetime = container['lifetime'] + if lifetime not in VALID_CONTAINER_LIFETIMES: + raise ValueError(f"container.lifetime must be one of {VALID_CONTAINER_LIFETIMES}, got {lifetime!r}") + return container + + container['lifetime'] = 'per_run' + return container + + class OrchestratorConfig: """ Configuration for orchestrator creation. @@ -26,8 +71,7 @@ class OrchestratorConfig: Example container configuration: ```json "container": { - "enabled": true, - "launch": false, + "lifetime": "per_run", "runtime": { "name": "docker", "args": { @@ -47,7 +91,14 @@ class OrchestratorConfig: "name": "myuser_rocm_cvs_latest" } ``` - launch: Containers are already running, test suite should not start/stop them [default: false] + lifetime: Container lifecycle policy [default: 'per_run'] + - 'no_launch' : CVS never launches the container. Setup verifies a + container with the configured name is already running + on every host; teardown is a no-op. + - 'per_run' : start at setup, remove at teardown (the default). + - 'persistent' : start if absent / attach if present; never torn down + by the run. Pin container.name explicitly under this + mode (the default _ name shifts on tag bumps). """ def __init__(self, **kwargs): @@ -72,7 +123,8 @@ def __init__(self, **kwargs): self.priv_key_file = kwargs['priv_key_file'] self.password = kwargs.get('password') self.head_node_dict = kwargs.get('head_node_dict', {}) - self.container = kwargs.get('container', {}) + # Normalize here (not in from_configs) so direct construction is validated too. + self.container = _resolve_container_lifetime(kwargs.get('container', {})) def get(self, key, default=None): """Get configuration value with default.""" @@ -97,7 +149,7 @@ def from_configs(cls, cluster_config, testsuite_config=None): Required keys: orchestrator, node_dict, username, priv_key_file Optional keys: container, head_node_dict, password (defaults provided for missing optional keys) - Container structure: {enabled: bool, launch: bool, runtime: {name: str, args: dict}, image: str, name: str, ...} + Container structure: {lifetime: 'no_launch'|'per_run'|'persistent', runtime: {name: str, args: dict}, image: str, name: str, ...} testsuite_config: Test suite specific configuration (dict or path to _config.json) Can override any keys from cluster_config diff --git a/cvs/core/orchestrators/unittests/test_container.py b/cvs/core/orchestrators/unittests/test_container.py index 0d3e35ba..e71f582d 100644 --- a/cvs/core/orchestrators/unittests/test_container.py +++ b/cvs/core/orchestrators/unittests/test_container.py @@ -7,20 +7,44 @@ # Unit tests for cvs/core/orchestrators/container.py: ContainerOrchestrator construction # (incl. SSH port override and runtime wiring) and the setup_containers / teardown_containers -# short-circuit semantics inherited by the rvs_cvs.py orch fixture. Mocks Pssh and -# RuntimeFactory so tests run with no SSH or container runtime. +# lifetime branching inherited by the rvs_cvs.py orch fixture. Mocks Pssh and RuntimeFactory +# so tests run with no SSH or container runtime. # -# The teardown_containers short-circuit at cvs/core/orchestrators/container.py:439 is -# pinned here so any future change has a loud canary. +# The per-lifetime setup/teardown behavior is pinned here so any future change to the +# no_launch / per_run / persistent contract has a loud canary. Pssh + RuntimeFactory are +# patched once in setUp (not per method); _make() returns a fresh orch + runtime mock. import unittest from unittest.mock import MagicMock, patch -from cvs.core.orchestrators.factory import OrchestratorConfig +from cvs.core.orchestrators.factory import OrchestratorConfig, _resolve_container_lifetime from cvs.core.orchestrators.container import ContainerOrchestrator -def _make_orch_config(launch=False, enabled=True): +# Reusable runtime.is_running fixtures (two-host cluster). +_RUNNING = { + "10.0.0.1": {"running": True, "exit_code": 0, "name": "cvs_iter_test"}, + "10.0.0.2": {"running": True, "exit_code": 0, "name": "cvs_iter_test"}, +} +_NOT_RUNNING = { + "10.0.0.1": {"running": False, "exit_code": 0, "name": ""}, + "10.0.0.2": {"running": False, "exit_code": 0, "name": ""}, +} +# Probe itself failed on every host (non-zero exit: SSH/sudo/docker error or +# timeout). 'running' is False but it is NOT trustworthy -- state is unknown. +_PROBE_FAILED = { + "10.0.0.1": {"running": False, "exit_code": 1, "name": ""}, + "10.0.0.2": {"running": False, "exit_code": -1, "name": ""}, +} +# Container up on one host; probe aborted (SSH timeout, exit_code -1) on the +# other. The aborted host must be treated as unknown, never as "absent". +_RUNNING_PLUS_PROBE_FAILED = { + "10.0.0.1": {"running": True, "exit_code": 0, "name": "cvs_iter_test"}, + "10.0.0.2": {"running": False, "exit_code": -1, "name": ""}, +} + + +def _make_orch_config(lifetime="per_run"): """Minimal OrchestratorConfig that satisfies ContainerOrchestrator.__init__ without touching disk or SSH.""" return OrchestratorConfig( @@ -31,8 +55,7 @@ def _make_orch_config(launch=False, enabled=True): password=None, head_node_dict={"mgmt_ip": "10.0.0.1"}, container={ - "enabled": enabled, - "launch": launch, + "lifetime": lifetime, "image": "rocm/cvs:test", "name": "cvs_iter_test", "runtime": {"name": "docker", "args": {}}, @@ -41,65 +64,185 @@ def _make_orch_config(launch=False, enabled=True): class TestContainerOrchestrator(unittest.TestCase): - def _make(self, _mock_pssh, _mock_rf, launch=False, enabled=True): - cfg = _make_orch_config(launch=launch, enabled=enabled) + def setUp(self): + # Patch SSH transport + runtime factory for every test (replaces the + # per-method @patch decorators). Mocks are torn down via addCleanup. + p_pssh = patch("cvs.core.orchestrators.baremetal.Pssh") + p_rf = patch("cvs.core.orchestrators.container.RuntimeFactory") + self.mock_pssh = p_pssh.start() + self.mock_rf = p_rf.start() + self.addCleanup(p_pssh.stop) + self.addCleanup(p_rf.stop) + + def _make(self, lifetime="per_run"): + cfg = _make_orch_config(lifetime=lifetime) runtime = MagicMock(name="docker_runtime") - _mock_rf.create.return_value = runtime + self.mock_rf.create.return_value = runtime orch = ContainerOrchestrator(MagicMock(), cfg) return orch, runtime - @patch("cvs.core.orchestrators.container.RuntimeFactory") - @patch("cvs.core.orchestrators.baremetal.Pssh") - def test_init_creates_runtime_via_factory(self, _mock_pssh, mock_rf): - orch, runtime = self._make(_mock_pssh, mock_rf) + # ------------------------------------------------------------------ + # construction + # ------------------------------------------------------------------ + + def test_init_creates_runtime_via_factory(self): + orch, runtime = self._make() self.assertIs(orch.runtime, runtime) # docker is the default runtime when container.runtime.name == "docker". - mock_rf.create.assert_called_once() - self.assertEqual(mock_rf.create.call_args[0][0], "docker") + self.mock_rf.create.assert_called_once() + self.assertEqual(self.mock_rf.create.call_args[0][0], "docker") - @patch("cvs.core.orchestrators.container.RuntimeFactory") - @patch("cvs.core.orchestrators.baremetal.Pssh") - def test_init_sets_orchestrator_type(self, _mock_pssh, mock_rf): - orch, _ = self._make(_mock_pssh, mock_rf) + def test_init_sets_orchestrator_type(self): + orch, _ = self._make() self.assertEqual(orch.orchestrator_type, "container") - @patch("cvs.core.orchestrators.container.RuntimeFactory") - @patch("cvs.core.orchestrators.baremetal.Pssh") - def test_init_overrides_ssh_port_to_container_sshd(self, _mock_pssh, mock_rf): - orch, _ = self._make(_mock_pssh, mock_rf) + def test_init_overrides_ssh_port_to_container_sshd(self): + orch, _ = self._make() self.assertEqual(orch.ssh_port, 2224) - @patch("cvs.core.orchestrators.container.RuntimeFactory") - @patch("cvs.core.orchestrators.baremetal.Pssh") - def test_setup_containers_launch_true_delegates_to_runtime(self, _mock_pssh, mock_rf): - orch, runtime = self._make(_mock_pssh, mock_rf, launch=True) + def test_init_requires_container_config(self): + # ContainerOrchestrator raises if 'container' config is empty. + cfg = OrchestratorConfig( + orchestrator="container", + node_dict={"10.0.0.1": {}}, + username="testuser", + priv_key_file="/dev/null", + container={}, + ) + with self.assertRaises(ValueError): + ContainerOrchestrator(MagicMock(), cfg) + + # ------------------------------------------------------------------ + # setup_containers lifetime branching + # ------------------------------------------------------------------ + + def test_setup_containers_per_run_delegates_to_runtime(self): + orch, runtime = self._make(lifetime="per_run") runtime.setup_containers.return_value = True - result = orch.setup_containers() - self.assertTrue(result) + self.assertTrue(orch.setup_containers()) runtime.setup_containers.assert_called_once() - @patch("cvs.core.orchestrators.container.RuntimeFactory") - @patch("cvs.core.orchestrators.baremetal.Pssh") - def test_setup_containers_short_circuits_when_disabled(self, _mock_pssh, mock_rf): - orch, runtime = self._make(_mock_pssh, mock_rf, enabled=False) - result = orch.setup_containers() - self.assertTrue(result) # Disabled is treated as success / no-op. + def test_setup_containers_no_launch_verifies_only(self): + # no_launch never starts a container; it verifies and sets container_id. + orch, runtime = self._make(lifetime="no_launch") + runtime.is_running.return_value = _RUNNING + self.assertTrue(orch.setup_containers()) runtime.setup_containers.assert_not_called() + self.assertEqual(orch.container_id, "cvs_iter_test") - @patch("cvs.core.orchestrators.container.RuntimeFactory") - @patch("cvs.core.orchestrators.baremetal.Pssh") - def test_teardown_containers_short_circuits_when_launch_false(self, _mock_pssh, mock_rf): - # launch:false means containers are externally managed; teardown is a no-op. - orch, runtime = self._make(_mock_pssh, mock_rf, launch=False) + def test_setup_containers_no_launch_not_running_fails(self): + # no_launch + container not actually running -> verification fails; nothing + # is launched. + orch, runtime = self._make(lifetime="no_launch") + runtime.is_running.return_value = { + "10.0.0.1": {"running": False, "exit_code": 0, "name": ""}, + } + self.assertFalse(orch.setup_containers()) + runtime.setup_containers.assert_not_called() + + def test_setup_containers_persistent_attaches_when_running(self): + orch, runtime = self._make(lifetime="persistent") + runtime.is_running.return_value = _RUNNING + self.assertTrue(orch.setup_containers()) + # Attach path: no new container started. + runtime.setup_containers.assert_not_called() + self.assertEqual(orch.container_id, "cvs_iter_test") + + def test_setup_containers_persistent_starts_when_not_running(self): + orch, runtime = self._make(lifetime="persistent") + runtime.is_running.return_value = _NOT_RUNNING + runtime.setup_containers.return_value = True + self.assertTrue(orch.setup_containers()) + runtime.setup_containers.assert_called_once() + + def test_setup_containers_persistent_partial_running_refuses(self): + # Running on some hosts but not all must NOT auto-relaunch (that would + # force-remove and rebuild the still-running hosts, destroying their + # overlay). It fails loudly and starts nothing. + orch, runtime = self._make(lifetime="persistent") + runtime.is_running.return_value = { + "10.0.0.1": {"running": True, "exit_code": 0, "name": "cvs_iter_test"}, + "10.0.0.2": {"running": False, "exit_code": 0, "name": ""}, + } + self.assertFalse(orch.setup_containers()) + runtime.setup_containers.assert_not_called() + + def test_setup_containers_persistent_probe_failed_refuses_and_does_not_launch(self): + # Probe failed on every host (non-zero exit). The container's true state + # is unknown -- treating it as "absent" and cold-starting would force-remove + # a possibly-running container and destroy its overlay. Refuse, launch nothing. + orch, runtime = self._make(lifetime="persistent") + runtime.is_running.return_value = _PROBE_FAILED + self.assertFalse(orch.setup_containers()) + runtime.setup_containers.assert_not_called() + + def test_setup_containers_persistent_running_plus_probe_failed_refuses(self): + # Up on one host, probe aborted on the other. The aborted host must not be + # bucketed as "absent" (which would trip the partial-relaunch path); an + # untrustworthy probe is its own hard stop. Nothing is launched. + orch, runtime = self._make(lifetime="persistent") + runtime.is_running.return_value = _RUNNING_PLUS_PROBE_FAILED + self.assertFalse(orch.setup_containers()) + runtime.setup_containers.assert_not_called() + + def test_setup_containers_persistent_host_dropped_from_status_refuses(self): + # A host expected by node_dict is absent from is_running's result entirely + # (pruned as unreachable by an earlier command). It must surface as a probe + # failure, not be silently ignored into a false "running on all" attach. + orch, runtime = self._make(lifetime="persistent") + runtime.is_running.return_value = { + "10.0.0.1": {"running": True, "exit_code": 0, "name": "cvs_iter_test"}, + # 10.0.0.2 missing + } + self.assertFalse(orch.setup_containers()) + runtime.setup_containers.assert_not_called() + self.assertIsNone(orch.container_id) + + def test_partition_by_status_three_buckets(self): + # Direct test of the shared partition helper: one host per bucket. + orch, _ = self._make(lifetime="persistent") + orch.hosts = ["h_run", "h_absent", "h_failed"] + status = { + "h_run": {"running": True, "exit_code": 0, "name": "cvs_iter_test"}, + "h_absent": {"running": False, "exit_code": 0, "name": ""}, + "h_failed": {"running": False, "exit_code": 1, "name": ""}, + } + running, absent, probe_failed = orch._partition_by_status(status) + self.assertEqual(running, ["h_run"]) + self.assertEqual(absent, ["h_absent"]) + self.assertEqual(probe_failed, ["h_failed"]) + + def test_partition_by_status_missing_host_is_probe_failed(self): + # A host expected in self.hosts but absent from status -> probe_failed. + orch, _ = self._make(lifetime="persistent") + orch.hosts = ["h_run", "h_gone"] + status = {"h_run": {"running": True, "exit_code": 0, "name": "cvs_iter_test"}} + running, absent, probe_failed = orch._partition_by_status(status) + self.assertEqual(running, ["h_run"]) + self.assertEqual(absent, []) + self.assertEqual(probe_failed, ["h_gone"]) + + # ------------------------------------------------------------------ + # teardown_containers lifetime branching + # ------------------------------------------------------------------ + + def test_teardown_containers_short_circuits_when_lifetime_no_launch(self): + # no_launch means CVS did not launch the container; teardown is a no-op. + orch, runtime = self._make(lifetime="no_launch") orch.container_id = "test_container" self.assertTrue(orch.teardown_containers()) runtime.teardown_containers.assert_not_called() - @patch("cvs.core.orchestrators.container.RuntimeFactory") - @patch("cvs.core.orchestrators.baremetal.Pssh") - def test_teardown_containers_calls_runtime_when_launch_true(self, _mock_pssh, mock_rf): - # launch:true means CVS owns the container lifecycle; teardown delegates to runtime. - orch, runtime = self._make(_mock_pssh, mock_rf, launch=True) + def test_teardown_containers_persistent_is_noop(self): + # persistent leaves the container running for the next run. + orch, runtime = self._make(lifetime="persistent") + orch.container_id = "test_container" + self.assertTrue(orch.teardown_containers()) + runtime.teardown_containers.assert_not_called() + + def test_teardown_containers_calls_runtime_when_lifetime_per_run(self): + # per_run means CVS owns the container lifecycle; teardown delegates to runtime. + orch, runtime = self._make(lifetime="per_run") orch.container_id = "test_container" runtime.teardown_containers.return_value = True self.assertTrue(orch.teardown_containers()) @@ -107,38 +250,72 @@ def test_teardown_containers_calls_runtime_when_launch_true(self, _mock_pssh, mo # container_id cleared on successful teardown. self.assertIsNone(orch.container_id) - @patch("cvs.core.orchestrators.container.RuntimeFactory") - @patch("cvs.core.orchestrators.baremetal.Pssh") - def test_teardown_containers_short_circuits_when_disabled(self, _mock_pssh, mock_rf): - orch, runtime = self._make(_mock_pssh, mock_rf, enabled=False) - orch.container_id = "test_container" - self.assertTrue(orch.teardown_containers()) - runtime.teardown_containers.assert_not_called() - - @patch("cvs.core.orchestrators.container.RuntimeFactory") - @patch("cvs.core.orchestrators.baremetal.Pssh") - def test_teardown_containers_short_circuits_when_no_container_id(self, _mock_pssh, mock_rf): - # container_id stays None when prepare/setup never ran; teardown is a no-op - # even with launch=True (CVS-owned), since there is nothing to tear down. - orch, runtime = self._make(_mock_pssh, mock_rf, launch=True) + def test_teardown_containers_short_circuits_when_no_container_id(self): + # container_id stays None when setup never ran; per_run teardown is a no-op + # since there is nothing to tear down. + orch, runtime = self._make(lifetime="per_run") self.assertIsNone(orch.container_id) self.assertTrue(orch.teardown_containers()) runtime.teardown_containers.assert_not_called() - @patch("cvs.core.orchestrators.container.RuntimeFactory") + +class TestResolveContainerLifetime(unittest.TestCase): + """One assertion per row of the lifetime resolution table.""" + + def test_enabled_present_raises(self): + with self.assertRaises(ValueError) as ctx: + _resolve_container_lifetime({"enabled": True, "image": "x"}) + self.assertIn("enabled", str(ctx.exception)) + + def test_enabled_false_also_raises(self): + # Any value of the removed field is a hard error, including False. + with self.assertRaises(ValueError): + _resolve_container_lifetime({"enabled": False, "image": "x"}) + + def test_explicit_lifetime_kept(self): + out = _resolve_container_lifetime({"lifetime": "persistent", "image": "x"}) + self.assertEqual(out["lifetime"], "persistent") + + def test_invalid_lifetime_raises(self): + with self.assertRaises(ValueError): + _resolve_container_lifetime({"lifetime": "forever", "image": "x"}) + + def test_launch_present_raises(self): + # launch is a removed field: any value is a hard error (no silent mapping). + with self.assertRaises(ValueError) as ctx: + _resolve_container_lifetime({"launch": True, "image": "x"}) + self.assertIn("launch", str(ctx.exception)) + + def test_launch_false_also_raises(self): + with self.assertRaises(ValueError): + _resolve_container_lifetime({"launch": False, "image": "x"}) + + def test_launch_alongside_lifetime_raises(self): + # A stale launch flag next to an explicit lifetime must fail loudly rather + # than being silently retained/ignored. + with self.assertRaises(ValueError): + _resolve_container_lifetime({"lifetime": "per_run", "launch": True, "image": "x"}) + + def test_no_policy_keys_defaults_to_per_run(self): + out = _resolve_container_lifetime({"image": "x"}) + self.assertEqual(out["lifetime"], "per_run") + + def test_empty_block_untouched(self): + # Baremetal path: empty container block stays empty (no lifetime injected). + self.assertEqual(_resolve_container_lifetime({}), {}) + @patch("cvs.core.orchestrators.baremetal.Pssh") - def test_init_requires_container_config(self, _mock_pssh, _mock_rf): - # ContainerOrchestrator raises if 'container' config is empty. Construct - # the config directly to bypass the helper's auto-population. - cfg = OrchestratorConfig( - orchestrator="container", - node_dict={"10.0.0.1": {}}, - username="testuser", - priv_key_file="/dev/null", - container={}, - ) + def test_orchestratorconfig_init_rejects_launch(self, _mock_pssh): + # Direct construction routes through __init__ -> the same helper, so a + # legacy launch flag is rejected identically to from_configs. with self.assertRaises(ValueError): - ContainerOrchestrator(MagicMock(), cfg) + OrchestratorConfig( + orchestrator="container", + node_dict={"1.1.1.1": {}}, + username="u", + priv_key_file="/dev/null", + container={"launch": True, "image": "x"}, + ) if __name__ == "__main__": diff --git a/cvs/core/orchestrators/unittests/test_factory.py b/cvs/core/orchestrators/unittests/test_factory.py index 10d01382..bf96f01e 100644 --- a/cvs/core/orchestrators/unittests/test_factory.py +++ b/cvs/core/orchestrators/unittests/test_factory.py @@ -15,12 +15,15 @@ import unittest from unittest.mock import MagicMock, patch -from cvs.core.orchestrators.factory import OrchestratorConfig, OrchestratorFactory +from cvs.core.orchestrators.factory import ( + OrchestratorConfig, + OrchestratorFactory, +) from cvs.core.orchestrators.baremetal import BaremetalOrchestrator from cvs.core.orchestrators.container import ContainerOrchestrator -def _make_orch_config(orchestrator="baremetal", with_container=False, launch=False, enabled=True): +def _make_orch_config(orchestrator="baremetal", with_container=False, lifetime="per_run"): """Minimal OrchestratorConfig that satisfies BaremetalOrchestrator and (optionally) ContainerOrchestrator __init__ without touching disk or SSH.""" kwargs = dict( @@ -34,8 +37,7 @@ def _make_orch_config(orchestrator="baremetal", with_container=False, launch=Fal ) if with_container or orchestrator == "container": kwargs["container"] = { - "enabled": enabled, - "launch": launch, + "lifetime": lifetime, "image": "rocm/cvs:test", "name": "cvs_iter_test", "runtime": {"name": "docker", "args": {}}, @@ -181,7 +183,7 @@ def test_from_configs_resolves_user_id_throughout_container_block(self): "username": "{user-id}", "priv_key_file": "/home/{user-id}/.ssh/id_rsa", "container": { - "enabled": True, + "lifetime": "per_run", "image": "rocm/{user-id}:test", "name": "{user-id}_cvs", "runtime": { diff --git a/cvs/core/runtimes/docker.py b/cvs/core/runtimes/docker.py index 49cfef14..3b902072 100644 --- a/cvs/core/runtimes/docker.py +++ b/cvs/core/runtimes/docker.py @@ -44,8 +44,8 @@ def setup_containers( Args: container_config: Container configuration dictionary. Recognized - top-level keys include ``image``, ``launch``, ``runtime`` (with - nested ``args``), and ``env``. + top-level keys include ``image``, ``runtime`` (with nested + ``args``), and ``env``. container_name: Name to assign to the containers volumes: Optional list of volume mounts (overrides config) devices: Optional list of device passthroughs (overrides config) @@ -60,11 +60,6 @@ def setup_containers( self.log.warning("No container config or image specified, skipping container start") return False - launch = container_config.get('launch', False) - if not launch: - self.log.info("Container launch disabled, assuming containers are already running") - return True - image = container_config['image'] # Use provided parameters or fall back to config diff --git a/cvs/core/runtimes/unittests/test_docker.py b/cvs/core/runtimes/unittests/test_docker.py index bd1f84fc..18e7dbfc 100644 --- a/cvs/core/runtimes/unittests/test_docker.py +++ b/cvs/core/runtimes/unittests/test_docker.py @@ -43,13 +43,18 @@ def _fake_exec(cmd, timeout=None, detailed=False, print_console=True): return DockerRuntime(log, orchestrator) -def _container_config(launch=True, image="img:test", extra_runtime_args=None, **extra): - """Minimal container config dict for setup_containers.""" +def _container_config(image="img:test", lifetime="per_run", extra_runtime_args=None, **extra): + """Minimal container config dict for setup_containers. + + lifetime is carried for parity with a real config block, but the runtime never + reads it: lifecycle policy is resolved and branched on by the orchestrator, + which only calls runtime.setup_containers on start paths. The runtime is + policy-free, so the value here has no effect on its behavior. + """ cfg = { - "enabled": True, - "launch": launch, "image": image, "name": "cvs_iter_test", + "lifetime": lifetime, "runtime": {"name": "docker", "args": dict(extra_runtime_args or {})}, } cfg.update(extra) @@ -57,6 +62,21 @@ def _container_config(launch=True, image="img:test", extra_runtime_args=None, ** class TestDockerRuntimeSetupContainers(unittest.TestCase): + def test_setup_containers_always_proceeds(self): + # The legacy `if not launch: return True` short-circuit was removed. The + # orchestrator only calls runtime.setup_containers on start paths, so the + # runtime must always render a `docker run` when invoked. + captured = [] + rt = _make_runtime(captured) + result = rt.setup_containers( + container_config=_container_config(), + container_name="cvs_iter_test", + volumes=["/home/u:/workspace"], + ) + self.assertTrue(result) + self.assertEqual(len(captured), 1) + self.assertIn("docker run", captured[0]) + def test_user_volume_listed_once(self): # With runtime.args.volumes=['/foo:/bar'] AND positional volumes that # already include '/foo:/bar' (mirroring what diff --git a/cvs/input/cluster_file/README.md b/cvs/input/cluster_file/README.md index bf4b1a55..9bc67bfb 100644 --- a/cvs/input/cluster_file/README.md +++ b/cvs/input/cluster_file/README.md @@ -43,8 +43,7 @@ Consumed by `ContainerOrchestrator` in [`cvs/core/orchestrators/container.py`](. | Key | Type | Default | Purpose | | --- | --- | --- | --- | -| `enabled` | bool | `false` | Master switch. Must be `true` for the container backend to do anything; if `false`, `setup_containers` and `teardown_containers` short-circuit to no-ops. | -| `launch` | bool | `false` | `true` = CVS owns lifecycle (starts on `setup_containers`, stops on `teardown_containers`). `false` = containers are externally managed; CVS only verifies they are running and never stops them. | +| `lifetime` | str | `"per_run"` | Container lifecycle policy: `no_launch`, `per_run`, or `persistent`. See the truth table below. | | `image` | str | (required) | Image with the test dependencies (rvs, etc.) pre-installed and an sshd you can start on port 2224. Must be present locally on each node OR pullable from a reachable registry. | | `name` | str | (required) | Container name on each host. For parallel runs make this per-iteration unique (e.g. `cvs_iter_`). | | `runtime.name` | str | `"docker"` | Container runtime. Supported: `docker` (concrete), `enroot` (stub). | @@ -70,16 +69,15 @@ RDMA-ready container; the keys below are only needed to extend or override. | `group_add` | list | `["video"]` (appended) | Supplementary groups inside the container. | | `ulimit` | list | `["memlock=-1"]` (appended) | Per-process resource limits. `memlock=-1` is required for RDMA. | -## `launch` truth table (teardown semantics) +## `lifetime` truth table (setup + teardown semantics) -`teardown_containers()` short-circuits in three cases per [`cvs/core/orchestrators/container.py`](../../core/orchestrators/container.py) (around line 439). This is pinned by the test `test_teardown_containers_short_circuits_when_launch_true` in [`cvs/core/orchestrators/unittests/test_container.py`](../../core/orchestrators/unittests/test_container.py). +`setup_containers()` and `teardown_containers()` in [`cvs/core/orchestrators/container.py`](../../core/orchestrators/container.py) branch on `container.lifetime`. This is pinned by the per-lifetime tests in [`cvs/core/orchestrators/unittests/test_container.py`](../../core/orchestrators/unittests/test_container.py). -| `enabled` | `launch` | `container_id` | Behavior | -| --- | --- | --- | --- | -| `false` | * | * | Short-circuit (no-op, returns `True`). | -| `true` | `true` | * | Short-circuit. The container is externally managed; CVS does not stop it. | -| `true` | `false` | unset | Short-circuit. Nothing was started. | -| `true` | `false` | set | Calls `runtime.teardown_containers(container_id)`. | +| `lifetime` | `setup_containers` | `teardown_containers` | +| --- | --- | --- | +| `no_launch` | Verify the container is already running on every host; set `container_id`. Never starts anything. | No-op. CVS does not own a container it did not launch. | +| `per_run` (default) | Start a fresh container on every host (force-removing any stale same-named container first). | Force-remove the container CVS started. | +| `persistent` | Attach if the container is already running on every host. Start fresh only if it is running on no host. Running on some hosts but not all is a hard error (CVS will not force-remove the still-running hosts and destroy their overlay). Idempotent across runs. | No-op. The container is left running for the next run; remove it yourself when done. | ## Prerequisites on each cluster node @@ -90,6 +88,8 @@ RDMA-ready container; the keys below are only needed to extend or override. ## What happens when you run a backend-blind suite +Default (`lifetime: per_run`) - start fresh, run, remove: + ```mermaid sequenceDiagram participant TestFix as orch fixture @@ -108,7 +108,27 @@ sequenceDiagram ContainerOrch->>Sshd: ssh root@host:2224 "rvs -c ..." Sshd-->>TestFix: stdout TestFix->>ContainerOrch: teardown_containers() - ContainerOrch->>Docker: docker stop and rm + ContainerOrch->>Docker: docker rm -f +``` + +Second `cvs run` with `lifetime: persistent` - attach to the surviving container, skip sshd, leave it running: + +```mermaid +sequenceDiagram + participant TestFix as orch fixture + participant ContainerOrch as ContainerOrchestrator + participant Docker as docker daemon on host + participant Sshd as container sshd:2224 + + TestFix->>ContainerOrch: setup_containers() + ContainerOrch->>Docker: is_running(name)? yes -> attach + TestFix->>ContainerOrch: setup_sshd() + ContainerOrch->>Sshd: pgrep sshd:2224 already up -> skip start + TestFix->>ContainerOrch: orch.exec("rvs -c ...") + ContainerOrch->>Sshd: ssh root@host:2224 "rvs -c ..." + Sshd-->>TestFix: stdout + TestFix->>ContainerOrch: teardown_containers() + ContainerOrch-->>TestFix: no-op (container left running) ``` The same `cvs run` command works against either backend; only the cluster @@ -116,10 +136,9 @@ file changes. The `orch` fixture in [`cvs/tests/health/rvs_cvs.py`](../../tests/ ## Common pitfalls -- **Forgot `enabled: true`**. The container block exists but the orchestrator silently no-ops every lifecycle call. Symptom: `orch.exec` errors with "no container running". - **Image without `openssh-server`**. `setup_sshd` cannot start sshd on port 2224; `orch.exec` then fails to connect. Make sure your image installs `openssh-server` and exposes the binary at `/usr/sbin/sshd`. - **Image without the workload binary**. `cvs run rvs_cvs` will execute `rvs` inside the container; if the image lacks `/opt/rocm/bin/rvs` the test fails with a `command not found` flavor error. -- **`launch: true` plus a stale container with the same `name`**. `docker run` refuses to start because the name is taken. Either pick a per-iteration unique name or stop the stale container first. +- **`lifetime: persistent` without a pinned `name`**. The default container name is `_`, which shifts when you bump the image tag. A tag bump silently abandons the previous container's overlay (installs, clones) and starts fresh. Pin `container.name` explicitly when using `persistent`. - **Port 2224 collision on the host**. With `network: host` the in-container sshd binds to 2224 in the host's network namespace. If something else on the host already listens on 2224 the bind fails. Stop the conflicting service or change the in-image sshd port (and update the orchestrator's port to match). ## See also diff --git a/cvs/input/cluster_file/cluster_container.json b/cvs/input/cluster_file/cluster_container.json index 2a95f115..da5e0ea5 100644 --- a/cvs/input/cluster_file/cluster_container.json +++ b/cvs/input/cluster_file/cluster_container.json @@ -29,13 +29,10 @@ } }, - "_container_block_comment": "Container backend configuration. Consumed by ContainerOrchestrator in cvs/core/orchestrators/container.py. See README.md for the full schema and launch:true/launch:false semantics.", + "_container_block_comment": "Container backend configuration. Consumed by ContainerOrchestrator in cvs/core/orchestrators/container.py. See README.md for the full schema and lifetime semantics.", "container": { - "_enabled_comment": "Master switch. Must be true for the container backend to do anything; if false, setup_containers/teardown_containers short-circuit to no-ops.", - "enabled": true, - - "_launch_comment": "true = CVS owns container lifecycle (starts on setup_containers, stops on teardown_containers). false = containers are externally managed; CVS only verifies they are running and never stops them.", - "launch": true, + "_lifetime_comment": "Container lifecycle policy. 'no_launch' = CVS never launches the container (verify it is already running, never stopped). 'per_run' = CVS starts on setup and removes on teardown. 'persistent' = CVS starts if absent / attaches if present and leaves running on exit (unblocks install-then-run; pin 'name' explicitly).", + "lifetime": "per_run", "_image_comment": "Image with test dependencies (rvs, etc.) pre-installed and an sshd you can start on port 2224. Must be present locally on each node OR pullable from a reachable registry.", "image": "rocm/cvs:latest", diff --git a/docs/how-to/run-with-containers.rst b/docs/how-to/run-with-containers.rst index 8b8f5007..7f9682eb 100644 --- a/docs/how-to/run-with-containers.rst +++ b/docs/how-to/run-with-containers.rst @@ -49,8 +49,8 @@ Open the copied file and edit: - ``priv_key_file``: absolute path to your SSH private key. - ``head_node_dict.mgmt_ip`` and the keys of ``node_dict``: real IPs or hostnames of your cluster nodes. ``mgmt_ip`` must equal one of the keys in ``node_dict``. - ``container.image``: an image present on every node or pullable from a reachable registry. The image must include ``openssh-server`` and the workload binary (for example ``rvs``). -- ``container.name``: container name on each host. For parallel runs make this per-iteration unique (for example ``cvs_iter_``). -- ``container.launch``: see the lifecycle note below. +- ``container.name``: container name on each host. For parallel runs make this per-iteration unique (for example ``cvs_iter_``). Pin it explicitly when using ``lifetime: persistent``. +- ``container.lifetime``: ``no_launch``, ``per_run``, or ``persistent``. See the lifecycle note below. For the full schema and runtime argument reference, see :doc:`/reference/configuration-files/cluster-file`. @@ -73,8 +73,9 @@ Run ``rvs_cvs`` the same way you run any CVS test suite, but with the container What happens during the run: - The ``orch`` fixture in ``rvs_cvs`` reads ``orchestrator: container`` from the cluster file and constructs a ``ContainerOrchestrator``. -- If ``container.launch`` is ``true``, CVS removes any container with the same name on each host, runs the configured image with the merged ``runtime.args``, and starts an in-container ``sshd`` on port ``2224``. -- If ``container.launch`` is ``false``, CVS verifies that a container with the configured name is already running on every host and reuses it. +- With ``lifetime: per_run``, CVS removes any container with the same name on each host, runs the configured image with the merged ``runtime.args``, and starts an in-container ``sshd`` on port ``2224``. +- With ``lifetime: persistent``, CVS attaches to a container already running on every host, or starts one fresh if none is running. +- With ``lifetime: no_launch``, CVS verifies that a container with the configured name is already running on every host and reuses it. - All ``rvs`` invocations are routed through the container via ``docker exec`` and the in-container ``sshd``. Step 4: Verify the run actually used the container @@ -100,14 +101,15 @@ You should see one running container per node with the configured name and image Lifecycle and teardown ====================== -The ``container.launch`` flag controls who owns the container lifecycle: +The ``container.lifetime`` policy controls who owns the container lifecycle: -- ``launch: true`` - CVS starts the long-running container as part of test setup. Teardown is a no-op; CVS does not stop the container so that you can inspect it after the run. Stop and remove it manually when you are done. -- ``launch: false`` - CVS does not start anything. It verifies that a container with the configured name is already running on every host and reuses it. Teardown is a no-op. +- ``per_run`` (default) - CVS starts a fresh container at setup and force-removes it at teardown. Anything written to the container overlay is lost when the run ends. +- ``persistent`` - CVS attaches to the container if it is already running on every host, or starts it fresh if it is running on no host. Teardown is a no-op, so the container (and its overlay) survives across runs. This unblocks install-then-run workflows: ``cvs run install_rvs`` followed by ``cvs run rvs_cvs`` in separate invocations. If the container is running on some hosts but not all, CVS fails rather than rebuilding (which would destroy the overlay on the still-running hosts) -- remove it on all hosts and rerun, or restart it on the missing hosts. Pin ``container.name`` so a tag bump does not silently abandon the overlay. +- ``no_launch`` - CVS does not start anything. It verifies that a container with the configured name is already running on every host and reuses it. Teardown is a no-op. -When ``container.enabled`` is ``false``, both setup and teardown short-circuit to no-ops. See the launch truth table in :doc:`/reference/configuration-files/cluster-file` for the full state matrix. +See the ``lifetime`` truth table in :doc:`/reference/configuration-files/cluster-file` for the full state matrix. -To stop and remove a container started with ``launch: true``, run on every node: +To stop and remove a container that CVS left running (``persistent`` or ``no_launch``), run on every node: .. code:: bash @@ -119,10 +121,9 @@ Replace ``cvs_container`` with the actual ``container.name`` from your cluster f Common pitfalls =============== -- **Forgot ``enabled: true``.** The container block is present but the orchestrator silently no-ops every lifecycle call. The first ``orch.exec`` call then errors with "no container running". - **Image without ``openssh-server``.** ``setup_sshd`` cannot start ``sshd`` on port ``2224`` and ``orch.exec`` fails to connect. Make sure the image installs ``openssh-server`` and exposes the binary at ``/usr/sbin/sshd``. - **Image without the workload binary.** ``cvs run rvs_cvs`` invokes ``rvs`` inside the container; if the image lacks ``/opt/rocm/bin/rvs`` the test fails with a "command not found" style error. -- **``launch: true`` plus a stale container with the same name.** On rerun, ``docker run`` would refuse to start because the name is taken. CVS's launch path issues ``docker rm -f`` before ``docker run`` to handle this; if you start a container outside CVS with the same name, stop it first. +- **``lifetime: persistent`` without a pinned ``name``.** The default container name is ``_``, which shifts when you bump the image tag. A tag bump silently abandons the previous container's overlay (installs, clones) and starts fresh. Pin ``container.name`` when using ``persistent``. - **Port ``2224`` already bound on the host.** With ``network: host`` the in-container ``sshd`` binds to ``2224`` in the host's network namespace. If something else on the host already listens on ``2224`` the bind fails. Stop the conflicting service. - **Picked the wrong cluster template.** Use ``cluster_container.json`` (with ``orchestrator: container``) for container mode. Using ``cluster.json`` runs the suite on the host even when the image you wanted to test is sitting on every node. diff --git a/docs/reference/configuration-files/cluster-file.rst b/docs/reference/configuration-files/cluster-file.rst index 4d48c59c..4b6a61ab 100644 --- a/docs/reference/configuration-files/cluster-file.rst +++ b/docs/reference/configuration-files/cluster-file.rst @@ -77,8 +77,7 @@ Both templates share the same top-level shape. The ``container`` block and the ` }, "container": { - "enabled": true, - "launch": true, + "lifetime": "per_run", "image": "rocm/cvs:latest", "name": "cvs_container", "runtime": { @@ -136,12 +135,9 @@ The ``container`` block configures the container backend. It is consumed by the * - Configuration parameter - Default value - Description - * - ``enabled`` - - ``false`` - - Master switch. Must be ``true`` for the container backend to do anything. When ``false``, ``setup_containers`` and ``teardown_containers`` short-circuit to no-ops. - * - ``launch`` - - ``false`` - - When ``true``, CVS starts the long-running containers on every host as part of test setup. When ``false``, CVS only verifies that containers with the configured name are already running on every host and never stops them. + * - ``lifetime`` + - ``per_run`` + - Container lifecycle policy: ``no_launch``, ``per_run``, or ``persistent``. See the truth table below. * - ``image`` - (required) - Image with the test dependencies (for example ``rvs``) pre-installed and an ``sshd`` you can start on port ``2224``. Must be present locally on each node or pullable from a reachable registry. @@ -199,35 +195,31 @@ When ``runtime.name`` is ``docker``, the keys below configure the underlying ``d - ``["memlock=-1"]`` (appended) - Per-process resource limits. ``memlock=-1`` is required for RDMA. -``launch`` truth table -====================== +``lifetime`` truth table +======================== -The ``launch`` flag interacts with ``enabled`` and the runtime-tracked ``container_id`` to decide whether ``teardown_containers`` actually stops anything. The behavior below is pinned by the unit test ``test_teardown_containers_short_circuits_when_launch_true`` in `cvs/core/orchestrators/unittests/test_container.py `_. +``setup_containers`` and ``teardown_containers`` branch on ``container.lifetime``. The behavior below is pinned by the per-lifetime unit tests in `cvs/core/orchestrators/unittests/test_container.py `_. .. list-table:: - :widths: 2 2 2 5 + :widths: 2 4 4 :header-rows: 1 - * - ``enabled`` - - ``launch`` - - ``container_id`` - - Behavior of ``teardown_containers`` - * - ``false`` - - any - - any - - Short-circuit (no-op, returns ``True``). - * - ``true`` - - ``true`` - - any - - Short-circuit. Containers are treated as externally managed; CVS does not stop them. - * - ``true`` - - ``false`` - - unset - - Short-circuit. Nothing was started. - * - ``true`` - - ``false`` - - set - - Calls the runtime's ``teardown_containers`` to stop and remove the container. + * - ``lifetime`` + - ``setup_containers`` + - ``teardown_containers`` + * - ``no_launch`` + - Verify a container with the configured name is already running on every host; set ``container_id``. Never starts anything. + - No-op. CVS does not own a container it did not launch. + * - ``per_run`` (default) + - Start a fresh container on every host (force-removing any stale same-named container first). + - Force-remove the container CVS started. + * - ``persistent`` + - Attach if the container is already running on every host. Start fresh only if it is running on no host. Running on some hosts but not all is a hard error (CVS will not force-remove the still-running hosts and destroy their overlay). Idempotent across runs. + - No-op. The container is left running for the next run; remove it yourself when done. + +.. note:: + + With ``lifetime: persistent``, pin ``container.name`` explicitly. The default ``_`` name shifts when you bump the image tag, silently abandoning the previous container's overlay. Prerequisites on each cluster node ==================================