Skip to content
Merged
Show file tree
Hide file tree
Changes from 17 commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
33286f5
orchestrators/factory: collapse container enabled+launch into contain…
atnair-amd May 29, 2026
b31b343
runtimes: drop launch short-circuit, add image_sha_status
atnair-amd May 29, 2026
fa6ec07
orchestrators/container: branch setup/teardown on container.lifetime
atnair-amd May 29, 2026
c14b914
input/cluster_file: use container.lifetime in cluster_container.json …
atnair-amd May 29, 2026
ab74309
core/unittests: cover container.lifetime resolution and per-lifetime …
atnair-amd May 29, 2026
7c8be5c
docs: document container.lifetime schema (replace enabled/launch)
atnair-amd May 29, 2026
0ddd336
orchestrators/scripts: add default container provisioning script
atnair-amd May 30, 2026
778b39b
orchestrators/factory: resolve and validate container.setup_script
atnair-amd May 30, 2026
f3d2b66
orchestrators/container: provision launched containers via setup_script
atnair-amd May 30, 2026
11218e3
orchestrators/container: fix setup_sshd pgrep precheck matching its p…
atnair-amd May 30, 2026
dadc82b
input/cluster_file: document and sample container.setup_script
atnair-amd May 30, 2026
8fcfec9
docs: document container.setup_script in cluster-file references
atnair-amd May 30, 2026
87b9065
orchestrators/unittests: cover container.setup_script resolution
atnair-amd May 30, 2026
aed4df1
orchestrators/unittests: cover container provisioning and sshd pgrep …
atnair-amd May 30, 2026
6d182a3
orchestrators/runtimes: harden container persistent lifetime
atnair-amd May 31, 2026
f950082
orchestrators: apply ruff format to satisfy the fmt-check gate
atnair-amd May 31, 2026
fa4ca32
orchestrators: rename container lifetime 'external' to 'no_launch'
atnair-amd Jun 1, 2026
ec87e4f
orchestrators: replace container enabled/launch with container.lifetime
atnair-amd Jun 9, 2026
cd23180
address review: trim normalize comment, carry lifetime in runtime tes…
atnair-amd Jun 10, 2026
a643858
tests: drop redundant persistent-idempotent test, strip dead warning …
atnair-amd Jun 10, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
343 changes: 270 additions & 73 deletions cvs/core/orchestrators/container.py

Large diffs are not rendered by default.

125 changes: 119 additions & 6 deletions cvs/core/orchestrators/factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,105 @@
All code contained here is Property of Advanced Micro Devices, Inc.
'''

import os

from cvs.core.orchestrators.baremetal import BaremetalOrchestrator
from cvs.core.orchestrators.container import ContainerOrchestrator


# The complete set of values accepted for ``container.lifetime``.
VALID_CONTAINER_LIFETIMES = ("no_launch", "per_run", "persistent")

# Fallback provisioning script shipped inside this package. It is used for
# freshly-launched containers whenever container.setup_script is not set, and
# installs openssh-server so the in-container sshd can start on port 2224.
# The path is built from this module's own __file__ so the lookup follows the
# installed package (editable and wheel installs alike).
DEFAULT_CONTAINER_SETUP_SCRIPT = os.path.join(
    os.path.dirname(os.path.abspath(__file__)),
    "scripts",
    "default_container_setup.sh",
)


def _resolve_container_lifetime(container):
"""Normalize a container config block to a single resolved ``lifetime`` key.

The legacy two-axis schema (``enabled`` + ``launch``) is removed in favor of
one tri-valued ``container.lifetime``. Mutates and returns the passed dict.

Resolution rules (first match wins):
- ``enabled`` present (any value) -> ``ValueError`` (removed field).
- ``launch`` present (any value) -> ``ValueError`` (removed field). Both
removed fields fail loudly rather than being silently mapped, so a stale
flag can never quietly override an explicit ``lifetime``.
- ``lifetime`` present -> validated, kept as-is.
- none of the above -> default ``per_run``.

An empty/absent container block is returned untouched (baremetal path).
"""
if not container:
return container

if 'enabled' in container:
raise ValueError(
"container.enabled is removed; delete the field and set "
"container.lifetime to one of 'no_launch', 'per_run', 'persistent'"
)

if 'launch' in container:
raise ValueError(
"container.launch is removed; delete the field and set "
"container.lifetime ('launch: true' -> 'per_run', "
"'launch: false' -> 'no_launch')"
)

if 'lifetime' in container:
lifetime = container['lifetime']
if lifetime not in VALID_CONTAINER_LIFETIMES:
raise ValueError(f"container.lifetime must be one of {VALID_CONTAINER_LIFETIMES}, got {lifetime!r}")
return container

container['lifetime'] = 'per_run'
return container


def _resolve_container_setup_script(container):
"""Resolve ``container.setup_script`` to a concrete, existing file path.

The script is run inside each freshly-launched container (see
``ContainerOrchestrator._provision_container``) to install packages on top
of the base image. Resolution rules:

- empty/absent container block (baremetal path) -> returned untouched.
- ``setup_script`` set -> validated to exist on
the control host; ``ValueError`` (fail fast at config load) if missing.
- ``setup_script`` absent -> defaults to the
packaged ``default_container_setup.sh`` (installs openssh-server).

Relative paths are resolved against the current working directory of the
process running ``cvs``. Mutates and returns the passed dict.
"""
if not container:
return container

setup_script = container.get('setup_script')
if setup_script:
resolved = os.path.abspath(os.path.expanduser(setup_script))
if not os.path.isfile(resolved):
raise ValueError(f"container.setup_script not found: {setup_script!r} (resolved to {resolved!r})")
container['setup_script'] = resolved
return container

# No user-supplied script: fall back to the packaged default. Validate it
# exists here (same fail-fast as a user path) so a broken/incomplete install
# surfaces at config load rather than as an OSError mid-run inside
# _provision_container.
if not os.path.isfile(DEFAULT_CONTAINER_SETUP_SCRIPT):
raise ValueError(
f"packaged default container setup script is missing: "
f"{DEFAULT_CONTAINER_SETUP_SCRIPT!r} (broken CVS install?)"
)
container['setup_script'] = DEFAULT_CONTAINER_SETUP_SCRIPT
return container


class OrchestratorConfig:
"""
Configuration for orchestrator creation.
Expand All @@ -26,8 +121,7 @@ class OrchestratorConfig:
Example container configuration:
```json
"container": {
"enabled": true,
"launch": false,
"lifetime": "per_run",
"runtime": {
"name": "docker",
"args": {
Expand All @@ -44,10 +138,23 @@ class OrchestratorConfig:
}
},
"image": "rocm/cvs:latest",
"name": "myuser_rocm_cvs_latest"
"name": "myuser_rocm_cvs_latest",
"setup_script": "/path/to/setup.sh"
}
```
launch: Containers are already running, test suite should not start/stop them [default: false]
lifetime: Container lifecycle policy [default: 'per_run']
- 'no_launch' : CVS never launches the container. Setup verifies a
container with the configured name is already running
on every host; teardown is a no-op.
- 'per_run' : start at setup, remove at teardown (the default).
- 'persistent' : start if absent / attach if present; never torn down
by the run. Pin container.name explicitly under this
mode (the default <user>_<image> name shifts on tag bumps).
setup_script: Optional path to a shell script run inside each freshly
launched container (per_run, and persistent when cold-started) before
sshd setup, to install packages on top of the base image. Omit to use
the packaged default that installs openssh-server. A non-existent path
fails at config load.
"""

def __init__(self, **kwargs):
Expand All @@ -72,7 +179,13 @@ def __init__(self, **kwargs):
self.priv_key_file = kwargs['priv_key_file']
self.password = kwargs.get('password')
self.head_node_dict = kwargs.get('head_node_dict', {})
self.container = kwargs.get('container', {})
# Normalize the container block. This is the single chokepoint:

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need this comment?

@atnair-amd atnair-amd Jun 10, 2026

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

removed

# from_configs constructs via cls(**required_config), so both file-driven
# and direct programmatic construction hit the same normalization (and the
# same enabled-removed / launch-removed errors, and the same
# setup_script validation / default injection).
container = _resolve_container_lifetime(kwargs.get('container', {}))
self.container = _resolve_container_setup_script(container)

def get(self, key, default=None):
"""Get configuration value with default."""
Expand All @@ -97,7 +210,7 @@ def from_configs(cls, cluster_config, testsuite_config=None):
Required keys: orchestrator, node_dict, username, priv_key_file
Optional keys: container,
head_node_dict, password (defaults provided for missing optional keys)
Container structure: {enabled: bool, launch: bool, runtime: {name: str, args: dict}, image: str, name: str, ...}
Container structure: {lifetime: 'no_launch'|'per_run'|'persistent', runtime: {name: str, args: dict}, image: str, name: str, setup_script: str, ...}
testsuite_config: Test suite specific configuration (dict or path to <testsuite>_config.json)
Can override any keys from cluster_config

Expand Down
20 changes: 20 additions & 0 deletions cvs/core/orchestrators/scripts/default_container_setup.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#!/usr/bin/env bash
# Default CVS container provisioning script.
#
# Run inside each freshly-launched container (via `docker exec`) before
# setup_sshd. CVS's container exec model needs an in-container sshd on port
# 2224; many base images do not ship one. This installs only the sshd binary
# (openssh-server) so the existing setup_sshd can start `/usr/sbin/sshd -p2224`.
#
# Override with container.setup_script in the cluster file to install other
# packages (or to support non-apt base images -- this default is apt-only).
#
# Exit status: 0 when sshd is already present or was installed; non-zero when
# apt-get is unavailable or any apt-get step fails (set -e aborts the script).
set -euo pipefail

# Nothing to do when the image already ships sshd, either on PATH or at the
# location setup_sshd launches it from.
if command -v sshd >/dev/null 2>&1 || [ -x /usr/sbin/sshd ]; then
    echo "default_container_setup: sshd already present, nothing to install"
    exit 0
fi

# This default only knows apt. On any other base image, fail with a message
# that names the fix instead of a bare "apt-get: command not found". 127 is the
# status the shell itself would have produced for the missing command.
if ! command -v apt-get >/dev/null 2>&1; then
    echo "default_container_setup: sshd is missing and apt-get is not available;" \
         "set container.setup_script in the cluster file to a script that installs" \
         "an SSH server for this base image" >&2
    exit 127
fi

# Keep package configuration from prompting inside the non-interactive exec.
export DEBIAN_FRONTEND=noninteractive
apt-get update
apt-get install -y --no-install-recommends openssh-server
Loading
Loading