diff --git a/.gitignore b/.gitignore
index 08cc6b4312c56..bc02ffc5b3150 100644
--- a/.gitignore
+++ b/.gitignore
@@ -60,3 +60,5 @@ flake.lock
/known-docker-images.txt
/test/sqllogictest/sqlite
my-local-mz/
+/test/orchestratord/cluster.yaml
+uv.lock
diff --git a/ci/nightly/pipeline.template.yml b/ci/nightly/pipeline.template.yml
index 0d299f27db68d..3167323cec30c 100644
--- a/ci/nightly/pipeline.template.yml
+++ b/ci/nightly/pipeline.template.yml
@@ -2367,6 +2367,7 @@ steps:
steps:
- id: orchestratord-defaults
label: "Orchestratord test (defaults from documentation)"
+ artifact_paths: ["mz_debug_*.zip"]
depends_on: devel-docker-tags
timeout_in_minutes: 120
plugins:
@@ -2379,6 +2380,7 @@ steps:
- id: orchestratord-default-properties
label: "Orchestratord test (defaults for properties)"
+ artifact_paths: ["mz_debug_*.zip"]
depends_on: devel-docker-tags
timeout_in_minutes: 120
plugins:
@@ -2391,6 +2393,7 @@ steps:
- id: orchestratord-individual
label: "Orchestratord test (individual properties)"
+ artifact_paths: ["mz_debug_*.zip"]
depends_on: devel-docker-tags
timeout_in_minutes: 120
plugins:
@@ -2403,76 +2406,65 @@ steps:
- id: orchestratord-combine
label: "Orchestratord test (combine properties)"
+ artifact_paths: ["mz_debug_*.zip"]
depends_on: build-aarch64
timeout_in_minutes: 120
plugins:
- ./ci/plugins/mzcompose:
composition: orchestratord
- args: [--action=noop, --properties=combine, --runtime=3600, --recreate-cluster]
+ args: [--action=noop, --properties=combine, --runtime=1800, --recreate-cluster]
ci-builder: stable
agents:
queue: hetzner-aarch64-16cpu-32gb
- id: orchestratord-upgrade-individual
label: "Orchestratord test (upgrade, individual props)"
+ artifact_paths: ["mz_debug_*.zip"]
depends_on: devel-docker-tags
timeout_in_minutes: 120
plugins:
- ./ci/plugins/mzcompose:
composition: orchestratord
- args: [--action=upgrade, --properties=individual, --runtime=3600, --recreate-cluster]
+ args: [--action=upgrade, --properties=individual, --runtime=1800, --recreate-cluster]
ci-builder: stable
- env:
- # Old versions are not on GHCR yet
- MZ_GHCR: 0
agents:
- queue: hetzner-aarch64-8cpu-16gb
- skip: "https://github.com/MaterializeInc/materialize/pull/34214"
+ queue: hetzner-aarch64-16cpu-32gb
- id: orchestratord-upgrade-combine
label: "Orchestratord test (upgrade, combine props)"
+ artifact_paths: ["mz_debug_*.zip"]
depends_on: devel-docker-tags
timeout_in_minutes: 120
plugins:
- ./ci/plugins/mzcompose:
composition: orchestratord
- args: [--action=upgrade, --properties=combine, --runtime=3600, --recreate-cluster]
+ args: [--action=upgrade, --properties=combine, --runtime=1800, --recreate-cluster]
ci-builder: stable
- env:
- # Old versions are not on GHCR yet
- MZ_GHCR: 0
agents:
- queue: hetzner-aarch64-8cpu-16gb
- skip: "https://github.com/MaterializeInc/materialize/pull/34214"
+ queue: hetzner-aarch64-16cpu-32gb
- id: orchestratord-upgrade-chain-individual
label: "Orchestratord test (upgrade chain, individual props)"
+ artifact_paths: ["mz_debug_*.zip"]
depends_on: devel-docker-tags
timeout_in_minutes: 120
plugins:
- ./ci/plugins/mzcompose:
composition: orchestratord
- args: [--action=upgrade-chain, --properties=individual, --runtime=3600, --recreate-cluster]
+ args: [--action=upgrade-chain, --properties=individual, --runtime=1800, --recreate-cluster]
ci-builder: stable
- env:
- # Old versions are not on GHCR yet
- MZ_GHCR: 0
agents:
- queue: hetzner-aarch64-8cpu-16gb
- skip: "https://github.com/MaterializeInc/materialize/pull/34214"
+ queue: hetzner-aarch64-16cpu-32gb
- id: orchestratord-upgrade-chain-combine
label: "Orchestratord test (upgrade chain, combine props)"
+ artifact_paths: ["mz_debug_*.zip"]
depends_on: devel-docker-tags
timeout_in_minutes: 120
plugins:
- ./ci/plugins/mzcompose:
composition: orchestratord
- args: [--action=upgrade-chain, --properties=combine, --runtime=3600, --recreate-cluster]
+ args: [--action=upgrade-chain, --properties=combine, --runtime=1800, --recreate-cluster]
ci-builder: stable
- env:
- # Old versions are not on GHCR yet
- MZ_GHCR: 0
agents:
queue: hetzner-aarch64-16cpu-32gb
- skip: "https://github.com/MaterializeInc/materialize/pull/34214"
diff --git a/src/cloud-resources/src/crd/materialize.rs b/src/cloud-resources/src/crd/materialize.rs
index 8f617a5736553..39fb037a9ac21 100644
--- a/src/cloud-resources/src/crd/materialize.rs
+++ b/src/cloud-resources/src/crd/materialize.rs
@@ -66,6 +66,23 @@ pub mod v1alpha1 {
#[default]
WaitUntilReady,
+ /// Create a new generation of pods, leaving the old generation as the serving generation
+ /// until the user manually promotes the new generation.
+ ///
+ /// Users can promote the new generation at any time, even if the new generation pods are
+ /// not fully caught up, by setting `forcePromote` to the same value as `requestRollout` in
+ /// the Materialize spec.
+ ///
+ /// {{< warning >}}
+ /// Do not leave new generations unpromoted indefinitely.
+ ///
+ /// The new generation holds open read holds, which prevent compaction; they are released
+ /// once the rollout is promoted or cancelled. If a generation is left unpromoted for an
+ /// extended time, uncompacted data can build up and cause extreme deletion load on the
+ /// metadata backend database when the rollout is finally promoted or cancelled.
+ /// {{< /warning >}}
+ ManuallyPromote,
+
/// {{< warning >}}
/// THIS WILL CAUSE YOUR MATERIALIZE INSTANCE TO BE UNAVAILABLE FOR SOME TIME!!!
///
@@ -429,6 +446,20 @@ pub mod v1alpha1 {
false
}
+ pub fn is_ready_to_promote(&self, resources_hash: &str) -> bool {
+ let Some(status) = self.status.as_ref() else {
+ return false;
+ };
+ if status.conditions.is_empty() {
+ return false;
+ }
+ status
+ .conditions
+ .iter()
+ .any(|condition| condition.reason == "ReadyToPromote")
+ && &status.resources_hash == resources_hash
+ }
+
pub fn is_promoting(&self) -> bool {
let Some(status) = self.status.as_ref() else {
return false;
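
For illustration, the manual-promotion handshake described in the `ManuallyPromote` doc comment can be driven from outside the controller. A minimal sketch, assuming `kubectl` access and the `materialize-environment` namespace used by the tests in this PR; it mirrors what `test/orchestratord/mzcompose.py` does below and is not an official CLI:

```python
import json
import subprocess

NAMESPACE = "materialize-environment"  # namespace used by the tests in this PR

# Fetch the Materialize custom resource to read its current requestRollout.
mz = json.loads(
    subprocess.check_output(
        ["kubectl", "get", "materializes", "-n", NAMESPACE, "-o", "json"]
    )
)["items"][0]

# Per the ManuallyPromote docs: setting forcePromote to the same value as
# requestRollout promotes the staged generation.
patch = {"spec": {"forcePromote": mz["spec"]["requestRollout"]}}
subprocess.run(
    [
        "kubectl", "patch", "materializes", mz["metadata"]["name"],
        "-n", NAMESPACE, "--type=merge", "-p", json.dumps(patch),
    ],
    check=True,
)
```
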
diff --git a/src/orchestratord/src/controller/materialize.rs b/src/orchestratord/src/controller/materialize.rs
index 3cab752159c51..a69da3ff67d13 100644
--- a/src/orchestratord/src/controller/materialize.rs
+++ b/src/orchestratord/src/controller/materialize.rs
@@ -604,34 +604,38 @@ impl k8s_controller::Context for Context {
// replace_status, but this is fine because we already
// extracted all of the information we want from the spec
// earlier.
- let mz = self
- .update_status(
- &mz_api,
- mz,
- MaterializeStatus {
- active_generation,
- // don't update the reconciliation id yet,
- // because the rollout hasn't yet completed. if
- // we fail later on, we want to ensure that the
- // rollout gets retried.
- last_completed_rollout_request: status.last_completed_rollout_request,
- resource_id: status.resource_id,
- resources_hash: String::new(),
- conditions: vec![Condition {
- type_: "UpToDate".into(),
- status: "Unknown".into(),
- last_transition_time: Time(chrono::offset::Utc::now()),
- message: format!(
- "Applying changes for generation {desired_generation}"
- ),
- observed_generation: mz.meta().generation,
- reason: "Applying".into(),
- }],
- },
- active_generation != desired_generation,
- )
- .await?;
- let mz = &mz;
+ let mz = if mz.is_ready_to_promote(&resources_hash) {
+ mz
+ } else {
+ &self
+ .update_status(
+ &mz_api,
+ mz,
+ MaterializeStatus {
+ active_generation,
+ // don't update the reconciliation id yet,
+ // because the rollout hasn't yet completed. if
+ // we fail later on, we want to ensure that the
+ // rollout gets retried.
+ last_completed_rollout_request: status
+ .last_completed_rollout_request,
+ resource_id: status.resource_id,
+ resources_hash: String::new(),
+ conditions: vec![Condition {
+ type_: "UpToDate".into(),
+ status: "Unknown".into(),
+ last_transition_time: Time(chrono::offset::Utc::now()),
+ message: format!(
+ "Applying changes for generation {desired_generation}"
+ ),
+ observed_generation: mz.meta().generation,
+ reason: "Applying".into(),
+ }],
+ },
+ active_generation != desired_generation,
+ )
+ .await?
+ };
let status = mz.status();
if mz.spec.rollout_strategy
@@ -655,6 +659,37 @@ impl k8s_controller::Context for Context {
Ok(Some(action))
}
Ok(None) => {
+ if mz.spec.rollout_strategy == MaterializeRolloutStrategy::ManuallyPromote
+ && !mz.should_force_promote()
+ {
+ trace!(
+ "Ready to promote, but not promoting because the instance is configured with ManuallyPromote rollout strategy."
+ );
+ self.update_status(
+ &mz_api,
+ mz,
+ MaterializeStatus {
+ active_generation,
+ last_completed_rollout_request: status
+ .last_completed_rollout_request,
+ resource_id: status.resource_id,
+ resources_hash,
+ conditions: vec![Condition {
+ type_: "UpToDate".into(),
+ status: "Unknown".into(),
+ last_transition_time: Time(chrono::offset::Utc::now()),
+ message: format!(
+ "Ready to promote generation {desired_generation}"
+ ),
+ observed_generation: mz.meta().generation,
+ reason: "ReadyToPromote".into(),
+ }],
+ },
+ active_generation != desired_generation,
+ )
+ .await?;
+ return Ok(None);
+ }
// do this last, so that we keep traffic pointing at
// the previous environmentd until the new one is
// fully ready
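
Tooling that waits for a staged generation can key off the condition written above. A hedged sketch of the predicate, using only the fields the controller sets (`type: UpToDate`, `status: Unknown`, `reason: ReadyToPromote`); the `is_ready_to_manually_promote` helper added to the test below does the same against `kubectl` output:

```python
def ready_to_promote(mz: dict) -> bool:
    # `mz` is one item from `kubectl get materializes -o json`; sketch only.
    conditions = mz.get("status", {}).get("conditions") or []
    return any(
        c["type"] == "UpToDate"
        and c["status"] == "Unknown"
        and c["reason"] == "ReadyToPromote"
        for c in conditions
    )
```
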
diff --git a/test/orchestratord/cluster.yaml b/test/orchestratord/cluster.yaml.tmpl
similarity index 86%
rename from test/orchestratord/cluster.yaml
rename to test/orchestratord/cluster.yaml.tmpl
index bd7f1aa35cedc..1bca4c489bdaf 100644
--- a/test/orchestratord/cluster.yaml
+++ b/test/orchestratord/cluster.yaml.tmpl
@@ -11,6 +11,14 @@
kind: Cluster
apiVersion: kind.x-k8s.io/v1alpha4
+# Allow access to the registry mirrors from both inside and outside Kubernetes
+containerdConfigPatches:
+ - |-
+ [plugins."io.containerd.grpc.v1.cri".registry.mirrors."docker.io"]
+ endpoint = ["http://proxy-docker-hub:5000"]
+ - |-
+ [plugins."io.containerd.grpc.v1.cri".registry.mirrors."ghcr.io"]
+ endpoint = ["http://proxy-ghcr:5000"]
# Constrain the node port range to something relatively small, then forward all
# those ports from the host. This makes services running in Kubernetes
# accessible at localhost:$NODEPORT without requiring manual port forwarding.
@@ -23,6 +31,9 @@ kubeadmConfigPatches:
nodes:
- role: control-plane
image: kindest/node:v1.32.5
+ extraMounts:
+ - containerPath: /var/lib/kubelet/config.json
+ hostPath: "$DOCKER_CONFIG/config.json"
extraPortMappings:
- containerPort: 32000
hostPort: 32000
@@ -160,10 +171,17 @@ nodes:
materialize.cloud/availability-zone: "1"
topology.kubernetes.io/zone: "1"
workload: "materialize-instance"
+ extraMounts:
+ - containerPath: /var/lib/kubelet/config.json
+ hostPath: "$DOCKER_CONFIG/config.json"
- role: worker
image: kindest/node:v1.32.5
labels:
materialize.cloud/scratch-fs: "true"
+ materialize.cloud/disk: "true"
materialize.cloud/availability-zone: "2"
topology.kubernetes.io/zone: "2"
workload: "materialize-instance"
+ extraMounts:
+ - containerPath: /var/lib/kubelet/config.json
+ hostPath: "$DOCKER_CONFIG/config.json"
diff --git a/test/orchestratord/mzcompose.py b/test/orchestratord/mzcompose.py
index d49cbd2c1f26b..5d8c2da26f029 100644
--- a/test/orchestratord/mzcompose.py
+++ b/test/orchestratord/mzcompose.py
@@ -30,8 +30,7 @@
import yaml
from semver.version import Version
-from materialize import MZ_ROOT, ci_util, git, spawn, ui
-from materialize.docker import MZ_GHCR_DEFAULT
+from materialize import MZ_ROOT, ci_util, git, spawn
from materialize.mz_version import MzVersion
from materialize.mzcompose.composition import (
Composition,
@@ -41,6 +40,7 @@
from materialize.mzcompose.services.balancerd import Balancerd
from materialize.mzcompose.services.clusterd import Clusterd
from materialize.mzcompose.services.environmentd import Environmentd
+from materialize.mzcompose.services.mz_debug import MzDebug
from materialize.mzcompose.services.orchestratord import Orchestratord
from materialize.mzcompose.services.testdrive import Testdrive
from materialize.util import all_subclasses
@@ -55,9 +55,26 @@
Environmentd(),
Clusterd(),
Balancerd(),
+ MzDebug(),
]
+def run_mz_debug() -> None:
+ # TODO: Disabled because mz-debug hangs a lot in CI. When re-enabling,
+ # keep spawn.capture (not runv): the output is too noisy to print.
+ # spawn.capture(
+ # [
+ # "./mz-debug",
+ # "self-managed",
+ # "--k8s-namespace",
+ # "materialize-environment",
+ # "--mz-instance-name",
+ # "12345678-1234-1234-1234-123456789012",
+ # ]
+ # )
+ pass
+
+
def get_tag(tag: str | None = None) -> str:
# We can't use the mzbuild tag because it has a different fingerprint for
# environmentd/clusterd/balancerd and the orchestratord depends on them
@@ -242,7 +259,9 @@ def all_modifications() -> list[type[Modification]]:
class LicenseKey(Modification):
@classmethod
def values(cls, version: MzVersion) -> list[Any]:
- return ["valid", "invalid", "del"]
+ # TODO: Re-enable "del" when database-issues#9928 is fixed
+ # return ["valid", "invalid", "del"]
+ return ["valid", "invalid"]
@classmethod
def failed_reconciliation_values(cls) -> list[Any]:
@@ -498,14 +517,9 @@ def validate(self, mods: dict[type[Modification], Any]) -> None:
def check() -> None:
environmentd = get_environmentd_data()
image = environmentd["items"][0]["spec"]["containers"][0]["image"]
- image_registry = (
- "ghcr.io/materializeinc/materialize"
- if ui.env_is_truthy("MZ_GHCR", MZ_GHCR_DEFAULT)
- else "materialize"
- )
- expected = f"{image_registry}/environmentd:{self.value}"
+ expected = f"materialize/environmentd:{self.value}"
assert (
- image == expected
+ image == expected or f"ghcr.io/materializeinc/{image}" == expected
), f"Expected environmentd image {expected}, but found {image}"
retry(check, 240)
@@ -1070,11 +1084,11 @@ def check_pods() -> None:
class AuthenticatorKind(Modification):
@classmethod
def values(cls, version: MzVersion) -> list[Any]:
- # Test None, Password (v0.147.7+), and Sasl (v0.147.16+)
+ # Test None, Password (v0.147.7+), and Sasl (v26.0.0+)
result = ["None"]
if version >= MzVersion.parse_mz("v0.147.7"):
result.append("Password")
- if version >= MzVersion.parse_mz("v0.147.16"):
+ if version >= MzVersion.parse_mz("v26.0.0"):
result.append("Sasl")
return result
@@ -1100,13 +1114,13 @@ def validate(self, mods: dict[type[Modification], Any]) -> None:
if self.value == "Password" and version <= MzVersion.parse_mz("v0.147.6"):
return
- if self.value == "Sasl" and version < MzVersion.parse_mz("v0.147.16"):
+ if self.value == "Sasl" and version < MzVersion.parse_mz("v26.0.0"):
return
port = (
6875
if (version >= MzVersion.parse_mz("v0.147.0") and self.value == "Password")
- or (version >= MzVersion.parse_mz("v0.147.16") and self.value == "Sasl")
+ or (version >= MzVersion.parse_mz("v26.0.0") and self.value == "Sasl")
else 6877
)
for i in range(120):
@@ -1232,6 +1246,27 @@ def validate(self, mods: dict[type[Modification], Any]) -> None:
os.killpg(os.getpgid(process.pid), signal.SIGTERM)
+class RolloutStrategy(Modification):
+ @classmethod
+ def values(cls, version: MzVersion) -> list[Any]:
+ return [
+ "WaitUntilReady",
+ "ManuallyPromote",
+ "ImmediatelyPromoteCausingDowntime",
+ ]
+
+ @classmethod
+ def default(cls) -> Any:
+ return "WaitUntilReady"
+
+ def modify(self, definition: dict[str, Any]) -> None:
+ definition["materialize"]["spec"]["rolloutStrategy"] = self.value
+
+ def validate(self, mods: dict[type[Modification], Any]) -> None:
+ # This is validated in post_run_check
+ return
+
+
class Properties(Enum):
Defaults = "defaults"
Individual = "individual"
@@ -1252,10 +1287,23 @@ def workflow_defaults(c: Composition, parser: WorkflowArgumentParser) -> None:
)
args = parser.parse_args()
- current_version = get_tag(args.tag)
+ c.up(Service("mz-debug", idle=True))
+ c.invoke("cp", "mz-debug:/usr/local/bin/mz-debug", ".")
+
+ current_version = get_version(args.tag)
# Following https://materialize.com/docs/installation/install-on-local-kind/
- for version in reversed(get_self_managed_versions() + [get_version(args.tag)]):
+ # The orchestratord test can't run against future versions, so skip those
+ versions = reversed(
+ [
+ version
+ for version in get_self_managed_versions()
+ if version < current_version
+ ]
+ + [current_version]
+ )
+ for version in versions:
+ print(f"--- Running with defaults against {version}")
dir = "my-local-mz"
if os.path.exists(dir):
shutil.rmtree(dir)
@@ -1392,9 +1440,6 @@ def workflow_defaults(c: Composition, parser: WorkflowArgumentParser) -> None:
materialize_setup = list(yaml.load_all(f, Loader=yaml.Loader))
assert len(materialize_setup) == 3
- print(version)
- print(current_version)
- print(version == current_version)
if version == current_version:
materialize_setup[2]["spec"][
"environmentdImageRef"
@@ -1493,6 +1538,78 @@ def workflow_defaults(c: Composition, parser: WorkflowArgumentParser) -> None:
]
)
raise ValueError("Never completed")
+ run_mz_debug()
+
+
+class ModSource:
+ def __init__(self, mod_classes: list[type[Modification]]):
+ self.mod_classes = mod_classes
+
+ def next_mods(self, version: MzVersion) -> list[Modification]:
+ raise NotImplementedError
+
+
+class DefaultModSource(ModSource):
+ def __init__(self, mod_classes: list[type[Modification]]):
+ super().__init__(mod_classes)
+ self.state = 0
+
+ def next_mods(self, version: MzVersion) -> list[Modification]:
+ if self.state == 0:
+ self.state += 1
+ return [cls(cls.default()) for cls in self.mod_classes]
+ elif self.state == 1:
+ self.state += 1
+ return [NumMaterializeEnvironments(2)]
+ else:
+ raise StopIteration
+
+
+class IndividualModSource(ModSource):
+ def __init__(self, mod_classes: list[type[Modification]]):
+ super().__init__(mod_classes)
+ self._iters_by_version: dict[MzVersion, Iterator[list[Modification]]] = {}
+
+ def _iter_values_for_version(
+ self, version: MzVersion
+ ) -> Iterator[list[Modification]]:
+ for cls in self.mod_classes:
+ for value in cls.values(version):
+ yield [cls(value)]
+
+ def next_mods(self, version: MzVersion) -> list[Modification]:
+ it = self._iters_by_version.setdefault(
+ version, self._iter_values_for_version(version)
+ )
+ try:
+ return next(it)
+ except StopIteration:
+ del self._iters_by_version[version]
+ raise
+
+
+class CombineModSource(ModSource):
+ def __init__(self, mod_classes: list[type[Modification]], rng: random.Random):
+ super().__init__(mod_classes)
+ self.rng = rng
+
+ def next_mods(self, version: MzVersion) -> list[Modification]:
+ return [
+ cls(self.rng.choice(cls.good_values(version))) for cls in self.mod_classes
+ ]
+
+
+def make_mod_source(
+ properties: Properties, mod_classes: list[type[Modification]], rng: random.Random
+) -> ModSource:
+ if properties == Properties.Defaults:
+ return DefaultModSource(mod_classes)
+ elif properties == Properties.Individual:
+ return IndividualModSource(mod_classes)
+ elif properties == Properties.Combine:
+ return CombineModSource(mod_classes, rng)
+ else:
+ raise ValueError(f"Unhandled properties: {properties}")
def workflow_default(c: Composition, parser: WorkflowArgumentParser) -> None:
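
The `ModSource` classes above replace the old `get_mods` generator with an explicit protocol: call `next_mods(version)` repeatedly until it raises `StopIteration` (`CombineModSource` never raises; its callers bound it by `--runtime` instead). A minimal driver sketch, using this module's own names with an arbitrary seed and version:

```python
# Assumes this module's imports (random, MzVersion, etc.) are in scope.
rng = random.Random(42)  # arbitrary seed for the sketch
source = make_mod_source(Properties.Individual, all_modifications(), rng)
version = MzVersion.parse_mz("v26.0.0")  # arbitrary version for the sketch
try:
    while True:
        mods = source.next_mods(version)
        print([type(mod).__name__ for mod in mods])
except StopIteration:
    pass  # Individual and Default sources signal exhaustion this way
```
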
@@ -1534,7 +1651,8 @@ def workflow_default(c: Composition, parser: WorkflowArgumentParser) -> None:
"0.29.0"
), f"kind >= v0.29.0 required, while you are on {kind_version}"
- c.up(Service("testdrive", idle=True))
+ c.up(Service("testdrive", idle=True), Service("mz-debug", idle=True))
+ c.invoke("cp", "mz-debug:/usr/local/bin/mz-debug", ".")
cluster = "kind"
clusters = spawn.capture(["kind", "get", "clusters"]).strip().split("\n")
@@ -1577,7 +1695,6 @@ def workflow_default(c: Composition, parser: WorkflowArgumentParser) -> None:
definition["secret"] = materialize_setup[1]
definition["materialize"] = materialize_setup[2]
- current_version = get_version(args.tag)
if args.orchestratord_override:
definition["operator"]["operator"]["image"]["tag"] = get_tag(args.tag)
# TODO: database-issues#9696, makes environmentd -> clusterd connections fail
@@ -1629,91 +1746,59 @@ def workflow_default(c: Composition, parser: WorkflowArgumentParser) -> None:
action = Action(args.action)
properties = Properties(args.properties)
-
- def get_mods() -> Iterator[list[Modification]]:
- if properties == Properties.Defaults:
- yield [mod_class(mod_class.default()) for mod_class in mod_classes]
- yield [NumMaterializeEnvironments(2)]
- elif properties == Properties.Individual:
- for mod_class in mod_classes:
- for value in mod_class.values(current_version):
- if value in mod_class.values(current_version):
- yield [mod_class(value)]
- elif properties == Properties.Combine:
- assert args.runtime
- while time.time() < end_time:
- yield [
- mod_class(rng.choice(mod_class.good_values(current_version)))
- for mod_class in mod_classes
- ]
- else:
- raise ValueError(f"Unhandled properties value {properties}")
-
- mods_it = get_mods()
+ mod_source = make_mod_source(properties, mod_classes, rng)
try:
if action == Action.Noop:
- for mods in mods_it:
+ while True:
+ mods = mod_source.next_mods(get_version(args.tag))
if args.tag:
mods.append(EnvironmentdImageRef(str(args.tag)))
run_scenario([mods], definition)
+
elif action == Action.Upgrade:
- assert not ui.env_is_truthy(
- "MZ_GHCR", MZ_GHCR_DEFAULT
- ), "Manually set MZ_GHCR=0 as an environment variable for upgrade testing"
- assert args.runtime
end_time = (
- datetime.datetime.now() + datetime.timedelta(seconds=args.runtime)
+ datetime.datetime.now()
+ + datetime.timedelta(seconds=args.runtime or 3600)
).timestamp()
versions = get_all_self_managed_versions()
+
while time.time() < end_time:
current_version = rng.choice(versions[:-1])
- selected_versions = [
- current_version,
- get_upgrade_target(rng, current_version, versions),
- ]
- try:
- mod = next(mods_it)
- except StopIteration:
- mods_it = get_mods()
- mod = next(mods_it)
+ target_version = get_upgrade_target(rng, current_version, versions)
+ mods = mod_source.next_mods(current_version)
scenario = [
- [EnvironmentdImageRef(str(version))] + mod
- for version in selected_versions
+ [EnvironmentdImageRef(str(v))] + mods
+ for v in (current_version, target_version)
]
run_scenario(scenario, definition)
+
elif action == Action.UpgradeChain:
- assert not ui.env_is_truthy(
- "MZ_GHCR", MZ_GHCR_DEFAULT
- ), "Manually set MZ_GHCR=0 as an environment variable for upgrade testing"
- assert args.runtime
end_time = (
- datetime.datetime.now() + datetime.timedelta(seconds=args.runtime)
+ datetime.datetime.now()
+ + datetime.timedelta(seconds=args.runtime or 3600)
).timestamp()
versions = get_all_self_managed_versions()
+
while time.time() < end_time:
current_version = rng.choice(versions)
- selected_versions = [current_version]
+ chain = [current_version]
next_version = current_version
+
try:
- for i in range(len(versions)):
+ for _ in range(len(versions)):
next_version = get_upgrade_target(rng, next_version, versions)
- selected_versions.append(next_version)
+ chain.append(next_version)
except ValueError:
# We can't upgrade any further; just run the test as far as it goes now
pass
- try:
- mod = next(mods_it)
- except StopIteration:
- mods_it = get_mods()
- mod = next(mods_it)
+
+ mods = mod_source.next_mods(current_version)
scenario = [
- [EnvironmentdImageRef(str(version))] + mod for version in versions
+ [EnvironmentdImageRef(str(version))] + mods for version in chain
]
- assert len(scenario) == len(
- versions
- ), f"Expected scenario with {len(versions)} steps, but only found: {scenario}"
run_scenario(scenario, definition)
+
else:
raise ValueError(f"Unhandled action {action}")
except StopIteration:
@@ -1722,6 +1807,62 @@ def get_mods() -> Iterator[list[Modification]]:
def setup(cluster: str):
spawn.runv(["kind", "delete", "cluster", "--name", cluster])
+
+ try:
+ spawn.runv(["docker", "network", "create", "kind"])
+ except subprocess.CalledProcessError:
+ pass  # the "kind" network may already exist
+ try:
+ spawn.runv(
+ [
+ "docker",
+ "run",
+ "-d",
+ "--name",
+ "proxy-docker-hub",
+ "--restart=always",
+ "--net=kind",
+ "-v",
+ f"{MZ_ROOT}/misc/kind/cache/docker-hub:/var/lib/registry",
+ "-e",
+ "REGISTRY_PROXY_REMOTEURL=https://registry-1.docker.io",
+ "registry:2",
+ ]
+ )
+ except subprocess.CalledProcessError:
+ pass  # proxy-docker-hub may already be running
+ try:
+ spawn.runv(
+ [
+ "docker",
+ "run",
+ "-d",
+ "--name",
+ "proxy-ghcr",
+ "--restart=always",
+ "--net=kind",
+ "-v",
+ f"{MZ_ROOT}/misc/kind/cache/ghcr:/var/lib/registry",
+ "-e",
+ "REGISTRY_PROXY_REMOTEURL=https://ghcr.io",
+ "registry:2",
+ ]
+ )
+ except subprocess.CalledProcessError:
+ pass  # proxy-ghcr may already be running
+
+ with (
+ open(MZ_ROOT / "test" / "orchestratord" / "cluster.yaml.tmpl") as in_file,
+ open(MZ_ROOT / "test" / "orchestratord" / "cluster.yaml", "w") as out_file,
+ ):
+ text = in_file.read()
+ out_file.write(
+ text.replace(
+ "$DOCKER_CONFIG",
+ os.getenv("DOCKER_CONFIG", f'{os.environ["HOME"]}/.docker'),
+ )
+ )
+
spawn.runv(
[
"kind",
@@ -1806,18 +1947,14 @@ def run_scenario(
mod.modify(definition)
if mod.value in mod.failed_reconciliation_values():
expect_fail = True
- if not initialize:
- definition["materialize"]["spec"][
- "rolloutStrategy"
- ] = "ImmediatelyPromoteCausingDowntime"
- definition["materialize"]["spec"]["requestRollout"] = str(uuid.uuid4())
- run(definition, expect_fail)
if initialize:
init(definition)
run(definition, expect_fail)
initialize = False # only initialize once
else:
- upgrade(definition, expect_fail)
+ upgrade_operator_helm_chart(definition, expect_fail)
+ definition["materialize"]["spec"]["requestRollout"] = str(uuid.uuid4())
+ run(definition, expect_fail)
mod_dict = {mod.__class__: mod.value for mod in mods}
for subclass in all_subclasses(Modification):
if subclass not in mod_dict:
@@ -1831,6 +1968,9 @@ def run_scenario(
f"Reproduce with bin/mzcompose --find orchestratord run default --recreate-cluster --scenario='{scenario_json}'"
)
raise
+ finally:
+ if not expect_fail:
+ run_mz_debug()
def init(definition: dict[str, Any]) -> None:
@@ -1866,7 +2006,7 @@ def init(definition: dict[str, Any]) -> None:
stderr=subprocess.DEVNULL,
)
- for i in range(120):
+ for i in range(240):
try:
spawn.capture(
[
@@ -1890,7 +2030,7 @@ def init(definition: dict[str, Any]) -> None:
raise ValueError("Never completed")
-def upgrade(definition: dict[str, Any], expect_fail: bool) -> None:
+def upgrade_operator_helm_chart(definition: dict[str, Any], expect_fail: bool) -> None:
spawn.runv(
[
"helm",
@@ -1907,7 +2047,6 @@ def upgrade(definition: dict[str, Any], expect_fail: bool) -> None:
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
)
- post_run_check(definition, expect_fail)
def run(definition: dict[str, Any], expect_fail: bool) -> None:
@@ -1926,12 +2065,47 @@ def run(definition: dict[str, Any], expect_fail: bool) -> None:
except subprocess.CalledProcessError as e:
print(f"Failed to apply: {e.stdout}\nSTDERR:{e.stderr}")
raise
- post_run_check(definition, expect_fail)
+ if definition["materialize"]["spec"].get("rolloutStrategy") == "ManuallyPromote":
+ # First wait for it to become ready to promote, but not yet promoted
+ for _ in range(900):
+ time.sleep(1)
+ if is_ready_to_manually_promote():
+ break
+ else:
+ spawn.runv(
+ [
+ "kubectl",
+ "get",
+ "materializes",
+ "-n",
+ "materialize-environment",
+ "-o",
+ "yaml",
+ ],
+ )
+ raise RuntimeError("Never became ready for manual promotion")
-def post_run_check(definition: dict[str, Any], expect_fail: bool) -> None:
- for i in range(60):
- try:
+ # Wait and verify that the rollout does not promote on its own
+ time.sleep(30)
+ if not is_ready_to_manually_promote():
+ spawn.runv(
+ [
+ "kubectl",
+ "get",
+ "materializes",
+ "-n",
+ "materialize-environment",
+ "-o",
+ "yaml",
+ ],
+ )
+ raise RuntimeError(
+ "Stopped being ready for manual promotion before promoting"
+ )
+
+ # Manually promote it
+ mz = json.loads(
spawn.capture(
[
"kubectl",
@@ -1939,14 +2113,97 @@ def post_run_check(definition: dict[str, Any], expect_fail: bool) -> None:
"materializes",
"-n",
"materialize-environment",
+ "-o",
+ "json",
],
stderr=subprocess.DEVNULL,
)
- break
+ )["items"][0]
+ definition["materialize"]["spec"]["forcePromote"] = mz["spec"]["requestRollout"]
+ try:
+ spawn.runv(
+ ["kubectl", "apply", "-f", "-"],
+ stdin=yaml.dump(definition["materialize"]).encode(),
+ )
+ except subprocess.CalledProcessError as e:
+ print(f"Failed to apply: {e.stdout}\nSTDERR:{e.stderr}")
+ raise
+
+ post_run_check(definition, expect_fail)
+
+
+def is_ready_to_manually_promote() -> bool:
+ data = json.loads(
+ spawn.capture(
+ [
+ "kubectl",
+ "get",
+ "materializes",
+ "-n",
+ "materialize-environment",
+ "-o",
+ "json",
+ ],
+ stderr=subprocess.DEVNULL,
+ )
+ )
+ conditions = data["items"][0].get("status", {}).get("conditions") or []
+ return (
+ bool(conditions)
+ and conditions[0]["type"] == "UpToDate"
+ and conditions[0]["status"] == "Unknown"
+ and conditions[0]["reason"] == "ReadyToPromote"
+ )
+
+
+def post_run_check(definition: dict[str, Any], expect_fail: bool) -> None:
+ for i in range(900):
+ time.sleep(1)
+ try:
+ data = json.loads(
+ spawn.capture(
+ [
+ "kubectl",
+ "get",
+ "materializes",
+ "-n",
+ "materialize-environment",
+ "-o",
+ "json",
+ ],
+ stderr=subprocess.DEVNULL,
+ )
+ )
+ status = data["items"][0].get("status")
+ if not status:
+ continue
+ if expect_fail:
+ break
+ if (
+ not status["conditions"]
+ or status["conditions"][0]["type"] != "UpToDate"
+ or status["conditions"][0]["status"] != "True"
+ ):
+ continue
+ if (
+ status["lastCompletedRolloutRequest"]
+ == data["items"][0]["spec"]["requestRollout"]
+ ):
+ break
except subprocess.CalledProcessError:
pass
- time.sleep(1)
else:
+ spawn.runv(
+ [
+ "kubectl",
+ "get",
+ "materializes",
+ "-n",
+ "materialize-environment",
+ "-o",
+ "yaml",
+ ],
+ )
raise ValueError("Never completed")
for i in range(480):
@@ -2005,5 +2262,3 @@ def post_run_check(definition: dict[str, Any], expect_fail: bool) -> None:
]
)
raise ValueError("Never completed")
- # Wait a bit for the status to stabilize
- time.sleep(60)