diff --git a/.buildkite/build-start-operator.sh b/.buildkite/build-start-operator.sh index ef43eba6d71..4c81fbe96be 100644 --- a/.buildkite/build-start-operator.sh +++ b/.buildkite/build-start-operator.sh @@ -7,10 +7,14 @@ # to kick off from the release branch so tests should match up accordingly. if [ "$IS_FROM_RAY_RELEASE_AUTOMATION" = 1 ]; then - helm repo update && helm install kuberay/kuberay-operator + helm repo update + echo "Installing helm chart with test override values (feature gates enabled as needed)" + # NOTE: The override file is CI/test-only. It is NOT part of the released chart defaults. + helm install kuberay-operator kuberay/kuberay-operator -f ../.buildkite/values-kuberay-operator-override.yaml KUBERAY_TEST_RAY_IMAGE="rayproject/ray:nightly.$(date +'%y%m%d').${RAY_NIGHTLY_COMMIT:0:6}-py39" && export KUBERAY_TEST_RAY_IMAGE else IMG=kuberay/operator:nightly make docker-image && kind load docker-image kuberay/operator:nightly && - IMG=kuberay/operator:nightly make deploy + echo "Deploying operator with test overrides (feature gates via test-overrides overlay)" + IMG=kuberay/operator:nightly make deploy-with-override fi diff --git a/.buildkite/values-kuberay-operator-override.yaml b/.buildkite/values-kuberay-operator-override.yaml new file mode 100644 index 00000000000..7dc396edd71 --- /dev/null +++ b/.buildkite/values-kuberay-operator-override.yaml @@ -0,0 +1,18 @@ +# Generic Helm values override used only in CI / e2e test environments. +# Intent: +# - Allow e2e tests to turn on alpha / experimental feature gates (e.g. RayJobDeletionPolicy) +# - Provide a single place contributors can extend with additional overrides needed for tests +# - Keep the default published Helm chart behavior unchanged for normal users +# Scope / Safety: +# - This file is never referenced by the base chart; it is opt‑in via buildkite or manual helm install +# - Do NOT rename it to values.yaml or commit changes that enable unstable features by default +# Usage examples: +# helm install kuberay-operator kuberay/kuberay-operator -f ../.buildkite/values-kuberay-operator-override.yaml +# (add or remove feature gates below as e2e scenarios expand) +# +# Current overrides: enable RayJobDeletionPolicy alpha feature gate alongside the existing status conditions gate. +featureGates: + - name: RayClusterStatusConditions + enabled: true + - name: RayJobDeletionPolicy + enabled: true diff --git a/docs/reference/api.md b/docs/reference/api.md index 4b495fef69e..ced95d1e45d 100644 --- a/docs/reference/api.md +++ b/docs/reference/api.md @@ -55,11 +55,28 @@ _Appears in:_ -#### DeletionPolicy +#### DeletionCondition + + + +DeletionCondition specifies the trigger conditions for a deletion action. + + + +_Appears in:_ +- [DeletionRule](#deletionrule) +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `ttlSeconds` _integer_ | TTLSeconds is the time in seconds from when the JobStatus
reaches the specified terminal state to when this deletion action should be triggered.
The value must be a non-negative integer. | 0 | Minimum: 0
| + + +#### DeletionPolicy +DeletionPolicy is the legacy single-stage deletion policy. +Deprecated: This struct is part of the legacy API. Use DeletionRule for new configurations. @@ -68,7 +85,7 @@ _Appears in:_ | Field | Description | Default | Validation | | --- | --- | --- | --- | -| `policy` _[DeletionPolicyType](#deletionpolicytype)_ | Valid values are 'DeleteCluster', 'DeleteWorkers', 'DeleteSelf' or 'DeleteNone'. | | | +| `policy` _[DeletionPolicyType](#deletionpolicytype)_ | Policy is the action to take when the condition is met.
This field is logically required when using the legacy OnSuccess/OnFailure policies.
It is marked as '+optional' at the API level to allow the 'deletionRules' field to be used instead. | | Enum: [DeleteCluster DeleteWorkers DeleteSelf DeleteNone]
| #### DeletionPolicyType @@ -81,14 +98,51 @@ _Underlying type:_ _string_ _Appears in:_ - [DeletionPolicy](#deletionpolicy) +- [DeletionRule](#deletionrule) + + + +#### DeletionRule +DeletionRule defines a single deletion action and its trigger condition. +This is the new, recommended way to define deletion behavior. + + + +_Appears in:_ +- [DeletionStrategy](#deletionstrategy) + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `policy` _[DeletionPolicyType](#deletionpolicytype)_ | Policy is the action to take when the condition is met. This field is required. | | Enum: [DeleteCluster DeleteWorkers DeleteSelf DeleteNone]
| +| `condition` _[DeletionCondition](#deletioncondition)_ | The condition under which this deletion rule is triggered. This field is required. | | | + + #### DeletionStrategy +DeletionStrategy configures automated cleanup after the RayJob reaches a terminal state. +Two mutually exclusive styles are supported: + + + Legacy: provide both onSuccess and onFailure (deprecated; removal planned for 1.6.0). May be combined with shutdownAfterJobFinishes and (optionally) global TTLSecondsAfterFinished. + Rules: provide deletionRules (non-empty list). Rules mode is incompatible with shutdownAfterJobFinishes, legacy fields, and the global TTLSecondsAfterFinished (use per‑rule condition.ttlSeconds instead). + + +Semantics: + - A non-empty deletionRules selects rules mode; empty lists are treated as unset. + - Legacy requires both onSuccess and onFailure; specifying only one is invalid. + - Global TTLSecondsAfterFinished > 0 requires shutdownAfterJobFinishes=true; therefore it cannot be used with rules mode or with legacy alone (no shutdown). + - Feature gate RayJobDeletionPolicy must be enabled when this block is present. + +Validation: + - CRD XValidations prevent mixing legacy fields with deletionRules and enforce legacy completeness. + - Controller logic enforces rules vs shutdown exclusivity and TTL constraints. + - onSuccess/onFailure are deprecated; migration to deletionRules is encouraged. @@ -97,8 +151,9 @@ _Appears in:_ | Field | Description | Default | Validation | | --- | --- | --- | --- | -| `onSuccess` _[DeletionPolicy](#deletionpolicy)_ | | | | -| `onFailure` _[DeletionPolicy](#deletionpolicy)_ | | | | +| `onSuccess` _[DeletionPolicy](#deletionpolicy)_ | OnSuccess is the deletion policy for a successful RayJob.
Deprecated: Use `deletionRules` instead for more flexible, multi-stage deletion strategies.
This field will be removed in release 1.6.0. | | | +| `onFailure` _[DeletionPolicy](#deletionpolicy)_ | OnFailure is the deletion policy for a failed RayJob.
Deprecated: Use `deletionRules` instead for more flexible, multi-stage deletion strategies.
This field will be removed in release 1.6.0. | | | +| `deletionRules` _[DeletionRule](#deletionrule) array_ | DeletionRules is a list of deletion rules, processed based on their trigger conditions.
While the rules can be used to define a sequence, if multiple rules are overdue (e.g., due to controller downtime),
the most impactful rule (e.g., DeleteSelf) will be executed first to prioritize resource cleanup. | | MinItems: 1
| @@ -242,7 +297,7 @@ _Appears in:_ | `clusterSelector` _object (keys:string, values:string)_ | clusterSelector is used to select running rayclusters by labels | | | | `submitterConfig` _[SubmitterConfig](#submitterconfig)_ | Configurations of submitter k8s job. | | | | `managedBy` _string_ | ManagedBy is an optional configuration for the controller or entity that manages a RayJob.
The value must be either 'ray.io/kuberay-operator' or 'kueue.x-k8s.io/multikueue'.
The kuberay-operator reconciles a RayJob which doesn't have this field at all or
the field value is the reserved string 'ray.io/kuberay-operator',
but delegates reconciling the RayJob with 'kueue.x-k8s.io/multikueue' to the Kueue.
The field is immutable. | | | -| `deletionStrategy` _[DeletionStrategy](#deletionstrategy)_ | DeletionStrategy indicates what resources of the RayJob and how they are deleted upon job completion.
If unset, deletion policy is based on 'spec.shutdownAfterJobFinishes'.
This field requires the RayJobDeletionPolicy feature gate to be enabled. | | | +| `deletionStrategy` _[DeletionStrategy](#deletionstrategy)_ | DeletionStrategy automates post-completion cleanup.
Choose one style or omit:
- Legacy: both onSuccess & onFailure (deprecated; may combine with shutdownAfterJobFinishes and TTLSecondsAfterFinished).
- Rules: deletionRules (non-empty) — incompatible with shutdownAfterJobFinishes, legacy fields, and global TTLSecondsAfterFinished (use per-rule condition.ttlSeconds).
Global TTLSecondsAfterFinished > 0 requires shutdownAfterJobFinishes=true.
Feature gate RayJobDeletionPolicy must be enabled when this field is set. | | | | `entrypoint` _string_ | Entrypoint represents the command to start execution. | | | | `runtimeEnvYAML` _string_ | RuntimeEnvYAML represents the runtime environment configuration
provided as a multi-line YAML string. | | | | `jobId` _string_ | If jobId is not set, a new jobId will be auto-generated. | | | diff --git a/helm-chart/kuberay-operator/crds/ray.io_rayjobs.yaml b/helm-chart/kuberay-operator/crds/ray.io_rayjobs.yaml index 8f8679ca607..f613645cb64 100644 --- a/helm-chart/kuberay-operator/crds/ray.io_rayjobs.yaml +++ b/helm-chart/kuberay-operator/crds/ray.io_rayjobs.yaml @@ -60,34 +60,66 @@ spec: type: object deletionStrategy: properties: + deletionRules: + items: + properties: + condition: + properties: + jobStatus: + enum: + - SUCCEEDED + - FAILED + type: string + ttlSeconds: + default: 0 + format: int32 + minimum: 0 + type: integer + required: + - jobStatus + type: object + policy: + enum: + - DeleteCluster + - DeleteWorkers + - DeleteSelf + - DeleteNone + type: string + required: + - condition + - policy + type: object + minItems: 1 + type: array + x-kubernetes-list-type: atomic onFailure: properties: policy: + enum: + - DeleteCluster + - DeleteWorkers + - DeleteSelf + - DeleteNone type: string - x-kubernetes-validations: - - message: the policy field value must be either 'DeleteCluster', - 'DeleteWorkers', 'DeleteSelf', or 'DeleteNone' - rule: self in ['DeleteCluster', 'DeleteWorkers', 'DeleteSelf', - 'DeleteNone'] - required: - - policy type: object onSuccess: properties: policy: + enum: + - DeleteCluster + - DeleteWorkers + - DeleteSelf + - DeleteNone type: string - x-kubernetes-validations: - - message: the policy field value must be either 'DeleteCluster', - 'DeleteWorkers', 'DeleteSelf', or 'DeleteNone' - rule: self in ['DeleteCluster', 'DeleteWorkers', 'DeleteSelf', - 'DeleteNone'] - required: - - policy type: object - required: - - onFailure - - onSuccess type: object + x-kubernetes-validations: + - message: legacy policies (onSuccess/onFailure) and deletionRules + cannot be used together within the same deletionStrategy + rule: '!((has(self.onSuccess) || has(self.onFailure)) && has(self.deletionRules))' + - message: deletionStrategy requires either BOTH onSuccess and onFailure, + OR the deletionRules field (cannot be empty) + rule: ((has(self.onSuccess) && has(self.onFailure)) || has(self.deletionRules)) entrypoint: type: string entrypointNumCpus: diff --git a/ray-operator/Makefile b/ray-operator/Makefile index 3eda8a616c4..214035963b7 100644 --- a/ray-operator/Makefile +++ b/ray-operator/Makefile @@ -67,7 +67,6 @@ test: ENVTEST_K8S_VERSION ?= 1.24.2 test: manifests fmt vet envtest ## Run tests. KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(LOCALBIN) -p path)" go test $(WHAT) -coverprofile cover.out -# You can use `go test -timeout 30m -v ./test/e2e/rayjob_test.go ./test/e2e/support.go` if you only want to run tests in `rayjob_test.go`. test-e2e: WHAT ?= ./test/e2e test-e2e: manifests fmt vet ## Run e2e tests. go test -timeout 30m -v $(WHAT) @@ -88,6 +87,14 @@ test-sampleyaml: WHAT ?= ./test/sampleyaml test-sampleyaml: manifests fmt vet go test -timeout 30m -v $(WHAT) +test-e2e-rayjob: WHAT ?= ./test/e2erayjob +test-e2e-rayjob: manifests fmt vet ## Run e2e tests. + go test -timeout 30m -v $(WHAT) + +test-e2e-rayservice: WHAT ?= ./test/e2erayservice +test-e2e-rayservice: manifests fmt vet ## Run e2e tests. 
+ go test -timeout 30m -v $(WHAT) + sync: helm api-docs ./hack/update-codegen.sh @@ -136,6 +143,15 @@ deploy: manifests kustomize ## Deploy controller to the K8s cluster specified in cd config/default && $(KUSTOMIZE) edit set image kuberay/operator=${IMG} $(KUSTOMIZE) build config/default | kubectl apply --server-side=true -f - +# NOTE FOR CONTRIBUTORS: +# deploy-with-override is an e2e/CI-only deployment path. It applies a Kustomize overlay that +# enables test-only feature gates (e.g. RayJobDeletionPolicy) without changing the default +# behavior of the base Helm chart or the standard 'make deploy'. Add additional test overrides +# to the overlay (config/overlays/test-overrides) rather than modifying the base. +deploy-with-override: manifests kustomize ## Deploy controller with test-only feature gate overrides (does NOT affect default chart). + cd config/default && $(KUSTOMIZE) edit set image kuberay/operator=${IMG} + $(KUSTOMIZE) build config/overlays/test-overrides | kubectl apply --server-side=true -f - + undeploy: ## Undeploy controller from the K8s cluster specified in ~/.kube/config. $(KUSTOMIZE) build config/default | kubectl delete -f - diff --git a/ray-operator/apis/ray/v1/rayjob_types.go b/ray-operator/apis/ray/v1/rayjob_types.go index 8f78eceed07..705d50dfd40 100644 --- a/ray-operator/apis/ray/v1/rayjob_types.go +++ b/ray-operator/apis/ray/v1/rayjob_types.go @@ -87,19 +87,87 @@ const ( SidecarMode JobSubmissionMode = "SidecarMode" // Submit job via a sidecar container in the Ray head Pod ) -type DeletionPolicyType string - +// DeletionStrategy configures automated cleanup after the RayJob reaches a terminal state. +// Two mutually exclusive styles are supported: +// +// Legacy: provide both onSuccess and onFailure (deprecated; removal planned for 1.6.0). May be combined with shutdownAfterJobFinishes and (optionally) global TTLSecondsAfterFinished. +// Rules: provide deletionRules (non-empty list). Rules mode is incompatible with shutdownAfterJobFinishes, legacy fields, and the global TTLSecondsAfterFinished (use per‑rule condition.ttlSeconds instead). +// +// Semantics: +// - A non-empty deletionRules selects rules mode; empty lists are treated as unset. +// - Legacy requires both onSuccess and onFailure; specifying only one is invalid. +// - Global TTLSecondsAfterFinished > 0 requires shutdownAfterJobFinishes=true; therefore it cannot be used with rules mode or with legacy alone (no shutdown). +// - Feature gate RayJobDeletionPolicy must be enabled when this block is present. +// +// Validation: +// - CRD XValidations prevent mixing legacy fields with deletionRules and enforce legacy completeness. +// - Controller logic enforces rules vs shutdown exclusivity and TTL constraints. +// - onSuccess/onFailure are deprecated; migration to deletionRules is encouraged. +// +// +kubebuilder:validation:XValidation:rule="!((has(self.onSuccess) || has(self.onFailure)) && has(self.deletionRules))",message="legacy policies (onSuccess/onFailure) and deletionRules cannot be used together within the same deletionStrategy" +// +kubebuilder:validation:XValidation:rule="((has(self.onSuccess) && has(self.onFailure)) || has(self.deletionRules))",message="deletionStrategy requires either BOTH onSuccess and onFailure, OR the deletionRules field (cannot be empty)" type DeletionStrategy struct { - OnSuccess DeletionPolicy `json:"onSuccess"` - OnFailure DeletionPolicy `json:"onFailure"` + // OnSuccess is the deletion policy for a successful RayJob. 
+ // Deprecated: Use `deletionRules` instead for more flexible, multi-stage deletion strategies. + // This field will be removed in release 1.6.0. + // +optional + OnSuccess *DeletionPolicy `json:"onSuccess,omitempty"` + + // OnFailure is the deletion policy for a failed RayJob. + // Deprecated: Use `deletionRules` instead for more flexible, multi-stage deletion strategies. + // This field will be removed in release 1.6.0. + // +optional + OnFailure *DeletionPolicy `json:"onFailure,omitempty"` + + // DeletionRules is a list of deletion rules, processed based on their trigger conditions. + // While the rules can be used to define a sequence, if multiple rules are overdue (e.g., due to controller downtime), + // the most impactful rule (e.g., DeleteSelf) will be executed first to prioritize resource cleanup. + // +optional + // +listType=atomic + // +kubebuilder:validation:MinItems=1 + DeletionRules []DeletionRule `json:"deletionRules,omitempty"` } +// DeletionRule defines a single deletion action and its trigger condition. +// This is the new, recommended way to define deletion behavior. +type DeletionRule struct { + // Policy is the action to take when the condition is met. This field is required. + // +kubebuilder:validation:Enum=DeleteCluster;DeleteWorkers;DeleteSelf;DeleteNone + Policy DeletionPolicyType `json:"policy"` + + // The condition under which this deletion rule is triggered. This field is required. + Condition DeletionCondition `json:"condition"` +} + +// DeletionCondition specifies the trigger conditions for a deletion action. +type DeletionCondition struct { + // JobStatus is the terminal status of the RayJob that triggers this condition. This field is required. + // For the initial implementation, only "SUCCEEDED" and "FAILED" are supported. + // +kubebuilder:validation:Enum=SUCCEEDED;FAILED + JobStatus JobStatus `json:"jobStatus"` + + // TTLSeconds is the time in seconds from when the JobStatus + // reaches the specified terminal state to when this deletion action should be triggered. + // The value must be a non-negative integer. + // +kubebuilder:default=0 + // +kubebuilder:validation:Minimum=0 + // +optional + TTLSeconds int32 `json:"ttlSeconds,omitempty"` +} + +// DeletionPolicy is the legacy single-stage deletion policy. +// Deprecated: This struct is part of the legacy API. Use DeletionRule for new configurations. type DeletionPolicy struct { - // Valid values are 'DeleteCluster', 'DeleteWorkers', 'DeleteSelf' or 'DeleteNone'. - // +kubebuilder:validation:XValidation:rule="self in ['DeleteCluster', 'DeleteWorkers', 'DeleteSelf', 'DeleteNone']",message="the policy field value must be either 'DeleteCluster', 'DeleteWorkers', 'DeleteSelf', or 'DeleteNone'" - Policy *DeletionPolicyType `json:"policy"` + // Policy is the action to take when the condition is met. + // This field is logically required when using the legacy OnSuccess/OnFailure policies. + // It is marked as '+optional' at the API level to allow the 'deletionRules' field to be used instead. + // +kubebuilder:validation:Enum=DeleteCluster;DeleteWorkers;DeleteSelf;DeleteNone + // +optional + Policy *DeletionPolicyType `json:"policy,omitempty"` } +type DeletionPolicyType string + const ( DeleteCluster DeletionPolicyType = "DeleteCluster" // To delete the entire RayCluster custom resource on job completion. DeleteWorkers DeletionPolicyType = "DeleteWorkers" // To delete only the workers on job completion. 
@@ -158,9 +226,12 @@ type RayJobSpec struct { // +kubebuilder:validation:XValidation:rule="self in ['ray.io/kuberay-operator', 'kueue.x-k8s.io/multikueue']",message="the managedBy field value must be either 'ray.io/kuberay-operator' or 'kueue.x-k8s.io/multikueue'" // +optional ManagedBy *string `json:"managedBy,omitempty"` - // DeletionStrategy indicates what resources of the RayJob and how they are deleted upon job completion. - // If unset, deletion policy is based on 'spec.shutdownAfterJobFinishes'. - // This field requires the RayJobDeletionPolicy feature gate to be enabled. + // DeletionStrategy automates post-completion cleanup. + // Choose one style or omit: + // - Legacy: both onSuccess & onFailure (deprecated; may combine with shutdownAfterJobFinishes and TTLSecondsAfterFinished). + // - Rules: deletionRules (non-empty) — incompatible with shutdownAfterJobFinishes, legacy fields, and global TTLSecondsAfterFinished (use per-rule condition.ttlSeconds). + // Global TTLSecondsAfterFinished > 0 requires shutdownAfterJobFinishes=true. + // Feature gate RayJobDeletionPolicy must be enabled when this field is set. // +optional DeletionStrategy *DeletionStrategy `json:"deletionStrategy,omitempty"` // Entrypoint represents the command to start execution. diff --git a/ray-operator/apis/ray/v1/zz_generated.deepcopy.go b/ray-operator/apis/ray/v1/zz_generated.deepcopy.go index b4cb5decf12..c4828c02f06 100644 --- a/ray-operator/apis/ray/v1/zz_generated.deepcopy.go +++ b/ray-operator/apis/ray/v1/zz_generated.deepcopy.go @@ -103,6 +103,21 @@ func (in *AutoscalerOptions) DeepCopy() *AutoscalerOptions { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *DeletionCondition) DeepCopyInto(out *DeletionCondition) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DeletionCondition. +func (in *DeletionCondition) DeepCopy() *DeletionCondition { + if in == nil { + return nil + } + out := new(DeletionCondition) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *DeletionPolicy) DeepCopyInto(out *DeletionPolicy) { *out = *in @@ -123,11 +138,40 @@ func (in *DeletionPolicy) DeepCopy() *DeletionPolicy { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *DeletionRule) DeepCopyInto(out *DeletionRule) { + *out = *in + out.Condition = in.Condition +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DeletionRule. +func (in *DeletionRule) DeepCopy() *DeletionRule { + if in == nil { + return nil + } + out := new(DeletionRule) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *DeletionStrategy) DeepCopyInto(out *DeletionStrategy) { *out = *in - in.OnSuccess.DeepCopyInto(&out.OnSuccess) - in.OnFailure.DeepCopyInto(&out.OnFailure) + if in.OnSuccess != nil { + in, out := &in.OnSuccess, &out.OnSuccess + *out = new(DeletionPolicy) + (*in).DeepCopyInto(*out) + } + if in.OnFailure != nil { + in, out := &in.OnFailure, &out.OnFailure + *out = new(DeletionPolicy) + (*in).DeepCopyInto(*out) + } + if in.DeletionRules != nil { + in, out := &in.DeletionRules, &out.DeletionRules + *out = make([]DeletionRule, len(*in)) + copy(*out, *in) + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DeletionStrategy. diff --git a/ray-operator/config/crd/bases/ray.io_rayjobs.yaml b/ray-operator/config/crd/bases/ray.io_rayjobs.yaml index 8f8679ca607..f613645cb64 100644 --- a/ray-operator/config/crd/bases/ray.io_rayjobs.yaml +++ b/ray-operator/config/crd/bases/ray.io_rayjobs.yaml @@ -60,34 +60,66 @@ spec: type: object deletionStrategy: properties: + deletionRules: + items: + properties: + condition: + properties: + jobStatus: + enum: + - SUCCEEDED + - FAILED + type: string + ttlSeconds: + default: 0 + format: int32 + minimum: 0 + type: integer + required: + - jobStatus + type: object + policy: + enum: + - DeleteCluster + - DeleteWorkers + - DeleteSelf + - DeleteNone + type: string + required: + - condition + - policy + type: object + minItems: 1 + type: array + x-kubernetes-list-type: atomic onFailure: properties: policy: + enum: + - DeleteCluster + - DeleteWorkers + - DeleteSelf + - DeleteNone type: string - x-kubernetes-validations: - - message: the policy field value must be either 'DeleteCluster', - 'DeleteWorkers', 'DeleteSelf', or 'DeleteNone' - rule: self in ['DeleteCluster', 'DeleteWorkers', 'DeleteSelf', - 'DeleteNone'] - required: - - policy type: object onSuccess: properties: policy: + enum: + - DeleteCluster + - DeleteWorkers + - DeleteSelf + - DeleteNone type: string - x-kubernetes-validations: - - message: the policy field value must be either 'DeleteCluster', - 'DeleteWorkers', 'DeleteSelf', or 'DeleteNone' - rule: self in ['DeleteCluster', 'DeleteWorkers', 'DeleteSelf', - 'DeleteNone'] - required: - - policy type: object - required: - - onFailure - - onSuccess type: object + x-kubernetes-validations: + - message: legacy policies (onSuccess/onFailure) and deletionRules + cannot be used together within the same deletionStrategy + rule: '!((has(self.onSuccess) || has(self.onFailure)) && has(self.deletionRules))' + - message: deletionStrategy requires either BOTH onSuccess and onFailure, + OR the deletionRules field (cannot be empty) + rule: ((has(self.onSuccess) && has(self.onFailure)) || has(self.deletionRules)) entrypoint: type: string entrypointNumCpus: diff --git a/ray-operator/config/overlays/test-overrides/README.md b/ray-operator/config/overlays/test-overrides/README.md new file mode 100644 index 00000000000..9df3cb15a7c --- /dev/null +++ b/ray-operator/config/overlays/test-overrides/README.md @@ -0,0 +1,92 @@ +# Test Overrides Overlay (CI / e2e ONLY) + +This overlay enables test-only / alpha feature gates (currently `RayJobDeletionPolicy`) without modifying: + +- The base manifests under `config/default` +- Generated CRDs (`make generate`) +- Helm chart defaults (`make helm`, users' `helm install` without -f override) + +Use it only in CI or local end-to-end testing when you explicitly need gated behavior. + +--- + +## Why It Exists + +Some feature gates are intentionally disabled by default for stability. 
+E2E tests must exercise them to validate behavior prior to promotion. +This overlay provides a safe, isolated place to turn them on. + +--- + +## Safety Guarantees + +| Concern | Guarantee | +|---------|-----------| +| Default user deploy (`make deploy`) | Unchanged | +| Helm install (no -f override) | Unchanged | +| CRD generation / codegen | Unaffected | +| Feature gates scope | Only those explicitly listed here | + +--- + +## Usage + +Deploy with feature gates enabled: + +```bash +make deploy-with-override IMG=kuberay/operator:nightly +``` + +Helm path (CI release automation): + +```bash +helm install kuberay-operator kuberay/kuberay-operator -f .buildkite/values-kuberay-operator-override.yaml +``` + +--- + +## Adding Another Feature Gate + +1. Edit `deployment-override.yaml` – append your gate inside the existing `--feature-gates=` list. +2. Update `.buildkite/values-kuberay-operator-override.yaml` likewise. +3. Add or adjust e2e tests as needed. + +Keep gate ordering stable to minimize diff noise. + +--- + +## Keeping In Sync + +If the base operator Deployment args change in `config/manager/manager.yaml`: + +1. Copy the updated arg list. +2. Re-apply the feature gates in `deployment-override.yaml`. +3. Re-render to confirm. + +--- + +## Removal / Promotion Flow + +When a gate graduates (enabled by default upstream): + +1. Remove it from the override (if it's default-on, it no longer needs listing). +2. Remove corresponding logic from tests if they branch on gate state. +3. (Optional) Note the graduation in release notes. + +--- + +## Troubleshooting + +| Problem | Action | +|---------|--------| +| Patch no longer applies | Check if Deployment name or container name changed. | +| Gates not taking effect | Confirm args rendered (render target) and operator pod restarted. | +| Unexpected arg order | The strategic merge patch replaces the entire args list; adjust ordering there. | + +--- + +## Do NOT + +- Add unrelated production configuration (RBAC, CRDs, resources) here. +- Reference this overlay from user-facing docs. +- Rename directory without updating `Makefile` targets. diff --git a/ray-operator/config/overlays/test-overrides/deployment-override.yaml b/ray-operator/config/overlays/test-overrides/deployment-override.yaml new file mode 100644 index 00000000000..5f7a1eba665 --- /dev/null +++ b/ray-operator/config/overlays/test-overrides/deployment-override.yaml @@ -0,0 +1,12 @@ +# Strategic merge patch for kuberay-operator Deployment (test / CI only). +apiVersion: apps/v1 +kind: Deployment +metadata: + name: kuberay-operator +spec: + template: + spec: + containers: + - name: kuberay-operator + args: + - --feature-gates=RayClusterStatusConditions=true,RayJobDeletionPolicy=true diff --git a/ray-operator/config/overlays/test-overrides/kustomization.yaml b/ray-operator/config/overlays/test-overrides/kustomization.yaml new file mode 100644 index 00000000000..c1472f6f305 --- /dev/null +++ b/ray-operator/config/overlays/test-overrides/kustomization.yaml @@ -0,0 +1,17 @@ +## ============================================================================ +## Kustomize overlay: test-overrides (CI / e2e only) +## ---------------------------------------------------------------------------- +## Purpose: Enable alpha / experimental feature gates (currently RayJobDeletionPolicy) +## for end-to-end testing without modifying base manifests or Helm defaults. 
+## ============================================================================ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + - ../../default + +patches: + - path: deployment-override.yaml + target: + kind: Deployment + name: kuberay-operator diff --git a/ray-operator/config/samples/ray-job.deletion-rules.yaml b/ray-operator/config/samples/ray-job.deletion-rules.yaml new file mode 100644 index 00000000000..89112b14150 --- /dev/null +++ b/ray-operator/config/samples/ray-job.deletion-rules.yaml @@ -0,0 +1,85 @@ +apiVersion: ray.io/v1 +kind: RayJob +metadata: + name: rayjob-deletion-rules +spec: + entrypoint: | + python -c " + import ray + ray.init() + print(f'ray.cluster_resources(): {ray.cluster_resources()}') + " + # DeletionStrategy defines the deletion policies for a RayJob. + # It allows for fine-grained control over resource cleanup after a job finishes. + # DeletionRules is a list of deletion rules, processed based on their trigger conditions. + # While the rules can be used to define a sequence, if multiple rules are overdue (e.g., due to controller downtime), + # the most impactful rule (e.g., DeleteCluster) will be executed first to prioritize resource cleanup and cost savings. + deletionStrategy: + # This sample demonstrates a staged cleanup process for a RayJob. + # Regardless of whether the job succeeds or fails, the cleanup follows these steps: + # 1. After 30 seconds, the worker pods are deleted. This allows for quick resource release while keeping the head pod for debugging. + # 2. After 60 seconds, the entire RayCluster (including the head pod) is deleted. + # 3. After 90 seconds, the RayJob custom resource itself is deleted, removing it from the Kubernetes API server. + deletionRules: + - condition: + jobStatus: FAILED + ttlSeconds: 30 + policy: DeleteWorkers + - condition: + jobStatus: FAILED + ttlSeconds: 60 + policy: DeleteCluster + - condition: + jobStatus: FAILED + ttlSeconds: 90 + policy: DeleteSelf + - condition: + jobStatus: SUCCEEDED + ttlSeconds: 30 + policy: DeleteWorkers + - condition: + jobStatus: SUCCEEDED + ttlSeconds: 60 + policy: DeleteCluster + - condition: + jobStatus: SUCCEEDED + ttlSeconds: 90 + policy: DeleteSelf + # rayClusterSpec specifies the RayCluster instance to be created by the RayJob controller. + rayClusterSpec: + rayVersion: '2.46.0' + headGroupSpec: + rayStartParams: {} + template: + spec: + containers: + - name: ray-head + image: rayproject/ray:2.46.0 + ports: + - containerPort: 6379 + name: gcs-server + - containerPort: 8265 + name: dashboard + - containerPort: 10001 + name: client + resources: + limits: + cpu: "1" + requests: + cpu: "200m" + workerGroupSpecs: + - replicas: 1 + minReplicas: 1 + maxReplicas: 5 + groupName: small-group + rayStartParams: {} + template: + spec: + containers: + - name: ray-worker + image: rayproject/ray:2.46.0 + resources: + limits: + cpu: "1" + requests: + cpu: "200m" diff --git a/ray-operator/controllers/ray/rayjob_controller.go b/ray-operator/controllers/ray/rayjob_controller.go index cae16cd0bc8..91959a6197c 100644 --- a/ray-operator/controllers/ray/rayjob_controller.go +++ b/ray-operator/controllers/ray/rayjob_controller.go @@ -364,88 +364,26 @@ func (r *RayJobReconciler) Reconcile(ctx context.Context, request ctrl.Request) // TODO (kevin85421): We may not need to requeue the RayJob if it has already been suspended. 
return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, nil case rayv1.JobDeploymentStatusComplete, rayv1.JobDeploymentStatusFailed: - // If this RayJob uses an existing RayCluster (i.e., ClusterSelector is set), we should not delete the RayCluster. - ttlSeconds := rayJobInstance.Spec.TTLSecondsAfterFinished - nowTime := time.Now() - shutdownTime := rayJobInstance.Status.EndTime.Add(time.Duration(ttlSeconds) * time.Second) - logger.Info(string(rayJobInstance.Status.JobDeploymentStatus), - "ShutdownAfterJobFinishes", rayJobInstance.Spec.ShutdownAfterJobFinishes, - "ClusterSelector", rayJobInstance.Spec.ClusterSelector, - "ttlSecondsAfterFinished", ttlSeconds, - "Status.endTime", rayJobInstance.Status.EndTime, - "Now", nowTime, - "ShutdownTime", shutdownTime) - - if features.Enabled(features.RayJobDeletionPolicy) && - rayJobInstance.Spec.DeletionStrategy != nil && - len(rayJobInstance.Spec.ClusterSelector) == 0 { - - if shutdownTime.After(nowTime) { - delta := int32(time.Until(shutdownTime.Add(2 * time.Second)).Seconds()) - logger.Info("shutdownTime not reached, requeue this RayJob for n seconds", "seconds", delta) - return ctrl.Result{RequeueAfter: time.Duration(delta) * time.Second}, nil - } - - policy := rayv1.DeleteNone - if rayJobInstance.Status.JobStatus == rayv1.JobStatusSucceeded { - policy = *rayJobInstance.Spec.DeletionStrategy.OnSuccess.Policy - } else if rayJobInstance.Status.JobStatus == rayv1.JobStatusFailed { - policy = *rayJobInstance.Spec.DeletionStrategy.OnFailure.Policy - } else { - logger.Info("jobStatus not valid for deletion", "jobStatus", rayJobInstance.Status.JobStatus) - } - - // no need to continue as the selected policy is DeleteNone - if policy == rayv1.DeleteNone { - break - } - - logger.Info("Shutdown behavior is defined by the deletion policy", "deletionPolicy", rayJobInstance.Spec.DeletionStrategy) - if shutdownTime.After(nowTime) { - delta := int32(time.Until(shutdownTime.Add(2 * time.Second)).Seconds()) - logger.Info("shutdownTime not reached, requeue this RayJob for n seconds", "seconds", delta) - return ctrl.Result{RequeueAfter: time.Duration(delta) * time.Second}, nil - } + // The RayJob has reached a terminal state. Handle the cleanup and deletion logic. + // If the RayJob uses an existing RayCluster, we must not delete it. + if len(rayJobInstance.Spec.ClusterSelector) > 0 { + logger.Info("RayJob is using an existing RayCluster via clusterSelector; skipping resource deletion.", "RayClusterSelector", rayJobInstance.Spec.ClusterSelector) + return ctrl.Result{}, nil + } - switch policy { - case rayv1.DeleteCluster: - logger.Info("Deleting RayCluster", "RayCluster", rayJobInstance.Status.RayClusterName) - _, err = r.deleteClusterResources(ctx, rayJobInstance) - case rayv1.DeleteWorkers: - logger.Info("Suspending all worker groups", "RayCluster", rayJobInstance.Status.RayClusterName) - err = r.suspendWorkerGroups(ctx, rayJobInstance) - case rayv1.DeleteSelf: - logger.Info("Deleting RayJob") - err = r.Client.Delete(ctx, rayJobInstance) - default: - } - if err != nil { - return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, err + if features.Enabled(features.RayJobDeletionPolicy) && rayJobInstance.Spec.DeletionStrategy != nil { + // The previous validation logic ensures that either DeletionRules or the legacy policies are set, but not both. 
+ if rayJobInstance.Spec.DeletionStrategy.DeletionRules != nil { + return r.handleDeletionRules(ctx, rayJobInstance) } + return r.handleLegacyDeletionPolicy(ctx, rayJobInstance) } - if (!features.Enabled(features.RayJobDeletionPolicy) || rayJobInstance.Spec.DeletionStrategy == nil) && rayJobInstance.Spec.ShutdownAfterJobFinishes && len(rayJobInstance.Spec.ClusterSelector) == 0 { - logger.Info("Shutdown behavior is defined by the `ShutdownAfterJobFinishes` flag", "shutdownAfterJobFinishes", rayJobInstance.Spec.ShutdownAfterJobFinishes) - if shutdownTime.After(nowTime) { - delta := int32(time.Until(shutdownTime.Add(2 * time.Second)).Seconds()) - logger.Info("shutdownTime not reached, requeue this RayJob for n seconds", "seconds", delta) - return ctrl.Result{RequeueAfter: time.Duration(delta) * time.Second}, nil - } - if s := os.Getenv(utils.DELETE_RAYJOB_CR_AFTER_JOB_FINISHES); strings.ToLower(s) == "true" { - err = r.Client.Delete(ctx, rayJobInstance) - logger.Info("RayJob is deleted") - } else { - // We only need to delete the RayCluster. We don't need to delete the submitter Kubernetes Job so that users can still access - // the driver logs. In addition, a completed Kubernetes Job does not actually use any compute resources. - _, err = r.deleteClusterResources(ctx, rayJobInstance) - logger.Info("RayCluster is deleted", "RayCluster", rayJobInstance.Status.RayClusterName) - } - if err != nil { - return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, err - } + if rayJobInstance.Spec.ShutdownAfterJobFinishes { + return r.handleShutdownAfterJobFinishes(ctx, rayJobInstance) } - // If the RayJob is completed, we should not requeue it. + // Default: No deletion policy is configured. The reconciliation is complete for this RayJob. return ctrl.Result{}, nil default: logger.Info("Unknown JobDeploymentStatus", "JobDeploymentStatus", rayJobInstance.Status.JobDeploymentStatus) @@ -1169,3 +1107,267 @@ func isSubmitterContainerFinished(pod *corev1.Pod) bool { } return false } + +// handleDeletionRules processes the DeletionRules with an impact-aware strategy. +// It categorizes rules into "overdue" and "pending". If overdue rules exist, +// it executes the most destructive one; in either case it then requeues for the +// next pending rule, if any. This function performs at most one deletion action per reconciliation. +func (r *RayJobReconciler) handleDeletionRules(ctx context.Context, rayJob *rayv1.RayJob) (ctrl.Result, error) { + logger := ctrl.LoggerFrom(ctx).WithValues("deletionMechanism", "DeletionRules") + nowTime := time.Now() + + var overdueRules []rayv1.DeletionRule + var nextRequeueTime *time.Time + + // Categorize all applicable and incomplete rules into "overdue" or "pending". + for _, rule := range rayJob.Spec.DeletionStrategy.DeletionRules { + // Skip rules that don't match the current job status. + if rule.Condition.JobStatus != rayJob.Status.JobStatus { + continue + } + + deletionTime := rayJob.Status.EndTime.Add(time.Duration(rule.Condition.TTLSeconds) * time.Second) + // Track the earliest requeue time to re-check later. + if nowTime.Before(deletionTime) { + if nextRequeueTime == nil || deletionTime.Before(*nextRequeueTime) { + nextRequeueTime = &deletionTime + } + continue + } + + // Need to check if the deletion action has already been completed to ensure idempotency. 
+ isCompleted, err := r.isDeletionActionCompleted(ctx, rayJob, rule.Policy) + if err != nil { + logger.Error(err, "Failed to check if deletion action is completed", "rule", rule) + return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, err + } + if isCompleted { + logger.Info("Skipping completed deletion rule", "rule", rule) + continue + } + + overdueRules = append(overdueRules, rule) + } + + // Handle overdue rules if any exist. + if len(overdueRules) > 0 { + ruleToExecute := selectMostImpactfulRule(overdueRules) + logger.Info("Executing the most impactful overdue deletion rule", "rule", ruleToExecute, "overdueRulesCount", len(overdueRules)) + if _, err := r.executeDeletionPolicy(ctx, rayJob, ruleToExecute.Policy); err != nil { + // If execution fails, return immediately for a retry. + return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, err + } + } + + if nextRequeueTime != nil { + requeueAfter := requeueDelayFor(*nextRequeueTime) + logger.Info("Requeuing for the next scheduled rule", "requeueAfter", requeueAfter) + return ctrl.Result{RequeueAfter: requeueAfter}, nil + } + + logger.Info("All applicable deletion rules have been processed.") + return ctrl.Result{}, nil +} + +// handleLegacyDeletionPolicy handles the deprecated onSuccess and onFailure policies. +func (r *RayJobReconciler) handleLegacyDeletionPolicy(ctx context.Context, rayJob *rayv1.RayJob) (ctrl.Result, error) { + logger := ctrl.LoggerFrom(ctx).WithValues("deletionMechanism", "LegacyOnSuccessFailure") + + var policy rayv1.DeletionPolicyType + switch rayJob.Status.JobStatus { + case rayv1.JobStatusSucceeded: + policy = *rayJob.Spec.DeletionStrategy.OnSuccess.Policy + case rayv1.JobStatusFailed: + policy = *rayJob.Spec.DeletionStrategy.OnFailure.Policy + default: + logger.Info("JobStatus is not valid for deletion, no policy applied", "jobStatus", rayJob.Status.JobStatus) + return ctrl.Result{}, nil + } + + // If the policy is DeleteNone, we are done. + if policy == rayv1.DeleteNone { + logger.Info("Deletion policy is DeleteNone; no action taken.") + return ctrl.Result{}, nil + } + + // These legacy policies use the top-level TTLSecondsAfterFinished. + nowTime := time.Now() + ttlSeconds := rayJob.Spec.TTLSecondsAfterFinished + shutdownTime := rayJob.Status.EndTime.Add(time.Duration(ttlSeconds) * time.Second) + logger.Info("Evaluating legacy deletion policy (onSuccess/onFailure)", + "JobDeploymentStatus", rayJob.Status.JobDeploymentStatus, + "policy", policy, + "JobStatus", rayJob.Status.JobStatus, + "ttlSecondsAfterFinished", ttlSeconds, + "Status.endTime", rayJob.Status.EndTime, + "Now", nowTime, + "ShutdownTime", shutdownTime) + + if shutdownTime.After(nowTime) { + requeueAfter := requeueDelayFor(shutdownTime) + logger.Info("TTL has not been met for legacy policy. Requeuing.", "shutdownTime", shutdownTime, "requeueAfter", requeueAfter) + return ctrl.Result{RequeueAfter: requeueAfter}, nil + } + + logger.Info("Executing legacy deletion policy.", "policy", policy) + return r.executeDeletionPolicy(ctx, rayJob, policy) +} + +// handleShutdownAfterJobFinishes handles the oldest deletion mechanism, the ShutdownAfterJobFinishes boolean flag. 
+func (r *RayJobReconciler) handleShutdownAfterJobFinishes(ctx context.Context, rayJob *rayv1.RayJob) (ctrl.Result, error) { + logger := ctrl.LoggerFrom(ctx).WithValues("deletionMechanism", "ShutdownAfterJobFinishes") + + nowTime := time.Now() + ttlSeconds := rayJob.Spec.TTLSecondsAfterFinished + shutdownTime := rayJob.Status.EndTime.Add(time.Duration(ttlSeconds) * time.Second) + logger.Info("Evaluating job deletion policy based on ShutdownAfterJobFinishes", + "JobDeploymentStatus", rayJob.Status.JobDeploymentStatus, + "ShutdownAfterJobFinishes", rayJob.Spec.ShutdownAfterJobFinishes, + "ClusterSelector", rayJob.Spec.ClusterSelector, + "ttlSecondsAfterFinished", ttlSeconds, + "Status.endTime", rayJob.Status.EndTime, + "Now", nowTime, + "ShutdownTime", shutdownTime) + + if shutdownTime.After(nowTime) { + requeueAfter := requeueDelayFor(shutdownTime) + logger.Info("TTL has not been met for ShutdownAfterJobFinishes. Requeuing.", "shutdownTime", shutdownTime, "requeueAfter", requeueAfter) + return ctrl.Result{RequeueAfter: requeueAfter}, nil + } + + var err error + if s := os.Getenv(utils.DELETE_RAYJOB_CR_AFTER_JOB_FINISHES); strings.ToLower(s) == "true" { + err = r.Client.Delete(ctx, rayJob) + if err == nil { + logger.Info("RayJob is deleted", "RayJob", rayJob.Name) + } + } else { + // We only need to delete the RayCluster. We don't need to delete the submitter Kubernetes Job so that users can still access + // the driver logs. In addition, a completed Kubernetes Job does not actually use any compute resources. + _, err = r.deleteClusterResources(ctx, rayJob) + if err == nil { + logger.Info("RayCluster is deleted", "RayCluster", rayJob.Status.RayClusterName) + } + } + + if err != nil { + return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, err + } + + return ctrl.Result{}, nil +} + +// executeDeletionPolicy performs the actual resource deletion based on the policy type. +// This function centralizes the deletion logic to avoid code duplication. +func (r *RayJobReconciler) executeDeletionPolicy(ctx context.Context, rayJob *rayv1.RayJob, policy rayv1.DeletionPolicyType) (ctrl.Result, error) { + logger := ctrl.LoggerFrom(ctx) + var err error + + switch policy { + case rayv1.DeleteCluster: + logger.Info("Executing deletion policy: DeleteCluster", "RayCluster", rayJob.Status.RayClusterName) + _, err = r.deleteClusterResources(ctx, rayJob) + case rayv1.DeleteWorkers: + logger.Info("Executing deletion policy: DeleteWorkers", "RayCluster", rayJob.Status.RayClusterName) + err = r.suspendWorkerGroups(ctx, rayJob) + case rayv1.DeleteSelf: + logger.Info("Executing deletion policy: DeleteSelf", "RayJob", rayJob.Name) + err = r.Client.Delete(ctx, rayJob) + case rayv1.DeleteNone: + // This should be handled by the callers, but we include it for safety. + logger.Info("Executing deletion policy: DeleteNone. No action taken.") + default: + // This case should not be reached if validation is working correctly. + logger.Error(fmt.Errorf("unknown deletion policy: %s", policy), "Unknown deletion policy encountered") + } + + if err != nil { + return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, err + } + return ctrl.Result{}, nil +} + +// isDeletionActionCompleted checks if the state corresponding to a deletion policy is already achieved. +// This is crucial for making the reconciliation loop idempotent by checking the actual cluster state. 
+func (r *RayJobReconciler) isDeletionActionCompleted(ctx context.Context, rayJob *rayv1.RayJob, policy rayv1.DeletionPolicyType) (bool, error) { + clusterIdentifier := common.RayJobRayClusterNamespacedName(rayJob) + cluster := &rayv1.RayCluster{} + + switch policy { + case rayv1.DeleteWorkers: + if err := r.Get(ctx, clusterIdentifier, cluster); err != nil { + if errors.IsNotFound(err) { + // If the cluster is gone, the workers are definitely gone. + return true, nil + } + // For any other error, we can't be sure of the state, so report the error. + return false, err + } + + if !cluster.DeletionTimestamp.IsZero() { + // If the cluster is being deleted, we consider the action complete. + return true, nil + } + + // If the cluster exists, check if all worker groups are suspended. + for _, wg := range cluster.Spec.WorkerGroupSpecs { + if wg.Suspend == nil || !*wg.Suspend { + // Found an active worker group, so the action is not complete. + return false, nil + } + } + + return true, nil + + case rayv1.DeleteCluster: + if err := r.Get(ctx, clusterIdentifier, cluster); err != nil { + if errors.IsNotFound(err) { + return true, nil + } + // For any other error, we can't be sure of the state, so report the error. + return false, err + } + + if !cluster.DeletionTimestamp.IsZero() { + // If the cluster is being deleted, we consider the action complete. + return true, nil + } + + return false, nil + + case rayv1.DeleteSelf: + // This action is terminal. If this function is running, the RayJob still exists, + // so the action cannot be considered complete. + return false, nil + + case rayv1.DeleteNone: + // "DeleteNone" is a no-op and is always considered complete. + return true, nil + } + + return false, fmt.Errorf("unknown deletion policy for completion check: %s", policy) +} + +// selectMostImpactfulRule finds the rule with the most destructive policy from a given list. +func selectMostImpactfulRule(rules []rayv1.DeletionRule) rayv1.DeletionRule { + order := map[rayv1.DeletionPolicyType]int{ + rayv1.DeleteSelf: 4, + rayv1.DeleteCluster: 3, + rayv1.DeleteWorkers: 2, + rayv1.DeleteNone: 1, + } + + mostImpactfulRule := rules[0] + for _, rule := range rules[1:] { + if order[rule.Policy] > order[mostImpactfulRule.Policy] { + mostImpactfulRule = rule + } + } + return mostImpactfulRule +} + +// requeueDelayFor computes the duration for the next requeue, ensuring a minimum buffer. 
+func requeueDelayFor(t time.Time) time.Duration { + return time.Until(t) + 2*time.Second +} diff --git a/ray-operator/controllers/ray/rayjob_controller_test.go b/ray-operator/controllers/ray/rayjob_controller_test.go index 4d7c8fea810..75cb9edb75a 100644 --- a/ray-operator/controllers/ray/rayjob_controller_test.go +++ b/ray-operator/controllers/ray/rayjob_controller_test.go @@ -896,10 +896,10 @@ var _ = Context("RayJob with different submission modes", func() { onSuccessPolicy := rayv1.DeleteCluster onFailurePolicy := rayv1.DeleteNone deletionStrategy := &rayv1.DeletionStrategy{ - OnSuccess: rayv1.DeletionPolicy{ + OnSuccess: &rayv1.DeletionPolicy{ Policy: &onSuccessPolicy, }, - OnFailure: rayv1.DeletionPolicy{ + OnFailure: &rayv1.DeletionPolicy{ Policy: &onFailurePolicy, }, } @@ -909,10 +909,10 @@ var _ = Context("RayJob with different submission modes", func() { By("Verify RayJob spec", func() { Expect(*rayJob.Spec.DeletionStrategy).To(Equal(rayv1.DeletionStrategy{ - OnSuccess: rayv1.DeletionPolicy{ + OnSuccess: &rayv1.DeletionPolicy{ Policy: &onSuccessPolicy, }, - OnFailure: rayv1.DeletionPolicy{ + OnFailure: &rayv1.DeletionPolicy{ Policy: &onFailurePolicy, }, })) @@ -1035,10 +1035,10 @@ var _ = Context("RayJob with different submission modes", func() { onSuccessPolicy := rayv1.DeleteNone onFailurePolicy := rayv1.DeleteCluster deletionStrategy := &rayv1.DeletionStrategy{ - OnSuccess: rayv1.DeletionPolicy{ + OnSuccess: &rayv1.DeletionPolicy{ Policy: &onSuccessPolicy, }, - OnFailure: rayv1.DeletionPolicy{ + OnFailure: &rayv1.DeletionPolicy{ Policy: &onFailurePolicy, }, } @@ -1048,10 +1048,10 @@ var _ = Context("RayJob with different submission modes", func() { By("Verify RayJob spec", func() { Expect(*rayJob.Spec.DeletionStrategy).To(Equal(rayv1.DeletionStrategy{ - OnSuccess: rayv1.DeletionPolicy{ + OnSuccess: &rayv1.DeletionPolicy{ Policy: &onSuccessPolicy, }, - OnFailure: rayv1.DeletionPolicy{ + OnFailure: &rayv1.DeletionPolicy{ Policy: &onFailurePolicy, }, })) @@ -1174,10 +1174,10 @@ var _ = Context("RayJob with different submission modes", func() { onSuccessPolicy := rayv1.DeleteWorkers onFailurePolicy := rayv1.DeleteNone deletionStrategy := rayv1.DeletionStrategy{ - OnSuccess: rayv1.DeletionPolicy{ + OnSuccess: &rayv1.DeletionPolicy{ Policy: &onSuccessPolicy, }, - OnFailure: rayv1.DeletionPolicy{ + OnFailure: &rayv1.DeletionPolicy{ Policy: &onFailurePolicy, }, } @@ -1187,10 +1187,10 @@ var _ = Context("RayJob with different submission modes", func() { By("Verify RayJob spec", func() { Expect(*rayJob.Spec.DeletionStrategy).To(Equal(rayv1.DeletionStrategy{ - OnSuccess: rayv1.DeletionPolicy{ + OnSuccess: &rayv1.DeletionPolicy{ Policy: &onSuccessPolicy, }, - OnFailure: rayv1.DeletionPolicy{ + OnFailure: &rayv1.DeletionPolicy{ Policy: &onFailurePolicy, }, })) @@ -1330,10 +1330,10 @@ var _ = Context("RayJob with different submission modes", func() { onSuccessPolicy := rayv1.DeleteWorkers onFailurePolicy := rayv1.DeleteWorkers deletionStrategy := rayv1.DeletionStrategy{ - OnSuccess: rayv1.DeletionPolicy{ + OnSuccess: &rayv1.DeletionPolicy{ Policy: &onSuccessPolicy, }, - OnFailure: rayv1.DeletionPolicy{ + OnFailure: &rayv1.DeletionPolicy{ Policy: &onFailurePolicy, }, } @@ -1343,10 +1343,10 @@ var _ = Context("RayJob with different submission modes", func() { By("Verify RayJob spec", func() { Expect(*rayJob.Spec.DeletionStrategy).To(Equal(rayv1.DeletionStrategy{ - OnSuccess: rayv1.DeletionPolicy{ + OnSuccess: &rayv1.DeletionPolicy{ Policy: &onSuccessPolicy, }, - OnFailure: 
rayv1.DeletionPolicy{ + OnFailure: &rayv1.DeletionPolicy{ Policy: &onFailurePolicy, }, })) @@ -1486,10 +1486,10 @@ var _ = Context("RayJob with different submission modes", func() { onSuccessPolicy := rayv1.DeleteSelf onFailurePolicy := rayv1.DeleteNone deletionStrategy := rayv1.DeletionStrategy{ - OnSuccess: rayv1.DeletionPolicy{ + OnSuccess: &rayv1.DeletionPolicy{ Policy: &onSuccessPolicy, }, - OnFailure: rayv1.DeletionPolicy{ + OnFailure: &rayv1.DeletionPolicy{ Policy: &onFailurePolicy, }, } @@ -1602,10 +1602,10 @@ var _ = Context("RayJob with different submission modes", func() { onSuccessPolicy := rayv1.DeleteNone onFailurePolicy := rayv1.DeleteSelf deletionStrategy := rayv1.DeletionStrategy{ - OnSuccess: rayv1.DeletionPolicy{ + OnSuccess: &rayv1.DeletionPolicy{ Policy: &onSuccessPolicy, }, - OnFailure: rayv1.DeletionPolicy{ + OnFailure: &rayv1.DeletionPolicy{ Policy: &onFailurePolicy, }, } @@ -1718,10 +1718,10 @@ var _ = Context("RayJob with different submission modes", func() { onSuccessPolicy := rayv1.DeleteNone onFailurePolicy := rayv1.DeleteNone deletionStrategy := rayv1.DeletionStrategy{ - OnSuccess: rayv1.DeletionPolicy{ + OnSuccess: &rayv1.DeletionPolicy{ Policy: &onSuccessPolicy, }, - OnFailure: rayv1.DeletionPolicy{ + OnFailure: &rayv1.DeletionPolicy{ Policy: &onFailurePolicy, }, } @@ -1731,10 +1731,10 @@ var _ = Context("RayJob with different submission modes", func() { By("Verify RayJob spec", func() { Expect(*rayJob.Spec.DeletionStrategy).To(Equal(rayv1.DeletionStrategy{ - OnSuccess: rayv1.DeletionPolicy{ + OnSuccess: &rayv1.DeletionPolicy{ Policy: &onSuccessPolicy, }, - OnFailure: rayv1.DeletionPolicy{ + OnFailure: &rayv1.DeletionPolicy{ Policy: &onFailurePolicy, }, })) @@ -1879,10 +1879,10 @@ var _ = Context("RayJob with different submission modes", func() { onSuccessPolicy := rayv1.DeleteCluster onFailurePolicy := rayv1.DeleteNone deletionStrategy := rayv1.DeletionStrategy{ - OnSuccess: rayv1.DeletionPolicy{ + OnSuccess: &rayv1.DeletionPolicy{ Policy: &onSuccessPolicy, }, - OnFailure: rayv1.DeletionPolicy{ + OnFailure: &rayv1.DeletionPolicy{ Policy: &onFailurePolicy, }, } @@ -1892,10 +1892,10 @@ var _ = Context("RayJob with different submission modes", func() { By("Verify RayJob spec", func() { Expect(*rayJob.Spec.DeletionStrategy).To(Equal(rayv1.DeletionStrategy{ - OnSuccess: rayv1.DeletionPolicy{ + OnSuccess: &rayv1.DeletionPolicy{ Policy: &onSuccessPolicy, }, - OnFailure: rayv1.DeletionPolicy{ + OnFailure: &rayv1.DeletionPolicy{ Policy: &onFailurePolicy, }, })) @@ -2031,5 +2031,1887 @@ var _ = Context("RayJob with different submission modes", func() { time.Second*3, time.Millisecond*500).Should(Succeed()) }) }) + + It("Should delete workers on success when a single 'DeleteWorkers' rule is set", func() { + ctx := context.Background() + namespace := "default" + rayJob := rayJobTemplate("rayjob-test-rule-deleteworkers-on-success", namespace) + rayCluster := &rayv1.RayCluster{} + + rayJob.Spec.DeletionStrategy = &rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteWorkers, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + }, + }, + }, + } + rayJob.Spec.ShutdownAfterJobFinishes = false + + By("Verify RayJob spec", func() { + Expect(*rayJob.Spec.DeletionStrategy).To(Equal(rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteWorkers, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + }, + }, + }, + })) + }) + + By("Create a RayJob custom 
resource", func() { + err := k8sClient.Create(ctx, rayJob) + Expect(err).NotTo(HaveOccurred(), "Failed to create RayJob") + Eventually( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Name, Namespace: namespace}, rayJob), + time.Second*3, time.Millisecond*500).Should(Succeed(), "Should be able to see RayJob: %v", rayJob.Name) + }) + + By("RayJobs's JobDeploymentStatus transitions from New to Initializing.", func() { + Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusInitializing), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // In Initializing state, Status.RayClusterName, Status.JobId, and Status.StartTime must be set. + Expect(rayJob.Status.RayClusterName).NotTo(BeEmpty()) + Expect(rayJob.Status.JobId).NotTo(BeEmpty()) + Expect(rayJob.Status.StartTime).NotTo(BeNil()) + }) + + By("In Initializing state, the RayCluster should eventually be created.", func() { + Eventually( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Status.RayClusterName, Namespace: namespace}, rayCluster), + time.Second*3, time.Millisecond*500).Should(Succeed(), "RayCluster %v not found", rayJob.Status.RayClusterName) + + // Check whether RayCluster is consistent with RayJob's RayClusterSpec. + Expect(rayCluster.Spec.WorkerGroupSpecs[0].Replicas).To(Equal(rayJob.Spec.RayClusterSpec.WorkerGroupSpecs[0].Replicas)) + Expect(rayCluster.Spec.RayVersion).To(Equal(rayJob.Spec.RayClusterSpec.RayVersion)) + + // TODO (kevin85421): Check the RayCluster labels. + Expect(rayCluster.Labels).Should(HaveKeyWithValue(utils.RayOriginatedFromCRNameLabelKey, rayJob.Name)) + Expect(rayCluster.Labels).Should(HaveKeyWithValue(utils.RayOriginatedFromCRDLabelKey, utils.RayOriginatedFromCRDLabelValue(utils.RayJobCRD))) + + Expect(rayCluster.Annotations).Should(Equal(rayJob.Annotations)) + }) + + By("Make RayCluster.Status.State to be rayv1.Ready", func() { + // The RayCluster is not 'Ready' yet because Pods are not running and ready. + Expect(rayCluster.Status.State).NotTo(Equal(rayv1.Ready)) + + updateHeadPodToRunningAndReady(ctx, rayJob.Status.RayClusterName, namespace) + updateWorkerPodsToRunningAndReady(ctx, rayJob.Status.RayClusterName, namespace) + + // The RayCluster.Status.State should be Ready. + Eventually( + getClusterState(ctx, namespace, rayCluster.Name), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.Ready)) + }) + + By("RayJobs's JobDeploymentStatus transitions from Initializing to Running.", func() { + Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusRunning), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // In Running state, the RayJob's Status.DashboardURL must be set. + Expect(rayJob.Status.DashboardURL).NotTo(BeEmpty()) + + // In Running state, the submitter Kubernetes Job must be created if this RayJob is in K8sJobMode. + namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + err := k8sClient.Get(ctx, namespacedName, job) + Expect(err).NotTo(HaveOccurred(), "failed to get Kubernetes Job") + }) + + By("RayJobs's JobDeploymentStatus transitions from Running to Complete.", func() { + // Update fake dashboard client to return job info with "Succeeded" status. 
+ getJobInfo := func(context.Context, string) (*utiltypes.RayJobInfo, error) { //nolint:unparam // This is a mock function so parameters are required + return &utiltypes.RayJobInfo{JobStatus: rayv1.JobStatusSucceeded, EndTime: uint64(time.Now().UnixMilli())}, nil + } + fakeRayDashboardClient.GetJobInfoMock.Store(&getJobInfo) + defer fakeRayDashboardClient.GetJobInfoMock.Store(nil) + + // RayJob transitions to Complete if and only if the corresponding submitter Kubernetes Job is Complete or Failed. + Consistently( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusRunning), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // Update the submitter Kubernetes Job to Complete. + namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + err := k8sClient.Get(ctx, namespacedName, job) + Expect(err).NotTo(HaveOccurred(), "failed to get Kubernetes Job") + + // Update the submitter Kubernetes Job to Complete. + conditions := []batchv1.JobCondition{ + {Type: batchv1.JobComplete, Status: corev1.ConditionTrue}, + } + job.Status.Conditions = conditions + Expect(k8sClient.Status().Update(ctx, job)).Should(Succeed()) + + // RayJob transitions to Complete. + Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*5, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusComplete), "jobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + }) + + By("If DeletionStrategy=DeleteWorkers, all workers should be deleted, but not the Head pod and submitter Job", func() { + // RayCluster exists + Consistently( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Status.RayClusterName, Namespace: namespace}, rayCluster), + time.Second*3, time.Millisecond*500).Should(Succeed(), "RayCluster %v not found", rayJob.Status.RayClusterName) + + // Check worker group is suspended + Expect(*rayCluster.Spec.WorkerGroupSpecs[0].Suspend).To(BeTrue()) + + // 0 worker Pods exist + workerPods := corev1.PodList{} + workerLabels := common.RayClusterWorkerPodsAssociationOptions(rayCluster).ToListOptions() + Eventually( + listResourceFunc(ctx, &workerPods, workerLabels...), + time.Second*3, time.Millisecond*500).Should(Equal(0), "expected 0 workers") + + // Head Pod is still running + headPods := corev1.PodList{} + headLabels := common.RayClusterHeadPodsAssociationOptions(rayCluster).ToListOptions() + Consistently( + listResourceFunc(ctx, &headPods, headLabels...), + time.Second*3, time.Millisecond*500).Should(Equal(1), "Head pod list should have only 1 Pod = %v", headPods.Items) + + namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + Consistently( + getResourceFunc(ctx, namespacedName, job), + time.Second*3, time.Millisecond*500).Should(Succeed()) + }) + }) + + It("Should delete workers on failure when a single 'DeleteWorkers' rule is set", func() { + ctx := context.Background() + namespace := "default" + rayJob := rayJobTemplate("rayjob-test-rule-deleteworkers-on-failure", namespace) + rayCluster := &rayv1.RayCluster{} + + rayJob.Spec.DeletionStrategy = &rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteWorkers, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusFailed, + }, + }, + }, + } + rayJob.Spec.ShutdownAfterJobFinishes = false + + By("Verify RayJob spec", func() { + Expect(*rayJob.Spec.DeletionStrategy).To(Equal(rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: 
rayv1.DeleteWorkers, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusFailed, + }, + }, + }, + })) + }) + + By("Create a RayJob custom resource", func() { + err := k8sClient.Create(ctx, rayJob) + Expect(err).NotTo(HaveOccurred(), "Failed to create RayJob") + Eventually( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Name, Namespace: namespace}, rayJob), + time.Second*3, time.Millisecond*500).Should(Succeed(), "Should be able to see RayJob: %v", rayJob.Name) + }) + + By("RayJobs's JobDeploymentStatus transitions from New to Initializing.", func() { + Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusInitializing), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // In Initializing state, Status.RayClusterName, Status.JobId, and Status.StartTime must be set. + Expect(rayJob.Status.RayClusterName).NotTo(BeEmpty()) + Expect(rayJob.Status.JobId).NotTo(BeEmpty()) + Expect(rayJob.Status.StartTime).NotTo(BeNil()) + }) + + By("In Initializing state, the RayCluster should eventually be created.", func() { + Eventually( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Status.RayClusterName, Namespace: namespace}, rayCluster), + time.Second*3, time.Millisecond*500).Should(Succeed(), "RayCluster %v not found", rayJob.Status.RayClusterName) + + // Check whether RayCluster is consistent with RayJob's RayClusterSpec. + Expect(rayCluster.Spec.WorkerGroupSpecs[0].Replicas).To(Equal(rayJob.Spec.RayClusterSpec.WorkerGroupSpecs[0].Replicas)) + Expect(rayCluster.Spec.RayVersion).To(Equal(rayJob.Spec.RayClusterSpec.RayVersion)) + + // TODO (kevin85421): Check the RayCluster labels. + Expect(rayCluster.Labels).Should(HaveKeyWithValue(utils.RayOriginatedFromCRNameLabelKey, rayJob.Name)) + Expect(rayCluster.Labels).Should(HaveKeyWithValue(utils.RayOriginatedFromCRDLabelKey, utils.RayOriginatedFromCRDLabelValue(utils.RayJobCRD))) + + Expect(rayCluster.Annotations).Should(Equal(rayJob.Annotations)) + }) + + By("Make RayCluster.Status.State to be rayv1.Ready", func() { + // The RayCluster is not 'Ready' yet because Pods are not running and ready. + Expect(rayCluster.Status.State).NotTo(Equal(rayv1.Ready)) + + updateHeadPodToRunningAndReady(ctx, rayJob.Status.RayClusterName, namespace) + updateWorkerPodsToRunningAndReady(ctx, rayJob.Status.RayClusterName, namespace) + + // The RayCluster.Status.State should be Ready. + Eventually( + getClusterState(ctx, namespace, rayCluster.Name), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.Ready)) + }) + + By("RayJobs's JobDeploymentStatus transitions from Initializing to Running.", func() { + Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusRunning), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // In Running state, the RayJob's Status.DashboardURL must be set. + Expect(rayJob.Status.DashboardURL).NotTo(BeEmpty()) + + // In Running state, the submitter Kubernetes Job must be created if this RayJob is in K8sJobMode. + namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + err := k8sClient.Get(ctx, namespacedName, job) + Expect(err).NotTo(HaveOccurred(), "failed to get Kubernetes Job") + }) + + By("RayJobs's JobDeploymentStatus transitions from Running to Complete.", func() { + // Update fake dashboard client to return job info with "Failed" status. 
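+			// The failure-path cases mirror the success-path ones: the fake dashboard client reports
+			// JobStatusFailed, the submitter Kubernetes Job is still marked Complete (the submitter
+			// presumably exits cleanly after reporting the Ray job's terminal state), and the RayJob
+			// is expected to reach JobDeploymentStatusFailed before the DeletionRule under test takes effect.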
+ getJobInfo := func(context.Context, string) (*utiltypes.RayJobInfo, error) { //nolint:unparam // This is a mock function so parameters are required + return &utiltypes.RayJobInfo{JobStatus: rayv1.JobStatusFailed, EndTime: uint64(time.Now().UnixMilli())}, nil + } + fakeRayDashboardClient.GetJobInfoMock.Store(&getJobInfo) + defer fakeRayDashboardClient.GetJobInfoMock.Store(nil) + + // RayJob transitions to Complete if and only if the corresponding submitter Kubernetes Job is Complete or Failed. + Consistently( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusRunning), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // Update the submitter Kubernetes Job to Complete. + namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + err := k8sClient.Get(ctx, namespacedName, job) + Expect(err).NotTo(HaveOccurred(), "failed to get Kubernetes Job") + + // Update the submitter Kubernetes Job to Complete. + conditions := []batchv1.JobCondition{ + {Type: batchv1.JobComplete, Status: corev1.ConditionTrue}, + } + job.Status.Conditions = conditions + Expect(k8sClient.Status().Update(ctx, job)).Should(Succeed()) + + // RayJob transitions to Failed. + Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*5, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusFailed), "jobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + }) + + By("If DeletionStrategy=DeleteWorkers, all workers should be deleted, but not the Head pod and submitter Job", func() { + // RayCluster exists + Consistently( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Status.RayClusterName, Namespace: namespace}, rayCluster), + time.Second*3, time.Millisecond*500).Should(Succeed(), "RayCluster %v not found", rayJob.Status.RayClusterName) + + // Check worker group is suspended + Expect(*rayCluster.Spec.WorkerGroupSpecs[0].Suspend).To(BeTrue()) + + // 0 worker Pods exist + workerPods := corev1.PodList{} + workerLabels := common.RayClusterWorkerPodsAssociationOptions(rayCluster).ToListOptions() + Eventually( + listResourceFunc(ctx, &workerPods, workerLabels...), + time.Second*3, time.Millisecond*500).Should(Equal(0), "expected 0 workers") + + // Head Pod is still running + headPods := corev1.PodList{} + headLabels := common.RayClusterHeadPodsAssociationOptions(rayCluster).ToListOptions() + Consistently( + listResourceFunc(ctx, &headPods, headLabels...), + time.Second*3, time.Millisecond*500).Should(Equal(1), "Head pod list should have only 1 Pod = %v", headPods.Items) + + namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + Consistently( + getResourceFunc(ctx, namespacedName, job), + time.Second*3, time.Millisecond*500).Should(Succeed()) + }) + }) + + It("Should delete cluster on success when a single 'DeleteCluster' rule is set", func() { + ctx := context.Background() + namespace := "default" + rayJob := rayJobTemplate("rayjob-test-rule-deletecluster-on-success", namespace) + rayCluster := &rayv1.RayCluster{} + + rayJob.Spec.DeletionStrategy = &rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteCluster, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + }, + }, + }, + } + rayJob.Spec.ShutdownAfterJobFinishes = false + + By("Verify RayJob spec", func() { + Expect(*rayJob.Spec.DeletionStrategy).To(Equal(rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteCluster, + 
Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + }, + }, + }, + })) + }) + + By("Create a RayJob custom resource", func() { + err := k8sClient.Create(ctx, rayJob) + Expect(err).NotTo(HaveOccurred(), "Failed to create RayJob") + Eventually( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Name, Namespace: namespace}, rayJob), + time.Second*3, time.Millisecond*500).Should(Succeed(), "Should be able to see RayJob: %v", rayJob.Name) + }) + + By("RayJobs's JobDeploymentStatus transitions from New to Initializing.", func() { + Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusInitializing), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // In Initializing state, Status.RayClusterName, Status.JobId, and Status.StartTime must be set. + Expect(rayJob.Status.RayClusterName).NotTo(BeEmpty()) + Expect(rayJob.Status.JobId).NotTo(BeEmpty()) + Expect(rayJob.Status.StartTime).NotTo(BeNil()) + }) + + By("In Initializing state, the RayCluster should eventually be created.", func() { + Eventually( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Status.RayClusterName, Namespace: namespace}, rayCluster), + time.Second*3, time.Millisecond*500).Should(Succeed(), "RayCluster %v not found", rayJob.Status.RayClusterName) + + // Check whether RayCluster is consistent with RayJob's RayClusterSpec. + Expect(rayCluster.Spec.WorkerGroupSpecs[0].Replicas).To(Equal(rayJob.Spec.RayClusterSpec.WorkerGroupSpecs[0].Replicas)) + Expect(rayCluster.Spec.RayVersion).To(Equal(rayJob.Spec.RayClusterSpec.RayVersion)) + + // TODO (kevin85421): Check the RayCluster labels. + Expect(rayCluster.Labels).Should(HaveKeyWithValue(utils.RayOriginatedFromCRNameLabelKey, rayJob.Name)) + Expect(rayCluster.Labels).Should(HaveKeyWithValue(utils.RayOriginatedFromCRDLabelKey, utils.RayOriginatedFromCRDLabelValue(utils.RayJobCRD))) + + Expect(rayCluster.Annotations).Should(Equal(rayJob.Annotations)) + }) + + By("Make RayCluster.Status.State to be rayv1.Ready", func() { + // The RayCluster is not 'Ready' yet because Pods are not running and ready. + Expect(rayCluster.Status.State).NotTo(Equal(rayv1.Ready)) + + updateHeadPodToRunningAndReady(ctx, rayJob.Status.RayClusterName, namespace) + updateWorkerPodsToRunningAndReady(ctx, rayJob.Status.RayClusterName, namespace) + + // The RayCluster.Status.State should be Ready. + Eventually( + getClusterState(ctx, namespace, rayCluster.Name), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.Ready)) + }) + + By("RayJobs's JobDeploymentStatus transitions from Initializing to Running.", func() { + Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusRunning), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // In Running state, the RayJob's Status.DashboardURL must be set. + Expect(rayJob.Status.DashboardURL).NotTo(BeEmpty()) + + // In Running state, the submitter Kubernetes Job must be created if this RayJob is in K8sJobMode. + namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + err := k8sClient.Get(ctx, namespacedName, job) + Expect(err).NotTo(HaveOccurred(), "failed to get Kubernetes Job") + }) + + By("RayJobs's JobDeploymentStatus transitions from Running to Complete.", func() { + // Update fake dashboard client to return job info with "Succeeded" status. 
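+			// Unlike DeleteWorkers (which only suspends the worker group and removes worker Pods),
+			// DeleteCluster is verified at the end of this case by the RayCluster object itself
+			// disappearing (apierrors.IsNotFound) while the submitter Kubernetes Job is left in place.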
+ getJobInfo := func(context.Context, string) (*utiltypes.RayJobInfo, error) { //nolint:unparam // This is a mock function so parameters are required + return &utiltypes.RayJobInfo{JobStatus: rayv1.JobStatusSucceeded, EndTime: uint64(time.Now().UnixMilli())}, nil + } + fakeRayDashboardClient.GetJobInfoMock.Store(&getJobInfo) + defer fakeRayDashboardClient.GetJobInfoMock.Store(nil) + + // RayJob transitions to Complete if and only if the corresponding submitter Kubernetes Job is Complete or Failed. + Consistently( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusRunning), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // Update the submitter Kubernetes Job to Complete. + namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + err := k8sClient.Get(ctx, namespacedName, job) + Expect(err).NotTo(HaveOccurred(), "failed to get Kubernetes Job") + + // Update the submitter Kubernetes Job to Complete. + conditions := []batchv1.JobCondition{ + {Type: batchv1.JobComplete, Status: corev1.ConditionTrue}, + } + job.Status.Conditions = conditions + Expect(k8sClient.Status().Update(ctx, job)).Should(Succeed()) + + // RayJob transitions to Complete. + Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*5, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusComplete), "jobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + }) + + By("If DeletionStrategy=DeleteCluster, RayCluster should be deleted, but not the submitter Job.", func() { + Eventually( + func() bool { + return apierrors.IsNotFound(getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Status.RayClusterName, Namespace: namespace}, rayCluster)()) + }, + time.Second*3, time.Millisecond*500).Should(BeTrue()) + namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + Consistently( + getResourceFunc(ctx, namespacedName, job), + time.Second*3, time.Millisecond*500).Should(Succeed()) + }) + }) + + It("Should delete cluster on failure when a single 'DeleteCluster' rule is set", func() { + ctx := context.Background() + namespace := "default" + rayJob := rayJobTemplate("rayjob-test-rule-deletecluster-on-failure", namespace) + rayCluster := &rayv1.RayCluster{} + + rayJob.Spec.DeletionStrategy = &rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteCluster, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusFailed, + }, + }, + }, + } + rayJob.Spec.ShutdownAfterJobFinishes = false + + By("Verify RayJob spec", func() { + Expect(*rayJob.Spec.DeletionStrategy).To(Equal(rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteCluster, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusFailed, + }, + }, + }, + })) + }) + + By("Create a RayJob custom resource", func() { + err := k8sClient.Create(ctx, rayJob) + Expect(err).NotTo(HaveOccurred(), "Failed to create RayJob") + Eventually( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Name, Namespace: namespace}, rayJob), + time.Second*3, time.Millisecond*500).Should(Succeed(), "Should be able to see RayJob: %v", rayJob.Name) + }) + + By("RayJobs's JobDeploymentStatus transitions from New to Initializing.", func() { + Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusInitializing), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // In 
Initializing state, Status.RayClusterName, Status.JobId, and Status.StartTime must be set. + Expect(rayJob.Status.RayClusterName).NotTo(BeEmpty()) + Expect(rayJob.Status.JobId).NotTo(BeEmpty()) + Expect(rayJob.Status.StartTime).NotTo(BeNil()) + }) + + By("In Initializing state, the RayCluster should eventually be created.", func() { + Eventually( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Status.RayClusterName, Namespace: namespace}, rayCluster), + time.Second*3, time.Millisecond*500).Should(Succeed(), "RayCluster %v not found", rayJob.Status.RayClusterName) + + // Check whether RayCluster is consistent with RayJob's RayClusterSpec. + Expect(rayCluster.Spec.WorkerGroupSpecs[0].Replicas).To(Equal(rayJob.Spec.RayClusterSpec.WorkerGroupSpecs[0].Replicas)) + Expect(rayCluster.Spec.RayVersion).To(Equal(rayJob.Spec.RayClusterSpec.RayVersion)) + + // TODO (kevin85421): Check the RayCluster labels. + Expect(rayCluster.Labels).Should(HaveKeyWithValue(utils.RayOriginatedFromCRNameLabelKey, rayJob.Name)) + Expect(rayCluster.Labels).Should(HaveKeyWithValue(utils.RayOriginatedFromCRDLabelKey, utils.RayOriginatedFromCRDLabelValue(utils.RayJobCRD))) + + Expect(rayCluster.Annotations).Should(Equal(rayJob.Annotations)) + }) + + By("Make RayCluster.Status.State to be rayv1.Ready", func() { + // The RayCluster is not 'Ready' yet because Pods are not running and ready. + Expect(rayCluster.Status.State).NotTo(Equal(rayv1.Ready)) + + updateHeadPodToRunningAndReady(ctx, rayJob.Status.RayClusterName, namespace) + updateWorkerPodsToRunningAndReady(ctx, rayJob.Status.RayClusterName, namespace) + + // The RayCluster.Status.State should be Ready. + Eventually( + getClusterState(ctx, namespace, rayCluster.Name), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.Ready)) + }) + + By("RayJobs's JobDeploymentStatus transitions from Initializing to Running.", func() { + Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusRunning), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // In Running state, the RayJob's Status.DashboardURL must be set. + Expect(rayJob.Status.DashboardURL).NotTo(BeEmpty()) + + // In Running state, the submitter Kubernetes Job must be created if this RayJob is in K8sJobMode. + namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + err := k8sClient.Get(ctx, namespacedName, job) + Expect(err).NotTo(HaveOccurred(), "failed to get Kubernetes Job") + }) + + By("RayJobs's JobDeploymentStatus transitions from Running to Complete.", func() { + // Update fake dashboard client to return job info with "Failed" status. + getJobInfo := func(context.Context, string) (*utiltypes.RayJobInfo, error) { //nolint:unparam // This is a mock function so parameters are required + return &utiltypes.RayJobInfo{JobStatus: rayv1.JobStatusFailed, EndTime: uint64(time.Now().UnixMilli())}, nil + } + fakeRayDashboardClient.GetJobInfoMock.Store(&getJobInfo) + defer fakeRayDashboardClient.GetJobInfoMock.Store(nil) + + // RayJob transitions to Complete if and only if the corresponding submitter Kubernetes Job is Complete or Failed. + Consistently( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusRunning), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // Update the submitter Kubernetes Job to Complete. 
+			namespacedName := common.RayJobK8sJobNamespacedName(rayJob)
+			job := &batchv1.Job{}
+			err := k8sClient.Get(ctx, namespacedName, job)
+			Expect(err).NotTo(HaveOccurred(), "failed to get Kubernetes Job")
+
+			// Update the submitter Kubernetes Job to Complete.
+			conditions := []batchv1.JobCondition{
+				{Type: batchv1.JobComplete, Status: corev1.ConditionTrue},
+			}
+			job.Status.Conditions = conditions
+			Expect(k8sClient.Status().Update(ctx, job)).Should(Succeed())
+
+			// RayJob transitions to Failed.
+			Eventually(
+				getRayJobDeploymentStatus(ctx, rayJob),
+				time.Second*5, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusFailed), "jobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus)
+		})
+
+		By("If DeletionStrategy=DeleteCluster, RayCluster should be deleted, but not the submitter Job.", func() {
+			Eventually(
+				func() bool {
+					return apierrors.IsNotFound(getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Status.RayClusterName, Namespace: namespace}, rayCluster)())
+				},
+				time.Second*3, time.Millisecond*500).Should(BeTrue())
+			namespacedName := common.RayJobK8sJobNamespacedName(rayJob)
+			job := &batchv1.Job{}
+			Consistently(
+				getResourceFunc(ctx, namespacedName, job),
+				time.Second*3, time.Millisecond*500).Should(Succeed())
+		})
+	})
+
+	It("Should delete self on success when a single 'DeleteSelf' rule is set", func() {
+		ctx := context.Background()
+		namespace := "default"
+		rayJob := rayJobTemplate("rayjob-test-rule-deleteself-on-success", namespace)
+		rayCluster := &rayv1.RayCluster{}
+
+		rayJob.Spec.DeletionStrategy = &rayv1.DeletionStrategy{
+			DeletionRules: []rayv1.DeletionRule{
+				{
+					Policy: rayv1.DeleteSelf,
+					Condition: rayv1.DeletionCondition{
+						JobStatus: rayv1.JobStatusSucceeded,
+					},
+				},
+			},
+		}
+		rayJob.Spec.ShutdownAfterJobFinishes = false
+
+		By("Create a RayJob custom resource", func() {
+			err := k8sClient.Create(ctx, rayJob)
+			Expect(err).NotTo(HaveOccurred(), "Failed to create RayJob")
+			Eventually(
+				getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Name, Namespace: namespace}, rayJob),
+				time.Second*3, time.Millisecond*500).Should(Succeed(), "Should be able to see RayJob: %v", rayJob.Name)
+		})
+
+		By("RayJobs's JobDeploymentStatus transitions from New to Initializing.", func() {
+			Eventually(
+				getRayJobDeploymentStatus(ctx, rayJob),
+				time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusInitializing), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus)
+
+			// In Initializing state, Status.RayClusterName, Status.JobId, and Status.StartTime must be set.
+			Expect(rayJob.Status.RayClusterName).NotTo(BeEmpty())
+			Expect(rayJob.Status.JobId).NotTo(BeEmpty())
+			Expect(rayJob.Status.StartTime).NotTo(BeNil())
+		})
+
+		By("In Initializing state, the RayCluster should eventually be created.", func() {
+			Eventually(
+				getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Status.RayClusterName, Namespace: namespace}, rayCluster),
+				time.Second*3, time.Millisecond*500).Should(Succeed(), "RayCluster %v not found", rayJob.Status.RayClusterName)
+
+			// Check whether RayCluster is consistent with RayJob's RayClusterSpec.
+			Expect(rayCluster.Spec.WorkerGroupSpecs[0].Replicas).To(Equal(rayJob.Spec.RayClusterSpec.WorkerGroupSpecs[0].Replicas))
+			Expect(rayCluster.Spec.RayVersion).To(Equal(rayJob.Spec.RayClusterSpec.RayVersion))
+
+			// TODO (kevin85421): Check the RayCluster labels.
+ Expect(rayCluster.Labels).Should(HaveKeyWithValue(utils.RayOriginatedFromCRNameLabelKey, rayJob.Name)) + Expect(rayCluster.Labels).Should(HaveKeyWithValue(utils.RayOriginatedFromCRDLabelKey, utils.RayOriginatedFromCRDLabelValue(utils.RayJobCRD))) + + Expect(rayCluster.Annotations).Should(Equal(rayJob.Annotations)) + }) + + By("Make RayCluster.Status.State to be rayv1.Ready", func() { + // The RayCluster is not 'Ready' yet because Pods are not running and ready. + Expect(rayCluster.Status.State).NotTo(Equal(rayv1.Ready)) + + updateHeadPodToRunningAndReady(ctx, rayJob.Status.RayClusterName, namespace) + updateWorkerPodsToRunningAndReady(ctx, rayJob.Status.RayClusterName, namespace) + + // The RayCluster.Status.State should be Ready. + Eventually( + getClusterState(ctx, namespace, rayCluster.Name), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.Ready)) + }) + + By("RayJobs's JobDeploymentStatus transitions from Initializing to Running.", func() { + Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusRunning), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // In Running state, the RayJob's Status.DashboardURL must be set. + Expect(rayJob.Status.DashboardURL).NotTo(BeEmpty()) + + // In Running state, the submitter Kubernetes Job must be created if this RayJob is in K8sJobMode. + namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + err := k8sClient.Get(ctx, namespacedName, job) + Expect(err).NotTo(HaveOccurred(), "failed to get Kubernetes Job") + }) + + By("RayJobs's JobDeploymentStatus transitions from Running to Complete.", func() { + // Update fake dashboard client to return job info with "Succeeded" status. + getJobInfo := func(context.Context, string) (*utiltypes.RayJobInfo, error) { //nolint:unparam // This is a mock function so parameters are required + return &utiltypes.RayJobInfo{JobStatus: rayv1.JobStatusSucceeded, EndTime: uint64(time.Now().UnixMilli())}, nil + } + fakeRayDashboardClient.GetJobInfoMock.Store(&getJobInfo) + + // RayJob transitions to Complete if and only if the corresponding submitter Kubernetes Job is Complete or Failed. + Consistently( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusRunning), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // Update the submitter Kubernetes Job to Complete. + namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + err := k8sClient.Get(ctx, namespacedName, job) + Expect(err).NotTo(HaveOccurred(), "failed to get Kubernetes Job") + + // Update the submitter Kubernetes Job to Complete. 
+ conditions := []batchv1.JobCondition{ + {Type: batchv1.JobComplete, Status: corev1.ConditionTrue}, + } + job.Status.Conditions = conditions + Expect(k8sClient.Status().Update(ctx, job)).Should(Succeed()) + }) + + By("If DeletionStrategy=DeleteSelf, the RayJob is deleted", func() { + Eventually( + func() bool { + return apierrors.IsNotFound(k8sClient.Get(ctx, client.ObjectKey{Name: rayJob.Name, Namespace: namespace}, rayJob)) + }, time.Second*5, time.Millisecond*500).Should(BeTrue()) + }) + }) + + It("Should delete self on failure when a single 'DeleteSelf' rule is set", func() { + ctx := context.Background() + namespace := "default" + rayJob := rayJobTemplate("rayjob-test-rule-deleteself-on-failure", namespace) + rayCluster := &rayv1.RayCluster{} + + rayJob.Spec.DeletionStrategy = &rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteSelf, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusFailed, + }, + }, + }, + } + rayJob.Spec.ShutdownAfterJobFinishes = false + + By("Create a RayJob custom resource", func() { + err := k8sClient.Create(ctx, rayJob) + Expect(err).NotTo(HaveOccurred(), "Failed to create RayJob") + Eventually( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Name, Namespace: namespace}, rayJob), + time.Second*3, time.Millisecond*500).Should(Succeed(), "Should be able to see RayJob: %v", rayJob.Name) + }) + + By("RayJobs's JobDeploymentStatus transitions from New to Initializing.", func() { + Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusInitializing), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // In Initializing state, Status.RayClusterName, Status.JobId, and Status.StartTime must be set. + Expect(rayJob.Status.RayClusterName).NotTo(BeEmpty()) + Expect(rayJob.Status.JobId).NotTo(BeEmpty()) + Expect(rayJob.Status.StartTime).NotTo(BeNil()) + }) + + By("In Initializing state, the RayCluster should eventually be created.", func() { + Eventually( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Status.RayClusterName, Namespace: namespace}, rayCluster), + time.Second*3, time.Millisecond*500).Should(Succeed(), "RayCluster %v not found", rayJob.Status.RayClusterName) + + // Check whether RayCluster is consistent with RayJob's RayClusterSpec. + Expect(rayCluster.Spec.WorkerGroupSpecs[0].Replicas).To(Equal(rayJob.Spec.RayClusterSpec.WorkerGroupSpecs[0].Replicas)) + Expect(rayCluster.Spec.RayVersion).To(Equal(rayJob.Spec.RayClusterSpec.RayVersion)) + + // TODO (kevin85421): Check the RayCluster labels. + Expect(rayCluster.Labels).Should(HaveKeyWithValue(utils.RayOriginatedFromCRNameLabelKey, rayJob.Name)) + Expect(rayCluster.Labels).Should(HaveKeyWithValue(utils.RayOriginatedFromCRDLabelKey, utils.RayOriginatedFromCRDLabelValue(utils.RayJobCRD))) + + Expect(rayCluster.Annotations).Should(Equal(rayJob.Annotations)) + }) + + By("Make RayCluster.Status.State to be rayv1.Ready", func() { + // The RayCluster is not 'Ready' yet because Pods are not running and ready. + Expect(rayCluster.Status.State).NotTo(Equal(rayv1.Ready)) + + updateHeadPodToRunningAndReady(ctx, rayJob.Status.RayClusterName, namespace) + updateWorkerPodsToRunningAndReady(ctx, rayJob.Status.RayClusterName, namespace) + + // The RayCluster.Status.State should be Ready. 
+ Eventually( + getClusterState(ctx, namespace, rayCluster.Name), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.Ready)) + }) + + By("RayJobs's JobDeploymentStatus transitions from Initializing to Running.", func() { + Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusRunning), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // In Running state, the RayJob's Status.DashboardURL must be set. + Expect(rayJob.Status.DashboardURL).NotTo(BeEmpty()) + + // In Running state, the submitter Kubernetes Job must be created if this RayJob is in K8sJobMode. + namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + err := k8sClient.Get(ctx, namespacedName, job) + Expect(err).NotTo(HaveOccurred(), "failed to get Kubernetes Job") + }) + + By("RayJobs's JobDeploymentStatus transitions from Running to Complete.", func() { + // Update fake dashboard client to return job info with "Failed" status. + getJobInfo := func(context.Context, string) (*utiltypes.RayJobInfo, error) { //nolint:unparam // This is a mock function so parameters are required + return &utiltypes.RayJobInfo{JobStatus: rayv1.JobStatusFailed, EndTime: uint64(time.Now().UnixMilli())}, nil + } + fakeRayDashboardClient.GetJobInfoMock.Store(&getJobInfo) + + // RayJob transitions to Complete if and only if the corresponding submitter Kubernetes Job is Complete or Failed. + Consistently( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusRunning), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // Update the submitter Kubernetes Job to Complete. + namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + err := k8sClient.Get(ctx, namespacedName, job) + Expect(err).NotTo(HaveOccurred(), "failed to get Kubernetes Job") + + // Update the submitter Kubernetes Job to Complete. 
+ conditions := []batchv1.JobCondition{ + {Type: batchv1.JobComplete, Status: corev1.ConditionTrue}, + } + job.Status.Conditions = conditions + Expect(k8sClient.Status().Update(ctx, job)).Should(Succeed()) + }) + + By("If DeletionStrategy=DeleteSelf, the RayJob is deleted", func() { + Eventually( + func() bool { + return apierrors.IsNotFound(k8sClient.Get(ctx, client.ObjectKey{Name: rayJob.Name, Namespace: namespace}, rayJob)) + }, time.Second*5, time.Millisecond*500).Should(BeTrue()) + }) + }) + + It("Should delete none on success when a single 'DeleteNone' rule is set", func() { + ctx := context.Background() + namespace := "default" + rayJob := rayJobTemplate("rayjob-test-rule-deletenone-on-success", namespace) + rayCluster := &rayv1.RayCluster{} + + rayJob.Spec.DeletionStrategy = &rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteNone, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + }, + }, + }, + } + rayJob.Spec.ShutdownAfterJobFinishes = false + + By("Verify RayJob spec", func() { + Expect(*rayJob.Spec.DeletionStrategy).To(Equal(rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteNone, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + }, + }, + }, + })) + }) + + By("Create a RayJob custom resource", func() { + err := k8sClient.Create(ctx, rayJob) + Expect(err).NotTo(HaveOccurred(), "Failed to create RayJob") + Eventually( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Name, Namespace: namespace}, rayJob), + time.Second*3, time.Millisecond*500).Should(Succeed(), "Should be able to see RayJob: %v", rayJob.Name) + }) + + By("RayJobs's JobDeploymentStatus transitions from New to Initializing.", func() { + Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusInitializing), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // In Initializing state, Status.RayClusterName, Status.JobId, and Status.StartTime must be set. + Expect(rayJob.Status.RayClusterName).NotTo(BeEmpty()) + Expect(rayJob.Status.JobId).NotTo(BeEmpty()) + Expect(rayJob.Status.StartTime).NotTo(BeNil()) + }) + + By("In Initializing state, the RayCluster should eventually be created.", func() { + Eventually( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Status.RayClusterName, Namespace: namespace}, rayCluster), + time.Second*3, time.Millisecond*500).Should(Succeed(), "RayCluster %v not found", rayJob.Status.RayClusterName) + + // Check whether RayCluster is consistent with RayJob's RayClusterSpec. + Expect(rayCluster.Spec.WorkerGroupSpecs[0].Replicas).To(Equal(rayJob.Spec.RayClusterSpec.WorkerGroupSpecs[0].Replicas)) + Expect(rayCluster.Spec.RayVersion).To(Equal(rayJob.Spec.RayClusterSpec.RayVersion)) + + // TODO (kevin85421): Check the RayCluster labels. + Expect(rayCluster.Labels).Should(HaveKeyWithValue(utils.RayOriginatedFromCRNameLabelKey, rayJob.Name)) + Expect(rayCluster.Labels).Should(HaveKeyWithValue(utils.RayOriginatedFromCRDLabelKey, utils.RayOriginatedFromCRDLabelValue(utils.RayJobCRD))) + + Expect(rayCluster.Annotations).Should(Equal(rayJob.Annotations)) + }) + + By("Make RayCluster.Status.State to be rayv1.Ready", func() { + // The RayCluster is not 'Ready' yet because Pods are not running and ready. 
+ Expect(rayCluster.Status.State).NotTo(Equal(rayv1.Ready)) + + updateHeadPodToRunningAndReady(ctx, rayJob.Status.RayClusterName, namespace) + updateWorkerPodsToRunningAndReady(ctx, rayJob.Status.RayClusterName, namespace) + + // The RayCluster.Status.State should be Ready. + Eventually( + getClusterState(ctx, namespace, rayCluster.Name), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.Ready)) + }) + + By("RayJobs's JobDeploymentStatus transitions from Initializing to Running.", func() { + Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusRunning), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // In Running state, the RayJob's Status.DashboardURL must be set. + Expect(rayJob.Status.DashboardURL).NotTo(BeEmpty()) + + // In Running state, the submitter Kubernetes Job must be created if this RayJob is in K8sJobMode. + namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + err := k8sClient.Get(ctx, namespacedName, job) + Expect(err).NotTo(HaveOccurred(), "failed to get Kubernetes Job") + }) + + By("RayJobs's JobDeploymentStatus transitions from Running to Complete.", func() { + // Update fake dashboard client to return job info with "Succeeded" status. + getJobInfo := func(context.Context, string) (*utiltypes.RayJobInfo, error) { //nolint:unparam // This is a mock function so parameters are required + return &utiltypes.RayJobInfo{JobStatus: rayv1.JobStatusSucceeded, EndTime: uint64(time.Now().UnixMilli())}, nil + } + fakeRayDashboardClient.GetJobInfoMock.Store(&getJobInfo) + defer fakeRayDashboardClient.GetJobInfoMock.Store(nil) + + // RayJob transitions to Complete if and only if the corresponding submitter Kubernetes Job is Complete or Failed. + Consistently( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusRunning), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // Update the submitter Kubernetes Job to Complete. + namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + err := k8sClient.Get(ctx, namespacedName, job) + Expect(err).NotTo(HaveOccurred(), "failed to get Kubernetes Job") + + // Update the submitter Kubernetes Job to Complete. + conditions := []batchv1.JobCondition{ + {Type: batchv1.JobComplete, Status: corev1.ConditionTrue}, + } + job.Status.Conditions = conditions + Expect(k8sClient.Status().Update(ctx, job)).Should(Succeed()) + + // RayJob transitions to Complete. 
+ Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*5, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusComplete), "jobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + }) + + By("If DeletionStrategy=DeleteNone, no resources are deleted", func() { + // RayJob exists + Consistently( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Name, Namespace: namespace}, rayJob), + time.Second*3, time.Millisecond*500).Should(Succeed(), "RayJob %v not found", rayJob) + + // RayCluster exists + Consistently( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Status.RayClusterName, Namespace: namespace}, rayCluster), + time.Second*3, time.Millisecond*500).Should(Succeed(), "RayCluster %v not found", rayJob.Status.RayClusterName) + + // Worker replicas set to 3 + Expect(*rayCluster.Spec.WorkerGroupSpecs[0].Replicas).To(Equal(int32(3))) + + // 3 worker Pods exist + workerPods := corev1.PodList{} + workerLabels := common.RayClusterWorkerPodsAssociationOptions(rayCluster).ToListOptions() + Consistently( + listResourceFunc(ctx, &workerPods, workerLabels...), + time.Second*3, time.Millisecond*500).Should(Equal(3), "expected 3 workers") + + // Head Pod is still running + headPods := corev1.PodList{} + headLabels := common.RayClusterHeadPodsAssociationOptions(rayCluster).ToListOptions() + Consistently( + listResourceFunc(ctx, &headPods, headLabels...), + time.Second*3, time.Millisecond*500).Should(Equal(1), "Head pod list should have only 1 Pod = %v", headPods.Items) + + namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + Consistently( + getResourceFunc(ctx, namespacedName, job), + time.Second*3, time.Millisecond*500).Should(Succeed()) + }) + }) + + It("Should delete none on failure when a single 'DeleteNone' rule is set", func() { + ctx := context.Background() + namespace := "default" + rayJob := rayJobTemplate("rayjob-test-rule-deletenone-on-failure", namespace) + rayCluster := &rayv1.RayCluster{} + + rayJob.Spec.DeletionStrategy = &rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteNone, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusFailed, + }, + }, + }, + } + rayJob.Spec.ShutdownAfterJobFinishes = false + + By("Verify RayJob spec", func() { + Expect(*rayJob.Spec.DeletionStrategy).To(Equal(rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteNone, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusFailed, + }, + }, + }, + })) + }) + + By("Create a RayJob custom resource", func() { + err := k8sClient.Create(ctx, rayJob) + Expect(err).NotTo(HaveOccurred(), "Failed to create RayJob") + Eventually( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Name, Namespace: namespace}, rayJob), + time.Second*3, time.Millisecond*500).Should(Succeed(), "Should be able to see RayJob: %v", rayJob.Name) + }) + + By("RayJobs's JobDeploymentStatus transitions from New to Initializing.", func() { + Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusInitializing), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // In Initializing state, Status.RayClusterName, Status.JobId, and Status.StartTime must be set. 
+ Expect(rayJob.Status.RayClusterName).NotTo(BeEmpty()) + Expect(rayJob.Status.JobId).NotTo(BeEmpty()) + Expect(rayJob.Status.StartTime).NotTo(BeNil()) + }) + + By("In Initializing state, the RayCluster should eventually be created.", func() { + Eventually( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Status.RayClusterName, Namespace: namespace}, rayCluster), + time.Second*3, time.Millisecond*500).Should(Succeed(), "RayCluster %v not found", rayJob.Status.RayClusterName) + + // Check whether RayCluster is consistent with RayJob's RayClusterSpec. + Expect(rayCluster.Spec.WorkerGroupSpecs[0].Replicas).To(Equal(rayJob.Spec.RayClusterSpec.WorkerGroupSpecs[0].Replicas)) + Expect(rayCluster.Spec.RayVersion).To(Equal(rayJob.Spec.RayClusterSpec.RayVersion)) + + // TODO (kevin85421): Check the RayCluster labels. + Expect(rayCluster.Labels).Should(HaveKeyWithValue(utils.RayOriginatedFromCRNameLabelKey, rayJob.Name)) + Expect(rayCluster.Labels).Should(HaveKeyWithValue(utils.RayOriginatedFromCRDLabelKey, utils.RayOriginatedFromCRDLabelValue(utils.RayJobCRD))) + + Expect(rayCluster.Annotations).Should(Equal(rayJob.Annotations)) + }) + + By("Make RayCluster.Status.State to be rayv1.Ready", func() { + // The RayCluster is not 'Ready' yet because Pods are not running and ready. + Expect(rayCluster.Status.State).NotTo(Equal(rayv1.Ready)) + + updateHeadPodToRunningAndReady(ctx, rayJob.Status.RayClusterName, namespace) + updateWorkerPodsToRunningAndReady(ctx, rayJob.Status.RayClusterName, namespace) + + // The RayCluster.Status.State should be Ready. + Eventually( + getClusterState(ctx, namespace, rayCluster.Name), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.Ready)) + }) + + By("RayJobs's JobDeploymentStatus transitions from Initializing to Running.", func() { + Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusRunning), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // In Running state, the RayJob's Status.DashboardURL must be set. + Expect(rayJob.Status.DashboardURL).NotTo(BeEmpty()) + + // In Running state, the submitter Kubernetes Job must be created if this RayJob is in K8sJobMode. + namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + err := k8sClient.Get(ctx, namespacedName, job) + Expect(err).NotTo(HaveOccurred(), "failed to get Kubernetes Job") + }) + + By("RayJobs's JobDeploymentStatus transitions from Running to Complete.", func() { + // Update fake dashboard client to return job info with "Failed" status. + getJobInfo := func(context.Context, string) (*utiltypes.RayJobInfo, error) { //nolint:unparam // This is a mock function so parameters are required + return &utiltypes.RayJobInfo{JobStatus: rayv1.JobStatusFailed, EndTime: uint64(time.Now().UnixMilli())}, nil + } + fakeRayDashboardClient.GetJobInfoMock.Store(&getJobInfo) + defer fakeRayDashboardClient.GetJobInfoMock.Store(nil) + + // RayJob transitions to Complete if and only if the corresponding submitter Kubernetes Job is Complete or Failed. + Consistently( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusRunning), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // Update the submitter Kubernetes Job to Complete. 
+			namespacedName := common.RayJobK8sJobNamespacedName(rayJob)
+			job := &batchv1.Job{}
+			err := k8sClient.Get(ctx, namespacedName, job)
+			Expect(err).NotTo(HaveOccurred(), "failed to get Kubernetes Job")
+
+			// Update the submitter Kubernetes Job to Complete.
+			conditions := []batchv1.JobCondition{
+				{Type: batchv1.JobComplete, Status: corev1.ConditionTrue},
+			}
+			job.Status.Conditions = conditions
+			Expect(k8sClient.Status().Update(ctx, job)).Should(Succeed())
+
+			// RayJob transitions to Failed.
+			Eventually(
+				getRayJobDeploymentStatus(ctx, rayJob),
+				time.Second*5, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusFailed), "jobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus)
+		})
+
+		By("If DeletionStrategy=DeleteNone, no resources are deleted", func() {
+			// RayJob exists
+			Consistently(
+				getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Name, Namespace: namespace}, rayJob),
+				time.Second*3, time.Millisecond*500).Should(Succeed(), "RayJob %v not found", rayJob)
+
+			// RayCluster exists
+			Consistently(
+				getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Status.RayClusterName, Namespace: namespace}, rayCluster),
+				time.Second*3, time.Millisecond*500).Should(Succeed(), "RayCluster %v not found", rayJob.Status.RayClusterName)
+
+			// Worker replicas set to 3
+			Expect(*rayCluster.Spec.WorkerGroupSpecs[0].Replicas).To(Equal(int32(3)))
+
+			// 3 worker Pods exist
+			workerPods := corev1.PodList{}
+			workerLabels := common.RayClusterWorkerPodsAssociationOptions(rayCluster).ToListOptions()
+			Consistently(
+				listResourceFunc(ctx, &workerPods, workerLabels...),
+				time.Second*3, time.Millisecond*500).Should(Equal(3), "expected 3 workers")
+
+			// Head Pod is still running
+			headPods := corev1.PodList{}
+			headLabels := common.RayClusterHeadPodsAssociationOptions(rayCluster).ToListOptions()
+			Consistently(
+				listResourceFunc(ctx, &headPods, headLabels...),
+				time.Second*3, time.Millisecond*500).Should(Equal(1), "Head pod list should have only 1 Pod = %v", headPods.Items)
+
+			namespacedName := common.RayJobK8sJobNamespacedName(rayJob)
+			job := &batchv1.Job{}
+			Consistently(
+				getResourceFunc(ctx, namespacedName, job),
+				time.Second*3, time.Millisecond*500).Should(Succeed())
+		})
+	})
+
+	It("Should execute MOST impactful rule (DeleteSelf) when all rules are overdue on success", func() {
+		ctx := context.Background()
+		namespace := "default"
+		rayJob := rayJobTemplate("rayjob-test-impactful-rule-override-on-success", namespace)
+		rayCluster := &rayv1.RayCluster{}
+
+		// Define the multi-stage DeletionStrategy
+		rayJob.Spec.DeletionStrategy = &rayv1.DeletionStrategy{
+			DeletionRules: []rayv1.DeletionRule{
+				{
+					Policy: rayv1.DeleteWorkers,
+					Condition: rayv1.DeletionCondition{
+						JobStatus: rayv1.JobStatusSucceeded,
+						TTLSeconds: 0,
+					},
+				},
+				{
+					Policy: rayv1.DeleteCluster,
+					Condition: rayv1.DeletionCondition{
+						JobStatus: rayv1.JobStatusSucceeded,
+						TTLSeconds: 0,
+					},
+				},
+				{
+					Policy: rayv1.DeleteSelf,
+					Condition: rayv1.DeletionCondition{
+						JobStatus: rayv1.JobStatusSucceeded,
+						TTLSeconds: 0,
+					},
+				},
+			},
+		}
+		rayJob.Spec.ShutdownAfterJobFinishes = false
+
+		By("Verify RayJob spec", func() {
+			Expect(*rayJob.Spec.DeletionStrategy).To(Equal(rayv1.DeletionStrategy{
+				DeletionRules: []rayv1.DeletionRule{
+					{
+						Policy: rayv1.DeleteWorkers,
+						Condition: rayv1.DeletionCondition{
+							JobStatus: rayv1.JobStatusSucceeded,
+							TTLSeconds: 0,
+						},
+					},
+					{
+						Policy: rayv1.DeleteCluster,
+						Condition: rayv1.DeletionCondition{
+							JobStatus: rayv1.JobStatusSucceeded,
+							TTLSeconds: 0,
+						},
+
}, + { + Policy: rayv1.DeleteSelf, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + TTLSeconds: 0, + }, + }, + }, + })) + }) + + By("Create a RayJob custom resource with multi-stage deletion rules", func() { + err := k8sClient.Create(ctx, rayJob) + Expect(err).NotTo(HaveOccurred(), "Failed to create RayJob") + Eventually( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Name, Namespace: namespace}, rayJob), + time.Second*3, time.Millisecond*500).Should(Succeed(), "Should be able to see RayJob: %v", rayJob.Name) + }) + + By("RayJobs's JobDeploymentStatus transitions from New to Initializing.", func() { + Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusInitializing), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // In Initializing state, Status.RayClusterName, Status.JobId, and Status.StartTime must be set. + Expect(rayJob.Status.RayClusterName).NotTo(BeEmpty()) + Expect(rayJob.Status.JobId).NotTo(BeEmpty()) + Expect(rayJob.Status.StartTime).NotTo(BeNil()) + }) + + By("In Initializing state, the RayCluster should eventually be created.", func() { + Eventually( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Status.RayClusterName, Namespace: namespace}, rayCluster), + time.Second*3, time.Millisecond*500).Should(Succeed(), "RayCluster %v not found", rayJob.Status.RayClusterName) + + // Check whether RayCluster is consistent with RayJob's RayClusterSpec. + Expect(rayCluster.Spec.WorkerGroupSpecs[0].Replicas).To(Equal(rayJob.Spec.RayClusterSpec.WorkerGroupSpecs[0].Replicas)) + Expect(rayCluster.Spec.RayVersion).To(Equal(rayJob.Spec.RayClusterSpec.RayVersion)) + + // TODO (kevin85421): Check the RayCluster labels. + Expect(rayCluster.Labels).Should(HaveKeyWithValue(utils.RayOriginatedFromCRNameLabelKey, rayJob.Name)) + Expect(rayCluster.Labels).Should(HaveKeyWithValue(utils.RayOriginatedFromCRDLabelKey, utils.RayOriginatedFromCRDLabelValue(utils.RayJobCRD))) + + Expect(rayCluster.Annotations).Should(Equal(rayJob.Annotations)) + }) + + By("Make RayCluster.Status.State to be rayv1.Ready", func() { + // The RayCluster is not 'Ready' yet because Pods are not running and ready. + Expect(rayCluster.Status.State).NotTo(Equal(rayv1.Ready)) + + updateHeadPodToRunningAndReady(ctx, rayJob.Status.RayClusterName, namespace) + updateWorkerPodsToRunningAndReady(ctx, rayJob.Status.RayClusterName, namespace) + + // The RayCluster.Status.State should be Ready. + Eventually( + getClusterState(ctx, namespace, rayCluster.Name), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.Ready)) + }) + + By("RayJobs's JobDeploymentStatus transitions from Initializing to Running.", func() { + Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusRunning), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // In Running state, the RayJob's Status.DashboardURL must be set. + Expect(rayJob.Status.DashboardURL).NotTo(BeEmpty()) + + // In Running state, the submitter Kubernetes Job must be created if this RayJob is in K8sJobMode. 
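+			// Aside on rule selection: all three rules in this case share the same trigger and
+			// TTLSeconds=0, and the test expects the most impactful action (DeleteSelf) to win, i.e.
+			// the RayJob itself is deleted. Illustrative sketch of that precedence, written here as an
+			// assumption about the intended semantics rather than the controller's actual
+			// implementation; mostImpactful is a hypothetical helper:
+			//
+			//	func mostImpactful(rules []rayv1.DeletionRule) rayv1.DeletionPolicyType {
+			//		rank := map[rayv1.DeletionPolicyType]int{
+			//			rayv1.DeleteNone: 0, rayv1.DeleteWorkers: 1, rayv1.DeleteCluster: 2, rayv1.DeleteSelf: 3,
+			//		}
+			//		best := rules[0].Policy // assumes a non-empty rule list
+			//		for _, r := range rules[1:] {
+			//			if rank[r.Policy] > rank[best] {
+			//				best = r.Policy
+			//			}
+			//		}
+			//		return best
+			//	}
+			//
+			// The later multi-stage case instead staggers TTLSeconds (0s / 5s / 10s) so workers, then
+			// the cluster, then the RayJob itself are removed in order.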
+ namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + err := k8sClient.Get(ctx, namespacedName, job) + Expect(err).NotTo(HaveOccurred(), "failed to get Kubernetes Job") + }) + + By("RayJobs's JobDeploymentStatus transitions from Running to Complete.", func() { + // Update fake dashboard client to return job info with "Succeeded" status. + getJobInfo := func(context.Context, string) (*utiltypes.RayJobInfo, error) { //nolint:unparam // This is a mock function so parameters are required + return &utiltypes.RayJobInfo{JobStatus: rayv1.JobStatusSucceeded, EndTime: uint64(time.Now().UnixMilli())}, nil + } + fakeRayDashboardClient.GetJobInfoMock.Store(&getJobInfo) + + // RayJob transitions to Complete if and only if the corresponding submitter Kubernetes Job is Complete or Failed. + Consistently( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusRunning), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // Update the submitter Kubernetes Job to Complete. + namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + err := k8sClient.Get(ctx, namespacedName, job) + Expect(err).NotTo(HaveOccurred(), "failed to get Kubernetes Job") + + // Update the submitter Kubernetes Job to Complete. + conditions := []batchv1.JobCondition{ + {Type: batchv1.JobComplete, Status: corev1.ConditionTrue}, + } + job.Status.Conditions = conditions + Expect(k8sClient.Status().Update(ctx, job)).Should(Succeed()) + }) + + By("Verify RayJob itself is deleted", func() { + Eventually( + func() bool { + return apierrors.IsNotFound(k8sClient.Get(ctx, client.ObjectKey{Name: rayJob.Name, Namespace: namespace}, rayJob)) + }, time.Second*5, time.Millisecond*500).Should(BeTrue()) + }) + }) + + It("Should execute MOST impactful rule (DeleteSelf) when all rules are overdue on failure", func() { + ctx := context.Background() + namespace := "default" + rayJob := rayJobTemplate("rayjob-test-impactful-rule-override-on-failure", namespace) + rayCluster := &rayv1.RayCluster{} + + // Define the multi-stage DeletionStrategy + rayJob.Spec.DeletionStrategy = &rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteWorkers, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusFailed, + TTLSeconds: 0, + }, + }, + { + Policy: rayv1.DeleteCluster, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusFailed, + TTLSeconds: 0, + }, + }, + { + Policy: rayv1.DeleteSelf, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusFailed, + TTLSeconds: 0, + }, + }, + }, + } + rayJob.Spec.ShutdownAfterJobFinishes = false + + By("Verify RayJob spec", func() { + Expect(*rayJob.Spec.DeletionStrategy).To(Equal(rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteWorkers, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusFailed, + TTLSeconds: 0, + }, + }, + { + Policy: rayv1.DeleteCluster, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusFailed, + TTLSeconds: 0, + }, + }, + { + Policy: rayv1.DeleteSelf, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusFailed, + TTLSeconds: 0, + }, + }, + }, + })) + }) + + By("Create a RayJob custom resource with multi-stage deletion rules", func() { + err := k8sClient.Create(ctx, rayJob) + Expect(err).NotTo(HaveOccurred(), "Failed to create RayJob") + Eventually( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Name, Namespace: 
namespace}, rayJob), + time.Second*3, time.Millisecond*500).Should(Succeed(), "Should be able to see RayJob: %v", rayJob.Name) + }) + + By("RayJobs's JobDeploymentStatus transitions from New to Initializing.", func() { + Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusInitializing), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // In Initializing state, Status.RayClusterName, Status.JobId, and Status.StartTime must be set. + Expect(rayJob.Status.RayClusterName).NotTo(BeEmpty()) + Expect(rayJob.Status.JobId).NotTo(BeEmpty()) + Expect(rayJob.Status.StartTime).NotTo(BeNil()) + }) + + By("In Initializing state, the RayCluster should eventually be created.", func() { + Eventually( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Status.RayClusterName, Namespace: namespace}, rayCluster), + time.Second*3, time.Millisecond*500).Should(Succeed(), "RayCluster %v not found", rayJob.Status.RayClusterName) + + // Check whether RayCluster is consistent with RayJob's RayClusterSpec. + Expect(rayCluster.Spec.WorkerGroupSpecs[0].Replicas).To(Equal(rayJob.Spec.RayClusterSpec.WorkerGroupSpecs[0].Replicas)) + Expect(rayCluster.Spec.RayVersion).To(Equal(rayJob.Spec.RayClusterSpec.RayVersion)) + + // TODO (kevin85421): Check the RayCluster labels. + Expect(rayCluster.Labels).Should(HaveKeyWithValue(utils.RayOriginatedFromCRNameLabelKey, rayJob.Name)) + Expect(rayCluster.Labels).Should(HaveKeyWithValue(utils.RayOriginatedFromCRDLabelKey, utils.RayOriginatedFromCRDLabelValue(utils.RayJobCRD))) + + Expect(rayCluster.Annotations).Should(Equal(rayJob.Annotations)) + }) + + By("Make RayCluster.Status.State to be rayv1.Ready", func() { + // The RayCluster is not 'Ready' yet because Pods are not running and ready. + Expect(rayCluster.Status.State).NotTo(Equal(rayv1.Ready)) + + updateHeadPodToRunningAndReady(ctx, rayJob.Status.RayClusterName, namespace) + updateWorkerPodsToRunningAndReady(ctx, rayJob.Status.RayClusterName, namespace) + + // The RayCluster.Status.State should be Ready. + Eventually( + getClusterState(ctx, namespace, rayCluster.Name), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.Ready)) + }) + + By("RayJobs's JobDeploymentStatus transitions from Initializing to Running.", func() { + Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusRunning), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // In Running state, the RayJob's Status.DashboardURL must be set. + Expect(rayJob.Status.DashboardURL).NotTo(BeEmpty()) + + // In Running state, the submitter Kubernetes Job must be created if this RayJob is in K8sJobMode. + namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + err := k8sClient.Get(ctx, namespacedName, job) + Expect(err).NotTo(HaveOccurred(), "failed to get Kubernetes Job") + }) + + By("RayJobs's JobDeploymentStatus transitions from Running to Complete.", func() { + // Update fake dashboard client to return job info with "Failed" status. 
+ getJobInfo := func(context.Context, string) (*utiltypes.RayJobInfo, error) { //nolint:unparam // This is a mock function so parameters are required + return &utiltypes.RayJobInfo{JobStatus: rayv1.JobStatusFailed, EndTime: uint64(time.Now().UnixMilli())}, nil + } + fakeRayDashboardClient.GetJobInfoMock.Store(&getJobInfo) + + // RayJob transitions to Complete if and only if the corresponding submitter Kubernetes Job is Complete or Failed. + Consistently( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusRunning), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // Update the submitter Kubernetes Job to Complete. + namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + err := k8sClient.Get(ctx, namespacedName, job) + Expect(err).NotTo(HaveOccurred(), "failed to get Kubernetes Job") + + // Update the submitter Kubernetes Job to Complete. + conditions := []batchv1.JobCondition{ + {Type: batchv1.JobComplete, Status: corev1.ConditionTrue}, + } + job.Status.Conditions = conditions + Expect(k8sClient.Status().Update(ctx, job)).Should(Succeed()) + }) + + By("Verify RayJob itself is deleted", func() { + Eventually( + func() bool { + return apierrors.IsNotFound(k8sClient.Get(ctx, client.ObjectKey{Name: rayJob.Name, Namespace: namespace}, rayJob)) + }, time.Second*5, time.Millisecond*500).Should(BeTrue()) + }) + }) + + It("Should process multi-stage deletions in order on success: Workers, then Cluster, then Self", func() { + ctx := context.Background() + namespace := "default" + rayJob := rayJobTemplate("rayjob-test-multistage-deletion-on-success", namespace) + rayCluster := &rayv1.RayCluster{} + + // Define the multi-stage DeletionStrategy + rayJob.Spec.DeletionStrategy = &rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteWorkers, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + TTLSeconds: 0, // Stage 1: Delete workers after 0 seconds + }, + }, + { + Policy: rayv1.DeleteCluster, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + TTLSeconds: 5, // Stage 2: Delete cluster after 5 seconds + }, + }, + { + Policy: rayv1.DeleteSelf, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + TTLSeconds: 10, // Stage 3: Delete self after 10 seconds + }, + }, + }, + } + rayJob.Spec.ShutdownAfterJobFinishes = false + + By("Verify RayJob spec", func() { + Expect(*rayJob.Spec.DeletionStrategy).To(Equal(rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteWorkers, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + TTLSeconds: 0, + }, + }, + { + Policy: rayv1.DeleteCluster, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + TTLSeconds: 5, + }, + }, + { + Policy: rayv1.DeleteSelf, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + TTLSeconds: 10, + }, + }, + }, + })) + }) + + By("Create a RayJob custom resource with multi-stage deletion rules", func() { + err := k8sClient.Create(ctx, rayJob) + Expect(err).NotTo(HaveOccurred(), "Failed to create RayJob") + Eventually( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Name, Namespace: namespace}, rayJob), + time.Second*3, time.Millisecond*500).Should(Succeed(), "Should be able to see RayJob: %v", rayJob.Name) + }) + + By("RayJobs's JobDeploymentStatus transitions from New to Initializing.", func() { + Eventually( 
+ getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusInitializing), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // In Initializing state, Status.RayClusterName, Status.JobId, and Status.StartTime must be set. + Expect(rayJob.Status.RayClusterName).NotTo(BeEmpty()) + Expect(rayJob.Status.JobId).NotTo(BeEmpty()) + Expect(rayJob.Status.StartTime).NotTo(BeNil()) + }) + + By("In Initializing state, the RayCluster should eventually be created.", func() { + Eventually( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Status.RayClusterName, Namespace: namespace}, rayCluster), + time.Second*3, time.Millisecond*500).Should(Succeed(), "RayCluster %v not found", rayJob.Status.RayClusterName) + + // Check whether RayCluster is consistent with RayJob's RayClusterSpec. + Expect(rayCluster.Spec.WorkerGroupSpecs[0].Replicas).To(Equal(rayJob.Spec.RayClusterSpec.WorkerGroupSpecs[0].Replicas)) + Expect(rayCluster.Spec.RayVersion).To(Equal(rayJob.Spec.RayClusterSpec.RayVersion)) + + // TODO (kevin85421): Check the RayCluster labels. + Expect(rayCluster.Labels).Should(HaveKeyWithValue(utils.RayOriginatedFromCRNameLabelKey, rayJob.Name)) + Expect(rayCluster.Labels).Should(HaveKeyWithValue(utils.RayOriginatedFromCRDLabelKey, utils.RayOriginatedFromCRDLabelValue(utils.RayJobCRD))) + + Expect(rayCluster.Annotations).Should(Equal(rayJob.Annotations)) + }) + + By("Make RayCluster.Status.State to be rayv1.Ready", func() { + // The RayCluster is not 'Ready' yet because Pods are not running and ready. + Expect(rayCluster.Status.State).NotTo(Equal(rayv1.Ready)) + + updateHeadPodToRunningAndReady(ctx, rayJob.Status.RayClusterName, namespace) + updateWorkerPodsToRunningAndReady(ctx, rayJob.Status.RayClusterName, namespace) + + // The RayCluster.Status.State should be Ready. + Eventually( + getClusterState(ctx, namespace, rayCluster.Name), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.Ready)) + }) + + By("RayJobs's JobDeploymentStatus transitions from Initializing to Running.", func() { + Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusRunning), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // In Running state, the RayJob's Status.DashboardURL must be set. + Expect(rayJob.Status.DashboardURL).NotTo(BeEmpty()) + + // In Running state, the submitter Kubernetes Job must be created if this RayJob is in K8sJobMode. + namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + err := k8sClient.Get(ctx, namespacedName, job) + Expect(err).NotTo(HaveOccurred(), "failed to get Kubernetes Job") + }) + + By("RayJobs's JobDeploymentStatus transitions from Running to Complete.", func() { + // Update fake dashboard client to return job info with "Succeeded" status. + getJobInfo := func(context.Context, string) (*utiltypes.RayJobInfo, error) { //nolint:unparam // This is a mock function so parameters are required + return &utiltypes.RayJobInfo{JobStatus: rayv1.JobStatusSucceeded, EndTime: uint64(time.Now().UnixMilli())}, nil + } + fakeRayDashboardClient.GetJobInfoMock.Store(&getJobInfo) + defer fakeRayDashboardClient.GetJobInfoMock.Store(nil) + + // RayJob transitions to Complete if and only if the corresponding submitter Kubernetes Job is Complete or Failed. 
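+ // In other words, even though the mocked dashboard already reports the Ray job as
+ // SUCCEEDED, the deployment status is expected to stay Running until the submitter
+ // Kubernetes Job finishes; the Consistently check below asserts exactly that window.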
+ Consistently( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusRunning), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // Update the submitter Kubernetes Job to Complete. + namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + err := k8sClient.Get(ctx, namespacedName, job) + Expect(err).NotTo(HaveOccurred(), "failed to get Kubernetes Job") + + // Update the submitter Kubernetes Job to Complete. + conditions := []batchv1.JobCondition{ + {Type: batchv1.JobComplete, Status: corev1.ConditionTrue}, + } + job.Status.Conditions = conditions + Expect(k8sClient.Status().Update(ctx, job)).Should(Succeed()) + + // RayJob transitions to Complete. + Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*5, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusComplete), "jobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + }) + + By("Stage 1: Verify workers are deleted, but cluster and job still exist", func() { + // RayCluster exists + Consistently( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Status.RayClusterName, Namespace: namespace}, rayCluster), + time.Second*3, time.Millisecond*500).Should(Succeed(), "RayCluster %v not found", rayJob.Status.RayClusterName) + + // Check worker group is suspended + Expect(*rayCluster.Spec.WorkerGroupSpecs[0].Suspend).To(BeTrue()) + + // 0 worker Pods exist + workerPods := corev1.PodList{} + workerLabels := common.RayClusterWorkerPodsAssociationOptions(rayCluster).ToListOptions() + Eventually( + listResourceFunc(ctx, &workerPods, workerLabels...), + time.Second*3, time.Millisecond*500).Should(Equal(0), "expected 0 workers") + + // Head Pod is still running + headPods := corev1.PodList{} + headLabels := common.RayClusterHeadPodsAssociationOptions(rayCluster).ToListOptions() + Consistently( + listResourceFunc(ctx, &headPods, headLabels...), + time.Second*3, time.Millisecond*500).Should(Equal(1), "Head pod list should have only 1 Pod = %v", headPods.Items) + + namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + Consistently( + getResourceFunc(ctx, namespacedName, job), + time.Second*3, time.Millisecond*500).Should(Succeed()) + }) + + By("Stage 2 (after 5s): Verify RayCluster is deleted, but job still exists", func() { + Eventually( + func() bool { + return apierrors.IsNotFound(getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Status.RayClusterName, Namespace: namespace}, rayCluster)()) + }, + time.Second*3, time.Millisecond*500).Should(BeTrue()) + namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + Consistently( + getResourceFunc(ctx, namespacedName, job), + time.Second*3, time.Millisecond*500).Should(Succeed()) + }) + + By("Stage 3 (after 10s): Verify RayJob itself is deleted", func() { + Eventually( + func() bool { + return apierrors.IsNotFound(k8sClient.Get(ctx, client.ObjectKey{Name: rayJob.Name, Namespace: namespace}, rayJob)) + }, time.Second*5, time.Millisecond*500).Should(BeTrue()) + }) + }) + + It("Should process multi-stage deletions in order on failure: Workers, then Cluster, then Self", func() { + ctx := context.Background() + namespace := "default" + rayJob := rayJobTemplate("rayjob-test-multistage-deletion-on-failure", namespace) + rayCluster := &rayv1.RayCluster{} + + // Define the multi-stage DeletionStrategy + rayJob.Spec.DeletionStrategy = &rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: 
rayv1.DeleteWorkers, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusFailed, + TTLSeconds: 0, // Stage 1: Delete workers after 0 seconds + }, + }, + { + Policy: rayv1.DeleteCluster, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusFailed, + TTLSeconds: 5, // Stage 2: Delete cluster after 5 seconds + }, + }, + { + Policy: rayv1.DeleteSelf, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusFailed, + TTLSeconds: 10, // Stage 3: Delete self after 10 seconds + }, + }, + }, + } + rayJob.Spec.ShutdownAfterJobFinishes = false + + By("Verify RayJob spec", func() { + Expect(*rayJob.Spec.DeletionStrategy).To(Equal(rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteWorkers, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusFailed, + TTLSeconds: 0, + }, + }, + { + Policy: rayv1.DeleteCluster, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusFailed, + TTLSeconds: 5, + }, + }, + { + Policy: rayv1.DeleteSelf, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusFailed, + TTLSeconds: 10, + }, + }, + }, + })) + }) + + By("Create a RayJob custom resource with multi-stage deletion rules", func() { + err := k8sClient.Create(ctx, rayJob) + Expect(err).NotTo(HaveOccurred(), "Failed to create RayJob") + Eventually( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Name, Namespace: namespace}, rayJob), + time.Second*3, time.Millisecond*500).Should(Succeed(), "Should be able to see RayJob: %v", rayJob.Name) + }) + + By("RayJobs's JobDeploymentStatus transitions from New to Initializing.", func() { + Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusInitializing), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // In Initializing state, Status.RayClusterName, Status.JobId, and Status.StartTime must be set. + Expect(rayJob.Status.RayClusterName).NotTo(BeEmpty()) + Expect(rayJob.Status.JobId).NotTo(BeEmpty()) + Expect(rayJob.Status.StartTime).NotTo(BeNil()) + }) + + By("In Initializing state, the RayCluster should eventually be created.", func() { + Eventually( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Status.RayClusterName, Namespace: namespace}, rayCluster), + time.Second*3, time.Millisecond*500).Should(Succeed(), "RayCluster %v not found", rayJob.Status.RayClusterName) + + // Check whether RayCluster is consistent with RayJob's RayClusterSpec. + Expect(rayCluster.Spec.WorkerGroupSpecs[0].Replicas).To(Equal(rayJob.Spec.RayClusterSpec.WorkerGroupSpecs[0].Replicas)) + Expect(rayCluster.Spec.RayVersion).To(Equal(rayJob.Spec.RayClusterSpec.RayVersion)) + + // TODO (kevin85421): Check the RayCluster labels. + Expect(rayCluster.Labels).Should(HaveKeyWithValue(utils.RayOriginatedFromCRNameLabelKey, rayJob.Name)) + Expect(rayCluster.Labels).Should(HaveKeyWithValue(utils.RayOriginatedFromCRDLabelKey, utils.RayOriginatedFromCRDLabelValue(utils.RayJobCRD))) + + Expect(rayCluster.Annotations).Should(Equal(rayJob.Annotations)) + }) + + By("Make RayCluster.Status.State to be rayv1.Ready", func() { + // The RayCluster is not 'Ready' yet because Pods are not running and ready. + Expect(rayCluster.Status.State).NotTo(Equal(rayv1.Ready)) + + updateHeadPodToRunningAndReady(ctx, rayJob.Status.RayClusterName, namespace) + updateWorkerPodsToRunningAndReady(ctx, rayJob.Status.RayClusterName, namespace) + + // The RayCluster.Status.State should be Ready. 
+ Eventually(
+ getClusterState(ctx, namespace, rayCluster.Name),
+ time.Second*3, time.Millisecond*500).Should(Equal(rayv1.Ready))
+ })
+
+ By("RayJobs's JobDeploymentStatus transitions from Initializing to Running.", func() {
+ Eventually(
+ getRayJobDeploymentStatus(ctx, rayJob),
+ time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusRunning), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus)
+
+ // In Running state, the RayJob's Status.DashboardURL must be set.
+ Expect(rayJob.Status.DashboardURL).NotTo(BeEmpty())
+
+ // In Running state, the submitter Kubernetes Job must be created if this RayJob is in K8sJobMode.
+ namespacedName := common.RayJobK8sJobNamespacedName(rayJob)
+ job := &batchv1.Job{}
+ err := k8sClient.Get(ctx, namespacedName, job)
+ Expect(err).NotTo(HaveOccurred(), "failed to get Kubernetes Job")
+ })
+
+ By("RayJobs's JobDeploymentStatus transitions from Running to Failed.", func() {
+ // Update fake dashboard client to return job info with "Failed" status.
+ getJobInfo := func(context.Context, string) (*utiltypes.RayJobInfo, error) { //nolint:unparam // This is a mock function so parameters are required
+ return &utiltypes.RayJobInfo{JobStatus: rayv1.JobStatusFailed, EndTime: uint64(time.Now().UnixMilli())}, nil
+ }
+ fakeRayDashboardClient.GetJobInfoMock.Store(&getJobInfo)
+ defer fakeRayDashboardClient.GetJobInfoMock.Store(nil)
+
+ // The RayJob transitions to a terminal JobDeploymentStatus only after the corresponding submitter Kubernetes Job is Complete or Failed.
+ Consistently(
+ getRayJobDeploymentStatus(ctx, rayJob),
+ time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusRunning), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus)
+
+ // Get the submitter Kubernetes Job.
+ namespacedName := common.RayJobK8sJobNamespacedName(rayJob)
+ job := &batchv1.Job{}
+ err := k8sClient.Get(ctx, namespacedName, job)
+ Expect(err).NotTo(HaveOccurred(), "failed to get Kubernetes Job")
+
+ // Update the submitter Kubernetes Job to Complete.
+ conditions := []batchv1.JobCondition{
+ {Type: batchv1.JobComplete, Status: corev1.ConditionTrue},
+ }
+ job.Status.Conditions = conditions
+ Expect(k8sClient.Status().Update(ctx, job)).Should(Succeed())
+
+ // RayJob transitions to Failed.
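+ // Because all three deletion rules above target JobStatusFailed, they should fire in
+ // TTL order once the RayJob reaches the Failed deployment status: workers at ~0s,
+ // the RayCluster at ~5s, and the RayJob itself at ~10s, as verified stage by stage below.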
+ Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*5, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusFailed), "jobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + }) + + By("Stage 1: Verify workers are deleted, but cluster and job still exist", func() { + // RayCluster exists + Consistently( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Status.RayClusterName, Namespace: namespace}, rayCluster), + time.Second*3, time.Millisecond*500).Should(Succeed(), "RayCluster %v not found", rayJob.Status.RayClusterName) + + // Check worker group is suspended + Expect(*rayCluster.Spec.WorkerGroupSpecs[0].Suspend).To(BeTrue()) + + // 0 worker Pods exist + workerPods := corev1.PodList{} + workerLabels := common.RayClusterWorkerPodsAssociationOptions(rayCluster).ToListOptions() + Eventually( + listResourceFunc(ctx, &workerPods, workerLabels...), + time.Second*3, time.Millisecond*500).Should(Equal(0), "expected 0 workers") + + // Head Pod is still running + headPods := corev1.PodList{} + headLabels := common.RayClusterHeadPodsAssociationOptions(rayCluster).ToListOptions() + Consistently( + listResourceFunc(ctx, &headPods, headLabels...), + time.Second*3, time.Millisecond*500).Should(Equal(1), "Head pod list should have only 1 Pod = %v", headPods.Items) + + namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + Consistently( + getResourceFunc(ctx, namespacedName, job), + time.Second*3, time.Millisecond*500).Should(Succeed()) + }) + + By("Stage 2 (after 5s): Verify RayCluster is deleted, but job still exists", func() { + Eventually( + func() bool { + return apierrors.IsNotFound(getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Status.RayClusterName, Namespace: namespace}, rayCluster)()) + }, + time.Second*3, time.Millisecond*500).Should(BeTrue()) + namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + Consistently( + getResourceFunc(ctx, namespacedName, job), + time.Second*3, time.Millisecond*500).Should(Succeed()) + }) + + By("Stage 3 (after 10s): Verify RayJob itself is deleted", func() { + Eventually( + func() bool { + return apierrors.IsNotFound(k8sClient.Get(ctx, client.ObjectKey{Name: rayJob.Name, Namespace: namespace}, rayJob)) + }, time.Second*5, time.Millisecond*500).Should(BeTrue()) + }) + }) }) }) diff --git a/ray-operator/controllers/ray/utils/validation.go b/ray-operator/controllers/ray/utils/validation.go index 74d2b4fe0e6..edda0b772d5 100644 --- a/ray-operator/controllers/ray/utils/validation.go +++ b/ray-operator/controllers/ray/utils/validation.go @@ -161,8 +161,9 @@ func ValidateRayJobSpec(rayJob *rayv1.RayJob) error { return fmt.Errorf("The RayJob spec is invalid: TTLSecondsAfterFinished must be a non-negative integer") } - if !rayJob.Spec.ShutdownAfterJobFinishes && rayJob.Spec.TTLSecondsAfterFinished > 0 { - return fmt.Errorf("The RayJob spec is invalid: a RayJob with shutdownAfterJobFinishes set to false cannot have TTLSecondsAfterFinished") + // Validate TTL and deletion strategy together + if err := validateDeletionConfiguration(rayJob); err != nil { + return err } isClusterSelectorMode := len(rayJob.Spec.ClusterSelector) != 0 @@ -224,46 +225,7 @@ func ValidateRayJobSpec(rayJob *rayv1.RayJob) error { if rayJob.Spec.BackoffLimit != nil && *rayJob.Spec.BackoffLimit < 0 { return fmt.Errorf("The RayJob spec is invalid: backoffLimit must be a positive integer") } - if !features.Enabled(features.RayJobDeletionPolicy) && rayJob.Spec.DeletionStrategy != nil { - return 
fmt.Errorf("The RayJob spec is invalid: RayJobDeletionPolicy feature gate must be enabled to use the DeletionStrategy feature") - } - - if rayJob.Spec.DeletionStrategy != nil { - onSuccessPolicy := rayJob.Spec.DeletionStrategy.OnSuccess - onFailurePolicy := rayJob.Spec.DeletionStrategy.OnFailure - - if onSuccessPolicy.Policy == nil { - return fmt.Errorf("The RayJob spec is invalid: the DeletionPolicyType field of DeletionStrategy.OnSuccess cannot be unset when DeletionStrategy is enabled") - } - if onFailurePolicy.Policy == nil { - return fmt.Errorf("The RayJob spec is invalid: the DeletionPolicyType field of DeletionStrategy.OnFailure cannot be unset when DeletionStrategy is enabled") - } - - if isClusterSelectorMode { - switch *onSuccessPolicy.Policy { - case rayv1.DeleteCluster: - return fmt.Errorf("The RayJob spec is invalid: the ClusterSelector mode doesn't support DeletionStrategy=DeleteCluster on success") - case rayv1.DeleteWorkers: - return fmt.Errorf("The RayJob spec is invalid: the ClusterSelector mode doesn't support DeletionStrategy=DeleteWorkers on success") - } - - switch *onFailurePolicy.Policy { - case rayv1.DeleteCluster: - return fmt.Errorf("The RayJob spec is invalid: the ClusterSelector mode doesn't support DeletionStrategy=DeleteCluster on failure") - case rayv1.DeleteWorkers: - return fmt.Errorf("The RayJob spec is invalid: the ClusterSelector mode doesn't support DeletionStrategy=DeleteWorkers on failure") - } - } - - if (*onSuccessPolicy.Policy == rayv1.DeleteWorkers || *onFailurePolicy.Policy == rayv1.DeleteWorkers) && IsAutoscalingEnabled(rayJob.Spec.RayClusterSpec) { - // TODO (rueian): This can be supported in a future Ray version. We should check the RayVersion once we know it. - return fmt.Errorf("The RayJob spec is invalid: DeletionStrategy=DeleteWorkers currently does not support RayCluster with autoscaling enabled") - } - if rayJob.Spec.ShutdownAfterJobFinishes && (*onSuccessPolicy.Policy == rayv1.DeleteNone || *onFailurePolicy.Policy == rayv1.DeleteNone) { - return fmt.Errorf("The RayJob spec is invalid: shutdownAfterJobFinshes is set to 'true' while deletion policy is 'DeleteNone'") - } - } return nil } @@ -301,3 +263,180 @@ func ValidateRayServiceSpec(rayService *rayv1.RayService) error { return nil } + +// validateDeletionConfiguration validates both deletion strategy and TTL configuration +func validateDeletionConfiguration(rayJob *rayv1.RayJob) error { + if !rayJob.Spec.ShutdownAfterJobFinishes && rayJob.Spec.TTLSecondsAfterFinished > 0 { + return fmt.Errorf("The RayJob spec is invalid: a RayJob with shutdownAfterJobFinishes set to false cannot have TTLSecondsAfterFinished") + } + + // No strategy block: nothing else to validate. + if rayJob.Spec.DeletionStrategy == nil { + return nil + } + + // Feature gate must be enabled for any strategy usage. + if !features.Enabled(features.RayJobDeletionPolicy) { + return fmt.Errorf("RayJobDeletionPolicy feature gate must be enabled to use DeletionStrategy") + } + + legacyConfigured := rayJob.Spec.DeletionStrategy.OnSuccess != nil || rayJob.Spec.DeletionStrategy.OnFailure != nil + rulesConfigured := len(rayJob.Spec.DeletionStrategy.DeletionRules) > 0 + + // Mutual exclusivity: rules mode forbids shutdown & legacy. (TTL+rules is implicitly invalid because TTL requires shutdown.) 
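+ // For illustration only (hypothetical spec, not taken from this change): a RayJob that
+ // combines both fields would be rejected by the check below, e.g.
+ //
+ //   spec:
+ //     shutdownAfterJobFinishes: true
+ //     deletionStrategy:
+ //       deletionRules:
+ //         - policy: DeleteSelf
+ //           condition:
+ //             jobStatus: SUCCEEDED
+ //             ttlSeconds: 30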
+ if rulesConfigured && rayJob.Spec.ShutdownAfterJobFinishes { + return fmt.Errorf("The RayJob spec is invalid: spec.shutdownAfterJobFinishes and spec.deletionStrategy.deletionRules are mutually exclusive") + } + if rulesConfigured && legacyConfigured { + return fmt.Errorf("The RayJob spec is invalid: Cannot use both legacy onSuccess/onFailure fields and deletionRules simultaneously") + } + + // Detailed content validation + if legacyConfigured { + if err := validateLegacyDeletionPolicies(rayJob); err != nil { + return err + } + } else if rulesConfigured { + if err := validateDeletionRules(rayJob); err != nil { + return err + } + } else { + return fmt.Errorf("The RayJob spec is invalid: DeletionStrategy requires either BOTH onSuccess and onFailure, OR the deletionRules field (cannot be empty)") + } + + return nil +} + +// validateDeletionRules validates the deletion rules in the RayJob spec. +// It performs per-rule validations, checks for uniqueness, and ensures logical TTL consistency. +// Errors are collected and returned as a single aggregated error using errors.Join for better user feedback. +func validateDeletionRules(rayJob *rayv1.RayJob) error { + rules := rayJob.Spec.DeletionStrategy.DeletionRules + isClusterSelectorMode := len(rayJob.Spec.ClusterSelector) != 0 + + // Group TTLs by JobStatus for cross-rule validation and uniqueness checking. + rulesByStatus := make(map[rayv1.JobStatus]map[rayv1.DeletionPolicyType]int32) + var errs []error + + // Single pass: Validate each rule individually and group for later consistency checks. + for i, rule := range rules { + // Validate TTL is non-negative. + if rule.Condition.TTLSeconds < 0 { + errs = append(errs, fmt.Errorf("deletionRules[%d]: TTLSeconds must be non-negative", i)) + continue + } + + // Contextual validations based on spec. + if isClusterSelectorMode && (rule.Policy == rayv1.DeleteCluster || rule.Policy == rayv1.DeleteWorkers) { + errs = append(errs, fmt.Errorf("deletionRules[%d]: DeletionPolicyType '%s' not supported when ClusterSelector is set", i, rule.Policy)) + continue + } + if IsAutoscalingEnabled(rayJob.Spec.RayClusterSpec) && rule.Policy == rayv1.DeleteWorkers { + // TODO (rueian): Support in future Ray versions by checking RayVersion. + errs = append(errs, fmt.Errorf("deletionRules[%d]: DeletionPolicyType 'DeleteWorkers' not supported with autoscaling enabled", i)) + continue + } + + // Group valid rule for consistency check. + policyTTLs, ok := rulesByStatus[rule.Condition.JobStatus] + if !ok { + policyTTLs = make(map[rayv1.DeletionPolicyType]int32) + rulesByStatus[rule.Condition.JobStatus] = policyTTLs + } + + // Check for uniqueness of (JobStatus, DeletionPolicyType) pair. + if _, exists := policyTTLs[rule.Policy]; exists { + errs = append(errs, fmt.Errorf("deletionRules[%d]: duplicate rule for DeletionPolicyType '%s' and JobStatus '%s'", i, rule.Policy, rule.Condition.JobStatus)) + continue + } + + policyTTLs[rule.Policy] = rule.Condition.TTLSeconds + } + + // Second pass: Validate TTL consistency per JobStatus. + for status, policyTTLs := range rulesByStatus { + if err := validateTTLConsistency(policyTTLs, status); err != nil { + errs = append(errs, err) + } + } + + return errstd.Join(errs...) +} + +// validateTTLConsistency ensures TTLs follow the deletion hierarchy: Workers <= Cluster <= Self. +// (Lower TTL means deletes earlier.) +func validateTTLConsistency(policyTTLs map[rayv1.DeletionPolicyType]int32, status rayv1.JobStatus) error { + // Define the required deletion order. 
TTLs must be non-decreasing along this sequence. + deletionOrder := []rayv1.DeletionPolicyType{ + rayv1.DeleteWorkers, + rayv1.DeleteCluster, + rayv1.DeleteSelf, + } + + var prevPolicy rayv1.DeletionPolicyType + var prevTTL int32 + var hasPrev bool + + var errs []error + + for _, policy := range deletionOrder { + ttl, exists := policyTTLs[policy] + if !exists { + continue + } + + if hasPrev && ttl < prevTTL { + errs = append(errs, fmt.Errorf( + "for JobStatus '%s': %s TTL (%d) must be >= %s TTL (%d)", + status, policy, ttl, prevPolicy, prevTTL, + )) + } + + prevPolicy = policy + prevTTL = ttl + hasPrev = true + } + + return errstd.Join(errs...) +} + +// validateLegacyDeletionPolicies handles validation for the old `onSuccess` and `onFailure` fields. +func validateLegacyDeletionPolicies(rayJob *rayv1.RayJob) error { + isClusterSelectorMode := len(rayJob.Spec.ClusterSelector) != 0 + + // Both policies must be set if using the legacy API. + if rayJob.Spec.DeletionStrategy.OnSuccess == nil || rayJob.Spec.DeletionStrategy.OnFailure == nil { + return fmt.Errorf("both DeletionStrategy.OnSuccess and DeletionStrategy.OnFailure must be set when using the legacy deletion policy fields of DeletionStrategy") + } + + // Validate that the Policy field is set within each policy. + onSuccessPolicy := rayJob.Spec.DeletionStrategy.OnSuccess + onFailurePolicy := rayJob.Spec.DeletionStrategy.OnFailure + + if onSuccessPolicy.Policy == nil { + return fmt.Errorf("the DeletionPolicyType field of DeletionStrategy.OnSuccess cannot be unset when DeletionStrategy is enabled") + } + if onFailurePolicy.Policy == nil { + return fmt.Errorf("the DeletionPolicyType field of DeletionStrategy.OnFailure cannot be unset when DeletionStrategy is enabled") + } + + if isClusterSelectorMode { + if *onSuccessPolicy.Policy == rayv1.DeleteCluster || *onSuccessPolicy.Policy == rayv1.DeleteWorkers { + return fmt.Errorf("the ClusterSelector mode doesn't support DeletionStrategy=%s on success", *onSuccessPolicy.Policy) + } + if *onFailurePolicy.Policy == rayv1.DeleteCluster || *onFailurePolicy.Policy == rayv1.DeleteWorkers { + return fmt.Errorf("the ClusterSelector mode doesn't support DeletionStrategy=%s on failure", *onFailurePolicy.Policy) + } + } + + if (*onSuccessPolicy.Policy == rayv1.DeleteWorkers || *onFailurePolicy.Policy == rayv1.DeleteWorkers) && IsAutoscalingEnabled(rayJob.Spec.RayClusterSpec) { + // TODO (rueian): This can be supported in a future Ray version. We should check the RayVersion once we know it. 
+ return fmt.Errorf("DeletionStrategy=DeleteWorkers currently does not support RayCluster with autoscaling enabled") + } + + if rayJob.Spec.ShutdownAfterJobFinishes && (*onSuccessPolicy.Policy == rayv1.DeleteNone || *onFailurePolicy.Policy == rayv1.DeleteNone) { + return fmt.Errorf("The RayJob spec is invalid: shutdownAfterJobFinshes is set to 'true' while deletion policy is 'DeleteNone'") + } + + return nil +} diff --git a/ray-operator/controllers/ray/utils/validation_test.go b/ray-operator/controllers/ray/utils/validation_test.go index dc464424f40..30eb499beaf 100644 --- a/ray-operator/controllers/ray/utils/validation_test.go +++ b/ray-operator/controllers/ray/utils/validation_test.go @@ -795,10 +795,10 @@ func TestValidateRayJobSpec(t *testing.T) { name: "RayJobDeletionPolicy feature gate must be enabled to use the DeletionStrategy feature", spec: rayv1.RayJobSpec{ DeletionStrategy: &rayv1.DeletionStrategy{ - OnSuccess: rayv1.DeletionPolicy{ + OnSuccess: &rayv1.DeletionPolicy{ Policy: ptr.To(rayv1.DeleteCluster), }, - OnFailure: rayv1.DeletionPolicy{ + OnFailure: &rayv1.DeletionPolicy{ Policy: ptr.To(rayv1.DeleteCluster), }, }, @@ -956,14 +956,15 @@ func TestValidateRayJobSpecWithFeatureGate(t *testing.T) { spec rayv1.RayJobSpec expectError bool }{ + // Legacy DeletionStrategy tests { name: "the ClusterSelector mode doesn't support DeletionStrategy=DeleteCluster", spec: rayv1.RayJobSpec{ DeletionStrategy: &rayv1.DeletionStrategy{ - OnSuccess: rayv1.DeletionPolicy{ + OnSuccess: &rayv1.DeletionPolicy{ Policy: ptr.To(rayv1.DeleteCluster), }, - OnFailure: rayv1.DeletionPolicy{ + OnFailure: &rayv1.DeletionPolicy{ Policy: ptr.To(rayv1.DeleteCluster), }, }, ClusterSelector: map[string]string{"key": "value"}, @@ -974,10 +975,10 @@ func TestValidateRayJobSpecWithFeatureGate(t *testing.T) { name: "the ClusterSelector mode doesn't support DeletionStrategy=DeleteWorkers", spec: rayv1.RayJobSpec{ DeletionStrategy: &rayv1.DeletionStrategy{ - OnSuccess: rayv1.DeletionPolicy{ + OnSuccess: &rayv1.DeletionPolicy{ Policy: ptr.To(rayv1.DeleteWorkers), }, - OnFailure: rayv1.DeletionPolicy{ + OnFailure: &rayv1.DeletionPolicy{ Policy: ptr.To(rayv1.DeleteWorkers), }, }, ClusterSelector: map[string]string{"key": "value"}, @@ -988,10 +989,10 @@ func TestValidateRayJobSpecWithFeatureGate(t *testing.T) { name: "DeletionStrategy=DeleteWorkers currently does not support RayCluster with autoscaling enabled", spec: rayv1.RayJobSpec{ DeletionStrategy: &rayv1.DeletionStrategy{ - OnSuccess: rayv1.DeletionPolicy{ + OnSuccess: &rayv1.DeletionPolicy{ Policy: ptr.To(rayv1.DeleteWorkers), }, - OnFailure: rayv1.DeletionPolicy{ + OnFailure: &rayv1.DeletionPolicy{ Policy: ptr.To(rayv1.DeleteWorkers), }, }, RayClusterSpec: &rayv1.RayClusterSpec{ @@ -1005,10 +1006,10 @@ func TestValidateRayJobSpecWithFeatureGate(t *testing.T) { name: "valid RayJob with DeletionStrategy=DeleteCluster", spec: rayv1.RayJobSpec{ DeletionStrategy: &rayv1.DeletionStrategy{ - OnSuccess: rayv1.DeletionPolicy{ + OnSuccess: &rayv1.DeletionPolicy{ Policy: ptr.To(rayv1.DeleteCluster), }, - OnFailure: rayv1.DeletionPolicy{ + OnFailure: &rayv1.DeletionPolicy{ Policy: ptr.To(rayv1.DeleteCluster), }, }, ShutdownAfterJobFinishes: true, @@ -1029,10 +1030,10 @@ func TestValidateRayJobSpecWithFeatureGate(t *testing.T) { name: "shutdownAfterJobFinshes is set to 'true' while deletion policy is 'DeleteNone'", spec: rayv1.RayJobSpec{ DeletionStrategy: &rayv1.DeletionStrategy{ - OnSuccess: rayv1.DeletionPolicy{ + OnSuccess: &rayv1.DeletionPolicy{ Policy: 
ptr.To(rayv1.DeleteNone), }, - OnFailure: rayv1.DeletionPolicy{ + OnFailure: &rayv1.DeletionPolicy{ Policy: ptr.To(rayv1.DeleteNone), }, }, ShutdownAfterJobFinishes: true, @@ -1044,7 +1045,7 @@ func TestValidateRayJobSpecWithFeatureGate(t *testing.T) { name: "OnSuccess unset", spec: rayv1.RayJobSpec{ DeletionStrategy: &rayv1.DeletionStrategy{ - OnFailure: rayv1.DeletionPolicy{ + OnFailure: &rayv1.DeletionPolicy{ Policy: ptr.To(rayv1.DeleteNone), }, }, ShutdownAfterJobFinishes: true, @@ -1056,7 +1057,7 @@ func TestValidateRayJobSpecWithFeatureGate(t *testing.T) { name: "OnSuccess.DeletionPolicyType unset", spec: rayv1.RayJobSpec{ DeletionStrategy: &rayv1.DeletionStrategy{ - OnFailure: rayv1.DeletionPolicy{ + OnFailure: &rayv1.DeletionPolicy{ Policy: ptr.To(rayv1.DeleteNone), }, }, ShutdownAfterJobFinishes: true, @@ -1068,7 +1069,7 @@ func TestValidateRayJobSpecWithFeatureGate(t *testing.T) { name: "OnFailure unset", spec: rayv1.RayJobSpec{ DeletionStrategy: &rayv1.DeletionStrategy{ - OnSuccess: rayv1.DeletionPolicy{ + OnSuccess: &rayv1.DeletionPolicy{ Policy: ptr.To(rayv1.DeleteNone), }, }, ShutdownAfterJobFinishes: true, @@ -1080,10 +1081,10 @@ func TestValidateRayJobSpecWithFeatureGate(t *testing.T) { name: "OnFailure.DeletionPolicyType unset", spec: rayv1.RayJobSpec{ DeletionStrategy: &rayv1.DeletionStrategy{ - OnSuccess: rayv1.DeletionPolicy{ + OnSuccess: &rayv1.DeletionPolicy{ Policy: ptr.To(rayv1.DeleteNone), }, - OnFailure: rayv1.DeletionPolicy{}, + OnFailure: &rayv1.DeletionPolicy{}, }, ShutdownAfterJobFinishes: true, RayClusterSpec: createBasicRayClusterSpec(), }, @@ -1098,6 +1099,272 @@ func TestValidateRayJobSpecWithFeatureGate(t *testing.T) { }, expectError: true, }, + // New Deletion Rules tests + { + name: "valid deletionRules", + spec: rayv1.RayJobSpec{ + DeletionStrategy: &rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteSelf, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + TTLSeconds: 10, + }, + }, + }, + }, + RayClusterSpec: createBasicRayClusterSpec(), + }, + expectError: false, + }, + { + name: "deletionRules and ShutdownAfterJobFinishes both set", + spec: rayv1.RayJobSpec{ + ShutdownAfterJobFinishes: true, + DeletionStrategy: &rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteSelf, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + TTLSeconds: 10, + }, + }, + }, + }, + RayClusterSpec: createBasicRayClusterSpec(), + }, + expectError: true, + }, + { + name: "deletionRules and legacy onSuccess both set", + spec: rayv1.RayJobSpec{ + DeletionStrategy: &rayv1.DeletionStrategy{ + OnSuccess: &rayv1.DeletionPolicy{ + Policy: ptr.To(rayv1.DeleteCluster), + }, + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteSelf, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + TTLSeconds: 10, + }, + }, + }, + }, + RayClusterSpec: createBasicRayClusterSpec(), + }, + expectError: true, + }, + { + name: "nil DeletionStrategy", + spec: rayv1.RayJobSpec{ + DeletionStrategy: &rayv1.DeletionStrategy{}, + RayClusterSpec: createBasicRayClusterSpec(), + }, + expectError: true, + }, + { + name: "empty DeletionStrategy", + spec: rayv1.RayJobSpec{ + DeletionStrategy: &rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{}, + }, + RayClusterSpec: createBasicRayClusterSpec(), + }, + expectError: true, + }, + { + name: "duplicate rule in deletionRules", + spec: rayv1.RayJobSpec{ + DeletionStrategy: 
&rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteSelf, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + TTLSeconds: 10, + }, + }, + { + Policy: rayv1.DeleteSelf, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + TTLSeconds: 20, + }, + }, + }, + }, + RayClusterSpec: createBasicRayClusterSpec(), + }, + expectError: true, + }, + { + name: "negative TTLSeconds in deletionRules", + spec: rayv1.RayJobSpec{ + DeletionStrategy: &rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteSelf, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + TTLSeconds: -10, + }, + }, + }, + }, + RayClusterSpec: createBasicRayClusterSpec(), + }, + expectError: true, + }, + { + name: "deletionRules with ClusterSelector and DeleteWorkers policy", + spec: rayv1.RayJobSpec{ + ClusterSelector: map[string]string{"key": "value"}, + DeletionStrategy: &rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteWorkers, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + TTLSeconds: 10, + }, + }, + }, + }, + }, + expectError: true, + }, + { + name: "deletionRules with ClusterSelector and DeleteCluster policy", + spec: rayv1.RayJobSpec{ + ClusterSelector: map[string]string{"key": "value"}, + DeletionStrategy: &rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteCluster, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + TTLSeconds: 10, + }, + }, + }, + }, + }, + expectError: true, + }, + { + name: "deletionRules with autoscaling and DeleteWorkers policy", + spec: rayv1.RayJobSpec{ + DeletionStrategy: &rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteWorkers, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + TTLSeconds: 10, + }, + }, + }, + }, + RayClusterSpec: &rayv1.RayClusterSpec{ + EnableInTreeAutoscaling: ptr.To(true), + HeadGroupSpec: headGroupSpecWithOneContainer, + }, + }, + expectError: true, + }, + { + name: "inconsistent TTLs in deletionRules (DeleteCluster < DeleteWorkers)", + spec: rayv1.RayJobSpec{ + DeletionStrategy: &rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteWorkers, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + TTLSeconds: 20, + }, + }, + { + Policy: rayv1.DeleteCluster, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + TTLSeconds: 10, + }, + }, + }, + }, + RayClusterSpec: createBasicRayClusterSpec(), + }, + expectError: true, + }, + { + name: "inconsistent TTLs in deletionRules (DeleteSelf < DeleteCluster)", + spec: rayv1.RayJobSpec{ + DeletionStrategy: &rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteCluster, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + TTLSeconds: 20, + }, + }, + { + Policy: rayv1.DeleteSelf, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + TTLSeconds: 10, + }, + }, + }, + }, + RayClusterSpec: createBasicRayClusterSpec(), + }, + expectError: true, + }, + { + name: "valid complex deletionRules", + spec: rayv1.RayJobSpec{ + DeletionStrategy: &rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteWorkers, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + TTLSeconds: 10, + }, + 
}, + { + Policy: rayv1.DeleteCluster, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + TTLSeconds: 20, + }, + }, + { + Policy: rayv1.DeleteSelf, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + TTLSeconds: 30, + }, + }, + { + Policy: rayv1.DeleteSelf, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusFailed, + TTLSeconds: 0, + }, + }, + }, + }, + RayClusterSpec: createBasicRayClusterSpec(), + }, + expectError: false, + }, } features.SetFeatureGateDuringTest(t, features.RayJobDeletionPolicy, true) diff --git a/ray-operator/pkg/client/applyconfiguration/ray/v1/deletioncondition.go b/ray-operator/pkg/client/applyconfiguration/ray/v1/deletioncondition.go new file mode 100644 index 00000000000..36b8c006209 --- /dev/null +++ b/ray-operator/pkg/client/applyconfiguration/ray/v1/deletioncondition.go @@ -0,0 +1,36 @@ +// Code generated by applyconfiguration-gen. DO NOT EDIT. + +package v1 + +import ( + rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1" +) + +// DeletionConditionApplyConfiguration represents a declarative configuration of the DeletionCondition type for use +// with apply. +type DeletionConditionApplyConfiguration struct { + JobStatus *rayv1.JobStatus `json:"jobStatus,omitempty"` + TTLSeconds *int32 `json:"ttlSeconds,omitempty"` +} + +// DeletionConditionApplyConfiguration constructs a declarative configuration of the DeletionCondition type for use with +// apply. +func DeletionCondition() *DeletionConditionApplyConfiguration { + return &DeletionConditionApplyConfiguration{} +} + +// WithJobStatus sets the JobStatus field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the JobStatus field is set to the value of the last call. +func (b *DeletionConditionApplyConfiguration) WithJobStatus(value rayv1.JobStatus) *DeletionConditionApplyConfiguration { + b.JobStatus = &value + return b +} + +// WithTTLSeconds sets the TTLSeconds field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the TTLSeconds field is set to the value of the last call. +func (b *DeletionConditionApplyConfiguration) WithTTLSeconds(value int32) *DeletionConditionApplyConfiguration { + b.TTLSeconds = &value + return b +} diff --git a/ray-operator/pkg/client/applyconfiguration/ray/v1/deletionrule.go b/ray-operator/pkg/client/applyconfiguration/ray/v1/deletionrule.go new file mode 100644 index 00000000000..91e4b50de99 --- /dev/null +++ b/ray-operator/pkg/client/applyconfiguration/ray/v1/deletionrule.go @@ -0,0 +1,36 @@ +// Code generated by applyconfiguration-gen. DO NOT EDIT. + +package v1 + +import ( + rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1" +) + +// DeletionRuleApplyConfiguration represents a declarative configuration of the DeletionRule type for use +// with apply. +type DeletionRuleApplyConfiguration struct { + Policy *rayv1.DeletionPolicyType `json:"policy,omitempty"` + Condition *DeletionConditionApplyConfiguration `json:"condition,omitempty"` +} + +// DeletionRuleApplyConfiguration constructs a declarative configuration of the DeletionRule type for use with +// apply. 
+func DeletionRule() *DeletionRuleApplyConfiguration { + return &DeletionRuleApplyConfiguration{} +} + +// WithPolicy sets the Policy field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the Policy field is set to the value of the last call. +func (b *DeletionRuleApplyConfiguration) WithPolicy(value rayv1.DeletionPolicyType) *DeletionRuleApplyConfiguration { + b.Policy = &value + return b +} + +// WithCondition sets the Condition field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the Condition field is set to the value of the last call. +func (b *DeletionRuleApplyConfiguration) WithCondition(value *DeletionConditionApplyConfiguration) *DeletionRuleApplyConfiguration { + b.Condition = value + return b +} diff --git a/ray-operator/pkg/client/applyconfiguration/ray/v1/deletionstrategy.go b/ray-operator/pkg/client/applyconfiguration/ray/v1/deletionstrategy.go index 105c33d3de7..034cce827cb 100644 --- a/ray-operator/pkg/client/applyconfiguration/ray/v1/deletionstrategy.go +++ b/ray-operator/pkg/client/applyconfiguration/ray/v1/deletionstrategy.go @@ -5,8 +5,9 @@ package v1 // DeletionStrategyApplyConfiguration represents a declarative configuration of the DeletionStrategy type for use // with apply. type DeletionStrategyApplyConfiguration struct { - OnSuccess *DeletionPolicyApplyConfiguration `json:"onSuccess,omitempty"` - OnFailure *DeletionPolicyApplyConfiguration `json:"onFailure,omitempty"` + OnSuccess *DeletionPolicyApplyConfiguration `json:"onSuccess,omitempty"` + OnFailure *DeletionPolicyApplyConfiguration `json:"onFailure,omitempty"` + DeletionRules []DeletionRuleApplyConfiguration `json:"deletionRules,omitempty"` } // DeletionStrategyApplyConfiguration constructs a declarative configuration of the DeletionStrategy type for use with @@ -30,3 +31,16 @@ func (b *DeletionStrategyApplyConfiguration) WithOnFailure(value *DeletionPolicy b.OnFailure = value return b } + +// WithDeletionRules adds the given value to the DeletionRules field in the declarative configuration +// and returns the receiver, so that objects can be build by chaining "With" function invocations. +// If called multiple times, values provided by each call will be appended to the DeletionRules field. 
+func (b *DeletionStrategyApplyConfiguration) WithDeletionRules(values ...*DeletionRuleApplyConfiguration) *DeletionStrategyApplyConfiguration { + for i := range values { + if values[i] == nil { + panic("nil value passed to WithDeletionRules") + } + b.DeletionRules = append(b.DeletionRules, *values[i]) + } + return b +} diff --git a/ray-operator/pkg/client/applyconfiguration/utils.go b/ray-operator/pkg/client/applyconfiguration/utils.go index 23e455d739a..050733b0c5e 100644 --- a/ray-operator/pkg/client/applyconfiguration/utils.go +++ b/ray-operator/pkg/client/applyconfiguration/utils.go @@ -20,8 +20,12 @@ func ForKind(kind schema.GroupVersionKind) interface{} { return &rayv1.AppStatusApplyConfiguration{} case v1.SchemeGroupVersion.WithKind("AutoscalerOptions"): return &rayv1.AutoscalerOptionsApplyConfiguration{} + case v1.SchemeGroupVersion.WithKind("DeletionCondition"): + return &rayv1.DeletionConditionApplyConfiguration{} case v1.SchemeGroupVersion.WithKind("DeletionPolicy"): return &rayv1.DeletionPolicyApplyConfiguration{} + case v1.SchemeGroupVersion.WithKind("DeletionRule"): + return &rayv1.DeletionRuleApplyConfiguration{} case v1.SchemeGroupVersion.WithKind("DeletionStrategy"): return &rayv1.DeletionStrategyApplyConfiguration{} case v1.SchemeGroupVersion.WithKind("GcsFaultToleranceOptions"): diff --git a/ray-operator/test/e2erayjob/rayjob_deletion_strategy_test.go b/ray-operator/test/e2erayjob/rayjob_deletion_strategy_test.go new file mode 100644 index 00000000000..49718d3544b --- /dev/null +++ b/ray-operator/test/e2erayjob/rayjob_deletion_strategy_test.go @@ -0,0 +1,504 @@ +package e2erayjob + +import ( + "testing" + "time" + + . "github.com/onsi/gomega" + k8serrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1" + rayv1ac "github.com/ray-project/kuberay/ray-operator/pkg/client/applyconfiguration/ray/v1" + . "github.com/ray-project/kuberay/ray-operator/test/support" +) + +func TestDeletionStrategy(t *testing.T) { + test := With(t) + g := NewWithT(t) + + // Create a namespace + namespace := test.NewTestNamespace() + + // Job scripts - using existing counter.py for successful jobs and fail.py for failed jobs + // Note: This test suite requires the RayJobDeletionPolicy feature gate to be enabled + jobsAC := NewConfigMap(namespace.Name, Files(test, "counter.py", "fail.py")) + jobs, err := test.Client().Core().CoreV1().ConfigMaps(namespace.Name).Apply(test.Ctx(), jobsAC, TestApplyOptions) + g.Expect(err).NotTo(HaveOccurred()) + LogWithTimestamp(test.T(), "Created ConfigMap %s/%s successfully", jobs.Namespace, jobs.Name) + + test.T().Run("DeletionRules with DeleteWorkers policy should delete only worker pods", func(_ *testing.T) { + // Create RayJob with DeleteWorkers policy and short TTL for faster testing + rayJobAC := rayv1ac.RayJob("delete-workers-test", namespace.Name). + WithSpec(rayv1ac.RayJobSpec(). + WithRayClusterSpec(NewRayClusterSpec(MountConfigMap[rayv1ac.RayClusterSpecApplyConfiguration](jobs, "/home/ray/jobs"))). + WithEntrypoint("python /home/ray/jobs/counter.py"). + WithRuntimeEnvYAML(` +env_vars: + counter_name: test_counter +`). + WithShutdownAfterJobFinishes(false). // Required when using DeletionStrategy + WithDeletionStrategy(rayv1ac.DeletionStrategy(). + WithDeletionRules( + rayv1ac.DeletionRule(). + WithPolicy(rayv1.DeleteWorkers). + WithCondition(rayv1ac.DeletionCondition(). + WithJobStatus(rayv1.JobStatusSucceeded). 
+ WithTTLSeconds(10)), // 10 second TTL for testing + )). + WithSubmitterPodTemplate(JobSubmitterPodTemplateApplyConfiguration())) + + rayJob, err := test.Client().Ray().RayV1().RayJobs(namespace.Name).Apply(test.Ctx(), rayJobAC, TestApplyOptions) + g.Expect(err).NotTo(HaveOccurred()) + LogWithTimestamp(test.T(), "Created RayJob %s/%s successfully", rayJob.Namespace, rayJob.Name) + + // Wait for job to complete successfully + g.Eventually(RayJob(test, rayJob.Namespace, rayJob.Name), TestTimeoutMedium). + Should(WithTransform(RayJobStatus, Equal(rayv1.JobStatusSucceeded))) + LogWithTimestamp(test.T(), "RayJob %s/%s completed successfully", rayJob.Namespace, rayJob.Name) + + // Get the associated RayCluster name. We assert it's non-empty explicitly so that + // test failures surface here (clear message) rather than later when using an empty name. + rayJob, err = GetRayJob(test, rayJob.Namespace, rayJob.Name) + g.Expect(err).NotTo(HaveOccurred()) + rayClusterName := rayJob.Status.RayClusterName + g.Expect(rayClusterName).NotTo(BeEmpty()) + + // Verify cluster and workers exist initially + g.Eventually(RayCluster(test, namespace.Name, rayClusterName), TestTimeoutShort). + Should(WithTransform(RayClusterState, Equal(rayv1.Ready))) + + // Count initial worker pods + cluster, err := GetRayCluster(test, namespace.Name, rayClusterName) + g.Expect(err).NotTo(HaveOccurred()) + initialWorkerPods, err := GetWorkerPods(test, cluster) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(initialWorkerPods).ToNot(BeEmpty()) + LogWithTimestamp(test.T(), "Found %d worker pods initially", len(initialWorkerPods)) + + // Verify resources persist during TTL wait period (first 8 seconds of 10s TTL) + LogWithTimestamp(test.T(), "Verifying resources persist during TTL wait period...") + g.Consistently(func(gg Gomega) { + cluster, err := GetRayCluster(test, namespace.Name, rayClusterName) + gg.Expect(err).NotTo(HaveOccurred()) + gg.Expect(cluster).NotTo(BeNil()) + workerPods, err := GetWorkerPods(test, cluster) + gg.Expect(err).NotTo(HaveOccurred()) + gg.Expect(workerPods).ToNot(BeEmpty()) + headPod, err := GetHeadPod(test, cluster) + gg.Expect(err).NotTo(HaveOccurred()) + gg.Expect(headPod).NotTo(BeNil()) + jobObj, err := GetRayJob(test, rayJob.Namespace, rayJob.Name) + gg.Expect(err).NotTo(HaveOccurred()) + gg.Expect(jobObj).NotTo(BeNil()) + }, 8*time.Second, 2*time.Second).Should(Succeed()) // Check every 2s for 8s + LogWithTimestamp(test.T(), "Resources confirmed stable during TTL wait period") + + // Wait for TTL to expire and workers to be deleted + LogWithTimestamp(test.T(), "Waiting for TTL to expire and workers to be deleted...") + g.Eventually(func(gg Gomega) { + cluster, err := GetRayCluster(test, namespace.Name, rayClusterName) + gg.Expect(err).NotTo(HaveOccurred()) + gg.Expect(cluster).NotTo(BeNil()) + workerPods, err := GetWorkerPods(test, cluster) + gg.Expect(err).NotTo(HaveOccurred()) + gg.Expect(workerPods).To(BeEmpty()) + }, TestTimeoutMedium).Should(Succeed()) + LogWithTimestamp(test.T(), "Worker pods deleted successfully") + + // Verify cluster still exists (head pod should remain) + g.Consistently(RayCluster(test, namespace.Name, rayClusterName), 10*time.Second). 
+			Should(WithTransform(RayClusterState, Equal(rayv1.Ready)))
+
+		// Verify head pod still exists
+		cluster, err = GetRayCluster(test, namespace.Name, rayClusterName)
+		g.Expect(err).NotTo(HaveOccurred())
+		headPod, err := GetHeadPod(test, cluster)
+		g.Expect(err).NotTo(HaveOccurred())
+		g.Expect(headPod).NotTo(BeNil())
+		LogWithTimestamp(test.T(), "Head pod preserved as expected")
+
+		// Verify RayJob still exists
+		jobObj, err := GetRayJob(test, rayJob.Namespace, rayJob.Name)
+		g.Expect(err).NotTo(HaveOccurred())
+		g.Expect(jobObj).NotTo(BeNil())
+		LogWithTimestamp(test.T(), "RayJob preserved as expected")
+
+		// Cleanup: delete RayJob to free resources (cluster should be GC'd eventually if owned)
+		LogWithTimestamp(test.T(), "Cleaning up RayJob %s/%s after DeleteWorkers scenario", jobObj.Namespace, jobObj.Name)
+		err = test.Client().Ray().RayV1().RayJobs(jobObj.Namespace).Delete(test.Ctx(), jobObj.Name, metav1.DeleteOptions{})
+		g.Expect(err).NotTo(HaveOccurred())
+		g.Eventually(func() error { _, err := GetRayJob(test, jobObj.Namespace, jobObj.Name); return err }, TestTimeoutMedium).Should(WithTransform(k8serrors.IsNotFound, BeTrue()))
+		// Cluster may take a moment to be garbage collected; tolerate already-deleted state
+		g.Eventually(func() error {
+			_, err := GetRayCluster(test, namespace.Name, rayClusterName)
+			return err
+		}, TestTimeoutMedium).Should(WithTransform(k8serrors.IsNotFound, BeTrue()))
+		LogWithTimestamp(test.T(), "Cleanup after DeleteWorkers scenario complete")
+	})
+
+	test.T().Run("DeletionRules with DeleteCluster policy should delete entire cluster", func(_ *testing.T) {
+		rayJobAC := rayv1ac.RayJob("delete-cluster-test", namespace.Name).
+			WithSpec(rayv1ac.RayJobSpec().
+				WithRayClusterSpec(NewRayClusterSpec(MountConfigMap[rayv1ac.RayClusterSpecApplyConfiguration](jobs, "/home/ray/jobs"))).
+				WithEntrypoint("python /home/ray/jobs/counter.py").
+				WithRuntimeEnvYAML(`
+env_vars:
+  counter_name: test_counter
+`).
+				WithShutdownAfterJobFinishes(false).
+				WithDeletionStrategy(rayv1ac.DeletionStrategy().
+					WithDeletionRules(
+						rayv1ac.DeletionRule().
+							WithPolicy(rayv1.DeleteCluster).
+							WithCondition(rayv1ac.DeletionCondition().
+								WithJobStatus(rayv1.JobStatusSucceeded).
+								WithTTLSeconds(10)),
+					)).
+				WithSubmitterPodTemplate(JobSubmitterPodTemplateApplyConfiguration()))
+
+		rayJob, err := test.Client().Ray().RayV1().RayJobs(namespace.Name).Apply(test.Ctx(), rayJobAC, TestApplyOptions)
+		g.Expect(err).NotTo(HaveOccurred())
+		LogWithTimestamp(test.T(), "Created RayJob %s/%s successfully", rayJob.Namespace, rayJob.Name)
+
+		// Wait for job to complete successfully
+		g.Eventually(RayJob(test, rayJob.Namespace, rayJob.Name), TestTimeoutMedium).
+			Should(WithTransform(RayJobStatus, Equal(rayv1.JobStatusSucceeded)))
+		LogWithTimestamp(test.T(), "RayJob %s/%s completed successfully", rayJob.Namespace, rayJob.Name)
+
+		// Get the associated RayCluster name (early assertion for clearer diagnostics)
+		rayJob, err = GetRayJob(test, rayJob.Namespace, rayJob.Name)
+		g.Expect(err).NotTo(HaveOccurred())
+		rayClusterName := rayJob.Status.RayClusterName
+		g.Expect(rayClusterName).NotTo(BeEmpty())
+
+		// Verify cluster exists initially
+		g.Eventually(RayCluster(test, namespace.Name, rayClusterName), TestTimeoutShort).
+			Should(WithTransform(RayClusterState, Equal(rayv1.Ready)))
+
+		// Wait for TTL to expire and cluster to be deleted
+		LogWithTimestamp(test.T(), "Waiting for TTL to expire and cluster to be deleted...")
+		g.Eventually(func() error {
+			_, err := GetRayCluster(test, namespace.Name, rayClusterName)
+			return err
+		}, TestTimeoutMedium).Should(WithTransform(k8serrors.IsNotFound, BeTrue()))
+		LogWithTimestamp(test.T(), "RayCluster deleted successfully")
+
+		// Verify RayJob still exists
+		jobObj, err := GetRayJob(test, rayJob.Namespace, rayJob.Name)
+		g.Expect(err).NotTo(HaveOccurred())
+		g.Expect(jobObj).NotTo(BeNil())
+		LogWithTimestamp(test.T(), "RayJob preserved as expected")
+
+		// Cleanup: delete RayJob (cluster already deleted by policy)
+		LogWithTimestamp(test.T(), "Cleaning up RayJob %s/%s after DeleteCluster scenario", jobObj.Namespace, jobObj.Name)
+		err = test.Client().Ray().RayV1().RayJobs(jobObj.Namespace).Delete(test.Ctx(), jobObj.Name, metav1.DeleteOptions{})
+		g.Expect(err).NotTo(HaveOccurred())
+		g.Eventually(func() error { _, err := GetRayJob(test, jobObj.Namespace, jobObj.Name); return err }, TestTimeoutMedium).Should(WithTransform(k8serrors.IsNotFound, BeTrue()))
+		LogWithTimestamp(test.T(), "Cleanup after DeleteCluster scenario complete")
+	})
+
+	test.T().Run("DeletionRules with DeleteSelf policy should delete RayJob and cluster", func(_ *testing.T) {
+		rayJobAC := rayv1ac.RayJob("delete-self-test", namespace.Name).
+			WithSpec(rayv1ac.RayJobSpec().
+				WithRayClusterSpec(NewRayClusterSpec(MountConfigMap[rayv1ac.RayClusterSpecApplyConfiguration](jobs, "/home/ray/jobs"))).
+				WithEntrypoint("python /home/ray/jobs/counter.py").
+				WithRuntimeEnvYAML(`
+env_vars:
+  counter_name: test_counter
+`).
+				WithShutdownAfterJobFinishes(false).
+				WithDeletionStrategy(rayv1ac.DeletionStrategy().
+					WithDeletionRules(
+						rayv1ac.DeletionRule().
+							WithPolicy(rayv1.DeleteSelf).
+							WithCondition(rayv1ac.DeletionCondition().
+								WithJobStatus(rayv1.JobStatusSucceeded).
+								WithTTLSeconds(10)),
+					)).
+				WithSubmitterPodTemplate(JobSubmitterPodTemplateApplyConfiguration()))
+
+		rayJob, err := test.Client().Ray().RayV1().RayJobs(namespace.Name).Apply(test.Ctx(), rayJobAC, TestApplyOptions)
+		g.Expect(err).NotTo(HaveOccurred())
+		LogWithTimestamp(test.T(), "Created RayJob %s/%s successfully", rayJob.Namespace, rayJob.Name)
+
+		// Wait for job to complete successfully
+		g.Eventually(RayJob(test, rayJob.Namespace, rayJob.Name), TestTimeoutMedium).
+			Should(WithTransform(RayJobStatus, Equal(rayv1.JobStatusSucceeded)))
+		LogWithTimestamp(test.T(), "RayJob %s/%s completed successfully", rayJob.Namespace, rayJob.Name)
+
+		// Get the associated RayCluster name before verifying deletion sequence
+		rayJob, err = GetRayJob(test, rayJob.Namespace, rayJob.Name)
+		g.Expect(err).NotTo(HaveOccurred())
+		rayClusterName := rayJob.Status.RayClusterName
+		g.Expect(rayClusterName).NotTo(BeEmpty())
+
+		// Wait for TTL to expire and RayJob (and cluster) to be deleted
+		LogWithTimestamp(test.T(), "Waiting for TTL to expire and RayJob to be deleted...")
+		g.Eventually(func() error {
+			_, err := GetRayJob(test, rayJob.Namespace, rayJob.Name)
+			return err
+		}, TestTimeoutMedium).Should(WithTransform(k8serrors.IsNotFound, BeTrue()))
+		LogWithTimestamp(test.T(), "RayJob deleted successfully")
+
+		// Verify associated cluster is also deleted
+		g.Eventually(func() error {
+			_, err := GetRayCluster(test, namespace.Name, rayClusterName)
+			return err
+		}, TestTimeoutMedium).Should(WithTransform(k8serrors.IsNotFound, BeTrue()))
+		LogWithTimestamp(test.T(), "Associated RayCluster deleted successfully")
+	})
+
+	test.T().Run("DeletionRules with DeleteNone policy should preserve all resources", func(_ *testing.T) {
+		rayJobAC := rayv1ac.RayJob("delete-none-test", namespace.Name).
+			WithSpec(rayv1ac.RayJobSpec().
+				WithRayClusterSpec(NewRayClusterSpec(MountConfigMap[rayv1ac.RayClusterSpecApplyConfiguration](jobs, "/home/ray/jobs"))).
+				WithEntrypoint("python /home/ray/jobs/counter.py").
+				WithRuntimeEnvYAML(`
+env_vars:
+  counter_name: test_counter
+`).
+				WithShutdownAfterJobFinishes(false).
+				WithDeletionStrategy(rayv1ac.DeletionStrategy().
+					WithDeletionRules(
+						rayv1ac.DeletionRule().
+							WithPolicy(rayv1.DeleteNone).
+							WithCondition(rayv1ac.DeletionCondition().
+								WithJobStatus(rayv1.JobStatusSucceeded).
+								WithTTLSeconds(5)), // Shorter TTL since we're testing preservation
+					)).
+				WithSubmitterPodTemplate(JobSubmitterPodTemplateApplyConfiguration()))
+
+		rayJob, err := test.Client().Ray().RayV1().RayJobs(namespace.Name).Apply(test.Ctx(), rayJobAC, TestApplyOptions)
+		g.Expect(err).NotTo(HaveOccurred())
+		LogWithTimestamp(test.T(), "Created RayJob %s/%s successfully", rayJob.Namespace, rayJob.Name)
+
+		// Wait for job to complete successfully
+		g.Eventually(RayJob(test, rayJob.Namespace, rayJob.Name), TestTimeoutMedium).
+			Should(WithTransform(RayJobStatus, Equal(rayv1.JobStatusSucceeded)))
+		LogWithTimestamp(test.T(), "RayJob %s/%s completed successfully", rayJob.Namespace, rayJob.Name)
+
+		// Get the associated RayCluster name (assert early for clarity)
+		rayJob, err = GetRayJob(test, rayJob.Namespace, rayJob.Name)
+		g.Expect(err).NotTo(HaveOccurred())
+		rayClusterName := rayJob.Status.RayClusterName
+		g.Expect(rayClusterName).NotTo(BeEmpty())
+
+		// Wait well past the TTL and verify everything is preserved
+		LogWithTimestamp(test.T(), "Waiting past TTL to verify resources are preserved...")
+		g.Consistently(func(gg Gomega) {
+			jobObj, err := GetRayJob(test, rayJob.Namespace, rayJob.Name)
+			gg.Expect(err).NotTo(HaveOccurred())
+			gg.Expect(jobObj).NotTo(BeNil())
+			cluster, err := GetRayCluster(test, namespace.Name, rayClusterName)
+			gg.Expect(err).NotTo(HaveOccurred())
+			gg.Expect(cluster).NotTo(BeNil())
+			workerPods, err := GetWorkerPods(test, cluster)
+			gg.Expect(err).NotTo(HaveOccurred())
+			gg.Expect(workerPods).ToNot(BeEmpty())
+		}, 10*time.Second, 2*time.Second).Should(Succeed())
+		LogWithTimestamp(test.T(), "All resources preserved as expected with DeleteNone policy")
+
+		// Cleanup: delete RayJob to release cluster and pods
+		LogWithTimestamp(test.T(), "Cleaning up RayJob %s/%s after DeleteNone scenario", rayJob.Namespace, rayJob.Name)
+		err = test.Client().Ray().RayV1().RayJobs(rayJob.Namespace).Delete(test.Ctx(), rayJob.Name, metav1.DeleteOptions{})
+		g.Expect(err).NotTo(HaveOccurred())
+		g.Eventually(func() error { _, err := GetRayJob(test, rayJob.Namespace, rayJob.Name); return err }, TestTimeoutMedium).Should(WithTransform(k8serrors.IsNotFound, BeTrue()))
+		g.Eventually(func() error {
+			_, err := GetRayCluster(test, namespace.Name, rayClusterName)
+			return err
+		}, TestTimeoutMedium).Should(WithTransform(k8serrors.IsNotFound, BeTrue()))
+		LogWithTimestamp(test.T(), "Cleanup after DeleteNone scenario complete")
+	})
+
+	test.T().Run("Multi-stage deletion should execute in TTL order: Workers->Cluster->Self", func(_ *testing.T) {
+		rayJobAC := rayv1ac.RayJob("multi-stage-test", namespace.Name).
+			WithSpec(rayv1ac.RayJobSpec().
+				WithRayClusterSpec(NewRayClusterSpec(MountConfigMap[rayv1ac.RayClusterSpecApplyConfiguration](jobs, "/home/ray/jobs"))).
+				WithEntrypoint("python /home/ray/jobs/counter.py").
+				WithRuntimeEnvYAML(`
+env_vars:
+  counter_name: test_counter
+`).
+				WithShutdownAfterJobFinishes(false).
+				WithDeletionStrategy(rayv1ac.DeletionStrategy().
+					WithDeletionRules(
+						rayv1ac.DeletionRule().
+							WithPolicy(rayv1.DeleteWorkers).
+							WithCondition(rayv1ac.DeletionCondition().
+								WithJobStatus(rayv1.JobStatusSucceeded).
+								WithTTLSeconds(15)), // Increased spacing for reliability
+						rayv1ac.DeletionRule().
+							WithPolicy(rayv1.DeleteCluster).
+							WithCondition(rayv1ac.DeletionCondition().
+								WithJobStatus(rayv1.JobStatusSucceeded).
+								WithTTLSeconds(35)), // 20s gap between stages
+						rayv1ac.DeletionRule().
+							WithPolicy(rayv1.DeleteSelf).
+							WithCondition(rayv1ac.DeletionCondition().
+								WithJobStatus(rayv1.JobStatusSucceeded).
+								WithTTLSeconds(55)), // 20s gap between stages
+					)).
+				WithSubmitterPodTemplate(JobSubmitterPodTemplateApplyConfiguration()))
+
+		rayJob, err := test.Client().Ray().RayV1().RayJobs(namespace.Name).Apply(test.Ctx(), rayJobAC, TestApplyOptions)
+		g.Expect(err).NotTo(HaveOccurred())
+		LogWithTimestamp(test.T(), "Created RayJob %s/%s successfully", rayJob.Namespace, rayJob.Name)
+
+		// Wait for job to complete successfully
+		g.Eventually(RayJob(test, rayJob.Namespace, rayJob.Name), TestTimeoutMedium).
+			Should(WithTransform(RayJobStatus, Equal(rayv1.JobStatusSucceeded)))
+		LogWithTimestamp(test.T(), "RayJob %s/%s completed successfully", rayJob.Namespace, rayJob.Name)
+
+		// Get the associated RayCluster name (early assertion ensures meaningful failure)
+		rayJob, err = GetRayJob(test, rayJob.Namespace, rayJob.Name)
+		g.Expect(err).NotTo(HaveOccurred())
+		rayClusterName := rayJob.Status.RayClusterName
+		g.Expect(rayClusterName).NotTo(BeEmpty())
+
+		// Verify cluster is ready initially
+		g.Eventually(RayCluster(test, namespace.Name, rayClusterName), TestTimeoutShort).
+			Should(WithTransform(RayClusterState, Equal(rayv1.Ready)))
+
+		// Verify all resources exist before any TTL expires (first 12 seconds)
+		LogWithTimestamp(test.T(), "Verifying all resources persist before any TTL expires...")
+		g.Consistently(func(gg Gomega) {
+			cluster, err := GetRayCluster(test, namespace.Name, rayClusterName)
+			gg.Expect(err).NotTo(HaveOccurred())
+			gg.Expect(cluster).NotTo(BeNil())
+			workerPods, err := GetWorkerPods(test, cluster)
+			gg.Expect(err).NotTo(HaveOccurred())
+			gg.Expect(workerPods).ToNot(BeEmpty())
+			headPod, err := GetHeadPod(test, cluster)
+			gg.Expect(err).NotTo(HaveOccurred())
+			gg.Expect(headPod).NotTo(BeNil())
+			jobObj, err := GetRayJob(test, rayJob.Namespace, rayJob.Name)
+			gg.Expect(err).NotTo(HaveOccurred())
+			gg.Expect(jobObj).NotTo(BeNil())
+		}, 12*time.Second, 2*time.Second).Should(Succeed())
+		LogWithTimestamp(test.T(), "All resources confirmed stable before TTL expiration")
+
+		// Stage 1: Wait for workers to be deleted (15s TTL)
+		LogWithTimestamp(test.T(), "Stage 1: Waiting for workers to be deleted at 15s...")
+		g.Eventually(func(gg Gomega) {
+			cluster, err := GetRayCluster(test, namespace.Name, rayClusterName)
+			gg.Expect(err).NotTo(HaveOccurred())
+			gg.Expect(cluster).NotTo(BeNil())
+			workerPods, err := GetWorkerPods(test, cluster)
+			gg.Expect(err).NotTo(HaveOccurred())
+			gg.Expect(workerPods).To(BeEmpty())
+		}, TestTimeoutMedium).Should(Succeed())
+		LogWithTimestamp(test.T(), "Stage 1 complete: Workers deleted successfully")
+
+		// Verify cluster and job still exist after stage 1
+		job, err := GetRayJob(test, rayJob.Namespace, rayJob.Name)
+		g.Expect(err).NotTo(HaveOccurred())
+		g.Expect(job).NotTo(BeNil())
+		cluster, err := GetRayCluster(test, namespace.Name, rayClusterName)
+		g.Expect(err).NotTo(HaveOccurred())
+		headPod, err := GetHeadPod(test, cluster)
+		g.Expect(err).NotTo(HaveOccurred())
+		g.Expect(headPod).NotTo(BeNil())
+
+		// Verify cluster persists during stage 2 wait period (15 seconds of 20s gap)
+		LogWithTimestamp(test.T(), "Verifying cluster persists before stage 2 TTL expires...")
+		g.Consistently(func(gg Gomega) {
+			cluster, err := GetRayCluster(test, namespace.Name, rayClusterName)
+			gg.Expect(err).NotTo(HaveOccurred())
+			gg.Expect(cluster).NotTo(BeNil())
+			headPod, err := GetHeadPod(test, cluster)
+			gg.Expect(err).NotTo(HaveOccurred())
+			gg.Expect(headPod).NotTo(BeNil())
+			jobObj, err := GetRayJob(test, rayJob.Namespace, rayJob.Name)
+			gg.Expect(err).NotTo(HaveOccurred())
+			gg.Expect(jobObj).NotTo(BeNil())
+		}, 15*time.Second, 2*time.Second).Should(Succeed())
+		LogWithTimestamp(test.T(), "Cluster and job confirmed stable before stage 2 TTL")
+
+		// Stage 2: Wait for cluster to be deleted (35s TTL)
+		LogWithTimestamp(test.T(), "Stage 2: Waiting for cluster to be deleted at 35s...")
+		g.Eventually(func() error {
+			_, err := GetRayCluster(test, namespace.Name, rayClusterName)
+			return err
+		}, TestTimeoutMedium).Should(WithTransform(k8serrors.IsNotFound, BeTrue()))
+		LogWithTimestamp(test.T(), "Stage 2 complete: Cluster deleted successfully")
+
+		// Verify job still exists after stage 2
+		job, err = GetRayJob(test, rayJob.Namespace, rayJob.Name)
+		g.Expect(err).NotTo(HaveOccurred())
+		g.Expect(job).NotTo(BeNil())
+
+		// Verify job persists during stage 3 wait period (15 seconds of 20s gap)
+		LogWithTimestamp(test.T(), "Verifying RayJob persists before stage 3 TTL expires...")
+		g.Consistently(func(gg Gomega) {
+			jobObj, err := GetRayJob(test, rayJob.Namespace, rayJob.Name)
+			gg.Expect(err).NotTo(HaveOccurred())
+			gg.Expect(jobObj).NotTo(BeNil())
+		}, 15*time.Second, 2*time.Second).Should(Succeed())
+		LogWithTimestamp(test.T(), "RayJob confirmed stable before stage 3 TTL")
+
+		// Stage 3: Wait for job to be deleted (55s TTL)
+		LogWithTimestamp(test.T(), "Stage 3: Waiting for RayJob to be deleted at 55s...")
+		g.Eventually(func() error {
+			_, err := GetRayJob(test, rayJob.Namespace, rayJob.Name)
+			return err
+		}, TestTimeoutMedium).Should(WithTransform(k8serrors.IsNotFound, BeTrue()))
+		LogWithTimestamp(test.T(), "Stage 3 complete: RayJob deleted successfully")
+		LogWithTimestamp(test.T(), "Multi-stage deletion completed in correct order")
+	})
+
+	test.T().Run("Legacy OnSuccess DeleteCluster should still work", func(_ *testing.T) {
+		rayJobAC := rayv1ac.RayJob("legacy-success-test", namespace.Name).
+			WithSpec(rayv1ac.RayJobSpec().
+				WithRayClusterSpec(NewRayClusterSpec(MountConfigMap[rayv1ac.RayClusterSpecApplyConfiguration](jobs, "/home/ray/jobs"))).
+				WithEntrypoint("python /home/ray/jobs/counter.py").
+				WithRuntimeEnvYAML(`
+env_vars:
+  counter_name: test_counter
+`).
+				WithShutdownAfterJobFinishes(true).
+				WithTTLSecondsAfterFinished(10). // Legacy TTL for backward compatibility
+				WithDeletionStrategy(rayv1ac.DeletionStrategy().
+					WithOnSuccess(rayv1ac.DeletionPolicy().
+						WithPolicy(rayv1.DeleteCluster)).
+					WithOnFailure(rayv1ac.DeletionPolicy().
+						WithPolicy(rayv1.DeleteCluster))).
+				WithSubmitterPodTemplate(JobSubmitterPodTemplateApplyConfiguration()))
+
+		rayJob, err := test.Client().Ray().RayV1().RayJobs(namespace.Name).Apply(test.Ctx(), rayJobAC, TestApplyOptions)
+		g.Expect(err).NotTo(HaveOccurred())
+		LogWithTimestamp(test.T(), "Created legacy RayJob %s/%s successfully", rayJob.Namespace, rayJob.Name)
+
+		// Wait for job to complete successfully
+		g.Eventually(RayJob(test, rayJob.Namespace, rayJob.Name), TestTimeoutMedium).
+			Should(WithTransform(RayJobStatus, Equal(rayv1.JobStatusSucceeded)))
+		LogWithTimestamp(test.T(), "RayJob %s/%s completed successfully", rayJob.Namespace, rayJob.Name)
+
+		// Get the associated RayCluster name (legacy path; same early assertion rationale)
+		rayJob, err = GetRayJob(test, rayJob.Namespace, rayJob.Name)
+		g.Expect(err).NotTo(HaveOccurred())
+		rayClusterName := rayJob.Status.RayClusterName
+		g.Expect(rayClusterName).NotTo(BeEmpty())
+
+		// Wait for cluster to be deleted due to OnSuccess policy
+		LogWithTimestamp(test.T(), "Waiting for legacy OnSuccess policy to delete cluster...")
+		g.Eventually(func() error {
+			_, err := GetRayCluster(test, namespace.Name, rayClusterName)
+			return err
+		}, TestTimeoutMedium).Should(WithTransform(k8serrors.IsNotFound, BeTrue()))
+		LogWithTimestamp(test.T(), "Cluster deleted by legacy OnSuccess policy")
+
+		// Verify RayJob still exists
+		job, err := GetRayJob(test, rayJob.Namespace, rayJob.Name)
+		g.Expect(err).NotTo(HaveOccurred())
+		g.Expect(job).NotTo(BeNil())
+		LogWithTimestamp(test.T(), "Legacy OnSuccess policy working correctly")
+
+		// Cleanup: delete legacy RayJob (cluster already deleted)
+		LogWithTimestamp(test.T(), "Cleaning up legacy success RayJob %s/%s", job.Namespace, job.Name)
+		err = test.Client().Ray().RayV1().RayJobs(job.Namespace).Delete(test.Ctx(), job.Name, metav1.DeleteOptions{})
+		g.Expect(err).NotTo(HaveOccurred())
+		g.Eventually(func() error { _, err := GetRayJob(test, job.Namespace, job.Name); return err }, TestTimeoutMedium).Should(WithTransform(k8serrors.IsNotFound, BeTrue()))
+		LogWithTimestamp(test.T(), "Cleanup after legacy success scenario complete")
+	})
+}