From bc17214e8d1c1502516ece12882d50d452a3b1e2 Mon Sep 17 00:00:00 2001 From: wei-chenglai Date: Wed, 3 Sep 2025 04:40:20 +0000 Subject: [PATCH 01/21] [CRD][RayJob] Define new DeletionStrategy in RayJob CRD Signed-off-by: wei-chenglai --- docs/reference/api.md | 66 +++- .../kuberay-operator/crds/ray.io_rayjobs.yaml | 65 +++- ray-operator/apis/ray/v1/rayjob_types.go | 79 +++- .../apis/ray/v1/zz_generated.deepcopy.go | 36 ++ .../config/crd/bases/ray.io_rayjobs.yaml | 65 +++- .../controllers/ray/rayjob_controller.go | 361 ++++++++++++++---- .../controllers/ray/utils/validation.go | 212 ++++++++-- .../controllers/ray/utils/validation_test.go | 239 ++++++++++++ .../ray/v1/deletioncondition.go | 36 ++ .../applyconfiguration/ray/v1/deletionrule.go | 36 ++ .../ray/v1/deletionstrategy.go | 18 +- .../pkg/client/applyconfiguration/utils.go | 4 + 12 files changed, 1052 insertions(+), 165 deletions(-) create mode 100644 ray-operator/pkg/client/applyconfiguration/ray/v1/deletioncondition.go create mode 100644 ray-operator/pkg/client/applyconfiguration/ray/v1/deletionrule.go diff --git a/docs/reference/api.md b/docs/reference/api.md index 4b495fef69e..4d3a87a9bce 100644 --- a/docs/reference/api.md +++ b/docs/reference/api.md @@ -55,12 +55,29 @@ _Appears in:_ -#### DeletionPolicy +#### DeletionCondition + +DeletionCondition specifies the trigger conditions for a deletion action. +_Appears in:_ +- [DeletionRule](#deletionrule) + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `ttlSecondsAfterFinished` _integer_ | TTLSecondsAfterFinished is the time in seconds from when the JobStatus
reaches the specified terminal state to when this deletion action should be triggered.
The value must be a non-negative integer. | 0 | Minimum: 0
| + + +#### DeletionPolicy + + + +DeletionPolicy is the legacy single-stage deletion policy. +Deprecated: This struct is part of the legacy API. Use DeletionRule for new configurations. + _Appears in:_ @@ -68,7 +85,7 @@ _Appears in:_ | Field | Description | Default | Validation | | --- | --- | --- | --- | -| `policy` _[DeletionPolicyType](#deletionpolicytype)_ | Valid values are 'DeleteCluster', 'DeleteWorkers', 'DeleteSelf' or 'DeleteNone'. | | | +| `policy` _[DeletionPolicyType](#deletionpolicytype)_ | Policy is the action to take when the condition is met.
This field is logically required when using the legacy OnSuccess/OnFailure policies.
It is marked as '+optional' at the API level to allow the 'deletionRules' field to be used instead. | | Enum: [DeleteCluster DeleteWorkers DeleteSelf DeleteNone]
| #### DeletionPolicyType @@ -81,14 +98,54 @@ _Underlying type:_ _string_ _Appears in:_ - [DeletionPolicy](#deletionpolicy) +- [DeletionRule](#deletionrule) + + + +#### DeletionRule + +DeletionRule defines a single deletion action and its trigger condition. +This is the new, recommended way to define deletion behavior. + + + +_Appears in:_ +- [DeletionStrategy](#deletionstrategy) + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `policy` _[DeletionPolicyType](#deletionpolicytype)_ | Policy is the action to take when the condition is met. This field is required. | | Enum: [DeleteCluster DeleteWorkers DeleteSelf DeleteNone]
| +| `condition` _[DeletionCondition](#deletioncondition)_ | The condition under which this deletion rule is triggered. This field is required. | | | + #### DeletionStrategy +DeletionStrategy defines the deletion policies for a RayJob. +It allows for fine-grained control over resource cleanup after a job finishes. + + +Legacy fields `onSuccess` and `onFailure` are still supported for backward compatibility, +but it is highly recommended to migrate to the new `deletionRules` field. + + +Notes: + - When this block is set, you must configure **either** + (a) BOTH `onSuccess` and `onFailure` policies, + OR + (b) the `deletionRules` field (which may be empty, in which case no deletion will occur). + - `onSuccess` / `onFailure` must NOT be used together with `deletionRules`. + - `onSuccess` and `onFailure` are **deprecated** and planned for removal in a future release. + + +Validation rules: + 1. Prevent mixing legacy and new fields + + 2. Require either both legacy fields or deletionRules presence @@ -97,8 +154,9 @@ _Appears in:_ | Field | Description | Default | Validation | | --- | --- | --- | --- | -| `onSuccess` _[DeletionPolicy](#deletionpolicy)_ | | | | -| `onFailure` _[DeletionPolicy](#deletionpolicy)_ | | | | +| `onSuccess` _[DeletionPolicy](#deletionpolicy)_ | OnSuccess is the deletion policy for a successful RayJob.
Deprecated: Use `deletionRules` instead for more flexible, multi-stage deletion strategies.
This field will be removed in a future release. | | | +| `onFailure` _[DeletionPolicy](#deletionpolicy)_ | OnFailure is the deletion policy for a failed RayJob.
Deprecated: Use `deletionRules` instead for more flexible, multi-stage deletion strategies.
This field will be removed in a future release. | | | +| `deletionRules` _[DeletionRule](#deletionrule) array_ | DeletionRules is a list of deletion rules, processed based on their trigger conditions.
While the rules can be used to define a sequence, if multiple rules are overdue (e.g., due to controller downtime),
the most impactful rule (e.g., DeleteCluster) will be executed first to prioritize resource cleanup and cost savings. | | | diff --git a/helm-chart/kuberay-operator/crds/ray.io_rayjobs.yaml b/helm-chart/kuberay-operator/crds/ray.io_rayjobs.yaml index 8f8679ca607..15e15996f7b 100644 --- a/helm-chart/kuberay-operator/crds/ray.io_rayjobs.yaml +++ b/helm-chart/kuberay-operator/crds/ray.io_rayjobs.yaml @@ -60,34 +60,65 @@ spec: type: object deletionStrategy: properties: + deletionRules: + items: + properties: + condition: + properties: + jobStatus: + enum: + - SUCCEEDED + - FAILED + type: string + ttlSecondsAfterFinished: + default: 0 + format: int32 + minimum: 0 + type: integer + required: + - jobStatus + type: object + policy: + enum: + - DeleteCluster + - DeleteWorkers + - DeleteSelf + - DeleteNone + type: string + required: + - condition + - policy + type: object + type: array + x-kubernetes-list-type: atomic onFailure: properties: policy: + enum: + - DeleteCluster + - DeleteWorkers + - DeleteSelf + - DeleteNone type: string - x-kubernetes-validations: - - message: the policy field value must be either 'DeleteCluster', - 'DeleteWorkers', 'DeleteSelf', or 'DeleteNone' - rule: self in ['DeleteCluster', 'DeleteWorkers', 'DeleteSelf', - 'DeleteNone'] - required: - - policy type: object onSuccess: properties: policy: + enum: + - DeleteCluster + - DeleteWorkers + - DeleteSelf + - DeleteNone type: string - x-kubernetes-validations: - - message: the policy field value must be either 'DeleteCluster', - 'DeleteWorkers', 'DeleteSelf', or 'DeleteNone' - rule: self in ['DeleteCluster', 'DeleteWorkers', 'DeleteSelf', - 'DeleteNone'] - required: - - policy type: object - required: - - onFailure - - onSuccess type: object + x-kubernetes-validations: + - message: legacy policies (onSuccess/onFailure) and deletionRules + cannot be used together within the same deletionStrategy + rule: '!((has(self.onSuccess) || has(self.onFailure)) && has(self.deletionRules))' + - message: deletionStrategy requires either BOTH onSuccess and onFailure, + OR the deletionRules field (which may be empty) + rule: ((has(self.onSuccess) && has(self.onFailure)) || has(self.deletionRules)) entrypoint: type: string entrypointNumCpus: diff --git a/ray-operator/apis/ray/v1/rayjob_types.go b/ray-operator/apis/ray/v1/rayjob_types.go index 54a2ef7bce2..683ff6434b3 100644 --- a/ray-operator/apis/ray/v1/rayjob_types.go +++ b/ray-operator/apis/ray/v1/rayjob_types.go @@ -87,15 +87,84 @@ const ( type DeletionPolicyType string +// DeletionStrategy defines the deletion policies for a RayJob. +// It allows for fine-grained control over resource cleanup after a job finishes. +// +// Legacy fields `onSuccess` and `onFailure` are still supported for backward compatibility, +// but it is highly recommended to migrate to the new `deletionRules` field. +// +// Notes: +// - When this block is set, you must configure **either** +// (a) BOTH `onSuccess` and `onFailure` policies, +// OR +// (b) the `deletionRules` field (which may be empty, in which case no deletion will occur). +// - `onSuccess` / `onFailure` must NOT be used together with `deletionRules`. +// - `onSuccess` and `onFailure` are **deprecated** and planned for removal in a future release. +// +// Validation rules: +// 1. 
Prevent mixing legacy and new fields +// +// +kubebuilder:validation:XValidation:rule="!((has(self.onSuccess) || has(self.onFailure)) && has(self.deletionRules))",message="legacy policies (onSuccess/onFailure) and deletionRules cannot be used together within the same deletionStrategy" +// 2. Require either both legacy fields or deletionRules presence +// +// +kubebuilder:validation:XValidation:rule="((has(self.onSuccess) && has(self.onFailure)) || has(self.deletionRules))",message="deletionStrategy requires either BOTH onSuccess and onFailure, OR the deletionRules field (which may be empty)" type DeletionStrategy struct { - OnSuccess DeletionPolicy `json:"onSuccess"` - OnFailure DeletionPolicy `json:"onFailure"` + // OnSuccess is the deletion policy for a successful RayJob. + // Deprecated: Use `deletionRules` instead for more flexible, multi-stage deletion strategies. + // This field will be removed in a future release. + // +optional + OnSuccess DeletionPolicy `json:"onSuccess,omitempty"` + + // OnFailure is the deletion policy for a failed RayJob. + // Deprecated: Use `deletionRules` instead for more flexible, multi-stage deletion strategies. + // This field will be removed in a future release. + // +optional + OnFailure DeletionPolicy `json:"onFailure,omitempty"` + + // DeletionRules is a list of deletion rules, processed based on their trigger conditions. + // While the rules can be used to define a sequence, if multiple rules are overdue (e.g., due to controller downtime), + // the most impactful rule (e.g., DeleteCluster) will be executed first to prioritize resource cleanup and cost savings. + // +optional + // +listType=atomic + DeletionRules []DeletionRule `json:"deletionRules,omitempty"` } +// DeletionRule defines a single deletion action and its trigger condition. +// This is the new, recommended way to define deletion behavior. +type DeletionRule struct { + // Policy is the action to take when the condition is met. This field is required. + // +kubebuilder:validation:Enum=DeleteCluster;DeleteWorkers;DeleteSelf;DeleteNone + Policy DeletionPolicyType `json:"policy"` + + // The condition under which this deletion rule is triggered. This field is required. + Condition DeletionCondition `json:"condition"` +} + +// DeletionCondition specifies the trigger conditions for a deletion action. +type DeletionCondition struct { + // JobStatus is the terminal status of the RayJob that triggers this condition. This field is required. + // For the initial implementation, only "SUCCEEDED" and "FAILED" are supported. + // +kubebuilder:validation:Enum=SUCCEEDED;FAILED + JobStatus JobStatus `json:"jobStatus"` + + // TTLSecondsAfterFinished is the time in seconds from when the JobStatus + // reaches the specified terminal state to when this deletion action should be triggered. + // The value must be a non-negative integer. + // +kubebuilder:default=0 + // +kubebuilder:validation:Minimum=0 + // +optional + TTLSecondsAfterFinished int32 `json:"ttlSecondsAfterFinished,omitempty"` +} + +// DeletionPolicy is the legacy single-stage deletion policy. +// Deprecated: This struct is part of the legacy API. Use DeletionRule for new configurations. type DeletionPolicy struct { - // Valid values are 'DeleteCluster', 'DeleteWorkers', 'DeleteSelf' or 'DeleteNone'. 
- // +kubebuilder:validation:XValidation:rule="self in ['DeleteCluster', 'DeleteWorkers', 'DeleteSelf', 'DeleteNone']",message="the policy field value must be either 'DeleteCluster', 'DeleteWorkers', 'DeleteSelf', or 'DeleteNone'" - Policy *DeletionPolicyType `json:"policy"` + // Policy is the action to take when the condition is met. + // This field is logically required when using the legacy OnSuccess/OnFailure policies. + // It is marked as '+optional' at the API level to allow the 'deletionRules' field to be used instead. + // +kubebuilder:validation:Enum=DeleteCluster;DeleteWorkers;DeleteSelf;DeleteNone + // +optional + Policy *DeletionPolicyType `json:"policy,omitempty"` } const ( diff --git a/ray-operator/apis/ray/v1/zz_generated.deepcopy.go b/ray-operator/apis/ray/v1/zz_generated.deepcopy.go index b4cb5decf12..d548d944c8b 100644 --- a/ray-operator/apis/ray/v1/zz_generated.deepcopy.go +++ b/ray-operator/apis/ray/v1/zz_generated.deepcopy.go @@ -103,6 +103,21 @@ func (in *AutoscalerOptions) DeepCopy() *AutoscalerOptions { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *DeletionCondition) DeepCopyInto(out *DeletionCondition) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DeletionCondition. +func (in *DeletionCondition) DeepCopy() *DeletionCondition { + if in == nil { + return nil + } + out := new(DeletionCondition) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *DeletionPolicy) DeepCopyInto(out *DeletionPolicy) { *out = *in @@ -123,11 +138,32 @@ func (in *DeletionPolicy) DeepCopy() *DeletionPolicy { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *DeletionRule) DeepCopyInto(out *DeletionRule) { + *out = *in + out.Condition = in.Condition +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DeletionRule. +func (in *DeletionRule) DeepCopy() *DeletionRule { + if in == nil { + return nil + } + out := new(DeletionRule) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *DeletionStrategy) DeepCopyInto(out *DeletionStrategy) { *out = *in in.OnSuccess.DeepCopyInto(&out.OnSuccess) in.OnFailure.DeepCopyInto(&out.OnFailure) + if in.DeletionRules != nil { + in, out := &in.DeletionRules, &out.DeletionRules + *out = make([]DeletionRule, len(*in)) + copy(*out, *in) + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DeletionStrategy. 
diff --git a/ray-operator/config/crd/bases/ray.io_rayjobs.yaml b/ray-operator/config/crd/bases/ray.io_rayjobs.yaml index 8f8679ca607..15e15996f7b 100644 --- a/ray-operator/config/crd/bases/ray.io_rayjobs.yaml +++ b/ray-operator/config/crd/bases/ray.io_rayjobs.yaml @@ -60,34 +60,65 @@ spec: type: object deletionStrategy: properties: + deletionRules: + items: + properties: + condition: + properties: + jobStatus: + enum: + - SUCCEEDED + - FAILED + type: string + ttlSecondsAfterFinished: + default: 0 + format: int32 + minimum: 0 + type: integer + required: + - jobStatus + type: object + policy: + enum: + - DeleteCluster + - DeleteWorkers + - DeleteSelf + - DeleteNone + type: string + required: + - condition + - policy + type: object + type: array + x-kubernetes-list-type: atomic onFailure: properties: policy: + enum: + - DeleteCluster + - DeleteWorkers + - DeleteSelf + - DeleteNone type: string - x-kubernetes-validations: - - message: the policy field value must be either 'DeleteCluster', - 'DeleteWorkers', 'DeleteSelf', or 'DeleteNone' - rule: self in ['DeleteCluster', 'DeleteWorkers', 'DeleteSelf', - 'DeleteNone'] - required: - - policy type: object onSuccess: properties: policy: + enum: + - DeleteCluster + - DeleteWorkers + - DeleteSelf + - DeleteNone type: string - x-kubernetes-validations: - - message: the policy field value must be either 'DeleteCluster', - 'DeleteWorkers', 'DeleteSelf', or 'DeleteNone' - rule: self in ['DeleteCluster', 'DeleteWorkers', 'DeleteSelf', - 'DeleteNone'] - required: - - policy type: object - required: - - onFailure - - onSuccess type: object + x-kubernetes-validations: + - message: legacy policies (onSuccess/onFailure) and deletionRules + cannot be used together within the same deletionStrategy + rule: '!((has(self.onSuccess) || has(self.onFailure)) && has(self.deletionRules))' + - message: deletionStrategy requires either BOTH onSuccess and onFailure, + OR the deletionRules field (which may be empty) + rule: ((has(self.onSuccess) && has(self.onFailure)) || has(self.deletionRules)) entrypoint: type: string entrypointNumCpus: diff --git a/ray-operator/controllers/ray/rayjob_controller.go b/ray-operator/controllers/ray/rayjob_controller.go index 64ba470f3ca..7cb41681ce9 100644 --- a/ray-operator/controllers/ray/rayjob_controller.go +++ b/ray-operator/controllers/ray/rayjob_controller.go @@ -363,89 +363,8 @@ func (r *RayJobReconciler) Reconcile(ctx context.Context, request ctrl.Request) // TODO (kevin85421): We may not need to requeue the RayJob if it has already been suspended. return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, nil case rayv1.JobDeploymentStatusComplete, rayv1.JobDeploymentStatusFailed: - // If this RayJob uses an existing RayCluster (i.e., ClusterSelector is set), we should not delete the RayCluster. 
- ttlSeconds := rayJobInstance.Spec.TTLSecondsAfterFinished - nowTime := time.Now() - shutdownTime := rayJobInstance.Status.EndTime.Add(time.Duration(ttlSeconds) * time.Second) - logger.Info(string(rayJobInstance.Status.JobDeploymentStatus), - "ShutdownAfterJobFinishes", rayJobInstance.Spec.ShutdownAfterJobFinishes, - "ClusterSelector", rayJobInstance.Spec.ClusterSelector, - "ttlSecondsAfterFinished", ttlSeconds, - "Status.endTime", rayJobInstance.Status.EndTime, - "Now", nowTime, - "ShutdownTime", shutdownTime) - - if features.Enabled(features.RayJobDeletionPolicy) && - rayJobInstance.Spec.DeletionStrategy != nil && - len(rayJobInstance.Spec.ClusterSelector) == 0 { - - if shutdownTime.After(nowTime) { - delta := int32(time.Until(shutdownTime.Add(2 * time.Second)).Seconds()) - logger.Info("shutdownTime not reached, requeue this RayJob for n seconds", "seconds", delta) - return ctrl.Result{RequeueAfter: time.Duration(delta) * time.Second}, nil - } - - policy := rayv1.DeleteNone - if rayJobInstance.Status.JobStatus == rayv1.JobStatusSucceeded { - policy = *rayJobInstance.Spec.DeletionStrategy.OnSuccess.Policy - } else if rayJobInstance.Status.JobStatus == rayv1.JobStatusFailed { - policy = *rayJobInstance.Spec.DeletionStrategy.OnFailure.Policy - } else { - logger.Info("jobStatus not valid for deletion", "jobStatus", rayJobInstance.Status.JobStatus) - } - - // no need to continue as the selected policy is DeleteNone - if policy == rayv1.DeleteNone { - break - } - - logger.Info("Shutdown behavior is defined by the deletion policy", "deletionPolicy", rayJobInstance.Spec.DeletionStrategy) - if shutdownTime.After(nowTime) { - delta := int32(time.Until(shutdownTime.Add(2 * time.Second)).Seconds()) - logger.Info("shutdownTime not reached, requeue this RayJob for n seconds", "seconds", delta) - return ctrl.Result{RequeueAfter: time.Duration(delta) * time.Second}, nil - } - - switch policy { - case rayv1.DeleteCluster: - logger.Info("Deleting RayCluster", "RayCluster", rayJobInstance.Status.RayClusterName) - _, err = r.deleteClusterResources(ctx, rayJobInstance) - case rayv1.DeleteWorkers: - logger.Info("Suspending all worker groups", "RayCluster", rayJobInstance.Status.RayClusterName) - err = r.suspendWorkerGroups(ctx, rayJobInstance) - case rayv1.DeleteSelf: - logger.Info("Deleting RayJob") - err = r.Client.Delete(ctx, rayJobInstance) - default: - } - if err != nil { - return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, err - } - } - - if (!features.Enabled(features.RayJobDeletionPolicy) || rayJobInstance.Spec.DeletionStrategy == nil) && rayJobInstance.Spec.ShutdownAfterJobFinishes && len(rayJobInstance.Spec.ClusterSelector) == 0 { - logger.Info("Shutdown behavior is defined by the `ShutdownAfterJobFinishes` flag", "shutdownAfterJobFinishes", rayJobInstance.Spec.ShutdownAfterJobFinishes) - if shutdownTime.After(nowTime) { - delta := int32(time.Until(shutdownTime.Add(2 * time.Second)).Seconds()) - logger.Info("shutdownTime not reached, requeue this RayJob for n seconds", "seconds", delta) - return ctrl.Result{RequeueAfter: time.Duration(delta) * time.Second}, nil - } - if s := os.Getenv(utils.DELETE_RAYJOB_CR_AFTER_JOB_FINISHES); strings.ToLower(s) == "true" { - err = r.Client.Delete(ctx, rayJobInstance) - logger.Info("RayJob is deleted") - } else { - // We only need to delete the RayCluster. We don't need to delete the submitter Kubernetes Job so that users can still access - // the driver logs. In addition, a completed Kubernetes Job does not actually use any compute resources. 
- _, err = r.deleteClusterResources(ctx, rayJobInstance) - logger.Info("RayCluster is deleted", "RayCluster", rayJobInstance.Status.RayClusterName) - } - if err != nil { - return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, err - } - } - - // If the RayJob is completed, we should not requeue it. - return ctrl.Result{}, nil + // The RayJob has reached a terminal state. Handle the cleanup and deletion logic. + return r.handleFinishedRayJob(ctx, rayJobInstance) default: logger.Info("Unknown JobDeploymentStatus", "JobDeploymentStatus", rayJobInstance.Status.JobDeploymentStatus) return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, nil @@ -1105,3 +1024,279 @@ func isSubmitterContainerFinished(pod *corev1.Pod) bool { } return false } + +// handleFinishedRayJob is the main entry point for handling cleanup of a completed or failed RayJob. +// It acts as a dispatcher, selecting the appropriate deletion mechanism based on the RayJob spec. +func (r *RayJobReconciler) handleFinishedRayJob(ctx context.Context, rayJob *rayv1.RayJob) (ctrl.Result, error) { + logger := ctrl.LoggerFrom(ctx) + + // If the RayJob uses an existing RayCluster, we must not delete it. + if len(rayJob.Spec.ClusterSelector) > 0 { + logger.Info("RayJob is using an existing RayCluster via clusterSelector; skipping resource deletion.", "RayClusterSelector", rayJob.Spec.ClusterSelector) + return ctrl.Result{}, nil + } + + if features.Enabled(features.RayJobDeletionPolicy) && rayJob.Spec.DeletionStrategy != nil { + // The previous validation logic ensures that either DeletionRules or the legacy policies are set, but not both. + if len(rayJob.Spec.DeletionStrategy.DeletionRules) > 0 { + return r.handleDeletionRules(ctx, rayJob) + } + return r.handleLegacyDeletionPolicy(ctx, rayJob) + } + + if rayJob.Spec.ShutdownAfterJobFinishes { + return r.handleShutdownAfterJobFinishes(ctx, rayJob) + } + + // Default: No deletion policy is configured. The reconciliation is complete for this RayJob. + return ctrl.Result{}, nil +} + +// handleDeletionRules processes the DeletionRules with a impact-aware strategy. +// It categorizes rules into "overdue" and "pending". If overdue rules exist, +// it executes the most destructive one and then requeues for the next pending rule. +// If no rules are overdue, it simply requeues for the +// next pending rule. This function performs at most one deletion action per reconciliation. +func (r *RayJobReconciler) handleDeletionRules(ctx context.Context, rayJob *rayv1.RayJob) (ctrl.Result, error) { + logger := ctrl.LoggerFrom(ctx).WithValues("deletionMechanism", "DeletionRules") + nowTime := time.Now() + + var overdueRules []rayv1.DeletionRule + var nextRequeueTime *time.Time + + // Categorize all applicable and incomplete rules into "overdue" or "pending". + for _, rule := range rayJob.Spec.DeletionStrategy.DeletionRules { + // Skip rules that don't match the current job status. + if rule.Condition.JobStatus != rayJob.Status.JobStatus { + continue + } + + // Skip rules for actions that have already been completed to ensure idempotency. + isCompleted, err := r.isDeletionActionCompleted(ctx, rayJob, rule.Policy) + if err != nil { + logger.Error(err, "Failed to check if deletion action is completed", "rule", rule) + return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, err + } + if isCompleted { + logger.Info("Skipping completed deletion rule", "rule", rule) + continue + } + + // Categorize the rule based on its TTL. 
+ deletionTime := rayJob.Status.EndTime.Add(time.Duration(rule.Condition.TTLSecondsAfterFinished) * time.Second) + if nowTime.After(deletionTime) { + overdueRules = append(overdueRules, rule) + } else if nextRequeueTime == nil || deletionTime.Before(*nextRequeueTime) { + // This is a pending rule. Track the earliest one to schedule the next requeue. + nextRequeueTime = &deletionTime + } + } + + // Handle overdue rules if any exist. + if len(overdueRules) > 0 { + ruleToExecute := selectMostImpactfulRule(overdueRules) + logger.Info("Executing the most impactful overdue deletion rule", "rule", ruleToExecute, "overdueRulesCount", len(overdueRules)) + if _, err := r.executeDeletionPolicy(ctx, rayJob, ruleToExecute.Policy); err != nil { + // If execution fails, return immediately for a retry. + return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, err + } + } + + if nextRequeueTime != nil { + requeueAfter := requeueDelayFor(*nextRequeueTime) + logger.Info("Requeuing for the next scheduled rule", "requeueAfter", requeueAfter) + return ctrl.Result{RequeueAfter: requeueAfter}, nil + } + + logger.Info("All applicable deletion rules have been processed.") + return ctrl.Result{}, nil +} + +// handleLegacyDeletionPolicy handles the deprecated onSuccess and onFailure policies. +func (r *RayJobReconciler) handleLegacyDeletionPolicy(ctx context.Context, rayJob *rayv1.RayJob) (ctrl.Result, error) { + logger := ctrl.LoggerFrom(ctx).WithValues("deletionMechanism", "LegacyOnSuccessFailure") + + var policy rayv1.DeletionPolicyType + switch rayJob.Status.JobStatus { + case rayv1.JobStatusSucceeded: + policy = *rayJob.Spec.DeletionStrategy.OnSuccess.Policy + case rayv1.JobStatusFailed: + policy = *rayJob.Spec.DeletionStrategy.OnFailure.Policy + default: + logger.Info("JobStatus is not valid for deletion, no policy applied", "jobStatus", rayJob.Status.JobStatus) + return ctrl.Result{}, nil + } + + // If the policy is DeleteNone, we are done. + if policy == rayv1.DeleteNone { + logger.Info("Deletion policy is DeleteNone; no action taken.") + return ctrl.Result{}, nil + } + + // These legacy policies use the top-level TTLSecondsAfterFinished. + nowTime := time.Now() + ttlSeconds := rayJob.Spec.TTLSecondsAfterFinished + shutdownTime := rayJob.Status.EndTime.Add(time.Duration(ttlSeconds) * time.Second) + logger.Info("Evaluating legacy deletion policy (onSuccess/onFailure)", + "JobDeploymentStatus", rayJob.Status.JobDeploymentStatus, + "policy", policy, + "JobStatus", rayJob.Status.JobStatus, + "ttlSecondsAfterFinished", ttlSeconds, + "Status.endTime", rayJob.Status.EndTime, + "Now", nowTime, + "ShutdownTime", shutdownTime) + + if shutdownTime.After(nowTime) { + requeueAfter := requeueDelayFor(shutdownTime) + logger.Info("TTL has not been met for legacy policy. Requeuing.", "shutdownTime", shutdownTime, "requeueAfter", requeueAfter) + return ctrl.Result{RequeueAfter: requeueAfter}, nil + } + + logger.Info("Executing legacy deletion policy.", "policy", policy) + return r.executeDeletionPolicy(ctx, rayJob, policy) +} + +// handleShutdownAfterJobFinishes handles the oldest deletion mechanism, the ShutdownAfterJobFinishes boolean flag. 
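+// Once the top-level TTLSecondsAfterFinished has elapsed, it deletes either the RayJob CR itself
+// (when the DELETE_RAYJOB_CR_AFTER_JOB_FINISHES environment variable is set to "true") or only its RayCluster.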
+func (r *RayJobReconciler) handleShutdownAfterJobFinishes(ctx context.Context, rayJob *rayv1.RayJob) (ctrl.Result, error) {
+	logger := ctrl.LoggerFrom(ctx).WithValues("deletionMechanism", "ShutdownAfterJobFinishes")
+
+	nowTime := time.Now()
+	ttlSeconds := rayJob.Spec.TTLSecondsAfterFinished
+	shutdownTime := rayJob.Status.EndTime.Add(time.Duration(ttlSeconds) * time.Second)
+	logger.Info("Evaluating job deletion policy based on ShutdownAfterJobFinishes",
+		"JobDeploymentStatus", rayJob.Status.JobDeploymentStatus,
+		"ShutdownAfterJobFinishes", rayJob.Spec.ShutdownAfterJobFinishes,
+		"ClusterSelector", rayJob.Spec.ClusterSelector,
+		"ttlSecondsAfterFinished", ttlSeconds,
+		"Status.endTime", rayJob.Status.EndTime,
+		"Now", nowTime,
+		"ShutdownTime", shutdownTime)
+
+	if shutdownTime.After(nowTime) {
+		requeueAfter := requeueDelayFor(shutdownTime)
+		logger.Info("TTL has not been met for ShutdownAfterJobFinishes. Requeuing.", "shutdownTime", shutdownTime, "requeueAfter", requeueAfter)
+		return ctrl.Result{RequeueAfter: requeueAfter}, nil
+	}
+
+	var err error
+	if s := os.Getenv(utils.DELETE_RAYJOB_CR_AFTER_JOB_FINISHES); strings.ToLower(s) == "true" {
+		err = r.Client.Delete(ctx, rayJob)
+		if err == nil {
+			logger.Info("RayJob is deleted", "RayJob", rayJob.Name)
+		}
+	} else {
+		// We only need to delete the RayCluster. We don't need to delete the submitter Kubernetes Job so that users can still access
+		// the driver logs. In addition, a completed Kubernetes Job does not actually use any compute resources.
+		_, err = r.deleteClusterResources(ctx, rayJob)
+		if err == nil {
+			logger.Info("RayCluster is deleted", "RayCluster", rayJob.Status.RayClusterName)
+		}
+	}
+
+	if err != nil {
+		return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, err
+	}
+
+	return ctrl.Result{}, nil
+}
+
+// executeDeletionPolicy performs the actual resource deletion based on the policy type.
+// This function centralizes the deletion logic to avoid code duplication.
+func (r *RayJobReconciler) executeDeletionPolicy(ctx context.Context, rayJob *rayv1.RayJob, policy rayv1.DeletionPolicyType) (ctrl.Result, error) {
+	logger := ctrl.LoggerFrom(ctx)
+	var err error
+
+	switch policy {
+	case rayv1.DeleteCluster:
+		logger.Info("Executing deletion policy: DeleteCluster", "RayCluster", rayJob.Status.RayClusterName)
+		_, err = r.deleteClusterResources(ctx, rayJob)
+	case rayv1.DeleteWorkers:
+		logger.Info("Executing deletion policy: DeleteWorkers", "RayCluster", rayJob.Status.RayClusterName)
+		err = r.suspendWorkerGroups(ctx, rayJob)
+	case rayv1.DeleteSelf:
+		logger.Info("Executing deletion policy: DeleteSelf", "RayJob", rayJob.Name)
+		err = r.Client.Delete(ctx, rayJob)
+	case rayv1.DeleteNone:
+		// This should be handled by the callers, but we include it for safety.
+		logger.Info("Executing deletion policy: DeleteNone. No action taken.")
+	default:
+		// This case should not be reached if validation is working correctly.
+		logger.Error(fmt.Errorf("unknown deletion policy: %s", policy), "Unknown deletion policy encountered")
+	}
+
+	if err != nil {
+		return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, err
+	}
+	return ctrl.Result{}, nil
+}
+
+// isDeletionActionCompleted checks if the state corresponding to a deletion policy is already achieved.
+// This is crucial for making the reconciliation loop idempotent by checking the actual cluster state.
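+// For example, DeleteWorkers is treated as complete once the RayCluster is gone or every worker group
+// has Suspend set to true, and DeleteCluster is complete once the RayCluster no longer exists.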
+func (r *RayJobReconciler) isDeletionActionCompleted(ctx context.Context, rayJob *rayv1.RayJob, policy rayv1.DeletionPolicyType) (bool, error) { + clusterIdentifier := common.RayJobRayClusterNamespacedName(rayJob) + cluster := &rayv1.RayCluster{} + + switch policy { + case rayv1.DeleteWorkers: + if err := r.Get(ctx, clusterIdentifier, cluster); err != nil { + if errors.IsNotFound(err) { + // If the cluster is gone, the workers are definitely gone. + return true, nil + } + // For any other error, we can't be sure of the state, so report the error. + return false, err + } + + // If the cluster exists, check if all worker groups are suspended. + for _, wg := range cluster.Spec.WorkerGroupSpecs { + if wg.Suspend == nil || !*wg.Suspend { + // Found an active worker group, so the action is not complete. + return false, nil + } + } + + return true, nil + + case rayv1.DeleteCluster: + err := r.Get(ctx, clusterIdentifier, cluster) + if errors.IsNotFound(err) { + // Cluster not found means the deletion is complete. + return true, nil + } + return false, err + + case rayv1.DeleteSelf: + // This action is terminal. If this function is running, the RayJob still exists, + // so the action cannot be considered complete. + return false, nil + + case rayv1.DeleteNone: + // "DeleteNone" is a no-op and is always considered complete. + return true, nil + } + + return false, fmt.Errorf("unknown deletion policy for completion check: %s", policy) +} + +// selectMostImpactfulRule finds the rule with the most destructive policy from a given list. +func selectMostImpactfulRule(rules []rayv1.DeletionRule) rayv1.DeletionRule { + order := map[rayv1.DeletionPolicyType]int{ + rayv1.DeleteSelf: 4, + rayv1.DeleteCluster: 3, + rayv1.DeleteWorkers: 2, + rayv1.DeleteNone: 1, + } + + mostImpactfulRule := rules[0] + for _, rule := range rules[1:] { + if order[rule.Policy] > order[mostImpactfulRule.Policy] { + mostImpactfulRule = rule + } + } + return mostImpactfulRule +} + +// requeueDelayFor computes the duration for the next requeue, ensuring a minimum buffer. 
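+// The fixed 2-second buffer makes the requeue fire slightly after the target time rather than just before it.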
+func requeueDelayFor(t time.Time) time.Duration { + return time.Until(t) + 2*time.Second +} diff --git a/ray-operator/controllers/ray/utils/validation.go b/ray-operator/controllers/ray/utils/validation.go index 7552637fa5e..d4653aa04bc 100644 --- a/ray-operator/controllers/ray/utils/validation.go +++ b/ray-operator/controllers/ray/utils/validation.go @@ -1,6 +1,7 @@ package utils import ( + "errors" errstd "errors" "fmt" @@ -218,43 +219,7 @@ func ValidateRayJobSpec(rayJob *rayv1.RayJob) error { return fmt.Errorf("RayJobDeletionPolicy feature gate must be enabled to use the DeletionStrategy feature") } - if rayJob.Spec.DeletionStrategy != nil { - onSuccessPolicy := rayJob.Spec.DeletionStrategy.OnSuccess - onFailurePolicy := rayJob.Spec.DeletionStrategy.OnFailure - - if onSuccessPolicy.Policy == nil { - return fmt.Errorf("the DeletionPolicyType field of DeletionStrategy.OnSuccess cannot be unset when DeletionStrategy is enabled") - } - if onFailurePolicy.Policy == nil { - return fmt.Errorf("the DeletionPolicyType field of DeletionStrategy.OnFailure cannot be unset when DeletionStrategy is enabled") - } - - if isClusterSelectorMode { - switch *onSuccessPolicy.Policy { - case rayv1.DeleteCluster: - return fmt.Errorf("the ClusterSelector mode doesn't support DeletionStrategy=DeleteCluster on success") - case rayv1.DeleteWorkers: - return fmt.Errorf("the ClusterSelector mode doesn't support DeletionStrategy=DeleteWorkers on success") - } - - switch *onFailurePolicy.Policy { - case rayv1.DeleteCluster: - return fmt.Errorf("the ClusterSelector mode doesn't support DeletionStrategy=DeleteCluster on failure") - case rayv1.DeleteWorkers: - return fmt.Errorf("the ClusterSelector mode doesn't support DeletionStrategy=DeleteWorkers on failure") - } - } - - if (*onSuccessPolicy.Policy == rayv1.DeleteWorkers || *onFailurePolicy.Policy == rayv1.DeleteWorkers) && IsAutoscalingEnabled(rayJob.Spec.RayClusterSpec) { - // TODO (rueian): This can be supported in a future Ray version. We should check the RayVersion once we know it. - return fmt.Errorf("DeletionStrategy=DeleteWorkers currently does not support RayCluster with autoscaling enabled") - } - - if rayJob.Spec.ShutdownAfterJobFinishes && (*onSuccessPolicy.Policy == rayv1.DeleteNone || *onFailurePolicy.Policy == rayv1.DeleteNone) { - return fmt.Errorf("shutdownAfterJobFinshes is set to 'true' while deletion policy is 'DeleteNone'") - } - } - return nil + return validateDeletionStrategy(rayJob) } func ValidateRayServiceMetadata(metadata metav1.ObjectMeta) error { @@ -291,3 +256,176 @@ func ValidateRayServiceSpec(rayService *rayv1.RayService) error { return nil } + +// validateDeletionStrategy centralizes all validation logic for the deletion strategy. +// This includes the new `deletionRules` and the legacy fields (`onSuccess`,`onFailure`). +func validateDeletionStrategy(rayJob *rayv1.RayJob) error { + if rayJob.Spec.DeletionStrategy == nil { + return nil + } + + if !features.Enabled(features.RayJobDeletionPolicy) { + return fmt.Errorf("RayJobDeletionPolicy feature gate must be enabled to use the DeletionStrategy feature") + } + + usingDeletionRules := len(rayJob.Spec.DeletionStrategy.DeletionRules) > 0 + usingLegacyAPI := rayJob.Spec.DeletionStrategy.OnSuccess.Policy != nil || rayJob.Spec.DeletionStrategy.OnFailure.Policy != nil + + // ShutdownAfterJobFinishes cannot be used with the new API. 
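+	// For example, a RayJob that sets shutdownAfterJobFinishes: true and also defines deletionRules is rejected here.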
+ if usingDeletionRules && rayJob.Spec.ShutdownAfterJobFinishes { + return fmt.Errorf("ShutdownAfterJobFinishes cannot be used when spec.deletionStrategy.deletionRules is defined. Please configure all deletion behaviors within deletionRules") + } + + // Legacy API and DeletionRules cannot be used simultaneously. + if usingDeletionRules && usingLegacyAPI { + return fmt.Errorf("legacy policies (onSuccess, onFailure) and the new deletionRules cannot be used simultaneously within the same deletionStrategy") + } + + // DeletionStrategy must contain at least one policy if specified. + if !usingDeletionRules && !usingLegacyAPI { + return fmt.Errorf("deletionStrategy is specified, but no policies (onSuccess, onFailure, or deletionRules) are defined within it") + } + + if usingDeletionRules { + return validateDeletionRules(rayJob) + } + + // If not using DeletionRules, validate the legacy strategy + return validateLegacyDeletionPolicies(rayJob) +} + +// validateDeletionRules validates the deletion rules in the RayJob spec. +// It performs per-rule validations, checks for uniqueness, and ensures logical TTL consistency. +// Errors are collected and returned as a single aggregated error using errors.Join for better user feedback. +func validateDeletionRules(rayJob *rayv1.RayJob) error { + type ruleKey struct { + Policy rayv1.DeletionPolicyType + Status rayv1.JobStatus + } + + rules := rayJob.Spec.DeletionStrategy.DeletionRules + isClusterSelectorMode := len(rayJob.Spec.ClusterSelector) != 0 + + // Group TTLs by JobStatus for cross-rule validation. + rulesByStatus := make(map[rayv1.JobStatus]map[rayv1.DeletionPolicyType]int32) + // Track unique (Policy, JobStatus) combinations. + ruleUniquenessSet := make(map[ruleKey]struct{}) + + var errs []error + + // Single pass: Validate each rule individually and group for later consistency checks. + for i, rule := range rules { + // Validate TTL is non-negative. + if rule.Condition.TTLSecondsAfterFinished < 0 { + errs = append(errs, fmt.Errorf("deletionRules[%d]: TTLSecondsAfterFinished must be non-negative", i)) + continue + } + + // Check uniqueness. + key := ruleKey{Policy: rule.Policy, Status: rule.Condition.JobStatus} + if _, exists := ruleUniquenessSet[key]; exists { + errs = append(errs, fmt.Errorf("deletionRules[%d]: duplicate rule for DeletionPolicyType '%s' and JobStatus '%s'", i, rule.Policy, rule.Condition.JobStatus)) + continue + } + ruleUniquenessSet[key] = struct{}{} + + // Contextual validations based on spec. + if isClusterSelectorMode && (rule.Policy == rayv1.DeleteCluster || rule.Policy == rayv1.DeleteWorkers) { + errs = append(errs, fmt.Errorf("deletionRules[%d]: DeletionPolicyType '%s' not supported when ClusterSelector is set", i, rule.Policy)) + continue + } + if IsAutoscalingEnabled(rayJob.Spec.RayClusterSpec) && rule.Policy == rayv1.DeleteWorkers { + // TODO (rueian): Support in future Ray versions by checking RayVersion. + errs = append(errs, fmt.Errorf("deletionRules[%d]: DeletionPolicyType 'DeleteWorkers' not supported with autoscaling enabled", i)) + continue + } + + // Group valid rule for consistency check. + statusMap, ok := rulesByStatus[rule.Condition.JobStatus] + if !ok { + statusMap = make(map[rayv1.DeletionPolicyType]int32) + rulesByStatus[rule.Condition.JobStatus] = statusMap + } + statusMap[rule.Policy] = rule.Condition.TTLSecondsAfterFinished + } + + // Second pass: Validate TTL consistency per JobStatus. 
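+	// For example, for JobStatus SUCCEEDED, TTLs of DeleteWorkers=60, DeleteCluster=300, DeleteSelf=600 are
+	// consistent, whereas DeleteSelf=60 together with DeleteCluster=300 would be rejected.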
+ for status, policyTTLs := range rulesByStatus { + if err := validateTTLConsistency(policyTTLs, status); err != nil { + errs = append(errs, err) + } + } + + return errors.Join(errs...) +} + +// validateTTLConsistency ensures TTLs follow the deletion hierarchy: Workers <= Cluster <= Self. +// (Lower TTL means deletes earlier.) +func validateTTLConsistency(policyTTLs map[rayv1.DeletionPolicyType]int32, status rayv1.JobStatus) error { + // Define the required deletion order. TTLs must be non-decreasing along this sequence. + deletionOrder := []rayv1.DeletionPolicyType{ + rayv1.DeleteWorkers, + rayv1.DeleteCluster, + rayv1.DeleteSelf, + } + + var prevPolicy rayv1.DeletionPolicyType + var prevTTL int32 + var hasPrev bool + + var errs []error + + for _, policy := range deletionOrder { + ttl, exists := policyTTLs[policy] + if !exists { + continue + } + + if hasPrev && ttl < prevTTL { + errs = append(errs, fmt.Errorf( + "for JobStatus '%s': %s TTL (%d) must be >= %s TTL (%d)", + status, policy, ttl, prevPolicy, prevTTL, + )) + } + + prevPolicy = policy + prevTTL = ttl + hasPrev = true + } + + return errors.Join(errs...) +} + +// validateLegacyDeletionPolicies handles validation for the old `onSuccess` and `onFailure` fields. +func validateLegacyDeletionPolicies(rayJob *rayv1.RayJob) error { + isClusterSelectorMode := len(rayJob.Spec.ClusterSelector) != 0 + onSuccessPolicy := rayJob.Spec.DeletionStrategy.OnSuccess + onFailurePolicy := rayJob.Spec.DeletionStrategy.OnFailure + + if onSuccessPolicy.Policy == nil { + return fmt.Errorf("the DeletionPolicyType field of DeletionStrategy.OnSuccess cannot be unset when DeletionStrategy is enabled") + } + if onFailurePolicy.Policy == nil { + return fmt.Errorf("the DeletionPolicyType field of DeletionStrategy.OnFailure cannot be unset when DeletionStrategy is enabled") + } + + if isClusterSelectorMode { + if *onSuccessPolicy.Policy == rayv1.DeleteCluster || *onSuccessPolicy.Policy == rayv1.DeleteWorkers { + return fmt.Errorf("the ClusterSelector mode doesn't support DeletionStrategy=%s on success", *onSuccessPolicy.Policy) + } + if *onFailurePolicy.Policy == rayv1.DeleteCluster || *onFailurePolicy.Policy == rayv1.DeleteWorkers { + return fmt.Errorf("the ClusterSelector mode doesn't support DeletionStrategy=%s on failure", *onFailurePolicy.Policy) + } + } + + if (*onSuccessPolicy.Policy == rayv1.DeleteWorkers || *onFailurePolicy.Policy == rayv1.DeleteWorkers) && IsAutoscalingEnabled(rayJob.Spec.RayClusterSpec) { + // TODO (rueian): This can be supported in a future Ray version. We should check the RayVersion once we know it. 
+ return fmt.Errorf("DeletionStrategy=DeleteWorkers currently does not support RayCluster with autoscaling enabled") + } + + if rayJob.Spec.ShutdownAfterJobFinishes && (*onSuccessPolicy.Policy == rayv1.DeleteNone || *onFailurePolicy.Policy == rayv1.DeleteNone) { + return fmt.Errorf("shutdownAfterJobFinishes is set to 'true' while deletion policy is 'DeleteNone'") + } + + return nil +} diff --git a/ray-operator/controllers/ray/utils/validation_test.go b/ray-operator/controllers/ray/utils/validation_test.go index b55a8f26c39..4060827e21f 100644 --- a/ray-operator/controllers/ray/utils/validation_test.go +++ b/ray-operator/controllers/ray/utils/validation_test.go @@ -927,6 +927,7 @@ func TestValidateRayJobSpecWithFeatureGate(t *testing.T) { spec rayv1.RayJobSpec expectError bool }{ + // Legacy DeletionStrategy tests { name: "the ClusterSelector mode doesn't support DeletionStrategy=DeleteCluster", spec: rayv1.RayJobSpec{ @@ -1069,6 +1070,244 @@ func TestValidateRayJobSpecWithFeatureGate(t *testing.T) { }, expectError: true, }, + // New Deletion Rules tests + { + name: "valid deletionRules", + spec: rayv1.RayJobSpec{ + DeletionStrategy: &rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteSelf, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + TTLSecondsAfterFinished: 10, + }, + }, + }, + }, + RayClusterSpec: createBasicRayClusterSpec(), + }, + expectError: false, + }, + { + name: "deletionRules and ShutdownAfterJobFinishes both set", + spec: rayv1.RayJobSpec{ + ShutdownAfterJobFinishes: true, + DeletionStrategy: &rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteSelf, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + TTLSecondsAfterFinished: 10, + }, + }, + }, + }, + RayClusterSpec: createBasicRayClusterSpec(), + }, + expectError: true, + }, + { + name: "deletionRules and legacy onSuccess both set", + spec: rayv1.RayJobSpec{ + DeletionStrategy: &rayv1.DeletionStrategy{ + OnSuccess: rayv1.DeletionPolicy{ + Policy: ptr.To(rayv1.DeleteCluster), + }, + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteSelf, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + TTLSecondsAfterFinished: 10, + }, + }, + }, + }, + RayClusterSpec: createBasicRayClusterSpec(), + }, + expectError: true, + }, + { + name: "empty DeletionStrategy", + spec: rayv1.RayJobSpec{ + DeletionStrategy: &rayv1.DeletionStrategy{}, + RayClusterSpec: createBasicRayClusterSpec(), + }, + expectError: true, + }, + { + name: "duplicate rule in deletionRules", + spec: rayv1.RayJobSpec{ + DeletionStrategy: &rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteSelf, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + TTLSecondsAfterFinished: 10, + }, + }, + { + Policy: rayv1.DeleteSelf, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + TTLSecondsAfterFinished: 20, + }, + }, + }, + }, + RayClusterSpec: createBasicRayClusterSpec(), + }, + expectError: true, + }, + { + name: "negative TTLSecondsAfterFinished in deletionRules", + spec: rayv1.RayJobSpec{ + DeletionStrategy: &rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteSelf, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + TTLSecondsAfterFinished: -10, + }, + }, + }, + }, + RayClusterSpec: createBasicRayClusterSpec(), + }, + expectError: true, + }, + { + name: 
"deletionRules with ClusterSelector and DeleteCluster policy", + spec: rayv1.RayJobSpec{ + ClusterSelector: map[string]string{"key": "value"}, + DeletionStrategy: &rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteCluster, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + TTLSecondsAfterFinished: 10, + }, + }, + }, + }, + }, + expectError: true, + }, + { + name: "deletionRules with autoscaling and DeleteWorkers policy", + spec: rayv1.RayJobSpec{ + DeletionStrategy: &rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteWorkers, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + TTLSecondsAfterFinished: 10, + }, + }, + }, + }, + RayClusterSpec: &rayv1.RayClusterSpec{ + EnableInTreeAutoscaling: ptr.To(true), + HeadGroupSpec: headGroupSpecWithOneContainer, + }, + }, + expectError: true, + }, + { + name: "inconsistent TTLs in deletionRules (DeleteCluster < DeleteWorkers)", + spec: rayv1.RayJobSpec{ + DeletionStrategy: &rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteWorkers, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + TTLSecondsAfterFinished: 20, + }, + }, + { + Policy: rayv1.DeleteCluster, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + TTLSecondsAfterFinished: 10, + }, + }, + }, + }, + RayClusterSpec: createBasicRayClusterSpec(), + }, + expectError: true, + }, + { + name: "inconsistent TTLs in deletionRules (DeleteSelf < DeleteCluster)", + spec: rayv1.RayJobSpec{ + DeletionStrategy: &rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteCluster, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + TTLSecondsAfterFinished: 20, + }, + }, + { + Policy: rayv1.DeleteSelf, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + TTLSecondsAfterFinished: 10, + }, + }, + }, + }, + RayClusterSpec: createBasicRayClusterSpec(), + }, + expectError: true, + }, + { + name: "valid complex deletionRules", + spec: rayv1.RayJobSpec{ + DeletionStrategy: &rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteWorkers, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + TTLSecondsAfterFinished: 10, + }, + }, + { + Policy: rayv1.DeleteCluster, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + TTLSecondsAfterFinished: 20, + }, + }, + { + Policy: rayv1.DeleteSelf, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + TTLSecondsAfterFinished: 30, + }, + }, + { + Policy: rayv1.DeleteSelf, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusFailed, + TTLSecondsAfterFinished: 0, + }, + }, + }, + }, + RayClusterSpec: createBasicRayClusterSpec(), + }, + expectError: false, + }, } features.SetFeatureGateDuringTest(t, features.RayJobDeletionPolicy, true) diff --git a/ray-operator/pkg/client/applyconfiguration/ray/v1/deletioncondition.go b/ray-operator/pkg/client/applyconfiguration/ray/v1/deletioncondition.go new file mode 100644 index 00000000000..25e1a881dbb --- /dev/null +++ b/ray-operator/pkg/client/applyconfiguration/ray/v1/deletioncondition.go @@ -0,0 +1,36 @@ +// Code generated by applyconfiguration-gen. DO NOT EDIT. 
+ +package v1 + +import ( + rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1" +) + +// DeletionConditionApplyConfiguration represents a declarative configuration of the DeletionCondition type for use +// with apply. +type DeletionConditionApplyConfiguration struct { + JobStatus *rayv1.JobStatus `json:"jobStatus,omitempty"` + TTLSecondsAfterFinished *int32 `json:"ttlSecondsAfterFinished,omitempty"` +} + +// DeletionConditionApplyConfiguration constructs a declarative configuration of the DeletionCondition type for use with +// apply. +func DeletionCondition() *DeletionConditionApplyConfiguration { + return &DeletionConditionApplyConfiguration{} +} + +// WithJobStatus sets the JobStatus field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the JobStatus field is set to the value of the last call. +func (b *DeletionConditionApplyConfiguration) WithJobStatus(value rayv1.JobStatus) *DeletionConditionApplyConfiguration { + b.JobStatus = &value + return b +} + +// WithTTLSecondsAfterFinished sets the TTLSecondsAfterFinished field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the TTLSecondsAfterFinished field is set to the value of the last call. +func (b *DeletionConditionApplyConfiguration) WithTTLSecondsAfterFinished(value int32) *DeletionConditionApplyConfiguration { + b.TTLSecondsAfterFinished = &value + return b +} diff --git a/ray-operator/pkg/client/applyconfiguration/ray/v1/deletionrule.go b/ray-operator/pkg/client/applyconfiguration/ray/v1/deletionrule.go new file mode 100644 index 00000000000..91e4b50de99 --- /dev/null +++ b/ray-operator/pkg/client/applyconfiguration/ray/v1/deletionrule.go @@ -0,0 +1,36 @@ +// Code generated by applyconfiguration-gen. DO NOT EDIT. + +package v1 + +import ( + rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1" +) + +// DeletionRuleApplyConfiguration represents a declarative configuration of the DeletionRule type for use +// with apply. +type DeletionRuleApplyConfiguration struct { + Policy *rayv1.DeletionPolicyType `json:"policy,omitempty"` + Condition *DeletionConditionApplyConfiguration `json:"condition,omitempty"` +} + +// DeletionRuleApplyConfiguration constructs a declarative configuration of the DeletionRule type for use with +// apply. +func DeletionRule() *DeletionRuleApplyConfiguration { + return &DeletionRuleApplyConfiguration{} +} + +// WithPolicy sets the Policy field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the Policy field is set to the value of the last call. +func (b *DeletionRuleApplyConfiguration) WithPolicy(value rayv1.DeletionPolicyType) *DeletionRuleApplyConfiguration { + b.Policy = &value + return b +} + +// WithCondition sets the Condition field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the Condition field is set to the value of the last call. 
+func (b *DeletionRuleApplyConfiguration) WithCondition(value *DeletionConditionApplyConfiguration) *DeletionRuleApplyConfiguration { + b.Condition = value + return b +} diff --git a/ray-operator/pkg/client/applyconfiguration/ray/v1/deletionstrategy.go b/ray-operator/pkg/client/applyconfiguration/ray/v1/deletionstrategy.go index 105c33d3de7..034cce827cb 100644 --- a/ray-operator/pkg/client/applyconfiguration/ray/v1/deletionstrategy.go +++ b/ray-operator/pkg/client/applyconfiguration/ray/v1/deletionstrategy.go @@ -5,8 +5,9 @@ package v1 // DeletionStrategyApplyConfiguration represents a declarative configuration of the DeletionStrategy type for use // with apply. type DeletionStrategyApplyConfiguration struct { - OnSuccess *DeletionPolicyApplyConfiguration `json:"onSuccess,omitempty"` - OnFailure *DeletionPolicyApplyConfiguration `json:"onFailure,omitempty"` + OnSuccess *DeletionPolicyApplyConfiguration `json:"onSuccess,omitempty"` + OnFailure *DeletionPolicyApplyConfiguration `json:"onFailure,omitempty"` + DeletionRules []DeletionRuleApplyConfiguration `json:"deletionRules,omitempty"` } // DeletionStrategyApplyConfiguration constructs a declarative configuration of the DeletionStrategy type for use with @@ -30,3 +31,16 @@ func (b *DeletionStrategyApplyConfiguration) WithOnFailure(value *DeletionPolicy b.OnFailure = value return b } + +// WithDeletionRules adds the given value to the DeletionRules field in the declarative configuration +// and returns the receiver, so that objects can be build by chaining "With" function invocations. +// If called multiple times, values provided by each call will be appended to the DeletionRules field. +func (b *DeletionStrategyApplyConfiguration) WithDeletionRules(values ...*DeletionRuleApplyConfiguration) *DeletionStrategyApplyConfiguration { + for i := range values { + if values[i] == nil { + panic("nil value passed to WithDeletionRules") + } + b.DeletionRules = append(b.DeletionRules, *values[i]) + } + return b +} diff --git a/ray-operator/pkg/client/applyconfiguration/utils.go b/ray-operator/pkg/client/applyconfiguration/utils.go index 23e455d739a..050733b0c5e 100644 --- a/ray-operator/pkg/client/applyconfiguration/utils.go +++ b/ray-operator/pkg/client/applyconfiguration/utils.go @@ -20,8 +20,12 @@ func ForKind(kind schema.GroupVersionKind) interface{} { return &rayv1.AppStatusApplyConfiguration{} case v1.SchemeGroupVersion.WithKind("AutoscalerOptions"): return &rayv1.AutoscalerOptionsApplyConfiguration{} + case v1.SchemeGroupVersion.WithKind("DeletionCondition"): + return &rayv1.DeletionConditionApplyConfiguration{} case v1.SchemeGroupVersion.WithKind("DeletionPolicy"): return &rayv1.DeletionPolicyApplyConfiguration{} + case v1.SchemeGroupVersion.WithKind("DeletionRule"): + return &rayv1.DeletionRuleApplyConfiguration{} case v1.SchemeGroupVersion.WithKind("DeletionStrategy"): return &rayv1.DeletionStrategyApplyConfiguration{} case v1.SchemeGroupVersion.WithKind("GcsFaultToleranceOptions"): From 30109279d4185e14f87c9b6611d5a0bcc4906aab Mon Sep 17 00:00:00 2001 From: wei-chenglai Date: Mon, 8 Sep 2025 17:56:10 -0400 Subject: [PATCH 02/21] Add controller tests --- ray-operator/apis/ray/v1/rayjob_types.go | 4 +- .../apis/ray/v1/zz_generated.deepcopy.go | 12 +- .../controllers/ray/rayjob_controller.go | 20 +- .../controllers/ray/rayjob_controller_test.go | 1938 ++++++++++++++++- .../controllers/ray/utils/validation.go | 12 +- .../controllers/ray/utils/validation_test.go | 36 +- 6 files changed, 1959 insertions(+), 63 deletions(-) diff --git 
a/ray-operator/apis/ray/v1/rayjob_types.go b/ray-operator/apis/ray/v1/rayjob_types.go index 683ff6434b3..575ded8e81c 100644 --- a/ray-operator/apis/ray/v1/rayjob_types.go +++ b/ray-operator/apis/ray/v1/rayjob_types.go @@ -113,13 +113,13 @@ type DeletionStrategy struct { // Deprecated: Use `deletionRules` instead for more flexible, multi-stage deletion strategies. // This field will be removed in a future release. // +optional - OnSuccess DeletionPolicy `json:"onSuccess,omitempty"` + OnSuccess *DeletionPolicy `json:"onSuccess,omitempty"` // OnFailure is the deletion policy for a failed RayJob. // Deprecated: Use `deletionRules` instead for more flexible, multi-stage deletion strategies. // This field will be removed in a future release. // +optional - OnFailure DeletionPolicy `json:"onFailure,omitempty"` + OnFailure *DeletionPolicy `json:"onFailure,omitempty"` // DeletionRules is a list of deletion rules, processed based on their trigger conditions. // While the rules can be used to define a sequence, if multiple rules are overdue (e.g., due to controller downtime), diff --git a/ray-operator/apis/ray/v1/zz_generated.deepcopy.go b/ray-operator/apis/ray/v1/zz_generated.deepcopy.go index d548d944c8b..c4828c02f06 100644 --- a/ray-operator/apis/ray/v1/zz_generated.deepcopy.go +++ b/ray-operator/apis/ray/v1/zz_generated.deepcopy.go @@ -157,8 +157,16 @@ func (in *DeletionRule) DeepCopy() *DeletionRule { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *DeletionStrategy) DeepCopyInto(out *DeletionStrategy) { *out = *in - in.OnSuccess.DeepCopyInto(&out.OnSuccess) - in.OnFailure.DeepCopyInto(&out.OnFailure) + if in.OnSuccess != nil { + in, out := &in.OnSuccess, &out.OnSuccess + *out = new(DeletionPolicy) + (*in).DeepCopyInto(*out) + } + if in.OnFailure != nil { + in, out := &in.OnFailure, &out.OnFailure + *out = new(DeletionPolicy) + (*in).DeepCopyInto(*out) + } if in.DeletionRules != nil { in, out := &in.DeletionRules, &out.DeletionRules *out = make([]DeletionRule, len(*in)) diff --git a/ray-operator/controllers/ray/rayjob_controller.go b/ray-operator/controllers/ray/rayjob_controller.go index 7cb41681ce9..f292522a030 100644 --- a/ray-operator/controllers/ray/rayjob_controller.go +++ b/ray-operator/controllers/ray/rayjob_controller.go @@ -1071,7 +1071,16 @@ func (r *RayJobReconciler) handleDeletionRules(ctx context.Context, rayJob *rayv continue } - // Skip rules for actions that have already been completed to ensure idempotency. + deletionTime := rayJob.Status.EndTime.Add(time.Duration(rule.Condition.TTLSecondsAfterFinished) * time.Second) + // Track the earliest requeue time to re-check later. + if nowTime.Before(deletionTime) { + if nextRequeueTime == nil || deletionTime.Before(*nextRequeueTime) { + nextRequeueTime = &deletionTime + } + continue + } + + // Need to check if the deletion action has already been completed to ensure idempotency. isCompleted, err := r.isDeletionActionCompleted(ctx, rayJob, rule.Policy) if err != nil { logger.Error(err, "Failed to check if deletion action is completed", "rule", rule) @@ -1082,14 +1091,7 @@ func (r *RayJobReconciler) handleDeletionRules(ctx context.Context, rayJob *rayv continue } - // Categorize the rule based on its TTL. 
- deletionTime := rayJob.Status.EndTime.Add(time.Duration(rule.Condition.TTLSecondsAfterFinished) * time.Second) - if nowTime.After(deletionTime) { - overdueRules = append(overdueRules, rule) - } else if nextRequeueTime == nil || deletionTime.Before(*nextRequeueTime) { - // This is a pending rule. Track the earliest one to schedule the next requeue. - nextRequeueTime = &deletionTime - } + overdueRules = append(overdueRules, rule) } // Handle overdue rules if any exist. diff --git a/ray-operator/controllers/ray/rayjob_controller_test.go b/ray-operator/controllers/ray/rayjob_controller_test.go index 4d7c8fea810..c348932260d 100644 --- a/ray-operator/controllers/ray/rayjob_controller_test.go +++ b/ray-operator/controllers/ray/rayjob_controller_test.go @@ -896,10 +896,10 @@ var _ = Context("RayJob with different submission modes", func() { onSuccessPolicy := rayv1.DeleteCluster onFailurePolicy := rayv1.DeleteNone deletionStrategy := &rayv1.DeletionStrategy{ - OnSuccess: rayv1.DeletionPolicy{ + OnSuccess: &rayv1.DeletionPolicy{ Policy: &onSuccessPolicy, }, - OnFailure: rayv1.DeletionPolicy{ + OnFailure: &rayv1.DeletionPolicy{ Policy: &onFailurePolicy, }, } @@ -909,10 +909,10 @@ var _ = Context("RayJob with different submission modes", func() { By("Verify RayJob spec", func() { Expect(*rayJob.Spec.DeletionStrategy).To(Equal(rayv1.DeletionStrategy{ - OnSuccess: rayv1.DeletionPolicy{ + OnSuccess: &rayv1.DeletionPolicy{ Policy: &onSuccessPolicy, }, - OnFailure: rayv1.DeletionPolicy{ + OnFailure: &rayv1.DeletionPolicy{ Policy: &onFailurePolicy, }, })) @@ -1035,10 +1035,10 @@ var _ = Context("RayJob with different submission modes", func() { onSuccessPolicy := rayv1.DeleteNone onFailurePolicy := rayv1.DeleteCluster deletionStrategy := &rayv1.DeletionStrategy{ - OnSuccess: rayv1.DeletionPolicy{ + OnSuccess: &rayv1.DeletionPolicy{ Policy: &onSuccessPolicy, }, - OnFailure: rayv1.DeletionPolicy{ + OnFailure: &rayv1.DeletionPolicy{ Policy: &onFailurePolicy, }, } @@ -1048,10 +1048,10 @@ var _ = Context("RayJob with different submission modes", func() { By("Verify RayJob spec", func() { Expect(*rayJob.Spec.DeletionStrategy).To(Equal(rayv1.DeletionStrategy{ - OnSuccess: rayv1.DeletionPolicy{ + OnSuccess: &rayv1.DeletionPolicy{ Policy: &onSuccessPolicy, }, - OnFailure: rayv1.DeletionPolicy{ + OnFailure: &rayv1.DeletionPolicy{ Policy: &onFailurePolicy, }, })) @@ -1174,10 +1174,10 @@ var _ = Context("RayJob with different submission modes", func() { onSuccessPolicy := rayv1.DeleteWorkers onFailurePolicy := rayv1.DeleteNone deletionStrategy := rayv1.DeletionStrategy{ - OnSuccess: rayv1.DeletionPolicy{ + OnSuccess: &rayv1.DeletionPolicy{ Policy: &onSuccessPolicy, }, - OnFailure: rayv1.DeletionPolicy{ + OnFailure: &rayv1.DeletionPolicy{ Policy: &onFailurePolicy, }, } @@ -1187,10 +1187,10 @@ var _ = Context("RayJob with different submission modes", func() { By("Verify RayJob spec", func() { Expect(*rayJob.Spec.DeletionStrategy).To(Equal(rayv1.DeletionStrategy{ - OnSuccess: rayv1.DeletionPolicy{ + OnSuccess: &rayv1.DeletionPolicy{ Policy: &onSuccessPolicy, }, - OnFailure: rayv1.DeletionPolicy{ + OnFailure: &rayv1.DeletionPolicy{ Policy: &onFailurePolicy, }, })) @@ -1330,10 +1330,10 @@ var _ = Context("RayJob with different submission modes", func() { onSuccessPolicy := rayv1.DeleteWorkers onFailurePolicy := rayv1.DeleteWorkers deletionStrategy := rayv1.DeletionStrategy{ - OnSuccess: rayv1.DeletionPolicy{ + OnSuccess: &rayv1.DeletionPolicy{ Policy: &onSuccessPolicy, }, - OnFailure: rayv1.DeletionPolicy{ + OnFailure: 
&rayv1.DeletionPolicy{ Policy: &onFailurePolicy, }, } @@ -1343,10 +1343,10 @@ var _ = Context("RayJob with different submission modes", func() { By("Verify RayJob spec", func() { Expect(*rayJob.Spec.DeletionStrategy).To(Equal(rayv1.DeletionStrategy{ - OnSuccess: rayv1.DeletionPolicy{ + OnSuccess: &rayv1.DeletionPolicy{ Policy: &onSuccessPolicy, }, - OnFailure: rayv1.DeletionPolicy{ + OnFailure: &rayv1.DeletionPolicy{ Policy: &onFailurePolicy, }, })) @@ -1486,10 +1486,10 @@ var _ = Context("RayJob with different submission modes", func() { onSuccessPolicy := rayv1.DeleteSelf onFailurePolicy := rayv1.DeleteNone deletionStrategy := rayv1.DeletionStrategy{ - OnSuccess: rayv1.DeletionPolicy{ + OnSuccess: &rayv1.DeletionPolicy{ Policy: &onSuccessPolicy, }, - OnFailure: rayv1.DeletionPolicy{ + OnFailure: &rayv1.DeletionPolicy{ Policy: &onFailurePolicy, }, } @@ -1602,10 +1602,10 @@ var _ = Context("RayJob with different submission modes", func() { onSuccessPolicy := rayv1.DeleteNone onFailurePolicy := rayv1.DeleteSelf deletionStrategy := rayv1.DeletionStrategy{ - OnSuccess: rayv1.DeletionPolicy{ + OnSuccess: &rayv1.DeletionPolicy{ Policy: &onSuccessPolicy, }, - OnFailure: rayv1.DeletionPolicy{ + OnFailure: &rayv1.DeletionPolicy{ Policy: &onFailurePolicy, }, } @@ -1718,10 +1718,10 @@ var _ = Context("RayJob with different submission modes", func() { onSuccessPolicy := rayv1.DeleteNone onFailurePolicy := rayv1.DeleteNone deletionStrategy := rayv1.DeletionStrategy{ - OnSuccess: rayv1.DeletionPolicy{ + OnSuccess: &rayv1.DeletionPolicy{ Policy: &onSuccessPolicy, }, - OnFailure: rayv1.DeletionPolicy{ + OnFailure: &rayv1.DeletionPolicy{ Policy: &onFailurePolicy, }, } @@ -1731,10 +1731,10 @@ var _ = Context("RayJob with different submission modes", func() { By("Verify RayJob spec", func() { Expect(*rayJob.Spec.DeletionStrategy).To(Equal(rayv1.DeletionStrategy{ - OnSuccess: rayv1.DeletionPolicy{ + OnSuccess: &rayv1.DeletionPolicy{ Policy: &onSuccessPolicy, }, - OnFailure: rayv1.DeletionPolicy{ + OnFailure: &rayv1.DeletionPolicy{ Policy: &onFailurePolicy, }, })) @@ -1879,10 +1879,10 @@ var _ = Context("RayJob with different submission modes", func() { onSuccessPolicy := rayv1.DeleteCluster onFailurePolicy := rayv1.DeleteNone deletionStrategy := rayv1.DeletionStrategy{ - OnSuccess: rayv1.DeletionPolicy{ + OnSuccess: &rayv1.DeletionPolicy{ Policy: &onSuccessPolicy, }, - OnFailure: rayv1.DeletionPolicy{ + OnFailure: &rayv1.DeletionPolicy{ Policy: &onFailurePolicy, }, } @@ -1892,10 +1892,10 @@ var _ = Context("RayJob with different submission modes", func() { By("Verify RayJob spec", func() { Expect(*rayJob.Spec.DeletionStrategy).To(Equal(rayv1.DeletionStrategy{ - OnSuccess: rayv1.DeletionPolicy{ + OnSuccess: &rayv1.DeletionPolicy{ Policy: &onSuccessPolicy, }, - OnFailure: rayv1.DeletionPolicy{ + OnFailure: &rayv1.DeletionPolicy{ Policy: &onFailurePolicy, }, })) @@ -2031,5 +2031,1887 @@ var _ = Context("RayJob with different submission modes", func() { time.Second*3, time.Millisecond*500).Should(Succeed()) }) }) + + It("Should delete workers on success when a single 'DeleteWorkers' rule is set", func() { + ctx := context.Background() + namespace := "default" + rayJob := rayJobTemplate("rayjob-test-rule-deleteworkers-on-success", namespace) + rayCluster := &rayv1.RayCluster{} + + rayJob.Spec.DeletionStrategy = &rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteWorkers, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + }, + }, + }, + } + 
rayJob.Spec.ShutdownAfterJobFinishes = false + + By("Verify RayJob spec", func() { + Expect(*rayJob.Spec.DeletionStrategy).To(Equal(rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteWorkers, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + }, + }, + }, + })) + }) + + By("Create a RayJob custom resource", func() { + err := k8sClient.Create(ctx, rayJob) + Expect(err).NotTo(HaveOccurred(), "Failed to create RayJob") + Eventually( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Name, Namespace: namespace}, rayJob), + time.Second*3, time.Millisecond*500).Should(Succeed(), "Should be able to see RayJob: %v", rayJob.Name) + }) + + By("RayJobs's JobDeploymentStatus transitions from New to Initializing.", func() { + Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusInitializing), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // In Initializing state, Status.RayClusterName, Status.JobId, and Status.StartTime must be set. + Expect(rayJob.Status.RayClusterName).NotTo(BeEmpty()) + Expect(rayJob.Status.JobId).NotTo(BeEmpty()) + Expect(rayJob.Status.StartTime).NotTo(BeNil()) + }) + + By("In Initializing state, the RayCluster should eventually be created.", func() { + Eventually( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Status.RayClusterName, Namespace: namespace}, rayCluster), + time.Second*3, time.Millisecond*500).Should(Succeed(), "RayCluster %v not found", rayJob.Status.RayClusterName) + + // Check whether RayCluster is consistent with RayJob's RayClusterSpec. + Expect(rayCluster.Spec.WorkerGroupSpecs[0].Replicas).To(Equal(rayJob.Spec.RayClusterSpec.WorkerGroupSpecs[0].Replicas)) + Expect(rayCluster.Spec.RayVersion).To(Equal(rayJob.Spec.RayClusterSpec.RayVersion)) + + // TODO (kevin85421): Check the RayCluster labels. + Expect(rayCluster.Labels).Should(HaveKeyWithValue(utils.RayOriginatedFromCRNameLabelKey, rayJob.Name)) + Expect(rayCluster.Labels).Should(HaveKeyWithValue(utils.RayOriginatedFromCRDLabelKey, utils.RayOriginatedFromCRDLabelValue(utils.RayJobCRD))) + + Expect(rayCluster.Annotations).Should(Equal(rayJob.Annotations)) + }) + + By("Make RayCluster.Status.State to be rayv1.Ready", func() { + // The RayCluster is not 'Ready' yet because Pods are not running and ready. + Expect(rayCluster.Status.State).NotTo(Equal(rayv1.Ready)) + + updateHeadPodToRunningAndReady(ctx, rayJob.Status.RayClusterName, namespace) + updateWorkerPodsToRunningAndReady(ctx, rayJob.Status.RayClusterName, namespace) + + // The RayCluster.Status.State should be Ready. + Eventually( + getClusterState(ctx, namespace, rayCluster.Name), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.Ready)) + }) + + By("RayJobs's JobDeploymentStatus transitions from Initializing to Running.", func() { + Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusRunning), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // In Running state, the RayJob's Status.DashboardURL must be set. + Expect(rayJob.Status.DashboardURL).NotTo(BeEmpty()) + + // In Running state, the submitter Kubernetes Job must be created if this RayJob is in K8sJobMode. 
+ namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + err := k8sClient.Get(ctx, namespacedName, job) + Expect(err).NotTo(HaveOccurred(), "failed to get Kubernetes Job") + }) + + By("RayJobs's JobDeploymentStatus transitions from Running to Complete.", func() { + // Update fake dashboard client to return job info with "Succeeded" status. + getJobInfo := func(context.Context, string) (*utiltypes.RayJobInfo, error) { //nolint:unparam // This is a mock function so parameters are required + return &utiltypes.RayJobInfo{JobStatus: rayv1.JobStatusSucceeded, EndTime: uint64(time.Now().UnixMilli())}, nil + } + fakeRayDashboardClient.GetJobInfoMock.Store(&getJobInfo) + defer fakeRayDashboardClient.GetJobInfoMock.Store(nil) + + // RayJob transitions to Complete if and only if the corresponding submitter Kubernetes Job is Complete or Failed. + Consistently( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusRunning), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // Update the submitter Kubernetes Job to Complete. + namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + err := k8sClient.Get(ctx, namespacedName, job) + Expect(err).NotTo(HaveOccurred(), "failed to get Kubernetes Job") + + // Update the submitter Kubernetes Job to Complete. + conditions := []batchv1.JobCondition{ + {Type: batchv1.JobComplete, Status: corev1.ConditionTrue}, + } + job.Status.Conditions = conditions + Expect(k8sClient.Status().Update(ctx, job)).Should(Succeed()) + + // RayJob transitions to Complete. + Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*5, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusComplete), "jobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + }) + + By("If DeletionStrategy=DeleteWorkers, all workers should be deleted, but not the Head pod and submitter Job", func() { + // RayCluster exists + Consistently( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Status.RayClusterName, Namespace: namespace}, rayCluster), + time.Second*3, time.Millisecond*500).Should(Succeed(), "RayCluster %v not found", rayJob.Status.RayClusterName) + + // Check worker group is suspended + Expect(*rayCluster.Spec.WorkerGroupSpecs[0].Suspend).To(BeTrue()) + + // 0 worker Pods exist + workerPods := corev1.PodList{} + workerLabels := common.RayClusterWorkerPodsAssociationOptions(rayCluster).ToListOptions() + Eventually( + listResourceFunc(ctx, &workerPods, workerLabels...), + time.Second*3, time.Millisecond*500).Should(Equal(0), "expected 0 workers") + + // Head Pod is still running + headPods := corev1.PodList{} + headLabels := common.RayClusterHeadPodsAssociationOptions(rayCluster).ToListOptions() + Consistently( + listResourceFunc(ctx, &headPods, headLabels...), + time.Second*3, time.Millisecond*500).Should(Equal(1), "Head pod list should have only 1 Pod = %v", headPods.Items) + + namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + Consistently( + getResourceFunc(ctx, namespacedName, job), + time.Second*3, time.Millisecond*500).Should(Succeed()) + }) + }) + + It("Should delete workers on failure when a single 'DeleteWorkers' rule is set", func() { + ctx := context.Background() + namespace := "default" + rayJob := rayJobTemplate("rayjob-test-rule-deleteworkers-on-failure", namespace) + rayCluster := &rayv1.RayCluster{} + + rayJob.Spec.DeletionStrategy = &rayv1.DeletionStrategy{ + DeletionRules: 
[]rayv1.DeletionRule{ + { + Policy: rayv1.DeleteWorkers, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusFailed, + }, + }, + }, + } + rayJob.Spec.ShutdownAfterJobFinishes = false + + By("Verify RayJob spec", func() { + Expect(*rayJob.Spec.DeletionStrategy).To(Equal(rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteWorkers, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusFailed, + }, + }, + }, + })) + }) + + By("Create a RayJob custom resource", func() { + err := k8sClient.Create(ctx, rayJob) + Expect(err).NotTo(HaveOccurred(), "Failed to create RayJob") + Eventually( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Name, Namespace: namespace}, rayJob), + time.Second*3, time.Millisecond*500).Should(Succeed(), "Should be able to see RayJob: %v", rayJob.Name) + }) + + By("RayJobs's JobDeploymentStatus transitions from New to Initializing.", func() { + Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusInitializing), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // In Initializing state, Status.RayClusterName, Status.JobId, and Status.StartTime must be set. + Expect(rayJob.Status.RayClusterName).NotTo(BeEmpty()) + Expect(rayJob.Status.JobId).NotTo(BeEmpty()) + Expect(rayJob.Status.StartTime).NotTo(BeNil()) + }) + + By("In Initializing state, the RayCluster should eventually be created.", func() { + Eventually( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Status.RayClusterName, Namespace: namespace}, rayCluster), + time.Second*3, time.Millisecond*500).Should(Succeed(), "RayCluster %v not found", rayJob.Status.RayClusterName) + + // Check whether RayCluster is consistent with RayJob's RayClusterSpec. + Expect(rayCluster.Spec.WorkerGroupSpecs[0].Replicas).To(Equal(rayJob.Spec.RayClusterSpec.WorkerGroupSpecs[0].Replicas)) + Expect(rayCluster.Spec.RayVersion).To(Equal(rayJob.Spec.RayClusterSpec.RayVersion)) + + // TODO (kevin85421): Check the RayCluster labels. + Expect(rayCluster.Labels).Should(HaveKeyWithValue(utils.RayOriginatedFromCRNameLabelKey, rayJob.Name)) + Expect(rayCluster.Labels).Should(HaveKeyWithValue(utils.RayOriginatedFromCRDLabelKey, utils.RayOriginatedFromCRDLabelValue(utils.RayJobCRD))) + + Expect(rayCluster.Annotations).Should(Equal(rayJob.Annotations)) + }) + + By("Make RayCluster.Status.State to be rayv1.Ready", func() { + // The RayCluster is not 'Ready' yet because Pods are not running and ready. + Expect(rayCluster.Status.State).NotTo(Equal(rayv1.Ready)) + + updateHeadPodToRunningAndReady(ctx, rayJob.Status.RayClusterName, namespace) + updateWorkerPodsToRunningAndReady(ctx, rayJob.Status.RayClusterName, namespace) + + // The RayCluster.Status.State should be Ready. + Eventually( + getClusterState(ctx, namespace, rayCluster.Name), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.Ready)) + }) + + By("RayJobs's JobDeploymentStatus transitions from Initializing to Running.", func() { + Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusRunning), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // In Running state, the RayJob's Status.DashboardURL must be set. + Expect(rayJob.Status.DashboardURL).NotTo(BeEmpty()) + + // In Running state, the submitter Kubernetes Job must be created if this RayJob is in K8sJobMode. 
+ namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + err := k8sClient.Get(ctx, namespacedName, job) + Expect(err).NotTo(HaveOccurred(), "failed to get Kubernetes Job") + }) + + By("RayJobs's JobDeploymentStatus transitions from Running to Complete.", func() { + // Update fake dashboard client to return job info with "Failed" status. + getJobInfo := func(context.Context, string) (*utiltypes.RayJobInfo, error) { //nolint:unparam // This is a mock function so parameters are required + return &utiltypes.RayJobInfo{JobStatus: rayv1.JobStatusFailed, EndTime: uint64(time.Now().UnixMilli())}, nil + } + fakeRayDashboardClient.GetJobInfoMock.Store(&getJobInfo) + defer fakeRayDashboardClient.GetJobInfoMock.Store(nil) + + // RayJob transitions to Complete if and only if the corresponding submitter Kubernetes Job is Complete or Failed. + Consistently( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusRunning), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // Update the submitter Kubernetes Job to Complete. + namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + err := k8sClient.Get(ctx, namespacedName, job) + Expect(err).NotTo(HaveOccurred(), "failed to get Kubernetes Job") + + // Update the submitter Kubernetes Job to Complete. + conditions := []batchv1.JobCondition{ + {Type: batchv1.JobComplete, Status: corev1.ConditionTrue}, + } + job.Status.Conditions = conditions + Expect(k8sClient.Status().Update(ctx, job)).Should(Succeed()) + + // RayJob transitions to Failed. + Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*5, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusFailed), "jobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + }) + + By("If DeletionStrategy=DeleteWorkers, all workers should be deleted, but not the Head pod and submitter Job", func() { + // RayCluster exists + Consistently( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Status.RayClusterName, Namespace: namespace}, rayCluster), + time.Second*3, time.Millisecond*500).Should(Succeed(), "RayCluster %v not found", rayJob.Status.RayClusterName) + + // Check worker group is suspended + Expect(*rayCluster.Spec.WorkerGroupSpecs[0].Suspend).To(BeTrue()) + + // 0 worker Pods exist + workerPods := corev1.PodList{} + workerLabels := common.RayClusterWorkerPodsAssociationOptions(rayCluster).ToListOptions() + Eventually( + listResourceFunc(ctx, &workerPods, workerLabels...), + time.Second*3, time.Millisecond*500).Should(Equal(0), "expected 0 workers") + + // Head Pod is still running + headPods := corev1.PodList{} + headLabels := common.RayClusterHeadPodsAssociationOptions(rayCluster).ToListOptions() + Consistently( + listResourceFunc(ctx, &headPods, headLabels...), + time.Second*3, time.Millisecond*500).Should(Equal(1), "Head pod list should have only 1 Pod = %v", headPods.Items) + + namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + Consistently( + getResourceFunc(ctx, namespacedName, job), + time.Second*3, time.Millisecond*500).Should(Succeed()) + }) + }) + + It("Should delete cluster on success when a single 'DeleteCluster' rule is set", func() { + ctx := context.Background() + namespace := "default" + rayJob := rayJobTemplate("rayjob-test-rule-deletecluster-on-success", namespace) + rayCluster := &rayv1.RayCluster{} + + rayJob.Spec.DeletionStrategy = &rayv1.DeletionStrategy{ + DeletionRules: 
[]rayv1.DeletionRule{ + { + Policy: rayv1.DeleteCluster, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + }, + }, + }, + } + rayJob.Spec.ShutdownAfterJobFinishes = false + + By("Verify RayJob spec", func() { + Expect(*rayJob.Spec.DeletionStrategy).To(Equal(rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteCluster, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + }, + }, + }, + })) + }) + + By("Create a RayJob custom resource", func() { + err := k8sClient.Create(ctx, rayJob) + Expect(err).NotTo(HaveOccurred(), "Failed to create RayJob") + Eventually( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Name, Namespace: namespace}, rayJob), + time.Second*3, time.Millisecond*500).Should(Succeed(), "Should be able to see RayJob: %v", rayJob.Name) + }) + + By("RayJobs's JobDeploymentStatus transitions from New to Initializing.", func() { + Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusInitializing), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // In Initializing state, Status.RayClusterName, Status.JobId, and Status.StartTime must be set. + Expect(rayJob.Status.RayClusterName).NotTo(BeEmpty()) + Expect(rayJob.Status.JobId).NotTo(BeEmpty()) + Expect(rayJob.Status.StartTime).NotTo(BeNil()) + }) + + By("In Initializing state, the RayCluster should eventually be created.", func() { + Eventually( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Status.RayClusterName, Namespace: namespace}, rayCluster), + time.Second*3, time.Millisecond*500).Should(Succeed(), "RayCluster %v not found", rayJob.Status.RayClusterName) + + // Check whether RayCluster is consistent with RayJob's RayClusterSpec. + Expect(rayCluster.Spec.WorkerGroupSpecs[0].Replicas).To(Equal(rayJob.Spec.RayClusterSpec.WorkerGroupSpecs[0].Replicas)) + Expect(rayCluster.Spec.RayVersion).To(Equal(rayJob.Spec.RayClusterSpec.RayVersion)) + + // TODO (kevin85421): Check the RayCluster labels. + Expect(rayCluster.Labels).Should(HaveKeyWithValue(utils.RayOriginatedFromCRNameLabelKey, rayJob.Name)) + Expect(rayCluster.Labels).Should(HaveKeyWithValue(utils.RayOriginatedFromCRDLabelKey, utils.RayOriginatedFromCRDLabelValue(utils.RayJobCRD))) + + Expect(rayCluster.Annotations).Should(Equal(rayJob.Annotations)) + }) + + By("Make RayCluster.Status.State to be rayv1.Ready", func() { + // The RayCluster is not 'Ready' yet because Pods are not running and ready. + Expect(rayCluster.Status.State).NotTo(Equal(rayv1.Ready)) + + updateHeadPodToRunningAndReady(ctx, rayJob.Status.RayClusterName, namespace) + updateWorkerPodsToRunningAndReady(ctx, rayJob.Status.RayClusterName, namespace) + + // The RayCluster.Status.State should be Ready. + Eventually( + getClusterState(ctx, namespace, rayCluster.Name), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.Ready)) + }) + + By("RayJobs's JobDeploymentStatus transitions from Initializing to Running.", func() { + Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusRunning), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // In Running state, the RayJob's Status.DashboardURL must be set. + Expect(rayJob.Status.DashboardURL).NotTo(BeEmpty()) + + // In Running state, the submitter Kubernetes Job must be created if this RayJob is in K8sJobMode. 
+ namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + err := k8sClient.Get(ctx, namespacedName, job) + Expect(err).NotTo(HaveOccurred(), "failed to get Kubernetes Job") + }) + + By("RayJobs's JobDeploymentStatus transitions from Running to Complete.", func() { + // Update fake dashboard client to return job info with "Succeeded" status. + getJobInfo := func(context.Context, string) (*utiltypes.RayJobInfo, error) { //nolint:unparam // This is a mock function so parameters are required + return &utiltypes.RayJobInfo{JobStatus: rayv1.JobStatusSucceeded, EndTime: uint64(time.Now().UnixMilli())}, nil + } + fakeRayDashboardClient.GetJobInfoMock.Store(&getJobInfo) + defer fakeRayDashboardClient.GetJobInfoMock.Store(nil) + + // RayJob transitions to Complete if and only if the corresponding submitter Kubernetes Job is Complete or Failed. + Consistently( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusRunning), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // Update the submitter Kubernetes Job to Complete. + namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + err := k8sClient.Get(ctx, namespacedName, job) + Expect(err).NotTo(HaveOccurred(), "failed to get Kubernetes Job") + + // Update the submitter Kubernetes Job to Complete. + conditions := []batchv1.JobCondition{ + {Type: batchv1.JobComplete, Status: corev1.ConditionTrue}, + } + job.Status.Conditions = conditions + Expect(k8sClient.Status().Update(ctx, job)).Should(Succeed()) + + // RayJob transitions to Complete. + Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*5, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusComplete), "jobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + }) + + By("If DeletionStrategy=DeleteCluster, RayCluster should be deleted, but not the submitter Job.", func() { + Eventually( + func() bool { + return apierrors.IsNotFound(getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Status.RayClusterName, Namespace: namespace}, rayCluster)()) + }, + time.Second*3, time.Millisecond*500).Should(BeTrue()) + namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + Consistently( + getResourceFunc(ctx, namespacedName, job), + time.Second*3, time.Millisecond*500).Should(Succeed()) + }) + }) + + It("Should delete cluster on failure when a single 'DeleteCluster' rule is set", func() { + ctx := context.Background() + namespace := "default" + rayJob := rayJobTemplate("rayjob-test-rule-deletecluster-on-failure", namespace) + rayCluster := &rayv1.RayCluster{} + + rayJob.Spec.DeletionStrategy = &rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteCluster, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusFailed, + }, + }, + }, + } + rayJob.Spec.ShutdownAfterJobFinishes = false + + By("Verify RayJob spec", func() { + Expect(*rayJob.Spec.DeletionStrategy).To(Equal(rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteCluster, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusFailed, + }, + }, + }, + })) + }) + + By("Create a RayJob custom resource", func() { + err := k8sClient.Create(ctx, rayJob) + Expect(err).NotTo(HaveOccurred(), "Failed to create RayJob") + Eventually( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Name, Namespace: namespace}, rayJob), + time.Second*3, 
time.Millisecond*500).Should(Succeed(), "Should be able to see RayJob: %v", rayJob.Name) + }) + + By("RayJobs's JobDeploymentStatus transitions from New to Initializing.", func() { + Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusInitializing), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // In Initializing state, Status.RayClusterName, Status.JobId, and Status.StartTime must be set. + Expect(rayJob.Status.RayClusterName).NotTo(BeEmpty()) + Expect(rayJob.Status.JobId).NotTo(BeEmpty()) + Expect(rayJob.Status.StartTime).NotTo(BeNil()) + }) + + By("In Initializing state, the RayCluster should eventually be created.", func() { + Eventually( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Status.RayClusterName, Namespace: namespace}, rayCluster), + time.Second*3, time.Millisecond*500).Should(Succeed(), "RayCluster %v not found", rayJob.Status.RayClusterName) + + // Check whether RayCluster is consistent with RayJob's RayClusterSpec. + Expect(rayCluster.Spec.WorkerGroupSpecs[0].Replicas).To(Equal(rayJob.Spec.RayClusterSpec.WorkerGroupSpecs[0].Replicas)) + Expect(rayCluster.Spec.RayVersion).To(Equal(rayJob.Spec.RayClusterSpec.RayVersion)) + + // TODO (kevin85421): Check the RayCluster labels. + Expect(rayCluster.Labels).Should(HaveKeyWithValue(utils.RayOriginatedFromCRNameLabelKey, rayJob.Name)) + Expect(rayCluster.Labels).Should(HaveKeyWithValue(utils.RayOriginatedFromCRDLabelKey, utils.RayOriginatedFromCRDLabelValue(utils.RayJobCRD))) + + Expect(rayCluster.Annotations).Should(Equal(rayJob.Annotations)) + }) + + By("Make RayCluster.Status.State to be rayv1.Ready", func() { + // The RayCluster is not 'Ready' yet because Pods are not running and ready. + Expect(rayCluster.Status.State).NotTo(Equal(rayv1.Ready)) + + updateHeadPodToRunningAndReady(ctx, rayJob.Status.RayClusterName, namespace) + updateWorkerPodsToRunningAndReady(ctx, rayJob.Status.RayClusterName, namespace) + + // The RayCluster.Status.State should be Ready. + Eventually( + getClusterState(ctx, namespace, rayCluster.Name), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.Ready)) + }) + + By("RayJobs's JobDeploymentStatus transitions from Initializing to Running.", func() { + Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusRunning), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // In Running state, the RayJob's Status.DashboardURL must be set. + Expect(rayJob.Status.DashboardURL).NotTo(BeEmpty()) + + // In Running state, the submitter Kubernetes Job must be created if this RayJob is in K8sJobMode. + namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + err := k8sClient.Get(ctx, namespacedName, job) + Expect(err).NotTo(HaveOccurred(), "failed to get Kubernetes Job") + }) + + By("RayJobs's JobDeploymentStatus transitions from Running to Complete.", func() { + // Update fake dashboard client to return job info with "Failed" status. 
+ getJobInfo := func(context.Context, string) (*utiltypes.RayJobInfo, error) { //nolint:unparam // This is a mock function so parameters are required
+ return &utiltypes.RayJobInfo{JobStatus: rayv1.JobStatusFailed, EndTime: uint64(time.Now().UnixMilli())}, nil
+ }
+ fakeRayDashboardClient.GetJobInfoMock.Store(&getJobInfo)
+ defer fakeRayDashboardClient.GetJobInfoMock.Store(nil)
+
+ // RayJob transitions to Complete if and only if the corresponding submitter Kubernetes Job is Complete or Failed.
+ Consistently(
+ getRayJobDeploymentStatus(ctx, rayJob),
+ time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusRunning), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus)
+
+ // Update the submitter Kubernetes Job to Complete.
+ namespacedName := common.RayJobK8sJobNamespacedName(rayJob)
+ job := &batchv1.Job{}
+ err := k8sClient.Get(ctx, namespacedName, job)
+ Expect(err).NotTo(HaveOccurred(), "failed to get Kubernetes Job")
+
+ // Update the submitter Kubernetes Job to Complete.
+ conditions := []batchv1.JobCondition{
+ {Type: batchv1.JobComplete, Status: corev1.ConditionTrue},
+ }
+ job.Status.Conditions = conditions
+ Expect(k8sClient.Status().Update(ctx, job)).Should(Succeed())
+
+ // RayJob transitions to Failed.
+ Eventually(
+ getRayJobDeploymentStatus(ctx, rayJob),
+ time.Second*5, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusFailed), "jobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus)
+ })
+
+ By("If DeletionStrategy=DeleteCluster, RayCluster should be deleted, but not the submitter Job.", func() {
+ Eventually(
+ func() bool {
+ return apierrors.IsNotFound(getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Status.RayClusterName, Namespace: namespace}, rayCluster)())
+ },
+ time.Second*3, time.Millisecond*500).Should(BeTrue())
+ namespacedName := common.RayJobK8sJobNamespacedName(rayJob)
+ job := &batchv1.Job{}
+ Consistently(
+ getResourceFunc(ctx, namespacedName, job),
+ time.Second*3, time.Millisecond*500).Should(Succeed())
+ })
+ })
+
+ It("Should delete self on success when a single 'DeleteSelf' rule is set", func() {
+ ctx := context.Background()
+ namespace := "default"
+ rayJob := rayJobTemplate("rayjob-test-rule-deleteself-on-success", namespace)
+ rayCluster := &rayv1.RayCluster{}
+
+ rayJob.Spec.DeletionStrategy = &rayv1.DeletionStrategy{
+ DeletionRules: []rayv1.DeletionRule{
+ {
+ Policy: rayv1.DeleteSelf,
+ Condition: rayv1.DeletionCondition{
+ JobStatus: rayv1.JobStatusSucceeded,
+ },
+ },
+ },
+ }
+ rayJob.Spec.ShutdownAfterJobFinishes = false
+
+ By("Create a RayJob custom resource", func() {
+ err := k8sClient.Create(ctx, rayJob)
+ Expect(err).NotTo(HaveOccurred(), "Failed to create RayJob")
+ Eventually(
+ getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Name, Namespace: namespace}, rayJob),
+ time.Second*3, time.Millisecond*500).Should(Succeed(), "Should be able to see RayJob: %v", rayJob.Name)
+ })
+
+ By("RayJobs's JobDeploymentStatus transitions from New to Initializing.", func() {
+ Eventually(
+ getRayJobDeploymentStatus(ctx, rayJob),
+ time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusInitializing), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus)
+
+ // In Initializing state, Status.RayClusterName, Status.JobId, and Status.StartTime must be set.
+ Expect(rayJob.Status.RayClusterName).NotTo(BeEmpty()) + Expect(rayJob.Status.JobId).NotTo(BeEmpty()) + Expect(rayJob.Status.StartTime).NotTo(BeNil()) + }) + + By("In Initializing state, the RayCluster should eventually be created.", func() { + Eventually( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Status.RayClusterName, Namespace: namespace}, rayCluster), + time.Second*3, time.Millisecond*500).Should(Succeed(), "RayCluster %v not found", rayJob.Status.RayClusterName) + + // Check whether RayCluster is consistent with RayJob's RayClusterSpec. + Expect(rayCluster.Spec.WorkerGroupSpecs[0].Replicas).To(Equal(rayJob.Spec.RayClusterSpec.WorkerGroupSpecs[0].Replicas)) + Expect(rayCluster.Spec.RayVersion).To(Equal(rayJob.Spec.RayClusterSpec.RayVersion)) + + // TODO (kevin85421): Check the RayCluster labels. + Expect(rayCluster.Labels).Should(HaveKeyWithValue(utils.RayOriginatedFromCRNameLabelKey, rayJob.Name)) + Expect(rayCluster.Labels).Should(HaveKeyWithValue(utils.RayOriginatedFromCRDLabelKey, utils.RayOriginatedFromCRDLabelValue(utils.RayJobCRD))) + + Expect(rayCluster.Annotations).Should(Equal(rayJob.Annotations)) + }) + + By("Make RayCluster.Status.State to be rayv1.Ready", func() { + // The RayCluster is not 'Ready' yet because Pods are not running and ready. + Expect(rayCluster.Status.State).NotTo(Equal(rayv1.Ready)) + + updateHeadPodToRunningAndReady(ctx, rayJob.Status.RayClusterName, namespace) + updateWorkerPodsToRunningAndReady(ctx, rayJob.Status.RayClusterName, namespace) + + // The RayCluster.Status.State should be Ready. + Eventually( + getClusterState(ctx, namespace, rayCluster.Name), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.Ready)) + }) + + By("RayJobs's JobDeploymentStatus transitions from Initializing to Running.", func() { + Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusRunning), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // In Running state, the RayJob's Status.DashboardURL must be set. + Expect(rayJob.Status.DashboardURL).NotTo(BeEmpty()) + + // In Running state, the submitter Kubernetes Job must be created if this RayJob is in K8sJobMode. + namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + err := k8sClient.Get(ctx, namespacedName, job) + Expect(err).NotTo(HaveOccurred(), "failed to get Kubernetes Job") + }) + + By("RayJobs's JobDeploymentStatus transitions from Running to Complete.", func() { + // Update fake dashboard client to return job info with "Succeeded" status. + getJobInfo := func(context.Context, string) (*utiltypes.RayJobInfo, error) { //nolint:unparam // This is a mock function so parameters are required + return &utiltypes.RayJobInfo{JobStatus: rayv1.JobStatusSucceeded, EndTime: uint64(time.Now().UnixMilli())}, nil + } + fakeRayDashboardClient.GetJobInfoMock.Store(&getJobInfo) + + // RayJob transitions to Complete if and only if the corresponding submitter Kubernetes Job is Complete or Failed. + Consistently( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusRunning), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // Update the submitter Kubernetes Job to Complete. 
+ namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + err := k8sClient.Get(ctx, namespacedName, job) + Expect(err).NotTo(HaveOccurred(), "failed to get Kubernetes Job") + + // Update the submitter Kubernetes Job to Complete. + conditions := []batchv1.JobCondition{ + {Type: batchv1.JobComplete, Status: corev1.ConditionTrue}, + } + job.Status.Conditions = conditions + Expect(k8sClient.Status().Update(ctx, job)).Should(Succeed()) + }) + + By("If DeletionStrategy=DeleteSelf, the RayJob is deleted", func() { + Eventually( + func() bool { + return apierrors.IsNotFound(k8sClient.Get(ctx, client.ObjectKey{Name: rayJob.Name, Namespace: namespace}, rayJob)) + }, time.Second*5, time.Millisecond*500).Should(BeTrue()) + }) + }) + + It("Should delete self on failure when a single 'DeleteSelf' rule is set", func() { + ctx := context.Background() + namespace := "default" + rayJob := rayJobTemplate("rayjob-test-rule-deleteself-on-failure", namespace) + rayCluster := &rayv1.RayCluster{} + + rayJob.Spec.DeletionStrategy = &rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteSelf, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusFailed, + }, + }, + }, + } + rayJob.Spec.ShutdownAfterJobFinishes = false + + By("Create a RayJob custom resource", func() { + err := k8sClient.Create(ctx, rayJob) + Expect(err).NotTo(HaveOccurred(), "Failed to create RayJob") + Eventually( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Name, Namespace: namespace}, rayJob), + time.Second*3, time.Millisecond*500).Should(Succeed(), "Should be able to see RayJob: %v", rayJob.Name) + }) + + By("RayJobs's JobDeploymentStatus transitions from New to Initializing.", func() { + Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusInitializing), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // In Initializing state, Status.RayClusterName, Status.JobId, and Status.StartTime must be set. + Expect(rayJob.Status.RayClusterName).NotTo(BeEmpty()) + Expect(rayJob.Status.JobId).NotTo(BeEmpty()) + Expect(rayJob.Status.StartTime).NotTo(BeNil()) + }) + + By("In Initializing state, the RayCluster should eventually be created.", func() { + Eventually( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Status.RayClusterName, Namespace: namespace}, rayCluster), + time.Second*3, time.Millisecond*500).Should(Succeed(), "RayCluster %v not found", rayJob.Status.RayClusterName) + + // Check whether RayCluster is consistent with RayJob's RayClusterSpec. + Expect(rayCluster.Spec.WorkerGroupSpecs[0].Replicas).To(Equal(rayJob.Spec.RayClusterSpec.WorkerGroupSpecs[0].Replicas)) + Expect(rayCluster.Spec.RayVersion).To(Equal(rayJob.Spec.RayClusterSpec.RayVersion)) + + // TODO (kevin85421): Check the RayCluster labels. + Expect(rayCluster.Labels).Should(HaveKeyWithValue(utils.RayOriginatedFromCRNameLabelKey, rayJob.Name)) + Expect(rayCluster.Labels).Should(HaveKeyWithValue(utils.RayOriginatedFromCRDLabelKey, utils.RayOriginatedFromCRDLabelValue(utils.RayJobCRD))) + + Expect(rayCluster.Annotations).Should(Equal(rayJob.Annotations)) + }) + + By("Make RayCluster.Status.State to be rayv1.Ready", func() { + // The RayCluster is not 'Ready' yet because Pods are not running and ready. 
+ Expect(rayCluster.Status.State).NotTo(Equal(rayv1.Ready)) + + updateHeadPodToRunningAndReady(ctx, rayJob.Status.RayClusterName, namespace) + updateWorkerPodsToRunningAndReady(ctx, rayJob.Status.RayClusterName, namespace) + + // The RayCluster.Status.State should be Ready. + Eventually( + getClusterState(ctx, namespace, rayCluster.Name), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.Ready)) + }) + + By("RayJobs's JobDeploymentStatus transitions from Initializing to Running.", func() { + Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusRunning), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // In Running state, the RayJob's Status.DashboardURL must be set. + Expect(rayJob.Status.DashboardURL).NotTo(BeEmpty()) + + // In Running state, the submitter Kubernetes Job must be created if this RayJob is in K8sJobMode. + namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + err := k8sClient.Get(ctx, namespacedName, job) + Expect(err).NotTo(HaveOccurred(), "failed to get Kubernetes Job") + }) + + By("RayJobs's JobDeploymentStatus transitions from Running to Complete.", func() { + // Update fake dashboard client to return job info with "Failed" status. + getJobInfo := func(context.Context, string) (*utiltypes.RayJobInfo, error) { //nolint:unparam // This is a mock function so parameters are required + return &utiltypes.RayJobInfo{JobStatus: rayv1.JobStatusFailed, EndTime: uint64(time.Now().UnixMilli())}, nil + } + fakeRayDashboardClient.GetJobInfoMock.Store(&getJobInfo) + + // RayJob transitions to Complete if and only if the corresponding submitter Kubernetes Job is Complete or Failed. + Consistently( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusRunning), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // Update the submitter Kubernetes Job to Complete. + namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + err := k8sClient.Get(ctx, namespacedName, job) + Expect(err).NotTo(HaveOccurred(), "failed to get Kubernetes Job") + + // Update the submitter Kubernetes Job to Complete. 
+ conditions := []batchv1.JobCondition{ + {Type: batchv1.JobComplete, Status: corev1.ConditionTrue}, + } + job.Status.Conditions = conditions + Expect(k8sClient.Status().Update(ctx, job)).Should(Succeed()) + }) + + By("If DeletionStrategy=DeleteSelf, the RayJob is deleted", func() { + Eventually( + func() bool { + return apierrors.IsNotFound(k8sClient.Get(ctx, client.ObjectKey{Name: rayJob.Name, Namespace: namespace}, rayJob)) + }, time.Second*5, time.Millisecond*500).Should(BeTrue()) + }) + }) + + It("Should delete none on success when a single 'DeleteNone' rule is set", func() { + ctx := context.Background() + namespace := "default" + rayJob := rayJobTemplate("rayjob-test-rule-deletenone-on-success", namespace) + rayCluster := &rayv1.RayCluster{} + + rayJob.Spec.DeletionStrategy = &rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteNone, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + }, + }, + }, + } + rayJob.Spec.ShutdownAfterJobFinishes = false + + By("Verify RayJob spec", func() { + Expect(*rayJob.Spec.DeletionStrategy).To(Equal(rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteNone, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + }, + }, + }, + })) + }) + + By("Create a RayJob custom resource", func() { + err := k8sClient.Create(ctx, rayJob) + Expect(err).NotTo(HaveOccurred(), "Failed to create RayJob") + Eventually( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Name, Namespace: namespace}, rayJob), + time.Second*3, time.Millisecond*500).Should(Succeed(), "Should be able to see RayJob: %v", rayJob.Name) + }) + + By("RayJobs's JobDeploymentStatus transitions from New to Initializing.", func() { + Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusInitializing), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // In Initializing state, Status.RayClusterName, Status.JobId, and Status.StartTime must be set. + Expect(rayJob.Status.RayClusterName).NotTo(BeEmpty()) + Expect(rayJob.Status.JobId).NotTo(BeEmpty()) + Expect(rayJob.Status.StartTime).NotTo(BeNil()) + }) + + By("In Initializing state, the RayCluster should eventually be created.", func() { + Eventually( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Status.RayClusterName, Namespace: namespace}, rayCluster), + time.Second*3, time.Millisecond*500).Should(Succeed(), "RayCluster %v not found", rayJob.Status.RayClusterName) + + // Check whether RayCluster is consistent with RayJob's RayClusterSpec. + Expect(rayCluster.Spec.WorkerGroupSpecs[0].Replicas).To(Equal(rayJob.Spec.RayClusterSpec.WorkerGroupSpecs[0].Replicas)) + Expect(rayCluster.Spec.RayVersion).To(Equal(rayJob.Spec.RayClusterSpec.RayVersion)) + + // TODO (kevin85421): Check the RayCluster labels. + Expect(rayCluster.Labels).Should(HaveKeyWithValue(utils.RayOriginatedFromCRNameLabelKey, rayJob.Name)) + Expect(rayCluster.Labels).Should(HaveKeyWithValue(utils.RayOriginatedFromCRDLabelKey, utils.RayOriginatedFromCRDLabelValue(utils.RayJobCRD))) + + Expect(rayCluster.Annotations).Should(Equal(rayJob.Annotations)) + }) + + By("Make RayCluster.Status.State to be rayv1.Ready", func() { + // The RayCluster is not 'Ready' yet because Pods are not running and ready. 
+ Expect(rayCluster.Status.State).NotTo(Equal(rayv1.Ready)) + + updateHeadPodToRunningAndReady(ctx, rayJob.Status.RayClusterName, namespace) + updateWorkerPodsToRunningAndReady(ctx, rayJob.Status.RayClusterName, namespace) + + // The RayCluster.Status.State should be Ready. + Eventually( + getClusterState(ctx, namespace, rayCluster.Name), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.Ready)) + }) + + By("RayJobs's JobDeploymentStatus transitions from Initializing to Running.", func() { + Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusRunning), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // In Running state, the RayJob's Status.DashboardURL must be set. + Expect(rayJob.Status.DashboardURL).NotTo(BeEmpty()) + + // In Running state, the submitter Kubernetes Job must be created if this RayJob is in K8sJobMode. + namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + err := k8sClient.Get(ctx, namespacedName, job) + Expect(err).NotTo(HaveOccurred(), "failed to get Kubernetes Job") + }) + + By("RayJobs's JobDeploymentStatus transitions from Running to Complete.", func() { + // Update fake dashboard client to return job info with "Succeeded" status. + getJobInfo := func(context.Context, string) (*utiltypes.RayJobInfo, error) { //nolint:unparam // This is a mock function so parameters are required + return &utiltypes.RayJobInfo{JobStatus: rayv1.JobStatusSucceeded, EndTime: uint64(time.Now().UnixMilli())}, nil + } + fakeRayDashboardClient.GetJobInfoMock.Store(&getJobInfo) + defer fakeRayDashboardClient.GetJobInfoMock.Store(nil) + + // RayJob transitions to Complete if and only if the corresponding submitter Kubernetes Job is Complete or Failed. + Consistently( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusRunning), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // Update the submitter Kubernetes Job to Complete. + namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + err := k8sClient.Get(ctx, namespacedName, job) + Expect(err).NotTo(HaveOccurred(), "failed to get Kubernetes Job") + + // Update the submitter Kubernetes Job to Complete. + conditions := []batchv1.JobCondition{ + {Type: batchv1.JobComplete, Status: corev1.ConditionTrue}, + } + job.Status.Conditions = conditions + Expect(k8sClient.Status().Update(ctx, job)).Should(Succeed()) + + // RayJob transitions to Complete. 
+ Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*5, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusComplete), "jobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + }) + + By("If DeletionStrategy=DeleteNone, no resources are deleted", func() { + // RayJob exists + Consistently( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Name, Namespace: namespace}, rayJob), + time.Second*3, time.Millisecond*500).Should(Succeed(), "RayJob %v not found", rayJob) + + // RayCluster exists + Consistently( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Status.RayClusterName, Namespace: namespace}, rayCluster), + time.Second*3, time.Millisecond*500).Should(Succeed(), "RayCluster %v not found", rayJob.Status.RayClusterName) + + // Worker replicas set to 3 + Expect(*rayCluster.Spec.WorkerGroupSpecs[0].Replicas).To(Equal(int32(3))) + + // 3 worker Pods exist + workerPods := corev1.PodList{} + workerLabels := common.RayClusterWorkerPodsAssociationOptions(rayCluster).ToListOptions() + Consistently( + listResourceFunc(ctx, &workerPods, workerLabels...), + time.Second*3, time.Millisecond*500).Should(Equal(3), "expected 3 workers") + + // Head Pod is still running + headPods := corev1.PodList{} + headLabels := common.RayClusterHeadPodsAssociationOptions(rayCluster).ToListOptions() + Consistently( + listResourceFunc(ctx, &headPods, headLabels...), + time.Second*3, time.Millisecond*500).Should(Equal(1), "Head pod list should have only 1 Pod = %v", headPods.Items) + + namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + Consistently( + getResourceFunc(ctx, namespacedName, job), + time.Second*3, time.Millisecond*500).Should(Succeed()) + }) + }) + + It("Should delete none on failure when a single 'DeleteNone' rule is set", func() { + ctx := context.Background() + namespace := "default" + rayJob := rayJobTemplate("rayjob-test-rule-deletenone-on-failure", namespace) + rayCluster := &rayv1.RayCluster{} + + rayJob.Spec.DeletionStrategy = &rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteNone, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusFailed, + }, + }, + }, + } + rayJob.Spec.ShutdownAfterJobFinishes = false + + By("Verify RayJob spec", func() { + Expect(*rayJob.Spec.DeletionStrategy).To(Equal(rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteNone, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusFailed, + }, + }, + }, + })) + }) + + By("Create a RayJob custom resource", func() { + err := k8sClient.Create(ctx, rayJob) + Expect(err).NotTo(HaveOccurred(), "Failed to create RayJob") + Eventually( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Name, Namespace: namespace}, rayJob), + time.Second*3, time.Millisecond*500).Should(Succeed(), "Should be able to see RayJob: %v", rayJob.Name) + }) + + By("RayJobs's JobDeploymentStatus transitions from New to Initializing.", func() { + Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusInitializing), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // In Initializing state, Status.RayClusterName, Status.JobId, and Status.StartTime must be set. 
+ Expect(rayJob.Status.RayClusterName).NotTo(BeEmpty()) + Expect(rayJob.Status.JobId).NotTo(BeEmpty()) + Expect(rayJob.Status.StartTime).NotTo(BeNil()) + }) + + By("In Initializing state, the RayCluster should eventually be created.", func() { + Eventually( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Status.RayClusterName, Namespace: namespace}, rayCluster), + time.Second*3, time.Millisecond*500).Should(Succeed(), "RayCluster %v not found", rayJob.Status.RayClusterName) + + // Check whether RayCluster is consistent with RayJob's RayClusterSpec. + Expect(rayCluster.Spec.WorkerGroupSpecs[0].Replicas).To(Equal(rayJob.Spec.RayClusterSpec.WorkerGroupSpecs[0].Replicas)) + Expect(rayCluster.Spec.RayVersion).To(Equal(rayJob.Spec.RayClusterSpec.RayVersion)) + + // TODO (kevin85421): Check the RayCluster labels. + Expect(rayCluster.Labels).Should(HaveKeyWithValue(utils.RayOriginatedFromCRNameLabelKey, rayJob.Name)) + Expect(rayCluster.Labels).Should(HaveKeyWithValue(utils.RayOriginatedFromCRDLabelKey, utils.RayOriginatedFromCRDLabelValue(utils.RayJobCRD))) + + Expect(rayCluster.Annotations).Should(Equal(rayJob.Annotations)) + }) + + By("Make RayCluster.Status.State to be rayv1.Ready", func() { + // The RayCluster is not 'Ready' yet because Pods are not running and ready. + Expect(rayCluster.Status.State).NotTo(Equal(rayv1.Ready)) + + updateHeadPodToRunningAndReady(ctx, rayJob.Status.RayClusterName, namespace) + updateWorkerPodsToRunningAndReady(ctx, rayJob.Status.RayClusterName, namespace) + + // The RayCluster.Status.State should be Ready. + Eventually( + getClusterState(ctx, namespace, rayCluster.Name), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.Ready)) + }) + + By("RayJobs's JobDeploymentStatus transitions from Initializing to Running.", func() { + Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusRunning), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // In Running state, the RayJob's Status.DashboardURL must be set. + Expect(rayJob.Status.DashboardURL).NotTo(BeEmpty()) + + // In Running state, the submitter Kubernetes Job must be created if this RayJob is in K8sJobMode. + namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + err := k8sClient.Get(ctx, namespacedName, job) + Expect(err).NotTo(HaveOccurred(), "failed to get Kubernetes Job") + }) + + By("RayJobs's JobDeploymentStatus transitions from Running to Complete.", func() { + // Update fake dashboard client to return job info with "Failed" status. + getJobInfo := func(context.Context, string) (*utiltypes.RayJobInfo, error) { //nolint:unparam // This is a mock function so parameters are required + return &utiltypes.RayJobInfo{JobStatus: rayv1.JobStatusFailed, EndTime: uint64(time.Now().UnixMilli())}, nil + } + fakeRayDashboardClient.GetJobInfoMock.Store(&getJobInfo) + defer fakeRayDashboardClient.GetJobInfoMock.Store(nil) + + // RayJob transitions to Complete if and only if the corresponding submitter Kubernetes Job is Complete or Failed. + Consistently( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusRunning), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // Update the submitter Kubernetes Job to Complete. 
+ namespacedName := common.RayJobK8sJobNamespacedName(rayJob)
+ job := &batchv1.Job{}
+ err := k8sClient.Get(ctx, namespacedName, job)
+ Expect(err).NotTo(HaveOccurred(), "failed to get Kubernetes Job")
+
+ // Update the submitter Kubernetes Job to Complete.
+ conditions := []batchv1.JobCondition{
+ {Type: batchv1.JobComplete, Status: corev1.ConditionTrue},
+ }
+ job.Status.Conditions = conditions
+ Expect(k8sClient.Status().Update(ctx, job)).Should(Succeed())
+
+ // RayJob transitions to Failed.
+ Eventually(
+ getRayJobDeploymentStatus(ctx, rayJob),
+ time.Second*5, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusFailed), "jobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus)
+ })
+
+ By("If DeletionStrategy=DeleteNone, no resources are deleted", func() {
+ // RayJob exists
+ Consistently(
+ getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Name, Namespace: namespace}, rayJob),
+ time.Second*3, time.Millisecond*500).Should(Succeed(), "RayJob %v not found", rayJob)
+
+ // RayCluster exists
+ Consistently(
+ getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Status.RayClusterName, Namespace: namespace}, rayCluster),
+ time.Second*3, time.Millisecond*500).Should(Succeed(), "RayCluster %v not found", rayJob.Status.RayClusterName)
+
+ // Worker replicas set to 3
+ Expect(*rayCluster.Spec.WorkerGroupSpecs[0].Replicas).To(Equal(int32(3)))
+
+ // 3 worker Pods exist
+ workerPods := corev1.PodList{}
+ workerLabels := common.RayClusterWorkerPodsAssociationOptions(rayCluster).ToListOptions()
+ Consistently(
+ listResourceFunc(ctx, &workerPods, workerLabels...),
+ time.Second*3, time.Millisecond*500).Should(Equal(3), "expected 3 workers")
+
+ // Head Pod is still running
+ headPods := corev1.PodList{}
+ headLabels := common.RayClusterHeadPodsAssociationOptions(rayCluster).ToListOptions()
+ Consistently(
+ listResourceFunc(ctx, &headPods, headLabels...),
+ time.Second*3, time.Millisecond*500).Should(Equal(1), "Head pod list should have only 1 Pod = %v", headPods.Items)
+
+ namespacedName := common.RayJobK8sJobNamespacedName(rayJob)
+ job := &batchv1.Job{}
+ Consistently(
+ getResourceFunc(ctx, namespacedName, job),
+ time.Second*3, time.Millisecond*500).Should(Succeed())
+ })
+ })
+
+ It("Should execute MOST impactful rule (DeleteSelf) when all rules are overdue on success", func() {
+ ctx := context.Background()
+ namespace := "default"
+ rayJob := rayJobTemplate("rayjob-test-impactful-rule-override-on-success", namespace)
+ rayCluster := &rayv1.RayCluster{}
+
+ // Define the multi-stage DeletionStrategy
+ rayJob.Spec.DeletionStrategy = &rayv1.DeletionStrategy{
+ DeletionRules: []rayv1.DeletionRule{
+ {
+ Policy: rayv1.DeleteWorkers,
+ Condition: rayv1.DeletionCondition{
+ JobStatus: rayv1.JobStatusSucceeded,
+ TTLSecondsAfterFinished: 0,
+ },
+ },
+ {
+ Policy: rayv1.DeleteCluster,
+ Condition: rayv1.DeletionCondition{
+ JobStatus: rayv1.JobStatusSucceeded,
+ TTLSecondsAfterFinished: 0,
+ },
+ },
+ {
+ Policy: rayv1.DeleteSelf,
+ Condition: rayv1.DeletionCondition{
+ JobStatus: rayv1.JobStatusSucceeded,
+ TTLSecondsAfterFinished: 0,
+ },
+ },
+ },
+ }
+ rayJob.Spec.ShutdownAfterJobFinishes = false
+
+ By("Verify RayJob spec", func() {
+ Expect(*rayJob.Spec.DeletionStrategy).To(Equal(rayv1.DeletionStrategy{
+ DeletionRules: []rayv1.DeletionRule{
+ {
+ Policy: rayv1.DeleteWorkers,
+ Condition: rayv1.DeletionCondition{
+ JobStatus: rayv1.JobStatusSucceeded,
+ TTLSecondsAfterFinished: 0,
+ },
+ },
+ {
+ Policy: rayv1.DeleteCluster,
+ Condition: rayv1.DeletionCondition{
+
JobStatus: rayv1.JobStatusSucceeded, + TTLSecondsAfterFinished: 0, + }, + }, + { + Policy: rayv1.DeleteSelf, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + TTLSecondsAfterFinished: 0, + }, + }, + }, + })) + }) + + By("Create a RayJob custom resource with multi-stage deletion rules", func() { + err := k8sClient.Create(ctx, rayJob) + Expect(err).NotTo(HaveOccurred(), "Failed to create RayJob") + Eventually( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Name, Namespace: namespace}, rayJob), + time.Second*3, time.Millisecond*500).Should(Succeed(), "Should be able to see RayJob: %v", rayJob.Name) + }) + + By("RayJobs's JobDeploymentStatus transitions from New to Initializing.", func() { + Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusInitializing), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // In Initializing state, Status.RayClusterName, Status.JobId, and Status.StartTime must be set. + Expect(rayJob.Status.RayClusterName).NotTo(BeEmpty()) + Expect(rayJob.Status.JobId).NotTo(BeEmpty()) + Expect(rayJob.Status.StartTime).NotTo(BeNil()) + }) + + By("In Initializing state, the RayCluster should eventually be created.", func() { + Eventually( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Status.RayClusterName, Namespace: namespace}, rayCluster), + time.Second*3, time.Millisecond*500).Should(Succeed(), "RayCluster %v not found", rayJob.Status.RayClusterName) + + // Check whether RayCluster is consistent with RayJob's RayClusterSpec. + Expect(rayCluster.Spec.WorkerGroupSpecs[0].Replicas).To(Equal(rayJob.Spec.RayClusterSpec.WorkerGroupSpecs[0].Replicas)) + Expect(rayCluster.Spec.RayVersion).To(Equal(rayJob.Spec.RayClusterSpec.RayVersion)) + + // TODO (kevin85421): Check the RayCluster labels. + Expect(rayCluster.Labels).Should(HaveKeyWithValue(utils.RayOriginatedFromCRNameLabelKey, rayJob.Name)) + Expect(rayCluster.Labels).Should(HaveKeyWithValue(utils.RayOriginatedFromCRDLabelKey, utils.RayOriginatedFromCRDLabelValue(utils.RayJobCRD))) + + Expect(rayCluster.Annotations).Should(Equal(rayJob.Annotations)) + }) + + By("Make RayCluster.Status.State to be rayv1.Ready", func() { + // The RayCluster is not 'Ready' yet because Pods are not running and ready. + Expect(rayCluster.Status.State).NotTo(Equal(rayv1.Ready)) + + updateHeadPodToRunningAndReady(ctx, rayJob.Status.RayClusterName, namespace) + updateWorkerPodsToRunningAndReady(ctx, rayJob.Status.RayClusterName, namespace) + + // The RayCluster.Status.State should be Ready. + Eventually( + getClusterState(ctx, namespace, rayCluster.Name), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.Ready)) + }) + + By("RayJobs's JobDeploymentStatus transitions from Initializing to Running.", func() { + Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusRunning), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // In Running state, the RayJob's Status.DashboardURL must be set. + Expect(rayJob.Status.DashboardURL).NotTo(BeEmpty()) + + // In Running state, the submitter Kubernetes Job must be created if this RayJob is in K8sJobMode. 
+ namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + err := k8sClient.Get(ctx, namespacedName, job) + Expect(err).NotTo(HaveOccurred(), "failed to get Kubernetes Job") + }) + + By("RayJobs's JobDeploymentStatus transitions from Running to Complete.", func() { + // Update fake dashboard client to return job info with "Succeeded" status. + getJobInfo := func(context.Context, string) (*utiltypes.RayJobInfo, error) { //nolint:unparam // This is a mock function so parameters are required + return &utiltypes.RayJobInfo{JobStatus: rayv1.JobStatusSucceeded, EndTime: uint64(time.Now().UnixMilli())}, nil + } + fakeRayDashboardClient.GetJobInfoMock.Store(&getJobInfo) + + // RayJob transitions to Complete if and only if the corresponding submitter Kubernetes Job is Complete or Failed. + Consistently( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusRunning), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // Update the submitter Kubernetes Job to Complete. + namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + err := k8sClient.Get(ctx, namespacedName, job) + Expect(err).NotTo(HaveOccurred(), "failed to get Kubernetes Job") + + // Update the submitter Kubernetes Job to Complete. + conditions := []batchv1.JobCondition{ + {Type: batchv1.JobComplete, Status: corev1.ConditionTrue}, + } + job.Status.Conditions = conditions + Expect(k8sClient.Status().Update(ctx, job)).Should(Succeed()) + }) + + By("Verify RayJob itself is deleted", func() { + Eventually( + func() bool { + return apierrors.IsNotFound(k8sClient.Get(ctx, client.ObjectKey{Name: rayJob.Name, Namespace: namespace}, rayJob)) + }, time.Second*5, time.Millisecond*500).Should(BeTrue()) + }) + }) + + It("Should execute MOST impactful rule (DeleteSelf) when all rules are overdue on failure", func() { + ctx := context.Background() + namespace := "default" + rayJob := rayJobTemplate("rayjob-test-impactful-rule-override-on-failure", namespace) + rayCluster := &rayv1.RayCluster{} + + // Define the multi-stage DeletionStrategy + rayJob.Spec.DeletionStrategy = &rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteWorkers, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusFailed, + TTLSecondsAfterFinished: 0, + }, + }, + { + Policy: rayv1.DeleteCluster, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusFailed, + TTLSecondsAfterFinished: 0, + }, + }, + { + Policy: rayv1.DeleteSelf, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusFailed, + TTLSecondsAfterFinished: 0, + }, + }, + }, + } + rayJob.Spec.ShutdownAfterJobFinishes = false + + By("Verify RayJob spec", func() { + Expect(*rayJob.Spec.DeletionStrategy).To(Equal(rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteWorkers, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusFailed, + TTLSecondsAfterFinished: 0, + }, + }, + { + Policy: rayv1.DeleteCluster, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusFailed, + TTLSecondsAfterFinished: 0, + }, + }, + { + Policy: rayv1.DeleteSelf, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusFailed, + TTLSecondsAfterFinished: 0, + }, + }, + }, + })) + }) + + By("Create a RayJob custom resource with multi-stage deletion rules", func() { + err := k8sClient.Create(ctx, rayJob) + Expect(err).NotTo(HaveOccurred(), "Failed to create RayJob") + 
Eventually( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Name, Namespace: namespace}, rayJob), + time.Second*3, time.Millisecond*500).Should(Succeed(), "Should be able to see RayJob: %v", rayJob.Name) + }) + + By("RayJobs's JobDeploymentStatus transitions from New to Initializing.", func() { + Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusInitializing), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // In Initializing state, Status.RayClusterName, Status.JobId, and Status.StartTime must be set. + Expect(rayJob.Status.RayClusterName).NotTo(BeEmpty()) + Expect(rayJob.Status.JobId).NotTo(BeEmpty()) + Expect(rayJob.Status.StartTime).NotTo(BeNil()) + }) + + By("In Initializing state, the RayCluster should eventually be created.", func() { + Eventually( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Status.RayClusterName, Namespace: namespace}, rayCluster), + time.Second*3, time.Millisecond*500).Should(Succeed(), "RayCluster %v not found", rayJob.Status.RayClusterName) + + // Check whether RayCluster is consistent with RayJob's RayClusterSpec. + Expect(rayCluster.Spec.WorkerGroupSpecs[0].Replicas).To(Equal(rayJob.Spec.RayClusterSpec.WorkerGroupSpecs[0].Replicas)) + Expect(rayCluster.Spec.RayVersion).To(Equal(rayJob.Spec.RayClusterSpec.RayVersion)) + + // TODO (kevin85421): Check the RayCluster labels. + Expect(rayCluster.Labels).Should(HaveKeyWithValue(utils.RayOriginatedFromCRNameLabelKey, rayJob.Name)) + Expect(rayCluster.Labels).Should(HaveKeyWithValue(utils.RayOriginatedFromCRDLabelKey, utils.RayOriginatedFromCRDLabelValue(utils.RayJobCRD))) + + Expect(rayCluster.Annotations).Should(Equal(rayJob.Annotations)) + }) + + By("Make RayCluster.Status.State to be rayv1.Ready", func() { + // The RayCluster is not 'Ready' yet because Pods are not running and ready. + Expect(rayCluster.Status.State).NotTo(Equal(rayv1.Ready)) + + updateHeadPodToRunningAndReady(ctx, rayJob.Status.RayClusterName, namespace) + updateWorkerPodsToRunningAndReady(ctx, rayJob.Status.RayClusterName, namespace) + + // The RayCluster.Status.State should be Ready. + Eventually( + getClusterState(ctx, namespace, rayCluster.Name), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.Ready)) + }) + + By("RayJobs's JobDeploymentStatus transitions from Initializing to Running.", func() { + Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusRunning), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // In Running state, the RayJob's Status.DashboardURL must be set. + Expect(rayJob.Status.DashboardURL).NotTo(BeEmpty()) + + // In Running state, the submitter Kubernetes Job must be created if this RayJob is in K8sJobMode. + namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + err := k8sClient.Get(ctx, namespacedName, job) + Expect(err).NotTo(HaveOccurred(), "failed to get Kubernetes Job") + }) + + By("RayJobs's JobDeploymentStatus transitions from Running to Complete.", func() { + // Update fake dashboard client to return job info with "Failed" status. 
+ getJobInfo := func(context.Context, string) (*utiltypes.RayJobInfo, error) { //nolint:unparam // This is a mock function so parameters are required + return &utiltypes.RayJobInfo{JobStatus: rayv1.JobStatusFailed, EndTime: uint64(time.Now().UnixMilli())}, nil + } + fakeRayDashboardClient.GetJobInfoMock.Store(&getJobInfo) + + // RayJob transitions to Complete if and only if the corresponding submitter Kubernetes Job is Complete or Failed. + Consistently( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusRunning), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // Update the submitter Kubernetes Job to Complete. + namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + err := k8sClient.Get(ctx, namespacedName, job) + Expect(err).NotTo(HaveOccurred(), "failed to get Kubernetes Job") + + // Update the submitter Kubernetes Job to Complete. + conditions := []batchv1.JobCondition{ + {Type: batchv1.JobComplete, Status: corev1.ConditionTrue}, + } + job.Status.Conditions = conditions + Expect(k8sClient.Status().Update(ctx, job)).Should(Succeed()) + }) + + By("Verify RayJob itself is deleted", func() { + Eventually( + func() bool { + return apierrors.IsNotFound(k8sClient.Get(ctx, client.ObjectKey{Name: rayJob.Name, Namespace: namespace}, rayJob)) + }, time.Second*5, time.Millisecond*500).Should(BeTrue()) + }) + }) + + It("Should process multi-stage deletions in order on success: Workers, then Cluster, then Self", func() { + ctx := context.Background() + namespace := "default" + rayJob := rayJobTemplate("rayjob-test-multistage-deletion-on-success", namespace) + rayCluster := &rayv1.RayCluster{} + + // Define the multi-stage DeletionStrategy + rayJob.Spec.DeletionStrategy = &rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteWorkers, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + TTLSecondsAfterFinished: 0, // Stage 1: Delete workers after 0 seconds + }, + }, + { + Policy: rayv1.DeleteCluster, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + TTLSecondsAfterFinished: 5, // Stage 2: Delete cluster after 5 seconds + }, + }, + { + Policy: rayv1.DeleteSelf, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + TTLSecondsAfterFinished: 10, // Stage 3: Delete self after 10 seconds + }, + }, + }, + } + rayJob.Spec.ShutdownAfterJobFinishes = false + + By("Verify RayJob spec", func() { + Expect(*rayJob.Spec.DeletionStrategy).To(Equal(rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteWorkers, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + TTLSecondsAfterFinished: 0, + }, + }, + { + Policy: rayv1.DeleteCluster, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + TTLSecondsAfterFinished: 5, + }, + }, + { + Policy: rayv1.DeleteSelf, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + TTLSecondsAfterFinished: 10, + }, + }, + }, + })) + }) + + By("Create a RayJob custom resource with multi-stage deletion rules", func() { + err := k8sClient.Create(ctx, rayJob) + Expect(err).NotTo(HaveOccurred(), "Failed to create RayJob") + Eventually( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Name, Namespace: namespace}, rayJob), + time.Second*3, time.Millisecond*500).Should(Succeed(), "Should be able to see RayJob: %v", rayJob.Name) + }) + + By("RayJobs's 
JobDeploymentStatus transitions from New to Initializing.", func() { + Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusInitializing), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // In Initializing state, Status.RayClusterName, Status.JobId, and Status.StartTime must be set. + Expect(rayJob.Status.RayClusterName).NotTo(BeEmpty()) + Expect(rayJob.Status.JobId).NotTo(BeEmpty()) + Expect(rayJob.Status.StartTime).NotTo(BeNil()) + }) + + By("In Initializing state, the RayCluster should eventually be created.", func() { + Eventually( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Status.RayClusterName, Namespace: namespace}, rayCluster), + time.Second*3, time.Millisecond*500).Should(Succeed(), "RayCluster %v not found", rayJob.Status.RayClusterName) + + // Check whether RayCluster is consistent with RayJob's RayClusterSpec. + Expect(rayCluster.Spec.WorkerGroupSpecs[0].Replicas).To(Equal(rayJob.Spec.RayClusterSpec.WorkerGroupSpecs[0].Replicas)) + Expect(rayCluster.Spec.RayVersion).To(Equal(rayJob.Spec.RayClusterSpec.RayVersion)) + + // TODO (kevin85421): Check the RayCluster labels. + Expect(rayCluster.Labels).Should(HaveKeyWithValue(utils.RayOriginatedFromCRNameLabelKey, rayJob.Name)) + Expect(rayCluster.Labels).Should(HaveKeyWithValue(utils.RayOriginatedFromCRDLabelKey, utils.RayOriginatedFromCRDLabelValue(utils.RayJobCRD))) + + Expect(rayCluster.Annotations).Should(Equal(rayJob.Annotations)) + }) + + By("Make RayCluster.Status.State to be rayv1.Ready", func() { + // The RayCluster is not 'Ready' yet because Pods are not running and ready. + Expect(rayCluster.Status.State).NotTo(Equal(rayv1.Ready)) + + updateHeadPodToRunningAndReady(ctx, rayJob.Status.RayClusterName, namespace) + updateWorkerPodsToRunningAndReady(ctx, rayJob.Status.RayClusterName, namespace) + + // The RayCluster.Status.State should be Ready. + Eventually( + getClusterState(ctx, namespace, rayCluster.Name), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.Ready)) + }) + + By("RayJobs's JobDeploymentStatus transitions from Initializing to Running.", func() { + Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusRunning), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // In Running state, the RayJob's Status.DashboardURL must be set. + Expect(rayJob.Status.DashboardURL).NotTo(BeEmpty()) + + // In Running state, the submitter Kubernetes Job must be created if this RayJob is in K8sJobMode. + namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + err := k8sClient.Get(ctx, namespacedName, job) + Expect(err).NotTo(HaveOccurred(), "failed to get Kubernetes Job") + }) + + By("RayJobs's JobDeploymentStatus transitions from Running to Complete.", func() { + // Update fake dashboard client to return job info with "Succeeded" status. + getJobInfo := func(context.Context, string) (*utiltypes.RayJobInfo, error) { //nolint:unparam // This is a mock function so parameters are required + return &utiltypes.RayJobInfo{JobStatus: rayv1.JobStatusSucceeded, EndTime: uint64(time.Now().UnixMilli())}, nil + } + fakeRayDashboardClient.GetJobInfoMock.Store(&getJobInfo) + defer fakeRayDashboardClient.GetJobInfoMock.Store(nil) + + // RayJob transitions to Complete if and only if the corresponding submitter Kubernetes Job is Complete or Failed. 
+ Consistently( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusRunning), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // Update the submitter Kubernetes Job to Complete. + namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + err := k8sClient.Get(ctx, namespacedName, job) + Expect(err).NotTo(HaveOccurred(), "failed to get Kubernetes Job") + + // Update the submitter Kubernetes Job to Complete. + conditions := []batchv1.JobCondition{ + {Type: batchv1.JobComplete, Status: corev1.ConditionTrue}, + } + job.Status.Conditions = conditions + Expect(k8sClient.Status().Update(ctx, job)).Should(Succeed()) + + // RayJob transitions to Complete. + Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*5, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusComplete), "jobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + }) + + By("Stage 1: Verify workers are deleted, but cluster and job still exist", func() { + // RayCluster exists + Consistently( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Status.RayClusterName, Namespace: namespace}, rayCluster), + time.Second*3, time.Millisecond*500).Should(Succeed(), "RayCluster %v not found", rayJob.Status.RayClusterName) + + // Check worker group is suspended + Expect(*rayCluster.Spec.WorkerGroupSpecs[0].Suspend).To(BeTrue()) + + // 0 worker Pods exist + workerPods := corev1.PodList{} + workerLabels := common.RayClusterWorkerPodsAssociationOptions(rayCluster).ToListOptions() + Eventually( + listResourceFunc(ctx, &workerPods, workerLabels...), + time.Second*3, time.Millisecond*500).Should(Equal(0), "expected 0 workers") + + // Head Pod is still running + headPods := corev1.PodList{} + headLabels := common.RayClusterHeadPodsAssociationOptions(rayCluster).ToListOptions() + Consistently( + listResourceFunc(ctx, &headPods, headLabels...), + time.Second*3, time.Millisecond*500).Should(Equal(1), "Head pod list should have only 1 Pod = %v", headPods.Items) + + namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + Consistently( + getResourceFunc(ctx, namespacedName, job), + time.Second*3, time.Millisecond*500).Should(Succeed()) + }) + + By("Stage 2 (after 5s): Verify RayCluster is deleted, but job still exists", func() { + Eventually( + func() bool { + return apierrors.IsNotFound(getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Status.RayClusterName, Namespace: namespace}, rayCluster)()) + }, + time.Second*3, time.Millisecond*500).Should(BeTrue()) + namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + Consistently( + getResourceFunc(ctx, namespacedName, job), + time.Second*3, time.Millisecond*500).Should(Succeed()) + }) + + By("Stage 3 (after 10s): Verify RayJob itself is deleted", func() { + Eventually( + func() bool { + return apierrors.IsNotFound(k8sClient.Get(ctx, client.ObjectKey{Name: rayJob.Name, Namespace: namespace}, rayJob)) + }, time.Second*5, time.Millisecond*500).Should(BeTrue()) + }) + }) + + It("Should process multi-stage deletions in order on failure: Workers, then Cluster, then Self", func() { + ctx := context.Background() + namespace := "default" + rayJob := rayJobTemplate("rayjob-test-multistage-deletion-on-failure", namespace) + rayCluster := &rayv1.RayCluster{} + + // Define the multi-stage DeletionStrategy + rayJob.Spec.DeletionStrategy = &rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: 
rayv1.DeleteWorkers, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusFailed, + TTLSecondsAfterFinished: 0, // Stage 1: Delete workers after 0 seconds + }, + }, + { + Policy: rayv1.DeleteCluster, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusFailed, + TTLSecondsAfterFinished: 5, // Stage 2: Delete cluster after 5 seconds + }, + }, + { + Policy: rayv1.DeleteSelf, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusFailed, + TTLSecondsAfterFinished: 10, // Stage 3: Delete self after 10 seconds + }, + }, + }, + } + rayJob.Spec.ShutdownAfterJobFinishes = false + + By("Verify RayJob spec", func() { + Expect(*rayJob.Spec.DeletionStrategy).To(Equal(rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteWorkers, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusFailed, + TTLSecondsAfterFinished: 0, + }, + }, + { + Policy: rayv1.DeleteCluster, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusFailed, + TTLSecondsAfterFinished: 5, + }, + }, + { + Policy: rayv1.DeleteSelf, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusFailed, + TTLSecondsAfterFinished: 10, + }, + }, + }, + })) + }) + + By("Create a RayJob custom resource with multi-stage deletion rules", func() { + err := k8sClient.Create(ctx, rayJob) + Expect(err).NotTo(HaveOccurred(), "Failed to create RayJob") + Eventually( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Name, Namespace: namespace}, rayJob), + time.Second*3, time.Millisecond*500).Should(Succeed(), "Should be able to see RayJob: %v", rayJob.Name) + }) + + By("RayJobs's JobDeploymentStatus transitions from New to Initializing.", func() { + Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusInitializing), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // In Initializing state, Status.RayClusterName, Status.JobId, and Status.StartTime must be set. + Expect(rayJob.Status.RayClusterName).NotTo(BeEmpty()) + Expect(rayJob.Status.JobId).NotTo(BeEmpty()) + Expect(rayJob.Status.StartTime).NotTo(BeNil()) + }) + + By("In Initializing state, the RayCluster should eventually be created.", func() { + Eventually( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Status.RayClusterName, Namespace: namespace}, rayCluster), + time.Second*3, time.Millisecond*500).Should(Succeed(), "RayCluster %v not found", rayJob.Status.RayClusterName) + + // Check whether RayCluster is consistent with RayJob's RayClusterSpec. + Expect(rayCluster.Spec.WorkerGroupSpecs[0].Replicas).To(Equal(rayJob.Spec.RayClusterSpec.WorkerGroupSpecs[0].Replicas)) + Expect(rayCluster.Spec.RayVersion).To(Equal(rayJob.Spec.RayClusterSpec.RayVersion)) + + // TODO (kevin85421): Check the RayCluster labels. + Expect(rayCluster.Labels).Should(HaveKeyWithValue(utils.RayOriginatedFromCRNameLabelKey, rayJob.Name)) + Expect(rayCluster.Labels).Should(HaveKeyWithValue(utils.RayOriginatedFromCRDLabelKey, utils.RayOriginatedFromCRDLabelValue(utils.RayJobCRD))) + + Expect(rayCluster.Annotations).Should(Equal(rayJob.Annotations)) + }) + + By("Make RayCluster.Status.State to be rayv1.Ready", func() { + // The RayCluster is not 'Ready' yet because Pods are not running and ready. 
+ Expect(rayCluster.Status.State).NotTo(Equal(rayv1.Ready)) + + updateHeadPodToRunningAndReady(ctx, rayJob.Status.RayClusterName, namespace) + updateWorkerPodsToRunningAndReady(ctx, rayJob.Status.RayClusterName, namespace) + + // The RayCluster.Status.State should be Ready. + Eventually( + getClusterState(ctx, namespace, rayCluster.Name), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.Ready)) + }) + + By("RayJobs's JobDeploymentStatus transitions from Initializing to Running.", func() { + Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusRunning), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // In Running state, the RayJob's Status.DashboardURL must be set. + Expect(rayJob.Status.DashboardURL).NotTo(BeEmpty()) + + // In Running state, the submitter Kubernetes Job must be created if this RayJob is in K8sJobMode. + namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + err := k8sClient.Get(ctx, namespacedName, job) + Expect(err).NotTo(HaveOccurred(), "failed to get Kubernetes Job") + }) + + By("RayJobs's JobDeploymentStatus transitions from Running to Complete.", func() { + // Update fake dashboard client to return job info with "Failed" status. + getJobInfo := func(context.Context, string) (*utiltypes.RayJobInfo, error) { //nolint:unparam // This is a mock function so parameters are required + return &utiltypes.RayJobInfo{JobStatus: rayv1.JobStatusFailed, EndTime: uint64(time.Now().UnixMilli())}, nil + } + fakeRayDashboardClient.GetJobInfoMock.Store(&getJobInfo) + defer fakeRayDashboardClient.GetJobInfoMock.Store(nil) + + // RayJob transitions to Complete if and only if the corresponding submitter Kubernetes Job is Complete or Failed. + Consistently( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusRunning), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // Update the submitter Kubernetes Job to Complete. + namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + err := k8sClient.Get(ctx, namespacedName, job) + Expect(err).NotTo(HaveOccurred(), "failed to get Kubernetes Job") + + // Update the submitter Kubernetes Job to Complete. + conditions := []batchv1.JobCondition{ + {Type: batchv1.JobComplete, Status: corev1.ConditionTrue}, + } + job.Status.Conditions = conditions + Expect(k8sClient.Status().Update(ctx, job)).Should(Succeed()) + + // RayJob transitions to Failed. 
+ Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*5, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusFailed), "jobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + }) + + By("Stage 1: Verify workers are deleted, but cluster and job still exist", func() { + // RayCluster exists + Consistently( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Status.RayClusterName, Namespace: namespace}, rayCluster), + time.Second*3, time.Millisecond*500).Should(Succeed(), "RayCluster %v not found", rayJob.Status.RayClusterName) + + // Check worker group is suspended + Expect(*rayCluster.Spec.WorkerGroupSpecs[0].Suspend).To(BeTrue()) + + // 0 worker Pods exist + workerPods := corev1.PodList{} + workerLabels := common.RayClusterWorkerPodsAssociationOptions(rayCluster).ToListOptions() + Eventually( + listResourceFunc(ctx, &workerPods, workerLabels...), + time.Second*3, time.Millisecond*500).Should(Equal(0), "expected 0 workers") + + // Head Pod is still running + headPods := corev1.PodList{} + headLabels := common.RayClusterHeadPodsAssociationOptions(rayCluster).ToListOptions() + Consistently( + listResourceFunc(ctx, &headPods, headLabels...), + time.Second*3, time.Millisecond*500).Should(Equal(1), "Head pod list should have only 1 Pod = %v", headPods.Items) + + namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + Consistently( + getResourceFunc(ctx, namespacedName, job), + time.Second*3, time.Millisecond*500).Should(Succeed()) + }) + + By("Stage 2 (after 5s): Verify RayCluster is deleted, but job still exists", func() { + Eventually( + func() bool { + return apierrors.IsNotFound(getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Status.RayClusterName, Namespace: namespace}, rayCluster)()) + }, + time.Second*3, time.Millisecond*500).Should(BeTrue()) + namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + Consistently( + getResourceFunc(ctx, namespacedName, job), + time.Second*3, time.Millisecond*500).Should(Succeed()) + }) + + By("Stage 3 (after 10s): Verify RayJob itself is deleted", func() { + Eventually( + func() bool { + return apierrors.IsNotFound(k8sClient.Get(ctx, client.ObjectKey{Name: rayJob.Name, Namespace: namespace}, rayJob)) + }, time.Second*5, time.Millisecond*500).Should(BeTrue()) + }) + }) }) }) diff --git a/ray-operator/controllers/ray/utils/validation.go b/ray-operator/controllers/ray/utils/validation.go index d4653aa04bc..3c4ebe81e45 100644 --- a/ray-operator/controllers/ray/utils/validation.go +++ b/ray-operator/controllers/ray/utils/validation.go @@ -215,9 +215,6 @@ func ValidateRayJobSpec(rayJob *rayv1.RayJob) error { if rayJob.Spec.BackoffLimit != nil && *rayJob.Spec.BackoffLimit < 0 { return fmt.Errorf("backoffLimit must be a positive integer") } - if !features.Enabled(features.RayJobDeletionPolicy) && rayJob.Spec.DeletionStrategy != nil { - return fmt.Errorf("RayJobDeletionPolicy feature gate must be enabled to use the DeletionStrategy feature") - } return validateDeletionStrategy(rayJob) } @@ -269,7 +266,7 @@ func validateDeletionStrategy(rayJob *rayv1.RayJob) error { } usingDeletionRules := len(rayJob.Spec.DeletionStrategy.DeletionRules) > 0 - usingLegacyAPI := rayJob.Spec.DeletionStrategy.OnSuccess.Policy != nil || rayJob.Spec.DeletionStrategy.OnFailure.Policy != nil + usingLegacyAPI := rayJob.Spec.DeletionStrategy.OnSuccess != nil || rayJob.Spec.DeletionStrategy.OnFailure != nil // ShutdownAfterJobFinishes cannot be used with the new API. 
if usingDeletionRules && rayJob.Spec.ShutdownAfterJobFinishes { @@ -399,6 +396,13 @@ func validateTTLConsistency(policyTTLs map[rayv1.DeletionPolicyType]int32, statu // validateLegacyDeletionPolicies handles validation for the old `onSuccess` and `onFailure` fields. func validateLegacyDeletionPolicies(rayJob *rayv1.RayJob) error { isClusterSelectorMode := len(rayJob.Spec.ClusterSelector) != 0 + + // Both policies must be set if using the legacy API. + if rayJob.Spec.DeletionStrategy.OnSuccess == nil || rayJob.Spec.DeletionStrategy.OnFailure == nil { + return fmt.Errorf("both DeletionStrategy.OnSuccess and DeletionStrategy.OnFailure must be set when using the legacy deletion policy fields") + } + + // Validate that the Policy field is set within each policy. onSuccessPolicy := rayJob.Spec.DeletionStrategy.OnSuccess onFailurePolicy := rayJob.Spec.DeletionStrategy.OnFailure diff --git a/ray-operator/controllers/ray/utils/validation_test.go b/ray-operator/controllers/ray/utils/validation_test.go index 4060827e21f..e298d2d45b5 100644 --- a/ray-operator/controllers/ray/utils/validation_test.go +++ b/ray-operator/controllers/ray/utils/validation_test.go @@ -795,10 +795,10 @@ func TestValidateRayJobSpec(t *testing.T) { name: "RayJobDeletionPolicy feature gate must be enabled to use the DeletionStrategy feature", spec: rayv1.RayJobSpec{ DeletionStrategy: &rayv1.DeletionStrategy{ - OnSuccess: rayv1.DeletionPolicy{ + OnSuccess: &rayv1.DeletionPolicy{ Policy: ptr.To(rayv1.DeleteCluster), }, - OnFailure: rayv1.DeletionPolicy{ + OnFailure: &rayv1.DeletionPolicy{ Policy: ptr.To(rayv1.DeleteCluster), }, }, @@ -932,10 +932,10 @@ func TestValidateRayJobSpecWithFeatureGate(t *testing.T) { name: "the ClusterSelector mode doesn't support DeletionStrategy=DeleteCluster", spec: rayv1.RayJobSpec{ DeletionStrategy: &rayv1.DeletionStrategy{ - OnSuccess: rayv1.DeletionPolicy{ + OnSuccess: &rayv1.DeletionPolicy{ Policy: ptr.To(rayv1.DeleteCluster), }, - OnFailure: rayv1.DeletionPolicy{ + OnFailure: &rayv1.DeletionPolicy{ Policy: ptr.To(rayv1.DeleteCluster), }, }, ClusterSelector: map[string]string{"key": "value"}, @@ -946,10 +946,10 @@ func TestValidateRayJobSpecWithFeatureGate(t *testing.T) { name: "the ClusterSelector mode doesn't support DeletionStrategy=DeleteWorkers", spec: rayv1.RayJobSpec{ DeletionStrategy: &rayv1.DeletionStrategy{ - OnSuccess: rayv1.DeletionPolicy{ + OnSuccess: &rayv1.DeletionPolicy{ Policy: ptr.To(rayv1.DeleteWorkers), }, - OnFailure: rayv1.DeletionPolicy{ + OnFailure: &rayv1.DeletionPolicy{ Policy: ptr.To(rayv1.DeleteWorkers), }, }, ClusterSelector: map[string]string{"key": "value"}, @@ -960,10 +960,10 @@ func TestValidateRayJobSpecWithFeatureGate(t *testing.T) { name: "DeletionStrategy=DeleteWorkers currently does not support RayCluster with autoscaling enabled", spec: rayv1.RayJobSpec{ DeletionStrategy: &rayv1.DeletionStrategy{ - OnSuccess: rayv1.DeletionPolicy{ + OnSuccess: &rayv1.DeletionPolicy{ Policy: ptr.To(rayv1.DeleteWorkers), }, - OnFailure: rayv1.DeletionPolicy{ + OnFailure: &rayv1.DeletionPolicy{ Policy: ptr.To(rayv1.DeleteWorkers), }, }, RayClusterSpec: &rayv1.RayClusterSpec{ @@ -977,10 +977,10 @@ func TestValidateRayJobSpecWithFeatureGate(t *testing.T) { name: "valid RayJob with DeletionStrategy=DeleteCluster", spec: rayv1.RayJobSpec{ DeletionStrategy: &rayv1.DeletionStrategy{ - OnSuccess: rayv1.DeletionPolicy{ + OnSuccess: &rayv1.DeletionPolicy{ Policy: ptr.To(rayv1.DeleteCluster), }, - OnFailure: rayv1.DeletionPolicy{ + OnFailure: &rayv1.DeletionPolicy{ Policy: 
ptr.To(rayv1.DeleteCluster), }, }, ShutdownAfterJobFinishes: true, @@ -1001,10 +1001,10 @@ func TestValidateRayJobSpecWithFeatureGate(t *testing.T) { name: "shutdownAfterJobFinshes is set to 'true' while deletion policy is 'DeleteNone'", spec: rayv1.RayJobSpec{ DeletionStrategy: &rayv1.DeletionStrategy{ - OnSuccess: rayv1.DeletionPolicy{ + OnSuccess: &rayv1.DeletionPolicy{ Policy: ptr.To(rayv1.DeleteNone), }, - OnFailure: rayv1.DeletionPolicy{ + OnFailure: &rayv1.DeletionPolicy{ Policy: ptr.To(rayv1.DeleteNone), }, }, ShutdownAfterJobFinishes: true, @@ -1016,7 +1016,7 @@ func TestValidateRayJobSpecWithFeatureGate(t *testing.T) { name: "OnSuccess unset", spec: rayv1.RayJobSpec{ DeletionStrategy: &rayv1.DeletionStrategy{ - OnFailure: rayv1.DeletionPolicy{ + OnFailure: &rayv1.DeletionPolicy{ Policy: ptr.To(rayv1.DeleteNone), }, }, ShutdownAfterJobFinishes: true, @@ -1028,7 +1028,7 @@ func TestValidateRayJobSpecWithFeatureGate(t *testing.T) { name: "OnSuccess.DeletionPolicyType unset", spec: rayv1.RayJobSpec{ DeletionStrategy: &rayv1.DeletionStrategy{ - OnFailure: rayv1.DeletionPolicy{ + OnFailure: &rayv1.DeletionPolicy{ Policy: ptr.To(rayv1.DeleteNone), }, }, ShutdownAfterJobFinishes: true, @@ -1040,7 +1040,7 @@ func TestValidateRayJobSpecWithFeatureGate(t *testing.T) { name: "OnFailure unset", spec: rayv1.RayJobSpec{ DeletionStrategy: &rayv1.DeletionStrategy{ - OnSuccess: rayv1.DeletionPolicy{ + OnSuccess: &rayv1.DeletionPolicy{ Policy: ptr.To(rayv1.DeleteNone), }, }, ShutdownAfterJobFinishes: true, @@ -1052,10 +1052,10 @@ func TestValidateRayJobSpecWithFeatureGate(t *testing.T) { name: "OnFailure.DeletionPolicyType unset", spec: rayv1.RayJobSpec{ DeletionStrategy: &rayv1.DeletionStrategy{ - OnSuccess: rayv1.DeletionPolicy{ + OnSuccess: &rayv1.DeletionPolicy{ Policy: ptr.To(rayv1.DeleteNone), }, - OnFailure: rayv1.DeletionPolicy{}, + OnFailure: &rayv1.DeletionPolicy{}, }, ShutdownAfterJobFinishes: true, RayClusterSpec: createBasicRayClusterSpec(), }, @@ -1112,7 +1112,7 @@ func TestValidateRayJobSpecWithFeatureGate(t *testing.T) { name: "deletionRules and legacy onSuccess both set", spec: rayv1.RayJobSpec{ DeletionStrategy: &rayv1.DeletionStrategy{ - OnSuccess: rayv1.DeletionPolicy{ + OnSuccess: &rayv1.DeletionPolicy{ Policy: ptr.To(rayv1.DeleteCluster), }, DeletionRules: []rayv1.DeletionRule{ From f715ed4cd84cb5727906cd97b708117b30f6b7ed Mon Sep 17 00:00:00 2001 From: wei-chenglai Date: Fri, 12 Sep 2025 14:30:06 -0400 Subject: [PATCH 03/21] trigger CI --- ray-operator/controllers/ray/utils/validation.go | 1 + 1 file changed, 1 insertion(+) diff --git a/ray-operator/controllers/ray/utils/validation.go b/ray-operator/controllers/ray/utils/validation.go index 3c4ebe81e45..abb2644a6e4 100644 --- a/ray-operator/controllers/ray/utils/validation.go +++ b/ray-operator/controllers/ray/utils/validation.go @@ -361,6 +361,7 @@ func validateDeletionRules(rayJob *rayv1.RayJob) error { func validateTTLConsistency(policyTTLs map[rayv1.DeletionPolicyType]int32, status rayv1.JobStatus) error { // Define the required deletion order. TTLs must be non-decreasing along this sequence. 
deletionOrder := []rayv1.DeletionPolicyType{ + rayv1.DeleteNone, rayv1.DeleteWorkers, rayv1.DeleteCluster, rayv1.DeleteSelf, From 24f6ab966eee1c82cc0ed213beba2947d1ff48c1 Mon Sep 17 00:00:00 2001 From: wei-chenglai Date: Fri, 12 Sep 2025 22:37:42 -0400 Subject: [PATCH 04/21] Revert change for triggering CI --- ray-operator/controllers/ray/utils/validation.go | 1 - 1 file changed, 1 deletion(-) diff --git a/ray-operator/controllers/ray/utils/validation.go b/ray-operator/controllers/ray/utils/validation.go index abb2644a6e4..3c4ebe81e45 100644 --- a/ray-operator/controllers/ray/utils/validation.go +++ b/ray-operator/controllers/ray/utils/validation.go @@ -361,7 +361,6 @@ func validateDeletionRules(rayJob *rayv1.RayJob) error { func validateTTLConsistency(policyTTLs map[rayv1.DeletionPolicyType]int32, status rayv1.JobStatus) error { // Define the required deletion order. TTLs must be non-decreasing along this sequence. deletionOrder := []rayv1.DeletionPolicyType{ - rayv1.DeleteNone, rayv1.DeleteWorkers, rayv1.DeleteCluster, rayv1.DeleteSelf, From 31867ad2d480288b59f5471add3381311e793efd Mon Sep 17 00:00:00 2001 From: wei-chenglai Date: Mon, 15 Sep 2025 15:51:01 -0400 Subject: [PATCH 05/21] address comment --- ray-operator/apis/ray/v1/rayjob_types.go | 4 +-- .../controllers/ray/utils/validation.go | 32 ++++++++----------- 2 files changed, 15 insertions(+), 21 deletions(-) diff --git a/ray-operator/apis/ray/v1/rayjob_types.go b/ray-operator/apis/ray/v1/rayjob_types.go index 575ded8e81c..d1ec95f8c8d 100644 --- a/ray-operator/apis/ray/v1/rayjob_types.go +++ b/ray-operator/apis/ray/v1/rayjob_types.go @@ -85,8 +85,6 @@ const ( SidecarMode JobSubmissionMode = "SidecarMode" // Submit job via a sidecar container in the Ray head Pod ) -type DeletionPolicyType string - // DeletionStrategy defines the deletion policies for a RayJob. // It allows for fine-grained control over resource cleanup after a job finishes. // @@ -167,6 +165,8 @@ type DeletionPolicy struct { Policy *DeletionPolicyType `json:"policy,omitempty"` } +type DeletionPolicyType string + const ( DeleteCluster DeletionPolicyType = "DeleteCluster" // To delete the entire RayCluster custom resource on job completion. DeleteWorkers DeletionPolicyType = "DeleteWorkers" // To delete only the workers on job completion. diff --git a/ray-operator/controllers/ray/utils/validation.go b/ray-operator/controllers/ray/utils/validation.go index 3c4ebe81e45..7e66aec8f41 100644 --- a/ray-operator/controllers/ray/utils/validation.go +++ b/ray-operator/controllers/ray/utils/validation.go @@ -216,7 +216,10 @@ func ValidateRayJobSpec(rayJob *rayv1.RayJob) error { return fmt.Errorf("backoffLimit must be a positive integer") } - return validateDeletionStrategy(rayJob) + if err := validateDeletionStrategy(rayJob); err != nil { + return fmt.Errorf("invalid deletion strategy: %w", err) + } + return nil } func ValidateRayServiceMetadata(metadata metav1.ObjectMeta) error { @@ -295,19 +298,11 @@ func validateDeletionStrategy(rayJob *rayv1.RayJob) error { // It performs per-rule validations, checks for uniqueness, and ensures logical TTL consistency. // Errors are collected and returned as a single aggregated error using errors.Join for better user feedback. func validateDeletionRules(rayJob *rayv1.RayJob) error { - type ruleKey struct { - Policy rayv1.DeletionPolicyType - Status rayv1.JobStatus - } - rules := rayJob.Spec.DeletionStrategy.DeletionRules isClusterSelectorMode := len(rayJob.Spec.ClusterSelector) != 0 - // Group TTLs by JobStatus for cross-rule validation. 
+ // Group TTLs by JobStatus for cross-rule validation and uniqueness checking. rulesByStatus := make(map[rayv1.JobStatus]map[rayv1.DeletionPolicyType]int32) - // Track unique (Policy, JobStatus) combinations. - ruleUniquenessSet := make(map[ruleKey]struct{}) - var errs []error // Single pass: Validate each rule individually and group for later consistency checks. @@ -318,14 +313,6 @@ func validateDeletionRules(rayJob *rayv1.RayJob) error { continue } - // Check uniqueness. - key := ruleKey{Policy: rule.Policy, Status: rule.Condition.JobStatus} - if _, exists := ruleUniquenessSet[key]; exists { - errs = append(errs, fmt.Errorf("deletionRules[%d]: duplicate rule for DeletionPolicyType '%s' and JobStatus '%s'", i, rule.Policy, rule.Condition.JobStatus)) - continue - } - ruleUniquenessSet[key] = struct{}{} - // Contextual validations based on spec. if isClusterSelectorMode && (rule.Policy == rayv1.DeleteCluster || rule.Policy == rayv1.DeleteWorkers) { errs = append(errs, fmt.Errorf("deletionRules[%d]: DeletionPolicyType '%s' not supported when ClusterSelector is set", i, rule.Policy)) @@ -343,6 +330,13 @@ func validateDeletionRules(rayJob *rayv1.RayJob) error { statusMap = make(map[rayv1.DeletionPolicyType]int32) rulesByStatus[rule.Condition.JobStatus] = statusMap } + + // Check for uniqueness of (JobStatus, DeletionPolicyType) pair. + if _, exists := statusMap[rule.Policy]; exists { + errs = append(errs, fmt.Errorf("deletionRules[%d]: duplicate rule for DeletionPolicyType '%s' and JobStatus '%s'", i, rule.Policy, rule.Condition.JobStatus)) + continue + } + statusMap[rule.Policy] = rule.Condition.TTLSecondsAfterFinished } @@ -399,7 +393,7 @@ func validateLegacyDeletionPolicies(rayJob *rayv1.RayJob) error { // Both policies must be set if using the legacy API. if rayJob.Spec.DeletionStrategy.OnSuccess == nil || rayJob.Spec.DeletionStrategy.OnFailure == nil { - return fmt.Errorf("both DeletionStrategy.OnSuccess and DeletionStrategy.OnFailure must be set when using the legacy deletion policy fields") + return fmt.Errorf("both DeletionStrategy.OnSuccess and DeletionStrategy.OnFailure must be set when using the legacy deletion policy fields of DeletionStrategy") } // Validate that the Policy field is set within each policy. From f2719a0ed3c16e2b266f3e19bb24b7ffd15f2cae Mon Sep 17 00:00:00 2001 From: wei-chenglai Date: Tue, 16 Sep 2025 20:12:06 -0400 Subject: [PATCH 06/21] rename to TTLSeconds --- docs/reference/api.md | 7 +- .../kuberay-operator/crds/ray.io_rayjobs.yaml | 2 +- ray-operator/apis/ray/v1/rayjob_types.go | 9 +- .../config/crd/bases/ray.io_rayjobs.yaml | 2 +- .../samples/ray-job.deletion-rules.yaml | 180 ++++++++++++++++++ .../controllers/ray/rayjob_controller.go | 2 +- .../controllers/ray/rayjob_controller_test.go | 96 +++++----- .../controllers/ray/utils/validation.go | 12 +- .../controllers/ray/utils/validation_test.go | 64 +++---- .../ray/v1/deletioncondition.go | 12 +- 10 files changed, 284 insertions(+), 102 deletions(-) create mode 100644 ray-operator/config/samples/ray-job.deletion-rules.yaml diff --git a/docs/reference/api.md b/docs/reference/api.md index 4d3a87a9bce..ec300a8e342 100644 --- a/docs/reference/api.md +++ b/docs/reference/api.md @@ -68,7 +68,7 @@ _Appears in:_ | Field | Description | Default | Validation | | --- | --- | --- | --- | -| `ttlSecondsAfterFinished` _integer_ | TTLSecondsAfterFinished is the time in seconds from when the JobStatus
reaches the specified terminal state to when this deletion action should be triggered.
The value must be a non-negative integer. | 0 | Minimum: 0
| +| `ttlSeconds` _integer_ | TTLSeconds is the time in seconds from when the JobStatus
reaches the specified terminal state to when this deletion action should be triggered.
The value must be a non-negative integer. | 0 | Minimum: 0
| #### DeletionPolicy @@ -130,6 +130,7 @@ It allows for fine-grained control over resource cleanup after a job finishes. Legacy fields `onSuccess` and `onFailure` are still supported for backward compatibility, but it is highly recommended to migrate to the new `deletionRules` field. +`onSuccess` and `onFailure` will be removed in release 1.16.0. Notes: @@ -154,8 +155,8 @@ _Appears in:_ | Field | Description | Default | Validation | | --- | --- | --- | --- | -| `onSuccess` _[DeletionPolicy](#deletionpolicy)_ | OnSuccess is the deletion policy for a successful RayJob.
Deprecated: Use `deletionRules` instead for more flexible, multi-stage deletion strategies.
This field will be removed in a future release. | | | -| `onFailure` _[DeletionPolicy](#deletionpolicy)_ | OnFailure is the deletion policy for a failed RayJob.
Deprecated: Use `deletionRules` instead for more flexible, multi-stage deletion strategies.
This field will be removed in a future release. | | | +| `onSuccess` _[DeletionPolicy](#deletionpolicy)_ | OnSuccess is the deletion policy for a successful RayJob.
Deprecated: Use `deletionRules` instead for more flexible, multi-stage deletion strategies.
This field will be removed in release 1.16.0. | | | +| `onFailure` _[DeletionPolicy](#deletionpolicy)_ | OnFailure is the deletion policy for a failed RayJob.
Deprecated: Use `deletionRules` instead for more flexible, multi-stage deletion strategies.
This field will be removed in release 1.16.0. | | | | `deletionRules` _[DeletionRule](#deletionrule) array_ | DeletionRules is a list of deletion rules, processed based on their trigger conditions.
While the rules can be used to define a sequence, if multiple rules are overdue (e.g., due to controller downtime),
the most impactful rule (e.g., DeleteCluster) will be executed first to prioritize resource cleanup and cost savings. | | | diff --git a/helm-chart/kuberay-operator/crds/ray.io_rayjobs.yaml b/helm-chart/kuberay-operator/crds/ray.io_rayjobs.yaml index 15e15996f7b..8ee2bc5ce4d 100644 --- a/helm-chart/kuberay-operator/crds/ray.io_rayjobs.yaml +++ b/helm-chart/kuberay-operator/crds/ray.io_rayjobs.yaml @@ -70,7 +70,7 @@ spec: - SUCCEEDED - FAILED type: string - ttlSecondsAfterFinished: + ttlSeconds: default: 0 format: int32 minimum: 0 diff --git a/ray-operator/apis/ray/v1/rayjob_types.go b/ray-operator/apis/ray/v1/rayjob_types.go index d1ec95f8c8d..01317066937 100644 --- a/ray-operator/apis/ray/v1/rayjob_types.go +++ b/ray-operator/apis/ray/v1/rayjob_types.go @@ -90,6 +90,7 @@ const ( // // Legacy fields `onSuccess` and `onFailure` are still supported for backward compatibility, // but it is highly recommended to migrate to the new `deletionRules` field. +// `onSuccess` and `onFailure` will be removed in release 1.16.0. // // Notes: // - When this block is set, you must configure **either** @@ -109,13 +110,13 @@ const ( type DeletionStrategy struct { // OnSuccess is the deletion policy for a successful RayJob. // Deprecated: Use `deletionRules` instead for more flexible, multi-stage deletion strategies. - // This field will be removed in a future release. + // This field will be removed in release 1.16.0. // +optional OnSuccess *DeletionPolicy `json:"onSuccess,omitempty"` // OnFailure is the deletion policy for a failed RayJob. // Deprecated: Use `deletionRules` instead for more flexible, multi-stage deletion strategies. - // This field will be removed in a future release. + // This field will be removed in release 1.16.0. // +optional OnFailure *DeletionPolicy `json:"onFailure,omitempty"` @@ -145,13 +146,13 @@ type DeletionCondition struct { // +kubebuilder:validation:Enum=SUCCEEDED;FAILED JobStatus JobStatus `json:"jobStatus"` - // TTLSecondsAfterFinished is the time in seconds from when the JobStatus + // TTLSeconds is the time in seconds from when the JobStatus // reaches the specified terminal state to when this deletion action should be triggered. // The value must be a non-negative integer. // +kubebuilder:default=0 // +kubebuilder:validation:Minimum=0 // +optional - TTLSecondsAfterFinished int32 `json:"ttlSecondsAfterFinished,omitempty"` + TTLSeconds int32 `json:"ttlSeconds,omitempty"` } // DeletionPolicy is the legacy single-stage deletion policy. diff --git a/ray-operator/config/crd/bases/ray.io_rayjobs.yaml b/ray-operator/config/crd/bases/ray.io_rayjobs.yaml index 15e15996f7b..8ee2bc5ce4d 100644 --- a/ray-operator/config/crd/bases/ray.io_rayjobs.yaml +++ b/ray-operator/config/crd/bases/ray.io_rayjobs.yaml @@ -70,7 +70,7 @@ spec: - SUCCEEDED - FAILED type: string - ttlSecondsAfterFinished: + ttlSeconds: default: 0 format: int32 minimum: 0 diff --git a/ray-operator/config/samples/ray-job.deletion-rules.yaml b/ray-operator/config/samples/ray-job.deletion-rules.yaml new file mode 100644 index 00000000000..f6605e274c3 --- /dev/null +++ b/ray-operator/config/samples/ray-job.deletion-rules.yaml @@ -0,0 +1,180 @@ +apiVersion: ray.io/v1 +kind: RayJob +metadata: + name: rayjob-sample +spec: + # submissionMode specifies how RayJob submits the Ray job to the RayCluster. + # The default value is "K8sJobMode", meaning RayJob will submit the Ray job via a submitter Kubernetes Job. 
+ # The alternative value is "HTTPMode", indicating that KubeRay will submit the Ray job by sending an HTTP request to the RayCluster. + # submissionMode: "K8sJobMode" + entrypoint: python /home/ray/samples/sample_code.py + # DeletionStrategy defines the deletion policies for a RayJob. + # It allows for fine-grained control over resource cleanup after a job finishes. + # DeletionRules is a list of deletion rules, processed based on their trigger conditions. + # While the rules can be used to define a sequence, if multiple rules are overdue (e.g., due to controller downtime), + # the most impactful rule (e.g., DeleteCluster) will be executed first to prioritize resource cleanup and cost savings. + deletionStrategy: + # This sample demonstrates a staged cleanup process for a RayJob. + # Regardless of whether the job succeeds or fails, the cleanup follows these steps: + # 1. After 30 seconds, the worker pods are deleted. This allows for quick resource release while keeping the head pod for debugging. + # 2. After 60 seconds, the entire RayCluster (including the head pod) is deleted. + # 3. After 90 seconds, the RayJob custom resource itself is deleted, removing it from the Kubernetes API server. + deletionRules: + - condition: + jobStatus: FAILED + ttlSeconds: 30 + policy: DeleteWorkers + - condition: + jobStatus: FAILED + ttlSeconds: 60 + policy: DeleteCluster + - condition: + jobStatus: FAILED + ttlSeconds: 90 + policy: DeleteSelf + - condition: + jobStatus: SUCCEEDED + ttlSeconds: 30 + policy: DeleteWorkers + - condition: + jobStatus: SUCCEEDED + ttlSeconds: 60 + policy: DeleteCluster + - condition: + jobStatus: SUCCEEDED + ttlSeconds: 90 + policy: DeleteSelf + + # activeDeadlineSeconds is the duration in seconds that the RayJob may be active before + # KubeRay actively tries to terminate the RayJob; value must be positive integer. + # activeDeadlineSeconds: 120 + + # RuntimeEnvYAML represents the runtime environment configuration provided as a multi-line YAML string. + # See https://docs.ray.io/en/latest/ray-core/handling-dependencies.html for details. + # (New in KubeRay version 1.0.) + runtimeEnvYAML: | + pip: + - requests==2.26.0 + - pendulum==2.1.2 + env_vars: + counter_name: "test_counter" + + # Suspend specifies whether the RayJob controller should create a RayCluster instance. + # If a job is applied with the suspend field set to true, the RayCluster will not be created and we will wait for the transition to false. + # If the RayCluster is already created, it will be deleted. In the case of transition to false, a new RayCluster will be created. + # suspend: false + + # rayClusterSpec specifies the RayCluster instance to be created by the RayJob controller. + rayClusterSpec: + rayVersion: '2.46.0' # should match the Ray version in the image of the containers + # Ray head pod template + headGroupSpec: + # The `rayStartParams` are used to configure the `ray start` command. + # See https://github.com/ray-project/kuberay/blob/master/docs/guidance/rayStartParams.md for the default settings of `rayStartParams` in KubeRay. + # See https://docs.ray.io/en/latest/cluster/cli.html#ray-start for all available options in `rayStartParams`. 
+ rayStartParams: {} + #pod template + template: + spec: + containers: + - name: ray-head + image: rayproject/ray:2.46.0 + ports: + - containerPort: 6379 + name: gcs-server + - containerPort: 8265 # Ray dashboard + name: dashboard + - containerPort: 10001 + name: client + resources: + limits: + cpu: "1" + requests: + cpu: "200m" + volumeMounts: + - mountPath: /home/ray/samples + name: code-sample + volumes: + # You set volumes at the Pod level, then mount them into containers inside that Pod + - name: code-sample + configMap: + # Provide the name of the ConfigMap you want to mount. + name: ray-job-code-sample + # An array of keys from the ConfigMap to create as files + items: + - key: sample_code.py + path: sample_code.py + workerGroupSpecs: + # the pod replicas in this group typed worker + - replicas: 1 + minReplicas: 1 + maxReplicas: 5 + # logical group name, for this called small-group, also can be functional + groupName: small-group + # The `rayStartParams` are used to configure the `ray start` command. + # See https://github.com/ray-project/kuberay/blob/master/docs/guidance/rayStartParams.md for the default settings of `rayStartParams` in KubeRay. + # See https://docs.ray.io/en/latest/cluster/cli.html#ray-start for all available options in `rayStartParams`. + rayStartParams: {} + #pod template + template: + spec: + containers: + - name: ray-worker # must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name', or '123-abc' + image: rayproject/ray:2.46.0 + resources: + limits: + cpu: "1" + requests: + cpu: "200m" + + # SubmitterPodTemplate is the template for the pod that will run the `ray job submit` command against the RayCluster. + # If SubmitterPodTemplate is specified, the first container is assumed to be the submitter container. + # submitterPodTemplate: + # spec: + # restartPolicy: Never + # containers: + # - name: my-custom-rayjob-submitter-pod + # image: rayproject/ray:2.46.0 + # # If Command is not specified, the correct command will be supplied at runtime using the RayJob spec `entrypoint` field. + # # Specifying Command is not recommended. + # # command: ["sh", "-c", "ray job submit --address=http://$RAY_DASHBOARD_ADDRESS --submission-id=$RAY_JOB_SUBMISSION_ID -- echo hello world"] + + +######################Ray code sample################################# +# this sample is from https://docs.ray.io/en/latest/cluster/job-submission.html#quick-start-example +# it is mounted into the container and executed to show the Ray job at work +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: ray-job-code-sample +data: + sample_code.py: | + import ray + import os + import requests + + ray.init() + + @ray.remote + class Counter: + def __init__(self): + # Used to verify runtimeEnv + self.name = os.getenv("counter_name") + assert self.name == "test_counter" + self.counter = 0 + + def inc(self): + self.counter += 1 + + def get_counter(self): + return "{} got {}".format(self.name, self.counter) + + counter = Counter.remote() + + for _ in range(5): + ray.get(counter.inc.remote()) + print(ray.get(counter.get_counter.remote())) + + # Verify that the correct runtime env was used for the job. 
+ assert requests.__version__ == "2.26.0" diff --git a/ray-operator/controllers/ray/rayjob_controller.go b/ray-operator/controllers/ray/rayjob_controller.go index f292522a030..43c0f4c3ae4 100644 --- a/ray-operator/controllers/ray/rayjob_controller.go +++ b/ray-operator/controllers/ray/rayjob_controller.go @@ -1071,7 +1071,7 @@ func (r *RayJobReconciler) handleDeletionRules(ctx context.Context, rayJob *rayv continue } - deletionTime := rayJob.Status.EndTime.Add(time.Duration(rule.Condition.TTLSecondsAfterFinished) * time.Second) + deletionTime := rayJob.Status.EndTime.Add(time.Duration(rule.Condition.TTLSeconds) * time.Second) // Track the earliest requeue time to re-check later. if nowTime.Before(deletionTime) { if nextRequeueTime == nil || deletionTime.Before(*nextRequeueTime) { diff --git a/ray-operator/controllers/ray/rayjob_controller_test.go b/ray-operator/controllers/ray/rayjob_controller_test.go index c348932260d..75cb9edb75a 100644 --- a/ray-operator/controllers/ray/rayjob_controller_test.go +++ b/ray-operator/controllers/ray/rayjob_controller_test.go @@ -3192,22 +3192,22 @@ var _ = Context("RayJob with different submission modes", func() { { Policy: rayv1.DeleteWorkers, Condition: rayv1.DeletionCondition{ - JobStatus: rayv1.JobStatusSucceeded, - TTLSecondsAfterFinished: 0, + JobStatus: rayv1.JobStatusSucceeded, + TTLSeconds: 0, }, }, { Policy: rayv1.DeleteCluster, Condition: rayv1.DeletionCondition{ - JobStatus: rayv1.JobStatusSucceeded, - TTLSecondsAfterFinished: 0, + JobStatus: rayv1.JobStatusSucceeded, + TTLSeconds: 0, }, }, { Policy: rayv1.DeleteSelf, Condition: rayv1.DeletionCondition{ - JobStatus: rayv1.JobStatusSucceeded, - TTLSecondsAfterFinished: 0, + JobStatus: rayv1.JobStatusSucceeded, + TTLSeconds: 0, }, }, }, @@ -3220,22 +3220,22 @@ var _ = Context("RayJob with different submission modes", func() { { Policy: rayv1.DeleteWorkers, Condition: rayv1.DeletionCondition{ - JobStatus: rayv1.JobStatusSucceeded, - TTLSecondsAfterFinished: 0, + JobStatus: rayv1.JobStatusSucceeded, + TTLSeconds: 0, }, }, { Policy: rayv1.DeleteCluster, Condition: rayv1.DeletionCondition{ - JobStatus: rayv1.JobStatusSucceeded, - TTLSecondsAfterFinished: 0, + JobStatus: rayv1.JobStatusSucceeded, + TTLSeconds: 0, }, }, { Policy: rayv1.DeleteSelf, Condition: rayv1.DeletionCondition{ - JobStatus: rayv1.JobStatusSucceeded, - TTLSecondsAfterFinished: 0, + JobStatus: rayv1.JobStatusSucceeded, + TTLSeconds: 0, }, }, }, @@ -3351,22 +3351,22 @@ var _ = Context("RayJob with different submission modes", func() { { Policy: rayv1.DeleteWorkers, Condition: rayv1.DeletionCondition{ - JobStatus: rayv1.JobStatusFailed, - TTLSecondsAfterFinished: 0, + JobStatus: rayv1.JobStatusFailed, + TTLSeconds: 0, }, }, { Policy: rayv1.DeleteCluster, Condition: rayv1.DeletionCondition{ - JobStatus: rayv1.JobStatusFailed, - TTLSecondsAfterFinished: 0, + JobStatus: rayv1.JobStatusFailed, + TTLSeconds: 0, }, }, { Policy: rayv1.DeleteSelf, Condition: rayv1.DeletionCondition{ - JobStatus: rayv1.JobStatusFailed, - TTLSecondsAfterFinished: 0, + JobStatus: rayv1.JobStatusFailed, + TTLSeconds: 0, }, }, }, @@ -3379,22 +3379,22 @@ var _ = Context("RayJob with different submission modes", func() { { Policy: rayv1.DeleteWorkers, Condition: rayv1.DeletionCondition{ - JobStatus: rayv1.JobStatusFailed, - TTLSecondsAfterFinished: 0, + JobStatus: rayv1.JobStatusFailed, + TTLSeconds: 0, }, }, { Policy: rayv1.DeleteCluster, Condition: rayv1.DeletionCondition{ - JobStatus: rayv1.JobStatusFailed, - TTLSecondsAfterFinished: 0, + JobStatus: 
rayv1.JobStatusFailed, + TTLSeconds: 0, }, }, { Policy: rayv1.DeleteSelf, Condition: rayv1.DeletionCondition{ - JobStatus: rayv1.JobStatusFailed, - TTLSecondsAfterFinished: 0, + JobStatus: rayv1.JobStatusFailed, + TTLSeconds: 0, }, }, }, @@ -3510,22 +3510,22 @@ var _ = Context("RayJob with different submission modes", func() { { Policy: rayv1.DeleteWorkers, Condition: rayv1.DeletionCondition{ - JobStatus: rayv1.JobStatusSucceeded, - TTLSecondsAfterFinished: 0, // Stage 1: Delete workers after 0 seconds + JobStatus: rayv1.JobStatusSucceeded, + TTLSeconds: 0, // Stage 1: Delete workers after 0 seconds }, }, { Policy: rayv1.DeleteCluster, Condition: rayv1.DeletionCondition{ - JobStatus: rayv1.JobStatusSucceeded, - TTLSecondsAfterFinished: 5, // Stage 2: Delete cluster after 5 seconds + JobStatus: rayv1.JobStatusSucceeded, + TTLSeconds: 5, // Stage 2: Delete cluster after 5 seconds }, }, { Policy: rayv1.DeleteSelf, Condition: rayv1.DeletionCondition{ - JobStatus: rayv1.JobStatusSucceeded, - TTLSecondsAfterFinished: 10, // Stage 3: Delete self after 10 seconds + JobStatus: rayv1.JobStatusSucceeded, + TTLSeconds: 10, // Stage 3: Delete self after 10 seconds }, }, }, @@ -3538,22 +3538,22 @@ var _ = Context("RayJob with different submission modes", func() { { Policy: rayv1.DeleteWorkers, Condition: rayv1.DeletionCondition{ - JobStatus: rayv1.JobStatusSucceeded, - TTLSecondsAfterFinished: 0, + JobStatus: rayv1.JobStatusSucceeded, + TTLSeconds: 0, }, }, { Policy: rayv1.DeleteCluster, Condition: rayv1.DeletionCondition{ - JobStatus: rayv1.JobStatusSucceeded, - TTLSecondsAfterFinished: 5, + JobStatus: rayv1.JobStatusSucceeded, + TTLSeconds: 5, }, }, { Policy: rayv1.DeleteSelf, Condition: rayv1.DeletionCondition{ - JobStatus: rayv1.JobStatusSucceeded, - TTLSecondsAfterFinished: 10, + JobStatus: rayv1.JobStatusSucceeded, + TTLSeconds: 10, }, }, }, @@ -3718,22 +3718,22 @@ var _ = Context("RayJob with different submission modes", func() { { Policy: rayv1.DeleteWorkers, Condition: rayv1.DeletionCondition{ - JobStatus: rayv1.JobStatusFailed, - TTLSecondsAfterFinished: 0, // Stage 1: Delete workers after 0 seconds + JobStatus: rayv1.JobStatusFailed, + TTLSeconds: 0, // Stage 1: Delete workers after 0 seconds }, }, { Policy: rayv1.DeleteCluster, Condition: rayv1.DeletionCondition{ - JobStatus: rayv1.JobStatusFailed, - TTLSecondsAfterFinished: 5, // Stage 2: Delete cluster after 5 seconds + JobStatus: rayv1.JobStatusFailed, + TTLSeconds: 5, // Stage 2: Delete cluster after 5 seconds }, }, { Policy: rayv1.DeleteSelf, Condition: rayv1.DeletionCondition{ - JobStatus: rayv1.JobStatusFailed, - TTLSecondsAfterFinished: 10, // Stage 3: Delete self after 10 seconds + JobStatus: rayv1.JobStatusFailed, + TTLSeconds: 10, // Stage 3: Delete self after 10 seconds }, }, }, @@ -3746,22 +3746,22 @@ var _ = Context("RayJob with different submission modes", func() { { Policy: rayv1.DeleteWorkers, Condition: rayv1.DeletionCondition{ - JobStatus: rayv1.JobStatusFailed, - TTLSecondsAfterFinished: 0, + JobStatus: rayv1.JobStatusFailed, + TTLSeconds: 0, }, }, { Policy: rayv1.DeleteCluster, Condition: rayv1.DeletionCondition{ - JobStatus: rayv1.JobStatusFailed, - TTLSecondsAfterFinished: 5, + JobStatus: rayv1.JobStatusFailed, + TTLSeconds: 5, }, }, { Policy: rayv1.DeleteSelf, Condition: rayv1.DeletionCondition{ - JobStatus: rayv1.JobStatusFailed, - TTLSecondsAfterFinished: 10, + JobStatus: rayv1.JobStatusFailed, + TTLSeconds: 10, }, }, }, diff --git a/ray-operator/controllers/ray/utils/validation.go 
b/ray-operator/controllers/ray/utils/validation.go index 7e66aec8f41..26ac8db2cc8 100644 --- a/ray-operator/controllers/ray/utils/validation.go +++ b/ray-operator/controllers/ray/utils/validation.go @@ -308,7 +308,7 @@ func validateDeletionRules(rayJob *rayv1.RayJob) error { // Single pass: Validate each rule individually and group for later consistency checks. for i, rule := range rules { // Validate TTL is non-negative. - if rule.Condition.TTLSecondsAfterFinished < 0 { + if rule.Condition.TTLSeconds < 0 { errs = append(errs, fmt.Errorf("deletionRules[%d]: TTLSecondsAfterFinished must be non-negative", i)) continue } @@ -325,19 +325,19 @@ func validateDeletionRules(rayJob *rayv1.RayJob) error { } // Group valid rule for consistency check. - statusMap, ok := rulesByStatus[rule.Condition.JobStatus] + policyTTLs, ok := rulesByStatus[rule.Condition.JobStatus] if !ok { - statusMap = make(map[rayv1.DeletionPolicyType]int32) - rulesByStatus[rule.Condition.JobStatus] = statusMap + policyTTLs = make(map[rayv1.DeletionPolicyType]int32) + rulesByStatus[rule.Condition.JobStatus] = policyTTLs } // Check for uniqueness of (JobStatus, DeletionPolicyType) pair. - if _, exists := statusMap[rule.Policy]; exists { + if _, exists := policyTTLs[rule.Policy]; exists { errs = append(errs, fmt.Errorf("deletionRules[%d]: duplicate rule for DeletionPolicyType '%s' and JobStatus '%s'", i, rule.Policy, rule.Condition.JobStatus)) continue } - statusMap[rule.Policy] = rule.Condition.TTLSecondsAfterFinished + policyTTLs[rule.Policy] = rule.Condition.TTLSeconds } // Second pass: Validate TTL consistency per JobStatus. diff --git a/ray-operator/controllers/ray/utils/validation_test.go b/ray-operator/controllers/ray/utils/validation_test.go index e298d2d45b5..2d69ae13c46 100644 --- a/ray-operator/controllers/ray/utils/validation_test.go +++ b/ray-operator/controllers/ray/utils/validation_test.go @@ -1079,8 +1079,8 @@ func TestValidateRayJobSpecWithFeatureGate(t *testing.T) { { Policy: rayv1.DeleteSelf, Condition: rayv1.DeletionCondition{ - JobStatus: rayv1.JobStatusSucceeded, - TTLSecondsAfterFinished: 10, + JobStatus: rayv1.JobStatusSucceeded, + TTLSeconds: 10, }, }, }, @@ -1098,8 +1098,8 @@ func TestValidateRayJobSpecWithFeatureGate(t *testing.T) { { Policy: rayv1.DeleteSelf, Condition: rayv1.DeletionCondition{ - JobStatus: rayv1.JobStatusSucceeded, - TTLSecondsAfterFinished: 10, + JobStatus: rayv1.JobStatusSucceeded, + TTLSeconds: 10, }, }, }, @@ -1119,8 +1119,8 @@ func TestValidateRayJobSpecWithFeatureGate(t *testing.T) { { Policy: rayv1.DeleteSelf, Condition: rayv1.DeletionCondition{ - JobStatus: rayv1.JobStatusSucceeded, - TTLSecondsAfterFinished: 10, + JobStatus: rayv1.JobStatusSucceeded, + TTLSeconds: 10, }, }, }, @@ -1145,15 +1145,15 @@ func TestValidateRayJobSpecWithFeatureGate(t *testing.T) { { Policy: rayv1.DeleteSelf, Condition: rayv1.DeletionCondition{ - JobStatus: rayv1.JobStatusSucceeded, - TTLSecondsAfterFinished: 10, + JobStatus: rayv1.JobStatusSucceeded, + TTLSeconds: 10, }, }, { Policy: rayv1.DeleteSelf, Condition: rayv1.DeletionCondition{ - JobStatus: rayv1.JobStatusSucceeded, - TTLSecondsAfterFinished: 20, + JobStatus: rayv1.JobStatusSucceeded, + TTLSeconds: 20, }, }, }, @@ -1170,8 +1170,8 @@ func TestValidateRayJobSpecWithFeatureGate(t *testing.T) { { Policy: rayv1.DeleteSelf, Condition: rayv1.DeletionCondition{ - JobStatus: rayv1.JobStatusSucceeded, - TTLSecondsAfterFinished: -10, + JobStatus: rayv1.JobStatusSucceeded, + TTLSeconds: -10, }, }, }, @@ -1189,8 +1189,8 @@ func 
TestValidateRayJobSpecWithFeatureGate(t *testing.T) { { Policy: rayv1.DeleteCluster, Condition: rayv1.DeletionCondition{ - JobStatus: rayv1.JobStatusSucceeded, - TTLSecondsAfterFinished: 10, + JobStatus: rayv1.JobStatusSucceeded, + TTLSeconds: 10, }, }, }, @@ -1206,8 +1206,8 @@ func TestValidateRayJobSpecWithFeatureGate(t *testing.T) { { Policy: rayv1.DeleteWorkers, Condition: rayv1.DeletionCondition{ - JobStatus: rayv1.JobStatusSucceeded, - TTLSecondsAfterFinished: 10, + JobStatus: rayv1.JobStatusSucceeded, + TTLSeconds: 10, }, }, }, @@ -1227,15 +1227,15 @@ func TestValidateRayJobSpecWithFeatureGate(t *testing.T) { { Policy: rayv1.DeleteWorkers, Condition: rayv1.DeletionCondition{ - JobStatus: rayv1.JobStatusSucceeded, - TTLSecondsAfterFinished: 20, + JobStatus: rayv1.JobStatusSucceeded, + TTLSeconds: 20, }, }, { Policy: rayv1.DeleteCluster, Condition: rayv1.DeletionCondition{ - JobStatus: rayv1.JobStatusSucceeded, - TTLSecondsAfterFinished: 10, + JobStatus: rayv1.JobStatusSucceeded, + TTLSeconds: 10, }, }, }, @@ -1252,15 +1252,15 @@ func TestValidateRayJobSpecWithFeatureGate(t *testing.T) { { Policy: rayv1.DeleteCluster, Condition: rayv1.DeletionCondition{ - JobStatus: rayv1.JobStatusSucceeded, - TTLSecondsAfterFinished: 20, + JobStatus: rayv1.JobStatusSucceeded, + TTLSeconds: 20, }, }, { Policy: rayv1.DeleteSelf, Condition: rayv1.DeletionCondition{ - JobStatus: rayv1.JobStatusSucceeded, - TTLSecondsAfterFinished: 10, + JobStatus: rayv1.JobStatusSucceeded, + TTLSeconds: 10, }, }, }, @@ -1277,29 +1277,29 @@ func TestValidateRayJobSpecWithFeatureGate(t *testing.T) { { Policy: rayv1.DeleteWorkers, Condition: rayv1.DeletionCondition{ - JobStatus: rayv1.JobStatusSucceeded, - TTLSecondsAfterFinished: 10, + JobStatus: rayv1.JobStatusSucceeded, + TTLSeconds: 10, }, }, { Policy: rayv1.DeleteCluster, Condition: rayv1.DeletionCondition{ - JobStatus: rayv1.JobStatusSucceeded, - TTLSecondsAfterFinished: 20, + JobStatus: rayv1.JobStatusSucceeded, + TTLSeconds: 20, }, }, { Policy: rayv1.DeleteSelf, Condition: rayv1.DeletionCondition{ - JobStatus: rayv1.JobStatusSucceeded, - TTLSecondsAfterFinished: 30, + JobStatus: rayv1.JobStatusSucceeded, + TTLSeconds: 30, }, }, { Policy: rayv1.DeleteSelf, Condition: rayv1.DeletionCondition{ - JobStatus: rayv1.JobStatusFailed, - TTLSecondsAfterFinished: 0, + JobStatus: rayv1.JobStatusFailed, + TTLSeconds: 0, }, }, }, diff --git a/ray-operator/pkg/client/applyconfiguration/ray/v1/deletioncondition.go b/ray-operator/pkg/client/applyconfiguration/ray/v1/deletioncondition.go index 25e1a881dbb..36b8c006209 100644 --- a/ray-operator/pkg/client/applyconfiguration/ray/v1/deletioncondition.go +++ b/ray-operator/pkg/client/applyconfiguration/ray/v1/deletioncondition.go @@ -9,8 +9,8 @@ import ( // DeletionConditionApplyConfiguration represents a declarative configuration of the DeletionCondition type for use // with apply. 
type DeletionConditionApplyConfiguration struct { - JobStatus *rayv1.JobStatus `json:"jobStatus,omitempty"` - TTLSecondsAfterFinished *int32 `json:"ttlSecondsAfterFinished,omitempty"` + JobStatus *rayv1.JobStatus `json:"jobStatus,omitempty"` + TTLSeconds *int32 `json:"ttlSeconds,omitempty"` } // DeletionConditionApplyConfiguration constructs a declarative configuration of the DeletionCondition type for use with @@ -27,10 +27,10 @@ func (b *DeletionConditionApplyConfiguration) WithJobStatus(value rayv1.JobStatu return b } -// WithTTLSecondsAfterFinished sets the TTLSecondsAfterFinished field in the declarative configuration to the given value +// WithTTLSeconds sets the TTLSeconds field in the declarative configuration to the given value // and returns the receiver, so that objects can be built by chaining "With" function invocations. -// If called multiple times, the TTLSecondsAfterFinished field is set to the value of the last call. -func (b *DeletionConditionApplyConfiguration) WithTTLSecondsAfterFinished(value int32) *DeletionConditionApplyConfiguration { - b.TTLSecondsAfterFinished = &value +// If called multiple times, the TTLSeconds field is set to the value of the last call. +func (b *DeletionConditionApplyConfiguration) WithTTLSeconds(value int32) *DeletionConditionApplyConfiguration { + b.TTLSeconds = &value return b } From e952550a3e150ffb99322c3a81adb1550469e0e2 Mon Sep 17 00:00:00 2001 From: wei-chenglai Date: Wed, 17 Sep 2025 00:35:40 +0000 Subject: [PATCH 07/21] fix typo --- .../samples/ray-job.deletion-rules.yaml | 119 ++---------------- 1 file changed, 12 insertions(+), 107 deletions(-) diff --git a/ray-operator/config/samples/ray-job.deletion-rules.yaml b/ray-operator/config/samples/ray-job.deletion-rules.yaml index f6605e274c3..89112b14150 100644 --- a/ray-operator/config/samples/ray-job.deletion-rules.yaml +++ b/ray-operator/config/samples/ray-job.deletion-rules.yaml @@ -1,13 +1,14 @@ apiVersion: ray.io/v1 kind: RayJob metadata: - name: rayjob-sample + name: rayjob-deletion-rules spec: - # submissionMode specifies how RayJob submits the Ray job to the RayCluster. - # The default value is "K8sJobMode", meaning RayJob will submit the Ray job via a submitter Kubernetes Job. - # The alternative value is "HTTPMode", indicating that KubeRay will submit the Ray job by sending an HTTP request to the RayCluster. - # submissionMode: "K8sJobMode" - entrypoint: python /home/ray/samples/sample_code.py + entrypoint: | + python -c " + import ray + ray.init() + print(f'ray.cluster_resources(): {ray.cluster_resources()}') + " # DeletionStrategy defines the deletion policies for a RayJob. # It allows for fine-grained control over resource cleanup after a job finishes. # DeletionRules is a list of deletion rules, processed based on their trigger conditions. @@ -20,7 +21,7 @@ spec: # 2. After 60 seconds, the entire RayCluster (including the head pod) is deleted. # 3. After 90 seconds, the RayJob custom resource itself is deleted, removing it from the Kubernetes API server. deletionRules: - - condition: + - condition: jobStatus: FAILED ttlSeconds: 30 policy: DeleteWorkers @@ -32,7 +33,7 @@ spec: jobStatus: FAILED ttlSeconds: 90 policy: DeleteSelf - - condition: + - condition: jobStatus: SUCCEEDED ttlSeconds: 30 policy: DeleteWorkers @@ -44,36 +45,11 @@ spec: jobStatus: SUCCEEDED ttlSeconds: 90 policy: DeleteSelf - - # activeDeadlineSeconds is the duration in seconds that the RayJob may be active before - # KubeRay actively tries to terminate the RayJob; value must be positive integer. 
- # activeDeadlineSeconds: 120 - - # RuntimeEnvYAML represents the runtime environment configuration provided as a multi-line YAML string. - # See https://docs.ray.io/en/latest/ray-core/handling-dependencies.html for details. - # (New in KubeRay version 1.0.) - runtimeEnvYAML: | - pip: - - requests==2.26.0 - - pendulum==2.1.2 - env_vars: - counter_name: "test_counter" - - # Suspend specifies whether the RayJob controller should create a RayCluster instance. - # If a job is applied with the suspend field set to true, the RayCluster will not be created and we will wait for the transition to false. - # If the RayCluster is already created, it will be deleted. In the case of transition to false, a new RayCluster will be created. - # suspend: false - # rayClusterSpec specifies the RayCluster instance to be created by the RayJob controller. rayClusterSpec: - rayVersion: '2.46.0' # should match the Ray version in the image of the containers - # Ray head pod template + rayVersion: '2.46.0' headGroupSpec: - # The `rayStartParams` are used to configure the `ray start` command. - # See https://github.com/ray-project/kuberay/blob/master/docs/guidance/rayStartParams.md for the default settings of `rayStartParams` in KubeRay. - # See https://docs.ray.io/en/latest/cluster/cli.html#ray-start for all available options in `rayStartParams`. rayStartParams: {} - #pod template template: spec: containers: @@ -82,7 +58,7 @@ spec: ports: - containerPort: 6379 name: gcs-server - - containerPort: 8265 # Ray dashboard + - containerPort: 8265 name: dashboard - containerPort: 10001 name: client @@ -91,90 +67,19 @@ spec: cpu: "1" requests: cpu: "200m" - volumeMounts: - - mountPath: /home/ray/samples - name: code-sample - volumes: - # You set volumes at the Pod level, then mount them into containers inside that Pod - - name: code-sample - configMap: - # Provide the name of the ConfigMap you want to mount. - name: ray-job-code-sample - # An array of keys from the ConfigMap to create as files - items: - - key: sample_code.py - path: sample_code.py workerGroupSpecs: - # the pod replicas in this group typed worker - replicas: 1 minReplicas: 1 maxReplicas: 5 - # logical group name, for this called small-group, also can be functional groupName: small-group - # The `rayStartParams` are used to configure the `ray start` command. - # See https://github.com/ray-project/kuberay/blob/master/docs/guidance/rayStartParams.md for the default settings of `rayStartParams` in KubeRay. - # See https://docs.ray.io/en/latest/cluster/cli.html#ray-start for all available options in `rayStartParams`. rayStartParams: {} - #pod template template: spec: containers: - - name: ray-worker # must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name', or '123-abc' + - name: ray-worker image: rayproject/ray:2.46.0 resources: limits: cpu: "1" requests: cpu: "200m" - - # SubmitterPodTemplate is the template for the pod that will run the `ray job submit` command against the RayCluster. - # If SubmitterPodTemplate is specified, the first container is assumed to be the submitter container. - # submitterPodTemplate: - # spec: - # restartPolicy: Never - # containers: - # - name: my-custom-rayjob-submitter-pod - # image: rayproject/ray:2.46.0 - # # If Command is not specified, the correct command will be supplied at runtime using the RayJob spec `entrypoint` field. - # # Specifying Command is not recommended. 
- # # command: ["sh", "-c", "ray job submit --address=http://$RAY_DASHBOARD_ADDRESS --submission-id=$RAY_JOB_SUBMISSION_ID -- echo hello world"] - - -######################Ray code sample################################# -# this sample is from https://docs.ray.io/en/latest/cluster/job-submission.html#quick-start-example -# it is mounted into the container and executed to show the Ray job at work ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: ray-job-code-sample -data: - sample_code.py: | - import ray - import os - import requests - - ray.init() - - @ray.remote - class Counter: - def __init__(self): - # Used to verify runtimeEnv - self.name = os.getenv("counter_name") - assert self.name == "test_counter" - self.counter = 0 - - def inc(self): - self.counter += 1 - - def get_counter(self): - return "{} got {}".format(self.name, self.counter) - - counter = Counter.remote() - - for _ in range(5): - ray.get(counter.inc.remote()) - print(ray.get(counter.get_counter.remote())) - - # Verify that the correct runtime env was used for the job. - assert requests.__version__ == "2.26.0" From a81ffa40887f855e11ce42463e89bbaed1213708 Mon Sep 17 00:00:00 2001 From: wei-chenglai Date: Tue, 16 Sep 2025 21:37:27 -0400 Subject: [PATCH 08/21] modify comment --- docs/reference/api.md | 2 +- ray-operator/apis/ray/v1/rayjob_types.go | 2 +- .../controllers/ray/utils/validation.go | 2 +- .../controllers/ray/utils/validation_test.go | 20 ++++++++++++++++++- 4 files changed, 22 insertions(+), 4 deletions(-) diff --git a/docs/reference/api.md b/docs/reference/api.md index ec300a8e342..e951c3cb474 100644 --- a/docs/reference/api.md +++ b/docs/reference/api.md @@ -157,7 +157,7 @@ _Appears in:_ | --- | --- | --- | --- | | `onSuccess` _[DeletionPolicy](#deletionpolicy)_ | OnSuccess is the deletion policy for a successful RayJob.
Deprecated: Use `deletionRules` instead for more flexible, multi-stage deletion strategies.
This field will be removed in release 1.16.0. | | | | `onFailure` _[DeletionPolicy](#deletionpolicy)_ | OnFailure is the deletion policy for a failed RayJob.
Deprecated: Use `deletionRules` instead for more flexible, multi-stage deletion strategies.
This field will be removed in release 1.16.0. | | | -| `deletionRules` _[DeletionRule](#deletionrule) array_ | DeletionRules is a list of deletion rules, processed based on their trigger conditions.
While the rules can be used to define a sequence, if multiple rules are overdue (e.g., due to controller downtime),
the most impactful rule (e.g., DeleteCluster) will be executed first to prioritize resource cleanup and cost savings. | | | +| `deletionRules` _[DeletionRule](#deletionrule) array_ | DeletionRules is a list of deletion rules, processed based on their trigger conditions.
While the rules can be used to define a sequence, if multiple rules are overdue (e.g., due to controller downtime),
the most impactful rule (e.g., DeleteSelf) will be executed first to prioritize resource cleanup and cost savings. | | | diff --git a/ray-operator/apis/ray/v1/rayjob_types.go b/ray-operator/apis/ray/v1/rayjob_types.go index a60e30aa9b1..3314ac463bc 100644 --- a/ray-operator/apis/ray/v1/rayjob_types.go +++ b/ray-operator/apis/ray/v1/rayjob_types.go @@ -124,7 +124,7 @@ type DeletionStrategy struct { // DeletionRules is a list of deletion rules, processed based on their trigger conditions. // While the rules can be used to define a sequence, if multiple rules are overdue (e.g., due to controller downtime), - // the most impactful rule (e.g., DeleteCluster) will be executed first to prioritize resource cleanup and cost savings. + // the most impactful rule (e.g., DeleteSelf) will be executed first to prioritize resource cleanup and cost savings. // +optional // +listType=atomic DeletionRules []DeletionRule `json:"deletionRules,omitempty"` diff --git a/ray-operator/controllers/ray/utils/validation.go b/ray-operator/controllers/ray/utils/validation.go index fe379d5dfff..08d98c15ecb 100644 --- a/ray-operator/controllers/ray/utils/validation.go +++ b/ray-operator/controllers/ray/utils/validation.go @@ -318,7 +318,7 @@ func validateDeletionRules(rayJob *rayv1.RayJob) error { for i, rule := range rules { // Validate TTL is non-negative. if rule.Condition.TTLSeconds < 0 { - errs = append(errs, fmt.Errorf("deletionRules[%d]: TTLSecondsAfterFinished must be non-negative", i)) + errs = append(errs, fmt.Errorf("deletionRules[%d]: TTLSeconds must be non-negative", i)) continue } diff --git a/ray-operator/controllers/ray/utils/validation_test.go b/ray-operator/controllers/ray/utils/validation_test.go index 70c4bfae974..16f2911229d 100644 --- a/ray-operator/controllers/ray/utils/validation_test.go +++ b/ray-operator/controllers/ray/utils/validation_test.go @@ -1192,7 +1192,7 @@ func TestValidateRayJobSpecWithFeatureGate(t *testing.T) { expectError: true, }, { - name: "negative TTLSecondsAfterFinished in deletionRules", + name: "negative TTLSeconds in deletionRules", spec: rayv1.RayJobSpec{ DeletionStrategy: &rayv1.DeletionStrategy{ DeletionRules: []rayv1.DeletionRule{ @@ -1209,6 +1209,24 @@ func TestValidateRayJobSpecWithFeatureGate(t *testing.T) { }, expectError: true, }, + { + name: "deletionRules with ClusterSelector and DeleteWorkers policy", + spec: rayv1.RayJobSpec{ + ClusterSelector: map[string]string{"key": "value"}, + DeletionStrategy: &rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteWorkers, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + TTLSeconds: 10, + }, + }, + }, + }, + }, + expectError: true, + }, { name: "deletionRules with ClusterSelector and DeleteCluster policy", spec: rayv1.RayJobSpec{ From b83078a4f764b4edd4cf6c7a19a619b5078b5a54 Mon Sep 17 00:00:00 2001 From: wei-chenglai Date: Tue, 16 Sep 2025 22:34:19 -0400 Subject: [PATCH 09/21] address comment --- docs/reference/api.md | 6 +-- ray-operator/apis/ray/v1/rayjob_types.go | 6 +-- .../controllers/ray/rayjob_controller.go | 48 ++++++++----------- 3 files changed, 26 insertions(+), 34 deletions(-) diff --git a/docs/reference/api.md b/docs/reference/api.md index e951c3cb474..ca3ed50356c 100644 --- a/docs/reference/api.md +++ b/docs/reference/api.md @@ -130,7 +130,7 @@ It allows for fine-grained control over resource cleanup after a job finishes. 
Legacy fields `onSuccess` and `onFailure` are still supported for backward compatibility, but it is highly recommended to migrate to the new `deletionRules` field. -`onSuccess` and `onFailure` will be removed in release 1.16.0. +`onSuccess` and `onFailure` will be removed in release 1.6.0. Notes: @@ -155,8 +155,8 @@ _Appears in:_ | Field | Description | Default | Validation | | --- | --- | --- | --- | -| `onSuccess` _[DeletionPolicy](#deletionpolicy)_ | OnSuccess is the deletion policy for a successful RayJob.
Deprecated: Use `deletionRules` instead for more flexible, multi-stage deletion strategies.
This field will be removed in release 1.16.0. | | | -| `onFailure` _[DeletionPolicy](#deletionpolicy)_ | OnFailure is the deletion policy for a failed RayJob.
Deprecated: Use `deletionRules` instead for more flexible, multi-stage deletion strategies.
This field will be removed in release 1.16.0. | | | +| `onSuccess` _[DeletionPolicy](#deletionpolicy)_ | OnSuccess is the deletion policy for a successful RayJob.
Deprecated: Use `deletionRules` instead for more flexible, multi-stage deletion strategies.
This field will be removed in release 1.6.0. | | | +| `onFailure` _[DeletionPolicy](#deletionpolicy)_ | OnFailure is the deletion policy for a failed RayJob.
Deprecated: Use `deletionRules` instead for more flexible, multi-stage deletion strategies.
This field will be removed in release 1.6.0. | | | | `deletionRules` _[DeletionRule](#deletionrule) array_ | DeletionRules is a list of deletion rules, processed based on their trigger conditions.
While the rules can be used to define a sequence, if multiple rules are overdue (e.g., due to controller downtime),
the most impactful rule (e.g., DeleteSelf) will be executed first to prioritize resource cleanup and cost savings. | | | diff --git a/ray-operator/apis/ray/v1/rayjob_types.go b/ray-operator/apis/ray/v1/rayjob_types.go index 3314ac463bc..8abeec9eb6f 100644 --- a/ray-operator/apis/ray/v1/rayjob_types.go +++ b/ray-operator/apis/ray/v1/rayjob_types.go @@ -92,7 +92,7 @@ const ( // // Legacy fields `onSuccess` and `onFailure` are still supported for backward compatibility, // but it is highly recommended to migrate to the new `deletionRules` field. -// `onSuccess` and `onFailure` will be removed in release 1.16.0. +// `onSuccess` and `onFailure` will be removed in release 1.6.0. // // Notes: // - When this block is set, you must configure **either** @@ -112,13 +112,13 @@ const ( type DeletionStrategy struct { // OnSuccess is the deletion policy for a successful RayJob. // Deprecated: Use `deletionRules` instead for more flexible, multi-stage deletion strategies. - // This field will be removed in release 1.16.0. + // This field will be removed in release 1.6.0. // +optional OnSuccess *DeletionPolicy `json:"onSuccess,omitempty"` // OnFailure is the deletion policy for a failed RayJob. // Deprecated: Use `deletionRules` instead for more flexible, multi-stage deletion strategies. - // This field will be removed in release 1.16.0. + // This field will be removed in release 1.6.0. // +optional OnFailure *DeletionPolicy `json:"onFailure,omitempty"` diff --git a/ray-operator/controllers/ray/rayjob_controller.go b/ray-operator/controllers/ray/rayjob_controller.go index 5a2fdada794..fb139256189 100644 --- a/ray-operator/controllers/ray/rayjob_controller.go +++ b/ray-operator/controllers/ray/rayjob_controller.go @@ -365,7 +365,26 @@ func (r *RayJobReconciler) Reconcile(ctx context.Context, request ctrl.Request) return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, nil case rayv1.JobDeploymentStatusComplete, rayv1.JobDeploymentStatusFailed: // The RayJob has reached a terminal state. Handle the cleanup and deletion logic. - return r.handleFinishedRayJob(ctx, rayJobInstance) + // If the RayJob uses an existing RayCluster, we must not delete it. + if len(rayJobInstance.Spec.ClusterSelector) > 0 { + logger.Info("RayJob is using an existing RayCluster via clusterSelector; skipping resource deletion.", "RayClusterSelector", rayJobInstance.Spec.ClusterSelector) + return ctrl.Result{}, nil + } + + if features.Enabled(features.RayJobDeletionPolicy) && rayJobInstance.Spec.DeletionStrategy != nil { + // The previous validation logic ensures that either DeletionRules or the legacy policies are set, but not both. + if len(rayJobInstance.Spec.DeletionStrategy.DeletionRules) > 0 { + return r.handleDeletionRules(ctx, rayJobInstance) + } + return r.handleLegacyDeletionPolicy(ctx, rayJobInstance) + } + + if rayJobInstance.Spec.ShutdownAfterJobFinishes { + return r.handleShutdownAfterJobFinishes(ctx, rayJobInstance) + } + + // Default: No deletion policy is configured. The reconciliation is complete for this RayJob. + return ctrl.Result{}, nil default: logger.Info("Unknown JobDeploymentStatus", "JobDeploymentStatus", rayJobInstance.Status.JobDeploymentStatus) return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, nil @@ -1089,33 +1108,6 @@ func isSubmitterContainerFinished(pod *corev1.Pod) bool { return false } -// handleFinishedRayJob is the main entry point for handling cleanup of a completed or failed RayJob. 
-// It acts as a dispatcher, selecting the appropriate deletion mechanism based on the RayJob spec. -func (r *RayJobReconciler) handleFinishedRayJob(ctx context.Context, rayJob *rayv1.RayJob) (ctrl.Result, error) { - logger := ctrl.LoggerFrom(ctx) - - // If the RayJob uses an existing RayCluster, we must not delete it. - if len(rayJob.Spec.ClusterSelector) > 0 { - logger.Info("RayJob is using an existing RayCluster via clusterSelector; skipping resource deletion.", "RayClusterSelector", rayJob.Spec.ClusterSelector) - return ctrl.Result{}, nil - } - - if features.Enabled(features.RayJobDeletionPolicy) && rayJob.Spec.DeletionStrategy != nil { - // The previous validation logic ensures that either DeletionRules or the legacy policies are set, but not both. - if len(rayJob.Spec.DeletionStrategy.DeletionRules) > 0 { - return r.handleDeletionRules(ctx, rayJob) - } - return r.handleLegacyDeletionPolicy(ctx, rayJob) - } - - if rayJob.Spec.ShutdownAfterJobFinishes { - return r.handleShutdownAfterJobFinishes(ctx, rayJob) - } - - // Default: No deletion policy is configured. The reconciliation is complete for this RayJob. - return ctrl.Result{}, nil -} - // handleDeletionRules processes the DeletionRules with a impact-aware strategy. // It categorizes rules into "overdue" and "pending". If overdue rules exist, // it executes the most destructive one and then requeues for the next pending rule. From 776e924784fb42ad590ba23550c7c420c8b827af Mon Sep 17 00:00:00 2001 From: wei-chenglai Date: Sun, 21 Sep 2025 16:22:14 -0400 Subject: [PATCH 10/21] remove duplicate errors pkg --- ray-operator/controllers/ray/utils/validation.go | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/ray-operator/controllers/ray/utils/validation.go b/ray-operator/controllers/ray/utils/validation.go index 08d98c15ecb..c11998e8938 100644 --- a/ray-operator/controllers/ray/utils/validation.go +++ b/ray-operator/controllers/ray/utils/validation.go @@ -1,7 +1,6 @@ package utils import ( - "errors" errstd "errors" "fmt" @@ -356,7 +355,7 @@ func validateDeletionRules(rayJob *rayv1.RayJob) error { } } - return errors.Join(errs...) + return errstd.Join(errs...) } // validateTTLConsistency ensures TTLs follow the deletion hierarchy: Workers <= Cluster <= Self. @@ -393,7 +392,7 @@ func validateTTLConsistency(policyTTLs map[rayv1.DeletionPolicyType]int32, statu hasPrev = true } - return errors.Join(errs...) + return errstd.Join(errs...) } // validateLegacyDeletionPolicies handles validation for the old `onSuccess` and `onFailure` fields. From fafa62fb4a3e6db711fcbd1476b572ad648ffb47 Mon Sep 17 00:00:00 2001 From: wei-chenglai Date: Tue, 23 Sep 2025 22:00:55 -0400 Subject: [PATCH 11/21] improve api doc --- docs/reference/api.md | 6 +++++- ray-operator/apis/ray/v1/rayjob_types.go | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/docs/reference/api.md b/docs/reference/api.md index ca3ed50356c..414e6710246 100644 --- a/docs/reference/api.md +++ b/docs/reference/api.md @@ -139,7 +139,11 @@ Notes: OR (b) the `deletionRules` field (which may be empty, in which case no deletion will occur). - `onSuccess` / `onFailure` must NOT be used together with `deletionRules`. - - `onSuccess` and `onFailure` are **deprecated** and planned for removal in a future release. + - `onSuccess` and `onFailure` are **deprecated** and planned for removal in release 1.6.0. + - `deletionStrategy` is mutually exclusive with `spec.shutdownAfterJobFinishes`. 
+ - If both are set, the controller will report an error and stop processing the RayJob. + - If the `RayJobDeletionPolicy` feature gate is disabled but `deletionStrategy` is set, + the controller will report an error and stop processing the RayJob. Validation rules: diff --git a/ray-operator/apis/ray/v1/rayjob_types.go b/ray-operator/apis/ray/v1/rayjob_types.go index 8abeec9eb6f..e33a7da6e24 100644 --- a/ray-operator/apis/ray/v1/rayjob_types.go +++ b/ray-operator/apis/ray/v1/rayjob_types.go @@ -100,7 +100,11 @@ const ( // OR // (b) the `deletionRules` field (which may be empty, in which case no deletion will occur). // - `onSuccess` / `onFailure` must NOT be used together with `deletionRules`. -// - `onSuccess` and `onFailure` are **deprecated** and planned for removal in a future release. +// - `onSuccess` and `onFailure` are **deprecated** and planned for removal in release 1.6.0. +// - `deletionStrategy` is mutually exclusive with `spec.shutdownAfterJobFinishes`. +// - If both are set, the controller will report an error and stop processing the RayJob. +// - If the `RayJobDeletionPolicy` feature gate is disabled but `deletionStrategy` is set, +// the controller will report an error and stop processing the RayJob. // // Validation rules: // 1. Prevent mixing legacy and new fields From df975f40faa65b90693cc131444759ea42972605 Mon Sep 17 00:00:00 2001 From: wei-chenglai Date: Thu, 25 Sep 2025 12:48:06 -0400 Subject: [PATCH 12/21] add e2e tests for deletion strategy --- .buildkite/build-start-operator.sh | 2 +- docs/reference/api.md | 2 +- ray-operator/Makefile | 8 + ray-operator/apis/ray/v1/rayjob_types.go | 7 +- .../rayjob_deletion_strategy_test.go | 557 ++++++++++++++++++ 5 files changed, 571 insertions(+), 5 deletions(-) create mode 100644 ray-operator/test/e2erayjob/rayjob_deletion_strategy_test.go diff --git a/.buildkite/build-start-operator.sh b/.buildkite/build-start-operator.sh index ef43eba6d71..60468c373d4 100644 --- a/.buildkite/build-start-operator.sh +++ b/.buildkite/build-start-operator.sh @@ -7,7 +7,7 @@ # to kick off from the release branch so tests should match up accordingly. if [ "$IS_FROM_RAY_RELEASE_AUTOMATION" = 1 ]; then - helm repo update && helm install kuberay/kuberay-operator + helm repo update && helm install kuberay/kuberay-operator --set 'featureGates[1].name=RayJobDeletionPolicy' --set 'featureGates[1].enabled=true' KUBERAY_TEST_RAY_IMAGE="rayproject/ray:nightly.$(date +'%y%m%d').${RAY_NIGHTLY_COMMIT:0:6}-py39" && export KUBERAY_TEST_RAY_IMAGE else IMG=kuberay/operator:nightly make docker-image && diff --git a/docs/reference/api.md b/docs/reference/api.md index 414e6710246..3b773074bb5 100644 --- a/docs/reference/api.md +++ b/docs/reference/api.md @@ -305,7 +305,7 @@ _Appears in:_ | `clusterSelector` _object (keys:string, values:string)_ | clusterSelector is used to select running rayclusters by labels | | | | `submitterConfig` _[SubmitterConfig](#submitterconfig)_ | Configurations of submitter k8s job. | | | | `managedBy` _string_ | ManagedBy is an optional configuration for the controller or entity that manages a RayJob.
The value must be either 'ray.io/kuberay-operator' or 'kueue.x-k8s.io/multikueue'.
The kuberay-operator reconciles a RayJob that either omits this field or<br />
sets it to the reserved value 'ray.io/kuberay-operator',<br />
but delegates reconciling RayJobs with 'kueue.x-k8s.io/multikueue' to Kueue.<br />
The field is immutable. | | | -| `deletionStrategy` _[DeletionStrategy](#deletionstrategy)_ | DeletionStrategy indicates what resources of the RayJob and how they are deleted upon job completion.
If unset, deletion policy is based on 'spec.shutdownAfterJobFinishes'.
This field requires the RayJobDeletionPolicy feature gate to be enabled. | | | +| `deletionStrategy` _[DeletionStrategy](#deletionstrategy)_ | DeletionStrategy defines resource cleanup policies after job completion.
Use either legacy fields (onSuccess/onFailure) OR deletionRules, not both.
Mutually exclusive with spec.shutdownAfterJobFinishes.
Requires RayJobDeletionPolicy feature gate to be enabled. | | | | `entrypoint` _string_ | Entrypoint represents the command to start execution. | | | | `runtimeEnvYAML` _string_ | RuntimeEnvYAML represents the runtime environment configuration
provided as a multi-line YAML string. | | | | `jobId` _string_ | If jobId is not set, a new jobId will be auto-generated. | | | diff --git a/ray-operator/Makefile b/ray-operator/Makefile index 3eda8a616c4..3842f2227b8 100644 --- a/ray-operator/Makefile +++ b/ray-operator/Makefile @@ -88,6 +88,14 @@ test-sampleyaml: WHAT ?= ./test/sampleyaml test-sampleyaml: manifests fmt vet go test -timeout 30m -v $(WHAT) +test-e2e-rayjob: WHAT ?= ./test/e2erayjob +test-e2e-rayjob: manifests fmt vet ## Run e2e tests. + go test -timeout 30m -v $(WHAT) + +test-e2e-rayservice: WHAT ?= ./test/e2erayservice +test-e2e-rayservice: manifests fmt vet ## Run e2e tests. + go test -timeout 30m -v $(WHAT) + sync: helm api-docs ./hack/update-codegen.sh diff --git a/ray-operator/apis/ray/v1/rayjob_types.go b/ray-operator/apis/ray/v1/rayjob_types.go index e33a7da6e24..37fd5dbe7f3 100644 --- a/ray-operator/apis/ray/v1/rayjob_types.go +++ b/ray-operator/apis/ray/v1/rayjob_types.go @@ -232,9 +232,10 @@ type RayJobSpec struct { // +kubebuilder:validation:XValidation:rule="self in ['ray.io/kuberay-operator', 'kueue.x-k8s.io/multikueue']",message="the managedBy field value must be either 'ray.io/kuberay-operator' or 'kueue.x-k8s.io/multikueue'" // +optional ManagedBy *string `json:"managedBy,omitempty"` - // DeletionStrategy indicates what resources of the RayJob and how they are deleted upon job completion. - // If unset, deletion policy is based on 'spec.shutdownAfterJobFinishes'. - // This field requires the RayJobDeletionPolicy feature gate to be enabled. + // DeletionStrategy defines resource cleanup policies after job completion. + // Use either legacy fields (onSuccess/onFailure) OR deletionRules, not both. + // Mutually exclusive with spec.shutdownAfterJobFinishes. + // Requires RayJobDeletionPolicy feature gate to be enabled. // +optional DeletionStrategy *DeletionStrategy `json:"deletionStrategy,omitempty"` // Entrypoint represents the command to start execution. diff --git a/ray-operator/test/e2erayjob/rayjob_deletion_strategy_test.go b/ray-operator/test/e2erayjob/rayjob_deletion_strategy_test.go new file mode 100644 index 00000000000..302cee2d796 --- /dev/null +++ b/ray-operator/test/e2erayjob/rayjob_deletion_strategy_test.go @@ -0,0 +1,557 @@ +package e2erayjob + +import ( + "testing" + "time" + + . "github.com/onsi/gomega" + k8serrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1" + rayv1ac "github.com/ray-project/kuberay/ray-operator/pkg/client/applyconfiguration/ray/v1" + . 
"github.com/ray-project/kuberay/ray-operator/test/support" +) + +func TestDeletionStrategy(t *testing.T) { + test := With(t) + g := NewWithT(t) + + // Create a namespace + namespace := test.NewTestNamespace() + + // Job scripts - using existing counter.py for successful jobs and fail.py for failed jobs + // Note: This test suite requires the RayJobDeletionPolicy feature gate to be enabled + jobsAC := NewConfigMap(namespace.Name, Files(test, "counter.py", "fail.py")) + jobs, err := test.Client().Core().CoreV1().ConfigMaps(namespace.Name).Apply(test.Ctx(), jobsAC, TestApplyOptions) + g.Expect(err).NotTo(HaveOccurred()) + LogWithTimestamp(test.T(), "Created ConfigMap %s/%s successfully", jobs.Namespace, jobs.Name) + + test.T().Run("DeletionRules with DeleteWorkers policy should delete only worker pods", func(_ *testing.T) { + // Create RayJob with DeleteWorkers policy and short TTL for faster testing + rayJobAC := rayv1ac.RayJob("delete-workers-test", namespace.Name). + WithSpec(rayv1ac.RayJobSpec(). + WithRayClusterSpec(NewRayClusterSpec(MountConfigMap[rayv1ac.RayClusterSpecApplyConfiguration](jobs, "/home/ray/jobs"))). + WithEntrypoint("python /home/ray/jobs/counter.py"). + WithRuntimeEnvYAML(` +env_vars: + counter_name: test_counter +`). + WithShutdownAfterJobFinishes(false). // Required when using DeletionStrategy + WithDeletionStrategy(rayv1ac.DeletionStrategy(). + WithDeletionRules( + rayv1ac.DeletionRule(). + WithPolicy(rayv1.DeleteWorkers). + WithCondition(rayv1ac.DeletionCondition(). + WithJobStatus(rayv1.JobStatusSucceeded). + WithTTLSeconds(10)), // 10 second TTL for testing + )). + WithSubmitterPodTemplate(JobSubmitterPodTemplateApplyConfiguration())) + + rayJob, err := test.Client().Ray().RayV1().RayJobs(namespace.Name).Apply(test.Ctx(), rayJobAC, TestApplyOptions) + g.Expect(err).NotTo(HaveOccurred()) + LogWithTimestamp(test.T(), "Created RayJob %s/%s successfully", rayJob.Namespace, rayJob.Name) + + // Wait for job to complete successfully + g.Eventually(RayJob(test, rayJob.Namespace, rayJob.Name), TestTimeoutMedium). + Should(WithTransform(RayJobStatus, Equal(rayv1.JobStatusSucceeded))) + LogWithTimestamp(test.T(), "RayJob %s/%s completed successfully", rayJob.Namespace, rayJob.Name) + + // Get the associated RayCluster name. We assert it's non-empty explicitly so that + // test failures surface here (clear message) rather than later when using an empty name. + rayJob, err = GetRayJob(test, rayJob.Namespace, rayJob.Name) + g.Expect(err).NotTo(HaveOccurred()) + rayClusterName := rayJob.Status.RayClusterName + g.Expect(rayClusterName).NotTo(BeEmpty()) + + // Verify cluster and workers exist initially + g.Eventually(RayCluster(test, namespace.Name, rayClusterName), TestTimeoutShort). 
+ Should(WithTransform(RayClusterState, Equal(rayv1.Ready))) + + // Count initial worker pods + cluster, err := GetRayCluster(test, namespace.Name, rayClusterName) + g.Expect(err).NotTo(HaveOccurred()) + initialWorkerPods, err := GetWorkerPods(test, cluster) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(len(initialWorkerPods)).To(BeNumerically(">", 0)) + LogWithTimestamp(test.T(), "Found %d worker pods initially", len(initialWorkerPods)) + + // Verify resources persist during TTL wait period (first 8 seconds of 10s TTL) + LogWithTimestamp(test.T(), "Verifying resources persist during TTL wait period...") + g.Consistently(func(gg Gomega) { + cluster, err := GetRayCluster(test, namespace.Name, rayClusterName) + gg.Expect(err).NotTo(HaveOccurred()) + gg.Expect(cluster).NotTo(BeNil()) + workerPods, err := GetWorkerPods(test, cluster) + gg.Expect(err).NotTo(HaveOccurred()) + gg.Expect(len(workerPods)).To(BeNumerically(">", 0)) + headPod, err := GetHeadPod(test, cluster) + gg.Expect(err).NotTo(HaveOccurred()) + gg.Expect(headPod).NotTo(BeNil()) + jobObj, err := GetRayJob(test, rayJob.Namespace, rayJob.Name) + gg.Expect(err).NotTo(HaveOccurred()) + gg.Expect(jobObj).NotTo(BeNil()) + }, 8*time.Second, 2*time.Second).Should(Succeed()) // Check every 2s for 8s + LogWithTimestamp(test.T(), "Resources confirmed stable during TTL wait period") + + // Wait for TTL to expire and workers to be deleted + LogWithTimestamp(test.T(), "Waiting for TTL to expire and workers to be deleted...") + g.Eventually(func(gg Gomega) { + cluster, err := GetRayCluster(test, namespace.Name, rayClusterName) + gg.Expect(err).NotTo(HaveOccurred()) + gg.Expect(cluster).NotTo(BeNil()) + workerPods, err := GetWorkerPods(test, cluster) + gg.Expect(err).NotTo(HaveOccurred()) + gg.Expect(workerPods).To(BeEmpty()) + }, TestTimeoutMedium).Should(Succeed()) + LogWithTimestamp(test.T(), "Worker pods deleted successfully") + + // Verify cluster still exists (head pod should remain) + g.Consistently(RayCluster(test, namespace.Name, rayClusterName), 10*time.Second). 
+ Should(WithTransform(RayClusterState, Equal(rayv1.Ready))) + + // Verify head pod still exists + cluster, err = GetRayCluster(test, namespace.Name, rayClusterName) + g.Expect(err).NotTo(HaveOccurred()) + headPod, err := GetHeadPod(test, cluster) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(headPod).NotTo(BeNil()) + LogWithTimestamp(test.T(), "Head pod preserved as expected") + + // Verify RayJob still exists + jobObj, err := GetRayJob(test, rayJob.Namespace, rayJob.Name) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(jobObj).NotTo(BeNil()) + LogWithTimestamp(test.T(), "RayJob preserved as expected") + + // Cleanup: delete RayJob to free resources (cluster should be GC'd eventually if owned) + LogWithTimestamp(test.T(), "Cleaning up RayJob %s/%s after DeleteWorkers scenario", jobObj.Namespace, jobObj.Name) + err = test.Client().Ray().RayV1().RayJobs(jobObj.Namespace).Delete(test.Ctx(), jobObj.Name, metav1.DeleteOptions{}) + g.Expect(err).NotTo(HaveOccurred()) + g.Eventually(func() error { _, err := GetRayJob(test, jobObj.Namespace, jobObj.Name); return err }, TestTimeoutMedium).Should(WithTransform(k8serrors.IsNotFound, BeTrue())) + // Cluster may take a moment to be garbage collected; tolerate already-deleted state + g.Eventually(func() error { + _, err := GetRayCluster(test, namespace.Name, rayClusterName) + return err + }, TestTimeoutMedium).Should(WithTransform(k8serrors.IsNotFound, BeTrue())) + LogWithTimestamp(test.T(), "Cleanup after DeleteWorkers scenario complete") + }) + + test.T().Run("DeletionRules with DeleteCluster policy should delete entire cluster", func(_ *testing.T) { + rayJobAC := rayv1ac.RayJob("delete-cluster-test", namespace.Name). + WithSpec(rayv1ac.RayJobSpec(). + WithRayClusterSpec(NewRayClusterSpec(MountConfigMap[rayv1ac.RayClusterSpecApplyConfiguration](jobs, "/home/ray/jobs"))). + WithEntrypoint("python /home/ray/jobs/counter.py"). + WithRuntimeEnvYAML(` +env_vars: + counter_name: test_counter +`). + WithShutdownAfterJobFinishes(false). + WithDeletionStrategy(rayv1ac.DeletionStrategy(). + WithDeletionRules( + rayv1ac.DeletionRule(). + WithPolicy(rayv1.DeleteCluster). + WithCondition(rayv1ac.DeletionCondition(). + WithJobStatus(rayv1.JobStatusSucceeded). + WithTTLSeconds(10)), + )). + WithSubmitterPodTemplate(JobSubmitterPodTemplateApplyConfiguration())) + + rayJob, err := test.Client().Ray().RayV1().RayJobs(namespace.Name).Apply(test.Ctx(), rayJobAC, TestApplyOptions) + g.Expect(err).NotTo(HaveOccurred()) + LogWithTimestamp(test.T(), "Created RayJob %s/%s successfully", rayJob.Namespace, rayJob.Name) + + // Wait for job to complete successfully + g.Eventually(RayJob(test, rayJob.Namespace, rayJob.Name), TestTimeoutMedium). + Should(WithTransform(RayJobStatus, Equal(rayv1.JobStatusSucceeded))) + LogWithTimestamp(test.T(), "RayJob %s/%s completed successfully", rayJob.Namespace, rayJob.Name) + + // Get the associated RayCluster name (early assertion for clearer diagnostics) + rayJob, err = GetRayJob(test, rayJob.Namespace, rayJob.Name) + g.Expect(err).NotTo(HaveOccurred()) + rayClusterName := rayJob.Status.RayClusterName + g.Expect(rayClusterName).NotTo(BeEmpty()) + + // Verify cluster exists initially + g.Eventually(RayCluster(test, namespace.Name, rayClusterName), TestTimeoutShort). 
+ Should(WithTransform(RayClusterState, Equal(rayv1.Ready))) + + // Wait for TTL to expire and cluster to be deleted + LogWithTimestamp(test.T(), "Waiting for TTL to expire and cluster to be deleted...") + g.Eventually(func() error { + _, err := GetRayCluster(test, namespace.Name, rayClusterName) + return err + }, TestTimeoutMedium).Should(WithTransform(k8serrors.IsNotFound, BeTrue())) + LogWithTimestamp(test.T(), "RayCluster deleted successfully") + + // Verify RayJob still exists + jobObj, err := GetRayJob(test, rayJob.Namespace, rayJob.Name) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(jobObj).NotTo(BeNil()) + LogWithTimestamp(test.T(), "RayJob preserved as expected") + + // Cleanup: delete RayJob (cluster already deleted by policy) + LogWithTimestamp(test.T(), "Cleaning up RayJob %s/%s after DeleteCluster scenario", jobObj.Namespace, jobObj.Name) + err = test.Client().Ray().RayV1().RayJobs(jobObj.Namespace).Delete(test.Ctx(), jobObj.Name, metav1.DeleteOptions{}) + g.Expect(err).NotTo(HaveOccurred()) + g.Eventually(func() error { _, err := GetRayJob(test, jobObj.Namespace, jobObj.Name); return err }, TestTimeoutMedium).Should(WithTransform(k8serrors.IsNotFound, BeTrue())) + LogWithTimestamp(test.T(), "Cleanup after DeleteCluster scenario complete") + }) + + test.T().Run("DeletionRules with DeleteSelf policy should delete RayJob and cluster", func(_ *testing.T) { + rayJobAC := rayv1ac.RayJob("delete-self-test", namespace.Name). + WithSpec(rayv1ac.RayJobSpec(). + WithRayClusterSpec(NewRayClusterSpec(MountConfigMap[rayv1ac.RayClusterSpecApplyConfiguration](jobs, "/home/ray/jobs"))). + WithEntrypoint("python /home/ray/jobs/counter.py"). + WithRuntimeEnvYAML(` +env_vars: + counter_name: test_counter +`). + WithShutdownAfterJobFinishes(false). + WithDeletionStrategy(rayv1ac.DeletionStrategy(). + WithDeletionRules( + rayv1ac.DeletionRule(). + WithPolicy(rayv1.DeleteSelf). + WithCondition(rayv1ac.DeletionCondition(). + WithJobStatus(rayv1.JobStatusSucceeded). + WithTTLSeconds(10)), + )). + WithSubmitterPodTemplate(JobSubmitterPodTemplateApplyConfiguration())) + + rayJob, err := test.Client().Ray().RayV1().RayJobs(namespace.Name).Apply(test.Ctx(), rayJobAC, TestApplyOptions) + g.Expect(err).NotTo(HaveOccurred()) + LogWithTimestamp(test.T(), "Created RayJob %s/%s successfully", rayJob.Namespace, rayJob.Name) + + // Wait for job to complete successfully + g.Eventually(RayJob(test, rayJob.Namespace, rayJob.Name), TestTimeoutMedium). 
+ Should(WithTransform(RayJobStatus, Equal(rayv1.JobStatusSucceeded))) + LogWithTimestamp(test.T(), "RayJob %s/%s completed successfully", rayJob.Namespace, rayJob.Name) + + // Get the associated RayCluster name before verifying deletion sequence + rayJob, err = GetRayJob(test, rayJob.Namespace, rayJob.Name) + g.Expect(err).NotTo(HaveOccurred()) + rayClusterName := rayJob.Status.RayClusterName + g.Expect(rayClusterName).NotTo(BeEmpty()) + + // Wait for TTL to expire and RayJob (and cluster) to be deleted + LogWithTimestamp(test.T(), "Waiting for TTL to expire and RayJob to be deleted...") + g.Eventually(func() error { + _, err := GetRayJob(test, rayJob.Namespace, rayJob.Name) + return err + }, TestTimeoutMedium).Should(WithTransform(k8serrors.IsNotFound, BeTrue())) + LogWithTimestamp(test.T(), "RayJob deleted successfully") + + // Verify associated cluster is also deleted + g.Eventually(func() error { + _, err := GetRayCluster(test, namespace.Name, rayClusterName) + return err + }, TestTimeoutMedium).Should(WithTransform(k8serrors.IsNotFound, BeTrue())) + LogWithTimestamp(test.T(), "Associated RayCluster deleted successfully") + }) + + test.T().Run("DeletionRules with DeleteNone policy should preserve all resources", func(_ *testing.T) { + rayJobAC := rayv1ac.RayJob("delete-none-test", namespace.Name). + WithSpec(rayv1ac.RayJobSpec(). + WithRayClusterSpec(NewRayClusterSpec(MountConfigMap[rayv1ac.RayClusterSpecApplyConfiguration](jobs, "/home/ray/jobs"))). + WithEntrypoint("python /home/ray/jobs/counter.py"). + WithRuntimeEnvYAML(` +env_vars: + counter_name: test_counter +`). + WithShutdownAfterJobFinishes(false). + WithDeletionStrategy(rayv1ac.DeletionStrategy(). + WithDeletionRules( + rayv1ac.DeletionRule(). + WithPolicy(rayv1.DeleteNone). + WithCondition(rayv1ac.DeletionCondition(). + WithJobStatus(rayv1.JobStatusSucceeded). + WithTTLSeconds(5)), // Shorter TTL since we're testing preservation + )). + WithSubmitterPodTemplate(JobSubmitterPodTemplateApplyConfiguration())) + + rayJob, err := test.Client().Ray().RayV1().RayJobs(namespace.Name).Apply(test.Ctx(), rayJobAC, TestApplyOptions) + g.Expect(err).NotTo(HaveOccurred()) + LogWithTimestamp(test.T(), "Created RayJob %s/%s successfully", rayJob.Namespace, rayJob.Name) + + // Wait for job to complete successfully + g.Eventually(RayJob(test, rayJob.Namespace, rayJob.Name), TestTimeoutMedium). 
+ Should(WithTransform(RayJobStatus, Equal(rayv1.JobStatusSucceeded))) + LogWithTimestamp(test.T(), "RayJob %s/%s completed successfully", rayJob.Namespace, rayJob.Name) + + // Get the associated RayCluster name (assert early for clarity) + rayJob, err = GetRayJob(test, rayJob.Namespace, rayJob.Name) + g.Expect(err).NotTo(HaveOccurred()) + rayClusterName := rayJob.Status.RayClusterName + g.Expect(rayClusterName).NotTo(BeEmpty()) + + // Wait well past the TTL and verify everything is preserved + LogWithTimestamp(test.T(), "Waiting past TTL to verify resources are preserved...") + g.Consistently(func(gg Gomega) { + jobObj, err := GetRayJob(test, rayJob.Namespace, rayJob.Name) + gg.Expect(err).NotTo(HaveOccurred()) + gg.Expect(jobObj).NotTo(BeNil()) + cluster, err := GetRayCluster(test, namespace.Name, rayClusterName) + gg.Expect(err).NotTo(HaveOccurred()) + gg.Expect(cluster).NotTo(BeNil()) + workerPods, err := GetWorkerPods(test, cluster) + gg.Expect(err).NotTo(HaveOccurred()) + gg.Expect(len(workerPods)).To(BeNumerically(">", 0)) + }, 10*time.Second, 2*time.Second).Should(Succeed()) + LogWithTimestamp(test.T(), "All resources preserved as expected with DeleteNone policy") + + // Cleanup: delete RayJob to release cluster and pods + LogWithTimestamp(test.T(), "Cleaning up RayJob %s/%s after DeleteNone scenario", rayJob.Namespace, rayJob.Name) + err = test.Client().Ray().RayV1().RayJobs(rayJob.Namespace).Delete(test.Ctx(), rayJob.Name, metav1.DeleteOptions{}) + g.Expect(err).NotTo(HaveOccurred()) + g.Eventually(func() error { _, err := GetRayJob(test, rayJob.Namespace, rayJob.Name); return err }, TestTimeoutMedium).Should(WithTransform(k8serrors.IsNotFound, BeTrue())) + g.Eventually(func() error { + _, err := GetRayCluster(test, namespace.Name, rayClusterName) + return err + }, TestTimeoutMedium).Should(WithTransform(k8serrors.IsNotFound, BeTrue())) + LogWithTimestamp(test.T(), "Cleanup after DeleteNone scenario complete") + }) + + test.T().Run("Multi-stage deletion should execute in TTL order: Workers->Cluster->Self", func(_ *testing.T) { + rayJobAC := rayv1ac.RayJob("multi-stage-test", namespace.Name). + WithSpec(rayv1ac.RayJobSpec(). + WithRayClusterSpec(NewRayClusterSpec(MountConfigMap[rayv1ac.RayClusterSpecApplyConfiguration](jobs, "/home/ray/jobs"))). + WithEntrypoint("python /home/ray/jobs/counter.py"). + WithRuntimeEnvYAML(` +env_vars: + counter_name: test_counter +`). + WithShutdownAfterJobFinishes(false). + WithDeletionStrategy(rayv1ac.DeletionStrategy(). + WithDeletionRules( + rayv1ac.DeletionRule(). + WithPolicy(rayv1.DeleteWorkers). + WithCondition(rayv1ac.DeletionCondition(). + WithJobStatus(rayv1.JobStatusSucceeded). + WithTTLSeconds(15)), // Increased spacing for reliability + rayv1ac.DeletionRule(). + WithPolicy(rayv1.DeleteCluster). + WithCondition(rayv1ac.DeletionCondition(). + WithJobStatus(rayv1.JobStatusSucceeded). + WithTTLSeconds(35)), // 20s gap between stages + rayv1ac.DeletionRule(). + WithPolicy(rayv1.DeleteSelf). + WithCondition(rayv1ac.DeletionCondition(). + WithJobStatus(rayv1.JobStatusSucceeded). + WithTTLSeconds(55)), // 20s gap between stages + )). 
+ WithSubmitterPodTemplate(JobSubmitterPodTemplateApplyConfiguration())) + + rayJob, err := test.Client().Ray().RayV1().RayJobs(namespace.Name).Apply(test.Ctx(), rayJobAC, TestApplyOptions) + g.Expect(err).NotTo(HaveOccurred()) + LogWithTimestamp(test.T(), "Created RayJob %s/%s successfully", rayJob.Namespace, rayJob.Name) + + // Wait for job to complete successfully + g.Eventually(RayJob(test, rayJob.Namespace, rayJob.Name), TestTimeoutMedium). + Should(WithTransform(RayJobStatus, Equal(rayv1.JobStatusSucceeded))) + LogWithTimestamp(test.T(), "RayJob %s/%s completed successfully", rayJob.Namespace, rayJob.Name) + + // Get the associated RayCluster name (early assertion ensures meaningful failure) + rayJob, err = GetRayJob(test, rayJob.Namespace, rayJob.Name) + g.Expect(err).NotTo(HaveOccurred()) + rayClusterName := rayJob.Status.RayClusterName + g.Expect(rayClusterName).NotTo(BeEmpty()) + + // Verify cluster is ready initially + g.Eventually(RayCluster(test, namespace.Name, rayClusterName), TestTimeoutShort). + Should(WithTransform(RayClusterState, Equal(rayv1.Ready))) + + // Verify all resources exist before any TTL expires (first 12 seconds) + LogWithTimestamp(test.T(), "Verifying all resources persist before any TTL expires...") + g.Consistently(func(gg Gomega) { + cluster, err := GetRayCluster(test, namespace.Name, rayClusterName) + gg.Expect(err).NotTo(HaveOccurred()) + gg.Expect(cluster).NotTo(BeNil()) + workerPods, err := GetWorkerPods(test, cluster) + gg.Expect(err).NotTo(HaveOccurred()) + gg.Expect(len(workerPods)).To(BeNumerically(">", 0)) + headPod, err := GetHeadPod(test, cluster) + gg.Expect(err).NotTo(HaveOccurred()) + gg.Expect(headPod).NotTo(BeNil()) + jobObj, err := GetRayJob(test, rayJob.Namespace, rayJob.Name) + gg.Expect(err).NotTo(HaveOccurred()) + gg.Expect(jobObj).NotTo(BeNil()) + }, 12*time.Second, 2*time.Second).Should(Succeed()) + LogWithTimestamp(test.T(), "All resources confirmed stable before TTL expiration") + + // Stage 1: Wait for workers to be deleted (15s TTL) + LogWithTimestamp(test.T(), "Stage 1: Waiting for workers to be deleted at 15s...") + g.Eventually(func(gg Gomega) { + cluster, err := GetRayCluster(test, namespace.Name, rayClusterName) + gg.Expect(err).NotTo(HaveOccurred()) + gg.Expect(cluster).NotTo(BeNil()) + workerPods, err := GetWorkerPods(test, cluster) + gg.Expect(err).NotTo(HaveOccurred()) + gg.Expect(workerPods).To(BeEmpty()) + }, TestTimeoutMedium).Should(Succeed()) + LogWithTimestamp(test.T(), "Stage 1 complete: Workers deleted successfully") + + // Verify cluster and job still exist after stage 1 + job, err := GetRayJob(test, rayJob.Namespace, rayJob.Name) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(job).NotTo(BeNil()) + cluster, err := GetRayCluster(test, namespace.Name, rayClusterName) + g.Expect(err).NotTo(HaveOccurred()) + headPod, err := GetHeadPod(test, cluster) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(headPod).NotTo(BeNil()) + + // Verify cluster persists during stage 2 wait period (15 seconds of 20s gap) + LogWithTimestamp(test.T(), "Verifying cluster persists before stage 2 TTL expires...") + g.Consistently(func(gg Gomega) { + cluster, err := GetRayCluster(test, namespace.Name, rayClusterName) + gg.Expect(err).NotTo(HaveOccurred()) + gg.Expect(cluster).NotTo(BeNil()) + headPod, err := GetHeadPod(test, cluster) + gg.Expect(err).NotTo(HaveOccurred()) + gg.Expect(headPod).NotTo(BeNil()) + jobObj, err := GetRayJob(test, rayJob.Namespace, rayJob.Name) + gg.Expect(err).NotTo(HaveOccurred()) + 
gg.Expect(jobObj).NotTo(BeNil()) + }, 15*time.Second, 2*time.Second).Should(Succeed()) + LogWithTimestamp(test.T(), "Cluster and job confirmed stable before stage 2 TTL") + + // Stage 2: Wait for cluster to be deleted (35s TTL) + LogWithTimestamp(test.T(), "Stage 2: Waiting for cluster to be deleted at 35s...") + g.Eventually(func() error { + _, err := GetRayCluster(test, namespace.Name, rayClusterName) + return err + }, TestTimeoutMedium).Should(WithTransform(k8serrors.IsNotFound, BeTrue())) + LogWithTimestamp(test.T(), "Stage 2 complete: Cluster deleted successfully") + + // Verify job still exists after stage 2 + job, err = GetRayJob(test, rayJob.Namespace, rayJob.Name) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(job).NotTo(BeNil()) + + // Verify job persists during stage 3 wait period (15 seconds of 20s gap) + LogWithTimestamp(test.T(), "Verifying RayJob persists before stage 3 TTL expires...") + g.Consistently(func(gg Gomega) { + jobObj, err := GetRayJob(test, rayJob.Namespace, rayJob.Name) + gg.Expect(err).NotTo(HaveOccurred()) + gg.Expect(jobObj).NotTo(BeNil()) + }, 15*time.Second, 2*time.Second).Should(Succeed()) + LogWithTimestamp(test.T(), "RayJob confirmed stable before stage 3 TTL") + + // Stage 3: Wait for job to be deleted (55s TTL) + LogWithTimestamp(test.T(), "Stage 3: Waiting for RayJob to be deleted at 55s...") + g.Eventually(func() error { + _, err := GetRayJob(test, rayJob.Namespace, rayJob.Name) + return err + }, TestTimeoutMedium).Should(WithTransform(k8serrors.IsNotFound, BeTrue())) + LogWithTimestamp(test.T(), "Stage 3 complete: RayJob deleted successfully") + LogWithTimestamp(test.T(), "Multi-stage deletion completed in correct order") + }) + + test.T().Run("Legacy OnSuccess DeleteCluster should still work", func(_ *testing.T) { + rayJobAC := rayv1ac.RayJob("legacy-success-test", namespace.Name). + WithSpec(rayv1ac.RayJobSpec(). + WithRayClusterSpec(NewRayClusterSpec(MountConfigMap[rayv1ac.RayClusterSpecApplyConfiguration](jobs, "/home/ray/jobs"))). + WithEntrypoint("python /home/ray/jobs/counter.py"). + WithRuntimeEnvYAML(` +env_vars: + counter_name: test_counter +`). + WithShutdownAfterJobFinishes(false). + WithTTLSecondsAfterFinished(10). // Legacy TTL for backward compatibility + WithDeletionStrategy(rayv1ac.DeletionStrategy(). + WithOnSuccess(rayv1ac.DeletionPolicy(). + WithPolicy(rayv1.DeleteCluster)). + WithOnFailure(rayv1ac.DeletionPolicy(). + WithPolicy(rayv1.DeleteNone))). + WithSubmitterPodTemplate(JobSubmitterPodTemplateApplyConfiguration())) + + rayJob, err := test.Client().Ray().RayV1().RayJobs(namespace.Name).Apply(test.Ctx(), rayJobAC, TestApplyOptions) + g.Expect(err).NotTo(HaveOccurred()) + LogWithTimestamp(test.T(), "Created legacy RayJob %s/%s successfully", rayJob.Namespace, rayJob.Name) + + // Wait for job to complete successfully + g.Eventually(RayJob(test, rayJob.Namespace, rayJob.Name), TestTimeoutMedium). 
+ Should(WithTransform(RayJobStatus, Equal(rayv1.JobStatusSucceeded))) + LogWithTimestamp(test.T(), "RayJob %s/%s completed successfully", rayJob.Namespace, rayJob.Name) + + // Get the associated RayCluster name (legacy path; same early assertion rationale) + rayJob, err = GetRayJob(test, rayJob.Namespace, rayJob.Name) + g.Expect(err).NotTo(HaveOccurred()) + rayClusterName := rayJob.Status.RayClusterName + g.Expect(rayClusterName).NotTo(BeEmpty()) + + // Wait for cluster to be deleted due to OnSuccess policy + LogWithTimestamp(test.T(), "Waiting for legacy OnSuccess policy to delete cluster...") + g.Eventually(func() error { + _, err := GetRayCluster(test, namespace.Name, rayClusterName) + return err + }, TestTimeoutMedium).Should(WithTransform(k8serrors.IsNotFound, BeTrue())) + LogWithTimestamp(test.T(), "Cluster deleted by legacy OnSuccess policy") + + // Verify RayJob still exists + job, err := GetRayJob(test, rayJob.Namespace, rayJob.Name) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(job).NotTo(BeNil()) + LogWithTimestamp(test.T(), "Legacy OnSuccess policy working correctly") + + // Cleanup: delete legacy RayJob (cluster already deleted) + LogWithTimestamp(test.T(), "Cleaning up legacy success RayJob %s/%s", job.Namespace, job.Name) + err = test.Client().Ray().RayV1().RayJobs(job.Namespace).Delete(test.Ctx(), job.Name, metav1.DeleteOptions{}) + g.Expect(err).NotTo(HaveOccurred()) + g.Eventually(func() error { _, err := GetRayJob(test, job.Namespace, job.Name); return err }, TestTimeoutMedium).Should(WithTransform(k8serrors.IsNotFound, BeTrue())) + LogWithTimestamp(test.T(), "Cleanup after legacy success scenario complete") + }) + + test.T().Run("Legacy OnFailure DeleteNone should still work", func(_ *testing.T) { + rayJobAC := rayv1ac.RayJob("legacy-failure-test", namespace.Name). + WithSpec(rayv1ac.RayJobSpec(). + WithRayClusterSpec(NewRayClusterSpec(MountConfigMap[rayv1ac.RayClusterSpecApplyConfiguration](jobs, "/home/ray/jobs"))). + WithEntrypoint("python /home/ray/jobs/fail.py"). // Use failing script + WithShutdownAfterJobFinishes(false). + WithTTLSecondsAfterFinished(10). + WithDeletionStrategy(rayv1ac.DeletionStrategy(). + WithOnSuccess(rayv1ac.DeletionPolicy(). + WithPolicy(rayv1.DeleteCluster)). + WithOnFailure(rayv1ac.DeletionPolicy(). + WithPolicy(rayv1.DeleteNone))). + WithSubmitterPodTemplate(JobSubmitterPodTemplateApplyConfiguration())) + + rayJob, err := test.Client().Ray().RayV1().RayJobs(namespace.Name).Apply(test.Ctx(), rayJobAC, TestApplyOptions) + g.Expect(err).NotTo(HaveOccurred()) + LogWithTimestamp(test.T(), "Created legacy failure RayJob %s/%s successfully", rayJob.Namespace, rayJob.Name) + + // Wait for job to fail + g.Eventually(RayJob(test, rayJob.Namespace, rayJob.Name), TestTimeoutMedium). 
+ Should(WithTransform(RayJobStatus, Equal(rayv1.JobStatusFailed))) + LogWithTimestamp(test.T(), "RayJob %s/%s failed as expected", rayJob.Namespace, rayJob.Name) + + // Get the associated RayCluster name + rayJob, err = GetRayJob(test, rayJob.Namespace, rayJob.Name) + g.Expect(err).NotTo(HaveOccurred()) + rayClusterName := rayJob.Status.RayClusterName + g.Expect(rayClusterName).NotTo(BeEmpty()) + + // Wait past the TTL and verify everything is preserved due to OnFailure=DeleteNone + LogWithTimestamp(test.T(), "Waiting past TTL to verify resources preserved by OnFailure=DeleteNone...") + g.Consistently(func(gg Gomega) { + jobObj, err := GetRayJob(test, rayJob.Namespace, rayJob.Name) + gg.Expect(err).NotTo(HaveOccurred()) + gg.Expect(jobObj).NotTo(BeNil()) + cluster, err := GetRayCluster(test, namespace.Name, rayClusterName) + gg.Expect(err).NotTo(HaveOccurred()) + gg.Expect(cluster).NotTo(BeNil()) + }, 15*time.Second, 2*time.Second).Should(Succeed()) + LogWithTimestamp(test.T(), "Legacy OnFailure=DeleteNone policy working correctly") + + // Cleanup: delete legacy failure RayJob (will also GC cluster) + LogWithTimestamp(test.T(), "Cleaning up legacy failure RayJob %s/%s", rayJob.Namespace, rayJob.Name) + err = test.Client().Ray().RayV1().RayJobs(rayJob.Namespace).Delete(test.Ctx(), rayJob.Name, metav1.DeleteOptions{}) + g.Expect(err).NotTo(HaveOccurred()) + g.Eventually(func() error { _, err := GetRayJob(test, rayJob.Namespace, rayJob.Name); return err }, TestTimeoutMedium).Should(WithTransform(k8serrors.IsNotFound, BeTrue())) + g.Eventually(func() error { + _, err := GetRayCluster(test, namespace.Name, rayClusterName) + return err + }, TestTimeoutMedium).Should(WithTransform(k8serrors.IsNotFound, BeTrue())) + LogWithTimestamp(test.T(), "Cleanup after legacy failure scenario complete") + }) +} From f91907db5e2a90e80fd9385d25ef49862a14e8ea Mon Sep 17 00:00:00 2001 From: wei-chenglai Date: Thu, 25 Sep 2025 13:34:38 -0400 Subject: [PATCH 13/21] fix lint --- ray-operator/Makefile | 1 - .../test/e2erayjob/rayjob_deletion_strategy_test.go | 8 ++++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/ray-operator/Makefile b/ray-operator/Makefile index 3842f2227b8..d69b2fffa0b 100644 --- a/ray-operator/Makefile +++ b/ray-operator/Makefile @@ -67,7 +67,6 @@ test: ENVTEST_K8S_VERSION ?= 1.24.2 test: manifests fmt vet envtest ## Run tests. KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(LOCALBIN) -p path)" go test $(WHAT) -coverprofile cover.out -# You can use `go test -timeout 30m -v ./test/e2e/rayjob_test.go ./test/e2e/support.go` if you only want to run tests in `rayjob_test.go`. test-e2e: WHAT ?= ./test/e2e test-e2e: manifests fmt vet ## Run e2e tests. 
go test -timeout 30m -v $(WHAT) diff --git a/ray-operator/test/e2erayjob/rayjob_deletion_strategy_test.go b/ray-operator/test/e2erayjob/rayjob_deletion_strategy_test.go index 302cee2d796..4668ba0713c 100644 --- a/ray-operator/test/e2erayjob/rayjob_deletion_strategy_test.go +++ b/ray-operator/test/e2erayjob/rayjob_deletion_strategy_test.go @@ -73,7 +73,7 @@ env_vars: g.Expect(err).NotTo(HaveOccurred()) initialWorkerPods, err := GetWorkerPods(test, cluster) g.Expect(err).NotTo(HaveOccurred()) - g.Expect(len(initialWorkerPods)).To(BeNumerically(">", 0)) + g.Expect(initialWorkerPods).ToNot(BeEmpty()) LogWithTimestamp(test.T(), "Found %d worker pods initially", len(initialWorkerPods)) // Verify resources persist during TTL wait period (first 8 seconds of 10s TTL) @@ -84,7 +84,7 @@ env_vars: gg.Expect(cluster).NotTo(BeNil()) workerPods, err := GetWorkerPods(test, cluster) gg.Expect(err).NotTo(HaveOccurred()) - gg.Expect(len(workerPods)).To(BeNumerically(">", 0)) + gg.Expect(workerPods).ToNot(BeEmpty()) headPod, err := GetHeadPod(test, cluster) gg.Expect(err).NotTo(HaveOccurred()) gg.Expect(headPod).NotTo(BeNil()) @@ -295,7 +295,7 @@ env_vars: gg.Expect(cluster).NotTo(BeNil()) workerPods, err := GetWorkerPods(test, cluster) gg.Expect(err).NotTo(HaveOccurred()) - gg.Expect(len(workerPods)).To(BeNumerically(">", 0)) + gg.Expect(workerPods).ToNot(BeEmpty()) }, 10*time.Second, 2*time.Second).Should(Succeed()) LogWithTimestamp(test.T(), "All resources preserved as expected with DeleteNone policy") @@ -368,7 +368,7 @@ env_vars: gg.Expect(cluster).NotTo(BeNil()) workerPods, err := GetWorkerPods(test, cluster) gg.Expect(err).NotTo(HaveOccurred()) - gg.Expect(len(workerPods)).To(BeNumerically(">", 0)) + gg.Expect(workerPods).ToNot(BeEmpty()) headPod, err := GetHeadPod(test, cluster) gg.Expect(err).NotTo(HaveOccurred()) gg.Expect(headPod).NotTo(BeNil()) From 47daebe811566aa781e91239bb3914ac12e27c7c Mon Sep 17 00:00:00 2001 From: wei-chenglai Date: Thu, 25 Sep 2025 20:50:10 -0400 Subject: [PATCH 14/21] add feature gate override for e2e tests --- .buildkite/build-start-operator.sh | 8 ++- .../values-kuberay-operator-override.yaml | 18 +++++ ray-operator/Makefile | 9 +++ .../config/overlays/test-overrides/README.md | 68 +++++++++++++++++++ .../test-overrides/deployment-override.yaml | 12 ++++ .../test-overrides/kustomization.yaml | 17 +++++ 6 files changed, 130 insertions(+), 2 deletions(-) create mode 100644 .buildkite/values-kuberay-operator-override.yaml create mode 100644 ray-operator/config/overlays/test-overrides/README.md create mode 100644 ray-operator/config/overlays/test-overrides/deployment-override.yaml create mode 100644 ray-operator/config/overlays/test-overrides/kustomization.yaml diff --git a/.buildkite/build-start-operator.sh b/.buildkite/build-start-operator.sh index 60468c373d4..4c81fbe96be 100644 --- a/.buildkite/build-start-operator.sh +++ b/.buildkite/build-start-operator.sh @@ -7,10 +7,14 @@ # to kick off from the release branch so tests should match up accordingly. if [ "$IS_FROM_RAY_RELEASE_AUTOMATION" = 1 ]; then - helm repo update && helm install kuberay/kuberay-operator --set 'featureGates[1].name=RayJobDeletionPolicy' --set 'featureGates[1].enabled=true' + helm repo update + echo "Installing helm chart with test override values (feature gates enabled as needed)" + # NOTE: The override file is CI/test-only. It is NOT part of the released chart defaults. 
+  helm install kuberay-operator kuberay/kuberay-operator -f ../.buildkite/values-kuberay-operator-override.yaml
   KUBERAY_TEST_RAY_IMAGE="rayproject/ray:nightly.$(date +'%y%m%d').${RAY_NIGHTLY_COMMIT:0:6}-py39" && export KUBERAY_TEST_RAY_IMAGE
 else
   IMG=kuberay/operator:nightly make docker-image &&
   kind load docker-image kuberay/operator:nightly &&
-  IMG=kuberay/operator:nightly make deploy
+  echo "Deploying operator with test overrides (feature gates via test-overrides overlay)"
+  IMG=kuberay/operator:nightly make deploy-with-override
 fi
diff --git a/.buildkite/values-kuberay-operator-override.yaml b/.buildkite/values-kuberay-operator-override.yaml
new file mode 100644
index 00000000000..3a0d6aa1ffb
--- /dev/null
+++ b/.buildkite/values-kuberay-operator-override.yaml
@@ -0,0 +1,18 @@
+# Generic Helm values override used only in CI / e2e test environments.
+# Intent:
+#   - Allow e2e tests to turn on alpha / experimental feature gates (e.g. RayJobDeletionPolicy)
+#   - Provide a single place contributors can extend with additional overrides needed for tests
+#   - Keep the default published Helm chart behavior unchanged for normal users
+# Scope / Safety:
+#   - This file is never referenced by the base chart; it is opt‑in via buildkite or manual helm install
+#   - Do NOT rename it to values.yaml or commit changes that enable unstable features by default
+# Usage examples:
+#   helm install kuberay-operator kuberay/kuberay-operator -f ../.buildkite/values-kuberay-operator-override.yaml
+#   (add or remove feature gates below as e2e scenarios expand)
+#
+# Current overrides: enable RayJobDeletionPolicy alpha feature gate alongside the existing status conditions gate.
+featureGates:
+  - name: RayClusterStatusConditions
+    enabled: true
+  - name: RayJobDeletionPolicy
+    enabled: true
\ No newline at end of file
diff --git a/ray-operator/Makefile b/ray-operator/Makefile
index d69b2fffa0b..214035963b7 100644
--- a/ray-operator/Makefile
+++ b/ray-operator/Makefile
@@ -143,6 +143,15 @@ deploy: manifests kustomize ## Deploy controller to the K8s cluster specified in
 	cd config/default && $(KUSTOMIZE) edit set image kuberay/operator=${IMG}
 	$(KUSTOMIZE) build config/default | kubectl apply --server-side=true -f -
 
+# NOTE FOR CONTRIBUTORS:
+# deploy-with-override is an e2e/CI-only deployment path. It applies a Kustomize overlay that
+# enables test-only feature gates (e.g. RayJobDeletionPolicy) without changing the default
+# behavior of the base Helm chart or the standard 'make deploy'. Add additional test overrides
+# to the overlay (config/overlays/test-overrides) rather than modifying the base.
+deploy-with-override: manifests kustomize ## Deploy controller with test-only feature gate overrides (does NOT affect default chart).
+	cd config/default && $(KUSTOMIZE) edit set image kuberay/operator=${IMG}
+	$(KUSTOMIZE) build config/overlays/test-overrides | kubectl apply --server-side=true -f -
+
 undeploy: ## Undeploy controller from the K8s cluster specified in ~/.kube/config.
$(KUSTOMIZE) build config/default | kubectl delete -f - diff --git a/ray-operator/config/overlays/test-overrides/README.md b/ray-operator/config/overlays/test-overrides/README.md new file mode 100644 index 00000000000..8ec5e435c11 --- /dev/null +++ b/ray-operator/config/overlays/test-overrides/README.md @@ -0,0 +1,68 @@ +# Test Overrides Overlay (CI / e2e ONLY) + +This overlay enables test-only / alpha feature gates (currently `RayJobDeletionPolicy`) without modifying: +- The base manifests under `config/default` +- Generated CRDs (`make generate`) +- Helm chart defaults (`make helm`, users' `helm install` without -f override) + +Use it only in CI or local end-to-end testing when you explicitly need gated behavior. + +--- +## Why It Exists +Some feature gates are intentionally disabled by default for stability. E2E tests must exercise them to validate behavior prior to promotion. This overlay provides a safe, isolated place to turn them on. + +--- +## Safety Guarantees +| Concern | Guarantee | +|---------|-----------| +| Default user deploy (`make deploy`) | Unchanged | +| Helm install (no -f override) | Unchanged | +| CRD generation / codegen | Unaffected | +| Feature gates scope | Only those explicitly listed here | + +--- +## Usage +Deploy with feature gates enabled: +``` +make deploy-with-override IMG=kuberay/operator:nightly +``` +Helm path (CI release automation): +``` +helm install kuberay-operator kuberay/kuberay-operator -f .buildkite/values-kuberay-operator-override.yaml +``` + +--- +## Adding Another Feature Gate +1. Edit `deployment-override.yaml` – append your gate inside the existing `--feature-gates=` list. +2. Update `.buildkite/values-kuberay-operator-override.yaml` likewise. +3. Add or adjust e2e tests as needed. + +Keep gate ordering stable to minimize diff noise. + +--- +## Keeping In Sync +If the base operator Deployment args change in `config/manager/manager.yaml`: +1. Copy the updated arg list. +2. Re-apply the feature gates in `deployment-override.yaml`. +3. Re-render to confirm. + +--- +## Removal / Promotion Flow +When a gate graduates (enabled by default upstream): +1. Remove it from the override (if it's default-on, it no longer needs listing). +2. Remove corresponding logic from tests if they branch on gate state. +3. (Optional) Note the graduation in release notes. + +--- +## Troubleshooting +Problem | Action +--------|------- +Patch no longer applies | Check if Deployment name or container name changed. +Gates not taking effect | Confirm args rendered (render target) and operator pod restarted. +Unexpected arg order | The strategic merge patch replaces the entire args list; adjust ordering there. + +--- +## Do NOT +- Add unrelated production configuration (RBAC, CRDs, resources) here. +- Reference this overlay from user-facing docs. +- Rename directory without updating `Makefile` targets. \ No newline at end of file diff --git a/ray-operator/config/overlays/test-overrides/deployment-override.yaml b/ray-operator/config/overlays/test-overrides/deployment-override.yaml new file mode 100644 index 00000000000..20ba1ac9f2b --- /dev/null +++ b/ray-operator/config/overlays/test-overrides/deployment-override.yaml @@ -0,0 +1,12 @@ +# Strategic merge patch for kuberay-operator Deployment (test / CI only). 
+apiVersion: apps/v1 +kind: Deployment +metadata: + name: kuberay-operator +spec: + template: + spec: + containers: + - name: kuberay-operator + args: + - --feature-gates=RayClusterStatusConditions=true,RayJobDeletionPolicy=true \ No newline at end of file diff --git a/ray-operator/config/overlays/test-overrides/kustomization.yaml b/ray-operator/config/overlays/test-overrides/kustomization.yaml new file mode 100644 index 00000000000..7554d954359 --- /dev/null +++ b/ray-operator/config/overlays/test-overrides/kustomization.yaml @@ -0,0 +1,17 @@ +## ============================================================================ +## Kustomize overlay: test-overrides (CI / e2e only) +## ---------------------------------------------------------------------------- +## Purpose: Enable alpha / experimental feature gates (currently RayJobDeletionPolicy) +## for end-to-end testing without modifying base manifests or Helm defaults. +## ============================================================================ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + - ../../default + +patches: + - path: deployment-override.yaml + target: + kind: Deployment + name: kuberay-operator \ No newline at end of file From 82341b9a9fd9e4d43ad42ce4cad6522d25581709 Mon Sep 17 00:00:00 2001 From: wei-chenglai Date: Thu, 25 Sep 2025 22:31:06 -0400 Subject: [PATCH 15/21] fix lint & fix validation error --- .../values-kuberay-operator-override.yaml | 2 +- .../config/overlays/test-overrides/README.md | 42 +++++-- .../test-overrides/deployment-override.yaml | 2 +- .../test-overrides/kustomization.yaml | 2 +- .../controllers/ray/utils/validation.go | 103 +++++++++++++----- .../controllers/ray/utils/validation_test.go | 10 +- 6 files changed, 114 insertions(+), 47 deletions(-) diff --git a/.buildkite/values-kuberay-operator-override.yaml b/.buildkite/values-kuberay-operator-override.yaml index 3a0d6aa1ffb..7dc396edd71 100644 --- a/.buildkite/values-kuberay-operator-override.yaml +++ b/.buildkite/values-kuberay-operator-override.yaml @@ -15,4 +15,4 @@ featureGates: - name: RayClusterStatusConditions enabled: true - name: RayJobDeletionPolicy - enabled: true \ No newline at end of file + enabled: true diff --git a/ray-operator/config/overlays/test-overrides/README.md b/ray-operator/config/overlays/test-overrides/README.md index 8ec5e435c11..9df3cb15a7c 100644 --- a/ray-operator/config/overlays/test-overrides/README.md +++ b/ray-operator/config/overlays/test-overrides/README.md @@ -1,6 +1,7 @@ # Test Overrides Overlay (CI / e2e ONLY) This overlay enables test-only / alpha feature gates (currently `RayJobDeletionPolicy`) without modifying: + - The base manifests under `config/default` - Generated CRDs (`make generate`) - Helm chart defaults (`make helm`, users' `helm install` without -f override) @@ -8,11 +9,17 @@ This overlay enables test-only / alpha feature gates (currently `RayJobDeletionP Use it only in CI or local end-to-end testing when you explicitly need gated behavior. --- + ## Why It Exists -Some feature gates are intentionally disabled by default for stability. E2E tests must exercise them to validate behavior prior to promotion. This overlay provides a safe, isolated place to turn them on. + +Some feature gates are intentionally disabled by default for stability. +E2E tests must exercise them to validate behavior prior to promotion. +This overlay provides a safe, isolated place to turn them on. 
--- + ## Safety Guarantees + | Concern | Guarantee | |---------|-----------| | Default user deploy (`make deploy`) | Unchanged | @@ -21,18 +28,25 @@ Some feature gates are intentionally disabled by default for stability. E2E test | Feature gates scope | Only those explicitly listed here | --- + ## Usage + Deploy with feature gates enabled: -``` + +```bash make deploy-with-override IMG=kuberay/operator:nightly ``` + Helm path (CI release automation): -``` + +```bash helm install kuberay-operator kuberay/kuberay-operator -f .buildkite/values-kuberay-operator-override.yaml ``` --- + ## Adding Another Feature Gate + 1. Edit `deployment-override.yaml` – append your gate inside the existing `--feature-gates=` list. 2. Update `.buildkite/values-kuberay-operator-override.yaml` likewise. 3. Add or adjust e2e tests as needed. @@ -40,29 +54,39 @@ helm install kuberay-operator kuberay/kuberay-operator -f .buildkite/values-kube Keep gate ordering stable to minimize diff noise. --- + ## Keeping In Sync + If the base operator Deployment args change in `config/manager/manager.yaml`: + 1. Copy the updated arg list. 2. Re-apply the feature gates in `deployment-override.yaml`. 3. Re-render to confirm. --- + ## Removal / Promotion Flow + When a gate graduates (enabled by default upstream): + 1. Remove it from the override (if it's default-on, it no longer needs listing). 2. Remove corresponding logic from tests if they branch on gate state. 3. (Optional) Note the graduation in release notes. --- + ## Troubleshooting -Problem | Action ---------|------- -Patch no longer applies | Check if Deployment name or container name changed. -Gates not taking effect | Confirm args rendered (render target) and operator pod restarted. -Unexpected arg order | The strategic merge patch replaces the entire args list; adjust ordering there. + +| Problem | Action | +|---------|--------| +| Patch no longer applies | Check if Deployment name or container name changed. | +| Gates not taking effect | Confirm args rendered (render target) and operator pod restarted. | +| Unexpected arg order | The strategic merge patch replaces the entire args list; adjust ordering there. | --- + ## Do NOT + - Add unrelated production configuration (RBAC, CRDs, resources) here. - Reference this overlay from user-facing docs. -- Rename directory without updating `Makefile` targets. \ No newline at end of file +- Rename directory without updating `Makefile` targets. 
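As a point of reference for the README above: the gate this overlay flips is consulted inside the operator before any deletionStrategy handling. The sketch below is illustrative only (it is not part of any file in this patch series), and the import paths are assumed from the usual kuberay module layout rather than confirmed here.

```go
// Illustrative sketch only, not part of any file in this series: with the
// RayJobDeletionPolicy gate left at its default (disabled), a RayJob that sets
// spec.deletionStrategy is rejected during validation, which is why the CI/e2e
// environment needs this overlay. Import paths are assumed from the kuberay layout.
package main

import (
	"fmt"

	rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
	"github.com/ray-project/kuberay/ray-operator/pkg/features"
)

// requireGateForDeletionStrategy mirrors the guard the validator and the RayJob
// controller apply before acting on spec.deletionStrategy.
func requireGateForDeletionStrategy(rayJob *rayv1.RayJob) error {
	if rayJob.Spec.DeletionStrategy != nil && !features.Enabled(features.RayJobDeletionPolicy) {
		return fmt.Errorf("RayJobDeletionPolicy feature gate must be enabled to use DeletionStrategy")
	}
	return nil
}

func main() {
	job := &rayv1.RayJob{Spec: rayv1.RayJobSpec{DeletionStrategy: &rayv1.DeletionStrategy{}}}
	// Prints the validation error unless the gate has been enabled via this overlay.
	fmt.Println(requireGateForDeletionStrategy(job))
}
```

Without the override, `make deploy` leaves the gate at its default (disabled), so the deletion-strategy e2e cases added earlier in this series would be rejected at validation time.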
diff --git a/ray-operator/config/overlays/test-overrides/deployment-override.yaml b/ray-operator/config/overlays/test-overrides/deployment-override.yaml index 20ba1ac9f2b..5f7a1eba665 100644 --- a/ray-operator/config/overlays/test-overrides/deployment-override.yaml +++ b/ray-operator/config/overlays/test-overrides/deployment-override.yaml @@ -9,4 +9,4 @@ spec: containers: - name: kuberay-operator args: - - --feature-gates=RayClusterStatusConditions=true,RayJobDeletionPolicy=true \ No newline at end of file + - --feature-gates=RayClusterStatusConditions=true,RayJobDeletionPolicy=true diff --git a/ray-operator/config/overlays/test-overrides/kustomization.yaml b/ray-operator/config/overlays/test-overrides/kustomization.yaml index 7554d954359..c1472f6f305 100644 --- a/ray-operator/config/overlays/test-overrides/kustomization.yaml +++ b/ray-operator/config/overlays/test-overrides/kustomization.yaml @@ -14,4 +14,4 @@ patches: - path: deployment-override.yaml target: kind: Deployment - name: kuberay-operator \ No newline at end of file + name: kuberay-operator diff --git a/ray-operator/controllers/ray/utils/validation.go b/ray-operator/controllers/ray/utils/validation.go index c11998e8938..f521bd068fe 100644 --- a/ray-operator/controllers/ray/utils/validation.go +++ b/ray-operator/controllers/ray/utils/validation.go @@ -161,8 +161,9 @@ func ValidateRayJobSpec(rayJob *rayv1.RayJob) error { return fmt.Errorf("The RayJob spec is invalid: TTLSecondsAfterFinished must be a non-negative integer") } - if !rayJob.Spec.ShutdownAfterJobFinishes && rayJob.Spec.TTLSecondsAfterFinished > 0 { - return fmt.Errorf("The RayJob spec is invalid: a RayJob with shutdownAfterJobFinishes set to false cannot have TTLSecondsAfterFinished") + // Validate TTL and deletion strategy together + if err := validateDeletionConfiguration(rayJob); err != nil { + return err } isClusterSelectorMode := len(rayJob.Spec.ClusterSelector) != 0 @@ -224,9 +225,7 @@ func ValidateRayJobSpec(rayJob *rayv1.RayJob) error { if rayJob.Spec.BackoffLimit != nil && *rayJob.Spec.BackoffLimit < 0 { return fmt.Errorf("The RayJob spec is invalid: backoffLimit must be a positive integer") } - if err := validateDeletionStrategy(rayJob); err != nil { - return fmt.Errorf("invalid deletion strategy: %w", err) - } + return nil } @@ -265,41 +264,89 @@ func ValidateRayServiceSpec(rayService *rayv1.RayService) error { return nil } -// validateDeletionStrategy centralizes all validation logic for the deletion strategy. -// This includes the new `deletionRules` and the legacy fields (`onSuccess`,`onFailure`). 
-func validateDeletionStrategy(rayJob *rayv1.RayJob) error { - if rayJob.Spec.DeletionStrategy == nil { - return nil - } +// validateDeletionConfiguration validates both deletion strategy and TTL configuration +func validateDeletionConfiguration(rayJob *rayv1.RayJob) error { + // Get deletion mode flags + usingShutdownAfterJobFinishes := rayJob.Spec.ShutdownAfterJobFinishes + usingDeletionRules := rayJob.Spec.DeletionStrategy != nil && len(rayJob.Spec.DeletionStrategy.DeletionRules) > 0 + usingLegacyAPI := rayJob.Spec.DeletionStrategy != nil && + (rayJob.Spec.DeletionStrategy.OnSuccess != nil || rayJob.Spec.DeletionStrategy.OnFailure != nil) - if !features.Enabled(features.RayJobDeletionPolicy) { + // Validate feature gate requirements + if (usingDeletionRules || usingLegacyAPI) && !features.Enabled(features.RayJobDeletionPolicy) { return fmt.Errorf("RayJobDeletionPolicy feature gate must be enabled to use the DeletionStrategy feature") } - usingDeletionRules := len(rayJob.Spec.DeletionStrategy.DeletionRules) > 0 - usingLegacyAPI := rayJob.Spec.DeletionStrategy.OnSuccess != nil || rayJob.Spec.DeletionStrategy.OnFailure != nil - - // ShutdownAfterJobFinishes cannot be used with the new API. - if usingDeletionRules && rayJob.Spec.ShutdownAfterJobFinishes { - return fmt.Errorf("ShutdownAfterJobFinishes cannot be used when spec.deletionStrategy.deletionRules is defined. Please configure all deletion behaviors within deletionRules") + // Validate mutual exclusivity + if err := validateDeletionMutualExclusivity(usingShutdownAfterJobFinishes, usingDeletionRules, usingLegacyAPI); err != nil { + return err } - // Legacy API and DeletionRules cannot be used simultaneously. - if usingDeletionRules && usingLegacyAPI { - return fmt.Errorf("legacy policies (onSuccess, onFailure) and the new deletionRules cannot be used simultaneously within the same deletionStrategy") + // Validate TTL requirements + if rayJob.Spec.TTLSecondsAfterFinished > 0 { + if err := validateTTLRequirements(usingShutdownAfterJobFinishes, usingDeletionRules, usingLegacyAPI); err != nil { + return err + } } - // DeletionStrategy must contain at least one policy if specified. 
- if !usingDeletionRules && !usingLegacyAPI { - return fmt.Errorf("deletionStrategy is specified, but no policies (onSuccess, onFailure, or deletionRules) are defined within it") + // Validate deletion strategy configuration + if rayJob.Spec.DeletionStrategy != nil && !usingDeletionRules && !usingLegacyAPI { + return fmt.Errorf("deletionStrategy is specified, but no policies are defined") } + // Validate specific deletion modes if usingDeletionRules { return validateDeletionRules(rayJob) } + if usingLegacyAPI { + return validateLegacyDeletionPolicies(rayJob) + } - // If not using DeletionRules, validate the legacy strategy - return validateLegacyDeletionPolicies(rayJob) + return nil +} + +// validateDeletionMutualExclusivity ensures only one deletion approach is used +func validateDeletionMutualExclusivity(usingShutdownAfterJobFinishes, usingDeletionRules, usingLegacyAPI bool) error { + activeCount := 0 + var activeModes []string + + if usingShutdownAfterJobFinishes { + activeCount++ + activeModes = append(activeModes, "spec.shutdownAfterJobFinishes=true") + } + if usingDeletionRules { + activeCount++ + activeModes = append(activeModes, "spec.deletionStrategy.deletionRules") + } + if usingLegacyAPI { + activeCount++ + activeModes = append(activeModes, "spec.deletionStrategy.onSuccess/onFailure") + } + + if activeCount > 1 { + return fmt.Errorf("multiple deletion approaches are configured simultaneously: %v. Please use only one deletion strategy", activeModes) + } + + return nil +} + +// validateTTLRequirements ensures TTL is only used with valid cleanup mechanisms +func validateTTLRequirements(usingShutdownAfterJobFinishes, usingDeletionRules, usingLegacyAPI bool) error { + // ShutdownAfterJobFinishes is always a valid cleanup mechanism + if usingShutdownAfterJobFinishes { + return nil + } + + // Deletion strategy provides cleanup, but only if feature gate is enabled + if features.Enabled(features.RayJobDeletionPolicy) && (usingDeletionRules || usingLegacyAPI) { + return nil + } + + // No valid cleanup mechanism found + if features.Enabled(features.RayJobDeletionPolicy) { + return fmt.Errorf("The RayJob spec is invalid: TTLSecondsAfterFinished requires either ShutdownAfterJobFinishes=true or a configured deletion strategy") + } + return fmt.Errorf("The RayJob spec is invalid: a RayJob with shutdownAfterJobFinishes set to false cannot have TTLSecondsAfterFinished") } // validateDeletionRules validates the deletion rules in the RayJob spec. 
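To make the accepted shape concrete, here is a hedged sketch (illustration only, not part of this diff) of a rules-mode spec built with the same rayv1ac apply-configuration helpers the e2e tests in this series use; the refactored validation accepts it when the RayJobDeletionPolicy gate is enabled.

```go
// Illustration only, not part of this diff: a rules-mode RayJobSpec built with the
// same rayv1ac apply-configuration helpers used by the e2e tests in this series.
// Import paths are assumed from the kuberay module layout.
package main

import (
	"fmt"

	rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
	rayv1ac "github.com/ray-project/kuberay/ray-operator/pkg/client/applyconfiguration/ray/v1"
)

func main() {
	// Rules mode: shutdownAfterJobFinishes stays false; each rule carries its own
	// jobStatus + ttlSeconds trigger instead of a single global TTL.
	spec := rayv1ac.RayJobSpec().
		WithEntrypoint("python /home/ray/jobs/counter.py").
		WithShutdownAfterJobFinishes(false).
		WithDeletionStrategy(rayv1ac.DeletionStrategy().
			WithDeletionRules(
				rayv1ac.DeletionRule().
					WithPolicy(rayv1.DeleteWorkers).
					WithCondition(rayv1ac.DeletionCondition().
						WithJobStatus(rayv1.JobStatusSucceeded).
						WithTTLSeconds(15)),
				rayv1ac.DeletionRule().
					WithPolicy(rayv1.DeleteSelf).
					WithCondition(rayv1ac.DeletionCondition().
						WithJobStatus(rayv1.JobStatusSucceeded).
						WithTTLSeconds(60)),
			))

	// Combining deletionRules with WithShutdownAfterJobFinishes(true) or with the
	// legacy WithOnSuccess/WithOnFailure policies is rejected by the checks above.
	fmt.Printf("%+v\n", spec)
}
```

In the legacy style, both onSuccess and onFailure are set instead, as the "Legacy OnSuccess DeleteCluster" e2e case above does.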
@@ -429,9 +476,5 @@ func validateLegacyDeletionPolicies(rayJob *rayv1.RayJob) error { return fmt.Errorf("DeletionStrategy=DeleteWorkers currently does not support RayCluster with autoscaling enabled") } - if rayJob.Spec.ShutdownAfterJobFinishes && (*onSuccessPolicy.Policy == rayv1.DeleteNone || *onFailurePolicy.Policy == rayv1.DeleteNone) { - return fmt.Errorf("shutdownAfterJobFinishes is set to 'true' while deletion policy is 'DeleteNone'") - } - return nil } diff --git a/ray-operator/controllers/ray/utils/validation_test.go b/ray-operator/controllers/ray/utils/validation_test.go index 16f2911229d..d7f44900dd3 100644 --- a/ray-operator/controllers/ray/utils/validation_test.go +++ b/ray-operator/controllers/ray/utils/validation_test.go @@ -1012,7 +1012,7 @@ func TestValidateRayJobSpecWithFeatureGate(t *testing.T) { OnFailure: &rayv1.DeletionPolicy{ Policy: ptr.To(rayv1.DeleteCluster), }, - }, ShutdownAfterJobFinishes: true, + }, ShutdownAfterJobFinishes: false, RayClusterSpec: createBasicRayClusterSpec(), }, expectError: false, @@ -1048,7 +1048,7 @@ func TestValidateRayJobSpecWithFeatureGate(t *testing.T) { OnFailure: &rayv1.DeletionPolicy{ Policy: ptr.To(rayv1.DeleteNone), }, - }, ShutdownAfterJobFinishes: true, + }, ShutdownAfterJobFinishes: false, RayClusterSpec: createBasicRayClusterSpec(), }, expectError: true, @@ -1060,7 +1060,7 @@ func TestValidateRayJobSpecWithFeatureGate(t *testing.T) { OnFailure: &rayv1.DeletionPolicy{ Policy: ptr.To(rayv1.DeleteNone), }, - }, ShutdownAfterJobFinishes: true, + }, ShutdownAfterJobFinishes: false, RayClusterSpec: createBasicRayClusterSpec(), }, expectError: true, @@ -1072,7 +1072,7 @@ func TestValidateRayJobSpecWithFeatureGate(t *testing.T) { OnSuccess: &rayv1.DeletionPolicy{ Policy: ptr.To(rayv1.DeleteNone), }, - }, ShutdownAfterJobFinishes: true, + }, ShutdownAfterJobFinishes: false, RayClusterSpec: createBasicRayClusterSpec(), }, expectError: true, @@ -1085,7 +1085,7 @@ func TestValidateRayJobSpecWithFeatureGate(t *testing.T) { Policy: ptr.To(rayv1.DeleteNone), }, OnFailure: &rayv1.DeletionPolicy{}, - }, ShutdownAfterJobFinishes: true, + }, ShutdownAfterJobFinishes: false, RayClusterSpec: createBasicRayClusterSpec(), }, expectError: true, From 92a8d7edb167b9e1e6da124ad534859e10d002c2 Mon Sep 17 00:00:00 2001 From: wei-chenglai Date: Sat, 27 Sep 2025 00:26:34 +0000 Subject: [PATCH 16/21] refactor --- docs/reference/api.md | 36 +++---- ray-operator/apis/ray/v1/rayjob_types.go | 43 ++++---- .../controllers/ray/rayjob_controller.go | 2 +- .../controllers/ray/utils/validation.go | 98 ++++++------------- .../controllers/ray/utils/validation_test.go | 12 ++- .../rayjob_deletion_strategy_test.go | 57 +---------- 6 files changed, 77 insertions(+), 171 deletions(-) diff --git a/docs/reference/api.md b/docs/reference/api.md index 3b773074bb5..39b443f6701 100644 --- a/docs/reference/api.md +++ b/docs/reference/api.md @@ -124,33 +124,25 @@ _Appears in:_ -DeletionStrategy defines the deletion policies for a RayJob. -It allows for fine-grained control over resource cleanup after a job finishes. +DeletionStrategy configures automated cleanup after the RayJob reaches a terminal state. +Two mutually exclusive styles are supported: -Legacy fields `onSuccess` and `onFailure` are still supported for backward compatibility, -but it is highly recommended to migrate to the new `deletionRules` field. -`onSuccess` and `onFailure` will be removed in release 1.6.0. + Legacy: provide both onSuccess and onFailure (deprecated; removal planned for 1.6.0). 
May be combined with shutdownAfterJobFinishes and (optionally) global TTLSecondsAfterFinished. + Rules: provide deletionRules (list; may be empty to explicitly select rules mode). Rules mode is incompatible with shutdownAfterJobFinishes, legacy fields, and the global TTLSecondsAfterFinished (use per‑rule condition.ttlSeconds instead). -Notes: - - When this block is set, you must configure **either** - (a) BOTH `onSuccess` and `onFailure` policies, - OR - (b) the `deletionRules` field (which may be empty, in which case no deletion will occur). - - `onSuccess` / `onFailure` must NOT be used together with `deletionRules`. - - `onSuccess` and `onFailure` are **deprecated** and planned for removal in release 1.6.0. - - `deletionStrategy` is mutually exclusive with `spec.shutdownAfterJobFinishes`. - - If both are set, the controller will report an error and stop processing the RayJob. - - If the `RayJobDeletionPolicy` feature gate is disabled but `deletionStrategy` is set, - the controller will report an error and stop processing the RayJob. +Semantics: + - An empty deletionRules slice still selects rules mode. + - Legacy requires both onSuccess and onFailure; specifying only one is invalid. + - Global TTLSecondsAfterFinished > 0 requires shutdownAfterJobFinishes=true; therefore it cannot be used with rules mode or with legacy alone (no shutdown). + - Feature gate RayJobDeletionPolicy must be enabled when this block is present. -Validation rules: - 1. Prevent mixing legacy and new fields - - - 2. Require either both legacy fields or deletionRules presence +Validation: + - CRD XValidations prevent mixing legacy fields with deletionRules and enforce legacy completeness. + - Webhook/controller logic enforces rules vs shutdown exclusivity and TTL constraints. + - onSuccess/onFailure are deprecated; migration to deletionRules is encouraged. @@ -305,7 +297,7 @@ _Appears in:_ | `clusterSelector` _object (keys:string, values:string)_ | clusterSelector is used to select running rayclusters by labels | | | | `submitterConfig` _[SubmitterConfig](#submitterconfig)_ | Configurations of submitter k8s job. | | | | `managedBy` _string_ | ManagedBy is an optional configuration for the controller or entity that manages a RayJob.
The value must be either 'ray.io/kuberay-operator' or 'kueue.x-k8s.io/multikueue'.
The kuberay-operator reconciles a RayJob which doesn't have this field at all or
the field value is the reserved string 'ray.io/kuberay-operator',
but delegates reconciling the RayJob with 'kueue.x-k8s.io/multikueue' to the Kueue.
The field is immutable. | | | -| `deletionStrategy` _[DeletionStrategy](#deletionstrategy)_ | DeletionStrategy defines resource cleanup policies after job completion.
Use either legacy fields (onSuccess/onFailure) OR deletionRules, not both.
Mutually exclusive with spec.shutdownAfterJobFinishes.
Requires RayJobDeletionPolicy feature gate to be enabled. | | | +| `deletionStrategy` _[DeletionStrategy](#deletionstrategy)_ | DeletionStrategy automates post-completion cleanup.
Choose one style or omit:
- Legacy: both onSuccess & onFailure (deprecated; may combine with shutdownAfterJobFinishes and TTLSecondsAfterFinished).
- Rules: deletionRules (empty or non-empty) — incompatible with shutdownAfterJobFinishes, legacy fields, and global TTLSecondsAfterFinished (use per-rule condition.ttlSeconds).
Global TTLSecondsAfterFinished > 0 requires shutdownAfterJobFinishes=true.
Feature gate RayJobDeletionPolicy must be enabled when this field is set. | | | | `entrypoint` _string_ | Entrypoint represents the command to start execution. | | | | `runtimeEnvYAML` _string_ | RuntimeEnvYAML represents the runtime environment configuration
provided as a multi-line YAML string. | | | | `jobId` _string_ | If jobId is not set, a new jobId will be auto-generated. | | | diff --git a/ray-operator/apis/ray/v1/rayjob_types.go b/ray-operator/apis/ray/v1/rayjob_types.go index 37fd5dbe7f3..cd3a68db472 100644 --- a/ray-operator/apis/ray/v1/rayjob_types.go +++ b/ray-operator/apis/ray/v1/rayjob_types.go @@ -87,31 +87,24 @@ const ( SidecarMode JobSubmissionMode = "SidecarMode" // Submit job via a sidecar container in the Ray head Pod ) -// DeletionStrategy defines the deletion policies for a RayJob. -// It allows for fine-grained control over resource cleanup after a job finishes. +// DeletionStrategy configures automated cleanup after the RayJob reaches a terminal state. +// Two mutually exclusive styles are supported: // -// Legacy fields `onSuccess` and `onFailure` are still supported for backward compatibility, -// but it is highly recommended to migrate to the new `deletionRules` field. -// `onSuccess` and `onFailure` will be removed in release 1.6.0. +// Legacy: provide both onSuccess and onFailure (deprecated; removal planned for 1.6.0). May be combined with shutdownAfterJobFinishes and (optionally) global TTLSecondsAfterFinished. +// Rules: provide deletionRules (list; may be empty to explicitly select rules mode). Rules mode is incompatible with shutdownAfterJobFinishes, legacy fields, and the global TTLSecondsAfterFinished (use per‑rule condition.ttlSeconds instead). // -// Notes: -// - When this block is set, you must configure **either** -// (a) BOTH `onSuccess` and `onFailure` policies, -// OR -// (b) the `deletionRules` field (which may be empty, in which case no deletion will occur). -// - `onSuccess` / `onFailure` must NOT be used together with `deletionRules`. -// - `onSuccess` and `onFailure` are **deprecated** and planned for removal in release 1.6.0. -// - `deletionStrategy` is mutually exclusive with `spec.shutdownAfterJobFinishes`. -// - If both are set, the controller will report an error and stop processing the RayJob. -// - If the `RayJobDeletionPolicy` feature gate is disabled but `deletionStrategy` is set, -// the controller will report an error and stop processing the RayJob. +// Semantics: +// - An empty deletionRules slice still selects rules mode. +// - Legacy requires both onSuccess and onFailure; specifying only one is invalid. +// - Global TTLSecondsAfterFinished > 0 requires shutdownAfterJobFinishes=true; therefore it cannot be used with rules mode or with legacy alone (no shutdown). +// - Feature gate RayJobDeletionPolicy must be enabled when this block is present. // -// Validation rules: -// 1. Prevent mixing legacy and new fields +// Validation: +// - CRD XValidations prevent mixing legacy fields with deletionRules and enforce legacy completeness. +// - Webhook/controller logic enforces rules vs shutdown exclusivity and TTL constraints. +// - onSuccess/onFailure are deprecated; migration to deletionRules is encouraged. // // +kubebuilder:validation:XValidation:rule="!((has(self.onSuccess) || has(self.onFailure)) && has(self.deletionRules))",message="legacy policies (onSuccess/onFailure) and deletionRules cannot be used together within the same deletionStrategy" -// 2. 
Require either both legacy fields or deletionRules presence -// // +kubebuilder:validation:XValidation:rule="((has(self.onSuccess) && has(self.onFailure)) || has(self.deletionRules))",message="deletionStrategy requires either BOTH onSuccess and onFailure, OR the deletionRules field (which may be empty)" type DeletionStrategy struct { // OnSuccess is the deletion policy for a successful RayJob. @@ -232,10 +225,12 @@ type RayJobSpec struct { // +kubebuilder:validation:XValidation:rule="self in ['ray.io/kuberay-operator', 'kueue.x-k8s.io/multikueue']",message="the managedBy field value must be either 'ray.io/kuberay-operator' or 'kueue.x-k8s.io/multikueue'" // +optional ManagedBy *string `json:"managedBy,omitempty"` - // DeletionStrategy defines resource cleanup policies after job completion. - // Use either legacy fields (onSuccess/onFailure) OR deletionRules, not both. - // Mutually exclusive with spec.shutdownAfterJobFinishes. - // Requires RayJobDeletionPolicy feature gate to be enabled. + // DeletionStrategy automates post-completion cleanup. + // Choose one style or omit: + // - Legacy: both onSuccess & onFailure (deprecated; may combine with shutdownAfterJobFinishes and TTLSecondsAfterFinished). + // - Rules: deletionRules (empty or non-empty) — incompatible with shutdownAfterJobFinishes, legacy fields, and global TTLSecondsAfterFinished (use per-rule condition.ttlSeconds). + // Global TTLSecondsAfterFinished > 0 requires shutdownAfterJobFinishes=true. + // Feature gate RayJobDeletionPolicy must be enabled when this field is set. // +optional DeletionStrategy *DeletionStrategy `json:"deletionStrategy,omitempty"` // Entrypoint represents the command to start execution. diff --git a/ray-operator/controllers/ray/rayjob_controller.go b/ray-operator/controllers/ray/rayjob_controller.go index fb139256189..e2f58652347 100644 --- a/ray-operator/controllers/ray/rayjob_controller.go +++ b/ray-operator/controllers/ray/rayjob_controller.go @@ -373,7 +373,7 @@ func (r *RayJobReconciler) Reconcile(ctx context.Context, request ctrl.Request) if features.Enabled(features.RayJobDeletionPolicy) && rayJobInstance.Spec.DeletionStrategy != nil { // The previous validation logic ensures that either DeletionRules or the legacy policies are set, but not both. 
- if len(rayJobInstance.Spec.DeletionStrategy.DeletionRules) > 0 { + if rayJobInstance.Spec.DeletionStrategy.DeletionRules != nil { return r.handleDeletionRules(ctx, rayJobInstance) } return r.handleLegacyDeletionPolicy(ctx, rayJobInstance) diff --git a/ray-operator/controllers/ray/utils/validation.go b/ray-operator/controllers/ray/utils/validation.go index f521bd068fe..d063661cf2f 100644 --- a/ray-operator/controllers/ray/utils/validation.go +++ b/ray-operator/controllers/ray/utils/validation.go @@ -266,89 +266,47 @@ func ValidateRayServiceSpec(rayService *rayv1.RayService) error { // validateDeletionConfiguration validates both deletion strategy and TTL configuration func validateDeletionConfiguration(rayJob *rayv1.RayJob) error { - // Get deletion mode flags - usingShutdownAfterJobFinishes := rayJob.Spec.ShutdownAfterJobFinishes - usingDeletionRules := rayJob.Spec.DeletionStrategy != nil && len(rayJob.Spec.DeletionStrategy.DeletionRules) > 0 - usingLegacyAPI := rayJob.Spec.DeletionStrategy != nil && - (rayJob.Spec.DeletionStrategy.OnSuccess != nil || rayJob.Spec.DeletionStrategy.OnFailure != nil) - - // Validate feature gate requirements - if (usingDeletionRules || usingLegacyAPI) && !features.Enabled(features.RayJobDeletionPolicy) { - return fmt.Errorf("RayJobDeletionPolicy feature gate must be enabled to use the DeletionStrategy feature") - } - - // Validate mutual exclusivity - if err := validateDeletionMutualExclusivity(usingShutdownAfterJobFinishes, usingDeletionRules, usingLegacyAPI); err != nil { - return err - } - - // Validate TTL requirements - if rayJob.Spec.TTLSecondsAfterFinished > 0 { - if err := validateTTLRequirements(usingShutdownAfterJobFinishes, usingDeletionRules, usingLegacyAPI); err != nil { - return err - } + if !rayJob.Spec.ShutdownAfterJobFinishes && rayJob.Spec.TTLSecondsAfterFinished > 0 { + return fmt.Errorf("The RayJob spec is invalid: a RayJob with shutdownAfterJobFinishes set to false cannot have TTLSecondsAfterFinished") } - // Validate deletion strategy configuration - if rayJob.Spec.DeletionStrategy != nil && !usingDeletionRules && !usingLegacyAPI { - return fmt.Errorf("deletionStrategy is specified, but no policies are defined") + // No strategy block: nothing else to validate. + if rayJob.Spec.DeletionStrategy == nil { + return nil } - // Validate specific deletion modes - if usingDeletionRules { - return validateDeletionRules(rayJob) - } - if usingLegacyAPI { - return validateLegacyDeletionPolicies(rayJob) + // Feature gate must be enabled for any strategy usage. + if !features.Enabled(features.RayJobDeletionPolicy) { + return fmt.Errorf("RayJobDeletionPolicy feature gate must be enabled to use DeletionStrategy") } - return nil -} - -// validateDeletionMutualExclusivity ensures only one deletion approach is used -func validateDeletionMutualExclusivity(usingShutdownAfterJobFinishes, usingDeletionRules, usingLegacyAPI bool) error { - activeCount := 0 - var activeModes []string + legacyConfigured := rayJob.Spec.DeletionStrategy.OnSuccess != nil || rayJob.Spec.DeletionStrategy.OnFailure != nil + rulesConfigured := rayJob.Spec.DeletionStrategy.DeletionRules != nil // explicit empty slice counts as rules mode - if usingShutdownAfterJobFinishes { - activeCount++ - activeModes = append(activeModes, "spec.shutdownAfterJobFinishes=true") - } - if usingDeletionRules { - activeCount++ - activeModes = append(activeModes, "spec.deletionStrategy.deletionRules") + // Mutual exclusivity: rules mode forbids shutdown & legacy. 
(TTL+rules is implicitly invalid because TTL requires shutdown.) + if rulesConfigured && rayJob.Spec.ShutdownAfterJobFinishes { + return fmt.Errorf("The RayJob spec is invalid: spec.shutdownAfterJobFinishes and spec.deletionStrategy.deletionRules are mutually exclusive") } - if usingLegacyAPI { - activeCount++ - activeModes = append(activeModes, "spec.deletionStrategy.onSuccess/onFailure") + if rulesConfigured && legacyConfigured { + return fmt.Errorf("The RayJob spec is invalid: Cannot use both legacy onSuccess/onFailure fields and deletionRules simultaneously") } - if activeCount > 1 { - return fmt.Errorf("multiple deletion approaches are configured simultaneously: %v. Please use only one deletion strategy", activeModes) + // Detailed content validation + if legacyConfigured { + if err := validateLegacyDeletionPolicies(rayJob); err != nil { + return err + } + } else if rulesConfigured { + if err := validateDeletionRules(rayJob); err != nil { + return err + } + } else { + return fmt.Errorf("The RayJob spec is invalid: DeletionStrategy requires either BOTH onSuccess and onFailure, OR the deletionRules field (which may be empty list)") } return nil } -// validateTTLRequirements ensures TTL is only used with valid cleanup mechanisms -func validateTTLRequirements(usingShutdownAfterJobFinishes, usingDeletionRules, usingLegacyAPI bool) error { - // ShutdownAfterJobFinishes is always a valid cleanup mechanism - if usingShutdownAfterJobFinishes { - return nil - } - - // Deletion strategy provides cleanup, but only if feature gate is enabled - if features.Enabled(features.RayJobDeletionPolicy) && (usingDeletionRules || usingLegacyAPI) { - return nil - } - - // No valid cleanup mechanism found - if features.Enabled(features.RayJobDeletionPolicy) { - return fmt.Errorf("The RayJob spec is invalid: TTLSecondsAfterFinished requires either ShutdownAfterJobFinishes=true or a configured deletion strategy") - } - return fmt.Errorf("The RayJob spec is invalid: a RayJob with shutdownAfterJobFinishes set to false cannot have TTLSecondsAfterFinished") -} - // validateDeletionRules validates the deletion rules in the RayJob spec. // It performs per-rule validations, checks for uniqueness, and ensures logical TTL consistency. // Errors are collected and returned as a single aggregated error using errors.Join for better user feedback. 
@@ -476,5 +434,9 @@ func validateLegacyDeletionPolicies(rayJob *rayv1.RayJob) error { return fmt.Errorf("DeletionStrategy=DeleteWorkers currently does not support RayCluster with autoscaling enabled") } + if rayJob.Spec.ShutdownAfterJobFinishes && (*onSuccessPolicy.Policy == rayv1.DeleteNone || *onFailurePolicy.Policy == rayv1.DeleteNone) { + return fmt.Errorf("The RayJob spec is invalid: shutdownAfterJobFinshes is set to 'true' while deletion policy is 'DeleteNone'") + } + return nil } diff --git a/ray-operator/controllers/ray/utils/validation_test.go b/ray-operator/controllers/ray/utils/validation_test.go index d7f44900dd3..2314d5e79f5 100644 --- a/ray-operator/controllers/ray/utils/validation_test.go +++ b/ray-operator/controllers/ray/utils/validation_test.go @@ -1159,13 +1159,23 @@ func TestValidateRayJobSpecWithFeatureGate(t *testing.T) { expectError: true, }, { - name: "empty DeletionStrategy", + name: "nil DeletionStrategy", spec: rayv1.RayJobSpec{ DeletionStrategy: &rayv1.DeletionStrategy{}, RayClusterSpec: createBasicRayClusterSpec(), }, expectError: true, }, + { + name: "empty DeletionStrategy", + spec: rayv1.RayJobSpec{ + DeletionStrategy: &rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{}, + }, + RayClusterSpec: createBasicRayClusterSpec(), + }, + expectError: false, + }, { name: "duplicate rule in deletionRules", spec: rayv1.RayJobSpec{ diff --git a/ray-operator/test/e2erayjob/rayjob_deletion_strategy_test.go b/ray-operator/test/e2erayjob/rayjob_deletion_strategy_test.go index 4668ba0713c..49718d3544b 100644 --- a/ray-operator/test/e2erayjob/rayjob_deletion_strategy_test.go +++ b/ray-operator/test/e2erayjob/rayjob_deletion_strategy_test.go @@ -456,13 +456,13 @@ env_vars: env_vars: counter_name: test_counter `). - WithShutdownAfterJobFinishes(false). + WithShutdownAfterJobFinishes(true). WithTTLSecondsAfterFinished(10). // Legacy TTL for backward compatibility WithDeletionStrategy(rayv1ac.DeletionStrategy(). WithOnSuccess(rayv1ac.DeletionPolicy(). WithPolicy(rayv1.DeleteCluster)). WithOnFailure(rayv1ac.DeletionPolicy(). - WithPolicy(rayv1.DeleteNone))). + WithPolicy(rayv1.DeleteCluster))). WithSubmitterPodTemplate(JobSubmitterPodTemplateApplyConfiguration())) rayJob, err := test.Client().Ray().RayV1().RayJobs(namespace.Name).Apply(test.Ctx(), rayJobAC, TestApplyOptions) @@ -501,57 +501,4 @@ env_vars: g.Eventually(func() error { _, err := GetRayJob(test, job.Namespace, job.Name); return err }, TestTimeoutMedium).Should(WithTransform(k8serrors.IsNotFound, BeTrue())) LogWithTimestamp(test.T(), "Cleanup after legacy success scenario complete") }) - - test.T().Run("Legacy OnFailure DeleteNone should still work", func(_ *testing.T) { - rayJobAC := rayv1ac.RayJob("legacy-failure-test", namespace.Name). - WithSpec(rayv1ac.RayJobSpec(). - WithRayClusterSpec(NewRayClusterSpec(MountConfigMap[rayv1ac.RayClusterSpecApplyConfiguration](jobs, "/home/ray/jobs"))). - WithEntrypoint("python /home/ray/jobs/fail.py"). // Use failing script - WithShutdownAfterJobFinishes(false). - WithTTLSecondsAfterFinished(10). - WithDeletionStrategy(rayv1ac.DeletionStrategy(). - WithOnSuccess(rayv1ac.DeletionPolicy(). - WithPolicy(rayv1.DeleteCluster)). - WithOnFailure(rayv1ac.DeletionPolicy(). - WithPolicy(rayv1.DeleteNone))). 
- WithSubmitterPodTemplate(JobSubmitterPodTemplateApplyConfiguration())) - - rayJob, err := test.Client().Ray().RayV1().RayJobs(namespace.Name).Apply(test.Ctx(), rayJobAC, TestApplyOptions) - g.Expect(err).NotTo(HaveOccurred()) - LogWithTimestamp(test.T(), "Created legacy failure RayJob %s/%s successfully", rayJob.Namespace, rayJob.Name) - - // Wait for job to fail - g.Eventually(RayJob(test, rayJob.Namespace, rayJob.Name), TestTimeoutMedium). - Should(WithTransform(RayJobStatus, Equal(rayv1.JobStatusFailed))) - LogWithTimestamp(test.T(), "RayJob %s/%s failed as expected", rayJob.Namespace, rayJob.Name) - - // Get the associated RayCluster name - rayJob, err = GetRayJob(test, rayJob.Namespace, rayJob.Name) - g.Expect(err).NotTo(HaveOccurred()) - rayClusterName := rayJob.Status.RayClusterName - g.Expect(rayClusterName).NotTo(BeEmpty()) - - // Wait past the TTL and verify everything is preserved due to OnFailure=DeleteNone - LogWithTimestamp(test.T(), "Waiting past TTL to verify resources preserved by OnFailure=DeleteNone...") - g.Consistently(func(gg Gomega) { - jobObj, err := GetRayJob(test, rayJob.Namespace, rayJob.Name) - gg.Expect(err).NotTo(HaveOccurred()) - gg.Expect(jobObj).NotTo(BeNil()) - cluster, err := GetRayCluster(test, namespace.Name, rayClusterName) - gg.Expect(err).NotTo(HaveOccurred()) - gg.Expect(cluster).NotTo(BeNil()) - }, 15*time.Second, 2*time.Second).Should(Succeed()) - LogWithTimestamp(test.T(), "Legacy OnFailure=DeleteNone policy working correctly") - - // Cleanup: delete legacy failure RayJob (will also GC cluster) - LogWithTimestamp(test.T(), "Cleaning up legacy failure RayJob %s/%s", rayJob.Namespace, rayJob.Name) - err = test.Client().Ray().RayV1().RayJobs(rayJob.Namespace).Delete(test.Ctx(), rayJob.Name, metav1.DeleteOptions{}) - g.Expect(err).NotTo(HaveOccurred()) - g.Eventually(func() error { _, err := GetRayJob(test, rayJob.Namespace, rayJob.Name); return err }, TestTimeoutMedium).Should(WithTransform(k8serrors.IsNotFound, BeTrue())) - g.Eventually(func() error { - _, err := GetRayCluster(test, namespace.Name, rayClusterName) - return err - }, TestTimeoutMedium).Should(WithTransform(k8serrors.IsNotFound, BeTrue())) - LogWithTimestamp(test.T(), "Cleanup after legacy failure scenario complete") - }) } From 16a287ceed5d8171f6ac67a725fe8dcc94e1f960 Mon Sep 17 00:00:00 2001 From: wei-chenglai Date: Sat, 27 Sep 2025 01:08:06 +0000 Subject: [PATCH 17/21] trigger ci --- ray-operator/controllers/ray/rayjob_controller.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ray-operator/controllers/ray/rayjob_controller.go b/ray-operator/controllers/ray/rayjob_controller.go index e2f58652347..ecee99c17e9 100644 --- a/ray-operator/controllers/ray/rayjob_controller.go +++ b/ray-operator/controllers/ray/rayjob_controller.go @@ -1120,6 +1120,11 @@ func (r *RayJobReconciler) handleDeletionRules(ctx context.Context, rayJob *rayv var overdueRules []rayv1.DeletionRule var nextRequeueTime *time.Time + if len(rayJob.Spec.DeletionStrategy.DeletionRules) == 0 { + logger.Info("No deletion rules are defined; skipping deletion handling.") + return ctrl.Result{}, nil + } + // Categorize all applicable and incomplete rules into "overdue" or "pending". for _, rule := range rayJob.Spec.DeletionStrategy.DeletionRules { // Skip rules that don't match the current job status. 
From 0f9dd7b0a16ea1eb021d11779c55b8ded8d46172 Mon Sep 17 00:00:00 2001 From: wei-chenglai Date: Sat, 27 Sep 2025 03:12:48 +0000 Subject: [PATCH 18/21] trigger ci --- docs/reference/api.md | 6 +++--- helm-chart/kuberay-operator/crds/ray.io_rayjobs.yaml | 5 +++-- ray-operator/apis/ray/v1/rayjob_types.go | 8 ++++---- ray-operator/config/crd/bases/ray.io_rayjobs.yaml | 5 +++-- ray-operator/controllers/ray/rayjob_controller.go | 5 ----- ray-operator/controllers/ray/utils/validation.go | 4 ++-- .../controllers/ray/utils/validation_test.go | 12 ++++++------ 7 files changed, 21 insertions(+), 24 deletions(-) diff --git a/docs/reference/api.md b/docs/reference/api.md index 39b443f6701..b29e1f6f12a 100644 --- a/docs/reference/api.md +++ b/docs/reference/api.md @@ -129,11 +129,11 @@ Two mutually exclusive styles are supported: Legacy: provide both onSuccess and onFailure (deprecated; removal planned for 1.6.0). May be combined with shutdownAfterJobFinishes and (optionally) global TTLSecondsAfterFinished. - Rules: provide deletionRules (list; may be empty to explicitly select rules mode). Rules mode is incompatible with shutdownAfterJobFinishes, legacy fields, and the global TTLSecondsAfterFinished (use per‑rule condition.ttlSeconds instead). + Rules: provide deletionRules (non-empty list). Rules mode is incompatible with shutdownAfterJobFinishes, legacy fields, and the global TTLSecondsAfterFinished (use per‑rule condition.ttlSeconds instead). Semantics: - - An empty deletionRules slice still selects rules mode. + - A non-empty deletionRules selects rules mode; empty lists are treated as unset. - Legacy requires both onSuccess and onFailure; specifying only one is invalid. - Global TTLSecondsAfterFinished > 0 requires shutdownAfterJobFinishes=true; therefore it cannot be used with rules mode or with legacy alone (no shutdown). - Feature gate RayJobDeletionPolicy must be enabled when this block is present. @@ -297,7 +297,7 @@ _Appears in:_ | `clusterSelector` _object (keys:string, values:string)_ | clusterSelector is used to select running rayclusters by labels | | | | `submitterConfig` _[SubmitterConfig](#submitterconfig)_ | Configurations of submitter k8s job. | | | | `managedBy` _string_ | ManagedBy is an optional configuration for the controller or entity that manages a RayJob.
The value must be either 'ray.io/kuberay-operator' or 'kueue.x-k8s.io/multikueue'.
The kuberay-operator reconciles a RayJob which doesn't have this field at all or
the field value is the reserved string 'ray.io/kuberay-operator',
but delegates reconciling the RayJob with 'kueue.x-k8s.io/multikueue' to the Kueue.
The field is immutable. | | | -| `deletionStrategy` _[DeletionStrategy](#deletionstrategy)_ | DeletionStrategy automates post-completion cleanup.
Choose one style or omit:
- Legacy: both onSuccess & onFailure (deprecated; may combine with shutdownAfterJobFinishes and TTLSecondsAfterFinished).
- Rules: deletionRules (empty or non-empty) — incompatible with shutdownAfterJobFinishes, legacy fields, and global TTLSecondsAfterFinished (use per-rule condition.ttlSeconds).
Global TTLSecondsAfterFinished > 0 requires shutdownAfterJobFinishes=true.
Feature gate RayJobDeletionPolicy must be enabled when this field is set. | | | +| `deletionStrategy` _[DeletionStrategy](#deletionstrategy)_ | DeletionStrategy automates post-completion cleanup.
Choose one style or omit:
- Legacy: both onSuccess & onFailure (deprecated; may combine with shutdownAfterJobFinishes and TTLSecondsAfterFinished).
- Rules: deletionRules (non-empty) — incompatible with shutdownAfterJobFinishes, legacy fields, and global TTLSecondsAfterFinished (use per-rule condition.ttlSeconds).
Global TTLSecondsAfterFinished > 0 requires shutdownAfterJobFinishes=true.
Feature gate RayJobDeletionPolicy must be enabled when this field is set. | | | | `entrypoint` _string_ | Entrypoint represents the command to start execution. | | | | `runtimeEnvYAML` _string_ | RuntimeEnvYAML represents the runtime environment configuration
provided as a multi-line YAML string. | | | | `jobId` _string_ | If jobId is not set, a new jobId will be auto-generated. | | | diff --git a/helm-chart/kuberay-operator/crds/ray.io_rayjobs.yaml b/helm-chart/kuberay-operator/crds/ray.io_rayjobs.yaml index 8ee2bc5ce4d..e5e23e6666a 100644 --- a/helm-chart/kuberay-operator/crds/ray.io_rayjobs.yaml +++ b/helm-chart/kuberay-operator/crds/ray.io_rayjobs.yaml @@ -117,8 +117,9 @@ spec: cannot be used together within the same deletionStrategy rule: '!((has(self.onSuccess) || has(self.onFailure)) && has(self.deletionRules))' - message: deletionStrategy requires either BOTH onSuccess and onFailure, - OR the deletionRules field (which may be empty) - rule: ((has(self.onSuccess) && has(self.onFailure)) || has(self.deletionRules)) + OR the deletionRules field (cannot be empty) + rule: ((has(self.onSuccess) && has(self.onFailure)) || (has(self.deletionRules) + && size(self.deletionRules) > 0)) entrypoint: type: string entrypointNumCpus: diff --git a/ray-operator/apis/ray/v1/rayjob_types.go b/ray-operator/apis/ray/v1/rayjob_types.go index cd3a68db472..0c87d0c08c3 100644 --- a/ray-operator/apis/ray/v1/rayjob_types.go +++ b/ray-operator/apis/ray/v1/rayjob_types.go @@ -91,10 +91,10 @@ const ( // Two mutually exclusive styles are supported: // // Legacy: provide both onSuccess and onFailure (deprecated; removal planned for 1.6.0). May be combined with shutdownAfterJobFinishes and (optionally) global TTLSecondsAfterFinished. -// Rules: provide deletionRules (list; may be empty to explicitly select rules mode). Rules mode is incompatible with shutdownAfterJobFinishes, legacy fields, and the global TTLSecondsAfterFinished (use per‑rule condition.ttlSeconds instead). +// Rules: provide deletionRules (non-empty list). Rules mode is incompatible with shutdownAfterJobFinishes, legacy fields, and the global TTLSecondsAfterFinished (use per‑rule condition.ttlSeconds instead). // // Semantics: -// - An empty deletionRules slice still selects rules mode. +// - A non-empty deletionRules selects rules mode; empty lists are treated as unset. // - Legacy requires both onSuccess and onFailure; specifying only one is invalid. // - Global TTLSecondsAfterFinished > 0 requires shutdownAfterJobFinishes=true; therefore it cannot be used with rules mode or with legacy alone (no shutdown). // - Feature gate RayJobDeletionPolicy must be enabled when this block is present. @@ -105,7 +105,7 @@ const ( // - onSuccess/onFailure are deprecated; migration to deletionRules is encouraged. // // +kubebuilder:validation:XValidation:rule="!((has(self.onSuccess) || has(self.onFailure)) && has(self.deletionRules))",message="legacy policies (onSuccess/onFailure) and deletionRules cannot be used together within the same deletionStrategy" -// +kubebuilder:validation:XValidation:rule="((has(self.onSuccess) && has(self.onFailure)) || has(self.deletionRules))",message="deletionStrategy requires either BOTH onSuccess and onFailure, OR the deletionRules field (which may be empty)" +// +kubebuilder:validation:XValidation:rule="((has(self.onSuccess) && has(self.onFailure)) || (has(self.deletionRules) && size(self.deletionRules) > 0))",message="deletionStrategy requires either BOTH onSuccess and onFailure, OR the deletionRules field (cannot be empty)" type DeletionStrategy struct { // OnSuccess is the deletion policy for a successful RayJob. // Deprecated: Use `deletionRules` instead for more flexible, multi-stage deletion strategies. 
@@ -228,7 +228,7 @@ type RayJobSpec struct { // DeletionStrategy automates post-completion cleanup. // Choose one style or omit: // - Legacy: both onSuccess & onFailure (deprecated; may combine with shutdownAfterJobFinishes and TTLSecondsAfterFinished). - // - Rules: deletionRules (empty or non-empty) — incompatible with shutdownAfterJobFinishes, legacy fields, and global TTLSecondsAfterFinished (use per-rule condition.ttlSeconds). + // - Rules: deletionRules (non-empty) — incompatible with shutdownAfterJobFinishes, legacy fields, and global TTLSecondsAfterFinished (use per-rule condition.ttlSeconds). // Global TTLSecondsAfterFinished > 0 requires shutdownAfterJobFinishes=true. // Feature gate RayJobDeletionPolicy must be enabled when this field is set. // +optional diff --git a/ray-operator/config/crd/bases/ray.io_rayjobs.yaml b/ray-operator/config/crd/bases/ray.io_rayjobs.yaml index 8ee2bc5ce4d..e5e23e6666a 100644 --- a/ray-operator/config/crd/bases/ray.io_rayjobs.yaml +++ b/ray-operator/config/crd/bases/ray.io_rayjobs.yaml @@ -117,8 +117,9 @@ spec: cannot be used together within the same deletionStrategy rule: '!((has(self.onSuccess) || has(self.onFailure)) && has(self.deletionRules))' - message: deletionStrategy requires either BOTH onSuccess and onFailure, - OR the deletionRules field (which may be empty) - rule: ((has(self.onSuccess) && has(self.onFailure)) || has(self.deletionRules)) + OR the deletionRules field (cannot be empty) + rule: ((has(self.onSuccess) && has(self.onFailure)) || (has(self.deletionRules) + && size(self.deletionRules) > 0)) entrypoint: type: string entrypointNumCpus: diff --git a/ray-operator/controllers/ray/rayjob_controller.go b/ray-operator/controllers/ray/rayjob_controller.go index ecee99c17e9..e2f58652347 100644 --- a/ray-operator/controllers/ray/rayjob_controller.go +++ b/ray-operator/controllers/ray/rayjob_controller.go @@ -1120,11 +1120,6 @@ func (r *RayJobReconciler) handleDeletionRules(ctx context.Context, rayJob *rayv var overdueRules []rayv1.DeletionRule var nextRequeueTime *time.Time - if len(rayJob.Spec.DeletionStrategy.DeletionRules) == 0 { - logger.Info("No deletion rules are defined; skipping deletion handling.") - return ctrl.Result{}, nil - } - // Categorize all applicable and incomplete rules into "overdue" or "pending". for _, rule := range rayJob.Spec.DeletionStrategy.DeletionRules { // Skip rules that don't match the current job status. diff --git a/ray-operator/controllers/ray/utils/validation.go b/ray-operator/controllers/ray/utils/validation.go index d063661cf2f..edda0b772d5 100644 --- a/ray-operator/controllers/ray/utils/validation.go +++ b/ray-operator/controllers/ray/utils/validation.go @@ -281,7 +281,7 @@ func validateDeletionConfiguration(rayJob *rayv1.RayJob) error { } legacyConfigured := rayJob.Spec.DeletionStrategy.OnSuccess != nil || rayJob.Spec.DeletionStrategy.OnFailure != nil - rulesConfigured := rayJob.Spec.DeletionStrategy.DeletionRules != nil // explicit empty slice counts as rules mode + rulesConfigured := len(rayJob.Spec.DeletionStrategy.DeletionRules) > 0 // Mutual exclusivity: rules mode forbids shutdown & legacy. (TTL+rules is implicitly invalid because TTL requires shutdown.) 
if rulesConfigured && rayJob.Spec.ShutdownAfterJobFinishes { @@ -301,7 +301,7 @@ func validateDeletionConfiguration(rayJob *rayv1.RayJob) error { return err } } else { - return fmt.Errorf("The RayJob spec is invalid: DeletionStrategy requires either BOTH onSuccess and onFailure, OR the deletionRules field (which may be empty list)") + return fmt.Errorf("The RayJob spec is invalid: DeletionStrategy requires either BOTH onSuccess and onFailure, OR the deletionRules field (cannot be empty)") } return nil diff --git a/ray-operator/controllers/ray/utils/validation_test.go b/ray-operator/controllers/ray/utils/validation_test.go index 2314d5e79f5..30eb499beaf 100644 --- a/ray-operator/controllers/ray/utils/validation_test.go +++ b/ray-operator/controllers/ray/utils/validation_test.go @@ -1012,7 +1012,7 @@ func TestValidateRayJobSpecWithFeatureGate(t *testing.T) { OnFailure: &rayv1.DeletionPolicy{ Policy: ptr.To(rayv1.DeleteCluster), }, - }, ShutdownAfterJobFinishes: false, + }, ShutdownAfterJobFinishes: true, RayClusterSpec: createBasicRayClusterSpec(), }, expectError: false, @@ -1048,7 +1048,7 @@ func TestValidateRayJobSpecWithFeatureGate(t *testing.T) { OnFailure: &rayv1.DeletionPolicy{ Policy: ptr.To(rayv1.DeleteNone), }, - }, ShutdownAfterJobFinishes: false, + }, ShutdownAfterJobFinishes: true, RayClusterSpec: createBasicRayClusterSpec(), }, expectError: true, @@ -1060,7 +1060,7 @@ func TestValidateRayJobSpecWithFeatureGate(t *testing.T) { OnFailure: &rayv1.DeletionPolicy{ Policy: ptr.To(rayv1.DeleteNone), }, - }, ShutdownAfterJobFinishes: false, + }, ShutdownAfterJobFinishes: true, RayClusterSpec: createBasicRayClusterSpec(), }, expectError: true, @@ -1072,7 +1072,7 @@ func TestValidateRayJobSpecWithFeatureGate(t *testing.T) { OnSuccess: &rayv1.DeletionPolicy{ Policy: ptr.To(rayv1.DeleteNone), }, - }, ShutdownAfterJobFinishes: false, + }, ShutdownAfterJobFinishes: true, RayClusterSpec: createBasicRayClusterSpec(), }, expectError: true, @@ -1085,7 +1085,7 @@ func TestValidateRayJobSpecWithFeatureGate(t *testing.T) { Policy: ptr.To(rayv1.DeleteNone), }, OnFailure: &rayv1.DeletionPolicy{}, - }, ShutdownAfterJobFinishes: false, + }, ShutdownAfterJobFinishes: true, RayClusterSpec: createBasicRayClusterSpec(), }, expectError: true, @@ -1174,7 +1174,7 @@ func TestValidateRayJobSpecWithFeatureGate(t *testing.T) { }, RayClusterSpec: createBasicRayClusterSpec(), }, - expectError: false, + expectError: true, }, { name: "duplicate rule in deletionRules", From 7e162086f5a9609363df6d7add582abe7f2ee510 Mon Sep 17 00:00:00 2001 From: wei-chenglai Date: Sat, 27 Sep 2025 19:49:22 +0000 Subject: [PATCH 19/21] refactor description --- docs/reference/api.md | 4 ++-- helm-chart/kuberay-operator/crds/ray.io_rayjobs.yaml | 4 ++-- ray-operator/apis/ray/v1/rayjob_types.go | 7 ++++--- ray-operator/config/crd/bases/ray.io_rayjobs.yaml | 4 ++-- 4 files changed, 10 insertions(+), 9 deletions(-) diff --git a/docs/reference/api.md b/docs/reference/api.md index b29e1f6f12a..ced95d1e45d 100644 --- a/docs/reference/api.md +++ b/docs/reference/api.md @@ -141,7 +141,7 @@ Semantics: Validation: - CRD XValidations prevent mixing legacy fields with deletionRules and enforce legacy completeness. - - Webhook/controller logic enforces rules vs shutdown exclusivity and TTL constraints. + - Controller logic enforces rules vs shutdown exclusivity and TTL constraints. - onSuccess/onFailure are deprecated; migration to deletionRules is encouraged. 
@@ -153,7 +153,7 @@ _Appears in:_ | --- | --- | --- | --- | | `onSuccess` _[DeletionPolicy](#deletionpolicy)_ | OnSuccess is the deletion policy for a successful RayJob.
Deprecated: Use `deletionRules` instead for more flexible, multi-stage deletion strategies.
This field will be removed in release 1.6.0. | | | | `onFailure` _[DeletionPolicy](#deletionpolicy)_ | OnFailure is the deletion policy for a failed RayJob.
Deprecated: Use `deletionRules` instead for more flexible, multi-stage deletion strategies.
This field will be removed in release 1.6.0. | | | -| `deletionRules` _[DeletionRule](#deletionrule) array_ | DeletionRules is a list of deletion rules, processed based on their trigger conditions.
While the rules can be used to define a sequence, if multiple rules are overdue (e.g., due to controller downtime),
the most impactful rule (e.g., DeleteSelf) will be executed first to prioritize resource cleanup and cost savings. | | | +| `deletionRules` _[DeletionRule](#deletionrule) array_ | DeletionRules is a list of deletion rules, processed based on their trigger conditions.
While the rules can be used to define a sequence, if multiple rules are overdue (e.g., due to controller downtime),
the most impactful rule (e.g., DeleteSelf) will be executed first to prioritize resource cleanup. | | MinItems: 1
| diff --git a/helm-chart/kuberay-operator/crds/ray.io_rayjobs.yaml b/helm-chart/kuberay-operator/crds/ray.io_rayjobs.yaml index e5e23e6666a..f613645cb64 100644 --- a/helm-chart/kuberay-operator/crds/ray.io_rayjobs.yaml +++ b/helm-chart/kuberay-operator/crds/ray.io_rayjobs.yaml @@ -89,6 +89,7 @@ spec: - condition - policy type: object + minItems: 1 type: array x-kubernetes-list-type: atomic onFailure: @@ -118,8 +119,7 @@ spec: rule: '!((has(self.onSuccess) || has(self.onFailure)) && has(self.deletionRules))' - message: deletionStrategy requires either BOTH onSuccess and onFailure, OR the deletionRules field (cannot be empty) - rule: ((has(self.onSuccess) && has(self.onFailure)) || (has(self.deletionRules) - && size(self.deletionRules) > 0)) + rule: ((has(self.onSuccess) && has(self.onFailure)) || has(self.deletionRules)) entrypoint: type: string entrypointNumCpus: diff --git a/ray-operator/apis/ray/v1/rayjob_types.go b/ray-operator/apis/ray/v1/rayjob_types.go index 0c87d0c08c3..705d50dfd40 100644 --- a/ray-operator/apis/ray/v1/rayjob_types.go +++ b/ray-operator/apis/ray/v1/rayjob_types.go @@ -101,11 +101,11 @@ const ( // // Validation: // - CRD XValidations prevent mixing legacy fields with deletionRules and enforce legacy completeness. -// - Webhook/controller logic enforces rules vs shutdown exclusivity and TTL constraints. +// - Controller logic enforces rules vs shutdown exclusivity and TTL constraints. // - onSuccess/onFailure are deprecated; migration to deletionRules is encouraged. // // +kubebuilder:validation:XValidation:rule="!((has(self.onSuccess) || has(self.onFailure)) && has(self.deletionRules))",message="legacy policies (onSuccess/onFailure) and deletionRules cannot be used together within the same deletionStrategy" -// +kubebuilder:validation:XValidation:rule="((has(self.onSuccess) && has(self.onFailure)) || (has(self.deletionRules) && size(self.deletionRules) > 0))",message="deletionStrategy requires either BOTH onSuccess and onFailure, OR the deletionRules field (cannot be empty)" +// +kubebuilder:validation:XValidation:rule="((has(self.onSuccess) && has(self.onFailure)) || has(self.deletionRules))",message="deletionStrategy requires either BOTH onSuccess and onFailure, OR the deletionRules field (cannot be empty)" type DeletionStrategy struct { // OnSuccess is the deletion policy for a successful RayJob. // Deprecated: Use `deletionRules` instead for more flexible, multi-stage deletion strategies. @@ -121,9 +121,10 @@ type DeletionStrategy struct { // DeletionRules is a list of deletion rules, processed based on their trigger conditions. // While the rules can be used to define a sequence, if multiple rules are overdue (e.g., due to controller downtime), - // the most impactful rule (e.g., DeleteSelf) will be executed first to prioritize resource cleanup and cost savings. + // the most impactful rule (e.g., DeleteSelf) will be executed first to prioritize resource cleanup. 
// +optional // +listType=atomic + // +kubebuilder:validation:MinItems=1 DeletionRules []DeletionRule `json:"deletionRules,omitempty"` } diff --git a/ray-operator/config/crd/bases/ray.io_rayjobs.yaml b/ray-operator/config/crd/bases/ray.io_rayjobs.yaml index e5e23e6666a..f613645cb64 100644 --- a/ray-operator/config/crd/bases/ray.io_rayjobs.yaml +++ b/ray-operator/config/crd/bases/ray.io_rayjobs.yaml @@ -89,6 +89,7 @@ spec: - condition - policy type: object + minItems: 1 type: array x-kubernetes-list-type: atomic onFailure: @@ -118,8 +119,7 @@ spec: rule: '!((has(self.onSuccess) || has(self.onFailure)) && has(self.deletionRules))' - message: deletionStrategy requires either BOTH onSuccess and onFailure, OR the deletionRules field (cannot be empty) - rule: ((has(self.onSuccess) && has(self.onFailure)) || (has(self.deletionRules) - && size(self.deletionRules) > 0)) + rule: ((has(self.onSuccess) && has(self.onFailure)) || has(self.deletionRules)) entrypoint: type: string entrypointNumCpus: From 8fd17f9bf3e3a8f69c610151762f22343d00f650 Mon Sep 17 00:00:00 2001 From: wei-chenglai Date: Tue, 7 Oct 2025 00:32:39 +0000 Subject: [PATCH 20/21] improve deletion check --- .../controllers/ray/rayjob_controller.go | 21 +++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/ray-operator/controllers/ray/rayjob_controller.go b/ray-operator/controllers/ray/rayjob_controller.go index e2f58652347..4fa4d70ac1a 100644 --- a/ray-operator/controllers/ray/rayjob_controller.go +++ b/ray-operator/controllers/ray/rayjob_controller.go @@ -1305,6 +1305,11 @@ func (r *RayJobReconciler) isDeletionActionCompleted(ctx context.Context, rayJob return false, err } + if !cluster.DeletionTimestamp.IsZero() { + // If the cluster is being deleted, we consider the action complete. + return true, nil + } + // If the cluster exists, check if all worker groups are suspended. for _, wg := range cluster.Spec.WorkerGroupSpecs { if wg.Suspend == nil || !*wg.Suspend { @@ -1316,12 +1321,20 @@ func (r *RayJobReconciler) isDeletionActionCompleted(ctx context.Context, rayJob return true, nil case rayv1.DeleteCluster: - err := r.Get(ctx, clusterIdentifier, cluster) - if errors.IsNotFound(err) { - // Cluster not found means the deletion is complete. + if err := r.Get(ctx, clusterIdentifier, cluster); err != nil { + if errors.IsNotFound(err) { + return true, nil + } + // For any other error, we can't be sure of the state, so report the error. + return false, err + } + + if !cluster.DeletionTimestamp.IsZero() { + // If the cluster is being deleted, we consider the action complete. return true, nil } - return false, err + + return false, nil case rayv1.DeleteSelf: // This action is terminal. 
If this function is running, the RayJob still exists,

From 80a6177ebb3c14edfca59c9cf2f32f2babfcf99e Mon Sep 17 00:00:00 2001
From: Wei-Cheng Lai 
Date: Wed, 8 Oct 2025 20:18:48 -0400
Subject: [PATCH 21/21] remove redundant comment

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Signed-off-by: Wei-Cheng Lai 
---
 ray-operator/controllers/ray/rayjob_controller.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ray-operator/controllers/ray/rayjob_controller.go b/ray-operator/controllers/ray/rayjob_controller.go
index 4fa4d70ac1a..91959a6197c 100644
--- a/ray-operator/controllers/ray/rayjob_controller.go
+++ b/ray-operator/controllers/ray/rayjob_controller.go
@@ -1244,7 +1244,7 @@ func (r *RayJobReconciler) handleShutdownAfterJobFinishes(ctx context.Context, r
 		}
 	} else {
 		// We only need to delete the RayCluster. We don't need to delete the submitter Kubernetes Job so that users can still access
-		// the driver logs. In addition, a completed Kubernetes Job does not actually use any compute resources.
+		// the driver logs. In addition, a completed Kubernetes Job does not actually use any compute resources.
 		_, err = r.deleteClusterResources(ctx, rayJob)
 		if err == nil {
 			logger.Info("RayCluster is deleted", "RayCluster", rayJob.Status.RayClusterName)
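
To make the completion checks introduced above concrete, a hedged sketch of a two-stage strategy that isDeletionActionCompleted would evaluate is shown below: worker pods are released first, and the RayCluster is removed later. Condition field names are assumptions, not a verified schema.

# Illustrative sketch, not part of the patch series; condition field names are assumed.
apiVersion: ray.io/v1
kind: RayJob
metadata:
  name: staged-cleanup-rayjob
spec:
  entrypoint: python /home/ray/samples/sample.py
  deletionStrategy:
    deletionRules:
      - policy: DeleteWorkers    # treated as complete once every worker group is suspended
        condition:
          jobStatus: FAILED              # assumed field name
          ttlSecondsAfterFinished: 60
      - policy: DeleteCluster    # treated as complete once the RayCluster is gone or terminating
        condition:
          jobStatus: FAILED
          ttlSecondsAfterFinished: 600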