From bc17214e8d1c1502516ece12882d50d452a3b1e2 Mon Sep 17 00:00:00 2001 From: wei-chenglai Date: Wed, 3 Sep 2025 04:40:20 +0000 Subject: [PATCH 01/21] [CRD][RayJob] Define new DeletionStrategy in RayJob CRD Signed-off-by: wei-chenglai --- docs/reference/api.md | 66 +++- .../kuberay-operator/crds/ray.io_rayjobs.yaml | 65 +++- ray-operator/apis/ray/v1/rayjob_types.go | 79 +++- .../apis/ray/v1/zz_generated.deepcopy.go | 36 ++ .../config/crd/bases/ray.io_rayjobs.yaml | 65 +++- .../controllers/ray/rayjob_controller.go | 361 ++++++++++++++---- .../controllers/ray/utils/validation.go | 212 ++++++++-- .../controllers/ray/utils/validation_test.go | 239 ++++++++++++ .../ray/v1/deletioncondition.go | 36 ++ .../applyconfiguration/ray/v1/deletionrule.go | 36 ++ .../ray/v1/deletionstrategy.go | 18 +- .../pkg/client/applyconfiguration/utils.go | 4 + 12 files changed, 1052 insertions(+), 165 deletions(-) create mode 100644 ray-operator/pkg/client/applyconfiguration/ray/v1/deletioncondition.go create mode 100644 ray-operator/pkg/client/applyconfiguration/ray/v1/deletionrule.go diff --git a/docs/reference/api.md b/docs/reference/api.md index 4b495fef69e..4d3a87a9bce 100644 --- a/docs/reference/api.md +++ b/docs/reference/api.md @@ -55,12 +55,29 @@ _Appears in:_ -#### DeletionPolicy +#### DeletionCondition + +DeletionCondition specifies the trigger conditions for a deletion action. +_Appears in:_ +- [DeletionRule](#deletionrule) + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `ttlSecondsAfterFinished` _integer_ | TTLSecondsAfterFinished is the time in seconds from when the JobStatus
reaches the specified terminal state to when this deletion action should be triggered.
The value must be a non-negative integer. | 0 | Minimum: 0
| + + +#### DeletionPolicy + + + +DeletionPolicy is the legacy single-stage deletion policy. +Deprecated: This struct is part of the legacy API. Use DeletionRule for new configurations. + _Appears in:_ @@ -68,7 +85,7 @@ _Appears in:_ | Field | Description | Default | Validation | | --- | --- | --- | --- | -| `policy` _[DeletionPolicyType](#deletionpolicytype)_ | Valid values are 'DeleteCluster', 'DeleteWorkers', 'DeleteSelf' or 'DeleteNone'. | | | +| `policy` _[DeletionPolicyType](#deletionpolicytype)_ | Policy is the action to take when the condition is met.
This field is logically required when using the legacy OnSuccess/OnFailure policies.
It is marked as '+optional' at the API level to allow the 'deletionRules' field to be used instead. | | Enum: [DeleteCluster DeleteWorkers DeleteSelf DeleteNone]
| #### DeletionPolicyType @@ -81,14 +98,54 @@ _Underlying type:_ _string_ _Appears in:_ - [DeletionPolicy](#deletionpolicy) +- [DeletionRule](#deletionrule) + + + +#### DeletionRule + +DeletionRule defines a single deletion action and its trigger condition. +This is the new, recommended way to define deletion behavior. + + + +_Appears in:_ +- [DeletionStrategy](#deletionstrategy) + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `policy` _[DeletionPolicyType](#deletionpolicytype)_ | Policy is the action to take when the condition is met. This field is required. | | Enum: [DeleteCluster DeleteWorkers DeleteSelf DeleteNone]
| +| `condition` _[DeletionCondition](#deletioncondition)_ | The condition under which this deletion rule is triggered. This field is required. | | | + #### DeletionStrategy +DeletionStrategy defines the deletion policies for a RayJob. +It allows for fine-grained control over resource cleanup after a job finishes. + + +Legacy fields `onSuccess` and `onFailure` are still supported for backward compatibility, +but it is highly recommended to migrate to the new `deletionRules` field. + + +Notes: + - When this block is set, you must configure **either** + (a) BOTH `onSuccess` and `onFailure` policies, + OR + (b) the `deletionRules` field (which may be empty, in which case no deletion will occur). + - `onSuccess` / `onFailure` must NOT be used together with `deletionRules`. + - `onSuccess` and `onFailure` are **deprecated** and planned for removal in a future release. + + +Validation rules: + 1. Prevent mixing legacy and new fields + + 2. Require either both legacy fields or deletionRules presence @@ -97,8 +154,9 @@ _Appears in:_ | Field | Description | Default | Validation | | --- | --- | --- | --- | -| `onSuccess` _[DeletionPolicy](#deletionpolicy)_ | | | | -| `onFailure` _[DeletionPolicy](#deletionpolicy)_ | | | | +| `onSuccess` _[DeletionPolicy](#deletionpolicy)_ | OnSuccess is the deletion policy for a successful RayJob.
Deprecated: Use `deletionRules` instead for more flexible, multi-stage deletion strategies.
This field will be removed in a future release. | | | +| `onFailure` _[DeletionPolicy](#deletionpolicy)_ | OnFailure is the deletion policy for a failed RayJob.
Deprecated: Use `deletionRules` instead for more flexible, multi-stage deletion strategies.
This field will be removed in a future release. | | | +| `deletionRules` _[DeletionRule](#deletionrule) array_ | DeletionRules is a list of deletion rules, processed based on their trigger conditions.
While the rules can be used to define a sequence, if multiple rules are overdue (e.g., due to controller downtime),
the most impactful rule (e.g., DeleteCluster) will be executed first to prioritize resource cleanup and cost savings. | | | diff --git a/helm-chart/kuberay-operator/crds/ray.io_rayjobs.yaml b/helm-chart/kuberay-operator/crds/ray.io_rayjobs.yaml index 8f8679ca607..15e15996f7b 100644 --- a/helm-chart/kuberay-operator/crds/ray.io_rayjobs.yaml +++ b/helm-chart/kuberay-operator/crds/ray.io_rayjobs.yaml @@ -60,34 +60,65 @@ spec: type: object deletionStrategy: properties: + deletionRules: + items: + properties: + condition: + properties: + jobStatus: + enum: + - SUCCEEDED + - FAILED + type: string + ttlSecondsAfterFinished: + default: 0 + format: int32 + minimum: 0 + type: integer + required: + - jobStatus + type: object + policy: + enum: + - DeleteCluster + - DeleteWorkers + - DeleteSelf + - DeleteNone + type: string + required: + - condition + - policy + type: object + type: array + x-kubernetes-list-type: atomic onFailure: properties: policy: + enum: + - DeleteCluster + - DeleteWorkers + - DeleteSelf + - DeleteNone type: string - x-kubernetes-validations: - - message: the policy field value must be either 'DeleteCluster', - 'DeleteWorkers', 'DeleteSelf', or 'DeleteNone' - rule: self in ['DeleteCluster', 'DeleteWorkers', 'DeleteSelf', - 'DeleteNone'] - required: - - policy type: object onSuccess: properties: policy: + enum: + - DeleteCluster + - DeleteWorkers + - DeleteSelf + - DeleteNone type: string - x-kubernetes-validations: - - message: the policy field value must be either 'DeleteCluster', - 'DeleteWorkers', 'DeleteSelf', or 'DeleteNone' - rule: self in ['DeleteCluster', 'DeleteWorkers', 'DeleteSelf', - 'DeleteNone'] - required: - - policy type: object - required: - - onFailure - - onSuccess type: object + x-kubernetes-validations: + - message: legacy policies (onSuccess/onFailure) and deletionRules + cannot be used together within the same deletionStrategy + rule: '!((has(self.onSuccess) || has(self.onFailure)) && has(self.deletionRules))' + - message: deletionStrategy requires either BOTH onSuccess and onFailure, + OR the deletionRules field (which may be empty) + rule: ((has(self.onSuccess) && has(self.onFailure)) || has(self.deletionRules)) entrypoint: type: string entrypointNumCpus: diff --git a/ray-operator/apis/ray/v1/rayjob_types.go b/ray-operator/apis/ray/v1/rayjob_types.go index 54a2ef7bce2..683ff6434b3 100644 --- a/ray-operator/apis/ray/v1/rayjob_types.go +++ b/ray-operator/apis/ray/v1/rayjob_types.go @@ -87,15 +87,84 @@ const ( type DeletionPolicyType string +// DeletionStrategy defines the deletion policies for a RayJob. +// It allows for fine-grained control over resource cleanup after a job finishes. +// +// Legacy fields `onSuccess` and `onFailure` are still supported for backward compatibility, +// but it is highly recommended to migrate to the new `deletionRules` field. +// +// Notes: +// - When this block is set, you must configure **either** +// (a) BOTH `onSuccess` and `onFailure` policies, +// OR +// (b) the `deletionRules` field (which may be empty, in which case no deletion will occur). +// - `onSuccess` / `onFailure` must NOT be used together with `deletionRules`. +// - `onSuccess` and `onFailure` are **deprecated** and planned for removal in a future release. +// +// Validation rules: +// 1. 
Prevent mixing legacy and new fields +// +// +kubebuilder:validation:XValidation:rule="!((has(self.onSuccess) || has(self.onFailure)) && has(self.deletionRules))",message="legacy policies (onSuccess/onFailure) and deletionRules cannot be used together within the same deletionStrategy" +// 2. Require either both legacy fields or deletionRules presence +// +// +kubebuilder:validation:XValidation:rule="((has(self.onSuccess) && has(self.onFailure)) || has(self.deletionRules))",message="deletionStrategy requires either BOTH onSuccess and onFailure, OR the deletionRules field (which may be empty)" type DeletionStrategy struct { - OnSuccess DeletionPolicy `json:"onSuccess"` - OnFailure DeletionPolicy `json:"onFailure"` + // OnSuccess is the deletion policy for a successful RayJob. + // Deprecated: Use `deletionRules` instead for more flexible, multi-stage deletion strategies. + // This field will be removed in a future release. + // +optional + OnSuccess DeletionPolicy `json:"onSuccess,omitempty"` + + // OnFailure is the deletion policy for a failed RayJob. + // Deprecated: Use `deletionRules` instead for more flexible, multi-stage deletion strategies. + // This field will be removed in a future release. + // +optional + OnFailure DeletionPolicy `json:"onFailure,omitempty"` + + // DeletionRules is a list of deletion rules, processed based on their trigger conditions. + // While the rules can be used to define a sequence, if multiple rules are overdue (e.g., due to controller downtime), + // the most impactful rule (e.g., DeleteCluster) will be executed first to prioritize resource cleanup and cost savings. + // +optional + // +listType=atomic + DeletionRules []DeletionRule `json:"deletionRules,omitempty"` } +// DeletionRule defines a single deletion action and its trigger condition. +// This is the new, recommended way to define deletion behavior. +type DeletionRule struct { + // Policy is the action to take when the condition is met. This field is required. + // +kubebuilder:validation:Enum=DeleteCluster;DeleteWorkers;DeleteSelf;DeleteNone + Policy DeletionPolicyType `json:"policy"` + + // The condition under which this deletion rule is triggered. This field is required. + Condition DeletionCondition `json:"condition"` +} + +// DeletionCondition specifies the trigger conditions for a deletion action. +type DeletionCondition struct { + // JobStatus is the terminal status of the RayJob that triggers this condition. This field is required. + // For the initial implementation, only "SUCCEEDED" and "FAILED" are supported. + // +kubebuilder:validation:Enum=SUCCEEDED;FAILED + JobStatus JobStatus `json:"jobStatus"` + + // TTLSecondsAfterFinished is the time in seconds from when the JobStatus + // reaches the specified terminal state to when this deletion action should be triggered. + // The value must be a non-negative integer. + // +kubebuilder:default=0 + // +kubebuilder:validation:Minimum=0 + // +optional + TTLSecondsAfterFinished int32 `json:"ttlSecondsAfterFinished,omitempty"` +} + +// DeletionPolicy is the legacy single-stage deletion policy. +// Deprecated: This struct is part of the legacy API. Use DeletionRule for new configurations. type DeletionPolicy struct { - // Valid values are 'DeleteCluster', 'DeleteWorkers', 'DeleteSelf' or 'DeleteNone'. 
- // +kubebuilder:validation:XValidation:rule="self in ['DeleteCluster', 'DeleteWorkers', 'DeleteSelf', 'DeleteNone']",message="the policy field value must be either 'DeleteCluster', 'DeleteWorkers', 'DeleteSelf', or 'DeleteNone'" - Policy *DeletionPolicyType `json:"policy"` + // Policy is the action to take when the condition is met. + // This field is logically required when using the legacy OnSuccess/OnFailure policies. + // It is marked as '+optional' at the API level to allow the 'deletionRules' field to be used instead. + // +kubebuilder:validation:Enum=DeleteCluster;DeleteWorkers;DeleteSelf;DeleteNone + // +optional + Policy *DeletionPolicyType `json:"policy,omitempty"` } const ( diff --git a/ray-operator/apis/ray/v1/zz_generated.deepcopy.go b/ray-operator/apis/ray/v1/zz_generated.deepcopy.go index b4cb5decf12..d548d944c8b 100644 --- a/ray-operator/apis/ray/v1/zz_generated.deepcopy.go +++ b/ray-operator/apis/ray/v1/zz_generated.deepcopy.go @@ -103,6 +103,21 @@ func (in *AutoscalerOptions) DeepCopy() *AutoscalerOptions { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *DeletionCondition) DeepCopyInto(out *DeletionCondition) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DeletionCondition. +func (in *DeletionCondition) DeepCopy() *DeletionCondition { + if in == nil { + return nil + } + out := new(DeletionCondition) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *DeletionPolicy) DeepCopyInto(out *DeletionPolicy) { *out = *in @@ -123,11 +138,32 @@ func (in *DeletionPolicy) DeepCopy() *DeletionPolicy { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *DeletionRule) DeepCopyInto(out *DeletionRule) { + *out = *in + out.Condition = in.Condition +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DeletionRule. +func (in *DeletionRule) DeepCopy() *DeletionRule { + if in == nil { + return nil + } + out := new(DeletionRule) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *DeletionStrategy) DeepCopyInto(out *DeletionStrategy) { *out = *in in.OnSuccess.DeepCopyInto(&out.OnSuccess) in.OnFailure.DeepCopyInto(&out.OnFailure) + if in.DeletionRules != nil { + in, out := &in.DeletionRules, &out.DeletionRules + *out = make([]DeletionRule, len(*in)) + copy(*out, *in) + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DeletionStrategy. 
diff --git a/ray-operator/config/crd/bases/ray.io_rayjobs.yaml b/ray-operator/config/crd/bases/ray.io_rayjobs.yaml index 8f8679ca607..15e15996f7b 100644 --- a/ray-operator/config/crd/bases/ray.io_rayjobs.yaml +++ b/ray-operator/config/crd/bases/ray.io_rayjobs.yaml @@ -60,34 +60,65 @@ spec: type: object deletionStrategy: properties: + deletionRules: + items: + properties: + condition: + properties: + jobStatus: + enum: + - SUCCEEDED + - FAILED + type: string + ttlSecondsAfterFinished: + default: 0 + format: int32 + minimum: 0 + type: integer + required: + - jobStatus + type: object + policy: + enum: + - DeleteCluster + - DeleteWorkers + - DeleteSelf + - DeleteNone + type: string + required: + - condition + - policy + type: object + type: array + x-kubernetes-list-type: atomic onFailure: properties: policy: + enum: + - DeleteCluster + - DeleteWorkers + - DeleteSelf + - DeleteNone type: string - x-kubernetes-validations: - - message: the policy field value must be either 'DeleteCluster', - 'DeleteWorkers', 'DeleteSelf', or 'DeleteNone' - rule: self in ['DeleteCluster', 'DeleteWorkers', 'DeleteSelf', - 'DeleteNone'] - required: - - policy type: object onSuccess: properties: policy: + enum: + - DeleteCluster + - DeleteWorkers + - DeleteSelf + - DeleteNone type: string - x-kubernetes-validations: - - message: the policy field value must be either 'DeleteCluster', - 'DeleteWorkers', 'DeleteSelf', or 'DeleteNone' - rule: self in ['DeleteCluster', 'DeleteWorkers', 'DeleteSelf', - 'DeleteNone'] - required: - - policy type: object - required: - - onFailure - - onSuccess type: object + x-kubernetes-validations: + - message: legacy policies (onSuccess/onFailure) and deletionRules + cannot be used together within the same deletionStrategy + rule: '!((has(self.onSuccess) || has(self.onFailure)) && has(self.deletionRules))' + - message: deletionStrategy requires either BOTH onSuccess and onFailure, + OR the deletionRules field (which may be empty) + rule: ((has(self.onSuccess) && has(self.onFailure)) || has(self.deletionRules)) entrypoint: type: string entrypointNumCpus: diff --git a/ray-operator/controllers/ray/rayjob_controller.go b/ray-operator/controllers/ray/rayjob_controller.go index 64ba470f3ca..7cb41681ce9 100644 --- a/ray-operator/controllers/ray/rayjob_controller.go +++ b/ray-operator/controllers/ray/rayjob_controller.go @@ -363,89 +363,8 @@ func (r *RayJobReconciler) Reconcile(ctx context.Context, request ctrl.Request) // TODO (kevin85421): We may not need to requeue the RayJob if it has already been suspended. return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, nil case rayv1.JobDeploymentStatusComplete, rayv1.JobDeploymentStatusFailed: - // If this RayJob uses an existing RayCluster (i.e., ClusterSelector is set), we should not delete the RayCluster. 
- ttlSeconds := rayJobInstance.Spec.TTLSecondsAfterFinished - nowTime := time.Now() - shutdownTime := rayJobInstance.Status.EndTime.Add(time.Duration(ttlSeconds) * time.Second) - logger.Info(string(rayJobInstance.Status.JobDeploymentStatus), - "ShutdownAfterJobFinishes", rayJobInstance.Spec.ShutdownAfterJobFinishes, - "ClusterSelector", rayJobInstance.Spec.ClusterSelector, - "ttlSecondsAfterFinished", ttlSeconds, - "Status.endTime", rayJobInstance.Status.EndTime, - "Now", nowTime, - "ShutdownTime", shutdownTime) - - if features.Enabled(features.RayJobDeletionPolicy) && - rayJobInstance.Spec.DeletionStrategy != nil && - len(rayJobInstance.Spec.ClusterSelector) == 0 { - - if shutdownTime.After(nowTime) { - delta := int32(time.Until(shutdownTime.Add(2 * time.Second)).Seconds()) - logger.Info("shutdownTime not reached, requeue this RayJob for n seconds", "seconds", delta) - return ctrl.Result{RequeueAfter: time.Duration(delta) * time.Second}, nil - } - - policy := rayv1.DeleteNone - if rayJobInstance.Status.JobStatus == rayv1.JobStatusSucceeded { - policy = *rayJobInstance.Spec.DeletionStrategy.OnSuccess.Policy - } else if rayJobInstance.Status.JobStatus == rayv1.JobStatusFailed { - policy = *rayJobInstance.Spec.DeletionStrategy.OnFailure.Policy - } else { - logger.Info("jobStatus not valid for deletion", "jobStatus", rayJobInstance.Status.JobStatus) - } - - // no need to continue as the selected policy is DeleteNone - if policy == rayv1.DeleteNone { - break - } - - logger.Info("Shutdown behavior is defined by the deletion policy", "deletionPolicy", rayJobInstance.Spec.DeletionStrategy) - if shutdownTime.After(nowTime) { - delta := int32(time.Until(shutdownTime.Add(2 * time.Second)).Seconds()) - logger.Info("shutdownTime not reached, requeue this RayJob for n seconds", "seconds", delta) - return ctrl.Result{RequeueAfter: time.Duration(delta) * time.Second}, nil - } - - switch policy { - case rayv1.DeleteCluster: - logger.Info("Deleting RayCluster", "RayCluster", rayJobInstance.Status.RayClusterName) - _, err = r.deleteClusterResources(ctx, rayJobInstance) - case rayv1.DeleteWorkers: - logger.Info("Suspending all worker groups", "RayCluster", rayJobInstance.Status.RayClusterName) - err = r.suspendWorkerGroups(ctx, rayJobInstance) - case rayv1.DeleteSelf: - logger.Info("Deleting RayJob") - err = r.Client.Delete(ctx, rayJobInstance) - default: - } - if err != nil { - return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, err - } - } - - if (!features.Enabled(features.RayJobDeletionPolicy) || rayJobInstance.Spec.DeletionStrategy == nil) && rayJobInstance.Spec.ShutdownAfterJobFinishes && len(rayJobInstance.Spec.ClusterSelector) == 0 { - logger.Info("Shutdown behavior is defined by the `ShutdownAfterJobFinishes` flag", "shutdownAfterJobFinishes", rayJobInstance.Spec.ShutdownAfterJobFinishes) - if shutdownTime.After(nowTime) { - delta := int32(time.Until(shutdownTime.Add(2 * time.Second)).Seconds()) - logger.Info("shutdownTime not reached, requeue this RayJob for n seconds", "seconds", delta) - return ctrl.Result{RequeueAfter: time.Duration(delta) * time.Second}, nil - } - if s := os.Getenv(utils.DELETE_RAYJOB_CR_AFTER_JOB_FINISHES); strings.ToLower(s) == "true" { - err = r.Client.Delete(ctx, rayJobInstance) - logger.Info("RayJob is deleted") - } else { - // We only need to delete the RayCluster. We don't need to delete the submitter Kubernetes Job so that users can still access - // the driver logs. In addition, a completed Kubernetes Job does not actually use any compute resources. 
- _, err = r.deleteClusterResources(ctx, rayJobInstance) - logger.Info("RayCluster is deleted", "RayCluster", rayJobInstance.Status.RayClusterName) - } - if err != nil { - return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, err - } - } - - // If the RayJob is completed, we should not requeue it. - return ctrl.Result{}, nil + // The RayJob has reached a terminal state. Handle the cleanup and deletion logic. + return r.handleFinishedRayJob(ctx, rayJobInstance) default: logger.Info("Unknown JobDeploymentStatus", "JobDeploymentStatus", rayJobInstance.Status.JobDeploymentStatus) return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, nil @@ -1105,3 +1024,279 @@ func isSubmitterContainerFinished(pod *corev1.Pod) bool { } return false } + +// handleFinishedRayJob is the main entry point for handling cleanup of a completed or failed RayJob. +// It acts as a dispatcher, selecting the appropriate deletion mechanism based on the RayJob spec. +func (r *RayJobReconciler) handleFinishedRayJob(ctx context.Context, rayJob *rayv1.RayJob) (ctrl.Result, error) { + logger := ctrl.LoggerFrom(ctx) + + // If the RayJob uses an existing RayCluster, we must not delete it. + if len(rayJob.Spec.ClusterSelector) > 0 { + logger.Info("RayJob is using an existing RayCluster via clusterSelector; skipping resource deletion.", "RayClusterSelector", rayJob.Spec.ClusterSelector) + return ctrl.Result{}, nil + } + + if features.Enabled(features.RayJobDeletionPolicy) && rayJob.Spec.DeletionStrategy != nil { + // The previous validation logic ensures that either DeletionRules or the legacy policies are set, but not both. + if len(rayJob.Spec.DeletionStrategy.DeletionRules) > 0 { + return r.handleDeletionRules(ctx, rayJob) + } + return r.handleLegacyDeletionPolicy(ctx, rayJob) + } + + if rayJob.Spec.ShutdownAfterJobFinishes { + return r.handleShutdownAfterJobFinishes(ctx, rayJob) + } + + // Default: No deletion policy is configured. The reconciliation is complete for this RayJob. + return ctrl.Result{}, nil +} + +// handleDeletionRules processes the DeletionRules with a impact-aware strategy. +// It categorizes rules into "overdue" and "pending". If overdue rules exist, +// it executes the most destructive one and then requeues for the next pending rule. +// If no rules are overdue, it simply requeues for the +// next pending rule. This function performs at most one deletion action per reconciliation. +func (r *RayJobReconciler) handleDeletionRules(ctx context.Context, rayJob *rayv1.RayJob) (ctrl.Result, error) { + logger := ctrl.LoggerFrom(ctx).WithValues("deletionMechanism", "DeletionRules") + nowTime := time.Now() + + var overdueRules []rayv1.DeletionRule + var nextRequeueTime *time.Time + + // Categorize all applicable and incomplete rules into "overdue" or "pending". + for _, rule := range rayJob.Spec.DeletionStrategy.DeletionRules { + // Skip rules that don't match the current job status. + if rule.Condition.JobStatus != rayJob.Status.JobStatus { + continue + } + + // Skip rules for actions that have already been completed to ensure idempotency. + isCompleted, err := r.isDeletionActionCompleted(ctx, rayJob, rule.Policy) + if err != nil { + logger.Error(err, "Failed to check if deletion action is completed", "rule", rule) + return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, err + } + if isCompleted { + logger.Info("Skipping completed deletion rule", "rule", rule) + continue + } + + // Categorize the rule based on its TTL. 
+ deletionTime := rayJob.Status.EndTime.Add(time.Duration(rule.Condition.TTLSecondsAfterFinished) * time.Second) + if nowTime.After(deletionTime) { + overdueRules = append(overdueRules, rule) + } else if nextRequeueTime == nil || deletionTime.Before(*nextRequeueTime) { + // This is a pending rule. Track the earliest one to schedule the next requeue. + nextRequeueTime = &deletionTime + } + } + + // Handle overdue rules if any exist. + if len(overdueRules) > 0 { + ruleToExecute := selectMostImpactfulRule(overdueRules) + logger.Info("Executing the most impactful overdue deletion rule", "rule", ruleToExecute, "overdueRulesCount", len(overdueRules)) + if _, err := r.executeDeletionPolicy(ctx, rayJob, ruleToExecute.Policy); err != nil { + // If execution fails, return immediately for a retry. + return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, err + } + } + + if nextRequeueTime != nil { + requeueAfter := requeueDelayFor(*nextRequeueTime) + logger.Info("Requeuing for the next scheduled rule", "requeueAfter", requeueAfter) + return ctrl.Result{RequeueAfter: requeueAfter}, nil + } + + logger.Info("All applicable deletion rules have been processed.") + return ctrl.Result{}, nil +} + +// handleLegacyDeletionPolicy handles the deprecated onSuccess and onFailure policies. +func (r *RayJobReconciler) handleLegacyDeletionPolicy(ctx context.Context, rayJob *rayv1.RayJob) (ctrl.Result, error) { + logger := ctrl.LoggerFrom(ctx).WithValues("deletionMechanism", "LegacyOnSuccessFailure") + + var policy rayv1.DeletionPolicyType + switch rayJob.Status.JobStatus { + case rayv1.JobStatusSucceeded: + policy = *rayJob.Spec.DeletionStrategy.OnSuccess.Policy + case rayv1.JobStatusFailed: + policy = *rayJob.Spec.DeletionStrategy.OnFailure.Policy + default: + logger.Info("JobStatus is not valid for deletion, no policy applied", "jobStatus", rayJob.Status.JobStatus) + return ctrl.Result{}, nil + } + + // If the policy is DeleteNone, we are done. + if policy == rayv1.DeleteNone { + logger.Info("Deletion policy is DeleteNone; no action taken.") + return ctrl.Result{}, nil + } + + // These legacy policies use the top-level TTLSecondsAfterFinished. + nowTime := time.Now() + ttlSeconds := rayJob.Spec.TTLSecondsAfterFinished + shutdownTime := rayJob.Status.EndTime.Add(time.Duration(ttlSeconds) * time.Second) + logger.Info("Evaluating legacy deletion policy (onSuccess/onFailure)", + "JobDeploymentStatus", rayJob.Status.JobDeploymentStatus, + "policy", policy, + "JobStatus", rayJob.Status.JobStatus, + "ttlSecondsAfterFinished", ttlSeconds, + "Status.endTime", rayJob.Status.EndTime, + "Now", nowTime, + "ShutdownTime", shutdownTime) + + if shutdownTime.After(nowTime) { + requeueAfter := requeueDelayFor(shutdownTime) + logger.Info("TTL has not been met for legacy policy. Requeuing.", "shutdownTime", shutdownTime, "requeueAfter", requeueAfter) + return ctrl.Result{RequeueAfter: requeueAfter}, nil + } + + logger.Info("Executing legacy deletion policy.", "policy", policy) + return r.executeDeletionPolicy(ctx, rayJob, policy) +} + +// handleShutdownAfterJobFinishes handles the oldest deletion mechanism, the ShutdownAfterJobFinishes boolean flag. 
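+// Once the top-level TTLSecondsAfterFinished has elapsed, it deletes either the RayJob CR itself
+// (when the DELETE_RAYJOB_CR_AFTER_JOB_FINISHES environment variable is set to "true") or only its RayCluster.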
+func (r *RayJobReconciler) handleShutdownAfterJobFinishes(ctx context.Context, rayJob *rayv1.RayJob) (ctrl.Result, error) {
+	logger := ctrl.LoggerFrom(ctx).WithValues("deletionMechanism", "ShutdownAfterJobFinishes")
+
+	nowTime := time.Now()
+	ttlSeconds := rayJob.Spec.TTLSecondsAfterFinished
+	shutdownTime := rayJob.Status.EndTime.Add(time.Duration(ttlSeconds) * time.Second)
+	logger.Info("Evaluating job deletion policy based on ShutdownAfterJobFinishes",
+		"JobDeploymentStatus", rayJob.Status.JobDeploymentStatus,
+		"ShutdownAfterJobFinishes", rayJob.Spec.ShutdownAfterJobFinishes,
+		"ClusterSelector", rayJob.Spec.ClusterSelector,
+		"ttlSecondsAfterFinished", ttlSeconds,
+		"Status.endTime", rayJob.Status.EndTime,
+		"Now", nowTime,
+		"ShutdownTime", shutdownTime)
+
+	if shutdownTime.After(nowTime) {
+		requeueAfter := requeueDelayFor(shutdownTime)
+		logger.Info("TTL has not been met for ShutdownAfterJobFinishes. Requeuing.", "shutdownTime", shutdownTime, "requeueAfter", requeueAfter)
+		return ctrl.Result{RequeueAfter: requeueAfter}, nil
+	}
+
+	var err error
+	if s := os.Getenv(utils.DELETE_RAYJOB_CR_AFTER_JOB_FINISHES); strings.ToLower(s) == "true" {
+		err = r.Client.Delete(ctx, rayJob)
+		if err == nil {
+			logger.Info("RayJob is deleted", "RayJob", rayJob.Name)
+		}
+	} else {
+		// We only need to delete the RayCluster. We don't need to delete the submitter Kubernetes Job so that users can still access
+		// the driver logs. In addition, a completed Kubernetes Job does not actually use any compute resources.
+		_, err = r.deleteClusterResources(ctx, rayJob)
+		if err == nil {
+			logger.Info("RayCluster is deleted", "RayCluster", rayJob.Status.RayClusterName)
+		}
+	}
+
+	if err != nil {
+		return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, err
+	}
+
+	return ctrl.Result{}, nil
+}
+
+// executeDeletionPolicy performs the actual resource deletion based on the policy type.
+// This function centralizes the deletion logic to avoid code duplication.
+func (r *RayJobReconciler) executeDeletionPolicy(ctx context.Context, rayJob *rayv1.RayJob, policy rayv1.DeletionPolicyType) (ctrl.Result, error) {
+	logger := ctrl.LoggerFrom(ctx)
+	var err error
+
+	switch policy {
+	case rayv1.DeleteCluster:
+		logger.Info("Executing deletion policy: DeleteCluster", "RayCluster", rayJob.Status.RayClusterName)
+		_, err = r.deleteClusterResources(ctx, rayJob)
+	case rayv1.DeleteWorkers:
+		logger.Info("Executing deletion policy: DeleteWorkers", "RayCluster", rayJob.Status.RayClusterName)
+		err = r.suspendWorkerGroups(ctx, rayJob)
+	case rayv1.DeleteSelf:
+		logger.Info("Executing deletion policy: DeleteSelf", "RayJob", rayJob.Name)
+		err = r.Client.Delete(ctx, rayJob)
+	case rayv1.DeleteNone:
+		// This should be handled by the callers, but we include it for safety.
+		logger.Info("Executing deletion policy: DeleteNone. No action taken.")
+	default:
+		// This case should not be reached if validation is working correctly.
+		logger.Error(fmt.Errorf("unknown deletion policy: %s", policy), "Unknown deletion policy encountered")
+	}
+
+	if err != nil {
+		return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, err
+	}
+	return ctrl.Result{}, nil
+}
+
+// isDeletionActionCompleted checks if the state corresponding to a deletion policy is already achieved.
+// This is crucial for making the reconciliation loop idempotent by checking the actual cluster state.
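+// For example, DeleteWorkers is treated as complete once the RayCluster is gone or every worker group
+// has Suspend set to true, and DeleteCluster is complete once the RayCluster no longer exists.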
+func (r *RayJobReconciler) isDeletionActionCompleted(ctx context.Context, rayJob *rayv1.RayJob, policy rayv1.DeletionPolicyType) (bool, error) { + clusterIdentifier := common.RayJobRayClusterNamespacedName(rayJob) + cluster := &rayv1.RayCluster{} + + switch policy { + case rayv1.DeleteWorkers: + if err := r.Get(ctx, clusterIdentifier, cluster); err != nil { + if errors.IsNotFound(err) { + // If the cluster is gone, the workers are definitely gone. + return true, nil + } + // For any other error, we can't be sure of the state, so report the error. + return false, err + } + + // If the cluster exists, check if all worker groups are suspended. + for _, wg := range cluster.Spec.WorkerGroupSpecs { + if wg.Suspend == nil || !*wg.Suspend { + // Found an active worker group, so the action is not complete. + return false, nil + } + } + + return true, nil + + case rayv1.DeleteCluster: + err := r.Get(ctx, clusterIdentifier, cluster) + if errors.IsNotFound(err) { + // Cluster not found means the deletion is complete. + return true, nil + } + return false, err + + case rayv1.DeleteSelf: + // This action is terminal. If this function is running, the RayJob still exists, + // so the action cannot be considered complete. + return false, nil + + case rayv1.DeleteNone: + // "DeleteNone" is a no-op and is always considered complete. + return true, nil + } + + return false, fmt.Errorf("unknown deletion policy for completion check: %s", policy) +} + +// selectMostImpactfulRule finds the rule with the most destructive policy from a given list. +func selectMostImpactfulRule(rules []rayv1.DeletionRule) rayv1.DeletionRule { + order := map[rayv1.DeletionPolicyType]int{ + rayv1.DeleteSelf: 4, + rayv1.DeleteCluster: 3, + rayv1.DeleteWorkers: 2, + rayv1.DeleteNone: 1, + } + + mostImpactfulRule := rules[0] + for _, rule := range rules[1:] { + if order[rule.Policy] > order[mostImpactfulRule.Policy] { + mostImpactfulRule = rule + } + } + return mostImpactfulRule +} + +// requeueDelayFor computes the duration for the next requeue, ensuring a minimum buffer. 
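+// The fixed 2-second buffer makes the requeue fire slightly after the target time rather than just before it.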
+func requeueDelayFor(t time.Time) time.Duration { + return time.Until(t) + 2*time.Second +} diff --git a/ray-operator/controllers/ray/utils/validation.go b/ray-operator/controllers/ray/utils/validation.go index 7552637fa5e..d4653aa04bc 100644 --- a/ray-operator/controllers/ray/utils/validation.go +++ b/ray-operator/controllers/ray/utils/validation.go @@ -1,6 +1,7 @@ package utils import ( + "errors" errstd "errors" "fmt" @@ -218,43 +219,7 @@ func ValidateRayJobSpec(rayJob *rayv1.RayJob) error { return fmt.Errorf("RayJobDeletionPolicy feature gate must be enabled to use the DeletionStrategy feature") } - if rayJob.Spec.DeletionStrategy != nil { - onSuccessPolicy := rayJob.Spec.DeletionStrategy.OnSuccess - onFailurePolicy := rayJob.Spec.DeletionStrategy.OnFailure - - if onSuccessPolicy.Policy == nil { - return fmt.Errorf("the DeletionPolicyType field of DeletionStrategy.OnSuccess cannot be unset when DeletionStrategy is enabled") - } - if onFailurePolicy.Policy == nil { - return fmt.Errorf("the DeletionPolicyType field of DeletionStrategy.OnFailure cannot be unset when DeletionStrategy is enabled") - } - - if isClusterSelectorMode { - switch *onSuccessPolicy.Policy { - case rayv1.DeleteCluster: - return fmt.Errorf("the ClusterSelector mode doesn't support DeletionStrategy=DeleteCluster on success") - case rayv1.DeleteWorkers: - return fmt.Errorf("the ClusterSelector mode doesn't support DeletionStrategy=DeleteWorkers on success") - } - - switch *onFailurePolicy.Policy { - case rayv1.DeleteCluster: - return fmt.Errorf("the ClusterSelector mode doesn't support DeletionStrategy=DeleteCluster on failure") - case rayv1.DeleteWorkers: - return fmt.Errorf("the ClusterSelector mode doesn't support DeletionStrategy=DeleteWorkers on failure") - } - } - - if (*onSuccessPolicy.Policy == rayv1.DeleteWorkers || *onFailurePolicy.Policy == rayv1.DeleteWorkers) && IsAutoscalingEnabled(rayJob.Spec.RayClusterSpec) { - // TODO (rueian): This can be supported in a future Ray version. We should check the RayVersion once we know it. - return fmt.Errorf("DeletionStrategy=DeleteWorkers currently does not support RayCluster with autoscaling enabled") - } - - if rayJob.Spec.ShutdownAfterJobFinishes && (*onSuccessPolicy.Policy == rayv1.DeleteNone || *onFailurePolicy.Policy == rayv1.DeleteNone) { - return fmt.Errorf("shutdownAfterJobFinshes is set to 'true' while deletion policy is 'DeleteNone'") - } - } - return nil + return validateDeletionStrategy(rayJob) } func ValidateRayServiceMetadata(metadata metav1.ObjectMeta) error { @@ -291,3 +256,176 @@ func ValidateRayServiceSpec(rayService *rayv1.RayService) error { return nil } + +// validateDeletionStrategy centralizes all validation logic for the deletion strategy. +// This includes the new `deletionRules` and the legacy fields (`onSuccess`,`onFailure`). +func validateDeletionStrategy(rayJob *rayv1.RayJob) error { + if rayJob.Spec.DeletionStrategy == nil { + return nil + } + + if !features.Enabled(features.RayJobDeletionPolicy) { + return fmt.Errorf("RayJobDeletionPolicy feature gate must be enabled to use the DeletionStrategy feature") + } + + usingDeletionRules := len(rayJob.Spec.DeletionStrategy.DeletionRules) > 0 + usingLegacyAPI := rayJob.Spec.DeletionStrategy.OnSuccess.Policy != nil || rayJob.Spec.DeletionStrategy.OnFailure.Policy != nil + + // ShutdownAfterJobFinishes cannot be used with the new API. 
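+	// For example, a RayJob that sets shutdownAfterJobFinishes: true and also defines deletionRules is rejected here.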
+ if usingDeletionRules && rayJob.Spec.ShutdownAfterJobFinishes { + return fmt.Errorf("ShutdownAfterJobFinishes cannot be used when spec.deletionStrategy.deletionRules is defined. Please configure all deletion behaviors within deletionRules") + } + + // Legacy API and DeletionRules cannot be used simultaneously. + if usingDeletionRules && usingLegacyAPI { + return fmt.Errorf("legacy policies (onSuccess, onFailure) and the new deletionRules cannot be used simultaneously within the same deletionStrategy") + } + + // DeletionStrategy must contain at least one policy if specified. + if !usingDeletionRules && !usingLegacyAPI { + return fmt.Errorf("deletionStrategy is specified, but no policies (onSuccess, onFailure, or deletionRules) are defined within it") + } + + if usingDeletionRules { + return validateDeletionRules(rayJob) + } + + // If not using DeletionRules, validate the legacy strategy + return validateLegacyDeletionPolicies(rayJob) +} + +// validateDeletionRules validates the deletion rules in the RayJob spec. +// It performs per-rule validations, checks for uniqueness, and ensures logical TTL consistency. +// Errors are collected and returned as a single aggregated error using errors.Join for better user feedback. +func validateDeletionRules(rayJob *rayv1.RayJob) error { + type ruleKey struct { + Policy rayv1.DeletionPolicyType + Status rayv1.JobStatus + } + + rules := rayJob.Spec.DeletionStrategy.DeletionRules + isClusterSelectorMode := len(rayJob.Spec.ClusterSelector) != 0 + + // Group TTLs by JobStatus for cross-rule validation. + rulesByStatus := make(map[rayv1.JobStatus]map[rayv1.DeletionPolicyType]int32) + // Track unique (Policy, JobStatus) combinations. + ruleUniquenessSet := make(map[ruleKey]struct{}) + + var errs []error + + // Single pass: Validate each rule individually and group for later consistency checks. + for i, rule := range rules { + // Validate TTL is non-negative. + if rule.Condition.TTLSecondsAfterFinished < 0 { + errs = append(errs, fmt.Errorf("deletionRules[%d]: TTLSecondsAfterFinished must be non-negative", i)) + continue + } + + // Check uniqueness. + key := ruleKey{Policy: rule.Policy, Status: rule.Condition.JobStatus} + if _, exists := ruleUniquenessSet[key]; exists { + errs = append(errs, fmt.Errorf("deletionRules[%d]: duplicate rule for DeletionPolicyType '%s' and JobStatus '%s'", i, rule.Policy, rule.Condition.JobStatus)) + continue + } + ruleUniquenessSet[key] = struct{}{} + + // Contextual validations based on spec. + if isClusterSelectorMode && (rule.Policy == rayv1.DeleteCluster || rule.Policy == rayv1.DeleteWorkers) { + errs = append(errs, fmt.Errorf("deletionRules[%d]: DeletionPolicyType '%s' not supported when ClusterSelector is set", i, rule.Policy)) + continue + } + if IsAutoscalingEnabled(rayJob.Spec.RayClusterSpec) && rule.Policy == rayv1.DeleteWorkers { + // TODO (rueian): Support in future Ray versions by checking RayVersion. + errs = append(errs, fmt.Errorf("deletionRules[%d]: DeletionPolicyType 'DeleteWorkers' not supported with autoscaling enabled", i)) + continue + } + + // Group valid rule for consistency check. + statusMap, ok := rulesByStatus[rule.Condition.JobStatus] + if !ok { + statusMap = make(map[rayv1.DeletionPolicyType]int32) + rulesByStatus[rule.Condition.JobStatus] = statusMap + } + statusMap[rule.Policy] = rule.Condition.TTLSecondsAfterFinished + } + + // Second pass: Validate TTL consistency per JobStatus. 
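+	// For example, for JobStatus SUCCEEDED, TTLs of DeleteWorkers=60, DeleteCluster=300, DeleteSelf=600 are
+	// consistent, whereas DeleteSelf=60 together with DeleteCluster=300 would be rejected.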
+ for status, policyTTLs := range rulesByStatus { + if err := validateTTLConsistency(policyTTLs, status); err != nil { + errs = append(errs, err) + } + } + + return errors.Join(errs...) +} + +// validateTTLConsistency ensures TTLs follow the deletion hierarchy: Workers <= Cluster <= Self. +// (Lower TTL means deletes earlier.) +func validateTTLConsistency(policyTTLs map[rayv1.DeletionPolicyType]int32, status rayv1.JobStatus) error { + // Define the required deletion order. TTLs must be non-decreasing along this sequence. + deletionOrder := []rayv1.DeletionPolicyType{ + rayv1.DeleteWorkers, + rayv1.DeleteCluster, + rayv1.DeleteSelf, + } + + var prevPolicy rayv1.DeletionPolicyType + var prevTTL int32 + var hasPrev bool + + var errs []error + + for _, policy := range deletionOrder { + ttl, exists := policyTTLs[policy] + if !exists { + continue + } + + if hasPrev && ttl < prevTTL { + errs = append(errs, fmt.Errorf( + "for JobStatus '%s': %s TTL (%d) must be >= %s TTL (%d)", + status, policy, ttl, prevPolicy, prevTTL, + )) + } + + prevPolicy = policy + prevTTL = ttl + hasPrev = true + } + + return errors.Join(errs...) +} + +// validateLegacyDeletionPolicies handles validation for the old `onSuccess` and `onFailure` fields. +func validateLegacyDeletionPolicies(rayJob *rayv1.RayJob) error { + isClusterSelectorMode := len(rayJob.Spec.ClusterSelector) != 0 + onSuccessPolicy := rayJob.Spec.DeletionStrategy.OnSuccess + onFailurePolicy := rayJob.Spec.DeletionStrategy.OnFailure + + if onSuccessPolicy.Policy == nil { + return fmt.Errorf("the DeletionPolicyType field of DeletionStrategy.OnSuccess cannot be unset when DeletionStrategy is enabled") + } + if onFailurePolicy.Policy == nil { + return fmt.Errorf("the DeletionPolicyType field of DeletionStrategy.OnFailure cannot be unset when DeletionStrategy is enabled") + } + + if isClusterSelectorMode { + if *onSuccessPolicy.Policy == rayv1.DeleteCluster || *onSuccessPolicy.Policy == rayv1.DeleteWorkers { + return fmt.Errorf("the ClusterSelector mode doesn't support DeletionStrategy=%s on success", *onSuccessPolicy.Policy) + } + if *onFailurePolicy.Policy == rayv1.DeleteCluster || *onFailurePolicy.Policy == rayv1.DeleteWorkers { + return fmt.Errorf("the ClusterSelector mode doesn't support DeletionStrategy=%s on failure", *onFailurePolicy.Policy) + } + } + + if (*onSuccessPolicy.Policy == rayv1.DeleteWorkers || *onFailurePolicy.Policy == rayv1.DeleteWorkers) && IsAutoscalingEnabled(rayJob.Spec.RayClusterSpec) { + // TODO (rueian): This can be supported in a future Ray version. We should check the RayVersion once we know it. 
+ return fmt.Errorf("DeletionStrategy=DeleteWorkers currently does not support RayCluster with autoscaling enabled") + } + + if rayJob.Spec.ShutdownAfterJobFinishes && (*onSuccessPolicy.Policy == rayv1.DeleteNone || *onFailurePolicy.Policy == rayv1.DeleteNone) { + return fmt.Errorf("shutdownAfterJobFinishes is set to 'true' while deletion policy is 'DeleteNone'") + } + + return nil +} diff --git a/ray-operator/controllers/ray/utils/validation_test.go b/ray-operator/controllers/ray/utils/validation_test.go index b55a8f26c39..4060827e21f 100644 --- a/ray-operator/controllers/ray/utils/validation_test.go +++ b/ray-operator/controllers/ray/utils/validation_test.go @@ -927,6 +927,7 @@ func TestValidateRayJobSpecWithFeatureGate(t *testing.T) { spec rayv1.RayJobSpec expectError bool }{ + // Legacy DeletionStrategy tests { name: "the ClusterSelector mode doesn't support DeletionStrategy=DeleteCluster", spec: rayv1.RayJobSpec{ @@ -1069,6 +1070,244 @@ func TestValidateRayJobSpecWithFeatureGate(t *testing.T) { }, expectError: true, }, + // New Deletion Rules tests + { + name: "valid deletionRules", + spec: rayv1.RayJobSpec{ + DeletionStrategy: &rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteSelf, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + TTLSecondsAfterFinished: 10, + }, + }, + }, + }, + RayClusterSpec: createBasicRayClusterSpec(), + }, + expectError: false, + }, + { + name: "deletionRules and ShutdownAfterJobFinishes both set", + spec: rayv1.RayJobSpec{ + ShutdownAfterJobFinishes: true, + DeletionStrategy: &rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteSelf, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + TTLSecondsAfterFinished: 10, + }, + }, + }, + }, + RayClusterSpec: createBasicRayClusterSpec(), + }, + expectError: true, + }, + { + name: "deletionRules and legacy onSuccess both set", + spec: rayv1.RayJobSpec{ + DeletionStrategy: &rayv1.DeletionStrategy{ + OnSuccess: rayv1.DeletionPolicy{ + Policy: ptr.To(rayv1.DeleteCluster), + }, + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteSelf, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + TTLSecondsAfterFinished: 10, + }, + }, + }, + }, + RayClusterSpec: createBasicRayClusterSpec(), + }, + expectError: true, + }, + { + name: "empty DeletionStrategy", + spec: rayv1.RayJobSpec{ + DeletionStrategy: &rayv1.DeletionStrategy{}, + RayClusterSpec: createBasicRayClusterSpec(), + }, + expectError: true, + }, + { + name: "duplicate rule in deletionRules", + spec: rayv1.RayJobSpec{ + DeletionStrategy: &rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteSelf, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + TTLSecondsAfterFinished: 10, + }, + }, + { + Policy: rayv1.DeleteSelf, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + TTLSecondsAfterFinished: 20, + }, + }, + }, + }, + RayClusterSpec: createBasicRayClusterSpec(), + }, + expectError: true, + }, + { + name: "negative TTLSecondsAfterFinished in deletionRules", + spec: rayv1.RayJobSpec{ + DeletionStrategy: &rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteSelf, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + TTLSecondsAfterFinished: -10, + }, + }, + }, + }, + RayClusterSpec: createBasicRayClusterSpec(), + }, + expectError: true, + }, + { + name: 
"deletionRules with ClusterSelector and DeleteCluster policy", + spec: rayv1.RayJobSpec{ + ClusterSelector: map[string]string{"key": "value"}, + DeletionStrategy: &rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteCluster, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + TTLSecondsAfterFinished: 10, + }, + }, + }, + }, + }, + expectError: true, + }, + { + name: "deletionRules with autoscaling and DeleteWorkers policy", + spec: rayv1.RayJobSpec{ + DeletionStrategy: &rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteWorkers, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + TTLSecondsAfterFinished: 10, + }, + }, + }, + }, + RayClusterSpec: &rayv1.RayClusterSpec{ + EnableInTreeAutoscaling: ptr.To(true), + HeadGroupSpec: headGroupSpecWithOneContainer, + }, + }, + expectError: true, + }, + { + name: "inconsistent TTLs in deletionRules (DeleteCluster < DeleteWorkers)", + spec: rayv1.RayJobSpec{ + DeletionStrategy: &rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteWorkers, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + TTLSecondsAfterFinished: 20, + }, + }, + { + Policy: rayv1.DeleteCluster, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + TTLSecondsAfterFinished: 10, + }, + }, + }, + }, + RayClusterSpec: createBasicRayClusterSpec(), + }, + expectError: true, + }, + { + name: "inconsistent TTLs in deletionRules (DeleteSelf < DeleteCluster)", + spec: rayv1.RayJobSpec{ + DeletionStrategy: &rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteCluster, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + TTLSecondsAfterFinished: 20, + }, + }, + { + Policy: rayv1.DeleteSelf, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + TTLSecondsAfterFinished: 10, + }, + }, + }, + }, + RayClusterSpec: createBasicRayClusterSpec(), + }, + expectError: true, + }, + { + name: "valid complex deletionRules", + spec: rayv1.RayJobSpec{ + DeletionStrategy: &rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteWorkers, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + TTLSecondsAfterFinished: 10, + }, + }, + { + Policy: rayv1.DeleteCluster, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + TTLSecondsAfterFinished: 20, + }, + }, + { + Policy: rayv1.DeleteSelf, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + TTLSecondsAfterFinished: 30, + }, + }, + { + Policy: rayv1.DeleteSelf, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusFailed, + TTLSecondsAfterFinished: 0, + }, + }, + }, + }, + RayClusterSpec: createBasicRayClusterSpec(), + }, + expectError: false, + }, } features.SetFeatureGateDuringTest(t, features.RayJobDeletionPolicy, true) diff --git a/ray-operator/pkg/client/applyconfiguration/ray/v1/deletioncondition.go b/ray-operator/pkg/client/applyconfiguration/ray/v1/deletioncondition.go new file mode 100644 index 00000000000..25e1a881dbb --- /dev/null +++ b/ray-operator/pkg/client/applyconfiguration/ray/v1/deletioncondition.go @@ -0,0 +1,36 @@ +// Code generated by applyconfiguration-gen. DO NOT EDIT. 
+ +package v1 + +import ( + rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1" +) + +// DeletionConditionApplyConfiguration represents a declarative configuration of the DeletionCondition type for use +// with apply. +type DeletionConditionApplyConfiguration struct { + JobStatus *rayv1.JobStatus `json:"jobStatus,omitempty"` + TTLSecondsAfterFinished *int32 `json:"ttlSecondsAfterFinished,omitempty"` +} + +// DeletionConditionApplyConfiguration constructs a declarative configuration of the DeletionCondition type for use with +// apply. +func DeletionCondition() *DeletionConditionApplyConfiguration { + return &DeletionConditionApplyConfiguration{} +} + +// WithJobStatus sets the JobStatus field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the JobStatus field is set to the value of the last call. +func (b *DeletionConditionApplyConfiguration) WithJobStatus(value rayv1.JobStatus) *DeletionConditionApplyConfiguration { + b.JobStatus = &value + return b +} + +// WithTTLSecondsAfterFinished sets the TTLSecondsAfterFinished field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the TTLSecondsAfterFinished field is set to the value of the last call. +func (b *DeletionConditionApplyConfiguration) WithTTLSecondsAfterFinished(value int32) *DeletionConditionApplyConfiguration { + b.TTLSecondsAfterFinished = &value + return b +} diff --git a/ray-operator/pkg/client/applyconfiguration/ray/v1/deletionrule.go b/ray-operator/pkg/client/applyconfiguration/ray/v1/deletionrule.go new file mode 100644 index 00000000000..91e4b50de99 --- /dev/null +++ b/ray-operator/pkg/client/applyconfiguration/ray/v1/deletionrule.go @@ -0,0 +1,36 @@ +// Code generated by applyconfiguration-gen. DO NOT EDIT. + +package v1 + +import ( + rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1" +) + +// DeletionRuleApplyConfiguration represents a declarative configuration of the DeletionRule type for use +// with apply. +type DeletionRuleApplyConfiguration struct { + Policy *rayv1.DeletionPolicyType `json:"policy,omitempty"` + Condition *DeletionConditionApplyConfiguration `json:"condition,omitempty"` +} + +// DeletionRuleApplyConfiguration constructs a declarative configuration of the DeletionRule type for use with +// apply. +func DeletionRule() *DeletionRuleApplyConfiguration { + return &DeletionRuleApplyConfiguration{} +} + +// WithPolicy sets the Policy field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the Policy field is set to the value of the last call. +func (b *DeletionRuleApplyConfiguration) WithPolicy(value rayv1.DeletionPolicyType) *DeletionRuleApplyConfiguration { + b.Policy = &value + return b +} + +// WithCondition sets the Condition field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the Condition field is set to the value of the last call. 
+func (b *DeletionRuleApplyConfiguration) WithCondition(value *DeletionConditionApplyConfiguration) *DeletionRuleApplyConfiguration { + b.Condition = value + return b +} diff --git a/ray-operator/pkg/client/applyconfiguration/ray/v1/deletionstrategy.go b/ray-operator/pkg/client/applyconfiguration/ray/v1/deletionstrategy.go index 105c33d3de7..034cce827cb 100644 --- a/ray-operator/pkg/client/applyconfiguration/ray/v1/deletionstrategy.go +++ b/ray-operator/pkg/client/applyconfiguration/ray/v1/deletionstrategy.go @@ -5,8 +5,9 @@ package v1 // DeletionStrategyApplyConfiguration represents a declarative configuration of the DeletionStrategy type for use // with apply. type DeletionStrategyApplyConfiguration struct { - OnSuccess *DeletionPolicyApplyConfiguration `json:"onSuccess,omitempty"` - OnFailure *DeletionPolicyApplyConfiguration `json:"onFailure,omitempty"` + OnSuccess *DeletionPolicyApplyConfiguration `json:"onSuccess,omitempty"` + OnFailure *DeletionPolicyApplyConfiguration `json:"onFailure,omitempty"` + DeletionRules []DeletionRuleApplyConfiguration `json:"deletionRules,omitempty"` } // DeletionStrategyApplyConfiguration constructs a declarative configuration of the DeletionStrategy type for use with @@ -30,3 +31,16 @@ func (b *DeletionStrategyApplyConfiguration) WithOnFailure(value *DeletionPolicy b.OnFailure = value return b } + +// WithDeletionRules adds the given value to the DeletionRules field in the declarative configuration +// and returns the receiver, so that objects can be build by chaining "With" function invocations. +// If called multiple times, values provided by each call will be appended to the DeletionRules field. +func (b *DeletionStrategyApplyConfiguration) WithDeletionRules(values ...*DeletionRuleApplyConfiguration) *DeletionStrategyApplyConfiguration { + for i := range values { + if values[i] == nil { + panic("nil value passed to WithDeletionRules") + } + b.DeletionRules = append(b.DeletionRules, *values[i]) + } + return b +} diff --git a/ray-operator/pkg/client/applyconfiguration/utils.go b/ray-operator/pkg/client/applyconfiguration/utils.go index 23e455d739a..050733b0c5e 100644 --- a/ray-operator/pkg/client/applyconfiguration/utils.go +++ b/ray-operator/pkg/client/applyconfiguration/utils.go @@ -20,8 +20,12 @@ func ForKind(kind schema.GroupVersionKind) interface{} { return &rayv1.AppStatusApplyConfiguration{} case v1.SchemeGroupVersion.WithKind("AutoscalerOptions"): return &rayv1.AutoscalerOptionsApplyConfiguration{} + case v1.SchemeGroupVersion.WithKind("DeletionCondition"): + return &rayv1.DeletionConditionApplyConfiguration{} case v1.SchemeGroupVersion.WithKind("DeletionPolicy"): return &rayv1.DeletionPolicyApplyConfiguration{} + case v1.SchemeGroupVersion.WithKind("DeletionRule"): + return &rayv1.DeletionRuleApplyConfiguration{} case v1.SchemeGroupVersion.WithKind("DeletionStrategy"): return &rayv1.DeletionStrategyApplyConfiguration{} case v1.SchemeGroupVersion.WithKind("GcsFaultToleranceOptions"): From 30109279d4185e14f87c9b6611d5a0bcc4906aab Mon Sep 17 00:00:00 2001 From: wei-chenglai Date: Mon, 8 Sep 2025 17:56:10 -0400 Subject: [PATCH 02/21] Add controller tests --- ray-operator/apis/ray/v1/rayjob_types.go | 4 +- .../apis/ray/v1/zz_generated.deepcopy.go | 12 +- .../controllers/ray/rayjob_controller.go | 20 +- .../controllers/ray/rayjob_controller_test.go | 1938 ++++++++++++++++- .../controllers/ray/utils/validation.go | 12 +- .../controllers/ray/utils/validation_test.go | 36 +- 6 files changed, 1959 insertions(+), 63 deletions(-) diff --git 
a/ray-operator/apis/ray/v1/rayjob_types.go b/ray-operator/apis/ray/v1/rayjob_types.go index 683ff6434b3..575ded8e81c 100644 --- a/ray-operator/apis/ray/v1/rayjob_types.go +++ b/ray-operator/apis/ray/v1/rayjob_types.go @@ -113,13 +113,13 @@ type DeletionStrategy struct { // Deprecated: Use `deletionRules` instead for more flexible, multi-stage deletion strategies. // This field will be removed in a future release. // +optional - OnSuccess DeletionPolicy `json:"onSuccess,omitempty"` + OnSuccess *DeletionPolicy `json:"onSuccess,omitempty"` // OnFailure is the deletion policy for a failed RayJob. // Deprecated: Use `deletionRules` instead for more flexible, multi-stage deletion strategies. // This field will be removed in a future release. // +optional - OnFailure DeletionPolicy `json:"onFailure,omitempty"` + OnFailure *DeletionPolicy `json:"onFailure,omitempty"` // DeletionRules is a list of deletion rules, processed based on their trigger conditions. // While the rules can be used to define a sequence, if multiple rules are overdue (e.g., due to controller downtime), diff --git a/ray-operator/apis/ray/v1/zz_generated.deepcopy.go b/ray-operator/apis/ray/v1/zz_generated.deepcopy.go index d548d944c8b..c4828c02f06 100644 --- a/ray-operator/apis/ray/v1/zz_generated.deepcopy.go +++ b/ray-operator/apis/ray/v1/zz_generated.deepcopy.go @@ -157,8 +157,16 @@ func (in *DeletionRule) DeepCopy() *DeletionRule { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *DeletionStrategy) DeepCopyInto(out *DeletionStrategy) { *out = *in - in.OnSuccess.DeepCopyInto(&out.OnSuccess) - in.OnFailure.DeepCopyInto(&out.OnFailure) + if in.OnSuccess != nil { + in, out := &in.OnSuccess, &out.OnSuccess + *out = new(DeletionPolicy) + (*in).DeepCopyInto(*out) + } + if in.OnFailure != nil { + in, out := &in.OnFailure, &out.OnFailure + *out = new(DeletionPolicy) + (*in).DeepCopyInto(*out) + } if in.DeletionRules != nil { in, out := &in.DeletionRules, &out.DeletionRules *out = make([]DeletionRule, len(*in)) diff --git a/ray-operator/controllers/ray/rayjob_controller.go b/ray-operator/controllers/ray/rayjob_controller.go index 7cb41681ce9..f292522a030 100644 --- a/ray-operator/controllers/ray/rayjob_controller.go +++ b/ray-operator/controllers/ray/rayjob_controller.go @@ -1071,7 +1071,16 @@ func (r *RayJobReconciler) handleDeletionRules(ctx context.Context, rayJob *rayv continue } - // Skip rules for actions that have already been completed to ensure idempotency. + deletionTime := rayJob.Status.EndTime.Add(time.Duration(rule.Condition.TTLSecondsAfterFinished) * time.Second) + // Track the earliest requeue time to re-check later. + if nowTime.Before(deletionTime) { + if nextRequeueTime == nil || deletionTime.Before(*nextRequeueTime) { + nextRequeueTime = &deletionTime + } + continue + } + + // Need to check if the deletion action has already been completed to ensure idempotency. isCompleted, err := r.isDeletionActionCompleted(ctx, rayJob, rule.Policy) if err != nil { logger.Error(err, "Failed to check if deletion action is completed", "rule", rule) @@ -1082,14 +1091,7 @@ func (r *RayJobReconciler) handleDeletionRules(ctx context.Context, rayJob *rayv continue } - // Categorize the rule based on its TTL. 
- deletionTime := rayJob.Status.EndTime.Add(time.Duration(rule.Condition.TTLSecondsAfterFinished) * time.Second) - if nowTime.After(deletionTime) { - overdueRules = append(overdueRules, rule) - } else if nextRequeueTime == nil || deletionTime.Before(*nextRequeueTime) { - // This is a pending rule. Track the earliest one to schedule the next requeue. - nextRequeueTime = &deletionTime - } + overdueRules = append(overdueRules, rule) } // Handle overdue rules if any exist. diff --git a/ray-operator/controllers/ray/rayjob_controller_test.go b/ray-operator/controllers/ray/rayjob_controller_test.go index 4d7c8fea810..c348932260d 100644 --- a/ray-operator/controllers/ray/rayjob_controller_test.go +++ b/ray-operator/controllers/ray/rayjob_controller_test.go @@ -896,10 +896,10 @@ var _ = Context("RayJob with different submission modes", func() { onSuccessPolicy := rayv1.DeleteCluster onFailurePolicy := rayv1.DeleteNone deletionStrategy := &rayv1.DeletionStrategy{ - OnSuccess: rayv1.DeletionPolicy{ + OnSuccess: &rayv1.DeletionPolicy{ Policy: &onSuccessPolicy, }, - OnFailure: rayv1.DeletionPolicy{ + OnFailure: &rayv1.DeletionPolicy{ Policy: &onFailurePolicy, }, } @@ -909,10 +909,10 @@ var _ = Context("RayJob with different submission modes", func() { By("Verify RayJob spec", func() { Expect(*rayJob.Spec.DeletionStrategy).To(Equal(rayv1.DeletionStrategy{ - OnSuccess: rayv1.DeletionPolicy{ + OnSuccess: &rayv1.DeletionPolicy{ Policy: &onSuccessPolicy, }, - OnFailure: rayv1.DeletionPolicy{ + OnFailure: &rayv1.DeletionPolicy{ Policy: &onFailurePolicy, }, })) @@ -1035,10 +1035,10 @@ var _ = Context("RayJob with different submission modes", func() { onSuccessPolicy := rayv1.DeleteNone onFailurePolicy := rayv1.DeleteCluster deletionStrategy := &rayv1.DeletionStrategy{ - OnSuccess: rayv1.DeletionPolicy{ + OnSuccess: &rayv1.DeletionPolicy{ Policy: &onSuccessPolicy, }, - OnFailure: rayv1.DeletionPolicy{ + OnFailure: &rayv1.DeletionPolicy{ Policy: &onFailurePolicy, }, } @@ -1048,10 +1048,10 @@ var _ = Context("RayJob with different submission modes", func() { By("Verify RayJob spec", func() { Expect(*rayJob.Spec.DeletionStrategy).To(Equal(rayv1.DeletionStrategy{ - OnSuccess: rayv1.DeletionPolicy{ + OnSuccess: &rayv1.DeletionPolicy{ Policy: &onSuccessPolicy, }, - OnFailure: rayv1.DeletionPolicy{ + OnFailure: &rayv1.DeletionPolicy{ Policy: &onFailurePolicy, }, })) @@ -1174,10 +1174,10 @@ var _ = Context("RayJob with different submission modes", func() { onSuccessPolicy := rayv1.DeleteWorkers onFailurePolicy := rayv1.DeleteNone deletionStrategy := rayv1.DeletionStrategy{ - OnSuccess: rayv1.DeletionPolicy{ + OnSuccess: &rayv1.DeletionPolicy{ Policy: &onSuccessPolicy, }, - OnFailure: rayv1.DeletionPolicy{ + OnFailure: &rayv1.DeletionPolicy{ Policy: &onFailurePolicy, }, } @@ -1187,10 +1187,10 @@ var _ = Context("RayJob with different submission modes", func() { By("Verify RayJob spec", func() { Expect(*rayJob.Spec.DeletionStrategy).To(Equal(rayv1.DeletionStrategy{ - OnSuccess: rayv1.DeletionPolicy{ + OnSuccess: &rayv1.DeletionPolicy{ Policy: &onSuccessPolicy, }, - OnFailure: rayv1.DeletionPolicy{ + OnFailure: &rayv1.DeletionPolicy{ Policy: &onFailurePolicy, }, })) @@ -1330,10 +1330,10 @@ var _ = Context("RayJob with different submission modes", func() { onSuccessPolicy := rayv1.DeleteWorkers onFailurePolicy := rayv1.DeleteWorkers deletionStrategy := rayv1.DeletionStrategy{ - OnSuccess: rayv1.DeletionPolicy{ + OnSuccess: &rayv1.DeletionPolicy{ Policy: &onSuccessPolicy, }, - OnFailure: rayv1.DeletionPolicy{ + OnFailure: 
&rayv1.DeletionPolicy{ Policy: &onFailurePolicy, }, } @@ -1343,10 +1343,10 @@ var _ = Context("RayJob with different submission modes", func() { By("Verify RayJob spec", func() { Expect(*rayJob.Spec.DeletionStrategy).To(Equal(rayv1.DeletionStrategy{ - OnSuccess: rayv1.DeletionPolicy{ + OnSuccess: &rayv1.DeletionPolicy{ Policy: &onSuccessPolicy, }, - OnFailure: rayv1.DeletionPolicy{ + OnFailure: &rayv1.DeletionPolicy{ Policy: &onFailurePolicy, }, })) @@ -1486,10 +1486,10 @@ var _ = Context("RayJob with different submission modes", func() { onSuccessPolicy := rayv1.DeleteSelf onFailurePolicy := rayv1.DeleteNone deletionStrategy := rayv1.DeletionStrategy{ - OnSuccess: rayv1.DeletionPolicy{ + OnSuccess: &rayv1.DeletionPolicy{ Policy: &onSuccessPolicy, }, - OnFailure: rayv1.DeletionPolicy{ + OnFailure: &rayv1.DeletionPolicy{ Policy: &onFailurePolicy, }, } @@ -1602,10 +1602,10 @@ var _ = Context("RayJob with different submission modes", func() { onSuccessPolicy := rayv1.DeleteNone onFailurePolicy := rayv1.DeleteSelf deletionStrategy := rayv1.DeletionStrategy{ - OnSuccess: rayv1.DeletionPolicy{ + OnSuccess: &rayv1.DeletionPolicy{ Policy: &onSuccessPolicy, }, - OnFailure: rayv1.DeletionPolicy{ + OnFailure: &rayv1.DeletionPolicy{ Policy: &onFailurePolicy, }, } @@ -1718,10 +1718,10 @@ var _ = Context("RayJob with different submission modes", func() { onSuccessPolicy := rayv1.DeleteNone onFailurePolicy := rayv1.DeleteNone deletionStrategy := rayv1.DeletionStrategy{ - OnSuccess: rayv1.DeletionPolicy{ + OnSuccess: &rayv1.DeletionPolicy{ Policy: &onSuccessPolicy, }, - OnFailure: rayv1.DeletionPolicy{ + OnFailure: &rayv1.DeletionPolicy{ Policy: &onFailurePolicy, }, } @@ -1731,10 +1731,10 @@ var _ = Context("RayJob with different submission modes", func() { By("Verify RayJob spec", func() { Expect(*rayJob.Spec.DeletionStrategy).To(Equal(rayv1.DeletionStrategy{ - OnSuccess: rayv1.DeletionPolicy{ + OnSuccess: &rayv1.DeletionPolicy{ Policy: &onSuccessPolicy, }, - OnFailure: rayv1.DeletionPolicy{ + OnFailure: &rayv1.DeletionPolicy{ Policy: &onFailurePolicy, }, })) @@ -1879,10 +1879,10 @@ var _ = Context("RayJob with different submission modes", func() { onSuccessPolicy := rayv1.DeleteCluster onFailurePolicy := rayv1.DeleteNone deletionStrategy := rayv1.DeletionStrategy{ - OnSuccess: rayv1.DeletionPolicy{ + OnSuccess: &rayv1.DeletionPolicy{ Policy: &onSuccessPolicy, }, - OnFailure: rayv1.DeletionPolicy{ + OnFailure: &rayv1.DeletionPolicy{ Policy: &onFailurePolicy, }, } @@ -1892,10 +1892,10 @@ var _ = Context("RayJob with different submission modes", func() { By("Verify RayJob spec", func() { Expect(*rayJob.Spec.DeletionStrategy).To(Equal(rayv1.DeletionStrategy{ - OnSuccess: rayv1.DeletionPolicy{ + OnSuccess: &rayv1.DeletionPolicy{ Policy: &onSuccessPolicy, }, - OnFailure: rayv1.DeletionPolicy{ + OnFailure: &rayv1.DeletionPolicy{ Policy: &onFailurePolicy, }, })) @@ -2031,5 +2031,1887 @@ var _ = Context("RayJob with different submission modes", func() { time.Second*3, time.Millisecond*500).Should(Succeed()) }) }) + + It("Should delete workers on success when a single 'DeleteWorkers' rule is set", func() { + ctx := context.Background() + namespace := "default" + rayJob := rayJobTemplate("rayjob-test-rule-deleteworkers-on-success", namespace) + rayCluster := &rayv1.RayCluster{} + + rayJob.Spec.DeletionStrategy = &rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteWorkers, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + }, + }, + }, + } + 
rayJob.Spec.ShutdownAfterJobFinishes = false + + By("Verify RayJob spec", func() { + Expect(*rayJob.Spec.DeletionStrategy).To(Equal(rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteWorkers, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + }, + }, + }, + })) + }) + + By("Create a RayJob custom resource", func() { + err := k8sClient.Create(ctx, rayJob) + Expect(err).NotTo(HaveOccurred(), "Failed to create RayJob") + Eventually( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Name, Namespace: namespace}, rayJob), + time.Second*3, time.Millisecond*500).Should(Succeed(), "Should be able to see RayJob: %v", rayJob.Name) + }) + + By("RayJobs's JobDeploymentStatus transitions from New to Initializing.", func() { + Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusInitializing), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // In Initializing state, Status.RayClusterName, Status.JobId, and Status.StartTime must be set. + Expect(rayJob.Status.RayClusterName).NotTo(BeEmpty()) + Expect(rayJob.Status.JobId).NotTo(BeEmpty()) + Expect(rayJob.Status.StartTime).NotTo(BeNil()) + }) + + By("In Initializing state, the RayCluster should eventually be created.", func() { + Eventually( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Status.RayClusterName, Namespace: namespace}, rayCluster), + time.Second*3, time.Millisecond*500).Should(Succeed(), "RayCluster %v not found", rayJob.Status.RayClusterName) + + // Check whether RayCluster is consistent with RayJob's RayClusterSpec. + Expect(rayCluster.Spec.WorkerGroupSpecs[0].Replicas).To(Equal(rayJob.Spec.RayClusterSpec.WorkerGroupSpecs[0].Replicas)) + Expect(rayCluster.Spec.RayVersion).To(Equal(rayJob.Spec.RayClusterSpec.RayVersion)) + + // TODO (kevin85421): Check the RayCluster labels. + Expect(rayCluster.Labels).Should(HaveKeyWithValue(utils.RayOriginatedFromCRNameLabelKey, rayJob.Name)) + Expect(rayCluster.Labels).Should(HaveKeyWithValue(utils.RayOriginatedFromCRDLabelKey, utils.RayOriginatedFromCRDLabelValue(utils.RayJobCRD))) + + Expect(rayCluster.Annotations).Should(Equal(rayJob.Annotations)) + }) + + By("Make RayCluster.Status.State to be rayv1.Ready", func() { + // The RayCluster is not 'Ready' yet because Pods are not running and ready. + Expect(rayCluster.Status.State).NotTo(Equal(rayv1.Ready)) + + updateHeadPodToRunningAndReady(ctx, rayJob.Status.RayClusterName, namespace) + updateWorkerPodsToRunningAndReady(ctx, rayJob.Status.RayClusterName, namespace) + + // The RayCluster.Status.State should be Ready. + Eventually( + getClusterState(ctx, namespace, rayCluster.Name), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.Ready)) + }) + + By("RayJobs's JobDeploymentStatus transitions from Initializing to Running.", func() { + Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusRunning), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // In Running state, the RayJob's Status.DashboardURL must be set. + Expect(rayJob.Status.DashboardURL).NotTo(BeEmpty()) + + // In Running state, the submitter Kubernetes Job must be created if this RayJob is in K8sJobMode. 
+ namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + err := k8sClient.Get(ctx, namespacedName, job) + Expect(err).NotTo(HaveOccurred(), "failed to get Kubernetes Job") + }) + + By("RayJobs's JobDeploymentStatus transitions from Running to Complete.", func() { + // Update fake dashboard client to return job info with "Succeeded" status. + getJobInfo := func(context.Context, string) (*utiltypes.RayJobInfo, error) { //nolint:unparam // This is a mock function so parameters are required + return &utiltypes.RayJobInfo{JobStatus: rayv1.JobStatusSucceeded, EndTime: uint64(time.Now().UnixMilli())}, nil + } + fakeRayDashboardClient.GetJobInfoMock.Store(&getJobInfo) + defer fakeRayDashboardClient.GetJobInfoMock.Store(nil) + + // RayJob transitions to Complete if and only if the corresponding submitter Kubernetes Job is Complete or Failed. + Consistently( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusRunning), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // Update the submitter Kubernetes Job to Complete. + namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + err := k8sClient.Get(ctx, namespacedName, job) + Expect(err).NotTo(HaveOccurred(), "failed to get Kubernetes Job") + + // Update the submitter Kubernetes Job to Complete. + conditions := []batchv1.JobCondition{ + {Type: batchv1.JobComplete, Status: corev1.ConditionTrue}, + } + job.Status.Conditions = conditions + Expect(k8sClient.Status().Update(ctx, job)).Should(Succeed()) + + // RayJob transitions to Complete. + Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*5, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusComplete), "jobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + }) + + By("If DeletionStrategy=DeleteWorkers, all workers should be deleted, but not the Head pod and submitter Job", func() { + // RayCluster exists + Consistently( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Status.RayClusterName, Namespace: namespace}, rayCluster), + time.Second*3, time.Millisecond*500).Should(Succeed(), "RayCluster %v not found", rayJob.Status.RayClusterName) + + // Check worker group is suspended + Expect(*rayCluster.Spec.WorkerGroupSpecs[0].Suspend).To(BeTrue()) + + // 0 worker Pods exist + workerPods := corev1.PodList{} + workerLabels := common.RayClusterWorkerPodsAssociationOptions(rayCluster).ToListOptions() + Eventually( + listResourceFunc(ctx, &workerPods, workerLabels...), + time.Second*3, time.Millisecond*500).Should(Equal(0), "expected 0 workers") + + // Head Pod is still running + headPods := corev1.PodList{} + headLabels := common.RayClusterHeadPodsAssociationOptions(rayCluster).ToListOptions() + Consistently( + listResourceFunc(ctx, &headPods, headLabels...), + time.Second*3, time.Millisecond*500).Should(Equal(1), "Head pod list should have only 1 Pod = %v", headPods.Items) + + namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + Consistently( + getResourceFunc(ctx, namespacedName, job), + time.Second*3, time.Millisecond*500).Should(Succeed()) + }) + }) + + It("Should delete workers on failure when a single 'DeleteWorkers' rule is set", func() { + ctx := context.Background() + namespace := "default" + rayJob := rayJobTemplate("rayjob-test-rule-deleteworkers-on-failure", namespace) + rayCluster := &rayv1.RayCluster{} + + rayJob.Spec.DeletionStrategy = &rayv1.DeletionStrategy{ + DeletionRules: 
[]rayv1.DeletionRule{ + { + Policy: rayv1.DeleteWorkers, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusFailed, + }, + }, + }, + } + rayJob.Spec.ShutdownAfterJobFinishes = false + + By("Verify RayJob spec", func() { + Expect(*rayJob.Spec.DeletionStrategy).To(Equal(rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteWorkers, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusFailed, + }, + }, + }, + })) + }) + + By("Create a RayJob custom resource", func() { + err := k8sClient.Create(ctx, rayJob) + Expect(err).NotTo(HaveOccurred(), "Failed to create RayJob") + Eventually( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Name, Namespace: namespace}, rayJob), + time.Second*3, time.Millisecond*500).Should(Succeed(), "Should be able to see RayJob: %v", rayJob.Name) + }) + + By("RayJobs's JobDeploymentStatus transitions from New to Initializing.", func() { + Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusInitializing), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // In Initializing state, Status.RayClusterName, Status.JobId, and Status.StartTime must be set. + Expect(rayJob.Status.RayClusterName).NotTo(BeEmpty()) + Expect(rayJob.Status.JobId).NotTo(BeEmpty()) + Expect(rayJob.Status.StartTime).NotTo(BeNil()) + }) + + By("In Initializing state, the RayCluster should eventually be created.", func() { + Eventually( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Status.RayClusterName, Namespace: namespace}, rayCluster), + time.Second*3, time.Millisecond*500).Should(Succeed(), "RayCluster %v not found", rayJob.Status.RayClusterName) + + // Check whether RayCluster is consistent with RayJob's RayClusterSpec. + Expect(rayCluster.Spec.WorkerGroupSpecs[0].Replicas).To(Equal(rayJob.Spec.RayClusterSpec.WorkerGroupSpecs[0].Replicas)) + Expect(rayCluster.Spec.RayVersion).To(Equal(rayJob.Spec.RayClusterSpec.RayVersion)) + + // TODO (kevin85421): Check the RayCluster labels. + Expect(rayCluster.Labels).Should(HaveKeyWithValue(utils.RayOriginatedFromCRNameLabelKey, rayJob.Name)) + Expect(rayCluster.Labels).Should(HaveKeyWithValue(utils.RayOriginatedFromCRDLabelKey, utils.RayOriginatedFromCRDLabelValue(utils.RayJobCRD))) + + Expect(rayCluster.Annotations).Should(Equal(rayJob.Annotations)) + }) + + By("Make RayCluster.Status.State to be rayv1.Ready", func() { + // The RayCluster is not 'Ready' yet because Pods are not running and ready. + Expect(rayCluster.Status.State).NotTo(Equal(rayv1.Ready)) + + updateHeadPodToRunningAndReady(ctx, rayJob.Status.RayClusterName, namespace) + updateWorkerPodsToRunningAndReady(ctx, rayJob.Status.RayClusterName, namespace) + + // The RayCluster.Status.State should be Ready. + Eventually( + getClusterState(ctx, namespace, rayCluster.Name), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.Ready)) + }) + + By("RayJobs's JobDeploymentStatus transitions from Initializing to Running.", func() { + Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusRunning), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // In Running state, the RayJob's Status.DashboardURL must be set. + Expect(rayJob.Status.DashboardURL).NotTo(BeEmpty()) + + // In Running state, the submitter Kubernetes Job must be created if this RayJob is in K8sJobMode. 
+ namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + err := k8sClient.Get(ctx, namespacedName, job) + Expect(err).NotTo(HaveOccurred(), "failed to get Kubernetes Job") + }) + + By("RayJobs's JobDeploymentStatus transitions from Running to Complete.", func() { + // Update fake dashboard client to return job info with "Failed" status. + getJobInfo := func(context.Context, string) (*utiltypes.RayJobInfo, error) { //nolint:unparam // This is a mock function so parameters are required + return &utiltypes.RayJobInfo{JobStatus: rayv1.JobStatusFailed, EndTime: uint64(time.Now().UnixMilli())}, nil + } + fakeRayDashboardClient.GetJobInfoMock.Store(&getJobInfo) + defer fakeRayDashboardClient.GetJobInfoMock.Store(nil) + + // RayJob transitions to Complete if and only if the corresponding submitter Kubernetes Job is Complete or Failed. + Consistently( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusRunning), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // Update the submitter Kubernetes Job to Complete. + namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + err := k8sClient.Get(ctx, namespacedName, job) + Expect(err).NotTo(HaveOccurred(), "failed to get Kubernetes Job") + + // Update the submitter Kubernetes Job to Complete. + conditions := []batchv1.JobCondition{ + {Type: batchv1.JobComplete, Status: corev1.ConditionTrue}, + } + job.Status.Conditions = conditions + Expect(k8sClient.Status().Update(ctx, job)).Should(Succeed()) + + // RayJob transitions to Failed. + Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*5, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusFailed), "jobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + }) + + By("If DeletionStrategy=DeleteWorkers, all workers should be deleted, but not the Head pod and submitter Job", func() { + // RayCluster exists + Consistently( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Status.RayClusterName, Namespace: namespace}, rayCluster), + time.Second*3, time.Millisecond*500).Should(Succeed(), "RayCluster %v not found", rayJob.Status.RayClusterName) + + // Check worker group is suspended + Expect(*rayCluster.Spec.WorkerGroupSpecs[0].Suspend).To(BeTrue()) + + // 0 worker Pods exist + workerPods := corev1.PodList{} + workerLabels := common.RayClusterWorkerPodsAssociationOptions(rayCluster).ToListOptions() + Eventually( + listResourceFunc(ctx, &workerPods, workerLabels...), + time.Second*3, time.Millisecond*500).Should(Equal(0), "expected 0 workers") + + // Head Pod is still running + headPods := corev1.PodList{} + headLabels := common.RayClusterHeadPodsAssociationOptions(rayCluster).ToListOptions() + Consistently( + listResourceFunc(ctx, &headPods, headLabels...), + time.Second*3, time.Millisecond*500).Should(Equal(1), "Head pod list should have only 1 Pod = %v", headPods.Items) + + namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + Consistently( + getResourceFunc(ctx, namespacedName, job), + time.Second*3, time.Millisecond*500).Should(Succeed()) + }) + }) + + It("Should delete cluster on success when a single 'DeleteCluster' rule is set", func() { + ctx := context.Background() + namespace := "default" + rayJob := rayJobTemplate("rayjob-test-rule-deletecluster-on-success", namespace) + rayCluster := &rayv1.RayCluster{} + + rayJob.Spec.DeletionStrategy = &rayv1.DeletionStrategy{ + DeletionRules: 
[]rayv1.DeletionRule{ + { + Policy: rayv1.DeleteCluster, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + }, + }, + }, + } + rayJob.Spec.ShutdownAfterJobFinishes = false + + By("Verify RayJob spec", func() { + Expect(*rayJob.Spec.DeletionStrategy).To(Equal(rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteCluster, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + }, + }, + }, + })) + }) + + By("Create a RayJob custom resource", func() { + err := k8sClient.Create(ctx, rayJob) + Expect(err).NotTo(HaveOccurred(), "Failed to create RayJob") + Eventually( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Name, Namespace: namespace}, rayJob), + time.Second*3, time.Millisecond*500).Should(Succeed(), "Should be able to see RayJob: %v", rayJob.Name) + }) + + By("RayJobs's JobDeploymentStatus transitions from New to Initializing.", func() { + Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusInitializing), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // In Initializing state, Status.RayClusterName, Status.JobId, and Status.StartTime must be set. + Expect(rayJob.Status.RayClusterName).NotTo(BeEmpty()) + Expect(rayJob.Status.JobId).NotTo(BeEmpty()) + Expect(rayJob.Status.StartTime).NotTo(BeNil()) + }) + + By("In Initializing state, the RayCluster should eventually be created.", func() { + Eventually( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Status.RayClusterName, Namespace: namespace}, rayCluster), + time.Second*3, time.Millisecond*500).Should(Succeed(), "RayCluster %v not found", rayJob.Status.RayClusterName) + + // Check whether RayCluster is consistent with RayJob's RayClusterSpec. + Expect(rayCluster.Spec.WorkerGroupSpecs[0].Replicas).To(Equal(rayJob.Spec.RayClusterSpec.WorkerGroupSpecs[0].Replicas)) + Expect(rayCluster.Spec.RayVersion).To(Equal(rayJob.Spec.RayClusterSpec.RayVersion)) + + // TODO (kevin85421): Check the RayCluster labels. + Expect(rayCluster.Labels).Should(HaveKeyWithValue(utils.RayOriginatedFromCRNameLabelKey, rayJob.Name)) + Expect(rayCluster.Labels).Should(HaveKeyWithValue(utils.RayOriginatedFromCRDLabelKey, utils.RayOriginatedFromCRDLabelValue(utils.RayJobCRD))) + + Expect(rayCluster.Annotations).Should(Equal(rayJob.Annotations)) + }) + + By("Make RayCluster.Status.State to be rayv1.Ready", func() { + // The RayCluster is not 'Ready' yet because Pods are not running and ready. + Expect(rayCluster.Status.State).NotTo(Equal(rayv1.Ready)) + + updateHeadPodToRunningAndReady(ctx, rayJob.Status.RayClusterName, namespace) + updateWorkerPodsToRunningAndReady(ctx, rayJob.Status.RayClusterName, namespace) + + // The RayCluster.Status.State should be Ready. + Eventually( + getClusterState(ctx, namespace, rayCluster.Name), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.Ready)) + }) + + By("RayJobs's JobDeploymentStatus transitions from Initializing to Running.", func() { + Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusRunning), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // In Running state, the RayJob's Status.DashboardURL must be set. + Expect(rayJob.Status.DashboardURL).NotTo(BeEmpty()) + + // In Running state, the submitter Kubernetes Job must be created if this RayJob is in K8sJobMode. 
+ namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + err := k8sClient.Get(ctx, namespacedName, job) + Expect(err).NotTo(HaveOccurred(), "failed to get Kubernetes Job") + }) + + By("RayJobs's JobDeploymentStatus transitions from Running to Complete.", func() { + // Update fake dashboard client to return job info with "Succeeded" status. + getJobInfo := func(context.Context, string) (*utiltypes.RayJobInfo, error) { //nolint:unparam // This is a mock function so parameters are required + return &utiltypes.RayJobInfo{JobStatus: rayv1.JobStatusSucceeded, EndTime: uint64(time.Now().UnixMilli())}, nil + } + fakeRayDashboardClient.GetJobInfoMock.Store(&getJobInfo) + defer fakeRayDashboardClient.GetJobInfoMock.Store(nil) + + // RayJob transitions to Complete if and only if the corresponding submitter Kubernetes Job is Complete or Failed. + Consistently( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusRunning), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // Update the submitter Kubernetes Job to Complete. + namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + err := k8sClient.Get(ctx, namespacedName, job) + Expect(err).NotTo(HaveOccurred(), "failed to get Kubernetes Job") + + // Update the submitter Kubernetes Job to Complete. + conditions := []batchv1.JobCondition{ + {Type: batchv1.JobComplete, Status: corev1.ConditionTrue}, + } + job.Status.Conditions = conditions + Expect(k8sClient.Status().Update(ctx, job)).Should(Succeed()) + + // RayJob transitions to Complete. + Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*5, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusComplete), "jobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + }) + + By("If DeletionStrategy=DeleteCluster, RayCluster should be deleted, but not the submitter Job.", func() { + Eventually( + func() bool { + return apierrors.IsNotFound(getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Status.RayClusterName, Namespace: namespace}, rayCluster)()) + }, + time.Second*3, time.Millisecond*500).Should(BeTrue()) + namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + Consistently( + getResourceFunc(ctx, namespacedName, job), + time.Second*3, time.Millisecond*500).Should(Succeed()) + }) + }) + + It("Should delete cluster on failure when a single 'DeleteCluster' rule is set", func() { + ctx := context.Background() + namespace := "default" + rayJob := rayJobTemplate("rayjob-test-rule-deletecluster-on-failure", namespace) + rayCluster := &rayv1.RayCluster{} + + rayJob.Spec.DeletionStrategy = &rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteCluster, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusFailed, + }, + }, + }, + } + rayJob.Spec.ShutdownAfterJobFinishes = false + + By("Verify RayJob spec", func() { + Expect(*rayJob.Spec.DeletionStrategy).To(Equal(rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteCluster, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusFailed, + }, + }, + }, + })) + }) + + By("Create a RayJob custom resource", func() { + err := k8sClient.Create(ctx, rayJob) + Expect(err).NotTo(HaveOccurred(), "Failed to create RayJob") + Eventually( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Name, Namespace: namespace}, rayJob), + time.Second*3, 
time.Millisecond*500).Should(Succeed(), "Should be able to see RayJob: %v", rayJob.Name) + }) + + By("RayJobs's JobDeploymentStatus transitions from New to Initializing.", func() { + Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusInitializing), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // In Initializing state, Status.RayClusterName, Status.JobId, and Status.StartTime must be set. + Expect(rayJob.Status.RayClusterName).NotTo(BeEmpty()) + Expect(rayJob.Status.JobId).NotTo(BeEmpty()) + Expect(rayJob.Status.StartTime).NotTo(BeNil()) + }) + + By("In Initializing state, the RayCluster should eventually be created.", func() { + Eventually( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Status.RayClusterName, Namespace: namespace}, rayCluster), + time.Second*3, time.Millisecond*500).Should(Succeed(), "RayCluster %v not found", rayJob.Status.RayClusterName) + + // Check whether RayCluster is consistent with RayJob's RayClusterSpec. + Expect(rayCluster.Spec.WorkerGroupSpecs[0].Replicas).To(Equal(rayJob.Spec.RayClusterSpec.WorkerGroupSpecs[0].Replicas)) + Expect(rayCluster.Spec.RayVersion).To(Equal(rayJob.Spec.RayClusterSpec.RayVersion)) + + // TODO (kevin85421): Check the RayCluster labels. + Expect(rayCluster.Labels).Should(HaveKeyWithValue(utils.RayOriginatedFromCRNameLabelKey, rayJob.Name)) + Expect(rayCluster.Labels).Should(HaveKeyWithValue(utils.RayOriginatedFromCRDLabelKey, utils.RayOriginatedFromCRDLabelValue(utils.RayJobCRD))) + + Expect(rayCluster.Annotations).Should(Equal(rayJob.Annotations)) + }) + + By("Make RayCluster.Status.State to be rayv1.Ready", func() { + // The RayCluster is not 'Ready' yet because Pods are not running and ready. + Expect(rayCluster.Status.State).NotTo(Equal(rayv1.Ready)) + + updateHeadPodToRunningAndReady(ctx, rayJob.Status.RayClusterName, namespace) + updateWorkerPodsToRunningAndReady(ctx, rayJob.Status.RayClusterName, namespace) + + // The RayCluster.Status.State should be Ready. + Eventually( + getClusterState(ctx, namespace, rayCluster.Name), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.Ready)) + }) + + By("RayJobs's JobDeploymentStatus transitions from Initializing to Running.", func() { + Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusRunning), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // In Running state, the RayJob's Status.DashboardURL must be set. + Expect(rayJob.Status.DashboardURL).NotTo(BeEmpty()) + + // In Running state, the submitter Kubernetes Job must be created if this RayJob is in K8sJobMode. + namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + err := k8sClient.Get(ctx, namespacedName, job) + Expect(err).NotTo(HaveOccurred(), "failed to get Kubernetes Job") + }) + + By("RayJobs's JobDeploymentStatus transitions from Running to Complete.", func() { + // Update fake dashboard client to return job info with "Failed" status. 
+ getJobInfo := func(context.Context, string) (*utiltypes.RayJobInfo, error) { //nolint:unparam // This is a mock function so parameters are required
+ return &utiltypes.RayJobInfo{JobStatus: rayv1.JobStatusFailed, EndTime: uint64(time.Now().UnixMilli())}, nil
+ }
+ fakeRayDashboardClient.GetJobInfoMock.Store(&getJobInfo)
+ defer fakeRayDashboardClient.GetJobInfoMock.Store(nil)
+
+ // RayJob transitions to Complete if and only if the corresponding submitter Kubernetes Job is Complete or Failed.
+ Consistently(
+ getRayJobDeploymentStatus(ctx, rayJob),
+ time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusRunning), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus)
+
+ // Update the submitter Kubernetes Job to Complete.
+ namespacedName := common.RayJobK8sJobNamespacedName(rayJob)
+ job := &batchv1.Job{}
+ err := k8sClient.Get(ctx, namespacedName, job)
+ Expect(err).NotTo(HaveOccurred(), "failed to get Kubernetes Job")
+
+ // Update the submitter Kubernetes Job to Complete.
+ conditions := []batchv1.JobCondition{
+ {Type: batchv1.JobComplete, Status: corev1.ConditionTrue},
+ }
+ job.Status.Conditions = conditions
+ Expect(k8sClient.Status().Update(ctx, job)).Should(Succeed())
+
+ // RayJob transitions to Failed.
+ Eventually(
+ getRayJobDeploymentStatus(ctx, rayJob),
+ time.Second*5, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusFailed), "jobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus)
+ })
+
+ By("If DeletionStrategy=DeleteCluster, RayCluster should be deleted, but not the submitter Job.", func() {
+ Eventually(
+ func() bool {
+ return apierrors.IsNotFound(getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Status.RayClusterName, Namespace: namespace}, rayCluster)())
+ },
+ time.Second*3, time.Millisecond*500).Should(BeTrue())
+ namespacedName := common.RayJobK8sJobNamespacedName(rayJob)
+ job := &batchv1.Job{}
+ Consistently(
+ getResourceFunc(ctx, namespacedName, job),
+ time.Second*3, time.Millisecond*500).Should(Succeed())
+ })
+ })
+
+ It("Should delete self on success when a single 'DeleteSelf' rule is set", func() {
+ ctx := context.Background()
+ namespace := "default"
+ rayJob := rayJobTemplate("rayjob-test-rule-deleteself-on-success", namespace)
+ rayCluster := &rayv1.RayCluster{}
+
+ rayJob.Spec.DeletionStrategy = &rayv1.DeletionStrategy{
+ DeletionRules: []rayv1.DeletionRule{
+ {
+ Policy: rayv1.DeleteSelf,
+ Condition: rayv1.DeletionCondition{
+ JobStatus: rayv1.JobStatusSucceeded,
+ },
+ },
+ },
+ }
+ rayJob.Spec.ShutdownAfterJobFinishes = false
+
+ By("Create a RayJob custom resource", func() {
+ err := k8sClient.Create(ctx, rayJob)
+ Expect(err).NotTo(HaveOccurred(), "Failed to create RayJob")
+ Eventually(
+ getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Name, Namespace: namespace}, rayJob),
+ time.Second*3, time.Millisecond*500).Should(Succeed(), "Should be able to see RayJob: %v", rayJob.Name)
+ })
+
+ By("RayJobs's JobDeploymentStatus transitions from New to Initializing.", func() {
+ Eventually(
+ getRayJobDeploymentStatus(ctx, rayJob),
+ time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusInitializing), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus)
+
+ // In Initializing state, Status.RayClusterName, Status.JobId, and Status.StartTime must be set.
+ Expect(rayJob.Status.RayClusterName).NotTo(BeEmpty()) + Expect(rayJob.Status.JobId).NotTo(BeEmpty()) + Expect(rayJob.Status.StartTime).NotTo(BeNil()) + }) + + By("In Initializing state, the RayCluster should eventually be created.", func() { + Eventually( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Status.RayClusterName, Namespace: namespace}, rayCluster), + time.Second*3, time.Millisecond*500).Should(Succeed(), "RayCluster %v not found", rayJob.Status.RayClusterName) + + // Check whether RayCluster is consistent with RayJob's RayClusterSpec. + Expect(rayCluster.Spec.WorkerGroupSpecs[0].Replicas).To(Equal(rayJob.Spec.RayClusterSpec.WorkerGroupSpecs[0].Replicas)) + Expect(rayCluster.Spec.RayVersion).To(Equal(rayJob.Spec.RayClusterSpec.RayVersion)) + + // TODO (kevin85421): Check the RayCluster labels. + Expect(rayCluster.Labels).Should(HaveKeyWithValue(utils.RayOriginatedFromCRNameLabelKey, rayJob.Name)) + Expect(rayCluster.Labels).Should(HaveKeyWithValue(utils.RayOriginatedFromCRDLabelKey, utils.RayOriginatedFromCRDLabelValue(utils.RayJobCRD))) + + Expect(rayCluster.Annotations).Should(Equal(rayJob.Annotations)) + }) + + By("Make RayCluster.Status.State to be rayv1.Ready", func() { + // The RayCluster is not 'Ready' yet because Pods are not running and ready. + Expect(rayCluster.Status.State).NotTo(Equal(rayv1.Ready)) + + updateHeadPodToRunningAndReady(ctx, rayJob.Status.RayClusterName, namespace) + updateWorkerPodsToRunningAndReady(ctx, rayJob.Status.RayClusterName, namespace) + + // The RayCluster.Status.State should be Ready. + Eventually( + getClusterState(ctx, namespace, rayCluster.Name), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.Ready)) + }) + + By("RayJobs's JobDeploymentStatus transitions from Initializing to Running.", func() { + Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusRunning), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // In Running state, the RayJob's Status.DashboardURL must be set. + Expect(rayJob.Status.DashboardURL).NotTo(BeEmpty()) + + // In Running state, the submitter Kubernetes Job must be created if this RayJob is in K8sJobMode. + namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + err := k8sClient.Get(ctx, namespacedName, job) + Expect(err).NotTo(HaveOccurred(), "failed to get Kubernetes Job") + }) + + By("RayJobs's JobDeploymentStatus transitions from Running to Complete.", func() { + // Update fake dashboard client to return job info with "Succeeded" status. + getJobInfo := func(context.Context, string) (*utiltypes.RayJobInfo, error) { //nolint:unparam // This is a mock function so parameters are required + return &utiltypes.RayJobInfo{JobStatus: rayv1.JobStatusSucceeded, EndTime: uint64(time.Now().UnixMilli())}, nil + } + fakeRayDashboardClient.GetJobInfoMock.Store(&getJobInfo) + + // RayJob transitions to Complete if and only if the corresponding submitter Kubernetes Job is Complete or Failed. + Consistently( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusRunning), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // Update the submitter Kubernetes Job to Complete. 
+ namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + err := k8sClient.Get(ctx, namespacedName, job) + Expect(err).NotTo(HaveOccurred(), "failed to get Kubernetes Job") + + // Update the submitter Kubernetes Job to Complete. + conditions := []batchv1.JobCondition{ + {Type: batchv1.JobComplete, Status: corev1.ConditionTrue}, + } + job.Status.Conditions = conditions + Expect(k8sClient.Status().Update(ctx, job)).Should(Succeed()) + }) + + By("If DeletionStrategy=DeleteSelf, the RayJob is deleted", func() { + Eventually( + func() bool { + return apierrors.IsNotFound(k8sClient.Get(ctx, client.ObjectKey{Name: rayJob.Name, Namespace: namespace}, rayJob)) + }, time.Second*5, time.Millisecond*500).Should(BeTrue()) + }) + }) + + It("Should delete self on failure when a single 'DeleteSelf' rule is set", func() { + ctx := context.Background() + namespace := "default" + rayJob := rayJobTemplate("rayjob-test-rule-deleteself-on-failure", namespace) + rayCluster := &rayv1.RayCluster{} + + rayJob.Spec.DeletionStrategy = &rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteSelf, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusFailed, + }, + }, + }, + } + rayJob.Spec.ShutdownAfterJobFinishes = false + + By("Create a RayJob custom resource", func() { + err := k8sClient.Create(ctx, rayJob) + Expect(err).NotTo(HaveOccurred(), "Failed to create RayJob") + Eventually( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Name, Namespace: namespace}, rayJob), + time.Second*3, time.Millisecond*500).Should(Succeed(), "Should be able to see RayJob: %v", rayJob.Name) + }) + + By("RayJobs's JobDeploymentStatus transitions from New to Initializing.", func() { + Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusInitializing), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // In Initializing state, Status.RayClusterName, Status.JobId, and Status.StartTime must be set. + Expect(rayJob.Status.RayClusterName).NotTo(BeEmpty()) + Expect(rayJob.Status.JobId).NotTo(BeEmpty()) + Expect(rayJob.Status.StartTime).NotTo(BeNil()) + }) + + By("In Initializing state, the RayCluster should eventually be created.", func() { + Eventually( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Status.RayClusterName, Namespace: namespace}, rayCluster), + time.Second*3, time.Millisecond*500).Should(Succeed(), "RayCluster %v not found", rayJob.Status.RayClusterName) + + // Check whether RayCluster is consistent with RayJob's RayClusterSpec. + Expect(rayCluster.Spec.WorkerGroupSpecs[0].Replicas).To(Equal(rayJob.Spec.RayClusterSpec.WorkerGroupSpecs[0].Replicas)) + Expect(rayCluster.Spec.RayVersion).To(Equal(rayJob.Spec.RayClusterSpec.RayVersion)) + + // TODO (kevin85421): Check the RayCluster labels. + Expect(rayCluster.Labels).Should(HaveKeyWithValue(utils.RayOriginatedFromCRNameLabelKey, rayJob.Name)) + Expect(rayCluster.Labels).Should(HaveKeyWithValue(utils.RayOriginatedFromCRDLabelKey, utils.RayOriginatedFromCRDLabelValue(utils.RayJobCRD))) + + Expect(rayCluster.Annotations).Should(Equal(rayJob.Annotations)) + }) + + By("Make RayCluster.Status.State to be rayv1.Ready", func() { + // The RayCluster is not 'Ready' yet because Pods are not running and ready. 
+ Expect(rayCluster.Status.State).NotTo(Equal(rayv1.Ready)) + + updateHeadPodToRunningAndReady(ctx, rayJob.Status.RayClusterName, namespace) + updateWorkerPodsToRunningAndReady(ctx, rayJob.Status.RayClusterName, namespace) + + // The RayCluster.Status.State should be Ready. + Eventually( + getClusterState(ctx, namespace, rayCluster.Name), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.Ready)) + }) + + By("RayJobs's JobDeploymentStatus transitions from Initializing to Running.", func() { + Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusRunning), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // In Running state, the RayJob's Status.DashboardURL must be set. + Expect(rayJob.Status.DashboardURL).NotTo(BeEmpty()) + + // In Running state, the submitter Kubernetes Job must be created if this RayJob is in K8sJobMode. + namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + err := k8sClient.Get(ctx, namespacedName, job) + Expect(err).NotTo(HaveOccurred(), "failed to get Kubernetes Job") + }) + + By("RayJobs's JobDeploymentStatus transitions from Running to Complete.", func() { + // Update fake dashboard client to return job info with "Failed" status. + getJobInfo := func(context.Context, string) (*utiltypes.RayJobInfo, error) { //nolint:unparam // This is a mock function so parameters are required + return &utiltypes.RayJobInfo{JobStatus: rayv1.JobStatusFailed, EndTime: uint64(time.Now().UnixMilli())}, nil + } + fakeRayDashboardClient.GetJobInfoMock.Store(&getJobInfo) + + // RayJob transitions to Complete if and only if the corresponding submitter Kubernetes Job is Complete or Failed. + Consistently( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusRunning), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // Update the submitter Kubernetes Job to Complete. + namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + err := k8sClient.Get(ctx, namespacedName, job) + Expect(err).NotTo(HaveOccurred(), "failed to get Kubernetes Job") + + // Update the submitter Kubernetes Job to Complete. 
+ conditions := []batchv1.JobCondition{ + {Type: batchv1.JobComplete, Status: corev1.ConditionTrue}, + } + job.Status.Conditions = conditions + Expect(k8sClient.Status().Update(ctx, job)).Should(Succeed()) + }) + + By("If DeletionStrategy=DeleteSelf, the RayJob is deleted", func() { + Eventually( + func() bool { + return apierrors.IsNotFound(k8sClient.Get(ctx, client.ObjectKey{Name: rayJob.Name, Namespace: namespace}, rayJob)) + }, time.Second*5, time.Millisecond*500).Should(BeTrue()) + }) + }) + + It("Should delete none on success when a single 'DeleteNone' rule is set", func() { + ctx := context.Background() + namespace := "default" + rayJob := rayJobTemplate("rayjob-test-rule-deletenone-on-success", namespace) + rayCluster := &rayv1.RayCluster{} + + rayJob.Spec.DeletionStrategy = &rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteNone, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + }, + }, + }, + } + rayJob.Spec.ShutdownAfterJobFinishes = false + + By("Verify RayJob spec", func() { + Expect(*rayJob.Spec.DeletionStrategy).To(Equal(rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteNone, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + }, + }, + }, + })) + }) + + By("Create a RayJob custom resource", func() { + err := k8sClient.Create(ctx, rayJob) + Expect(err).NotTo(HaveOccurred(), "Failed to create RayJob") + Eventually( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Name, Namespace: namespace}, rayJob), + time.Second*3, time.Millisecond*500).Should(Succeed(), "Should be able to see RayJob: %v", rayJob.Name) + }) + + By("RayJobs's JobDeploymentStatus transitions from New to Initializing.", func() { + Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusInitializing), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // In Initializing state, Status.RayClusterName, Status.JobId, and Status.StartTime must be set. + Expect(rayJob.Status.RayClusterName).NotTo(BeEmpty()) + Expect(rayJob.Status.JobId).NotTo(BeEmpty()) + Expect(rayJob.Status.StartTime).NotTo(BeNil()) + }) + + By("In Initializing state, the RayCluster should eventually be created.", func() { + Eventually( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Status.RayClusterName, Namespace: namespace}, rayCluster), + time.Second*3, time.Millisecond*500).Should(Succeed(), "RayCluster %v not found", rayJob.Status.RayClusterName) + + // Check whether RayCluster is consistent with RayJob's RayClusterSpec. + Expect(rayCluster.Spec.WorkerGroupSpecs[0].Replicas).To(Equal(rayJob.Spec.RayClusterSpec.WorkerGroupSpecs[0].Replicas)) + Expect(rayCluster.Spec.RayVersion).To(Equal(rayJob.Spec.RayClusterSpec.RayVersion)) + + // TODO (kevin85421): Check the RayCluster labels. + Expect(rayCluster.Labels).Should(HaveKeyWithValue(utils.RayOriginatedFromCRNameLabelKey, rayJob.Name)) + Expect(rayCluster.Labels).Should(HaveKeyWithValue(utils.RayOriginatedFromCRDLabelKey, utils.RayOriginatedFromCRDLabelValue(utils.RayJobCRD))) + + Expect(rayCluster.Annotations).Should(Equal(rayJob.Annotations)) + }) + + By("Make RayCluster.Status.State to be rayv1.Ready", func() { + // The RayCluster is not 'Ready' yet because Pods are not running and ready. 
+ Expect(rayCluster.Status.State).NotTo(Equal(rayv1.Ready)) + + updateHeadPodToRunningAndReady(ctx, rayJob.Status.RayClusterName, namespace) + updateWorkerPodsToRunningAndReady(ctx, rayJob.Status.RayClusterName, namespace) + + // The RayCluster.Status.State should be Ready. + Eventually( + getClusterState(ctx, namespace, rayCluster.Name), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.Ready)) + }) + + By("RayJobs's JobDeploymentStatus transitions from Initializing to Running.", func() { + Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusRunning), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // In Running state, the RayJob's Status.DashboardURL must be set. + Expect(rayJob.Status.DashboardURL).NotTo(BeEmpty()) + + // In Running state, the submitter Kubernetes Job must be created if this RayJob is in K8sJobMode. + namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + err := k8sClient.Get(ctx, namespacedName, job) + Expect(err).NotTo(HaveOccurred(), "failed to get Kubernetes Job") + }) + + By("RayJobs's JobDeploymentStatus transitions from Running to Complete.", func() { + // Update fake dashboard client to return job info with "Succeeded" status. + getJobInfo := func(context.Context, string) (*utiltypes.RayJobInfo, error) { //nolint:unparam // This is a mock function so parameters are required + return &utiltypes.RayJobInfo{JobStatus: rayv1.JobStatusSucceeded, EndTime: uint64(time.Now().UnixMilli())}, nil + } + fakeRayDashboardClient.GetJobInfoMock.Store(&getJobInfo) + defer fakeRayDashboardClient.GetJobInfoMock.Store(nil) + + // RayJob transitions to Complete if and only if the corresponding submitter Kubernetes Job is Complete or Failed. + Consistently( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusRunning), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // Update the submitter Kubernetes Job to Complete. + namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + err := k8sClient.Get(ctx, namespacedName, job) + Expect(err).NotTo(HaveOccurred(), "failed to get Kubernetes Job") + + // Update the submitter Kubernetes Job to Complete. + conditions := []batchv1.JobCondition{ + {Type: batchv1.JobComplete, Status: corev1.ConditionTrue}, + } + job.Status.Conditions = conditions + Expect(k8sClient.Status().Update(ctx, job)).Should(Succeed()) + + // RayJob transitions to Complete. 
+ Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*5, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusComplete), "jobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + }) + + By("If DeletionStrategy=DeleteNone, no resources are deleted", func() { + // RayJob exists + Consistently( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Name, Namespace: namespace}, rayJob), + time.Second*3, time.Millisecond*500).Should(Succeed(), "RayJob %v not found", rayJob) + + // RayCluster exists + Consistently( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Status.RayClusterName, Namespace: namespace}, rayCluster), + time.Second*3, time.Millisecond*500).Should(Succeed(), "RayCluster %v not found", rayJob.Status.RayClusterName) + + // Worker replicas set to 3 + Expect(*rayCluster.Spec.WorkerGroupSpecs[0].Replicas).To(Equal(int32(3))) + + // 3 worker Pods exist + workerPods := corev1.PodList{} + workerLabels := common.RayClusterWorkerPodsAssociationOptions(rayCluster).ToListOptions() + Consistently( + listResourceFunc(ctx, &workerPods, workerLabels...), + time.Second*3, time.Millisecond*500).Should(Equal(3), "expected 3 workers") + + // Head Pod is still running + headPods := corev1.PodList{} + headLabels := common.RayClusterHeadPodsAssociationOptions(rayCluster).ToListOptions() + Consistently( + listResourceFunc(ctx, &headPods, headLabels...), + time.Second*3, time.Millisecond*500).Should(Equal(1), "Head pod list should have only 1 Pod = %v", headPods.Items) + + namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + Consistently( + getResourceFunc(ctx, namespacedName, job), + time.Second*3, time.Millisecond*500).Should(Succeed()) + }) + }) + + It("Should delete none on failure when a single 'DeleteNone' rule is set", func() { + ctx := context.Background() + namespace := "default" + rayJob := rayJobTemplate("rayjob-test-rule-deletenone-on-failure", namespace) + rayCluster := &rayv1.RayCluster{} + + rayJob.Spec.DeletionStrategy = &rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteNone, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusFailed, + }, + }, + }, + } + rayJob.Spec.ShutdownAfterJobFinishes = false + + By("Verify RayJob spec", func() { + Expect(*rayJob.Spec.DeletionStrategy).To(Equal(rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteNone, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusFailed, + }, + }, + }, + })) + }) + + By("Create a RayJob custom resource", func() { + err := k8sClient.Create(ctx, rayJob) + Expect(err).NotTo(HaveOccurred(), "Failed to create RayJob") + Eventually( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Name, Namespace: namespace}, rayJob), + time.Second*3, time.Millisecond*500).Should(Succeed(), "Should be able to see RayJob: %v", rayJob.Name) + }) + + By("RayJobs's JobDeploymentStatus transitions from New to Initializing.", func() { + Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusInitializing), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // In Initializing state, Status.RayClusterName, Status.JobId, and Status.StartTime must be set. 
+ Expect(rayJob.Status.RayClusterName).NotTo(BeEmpty()) + Expect(rayJob.Status.JobId).NotTo(BeEmpty()) + Expect(rayJob.Status.StartTime).NotTo(BeNil()) + }) + + By("In Initializing state, the RayCluster should eventually be created.", func() { + Eventually( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Status.RayClusterName, Namespace: namespace}, rayCluster), + time.Second*3, time.Millisecond*500).Should(Succeed(), "RayCluster %v not found", rayJob.Status.RayClusterName) + + // Check whether RayCluster is consistent with RayJob's RayClusterSpec. + Expect(rayCluster.Spec.WorkerGroupSpecs[0].Replicas).To(Equal(rayJob.Spec.RayClusterSpec.WorkerGroupSpecs[0].Replicas)) + Expect(rayCluster.Spec.RayVersion).To(Equal(rayJob.Spec.RayClusterSpec.RayVersion)) + + // TODO (kevin85421): Check the RayCluster labels. + Expect(rayCluster.Labels).Should(HaveKeyWithValue(utils.RayOriginatedFromCRNameLabelKey, rayJob.Name)) + Expect(rayCluster.Labels).Should(HaveKeyWithValue(utils.RayOriginatedFromCRDLabelKey, utils.RayOriginatedFromCRDLabelValue(utils.RayJobCRD))) + + Expect(rayCluster.Annotations).Should(Equal(rayJob.Annotations)) + }) + + By("Make RayCluster.Status.State to be rayv1.Ready", func() { + // The RayCluster is not 'Ready' yet because Pods are not running and ready. + Expect(rayCluster.Status.State).NotTo(Equal(rayv1.Ready)) + + updateHeadPodToRunningAndReady(ctx, rayJob.Status.RayClusterName, namespace) + updateWorkerPodsToRunningAndReady(ctx, rayJob.Status.RayClusterName, namespace) + + // The RayCluster.Status.State should be Ready. + Eventually( + getClusterState(ctx, namespace, rayCluster.Name), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.Ready)) + }) + + By("RayJobs's JobDeploymentStatus transitions from Initializing to Running.", func() { + Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusRunning), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // In Running state, the RayJob's Status.DashboardURL must be set. + Expect(rayJob.Status.DashboardURL).NotTo(BeEmpty()) + + // In Running state, the submitter Kubernetes Job must be created if this RayJob is in K8sJobMode. + namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + err := k8sClient.Get(ctx, namespacedName, job) + Expect(err).NotTo(HaveOccurred(), "failed to get Kubernetes Job") + }) + + By("RayJobs's JobDeploymentStatus transitions from Running to Complete.", func() { + // Update fake dashboard client to return job info with "Failed" status. + getJobInfo := func(context.Context, string) (*utiltypes.RayJobInfo, error) { //nolint:unparam // This is a mock function so parameters are required + return &utiltypes.RayJobInfo{JobStatus: rayv1.JobStatusFailed, EndTime: uint64(time.Now().UnixMilli())}, nil + } + fakeRayDashboardClient.GetJobInfoMock.Store(&getJobInfo) + defer fakeRayDashboardClient.GetJobInfoMock.Store(nil) + + // RayJob transitions to Complete if and only if the corresponding submitter Kubernetes Job is Complete or Failed. + Consistently( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusRunning), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // Update the submitter Kubernetes Job to Complete. 
+ namespacedName := common.RayJobK8sJobNamespacedName(rayJob)
+ job := &batchv1.Job{}
+ err := k8sClient.Get(ctx, namespacedName, job)
+ Expect(err).NotTo(HaveOccurred(), "failed to get Kubernetes Job")
+
+ // Update the submitter Kubernetes Job to Complete.
+ conditions := []batchv1.JobCondition{
+ {Type: batchv1.JobComplete, Status: corev1.ConditionTrue},
+ }
+ job.Status.Conditions = conditions
+ Expect(k8sClient.Status().Update(ctx, job)).Should(Succeed())
+
+ // RayJob transitions to Failed.
+ Eventually(
+ getRayJobDeploymentStatus(ctx, rayJob),
+ time.Second*5, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusFailed), "jobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus)
+ })
+
+ By("If DeletionStrategy=DeleteNone, no resources are deleted", func() {
+ // RayJob exists
+ Consistently(
+ getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Name, Namespace: namespace}, rayJob),
+ time.Second*3, time.Millisecond*500).Should(Succeed(), "RayJob %v not found", rayJob)
+
+ // RayCluster exists
+ Consistently(
+ getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Status.RayClusterName, Namespace: namespace}, rayCluster),
+ time.Second*3, time.Millisecond*500).Should(Succeed(), "RayCluster %v not found", rayJob.Status.RayClusterName)
+
+ // Worker replicas set to 3
+ Expect(*rayCluster.Spec.WorkerGroupSpecs[0].Replicas).To(Equal(int32(3)))
+
+ // 3 worker Pods exist
+ workerPods := corev1.PodList{}
+ workerLabels := common.RayClusterWorkerPodsAssociationOptions(rayCluster).ToListOptions()
+ Consistently(
+ listResourceFunc(ctx, &workerPods, workerLabels...),
+ time.Second*3, time.Millisecond*500).Should(Equal(3), "expected 3 workers")
+
+ // Head Pod is still running
+ headPods := corev1.PodList{}
+ headLabels := common.RayClusterHeadPodsAssociationOptions(rayCluster).ToListOptions()
+ Consistently(
+ listResourceFunc(ctx, &headPods, headLabels...),
+ time.Second*3, time.Millisecond*500).Should(Equal(1), "Head pod list should have only 1 Pod = %v", headPods.Items)
+
+ namespacedName := common.RayJobK8sJobNamespacedName(rayJob)
+ job := &batchv1.Job{}
+ Consistently(
+ getResourceFunc(ctx, namespacedName, job),
+ time.Second*3, time.Millisecond*500).Should(Succeed())
+ })
+ })
+
+ It("Should execute MOST impactful rule (DeleteSelf) when all rules are overdue on success", func() {
+ ctx := context.Background()
+ namespace := "default"
+ rayJob := rayJobTemplate("rayjob-test-impactful-rule-override-on-success", namespace)
+ rayCluster := &rayv1.RayCluster{}
+
+ // Define the multi-stage DeletionStrategy
+ rayJob.Spec.DeletionStrategy = &rayv1.DeletionStrategy{
+ DeletionRules: []rayv1.DeletionRule{
+ {
+ Policy: rayv1.DeleteWorkers,
+ Condition: rayv1.DeletionCondition{
+ JobStatus: rayv1.JobStatusSucceeded,
+ TTLSecondsAfterFinished: 0,
+ },
+ },
+ {
+ Policy: rayv1.DeleteCluster,
+ Condition: rayv1.DeletionCondition{
+ JobStatus: rayv1.JobStatusSucceeded,
+ TTLSecondsAfterFinished: 0,
+ },
+ },
+ {
+ Policy: rayv1.DeleteSelf,
+ Condition: rayv1.DeletionCondition{
+ JobStatus: rayv1.JobStatusSucceeded,
+ TTLSecondsAfterFinished: 0,
+ },
+ },
+ },
+ }
+ rayJob.Spec.ShutdownAfterJobFinishes = false
+
+ By("Verify RayJob spec", func() {
+ Expect(*rayJob.Spec.DeletionStrategy).To(Equal(rayv1.DeletionStrategy{
+ DeletionRules: []rayv1.DeletionRule{
+ {
+ Policy: rayv1.DeleteWorkers,
+ Condition: rayv1.DeletionCondition{
+ JobStatus: rayv1.JobStatusSucceeded,
+ TTLSecondsAfterFinished: 0,
+ },
+ },
+ {
+ Policy: rayv1.DeleteCluster,
+ Condition: rayv1.DeletionCondition{
+
JobStatus: rayv1.JobStatusSucceeded, + TTLSecondsAfterFinished: 0, + }, + }, + { + Policy: rayv1.DeleteSelf, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + TTLSecondsAfterFinished: 0, + }, + }, + }, + })) + }) + + By("Create a RayJob custom resource with multi-stage deletion rules", func() { + err := k8sClient.Create(ctx, rayJob) + Expect(err).NotTo(HaveOccurred(), "Failed to create RayJob") + Eventually( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Name, Namespace: namespace}, rayJob), + time.Second*3, time.Millisecond*500).Should(Succeed(), "Should be able to see RayJob: %v", rayJob.Name) + }) + + By("RayJobs's JobDeploymentStatus transitions from New to Initializing.", func() { + Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusInitializing), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // In Initializing state, Status.RayClusterName, Status.JobId, and Status.StartTime must be set. + Expect(rayJob.Status.RayClusterName).NotTo(BeEmpty()) + Expect(rayJob.Status.JobId).NotTo(BeEmpty()) + Expect(rayJob.Status.StartTime).NotTo(BeNil()) + }) + + By("In Initializing state, the RayCluster should eventually be created.", func() { + Eventually( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Status.RayClusterName, Namespace: namespace}, rayCluster), + time.Second*3, time.Millisecond*500).Should(Succeed(), "RayCluster %v not found", rayJob.Status.RayClusterName) + + // Check whether RayCluster is consistent with RayJob's RayClusterSpec. + Expect(rayCluster.Spec.WorkerGroupSpecs[0].Replicas).To(Equal(rayJob.Spec.RayClusterSpec.WorkerGroupSpecs[0].Replicas)) + Expect(rayCluster.Spec.RayVersion).To(Equal(rayJob.Spec.RayClusterSpec.RayVersion)) + + // TODO (kevin85421): Check the RayCluster labels. + Expect(rayCluster.Labels).Should(HaveKeyWithValue(utils.RayOriginatedFromCRNameLabelKey, rayJob.Name)) + Expect(rayCluster.Labels).Should(HaveKeyWithValue(utils.RayOriginatedFromCRDLabelKey, utils.RayOriginatedFromCRDLabelValue(utils.RayJobCRD))) + + Expect(rayCluster.Annotations).Should(Equal(rayJob.Annotations)) + }) + + By("Make RayCluster.Status.State to be rayv1.Ready", func() { + // The RayCluster is not 'Ready' yet because Pods are not running and ready. + Expect(rayCluster.Status.State).NotTo(Equal(rayv1.Ready)) + + updateHeadPodToRunningAndReady(ctx, rayJob.Status.RayClusterName, namespace) + updateWorkerPodsToRunningAndReady(ctx, rayJob.Status.RayClusterName, namespace) + + // The RayCluster.Status.State should be Ready. + Eventually( + getClusterState(ctx, namespace, rayCluster.Name), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.Ready)) + }) + + By("RayJobs's JobDeploymentStatus transitions from Initializing to Running.", func() { + Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusRunning), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // In Running state, the RayJob's Status.DashboardURL must be set. + Expect(rayJob.Status.DashboardURL).NotTo(BeEmpty()) + + // In Running state, the submitter Kubernetes Job must be created if this RayJob is in K8sJobMode. 
+ namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + err := k8sClient.Get(ctx, namespacedName, job) + Expect(err).NotTo(HaveOccurred(), "failed to get Kubernetes Job") + }) + + By("RayJobs's JobDeploymentStatus transitions from Running to Complete.", func() { + // Update fake dashboard client to return job info with "Succeeded" status. + getJobInfo := func(context.Context, string) (*utiltypes.RayJobInfo, error) { //nolint:unparam // This is a mock function so parameters are required + return &utiltypes.RayJobInfo{JobStatus: rayv1.JobStatusSucceeded, EndTime: uint64(time.Now().UnixMilli())}, nil + } + fakeRayDashboardClient.GetJobInfoMock.Store(&getJobInfo) + + // RayJob transitions to Complete if and only if the corresponding submitter Kubernetes Job is Complete or Failed. + Consistently( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusRunning), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // Update the submitter Kubernetes Job to Complete. + namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + err := k8sClient.Get(ctx, namespacedName, job) + Expect(err).NotTo(HaveOccurred(), "failed to get Kubernetes Job") + + // Update the submitter Kubernetes Job to Complete. + conditions := []batchv1.JobCondition{ + {Type: batchv1.JobComplete, Status: corev1.ConditionTrue}, + } + job.Status.Conditions = conditions + Expect(k8sClient.Status().Update(ctx, job)).Should(Succeed()) + }) + + By("Verify RayJob itself is deleted", func() { + Eventually( + func() bool { + return apierrors.IsNotFound(k8sClient.Get(ctx, client.ObjectKey{Name: rayJob.Name, Namespace: namespace}, rayJob)) + }, time.Second*5, time.Millisecond*500).Should(BeTrue()) + }) + }) + + It("Should execute MOST impactful rule (DeleteSelf) when all rules are overdue on failure", func() { + ctx := context.Background() + namespace := "default" + rayJob := rayJobTemplate("rayjob-test-impactful-rule-override-on-failure", namespace) + rayCluster := &rayv1.RayCluster{} + + // Define the multi-stage DeletionStrategy + rayJob.Spec.DeletionStrategy = &rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteWorkers, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusFailed, + TTLSecondsAfterFinished: 0, + }, + }, + { + Policy: rayv1.DeleteCluster, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusFailed, + TTLSecondsAfterFinished: 0, + }, + }, + { + Policy: rayv1.DeleteSelf, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusFailed, + TTLSecondsAfterFinished: 0, + }, + }, + }, + } + rayJob.Spec.ShutdownAfterJobFinishes = false + + By("Verify RayJob spec", func() { + Expect(*rayJob.Spec.DeletionStrategy).To(Equal(rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteWorkers, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusFailed, + TTLSecondsAfterFinished: 0, + }, + }, + { + Policy: rayv1.DeleteCluster, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusFailed, + TTLSecondsAfterFinished: 0, + }, + }, + { + Policy: rayv1.DeleteSelf, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusFailed, + TTLSecondsAfterFinished: 0, + }, + }, + }, + })) + }) + + By("Create a RayJob custom resource with multi-stage deletion rules", func() { + err := k8sClient.Create(ctx, rayJob) + Expect(err).NotTo(HaveOccurred(), "Failed to create RayJob") + 
Eventually( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Name, Namespace: namespace}, rayJob), + time.Second*3, time.Millisecond*500).Should(Succeed(), "Should be able to see RayJob: %v", rayJob.Name) + }) + + By("RayJobs's JobDeploymentStatus transitions from New to Initializing.", func() { + Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusInitializing), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // In Initializing state, Status.RayClusterName, Status.JobId, and Status.StartTime must be set. + Expect(rayJob.Status.RayClusterName).NotTo(BeEmpty()) + Expect(rayJob.Status.JobId).NotTo(BeEmpty()) + Expect(rayJob.Status.StartTime).NotTo(BeNil()) + }) + + By("In Initializing state, the RayCluster should eventually be created.", func() { + Eventually( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Status.RayClusterName, Namespace: namespace}, rayCluster), + time.Second*3, time.Millisecond*500).Should(Succeed(), "RayCluster %v not found", rayJob.Status.RayClusterName) + + // Check whether RayCluster is consistent with RayJob's RayClusterSpec. + Expect(rayCluster.Spec.WorkerGroupSpecs[0].Replicas).To(Equal(rayJob.Spec.RayClusterSpec.WorkerGroupSpecs[0].Replicas)) + Expect(rayCluster.Spec.RayVersion).To(Equal(rayJob.Spec.RayClusterSpec.RayVersion)) + + // TODO (kevin85421): Check the RayCluster labels. + Expect(rayCluster.Labels).Should(HaveKeyWithValue(utils.RayOriginatedFromCRNameLabelKey, rayJob.Name)) + Expect(rayCluster.Labels).Should(HaveKeyWithValue(utils.RayOriginatedFromCRDLabelKey, utils.RayOriginatedFromCRDLabelValue(utils.RayJobCRD))) + + Expect(rayCluster.Annotations).Should(Equal(rayJob.Annotations)) + }) + + By("Make RayCluster.Status.State to be rayv1.Ready", func() { + // The RayCluster is not 'Ready' yet because Pods are not running and ready. + Expect(rayCluster.Status.State).NotTo(Equal(rayv1.Ready)) + + updateHeadPodToRunningAndReady(ctx, rayJob.Status.RayClusterName, namespace) + updateWorkerPodsToRunningAndReady(ctx, rayJob.Status.RayClusterName, namespace) + + // The RayCluster.Status.State should be Ready. + Eventually( + getClusterState(ctx, namespace, rayCluster.Name), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.Ready)) + }) + + By("RayJobs's JobDeploymentStatus transitions from Initializing to Running.", func() { + Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusRunning), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // In Running state, the RayJob's Status.DashboardURL must be set. + Expect(rayJob.Status.DashboardURL).NotTo(BeEmpty()) + + // In Running state, the submitter Kubernetes Job must be created if this RayJob is in K8sJobMode. + namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + err := k8sClient.Get(ctx, namespacedName, job) + Expect(err).NotTo(HaveOccurred(), "failed to get Kubernetes Job") + }) + + By("RayJobs's JobDeploymentStatus transitions from Running to Complete.", func() { + // Update fake dashboard client to return job info with "Failed" status. 
+ getJobInfo := func(context.Context, string) (*utiltypes.RayJobInfo, error) { //nolint:unparam // This is a mock function so parameters are required + return &utiltypes.RayJobInfo{JobStatus: rayv1.JobStatusFailed, EndTime: uint64(time.Now().UnixMilli())}, nil + } + fakeRayDashboardClient.GetJobInfoMock.Store(&getJobInfo) + + // RayJob transitions to Complete if and only if the corresponding submitter Kubernetes Job is Complete or Failed. + Consistently( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusRunning), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // Update the submitter Kubernetes Job to Complete. + namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + err := k8sClient.Get(ctx, namespacedName, job) + Expect(err).NotTo(HaveOccurred(), "failed to get Kubernetes Job") + + // Update the submitter Kubernetes Job to Complete. + conditions := []batchv1.JobCondition{ + {Type: batchv1.JobComplete, Status: corev1.ConditionTrue}, + } + job.Status.Conditions = conditions + Expect(k8sClient.Status().Update(ctx, job)).Should(Succeed()) + }) + + By("Verify RayJob itself is deleted", func() { + Eventually( + func() bool { + return apierrors.IsNotFound(k8sClient.Get(ctx, client.ObjectKey{Name: rayJob.Name, Namespace: namespace}, rayJob)) + }, time.Second*5, time.Millisecond*500).Should(BeTrue()) + }) + }) + + It("Should process multi-stage deletions in order on success: Workers, then Cluster, then Self", func() { + ctx := context.Background() + namespace := "default" + rayJob := rayJobTemplate("rayjob-test-multistage-deletion-on-success", namespace) + rayCluster := &rayv1.RayCluster{} + + // Define the multi-stage DeletionStrategy + rayJob.Spec.DeletionStrategy = &rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteWorkers, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + TTLSecondsAfterFinished: 0, // Stage 1: Delete workers after 0 seconds + }, + }, + { + Policy: rayv1.DeleteCluster, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + TTLSecondsAfterFinished: 5, // Stage 2: Delete cluster after 5 seconds + }, + }, + { + Policy: rayv1.DeleteSelf, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + TTLSecondsAfterFinished: 10, // Stage 3: Delete self after 10 seconds + }, + }, + }, + } + rayJob.Spec.ShutdownAfterJobFinishes = false + + By("Verify RayJob spec", func() { + Expect(*rayJob.Spec.DeletionStrategy).To(Equal(rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteWorkers, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + TTLSecondsAfterFinished: 0, + }, + }, + { + Policy: rayv1.DeleteCluster, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + TTLSecondsAfterFinished: 5, + }, + }, + { + Policy: rayv1.DeleteSelf, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + TTLSecondsAfterFinished: 10, + }, + }, + }, + })) + }) + + By("Create a RayJob custom resource with multi-stage deletion rules", func() { + err := k8sClient.Create(ctx, rayJob) + Expect(err).NotTo(HaveOccurred(), "Failed to create RayJob") + Eventually( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Name, Namespace: namespace}, rayJob), + time.Second*3, time.Millisecond*500).Should(Succeed(), "Should be able to see RayJob: %v", rayJob.Name) + }) + + By("RayJobs's 
JobDeploymentStatus transitions from New to Initializing.", func() { + Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusInitializing), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // In Initializing state, Status.RayClusterName, Status.JobId, and Status.StartTime must be set. + Expect(rayJob.Status.RayClusterName).NotTo(BeEmpty()) + Expect(rayJob.Status.JobId).NotTo(BeEmpty()) + Expect(rayJob.Status.StartTime).NotTo(BeNil()) + }) + + By("In Initializing state, the RayCluster should eventually be created.", func() { + Eventually( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Status.RayClusterName, Namespace: namespace}, rayCluster), + time.Second*3, time.Millisecond*500).Should(Succeed(), "RayCluster %v not found", rayJob.Status.RayClusterName) + + // Check whether RayCluster is consistent with RayJob's RayClusterSpec. + Expect(rayCluster.Spec.WorkerGroupSpecs[0].Replicas).To(Equal(rayJob.Spec.RayClusterSpec.WorkerGroupSpecs[0].Replicas)) + Expect(rayCluster.Spec.RayVersion).To(Equal(rayJob.Spec.RayClusterSpec.RayVersion)) + + // TODO (kevin85421): Check the RayCluster labels. + Expect(rayCluster.Labels).Should(HaveKeyWithValue(utils.RayOriginatedFromCRNameLabelKey, rayJob.Name)) + Expect(rayCluster.Labels).Should(HaveKeyWithValue(utils.RayOriginatedFromCRDLabelKey, utils.RayOriginatedFromCRDLabelValue(utils.RayJobCRD))) + + Expect(rayCluster.Annotations).Should(Equal(rayJob.Annotations)) + }) + + By("Make RayCluster.Status.State to be rayv1.Ready", func() { + // The RayCluster is not 'Ready' yet because Pods are not running and ready. + Expect(rayCluster.Status.State).NotTo(Equal(rayv1.Ready)) + + updateHeadPodToRunningAndReady(ctx, rayJob.Status.RayClusterName, namespace) + updateWorkerPodsToRunningAndReady(ctx, rayJob.Status.RayClusterName, namespace) + + // The RayCluster.Status.State should be Ready. + Eventually( + getClusterState(ctx, namespace, rayCluster.Name), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.Ready)) + }) + + By("RayJobs's JobDeploymentStatus transitions from Initializing to Running.", func() { + Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusRunning), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // In Running state, the RayJob's Status.DashboardURL must be set. + Expect(rayJob.Status.DashboardURL).NotTo(BeEmpty()) + + // In Running state, the submitter Kubernetes Job must be created if this RayJob is in K8sJobMode. + namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + err := k8sClient.Get(ctx, namespacedName, job) + Expect(err).NotTo(HaveOccurred(), "failed to get Kubernetes Job") + }) + + By("RayJobs's JobDeploymentStatus transitions from Running to Complete.", func() { + // Update fake dashboard client to return job info with "Succeeded" status. + getJobInfo := func(context.Context, string) (*utiltypes.RayJobInfo, error) { //nolint:unparam // This is a mock function so parameters are required + return &utiltypes.RayJobInfo{JobStatus: rayv1.JobStatusSucceeded, EndTime: uint64(time.Now().UnixMilli())}, nil + } + fakeRayDashboardClient.GetJobInfoMock.Store(&getJobInfo) + defer fakeRayDashboardClient.GetJobInfoMock.Store(nil) + + // RayJob transitions to Complete if and only if the corresponding submitter Kubernetes Job is Complete or Failed. 
+ Consistently( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusRunning), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // Update the submitter Kubernetes Job to Complete. + namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + err := k8sClient.Get(ctx, namespacedName, job) + Expect(err).NotTo(HaveOccurred(), "failed to get Kubernetes Job") + + // Update the submitter Kubernetes Job to Complete. + conditions := []batchv1.JobCondition{ + {Type: batchv1.JobComplete, Status: corev1.ConditionTrue}, + } + job.Status.Conditions = conditions + Expect(k8sClient.Status().Update(ctx, job)).Should(Succeed()) + + // RayJob transitions to Complete. + Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*5, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusComplete), "jobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + }) + + By("Stage 1: Verify workers are deleted, but cluster and job still exist", func() { + // RayCluster exists + Consistently( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Status.RayClusterName, Namespace: namespace}, rayCluster), + time.Second*3, time.Millisecond*500).Should(Succeed(), "RayCluster %v not found", rayJob.Status.RayClusterName) + + // Check worker group is suspended + Expect(*rayCluster.Spec.WorkerGroupSpecs[0].Suspend).To(BeTrue()) + + // 0 worker Pods exist + workerPods := corev1.PodList{} + workerLabels := common.RayClusterWorkerPodsAssociationOptions(rayCluster).ToListOptions() + Eventually( + listResourceFunc(ctx, &workerPods, workerLabels...), + time.Second*3, time.Millisecond*500).Should(Equal(0), "expected 0 workers") + + // Head Pod is still running + headPods := corev1.PodList{} + headLabels := common.RayClusterHeadPodsAssociationOptions(rayCluster).ToListOptions() + Consistently( + listResourceFunc(ctx, &headPods, headLabels...), + time.Second*3, time.Millisecond*500).Should(Equal(1), "Head pod list should have only 1 Pod = %v", headPods.Items) + + namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + Consistently( + getResourceFunc(ctx, namespacedName, job), + time.Second*3, time.Millisecond*500).Should(Succeed()) + }) + + By("Stage 2 (after 5s): Verify RayCluster is deleted, but job still exists", func() { + Eventually( + func() bool { + return apierrors.IsNotFound(getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Status.RayClusterName, Namespace: namespace}, rayCluster)()) + }, + time.Second*3, time.Millisecond*500).Should(BeTrue()) + namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + Consistently( + getResourceFunc(ctx, namespacedName, job), + time.Second*3, time.Millisecond*500).Should(Succeed()) + }) + + By("Stage 3 (after 10s): Verify RayJob itself is deleted", func() { + Eventually( + func() bool { + return apierrors.IsNotFound(k8sClient.Get(ctx, client.ObjectKey{Name: rayJob.Name, Namespace: namespace}, rayJob)) + }, time.Second*5, time.Millisecond*500).Should(BeTrue()) + }) + }) + + It("Should process multi-stage deletions in order on failure: Workers, then Cluster, then Self", func() { + ctx := context.Background() + namespace := "default" + rayJob := rayJobTemplate("rayjob-test-multistage-deletion-on-failure", namespace) + rayCluster := &rayv1.RayCluster{} + + // Define the multi-stage DeletionStrategy + rayJob.Spec.DeletionStrategy = &rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: 
rayv1.DeleteWorkers, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusFailed, + TTLSecondsAfterFinished: 0, // Stage 1: Delete workers after 0 seconds + }, + }, + { + Policy: rayv1.DeleteCluster, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusFailed, + TTLSecondsAfterFinished: 5, // Stage 2: Delete cluster after 5 seconds + }, + }, + { + Policy: rayv1.DeleteSelf, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusFailed, + TTLSecondsAfterFinished: 10, // Stage 3: Delete self after 10 seconds + }, + }, + }, + } + rayJob.Spec.ShutdownAfterJobFinishes = false + + By("Verify RayJob spec", func() { + Expect(*rayJob.Spec.DeletionStrategy).To(Equal(rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteWorkers, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusFailed, + TTLSecondsAfterFinished: 0, + }, + }, + { + Policy: rayv1.DeleteCluster, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusFailed, + TTLSecondsAfterFinished: 5, + }, + }, + { + Policy: rayv1.DeleteSelf, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusFailed, + TTLSecondsAfterFinished: 10, + }, + }, + }, + })) + }) + + By("Create a RayJob custom resource with multi-stage deletion rules", func() { + err := k8sClient.Create(ctx, rayJob) + Expect(err).NotTo(HaveOccurred(), "Failed to create RayJob") + Eventually( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Name, Namespace: namespace}, rayJob), + time.Second*3, time.Millisecond*500).Should(Succeed(), "Should be able to see RayJob: %v", rayJob.Name) + }) + + By("RayJobs's JobDeploymentStatus transitions from New to Initializing.", func() { + Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusInitializing), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // In Initializing state, Status.RayClusterName, Status.JobId, and Status.StartTime must be set. + Expect(rayJob.Status.RayClusterName).NotTo(BeEmpty()) + Expect(rayJob.Status.JobId).NotTo(BeEmpty()) + Expect(rayJob.Status.StartTime).NotTo(BeNil()) + }) + + By("In Initializing state, the RayCluster should eventually be created.", func() { + Eventually( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Status.RayClusterName, Namespace: namespace}, rayCluster), + time.Second*3, time.Millisecond*500).Should(Succeed(), "RayCluster %v not found", rayJob.Status.RayClusterName) + + // Check whether RayCluster is consistent with RayJob's RayClusterSpec. + Expect(rayCluster.Spec.WorkerGroupSpecs[0].Replicas).To(Equal(rayJob.Spec.RayClusterSpec.WorkerGroupSpecs[0].Replicas)) + Expect(rayCluster.Spec.RayVersion).To(Equal(rayJob.Spec.RayClusterSpec.RayVersion)) + + // TODO (kevin85421): Check the RayCluster labels. + Expect(rayCluster.Labels).Should(HaveKeyWithValue(utils.RayOriginatedFromCRNameLabelKey, rayJob.Name)) + Expect(rayCluster.Labels).Should(HaveKeyWithValue(utils.RayOriginatedFromCRDLabelKey, utils.RayOriginatedFromCRDLabelValue(utils.RayJobCRD))) + + Expect(rayCluster.Annotations).Should(Equal(rayJob.Annotations)) + }) + + By("Make RayCluster.Status.State to be rayv1.Ready", func() { + // The RayCluster is not 'Ready' yet because Pods are not running and ready. 
+ Expect(rayCluster.Status.State).NotTo(Equal(rayv1.Ready)) + + updateHeadPodToRunningAndReady(ctx, rayJob.Status.RayClusterName, namespace) + updateWorkerPodsToRunningAndReady(ctx, rayJob.Status.RayClusterName, namespace) + + // The RayCluster.Status.State should be Ready. + Eventually( + getClusterState(ctx, namespace, rayCluster.Name), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.Ready)) + }) + + By("RayJobs's JobDeploymentStatus transitions from Initializing to Running.", func() { + Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusRunning), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // In Running state, the RayJob's Status.DashboardURL must be set. + Expect(rayJob.Status.DashboardURL).NotTo(BeEmpty()) + + // In Running state, the submitter Kubernetes Job must be created if this RayJob is in K8sJobMode. + namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + err := k8sClient.Get(ctx, namespacedName, job) + Expect(err).NotTo(HaveOccurred(), "failed to get Kubernetes Job") + }) + + By("RayJobs's JobDeploymentStatus transitions from Running to Complete.", func() { + // Update fake dashboard client to return job info with "Failed" status. + getJobInfo := func(context.Context, string) (*utiltypes.RayJobInfo, error) { //nolint:unparam // This is a mock function so parameters are required + return &utiltypes.RayJobInfo{JobStatus: rayv1.JobStatusFailed, EndTime: uint64(time.Now().UnixMilli())}, nil + } + fakeRayDashboardClient.GetJobInfoMock.Store(&getJobInfo) + defer fakeRayDashboardClient.GetJobInfoMock.Store(nil) + + // RayJob transitions to Complete if and only if the corresponding submitter Kubernetes Job is Complete or Failed. + Consistently( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*3, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusRunning), "JobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + + // Update the submitter Kubernetes Job to Complete. + namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + err := k8sClient.Get(ctx, namespacedName, job) + Expect(err).NotTo(HaveOccurred(), "failed to get Kubernetes Job") + + // Update the submitter Kubernetes Job to Complete. + conditions := []batchv1.JobCondition{ + {Type: batchv1.JobComplete, Status: corev1.ConditionTrue}, + } + job.Status.Conditions = conditions + Expect(k8sClient.Status().Update(ctx, job)).Should(Succeed()) + + // RayJob transitions to Failed. 
+ Eventually( + getRayJobDeploymentStatus(ctx, rayJob), + time.Second*5, time.Millisecond*500).Should(Equal(rayv1.JobDeploymentStatusFailed), "jobDeploymentStatus = %v", rayJob.Status.JobDeploymentStatus) + }) + + By("Stage 1: Verify workers are deleted, but cluster and job still exist", func() { + // RayCluster exists + Consistently( + getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Status.RayClusterName, Namespace: namespace}, rayCluster), + time.Second*3, time.Millisecond*500).Should(Succeed(), "RayCluster %v not found", rayJob.Status.RayClusterName) + + // Check worker group is suspended + Expect(*rayCluster.Spec.WorkerGroupSpecs[0].Suspend).To(BeTrue()) + + // 0 worker Pods exist + workerPods := corev1.PodList{} + workerLabels := common.RayClusterWorkerPodsAssociationOptions(rayCluster).ToListOptions() + Eventually( + listResourceFunc(ctx, &workerPods, workerLabels...), + time.Second*3, time.Millisecond*500).Should(Equal(0), "expected 0 workers") + + // Head Pod is still running + headPods := corev1.PodList{} + headLabels := common.RayClusterHeadPodsAssociationOptions(rayCluster).ToListOptions() + Consistently( + listResourceFunc(ctx, &headPods, headLabels...), + time.Second*3, time.Millisecond*500).Should(Equal(1), "Head pod list should have only 1 Pod = %v", headPods.Items) + + namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + Consistently( + getResourceFunc(ctx, namespacedName, job), + time.Second*3, time.Millisecond*500).Should(Succeed()) + }) + + By("Stage 2 (after 5s): Verify RayCluster is deleted, but job still exists", func() { + Eventually( + func() bool { + return apierrors.IsNotFound(getResourceFunc(ctx, client.ObjectKey{Name: rayJob.Status.RayClusterName, Namespace: namespace}, rayCluster)()) + }, + time.Second*3, time.Millisecond*500).Should(BeTrue()) + namespacedName := common.RayJobK8sJobNamespacedName(rayJob) + job := &batchv1.Job{} + Consistently( + getResourceFunc(ctx, namespacedName, job), + time.Second*3, time.Millisecond*500).Should(Succeed()) + }) + + By("Stage 3 (after 10s): Verify RayJob itself is deleted", func() { + Eventually( + func() bool { + return apierrors.IsNotFound(k8sClient.Get(ctx, client.ObjectKey{Name: rayJob.Name, Namespace: namespace}, rayJob)) + }, time.Second*5, time.Millisecond*500).Should(BeTrue()) + }) + }) }) }) diff --git a/ray-operator/controllers/ray/utils/validation.go b/ray-operator/controllers/ray/utils/validation.go index d4653aa04bc..3c4ebe81e45 100644 --- a/ray-operator/controllers/ray/utils/validation.go +++ b/ray-operator/controllers/ray/utils/validation.go @@ -215,9 +215,6 @@ func ValidateRayJobSpec(rayJob *rayv1.RayJob) error { if rayJob.Spec.BackoffLimit != nil && *rayJob.Spec.BackoffLimit < 0 { return fmt.Errorf("backoffLimit must be a positive integer") } - if !features.Enabled(features.RayJobDeletionPolicy) && rayJob.Spec.DeletionStrategy != nil { - return fmt.Errorf("RayJobDeletionPolicy feature gate must be enabled to use the DeletionStrategy feature") - } return validateDeletionStrategy(rayJob) } @@ -269,7 +266,7 @@ func validateDeletionStrategy(rayJob *rayv1.RayJob) error { } usingDeletionRules := len(rayJob.Spec.DeletionStrategy.DeletionRules) > 0 - usingLegacyAPI := rayJob.Spec.DeletionStrategy.OnSuccess.Policy != nil || rayJob.Spec.DeletionStrategy.OnFailure.Policy != nil + usingLegacyAPI := rayJob.Spec.DeletionStrategy.OnSuccess != nil || rayJob.Spec.DeletionStrategy.OnFailure != nil // ShutdownAfterJobFinishes cannot be used with the new API. 
if usingDeletionRules && rayJob.Spec.ShutdownAfterJobFinishes { @@ -399,6 +396,13 @@ func validateTTLConsistency(policyTTLs map[rayv1.DeletionPolicyType]int32, statu // validateLegacyDeletionPolicies handles validation for the old `onSuccess` and `onFailure` fields. func validateLegacyDeletionPolicies(rayJob *rayv1.RayJob) error { isClusterSelectorMode := len(rayJob.Spec.ClusterSelector) != 0 + + // Both policies must be set if using the legacy API. + if rayJob.Spec.DeletionStrategy.OnSuccess == nil || rayJob.Spec.DeletionStrategy.OnFailure == nil { + return fmt.Errorf("both DeletionStrategy.OnSuccess and DeletionStrategy.OnFailure must be set when using the legacy deletion policy fields") + } + + // Validate that the Policy field is set within each policy. onSuccessPolicy := rayJob.Spec.DeletionStrategy.OnSuccess onFailurePolicy := rayJob.Spec.DeletionStrategy.OnFailure diff --git a/ray-operator/controllers/ray/utils/validation_test.go b/ray-operator/controllers/ray/utils/validation_test.go index 4060827e21f..e298d2d45b5 100644 --- a/ray-operator/controllers/ray/utils/validation_test.go +++ b/ray-operator/controllers/ray/utils/validation_test.go @@ -795,10 +795,10 @@ func TestValidateRayJobSpec(t *testing.T) { name: "RayJobDeletionPolicy feature gate must be enabled to use the DeletionStrategy feature", spec: rayv1.RayJobSpec{ DeletionStrategy: &rayv1.DeletionStrategy{ - OnSuccess: rayv1.DeletionPolicy{ + OnSuccess: &rayv1.DeletionPolicy{ Policy: ptr.To(rayv1.DeleteCluster), }, - OnFailure: rayv1.DeletionPolicy{ + OnFailure: &rayv1.DeletionPolicy{ Policy: ptr.To(rayv1.DeleteCluster), }, }, @@ -932,10 +932,10 @@ func TestValidateRayJobSpecWithFeatureGate(t *testing.T) { name: "the ClusterSelector mode doesn't support DeletionStrategy=DeleteCluster", spec: rayv1.RayJobSpec{ DeletionStrategy: &rayv1.DeletionStrategy{ - OnSuccess: rayv1.DeletionPolicy{ + OnSuccess: &rayv1.DeletionPolicy{ Policy: ptr.To(rayv1.DeleteCluster), }, - OnFailure: rayv1.DeletionPolicy{ + OnFailure: &rayv1.DeletionPolicy{ Policy: ptr.To(rayv1.DeleteCluster), }, }, ClusterSelector: map[string]string{"key": "value"}, @@ -946,10 +946,10 @@ func TestValidateRayJobSpecWithFeatureGate(t *testing.T) { name: "the ClusterSelector mode doesn't support DeletionStrategy=DeleteWorkers", spec: rayv1.RayJobSpec{ DeletionStrategy: &rayv1.DeletionStrategy{ - OnSuccess: rayv1.DeletionPolicy{ + OnSuccess: &rayv1.DeletionPolicy{ Policy: ptr.To(rayv1.DeleteWorkers), }, - OnFailure: rayv1.DeletionPolicy{ + OnFailure: &rayv1.DeletionPolicy{ Policy: ptr.To(rayv1.DeleteWorkers), }, }, ClusterSelector: map[string]string{"key": "value"}, @@ -960,10 +960,10 @@ func TestValidateRayJobSpecWithFeatureGate(t *testing.T) { name: "DeletionStrategy=DeleteWorkers currently does not support RayCluster with autoscaling enabled", spec: rayv1.RayJobSpec{ DeletionStrategy: &rayv1.DeletionStrategy{ - OnSuccess: rayv1.DeletionPolicy{ + OnSuccess: &rayv1.DeletionPolicy{ Policy: ptr.To(rayv1.DeleteWorkers), }, - OnFailure: rayv1.DeletionPolicy{ + OnFailure: &rayv1.DeletionPolicy{ Policy: ptr.To(rayv1.DeleteWorkers), }, }, RayClusterSpec: &rayv1.RayClusterSpec{ @@ -977,10 +977,10 @@ func TestValidateRayJobSpecWithFeatureGate(t *testing.T) { name: "valid RayJob with DeletionStrategy=DeleteCluster", spec: rayv1.RayJobSpec{ DeletionStrategy: &rayv1.DeletionStrategy{ - OnSuccess: rayv1.DeletionPolicy{ + OnSuccess: &rayv1.DeletionPolicy{ Policy: ptr.To(rayv1.DeleteCluster), }, - OnFailure: rayv1.DeletionPolicy{ + OnFailure: &rayv1.DeletionPolicy{ Policy: 
ptr.To(rayv1.DeleteCluster), }, }, ShutdownAfterJobFinishes: true, @@ -1001,10 +1001,10 @@ func TestValidateRayJobSpecWithFeatureGate(t *testing.T) { name: "shutdownAfterJobFinshes is set to 'true' while deletion policy is 'DeleteNone'", spec: rayv1.RayJobSpec{ DeletionStrategy: &rayv1.DeletionStrategy{ - OnSuccess: rayv1.DeletionPolicy{ + OnSuccess: &rayv1.DeletionPolicy{ Policy: ptr.To(rayv1.DeleteNone), }, - OnFailure: rayv1.DeletionPolicy{ + OnFailure: &rayv1.DeletionPolicy{ Policy: ptr.To(rayv1.DeleteNone), }, }, ShutdownAfterJobFinishes: true, @@ -1016,7 +1016,7 @@ func TestValidateRayJobSpecWithFeatureGate(t *testing.T) { name: "OnSuccess unset", spec: rayv1.RayJobSpec{ DeletionStrategy: &rayv1.DeletionStrategy{ - OnFailure: rayv1.DeletionPolicy{ + OnFailure: &rayv1.DeletionPolicy{ Policy: ptr.To(rayv1.DeleteNone), }, }, ShutdownAfterJobFinishes: true, @@ -1028,7 +1028,7 @@ func TestValidateRayJobSpecWithFeatureGate(t *testing.T) { name: "OnSuccess.DeletionPolicyType unset", spec: rayv1.RayJobSpec{ DeletionStrategy: &rayv1.DeletionStrategy{ - OnFailure: rayv1.DeletionPolicy{ + OnFailure: &rayv1.DeletionPolicy{ Policy: ptr.To(rayv1.DeleteNone), }, }, ShutdownAfterJobFinishes: true, @@ -1040,7 +1040,7 @@ func TestValidateRayJobSpecWithFeatureGate(t *testing.T) { name: "OnFailure unset", spec: rayv1.RayJobSpec{ DeletionStrategy: &rayv1.DeletionStrategy{ - OnSuccess: rayv1.DeletionPolicy{ + OnSuccess: &rayv1.DeletionPolicy{ Policy: ptr.To(rayv1.DeleteNone), }, }, ShutdownAfterJobFinishes: true, @@ -1052,10 +1052,10 @@ func TestValidateRayJobSpecWithFeatureGate(t *testing.T) { name: "OnFailure.DeletionPolicyType unset", spec: rayv1.RayJobSpec{ DeletionStrategy: &rayv1.DeletionStrategy{ - OnSuccess: rayv1.DeletionPolicy{ + OnSuccess: &rayv1.DeletionPolicy{ Policy: ptr.To(rayv1.DeleteNone), }, - OnFailure: rayv1.DeletionPolicy{}, + OnFailure: &rayv1.DeletionPolicy{}, }, ShutdownAfterJobFinishes: true, RayClusterSpec: createBasicRayClusterSpec(), }, @@ -1112,7 +1112,7 @@ func TestValidateRayJobSpecWithFeatureGate(t *testing.T) { name: "deletionRules and legacy onSuccess both set", spec: rayv1.RayJobSpec{ DeletionStrategy: &rayv1.DeletionStrategy{ - OnSuccess: rayv1.DeletionPolicy{ + OnSuccess: &rayv1.DeletionPolicy{ Policy: ptr.To(rayv1.DeleteCluster), }, DeletionRules: []rayv1.DeletionRule{ From f715ed4cd84cb5727906cd97b708117b30f6b7ed Mon Sep 17 00:00:00 2001 From: wei-chenglai Date: Fri, 12 Sep 2025 14:30:06 -0400 Subject: [PATCH 03/21] trigger CI --- ray-operator/controllers/ray/utils/validation.go | 1 + 1 file changed, 1 insertion(+) diff --git a/ray-operator/controllers/ray/utils/validation.go b/ray-operator/controllers/ray/utils/validation.go index 3c4ebe81e45..abb2644a6e4 100644 --- a/ray-operator/controllers/ray/utils/validation.go +++ b/ray-operator/controllers/ray/utils/validation.go @@ -361,6 +361,7 @@ func validateDeletionRules(rayJob *rayv1.RayJob) error { func validateTTLConsistency(policyTTLs map[rayv1.DeletionPolicyType]int32, status rayv1.JobStatus) error { // Define the required deletion order. TTLs must be non-decreasing along this sequence. 
deletionOrder := []rayv1.DeletionPolicyType{ + rayv1.DeleteNone, rayv1.DeleteWorkers, rayv1.DeleteCluster, rayv1.DeleteSelf, From 24f6ab966eee1c82cc0ed213beba2947d1ff48c1 Mon Sep 17 00:00:00 2001 From: wei-chenglai Date: Fri, 12 Sep 2025 22:37:42 -0400 Subject: [PATCH 04/21] Revert change for triggering CI --- ray-operator/controllers/ray/utils/validation.go | 1 - 1 file changed, 1 deletion(-) diff --git a/ray-operator/controllers/ray/utils/validation.go b/ray-operator/controllers/ray/utils/validation.go index abb2644a6e4..3c4ebe81e45 100644 --- a/ray-operator/controllers/ray/utils/validation.go +++ b/ray-operator/controllers/ray/utils/validation.go @@ -361,7 +361,6 @@ func validateDeletionRules(rayJob *rayv1.RayJob) error { func validateTTLConsistency(policyTTLs map[rayv1.DeletionPolicyType]int32, status rayv1.JobStatus) error { // Define the required deletion order. TTLs must be non-decreasing along this sequence. deletionOrder := []rayv1.DeletionPolicyType{ - rayv1.DeleteNone, rayv1.DeleteWorkers, rayv1.DeleteCluster, rayv1.DeleteSelf, From 31867ad2d480288b59f5471add3381311e793efd Mon Sep 17 00:00:00 2001 From: wei-chenglai Date: Mon, 15 Sep 2025 15:51:01 -0400 Subject: [PATCH 05/21] address comment --- ray-operator/apis/ray/v1/rayjob_types.go | 4 +-- .../controllers/ray/utils/validation.go | 32 ++++++++----------- 2 files changed, 15 insertions(+), 21 deletions(-) diff --git a/ray-operator/apis/ray/v1/rayjob_types.go b/ray-operator/apis/ray/v1/rayjob_types.go index 575ded8e81c..d1ec95f8c8d 100644 --- a/ray-operator/apis/ray/v1/rayjob_types.go +++ b/ray-operator/apis/ray/v1/rayjob_types.go @@ -85,8 +85,6 @@ const ( SidecarMode JobSubmissionMode = "SidecarMode" // Submit job via a sidecar container in the Ray head Pod ) -type DeletionPolicyType string - // DeletionStrategy defines the deletion policies for a RayJob. // It allows for fine-grained control over resource cleanup after a job finishes. // @@ -167,6 +165,8 @@ type DeletionPolicy struct { Policy *DeletionPolicyType `json:"policy,omitempty"` } +type DeletionPolicyType string + const ( DeleteCluster DeletionPolicyType = "DeleteCluster" // To delete the entire RayCluster custom resource on job completion. DeleteWorkers DeletionPolicyType = "DeleteWorkers" // To delete only the workers on job completion. diff --git a/ray-operator/controllers/ray/utils/validation.go b/ray-operator/controllers/ray/utils/validation.go index 3c4ebe81e45..7e66aec8f41 100644 --- a/ray-operator/controllers/ray/utils/validation.go +++ b/ray-operator/controllers/ray/utils/validation.go @@ -216,7 +216,10 @@ func ValidateRayJobSpec(rayJob *rayv1.RayJob) error { return fmt.Errorf("backoffLimit must be a positive integer") } - return validateDeletionStrategy(rayJob) + if err := validateDeletionStrategy(rayJob); err != nil { + return fmt.Errorf("invalid deletion strategy: %w", err) + } + return nil } func ValidateRayServiceMetadata(metadata metav1.ObjectMeta) error { @@ -295,19 +298,11 @@ func validateDeletionStrategy(rayJob *rayv1.RayJob) error { // It performs per-rule validations, checks for uniqueness, and ensures logical TTL consistency. // Errors are collected and returned as a single aggregated error using errors.Join for better user feedback. func validateDeletionRules(rayJob *rayv1.RayJob) error { - type ruleKey struct { - Policy rayv1.DeletionPolicyType - Status rayv1.JobStatus - } - rules := rayJob.Spec.DeletionStrategy.DeletionRules isClusterSelectorMode := len(rayJob.Spec.ClusterSelector) != 0 - // Group TTLs by JobStatus for cross-rule validation. 
+ // Group TTLs by JobStatus for cross-rule validation and uniqueness checking. rulesByStatus := make(map[rayv1.JobStatus]map[rayv1.DeletionPolicyType]int32) - // Track unique (Policy, JobStatus) combinations. - ruleUniquenessSet := make(map[ruleKey]struct{}) - var errs []error // Single pass: Validate each rule individually and group for later consistency checks. @@ -318,14 +313,6 @@ func validateDeletionRules(rayJob *rayv1.RayJob) error { continue } - // Check uniqueness. - key := ruleKey{Policy: rule.Policy, Status: rule.Condition.JobStatus} - if _, exists := ruleUniquenessSet[key]; exists { - errs = append(errs, fmt.Errorf("deletionRules[%d]: duplicate rule for DeletionPolicyType '%s' and JobStatus '%s'", i, rule.Policy, rule.Condition.JobStatus)) - continue - } - ruleUniquenessSet[key] = struct{}{} - // Contextual validations based on spec. if isClusterSelectorMode && (rule.Policy == rayv1.DeleteCluster || rule.Policy == rayv1.DeleteWorkers) { errs = append(errs, fmt.Errorf("deletionRules[%d]: DeletionPolicyType '%s' not supported when ClusterSelector is set", i, rule.Policy)) @@ -343,6 +330,13 @@ func validateDeletionRules(rayJob *rayv1.RayJob) error { statusMap = make(map[rayv1.DeletionPolicyType]int32) rulesByStatus[rule.Condition.JobStatus] = statusMap } + + // Check for uniqueness of (JobStatus, DeletionPolicyType) pair. + if _, exists := statusMap[rule.Policy]; exists { + errs = append(errs, fmt.Errorf("deletionRules[%d]: duplicate rule for DeletionPolicyType '%s' and JobStatus '%s'", i, rule.Policy, rule.Condition.JobStatus)) + continue + } + statusMap[rule.Policy] = rule.Condition.TTLSecondsAfterFinished } @@ -399,7 +393,7 @@ func validateLegacyDeletionPolicies(rayJob *rayv1.RayJob) error { // Both policies must be set if using the legacy API. if rayJob.Spec.DeletionStrategy.OnSuccess == nil || rayJob.Spec.DeletionStrategy.OnFailure == nil { - return fmt.Errorf("both DeletionStrategy.OnSuccess and DeletionStrategy.OnFailure must be set when using the legacy deletion policy fields") + return fmt.Errorf("both DeletionStrategy.OnSuccess and DeletionStrategy.OnFailure must be set when using the legacy deletion policy fields of DeletionStrategy") } // Validate that the Policy field is set within each policy. From f2719a0ed3c16e2b266f3e19bb24b7ffd15f2cae Mon Sep 17 00:00:00 2001 From: wei-chenglai Date: Tue, 16 Sep 2025 20:12:06 -0400 Subject: [PATCH 06/21] rename to TTLSeconds --- docs/reference/api.md | 7 +- .../kuberay-operator/crds/ray.io_rayjobs.yaml | 2 +- ray-operator/apis/ray/v1/rayjob_types.go | 9 +- .../config/crd/bases/ray.io_rayjobs.yaml | 2 +- .../samples/ray-job.deletion-rules.yaml | 180 ++++++++++++++++++ .../controllers/ray/rayjob_controller.go | 2 +- .../controllers/ray/rayjob_controller_test.go | 96 +++++----- .../controllers/ray/utils/validation.go | 12 +- .../controllers/ray/utils/validation_test.go | 64 +++---- .../ray/v1/deletioncondition.go | 12 +- 10 files changed, 284 insertions(+), 102 deletions(-) create mode 100644 ray-operator/config/samples/ray-job.deletion-rules.yaml diff --git a/docs/reference/api.md b/docs/reference/api.md index 4d3a87a9bce..ec300a8e342 100644 --- a/docs/reference/api.md +++ b/docs/reference/api.md @@ -68,7 +68,7 @@ _Appears in:_ | Field | Description | Default | Validation | | --- | --- | --- | --- | -| `ttlSecondsAfterFinished` _integer_ | TTLSecondsAfterFinished is the time in seconds from when the JobStatus
reaches the specified terminal state to when this deletion action should be triggered.
The value must be a non-negative integer. | 0 | Minimum: 0
| +| `ttlSeconds` _integer_ | TTLSeconds is the time in seconds from when the JobStatus
reaches the specified terminal state to when this deletion action should be triggered.
The value must be a non-negative integer. | 0 | Minimum: 0
| #### DeletionPolicy @@ -130,6 +130,7 @@ It allows for fine-grained control over resource cleanup after a job finishes. Legacy fields `onSuccess` and `onFailure` are still supported for backward compatibility, but it is highly recommended to migrate to the new `deletionRules` field. +`onSuccess` and `onFailure` will be removed in release 1.16.0. Notes: @@ -154,8 +155,8 @@ _Appears in:_ | Field | Description | Default | Validation | | --- | --- | --- | --- | -| `onSuccess` _[DeletionPolicy](#deletionpolicy)_ | OnSuccess is the deletion policy for a successful RayJob.
Deprecated: Use `deletionRules` instead for more flexible, multi-stage deletion strategies.
This field will be removed in a future release. | | | -| `onFailure` _[DeletionPolicy](#deletionpolicy)_ | OnFailure is the deletion policy for a failed RayJob.
Deprecated: Use `deletionRules` instead for more flexible, multi-stage deletion strategies.
This field will be removed in a future release. | | | +| `onSuccess` _[DeletionPolicy](#deletionpolicy)_ | OnSuccess is the deletion policy for a successful RayJob.
Deprecated: Use `deletionRules` instead for more flexible, multi-stage deletion strategies.
This field will be removed in release 1.16.0. | | | +| `onFailure` _[DeletionPolicy](#deletionpolicy)_ | OnFailure is the deletion policy for a failed RayJob.
Deprecated: Use `deletionRules` instead for more flexible, multi-stage deletion strategies.
This field will be removed in release 1.16.0. | | | | `deletionRules` _[DeletionRule](#deletionrule) array_ | DeletionRules is a list of deletion rules, processed based on their trigger conditions.
While the rules can be used to define a sequence, if multiple rules are overdue (e.g., due to controller downtime),
the most impactful rule (e.g., DeleteCluster) will be executed first to prioritize resource cleanup and cost savings. | | | diff --git a/helm-chart/kuberay-operator/crds/ray.io_rayjobs.yaml b/helm-chart/kuberay-operator/crds/ray.io_rayjobs.yaml index 15e15996f7b..8ee2bc5ce4d 100644 --- a/helm-chart/kuberay-operator/crds/ray.io_rayjobs.yaml +++ b/helm-chart/kuberay-operator/crds/ray.io_rayjobs.yaml @@ -70,7 +70,7 @@ spec: - SUCCEEDED - FAILED type: string - ttlSecondsAfterFinished: + ttlSeconds: default: 0 format: int32 minimum: 0 diff --git a/ray-operator/apis/ray/v1/rayjob_types.go b/ray-operator/apis/ray/v1/rayjob_types.go index d1ec95f8c8d..01317066937 100644 --- a/ray-operator/apis/ray/v1/rayjob_types.go +++ b/ray-operator/apis/ray/v1/rayjob_types.go @@ -90,6 +90,7 @@ const ( // // Legacy fields `onSuccess` and `onFailure` are still supported for backward compatibility, // but it is highly recommended to migrate to the new `deletionRules` field. +// `onSuccess` and `onFailure` will be removed in release 1.16.0. // // Notes: // - When this block is set, you must configure **either** @@ -109,13 +110,13 @@ const ( type DeletionStrategy struct { // OnSuccess is the deletion policy for a successful RayJob. // Deprecated: Use `deletionRules` instead for more flexible, multi-stage deletion strategies. - // This field will be removed in a future release. + // This field will be removed in release 1.16.0. // +optional OnSuccess *DeletionPolicy `json:"onSuccess,omitempty"` // OnFailure is the deletion policy for a failed RayJob. // Deprecated: Use `deletionRules` instead for more flexible, multi-stage deletion strategies. - // This field will be removed in a future release. + // This field will be removed in release 1.16.0. // +optional OnFailure *DeletionPolicy `json:"onFailure,omitempty"` @@ -145,13 +146,13 @@ type DeletionCondition struct { // +kubebuilder:validation:Enum=SUCCEEDED;FAILED JobStatus JobStatus `json:"jobStatus"` - // TTLSecondsAfterFinished is the time in seconds from when the JobStatus + // TTLSeconds is the time in seconds from when the JobStatus // reaches the specified terminal state to when this deletion action should be triggered. // The value must be a non-negative integer. // +kubebuilder:default=0 // +kubebuilder:validation:Minimum=0 // +optional - TTLSecondsAfterFinished int32 `json:"ttlSecondsAfterFinished,omitempty"` + TTLSeconds int32 `json:"ttlSeconds,omitempty"` } // DeletionPolicy is the legacy single-stage deletion policy. diff --git a/ray-operator/config/crd/bases/ray.io_rayjobs.yaml b/ray-operator/config/crd/bases/ray.io_rayjobs.yaml index 15e15996f7b..8ee2bc5ce4d 100644 --- a/ray-operator/config/crd/bases/ray.io_rayjobs.yaml +++ b/ray-operator/config/crd/bases/ray.io_rayjobs.yaml @@ -70,7 +70,7 @@ spec: - SUCCEEDED - FAILED type: string - ttlSecondsAfterFinished: + ttlSeconds: default: 0 format: int32 minimum: 0 diff --git a/ray-operator/config/samples/ray-job.deletion-rules.yaml b/ray-operator/config/samples/ray-job.deletion-rules.yaml new file mode 100644 index 00000000000..f6605e274c3 --- /dev/null +++ b/ray-operator/config/samples/ray-job.deletion-rules.yaml @@ -0,0 +1,180 @@ +apiVersion: ray.io/v1 +kind: RayJob +metadata: + name: rayjob-sample +spec: + # submissionMode specifies how RayJob submits the Ray job to the RayCluster. + # The default value is "K8sJobMode", meaning RayJob will submit the Ray job via a submitter Kubernetes Job. 
+ # The alternative value is "HTTPMode", indicating that KubeRay will submit the Ray job by sending an HTTP request to the RayCluster. + # submissionMode: "K8sJobMode" + entrypoint: python /home/ray/samples/sample_code.py + # DeletionStrategy defines the deletion policies for a RayJob. + # It allows for fine-grained control over resource cleanup after a job finishes. + # DeletionRules is a list of deletion rules, processed based on their trigger conditions. + # While the rules can be used to define a sequence, if multiple rules are overdue (e.g., due to controller downtime), + # the most impactful rule (e.g., DeleteCluster) will be executed first to prioritize resource cleanup and cost savings. + deletionStrategy: + # This sample demonstrates a staged cleanup process for a RayJob. + # Regardless of whether the job succeeds or fails, the cleanup follows these steps: + # 1. After 30 seconds, the worker pods are deleted. This allows for quick resource release while keeping the head pod for debugging. + # 2. After 60 seconds, the entire RayCluster (including the head pod) is deleted. + # 3. After 90 seconds, the RayJob custom resource itself is deleted, removing it from the Kubernetes API server. + deletionRules: + - condition: + jobStatus: FAILED + ttlSeconds: 30 + policy: DeleteWorkers + - condition: + jobStatus: FAILED + ttlSeconds: 60 + policy: DeleteCluster + - condition: + jobStatus: FAILED + ttlSeconds: 90 + policy: DeleteSelf + - condition: + jobStatus: SUCCEEDED + ttlSeconds: 30 + policy: DeleteWorkers + - condition: + jobStatus: SUCCEEDED + ttlSeconds: 60 + policy: DeleteCluster + - condition: + jobStatus: SUCCEEDED + ttlSeconds: 90 + policy: DeleteSelf + + # activeDeadlineSeconds is the duration in seconds that the RayJob may be active before + # KubeRay actively tries to terminate the RayJob; value must be positive integer. + # activeDeadlineSeconds: 120 + + # RuntimeEnvYAML represents the runtime environment configuration provided as a multi-line YAML string. + # See https://docs.ray.io/en/latest/ray-core/handling-dependencies.html for details. + # (New in KubeRay version 1.0.) + runtimeEnvYAML: | + pip: + - requests==2.26.0 + - pendulum==2.1.2 + env_vars: + counter_name: "test_counter" + + # Suspend specifies whether the RayJob controller should create a RayCluster instance. + # If a job is applied with the suspend field set to true, the RayCluster will not be created and we will wait for the transition to false. + # If the RayCluster is already created, it will be deleted. In the case of transition to false, a new RayCluster will be created. + # suspend: false + + # rayClusterSpec specifies the RayCluster instance to be created by the RayJob controller. + rayClusterSpec: + rayVersion: '2.46.0' # should match the Ray version in the image of the containers + # Ray head pod template + headGroupSpec: + # The `rayStartParams` are used to configure the `ray start` command. + # See https://github.com/ray-project/kuberay/blob/master/docs/guidance/rayStartParams.md for the default settings of `rayStartParams` in KubeRay. + # See https://docs.ray.io/en/latest/cluster/cli.html#ray-start for all available options in `rayStartParams`. 
+ rayStartParams: {} + #pod template + template: + spec: + containers: + - name: ray-head + image: rayproject/ray:2.46.0 + ports: + - containerPort: 6379 + name: gcs-server + - containerPort: 8265 # Ray dashboard + name: dashboard + - containerPort: 10001 + name: client + resources: + limits: + cpu: "1" + requests: + cpu: "200m" + volumeMounts: + - mountPath: /home/ray/samples + name: code-sample + volumes: + # You set volumes at the Pod level, then mount them into containers inside that Pod + - name: code-sample + configMap: + # Provide the name of the ConfigMap you want to mount. + name: ray-job-code-sample + # An array of keys from the ConfigMap to create as files + items: + - key: sample_code.py + path: sample_code.py + workerGroupSpecs: + # the pod replicas in this group typed worker + - replicas: 1 + minReplicas: 1 + maxReplicas: 5 + # logical group name, for this called small-group, also can be functional + groupName: small-group + # The `rayStartParams` are used to configure the `ray start` command. + # See https://github.com/ray-project/kuberay/blob/master/docs/guidance/rayStartParams.md for the default settings of `rayStartParams` in KubeRay. + # See https://docs.ray.io/en/latest/cluster/cli.html#ray-start for all available options in `rayStartParams`. + rayStartParams: {} + #pod template + template: + spec: + containers: + - name: ray-worker # must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name', or '123-abc' + image: rayproject/ray:2.46.0 + resources: + limits: + cpu: "1" + requests: + cpu: "200m" + + # SubmitterPodTemplate is the template for the pod that will run the `ray job submit` command against the RayCluster. + # If SubmitterPodTemplate is specified, the first container is assumed to be the submitter container. + # submitterPodTemplate: + # spec: + # restartPolicy: Never + # containers: + # - name: my-custom-rayjob-submitter-pod + # image: rayproject/ray:2.46.0 + # # If Command is not specified, the correct command will be supplied at runtime using the RayJob spec `entrypoint` field. + # # Specifying Command is not recommended. + # # command: ["sh", "-c", "ray job submit --address=http://$RAY_DASHBOARD_ADDRESS --submission-id=$RAY_JOB_SUBMISSION_ID -- echo hello world"] + + +######################Ray code sample################################# +# this sample is from https://docs.ray.io/en/latest/cluster/job-submission.html#quick-start-example +# it is mounted into the container and executed to show the Ray job at work +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: ray-job-code-sample +data: + sample_code.py: | + import ray + import os + import requests + + ray.init() + + @ray.remote + class Counter: + def __init__(self): + # Used to verify runtimeEnv + self.name = os.getenv("counter_name") + assert self.name == "test_counter" + self.counter = 0 + + def inc(self): + self.counter += 1 + + def get_counter(self): + return "{} got {}".format(self.name, self.counter) + + counter = Counter.remote() + + for _ in range(5): + ray.get(counter.inc.remote()) + print(ray.get(counter.get_counter.remote())) + + # Verify that the correct runtime env was used for the job. 
+ assert requests.__version__ == "2.26.0" diff --git a/ray-operator/controllers/ray/rayjob_controller.go b/ray-operator/controllers/ray/rayjob_controller.go index f292522a030..43c0f4c3ae4 100644 --- a/ray-operator/controllers/ray/rayjob_controller.go +++ b/ray-operator/controllers/ray/rayjob_controller.go @@ -1071,7 +1071,7 @@ func (r *RayJobReconciler) handleDeletionRules(ctx context.Context, rayJob *rayv continue } - deletionTime := rayJob.Status.EndTime.Add(time.Duration(rule.Condition.TTLSecondsAfterFinished) * time.Second) + deletionTime := rayJob.Status.EndTime.Add(time.Duration(rule.Condition.TTLSeconds) * time.Second) // Track the earliest requeue time to re-check later. if nowTime.Before(deletionTime) { if nextRequeueTime == nil || deletionTime.Before(*nextRequeueTime) { diff --git a/ray-operator/controllers/ray/rayjob_controller_test.go b/ray-operator/controllers/ray/rayjob_controller_test.go index c348932260d..75cb9edb75a 100644 --- a/ray-operator/controllers/ray/rayjob_controller_test.go +++ b/ray-operator/controllers/ray/rayjob_controller_test.go @@ -3192,22 +3192,22 @@ var _ = Context("RayJob with different submission modes", func() { { Policy: rayv1.DeleteWorkers, Condition: rayv1.DeletionCondition{ - JobStatus: rayv1.JobStatusSucceeded, - TTLSecondsAfterFinished: 0, + JobStatus: rayv1.JobStatusSucceeded, + TTLSeconds: 0, }, }, { Policy: rayv1.DeleteCluster, Condition: rayv1.DeletionCondition{ - JobStatus: rayv1.JobStatusSucceeded, - TTLSecondsAfterFinished: 0, + JobStatus: rayv1.JobStatusSucceeded, + TTLSeconds: 0, }, }, { Policy: rayv1.DeleteSelf, Condition: rayv1.DeletionCondition{ - JobStatus: rayv1.JobStatusSucceeded, - TTLSecondsAfterFinished: 0, + JobStatus: rayv1.JobStatusSucceeded, + TTLSeconds: 0, }, }, }, @@ -3220,22 +3220,22 @@ var _ = Context("RayJob with different submission modes", func() { { Policy: rayv1.DeleteWorkers, Condition: rayv1.DeletionCondition{ - JobStatus: rayv1.JobStatusSucceeded, - TTLSecondsAfterFinished: 0, + JobStatus: rayv1.JobStatusSucceeded, + TTLSeconds: 0, }, }, { Policy: rayv1.DeleteCluster, Condition: rayv1.DeletionCondition{ - JobStatus: rayv1.JobStatusSucceeded, - TTLSecondsAfterFinished: 0, + JobStatus: rayv1.JobStatusSucceeded, + TTLSeconds: 0, }, }, { Policy: rayv1.DeleteSelf, Condition: rayv1.DeletionCondition{ - JobStatus: rayv1.JobStatusSucceeded, - TTLSecondsAfterFinished: 0, + JobStatus: rayv1.JobStatusSucceeded, + TTLSeconds: 0, }, }, }, @@ -3351,22 +3351,22 @@ var _ = Context("RayJob with different submission modes", func() { { Policy: rayv1.DeleteWorkers, Condition: rayv1.DeletionCondition{ - JobStatus: rayv1.JobStatusFailed, - TTLSecondsAfterFinished: 0, + JobStatus: rayv1.JobStatusFailed, + TTLSeconds: 0, }, }, { Policy: rayv1.DeleteCluster, Condition: rayv1.DeletionCondition{ - JobStatus: rayv1.JobStatusFailed, - TTLSecondsAfterFinished: 0, + JobStatus: rayv1.JobStatusFailed, + TTLSeconds: 0, }, }, { Policy: rayv1.DeleteSelf, Condition: rayv1.DeletionCondition{ - JobStatus: rayv1.JobStatusFailed, - TTLSecondsAfterFinished: 0, + JobStatus: rayv1.JobStatusFailed, + TTLSeconds: 0, }, }, }, @@ -3379,22 +3379,22 @@ var _ = Context("RayJob with different submission modes", func() { { Policy: rayv1.DeleteWorkers, Condition: rayv1.DeletionCondition{ - JobStatus: rayv1.JobStatusFailed, - TTLSecondsAfterFinished: 0, + JobStatus: rayv1.JobStatusFailed, + TTLSeconds: 0, }, }, { Policy: rayv1.DeleteCluster, Condition: rayv1.DeletionCondition{ - JobStatus: rayv1.JobStatusFailed, - TTLSecondsAfterFinished: 0, + JobStatus: 
rayv1.JobStatusFailed, + TTLSeconds: 0, }, }, { Policy: rayv1.DeleteSelf, Condition: rayv1.DeletionCondition{ - JobStatus: rayv1.JobStatusFailed, - TTLSecondsAfterFinished: 0, + JobStatus: rayv1.JobStatusFailed, + TTLSeconds: 0, }, }, }, @@ -3510,22 +3510,22 @@ var _ = Context("RayJob with different submission modes", func() { { Policy: rayv1.DeleteWorkers, Condition: rayv1.DeletionCondition{ - JobStatus: rayv1.JobStatusSucceeded, - TTLSecondsAfterFinished: 0, // Stage 1: Delete workers after 0 seconds + JobStatus: rayv1.JobStatusSucceeded, + TTLSeconds: 0, // Stage 1: Delete workers after 0 seconds }, }, { Policy: rayv1.DeleteCluster, Condition: rayv1.DeletionCondition{ - JobStatus: rayv1.JobStatusSucceeded, - TTLSecondsAfterFinished: 5, // Stage 2: Delete cluster after 5 seconds + JobStatus: rayv1.JobStatusSucceeded, + TTLSeconds: 5, // Stage 2: Delete cluster after 5 seconds }, }, { Policy: rayv1.DeleteSelf, Condition: rayv1.DeletionCondition{ - JobStatus: rayv1.JobStatusSucceeded, - TTLSecondsAfterFinished: 10, // Stage 3: Delete self after 10 seconds + JobStatus: rayv1.JobStatusSucceeded, + TTLSeconds: 10, // Stage 3: Delete self after 10 seconds }, }, }, @@ -3538,22 +3538,22 @@ var _ = Context("RayJob with different submission modes", func() { { Policy: rayv1.DeleteWorkers, Condition: rayv1.DeletionCondition{ - JobStatus: rayv1.JobStatusSucceeded, - TTLSecondsAfterFinished: 0, + JobStatus: rayv1.JobStatusSucceeded, + TTLSeconds: 0, }, }, { Policy: rayv1.DeleteCluster, Condition: rayv1.DeletionCondition{ - JobStatus: rayv1.JobStatusSucceeded, - TTLSecondsAfterFinished: 5, + JobStatus: rayv1.JobStatusSucceeded, + TTLSeconds: 5, }, }, { Policy: rayv1.DeleteSelf, Condition: rayv1.DeletionCondition{ - JobStatus: rayv1.JobStatusSucceeded, - TTLSecondsAfterFinished: 10, + JobStatus: rayv1.JobStatusSucceeded, + TTLSeconds: 10, }, }, }, @@ -3718,22 +3718,22 @@ var _ = Context("RayJob with different submission modes", func() { { Policy: rayv1.DeleteWorkers, Condition: rayv1.DeletionCondition{ - JobStatus: rayv1.JobStatusFailed, - TTLSecondsAfterFinished: 0, // Stage 1: Delete workers after 0 seconds + JobStatus: rayv1.JobStatusFailed, + TTLSeconds: 0, // Stage 1: Delete workers after 0 seconds }, }, { Policy: rayv1.DeleteCluster, Condition: rayv1.DeletionCondition{ - JobStatus: rayv1.JobStatusFailed, - TTLSecondsAfterFinished: 5, // Stage 2: Delete cluster after 5 seconds + JobStatus: rayv1.JobStatusFailed, + TTLSeconds: 5, // Stage 2: Delete cluster after 5 seconds }, }, { Policy: rayv1.DeleteSelf, Condition: rayv1.DeletionCondition{ - JobStatus: rayv1.JobStatusFailed, - TTLSecondsAfterFinished: 10, // Stage 3: Delete self after 10 seconds + JobStatus: rayv1.JobStatusFailed, + TTLSeconds: 10, // Stage 3: Delete self after 10 seconds }, }, }, @@ -3746,22 +3746,22 @@ var _ = Context("RayJob with different submission modes", func() { { Policy: rayv1.DeleteWorkers, Condition: rayv1.DeletionCondition{ - JobStatus: rayv1.JobStatusFailed, - TTLSecondsAfterFinished: 0, + JobStatus: rayv1.JobStatusFailed, + TTLSeconds: 0, }, }, { Policy: rayv1.DeleteCluster, Condition: rayv1.DeletionCondition{ - JobStatus: rayv1.JobStatusFailed, - TTLSecondsAfterFinished: 5, + JobStatus: rayv1.JobStatusFailed, + TTLSeconds: 5, }, }, { Policy: rayv1.DeleteSelf, Condition: rayv1.DeletionCondition{ - JobStatus: rayv1.JobStatusFailed, - TTLSecondsAfterFinished: 10, + JobStatus: rayv1.JobStatusFailed, + TTLSeconds: 10, }, }, }, diff --git a/ray-operator/controllers/ray/utils/validation.go 
b/ray-operator/controllers/ray/utils/validation.go index 7e66aec8f41..26ac8db2cc8 100644 --- a/ray-operator/controllers/ray/utils/validation.go +++ b/ray-operator/controllers/ray/utils/validation.go @@ -308,7 +308,7 @@ func validateDeletionRules(rayJob *rayv1.RayJob) error { // Single pass: Validate each rule individually and group for later consistency checks. for i, rule := range rules { // Validate TTL is non-negative. - if rule.Condition.TTLSecondsAfterFinished < 0 { + if rule.Condition.TTLSeconds < 0 { errs = append(errs, fmt.Errorf("deletionRules[%d]: TTLSecondsAfterFinished must be non-negative", i)) continue } @@ -325,19 +325,19 @@ func validateDeletionRules(rayJob *rayv1.RayJob) error { } // Group valid rule for consistency check. - statusMap, ok := rulesByStatus[rule.Condition.JobStatus] + policyTTLs, ok := rulesByStatus[rule.Condition.JobStatus] if !ok { - statusMap = make(map[rayv1.DeletionPolicyType]int32) - rulesByStatus[rule.Condition.JobStatus] = statusMap + policyTTLs = make(map[rayv1.DeletionPolicyType]int32) + rulesByStatus[rule.Condition.JobStatus] = policyTTLs } // Check for uniqueness of (JobStatus, DeletionPolicyType) pair. - if _, exists := statusMap[rule.Policy]; exists { + if _, exists := policyTTLs[rule.Policy]; exists { errs = append(errs, fmt.Errorf("deletionRules[%d]: duplicate rule for DeletionPolicyType '%s' and JobStatus '%s'", i, rule.Policy, rule.Condition.JobStatus)) continue } - statusMap[rule.Policy] = rule.Condition.TTLSecondsAfterFinished + policyTTLs[rule.Policy] = rule.Condition.TTLSeconds } // Second pass: Validate TTL consistency per JobStatus. diff --git a/ray-operator/controllers/ray/utils/validation_test.go b/ray-operator/controllers/ray/utils/validation_test.go index e298d2d45b5..2d69ae13c46 100644 --- a/ray-operator/controllers/ray/utils/validation_test.go +++ b/ray-operator/controllers/ray/utils/validation_test.go @@ -1079,8 +1079,8 @@ func TestValidateRayJobSpecWithFeatureGate(t *testing.T) { { Policy: rayv1.DeleteSelf, Condition: rayv1.DeletionCondition{ - JobStatus: rayv1.JobStatusSucceeded, - TTLSecondsAfterFinished: 10, + JobStatus: rayv1.JobStatusSucceeded, + TTLSeconds: 10, }, }, }, @@ -1098,8 +1098,8 @@ func TestValidateRayJobSpecWithFeatureGate(t *testing.T) { { Policy: rayv1.DeleteSelf, Condition: rayv1.DeletionCondition{ - JobStatus: rayv1.JobStatusSucceeded, - TTLSecondsAfterFinished: 10, + JobStatus: rayv1.JobStatusSucceeded, + TTLSeconds: 10, }, }, }, @@ -1119,8 +1119,8 @@ func TestValidateRayJobSpecWithFeatureGate(t *testing.T) { { Policy: rayv1.DeleteSelf, Condition: rayv1.DeletionCondition{ - JobStatus: rayv1.JobStatusSucceeded, - TTLSecondsAfterFinished: 10, + JobStatus: rayv1.JobStatusSucceeded, + TTLSeconds: 10, }, }, }, @@ -1145,15 +1145,15 @@ func TestValidateRayJobSpecWithFeatureGate(t *testing.T) { { Policy: rayv1.DeleteSelf, Condition: rayv1.DeletionCondition{ - JobStatus: rayv1.JobStatusSucceeded, - TTLSecondsAfterFinished: 10, + JobStatus: rayv1.JobStatusSucceeded, + TTLSeconds: 10, }, }, { Policy: rayv1.DeleteSelf, Condition: rayv1.DeletionCondition{ - JobStatus: rayv1.JobStatusSucceeded, - TTLSecondsAfterFinished: 20, + JobStatus: rayv1.JobStatusSucceeded, + TTLSeconds: 20, }, }, }, @@ -1170,8 +1170,8 @@ func TestValidateRayJobSpecWithFeatureGate(t *testing.T) { { Policy: rayv1.DeleteSelf, Condition: rayv1.DeletionCondition{ - JobStatus: rayv1.JobStatusSucceeded, - TTLSecondsAfterFinished: -10, + JobStatus: rayv1.JobStatusSucceeded, + TTLSeconds: -10, }, }, }, @@ -1189,8 +1189,8 @@ func 
TestValidateRayJobSpecWithFeatureGate(t *testing.T) { { Policy: rayv1.DeleteCluster, Condition: rayv1.DeletionCondition{ - JobStatus: rayv1.JobStatusSucceeded, - TTLSecondsAfterFinished: 10, + JobStatus: rayv1.JobStatusSucceeded, + TTLSeconds: 10, }, }, }, @@ -1206,8 +1206,8 @@ func TestValidateRayJobSpecWithFeatureGate(t *testing.T) { { Policy: rayv1.DeleteWorkers, Condition: rayv1.DeletionCondition{ - JobStatus: rayv1.JobStatusSucceeded, - TTLSecondsAfterFinished: 10, + JobStatus: rayv1.JobStatusSucceeded, + TTLSeconds: 10, }, }, }, @@ -1227,15 +1227,15 @@ func TestValidateRayJobSpecWithFeatureGate(t *testing.T) { { Policy: rayv1.DeleteWorkers, Condition: rayv1.DeletionCondition{ - JobStatus: rayv1.JobStatusSucceeded, - TTLSecondsAfterFinished: 20, + JobStatus: rayv1.JobStatusSucceeded, + TTLSeconds: 20, }, }, { Policy: rayv1.DeleteCluster, Condition: rayv1.DeletionCondition{ - JobStatus: rayv1.JobStatusSucceeded, - TTLSecondsAfterFinished: 10, + JobStatus: rayv1.JobStatusSucceeded, + TTLSeconds: 10, }, }, }, @@ -1252,15 +1252,15 @@ func TestValidateRayJobSpecWithFeatureGate(t *testing.T) { { Policy: rayv1.DeleteCluster, Condition: rayv1.DeletionCondition{ - JobStatus: rayv1.JobStatusSucceeded, - TTLSecondsAfterFinished: 20, + JobStatus: rayv1.JobStatusSucceeded, + TTLSeconds: 20, }, }, { Policy: rayv1.DeleteSelf, Condition: rayv1.DeletionCondition{ - JobStatus: rayv1.JobStatusSucceeded, - TTLSecondsAfterFinished: 10, + JobStatus: rayv1.JobStatusSucceeded, + TTLSeconds: 10, }, }, }, @@ -1277,29 +1277,29 @@ func TestValidateRayJobSpecWithFeatureGate(t *testing.T) { { Policy: rayv1.DeleteWorkers, Condition: rayv1.DeletionCondition{ - JobStatus: rayv1.JobStatusSucceeded, - TTLSecondsAfterFinished: 10, + JobStatus: rayv1.JobStatusSucceeded, + TTLSeconds: 10, }, }, { Policy: rayv1.DeleteCluster, Condition: rayv1.DeletionCondition{ - JobStatus: rayv1.JobStatusSucceeded, - TTLSecondsAfterFinished: 20, + JobStatus: rayv1.JobStatusSucceeded, + TTLSeconds: 20, }, }, { Policy: rayv1.DeleteSelf, Condition: rayv1.DeletionCondition{ - JobStatus: rayv1.JobStatusSucceeded, - TTLSecondsAfterFinished: 30, + JobStatus: rayv1.JobStatusSucceeded, + TTLSeconds: 30, }, }, { Policy: rayv1.DeleteSelf, Condition: rayv1.DeletionCondition{ - JobStatus: rayv1.JobStatusFailed, - TTLSecondsAfterFinished: 0, + JobStatus: rayv1.JobStatusFailed, + TTLSeconds: 0, }, }, }, diff --git a/ray-operator/pkg/client/applyconfiguration/ray/v1/deletioncondition.go b/ray-operator/pkg/client/applyconfiguration/ray/v1/deletioncondition.go index 25e1a881dbb..36b8c006209 100644 --- a/ray-operator/pkg/client/applyconfiguration/ray/v1/deletioncondition.go +++ b/ray-operator/pkg/client/applyconfiguration/ray/v1/deletioncondition.go @@ -9,8 +9,8 @@ import ( // DeletionConditionApplyConfiguration represents a declarative configuration of the DeletionCondition type for use // with apply. 
type DeletionConditionApplyConfiguration struct { - JobStatus *rayv1.JobStatus `json:"jobStatus,omitempty"` - TTLSecondsAfterFinished *int32 `json:"ttlSecondsAfterFinished,omitempty"` + JobStatus *rayv1.JobStatus `json:"jobStatus,omitempty"` + TTLSeconds *int32 `json:"ttlSeconds,omitempty"` } // DeletionConditionApplyConfiguration constructs a declarative configuration of the DeletionCondition type for use with @@ -27,10 +27,10 @@ func (b *DeletionConditionApplyConfiguration) WithJobStatus(value rayv1.JobStatu return b } -// WithTTLSecondsAfterFinished sets the TTLSecondsAfterFinished field in the declarative configuration to the given value +// WithTTLSeconds sets the TTLSeconds field in the declarative configuration to the given value // and returns the receiver, so that objects can be built by chaining "With" function invocations. -// If called multiple times, the TTLSecondsAfterFinished field is set to the value of the last call. -func (b *DeletionConditionApplyConfiguration) WithTTLSecondsAfterFinished(value int32) *DeletionConditionApplyConfiguration { - b.TTLSecondsAfterFinished = &value +// If called multiple times, the TTLSeconds field is set to the value of the last call. +func (b *DeletionConditionApplyConfiguration) WithTTLSeconds(value int32) *DeletionConditionApplyConfiguration { + b.TTLSeconds = &value return b } From e952550a3e150ffb99322c3a81adb1550469e0e2 Mon Sep 17 00:00:00 2001 From: wei-chenglai Date: Wed, 17 Sep 2025 00:35:40 +0000 Subject: [PATCH 07/21] fix typo --- .../samples/ray-job.deletion-rules.yaml | 119 ++---------------- 1 file changed, 12 insertions(+), 107 deletions(-) diff --git a/ray-operator/config/samples/ray-job.deletion-rules.yaml b/ray-operator/config/samples/ray-job.deletion-rules.yaml index f6605e274c3..89112b14150 100644 --- a/ray-operator/config/samples/ray-job.deletion-rules.yaml +++ b/ray-operator/config/samples/ray-job.deletion-rules.yaml @@ -1,13 +1,14 @@ apiVersion: ray.io/v1 kind: RayJob metadata: - name: rayjob-sample + name: rayjob-deletion-rules spec: - # submissionMode specifies how RayJob submits the Ray job to the RayCluster. - # The default value is "K8sJobMode", meaning RayJob will submit the Ray job via a submitter Kubernetes Job. - # The alternative value is "HTTPMode", indicating that KubeRay will submit the Ray job by sending an HTTP request to the RayCluster. - # submissionMode: "K8sJobMode" - entrypoint: python /home/ray/samples/sample_code.py + entrypoint: | + python -c " + import ray + ray.init() + print(f'ray.cluster_resources(): {ray.cluster_resources()}') + " # DeletionStrategy defines the deletion policies for a RayJob. # It allows for fine-grained control over resource cleanup after a job finishes. # DeletionRules is a list of deletion rules, processed based on their trigger conditions. @@ -20,7 +21,7 @@ spec: # 2. After 60 seconds, the entire RayCluster (including the head pod) is deleted. # 3. After 90 seconds, the RayJob custom resource itself is deleted, removing it from the Kubernetes API server. deletionRules: - - condition: + - condition: jobStatus: FAILED ttlSeconds: 30 policy: DeleteWorkers @@ -32,7 +33,7 @@ spec: jobStatus: FAILED ttlSeconds: 90 policy: DeleteSelf - - condition: + - condition: jobStatus: SUCCEEDED ttlSeconds: 30 policy: DeleteWorkers @@ -44,36 +45,11 @@ spec: jobStatus: SUCCEEDED ttlSeconds: 90 policy: DeleteSelf - - # activeDeadlineSeconds is the duration in seconds that the RayJob may be active before - # KubeRay actively tries to terminate the RayJob; value must be positive integer. 
- # activeDeadlineSeconds: 120 - - # RuntimeEnvYAML represents the runtime environment configuration provided as a multi-line YAML string. - # See https://docs.ray.io/en/latest/ray-core/handling-dependencies.html for details. - # (New in KubeRay version 1.0.) - runtimeEnvYAML: | - pip: - - requests==2.26.0 - - pendulum==2.1.2 - env_vars: - counter_name: "test_counter" - - # Suspend specifies whether the RayJob controller should create a RayCluster instance. - # If a job is applied with the suspend field set to true, the RayCluster will not be created and we will wait for the transition to false. - # If the RayCluster is already created, it will be deleted. In the case of transition to false, a new RayCluster will be created. - # suspend: false - # rayClusterSpec specifies the RayCluster instance to be created by the RayJob controller. rayClusterSpec: - rayVersion: '2.46.0' # should match the Ray version in the image of the containers - # Ray head pod template + rayVersion: '2.46.0' headGroupSpec: - # The `rayStartParams` are used to configure the `ray start` command. - # See https://github.com/ray-project/kuberay/blob/master/docs/guidance/rayStartParams.md for the default settings of `rayStartParams` in KubeRay. - # See https://docs.ray.io/en/latest/cluster/cli.html#ray-start for all available options in `rayStartParams`. rayStartParams: {} - #pod template template: spec: containers: @@ -82,7 +58,7 @@ spec: ports: - containerPort: 6379 name: gcs-server - - containerPort: 8265 # Ray dashboard + - containerPort: 8265 name: dashboard - containerPort: 10001 name: client @@ -91,90 +67,19 @@ spec: cpu: "1" requests: cpu: "200m" - volumeMounts: - - mountPath: /home/ray/samples - name: code-sample - volumes: - # You set volumes at the Pod level, then mount them into containers inside that Pod - - name: code-sample - configMap: - # Provide the name of the ConfigMap you want to mount. - name: ray-job-code-sample - # An array of keys from the ConfigMap to create as files - items: - - key: sample_code.py - path: sample_code.py workerGroupSpecs: - # the pod replicas in this group typed worker - replicas: 1 minReplicas: 1 maxReplicas: 5 - # logical group name, for this called small-group, also can be functional groupName: small-group - # The `rayStartParams` are used to configure the `ray start` command. - # See https://github.com/ray-project/kuberay/blob/master/docs/guidance/rayStartParams.md for the default settings of `rayStartParams` in KubeRay. - # See https://docs.ray.io/en/latest/cluster/cli.html#ray-start for all available options in `rayStartParams`. rayStartParams: {} - #pod template template: spec: containers: - - name: ray-worker # must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name', or '123-abc' + - name: ray-worker image: rayproject/ray:2.46.0 resources: limits: cpu: "1" requests: cpu: "200m" - - # SubmitterPodTemplate is the template for the pod that will run the `ray job submit` command against the RayCluster. - # If SubmitterPodTemplate is specified, the first container is assumed to be the submitter container. - # submitterPodTemplate: - # spec: - # restartPolicy: Never - # containers: - # - name: my-custom-rayjob-submitter-pod - # image: rayproject/ray:2.46.0 - # # If Command is not specified, the correct command will be supplied at runtime using the RayJob spec `entrypoint` field. - # # Specifying Command is not recommended. 
- # # command: ["sh", "-c", "ray job submit --address=http://$RAY_DASHBOARD_ADDRESS --submission-id=$RAY_JOB_SUBMISSION_ID -- echo hello world"] - - -######################Ray code sample################################# -# this sample is from https://docs.ray.io/en/latest/cluster/job-submission.html#quick-start-example -# it is mounted into the container and executed to show the Ray job at work ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: ray-job-code-sample -data: - sample_code.py: | - import ray - import os - import requests - - ray.init() - - @ray.remote - class Counter: - def __init__(self): - # Used to verify runtimeEnv - self.name = os.getenv("counter_name") - assert self.name == "test_counter" - self.counter = 0 - - def inc(self): - self.counter += 1 - - def get_counter(self): - return "{} got {}".format(self.name, self.counter) - - counter = Counter.remote() - - for _ in range(5): - ray.get(counter.inc.remote()) - print(ray.get(counter.get_counter.remote())) - - # Verify that the correct runtime env was used for the job. - assert requests.__version__ == "2.26.0" From a81ffa40887f855e11ce42463e89bbaed1213708 Mon Sep 17 00:00:00 2001 From: wei-chenglai Date: Tue, 16 Sep 2025 21:37:27 -0400 Subject: [PATCH 08/21] modify comment --- docs/reference/api.md | 2 +- ray-operator/apis/ray/v1/rayjob_types.go | 2 +- .../controllers/ray/utils/validation.go | 2 +- .../controllers/ray/utils/validation_test.go | 20 ++++++++++++++++++- 4 files changed, 22 insertions(+), 4 deletions(-) diff --git a/docs/reference/api.md b/docs/reference/api.md index ec300a8e342..e951c3cb474 100644 --- a/docs/reference/api.md +++ b/docs/reference/api.md @@ -157,7 +157,7 @@ _Appears in:_ | --- | --- | --- | --- | | `onSuccess` _[DeletionPolicy](#deletionpolicy)_ | OnSuccess is the deletion policy for a successful RayJob.
Deprecated: Use `deletionRules` instead for more flexible, multi-stage deletion strategies.
This field will be removed in release 1.16.0. | | | | `onFailure` _[DeletionPolicy](#deletionpolicy)_ | OnFailure is the deletion policy for a failed RayJob.
Deprecated: Use `deletionRules` instead for more flexible, multi-stage deletion strategies.
This field will be removed in release 1.16.0. | | | -| `deletionRules` _[DeletionRule](#deletionrule) array_ | DeletionRules is a list of deletion rules, processed based on their trigger conditions.
While the rules can be used to define a sequence, if multiple rules are overdue (e.g., due to controller downtime),
the most impactful rule (e.g., DeleteCluster) will be executed first to prioritize resource cleanup and cost savings. | | | +| `deletionRules` _[DeletionRule](#deletionrule) array_ | DeletionRules is a list of deletion rules, processed based on their trigger conditions.
While the rules can be used to define a sequence, if multiple rules are overdue (e.g., due to controller downtime),
the most impactful rule (e.g., DeleteSelf) will be executed first to prioritize resource cleanup and cost savings. | | | diff --git a/ray-operator/apis/ray/v1/rayjob_types.go b/ray-operator/apis/ray/v1/rayjob_types.go index a60e30aa9b1..3314ac463bc 100644 --- a/ray-operator/apis/ray/v1/rayjob_types.go +++ b/ray-operator/apis/ray/v1/rayjob_types.go @@ -124,7 +124,7 @@ type DeletionStrategy struct { // DeletionRules is a list of deletion rules, processed based on their trigger conditions. // While the rules can be used to define a sequence, if multiple rules are overdue (e.g., due to controller downtime), - // the most impactful rule (e.g., DeleteCluster) will be executed first to prioritize resource cleanup and cost savings. + // the most impactful rule (e.g., DeleteSelf) will be executed first to prioritize resource cleanup and cost savings. // +optional // +listType=atomic DeletionRules []DeletionRule `json:"deletionRules,omitempty"` diff --git a/ray-operator/controllers/ray/utils/validation.go b/ray-operator/controllers/ray/utils/validation.go index fe379d5dfff..08d98c15ecb 100644 --- a/ray-operator/controllers/ray/utils/validation.go +++ b/ray-operator/controllers/ray/utils/validation.go @@ -318,7 +318,7 @@ func validateDeletionRules(rayJob *rayv1.RayJob) error { for i, rule := range rules { // Validate TTL is non-negative. if rule.Condition.TTLSeconds < 0 { - errs = append(errs, fmt.Errorf("deletionRules[%d]: TTLSecondsAfterFinished must be non-negative", i)) + errs = append(errs, fmt.Errorf("deletionRules[%d]: TTLSeconds must be non-negative", i)) continue } diff --git a/ray-operator/controllers/ray/utils/validation_test.go b/ray-operator/controllers/ray/utils/validation_test.go index 70c4bfae974..16f2911229d 100644 --- a/ray-operator/controllers/ray/utils/validation_test.go +++ b/ray-operator/controllers/ray/utils/validation_test.go @@ -1192,7 +1192,7 @@ func TestValidateRayJobSpecWithFeatureGate(t *testing.T) { expectError: true, }, { - name: "negative TTLSecondsAfterFinished in deletionRules", + name: "negative TTLSeconds in deletionRules", spec: rayv1.RayJobSpec{ DeletionStrategy: &rayv1.DeletionStrategy{ DeletionRules: []rayv1.DeletionRule{ @@ -1209,6 +1209,24 @@ func TestValidateRayJobSpecWithFeatureGate(t *testing.T) { }, expectError: true, }, + { + name: "deletionRules with ClusterSelector and DeleteWorkers policy", + spec: rayv1.RayJobSpec{ + ClusterSelector: map[string]string{"key": "value"}, + DeletionStrategy: &rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{ + { + Policy: rayv1.DeleteWorkers, + Condition: rayv1.DeletionCondition{ + JobStatus: rayv1.JobStatusSucceeded, + TTLSeconds: 10, + }, + }, + }, + }, + }, + expectError: true, + }, { name: "deletionRules with ClusterSelector and DeleteCluster policy", spec: rayv1.RayJobSpec{ From b83078a4f764b4edd4cf6c7a19a619b5078b5a54 Mon Sep 17 00:00:00 2001 From: wei-chenglai Date: Tue, 16 Sep 2025 22:34:19 -0400 Subject: [PATCH 09/21] address comment --- docs/reference/api.md | 6 +-- ray-operator/apis/ray/v1/rayjob_types.go | 6 +-- .../controllers/ray/rayjob_controller.go | 48 ++++++++----------- 3 files changed, 26 insertions(+), 34 deletions(-) diff --git a/docs/reference/api.md b/docs/reference/api.md index e951c3cb474..ca3ed50356c 100644 --- a/docs/reference/api.md +++ b/docs/reference/api.md @@ -130,7 +130,7 @@ It allows for fine-grained control over resource cleanup after a job finishes. 
Legacy fields `onSuccess` and `onFailure` are still supported for backward compatibility, but it is highly recommended to migrate to the new `deletionRules` field. -`onSuccess` and `onFailure` will be removed in release 1.16.0. +`onSuccess` and `onFailure` will be removed in release 1.6.0. Notes: @@ -155,8 +155,8 @@ _Appears in:_ | Field | Description | Default | Validation | | --- | --- | --- | --- | -| `onSuccess` _[DeletionPolicy](#deletionpolicy)_ | OnSuccess is the deletion policy for a successful RayJob.
Deprecated: Use `deletionRules` instead for more flexible, multi-stage deletion strategies.
This field will be removed in release 1.16.0. | | | -| `onFailure` _[DeletionPolicy](#deletionpolicy)_ | OnFailure is the deletion policy for a failed RayJob.
Deprecated: Use `deletionRules` instead for more flexible, multi-stage deletion strategies.
This field will be removed in release 1.16.0. | | | +| `onSuccess` _[DeletionPolicy](#deletionpolicy)_ | OnSuccess is the deletion policy for a successful RayJob.
Deprecated: Use `deletionRules` instead for more flexible, multi-stage deletion strategies.
This field will be removed in release 1.6.0. | | | +| `onFailure` _[DeletionPolicy](#deletionpolicy)_ | OnFailure is the deletion policy for a failed RayJob.
Deprecated: Use `deletionRules` instead for more flexible, multi-stage deletion strategies.
This field will be removed in release 1.6.0. | | | | `deletionRules` _[DeletionRule](#deletionrule) array_ | DeletionRules is a list of deletion rules, processed based on their trigger conditions.
While the rules can be used to define a sequence, if multiple rules are overdue (e.g., due to controller downtime),
the most impactful rule (e.g., DeleteSelf) will be executed first to prioritize resource cleanup and cost savings. | | | diff --git a/ray-operator/apis/ray/v1/rayjob_types.go b/ray-operator/apis/ray/v1/rayjob_types.go index 3314ac463bc..8abeec9eb6f 100644 --- a/ray-operator/apis/ray/v1/rayjob_types.go +++ b/ray-operator/apis/ray/v1/rayjob_types.go @@ -92,7 +92,7 @@ const ( // // Legacy fields `onSuccess` and `onFailure` are still supported for backward compatibility, // but it is highly recommended to migrate to the new `deletionRules` field. -// `onSuccess` and `onFailure` will be removed in release 1.16.0. +// `onSuccess` and `onFailure` will be removed in release 1.6.0. // // Notes: // - When this block is set, you must configure **either** @@ -112,13 +112,13 @@ const ( type DeletionStrategy struct { // OnSuccess is the deletion policy for a successful RayJob. // Deprecated: Use `deletionRules` instead for more flexible, multi-stage deletion strategies. - // This field will be removed in release 1.16.0. + // This field will be removed in release 1.6.0. // +optional OnSuccess *DeletionPolicy `json:"onSuccess,omitempty"` // OnFailure is the deletion policy for a failed RayJob. // Deprecated: Use `deletionRules` instead for more flexible, multi-stage deletion strategies. - // This field will be removed in release 1.16.0. + // This field will be removed in release 1.6.0. // +optional OnFailure *DeletionPolicy `json:"onFailure,omitempty"` diff --git a/ray-operator/controllers/ray/rayjob_controller.go b/ray-operator/controllers/ray/rayjob_controller.go index 5a2fdada794..fb139256189 100644 --- a/ray-operator/controllers/ray/rayjob_controller.go +++ b/ray-operator/controllers/ray/rayjob_controller.go @@ -365,7 +365,26 @@ func (r *RayJobReconciler) Reconcile(ctx context.Context, request ctrl.Request) return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, nil case rayv1.JobDeploymentStatusComplete, rayv1.JobDeploymentStatusFailed: // The RayJob has reached a terminal state. Handle the cleanup and deletion logic. - return r.handleFinishedRayJob(ctx, rayJobInstance) + // If the RayJob uses an existing RayCluster, we must not delete it. + if len(rayJobInstance.Spec.ClusterSelector) > 0 { + logger.Info("RayJob is using an existing RayCluster via clusterSelector; skipping resource deletion.", "RayClusterSelector", rayJobInstance.Spec.ClusterSelector) + return ctrl.Result{}, nil + } + + if features.Enabled(features.RayJobDeletionPolicy) && rayJobInstance.Spec.DeletionStrategy != nil { + // The previous validation logic ensures that either DeletionRules or the legacy policies are set, but not both. + if len(rayJobInstance.Spec.DeletionStrategy.DeletionRules) > 0 { + return r.handleDeletionRules(ctx, rayJobInstance) + } + return r.handleLegacyDeletionPolicy(ctx, rayJobInstance) + } + + if rayJobInstance.Spec.ShutdownAfterJobFinishes { + return r.handleShutdownAfterJobFinishes(ctx, rayJobInstance) + } + + // Default: No deletion policy is configured. The reconciliation is complete for this RayJob. + return ctrl.Result{}, nil default: logger.Info("Unknown JobDeploymentStatus", "JobDeploymentStatus", rayJobInstance.Status.JobDeploymentStatus) return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, nil @@ -1089,33 +1108,6 @@ func isSubmitterContainerFinished(pod *corev1.Pod) bool { return false } -// handleFinishedRayJob is the main entry point for handling cleanup of a completed or failed RayJob. 
-// It acts as a dispatcher, selecting the appropriate deletion mechanism based on the RayJob spec. -func (r *RayJobReconciler) handleFinishedRayJob(ctx context.Context, rayJob *rayv1.RayJob) (ctrl.Result, error) { - logger := ctrl.LoggerFrom(ctx) - - // If the RayJob uses an existing RayCluster, we must not delete it. - if len(rayJob.Spec.ClusterSelector) > 0 { - logger.Info("RayJob is using an existing RayCluster via clusterSelector; skipping resource deletion.", "RayClusterSelector", rayJob.Spec.ClusterSelector) - return ctrl.Result{}, nil - } - - if features.Enabled(features.RayJobDeletionPolicy) && rayJob.Spec.DeletionStrategy != nil { - // The previous validation logic ensures that either DeletionRules or the legacy policies are set, but not both. - if len(rayJob.Spec.DeletionStrategy.DeletionRules) > 0 { - return r.handleDeletionRules(ctx, rayJob) - } - return r.handleLegacyDeletionPolicy(ctx, rayJob) - } - - if rayJob.Spec.ShutdownAfterJobFinishes { - return r.handleShutdownAfterJobFinishes(ctx, rayJob) - } - - // Default: No deletion policy is configured. The reconciliation is complete for this RayJob. - return ctrl.Result{}, nil -} - // handleDeletionRules processes the DeletionRules with a impact-aware strategy. // It categorizes rules into "overdue" and "pending". If overdue rules exist, // it executes the most destructive one and then requeues for the next pending rule. From 776e924784fb42ad590ba23550c7c420c8b827af Mon Sep 17 00:00:00 2001 From: wei-chenglai Date: Sun, 21 Sep 2025 16:22:14 -0400 Subject: [PATCH 10/21] remove duplicate errors pkg --- ray-operator/controllers/ray/utils/validation.go | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/ray-operator/controllers/ray/utils/validation.go b/ray-operator/controllers/ray/utils/validation.go index 08d98c15ecb..c11998e8938 100644 --- a/ray-operator/controllers/ray/utils/validation.go +++ b/ray-operator/controllers/ray/utils/validation.go @@ -1,7 +1,6 @@ package utils import ( - "errors" errstd "errors" "fmt" @@ -356,7 +355,7 @@ func validateDeletionRules(rayJob *rayv1.RayJob) error { } } - return errors.Join(errs...) + return errstd.Join(errs...) } // validateTTLConsistency ensures TTLs follow the deletion hierarchy: Workers <= Cluster <= Self. @@ -393,7 +392,7 @@ func validateTTLConsistency(policyTTLs map[rayv1.DeletionPolicyType]int32, statu hasPrev = true } - return errors.Join(errs...) + return errstd.Join(errs...) } // validateLegacyDeletionPolicies handles validation for the old `onSuccess` and `onFailure` fields. From fafa62fb4a3e6db711fcbd1476b572ad648ffb47 Mon Sep 17 00:00:00 2001 From: wei-chenglai Date: Tue, 23 Sep 2025 22:00:55 -0400 Subject: [PATCH 11/21] improve api doc --- docs/reference/api.md | 6 +++++- ray-operator/apis/ray/v1/rayjob_types.go | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/docs/reference/api.md b/docs/reference/api.md index ca3ed50356c..414e6710246 100644 --- a/docs/reference/api.md +++ b/docs/reference/api.md @@ -139,7 +139,11 @@ Notes: OR (b) the `deletionRules` field (which may be empty, in which case no deletion will occur). - `onSuccess` / `onFailure` must NOT be used together with `deletionRules`. - - `onSuccess` and `onFailure` are **deprecated** and planned for removal in a future release. + - `onSuccess` and `onFailure` are **deprecated** and planned for removal in release 1.6.0. + - `deletionStrategy` is mutually exclusive with `spec.shutdownAfterJobFinishes`. 
+ - If both are set, the controller will report an error and stop processing the RayJob. + - If the `RayJobDeletionPolicy` feature gate is disabled but `deletionStrategy` is set, + the controller will report an error and stop processing the RayJob. Validation rules: diff --git a/ray-operator/apis/ray/v1/rayjob_types.go b/ray-operator/apis/ray/v1/rayjob_types.go index 8abeec9eb6f..e33a7da6e24 100644 --- a/ray-operator/apis/ray/v1/rayjob_types.go +++ b/ray-operator/apis/ray/v1/rayjob_types.go @@ -100,7 +100,11 @@ const ( // OR // (b) the `deletionRules` field (which may be empty, in which case no deletion will occur). // - `onSuccess` / `onFailure` must NOT be used together with `deletionRules`. -// - `onSuccess` and `onFailure` are **deprecated** and planned for removal in a future release. +// - `onSuccess` and `onFailure` are **deprecated** and planned for removal in release 1.6.0. +// - `deletionStrategy` is mutually exclusive with `spec.shutdownAfterJobFinishes`. +// - If both are set, the controller will report an error and stop processing the RayJob. +// - If the `RayJobDeletionPolicy` feature gate is disabled but `deletionStrategy` is set, +// the controller will report an error and stop processing the RayJob. // // Validation rules: // 1. Prevent mixing legacy and new fields From df975f40faa65b90693cc131444759ea42972605 Mon Sep 17 00:00:00 2001 From: wei-chenglai Date: Thu, 25 Sep 2025 12:48:06 -0400 Subject: [PATCH 12/21] add e2e tests for deletion strategy --- .buildkite/build-start-operator.sh | 2 +- docs/reference/api.md | 2 +- ray-operator/Makefile | 8 + ray-operator/apis/ray/v1/rayjob_types.go | 7 +- .../rayjob_deletion_strategy_test.go | 557 ++++++++++++++++++ 5 files changed, 571 insertions(+), 5 deletions(-) create mode 100644 ray-operator/test/e2erayjob/rayjob_deletion_strategy_test.go diff --git a/.buildkite/build-start-operator.sh b/.buildkite/build-start-operator.sh index ef43eba6d71..60468c373d4 100644 --- a/.buildkite/build-start-operator.sh +++ b/.buildkite/build-start-operator.sh @@ -7,7 +7,7 @@ # to kick off from the release branch so tests should match up accordingly. if [ "$IS_FROM_RAY_RELEASE_AUTOMATION" = 1 ]; then - helm repo update && helm install kuberay/kuberay-operator + helm repo update && helm install kuberay/kuberay-operator --set 'featureGates[1].name=RayJobDeletionPolicy' --set 'featureGates[1].enabled=true' KUBERAY_TEST_RAY_IMAGE="rayproject/ray:nightly.$(date +'%y%m%d').${RAY_NIGHTLY_COMMIT:0:6}-py39" && export KUBERAY_TEST_RAY_IMAGE else IMG=kuberay/operator:nightly make docker-image && diff --git a/docs/reference/api.md b/docs/reference/api.md index 414e6710246..3b773074bb5 100644 --- a/docs/reference/api.md +++ b/docs/reference/api.md @@ -305,7 +305,7 @@ _Appears in:_ | `clusterSelector` _object (keys:string, values:string)_ | clusterSelector is used to select running rayclusters by labels | | | | `submitterConfig` _[SubmitterConfig](#submitterconfig)_ | Configurations of submitter k8s job. | | | | `managedBy` _string_ | ManagedBy is an optional configuration for the controller or entity that manages a RayJob.
The value must be either 'ray.io/kuberay-operator' or 'kueue.x-k8s.io/multikueue'.
The kuberay-operator reconciles a RayJob that either omits this field or<br />
sets it to the reserved value 'ray.io/kuberay-operator',<br />
but delegates reconciling RayJobs with 'kueue.x-k8s.io/multikueue' to Kueue.<br />
The field is immutable. | | | -| `deletionStrategy` _[DeletionStrategy](#deletionstrategy)_ | DeletionStrategy indicates what resources of the RayJob and how they are deleted upon job completion.
If unset, deletion policy is based on 'spec.shutdownAfterJobFinishes'.
This field requires the RayJobDeletionPolicy feature gate to be enabled. | | | +| `deletionStrategy` _[DeletionStrategy](#deletionstrategy)_ | DeletionStrategy defines resource cleanup policies after job completion.
Use either legacy fields (onSuccess/onFailure) OR deletionRules, not both.
Mutually exclusive with spec.shutdownAfterJobFinishes.
Requires RayJobDeletionPolicy feature gate to be enabled. | | | | `entrypoint` _string_ | Entrypoint represents the command to start execution. | | | | `runtimeEnvYAML` _string_ | RuntimeEnvYAML represents the runtime environment configuration
provided as a multi-line YAML string. | | | | `jobId` _string_ | If jobId is not set, a new jobId will be auto-generated. | | | diff --git a/ray-operator/Makefile b/ray-operator/Makefile index 3eda8a616c4..3842f2227b8 100644 --- a/ray-operator/Makefile +++ b/ray-operator/Makefile @@ -88,6 +88,14 @@ test-sampleyaml: WHAT ?= ./test/sampleyaml test-sampleyaml: manifests fmt vet go test -timeout 30m -v $(WHAT) +test-e2e-rayjob: WHAT ?= ./test/e2erayjob +test-e2e-rayjob: manifests fmt vet ## Run e2e tests. + go test -timeout 30m -v $(WHAT) + +test-e2e-rayservice: WHAT ?= ./test/e2erayservice +test-e2e-rayservice: manifests fmt vet ## Run e2e tests. + go test -timeout 30m -v $(WHAT) + sync: helm api-docs ./hack/update-codegen.sh diff --git a/ray-operator/apis/ray/v1/rayjob_types.go b/ray-operator/apis/ray/v1/rayjob_types.go index e33a7da6e24..37fd5dbe7f3 100644 --- a/ray-operator/apis/ray/v1/rayjob_types.go +++ b/ray-operator/apis/ray/v1/rayjob_types.go @@ -232,9 +232,10 @@ type RayJobSpec struct { // +kubebuilder:validation:XValidation:rule="self in ['ray.io/kuberay-operator', 'kueue.x-k8s.io/multikueue']",message="the managedBy field value must be either 'ray.io/kuberay-operator' or 'kueue.x-k8s.io/multikueue'" // +optional ManagedBy *string `json:"managedBy,omitempty"` - // DeletionStrategy indicates what resources of the RayJob and how they are deleted upon job completion. - // If unset, deletion policy is based on 'spec.shutdownAfterJobFinishes'. - // This field requires the RayJobDeletionPolicy feature gate to be enabled. + // DeletionStrategy defines resource cleanup policies after job completion. + // Use either legacy fields (onSuccess/onFailure) OR deletionRules, not both. + // Mutually exclusive with spec.shutdownAfterJobFinishes. + // Requires RayJobDeletionPolicy feature gate to be enabled. // +optional DeletionStrategy *DeletionStrategy `json:"deletionStrategy,omitempty"` // Entrypoint represents the command to start execution. diff --git a/ray-operator/test/e2erayjob/rayjob_deletion_strategy_test.go b/ray-operator/test/e2erayjob/rayjob_deletion_strategy_test.go new file mode 100644 index 00000000000..302cee2d796 --- /dev/null +++ b/ray-operator/test/e2erayjob/rayjob_deletion_strategy_test.go @@ -0,0 +1,557 @@ +package e2erayjob + +import ( + "testing" + "time" + + . "github.com/onsi/gomega" + k8serrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1" + rayv1ac "github.com/ray-project/kuberay/ray-operator/pkg/client/applyconfiguration/ray/v1" + . 
"github.com/ray-project/kuberay/ray-operator/test/support" +) + +func TestDeletionStrategy(t *testing.T) { + test := With(t) + g := NewWithT(t) + + // Create a namespace + namespace := test.NewTestNamespace() + + // Job scripts - using existing counter.py for successful jobs and fail.py for failed jobs + // Note: This test suite requires the RayJobDeletionPolicy feature gate to be enabled + jobsAC := NewConfigMap(namespace.Name, Files(test, "counter.py", "fail.py")) + jobs, err := test.Client().Core().CoreV1().ConfigMaps(namespace.Name).Apply(test.Ctx(), jobsAC, TestApplyOptions) + g.Expect(err).NotTo(HaveOccurred()) + LogWithTimestamp(test.T(), "Created ConfigMap %s/%s successfully", jobs.Namespace, jobs.Name) + + test.T().Run("DeletionRules with DeleteWorkers policy should delete only worker pods", func(_ *testing.T) { + // Create RayJob with DeleteWorkers policy and short TTL for faster testing + rayJobAC := rayv1ac.RayJob("delete-workers-test", namespace.Name). + WithSpec(rayv1ac.RayJobSpec(). + WithRayClusterSpec(NewRayClusterSpec(MountConfigMap[rayv1ac.RayClusterSpecApplyConfiguration](jobs, "/home/ray/jobs"))). + WithEntrypoint("python /home/ray/jobs/counter.py"). + WithRuntimeEnvYAML(` +env_vars: + counter_name: test_counter +`). + WithShutdownAfterJobFinishes(false). // Required when using DeletionStrategy + WithDeletionStrategy(rayv1ac.DeletionStrategy(). + WithDeletionRules( + rayv1ac.DeletionRule(). + WithPolicy(rayv1.DeleteWorkers). + WithCondition(rayv1ac.DeletionCondition(). + WithJobStatus(rayv1.JobStatusSucceeded). + WithTTLSeconds(10)), // 10 second TTL for testing + )). + WithSubmitterPodTemplate(JobSubmitterPodTemplateApplyConfiguration())) + + rayJob, err := test.Client().Ray().RayV1().RayJobs(namespace.Name).Apply(test.Ctx(), rayJobAC, TestApplyOptions) + g.Expect(err).NotTo(HaveOccurred()) + LogWithTimestamp(test.T(), "Created RayJob %s/%s successfully", rayJob.Namespace, rayJob.Name) + + // Wait for job to complete successfully + g.Eventually(RayJob(test, rayJob.Namespace, rayJob.Name), TestTimeoutMedium). + Should(WithTransform(RayJobStatus, Equal(rayv1.JobStatusSucceeded))) + LogWithTimestamp(test.T(), "RayJob %s/%s completed successfully", rayJob.Namespace, rayJob.Name) + + // Get the associated RayCluster name. We assert it's non-empty explicitly so that + // test failures surface here (clear message) rather than later when using an empty name. + rayJob, err = GetRayJob(test, rayJob.Namespace, rayJob.Name) + g.Expect(err).NotTo(HaveOccurred()) + rayClusterName := rayJob.Status.RayClusterName + g.Expect(rayClusterName).NotTo(BeEmpty()) + + // Verify cluster and workers exist initially + g.Eventually(RayCluster(test, namespace.Name, rayClusterName), TestTimeoutShort). 
+ Should(WithTransform(RayClusterState, Equal(rayv1.Ready))) + + // Count initial worker pods + cluster, err := GetRayCluster(test, namespace.Name, rayClusterName) + g.Expect(err).NotTo(HaveOccurred()) + initialWorkerPods, err := GetWorkerPods(test, cluster) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(len(initialWorkerPods)).To(BeNumerically(">", 0)) + LogWithTimestamp(test.T(), "Found %d worker pods initially", len(initialWorkerPods)) + + // Verify resources persist during TTL wait period (first 8 seconds of 10s TTL) + LogWithTimestamp(test.T(), "Verifying resources persist during TTL wait period...") + g.Consistently(func(gg Gomega) { + cluster, err := GetRayCluster(test, namespace.Name, rayClusterName) + gg.Expect(err).NotTo(HaveOccurred()) + gg.Expect(cluster).NotTo(BeNil()) + workerPods, err := GetWorkerPods(test, cluster) + gg.Expect(err).NotTo(HaveOccurred()) + gg.Expect(len(workerPods)).To(BeNumerically(">", 0)) + headPod, err := GetHeadPod(test, cluster) + gg.Expect(err).NotTo(HaveOccurred()) + gg.Expect(headPod).NotTo(BeNil()) + jobObj, err := GetRayJob(test, rayJob.Namespace, rayJob.Name) + gg.Expect(err).NotTo(HaveOccurred()) + gg.Expect(jobObj).NotTo(BeNil()) + }, 8*time.Second, 2*time.Second).Should(Succeed()) // Check every 2s for 8s + LogWithTimestamp(test.T(), "Resources confirmed stable during TTL wait period") + + // Wait for TTL to expire and workers to be deleted + LogWithTimestamp(test.T(), "Waiting for TTL to expire and workers to be deleted...") + g.Eventually(func(gg Gomega) { + cluster, err := GetRayCluster(test, namespace.Name, rayClusterName) + gg.Expect(err).NotTo(HaveOccurred()) + gg.Expect(cluster).NotTo(BeNil()) + workerPods, err := GetWorkerPods(test, cluster) + gg.Expect(err).NotTo(HaveOccurred()) + gg.Expect(workerPods).To(BeEmpty()) + }, TestTimeoutMedium).Should(Succeed()) + LogWithTimestamp(test.T(), "Worker pods deleted successfully") + + // Verify cluster still exists (head pod should remain) + g.Consistently(RayCluster(test, namespace.Name, rayClusterName), 10*time.Second). 
+ Should(WithTransform(RayClusterState, Equal(rayv1.Ready))) + + // Verify head pod still exists + cluster, err = GetRayCluster(test, namespace.Name, rayClusterName) + g.Expect(err).NotTo(HaveOccurred()) + headPod, err := GetHeadPod(test, cluster) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(headPod).NotTo(BeNil()) + LogWithTimestamp(test.T(), "Head pod preserved as expected") + + // Verify RayJob still exists + jobObj, err := GetRayJob(test, rayJob.Namespace, rayJob.Name) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(jobObj).NotTo(BeNil()) + LogWithTimestamp(test.T(), "RayJob preserved as expected") + + // Cleanup: delete RayJob to free resources (cluster should be GC'd eventually if owned) + LogWithTimestamp(test.T(), "Cleaning up RayJob %s/%s after DeleteWorkers scenario", jobObj.Namespace, jobObj.Name) + err = test.Client().Ray().RayV1().RayJobs(jobObj.Namespace).Delete(test.Ctx(), jobObj.Name, metav1.DeleteOptions{}) + g.Expect(err).NotTo(HaveOccurred()) + g.Eventually(func() error { _, err := GetRayJob(test, jobObj.Namespace, jobObj.Name); return err }, TestTimeoutMedium).Should(WithTransform(k8serrors.IsNotFound, BeTrue())) + // Cluster may take a moment to be garbage collected; tolerate already-deleted state + g.Eventually(func() error { + _, err := GetRayCluster(test, namespace.Name, rayClusterName) + return err + }, TestTimeoutMedium).Should(WithTransform(k8serrors.IsNotFound, BeTrue())) + LogWithTimestamp(test.T(), "Cleanup after DeleteWorkers scenario complete") + }) + + test.T().Run("DeletionRules with DeleteCluster policy should delete entire cluster", func(_ *testing.T) { + rayJobAC := rayv1ac.RayJob("delete-cluster-test", namespace.Name). + WithSpec(rayv1ac.RayJobSpec(). + WithRayClusterSpec(NewRayClusterSpec(MountConfigMap[rayv1ac.RayClusterSpecApplyConfiguration](jobs, "/home/ray/jobs"))). + WithEntrypoint("python /home/ray/jobs/counter.py"). + WithRuntimeEnvYAML(` +env_vars: + counter_name: test_counter +`). + WithShutdownAfterJobFinishes(false). + WithDeletionStrategy(rayv1ac.DeletionStrategy(). + WithDeletionRules( + rayv1ac.DeletionRule(). + WithPolicy(rayv1.DeleteCluster). + WithCondition(rayv1ac.DeletionCondition(). + WithJobStatus(rayv1.JobStatusSucceeded). + WithTTLSeconds(10)), + )). + WithSubmitterPodTemplate(JobSubmitterPodTemplateApplyConfiguration())) + + rayJob, err := test.Client().Ray().RayV1().RayJobs(namespace.Name).Apply(test.Ctx(), rayJobAC, TestApplyOptions) + g.Expect(err).NotTo(HaveOccurred()) + LogWithTimestamp(test.T(), "Created RayJob %s/%s successfully", rayJob.Namespace, rayJob.Name) + + // Wait for job to complete successfully + g.Eventually(RayJob(test, rayJob.Namespace, rayJob.Name), TestTimeoutMedium). + Should(WithTransform(RayJobStatus, Equal(rayv1.JobStatusSucceeded))) + LogWithTimestamp(test.T(), "RayJob %s/%s completed successfully", rayJob.Namespace, rayJob.Name) + + // Get the associated RayCluster name (early assertion for clearer diagnostics) + rayJob, err = GetRayJob(test, rayJob.Namespace, rayJob.Name) + g.Expect(err).NotTo(HaveOccurred()) + rayClusterName := rayJob.Status.RayClusterName + g.Expect(rayClusterName).NotTo(BeEmpty()) + + // Verify cluster exists initially + g.Eventually(RayCluster(test, namespace.Name, rayClusterName), TestTimeoutShort). 
+ Should(WithTransform(RayClusterState, Equal(rayv1.Ready))) + + // Wait for TTL to expire and cluster to be deleted + LogWithTimestamp(test.T(), "Waiting for TTL to expire and cluster to be deleted...") + g.Eventually(func() error { + _, err := GetRayCluster(test, namespace.Name, rayClusterName) + return err + }, TestTimeoutMedium).Should(WithTransform(k8serrors.IsNotFound, BeTrue())) + LogWithTimestamp(test.T(), "RayCluster deleted successfully") + + // Verify RayJob still exists + jobObj, err := GetRayJob(test, rayJob.Namespace, rayJob.Name) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(jobObj).NotTo(BeNil()) + LogWithTimestamp(test.T(), "RayJob preserved as expected") + + // Cleanup: delete RayJob (cluster already deleted by policy) + LogWithTimestamp(test.T(), "Cleaning up RayJob %s/%s after DeleteCluster scenario", jobObj.Namespace, jobObj.Name) + err = test.Client().Ray().RayV1().RayJobs(jobObj.Namespace).Delete(test.Ctx(), jobObj.Name, metav1.DeleteOptions{}) + g.Expect(err).NotTo(HaveOccurred()) + g.Eventually(func() error { _, err := GetRayJob(test, jobObj.Namespace, jobObj.Name); return err }, TestTimeoutMedium).Should(WithTransform(k8serrors.IsNotFound, BeTrue())) + LogWithTimestamp(test.T(), "Cleanup after DeleteCluster scenario complete") + }) + + test.T().Run("DeletionRules with DeleteSelf policy should delete RayJob and cluster", func(_ *testing.T) { + rayJobAC := rayv1ac.RayJob("delete-self-test", namespace.Name). + WithSpec(rayv1ac.RayJobSpec(). + WithRayClusterSpec(NewRayClusterSpec(MountConfigMap[rayv1ac.RayClusterSpecApplyConfiguration](jobs, "/home/ray/jobs"))). + WithEntrypoint("python /home/ray/jobs/counter.py"). + WithRuntimeEnvYAML(` +env_vars: + counter_name: test_counter +`). + WithShutdownAfterJobFinishes(false). + WithDeletionStrategy(rayv1ac.DeletionStrategy(). + WithDeletionRules( + rayv1ac.DeletionRule(). + WithPolicy(rayv1.DeleteSelf). + WithCondition(rayv1ac.DeletionCondition(). + WithJobStatus(rayv1.JobStatusSucceeded). + WithTTLSeconds(10)), + )). + WithSubmitterPodTemplate(JobSubmitterPodTemplateApplyConfiguration())) + + rayJob, err := test.Client().Ray().RayV1().RayJobs(namespace.Name).Apply(test.Ctx(), rayJobAC, TestApplyOptions) + g.Expect(err).NotTo(HaveOccurred()) + LogWithTimestamp(test.T(), "Created RayJob %s/%s successfully", rayJob.Namespace, rayJob.Name) + + // Wait for job to complete successfully + g.Eventually(RayJob(test, rayJob.Namespace, rayJob.Name), TestTimeoutMedium). 
+ Should(WithTransform(RayJobStatus, Equal(rayv1.JobStatusSucceeded))) + LogWithTimestamp(test.T(), "RayJob %s/%s completed successfully", rayJob.Namespace, rayJob.Name) + + // Get the associated RayCluster name before verifying deletion sequence + rayJob, err = GetRayJob(test, rayJob.Namespace, rayJob.Name) + g.Expect(err).NotTo(HaveOccurred()) + rayClusterName := rayJob.Status.RayClusterName + g.Expect(rayClusterName).NotTo(BeEmpty()) + + // Wait for TTL to expire and RayJob (and cluster) to be deleted + LogWithTimestamp(test.T(), "Waiting for TTL to expire and RayJob to be deleted...") + g.Eventually(func() error { + _, err := GetRayJob(test, rayJob.Namespace, rayJob.Name) + return err + }, TestTimeoutMedium).Should(WithTransform(k8serrors.IsNotFound, BeTrue())) + LogWithTimestamp(test.T(), "RayJob deleted successfully") + + // Verify associated cluster is also deleted + g.Eventually(func() error { + _, err := GetRayCluster(test, namespace.Name, rayClusterName) + return err + }, TestTimeoutMedium).Should(WithTransform(k8serrors.IsNotFound, BeTrue())) + LogWithTimestamp(test.T(), "Associated RayCluster deleted successfully") + }) + + test.T().Run("DeletionRules with DeleteNone policy should preserve all resources", func(_ *testing.T) { + rayJobAC := rayv1ac.RayJob("delete-none-test", namespace.Name). + WithSpec(rayv1ac.RayJobSpec(). + WithRayClusterSpec(NewRayClusterSpec(MountConfigMap[rayv1ac.RayClusterSpecApplyConfiguration](jobs, "/home/ray/jobs"))). + WithEntrypoint("python /home/ray/jobs/counter.py"). + WithRuntimeEnvYAML(` +env_vars: + counter_name: test_counter +`). + WithShutdownAfterJobFinishes(false). + WithDeletionStrategy(rayv1ac.DeletionStrategy(). + WithDeletionRules( + rayv1ac.DeletionRule(). + WithPolicy(rayv1.DeleteNone). + WithCondition(rayv1ac.DeletionCondition(). + WithJobStatus(rayv1.JobStatusSucceeded). + WithTTLSeconds(5)), // Shorter TTL since we're testing preservation + )). + WithSubmitterPodTemplate(JobSubmitterPodTemplateApplyConfiguration())) + + rayJob, err := test.Client().Ray().RayV1().RayJobs(namespace.Name).Apply(test.Ctx(), rayJobAC, TestApplyOptions) + g.Expect(err).NotTo(HaveOccurred()) + LogWithTimestamp(test.T(), "Created RayJob %s/%s successfully", rayJob.Namespace, rayJob.Name) + + // Wait for job to complete successfully + g.Eventually(RayJob(test, rayJob.Namespace, rayJob.Name), TestTimeoutMedium). 
+ Should(WithTransform(RayJobStatus, Equal(rayv1.JobStatusSucceeded))) + LogWithTimestamp(test.T(), "RayJob %s/%s completed successfully", rayJob.Namespace, rayJob.Name) + + // Get the associated RayCluster name (assert early for clarity) + rayJob, err = GetRayJob(test, rayJob.Namespace, rayJob.Name) + g.Expect(err).NotTo(HaveOccurred()) + rayClusterName := rayJob.Status.RayClusterName + g.Expect(rayClusterName).NotTo(BeEmpty()) + + // Wait well past the TTL and verify everything is preserved + LogWithTimestamp(test.T(), "Waiting past TTL to verify resources are preserved...") + g.Consistently(func(gg Gomega) { + jobObj, err := GetRayJob(test, rayJob.Namespace, rayJob.Name) + gg.Expect(err).NotTo(HaveOccurred()) + gg.Expect(jobObj).NotTo(BeNil()) + cluster, err := GetRayCluster(test, namespace.Name, rayClusterName) + gg.Expect(err).NotTo(HaveOccurred()) + gg.Expect(cluster).NotTo(BeNil()) + workerPods, err := GetWorkerPods(test, cluster) + gg.Expect(err).NotTo(HaveOccurred()) + gg.Expect(len(workerPods)).To(BeNumerically(">", 0)) + }, 10*time.Second, 2*time.Second).Should(Succeed()) + LogWithTimestamp(test.T(), "All resources preserved as expected with DeleteNone policy") + + // Cleanup: delete RayJob to release cluster and pods + LogWithTimestamp(test.T(), "Cleaning up RayJob %s/%s after DeleteNone scenario", rayJob.Namespace, rayJob.Name) + err = test.Client().Ray().RayV1().RayJobs(rayJob.Namespace).Delete(test.Ctx(), rayJob.Name, metav1.DeleteOptions{}) + g.Expect(err).NotTo(HaveOccurred()) + g.Eventually(func() error { _, err := GetRayJob(test, rayJob.Namespace, rayJob.Name); return err }, TestTimeoutMedium).Should(WithTransform(k8serrors.IsNotFound, BeTrue())) + g.Eventually(func() error { + _, err := GetRayCluster(test, namespace.Name, rayClusterName) + return err + }, TestTimeoutMedium).Should(WithTransform(k8serrors.IsNotFound, BeTrue())) + LogWithTimestamp(test.T(), "Cleanup after DeleteNone scenario complete") + }) + + test.T().Run("Multi-stage deletion should execute in TTL order: Workers->Cluster->Self", func(_ *testing.T) { + rayJobAC := rayv1ac.RayJob("multi-stage-test", namespace.Name). + WithSpec(rayv1ac.RayJobSpec(). + WithRayClusterSpec(NewRayClusterSpec(MountConfigMap[rayv1ac.RayClusterSpecApplyConfiguration](jobs, "/home/ray/jobs"))). + WithEntrypoint("python /home/ray/jobs/counter.py"). + WithRuntimeEnvYAML(` +env_vars: + counter_name: test_counter +`). + WithShutdownAfterJobFinishes(false). + WithDeletionStrategy(rayv1ac.DeletionStrategy(). + WithDeletionRules( + rayv1ac.DeletionRule(). + WithPolicy(rayv1.DeleteWorkers). + WithCondition(rayv1ac.DeletionCondition(). + WithJobStatus(rayv1.JobStatusSucceeded). + WithTTLSeconds(15)), // Increased spacing for reliability + rayv1ac.DeletionRule(). + WithPolicy(rayv1.DeleteCluster). + WithCondition(rayv1ac.DeletionCondition(). + WithJobStatus(rayv1.JobStatusSucceeded). + WithTTLSeconds(35)), // 20s gap between stages + rayv1ac.DeletionRule(). + WithPolicy(rayv1.DeleteSelf). + WithCondition(rayv1ac.DeletionCondition(). + WithJobStatus(rayv1.JobStatusSucceeded). + WithTTLSeconds(55)), // 20s gap between stages + )). 
+ WithSubmitterPodTemplate(JobSubmitterPodTemplateApplyConfiguration())) + + rayJob, err := test.Client().Ray().RayV1().RayJobs(namespace.Name).Apply(test.Ctx(), rayJobAC, TestApplyOptions) + g.Expect(err).NotTo(HaveOccurred()) + LogWithTimestamp(test.T(), "Created RayJob %s/%s successfully", rayJob.Namespace, rayJob.Name) + + // Wait for job to complete successfully + g.Eventually(RayJob(test, rayJob.Namespace, rayJob.Name), TestTimeoutMedium). + Should(WithTransform(RayJobStatus, Equal(rayv1.JobStatusSucceeded))) + LogWithTimestamp(test.T(), "RayJob %s/%s completed successfully", rayJob.Namespace, rayJob.Name) + + // Get the associated RayCluster name (early assertion ensures meaningful failure) + rayJob, err = GetRayJob(test, rayJob.Namespace, rayJob.Name) + g.Expect(err).NotTo(HaveOccurred()) + rayClusterName := rayJob.Status.RayClusterName + g.Expect(rayClusterName).NotTo(BeEmpty()) + + // Verify cluster is ready initially + g.Eventually(RayCluster(test, namespace.Name, rayClusterName), TestTimeoutShort). + Should(WithTransform(RayClusterState, Equal(rayv1.Ready))) + + // Verify all resources exist before any TTL expires (first 12 seconds) + LogWithTimestamp(test.T(), "Verifying all resources persist before any TTL expires...") + g.Consistently(func(gg Gomega) { + cluster, err := GetRayCluster(test, namespace.Name, rayClusterName) + gg.Expect(err).NotTo(HaveOccurred()) + gg.Expect(cluster).NotTo(BeNil()) + workerPods, err := GetWorkerPods(test, cluster) + gg.Expect(err).NotTo(HaveOccurred()) + gg.Expect(len(workerPods)).To(BeNumerically(">", 0)) + headPod, err := GetHeadPod(test, cluster) + gg.Expect(err).NotTo(HaveOccurred()) + gg.Expect(headPod).NotTo(BeNil()) + jobObj, err := GetRayJob(test, rayJob.Namespace, rayJob.Name) + gg.Expect(err).NotTo(HaveOccurred()) + gg.Expect(jobObj).NotTo(BeNil()) + }, 12*time.Second, 2*time.Second).Should(Succeed()) + LogWithTimestamp(test.T(), "All resources confirmed stable before TTL expiration") + + // Stage 1: Wait for workers to be deleted (15s TTL) + LogWithTimestamp(test.T(), "Stage 1: Waiting for workers to be deleted at 15s...") + g.Eventually(func(gg Gomega) { + cluster, err := GetRayCluster(test, namespace.Name, rayClusterName) + gg.Expect(err).NotTo(HaveOccurred()) + gg.Expect(cluster).NotTo(BeNil()) + workerPods, err := GetWorkerPods(test, cluster) + gg.Expect(err).NotTo(HaveOccurred()) + gg.Expect(workerPods).To(BeEmpty()) + }, TestTimeoutMedium).Should(Succeed()) + LogWithTimestamp(test.T(), "Stage 1 complete: Workers deleted successfully") + + // Verify cluster and job still exist after stage 1 + job, err := GetRayJob(test, rayJob.Namespace, rayJob.Name) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(job).NotTo(BeNil()) + cluster, err := GetRayCluster(test, namespace.Name, rayClusterName) + g.Expect(err).NotTo(HaveOccurred()) + headPod, err := GetHeadPod(test, cluster) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(headPod).NotTo(BeNil()) + + // Verify cluster persists during stage 2 wait period (15 seconds of 20s gap) + LogWithTimestamp(test.T(), "Verifying cluster persists before stage 2 TTL expires...") + g.Consistently(func(gg Gomega) { + cluster, err := GetRayCluster(test, namespace.Name, rayClusterName) + gg.Expect(err).NotTo(HaveOccurred()) + gg.Expect(cluster).NotTo(BeNil()) + headPod, err := GetHeadPod(test, cluster) + gg.Expect(err).NotTo(HaveOccurred()) + gg.Expect(headPod).NotTo(BeNil()) + jobObj, err := GetRayJob(test, rayJob.Namespace, rayJob.Name) + gg.Expect(err).NotTo(HaveOccurred()) + 
gg.Expect(jobObj).NotTo(BeNil()) + }, 15*time.Second, 2*time.Second).Should(Succeed()) + LogWithTimestamp(test.T(), "Cluster and job confirmed stable before stage 2 TTL") + + // Stage 2: Wait for cluster to be deleted (35s TTL) + LogWithTimestamp(test.T(), "Stage 2: Waiting for cluster to be deleted at 35s...") + g.Eventually(func() error { + _, err := GetRayCluster(test, namespace.Name, rayClusterName) + return err + }, TestTimeoutMedium).Should(WithTransform(k8serrors.IsNotFound, BeTrue())) + LogWithTimestamp(test.T(), "Stage 2 complete: Cluster deleted successfully") + + // Verify job still exists after stage 2 + job, err = GetRayJob(test, rayJob.Namespace, rayJob.Name) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(job).NotTo(BeNil()) + + // Verify job persists during stage 3 wait period (15 seconds of 20s gap) + LogWithTimestamp(test.T(), "Verifying RayJob persists before stage 3 TTL expires...") + g.Consistently(func(gg Gomega) { + jobObj, err := GetRayJob(test, rayJob.Namespace, rayJob.Name) + gg.Expect(err).NotTo(HaveOccurred()) + gg.Expect(jobObj).NotTo(BeNil()) + }, 15*time.Second, 2*time.Second).Should(Succeed()) + LogWithTimestamp(test.T(), "RayJob confirmed stable before stage 3 TTL") + + // Stage 3: Wait for job to be deleted (55s TTL) + LogWithTimestamp(test.T(), "Stage 3: Waiting for RayJob to be deleted at 55s...") + g.Eventually(func() error { + _, err := GetRayJob(test, rayJob.Namespace, rayJob.Name) + return err + }, TestTimeoutMedium).Should(WithTransform(k8serrors.IsNotFound, BeTrue())) + LogWithTimestamp(test.T(), "Stage 3 complete: RayJob deleted successfully") + LogWithTimestamp(test.T(), "Multi-stage deletion completed in correct order") + }) + + test.T().Run("Legacy OnSuccess DeleteCluster should still work", func(_ *testing.T) { + rayJobAC := rayv1ac.RayJob("legacy-success-test", namespace.Name). + WithSpec(rayv1ac.RayJobSpec(). + WithRayClusterSpec(NewRayClusterSpec(MountConfigMap[rayv1ac.RayClusterSpecApplyConfiguration](jobs, "/home/ray/jobs"))). + WithEntrypoint("python /home/ray/jobs/counter.py"). + WithRuntimeEnvYAML(` +env_vars: + counter_name: test_counter +`). + WithShutdownAfterJobFinishes(false). + WithTTLSecondsAfterFinished(10). // Legacy TTL for backward compatibility + WithDeletionStrategy(rayv1ac.DeletionStrategy(). + WithOnSuccess(rayv1ac.DeletionPolicy(). + WithPolicy(rayv1.DeleteCluster)). + WithOnFailure(rayv1ac.DeletionPolicy(). + WithPolicy(rayv1.DeleteNone))). + WithSubmitterPodTemplate(JobSubmitterPodTemplateApplyConfiguration())) + + rayJob, err := test.Client().Ray().RayV1().RayJobs(namespace.Name).Apply(test.Ctx(), rayJobAC, TestApplyOptions) + g.Expect(err).NotTo(HaveOccurred()) + LogWithTimestamp(test.T(), "Created legacy RayJob %s/%s successfully", rayJob.Namespace, rayJob.Name) + + // Wait for job to complete successfully + g.Eventually(RayJob(test, rayJob.Namespace, rayJob.Name), TestTimeoutMedium). 
+ Should(WithTransform(RayJobStatus, Equal(rayv1.JobStatusSucceeded))) + LogWithTimestamp(test.T(), "RayJob %s/%s completed successfully", rayJob.Namespace, rayJob.Name) + + // Get the associated RayCluster name (legacy path; same early assertion rationale) + rayJob, err = GetRayJob(test, rayJob.Namespace, rayJob.Name) + g.Expect(err).NotTo(HaveOccurred()) + rayClusterName := rayJob.Status.RayClusterName + g.Expect(rayClusterName).NotTo(BeEmpty()) + + // Wait for cluster to be deleted due to OnSuccess policy + LogWithTimestamp(test.T(), "Waiting for legacy OnSuccess policy to delete cluster...") + g.Eventually(func() error { + _, err := GetRayCluster(test, namespace.Name, rayClusterName) + return err + }, TestTimeoutMedium).Should(WithTransform(k8serrors.IsNotFound, BeTrue())) + LogWithTimestamp(test.T(), "Cluster deleted by legacy OnSuccess policy") + + // Verify RayJob still exists + job, err := GetRayJob(test, rayJob.Namespace, rayJob.Name) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(job).NotTo(BeNil()) + LogWithTimestamp(test.T(), "Legacy OnSuccess policy working correctly") + + // Cleanup: delete legacy RayJob (cluster already deleted) + LogWithTimestamp(test.T(), "Cleaning up legacy success RayJob %s/%s", job.Namespace, job.Name) + err = test.Client().Ray().RayV1().RayJobs(job.Namespace).Delete(test.Ctx(), job.Name, metav1.DeleteOptions{}) + g.Expect(err).NotTo(HaveOccurred()) + g.Eventually(func() error { _, err := GetRayJob(test, job.Namespace, job.Name); return err }, TestTimeoutMedium).Should(WithTransform(k8serrors.IsNotFound, BeTrue())) + LogWithTimestamp(test.T(), "Cleanup after legacy success scenario complete") + }) + + test.T().Run("Legacy OnFailure DeleteNone should still work", func(_ *testing.T) { + rayJobAC := rayv1ac.RayJob("legacy-failure-test", namespace.Name). + WithSpec(rayv1ac.RayJobSpec(). + WithRayClusterSpec(NewRayClusterSpec(MountConfigMap[rayv1ac.RayClusterSpecApplyConfiguration](jobs, "/home/ray/jobs"))). + WithEntrypoint("python /home/ray/jobs/fail.py"). // Use failing script + WithShutdownAfterJobFinishes(false). + WithTTLSecondsAfterFinished(10). + WithDeletionStrategy(rayv1ac.DeletionStrategy(). + WithOnSuccess(rayv1ac.DeletionPolicy(). + WithPolicy(rayv1.DeleteCluster)). + WithOnFailure(rayv1ac.DeletionPolicy(). + WithPolicy(rayv1.DeleteNone))). + WithSubmitterPodTemplate(JobSubmitterPodTemplateApplyConfiguration())) + + rayJob, err := test.Client().Ray().RayV1().RayJobs(namespace.Name).Apply(test.Ctx(), rayJobAC, TestApplyOptions) + g.Expect(err).NotTo(HaveOccurred()) + LogWithTimestamp(test.T(), "Created legacy failure RayJob %s/%s successfully", rayJob.Namespace, rayJob.Name) + + // Wait for job to fail + g.Eventually(RayJob(test, rayJob.Namespace, rayJob.Name), TestTimeoutMedium). 
+ Should(WithTransform(RayJobStatus, Equal(rayv1.JobStatusFailed))) + LogWithTimestamp(test.T(), "RayJob %s/%s failed as expected", rayJob.Namespace, rayJob.Name) + + // Get the associated RayCluster name + rayJob, err = GetRayJob(test, rayJob.Namespace, rayJob.Name) + g.Expect(err).NotTo(HaveOccurred()) + rayClusterName := rayJob.Status.RayClusterName + g.Expect(rayClusterName).NotTo(BeEmpty()) + + // Wait past the TTL and verify everything is preserved due to OnFailure=DeleteNone + LogWithTimestamp(test.T(), "Waiting past TTL to verify resources preserved by OnFailure=DeleteNone...") + g.Consistently(func(gg Gomega) { + jobObj, err := GetRayJob(test, rayJob.Namespace, rayJob.Name) + gg.Expect(err).NotTo(HaveOccurred()) + gg.Expect(jobObj).NotTo(BeNil()) + cluster, err := GetRayCluster(test, namespace.Name, rayClusterName) + gg.Expect(err).NotTo(HaveOccurred()) + gg.Expect(cluster).NotTo(BeNil()) + }, 15*time.Second, 2*time.Second).Should(Succeed()) + LogWithTimestamp(test.T(), "Legacy OnFailure=DeleteNone policy working correctly") + + // Cleanup: delete legacy failure RayJob (will also GC cluster) + LogWithTimestamp(test.T(), "Cleaning up legacy failure RayJob %s/%s", rayJob.Namespace, rayJob.Name) + err = test.Client().Ray().RayV1().RayJobs(rayJob.Namespace).Delete(test.Ctx(), rayJob.Name, metav1.DeleteOptions{}) + g.Expect(err).NotTo(HaveOccurred()) + g.Eventually(func() error { _, err := GetRayJob(test, rayJob.Namespace, rayJob.Name); return err }, TestTimeoutMedium).Should(WithTransform(k8serrors.IsNotFound, BeTrue())) + g.Eventually(func() error { + _, err := GetRayCluster(test, namespace.Name, rayClusterName) + return err + }, TestTimeoutMedium).Should(WithTransform(k8serrors.IsNotFound, BeTrue())) + LogWithTimestamp(test.T(), "Cleanup after legacy failure scenario complete") + }) +} From f91907db5e2a90e80fd9385d25ef49862a14e8ea Mon Sep 17 00:00:00 2001 From: wei-chenglai Date: Thu, 25 Sep 2025 13:34:38 -0400 Subject: [PATCH 13/21] fix lint --- ray-operator/Makefile | 1 - .../test/e2erayjob/rayjob_deletion_strategy_test.go | 8 ++++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/ray-operator/Makefile b/ray-operator/Makefile index 3842f2227b8..d69b2fffa0b 100644 --- a/ray-operator/Makefile +++ b/ray-operator/Makefile @@ -67,7 +67,6 @@ test: ENVTEST_K8S_VERSION ?= 1.24.2 test: manifests fmt vet envtest ## Run tests. KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(LOCALBIN) -p path)" go test $(WHAT) -coverprofile cover.out -# You can use `go test -timeout 30m -v ./test/e2e/rayjob_test.go ./test/e2e/support.go` if you only want to run tests in `rayjob_test.go`. test-e2e: WHAT ?= ./test/e2e test-e2e: manifests fmt vet ## Run e2e tests. 
go test -timeout 30m -v $(WHAT) diff --git a/ray-operator/test/e2erayjob/rayjob_deletion_strategy_test.go b/ray-operator/test/e2erayjob/rayjob_deletion_strategy_test.go index 302cee2d796..4668ba0713c 100644 --- a/ray-operator/test/e2erayjob/rayjob_deletion_strategy_test.go +++ b/ray-operator/test/e2erayjob/rayjob_deletion_strategy_test.go @@ -73,7 +73,7 @@ env_vars: g.Expect(err).NotTo(HaveOccurred()) initialWorkerPods, err := GetWorkerPods(test, cluster) g.Expect(err).NotTo(HaveOccurred()) - g.Expect(len(initialWorkerPods)).To(BeNumerically(">", 0)) + g.Expect(initialWorkerPods).ToNot(BeEmpty()) LogWithTimestamp(test.T(), "Found %d worker pods initially", len(initialWorkerPods)) // Verify resources persist during TTL wait period (first 8 seconds of 10s TTL) @@ -84,7 +84,7 @@ env_vars: gg.Expect(cluster).NotTo(BeNil()) workerPods, err := GetWorkerPods(test, cluster) gg.Expect(err).NotTo(HaveOccurred()) - gg.Expect(len(workerPods)).To(BeNumerically(">", 0)) + gg.Expect(workerPods).ToNot(BeEmpty()) headPod, err := GetHeadPod(test, cluster) gg.Expect(err).NotTo(HaveOccurred()) gg.Expect(headPod).NotTo(BeNil()) @@ -295,7 +295,7 @@ env_vars: gg.Expect(cluster).NotTo(BeNil()) workerPods, err := GetWorkerPods(test, cluster) gg.Expect(err).NotTo(HaveOccurred()) - gg.Expect(len(workerPods)).To(BeNumerically(">", 0)) + gg.Expect(workerPods).ToNot(BeEmpty()) }, 10*time.Second, 2*time.Second).Should(Succeed()) LogWithTimestamp(test.T(), "All resources preserved as expected with DeleteNone policy") @@ -368,7 +368,7 @@ env_vars: gg.Expect(cluster).NotTo(BeNil()) workerPods, err := GetWorkerPods(test, cluster) gg.Expect(err).NotTo(HaveOccurred()) - gg.Expect(len(workerPods)).To(BeNumerically(">", 0)) + gg.Expect(workerPods).ToNot(BeEmpty()) headPod, err := GetHeadPod(test, cluster) gg.Expect(err).NotTo(HaveOccurred()) gg.Expect(headPod).NotTo(BeNil()) From 47daebe811566aa781e91239bb3914ac12e27c7c Mon Sep 17 00:00:00 2001 From: wei-chenglai Date: Thu, 25 Sep 2025 20:50:10 -0400 Subject: [PATCH 14/21] add feature gate override for e2e tests --- .buildkite/build-start-operator.sh | 8 ++- .../values-kuberay-operator-override.yaml | 18 +++++ ray-operator/Makefile | 9 +++ .../config/overlays/test-overrides/README.md | 68 +++++++++++++++++++ .../test-overrides/deployment-override.yaml | 12 ++++ .../test-overrides/kustomization.yaml | 17 +++++ 6 files changed, 130 insertions(+), 2 deletions(-) create mode 100644 .buildkite/values-kuberay-operator-override.yaml create mode 100644 ray-operator/config/overlays/test-overrides/README.md create mode 100644 ray-operator/config/overlays/test-overrides/deployment-override.yaml create mode 100644 ray-operator/config/overlays/test-overrides/kustomization.yaml diff --git a/.buildkite/build-start-operator.sh b/.buildkite/build-start-operator.sh index 60468c373d4..4c81fbe96be 100644 --- a/.buildkite/build-start-operator.sh +++ b/.buildkite/build-start-operator.sh @@ -7,10 +7,14 @@ # to kick off from the release branch so tests should match up accordingly. if [ "$IS_FROM_RAY_RELEASE_AUTOMATION" = 1 ]; then - helm repo update && helm install kuberay/kuberay-operator --set 'featureGates[1].name=RayJobDeletionPolicy' --set 'featureGates[1].enabled=true' + helm repo update + echo "Installing helm chart with test override values (feature gates enabled as needed)" + # NOTE: The override file is CI/test-only. It is NOT part of the released chart defaults. 
+  helm install kuberay-operator kuberay/kuberay-operator -f ../.buildkite/values-kuberay-operator-override.yaml
   KUBERAY_TEST_RAY_IMAGE="rayproject/ray:nightly.$(date +'%y%m%d').${RAY_NIGHTLY_COMMIT:0:6}-py39" && export KUBERAY_TEST_RAY_IMAGE
 else
   IMG=kuberay/operator:nightly make docker-image &&
   kind load docker-image kuberay/operator:nightly &&
-  IMG=kuberay/operator:nightly make deploy
+  echo "Deploying operator with test overrides (feature gates via test-overrides overlay)"
+  IMG=kuberay/operator:nightly make deploy-with-override
 fi
diff --git a/.buildkite/values-kuberay-operator-override.yaml b/.buildkite/values-kuberay-operator-override.yaml
new file mode 100644
index 00000000000..3a0d6aa1ffb
--- /dev/null
+++ b/.buildkite/values-kuberay-operator-override.yaml
@@ -0,0 +1,18 @@
+# Generic Helm values override used only in CI / e2e test environments.
+# Intent:
+#   - Allow e2e tests to turn on alpha / experimental feature gates (e.g. RayJobDeletionPolicy)
+#   - Provide a single place contributors can extend with additional overrides needed for tests
+#   - Keep the default published Helm chart behavior unchanged for normal users
+# Scope / Safety:
+#   - This file is never referenced by the base chart; it is opt‑in via buildkite or manual helm install
+#   - Do NOT rename it to values.yaml or commit changes that enable unstable features by default
+# Usage examples:
+#   helm install kuberay-operator kuberay/kuberay-operator -f ../.buildkite/values-kuberay-operator-override.yaml
+#   (add or remove feature gates below as e2e scenarios expand)
+#
+# Current overrides: enable RayJobDeletionPolicy alpha feature gate alongside the existing status conditions gate.
+featureGates:
+  - name: RayClusterStatusConditions
+    enabled: true
+  - name: RayJobDeletionPolicy
+    enabled: true
\ No newline at end of file
diff --git a/ray-operator/Makefile b/ray-operator/Makefile
index d69b2fffa0b..214035963b7 100644
--- a/ray-operator/Makefile
+++ b/ray-operator/Makefile
@@ -143,6 +143,15 @@ deploy: manifests kustomize ## Deploy controller to the K8s cluster specified in
 	cd config/default && $(KUSTOMIZE) edit set image kuberay/operator=${IMG}
 	$(KUSTOMIZE) build config/default | kubectl apply --server-side=true -f -
 
+# NOTE FOR CONTRIBUTORS:
+# deploy-with-override is an e2e/CI-only deployment path. It applies a Kustomize overlay that
+# enables test-only feature gates (e.g. RayJobDeletionPolicy) without changing the default
+# behavior of the base Helm chart or the standard 'make deploy'. Add additional test overrides
+# to the overlay (config/overlays/test-overrides) rather than modifying the base.
+deploy-with-override: manifests kustomize ## Deploy controller with test-only feature gate overrides (does NOT affect default chart).
+	cd config/default && $(KUSTOMIZE) edit set image kuberay/operator=${IMG}
+	$(KUSTOMIZE) build config/overlays/test-overrides | kubectl apply --server-side=true -f -
+
 undeploy: ## Undeploy controller from the K8s cluster specified in ~/.kube/config.
$(KUSTOMIZE) build config/default | kubectl delete -f - diff --git a/ray-operator/config/overlays/test-overrides/README.md b/ray-operator/config/overlays/test-overrides/README.md new file mode 100644 index 00000000000..8ec5e435c11 --- /dev/null +++ b/ray-operator/config/overlays/test-overrides/README.md @@ -0,0 +1,68 @@ +# Test Overrides Overlay (CI / e2e ONLY) + +This overlay enables test-only / alpha feature gates (currently `RayJobDeletionPolicy`) without modifying: +- The base manifests under `config/default` +- Generated CRDs (`make generate`) +- Helm chart defaults (`make helm`, users' `helm install` without -f override) + +Use it only in CI or local end-to-end testing when you explicitly need gated behavior. + +--- +## Why It Exists +Some feature gates are intentionally disabled by default for stability. E2E tests must exercise them to validate behavior prior to promotion. This overlay provides a safe, isolated place to turn them on. + +--- +## Safety Guarantees +| Concern | Guarantee | +|---------|-----------| +| Default user deploy (`make deploy`) | Unchanged | +| Helm install (no -f override) | Unchanged | +| CRD generation / codegen | Unaffected | +| Feature gates scope | Only those explicitly listed here | + +--- +## Usage +Deploy with feature gates enabled: +``` +make deploy-with-override IMG=kuberay/operator:nightly +``` +Helm path (CI release automation): +``` +helm install kuberay-operator kuberay/kuberay-operator -f .buildkite/values-kuberay-operator-override.yaml +``` + +--- +## Adding Another Feature Gate +1. Edit `deployment-override.yaml` – append your gate inside the existing `--feature-gates=` list. +2. Update `.buildkite/values-kuberay-operator-override.yaml` likewise. +3. Add or adjust e2e tests as needed. + +Keep gate ordering stable to minimize diff noise. + +--- +## Keeping In Sync +If the base operator Deployment args change in `config/manager/manager.yaml`: +1. Copy the updated arg list. +2. Re-apply the feature gates in `deployment-override.yaml`. +3. Re-render to confirm. + +--- +## Removal / Promotion Flow +When a gate graduates (enabled by default upstream): +1. Remove it from the override (if it's default-on, it no longer needs listing). +2. Remove corresponding logic from tests if they branch on gate state. +3. (Optional) Note the graduation in release notes. + +--- +## Troubleshooting +Problem | Action +--------|------- +Patch no longer applies | Check if Deployment name or container name changed. +Gates not taking effect | Confirm args rendered (render target) and operator pod restarted. +Unexpected arg order | The strategic merge patch replaces the entire args list; adjust ordering there. + +--- +## Do NOT +- Add unrelated production configuration (RBAC, CRDs, resources) here. +- Reference this overlay from user-facing docs. +- Rename directory without updating `Makefile` targets. \ No newline at end of file diff --git a/ray-operator/config/overlays/test-overrides/deployment-override.yaml b/ray-operator/config/overlays/test-overrides/deployment-override.yaml new file mode 100644 index 00000000000..20ba1ac9f2b --- /dev/null +++ b/ray-operator/config/overlays/test-overrides/deployment-override.yaml @@ -0,0 +1,12 @@ +# Strategic merge patch for kuberay-operator Deployment (test / CI only). 
+apiVersion: apps/v1 +kind: Deployment +metadata: + name: kuberay-operator +spec: + template: + spec: + containers: + - name: kuberay-operator + args: + - --feature-gates=RayClusterStatusConditions=true,RayJobDeletionPolicy=true \ No newline at end of file diff --git a/ray-operator/config/overlays/test-overrides/kustomization.yaml b/ray-operator/config/overlays/test-overrides/kustomization.yaml new file mode 100644 index 00000000000..7554d954359 --- /dev/null +++ b/ray-operator/config/overlays/test-overrides/kustomization.yaml @@ -0,0 +1,17 @@ +## ============================================================================ +## Kustomize overlay: test-overrides (CI / e2e only) +## ---------------------------------------------------------------------------- +## Purpose: Enable alpha / experimental feature gates (currently RayJobDeletionPolicy) +## for end-to-end testing without modifying base manifests or Helm defaults. +## ============================================================================ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + - ../../default + +patches: + - path: deployment-override.yaml + target: + kind: Deployment + name: kuberay-operator \ No newline at end of file From 82341b9a9fd9e4d43ad42ce4cad6522d25581709 Mon Sep 17 00:00:00 2001 From: wei-chenglai Date: Thu, 25 Sep 2025 22:31:06 -0400 Subject: [PATCH 15/21] fix lint & fix validation error --- .../values-kuberay-operator-override.yaml | 2 +- .../config/overlays/test-overrides/README.md | 42 +++++-- .../test-overrides/deployment-override.yaml | 2 +- .../test-overrides/kustomization.yaml | 2 +- .../controllers/ray/utils/validation.go | 103 +++++++++++++----- .../controllers/ray/utils/validation_test.go | 10 +- 6 files changed, 114 insertions(+), 47 deletions(-) diff --git a/.buildkite/values-kuberay-operator-override.yaml b/.buildkite/values-kuberay-operator-override.yaml index 3a0d6aa1ffb..7dc396edd71 100644 --- a/.buildkite/values-kuberay-operator-override.yaml +++ b/.buildkite/values-kuberay-operator-override.yaml @@ -15,4 +15,4 @@ featureGates: - name: RayClusterStatusConditions enabled: true - name: RayJobDeletionPolicy - enabled: true \ No newline at end of file + enabled: true diff --git a/ray-operator/config/overlays/test-overrides/README.md b/ray-operator/config/overlays/test-overrides/README.md index 8ec5e435c11..9df3cb15a7c 100644 --- a/ray-operator/config/overlays/test-overrides/README.md +++ b/ray-operator/config/overlays/test-overrides/README.md @@ -1,6 +1,7 @@ # Test Overrides Overlay (CI / e2e ONLY) This overlay enables test-only / alpha feature gates (currently `RayJobDeletionPolicy`) without modifying: + - The base manifests under `config/default` - Generated CRDs (`make generate`) - Helm chart defaults (`make helm`, users' `helm install` without -f override) @@ -8,11 +9,17 @@ This overlay enables test-only / alpha feature gates (currently `RayJobDeletionP Use it only in CI or local end-to-end testing when you explicitly need gated behavior. --- + ## Why It Exists -Some feature gates are intentionally disabled by default for stability. E2E tests must exercise them to validate behavior prior to promotion. This overlay provides a safe, isolated place to turn them on. + +Some feature gates are intentionally disabled by default for stability. +E2E tests must exercise them to validate behavior prior to promotion. +This overlay provides a safe, isolated place to turn them on. 
--- + ## Safety Guarantees + | Concern | Guarantee | |---------|-----------| | Default user deploy (`make deploy`) | Unchanged | @@ -21,18 +28,25 @@ Some feature gates are intentionally disabled by default for stability. E2E test | Feature gates scope | Only those explicitly listed here | --- + ## Usage + Deploy with feature gates enabled: -``` + +```bash make deploy-with-override IMG=kuberay/operator:nightly ``` + Helm path (CI release automation): -``` + +```bash helm install kuberay-operator kuberay/kuberay-operator -f .buildkite/values-kuberay-operator-override.yaml ``` --- + ## Adding Another Feature Gate + 1. Edit `deployment-override.yaml` – append your gate inside the existing `--feature-gates=` list. 2. Update `.buildkite/values-kuberay-operator-override.yaml` likewise. 3. Add or adjust e2e tests as needed. @@ -40,29 +54,39 @@ helm install kuberay-operator kuberay/kuberay-operator -f .buildkite/values-kube Keep gate ordering stable to minimize diff noise. --- + ## Keeping In Sync + If the base operator Deployment args change in `config/manager/manager.yaml`: + 1. Copy the updated arg list. 2. Re-apply the feature gates in `deployment-override.yaml`. 3. Re-render to confirm. --- + ## Removal / Promotion Flow + When a gate graduates (enabled by default upstream): + 1. Remove it from the override (if it's default-on, it no longer needs listing). 2. Remove corresponding logic from tests if they branch on gate state. 3. (Optional) Note the graduation in release notes. --- + ## Troubleshooting -Problem | Action ---------|------- -Patch no longer applies | Check if Deployment name or container name changed. -Gates not taking effect | Confirm args rendered (render target) and operator pod restarted. -Unexpected arg order | The strategic merge patch replaces the entire args list; adjust ordering there. + +| Problem | Action | +|---------|--------| +| Patch no longer applies | Check if Deployment name or container name changed. | +| Gates not taking effect | Confirm args rendered (render target) and operator pod restarted. | +| Unexpected arg order | The strategic merge patch replaces the entire args list; adjust ordering there. | --- + ## Do NOT + - Add unrelated production configuration (RBAC, CRDs, resources) here. - Reference this overlay from user-facing docs. -- Rename directory without updating `Makefile` targets. \ No newline at end of file +- Rename directory without updating `Makefile` targets. 
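As a point of reference for the README above: the gate this overlay flips is consulted inside the operator before any deletionStrategy handling. The sketch below is illustrative only (it is not part of any file in this patch series), and the import paths are assumed from the usual kuberay module layout rather than confirmed here.

```go
// Illustrative sketch only, not part of any file in this series: with the
// RayJobDeletionPolicy gate left at its default (disabled), a RayJob that sets
// spec.deletionStrategy is rejected during validation, which is why the CI/e2e
// environment needs this overlay. Import paths are assumed from the kuberay layout.
package main

import (
	"fmt"

	rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
	"github.com/ray-project/kuberay/ray-operator/pkg/features"
)

// requireGateForDeletionStrategy mirrors the guard the validator and the RayJob
// controller apply before acting on spec.deletionStrategy.
func requireGateForDeletionStrategy(rayJob *rayv1.RayJob) error {
	if rayJob.Spec.DeletionStrategy != nil && !features.Enabled(features.RayJobDeletionPolicy) {
		return fmt.Errorf("RayJobDeletionPolicy feature gate must be enabled to use DeletionStrategy")
	}
	return nil
}

func main() {
	job := &rayv1.RayJob{Spec: rayv1.RayJobSpec{DeletionStrategy: &rayv1.DeletionStrategy{}}}
	// Prints the validation error unless the gate has been enabled via this overlay.
	fmt.Println(requireGateForDeletionStrategy(job))
}
```

Without the override, `make deploy` leaves the gate at its default (disabled), so the deletion-strategy e2e cases added earlier in this series would be rejected at validation time.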
diff --git a/ray-operator/config/overlays/test-overrides/deployment-override.yaml b/ray-operator/config/overlays/test-overrides/deployment-override.yaml index 20ba1ac9f2b..5f7a1eba665 100644 --- a/ray-operator/config/overlays/test-overrides/deployment-override.yaml +++ b/ray-operator/config/overlays/test-overrides/deployment-override.yaml @@ -9,4 +9,4 @@ spec: containers: - name: kuberay-operator args: - - --feature-gates=RayClusterStatusConditions=true,RayJobDeletionPolicy=true \ No newline at end of file + - --feature-gates=RayClusterStatusConditions=true,RayJobDeletionPolicy=true diff --git a/ray-operator/config/overlays/test-overrides/kustomization.yaml b/ray-operator/config/overlays/test-overrides/kustomization.yaml index 7554d954359..c1472f6f305 100644 --- a/ray-operator/config/overlays/test-overrides/kustomization.yaml +++ b/ray-operator/config/overlays/test-overrides/kustomization.yaml @@ -14,4 +14,4 @@ patches: - path: deployment-override.yaml target: kind: Deployment - name: kuberay-operator \ No newline at end of file + name: kuberay-operator diff --git a/ray-operator/controllers/ray/utils/validation.go b/ray-operator/controllers/ray/utils/validation.go index c11998e8938..f521bd068fe 100644 --- a/ray-operator/controllers/ray/utils/validation.go +++ b/ray-operator/controllers/ray/utils/validation.go @@ -161,8 +161,9 @@ func ValidateRayJobSpec(rayJob *rayv1.RayJob) error { return fmt.Errorf("The RayJob spec is invalid: TTLSecondsAfterFinished must be a non-negative integer") } - if !rayJob.Spec.ShutdownAfterJobFinishes && rayJob.Spec.TTLSecondsAfterFinished > 0 { - return fmt.Errorf("The RayJob spec is invalid: a RayJob with shutdownAfterJobFinishes set to false cannot have TTLSecondsAfterFinished") + // Validate TTL and deletion strategy together + if err := validateDeletionConfiguration(rayJob); err != nil { + return err } isClusterSelectorMode := len(rayJob.Spec.ClusterSelector) != 0 @@ -224,9 +225,7 @@ func ValidateRayJobSpec(rayJob *rayv1.RayJob) error { if rayJob.Spec.BackoffLimit != nil && *rayJob.Spec.BackoffLimit < 0 { return fmt.Errorf("The RayJob spec is invalid: backoffLimit must be a positive integer") } - if err := validateDeletionStrategy(rayJob); err != nil { - return fmt.Errorf("invalid deletion strategy: %w", err) - } + return nil } @@ -265,41 +264,89 @@ func ValidateRayServiceSpec(rayService *rayv1.RayService) error { return nil } -// validateDeletionStrategy centralizes all validation logic for the deletion strategy. -// This includes the new `deletionRules` and the legacy fields (`onSuccess`,`onFailure`). 
-func validateDeletionStrategy(rayJob *rayv1.RayJob) error { - if rayJob.Spec.DeletionStrategy == nil { - return nil - } +// validateDeletionConfiguration validates both deletion strategy and TTL configuration +func validateDeletionConfiguration(rayJob *rayv1.RayJob) error { + // Get deletion mode flags + usingShutdownAfterJobFinishes := rayJob.Spec.ShutdownAfterJobFinishes + usingDeletionRules := rayJob.Spec.DeletionStrategy != nil && len(rayJob.Spec.DeletionStrategy.DeletionRules) > 0 + usingLegacyAPI := rayJob.Spec.DeletionStrategy != nil && + (rayJob.Spec.DeletionStrategy.OnSuccess != nil || rayJob.Spec.DeletionStrategy.OnFailure != nil) - if !features.Enabled(features.RayJobDeletionPolicy) { + // Validate feature gate requirements + if (usingDeletionRules || usingLegacyAPI) && !features.Enabled(features.RayJobDeletionPolicy) { return fmt.Errorf("RayJobDeletionPolicy feature gate must be enabled to use the DeletionStrategy feature") } - usingDeletionRules := len(rayJob.Spec.DeletionStrategy.DeletionRules) > 0 - usingLegacyAPI := rayJob.Spec.DeletionStrategy.OnSuccess != nil || rayJob.Spec.DeletionStrategy.OnFailure != nil - - // ShutdownAfterJobFinishes cannot be used with the new API. - if usingDeletionRules && rayJob.Spec.ShutdownAfterJobFinishes { - return fmt.Errorf("ShutdownAfterJobFinishes cannot be used when spec.deletionStrategy.deletionRules is defined. Please configure all deletion behaviors within deletionRules") + // Validate mutual exclusivity + if err := validateDeletionMutualExclusivity(usingShutdownAfterJobFinishes, usingDeletionRules, usingLegacyAPI); err != nil { + return err } - // Legacy API and DeletionRules cannot be used simultaneously. - if usingDeletionRules && usingLegacyAPI { - return fmt.Errorf("legacy policies (onSuccess, onFailure) and the new deletionRules cannot be used simultaneously within the same deletionStrategy") + // Validate TTL requirements + if rayJob.Spec.TTLSecondsAfterFinished > 0 { + if err := validateTTLRequirements(usingShutdownAfterJobFinishes, usingDeletionRules, usingLegacyAPI); err != nil { + return err + } } - // DeletionStrategy must contain at least one policy if specified. 
- if !usingDeletionRules && !usingLegacyAPI { - return fmt.Errorf("deletionStrategy is specified, but no policies (onSuccess, onFailure, or deletionRules) are defined within it") + // Validate deletion strategy configuration + if rayJob.Spec.DeletionStrategy != nil && !usingDeletionRules && !usingLegacyAPI { + return fmt.Errorf("deletionStrategy is specified, but no policies are defined") } + // Validate specific deletion modes if usingDeletionRules { return validateDeletionRules(rayJob) } + if usingLegacyAPI { + return validateLegacyDeletionPolicies(rayJob) + } - // If not using DeletionRules, validate the legacy strategy - return validateLegacyDeletionPolicies(rayJob) + return nil +} + +// validateDeletionMutualExclusivity ensures only one deletion approach is used +func validateDeletionMutualExclusivity(usingShutdownAfterJobFinishes, usingDeletionRules, usingLegacyAPI bool) error { + activeCount := 0 + var activeModes []string + + if usingShutdownAfterJobFinishes { + activeCount++ + activeModes = append(activeModes, "spec.shutdownAfterJobFinishes=true") + } + if usingDeletionRules { + activeCount++ + activeModes = append(activeModes, "spec.deletionStrategy.deletionRules") + } + if usingLegacyAPI { + activeCount++ + activeModes = append(activeModes, "spec.deletionStrategy.onSuccess/onFailure") + } + + if activeCount > 1 { + return fmt.Errorf("multiple deletion approaches are configured simultaneously: %v. Please use only one deletion strategy", activeModes) + } + + return nil +} + +// validateTTLRequirements ensures TTL is only used with valid cleanup mechanisms +func validateTTLRequirements(usingShutdownAfterJobFinishes, usingDeletionRules, usingLegacyAPI bool) error { + // ShutdownAfterJobFinishes is always a valid cleanup mechanism + if usingShutdownAfterJobFinishes { + return nil + } + + // Deletion strategy provides cleanup, but only if feature gate is enabled + if features.Enabled(features.RayJobDeletionPolicy) && (usingDeletionRules || usingLegacyAPI) { + return nil + } + + // No valid cleanup mechanism found + if features.Enabled(features.RayJobDeletionPolicy) { + return fmt.Errorf("The RayJob spec is invalid: TTLSecondsAfterFinished requires either ShutdownAfterJobFinishes=true or a configured deletion strategy") + } + return fmt.Errorf("The RayJob spec is invalid: a RayJob with shutdownAfterJobFinishes set to false cannot have TTLSecondsAfterFinished") } // validateDeletionRules validates the deletion rules in the RayJob spec. 
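To make the accepted shape concrete, here is a hedged sketch (illustration only, not part of this diff) of a rules-mode spec built with the same rayv1ac apply-configuration helpers the e2e tests in this series use; the refactored validation accepts it when the RayJobDeletionPolicy gate is enabled.

```go
// Illustration only, not part of this diff: a rules-mode RayJobSpec built with the
// same rayv1ac apply-configuration helpers used by the e2e tests in this series.
// Import paths are assumed from the kuberay module layout.
package main

import (
	"fmt"

	rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
	rayv1ac "github.com/ray-project/kuberay/ray-operator/pkg/client/applyconfiguration/ray/v1"
)

func main() {
	// Rules mode: shutdownAfterJobFinishes stays false; each rule carries its own
	// jobStatus + ttlSeconds trigger instead of a single global TTL.
	spec := rayv1ac.RayJobSpec().
		WithEntrypoint("python /home/ray/jobs/counter.py").
		WithShutdownAfterJobFinishes(false).
		WithDeletionStrategy(rayv1ac.DeletionStrategy().
			WithDeletionRules(
				rayv1ac.DeletionRule().
					WithPolicy(rayv1.DeleteWorkers).
					WithCondition(rayv1ac.DeletionCondition().
						WithJobStatus(rayv1.JobStatusSucceeded).
						WithTTLSeconds(15)),
				rayv1ac.DeletionRule().
					WithPolicy(rayv1.DeleteSelf).
					WithCondition(rayv1ac.DeletionCondition().
						WithJobStatus(rayv1.JobStatusSucceeded).
						WithTTLSeconds(60)),
			))

	// Combining deletionRules with WithShutdownAfterJobFinishes(true) or with the
	// legacy WithOnSuccess/WithOnFailure policies is rejected by the checks above.
	fmt.Printf("%+v\n", spec)
}
```

In the legacy style, both onSuccess and onFailure are set instead, as the "Legacy OnSuccess DeleteCluster" e2e case above does.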
@@ -429,9 +476,5 @@ func validateLegacyDeletionPolicies(rayJob *rayv1.RayJob) error { return fmt.Errorf("DeletionStrategy=DeleteWorkers currently does not support RayCluster with autoscaling enabled") } - if rayJob.Spec.ShutdownAfterJobFinishes && (*onSuccessPolicy.Policy == rayv1.DeleteNone || *onFailurePolicy.Policy == rayv1.DeleteNone) { - return fmt.Errorf("shutdownAfterJobFinishes is set to 'true' while deletion policy is 'DeleteNone'") - } - return nil } diff --git a/ray-operator/controllers/ray/utils/validation_test.go b/ray-operator/controllers/ray/utils/validation_test.go index 16f2911229d..d7f44900dd3 100644 --- a/ray-operator/controllers/ray/utils/validation_test.go +++ b/ray-operator/controllers/ray/utils/validation_test.go @@ -1012,7 +1012,7 @@ func TestValidateRayJobSpecWithFeatureGate(t *testing.T) { OnFailure: &rayv1.DeletionPolicy{ Policy: ptr.To(rayv1.DeleteCluster), }, - }, ShutdownAfterJobFinishes: true, + }, ShutdownAfterJobFinishes: false, RayClusterSpec: createBasicRayClusterSpec(), }, expectError: false, @@ -1048,7 +1048,7 @@ func TestValidateRayJobSpecWithFeatureGate(t *testing.T) { OnFailure: &rayv1.DeletionPolicy{ Policy: ptr.To(rayv1.DeleteNone), }, - }, ShutdownAfterJobFinishes: true, + }, ShutdownAfterJobFinishes: false, RayClusterSpec: createBasicRayClusterSpec(), }, expectError: true, @@ -1060,7 +1060,7 @@ func TestValidateRayJobSpecWithFeatureGate(t *testing.T) { OnFailure: &rayv1.DeletionPolicy{ Policy: ptr.To(rayv1.DeleteNone), }, - }, ShutdownAfterJobFinishes: true, + }, ShutdownAfterJobFinishes: false, RayClusterSpec: createBasicRayClusterSpec(), }, expectError: true, @@ -1072,7 +1072,7 @@ func TestValidateRayJobSpecWithFeatureGate(t *testing.T) { OnSuccess: &rayv1.DeletionPolicy{ Policy: ptr.To(rayv1.DeleteNone), }, - }, ShutdownAfterJobFinishes: true, + }, ShutdownAfterJobFinishes: false, RayClusterSpec: createBasicRayClusterSpec(), }, expectError: true, @@ -1085,7 +1085,7 @@ func TestValidateRayJobSpecWithFeatureGate(t *testing.T) { Policy: ptr.To(rayv1.DeleteNone), }, OnFailure: &rayv1.DeletionPolicy{}, - }, ShutdownAfterJobFinishes: true, + }, ShutdownAfterJobFinishes: false, RayClusterSpec: createBasicRayClusterSpec(), }, expectError: true, From 92a8d7edb167b9e1e6da124ad534859e10d002c2 Mon Sep 17 00:00:00 2001 From: wei-chenglai Date: Sat, 27 Sep 2025 00:26:34 +0000 Subject: [PATCH 16/21] refactor --- docs/reference/api.md | 36 +++---- ray-operator/apis/ray/v1/rayjob_types.go | 43 ++++---- .../controllers/ray/rayjob_controller.go | 2 +- .../controllers/ray/utils/validation.go | 98 ++++++------------- .../controllers/ray/utils/validation_test.go | 12 ++- .../rayjob_deletion_strategy_test.go | 57 +---------- 6 files changed, 77 insertions(+), 171 deletions(-) diff --git a/docs/reference/api.md b/docs/reference/api.md index 3b773074bb5..39b443f6701 100644 --- a/docs/reference/api.md +++ b/docs/reference/api.md @@ -124,33 +124,25 @@ _Appears in:_ -DeletionStrategy defines the deletion policies for a RayJob. -It allows for fine-grained control over resource cleanup after a job finishes. +DeletionStrategy configures automated cleanup after the RayJob reaches a terminal state. +Two mutually exclusive styles are supported: -Legacy fields `onSuccess` and `onFailure` are still supported for backward compatibility, -but it is highly recommended to migrate to the new `deletionRules` field. -`onSuccess` and `onFailure` will be removed in release 1.6.0. + Legacy: provide both onSuccess and onFailure (deprecated; removal planned for 1.6.0). 
May be combined with shutdownAfterJobFinishes and (optionally) global TTLSecondsAfterFinished. + Rules: provide deletionRules (list; may be empty to explicitly select rules mode). Rules mode is incompatible with shutdownAfterJobFinishes, legacy fields, and the global TTLSecondsAfterFinished (use per‑rule condition.ttlSeconds instead). -Notes: - - When this block is set, you must configure **either** - (a) BOTH `onSuccess` and `onFailure` policies, - OR - (b) the `deletionRules` field (which may be empty, in which case no deletion will occur). - - `onSuccess` / `onFailure` must NOT be used together with `deletionRules`. - - `onSuccess` and `onFailure` are **deprecated** and planned for removal in release 1.6.0. - - `deletionStrategy` is mutually exclusive with `spec.shutdownAfterJobFinishes`. - - If both are set, the controller will report an error and stop processing the RayJob. - - If the `RayJobDeletionPolicy` feature gate is disabled but `deletionStrategy` is set, - the controller will report an error and stop processing the RayJob. +Semantics: + - An empty deletionRules slice still selects rules mode. + - Legacy requires both onSuccess and onFailure; specifying only one is invalid. + - Global TTLSecondsAfterFinished > 0 requires shutdownAfterJobFinishes=true; therefore it cannot be used with rules mode or with legacy alone (no shutdown). + - Feature gate RayJobDeletionPolicy must be enabled when this block is present. -Validation rules: - 1. Prevent mixing legacy and new fields - - - 2. Require either both legacy fields or deletionRules presence +Validation: + - CRD XValidations prevent mixing legacy fields with deletionRules and enforce legacy completeness. + - Webhook/controller logic enforces rules vs shutdown exclusivity and TTL constraints. + - onSuccess/onFailure are deprecated; migration to deletionRules is encouraged. @@ -305,7 +297,7 @@ _Appears in:_ | `clusterSelector` _object (keys:string, values:string)_ | clusterSelector is used to select running rayclusters by labels | | | | `submitterConfig` _[SubmitterConfig](#submitterconfig)_ | Configurations of submitter k8s job. | | | | `managedBy` _string_ | ManagedBy is an optional configuration for the controller or entity that manages a RayJob.
The value must be either 'ray.io/kuberay-operator' or 'kueue.x-k8s.io/multikueue'.
The kuberay-operator reconciles a RayJob which doesn't have this field at all or
the field value is the reserved string 'ray.io/kuberay-operator',
but delegates reconciling the RayJob with 'kueue.x-k8s.io/multikueue' to the Kueue.
The field is immutable. | | | -| `deletionStrategy` _[DeletionStrategy](#deletionstrategy)_ | DeletionStrategy defines resource cleanup policies after job completion.
Use either legacy fields (onSuccess/onFailure) OR deletionRules, not both.
Mutually exclusive with spec.shutdownAfterJobFinishes.
Requires RayJobDeletionPolicy feature gate to be enabled. | | | +| `deletionStrategy` _[DeletionStrategy](#deletionstrategy)_ | DeletionStrategy automates post-completion cleanup.
Choose one style or omit:
- Legacy: both onSuccess & onFailure (deprecated; may combine with shutdownAfterJobFinishes and TTLSecondsAfterFinished).
- Rules: deletionRules (empty or non-empty) — incompatible with shutdownAfterJobFinishes, legacy fields, and global TTLSecondsAfterFinished (use per-rule condition.ttlSeconds).
Global TTLSecondsAfterFinished > 0 requires shutdownAfterJobFinishes=true.
Feature gate RayJobDeletionPolicy must be enabled when this field is set. | | | | `entrypoint` _string_ | Entrypoint represents the command to start execution. | | | | `runtimeEnvYAML` _string_ | RuntimeEnvYAML represents the runtime environment configuration
provided as a multi-line YAML string. | | | | `jobId` _string_ | If jobId is not set, a new jobId will be auto-generated. | | | diff --git a/ray-operator/apis/ray/v1/rayjob_types.go b/ray-operator/apis/ray/v1/rayjob_types.go index 37fd5dbe7f3..cd3a68db472 100644 --- a/ray-operator/apis/ray/v1/rayjob_types.go +++ b/ray-operator/apis/ray/v1/rayjob_types.go @@ -87,31 +87,24 @@ const ( SidecarMode JobSubmissionMode = "SidecarMode" // Submit job via a sidecar container in the Ray head Pod ) -// DeletionStrategy defines the deletion policies for a RayJob. -// It allows for fine-grained control over resource cleanup after a job finishes. +// DeletionStrategy configures automated cleanup after the RayJob reaches a terminal state. +// Two mutually exclusive styles are supported: // -// Legacy fields `onSuccess` and `onFailure` are still supported for backward compatibility, -// but it is highly recommended to migrate to the new `deletionRules` field. -// `onSuccess` and `onFailure` will be removed in release 1.6.0. +// Legacy: provide both onSuccess and onFailure (deprecated; removal planned for 1.6.0). May be combined with shutdownAfterJobFinishes and (optionally) global TTLSecondsAfterFinished. +// Rules: provide deletionRules (list; may be empty to explicitly select rules mode). Rules mode is incompatible with shutdownAfterJobFinishes, legacy fields, and the global TTLSecondsAfterFinished (use per‑rule condition.ttlSeconds instead). // -// Notes: -// - When this block is set, you must configure **either** -// (a) BOTH `onSuccess` and `onFailure` policies, -// OR -// (b) the `deletionRules` field (which may be empty, in which case no deletion will occur). -// - `onSuccess` / `onFailure` must NOT be used together with `deletionRules`. -// - `onSuccess` and `onFailure` are **deprecated** and planned for removal in release 1.6.0. -// - `deletionStrategy` is mutually exclusive with `spec.shutdownAfterJobFinishes`. -// - If both are set, the controller will report an error and stop processing the RayJob. -// - If the `RayJobDeletionPolicy` feature gate is disabled but `deletionStrategy` is set, -// the controller will report an error and stop processing the RayJob. +// Semantics: +// - An empty deletionRules slice still selects rules mode. +// - Legacy requires both onSuccess and onFailure; specifying only one is invalid. +// - Global TTLSecondsAfterFinished > 0 requires shutdownAfterJobFinishes=true; therefore it cannot be used with rules mode or with legacy alone (no shutdown). +// - Feature gate RayJobDeletionPolicy must be enabled when this block is present. // -// Validation rules: -// 1. Prevent mixing legacy and new fields +// Validation: +// - CRD XValidations prevent mixing legacy fields with deletionRules and enforce legacy completeness. +// - Webhook/controller logic enforces rules vs shutdown exclusivity and TTL constraints. +// - onSuccess/onFailure are deprecated; migration to deletionRules is encouraged. // // +kubebuilder:validation:XValidation:rule="!((has(self.onSuccess) || has(self.onFailure)) && has(self.deletionRules))",message="legacy policies (onSuccess/onFailure) and deletionRules cannot be used together within the same deletionStrategy" -// 2. 
Require either both legacy fields or deletionRules presence -// // +kubebuilder:validation:XValidation:rule="((has(self.onSuccess) && has(self.onFailure)) || has(self.deletionRules))",message="deletionStrategy requires either BOTH onSuccess and onFailure, OR the deletionRules field (which may be empty)" type DeletionStrategy struct { // OnSuccess is the deletion policy for a successful RayJob. @@ -232,10 +225,12 @@ type RayJobSpec struct { // +kubebuilder:validation:XValidation:rule="self in ['ray.io/kuberay-operator', 'kueue.x-k8s.io/multikueue']",message="the managedBy field value must be either 'ray.io/kuberay-operator' or 'kueue.x-k8s.io/multikueue'" // +optional ManagedBy *string `json:"managedBy,omitempty"` - // DeletionStrategy defines resource cleanup policies after job completion. - // Use either legacy fields (onSuccess/onFailure) OR deletionRules, not both. - // Mutually exclusive with spec.shutdownAfterJobFinishes. - // Requires RayJobDeletionPolicy feature gate to be enabled. + // DeletionStrategy automates post-completion cleanup. + // Choose one style or omit: + // - Legacy: both onSuccess & onFailure (deprecated; may combine with shutdownAfterJobFinishes and TTLSecondsAfterFinished). + // - Rules: deletionRules (empty or non-empty) — incompatible with shutdownAfterJobFinishes, legacy fields, and global TTLSecondsAfterFinished (use per-rule condition.ttlSeconds). + // Global TTLSecondsAfterFinished > 0 requires shutdownAfterJobFinishes=true. + // Feature gate RayJobDeletionPolicy must be enabled when this field is set. // +optional DeletionStrategy *DeletionStrategy `json:"deletionStrategy,omitempty"` // Entrypoint represents the command to start execution. diff --git a/ray-operator/controllers/ray/rayjob_controller.go b/ray-operator/controllers/ray/rayjob_controller.go index fb139256189..e2f58652347 100644 --- a/ray-operator/controllers/ray/rayjob_controller.go +++ b/ray-operator/controllers/ray/rayjob_controller.go @@ -373,7 +373,7 @@ func (r *RayJobReconciler) Reconcile(ctx context.Context, request ctrl.Request) if features.Enabled(features.RayJobDeletionPolicy) && rayJobInstance.Spec.DeletionStrategy != nil { // The previous validation logic ensures that either DeletionRules or the legacy policies are set, but not both. 
- if len(rayJobInstance.Spec.DeletionStrategy.DeletionRules) > 0 { + if rayJobInstance.Spec.DeletionStrategy.DeletionRules != nil { return r.handleDeletionRules(ctx, rayJobInstance) } return r.handleLegacyDeletionPolicy(ctx, rayJobInstance) diff --git a/ray-operator/controllers/ray/utils/validation.go b/ray-operator/controllers/ray/utils/validation.go index f521bd068fe..d063661cf2f 100644 --- a/ray-operator/controllers/ray/utils/validation.go +++ b/ray-operator/controllers/ray/utils/validation.go @@ -266,89 +266,47 @@ func ValidateRayServiceSpec(rayService *rayv1.RayService) error { // validateDeletionConfiguration validates both deletion strategy and TTL configuration func validateDeletionConfiguration(rayJob *rayv1.RayJob) error { - // Get deletion mode flags - usingShutdownAfterJobFinishes := rayJob.Spec.ShutdownAfterJobFinishes - usingDeletionRules := rayJob.Spec.DeletionStrategy != nil && len(rayJob.Spec.DeletionStrategy.DeletionRules) > 0 - usingLegacyAPI := rayJob.Spec.DeletionStrategy != nil && - (rayJob.Spec.DeletionStrategy.OnSuccess != nil || rayJob.Spec.DeletionStrategy.OnFailure != nil) - - // Validate feature gate requirements - if (usingDeletionRules || usingLegacyAPI) && !features.Enabled(features.RayJobDeletionPolicy) { - return fmt.Errorf("RayJobDeletionPolicy feature gate must be enabled to use the DeletionStrategy feature") - } - - // Validate mutual exclusivity - if err := validateDeletionMutualExclusivity(usingShutdownAfterJobFinishes, usingDeletionRules, usingLegacyAPI); err != nil { - return err - } - - // Validate TTL requirements - if rayJob.Spec.TTLSecondsAfterFinished > 0 { - if err := validateTTLRequirements(usingShutdownAfterJobFinishes, usingDeletionRules, usingLegacyAPI); err != nil { - return err - } + if !rayJob.Spec.ShutdownAfterJobFinishes && rayJob.Spec.TTLSecondsAfterFinished > 0 { + return fmt.Errorf("The RayJob spec is invalid: a RayJob with shutdownAfterJobFinishes set to false cannot have TTLSecondsAfterFinished") } - // Validate deletion strategy configuration - if rayJob.Spec.DeletionStrategy != nil && !usingDeletionRules && !usingLegacyAPI { - return fmt.Errorf("deletionStrategy is specified, but no policies are defined") + // No strategy block: nothing else to validate. + if rayJob.Spec.DeletionStrategy == nil { + return nil } - // Validate specific deletion modes - if usingDeletionRules { - return validateDeletionRules(rayJob) - } - if usingLegacyAPI { - return validateLegacyDeletionPolicies(rayJob) + // Feature gate must be enabled for any strategy usage. + if !features.Enabled(features.RayJobDeletionPolicy) { + return fmt.Errorf("RayJobDeletionPolicy feature gate must be enabled to use DeletionStrategy") } - return nil -} - -// validateDeletionMutualExclusivity ensures only one deletion approach is used -func validateDeletionMutualExclusivity(usingShutdownAfterJobFinishes, usingDeletionRules, usingLegacyAPI bool) error { - activeCount := 0 - var activeModes []string + legacyConfigured := rayJob.Spec.DeletionStrategy.OnSuccess != nil || rayJob.Spec.DeletionStrategy.OnFailure != nil + rulesConfigured := rayJob.Spec.DeletionStrategy.DeletionRules != nil // explicit empty slice counts as rules mode - if usingShutdownAfterJobFinishes { - activeCount++ - activeModes = append(activeModes, "spec.shutdownAfterJobFinishes=true") - } - if usingDeletionRules { - activeCount++ - activeModes = append(activeModes, "spec.deletionStrategy.deletionRules") + // Mutual exclusivity: rules mode forbids shutdown & legacy. 
(TTL+rules is implicitly invalid because TTL requires shutdown.) + if rulesConfigured && rayJob.Spec.ShutdownAfterJobFinishes { + return fmt.Errorf("The RayJob spec is invalid: spec.shutdownAfterJobFinishes and spec.deletionStrategy.deletionRules are mutually exclusive") } - if usingLegacyAPI { - activeCount++ - activeModes = append(activeModes, "spec.deletionStrategy.onSuccess/onFailure") + if rulesConfigured && legacyConfigured { + return fmt.Errorf("The RayJob spec is invalid: Cannot use both legacy onSuccess/onFailure fields and deletionRules simultaneously") } - if activeCount > 1 { - return fmt.Errorf("multiple deletion approaches are configured simultaneously: %v. Please use only one deletion strategy", activeModes) + // Detailed content validation + if legacyConfigured { + if err := validateLegacyDeletionPolicies(rayJob); err != nil { + return err + } + } else if rulesConfigured { + if err := validateDeletionRules(rayJob); err != nil { + return err + } + } else { + return fmt.Errorf("The RayJob spec is invalid: DeletionStrategy requires either BOTH onSuccess and onFailure, OR the deletionRules field (which may be empty list)") } return nil } -// validateTTLRequirements ensures TTL is only used with valid cleanup mechanisms -func validateTTLRequirements(usingShutdownAfterJobFinishes, usingDeletionRules, usingLegacyAPI bool) error { - // ShutdownAfterJobFinishes is always a valid cleanup mechanism - if usingShutdownAfterJobFinishes { - return nil - } - - // Deletion strategy provides cleanup, but only if feature gate is enabled - if features.Enabled(features.RayJobDeletionPolicy) && (usingDeletionRules || usingLegacyAPI) { - return nil - } - - // No valid cleanup mechanism found - if features.Enabled(features.RayJobDeletionPolicy) { - return fmt.Errorf("The RayJob spec is invalid: TTLSecondsAfterFinished requires either ShutdownAfterJobFinishes=true or a configured deletion strategy") - } - return fmt.Errorf("The RayJob spec is invalid: a RayJob with shutdownAfterJobFinishes set to false cannot have TTLSecondsAfterFinished") -} - // validateDeletionRules validates the deletion rules in the RayJob spec. // It performs per-rule validations, checks for uniqueness, and ensures logical TTL consistency. // Errors are collected and returned as a single aggregated error using errors.Join for better user feedback. 
@@ -476,5 +434,9 @@ func validateLegacyDeletionPolicies(rayJob *rayv1.RayJob) error { return fmt.Errorf("DeletionStrategy=DeleteWorkers currently does not support RayCluster with autoscaling enabled") } + if rayJob.Spec.ShutdownAfterJobFinishes && (*onSuccessPolicy.Policy == rayv1.DeleteNone || *onFailurePolicy.Policy == rayv1.DeleteNone) { + return fmt.Errorf("The RayJob spec is invalid: shutdownAfterJobFinshes is set to 'true' while deletion policy is 'DeleteNone'") + } + return nil } diff --git a/ray-operator/controllers/ray/utils/validation_test.go b/ray-operator/controllers/ray/utils/validation_test.go index d7f44900dd3..2314d5e79f5 100644 --- a/ray-operator/controllers/ray/utils/validation_test.go +++ b/ray-operator/controllers/ray/utils/validation_test.go @@ -1159,13 +1159,23 @@ func TestValidateRayJobSpecWithFeatureGate(t *testing.T) { expectError: true, }, { - name: "empty DeletionStrategy", + name: "nil DeletionStrategy", spec: rayv1.RayJobSpec{ DeletionStrategy: &rayv1.DeletionStrategy{}, RayClusterSpec: createBasicRayClusterSpec(), }, expectError: true, }, + { + name: "empty DeletionStrategy", + spec: rayv1.RayJobSpec{ + DeletionStrategy: &rayv1.DeletionStrategy{ + DeletionRules: []rayv1.DeletionRule{}, + }, + RayClusterSpec: createBasicRayClusterSpec(), + }, + expectError: false, + }, { name: "duplicate rule in deletionRules", spec: rayv1.RayJobSpec{ diff --git a/ray-operator/test/e2erayjob/rayjob_deletion_strategy_test.go b/ray-operator/test/e2erayjob/rayjob_deletion_strategy_test.go index 4668ba0713c..49718d3544b 100644 --- a/ray-operator/test/e2erayjob/rayjob_deletion_strategy_test.go +++ b/ray-operator/test/e2erayjob/rayjob_deletion_strategy_test.go @@ -456,13 +456,13 @@ env_vars: env_vars: counter_name: test_counter `). - WithShutdownAfterJobFinishes(false). + WithShutdownAfterJobFinishes(true). WithTTLSecondsAfterFinished(10). // Legacy TTL for backward compatibility WithDeletionStrategy(rayv1ac.DeletionStrategy(). WithOnSuccess(rayv1ac.DeletionPolicy(). WithPolicy(rayv1.DeleteCluster)). WithOnFailure(rayv1ac.DeletionPolicy(). - WithPolicy(rayv1.DeleteNone))). + WithPolicy(rayv1.DeleteCluster))). WithSubmitterPodTemplate(JobSubmitterPodTemplateApplyConfiguration())) rayJob, err := test.Client().Ray().RayV1().RayJobs(namespace.Name).Apply(test.Ctx(), rayJobAC, TestApplyOptions) @@ -501,57 +501,4 @@ env_vars: g.Eventually(func() error { _, err := GetRayJob(test, job.Namespace, job.Name); return err }, TestTimeoutMedium).Should(WithTransform(k8serrors.IsNotFound, BeTrue())) LogWithTimestamp(test.T(), "Cleanup after legacy success scenario complete") }) - - test.T().Run("Legacy OnFailure DeleteNone should still work", func(_ *testing.T) { - rayJobAC := rayv1ac.RayJob("legacy-failure-test", namespace.Name). - WithSpec(rayv1ac.RayJobSpec(). - WithRayClusterSpec(NewRayClusterSpec(MountConfigMap[rayv1ac.RayClusterSpecApplyConfiguration](jobs, "/home/ray/jobs"))). - WithEntrypoint("python /home/ray/jobs/fail.py"). // Use failing script - WithShutdownAfterJobFinishes(false). - WithTTLSecondsAfterFinished(10). - WithDeletionStrategy(rayv1ac.DeletionStrategy(). - WithOnSuccess(rayv1ac.DeletionPolicy(). - WithPolicy(rayv1.DeleteCluster)). - WithOnFailure(rayv1ac.DeletionPolicy(). - WithPolicy(rayv1.DeleteNone))). 
- WithSubmitterPodTemplate(JobSubmitterPodTemplateApplyConfiguration())) - - rayJob, err := test.Client().Ray().RayV1().RayJobs(namespace.Name).Apply(test.Ctx(), rayJobAC, TestApplyOptions) - g.Expect(err).NotTo(HaveOccurred()) - LogWithTimestamp(test.T(), "Created legacy failure RayJob %s/%s successfully", rayJob.Namespace, rayJob.Name) - - // Wait for job to fail - g.Eventually(RayJob(test, rayJob.Namespace, rayJob.Name), TestTimeoutMedium). - Should(WithTransform(RayJobStatus, Equal(rayv1.JobStatusFailed))) - LogWithTimestamp(test.T(), "RayJob %s/%s failed as expected", rayJob.Namespace, rayJob.Name) - - // Get the associated RayCluster name - rayJob, err = GetRayJob(test, rayJob.Namespace, rayJob.Name) - g.Expect(err).NotTo(HaveOccurred()) - rayClusterName := rayJob.Status.RayClusterName - g.Expect(rayClusterName).NotTo(BeEmpty()) - - // Wait past the TTL and verify everything is preserved due to OnFailure=DeleteNone - LogWithTimestamp(test.T(), "Waiting past TTL to verify resources preserved by OnFailure=DeleteNone...") - g.Consistently(func(gg Gomega) { - jobObj, err := GetRayJob(test, rayJob.Namespace, rayJob.Name) - gg.Expect(err).NotTo(HaveOccurred()) - gg.Expect(jobObj).NotTo(BeNil()) - cluster, err := GetRayCluster(test, namespace.Name, rayClusterName) - gg.Expect(err).NotTo(HaveOccurred()) - gg.Expect(cluster).NotTo(BeNil()) - }, 15*time.Second, 2*time.Second).Should(Succeed()) - LogWithTimestamp(test.T(), "Legacy OnFailure=DeleteNone policy working correctly") - - // Cleanup: delete legacy failure RayJob (will also GC cluster) - LogWithTimestamp(test.T(), "Cleaning up legacy failure RayJob %s/%s", rayJob.Namespace, rayJob.Name) - err = test.Client().Ray().RayV1().RayJobs(rayJob.Namespace).Delete(test.Ctx(), rayJob.Name, metav1.DeleteOptions{}) - g.Expect(err).NotTo(HaveOccurred()) - g.Eventually(func() error { _, err := GetRayJob(test, rayJob.Namespace, rayJob.Name); return err }, TestTimeoutMedium).Should(WithTransform(k8serrors.IsNotFound, BeTrue())) - g.Eventually(func() error { - _, err := GetRayCluster(test, namespace.Name, rayClusterName) - return err - }, TestTimeoutMedium).Should(WithTransform(k8serrors.IsNotFound, BeTrue())) - LogWithTimestamp(test.T(), "Cleanup after legacy failure scenario complete") - }) } From 16a287ceed5d8171f6ac67a725fe8dcc94e1f960 Mon Sep 17 00:00:00 2001 From: wei-chenglai Date: Sat, 27 Sep 2025 01:08:06 +0000 Subject: [PATCH 17/21] trigger ci --- ray-operator/controllers/ray/rayjob_controller.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ray-operator/controllers/ray/rayjob_controller.go b/ray-operator/controllers/ray/rayjob_controller.go index e2f58652347..ecee99c17e9 100644 --- a/ray-operator/controllers/ray/rayjob_controller.go +++ b/ray-operator/controllers/ray/rayjob_controller.go @@ -1120,6 +1120,11 @@ func (r *RayJobReconciler) handleDeletionRules(ctx context.Context, rayJob *rayv var overdueRules []rayv1.DeletionRule var nextRequeueTime *time.Time + if len(rayJob.Spec.DeletionStrategy.DeletionRules) == 0 { + logger.Info("No deletion rules are defined; skipping deletion handling.") + return ctrl.Result{}, nil + } + // Categorize all applicable and incomplete rules into "overdue" or "pending". for _, rule := range rayJob.Spec.DeletionStrategy.DeletionRules { // Skip rules that don't match the current job status. 
From 0f9dd7b0a16ea1eb021d11779c55b8ded8d46172 Mon Sep 17 00:00:00 2001 From: wei-chenglai Date: Sat, 27 Sep 2025 03:12:48 +0000 Subject: [PATCH 18/21] trigger ci --- docs/reference/api.md | 6 +++--- helm-chart/kuberay-operator/crds/ray.io_rayjobs.yaml | 5 +++-- ray-operator/apis/ray/v1/rayjob_types.go | 8 ++++---- ray-operator/config/crd/bases/ray.io_rayjobs.yaml | 5 +++-- ray-operator/controllers/ray/rayjob_controller.go | 5 ----- ray-operator/controllers/ray/utils/validation.go | 4 ++-- .../controllers/ray/utils/validation_test.go | 12 ++++++------ 7 files changed, 21 insertions(+), 24 deletions(-) diff --git a/docs/reference/api.md b/docs/reference/api.md index 39b443f6701..b29e1f6f12a 100644 --- a/docs/reference/api.md +++ b/docs/reference/api.md @@ -129,11 +129,11 @@ Two mutually exclusive styles are supported: Legacy: provide both onSuccess and onFailure (deprecated; removal planned for 1.6.0). May be combined with shutdownAfterJobFinishes and (optionally) global TTLSecondsAfterFinished. - Rules: provide deletionRules (list; may be empty to explicitly select rules mode). Rules mode is incompatible with shutdownAfterJobFinishes, legacy fields, and the global TTLSecondsAfterFinished (use per‑rule condition.ttlSeconds instead). + Rules: provide deletionRules (non-empty list). Rules mode is incompatible with shutdownAfterJobFinishes, legacy fields, and the global TTLSecondsAfterFinished (use per‑rule condition.ttlSeconds instead). Semantics: - - An empty deletionRules slice still selects rules mode. + - A non-empty deletionRules selects rules mode; empty lists are treated as unset. - Legacy requires both onSuccess and onFailure; specifying only one is invalid. - Global TTLSecondsAfterFinished > 0 requires shutdownAfterJobFinishes=true; therefore it cannot be used with rules mode or with legacy alone (no shutdown). - Feature gate RayJobDeletionPolicy must be enabled when this block is present. @@ -297,7 +297,7 @@ _Appears in:_ | `clusterSelector` _object (keys:string, values:string)_ | clusterSelector is used to select running rayclusters by labels | | | | `submitterConfig` _[SubmitterConfig](#submitterconfig)_ | Configurations of submitter k8s job. | | | | `managedBy` _string_ | ManagedBy is an optional configuration for the controller or entity that manages a RayJob.
The value must be either 'ray.io/kuberay-operator' or 'kueue.x-k8s.io/multikueue'.
The kuberay-operator reconciles a RayJob which doesn't have this field at all or
the field value is the reserved string 'ray.io/kuberay-operator',
but delegates reconciling the RayJob with 'kueue.x-k8s.io/multikueue' to the Kueue.
The field is immutable. | | | -| `deletionStrategy` _[DeletionStrategy](#deletionstrategy)_ | DeletionStrategy automates post-completion cleanup.
Choose one style or omit:
- Legacy: both onSuccess & onFailure (deprecated; may combine with shutdownAfterJobFinishes and TTLSecondsAfterFinished).
- Rules: deletionRules (empty or non-empty) — incompatible with shutdownAfterJobFinishes, legacy fields, and global TTLSecondsAfterFinished (use per-rule condition.ttlSeconds).
Global TTLSecondsAfterFinished > 0 requires shutdownAfterJobFinishes=true.
Feature gate RayJobDeletionPolicy must be enabled when this field is set. | | | +| `deletionStrategy` _[DeletionStrategy](#deletionstrategy)_ | DeletionStrategy automates post-completion cleanup.
Choose one style or omit:
- Legacy: both onSuccess & onFailure (deprecated; may combine with shutdownAfterJobFinishes and TTLSecondsAfterFinished).
- Rules: deletionRules (non-empty) — incompatible with shutdownAfterJobFinishes, legacy fields, and global TTLSecondsAfterFinished (use per-rule condition.ttlSeconds).
Global TTLSecondsAfterFinished > 0 requires shutdownAfterJobFinishes=true.
Feature gate RayJobDeletionPolicy must be enabled when this field is set. | | | | `entrypoint` _string_ | Entrypoint represents the command to start execution. | | | | `runtimeEnvYAML` _string_ | RuntimeEnvYAML represents the runtime environment configuration
provided as a multi-line YAML string. | | | | `jobId` _string_ | If jobId is not set, a new jobId will be auto-generated. | | | diff --git a/helm-chart/kuberay-operator/crds/ray.io_rayjobs.yaml b/helm-chart/kuberay-operator/crds/ray.io_rayjobs.yaml index 8ee2bc5ce4d..e5e23e6666a 100644 --- a/helm-chart/kuberay-operator/crds/ray.io_rayjobs.yaml +++ b/helm-chart/kuberay-operator/crds/ray.io_rayjobs.yaml @@ -117,8 +117,9 @@ spec: cannot be used together within the same deletionStrategy rule: '!((has(self.onSuccess) || has(self.onFailure)) && has(self.deletionRules))' - message: deletionStrategy requires either BOTH onSuccess and onFailure, - OR the deletionRules field (which may be empty) - rule: ((has(self.onSuccess) && has(self.onFailure)) || has(self.deletionRules)) + OR the deletionRules field (cannot be empty) + rule: ((has(self.onSuccess) && has(self.onFailure)) || (has(self.deletionRules) + && size(self.deletionRules) > 0)) entrypoint: type: string entrypointNumCpus: diff --git a/ray-operator/apis/ray/v1/rayjob_types.go b/ray-operator/apis/ray/v1/rayjob_types.go index cd3a68db472..0c87d0c08c3 100644 --- a/ray-operator/apis/ray/v1/rayjob_types.go +++ b/ray-operator/apis/ray/v1/rayjob_types.go @@ -91,10 +91,10 @@ const ( // Two mutually exclusive styles are supported: // // Legacy: provide both onSuccess and onFailure (deprecated; removal planned for 1.6.0). May be combined with shutdownAfterJobFinishes and (optionally) global TTLSecondsAfterFinished. -// Rules: provide deletionRules (list; may be empty to explicitly select rules mode). Rules mode is incompatible with shutdownAfterJobFinishes, legacy fields, and the global TTLSecondsAfterFinished (use per‑rule condition.ttlSeconds instead). +// Rules: provide deletionRules (non-empty list). Rules mode is incompatible with shutdownAfterJobFinishes, legacy fields, and the global TTLSecondsAfterFinished (use per‑rule condition.ttlSeconds instead). // // Semantics: -// - An empty deletionRules slice still selects rules mode. +// - A non-empty deletionRules selects rules mode; empty lists are treated as unset. // - Legacy requires both onSuccess and onFailure; specifying only one is invalid. // - Global TTLSecondsAfterFinished > 0 requires shutdownAfterJobFinishes=true; therefore it cannot be used with rules mode or with legacy alone (no shutdown). // - Feature gate RayJobDeletionPolicy must be enabled when this block is present. @@ -105,7 +105,7 @@ const ( // - onSuccess/onFailure are deprecated; migration to deletionRules is encouraged. // // +kubebuilder:validation:XValidation:rule="!((has(self.onSuccess) || has(self.onFailure)) && has(self.deletionRules))",message="legacy policies (onSuccess/onFailure) and deletionRules cannot be used together within the same deletionStrategy" -// +kubebuilder:validation:XValidation:rule="((has(self.onSuccess) && has(self.onFailure)) || has(self.deletionRules))",message="deletionStrategy requires either BOTH onSuccess and onFailure, OR the deletionRules field (which may be empty)" +// +kubebuilder:validation:XValidation:rule="((has(self.onSuccess) && has(self.onFailure)) || (has(self.deletionRules) && size(self.deletionRules) > 0))",message="deletionStrategy requires either BOTH onSuccess and onFailure, OR the deletionRules field (cannot be empty)" type DeletionStrategy struct { // OnSuccess is the deletion policy for a successful RayJob. // Deprecated: Use `deletionRules` instead for more flexible, multi-stage deletion strategies. 
@@ -228,7 +228,7 @@ type RayJobSpec struct { // DeletionStrategy automates post-completion cleanup. // Choose one style or omit: // - Legacy: both onSuccess & onFailure (deprecated; may combine with shutdownAfterJobFinishes and TTLSecondsAfterFinished). - // - Rules: deletionRules (empty or non-empty) — incompatible with shutdownAfterJobFinishes, legacy fields, and global TTLSecondsAfterFinished (use per-rule condition.ttlSeconds). + // - Rules: deletionRules (non-empty) — incompatible with shutdownAfterJobFinishes, legacy fields, and global TTLSecondsAfterFinished (use per-rule condition.ttlSeconds). // Global TTLSecondsAfterFinished > 0 requires shutdownAfterJobFinishes=true. // Feature gate RayJobDeletionPolicy must be enabled when this field is set. // +optional diff --git a/ray-operator/config/crd/bases/ray.io_rayjobs.yaml b/ray-operator/config/crd/bases/ray.io_rayjobs.yaml index 8ee2bc5ce4d..e5e23e6666a 100644 --- a/ray-operator/config/crd/bases/ray.io_rayjobs.yaml +++ b/ray-operator/config/crd/bases/ray.io_rayjobs.yaml @@ -117,8 +117,9 @@ spec: cannot be used together within the same deletionStrategy rule: '!((has(self.onSuccess) || has(self.onFailure)) && has(self.deletionRules))' - message: deletionStrategy requires either BOTH onSuccess and onFailure, - OR the deletionRules field (which may be empty) - rule: ((has(self.onSuccess) && has(self.onFailure)) || has(self.deletionRules)) + OR the deletionRules field (cannot be empty) + rule: ((has(self.onSuccess) && has(self.onFailure)) || (has(self.deletionRules) + && size(self.deletionRules) > 0)) entrypoint: type: string entrypointNumCpus: diff --git a/ray-operator/controllers/ray/rayjob_controller.go b/ray-operator/controllers/ray/rayjob_controller.go index ecee99c17e9..e2f58652347 100644 --- a/ray-operator/controllers/ray/rayjob_controller.go +++ b/ray-operator/controllers/ray/rayjob_controller.go @@ -1120,11 +1120,6 @@ func (r *RayJobReconciler) handleDeletionRules(ctx context.Context, rayJob *rayv var overdueRules []rayv1.DeletionRule var nextRequeueTime *time.Time - if len(rayJob.Spec.DeletionStrategy.DeletionRules) == 0 { - logger.Info("No deletion rules are defined; skipping deletion handling.") - return ctrl.Result{}, nil - } - // Categorize all applicable and incomplete rules into "overdue" or "pending". for _, rule := range rayJob.Spec.DeletionStrategy.DeletionRules { // Skip rules that don't match the current job status. diff --git a/ray-operator/controllers/ray/utils/validation.go b/ray-operator/controllers/ray/utils/validation.go index d063661cf2f..edda0b772d5 100644 --- a/ray-operator/controllers/ray/utils/validation.go +++ b/ray-operator/controllers/ray/utils/validation.go @@ -281,7 +281,7 @@ func validateDeletionConfiguration(rayJob *rayv1.RayJob) error { } legacyConfigured := rayJob.Spec.DeletionStrategy.OnSuccess != nil || rayJob.Spec.DeletionStrategy.OnFailure != nil - rulesConfigured := rayJob.Spec.DeletionStrategy.DeletionRules != nil // explicit empty slice counts as rules mode + rulesConfigured := len(rayJob.Spec.DeletionStrategy.DeletionRules) > 0 // Mutual exclusivity: rules mode forbids shutdown & legacy. (TTL+rules is implicitly invalid because TTL requires shutdown.) 
if rulesConfigured && rayJob.Spec.ShutdownAfterJobFinishes { @@ -301,7 +301,7 @@ func validateDeletionConfiguration(rayJob *rayv1.RayJob) error { return err } } else { - return fmt.Errorf("The RayJob spec is invalid: DeletionStrategy requires either BOTH onSuccess and onFailure, OR the deletionRules field (which may be empty list)") + return fmt.Errorf("The RayJob spec is invalid: DeletionStrategy requires either BOTH onSuccess and onFailure, OR the deletionRules field (cannot be empty)") } return nil diff --git a/ray-operator/controllers/ray/utils/validation_test.go b/ray-operator/controllers/ray/utils/validation_test.go index 2314d5e79f5..30eb499beaf 100644 --- a/ray-operator/controllers/ray/utils/validation_test.go +++ b/ray-operator/controllers/ray/utils/validation_test.go @@ -1012,7 +1012,7 @@ func TestValidateRayJobSpecWithFeatureGate(t *testing.T) { OnFailure: &rayv1.DeletionPolicy{ Policy: ptr.To(rayv1.DeleteCluster), }, - }, ShutdownAfterJobFinishes: false, + }, ShutdownAfterJobFinishes: true, RayClusterSpec: createBasicRayClusterSpec(), }, expectError: false, @@ -1048,7 +1048,7 @@ func TestValidateRayJobSpecWithFeatureGate(t *testing.T) { OnFailure: &rayv1.DeletionPolicy{ Policy: ptr.To(rayv1.DeleteNone), }, - }, ShutdownAfterJobFinishes: false, + }, ShutdownAfterJobFinishes: true, RayClusterSpec: createBasicRayClusterSpec(), }, expectError: true, @@ -1060,7 +1060,7 @@ func TestValidateRayJobSpecWithFeatureGate(t *testing.T) { OnFailure: &rayv1.DeletionPolicy{ Policy: ptr.To(rayv1.DeleteNone), }, - }, ShutdownAfterJobFinishes: false, + }, ShutdownAfterJobFinishes: true, RayClusterSpec: createBasicRayClusterSpec(), }, expectError: true, @@ -1072,7 +1072,7 @@ func TestValidateRayJobSpecWithFeatureGate(t *testing.T) { OnSuccess: &rayv1.DeletionPolicy{ Policy: ptr.To(rayv1.DeleteNone), }, - }, ShutdownAfterJobFinishes: false, + }, ShutdownAfterJobFinishes: true, RayClusterSpec: createBasicRayClusterSpec(), }, expectError: true, @@ -1085,7 +1085,7 @@ func TestValidateRayJobSpecWithFeatureGate(t *testing.T) { Policy: ptr.To(rayv1.DeleteNone), }, OnFailure: &rayv1.DeletionPolicy{}, - }, ShutdownAfterJobFinishes: false, + }, ShutdownAfterJobFinishes: true, RayClusterSpec: createBasicRayClusterSpec(), }, expectError: true, @@ -1174,7 +1174,7 @@ func TestValidateRayJobSpecWithFeatureGate(t *testing.T) { }, RayClusterSpec: createBasicRayClusterSpec(), }, - expectError: false, + expectError: true, }, { name: "duplicate rule in deletionRules", From 7e162086f5a9609363df6d7add582abe7f2ee510 Mon Sep 17 00:00:00 2001 From: wei-chenglai Date: Sat, 27 Sep 2025 19:49:22 +0000 Subject: [PATCH 19/21] refactor description --- docs/reference/api.md | 4 ++-- helm-chart/kuberay-operator/crds/ray.io_rayjobs.yaml | 4 ++-- ray-operator/apis/ray/v1/rayjob_types.go | 7 ++++--- ray-operator/config/crd/bases/ray.io_rayjobs.yaml | 4 ++-- 4 files changed, 10 insertions(+), 9 deletions(-) diff --git a/docs/reference/api.md b/docs/reference/api.md index b29e1f6f12a..ced95d1e45d 100644 --- a/docs/reference/api.md +++ b/docs/reference/api.md @@ -141,7 +141,7 @@ Semantics: Validation: - CRD XValidations prevent mixing legacy fields with deletionRules and enforce legacy completeness. - - Webhook/controller logic enforces rules vs shutdown exclusivity and TTL constraints. + - Controller logic enforces rules vs shutdown exclusivity and TTL constraints. - onSuccess/onFailure are deprecated; migration to deletionRules is encouraged. 
@@ -153,7 +153,7 @@ _Appears in:_ | --- | --- | --- | --- | | `onSuccess` _[DeletionPolicy](#deletionpolicy)_ | OnSuccess is the deletion policy for a successful RayJob.
Deprecated: Use `deletionRules` instead for more flexible, multi-stage deletion strategies.
This field will be removed in release 1.6.0. | | | | `onFailure` _[DeletionPolicy](#deletionpolicy)_ | OnFailure is the deletion policy for a failed RayJob.
Deprecated: Use `deletionRules` instead for more flexible, multi-stage deletion strategies.
This field will be removed in release 1.6.0. | | | -| `deletionRules` _[DeletionRule](#deletionrule) array_ | DeletionRules is a list of deletion rules, processed based on their trigger conditions.
While the rules can be used to define a sequence, if multiple rules are overdue (e.g., due to controller downtime),
the most impactful rule (e.g., DeleteSelf) will be executed first to prioritize resource cleanup and cost savings. | | | +| `deletionRules` _[DeletionRule](#deletionrule) array_ | DeletionRules is a list of deletion rules, processed based on their trigger conditions.
While the rules can be used to define a sequence, if multiple rules are overdue (e.g., due to controller downtime),
the most impactful rule (e.g., DeleteSelf) will be executed first to prioritize resource cleanup. | | MinItems: 1
| diff --git a/helm-chart/kuberay-operator/crds/ray.io_rayjobs.yaml b/helm-chart/kuberay-operator/crds/ray.io_rayjobs.yaml index e5e23e6666a..f613645cb64 100644 --- a/helm-chart/kuberay-operator/crds/ray.io_rayjobs.yaml +++ b/helm-chart/kuberay-operator/crds/ray.io_rayjobs.yaml @@ -89,6 +89,7 @@ spec: - condition - policy type: object + minItems: 1 type: array x-kubernetes-list-type: atomic onFailure: @@ -118,8 +119,7 @@ spec: rule: '!((has(self.onSuccess) || has(self.onFailure)) && has(self.deletionRules))' - message: deletionStrategy requires either BOTH onSuccess and onFailure, OR the deletionRules field (cannot be empty) - rule: ((has(self.onSuccess) && has(self.onFailure)) || (has(self.deletionRules) - && size(self.deletionRules) > 0)) + rule: ((has(self.onSuccess) && has(self.onFailure)) || has(self.deletionRules)) entrypoint: type: string entrypointNumCpus: diff --git a/ray-operator/apis/ray/v1/rayjob_types.go b/ray-operator/apis/ray/v1/rayjob_types.go index 0c87d0c08c3..705d50dfd40 100644 --- a/ray-operator/apis/ray/v1/rayjob_types.go +++ b/ray-operator/apis/ray/v1/rayjob_types.go @@ -101,11 +101,11 @@ const ( // // Validation: // - CRD XValidations prevent mixing legacy fields with deletionRules and enforce legacy completeness. -// - Webhook/controller logic enforces rules vs shutdown exclusivity and TTL constraints. +// - Controller logic enforces rules vs shutdown exclusivity and TTL constraints. // - onSuccess/onFailure are deprecated; migration to deletionRules is encouraged. // // +kubebuilder:validation:XValidation:rule="!((has(self.onSuccess) || has(self.onFailure)) && has(self.deletionRules))",message="legacy policies (onSuccess/onFailure) and deletionRules cannot be used together within the same deletionStrategy" -// +kubebuilder:validation:XValidation:rule="((has(self.onSuccess) && has(self.onFailure)) || (has(self.deletionRules) && size(self.deletionRules) > 0))",message="deletionStrategy requires either BOTH onSuccess and onFailure, OR the deletionRules field (cannot be empty)" +// +kubebuilder:validation:XValidation:rule="((has(self.onSuccess) && has(self.onFailure)) || has(self.deletionRules))",message="deletionStrategy requires either BOTH onSuccess and onFailure, OR the deletionRules field (cannot be empty)" type DeletionStrategy struct { // OnSuccess is the deletion policy for a successful RayJob. // Deprecated: Use `deletionRules` instead for more flexible, multi-stage deletion strategies. @@ -121,9 +121,10 @@ type DeletionStrategy struct { // DeletionRules is a list of deletion rules, processed based on their trigger conditions. // While the rules can be used to define a sequence, if multiple rules are overdue (e.g., due to controller downtime), - // the most impactful rule (e.g., DeleteSelf) will be executed first to prioritize resource cleanup and cost savings. + // the most impactful rule (e.g., DeleteSelf) will be executed first to prioritize resource cleanup. 
// +optional // +listType=atomic + // +kubebuilder:validation:MinItems=1 DeletionRules []DeletionRule `json:"deletionRules,omitempty"` } diff --git a/ray-operator/config/crd/bases/ray.io_rayjobs.yaml b/ray-operator/config/crd/bases/ray.io_rayjobs.yaml index e5e23e6666a..f613645cb64 100644 --- a/ray-operator/config/crd/bases/ray.io_rayjobs.yaml +++ b/ray-operator/config/crd/bases/ray.io_rayjobs.yaml @@ -89,6 +89,7 @@ spec: - condition - policy type: object + minItems: 1 type: array x-kubernetes-list-type: atomic onFailure: @@ -118,8 +119,7 @@ spec: rule: '!((has(self.onSuccess) || has(self.onFailure)) && has(self.deletionRules))' - message: deletionStrategy requires either BOTH onSuccess and onFailure, OR the deletionRules field (cannot be empty) - rule: ((has(self.onSuccess) && has(self.onFailure)) || (has(self.deletionRules) - && size(self.deletionRules) > 0)) + rule: ((has(self.onSuccess) && has(self.onFailure)) || has(self.deletionRules)) entrypoint: type: string entrypointNumCpus: From 8fd17f9bf3e3a8f69c610151762f22343d00f650 Mon Sep 17 00:00:00 2001 From: wei-chenglai Date: Tue, 7 Oct 2025 00:32:39 +0000 Subject: [PATCH 20/21] improve deletion check --- .../controllers/ray/rayjob_controller.go | 21 +++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/ray-operator/controllers/ray/rayjob_controller.go b/ray-operator/controllers/ray/rayjob_controller.go index e2f58652347..4fa4d70ac1a 100644 --- a/ray-operator/controllers/ray/rayjob_controller.go +++ b/ray-operator/controllers/ray/rayjob_controller.go @@ -1305,6 +1305,11 @@ func (r *RayJobReconciler) isDeletionActionCompleted(ctx context.Context, rayJob return false, err } + if !cluster.DeletionTimestamp.IsZero() { + // If the cluster is being deleted, we consider the action complete. + return true, nil + } + // If the cluster exists, check if all worker groups are suspended. for _, wg := range cluster.Spec.WorkerGroupSpecs { if wg.Suspend == nil || !*wg.Suspend { @@ -1316,12 +1321,20 @@ func (r *RayJobReconciler) isDeletionActionCompleted(ctx context.Context, rayJob return true, nil case rayv1.DeleteCluster: - err := r.Get(ctx, clusterIdentifier, cluster) - if errors.IsNotFound(err) { - // Cluster not found means the deletion is complete. + if err := r.Get(ctx, clusterIdentifier, cluster); err != nil { + if errors.IsNotFound(err) { + return true, nil + } + // For any other error, we can't be sure of the state, so report the error. + return false, err + } + + if !cluster.DeletionTimestamp.IsZero() { + // If the cluster is being deleted, we consider the action complete. return true, nil } - return false, err + + return false, nil case rayv1.DeleteSelf: // This action is terminal. 
If this function is running, the RayJob still exists,

From 80a6177ebb3c14edfca59c9cf2f32f2babfcf99e Mon Sep 17 00:00:00 2001
From: Wei-Cheng Lai 
Date: Wed, 8 Oct 2025 20:18:48 -0400
Subject: [PATCH 21/21] remove redundant comment

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Signed-off-by: Wei-Cheng Lai 
---
 ray-operator/controllers/ray/rayjob_controller.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ray-operator/controllers/ray/rayjob_controller.go b/ray-operator/controllers/ray/rayjob_controller.go
index 4fa4d70ac1a..91959a6197c 100644
--- a/ray-operator/controllers/ray/rayjob_controller.go
+++ b/ray-operator/controllers/ray/rayjob_controller.go
@@ -1244,7 +1244,7 @@ func (r *RayJobReconciler) handleShutdownAfterJobFinishes(ctx context.Context, r
 		}
 	} else {
 		// We only need to delete the RayCluster. We don't need to delete the submitter Kubernetes Job so that users can still access
-		// the driver logs. In addition, a completed Kubernetes Job does not actually use any compute resources.
+		// the driver logs. In addition, a completed Kubernetes Job does not actually use any compute resources.
 		_, err = r.deleteClusterResources(ctx, rayJob)
 		if err == nil {
 			logger.Info("RayCluster is deleted", "RayCluster", rayJob.Status.RayClusterName)
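
To make the completion checks introduced above concrete, a hedged sketch of a two-stage strategy that isDeletionActionCompleted would evaluate is shown below: worker pods are released first, and the RayCluster is removed later. Condition field names are assumptions, not a verified schema.

# Illustrative sketch, not part of the patch series; condition field names are assumed.
apiVersion: ray.io/v1
kind: RayJob
metadata:
  name: staged-cleanup-rayjob
spec:
  entrypoint: python /home/ray/samples/sample.py
  deletionStrategy:
    deletionRules:
      - policy: DeleteWorkers    # treated as complete once every worker group is suspended
        condition:
          jobStatus: FAILED              # assumed field name
          ttlSecondsAfterFinished: 60
      - policy: DeleteCluster    # treated as complete once the RayCluster is gone or terminating
        condition:
          jobStatus: FAILED
          ttlSecondsAfterFinished: 600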