Skip to content
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
cb6172c
[Bug][RayJob] Sidecar mode shouldn't restart head pod when head pod i…
Nov 27, 2025
415ee29
[fix] fix CI error
Nov 28, 2025
2bbf8cb
update
Dec 15, 2025
2bf33f3
reunite if statement
Dec 17, 2025
a97a3b5
update
Dec 17, 2025
c4bfd24
fix ci error
Dec 17, 2025
e7499ad
fix
Dec 18, 2025
714d760
put back unnecessary comment deletion
Dec 18, 2025
60aba9c
Better rayjob logic
Future-Outlier Dec 22, 2025
8a7c66f
update
Future-Outlier Dec 22, 2025
45bb98a
update
Future-Outlier Dec 22, 2025
59ef8b3
update
Future-Outlier Dec 22, 2025
2464704
update
Future-Outlier Dec 22, 2025
03ff4fe
Update ray-operator/test/e2erayjob/rayjob_test.go
Dec 31, 2025
63957d1
Update ray-operator/test/e2erayjob/rayjob_test.go
Dec 31, 2025
3115ae4
update rayjob test
Jan 1, 2026
6f2dfa3
Merge branch 'master' into bug/sidecar-mode-fix
Jan 1, 2026
76828e7
Merge branch 'master' into bug/sidecar-mode-fix
Jan 1, 2026
a5b30a4
fix merge conflict error
Jan 1, 2026
e77db80
Update ray-operator/test/e2erayjob/rayjob_sidecar_mode_test.go
Jan 1, 2026
33afa20
update
Jan 2, 2026
ec01312
revert reason assertion
Jan 2, 2026
6e7c738
[chore] retrigger ci
Jan 3, 2026
e155b5e
update
Jan 3, 2026
1f9dbe8
[chore] change from HeadPod to GetHeadPod
Jan 3, 2026
883eb7c
add submission mode label key label
Future-Outlier Jan 4, 2026
d00b7c6
Merge remote-tracking branch 'upstream/master' into bug/sidecar-mode-fix
Future-Outlier Jan 4, 2026
7246d33
Update ray-operator/controllers/ray/utils/constant.go
Jan 6, 2026
9a7eaa4
Update ray-operator/controllers/ray/raycluster_controller.go
Jan 6, 2026
d02c6a7
Update ray-operator/controllers/ray/raycluster_controller.go
Jan 6, 2026
f3d9431
Update ray-operator/controllers/ray/rayjob_controller.go
Jan 6, 2026
a59e486
Update ray-operator/controllers/ray/rayjob_controller.go
Jan 7, 2026
729cf0c
Update ray-operator/controllers/ray/utils/constant.go
Jan 7, 2026
8d07ece
Update ray-operator/controllers/ray/rayjob_controller.go
Jan 7, 2026
04caf77
update
Jan 7, 2026
c0d916d
Add missing label
Jan 7, 2026
0d43eee
update
Jan 12, 2026
03d10a4
update
Jan 12, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 0 additions & 7 deletions go.work

This file was deleted.

168 changes: 0 additions & 168 deletions go.work.sum

This file was deleted.

22 changes: 12 additions & 10 deletions ray-operator/controllers/ray/raycluster_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -679,8 +679,8 @@ func (r *RayClusterReconciler) reconcilePods(ctx context.Context, instance *rayv
return errstd.New(reason)
}
} else if len(headPods.Items) == 0 {
originatedFrom := utils.GetCRDType(instance.Labels[utils.RayOriginatedFromCRDLabelKey])
if originatedFrom == utils.RayJobCRD {
if meta.IsStatusConditionTrue(instance.Status.Conditions, string(rayv1.RayClusterProvisioned)) &&
shouldSkipHeadPodRestart(instance) {
// Recreating the head Pod if the RayCluster created by RayJob is provisioned doesn't help RayJob.
//
// Case 1: GCS fault tolerance is disabled
Expand All @@ -692,13 +692,11 @@ func (r *RayClusterReconciler) reconcilePods(ctx context.Context, instance *rayv
//
// In this case, the worker Pods will not be killed by the new head Pod when it is created, but the submission ID has already been
// used by the old Ray job, so the new Ray job will fail.
if meta.IsStatusConditionTrue(instance.Status.Conditions, string(rayv1.RayClusterProvisioned)) {
logger.Info(
"reconcilePods: Found 0 head Pods for a RayJob-managed RayCluster; skipping head creation to let RayJob controller handle the failure",
"rayCluster", instance.Name,
)
return nil
}
logger.Info(
"reconcilePods: Found 0 head Pods for a sidecar-mode RayJob-managed RayCluster; skipping head creation to let RayJob controller handle the failure",
"rayCluster", instance.Name,
)
return nil
}
// Create head Pod if it does not exist.
logger.Info("reconcilePods: Found 0 head Pods; creating a head Pod for the RayCluster.")
Expand Down Expand Up @@ -830,7 +828,7 @@ func (r *RayClusterReconciler) reconcilePods(ctx context.Context, instance *rayv
}
}
}
logger.Info("reconcilePods", "found existing replica indices", "group", worker.GroupName, "indices", validReplicaIndices)
logger.Info("reconcilePods: found existing replica indices", "group", worker.GroupName, "indices", validReplicaIndices)
}
if diff > 0 {
// pods need to be added
Expand Down Expand Up @@ -1094,6 +1092,10 @@ func (r *RayClusterReconciler) reconcileMultiHostWorkerGroup(ctx context.Context
return nil
}

// shouldSkipHeadPodRestart reports whether the given RayCluster opts out of
// having its head Pod recreated.
//
// It returns true only when both of the following hold:
//   - getCreatorCRDType identifies the cluster's creator as utils.RayJobCRD, and
//   - the cluster carries the utils.RayJobDisableProvisionedHeadNodeRestartLabelKey
//     label with the exact value "true".
//
// instance is dereferenced unconditionally, so it must be non-nil. A cluster
// without labels simply yields false.
func shouldSkipHeadPodRestart(instance *rayv1.RayCluster) bool {
	// Clusters not created by a RayJob never skip the restart.
	if getCreatorCRDType(*instance) != utils.RayJobCRD {
		return false
	}
	restartDisabled := instance.Labels[utils.RayJobDisableProvisionedHeadNodeRestartLabelKey]
	return restartDisabled == "true"
}

// shouldDeletePod returns whether the Pod should be deleted and the reason
//
// @param pod: The Pod to be checked.
Expand Down
11 changes: 10 additions & 1 deletion ray-operator/controllers/ray/rayjob_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -904,6 +904,7 @@ func (r *RayJobReconciler) getOrCreateRayClusterInstance(ctx context.Context, ra
if err != nil {
return nil, err
}

if r.options.BatchSchedulerManager != nil && rayJobInstance.Spec.SubmissionMode == rayv1.K8sJobMode {
if scheduler, err := r.options.BatchSchedulerManager.GetScheduler(); err == nil {
// Group name is only used for individual pods to specify their task group ("headgroup", "worker-group-1", etc.).
Expand Down Expand Up @@ -934,12 +935,20 @@ func (r *RayJobReconciler) getOrCreateRayClusterInstance(ctx context.Context, ra
}

func (r *RayJobReconciler) constructRayClusterForRayJob(rayJobInstance *rayv1.RayJob, rayClusterName string) (*rayv1.RayCluster, error) {
labels := make(map[string]string, len(rayJobInstance.Labels))
labels := map[string]string{}
for key, value := range rayJobInstance.Labels {
labels[key] = value
}
labels[utils.RayOriginatedFromCRNameLabelKey] = rayJobInstance.Name
labels[utils.RayOriginatedFromCRDLabelKey] = utils.RayOriginatedFromCRDLabelValue(utils.RayJobCRD)
labels[utils.RayJobSubmissionModeLabelKey] = string(rayJobInstance.Spec.SubmissionMode)

if rayJobInstance.Spec.SubmissionMode == rayv1.SidecarMode {
labels[utils.RayJobDisableProvisionedHeadNodeRestartLabelKey] = "true"
} else {
labels[utils.RayJobDisableProvisionedHeadNodeRestartLabelKey] = "false"
}

rayCluster := &rayv1.RayCluster{
ObjectMeta: metav1.ObjectMeta{
Labels: labels,
Expand Down
Loading
Loading