@@ -261,7 +261,7 @@ func (r *SingleClusterReconciler) rollingRestartPods(
261261 rackState * RackState , podsToRestart []* corev1.Pod , ignorablePodNames sets.Set [string ],
262262 restartTypeMap map [string ]RestartType ,
263263) common.ReconcileResult {
264- failedPods , activePods := getFailedAndActivePods (podsToRestart )
264+ failedPods , failedWithinGracePeriodPods , activePods := getFailedAndActivePods (podsToRestart , true )
265265
266266 // If already dead node (failed pod) then no need to check node safety, migration
267267 if len (failedPods ) != 0 {
@@ -317,6 +317,15 @@ func (r *SingleClusterReconciler) rollingRestartPods(
317317 }
318318 }
319319
320+ if len (failedWithinGracePeriodPods ) != 0 {
321+ r .Log .Info (
322+ "Pods are in failed state but within grace period, will not delete" ,
323+ "pods" , getPodNames (failedWithinGracePeriodPods ),
324+ )
325+
326+ return common .ReconcileRequeueAfter (asdbv1 .RequeueIntervalSeconds10 )
327+ }
328+
320329 return common .ReconcileSuccess ()
321330}
322331
@@ -548,22 +557,28 @@ func (r *SingleClusterReconciler) ensurePodsRunningAndReady(podsToCheck []*corev
548557 podNames ,
549558 )
550559
551- return common .ReconcileRequeueAfter (10 )
560+ return common .ReconcileRequeueAfter (asdbv1 . RequeueIntervalSeconds10 )
552561}
553562
// getFailedAndActivePods partitions the given pods into result slices.
//
// This hunk changes the function from a two-way split (failedPods / activePods,
// decided by utils.CheckPodFailed returning a non-nil error) to a three-way
// split decided by the State field of the value returned from
// utils.CheckPodFailedWithGrace(pod, withGracePeriod):
//   - utils.PodHealthy       -> activePods
//   - utils.PodFailedInGrace -> failedWithinGracePeriodPods
//   - utils.PodFailed        -> failedPods
//
// withGracePeriod is forwarded unchanged to utils.CheckPodFailedWithGrace;
// presumably it toggles whether a grace window is applied before a pod is
// reported as PodFailed -- TODO confirm against the helper's implementation.
//
// All returned slices are nil when no pod falls into the matching bucket,
// because they are only ever populated via append on the named results.
554- func getFailedAndActivePods (pods []* corev1.Pod ) (failedPods , activePods []* corev1.Pod ) {
563+ func getFailedAndActivePods (
564+ pods []* corev1.Pod , withGracePeriod bool ) (failedPods , failedWithinGracePeriodPods , activePods []* corev1.Pod ,
565+ ) {
// Iterate by index and read the element from the slice on each pass.
555566 for idx := range pods {
556567 pod := pods [idx ]
557568
// Removed: the old check treated any non-nil error from CheckPodFailed as "failed".
558- if err := utils .CheckPodFailed (pod ); err != nil {
// Added: classify the pod once, then dispatch on the reported state.
569+ podState := utils .CheckPodFailedWithGrace (pod , withGracePeriod )
570+
// NOTE(review): this switch has no default case, so a pod whose State matches
// none of the three cases below is appended to no slice at all. The removed
// code placed every non-failed pod in activePods. Confirm that the three
// states are exhaustive, or add an explicit default.
571+ switch podState .State {
572+ case utils .PodHealthy :
573+ activePods = append (activePods , pod )
574+ case utils .PodFailedInGrace :
575+ failedWithinGracePeriodPods = append (failedWithinGracePeriodPods , pod )
576+ case utils .PodFailed :
559577 failedPods = append (failedPods , pod )
// Removed: the continue is no longer needed because each case ends the switch on its own.
560- continue
561578 }
562-
// Removed: the unconditional fall-through to activePods is replaced by the PodHealthy case.
563- activePods = append (activePods , pod )
564579 }
565580
// The result order is failed, failed-within-grace, active; callers destructure positionally.
566- return failedPods , activePods
581+ return failedPods , failedWithinGracePeriodPods , activePods
567582}
568583
569584func getNonIgnorablePods (pods []* corev1.Pod , ignorablePodNames sets.Set [string ],
@@ -585,7 +600,7 @@ func getNonIgnorablePods(pods []*corev1.Pod, ignorablePodNames sets.Set[string],
585600func (r * SingleClusterReconciler ) safelyDeletePodsAndEnsureImageUpdated (
586601 rackState * RackState , podsToUpdate []* corev1.Pod , ignorablePodNames sets.Set [string ],
587602) common.ReconcileResult {
588- failedPods , activePods := getFailedAndActivePods (podsToUpdate )
603+ failedPods , failedWithinGracePeriodPods , activePods := getFailedAndActivePods (podsToUpdate , true )
589604
590605 // If already dead node (failed pod) then no need to check node safety, migration
591606 if len (failedPods ) != 0 {
@@ -640,6 +655,15 @@ func (r *SingleClusterReconciler) safelyDeletePodsAndEnsureImageUpdated(
640655 }
641656 }
642657
658+ if len (failedWithinGracePeriodPods ) != 0 {
659+ r .Log .Info (
660+ "Pods are in failed state but within grace period, will not delete" ,
661+ "pods" , getPodNames (failedWithinGracePeriodPods ),
662+ )
663+
664+ return common .ReconcileRequeueAfter (asdbv1 .RequeueIntervalSeconds10 )
665+ }
666+
643667 return common .ReconcileSuccess ()
644668}
645669
@@ -720,6 +744,7 @@ func (r *SingleClusterReconciler) ensurePodsImageUpdated(podsToCheck []*corev1.P
720744 return common .ReconcileError (err )
721745 }
722746
747+ // For existing cluster operations, no grace period for immediate responsiveness
723748 if err := utils .CheckPodFailed (updatedPod ); err != nil {
724749 return common .ReconcileError (err )
725750 }
@@ -746,7 +771,7 @@ func (r *SingleClusterReconciler) ensurePodsImageUpdated(podsToCheck []*corev1.P
746771 podNames ,
747772 )
748773
749- return common .ReconcileRequeueAfter (10 )
774+ return common .ReconcileRequeueAfter (asdbv1 . RequeueIntervalSeconds10 )
750775}
751776
752777// cleanupPods checks pods and status before scale-up to detect and fix any
0 commit comments