Commit ae702b9

Fix: OLM should not report Progressing=True during pod disruption from cluster upgrades
1 parent 2dac489 · commit ae702b9

File tree

5 files changed: +668 -0 lines


pkg/controller/errors/errors.go

Lines changed: 18 additions & 0 deletions
@@ -72,3 +72,21 @@ type GroupVersionKindNotFoundError struct {
 func (g GroupVersionKindNotFoundError) Error() string {
 	return fmt.Sprintf("Unable to find GVK in discovery: %s %s %s", g.Group, g.Version, g.Kind)
 }
+
+// RetryableError indicates a temporary error that should be retried.
+// This is used for expected transient failures like pod disruptions during cluster upgrades.
+type RetryableError struct {
+	error
+}
+
+func NewRetryableError(err error) RetryableError {
+	return RetryableError{err}
+}
+
+func IsRetryable(err error) bool {
+	switch err.(type) {
+	case RetryableError:
+		return true
+	}
+	return false
+}
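
For reference, a minimal usage sketch of the new helpers (illustrative only, not part of this commit; the olmerrors import alias matches the one used in apiservices.go below):

package main

import (
	"errors"
	"fmt"

	olmerrors "github.com/operator-framework/operator-lifecycle-manager/pkg/controller/errors"
)

func main() {
	// Wrap a transient failure so callers can choose to requeue instead of treating it as fatal.
	err := olmerrors.NewRetryableError(errors.New("apiservice backend temporarily unavailable"))

	if olmerrors.IsRetryable(err) {
		fmt.Println("retryable, requeue and try again later:", err)
	}

	// A plain error is not retryable.
	fmt.Println(olmerrors.IsRetryable(errors.New("some other failure"))) // false
}

Note that IsRetryable uses a plain type switch, so a RetryableError that is wrapped again (for example with fmt.Errorf("...: %w", err)) would not be detected; callers are expected to check the error value as returned.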
New test file in pkg/controller/errors

Lines changed: 29 additions & 0 deletions
@@ -0,0 +1,29 @@
+package errors
+
+import (
+	"errors"
+	"testing"
+
+	"github.com/stretchr/testify/require"
+)
+
+func TestRetryableError(t *testing.T) {
+	baseErr := errors.New("test error")
+
+	retryErr := NewRetryableError(baseErr)
+	require.True(t, IsRetryable(retryErr), "NewRetryableError should create a retryable error")
+	require.Equal(t, baseErr.Error(), retryErr.Error(), "RetryableError should preserve the underlying error message")
+
+	normalErr := errors.New("normal error")
+	require.False(t, IsRetryable(normalErr), "Normal error should not be retryable")
+}
+
+func TestFatalError(t *testing.T) {
+	baseErr := errors.New("test error")
+
+	fatalErr := NewFatalError(baseErr)
+	require.True(t, IsFatal(fatalErr), "NewFatalError should create a fatal error")
+
+	normalErr := errors.New("normal error")
+	require.False(t, IsFatal(normalErr), "Normal error should not be fatal")
+}

pkg/controller/operators/olm/apiservices.go

Lines changed: 178 additions & 0 deletions
@@ -3,10 +3,12 @@ package olm
 import (
 	"context"
 	"fmt"
+	"time"
 
 	hashutil "github.com/operator-framework/operator-lifecycle-manager/pkg/lib/kubernetes/pkg/util/hash"
 	log "github.com/sirupsen/logrus"
 	appsv1 "k8s.io/api/apps/v1"
+	corev1 "k8s.io/api/core/v1"
 	apierrors "k8s.io/apimachinery/pkg/api/errors"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/labels"
@@ -22,6 +24,11 @@ import (
 const (
 	// Name of packageserver API service
 	PackageserverName = "v1.packages.operators.coreos.com"
+
+	// maxDisruptionDuration is the maximum duration we consider pod disruption as "expected"
+	// (e.g., during cluster upgrade). Beyond this time, we treat the unavailability as a real failure.
+	// This prevents indefinitely waiting for pods that will never recover.
+	maxDisruptionDuration = 5 * time.Minute
 )
 
 // apiServiceResourceErrorActionable returns true if OLM can do something about any one
@@ -168,6 +175,168 @@ func (a *Operator) checkAPIServiceResources(csv *v1alpha1.ClusterServiceVersion,
 	return utilerrors.NewAggregate(errs)
 }
 
+// isAPIServiceBackendDisrupted checks if the APIService is unavailable due to expected pod disruption
+// (e.g., during node reboot or cluster upgrade) rather than an actual failure.
+// According to the Progressing condition contract, operators should not report Progressing=True
+// only because pods are adjusting to new nodes or rebooting during cluster upgrade.
+func (a *Operator) isAPIServiceBackendDisrupted(csv *v1alpha1.ClusterServiceVersion, apiServiceName string) bool {
+	// Get the deployment that backs this APIService
+	// For most APIServices, the deployment name matches the CSV name or is specified in the CSV
+
+	// Try to find the deployment from the CSV's install strategy
+	strategy, err := a.resolver.UnmarshalStrategy(csv.Spec.InstallStrategy)
+	if err != nil {
+		a.logger.Debugf("Unable to unmarshal strategy for CSV %s: %v", csv.Name, err)
+		return false
+	}
+
+	strategyDetailsDeployment, ok := strategy.(*v1alpha1.StrategyDetailsDeployment)
+	if !ok {
+		a.logger.Debugf("CSV %s does not use deployment strategy", csv.Name)
+		return false
+	}
+
+	// Check each deployment's pods
+	// Note: We check all deployments in the CSV rather than trying to identify
+	// the specific deployment backing this APIService. This is because:
+	// 1. Mapping APIService -> Service -> Deployment requires complex logic
+	// 2. During cluster upgrades, all deployments in the CSV are likely affected
+	// 3. The time limit and failure detection logic prevents false positives
+	for _, deploymentSpec := range strategyDetailsDeployment.DeploymentSpecs {
+		deployment, err := a.lister.AppsV1().DeploymentLister().Deployments(csv.Namespace).Get(deploymentSpec.Name)
+		if err != nil {
+			if apierrors.IsNotFound(err) {
+				continue
+			}
+			a.logger.Debugf("Error getting deployment %s: %v", deploymentSpec.Name, err)
+			continue
+		}
+
+		// Check if deployment is being updated or rolling out
+		if deployment.Status.UnavailableReplicas > 0 ||
+			deployment.Status.UpdatedReplicas < deployment.Status.Replicas {
+			a.logger.Debugf("Deployment %s has unavailable replicas, likely due to pod disruption", deploymentSpec.Name)
+
+			// Check pod status to confirm disruption
+			selector, err := metav1.LabelSelectorAsSelector(deployment.Spec.Selector)
+			if err != nil {
+				a.logger.Debugf("Error parsing deployment selector: %v", err)
+				continue
+			}
+
+			pods, err := a.lister.CoreV1().PodLister().Pods(csv.Namespace).List(selector)
+			if err != nil {
+				a.logger.Debugf("Error listing pods: %v", err)
+				continue
+			}
+
+			// Check if any pod is in expected disruption state
+			for _, pod := range pods {
+				// Use the pod's age as a proxy for how long it has been in a disrupted state
+				// If it's been too long, it's likely a real failure, not temporary disruption
+				podAge := time.Since(pod.CreationTimestamp.Time)
+				if podAge > maxDisruptionDuration {
+					a.logger.Debugf("Pod %s has been in disrupted state for %v (exceeds %v) - treating as real failure",
+						pod.Name, podAge, maxDisruptionDuration)
+					continue
+				}
+
+				// Pod is terminating (DeletionTimestamp is set)
+				if pod.DeletionTimestamp != nil {
+					a.logger.Debugf("Pod %s is terminating - expected disruption", pod.Name)
+					return true
+				}
+
+				// For pending pods, we need to distinguish between expected disruption
+				// (being scheduled/created during node drain) and real failures (ImagePullBackOff, etc.)
+				if pod.Status.Phase == corev1.PodPending {
+					// Check if this is a real failure vs expected disruption
+					isExpectedDisruption := false
+					isRealFailure := false
+
+					// Check pod conditions for scheduling issues
+					for _, condition := range pod.Status.Conditions {
+						if condition.Type == corev1.PodScheduled && condition.Status == corev1.ConditionFalse {
+							// If the pod is reported unschedulable, it's likely a real issue,
+							// not a temporary disruption from cluster upgrade
+							if condition.Reason == "Unschedulable" {
+								isRealFailure = true
+								a.logger.Debugf("Pod %s is unschedulable - not a temporary disruption", pod.Name)
+								break
+							}
+						}
+					}
+
+					// Check container statuses for real failures
+					for _, containerStatus := range pod.Status.ContainerStatuses {
+						if containerStatus.State.Waiting != nil {
+							reason := containerStatus.State.Waiting.Reason
+							switch reason {
+							case "ContainerCreating", "PodInitializing":
+								// These are expected during normal pod startup
+								isExpectedDisruption = true
+							case "ImagePullBackOff", "ErrImagePull", "CrashLoopBackOff", "CreateContainerConfigError", "InvalidImageName":
+								// These are real failures, not temporary disruptions
+								isRealFailure = true
+								a.logger.Debugf("Pod %s has container error %s - real failure, not disruption", pod.Name, reason)
+							}
+						}
+					}
+
+					// Also check init container statuses
+					for _, containerStatus := range pod.Status.InitContainerStatuses {
+						if containerStatus.State.Waiting != nil {
+							reason := containerStatus.State.Waiting.Reason
+							switch reason {
+							case "ContainerCreating", "PodInitializing":
+								isExpectedDisruption = true
+							case "ImagePullBackOff", "ErrImagePull", "CrashLoopBackOff", "CreateContainerConfigError", "InvalidImageName":
+								isRealFailure = true
+								a.logger.Debugf("Pod %s has init container error %s - real failure, not disruption", pod.Name, reason)
+							}
+						}
+					}
+
+					// If it's a real failure, don't treat it as expected disruption
+					if isRealFailure {
+						continue
+					}
+
+					// If it's in expected disruption state, return true
+					if isExpectedDisruption {
+						a.logger.Debugf("Pod %s is in expected disruption state", pod.Name)
+						return true
+					}
+
+					// If pending without clear container status, check if it's just being scheduled
+					// This could be normal pod creation during node drain
+					if len(pod.Status.ContainerStatuses) == 0 && len(pod.Status.InitContainerStatuses) == 0 {
+						a.logger.Debugf("Pod %s is pending without container statuses - likely being scheduled", pod.Name)
+						return true
+					}
+				}
+
+				// Check container statuses for running pods that are restarting
+				for _, containerStatus := range pod.Status.ContainerStatuses {
+					if containerStatus.State.Waiting != nil {
+						reason := containerStatus.State.Waiting.Reason
+						switch reason {
+						case "ContainerCreating", "PodInitializing":
+							a.logger.Debugf("Pod %s container is starting - expected disruption", pod.Name)
+							return true
+						case "ImagePullBackOff", "ErrImagePull", "CrashLoopBackOff":
+							// Real failures - don't treat as disruption
+							a.logger.Debugf("Pod %s has container error %s - not treating as disruption", pod.Name, reason)
+						}
+					}
+				}
+			}
+		}
+	}
+
+	return false
+}
+
 func (a *Operator) areAPIServicesAvailable(csv *v1alpha1.ClusterServiceVersion) (bool, error) {
 	for _, desc := range csv.Spec.APIServiceDefinitions.Owned {
 		apiService, err := a.lister.APIRegistrationV1().APIServiceLister().Get(desc.GetName())
@@ -182,6 +351,15 @@ func (a *Operator) areAPIServicesAvailable(csv *v1alpha1.ClusterServiceVersion)
 
 		if !install.IsAPIServiceAvailable(apiService) {
 			a.logger.Debugf("APIService not available for %s", desc.GetName())
+
+			// Check if this unavailability is due to expected pod disruption
+			// If so, we should not immediately mark as failed or trigger Progressing=True
+			if a.isAPIServiceBackendDisrupted(csv, desc.GetName()) {
+				a.logger.Infof("APIService %s unavailable due to pod disruption (e.g., node reboot), will retry", desc.GetName())
+				// Return an error to trigger retry, but don't mark as definitively unavailable
+				return false, olmerrors.NewRetryableError(fmt.Errorf("APIService %s temporarily unavailable due to pod disruption", desc.GetName()))
+			}
+
 			return false, nil
 		}
 
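
The caller that consumes this RetryableError is not shown in this diff. As a rough, hypothetical sketch of the intended caller-side behavior (checkAPIAvailability is a stand-in; only NewRetryableError and IsRetryable come from this commit), a sync path can branch on IsRetryable so that transient disruption leads to a requeue rather than a failure status or a Progressing=True report:

package main

import (
	"errors"
	"fmt"

	olmerrors "github.com/operator-framework/operator-lifecycle-manager/pkg/controller/errors"
)

// checkAPIAvailability is a hypothetical stand-in for areAPIServicesAvailable above.
func checkAPIAvailability(disrupted bool) (bool, error) {
	if disrupted {
		// Mirrors the new return path: unavailable, but only transiently.
		return false, olmerrors.NewRetryableError(errors.New("APIService temporarily unavailable due to pod disruption"))
	}
	return true, nil
}

func main() {
	available, err := checkAPIAvailability(true)
	switch {
	case err != nil && olmerrors.IsRetryable(err):
		// Expected disruption (node drain, reboot): requeue and re-check later
		// instead of marking the CSV failed or reporting Progressing=True.
		fmt.Println("requeue:", err)
	case err != nil:
		// Real failure: surface it through normal failure handling.
		fmt.Println("failure:", err)
	case !available:
		fmt.Println("not yet available")
	default:
		fmt.Println("available")
	}
}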

0 commit comments