Skip to content

Commit a9b6c52

Browse files
authored
fix: Reconciliation failure after attempts count exceeded (#144)
1 parent 6730499 commit a9b6c52

2 files changed

Lines changed: 59 additions & 67 deletions

File tree

controllers/patroni_core_controller.go

Lines changed: 27 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,8 @@ func (pr *PatroniCoreReconciler) Reconcile(ctx context.Context, request ctrl.Req
156156
newResVersion := cr.ResourceVersion
157157
newCrHash := util.HashJson(cr.Spec)
158158
if (pr.resVersions[cr.Name] == newResVersion ||
159-
pr.crHash == newCrHash) && len(cr.Status.Conditions) != 0 && cr.Status.Conditions[0].Type != Failed {
159+
pr.crHash == newCrHash) && (len(cr.Status.Conditions) != 0 &&
160+
(cr.Status.Conditions[0].Type != Failed || (cr.Status.Conditions[0].Type == Failed && pr.errorCounter == 0))) {
160161
areCredsChanged, err := manager.AreCredsChanged(credentials.PostgresSecretNames)
161162
if err != nil {
162163
return reconcile.Result{}, err
@@ -203,6 +204,7 @@ func (pr *PatroniCoreReconciler) Reconcile(ctx context.Context, request ctrl.Req
203204
return pr.handleReconcileError(maxReconcileAttempts,
204205
"CanNotActualizeCredsOnCluster",
205206
newCrHash,
207+
"Error during actualization of creds on cluster",
206208
err)
207209
}
208210

@@ -212,12 +214,13 @@ func (pr *PatroniCoreReconciler) Reconcile(ctx context.Context, request ctrl.Req
212214
switch err.(type) {
213215
case *deployerrors.TestsError:
214216
{
215-
return pr.handleTestReconcileError(err, "Error during tests run", maxReconcileAttempts, newCrHash)
217+
return pr.handleReconcileError(maxReconcileAttempts, "ReconcilePostgresServiceClusterFailed", "Error during tests run", newCrHash, err)
216218
}
217219
case error:
218220
{
219221
return pr.handleReconcileError(maxReconcileAttempts,
220222
"ReconcilePostgresServiceClusterFailed",
223+
"Error during reconcile cycle",
221224
newCrHash,
222225
err)
223226
}
@@ -253,12 +256,13 @@ func (pr *PatroniCoreReconciler) Reconcile(ctx context.Context, request ctrl.Req
253256
switch err.(type) {
254257
case *deployerrors.TestsError:
255258
{
256-
return pr.handleTestReconcileError(err, "Error during tests run", maxReconcileAttempts, newCrHash)
259+
return pr.handleReconcileError(maxReconcileAttempts, "ReconcilePatroniCoreClusterFailed", "Error during tests run", newCrHash, err)
257260
}
258261
case error:
259262
{
260263
return pr.handleReconcileError(maxReconcileAttempts,
261264
"ReconcilePatroniCoreClusterFailed",
265+
"Error during reconcile cycle",
262266
newCrHash,
263267
err)
264268
}
@@ -368,23 +372,6 @@ func (pr *PatroniCoreReconciler) stanzaUpgrade(create bool) error {
368372
return nil
369373
}
370374

371-
func (pr *PatroniCoreReconciler) handleTestReconcileError(err error, errMsg string, maxReconcileAttempts int, newCrHash string) (ctrl.Result, error) {
372-
pr.errorCounter++
373-
if pr.errorCounter < maxReconcileAttempts {
374-
pr.logger.Error(errMsg, zap.Error(err))
375-
pr.logger.Error(fmt.Sprintf("Error counter for tests run: %d, let's try to run the reconcile again", pr.errorCounter))
376-
pr.reason = "PatroniCoreTestsFailed"
377-
pr.message = "PatroniCore service reconcile cycle failed"
378-
if err := pr.updateStatus(Failed, "PatroniCoreTestsFailed", err.Error()); err != nil {
379-
pr.logger.Error("Cannot update CR status", zap.Error(err))
380-
return reconcile.Result{RequeueAfter: time.Minute}, err
381-
}
382-
return reconcile.Result{}, err
383-
}
384-
pr.logger.Error("Reconciliation cycle failed due to test pod ended with error")
385-
return pr.stopReconcile(newCrHash, err)
386-
}
387-
388375
func (pr *PatroniCoreReconciler) reconcilePatroniCoreCluster(cr *qubershipv1.PatroniCore) error {
389376
consulRegistrationRequired := true
390377
// reconcile Patroni
@@ -527,25 +514,33 @@ func (pr *PatroniCoreReconciler) createTestsPods(cr *qubershipv1.PatroniCore) er
527514
return nil
528515
}
529516

530-
func (pr *PatroniCoreReconciler) stopReconcile(newCrHash string, err error) (ctrl.Result, error) {
517+
func (pr *PatroniCoreReconciler) stopReconcile(newCrHash string, reason string, err error) (ctrl.Result, error) {
531518
pr.logger.Error(fmt.Sprintf("Failed reconcile attempts: %d, updating crHash, resVersions", pr.errorCounter))
532519
pr.crHash = newCrHash
533520
pr.errorCounter = 0
534-
return reconcile.Result{RequeueAfter: time.Minute}, err
521+
return pr.failReconcile(reason, err, false)
535522
}
536523

537-
func (pr *PatroniCoreReconciler) handleReconcileError(maxAttempts int, reason, newCrHash string, err error) (ctrl.Result, error) {
524+
func (pr *PatroniCoreReconciler) handleReconcileError(maxAttempts int, reason, errMsg, newCrHash string, err error) (ctrl.Result, error) {
538525
pr.errorCounter++
539526
if pr.errorCounter < maxAttempts {
527+
pr.logger.Error(errMsg, zap.Error(err))
540528
pr.logger.Error(fmt.Sprintf("Error counter: %d, let's try to run the reconcile again", pr.errorCounter))
541-
pr.reason = reason
542-
pr.message = "PatroniCore service reconcile cycle failed"
543-
if err := pr.updateStatus(Failed, reason,
544-
fmt.Sprintf("Postgres service reconcile cycle failed. Error: %s", err.Error())); err != nil {
545-
pr.logger.Error("Cannot update CR status", zap.Error(err))
546-
return reconcile.Result{RequeueAfter: time.Minute}, err
547-
}
548-
return reconcile.Result{RequeueAfter: time.Minute}, err
529+
return pr.failReconcile(reason, err, true)
530+
}
531+
return pr.stopReconcile(newCrHash, "No reconcile attempts left", err)
532+
}
533+
534+
func (pr *PatroniCoreReconciler) failReconcile(reason string, err error, requeue bool) (ctrl.Result, error) {
535+
pr.reason = reason
536+
pr.message = "PatroniCore service reconcile cycle failed"
537+
if err := pr.updateStatus(Failed, reason,
538+
fmt.Sprintf("Postgres service reconcile cycle failed. Error: %s", err.Error())); err != nil {
539+
pr.logger.Error("Cannot update CR status", zap.Error(err))
540+
}
541+
requireAfter := time.Duration(0)
542+
if requeue {
543+
requireAfter = time.Minute
549544
}
550-
return pr.stopReconcile(newCrHash, err)
545+
return reconcile.Result{RequeueAfter: requireAfter, Requeue: requeue}, err
551546
}

controllers/postgresservice_controller.go

Lines changed: 32 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,8 @@ func (r *PostgresServiceReconciler) Reconcile(ctx context.Context, request ctrl.
152152
newResVersion := cr.ResourceVersion
153153
newCrHash := util.HashJson(cr.Spec)
154154
if (r.resVersions[cr.Name] == newResVersion ||
155-
r.crHash == newCrHash) && len(cr.Status.Conditions) != 0 && cr.Status.Conditions[0].Type != Failed {
155+
r.crHash == newCrHash) && (len(cr.Status.Conditions) != 0 &&
156+
(cr.Status.Conditions[0].Type != Failed || (cr.Status.Conditions[0].Type == Failed && r.errorCounter == 0))) {
156157
InfoMsg := "ResourceVersion didn't change, skipping reconcile loop"
157158
if cr.Spec.ExternalDataBase != nil {
158159
r.logger.Info(InfoMsg)
@@ -207,7 +208,7 @@ func (r *PostgresServiceReconciler) Reconcile(ctx context.Context, request ctrl.
207208
switch err.(type) {
208209
case *deployerrors.TestsError:
209210
{
210-
return r.handleTestReconcileError(err, "Error during tests run", maxReconcileAttempts, newCrHash)
211+
return r.handleReconcileError(err, "Error during tests run", maxReconcileAttempts, newCrHash)
211212
}
212213
default:
213214
{
@@ -249,28 +250,11 @@ func (r *PostgresServiceReconciler) Reconcile(ctx context.Context, request ctrl.
249250
switch err.(type) {
250251
case *deployerrors.TestsError:
251252
{
252-
return r.handleTestReconcileError(err, "Error during tests run", maxReconcileAttempts, newCrHash)
253+
return r.handleReconcileError(err, "Error during tests run", maxReconcileAttempts, newCrHash)
253254
}
254255
case error:
255256
{
256-
r.errorCounter++
257-
258-
if r.errorCounter < maxReconcileAttempts {
259-
r.logger.Error(fmt.Sprintf("Error counter: %d, let's try to run the reconcile again", r.errorCounter))
260-
r.reason = "ReconcilePatroniServicesClusterFailed"
261-
r.message = "Postgres-operator service reconcile cycle failed"
262-
if err := r.updateStatus(Failed, "ReconcilePatroniServicesClusterFailed",
263-
fmt.Sprintf("Postgres service reconcile cycle failed. Error: %s", err.Error())); err != nil {
264-
r.logger.Error("Cannot update CR status", zap.Error(err))
265-
return reconcile.Result{RequeueAfter: time.Minute}, err
266-
}
267-
return reconcile.Result{RequeueAfter: time.Minute}, err
268-
}
269-
270-
r.logger.Error(fmt.Sprintf("Failed reconcile attempts: %d, updating crHash, resVersions", r.errorCounter))
271-
r.crHash = newCrHash
272-
r.errorCounter = 0
273-
return reconcile.Result{RequeueAfter: time.Minute}, err
257+
return r.handleReconcileError(err, "Error during reconcile cycle", maxReconcileAttempts, newCrHash)
274258
}
275259

276260
default:
@@ -332,25 +316,15 @@ func (r *PostgresServiceReconciler) Reconcile(ctx context.Context, request ctrl.
332316
return reconcile.Result{}, nil
333317
}
334318

335-
func (r *PostgresServiceReconciler) handleTestReconcileError(err error, errMsg string, maxReconcileAttempts int, newCrHash string) (ctrl.Result, error) {
319+
func (r *PostgresServiceReconciler) handleReconcileError(err error, errMsg string, maxReconcileAttempts int, newCrHash string) (ctrl.Result, error) {
336320
r.errorCounter++
337321
if r.errorCounter < maxReconcileAttempts {
338322
r.logger.Error(errMsg, zap.Error(err))
339-
r.logger.Error(fmt.Sprintf("Error counter for tests run: %d, let's try to run the reconcile again", r.errorCounter))
340-
r.reason = "PostgresClusterTestsFailed"
341-
r.message = "Postgres-operator service reconcile cycle failed"
342-
if err := r.updateStatus(Failed, "PostgresClusterTestsFailed", err.Error()); err != nil {
343-
r.logger.Error("Cannot update CR status", zap.Error(err))
344-
return reconcile.Result{RequeueAfter: time.Minute}, err
345-
}
346-
return reconcile.Result{}, err
323+
r.logger.Error(fmt.Sprintf("Error counter for reconcile run: %d, let's try to run the reconcile again", r.errorCounter))
324+
return r.failReconcile("PostgresClusterTestsFailed", err, true)
347325
}
348326

349-
r.logger.Error(fmt.Sprintf("Failed reconcile attempts: %d, updating crHash, resVersions", r.errorCounter))
350-
r.logger.Error("Reconciliation cycle failed due to test pod ended with error")
351-
r.crHash = newCrHash
352-
r.errorCounter = 0
353-
return reconcile.Result{RequeueAfter: time.Minute}, err
327+
return r.stopReconcile(newCrHash, "No reconcile attempts left", err)
354328
}
355329

356330
func (r *PostgresServiceReconciler) reconcilePostgresServiceCluster(cr *qubershipv1.PatroniServices) error {
@@ -719,3 +693,26 @@ func (r *PostgresServiceReconciler) processExternalResources(cr *qubershipv1.Pat
719693

720694
return nil
721695
}
696+
697+
func (r *PostgresServiceReconciler) stopReconcile(newCrHash string, reason string, err error) (ctrl.Result, error) {
698+
r.logger.Error(fmt.Sprintf("Failed reconcile attempts: %d, updating crHash, resVersions", r.errorCounter))
699+
r.crHash = newCrHash
700+
r.errorCounter = 0
701+
return r.failReconcile(reason, err, false)
702+
}
703+
704+
func (r *PostgresServiceReconciler) failReconcile(reason string, err error, requeue bool) (ctrl.Result, error) {
705+
r.reason = reason
706+
r.message = "Postgres-operator service reconcile cycle failed"
707+
if err := r.updateStatus(Failed, reason,
708+
fmt.Sprintf("Postgres service reconcile cycle failed. Error: %s", err.Error())); err != nil {
709+
r.logger.Error("Cannot update CR status", zap.Error(err))
710+
return reconcile.Result{RequeueAfter: time.Minute}, err
711+
}
712+
requireAfter := time.Duration(0)
713+
if requeue {
714+
requireAfter = time.Minute
715+
}
716+
717+
return reconcile.Result{RequeueAfter: requireAfter, Requeue: requeue}, err
718+
}

0 commit comments

Comments
 (0)