Skip to content

Commit 8a74725

Browse files
keep track of number of pods already unhealthy before rollout (#166)
Co-authored-by: Cedric Lamoriniere <[email protected]>
1 parent 1724bb2 commit 8a74725

18 files changed

+256
-65
lines changed

Diff for: README.md

+3-3
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@ extendeddaemonset-855cd7c679-gpmql 1/1 Running 0 2m11s
8383

8484
#### `foo` ExtendedDaemonSet deployment
8585

86-
Create the `foo` app with the ExtendedDaemonSet. For demo purposes, we'll use the `k8s.gcr.io/pause` Docker image, which is only awaiting a terminating signal. You can look at the `foo` application definition in the file `examples/foo-eds_v1.yaml`.
86+
Create the `foo` app with the ExtendedDaemonSet. For demo purposes, we'll use the `registry.k8s.io/pause` Docker image, which is only awaiting a terminating signal. You can look at the `foo` application definition in the file `examples/foo-eds_v1.yaml`.
8787

8888
```console
8989
$ kubectl apply -f examples/foo-eds_v1.yaml
@@ -111,9 +111,9 @@ Now we can try to update the ExtendedDaemonSet `foo`. The only difference betwee
111111
```console
112112
$ diff examples/foo-eds_v1.yaml examples/foo-eds_v2.yaml
113113
17c17
114-
< image: k8s.gcr.io/pause:3.0
114+
< image: registry.k8s.io/pause:3.0
115115
---
116-
> image: k8s.gcr.io/pause:3.1
116+
> image: registry.k8s.io/pause:3.1
117117

118118
$ kubectl apply -f examples/foo-eds_v2.yaml
119119
extendeddaemonset.datadoghq.com/foo configured

Diff for: api/v1alpha1/const.go

+2
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@ const (
1414
ExtendedDaemonSetReplicaSetCanaryLabelKey = "extendeddaemonsetreplicaset.datadoghq.com/canary"
1515
// ExtendedDaemonSetReplicaSetCanaryLabelValue label value used to identify canary Pods.
1616
ExtendedDaemonSetReplicaSetCanaryLabelValue = "true"
17+
// ExtendedDaemonSetReplicaSetUnreadyPodsAnnotationKey annotation key used on ExtendedDaemonSetReplicaSet to detect the number of unready pods at the moment of creation of the ExtendedDaemonSetReplicaSet
18+
ExtendedDaemonSetReplicaSetUnreadyPodsAnnotationKey = "extendeddaemonsetreplicaset.datadoghq.com/unready-pods"
1719
// MD5ExtendedDaemonSetAnnotationKey annotation key use on Pods in order to identify which PodTemplateSpec have been used to generate it.
1820
MD5ExtendedDaemonSetAnnotationKey = "extendeddaemonset.datadoghq.com/templatehash"
1921
// ExtendedDaemonSetCanaryValidAnnotationKey annotation key used on Pods in order to detect if a canary deployment is considered valid.

Diff for: config/samples/eds_v1alpha1_basic.yaml

+3-3
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ spec:
1111
maxParallelPodCreation: 10
1212
slowStartIntervalDuration: 2m
1313
template:
14-
spec:
14+
spec:
1515
containers:
16-
- name: daemon
17-
image: k8s.gcr.io/pause:3.0
16+
- name: daemon
17+
image: registry.k8s.io/pause:3.0

Diff for: config/samples/eds_v1alpha1_nodeexclusion.yaml

+9-9
Original file line numberDiff line numberDiff line change
@@ -11,18 +11,18 @@ spec:
1111
maxParallelPodCreation: 1
1212
slowStartIntervalDuration: 1m
1313
template:
14-
spec:
14+
spec:
1515
containers:
16-
- name: daemon
17-
image: k8s.gcr.io/pause:3.0
16+
- name: daemon
17+
image: registry.k8s.io/pause:3.0
1818
tolerations:
19-
- operator: Exists
19+
- operator: Exists
2020
affinity:
2121
nodeAffinity:
2222
requiredDuringSchedulingIgnoredDuringExecution:
2323
nodeSelectorTerms:
24-
- matchExpressions:
25-
- key: extendeddaemonset.datadoghq.com/exclude
26-
operator: NotIn
27-
values:
28-
- foo
24+
- matchExpressions:
25+
- key: extendeddaemonset.datadoghq.com/exclude
26+
operator: NotIn
27+
values:
28+
- foo

Diff for: controllers/extendeddaemonset/controller.go

+8-2
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import (
99
"context"
1010
"fmt"
1111
"math/rand"
12+
"strconv"
1213
"strings"
1314
"time"
1415

@@ -136,7 +137,7 @@ func (r *Reconciler) Reconcile(ctx context.Context, request reconcile.Request) (
136137

137138
if upToDateRS == nil {
138139
// If there is no ReplicaSet that matches the EDS Spec, create a new one and return to apply the reconcile loop again
139-
return r.createNewReplicaSet(reqLogger, instance)
140+
return r.createNewReplicaSet(reqLogger, instance, podsCounter)
140141
}
141142

142143
// Select the ReplicaSet that should be current
@@ -153,7 +154,7 @@ func (r *Reconciler) Reconcile(ctx context.Context, request reconcile.Request) (
153154
return result, err
154155
}
155156

156-
func (r *Reconciler) createNewReplicaSet(logger logr.Logger, daemonset *datadoghqv1alpha1.ExtendedDaemonSet) (reconcile.Result, error) {
157+
func (r *Reconciler) createNewReplicaSet(logger logr.Logger, daemonset *datadoghqv1alpha1.ExtendedDaemonSet, podsCounter podsCounterType) (reconcile.Result, error) {
157158
var err error
158159
// replicaSet up to date didn't exist yet, new to create one
159160
var newRS *datadoghqv1alpha1.ExtendedDaemonSetReplicaSet
@@ -164,6 +165,11 @@ func (r *Reconciler) createNewReplicaSet(logger logr.Logger, daemonset *datadogh
164165
if err = controllerutil.SetControllerReference(daemonset, newRS, r.scheme); err != nil {
165166
return reconcile.Result{}, err
166167
}
168+
if newRS.Annotations == nil {
169+
newRS.Annotations = make(map[string]string)
170+
}
171+
newRS.Annotations[datadoghqv1alpha1.ExtendedDaemonSetReplicaSetUnreadyPodsAnnotationKey] = strconv.Itoa(int(podsCounter.Current - podsCounter.Ready))
172+
167173
logger.Info("Creating a new ReplicaSet", "replicaSet.Namespace", newRS.Namespace, "replicaSet.Name", newRS.Name)
168174

169175
err = r.client.Create(context.TODO(), newRS)

Diff for: controllers/extendeddaemonset/controller_test.go

+5-1
Original file line numberDiff line numberDiff line change
@@ -554,7 +554,11 @@ func TestReconciler_createNewReplicaSet(t *testing.T) {
554554
log: testLogger,
555555
recorder: eventBroadcaster.NewRecorder(scheme.Scheme, corev1.EventSource{Component: "TestReconciler_cleanupReplicaSet"}),
556556
}
557-
got, err := r.createNewReplicaSet(tt.args.logger, tt.args.daemonset)
557+
got, err := r.createNewReplicaSet(tt.args.logger, tt.args.daemonset, podsCounterType{
558+
Current: 3,
559+
Ready: 2,
560+
Available: 1,
561+
})
558562
if (err != nil) != tt.wantErr {
559563
t.Errorf("Reconciler.createNewReplicaSet() error = %v, wantErr %v", err, tt.wantErr)
560564

Diff for: controllers/extendeddaemonset_e2e_test.go

+123-8
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ var _ = Describe("ExtendedDaemonSet e2e updates and recovery", func() {
9595
},
9696
}
9797

98-
eds := testutils.NewExtendedDaemonset(namespace, name, "k8s.gcr.io/pause:latest", edsOptions)
98+
eds := testutils.NewExtendedDaemonset(namespace, name, "registry.k8s.io/pause:3.0", edsOptions)
9999
Expect(k8sClient.Create(ctx, eds)).Should(Succeed())
100100

101101
eds = &datadoghqv1alpha1.ExtendedDaemonSet{}
@@ -238,6 +238,7 @@ var _ = Describe("ExtendedDaemonSet e2e updates and recovery", func() {
238238
return len(pods.Items) == 0
239239
}), longTimeout, interval).Should(BeTrue(), "All EDS pods should be destroyed")
240240
})
241+
241242
})
242243
})
243244

@@ -271,7 +272,7 @@ var _ = Describe("ExtendedDaemonSet e2e PodCannotStart condition", func() {
271272
},
272273
}
273274

274-
eds := testutils.NewExtendedDaemonset(namespace, name, "k8s.gcr.io/pause:latest", edsOptions)
275+
eds := testutils.NewExtendedDaemonset(namespace, name, "registry.k8s.io/pause:3.0", edsOptions)
275276
Expect(k8sClient.Create(ctx, eds)).Should(Succeed())
276277

277278
eds = &datadoghqv1alpha1.ExtendedDaemonSet{}
@@ -465,7 +466,7 @@ var _ = Describe("ExtendedDaemonSet e2e successful canary deployment update", fu
465466
},
466467
}
467468

468-
eds := testutils.NewExtendedDaemonset(namespace, name, "k8s.gcr.io/pause:latest", edsOptions)
469+
eds := testutils.NewExtendedDaemonset(namespace, name, "registry.k8s.io/pause:3.0", edsOptions)
469470
Expect(k8sClient.Create(ctx, eds)).Should(Succeed())
470471

471472
eds = &datadoghqv1alpha1.ExtendedDaemonSet{}
@@ -486,7 +487,7 @@ var _ = Describe("ExtendedDaemonSet e2e successful canary deployment update", fu
486487

487488
It("Should do canary deployment", func() {
488489
updateFunc := func(eds *datadoghqv1alpha1.ExtendedDaemonSet) {
489-
eds.Spec.Template.Spec.Containers[0].Image = fmt.Sprintf("k8s.gcr.io/pause:3.1")
490+
eds.Spec.Template.Spec.Containers[0].Image = fmt.Sprintf("registry.k8s.io/pause:3.1")
490491
}
491492

492493
Eventually(updateEDS(k8sClient, key, updateFunc), timeout, interval).Should(
@@ -527,7 +528,7 @@ var _ = Describe("ExtendedDaemonSet e2e successful canary deployment update", fu
527528
eds.Annotations = make(map[string]string)
528529
}
529530
eds.Annotations[datadoghqv1alpha1.ExtendedDaemonSetCanaryValidAnnotationKey] = canaryReplicaSet
530-
eds.Spec.Template.Spec.Containers[0].Image = fmt.Sprintf("k8s.gcr.io/pause:3.1")
531+
eds.Spec.Template.Spec.Containers[0].Image = fmt.Sprintf("registry.k8s.io/pause:3.1")
531532
}
532533

533534
Eventually(updateEDS(k8sClient, key, updateFunc), timeout, interval).Should(
@@ -628,7 +629,7 @@ var _ = Describe("ExtendedDaemonSet Controller", func() {
628629
},
629630
ReconcileFrequency: reconcileFrequency,
630631
}
631-
eds := testutils.NewExtendedDaemonset(namespace, name, "k8s.gcr.io/pause:latest", edsOptions)
632+
eds := testutils.NewExtendedDaemonset(namespace, name, "registry.k8s.io/pause:3.0", edsOptions)
632633
Expect(k8sClient.Create(ctx, eds)).Should(Succeed())
633634

634635
eds = &datadoghqv1alpha1.ExtendedDaemonSet{}
@@ -712,7 +713,7 @@ var _ = Describe("ExtendedDaemonSet e2e validationMode setting", func() {
712713
},
713714
}
714715

715-
eds := testutils.NewExtendedDaemonset(namespace, name, "k8s.gcr.io/pause:latest", edsOptions)
716+
eds := testutils.NewExtendedDaemonset(namespace, name, "registry.k8s.io/pause:3.0", edsOptions)
716717
Expect(k8sClient.Create(ctx, eds)).Should(Succeed())
717718

718719
eds = &datadoghqv1alpha1.ExtendedDaemonSet{}
@@ -733,7 +734,7 @@ var _ = Describe("ExtendedDaemonSet e2e validationMode setting", func() {
733734

734735
It("Should do canary deployment", func() {
735736
updateFunc := func(eds *datadoghqv1alpha1.ExtendedDaemonSet) {
736-
eds.Spec.Template.Spec.Containers[0].Image = fmt.Sprintf("k8s.gcr.io/pause:3.1")
737+
eds.Spec.Template.Spec.Containers[0].Image = fmt.Sprintf("registry.k8s.io/pause:3.1")
737738
}
738739

739740
Eventually(updateEDS(k8sClient, key, updateFunc), timeout, interval).Should(
@@ -778,6 +779,120 @@ var _ = Describe("ExtendedDaemonSet e2e validationMode setting", func() {
778779
})
779780
})
780781

782+
// These tests may take several minutes to run, check your go test timeout
783+
var _ = Describe("ExtendedDaemonSet e2e rollout not blocked due to already failing pods", func() {
784+
Context("Deployment with failure pods then rollout to healthy pods", func() {
785+
name := "eds-fail"
786+
787+
key := types.NamespacedName{
788+
Namespace: namespace,
789+
Name: name,
790+
}
791+
792+
nodeList := &corev1.NodeList{}
793+
794+
It("Should deploy EDS", func() {
795+
Expect(k8sClient.List(ctx, nodeList)).Should(Succeed())
796+
797+
edsOptions := &testutils.NewExtendedDaemonsetOptions{
798+
CanaryStrategy: &datadoghqv1alpha1.ExtendedDaemonSetSpecStrategyCanary{
799+
Duration: &metav1.Duration{Duration: 1 * time.Minute},
800+
Replicas: &intString1,
801+
},
802+
RollingUpdate: &datadoghqv1alpha1.ExtendedDaemonSetSpecStrategyRollingUpdate{
803+
MaxUnavailable: &intString1,
804+
MaxParallelPodCreation: datadoghqv1alpha1.NewInt32(20),
805+
},
806+
}
807+
808+
corrupted_image := "corrupted_image:image_corrupted"
809+
eds := testutils.NewExtendedDaemonset(namespace, name, corrupted_image, edsOptions)
810+
Expect(k8sClient.Create(ctx, eds)).Should(Succeed())
811+
812+
eds = &datadoghqv1alpha1.ExtendedDaemonSet{}
813+
Eventually(withEDS(key, eds, func() bool {
814+
return eds.Status.ActiveReplicaSet != ""
815+
}), timeout, interval).Should(BeTrue())
816+
817+
ers := &datadoghqv1alpha1.ExtendedDaemonSetReplicaSet{}
818+
ersKey := types.NamespacedName{
819+
Namespace: namespace,
820+
Name: eds.Status.ActiveReplicaSet,
821+
}
822+
Eventually(withERS(ersKey, ers, func() bool {
823+
return ers.Status.Status == "active" && int(ers.Status.Current) == len(nodeList.Items) && int(ers.Status.Ready) == 0
824+
}), timeout, interval).Should(BeTrue())
825+
826+
})
827+
828+
It("Should deploy canary ERS", func() {
829+
updateFunc := func(eds *datadoghqv1alpha1.ExtendedDaemonSet) {
830+
eds.Spec.Template.Spec.Containers[0].Image = fmt.Sprintf("registry.k8s.io/pause:3.0")
831+
}
832+
833+
Eventually(updateEDS(k8sClient, key, updateFunc), timeout, interval).Should(
834+
BeTrue(),
835+
func() string { return "Unable to update the EDS" },
836+
)
837+
838+
eds := &datadoghqv1alpha1.ExtendedDaemonSet{}
839+
Expect(k8sClient.Get(ctx, key, eds)).Should(Succeed())
840+
841+
Eventually(withEDS(key, eds, func() bool {
842+
return eds.Status.State == datadoghqv1alpha1.ExtendedDaemonSetStatusStateCanary
843+
}), timeout, interval).Should(BeTrue())
844+
})
845+
846+
It("Should rollout successfully after success of canary - canary ERS should become active ERS", func() {
847+
eds := &datadoghqv1alpha1.ExtendedDaemonSet{}
848+
Expect(k8sClient.Get(ctx, key, eds)).Should(Succeed())
849+
850+
Eventually(withEDS(key, eds, func() bool {
851+
return eds.Status.State == datadoghqv1alpha1.ExtendedDaemonSetStatusStateCanary
852+
}), timeout, interval).Should(BeTrue())
853+
854+
ersCanary := &datadoghqv1alpha1.ExtendedDaemonSetReplicaSet{}
855+
ersCanaryKey := types.NamespacedName{
856+
Namespace: namespace,
857+
Name: eds.Status.Canary.ReplicaSet,
858+
}
859+
860+
Eventually(withERS(ersCanaryKey, ersCanary, func() bool {
861+
return ersCanary.Status.Status == "canary"
862+
}), timeout, interval).Should(BeTrue())
863+
864+
Eventually(withEDS(key, eds, func() bool {
865+
return eds.Status.State == datadoghqv1alpha1.ExtendedDaemonSetStatusStateRunning
866+
}), longTimeout, interval).Should(BeTrue())
867+
868+
ers := &datadoghqv1alpha1.ExtendedDaemonSetReplicaSet{}
869+
ersKey := types.NamespacedName{
870+
Namespace: namespace,
871+
Name: eds.Status.ActiveReplicaSet,
872+
}
873+
874+
Eventually(withERS(ersKey, ers, func() bool {
875+
return ers.Status.Status == "active" && int(ers.Status.Available) == len(nodeList.Items)
876+
}), timeout, interval).Should(BeTrue())
877+
})
878+
879+
It("Should delete EDS", func() {
880+
Eventually(deleteEDS(k8sClient, key), timeout, interval).Should(BeTrue(), "EDS should be deleted")
881+
882+
pods := &corev1.PodList{}
883+
listOptions := []client.ListOption{
884+
client.InNamespace(namespace),
885+
client.MatchingLabels{
886+
datadoghqv1alpha1.ExtendedDaemonSetNameLabelKey: name,
887+
},
888+
}
889+
Eventually(withList(listOptions, pods, "EDS pods", func() bool {
890+
return len(pods.Items) == 0
891+
}), longTimeout, interval).Should(BeTrue(), "All EDS pods should be destroyed")
892+
})
893+
})
894+
})
895+
781896
func withUpdate(obj client.Object, desc string) condFn {
782897
return func() bool {
783898
err := k8sClient.Update(context.Background(), obj)

Diff for: controllers/extendeddaemonset_test.go

+4-4
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ var _ = Describe("ExtendedDaemonSet Controller", func() {
5959
},
6060
ReconcileFrequency: reconcileFrequency,
6161
}
62-
eds := testutils.NewExtendedDaemonset(namespace, name, "k8s.gcr.io/pause:latest", edsOptions)
62+
eds := testutils.NewExtendedDaemonset(namespace, name, "registry.k8s.io/pause:latest", edsOptions)
6363
Expect(k8sClient.Create(ctx, eds)).Should(Succeed())
6464

6565
eds = &datadoghqv1alpha1.ExtendedDaemonSet{}
@@ -122,7 +122,7 @@ var _ = Describe("ExtendedDaemonSet Rolling Update Pause", func() {
122122
},
123123
ReconcileFrequency: reconcileFrequency,
124124
}
125-
eds := testutils.NewExtendedDaemonset(namespace, name, "k8s.gcr.io/pause:latest", edsOptions)
125+
eds := testutils.NewExtendedDaemonset(namespace, name, "registry.k8s.io/pause:latest", edsOptions)
126126
Expect(k8sClient.Create(ctx, eds)).Should(Succeed())
127127

128128
eds = &datadoghqv1alpha1.ExtendedDaemonSet{}
@@ -157,7 +157,7 @@ var _ = Describe("ExtendedDaemonSet Rolling Update Pause", func() {
157157
It("Should not deploy with paused EDS", func() {
158158
eds := &datadoghqv1alpha1.ExtendedDaemonSet{}
159159
Expect(k8sClient.Get(ctx, key, eds)).Should(Succeed())
160-
eds.Spec.Template.Spec.Containers[0].Image = "k8s.gcr.io/pause:3.0" // Trigger a new ERS
160+
eds.Spec.Template.Spec.Containers[0].Image = "registry.k8s.io/pause:3.0" // Trigger a new ERS
161161
if eds.Annotations == nil {
162162
eds.Annotations = make(map[string]string)
163163
}
@@ -321,7 +321,7 @@ var _ = Describe("ExtendedDaemonSet Rollout Freeze", func() {
321321
ReconcileFrequency: reconcileFrequency,
322322
ExtraAnnotations: map[string]string{"extendeddaemonset.datadoghq.com/rollout-frozen": "true"},
323323
}
324-
eds := testutils.NewExtendedDaemonset(namespace, name, "k8s.gcr.io/pause:latest", edsOptions)
324+
eds := testutils.NewExtendedDaemonset(namespace, name, "registry.k8s.io/pause:latest", edsOptions)
325325
Expect(k8sClient.Create(ctx, eds)).Should(Succeed())
326326

327327
eds = &datadoghqv1alpha1.ExtendedDaemonSet{}

Diff for: controllers/extendeddaemonsetreplicaset/strategy/limits/limits.go

+7-1
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ type Parameters struct {
1414
NbOldAvailablesPod int
1515
NbCreatedPod int
1616
NbUnresponsiveNodes int
17+
NbUnreadyPods int
1718

1819
MaxPodCreation int
1920
MaxUnavailablePod int
@@ -37,7 +38,12 @@ func CalculatePodToCreateAndDelete(params Parameters) (nbCreation, nbDeletion in
3738
effectiveUnresponsive = params.MaxUnschedulablePod
3839
}
3940

40-
nbDeletion = params.MaxUnavailablePod - (params.NbNodes - effectiveUnresponsive - params.NbAvailablesPod - params.NbOldAvailablesPod)
41+
nbDeletion = params.MaxUnavailablePod - (params.NbNodes - effectiveUnresponsive - params.NbAvailablesPod - params.NbOldAvailablesPod) + params.NbUnreadyPods
42+
43+
if nbDeletion > params.MaxUnavailablePod {
44+
nbDeletion = params.MaxUnavailablePod
45+
}
46+
4147
if nbDeletion < 0 {
4248
nbDeletion = 0
4349
}

0 commit comments

Comments
 (0)