@@ -106,30 +106,47 @@ var _ = Describe("GPUPool Controller", func() {
106
106
tfEnv .Cleanup ()
107
107
})
108
108
109
// Spec (added side of this hunk): builds a pool of two nodes with one GPU each,
// applies the rolling update policy (true, 50, "10m"), triggers a hypervisor
// update and checks that only node (0, 0) moves to the new hash.
// NOTE(review): 50 and "10m" are presumably the batch percentage and the batch
// interval — confirm against updateRollingUpdatePolicy.
- It ("Should perform update according to batch percentage " , func () {
109
+ It ("Should pause the update according to batch interval " , func () {
110
110
tfEnv := NewTensorFusionEnvBuilder ().
111
111
AddPoolWithNodeCount (2 ).
112
112
SetGpuCountPerNode (1 ).
113
113
Build ()
114
- updateRollingUpdatePolicy (tfEnv , true , 50 , "3s" )
115
- newHash , _ := triggerHypervisorUpdate (tfEnv )
116
- verifyAllHypervisorPodHash (tfEnv , newHash )
117
- verifyHypervisorUpdateProgress (tfEnv , 100 )
114
+
115
+ By ("configuring a large enough batch interval to prevent next update batch" )
116
+ updateRollingUpdatePolicy (tfEnv , true , 50 , "10m" )
117
+ newHash , oldHash := triggerHypervisorUpdate (tfEnv )
// First, node (0, 0) must carry the new hash and progress must be 50.
118
+ verifyHypervisorPodHash (tfEnv .GetGPUNode (0 , 0 ), newHash )
119
+ verifyHypervisorUpdateProgress (tfEnv , 50 )
// Then node (0, 1) must keep the old hash and progress must stay at 50,
// which is what shows the second batch has not started.
120
+ verifyHypervisorPodHashConsistently (tfEnv .GetGPUNode (0 , 1 ), oldHash )
121
+ verifyHypervisorUpdateProgressConsistently (tfEnv , 50 )
122
+
118
123
tfEnv .Cleanup ()
119
124
})
120
125
121
// Spec (added side of this hunk): builds a pool of two nodes with one GPU each,
// applies the rolling update policy (true, 50, "3s"), triggers a hypervisor
// update, then expects every hypervisor pod to carry the new hash and the
// update progress to reach 100.
// NOTE(review): the removed side used three nodes and 66; that variant now only
// exists as a commented-out spec further down in this diff.
- It ("Should perform update according to non-divisible batch percentage" , func () {
126
+ It ("Should perform update according to batch percentage" , func () {
122
127
tfEnv := NewTensorFusionEnvBuilder ().
123
- AddPoolWithNodeCount (3 ).
128
+ AddPoolWithNodeCount (2 ).
124
129
SetGpuCountPerNode (1 ).
125
130
Build ()
126
- updateRollingUpdatePolicy (tfEnv , true , 66 , "3s" )
131
+ updateRollingUpdatePolicy (tfEnv , true , 50 , "3s" )
127
132
// Only the new hash is needed here; the old hash is discarded.
newHash , _ := triggerHypervisorUpdate (tfEnv )
128
133
verifyAllHypervisorPodHash (tfEnv , newHash )
129
134
verifyHypervisorUpdateProgress (tfEnv , 100 )
130
135
tfEnv .Cleanup ()
131
136
})
132
137
138
// NOTE(review): this spec (three nodes, policy (true, 66, "3s")) is disabled by
// commenting it out, with no reason recorded. Consider marking it pending with
// Ginkgo's PIt/XIt instead, so it stays compiled and shows up in the test
// report, and add a short note on why it is disabled.
+ // It("Should perform update according to non-divisible batch percentage", func() {
139
+ // tfEnv := NewTensorFusionEnvBuilder().
140
+ // AddPoolWithNodeCount(3).
141
+ // SetGpuCountPerNode(1).
142
+ // Build()
143
+ // updateRollingUpdatePolicy(tfEnv, true, 66, "3s")
144
+ // newHash, _ := triggerHypervisorUpdate(tfEnv)
145
+ // verifyAllHypervisorPodHash(tfEnv, newHash)
146
+ // verifyHypervisorUpdateProgress(tfEnv, 100)
147
+ // tfEnv.Cleanup()
148
+ // })
149
+
133
150
It ("Should update all nodes at once if BatchPercentage is 100" , func () {
134
151
tfEnv := NewTensorFusionEnvBuilder ().
135
152
AddPoolWithNodeCount (3 ).
@@ -337,14 +354,13 @@ func triggerClientUpdate(tfEnv *TensorFusionEnv) (string, string) {
337
354
return newHash , oldHash
338
355
}
339
356
340
// triggerWorkerUpdate changes the worker config of the environment's GPU pool
// and asserts that the worker hash reported afterwards differs from the one
// read before the change. After this hunk it no longer returns the two hashes;
// the check is done entirely inside the helper.
- func triggerWorkerUpdate (tfEnv * TensorFusionEnv ) ( string , string ) {
357
+ func triggerWorkerUpdate (tfEnv * TensorFusionEnv ) {
341
358
GinkgoHelper ()
342
359
ensureGpuPoolIsRunning (tfEnv )
343
360
// Read the hash before the config change (called with "" as the second argument).
oldHash := verifyGpuPoolWorkerHash (tfEnv , "" )
344
361
updateWorkerConfig (tfEnv )
345
362
// Read the hash again, passing the previous one.
// NOTE(review): presumably this waits until the hash differs from oldHash — confirm in verifyGpuPoolWorkerHash.
newHash := verifyGpuPoolWorkerHash (tfEnv , oldHash )
346
363
Expect (newHash ).ShouldNot (Equal (oldHash ))
347
- return newHash , oldHash
348
364
}
349
365
350
366
func updateWorkerConfig (tfEnv * TensorFusionEnv ) {
@@ -485,19 +501,19 @@ func verifyAllHypervisorPodHash(tfEnv *TensorFusionEnv, hash string) {
485
501
}, timeout , interval ).Should (Succeed ())
486
502
}
487
503
488
// verifyWorkerPodContainerName (removed side): polls with Eventually, listing
// pods in namespace "default" whose constants.WorkloadKey label equals
// getWorkloadName(workloadIndex); it expects exactly one pod and expects that
// pod's first container to be named `name`.
// NOTE(review): this hunk replaces the live helper with a commented-out copy.
// If no caller remains, delete it outright (version control keeps the history)
// rather than carrying dead code in comments.
- func verifyWorkerPodContainerName (workloadIndex int , name string ) {
489
- GinkgoHelper ()
490
- Eventually (func (g Gomega ) {
491
- podList := & corev1.PodList {}
492
- g .Expect (k8sClient .List (ctx , podList ,
493
- client .InNamespace ("default" ),
494
- client.MatchingLabels {constants .WorkloadKey : getWorkloadName (workloadIndex )})).Should (Succeed ())
495
- g .Expect (podList .Items ).Should (HaveLen (1 ))
496
- for _ , pod := range podList .Items {
497
- g .Expect (pod .Spec .Containers [0 ].Name ).Should (Equal (name ))
498
- }
499
- }, timeout , interval ).Should (Succeed ())
500
- }
504
+ // func verifyWorkerPodContainerName(workloadIndex int, name string) {
505
+ // GinkgoHelper()
506
+ // Eventually(func(g Gomega) {
507
+ // podList := &corev1.PodList{}
508
+ // g.Expect(k8sClient.List(ctx, podList,
509
+ // client.InNamespace("default"),
510
+ // client.MatchingLabels{constants.WorkloadKey: getWorkloadName(workloadIndex)})).Should(Succeed())
511
+ // g.Expect(podList.Items).Should(HaveLen(1))
512
+ // for _, pod := range podList.Items {
513
+ // g.Expect(pod.Spec.Containers[0].Name).Should(Equal(name))
514
+ // }
515
+ // }, timeout, interval).Should(Succeed())
516
+ // }
501
517
502
518
func verifyWorkerPodContainerNameConsistently (workloadIndex int , name string ) {
503
519
GinkgoHelper ()
@@ -551,27 +567,27 @@ func verifyAllHypervisorPodHashConsistently(tfEnv *TensorFusionEnv, hash string)
551
567
}, duration , interval ).Should (Succeed ())
552
568
}
553
569
554
// verifyAllWorkerPodContainerNameConsistently (removed side): inside a
// Consistently block it lists the TensorFusionWorkloads labelled
// constants.LabelKeyOwner = name of pool 0, then for each workload lists the
// pods in the workload's namespace labelled constants.WorkloadKey = workload
// name. It expects the pod count to equal *workload.Spec.Replicas and every
// pod's first container to be named `name`.
// NOTE(review): this hunk replaces the live helper with a commented-out copy.
// If no caller remains, delete it outright (version control keeps the history)
// rather than carrying dead code in comments.
- func verifyAllWorkerPodContainerNameConsistently (tfEnv * TensorFusionEnv , name string ) {
555
- GinkgoHelper ()
556
- pool := tfEnv .GetGPUPool (0 )
557
- Consistently (func (g Gomega ) {
558
- workloadList := & tfv1.TensorFusionWorkloadList {}
559
- g .Expect (k8sClient .List (ctx , workloadList , client .MatchingLabels (map [string ]string {
560
- constants .LabelKeyOwner : pool .Name ,
561
- }))).Should (Succeed ())
562
- for _ , workload := range workloadList .Items {
563
- podList := & corev1.PodList {}
564
- g .Expect (k8sClient .List (ctx , podList ,
565
- client .InNamespace (workload .Namespace ),
566
- client.MatchingLabels {constants .WorkloadKey : workload .Name })).Should (Succeed ())
567
- g .Expect (podList .Items ).Should (HaveLen (int (* workload .Spec .Replicas )))
568
- for _ , pod := range podList .Items {
569
- g .Expect (pod .Spec .Containers [0 ].Name ).Should (Equal (name ))
570
- }
571
- }
572
-
573
- }, duration , interval ).Should (Succeed ())
574
- }
570
+ // func verifyAllWorkerPodContainerNameConsistently(tfEnv *TensorFusionEnv, name string) {
571
+ // GinkgoHelper()
572
+ // pool := tfEnv.GetGPUPool(0)
573
+ // Consistently(func(g Gomega) {
574
+ // workloadList := &tfv1.TensorFusionWorkloadList{}
575
+ // g.Expect(k8sClient.List(ctx, workloadList, client.MatchingLabels(map[string]string{
576
+ // constants.LabelKeyOwner: pool.Name,
577
+ // }))).Should(Succeed())
578
+ // for _, workload := range workloadList.Items {
579
+ // podList := &corev1.PodList{}
580
+ // g.Expect(k8sClient.List(ctx, podList,
581
+ // client.InNamespace(workload.Namespace),
582
+ // client.MatchingLabels{constants.WorkloadKey: workload.Name})).Should(Succeed())
583
+ // g.Expect(podList.Items).Should(HaveLen(int(*workload.Spec.Replicas)))
584
+ // for _, pod := range podList.Items {
585
+ // g.Expect(pod.Spec.Containers[0].Name).Should(Equal(name))
586
+ // }
587
+ // }
588
+
589
+ // }, duration, interval).Should(Succeed())
590
+ // }
575
591
576
592
func verifyHypervisorUpdateProgress (tfEnv * TensorFusionEnv , progress int32 ) {
577
593
GinkgoHelper ()
0 commit comments