@@ -18,6 +18,7 @@ package lifelonglearning
18
18
19
19
import (
20
20
"context"
21
+ "crypto/sha256"
21
22
"encoding/json"
22
23
"fmt"
23
24
"k8s.io/apimachinery/pkg/types"
@@ -27,6 +28,7 @@ import (
27
28
v1 "k8s.io/api/core/v1"
28
29
"k8s.io/apimachinery/pkg/api/errors"
29
30
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
31
+ lruexpirecache "k8s.io/apimachinery/pkg/util/cache"
30
32
utilrand "k8s.io/apimachinery/pkg/util/rand"
31
33
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
32
34
"k8s.io/apimachinery/pkg/util/wait"
@@ -52,6 +54,8 @@ const (
52
54
KindName = "LifelongLearningJob"
53
55
// Name is this controller name
54
56
Name = "LifelongLearning"
57
+ // VirtualKubeletNode is the node name that identifies a virtual-kubelet node.
58
+ VirtualKubeletNode = "virtual-kubelet"
55
59
)
56
60
57
61
// Kind contains the schema.GroupVersionKind for this controller type.
@@ -82,6 +86,8 @@ type Controller struct {
82
86
cfg * config.ControllerConfig
83
87
84
88
sendToEdgeFunc runtime.DownstreamSendFunc
89
+
90
+ lruExpireCache * lruexpirecache.LRUExpireCache
85
91
}
86
92
87
93
// Run starts the main goroutine responsible for watching and syncing jobs.
@@ -379,14 +385,17 @@ func (c *Controller) transitJobState(job *sednav1.LifelongLearningJob) (bool, er
379
385
// include train, eval, deploy pod
380
386
var err error
381
387
if jobStage == sednav1 .LLJobDeploy {
382
- err = c .restartInferPod (job )
383
- if err != nil {
384
- klog .V (2 ).Infof ("lifelonglearning job %v/%v inference pod failed to restart, err:%s" , job .Namespace , job .Name , err )
385
- return needUpdated , err
386
- }
388
+ if ! c .hasJobInCache (job ) {
389
+ err = c .restartInferPod (job )
390
+ if err != nil {
391
+ klog .V (2 ).Infof ("lifelonglearning job %v/%v inference pod failed to restart, err:%s" , job .Namespace , job .Name , err )
392
+ return needUpdated , err
393
+ }
387
394
388
- klog .V (2 ).Infof ("lifelonglearning job %v/%v inference pod restarts successfully" , job .Namespace , job .Name )
389
- newConditionType = sednav1 .LLJobStageCondCompleted
395
+ klog .V (2 ).Infof ("lifelonglearning job %v/%v inference pod restarts successfully" , job .Namespace , job .Name )
396
+ newConditionType = sednav1 .LLJobStageCondCompleted
397
+ c .addJobToCache (job )
398
+ }
390
399
} else {
391
400
if podStatus != v1 .PodPending && podStatus != v1 .PodRunning {
392
401
err = c .createPod (job , jobStage )
@@ -406,10 +415,6 @@ func (c *Controller) transitJobState(job *sednav1.LifelongLearningJob) (bool, er
406
415
407
416
// watch pod status, if pod running, set type running
408
417
newConditionType = sednav1 .LLJobStageCondRunning
409
- } else if podStatus == v1 .PodSucceeded {
410
- // watch pod status, if pod completed, set type completed
411
- newConditionType = sednav1 .LLJobStageCondCompleted
412
- klog .V (2 ).Infof ("lifelonglearning job %v/%v %v stage completed!" , job .Namespace , job .Name , jobStage )
413
418
} else if podStatus == v1 .PodFailed {
414
419
newConditionType = sednav1 .LLJobStageCondFailed
415
420
klog .V (2 ).Infof ("lifelonglearning job %v/%v %v stage failed!" , job .Namespace , job .Name , jobStage )
@@ -491,6 +496,25 @@ func (c *Controller) getSpecifiedPods(job *sednav1.LifelongLearningJob, podType
491
496
return latestPod
492
497
}
493
498
499
+ func (c * Controller ) getHas256 (target interface {}) string {
500
+ h := sha256 .New ()
501
+ h .Write ([]byte (fmt .Sprintf ("%v" , target )))
502
+ return fmt .Sprintf ("%x" , h .Sum (nil ))
503
+ }
504
+
505
+ func (c * Controller ) addJobToCache (job * sednav1.LifelongLearningJob ) {
506
+ c .lruExpireCache .Add (c .getHas256 (job .Status ), job , 10 * time .Second )
507
+ }
508
+
509
+ func (c * Controller ) hasJobInCache (job * sednav1.LifelongLearningJob ) bool {
510
+ _ , ok := c .lruExpireCache .Get (c .getHas256 (job .Status ))
511
+ if ! ok {
512
+ return false
513
+ }
514
+
515
+ return true
516
+ }
517
+
494
518
func (c * Controller ) restartInferPod (job * sednav1.LifelongLearningJob ) error {
495
519
inferPod := c .getSpecifiedPods (job , runtime .InferencePodType )
496
520
if inferPod == nil {
@@ -542,6 +566,18 @@ func IsJobFinished(j *sednav1.LifelongLearningJob) bool {
542
566
return false
543
567
}
544
568
569
+ func (c * Controller ) addPodAnnotations (spec * v1.PodTemplateSpec , key string , value string ) {
570
+ ann := spec .GetAnnotations ()
571
+ if ann == nil {
572
+ ann = make (map [string ]string )
573
+ }
574
+
575
+ if _ , ok := ann [key ]; ! ok {
576
+ ann [key ] = value
577
+ spec .SetAnnotations (ann )
578
+ }
579
+ }
580
+
545
581
func (c * Controller ) createPod (job * sednav1.LifelongLearningJob , podtype sednav1.LLJobStage ) (err error ) {
546
582
ctx := context .Background ()
547
583
var podTemplate * v1.PodTemplateSpec
@@ -592,12 +628,20 @@ func (c *Controller) createPod(job *sednav1.LifelongLearningJob, podtype sednav1
592
628
}
593
629
594
630
var workerParam * runtime.WorkerParam = new (runtime.WorkerParam )
631
+
595
632
if podtype == sednav1 .LLJobTrain {
596
- workerParam .WorkerType = "Train"
633
+ workerParam .WorkerType = runtime . TrainPodType
597
634
598
635
podTemplate = & job .Spec .TrainSpec .Template
599
636
// Env parameters for train
600
637
638
+ c .addPodAnnotations (podTemplate , "type" , workerParam .WorkerType )
639
+ c .addPodAnnotations (podTemplate , "data" , dataURL )
640
+ datasetUseInitializer := true
641
+ if podTemplate .Spec .NodeName == VirtualKubeletNode {
642
+ datasetUseInitializer = false
643
+ }
644
+
601
645
workerParam .Env = map [string ]string {
602
646
"NAMESPACE" : job .Namespace ,
603
647
"JOB_NAME" : job .Name ,
@@ -621,7 +665,7 @@ func (c *Controller) createPod(job *sednav1.LifelongLearningJob, podtype sednav1
621
665
URL : & runtime.MountURL {
622
666
URL : dataURL ,
623
667
Secret : jobSecret ,
624
- DownloadByInitializer : true ,
668
+ DownloadByInitializer : datasetUseInitializer ,
625
669
},
626
670
EnvName : "TRAIN_DATASET_URL" ,
627
671
},
@@ -632,14 +676,25 @@ func (c *Controller) createPod(job *sednav1.LifelongLearningJob, podtype sednav1
632
676
Secret : datasetSecret ,
633
677
URL : originalDataURLOrIndex ,
634
678
Indirect : dataset .Spec .URL != originalDataURLOrIndex ,
635
- DownloadByInitializer : true ,
679
+ DownloadByInitializer : datasetUseInitializer ,
636
680
},
637
681
EnvName : "ORIGINAL_DATASET_URL" ,
638
682
},
639
683
)
640
684
} else {
641
685
podTemplate = & job .Spec .EvalSpec .Template
642
- workerParam .WorkerType = "Eval"
686
+ workerParam .WorkerType = runtime .EvalPodType
687
+
688
+ c .addPodAnnotations (podTemplate , "type" , workerParam .WorkerType )
689
+ c .addPodAnnotations (podTemplate , "data" , dataURL )
690
+ datasetUseInitializer := true
691
+ if podTemplate .Spec .NodeName == VirtualKubeletNode {
692
+ datasetUseInitializer = false
693
+ }
694
+ modelUseInitializer := true
695
+ if podTemplate .Spec .NodeName == VirtualKubeletNode {
696
+ modelUseInitializer = false
697
+ }
643
698
644
699
// Configure Env information for eval by initial WorkerParam
645
700
workerParam .Env = map [string ]string {
@@ -656,7 +711,7 @@ func (c *Controller) createPod(job *sednav1.LifelongLearningJob, podtype sednav1
656
711
modelMountURLs = append (modelMountURLs , runtime.MountURL {
657
712
URL : url ,
658
713
Secret : jobSecret ,
659
- DownloadByInitializer : true ,
714
+ DownloadByInitializer : modelUseInitializer ,
660
715
})
661
716
}
662
717
workerParam .Mounts = append (workerParam .Mounts ,
@@ -679,7 +734,7 @@ func (c *Controller) createPod(job *sednav1.LifelongLearningJob, podtype sednav1
679
734
URL : & runtime.MountURL {
680
735
URL : dataURL ,
681
736
Secret : datasetSecret ,
682
- DownloadByInitializer : true ,
737
+ DownloadByInitializer : datasetUseInitializer ,
683
738
},
684
739
Name : "datasets" ,
685
740
EnvName : "TEST_DATASET_URL" ,
@@ -689,7 +744,7 @@ func (c *Controller) createPod(job *sednav1.LifelongLearningJob, podtype sednav1
689
744
URL : & runtime.MountURL {
690
745
Secret : datasetSecret ,
691
746
URL : originalDataURLOrIndex ,
692
- DownloadByInitializer : true ,
747
+ DownloadByInitializer : datasetUseInitializer ,
693
748
Indirect : dataset .Spec .URL != originalDataURLOrIndex ,
694
749
},
695
750
Name : "origin-dataset" ,
@@ -744,6 +799,7 @@ func (c *Controller) createInferPod(job *sednav1.LifelongLearningJob) error {
744
799
}
745
800
746
801
workerParam .WorkerType = runtime .InferencePodType
802
+ c .addPodAnnotations (& job .Spec .DeploySpec .Template , "type" , workerParam .WorkerType )
747
803
workerParam .HostNetwork = true
748
804
749
805
// create edge pod
@@ -764,10 +820,11 @@ func New(cc *runtime.ControllerContext) (runtime.FeatureControllerI, error) {
764
820
eventBroadcaster .StartRecordingToSink (& v1core.EventSinkImpl {Interface : cc .KubeClient .CoreV1 ().Events ("" )})
765
821
766
822
jc := & Controller {
767
- kubeClient : cc .KubeClient ,
768
- client : cc .SednaClient .SednaV1alpha1 (),
769
- queue : workqueue .NewNamedRateLimitingQueue (workqueue .NewItemExponentialFailureRateLimiter (runtime .DefaultBackOff , runtime .MaxBackOff ), Name ),
770
- cfg : cfg ,
823
+ kubeClient : cc .KubeClient ,
824
+ client : cc .SednaClient .SednaV1alpha1 (),
825
+ queue : workqueue .NewNamedRateLimitingQueue (workqueue .NewItemExponentialFailureRateLimiter (runtime .DefaultBackOff , runtime .MaxBackOff ), Name ),
826
+ cfg : cfg ,
827
+ lruExpireCache : lruexpirecache .NewLRUExpireCache (10 ),
771
828
}
772
829
773
830
jobInformer .Informer ().AddEventHandler (cache.ResourceEventHandlerFuncs {
0 commit comments