@@ -297,18 +297,22 @@ func (s *SystemScheduler) computeJobAllocs() error {
297297 continue
298298 }
299299
300- isCanarying [tg .Name ] = ! tg .Update .IsEmpty () && tg .Update .Canary > 0 && dstate != nil && ! dstate .Promoted
301- }
302-
303- // Initially, if the job requires canaries, we place all of them on all
304- // eligible nodes. At this point we know which nodes are feasible, so we
305- // evict unnedded canaries.
306- if err := s .evictUnneededCanaries (s .job , s .nodes , r ); err != nil {
307- return fmt .Errorf ("failed to evict canaries for job '%s': %v" , s .eval .JobID , err )
300+ // a system job is canarying if:
301+ // - it has a non-empty update block (just a sanity check, all submitted
302+ // jobs should have a non-empty update block as part of
303+ // canonicalization)
304+ // - canary parameter in the update block has to be positive
305+ // - deployment has to be non-nil and it cannot have been promoted
306+ // - this cannot be the initial job version
307+ isCanarying [tg .Name ] = ! tg .Update .IsEmpty () &&
308+ tg .Update .Canary > 0 &&
309+ dstate != nil &&
310+ ! dstate .Promoted &&
311+ s .job .Version != 0
308312 }
309313
310314 // check if the deployment is complete
311- deploymentComplete := false
315+ deploymentComplete := true
312316 for _ , tg := range s .job .TaskGroups {
313317 groupComplete := s .isDeploymentComplete (tg .Name , r , isCanarying [tg .Name ])
314318 deploymentComplete = deploymentComplete && groupComplete
@@ -388,7 +392,7 @@ func (s *SystemScheduler) computeJobAllocs() error {
388392 }
389393
390394 // Compute the placements
391- return s .computePlacements (r .Place , allocExistsForTaskGroup )
395+ return s .computePlacements (r .Place , allocExistsForTaskGroup , isCanarying )
392396}
393397
394398func mergeNodeFiltered (acc , curr * structs.AllocMetric ) * structs.AllocMetric {
@@ -416,7 +420,10 @@ func mergeNodeFiltered(acc, curr *structs.AllocMetric) *structs.AllocMetric {
416420}
417421
418422// computePlacements computes placements for allocations
419- func (s * SystemScheduler ) computePlacements (place []reconciler.AllocTuple , existingByTaskGroup map [string ]bool ) error {
423+ func (s * SystemScheduler ) computePlacements (
424+ place []reconciler.AllocTuple , existingByTaskGroup map [string ]bool ,
425+ isCanarying map [string ]bool ) error {
426+
420427 nodeByID := make (map [string ]* structs.Node , len (s .nodes ))
421428 for _ , node := range s .nodes {
422429 nodeByID [node .ID ] = node
@@ -431,6 +438,7 @@ func (s *SystemScheduler) computePlacements(place []reconciler.AllocTuple, exist
431438 }
432439
433440 nodes := make ([]* structs.Node , 1 )
441+ feasibleNodesCount := 0 // count the nodes that pass feasibility selection
434442 for _ , missing := range place {
435443 tgName := missing .TaskGroup .Name
436444
@@ -603,6 +611,16 @@ func (s *SystemScheduler) computePlacements(place []reconciler.AllocTuple, exist
603611 if s .deployment != nil {
604612 s .deployment .TaskGroups [tgName ].DesiredTotal += 1
605613 }
614+
615+ // count this node as feasible
616+ feasibleNodesCount += 1
617+ }
618+
619+ // Initially, if the job requires canaries, we place all of them on all
620+ // eligible nodes. At this point we know which nodes are feasible, so we
621+ // evict unnedded canaries.
622+ if err := s .evictUnneededCanaries (feasibleNodesCount , isCanarying ); err != nil {
623+ return fmt .Errorf ("failed to evict canaries for job '%s': %v" , s .eval .JobID , err )
606624 }
607625
608626 return nil
@@ -688,81 +706,48 @@ func evictAndPlace(ctx feasible.Context, job *structs.Job, diff *reconciler.Node
688706}
689707
690708// evictAndPlaceCanaries checks how many canaries are needed against the amount
691- // of feasible nodes, and evicts unnecessary placements.
692- func (s * SystemScheduler ) evictUnneededCanaries (job * structs.Job , readyNodes []* structs.Node ,
693- reconcileResult * reconciler.NodeReconcileResult ) error {
694-
695- if job .Stopped () {
696- return nil
697- }
709+ // of feasible nodes, and removes unnecessary placements from the plan.
710+ func (s * SystemScheduler ) evictUnneededCanaries (feasibleNodes int , isCanarying map [string ]bool ) error {
698711
699712 // calculate how many canary placement we expect each task group to have: it
700713 // should be the tg.update.canary percentage of eligible nodes, rounded up
701714 // to the nearest integer
702715 requiredCanaries := make (map [string ]int )
703- for _ , tg := range job .TaskGroups {
704- if tg .Update . IsEmpty () || tg . Update . Canary == 0 {
716+ for _ , tg := range s . job .TaskGroups {
717+ if ! isCanarying [ tg .Name ] {
705718 continue
706719 }
707-
708- requiredCanaries [tg .Name ] = int (math .Ceil (float64 (tg .Update .Canary ) * float64 (len (readyNodes )) / 100 ))
720+ requiredCanaries [tg .Name ] = int (math .Ceil (float64 (tg .Update .Canary ) * float64 (feasibleNodes ) / 100 ))
709721 }
710722
711- nodeByID := make ( map [ string ] * structs. Node , len ( s . nodes ))
712- for _ , node := range s . nodes {
713- nodeByID [ node . ID ] = node
723+ // no canaries to consider, quit early
724+ if len ( requiredCanaries ) == 0 {
725+ return nil
714726 }
715727
716- nodes := make ([]* structs.Node , 1 )
717- updatedPlacements := make ([]reconciler.AllocTuple , 0 )
718- for _ , r := range reconcileResult .Update {
719- if ! r .Canary {
720- // if this isn't a canary placement, add it to the updated array but
721- // don't perform any checks on it
722- updatedPlacements = append (updatedPlacements , r )
723- continue
724- }
725-
726- // are there still canaries to be placed? if not, don't perform a costly
727- // feasibility check
728- count , ok := requiredCanaries [r .TaskGroup .Name ]
729- if ! ok {
730- // programming error
731- return fmt .Errorf (
732- "we are trying to place a task group %s that appears not to be present in job %s!" ,
733- r .TaskGroup .Name , job .Name ,
734- )
735- }
736-
737- // disregard this placement
738- if count == 0 {
739- continue
740- }
741-
742- node , ok := nodeByID [r .Alloc .NodeID ]
743- if ! ok {
744- // should never happen
745- return fmt .Errorf ("can't find node %s" , r .Alloc .NodeID )
746- }
747-
748- // Update the set of placement nodes
749- nodes [0 ] = node
750- s .stack .SetNodes (nodes )
751-
752- // Attempt to match the task group
753- option := s .stack .Select (r .TaskGroup , & feasible.SelectOptions {AllocName : r .Name })
728+ // set the correct desired canary counts
729+ for tgName , desired := range requiredCanaries {
730+ s .deployment .TaskGroups [tgName ].DesiredCanaries = desired
731+ }
754732
755- if option != nil {
756- // we found a feasible node for this update. Decrease the counter
757- // for that TG, and add this placement to the new array.
758- requiredCanaries [r .TaskGroup .Name ] -= 1
759- updatedPlacements = append (updatedPlacements , r )
733+ // iterate over node allocations to find canary allocs
734+ for node , allocations := range s .plan .NodeAllocation {
735+ n := 0
736+ for _ , alloc := range allocations {
737+ if alloc .DeploymentStatus == nil {
738+ continue
739+ }
740+ if alloc .DeploymentStatus .Canary {
741+ if requiredCanaries [alloc .TaskGroup ] != 0 {
742+ requiredCanaries [alloc .TaskGroup ] -= 1
743+ allocations [n ] = alloc // we do this in order to avoid allocating another slice
744+ n += 1
745+ }
746+ }
760747 }
748+ s .plan .NodeAllocation [node ] = allocations [:n ]
761749 }
762750
763- // update placements
764- reconcileResult .Update = updatedPlacements
765-
766751 return nil
767752}
768753
0 commit comments