Commit b057f84

some good ideas, I think
1 parent a5a96e1 commit b057f84

1 file changed: +56 −71 lines changed

scheduler/scheduler_system.go

Lines changed: 56 additions & 71 deletions
@@ -297,18 +297,22 @@ func (s *SystemScheduler) computeJobAllocs() error {
 			continue
 		}

-		isCanarying[tg.Name] = !tg.Update.IsEmpty() && tg.Update.Canary > 0 && dstate != nil && !dstate.Promoted
-	}
-
-	// Initially, if the job requires canaries, we place all of them on all
-	// eligible nodes. At this point we know which nodes are feasible, so we
-	// evict unnedded canaries.
-	if err := s.evictUnneededCanaries(s.job, s.nodes, r); err != nil {
-		return fmt.Errorf("failed to evict canaries for job '%s': %v", s.eval.JobID, err)
+		// a system job is canarying if:
+		// - it has a non-empty update block (just a sanity check, all submitted
+		//   jobs should have a non-empty update block as part of
+		//   canonicalization)
+		// - canary parameter in the update block has to be positive
+		// - deployment has to be non-nil and it cannot have been promoted
+		// - this cannot be the initial job version
+		isCanarying[tg.Name] = !tg.Update.IsEmpty() &&
+			tg.Update.Canary > 0 &&
+			dstate != nil &&
+			!dstate.Promoted &&
+			s.job.Version != 0
 	}

 	// check if the deployment is complete
-	deploymentComplete := false
+	deploymentComplete := true
 	for _, tg := range s.job.TaskGroups {
 		groupComplete := s.isDeploymentComplete(tg.Name, r, isCanarying[tg.Name])
 		deploymentComplete = deploymentComplete && groupComplete
@@ -388,7 +392,7 @@ func (s *SystemScheduler) computeJobAllocs() error {
 	}

 	// Compute the placements
-	return s.computePlacements(r.Place, allocExistsForTaskGroup)
+	return s.computePlacements(r.Place, allocExistsForTaskGroup, isCanarying)
 }

 func mergeNodeFiltered(acc, curr *structs.AllocMetric) *structs.AllocMetric {
@@ -416,7 +420,10 @@ func mergeNodeFiltered(acc, curr *structs.AllocMetric) *structs.AllocMetric {
 }

 // computePlacements computes placements for allocations
-func (s *SystemScheduler) computePlacements(place []reconciler.AllocTuple, existingByTaskGroup map[string]bool) error {
+func (s *SystemScheduler) computePlacements(
+	place []reconciler.AllocTuple, existingByTaskGroup map[string]bool,
+	isCanarying map[string]bool) error {
+
 	nodeByID := make(map[string]*structs.Node, len(s.nodes))
 	for _, node := range s.nodes {
 		nodeByID[node.ID] = node
@@ -431,6 +438,7 @@ func (s *SystemScheduler) computePlacements(place []reconciler.AllocTuple, exist
 	}

 	nodes := make([]*structs.Node, 1)
+	feasibleNodesCount := 0 // count the nodes that pass feasibility selection
 	for _, missing := range place {
 		tgName := missing.TaskGroup.Name

@@ -603,6 +611,16 @@ func (s *SystemScheduler) computePlacements(place []reconciler.AllocTuple, exist
 		if s.deployment != nil {
 			s.deployment.TaskGroups[tgName].DesiredTotal += 1
 		}
+
+		// count this node as feasible
+		feasibleNodesCount += 1
+	}
+
+	// Initially, if the job requires canaries, we place all of them on all
+	// eligible nodes. At this point we know which nodes are feasible, so we
+	// evict unneeded canaries.
+	if err := s.evictUnneededCanaries(feasibleNodesCount, isCanarying); err != nil {
+		return fmt.Errorf("failed to evict canaries for job '%s': %v", s.eval.JobID, err)
 	}

 	return nil
@@ -688,81 +706,48 @@ func evictAndPlace(ctx feasible.Context, job *structs.Job, diff *reconciler.Node
 }

 // evictAndPlaceCanaries checks how many canaries are needed against the amount
-// of feasible nodes, and evicts unnecessary placements.
-func (s *SystemScheduler) evictUnneededCanaries(job *structs.Job, readyNodes []*structs.Node,
-	reconcileResult *reconciler.NodeReconcileResult) error {
-
-	if job.Stopped() {
-		return nil
-	}
+// of feasible nodes, and removes unnecessary placements from the plan.
+func (s *SystemScheduler) evictUnneededCanaries(feasibleNodes int, isCanarying map[string]bool) error {

 	// calculate how many canary placement we expect each task group to have: it
 	// should be the tg.update.canary percentage of eligible nodes, rounded up
 	// to the nearest integer
 	requiredCanaries := make(map[string]int)
-	for _, tg := range job.TaskGroups {
-		if tg.Update.IsEmpty() || tg.Update.Canary == 0 {
+	for _, tg := range s.job.TaskGroups {
+		if !isCanarying[tg.Name] {
 			continue
 		}
-
-		requiredCanaries[tg.Name] = int(math.Ceil(float64(tg.Update.Canary) * float64(len(readyNodes)) / 100))
+		requiredCanaries[tg.Name] = int(math.Ceil(float64(tg.Update.Canary) * float64(feasibleNodes) / 100))
 	}

-	nodeByID := make(map[string]*structs.Node, len(s.nodes))
-	for _, node := range s.nodes {
-		nodeByID[node.ID] = node
+	// no canaries to consider, quit early
+	if len(requiredCanaries) == 0 {
+		return nil
 	}

-	nodes := make([]*structs.Node, 1)
-	updatedPlacements := make([]reconciler.AllocTuple, 0)
-	for _, r := range reconcileResult.Update {
-		if !r.Canary {
-			// if this isn't a canary placement, add it to the updated array but
-			// don't perform any checks on it
-			updatedPlacements = append(updatedPlacements, r)
-			continue
-		}
-
-		// are there still canaries to be placed? if not, don't perform a costly
-		// feasibility check
-		count, ok := requiredCanaries[r.TaskGroup.Name]
-		if !ok {
-			// programming error
-			return fmt.Errorf(
-				"we are trying to place a task group %s that appears not to be present in job %s!",
-				r.TaskGroup.Name, job.Name,
-			)
-		}
-
-		// disregard this placement
-		if count == 0 {
-			continue
-		}
-
-		node, ok := nodeByID[r.Alloc.NodeID]
-		if !ok {
-			// should never happen
-			return fmt.Errorf("can't find node %s", r.Alloc.NodeID)
-		}
-
-		// Update the set of placement nodes
-		nodes[0] = node
-		s.stack.SetNodes(nodes)
-
-		// Attempt to match the task group
-		option := s.stack.Select(r.TaskGroup, &feasible.SelectOptions{AllocName: r.Name})
+	// set the correct desired canary counts
+	for tgName, desired := range requiredCanaries {
+		s.deployment.TaskGroups[tgName].DesiredCanaries = desired
+	}

-		if option != nil {
-			// we found a feasible node for this update. Decrease the counter
-			// for that TG, and add this placement to the new array.
-			requiredCanaries[r.TaskGroup.Name] -= 1
-			updatedPlacements = append(updatedPlacements, r)
+	// iterate over node allocations to trim unneeded canary allocs
+	for node, allocations := range s.plan.NodeAllocation {
+		n := 0
+		for _, alloc := range allocations {
+			// canary allocs are kept only while their group still needs them
+			if alloc.DeploymentStatus != nil && alloc.DeploymentStatus.Canary {
+				if requiredCanaries[alloc.TaskGroup] == 0 {
+					continue // an unneeded canary, drop it from the plan
+				}
+				requiredCanaries[alloc.TaskGroup] -= 1
+			}
+			// keep this alloc, reusing the slice to avoid allocating another one
+			allocations[n] = alloc
+			n += 1
 		}
+		s.plan.NodeAllocation[node] = allocations[:n]
 	}

-	// update placements
-	reconcileResult.Update = updatedPlacements
-
 	return nil
 }
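The required canary count in the new evictUnneededCanaries is exactly what the comment above requiredCanaries describes: the update block's canary percentage of the feasible nodes, rounded up to the nearest integer. A minimal standalone sketch of that arithmetic (requiredCanaryCount is a hypothetical helper added for illustration; the scheduler inlines the same expression with tg.Update.Canary and feasibleNodes):

package main

import (
	"fmt"
	"math"
)

// requiredCanaryCount mirrors the expression used in evictUnneededCanaries:
// the canary percentage of the feasible node count, rounded up.
func requiredCanaryCount(canaryPercent, feasibleNodes int) int {
	return int(math.Ceil(float64(canaryPercent) * float64(feasibleNodes) / 100))
}

func main() {
	fmt.Println(requiredCanaryCount(10, 25)) // 10% of 25 nodes -> 2.5, rounded up to 3
	fmt.Println(requiredCanaryCount(50, 3))  // 50% of 3 nodes  -> 1.5, rounded up to 2
	fmt.Println(requiredCanaryCount(100, 7)) // 100% keeps a canary on every feasible node: 7
}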
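The trimming loop in the last hunk leans on the common Go filter-in-place idiom: overwrite the slice's backing array with the elements to keep and re-slice to the new length, so no second slice is allocated. A small self-contained sketch of that idiom under simplified types (the alloc struct and trimCanaries are stand-ins for illustration; the real code walks s.plan.NodeAllocation and *structs.Allocation values with their DeploymentStatus):

package main

import "fmt"

// alloc is a simplified stand-in for the fields the filter cares about.
type alloc struct {
	taskGroup string
	canary    bool
}

// trimCanaries keeps every non-canary allocation and at most required[group]
// canaries per task group, reusing the slice's backing array in place.
func trimCanaries(allocs []alloc, required map[string]int) []alloc {
	n := 0
	for _, a := range allocs {
		if a.canary {
			if required[a.taskGroup] == 0 {
				continue // an unneeded canary: filtered out
			}
			required[a.taskGroup]--
		}
		allocs[n] = a // keep, writing over the already-scanned prefix
		n++
	}
	return allocs[:n]
}

func main() {
	allocs := []alloc{
		{"web", true}, {"web", true}, {"web", true}, {"cache", false},
	}
	// only two web canaries are required, so the third is trimmed
	fmt.Println(trimCanaries(allocs, map[string]int{"web": 2}))
	// prints: [{web true} {web true} {cache false}]
}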