Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
b8ff343
scheduler: perform feasibility checks for system canaries before comp…
pkazmierczak Oct 15, 2025
edc1bb9
scheduler: perform feasibility checks for system canaries before comp…
pkazmierczak Oct 28, 2025
c8d7a9a
system scheduler: get the right desiredTotal values
pkazmierczak Oct 28, 2025
9c66203
system scheduler: fixes to computeJobAllocs
pkazmierczak Oct 28, 2025
f96bff6
system deployment tests: fix and annotate counts (#27006)
tgross Oct 29, 2025
19e5a8b
system scheduler: fixes to computeJobAllocs
pkazmierczak Oct 29, 2025
21c7708
system scheduler: evictUnneededCanaries fixes
chrisroberts Oct 30, 2025
a60950b
system scheduler: unit test fixes
pkazmierczak Oct 30, 2025
8a1a402
scheduler: separate the sysbatch scheduler from non-batch system sche…
pkazmierczak Oct 30, 2025
0b4de76
system scheduler: do not leave empty keys in plan.NodeUpdate
pkazmierczak Oct 30, 2025
d318f0f
scheduler: maintain node feasibility information
chrisroberts Oct 31, 2025
e51cce3
scheduler: un-flake TestSystemSched_evictUnneededCanaries
pkazmierczak Oct 31, 2025
fcbe34e
system deployments: failing tests
tgross Oct 3, 2025
91c7acc
system scheduler: handle empty deployment states correctly
pkazmierczak Oct 31, 2025
6fe7a98
comments from @jrasell
pkazmierczak Oct 31, 2025
d55ab6c
system scheduler: reset eligibility when selecting nodes
chrisroberts Oct 31, 2025
c6fbb8a
system scheduler: calculate deployment completion based on deployment…
pkazmierczak Oct 31, 2025
06ebff0
system scheduler: remove obsolete limitReached property
pkazmierczak Nov 3, 2025
55ed562
system scheduler: handle old deployments correctly in the node reconc…
pkazmierczak Nov 3, 2025
f8fbb95
system scheduler: unset current deployment when canceling
chrisroberts Nov 4, 2025
c844d13
system scheduler: stop ineligible allocs if job modified
chrisroberts Nov 4, 2025
bde8049
system scheduler: unflake TestSystemSched_evictUnneededCanaries
pkazmierczak Nov 4, 2025
690e265
system scheduler: correct a typo in the node reconciler
pkazmierczak Nov 4, 2025
f9b6c1f
e2e: correction to TestSystemScheduler/testCanaryDeploymentToAllEligi…
pkazmierczak Nov 4, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 8 additions & 11 deletions e2e/scheduler_system/systemsched_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -218,16 +218,6 @@ func testCanaryDeploymentToAllEligibleNodes(t *testing.T) {
)
t.Cleanup(cleanup2)

// how many eligible nodes do we have?
nodesApi := job2.NodesApi()
nodesList, _, err := nodesApi.List(nil)
must.Nil(t, err)
must.SliceNotEmpty(t, nodesList)

// Get updated allocations
allocs := job2.Allocs()
must.SliceNotEmpty(t, allocs)

deploymentsApi := job2.DeploymentsApi()
deploymentsList, _, err := deploymentsApi.List(nil)
must.NoError(t, err)
Expand All @@ -253,6 +243,10 @@ func testCanaryDeploymentToAllEligibleNodes(t *testing.T) {
return false
})

// Get updated allocations
allocs := job2.Allocs()
must.SliceNotEmpty(t, allocs)

// find allocations from v1 version of the job, they should all be canaries
count := 0
for _, a := range allocs {
Expand All @@ -263,7 +257,10 @@ func testCanaryDeploymentToAllEligibleNodes(t *testing.T) {
}
must.Eq(t, len(initialAllocs), count, must.Sprint("expected canaries to be placed on all eligible nodes"))

updatedDeployment, _, err := deploymentsApi.Info(deployment.ID, nil)
must.NoError(t, err)

// deployment must not be terminal and needs to have the right status
// description set
must.Eq(t, structs.DeploymentStatusDescriptionRunningNeedsPromotion, deployment.StatusDescription)
must.Eq(t, structs.DeploymentStatusDescriptionRunningNeedsPromotion, updatedDeployment.StatusDescription)
}
7 changes: 7 additions & 0 deletions scheduler/feasible/context.go
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,13 @@ func NewEvalEligibility() *EvalEligibility {
}
}

// Reset discards any eligibility state accumulated so far by replacing the
// job-level map, the per-task-group map, and the escaped-constraints map with
// fresh, empty maps. Only these three fields are reassigned here.
func (e *EvalEligibility) Reset() {
	e.tgEscapedConstraints = map[string]bool{}
	e.taskGroups = map[string]map[string]ComputedClassFeasibility{}
	e.job = map[string]ComputedClassFeasibility{}
}

// SetJob takes the job being evaluated and calculates the escaped constraints
// at the job and task group level.
func (e *EvalEligibility) SetJob(job *structs.Job) {
Expand Down
5 changes: 5 additions & 0 deletions scheduler/feasible/stack.go
Original file line number Diff line number Diff line change
Expand Up @@ -352,6 +352,11 @@ func (s *SystemStack) Select(tg *structs.TaskGroup, options *SelectOptions) *Ran
// Reset the binpack selector and context
s.scoreNorm.Reset()
s.ctx.Reset()

// Since the system stack is always evaluating a single
// node, previous eligibility information is not applicable
// so reset it
s.ctx.Eligibility().Reset()
start := time.Now()

// Get the task groups constraints.
Expand Down
57 changes: 0 additions & 57 deletions scheduler/reconciler/deployments.go

This file was deleted.

53 changes: 52 additions & 1 deletion scheduler/reconciler/reconcile_cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -277,7 +277,7 @@ func (a *AllocReconciler) Compute() *ReconcileResults {
// Create the allocation matrix
m := newAllocMatrix(a.jobState.Job, a.jobState.ExistingAllocs)

a.jobState.DeploymentOld, a.jobState.DeploymentCurrent, result.DeploymentUpdates = cancelUnneededDeployments(a.jobState.Job, a.jobState.DeploymentCurrent)
a.jobState.DeploymentOld, a.jobState.DeploymentCurrent, result.DeploymentUpdates = cancelUnneededServiceDeployments(a.jobState.Job, a.jobState.DeploymentCurrent)

// If we are just stopping a job we do not need to do anything more than
// stopping all running allocs
Expand Down Expand Up @@ -569,6 +569,57 @@ func (a *AllocReconciler) computeGroup(group string, all allocSet) (*ReconcileRe
return result, deploymentComplete
}

// cancelUnneededServiceDeployments decides whether deployment d is still the
// current deployment for job j, and stages a cancellation for it when it is
// active but no longer wanted.
//
// The cases are evaluated in order:
//   - the job is stopped: d (if non-nil and active) is cancelled with the
//     "stopped job" description, and d is returned as the old deployment;
//   - there is no deployment: nothing is returned;
//   - d references a different job create index or job version: d (if
//     active) is cancelled with the "newer job" description, and d is
//     returned as the old deployment;
//   - d is already successful: d is returned as the old deployment with no
//     status update;
//   - otherwise d is still needed and is returned as the current deployment.
//
// returns: old deployment, current deployment and a slice of deployment status
// updates.
func cancelUnneededServiceDeployments(j *structs.Job, d *structs.Deployment) (*structs.Deployment, *structs.Deployment, []*structs.DeploymentStatusUpdate) {
	var updates []*structs.DeploymentStatusUpdate

	switch {
	case j.Stopped():
		// A stopped job never keeps a current deployment; cancel the
		// existing one only if it has not already reached a terminal state.
		if d != nil && d.Active() {
			updates = append(updates, &structs.DeploymentStatusUpdate{
				DeploymentID:      d.ID,
				Status:            structs.DeploymentStatusCancelled,
				StatusDescription: structs.DeploymentStatusDescriptionStoppedJob,
			})
		}
		return d, nil, updates

	case d == nil:
		// No deployment to inspect.
		return nil, nil, nil

	case d.JobCreateIndex != j.CreateIndex || d.JobVersion != j.Version:
		// The deployment belongs to another incarnation or version of the
		// job, so it becomes the old deployment and is cancelled if active.
		if d.Active() {
			updates = append(updates, &structs.DeploymentStatusUpdate{
				DeploymentID:      d.ID,
				Status:            structs.DeploymentStatusCancelled,
				StatusDescription: structs.DeploymentStatusDescriptionNewerJob,
			})
		}
		return d, nil, updates

	case d.Status == structs.DeploymentStatusSuccessful:
		// A successful deployment is finished; it is no longer current.
		return d, nil, updates

	default:
		// Matching job version and not yet successful: keep it as current.
		return nil, d, updates
	}
}

// setDeploymentStatusAndUpdates sets status for a.deployment if necessary and
// returns an array of DeploymentStatusUpdates.
func (a *AllocReconciler) setDeploymentStatusAndUpdates(deploymentComplete bool, createdDeployment *structs.Deployment) []*structs.DeploymentStatusUpdate {
Expand Down
Loading