From 1fed054e7616e34f7e2f51c83b7134e7c7118d41 Mon Sep 17 00:00:00 2001 From: Tim Gross Date: Tue, 21 Oct 2025 14:58:02 -0400 Subject: [PATCH] core: limit job GC batch size to match other GC batches In the core scheduler we have several object types where we can delete them by ID. We batch up to 7281 UUIDs because this works out to be about 256 KiB per request, which is well below the maximum Raft log entry size we want to have. When we GC jobs we use this same constant to size the batch, but the request body is not a list of UUIDs but instead a map of namespaced job IDs to a pointer to a struct. This pushes the batch size to roughly 746 KiB (assuming UUID-sized job names), which is going to impact performance if GC happens during large volumes of short-lived dispatch work where users may be GC'ing jobs frequently. Limit the batch size for `JobBatchDeregisterRequest` to roughly the same size as the requests that are lists of UUIDs. Ref: https://hashicorp.atlassian.net/browse/NMD-1041 --- .changelog/26974.txt | 3 +++ nomad/core_sched.go | 6 ++++-- 2 files changed, 7 insertions(+), 2 deletions(-) create mode 100644 .changelog/26974.txt diff --git a/.changelog/26974.txt b/.changelog/26974.txt new file mode 100644 index 00000000000..9e6c078bcb8 --- /dev/null +++ b/.changelog/26974.txt @@ -0,0 +1,3 @@ +```release-note:bug +core: Fixed a bug where GC batch sizes for jobs resulted in excessively large Raft logs +``` diff --git a/nomad/core_sched.go b/nomad/core_sched.go index 2f73096c0ae..eb4b1cb231c 100644 --- a/nomad/core_sched.go +++ b/nomad/core_sched.go @@ -216,8 +216,10 @@ OUTER: // jobReap contacts the leader and issues a reap on the passed jobs func (c *CoreScheduler) jobReap(jobs []*structs.Job, leaderACL string) error { - // Call to the leader to issue the reap - for _, req := range c.partitionJobReap(jobs, leaderACL, structs.MaxUUIDsPerWriteRequest) { + // Call to the leader to issue the reap with a batch size intended to be + // similar to the GC by batches of UUIDs 
for evals, allocs, and nodes + // (limited by structs.MaxUUIDsPerWriteRequest) + for _, req := range c.partitionJobReap(jobs, leaderACL, 2048) { var resp structs.JobBatchDeregisterResponse if err := c.srv.RPC(structs.JobBatchDeregisterRPCMethod, req, &resp); err != nil { c.logger.Error("batch job reap failed", "error", err)