From 1fed054e7616e34f7e2f51c83b7134e7c7118d41 Mon Sep 17 00:00:00 2001
From: Tim Gross <tgross@hashicorp.com>
Date: Tue, 21 Oct 2025 14:58:02 -0400
Subject: [PATCH] core: limit job GC batch size to match other GC batches

In the core scheduler we have several object types that we can delete by ID. We
batch up to 7281 UUIDs because this works out to be about 256 KiB per
request, which is well below the maximum Raft log entry size we want to
have. When we GC jobs we use this same constant to size the batch, but the
request body is not a list of UUIDs but instead a map of namespaced job IDs to a
pointer to a struct. This pushes the batch size to 746 KiB (assuming
UUID-sized job names), which is going to impact performance if GC happens during
large volumes of short-lived dispatch work where users may be GC'ing jobs
frequently.

Limit the batch size for `JobBatchDeregisterRequest` to roughly the same size as
the requests that are lists of UUIDs.

Ref: https://hashicorp.atlassian.net/browse/NMD-1041
---
 .changelog/26974.txt | 3 +++
 nomad/core_sched.go  | 6 ++++--
 2 files changed, 7 insertions(+), 2 deletions(-)
 create mode 100644 .changelog/26974.txt

diff --git a/.changelog/26974.txt b/.changelog/26974.txt
new file mode 100644
index 00000000000..9e6c078bcb8
--- /dev/null
+++ b/.changelog/26974.txt
@@ -0,0 +1,3 @@
+```release-note:bug
+core: Fixed a bug where GC batch sizes for jobs resulted in excessively large Raft logs
+```
diff --git a/nomad/core_sched.go b/nomad/core_sched.go
index 2f73096c0ae..eb4b1cb231c 100644
--- a/nomad/core_sched.go
+++ b/nomad/core_sched.go
@@ -216,8 +216,10 @@ OUTER:
 
 // jobReap contacts the leader and issues a reap on the passed jobs
 func (c *CoreScheduler) jobReap(jobs []*structs.Job, leaderACL string) error {
-	// Call to the leader to issue the reap
-	for _, req := range c.partitionJobReap(jobs, leaderACL, structs.MaxUUIDsPerWriteRequest) {
+	// Call to the leader to issue the reap with a batch size intended to be
+	// similar to the GC by batches of UUIDs for evals, allocs, and nodes
+	// (limited by structs.MaxUUIDsPerWriteRequest)
+	for _, req := range c.partitionJobReap(jobs, leaderACL, 2048) {
 		var resp structs.JobBatchDeregisterResponse
 		if err := c.srv.RPC(structs.JobBatchDeregisterRPCMethod, req, &resp); err != nil {
 			c.logger.Error("batch job reap failed", "error", err)