Skip to content

Commit aa49e3e

Browse files
onematchfoxclaude
andcommitted
fix(controller): skip DB migration when DB schema is ahead of binary
When an older binary starts against a database that a newer binary has already migrated, calling mg.Up() fails because golang-migrate tries to read the down file for the current DB version, which does not exist in the older binary's embedded FS. This caused a crash loop (issue #1881). Instead, detect when the database version exceeds the binary's highest known migration and return early. This relies on the expand-then-contract policy documented in database-migrations.md, which guarantees each release's code is compatible with the schema applied by the previous release. Adds TestApplyDir_SucceedsWhenDBVersionAhead to cover this path. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> Signed-off-by: Brian Fox <878612+onematchfox@users.noreply.github.com>
1 parent 8e390a6 commit aa49e3e

2 files changed

Lines changed: 191 additions & 1 deletion

File tree

go/core/pkg/migrations/runner.go

Lines changed: 62 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import (
55
"errors"
66
"fmt"
77
"io/fs"
8+
"strings"
89

910
"github.com/golang-migrate/migrate/v4"
1011
migratepgx "github.com/golang-migrate/migrate/v4/database/pgx/v5"
@@ -56,12 +57,37 @@ func applyDir(url string, migrationsFS fs.FS, dir, migrationsTable string) (prev
5657
}
5758
defer closeMigrate(dir, mg)
5859

59-
prevVersion, _, err = mg.Version()
60+
var dirty bool
61+
prevVersion, dirty, err = mg.Version()
6062
if err != nil && !errors.Is(err, migrate.ErrNilVersion) {
6163
return 0, fmt.Errorf("get pre-migration version for %s: %w", dir, err)
6264
}
6365
// prevVersion == 0 when ErrNilVersion (no migrations applied yet).
6466

67+
// If the database is ahead of this binary's max known version, skip Up
68+
// entirely. The expand-then-contract policy in database-migrations.md
69+
// guarantees that each release's code is compatible with the schema applied
70+
// by the previous release, so rolling back one release at a time is safe.
71+
// We cannot enforce a tighter constraint here because migration version
72+
// numbers don't align with release versions.
73+
// A dirty database is excluded: dirty state means a previous migration
74+
// attempt failed and must be resolved, not silently accepted.
75+
if maxVer, scanErr := maxEmbeddedVersion(migrationsFS, dir); scanErr != nil {
76+
log.Error(scanErr, "could not determine max embedded migration version; proceeding with Up", "track", dir)
77+
} else if prevVersion > maxVer {
78+
if dirty {
79+
// DB is both dirty and ahead of this binary. Attempting Up/rollback would
80+
// fail (the migration files for prevVersion don't exist), producing noisy
81+
// and misleading logs. Return a clear error so operators act on the real
82+
// problem rather than chasing rollback noise.
83+
return prevVersion, fmt.Errorf("database is dirty at version %d and ahead of this binary's max known version %d for track %s: manual operator intervention required: %w",
84+
prevVersion, maxVer, dir, migrate.ErrDirty{Version: int(prevVersion)})
85+
}
86+
log.Info("database schema is ahead of this binary; running in compatibility mode",
87+
"track", dir, "dbVersion", prevVersion, "binaryMax", maxVer)
88+
return prevVersion, nil
89+
}
90+
6591
if upErr := mg.Up(); upErr != nil {
6692
if errors.Is(upErr, migrate.ErrNoChange) {
6793
return prevVersion, nil
@@ -186,6 +212,41 @@ func newMigrate(url string, migrationsFS fs.FS, dir, migrationsTable string) (*m
186212
return mg, nil
187213
}
188214

215+
// maxEmbeddedVersion scans dir inside migrationsFS and returns the highest migration
216+
// version number found. Only files with a ".up.sql" suffix are considered. Version
217+
// numbers are parsed from the leading decimal digits of each filename; the remainder
218+
// of the name is not validated. Returns an error if the directory cannot be read or
219+
// contains no recognisable migration files.
220+
func maxEmbeddedVersion(migrationsFS fs.FS, dir string) (uint, error) {
221+
entries, err := fs.ReadDir(migrationsFS, dir)
222+
if err != nil {
223+
return 0, fmt.Errorf("read migration dir %s: %w", dir, err)
224+
}
225+
var highest uint
226+
var foundUpSQL, foundVersioned bool
227+
for _, e := range entries {
228+
if e.IsDir() || !strings.HasSuffix(e.Name(), ".up.sql") {
229+
continue
230+
}
231+
foundUpSQL = true
232+
var v uint
233+
if _, scanErr := fmt.Sscanf(e.Name(), "%d", &v); scanErr != nil {
234+
continue
235+
}
236+
foundVersioned = true
237+
if v > highest {
238+
highest = v
239+
}
240+
}
241+
if !foundUpSQL {
242+
return 0, fmt.Errorf("no .up.sql migration files found in %s", dir)
243+
}
244+
if !foundVersioned {
245+
return 0, fmt.Errorf("no versioned .up.sql migration files found in %s; expected names like 000001_description.up.sql", dir)
246+
}
247+
return highest, nil
248+
}
249+
189250
// closeMigrate closes mg, logging source and database close errors separately.
190251
func closeMigrate(dir string, mg *migrate.Migrate) {
191252
srcErr, dbErr := mg.Close()

go/core/pkg/migrations/runner_test.go

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,14 @@ package migrations
33
import (
44
"context"
55
"database/sql"
6+
"errors"
67
"fmt"
78
"maps"
89
"testing"
910
"testing/fstest"
1011
"time"
1112

13+
"github.com/golang-migrate/migrate/v4"
1214
_ "github.com/jackc/pgx/v5/stdlib"
1315
testcontainers "github.com/testcontainers/testcontainers-go"
1416
tcpostgres "github.com/testcontainers/testcontainers-go/modules/postgres"
@@ -285,6 +287,133 @@ func TestApplyDir_RollsBackWithExistingVersion(t *testing.T) {
285287
}
286288
}
287289

290+
func TestMaxEmbeddedVersion(t *testing.T) {
291+
tests := []struct {
292+
name string
293+
fs fstest.MapFS
294+
dir string
295+
want uint
296+
wantErr bool
297+
}{
298+
{
299+
name: "returns highest version from up.sql files",
300+
fs: fstest.MapFS{
301+
"core/000001_a.up.sql": {},
302+
"core/000001_a.down.sql": {},
303+
"core/000003_b.up.sql": {},
304+
"core/000003_b.down.sql": {},
305+
},
306+
dir: "core",
307+
want: 3,
308+
},
309+
{
310+
name: "ignores non-sql files",
311+
fs: fstest.MapFS{
312+
"core/README.md": {},
313+
"core/000002_x.up.sql": {},
314+
"core/000002_x.down.sql": {},
315+
},
316+
dir: "core",
317+
want: 2,
318+
},
319+
{
320+
name: "ignores down.sql when computing max",
321+
fs: fstest.MapFS{
322+
"core/000001_a.up.sql": {},
323+
"core/000002_b.down.sql": {},
324+
},
325+
dir: "core",
326+
want: 1,
327+
},
328+
{
329+
name: "up.sql files with unparseable names returns error",
330+
fs: fstest.MapFS{"core/init.up.sql": {}},
331+
dir: "core",
332+
wantErr: true,
333+
},
334+
{
335+
name: "empty dir returns error",
336+
fs: fstest.MapFS{"core/.keep": {}},
337+
dir: "core",
338+
wantErr: true,
339+
},
340+
{
341+
name: "nonexistent dir returns error",
342+
fs: fstest.MapFS{},
343+
dir: "missing",
344+
wantErr: true,
345+
},
346+
}
347+
for _, tt := range tests {
348+
t.Run(tt.name, func(t *testing.T) {
349+
got, err := maxEmbeddedVersion(tt.fs, tt.dir)
350+
if (err != nil) != tt.wantErr {
351+
t.Errorf("maxEmbeddedVersion() error = %v, wantErr %v", err, tt.wantErr)
352+
}
353+
if !tt.wantErr && got != tt.want {
354+
t.Errorf("maxEmbeddedVersion() = %d, want %d", got, tt.want)
355+
}
356+
})
357+
}
358+
}
359+
360+
// TestApplyDir_SucceedsWhenDBVersionAhead verifies that an older binary starting against
361+
// a database that a newer binary has migrated does not crash-loop. It skips Up entirely
362+
// and returns success, leaving the schema unchanged. Safe rollback relies on the
363+
// expand-then-contract discipline in database-migrations.md and rolling back one release
364+
// at a time.
365+
func TestApplyDir_SucceedsWhenDBVersionAhead(t *testing.T) {
366+
connStr := startTestDB(t)
367+
368+
// Newer binary applies v1 and v2.
369+
if _, err := applyDir(connStr, goodCoreFS, "core", "schema_migrations"); err != nil {
370+
t.Fatalf("newer binary apply: %v", err)
371+
}
372+
373+
// Older binary (max v1) starts against the v2 schema — must not error.
374+
if _, err := applyDir(connStr, oneCoreFS, "core", "schema_migrations"); err != nil {
375+
t.Fatalf("older binary apply against newer schema: %v", err)
376+
}
377+
378+
// Schema version must be unchanged — the older binary has no business rolling back.
379+
if got := trackVersion(t, connStr, "schema_migrations"); got != 2 {
380+
t.Errorf("version = %d, want 2 (older binary must not modify schema version)", got)
381+
}
382+
}
383+
384+
// TestApplyDir_DirtyStateNotMaskedByCompatibilityMode verifies that a dirty database
385+
// is not silently accepted by compatibility mode. If the DB is both dirty and ahead of
386+
// the binary's max known version, the dirty state must still be surfaced as an error.
387+
func TestApplyDir_DirtyStateNotMaskedByCompatibilityMode(t *testing.T) {
388+
connStr := startTestDB(t)
389+
390+
// Apply v1 cleanly first so the tracking table exists.
391+
if _, err := applyDir(connStr, oneCoreFS, "core", "schema_migrations"); err != nil {
392+
t.Fatalf("setup: %v", err)
393+
}
394+
395+
// Simulate a newer binary having applied v2 but leaving it dirty.
396+
db, err := sql.Open("pgx", connStr)
397+
if err != nil {
398+
t.Fatalf("open db: %v", err)
399+
}
400+
defer db.Close()
401+
if _, err := db.Exec("UPDATE schema_migrations SET version = 2, dirty = true"); err != nil {
402+
t.Fatalf("set dirty state: %v", err)
403+
}
404+
405+
// Older binary (max v1) starts: DB is at v2 dirty. Compatibility mode must NOT
406+
// trigger — dirty state must be returned as an error so the operator can act.
407+
_, err = applyDir(connStr, oneCoreFS, "core", "schema_migrations")
408+
if err == nil {
409+
t.Fatal("expected error for dirty database, got nil")
410+
}
411+
var dirtyErr migrate.ErrDirty
412+
if !errors.As(err, &dirtyErr) {
413+
t.Errorf("expected migrate.ErrDirty, got %T: %v", err, err)
414+
}
415+
}
416+
288417
// --- rollbackDir tests ---
289418

290419
func TestRollbackDir_RollsBackToTarget(t *testing.T) {

0 commit comments

Comments
 (0)