From 4b5ccbcc5f8d90784f91100263bdcf8ba2ae0f16 Mon Sep 17 00:00:00 2001
From: Cyrill Troxler <cyrilltroxler@gmail.com>
Date: Sun, 19 Oct 2025 09:53:14 +0200
Subject: [PATCH] fix: abort restore when node connection fails

Instead of erroring out and crashlooping until the migration times out,
the new container now simply starts from scratch if the shim fails to
connect to the node socket. This can happen if, for example, the node
pod is being replaced or is not running for some other unexpected
reason.
---
 shim/restore.go              |  3 ++-
 shim/task/service_zeropod.go | 14 +++++++++-----
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/shim/restore.go b/shim/restore.go
index df03ad0..43f06a5 100644
--- a/shim/restore.go
+++ b/shim/restore.go
@@ -30,6 +30,7 @@ import (
 var (
 	ErrAlreadyRestored      = errors.New("container is already restored")
 	ErrRestoreRequestFailed = errors.New("restore request failed")
+	ErrRestoreDial          = errors.New("failed to connect to node socket")
 )
 
 func (c *Container) Restore(ctx context.Context) (*runc.Container, process.Process, error) {
@@ -199,7 +200,7 @@ func createContainerLoggers(ctx context.Context, logPath string, tty bool) (stdo
 func MigrationRestore(ctx context.Context, r *task.CreateTaskRequest, cfg *Config) (skipStart bool, err error) {
 	conn, err := net.Dial("unix", nodev1.SocketPath)
 	if err != nil {
-		return false, fmt.Errorf("dialing node service: %w", err)
+		return false, fmt.Errorf("%w: dialing node service: %w", ErrRestoreDial, err)
 	}
 	log.G(ctx).Infof("creating restore request for container: %s", cfg.ContainerName)
 
diff --git a/shim/task/service_zeropod.go b/shim/task/service_zeropod.go
index 25ae2ba..b867a92 100644
--- a/shim/task/service_zeropod.go
+++ b/shim/task/service_zeropod.go
@@ -159,13 +159,17 @@ func (w *wrapper) Create(ctx context.Context, r *taskAPI.CreateTaskRequest) (_ *
 	if cfg.AnyMigrationEnabled() {
 		skipStart, err := zshim.MigrationRestore(ctx, r, cfg)
 		if err != nil {
-			if !errors.Is(err, zshim.ErrRestoreRequestFailed) {
+			if errors.Is(err, zshim.ErrRestoreRequestFailed) ||
+				errors.Is(err, zshim.ErrRestoreDial) {
+				// if the restore fails with ErrRestoreRequestFailed it's very
+				// likely it simply did not find a matching migration. Equally,
+				// if the shim can't manage to dial the node service there's no
+				// chance it can be restored. We log it and create the container
+				// from scratch.
+				log.G(ctx).Errorf("restore request failed: %s", err)
+			} else {
 				return nil, err
 			}
-			// if the restore fails with ErrRestoreRequestFailed it's very
-			// likely it simply did not find a matching migration. We log it and
-			// create the container from scratch.
-			log.G(ctx).Errorf("restore request failed: %s", err)
 		}
 		zeropodContainer.SetSkipStart(skipStart)
 	}