@@ -2,9 +2,12 @@ package standalone
22
33import (
44 "context"
5+ "errors"
56 "fmt"
7+ "regexp"
68 "strconv"
79 "strings"
10+ "time"
811
912 "github.com/docker/docker/api/types/container"
1013 "github.com/docker/docker/api/types/filters"
@@ -18,6 +21,11 @@ import (
1821// controllerContainerName is the name to use for the controller container.
1922const controllerContainerName = "docker-model-runner"
2023
// concurrentInstallMatcher recognizes the error messages that indicate the
// controller container name is already in use by another container, which is
// treated as a sign that a concurrent standalone model runner installation is
// taking place. Its single capture group holds the ID of the conflicting
// container.
var concurrentInstallMatcher = regexp.MustCompile(`is already in use by container "([a-z0-9]+)"`)
28+
2129// FindControllerContainer searches for a running controller container. It
2230// returns the ID of the container (if found), the container name (if any), the
2331// full container summary (if found), or any error that occurred.
@@ -65,6 +73,28 @@ func determineBridgeGatewayIP(ctx context.Context, dockerClient *client.Client)
6573 return "" , nil
6674}
6775
76+ // waitForContainerToStart waits for a container to start.
77+ func waitForContainerToStart (ctx context.Context , dockerClient * client.Client , containerID string ) error {
78+ // Unfortunately the Docker API's /containers/{id}/wait API (and the
79+ // corresponding Client.ContainerWait method) don't allow waiting for
80+ // container startup, so instead we'll take a polling approach.
81+ for i := 5 ; i > 0 ; i -- {
82+ if status , err := dockerClient .ContainerInspect (ctx , containerID ); err != nil {
83+ return fmt .Errorf ("unable to inspect container (%s): %w" , containerID [:12 ], err )
84+ } else if status .State .Status == "running" {
85+ return nil
86+ }
87+ if i > 1 {
88+ select {
89+ case <- time .After (1 * time .Second ):
90+ case <- ctx .Done ():
91+ return errors .New ("waiting cancelled" )
92+ }
93+ }
94+ }
95+ return errors .New ("timed out" )
96+ }
97+
6898// CreateControllerContainer creates and starts a controller container.
6999func CreateControllerContainer (ctx context.Context , dockerClient * client.Client , port uint16 , doNotTrack bool , gpu gpupkg.GPUSupport , modelStorageVolume string , printer StatusPrinter ) error {
70100 // Determine the target image.
@@ -117,9 +147,17 @@ func CreateControllerContainer(ctx context.Context, dockerClient *client.Client,
117147 hostConfig .DeviceRequests = []container.DeviceRequest {{Count : - 1 , Capabilities : [][]string {{"gpu" }}}}
118148 }
119149
120- // Create the container.
150+ // Create the container. If we detect that a concurrent installation is in
151+ // progress, then we wait for whichever install process creates the
152+ // container first and then wait for its container to be ready.
121153 resp , err := dockerClient .ContainerCreate (ctx , config , hostConfig , nil , nil , controllerContainerName )
122154 if err != nil {
155+ if match := concurrentInstallMatcher .FindStringSubmatch (err .Error ()); match != nil {
156+ if err := waitForContainerToStart (ctx , dockerClient , match [1 ]); err != nil {
157+ return fmt .Errorf ("failed waiting for concurrent installation: %w" , err )
158+ }
159+ return nil
160+ }
123161 return fmt .Errorf ("failed to create container %s: %w" , controllerContainerName , err )
124162 }
125163
0 commit comments