1010from __future__ import annotations
1111
1212import logging
13+ import shlex
1314from abc import ABC , abstractmethod
1415from dataclasses import dataclass
1516
1617from scitrera_app_framework .util import ext_parse_bool
1718
19+ from sparkrun .scripts import read_script
20+ from sparkrun .utils import merge_env
21+ from sparkrun .utils .shell import b64_encode_cmd
22+
1823logger = logging .getLogger (__name__ )
1924
2025# Default executor settings for DGX Spark GPU workloads.
@@ -78,7 +83,9 @@ def from_chain(cls, chain) -> ExecutorConfig:
7883 # still means "not set" and should fall back.
7984 def _get (key ):
8085 v = chain .get (key )
81- return v if v is not None else EXECUTOR_DEFAULTS .get (key )
86+ val = v if v is not None else EXECUTOR_DEFAULTS .get (key )
87+ logger .debug ("ExecutorConfig resolve: %s=%r (from chain: %r)" , key , val , v )
88+ return val
8289
8390 return cls (
8491 auto_remove = ext_parse_bool (_get ("auto_remove" )),
@@ -209,8 +216,6 @@ def generate_launch_script(
209216
210217 Absorbs ``scripts.py::generate_container_launch_script``.
211218 """
212- from sparkrun .utils import merge_env
213- from sparkrun .scripts import read_script
214219
215220 all_env = merge_env (nccl_env , env )
216221 cleanup = self .stop_cmd (container_name )
@@ -243,21 +248,23 @@ def generate_exec_serve_script(
243248
244249 Absorbs ``scripts.py::generate_exec_serve_script``.
245250 """
246- from sparkrun .scripts import read_script
247251
248252 env_exports = ""
249253 if env :
250254 for key , value in sorted (env .items ()):
251- env_exports += "export %s='%s'; " % (key , value )
255+ env_exports += "export %s=%s; " % (key , shlex .quote (str (value )))
256+
257+ full_cmd = "%s%s" % (env_exports , serve_command )
252258
253- escaped_cmd = serve_command .replace ("'" , "'\\ ''" )
254- full_cmd = "%s%s" % (env_exports , escaped_cmd )
259+ # Base64 encode the command to avoid all bash string-escaping/quoting bugs
260+ # when passing it into `docker exec ... bash -c "..."`
261+ b64_cmd = b64_encode_cmd (full_cmd )
255262
256263 script_name = "exec_serve_detached.sh" if detached else "exec_serve_foreground.sh"
257264 template = read_script (script_name )
258265 return template .format (
259- container_name = container_name ,
260- full_cmd = full_cmd ,
266+ container_name = shlex . quote ( container_name ) ,
267+ b64_cmd = b64_cmd ,
261268 )
262269
263270 def generate_ray_head_script (
@@ -275,8 +282,6 @@ def generate_ray_head_script(
275282
276283 Absorbs ``scripts.py::generate_ray_head_script``.
277284 """
278- from sparkrun .utils import merge_env
279- from sparkrun .scripts import read_script
280285
281286 all_env = merge_env ({"RAY_memory_monitor_refresh_ms" : "0" }, nccl_env , env )
282287
@@ -314,8 +319,6 @@ def generate_ray_worker_script(
314319
315320 Absorbs ``scripts.py::generate_ray_worker_script``.
316321 """
317- from sparkrun .utils import merge_env
318- from sparkrun .scripts import read_script
319322
320323 all_env = merge_env ({"RAY_memory_monitor_refresh_ms" : "0" }, nccl_env , env )
321324
@@ -356,7 +359,6 @@ def generate_node_script(
356359
357360 Absorbs ``base.py::_generate_node_script``.
358361 """
359- from sparkrun .utils import merge_env
360362
361363 all_env = merge_env (nccl_env , env )
362364 cleanup = self .stop_cmd (container_name )
@@ -374,19 +376,24 @@ def generate_node_script(
374376 "#!/bin/bash\n "
375377 "set -uo pipefail\n "
376378 "\n "
377- "echo 'Cleaning up existing container: %(name)s' \n "
379+ "printf 'Cleaning up existing container: %%s \\ n' % (name)s\n "
378380 "%(cleanup)s\n "
379381 "\n "
380- "echo 'Launching %(label)s: %(name)s' \n "
382+ "printf 'Launching %%s: %%s \\ n' % (label)s %(name)s\n "
381383 "%(run_cmd)s\n "
382384 "\n "
383385 "# Verify container started\n "
384386 "sleep 1\n "
385387 "if docker ps --format '{{.Names}}' | grep -q '^%(name)s$'; then\n "
386- " echo 'Container %(name) s launched successfully' \n "
388+ " printf 'Container %% s launched successfully\\ n' %(name)s \n "
387389 "else\n "
388- " echo 'ERROR: Container %(name) s failed to start' >&2\n "
390+ " printf 'ERROR: Container %% s failed to start\\ n' %(name)s >&2\n "
389391 " docker logs %(name)s 2>&1 | tail -20 || true\n "
390392 " exit 1\n "
391393 "fi\n "
392- ) % {"name" : container_name , "cleanup" : cleanup , "run_cmd" : run , "label" : label }
394+ ) % {
395+ "name" : shlex .quote (container_name ),
396+ "cleanup" : cleanup ,
397+ "run_cmd" : run ,
398+ "label" : shlex .quote (label ),
399+ }
0 commit comments