
Commit 64be3a0

fix(orchestration): use base64 encoding to prevent quote stripping in docker exec
This commit consolidates several improvements and fixes to shell command execution within orchestration:

- Use base64 encoding for `docker exec` commands to prevent argument parsing and quote stripping issues.
- Centralize base64 shell wrapping into utility functions.
- Apply `shlex.quote` to docker run/exec flags and shell wrapping.
- Use `printf` instead of `echo` for known_hosts population.
- Add comprehensive tests for shell utilities and orchestration.
- Update documentation on shell execution and security guidelines.
1 parent 6bf00e4 commit 64be3a0

26 files changed

Lines changed: 471 additions & 152 deletions

DEVELOPERS.md

Lines changed: 31 additions & 0 deletions
````diff
@@ -114,6 +114,37 @@ from sparkrun.orchestration.ssh import run_remote_script
 result = run_remote_script(host, script_string, timeout=120, **ssh_kwargs)
 ```
 
+### Shell Execution & Security
+
+Sparkrun frequently generates bash scripts and Docker commands dynamically, interpolating user-provided inputs (container names, image names, environment variables, and so on). To prevent shell injection and to handle spaces and special characters, you MUST adhere to the following rules:
+
+1. **Python `shlex.quote`**: When building commands in Python (e.g., `docker run` flags), wrap all interpolated values with `shlex.quote`:
+```python
+import shlex
+cmd = f"docker run --name {shlex.quote(container_name)} {shlex.quote(image)}"
+```
+
+2. **Base64 Command Wrapping**: When passing complex commands (especially those with nested quotes or JSON) into `bash -c` or over SSH, use the `b64_encode_cmd` and `b64_wrap_bash` utilities from `sparkrun.utils.shell`:
+```python
+from sparkrun.utils.shell import b64_encode_cmd
+b64_cmd = b64_encode_cmd("vllm serve --hf-overrides '{\"rope\": \"yarn\"}'")
+# The bash script should decode and execute this:
+# printf '%s' '{b64_cmd}' | base64 -d -- | bash --noprofile --norc
+```
+
+3. **Use `printf` instead of `echo`**: Inside generated bash scripts (`.sh` files), never use `echo` to output interpolated Python variables. If a variable starts with a hyphen (e.g., `-n`), `echo` may interpret it as a flag. Instead, use `printf` with a format string:
+```bash
+# DANGEROUS: echo "Launching {container_name}"
+# SAFE:
+printf "Launching %%s\n" "{container_name}"
+```
+*Note: In Python string formatting (used to populate the scripts), `%` must be escaped as `%%`.*
+
+4. **Environment Variables**: When exporting variables in generated bash scripts, quote the interpolated value using `shlex.quote` in Python and omit quotes in the bash script:
+```python
+env_lines.append(f"export MY_VAR={shlex.quote(val)}")
+```
+
 ### Runtime Architecture
 
 All runtimes extend `RuntimePlugin` (`runtimes/base.py`):
````
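For orientation, here is a minimal sketch of what the two `sparkrun.utils.shell` helpers referenced above could look like. This is an assumption for illustration only; the actual implementations and signatures in this commit may differ:

```python
import base64
import shlex


def b64_encode_cmd(cmd: str) -> str:
    """Encode a shell command as base64 text that is safe to embed between quotes."""
    return base64.b64encode(cmd.encode("utf-8")).decode("ascii")


def b64_wrap_bash(cmd: str) -> str:
    """Return a bash one-liner that decodes the encoded command and executes it."""
    encoded = b64_encode_cmd(cmd)
    # Base64 output contains only [A-Za-z0-9+/=], so no quoting pitfalls remain.
    return f"printf '%s' {shlex.quote(encoded)} | base64 -d -- | bash --noprofile --norc"
```

Because the payload is plain base64, nested single quotes, double quotes, and JSON pass through `bash -c` untouched.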

src/sparkrun/cli/_export.py

Lines changed: 3 additions & 8 deletions
```diff
@@ -233,9 +233,6 @@ def _render_install_script(slug, recipe_yaml, cluster_yaml, cluster_name, user_h
     """Render user-level install script (no sudo needed)."""
     service_dir = "%s/.config/sparkrun/services/%s" % (user_home, slug)
     clusters_dir = "%s/.config/sparkrun/clusters" % user_home
-    # Escape single quotes in YAML content for heredoc safety
-    recipe_yaml_escaped = recipe_yaml.replace("'", "'\\''")
-    cluster_yaml_escaped = cluster_yaml.replace("'", "'\\''")
     return textwrap.dedent("""\
         #!/usr/bin/env bash
         set -euo pipefail
@@ -262,16 +259,14 @@ def _render_install_script(slug, recipe_yaml, cluster_yaml, cluster_name, user_h
         service_dir=service_dir,
         clusters_dir=clusters_dir,
         cluster_name=cluster_name,
-        recipe_yaml=recipe_yaml_escaped,
-        cluster_yaml=cluster_yaml_escaped,
+        recipe_yaml=recipe_yaml,
+        cluster_yaml=cluster_yaml,
     )


 def _render_sudo_install_script(slug, unit_contents):
     """Render sudo install script (writes unit file, enables service)."""
     unit_path = "/etc/systemd/system/sparkrun-%s.service" % slug
-    # Escape single quotes in unit contents for heredoc safety
-    unit_escaped = unit_contents.replace("'", "'\\''")
     return textwrap.dedent("""\
         #!/usr/bin/env bash
         set -euo pipefail
@@ -289,7 +284,7 @@ def _render_sudo_install_script(slug, unit_contents):
     """).format(
         unit_path=unit_path,
         slug=slug,
-        unit_contents=unit_escaped,
+        unit_contents=unit_contents,
     )


```

src/sparkrun/cli/_run.py

Lines changed: 23 additions & 0 deletions
```diff
@@ -34,6 +34,7 @@
 @click.argument("recipe_name", type=RECIPE_NAME)
 @host_options
 @recipe_override_options
+@click.option("--name", "cluster_id_override", default=None, help="Override deterministic cluster ID (static container name)")
 @click.option("--solo", is_flag=True, help="Force single-node mode", hidden=True)
 @click.option("--port", type=int, default=None, help="Override serve port")
 @click.option("--served-model-name", default=None, help="Override served model name")
@@ -73,6 +74,7 @@ def run(
     hosts,
     hosts_file,
     cluster_name,
+    cluster_id_override,
     solo,
     port,
     tensor_parallel,
@@ -250,6 +252,26 @@ def run(
     if restart_policy:
         cli_executor_opts["restart_policy"] = restart_policy

+    # Also extract executor-specific keys from -o/--option overrides
+    executor_keys = {
+        "auto_remove",
+        "restart_policy",
+        "privileged",
+        "gpus",
+        "ipc",
+        "shm_size",
+        "network",
+        "user",
+        "security_opt",
+        "cap_add",
+        "ulimit",
+        "devices",
+        "memory_limit",
+    }
+    for key in list(overrides.keys()):
+        if key in executor_keys:
+            cli_executor_opts[key] = overrides.pop(key)
+
     # --- Diagnostics setup ---
     diag = None
     if diagnostics_path:
@@ -302,6 +324,7 @@ def run(
         dashboard=dashboard,
         init_port=init_port,
         topology=cluster_cfg.topology,
+        cluster_id_override=cluster_id_override,
         executor_config=cli_executor_opts,
         rootless=not rootful,
         auto_user=not rootful,
```
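To illustrate the routing loop added above, here is a small, self-contained sketch (the override values are hypothetical): executor-level keys are moved into `cli_executor_opts`, while everything else stays in `overrides` for the recipe/config chain.

```python
# Hypothetical override values passed via -o/--option on the CLI.
overrides = {"shm_size": "16g", "restart_policy": "always", "max_model_len": "8192"}
cli_executor_opts = {}
executor_keys = {"auto_remove", "restart_policy", "shm_size", "gpus", "ipc", "network"}

for key in list(overrides.keys()):  # list() so we can pop while iterating
    if key in executor_keys:
        cli_executor_opts[key] = overrides.pop(key)

print(cli_executor_opts)  # {'shm_size': '16g', 'restart_policy': 'always'}
print(overrides)          # {'max_model_len': '8192'}
```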

src/sparkrun/core/launcher.py

Lines changed: 2 additions & 1 deletion
```diff
@@ -75,6 +75,7 @@ def launch_inference(
     dashboard: bool = False,
     init_port: int | None = None,
     topology: str | None = None,
+    cluster_id_override: str | None = None,
     # Executor config (dict for config chain layering)
     executor_config: dict | None = None,
     # note: transition to rootless by default
@@ -178,7 +179,7 @@ def launch_inference(
     serve_port = int(config_chain.get("port") or 8000)

     # Derive deterministic cluster_id from recipe + (trimmed) hosts
-    cluster_id = generate_cluster_id(recipe, host_list, overrides=overrides)
+    cluster_id = cluster_id_override or generate_cluster_id(recipe, host_list, overrides=overrides)

     # Resolve container image
     container_image = runtime.resolve_container(recipe, overrides)
```

src/sparkrun/core/recipe.py

Lines changed: 18 additions & 0 deletions
```diff
@@ -533,18 +533,36 @@ def render_command(self, config_chain: Variables) -> str | None:

         Returns None if no command template is defined.
         """
+        import re
+
         if not self.command:
             return None

         rendered = self.command.strip()

+        def _protect_braces(m):
+            if m.group(1):
+                return "\x01"
+            elif m.group(2):
+                return "\x02"
+            return m.group(0)
+
+        # Protect double braces before applying vpd substitution.
+        # We match {{ or }} or {something}, replacing only the double braces
+        # with placeholders. This ensures that standard Python format-string
+        # escaping is respected and survives iterative substitution.
+        rendered = re.sub(r"(\{\{)|(\}\})|\{[^}]*\}", _protect_braces, rendered)
+
         # Use vpd arg_substitute for {placeholder} replacement
         # Iterate to handle nested substitutions
         last = None
         while last != rendered:
             last = rendered
             rendered = arg_substitute(rendered, config_chain)

+        # Restore protected double braces as literal single braces
+        rendered = rendered.replace("\x01", "{").replace("\x02", "}")
+
         # Fix trailing spaces after backslash line-continuations.
         # ``\<space><newline>`` → ``\<newline>``
         rendered = _TRAILING_SPACE_CONTINUATION_RE.sub("\\\n", rendered)
```
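A self-contained sketch of the brace-protection idea used in `render_command` above. The `_substitute` helper below is a simplified stand-in for vpd's `arg_substitute`, whose real behavior is not shown in this diff:

```python
import re


def _substitute(text: str, variables: dict) -> str:
    # Simplified stand-in: replace {key} with its value, leave unknown keys alone.
    return re.sub(r"\{([^}]*)\}", lambda m: str(variables.get(m.group(1), m.group(0))), text)


def render(template: str, variables: dict) -> str:
    # Protect literal {{ and }} with sentinel bytes so they survive substitution.
    protected = re.sub(
        r"(\{\{)|(\}\})|\{[^}]*\}",
        lambda m: "\x01" if m.group(1) else ("\x02" if m.group(2) else m.group(0)),
        template,
    )
    # Iterate to handle nested substitutions, as render_command does.
    last = None
    while last != protected:
        last = protected
        protected = _substitute(protected, variables)
    # Restore the sentinels as literal single braces.
    return protected.replace("\x01", "{").replace("\x02", "}")


print(render("vllm serve {model} --hf-overrides '{{\"rope\": \"yarn\"}}'", {"model": "qwen3"}))
# vllm serve qwen3 --hf-overrides '{"rope": "yarn"}'
```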

src/sparkrun/orchestration/docker.py

Lines changed: 4 additions & 3 deletions
```diff
@@ -35,9 +35,10 @@ def docker_exec_cmd(
         parts.append("-d")
     if env:
         for key, value in sorted(env.items()):
-            parts.extend(["-e", f"{key}={value}"])
-    escaped_cmd = command.replace("'", "'\\''")
-    parts.extend([shlex.quote(container_name), "bash", "-c", "'%s'" % escaped_cmd])
+            parts.extend(["-e", shlex.quote(f"{key}={value}")])
+    from sparkrun.utils.shell import b64_wrap_bash
+
+    parts.extend([shlex.quote(container_name), "bash", "-c", shlex.quote(b64_wrap_bash(command))])
     return " ".join(parts)


```
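To make the new behavior concrete, here is roughly the shape of string the updated `docker_exec_cmd` assembles. The container name and command are made up, and the real function also adds `-d` and `-e` flags depending on its other arguments:

```python
import base64
import shlex

# Hypothetical inputs.
container_name = "sparkrun-head"
command = 'vllm serve --hf-overrides \'{"rope": "yarn"}\''

encoded = base64.b64encode(command.encode()).decode()
wrapped = f"printf '%s' {shlex.quote(encoded)} | base64 -d -- | bash --noprofile --norc"
exec_cmd = " ".join(["docker", "exec", shlex.quote(container_name), "bash", "-c", shlex.quote(wrapped)])
print(exec_cmd)
# docker exec sparkrun-head bash -c 'printf '"'"'%s'"'"' <BASE64> | base64 -d -- | bash --noprofile --norc'
```

Because the payload is plain base64 and the outer quoting is produced by `shlex.quote`, the decoded command reaches the inner bash byte-for-byte despite the `bash -c` layer in between.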

src/sparkrun/orchestration/executor.py

Lines changed: 26 additions & 19 deletions
```diff
@@ -10,11 +10,16 @@
 from __future__ import annotations

 import logging
+import shlex
 from abc import ABC, abstractmethod
 from dataclasses import dataclass

 from scitrera_app_framework.util import ext_parse_bool

+from sparkrun.scripts import read_script
+from sparkrun.utils import merge_env
+from sparkrun.utils.shell import b64_encode_cmd
+
 logger = logging.getLogger(__name__)

 # Default executor settings for DGX Spark GPU workloads.
@@ -78,7 +83,9 @@ def from_chain(cls, chain) -> ExecutorConfig:
         # still means "not set" and should fall back.
         def _get(key):
             v = chain.get(key)
-            return v if v is not None else EXECUTOR_DEFAULTS.get(key)
+            val = v if v is not None else EXECUTOR_DEFAULTS.get(key)
+            logger.debug("ExecutorConfig resolve: %s=%r (from chain: %r)", key, val, v)
+            return val

         return cls(
             auto_remove=ext_parse_bool(_get("auto_remove")),
@@ -209,8 +216,6 @@ def generate_launch_script(

         Absorbs ``scripts.py::generate_container_launch_script``.
         """
-        from sparkrun.utils import merge_env
-        from sparkrun.scripts import read_script

         all_env = merge_env(nccl_env, env)
         cleanup = self.stop_cmd(container_name)
@@ -243,21 +248,23 @@ def generate_exec_serve_script(

         Absorbs ``scripts.py::generate_exec_serve_script``.
         """
-        from sparkrun.scripts import read_script

         env_exports = ""
         if env:
             for key, value in sorted(env.items()):
-                env_exports += "export %s='%s'; " % (key, value)
+                env_exports += "export %s=%s; " % (key, shlex.quote(str(value)))
+
+        full_cmd = "%s%s" % (env_exports, serve_command)

-        escaped_cmd = serve_command.replace("'", "'\\''")
-        full_cmd = "%s%s" % (env_exports, escaped_cmd)
+        # Base64 encode the command to avoid all bash string-escaping/quoting bugs
+        # when passing it into `docker exec ... bash -c "..."`
+        b64_cmd = b64_encode_cmd(full_cmd)

         script_name = "exec_serve_detached.sh" if detached else "exec_serve_foreground.sh"
         template = read_script(script_name)
         return template.format(
-            container_name=container_name,
-            full_cmd=full_cmd,
+            container_name=shlex.quote(container_name),
+            b64_cmd=b64_cmd,
         )

     def generate_ray_head_script(
@@ -275,8 +282,6 @@ def generate_ray_head_script(

         Absorbs ``scripts.py::generate_ray_head_script``.
         """
-        from sparkrun.utils import merge_env
-        from sparkrun.scripts import read_script

         all_env = merge_env({"RAY_memory_monitor_refresh_ms": "0"}, nccl_env, env)

@@ -314,8 +319,6 @@ def generate_ray_worker_script(

         Absorbs ``scripts.py::generate_ray_worker_script``.
         """
-        from sparkrun.utils import merge_env
-        from sparkrun.scripts import read_script

         all_env = merge_env({"RAY_memory_monitor_refresh_ms": "0"}, nccl_env, env)

@@ -356,7 +359,6 @@ def generate_node_script(

         Absorbs ``base.py::_generate_node_script``.
         """
-        from sparkrun.utils import merge_env

         all_env = merge_env(nccl_env, env)
         cleanup = self.stop_cmd(container_name)
@@ -374,19 +376,24 @@ def generate_node_script(
             "#!/bin/bash\n"
             "set -uo pipefail\n"
             "\n"
-            "echo 'Cleaning up existing container: %(name)s'\n"
+            "printf 'Cleaning up existing container: %%s\\n' %(name)s\n"
             "%(cleanup)s\n"
             "\n"
-            "echo 'Launching %(label)s: %(name)s'\n"
+            "printf 'Launching %%s: %%s\\n' %(label)s %(name)s\n"
             "%(run_cmd)s\n"
             "\n"
             "# Verify container started\n"
             "sleep 1\n"
             "if docker ps --format '{{.Names}}' | grep -q '^%(name)s$'; then\n"
-            " echo 'Container %(name)s launched successfully'\n"
+            " printf 'Container %%s launched successfully\\n' %(name)s\n"
             "else\n"
-            " echo 'ERROR: Container %(name)s failed to start' >&2\n"
+            " printf 'ERROR: Container %%s failed to start\\n' %(name)s >&2\n"
            " docker logs %(name)s 2>&1 | tail -20 || true\n"
             " exit 1\n"
             "fi\n"
-        ) % {"name": container_name, "cleanup": cleanup, "run_cmd": run, "label": label}
+        ) % {
+            "name": shlex.quote(container_name),
+            "cleanup": cleanup,
+            "run_cmd": run,
+            "label": shlex.quote(label),
+        }
```
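As a quick check of the `%%` escaping and quoting used in the node script above, here is how one of the rewritten lines renders after Python `%`-formatting (the hostile container name is hypothetical):

```python
import shlex

# A name like this would be misread by echo (leading -n) and could inject commands if unquoted.
container_name = "-n; rm -rf /tmp/x"

line = "printf 'Launching %%s\\n' %(name)s\n" % {"name": shlex.quote(container_name)}
print(line, end="")
# printf 'Launching %s\n' '-n; rm -rf /tmp/x'
```

In bash, `printf` treats the quoted name purely as data for the `%s` slot, so the leading `-n` is never parsed as an option.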
