diff --git a/examples/README.md b/examples/README.md index eae06ec2..6dbadbc9 100644 --- a/examples/README.md +++ b/examples/README.md @@ -6,6 +6,7 @@ Examples for common OpenSandbox use cases. Each subdirectory contains runnable c - [**echo**](echo): Minimal example using Ubuntu image to execute a simple echo command - [**aio-sandbox**](aio-sandbox): Basic example for agent_sandbox - [**code-interpreter**](code-interpreter): Code Interpreter SDK singleton example +- [**rl-training**](rl-training): Reinforcement learning training loop inside a sandbox - [**claude-code**](claude-code): Call Claude (Anthropic) API/CLI within the sandbox - [**iflow-cli**](iflow-cli): CLI invocation template for iFlow/custom HTTP LLM services - [**gemini-cli**](gemini-cli): Call Google Gemini within the sandbox diff --git a/examples/rl-training/README.md b/examples/rl-training/README.md new file mode 100644 index 00000000..c057a89d --- /dev/null +++ b/examples/rl-training/README.md @@ -0,0 +1,44 @@ +# Reinforcement Learning Sandbox Example + +Demonstrates running a basic RL training loop (CartPole + DQN) inside an isolated OpenSandbox container. The example installs RL dependencies in the sandbox, trains a policy, saves a checkpoint, and returns a training summary. + +## Start OpenSandbox server [local] + +Start the local OpenSandbox server: + +```shell +git clone git@github.com:alibaba/OpenSandbox.git +cd OpenSandbox/server +cp example.config.toml ~/.sandbox.toml +uv sync +uv run python -m src.main +``` + +## Run the Example + +```shell +# Install OpenSandbox package +uv pip install opensandbox + +# Run the example +uv run python examples/rl-training/main.py +``` + +The script provisions a sandbox, installs RL dependencies, trains a DQN agent on CartPole, saves a checkpoint, and prints the JSON training summary. 
+ +![RL training screenshot](./screenshot.jpg) + +## Environment Variables + +- `SANDBOX_DOMAIN`: Sandbox service address (default: `localhost:8080`) +- `SANDBOX_API_KEY`: API key if your server requires authentication +- `SANDBOX_IMAGE`: Docker image to use (default: `sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/code-interpreter:latest`) +- `RL_TIMESTEPS`: Training timesteps to run (default: `5000`) + +## TensorBoard + +The training script logs to `runs/`. To visualize metrics, open a shell in the sandbox and run: + +```shell +tensorboard --logdir runs --host 0.0.0.0 --port 6006 +``` diff --git a/examples/rl-training/main.py b/examples/rl-training/main.py new file mode 100644 index 00000000..18f67e1e --- /dev/null +++ b/examples/rl-training/main.py @@ -0,0 +1,184 @@ +# Copyright 2025 Alibaba Group Holding Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
"""Run a CartPole + DQN training loop inside an OpenSandbox sandbox.

The script provisions a sandbox, uploads ``requirements.txt`` and a generated
``train.py``, installs the dependencies, runs the training and prints the
``training_summary.json`` file the training script writes.
"""

import asyncio
import os
import shlex
import textwrap
from datetime import timedelta
from pathlib import Path

from opensandbox import Sandbox
from opensandbox.config import ConnectionConfig


def _load_requirements() -> str:
    """Return the text of the ``requirements.txt`` stored next to this script."""
    requirements_path = Path(__file__).with_name("requirements.txt")
    return requirements_path.read_text(encoding="utf-8")


def _training_script() -> str:
    """Return the source of the training program that is run in the sandbox.

    The generated program reads ``RL_TIMESTEPS`` and ``RL_TENSORBOARD_LOG``
    from its environment, trains a DQN model on ``CartPole-v1``, saves a
    checkpoint under ``checkpoints/`` and writes ``training_summary.json``.
    """
    return textwrap.dedent(
        """
        import json
        import os

        import gymnasium as gym
        from stable_baselines3 import DQN
        from stable_baselines3.common.evaluation import evaluate_policy

        timesteps = int(os.getenv("RL_TIMESTEPS", "5000"))
        tensorboard_log = os.getenv("RL_TENSORBOARD_LOG", "runs")

        env = gym.make("CartPole-v1")
        model = DQN(
            "MlpPolicy",
            env,
            verbose=1,
            tensorboard_log=tensorboard_log,
            learning_rate=1e-3,
            buffer_size=10000,
            learning_starts=1000,
            batch_size=32,
            train_freq=4,
            gradient_steps=1,
        )

        model.learn(total_timesteps=timesteps)

        os.makedirs("checkpoints", exist_ok=True)
        checkpoint_path = "checkpoints/cartpole_dqn"
        model.save(checkpoint_path)

        mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=5)
        summary = {
            "timesteps": timesteps,
            "mean_reward": float(mean_reward),
            "std_reward": float(std_reward),
            "checkpoint_path": f"{checkpoint_path}.zip",
        }
        with open("training_summary.json", "w", encoding="utf-8") as handle:
            json.dump(summary, handle, indent=2)

        print("Training summary:", summary)
        env.close()
        """
    ).lstrip()


async def _print_execution_logs(execution) -> None:
    """Print the stdout, stderr and error (if any) of a finished execution.

    Declared ``async`` although it awaits nothing; kept that way because the
    callers in this file ``await`` it.
    """
    for msg in execution.logs.stdout:
        print(f"[stdout] {msg.text}")
    for msg in execution.logs.stderr:
        print(f"[stderr] {msg.text}")
    if execution.error:
        print(f"[error] {execution.error.name}: {execution.error.value}")


def _execution_failed(execution) -> bool:
    """Return True when the execution carries an error object."""
    return execution.error is not None


async def _run_command(sandbox: Sandbox, command: str) -> bool:
    """Run ``command`` in the sandbox, echo its logs, and return True on success."""
    execution = await sandbox.commands.run(command)
    await _print_execution_logs(execution)
    return not _execution_failed(execution)


def _with_python_env(command: str) -> str:
    """Wrap ``command`` so it runs in a login bash after sourcing the env script.

    The wrapper sources ``/opt/opensandbox/code-interpreter-env.sh`` with the
    arguments ``python ${PYTHON_VERSION:-3.14}`` (presumably selecting the
    interpreter version -- confirm against the image) and then runs
    ``command`` only if that step succeeded.

    The payload is quoted with ``shlex.quote`` so that a ``command`` containing
    single quotes cannot terminate the ``bash -lc`` argument early. For
    commands without single quotes the result is identical to plain
    single-quote wrapping.
    """
    script = (
        "source /opt/opensandbox/code-interpreter-env.sh "
        "python ${PYTHON_VERSION:-3.14} >/dev/null "
        "&& "
        f"{command}"
    )
    return f"bash -lc {shlex.quote(script)}"


async def _ensure_pip(sandbox: Sandbox) -> bool:
    """Try several ways of making pip available; return True on the first success.

    The candidates are tried in order: check for an existing pip, bootstrap it
    with ``ensurepip``, then fall back to the ``apt-get`` and ``apk`` packages.
    """
    bootstrap_commands = [
        _with_python_env("python3 -m pip --version"),
        _with_python_env("python3 -m ensurepip --upgrade"),
        "apt-get update && apt-get install -y python3-pip",
        "apk add --no-cache py3-pip",
    ]
    for command in bootstrap_commands:
        if await _run_command(sandbox, command):
            return True
    return False


async def _install_requirements(sandbox: Sandbox) -> bool:
    """Install ``requirements.txt`` in the sandbox; return True on the first success.

    The environment-aware ``python3 -m pip`` invocation is tried first, then
    plain ``pip3`` and ``pip``.
    """
    install_commands = [
        _with_python_env(
            "python3 -m pip install --no-cache-dir --break-system-packages -r requirements.txt"
        ),
        "pip3 install --no-cache-dir -r requirements.txt",
        "pip install --no-cache-dir -r requirements.txt",
    ]
    for command in install_commands:
        if await _run_command(sandbox, command):
            return True
    return False


async def main() -> None:
    """Provision a sandbox, train the agent inside it, and print the summary.

    Configuration is read from ``SANDBOX_DOMAIN``, ``SANDBOX_API_KEY``,
    ``SANDBOX_IMAGE`` and ``RL_TIMESTEPS``. Each failure is reported with a
    printed message followed by an early return; the sandbox is killed in
    every case once it has been created.
    """
    domain = os.getenv("SANDBOX_DOMAIN", "localhost:8080")
    api_key = os.getenv("SANDBOX_API_KEY")
    image = os.getenv(
        "SANDBOX_IMAGE",
        "sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/code-interpreter:latest",
    )
    timesteps = os.getenv("RL_TIMESTEPS", "5000")

    # Fail fast: the training script parses this value with int(), so an
    # invalid value would otherwise only surface after the sandbox has been
    # provisioned and all dependencies installed.
    try:
        int(timesteps)
    except ValueError:
        print(f"Invalid RL_TIMESTEPS value: {timesteps!r} (expected an integer).")
        return

    # Read the local file before creating any remote resource, so a missing
    # requirements.txt does not cost a sandbox start-up.
    requirements = _load_requirements()

    config = ConnectionConfig(
        domain=domain,
        api_key=api_key,
        request_timeout=timedelta(minutes=10),
    )

    sandbox = await Sandbox.create(
        image,
        connection_config=config,
        env={"RL_TIMESTEPS": timesteps},
    )

    async with sandbox:
        try:
            await sandbox.files.write_file("requirements.txt", requirements)
            if not await _ensure_pip(sandbox):
                print("Failed to bootstrap pip inside the sandbox.")
                return

            if not await _install_requirements(sandbox):
                print("Failed to install RL dependencies inside the sandbox.")
                return

            await sandbox.files.write_file("train.py", _training_script())
            train_exec = await sandbox.commands.run(_with_python_env("python3 train.py"))
            await _print_execution_logs(train_exec)
            if _execution_failed(train_exec):
                print("Training failed inside the sandbox.")
                return

            # Training already succeeded at this point, so a failure to fetch
            # the summary is reported rather than raised.
            try:
                summary = await sandbox.files.read_file("training_summary.json")
            except Exception as exc:
                print(f"\nFailed to read training summary: {exc}")
            else:
                print("\n=== Training summary ===")
                print(summary)
        finally:
            await sandbox.kill()


if __name__ == "__main__":
    asyncio.run(main())