Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion examples/pi0_5/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -348,7 +348,7 @@ FlagScale supports online evaluation via FlagEval. You will need a `FLAGEVAL_SEC

```sh
cd FlagScale/
FLAGEVAL_SECRET=<your_secret> python flagscale/eval/eval_online.py \
FLAGEVAL_SECRET=<your_secret> flagscale eval robo \
--model-name pi0_5 \
--datasets libero_10 \
--server-host <your_model_server_host> \
Expand Down
2 changes: 1 addition & 1 deletion examples/qwen_gr00t/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -318,7 +318,7 @@ FlagScale supports online evaluation via FlagEval. You will need a `FLAGEVAL_SEC

```sh
cd FlagScale/
FLAGEVAL_SECRET=<your_secret> python flagscale/eval/eval_online.py \
FLAGEVAL_SECRET=<your_secret> flagscale eval robo \
--model-name qwen_gr00t \
--datasets libero_10 \
--server-host <your_model_server_host> \
Expand Down
64 changes: 64 additions & 0 deletions flagscale/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,70 @@ def compress(
run_task(cfg_path, cfg_name, action)


# Sub-application that groups all `flagscale eval <type>` subcommands
# (currently only `robo`); mounted on the main CLI below.
eval_app = typer.Typer(
    help="Run evaluation. Use 'flagscale eval --help' to see available eval types."
)
# Register the group so it is reachable as `flagscale eval ...`.
app.add_typer(eval_app, name="eval")


@eval_app.command("robo")
def eval_robo(
    model_name: str = typer.Option(
        ..., "--model-name", help="Model name (e.g., pi0_5, qwen_gr00t)"
    ),
    datasets: list[str] = typer.Option(
        ..., "--datasets", help="Dataset keys to evaluate (e.g., libero_10)"
    ),
    server_host: str = typer.Option(
        ..., "--server-host", help="IP/hostname that FlagEval calls back to"
    ),
    server_port: int | None = typer.Option(None, "--server-port", help="Server port"),
    base_url: str = typer.Option(
        "https://flageval.baai.ac.cn/api/hf", "--base-url", help="FlagEval API base URL"
    ),
    model_id: str | None = typer.Option(
        None, "--model-id", help="Model ID for FlagEval UI (defaults to model-name)"
    ),
    description: str = typer.Option("", "--description", help="Eval description"),
    attach: bool = typer.Option(False, "--attach", help="Server already running, skip startup"),
    detach: bool = typer.Option(False, "--detach", help="Leave server running after eval"),
    poll_interval: int = typer.Option(30, "--poll-interval", help="Polling interval in seconds"),
    server_timeout: int = typer.Option(
        300, "--server-timeout", help="Server startup timeout in seconds"
    ),
):
    """Online evaluation via FlagEval-Robo API

    Requires FLAGEVAL_SECRET environment variable.

    Example:
        FLAGEVAL_SECRET=xxx flagscale eval robo --model-name qwen_gr00t --datasets libero_10 --server-host example.com --attach
    """
    # Imported lazily so `flagscale --help` stays fast and doesn't pull in
    # eval-only dependencies.
    from flagscale.eval.robo import main as eval_main

    # Reconstruct an argv for the underlying argparse-based entry point.
    # argv[0] mimics the legacy `python flagscale/eval/eval_online.py` invocation.
    argv: list[str] = ["eval_online.py", "--model-name", model_name]
    for dataset in datasets:
        argv += ["--datasets", dataset]
    argv += ["--server-host", server_host]
    if server_port is not None:
        argv += ["--server-port", str(server_port)]
    argv += ["--base-url", base_url]
    # Optional string options are forwarded only when non-empty.
    for option, value in (("--model-id", model_id), ("--description", description)):
        if value:
            argv += [option, value]
    # Boolean switches are forwarded as bare flags.
    for flag, enabled in (("--attach", attach), ("--detach", detach)):
        if enabled:
            argv.append(flag)
    argv += ["--poll-interval", str(poll_interval)]
    argv += ["--server-timeout", str(server_timeout)]

    typer.echo(f"Eval [robo] model={model_name} datasets={datasets}")
    # NOTE: sys.argv is deliberately replaced (and not restored) before
    # delegating — eval_main() parses sys.argv itself, and the unit tests
    # inspect sys.argv after the command returns.
    sys.argv = argv
    eval_main()


# ============================================================================
# Install Command (delegates to tools/install)
# ============================================================================
Expand Down
Empty file added flagscale/eval/__init__.py
Empty file.
File renamed without changes.
114 changes: 113 additions & 1 deletion tests/unit_tests/test_cli.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
import sys
from pathlib import Path
from unittest.mock import patch

import pytest
from click.exceptions import Exit as ClickExit
from typer.testing import CliRunner

from flagscale.cli import get_action, resolve_config
from flagscale.cli import app, get_action, resolve_config


class TestGetAction:
Expand Down Expand Up @@ -163,3 +166,112 @@ def test_empty_model_name_with_yaml(self, tmp_path):

assert path == str(tmp_path)
assert name == "config"


# Shared in-process CLI runner used by all Typer/Click command tests below.
runner = CliRunner()


class TestEvalRobo:
    """Unit tests for the `flagscale eval robo` subcommand."""

    def test_eval_robo_calls_eval_main(self):
        """eval robo forwards args to eval_online.main"""
        with patch("flagscale.eval.robo.main") as mock_main:
            outcome = runner.invoke(
                app,
                [
                    "eval",
                    "robo",
                    "--model-name",
                    "qwen_gr00t",
                    "--datasets",
                    "libero_10",
                    "--server-host",
                    "example.com",
                    "--attach",
                    "--base-url",
                    "http://localhost:8080/api/hf",
                    "--model-id",
                    "test_model",
                ],
            )
            assert outcome.exit_code == 0
            mock_main.assert_called_once()
            # The command leaves the forwarded argv in sys.argv.
            forwarded = sys.argv
            assert forwarded[0] == "eval_online.py"
            for token in (
                "--model-name",
                "qwen_gr00t",
                "--datasets",
                "libero_10",
                "--server-host",
                "example.com",
                "--attach",
                "--base-url",
                "http://localhost:8080/api/hf",
                "--model-id",
                "test_model",
            ):
                assert token in forwarded

    def test_eval_robo_missing_required_args(self):
        """eval robo fails when required args are missing"""
        outcome = runner.invoke(app, ["eval", "robo"])
        assert outcome.exit_code != 0

    def test_eval_robo_defaults(self):
        """eval robo uses default values for optional args"""
        with patch("flagscale.eval.robo.main"):
            outcome = runner.invoke(
                app,
                [
                    "eval",
                    "robo",
                    "--model-name",
                    "pi0_5",
                    "--datasets",
                    "libero_10",
                    "--server-host",
                    "example.com",
                ],
            )
            assert outcome.exit_code == 0
            forwarded = sys.argv
            # Defaults are always forwarded explicitly...
            for token in ("--poll-interval", "30", "--server-timeout", "300"):
                assert token in forwarded
            # ...while unset boolean switches are omitted entirely.
            for flag in ("--attach", "--detach"):
                assert flag not in forwarded

    def test_eval_robo_multiple_datasets(self):
        """eval robo handles multiple --datasets flags"""
        with patch("flagscale.eval.robo.main"):
            outcome = runner.invoke(
                app,
                [
                    "eval",
                    "robo",
                    "--model-name",
                    "qwen_gr00t",
                    "--datasets",
                    "libero_10",
                    "--datasets",
                    "libero_90",
                    "--server-host",
                    "example.com",
                ],
            )
            assert outcome.exit_code == 0
            forwarded = sys.argv
            for dataset in ("libero_10", "libero_90"):
                assert dataset in forwarded

    def test_eval_help_shows_subcommands(self):
        """flagscale eval --help lists available eval types"""
        outcome = runner.invoke(app, ["eval", "--help"])
        assert outcome.exit_code == 0
        assert "robo" in outcome.output

    def test_eval_robo_help_shows_args(self):
        """flagscale eval robo --help shows help text"""
        outcome = runner.invoke(app, ["eval", "robo", "--help"])
        assert outcome.exit_code == 0
        assert "FlagEval" in outcome.output
Loading