# Sub-application that groups every `flagscale eval <type>` command.
eval_app = typer.Typer(
    help="Run evaluation. Use 'flagscale eval --help' to see available eval types."
)
app.add_typer(eval_app, name="eval")


@eval_app.command("robo")
def eval_robo(
    model_name: str = typer.Option(
        ..., "--model-name", help="Model name (e.g., pi0_5, qwen_gr00t)"
    ),
    datasets: list[str] = typer.Option(
        ..., "--datasets", help="Dataset keys to evaluate (e.g., libero_10)"
    ),
    server_host: str = typer.Option(
        ..., "--server-host", help="IP/hostname that FlagEval calls back to"
    ),
    server_port: int | None = typer.Option(None, "--server-port", help="Server port"),
    base_url: str = typer.Option(
        "https://flageval.baai.ac.cn/api/hf", "--base-url", help="FlagEval API base URL"
    ),
    model_id: str | None = typer.Option(
        None, "--model-id", help="Model ID for FlagEval UI (defaults to model-name)"
    ),
    description: str = typer.Option("", "--description", help="Eval description"),
    attach: bool = typer.Option(False, "--attach", help="Server already running, skip startup"),
    detach: bool = typer.Option(False, "--detach", help="Leave server running after eval"),
    poll_interval: int = typer.Option(30, "--poll-interval", help="Polling interval in seconds"),
    server_timeout: int = typer.Option(
        300, "--server-timeout", help="Server startup timeout in seconds"
    ),
):
    """Online evaluation via FlagEval-Robo API

    Requires FLAGEVAL_SECRET environment variable.

    Example:
        FLAGEVAL_SECRET=xxx flagscale eval robo --model-name qwen_gr00t --datasets libero_10 --server-host example.com --attach
    """
    # Imported lazily so that loading the CLI module does not import the
    # evaluation module until this command is actually run.
    from flagscale.eval.robo import main as eval_main

    # Optional pieces of the forwarded command line. Each one is either the
    # full flag (plus value) or an empty list, so the final argv can be
    # assembled in a single literal while keeping the original flag order.
    dataset_flags = [token for key in datasets for token in ("--datasets", key)]
    port_flags = [] if server_port is None else ["--server-port", str(server_port)]
    model_id_flags = ["--model-id", model_id] if model_id else []
    description_flags = ["--description", description] if description else []
    attach_flags = ["--attach"] if attach else []
    detach_flags = ["--detach"] if detach else []

    forwarded_argv = [
        "eval_online.py",  # argv[0]: program name handed to the eval entry point
        "--model-name",
        model_name,
        *dataset_flags,
        "--server-host",
        server_host,
        *port_flags,
        "--base-url",
        base_url,
        *model_id_flags,
        *description_flags,
        *attach_flags,
        *detach_flags,
        "--poll-interval",
        str(poll_interval),
        "--server-timeout",
        str(server_timeout),
    ]

    typer.echo(f"Eval [robo] model={model_name} datasets={datasets}")

    # NOTE: eval_main() takes no arguments; presumably it parses sys.argv
    # itself (confirm in flagscale/eval/robo.py). sys.argv is replaced here
    # and is NOT restored afterwards, so it stays overwritten for the rest of
    # the process.
    sys.argv = forwarded_argv
    eval_main()
"config" + + +runner = CliRunner() + + +class TestEvalRobo: + """Tests for flagscale eval robo subcommand""" + + def test_eval_robo_calls_eval_main(self): + """eval robo forwards args to eval_online.main""" + with patch("flagscale.eval.robo.main") as mock_main: + result = runner.invoke( + app, + [ + "eval", + "robo", + "--model-name", + "qwen_gr00t", + "--datasets", + "libero_10", + "--server-host", + "example.com", + "--attach", + "--base-url", + "http://localhost:8080/api/hf", + "--model-id", + "test_model", + ], + ) + assert result.exit_code == 0 + mock_main.assert_called_once() + args = sys.argv + assert args[0] == "eval_online.py" + assert "--model-name" in args + assert "qwen_gr00t" in args + assert "--datasets" in args + assert "libero_10" in args + assert "--server-host" in args + assert "example.com" in args + assert "--attach" in args + assert "--base-url" in args + assert "http://localhost:8080/api/hf" in args + assert "--model-id" in args + assert "test_model" in args + + def test_eval_robo_missing_required_args(self): + """eval robo fails when required args are missing""" + result = runner.invoke(app, ["eval", "robo"]) + assert result.exit_code != 0 + + def test_eval_robo_defaults(self): + """eval robo uses default values for optional args""" + with patch("flagscale.eval.robo.main") as _: + result = runner.invoke( + app, + [ + "eval", + "robo", + "--model-name", + "pi0_5", + "--datasets", + "libero_10", + "--server-host", + "example.com", + ], + ) + assert result.exit_code == 0 + args = sys.argv + assert "--poll-interval" in args + assert "30" in args + assert "--server-timeout" in args + assert "300" in args + assert "--attach" not in args + assert "--detach" not in args + + def test_eval_robo_multiple_datasets(self): + """eval robo handles multiple --datasets flags""" + with patch("flagscale.eval.robo.main") as _: + result = runner.invoke( + app, + [ + "eval", + "robo", + "--model-name", + "qwen_gr00t", + "--datasets", + "libero_10", + "--datasets", 
+ "libero_90", + "--server-host", + "example.com", + ], + ) + assert result.exit_code == 0 + args = sys.argv + assert "libero_10" in args + assert "libero_90" in args + + def test_eval_help_shows_subcommands(self): + """flagscale eval --help lists available eval types""" + result = runner.invoke(app, ["eval", "--help"]) + assert result.exit_code == 0 + assert "robo" in result.output + + def test_eval_robo_help_shows_args(self): + """flagscale eval robo --help shows help text""" + result = runner.invoke(app, ["eval", "robo", "--help"]) + assert result.exit_code == 0 + assert "FlagEval" in result.output