Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion examples/pi0_5/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -348,7 +348,7 @@ FlagScale supports online evaluation via FlagEval. You will need a `FLAGEVAL_SEC

```sh
cd FlagScale/
FLAGEVAL_SECRET=<your_secret> python flagscale/eval/eval_online.py \
FLAGEVAL_SECRET=<your_secret> flagscale eval robo \
--model-name pi0_5 \
--datasets libero_10 \
--server-host <your_model_server_host> \
Expand Down
2 changes: 1 addition & 1 deletion examples/qwen_gr00t/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -318,7 +318,7 @@ FlagScale supports online evaluation via FlagEval. You will need a `FLAGEVAL_SEC

```sh
cd FlagScale/
FLAGEVAL_SECRET=<your_secret> python flagscale/eval/eval_online.py \
FLAGEVAL_SECRET=<your_secret> flagscale eval robo \
--model-name qwen_gr00t \
--datasets libero_10 \
--server-host <your_model_server_host> \
Expand Down
64 changes: 64 additions & 0 deletions flagscale/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,70 @@ def compress(
run_task(cfg_path, cfg_name, action)


# Sub-application that groups all `flagscale eval <type>` subcommands
# (currently only `robo`); mounted on the main CLI below.
eval_app = typer.Typer(
    help="Run evaluation. Use 'flagscale eval --help' to see available eval types."
)
# Register the group so it is reachable as `flagscale eval ...`.
app.add_typer(eval_app, name="eval")


@eval_app.command("robo")
def eval_robo(
    model_name: str = typer.Option(
        ..., "--model-name", help="Model name (e.g., pi0_5, qwen_gr00t)"
    ),
    datasets: list[str] = typer.Option(
        ..., "--datasets", help="Dataset keys to evaluate (e.g., libero_10)"
    ),
    server_host: str = typer.Option(
        ..., "--server-host", help="IP/hostname that FlagEval calls back to"
    ),
    server_port: int | None = typer.Option(None, "--server-port", help="Server port"),
    base_url: str = typer.Option(
        "https://flageval.baai.ac.cn/api/hf", "--base-url", help="FlagEval API base URL"
    ),
    model_id: str | None = typer.Option(
        None, "--model-id", help="Model ID for FlagEval UI (defaults to model-name)"
    ),
    description: str = typer.Option("", "--description", help="Eval description"),
    attach: bool = typer.Option(False, "--attach", help="Server already running, skip startup"),
    detach: bool = typer.Option(False, "--detach", help="Leave server running after eval"),
    poll_interval: int = typer.Option(30, "--poll-interval", help="Polling interval in seconds"),
    server_timeout: int = typer.Option(
        300, "--server-timeout", help="Server startup timeout in seconds"
    ),
):
    """Online evaluation via FlagEval-Robo API

    Requires FLAGEVAL_SECRET environment variable.

    Example:
        FLAGEVAL_SECRET=xxx flagscale eval robo --model-name qwen_gr00t --datasets libero_10 --server-host example.com --attach
    """
    # Imported lazily so `flagscale --help` stays fast and doesn't pull in
    # eval-only dependencies.
    from flagscale.eval.robo import main as eval_main

    # Reconstruct an argv for the underlying argparse-based entry point.
    # argv[0] mimics the legacy `python flagscale/eval/eval_online.py` invocation.
    argv: list[str] = ["eval_online.py", "--model-name", model_name]
    for dataset in datasets:
        argv += ["--datasets", dataset]
    argv += ["--server-host", server_host]
    if server_port is not None:
        argv += ["--server-port", str(server_port)]
    argv += ["--base-url", base_url]
    # Optional string options are forwarded only when non-empty.
    for option, value in (("--model-id", model_id), ("--description", description)):
        if value:
            argv += [option, value]
    # Boolean switches are forwarded as bare flags.
    for flag, enabled in (("--attach", attach), ("--detach", detach)):
        if enabled:
            argv.append(flag)
    argv += ["--poll-interval", str(poll_interval)]
    argv += ["--server-timeout", str(server_timeout)]

    typer.echo(f"Eval [robo] model={model_name} datasets={datasets}")
    # NOTE: sys.argv is deliberately replaced (and not restored) before
    # delegating — eval_main() parses sys.argv itself, and the unit tests
    # inspect sys.argv after the command returns.
    sys.argv = argv
    eval_main()


# ============================================================================
# Install Command (delegates to tools/install)
# ============================================================================
Expand Down
Empty file added flagscale/eval/__init__.py
Empty file.
File renamed without changes.
114 changes: 113 additions & 1 deletion tests/unit_tests/test_cli.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
import sys
from pathlib import Path
from unittest.mock import patch

import pytest
from click.exceptions import Exit as ClickExit
from typer.testing import CliRunner

from flagscale.cli import get_action, resolve_config
from flagscale.cli import app, get_action, resolve_config


class TestGetAction:
Expand Down Expand Up @@ -163,3 +166,112 @@ def test_empty_model_name_with_yaml(self, tmp_path):

assert path == str(tmp_path)
assert name == "config"


# Shared in-process CLI runner used by all Typer/Click command tests below.
runner = CliRunner()


class TestEvalRobo:
    """Unit tests for the `flagscale eval robo` subcommand."""

    def test_eval_robo_calls_eval_main(self):
        """eval robo forwards args to eval_online.main"""
        with patch("flagscale.eval.robo.main") as mock_main:
            outcome = runner.invoke(
                app,
                [
                    "eval",
                    "robo",
                    "--model-name",
                    "qwen_gr00t",
                    "--datasets",
                    "libero_10",
                    "--server-host",
                    "example.com",
                    "--attach",
                    "--base-url",
                    "http://localhost:8080/api/hf",
                    "--model-id",
                    "test_model",
                ],
            )
            assert outcome.exit_code == 0
            mock_main.assert_called_once()
            # The command leaves the forwarded argv in sys.argv.
            forwarded = sys.argv
            assert forwarded[0] == "eval_online.py"
            for token in (
                "--model-name",
                "qwen_gr00t",
                "--datasets",
                "libero_10",
                "--server-host",
                "example.com",
                "--attach",
                "--base-url",
                "http://localhost:8080/api/hf",
                "--model-id",
                "test_model",
            ):
                assert token in forwarded

    def test_eval_robo_missing_required_args(self):
        """eval robo fails when required args are missing"""
        outcome = runner.invoke(app, ["eval", "robo"])
        assert outcome.exit_code != 0

    def test_eval_robo_defaults(self):
        """eval robo uses default values for optional args"""
        with patch("flagscale.eval.robo.main"):
            outcome = runner.invoke(
                app,
                [
                    "eval",
                    "robo",
                    "--model-name",
                    "pi0_5",
                    "--datasets",
                    "libero_10",
                    "--server-host",
                    "example.com",
                ],
            )
            assert outcome.exit_code == 0
            forwarded = sys.argv
            # Defaults are always forwarded explicitly...
            for token in ("--poll-interval", "30", "--server-timeout", "300"):
                assert token in forwarded
            # ...while unset boolean switches are omitted entirely.
            for flag in ("--attach", "--detach"):
                assert flag not in forwarded

    def test_eval_robo_multiple_datasets(self):
        """eval robo handles multiple --datasets flags"""
        with patch("flagscale.eval.robo.main"):
            outcome = runner.invoke(
                app,
                [
                    "eval",
                    "robo",
                    "--model-name",
                    "qwen_gr00t",
                    "--datasets",
                    "libero_10",
                    "--datasets",
                    "libero_90",
                    "--server-host",
                    "example.com",
                ],
            )
            assert outcome.exit_code == 0
            forwarded = sys.argv
            for dataset in ("libero_10", "libero_90"):
                assert dataset in forwarded

    def test_eval_help_shows_subcommands(self):
        """flagscale eval --help lists available eval types"""
        outcome = runner.invoke(app, ["eval", "--help"])
        assert outcome.exit_code == 0
        assert "robo" in outcome.output

    def test_eval_robo_help_shows_args(self):
        """flagscale eval robo --help shows help text"""
        outcome = runner.invoke(app, ["eval", "robo", "--help"])
        assert outcome.exit_code == 0
        assert "FlagEval" in outcome.output
Loading