diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS new file mode 100644 index 0000000..ce1bbdd --- /dev/null +++ b/.github/CODEOWNERS @@ -0,0 +1,42 @@ +# FlagOS DevOps - Code Owners + +# Default owners for everything +* @flagos-ai/devops-team + +# CI/CD workflows +.github/ @flagos-ai/devops-team + +# Shared actions +actions/ @flagos-ai/devops-team + +# === User Tests === + +# FlagScale test cases +flagos-user-tests/tests/flagscale/ @flagos-ai/flagscale-team + +# FlagGems test cases +flagos-user-tests/tests/flaggems/ @flagos-ai/flaggems-team + +# FlagCX test cases +flagos-user-tests/tests/flagcx/ @flagos-ai/flagcx-team + +# FlagTree test cases +flagos-user-tests/tests/flagtree/ @flagos-ai/flagtree-team + +# vLLM-FL test cases +flagos-user-tests/tests/vllm-fl/ @flagos-ai/vllm-team + +# vLLM-plugin-FL test cases +flagos-user-tests/tests/vllm-plugin-fl/ @flagos-ai/vllm-team + +# TE-FL test cases +flagos-user-tests/tests/te-fl/ @flagos-ai/te-team + +# Megatron-LM-FL test cases +flagos-user-tests/tests/megatron-lm-fl/ @flagos-ai/megatron-team + +# Experimental test cases +flagos-user-tests/tests/experimental/ @flagos-ai/devops-team + +# Validation tools +flagos-user-tests/tools/ @flagos-ai/devops-team diff --git a/.github/ISSUE_TEMPLATE/new_test_case.yml b/.github/ISSUE_TEMPLATE/new_test_case.yml new file mode 100644 index 0000000..899c5b4 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/new_test_case.yml @@ -0,0 +1,94 @@ +name: New Test Case Submission +description: Submit a new test case for FlagOS repositories +title: "[Test Case] " +labels: ["new-test-case"] +body: + - type: dropdown + id: target-repo + attributes: + label: Target Repository + description: Which FlagOS repository is this test case for? + options: + - FlagScale + - FlagGems + - FlagCX + - FlagTree + - vLLM-FL + - vLLM-plugin-FL + - TE-FL + - Megatron-LM-FL + validations: + required: true + + - type: dropdown + id: test-type + attributes: + label: Test Type + description: What type of test is this? 
+ options: + - train + - inference + - hetero_train + - unit + - integration + - benchmark + validations: + required: true + + - type: input + id: model-name + attributes: + label: Model Name + description: Name of the model being tested (if applicable) + placeholder: e.g., llama2, mixtral, deepseek + + - type: textarea + id: description + attributes: + label: Test Case Description + description: Describe what this test case validates + placeholder: | + This test case validates ... + validations: + required: true + + - type: textarea + id: config + attributes: + label: Configuration + description: Paste the YAML configuration for the test case + render: yaml + validations: + required: true + + - type: textarea + id: gold-values + attributes: + label: Gold Values + description: Paste the expected gold values (JSON format) + render: json + + - type: textarea + id: environment + attributes: + label: Environment Requirements + description: Describe the hardware/software requirements + placeholder: | + - GPU: 8x A100 80GB + - CUDA: 12.1 + - Python: 3.10 + validations: + required: true + + - type: checkboxes + id: checklist + attributes: + label: Submission Checklist + options: + - label: I have tested this test case locally + required: true + - label: I have included gold values (if applicable) + - label: I have added a README.md with test description + required: true + - label: My YAML configuration follows the schema specification + required: true diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 0000000..4f688bf --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,37 @@ +## Test Case PR + +### Target Repository + +- [ ] FlagScale +- [ ] FlagGems +- [ ] FlagCX +- [ ] FlagTree +- [ ] vLLM-FL +- [ ] vLLM-plugin-FL +- [ ] TE-FL +- [ ] Megatron-LM-FL + +### Test Type + +- [ ] train +- [ ] inference +- [ ] hetero_train +- [ ] unit +- [ ] integration + +### Description + + + +### Environment Requirements + 
+- GPU: +- CUDA: +- Python: + +### Checklist +- [ ] YAML configuration passes schema validation +- [ ] Gold values are included (if applicable) +- [ ] README.md is present for each test case +- [ ] Test case has been verified locally +- [ ] No sensitive data (tokens, passwords, private paths) in configs diff --git a/.github/scripts/detect_changed_repos.js b/.github/scripts/detect_changed_repos.js new file mode 100644 index 0000000..08ea150 --- /dev/null +++ b/.github/scripts/detect_changed_repos.js @@ -0,0 +1,65 @@ +// Detect which repos have changed test cases. +// +// Outputs (via core.setOutput): +// changed_cases — JSON array of case paths (manual single-case dispatch) +// changed_repos — JSON object {repo, task, model} (manual repo dispatch or _none_) +// changed_repos_list — JSON array of repo names (auto-detected from PR/push) +// +// Called from workflow via: +// uses: actions/github-script@v7 +// with: +// script: | +// const run = require('./.github/scripts/detect_changed_repos.js'); +// await run({ github, context, core }); + +module.exports = async ({ github, context, core }) => { + const inputCase = process.env.INPUT_CASE || ''; + const inputRepo = process.env.INPUT_REPO || ''; + const inputTask = process.env.INPUT_TASK || ''; + const inputModel = process.env.INPUT_MODEL || ''; + + // Manual dispatch — single case + if (inputCase) { + core.setOutput('changed_cases', JSON.stringify([inputCase])); + return; + } + + // Manual dispatch — by repo + if (inputRepo) { + core.setOutput('changed_repos', JSON.stringify({ + repo: inputRepo, + task: inputTask, + model: inputModel, + })); + return; + } + + // Auto-detect from changed files + let files = []; + if (context.eventName === 'pull_request') { + const resp = await github.paginate( + github.rest.pulls.listFiles, + { owner: context.repo.owner, repo: context.repo.repo, pull_number: context.issue.number } + ); + files = resp.map(f => f.filename); + } else { + const resp = await 
github.rest.repos.compareCommits({ + owner: context.repo.owner, repo: context.repo.repo, + base: context.payload.before, head: context.payload.after, + }); + files = resp.data.files.map(f => f.filename); + } + + // Extract unique repos from changed paths + const repos = new Set(); + for (const f of files) { + const m = f.match(/^flagos-user-tests\/tests\/([^/]+)\//); + if (m && m[1] !== 'experimental') repos.add(m[1]); + } + + if (repos.size === 0) { + core.setOutput('changed_repos', JSON.stringify({ repo: '_none_' })); + } else { + core.setOutput('changed_repos_list', JSON.stringify([...repos])); + } +}; diff --git a/.github/workflows/nightly_integration.yml b/.github/workflows/nightly_integration.yml new file mode 100644 index 0000000..9b51668 --- /dev/null +++ b/.github/workflows/nightly_integration.yml @@ -0,0 +1,94 @@ +name: Nightly Integration Test - User Tests + +on: + schedule: + - cron: "0 2 * * *" + workflow_dispatch: + +defaults: + run: + working-directory: flagos-user-tests + +jobs: + discover-cases: + runs-on: ubuntu-latest + outputs: + matrix: ${{ steps.resolve.outputs.matrix }} + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: "3.10" + + - name: Install dependencies + run: pip install pyyaml + + - name: Discover all test cases and resolve runner labels + id: resolve + working-directory: flagos-user-tests + run: | + python3 -c " + import json, os, sys + sys.path.insert(0, 'tools') + from run_user_tests import list_test_resources + from pathlib import Path + + root = Path('.') + resources_list = list_test_resources(root) + + matrix_entries = [] + for entry in resources_list: + matrix_entries.append({ + 'case_path': entry['case_path'], + 'runner_labels': json.dumps(entry['runner_labels']), + }) + + if not matrix_entries: + matrix_entries.append({ + 'case_path': '_none_', + 'runner_labels': json.dumps(['ubuntu-latest']), + }) + + matrix = {'include': matrix_entries} 
+ output = json.dumps(matrix) + print(f'Matrix: {output}') + with open(os.environ['GITHUB_OUTPUT'], 'a') as f: + f.write(f'matrix={output}\n') + " + + run-tests: + needs: discover-cases + if: ${{ !contains(needs.discover-cases.outputs.matrix, '_none_') }} + runs-on: ${{ fromJson(matrix.runner_labels) }} + strategy: + fail-fast: false + matrix: ${{ fromJson(needs.discover-cases.outputs.matrix) }} + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - name: Install runner dependencies + run: pip install pyyaml + + - name: Run test case + run: python tools/run_user_tests.py --case ${{ matrix.case_path }} + + notify: + needs: run-tests + if: always() + runs-on: ubuntu-latest + steps: + - name: Generate summary + run: | + echo "## Nightly Integration Test Summary" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "Run: ${{ github.run_number }}" >> $GITHUB_STEP_SUMMARY + echo "Date: $(date -u '+%Y-%m-%d %H:%M UTC')" >> $GITHUB_STEP_SUMMARY diff --git a/.github/workflows/post_test_cases.yml b/.github/workflows/post_test_cases.yml new file mode 100644 index 0000000..6e5f243 --- /dev/null +++ b/.github/workflows/post_test_cases.yml @@ -0,0 +1,49 @@ +name: Post Test Cases Report + +on: + pull_request: + branches: [main] + types: [closed] + workflow_dispatch: + +defaults: + run: + working-directory: flagos-user-tests + +jobs: + post-report: + if: ${{ github.event_name == 'workflow_dispatch' || github.event.pull_request.merged == true }} + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: "3.10" + + - name: Install dependencies + run: pip install pyyaml + + - name: Collect test cases + run: python tools/collect_test_cases.py --output test_cases_report.json + + - name: Post report + uses: ./actions/post-benchmark-report + with: + 
backend_url: ${{ secrets.FLAGOPS_BACKEND_URL }} + api_token: ${{ secrets.FLAGOPS_API_TOKEN }} + report_path: flagos-user-tests/test_cases_report.json + list_code: flagops-user-test-cases + list_name: FlagOps User Test Cases + header_config: >- + [ + {"field": "case_id", "name": "用例ID", "required": true, "sortable": true, "type": "string"}, + {"field": "case_name", "name": "用例名称", "required": true, "sortable": false, "type": "string"}, + {"field": "repo", "name": "所属子仓库", "required": true, "sortable": true, "type": "string"}, + {"field": "updated_at", "name": "更新时间", "required": true, "sortable": true, "type": "string"} + ] + fail_on_error: "false" diff --git a/.github/workflows/pr_validation.yml b/.github/workflows/pr_validation.yml new file mode 100644 index 0000000..0083b57 --- /dev/null +++ b/.github/workflows/pr_validation.yml @@ -0,0 +1,85 @@ +name: PR Validation - User Tests + +on: + pull_request: + branches: [main] + paths: + - "flagos-user-tests/**" + +defaults: + run: + working-directory: flagos-user-tests + +jobs: + validate: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: "3.10" + + - name: Install dependencies + run: pip install pyyaml jsonschema + + # Get the actual list of changed files in the PR (github.event.pull_request.changed_files is just a count) + - name: Get changed files + id: changed + uses: actions/github-script@v7 + with: + script: | + const files = await github.paginate( + github.rest.pulls.listFiles, + { owner: context.repo.owner, repo: context.repo.repo, pull_number: context.issue.number } + ); + const changed = files + .map(f => f.filename) + .filter(f => f.startsWith('flagos-user-tests/')) + .map(f => f.replace('flagos-user-tests/', '')); + core.setOutput('files', changed.join(',')); + + # Step 1: Schema validation — only validate changed files + - name: Validate YAML/JSON Schema + run: | + python 
tools/validators/validate_config.py \ + --changed-files "${{ steps.changed.outputs.files }}" + + # Step 2: Required fields check + - name: Check Required Fields + run: python tools/validators/lint_test_case.py --strict + + # Step 3: Gold values format validation + - name: Validate Gold Values + run: python tools/validators/validate_gold_values.py + + # Step 4: Documentation completeness check + - name: Check Documentation + run: | + errors=0 + # Skip sub-config directories (conf/train/data etc.) + SUB_CONFIG_DIRS="conf train inference data" + for dir in $(find tests -mindepth 3 -maxdepth 5 -type d); do + dirname=$(basename "$dir") + # Skip sub-config directories + skip=false + for sub in $SUB_CONFIG_DIRS; do + if [ "$dirname" = "$sub" ]; then skip=true; break; fi + done + if [ "$skip" = "true" ]; then continue; fi + + # If the directory contains .yaml files, check for README.md + if ls "$dir"/*.yaml 1>/dev/null 2>&1; then + if [ ! -f "$dir/README.md" ]; then + echo "ERROR: Missing README.md in $dir" + errors=$((errors + 1)) + fi + fi + done + if [ $errors -gt 0 ]; then + echo "Found $errors test case directories without README.md" + exit 1 + fi + echo "All test case directories have README.md" diff --git a/.github/workflows/test_dispatch.yml b/.github/workflows/test_dispatch.yml new file mode 100644 index 0000000..c5253bd --- /dev/null +++ b/.github/workflows/test_dispatch.yml @@ -0,0 +1,114 @@ +name: Test Dispatch - User Tests + +on: + push: + branches: [main] + paths: + - "flagos-user-tests/tests/**" + pull_request: + branches: [main] + paths: + - "flagos-user-tests/tests/**" + workflow_dispatch: + inputs: + repo: + description: "Target repository (e.g., flagscale, flaggems)" + required: false + type: string + task: + description: "Task type (train/inference/hetero_train)" + required: false + type: string + model: + description: "Model name (e.g., mixtral, deepseek)" + required: false + type: string + case: + description: "Specific test case YAML path (relative 
to flagos-user-tests/)" + required: false + type: string + +defaults: + run: + working-directory: flagos-user-tests + +jobs: + detect-changes: + runs-on: ubuntu-latest + outputs: + matrix: ${{ steps.resolve.outputs.matrix }} + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: "3.10" + + - name: Install dependencies + run: pip install pyyaml + + - name: Detect changed repos + id: detect + uses: actions/github-script@v7 + env: + INPUT_CASE: ${{ inputs.case }} + INPUT_REPO: ${{ inputs.repo }} + INPUT_TASK: ${{ inputs.task }} + INPUT_MODEL: ${{ inputs.model }} + with: + script: | + const run = require('./.github/scripts/detect_changed_repos.js'); + await run({ github, context, core }); + + - name: Resolve resources to matrix + id: resolve + working-directory: flagos-user-tests + run: | + python tools/resolve_matrix.py \ + --changed-cases '${{ steps.detect.outputs.changed_cases }}' \ + --changed-repos '${{ steps.detect.outputs.changed_repos }}' \ + --changed-repos-list '${{ steps.detect.outputs.changed_repos_list }}' + + run-tests: + name: ${{ matrix.repo }}/${{ matrix.task }}/${{ matrix.model }} + needs: detect-changes + if: ${{ needs.detect-changes.outputs.matrix != '' && !contains(needs.detect-changes.outputs.matrix, '_none_') }} + strategy: + fail-fast: false + matrix: ${{ fromJson(needs.detect-changes.outputs.matrix) }} + runs-on: ${{ fromJson(matrix.runner_labels) }} + container: + image: ${{ matrix.container_image }} + options: ${{ matrix.container_options }} + volumes: ${{ fromJson(matrix.container_volumes) }} + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Install runner dependencies + shell: bash + run: | + if [ -n "${{ matrix.conda_env }}" ]; then + source tools/activate_conda.sh ${{ matrix.conda_env }} + fi + pip install pyyaml + + - name: Run user tests + shell: bash + run: | + if [ -n "${{ matrix.conda_env }}" ]; then + source 
tools/activate_conda.sh ${{ matrix.conda_env }} + fi + ARGS="" + if [ -n "${{ matrix.case_path }}" ]; then + ARGS="--case ${{ matrix.case_path }}" + else + ARGS="--repo ${{ matrix.repo }}" + [ -n "${{ matrix.task }}" ] && ARGS="$ARGS --task ${{ matrix.task }}" + [ -n "${{ matrix.model }}" ] && ARGS="$ARGS --model ${{ matrix.model }}" + fi + python tools/run_user_tests.py $ARGS diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..71bc36f --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +__pycache__/ \ No newline at end of file diff --git a/flagos-user-tests/CONTRIBUTING.md b/flagos-user-tests/CONTRIBUTING.md new file mode 100644 index 0000000..2c606b2 --- /dev/null +++ b/flagos-user-tests/CONTRIBUTING.md @@ -0,0 +1,76 @@ +# Contributing to FlagOS User Tests + +Thank you for contributing test cases to the FlagOS ecosystem! + +## How to Submit a Test Case + +### Step 1: Generate a Template + +Use the built-in generator to create a properly structured test case: + +```bash +# FlagScale training test case +python tools/generators/create_test_template.py \ + --repo flagscale \ + --type train \ + --model <model> \ + --name <case_name> + +# Other repositories +python tools/generators/create_test_template.py \ + --repo <repo> \ + --name <case_name> +``` + +### Step 2: Complete the Test Case + +1. **Edit the YAML config** with your actual test parameters +2. **Add gold values** from a verified local run (JSON format) +3. **Complete the README.md** with: + - Description of what the test validates + - Environment requirements (GPU, CUDA, Python) + - Manual execution instructions + +### Step 3: Validate Locally + +```bash +python tools/validators/validate_config.py +python tools/validators/validate_gold_values.py +python tools/validators/lint_test_case.py --strict +``` + +### Step 4: Submit a Pull Request + +1. Fork this repository +2. Create a feature branch: `git checkout -b add-test/<repo>/<case_name>` +3. Add your test case files +4. Commit and push +5. 
Open a Pull Request using the provided template + +## Test Case Requirements + +- Each test case must be in its own directory +- Each directory must contain: + - At least one `.yaml` configuration file + - A `README.md` with test documentation + - Gold values JSON file (for regression tests) +- No sensitive data (tokens, passwords, private paths) in any files +- YAML must pass schema validation +- Gold values must contain numeric arrays + +## Code Review + +- PRs are reviewed by the respective team CODEOWNERS +- CI must pass before merge +- At least one approval from a maintainer is required + +## Experimental Test Cases + +If your test case covers a new or unstable feature: +- Place it under `tests/experimental/` +- It will only run in nightly integration tests +- It will not block PR merges + +## Questions? + +Open an issue using the "New Test Case" template or contact the DevOps team. diff --git a/flagos-user-tests/README.md b/flagos-user-tests/README.md new file mode 100644 index 0000000..9ae4f32 --- /dev/null +++ b/flagos-user-tests/README.md @@ -0,0 +1,69 @@ +# FlagOS User Tests + +User-perspective test cases for FlagOS repositories. Each test case defines its own setup, run, and verification — exactly as a real user would operate. + +## How It Works + +``` +User submits test case YAML: + setup: [pip install flagscale] + run: [flagscale train mixtral --config ./conf/xxx.yaml] + verify: {log_path: ..., gold_values_path: ...} + +CI runner: + 1. cd + 2. Execute setup commands + 3. Execute run commands + 4. Extract metrics from log + 5. Compare against gold values → PASS/FAIL +``` + +Users have full control — the runner does NOT call internal repo scripts. 
+ +## Quick Start + +```bash +# Generate template +python tools/generators/create_test_template.py \ + --repo flagscale --type train --model llama2 --name tp2_pp1 + +# Validate +python tools/validators/validate_config.py +python tools/validators/validate_gold_values.py +python tools/validators/lint_test_case.py --strict + +# Run locally +python tools/run_user_tests.py \ + --case tests/flagscale/train/llama2/tp2_pp1/tp2_pp1.yaml +``` + +See [docs/getting_started.md](docs/getting_started.md) for the full guide. + +## Test Case Structure (FlagScale Example) + +``` +tests/flagscale/train/mixtral/tp2_pp1_ep2/ +├── tp2_pp1_ep2.yaml # Test case: setup → run → verify +├── conf/ # FlagScale configs (user provides) +│ ├── tp2_pp1_ep2.yaml +│ └── train/tp2_pp1_ep2.yaml +├── gold_values/ # Expected metrics +│ └── tp2_pp1_ep2.json +└── README.md +``` + +## Supported Repositories + +FlagScale, FlagGems, FlagCX, FlagTree, vLLM-FL, vLLM-plugin-FL, TE-FL, Megatron-LM-FL + +## CI Workflows (in `../.github/workflows/`) + +| Workflow | Trigger | Description | +|---|---|---| +| PR Validation | Pull Request | Format, lint, gold values checks | +| Test Dispatch | Push/PR | Run user-defined setup → run → verify | +| Nightly | Daily 02:00 UTC | All test cases | + +## Contributing + +See [CONTRIBUTING.md](CONTRIBUTING.md). diff --git a/flagos-user-tests/docs/getting_started.md b/flagos-user-tests/docs/getting_started.md new file mode 100644 index 0000000..1025909 --- /dev/null +++ b/flagos-user-tests/docs/getting_started.md @@ -0,0 +1,99 @@ +# Getting Started + +## Overview + +`flagos-user-tests` manages **user-perspective** test cases for FlagOS repositories. Each test case defines its own setup, run, and verification commands — exactly as a real user would operate. + +## Quick Start + +### 1. 
Generate a template + +```bash +# FlagScale training test +python tools/generators/create_test_template.py \ + --repo flagscale --type train --model llama2 --name tp2_pp1 + +# Generic test +python tools/generators/create_test_template.py \ + --repo flaggems --name my_operator_test +``` + +### 2. Edit the generated files + +The test case YAML defines the user workflow: + +```yaml +# tests/flagscale/train/llama2/tp2_pp1/tp2_pp1.yaml +meta: + repo: flagscale + task: train + model: llama2 + case: tp2_pp1 + description: "LLaMA2 training with TP=2, PP=1" + +resources: + gpu: A100-80GB + gpu_count: 8 + +setup: + - pip install flagscale # user installs the package + +run: + - flagscale train llama2 --config ./conf/tp2_pp1.yaml # user runs training + +verify: + log_path: ".../stdout.log" # where to find output + gold_values_path: ./gold_values/tp2_pp1.json # expected metrics +``` + +Also edit the FlagScale config files (`conf/*.yaml`) and fill in gold values from a verified run. + +### 3. Validate locally + +```bash +python tools/validators/validate_config.py +python tools/validators/validate_gold_values.py +python tools/validators/lint_test_case.py --strict +``` + +### 4. Run locally (optional) + +```bash +python tools/run_user_tests.py \ + --case tests/flagscale/train/llama2/tp2_pp1/tp2_pp1.yaml +``` + +### 5. Submit a PR + +CI will automatically: +1. Validate format (PR Validation workflow) +2. Run your test case on real hardware (Test Dispatch workflow) + +## How the Runner Works + +`run_user_tests.py` is a **generic executor**: + +``` +┌─────────────┐ ┌──────────────────────────────────────┐ +│ Test Case │ ──▶ │ 1. cd │ +│ YAML │ │ 2. Execute setup commands │ +│ │ │ 3. Execute run commands │ +│ │ │ 4. Find log file (glob pattern) │ +│ │ │ 5. Extract metrics from log │ +│ │ │ 6. Compare against gold values │ +└─────────────┘ └──────────────────────────────────────┘ +``` + +It does **not** call any internal repo scripts. 
Users have full control over: +- What to install (`setup`) +- How to run (`run`) +- What to verify (`verify`) +- Machine requirements (`resources`) — mapped to runner labels via `resource_map.yaml` + +## CI Workflows + +| Workflow | Trigger | Description | +|---|---|---| +| PR Validation | Pull Request | Format/lint/gold-values checks | +| Test Dispatch | Push to main / PR | Runs user-defined setup → run → verify | +| Nightly | Daily 02:00 UTC | All test cases across all repos | diff --git a/flagos-user-tests/docs/test_format_spec.md b/flagos-user-tests/docs/test_format_spec.md new file mode 100644 index 0000000..a916e0d --- /dev/null +++ b/flagos-user-tests/docs/test_format_spec.md @@ -0,0 +1,177 @@ +# Test Format Specification + +## Core Concept: User-Perspective Test Cases + +Every test case is a **self-contained YAML file** that defines the complete workflow from a **user's perspective**: + +```yaml +meta: # What is this test? +resources: # Hardware requirements (platform, device, device_count) +setup: # How to install? (user's commands) +run: # How to run? (user's commands) +verify: # How to check? (gold values comparison) +``` + +The runner (`run_user_tests.py`) simply executes these user-defined commands. It does NOT call any internal repo scripts — giving users full control and matching real usage scenarios. 
+ +## Test Case YAML Format + +### Complete Example (FlagScale) + +```yaml +meta: + repo: flagscale + task: train + model: mixtral + case: tp2_pp1_ep2 + description: "Mixtral MoE training with TP=2, PP=1, EP=2" + +resources: + platform: cuda + device: A100-80GB + device_count: 8 + +env: + CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7" + +setup: + - pip install flagscale + +run: + - flagscale train mixtral --config ./conf/tp2_pp1_ep2.yaml + +verify: + log_path: "tests/functional_tests/train/mixtral/test_results/tp2_pp1_ep2/logs/details/host_0_localhost/*/default_*/attempt_0/*/stdout.log" + gold_values_path: ./gold_values/tp2_pp1_ep2.json + rtol: 1e-5 + atol: 0 +``` + +### Complete Example (Generic) + +```yaml +meta: + repo: flaggems + case: my_operator_test + description: "Test custom operator correctness" + +setup: + - pip install flaggems + +run: + - pytest -v tests/test_my_operator.py + +# No verify step — pytest exit code determines pass/fail +``` + +### Field Reference + +| Field | Type | Required | Description | +|---|---|---|---| +| `meta.repo` | string | Yes | Target FlagOS repository name | +| `meta.task` | string | No | Task type (train/inference/hetero_train) | +| `meta.model` | string | No | Model name | +| `meta.case` | string | No | Case name (for filtering) | +| `meta.description` | string | Yes | What this test validates | +| `resources` | object | No | Hardware requirements | +| `resources.platform` | string | No | Chip platform: `cuda`, `metax`, `ascend` (default: `cuda`) | +| `resources.device` | string | No | Device type (e.g. `A100-40GB`, `C500`, `Ascend910B`) | +| `resources.device_count` | int | No | Number of devices required | +| `env` | object | No | Environment variables | + +### Resource Resolution + +The `resources` field drives CI decisions via `resource_map.yaml` (platform-based): + +1. **Runner selection**: `resources.platform` + `resources.device` -> platform-specific runner labels +2. 
**Container image**: `resources.platform` + `meta.repo/task` -> platform-specific Docker image +3. **Container options**: `resources.platform` -> device passthrough flags (`--gpus all`, `--device /dev/davinci_all`, etc.) + +Supported platforms: + +| Platform | Vendor | Devices | Status | +|---|---|---|---| +| `cuda` | NVIDIA | A100, H100, H800 | Active | +| `metax` | MetaX (Muxi) | C500 | Planned | +| `ascend` | Huawei | Ascend910B, Ascend910C | Planned | + +The test job runs inside the platform-resolved Docker container with device access. + +### Field Reference (continued) + +| Field | Type | Required | Description | +|---|---|---|---| +| `setup` | list[str] | No | Shell commands for environment setup | +| `run` | list[str] | Yes | Shell commands to execute the test | +| `verify.log_path` | string | No | Path to output log (supports glob patterns) | +| `verify.gold_values_path` | string | No | Path to gold values JSON file | +| `verify.gold_values` | object | No | Inline gold values (alternative to file) | +| `verify.rtol` | float | No | Relative tolerance (default: 1e-5) | +| `verify.atol` | float | No | Absolute tolerance (default: 0) | + +### Working Directory + +All commands execute with the **test case directory** as the working directory. So `./conf/tp2_pp1_ep2.yaml` resolves relative to where the test case YAML lives. + +## Gold Values Format + +### Numeric (default) + +```json +{ + "lm loss:": { + "values": [11.17587, 11.16908, 10.41927] + } +} +``` + +- Keys are metric names extracted from log files +- Values are numeric arrays +- Comparison uses `rtol` / `atol` similar to `numpy.allclose` +- `log_path` supports glob patterns for timestamp directories + +### Text + +```json +{ + "inference_output": { + "type": "text", + "pattern": "output\\.outputs\\[0\\]\\.text=(?:\"(.+?)\"$|'(.+?)'$)", + "values": [ + " Lina. I'm a 22-year", + " the same as the president of the United Nations." 
+ ] + } +} +``` + +- Set `"type": "text"` to enable text comparison +- `"pattern"` is a regex with capture group(s) to extract text from log lines + - If multiple groups (e.g. alternation), the first non-None group is used +- Values are compared with exact string match + +## FlagScale Test Case Directory Structure + +``` +tests/flagscale/train/mixtral/tp2_pp1_ep2/ +├── tp2_pp1_ep2.yaml # Test case definition (setup/run/verify) +├── conf/ +│ ├── tp2_pp1_ep2.yaml # FlagScale experiment config (Hydra) +│ └── train/ +│ └── tp2_pp1_ep2.yaml # Training parameters +├── gold_values/ +│ └── tp2_pp1_ep2.json # Expected metrics +└── README.md +``` + +The user runs: `pip install flagscale && flagscale train mixtral --config ./conf/tp2_pp1_ep2.yaml` + +## README Requirements + +Each test case directory must have a `README.md` with: +1. **Description** section +2. **Environment** section + +## Experimental Test Cases + +Place under `tests/experimental/` for gray-stage tests (nightly only, non-blocking). diff --git a/flagos-user-tests/repos.yaml b/flagos-user-tests/repos.yaml new file mode 100644 index 0000000..fe7f816 --- /dev/null +++ b/flagos-user-tests/repos.yaml @@ -0,0 +1,45 @@ +# FlagOS target repository configuration +# +# Note: Each test case defines its own setup/run/verify workflow (user perspective). +# This file only records basic repository info for CI repo-level filtering and issue templates. 
+ +repositories: + flagscale: + url: https://github.com/flagos-ai/FlagScale.git + default_branch: main + description: Large-scale distributed training framework + + flaggems: + url: https://github.com/flagos-ai/FlagGems.git + default_branch: main + description: GPU-accelerated math library + + flagcx: + url: https://github.com/flagos-ai/FlagCX.git + default_branch: main + description: Cross-chip communication library + + flagtree: + url: https://github.com/flagos-ai/FlagTree.git + default_branch: main + description: Tree-structured computation library + + vllm-fl: + url: https://github.com/flagos-ai/vLLM-FL.git + default_branch: main + description: LLM inference engine + + vllm-plugin-fl: + url: https://github.com/flagos-ai/vLLM-plugin-FL.git + default_branch: main + description: vLLM plugin system + + te-fl: + url: https://github.com/flagos-ai/TransformerEngine-FL.git + default_branch: main + description: Transformer Engine + + megatron-lm-fl: + url: https://github.com/flagos-ai/Megatron-LM-FL.git + default_branch: main + description: Megatron-LM fork diff --git a/flagos-user-tests/resource_map.yaml b/flagos-user-tests/resource_map.yaml new file mode 100644 index 0000000..30f5f75 --- /dev/null +++ b/flagos-user-tests/resource_map.yaml @@ -0,0 +1,104 @@ +# Maps test case resource requirements to GitHub Actions runner labels and container images. +# +# Architecture: platform-based multi-vendor support +# resources.platform -> platforms. -> runner labels, container images, options +# +# Example test case YAML: +# resources: +# platform: cuda +# device: A100-40GB +# device_count: 1 +# +# Resolution chain: +# 1. resources.platform -> platforms.cuda +# 2. platforms.cuda.device_labels["A100-40GB"] -> runner labels +# 3. platforms.cuda.container_images["flagscale/inference"] -> Docker image +# 4. 
platforms.cuda.container_options -> Docker runtime flags + +# ============================================================================= +# Platforms: each vendor/chip family is a platform +# ============================================================================= +platforms: + + # --------------------------------------------------------------------------- + # NVIDIA CUDA platform + # --------------------------------------------------------------------------- + cuda: + description: "NVIDIA CUDA GPUs (A100, H100, H800, etc.)" + + # Device type -> self-hosted runner labels + device_labels: + A100-40GB: [self-hosted, Linux, X64, gpu-a100-40gb] + A100-80GB: [self-hosted, Linux, X64, gpu-a100-80gb] + H100-80GB: [self-hosted, Linux, X64, gpu-h100-80gb] + H800-80GB: [self-hosted, Linux, X64, gpu-h800-80gb] + + # Default runner labels when device type not found + default_labels: [self-hosted, Linux, X64] + + # Container images: "/" -> Docker image + container_images: + flagscale/train: "localhost:5000/flagscale-train:dev-cu128-py3.12-20260228210721" + flagscale/inference: "localhost:5000/flagscale-inference:dev-cu128-py3.12-20260302102033" + flagscale/hetero_train: "localhost:5000/flagscale-train:dev-cu128-py3.12-20260228210721" + + # Conda environment to activate inside the container before test execution. 
+ # Same key format as container_images: "/" | "" | "default" + conda_env: + flagscale/inference: "flagscale-inference" + + # Container runtime options + container_options: "--gpus all --shm-size=500g --user root --ulimit nofile=65535:65535" + + # Container volume mounts (host:container) + container_volumes: + - /home/flagscale_cicd/docker/docker_build/docker_data:/home/gitlab-runner/data + - /home/flagscale_cicd/docker/docker_build/docker_tokenizers:/home/gitlab-runner/tokenizers + + # --------------------------------------------------------------------------- + # MetaX (Muxi) platform — placeholder for future integration + # --------------------------------------------------------------------------- + metax: + description: "MetaX (Muxi) GPUs (C500, etc.)" + + device_labels: + C500: [self-hosted, Linux, X64, metax-c500] + + default_labels: [self-hosted, Linux, X64, metax] + + container_images: {} + # flagscale/train: "registry.example.com/flagscale-train:metax-..." + # flagscale/inference: "registry.example.com/flagscale-inference:metax-..." + + container_options: "--device /dev/mxgpu_all --shm-size=500g --user root" + + container_volumes: [] + + # --------------------------------------------------------------------------- + # Ascend (Huawei) platform — placeholder for future integration + # --------------------------------------------------------------------------- + ascend: + description: "Huawei Ascend NPUs (910B, 910C, etc.)" + + device_labels: + Ascend910B: [self-hosted, Linux, aarch64, ascend-910b] + Ascend910C: [self-hosted, Linux, aarch64, ascend-910c] + + default_labels: [self-hosted, Linux, aarch64, ascend] + + container_images: {} + # flagscale/train: "registry.example.com/flagscale-train:ascend-..." 
+ + container_options: "--device /dev/davinci_all --shm-size=500g --user root" + + container_volumes: [] + +# ============================================================================= +# Global defaults +# ============================================================================= + +# Default platform when resources.platform is not specified +default_platform: cuda + +# Fallback runner labels when nothing matches +default_labels: [self-hosted] diff --git a/flagos-user-tests/tests/flagcx/.gitkeep b/flagos-user-tests/tests/flagcx/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/flagos-user-tests/tests/flaggems/.gitkeep b/flagos-user-tests/tests/flaggems/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/flagos-user-tests/tests/flagscale/hetero_train/.gitkeep b/flagos-user-tests/tests/flagscale/hetero_train/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/flagos-user-tests/tests/flagscale/inference/qwen3/demo_0_6b/.gitignore b/flagos-user-tests/tests/flagscale/inference/qwen3/demo_0_6b/.gitignore new file mode 100644 index 0000000..2301c87 --- /dev/null +++ b/flagos-user-tests/tests/flagscale/inference/qwen3/demo_0_6b/.gitignore @@ -0,0 +1,2 @@ +FlagScale/ +outputs/ diff --git a/flagos-user-tests/tests/flagscale/inference/qwen3/demo_0_6b/README.md b/flagos-user-tests/tests/flagscale/inference/qwen3/demo_0_6b/README.md new file mode 100644 index 0000000..491ebcd --- /dev/null +++ b/flagos-user-tests/tests/flagscale/inference/qwen3/demo_0_6b/README.md @@ -0,0 +1,25 @@ +# demo_0_6b + +## Description + +FlagScale inference demo using Qwen3-0.6B model with vLLM backend. +Runs 4 prompts with greedy decoding (temperature=0, max_tokens=10) and verifies output text against gold values. + +## Environment + +- GPU: 1x A100 40GB +- CUDA: 12.1+ +- Python: 3.12 +- vLLM: 0.10.1.dev + +## How to Run + +```bash +git clone https://github.com/flagos-ai/FlagScale.git && cd FlagScale && pip install . 
+flagscale inference qwen3 --config ./conf/demo_0_6b.yaml +``` + +## Gold Values + +Uses text-type gold values to verify inference output. +Greedy decoding (temperature=0) produces deterministic output, so text comparison is exact match. diff --git a/flagos-user-tests/tests/flagscale/inference/qwen3/demo_0_6b/conf/demo_0_6b.yaml b/flagos-user-tests/tests/flagscale/inference/qwen3/demo_0_6b/conf/demo_0_6b.yaml new file mode 100644 index 0000000..0f15416 --- /dev/null +++ b/flagos-user-tests/tests/flagscale/inference/qwen3/demo_0_6b/conf/demo_0_6b.yaml @@ -0,0 +1,27 @@ +defaults: + - _self_ + - inference: demo_0_6b + +experiment: + exp_name: qwen3 + exp_dir: ./outputs/${experiment.exp_name} + task: + type: inference + backend: vllm + entrypoint: flagscale/inference/inference_llm.py + runner: + hostfile: null + cmds: + before_start: source /root/miniconda3/bin/activate flagscale-inference + envs: + VLLM_PLUGINS: "fl" + VLLM_USE_FLASHINFER_SAMPLER: 0 + VLLM_LOGGING_LEVEL: "INFO" + CUDA_VISIBLE_DEVICES: 0 + CUDA_DEVICE_MAX_CONNECTIONS: 1 + +action: run + +hydra: + run: + dir: ${experiment.exp_dir}/hydra diff --git a/flagos-user-tests/tests/flagscale/inference/qwen3/demo_0_6b/conf/inference/demo_0_6b.yaml b/flagos-user-tests/tests/flagscale/inference/qwen3/demo_0_6b/conf/inference/demo_0_6b.yaml new file mode 100644 index 0000000..f1ce909 --- /dev/null +++ b/flagos-user-tests/tests/flagscale/inference/qwen3/demo_0_6b/conf/inference/demo_0_6b.yaml @@ -0,0 +1,18 @@ +llm: + model: /home/gitlab-runner/data/Qwen3-0.6B + trust_remote_code: true + tensor_parallel_size: 1 + pipeline_parallel_size: 1 + gpu_memory_utilization: 0.9 + seed: 1234 + +generate: + prompts: [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", + ] + sampling: + max_tokens: 10 + temperature: 0.0 diff --git a/flagos-user-tests/tests/flagscale/inference/qwen3/demo_0_6b/demo_0_6b.yaml 
b/flagos-user-tests/tests/flagscale/inference/qwen3/demo_0_6b/demo_0_6b.yaml new file mode 100644 index 0000000..1399352 --- /dev/null +++ b/flagos-user-tests/tests/flagscale/inference/qwen3/demo_0_6b/demo_0_6b.yaml @@ -0,0 +1,30 @@ +meta: + repo: flagscale + task: inference + model: qwen3 + case: demo_0_6b + description: > + Qwen3-0.6B inference demo using vLLM backend with FlagScale CLI. + Runs 4 prompts with greedy decoding (temperature=0) and verifies output text. + +resources: + platform: cuda + device: A100-40GB + device_count: 1 + +env: + CUDA_VISIBLE_DEVICES: "0" + VLLM_PLUGINS: "fl" + VLLM_USE_FLASHINFER_SAMPLER: "0" + VLLM_LOGGING_LEVEL: "INFO" + CUDA_DEVICE_MAX_CONNECTIONS: "1" + +setup: + - git clone https://github.com/flagos-ai/FlagScale.git && cd FlagScale && pip install . + +run: + - flagscale inference qwen3 --config ./conf/demo_0_6b.yaml --test + +verify: + log_path: "./outputs/qwen3/inference_logs/host_0_localhost.output" + gold_values_path: ./gold_values/demo_0_6b.json diff --git a/flagos-user-tests/tests/flagscale/inference/qwen3/demo_0_6b/gold_values/demo_0_6b.json b/flagos-user-tests/tests/flagscale/inference/qwen3/demo_0_6b/gold_values/demo_0_6b.json new file mode 100644 index 0000000..2a28edd --- /dev/null +++ b/flagos-user-tests/tests/flagscale/inference/qwen3/demo_0_6b/gold_values/demo_0_6b.json @@ -0,0 +1,12 @@ +{ + "inference_output": { + "type": "text", + "pattern": "output\\.outputs\\[0\\]\\.text=(?:\"(.+?)\"$|'(.+?)'$)", + "values": [ + " Lina. I'm a 22-year", + " the same as the president of the United Nations.", + " Paris. 
The capital of France is also the capital", + " not just a technological challenge but a profound transformation of" + ] + } +} diff --git a/flagos-user-tests/tests/flagscale/train/.gitkeep b/flagos-user-tests/tests/flagscale/train/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/flagos-user-tests/tests/flagtree/.gitkeep b/flagos-user-tests/tests/flagtree/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/flagos-user-tests/tests/megatron-lm-fl/.gitkeep b/flagos-user-tests/tests/megatron-lm-fl/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/flagos-user-tests/tests/te-fl/.gitkeep b/flagos-user-tests/tests/te-fl/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/flagos-user-tests/tests/vllm-fl/.gitkeep b/flagos-user-tests/tests/vllm-fl/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/flagos-user-tests/tests/vllm-plugin-fl/.gitkeep b/flagos-user-tests/tests/vllm-plugin-fl/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/flagos-user-tests/tools/activate_conda.sh b/flagos-user-tests/tools/activate_conda.sh new file mode 100755 index 0000000..a8e11bd --- /dev/null +++ b/flagos-user-tests/tools/activate_conda.sh @@ -0,0 +1,58 @@ +#!/bin/bash +# Activate a conda environment inside a container. +# +# Detects conda installation, initializes the shell, then activates the env. 
+# Must be sourced (not executed) so the activation persists in the caller's shell: +# source tools/activate_conda.sh [conda_path] +# +# Arguments: +# env_name — conda environment name (required) +# conda_path — path to conda installation (optional, auto-detected if omitted) + +set -e + +_activate_conda() { + local env_name="${1:?Usage: source activate_conda.sh [conda_path]}" + local conda_path="${2:-}" + + # Auto-detect conda path if not provided + if [ -z "$conda_path" ]; then + if [ -n "$CONDA_DIR" ] && [ -d "$CONDA_DIR" ]; then + conda_path="$CONDA_DIR" + elif command -v conda &>/dev/null; then + conda_path="$(conda info --base 2>/dev/null)" + elif [ -d "$HOME/miniconda3" ]; then + conda_path="$HOME/miniconda3" + elif [ -d "$HOME/anaconda3" ]; then + conda_path="$HOME/anaconda3" + elif [ -d "/opt/conda" ]; then + conda_path="/opt/conda" + fi + fi + + if [ -z "$conda_path" ]; then + echo "[activate_conda] WARNING: conda not found, skipping activation" + return 0 + fi + + local conda_sh="$conda_path/etc/profile.d/conda.sh" + if [ ! -f "$conda_sh" ]; then + echo "[activate_conda] ERROR: conda.sh not found at $conda_sh" + return 1 + fi + + # Initialize conda for this shell + echo "[activate_conda] Initializing conda from $conda_path" + source "$conda_sh" + + # Activate the environment + echo "[activate_conda] Activating environment: $env_name" + conda activate "$env_name" || { + echo "[activate_conda] ERROR: Failed to activate conda env '$env_name'" + return 1 + } + + echo "[activate_conda] Active Python: $(which python) ($(python --version 2>&1))" +} + +_activate_conda "$@" diff --git a/flagos-user-tests/tools/collect_test_cases.py b/flagos-user-tests/tools/collect_test_cases.py new file mode 100644 index 0000000..e349a38 --- /dev/null +++ b/flagos-user-tests/tools/collect_test_cases.py @@ -0,0 +1,106 @@ +#!/usr/bin/env python3 +"""Collect all test cases and output a JSON report for post-benchmark-report action. 
+ +Output format (object-of-objects, keyed by case_id): + { + "tests/flagscale/inference/qwen3/demo_0_6b/demo_0_6b.yaml": { + "case_name": "flagscale-inference-qwen3-demo_0_6b", + "repo": "flagscale", + "updated_at": "2026-03-18T15:02:29+08:00" + }, + ... + } + +Usage: + python tools/collect_test_cases.py --root . --output report.json +""" + +import argparse +import json +import subprocess +import sys +from datetime import datetime, timezone +from pathlib import Path + +import yaml + + +def get_file_updated_time(filepath: Path) -> str: + """Get the last commit time of a file via git, fallback to mtime.""" + try: + result = subprocess.run( + ["git", "log", "-1", "--format=%aI", str(filepath)], + capture_output=True, text=True, timeout=10, + ) + if result.returncode == 0 and result.stdout.strip(): + return result.stdout.strip() + except (subprocess.TimeoutExpired, FileNotFoundError): + pass + + # Fallback to file modification time + mtime = filepath.stat().st_mtime + return datetime.fromtimestamp(mtime, tz=timezone.utc).strftime("%Y/%m/%d %H:%M:%S") + + +def make_case_id(meta: dict) -> str: + """Generate a case ID from meta fields: ---.""" + parts = [ + meta.get("repo", "unknown"), + meta.get("task", ""), + meta.get("model", ""), + meta.get("case", ""), + ] + return "-".join(p for p in parts if p) + + +def collect_test_cases(root: Path) -> dict: + """Discover all test cases and return report dict keyed by case_id. 
+ + The post-benchmark-report action expects an object-of-objects format where: + - Each key maps to header_config[0].field (case_id) + - Each value is an object with fields matching header_config[1+] + """ + tests_dir = root / "tests" + report = {} + + for yaml_path in sorted(tests_dir.rglob("*.yaml")): + if yaml_path.name.startswith("_") or yaml_path.name == "data.yaml": + continue + + try: + with open(yaml_path) as f: + data = yaml.safe_load(f) + if not isinstance(data, dict) or "meta" not in data: + continue + + meta = data["meta"] + case_id = str(yaml_path.relative_to(root)) + report[case_id] = { + "case_name": make_case_id(meta), + "repo": meta.get("repo", "unknown"), + "updated_at": get_file_updated_time(yaml_path), + } + except (yaml.YAMLError, KeyError): + continue + + return report + + +def main(): + parser = argparse.ArgumentParser(description="Collect test cases for reporting") + parser.add_argument("--root", default=".", help="Root directory of flagos-user-tests") + parser.add_argument("--output", default="test_cases_report.json", help="Output JSON file") + args = parser.parse_args() + + root = Path(args.root) + report = collect_test_cases(root) + + with open(args.output, "w") as f: + json.dump(report, f, ensure_ascii=False, indent=2) + + print(f"Collected {len(report)} test case(s) -> {args.output}") + sys.exit(0) + + +if __name__ == "__main__": + main() diff --git a/flagos-user-tests/tools/generators/create_test_template.py b/flagos-user-tests/tools/generators/create_test_template.py new file mode 100644 index 0000000..e8e261a --- /dev/null +++ b/flagos-user-tests/tools/generators/create_test_template.py @@ -0,0 +1,233 @@ +#!/usr/bin/env python3 +"""Generate user-perspective test case template. 
+ +Usage: + # FlagScale training test + python create_test_template.py --repo flagscale --type train --model llama2 --name tp2_pp1 + + # Generic test + python create_test_template.py --repo flaggems --name my_operator_test +""" + +import argparse +import json +import os +import sys +from pathlib import Path + +import yaml + + +VALID_REPOS = [ + "flagscale", "flaggems", "flagcx", "flagtree", + "vllm-fl", "vllm-plugin-fl", "te-fl", "megatron-lm-fl", +] + + +def create_flagscale_test_case(task_type: str, model: str, name: str) -> dict: + """Generate a FlagScale user-perspective test case YAML.""" + return { + "meta": { + "repo": "flagscale", + "task": task_type, + "model": model, + "case": name, + "description": "TODO: describe what this test validates", + }, + "resources": { + "gpu": "A100-80GB", + "gpu_count": 8, + }, + "env": { + "CUDA_VISIBLE_DEVICES": "0,1,2,3,4,5,6,7", + "CUDA_DEVICE_MAX_CONNECTIONS": "1", + }, + "setup": [ + "pip install flagscale", + ], + "run": [ + f"flagscale {task_type} {model} --config ./conf/{name}.yaml", + ], + "verify": { + "log_path": f"tests/functional_tests/{task_type}/{model}/test_results/{name}/logs/details/host_0_localhost/*/default_*/attempt_0/*/stdout.log", + "gold_values_path": f"./gold_values/{name}.json", + "rtol": 1e-5, + "atol": 0, + }, + } + + +def create_flagscale_experiment_config(model: str, name: str, task_type: str) -> dict: + """Generate Hydra experiment config for flagscale CLI.""" + return { + "defaults": ["_self_", {task_type: name}], + "experiment": { + "exp_name": name, + "exp_dir": f"tests/functional_tests/{task_type}/{model}/test_results/{name}", + "task": { + "type": task_type, + "backend": "megatron", + "entrypoint": "flagscale/train/megatron/train_gpt.py", + }, + "runner": {"ssh_port": None}, + "envs": { + "CUDA_VISIBLE_DEVICES": "0,1,2,3,4,5,6,7", + "CUDA_DEVICE_MAX_CONNECTIONS": "1", + }, + }, + "action": "run", + "hydra": {"run": {"dir": "${experiment.exp_dir}/hydra"}}, + } + + +def 
create_flagscale_train_params() -> dict: + """Generate training params sub-config.""" + return { + "defaults": ["data"], + "system": { + "tensor_model_parallel_size": 2, + "pipeline_model_parallel_size": 1, + "sequence_parallel": True, + "use_distributed_optimizer": True, + "precision": {"bf16": True}, + "logging": {"log_interval": 1}, + "checkpoint": {"no_save_optim": True, "no_save_rng": True, "save_interval": 100000}, + }, + "model": { + "num_layers": 2, + "hidden_size": 4096, + "num_attention_heads": 32, + "seq_length": 2048, + }, + } + + +def create_generic_test_case(repo: str, name: str) -> dict: + """Generate a generic user-perspective test case YAML.""" + return { + "meta": { + "repo": repo, + "case": name, + "description": "TODO: describe what this test validates", + }, + "resources": {}, + "setup": [ + f"pip install {repo.replace('-', '_')}", + ], + "run": [ + "pytest -v", + ], + } + + +def create_readme(repo: str, task_type: str, model: str, name: str) -> str: + if repo == "flagscale": + return f"""# {name} + +## Description + +TODO: Describe what this test case validates. + +## Environment + +- GPU: 8x A100 80GB +- CUDA: 12.1+ +- Python: 3.10 + +## How to Run + +```bash +pip install flagscale +flagscale {task_type} {model} --config ./conf/{name}.yaml +``` + +## Gold Values + +TODO: Describe expected values and tolerance. +""" + return f"""# {name} + +## Description + +TODO: Describe what this test case validates. 
+ +## Environment + +- Python: 3.10 + +## How to Run + +```bash +pip install {repo} +pytest -v +``` +""" + + +def dump_yaml(data: dict, path: Path): + os.makedirs(path.parent, exist_ok=True) + with open(path, "w") as f: + yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False) + + +def main(): + parser = argparse.ArgumentParser(description="Generate test case template") + parser.add_argument("--repo", required=True, choices=VALID_REPOS) + parser.add_argument("--type", default="train") + parser.add_argument("--model", default="") + parser.add_argument("--name", required=True) + parser.add_argument("--output", default=".") + args = parser.parse_args() + + root = Path(args.output) + + if args.repo == "flagscale": + if not args.model: + print("FlagScale test cases require --model"); sys.exit(1) + + case_dir = root / "tests" / args.repo / args.type / args.model / args.name + + # Main test case YAML (user-perspective) + tc = create_flagscale_test_case(args.type, args.model, args.name) + dump_yaml(tc, case_dir / f"{args.name}.yaml") + + # Hydra experiment config + ec = create_flagscale_experiment_config(args.model, args.name, args.type) + dump_yaml(ec, case_dir / "conf" / f"{args.name}.yaml") + + # Training params sub-config + tp = create_flagscale_train_params() + dump_yaml(tp, case_dir / "conf" / "train" / f"{args.name}.yaml") + + # Gold values + gold = {"lm loss:": {"values": [0.0] * 10}} + gold_path = case_dir / "gold_values" / f"{args.name}.json" + os.makedirs(gold_path.parent, exist_ok=True) + with open(gold_path, "w") as f: + json.dump(gold, f, indent=2) + + # README + readme = create_readme(args.repo, args.type, args.model, args.name) + with open(case_dir / "README.md", "w") as f: + f.write(readme) + + print(f"Created FlagScale test case at: {case_dir}") + print(f" {args.name}.yaml — test case (setup/run/verify)") + print(f" conf/{args.name}.yaml — FlagScale experiment config") + print(f" conf/train/{args.name}.yaml — training 
parameters") + print(f" gold_values/{args.name}.json — expected metrics") + print(f" README.md") + else: + case_dir = root / "tests" / args.repo / args.name + tc = create_generic_test_case(args.repo, args.name) + dump_yaml(tc, case_dir / f"{args.name}.yaml") + + readme = create_readme(args.repo, "", "", args.name) + with open(case_dir / "README.md", "w") as f: + f.write(readme) + + print(f"Created test case at: {case_dir}") + + +if __name__ == "__main__": + main() diff --git a/flagos-user-tests/tools/resolve_matrix.py b/flagos-user-tests/tools/resolve_matrix.py new file mode 100644 index 0000000..3d2a549 --- /dev/null +++ b/flagos-user-tests/tools/resolve_matrix.py @@ -0,0 +1,148 @@ +#!/usr/bin/env python3 +"""Resolve test case resources into a GitHub Actions matrix. + +Reads detection outputs (changed_cases / changed_repos / changed_repos_list) +and produces a JSON matrix with runner_labels, container_image, container_options, +and container_volumes per test case entry. + +Usage (from workflow): + python tools/resolve_matrix.py \ + --changed-cases '${{ steps.detect.outputs.changed_cases }}' \ + --changed-repos '${{ steps.detect.outputs.changed_repos }}' \ + --changed-repos-list '${{ steps.detect.outputs.changed_repos_list }}' +""" + +import argparse +import json +import os +import sys +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent)) +from run_user_tests import ( + list_test_resources, + resolve_conda_env, + resolve_container_image, + resolve_container_options, + resolve_runner_labels, +) + +import yaml + + +def make_entry(case_path: str, meta: dict, resources: dict, resource_map_path: Path) -> dict: + """Build a matrix entry with runner labels and per-platform container config.""" + labels = resolve_runner_labels(resources, resource_map_path) + image = resolve_container_image( + meta.get("repo", ""), meta.get("task", ""), + resources, resource_map_path, + ) + init_cmd = resolve_conda_env( + meta.get("repo", ""), meta.get("task", ""), + 
resources, resource_map_path, + ) + opts = resolve_container_options(resources, resource_map_path) + return { + "case_path": case_path, + "repo": meta.get("repo", ""), + "task": meta.get("task", ""), + "model": meta.get("model", ""), + "runner_labels": json.dumps(labels), + "container_image": image, + "conda_env": init_cmd, + "container_options": opts["container_options"], + "container_volumes": json.dumps(opts["container_volumes"]), + } + + +def make_empty_entry(**kwargs) -> dict: + """Build a placeholder entry with defaults.""" + return { + "case_path": "", "repo": "", "task": "", "model": "", + "runner_labels": json.dumps(["self-hosted"]), + "container_image": "", "conda_env": "", + "container_options": "", + "container_volumes": json.dumps([]), + **kwargs, + } + + +def resource_entry_to_matrix(entry: dict, repo: str = "", task: str = "", model: str = "") -> dict: + """Convert a list_test_resources entry to a matrix entry.""" + return { + "case_path": entry["case_path"], + "repo": entry.get("repo", "") or repo or "", + "task": entry.get("task", "") or task or "", + "model": entry.get("model", "") or model or "", + "runner_labels": json.dumps(entry["runner_labels"]), + "container_image": entry.get("container_image", ""), + "conda_env": entry.get("conda_env", ""), + "container_options": entry.get("container_options", ""), + "container_volumes": json.dumps(entry.get("container_volumes", [])), + } + + +def main(): + parser = argparse.ArgumentParser(description="Resolve test resources to CI matrix") + parser.add_argument("--changed-cases", default="") + parser.add_argument("--changed-repos", default="") + parser.add_argument("--changed-repos-list", default="") + parser.add_argument("--root", default=".", help="Root directory of flagos-user-tests") + args = parser.parse_args() + + root = Path(args.root) + resource_map_path = root / "resource_map.yaml" + matrix_entries = [] + + if args.changed_cases: + cases = json.loads(args.changed_cases) + for case_path in cases: + 
p = root / case_path if not Path(case_path).is_absolute() else Path(case_path) + if p.exists(): + data = yaml.safe_load(p.read_text()) + matrix_entries.append(make_entry( + case_path, data.get("meta", {}), + data.get("resources", {}), resource_map_path, + )) + + elif args.changed_repos_list: + repos = json.loads(args.changed_repos_list) + for repo in repos: + for entry in list_test_resources(root, repo=repo): + matrix_entries.append(resource_entry_to_matrix(entry, repo=repo)) + + elif args.changed_repos: + info = json.loads(args.changed_repos) + if info.get("repo") == "_none_": + matrix_entries.append(make_empty_entry(repo="_none_")) + else: + repo = info["repo"] + task = info.get("task", "") or None + model = info.get("model", "") or None + entries = list_test_resources(root, repo=repo, task=task, model=model) + if entries: + for entry in entries: + matrix_entries.append(resource_entry_to_matrix( + entry, repo=repo, + task=info.get("task", ""), + model=info.get("model", ""), + )) + else: + matrix_entries.append(make_empty_entry(repo=repo)) + + matrix = {"include": matrix_entries} + matrix_json = json.dumps(matrix) + print(f"Matrix: {matrix_json}") + + # Write to GITHUB_OUTPUT if available + github_output = os.environ.get("GITHUB_OUTPUT") + if github_output: + with open(github_output, "a") as f: + f.write(f"matrix={matrix_json}\n") + else: + # For local testing, just print to stdout + print(json.dumps(matrix, indent=2)) + + +if __name__ == "__main__": + main() diff --git a/flagos-user-tests/tools/run_user_tests.py b/flagos-user-tests/tools/run_user_tests.py new file mode 100644 index 0000000..754d6a6 --- /dev/null +++ b/flagos-user-tests/tools/run_user_tests.py @@ -0,0 +1,579 @@ +#!/usr/bin/env python3 +"""Run user-submitted test cases against FlagOS repositories. 
+ +Each test case is a self-contained YAML config that defines: + - setup: how to install the repo and dependencies (user's perspective) + - run: how to execute the test (user's perspective) + - verify: how to check results against gold values + +This runner simply executes user-defined commands — it does NOT call +any internal repo test scripts. This keeps test cases at the "user level". + +Usage: + # Run a specific test case + python tools/run_user_tests.py --case tests/flagscale/train/mixtral/tp2_pp1_ep2.yaml + + # Run all test cases for a repo + python tools/run_user_tests.py --repo flagscale + + # Run all test cases for a repo+task+model + python tools/run_user_tests.py --repo flagscale --task train --model mixtral +""" + +import argparse +import json +import os +import re +import subprocess +import sys +from pathlib import Path + +import yaml + +# --------------------------------------------------------------------------- +# Gold-value comparison +# --------------------------------------------------------------------------- + +def extract_metrics_from_lines(lines: list[str], metric_keys: list[str]) -> dict: + """Extract numeric metric values from log lines. + + Supports common log formats: + - Pipe-separated: "iteration 1/10 | lm loss: 1.161E+01 | ..." + - Key-value: "step 1 metric_name:1.234" + """ + results = {k: [] for k in metric_keys} + + for line in lines: + for key in metric_keys: + # Pattern: "key " or "key: " + # Handle keys with or without trailing colon + escaped = re.escape(key.rstrip(":")) + pattern = rf"{escaped}\s*:?\s*([+-]?\d+\.?\d*(?:[eE][+-]?\d+)?)" + match = re.search(pattern, line) + if match: + try: + results[key].append(float(match.group(1))) + except ValueError: + pass + + return results + + +def extract_text_from_lines(lines: list[str], pattern: str) -> list[str]: + """Extract text values from log lines using a regex pattern. + + The pattern must contain at least one capture group. If multiple groups + are present (e.g. 
alternation), the first non-None group is used. + Example pattern: r"output\\.outputs\\[0\\]\\.text=(?:\"(.+?)\"|'(.+?)')" + """ + results = [] + compiled = re.compile(pattern) + + for line in lines: + match = compiled.search(line) + if match: + # Pick first non-None group + val = next((g for g in match.groups() if g is not None), None) + if val is not None: + results.append(val) + + return results + + +def compare_gold_values( + actual: dict, gold: dict, rtol: float = 1e-5, atol: float = 0 +) -> tuple[bool, list[str]]: + """Compare actual metrics against gold values. + + Supports two types of gold entries: + - numeric (default): {"values": [1.0, 2.0], "type": "numeric"} + - text: {"values": ["hello", "world"], "type": "text", + "pattern": "regex with (capture group)"} + + Returns (all_passed, list_of_messages). + """ + messages = [] + all_passed = True + + for key, gold_entry in gold.items(): + gold_values = gold_entry.get("values", []) + actual_values = actual.get(key, []) + entry_type = gold_entry.get("type", "numeric") + + if not actual_values: + messages.append(f"FAIL: No values extracted for metric '{key}'") + all_passed = False + continue + + if len(actual_values) != len(gold_values): + messages.append( + f"FAIL: Length mismatch for '{key}': " + f"got {len(actual_values)}, expected {len(gold_values)}" + ) + all_passed = False + continue + + if entry_type == "text": + for i, (a, g) in enumerate(zip(actual_values, gold_values)): + if a != g: + messages.append( + f"FAIL: '{key}'[{i}] text mismatch:\n" + f" actual: {a!r}\n" + f" gold: {g!r}" + ) + all_passed = False + break + else: + messages.append(f"PASS: '{key}' ({len(gold_values)} text values match)") + else: + # numeric comparison — numpy-free allclose + for i, (a, g) in enumerate(zip(actual_values, gold_values)): + if abs(a - g) > atol + rtol * abs(g): + messages.append( + f"FAIL: '{key}'[{i}] mismatch: actual={a}, gold={g}, " + f"diff={abs(a-g):.6e}" + ) + all_passed = False + break + else: + 
messages.append(f"PASS: '{key}' ({len(gold_values)} values match)") + + return all_passed, messages + + +# --------------------------------------------------------------------------- +# Test case execution +# --------------------------------------------------------------------------- + +def run_commands(cmds: list[str], cwd: str, env: dict | None = None) -> int: + """Run a list of shell commands sequentially. Return first non-zero exit code.""" + full_env = {**os.environ, **(env or {})} + for cmd in cmds: + print(f" $ {cmd}") + result = subprocess.run(cmd, shell=True, cwd=cwd, env=full_env) + if result.returncode != 0: + print(f" FAILED (exit code {result.returncode})") + return result.returncode + return 0 + + +def run_test_case(case_path: Path, workdir: Path | None = None) -> int: + """Execute a single user test case. + + Test case YAML format: + meta: + repo: flagscale + task: train + model: mixtral + description: "..." + + resources: + platform: cuda + device: A100-40GB + device_count: 1 + + setup: + - pip install flagscale + - modelscope download --model ... --local_dir ./model_weights + + run: + - flagscale train mixtral --config ./conf/tp2_pp1_ep2.yaml + + verify: + log_path: "tests/functional_tests/train/mixtral/test_results/tp2_pp1_ep2/logs/..." + gold_values_path: "./gold_values/tp2_pp1_ep2.json" + # OR inline gold values: + gold_values: + "lm loss:": + values: [11.17587, 11.16908, ...] 
+ rtol: 1e-5 + atol: 0 + """ + print(f"\n{'='*60}") + print(f"Test Case: {case_path}") + print(f"{'='*60}") + + with open(case_path) as f: + config = yaml.safe_load(f) + + meta = config.get("meta", {}) + setup_cmds = config.get("setup", []) + run_cmds = config.get("run", []) + verify_config = config.get("verify", {}) + + print(f"Repo: {meta.get('repo', 'unknown')}") + print(f"Task: {meta.get('task', 'unknown')}") + print(f"Model: {meta.get('model', 'unknown')}") + print(f"Desc: {meta.get('description', '')}") + print() + + # Determine working directory — test case files live next to the YAML + case_dir = case_path.parent.resolve() + cwd = str(workdir.resolve()) if workdir else str(case_dir) + + env = config.get("env", {}) + # Convert all env values to strings + env = {k: str(v) for k, v in env.items()} + + # --- Setup --- + if setup_cmds: + print("--- Setup ---") + rc = run_commands(setup_cmds, cwd=cwd, env=env) + if rc != 0: + print("SETUP FAILED") + return rc + + # --- Run --- + if run_cmds: + print("\n--- Run ---") + rc = run_commands(run_cmds, cwd=cwd, env=env) + if rc != 0: + print("RUN FAILED") + return rc + + # --- Verify --- + if verify_config: + print("\n--- Verify ---") + return verify_results(verify_config, case_dir=case_dir, cwd=cwd) + + print("\nPASSED (no verify step)") + return 0 + + +def verify_results(verify_config: dict, case_dir: Path, cwd: str) -> int: + """Verify test results against gold values.""" + # Load gold values + gold = verify_config.get("gold_values") + if not gold: + gold_path = verify_config.get("gold_values_path", "") + if gold_path: + # Resolve relative to case_dir + full_path = (case_dir / gold_path) if not Path(gold_path).is_absolute() else Path(gold_path) + if not full_path.exists(): + # Also try relative to cwd + full_path = Path(cwd) / gold_path + if not full_path.exists(): + print(f"FAIL: Gold values file not found: {gold_path}") + return 1 + with open(full_path) as f: + gold = json.load(f) + else: + print("No gold values 
defined, skipping verification") + return 0 + + # Extract actual metrics from log + log_path = verify_config.get("log_path", "") + if not log_path: + print("FAIL: verify.log_path is required for gold value comparison") + return 1 + + # Resolve log path — try relative to cwd first, then case_dir + full_log = Path(cwd) / log_path + if not full_log.exists(): + full_log = case_dir / log_path + if not full_log.exists(): + # Try glob pattern (user might use * for timestamp dirs) + import glob as globmod + candidates = globmod.glob(str(Path(cwd) / log_path)) + if not candidates: + candidates = globmod.glob(str(case_dir / log_path)) + if candidates: + full_log = Path(sorted(candidates)[-1]) # latest match + else: + print(f"FAIL: Log file not found: {log_path}") + return 1 + + print(f"Log: {full_log}") + + # Read log via subprocess to bypass NFS client cache + import time + time.sleep(2) + log_content = subprocess.run( + ["cat", str(full_log)], capture_output=True, text=True + ).stdout + log_lines = log_content.splitlines() + + # Separate numeric and text gold entries + numeric_keys = [] + actual = {} + for key, entry in gold.items(): + entry_type = entry.get("type", "numeric") + if entry_type == "text": + pattern = entry.get("pattern", "") + if not pattern: + print(f"FAIL: Text gold entry '{key}' requires a 'pattern' field") + return 1 + actual[key] = extract_text_from_lines(log_lines, pattern) + else: + numeric_keys.append(key) + + if numeric_keys: + numeric_actual = extract_metrics_from_lines(log_lines, numeric_keys) + actual.update(numeric_actual) + + rtol = verify_config.get("rtol", 1e-5) + atol = verify_config.get("atol", 0) + passed, messages = compare_gold_values(actual, gold, rtol=rtol, atol=atol) + + for msg in messages: + print(f" {msg}") + + print(f"\nResult: {'PASSED' if passed else 'FAILED'}") + return 0 if passed else 1 + + +# --------------------------------------------------------------------------- +# Discovery and batch execution +# 
--------------------------------------------------------------------------- + +def discover_test_cases( + root: Path, repo: str | None = None, + task: str | None = None, model: str | None = None +) -> list[Path]: + """Find all test case YAML files under tests/. + + Test case YAMLs are identified by having a 'meta' key with 'repo'. + """ + tests_dir = root / "tests" + cases = [] + + for yaml_path in sorted(tests_dir.rglob("*.yaml")): + # Skip files in sub-config dirs (train/, data.yaml, etc.) + if yaml_path.name.startswith("_") or yaml_path.name == "data.yaml": + continue + + try: + with open(yaml_path) as f: + data = yaml.safe_load(f) + if not isinstance(data, dict) or "meta" not in data: + continue + meta = data["meta"] + if repo and meta.get("repo") != repo: + continue + if task and meta.get("task") != task: + continue + if model and meta.get("model") != model: + continue + cases.append(yaml_path) + except (yaml.YAMLError, KeyError): + continue + + return cases + + +def _load_resource_map(resource_map_path: Path) -> dict: + """Load resource_map.yaml, returning empty dict on failure.""" + if not resource_map_path.exists(): + return {} + with open(resource_map_path) as f: + return yaml.safe_load(f) or {} + + +def _get_platform_config(resource_map: dict, platform: str) -> dict: + """Get platform config from resource_map, with fallback to default_platform.""" + platforms = resource_map.get("platforms", {}) + if platform and platform in platforms: + return platforms[platform] + default_platform = resource_map.get("default_platform", "") + if default_platform and default_platform in platforms: + return platforms[default_platform] + return {} + + +def resolve_runner_labels(resources: dict, resource_map_path: Path) -> list[str]: + """Resolve test case resources to GitHub Actions runner labels. + + Uses platform-based lookup: + resources.platform -> platforms..device_labels[resources.device] + + Falls back to platform default_labels, then global default_labels. 
+ """ + global_default = ["self-hosted"] + resource_map = _load_resource_map(resource_map_path) + if not resource_map: + return global_default + + global_default = resource_map.get("default_labels", global_default) + platform = resources.get("platform", "") + pcfg = _get_platform_config(resource_map, platform) + if not pcfg: + return global_default + + platform_default = pcfg.get("default_labels", global_default) + device = resources.get("device", "") + if not device: + return platform_default + + # Case-insensitive device lookup + device_labels = pcfg.get("device_labels", {}) + for key, labels in device_labels.items(): + if key.lower() == device.lower(): + return labels + + return platform_default + + +def resolve_container_image( + repo: str, task: str, resources: dict, resource_map_path: Path +) -> str: + """Resolve test case to a Docker container image. + + Lookup: platform -> container_images -> "/" | "" | "default" + Returns "" if no image is configured. + """ + resource_map = _load_resource_map(resource_map_path) + platform = resources.get("platform", "") + pcfg = _get_platform_config(resource_map, platform) + images = pcfg.get("container_images", {}) + if not images: + return "" + + key = f"{repo}/{task}" if task else repo + image = images.get(key, "") + if not image and repo: + image = images.get(repo, "") + if not image: + image = images.get("default", "") + return image + + +def resolve_container_options(resources: dict, resource_map_path: Path) -> dict: + """Resolve container runtime options and volumes for the given platform. + + Returns {"container_options": str, "container_volumes": list}. 
+ """ + resource_map = _load_resource_map(resource_map_path) + platform = resources.get("platform", "") + pcfg = _get_platform_config(resource_map, platform) + return { + "container_options": pcfg.get("container_options", ""), + "container_volumes": pcfg.get("container_volumes", []), + } + + +def resolve_conda_env( + repo: str, task: str, resources: dict, resource_map_path: Path +) -> str: + """Resolve conda environment name for the given platform and repo/task. + + Lookup: platform -> conda_env -> "/" | "" | "default" + Returns "" if no conda env is configured. + """ + resource_map = _load_resource_map(resource_map_path) + platform = resources.get("platform", "") + pcfg = _get_platform_config(resource_map, platform) + conda_envs = pcfg.get("conda_env", {}) + if not conda_envs: + return "" + + key = f"{repo}/{task}" if task else repo + env = conda_envs.get(key, "") + if not env and repo: + env = conda_envs.get(repo, "") + if not env: + env = conda_envs.get("default", "") + return env + + +def list_test_resources( + root: Path, repo: str | None = None, + task: str | None = None, model: str | None = None +) -> list[dict]: + """List test cases with their resource requirements, runner labels, and container config. 
+ + Returns a list of dicts with keys: + case_path, resources, runner_labels, container_image, container_init, + container_options, container_volumes + """ + cases = discover_test_cases(root, repo, task, model) + resource_map_path = root / "resource_map.yaml" + result = [] + + for case_path in cases: + with open(case_path) as f: + data = yaml.safe_load(f) + meta = data.get("meta", {}) + resources = data.get("resources", {}) + runner_labels = resolve_runner_labels(resources, resource_map_path) + container_image = resolve_container_image( + meta.get("repo", ""), meta.get("task", ""), resources, resource_map_path + ) + conda_env = resolve_conda_env( + meta.get("repo", ""), meta.get("task", ""), resources, resource_map_path + ) + container_opts = resolve_container_options(resources, resource_map_path) + result.append({ + "case_path": str(case_path), + "repo": meta.get("repo", ""), + "task": meta.get("task", ""), + "model": meta.get("model", ""), + "resources": resources, + "runner_labels": runner_labels, + "container_image": container_image, + "conda_env": conda_env, + **container_opts, + }) + + return result + + +def main(): + parser = argparse.ArgumentParser( + description="Run user-submitted FlagOS test cases" + ) + parser.add_argument("--case", help="Path to a specific test case YAML") + parser.add_argument("--repo", help="Run all cases for this repo") + parser.add_argument("--task", help="Filter by task type") + parser.add_argument("--model", help="Filter by model name") + parser.add_argument( + "--workdir", + help="Working directory for command execution (default: test case directory)" + ) + parser.add_argument( + "--list-resources", action="store_true", + help="List test cases with resource requirements and runner labels (JSON output)" + ) + args = parser.parse_args() + + # --list-resources mode: output JSON and exit + if args.list_resources: + root = Path(".") + result = list_test_resources(root, args.repo, args.task, args.model) + print(json.dumps(result, 
indent=2)) + sys.exit(0) + + workdir = Path(args.workdir) if args.workdir else None + + if args.case: + case_path = Path(args.case) + if not case_path.exists(): + print(f"ERROR: Test case not found: {case_path}") + sys.exit(1) + sys.exit(run_test_case(case_path, workdir)) + + if not args.repo: + print("ERROR: Specify --case, --repo, or --list-resources") + sys.exit(1) + + root = Path(".") + cases = discover_test_cases(root, args.repo, args.task, args.model) + + if not cases: + print(f"No test cases found for repo={args.repo} task={args.task} model={args.model}") + sys.exit(0) + + print(f"Found {len(cases)} test case(s)") + failed = 0 + for case in cases: + rc = run_test_case(case, workdir) + if rc != 0: + failed += 1 + + print(f"\n{'='*60}") + print(f"Results: {len(cases) - failed}/{len(cases)} passed") + print(f"{'='*60}") + sys.exit(1 if failed else 0) + + +if __name__ == "__main__": + main() diff --git a/flagos-user-tests/tools/test_post_report.sh b/flagos-user-tests/tools/test_post_report.sh new file mode 100755 index 0000000..2783e36 --- /dev/null +++ b/flagos-user-tests/tools/test_post_report.sh @@ -0,0 +1,126 @@ +#!/bin/bash +# Local test script for posting test cases report. +# +# Usage: +# ./tools/test_post_report.sh [api_token] +# +# Example: +# ./tools/test_post_report.sh http://10.0.0.1:8080 +# ./tools/test_post_report.sh http://10.0.0.1:8080 my-secret-token + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +ROOT_DIR="$(cd "$SCRIPT_DIR/.." 
&& pwd)" + +BACKEND_URL="${1:?Usage: $0 [api_token]}" +BACKEND_URL="${BACKEND_URL%/}" +API_TOKEN="${2:-}" + +LIST_CODE="flagops-user-test-cases" +LIST_NAME="FlagOps User Test Cases" +REPORT_PATH="$ROOT_DIR/test_cases_report.json" + +HEADER_CONFIG='[ + {"field": "case_id", "name": "用例ID", "required": true, "sortable": true, "type": "string"}, + {"field": "case_name", "name": "用例名称", "required": true, "sortable": false, "type": "string"}, + {"field": "repo", "name": "所属子仓库", "required": true, "sortable": true, "type": "string"}, + {"field": "updated_at", "name": "更新时间", "required": true, "sortable": true, "type": "string"} +]' + +# --- Step 1: Collect test cases --- +echo "=== Step 1: Collect test cases ===" +cd "$ROOT_DIR" +python tools/collect_test_cases.py --root . --output "$REPORT_PATH" +echo "Report content:" +cat "$REPORT_PATH" | python -m json.tool +echo "" + +# --- Step 2: Post header config --- +echo "=== Step 2: Post header config ===" +HEADER_PAYLOAD=$(jq -n \ + --arg list_code "$LIST_CODE" \ + --arg list_name "$LIST_NAME" \ + --argjson header_config "$HEADER_CONFIG" \ + '{list_code: $list_code, list_name: $list_name, header_config: $header_config}') + +echo "URL: ${BACKEND_URL}/flagcicd-backend/list/header" +echo "Payload:" +echo "$HEADER_PAYLOAD" | jq . 
+ +CURL_ARGS=(-s -X POST -w '\n%{http_code}' -H "Content-Type: application/json" -d "$HEADER_PAYLOAD") +[ -n "$API_TOKEN" ] && CURL_ARGS+=(-H "Authorization: Bearer $API_TOKEN") + +RESPONSE=$(curl "${CURL_ARGS[@]}" "${BACKEND_URL}/flagcicd-backend/list/header") +HTTP_STATUS=$(echo "$RESPONSE" | tail -n1) +RESPONSE_BODY=$(echo "$RESPONSE" | sed '$d') + +echo "HTTP status: $HTTP_STATUS" +echo "Response: $RESPONSE_BODY" +echo "" + +# --- Step 3: Post list data --- +echo "=== Step 3: Post list data ===" +COMMIT_ID="$(git rev-parse HEAD 2>/dev/null || echo 'unknown')" +REPO_NAME="flagos-ai/FlagOps" +WORKFLOW_ID="local-test" +RUN_ID="local-$$" + +DATA_PAYLOAD=$(jq -n \ + --arg repository_name "$REPO_NAME" \ + --arg workflow_id "$WORKFLOW_ID" \ + --arg commit_id "$COMMIT_ID" \ + --arg run_id "$RUN_ID" \ + --argjson header_config "$HEADER_CONFIG" \ + --slurpfile report "$REPORT_PATH" \ + '{ + items: [ $report[0] | to_entries[] | . as $entry | + ([ $header_config | to_entries[] | .value.field as $f | + if .key == 0 then {($f): $entry.key} + else {($f): $entry.value[$f]} + end + ] | add) + { + commit_id: $commit_id, + repository_name: $repository_name, + workflow_id: $workflow_id, + run_id: $run_id + } + ] + }') + +echo "URL: ${BACKEND_URL}/flagcicd-backend/list/data/${LIST_CODE}" +echo "Items count: $(echo "$DATA_PAYLOAD" | jq '.items | length')" +echo "Payload (first item sample):" +echo "$DATA_PAYLOAD" | jq '{items_count: (.items | length), first_item: .items[0]}' + +CURL_ARGS=(-s -X POST -w '\n%{http_code}' -H "Content-Type: application/json" -d "$DATA_PAYLOAD") +[ -n "$API_TOKEN" ] && CURL_ARGS+=(-H "Authorization: Bearer $API_TOKEN") + +RESPONSE=$(curl "${CURL_ARGS[@]}" "${BACKEND_URL}/flagcicd-backend/list/data/${LIST_CODE}") +HTTP_STATUS=$(echo "$RESPONSE" | tail -n1) +RESPONSE_BODY=$(echo "$RESPONSE" | sed '$d') + +echo "HTTP status: $HTTP_STATUS" +echo "Response: $RESPONSE_BODY" +echo "" + +# --- Step 4: Query to verify --- +echo "=== Step 4: Query list data ===" 
+QUERY_URL="${BACKEND_URL}/flagcicd-backend/list/data/${LIST_CODE}?page_size=10&page=1&sort=created_at&order=desc" +echo "URL: $QUERY_URL" + +CURL_ARGS=(-s -X GET -w '\n%{http_code}' -H "Accept: application/json") +[ -n "$API_TOKEN" ] && CURL_ARGS+=(-H "Authorization: Bearer $API_TOKEN") + +RESPONSE=$(curl "${CURL_ARGS[@]}" "$QUERY_URL") +HTTP_STATUS=$(echo "$RESPONSE" | tail -n1) +RESPONSE_BODY=$(echo "$RESPONSE" | sed '$d') + +echo "HTTP status: $HTTP_STATUS" +echo "Response:" +echo "$RESPONSE_BODY" | jq . 2>/dev/null || echo "$RESPONSE_BODY" + +# Cleanup +rm -f "$REPORT_PATH" +echo "" +echo "=== Done ===" diff --git a/flagos-user-tests/tools/validators/lint_test_case.py b/flagos-user-tests/tools/validators/lint_test_case.py new file mode 100644 index 0000000..e171dde --- /dev/null +++ b/flagos-user-tests/tools/validators/lint_test_case.py @@ -0,0 +1,157 @@ +#!/usr/bin/env python3 +"""Lint test case directories for completeness and correctness. + +Checks: +- Each test case directory has a README.md +- Each test case has at least one YAML config +- README contains required sections (Description, Environment, etc.) 
+- No sensitive data patterns (tokens, passwords, private paths) +""" + +import argparse +import re +import sys +from pathlib import Path + +import yaml + + +VALID_REPOS = [ + "flagscale", "flaggems", "flagcx", "flagtree", + "vllm-fl", "vllm-plugin-fl", "te-fl", "megatron-lm-fl", +] + +# Patterns that might indicate sensitive data in configs +SENSITIVE_PATTERNS = [ + re.compile(r"(password|passwd|secret|token|api_key)\s*[:=]", re.IGNORECASE), + re.compile(r"/home/[a-zA-Z0-9_]+/", re.IGNORECASE), # Private user paths + re.compile(r"sk-[a-zA-Z0-9]{20,}"), # API keys +] + +README_REQUIRED_SECTIONS = ["description", "environment"] + + +def find_test_case_dirs(root: Path) -> list[Path]: + """Find directories that contain a user-perspective test case YAML (has 'meta' key).""" + tests_dir = root / "tests" + if not tests_dir.exists(): + return [] + + test_dirs = set() + for yaml_file in tests_dir.rglob("*.yaml"): + try: + data = yaml.safe_load(yaml_file.read_text()) + if isinstance(data, dict) and "meta" in data: + test_dirs.add(yaml_file.parent) + except (yaml.YAMLError, OSError): + continue + + return sorted(test_dirs) + + +def lint_readme(readme_path: Path, strict: bool = False) -> list[str]: + """Check README.md for required content.""" + errors = [] + if not readme_path.exists(): + return [f"{readme_path.parent}: Missing README.md"] + + content = readme_path.read_text().lower() + + if strict: + for section in README_REQUIRED_SECTIONS: + if section not in content: + errors.append( + f"{readme_path}: Missing required section '{section}'" + ) + + if len(content.strip()) < 20: + errors.append(f"{readme_path}: README is too short (less than 20 characters)") + + return errors + + +def lint_sensitive_data(filepath: Path) -> list[str]: + """Check for sensitive data patterns in config files.""" + errors = [] + content = filepath.read_text() + for pattern in SENSITIVE_PATTERNS: + matches = pattern.findall(content) + if matches: + errors.append( + f"{filepath}: Possible 
sensitive data detected: {matches[:3]}" + ) + return errors + + +def lint_yaml_configs(test_dir: Path) -> list[str]: + """Lint YAML config files in a test directory.""" + errors = [] + yaml_files = list(test_dir.glob("*.yaml")) + if not yaml_files: + return [] + + for yf in yaml_files: + try: + with open(yf) as f: + data = yaml.safe_load(f) + if data is None: + errors.append(f"{yf}: Empty YAML file") + except yaml.YAMLError as e: + errors.append(f"{yf}: Invalid YAML - {e}") + continue + + # Check for sensitive data + errors.extend(lint_sensitive_data(yf)) + + return errors + + +def main(): + parser = argparse.ArgumentParser(description="Lint test case directories") + parser.add_argument( + "--path", default=".", + help="Root directory of flagos-user-tests" + ) + parser.add_argument( + "--strict", action="store_true", + help="Enable strict checks (README sections, etc.)" + ) + args = parser.parse_args() + + root = Path(args.path) + all_errors = [] + warnings = [] + + test_dirs = find_test_case_dirs(root) + if not test_dirs: + print("No test case directories found.") + sys.exit(0) + + for test_dir in test_dirs: + # Check README + readme_errors = lint_readme(test_dir / "README.md", strict=args.strict) + if args.strict: + all_errors.extend(readme_errors) + else: + warnings.extend(readme_errors) + + # Lint YAML configs + all_errors.extend(lint_yaml_configs(test_dir)) + + if warnings: + print(f"Warnings ({len(warnings)}):") + for w in warnings: + print(f" ⚠ {w}") + + if all_errors: + print(f"Lint FAILED with {len(all_errors)} error(s):") + for err in all_errors: + print(f" ✗ {err}") + sys.exit(1) + else: + print(f"Lint PASSED: {len(test_dirs)} test directory(ies) checked.") + sys.exit(0) + + +if __name__ == "__main__": + main() diff --git a/flagos-user-tests/tools/validators/validate_config.py b/flagos-user-tests/tools/validators/validate_config.py new file mode 100644 index 0000000..10cf0dc --- /dev/null +++ b/flagos-user-tests/tools/validators/validate_config.py @@ 
-0,0 +1,159 @@ +#!/usr/bin/env python3 +"""Validate YAML configuration files for test cases. + +Checks: +- YAML syntax validity +- Test case YAML (with meta key): required fields (meta.repo, setup, run) +- FlagScale sub-configs (experiment/defaults): structure validation +- Generic configs: non-empty dict +""" + +import argparse +import sys +from pathlib import Path + +import yaml + + +VALID_REPOS = [ + "flagscale", "flaggems", "flagcx", "flagtree", + "vllm-fl", "vllm-plugin-fl", "te-fl", "megatron-lm-fl", +] + + +def validate_yaml_syntax(filepath: Path) -> list[str]: + """Check that a file is valid YAML.""" + errors = [] + try: + with open(filepath) as f: + data = yaml.safe_load(f) + if data is None: + errors.append(f"{filepath}: YAML file is empty") + except yaml.YAMLError as e: + errors.append(f"{filepath}: Invalid YAML syntax - {e}") + return errors + + +def validate_test_case(filepath: Path, data: dict) -> list[str]: + """Validate a user-perspective test case YAML (has 'meta' key).""" + errors = [] + meta = data.get("meta", {}) + + if not meta.get("repo"): + errors.append(f"{filepath}: Missing 'meta.repo'") + elif meta["repo"] not in VALID_REPOS: + errors.append(f"{filepath}: Invalid meta.repo '{meta['repo']}'") + + if not data.get("run"): + errors.append(f"{filepath}: Missing 'run' (list of commands)") + elif not isinstance(data["run"], list): + errors.append(f"{filepath}: 'run' must be a list of commands") + + if "setup" in data and not isinstance(data["setup"], list): + errors.append(f"{filepath}: 'setup' must be a list of commands") + + if "verify" in data: + v = data["verify"] + if isinstance(v, dict): + has_gold = v.get("gold_values") or v.get("gold_values_path") + if has_gold and not v.get("log_path"): + errors.append(f"{filepath}: verify.log_path required when gold values are defined") + + return errors + + +def validate_flagscale_subconfig(filepath: Path, data: dict) -> list[str]: + """Validate FlagScale sub-config (experiment config or train 
params)."""
+    errors = []
+    keys = set(data.keys())
+
+    if "experiment" in keys:
+        exp = data["experiment"] if isinstance(data["experiment"], dict) else {}  # empty/scalar body -> {}
+        if "exp_name" not in exp:
+            errors.append(f"{filepath}: Missing 'experiment.exp_name'")
+        if "task" not in exp:
+            errors.append(f"{filepath}: Missing 'experiment.task'")
+        elif not isinstance(exp.get("task"), dict) or "type" not in exp["task"]:  # avoid substring test on str task
+            errors.append(f"{filepath}: Missing 'experiment.task.type'")
+    elif "defaults" in keys:
+        # Sub-config (train params, data, etc.) — lighter validation
+        pass
+    else:
+        errors.append(
+            f"{filepath}: Missing expected top-level key "
+            f"('experiment' or 'defaults'), found: {keys}"
+        )
+    return errors
+
+
+def validate_file(filepath: Path) -> list[str]:
+    """Validate a single YAML file based on its content type."""
+    errors = validate_yaml_syntax(filepath)
+    if errors:
+        return errors
+
+    with open(filepath) as f:
+        data = yaml.safe_load(f)
+    if not isinstance(data, dict):
+        return [f"{filepath}: Must be a YAML mapping"]
+
+    # Determine type by content
+    if "meta" in data:
+        # User-perspective test case
+        return validate_test_case(filepath, data)
+    elif "experiment" in data or "defaults" in data:
+        # FlagScale sub-config (Hydra config)
+        return validate_flagscale_subconfig(filepath, data)
+    else:
+        # Generic config — just check it's a valid non-empty dict
+        return []
+
+
+def find_yaml_files(root: Path) -> list[Path]:
+    """Find all YAML files under tests/."""
+    tests_dir = root / "tests"
+    if not tests_dir.exists():
+        return []
+    return sorted(tests_dir.rglob("*.yaml"))
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Validate test case YAML configs")
+    parser.add_argument("--path", default=".", help="Root directory of flagos-user-tests")
+    parser.add_argument("--changed-files", default="", help="Comma-separated list of changed files")
+    args = parser.parse_args()
+
+    root = Path(args.path)
+
+    if args.changed_files:
+        yaml_files = [
+            Path(f.strip()) for f in args.changed_files.split(",")
+            if f.strip().endswith(".yaml") and f.strip().startswith("tests/")
+        ]
+    else:
+        yaml_files = find_yaml_files(root)
+
+    if not yaml_files:
+        print("No YAML test config files found to validate.")
+        sys.exit(0)
+
+    all_errors = []
+    for filepath in yaml_files:
+        full_path = root / filepath if not filepath.is_absolute() else filepath
+        if not full_path.exists():
+            all_errors.append(f"{filepath}: File does not exist")
+            continue
+        all_errors.extend(validate_file(full_path))
+
+    if all_errors:
+        print(f"Validation FAILED with {len(all_errors)} error(s):")
+        for err in all_errors:
+            print(f"  ✗ {err}")
+        sys.exit(1)
+    else:
+        print(f"Validation PASSED: {len(yaml_files)} file(s) checked.")
+        sys.exit(0)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/flagos-user-tests/tools/validators/validate_gold_values.py b/flagos-user-tests/tools/validators/validate_gold_values.py
new file mode 100644
index 0000000..6d690cc
--- /dev/null
+++ b/flagos-user-tests/tools/validators/validate_gold_values.py
@@ -0,0 +1,123 @@
+#!/usr/bin/env python3
+"""Validate gold values JSON files for test cases.
+
+Checks:
+- Valid JSON syntax
+- Expected structure: keys map to objects with "values" arrays
+- At least one value is present
+- Numeric entries (default): all values are int/float
+- Text entries (type: "text"): all values are strings, "pattern" field is present
+"""
+
+import argparse
+import json
+import sys
+from pathlib import Path
+
+
+def validate_gold_values_file(filepath: Path) -> list[str]:
+    """Validate a single gold values JSON file."""
+    errors = []
+
+    try:
+        with open(filepath) as f:
+            data = json.load(f)
+    except json.JSONDecodeError as e:
+        return [f"{filepath}: Invalid JSON - {e}"]
+
+    if not isinstance(data, dict):
+        return [f"{filepath}: Gold values must be a JSON object, got {type(data).__name__}"]
+
+    if not data:
+        return [f"{filepath}: Gold values file is empty"]
+
+    for key, value in data.items():
+        if not isinstance(value, dict):
+            errors.append(f"{filepath}: Key '{key}' must map to an object, got {type(value).__name__}")
+            continue
+
+        if "values" not in value:
+            errors.append(f"{filepath}: Key '{key}' missing 'values' field")
+            continue
+
+        values = value["values"]
+        if not isinstance(values, list):
+            errors.append(f"{filepath}: Key '{key}'.values must be an array")
+            continue
+
+        if len(values) == 0:
+            errors.append(f"{filepath}: Key '{key}'.values is empty")
+            continue
+
+        entry_type = value.get("type", "numeric")
+
+        if entry_type == "text":
+            # Text entries require a 'pattern' field for extraction
+            if "pattern" not in value:
+                errors.append(f"{filepath}: Key '{key}' has type 'text' but missing 'pattern' field")
+            for i, v in enumerate(values):
+                if not isinstance(v, str):
+                    errors.append(
+                        f"{filepath}: Key '{key}'.values[{i}] is not a string: {v!r}"
+                    )
+        elif entry_type == "numeric":
+            for i, v in enumerate(values):
+                if isinstance(v, bool) or not isinstance(v, (int, float)):  # bool subclasses int; reject it
+                    errors.append(
+                        f"{filepath}: Key '{key}'.values[{i}] is not numeric: {v!r}"
+                    )
+        else:
+            errors.append(f"{filepath}: Key '{key}' has unknown type: {entry_type!r}")
+
+    return errors
+
+
+def find_gold_values_files(root: Path) -> list[Path]:
+    """Find all gold values JSON files under tests/.
+
+    Supports both conventions:
+    - FlagScale: tests/<repo>/<task>/<model>/gold_values/<config>.json
+    - Flat: tests/<repo>/<task>/<model>_gold_values.json
+    """
+    tests_dir = root / "tests"
+    if not tests_dir.exists():
+        return []
+    # Match files inside gold_values/ directories
+    gold_dir_files = list(tests_dir.rglob("gold_values/*.json"))
+    # Match files with _gold_values in name (legacy flat layout)
+    gold_name_files = list(tests_dir.rglob("*_gold_values.json"))
+    return sorted(set(gold_dir_files + gold_name_files))  # sorted: deterministic report order
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Validate gold values JSON files")
+    parser.add_argument(
+        "--path", default=".",
+        help="Root directory of flagos-user-tests"
+    )
+    args = parser.parse_args()
+
+    root = Path(args.path)
+    all_errors = []
+
+    gold_files = find_gold_values_files(root)
+
+    if not gold_files:
+        print("No gold values files found. Skipping validation.")
+        sys.exit(0)
+
+    for filepath in gold_files:
+        all_errors.extend(validate_gold_values_file(filepath))
+
+    if all_errors:
+        print(f"Gold values validation FAILED with {len(all_errors)} error(s):")
+        for err in all_errors:
+            print(f"  ✗ {err}")
+        sys.exit(1)
+    else:
+        print(f"Gold values validation PASSED: {len(gold_files)} file(s) checked.")
+        sys.exit(0)
+
+
+if __name__ == "__main__":
+    main()