From 3cbcaba727eec1380a10bbda8e0d64830f3d18a4 Mon Sep 17 00:00:00 2001 From: liyuzhuo Date: Wed, 18 Mar 2026 13:48:48 +0800 Subject: [PATCH 01/13] [wip] flagos user tests --- .github/CODEOWNERS | 42 ++ .github/ISSUE_TEMPLATE/new_test_case.yml | 94 +++ .github/PULL_REQUEST_TEMPLATE.md | 37 ++ .github/scripts/detect_changed_repos.js | 65 +++ .github/workflows/nightly_integration.yml | 94 +++ .github/workflows/pr_validation.yml | 85 +++ .github/workflows/test_dispatch.yml | 104 ++++ flagos-user-tests/CONTRIBUTING.md | 76 +++ flagos-user-tests/README.md | 69 +++ flagos-user-tests/docs/getting_started.md | 99 ++++ flagos-user-tests/docs/test_format_spec.md | 177 ++++++ flagos-user-tests/repos.yaml | 45 ++ flagos-user-tests/resource_map.yaml | 99 ++++ flagos-user-tests/tests/flagcx/.gitkeep | 0 flagos-user-tests/tests/flaggems/.gitkeep | 0 .../tests/flagscale/hetero_train/.gitkeep | 0 .../inference/qwen3/demo_0_6b/.gitignore | 2 + .../inference/qwen3/demo_0_6b/README.md | 25 + .../qwen3/demo_0_6b/conf/demo_0_6b.yaml | 27 + .../demo_0_6b/conf/inference/demo_0_6b.yaml | 18 + .../inference/qwen3/demo_0_6b/demo_0_6b.yaml | 38 ++ .../demo_0_6b/gold_values/demo_0_6b.json | 12 + .../tests/flagscale/train/.gitkeep | 0 flagos-user-tests/tests/flagtree/.gitkeep | 0 .../tests/megatron-lm-fl/.gitkeep | 0 flagos-user-tests/tests/te-fl/.gitkeep | 0 flagos-user-tests/tests/vllm-fl/.gitkeep | 0 .../tests/vllm-plugin-fl/.gitkeep | 0 .../run_user_tests.cpython-312.pyc | Bin 0 -> 22466 bytes .../tools/generators/create_test_template.py | 233 ++++++++ flagos-user-tests/tools/resolve_matrix.py | 138 +++++ flagos-user-tests/tools/run_user_tests.py | 547 ++++++++++++++++++ .../tools/validators/lint_test_case.py | 157 +++++ .../tools/validators/validate_config.py | 159 +++++ .../tools/validators/validate_gold_values.py | 108 ++++ 35 files changed, 2550 insertions(+) create mode 100644 .github/CODEOWNERS create mode 100644 .github/ISSUE_TEMPLATE/new_test_case.yml create mode 100644 
.github/PULL_REQUEST_TEMPLATE.md create mode 100644 .github/scripts/detect_changed_repos.js create mode 100644 .github/workflows/nightly_integration.yml create mode 100644 .github/workflows/pr_validation.yml create mode 100644 .github/workflows/test_dispatch.yml create mode 100644 flagos-user-tests/CONTRIBUTING.md create mode 100644 flagos-user-tests/README.md create mode 100644 flagos-user-tests/docs/getting_started.md create mode 100644 flagos-user-tests/docs/test_format_spec.md create mode 100644 flagos-user-tests/repos.yaml create mode 100644 flagos-user-tests/resource_map.yaml create mode 100644 flagos-user-tests/tests/flagcx/.gitkeep create mode 100644 flagos-user-tests/tests/flaggems/.gitkeep create mode 100644 flagos-user-tests/tests/flagscale/hetero_train/.gitkeep create mode 100644 flagos-user-tests/tests/flagscale/inference/qwen3/demo_0_6b/.gitignore create mode 100644 flagos-user-tests/tests/flagscale/inference/qwen3/demo_0_6b/README.md create mode 100644 flagos-user-tests/tests/flagscale/inference/qwen3/demo_0_6b/conf/demo_0_6b.yaml create mode 100644 flagos-user-tests/tests/flagscale/inference/qwen3/demo_0_6b/conf/inference/demo_0_6b.yaml create mode 100644 flagos-user-tests/tests/flagscale/inference/qwen3/demo_0_6b/demo_0_6b.yaml create mode 100644 flagos-user-tests/tests/flagscale/inference/qwen3/demo_0_6b/gold_values/demo_0_6b.json create mode 100644 flagos-user-tests/tests/flagscale/train/.gitkeep create mode 100644 flagos-user-tests/tests/flagtree/.gitkeep create mode 100644 flagos-user-tests/tests/megatron-lm-fl/.gitkeep create mode 100644 flagos-user-tests/tests/te-fl/.gitkeep create mode 100644 flagos-user-tests/tests/vllm-fl/.gitkeep create mode 100644 flagos-user-tests/tests/vllm-plugin-fl/.gitkeep create mode 100644 flagos-user-tests/tools/__pycache__/run_user_tests.cpython-312.pyc create mode 100644 flagos-user-tests/tools/generators/create_test_template.py create mode 100644 flagos-user-tests/tools/resolve_matrix.py create mode 100644 
flagos-user-tests/tools/run_user_tests.py create mode 100644 flagos-user-tests/tools/validators/lint_test_case.py create mode 100644 flagos-user-tests/tools/validators/validate_config.py create mode 100644 flagos-user-tests/tools/validators/validate_gold_values.py diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS new file mode 100644 index 0000000..ce1bbdd --- /dev/null +++ b/.github/CODEOWNERS @@ -0,0 +1,42 @@ +# FlagOS DevOps - Code Owners + +# Default owners for everything +* @flagos-ai/devops-team + +# CI/CD workflows +.github/ @flagos-ai/devops-team + +# Shared actions +actions/ @flagos-ai/devops-team + +# === User Tests === + +# FlagScale test cases +flagos-user-tests/tests/flagscale/ @flagos-ai/flagscale-team + +# FlagGems test cases +flagos-user-tests/tests/flaggems/ @flagos-ai/flaggems-team + +# FlagCX test cases +flagos-user-tests/tests/flagcx/ @flagos-ai/flagcx-team + +# FlagTree test cases +flagos-user-tests/tests/flagtree/ @flagos-ai/flagtree-team + +# vLLM-FL test cases +flagos-user-tests/tests/vllm-fl/ @flagos-ai/vllm-team + +# vLLM-plugin-FL test cases +flagos-user-tests/tests/vllm-plugin-fl/ @flagos-ai/vllm-team + +# TE-FL test cases +flagos-user-tests/tests/te-fl/ @flagos-ai/te-team + +# Megatron-LM-FL test cases +flagos-user-tests/tests/megatron-lm-fl/ @flagos-ai/megatron-team + +# Experimental test cases +flagos-user-tests/tests/experimental/ @flagos-ai/devops-team + +# Validation tools +flagos-user-tests/tools/ @flagos-ai/devops-team diff --git a/.github/ISSUE_TEMPLATE/new_test_case.yml b/.github/ISSUE_TEMPLATE/new_test_case.yml new file mode 100644 index 0000000..899c5b4 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/new_test_case.yml @@ -0,0 +1,94 @@ +name: New Test Case Submission +description: Submit a new test case for FlagOS repositories +title: "[Test Case] " +labels: ["new-test-case"] +body: + - type: dropdown + id: target-repo + attributes: + label: Target Repository + description: Which FlagOS repository is this test case for? 
+ options: + - FlagScale + - FlagGems + - FlagCX + - FlagTree + - vLLM-FL + - vLLM-plugin-FL + - TE-FL + - Megatron-LM-FL + validations: + required: true + + - type: dropdown + id: test-type + attributes: + label: Test Type + description: What type of test is this? + options: + - train + - inference + - hetero_train + - unit + - integration + - benchmark + validations: + required: true + + - type: input + id: model-name + attributes: + label: Model Name + description: Name of the model being tested (if applicable) + placeholder: e.g., llama2, mixtral, deepseek + + - type: textarea + id: description + attributes: + label: Test Case Description + description: Describe what this test case validates + placeholder: | + This test case validates ... + validations: + required: true + + - type: textarea + id: config + attributes: + label: Configuration + description: Paste the YAML configuration for the test case + render: yaml + validations: + required: true + + - type: textarea + id: gold-values + attributes: + label: Gold Values + description: Paste the expected gold values (JSON format) + render: json + + - type: textarea + id: environment + attributes: + label: Environment Requirements + description: Describe the hardware/software requirements + placeholder: | + - GPU: 8x A100 80GB + - CUDA: 12.1 + - Python: 3.10 + validations: + required: true + + - type: checkboxes + id: checklist + attributes: + label: Submission Checklist + options: + - label: I have tested this test case locally + required: true + - label: I have included gold values (if applicable) + - label: I have added a README.md with test description + required: true + - label: My YAML configuration follows the schema specification + required: true diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 0000000..4f688bf --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,37 @@ +## Test Case PR + +### Target Repository + +- [ ] FlagScale +- [ ] 
FlagGems +- [ ] FlagCX +- [ ] FlagTree +- [ ] vLLM-FL +- [ ] vLLM-plugin-FL +- [ ] TE-FL +- [ ] Megatron-LM-FL + +### Test Type + +- [ ] train +- [ ] inference +- [ ] hetero_train +- [ ] unit +- [ ] integration + +### Description + + + +### Environment Requirements + +- GPU: +- CUDA: +- Python: + +### Checklist +- [ ] YAML configuration passes schema validation +- [ ] Gold values are included (if applicable) +- [ ] README.md is present for each test case +- [ ] Test case has been verified locally +- [ ] No sensitive data (tokens, passwords, private paths) in configs diff --git a/.github/scripts/detect_changed_repos.js b/.github/scripts/detect_changed_repos.js new file mode 100644 index 0000000..08ea150 --- /dev/null +++ b/.github/scripts/detect_changed_repos.js @@ -0,0 +1,65 @@ +// Detect which repos have changed test cases. +// +// Outputs (via core.setOutput): +// changed_cases — JSON array of case paths (manual single-case dispatch) +// changed_repos — JSON object {repo, task, model} (manual repo dispatch or _none_) +// changed_repos_list — JSON array of repo names (auto-detected from PR/push) +// +// Called from workflow via: +// uses: actions/github-script@v7 +// with: +// script: | +// const run = require('./.github/scripts/detect_changed_repos.js'); +// await run({ github, context, core }); + +module.exports = async ({ github, context, core }) => { + const inputCase = process.env.INPUT_CASE || ''; + const inputRepo = process.env.INPUT_REPO || ''; + const inputTask = process.env.INPUT_TASK || ''; + const inputModel = process.env.INPUT_MODEL || ''; + + // Manual dispatch — single case + if (inputCase) { + core.setOutput('changed_cases', JSON.stringify([inputCase])); + return; + } + + // Manual dispatch — by repo + if (inputRepo) { + core.setOutput('changed_repos', JSON.stringify({ + repo: inputRepo, + task: inputTask, + model: inputModel, + })); + return; + } + + // Auto-detect from changed files + let files = []; + if (context.eventName === 'pull_request') { 
+ const resp = await github.paginate( + github.rest.pulls.listFiles, + { owner: context.repo.owner, repo: context.repo.repo, pull_number: context.issue.number } + ); + files = resp.map(f => f.filename); + } else { + const resp = await github.rest.repos.compareCommits({ + owner: context.repo.owner, repo: context.repo.repo, + base: context.payload.before, head: context.payload.after, + }); + files = resp.data.files.map(f => f.filename); + } + + // Extract unique repos from changed paths + const repos = new Set(); + for (const f of files) { + const m = f.match(/^flagos-user-tests\/tests\/([^/]+)\//); + if (m && m[1] !== 'experimental') repos.add(m[1]); + } + + if (repos.size === 0) { + core.setOutput('changed_repos', JSON.stringify({ repo: '_none_' })); + } else { + core.setOutput('changed_repos_list', JSON.stringify([...repos])); + } +}; diff --git a/.github/workflows/nightly_integration.yml b/.github/workflows/nightly_integration.yml new file mode 100644 index 0000000..9b51668 --- /dev/null +++ b/.github/workflows/nightly_integration.yml @@ -0,0 +1,94 @@ +name: Nightly Integration Test - User Tests + +on: + schedule: + - cron: "0 2 * * *" + workflow_dispatch: + +defaults: + run: + working-directory: flagos-user-tests + +jobs: + discover-cases: + runs-on: ubuntu-latest + outputs: + matrix: ${{ steps.resolve.outputs.matrix }} + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: "3.10" + + - name: Install dependencies + run: pip install pyyaml + + - name: Discover all test cases and resolve runner labels + id: resolve + working-directory: flagos-user-tests + run: | + python3 -c " + import json, os, sys + sys.path.insert(0, 'tools') + from run_user_tests import list_test_resources + from pathlib import Path + + root = Path('.') + resources_list = list_test_resources(root) + + matrix_entries = [] + for entry in resources_list: + matrix_entries.append({ + 'case_path': entry['case_path'], 
+ 'runner_labels': json.dumps(entry['runner_labels']), + }) + + if not matrix_entries: + matrix_entries.append({ + 'case_path': '_none_', + 'runner_labels': json.dumps(['ubuntu-latest']), + }) + + matrix = {'include': matrix_entries} + output = json.dumps(matrix) + print(f'Matrix: {output}') + with open(os.environ['GITHUB_OUTPUT'], 'a') as f: + f.write(f'matrix={output}\n') + " + + run-tests: + needs: discover-cases + if: ${{ !contains(needs.discover-cases.outputs.matrix, '_none_') }} + runs-on: ${{ fromJson(matrix.runner_labels) }} + strategy: + fail-fast: false + matrix: ${{ fromJson(needs.discover-cases.outputs.matrix) }} + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - name: Install runner dependencies + run: pip install pyyaml + + - name: Run test case + run: python tools/run_user_tests.py --case ${{ matrix.case_path }} + + notify: + needs: run-tests + if: always() + runs-on: ubuntu-latest + steps: + - name: Generate summary + run: | + echo "## Nightly Integration Test Summary" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "Run: ${{ github.run_number }}" >> $GITHUB_STEP_SUMMARY + echo "Date: $(date -u '+%Y-%m-%d %H:%M UTC')" >> $GITHUB_STEP_SUMMARY diff --git a/.github/workflows/pr_validation.yml b/.github/workflows/pr_validation.yml new file mode 100644 index 0000000..0083b57 --- /dev/null +++ b/.github/workflows/pr_validation.yml @@ -0,0 +1,85 @@ +name: PR Validation - User Tests + +on: + pull_request: + branches: [main] + paths: + - "flagos-user-tests/**" + +defaults: + run: + working-directory: flagos-user-tests + +jobs: + validate: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: "3.10" + + - name: Install dependencies + run: pip install pyyaml jsonschema + + # Get the actual list of changed files in the PR 
(github.event.pull_request.changed_files is just a count) + - name: Get changed files + id: changed + uses: actions/github-script@v7 + with: + script: | + const files = await github.paginate( + github.rest.pulls.listFiles, + { owner: context.repo.owner, repo: context.repo.repo, pull_number: context.issue.number } + ); + const changed = files + .map(f => f.filename) + .filter(f => f.startsWith('flagos-user-tests/')) + .map(f => f.replace('flagos-user-tests/', '')); + core.setOutput('files', changed.join(',')); + + # Step 1: Schema validation — only validate changed files + - name: Validate YAML/JSON Schema + run: | + python tools/validators/validate_config.py \ + --changed-files "${{ steps.changed.outputs.files }}" + + # Step 2: Required fields check + - name: Check Required Fields + run: python tools/validators/lint_test_case.py --strict + + # Step 3: Gold values format validation + - name: Validate Gold Values + run: python tools/validators/validate_gold_values.py + + # Step 4: Documentation completeness check + - name: Check Documentation + run: | + errors=0 + # Skip sub-config directories (conf/train/data etc.) + SUB_CONFIG_DIRS="conf train inference data" + for dir in $(find tests -mindepth 3 -maxdepth 5 -type d); do + dirname=$(basename "$dir") + # Skip sub-config directories + skip=false + for sub in $SUB_CONFIG_DIRS; do + if [ "$dirname" = "$sub" ]; then skip=true; break; fi + done + if [ "$skip" = "true" ]; then continue; fi + + # If the directory contains .yaml files, check for README.md + if ls "$dir"/*.yaml 1>/dev/null 2>&1; then + if [ ! 
-f "$dir/README.md" ]; then + echo "ERROR: Missing README.md in $dir" + errors=$((errors + 1)) + fi + fi + done + if [ $errors -gt 0 ]; then + echo "Found $errors test case directories without README.md" + exit 1 + fi + echo "All test case directories have README.md" diff --git a/.github/workflows/test_dispatch.yml b/.github/workflows/test_dispatch.yml new file mode 100644 index 0000000..185a9b8 --- /dev/null +++ b/.github/workflows/test_dispatch.yml @@ -0,0 +1,104 @@ +name: Test Dispatch - User Tests + +on: + push: + branches: [main] + paths: + - "flagos-user-tests/tests/**" + pull_request: + branches: [main] + paths: + - "flagos-user-tests/tests/**" + workflow_dispatch: + inputs: + repo: + description: "Target repository (e.g., flagscale, flaggems)" + required: false + type: string + task: + description: "Task type (train/inference/hetero_train)" + required: false + type: string + model: + description: "Model name (e.g., mixtral, deepseek)" + required: false + type: string + case: + description: "Specific test case YAML path (relative to flagos-user-tests/)" + required: false + type: string + +defaults: + run: + working-directory: flagos-user-tests + +jobs: + detect-changes: + runs-on: ubuntu-latest + outputs: + matrix: ${{ steps.resolve.outputs.matrix }} + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: "3.10" + + - name: Install dependencies + run: pip install pyyaml + + - name: Detect changed repos + id: detect + uses: actions/github-script@v7 + env: + INPUT_CASE: ${{ inputs.case }} + INPUT_REPO: ${{ inputs.repo }} + INPUT_TASK: ${{ inputs.task }} + INPUT_MODEL: ${{ inputs.model }} + with: + script: | + const run = require('./.github/scripts/detect_changed_repos.js'); + await run({ github, context, core }); + + - name: Resolve resources to matrix + id: resolve + working-directory: flagos-user-tests + run: | + python tools/resolve_matrix.py \ + 
--changed-cases '${{ steps.detect.outputs.changed_cases }}' \ + --changed-repos '${{ steps.detect.outputs.changed_repos }}' \ + --changed-repos-list '${{ steps.detect.outputs.changed_repos_list }}' + + run-tests: + needs: detect-changes + if: ${{ needs.detect-changes.outputs.matrix != '' && !contains(needs.detect-changes.outputs.matrix, '_none_') }} + strategy: + fail-fast: false + matrix: ${{ fromJson(needs.detect-changes.outputs.matrix) }} + runs-on: ${{ fromJson(matrix.runner_labels) }} + container: + image: ${{ matrix.container_image }} + options: ${{ matrix.container_options }} + volumes: ${{ fromJson(matrix.container_volumes) }} + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Install runner dependencies + run: pip install pyyaml + + - name: Run user tests + run: | + ARGS="" + if [ -n "${{ matrix.case_path }}" ]; then + ARGS="--case ${{ matrix.case_path }}" + else + ARGS="--repo ${{ matrix.repo }}" + [ -n "${{ matrix.task }}" ] && ARGS="$ARGS --task ${{ matrix.task }}" + [ -n "${{ matrix.model }}" ] && ARGS="$ARGS --model ${{ matrix.model }}" + fi + python tools/run_user_tests.py $ARGS diff --git a/flagos-user-tests/CONTRIBUTING.md b/flagos-user-tests/CONTRIBUTING.md new file mode 100644 index 0000000..2c606b2 --- /dev/null +++ b/flagos-user-tests/CONTRIBUTING.md @@ -0,0 +1,76 @@ +# Contributing to FlagOS User Tests + +Thank you for contributing test cases to the FlagOS ecosystem! + +## How to Submit a Test Case + +### Step 1: Generate a Template + +Use the built-in generator to create a properly structured test case: + +```bash +# FlagScale training test case +python tools/generators/create_test_template.py \ + --repo flagscale \ + --type train \ + --model \ + --name + +# Other repositories +python tools/generators/create_test_template.py \ + --repo \ + --name +``` + +### Step 2: Complete the Test Case + +1. **Edit the YAML config** with your actual test parameters +2. **Add gold values** from a verified local run (JSON format) +3. 
**Complete the README.md** with: + - Description of what the test validates + - Environment requirements (GPU, CUDA, Python) + - Manual execution instructions + +### Step 3: Validate Locally + +```bash +python tools/validators/validate_config.py +python tools/validators/validate_gold_values.py +python tools/validators/lint_test_case.py --strict +``` + +### Step 4: Submit a Pull Request + +1. Fork this repository +2. Create a feature branch: `git checkout -b add-test//` +3. Add your test case files +4. Commit and push +5. Open a Pull Request using the provided template + +## Test Case Requirements + +- Each test case must be in its own directory +- Each directory must contain: + - At least one `.yaml` configuration file + - A `README.md` with test documentation + - Gold values JSON file (for regression tests) +- No sensitive data (tokens, passwords, private paths) in any files +- YAML must pass schema validation +- Gold values must contain numeric arrays + +## Code Review + +- PRs are reviewed by the respective team CODEOWNERS +- CI must pass before merge +- At least one approval from a maintainer is required + +## Experimental Test Cases + +If your test case covers a new or unstable feature: +- Place it under `tests/experimental/` +- It will only run in nightly integration tests +- It will not block PR merges + +## Questions? + +Open an issue using the "New Test Case" template or contact the DevOps team. diff --git a/flagos-user-tests/README.md b/flagos-user-tests/README.md new file mode 100644 index 0000000..9ae4f32 --- /dev/null +++ b/flagos-user-tests/README.md @@ -0,0 +1,69 @@ +# FlagOS User Tests + +User-perspective test cases for FlagOS repositories. Each test case defines its own setup, run, and verification — exactly as a real user would operate. + +## How It Works + +``` +User submits test case YAML: + setup: [pip install flagscale] + run: [flagscale train mixtral --config ./conf/xxx.yaml] + verify: {log_path: ..., gold_values_path: ...} + +CI runner: + 1. 
cd + 2. Execute setup commands + 3. Execute run commands + 4. Extract metrics from log + 5. Compare against gold values → PASS/FAIL +``` + +Users have full control — the runner does NOT call internal repo scripts. + +## Quick Start + +```bash +# Generate template +python tools/generators/create_test_template.py \ + --repo flagscale --type train --model llama2 --name tp2_pp1 + +# Validate +python tools/validators/validate_config.py +python tools/validators/validate_gold_values.py +python tools/validators/lint_test_case.py --strict + +# Run locally +python tools/run_user_tests.py \ + --case tests/flagscale/train/llama2/tp2_pp1/tp2_pp1.yaml +``` + +See [docs/getting_started.md](docs/getting_started.md) for the full guide. + +## Test Case Structure (FlagScale Example) + +``` +tests/flagscale/train/mixtral/tp2_pp1_ep2/ +├── tp2_pp1_ep2.yaml # Test case: setup → run → verify +├── conf/ # FlagScale configs (user provides) +│ ├── tp2_pp1_ep2.yaml +│ └── train/tp2_pp1_ep2.yaml +├── gold_values/ # Expected metrics +│ └── tp2_pp1_ep2.json +└── README.md +``` + +## Supported Repositories + +FlagScale, FlagGems, FlagCX, FlagTree, vLLM-FL, vLLM-plugin-FL, TE-FL, Megatron-LM-FL + +## CI Workflows (in `../.github/workflows/`) + +| Workflow | Trigger | Description | +|---|---|---| +| PR Validation | Pull Request | Format, lint, gold values checks | +| Test Dispatch | Push/PR | Run user-defined setup → run → verify | +| Nightly | Daily 02:00 UTC | All test cases | + +## Contributing + +See [CONTRIBUTING.md](CONTRIBUTING.md). diff --git a/flagos-user-tests/docs/getting_started.md b/flagos-user-tests/docs/getting_started.md new file mode 100644 index 0000000..1025909 --- /dev/null +++ b/flagos-user-tests/docs/getting_started.md @@ -0,0 +1,99 @@ +# Getting Started + +## Overview + +`flagos-user-tests` manages **user-perspective** test cases for FlagOS repositories. Each test case defines its own setup, run, and verification commands — exactly as a real user would operate. 
+ +## Quick Start + +### 1. Generate a template + +```bash +# FlagScale training test +python tools/generators/create_test_template.py \ + --repo flagscale --type train --model llama2 --name tp2_pp1 + +# Generic test +python tools/generators/create_test_template.py \ + --repo flaggems --name my_operator_test +``` + +### 2. Edit the generated files + +The test case YAML defines the user workflow: + +```yaml +# tests/flagscale/train/llama2/tp2_pp1/tp2_pp1.yaml +meta: + repo: flagscale + task: train + model: llama2 + case: tp2_pp1 + description: "LLaMA2 training with TP=2, PP=1" + +resources: + gpu: A100-80GB + gpu_count: 8 + +setup: + - pip install flagscale # user installs the package + +run: + - flagscale train llama2 --config ./conf/tp2_pp1.yaml # user runs training + +verify: + log_path: ".../stdout.log" # where to find output + gold_values_path: ./gold_values/tp2_pp1.json # expected metrics +``` + +Also edit the FlagScale config files (`conf/*.yaml`) and fill in gold values from a verified run. + +### 3. Validate locally + +```bash +python tools/validators/validate_config.py +python tools/validators/validate_gold_values.py +python tools/validators/lint_test_case.py --strict +``` + +### 4. Run locally (optional) + +```bash +python tools/run_user_tests.py \ + --case tests/flagscale/train/llama2/tp2_pp1/tp2_pp1.yaml +``` + +### 5. Submit a PR + +CI will automatically: +1. Validate format (PR Validation workflow) +2. Run your test case on real hardware (Test Dispatch workflow) + +## How the Runner Works + +`run_user_tests.py` is a **generic executor**: + +``` +┌─────────────┐ ┌──────────────────────────────────────┐ +│ Test Case │ ──▶ │ 1. cd │ +│ YAML │ │ 2. Execute setup commands │ +│ │ │ 3. Execute run commands │ +│ │ │ 4. Find log file (glob pattern) │ +│ │ │ 5. Extract metrics from log │ +│ │ │ 6. Compare against gold values │ +└─────────────┘ └──────────────────────────────────────┘ +``` + +It does **not** call any internal repo scripts. 
Users have full control over: +- What to install (`setup`) +- How to run (`run`) +- What to verify (`verify`) +- Machine requirements (`resources`) — mapped to runner labels via `resource_map.yaml` + +## CI Workflows + +| Workflow | Trigger | Description | +|---|---|---| +| PR Validation | Pull Request | Format/lint/gold-values checks | +| Test Dispatch | Push to main / PR | Runs user-defined setup → run → verify | +| Nightly | Daily 02:00 UTC | All test cases across all repos | diff --git a/flagos-user-tests/docs/test_format_spec.md b/flagos-user-tests/docs/test_format_spec.md new file mode 100644 index 0000000..a916e0d --- /dev/null +++ b/flagos-user-tests/docs/test_format_spec.md @@ -0,0 +1,177 @@ +# Test Format Specification + +## Core Concept: User-Perspective Test Cases + +Every test case is a **self-contained YAML file** that defines the complete workflow from a **user's perspective**: + +```yaml +meta: # What is this test? +resources: # Hardware requirements (platform, device, device_count) +setup: # How to install? (user's commands) +run: # How to run? (user's commands) +verify: # How to check? (gold values comparison) +``` + +The runner (`run_user_tests.py`) simply executes these user-defined commands. It does NOT call any internal repo scripts — giving users full control and matching real usage scenarios. 
+ +## Test Case YAML Format + +### Complete Example (FlagScale) + +```yaml +meta: + repo: flagscale + task: train + model: mixtral + case: tp2_pp1_ep2 + description: "Mixtral MoE training with TP=2, PP=1, EP=2" + +resources: + platform: cuda + device: A100-80GB + device_count: 8 + +env: + CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7" + +setup: + - pip install flagscale + +run: + - flagscale train mixtral --config ./conf/tp2_pp1_ep2.yaml + +verify: + log_path: "tests/functional_tests/train/mixtral/test_results/tp2_pp1_ep2/logs/details/host_0_localhost/*/default_*/attempt_0/*/stdout.log" + gold_values_path: ./gold_values/tp2_pp1_ep2.json + rtol: 1e-5 + atol: 0 +``` + +### Complete Example (Generic) + +```yaml +meta: + repo: flaggems + case: my_operator_test + description: "Test custom operator correctness" + +setup: + - pip install flaggems + +run: + - pytest -v tests/test_my_operator.py + +# No verify step — pytest exit code determines pass/fail +``` + +### Field Reference + +| Field | Type | Required | Description | +|---|---|---|---| +| `meta.repo` | string | Yes | Target FlagOS repository name | +| `meta.task` | string | No | Task type (train/inference/hetero_train) | +| `meta.model` | string | No | Model name | +| `meta.case` | string | No | Case name (for filtering) | +| `meta.description` | string | Yes | What this test validates | +| `resources` | object | No | Hardware requirements | +| `resources.platform` | string | No | Chip platform: `cuda`, `metax`, `ascend` (default: `cuda`) | +| `resources.device` | string | No | Device type (e.g. `A100-40GB`, `C500`, `Ascend910B`) | +| `resources.device_count` | int | No | Number of devices required | +| `env` | object | No | Environment variables | + +### Resource Resolution + +The `resources` field drives CI decisions via `resource_map.yaml` (platform-based): + +1. **Runner selection**: `resources.platform` + `resources.device` -> platform-specific runner labels +2. 
**Container image**: `resources.platform` + `meta.repo/task` -> platform-specific Docker image +3. **Container options**: `resources.platform` -> device passthrough flags (`--gpus all`, `--device /dev/davinci_all`, etc.) + +Supported platforms: + +| Platform | Vendor | Devices | Status | +|---|---|---|---| +| `cuda` | NVIDIA | A100, H100, H800 | Active | +| `metax` | MetaX (Muxi) | C500 | Planned | +| `ascend` | Huawei | Ascend910B, Ascend910C | Planned | + +The test job runs inside the platform-resolved Docker container with device access. + +### Field Reference (continued) + +| Field | Type | Required | Description | +|---|---|---|---| +| `setup` | list[str] | No | Shell commands for environment setup | +| `run` | list[str] | Yes | Shell commands to execute the test | +| `verify.log_path` | string | No | Path to output log (supports glob patterns) | +| `verify.gold_values_path` | string | No | Path to gold values JSON file | +| `verify.gold_values` | object | No | Inline gold values (alternative to file) | +| `verify.rtol` | float | No | Relative tolerance (default: 1e-5) | +| `verify.atol` | float | No | Absolute tolerance (default: 0) | + +### Working Directory + +All commands execute with the **test case directory** as the working directory. So `./conf/tp2_pp1_ep2.yaml` resolves relative to where the test case YAML lives. + +## Gold Values Format + +### Numeric (default) + +```json +{ + "lm loss:": { + "values": [11.17587, 11.16908, 10.41927] + } +} +``` + +- Keys are metric names extracted from log files +- Values are numeric arrays +- Comparison uses `rtol` / `atol` similar to `numpy.allclose` +- `log_path` supports glob patterns for timestamp directories + +### Text + +```json +{ + "inference_output": { + "type": "text", + "pattern": "output\\.outputs\\[0\\]\\.text=(?:\"(.+?)\"$|'(.+?)'$)", + "values": [ + " Lina. I'm a 22-year", + " the same as the president of the United Nations." 
+ ] + } +} +``` + +- Set `"type": "text"` to enable text comparison +- `"pattern"` is a regex with capture group(s) to extract text from log lines + - If multiple groups (e.g. alternation), the first non-None group is used +- Values are compared with exact string match + +## FlagScale Test Case Directory Structure + +``` +tests/flagscale/train/mixtral/tp2_pp1_ep2/ +├── tp2_pp1_ep2.yaml # Test case definition (setup/run/verify) +├── conf/ +│ ├── tp2_pp1_ep2.yaml # FlagScale experiment config (Hydra) +│ └── train/ +│ └── tp2_pp1_ep2.yaml # Training parameters +├── gold_values/ +│ └── tp2_pp1_ep2.json # Expected metrics +└── README.md +``` + +The user runs: `pip install flagscale && flagscale train mixtral --config ./conf/tp2_pp1_ep2.yaml` + +## README Requirements + +Each test case directory must have a `README.md` with: +1. **Description** section +2. **Environment** section + +## Experimental Test Cases + +Place under `tests/experimental/` for gray-stage tests (nightly only, non-blocking). diff --git a/flagos-user-tests/repos.yaml b/flagos-user-tests/repos.yaml new file mode 100644 index 0000000..fdde8e1 --- /dev/null +++ b/flagos-user-tests/repos.yaml @@ -0,0 +1,45 @@ +# FlagOS target repository configuration +# +# Note: Each test case defines its own setup/run/verify workflow (user perspective). +# This file only records basic repository info for CI repo-level filtering and issue templates. 
+ +repositories: + flagscale: + url: https://github.com/FlagOpen/FlagScale.git + default_branch: main + description: Large-scale distributed training framework + + flaggems: + url: https://github.com/FlagOpen/FlagGems.git + default_branch: main + description: GPU-accelerated math library + + flagcx: + url: https://github.com/FlagOpen/FlagCX.git + default_branch: main + description: Cross-chip communication library + + flagtree: + url: https://github.com/FlagOpen/FlagTree.git + default_branch: main + description: Tree-structured computation library + + vllm-fl: + url: https://github.com/FlagOpen/vLLM-FL.git + default_branch: main + description: LLM inference engine + + vllm-plugin-fl: + url: https://github.com/FlagOpen/vLLM-plugin-FL.git + default_branch: main + description: vLLM plugin system + + te-fl: + url: https://github.com/FlagOpen/TransformerEngine-FL.git + default_branch: main + description: Transformer Engine + + megatron-lm-fl: + url: https://github.com/FlagOpen/Megatron-LM-FL.git + default_branch: main + description: Megatron-LM fork diff --git a/flagos-user-tests/resource_map.yaml b/flagos-user-tests/resource_map.yaml new file mode 100644 index 0000000..6cc25a1 --- /dev/null +++ b/flagos-user-tests/resource_map.yaml @@ -0,0 +1,99 @@ +# Maps test case resource requirements to GitHub Actions runner labels and container images. +# +# Architecture: platform-based multi-vendor support +# resources.platform -> platforms. -> runner labels, container images, options +# +# Example test case YAML: +# resources: +# platform: cuda +# device: A100-40GB +# device_count: 1 +# +# Resolution chain: +# 1. resources.platform -> platforms.cuda +# 2. platforms.cuda.device_labels["A100-40GB"] -> runner labels +# 3. platforms.cuda.container_images["flagscale/inference"] -> Docker image +# 4. 
platforms.cuda.container_options -> Docker runtime flags + +# ============================================================================= +# Platforms: each vendor/chip family is a platform +# ============================================================================= +platforms: + + # --------------------------------------------------------------------------- + # NVIDIA CUDA platform + # --------------------------------------------------------------------------- + cuda: + description: "NVIDIA CUDA GPUs (A100, H100, H800, etc.)" + + # Device type -> self-hosted runner labels + device_labels: + A100-40GB: [self-hosted, Linux, X64, gpu-a100-40gb] + A100-80GB: [self-hosted, Linux, X64, gpu-a100-80gb] + H100-80GB: [self-hosted, Linux, X64, gpu-h100-80gb] + H800-80GB: [self-hosted, Linux, X64, gpu-h800-80gb] + + # Default runner labels when device type not found + default_labels: [self-hosted, Linux, X64] + + # Container images: "/" -> Docker image + container_images: + flagscale/train: "localhost:5000/flagscale-train:dev-cu128-py3.12-20260228210721" + flagscale/inference: "localhost:5000/flagscale-inference:dev-cu128-py3.12-20260302102033" + flagscale/hetero_train: "localhost:5000/flagscale-train:dev-cu128-py3.12-20260228210721" + + # Container runtime options + container_options: "--gpus all --shm-size=500g --user root --ulimit nofile=65535:65535" + + # Container volume mounts (host:container) + container_volumes: + - /home/flagscale_cicd/docker/docker_build/docker_data:/home/gitlab-runner/data + - /home/flagscale_cicd/docker/docker_build/docker_tokenizers:/home/gitlab-runner/tokenizers + + # --------------------------------------------------------------------------- + # MetaX (Muxi) platform — placeholder for future integration + # --------------------------------------------------------------------------- + metax: + description: "MetaX (Muxi) GPUs (C500, etc.)" + + device_labels: + C500: [self-hosted, Linux, X64, metax-c500] + + default_labels: [self-hosted, 
Linux, X64, metax] + + container_images: {} + # flagscale/train: "registry.example.com/flagscale-train:metax-..." + # flagscale/inference: "registry.example.com/flagscale-inference:metax-..." + + container_options: "--device /dev/mxgpu_all --shm-size=500g --user root" + + container_volumes: [] + + # --------------------------------------------------------------------------- + # Ascend (Huawei) platform — placeholder for future integration + # --------------------------------------------------------------------------- + ascend: + description: "Huawei Ascend NPUs (910B, 910C, etc.)" + + device_labels: + Ascend910B: [self-hosted, Linux, aarch64, ascend-910b] + Ascend910C: [self-hosted, Linux, aarch64, ascend-910c] + + default_labels: [self-hosted, Linux, aarch64, ascend] + + container_images: {} + # flagscale/train: "registry.example.com/flagscale-train:ascend-..." + + container_options: "--device /dev/davinci_all --shm-size=500g --user root" + + container_volumes: [] + +# ============================================================================= +# Global defaults +# ============================================================================= + +# Default platform when resources.platform is not specified +default_platform: cuda + +# Fallback runner labels when nothing matches +default_labels: [self-hosted] diff --git a/flagos-user-tests/tests/flagcx/.gitkeep b/flagos-user-tests/tests/flagcx/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/flagos-user-tests/tests/flaggems/.gitkeep b/flagos-user-tests/tests/flaggems/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/flagos-user-tests/tests/flagscale/hetero_train/.gitkeep b/flagos-user-tests/tests/flagscale/hetero_train/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/flagos-user-tests/tests/flagscale/inference/qwen3/demo_0_6b/.gitignore b/flagos-user-tests/tests/flagscale/inference/qwen3/demo_0_6b/.gitignore new file mode 100644 index 0000000..2301c87 --- /dev/null 
+++ b/flagos-user-tests/tests/flagscale/inference/qwen3/demo_0_6b/.gitignore @@ -0,0 +1,2 @@ +FlagScale/ +outputs/ diff --git a/flagos-user-tests/tests/flagscale/inference/qwen3/demo_0_6b/README.md b/flagos-user-tests/tests/flagscale/inference/qwen3/demo_0_6b/README.md new file mode 100644 index 0000000..b38f61b --- /dev/null +++ b/flagos-user-tests/tests/flagscale/inference/qwen3/demo_0_6b/README.md @@ -0,0 +1,25 @@ +# demo_0_6b + +## Description + +FlagScale inference demo using Qwen3-0.6B model with vLLM backend. +Runs 4 prompts with greedy decoding (temperature=0, max_tokens=10) and verifies output text against gold values. + +## Environment + +- GPU: 1x A100 40GB +- CUDA: 12.1+ +- Python: 3.12 +- vLLM: 0.10.1.dev + +## How to Run + +```bash +git clone https://github.com/FlagOpen/FlagScale.git && cd FlagScale && pip install . +flagscale inference qwen3 --config ./conf/demo_0_6b.yaml +``` + +## Gold Values + +Uses text-type gold values to verify inference output. +Greedy decoding (temperature=0) produces deterministic output, so text comparison is exact match. 
diff --git a/flagos-user-tests/tests/flagscale/inference/qwen3/demo_0_6b/conf/demo_0_6b.yaml b/flagos-user-tests/tests/flagscale/inference/qwen3/demo_0_6b/conf/demo_0_6b.yaml new file mode 100644 index 0000000..0f15416 --- /dev/null +++ b/flagos-user-tests/tests/flagscale/inference/qwen3/demo_0_6b/conf/demo_0_6b.yaml @@ -0,0 +1,27 @@ +defaults: + - _self_ + - inference: demo_0_6b + +experiment: + exp_name: qwen3 + exp_dir: ./outputs/${experiment.exp_name} + task: + type: inference + backend: vllm + entrypoint: flagscale/inference/inference_llm.py + runner: + hostfile: null + cmds: + before_start: source /root/miniconda3/bin/activate flagscale-inference + envs: + VLLM_PLUGINS: "fl" + VLLM_USE_FLASHINFER_SAMPLER: 0 + VLLM_LOGGING_LEVEL: "INFO" + CUDA_VISIBLE_DEVICES: 0 + CUDA_DEVICE_MAX_CONNECTIONS: 1 + +action: run + +hydra: + run: + dir: ${experiment.exp_dir}/hydra diff --git a/flagos-user-tests/tests/flagscale/inference/qwen3/demo_0_6b/conf/inference/demo_0_6b.yaml b/flagos-user-tests/tests/flagscale/inference/qwen3/demo_0_6b/conf/inference/demo_0_6b.yaml new file mode 100644 index 0000000..d941f2b --- /dev/null +++ b/flagos-user-tests/tests/flagscale/inference/qwen3/demo_0_6b/conf/inference/demo_0_6b.yaml @@ -0,0 +1,18 @@ +llm: + model: /share/project/models/Qwen/Qwen3-0.6B + trust_remote_code: true + tensor_parallel_size: 1 + pipeline_parallel_size: 1 + gpu_memory_utilization: 0.9 + seed: 1234 + +generate: + prompts: [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", + ] + sampling: + max_tokens: 10 + temperature: 0.0 diff --git a/flagos-user-tests/tests/flagscale/inference/qwen3/demo_0_6b/demo_0_6b.yaml b/flagos-user-tests/tests/flagscale/inference/qwen3/demo_0_6b/demo_0_6b.yaml new file mode 100644 index 0000000..7f997c7 --- /dev/null +++ b/flagos-user-tests/tests/flagscale/inference/qwen3/demo_0_6b/demo_0_6b.yaml @@ -0,0 +1,38 @@ +meta: + repo: flagscale + task: inference + model: qwen3 + 
case: demo_0_6b + description: > + Qwen3-0.6B inference demo using vLLM backend with FlagScale CLI. + Runs 4 prompts with greedy decoding (temperature=0) and verifies output text. + +resources: + platform: cuda + device: A100-40GB + device_count: 1 + +env: + CUDA_VISIBLE_DEVICES: "0" + VLLM_PLUGINS: "fl" + VLLM_USE_FLASHINFER_SAMPLER: "0" + VLLM_LOGGING_LEVEL: "INFO" + CUDA_DEVICE_MAX_CONNECTIONS: "1" + +setup: + - git clone https://github.com/FlagOpen/FlagScale.git && cd FlagScale && pip install . + +run: + - flagscale inference qwen3 --config ./conf/demo_0_6b.yaml + - | + pid_file="./outputs/qwen3/inference_logs/pids/host_0_localhost.pid" + if [ -f "$pid_file" ]; then + pid=$(cat "$pid_file") + echo "Waiting for inference process $pid to complete..." + while kill -0 "$pid" 2>/dev/null; do sleep 2; done + echo "Inference process completed." + fi + +verify: + log_path: "./outputs/qwen3/inference_logs/host_0_localhost.output" + gold_values_path: ./gold_values/demo_0_6b.json diff --git a/flagos-user-tests/tests/flagscale/inference/qwen3/demo_0_6b/gold_values/demo_0_6b.json b/flagos-user-tests/tests/flagscale/inference/qwen3/demo_0_6b/gold_values/demo_0_6b.json new file mode 100644 index 0000000..2a28edd --- /dev/null +++ b/flagos-user-tests/tests/flagscale/inference/qwen3/demo_0_6b/gold_values/demo_0_6b.json @@ -0,0 +1,12 @@ +{ + "inference_output": { + "type": "text", + "pattern": "output\\.outputs\\[0\\]\\.text=(?:\"(.+?)\"$|'(.+?)'$)", + "values": [ + " Lina. I'm a 22-year", + " the same as the president of the United Nations.", + " Paris. 
The capital of France is also the capital", + " not just a technological challenge but a profound transformation of" + ] + } +} diff --git a/flagos-user-tests/tests/flagscale/train/.gitkeep b/flagos-user-tests/tests/flagscale/train/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/flagos-user-tests/tests/flagtree/.gitkeep b/flagos-user-tests/tests/flagtree/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/flagos-user-tests/tests/megatron-lm-fl/.gitkeep b/flagos-user-tests/tests/megatron-lm-fl/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/flagos-user-tests/tests/te-fl/.gitkeep b/flagos-user-tests/tests/te-fl/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/flagos-user-tests/tests/vllm-fl/.gitkeep b/flagos-user-tests/tests/vllm-fl/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/flagos-user-tests/tests/vllm-plugin-fl/.gitkeep b/flagos-user-tests/tests/vllm-plugin-fl/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/flagos-user-tests/tools/__pycache__/run_user_tests.cpython-312.pyc b/flagos-user-tests/tools/__pycache__/run_user_tests.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..05fdd462b2ee932f1644c0ed136243d22786ae4d GIT binary patch literal 22466 zcmb_^Yj7LKncxiGPXZub1Rvsvq(p)uAyJZLi_v2xQWEvBWQ($6$&$Gc1|&g&06har zA_6ANR<;5rsuNjuE#Y&#hBnR_K6@`rZSKNcU2Wv#cGp+AAE3cR(1cf|tX;L6xTlak{7b>#v_*_xEc2+-|o}@YtLGbL^QRiuwpYD=|aVHfmsv-_TvLtMW}zD{GKpAm0r6b!Lj~W-U=0YlS~s)WO=}&k%J+T?(WI zv5#8Vx^L*O)U&%-hn&MY0iz4bY0^fh-w<_=n%H`P@&J@OYGOU`SIsl9UdZ#ZJ}BXb z94sxCX@Hja%oMbxhYdhZBjkwXnv__x62m^2q&BqxMnAg^%3z;$Y%5@DU>T@kJHd|Q z&<1G%wjI($IBC2frvq{tNo`n96~YR=-2r(`sx-v36VjUhMeVdn?d*hhwy<5_pss9- zwz9z+T@=NcG*ookyVA0tsTs|TCfGaOsHLcj(+Ospk8-{I^!3SDDivj!RFqFKkuV?S znebRRmVmF5@$lG%VTOxNCHYt?$;G04$YMDWj*M5KnHaz`d^A4V8%ZWoP!MokKKArE z20lk)V@zs1oMPDMC}i;c7KZ5s;MDX~KQo@Z#iWu9))bD%0WL~vV!{a);HRPqHkyb) 
zl}s15w3BD1q8vXJjih2Xqd|g*n@%WbqPL@w=~R@UAzj(B)XgXt8=X-KM#iI&8&ESp z9ZyxeK9-EL%*}9oI?7utm&T!!P+20%F??)tDn7$V)$!seirr(O%afDPe4hE+xmTGO zbUO*nKY!s8jB7k@2`6Ttx2Y(X2*<^)l7{jTE;g0oLky|rMl?FbSBDo`lp=je?Ai za?loBGGDBAgk+R@zYyYGsW5+I*JP57#{V^SK$j5#L%Ikb1Q8>3)$$u8m-9hd&^;4Q zjSKoXu3N!68BKAq$jFW83@;csSioE&0^6+em>~vqMLiy(ar4d4gVbG3L=&STnrnKB zn$=`9W7I2}J4PKftA#IWkh*po(sc-V?PZARQ&3WTCe)avw2XcN@N9lq`dz7%9L^dt zhSw=p^M+>DsKRL2T#}+P#*FS=sis*|#>CTiPME2THlxQaWdeJ&=}B4Tgp6szF2}1O ztCd@^Ii#sOypFn))&c%u7!w-)W;M5{b}A(wTT}2?cKLuEF#eT>EKnqyZ

x?bGXd+q2| zcGuMqyt})O_P-cCaZOGP9u58mCpKsjG+b0LM1d-&Amn&hhEsxpkA}I(xL^oR;XWzo zN8`zGO3;sS$>}M<@;vTaCpa$21PVLb~*5jiUur^0X& za0x*VLlqh4aRrcm;p=y&ck|<6F1mY)OHRNhygMG7nNE*SCwG%mVoKaRl6)^YR>=0U zdy73sXpD-JIwDRcKZ4UZLZ*;wgi3e7AO9Y_=BQ8VsMhXMUH5A@a<1&l3vKh$xf^q* zign!|IlWokuiV?R1D_fwe_$!L7|Uzlw%oH6_I=k;^6klNOwVZu}`j!<)mr4f_SsBiS$3AhJY_!49FK)>`zhQ2MNv>>g*y?xC%XsRWK}wFX8fAZgU1g5G^QDYtLfHdkjIfg!)O6}^n zd>F{Ott+(aXs|8QNy5&cxEDmE2_l1t)4dP=9`A(LS&GI3gt`V(G)re_7h1cIx|2Rh#{R8)oS> zA{b$Vor=Yy9G(S&4k}Fv1~O(mhueD4DDFPE`Z+hkn4}Uc=fSiFOw+;^%H#TkT?QYq z7yd1p&UFIh5%}Y;!)uPJ)KL!C+^Kbk>u!2Jo$D+)+UADH=1_JvEIAe(d0ol5BdZsz z&X4PTtNwxf6K@~7cj&DnCI7(vOQrfV*@3d7e*SA^r+=w#u`b8|!r5N2Bf&cf!9Rvt zDA;Htsp)WB&>>$a=(to8;$e74aC?;>7eokAr{GP98YCY;DRnE8f`uGSX9>IoZRctbhT zK_WEH>hFBHx|L+iGTE!rOmfzSpTYRND+`%NrRB zYbBCSETwE{DkjC8G09^lw+fi8kTlj-GUB#bd&ZWrV{2ifbZpUH8)>gSV};h$Wo)l& zSVzYGy5ip%h5j9Ef3-){NpQGo_l>Q8Loc_Hh81zg`Y-6wmCZd$Da;x6Ee*sf z;|BIlqsq0*>1p5)(y-=k43cr^-l7lcad5*pP^<^WfnvQXoSSN7eXr}@u*mhu;{-j- z{3}PwziFiMfJdLvjpCux5WWJWHUQfa?)DTErc$ z7X+r;DT!$mlb8)i+Qy^tc(M&p+)8qBwoQ!;i?;PhnB>A6!#3&2#JagCo~pNEsd1)j z!#Np*<`66wi8KRZHpr+WQ(+#Cx*p)C`P4{q6gV^9Tx-ut{Y#w@YRFxCV9vOOLamxJedUxc4Gur z1heauo~F_C|vz?FP^v(|FMwl^)WS@Qb| zno`5Ati9}QU3zKprTodl>5}(A*79pdbJ^y(dtv@Uaogbi;rk~(a21=+uG!9&1KYA^ zAnRjmU3Ru)ZF$R`f3dW^??Y>!1YfT2Ekp{>RVaF&f8F1hJ6!Syi@xr{t{+BLc9jmC zEIx4xdGXWqr&_JOu41AZTM5EPR&UPtp|y3rW#?Ob;V{5#JC(w$6n! 
z4?Tg3i`q6sS3Hy}K)8<1LVLkmi2Sg9`9^8)@nYXVap3tP^W|SUUjTV!TW9Tdth;>+ zFX#Fer(fyM>+-4hBFo#~9lbZZ%Iq(>_t$D&ao3TRmscY9{U5kKz^0y~nRTW;e=Hxq zH}G9k!JYdm3`k%%j0d#UMA|AqTOFjW^;@>}{*fYc@t4lw$}mkecU7LJDVuNMcG22e z)U|S;Sp;S;GReX!kKAGfZ8UK+qGnl;mm_S@KSN}Ricu*D#iy2L<9l>QbDQQtClI5) zM@!Pp2ue38MdQKJ867YHns+5eil(Z^=mheno1To0)nw?5_C4*p5`%`4&;(JQZvs)F7Bfr{Q=2 z6fo(NEb73B!W89LaZeHV5*un5VLu^iQ0`gCeHH%rS$NG+j~qR(wk>rpb{8EzdH0_M z3L1#a4gJdD&T7|9_66O-@WP2~e9hFf?rK`vwzw^?E$m+&UKw9=oyr=@R@Vb-^QyHu zPnWFQ*KMw%r>o$55bR$K_7}Upw9;3sKfGo;Qg#O(xI0$e9Us~{Dptzrsn{r+U0k(h z)OErc2Z}u-oUAw47`H(x$CX4@?p0VdOH@@NM+3>G^{24(EXdkr813{-GYO{&fTRUr;wcetFE#ZUE{S%VWDhy_& z#29Fmh>=IJrpQV#*}5Xy)kIZxmE8s2I$Zq%{Rvo9`c)>CD$9pvkABhaVX3AP`$xZG z0I6oMpcqEe6sX5QkRY0Gr5J4g$!QMG4t32_@o)-T*UvoIk ztC)D82M{NkuMyb!nb2+mJ#s4=8yio7oliytXPCNMz16{#j7E~N2(U>;&d_d*Y+)xu zw35+Lse(bP1UZRdXhm)L=yU>Xn@KPZiq^T!HaJ)XBhsm>u4p$P;&-d;5?Gyzlb0(A z?N%kK5riiAWJ1(RVo@|GUbqOujQV&{1{TE%^^mAgsguD3B3n(`Hm1V}5MwGz$Y4)- z@`|ro=;NYI8deE$LO*J;CUxT#}?R`R(i0Za}W)DdbXv^;o9sYlK($-5zRRh2e zfYd7Jkcfb-Ld!IVs!)(~iJ3TUyhz9iL{OwF!8o0`k${DjHeMpcgrX@#(#EIJ9NEu+ z?F1uf;~)?ZNSuOIMMZ+ht)=a~y}isZ)&(o5H>jzySt2868$NOAxo0G1E^V_QIP#GY zrY#qrJ1^%rNY3-5p&}+wup);8tDE3VH&{^KMUF55lDPVQ%bf)0(k zf|(DGMn{ljxcy`ZP$&dO5sgO4_)QR9!TQAQgDk?>33Vbuc~OEqdlBE!64EG z_htOHkf9wRQiF-`h$uhUB@7beA7b$Vd=aL$9t+Yp!2%Z9Qg*c?E~K9k>^PT%B6;}Z zKZOl_j{5WuXi9I%9?#y)j%Rz`rU$ z*t*M0I8VCdYRejatEDV~ih;7#-|e68Ul`3@2i9|LuQ!9iSEu&J8{6*mb8p zJ9alYpDfk2u34BLx|d)0iEVF0?Q2TeClz)s(IF1=qKp z1Ri}Mv}W60c6+m@5!|T;?<#1Q-D~zIHh}l6*;*BFFMt>8m^IV(bw^8Xa(Vpzz?x&I zXc~g?ELwJ~>2{W_w%=Z+VUFfo|M_tjWexGj*8I>n)TI4u+Bei?_@GHYw8O~7A(i_> zc-0nE{}vc$nB90fMbgWzJQYJDeA zxwPr2ngu~{G2lOWglH;ZQ`KmER}t%H(6*s4V^xH#Sv9_3BjMHMRoZ1}Yjy0ZqhQ2w zMJI=e-#3Cd178f1W5FLRZUSc8T#J#evZPzLM((p5&H_`;w|pKxxrK5mImFUjL$x*) z>#B?nOT74$5~hTD)@>Q%XJ%ay-&lLbv@w_T-#Tl}SSQfvxanc*&m!luD*KKEwA0#`b<6RAy%g+wt)7k)~Y+>=3T0_?#nnqjpSF`)xa}P zpV}d}D`U?nb!8kG=OM_rb1_plfrigbPsS;a7+9WIu!GcbEh$tglLs)>DXsQo922|b zoNAa+iD+9;UIQCYuz0tCX=D{k+pI6uE8~auYct-A4+JMrsNh3HN&jp^rUAy?2{ai{ 
zjZJeVuyK{$sn=1e{FaQL)3DnTG~2onj-;E0!9#TzT#7acV46GKk~H#HOau?}2bYu?X=zBHW)~k|l%mm&AHRCMhH*qdx)%io+Nu8CR<9M$IBH zZ82czfAfFfNjHiZ&Y=g?rWU64klN4u1`jRJ>4j65z@4B@G7*i4z6{)6$mDk8YY)E2 zCtCBaJn7J-*Gh|H^Cz{*Yx2K|ID<#2-+tTBEBj3~)CIDLYD)NN3(@`R=3 z!kDc?p$5F1fSCoOeLQ)c8^vS;4~}6`NJ_;fqrmUS!P}BMi}j04Kc8Zu=fJH_#bYVa zA&Kz2NPEPbL;RBj!}Jsjwj5$42{u-_3z2z6gN|S(ONXc$%|vyIQEnXZaF{RZQ(;a8 zBQ_7D2Eh^m-_aN%=D9h9vEq~iZ=6Cd+>9TjNrFwXx)FCo(TGIsLwYg^9C}9(k0YT@ z1{Cc>)sYqqqM?sFi1lbE`7s_HuZZ|p^F_3~+!$iU*JJqOe+f$mEJMH@2Smke?pn#Z zYi{tN)lv3rf8g1%>e*4;`P52V$#XbserWZ4e4u~j;9n#k9C&v1z_V)yhI0-1(;o(U z*DhZxzBE?4Jf1yWb~it8x39X}*W5ckZf5fC*H0JBtId52W_8ZG(_3r+R4coRzLRUt zQ)Pbxu9L%bN z$u|}*mK=M4Gq$=3uX%RObgt}hLII~|Vc^>{xu;eg?ZD4l-FJ`9A6>IHm)%Xd@m2TE ztf`vzk!f4m+q`sj@#2$TcNo6WB^zZEHTEx)kZ zeYmvqNGWhMd*-3l3+?^R$$WpQIaJ_QoA;Ld`{su}M%{L(aJw*3tUtJB>xV|81aV@a zYoW1dXW zbb0Y|?q+_b}w*fcILv~k}s4!_0Zi&W@GoVwlH1X|4h;M?3(l9Lzs}p zzUAP`mrIQ&vL_ezee3KaTf+le>#D6aul)rm=G~OP{SzN$@~j8;EuUV=lmh3nb!AiC zUHiO!A+cuK@yOaBEvo19mrB-d5V}+;&z7v6bA#*t?!tlP{VVJTo;CmZRh?(vGsSn4c7Nh1HQ8z+Mjpm&xVX3{U?(t4je5g?kc{n!E1x>j}a1ZV|u(7r~+jP zmL@!>4!97NYcD965o*%|j&>5~BZqK0{tcXt;C445ot)|-wYb=-^DXbFT?$NHV7ue! zJ0Y{`RIPI9z|kA4oc;uk!KNppaO$%rR*(N=&H(s4MScA>QwmOY@saa69c###ZqiWu zWCnO=WEz2cra3Qgylxq~8nQ}F8Jehu={xfo&6dy#=a|t*(D{sZOK8Q{D5I62e*~sH zMcG?zMRja|zxr?JXYyJ8;?AtdU4SdIVqTo^RdL9%6gs<+XH=uOIzLuioh|9xCu4B) z0Zf5acX~_i1uCj=2(ebQ9-(fgVYl;S>s!uk%EIvuWiA?cCR(D}tWG zvkN@`z|$EH%~Tk?uF=(moWUHh)f5*&I7bZMl z{>TGs$Evj>KfGoQ7A}APrFULh`O1gA=)h3X)tS8;PNc6cel;I1`Mabe>0@_Gj(uxz z%^fT}zp{U|_bB)(mfilGVewFQ@S($<)4geW&6M|*8oCODA2#fP?5$EgInSF7uQlX5 zOM&h}^uxeDL}hc`JvVs$#gO~gOc(_La78hmX++aPW zKgW8i_)W_`1`)gl2mj=)39a>;g@zeg5*akj2rh|0E*+<{1QaKfNpJm zU~ONuwy#-tlpU_DdEE>Oveq>-1B{QuefO*LU(JO}4(3D0&irxIU2cD1W@P*qv z*zRhpbkr&{P_02pa88s;YQ>T`?{Ev`^r>iyk=uwq;CQi&-2G5@qetA@MqvxR9)>$z zaJfwK(HoJ=h6J-5G~^mq9c@KZTiM_K zz~8y*?<{upuN*J=j}~>Fhem&~xn~(T;DZ2J3(N(TAJ9)4@EM@pwecBL%kp#de!y#i$VN01X3(RVYq>4RTM>9wD_Sv_UFnf? 
zk-q{j;;xKEb^^i{f*a6pFN!7<)zPj@9In$-vDDD?b>Y1b0BCq2qoCrN4%B{@)=&7zcmH!pPkDbvKB;PnO(I z&7FH_^_LwjMN`Wo@Ae1Y9jji*Ij~Y!@}8PI4-|cOc78T@1&l=K2DNT&&D#p&CF_%- z2rZTaarXA&?IN>hIaKn4km4!ZeRr?TU&}oOwkLbn+{uSlSJ}~8G_`^>=M4}O(+KP{ zp1!%K%l?iB{_a(O_b>fD^Hvb+o-bSL?jD>!`0X#{cD~v3TF>jDlC=vQroQzw$cGKR z;2~&hF6x@d*0Mpm!EKKm1x~V`RNW;q%KZSadT44=Hn>hHl&!BG))`$!ksgSgqa5#0 zm-MjUf})f~il=Arh%utW71$|g&;n``bOts1HE&ITq*1LKboIO^Lvi(6z`~_Tr7ncs z0uH2}4RAo;hV&0Eeuln*s>94+GI9gBcGThE&1jB^O@cSIMA7F&>QyKdsa1g-A&@+# zwH-oc{mAY^XlXvu2DcEU6cUw|ylB9eeYxA(nAj-jBgOhqVQ{Rw<7A0Zg7?5SuR&mtOp+#gY4E(%RGp?!N$*JQCL{5CKcB zlk)78sp|JpPTB6d`@;MSMPFB;tz_?&=)?8vZEe|0po!SNFt~7F)zVnd%?*^ns(WVs z%mNQqPh0EUiHBC_;-#|Fw`5tgNh%Fo^k2d-b3eh?kKqMp#a4@v$C)9Uy2!N!KD*|m zin#j#xq_>17$UIhEMIWdJ!;wh<_oXAP~3TNWpAnFNO|MuCwi^TP@%MX!)JF^6I4WY z*1z1mv%)2EV4NRy+uN#kV{k;Q!B4>+tJsRSfH0~c5I3Gs@XXq) zdZ-O`)`3n8eFHW7|p!s4BrBrKZKAD=@< zZ&XuAKczt~-T>i%yBV%J6o?^0j5m{@sNq436n|Gj=BSY96atWT?M-ABCWM3T-TC8}=4;UZQaS$lD0=yWO=Ayfc!g z*WLc5Ba27!+e_|_>>!BlbuEuTpuatTJ7>siN{;rduIy_IItzEnK|1 zuYzMbtU7!^zE_;aQT0;OBZr<)H1lF9d;@HK^-GQ^JF2$oOOis{FL@>uZMK{Oc(knh zJ&$~f$nE-E4-jt^5?WHj(}VL?`&%1A#sHoH7OAQDa3-Oj}m&;W{ zd0zPT>WHwujDDjp!sqzAsNC9rKcvbd1a7x}c2sN6F#L3oV#i?BPV&Fs3rF25tK8!a zYpYXE{r|eQz`n(wQnffxyH_CXH9}5}wF@~-kW*v*!mOz7i~s(yW>srY)tY9NFO7l= zYHU$^@33CbMHzq>1C1r{TsxF;GF4{{{lr)=+L}N+N~^Ge|BSraKaV~$Mh@Crg`R=i-s-Pl`}uUo#n1kp z82MS4VHGa~iq)53aMA|xC#TZep8<~uRDOkP{B#wuXyLCg!G*agEP^6%x(Vw+3mLKg zNJ0s5hk;B0BCgo(lQH~V4>W(_H@VN9HcG$ql=f6}iKjM$O0l%Lw^zCkmOlAqxEO|7 zeK0e@?c*djL+X+4v`Ihwg+CJ{UjT<26*9k9EfY*T;i>^JgV1n^149E@5^mY1xany6 zOie~nhTXs-5;<%Gm&kOT8NP6y5q)%m;I@N5PXSHhz+?;_(;+SeAq<`Y42tLyf)Zr9 z`NYMG7cTZQmz2hCc2`R8l|Y8cFNw*&4b}s$i+^KX6csMEjM>!Gflw`oL z$lo6Xd?GKw$pr3yjci7R6oLRBO|)11igYO%#DG2 z6CV}oj&WlkmnKrr;3p^8!Yn%ixg<@n5Fq^Kg*w8m$pgz1=OMzask9hq%olx!G7uB= z?DXUm+==Ajx4v|ElTvhP*WscQEjs9C&gJp-1iXS4bf+Qh&|#e-LrVBg6eC6E6B$y$ zfVQD1%dKH@0$w$(l)6dK6>(+A{(IyoZ@~1;QR{Z^1AE7+y`${u2e*zo!`w+w`01?= zbpBPHKW8d-o+;}5Yr3;#ou!&KNYYMhNIOK*4p-C6#jYbH?WmGgHo438=g?`S;Zu}b 
zK{F^!|50&kgH)PuCMQ4qYb=%w^w990BU~K%**a$``bfd}Y)?FR>ONIU4KlskU zW%ft0AH?o^{-*K$#**t?)&QP2ci6iV^AkB=fnKZYUNh}lH(BqR=1mK(JJv@XU2mWJ zvvYUYh5bv177wk}G0W`SsYgcZH)sB6X5G^O_Rqj;fmbeM&7c+%UEbFm;`Puy%YkKU zvHsAS?eMzIL73xb@0^uzzuJuGRSbXMcvF_TM=_NIK@asAG>b?cI@x9QKn=D#-)^xpPd)?g0 zU*Qj2t=k(x6Xh@2gOZp0x1R*9)k1jw?EJ}!5q_lxG!&n?;BflYNN&d~6Zr$HzF^VY zUD*EKz_R|`GlfITw~CgdYq~$EUVB*A8NWI7^`XU`*=K*DV{&15iFPH^2-t07iIEXd zwA{kgihBsx0(Qc6K_T7tBD2T4G_xJr+jD9fq7*i^H>j7ea zL3{6Dg@WiZPlA=y{ed6PJjRqt4|R+_PJg29rx}pkDkN-Z7Q>F97(R8FgpXi~_b+%B zB8!3S(N%rZV?7l5I8&F{XUd@C;4cbC)U< zh9Hsdf9I?izrXJ>Nvn)%F3|M0d`E@Ca9IjhhQ;su!;eXNC841@y2^b|e4=M)SJ_fm z(PHQe*2ey$4VOto(cuQeW)LT51@VsD%<3)wbJ#ZYZ%I1gSgT)jLThv mxT dict: + """Generate a FlagScale user-perspective test case YAML.""" + return { + "meta": { + "repo": "flagscale", + "task": task_type, + "model": model, + "case": name, + "description": "TODO: describe what this test validates", + }, + "resources": { + "gpu": "A100-80GB", + "gpu_count": 8, + }, + "env": { + "CUDA_VISIBLE_DEVICES": "0,1,2,3,4,5,6,7", + "CUDA_DEVICE_MAX_CONNECTIONS": "1", + }, + "setup": [ + "pip install flagscale", + ], + "run": [ + f"flagscale {task_type} {model} --config ./conf/{name}.yaml", + ], + "verify": { + "log_path": f"tests/functional_tests/{task_type}/{model}/test_results/{name}/logs/details/host_0_localhost/*/default_*/attempt_0/*/stdout.log", + "gold_values_path": f"./gold_values/{name}.json", + "rtol": 1e-5, + "atol": 0, + }, + } + + +def create_flagscale_experiment_config(model: str, name: str, task_type: str) -> dict: + """Generate Hydra experiment config for flagscale CLI.""" + return { + "defaults": ["_self_", {task_type: name}], + "experiment": { + "exp_name": name, + "exp_dir": f"tests/functional_tests/{task_type}/{model}/test_results/{name}", + "task": { + "type": task_type, + "backend": "megatron", + "entrypoint": "flagscale/train/megatron/train_gpt.py", + }, + "runner": {"ssh_port": None}, + "envs": { + "CUDA_VISIBLE_DEVICES": "0,1,2,3,4,5,6,7", + 
"CUDA_DEVICE_MAX_CONNECTIONS": "1", + }, + }, + "action": "run", + "hydra": {"run": {"dir": "${experiment.exp_dir}/hydra"}}, + } + + +def create_flagscale_train_params() -> dict: + """Generate training params sub-config.""" + return { + "defaults": ["data"], + "system": { + "tensor_model_parallel_size": 2, + "pipeline_model_parallel_size": 1, + "sequence_parallel": True, + "use_distributed_optimizer": True, + "precision": {"bf16": True}, + "logging": {"log_interval": 1}, + "checkpoint": {"no_save_optim": True, "no_save_rng": True, "save_interval": 100000}, + }, + "model": { + "num_layers": 2, + "hidden_size": 4096, + "num_attention_heads": 32, + "seq_length": 2048, + }, + } + + +def create_generic_test_case(repo: str, name: str) -> dict: + """Generate a generic user-perspective test case YAML.""" + return { + "meta": { + "repo": repo, + "case": name, + "description": "TODO: describe what this test validates", + }, + "resources": {}, + "setup": [ + f"pip install {repo.replace('-', '_')}", + ], + "run": [ + "pytest -v", + ], + } + + +def create_readme(repo: str, task_type: str, model: str, name: str) -> str: + if repo == "flagscale": + return f"""# {name} + +## Description + +TODO: Describe what this test case validates. + +## Environment + +- GPU: 8x A100 80GB +- CUDA: 12.1+ +- Python: 3.10 + +## How to Run + +```bash +pip install flagscale +flagscale {task_type} {model} --config ./conf/{name}.yaml +``` + +## Gold Values + +TODO: Describe expected values and tolerance. +""" + return f"""# {name} + +## Description + +TODO: Describe what this test case validates. 
+ +## Environment + +- Python: 3.10 + +## How to Run + +```bash +pip install {repo} +pytest -v +``` +""" + + +def dump_yaml(data: dict, path: Path): + os.makedirs(path.parent, exist_ok=True) + with open(path, "w") as f: + yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False) + + +def main(): + parser = argparse.ArgumentParser(description="Generate test case template") + parser.add_argument("--repo", required=True, choices=VALID_REPOS) + parser.add_argument("--type", default="train") + parser.add_argument("--model", default="") + parser.add_argument("--name", required=True) + parser.add_argument("--output", default=".") + args = parser.parse_args() + + root = Path(args.output) + + if args.repo == "flagscale": + if not args.model: + print("FlagScale test cases require --model"); sys.exit(1) + + case_dir = root / "tests" / args.repo / args.type / args.model / args.name + + # Main test case YAML (user-perspective) + tc = create_flagscale_test_case(args.type, args.model, args.name) + dump_yaml(tc, case_dir / f"{args.name}.yaml") + + # Hydra experiment config + ec = create_flagscale_experiment_config(args.model, args.name, args.type) + dump_yaml(ec, case_dir / "conf" / f"{args.name}.yaml") + + # Training params sub-config + tp = create_flagscale_train_params() + dump_yaml(tp, case_dir / "conf" / "train" / f"{args.name}.yaml") + + # Gold values + gold = {"lm loss:": {"values": [0.0] * 10}} + gold_path = case_dir / "gold_values" / f"{args.name}.json" + os.makedirs(gold_path.parent, exist_ok=True) + with open(gold_path, "w") as f: + json.dump(gold, f, indent=2) + + # README + readme = create_readme(args.repo, args.type, args.model, args.name) + with open(case_dir / "README.md", "w") as f: + f.write(readme) + + print(f"Created FlagScale test case at: {case_dir}") + print(f" {args.name}.yaml — test case (setup/run/verify)") + print(f" conf/{args.name}.yaml — FlagScale experiment config") + print(f" conf/train/{args.name}.yaml — training 
parameters") + print(f" gold_values/{args.name}.json — expected metrics") + print(f" README.md") + else: + case_dir = root / "tests" / args.repo / args.name + tc = create_generic_test_case(args.repo, args.name) + dump_yaml(tc, case_dir / f"{args.name}.yaml") + + readme = create_readme(args.repo, "", "", args.name) + with open(case_dir / "README.md", "w") as f: + f.write(readme) + + print(f"Created test case at: {case_dir}") + + +if __name__ == "__main__": + main() diff --git a/flagos-user-tests/tools/resolve_matrix.py b/flagos-user-tests/tools/resolve_matrix.py new file mode 100644 index 0000000..c13c8f0 --- /dev/null +++ b/flagos-user-tests/tools/resolve_matrix.py @@ -0,0 +1,138 @@ +#!/usr/bin/env python3 +"""Resolve test case resources into a GitHub Actions matrix. + +Reads detection outputs (changed_cases / changed_repos / changed_repos_list) +and produces a JSON matrix with runner_labels, container_image, container_options, +and container_volumes per test case entry. + +Usage (from workflow): + python tools/resolve_matrix.py \ + --changed-cases '${{ steps.detect.outputs.changed_cases }}' \ + --changed-repos '${{ steps.detect.outputs.changed_repos }}' \ + --changed-repos-list '${{ steps.detect.outputs.changed_repos_list }}' +""" + +import argparse +import json +import os +import sys +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent)) +from run_user_tests import ( + list_test_resources, + resolve_container_image, + resolve_container_options, + resolve_runner_labels, +) + +import yaml + + +def make_entry(case_path: str, meta: dict, resources: dict, resource_map_path: Path) -> dict: + """Build a matrix entry with runner labels and per-platform container config.""" + labels = resolve_runner_labels(resources, resource_map_path) + image = resolve_container_image( + meta.get("repo", ""), meta.get("task", ""), + resources, resource_map_path, + ) + opts = resolve_container_options(resources, resource_map_path) + return { + "case_path": case_path, 
+ "repo": meta.get("repo", ""), + "task": meta.get("task", ""), + "model": meta.get("model", ""), + "runner_labels": json.dumps(labels), + "container_image": image, + "container_options": opts["container_options"], + "container_volumes": json.dumps(opts["container_volumes"]), + } + + +def make_empty_entry(**kwargs) -> dict: + """Build a placeholder entry with defaults.""" + return { + "case_path": "", "repo": "", "task": "", "model": "", + "runner_labels": json.dumps(["self-hosted"]), + "container_image": "", "container_options": "", + "container_volumes": json.dumps([]), + **kwargs, + } + + +def resource_entry_to_matrix(entry: dict, repo: str = "", task: str = "", model: str = "") -> dict: + """Convert a list_test_resources entry to a matrix entry.""" + return { + "case_path": entry["case_path"], + "repo": repo or "", "task": task or "", "model": model or "", + "runner_labels": json.dumps(entry["runner_labels"]), + "container_image": entry.get("container_image", ""), + "container_options": entry.get("container_options", ""), + "container_volumes": json.dumps(entry.get("container_volumes", [])), + } + + +def main(): + parser = argparse.ArgumentParser(description="Resolve test resources to CI matrix") + parser.add_argument("--changed-cases", default="") + parser.add_argument("--changed-repos", default="") + parser.add_argument("--changed-repos-list", default="") + parser.add_argument("--root", default=".", help="Root directory of flagos-user-tests") + args = parser.parse_args() + + root = Path(args.root) + resource_map_path = root / "resource_map.yaml" + matrix_entries = [] + + if args.changed_cases: + cases = json.loads(args.changed_cases) + for case_path in cases: + p = root / case_path if not Path(case_path).is_absolute() else Path(case_path) + if p.exists(): + data = yaml.safe_load(p.read_text()) + matrix_entries.append(make_entry( + case_path, data.get("meta", {}), + data.get("resources", {}), resource_map_path, + )) + + elif args.changed_repos_list: + repos = 
json.loads(args.changed_repos_list) + for repo in repos: + for entry in list_test_resources(root, repo=repo): + matrix_entries.append(resource_entry_to_matrix(entry, repo=repo)) + + elif args.changed_repos: + info = json.loads(args.changed_repos) + if info.get("repo") == "_none_": + matrix_entries.append(make_empty_entry(repo="_none_")) + else: + repo = info["repo"] + task = info.get("task", "") or None + model = info.get("model", "") or None + entries = list_test_resources(root, repo=repo, task=task, model=model) + if entries: + for entry in entries: + matrix_entries.append(resource_entry_to_matrix( + entry, repo=repo, + task=info.get("task", ""), + model=info.get("model", ""), + )) + else: + matrix_entries.append(make_empty_entry(repo=repo)) + + matrix = {"include": matrix_entries} + matrix_json = json.dumps(matrix) + print(f"Matrix: {matrix_json}") + + # Write to GITHUB_OUTPUT if available + github_output = os.environ.get("GITHUB_OUTPUT") + if github_output: + with open(github_output, "a") as f: + f.write(f"matrix={matrix_json}\n") + else: + # For local testing, just print to stdout + print(json.dumps(matrix, indent=2)) + + +if __name__ == "__main__": + main() diff --git a/flagos-user-tests/tools/run_user_tests.py b/flagos-user-tests/tools/run_user_tests.py new file mode 100644 index 0000000..7e15dd7 --- /dev/null +++ b/flagos-user-tests/tools/run_user_tests.py @@ -0,0 +1,547 @@ +#!/usr/bin/env python3 +"""Run user-submitted test cases against FlagOS repositories. + +Each test case is a self-contained YAML config that defines: + - setup: how to install the repo and dependencies (user's perspective) + - run: how to execute the test (user's perspective) + - verify: how to check results against gold values + +This runner simply executes user-defined commands — it does NOT call +any internal repo test scripts. This keeps test cases at the "user level". 
+ +Usage: + # Run a specific test case + python tools/run_user_tests.py --case tests/flagscale/train/mixtral/tp2_pp1_ep2.yaml + + # Run all test cases for a repo + python tools/run_user_tests.py --repo flagscale + + # Run all test cases for a repo+task+model + python tools/run_user_tests.py --repo flagscale --task train --model mixtral +""" + +import argparse +import json +import os +import re +import subprocess +import sys +from pathlib import Path + +import yaml + +# --------------------------------------------------------------------------- +# Gold-value comparison +# --------------------------------------------------------------------------- + +def extract_metrics_from_lines(lines: list[str], metric_keys: list[str]) -> dict: + """Extract numeric metric values from log lines. + + Supports common log formats: + - Pipe-separated: "iteration 1/10 | lm loss: 1.161E+01 | ..." + - Key-value: "step 1 metric_name:1.234" + """ + results = {k: [] for k in metric_keys} + + for line in lines: + for key in metric_keys: + # Pattern: "key " or "key: " + # Handle keys with or without trailing colon + escaped = re.escape(key.rstrip(":")) + pattern = rf"{escaped}\s*:?\s*([+-]?\d+\.?\d*(?:[eE][+-]?\d+)?)" + match = re.search(pattern, line) + if match: + try: + results[key].append(float(match.group(1))) + except ValueError: + pass + + return results + + +def extract_text_from_lines(lines: list[str], pattern: str) -> list[str]: + """Extract text values from log lines using a regex pattern. + + The pattern must contain at least one capture group. If multiple groups + are present (e.g. alternation), the first non-None group is used. 
+ Example pattern: r"output\\.outputs\\[0\\]\\.text=(?:\"(.+?)\"|'(.+?)')" + """ + results = [] + compiled = re.compile(pattern) + + for line in lines: + match = compiled.search(line) + if match: + # Pick first non-None group + val = next((g for g in match.groups() if g is not None), None) + if val is not None: + results.append(val) + + return results + + +def compare_gold_values( + actual: dict, gold: dict, rtol: float = 1e-5, atol: float = 0 +) -> tuple[bool, list[str]]: + """Compare actual metrics against gold values. + + Supports two types of gold entries: + - numeric (default): {"values": [1.0, 2.0], "type": "numeric"} + - text: {"values": ["hello", "world"], "type": "text", + "pattern": "regex with (capture group)"} + + Returns (all_passed, list_of_messages). + """ + messages = [] + all_passed = True + + for key, gold_entry in gold.items(): + gold_values = gold_entry.get("values", []) + actual_values = actual.get(key, []) + entry_type = gold_entry.get("type", "numeric") + + if not actual_values: + messages.append(f"FAIL: No values extracted for metric '{key}'") + all_passed = False + continue + + if len(actual_values) != len(gold_values): + messages.append( + f"FAIL: Length mismatch for '{key}': " + f"got {len(actual_values)}, expected {len(gold_values)}" + ) + all_passed = False + continue + + if entry_type == "text": + for i, (a, g) in enumerate(zip(actual_values, gold_values)): + if a != g: + messages.append( + f"FAIL: '{key}'[{i}] text mismatch:\n" + f" actual: {a!r}\n" + f" gold: {g!r}" + ) + all_passed = False + break + else: + messages.append(f"PASS: '{key}' ({len(gold_values)} text values match)") + else: + # numeric comparison — numpy-free allclose + for i, (a, g) in enumerate(zip(actual_values, gold_values)): + if abs(a - g) > atol + rtol * abs(g): + messages.append( + f"FAIL: '{key}'[{i}] mismatch: actual={a}, gold={g}, " + f"diff={abs(a-g):.6e}" + ) + all_passed = False + break + else: + messages.append(f"PASS: '{key}' ({len(gold_values)} values 
match)") + + return all_passed, messages + + +# --------------------------------------------------------------------------- +# Test case execution +# --------------------------------------------------------------------------- + +def run_commands(cmds: list[str], cwd: str, env: dict | None = None) -> int: + """Run a list of shell commands sequentially. Return first non-zero exit code.""" + full_env = {**os.environ, **(env or {})} + for cmd in cmds: + print(f" $ {cmd}") + result = subprocess.run(cmd, shell=True, cwd=cwd, env=full_env) + if result.returncode != 0: + print(f" FAILED (exit code {result.returncode})") + return result.returncode + return 0 + + +def run_test_case(case_path: Path, workdir: Path | None = None) -> int: + """Execute a single user test case. + + Test case YAML format: + meta: + repo: flagscale + task: train + model: mixtral + description: "..." + + resources: + platform: cuda + device: A100-40GB + device_count: 1 + + setup: + - pip install flagscale + - modelscope download --model ... --local_dir ./model_weights + + run: + - flagscale train mixtral --config ./conf/tp2_pp1_ep2.yaml + + verify: + log_path: "tests/functional_tests/train/mixtral/test_results/tp2_pp1_ep2/logs/..." + gold_values_path: "./gold_values/tp2_pp1_ep2.json" + # OR inline gold values: + gold_values: + "lm loss:": + values: [11.17587, 11.16908, ...] 
+ rtol: 1e-5 + atol: 0 + """ + print(f"\n{'='*60}") + print(f"Test Case: {case_path}") + print(f"{'='*60}") + + with open(case_path) as f: + config = yaml.safe_load(f) + + meta = config.get("meta", {}) + setup_cmds = config.get("setup", []) + run_cmds = config.get("run", []) + verify_config = config.get("verify", {}) + + print(f"Repo: {meta.get('repo', 'unknown')}") + print(f"Task: {meta.get('task', 'unknown')}") + print(f"Model: {meta.get('model', 'unknown')}") + print(f"Desc: {meta.get('description', '')}") + print() + + # Determine working directory — test case files live next to the YAML + case_dir = case_path.parent.resolve() + cwd = str(workdir.resolve()) if workdir else str(case_dir) + + env = config.get("env", {}) + # Convert all env values to strings + env = {k: str(v) for k, v in env.items()} + + # --- Setup --- + if setup_cmds: + print("--- Setup ---") + rc = run_commands(setup_cmds, cwd=cwd, env=env) + if rc != 0: + print("SETUP FAILED") + return rc + + # --- Run --- + if run_cmds: + print("\n--- Run ---") + rc = run_commands(run_cmds, cwd=cwd, env=env) + if rc != 0: + print("RUN FAILED") + return rc + + # --- Verify --- + if verify_config: + print("\n--- Verify ---") + return verify_results(verify_config, case_dir=case_dir, cwd=cwd) + + print("\nPASSED (no verify step)") + return 0 + + +def verify_results(verify_config: dict, case_dir: Path, cwd: str) -> int: + """Verify test results against gold values.""" + # Load gold values + gold = verify_config.get("gold_values") + if not gold: + gold_path = verify_config.get("gold_values_path", "") + if gold_path: + # Resolve relative to case_dir + full_path = (case_dir / gold_path) if not Path(gold_path).is_absolute() else Path(gold_path) + if not full_path.exists(): + # Also try relative to cwd + full_path = Path(cwd) / gold_path + if not full_path.exists(): + print(f"FAIL: Gold values file not found: {gold_path}") + return 1 + with open(full_path) as f: + gold = json.load(f) + else: + print("No gold values 
defined, skipping verification") + return 0 + + # Extract actual metrics from log + log_path = verify_config.get("log_path", "") + if not log_path: + print("FAIL: verify.log_path is required for gold value comparison") + return 1 + + # Resolve log path — try relative to cwd first, then case_dir + full_log = Path(cwd) / log_path + if not full_log.exists(): + full_log = case_dir / log_path + if not full_log.exists(): + # Try glob pattern (user might use * for timestamp dirs) + import glob as globmod + candidates = globmod.glob(str(Path(cwd) / log_path)) + if not candidates: + candidates = globmod.glob(str(case_dir / log_path)) + if candidates: + full_log = Path(sorted(candidates)[-1]) # latest match + else: + print(f"FAIL: Log file not found: {log_path}") + return 1 + + print(f"Log: {full_log}") + + # Read log via subprocess to bypass NFS client cache + import time + time.sleep(2) + log_content = subprocess.run( + ["cat", str(full_log)], capture_output=True, text=True + ).stdout + log_lines = log_content.splitlines() + + # Separate numeric and text gold entries + numeric_keys = [] + actual = {} + for key, entry in gold.items(): + entry_type = entry.get("type", "numeric") + if entry_type == "text": + pattern = entry.get("pattern", "") + if not pattern: + print(f"FAIL: Text gold entry '{key}' requires a 'pattern' field") + return 1 + actual[key] = extract_text_from_lines(log_lines, pattern) + else: + numeric_keys.append(key) + + if numeric_keys: + numeric_actual = extract_metrics_from_lines(log_lines, numeric_keys) + actual.update(numeric_actual) + + rtol = verify_config.get("rtol", 1e-5) + atol = verify_config.get("atol", 0) + passed, messages = compare_gold_values(actual, gold, rtol=rtol, atol=atol) + + for msg in messages: + print(f" {msg}") + + print(f"\nResult: {'PASSED' if passed else 'FAILED'}") + return 0 if passed else 1 + + +# --------------------------------------------------------------------------- +# Discovery and batch execution +# 
--------------------------------------------------------------------------- + +def discover_test_cases( + root: Path, repo: str | None = None, + task: str | None = None, model: str | None = None +) -> list[Path]: + """Find all test case YAML files under tests/. + + Test case YAMLs are identified by having a 'meta' key with 'repo'. + """ + tests_dir = root / "tests" + cases = [] + + for yaml_path in sorted(tests_dir.rglob("*.yaml")): + # Skip files in sub-config dirs (train/, data.yaml, etc.) + if yaml_path.name.startswith("_") or yaml_path.name == "data.yaml": + continue + + try: + with open(yaml_path) as f: + data = yaml.safe_load(f) + if not isinstance(data, dict) or "meta" not in data: + continue + meta = data["meta"] + if repo and meta.get("repo") != repo: + continue + if task and meta.get("task") != task: + continue + if model and meta.get("model") != model: + continue + cases.append(yaml_path) + except (yaml.YAMLError, KeyError): + continue + + return cases + + +def _load_resource_map(resource_map_path: Path) -> dict: + """Load resource_map.yaml, returning empty dict on failure.""" + if not resource_map_path.exists(): + return {} + with open(resource_map_path) as f: + return yaml.safe_load(f) or {} + + +def _get_platform_config(resource_map: dict, platform: str) -> dict: + """Get platform config from resource_map, with fallback to default_platform.""" + platforms = resource_map.get("platforms", {}) + if platform and platform in platforms: + return platforms[platform] + default_platform = resource_map.get("default_platform", "") + if default_platform and default_platform in platforms: + return platforms[default_platform] + return {} + + +def resolve_runner_labels(resources: dict, resource_map_path: Path) -> list[str]: + """Resolve test case resources to GitHub Actions runner labels. + + Uses platform-based lookup: + resources.platform -> platforms..device_labels[resources.device] + + Falls back to platform default_labels, then global default_labels. 
+ """ + global_default = ["self-hosted"] + resource_map = _load_resource_map(resource_map_path) + if not resource_map: + return global_default + + global_default = resource_map.get("default_labels", global_default) + platform = resources.get("platform", "") + pcfg = _get_platform_config(resource_map, platform) + if not pcfg: + return global_default + + platform_default = pcfg.get("default_labels", global_default) + device = resources.get("device", "") + if not device: + return platform_default + + # Case-insensitive device lookup + device_labels = pcfg.get("device_labels", {}) + for key, labels in device_labels.items(): + if key.lower() == device.lower(): + return labels + + return platform_default + + +def resolve_container_image( + repo: str, task: str, resources: dict, resource_map_path: Path +) -> str: + """Resolve test case to a Docker container image. + + Lookup: platform -> container_images -> "/" | "" | "default" + Returns "" if no image is configured. + """ + resource_map = _load_resource_map(resource_map_path) + platform = resources.get("platform", "") + pcfg = _get_platform_config(resource_map, platform) + images = pcfg.get("container_images", {}) + if not images: + return "" + + key = f"{repo}/{task}" if task else repo + image = images.get(key, "") + if not image and repo: + image = images.get(repo, "") + if not image: + image = images.get("default", "") + return image + + +def resolve_container_options(resources: dict, resource_map_path: Path) -> dict: + """Resolve container runtime options and volumes for the given platform. + + Returns {"container_options": str, "container_volumes": list}. 
+ """ + resource_map = _load_resource_map(resource_map_path) + platform = resources.get("platform", "") + pcfg = _get_platform_config(resource_map, platform) + return { + "container_options": pcfg.get("container_options", ""), + "container_volumes": pcfg.get("container_volumes", []), + } + + +def list_test_resources( + root: Path, repo: str | None = None, + task: str | None = None, model: str | None = None +) -> list[dict]: + """List test cases with their resource requirements, runner labels, and container config. + + Returns a list of dicts with keys: + case_path, resources, runner_labels, container_image, container_options, container_volumes + """ + cases = discover_test_cases(root, repo, task, model) + resource_map_path = root / "resource_map.yaml" + result = [] + + for case_path in cases: + with open(case_path) as f: + data = yaml.safe_load(f) + meta = data.get("meta", {}) + resources = data.get("resources", {}) + runner_labels = resolve_runner_labels(resources, resource_map_path) + container_image = resolve_container_image( + meta.get("repo", ""), meta.get("task", ""), resources, resource_map_path + ) + container_opts = resolve_container_options(resources, resource_map_path) + result.append({ + "case_path": str(case_path), + "resources": resources, + "runner_labels": runner_labels, + "container_image": container_image, + **container_opts, + }) + + return result + + +def main(): + parser = argparse.ArgumentParser( + description="Run user-submitted FlagOS test cases" + ) + parser.add_argument("--case", help="Path to a specific test case YAML") + parser.add_argument("--repo", help="Run all cases for this repo") + parser.add_argument("--task", help="Filter by task type") + parser.add_argument("--model", help="Filter by model name") + parser.add_argument( + "--workdir", + help="Working directory for command execution (default: test case directory)" + ) + parser.add_argument( + "--list-resources", action="store_true", + help="List test cases with resource 
requirements and runner labels (JSON output)" + ) + args = parser.parse_args() + + # --list-resources mode: output JSON and exit + if args.list_resources: + root = Path(".") + result = list_test_resources(root, args.repo, args.task, args.model) + print(json.dumps(result, indent=2)) + sys.exit(0) + + workdir = Path(args.workdir) if args.workdir else None + + if args.case: + case_path = Path(args.case) + if not case_path.exists(): + print(f"ERROR: Test case not found: {case_path}") + sys.exit(1) + sys.exit(run_test_case(case_path, workdir)) + + if not args.repo: + print("ERROR: Specify --case, --repo, or --list-resources") + sys.exit(1) + + root = Path(".") + cases = discover_test_cases(root, args.repo, args.task, args.model) + + if not cases: + print(f"No test cases found for repo={args.repo} task={args.task} model={args.model}") + sys.exit(0) + + print(f"Found {len(cases)} test case(s)") + failed = 0 + for case in cases: + rc = run_test_case(case, workdir) + if rc != 0: + failed += 1 + + print(f"\n{'='*60}") + print(f"Results: {len(cases) - failed}/{len(cases)} passed") + print(f"{'='*60}") + sys.exit(1 if failed else 0) + + +if __name__ == "__main__": + main() diff --git a/flagos-user-tests/tools/validators/lint_test_case.py b/flagos-user-tests/tools/validators/lint_test_case.py new file mode 100644 index 0000000..e171dde --- /dev/null +++ b/flagos-user-tests/tools/validators/lint_test_case.py @@ -0,0 +1,157 @@ +#!/usr/bin/env python3 +"""Lint test case directories for completeness and correctness. + +Checks: +- Each test case directory has a README.md +- Each test case has at least one YAML config +- README contains required sections (Description, Environment, etc.) 
VALID_REPOS = [
    "flagscale", "flaggems", "flagcx", "flagtree",
    "vllm-fl", "vllm-plugin-fl", "te-fl", "megatron-lm-fl",
]

# Patterns that might indicate sensitive data in configs
SENSITIVE_PATTERNS = [
    re.compile(r"(password|passwd|secret|token|api_key)\s*[:=]", re.IGNORECASE),
    re.compile(r"/home/[a-zA-Z0-9_]+/", re.IGNORECASE),  # Private user paths
    re.compile(r"sk-[a-zA-Z0-9]{20,}"),  # API keys
]

README_REQUIRED_SECTIONS = ["description", "environment"]


def find_test_case_dirs(root: Path) -> list[Path]:
    """Return the sorted set of directories that hold a user-perspective
    test case YAML (i.e. a YAML whose top-level mapping has a 'meta' key)."""
    tests_dir = root / "tests"
    if not tests_dir.exists():
        return []

    found: set[Path] = set()
    for candidate in tests_dir.rglob("*.yaml"):
        try:
            parsed = yaml.safe_load(candidate.read_text())
        except (yaml.YAMLError, OSError):
            continue
        if isinstance(parsed, dict) and "meta" in parsed:
            found.add(candidate.parent)

    return sorted(found)


def lint_readme(readme_path: Path, strict: bool = False) -> list[str]:
    """Check README.md for required content; return a list of problem strings."""
    if not readme_path.exists():
        return [f"{readme_path.parent}: Missing README.md"]

    text = readme_path.read_text().lower()
    problems: list[str] = []

    # Section checks only apply in strict mode.
    if strict:
        problems.extend(
            f"{readme_path}: Missing required section '{section}'"
            for section in README_REQUIRED_SECTIONS
            if section not in text
        )

    if len(text.strip()) < 20:
        problems.append(f"{readme_path}: README is too short (less than 20 characters)")

    return problems


def lint_sensitive_data(filepath: Path) -> list[str]:
    """Scan one file for patterns that look like credentials or private paths."""
    text = filepath.read_text()
    findings: list[str] = []
    for regex in SENSITIVE_PATTERNS:
        hits = regex.findall(text)
        if hits:
            findings.append(
                f"{filepath}: Possible sensitive data detected: {hits[:3]}"
            )
    return findings


def lint_yaml_configs(test_dir: Path) -> list[str]:
    """Parse-check every *.yaml directly in a test directory and scan each for secrets."""
    configs = list(test_dir.glob("*.yaml"))
    if not configs:
        return []

    problems: list[str] = []
    for cfg in configs:
        try:
            with open(cfg) as handle:
                parsed = yaml.safe_load(handle)
            if parsed is None:
                problems.append(f"{cfg}: Empty YAML file")
        except yaml.YAMLError as exc:
            problems.append(f"{cfg}: Invalid YAML - {exc}")
            continue

        # Check for sensitive data
        problems.extend(lint_sensitive_data(cfg))

    return problems


def main():
    """CLI entry point: lint every discovered test case directory."""
    parser = argparse.ArgumentParser(description="Lint test case directories")
    parser.add_argument(
        "--path", default=".",
        help="Root directory of flagos-user-tests"
    )
    parser.add_argument(
        "--strict", action="store_true",
        help="Enable strict checks (README sections, etc.)"
    )
    args = parser.parse_args()

    test_dirs = find_test_case_dirs(Path(args.path))
    if not test_dirs:
        print("No test case directories found.")
        sys.exit(0)

    hard_errors: list[str] = []
    soft_warnings: list[str] = []
    for directory in test_dirs:
        readme_issues = lint_readme(directory / "README.md", strict=args.strict)
        # README problems are fatal only in strict mode; otherwise warn.
        (hard_errors if args.strict else soft_warnings).extend(readme_issues)
        hard_errors.extend(lint_yaml_configs(directory))

    if soft_warnings:
        print(f"Warnings ({len(soft_warnings)}):")
        for item in soft_warnings:
            print(f"  ⚠ {item}")

    if hard_errors:
        print(f"Lint FAILED with {len(hard_errors)} error(s):")
        for item in hard_errors:
            print(f"  ✗ {item}")
        sys.exit(1)
    else:
        print(f"Lint PASSED: {len(test_dirs)} test directory(ies) checked.")
        sys.exit(0)


if __name__ == "__main__":
    main()
VALID_REPOS = [
    "flagscale", "flaggems", "flagcx", "flagtree",
    "vllm-fl", "vllm-plugin-fl", "te-fl", "megatron-lm-fl",
]


def validate_yaml_syntax(filepath: Path) -> list[str]:
    """Check that a file is valid, non-empty YAML; return a list of error strings."""
    errors = []
    try:
        with open(filepath) as f:
            data = yaml.safe_load(f)
        if data is None:
            errors.append(f"{filepath}: YAML file is empty")
    except yaml.YAMLError as e:
        errors.append(f"{filepath}: Invalid YAML syntax - {e}")
    return errors


def validate_test_case(filepath: Path, data: dict) -> list[str]:
    """Validate a user-perspective test case YAML (one that has a 'meta' key).

    Required: meta.repo (from VALID_REPOS) and a 'run' list. 'setup' must be
    a list when present; when gold values are defined under 'verify', a
    verify.log_path is required so the runner knows what to scan.
    """
    errors = []
    meta = data.get("meta", {})

    if not meta.get("repo"):
        errors.append(f"{filepath}: Missing 'meta.repo'")
    elif meta["repo"] not in VALID_REPOS:
        errors.append(f"{filepath}: Invalid meta.repo '{meta['repo']}'")

    if not data.get("run"):
        errors.append(f"{filepath}: Missing 'run' (list of commands)")
    elif not isinstance(data["run"], list):
        errors.append(f"{filepath}: 'run' must be a list of commands")

    if "setup" in data and not isinstance(data["setup"], list):
        errors.append(f"{filepath}: 'setup' must be a list of commands")

    if "verify" in data:
        v = data["verify"]
        if isinstance(v, dict):
            has_gold = v.get("gold_values") or v.get("gold_values_path")
            if has_gold and not v.get("log_path"):
                errors.append(f"{filepath}: verify.log_path required when gold values are defined")

    return errors


def validate_flagscale_subconfig(filepath: Path, data: dict) -> list[str]:
    """Validate FlagScale sub-config (experiment config or train params)."""
    errors = []
    keys = set(data.keys())

    if "experiment" in keys:
        exp = data["experiment"]
        # BUGFIX: a null/scalar 'experiment:' node previously raised
        # TypeError on the membership tests below; report it instead.
        if not isinstance(exp, dict):
            return [f"{filepath}: 'experiment' must be a mapping"]
        if "exp_name" not in exp:
            errors.append(f"{filepath}: Missing 'experiment.exp_name'")
        if "task" not in exp:
            errors.append(f"{filepath}: Missing 'experiment.task'")
        elif "type" not in (exp.get("task") or {}):
            # `or {}` guards an explicit `task:` with a null value.
            errors.append(f"{filepath}: Missing 'experiment.task.type'")
    elif "defaults" in keys:
        # Sub-config (train params, data, etc.) — lighter validation
        pass
    else:
        errors.append(
            f"{filepath}: Missing expected top-level key "
            f"('experiment' or 'defaults'), found: {keys}"
        )
    return errors


def validate_file(filepath: Path) -> list[str]:
    """Validate a single YAML file, dispatching on its content type."""
    errors = validate_yaml_syntax(filepath)
    if errors:
        return errors

    with open(filepath) as f:
        data = yaml.safe_load(f)
    if not isinstance(data, dict):
        return [f"{filepath}: Must be a YAML mapping"]

    # Determine type by content
    if "meta" in data:
        # User-perspective test case
        return validate_test_case(filepath, data)
    elif "experiment" in data or "defaults" in data:
        # FlagScale sub-config (Hydra config)
        return validate_flagscale_subconfig(filepath, data)
    else:
        # Generic config — just check it's a valid non-empty dict
        return []


def find_yaml_files(root: Path) -> list[Path]:
    """Find all YAML files under tests/."""
    tests_dir = root / "tests"
    if not tests_dir.exists():
        return []
    return sorted(tests_dir.rglob("*.yaml"))


def main():
    """CLI entry point: validate all (or only the changed) test YAML configs."""
    parser = argparse.ArgumentParser(description="Validate test case YAML configs")
    parser.add_argument("--path", default=".", help="Root directory of flagos-user-tests")
    parser.add_argument("--changed-files", default="", help="Comma-separated list of changed files")
    args = parser.parse_args()

    root = Path(args.path)

    if args.changed_files:
        # BUGFIX: strip each entry once and build the Path from the stripped
        # value — previously the filters used f.strip() but Path(f) kept the
        # surrounding whitespace, so " tests/x.yaml" was reported missing.
        stripped = (f.strip() for f in args.changed_files.split(","))
        yaml_files = [
            Path(f) for f in stripped
            if f.endswith(".yaml") and f.startswith("tests/")
        ]
    else:
        yaml_files = find_yaml_files(root)

    if not yaml_files:
        print("No YAML test config files found to validate.")
        sys.exit(0)

    all_errors = []
    for filepath in yaml_files:
        full_path = root / filepath if not filepath.is_absolute() else filepath
        if not full_path.exists():
            all_errors.append(f"{filepath}: File does not exist")
            continue
        all_errors.extend(validate_file(full_path))

    if all_errors:
        print(f"Validation FAILED with {len(all_errors)} error(s):")
        for err in all_errors:
            print(f"  ✗ {err}")
        sys.exit(1)
    else:
        print(f"Validation PASSED: {len(yaml_files)} file(s) checked.")
        sys.exit(0)


if __name__ == "__main__":
    main()
def validate_gold_values_file(filepath: Path) -> list[str]:
    """Validate a single gold values JSON file; return a list of error strings.

    Expected structure: each key maps to an object with a non-empty "values"
    array. Numeric entries (the default) must contain only real numbers;
    entries declared with "type": "text" must contain only strings and carry
    the extraction "pattern" the runner requires (see run_user_tests.py).
    """
    try:
        with open(filepath) as f:
            data = json.load(f)
    except json.JSONDecodeError as e:
        return [f"{filepath}: Invalid JSON - {e}"]

    if not isinstance(data, dict):
        return [f"{filepath}: Gold values must be a JSON object, got {type(data).__name__}"]

    if not data:
        return [f"{filepath}: Gold values file is empty"]

    errors = []
    for key, value in data.items():
        if not isinstance(value, dict):
            errors.append(f"{filepath}: Key '{key}' must map to an object, got {type(value).__name__}")
            continue

        if "values" not in value:
            errors.append(f"{filepath}: Key '{key}' missing 'values' field")
            continue

        values = value["values"]
        if not isinstance(values, list):
            errors.append(f"{filepath}: Key '{key}'.values must be an array")
            continue

        if len(values) == 0:
            errors.append(f"{filepath}: Key '{key}'.values is empty")
            continue

        entry_type = value.get("type", "numeric")
        if entry_type == "text":
            # CONSISTENCY FIX: the runner's compare_gold_values supports
            # "type": "text" entries with string values, but this validator
            # used to reject them as non-numeric. Text entries also need the
            # regex 'pattern' the runner uses for extraction.
            if not value.get("pattern"):
                errors.append(f"{filepath}: Key '{key}' of type 'text' missing 'pattern' field")
            for i, v in enumerate(values):
                if not isinstance(v, str):
                    errors.append(
                        f"{filepath}: Key '{key}'.values[{i}] is not a string: {v!r}"
                    )
        else:
            for i, v in enumerate(values):
                # BUGFIX: bool is a subclass of int, so it must be excluded
                # explicitly — JSON true/false previously passed as numeric.
                if isinstance(v, bool) or not isinstance(v, (int, float)):
                    errors.append(
                        f"{filepath}: Key '{key}'.values[{i}] is not numeric: {v!r}"
                    )

    return errors


def find_gold_values_files(root: Path) -> list[Path]:
    """Find all gold values JSON files under tests/.

    Supports both conventions:
      - FlagScale: tests/<repo>/<task>/<model>/gold_values/<name>.json
      - Flat: tests/<repo>/<task>/<name>_gold_values.json

    Returns a sorted, de-duplicated list so validation output is
    deterministic (the previous list(set(...)) order was not).
    """
    tests_dir = root / "tests"
    if not tests_dir.exists():
        return []
    # Match files inside gold_values/ directories
    gold_dir_files = tests_dir.rglob("gold_values/*.json")
    # Match files with _gold_values in name (legacy flat layout)
    gold_name_files = tests_dir.rglob("*_gold_values.json")
    return sorted(set(gold_dir_files) | set(gold_name_files))


def main():
    """CLI entry point: validate every discovered gold values file."""
    parser = argparse.ArgumentParser(description="Validate gold values JSON files")
    parser.add_argument(
        "--path", default=".",
        help="Root directory of flagos-user-tests"
    )
    args = parser.parse_args()

    root = Path(args.path)
    gold_files = find_gold_values_files(root)

    if not gold_files:
        print("No gold values files found. Skipping validation.")
        sys.exit(0)

    all_errors = []
    for filepath in gold_files:
        all_errors.extend(validate_gold_values_file(filepath))

    if all_errors:
        print(f"Gold values validation FAILED with {len(all_errors)} error(s):")
        for err in all_errors:
            print(f"  ✗ {err}")
        sys.exit(1)
    else:
        print(f"Gold values validation PASSED: {len(gold_files)} file(s) checked.")
        sys.exit(0)


if __name__ == "__main__":
    main()
@@ jobs: - name: Checkout uses: actions/checkout@v4 + - name: Container init + if: ${{ matrix.container_init != '' }} + shell: bash -l {0} + run: ${{ matrix.container_init }} + - name: Install runner dependencies run: pip install pyyaml diff --git a/flagos-user-tests/resource_map.yaml b/flagos-user-tests/resource_map.yaml index 6cc25a1..2ccaf07 100644 --- a/flagos-user-tests/resource_map.yaml +++ b/flagos-user-tests/resource_map.yaml @@ -42,6 +42,11 @@ platforms: flagscale/inference: "localhost:5000/flagscale-inference:dev-cu128-py3.12-20260302102033" flagscale/hetero_train: "localhost:5000/flagscale-train:dev-cu128-py3.12-20260228210721" + # Container init commands: run inside the container before test execution + # Same key format as container_images: "/" | "" | "default" + container_init: + flagscale/inference: "conda activate flagscale-inference" + # Container runtime options container_options: "--gpus all --shm-size=500g --user root --ulimit nofile=65535:65535" diff --git a/flagos-user-tests/tools/resolve_matrix.py b/flagos-user-tests/tools/resolve_matrix.py index c13c8f0..0ade247 100644 --- a/flagos-user-tests/tools/resolve_matrix.py +++ b/flagos-user-tests/tools/resolve_matrix.py @@ -22,6 +22,7 @@ from run_user_tests import ( list_test_resources, resolve_container_image, + resolve_container_init, resolve_container_options, resolve_runner_labels, ) @@ -36,6 +37,10 @@ def make_entry(case_path: str, meta: dict, resources: dict, resource_map_path: P meta.get("repo", ""), meta.get("task", ""), resources, resource_map_path, ) + init_cmd = resolve_container_init( + meta.get("repo", ""), meta.get("task", ""), + resources, resource_map_path, + ) opts = resolve_container_options(resources, resource_map_path) return { "case_path": case_path, @@ -44,6 +49,7 @@ def make_entry(case_path: str, meta: dict, resources: dict, resource_map_path: P "model": meta.get("model", ""), "runner_labels": json.dumps(labels), "container_image": image, + "container_init": init_cmd, 
"container_options": opts["container_options"], "container_volumes": json.dumps(opts["container_volumes"]), } @@ -54,7 +60,8 @@ def make_empty_entry(**kwargs) -> dict: return { "case_path": "", "repo": "", "task": "", "model": "", "runner_labels": json.dumps(["self-hosted"]), - "container_image": "", "container_options": "", + "container_image": "", "container_init": "", + "container_options": "", "container_volumes": json.dumps([]), **kwargs, } @@ -67,6 +74,7 @@ def resource_entry_to_matrix(entry: dict, repo: str = "", task: str = "", model: "repo": repo or "", "task": task or "", "model": model or "", "runner_labels": json.dumps(entry["runner_labels"]), "container_image": entry.get("container_image", ""), + "container_init": entry.get("container_init", ""), "container_options": entry.get("container_options", ""), "container_volumes": json.dumps(entry.get("container_volumes", [])), } diff --git a/flagos-user-tests/tools/run_user_tests.py b/flagos-user-tests/tools/run_user_tests.py index 7e15dd7..36a53a1 100644 --- a/flagos-user-tests/tools/run_user_tests.py +++ b/flagos-user-tests/tools/run_user_tests.py @@ -451,6 +451,30 @@ def resolve_container_options(resources: dict, resource_map_path: Path) -> dict: } +def resolve_container_init( + repo: str, task: str, resources: dict, resource_map_path: Path +) -> str: + """Resolve container init command for the given platform and repo/task. + + Lookup: platform -> container_init -> "/" | "" | "default" + Returns "" if no init command is configured. 
+ """ + resource_map = _load_resource_map(resource_map_path) + platform = resources.get("platform", "") + pcfg = _get_platform_config(resource_map, platform) + init_cmds = pcfg.get("container_init", {}) + if not init_cmds: + return "" + + key = f"{repo}/{task}" if task else repo + cmd = init_cmds.get(key, "") + if not cmd and repo: + cmd = init_cmds.get(repo, "") + if not cmd: + cmd = init_cmds.get("default", "") + return cmd + + def list_test_resources( root: Path, repo: str | None = None, task: str | None = None, model: str | None = None @@ -458,7 +482,8 @@ def list_test_resources( """List test cases with their resource requirements, runner labels, and container config. Returns a list of dicts with keys: - case_path, resources, runner_labels, container_image, container_options, container_volumes + case_path, resources, runner_labels, container_image, container_init, + container_options, container_volumes """ cases = discover_test_cases(root, repo, task, model) resource_map_path = root / "resource_map.yaml" @@ -473,12 +498,16 @@ def list_test_resources( container_image = resolve_container_image( meta.get("repo", ""), meta.get("task", ""), resources, resource_map_path ) + container_init = resolve_container_init( + meta.get("repo", ""), meta.get("task", ""), resources, resource_map_path + ) container_opts = resolve_container_options(resources, resource_map_path) result.append({ "case_path": str(case_path), "resources": resources, "runner_labels": runner_labels, "container_image": container_image, + "container_init": container_init, **container_opts, }) diff --git a/flagos-user-tests/tools/validators/validate_gold_values.py b/flagos-user-tests/tools/validators/validate_gold_values.py index 86ab52a..6d690cc 100644 --- a/flagos-user-tests/tools/validators/validate_gold_values.py +++ b/flagos-user-tests/tools/validators/validate_gold_values.py @@ -4,8 +4,9 @@ Checks: - Valid JSON syntax - Expected structure: keys map to objects with "values" arrays -- All values are 
numeric - At least one value is present +- Numeric entries (default): all values are int/float +- Text entries (type: "text"): all values are strings, "pattern" field is present """ import argparse @@ -48,11 +49,25 @@ def validate_gold_values_file(filepath: Path) -> list[str]: errors.append(f"{filepath}: Key '{key}'.values is empty") continue - for i, v in enumerate(values): - if not isinstance(v, (int, float)): - errors.append( - f"{filepath}: Key '{key}'.values[{i}] is not numeric: {v!r}" - ) + entry_type = value.get("type", "numeric") + + if entry_type == "text": + # Text entries require a 'pattern' field for extraction + if "pattern" not in value: + errors.append(f"{filepath}: Key '{key}' has type 'text' but missing 'pattern' field") + for i, v in enumerate(values): + if not isinstance(v, str): + errors.append( + f"{filepath}: Key '{key}'.values[{i}] is not a string: {v!r}" + ) + elif entry_type == "numeric": + for i, v in enumerate(values): + if not isinstance(v, (int, float)): + errors.append( + f"{filepath}: Key '{key}'.values[{i}] is not numeric: {v!r}" + ) + else: + errors.append(f"{filepath}: Key '{key}' has unknown type: {entry_type!r}") return errors From d0fce11ba94427cf8fd68ef6e13db57588120e61 Mon Sep 17 00:00:00 2001 From: liyuzhuo Date: Wed, 18 Mar 2026 14:38:32 +0800 Subject: [PATCH 03/13] fix conda env --- .github/workflows/test_dispatch.yml | 14 +++--- flagos-user-tests/resource_map.yaml | 6 +-- flagos-user-tests/tools/activate_conda.sh | 58 +++++++++++++++++++++++ flagos-user-tests/tools/resolve_matrix.py | 10 ++-- flagos-user-tests/tools/run_user_tests.py | 28 +++++------ 5 files changed, 88 insertions(+), 28 deletions(-) create mode 100755 flagos-user-tests/tools/activate_conda.sh diff --git a/.github/workflows/test_dispatch.yml b/.github/workflows/test_dispatch.yml index a7a5be1..25245c8 100644 --- a/.github/workflows/test_dispatch.yml +++ b/.github/workflows/test_dispatch.yml @@ -88,16 +88,18 @@ jobs: - name: Checkout uses: 
actions/checkout@v4 - - name: Container init - if: ${{ matrix.container_init != '' }} - shell: bash -l {0} - run: ${{ matrix.container_init }} - - name: Install runner dependencies - run: pip install pyyaml + run: | + if [ -n "${{ matrix.conda_env }}" ]; then + source flagos-user-tests/tools/activate_conda.sh ${{ matrix.conda_env }} + fi + pip install pyyaml - name: Run user tests run: | + if [ -n "${{ matrix.conda_env }}" ]; then + source flagos-user-tests/tools/activate_conda.sh ${{ matrix.conda_env }} + fi ARGS="" if [ -n "${{ matrix.case_path }}" ]; then ARGS="--case ${{ matrix.case_path }}" diff --git a/flagos-user-tests/resource_map.yaml b/flagos-user-tests/resource_map.yaml index 2ccaf07..30f5f75 100644 --- a/flagos-user-tests/resource_map.yaml +++ b/flagos-user-tests/resource_map.yaml @@ -42,10 +42,10 @@ platforms: flagscale/inference: "localhost:5000/flagscale-inference:dev-cu128-py3.12-20260302102033" flagscale/hetero_train: "localhost:5000/flagscale-train:dev-cu128-py3.12-20260228210721" - # Container init commands: run inside the container before test execution + # Conda environment to activate inside the container before test execution. # Same key format as container_images: "/" | "" | "default" - container_init: - flagscale/inference: "conda activate flagscale-inference" + conda_env: + flagscale/inference: "flagscale-inference" # Container runtime options container_options: "--gpus all --shm-size=500g --user root --ulimit nofile=65535:65535" diff --git a/flagos-user-tests/tools/activate_conda.sh b/flagos-user-tests/tools/activate_conda.sh new file mode 100755 index 0000000..a8e11bd --- /dev/null +++ b/flagos-user-tests/tools/activate_conda.sh @@ -0,0 +1,58 @@ +#!/bin/bash +# Activate a conda environment inside a container. +# +# Detects conda installation, initializes the shell, then activates the env. 
+# Must be sourced (not executed) so the activation persists in the caller's shell: +# source tools/activate_conda.sh [conda_path] +# +# Arguments: +# env_name — conda environment name (required) +# conda_path — path to conda installation (optional, auto-detected if omitted) + +set -e + +_activate_conda() { + local env_name="${1:?Usage: source activate_conda.sh [conda_path]}" + local conda_path="${2:-}" + + # Auto-detect conda path if not provided + if [ -z "$conda_path" ]; then + if [ -n "$CONDA_DIR" ] && [ -d "$CONDA_DIR" ]; then + conda_path="$CONDA_DIR" + elif command -v conda &>/dev/null; then + conda_path="$(conda info --base 2>/dev/null)" + elif [ -d "$HOME/miniconda3" ]; then + conda_path="$HOME/miniconda3" + elif [ -d "$HOME/anaconda3" ]; then + conda_path="$HOME/anaconda3" + elif [ -d "/opt/conda" ]; then + conda_path="/opt/conda" + fi + fi + + if [ -z "$conda_path" ]; then + echo "[activate_conda] WARNING: conda not found, skipping activation" + return 0 + fi + + local conda_sh="$conda_path/etc/profile.d/conda.sh" + if [ ! 
-f "$conda_sh" ]; then + echo "[activate_conda] ERROR: conda.sh not found at $conda_sh" + return 1 + fi + + # Initialize conda for this shell + echo "[activate_conda] Initializing conda from $conda_path" + source "$conda_sh" + + # Activate the environment + echo "[activate_conda] Activating environment: $env_name" + conda activate "$env_name" || { + echo "[activate_conda] ERROR: Failed to activate conda env '$env_name'" + return 1 + } + + echo "[activate_conda] Active Python: $(which python) ($(python --version 2>&1))" +} + +_activate_conda "$@" diff --git a/flagos-user-tests/tools/resolve_matrix.py b/flagos-user-tests/tools/resolve_matrix.py index 0ade247..1e8c9db 100644 --- a/flagos-user-tests/tools/resolve_matrix.py +++ b/flagos-user-tests/tools/resolve_matrix.py @@ -21,8 +21,8 @@ sys.path.insert(0, str(Path(__file__).parent)) from run_user_tests import ( list_test_resources, + resolve_conda_env, resolve_container_image, - resolve_container_init, resolve_container_options, resolve_runner_labels, ) @@ -37,7 +37,7 @@ def make_entry(case_path: str, meta: dict, resources: dict, resource_map_path: P meta.get("repo", ""), meta.get("task", ""), resources, resource_map_path, ) - init_cmd = resolve_container_init( + init_cmd = resolve_conda_env( meta.get("repo", ""), meta.get("task", ""), resources, resource_map_path, ) @@ -49,7 +49,7 @@ def make_entry(case_path: str, meta: dict, resources: dict, resource_map_path: P "model": meta.get("model", ""), "runner_labels": json.dumps(labels), "container_image": image, - "container_init": init_cmd, + "conda_env": init_cmd, "container_options": opts["container_options"], "container_volumes": json.dumps(opts["container_volumes"]), } @@ -60,7 +60,7 @@ def make_empty_entry(**kwargs) -> dict: return { "case_path": "", "repo": "", "task": "", "model": "", "runner_labels": json.dumps(["self-hosted"]), - "container_image": "", "container_init": "", + "container_image": "", "conda_env": "", "container_options": "", "container_volumes": 
json.dumps([]), **kwargs, @@ -74,7 +74,7 @@ def resource_entry_to_matrix(entry: dict, repo: str = "", task: str = "", model: "repo": repo or "", "task": task or "", "model": model or "", "runner_labels": json.dumps(entry["runner_labels"]), "container_image": entry.get("container_image", ""), - "container_init": entry.get("container_init", ""), + "conda_env": entry.get("conda_env", ""), "container_options": entry.get("container_options", ""), "container_volumes": json.dumps(entry.get("container_volumes", [])), } diff --git a/flagos-user-tests/tools/run_user_tests.py b/flagos-user-tests/tools/run_user_tests.py index 36a53a1..ac7f534 100644 --- a/flagos-user-tests/tools/run_user_tests.py +++ b/flagos-user-tests/tools/run_user_tests.py @@ -451,28 +451,28 @@ def resolve_container_options(resources: dict, resource_map_path: Path) -> dict: } -def resolve_container_init( +def resolve_conda_env( repo: str, task: str, resources: dict, resource_map_path: Path ) -> str: - """Resolve container init command for the given platform and repo/task. + """Resolve conda environment name for the given platform and repo/task. - Lookup: platform -> container_init -> "/" | "" | "default" - Returns "" if no init command is configured. + Lookup: platform -> conda_env -> "/" | "" | "default" + Returns "" if no conda env is configured. 
""" resource_map = _load_resource_map(resource_map_path) platform = resources.get("platform", "") pcfg = _get_platform_config(resource_map, platform) - init_cmds = pcfg.get("container_init", {}) - if not init_cmds: + conda_envs = pcfg.get("conda_env", {}) + if not conda_envs: return "" key = f"{repo}/{task}" if task else repo - cmd = init_cmds.get(key, "") - if not cmd and repo: - cmd = init_cmds.get(repo, "") - if not cmd: - cmd = init_cmds.get("default", "") - return cmd + env = conda_envs.get(key, "") + if not env and repo: + env = conda_envs.get(repo, "") + if not env: + env = conda_envs.get("default", "") + return env def list_test_resources( @@ -498,7 +498,7 @@ def list_test_resources( container_image = resolve_container_image( meta.get("repo", ""), meta.get("task", ""), resources, resource_map_path ) - container_init = resolve_container_init( + conda_env = resolve_conda_env( meta.get("repo", ""), meta.get("task", ""), resources, resource_map_path ) container_opts = resolve_container_options(resources, resource_map_path) @@ -507,7 +507,7 @@ def list_test_resources( "resources": resources, "runner_labels": runner_labels, "container_image": container_image, - "container_init": container_init, + "conda_env": conda_env, **container_opts, }) From 4be9437ccb4d638a6007d14301c87a083391fd62 Mon Sep 17 00:00:00 2001 From: liyuzhuo Date: Wed, 18 Mar 2026 14:40:03 +0800 Subject: [PATCH 04/13] fix conda env --- .github/workflows/test_dispatch.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/test_dispatch.yml b/.github/workflows/test_dispatch.yml index 25245c8..4b39f0d 100644 --- a/.github/workflows/test_dispatch.yml +++ b/.github/workflows/test_dispatch.yml @@ -89,6 +89,7 @@ jobs: uses: actions/checkout@v4 - name: Install runner dependencies + shell: bash run: | if [ -n "${{ matrix.conda_env }}" ]; then source flagos-user-tests/tools/activate_conda.sh ${{ matrix.conda_env }} @@ -96,6 +97,7 @@ jobs: pip install pyyaml - name: Run user tests + 
shell: bash run: | if [ -n "${{ matrix.conda_env }}" ]; then source flagos-user-tests/tools/activate_conda.sh ${{ matrix.conda_env }} From e924f290cbda70d572504a95f1e3bef753441ff6 Mon Sep 17 00:00:00 2001 From: liyuzhuo Date: Wed, 18 Mar 2026 14:41:13 +0800 Subject: [PATCH 05/13] fix conda env --- .github/workflows/test_dispatch.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test_dispatch.yml b/.github/workflows/test_dispatch.yml index 4b39f0d..50f8f86 100644 --- a/.github/workflows/test_dispatch.yml +++ b/.github/workflows/test_dispatch.yml @@ -92,7 +92,7 @@ jobs: shell: bash run: | if [ -n "${{ matrix.conda_env }}" ]; then - source flagos-user-tests/tools/activate_conda.sh ${{ matrix.conda_env }} + source tools/activate_conda.sh ${{ matrix.conda_env }} fi pip install pyyaml @@ -100,7 +100,7 @@ jobs: shell: bash run: | if [ -n "${{ matrix.conda_env }}" ]; then - source flagos-user-tests/tools/activate_conda.sh ${{ matrix.conda_env }} + source tools/activate_conda.sh ${{ matrix.conda_env }} fi ARGS="" if [ -n "${{ matrix.case_path }}" ]; then From 615b5668d572a190b61fb334e57140e97e631994 Mon Sep 17 00:00:00 2001 From: liyuzhuo Date: Wed, 18 Mar 2026 15:02:29 +0800 Subject: [PATCH 06/13] fix test cmd --- .../qwen3/demo_0_6b/conf/inference/demo_0_6b.yaml | 2 +- .../flagscale/inference/qwen3/demo_0_6b/demo_0_6b.yaml | 10 +--------- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/flagos-user-tests/tests/flagscale/inference/qwen3/demo_0_6b/conf/inference/demo_0_6b.yaml b/flagos-user-tests/tests/flagscale/inference/qwen3/demo_0_6b/conf/inference/demo_0_6b.yaml index d941f2b..f1ce909 100644 --- a/flagos-user-tests/tests/flagscale/inference/qwen3/demo_0_6b/conf/inference/demo_0_6b.yaml +++ b/flagos-user-tests/tests/flagscale/inference/qwen3/demo_0_6b/conf/inference/demo_0_6b.yaml @@ -1,5 +1,5 @@ llm: - model: /share/project/models/Qwen/Qwen3-0.6B + model: /home/gitlab-runner/data/Qwen3-0.6B trust_remote_code: true 
tensor_parallel_size: 1 pipeline_parallel_size: 1 diff --git a/flagos-user-tests/tests/flagscale/inference/qwen3/demo_0_6b/demo_0_6b.yaml b/flagos-user-tests/tests/flagscale/inference/qwen3/demo_0_6b/demo_0_6b.yaml index 7f997c7..e266a48 100644 --- a/flagos-user-tests/tests/flagscale/inference/qwen3/demo_0_6b/demo_0_6b.yaml +++ b/flagos-user-tests/tests/flagscale/inference/qwen3/demo_0_6b/demo_0_6b.yaml @@ -23,15 +23,7 @@ setup: - git clone https://github.com/FlagOpen/FlagScale.git && cd FlagScale && pip install . run: - - flagscale inference qwen3 --config ./conf/demo_0_6b.yaml - - | - pid_file="./outputs/qwen3/inference_logs/pids/host_0_localhost.pid" - if [ -f "$pid_file" ]; then - pid=$(cat "$pid_file") - echo "Waiting for inference process $pid to complete..." - while kill -0 "$pid" 2>/dev/null; do sleep 2; done - echo "Inference process completed." - fi + - flagscale inference qwen3 --config ./conf/demo_0_6b.yaml --test verify: log_path: "./outputs/qwen3/inference_logs/host_0_localhost.output" From 39dc2081a71eb85129c1ed0430accb91daeaa633 Mon Sep 17 00:00:00 2001 From: liyuzhuo Date: Wed, 18 Mar 2026 15:06:15 +0800 Subject: [PATCH 07/13] fix test cmd --- .../inference/qwen3/demo_0_6b/conf/inference/demo_0_6b.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flagos-user-tests/tests/flagscale/inference/qwen3/demo_0_6b/conf/inference/demo_0_6b.yaml b/flagos-user-tests/tests/flagscale/inference/qwen3/demo_0_6b/conf/inference/demo_0_6b.yaml index f1ce909..c91ac04 100644 --- a/flagos-user-tests/tests/flagscale/inference/qwen3/demo_0_6b/conf/inference/demo_0_6b.yaml +++ b/flagos-user-tests/tests/flagscale/inference/qwen3/demo_0_6b/conf/inference/demo_0_6b.yaml @@ -1,5 +1,5 @@ llm: - model: /home/gitlab-runner/data/Qwen3-0.6B + model: Qwen/Qwen3-0.6B trust_remote_code: true tensor_parallel_size: 1 pipeline_parallel_size: 1 From e059d32a167c81bc04d9929bd656a825218fe64f Mon Sep 17 00:00:00 2001 From: liyuzhuo Date: Wed, 18 Mar 2026 15:13:22 +0800 
Subject: [PATCH 08/13] short name --- .github/workflows/test_dispatch.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/test_dispatch.yml b/.github/workflows/test_dispatch.yml index 50f8f86..c5253bd 100644 --- a/.github/workflows/test_dispatch.yml +++ b/.github/workflows/test_dispatch.yml @@ -74,6 +74,7 @@ jobs: --changed-repos-list '${{ steps.detect.outputs.changed_repos_list }}' run-tests: + name: ${{ matrix.repo }}/${{ matrix.task }}/${{ matrix.model }} needs: detect-changes if: ${{ needs.detect-changes.outputs.matrix != '' && !contains(needs.detect-changes.outputs.matrix, '_none_') }} strategy: From f4ba4a372fa2305143751dd594bc5c764909ed94 Mon Sep 17 00:00:00 2001 From: liyuzhuo Date: Wed, 18 Mar 2026 15:23:49 +0800 Subject: [PATCH 09/13] fix resolve matrix --- .gitignore | 1 + .../__pycache__/run_user_tests.cpython-312.pyc | Bin 22466 -> 0 bytes flagos-user-tests/tools/resolve_matrix.py | 4 +++- flagos-user-tests/tools/run_user_tests.py | 3 +++ 4 files changed, 7 insertions(+), 1 deletion(-) create mode 100644 .gitignore delete mode 100644 flagos-user-tests/tools/__pycache__/run_user_tests.cpython-312.pyc diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..71bc36f --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +**pycache** \ No newline at end of file diff --git a/flagos-user-tests/tools/__pycache__/run_user_tests.cpython-312.pyc b/flagos-user-tests/tools/__pycache__/run_user_tests.cpython-312.pyc deleted file mode 100644 index 05fdd462b2ee932f1644c0ed136243d22786ae4d..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 22466 zcmb_^Yj7LKncxiGPXZub1Rvsvq(p)uAyJZLi_v2xQWEvBWQ($6$&$Gc1|&g&06har zA_6ANR<;5rsuNjuE#Y&#hBnR_K6@`rZSKNcU2Wv#cGp+AAE3cR(1cf|tX;L6xTlak{7b>#v_*_xEc2+-|o}@YtLGbL^QRiuwpYD=|aVHfmsv-_TvLtMW}zD{GKpAm0r6b!Lj~W-U=0YlS~s)WO=}&k%J+T?(WI zv5#8Vx^L*O)U&%-hn&MY0iz4bY0^fh-w<_=n%H`P@&J@OYGOU`SIsl9UdZ#ZJ}BXb z94sxCX@Hja%oMbxhYdhZBjkwXnv__x62m^2q&BqxMnAg^%3z;$Y%5@DU>T@kJHd|Q 
z&<1G%wjI($IBC2frvq{tNo`n96~YR=-2r(`sx-v36VjUhMeVdn?d*hhwy<5_pss9- zwz9z+T@=NcG*ookyVA0tsTs|TCfGaOsHLcj(+Ospk8-{I^!3SDDivj!RFqFKkuV?S znebRRmVmF5@$lG%VTOxNCHYt?$;G04$YMDWj*M5KnHaz`d^A4V8%ZWoP!MokKKArE z20lk)V@zs1oMPDMC}i;c7KZ5s;MDX~KQo@Z#iWu9))bD%0WL~vV!{a);HRPqHkyb) zl}s15w3BD1q8vXJjih2Xqd|g*n@%WbqPL@w=~R@UAzj(B)XgXt8=X-KM#iI&8&ESp z9ZyxeK9-EL%*}9oI?7utm&T!!P+20%F??)tDn7$V)$!seirr(O%afDPe4hE+xmTGO zbUO*nKY!s8jB7k@2`6Ttx2Y(X2*<^)l7{jTE;g0oLky|rMl?FbSBDo`lp=je?Ai za?loBGGDBAgk+R@zYyYGsW5+I*JP57#{V^SK$j5#L%Ikb1Q8>3)$$u8m-9hd&^;4Q zjSKoXu3N!68BKAq$jFW83@;csSioE&0^6+em>~vqMLiy(ar4d4gVbG3L=&STnrnKB zn$=`9W7I2}J4PKftA#IWkh*po(sc-V?PZARQ&3WTCe)avw2XcN@N9lq`dz7%9L^dt zhSw=p^M+>DsKRL2T#}+P#*FS=sis*|#>CTiPME2THlxQaWdeJ&=}B4Tgp6szF2}1O ztCd@^Ii#sOypFn))&c%u7!w-)W;M5{b}A(wTT}2?cKLuEF#eT>EKnqyZ

x?bGXd+q2| zcGuMqyt})O_P-cCaZOGP9u58mCpKsjG+b0LM1d-&Amn&hhEsxpkA}I(xL^oR;XWzo zN8`zGO3;sS$>}M<@;vTaCpa$21PVLb~*5jiUur^0X& za0x*VLlqh4aRrcm;p=y&ck|<6F1mY)OHRNhygMG7nNE*SCwG%mVoKaRl6)^YR>=0U zdy73sXpD-JIwDRcKZ4UZLZ*;wgi3e7AO9Y_=BQ8VsMhXMUH5A@a<1&l3vKh$xf^q* zign!|IlWokuiV?R1D_fwe_$!L7|Uzlw%oH6_I=k;^6klNOwVZu}`j!<)mr4f_SsBiS$3AhJY_!49FK)>`zhQ2MNv>>g*y?xC%XsRWK}wFX8fAZgU1g5G^QDYtLfHdkjIfg!)O6}^n zd>F{Ott+(aXs|8QNy5&cxEDmE2_l1t)4dP=9`A(LS&GI3gt`V(G)re_7h1cIx|2Rh#{R8)oS> zA{b$Vor=Yy9G(S&4k}Fv1~O(mhueD4DDFPE`Z+hkn4}Uc=fSiFOw+;^%H#TkT?QYq z7yd1p&UFIh5%}Y;!)uPJ)KL!C+^Kbk>u!2Jo$D+)+UADH=1_JvEIAe(d0ol5BdZsz z&X4PTtNwxf6K@~7cj&DnCI7(vOQrfV*@3d7e*SA^r+=w#u`b8|!r5N2Bf&cf!9Rvt zDA;Htsp)WB&>>$a=(to8;$e74aC?;>7eokAr{GP98YCY;DRnE8f`uGSX9>IoZRctbhT zK_WEH>hFBHx|L+iGTE!rOmfzSpTYRND+`%NrRB zYbBCSETwE{DkjC8G09^lw+fi8kTlj-GUB#bd&ZWrV{2ifbZpUH8)>gSV};h$Wo)l& zSVzYGy5ip%h5j9Ef3-){NpQGo_l>Q8Loc_Hh81zg`Y-6wmCZd$Da;x6Ee*sf z;|BIlqsq0*>1p5)(y-=k43cr^-l7lcad5*pP^<^WfnvQXoSSN7eXr}@u*mhu;{-j- z{3}PwziFiMfJdLvjpCux5WWJWHUQfa?)DTErc$ z7X+r;DT!$mlb8)i+Qy^tc(M&p+)8qBwoQ!;i?;PhnB>A6!#3&2#JagCo~pNEsd1)j z!#Np*<`66wi8KRZHpr+WQ(+#Cx*p)C`P4{q6gV^9Tx-ut{Y#w@YRFxCV9vOOLamxJedUxc4Gur z1heauo~F_C|vz?FP^v(|FMwl^)WS@Qb| zno`5Ati9}QU3zKprTodl>5}(A*79pdbJ^y(dtv@Uaogbi;rk~(a21=+uG!9&1KYA^ zAnRjmU3Ru)ZF$R`f3dW^??Y>!1YfT2Ekp{>RVaF&f8F1hJ6!Syi@xr{t{+BLc9jmC zEIx4xdGXWqr&_JOu41AZTM5EPR&UPtp|y3rW#?Ob;V{5#JC(w$6n! 
z4?Tg3i`q6sS3Hy}K)8<1LVLkmi2Sg9`9^8)@nYXVap3tP^W|SUUjTV!TW9Tdth;>+ zFX#Fer(fyM>+-4hBFo#~9lbZZ%Iq(>_t$D&ao3TRmscY9{U5kKz^0y~nRTW;e=Hxq zH}G9k!JYdm3`k%%j0d#UMA|AqTOFjW^;@>}{*fYc@t4lw$}mkecU7LJDVuNMcG22e z)U|S;Sp;S;GReX!kKAGfZ8UK+qGnl;mm_S@KSN}Ricu*D#iy2L<9l>QbDQQtClI5) zM@!Pp2ue38MdQKJ867YHns+5eil(Z^=mheno1To0)nw?5_C4*p5`%`4&;(JQZvs)F7Bfr{Q=2 z6fo(NEb73B!W89LaZeHV5*un5VLu^iQ0`gCeHH%rS$NG+j~qR(wk>rpb{8EzdH0_M z3L1#a4gJdD&T7|9_66O-@WP2~e9hFf?rK`vwzw^?E$m+&UKw9=oyr=@R@Vb-^QyHu zPnWFQ*KMw%r>o$55bR$K_7}Upw9;3sKfGo;Qg#O(xI0$e9Us~{Dptzrsn{r+U0k(h z)OErc2Z}u-oUAw47`H(x$CX4@?p0VdOH@@NM+3>G^{24(EXdkr813{-GYO{&fTRUr;wcetFE#ZUE{S%VWDhy_& z#29Fmh>=IJrpQV#*}5Xy)kIZxmE8s2I$Zq%{Rvo9`c)>CD$9pvkABhaVX3AP`$xZG z0I6oMpcqEe6sX5QkRY0Gr5J4g$!QMG4t32_@o)-T*UvoIk ztC)D82M{NkuMyb!nb2+mJ#s4=8yio7oliytXPCNMz16{#j7E~N2(U>;&d_d*Y+)xu zw35+Lse(bP1UZRdXhm)L=yU>Xn@KPZiq^T!HaJ)XBhsm>u4p$P;&-d;5?Gyzlb0(A z?N%kK5riiAWJ1(RVo@|GUbqOujQV&{1{TE%^^mAgsguD3B3n(`Hm1V}5MwGz$Y4)- z@`|ro=;NYI8deE$LO*J;CUxT#}?R`R(i0Za}W)DdbXv^;o9sYlK($-5zRRh2e zfYd7Jkcfb-Ld!IVs!)(~iJ3TUyhz9iL{OwF!8o0`k${DjHeMpcgrX@#(#EIJ9NEu+ z?F1uf;~)?ZNSuOIMMZ+ht)=a~y}isZ)&(o5H>jzySt2868$NOAxo0G1E^V_QIP#GY zrY#qrJ1^%rNY3-5p&}+wup);8tDE3VH&{^KMUF55lDPVQ%bf)0(k zf|(DGMn{ljxcy`ZP$&dO5sgO4_)QR9!TQAQgDk?>33Vbuc~OEqdlBE!64EG z_htOHkf9wRQiF-`h$uhUB@7beA7b$Vd=aL$9t+Yp!2%Z9Qg*c?E~K9k>^PT%B6;}Z zKZOl_j{5WuXi9I%9?#y)j%Rz`rU$ z*t*M0I8VCdYRejatEDV~ih;7#-|e68Ul`3@2i9|LuQ!9iSEu&J8{6*mb8p zJ9alYpDfk2u34BLx|d)0iEVF0?Q2TeClz)s(IF1=qKp z1Ri}Mv}W60c6+m@5!|T;?<#1Q-D~zIHh}l6*;*BFFMt>8m^IV(bw^8Xa(Vpzz?x&I zXc~g?ELwJ~>2{W_w%=Z+VUFfo|M_tjWexGj*8I>n)TI4u+Bei?_@GHYw8O~7A(i_> zc-0nE{}vc$nB90fMbgWzJQYJDeA zxwPr2ngu~{G2lOWglH;ZQ`KmER}t%H(6*s4V^xH#Sv9_3BjMHMRoZ1}Yjy0ZqhQ2w zMJI=e-#3Cd178f1W5FLRZUSc8T#J#evZPzLM((p5&H_`;w|pKxxrK5mImFUjL$x*) z>#B?nOT74$5~hTD)@>Q%XJ%ay-&lLbv@w_T-#Tl}SSQfvxanc*&m!luD*KKEwA0#`b<6RAy%g+wt)7k)~Y+>=3T0_?#nnqjpSF`)xa}P zpV}d}D`U?nb!8kG=OM_rb1_plfrigbPsS;a7+9WIu!GcbEh$tglLs)>DXsQo922|b zoNAa+iD+9;UIQCYuz0tCX=D{k+pI6uE8~auYct-A4+JMrsNh3HN&jp^rUAy?2{ai{ 
zjZJeVuyK{$sn=1e{FaQL)3DnTG~2onj-;E0!9#TzT#7acV46GKk~H#HOau?}2bYu?X=zBHW)~k|l%mm&AHRCMhH*qdx)%io+Nu8CR<9M$IBH zZ82czfAfFfNjHiZ&Y=g?rWU64klN4u1`jRJ>4j65z@4B@G7*i4z6{)6$mDk8YY)E2 zCtCBaJn7J-*Gh|H^Cz{*Yx2K|ID<#2-+tTBEBj3~)CIDLYD)NN3(@`R=3 z!kDc?p$5F1fSCoOeLQ)c8^vS;4~}6`NJ_;fqrmUS!P}BMi}j04Kc8Zu=fJH_#bYVa zA&Kz2NPEPbL;RBj!}Jsjwj5$42{u-_3z2z6gN|S(ONXc$%|vyIQEnXZaF{RZQ(;a8 zBQ_7D2Eh^m-_aN%=D9h9vEq~iZ=6Cd+>9TjNrFwXx)FCo(TGIsLwYg^9C}9(k0YT@ z1{Cc>)sYqqqM?sFi1lbE`7s_HuZZ|p^F_3~+!$iU*JJqOe+f$mEJMH@2Smke?pn#Z zYi{tN)lv3rf8g1%>e*4;`P52V$#XbserWZ4e4u~j;9n#k9C&v1z_V)yhI0-1(;o(U z*DhZxzBE?4Jf1yWb~it8x39X}*W5ckZf5fC*H0JBtId52W_8ZG(_3r+R4coRzLRUt zQ)Pbxu9L%bN z$u|}*mK=M4Gq$=3uX%RObgt}hLII~|Vc^>{xu;eg?ZD4l-FJ`9A6>IHm)%Xd@m2TE ztf`vzk!f4m+q`sj@#2$TcNo6WB^zZEHTEx)kZ zeYmvqNGWhMd*-3l3+?^R$$WpQIaJ_QoA;Ld`{su}M%{L(aJw*3tUtJB>xV|81aV@a zYoW1dXW zbb0Y|?q+_b}w*fcILv~k}s4!_0Zi&W@GoVwlH1X|4h;M?3(l9Lzs}p zzUAP`mrIQ&vL_ezee3KaTf+le>#D6aul)rm=G~OP{SzN$@~j8;EuUV=lmh3nb!AiC zUHiO!A+cuK@yOaBEvo19mrB-d5V}+;&z7v6bA#*t?!tlP{VVJTo;CmZRh?(vGsSn4c7Nh1HQ8z+Mjpm&xVX3{U?(t4je5g?kc{n!E1x>j}a1ZV|u(7r~+jP zmL@!>4!97NYcD965o*%|j&>5~BZqK0{tcXt;C445ot)|-wYb=-^DXbFT?$NHV7ue! zJ0Y{`RIPI9z|kA4oc;uk!KNppaO$%rR*(N=&H(s4MScA>QwmOY@saa69c###ZqiWu zWCnO=WEz2cra3Qgylxq~8nQ}F8Jehu={xfo&6dy#=a|t*(D{sZOK8Q{D5I62e*~sH zMcG?zMRja|zxr?JXYyJ8;?AtdU4SdIVqTo^RdL9%6gs<+XH=uOIzLuioh|9xCu4B) z0Zf5acX~_i1uCj=2(ebQ9-(fgVYl;S>s!uk%EIvuWiA?cCR(D}tWG zvkN@`z|$EH%~Tk?uF=(moWUHh)f5*&I7bZMl z{>TGs$Evj>KfGoQ7A}APrFULh`O1gA=)h3X)tS8;PNc6cel;I1`Mabe>0@_Gj(uxz z%^fT}zp{U|_bB)(mfilGVewFQ@S($<)4geW&6M|*8oCODA2#fP?5$EgInSF7uQlX5 zOM&h}^uxeDL}hc`JvVs$#gO~gOc(_La78hmX++aPW zKgW8i_)W_`1`)gl2mj=)39a>;g@zeg5*akj2rh|0E*+<{1QaKfNpJm zU~ONuwy#-tlpU_DdEE>Oveq>-1B{QuefO*LU(JO}4(3D0&irxIU2cD1W@P*qv z*zRhpbkr&{P_02pa88s;YQ>T`?{Ev`^r>iyk=uwq;CQi&-2G5@qetA@MqvxR9)>$z zaJfwK(HoJ=h6J-5G~^mq9c@KZTiM_K zz~8y*?<{upuN*J=j}~>Fhem&~xn~(T;DZ2J3(N(TAJ9)4@EM@pwecBL%kp#de!y#i$VN01X3(RVYq>4RTM>9wD_Sv_UFnf? 
zk-q{j;;xKEb^^i{f*a6pFN!7<)zPj@9In$-vDDD?b>Y1b0BCq2qoCrN4%B{@)=&7zcmH!pPkDbvKB;PnO(I z&7FH_^_LwjMN`Wo@Ae1Y9jji*Ij~Y!@}8PI4-|cOc78T@1&l=K2DNT&&D#p&CF_%- z2rZTaarXA&?IN>hIaKn4km4!ZeRr?TU&}oOwkLbn+{uSlSJ}~8G_`^>=M4}O(+KP{ zp1!%K%l?iB{_a(O_b>fD^Hvb+o-bSL?jD>!`0X#{cD~v3TF>jDlC=vQroQzw$cGKR z;2~&hF6x@d*0Mpm!EKKm1x~V`RNW;q%KZSadT44=Hn>hHl&!BG))`$!ksgSgqa5#0 zm-MjUf})f~il=Arh%utW71$|g&;n``bOts1HE&ITq*1LKboIO^Lvi(6z`~_Tr7ncs z0uH2}4RAo;hV&0Eeuln*s>94+GI9gBcGThE&1jB^O@cSIMA7F&>QyKdsa1g-A&@+# zwH-oc{mAY^XlXvu2DcEU6cUw|ylB9eeYxA(nAj-jBgOhqVQ{Rw<7A0Zg7?5SuR&mtOp+#gY4E(%RGp?!N$*JQCL{5CKcB zlk)78sp|JpPTB6d`@;MSMPFB;tz_?&=)?8vZEe|0po!SNFt~7F)zVnd%?*^ns(WVs z%mNQqPh0EUiHBC_;-#|Fw`5tgNh%Fo^k2d-b3eh?kKqMp#a4@v$C)9Uy2!N!KD*|m zin#j#xq_>17$UIhEMIWdJ!;wh<_oXAP~3TNWpAnFNO|MuCwi^TP@%MX!)JF^6I4WY z*1z1mv%)2EV4NRy+uN#kV{k;Q!B4>+tJsRSfH0~c5I3Gs@XXq) zdZ-O`)`3n8eFHW7|p!s4BrBrKZKAD=@< zZ&XuAKczt~-T>i%yBV%J6o?^0j5m{@sNq436n|Gj=BSY96atWT?M-ABCWM3T-TC8}=4;UZQaS$lD0=yWO=Ayfc!g z*WLc5Ba27!+e_|_>>!BlbuEuTpuatTJ7>siN{;rduIy_IItzEnK|1 zuYzMbtU7!^zE_;aQT0;OBZr<)H1lF9d;@HK^-GQ^JF2$oOOis{FL@>uZMK{Oc(knh zJ&$~f$nE-E4-jt^5?WHj(}VL?`&%1A#sHoH7OAQDa3-Oj}m&;W{ zd0zPT>WHwujDDjp!sqzAsNC9rKcvbd1a7x}c2sN6F#L3oV#i?BPV&Fs3rF25tK8!a zYpYXE{r|eQz`n(wQnffxyH_CXH9}5}wF@~-kW*v*!mOz7i~s(yW>srY)tY9NFO7l= zYHU$^@33CbMHzq>1C1r{TsxF;GF4{{{lr)=+L}N+N~^Ge|BSraKaV~$Mh@Crg`R=i-s-Pl`}uUo#n1kp z82MS4VHGa~iq)53aMA|xC#TZep8<~uRDOkP{B#wuXyLCg!G*agEP^6%x(Vw+3mLKg zNJ0s5hk;B0BCgo(lQH~V4>W(_H@VN9HcG$ql=f6}iKjM$O0l%Lw^zCkmOlAqxEO|7 zeK0e@?c*djL+X+4v`Ihwg+CJ{UjT<26*9k9EfY*T;i>^JgV1n^149E@5^mY1xany6 zOie~nhTXs-5;<%Gm&kOT8NP6y5q)%m;I@N5PXSHhz+?;_(;+SeAq<`Y42tLyf)Zr9 z`NYMG7cTZQmz2hCc2`R8l|Y8cFNw*&4b}s$i+^KX6csMEjM>!Gflw`oL z$lo6Xd?GKw$pr3yjci7R6oLRBO|)11igYO%#DG2 z6CV}oj&WlkmnKrr;3p^8!Yn%ixg<@n5Fq^Kg*w8m$pgz1=OMzask9hq%olx!G7uB= z?DXUm+==Ajx4v|ElTvhP*WscQEjs9C&gJp-1iXS4bf+Qh&|#e-LrVBg6eC6E6B$y$ zfVQD1%dKH@0$w$(l)6dK6>(+A{(IyoZ@~1;QR{Z^1AE7+y`${u2e*zo!`w+w`01?= zbpBPHKW8d-o+;}5Yr3;#ou!&KNYYMhNIOK*4p-C6#jYbH?WmGgHo438=g?`S;Zu}b 
zK{F^!|50&kgH)PuCMQ4qYb=%w^w990BU~K%**a$``bfd}Y)?FR>ONIU4KlskU zW%ft0AH?o^{-*K$#**t?)&QP2ci6iV^AkB=fnKZYUNh}lH(BqR=1mK(JJv@XU2mWJ zvvYUYh5bv177wk}G0W`SsYgcZH)sB6X5G^O_Rqj;fmbeM&7c+%UEbFm;`Puy%YkKU zvHsAS?eMzIL73xb@0^uzzuJuGRSbXMcvF_TM=_NIK@asAG>b?cI@x9QKn=D#-)^xpPd)?g0 zU*Qj2t=k(x6Xh@2gOZp0x1R*9)k1jw?EJ}!5q_lxG!&n?;BflYNN&d~6Zr$HzF^VY zUD*EKz_R|`GlfITw~CgdYq~$EUVB*A8NWI7^`XU`*=K*DV{&15iFPH^2-t07iIEXd zwA{kgihBsx0(Qc6K_T7tBD2T4G_xJr+jD9fq7*i^H>j7ea zL3{6Dg@WiZPlA=y{ed6PJjRqt4|R+_PJg29rx}pkDkN-Z7Q>F97(R8FgpXi~_b+%B zB8!3S(N%rZV?7l5I8&F{XUd@C;4cbC)U< zh9Hsdf9I?izrXJ>Nvn)%F3|M0d`E@Ca9IjhhQ;su!;eXNC841@y2^b|e4=M)SJ_fm z(PHQe*2ey$4VOto(cuQeW)LT51@VsD%<3)wbJ#ZYZ%I1gSgT)jLThv mxT Date: Wed, 18 Mar 2026 19:33:21 +0800 Subject: [PATCH 10/13] fix api invoke --- .github/workflows/post_test_cases.yml | 48 ++++++++ flagos-user-tests/repos.yaml | 16 +-- .../inference/qwen3/demo_0_6b/README.md | 2 +- .../inference/qwen3/demo_0_6b/demo_0_6b.yaml | 2 +- flagos-user-tests/tools/collect_test_cases.py | 102 ++++++++++++++++ flagos-user-tests/tools/test_post_report.sh | 113 ++++++++++++++++++ 6 files changed, 273 insertions(+), 10 deletions(-) create mode 100644 .github/workflows/post_test_cases.yml create mode 100644 flagos-user-tests/tools/collect_test_cases.py create mode 100755 flagos-user-tests/tools/test_post_report.sh diff --git a/.github/workflows/post_test_cases.yml b/.github/workflows/post_test_cases.yml new file mode 100644 index 0000000..cb1e5af --- /dev/null +++ b/.github/workflows/post_test_cases.yml @@ -0,0 +1,48 @@ +name: Post Test Cases Report + +on: + pull_request: + branches: [main] + types: [closed] + +defaults: + run: + working-directory: flagos-user-tests + +jobs: + post-report: + if: ${{ github.event.pull_request.merged == true }} + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: "3.10" + + - name: Install dependencies + run: 
pip install pyyaml + + - name: Collect test cases + run: python tools/collect_test_cases.py --output test_cases_report.json + + - name: Post report + uses: ./actions/post-benchmark-report + with: + backend_url: ${{ secrets.FLAGOPS_BACKEND_URL }} + api_token: ${{ secrets.FLAGOPS_API_TOKEN }} + report_path: flagos-user-tests/test_cases_report.json + list_code: flagops-user-test-cases + list_name: FlagOps User Test Cases + header_config: >- + [ + {"field": "case_id", "name": "用例ID", "required": true, "sortable": true, "type": "string"}, + {"field": "case_name", "name": "用例名称", "required": true, "sortable": false, "type": "string"}, + {"field": "repo", "name": "所属子仓库", "required": true, "sortable": true, "type": "string"}, + {"field": "updated_at", "name": "更新时间", "required": true, "sortable": true, "type": "string"} + ] + fail_on_error: "false" diff --git a/flagos-user-tests/repos.yaml b/flagos-user-tests/repos.yaml index fdde8e1..fe7f816 100644 --- a/flagos-user-tests/repos.yaml +++ b/flagos-user-tests/repos.yaml @@ -5,41 +5,41 @@ repositories: flagscale: - url: https://github.com/FlagOpen/FlagScale.git + url: https://github.com/flagos-ai/FlagScale.git default_branch: main description: Large-scale distributed training framework flaggems: - url: https://github.com/FlagOpen/FlagGems.git + url: https://github.com/flagos-ai/FlagGems.git default_branch: main description: GPU-accelerated math library flagcx: - url: https://github.com/FlagOpen/FlagCX.git + url: https://github.com/flagos-ai/FlagCX.git default_branch: main description: Cross-chip communication library flagtree: - url: https://github.com/FlagOpen/FlagTree.git + url: https://github.com/flagos-ai/FlagTree.git default_branch: main description: Tree-structured computation library vllm-fl: - url: https://github.com/FlagOpen/vLLM-FL.git + url: https://github.com/flagos-ai/vLLM-FL.git default_branch: main description: LLM inference engine vllm-plugin-fl: - url: https://github.com/FlagOpen/vLLM-plugin-FL.git + url: 
https://github.com/flagos-ai/vLLM-plugin-FL.git default_branch: main description: vLLM plugin system te-fl: - url: https://github.com/FlagOpen/TransformerEngine-FL.git + url: https://github.com/flagos-ai/TransformerEngine-FL.git default_branch: main description: Transformer Engine megatron-lm-fl: - url: https://github.com/FlagOpen/Megatron-LM-FL.git + url: https://github.com/flagos-ai/Megatron-LM-FL.git default_branch: main description: Megatron-LM fork diff --git a/flagos-user-tests/tests/flagscale/inference/qwen3/demo_0_6b/README.md b/flagos-user-tests/tests/flagscale/inference/qwen3/demo_0_6b/README.md index b38f61b..491ebcd 100644 --- a/flagos-user-tests/tests/flagscale/inference/qwen3/demo_0_6b/README.md +++ b/flagos-user-tests/tests/flagscale/inference/qwen3/demo_0_6b/README.md @@ -15,7 +15,7 @@ Runs 4 prompts with greedy decoding (temperature=0, max_tokens=10) and verifies ## How to Run ```bash -git clone https://github.com/FlagOpen/FlagScale.git && cd FlagScale && pip install . +git clone https://github.com/flagos-ai/FlagScale.git && cd FlagScale && pip install . flagscale inference qwen3 --config ./conf/demo_0_6b.yaml ``` diff --git a/flagos-user-tests/tests/flagscale/inference/qwen3/demo_0_6b/demo_0_6b.yaml b/flagos-user-tests/tests/flagscale/inference/qwen3/demo_0_6b/demo_0_6b.yaml index e266a48..1399352 100644 --- a/flagos-user-tests/tests/flagscale/inference/qwen3/demo_0_6b/demo_0_6b.yaml +++ b/flagos-user-tests/tests/flagscale/inference/qwen3/demo_0_6b/demo_0_6b.yaml @@ -20,7 +20,7 @@ env: CUDA_DEVICE_MAX_CONNECTIONS: "1" setup: - - git clone https://github.com/FlagOpen/FlagScale.git && cd FlagScale && pip install . + - git clone https://github.com/flagos-ai/FlagScale.git && cd FlagScale && pip install . 
run: - flagscale inference qwen3 --config ./conf/demo_0_6b.yaml --test diff --git a/flagos-user-tests/tools/collect_test_cases.py b/flagos-user-tests/tools/collect_test_cases.py new file mode 100644 index 0000000..365e3ce --- /dev/null +++ b/flagos-user-tests/tools/collect_test_cases.py @@ -0,0 +1,102 @@ +#!/usr/bin/env python3 +"""Collect all test cases and output a JSON report for post-benchmark-report action. + +Output format: + [ + { + "case_id": "tests/flagscale/inference/qwen3/demo_0_6b/demo_0_6b.yaml", + "case_name": "flagscale-inference-qwen3-demo_0_6b", + "repo": "flagscale", + "updated_at": "2026-03-18T15:02:29+08:00" + }, + ... + ] + +Usage: + python tools/collect_test_cases.py --root . --output report.json +""" + +import argparse +import json +import subprocess +import sys +from datetime import datetime, timezone +from pathlib import Path + +import yaml + + +def get_file_updated_time(filepath: Path) -> str: + """Get the last commit time of a file via git, fallback to mtime.""" + try: + result = subprocess.run( + ["git", "log", "-1", "--format=%aI", str(filepath)], + capture_output=True, text=True, timeout=10, + ) + if result.returncode == 0 and result.stdout.strip(): + return result.stdout.strip() + except (subprocess.TimeoutExpired, FileNotFoundError): + pass + + # Fallback to file modification time + mtime = filepath.stat().st_mtime + return datetime.fromtimestamp(mtime, tz=timezone.utc).strftime("%Y/%m/%d %H:%M:%S") + + +def make_case_id(meta: dict) -> str: + """Generate a case ID from meta fields: ---.""" + parts = [ + meta.get("repo", "unknown"), + meta.get("task", ""), + meta.get("model", ""), + meta.get("case", ""), + ] + return "-".join(p for p in parts if p) + + +def collect_test_cases(root: Path) -> list: + """Discover all test cases and return report list.""" + tests_dir = root / "tests" + report = [] + + for yaml_path in sorted(tests_dir.rglob("*.yaml")): + if yaml_path.name.startswith("_") or yaml_path.name == "data.yaml": + continue + + 
try: + with open(yaml_path) as f: + data = yaml.safe_load(f) + if not isinstance(data, dict) or "meta" not in data: + continue + + meta = data["meta"] + report.append({ + "case_id": str(yaml_path.relative_to(root)), + "case_name": make_case_id(meta), + "repo": meta.get("repo", "unknown"), + "updated_at": get_file_updated_time(yaml_path), + }) + except (yaml.YAMLError, KeyError): + continue + + return report + + +def main(): + parser = argparse.ArgumentParser(description="Collect test cases for reporting") + parser.add_argument("--root", default=".", help="Root directory of flagos-user-tests") + parser.add_argument("--output", default="test_cases_report.json", help="Output JSON file") + args = parser.parse_args() + + root = Path(args.root) + report = collect_test_cases(root) + + with open(args.output, "w") as f: + json.dump(report, f, ensure_ascii=False, indent=2) + + print(f"Collected {len(report)} test case(s) -> {args.output}") + sys.exit(0) + + +if __name__ == "__main__": + main() diff --git a/flagos-user-tests/tools/test_post_report.sh b/flagos-user-tests/tools/test_post_report.sh new file mode 100755 index 0000000..3d7de1d --- /dev/null +++ b/flagos-user-tests/tools/test_post_report.sh @@ -0,0 +1,113 @@ +#!/bin/bash +# Local test script for posting test cases report. +# +# Usage: +# ./tools/test_post_report.sh [api_token] +# +# Example: +# ./tools/test_post_report.sh http://10.0.0.1:8080 +# ./tools/test_post_report.sh http://10.0.0.1:8080 my-secret-token + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +ROOT_DIR="$(cd "$SCRIPT_DIR/.." 
&& pwd)" + +BACKEND_URL="${1:?Usage: $0 [api_token]}" +BACKEND_URL="${BACKEND_URL%/}" +API_TOKEN="${2:-}" + +LIST_CODE="flagops-user-test-cases" +LIST_NAME="FlagOps User Test Cases" +REPORT_PATH="$ROOT_DIR/test_cases_report.json" + +HEADER_CONFIG='[ + {"field": "case_id", "name": "用例ID", "required": true, "sortable": true, "type": "string"}, + {"field": "case_name", "name": "用例名称", "required": true, "sortable": false, "type": "string"}, + {"field": "repo", "name": "所属子仓库", "required": true, "sortable": true, "type": "string"}, + {"field": "updated_at", "name": "更新时间", "required": true, "sortable": true, "type": "string"} +]' + +# --- Step 1: Collect test cases --- +echo "=== Step 1: Collect test cases ===" +cd "$ROOT_DIR" +python tools/collect_test_cases.py --root . --output "$REPORT_PATH" +echo "Report content:" +cat "$REPORT_PATH" | python -m json.tool +echo "" + +# --- Step 2: Post header config --- +echo "=== Step 2: Post header config ===" +HEADER_PAYLOAD=$(jq -n \ + --arg list_code "$LIST_CODE" \ + --arg list_name "$LIST_NAME" \ + --argjson header_config "$HEADER_CONFIG" \ + '{list_code: $list_code, list_name: $list_name, header_config: $header_config}') + +echo "URL: ${BACKEND_URL}/flagcicd-backend/list/header" +echo "Payload:" +echo "$HEADER_PAYLOAD" | jq . 
+ +CURL_ARGS=(-s -X POST -w '\n%{http_code}' -H "Content-Type: application/json" -d "$HEADER_PAYLOAD") +[ -n "$API_TOKEN" ] && CURL_ARGS+=(-H "Authorization: Bearer $API_TOKEN") + +RESPONSE=$(curl "${CURL_ARGS[@]}" "${BACKEND_URL}/flagcicd-backend/list/header") +HTTP_STATUS=$(echo "$RESPONSE" | tail -n1) +RESPONSE_BODY=$(echo "$RESPONSE" | sed '$d') + +echo "HTTP status: $HTTP_STATUS" +echo "Response: $RESPONSE_BODY" +echo "" + +# --- Step 3: Post list data --- +echo "=== Step 3: Post list data ===" +COMMIT_ID="$(git rev-parse HEAD 2>/dev/null || echo 'unknown')" +REPO_NAME="flagos-ai/FlagOps" +WORKFLOW_ID="local-test" +RUN_ID="local-$$" + +DATA_PAYLOAD=$(jq -n \ + --arg repository_name "$REPO_NAME" \ + --slurpfile report "$REPORT_PATH" \ + '{ + items: [ $report[0][] | . + { + repository_name: $repository_name + } ] + }') + +echo "URL: ${BACKEND_URL}/flagcicd-backend/list/data/${LIST_CODE}" +echo "Items count: $(echo "$DATA_PAYLOAD" | jq '.items | length')" +echo "Payload (first item sample):" +echo "$DATA_PAYLOAD" | jq '{items_count: (.items | length), first_item: .items[0]}' + +CURL_ARGS=(-s -X POST -w '\n%{http_code}' -H "Content-Type: application/json" -d "$DATA_PAYLOAD") +[ -n "$API_TOKEN" ] && CURL_ARGS+=(-H "Authorization: Bearer $API_TOKEN") + +RESPONSE=$(curl "${CURL_ARGS[@]}" "${BACKEND_URL}/flagcicd-backend/list/data/${LIST_CODE}") +HTTP_STATUS=$(echo "$RESPONSE" | tail -n1) +RESPONSE_BODY=$(echo "$RESPONSE" | sed '$d') + +echo "HTTP status: $HTTP_STATUS" +echo "Response: $RESPONSE_BODY" +echo "" + +# --- Step 4: Query to verify --- +echo "=== Step 4: Query list data ===" +QUERY_URL="${BACKEND_URL}/flagcicd-backend/list/data/${LIST_CODE}?page_size=10&page=1&sort=created_at&order=desc" +echo "URL: $QUERY_URL" + +CURL_ARGS=(-s -X GET -w '\n%{http_code}' -H "Accept: application/json") +[ -n "$API_TOKEN" ] && CURL_ARGS+=(-H "Authorization: Bearer $API_TOKEN") + +RESPONSE=$(curl "${CURL_ARGS[@]}" "$QUERY_URL") +HTTP_STATUS=$(echo "$RESPONSE" | tail -n1) 
+RESPONSE_BODY=$(echo "$RESPONSE" | sed '$d') + +echo "HTTP status: $HTTP_STATUS" +echo "Response:" +echo "$RESPONSE_BODY" | jq . 2>/dev/null || echo "$RESPONSE_BODY" + +# Cleanup +rm -f "$REPORT_PATH" +echo "" +echo "=== Done ===" From 3b6d0820abf13b958b74fc930babf29b071710fa Mon Sep 17 00:00:00 2001 From: liyuzhuo Date: Wed, 18 Mar 2026 20:08:22 +0800 Subject: [PATCH 11/13] model path --- .../inference/qwen3/demo_0_6b/conf/inference/demo_0_6b.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flagos-user-tests/tests/flagscale/inference/qwen3/demo_0_6b/conf/inference/demo_0_6b.yaml b/flagos-user-tests/tests/flagscale/inference/qwen3/demo_0_6b/conf/inference/demo_0_6b.yaml index c91ac04..f1ce909 100644 --- a/flagos-user-tests/tests/flagscale/inference/qwen3/demo_0_6b/conf/inference/demo_0_6b.yaml +++ b/flagos-user-tests/tests/flagscale/inference/qwen3/demo_0_6b/conf/inference/demo_0_6b.yaml @@ -1,5 +1,5 @@ llm: - model: Qwen/Qwen3-0.6B + model: /home/gitlab-runner/data/Qwen3-0.6B trust_remote_code: true tensor_parallel_size: 1 pipeline_parallel_size: 1 From 8bda34ea273bc11a64761c65a831b4de0fde1dd6 Mon Sep 17 00:00:00 2001 From: liyuzhuo Date: Thu, 19 Mar 2026 14:25:20 +0800 Subject: [PATCH 12/13] fix post api invoke --- flagos-user-tests/tools/collect_test_cases.py | 26 +++++++++++-------- flagos-user-tests/tools/test_post_report.sh | 19 +++++++++++--- 2 files changed, 31 insertions(+), 14 deletions(-) diff --git a/flagos-user-tests/tools/collect_test_cases.py b/flagos-user-tests/tools/collect_test_cases.py index 365e3ce..e349a38 100644 --- a/flagos-user-tests/tools/collect_test_cases.py +++ b/flagos-user-tests/tools/collect_test_cases.py @@ -1,16 +1,15 @@ #!/usr/bin/env python3 """Collect all test cases and output a JSON report for post-benchmark-report action. 
-Output format: - [ - { - "case_id": "tests/flagscale/inference/qwen3/demo_0_6b/demo_0_6b.yaml", +Output format (object-of-objects, keyed by case_id): + { + "tests/flagscale/inference/qwen3/demo_0_6b/demo_0_6b.yaml": { "case_name": "flagscale-inference-qwen3-demo_0_6b", "repo": "flagscale", "updated_at": "2026-03-18T15:02:29+08:00" }, ... - ] + } Usage: python tools/collect_test_cases.py --root . --output report.json @@ -54,10 +53,15 @@ def make_case_id(meta: dict) -> str: return "-".join(p for p in parts if p) -def collect_test_cases(root: Path) -> list: - """Discover all test cases and return report list.""" +def collect_test_cases(root: Path) -> dict: + """Discover all test cases and return report dict keyed by case_id. + + The post-benchmark-report action expects an object-of-objects format where: + - Each key maps to header_config[0].field (case_id) + - Each value is an object with fields matching header_config[1+] + """ tests_dir = root / "tests" - report = [] + report = {} for yaml_path in sorted(tests_dir.rglob("*.yaml")): if yaml_path.name.startswith("_") or yaml_path.name == "data.yaml": @@ -70,12 +74,12 @@ def collect_test_cases(root: Path) -> list: continue meta = data["meta"] - report.append({ - "case_id": str(yaml_path.relative_to(root)), + case_id = str(yaml_path.relative_to(root)) + report[case_id] = { "case_name": make_case_id(meta), "repo": meta.get("repo", "unknown"), "updated_at": get_file_updated_time(yaml_path), - }) + } except (yaml.YAMLError, KeyError): continue diff --git a/flagos-user-tests/tools/test_post_report.sh b/flagos-user-tests/tools/test_post_report.sh index 3d7de1d..2783e36 100755 --- a/flagos-user-tests/tools/test_post_report.sh +++ b/flagos-user-tests/tools/test_post_report.sh @@ -68,11 +68,24 @@ RUN_ID="local-$$" DATA_PAYLOAD=$(jq -n \ --arg repository_name "$REPO_NAME" \ + --arg workflow_id "$WORKFLOW_ID" \ + --arg commit_id "$COMMIT_ID" \ + --arg run_id "$RUN_ID" \ + --argjson header_config "$HEADER_CONFIG" \ --slurpfile 
report "$REPORT_PATH" \ '{ - items: [ $report[0][] | . + { - repository_name: $repository_name - } ] + items: [ $report[0] | to_entries[] | . as $entry | + ([ $header_config | to_entries[] | .value.field as $f | + if .key == 0 then {($f): $entry.key} + else {($f): $entry.value[$f]} + end + ] | add) + { + commit_id: $commit_id, + repository_name: $repository_name, + workflow_id: $workflow_id, + run_id: $run_id + } + ] }') echo "URL: ${BACKEND_URL}/flagcicd-backend/list/data/${LIST_CODE}" From 96244cf6975cc4ebf4ea7b2149c11299aeb860e0 Mon Sep 17 00:00:00 2001 From: liyuzhuo Date: Thu, 19 Mar 2026 19:46:00 +0800 Subject: [PATCH 13/13] support manual trigger --- .github/workflows/post_test_cases.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/post_test_cases.yml b/.github/workflows/post_test_cases.yml index cb1e5af..6e5f243 100644 --- a/.github/workflows/post_test_cases.yml +++ b/.github/workflows/post_test_cases.yml @@ -4,6 +4,7 @@ on: pull_request: branches: [main] types: [closed] + workflow_dispatch: defaults: run: @@ -11,7 +12,7 @@ defaults: jobs: post-report: - if: ${{ github.event.pull_request.merged == true }} + if: ${{ github.event_name == 'workflow_dispatch' || github.event.pull_request.merged == true }} runs-on: ubuntu-latest steps: - name: Checkout