Performance Tests #62
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: Performance Tests

# Triggers: a daily scheduled run plus manual dispatch. Scheduled runs carry no
# inputs, so every `inputs.*` value is empty there and the jobs apply their own
# fallbacks.
on:
  schedule:
    # Run daily at 06:00 UTC
    - cron: '0 6 * * *'
  workflow_dispatch:
    inputs:
      # --- perf-test job ---
      scale:
        description: 'Test scale (perf-test)'
        type: choice
        options:
          - tiny
          - small
          - medium
          - large
        default: large
      suite:
        description: 'Perf-test suite to run (blank = all)'
        type: choice
        options:
          - ''
          - retain
          - recall
          - recall-with-observations
          - consolidation
          - graph-maintenance
        default: ''
      # --- locomo job ---
      locomo_conversations:
        description: >-
          LoComo conversation IDs (space-separated).
          Blank = curated set (conv-26 conv-30 conv-43).
        type: string
        default: ''
      locomo_skip:
        description: 'Skip LoComo job'
        type: boolean
        default: false
      # --- obs job ---
      obs_skip:
        description: 'Skip observation-dedup benchmark job'
        type: boolean
        default: false
      obs_dataset:
        description: 'Obs benchmark dataset substring (blank = English hermes transcript).'
        type: string
        default: ''
      obs_fraction:
        description: 'Obs benchmark fraction (0-1] of each document to run.'
        type: string
        # Quoted so the value stays the string "1.0" rather than a float.
        default: '1.0'
      # --- all jobs ---
      ref:
        # The checkout steps use `inputs.ref || github.ref`, so a blank value
        # resolves to the ref the run was started on, not a fixed branch.
        description: 'Git ref to test (branch, tag, or SHA). Blank = the ref this run was triggered on.'
        type: string
        default: ''
# All runs of this workflow share a single concurrency group, and a newly
# started run cancels the one that is still in progress.
concurrency:
  group: 'perf-test'
  cancel-in-progress: true
jobs:
  # Runs the perf-test script at the requested scale/suite, uploads the JSON
  # result as an artifact, and publishes it to the dashboard.
  perf-test:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v6
        with:
          # `inputs.ref` is empty unless supplied on manual dispatch; fall back
          # to the ref that triggered the run.
          ref: ${{ inputs.ref || github.ref }}

      - name: Install uv
        uses: astral-sh/setup-uv@v7
        with:
          enable-cache: true
          prune-cache: false

      - name: Set up Python
        uses: actions/setup-python@v6
        with:
          python-version-file: ".python-version"

      - name: Cache HuggingFace models
        uses: actions/cache@v5
        with:
          path: ~/.cache/huggingface
          key: ${{ runner.os }}-huggingface-${{ hashFiles('hindsight-api-slim/pyproject.toml') }}
          restore-keys: |
            ${{ runner.os }}-huggingface-

      - name: Pre-download models
        working-directory: ./hindsight-api-slim
        run: |
          uv run --frozen --all-extras --index-strategy unsafe-best-match python -c "
          from sentence_transformers import SentenceTransformer
          print('Downloading embedding model...')
          SentenceTransformer('BAAI/bge-small-en-v1.5')
          print('Model downloaded successfully')
          "

      - name: Install hindsight-dev dependencies
        run: |
          cd hindsight-dev && uv sync --frozen --all-extras --index-strategy unsafe-best-match

      - name: Run perf-test
        env:
          # Dispatch inputs are handed to the shell as environment variables
          # instead of being expanded into the script text, so their content is
          # never parsed as shell code.
          INPUT_SCALE: ${{ inputs.scale || 'large' }}
          INPUT_SUITE: ${{ inputs.suite }}
        run: |
          # Only pass --suite when a suite was selected (blank = all suites).
          suite_args=()
          if [ -n "$INPUT_SUITE" ]; then
            suite_args=(--suite "$INPUT_SUITE")
          fi
          ./scripts/benchmarks/run-perf-test.sh \
            --scale "$INPUT_SCALE" \
            "${suite_args[@]}" \
            --output perf-results.json

      # NOTE(review): the script is given `--output perf-results.json` from the
      # repo root, while the steps below read hindsight-dev/perf-results.json —
      # presumably run-perf-test.sh runs from hindsight-dev/; confirm.
      - name: Upload perf results
        if: always()
        uses: actions/upload-artifact@v7
        with:
          name: perf-results-${{ github.sha }}
          path: hindsight-dev/perf-results.json
          retention-days: 90

      # Publish enriched results (perf JSON + commit metadata) to the dashboard
      # repo's gh-pages branch. The static site at
      # https://vectorize-io.github.io/hindsight-continuous-performance-monitor/
      # reads data/index.json + data/<run>.json and renders charts client-side.
      - name: Publish to dashboard
        # success() is spelled out to match the other jobs' publish steps; it
        # was already applied implicitly.
        if: success() && (github.event_name == 'schedule' || github.event_name == 'push' || github.event_name == 'workflow_dispatch')
        env:
          PERF_DASHBOARD_TOKEN: ${{ secrets.PERF_DASHBOARD_TOKEN }}
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: ./scripts/benchmarks/publish-perf-results.sh hindsight-dev/perf-results.json

  # Runs the LoComo benchmark against VertexAI-hosted models, uploads the
  # results directory, and publishes the summary JSON to the dashboard.
  locomo:
    if: inputs.locomo_skip != true
    runs-on: ubuntu-latest
    env:
      HINDSIGHT_API_LLM_PROVIDER: vertexai
      HINDSIGHT_API_LLM_VERTEXAI_SERVICE_ACCOUNT_KEY: /tmp/gcp-credentials.json
      HINDSIGHT_API_LLM_MODEL: google/gemini-2.5-flash-lite
      HINDSIGHT_API_JUDGE_LLM_PROVIDER: vertexai
      HINDSIGHT_API_JUDGE_LLM_MODEL: google/gemini-2.5-flash-lite
      HINDSIGHT_API_ANSWER_LLM_PROVIDER: vertexai
      HINDSIGHT_API_ANSWER_LLM_MODEL: google/gemini-2.5-flash
    steps:
      - uses: actions/checkout@v6
        with:
          ref: ${{ inputs.ref || github.ref }}

      - name: Setup GCP credentials
        env:
          # Passed through the environment so the secret is not pasted into the
          # script text (where a quote character in it would break the shell
          # literal).
          GCP_CREDENTIALS_JSON: ${{ secrets.GCP_VERTEXAI_CREDENTIALS }}
        run: |
          printf '%s' "$GCP_CREDENTIALS_JSON" > /tmp/gcp-credentials.json
          PROJECT_ID=$(jq -r '.project_id' /tmp/gcp-credentials.json)
          # Export the project id to the remaining steps of this job.
          echo "HINDSIGHT_API_LLM_VERTEXAI_PROJECT_ID=$PROJECT_ID" >> "$GITHUB_ENV"

      - name: Install uv
        uses: astral-sh/setup-uv@v7
        with:
          enable-cache: true
          prune-cache: false

      - name: Set up Python
        uses: actions/setup-python@v6
        with:
          python-version-file: ".python-version"

      - name: Cache HuggingFace models
        uses: actions/cache@v5
        with:
          path: ~/.cache/huggingface
          key: ${{ runner.os }}-huggingface-${{ hashFiles('hindsight-api-slim/pyproject.toml') }}
          restore-keys: |
            ${{ runner.os }}-huggingface-

      - name: Pre-download models
        working-directory: ./hindsight-api-slim
        run: |
          uv run --frozen --all-extras --index-strategy unsafe-best-match python -c "
          from sentence_transformers import SentenceTransformer
          print('Downloading embedding model...')
          SentenceTransformer('BAAI/bge-small-en-v1.5')
          print('Model downloaded successfully')
          "

      - name: Install hindsight-dev dependencies
        run: |
          cd hindsight-dev && uv sync --frozen --all-extras --index-strategy unsafe-best-match

      - name: Run LoComo benchmark
        # Curated 3-conversation subset (best/middle/worst by accuracy on the
        # last successful full run): conv-26 (best), conv-30 (middle), conv-43
        # (worst). Excludes conv-44, the bank with the largest unconsolidated
        # set that has been pushing scheduled runs over the per-bank
        # _wait_for_consolidation timeout. Override via workflow_dispatch with
        # the locomo_conversations input.
        env:
          # Handed over as an environment variable so the input text is never
          # parsed as shell code.
          INPUT_LOCOMO_CONVERSATIONS: ${{ inputs.locomo_conversations }}
        run: |
          CONVERSATIONS="${INPUT_LOCOMO_CONVERSATIONS:-conv-26 conv-30 conv-43}"
          # Split the space-separated IDs into one argument each, without the
          # glob expansion an unquoted $CONVERSATIONS would allow.
          read -r -a conversation_ids <<< "$CONVERSATIONS"
          uv run python hindsight-dev/benchmarks/locomo/locomo_benchmark.py \
            --wait-consolidation \
            --conversation "${conversation_ids[@]}"

      - name: Upload LoComo results
        if: always()
        uses: actions/upload-artifact@v7
        with:
          name: locomo-results-${{ github.sha }}
          path: hindsight-dev/benchmarks/locomo/results/
          retention-days: 90

      - name: Publish LoComo to dashboard
        if: success() && (github.event_name == 'schedule' || github.event_name == 'push' || github.event_name == 'workflow_dispatch')
        env:
          PERF_DASHBOARD_TOKEN: ${{ secrets.PERF_DASHBOARD_TOKEN }}
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: ./scripts/benchmarks/publish-locomo-results.sh hindsight-dev/benchmarks/locomo/results/benchmark_results.json

  obs:
    # Observation-dedup quality benchmark: ingests a transcript, drains consolidation
    # (serial SyncTaskBackend + embedded pg0 — no external DB / worker), and reports the
    # near-duplicate observation rate. Real LLM via VertexAI, mirroring the LoComo job.
    if: inputs.obs_skip != true
    runs-on: ubuntu-latest
    env:
      HINDSIGHT_API_LLM_PROVIDER: vertexai
      HINDSIGHT_API_LLM_VERTEXAI_SERVICE_ACCOUNT_KEY: /tmp/gcp-credentials.json
      HINDSIGHT_API_LLM_MODEL: google/gemini-2.5-flash-lite
      # Quoted: environment values are strings, not YAML booleans.
      HINDSIGHT_API_ENABLE_OBSERVATIONS: "true"
    steps:
      - uses: actions/checkout@v6
        with:
          ref: ${{ inputs.ref || github.ref }}

      - name: Setup GCP credentials
        env:
          # Passed through the environment so the secret is not pasted into the
          # script text (where a quote character in it would break the shell
          # literal).
          GCP_CREDENTIALS_JSON: ${{ secrets.GCP_VERTEXAI_CREDENTIALS }}
        run: |
          printf '%s' "$GCP_CREDENTIALS_JSON" > /tmp/gcp-credentials.json
          PROJECT_ID=$(jq -r '.project_id' /tmp/gcp-credentials.json)
          # Export the project id to the remaining steps of this job.
          echo "HINDSIGHT_API_LLM_VERTEXAI_PROJECT_ID=$PROJECT_ID" >> "$GITHUB_ENV"

      - name: Install uv
        uses: astral-sh/setup-uv@v7
        with:
          enable-cache: true
          prune-cache: false

      - name: Set up Python
        uses: actions/setup-python@v6
        with:
          python-version-file: ".python-version"

      - name: Cache HuggingFace models
        uses: actions/cache@v5
        with:
          path: ~/.cache/huggingface
          key: ${{ runner.os }}-huggingface-${{ hashFiles('hindsight-api-slim/pyproject.toml') }}
          restore-keys: |
            ${{ runner.os }}-huggingface-

      - name: Pre-download models
        working-directory: ./hindsight-api-slim
        run: |
          uv run --frozen --all-extras --index-strategy unsafe-best-match python -c "
          from sentence_transformers import SentenceTransformer
          print('Downloading embedding model...')
          SentenceTransformer('BAAI/bge-small-en-v1.5')
          print('Model downloaded successfully')
          "

      - name: Install hindsight-dev dependencies
        run: |
          cd hindsight-dev && uv sync --frozen --all-extras --index-strategy unsafe-best-match

      - name: Run obs benchmark
        # Default to the English hermes transcript at full fraction — a clean, deterministic
        # consolidation-dedup signal (the Chinese variant adds a cross-lingual embedding
        # confound). Override dataset/fraction via workflow_dispatch.
        env:
          # Handed over as environment variables so the input text is never
          # parsed as shell code.
          INPUT_OBS_DATASET: ${{ inputs.obs_dataset }}
          INPUT_OBS_FRACTION: ${{ inputs.obs_fraction }}
        run: |
          # Both inputs are empty on scheduled runs; apply the defaults here.
          DATASET="${INPUT_OBS_DATASET:-hermes_session_2026-05-15_en}"
          FRACTION="${INPUT_OBS_FRACTION:-1.0}"
          cd hindsight-dev
          uv run python -m benchmarks.obs.obs_benchmark \
            --dataset "$DATASET" --fraction "$FRACTION" --wipe-bank --output obs-results.json

      - name: Upload obs results
        if: always()
        uses: actions/upload-artifact@v7
        with:
          name: obs-results-${{ github.sha }}
          path: hindsight-dev/obs-results.json
          retention-days: 90

      - name: Publish obs to dashboard
        if: success() && (github.event_name == 'schedule' || github.event_name == 'push' || github.event_name == 'workflow_dispatch')
        env:
          PERF_DASHBOARD_TOKEN: ${{ secrets.PERF_DASHBOARD_TOKEN }}
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: ./scripts/benchmarks/publish-obs-results.sh hindsight-dev/obs-results.json