Performance Tests #62

Workflow file for this run

.github/workflows/perf-test.yml at 613a699

	# Benchmark workflow: runs the perf-test suite, the LoComo benchmark, and the
	# observation-dedup benchmark, on a daily schedule or on manual dispatch.
	name: Performance Tests

	# Triggers: a daily scheduled run plus manual dispatch. Scheduled runs carry
	# no inputs, so every consumer of `inputs.*` must treat a blank value as
	# "use the default".
	on:
	  schedule:
	    # Run daily at 06:00 UTC
	    - cron: "0 6 * * *"
	  workflow_dispatch:
	    inputs:
	      scale:
	        description: "Test scale (perf-test)"
	        type: choice
	        options:
	          - tiny
	          - small
	          - medium
	          - large
	        default: large
	      suite:
	        description: "Perf-test suite to run (blank = all)"
	        type: choice
	        options:
	          - ""
	          - retain
	          - recall
	          - recall-with-observations
	          - consolidation
	          - graph-maintenance
	        default: ""
	      locomo_conversations:
	        description: "LoComo conversation IDs (space-separated). Blank = curated set (conv-26 conv-30 conv-43)."
	        type: string
	        default: ""
	      locomo_skip:
	        description: "Skip LoComo job"
	        type: boolean
	        default: false
	      obs_skip:
	        description: "Skip observation-dedup benchmark job"
	        type: boolean
	        default: false
	      obs_dataset:
	        description: "Obs benchmark dataset substring (blank = English hermes transcript)."
	        type: string
	        default: ""
	      obs_fraction:
	        description: "Obs benchmark fraction (0-1] of each document to run."
	        type: string
	        default: "1.0"
	      ref:
	        # The checkout steps use `inputs.ref || github.ref`, so a blank value
	        # means the ref the run was started from, not necessarily main.
	        description: "Git ref to test (branch, tag, or SHA). Blank = the ref the workflow runs from."
	        type: string
	        default: ""

	# One shared group for every trigger: starting a new run cancels whichever
	# run of this workflow is still in flight.
	concurrency:
	  group: perf-test
	  cancel-in-progress: true

	# Three independent benchmark jobs. Each one checks out the requested ref,
	# installs uv + Python, warms the HuggingFace model cache, runs its benchmark,
	# uploads the result as an artifact, and publishes it to the dashboard.
	#
	# Workflow inputs and secrets are handed to scripts through `env:` instead of
	# being expanded into the script text, so their values are never parsed as
	# shell syntax.
	jobs:
	  perf-test:
	    runs-on: ubuntu-latest
	    steps:
	      - uses: actions/checkout@v6
	        with:
	          ref: ${{ inputs.ref || github.ref }}

	      - name: Install uv
	        uses: astral-sh/setup-uv@v7
	        with:
	          enable-cache: true
	          prune-cache: false

	      - name: Set up Python
	        uses: actions/setup-python@v6
	        with:
	          python-version-file: ".python-version"

	      - name: Cache HuggingFace models
	        uses: actions/cache@v5
	        with:
	          path: ~/.cache/huggingface
	          key: ${{ runner.os }}-huggingface-${{ hashFiles('hindsight-api-slim/pyproject.toml') }}
	          restore-keys: |
	            ${{ runner.os }}-huggingface-

	      - name: Pre-download models
	        working-directory: ./hindsight-api-slim
	        # The Python lines stay flush with the `uv run` line: the block scalar
	        # strips the common indent, and `python -c` rejects indented top-level
	        # statements.
	        run: |
	          uv run --frozen --all-extras --index-strategy unsafe-best-match python -c "
	          from sentence_transformers import SentenceTransformer
	          print('Downloading embedding model...')
	          SentenceTransformer('BAAI/bge-small-en-v1.5')
	          print('Model downloaded successfully')
	          "

	      - name: Install hindsight-dev dependencies
	        run: |
	          cd hindsight-dev && uv sync --frozen --all-extras --index-strategy unsafe-best-match

	      - name: Run perf-test
	        env:
	          # Scheduled runs have no inputs, hence the 'large' fallback.
	          SCALE: ${{ inputs.scale || 'large' }}
	          SUITE: ${{ inputs.suite }}
	        run: |
	          # Only pass --suite when a suite was selected (blank = all suites).
	          SUITE_ARGS=()
	          if [ -n "$SUITE" ]; then
	            SUITE_ARGS=(--suite "$SUITE")
	          fi
	          ./scripts/benchmarks/run-perf-test.sh \
	            --scale "$SCALE" \
	            "${SUITE_ARGS[@]}" \
	            --output perf-results.json

	      # NOTE(review): the run step passes `--output perf-results.json` while
	      # this step reads hindsight-dev/perf-results.json — presumably
	      # run-perf-test.sh executes from hindsight-dev; confirm.
	      - name: Upload perf results
	        if: always()
	        uses: actions/upload-artifact@v7
	        with:
	          name: perf-results-${{ github.sha }}
	          path: hindsight-dev/perf-results.json
	          retention-days: 90

	      # Publish enriched results (perf JSON + commit metadata) to the dashboard
	      # repo's gh-pages branch. The static site at
	      # https://vectorize-io.github.io/hindsight-continuous-performance-monitor/
	      # reads data/index.json + data/<run>.json and renders charts client-side.
	      - name: Publish to dashboard
	        # success() is stated explicitly to match the other publish steps; it
	        # was already implied, since the condition had no status function.
	        if: success() && (github.event_name == 'schedule' || github.event_name == 'push' || github.event_name == 'workflow_dispatch')
	        env:
	          PERF_DASHBOARD_TOKEN: ${{ secrets.PERF_DASHBOARD_TOKEN }}
	          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
	        run: ./scripts/benchmarks/publish-perf-results.sh hindsight-dev/perf-results.json

	  locomo:
	    # Scheduled runs have no inputs, so this evaluates to true and the job runs.
	    if: inputs.locomo_skip != true
	    runs-on: ubuntu-latest
	    env:
	      HINDSIGHT_API_LLM_PROVIDER: vertexai
	      HINDSIGHT_API_LLM_VERTEXAI_SERVICE_ACCOUNT_KEY: /tmp/gcp-credentials.json
	      HINDSIGHT_API_LLM_MODEL: google/gemini-2.5-flash-lite
	      HINDSIGHT_API_JUDGE_LLM_PROVIDER: vertexai
	      HINDSIGHT_API_JUDGE_LLM_MODEL: google/gemini-2.5-flash-lite
	      HINDSIGHT_API_ANSWER_LLM_PROVIDER: vertexai
	      HINDSIGHT_API_ANSWER_LLM_MODEL: google/gemini-2.5-flash
	    steps:
	      - uses: actions/checkout@v6
	        with:
	          ref: ${{ inputs.ref || github.ref }}

	      - name: Setup GCP credentials
	        # The key is read from the environment: expanding the secret inline
	        # inside single quotes would break on any quote character in it.
	        env:
	          GCP_VERTEXAI_CREDENTIALS: ${{ secrets.GCP_VERTEXAI_CREDENTIALS }}
	        run: |
	          printf '%s' "$GCP_VERTEXAI_CREDENTIALS" > /tmp/gcp-credentials.json
	          PROJECT_ID=$(jq -r '.project_id' /tmp/gcp-credentials.json)
	          echo "HINDSIGHT_API_LLM_VERTEXAI_PROJECT_ID=$PROJECT_ID" >> "$GITHUB_ENV"

	      - name: Install uv
	        uses: astral-sh/setup-uv@v7
	        with:
	          enable-cache: true
	          prune-cache: false

	      - name: Set up Python
	        uses: actions/setup-python@v6
	        with:
	          python-version-file: ".python-version"

	      - name: Cache HuggingFace models
	        uses: actions/cache@v5
	        with:
	          path: ~/.cache/huggingface
	          key: ${{ runner.os }}-huggingface-${{ hashFiles('hindsight-api-slim/pyproject.toml') }}
	          restore-keys: |
	            ${{ runner.os }}-huggingface-

	      - name: Pre-download models
	        working-directory: ./hindsight-api-slim
	        run: |
	          uv run --frozen --all-extras --index-strategy unsafe-best-match python -c "
	          from sentence_transformers import SentenceTransformer
	          print('Downloading embedding model...')
	          SentenceTransformer('BAAI/bge-small-en-v1.5')
	          print('Model downloaded successfully')
	          "

	      - name: Install hindsight-dev dependencies
	        run: |
	          cd hindsight-dev && uv sync --frozen --all-extras --index-strategy unsafe-best-match

	      - name: Run LoComo benchmark
	        # Curated 3-conversation subset (best/middle/worst by accuracy on the
	        # last successful full run): conv-26 (best), conv-30 (middle), conv-43
	        # (worst). Excludes conv-44, the bank with the largest unconsolidated
	        # set that has been pushing scheduled runs over the per-bank
	        # _wait_for_consolidation timeout. Override via workflow_dispatch with
	        # the locomo_conversations input.
	        env:
	          LOCOMO_CONVERSATIONS: ${{ inputs.locomo_conversations }}
	        run: |
	          # Split the space-separated IDs into an array: one argument per ID,
	          # with no pathname expansion of the user-supplied value.
	          read -r -a CONVERSATIONS <<< "${LOCOMO_CONVERSATIONS:-conv-26 conv-30 conv-43}"
	          uv run python hindsight-dev/benchmarks/locomo/locomo_benchmark.py \
	            --wait-consolidation \
	            --conversation "${CONVERSATIONS[@]}"

	      - name: Upload LoComo results
	        if: always()
	        uses: actions/upload-artifact@v7
	        with:
	          name: locomo-results-${{ github.sha }}
	          path: hindsight-dev/benchmarks/locomo/results/
	          retention-days: 90

	      - name: Publish LoComo to dashboard
	        if: success() && (github.event_name == 'schedule' || github.event_name == 'push' || github.event_name == 'workflow_dispatch')
	        env:
	          PERF_DASHBOARD_TOKEN: ${{ secrets.PERF_DASHBOARD_TOKEN }}
	          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
	        run: ./scripts/benchmarks/publish-locomo-results.sh hindsight-dev/benchmarks/locomo/results/benchmark_results.json

	  obs:
	    # Observation-dedup quality benchmark: ingests a transcript, drains consolidation
	    # (serial SyncTaskBackend + embedded pg0 — no external DB / worker), and reports the
	    # near-duplicate observation rate. Real LLM via VertexAI, mirroring the LoComo job.
	    if: inputs.obs_skip != true
	    runs-on: ubuntu-latest
	    env:
	      HINDSIGHT_API_LLM_PROVIDER: vertexai
	      HINDSIGHT_API_LLM_VERTEXAI_SERVICE_ACCOUNT_KEY: /tmp/gcp-credentials.json
	      HINDSIGHT_API_LLM_MODEL: google/gemini-2.5-flash-lite
	      HINDSIGHT_API_ENABLE_OBSERVATIONS: "true"
	    steps:
	      - uses: actions/checkout@v6
	        with:
	          ref: ${{ inputs.ref || github.ref }}

	      - name: Setup GCP credentials
	        # The key is read from the environment: expanding the secret inline
	        # inside single quotes would break on any quote character in it.
	        env:
	          GCP_VERTEXAI_CREDENTIALS: ${{ secrets.GCP_VERTEXAI_CREDENTIALS }}
	        run: |
	          printf '%s' "$GCP_VERTEXAI_CREDENTIALS" > /tmp/gcp-credentials.json
	          PROJECT_ID=$(jq -r '.project_id' /tmp/gcp-credentials.json)
	          echo "HINDSIGHT_API_LLM_VERTEXAI_PROJECT_ID=$PROJECT_ID" >> "$GITHUB_ENV"

	      - name: Install uv
	        uses: astral-sh/setup-uv@v7
	        with:
	          enable-cache: true
	          prune-cache: false

	      - name: Set up Python
	        uses: actions/setup-python@v6
	        with:
	          python-version-file: ".python-version"

	      - name: Cache HuggingFace models
	        uses: actions/cache@v5
	        with:
	          path: ~/.cache/huggingface
	          key: ${{ runner.os }}-huggingface-${{ hashFiles('hindsight-api-slim/pyproject.toml') }}
	          restore-keys: |
	            ${{ runner.os }}-huggingface-

	      - name: Pre-download models
	        working-directory: ./hindsight-api-slim
	        run: |
	          uv run --frozen --all-extras --index-strategy unsafe-best-match python -c "
	          from sentence_transformers import SentenceTransformer
	          print('Downloading embedding model...')
	          SentenceTransformer('BAAI/bge-small-en-v1.5')
	          print('Model downloaded successfully')
	          "

	      - name: Install hindsight-dev dependencies
	        run: |
	          cd hindsight-dev && uv sync --frozen --all-extras --index-strategy unsafe-best-match

	      - name: Run obs benchmark
	        # Default to the English hermes transcript at full fraction — a clean, deterministic
	        # consolidation-dedup signal (the Chinese variant adds a cross-lingual embedding
	        # confound). Override dataset/fraction via workflow_dispatch.
	        env:
	          OBS_DATASET: ${{ inputs.obs_dataset }}
	          OBS_FRACTION: ${{ inputs.obs_fraction }}
	        run: |
	          # Blank inputs (including every scheduled run) fall back to the defaults.
	          DATASET="${OBS_DATASET:-hermes_session_2026-05-15_en}"
	          FRACTION="${OBS_FRACTION:-1.0}"
	          cd hindsight-dev
	          uv run python -m benchmarks.obs.obs_benchmark \
	            --dataset "$DATASET" --fraction "$FRACTION" --wipe-bank --output obs-results.json

	      - name: Upload obs results
	        if: always()
	        uses: actions/upload-artifact@v7
	        with:
	          name: obs-results-${{ github.sha }}
	          path: hindsight-dev/obs-results.json
	          retention-days: 90

	      - name: Publish obs to dashboard
	        if: success() && (github.event_name == 'schedule' || github.event_name == 'push' || github.event_name == 'workflow_dispatch')
	        env:
	          PERF_DASHBOARD_TOKEN: ${{ secrets.PERF_DASHBOARD_TOKEN }}
	          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
	        run: ./scripts/benchmarks/publish-obs-results.sh hindsight-dev/obs-results.json

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Performance Tests #62

Workflow file

Performance Tests #62

Uh oh!

Workflow file for this run