Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 18 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -77,17 +77,24 @@ If the AutoMem volumes were reset, reseed before scoring so the manifest matches

## Real-Data Metadata Evals In Worktrees

For production-snapshot experiments, use one canonical AutoMem checkout as the
Docker/runtime source and give each `automem-evals` worktree unique Compose
project names and host ports. Keep snapshots outside eval worktrees and pass
them by absolute path.
For production-snapshot experiments, give each `automem-evals` worktree unique
Compose project names and host ports. Keep snapshots outside eval worktrees and
pass them by absolute path. Transform-based variants can use one canonical
AutoMem checkout; server-code variants can point baseline and candidate at
different AutoMem checkouts.

Create a per-worktree env file such as `.env.metadata-<worktree>`. Files matching
`.env.*` are ignored by git.
`.env.*` are ignored by git. To share embedding-provider config with both stacks,
set `AUTOMEM_RUNTIME_ENV_FILE` to an AutoMem `.env` file (the eval script only loads
an allowlist of embedding-related keys).

```bash
export AUTOMEM_DIR=/path/to/automem
export AUTOMEM_PYTHON="$AUTOMEM_DIR/.venv/bin/python"
export BASELINE_AUTOMEM_DIR="$AUTOMEM_DIR"
export BASELINE_AUTOMEM_PYTHON="$AUTOMEM_PYTHON"
export CANDIDATE_AUTOMEM_DIR=/path/to/automem-metadata-sidecar
export CANDIDATE_AUTOMEM_PYTHON="$CANDIDATE_AUTOMEM_DIR/.venv/bin/python"
export LOCAL_AUTOMEM_API_TOKEN=test-token

export BASELINE_COMPOSE_PROJECT=automem_metadata_<worktree>_baseline
Expand Down Expand Up @@ -129,8 +136,11 @@ bash scripts/real_data_metadata_eval.sh --snapshot "$SNAPSHOT" --variant metadat
# Check the worktree-specific restore commands without touching Docker.
bash scripts/real_data_metadata_eval.sh --snapshot "$SNAPSHOT" --variant metadata-tags --restore-plan-only

# Full A/B run.
# Full transform-based A/B run.
bash scripts/real_data_metadata_eval.sh --snapshot "$SNAPSHOT" --variant metadata-tags

# Full server-code A/B run. This skips corpus transforms and verifies vector identity.
bash scripts/real_data_metadata_eval.sh --snapshot "$SNAPSHOT" --variant server-metadata-search
```

Use `--skip-restore` only to rerun reports against the same already-restored
Expand All @@ -144,8 +154,9 @@ docker ps --filter name=automem_metadata_<worktree>
curl -H "X-Api-Key: $LOCAL_AUTOMEM_API_TOKEN" "http://localhost:$BASELINE_API_PORT/health"
curl -H "X-Api-Key: $LOCAL_AUTOMEM_API_TOKEN" "http://localhost:$CANDIDATE_API_PORT/health"

cd "$AUTOMEM_DIR"
cd "$BASELINE_AUTOMEM_DIR"
docker compose -p "$BASELINE_COMPOSE_PROJECT" down -v
cd "$CANDIDATE_AUTOMEM_DIR"
docker compose -p "$CANDIDATE_COMPOSE_PROJECT" down -v
```

Expand Down
134 changes: 134 additions & 0 deletions runners/check_vector_identity.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
#!/usr/bin/env python3
from __future__ import annotations

import argparse
import json
import sys
import urllib.parse
from pathlib import Path
from typing import Any

from apply_metadata_treatment import qdrant_vector_hashes


def is_local_qdrant_url(value: str) -> bool:
    """Return True when *value* is an http(s) URL pointing at this machine.

    Only the literal hosts ``localhost``, ``127.0.0.1`` and ``::1`` are
    accepted (host comparison is case-insensitive). A value that cannot be
    parsed as a URL is reported as non-local instead of raising, so callers
    can always turn a bad value into their own "refusing non-local" error.

    Args:
        value: The URL string to check.

    Returns:
        True if the scheme is ``http``/``https`` and the host is one of the
        accepted loopback names, otherwise False.
    """
    try:
        parsed = urllib.parse.urlparse(value)
        host = parsed.hostname
    except ValueError:
        # urlparse rejects malformed netlocs (for example an unterminated
        # "[" IPv6 literal); such a value can never be a trusted local URL.
        return False
    return parsed.scheme in {"http", "https"} and (host or "").lower() in {
        "localhost",
        "127.0.0.1",
        "::1",
    }


def compare_vector_hashes(
    baseline_hashes: dict[str, str],
    candidate_hashes: dict[str, str],
    *,
    variant: str,
) -> dict[str, Any]:
    """Diff two ``id -> vector hash`` mappings and summarise the outcome.

    Args:
        baseline_hashes: Hash per id taken from the baseline side.
        candidate_hashes: Hash per id taken from the candidate side.
        variant: Label copied verbatim into the result.

    Returns:
        A dict with the size of each input, the number of shared ids, and for
        each of "changed" (shared id, different hash), "missing" (baseline
        only) and "extra" (candidate only) a count plus a sorted sample of at
        most 20 ids. ``vectors_identical`` is True only when all three
        categories are empty.
    """
    sample_size = 20
    baseline_keys = baseline_hashes.keys()
    candidate_keys = candidate_hashes.keys()

    shared = baseline_keys & candidate_keys
    differing = [
        key for key in shared if baseline_hashes[key] != candidate_hashes[key]
    ]
    differing.sort()
    only_in_baseline = sorted(baseline_keys - candidate_keys)
    only_in_candidate = sorted(candidate_keys - baseline_keys)

    # Key order is kept stable because the dict is serialised to JSON as-is.
    report: dict[str, Any] = {}
    report["variant"] = variant
    report["baseline_vector_count"] = len(baseline_hashes)
    report["candidate_vector_count"] = len(candidate_hashes)
    report["common_vector_count"] = len(shared)
    report["changed_vector_count"] = len(differing)
    report["changed_vector_ids_sample"] = differing[:sample_size]
    report["missing_candidate_count"] = len(only_in_baseline)
    report["missing_candidate_ids_sample"] = only_in_baseline[:sample_size]
    report["extra_candidate_count"] = len(only_in_candidate)
    report["extra_candidate_ids_sample"] = only_in_candidate[:sample_size]
    report["vectors_identical"] = not (
        differing or only_in_baseline or only_in_candidate
    )
    return report


def write_identity_artifacts(
    vector_identity: dict[str, Any],
    *,
    plan_output: Path,
    summary_output: Path,
    vector_preflight_output: Path,
) -> None:
    """Write the plan, preflight and summary files for an identity check.

    Three files are produced, in this order, creating parent directories as
    needed:

    * ``plan_output``: always written empty.
    * ``vector_preflight_output``: JSON with the variant, the identity
      verdict, the full ``vector_identity`` dict and its three mismatch counts.
    * ``summary_output``: JSON with the variant, zeroed plan/update counters,
      the ``vector_identity`` dict and the preflight payload.

    Args:
        vector_identity: Comparison result; must contain the keys ``variant``,
            ``vectors_identical``, ``changed_vector_count``,
            ``missing_candidate_count`` and ``extra_candidate_count``.
        plan_output: Destination of the empty plan file.
        summary_output: Destination of the summary JSON.
        vector_preflight_output: Destination of the preflight JSON.

    Raises:
        KeyError: If ``vector_identity`` lacks one of the required keys.
    """

    def _emit(target: Path, text: str) -> None:
        # Every artifact may live in a directory that does not exist yet.
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_text(text)

    def _as_json(payload: dict[str, Any]) -> str:
        return f"{json.dumps(payload, indent=2)}\n"

    # The plan is written first and left empty.
    _emit(plan_output, "")

    preflight_payload = {
        "variant": vector_identity["variant"],
        "vectors_identical": bool(vector_identity["vectors_identical"]),
        "vector_identity": vector_identity,
        "changed_vector_count": vector_identity["changed_vector_count"],
        "missing_candidate_count": vector_identity["missing_candidate_count"],
        "extra_candidate_count": vector_identity["extra_candidate_count"],
    }
    _emit(vector_preflight_output, _as_json(preflight_payload))

    summary_payload = {
        "variant": vector_identity["variant"],
        "tag_plan_count": 0,
        "embedding_plan_count": 0,
        "graph_updates": 0,
        "qdrant_updates": 0,
        "vector_updates": 0,
        "vector_identity": vector_identity,
        "vector_preflight": preflight_payload,
    }
    _emit(summary_output, _as_json(summary_payload))


def _connect_qdrant(url: str, api_key: str | None) -> Any:
    """Build a ``QdrantClient`` for *url*.

    Args:
        url: Qdrant endpoint passed through unchanged.
        api_key: API key; any falsy value (``None`` or ``""``) is sent as
            ``None``.

    Returns:
        The constructed ``QdrantClient`` instance.
    """
    # Imported at call time rather than at module import time.
    from qdrant_client import QdrantClient  # type: ignore

    effective_key = api_key if api_key else None
    return QdrantClient(url=url, api_key=effective_key)


def main() -> int:
    """Compare baseline and candidate Qdrant vectors from the command line.

    Parses the CLI arguments, refuses any Qdrant URL that
    ``is_local_qdrant_url`` rejects, hashes both collections, writes the
    plan/summary/preflight artifacts and prints the comparison as JSON.

    Returns:
        0 when the comparison reports ``vectors_identical``, otherwise 1.

    Raises:
        SystemExit: If either Qdrant URL is not local.
    """
    cli = argparse.ArgumentParser(description="Compare baseline/candidate Qdrant vectors")
    cli.add_argument("--variant", default="server-metadata-search")
    cli.add_argument("--baseline-qdrant-url", required=True)
    cli.add_argument("--candidate-qdrant-url", required=True)
    cli.add_argument("--baseline-qdrant-api-key", default="")
    cli.add_argument("--candidate-qdrant-api-key", default="")
    cli.add_argument("--collection", default="memories")
    cli.add_argument("--plan-output", required=True, type=Path)
    cli.add_argument("--summary-output", required=True, type=Path)
    cli.add_argument("--vector-preflight-output", required=True, type=Path)
    opts = cli.parse_args()

    # Validate both endpoints before any client is created.
    endpoints = (
        ("baseline", opts.baseline_qdrant_url),
        ("candidate", opts.candidate_qdrant_url),
    )
    for label, url in endpoints:
        if is_local_qdrant_url(url):
            continue
        raise SystemExit(f"refusing non-local {label} Qdrant URL: {url}")

    baseline_client = _connect_qdrant(
        opts.baseline_qdrant_url, opts.baseline_qdrant_api_key
    )
    candidate_client = _connect_qdrant(
        opts.candidate_qdrant_url, opts.candidate_qdrant_api_key
    )

    baseline_hashes = qdrant_vector_hashes(baseline_client, opts.collection)
    candidate_hashes = qdrant_vector_hashes(candidate_client, opts.collection)
    report = compare_vector_hashes(
        baseline_hashes,
        candidate_hashes,
        variant=opts.variant,
    )

    write_identity_artifacts(
        report,
        plan_output=opts.plan_output,
        summary_output=opts.summary_output,
        vector_preflight_output=opts.vector_preflight_output,
    )
    print(json.dumps(report, indent=2))

    if report["vectors_identical"]:
        return 0
    return 1


# Script entry point: the process exit status is whatever main() returns.
if __name__ == "__main__":
    sys.exit(main())
20 changes: 15 additions & 5 deletions runners/run_metadata_ab_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,7 @@ def write_markdown_report(
*,
baseline_endpoint: str,
candidate_endpoint: str,
run_label: str = "",
aggregate: dict[str, Any],
rows: list[dict[str, Any]],
vector_preflight: dict[str, Any],
Expand All @@ -239,12 +240,18 @@ def write_markdown_report(
"",
f"Baseline endpoint: `{baseline_endpoint}`",
f"Candidate endpoint: `{candidate_endpoint}`",
"",
"## Vector Preflight",
"",
"| Endpoint | status | checked | nonzero results |",
"|---|---|---:|---:|",
]
if run_label:
lines.append(f"Run label: `{run_label}`")
lines.extend(
[
"",
"## Vector Preflight",
"",
"| Endpoint | status | checked | nonzero results |",
"|---|---|---:|---:|",
]
)
for label in ("baseline", "candidate"):
item = recall_warmup.get(label) or {}
lines.append(
Expand Down Expand Up @@ -305,6 +312,7 @@ def main() -> int:
parser.add_argument("--run-dir", type=pathlib.Path, default=None)
parser.add_argument("--report", type=pathlib.Path, default=None)
parser.add_argument("--metrics-output", type=pathlib.Path, default=None)
parser.add_argument("--run-label", default="")
args = parser.parse_args()

assert_local_endpoint(args.baseline_endpoint)
Expand Down Expand Up @@ -348,6 +356,7 @@ def main() -> int:
preflight_path = run_dir / "vector_preflight.json"
vector_preflight = merge_vector_preflight(preflight_path, recall_warmup)
metrics = {
"run_label": args.run_label,
"baseline_endpoint": args.baseline_endpoint,
"candidate_endpoint": args.candidate_endpoint,
"baseline_health": baseline_health,
Expand All @@ -366,6 +375,7 @@ def main() -> int:
report_path,
baseline_endpoint=args.baseline_endpoint,
candidate_endpoint=args.candidate_endpoint,
run_label=args.run_label,
aggregate=aggregate,
rows=rows,
vector_preflight=vector_preflight,
Expand Down
56 changes: 56 additions & 0 deletions runners/test_check_vector_identity.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
import sys
import tempfile
import unittest
from pathlib import Path

sys.path.insert(0, str(Path(__file__).resolve().parent))

import check_vector_identity as checker


class VectorIdentityTests(unittest.TestCase):
    """Unit tests for the hash-comparison and artifact-writing helpers."""

    VARIANT = "server-metadata-search"

    def test_compare_vector_hashes_reports_identical_sets(self):
        """Equal mappings give an identical verdict and zero mismatch counts."""
        hashes = {"a": "hash-a", "b": "hash-b"}

        result = checker.compare_vector_hashes(
            hashes, dict(hashes), variant=self.VARIANT
        )

        self.assertTrue(result["vectors_identical"])
        for counter in (
            "changed_vector_count",
            "missing_candidate_count",
            "extra_candidate_count",
        ):
            self.assertEqual(result[counter], 0)

    def test_compare_vector_hashes_reports_changed_missing_and_extra_ids(self):
        """Each mismatch category surfaces the offending id in its sample."""
        baseline = {"a": "hash-a", "b": "hash-b"}
        candidate = {"a": "changed", "c": "hash-c"}

        result = checker.compare_vector_hashes(
            baseline, candidate, variant=self.VARIANT
        )

        self.assertFalse(result["vectors_identical"])
        self.assertEqual(result["changed_vector_ids_sample"], ["a"])
        self.assertEqual(result["missing_candidate_ids_sample"], ["b"])
        self.assertEqual(result["extra_candidate_ids_sample"], ["c"])

    def test_write_identity_artifacts_writes_preflight_summary_and_empty_plan(self):
        """All three artifacts are written and the plan file stays empty."""
        with tempfile.TemporaryDirectory() as tmp:
            out_dir = Path(tmp)
            plan_path = out_dir / "transform_plan.jsonl"
            summary_path = out_dir / "transform_summary.json"
            preflight_path = out_dir / "vector_preflight.json"
            identity = checker.compare_vector_hashes(
                {"a": "hash-a"}, {"a": "hash-a"}, variant=self.VARIANT
            )

            checker.write_identity_artifacts(
                identity,
                plan_output=plan_path,
                summary_output=summary_path,
                vector_preflight_output=preflight_path,
            )

            self.assertEqual(plan_path.read_text(), "")
            self.assertIn('"vector_identity"', summary_path.read_text())
            self.assertIn('"vectors_identical": true', preflight_path.read_text())


# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()
2 changes: 2 additions & 0 deletions runners/test_run_metadata_ab_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@ def test_write_markdown_report_accepts_merged_vector_preflight(self):
path,
baseline_endpoint="http://localhost:8011",
candidate_endpoint="http://localhost:8012",
run_label="metadata-sidecar-enabled",
aggregate={
"baseline": {"hit_at_5": 0.0, "mrr": 0.0, "mean_target_rank": 0.0},
"candidate": {"hit_at_5": 1.0, "mrr": 1.0, "mean_target_rank": 1.0},
Expand All @@ -131,6 +132,7 @@ def test_write_markdown_report_accepts_merged_vector_preflight(self):

report = path.read_text()

self.assertIn("Run label: `metadata-sidecar-enabled`", report)
self.assertIn("| baseline | ok | 2 | 5 |", report)
self.assertIn("| candidate | ok | 2 | 6 |", report)

Expand Down
Loading