verygoodplugins · jack-arturo · Mar 2, 2026 · Mar 2, 2026 · Mar 2, 2026 · Mar 2, 2026
diff --git a/.github/workflows/docs-dispatch.yml b/.github/workflows/docs-dispatch.yml
@@ -0,0 +1,75 @@
+# Notifies the automem-website repo when a push to main changes files that
+# appear in the website's file-doc map.
+name: Docs Dispatch
+on:
+  push:
+    branches: [main]
+
+# Least privilege: the job only reads this repository. The cross-repo
+# dispatch authenticates with its own token secret, not GITHUB_TOKEN.
+permissions:
+  contents: read
+
+jobs:
+  check-docs:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v5
+        with:
+          # Full history so both ends of the pushed range are available to diff.
+          fetch-depth: 0
+
+      - name: Get changed files
+        id: changed
+        # Pass event data through the environment instead of expanding
+        # expressions inside the script text.
+        env:
+          BASE_SHA: ${{ github.event.before }}
+          HEAD_SHA: ${{ github.event.after }}
+        run: |
+          # The default run shell uses -e but not pipefail; without it a
+          # failing `git diff` would be masked by jq and yield an empty list.
+          set -o pipefail
+
+          # An all-zero "before" SHA means the ref was just created, so there
+          # is no previous commit. Diff against git's empty tree so every file
+          # in HEAD is listed (this also avoids `rev-list --max-parents=0`
+          # returning several SHAs when history has more than one root).
+          if [ "$BASE_SHA" = "0000000000000000000000000000000000000000" ]; then
+            BASE_SHA=$(git hash-object -t tree /dev/null)
+          fi
+
+          # Emit the changed paths as a compact JSON array on a single line.
+          FILES=$(git diff --name-only "$BASE_SHA" "$HEAD_SHA" | jq -R -s -c 'split("\n") | map(select(. != ""))')
+          echo "files=$FILES" >> "$GITHUB_OUTPUT"
+
+      - name: Check file-doc mapping
+        id: check
+        env:
+          # Passed via env so file names containing quotes cannot break out
+          # of a shell string literal.
+          REPO_KEY: ${{ github.event.repository.name }}
+          CHANGED: ${{ steps.changed.outputs.files }}
+          MAP_URL: https://raw.githubusercontent.com/verygoodplugins/automem-website/main/scripts/file-doc-map.json
+        run: |
+          # Steps run under `bash -e`, so a bare `MAP=$(curl ...)` would abort
+          # the step before `$?` could be inspected. Testing the assignment
+          # inside `if` keeps the intended "skip on fetch failure" behaviour.
+          if ! MAP=$(curl -sf "$MAP_URL"); then
+            echo "Failed to fetch file-doc-map.json, skipping"
+            echo "affected=none" >> "$GITHUB_OUTPUT"
+            exit 0
+          fi
+
+          # Collect the doc entries of every pattern under this repo's key that
+          # matches at least one changed file, as a de-duplicated JSON array.
+          AFFECTED_JSON=$(echo "$MAP" | jq -c --arg repo "$REPO_KEY" --argjson changed "$CHANGED" '
+            # "dir/**" matches any path under dir/; the trailing slash stays
+            # in the prefix so "src/**" does not also match "src-old/x".
+            # Any other pattern must equal the path exactly.
+            def matches_pattern($file; $pattern):
+              if ($pattern | endswith("/**")) then
+                ($file | startswith($pattern[0:-2]))
+              else
+                $file == $pattern
+              end;
+
+            .[$repo] // {} | to_entries | map(
+              select(.key as $pattern | $changed | any(. as $file | matches_pattern($file; $pattern)))
+            ) | map(.value) | flatten | unique | map(select(. != ""))
+          ')
+
+          # "none" is the sentinel the dispatch step checks for; an empty
+          # result (empty map body) is treated the same as an empty array.
+          if [ -z "$AFFECTED_JSON" ] || [ "$AFFECTED_JSON" = "[]" ]; then
+            echo "affected=none" >> "$GITHUB_OUTPUT"
+          else
+            echo "affected=$AFFECTED_JSON" >> "$GITHUB_OUTPUT"
+          fi
+
+      # Send a docs-update dispatch to the website repo unless the mapping
+      # step reported the 'none' sentinel.
+      - name: Dispatch to automem-website
+        uses: peter-evans/repository-dispatch@v3
+        if: steps.check.outputs.affected != 'none'
+        with:
+          repository: verygoodplugins/automem-website
+          event-type: docs-update
+          token: ${{ secrets.RELEASE_PLEASE_TOKEN }}
+          # changed_files and affected_docs are inserted unquoted because the
+          # earlier steps already emit them as JSON arrays.
+          client-payload: |
+            {
+              "source_repo": "${{ github.event.repository.full_name }}",
+              "source_sha": "${{ github.event.after }}",
+              "changed_files": ${{ steps.changed.outputs.files }},
+              "affected_docs": ${{ steps.check.outputs.affected }},
+              "commit_url": "${{ github.event.head_commit.url }}",
+              "compare_url": "${{ github.event.compare }}"
+            }
diff --git a/.gitignore b/.gitignore
@@ -39,3 +39,5 @@ node_modules/
 # Benchmark overrides and snapshots
 .env.bench
 /benchmarks/snapshots/
+/benchmarks/results/
+benchmarks/baselines/locomo_baseline.json
diff --git a/Makefile b/Makefile
@@ -1,5 +1,5 @@
 # Makefile - Development commands
-.PHONY: help install dev test fmt lint test-integration test-live test-locomo test-locomo-live test-longmemeval test-longmemeval-live test-longmemeval-watch clean logs deploy
+.PHONY: help install dev test fmt lint test-integration test-live test-locomo test-locomo-live test-longmemeval test-longmemeval-live test-longmemeval-watch clean logs deploy bench-health
 
 # Default target
 help:
@@ -25,6 +25,7 @@ help:
 	@echo "  make bench-ingest BENCH=locomo - Ingest + snapshot (run once)"
 	@echo "  make bench-eval BENCH=locomo CONFIG=baseline - Eval from snapshot (~2 min)"
 	@echo "  make bench-compare BENCH=locomo CONFIG=bm25 BASELINE=baseline - A/B compare"
+	@echo "  make bench-health             - Recall health check (score dist, entities, latency)"
 	@echo "  make test-locomo          - Full LoCoMo benchmark (local)"
 	@echo "  make test-locomo-live     - Full LoCoMo benchmark (Railway)"
 	@echo "  make test-longmemeval     - Full LongMemEval benchmark (local)"
@@ -144,6 +145,9 @@ bench-compare:
 bench-compare-branch:
 	@scripts/bench/compare_branch.sh $(BRANCH) $(or $(CONFIG),baseline) $(or $(BENCH),locomo)
 
+# Run the recall health-check script against BASE_URL (default: http://localhost:8001).
+bench-health:
+	@python3 scripts/bench/health_check.py --base-url $(or $(BASE_URL),http://localhost:8001)
+
 bench-snapshots:
 	@ls -la benchmarks/snapshots/ 2>/dev/null || echo "No snapshots yet. Run: make bench-ingest BENCH=locomo"
 

diff --git a/automem/api/recall.py b/automem/api/recall.py
@@ -8,7 +8,13 @@
 
 from flask import Blueprint, abort, jsonify, request
 
-from automem.config import ALLOWED_RELATIONS, RECALL_EXPANSION_LIMIT, RECALL_RELATION_LIMIT
+from automem.config import (
+    ALLOWED_RELATIONS,
+    RECALL_ADAPTIVE_FLOOR,
+    RECALL_EXPANSION_LIMIT,
+    RECALL_MIN_SCORE,
+    RECALL_RELATION_LIMIT,
+)
 from automem.utils.graph import _serialize_node
 
 DEFAULT_STYLE_PRIORITY_TAGS: Set[str] = {
@@ -1084,6 +1090,10 @@ def _parse_threshold(param_name: str) -> Optional[float]:
     expand_min_importance = _parse_threshold("expand_min_importance")
     expand_min_strength = _parse_threshold("expand_min_strength")
 
+    min_score_param = _parse_threshold("min_score")
+    min_score = min_score_param if min_score_param is not None else (RECALL_MIN_SCORE or None)
+    adaptive_floor = _parse_bool_param(request.args.get("adaptive_floor"), RECALL_ADAPTIVE_FLOOR)
+
     context_label = (request.args.get("context") or "").strip().lower()
     active_path = (
         request.args.get("active_path")
@@ -1242,6 +1252,11 @@ def _run_single_query(
             )
         ]
 
+        if min_score is not None and min_score > 0:
+            local_results = [
+                res for res in local_results if float(res.get("final_score", 0.0)) >= min_score
+            ]
+
         if sort_param == "score":
             local_results.sort(
                 key=lambda r: (
@@ -1412,6 +1427,31 @@ def _run_single_query(
             ]
         results = seed_results + expansion_results + entity_expansion_results
 
+    # Apply adaptive score floor: detect steep dropoff and cut low-quality tail
+    score_floor_applied = None
+    if adaptive_floor and len(results) > 3:
+        scores = sorted([float(r.get("final_score", 0.0)) for r in results], reverse=True)
+        # Find the largest gap between consecutive scores in the top half
+        max_gap = 0.0
+        gap_idx = -1
+        halfway = max(3, len(scores) // 2)
+        for i in range(1, halfway):
+            gap = scores[i - 1] - scores[i]
+            if gap > max_gap:
+                max_gap = gap
+                gap_idx = i
+        # If there's a steep dropoff (>15% of max score), cut below it
+        if max_gap > 0.15 * scores[0] and gap_idx > 0:
+            score_floor_applied = scores[gap_idx]
+            results = [
+                r for r in results if float(r.get("final_score", 0.0)) >= score_floor_applied
+            ]
+
+    # Apply explicit min_score on final assembled results (catches expansions)
+    pre_filter_count = len(results)
+    if min_score is not None and min_score > 0:
+        results = [r for r in results if float(r.get("final_score", 0.0)) >= min_score]
+
     # JIT-enrich unenriched memories inline (cheap: entities + summary ~50ms each)
     jit_enriched_count = 0
     if jit_enrich_fn is not None:
@@ -1467,6 +1507,12 @@ def _run_single_query(
     response["tag_match"] = tag_match
     if jit_enriched_count:
         response["jit_enriched_count"] = jit_enriched_count
+    if min_score or score_floor_applied:
+        response["score_filter"] = {
+            "min_score": min_score,
+            "adaptive_floor": score_floor_applied,
+            "filtered_count": pre_filter_count - len(results) if min_score else 0,
+        }
     response["query_time_ms"] = round((time.perf_counter() - query_start) * 1000, 2)
     if any_context_profile:
         response["context_priority"] = {

diff --git a/automem/config.py b/automem/config.py
@@ -92,6 +92,8 @@
 
 RECALL_RELATION_LIMIT = int(os.getenv("RECALL_RELATION_LIMIT", "5"))
 RECALL_EXPANSION_LIMIT = int(os.getenv("RECALL_EXPANSION_LIMIT", "25"))
+# Recall score-filtering defaults. The recall endpoint's `min_score` and
+# `adaptive_floor` query parameters override them per request, and a
+# RECALL_MIN_SCORE of 0.0 is treated there as "no minimum".
+RECALL_MIN_SCORE = float(os.getenv("RECALL_MIN_SCORE", "0.0"))
+RECALL_ADAPTIVE_FLOOR = os.getenv("RECALL_ADAPTIVE_FLOOR", "true").lower() in ("true", "1", "yes")
 
 # Memory content size limits (governs auto-summarization on store)
 # Soft limit: Content above this triggers auto-summarization

diff --git a/benchmarks/EXPERIMENT_LOG.md b/benchmarks/EXPERIMENT_LOG.md
@@ -0,0 +1,37 @@
+# AutoMem Experiment Log
+
+Tracks recall quality experiments with before/after benchmark results.
+
+**Baselines** are created with Voyage 4 embeddings (`EMBEDDING_PROVIDER=voyage`, `VECTOR_SIZE=1024`)
+on the snapshot-based bench infrastructure (PR #97, merged 2026-03-02).
+
+## Tiered Benchmarking
+
+| Tier | Benchmark | Runtime | Cost | When to use |
+|------|-----------|---------|------|-------------|
+| 0 | `make test` (unit) | 30s | free | Every change |
+| 1 | `locomo-mini` (2 convos, 304 Qs) | 2-3 min | free | Rapid iteration |
+| 2 | `locomo` (10 convos, 1986 Qs) | 5-10 min | free | Before merge |
+| 3 | `longmemeval-mini` (20 Qs) | 15 min | ~$1 | Scoring/entity changes |
+| 4 | `longmemeval` (500 Qs) | 1-2 hr | ~$10 | Milestones only |
+
+## Results
+
+| Date | Issue/PR | Branch | LoCoMo-mini | LoCoMo-full | LME-mini | Notes |
+|------|----------|--------|-------------|-------------|----------|-------|
+| 2026-03-02 | baseline | main | 76.97% (234/304) | 80.06% (1590/1986) | -- | Voyage 4, 1024d. Health: DEGRADED (low score variance) |
+| 2026-03-02 | #73 | exp/73-min-score-threshold | 76.97% (+0.0) | -- | -- | min_score + adaptive floor. No regression. Needs #78 for impact |
+
+## How to add an entry
+
+1. Run the benchmark: `make bench-eval BENCH=locomo-mini CONFIG=baseline`
+2. Record the overall accuracy from the output JSON
+3. Add a row to the table above with the date, issue/PR, branch, and scores
+4. For deltas, show as `XX.XX% (+Y.Y)`, where the delta is in percentage points relative to the baseline row
+
+## Snapshot metadata
+
+| Snapshot | Created | Git SHA | Embedding | Memories |
+|----------|---------|---------|-----------|----------|
+| locomo-mini | 2026-03-02 | main @ 80a6f93 | voyage:voyage-4 1024d | 788 (2 convos) |
+| locomo | 2026-03-02 | main @ 80a6f93 | voyage:voyage-4 1024d | 5828 (10 convos) |
diff --git a/benchmarks/baselines/health_baseline.json b/benchmarks/baselines/health_baseline.json
@@ -0,0 +1,126 @@
+{
+  "overall": "DEGRADED",
+  "timestamp": "2026-03-02T11:19:03Z",
+  "base_url": "http://localhost:8001",
+  "checks": [
+    {
+      "check": "score_distribution",
+      "verdict": "WARN: low score variance \u2014 weak differentiation",
+      "total_scores": 50,
+      "global_stats": {
+        "min": 0.2093,
+        "max": 0.4506,
+        "mean": 0.3562,
+        "stddev": 0.0497,
+        "spread": 0.2413
+      },
+      "latency": {
+        "p50_ms": 504.3,
+        "p95_ms": 715.2,
+        "mean_ms": 541.6
+      },
+      "per_query": [
+        {
+          "query": "What was discussed about authentication and security?",
+          "count": 10,
+          "scores": {
+            "min": 0.2903,
+            "max": 0.3311,
+            "mean": 0.2982,
+            "spread": 0.0408
+          },
+          "latency_ms": 504.3,
+          "query_time_ms": 489.8
+        },
+        {
+          "query": "Tell me about the birthday party last weekend",
+          "count": 10,
+          "scores": {
+            "min": 0.3719,
+            "max": 0.4156,
+            "mean": 0.3871,
+            "spread": 0.0437
+          },
+          "latency_ms": 461.4,
+          "query_time_ms": 452.16
+        },
+        {
+          "query": "What programming languages does the team use?",
+          "count": 10,
+          "scores": {
+            "min": 0.3589,
+            "max": 0.413,
+            "mean": 0.379,
+            "spread": 0.054
+          },
+          "latency_ms": 483.5,
+          "query_time_ms": 473.6
+        },
+        {
+          "query": "Plans for the upcoming vacation trip",
+          "count": 10,
+          "scores": {
+            "min": 0.3952,
+            "max": 0.4506,
+            "mean": 0.41,
+            "spread": 0.0554
+          },
+          "latency_ms": 715.2,
+          "query_time_ms": 706.11
+        },
+        {
+          "query": "Recent decisions about database architecture",
+          "count": 10,
+          "scores": {
+            "min": 0.2093,
+            "max": 0.3287,
+            "mean": 0.3066,
+            "spread": 0.1194
+          },
+          "latency_ms": 543.5,
+          "query_time_ms": 533.7
+        }
+      ]
+    },
+    {
+      "check": "entity_quality",
+      "verdict": "OK: 0.0% garbage (0/32)",
+      "sampled_memories": 30,
+      "memories_with_entities": 26,
+      "memories_without_entities": 4,
+      "total_entity_tags": 32,
+      "garbage_tags": 0,
+      "garbage_pct": 0.0,
+      "garbage_examples": []
+    },
+    {
+      "check": "cross_query_overlap",
+      "verdict": "OK: different queries return different results",
+      "overlap_pairs": 0,
+      "total_pairs": 3,
+      "query_results": {
+        "What was discussed about authentication ": [
+          "4dafb907-0779-4085-8058-90a9ff97e858",
+          "c711668b-b70f-4bd9-b12a-5f4e587100fe",
+          "90021543-0f07-4572-b5a3-870a9aed9f63",
+          "4b34c8e0-31ee-41f1-ad2d-b8c74c2fb713",
+          "7997eb95-e7f8-42fa-8960-1f0d9cf2be58"
+        ],
+        "Tell me about the birthday party last we": [
+          "25fef83e-5a6a-4798-8dd3-2cb10d2dc7a0",
+          "eb1d8f46-bb8f-4539-90e9-27ef9f769fc7",
+          "1bd67e29-77e0-400a-8aeb-69161c957cc7",
+          "72feb3cf-0f35-4593-8476-8b2c79dc0319",
+          "0f889be1-3406-4afb-aba0-c534e044d03f"
+        ],
+        "What programming languages does the team": [
+          "fc05eb7e-b9c4-4ea4-a63a-43f553da9730",
+          "928bdcd7-0f89-4fe2-8053-54375e199555",
+          "1bdf3cc4-bbf7-462f-8050-cb957f8dbc26",
+          "b32e87a8-b064-46fa-bc70-53c01dbaab3d",
+          "955e3c23-728f-4b4d-bfdc-a433631bc261"
+        ]
+      }
+    }
+  ]
+}