diff --git a/.github/workflows/docs-dispatch.yml b/.github/workflows/docs-dispatch.yml new file mode 100644 index 00000000..4758eac3 --- /dev/null +++ b/.github/workflows/docs-dispatch.yml @@ -0,0 +1,74 @@ +name: Docs Dispatch +on: + push: + branches: [main] + +jobs: + check-docs: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v5 + with: + fetch-depth: 0 + + - name: Get changed files + id: changed + run: | + BASE_SHA="${{ github.event.before }}" + HEAD_SHA="${{ github.event.after }}" + + if [ "$BASE_SHA" = "0000000000000000000000000000000000000000" ]; then + BASE_SHA=$(git rev-list --max-parents=0 "$HEAD_SHA") + fi + + FILES=$(git diff --name-only "$BASE_SHA" "$HEAD_SHA" | jq -R -s -c 'split("\n") | map(select(. != ""))') + echo "files=$FILES" >> $GITHUB_OUTPUT + + - name: Check file-doc mapping + id: check + run: | + if ! MAP=$(curl -sf "https://raw.githubusercontent.com/verygoodplugins/automem-website/main/scripts/file-doc-map.json"); then + echo "Failed to fetch file-doc-map.json, skipping" + echo "affected=none" >> $GITHUB_OUTPUT + exit 0 + fi + + REPO_KEY="${{ github.event.repository.name }}" + CHANGED='${{ steps.changed.outputs.files }}' + + AFFECTED=$(echo "$MAP" | jq -r --arg repo "$REPO_KEY" --argjson changed "$CHANGED" ' + def matches_pattern($file; $pattern): + if ($pattern | endswith("/**")) then + ($file | startswith($pattern[0:-3])) + else + $file == $pattern + end; + + .[$repo] // {} | to_entries | map( + select(.key as $pattern | $changed | any(. as $file | matches_pattern($file; $pattern))) + ) | map(.value) | flatten | unique | .[] + ') + + if [ -z "$AFFECTED" ]; then + echo "affected=none" >> $GITHUB_OUTPUT + else + AFFECTED_JSON=$(echo "$AFFECTED" | jq -R -s -c 'split("\n") | map(select(. != ""))') + echo "affected=$AFFECTED_JSON" >> $GITHUB_OUTPUT + fi + + - name: Dispatch to automem-website + if: steps.check.outputs.affected != 'none' + uses: peter-evans/repository-dispatch@v3 + with: + token: ${{ secrets.RELEASE_PLEASE_TOKEN }} + repository: verygoodplugins/automem-website + event-type: docs-update + client-payload: | + { + "source_repo": "${{ github.event.repository.full_name }}", + "source_sha": "${{ github.event.after }}", + "changed_files": ${{ steps.changed.outputs.files }}, + "affected_docs": ${{ steps.check.outputs.affected }}, + "commit_url": "${{ github.event.head_commit.url }}", + "compare_url": "${{ github.event.compare }}" + } diff --git a/.gitignore b/.gitignore index e686df5e..e4edf683 100644 --- a/.gitignore +++ b/.gitignore @@ -40,3 +40,4 @@ node_modules/ .env.bench /benchmarks/snapshots/ /benchmarks/results/ +benchmarks/baselines/locomo_baseline.json diff --git a/automem/api/recall.py b/automem/api/recall.py index a566b180..d4b04480 100644 --- a/automem/api/recall.py +++ b/automem/api/recall.py @@ -8,7 +8,13 @@ from flask import Blueprint, abort, jsonify, request -from automem.config import ALLOWED_RELATIONS, RECALL_EXPANSION_LIMIT, RECALL_RELATION_LIMIT +from automem.config import ( + ALLOWED_RELATIONS, + RECALL_ADAPTIVE_FLOOR, + RECALL_EXPANSION_LIMIT, + RECALL_MIN_SCORE, + RECALL_RELATION_LIMIT, +) from automem.utils.graph import _serialize_node DEFAULT_STYLE_PRIORITY_TAGS: Set[str] = { @@ -1084,6 +1090,10 @@ def _parse_threshold(param_name: str) -> Optional[float]: expand_min_importance = _parse_threshold("expand_min_importance") expand_min_strength = _parse_threshold("expand_min_strength") + min_score_param = _parse_threshold("min_score") + min_score = min_score_param if min_score_param is not None else (RECALL_MIN_SCORE or None) + adaptive_floor = _parse_bool_param(request.args.get("adaptive_floor"), RECALL_ADAPTIVE_FLOOR) + context_label = (request.args.get("context") or "").strip().lower() active_path = ( request.args.get("active_path") @@ -1242,6 +1252,11 @@ def _run_single_query( ) ] + if min_score is not None and min_score > 0: + local_results = [ + res for res in local_results if float(res.get("final_score", 0.0)) >= min_score + ] + if sort_param == "score": local_results.sort( key=lambda r: ( @@ -1412,6 +1427,32 @@ def _run_single_query( ] results = seed_results + expansion_results + entity_expansion_results + pre_filter_count = len(results) + + # Apply adaptive score floor: detect steep dropoff and cut low-quality tail + score_floor_applied = None + if sort_param == "score" and adaptive_floor and len(results) > 3: + scores = sorted([float(r.get("final_score", 0.0)) for r in results], reverse=True) + # Find the largest gap between consecutive scores in the top half + max_gap = 0.0 + gap_idx = -1 + halfway = max(3, len(scores) // 2) + for i in range(1, halfway): + gap = scores[i - 1] - scores[i] + if gap > max_gap: + max_gap = gap + gap_idx = i + # If there's a steep dropoff (>15% of max score), cut below it + if max_gap > 0.15 * scores[0] and gap_idx > 0: + score_floor_applied = scores[gap_idx] + results = [ + r for r in results if float(r.get("final_score", 0.0)) >= score_floor_applied + ] + + # Apply explicit min_score on final assembled results (catches expansions) + if min_score is not None and min_score > 0: + results = [r for r in results if float(r.get("final_score", 0.0)) >= min_score] + # JIT-enrich unenriched memories inline (cheap: entities + summary ~50ms each) jit_enriched_count = 0 if jit_enrich_fn is not None: @@ -1467,6 +1508,12 @@ def _run_single_query( response["tag_match"] = tag_match if jit_enriched_count: response["jit_enriched_count"] = jit_enriched_count + if min_score or score_floor_applied: + response["score_filter"] = { + "min_score": min_score, + "adaptive_floor": score_floor_applied, + "filtered_count": pre_filter_count - len(results), + } response["query_time_ms"] = round((time.perf_counter() - query_start) * 1000, 2) if any_context_profile: response["context_priority"] = { diff --git a/automem/config.py b/automem/config.py index 774a3860..f6c5af65 100644 --- a/automem/config.py +++ b/automem/config.py @@ -92,6 +92,8 @@ RECALL_RELATION_LIMIT = int(os.getenv("RECALL_RELATION_LIMIT", "5")) RECALL_EXPANSION_LIMIT = int(os.getenv("RECALL_EXPANSION_LIMIT", "25")) +RECALL_MIN_SCORE = float(os.getenv("RECALL_MIN_SCORE", "0.0")) +RECALL_ADAPTIVE_FLOOR = os.getenv("RECALL_ADAPTIVE_FLOOR", "true").lower() in ("true", "1", "yes") # Memory content size limits (governs auto-summarization on store) # Soft limit: Content above this triggers auto-summarization diff --git a/benchmarks/EXPERIMENT_LOG.md b/benchmarks/EXPERIMENT_LOG.md index a384584c..3ea9cfc6 100644 --- a/benchmarks/EXPERIMENT_LOG.md +++ b/benchmarks/EXPERIMENT_LOG.md @@ -10,7 +10,7 @@ on the snapshot-based bench infrastructure (PR #97, merged 2026-03-02). | Tier | Benchmark | Runtime | Cost | When to use | |------|-----------|---------|------|-------------| | 0 | `make test` (unit) | 30s | free | Every change | -| 1 | `locomo-mini` (2 convos, 198 Qs) | 2-3 min | free | Rapid iteration | +| 1 | `locomo-mini` (2 convos, 304 Qs) | 2-3 min | free | Rapid iteration | | 2 | `locomo` (10 convos, 1986 Qs) | 5-10 min | free | Before merge | | 3 | `longmemeval-mini` (20 Qs) | 15 min | ~$1 | Scoring/entity changes | | 4 | `longmemeval` (500 Qs) | 1-2 hr | ~$10 | Milestones only | diff --git a/scripts/bench/health_check.py b/scripts/bench/health_check.py index 135c5821..7ace358f 100644 --- a/scripts/bench/health_check.py +++ b/scripts/bench/health_check.py @@ -121,12 +121,24 @@ def check_score_distribution(base_url: str, api_token: Optional[str] = None) -> "min": round(min(all_scores), 4) if all_scores else 0, "max": round(max(all_scores), 4) if all_scores else 0, "mean": round(statistics.mean(all_scores), 4) if all_scores else 0, - "stddev": round(statistics.stdev(all_scores), 4) if len(all_scores) > 1 else 0, + "stddev": (round(statistics.stdev(all_scores), 4) if len(all_scores) > 1 else 0), "spread": round(spread, 4), }, "latency": { "p50_ms": round(statistics.median(latencies), 1) if latencies else 0, - "p95_ms": round(sorted(latencies)[int(len(latencies) * 0.95)] if latencies else 0, 1), + "p95_ms": ( + round( + sorted(latencies)[ + max( + 0, + min(len(latencies) - 1, math.ceil(0.95 * len(latencies)) - 1), + ) + ], + 1, + ) + if latencies + else 0 + ), "mean_ms": round(statistics.mean(latencies), 1) if latencies else 0, }, "per_query": per_query, @@ -232,7 +244,10 @@ def check_cross_query_overlap(base_url: str, api_token: Optional[str] = None) -> query_results[query[:40]] = ids if len(query_results) < 2: - return {"check": "cross_query_overlap", "verdict": "SKIP: not enough queries succeeded"} + return { + "check": "cross_query_overlap", + "verdict": "SKIP: not enough queries succeeded", + } id_lists = list(query_results.values()) overlap_pairs = 0 diff --git a/tests/benchmarks/test_locomo.py b/tests/benchmarks/test_locomo.py index 3aad40a2..cb0a2394 100644 --- a/tests/benchmarks/test_locomo.py +++ b/tests/benchmarks/test_locomo.py @@ -575,12 +575,20 @@ def match_dates_fuzzy( if not question_dates or not memory_dates: return False - # Check for matches within tolerance (strip tz for safe comparison) + # Check for matches within tolerance (normalize to UTC before comparing) for q_date in question_dates: - q_naive = q_date.replace(tzinfo=None) + q_utc = ( + q_date.astimezone(timezone.utc) + if q_date.tzinfo is not None + else q_date.replace(tzinfo=timezone.utc) + ) for m_date in memory_dates: - m_naive = m_date.replace(tzinfo=None) - days_diff = abs((q_naive - m_naive).days) + m_utc = ( + m_date.astimezone(timezone.utc) + if m_date.tzinfo is not None + else m_date.replace(tzinfo=timezone.utc) + ) + days_diff = abs((q_utc - m_utc).total_seconds()) / 86400 if days_diff <= tolerance_days: return True