Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 75 additions & 0 deletions .github/workflows/docs-dispatch.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
# Docs Dispatch
#
# On every push to main, work out which files changed, look them up in the
# file-to-doc map published by verygoodplugins/automem-website, and send a
# `docs-update` repository_dispatch event to that repository when at least one
# mapped documentation page is affected.
name: Docs Dispatch
on:
  push:
    branches: [main]

jobs:
  check-docs:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v5
        with:
          # Full history: `git diff` below needs both the "before" and the
          # "after" commit of the push to be present locally.
          fetch-depth: 0

      # Output `files`: compact JSON array of paths changed by this push.
      - name: Get changed files
        id: changed
        env:
          # Event data is passed through the environment instead of being
          # interpolated into the script text, so it is never parsed as shell.
          BASE_SHA: ${{ github.event.before }}
          HEAD_SHA: ${{ github.event.after }}
        run: |
          # An all-zero "before" SHA means there is no previous commit to
          # compare against; fall back to the root commit of the history.
          if [ "$BASE_SHA" = "0000000000000000000000000000000000000000" ]; then
            BASE_SHA=$(git rev-list --max-parents=0 "$HEAD_SHA")
          fi

          FILES=$(git diff --name-only "$BASE_SHA" "$HEAD_SHA" | jq -R -s -c 'split("\n") | map(select(. != ""))')
          echo "files=$FILES" >> "$GITHUB_OUTPUT"

      # Output `affected`: JSON array of doc identifiers mapped to the changed
      # files, or the literal string `none` when nothing has to be dispatched.
      - name: Check file-doc mapping
        id: check
        env:
          REPO_KEY: ${{ github.event.repository.name }}
          # File names come from commits and may contain quotes or other shell
          # metacharacters; keep them out of the script source.
          CHANGED: ${{ steps.changed.outputs.files }}
        run: |
          # The default shell runs with errexit (`bash -e`), so a failing
          # `MAP=$(curl ...)` on its own line would abort the step before any
          # later `$?` check. Testing the assignment directly keeps the
          # "skip on fetch failure" path reachable.
          if ! MAP=$(curl -sf "https://raw.githubusercontent.com/verygoodplugins/automem-website/main/scripts/file-doc-map.json"); then
            echo "Failed to fetch file-doc-map.json, skipping"
            echo "affected=none" >> "$GITHUB_OUTPUT"
            exit 0
          fi

          # A pattern ending in "/**" matches every path under that prefix;
          # any other pattern must equal the changed path exactly.
          AFFECTED=$(echo "$MAP" | jq -r --arg repo "$REPO_KEY" --argjson changed "$CHANGED" '
            def matches_pattern($file; $pattern):
              if ($pattern | endswith("/**")) then
                ($file | startswith($pattern[0:-3]))
              else
                $file == $pattern
              end;

            .[$repo] // {} | to_entries | map(
              select(.key as $pattern | $changed | any(. as $file | matches_pattern($file; $pattern)))
            ) | map(.value) | flatten | unique | .[]
          ')

          if [ -z "$AFFECTED" ]; then
            echo "affected=none" >> "$GITHUB_OUTPUT"
          else
            AFFECTED_JSON=$(echo "$AFFECTED" | jq -R -s -c 'split("\n") | map(select(. != ""))')
            echo "affected=$AFFECTED_JSON" >> "$GITHUB_OUTPUT"
          fi

      - name: Dispatch to automem-website
        if: steps.check.outputs.affected != 'none'
        uses: peter-evans/repository-dispatch@v3
        with:
          token: ${{ secrets.RELEASE_PLEASE_TOKEN }}
          repository: verygoodplugins/automem-website
          event-type: docs-update
          # `changed_files` and `affected_docs` are already JSON arrays, so
          # they are embedded without surrounding quotes.
          client-payload: |
            {
              "source_repo": "${{ github.event.repository.full_name }}",
              "source_sha": "${{ github.event.after }}",
              "changed_files": ${{ steps.changed.outputs.files }},
              "affected_docs": ${{ steps.check.outputs.affected }},
              "commit_url": "${{ github.event.head_commit.url }}",
              "compare_url": "${{ github.event.compare }}"
            }
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -39,3 +39,5 @@ node_modules/
# Benchmark overrides and snapshots
.env.bench
/benchmarks/snapshots/
/benchmarks/results/
benchmarks/baselines/locomo_baseline.json
6 changes: 5 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Makefile - Development commands
.PHONY: help install dev test fmt lint test-integration test-live test-locomo test-locomo-live test-longmemeval test-longmemeval-live test-longmemeval-watch clean logs deploy
.PHONY: help install dev test fmt lint test-integration test-live test-locomo test-locomo-live test-longmemeval test-longmemeval-live test-longmemeval-watch clean logs deploy bench-health

# Default target
help:
Expand All @@ -25,6 +25,7 @@ help:
@echo " make bench-ingest BENCH=locomo - Ingest + snapshot (run once)"
@echo " make bench-eval BENCH=locomo CONFIG=baseline - Eval from snapshot (~2 min)"
@echo " make bench-compare BENCH=locomo CONFIG=bm25 BASELINE=baseline - A/B compare"
@echo " make bench-health - Recall health check (score dist, entities, latency)"
@echo " make test-locomo - Full LoCoMo benchmark (local)"
@echo " make test-locomo-live - Full LoCoMo benchmark (Railway)"
@echo " make test-longmemeval - Full LongMemEval benchmark (local)"
Expand Down Expand Up @@ -144,6 +145,9 @@ bench-compare:
bench-compare-branch:
@scripts/bench/compare_branch.sh $(BRANCH) $(or $(CONFIG),baseline) $(or $(BENCH),locomo)

bench-health:
@python3 scripts/bench/health_check.py --base-url $(or $(BASE_URL),http://localhost:8001)

bench-snapshots:
@ls -la benchmarks/snapshots/ 2>/dev/null || echo "No snapshots yet. Run: make bench-ingest BENCH=locomo"

Expand Down
48 changes: 47 additions & 1 deletion automem/api/recall.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,13 @@

from flask import Blueprint, abort, jsonify, request

from automem.config import ALLOWED_RELATIONS, RECALL_EXPANSION_LIMIT, RECALL_RELATION_LIMIT
from automem.config import (
ALLOWED_RELATIONS,
RECALL_ADAPTIVE_FLOOR,
RECALL_EXPANSION_LIMIT,
RECALL_MIN_SCORE,
RECALL_RELATION_LIMIT,
)
from automem.utils.graph import _serialize_node

DEFAULT_STYLE_PRIORITY_TAGS: Set[str] = {
Expand Down Expand Up @@ -1084,6 +1090,10 @@ def _parse_threshold(param_name: str) -> Optional[float]:
expand_min_importance = _parse_threshold("expand_min_importance")
expand_min_strength = _parse_threshold("expand_min_strength")

min_score_param = _parse_threshold("min_score")
min_score = min_score_param if min_score_param is not None else (RECALL_MIN_SCORE or None)
adaptive_floor = _parse_bool_param(request.args.get("adaptive_floor"), RECALL_ADAPTIVE_FLOOR)

context_label = (request.args.get("context") or "").strip().lower()
active_path = (
request.args.get("active_path")
Expand Down Expand Up @@ -1242,6 +1252,11 @@ def _run_single_query(
)
]

if min_score is not None and min_score > 0:
local_results = [
res for res in local_results if float(res.get("final_score", 0.0)) >= min_score
]

if sort_param == "score":
local_results.sort(
key=lambda r: (
Expand Down Expand Up @@ -1412,6 +1427,31 @@ def _run_single_query(
]
results = seed_results + expansion_results + entity_expansion_results

# Apply adaptive score floor: detect steep dropoff and cut low-quality tail
score_floor_applied = None
if adaptive_floor and len(results) > 3:
scores = sorted([float(r.get("final_score", 0.0)) for r in results], reverse=True)
# Find the largest gap between consecutive scores in the top half
max_gap = 0.0
gap_idx = -1
halfway = max(3, len(scores) // 2)
for i in range(1, halfway):
gap = scores[i - 1] - scores[i]
if gap > max_gap:
max_gap = gap
gap_idx = i
# If there's a steep dropoff (>15% of max score), cut below it
if max_gap > 0.15 * scores[0] and gap_idx > 0:
score_floor_applied = scores[gap_idx]
results = [
r for r in results if float(r.get("final_score", 0.0)) >= score_floor_applied
]

# Apply explicit min_score on final assembled results (catches expansions)
pre_filter_count = len(results)
if min_score is not None and min_score > 0:
results = [r for r in results if float(r.get("final_score", 0.0)) >= min_score]

# JIT-enrich unenriched memories inline (cheap: entities + summary ~50ms each)
jit_enriched_count = 0
if jit_enrich_fn is not None:
Expand Down Expand Up @@ -1467,6 +1507,12 @@ def _run_single_query(
response["tag_match"] = tag_match
if jit_enriched_count:
response["jit_enriched_count"] = jit_enriched_count
if min_score or score_floor_applied:
response["score_filter"] = {
"min_score": min_score,
"adaptive_floor": score_floor_applied,
"filtered_count": pre_filter_count - len(results) if min_score else 0,
}
response["query_time_ms"] = round((time.perf_counter() - query_start) * 1000, 2)
if any_context_profile:
response["context_priority"] = {
Expand Down
2 changes: 2 additions & 0 deletions automem/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,8 @@

RECALL_RELATION_LIMIT = int(os.getenv("RECALL_RELATION_LIMIT", "5"))
RECALL_EXPANSION_LIMIT = int(os.getenv("RECALL_EXPANSION_LIMIT", "25"))
RECALL_MIN_SCORE = float(os.getenv("RECALL_MIN_SCORE", "0.0"))
RECALL_ADAPTIVE_FLOOR = os.getenv("RECALL_ADAPTIVE_FLOOR", "true").lower() in ("true", "1", "yes")

# Memory content size limits (governs auto-summarization on store)
# Soft limit: Content above this triggers auto-summarization
Expand Down
37 changes: 37 additions & 0 deletions benchmarks/EXPERIMENT_LOG.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# AutoMem Experiment Log

Tracks recall quality experiments with before/after benchmark results.

**Baselines** are created with Voyage 4 embeddings (`EMBEDDING_PROVIDER=voyage`, `VECTOR_SIZE=1024`)
on the snapshot-based bench infrastructure (PR #97, merged 2026-03-02).

## Tiered Benchmarking

| Tier | Benchmark | Runtime | Cost | When to use |
|------|-----------|---------|------|-------------|
| 0 | `make test` (unit) | 30s | free | Every change |
| 1 | `locomo-mini` (2 convos, 304 Qs) | 2-3 min | free | Rapid iteration |
| 2 | `locomo` (10 convos, 1986 Qs) | 5-10 min | free | Before merge |
| 3 | `longmemeval-mini` (20 Qs) | 15 min | ~$1 | Scoring/entity changes |
| 4 | `longmemeval` (500 Qs) | 1-2 hr | ~$10 | Milestones only |

## Results

| Date | Issue/PR | Branch | LoCoMo-mini | LoCoMo-full | LME-mini | Notes |
|------|----------|--------|-------------|-------------|----------|-------|
| 2026-03-02 | baseline | main | 76.97% (234/304) | 80.06% (1590/1986) | -- | Voyage 4, 1024d. Health: DEGRADED (low score variance) |
| 2026-03-02 | #73 | exp/73-min-score-threshold | 76.97% (+0.0) | -- | -- | min_score + adaptive floor. No regression. Needs #78 for impact |
Comment thread
coderabbitai[bot] marked this conversation as resolved.
Outdated

## How to add an entry

1. Run the benchmark: `make bench-eval BENCH=locomo-mini CONFIG=baseline`
2. Record the overall accuracy from the output JSON
3. Add a row to the table above with the date, issue/PR, branch, and scores
4. For deltas, show as `XX.X% (+Y.Y)` relative to the baseline row

## Snapshot metadata

| Snapshot | Created | Git SHA | Embedding | Memories |
|----------|---------|---------|-----------|----------|
| locomo-mini | 2026-03-02 | main @ 80a6f93 | voyage:voyage-4 1024d | 788 (2 convos) |
| locomo | 2026-03-02 | main @ 80a6f93 | voyage:voyage-4 1024d | 5828 (10 convos) |
126 changes: 126 additions & 0 deletions benchmarks/baselines/health_baseline.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
{
"overall": "DEGRADED",
"timestamp": "2026-03-02T11:19:03Z",
"base_url": "http://localhost:8001",
"checks": [
{
"check": "score_distribution",
"verdict": "WARN: low score variance \u2014 weak differentiation",
"total_scores": 50,
"global_stats": {
"min": 0.2093,
"max": 0.4506,
"mean": 0.3562,
"stddev": 0.0497,
"spread": 0.2413
},
"latency": {
"p50_ms": 504.3,
"p95_ms": 715.2,
"mean_ms": 541.6
},
"per_query": [
{
"query": "What was discussed about authentication and security?",
"count": 10,
"scores": {
"min": 0.2903,
"max": 0.3311,
"mean": 0.2982,
"spread": 0.0408
},
"latency_ms": 504.3,
"query_time_ms": 489.8
},
{
"query": "Tell me about the birthday party last weekend",
"count": 10,
"scores": {
"min": 0.3719,
"max": 0.4156,
"mean": 0.3871,
"spread": 0.0437
},
"latency_ms": 461.4,
"query_time_ms": 452.16
},
{
"query": "What programming languages does the team use?",
"count": 10,
"scores": {
"min": 0.3589,
"max": 0.413,
"mean": 0.379,
"spread": 0.054
},
"latency_ms": 483.5,
"query_time_ms": 473.6
},
{
"query": "Plans for the upcoming vacation trip",
"count": 10,
"scores": {
"min": 0.3952,
"max": 0.4506,
"mean": 0.41,
"spread": 0.0554
},
"latency_ms": 715.2,
"query_time_ms": 706.11
},
{
"query": "Recent decisions about database architecture",
"count": 10,
"scores": {
"min": 0.2093,
"max": 0.3287,
"mean": 0.3066,
"spread": 0.1194
},
"latency_ms": 543.5,
"query_time_ms": 533.7
}
]
},
{
"check": "entity_quality",
"verdict": "OK: 0.0% garbage (0/32)",
"sampled_memories": 30,
"memories_with_entities": 26,
"memories_without_entities": 4,
"total_entity_tags": 32,
"garbage_tags": 0,
"garbage_pct": 0.0,
"garbage_examples": []
},
{
"check": "cross_query_overlap",
"verdict": "OK: different queries return different results",
"overlap_pairs": 0,
"total_pairs": 3,
"query_results": {
"What was discussed about authentication ": [
"4dafb907-0779-4085-8058-90a9ff97e858",
"c711668b-b70f-4bd9-b12a-5f4e587100fe",
"90021543-0f07-4572-b5a3-870a9aed9f63",
"4b34c8e0-31ee-41f1-ad2d-b8c74c2fb713",
"7997eb95-e7f8-42fa-8960-1f0d9cf2be58"
],
"Tell me about the birthday party last we": [
"25fef83e-5a6a-4798-8dd3-2cb10d2dc7a0",
"eb1d8f46-bb8f-4539-90e9-27ef9f769fc7",
"1bd67e29-77e0-400a-8aeb-69161c957cc7",
"72feb3cf-0f35-4593-8476-8b2c79dc0319",
"0f889be1-3406-4afb-aba0-c534e044d03f"
],
"What programming languages does the team": [
"fc05eb7e-b9c4-4ea4-a63a-43f553da9730",
"928bdcd7-0f89-4fe2-8053-54375e199555",
"1bdf3cc4-bbf7-462f-8050-cb957f8dbc26",
"b32e87a8-b064-46fa-bc70-53c01dbaab3d",
"955e3c23-728f-4b4d-bfdc-a433631bc261"
]
}
}
]
}
Loading